diff --git "a/checkpoint-48327/trainer_state.json" "b/checkpoint-48327/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-48327/trainer_state.json" @@ -0,0 +1,338322 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 48327, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.207709975789931e-05, + "grad_norm": 1.1658770863900954, + "learning_rate": 2.069108214359611e-08, + "loss": 11.975, + "step": 1 + }, + { + "epoch": 0.00012415419951579862, + "grad_norm": 1.2228204293972478, + "learning_rate": 4.138216428719222e-08, + "loss": 11.9692, + "step": 2 + }, + { + "epoch": 0.00018623129927369795, + "grad_norm": 1.300081448571089, + "learning_rate": 6.207324643078834e-08, + "loss": 11.9658, + "step": 3 + }, + { + "epoch": 0.00024830839903159724, + "grad_norm": 1.2162240579784744, + "learning_rate": 8.276432857438443e-08, + "loss": 11.9687, + "step": 4 + }, + { + "epoch": 0.00031038549878949657, + "grad_norm": 1.1704195266494946, + "learning_rate": 1.0345541071798054e-07, + "loss": 11.9689, + "step": 5 + }, + { + "epoch": 0.0003724625985473959, + "grad_norm": 1.2509271521120053, + "learning_rate": 1.2414649286157668e-07, + "loss": 11.9685, + "step": 6 + }, + { + "epoch": 0.00043453969830529516, + "grad_norm": 1.1071393764458715, + "learning_rate": 1.448375750051728e-07, + "loss": 11.9713, + "step": 7 + }, + { + "epoch": 0.0004966167980631945, + "grad_norm": 1.223051549609549, + "learning_rate": 1.6552865714876887e-07, + "loss": 11.9642, + "step": 8 + }, + { + "epoch": 0.0005586938978210938, + "grad_norm": 1.2077596990929833, + "learning_rate": 1.86219739292365e-07, + "loss": 11.9665, + "step": 9 + }, + { + "epoch": 0.0006207709975789931, + "grad_norm": 1.2927895044278435, + "learning_rate": 2.0691082143596109e-07, + "loss": 11.9684, + "step": 10 + }, + { + "epoch": 0.0006828480973368925, + "grad_norm": 1.235533110656194, + "learning_rate": 2.2760190357955722e-07, + "loss": 11.9719, + "step": 11 + }, + { + "epoch": 0.0007449251970947918, + "grad_norm": 1.2041215113760864, + "learning_rate": 2.4829298572315336e-07, + "loss": 11.9656, + "step": 12 + }, + { + "epoch": 0.000807002296852691, + "grad_norm": 1.1626000658672793, + "learning_rate": 2.6898406786674944e-07, + "loss": 11.9709, + "step": 13 + }, + { + "epoch": 0.0008690793966105903, + "grad_norm": 1.1966564626356078, + "learning_rate": 2.896751500103456e-07, + "loss": 11.9731, + "step": 14 + }, + { + "epoch": 0.0009311564963684896, + "grad_norm": 1.190737577307479, + "learning_rate": 3.103662321539417e-07, + "loss": 11.9659, + "step": 15 + }, + { + "epoch": 0.000993233596126389, + "grad_norm": 1.2267807687337262, + "learning_rate": 3.3105731429753774e-07, + "loss": 11.9656, + "step": 16 + }, + { + "epoch": 0.0010553106958842882, + "grad_norm": 1.1622729274517531, + "learning_rate": 3.517483964411339e-07, + "loss": 11.9682, + "step": 17 + }, + { + "epoch": 0.0011173877956421876, + "grad_norm": 1.1861740539808145, + "learning_rate": 3.7243947858473e-07, + "loss": 11.9731, + "step": 18 + }, + { + "epoch": 0.0011794648954000868, + "grad_norm": 1.2303935194481033, + "learning_rate": 3.931305607283261e-07, + "loss": 11.968, + "step": 19 + }, + { + "epoch": 0.0012415419951579863, + "grad_norm": 1.2373681778488974, + "learning_rate": 4.1382164287192217e-07, + "loss": 11.9653, + "step": 20 + }, + { + "epoch": 0.0013036190949158855, + "grad_norm": 1.2237210740439695, + "learning_rate": 4.345127250155183e-07, + "loss": 11.9635, + "step": 21 + }, + { + "epoch": 0.001365696194673785, + "grad_norm": 1.1966227412507837, + "learning_rate": 4.5520380715911444e-07, + "loss": 11.9619, + "step": 22 + }, + { + "epoch": 0.0014277732944316841, + "grad_norm": 1.1867409840272058, + "learning_rate": 4.758948893027105e-07, + "loss": 11.9685, + "step": 23 + }, + { + "epoch": 0.0014898503941895836, + "grad_norm": 1.2643803257059272, + "learning_rate": 4.965859714463067e-07, + "loss": 11.9607, + "step": 24 + }, + { + "epoch": 0.0015519274939474828, + "grad_norm": 1.2719278572965018, + "learning_rate": 5.172770535899027e-07, + "loss": 11.9658, + "step": 25 + }, + { + "epoch": 0.001614004593705382, + "grad_norm": 1.1510435366147878, + "learning_rate": 5.379681357334989e-07, + "loss": 11.9684, + "step": 26 + }, + { + "epoch": 0.0016760816934632814, + "grad_norm": 1.2926853941245655, + "learning_rate": 5.58659217877095e-07, + "loss": 11.9622, + "step": 27 + }, + { + "epoch": 0.0017381587932211806, + "grad_norm": 1.2601421414535645, + "learning_rate": 5.793503000206911e-07, + "loss": 11.9667, + "step": 28 + }, + { + "epoch": 0.00180023589297908, + "grad_norm": 1.1862684187750496, + "learning_rate": 6.000413821642873e-07, + "loss": 11.9598, + "step": 29 + }, + { + "epoch": 0.0018623129927369793, + "grad_norm": 1.1887935820571864, + "learning_rate": 6.207324643078834e-07, + "loss": 11.954, + "step": 30 + }, + { + "epoch": 0.0019243900924948787, + "grad_norm": 1.2175389655298996, + "learning_rate": 6.414235464514794e-07, + "loss": 11.9593, + "step": 31 + }, + { + "epoch": 0.001986467192252778, + "grad_norm": 1.2845569825086178, + "learning_rate": 6.621146285950755e-07, + "loss": 11.9547, + "step": 32 + }, + { + "epoch": 0.002048544292010677, + "grad_norm": 1.1241037719547897, + "learning_rate": 6.828057107386716e-07, + "loss": 11.9541, + "step": 33 + }, + { + "epoch": 0.0021106213917685764, + "grad_norm": 1.184798933265922, + "learning_rate": 7.034967928822677e-07, + "loss": 11.9614, + "step": 34 + }, + { + "epoch": 0.002172698491526476, + "grad_norm": 1.2918873208782127, + "learning_rate": 7.241878750258639e-07, + "loss": 11.9509, + "step": 35 + }, + { + "epoch": 0.0022347755912843752, + "grad_norm": 1.2377719734412693, + "learning_rate": 7.4487895716946e-07, + "loss": 11.954, + "step": 36 + }, + { + "epoch": 0.0022968526910422745, + "grad_norm": 1.2993975086840597, + "learning_rate": 7.655700393130562e-07, + "loss": 11.9532, + "step": 37 + }, + { + "epoch": 0.0023589297908001737, + "grad_norm": 1.3269694518487725, + "learning_rate": 7.862611214566522e-07, + "loss": 11.9519, + "step": 38 + }, + { + "epoch": 0.0024210068905580733, + "grad_norm": 1.095268067942966, + "learning_rate": 8.069522036002483e-07, + "loss": 11.9514, + "step": 39 + }, + { + "epoch": 0.0024830839903159725, + "grad_norm": 1.2025193510659653, + "learning_rate": 8.276432857438443e-07, + "loss": 11.9503, + "step": 40 + }, + { + "epoch": 0.0025451610900738717, + "grad_norm": 1.2260406744776589, + "learning_rate": 8.483343678874406e-07, + "loss": 11.9386, + "step": 41 + }, + { + "epoch": 0.002607238189831771, + "grad_norm": 1.283750652398498, + "learning_rate": 8.690254500310366e-07, + "loss": 11.9305, + "step": 42 + }, + { + "epoch": 0.00266931528958967, + "grad_norm": 1.1679965625117943, + "learning_rate": 8.897165321746329e-07, + "loss": 11.927, + "step": 43 + }, + { + "epoch": 0.00273139238934757, + "grad_norm": 1.272257486328479, + "learning_rate": 9.104076143182289e-07, + "loss": 11.9218, + "step": 44 + }, + { + "epoch": 0.002793469489105469, + "grad_norm": 1.30512412982638, + "learning_rate": 9.31098696461825e-07, + "loss": 11.9156, + "step": 45 + }, + { + "epoch": 0.0028555465888633683, + "grad_norm": 1.3130991340279348, + "learning_rate": 9.51789778605421e-07, + "loss": 11.9155, + "step": 46 + }, + { + "epoch": 0.0029176236886212675, + "grad_norm": 1.2230463837555905, + "learning_rate": 9.724808607490173e-07, + "loss": 11.9254, + "step": 47 + }, + { + "epoch": 0.002979700788379167, + "grad_norm": 1.2265682341812123, + "learning_rate": 9.931719428926134e-07, + "loss": 11.9162, + "step": 48 + }, + { + "epoch": 0.0030417778881370663, + "grad_norm": 1.27683287503911, + "learning_rate": 1.0138630250362094e-06, + "loss": 11.9117, + "step": 49 + }, + { + "epoch": 0.0031038549878949656, + "grad_norm": 1.2276770029429536, + "learning_rate": 1.0345541071798055e-06, + "loss": 11.9053, + "step": 50 + }, + { + "epoch": 0.0031659320876528648, + "grad_norm": 1.2315551321626708, + "learning_rate": 1.0552451893234016e-06, + "loss": 11.9005, + "step": 51 + }, + { + "epoch": 0.003228009187410764, + "grad_norm": 1.344001081207953, + "learning_rate": 1.0759362714669978e-06, + "loss": 11.8965, + "step": 52 + }, + { + "epoch": 0.0032900862871686636, + "grad_norm": 1.2483097092176096, + "learning_rate": 1.0966273536105939e-06, + "loss": 11.8993, + "step": 53 + }, + { + "epoch": 0.003352163386926563, + "grad_norm": 1.2235187255087403, + "learning_rate": 1.11731843575419e-06, + "loss": 11.9004, + "step": 54 + }, + { + "epoch": 0.003414240486684462, + "grad_norm": 1.2514451732719367, + "learning_rate": 1.1380095178977862e-06, + "loss": 11.8967, + "step": 55 + }, + { + "epoch": 0.0034763175864423613, + "grad_norm": 1.2650842806668046, + "learning_rate": 1.1587006000413823e-06, + "loss": 11.8851, + "step": 56 + }, + { + "epoch": 0.003538394686200261, + "grad_norm": 1.3319229574175264, + "learning_rate": 1.1793916821849782e-06, + "loss": 11.8549, + "step": 57 + }, + { + "epoch": 0.00360047178595816, + "grad_norm": 1.3326727253828792, + "learning_rate": 1.2000827643285746e-06, + "loss": 11.8364, + "step": 58 + }, + { + "epoch": 0.0036625488857160594, + "grad_norm": 1.426119913091288, + "learning_rate": 1.2207738464721705e-06, + "loss": 11.8066, + "step": 59 + }, + { + "epoch": 0.0037246259854739586, + "grad_norm": 1.3464593409686436, + "learning_rate": 1.2414649286157668e-06, + "loss": 11.8029, + "step": 60 + }, + { + "epoch": 0.003786703085231858, + "grad_norm": 1.39073883619393, + "learning_rate": 1.2621560107593628e-06, + "loss": 11.8092, + "step": 61 + }, + { + "epoch": 0.0038487801849897574, + "grad_norm": 1.4810794587285407, + "learning_rate": 1.282847092902959e-06, + "loss": 11.7737, + "step": 62 + }, + { + "epoch": 0.003910857284747656, + "grad_norm": 1.385073564345366, + "learning_rate": 1.303538175046555e-06, + "loss": 11.7953, + "step": 63 + }, + { + "epoch": 0.003972934384505556, + "grad_norm": 1.4357076334307783, + "learning_rate": 1.324229257190151e-06, + "loss": 11.7672, + "step": 64 + }, + { + "epoch": 0.0040350114842634555, + "grad_norm": 1.4220391482666295, + "learning_rate": 1.3449203393337473e-06, + "loss": 11.7697, + "step": 65 + }, + { + "epoch": 0.004097088584021354, + "grad_norm": 1.3763110239634164, + "learning_rate": 1.3656114214773432e-06, + "loss": 11.7751, + "step": 66 + }, + { + "epoch": 0.004159165683779254, + "grad_norm": 1.5765034871545245, + "learning_rate": 1.3863025036209396e-06, + "loss": 11.739, + "step": 67 + }, + { + "epoch": 0.004221242783537153, + "grad_norm": 1.544549986475292, + "learning_rate": 1.4069935857645355e-06, + "loss": 11.7323, + "step": 68 + }, + { + "epoch": 0.004283319883295052, + "grad_norm": 1.5911732206677456, + "learning_rate": 1.4276846679081316e-06, + "loss": 11.7293, + "step": 69 + }, + { + "epoch": 0.004345396983052952, + "grad_norm": 1.6144535709863208, + "learning_rate": 1.4483757500517278e-06, + "loss": 11.7175, + "step": 70 + }, + { + "epoch": 0.004407474082810851, + "grad_norm": 1.6117971766404415, + "learning_rate": 1.469066832195324e-06, + "loss": 11.7039, + "step": 71 + }, + { + "epoch": 0.0044695511825687505, + "grad_norm": 1.7685112107913543, + "learning_rate": 1.48975791433892e-06, + "loss": 11.6729, + "step": 72 + }, + { + "epoch": 0.00453162828232665, + "grad_norm": 1.7125234003626202, + "learning_rate": 1.5104489964825162e-06, + "loss": 11.6881, + "step": 73 + }, + { + "epoch": 0.004593705382084549, + "grad_norm": 1.6686162830742066, + "learning_rate": 1.5311400786261123e-06, + "loss": 11.6707, + "step": 74 + }, + { + "epoch": 0.0046557824818424486, + "grad_norm": 1.750967031049769, + "learning_rate": 1.5518311607697082e-06, + "loss": 11.6603, + "step": 75 + }, + { + "epoch": 0.004717859581600347, + "grad_norm": 1.8342887455223817, + "learning_rate": 1.5725222429133044e-06, + "loss": 11.6185, + "step": 76 + }, + { + "epoch": 0.004779936681358247, + "grad_norm": 1.8918185919476707, + "learning_rate": 1.5932133250569005e-06, + "loss": 11.5734, + "step": 77 + }, + { + "epoch": 0.004842013781116147, + "grad_norm": 1.8531518223382946, + "learning_rate": 1.6139044072004966e-06, + "loss": 11.5222, + "step": 78 + }, + { + "epoch": 0.004904090880874045, + "grad_norm": 1.7983444176521728, + "learning_rate": 1.634595489344093e-06, + "loss": 11.4946, + "step": 79 + }, + { + "epoch": 0.004966167980631945, + "grad_norm": 1.915592376281836, + "learning_rate": 1.6552865714876887e-06, + "loss": 11.4519, + "step": 80 + }, + { + "epoch": 0.005028245080389844, + "grad_norm": 1.7739669748821176, + "learning_rate": 1.675977653631285e-06, + "loss": 11.4118, + "step": 81 + }, + { + "epoch": 0.0050903221801477435, + "grad_norm": 1.6665023325329515, + "learning_rate": 1.6966687357748812e-06, + "loss": 11.3993, + "step": 82 + }, + { + "epoch": 0.005152399279905643, + "grad_norm": 1.7904392592144196, + "learning_rate": 1.7173598179184773e-06, + "loss": 11.3096, + "step": 83 + }, + { + "epoch": 0.005214476379663542, + "grad_norm": 1.691078700534655, + "learning_rate": 1.7380509000620732e-06, + "loss": 11.3287, + "step": 84 + }, + { + "epoch": 0.005276553479421442, + "grad_norm": 1.5903190633346398, + "learning_rate": 1.7587419822056694e-06, + "loss": 11.3335, + "step": 85 + }, + { + "epoch": 0.00533863057917934, + "grad_norm": 1.5853392125516104, + "learning_rate": 1.7794330643492657e-06, + "loss": 11.291, + "step": 86 + }, + { + "epoch": 0.00540070767893724, + "grad_norm": 1.6281790814048274, + "learning_rate": 1.8001241464928614e-06, + "loss": 11.2361, + "step": 87 + }, + { + "epoch": 0.00546278477869514, + "grad_norm": 1.4687137193459934, + "learning_rate": 1.8208152286364578e-06, + "loss": 11.2686, + "step": 88 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.5080458702578505, + "learning_rate": 1.841506310780054e-06, + "loss": 11.2079, + "step": 89 + }, + { + "epoch": 0.005586938978210938, + "grad_norm": 1.4939972453417527, + "learning_rate": 1.86219739292365e-06, + "loss": 11.172, + "step": 90 + }, + { + "epoch": 0.005649016077968837, + "grad_norm": 1.5280788048000664, + "learning_rate": 1.882888475067246e-06, + "loss": 11.1415, + "step": 91 + }, + { + "epoch": 0.0057110931777267365, + "grad_norm": 1.459660425802791, + "learning_rate": 1.903579557210842e-06, + "loss": 11.2075, + "step": 92 + }, + { + "epoch": 0.005773170277484636, + "grad_norm": 1.416480005346014, + "learning_rate": 1.9242706393544382e-06, + "loss": 11.1541, + "step": 93 + }, + { + "epoch": 0.005835247377242535, + "grad_norm": 1.38671316226317, + "learning_rate": 1.9449617214980346e-06, + "loss": 11.0869, + "step": 94 + }, + { + "epoch": 0.005897324477000435, + "grad_norm": 1.3657207359434123, + "learning_rate": 1.9656528036416305e-06, + "loss": 11.1176, + "step": 95 + }, + { + "epoch": 0.005959401576758334, + "grad_norm": 1.3642896622562848, + "learning_rate": 1.986343885785227e-06, + "loss": 11.093, + "step": 96 + }, + { + "epoch": 0.006021478676516233, + "grad_norm": 1.3477166203784456, + "learning_rate": 2.0070349679288228e-06, + "loss": 11.0185, + "step": 97 + }, + { + "epoch": 0.006083555776274133, + "grad_norm": 1.3394165692662927, + "learning_rate": 2.0277260500724187e-06, + "loss": 11.0711, + "step": 98 + }, + { + "epoch": 0.0061456328760320315, + "grad_norm": 1.330637675485222, + "learning_rate": 2.048417132216015e-06, + "loss": 10.9776, + "step": 99 + }, + { + "epoch": 0.006207709975789931, + "grad_norm": 1.2329708273283622, + "learning_rate": 2.069108214359611e-06, + "loss": 10.9942, + "step": 100 + }, + { + "epoch": 0.006269787075547831, + "grad_norm": 1.2654993913273005, + "learning_rate": 2.0897992965032073e-06, + "loss": 10.976, + "step": 101 + }, + { + "epoch": 0.0063318641753057296, + "grad_norm": 1.2808033234342862, + "learning_rate": 2.1104903786468032e-06, + "loss": 10.9573, + "step": 102 + }, + { + "epoch": 0.006393941275063629, + "grad_norm": 1.221629482490968, + "learning_rate": 2.1311814607903996e-06, + "loss": 10.9562, + "step": 103 + }, + { + "epoch": 0.006456018374821528, + "grad_norm": 1.2906759626103699, + "learning_rate": 2.1518725429339955e-06, + "loss": 10.9308, + "step": 104 + }, + { + "epoch": 0.006518095474579428, + "grad_norm": 1.210692497817739, + "learning_rate": 2.172563625077592e-06, + "loss": 10.9608, + "step": 105 + }, + { + "epoch": 0.006580172574337327, + "grad_norm": 1.191326451365128, + "learning_rate": 2.1932547072211878e-06, + "loss": 10.8879, + "step": 106 + }, + { + "epoch": 0.006642249674095226, + "grad_norm": 1.2282759246108994, + "learning_rate": 2.2139457893647837e-06, + "loss": 10.8424, + "step": 107 + }, + { + "epoch": 0.006704326773853126, + "grad_norm": 1.1351049444440933, + "learning_rate": 2.23463687150838e-06, + "loss": 10.9169, + "step": 108 + }, + { + "epoch": 0.0067664038736110245, + "grad_norm": 1.155017918039028, + "learning_rate": 2.2553279536519764e-06, + "loss": 10.8426, + "step": 109 + }, + { + "epoch": 0.006828480973368924, + "grad_norm": 1.1652418984845798, + "learning_rate": 2.2760190357955723e-06, + "loss": 10.8167, + "step": 110 + }, + { + "epoch": 0.006890558073126824, + "grad_norm": 1.1568516892650955, + "learning_rate": 2.2967101179391682e-06, + "loss": 10.8533, + "step": 111 + }, + { + "epoch": 0.006952635172884723, + "grad_norm": 1.1090170591777668, + "learning_rate": 2.3174012000827646e-06, + "loss": 10.8255, + "step": 112 + }, + { + "epoch": 0.007014712272642622, + "grad_norm": 1.1525034131146448, + "learning_rate": 2.3380922822263605e-06, + "loss": 10.7552, + "step": 113 + }, + { + "epoch": 0.007076789372400522, + "grad_norm": 1.1629882566493825, + "learning_rate": 2.3587833643699564e-06, + "loss": 10.7525, + "step": 114 + }, + { + "epoch": 0.007138866472158421, + "grad_norm": 1.1363088386501417, + "learning_rate": 2.3794744465135528e-06, + "loss": 10.6902, + "step": 115 + }, + { + "epoch": 0.00720094357191632, + "grad_norm": 1.0581650094468784, + "learning_rate": 2.400165528657149e-06, + "loss": 10.7903, + "step": 116 + }, + { + "epoch": 0.007263020671674219, + "grad_norm": 1.1152139050530812, + "learning_rate": 2.420856610800745e-06, + "loss": 10.69, + "step": 117 + }, + { + "epoch": 0.007325097771432119, + "grad_norm": 1.0483696928425528, + "learning_rate": 2.441547692944341e-06, + "loss": 10.7025, + "step": 118 + }, + { + "epoch": 0.007387174871190018, + "grad_norm": 1.0839705182813664, + "learning_rate": 2.4622387750879373e-06, + "loss": 10.7467, + "step": 119 + }, + { + "epoch": 0.007449251970947917, + "grad_norm": 1.033371563686196, + "learning_rate": 2.4829298572315337e-06, + "loss": 10.6782, + "step": 120 + }, + { + "epoch": 0.007511329070705817, + "grad_norm": 1.0159079524342813, + "learning_rate": 2.503620939375129e-06, + "loss": 10.704, + "step": 121 + }, + { + "epoch": 0.007573406170463716, + "grad_norm": 1.023996913696645, + "learning_rate": 2.5243120215187255e-06, + "loss": 10.702, + "step": 122 + }, + { + "epoch": 0.007635483270221615, + "grad_norm": 1.0575184445767918, + "learning_rate": 2.545003103662322e-06, + "loss": 10.6643, + "step": 123 + }, + { + "epoch": 0.007697560369979515, + "grad_norm": 1.019905051038203, + "learning_rate": 2.565694185805918e-06, + "loss": 10.7022, + "step": 124 + }, + { + "epoch": 0.007759637469737414, + "grad_norm": 1.0039281582946593, + "learning_rate": 2.5863852679495137e-06, + "loss": 10.6718, + "step": 125 + }, + { + "epoch": 0.007821714569495312, + "grad_norm": 1.0028093928891955, + "learning_rate": 2.60707635009311e-06, + "loss": 10.6208, + "step": 126 + }, + { + "epoch": 0.007883791669253213, + "grad_norm": 0.9507581684504929, + "learning_rate": 2.6277674322367064e-06, + "loss": 10.6433, + "step": 127 + }, + { + "epoch": 0.007945868769011112, + "grad_norm": 1.0240498814578054, + "learning_rate": 2.648458514380302e-06, + "loss": 10.5543, + "step": 128 + }, + { + "epoch": 0.00800794586876901, + "grad_norm": 0.9700532576107965, + "learning_rate": 2.6691495965238983e-06, + "loss": 10.6058, + "step": 129 + }, + { + "epoch": 0.008070022968526911, + "grad_norm": 1.007575369446098, + "learning_rate": 2.6898406786674946e-06, + "loss": 10.554, + "step": 130 + }, + { + "epoch": 0.00813210006828481, + "grad_norm": 0.9753669439518667, + "learning_rate": 2.7105317608110905e-06, + "loss": 10.5474, + "step": 131 + }, + { + "epoch": 0.008194177168042709, + "grad_norm": 0.9692094390190671, + "learning_rate": 2.7312228429546864e-06, + "loss": 10.5873, + "step": 132 + }, + { + "epoch": 0.00825625426780061, + "grad_norm": 0.9765051863064332, + "learning_rate": 2.751913925098283e-06, + "loss": 10.5738, + "step": 133 + }, + { + "epoch": 0.008318331367558508, + "grad_norm": 0.9597678160031081, + "learning_rate": 2.772605007241879e-06, + "loss": 10.5615, + "step": 134 + }, + { + "epoch": 0.008380408467316407, + "grad_norm": 0.9855188354422477, + "learning_rate": 2.7932960893854746e-06, + "loss": 10.4764, + "step": 135 + }, + { + "epoch": 0.008442485567074305, + "grad_norm": 0.9496078635564252, + "learning_rate": 2.813987171529071e-06, + "loss": 10.517, + "step": 136 + }, + { + "epoch": 0.008504562666832206, + "grad_norm": 0.9008679562448515, + "learning_rate": 2.8346782536726673e-06, + "loss": 10.567, + "step": 137 + }, + { + "epoch": 0.008566639766590105, + "grad_norm": 0.9218195235807091, + "learning_rate": 2.8553693358162633e-06, + "loss": 10.5415, + "step": 138 + }, + { + "epoch": 0.008628716866348004, + "grad_norm": 0.9271994885890034, + "learning_rate": 2.876060417959859e-06, + "loss": 10.5277, + "step": 139 + }, + { + "epoch": 0.008690793966105904, + "grad_norm": 0.9578490208295364, + "learning_rate": 2.8967515001034555e-06, + "loss": 10.4735, + "step": 140 + }, + { + "epoch": 0.008752871065863803, + "grad_norm": 0.9036915279272014, + "learning_rate": 2.917442582247052e-06, + "loss": 10.5163, + "step": 141 + }, + { + "epoch": 0.008814948165621702, + "grad_norm": 0.9331187232816299, + "learning_rate": 2.938133664390648e-06, + "loss": 10.4431, + "step": 142 + }, + { + "epoch": 0.008877025265379602, + "grad_norm": 0.9444755127571309, + "learning_rate": 2.9588247465342437e-06, + "loss": 10.4067, + "step": 143 + }, + { + "epoch": 0.008939102365137501, + "grad_norm": 0.9075998579209251, + "learning_rate": 2.97951582867784e-06, + "loss": 10.5054, + "step": 144 + }, + { + "epoch": 0.0090011794648954, + "grad_norm": 0.9172258333322117, + "learning_rate": 3.000206910821436e-06, + "loss": 10.4303, + "step": 145 + }, + { + "epoch": 0.0090632565646533, + "grad_norm": 0.9172857787525003, + "learning_rate": 3.0208979929650323e-06, + "loss": 10.4337, + "step": 146 + }, + { + "epoch": 0.009125333664411199, + "grad_norm": 0.926439686913461, + "learning_rate": 3.0415890751086283e-06, + "loss": 10.3916, + "step": 147 + }, + { + "epoch": 0.009187410764169098, + "grad_norm": 0.9167888769411228, + "learning_rate": 3.0622801572522246e-06, + "loss": 10.4268, + "step": 148 + }, + { + "epoch": 0.009249487863926997, + "grad_norm": 0.887347968605843, + "learning_rate": 3.0829712393958205e-06, + "loss": 10.4483, + "step": 149 + }, + { + "epoch": 0.009311564963684897, + "grad_norm": 0.9136006419256397, + "learning_rate": 3.1036623215394165e-06, + "loss": 10.3947, + "step": 150 + }, + { + "epoch": 0.009373642063442796, + "grad_norm": 0.9325122075138336, + "learning_rate": 3.124353403683013e-06, + "loss": 10.3515, + "step": 151 + }, + { + "epoch": 0.009435719163200695, + "grad_norm": 0.8899146141390316, + "learning_rate": 3.1450444858266087e-06, + "loss": 10.4112, + "step": 152 + }, + { + "epoch": 0.009497796262958595, + "grad_norm": 0.90522744597936, + "learning_rate": 3.165735567970205e-06, + "loss": 10.3582, + "step": 153 + }, + { + "epoch": 0.009559873362716494, + "grad_norm": 0.8662820849180851, + "learning_rate": 3.186426650113801e-06, + "loss": 10.4071, + "step": 154 + }, + { + "epoch": 0.009621950462474393, + "grad_norm": 0.9043613023649903, + "learning_rate": 3.2071177322573973e-06, + "loss": 10.3644, + "step": 155 + }, + { + "epoch": 0.009684027562232293, + "grad_norm": 0.8671444031599936, + "learning_rate": 3.2278088144009933e-06, + "loss": 10.4022, + "step": 156 + }, + { + "epoch": 0.009746104661990192, + "grad_norm": 0.8933741319352455, + "learning_rate": 3.248499896544589e-06, + "loss": 10.3443, + "step": 157 + }, + { + "epoch": 0.00980818176174809, + "grad_norm": 0.9041675780038299, + "learning_rate": 3.269190978688186e-06, + "loss": 10.3076, + "step": 158 + }, + { + "epoch": 0.00987025886150599, + "grad_norm": 0.9098838758783813, + "learning_rate": 3.2898820608317815e-06, + "loss": 10.283, + "step": 159 + }, + { + "epoch": 0.00993233596126389, + "grad_norm": 0.8486398009614977, + "learning_rate": 3.3105731429753774e-06, + "loss": 10.3722, + "step": 160 + }, + { + "epoch": 0.009994413061021789, + "grad_norm": 0.8731111507095632, + "learning_rate": 3.331264225118974e-06, + "loss": 10.3298, + "step": 161 + }, + { + "epoch": 0.010056490160779688, + "grad_norm": 0.8696287691693463, + "learning_rate": 3.35195530726257e-06, + "loss": 10.3679, + "step": 162 + }, + { + "epoch": 0.010118567260537588, + "grad_norm": 0.8763182110976814, + "learning_rate": 3.3726463894061656e-06, + "loss": 10.3161, + "step": 163 + }, + { + "epoch": 0.010180644360295487, + "grad_norm": 0.8632247008707642, + "learning_rate": 3.3933374715497623e-06, + "loss": 10.3274, + "step": 164 + }, + { + "epoch": 0.010242721460053386, + "grad_norm": 0.8932379823181082, + "learning_rate": 3.4140285536933583e-06, + "loss": 10.2573, + "step": 165 + }, + { + "epoch": 0.010304798559811286, + "grad_norm": 0.851806780573756, + "learning_rate": 3.4347196358369546e-06, + "loss": 10.3305, + "step": 166 + }, + { + "epoch": 0.010366875659569185, + "grad_norm": 0.8614988611936653, + "learning_rate": 3.4554107179805505e-06, + "loss": 10.2906, + "step": 167 + }, + { + "epoch": 0.010428952759327084, + "grad_norm": 0.868368873424122, + "learning_rate": 3.4761018001241465e-06, + "loss": 10.2636, + "step": 168 + }, + { + "epoch": 0.010491029859084984, + "grad_norm": 0.8666560976288761, + "learning_rate": 3.496792882267743e-06, + "loss": 10.2842, + "step": 169 + }, + { + "epoch": 0.010553106958842883, + "grad_norm": 0.8673555365991513, + "learning_rate": 3.5174839644113387e-06, + "loss": 10.2576, + "step": 170 + }, + { + "epoch": 0.010615184058600782, + "grad_norm": 0.8438155537323919, + "learning_rate": 3.5381750465549347e-06, + "loss": 10.2596, + "step": 171 + }, + { + "epoch": 0.01067726115835868, + "grad_norm": 0.8760912085520602, + "learning_rate": 3.5588661286985314e-06, + "loss": 10.2112, + "step": 172 + }, + { + "epoch": 0.010739338258116581, + "grad_norm": 0.8360919274980575, + "learning_rate": 3.579557210842127e-06, + "loss": 10.3103, + "step": 173 + }, + { + "epoch": 0.01080141535787448, + "grad_norm": 0.832094145354231, + "learning_rate": 3.600248292985723e-06, + "loss": 10.2974, + "step": 174 + }, + { + "epoch": 0.010863492457632379, + "grad_norm": 0.8545937090919148, + "learning_rate": 3.6209393751293196e-06, + "loss": 10.2144, + "step": 175 + }, + { + "epoch": 0.01092556955739028, + "grad_norm": 0.8210563193808231, + "learning_rate": 3.6416304572729155e-06, + "loss": 10.2575, + "step": 176 + }, + { + "epoch": 0.010987646657148178, + "grad_norm": 0.8201875500657563, + "learning_rate": 3.662321539416512e-06, + "loss": 10.2433, + "step": 177 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8297708817877483, + "learning_rate": 3.683012621560108e-06, + "loss": 10.2544, + "step": 178 + }, + { + "epoch": 0.011111800856663977, + "grad_norm": 0.8280836490887579, + "learning_rate": 3.7037037037037037e-06, + "loss": 10.1968, + "step": 179 + }, + { + "epoch": 0.011173877956421876, + "grad_norm": 0.8240947178085678, + "learning_rate": 3.7243947858473e-06, + "loss": 10.2441, + "step": 180 + }, + { + "epoch": 0.011235955056179775, + "grad_norm": 0.8305154882361339, + "learning_rate": 3.745085867990896e-06, + "loss": 10.175, + "step": 181 + }, + { + "epoch": 0.011298032155937674, + "grad_norm": 0.8226839573497058, + "learning_rate": 3.765776950134492e-06, + "loss": 10.2359, + "step": 182 + }, + { + "epoch": 0.011360109255695574, + "grad_norm": 0.8407558013234916, + "learning_rate": 3.7864680322780887e-06, + "loss": 10.2077, + "step": 183 + }, + { + "epoch": 0.011422186355453473, + "grad_norm": 0.806250378512508, + "learning_rate": 3.807159114421684e-06, + "loss": 10.2146, + "step": 184 + }, + { + "epoch": 0.011484263455211372, + "grad_norm": 0.8496569039215542, + "learning_rate": 3.8278501965652806e-06, + "loss": 10.1782, + "step": 185 + }, + { + "epoch": 0.011546340554969272, + "grad_norm": 0.8378900559074408, + "learning_rate": 3.8485412787088765e-06, + "loss": 10.1861, + "step": 186 + }, + { + "epoch": 0.011608417654727171, + "grad_norm": 0.889147721213586, + "learning_rate": 3.869232360852472e-06, + "loss": 10.0808, + "step": 187 + }, + { + "epoch": 0.01167049475448507, + "grad_norm": 0.8549667697159773, + "learning_rate": 3.889923442996069e-06, + "loss": 10.1188, + "step": 188 + }, + { + "epoch": 0.01173257185424297, + "grad_norm": 0.7845384814961033, + "learning_rate": 3.910614525139665e-06, + "loss": 10.1985, + "step": 189 + }, + { + "epoch": 0.01179464895400087, + "grad_norm": 0.8024126879411563, + "learning_rate": 3.931305607283261e-06, + "loss": 10.2006, + "step": 190 + }, + { + "epoch": 0.011856726053758768, + "grad_norm": 0.8556309028126509, + "learning_rate": 3.951996689426858e-06, + "loss": 10.0381, + "step": 191 + }, + { + "epoch": 0.011918803153516669, + "grad_norm": 0.8136805614020594, + "learning_rate": 3.972687771570454e-06, + "loss": 10.0976, + "step": 192 + }, + { + "epoch": 0.011980880253274567, + "grad_norm": 0.8230022836563112, + "learning_rate": 3.993378853714049e-06, + "loss": 10.0853, + "step": 193 + }, + { + "epoch": 0.012042957353032466, + "grad_norm": 0.8078588564648462, + "learning_rate": 4.0140699358576456e-06, + "loss": 10.104, + "step": 194 + }, + { + "epoch": 0.012105034452790365, + "grad_norm": 0.8046242402034466, + "learning_rate": 4.0347610180012415e-06, + "loss": 10.132, + "step": 195 + }, + { + "epoch": 0.012167111552548265, + "grad_norm": 0.8007731367491054, + "learning_rate": 4.055452100144837e-06, + "loss": 10.1146, + "step": 196 + }, + { + "epoch": 0.012229188652306164, + "grad_norm": 0.8114741892279586, + "learning_rate": 4.076143182288434e-06, + "loss": 10.0759, + "step": 197 + }, + { + "epoch": 0.012291265752064063, + "grad_norm": 0.7877653431804027, + "learning_rate": 4.09683426443203e-06, + "loss": 10.1456, + "step": 198 + }, + { + "epoch": 0.012353342851821963, + "grad_norm": 0.8433322932182904, + "learning_rate": 4.117525346575626e-06, + "loss": 9.9908, + "step": 199 + }, + { + "epoch": 0.012415419951579862, + "grad_norm": 0.8036599867327024, + "learning_rate": 4.138216428719222e-06, + "loss": 10.1131, + "step": 200 + }, + { + "epoch": 0.012477497051337761, + "grad_norm": 0.808167571018201, + "learning_rate": 4.158907510862818e-06, + "loss": 10.0742, + "step": 201 + }, + { + "epoch": 0.012539574151095662, + "grad_norm": 0.7912989356289419, + "learning_rate": 4.179598593006415e-06, + "loss": 10.1046, + "step": 202 + }, + { + "epoch": 0.01260165125085356, + "grad_norm": 0.836994216267737, + "learning_rate": 4.2002896751500106e-06, + "loss": 10.0167, + "step": 203 + }, + { + "epoch": 0.012663728350611459, + "grad_norm": 0.8384170834575325, + "learning_rate": 4.2209807572936065e-06, + "loss": 10.0646, + "step": 204 + }, + { + "epoch": 0.012725805450369358, + "grad_norm": 0.8217394452300653, + "learning_rate": 4.241671839437203e-06, + "loss": 10.0238, + "step": 205 + }, + { + "epoch": 0.012787882550127258, + "grad_norm": 0.8226985561871475, + "learning_rate": 4.262362921580799e-06, + "loss": 10.0368, + "step": 206 + }, + { + "epoch": 0.012849959649885157, + "grad_norm": 0.8050528132250728, + "learning_rate": 4.283054003724395e-06, + "loss": 10.0739, + "step": 207 + }, + { + "epoch": 0.012912036749643056, + "grad_norm": 0.8242866162748282, + "learning_rate": 4.303745085867991e-06, + "loss": 10.0033, + "step": 208 + }, + { + "epoch": 0.012974113849400956, + "grad_norm": 0.8158974773360971, + "learning_rate": 4.324436168011587e-06, + "loss": 9.9731, + "step": 209 + }, + { + "epoch": 0.013036190949158855, + "grad_norm": 0.8125407135372976, + "learning_rate": 4.345127250155184e-06, + "loss": 9.9929, + "step": 210 + }, + { + "epoch": 0.013098268048916754, + "grad_norm": 0.8097716519416255, + "learning_rate": 4.36581833229878e-06, + "loss": 9.9748, + "step": 211 + }, + { + "epoch": 0.013160345148674655, + "grad_norm": 0.7742716596983806, + "learning_rate": 4.3865094144423756e-06, + "loss": 10.0469, + "step": 212 + }, + { + "epoch": 0.013222422248432553, + "grad_norm": 0.777394700417342, + "learning_rate": 4.4072004965859715e-06, + "loss": 10.0394, + "step": 213 + }, + { + "epoch": 0.013284499348190452, + "grad_norm": 0.7825561667201965, + "learning_rate": 4.427891578729567e-06, + "loss": 9.9805, + "step": 214 + }, + { + "epoch": 0.013346576447948353, + "grad_norm": 0.7971549570820177, + "learning_rate": 4.448582660873163e-06, + "loss": 9.9448, + "step": 215 + }, + { + "epoch": 0.013408653547706251, + "grad_norm": 0.8040007466198885, + "learning_rate": 4.46927374301676e-06, + "loss": 9.9251, + "step": 216 + }, + { + "epoch": 0.01347073064746415, + "grad_norm": 0.7660066081775454, + "learning_rate": 4.489964825160356e-06, + "loss": 10.0172, + "step": 217 + }, + { + "epoch": 0.013532807747222049, + "grad_norm": 0.7802594199132964, + "learning_rate": 4.510655907303953e-06, + "loss": 9.9906, + "step": 218 + }, + { + "epoch": 0.01359488484697995, + "grad_norm": 0.7916025503864426, + "learning_rate": 4.531346989447549e-06, + "loss": 9.9484, + "step": 219 + }, + { + "epoch": 0.013656961946737848, + "grad_norm": 0.7946466442771053, + "learning_rate": 4.552038071591145e-06, + "loss": 9.9451, + "step": 220 + }, + { + "epoch": 0.013719039046495747, + "grad_norm": 0.8080588889167087, + "learning_rate": 4.5727291537347406e-06, + "loss": 9.886, + "step": 221 + }, + { + "epoch": 0.013781116146253648, + "grad_norm": 0.795638902157661, + "learning_rate": 4.5934202358783365e-06, + "loss": 9.9369, + "step": 222 + }, + { + "epoch": 0.013843193246011546, + "grad_norm": 0.7549093421922921, + "learning_rate": 4.614111318021932e-06, + "loss": 9.9923, + "step": 223 + }, + { + "epoch": 0.013905270345769445, + "grad_norm": 0.7783516718387445, + "learning_rate": 4.634802400165529e-06, + "loss": 9.9092, + "step": 224 + }, + { + "epoch": 0.013967347445527346, + "grad_norm": 0.7608300526796483, + "learning_rate": 4.655493482309125e-06, + "loss": 9.9368, + "step": 225 + }, + { + "epoch": 0.014029424545285244, + "grad_norm": 0.8353323484061306, + "learning_rate": 4.676184564452721e-06, + "loss": 9.841, + "step": 226 + }, + { + "epoch": 0.014091501645043143, + "grad_norm": 0.7918156475293942, + "learning_rate": 4.696875646596317e-06, + "loss": 9.9057, + "step": 227 + }, + { + "epoch": 0.014153578744801044, + "grad_norm": 0.7978512636742097, + "learning_rate": 4.717566728739913e-06, + "loss": 9.894, + "step": 228 + }, + { + "epoch": 0.014215655844558943, + "grad_norm": 0.7630150353186459, + "learning_rate": 4.73825781088351e-06, + "loss": 9.8702, + "step": 229 + }, + { + "epoch": 0.014277732944316841, + "grad_norm": 0.749941823610371, + "learning_rate": 4.7589488930271056e-06, + "loss": 9.924, + "step": 230 + }, + { + "epoch": 0.01433981004407474, + "grad_norm": 0.7362543367604636, + "learning_rate": 4.7796399751707015e-06, + "loss": 9.9264, + "step": 231 + }, + { + "epoch": 0.01440188714383264, + "grad_norm": 0.7550634881035151, + "learning_rate": 4.800331057314298e-06, + "loss": 9.8236, + "step": 232 + }, + { + "epoch": 0.01446396424359054, + "grad_norm": 0.7554970200465349, + "learning_rate": 4.821022139457894e-06, + "loss": 9.9213, + "step": 233 + }, + { + "epoch": 0.014526041343348438, + "grad_norm": 0.7677349289557975, + "learning_rate": 4.84171322160149e-06, + "loss": 9.8472, + "step": 234 + }, + { + "epoch": 0.014588118443106339, + "grad_norm": 0.7610512572055891, + "learning_rate": 4.862404303745086e-06, + "loss": 9.9017, + "step": 235 + }, + { + "epoch": 0.014650195542864237, + "grad_norm": 0.7748865752127115, + "learning_rate": 4.883095385888682e-06, + "loss": 9.83, + "step": 236 + }, + { + "epoch": 0.014712272642622136, + "grad_norm": 0.7661394609557554, + "learning_rate": 4.903786468032278e-06, + "loss": 9.8695, + "step": 237 + }, + { + "epoch": 0.014774349742380037, + "grad_norm": 0.7853471514415028, + "learning_rate": 4.924477550175875e-06, + "loss": 9.7811, + "step": 238 + }, + { + "epoch": 0.014836426842137936, + "grad_norm": 0.7402030112836346, + "learning_rate": 4.945168632319471e-06, + "loss": 9.8856, + "step": 239 + }, + { + "epoch": 0.014898503941895834, + "grad_norm": 0.7575625764538182, + "learning_rate": 4.965859714463067e-06, + "loss": 9.83, + "step": 240 + }, + { + "epoch": 0.014960581041653733, + "grad_norm": 0.775839572339183, + "learning_rate": 4.9865507966066624e-06, + "loss": 9.8268, + "step": 241 + }, + { + "epoch": 0.015022658141411634, + "grad_norm": 0.7940682741786167, + "learning_rate": 5.007241878750258e-06, + "loss": 9.7773, + "step": 242 + }, + { + "epoch": 0.015084735241169532, + "grad_norm": 0.7555981956126174, + "learning_rate": 5.027932960893855e-06, + "loss": 9.8258, + "step": 243 + }, + { + "epoch": 0.015146812340927431, + "grad_norm": 0.7760782723892773, + "learning_rate": 5.048624043037451e-06, + "loss": 9.8003, + "step": 244 + }, + { + "epoch": 0.015208889440685332, + "grad_norm": 0.7467198565886576, + "learning_rate": 5.069315125181047e-06, + "loss": 9.8197, + "step": 245 + }, + { + "epoch": 0.01527096654044323, + "grad_norm": 0.7367183956335516, + "learning_rate": 5.090006207324644e-06, + "loss": 9.8415, + "step": 246 + }, + { + "epoch": 0.01533304364020113, + "grad_norm": 0.7353047106484706, + "learning_rate": 5.11069728946824e-06, + "loss": 9.8177, + "step": 247 + }, + { + "epoch": 0.01539512073995903, + "grad_norm": 0.760057878549066, + "learning_rate": 5.131388371611836e-06, + "loss": 9.7679, + "step": 248 + }, + { + "epoch": 0.015457197839716929, + "grad_norm": 0.7523761698708515, + "learning_rate": 5.1520794537554315e-06, + "loss": 9.7357, + "step": 249 + }, + { + "epoch": 0.015519274939474827, + "grad_norm": 0.7588497417445742, + "learning_rate": 5.1727705358990274e-06, + "loss": 9.7859, + "step": 250 + }, + { + "epoch": 0.015581352039232728, + "grad_norm": 0.7549953993600289, + "learning_rate": 5.193461618042624e-06, + "loss": 9.7473, + "step": 251 + }, + { + "epoch": 0.015643429138990625, + "grad_norm": 0.7807393503235351, + "learning_rate": 5.21415270018622e-06, + "loss": 9.6784, + "step": 252 + }, + { + "epoch": 0.015705506238748525, + "grad_norm": 0.74849104082534, + "learning_rate": 5.234843782329816e-06, + "loss": 9.7819, + "step": 253 + }, + { + "epoch": 0.015767583338506426, + "grad_norm": 0.7501265101227691, + "learning_rate": 5.255534864473413e-06, + "loss": 9.6689, + "step": 254 + }, + { + "epoch": 0.015829660438264323, + "grad_norm": 0.7746298105811636, + "learning_rate": 5.276225946617008e-06, + "loss": 9.6424, + "step": 255 + }, + { + "epoch": 0.015891737538022224, + "grad_norm": 0.7508950455848992, + "learning_rate": 5.296917028760604e-06, + "loss": 9.6437, + "step": 256 + }, + { + "epoch": 0.015953814637780124, + "grad_norm": 0.74625658585088, + "learning_rate": 5.317608110904201e-06, + "loss": 9.6585, + "step": 257 + }, + { + "epoch": 0.01601589173753802, + "grad_norm": 0.7223121442405258, + "learning_rate": 5.3382991930477965e-06, + "loss": 9.7514, + "step": 258 + }, + { + "epoch": 0.01607796883729592, + "grad_norm": 0.7445093077621544, + "learning_rate": 5.3589902751913924e-06, + "loss": 9.672, + "step": 259 + }, + { + "epoch": 0.016140045937053822, + "grad_norm": 0.733717928807764, + "learning_rate": 5.379681357334989e-06, + "loss": 9.7365, + "step": 260 + }, + { + "epoch": 0.01620212303681172, + "grad_norm": 0.7548323166553235, + "learning_rate": 5.400372439478585e-06, + "loss": 9.6354, + "step": 261 + }, + { + "epoch": 0.01626420013656962, + "grad_norm": 0.7395406003256544, + "learning_rate": 5.421063521622181e-06, + "loss": 9.6921, + "step": 262 + }, + { + "epoch": 0.01632627723632752, + "grad_norm": 0.7605364500486282, + "learning_rate": 5.441754603765777e-06, + "loss": 9.6556, + "step": 263 + }, + { + "epoch": 0.016388354336085417, + "grad_norm": 0.7443635461472138, + "learning_rate": 5.462445685909373e-06, + "loss": 9.6748, + "step": 264 + }, + { + "epoch": 0.016450431435843318, + "grad_norm": 0.7656986045489897, + "learning_rate": 5.48313676805297e-06, + "loss": 9.634, + "step": 265 + }, + { + "epoch": 0.01651250853560122, + "grad_norm": 0.7304933105256456, + "learning_rate": 5.503827850196566e-06, + "loss": 9.6594, + "step": 266 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.7192311542310669, + "learning_rate": 5.5245189323401615e-06, + "loss": 9.6682, + "step": 267 + }, + { + "epoch": 0.016636662735117016, + "grad_norm": 0.7201173050344655, + "learning_rate": 5.545210014483758e-06, + "loss": 9.6739, + "step": 268 + }, + { + "epoch": 0.016698739834874916, + "grad_norm": 0.7192815390822982, + "learning_rate": 5.565901096627354e-06, + "loss": 9.586, + "step": 269 + }, + { + "epoch": 0.016760816934632813, + "grad_norm": 0.7224637986719753, + "learning_rate": 5.586592178770949e-06, + "loss": 9.6228, + "step": 270 + }, + { + "epoch": 0.016822894034390714, + "grad_norm": 0.7384248220523391, + "learning_rate": 5.607283260914546e-06, + "loss": 9.5669, + "step": 271 + }, + { + "epoch": 0.01688497113414861, + "grad_norm": 0.70907571167488, + "learning_rate": 5.627974343058142e-06, + "loss": 9.6638, + "step": 272 + }, + { + "epoch": 0.01694704823390651, + "grad_norm": 0.7378211631411719, + "learning_rate": 5.648665425201739e-06, + "loss": 9.586, + "step": 273 + }, + { + "epoch": 0.017009125333664412, + "grad_norm": 0.6927657689158644, + "learning_rate": 5.669356507345335e-06, + "loss": 9.6512, + "step": 274 + }, + { + "epoch": 0.01707120243342231, + "grad_norm": 0.7166179464842456, + "learning_rate": 5.690047589488931e-06, + "loss": 9.6481, + "step": 275 + }, + { + "epoch": 0.01713327953318021, + "grad_norm": 0.737070614877722, + "learning_rate": 5.7107386716325265e-06, + "loss": 9.5836, + "step": 276 + }, + { + "epoch": 0.01719535663293811, + "grad_norm": 0.7404641069217155, + "learning_rate": 5.7314297537761224e-06, + "loss": 9.5205, + "step": 277 + }, + { + "epoch": 0.017257433732696007, + "grad_norm": 0.7320119295450983, + "learning_rate": 5.752120835919718e-06, + "loss": 9.5949, + "step": 278 + }, + { + "epoch": 0.017319510832453908, + "grad_norm": 0.7336569176921202, + "learning_rate": 5.772811918063315e-06, + "loss": 9.4866, + "step": 279 + }, + { + "epoch": 0.017381587932211808, + "grad_norm": 0.7192012365555449, + "learning_rate": 5.793503000206911e-06, + "loss": 9.5919, + "step": 280 + }, + { + "epoch": 0.017443665031969705, + "grad_norm": 0.7508927039264038, + "learning_rate": 5.814194082350508e-06, + "loss": 9.5387, + "step": 281 + }, + { + "epoch": 0.017505742131727606, + "grad_norm": 0.7437251174995909, + "learning_rate": 5.834885164494104e-06, + "loss": 9.4902, + "step": 282 + }, + { + "epoch": 0.017567819231485506, + "grad_norm": 0.712496463030686, + "learning_rate": 5.8555762466377e-06, + "loss": 9.5313, + "step": 283 + }, + { + "epoch": 0.017629896331243403, + "grad_norm": 0.7067383428164807, + "learning_rate": 5.876267328781296e-06, + "loss": 9.5875, + "step": 284 + }, + { + "epoch": 0.017691973431001304, + "grad_norm": 0.7127998189705427, + "learning_rate": 5.8969584109248915e-06, + "loss": 9.548, + "step": 285 + }, + { + "epoch": 0.017754050530759204, + "grad_norm": 0.7175323704319166, + "learning_rate": 5.9176494930684874e-06, + "loss": 9.4971, + "step": 286 + }, + { + "epoch": 0.0178161276305171, + "grad_norm": 0.7035609695080406, + "learning_rate": 5.938340575212084e-06, + "loss": 9.5311, + "step": 287 + }, + { + "epoch": 0.017878204730275002, + "grad_norm": 0.7030604014921921, + "learning_rate": 5.95903165735568e-06, + "loss": 9.5257, + "step": 288 + }, + { + "epoch": 0.017940281830032902, + "grad_norm": 0.7135850842114468, + "learning_rate": 5.979722739499276e-06, + "loss": 9.5098, + "step": 289 + }, + { + "epoch": 0.0180023589297908, + "grad_norm": 0.7040191263518636, + "learning_rate": 6.000413821642872e-06, + "loss": 9.4358, + "step": 290 + }, + { + "epoch": 0.0180644360295487, + "grad_norm": 0.7357200722526358, + "learning_rate": 6.021104903786468e-06, + "loss": 9.4213, + "step": 291 + }, + { + "epoch": 0.0181265131293066, + "grad_norm": 0.6923296091490438, + "learning_rate": 6.041795985930065e-06, + "loss": 9.5035, + "step": 292 + }, + { + "epoch": 0.018188590229064498, + "grad_norm": 0.6804017884760769, + "learning_rate": 6.062487068073661e-06, + "loss": 9.5005, + "step": 293 + }, + { + "epoch": 0.018250667328822398, + "grad_norm": 0.7382781820889465, + "learning_rate": 6.0831781502172565e-06, + "loss": 9.4662, + "step": 294 + }, + { + "epoch": 0.018312744428580295, + "grad_norm": 0.6793621205194336, + "learning_rate": 6.103869232360853e-06, + "loss": 9.4554, + "step": 295 + }, + { + "epoch": 0.018374821528338196, + "grad_norm": 0.6912963909220504, + "learning_rate": 6.124560314504449e-06, + "loss": 9.4348, + "step": 296 + }, + { + "epoch": 0.018436898628096096, + "grad_norm": 0.6662637690182375, + "learning_rate": 6.145251396648045e-06, + "loss": 9.4702, + "step": 297 + }, + { + "epoch": 0.018498975727853993, + "grad_norm": 0.7229827555661607, + "learning_rate": 6.165942478791641e-06, + "loss": 9.347, + "step": 298 + }, + { + "epoch": 0.018561052827611894, + "grad_norm": 0.6983336275421773, + "learning_rate": 6.186633560935237e-06, + "loss": 9.3816, + "step": 299 + }, + { + "epoch": 0.018623129927369794, + "grad_norm": 0.6698011283937907, + "learning_rate": 6.207324643078833e-06, + "loss": 9.3955, + "step": 300 + }, + { + "epoch": 0.01868520702712769, + "grad_norm": 0.7075418135752162, + "learning_rate": 6.22801572522243e-06, + "loss": 9.3722, + "step": 301 + }, + { + "epoch": 0.018747284126885592, + "grad_norm": 0.71788188089178, + "learning_rate": 6.248706807366026e-06, + "loss": 9.3331, + "step": 302 + }, + { + "epoch": 0.018809361226643492, + "grad_norm": 0.6939335124985132, + "learning_rate": 6.269397889509622e-06, + "loss": 9.3728, + "step": 303 + }, + { + "epoch": 0.01887143832640139, + "grad_norm": 0.6955506849050698, + "learning_rate": 6.2900889716532175e-06, + "loss": 9.4517, + "step": 304 + }, + { + "epoch": 0.01893351542615929, + "grad_norm": 0.7229262388916671, + "learning_rate": 6.310780053796813e-06, + "loss": 9.344, + "step": 305 + }, + { + "epoch": 0.01899559252591719, + "grad_norm": 0.7092930145612397, + "learning_rate": 6.33147113594041e-06, + "loss": 9.3261, + "step": 306 + }, + { + "epoch": 0.019057669625675087, + "grad_norm": 0.6890259627086119, + "learning_rate": 6.352162218084005e-06, + "loss": 9.3358, + "step": 307 + }, + { + "epoch": 0.019119746725432988, + "grad_norm": 0.688688140175073, + "learning_rate": 6.372853300227602e-06, + "loss": 9.3504, + "step": 308 + }, + { + "epoch": 0.01918182382519089, + "grad_norm": 0.6862223848230734, + "learning_rate": 6.393544382371199e-06, + "loss": 9.3954, + "step": 309 + }, + { + "epoch": 0.019243900924948786, + "grad_norm": 0.7139529321527637, + "learning_rate": 6.414235464514795e-06, + "loss": 9.2746, + "step": 310 + }, + { + "epoch": 0.019305978024706686, + "grad_norm": 0.6649160621486692, + "learning_rate": 6.434926546658391e-06, + "loss": 9.3926, + "step": 311 + }, + { + "epoch": 0.019368055124464587, + "grad_norm": 0.67851397174289, + "learning_rate": 6.4556176288019865e-06, + "loss": 9.3969, + "step": 312 + }, + { + "epoch": 0.019430132224222484, + "grad_norm": 0.7643882727854742, + "learning_rate": 6.476308710945583e-06, + "loss": 9.3138, + "step": 313 + }, + { + "epoch": 0.019492209323980384, + "grad_norm": 0.6868637782421964, + "learning_rate": 6.496999793089178e-06, + "loss": 9.3516, + "step": 314 + }, + { + "epoch": 0.019554286423738285, + "grad_norm": 0.6676846842711346, + "learning_rate": 6.517690875232775e-06, + "loss": 9.3596, + "step": 315 + }, + { + "epoch": 0.01961636352349618, + "grad_norm": 0.7526242518577285, + "learning_rate": 6.538381957376372e-06, + "loss": 9.3233, + "step": 316 + }, + { + "epoch": 0.019678440623254082, + "grad_norm": 0.7085757168091273, + "learning_rate": 6.559073039519967e-06, + "loss": 9.245, + "step": 317 + }, + { + "epoch": 0.01974051772301198, + "grad_norm": 0.829412251593877, + "learning_rate": 6.579764121663563e-06, + "loss": 9.2256, + "step": 318 + }, + { + "epoch": 0.01980259482276988, + "grad_norm": 0.6920031110357352, + "learning_rate": 6.60045520380716e-06, + "loss": 9.1833, + "step": 319 + }, + { + "epoch": 0.01986467192252778, + "grad_norm": 0.787700610872511, + "learning_rate": 6.621146285950755e-06, + "loss": 9.3018, + "step": 320 + }, + { + "epoch": 0.019926749022285677, + "grad_norm": 0.702659876947456, + "learning_rate": 6.6418373680943515e-06, + "loss": 9.288, + "step": 321 + }, + { + "epoch": 0.019988826122043578, + "grad_norm": 0.750604465403845, + "learning_rate": 6.662528450237948e-06, + "loss": 9.2521, + "step": 322 + }, + { + "epoch": 0.02005090322180148, + "grad_norm": 0.8504233484100705, + "learning_rate": 6.683219532381543e-06, + "loss": 9.3008, + "step": 323 + }, + { + "epoch": 0.020112980321559375, + "grad_norm": 0.6877670853009966, + "learning_rate": 6.70391061452514e-06, + "loss": 9.2064, + "step": 324 + }, + { + "epoch": 0.020175057421317276, + "grad_norm": 0.8168677162014957, + "learning_rate": 6.724601696668736e-06, + "loss": 9.209, + "step": 325 + }, + { + "epoch": 0.020237134521075176, + "grad_norm": 0.8428300858396647, + "learning_rate": 6.745292778812331e-06, + "loss": 9.2474, + "step": 326 + }, + { + "epoch": 0.020299211620833073, + "grad_norm": 0.7423648421579667, + "learning_rate": 6.765983860955928e-06, + "loss": 9.2425, + "step": 327 + }, + { + "epoch": 0.020361288720590974, + "grad_norm": 0.8915692727800731, + "learning_rate": 6.786674943099525e-06, + "loss": 9.2356, + "step": 328 + }, + { + "epoch": 0.020423365820348875, + "grad_norm": 0.7528388021711178, + "learning_rate": 6.8073660252431215e-06, + "loss": 9.1339, + "step": 329 + }, + { + "epoch": 0.02048544292010677, + "grad_norm": 0.9302786222061522, + "learning_rate": 6.8280571073867165e-06, + "loss": 9.2085, + "step": 330 + }, + { + "epoch": 0.020547520019864672, + "grad_norm": 0.9367285733507243, + "learning_rate": 6.848748189530313e-06, + "loss": 9.1822, + "step": 331 + }, + { + "epoch": 0.020609597119622573, + "grad_norm": 0.7062103184380216, + "learning_rate": 6.869439271673909e-06, + "loss": 9.1891, + "step": 332 + }, + { + "epoch": 0.02067167421938047, + "grad_norm": 1.048287875873381, + "learning_rate": 6.890130353817504e-06, + "loss": 9.1886, + "step": 333 + }, + { + "epoch": 0.02073375131913837, + "grad_norm": 0.7023201044194932, + "learning_rate": 6.910821435961101e-06, + "loss": 9.2017, + "step": 334 + }, + { + "epoch": 0.02079582841889627, + "grad_norm": 0.9564368445668321, + "learning_rate": 6.931512518104698e-06, + "loss": 9.2709, + "step": 335 + }, + { + "epoch": 0.020857905518654168, + "grad_norm": 0.6704369836205871, + "learning_rate": 6.952203600248293e-06, + "loss": 9.2109, + "step": 336 + }, + { + "epoch": 0.020919982618412068, + "grad_norm": 0.6497112311303878, + "learning_rate": 6.97289468239189e-06, + "loss": 9.1495, + "step": 337 + }, + { + "epoch": 0.02098205971816997, + "grad_norm": 0.7726079943084091, + "learning_rate": 6.993585764535486e-06, + "loss": 9.2015, + "step": 338 + }, + { + "epoch": 0.021044136817927866, + "grad_norm": 0.6542508464963726, + "learning_rate": 7.0142768466790815e-06, + "loss": 9.2229, + "step": 339 + }, + { + "epoch": 0.021106213917685766, + "grad_norm": 0.7353738930846421, + "learning_rate": 7.0349679288226775e-06, + "loss": 9.1848, + "step": 340 + }, + { + "epoch": 0.021168291017443663, + "grad_norm": 0.6386722716350666, + "learning_rate": 7.055659010966274e-06, + "loss": 9.132, + "step": 341 + }, + { + "epoch": 0.021230368117201564, + "grad_norm": 0.6095353429090216, + "learning_rate": 7.076350093109869e-06, + "loss": 9.1723, + "step": 342 + }, + { + "epoch": 0.021292445216959464, + "grad_norm": 0.7013667481351202, + "learning_rate": 7.097041175253466e-06, + "loss": 9.1014, + "step": 343 + }, + { + "epoch": 0.02135452231671736, + "grad_norm": 0.6488034697825575, + "learning_rate": 7.117732257397063e-06, + "loss": 9.1033, + "step": 344 + }, + { + "epoch": 0.021416599416475262, + "grad_norm": 0.7033518531208137, + "learning_rate": 7.138423339540658e-06, + "loss": 9.1355, + "step": 345 + }, + { + "epoch": 0.021478676516233162, + "grad_norm": 0.6982002085624499, + "learning_rate": 7.159114421684254e-06, + "loss": 9.0217, + "step": 346 + }, + { + "epoch": 0.02154075361599106, + "grad_norm": 0.5955597001313278, + "learning_rate": 7.179805503827851e-06, + "loss": 9.0909, + "step": 347 + }, + { + "epoch": 0.02160283071574896, + "grad_norm": 0.7236950393258998, + "learning_rate": 7.200496585971446e-06, + "loss": 9.0773, + "step": 348 + }, + { + "epoch": 0.02166490781550686, + "grad_norm": 0.6431051023943734, + "learning_rate": 7.2211876681150425e-06, + "loss": 9.1337, + "step": 349 + }, + { + "epoch": 0.021726984915264758, + "grad_norm": 0.7645200034444204, + "learning_rate": 7.241878750258639e-06, + "loss": 9.0661, + "step": 350 + }, + { + "epoch": 0.021789062015022658, + "grad_norm": 0.6658198272980824, + "learning_rate": 7.262569832402236e-06, + "loss": 9.0367, + "step": 351 + }, + { + "epoch": 0.02185113911478056, + "grad_norm": 0.642337649721702, + "learning_rate": 7.283260914545831e-06, + "loss": 9.0363, + "step": 352 + }, + { + "epoch": 0.021913216214538456, + "grad_norm": 0.6211201404267044, + "learning_rate": 7.303951996689427e-06, + "loss": 9.0671, + "step": 353 + }, + { + "epoch": 0.021975293314296356, + "grad_norm": 0.7455592212935885, + "learning_rate": 7.324643078833024e-06, + "loss": 9.0279, + "step": 354 + }, + { + "epoch": 0.022037370414054257, + "grad_norm": 0.8625561530036773, + "learning_rate": 7.345334160976619e-06, + "loss": 8.9901, + "step": 355 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.6853588965940248, + "learning_rate": 7.366025243120216e-06, + "loss": 9.0065, + "step": 356 + }, + { + "epoch": 0.022161524613570054, + "grad_norm": 0.5704039408915466, + "learning_rate": 7.386716325263812e-06, + "loss": 9.0281, + "step": 357 + }, + { + "epoch": 0.022223601713327955, + "grad_norm": 0.8179925720185376, + "learning_rate": 7.4074074074074075e-06, + "loss": 9.0244, + "step": 358 + }, + { + "epoch": 0.022285678813085852, + "grad_norm": 0.6056045133906908, + "learning_rate": 7.428098489551004e-06, + "loss": 9.038, + "step": 359 + }, + { + "epoch": 0.022347755912843752, + "grad_norm": 0.7032083673857082, + "learning_rate": 7.4487895716946e-06, + "loss": 9.0518, + "step": 360 + }, + { + "epoch": 0.022409833012601653, + "grad_norm": 0.9230652254002261, + "learning_rate": 7.469480653838195e-06, + "loss": 9.0163, + "step": 361 + }, + { + "epoch": 0.02247191011235955, + "grad_norm": 0.6119608179682945, + "learning_rate": 7.490171735981792e-06, + "loss": 8.9002, + "step": 362 + }, + { + "epoch": 0.02253398721211745, + "grad_norm": 1.1079661597685742, + "learning_rate": 7.510862818125389e-06, + "loss": 9.0316, + "step": 363 + }, + { + "epoch": 0.022596064311875348, + "grad_norm": 0.6420719506849202, + "learning_rate": 7.531553900268984e-06, + "loss": 8.9621, + "step": 364 + }, + { + "epoch": 0.022658141411633248, + "grad_norm": 0.9636106758493018, + "learning_rate": 7.552244982412581e-06, + "loss": 9.0059, + "step": 365 + }, + { + "epoch": 0.02272021851139115, + "grad_norm": 0.7828644664822604, + "learning_rate": 7.572936064556177e-06, + "loss": 8.9177, + "step": 366 + }, + { + "epoch": 0.022782295611149046, + "grad_norm": 0.7631843356997153, + "learning_rate": 7.5936271466997725e-06, + "loss": 8.9164, + "step": 367 + }, + { + "epoch": 0.022844372710906946, + "grad_norm": 0.7760568826365665, + "learning_rate": 7.614318228843368e-06, + "loss": 8.9006, + "step": 368 + }, + { + "epoch": 0.022906449810664847, + "grad_norm": 0.578909007283387, + "learning_rate": 7.635009310986966e-06, + "loss": 8.9886, + "step": 369 + }, + { + "epoch": 0.022968526910422744, + "grad_norm": 1.0658444332492274, + "learning_rate": 7.655700393130561e-06, + "loss": 8.9736, + "step": 370 + }, + { + "epoch": 0.023030604010180644, + "grad_norm": 0.7078455245711638, + "learning_rate": 7.676391475274156e-06, + "loss": 8.819, + "step": 371 + }, + { + "epoch": 0.023092681109938545, + "grad_norm": 0.7154391368061775, + "learning_rate": 7.697082557417753e-06, + "loss": 8.8588, + "step": 372 + }, + { + "epoch": 0.023154758209696442, + "grad_norm": 0.6014288221183335, + "learning_rate": 7.71777363956135e-06, + "loss": 8.9127, + "step": 373 + }, + { + "epoch": 0.023216835309454342, + "grad_norm": 0.6693046774019235, + "learning_rate": 7.738464721704945e-06, + "loss": 8.9346, + "step": 374 + }, + { + "epoch": 0.023278912409212243, + "grad_norm": 0.5821779821399026, + "learning_rate": 7.759155803848542e-06, + "loss": 8.8529, + "step": 375 + }, + { + "epoch": 0.02334098950897014, + "grad_norm": 0.8102504693830026, + "learning_rate": 7.779846885992138e-06, + "loss": 8.8868, + "step": 376 + }, + { + "epoch": 0.02340306660872804, + "grad_norm": 0.7193106662144506, + "learning_rate": 7.800537968135733e-06, + "loss": 8.9204, + "step": 377 + }, + { + "epoch": 0.02346514370848594, + "grad_norm": 0.603467550492528, + "learning_rate": 7.82122905027933e-06, + "loss": 8.8164, + "step": 378 + }, + { + "epoch": 0.023527220808243838, + "grad_norm": 0.651955380444428, + "learning_rate": 7.841920132422927e-06, + "loss": 8.8715, + "step": 379 + }, + { + "epoch": 0.02358929790800174, + "grad_norm": 0.6995556195691908, + "learning_rate": 7.862611214566522e-06, + "loss": 8.8413, + "step": 380 + }, + { + "epoch": 0.02365137500775964, + "grad_norm": 0.7794425656078579, + "learning_rate": 7.883302296710119e-06, + "loss": 8.9024, + "step": 381 + }, + { + "epoch": 0.023713452107517536, + "grad_norm": 0.6135384330167761, + "learning_rate": 7.903993378853716e-06, + "loss": 8.8383, + "step": 382 + }, + { + "epoch": 0.023775529207275437, + "grad_norm": 0.5953543720888643, + "learning_rate": 7.92468446099731e-06, + "loss": 8.8399, + "step": 383 + }, + { + "epoch": 0.023837606307033337, + "grad_norm": 0.6570079377491771, + "learning_rate": 7.945375543140907e-06, + "loss": 8.7646, + "step": 384 + }, + { + "epoch": 0.023899683406791234, + "grad_norm": 0.5816013617550126, + "learning_rate": 7.966066625284502e-06, + "loss": 8.83, + "step": 385 + }, + { + "epoch": 0.023961760506549135, + "grad_norm": 0.6828984893845242, + "learning_rate": 7.986757707428098e-06, + "loss": 8.7941, + "step": 386 + }, + { + "epoch": 0.02402383760630703, + "grad_norm": 0.6362183599717712, + "learning_rate": 8.007448789571694e-06, + "loss": 8.8061, + "step": 387 + }, + { + "epoch": 0.024085914706064932, + "grad_norm": 0.8647495056464135, + "learning_rate": 8.028139871715291e-06, + "loss": 8.7476, + "step": 388 + }, + { + "epoch": 0.024147991805822833, + "grad_norm": 0.7666600296175506, + "learning_rate": 8.048830953858886e-06, + "loss": 8.7687, + "step": 389 + }, + { + "epoch": 0.02421006890558073, + "grad_norm": 0.9028482389258926, + "learning_rate": 8.069522036002483e-06, + "loss": 8.7613, + "step": 390 + }, + { + "epoch": 0.02427214600533863, + "grad_norm": 0.9057967927213502, + "learning_rate": 8.09021311814608e-06, + "loss": 8.7762, + "step": 391 + }, + { + "epoch": 0.02433422310509653, + "grad_norm": 0.667705193887631, + "learning_rate": 8.110904200289675e-06, + "loss": 8.77, + "step": 392 + }, + { + "epoch": 0.024396300204854428, + "grad_norm": 0.6843735089568874, + "learning_rate": 8.131595282433272e-06, + "loss": 8.8071, + "step": 393 + }, + { + "epoch": 0.02445837730461233, + "grad_norm": 0.6493371444697357, + "learning_rate": 8.152286364576868e-06, + "loss": 8.7693, + "step": 394 + }, + { + "epoch": 0.02452045440437023, + "grad_norm": 0.5660910156672514, + "learning_rate": 8.172977446720465e-06, + "loss": 8.694, + "step": 395 + }, + { + "epoch": 0.024582531504128126, + "grad_norm": 0.9285552880173417, + "learning_rate": 8.19366852886406e-06, + "loss": 8.7564, + "step": 396 + }, + { + "epoch": 0.024644608603886026, + "grad_norm": 0.6254711997493786, + "learning_rate": 8.214359611007657e-06, + "loss": 8.7238, + "step": 397 + }, + { + "epoch": 0.024706685703643927, + "grad_norm": 1.1748744471411, + "learning_rate": 8.235050693151252e-06, + "loss": 8.6861, + "step": 398 + }, + { + "epoch": 0.024768762803401824, + "grad_norm": 0.617874173958126, + "learning_rate": 8.255741775294847e-06, + "loss": 8.6906, + "step": 399 + }, + { + "epoch": 0.024830839903159724, + "grad_norm": 0.9467142604451334, + "learning_rate": 8.276432857438444e-06, + "loss": 8.7641, + "step": 400 + }, + { + "epoch": 0.024892917002917625, + "grad_norm": 0.6397245871373687, + "learning_rate": 8.29712393958204e-06, + "loss": 8.7389, + "step": 401 + }, + { + "epoch": 0.024954994102675522, + "grad_norm": 0.6809407223349067, + "learning_rate": 8.317815021725636e-06, + "loss": 8.6956, + "step": 402 + }, + { + "epoch": 0.025017071202433423, + "grad_norm": 0.627930114651026, + "learning_rate": 8.338506103869233e-06, + "loss": 8.7315, + "step": 403 + }, + { + "epoch": 0.025079148302191323, + "grad_norm": 0.5764786501650738, + "learning_rate": 8.35919718601283e-06, + "loss": 8.6878, + "step": 404 + }, + { + "epoch": 0.02514122540194922, + "grad_norm": 0.740513288547403, + "learning_rate": 8.379888268156424e-06, + "loss": 8.6631, + "step": 405 + }, + { + "epoch": 0.02520330250170712, + "grad_norm": 0.8780405485874124, + "learning_rate": 8.400579350300021e-06, + "loss": 8.6595, + "step": 406 + }, + { + "epoch": 0.02526537960146502, + "grad_norm": 0.6235388365108941, + "learning_rate": 8.421270432443618e-06, + "loss": 8.6835, + "step": 407 + }, + { + "epoch": 0.025327456701222918, + "grad_norm": 0.5847946720388079, + "learning_rate": 8.441961514587213e-06, + "loss": 8.6981, + "step": 408 + }, + { + "epoch": 0.02538953380098082, + "grad_norm": 0.5999318352364379, + "learning_rate": 8.46265259673081e-06, + "loss": 8.7007, + "step": 409 + }, + { + "epoch": 0.025451610900738716, + "grad_norm": 0.5471843389553787, + "learning_rate": 8.483343678874407e-06, + "loss": 8.6224, + "step": 410 + }, + { + "epoch": 0.025513688000496616, + "grad_norm": 0.6210888076681551, + "learning_rate": 8.504034761018002e-06, + "loss": 8.7236, + "step": 411 + }, + { + "epoch": 0.025575765100254517, + "grad_norm": 0.5794472908753928, + "learning_rate": 8.524725843161598e-06, + "loss": 8.6385, + "step": 412 + }, + { + "epoch": 0.025637842200012414, + "grad_norm": 0.6732935899943532, + "learning_rate": 8.545416925305193e-06, + "loss": 8.6618, + "step": 413 + }, + { + "epoch": 0.025699919299770314, + "grad_norm": 0.5445970777349839, + "learning_rate": 8.56610800744879e-06, + "loss": 8.5789, + "step": 414 + }, + { + "epoch": 0.025761996399528215, + "grad_norm": 0.5813754182271492, + "learning_rate": 8.586799089592385e-06, + "loss": 8.5973, + "step": 415 + }, + { + "epoch": 0.025824073499286112, + "grad_norm": 0.5513846696979944, + "learning_rate": 8.607490171735982e-06, + "loss": 8.6878, + "step": 416 + }, + { + "epoch": 0.025886150599044012, + "grad_norm": 0.5381635827954443, + "learning_rate": 8.628181253879579e-06, + "loss": 8.6536, + "step": 417 + }, + { + "epoch": 0.025948227698801913, + "grad_norm": 0.5418024504250235, + "learning_rate": 8.648872336023174e-06, + "loss": 8.5343, + "step": 418 + }, + { + "epoch": 0.02601030479855981, + "grad_norm": 0.6546310482404955, + "learning_rate": 8.66956341816677e-06, + "loss": 8.557, + "step": 419 + }, + { + "epoch": 0.02607238189831771, + "grad_norm": 0.5777495685924073, + "learning_rate": 8.690254500310367e-06, + "loss": 8.5328, + "step": 420 + }, + { + "epoch": 0.02613445899807561, + "grad_norm": 0.6205606872104888, + "learning_rate": 8.710945582453963e-06, + "loss": 8.6036, + "step": 421 + }, + { + "epoch": 0.026196536097833508, + "grad_norm": 0.5631243896057682, + "learning_rate": 8.73163666459756e-06, + "loss": 8.6062, + "step": 422 + }, + { + "epoch": 0.02625861319759141, + "grad_norm": 0.5853130573569443, + "learning_rate": 8.752327746741156e-06, + "loss": 8.6166, + "step": 423 + }, + { + "epoch": 0.02632069029734931, + "grad_norm": 0.6664636620748433, + "learning_rate": 8.773018828884751e-06, + "loss": 8.5557, + "step": 424 + }, + { + "epoch": 0.026382767397107206, + "grad_norm": 0.5624563412147694, + "learning_rate": 8.793709911028348e-06, + "loss": 8.4795, + "step": 425 + }, + { + "epoch": 0.026444844496865107, + "grad_norm": 0.830053315633385, + "learning_rate": 8.814400993171943e-06, + "loss": 8.5246, + "step": 426 + }, + { + "epoch": 0.026506921596623007, + "grad_norm": 0.5521487851001791, + "learning_rate": 8.835092075315538e-06, + "loss": 8.5472, + "step": 427 + }, + { + "epoch": 0.026568998696380904, + "grad_norm": 0.5417068142562156, + "learning_rate": 8.855783157459135e-06, + "loss": 8.5477, + "step": 428 + }, + { + "epoch": 0.026631075796138805, + "grad_norm": 0.7514395012402532, + "learning_rate": 8.876474239602732e-06, + "loss": 8.5307, + "step": 429 + }, + { + "epoch": 0.026693152895896705, + "grad_norm": 0.5542485555458776, + "learning_rate": 8.897165321746327e-06, + "loss": 8.5434, + "step": 430 + }, + { + "epoch": 0.026755229995654602, + "grad_norm": 0.6956951074496999, + "learning_rate": 8.917856403889923e-06, + "loss": 8.5795, + "step": 431 + }, + { + "epoch": 0.026817307095412503, + "grad_norm": 0.5970739382607854, + "learning_rate": 8.93854748603352e-06, + "loss": 8.5562, + "step": 432 + }, + { + "epoch": 0.026879384195170403, + "grad_norm": 0.6792015881911851, + "learning_rate": 8.959238568177115e-06, + "loss": 8.5052, + "step": 433 + }, + { + "epoch": 0.0269414612949283, + "grad_norm": 0.5682063994725691, + "learning_rate": 8.979929650320712e-06, + "loss": 8.5663, + "step": 434 + }, + { + "epoch": 0.0270035383946862, + "grad_norm": 0.514967036210778, + "learning_rate": 9.000620732464309e-06, + "loss": 8.5651, + "step": 435 + }, + { + "epoch": 0.027065615494444098, + "grad_norm": 0.7423932643557078, + "learning_rate": 9.021311814607906e-06, + "loss": 8.4736, + "step": 436 + }, + { + "epoch": 0.027127692594202, + "grad_norm": 0.674970923786521, + "learning_rate": 9.0420028967515e-06, + "loss": 8.54, + "step": 437 + }, + { + "epoch": 0.0271897696939599, + "grad_norm": 0.5419761586611381, + "learning_rate": 9.062693978895097e-06, + "loss": 8.4833, + "step": 438 + }, + { + "epoch": 0.027251846793717796, + "grad_norm": 0.5189542219023882, + "learning_rate": 9.083385061038693e-06, + "loss": 8.4969, + "step": 439 + }, + { + "epoch": 0.027313923893475697, + "grad_norm": 0.6817110566311242, + "learning_rate": 9.10407614318229e-06, + "loss": 8.4799, + "step": 440 + }, + { + "epoch": 0.027376000993233597, + "grad_norm": 0.5853832929969912, + "learning_rate": 9.124767225325884e-06, + "loss": 8.4916, + "step": 441 + }, + { + "epoch": 0.027438078092991494, + "grad_norm": 0.8650607931680971, + "learning_rate": 9.145458307469481e-06, + "loss": 8.5226, + "step": 442 + }, + { + "epoch": 0.027500155192749395, + "grad_norm": 0.9144846601560553, + "learning_rate": 9.166149389613076e-06, + "loss": 8.5321, + "step": 443 + }, + { + "epoch": 0.027562232292507295, + "grad_norm": 0.5704732433216316, + "learning_rate": 9.186840471756673e-06, + "loss": 8.503, + "step": 444 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.8285744724840242, + "learning_rate": 9.20753155390027e-06, + "loss": 8.5015, + "step": 445 + }, + { + "epoch": 0.027686386492023093, + "grad_norm": 0.6421456780470957, + "learning_rate": 9.228222636043865e-06, + "loss": 8.4105, + "step": 446 + }, + { + "epoch": 0.027748463591780993, + "grad_norm": 0.874447886801389, + "learning_rate": 9.248913718187462e-06, + "loss": 8.52, + "step": 447 + }, + { + "epoch": 0.02781054069153889, + "grad_norm": 0.6606014714160832, + "learning_rate": 9.269604800331058e-06, + "loss": 8.4662, + "step": 448 + }, + { + "epoch": 0.02787261779129679, + "grad_norm": 0.7312312506855937, + "learning_rate": 9.290295882474653e-06, + "loss": 8.4706, + "step": 449 + }, + { + "epoch": 0.02793469489105469, + "grad_norm": 0.9962118041447543, + "learning_rate": 9.31098696461825e-06, + "loss": 8.4561, + "step": 450 + }, + { + "epoch": 0.02799677199081259, + "grad_norm": 0.9384510302767092, + "learning_rate": 9.331678046761847e-06, + "loss": 8.3969, + "step": 451 + }, + { + "epoch": 0.02805884909057049, + "grad_norm": 0.6516896926449344, + "learning_rate": 9.352369128905442e-06, + "loss": 8.4148, + "step": 452 + }, + { + "epoch": 0.02812092619032839, + "grad_norm": 0.4967112715291057, + "learning_rate": 9.373060211049039e-06, + "loss": 8.4256, + "step": 453 + }, + { + "epoch": 0.028183003290086286, + "grad_norm": 0.6904383204242814, + "learning_rate": 9.393751293192634e-06, + "loss": 8.3598, + "step": 454 + }, + { + "epoch": 0.028245080389844187, + "grad_norm": 0.6100873695793734, + "learning_rate": 9.41444237533623e-06, + "loss": 8.2853, + "step": 455 + }, + { + "epoch": 0.028307157489602087, + "grad_norm": 0.5069493808099562, + "learning_rate": 9.435133457479826e-06, + "loss": 8.4488, + "step": 456 + }, + { + "epoch": 0.028369234589359985, + "grad_norm": 0.6217392739788433, + "learning_rate": 9.455824539623423e-06, + "loss": 8.3111, + "step": 457 + }, + { + "epoch": 0.028431311689117885, + "grad_norm": 0.5508288640212718, + "learning_rate": 9.47651562176702e-06, + "loss": 8.436, + "step": 458 + }, + { + "epoch": 0.028493388788875782, + "grad_norm": 0.6596310123056107, + "learning_rate": 9.497206703910614e-06, + "loss": 8.3724, + "step": 459 + }, + { + "epoch": 0.028555465888633683, + "grad_norm": 0.6335047479435033, + "learning_rate": 9.517897786054211e-06, + "loss": 8.3783, + "step": 460 + }, + { + "epoch": 0.028617542988391583, + "grad_norm": 0.7011994805387389, + "learning_rate": 9.538588868197808e-06, + "loss": 8.3719, + "step": 461 + }, + { + "epoch": 0.02867962008814948, + "grad_norm": 0.9657787163612793, + "learning_rate": 9.559279950341403e-06, + "loss": 8.3817, + "step": 462 + }, + { + "epoch": 0.02874169718790738, + "grad_norm": 0.6086535217549915, + "learning_rate": 9.579971032485e-06, + "loss": 8.3063, + "step": 463 + }, + { + "epoch": 0.02880377428766528, + "grad_norm": 0.8493633747217534, + "learning_rate": 9.600662114628597e-06, + "loss": 8.3649, + "step": 464 + }, + { + "epoch": 0.02886585138742318, + "grad_norm": 1.0170964658817367, + "learning_rate": 9.621353196772192e-06, + "loss": 8.3726, + "step": 465 + }, + { + "epoch": 0.02892792848718108, + "grad_norm": 0.6975394600811924, + "learning_rate": 9.642044278915788e-06, + "loss": 8.3659, + "step": 466 + }, + { + "epoch": 0.02899000558693898, + "grad_norm": 0.8966542666817504, + "learning_rate": 9.662735361059385e-06, + "loss": 8.2775, + "step": 467 + }, + { + "epoch": 0.029052082686696876, + "grad_norm": 0.8580739316032246, + "learning_rate": 9.68342644320298e-06, + "loss": 8.2836, + "step": 468 + }, + { + "epoch": 0.029114159786454777, + "grad_norm": 0.5306416843428782, + "learning_rate": 9.704117525346575e-06, + "loss": 8.3493, + "step": 469 + }, + { + "epoch": 0.029176236886212677, + "grad_norm": 1.0625397169880295, + "learning_rate": 9.724808607490172e-06, + "loss": 8.2186, + "step": 470 + }, + { + "epoch": 0.029238313985970574, + "grad_norm": 0.6104638568573694, + "learning_rate": 9.745499689633767e-06, + "loss": 8.2686, + "step": 471 + }, + { + "epoch": 0.029300391085728475, + "grad_norm": 0.6820074586334581, + "learning_rate": 9.766190771777364e-06, + "loss": 8.3692, + "step": 472 + }, + { + "epoch": 0.029362468185486375, + "grad_norm": 0.5177780628897101, + "learning_rate": 9.78688185392096e-06, + "loss": 8.2288, + "step": 473 + }, + { + "epoch": 0.029424545285244273, + "grad_norm": 0.9082081784343615, + "learning_rate": 9.807572936064556e-06, + "loss": 8.2926, + "step": 474 + }, + { + "epoch": 0.029486622385002173, + "grad_norm": 0.7891494400926373, + "learning_rate": 9.828264018208153e-06, + "loss": 8.3659, + "step": 475 + }, + { + "epoch": 0.029548699484760074, + "grad_norm": 0.6744632768170747, + "learning_rate": 9.84895510035175e-06, + "loss": 8.2732, + "step": 476 + }, + { + "epoch": 0.02961077658451797, + "grad_norm": 1.0056136636705535, + "learning_rate": 9.869646182495346e-06, + "loss": 8.3287, + "step": 477 + }, + { + "epoch": 0.02967285368427587, + "grad_norm": 0.8314628988121691, + "learning_rate": 9.890337264638941e-06, + "loss": 8.2156, + "step": 478 + }, + { + "epoch": 0.02973493078403377, + "grad_norm": 1.0003405229623026, + "learning_rate": 9.911028346782538e-06, + "loss": 8.2527, + "step": 479 + }, + { + "epoch": 0.02979700788379167, + "grad_norm": 0.7145890162457926, + "learning_rate": 9.931719428926135e-06, + "loss": 8.2358, + "step": 480 + }, + { + "epoch": 0.02985908498354957, + "grad_norm": 0.946783985510081, + "learning_rate": 9.95241051106973e-06, + "loss": 8.3192, + "step": 481 + }, + { + "epoch": 0.029921162083307466, + "grad_norm": 1.0093254738576847, + "learning_rate": 9.973101593213325e-06, + "loss": 8.2699, + "step": 482 + }, + { + "epoch": 0.029983239183065367, + "grad_norm": 0.6867007464760205, + "learning_rate": 9.993792675356922e-06, + "loss": 8.2176, + "step": 483 + }, + { + "epoch": 0.030045316282823267, + "grad_norm": 0.9249459212191938, + "learning_rate": 1.0014483757500517e-05, + "loss": 8.1982, + "step": 484 + }, + { + "epoch": 0.030107393382581164, + "grad_norm": 0.6269164980828517, + "learning_rate": 1.0035174839644113e-05, + "loss": 8.311, + "step": 485 + }, + { + "epoch": 0.030169470482339065, + "grad_norm": 0.6731332706874843, + "learning_rate": 1.005586592178771e-05, + "loss": 8.2615, + "step": 486 + }, + { + "epoch": 0.030231547582096965, + "grad_norm": 0.6155600518371502, + "learning_rate": 1.0076557003931305e-05, + "loss": 8.2079, + "step": 487 + }, + { + "epoch": 0.030293624681854862, + "grad_norm": 1.3542717477180033, + "learning_rate": 1.0097248086074902e-05, + "loss": 8.2296, + "step": 488 + }, + { + "epoch": 0.030355701781612763, + "grad_norm": 1.3065169522499196, + "learning_rate": 1.0117939168218499e-05, + "loss": 8.1729, + "step": 489 + }, + { + "epoch": 0.030417778881370663, + "grad_norm": 0.7278553934262108, + "learning_rate": 1.0138630250362094e-05, + "loss": 8.2593, + "step": 490 + }, + { + "epoch": 0.03047985598112856, + "grad_norm": 1.4972788447138077, + "learning_rate": 1.015932133250569e-05, + "loss": 8.0967, + "step": 491 + }, + { + "epoch": 0.03054193308088646, + "grad_norm": 0.5952489653798244, + "learning_rate": 1.0180012414649287e-05, + "loss": 8.2206, + "step": 492 + }, + { + "epoch": 0.03060401018064436, + "grad_norm": 1.0994888700379004, + "learning_rate": 1.0200703496792883e-05, + "loss": 8.2067, + "step": 493 + }, + { + "epoch": 0.03066608728040226, + "grad_norm": 0.5925104352780558, + "learning_rate": 1.022139457893648e-05, + "loss": 8.1634, + "step": 494 + }, + { + "epoch": 0.03072816438016016, + "grad_norm": 1.14973495787759, + "learning_rate": 1.0242085661080076e-05, + "loss": 8.2031, + "step": 495 + }, + { + "epoch": 0.03079024147991806, + "grad_norm": 0.982334656379783, + "learning_rate": 1.0262776743223671e-05, + "loss": 8.1612, + "step": 496 + }, + { + "epoch": 0.030852318579675957, + "grad_norm": 0.64368936290151, + "learning_rate": 1.0283467825367266e-05, + "loss": 8.2122, + "step": 497 + }, + { + "epoch": 0.030914395679433857, + "grad_norm": 1.060077598538473, + "learning_rate": 1.0304158907510863e-05, + "loss": 8.08, + "step": 498 + }, + { + "epoch": 0.030976472779191758, + "grad_norm": 0.7217602097701215, + "learning_rate": 1.032484998965446e-05, + "loss": 8.1665, + "step": 499 + }, + { + "epoch": 0.031038549878949655, + "grad_norm": 1.3088875056538147, + "learning_rate": 1.0345541071798055e-05, + "loss": 8.1731, + "step": 500 + }, + { + "epoch": 0.031100626978707555, + "grad_norm": 0.7091263137689555, + "learning_rate": 1.0366232153941652e-05, + "loss": 8.1131, + "step": 501 + }, + { + "epoch": 0.031162704078465456, + "grad_norm": 0.6755686519249876, + "learning_rate": 1.0386923236085248e-05, + "loss": 8.0752, + "step": 502 + }, + { + "epoch": 0.031224781178223353, + "grad_norm": 0.5748618281249551, + "learning_rate": 1.0407614318228843e-05, + "loss": 8.1117, + "step": 503 + }, + { + "epoch": 0.03128685827798125, + "grad_norm": 0.5228953044322332, + "learning_rate": 1.042830540037244e-05, + "loss": 8.1538, + "step": 504 + }, + { + "epoch": 0.031348935377739154, + "grad_norm": 0.8469508527731063, + "learning_rate": 1.0448996482516037e-05, + "loss": 7.9936, + "step": 505 + }, + { + "epoch": 0.03141101247749705, + "grad_norm": 0.8314829707830519, + "learning_rate": 1.0469687564659632e-05, + "loss": 8.1637, + "step": 506 + }, + { + "epoch": 0.03147308957725495, + "grad_norm": 0.6619732021860439, + "learning_rate": 1.0490378646803229e-05, + "loss": 8.1155, + "step": 507 + }, + { + "epoch": 0.03153516667701285, + "grad_norm": 0.6820821124815302, + "learning_rate": 1.0511069728946826e-05, + "loss": 8.0999, + "step": 508 + }, + { + "epoch": 0.03159724377677075, + "grad_norm": 0.6862130908386143, + "learning_rate": 1.053176081109042e-05, + "loss": 8.0928, + "step": 509 + }, + { + "epoch": 0.031659320876528646, + "grad_norm": 0.6101472314224258, + "learning_rate": 1.0552451893234016e-05, + "loss": 8.0991, + "step": 510 + }, + { + "epoch": 0.03172139797628655, + "grad_norm": 0.6549835928470881, + "learning_rate": 1.0573142975377613e-05, + "loss": 8.082, + "step": 511 + }, + { + "epoch": 0.03178347507604445, + "grad_norm": 0.6815177128981513, + "learning_rate": 1.0593834057521208e-05, + "loss": 8.1232, + "step": 512 + }, + { + "epoch": 0.031845552175802344, + "grad_norm": 0.5871264792138012, + "learning_rate": 1.0614525139664804e-05, + "loss": 8.0947, + "step": 513 + }, + { + "epoch": 0.03190762927556025, + "grad_norm": 0.6377593588103097, + "learning_rate": 1.0635216221808401e-05, + "loss": 8.0375, + "step": 514 + }, + { + "epoch": 0.031969706375318145, + "grad_norm": 0.6125898655341752, + "learning_rate": 1.0655907303951996e-05, + "loss": 8.1442, + "step": 515 + }, + { + "epoch": 0.03203178347507604, + "grad_norm": 0.6515587749420458, + "learning_rate": 1.0676598386095593e-05, + "loss": 8.0708, + "step": 516 + }, + { + "epoch": 0.032093860574833946, + "grad_norm": 1.0828098842044298, + "learning_rate": 1.069728946823919e-05, + "loss": 8.0713, + "step": 517 + }, + { + "epoch": 0.03215593767459184, + "grad_norm": 2.3890528277643583, + "learning_rate": 1.0717980550382785e-05, + "loss": 8.1381, + "step": 518 + }, + { + "epoch": 0.03221801477434974, + "grad_norm": 0.9590814545201147, + "learning_rate": 1.0738671632526382e-05, + "loss": 7.979, + "step": 519 + }, + { + "epoch": 0.032280091874107644, + "grad_norm": 1.8903824831948097, + "learning_rate": 1.0759362714669978e-05, + "loss": 8.0763, + "step": 520 + }, + { + "epoch": 0.03234216897386554, + "grad_norm": 0.9957287587173211, + "learning_rate": 1.0780053796813575e-05, + "loss": 8.0617, + "step": 521 + }, + { + "epoch": 0.03240424607362344, + "grad_norm": 3.0419210214388186, + "learning_rate": 1.080074487895717e-05, + "loss": 7.9931, + "step": 522 + }, + { + "epoch": 0.03246632317338134, + "grad_norm": 2.069602527881401, + "learning_rate": 1.0821435961100767e-05, + "loss": 8.0779, + "step": 523 + }, + { + "epoch": 0.03252840027313924, + "grad_norm": 1.6342179669711037, + "learning_rate": 1.0842127043244362e-05, + "loss": 8.0269, + "step": 524 + }, + { + "epoch": 0.032590477372897136, + "grad_norm": 1.4987634489268924, + "learning_rate": 1.0862818125387957e-05, + "loss": 8.0559, + "step": 525 + }, + { + "epoch": 0.03265255447265504, + "grad_norm": 0.8209666595682124, + "learning_rate": 1.0883509207531554e-05, + "loss": 7.9654, + "step": 526 + }, + { + "epoch": 0.03271463157241294, + "grad_norm": 1.3741494062059525, + "learning_rate": 1.090420028967515e-05, + "loss": 7.9724, + "step": 527 + }, + { + "epoch": 0.032776708672170835, + "grad_norm": 0.9130549624487904, + "learning_rate": 1.0924891371818746e-05, + "loss": 8.0122, + "step": 528 + }, + { + "epoch": 0.03283878577192874, + "grad_norm": 1.2545535119367146, + "learning_rate": 1.0945582453962343e-05, + "loss": 8.0867, + "step": 529 + }, + { + "epoch": 0.032900862871686636, + "grad_norm": 1.1358035358840761, + "learning_rate": 1.096627353610594e-05, + "loss": 8.1284, + "step": 530 + }, + { + "epoch": 0.03296293997144453, + "grad_norm": 0.7938081029216498, + "learning_rate": 1.0986964618249534e-05, + "loss": 7.9774, + "step": 531 + }, + { + "epoch": 0.03302501707120244, + "grad_norm": 0.9503490797966326, + "learning_rate": 1.1007655700393131e-05, + "loss": 7.9578, + "step": 532 + }, + { + "epoch": 0.033087094170960334, + "grad_norm": 1.3228278042642199, + "learning_rate": 1.1028346782536728e-05, + "loss": 7.9942, + "step": 533 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.6649305168220457, + "learning_rate": 1.1049037864680323e-05, + "loss": 7.9732, + "step": 534 + }, + { + "epoch": 0.033211248370476135, + "grad_norm": 0.5148755319883854, + "learning_rate": 1.106972894682392e-05, + "loss": 8.0827, + "step": 535 + }, + { + "epoch": 0.03327332547023403, + "grad_norm": 1.3875875840595717, + "learning_rate": 1.1090420028967517e-05, + "loss": 8.0503, + "step": 536 + }, + { + "epoch": 0.03333540256999193, + "grad_norm": 1.0933128028101167, + "learning_rate": 1.1111111111111112e-05, + "loss": 8.0474, + "step": 537 + }, + { + "epoch": 0.03339747966974983, + "grad_norm": 0.42186201923321154, + "learning_rate": 1.1131802193254708e-05, + "loss": 8.0318, + "step": 538 + }, + { + "epoch": 0.03345955676950773, + "grad_norm": 1.7752238280249064, + "learning_rate": 1.1152493275398303e-05, + "loss": 8.0216, + "step": 539 + }, + { + "epoch": 0.03352163386926563, + "grad_norm": 0.7268956148827427, + "learning_rate": 1.1173184357541899e-05, + "loss": 7.9119, + "step": 540 + }, + { + "epoch": 0.033583710969023524, + "grad_norm": 0.7158235599293238, + "learning_rate": 1.1193875439685495e-05, + "loss": 8.0355, + "step": 541 + }, + { + "epoch": 0.03364578806878143, + "grad_norm": 0.6310236808410932, + "learning_rate": 1.1214566521829092e-05, + "loss": 7.9363, + "step": 542 + }, + { + "epoch": 0.033707865168539325, + "grad_norm": 0.781251487730517, + "learning_rate": 1.1235257603972689e-05, + "loss": 8.0213, + "step": 543 + }, + { + "epoch": 0.03376994226829722, + "grad_norm": 0.6681592123195123, + "learning_rate": 1.1255948686116284e-05, + "loss": 7.9267, + "step": 544 + }, + { + "epoch": 0.033832019368055126, + "grad_norm": 0.6876622788768038, + "learning_rate": 1.127663976825988e-05, + "loss": 7.8835, + "step": 545 + }, + { + "epoch": 0.03389409646781302, + "grad_norm": 0.6474592113903391, + "learning_rate": 1.1297330850403477e-05, + "loss": 7.9779, + "step": 546 + }, + { + "epoch": 0.03395617356757092, + "grad_norm": 1.5457502958028142, + "learning_rate": 1.1318021932547073e-05, + "loss": 7.9129, + "step": 547 + }, + { + "epoch": 0.034018250667328824, + "grad_norm": 1.3561727141408424, + "learning_rate": 1.133871301469067e-05, + "loss": 7.9589, + "step": 548 + }, + { + "epoch": 0.03408032776708672, + "grad_norm": 1.0712525075484742, + "learning_rate": 1.1359404096834266e-05, + "loss": 7.9876, + "step": 549 + }, + { + "epoch": 0.03414240486684462, + "grad_norm": 1.9744883876602999, + "learning_rate": 1.1380095178977861e-05, + "loss": 8.0687, + "step": 550 + }, + { + "epoch": 0.03420448196660252, + "grad_norm": 1.0764770075233565, + "learning_rate": 1.1400786261121458e-05, + "loss": 7.9087, + "step": 551 + }, + { + "epoch": 0.03426655906636042, + "grad_norm": 1.109572057281127, + "learning_rate": 1.1421477343265053e-05, + "loss": 7.8768, + "step": 552 + }, + { + "epoch": 0.034328636166118316, + "grad_norm": 1.1078146048069855, + "learning_rate": 1.1442168425408648e-05, + "loss": 7.9844, + "step": 553 + }, + { + "epoch": 0.03439071326587622, + "grad_norm": 1.0226924018502264, + "learning_rate": 1.1462859507552245e-05, + "loss": 7.8575, + "step": 554 + }, + { + "epoch": 0.03445279036563412, + "grad_norm": 0.9556144659386121, + "learning_rate": 1.1483550589695842e-05, + "loss": 7.9465, + "step": 555 + }, + { + "epoch": 0.034514867465392014, + "grad_norm": 0.7552330319980729, + "learning_rate": 1.1504241671839437e-05, + "loss": 7.9206, + "step": 556 + }, + { + "epoch": 0.03457694456514992, + "grad_norm": 1.1716653763522933, + "learning_rate": 1.1524932753983033e-05, + "loss": 7.8954, + "step": 557 + }, + { + "epoch": 0.034639021664907815, + "grad_norm": 0.6580945839514553, + "learning_rate": 1.154562383612663e-05, + "loss": 7.9761, + "step": 558 + }, + { + "epoch": 0.03470109876466571, + "grad_norm": 1.422410633518134, + "learning_rate": 1.1566314918270225e-05, + "loss": 7.9138, + "step": 559 + }, + { + "epoch": 0.034763175864423616, + "grad_norm": 0.44990899503580173, + "learning_rate": 1.1587006000413822e-05, + "loss": 7.8712, + "step": 560 + }, + { + "epoch": 0.03482525296418151, + "grad_norm": 1.1997425538309114, + "learning_rate": 1.1607697082557419e-05, + "loss": 7.9889, + "step": 561 + }, + { + "epoch": 0.03488733006393941, + "grad_norm": 0.5226597773000983, + "learning_rate": 1.1628388164701016e-05, + "loss": 7.818, + "step": 562 + }, + { + "epoch": 0.034949407163697314, + "grad_norm": 1.0673307107702183, + "learning_rate": 1.164907924684461e-05, + "loss": 7.7973, + "step": 563 + }, + { + "epoch": 0.03501148426345521, + "grad_norm": 0.6607115402127816, + "learning_rate": 1.1669770328988208e-05, + "loss": 7.8621, + "step": 564 + }, + { + "epoch": 0.03507356136321311, + "grad_norm": 1.3469815099250004, + "learning_rate": 1.1690461411131803e-05, + "loss": 7.869, + "step": 565 + }, + { + "epoch": 0.03513563846297101, + "grad_norm": 0.8036455983037764, + "learning_rate": 1.17111524932754e-05, + "loss": 7.8369, + "step": 566 + }, + { + "epoch": 0.03519771556272891, + "grad_norm": 1.866157790484721, + "learning_rate": 1.1731843575418994e-05, + "loss": 7.9548, + "step": 567 + }, + { + "epoch": 0.03525979266248681, + "grad_norm": 1.172725419781641, + "learning_rate": 1.1752534657562591e-05, + "loss": 7.8344, + "step": 568 + }, + { + "epoch": 0.03532186976224471, + "grad_norm": 1.5873988892803077, + "learning_rate": 1.1773225739706186e-05, + "loss": 7.969, + "step": 569 + }, + { + "epoch": 0.03538394686200261, + "grad_norm": 1.6479006069219038, + "learning_rate": 1.1793916821849783e-05, + "loss": 7.905, + "step": 570 + }, + { + "epoch": 0.035446023961760505, + "grad_norm": 0.7463429219653765, + "learning_rate": 1.181460790399338e-05, + "loss": 7.8331, + "step": 571 + }, + { + "epoch": 0.03550810106151841, + "grad_norm": 1.9056808997963837, + "learning_rate": 1.1835298986136975e-05, + "loss": 7.9483, + "step": 572 + }, + { + "epoch": 0.035570178161276306, + "grad_norm": 0.7347292755724238, + "learning_rate": 1.1855990068280572e-05, + "loss": 7.9428, + "step": 573 + }, + { + "epoch": 0.0356322552610342, + "grad_norm": 1.0665344049490397, + "learning_rate": 1.1876681150424168e-05, + "loss": 7.8084, + "step": 574 + }, + { + "epoch": 0.03569433236079211, + "grad_norm": 1.4152211032027178, + "learning_rate": 1.1897372232567764e-05, + "loss": 7.8941, + "step": 575 + }, + { + "epoch": 0.035756409460550004, + "grad_norm": 0.6173016647789162, + "learning_rate": 1.191806331471136e-05, + "loss": 7.8552, + "step": 576 + }, + { + "epoch": 0.0358184865603079, + "grad_norm": 0.8540690567875964, + "learning_rate": 1.1938754396854957e-05, + "loss": 7.8583, + "step": 577 + }, + { + "epoch": 0.035880563660065805, + "grad_norm": 0.6810285120693581, + "learning_rate": 1.1959445478998552e-05, + "loss": 7.8985, + "step": 578 + }, + { + "epoch": 0.0359426407598237, + "grad_norm": 0.98497190996965, + "learning_rate": 1.1980136561142149e-05, + "loss": 7.8856, + "step": 579 + }, + { + "epoch": 0.0360047178595816, + "grad_norm": 0.979932782923636, + "learning_rate": 1.2000827643285744e-05, + "loss": 7.7905, + "step": 580 + }, + { + "epoch": 0.0360667949593395, + "grad_norm": 0.798502512933244, + "learning_rate": 1.2021518725429339e-05, + "loss": 7.8071, + "step": 581 + }, + { + "epoch": 0.0361288720590974, + "grad_norm": 0.6777068910330306, + "learning_rate": 1.2042209807572936e-05, + "loss": 7.7506, + "step": 582 + }, + { + "epoch": 0.0361909491588553, + "grad_norm": 0.5558853915379629, + "learning_rate": 1.2062900889716533e-05, + "loss": 7.8906, + "step": 583 + }, + { + "epoch": 0.0362530262586132, + "grad_norm": 0.8632299830891366, + "learning_rate": 1.208359197186013e-05, + "loss": 7.7903, + "step": 584 + }, + { + "epoch": 0.0363151033583711, + "grad_norm": 0.879360471549869, + "learning_rate": 1.2104283054003724e-05, + "loss": 7.8295, + "step": 585 + }, + { + "epoch": 0.036377180458128995, + "grad_norm": 0.604557483702369, + "learning_rate": 1.2124974136147321e-05, + "loss": 7.8511, + "step": 586 + }, + { + "epoch": 0.03643925755788689, + "grad_norm": 0.7707515664593582, + "learning_rate": 1.2145665218290918e-05, + "loss": 7.8364, + "step": 587 + }, + { + "epoch": 0.036501334657644796, + "grad_norm": 0.7027269932357594, + "learning_rate": 1.2166356300434513e-05, + "loss": 7.7972, + "step": 588 + }, + { + "epoch": 0.03656341175740269, + "grad_norm": 0.7114241056088708, + "learning_rate": 1.218704738257811e-05, + "loss": 7.7064, + "step": 589 + }, + { + "epoch": 0.03662548885716059, + "grad_norm": 1.1278383160197274, + "learning_rate": 1.2207738464721707e-05, + "loss": 7.7676, + "step": 590 + }, + { + "epoch": 0.036687565956918494, + "grad_norm": 0.4659078103986766, + "learning_rate": 1.2228429546865302e-05, + "loss": 7.7144, + "step": 591 + }, + { + "epoch": 0.03674964305667639, + "grad_norm": 0.9034173037034089, + "learning_rate": 1.2249120629008898e-05, + "loss": 7.7599, + "step": 592 + }, + { + "epoch": 0.03681172015643429, + "grad_norm": 0.7583864928745867, + "learning_rate": 1.2269811711152494e-05, + "loss": 7.8548, + "step": 593 + }, + { + "epoch": 0.03687379725619219, + "grad_norm": 0.6943556169814092, + "learning_rate": 1.229050279329609e-05, + "loss": 7.7486, + "step": 594 + }, + { + "epoch": 0.03693587435595009, + "grad_norm": 0.9174577305175301, + "learning_rate": 1.2311193875439685e-05, + "loss": 7.8427, + "step": 595 + }, + { + "epoch": 0.036997951455707986, + "grad_norm": 0.843983370684918, + "learning_rate": 1.2331884957583282e-05, + "loss": 7.7791, + "step": 596 + }, + { + "epoch": 0.03706002855546589, + "grad_norm": 1.1786622398449402, + "learning_rate": 1.2352576039726877e-05, + "loss": 7.8146, + "step": 597 + }, + { + "epoch": 0.03712210565522379, + "grad_norm": 0.5897426644920557, + "learning_rate": 1.2373267121870474e-05, + "loss": 7.6984, + "step": 598 + }, + { + "epoch": 0.037184182754981684, + "grad_norm": 0.6763824258487274, + "learning_rate": 1.239395820401407e-05, + "loss": 7.6379, + "step": 599 + }, + { + "epoch": 0.03724625985473959, + "grad_norm": 0.6667558689838905, + "learning_rate": 1.2414649286157666e-05, + "loss": 7.7662, + "step": 600 + }, + { + "epoch": 0.037308336954497485, + "grad_norm": 0.6849451712084684, + "learning_rate": 1.2435340368301263e-05, + "loss": 7.7833, + "step": 601 + }, + { + "epoch": 0.03737041405425538, + "grad_norm": 0.7046958119310304, + "learning_rate": 1.245603145044486e-05, + "loss": 7.6814, + "step": 602 + }, + { + "epoch": 0.037432491154013287, + "grad_norm": 0.7648929677524542, + "learning_rate": 1.2476722532588454e-05, + "loss": 7.727, + "step": 603 + }, + { + "epoch": 0.037494568253771184, + "grad_norm": 0.5414390421923237, + "learning_rate": 1.2497413614732051e-05, + "loss": 7.6226, + "step": 604 + }, + { + "epoch": 0.03755664535352908, + "grad_norm": 0.7069370143748077, + "learning_rate": 1.2518104696875646e-05, + "loss": 7.7239, + "step": 605 + }, + { + "epoch": 0.037618722453286985, + "grad_norm": 0.7825156450383384, + "learning_rate": 1.2538795779019245e-05, + "loss": 7.6737, + "step": 606 + }, + { + "epoch": 0.03768079955304488, + "grad_norm": 0.49933183251912117, + "learning_rate": 1.255948686116284e-05, + "loss": 7.7258, + "step": 607 + }, + { + "epoch": 0.03774287665280278, + "grad_norm": 0.8322859181829044, + "learning_rate": 1.2580177943306435e-05, + "loss": 7.7431, + "step": 608 + }, + { + "epoch": 0.03780495375256068, + "grad_norm": 1.1317434067379244, + "learning_rate": 1.2600869025450032e-05, + "loss": 7.6982, + "step": 609 + }, + { + "epoch": 0.03786703085231858, + "grad_norm": 0.4939884811006284, + "learning_rate": 1.2621560107593627e-05, + "loss": 7.619, + "step": 610 + }, + { + "epoch": 0.03792910795207648, + "grad_norm": 0.7972911795109707, + "learning_rate": 1.2642251189737222e-05, + "loss": 7.6468, + "step": 611 + }, + { + "epoch": 0.03799118505183438, + "grad_norm": 1.1392328310274027, + "learning_rate": 1.266294227188082e-05, + "loss": 7.6775, + "step": 612 + }, + { + "epoch": 0.03805326215159228, + "grad_norm": 1.3245701956459932, + "learning_rate": 1.2683633354024415e-05, + "loss": 7.7099, + "step": 613 + }, + { + "epoch": 0.038115339251350175, + "grad_norm": 1.484514410599749, + "learning_rate": 1.270432443616801e-05, + "loss": 7.7434, + "step": 614 + }, + { + "epoch": 0.03817741635110808, + "grad_norm": 0.8425478318330326, + "learning_rate": 1.2725015518311609e-05, + "loss": 7.6605, + "step": 615 + }, + { + "epoch": 0.038239493450865976, + "grad_norm": 3.7540159973551575, + "learning_rate": 1.2745706600455204e-05, + "loss": 7.758, + "step": 616 + }, + { + "epoch": 0.03830157055062387, + "grad_norm": 1.0756142170092386, + "learning_rate": 1.2766397682598802e-05, + "loss": 7.6389, + "step": 617 + }, + { + "epoch": 0.03836364765038178, + "grad_norm": 1.7257242368620531, + "learning_rate": 1.2787088764742398e-05, + "loss": 7.685, + "step": 618 + }, + { + "epoch": 0.038425724750139674, + "grad_norm": 2.158775398710208, + "learning_rate": 1.2807779846885993e-05, + "loss": 7.7268, + "step": 619 + }, + { + "epoch": 0.03848780184989757, + "grad_norm": 0.8576816869053789, + "learning_rate": 1.282847092902959e-05, + "loss": 7.6328, + "step": 620 + }, + { + "epoch": 0.038549878949655475, + "grad_norm": 1.7855631491871429, + "learning_rate": 1.2849162011173184e-05, + "loss": 7.6935, + "step": 621 + }, + { + "epoch": 0.03861195604941337, + "grad_norm": 1.8205963044502964, + "learning_rate": 1.2869853093316781e-05, + "loss": 7.699, + "step": 622 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7140078216920375, + "learning_rate": 1.2890544175460378e-05, + "loss": 7.6648, + "step": 623 + }, + { + "epoch": 0.03873611024892917, + "grad_norm": 0.889390725957146, + "learning_rate": 1.2911235257603973e-05, + "loss": 7.6117, + "step": 624 + }, + { + "epoch": 0.03879818734868707, + "grad_norm": 0.8163009283584611, + "learning_rate": 1.2931926339747568e-05, + "loss": 7.7165, + "step": 625 + }, + { + "epoch": 0.03886026444844497, + "grad_norm": 0.9728915071739976, + "learning_rate": 1.2952617421891167e-05, + "loss": 7.7007, + "step": 626 + }, + { + "epoch": 0.03892234154820287, + "grad_norm": 1.457405349118355, + "learning_rate": 1.2973308504034762e-05, + "loss": 7.6328, + "step": 627 + }, + { + "epoch": 0.03898441864796077, + "grad_norm": 1.060168906735774, + "learning_rate": 1.2993999586178357e-05, + "loss": 7.65, + "step": 628 + }, + { + "epoch": 0.039046495747718665, + "grad_norm": 0.9962811462051943, + "learning_rate": 1.3014690668321955e-05, + "loss": 7.6696, + "step": 629 + }, + { + "epoch": 0.03910857284747657, + "grad_norm": 1.504381614958955, + "learning_rate": 1.303538175046555e-05, + "loss": 7.5863, + "step": 630 + }, + { + "epoch": 0.039170649947234466, + "grad_norm": 0.7049677965912002, + "learning_rate": 1.3056072832609145e-05, + "loss": 7.6685, + "step": 631 + }, + { + "epoch": 0.03923272704699236, + "grad_norm": 1.728104950993767, + "learning_rate": 1.3076763914752744e-05, + "loss": 7.6842, + "step": 632 + }, + { + "epoch": 0.03929480414675026, + "grad_norm": 1.1438971315397468, + "learning_rate": 1.3097454996896339e-05, + "loss": 7.549, + "step": 633 + }, + { + "epoch": 0.039356881246508164, + "grad_norm": 0.7107971388205935, + "learning_rate": 1.3118146079039934e-05, + "loss": 7.6223, + "step": 634 + }, + { + "epoch": 0.03941895834626606, + "grad_norm": 0.7262020242310354, + "learning_rate": 1.313883716118353e-05, + "loss": 7.604, + "step": 635 + }, + { + "epoch": 0.03948103544602396, + "grad_norm": 1.5207778014752718, + "learning_rate": 1.3159528243327126e-05, + "loss": 7.6524, + "step": 636 + }, + { + "epoch": 0.03954311254578186, + "grad_norm": 0.5517174089880553, + "learning_rate": 1.3180219325470723e-05, + "loss": 7.6278, + "step": 637 + }, + { + "epoch": 0.03960518964553976, + "grad_norm": 1.4194046190766922, + "learning_rate": 1.320091040761432e-05, + "loss": 7.5649, + "step": 638 + }, + { + "epoch": 0.03966726674529766, + "grad_norm": 0.7453422956288948, + "learning_rate": 1.3221601489757914e-05, + "loss": 7.6323, + "step": 639 + }, + { + "epoch": 0.03972934384505556, + "grad_norm": 1.2948214807041956, + "learning_rate": 1.324229257190151e-05, + "loss": 7.6574, + "step": 640 + }, + { + "epoch": 0.03979142094481346, + "grad_norm": 0.9437749038340952, + "learning_rate": 1.3262983654045108e-05, + "loss": 7.6545, + "step": 641 + }, + { + "epoch": 0.039853498044571355, + "grad_norm": 0.611839032192425, + "learning_rate": 1.3283674736188703e-05, + "loss": 7.5798, + "step": 642 + }, + { + "epoch": 0.03991557514432926, + "grad_norm": 1.572056954320343, + "learning_rate": 1.3304365818332298e-05, + "loss": 7.5344, + "step": 643 + }, + { + "epoch": 0.039977652244087156, + "grad_norm": 0.7974538757474956, + "learning_rate": 1.3325056900475897e-05, + "loss": 7.6269, + "step": 644 + }, + { + "epoch": 0.04003972934384505, + "grad_norm": 1.0081557642954249, + "learning_rate": 1.3345747982619492e-05, + "loss": 7.642, + "step": 645 + }, + { + "epoch": 0.04010180644360296, + "grad_norm": 0.7708820164402764, + "learning_rate": 1.3366439064763087e-05, + "loss": 7.5624, + "step": 646 + }, + { + "epoch": 0.040163883543360854, + "grad_norm": 0.7045176595297951, + "learning_rate": 1.3387130146906685e-05, + "loss": 7.5666, + "step": 647 + }, + { + "epoch": 0.04022596064311875, + "grad_norm": 1.1218457507727684, + "learning_rate": 1.340782122905028e-05, + "loss": 7.6056, + "step": 648 + }, + { + "epoch": 0.040288037742876655, + "grad_norm": 0.6351246214777106, + "learning_rate": 1.3428512311193875e-05, + "loss": 7.5491, + "step": 649 + }, + { + "epoch": 0.04035011484263455, + "grad_norm": 1.2228364659297382, + "learning_rate": 1.3449203393337472e-05, + "loss": 7.4538, + "step": 650 + }, + { + "epoch": 0.04041219194239245, + "grad_norm": 0.5955938270232943, + "learning_rate": 1.3469894475481067e-05, + "loss": 7.6184, + "step": 651 + }, + { + "epoch": 0.04047426904215035, + "grad_norm": 0.5303678762176415, + "learning_rate": 1.3490585557624662e-05, + "loss": 7.6012, + "step": 652 + }, + { + "epoch": 0.04053634614190825, + "grad_norm": 1.4150120197500045, + "learning_rate": 1.351127663976826e-05, + "loss": 7.5259, + "step": 653 + }, + { + "epoch": 0.04059842324166615, + "grad_norm": 0.6917305309808128, + "learning_rate": 1.3531967721911856e-05, + "loss": 7.633, + "step": 654 + }, + { + "epoch": 0.04066050034142405, + "grad_norm": 0.8520337122215793, + "learning_rate": 1.3552658804055451e-05, + "loss": 7.523, + "step": 655 + }, + { + "epoch": 0.04072257744118195, + "grad_norm": 0.8823702894637101, + "learning_rate": 1.357334988619905e-05, + "loss": 7.543, + "step": 656 + }, + { + "epoch": 0.040784654540939845, + "grad_norm": 0.7617796653452216, + "learning_rate": 1.3594040968342644e-05, + "loss": 7.6307, + "step": 657 + }, + { + "epoch": 0.04084673164069775, + "grad_norm": 0.6298299131432639, + "learning_rate": 1.3614732050486243e-05, + "loss": 7.6119, + "step": 658 + }, + { + "epoch": 0.040908808740455646, + "grad_norm": 1.195345859436071, + "learning_rate": 1.3635423132629838e-05, + "loss": 7.5666, + "step": 659 + }, + { + "epoch": 0.04097088584021354, + "grad_norm": 0.6853528739979704, + "learning_rate": 1.3656114214773433e-05, + "loss": 7.5076, + "step": 660 + }, + { + "epoch": 0.04103296293997145, + "grad_norm": 0.8261776105532846, + "learning_rate": 1.3676805296917032e-05, + "loss": 7.5068, + "step": 661 + }, + { + "epoch": 0.041095040039729344, + "grad_norm": 0.9923479145763321, + "learning_rate": 1.3697496379060627e-05, + "loss": 7.5789, + "step": 662 + }, + { + "epoch": 0.04115711713948724, + "grad_norm": 1.019145264035622, + "learning_rate": 1.3718187461204222e-05, + "loss": 7.5715, + "step": 663 + }, + { + "epoch": 0.041219194239245145, + "grad_norm": 0.8032782723787716, + "learning_rate": 1.3738878543347818e-05, + "loss": 7.5812, + "step": 664 + }, + { + "epoch": 0.04128127133900304, + "grad_norm": 1.0456610516194826, + "learning_rate": 1.3759569625491414e-05, + "loss": 7.5012, + "step": 665 + }, + { + "epoch": 0.04134334843876094, + "grad_norm": 0.5576137402322706, + "learning_rate": 1.3780260707635009e-05, + "loss": 7.5791, + "step": 666 + }, + { + "epoch": 0.04140542553851884, + "grad_norm": 1.629253103527504, + "learning_rate": 1.3800951789778607e-05, + "loss": 7.5403, + "step": 667 + }, + { + "epoch": 0.04146750263827674, + "grad_norm": 1.118925383475733, + "learning_rate": 1.3821642871922202e-05, + "loss": 7.5119, + "step": 668 + }, + { + "epoch": 0.04152957973803464, + "grad_norm": 0.9070725161343026, + "learning_rate": 1.3842333954065797e-05, + "loss": 7.4808, + "step": 669 + }, + { + "epoch": 0.04159165683779254, + "grad_norm": 0.8265327044418711, + "learning_rate": 1.3863025036209396e-05, + "loss": 7.474, + "step": 670 + }, + { + "epoch": 0.04165373393755044, + "grad_norm": 0.6830683841959736, + "learning_rate": 1.388371611835299e-05, + "loss": 7.4818, + "step": 671 + }, + { + "epoch": 0.041715811037308335, + "grad_norm": 1.0489059788740362, + "learning_rate": 1.3904407200496586e-05, + "loss": 7.5792, + "step": 672 + }, + { + "epoch": 0.04177788813706624, + "grad_norm": 0.8929359837191622, + "learning_rate": 1.3925098282640184e-05, + "loss": 7.566, + "step": 673 + }, + { + "epoch": 0.041839965236824136, + "grad_norm": 0.995460129213884, + "learning_rate": 1.394578936478378e-05, + "loss": 7.5282, + "step": 674 + }, + { + "epoch": 0.041902042336582034, + "grad_norm": 0.690562039953364, + "learning_rate": 1.3966480446927374e-05, + "loss": 7.5039, + "step": 675 + }, + { + "epoch": 0.04196411943633994, + "grad_norm": 1.0382975006553992, + "learning_rate": 1.3987171529070971e-05, + "loss": 7.5153, + "step": 676 + }, + { + "epoch": 0.042026196536097835, + "grad_norm": 0.6829797003188252, + "learning_rate": 1.4007862611214568e-05, + "loss": 7.5959, + "step": 677 + }, + { + "epoch": 0.04208827363585573, + "grad_norm": 0.9878081142831414, + "learning_rate": 1.4028553693358163e-05, + "loss": 7.5464, + "step": 678 + }, + { + "epoch": 0.042150350735613636, + "grad_norm": 0.5336146740141066, + "learning_rate": 1.404924477550176e-05, + "loss": 7.4892, + "step": 679 + }, + { + "epoch": 0.04221242783537153, + "grad_norm": 0.5091209281142285, + "learning_rate": 1.4069935857645355e-05, + "loss": 7.4707, + "step": 680 + }, + { + "epoch": 0.04227450493512943, + "grad_norm": 0.5256485997003834, + "learning_rate": 1.409062693978895e-05, + "loss": 7.3396, + "step": 681 + }, + { + "epoch": 0.04233658203488733, + "grad_norm": 0.7530009709715441, + "learning_rate": 1.4111318021932548e-05, + "loss": 7.5165, + "step": 682 + }, + { + "epoch": 0.04239865913464523, + "grad_norm": 0.7185809724504926, + "learning_rate": 1.4132009104076144e-05, + "loss": 7.4558, + "step": 683 + }, + { + "epoch": 0.04246073623440313, + "grad_norm": 0.7388001087060589, + "learning_rate": 1.4152700186219739e-05, + "loss": 7.4653, + "step": 684 + }, + { + "epoch": 0.042522813334161025, + "grad_norm": 0.4435425770566433, + "learning_rate": 1.4173391268363337e-05, + "loss": 7.3991, + "step": 685 + }, + { + "epoch": 0.04258489043391893, + "grad_norm": 1.7254583358001299, + "learning_rate": 1.4194082350506932e-05, + "loss": 7.4892, + "step": 686 + }, + { + "epoch": 0.042646967533676826, + "grad_norm": 1.4638302972075783, + "learning_rate": 1.4214773432650527e-05, + "loss": 7.4175, + "step": 687 + }, + { + "epoch": 0.04270904463343472, + "grad_norm": 0.9905620386470501, + "learning_rate": 1.4235464514794126e-05, + "loss": 7.4152, + "step": 688 + }, + { + "epoch": 0.04277112173319263, + "grad_norm": 2.5516726935490106, + "learning_rate": 1.425615559693772e-05, + "loss": 7.4982, + "step": 689 + }, + { + "epoch": 0.042833198832950524, + "grad_norm": 1.0068120796344346, + "learning_rate": 1.4276846679081316e-05, + "loss": 7.4112, + "step": 690 + }, + { + "epoch": 0.04289527593270842, + "grad_norm": 0.9930243079657138, + "learning_rate": 1.4297537761224913e-05, + "loss": 7.5157, + "step": 691 + }, + { + "epoch": 0.042957353032466325, + "grad_norm": 1.3250035123958457, + "learning_rate": 1.4318228843368508e-05, + "loss": 7.382, + "step": 692 + }, + { + "epoch": 0.04301943013222422, + "grad_norm": 1.005165112345316, + "learning_rate": 1.4338919925512104e-05, + "loss": 7.4706, + "step": 693 + }, + { + "epoch": 0.04308150723198212, + "grad_norm": 0.7346860348692141, + "learning_rate": 1.4359611007655701e-05, + "loss": 7.4673, + "step": 694 + }, + { + "epoch": 0.04314358433174002, + "grad_norm": 0.9319884412707719, + "learning_rate": 1.4380302089799296e-05, + "loss": 7.5663, + "step": 695 + }, + { + "epoch": 0.04320566143149792, + "grad_norm": 0.7906941455262976, + "learning_rate": 1.4400993171942891e-05, + "loss": 7.4504, + "step": 696 + }, + { + "epoch": 0.04326773853125582, + "grad_norm": 0.7782926279311176, + "learning_rate": 1.442168425408649e-05, + "loss": 7.4893, + "step": 697 + }, + { + "epoch": 0.04332981563101372, + "grad_norm": 1.2234373187653271, + "learning_rate": 1.4442375336230085e-05, + "loss": 7.4911, + "step": 698 + }, + { + "epoch": 0.04339189273077162, + "grad_norm": 0.5929955156037726, + "learning_rate": 1.446306641837368e-05, + "loss": 7.382, + "step": 699 + }, + { + "epoch": 0.043453969830529515, + "grad_norm": 0.5046469397529648, + "learning_rate": 1.4483757500517278e-05, + "loss": 7.2851, + "step": 700 + }, + { + "epoch": 0.04351604693028742, + "grad_norm": 0.5427598894589711, + "learning_rate": 1.4504448582660874e-05, + "loss": 7.4372, + "step": 701 + }, + { + "epoch": 0.043578124030045316, + "grad_norm": 1.2620815074492266, + "learning_rate": 1.4525139664804472e-05, + "loss": 7.4342, + "step": 702 + }, + { + "epoch": 0.04364020112980321, + "grad_norm": 0.641671233553137, + "learning_rate": 1.4545830746948067e-05, + "loss": 7.4873, + "step": 703 + }, + { + "epoch": 0.04370227822956112, + "grad_norm": 1.1864558196717994, + "learning_rate": 1.4566521829091662e-05, + "loss": 7.3103, + "step": 704 + }, + { + "epoch": 0.043764355329319014, + "grad_norm": 1.2132574684578195, + "learning_rate": 1.4587212911235259e-05, + "loss": 7.3501, + "step": 705 + }, + { + "epoch": 0.04382643242907691, + "grad_norm": 0.8377423988960518, + "learning_rate": 1.4607903993378854e-05, + "loss": 7.5183, + "step": 706 + }, + { + "epoch": 0.043888509528834815, + "grad_norm": 1.1207860863428496, + "learning_rate": 1.4628595075522449e-05, + "loss": 7.3762, + "step": 707 + }, + { + "epoch": 0.04395058662859271, + "grad_norm": 0.5375337346158537, + "learning_rate": 1.4649286157666048e-05, + "loss": 7.3756, + "step": 708 + }, + { + "epoch": 0.04401266372835061, + "grad_norm": 0.8419007062491941, + "learning_rate": 1.4669977239809643e-05, + "loss": 7.3773, + "step": 709 + }, + { + "epoch": 0.04407474082810851, + "grad_norm": 0.8967867405758531, + "learning_rate": 1.4690668321953238e-05, + "loss": 7.3991, + "step": 710 + }, + { + "epoch": 0.04413681792786641, + "grad_norm": 0.8627872693532576, + "learning_rate": 1.4711359404096836e-05, + "loss": 7.3749, + "step": 711 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 0.677007719855195, + "learning_rate": 1.4732050486240431e-05, + "loss": 7.3622, + "step": 712 + }, + { + "epoch": 0.04426097212738221, + "grad_norm": 0.6358185237403059, + "learning_rate": 1.4752741568384026e-05, + "loss": 7.3008, + "step": 713 + }, + { + "epoch": 0.04432304922714011, + "grad_norm": 0.9793352969361787, + "learning_rate": 1.4773432650527625e-05, + "loss": 7.3314, + "step": 714 + }, + { + "epoch": 0.044385126326898006, + "grad_norm": 0.6879087230201396, + "learning_rate": 1.479412373267122e-05, + "loss": 7.3009, + "step": 715 + }, + { + "epoch": 0.04444720342665591, + "grad_norm": 0.9494264163571349, + "learning_rate": 1.4814814814814815e-05, + "loss": 7.4371, + "step": 716 + }, + { + "epoch": 0.04450928052641381, + "grad_norm": 0.561587228883311, + "learning_rate": 1.4835505896958413e-05, + "loss": 7.3499, + "step": 717 + }, + { + "epoch": 0.044571357626171704, + "grad_norm": 0.9579254332732114, + "learning_rate": 1.4856196979102009e-05, + "loss": 7.3291, + "step": 718 + }, + { + "epoch": 0.04463343472592961, + "grad_norm": 0.7561357569510325, + "learning_rate": 1.4876888061245604e-05, + "loss": 7.3592, + "step": 719 + }, + { + "epoch": 0.044695511825687505, + "grad_norm": 0.7973989372433865, + "learning_rate": 1.48975791433892e-05, + "loss": 7.266, + "step": 720 + }, + { + "epoch": 0.0447575889254454, + "grad_norm": 1.2933728834049922, + "learning_rate": 1.4918270225532795e-05, + "loss": 7.4153, + "step": 721 + }, + { + "epoch": 0.044819666025203306, + "grad_norm": 0.591548200173502, + "learning_rate": 1.493896130767639e-05, + "loss": 7.3247, + "step": 722 + }, + { + "epoch": 0.0448817431249612, + "grad_norm": 0.6099711705258541, + "learning_rate": 1.4959652389819989e-05, + "loss": 7.224, + "step": 723 + }, + { + "epoch": 0.0449438202247191, + "grad_norm": 1.0429596632658062, + "learning_rate": 1.4980343471963584e-05, + "loss": 7.3741, + "step": 724 + }, + { + "epoch": 0.045005897324477004, + "grad_norm": 0.6163818799363832, + "learning_rate": 1.5001034554107179e-05, + "loss": 7.3404, + "step": 725 + }, + { + "epoch": 0.0450679744242349, + "grad_norm": 0.6553831960793867, + "learning_rate": 1.5021725636250778e-05, + "loss": 7.3791, + "step": 726 + }, + { + "epoch": 0.0451300515239928, + "grad_norm": 1.0558778615131263, + "learning_rate": 1.5042416718394373e-05, + "loss": 7.3354, + "step": 727 + }, + { + "epoch": 0.045192128623750695, + "grad_norm": 0.6325451321388103, + "learning_rate": 1.5063107800537968e-05, + "loss": 7.3324, + "step": 728 + }, + { + "epoch": 0.0452542057235086, + "grad_norm": 0.761755057227659, + "learning_rate": 1.5083798882681566e-05, + "loss": 7.2643, + "step": 729 + }, + { + "epoch": 0.045316282823266496, + "grad_norm": 0.6832981233325669, + "learning_rate": 1.5104489964825161e-05, + "loss": 7.2532, + "step": 730 + }, + { + "epoch": 0.04537835992302439, + "grad_norm": 1.0372604576166584, + "learning_rate": 1.5125181046968756e-05, + "loss": 7.3812, + "step": 731 + }, + { + "epoch": 0.0454404370227823, + "grad_norm": 0.8337510293957077, + "learning_rate": 1.5145872129112355e-05, + "loss": 7.3081, + "step": 732 + }, + { + "epoch": 0.045502514122540194, + "grad_norm": 1.5179339253214699, + "learning_rate": 1.516656321125595e-05, + "loss": 7.376, + "step": 733 + }, + { + "epoch": 0.04556459122229809, + "grad_norm": 0.7594701589607304, + "learning_rate": 1.5187254293399545e-05, + "loss": 7.3126, + "step": 734 + }, + { + "epoch": 0.045626668322055995, + "grad_norm": 0.7847930675548435, + "learning_rate": 1.5207945375543142e-05, + "loss": 7.3075, + "step": 735 + }, + { + "epoch": 0.04568874542181389, + "grad_norm": 0.6427553390509679, + "learning_rate": 1.5228636457686737e-05, + "loss": 7.3389, + "step": 736 + }, + { + "epoch": 0.04575082252157179, + "grad_norm": 0.808720749174249, + "learning_rate": 1.5249327539830332e-05, + "loss": 7.3323, + "step": 737 + }, + { + "epoch": 0.04581289962132969, + "grad_norm": 2.5862328859097614, + "learning_rate": 1.5270018621973932e-05, + "loss": 7.3749, + "step": 738 + }, + { + "epoch": 0.04587497672108759, + "grad_norm": 0.8273462916867386, + "learning_rate": 1.5290709704117527e-05, + "loss": 7.4288, + "step": 739 + }, + { + "epoch": 0.04593705382084549, + "grad_norm": 1.3728161722468863, + "learning_rate": 1.5311400786261122e-05, + "loss": 7.3114, + "step": 740 + }, + { + "epoch": 0.04599913092060339, + "grad_norm": 1.239385888325488, + "learning_rate": 1.5332091868404717e-05, + "loss": 7.326, + "step": 741 + }, + { + "epoch": 0.04606120802036129, + "grad_norm": 0.8447992723516922, + "learning_rate": 1.5352782950548312e-05, + "loss": 7.3833, + "step": 742 + }, + { + "epoch": 0.046123285120119185, + "grad_norm": 0.8250657561135133, + "learning_rate": 1.537347403269191e-05, + "loss": 7.3264, + "step": 743 + }, + { + "epoch": 0.04618536221987709, + "grad_norm": 1.5264133439068557, + "learning_rate": 1.5394165114835506e-05, + "loss": 7.2699, + "step": 744 + }, + { + "epoch": 0.046247439319634986, + "grad_norm": 0.8503013861903188, + "learning_rate": 1.54148561969791e-05, + "loss": 7.2957, + "step": 745 + }, + { + "epoch": 0.046309516419392883, + "grad_norm": 0.7109921193363484, + "learning_rate": 1.54355472791227e-05, + "loss": 7.2494, + "step": 746 + }, + { + "epoch": 0.04637159351915079, + "grad_norm": 2.0461326592508438, + "learning_rate": 1.5456238361266295e-05, + "loss": 7.3348, + "step": 747 + }, + { + "epoch": 0.046433670618908685, + "grad_norm": 0.7321721986546734, + "learning_rate": 1.547692944340989e-05, + "loss": 7.2523, + "step": 748 + }, + { + "epoch": 0.04649574771866658, + "grad_norm": 0.8709302671835017, + "learning_rate": 1.5497620525553488e-05, + "loss": 7.3434, + "step": 749 + }, + { + "epoch": 0.046557824818424486, + "grad_norm": 0.5863590504256609, + "learning_rate": 1.5518311607697083e-05, + "loss": 7.4201, + "step": 750 + }, + { + "epoch": 0.04661990191818238, + "grad_norm": 0.8657592490305558, + "learning_rate": 1.5539002689840678e-05, + "loss": 7.2196, + "step": 751 + }, + { + "epoch": 0.04668197901794028, + "grad_norm": 0.7083116602854378, + "learning_rate": 1.5559693771984277e-05, + "loss": 7.2241, + "step": 752 + }, + { + "epoch": 0.046744056117698184, + "grad_norm": 0.8466433612990121, + "learning_rate": 1.5580384854127872e-05, + "loss": 7.2395, + "step": 753 + }, + { + "epoch": 0.04680613321745608, + "grad_norm": 0.57137116714357, + "learning_rate": 1.5601075936271467e-05, + "loss": 7.2482, + "step": 754 + }, + { + "epoch": 0.04686821031721398, + "grad_norm": 0.6762374044010383, + "learning_rate": 1.5621767018415065e-05, + "loss": 7.3735, + "step": 755 + }, + { + "epoch": 0.04693028741697188, + "grad_norm": 0.53400422445155, + "learning_rate": 1.564245810055866e-05, + "loss": 7.313, + "step": 756 + }, + { + "epoch": 0.04699236451672978, + "grad_norm": 0.48878155825944036, + "learning_rate": 1.5663149182702255e-05, + "loss": 7.181, + "step": 757 + }, + { + "epoch": 0.047054441616487676, + "grad_norm": 1.3310194451640616, + "learning_rate": 1.5683840264845854e-05, + "loss": 7.2041, + "step": 758 + }, + { + "epoch": 0.04711651871624558, + "grad_norm": 0.5627912456135912, + "learning_rate": 1.570453134698945e-05, + "loss": 7.2722, + "step": 759 + }, + { + "epoch": 0.04717859581600348, + "grad_norm": 0.6685290970001552, + "learning_rate": 1.5725222429133044e-05, + "loss": 7.2893, + "step": 760 + }, + { + "epoch": 0.047240672915761374, + "grad_norm": 0.5370952812426466, + "learning_rate": 1.5745913511276643e-05, + "loss": 7.2225, + "step": 761 + }, + { + "epoch": 0.04730275001551928, + "grad_norm": 0.5966715960414439, + "learning_rate": 1.5766604593420238e-05, + "loss": 7.2201, + "step": 762 + }, + { + "epoch": 0.047364827115277175, + "grad_norm": 0.5194860453048565, + "learning_rate": 1.5787295675563833e-05, + "loss": 7.2464, + "step": 763 + }, + { + "epoch": 0.04742690421503507, + "grad_norm": 0.5032869421925111, + "learning_rate": 1.580798675770743e-05, + "loss": 7.2968, + "step": 764 + }, + { + "epoch": 0.047488981314792976, + "grad_norm": 0.8140539567905467, + "learning_rate": 1.5828677839851026e-05, + "loss": 7.108, + "step": 765 + }, + { + "epoch": 0.04755105841455087, + "grad_norm": 0.6362164159878398, + "learning_rate": 1.584936892199462e-05, + "loss": 7.194, + "step": 766 + }, + { + "epoch": 0.04761313551430877, + "grad_norm": 0.5790419395078125, + "learning_rate": 1.5870060004138216e-05, + "loss": 7.336, + "step": 767 + }, + { + "epoch": 0.047675212614066674, + "grad_norm": 0.6585527391437138, + "learning_rate": 1.5890751086281815e-05, + "loss": 7.1459, + "step": 768 + }, + { + "epoch": 0.04773728971382457, + "grad_norm": 0.6996100412405005, + "learning_rate": 1.591144216842541e-05, + "loss": 7.1714, + "step": 769 + }, + { + "epoch": 0.04779936681358247, + "grad_norm": 0.5595567045837334, + "learning_rate": 1.5932133250569005e-05, + "loss": 7.2508, + "step": 770 + }, + { + "epoch": 0.04786144391334037, + "grad_norm": 0.5789658900542531, + "learning_rate": 1.59528243327126e-05, + "loss": 7.2449, + "step": 771 + }, + { + "epoch": 0.04792352101309827, + "grad_norm": 0.4504253767755244, + "learning_rate": 1.5973515414856195e-05, + "loss": 7.2381, + "step": 772 + }, + { + "epoch": 0.047985598112856166, + "grad_norm": 0.542321567285065, + "learning_rate": 1.5994206496999794e-05, + "loss": 7.2302, + "step": 773 + }, + { + "epoch": 0.04804767521261406, + "grad_norm": 0.518399231448531, + "learning_rate": 1.601489757914339e-05, + "loss": 7.2554, + "step": 774 + }, + { + "epoch": 0.04810975231237197, + "grad_norm": 0.5616960343183387, + "learning_rate": 1.6035588661286984e-05, + "loss": 7.1474, + "step": 775 + }, + { + "epoch": 0.048171829412129864, + "grad_norm": 0.39349705579400285, + "learning_rate": 1.6056279743430582e-05, + "loss": 7.2494, + "step": 776 + }, + { + "epoch": 0.04823390651188776, + "grad_norm": 0.5812298166195801, + "learning_rate": 1.6076970825574177e-05, + "loss": 7.1535, + "step": 777 + }, + { + "epoch": 0.048295983611645665, + "grad_norm": 0.3971519300018907, + "learning_rate": 1.6097661907717772e-05, + "loss": 7.2026, + "step": 778 + }, + { + "epoch": 0.04835806071140356, + "grad_norm": 0.731659637546302, + "learning_rate": 1.611835298986137e-05, + "loss": 7.3063, + "step": 779 + }, + { + "epoch": 0.04842013781116146, + "grad_norm": 0.5307501236281608, + "learning_rate": 1.6139044072004966e-05, + "loss": 7.108, + "step": 780 + }, + { + "epoch": 0.04848221491091936, + "grad_norm": 0.39900903288337347, + "learning_rate": 1.615973515414856e-05, + "loss": 7.2024, + "step": 781 + }, + { + "epoch": 0.04854429201067726, + "grad_norm": 0.4360116779290224, + "learning_rate": 1.618042623629216e-05, + "loss": 7.1376, + "step": 782 + }, + { + "epoch": 0.04860636911043516, + "grad_norm": 0.420170182734411, + "learning_rate": 1.6201117318435755e-05, + "loss": 7.1604, + "step": 783 + }, + { + "epoch": 0.04866844621019306, + "grad_norm": 0.8766961687681564, + "learning_rate": 1.622180840057935e-05, + "loss": 7.1753, + "step": 784 + }, + { + "epoch": 0.04873052330995096, + "grad_norm": 0.9699895066254116, + "learning_rate": 1.6242499482722948e-05, + "loss": 7.208, + "step": 785 + }, + { + "epoch": 0.048792600409708856, + "grad_norm": 0.5945042788268958, + "learning_rate": 1.6263190564866543e-05, + "loss": 7.1153, + "step": 786 + }, + { + "epoch": 0.04885467750946676, + "grad_norm": 0.6496368026219081, + "learning_rate": 1.628388164701014e-05, + "loss": 7.2451, + "step": 787 + }, + { + "epoch": 0.04891675460922466, + "grad_norm": 0.5355140436683561, + "learning_rate": 1.6304572729153737e-05, + "loss": 7.0724, + "step": 788 + }, + { + "epoch": 0.048978831708982554, + "grad_norm": 0.7314318259693121, + "learning_rate": 1.6325263811297332e-05, + "loss": 7.1184, + "step": 789 + }, + { + "epoch": 0.04904090880874046, + "grad_norm": 0.6774801607499948, + "learning_rate": 1.634595489344093e-05, + "loss": 7.1635, + "step": 790 + }, + { + "epoch": 0.049102985908498355, + "grad_norm": 0.564671352136592, + "learning_rate": 1.6366645975584525e-05, + "loss": 7.2033, + "step": 791 + }, + { + "epoch": 0.04916506300825625, + "grad_norm": 0.675276109949763, + "learning_rate": 1.638733705772812e-05, + "loss": 7.2465, + "step": 792 + }, + { + "epoch": 0.049227140108014156, + "grad_norm": 0.771380762796074, + "learning_rate": 1.640802813987172e-05, + "loss": 7.2079, + "step": 793 + }, + { + "epoch": 0.04928921720777205, + "grad_norm": 0.9744045065179759, + "learning_rate": 1.6428719222015314e-05, + "loss": 7.2043, + "step": 794 + }, + { + "epoch": 0.04935129430752995, + "grad_norm": 0.6289772562257386, + "learning_rate": 1.644941030415891e-05, + "loss": 7.1729, + "step": 795 + }, + { + "epoch": 0.049413371407287854, + "grad_norm": 1.8769347699048855, + "learning_rate": 1.6470101386302504e-05, + "loss": 7.176, + "step": 796 + }, + { + "epoch": 0.04947544850704575, + "grad_norm": 0.6976418855732577, + "learning_rate": 1.64907924684461e-05, + "loss": 7.2363, + "step": 797 + }, + { + "epoch": 0.04953752560680365, + "grad_norm": 0.8750564693212877, + "learning_rate": 1.6511483550589694e-05, + "loss": 7.1574, + "step": 798 + }, + { + "epoch": 0.04959960270656155, + "grad_norm": 0.9649814090519186, + "learning_rate": 1.6532174632733293e-05, + "loss": 7.1629, + "step": 799 + }, + { + "epoch": 0.04966167980631945, + "grad_norm": 0.7315763348522287, + "learning_rate": 1.6552865714876888e-05, + "loss": 7.126, + "step": 800 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.2792867179642113, + "learning_rate": 1.6573556797020483e-05, + "loss": 7.0738, + "step": 801 + }, + { + "epoch": 0.04978583400583525, + "grad_norm": 0.8419527055637193, + "learning_rate": 1.659424787916408e-05, + "loss": 7.0421, + "step": 802 + }, + { + "epoch": 0.04984791110559315, + "grad_norm": 0.7828434524964989, + "learning_rate": 1.6614938961307676e-05, + "loss": 7.0684, + "step": 803 + }, + { + "epoch": 0.049909988205351044, + "grad_norm": 0.6355015171702213, + "learning_rate": 1.663563004345127e-05, + "loss": 7.1038, + "step": 804 + }, + { + "epoch": 0.04997206530510895, + "grad_norm": 1.4238636237430025, + "learning_rate": 1.665632112559487e-05, + "loss": 7.1801, + "step": 805 + }, + { + "epoch": 0.050034142404866845, + "grad_norm": 0.9390582803387022, + "learning_rate": 1.6677012207738465e-05, + "loss": 7.0183, + "step": 806 + }, + { + "epoch": 0.05009621950462474, + "grad_norm": 0.43583237053756907, + "learning_rate": 1.669770328988206e-05, + "loss": 7.0846, + "step": 807 + }, + { + "epoch": 0.050158296604382646, + "grad_norm": 0.9029794080652798, + "learning_rate": 1.671839437202566e-05, + "loss": 7.0919, + "step": 808 + }, + { + "epoch": 0.05022037370414054, + "grad_norm": 1.1875823644885237, + "learning_rate": 1.6739085454169254e-05, + "loss": 7.1824, + "step": 809 + }, + { + "epoch": 0.05028245080389844, + "grad_norm": 1.9133283803235497, + "learning_rate": 1.675977653631285e-05, + "loss": 7.1647, + "step": 810 + }, + { + "epoch": 0.050344527903656344, + "grad_norm": 0.6786575845289926, + "learning_rate": 1.6780467618456447e-05, + "loss": 7.0406, + "step": 811 + }, + { + "epoch": 0.05040660500341424, + "grad_norm": 0.9158568742889259, + "learning_rate": 1.6801158700600042e-05, + "loss": 7.1174, + "step": 812 + }, + { + "epoch": 0.05046868210317214, + "grad_norm": 0.5105919407259685, + "learning_rate": 1.6821849782743637e-05, + "loss": 7.119, + "step": 813 + }, + { + "epoch": 0.05053075920293004, + "grad_norm": 1.1701913564649327, + "learning_rate": 1.6842540864887236e-05, + "loss": 7.1592, + "step": 814 + }, + { + "epoch": 0.05059283630268794, + "grad_norm": 0.4454109685347196, + "learning_rate": 1.686323194703083e-05, + "loss": 7.1537, + "step": 815 + }, + { + "epoch": 0.050654913402445836, + "grad_norm": 0.6593752874581673, + "learning_rate": 1.6883923029174426e-05, + "loss": 7.1173, + "step": 816 + }, + { + "epoch": 0.05071699050220374, + "grad_norm": 0.6359770966370561, + "learning_rate": 1.6904614111318024e-05, + "loss": 7.0158, + "step": 817 + }, + { + "epoch": 0.05077906760196164, + "grad_norm": 0.5601165395624271, + "learning_rate": 1.692530519346162e-05, + "loss": 6.9994, + "step": 818 + }, + { + "epoch": 0.050841144701719534, + "grad_norm": 0.48081644394939377, + "learning_rate": 1.6945996275605215e-05, + "loss": 7.039, + "step": 819 + }, + { + "epoch": 0.05090322180147743, + "grad_norm": 0.537971692150623, + "learning_rate": 1.6966687357748813e-05, + "loss": 7.0909, + "step": 820 + }, + { + "epoch": 0.050965298901235336, + "grad_norm": 0.5484238353885386, + "learning_rate": 1.6987378439892408e-05, + "loss": 7.0081, + "step": 821 + }, + { + "epoch": 0.05102737600099323, + "grad_norm": 0.47938951375841277, + "learning_rate": 1.7008069522036003e-05, + "loss": 6.9612, + "step": 822 + }, + { + "epoch": 0.05108945310075113, + "grad_norm": 0.7000048816033856, + "learning_rate": 1.70287606041796e-05, + "loss": 7.0497, + "step": 823 + }, + { + "epoch": 0.051151530200509034, + "grad_norm": 0.5217298579968547, + "learning_rate": 1.7049451686323197e-05, + "loss": 7.1133, + "step": 824 + }, + { + "epoch": 0.05121360730026693, + "grad_norm": 0.45691188655523324, + "learning_rate": 1.7070142768466792e-05, + "loss": 7.1052, + "step": 825 + }, + { + "epoch": 0.05127568440002483, + "grad_norm": 0.5647055249696599, + "learning_rate": 1.7090833850610387e-05, + "loss": 6.9828, + "step": 826 + }, + { + "epoch": 0.05133776149978273, + "grad_norm": 0.39688167491315857, + "learning_rate": 1.7111524932753982e-05, + "loss": 7.1046, + "step": 827 + }, + { + "epoch": 0.05139983859954063, + "grad_norm": 0.4373295315648967, + "learning_rate": 1.713221601489758e-05, + "loss": 7.0388, + "step": 828 + }, + { + "epoch": 0.051461915699298526, + "grad_norm": 0.4459930038929808, + "learning_rate": 1.7152907097041175e-05, + "loss": 7.0696, + "step": 829 + }, + { + "epoch": 0.05152399279905643, + "grad_norm": 0.42360556599717275, + "learning_rate": 1.717359817918477e-05, + "loss": 7.0227, + "step": 830 + }, + { + "epoch": 0.05158606989881433, + "grad_norm": 0.6118809695629656, + "learning_rate": 1.719428926132837e-05, + "loss": 6.9967, + "step": 831 + }, + { + "epoch": 0.051648146998572224, + "grad_norm": 0.4210783182405806, + "learning_rate": 1.7214980343471964e-05, + "loss": 7.0954, + "step": 832 + }, + { + "epoch": 0.05171022409833013, + "grad_norm": 0.5156707454387341, + "learning_rate": 1.723567142561556e-05, + "loss": 7.0909, + "step": 833 + }, + { + "epoch": 0.051772301198088025, + "grad_norm": 0.49138445661824603, + "learning_rate": 1.7256362507759158e-05, + "loss": 7.0344, + "step": 834 + }, + { + "epoch": 0.05183437829784592, + "grad_norm": 0.4577457190489998, + "learning_rate": 1.7277053589902753e-05, + "loss": 7.054, + "step": 835 + }, + { + "epoch": 0.051896455397603826, + "grad_norm": 0.4247297725172015, + "learning_rate": 1.7297744672046348e-05, + "loss": 7.0079, + "step": 836 + }, + { + "epoch": 0.05195853249736172, + "grad_norm": 0.4262678358256214, + "learning_rate": 1.7318435754189946e-05, + "loss": 7.0522, + "step": 837 + }, + { + "epoch": 0.05202060959711962, + "grad_norm": 0.4086288714221104, + "learning_rate": 1.733912683633354e-05, + "loss": 7.043, + "step": 838 + }, + { + "epoch": 0.052082686696877524, + "grad_norm": 0.5142749683547941, + "learning_rate": 1.7359817918477136e-05, + "loss": 6.9496, + "step": 839 + }, + { + "epoch": 0.05214476379663542, + "grad_norm": 0.4686081671714957, + "learning_rate": 1.7380509000620735e-05, + "loss": 7.0484, + "step": 840 + }, + { + "epoch": 0.05220684089639332, + "grad_norm": 0.459110843198201, + "learning_rate": 1.740120008276433e-05, + "loss": 7.115, + "step": 841 + }, + { + "epoch": 0.05226891799615122, + "grad_norm": 0.7031662610981053, + "learning_rate": 1.7421891164907925e-05, + "loss": 7.1006, + "step": 842 + }, + { + "epoch": 0.05233099509590912, + "grad_norm": 0.5747105834125169, + "learning_rate": 1.7442582247051523e-05, + "loss": 6.9785, + "step": 843 + }, + { + "epoch": 0.052393072195667016, + "grad_norm": 0.6098759621614348, + "learning_rate": 1.746327332919512e-05, + "loss": 6.9576, + "step": 844 + }, + { + "epoch": 0.05245514929542492, + "grad_norm": 0.4874500677017351, + "learning_rate": 1.7483964411338714e-05, + "loss": 7.0204, + "step": 845 + }, + { + "epoch": 0.05251722639518282, + "grad_norm": 0.5916541654537679, + "learning_rate": 1.7504655493482312e-05, + "loss": 6.8485, + "step": 846 + }, + { + "epoch": 0.052579303494940714, + "grad_norm": 0.5090032262231473, + "learning_rate": 1.7525346575625907e-05, + "loss": 7.0029, + "step": 847 + }, + { + "epoch": 0.05264138059469862, + "grad_norm": 0.5691563632231694, + "learning_rate": 1.7546037657769502e-05, + "loss": 6.9475, + "step": 848 + }, + { + "epoch": 0.052703457694456515, + "grad_norm": 1.0791133275836304, + "learning_rate": 1.75667287399131e-05, + "loss": 7.1518, + "step": 849 + }, + { + "epoch": 0.05276553479421441, + "grad_norm": 1.290693511252859, + "learning_rate": 1.7587419822056696e-05, + "loss": 6.9614, + "step": 850 + }, + { + "epoch": 0.052827611893972316, + "grad_norm": 1.2895305824070058, + "learning_rate": 1.760811090420029e-05, + "loss": 6.9753, + "step": 851 + }, + { + "epoch": 0.05288968899373021, + "grad_norm": 0.9780843549577181, + "learning_rate": 1.7628801986343886e-05, + "loss": 7.0031, + "step": 852 + }, + { + "epoch": 0.05295176609348811, + "grad_norm": 0.8137846108150346, + "learning_rate": 1.764949306848748e-05, + "loss": 7.0544, + "step": 853 + }, + { + "epoch": 0.053013843193246014, + "grad_norm": 0.8734596522598076, + "learning_rate": 1.7670184150631076e-05, + "loss": 6.846, + "step": 854 + }, + { + "epoch": 0.05307592029300391, + "grad_norm": 0.7016780375814999, + "learning_rate": 1.7690875232774675e-05, + "loss": 6.9791, + "step": 855 + }, + { + "epoch": 0.05313799739276181, + "grad_norm": 1.491038414269038, + "learning_rate": 1.771156631491827e-05, + "loss": 6.9131, + "step": 856 + }, + { + "epoch": 0.05320007449251971, + "grad_norm": 1.0124772222335112, + "learning_rate": 1.7732257397061865e-05, + "loss": 6.9625, + "step": 857 + }, + { + "epoch": 0.05326215159227761, + "grad_norm": 1.066103881895138, + "learning_rate": 1.7752948479205463e-05, + "loss": 7.0435, + "step": 858 + }, + { + "epoch": 0.05332422869203551, + "grad_norm": 0.9902264295631681, + "learning_rate": 1.7773639561349058e-05, + "loss": 7.068, + "step": 859 + }, + { + "epoch": 0.05338630579179341, + "grad_norm": 0.7869855911191258, + "learning_rate": 1.7794330643492653e-05, + "loss": 6.9023, + "step": 860 + }, + { + "epoch": 0.05344838289155131, + "grad_norm": 0.8132418766806282, + "learning_rate": 1.7815021725636252e-05, + "loss": 7.0588, + "step": 861 + }, + { + "epoch": 0.053510459991309205, + "grad_norm": 0.5900359188073113, + "learning_rate": 1.7835712807779847e-05, + "loss": 6.9763, + "step": 862 + }, + { + "epoch": 0.05357253709106711, + "grad_norm": 0.9455947769791316, + "learning_rate": 1.7856403889923442e-05, + "loss": 6.9408, + "step": 863 + }, + { + "epoch": 0.053634614190825006, + "grad_norm": 0.7599098441331575, + "learning_rate": 1.787709497206704e-05, + "loss": 6.9269, + "step": 864 + }, + { + "epoch": 0.0536966912905829, + "grad_norm": 0.824938527620249, + "learning_rate": 1.7897786054210635e-05, + "loss": 6.8942, + "step": 865 + }, + { + "epoch": 0.05375876839034081, + "grad_norm": 0.7779991798862361, + "learning_rate": 1.791847713635423e-05, + "loss": 6.9979, + "step": 866 + }, + { + "epoch": 0.053820845490098704, + "grad_norm": 0.6355772961895431, + "learning_rate": 1.793916821849783e-05, + "loss": 6.8617, + "step": 867 + }, + { + "epoch": 0.0538829225898566, + "grad_norm": 0.9498998137399483, + "learning_rate": 1.7959859300641424e-05, + "loss": 6.9509, + "step": 868 + }, + { + "epoch": 0.0539449996896145, + "grad_norm": 0.8613003058060622, + "learning_rate": 1.798055038278502e-05, + "loss": 6.9409, + "step": 869 + }, + { + "epoch": 0.0540070767893724, + "grad_norm": 0.9351811978510788, + "learning_rate": 1.8001241464928618e-05, + "loss": 6.8996, + "step": 870 + }, + { + "epoch": 0.0540691538891303, + "grad_norm": 0.6858790752928728, + "learning_rate": 1.8021932547072213e-05, + "loss": 6.9192, + "step": 871 + }, + { + "epoch": 0.054131230988888196, + "grad_norm": 0.9557094242783629, + "learning_rate": 1.804262362921581e-05, + "loss": 6.9657, + "step": 872 + }, + { + "epoch": 0.0541933080886461, + "grad_norm": 0.5949417000746644, + "learning_rate": 1.8063314711359406e-05, + "loss": 7.0635, + "step": 873 + }, + { + "epoch": 0.054255385188404, + "grad_norm": 0.894478575642353, + "learning_rate": 1.8084005793503e-05, + "loss": 6.9737, + "step": 874 + }, + { + "epoch": 0.054317462288161894, + "grad_norm": 0.7210856097289888, + "learning_rate": 1.81046968756466e-05, + "loss": 6.9281, + "step": 875 + }, + { + "epoch": 0.0543795393879198, + "grad_norm": 0.518839037963814, + "learning_rate": 1.8125387957790195e-05, + "loss": 6.8534, + "step": 876 + }, + { + "epoch": 0.054441616487677695, + "grad_norm": 1.0892134727357847, + "learning_rate": 1.814607903993379e-05, + "loss": 6.9683, + "step": 877 + }, + { + "epoch": 0.05450369358743559, + "grad_norm": 0.7981638489829662, + "learning_rate": 1.8166770122077385e-05, + "loss": 6.8763, + "step": 878 + }, + { + "epoch": 0.054565770687193496, + "grad_norm": 1.0734608699025951, + "learning_rate": 1.8187461204220984e-05, + "loss": 6.9315, + "step": 879 + }, + { + "epoch": 0.05462784778695139, + "grad_norm": 0.9140017562636482, + "learning_rate": 1.820815228636458e-05, + "loss": 6.9739, + "step": 880 + }, + { + "epoch": 0.05468992488670929, + "grad_norm": 1.2527760673890957, + "learning_rate": 1.8228843368508174e-05, + "loss": 7.0806, + "step": 881 + }, + { + "epoch": 0.054752001986467194, + "grad_norm": 1.0847818826041633, + "learning_rate": 1.824953445065177e-05, + "loss": 7.0279, + "step": 882 + }, + { + "epoch": 0.05481407908622509, + "grad_norm": 0.8297620104075372, + "learning_rate": 1.8270225532795364e-05, + "loss": 6.9358, + "step": 883 + }, + { + "epoch": 0.05487615618598299, + "grad_norm": 0.8784255005713439, + "learning_rate": 1.8290916614938962e-05, + "loss": 6.8978, + "step": 884 + }, + { + "epoch": 0.05493823328574089, + "grad_norm": 0.8536881331428802, + "learning_rate": 1.8311607697082557e-05, + "loss": 6.9222, + "step": 885 + }, + { + "epoch": 0.05500031038549879, + "grad_norm": 1.1685713422360857, + "learning_rate": 1.8332298779226152e-05, + "loss": 6.882, + "step": 886 + }, + { + "epoch": 0.055062387485256686, + "grad_norm": 1.358209746341622, + "learning_rate": 1.835298986136975e-05, + "loss": 6.9528, + "step": 887 + }, + { + "epoch": 0.05512446458501459, + "grad_norm": 1.6985760755955475, + "learning_rate": 1.8373680943513346e-05, + "loss": 6.9252, + "step": 888 + }, + { + "epoch": 0.05518654168477249, + "grad_norm": 1.1725561578311814, + "learning_rate": 1.839437202565694e-05, + "loss": 6.8923, + "step": 889 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 0.9804563787523334, + "learning_rate": 1.841506310780054e-05, + "loss": 6.9448, + "step": 890 + }, + { + "epoch": 0.05531069588428829, + "grad_norm": 1.0528129590770323, + "learning_rate": 1.8435754189944135e-05, + "loss": 6.8221, + "step": 891 + }, + { + "epoch": 0.055372772984046185, + "grad_norm": 0.6702849402464146, + "learning_rate": 1.845644527208773e-05, + "loss": 6.9407, + "step": 892 + }, + { + "epoch": 0.05543485008380408, + "grad_norm": 1.1085441056709626, + "learning_rate": 1.8477136354231328e-05, + "loss": 6.9404, + "step": 893 + }, + { + "epoch": 0.055496927183561987, + "grad_norm": 0.9236726529230164, + "learning_rate": 1.8497827436374923e-05, + "loss": 6.8323, + "step": 894 + }, + { + "epoch": 0.055559004283319884, + "grad_norm": 0.6297793991007912, + "learning_rate": 1.8518518518518518e-05, + "loss": 6.905, + "step": 895 + }, + { + "epoch": 0.05562108138307778, + "grad_norm": 0.9833732533568488, + "learning_rate": 1.8539209600662117e-05, + "loss": 6.9343, + "step": 896 + }, + { + "epoch": 0.055683158482835685, + "grad_norm": 0.8530746473047987, + "learning_rate": 1.8559900682805712e-05, + "loss": 6.9676, + "step": 897 + }, + { + "epoch": 0.05574523558259358, + "grad_norm": 0.7252072235452529, + "learning_rate": 1.8580591764949307e-05, + "loss": 6.7433, + "step": 898 + }, + { + "epoch": 0.05580731268235148, + "grad_norm": 0.8085246770722297, + "learning_rate": 1.8601282847092905e-05, + "loss": 6.8553, + "step": 899 + }, + { + "epoch": 0.05586938978210938, + "grad_norm": 0.8701062956362697, + "learning_rate": 1.86219739292365e-05, + "loss": 6.8266, + "step": 900 + }, + { + "epoch": 0.05593146688186728, + "grad_norm": 0.6971939007282748, + "learning_rate": 1.8642665011380096e-05, + "loss": 6.8373, + "step": 901 + }, + { + "epoch": 0.05599354398162518, + "grad_norm": 0.7781362662974315, + "learning_rate": 1.8663356093523694e-05, + "loss": 6.8371, + "step": 902 + }, + { + "epoch": 0.05605562108138308, + "grad_norm": 0.8374765079676594, + "learning_rate": 1.868404717566729e-05, + "loss": 6.8285, + "step": 903 + }, + { + "epoch": 0.05611769818114098, + "grad_norm": 0.6823972639723864, + "learning_rate": 1.8704738257810884e-05, + "loss": 6.9048, + "step": 904 + }, + { + "epoch": 0.056179775280898875, + "grad_norm": 0.6976113624271886, + "learning_rate": 1.8725429339954483e-05, + "loss": 6.8396, + "step": 905 + }, + { + "epoch": 0.05624185238065678, + "grad_norm": 0.8310090712546815, + "learning_rate": 1.8746120422098078e-05, + "loss": 6.839, + "step": 906 + }, + { + "epoch": 0.056303929480414676, + "grad_norm": 0.6395062004531422, + "learning_rate": 1.8766811504241673e-05, + "loss": 6.9057, + "step": 907 + }, + { + "epoch": 0.05636600658017257, + "grad_norm": 0.6743745440392283, + "learning_rate": 1.8787502586385268e-05, + "loss": 6.8231, + "step": 908 + }, + { + "epoch": 0.05642808367993048, + "grad_norm": 0.5911343583213011, + "learning_rate": 1.8808193668528863e-05, + "loss": 6.9328, + "step": 909 + }, + { + "epoch": 0.056490160779688374, + "grad_norm": 0.943525058894695, + "learning_rate": 1.882888475067246e-05, + "loss": 6.7319, + "step": 910 + }, + { + "epoch": 0.05655223787944627, + "grad_norm": 0.8431362462972705, + "learning_rate": 1.8849575832816056e-05, + "loss": 6.9046, + "step": 911 + }, + { + "epoch": 0.056614314979204175, + "grad_norm": 0.851068090700202, + "learning_rate": 1.887026691495965e-05, + "loss": 6.9779, + "step": 912 + }, + { + "epoch": 0.05667639207896207, + "grad_norm": 0.8119348981696834, + "learning_rate": 1.889095799710325e-05, + "loss": 6.9259, + "step": 913 + }, + { + "epoch": 0.05673846917871997, + "grad_norm": 0.9850348425768467, + "learning_rate": 1.8911649079246845e-05, + "loss": 6.8735, + "step": 914 + }, + { + "epoch": 0.056800546278477866, + "grad_norm": 0.9372087662062586, + "learning_rate": 1.893234016139044e-05, + "loss": 6.7753, + "step": 915 + }, + { + "epoch": 0.05686262337823577, + "grad_norm": 2.4869190361915727, + "learning_rate": 1.895303124353404e-05, + "loss": 6.9217, + "step": 916 + }, + { + "epoch": 0.05692470047799367, + "grad_norm": 0.9399020575672027, + "learning_rate": 1.8973722325677634e-05, + "loss": 6.9385, + "step": 917 + }, + { + "epoch": 0.056986777577751564, + "grad_norm": 0.8254220143247165, + "learning_rate": 1.899441340782123e-05, + "loss": 6.8533, + "step": 918 + }, + { + "epoch": 0.05704885467750947, + "grad_norm": 0.9202469820233731, + "learning_rate": 1.9015104489964827e-05, + "loss": 6.8165, + "step": 919 + }, + { + "epoch": 0.057110931777267365, + "grad_norm": 0.9498463820182377, + "learning_rate": 1.9035795572108422e-05, + "loss": 6.7459, + "step": 920 + }, + { + "epoch": 0.05717300887702526, + "grad_norm": 0.7958798646222511, + "learning_rate": 1.9056486654252017e-05, + "loss": 6.8003, + "step": 921 + }, + { + "epoch": 0.057235085976783166, + "grad_norm": 0.8792461162000864, + "learning_rate": 1.9077177736395616e-05, + "loss": 6.9193, + "step": 922 + }, + { + "epoch": 0.05729716307654106, + "grad_norm": 0.9223571744830763, + "learning_rate": 1.909786881853921e-05, + "loss": 6.7439, + "step": 923 + }, + { + "epoch": 0.05735924017629896, + "grad_norm": 1.0471799036363494, + "learning_rate": 1.9118559900682806e-05, + "loss": 6.7046, + "step": 924 + }, + { + "epoch": 0.057421317276056864, + "grad_norm": 0.5797950233620913, + "learning_rate": 1.9139250982826404e-05, + "loss": 6.8571, + "step": 925 + }, + { + "epoch": 0.05748339437581476, + "grad_norm": 0.5387674712957097, + "learning_rate": 1.915994206497e-05, + "loss": 6.7757, + "step": 926 + }, + { + "epoch": 0.05754547147557266, + "grad_norm": 0.5891419081452007, + "learning_rate": 1.9180633147113595e-05, + "loss": 6.8493, + "step": 927 + }, + { + "epoch": 0.05760754857533056, + "grad_norm": 0.5612478093909522, + "learning_rate": 1.9201324229257193e-05, + "loss": 6.8123, + "step": 928 + }, + { + "epoch": 0.05766962567508846, + "grad_norm": 0.5196438143978375, + "learning_rate": 1.9222015311400788e-05, + "loss": 6.8462, + "step": 929 + }, + { + "epoch": 0.05773170277484636, + "grad_norm": 0.4370719484562318, + "learning_rate": 1.9242706393544383e-05, + "loss": 6.7679, + "step": 930 + }, + { + "epoch": 0.05779377987460426, + "grad_norm": 0.5609476338270752, + "learning_rate": 1.926339747568798e-05, + "loss": 6.8288, + "step": 931 + }, + { + "epoch": 0.05785585697436216, + "grad_norm": 0.49065864107489476, + "learning_rate": 1.9284088557831577e-05, + "loss": 6.7531, + "step": 932 + }, + { + "epoch": 0.057917934074120055, + "grad_norm": 0.5880279887473316, + "learning_rate": 1.9304779639975172e-05, + "loss": 6.9136, + "step": 933 + }, + { + "epoch": 0.05798001117387796, + "grad_norm": 0.5304917084749736, + "learning_rate": 1.932547072211877e-05, + "loss": 6.8347, + "step": 934 + }, + { + "epoch": 0.058042088273635856, + "grad_norm": 0.46989639289240276, + "learning_rate": 1.9346161804262365e-05, + "loss": 6.8361, + "step": 935 + }, + { + "epoch": 0.05810416537339375, + "grad_norm": 0.7256761702731636, + "learning_rate": 1.936685288640596e-05, + "loss": 6.8933, + "step": 936 + }, + { + "epoch": 0.05816624247315166, + "grad_norm": 0.48838859903945586, + "learning_rate": 1.9387543968549556e-05, + "loss": 6.7756, + "step": 937 + }, + { + "epoch": 0.058228319572909554, + "grad_norm": 0.5386719048758009, + "learning_rate": 1.940823505069315e-05, + "loss": 6.7511, + "step": 938 + }, + { + "epoch": 0.05829039667266745, + "grad_norm": 0.6720445995517121, + "learning_rate": 1.9428926132836746e-05, + "loss": 6.7819, + "step": 939 + }, + { + "epoch": 0.058352473772425355, + "grad_norm": 0.5085218903247462, + "learning_rate": 1.9449617214980344e-05, + "loss": 6.783, + "step": 940 + }, + { + "epoch": 0.05841455087218325, + "grad_norm": 0.5326119803498297, + "learning_rate": 1.947030829712394e-05, + "loss": 6.8789, + "step": 941 + }, + { + "epoch": 0.05847662797194115, + "grad_norm": 0.5640107899895126, + "learning_rate": 1.9490999379267534e-05, + "loss": 6.7709, + "step": 942 + }, + { + "epoch": 0.05853870507169905, + "grad_norm": 0.7244494073945806, + "learning_rate": 1.9511690461411133e-05, + "loss": 6.7012, + "step": 943 + }, + { + "epoch": 0.05860078217145695, + "grad_norm": 0.8943387953110051, + "learning_rate": 1.9532381543554728e-05, + "loss": 6.7986, + "step": 944 + }, + { + "epoch": 0.05866285927121485, + "grad_norm": 0.7457081462357342, + "learning_rate": 1.9553072625698323e-05, + "loss": 6.8828, + "step": 945 + }, + { + "epoch": 0.05872493637097275, + "grad_norm": 1.1132845694509175, + "learning_rate": 1.957376370784192e-05, + "loss": 6.9166, + "step": 946 + }, + { + "epoch": 0.05878701347073065, + "grad_norm": 1.1072157679778705, + "learning_rate": 1.9594454789985516e-05, + "loss": 6.808, + "step": 947 + }, + { + "epoch": 0.058849090570488545, + "grad_norm": 1.1897559070284212, + "learning_rate": 1.961514587212911e-05, + "loss": 6.7591, + "step": 948 + }, + { + "epoch": 0.05891116767024645, + "grad_norm": 0.996453224092997, + "learning_rate": 1.963583695427271e-05, + "loss": 6.7148, + "step": 949 + }, + { + "epoch": 0.058973244770004346, + "grad_norm": 1.1137331931989993, + "learning_rate": 1.9656528036416305e-05, + "loss": 6.789, + "step": 950 + }, + { + "epoch": 0.05903532186976224, + "grad_norm": 1.1974219862626205, + "learning_rate": 1.96772191185599e-05, + "loss": 6.7635, + "step": 951 + }, + { + "epoch": 0.05909739896952015, + "grad_norm": 0.7687588993580853, + "learning_rate": 1.96979102007035e-05, + "loss": 6.7651, + "step": 952 + }, + { + "epoch": 0.059159476069278044, + "grad_norm": 0.9133857765131063, + "learning_rate": 1.9718601282847094e-05, + "loss": 6.7958, + "step": 953 + }, + { + "epoch": 0.05922155316903594, + "grad_norm": 0.8311228203104496, + "learning_rate": 1.9739292364990692e-05, + "loss": 6.8084, + "step": 954 + }, + { + "epoch": 0.059283630268793845, + "grad_norm": 1.2305623821177307, + "learning_rate": 1.9759983447134287e-05, + "loss": 6.7831, + "step": 955 + }, + { + "epoch": 0.05934570736855174, + "grad_norm": 0.812842370305352, + "learning_rate": 1.9780674529277882e-05, + "loss": 6.8073, + "step": 956 + }, + { + "epoch": 0.05940778446830964, + "grad_norm": 0.7035985623474028, + "learning_rate": 1.980136561142148e-05, + "loss": 6.8235, + "step": 957 + }, + { + "epoch": 0.05946986156806754, + "grad_norm": 0.6812942630492125, + "learning_rate": 1.9822056693565076e-05, + "loss": 6.6871, + "step": 958 + }, + { + "epoch": 0.05953193866782544, + "grad_norm": 0.7228459289710563, + "learning_rate": 1.984274777570867e-05, + "loss": 6.7401, + "step": 959 + }, + { + "epoch": 0.05959401576758334, + "grad_norm": 0.7015135418773697, + "learning_rate": 1.986343885785227e-05, + "loss": 6.7157, + "step": 960 + }, + { + "epoch": 0.059656092867341234, + "grad_norm": 0.8440844614673746, + "learning_rate": 1.9884129939995864e-05, + "loss": 6.6961, + "step": 961 + }, + { + "epoch": 0.05971816996709914, + "grad_norm": 0.7468047025150403, + "learning_rate": 1.990482102213946e-05, + "loss": 6.7231, + "step": 962 + }, + { + "epoch": 0.059780247066857035, + "grad_norm": 1.1141524269223086, + "learning_rate": 1.9925512104283055e-05, + "loss": 6.6135, + "step": 963 + }, + { + "epoch": 0.05984232416661493, + "grad_norm": 0.7644050766274768, + "learning_rate": 1.994620318642665e-05, + "loss": 6.758, + "step": 964 + }, + { + "epoch": 0.059904401266372836, + "grad_norm": 1.5304811844215958, + "learning_rate": 1.9966894268570248e-05, + "loss": 6.8208, + "step": 965 + }, + { + "epoch": 0.059966478366130734, + "grad_norm": 1.212219396546712, + "learning_rate": 1.9987585350713843e-05, + "loss": 6.6273, + "step": 966 + }, + { + "epoch": 0.06002855546588863, + "grad_norm": 0.578298027333235, + "learning_rate": 2.000827643285744e-05, + "loss": 6.6986, + "step": 967 + }, + { + "epoch": 0.060090632565646535, + "grad_norm": 0.6190990109101955, + "learning_rate": 2.0028967515001033e-05, + "loss": 6.6939, + "step": 968 + }, + { + "epoch": 0.06015270966540443, + "grad_norm": 1.0493045910950682, + "learning_rate": 2.0049658597144632e-05, + "loss": 6.6873, + "step": 969 + }, + { + "epoch": 0.06021478676516233, + "grad_norm": 0.8674097549593601, + "learning_rate": 2.0070349679288227e-05, + "loss": 6.6889, + "step": 970 + }, + { + "epoch": 0.06027686386492023, + "grad_norm": 1.2868953173730127, + "learning_rate": 2.0091040761431822e-05, + "loss": 6.7127, + "step": 971 + }, + { + "epoch": 0.06033894096467813, + "grad_norm": 1.2138717540799309, + "learning_rate": 2.011173184357542e-05, + "loss": 6.871, + "step": 972 + }, + { + "epoch": 0.06040101806443603, + "grad_norm": 1.0350939462013913, + "learning_rate": 2.0132422925719016e-05, + "loss": 6.8139, + "step": 973 + }, + { + "epoch": 0.06046309516419393, + "grad_norm": 0.908246012669743, + "learning_rate": 2.015311400786261e-05, + "loss": 6.7431, + "step": 974 + }, + { + "epoch": 0.06052517226395183, + "grad_norm": 0.7832859594818751, + "learning_rate": 2.017380509000621e-05, + "loss": 6.6279, + "step": 975 + }, + { + "epoch": 0.060587249363709725, + "grad_norm": 0.8137307382463274, + "learning_rate": 2.0194496172149804e-05, + "loss": 6.7229, + "step": 976 + }, + { + "epoch": 0.06064932646346763, + "grad_norm": 0.6744853256317508, + "learning_rate": 2.02151872542934e-05, + "loss": 6.7809, + "step": 977 + }, + { + "epoch": 0.060711403563225526, + "grad_norm": 0.8926556843589407, + "learning_rate": 2.0235878336436998e-05, + "loss": 6.717, + "step": 978 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.0225709347731144, + "learning_rate": 2.0256569418580593e-05, + "loss": 6.7406, + "step": 979 + }, + { + "epoch": 0.06083555776274133, + "grad_norm": 0.662220925805706, + "learning_rate": 2.0277260500724188e-05, + "loss": 6.7881, + "step": 980 + }, + { + "epoch": 0.060897634862499224, + "grad_norm": 0.8738933445103575, + "learning_rate": 2.0297951582867786e-05, + "loss": 6.6526, + "step": 981 + }, + { + "epoch": 0.06095971196225712, + "grad_norm": 0.6638397965331673, + "learning_rate": 2.031864266501138e-05, + "loss": 6.7002, + "step": 982 + }, + { + "epoch": 0.061021789062015025, + "grad_norm": 0.8721944244339515, + "learning_rate": 2.0339333747154976e-05, + "loss": 6.6586, + "step": 983 + }, + { + "epoch": 0.06108386616177292, + "grad_norm": 0.5564869718800322, + "learning_rate": 2.0360024829298575e-05, + "loss": 6.7583, + "step": 984 + }, + { + "epoch": 0.06114594326153082, + "grad_norm": 0.6900118112244249, + "learning_rate": 2.038071591144217e-05, + "loss": 6.6815, + "step": 985 + }, + { + "epoch": 0.06120802036128872, + "grad_norm": 0.9867630405709604, + "learning_rate": 2.0401406993585765e-05, + "loss": 6.6812, + "step": 986 + }, + { + "epoch": 0.06127009746104662, + "grad_norm": 0.6014204724583304, + "learning_rate": 2.0422098075729364e-05, + "loss": 6.6738, + "step": 987 + }, + { + "epoch": 0.06133217456080452, + "grad_norm": 0.7426979317456148, + "learning_rate": 2.044278915787296e-05, + "loss": 6.6597, + "step": 988 + }, + { + "epoch": 0.06139425166056242, + "grad_norm": 0.6562384876876313, + "learning_rate": 2.0463480240016554e-05, + "loss": 6.6879, + "step": 989 + }, + { + "epoch": 0.06145632876032032, + "grad_norm": 0.7212562815956267, + "learning_rate": 2.0484171322160152e-05, + "loss": 6.7104, + "step": 990 + }, + { + "epoch": 0.061518405860078215, + "grad_norm": 0.6614443609809463, + "learning_rate": 2.0504862404303747e-05, + "loss": 6.6147, + "step": 991 + }, + { + "epoch": 0.06158048295983612, + "grad_norm": 0.5501481062706993, + "learning_rate": 2.0525553486447342e-05, + "loss": 6.6949, + "step": 992 + }, + { + "epoch": 0.061642560059594016, + "grad_norm": 0.61545776671163, + "learning_rate": 2.0546244568590937e-05, + "loss": 6.6513, + "step": 993 + }, + { + "epoch": 0.06170463715935191, + "grad_norm": 0.6869490973828379, + "learning_rate": 2.0566935650734532e-05, + "loss": 6.6527, + "step": 994 + }, + { + "epoch": 0.06176671425910982, + "grad_norm": 0.7659580907462448, + "learning_rate": 2.0587626732878128e-05, + "loss": 6.5793, + "step": 995 + }, + { + "epoch": 0.061828791358867714, + "grad_norm": 0.6568722726491245, + "learning_rate": 2.0608317815021726e-05, + "loss": 6.6523, + "step": 996 + }, + { + "epoch": 0.06189086845862561, + "grad_norm": 0.49697326432241673, + "learning_rate": 2.062900889716532e-05, + "loss": 6.6855, + "step": 997 + }, + { + "epoch": 0.061952945558383515, + "grad_norm": 0.6209566936780567, + "learning_rate": 2.064969997930892e-05, + "loss": 6.7814, + "step": 998 + }, + { + "epoch": 0.06201502265814141, + "grad_norm": 0.5669354301636056, + "learning_rate": 2.0670391061452515e-05, + "loss": 6.6237, + "step": 999 + }, + { + "epoch": 0.06207709975789931, + "grad_norm": 0.5075357416561482, + "learning_rate": 2.069108214359611e-05, + "loss": 6.5661, + "step": 1000 + }, + { + "epoch": 0.06213917685765721, + "grad_norm": 0.6772359504847724, + "learning_rate": 2.0711773225739708e-05, + "loss": 6.6332, + "step": 1001 + }, + { + "epoch": 0.06220125395741511, + "grad_norm": 0.49407236283632183, + "learning_rate": 2.0732464307883303e-05, + "loss": 6.701, + "step": 1002 + }, + { + "epoch": 0.06226333105717301, + "grad_norm": 0.5529630208318232, + "learning_rate": 2.07531553900269e-05, + "loss": 6.6035, + "step": 1003 + }, + { + "epoch": 0.06232540815693091, + "grad_norm": 0.6118380580033149, + "learning_rate": 2.0773846472170497e-05, + "loss": 6.6216, + "step": 1004 + }, + { + "epoch": 0.06238748525668881, + "grad_norm": 0.5950601829016525, + "learning_rate": 2.0794537554314092e-05, + "loss": 6.7, + "step": 1005 + }, + { + "epoch": 0.062449562356446706, + "grad_norm": 0.619507617668072, + "learning_rate": 2.0815228636457687e-05, + "loss": 6.5878, + "step": 1006 + }, + { + "epoch": 0.06251163945620461, + "grad_norm": 0.807257501843169, + "learning_rate": 2.0835919718601285e-05, + "loss": 6.6077, + "step": 1007 + }, + { + "epoch": 0.0625737165559625, + "grad_norm": 0.7233204032105576, + "learning_rate": 2.085661080074488e-05, + "loss": 6.6432, + "step": 1008 + }, + { + "epoch": 0.0626357936557204, + "grad_norm": 0.7190403144659986, + "learning_rate": 2.0877301882888476e-05, + "loss": 6.6259, + "step": 1009 + }, + { + "epoch": 0.06269787075547831, + "grad_norm": 0.4779006431618366, + "learning_rate": 2.0897992965032074e-05, + "loss": 6.6621, + "step": 1010 + }, + { + "epoch": 0.0627599478552362, + "grad_norm": 0.657661962613495, + "learning_rate": 2.091868404717567e-05, + "loss": 6.5745, + "step": 1011 + }, + { + "epoch": 0.0628220249549941, + "grad_norm": 0.609625618090149, + "learning_rate": 2.0939375129319264e-05, + "loss": 6.7389, + "step": 1012 + }, + { + "epoch": 0.062884102054752, + "grad_norm": 0.6800972153089333, + "learning_rate": 2.0960066211462863e-05, + "loss": 6.6386, + "step": 1013 + }, + { + "epoch": 0.0629461791545099, + "grad_norm": 0.8169746428048327, + "learning_rate": 2.0980757293606458e-05, + "loss": 6.622, + "step": 1014 + }, + { + "epoch": 0.0630082562542678, + "grad_norm": 0.836096917753968, + "learning_rate": 2.1001448375750053e-05, + "loss": 6.6103, + "step": 1015 + }, + { + "epoch": 0.0630703333540257, + "grad_norm": 0.5751205421644768, + "learning_rate": 2.102213945789365e-05, + "loss": 6.6911, + "step": 1016 + }, + { + "epoch": 0.0631324104537836, + "grad_norm": 0.9163672904784453, + "learning_rate": 2.1042830540037246e-05, + "loss": 6.63, + "step": 1017 + }, + { + "epoch": 0.0631944875535415, + "grad_norm": 0.6754864617522165, + "learning_rate": 2.106352162218084e-05, + "loss": 6.666, + "step": 1018 + }, + { + "epoch": 0.0632565646532994, + "grad_norm": 0.6452861641966402, + "learning_rate": 2.1084212704324436e-05, + "loss": 6.6115, + "step": 1019 + }, + { + "epoch": 0.06331864175305729, + "grad_norm": 0.5584071454559684, + "learning_rate": 2.110490378646803e-05, + "loss": 6.6178, + "step": 1020 + }, + { + "epoch": 0.0633807188528152, + "grad_norm": 0.6936188548082761, + "learning_rate": 2.112559486861163e-05, + "loss": 6.6173, + "step": 1021 + }, + { + "epoch": 0.0634427959525731, + "grad_norm": 0.5506662830460362, + "learning_rate": 2.1146285950755225e-05, + "loss": 6.6852, + "step": 1022 + }, + { + "epoch": 0.06350487305233099, + "grad_norm": 0.6128206406382831, + "learning_rate": 2.116697703289882e-05, + "loss": 6.6451, + "step": 1023 + }, + { + "epoch": 0.0635669501520889, + "grad_norm": 0.5533097308891152, + "learning_rate": 2.1187668115042415e-05, + "loss": 6.6076, + "step": 1024 + }, + { + "epoch": 0.0636290272518468, + "grad_norm": 0.5177333383131031, + "learning_rate": 2.1208359197186014e-05, + "loss": 6.6631, + "step": 1025 + }, + { + "epoch": 0.06369110435160469, + "grad_norm": 0.525378968063425, + "learning_rate": 2.122905027932961e-05, + "loss": 6.5528, + "step": 1026 + }, + { + "epoch": 0.06375318145136259, + "grad_norm": 0.5127375122141734, + "learning_rate": 2.1249741361473204e-05, + "loss": 6.665, + "step": 1027 + }, + { + "epoch": 0.0638152585511205, + "grad_norm": 0.5146341050412512, + "learning_rate": 2.1270432443616802e-05, + "loss": 6.6101, + "step": 1028 + }, + { + "epoch": 0.06387733565087839, + "grad_norm": 0.504026561243411, + "learning_rate": 2.1291123525760397e-05, + "loss": 6.5806, + "step": 1029 + }, + { + "epoch": 0.06393941275063629, + "grad_norm": 0.5562260740470812, + "learning_rate": 2.1311814607903992e-05, + "loss": 6.5463, + "step": 1030 + }, + { + "epoch": 0.0640014898503942, + "grad_norm": 0.5107182630256404, + "learning_rate": 2.133250569004759e-05, + "loss": 6.6885, + "step": 1031 + }, + { + "epoch": 0.06406356695015208, + "grad_norm": 0.6249791618688043, + "learning_rate": 2.1353196772191186e-05, + "loss": 6.5721, + "step": 1032 + }, + { + "epoch": 0.06412564404990999, + "grad_norm": 0.6574518914263614, + "learning_rate": 2.137388785433478e-05, + "loss": 6.6481, + "step": 1033 + }, + { + "epoch": 0.06418772114966789, + "grad_norm": 1.050170323771801, + "learning_rate": 2.139457893647838e-05, + "loss": 6.5487, + "step": 1034 + }, + { + "epoch": 0.06424979824942578, + "grad_norm": 0.8392387517584943, + "learning_rate": 2.1415270018621975e-05, + "loss": 6.6137, + "step": 1035 + }, + { + "epoch": 0.06431187534918369, + "grad_norm": 0.9897232692892716, + "learning_rate": 2.143596110076557e-05, + "loss": 6.6162, + "step": 1036 + }, + { + "epoch": 0.06437395244894159, + "grad_norm": 0.8776603854290618, + "learning_rate": 2.1456652182909168e-05, + "loss": 6.7224, + "step": 1037 + }, + { + "epoch": 0.06443602954869948, + "grad_norm": 0.8367723592045166, + "learning_rate": 2.1477343265052763e-05, + "loss": 6.6878, + "step": 1038 + }, + { + "epoch": 0.06449810664845738, + "grad_norm": 1.341785325357407, + "learning_rate": 2.1498034347196362e-05, + "loss": 6.5169, + "step": 1039 + }, + { + "epoch": 0.06456018374821529, + "grad_norm": 0.9709993827554644, + "learning_rate": 2.1518725429339957e-05, + "loss": 6.6408, + "step": 1040 + }, + { + "epoch": 0.06462226084797318, + "grad_norm": 0.6373220288801739, + "learning_rate": 2.1539416511483552e-05, + "loss": 6.5605, + "step": 1041 + }, + { + "epoch": 0.06468433794773108, + "grad_norm": 1.2732448783245698, + "learning_rate": 2.156010759362715e-05, + "loss": 6.6624, + "step": 1042 + }, + { + "epoch": 0.06474641504748899, + "grad_norm": 0.5888795108043435, + "learning_rate": 2.1580798675770745e-05, + "loss": 6.5357, + "step": 1043 + }, + { + "epoch": 0.06480849214724688, + "grad_norm": 1.1270241541436157, + "learning_rate": 2.160148975791434e-05, + "loss": 6.5963, + "step": 1044 + }, + { + "epoch": 0.06487056924700478, + "grad_norm": 0.6934285733790316, + "learning_rate": 2.162218084005794e-05, + "loss": 6.5984, + "step": 1045 + }, + { + "epoch": 0.06493264634676268, + "grad_norm": 0.91128695738773, + "learning_rate": 2.1642871922201534e-05, + "loss": 6.4925, + "step": 1046 + }, + { + "epoch": 0.06499472344652057, + "grad_norm": 0.710108262928295, + "learning_rate": 2.166356300434513e-05, + "loss": 6.633, + "step": 1047 + }, + { + "epoch": 0.06505680054627848, + "grad_norm": 0.9628590855259478, + "learning_rate": 2.1684254086488724e-05, + "loss": 6.6058, + "step": 1048 + }, + { + "epoch": 0.06511887764603638, + "grad_norm": 0.6261811063565527, + "learning_rate": 2.170494516863232e-05, + "loss": 6.524, + "step": 1049 + }, + { + "epoch": 0.06518095474579427, + "grad_norm": 0.9893321002809427, + "learning_rate": 2.1725636250775914e-05, + "loss": 6.6534, + "step": 1050 + }, + { + "epoch": 0.06524303184555218, + "grad_norm": 0.5097824620064287, + "learning_rate": 2.1746327332919513e-05, + "loss": 6.7246, + "step": 1051 + }, + { + "epoch": 0.06530510894531008, + "grad_norm": 0.5969174125342223, + "learning_rate": 2.1767018415063108e-05, + "loss": 6.5737, + "step": 1052 + }, + { + "epoch": 0.06536718604506797, + "grad_norm": 0.5958346612510911, + "learning_rate": 2.1787709497206703e-05, + "loss": 6.6085, + "step": 1053 + }, + { + "epoch": 0.06542926314482587, + "grad_norm": 0.7250383860193038, + "learning_rate": 2.18084005793503e-05, + "loss": 6.6058, + "step": 1054 + }, + { + "epoch": 0.06549134024458378, + "grad_norm": 0.8296973578088351, + "learning_rate": 2.1829091661493897e-05, + "loss": 6.573, + "step": 1055 + }, + { + "epoch": 0.06555341734434167, + "grad_norm": 0.7019400041868565, + "learning_rate": 2.184978274363749e-05, + "loss": 6.6135, + "step": 1056 + }, + { + "epoch": 0.06561549444409957, + "grad_norm": 0.6564022523376438, + "learning_rate": 2.187047382578109e-05, + "loss": 6.3865, + "step": 1057 + }, + { + "epoch": 0.06567757154385748, + "grad_norm": 0.6478284250336312, + "learning_rate": 2.1891164907924685e-05, + "loss": 6.5322, + "step": 1058 + }, + { + "epoch": 0.06573964864361537, + "grad_norm": 0.6388817538563281, + "learning_rate": 2.191185599006828e-05, + "loss": 6.6193, + "step": 1059 + }, + { + "epoch": 0.06580172574337327, + "grad_norm": 0.6357340072363377, + "learning_rate": 2.193254707221188e-05, + "loss": 6.5234, + "step": 1060 + }, + { + "epoch": 0.06586380284313118, + "grad_norm": 0.7868355400123458, + "learning_rate": 2.1953238154355474e-05, + "loss": 6.469, + "step": 1061 + }, + { + "epoch": 0.06592587994288907, + "grad_norm": 0.5545892762807451, + "learning_rate": 2.197392923649907e-05, + "loss": 6.5293, + "step": 1062 + }, + { + "epoch": 0.06598795704264697, + "grad_norm": 0.729633976282321, + "learning_rate": 2.1994620318642667e-05, + "loss": 6.5189, + "step": 1063 + }, + { + "epoch": 0.06605003414240487, + "grad_norm": 0.5392127725982313, + "learning_rate": 2.2015311400786262e-05, + "loss": 6.4587, + "step": 1064 + }, + { + "epoch": 0.06611211124216276, + "grad_norm": 0.5612337111337439, + "learning_rate": 2.2036002482929857e-05, + "loss": 6.4796, + "step": 1065 + }, + { + "epoch": 0.06617418834192067, + "grad_norm": 0.7935697410933927, + "learning_rate": 2.2056693565073456e-05, + "loss": 6.5447, + "step": 1066 + }, + { + "epoch": 0.06623626544167857, + "grad_norm": 0.5259714142624978, + "learning_rate": 2.207738464721705e-05, + "loss": 6.4481, + "step": 1067 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 0.6852480245128215, + "learning_rate": 2.2098075729360646e-05, + "loss": 6.4936, + "step": 1068 + }, + { + "epoch": 0.06636041964119437, + "grad_norm": 0.6076607433809176, + "learning_rate": 2.2118766811504245e-05, + "loss": 6.507, + "step": 1069 + }, + { + "epoch": 0.06642249674095227, + "grad_norm": 0.5894444610990394, + "learning_rate": 2.213945789364784e-05, + "loss": 6.4373, + "step": 1070 + }, + { + "epoch": 0.06648457384071016, + "grad_norm": 0.8576691082063337, + "learning_rate": 2.2160148975791435e-05, + "loss": 6.5956, + "step": 1071 + }, + { + "epoch": 0.06654665094046806, + "grad_norm": 0.6107192252014338, + "learning_rate": 2.2180840057935033e-05, + "loss": 6.6088, + "step": 1072 + }, + { + "epoch": 0.06660872804022597, + "grad_norm": 0.6746463175754616, + "learning_rate": 2.2201531140078628e-05, + "loss": 6.5078, + "step": 1073 + }, + { + "epoch": 0.06667080513998386, + "grad_norm": 0.9863640504847354, + "learning_rate": 2.2222222222222223e-05, + "loss": 6.651, + "step": 1074 + }, + { + "epoch": 0.06673288223974176, + "grad_norm": 1.1089856713782213, + "learning_rate": 2.224291330436582e-05, + "loss": 6.5584, + "step": 1075 + }, + { + "epoch": 0.06679495933949967, + "grad_norm": 0.838093339633546, + "learning_rate": 2.2263604386509417e-05, + "loss": 6.4938, + "step": 1076 + }, + { + "epoch": 0.06685703643925756, + "grad_norm": 0.9076589943714126, + "learning_rate": 2.2284295468653012e-05, + "loss": 6.517, + "step": 1077 + }, + { + "epoch": 0.06691911353901546, + "grad_norm": 0.9282510703933672, + "learning_rate": 2.2304986550796607e-05, + "loss": 6.5871, + "step": 1078 + }, + { + "epoch": 0.06698119063877335, + "grad_norm": 1.0062998575689062, + "learning_rate": 2.2325677632940202e-05, + "loss": 6.4956, + "step": 1079 + }, + { + "epoch": 0.06704326773853125, + "grad_norm": 0.6957500880999, + "learning_rate": 2.2346368715083797e-05, + "loss": 6.4554, + "step": 1080 + }, + { + "epoch": 0.06710534483828916, + "grad_norm": 0.7798341509977869, + "learning_rate": 2.2367059797227396e-05, + "loss": 6.4916, + "step": 1081 + }, + { + "epoch": 0.06716742193804705, + "grad_norm": 0.5991778731547147, + "learning_rate": 2.238775087937099e-05, + "loss": 6.5539, + "step": 1082 + }, + { + "epoch": 0.06722949903780495, + "grad_norm": 0.6640388484557612, + "learning_rate": 2.240844196151459e-05, + "loss": 6.5906, + "step": 1083 + }, + { + "epoch": 0.06729157613756286, + "grad_norm": 0.8792320156367768, + "learning_rate": 2.2429133043658184e-05, + "loss": 6.4909, + "step": 1084 + }, + { + "epoch": 0.06735365323732075, + "grad_norm": 0.7028111711135859, + "learning_rate": 2.244982412580178e-05, + "loss": 6.448, + "step": 1085 + }, + { + "epoch": 0.06741573033707865, + "grad_norm": 0.6248786808522846, + "learning_rate": 2.2470515207945378e-05, + "loss": 6.5806, + "step": 1086 + }, + { + "epoch": 0.06747780743683655, + "grad_norm": 0.8634824561483333, + "learning_rate": 2.2491206290088973e-05, + "loss": 6.4595, + "step": 1087 + }, + { + "epoch": 0.06753988453659444, + "grad_norm": 0.6854725984944015, + "learning_rate": 2.2511897372232568e-05, + "loss": 6.5508, + "step": 1088 + }, + { + "epoch": 0.06760196163635235, + "grad_norm": 0.5537767051646759, + "learning_rate": 2.2532588454376166e-05, + "loss": 6.5134, + "step": 1089 + }, + { + "epoch": 0.06766403873611025, + "grad_norm": 0.6413400660019013, + "learning_rate": 2.255327953651976e-05, + "loss": 6.4652, + "step": 1090 + }, + { + "epoch": 0.06772611583586814, + "grad_norm": 0.564905401218353, + "learning_rate": 2.2573970618663357e-05, + "loss": 6.5462, + "step": 1091 + }, + { + "epoch": 0.06778819293562605, + "grad_norm": 0.5263919010624163, + "learning_rate": 2.2594661700806955e-05, + "loss": 6.5373, + "step": 1092 + }, + { + "epoch": 0.06785027003538395, + "grad_norm": 0.557025361531554, + "learning_rate": 2.261535278295055e-05, + "loss": 6.4424, + "step": 1093 + }, + { + "epoch": 0.06791234713514184, + "grad_norm": 0.4759329561804355, + "learning_rate": 2.2636043865094145e-05, + "loss": 6.5442, + "step": 1094 + }, + { + "epoch": 0.06797442423489974, + "grad_norm": 0.5475604257813202, + "learning_rate": 2.2656734947237744e-05, + "loss": 6.5242, + "step": 1095 + }, + { + "epoch": 0.06803650133465765, + "grad_norm": 0.46795744651427584, + "learning_rate": 2.267742602938134e-05, + "loss": 6.4771, + "step": 1096 + }, + { + "epoch": 0.06809857843441554, + "grad_norm": 0.5288094954689911, + "learning_rate": 2.2698117111524934e-05, + "loss": 6.5023, + "step": 1097 + }, + { + "epoch": 0.06816065553417344, + "grad_norm": 0.6833360022593535, + "learning_rate": 2.2718808193668532e-05, + "loss": 6.4023, + "step": 1098 + }, + { + "epoch": 0.06822273263393135, + "grad_norm": 0.5999381440782433, + "learning_rate": 2.2739499275812127e-05, + "loss": 6.5055, + "step": 1099 + }, + { + "epoch": 0.06828480973368924, + "grad_norm": 0.6226220910096847, + "learning_rate": 2.2760190357955722e-05, + "loss": 6.4941, + "step": 1100 + }, + { + "epoch": 0.06834688683344714, + "grad_norm": 0.549131913462675, + "learning_rate": 2.278088144009932e-05, + "loss": 6.4617, + "step": 1101 + }, + { + "epoch": 0.06840896393320504, + "grad_norm": 0.662840717799536, + "learning_rate": 2.2801572522242916e-05, + "loss": 6.4786, + "step": 1102 + }, + { + "epoch": 0.06847104103296293, + "grad_norm": 0.5587251148613439, + "learning_rate": 2.282226360438651e-05, + "loss": 6.4661, + "step": 1103 + }, + { + "epoch": 0.06853311813272084, + "grad_norm": 1.2854982981839334, + "learning_rate": 2.2842954686530106e-05, + "loss": 6.5602, + "step": 1104 + }, + { + "epoch": 0.06859519523247874, + "grad_norm": 0.8100645169713561, + "learning_rate": 2.28636457686737e-05, + "loss": 6.4553, + "step": 1105 + }, + { + "epoch": 0.06865727233223663, + "grad_norm": 0.6829635985421698, + "learning_rate": 2.2884336850817296e-05, + "loss": 6.5975, + "step": 1106 + }, + { + "epoch": 0.06871934943199454, + "grad_norm": 0.6499647286822263, + "learning_rate": 2.2905027932960895e-05, + "loss": 6.5796, + "step": 1107 + }, + { + "epoch": 0.06878142653175244, + "grad_norm": 0.7829224804277828, + "learning_rate": 2.292571901510449e-05, + "loss": 6.3957, + "step": 1108 + }, + { + "epoch": 0.06884350363151033, + "grad_norm": 0.7739632871186036, + "learning_rate": 2.2946410097248085e-05, + "loss": 6.4766, + "step": 1109 + }, + { + "epoch": 0.06890558073126823, + "grad_norm": 0.9917582687310169, + "learning_rate": 2.2967101179391683e-05, + "loss": 6.5491, + "step": 1110 + }, + { + "epoch": 0.06896765783102614, + "grad_norm": 0.8887685534529803, + "learning_rate": 2.298779226153528e-05, + "loss": 6.4026, + "step": 1111 + }, + { + "epoch": 0.06902973493078403, + "grad_norm": 0.7567962045391914, + "learning_rate": 2.3008483343678873e-05, + "loss": 6.5064, + "step": 1112 + }, + { + "epoch": 0.06909181203054193, + "grad_norm": 0.7607203665310567, + "learning_rate": 2.3029174425822472e-05, + "loss": 6.5996, + "step": 1113 + }, + { + "epoch": 0.06915388913029984, + "grad_norm": 0.6540074633586608, + "learning_rate": 2.3049865507966067e-05, + "loss": 6.4035, + "step": 1114 + }, + { + "epoch": 0.06921596623005773, + "grad_norm": 0.46934696967167816, + "learning_rate": 2.3070556590109662e-05, + "loss": 6.4052, + "step": 1115 + }, + { + "epoch": 0.06927804332981563, + "grad_norm": 0.5589424325494368, + "learning_rate": 2.309124767225326e-05, + "loss": 6.4726, + "step": 1116 + }, + { + "epoch": 0.06934012042957353, + "grad_norm": 0.5502098853443139, + "learning_rate": 2.3111938754396856e-05, + "loss": 6.4932, + "step": 1117 + }, + { + "epoch": 0.06940219752933142, + "grad_norm": 0.6111338248751952, + "learning_rate": 2.313262983654045e-05, + "loss": 6.38, + "step": 1118 + }, + { + "epoch": 0.06946427462908933, + "grad_norm": 0.6461882590162042, + "learning_rate": 2.315332091868405e-05, + "loss": 6.4475, + "step": 1119 + }, + { + "epoch": 0.06952635172884723, + "grad_norm": 0.6620105005667519, + "learning_rate": 2.3174012000827644e-05, + "loss": 6.5588, + "step": 1120 + }, + { + "epoch": 0.06958842882860512, + "grad_norm": 0.6574612293488673, + "learning_rate": 2.319470308297124e-05, + "loss": 6.3946, + "step": 1121 + }, + { + "epoch": 0.06965050592836303, + "grad_norm": 0.8890685485205025, + "learning_rate": 2.3215394165114838e-05, + "loss": 6.5034, + "step": 1122 + }, + { + "epoch": 0.06971258302812093, + "grad_norm": 0.5891557685414245, + "learning_rate": 2.3236085247258433e-05, + "loss": 6.4351, + "step": 1123 + }, + { + "epoch": 0.06977466012787882, + "grad_norm": 0.662455398029853, + "learning_rate": 2.325677632940203e-05, + "loss": 6.4418, + "step": 1124 + }, + { + "epoch": 0.06983673722763672, + "grad_norm": 0.6423556513740444, + "learning_rate": 2.3277467411545626e-05, + "loss": 6.3354, + "step": 1125 + }, + { + "epoch": 0.06989881432739463, + "grad_norm": 0.6156866681731586, + "learning_rate": 2.329815849368922e-05, + "loss": 6.42, + "step": 1126 + }, + { + "epoch": 0.06996089142715252, + "grad_norm": 0.6879733363546015, + "learning_rate": 2.331884957583282e-05, + "loss": 6.4835, + "step": 1127 + }, + { + "epoch": 0.07002296852691042, + "grad_norm": 0.5524248800086733, + "learning_rate": 2.3339540657976415e-05, + "loss": 6.3551, + "step": 1128 + }, + { + "epoch": 0.07008504562666833, + "grad_norm": 0.6608707917369688, + "learning_rate": 2.336023174012001e-05, + "loss": 6.4062, + "step": 1129 + }, + { + "epoch": 0.07014712272642622, + "grad_norm": 0.38069567952741845, + "learning_rate": 2.3380922822263605e-05, + "loss": 6.3424, + "step": 1130 + }, + { + "epoch": 0.07020919982618412, + "grad_norm": 0.5674918196420469, + "learning_rate": 2.34016139044072e-05, + "loss": 6.4639, + "step": 1131 + }, + { + "epoch": 0.07027127692594203, + "grad_norm": 0.450280022254068, + "learning_rate": 2.34223049865508e-05, + "loss": 6.364, + "step": 1132 + }, + { + "epoch": 0.07033335402569992, + "grad_norm": 0.43941214476350116, + "learning_rate": 2.3442996068694394e-05, + "loss": 6.3009, + "step": 1133 + }, + { + "epoch": 0.07039543112545782, + "grad_norm": 0.6307783517683536, + "learning_rate": 2.346368715083799e-05, + "loss": 6.3448, + "step": 1134 + }, + { + "epoch": 0.07045750822521572, + "grad_norm": 0.4101293645857312, + "learning_rate": 2.3484378232981584e-05, + "loss": 6.3512, + "step": 1135 + }, + { + "epoch": 0.07051958532497361, + "grad_norm": 0.5920742379531753, + "learning_rate": 2.3505069315125182e-05, + "loss": 6.3232, + "step": 1136 + }, + { + "epoch": 0.07058166242473152, + "grad_norm": 0.4609943451824206, + "learning_rate": 2.3525760397268777e-05, + "loss": 6.3674, + "step": 1137 + }, + { + "epoch": 0.07064373952448942, + "grad_norm": 0.47441736112588784, + "learning_rate": 2.3546451479412373e-05, + "loss": 6.3156, + "step": 1138 + }, + { + "epoch": 0.07070581662424731, + "grad_norm": 0.6957379014899074, + "learning_rate": 2.356714256155597e-05, + "loss": 6.4551, + "step": 1139 + }, + { + "epoch": 0.07076789372400522, + "grad_norm": 0.6955453504219815, + "learning_rate": 2.3587833643699566e-05, + "loss": 6.462, + "step": 1140 + }, + { + "epoch": 0.07082997082376312, + "grad_norm": 0.5007589362018918, + "learning_rate": 2.360852472584316e-05, + "loss": 6.3024, + "step": 1141 + }, + { + "epoch": 0.07089204792352101, + "grad_norm": 0.6743352557288635, + "learning_rate": 2.362921580798676e-05, + "loss": 6.4007, + "step": 1142 + }, + { + "epoch": 0.07095412502327891, + "grad_norm": 1.7196072607992305, + "learning_rate": 2.3649906890130355e-05, + "loss": 6.3743, + "step": 1143 + }, + { + "epoch": 0.07101620212303682, + "grad_norm": 0.8722875651270798, + "learning_rate": 2.367059797227395e-05, + "loss": 6.3835, + "step": 1144 + }, + { + "epoch": 0.07107827922279471, + "grad_norm": 0.4210730807693628, + "learning_rate": 2.3691289054417548e-05, + "loss": 6.4097, + "step": 1145 + }, + { + "epoch": 0.07114035632255261, + "grad_norm": 0.6643721363511906, + "learning_rate": 2.3711980136561143e-05, + "loss": 6.4018, + "step": 1146 + }, + { + "epoch": 0.07120243342231052, + "grad_norm": 0.6463752723668191, + "learning_rate": 2.373267121870474e-05, + "loss": 6.4083, + "step": 1147 + }, + { + "epoch": 0.0712645105220684, + "grad_norm": 0.6845085344722966, + "learning_rate": 2.3753362300848337e-05, + "loss": 6.3504, + "step": 1148 + }, + { + "epoch": 0.07132658762182631, + "grad_norm": 1.0217997135344885, + "learning_rate": 2.3774053382991932e-05, + "loss": 6.4385, + "step": 1149 + }, + { + "epoch": 0.07138866472158421, + "grad_norm": 0.7210761451431977, + "learning_rate": 2.3794744465135527e-05, + "loss": 6.4306, + "step": 1150 + }, + { + "epoch": 0.0714507418213421, + "grad_norm": 1.422099755936823, + "learning_rate": 2.3815435547279125e-05, + "loss": 6.4063, + "step": 1151 + }, + { + "epoch": 0.07151281892110001, + "grad_norm": 0.6889376563987634, + "learning_rate": 2.383612662942272e-05, + "loss": 6.5129, + "step": 1152 + }, + { + "epoch": 0.07157489602085791, + "grad_norm": 0.8088799306512606, + "learning_rate": 2.3856817711566316e-05, + "loss": 6.3992, + "step": 1153 + }, + { + "epoch": 0.0716369731206158, + "grad_norm": 0.6779209506769539, + "learning_rate": 2.3877508793709914e-05, + "loss": 6.4337, + "step": 1154 + }, + { + "epoch": 0.0716990502203737, + "grad_norm": 1.1995438622759085, + "learning_rate": 2.389819987585351e-05, + "loss": 6.3721, + "step": 1155 + }, + { + "epoch": 0.07176112732013161, + "grad_norm": 0.8153121041135751, + "learning_rate": 2.3918890957997104e-05, + "loss": 6.4909, + "step": 1156 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 0.779684672990769, + "learning_rate": 2.3939582040140703e-05, + "loss": 6.394, + "step": 1157 + }, + { + "epoch": 0.0718852815196474, + "grad_norm": 0.8910538494200986, + "learning_rate": 2.3960273122284298e-05, + "loss": 6.2674, + "step": 1158 + }, + { + "epoch": 0.07194735861940531, + "grad_norm": 1.0845700992671263, + "learning_rate": 2.3980964204427893e-05, + "loss": 6.3002, + "step": 1159 + }, + { + "epoch": 0.0720094357191632, + "grad_norm": 0.7448864500827261, + "learning_rate": 2.4001655286571488e-05, + "loss": 6.2479, + "step": 1160 + }, + { + "epoch": 0.0720715128189211, + "grad_norm": 1.4216881094713365, + "learning_rate": 2.4022346368715083e-05, + "loss": 6.3652, + "step": 1161 + }, + { + "epoch": 0.072133589918679, + "grad_norm": 1.1436264575878499, + "learning_rate": 2.4043037450858678e-05, + "loss": 6.4728, + "step": 1162 + }, + { + "epoch": 0.0721956670184369, + "grad_norm": 0.7945788343915979, + "learning_rate": 2.4063728533002277e-05, + "loss": 6.4188, + "step": 1163 + }, + { + "epoch": 0.0722577441181948, + "grad_norm": 0.8811068303570927, + "learning_rate": 2.408441961514587e-05, + "loss": 6.4344, + "step": 1164 + }, + { + "epoch": 0.0723198212179527, + "grad_norm": 0.8385451101544364, + "learning_rate": 2.4105110697289467e-05, + "loss": 6.3602, + "step": 1165 + }, + { + "epoch": 0.0723818983177106, + "grad_norm": 0.674916458258279, + "learning_rate": 2.4125801779433065e-05, + "loss": 6.3462, + "step": 1166 + }, + { + "epoch": 0.0724439754174685, + "grad_norm": 1.0331549534087279, + "learning_rate": 2.414649286157666e-05, + "loss": 6.3499, + "step": 1167 + }, + { + "epoch": 0.0725060525172264, + "grad_norm": 0.5478702752764509, + "learning_rate": 2.416718394372026e-05, + "loss": 6.4118, + "step": 1168 + }, + { + "epoch": 0.07256812961698429, + "grad_norm": 0.6641627292165346, + "learning_rate": 2.4187875025863854e-05, + "loss": 6.3711, + "step": 1169 + }, + { + "epoch": 0.0726302067167422, + "grad_norm": 0.8979406890546993, + "learning_rate": 2.420856610800745e-05, + "loss": 6.4269, + "step": 1170 + }, + { + "epoch": 0.07269228381650009, + "grad_norm": 0.5893457343965535, + "learning_rate": 2.4229257190151047e-05, + "loss": 6.3489, + "step": 1171 + }, + { + "epoch": 0.07275436091625799, + "grad_norm": 0.7174330131150521, + "learning_rate": 2.4249948272294642e-05, + "loss": 6.3232, + "step": 1172 + }, + { + "epoch": 0.0728164380160159, + "grad_norm": 0.6426182186523535, + "learning_rate": 2.4270639354438237e-05, + "loss": 6.3229, + "step": 1173 + }, + { + "epoch": 0.07287851511577378, + "grad_norm": 0.6993256385981135, + "learning_rate": 2.4291330436581836e-05, + "loss": 6.3693, + "step": 1174 + }, + { + "epoch": 0.07294059221553169, + "grad_norm": 0.723293703754477, + "learning_rate": 2.431202151872543e-05, + "loss": 6.3524, + "step": 1175 + }, + { + "epoch": 0.07300266931528959, + "grad_norm": 0.7232070473404602, + "learning_rate": 2.4332712600869026e-05, + "loss": 6.3926, + "step": 1176 + }, + { + "epoch": 0.07306474641504748, + "grad_norm": 0.5632949351263333, + "learning_rate": 2.4353403683012625e-05, + "loss": 6.3991, + "step": 1177 + }, + { + "epoch": 0.07312682351480539, + "grad_norm": 0.7500176030318458, + "learning_rate": 2.437409476515622e-05, + "loss": 6.3641, + "step": 1178 + }, + { + "epoch": 0.07318890061456329, + "grad_norm": 0.7629299608784502, + "learning_rate": 2.4394785847299815e-05, + "loss": 6.3504, + "step": 1179 + }, + { + "epoch": 0.07325097771432118, + "grad_norm": 0.5926724493989366, + "learning_rate": 2.4415476929443413e-05, + "loss": 6.3818, + "step": 1180 + }, + { + "epoch": 0.07331305481407908, + "grad_norm": 0.7032665957822785, + "learning_rate": 2.4436168011587008e-05, + "loss": 6.2565, + "step": 1181 + }, + { + "epoch": 0.07337513191383699, + "grad_norm": 1.0894084320803734, + "learning_rate": 2.4456859093730603e-05, + "loss": 6.3479, + "step": 1182 + }, + { + "epoch": 0.07343720901359488, + "grad_norm": 0.8682712278530785, + "learning_rate": 2.4477550175874202e-05, + "loss": 6.4231, + "step": 1183 + }, + { + "epoch": 0.07349928611335278, + "grad_norm": 0.6790253055250338, + "learning_rate": 2.4498241258017797e-05, + "loss": 6.3523, + "step": 1184 + }, + { + "epoch": 0.07356136321311069, + "grad_norm": 0.5681663788518441, + "learning_rate": 2.4518932340161392e-05, + "loss": 6.3191, + "step": 1185 + }, + { + "epoch": 0.07362344031286858, + "grad_norm": 0.6785493287811546, + "learning_rate": 2.4539623422304987e-05, + "loss": 6.314, + "step": 1186 + }, + { + "epoch": 0.07368551741262648, + "grad_norm": 0.5152044893494921, + "learning_rate": 2.4560314504448585e-05, + "loss": 6.2202, + "step": 1187 + }, + { + "epoch": 0.07374759451238438, + "grad_norm": 0.7801666377648268, + "learning_rate": 2.458100558659218e-05, + "loss": 6.2751, + "step": 1188 + }, + { + "epoch": 0.07380967161214227, + "grad_norm": 0.8560246442518272, + "learning_rate": 2.4601696668735776e-05, + "loss": 6.291, + "step": 1189 + }, + { + "epoch": 0.07387174871190018, + "grad_norm": 0.9761793071437951, + "learning_rate": 2.462238775087937e-05, + "loss": 6.3382, + "step": 1190 + }, + { + "epoch": 0.07393382581165808, + "grad_norm": 0.8117152238756387, + "learning_rate": 2.4643078833022966e-05, + "loss": 6.3417, + "step": 1191 + }, + { + "epoch": 0.07399590291141597, + "grad_norm": 0.5231326909565222, + "learning_rate": 2.4663769915166564e-05, + "loss": 6.2735, + "step": 1192 + }, + { + "epoch": 0.07405798001117388, + "grad_norm": 1.1296403867149156, + "learning_rate": 2.468446099731016e-05, + "loss": 6.214, + "step": 1193 + }, + { + "epoch": 0.07412005711093178, + "grad_norm": 0.5017164689222745, + "learning_rate": 2.4705152079453754e-05, + "loss": 6.2762, + "step": 1194 + }, + { + "epoch": 0.07418213421068967, + "grad_norm": 0.9072131596325629, + "learning_rate": 2.4725843161597353e-05, + "loss": 6.307, + "step": 1195 + }, + { + "epoch": 0.07424421131044757, + "grad_norm": 0.5972848124444003, + "learning_rate": 2.4746534243740948e-05, + "loss": 6.4426, + "step": 1196 + }, + { + "epoch": 0.07430628841020548, + "grad_norm": 0.8211937666500816, + "learning_rate": 2.4767225325884543e-05, + "loss": 6.3432, + "step": 1197 + }, + { + "epoch": 0.07436836550996337, + "grad_norm": 1.2749842979361847, + "learning_rate": 2.478791640802814e-05, + "loss": 6.3462, + "step": 1198 + }, + { + "epoch": 0.07443044260972127, + "grad_norm": 0.5506266924932753, + "learning_rate": 2.4808607490171737e-05, + "loss": 6.4561, + "step": 1199 + }, + { + "epoch": 0.07449251970947918, + "grad_norm": 0.931243843979938, + "learning_rate": 2.482929857231533e-05, + "loss": 6.3318, + "step": 1200 + }, + { + "epoch": 0.07455459680923707, + "grad_norm": 0.8652349261073301, + "learning_rate": 2.484998965445893e-05, + "loss": 6.2481, + "step": 1201 + }, + { + "epoch": 0.07461667390899497, + "grad_norm": 1.2888981107449482, + "learning_rate": 2.4870680736602525e-05, + "loss": 6.2854, + "step": 1202 + }, + { + "epoch": 0.07467875100875287, + "grad_norm": 0.8424306373455344, + "learning_rate": 2.489137181874612e-05, + "loss": 6.2638, + "step": 1203 + }, + { + "epoch": 0.07474082810851077, + "grad_norm": 0.697068194479382, + "learning_rate": 2.491206290088972e-05, + "loss": 6.2651, + "step": 1204 + }, + { + "epoch": 0.07480290520826867, + "grad_norm": 0.7578459663473534, + "learning_rate": 2.4932753983033314e-05, + "loss": 6.3128, + "step": 1205 + }, + { + "epoch": 0.07486498230802657, + "grad_norm": 1.067454343529654, + "learning_rate": 2.495344506517691e-05, + "loss": 6.3148, + "step": 1206 + }, + { + "epoch": 0.07492705940778446, + "grad_norm": 1.0850055455360559, + "learning_rate": 2.4974136147320507e-05, + "loss": 6.324, + "step": 1207 + }, + { + "epoch": 0.07498913650754237, + "grad_norm": 1.00023028583163, + "learning_rate": 2.4994827229464102e-05, + "loss": 6.3463, + "step": 1208 + }, + { + "epoch": 0.07505121360730027, + "grad_norm": 0.7125796541227486, + "learning_rate": 2.50155183116077e-05, + "loss": 6.3216, + "step": 1209 + }, + { + "epoch": 0.07511329070705816, + "grad_norm": 1.5931224709539742, + "learning_rate": 2.5036209393751293e-05, + "loss": 6.2816, + "step": 1210 + }, + { + "epoch": 0.07517536780681607, + "grad_norm": 0.7163439990903856, + "learning_rate": 2.505690047589489e-05, + "loss": 6.2575, + "step": 1211 + }, + { + "epoch": 0.07523744490657397, + "grad_norm": 1.458914350389306, + "learning_rate": 2.507759155803849e-05, + "loss": 6.327, + "step": 1212 + }, + { + "epoch": 0.07529952200633186, + "grad_norm": 0.8230833938962687, + "learning_rate": 2.509828264018208e-05, + "loss": 6.2694, + "step": 1213 + }, + { + "epoch": 0.07536159910608976, + "grad_norm": 1.1365304902430358, + "learning_rate": 2.511897372232568e-05, + "loss": 6.3388, + "step": 1214 + }, + { + "epoch": 0.07542367620584767, + "grad_norm": 0.9855855733838962, + "learning_rate": 2.5139664804469275e-05, + "loss": 6.3685, + "step": 1215 + }, + { + "epoch": 0.07548575330560556, + "grad_norm": 0.9372142356221222, + "learning_rate": 2.516035588661287e-05, + "loss": 6.3412, + "step": 1216 + }, + { + "epoch": 0.07554783040536346, + "grad_norm": 0.8213038675649819, + "learning_rate": 2.5181046968756465e-05, + "loss": 6.337, + "step": 1217 + }, + { + "epoch": 0.07560990750512137, + "grad_norm": 1.274899678864866, + "learning_rate": 2.5201738050900063e-05, + "loss": 6.2058, + "step": 1218 + }, + { + "epoch": 0.07567198460487926, + "grad_norm": 0.9817778825717619, + "learning_rate": 2.522242913304366e-05, + "loss": 6.2323, + "step": 1219 + }, + { + "epoch": 0.07573406170463716, + "grad_norm": 0.7558921392865512, + "learning_rate": 2.5243120215187254e-05, + "loss": 6.2266, + "step": 1220 + }, + { + "epoch": 0.07579613880439506, + "grad_norm": 1.0862186534471863, + "learning_rate": 2.5263811297330852e-05, + "loss": 6.2533, + "step": 1221 + }, + { + "epoch": 0.07585821590415295, + "grad_norm": 0.7801892900031544, + "learning_rate": 2.5284502379474444e-05, + "loss": 6.3418, + "step": 1222 + }, + { + "epoch": 0.07592029300391086, + "grad_norm": 0.96811433671727, + "learning_rate": 2.5305193461618042e-05, + "loss": 6.2982, + "step": 1223 + }, + { + "epoch": 0.07598237010366876, + "grad_norm": 0.9004141914113586, + "learning_rate": 2.532588454376164e-05, + "loss": 6.2948, + "step": 1224 + }, + { + "epoch": 0.07604444720342665, + "grad_norm": 0.6163565437959165, + "learning_rate": 2.5346575625905232e-05, + "loss": 6.3521, + "step": 1225 + }, + { + "epoch": 0.07610652430318456, + "grad_norm": 0.7697286594674954, + "learning_rate": 2.536726670804883e-05, + "loss": 6.3013, + "step": 1226 + }, + { + "epoch": 0.07616860140294246, + "grad_norm": 0.6376703558298461, + "learning_rate": 2.538795779019243e-05, + "loss": 6.3168, + "step": 1227 + }, + { + "epoch": 0.07623067850270035, + "grad_norm": 0.9884741975506899, + "learning_rate": 2.540864887233602e-05, + "loss": 6.2249, + "step": 1228 + }, + { + "epoch": 0.07629275560245825, + "grad_norm": 0.8978074559814592, + "learning_rate": 2.542933995447962e-05, + "loss": 6.2575, + "step": 1229 + }, + { + "epoch": 0.07635483270221616, + "grad_norm": 0.8099064146538392, + "learning_rate": 2.5450031036623218e-05, + "loss": 6.2951, + "step": 1230 + }, + { + "epoch": 0.07641690980197405, + "grad_norm": 0.7884168721818254, + "learning_rate": 2.5470722118766816e-05, + "loss": 6.2244, + "step": 1231 + }, + { + "epoch": 0.07647898690173195, + "grad_norm": 0.7829364884002209, + "learning_rate": 2.5491413200910408e-05, + "loss": 6.2082, + "step": 1232 + }, + { + "epoch": 0.07654106400148986, + "grad_norm": 0.8424104370710495, + "learning_rate": 2.5512104283054006e-05, + "loss": 6.1459, + "step": 1233 + }, + { + "epoch": 0.07660314110124775, + "grad_norm": 0.7725266767420244, + "learning_rate": 2.5532795365197605e-05, + "loss": 6.322, + "step": 1234 + }, + { + "epoch": 0.07666521820100565, + "grad_norm": 0.7467001403034998, + "learning_rate": 2.5553486447341197e-05, + "loss": 6.2284, + "step": 1235 + }, + { + "epoch": 0.07672729530076355, + "grad_norm": 0.6014743591735183, + "learning_rate": 2.5574177529484795e-05, + "loss": 6.2629, + "step": 1236 + }, + { + "epoch": 0.07678937240052144, + "grad_norm": 0.9890440157207155, + "learning_rate": 2.5594868611628394e-05, + "loss": 6.2317, + "step": 1237 + }, + { + "epoch": 0.07685144950027935, + "grad_norm": 0.7825606224537793, + "learning_rate": 2.5615559693771985e-05, + "loss": 6.2611, + "step": 1238 + }, + { + "epoch": 0.07691352660003725, + "grad_norm": 1.5587575725992473, + "learning_rate": 2.5636250775915584e-05, + "loss": 6.3104, + "step": 1239 + }, + { + "epoch": 0.07697560369979514, + "grad_norm": 1.1307469445507154, + "learning_rate": 2.565694185805918e-05, + "loss": 6.1807, + "step": 1240 + }, + { + "epoch": 0.07703768079955305, + "grad_norm": 0.7917721744969813, + "learning_rate": 2.5677632940202774e-05, + "loss": 6.1713, + "step": 1241 + }, + { + "epoch": 0.07709975789931095, + "grad_norm": 1.446452006061042, + "learning_rate": 2.569832402234637e-05, + "loss": 6.2452, + "step": 1242 + }, + { + "epoch": 0.07716183499906884, + "grad_norm": 1.0126543468992495, + "learning_rate": 2.5719015104489967e-05, + "loss": 6.2901, + "step": 1243 + }, + { + "epoch": 0.07722391209882674, + "grad_norm": 0.8614462427646189, + "learning_rate": 2.5739706186633562e-05, + "loss": 6.2379, + "step": 1244 + }, + { + "epoch": 0.07728598919858465, + "grad_norm": 0.7373928800472463, + "learning_rate": 2.5760397268777158e-05, + "loss": 6.2357, + "step": 1245 + }, + { + "epoch": 0.07734806629834254, + "grad_norm": 0.8192702438698354, + "learning_rate": 2.5781088350920756e-05, + "loss": 6.2295, + "step": 1246 + }, + { + "epoch": 0.07741014339810044, + "grad_norm": 0.682630503337689, + "learning_rate": 2.5801779433064348e-05, + "loss": 6.3215, + "step": 1247 + }, + { + "epoch": 0.07747222049785835, + "grad_norm": 0.800483313239433, + "learning_rate": 2.5822470515207946e-05, + "loss": 6.2726, + "step": 1248 + }, + { + "epoch": 0.07753429759761624, + "grad_norm": 0.6704538529749557, + "learning_rate": 2.5843161597351545e-05, + "loss": 6.1629, + "step": 1249 + }, + { + "epoch": 0.07759637469737414, + "grad_norm": 0.7036427816630237, + "learning_rate": 2.5863852679495136e-05, + "loss": 6.2947, + "step": 1250 + }, + { + "epoch": 0.07765845179713204, + "grad_norm": 0.6947355126637419, + "learning_rate": 2.5884543761638735e-05, + "loss": 6.3135, + "step": 1251 + }, + { + "epoch": 0.07772052889688993, + "grad_norm": 0.5866767797176425, + "learning_rate": 2.5905234843782333e-05, + "loss": 6.2706, + "step": 1252 + }, + { + "epoch": 0.07778260599664784, + "grad_norm": 0.7050811894315873, + "learning_rate": 2.5925925925925925e-05, + "loss": 6.2456, + "step": 1253 + }, + { + "epoch": 0.07784468309640574, + "grad_norm": 0.7807061018706184, + "learning_rate": 2.5946617008069523e-05, + "loss": 6.2204, + "step": 1254 + }, + { + "epoch": 0.07790676019616363, + "grad_norm": 0.6177831986096662, + "learning_rate": 2.5967308090213122e-05, + "loss": 6.2563, + "step": 1255 + }, + { + "epoch": 0.07796883729592154, + "grad_norm": 0.6362829296531483, + "learning_rate": 2.5987999172356714e-05, + "loss": 6.2545, + "step": 1256 + }, + { + "epoch": 0.07803091439567944, + "grad_norm": 0.5219358854126102, + "learning_rate": 2.6008690254500312e-05, + "loss": 6.1611, + "step": 1257 + }, + { + "epoch": 0.07809299149543733, + "grad_norm": 0.6124639471303598, + "learning_rate": 2.602938133664391e-05, + "loss": 6.2849, + "step": 1258 + }, + { + "epoch": 0.07815506859519523, + "grad_norm": 0.5808993091721645, + "learning_rate": 2.6050072418787502e-05, + "loss": 6.2644, + "step": 1259 + }, + { + "epoch": 0.07821714569495314, + "grad_norm": 0.5843806373609871, + "learning_rate": 2.60707635009311e-05, + "loss": 6.2569, + "step": 1260 + }, + { + "epoch": 0.07827922279471103, + "grad_norm": 0.6538809977660907, + "learning_rate": 2.60914545830747e-05, + "loss": 6.1374, + "step": 1261 + }, + { + "epoch": 0.07834129989446893, + "grad_norm": 0.6054402006247956, + "learning_rate": 2.611214566521829e-05, + "loss": 6.1682, + "step": 1262 + }, + { + "epoch": 0.07840337699422684, + "grad_norm": 0.633660325434342, + "learning_rate": 2.613283674736189e-05, + "loss": 6.2136, + "step": 1263 + }, + { + "epoch": 0.07846545409398473, + "grad_norm": 0.5139207418410201, + "learning_rate": 2.6153527829505488e-05, + "loss": 6.1575, + "step": 1264 + }, + { + "epoch": 0.07852753119374263, + "grad_norm": 0.6446684951310854, + "learning_rate": 2.617421891164908e-05, + "loss": 6.1644, + "step": 1265 + }, + { + "epoch": 0.07858960829350052, + "grad_norm": 0.5812219202785931, + "learning_rate": 2.6194909993792678e-05, + "loss": 6.0921, + "step": 1266 + }, + { + "epoch": 0.07865168539325842, + "grad_norm": 0.529013385374405, + "learning_rate": 2.6215601075936276e-05, + "loss": 6.206, + "step": 1267 + }, + { + "epoch": 0.07871376249301633, + "grad_norm": 0.75685562600399, + "learning_rate": 2.6236292158079868e-05, + "loss": 6.3085, + "step": 1268 + }, + { + "epoch": 0.07877583959277422, + "grad_norm": 0.5886683848074439, + "learning_rate": 2.6256983240223466e-05, + "loss": 6.3428, + "step": 1269 + }, + { + "epoch": 0.07883791669253212, + "grad_norm": 0.6755749602582588, + "learning_rate": 2.627767432236706e-05, + "loss": 6.2599, + "step": 1270 + }, + { + "epoch": 0.07889999379229003, + "grad_norm": 0.5736171139681895, + "learning_rate": 2.6298365404510657e-05, + "loss": 6.1994, + "step": 1271 + }, + { + "epoch": 0.07896207089204792, + "grad_norm": 0.6567183821262125, + "learning_rate": 2.631905648665425e-05, + "loss": 6.1753, + "step": 1272 + }, + { + "epoch": 0.07902414799180582, + "grad_norm": 0.5655089538566916, + "learning_rate": 2.633974756879785e-05, + "loss": 6.1828, + "step": 1273 + }, + { + "epoch": 0.07908622509156372, + "grad_norm": 0.5982064600434687, + "learning_rate": 2.6360438650941445e-05, + "loss": 6.1476, + "step": 1274 + }, + { + "epoch": 0.07914830219132162, + "grad_norm": 0.7440895603711303, + "learning_rate": 2.638112973308504e-05, + "loss": 6.1886, + "step": 1275 + }, + { + "epoch": 0.07921037929107952, + "grad_norm": 0.5578582029025562, + "learning_rate": 2.640182081522864e-05, + "loss": 6.1724, + "step": 1276 + }, + { + "epoch": 0.07927245639083742, + "grad_norm": 0.5766233841091746, + "learning_rate": 2.642251189737223e-05, + "loss": 6.2276, + "step": 1277 + }, + { + "epoch": 0.07933453349059531, + "grad_norm": 0.4569723955390005, + "learning_rate": 2.644320297951583e-05, + "loss": 6.2718, + "step": 1278 + }, + { + "epoch": 0.07939661059035322, + "grad_norm": 0.7560864484497106, + "learning_rate": 2.6463894061659427e-05, + "loss": 6.2112, + "step": 1279 + }, + { + "epoch": 0.07945868769011112, + "grad_norm": 0.5895249724287698, + "learning_rate": 2.648458514380302e-05, + "loss": 6.1068, + "step": 1280 + }, + { + "epoch": 0.07952076478986901, + "grad_norm": 0.777930148749726, + "learning_rate": 2.6505276225946618e-05, + "loss": 6.1774, + "step": 1281 + }, + { + "epoch": 0.07958284188962692, + "grad_norm": 0.590807146918782, + "learning_rate": 2.6525967308090216e-05, + "loss": 6.1773, + "step": 1282 + }, + { + "epoch": 0.07964491898938482, + "grad_norm": 0.6437763764246147, + "learning_rate": 2.6546658390233808e-05, + "loss": 6.1794, + "step": 1283 + }, + { + "epoch": 0.07970699608914271, + "grad_norm": 0.4392239121195129, + "learning_rate": 2.6567349472377406e-05, + "loss": 6.2235, + "step": 1284 + }, + { + "epoch": 0.07976907318890061, + "grad_norm": 0.5656407412029946, + "learning_rate": 2.6588040554521005e-05, + "loss": 6.2491, + "step": 1285 + }, + { + "epoch": 0.07983115028865852, + "grad_norm": 0.6294540246940736, + "learning_rate": 2.6608731636664596e-05, + "loss": 6.2495, + "step": 1286 + }, + { + "epoch": 0.07989322738841641, + "grad_norm": 0.5510830965193989, + "learning_rate": 2.6629422718808195e-05, + "loss": 6.2571, + "step": 1287 + }, + { + "epoch": 0.07995530448817431, + "grad_norm": 0.4141357465052614, + "learning_rate": 2.6650113800951793e-05, + "loss": 6.157, + "step": 1288 + }, + { + "epoch": 0.08001738158793222, + "grad_norm": 0.5724400329417185, + "learning_rate": 2.6670804883095385e-05, + "loss": 6.1562, + "step": 1289 + }, + { + "epoch": 0.0800794586876901, + "grad_norm": 0.7844635294161671, + "learning_rate": 2.6691495965238983e-05, + "loss": 6.1114, + "step": 1290 + }, + { + "epoch": 0.08014153578744801, + "grad_norm": 0.5156551121097589, + "learning_rate": 2.6712187047382582e-05, + "loss": 6.2228, + "step": 1291 + }, + { + "epoch": 0.08020361288720591, + "grad_norm": 0.6052529965814579, + "learning_rate": 2.6732878129526174e-05, + "loss": 6.1738, + "step": 1292 + }, + { + "epoch": 0.0802656899869638, + "grad_norm": 0.6163648603537916, + "learning_rate": 2.6753569211669772e-05, + "loss": 6.2146, + "step": 1293 + }, + { + "epoch": 0.08032776708672171, + "grad_norm": 0.4991505231486386, + "learning_rate": 2.677426029381337e-05, + "loss": 6.1065, + "step": 1294 + }, + { + "epoch": 0.08038984418647961, + "grad_norm": 1.1223176661958438, + "learning_rate": 2.6794951375956962e-05, + "loss": 6.1479, + "step": 1295 + }, + { + "epoch": 0.0804519212862375, + "grad_norm": 0.7366817040296731, + "learning_rate": 2.681564245810056e-05, + "loss": 6.1459, + "step": 1296 + }, + { + "epoch": 0.0805139983859954, + "grad_norm": 0.7688609268049089, + "learning_rate": 2.6836333540244156e-05, + "loss": 6.1577, + "step": 1297 + }, + { + "epoch": 0.08057607548575331, + "grad_norm": 0.6826698381150359, + "learning_rate": 2.685702462238775e-05, + "loss": 6.2213, + "step": 1298 + }, + { + "epoch": 0.0806381525855112, + "grad_norm": 0.7589871063418873, + "learning_rate": 2.687771570453135e-05, + "loss": 6.1825, + "step": 1299 + }, + { + "epoch": 0.0807002296852691, + "grad_norm": 1.6376822828799282, + "learning_rate": 2.6898406786674944e-05, + "loss": 6.2974, + "step": 1300 + }, + { + "epoch": 0.08076230678502701, + "grad_norm": 0.7602688806880682, + "learning_rate": 2.691909786881854e-05, + "loss": 6.1779, + "step": 1301 + }, + { + "epoch": 0.0808243838847849, + "grad_norm": 2.1801483870234426, + "learning_rate": 2.6939788950962134e-05, + "loss": 6.3041, + "step": 1302 + }, + { + "epoch": 0.0808864609845428, + "grad_norm": 0.9665245828674558, + "learning_rate": 2.6960480033105733e-05, + "loss": 6.1704, + "step": 1303 + }, + { + "epoch": 0.0809485380843007, + "grad_norm": 0.6206210995078089, + "learning_rate": 2.6981171115249325e-05, + "loss": 6.1031, + "step": 1304 + }, + { + "epoch": 0.0810106151840586, + "grad_norm": 0.7933800293165149, + "learning_rate": 2.7001862197392923e-05, + "loss": 6.1016, + "step": 1305 + }, + { + "epoch": 0.0810726922838165, + "grad_norm": 0.6798205409891797, + "learning_rate": 2.702255327953652e-05, + "loss": 6.2236, + "step": 1306 + }, + { + "epoch": 0.0811347693835744, + "grad_norm": 0.6191433446204687, + "learning_rate": 2.7043244361680113e-05, + "loss": 6.2103, + "step": 1307 + }, + { + "epoch": 0.0811968464833323, + "grad_norm": 0.7617450416135648, + "learning_rate": 2.7063935443823712e-05, + "loss": 6.1175, + "step": 1308 + }, + { + "epoch": 0.0812589235830902, + "grad_norm": 0.55337489996996, + "learning_rate": 2.708462652596731e-05, + "loss": 6.2521, + "step": 1309 + }, + { + "epoch": 0.0813210006828481, + "grad_norm": 0.7844509164007678, + "learning_rate": 2.7105317608110902e-05, + "loss": 6.1407, + "step": 1310 + }, + { + "epoch": 0.08138307778260599, + "grad_norm": 0.5663122691090964, + "learning_rate": 2.71260086902545e-05, + "loss": 6.182, + "step": 1311 + }, + { + "epoch": 0.0814451548823639, + "grad_norm": 1.0826395576501628, + "learning_rate": 2.71466997723981e-05, + "loss": 6.2471, + "step": 1312 + }, + { + "epoch": 0.0815072319821218, + "grad_norm": 0.971832263411673, + "learning_rate": 2.716739085454169e-05, + "loss": 6.1049, + "step": 1313 + }, + { + "epoch": 0.08156930908187969, + "grad_norm": 0.8176467815647712, + "learning_rate": 2.718808193668529e-05, + "loss": 6.1549, + "step": 1314 + }, + { + "epoch": 0.0816313861816376, + "grad_norm": 0.7265029390594618, + "learning_rate": 2.7208773018828887e-05, + "loss": 6.2077, + "step": 1315 + }, + { + "epoch": 0.0816934632813955, + "grad_norm": 0.8253099711739429, + "learning_rate": 2.7229464100972486e-05, + "loss": 6.1364, + "step": 1316 + }, + { + "epoch": 0.08175554038115339, + "grad_norm": 0.6623178564555032, + "learning_rate": 2.7250155183116078e-05, + "loss": 6.1724, + "step": 1317 + }, + { + "epoch": 0.08181761748091129, + "grad_norm": 0.6380330661976337, + "learning_rate": 2.7270846265259676e-05, + "loss": 6.0983, + "step": 1318 + }, + { + "epoch": 0.0818796945806692, + "grad_norm": 0.7715159349384315, + "learning_rate": 2.7291537347403274e-05, + "loss": 6.1467, + "step": 1319 + }, + { + "epoch": 0.08194177168042709, + "grad_norm": 0.7553652545528187, + "learning_rate": 2.7312228429546866e-05, + "loss": 6.1293, + "step": 1320 + }, + { + "epoch": 0.08200384878018499, + "grad_norm": 0.7490356126017091, + "learning_rate": 2.7332919511690465e-05, + "loss": 6.1949, + "step": 1321 + }, + { + "epoch": 0.0820659258799429, + "grad_norm": 0.6559582924784566, + "learning_rate": 2.7353610593834063e-05, + "loss": 6.2863, + "step": 1322 + }, + { + "epoch": 0.08212800297970078, + "grad_norm": 0.49936543370753267, + "learning_rate": 2.7374301675977655e-05, + "loss": 6.1519, + "step": 1323 + }, + { + "epoch": 0.08219008007945869, + "grad_norm": 0.6600509649886487, + "learning_rate": 2.7394992758121253e-05, + "loss": 6.1048, + "step": 1324 + }, + { + "epoch": 0.08225215717921659, + "grad_norm": 0.5921500131221332, + "learning_rate": 2.741568384026485e-05, + "loss": 6.1427, + "step": 1325 + }, + { + "epoch": 0.08231423427897448, + "grad_norm": 0.51036875039263, + "learning_rate": 2.7436374922408443e-05, + "loss": 6.2164, + "step": 1326 + }, + { + "epoch": 0.08237631137873239, + "grad_norm": 0.595044055998655, + "learning_rate": 2.745706600455204e-05, + "loss": 6.0842, + "step": 1327 + }, + { + "epoch": 0.08243838847849029, + "grad_norm": 0.5135441030118253, + "learning_rate": 2.7477757086695637e-05, + "loss": 6.0051, + "step": 1328 + }, + { + "epoch": 0.08250046557824818, + "grad_norm": 0.580900899586742, + "learning_rate": 2.7498448168839232e-05, + "loss": 6.2127, + "step": 1329 + }, + { + "epoch": 0.08256254267800608, + "grad_norm": 0.4870457391232762, + "learning_rate": 2.7519139250982827e-05, + "loss": 6.1755, + "step": 1330 + }, + { + "epoch": 0.08262461977776399, + "grad_norm": 0.5964279018166653, + "learning_rate": 2.7539830333126426e-05, + "loss": 6.1376, + "step": 1331 + }, + { + "epoch": 0.08268669687752188, + "grad_norm": 0.4894594093156785, + "learning_rate": 2.7560521415270017e-05, + "loss": 6.1519, + "step": 1332 + }, + { + "epoch": 0.08274877397727978, + "grad_norm": 0.6350850636660097, + "learning_rate": 2.7581212497413616e-05, + "loss": 6.0755, + "step": 1333 + }, + { + "epoch": 0.08281085107703769, + "grad_norm": 0.4295845428576803, + "learning_rate": 2.7601903579557214e-05, + "loss": 6.1363, + "step": 1334 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 0.5411435192990017, + "learning_rate": 2.7622594661700806e-05, + "loss": 6.0241, + "step": 1335 + }, + { + "epoch": 0.08293500527655348, + "grad_norm": 0.6162544150513504, + "learning_rate": 2.7643285743844404e-05, + "loss": 6.1155, + "step": 1336 + }, + { + "epoch": 0.08299708237631138, + "grad_norm": 0.4690464671709561, + "learning_rate": 2.7663976825988003e-05, + "loss": 6.047, + "step": 1337 + }, + { + "epoch": 0.08305915947606927, + "grad_norm": 0.5455442586456671, + "learning_rate": 2.7684667908131594e-05, + "loss": 6.0515, + "step": 1338 + }, + { + "epoch": 0.08312123657582718, + "grad_norm": 0.5759113822061268, + "learning_rate": 2.7705358990275193e-05, + "loss": 6.1546, + "step": 1339 + }, + { + "epoch": 0.08318331367558508, + "grad_norm": 0.5065999740492148, + "learning_rate": 2.772605007241879e-05, + "loss": 6.0943, + "step": 1340 + }, + { + "epoch": 0.08324539077534297, + "grad_norm": 0.5501794706480437, + "learning_rate": 2.7746741154562383e-05, + "loss": 6.0826, + "step": 1341 + }, + { + "epoch": 0.08330746787510088, + "grad_norm": 0.43265888021819277, + "learning_rate": 2.776743223670598e-05, + "loss": 6.1155, + "step": 1342 + }, + { + "epoch": 0.08336954497485878, + "grad_norm": 0.521404588860737, + "learning_rate": 2.778812331884958e-05, + "loss": 6.1826, + "step": 1343 + }, + { + "epoch": 0.08343162207461667, + "grad_norm": 0.7240034999525714, + "learning_rate": 2.7808814400993172e-05, + "loss": 6.1008, + "step": 1344 + }, + { + "epoch": 0.08349369917437457, + "grad_norm": 0.4934572009598353, + "learning_rate": 2.782950548313677e-05, + "loss": 6.0743, + "step": 1345 + }, + { + "epoch": 0.08355577627413248, + "grad_norm": 0.6765874122455643, + "learning_rate": 2.785019656528037e-05, + "loss": 6.1259, + "step": 1346 + }, + { + "epoch": 0.08361785337389037, + "grad_norm": 0.5631532319895461, + "learning_rate": 2.787088764742396e-05, + "loss": 6.0886, + "step": 1347 + }, + { + "epoch": 0.08367993047364827, + "grad_norm": 0.575742037510608, + "learning_rate": 2.789157872956756e-05, + "loss": 6.0775, + "step": 1348 + }, + { + "epoch": 0.08374200757340618, + "grad_norm": 0.7076030126463316, + "learning_rate": 2.7912269811711157e-05, + "loss": 6.0657, + "step": 1349 + }, + { + "epoch": 0.08380408467316407, + "grad_norm": 0.6847959138533308, + "learning_rate": 2.793296089385475e-05, + "loss": 6.0532, + "step": 1350 + }, + { + "epoch": 0.08386616177292197, + "grad_norm": 0.7722015524079181, + "learning_rate": 2.7953651975998347e-05, + "loss": 6.0804, + "step": 1351 + }, + { + "epoch": 0.08392823887267988, + "grad_norm": 0.7788038486826451, + "learning_rate": 2.7974343058141942e-05, + "loss": 6.0411, + "step": 1352 + }, + { + "epoch": 0.08399031597243777, + "grad_norm": 0.8794292673963621, + "learning_rate": 2.7995034140285538e-05, + "loss": 6.175, + "step": 1353 + }, + { + "epoch": 0.08405239307219567, + "grad_norm": 0.8572941643704399, + "learning_rate": 2.8015725222429136e-05, + "loss": 6.0229, + "step": 1354 + }, + { + "epoch": 0.08411447017195357, + "grad_norm": 0.5926891436652104, + "learning_rate": 2.803641630457273e-05, + "loss": 6.1382, + "step": 1355 + }, + { + "epoch": 0.08417654727171146, + "grad_norm": 0.7014269139166497, + "learning_rate": 2.8057107386716326e-05, + "loss": 6.1232, + "step": 1356 + }, + { + "epoch": 0.08423862437146937, + "grad_norm": 0.8602640235176896, + "learning_rate": 2.807779846885992e-05, + "loss": 6.1513, + "step": 1357 + }, + { + "epoch": 0.08430070147122727, + "grad_norm": 0.8044045700886188, + "learning_rate": 2.809848955100352e-05, + "loss": 6.1269, + "step": 1358 + }, + { + "epoch": 0.08436277857098516, + "grad_norm": 0.76867457694389, + "learning_rate": 2.811918063314711e-05, + "loss": 6.0811, + "step": 1359 + }, + { + "epoch": 0.08442485567074307, + "grad_norm": 0.6061914376357999, + "learning_rate": 2.813987171529071e-05, + "loss": 6.0146, + "step": 1360 + }, + { + "epoch": 0.08448693277050096, + "grad_norm": 0.6485664974727993, + "learning_rate": 2.816056279743431e-05, + "loss": 6.0563, + "step": 1361 + }, + { + "epoch": 0.08454900987025886, + "grad_norm": 0.5725583314122394, + "learning_rate": 2.81812538795779e-05, + "loss": 6.0527, + "step": 1362 + }, + { + "epoch": 0.08461108697001676, + "grad_norm": 0.5944381290047824, + "learning_rate": 2.82019449617215e-05, + "loss": 6.1761, + "step": 1363 + }, + { + "epoch": 0.08467316406977465, + "grad_norm": 0.5333541823822079, + "learning_rate": 2.8222636043865097e-05, + "loss": 6.0609, + "step": 1364 + }, + { + "epoch": 0.08473524116953256, + "grad_norm": 0.5969966414962562, + "learning_rate": 2.824332712600869e-05, + "loss": 6.0369, + "step": 1365 + }, + { + "epoch": 0.08479731826929046, + "grad_norm": 0.5919210473650871, + "learning_rate": 2.8264018208152287e-05, + "loss": 6.0509, + "step": 1366 + }, + { + "epoch": 0.08485939536904835, + "grad_norm": 0.9501638534685809, + "learning_rate": 2.8284709290295886e-05, + "loss": 6.0374, + "step": 1367 + }, + { + "epoch": 0.08492147246880626, + "grad_norm": 0.5203150236509597, + "learning_rate": 2.8305400372439477e-05, + "loss": 6.1773, + "step": 1368 + }, + { + "epoch": 0.08498354956856416, + "grad_norm": 0.6139671474903221, + "learning_rate": 2.8326091454583076e-05, + "loss": 6.1336, + "step": 1369 + }, + { + "epoch": 0.08504562666832205, + "grad_norm": 0.5874266203242055, + "learning_rate": 2.8346782536726674e-05, + "loss": 6.114, + "step": 1370 + }, + { + "epoch": 0.08510770376807995, + "grad_norm": 0.5471244515565943, + "learning_rate": 2.8367473618870266e-05, + "loss": 6.0435, + "step": 1371 + }, + { + "epoch": 0.08516978086783786, + "grad_norm": 0.69305137993169, + "learning_rate": 2.8388164701013864e-05, + "loss": 5.9764, + "step": 1372 + }, + { + "epoch": 0.08523185796759575, + "grad_norm": 0.6403982179756318, + "learning_rate": 2.8408855783157463e-05, + "loss": 6.1414, + "step": 1373 + }, + { + "epoch": 0.08529393506735365, + "grad_norm": 0.4692335972070861, + "learning_rate": 2.8429546865301055e-05, + "loss": 6.0735, + "step": 1374 + }, + { + "epoch": 0.08535601216711156, + "grad_norm": 0.47956974932768104, + "learning_rate": 2.8450237947444653e-05, + "loss": 6.0244, + "step": 1375 + }, + { + "epoch": 0.08541808926686945, + "grad_norm": 0.929904335444327, + "learning_rate": 2.847092902958825e-05, + "loss": 6.0397, + "step": 1376 + }, + { + "epoch": 0.08548016636662735, + "grad_norm": 0.5797039070789359, + "learning_rate": 2.8491620111731843e-05, + "loss": 6.0644, + "step": 1377 + }, + { + "epoch": 0.08554224346638525, + "grad_norm": 0.6179668961797365, + "learning_rate": 2.851231119387544e-05, + "loss": 5.9255, + "step": 1378 + }, + { + "epoch": 0.08560432056614314, + "grad_norm": 0.477189105467728, + "learning_rate": 2.853300227601904e-05, + "loss": 6.0731, + "step": 1379 + }, + { + "epoch": 0.08566639766590105, + "grad_norm": 0.6088340020627285, + "learning_rate": 2.8553693358162632e-05, + "loss": 6.024, + "step": 1380 + }, + { + "epoch": 0.08572847476565895, + "grad_norm": 0.4939576213444867, + "learning_rate": 2.857438444030623e-05, + "loss": 6.0502, + "step": 1381 + }, + { + "epoch": 0.08579055186541684, + "grad_norm": 0.6067838353631774, + "learning_rate": 2.8595075522449825e-05, + "loss": 6.0237, + "step": 1382 + }, + { + "epoch": 0.08585262896517475, + "grad_norm": 0.4595789203737429, + "learning_rate": 2.861576660459342e-05, + "loss": 6.0742, + "step": 1383 + }, + { + "epoch": 0.08591470606493265, + "grad_norm": 0.5188205418250114, + "learning_rate": 2.8636457686737015e-05, + "loss": 6.0115, + "step": 1384 + }, + { + "epoch": 0.08597678316469054, + "grad_norm": 0.5265364507463264, + "learning_rate": 2.8657148768880614e-05, + "loss": 6.1349, + "step": 1385 + }, + { + "epoch": 0.08603886026444844, + "grad_norm": 0.6246369260970486, + "learning_rate": 2.867783985102421e-05, + "loss": 6.0089, + "step": 1386 + }, + { + "epoch": 0.08610093736420635, + "grad_norm": 0.7191130094597775, + "learning_rate": 2.8698530933167804e-05, + "loss": 6.0781, + "step": 1387 + }, + { + "epoch": 0.08616301446396424, + "grad_norm": 0.6879956452819603, + "learning_rate": 2.8719222015311403e-05, + "loss": 6.0817, + "step": 1388 + }, + { + "epoch": 0.08622509156372214, + "grad_norm": 0.6527074839122451, + "learning_rate": 2.8739913097454994e-05, + "loss": 6.073, + "step": 1389 + }, + { + "epoch": 0.08628716866348005, + "grad_norm": 0.8432101535990164, + "learning_rate": 2.8760604179598593e-05, + "loss": 6.0498, + "step": 1390 + }, + { + "epoch": 0.08634924576323794, + "grad_norm": 0.8506452187203041, + "learning_rate": 2.878129526174219e-05, + "loss": 6.0577, + "step": 1391 + }, + { + "epoch": 0.08641132286299584, + "grad_norm": 0.6818594813482164, + "learning_rate": 2.8801986343885783e-05, + "loss": 6.0651, + "step": 1392 + }, + { + "epoch": 0.08647339996275374, + "grad_norm": 0.7859252655715171, + "learning_rate": 2.882267742602938e-05, + "loss": 6.0824, + "step": 1393 + }, + { + "epoch": 0.08653547706251163, + "grad_norm": 0.6308113528654795, + "learning_rate": 2.884336850817298e-05, + "loss": 6.0328, + "step": 1394 + }, + { + "epoch": 0.08659755416226954, + "grad_norm": 0.9202316637272092, + "learning_rate": 2.886405959031657e-05, + "loss": 6.1046, + "step": 1395 + }, + { + "epoch": 0.08665963126202744, + "grad_norm": 0.8621314491450004, + "learning_rate": 2.888475067246017e-05, + "loss": 6.1234, + "step": 1396 + }, + { + "epoch": 0.08672170836178533, + "grad_norm": 0.7709717411497975, + "learning_rate": 2.890544175460377e-05, + "loss": 5.927, + "step": 1397 + }, + { + "epoch": 0.08678378546154324, + "grad_norm": 0.6915071704724428, + "learning_rate": 2.892613283674736e-05, + "loss": 6.0304, + "step": 1398 + }, + { + "epoch": 0.08684586256130114, + "grad_norm": 0.6878242161683207, + "learning_rate": 2.894682391889096e-05, + "loss": 6.1694, + "step": 1399 + }, + { + "epoch": 0.08690793966105903, + "grad_norm": 0.639758710601407, + "learning_rate": 2.8967515001034557e-05, + "loss": 6.0783, + "step": 1400 + }, + { + "epoch": 0.08697001676081693, + "grad_norm": 0.6624613138815572, + "learning_rate": 2.8988206083178155e-05, + "loss": 6.079, + "step": 1401 + }, + { + "epoch": 0.08703209386057484, + "grad_norm": 0.7592884909250053, + "learning_rate": 2.9008897165321747e-05, + "loss": 6.1436, + "step": 1402 + }, + { + "epoch": 0.08709417096033273, + "grad_norm": 0.9948732865222104, + "learning_rate": 2.9029588247465346e-05, + "loss": 6.0031, + "step": 1403 + }, + { + "epoch": 0.08715624806009063, + "grad_norm": 0.8707742147745017, + "learning_rate": 2.9050279329608944e-05, + "loss": 6.0415, + "step": 1404 + }, + { + "epoch": 0.08721832515984854, + "grad_norm": 0.5916707121571958, + "learning_rate": 2.9070970411752536e-05, + "loss": 6.0493, + "step": 1405 + }, + { + "epoch": 0.08728040225960643, + "grad_norm": 0.6546534235787337, + "learning_rate": 2.9091661493896134e-05, + "loss": 6.0095, + "step": 1406 + }, + { + "epoch": 0.08734247935936433, + "grad_norm": 0.6497281340442046, + "learning_rate": 2.911235257603973e-05, + "loss": 6.0482, + "step": 1407 + }, + { + "epoch": 0.08740455645912223, + "grad_norm": 0.5071181906058911, + "learning_rate": 2.9133043658183324e-05, + "loss": 6.026, + "step": 1408 + }, + { + "epoch": 0.08746663355888012, + "grad_norm": 0.5456397957123021, + "learning_rate": 2.9153734740326923e-05, + "loss": 6.018, + "step": 1409 + }, + { + "epoch": 0.08752871065863803, + "grad_norm": 0.48337163369205405, + "learning_rate": 2.9174425822470518e-05, + "loss": 6.0961, + "step": 1410 + }, + { + "epoch": 0.08759078775839593, + "grad_norm": 0.5203632776725526, + "learning_rate": 2.9195116904614113e-05, + "loss": 6.0915, + "step": 1411 + }, + { + "epoch": 0.08765286485815382, + "grad_norm": 0.4481846724700148, + "learning_rate": 2.9215807986757708e-05, + "loss": 5.9625, + "step": 1412 + }, + { + "epoch": 0.08771494195791173, + "grad_norm": 0.5480971984163205, + "learning_rate": 2.9236499068901307e-05, + "loss": 6.0823, + "step": 1413 + }, + { + "epoch": 0.08777701905766963, + "grad_norm": 0.449412295898785, + "learning_rate": 2.9257190151044898e-05, + "loss": 6.054, + "step": 1414 + }, + { + "epoch": 0.08783909615742752, + "grad_norm": 0.6117012177314131, + "learning_rate": 2.9277881233188497e-05, + "loss": 5.9581, + "step": 1415 + }, + { + "epoch": 0.08790117325718542, + "grad_norm": 0.48564633296814136, + "learning_rate": 2.9298572315332095e-05, + "loss": 5.9651, + "step": 1416 + }, + { + "epoch": 0.08796325035694333, + "grad_norm": 0.6481655193894815, + "learning_rate": 2.9319263397475687e-05, + "loss": 5.9926, + "step": 1417 + }, + { + "epoch": 0.08802532745670122, + "grad_norm": 0.4935594993993782, + "learning_rate": 2.9339954479619285e-05, + "loss": 6.096, + "step": 1418 + }, + { + "epoch": 0.08808740455645912, + "grad_norm": 0.7305681701565434, + "learning_rate": 2.9360645561762884e-05, + "loss": 6.001, + "step": 1419 + }, + { + "epoch": 0.08814948165621703, + "grad_norm": 0.6102264276020455, + "learning_rate": 2.9381336643906475e-05, + "loss": 6.0104, + "step": 1420 + }, + { + "epoch": 0.08821155875597492, + "grad_norm": 0.8309337559357786, + "learning_rate": 2.9402027726050074e-05, + "loss": 6.013, + "step": 1421 + }, + { + "epoch": 0.08827363585573282, + "grad_norm": 0.7021856178764377, + "learning_rate": 2.9422718808193672e-05, + "loss": 5.9949, + "step": 1422 + }, + { + "epoch": 0.08833571295549072, + "grad_norm": 0.6670439715411045, + "learning_rate": 2.9443409890337264e-05, + "loss": 5.949, + "step": 1423 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 0.6695338201306577, + "learning_rate": 2.9464100972480863e-05, + "loss": 6.0448, + "step": 1424 + }, + { + "epoch": 0.08845986715500652, + "grad_norm": 0.6102161715069109, + "learning_rate": 2.948479205462446e-05, + "loss": 6.1287, + "step": 1425 + }, + { + "epoch": 0.08852194425476442, + "grad_norm": 0.542691688379971, + "learning_rate": 2.9505483136768053e-05, + "loss": 6.0462, + "step": 1426 + }, + { + "epoch": 0.08858402135452231, + "grad_norm": 0.6338629304838477, + "learning_rate": 2.952617421891165e-05, + "loss": 5.9772, + "step": 1427 + }, + { + "epoch": 0.08864609845428022, + "grad_norm": 0.6339478844784326, + "learning_rate": 2.954686530105525e-05, + "loss": 5.989, + "step": 1428 + }, + { + "epoch": 0.08870817555403812, + "grad_norm": 0.7015969945502856, + "learning_rate": 2.956755638319884e-05, + "loss": 5.9677, + "step": 1429 + }, + { + "epoch": 0.08877025265379601, + "grad_norm": 0.9741256522547632, + "learning_rate": 2.958824746534244e-05, + "loss": 5.9863, + "step": 1430 + }, + { + "epoch": 0.08883232975355392, + "grad_norm": 0.751627745145793, + "learning_rate": 2.9608938547486038e-05, + "loss": 5.9249, + "step": 1431 + }, + { + "epoch": 0.08889440685331182, + "grad_norm": 1.2566158217524952, + "learning_rate": 2.962962962962963e-05, + "loss": 6.0959, + "step": 1432 + }, + { + "epoch": 0.08895648395306971, + "grad_norm": 0.7460357084216571, + "learning_rate": 2.965032071177323e-05, + "loss": 5.9203, + "step": 1433 + }, + { + "epoch": 0.08901856105282761, + "grad_norm": 0.8132774594793015, + "learning_rate": 2.9671011793916827e-05, + "loss": 5.9369, + "step": 1434 + }, + { + "epoch": 0.08908063815258552, + "grad_norm": 0.7018430892130694, + "learning_rate": 2.969170287606042e-05, + "loss": 5.9712, + "step": 1435 + }, + { + "epoch": 0.08914271525234341, + "grad_norm": 0.8320461504681029, + "learning_rate": 2.9712393958204017e-05, + "loss": 6.0329, + "step": 1436 + }, + { + "epoch": 0.08920479235210131, + "grad_norm": 0.8071020990755844, + "learning_rate": 2.9733085040347612e-05, + "loss": 5.9867, + "step": 1437 + }, + { + "epoch": 0.08926686945185922, + "grad_norm": 1.1441037157434328, + "learning_rate": 2.9753776122491207e-05, + "loss": 6.0456, + "step": 1438 + }, + { + "epoch": 0.0893289465516171, + "grad_norm": 0.7075071893198687, + "learning_rate": 2.9774467204634802e-05, + "loss": 5.9342, + "step": 1439 + }, + { + "epoch": 0.08939102365137501, + "grad_norm": 0.6830152407016711, + "learning_rate": 2.97951582867784e-05, + "loss": 5.9582, + "step": 1440 + }, + { + "epoch": 0.08945310075113291, + "grad_norm": 0.7220222785291628, + "learning_rate": 2.9815849368921996e-05, + "loss": 6.0143, + "step": 1441 + }, + { + "epoch": 0.0895151778508908, + "grad_norm": 0.7002870933148995, + "learning_rate": 2.983654045106559e-05, + "loss": 6.0128, + "step": 1442 + }, + { + "epoch": 0.08957725495064871, + "grad_norm": 0.5278160523317091, + "learning_rate": 2.985723153320919e-05, + "loss": 6.0869, + "step": 1443 + }, + { + "epoch": 0.08963933205040661, + "grad_norm": 0.7026474920723332, + "learning_rate": 2.987792261535278e-05, + "loss": 6.099, + "step": 1444 + }, + { + "epoch": 0.0897014091501645, + "grad_norm": 0.564092104866664, + "learning_rate": 2.989861369749638e-05, + "loss": 6.0108, + "step": 1445 + }, + { + "epoch": 0.0897634862499224, + "grad_norm": 0.5551253457758719, + "learning_rate": 2.9919304779639978e-05, + "loss": 6.021, + "step": 1446 + }, + { + "epoch": 0.08982556334968031, + "grad_norm": 0.6964516897870773, + "learning_rate": 2.993999586178357e-05, + "loss": 5.9879, + "step": 1447 + }, + { + "epoch": 0.0898876404494382, + "grad_norm": 0.4907958929724808, + "learning_rate": 2.9960686943927168e-05, + "loss": 6.0077, + "step": 1448 + }, + { + "epoch": 0.0899497175491961, + "grad_norm": 0.5489801488181396, + "learning_rate": 2.9981378026070767e-05, + "loss": 5.9626, + "step": 1449 + }, + { + "epoch": 0.09001179464895401, + "grad_norm": 0.5367853470398497, + "learning_rate": 3.0002069108214358e-05, + "loss": 5.9644, + "step": 1450 + }, + { + "epoch": 0.0900738717487119, + "grad_norm": 0.59580244893502, + "learning_rate": 3.0022760190357957e-05, + "loss": 6.0339, + "step": 1451 + }, + { + "epoch": 0.0901359488484698, + "grad_norm": 0.650118407138459, + "learning_rate": 3.0043451272501555e-05, + "loss": 5.9406, + "step": 1452 + }, + { + "epoch": 0.09019802594822769, + "grad_norm": 0.6896319062988451, + "learning_rate": 3.0064142354645147e-05, + "loss": 5.992, + "step": 1453 + }, + { + "epoch": 0.0902601030479856, + "grad_norm": 0.5172696327845843, + "learning_rate": 3.0084833436788745e-05, + "loss": 5.9894, + "step": 1454 + }, + { + "epoch": 0.0903221801477435, + "grad_norm": 0.6369357295183429, + "learning_rate": 3.0105524518932344e-05, + "loss": 6.0896, + "step": 1455 + }, + { + "epoch": 0.09038425724750139, + "grad_norm": 0.49346585246041025, + "learning_rate": 3.0126215601075935e-05, + "loss": 6.1063, + "step": 1456 + }, + { + "epoch": 0.0904463343472593, + "grad_norm": 0.5846363370643173, + "learning_rate": 3.0146906683219534e-05, + "loss": 5.9655, + "step": 1457 + }, + { + "epoch": 0.0905084114470172, + "grad_norm": 0.5008608143365838, + "learning_rate": 3.0167597765363132e-05, + "loss": 6.0535, + "step": 1458 + }, + { + "epoch": 0.09057048854677509, + "grad_norm": 0.5585082462200625, + "learning_rate": 3.0188288847506724e-05, + "loss": 5.921, + "step": 1459 + }, + { + "epoch": 0.09063256564653299, + "grad_norm": 0.9026843992667456, + "learning_rate": 3.0208979929650323e-05, + "loss": 5.931, + "step": 1460 + }, + { + "epoch": 0.0906946427462909, + "grad_norm": 0.943220140121523, + "learning_rate": 3.022967101179392e-05, + "loss": 6.0231, + "step": 1461 + }, + { + "epoch": 0.09075671984604879, + "grad_norm": 0.696918946321706, + "learning_rate": 3.0250362093937513e-05, + "loss": 5.9788, + "step": 1462 + }, + { + "epoch": 0.09081879694580669, + "grad_norm": 0.6478518173497883, + "learning_rate": 3.027105317608111e-05, + "loss": 5.9633, + "step": 1463 + }, + { + "epoch": 0.0908808740455646, + "grad_norm": 0.6062954826597546, + "learning_rate": 3.029174425822471e-05, + "loss": 5.9981, + "step": 1464 + }, + { + "epoch": 0.09094295114532248, + "grad_norm": 0.6263358859789595, + "learning_rate": 3.03124353403683e-05, + "loss": 5.9581, + "step": 1465 + }, + { + "epoch": 0.09100502824508039, + "grad_norm": 0.5643374050541399, + "learning_rate": 3.03331264225119e-05, + "loss": 5.941, + "step": 1466 + }, + { + "epoch": 0.09106710534483829, + "grad_norm": 0.5888228012235023, + "learning_rate": 3.0353817504655495e-05, + "loss": 5.8429, + "step": 1467 + }, + { + "epoch": 0.09112918244459618, + "grad_norm": 0.6782211622272512, + "learning_rate": 3.037450858679909e-05, + "loss": 6.0039, + "step": 1468 + }, + { + "epoch": 0.09119125954435409, + "grad_norm": 0.6464879707167281, + "learning_rate": 3.0395199668942685e-05, + "loss": 6.07, + "step": 1469 + }, + { + "epoch": 0.09125333664411199, + "grad_norm": 1.0338550378650033, + "learning_rate": 3.0415890751086283e-05, + "loss": 5.8884, + "step": 1470 + }, + { + "epoch": 0.09131541374386988, + "grad_norm": 0.5664633623465565, + "learning_rate": 3.043658183322988e-05, + "loss": 5.957, + "step": 1471 + }, + { + "epoch": 0.09137749084362778, + "grad_norm": 0.9496167765195149, + "learning_rate": 3.0457272915373474e-05, + "loss": 5.927, + "step": 1472 + }, + { + "epoch": 0.09143956794338569, + "grad_norm": 0.6417885657322273, + "learning_rate": 3.0477963997517072e-05, + "loss": 5.9816, + "step": 1473 + }, + { + "epoch": 0.09150164504314358, + "grad_norm": 0.6264738824062973, + "learning_rate": 3.0498655079660664e-05, + "loss": 5.9072, + "step": 1474 + }, + { + "epoch": 0.09156372214290148, + "grad_norm": 0.7111689850937992, + "learning_rate": 3.051934616180426e-05, + "loss": 5.9515, + "step": 1475 + }, + { + "epoch": 0.09162579924265939, + "grad_norm": 0.507866988559922, + "learning_rate": 3.0540037243947864e-05, + "loss": 5.9413, + "step": 1476 + }, + { + "epoch": 0.09168787634241728, + "grad_norm": 0.7190503032102503, + "learning_rate": 3.056072832609145e-05, + "loss": 5.9868, + "step": 1477 + }, + { + "epoch": 0.09174995344217518, + "grad_norm": 0.6162480072623758, + "learning_rate": 3.0581419408235054e-05, + "loss": 5.8884, + "step": 1478 + }, + { + "epoch": 0.09181203054193308, + "grad_norm": 0.6037324188863386, + "learning_rate": 3.060211049037865e-05, + "loss": 6.0303, + "step": 1479 + }, + { + "epoch": 0.09187410764169097, + "grad_norm": 0.5930568514022349, + "learning_rate": 3.0622801572522244e-05, + "loss": 5.9226, + "step": 1480 + }, + { + "epoch": 0.09193618474144888, + "grad_norm": 0.7230389380779348, + "learning_rate": 3.064349265466584e-05, + "loss": 5.9028, + "step": 1481 + }, + { + "epoch": 0.09199826184120678, + "grad_norm": 0.9151383613764636, + "learning_rate": 3.0664183736809435e-05, + "loss": 5.9791, + "step": 1482 + }, + { + "epoch": 0.09206033894096467, + "grad_norm": 0.6749023720111049, + "learning_rate": 3.068487481895303e-05, + "loss": 5.8708, + "step": 1483 + }, + { + "epoch": 0.09212241604072258, + "grad_norm": 0.6487197032094355, + "learning_rate": 3.0705565901096625e-05, + "loss": 5.9968, + "step": 1484 + }, + { + "epoch": 0.09218449314048048, + "grad_norm": 0.6489119804527944, + "learning_rate": 3.0726256983240227e-05, + "loss": 5.9415, + "step": 1485 + }, + { + "epoch": 0.09224657024023837, + "grad_norm": 0.6633163523736918, + "learning_rate": 3.074694806538382e-05, + "loss": 6.0245, + "step": 1486 + }, + { + "epoch": 0.09230864733999627, + "grad_norm": 0.7580613976174214, + "learning_rate": 3.076763914752742e-05, + "loss": 5.9723, + "step": 1487 + }, + { + "epoch": 0.09237072443975418, + "grad_norm": 0.8961680068068167, + "learning_rate": 3.078833022967101e-05, + "loss": 5.9998, + "step": 1488 + }, + { + "epoch": 0.09243280153951207, + "grad_norm": 0.6700618141753633, + "learning_rate": 3.0809021311814614e-05, + "loss": 5.9725, + "step": 1489 + }, + { + "epoch": 0.09249487863926997, + "grad_norm": 0.6236959442765831, + "learning_rate": 3.08297123939582e-05, + "loss": 5.9004, + "step": 1490 + }, + { + "epoch": 0.09255695573902788, + "grad_norm": 0.5722851798716736, + "learning_rate": 3.0850403476101804e-05, + "loss": 6.0082, + "step": 1491 + }, + { + "epoch": 0.09261903283878577, + "grad_norm": 0.5162728603372453, + "learning_rate": 3.08710945582454e-05, + "loss": 5.9934, + "step": 1492 + }, + { + "epoch": 0.09268110993854367, + "grad_norm": 0.513040277593467, + "learning_rate": 3.0891785640388994e-05, + "loss": 5.927, + "step": 1493 + }, + { + "epoch": 0.09274318703830157, + "grad_norm": 0.5996618341119027, + "learning_rate": 3.091247672253259e-05, + "loss": 5.9841, + "step": 1494 + }, + { + "epoch": 0.09280526413805947, + "grad_norm": 0.6104548252490853, + "learning_rate": 3.093316780467619e-05, + "loss": 5.9668, + "step": 1495 + }, + { + "epoch": 0.09286734123781737, + "grad_norm": 0.5578048577306057, + "learning_rate": 3.095385888681978e-05, + "loss": 6.0013, + "step": 1496 + }, + { + "epoch": 0.09292941833757527, + "grad_norm": 0.7455311996153579, + "learning_rate": 3.097454996896338e-05, + "loss": 5.8998, + "step": 1497 + }, + { + "epoch": 0.09299149543733316, + "grad_norm": 1.0051425787151504, + "learning_rate": 3.0995241051106976e-05, + "loss": 5.9491, + "step": 1498 + }, + { + "epoch": 0.09305357253709107, + "grad_norm": 0.6150526474693757, + "learning_rate": 3.101593213325057e-05, + "loss": 5.9463, + "step": 1499 + }, + { + "epoch": 0.09311564963684897, + "grad_norm": 0.8453962020189477, + "learning_rate": 3.1036623215394166e-05, + "loss": 5.7715, + "step": 1500 + }, + { + "epoch": 0.09317772673660686, + "grad_norm": 0.6258668848086438, + "learning_rate": 3.105731429753777e-05, + "loss": 5.9502, + "step": 1501 + }, + { + "epoch": 0.09323980383636477, + "grad_norm": 0.6507524280187718, + "learning_rate": 3.1078005379681356e-05, + "loss": 5.96, + "step": 1502 + }, + { + "epoch": 0.09330188093612267, + "grad_norm": 0.583410663815054, + "learning_rate": 3.109869646182496e-05, + "loss": 5.8357, + "step": 1503 + }, + { + "epoch": 0.09336395803588056, + "grad_norm": 0.585047363621733, + "learning_rate": 3.111938754396855e-05, + "loss": 5.9166, + "step": 1504 + }, + { + "epoch": 0.09342603513563846, + "grad_norm": 0.7176191306818548, + "learning_rate": 3.114007862611215e-05, + "loss": 6.015, + "step": 1505 + }, + { + "epoch": 0.09348811223539637, + "grad_norm": 0.819495457248537, + "learning_rate": 3.1160769708255743e-05, + "loss": 5.8599, + "step": 1506 + }, + { + "epoch": 0.09355018933515426, + "grad_norm": 0.6114166244457255, + "learning_rate": 3.118146079039934e-05, + "loss": 5.9489, + "step": 1507 + }, + { + "epoch": 0.09361226643491216, + "grad_norm": 0.6045623147002533, + "learning_rate": 3.1202151872542934e-05, + "loss": 5.9288, + "step": 1508 + }, + { + "epoch": 0.09367434353467007, + "grad_norm": 0.559493107820593, + "learning_rate": 3.122284295468653e-05, + "loss": 5.8813, + "step": 1509 + }, + { + "epoch": 0.09373642063442796, + "grad_norm": 0.8973699446781658, + "learning_rate": 3.124353403683013e-05, + "loss": 5.8518, + "step": 1510 + }, + { + "epoch": 0.09379849773418586, + "grad_norm": 0.5004133959315779, + "learning_rate": 3.1264225118973726e-05, + "loss": 5.8987, + "step": 1511 + }, + { + "epoch": 0.09386057483394376, + "grad_norm": 0.6587403139998377, + "learning_rate": 3.128491620111732e-05, + "loss": 5.9606, + "step": 1512 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 0.5108449916328626, + "learning_rate": 3.1305607283260916e-05, + "loss": 5.9434, + "step": 1513 + }, + { + "epoch": 0.09398472903345956, + "grad_norm": 0.6041673271131742, + "learning_rate": 3.132629836540451e-05, + "loss": 5.9285, + "step": 1514 + }, + { + "epoch": 0.09404680613321746, + "grad_norm": 0.6612717948667742, + "learning_rate": 3.1346989447548106e-05, + "loss": 5.8503, + "step": 1515 + }, + { + "epoch": 0.09410888323297535, + "grad_norm": 0.9113528853681507, + "learning_rate": 3.136768052969171e-05, + "loss": 5.905, + "step": 1516 + }, + { + "epoch": 0.09417096033273326, + "grad_norm": 0.50295110860593, + "learning_rate": 3.1388371611835296e-05, + "loss": 5.8085, + "step": 1517 + }, + { + "epoch": 0.09423303743249116, + "grad_norm": 0.7533529751743269, + "learning_rate": 3.14090626939789e-05, + "loss": 5.8309, + "step": 1518 + }, + { + "epoch": 0.09429511453224905, + "grad_norm": 0.5635265942977083, + "learning_rate": 3.142975377612249e-05, + "loss": 5.9251, + "step": 1519 + }, + { + "epoch": 0.09435719163200695, + "grad_norm": 1.1972805103166995, + "learning_rate": 3.145044485826609e-05, + "loss": 5.8993, + "step": 1520 + }, + { + "epoch": 0.09441926873176486, + "grad_norm": 0.8474768523484394, + "learning_rate": 3.147113594040968e-05, + "loss": 5.8732, + "step": 1521 + }, + { + "epoch": 0.09448134583152275, + "grad_norm": 0.7499007318287875, + "learning_rate": 3.1491827022553285e-05, + "loss": 5.9397, + "step": 1522 + }, + { + "epoch": 0.09454342293128065, + "grad_norm": 0.5560002977349836, + "learning_rate": 3.151251810469687e-05, + "loss": 5.901, + "step": 1523 + }, + { + "epoch": 0.09460550003103856, + "grad_norm": 0.6107524562322543, + "learning_rate": 3.1533209186840475e-05, + "loss": 5.9156, + "step": 1524 + }, + { + "epoch": 0.09466757713079645, + "grad_norm": 0.5983640929535543, + "learning_rate": 3.155390026898407e-05, + "loss": 5.8937, + "step": 1525 + }, + { + "epoch": 0.09472965423055435, + "grad_norm": 0.5248641315124414, + "learning_rate": 3.1574591351127665e-05, + "loss": 5.892, + "step": 1526 + }, + { + "epoch": 0.09479173133031225, + "grad_norm": 1.0495408689220824, + "learning_rate": 3.159528243327126e-05, + "loss": 5.9251, + "step": 1527 + }, + { + "epoch": 0.09485380843007014, + "grad_norm": 0.5491826822910261, + "learning_rate": 3.161597351541486e-05, + "loss": 5.8981, + "step": 1528 + }, + { + "epoch": 0.09491588552982805, + "grad_norm": 0.960917464877044, + "learning_rate": 3.163666459755845e-05, + "loss": 5.9043, + "step": 1529 + }, + { + "epoch": 0.09497796262958595, + "grad_norm": 0.8029383936108213, + "learning_rate": 3.165735567970205e-05, + "loss": 5.8712, + "step": 1530 + }, + { + "epoch": 0.09504003972934384, + "grad_norm": 0.8596439634258608, + "learning_rate": 3.167804676184565e-05, + "loss": 5.8823, + "step": 1531 + }, + { + "epoch": 0.09510211682910175, + "grad_norm": 0.8087352684129708, + "learning_rate": 3.169873784398924e-05, + "loss": 5.7942, + "step": 1532 + }, + { + "epoch": 0.09516419392885965, + "grad_norm": 0.7195805850599591, + "learning_rate": 3.171942892613284e-05, + "loss": 5.9254, + "step": 1533 + }, + { + "epoch": 0.09522627102861754, + "grad_norm": 0.8052813736595998, + "learning_rate": 3.174012000827643e-05, + "loss": 5.7684, + "step": 1534 + }, + { + "epoch": 0.09528834812837544, + "grad_norm": 0.6491733190842851, + "learning_rate": 3.176081109042003e-05, + "loss": 5.8306, + "step": 1535 + }, + { + "epoch": 0.09535042522813335, + "grad_norm": 0.6732340972387733, + "learning_rate": 3.178150217256363e-05, + "loss": 5.904, + "step": 1536 + }, + { + "epoch": 0.09541250232789124, + "grad_norm": 0.8433606344322282, + "learning_rate": 3.1802193254707225e-05, + "loss": 5.8919, + "step": 1537 + }, + { + "epoch": 0.09547457942764914, + "grad_norm": 0.6599894882940065, + "learning_rate": 3.182288433685082e-05, + "loss": 5.8973, + "step": 1538 + }, + { + "epoch": 0.09553665652740705, + "grad_norm": 0.7110797194871844, + "learning_rate": 3.1843575418994415e-05, + "loss": 5.865, + "step": 1539 + }, + { + "epoch": 0.09559873362716494, + "grad_norm": 0.6806989357113521, + "learning_rate": 3.186426650113801e-05, + "loss": 5.8901, + "step": 1540 + }, + { + "epoch": 0.09566081072692284, + "grad_norm": 0.6976504933289538, + "learning_rate": 3.1884957583281605e-05, + "loss": 5.8654, + "step": 1541 + }, + { + "epoch": 0.09572288782668074, + "grad_norm": 0.7460647098022468, + "learning_rate": 3.19056486654252e-05, + "loss": 5.9025, + "step": 1542 + }, + { + "epoch": 0.09578496492643863, + "grad_norm": 0.6187918369521673, + "learning_rate": 3.19263397475688e-05, + "loss": 5.928, + "step": 1543 + }, + { + "epoch": 0.09584704202619654, + "grad_norm": 0.6650773316070223, + "learning_rate": 3.194703082971239e-05, + "loss": 5.9007, + "step": 1544 + }, + { + "epoch": 0.09590911912595444, + "grad_norm": 0.5037967947435807, + "learning_rate": 3.196772191185599e-05, + "loss": 5.8793, + "step": 1545 + }, + { + "epoch": 0.09597119622571233, + "grad_norm": 0.6493055898379647, + "learning_rate": 3.198841299399959e-05, + "loss": 5.8578, + "step": 1546 + }, + { + "epoch": 0.09603327332547024, + "grad_norm": 0.47867478818210857, + "learning_rate": 3.200910407614318e-05, + "loss": 5.9923, + "step": 1547 + }, + { + "epoch": 0.09609535042522813, + "grad_norm": 0.5555919204837135, + "learning_rate": 3.202979515828678e-05, + "loss": 5.9327, + "step": 1548 + }, + { + "epoch": 0.09615742752498603, + "grad_norm": 0.5024492357717224, + "learning_rate": 3.205048624043038e-05, + "loss": 5.8998, + "step": 1549 + }, + { + "epoch": 0.09621950462474393, + "grad_norm": 0.6458265293952465, + "learning_rate": 3.207117732257397e-05, + "loss": 5.93, + "step": 1550 + }, + { + "epoch": 0.09628158172450182, + "grad_norm": 0.5302733250963854, + "learning_rate": 3.209186840471757e-05, + "loss": 5.8567, + "step": 1551 + }, + { + "epoch": 0.09634365882425973, + "grad_norm": 0.6230668211228527, + "learning_rate": 3.2112559486861164e-05, + "loss": 5.7796, + "step": 1552 + }, + { + "epoch": 0.09640573592401763, + "grad_norm": 1.0792342850895407, + "learning_rate": 3.213325056900476e-05, + "loss": 5.8892, + "step": 1553 + }, + { + "epoch": 0.09646781302377552, + "grad_norm": 0.8023171930273175, + "learning_rate": 3.2153941651148355e-05, + "loss": 5.8946, + "step": 1554 + }, + { + "epoch": 0.09652989012353343, + "grad_norm": 0.7068837470365926, + "learning_rate": 3.2174632733291956e-05, + "loss": 5.8872, + "step": 1555 + }, + { + "epoch": 0.09659196722329133, + "grad_norm": 0.6995106945379168, + "learning_rate": 3.2195323815435545e-05, + "loss": 5.9182, + "step": 1556 + }, + { + "epoch": 0.09665404432304922, + "grad_norm": 0.6061862263121883, + "learning_rate": 3.2216014897579147e-05, + "loss": 5.8848, + "step": 1557 + }, + { + "epoch": 0.09671612142280712, + "grad_norm": 0.6512690159882432, + "learning_rate": 3.223670597972274e-05, + "loss": 5.9194, + "step": 1558 + }, + { + "epoch": 0.09677819852256503, + "grad_norm": 1.0647702365948633, + "learning_rate": 3.225739706186634e-05, + "loss": 5.8773, + "step": 1559 + }, + { + "epoch": 0.09684027562232292, + "grad_norm": 0.7433046466127374, + "learning_rate": 3.227808814400993e-05, + "loss": 5.874, + "step": 1560 + }, + { + "epoch": 0.09690235272208082, + "grad_norm": 0.7711396811428519, + "learning_rate": 3.2298779226153534e-05, + "loss": 5.8365, + "step": 1561 + }, + { + "epoch": 0.09696442982183873, + "grad_norm": 0.7628735909071586, + "learning_rate": 3.231947030829712e-05, + "loss": 5.8532, + "step": 1562 + }, + { + "epoch": 0.09702650692159662, + "grad_norm": 0.754782093825157, + "learning_rate": 3.2340161390440724e-05, + "loss": 5.8271, + "step": 1563 + }, + { + "epoch": 0.09708858402135452, + "grad_norm": 0.7275521706033918, + "learning_rate": 3.236085247258432e-05, + "loss": 5.7427, + "step": 1564 + }, + { + "epoch": 0.09715066112111242, + "grad_norm": 0.6513282729939479, + "learning_rate": 3.2381543554727914e-05, + "loss": 5.8384, + "step": 1565 + }, + { + "epoch": 0.09721273822087032, + "grad_norm": 0.63451699889749, + "learning_rate": 3.240223463687151e-05, + "loss": 5.7921, + "step": 1566 + }, + { + "epoch": 0.09727481532062822, + "grad_norm": 0.8310546157783092, + "learning_rate": 3.2422925719015104e-05, + "loss": 5.9694, + "step": 1567 + }, + { + "epoch": 0.09733689242038612, + "grad_norm": 0.6151396905609547, + "learning_rate": 3.24436168011587e-05, + "loss": 5.7794, + "step": 1568 + }, + { + "epoch": 0.09739896952014401, + "grad_norm": 0.8213824605735022, + "learning_rate": 3.2464307883302294e-05, + "loss": 5.886, + "step": 1569 + }, + { + "epoch": 0.09746104661990192, + "grad_norm": 0.5122709425335622, + "learning_rate": 3.2484998965445896e-05, + "loss": 5.8388, + "step": 1570 + }, + { + "epoch": 0.09752312371965982, + "grad_norm": 0.6589593589751416, + "learning_rate": 3.250569004758949e-05, + "loss": 5.8127, + "step": 1571 + }, + { + "epoch": 0.09758520081941771, + "grad_norm": 0.5521477055958018, + "learning_rate": 3.2526381129733086e-05, + "loss": 5.9107, + "step": 1572 + }, + { + "epoch": 0.09764727791917562, + "grad_norm": 0.6279421696823869, + "learning_rate": 3.254707221187668e-05, + "loss": 5.7811, + "step": 1573 + }, + { + "epoch": 0.09770935501893352, + "grad_norm": 0.6013447503676943, + "learning_rate": 3.256776329402028e-05, + "loss": 5.809, + "step": 1574 + }, + { + "epoch": 0.09777143211869141, + "grad_norm": 0.6991719704435537, + "learning_rate": 3.258845437616387e-05, + "loss": 5.8281, + "step": 1575 + }, + { + "epoch": 0.09783350921844931, + "grad_norm": 0.5913556408613182, + "learning_rate": 3.260914545830747e-05, + "loss": 5.986, + "step": 1576 + }, + { + "epoch": 0.09789558631820722, + "grad_norm": 0.7090544442685492, + "learning_rate": 3.262983654045107e-05, + "loss": 5.8534, + "step": 1577 + }, + { + "epoch": 0.09795766341796511, + "grad_norm": 0.5550505564834696, + "learning_rate": 3.2650527622594664e-05, + "loss": 5.9332, + "step": 1578 + }, + { + "epoch": 0.09801974051772301, + "grad_norm": 0.5254822831874045, + "learning_rate": 3.267121870473826e-05, + "loss": 5.8568, + "step": 1579 + }, + { + "epoch": 0.09808181761748092, + "grad_norm": 0.6225033963735069, + "learning_rate": 3.269190978688186e-05, + "loss": 5.8161, + "step": 1580 + }, + { + "epoch": 0.0981438947172388, + "grad_norm": 0.7322526660009778, + "learning_rate": 3.271260086902545e-05, + "loss": 5.8395, + "step": 1581 + }, + { + "epoch": 0.09820597181699671, + "grad_norm": 1.1562313387627852, + "learning_rate": 3.273329195116905e-05, + "loss": 5.9189, + "step": 1582 + }, + { + "epoch": 0.09826804891675461, + "grad_norm": 0.7033015983060035, + "learning_rate": 3.2753983033312646e-05, + "loss": 5.8, + "step": 1583 + }, + { + "epoch": 0.0983301260165125, + "grad_norm": 0.7351356787240827, + "learning_rate": 3.277467411545624e-05, + "loss": 5.8707, + "step": 1584 + }, + { + "epoch": 0.09839220311627041, + "grad_norm": 0.7678417888639756, + "learning_rate": 3.2795365197599836e-05, + "loss": 5.7505, + "step": 1585 + }, + { + "epoch": 0.09845428021602831, + "grad_norm": 0.6974286869127059, + "learning_rate": 3.281605627974344e-05, + "loss": 5.8601, + "step": 1586 + }, + { + "epoch": 0.0985163573157862, + "grad_norm": 0.6259739439815796, + "learning_rate": 3.2836747361887026e-05, + "loss": 5.8491, + "step": 1587 + }, + { + "epoch": 0.0985784344155441, + "grad_norm": 0.8763774520106272, + "learning_rate": 3.285743844403063e-05, + "loss": 5.899, + "step": 1588 + }, + { + "epoch": 0.09864051151530201, + "grad_norm": 0.5596347525409103, + "learning_rate": 3.287812952617422e-05, + "loss": 5.7813, + "step": 1589 + }, + { + "epoch": 0.0987025886150599, + "grad_norm": 0.6052770988104968, + "learning_rate": 3.289882060831782e-05, + "loss": 5.8322, + "step": 1590 + }, + { + "epoch": 0.0987646657148178, + "grad_norm": 0.6460605626652446, + "learning_rate": 3.291951169046141e-05, + "loss": 5.8845, + "step": 1591 + }, + { + "epoch": 0.09882674281457571, + "grad_norm": 0.5673495625981105, + "learning_rate": 3.294020277260501e-05, + "loss": 5.7867, + "step": 1592 + }, + { + "epoch": 0.0988888199143336, + "grad_norm": 0.9265111307503855, + "learning_rate": 3.29608938547486e-05, + "loss": 5.8741, + "step": 1593 + }, + { + "epoch": 0.0989508970140915, + "grad_norm": 0.8284564884273137, + "learning_rate": 3.29815849368922e-05, + "loss": 5.8287, + "step": 1594 + }, + { + "epoch": 0.0990129741138494, + "grad_norm": 0.8997977864071389, + "learning_rate": 3.30022760190358e-05, + "loss": 5.8465, + "step": 1595 + }, + { + "epoch": 0.0990750512136073, + "grad_norm": 0.7740404110322142, + "learning_rate": 3.302296710117939e-05, + "loss": 5.875, + "step": 1596 + }, + { + "epoch": 0.0991371283133652, + "grad_norm": 0.7983509681229913, + "learning_rate": 3.304365818332299e-05, + "loss": 5.8519, + "step": 1597 + }, + { + "epoch": 0.0991992054131231, + "grad_norm": 0.5476417887869405, + "learning_rate": 3.3064349265466585e-05, + "loss": 5.8696, + "step": 1598 + }, + { + "epoch": 0.099261282512881, + "grad_norm": 0.8305983764308325, + "learning_rate": 3.308504034761018e-05, + "loss": 5.803, + "step": 1599 + }, + { + "epoch": 0.0993233596126389, + "grad_norm": 0.6286177595475942, + "learning_rate": 3.3105731429753776e-05, + "loss": 5.8078, + "step": 1600 + }, + { + "epoch": 0.0993854367123968, + "grad_norm": 0.5418336928659868, + "learning_rate": 3.312642251189738e-05, + "loss": 5.8942, + "step": 1601 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 0.4927069178743038, + "learning_rate": 3.3147113594040966e-05, + "loss": 5.9339, + "step": 1602 + }, + { + "epoch": 0.0995095909119126, + "grad_norm": 0.5117881960737832, + "learning_rate": 3.316780467618457e-05, + "loss": 5.7925, + "step": 1603 + }, + { + "epoch": 0.0995716680116705, + "grad_norm": 0.5542781555154782, + "learning_rate": 3.318849575832816e-05, + "loss": 5.8249, + "step": 1604 + }, + { + "epoch": 0.09963374511142839, + "grad_norm": 0.5160065597442505, + "learning_rate": 3.320918684047176e-05, + "loss": 5.831, + "step": 1605 + }, + { + "epoch": 0.0996958222111863, + "grad_norm": 0.5271719193431544, + "learning_rate": 3.322987792261535e-05, + "loss": 5.8038, + "step": 1606 + }, + { + "epoch": 0.0997578993109442, + "grad_norm": 0.5334946949560077, + "learning_rate": 3.3250569004758955e-05, + "loss": 5.8254, + "step": 1607 + }, + { + "epoch": 0.09981997641070209, + "grad_norm": 0.5930259707136931, + "learning_rate": 3.327126008690254e-05, + "loss": 5.8022, + "step": 1608 + }, + { + "epoch": 0.09988205351045999, + "grad_norm": 0.5853710997372329, + "learning_rate": 3.3291951169046145e-05, + "loss": 5.8355, + "step": 1609 + }, + { + "epoch": 0.0999441306102179, + "grad_norm": 0.5080406263431466, + "learning_rate": 3.331264225118974e-05, + "loss": 5.7816, + "step": 1610 + }, + { + "epoch": 0.10000620770997579, + "grad_norm": 0.5854211439411731, + "learning_rate": 3.3333333333333335e-05, + "loss": 5.8212, + "step": 1611 + }, + { + "epoch": 0.10006828480973369, + "grad_norm": 0.6361848429834158, + "learning_rate": 3.335402441547693e-05, + "loss": 5.8221, + "step": 1612 + }, + { + "epoch": 0.1001303619094916, + "grad_norm": 0.5956579222491808, + "learning_rate": 3.337471549762053e-05, + "loss": 5.7382, + "step": 1613 + }, + { + "epoch": 0.10019243900924948, + "grad_norm": 0.5158072351880101, + "learning_rate": 3.339540657976412e-05, + "loss": 5.8464, + "step": 1614 + }, + { + "epoch": 0.10025451610900739, + "grad_norm": 0.43603908626754606, + "learning_rate": 3.341609766190772e-05, + "loss": 5.8211, + "step": 1615 + }, + { + "epoch": 0.10031659320876529, + "grad_norm": 0.6318408665177355, + "learning_rate": 3.343678874405132e-05, + "loss": 5.7906, + "step": 1616 + }, + { + "epoch": 0.10037867030852318, + "grad_norm": 0.5539459321909161, + "learning_rate": 3.345747982619491e-05, + "loss": 5.7785, + "step": 1617 + }, + { + "epoch": 0.10044074740828109, + "grad_norm": 0.5331386205760138, + "learning_rate": 3.347817090833851e-05, + "loss": 5.819, + "step": 1618 + }, + { + "epoch": 0.10050282450803899, + "grad_norm": 0.5540492724600192, + "learning_rate": 3.34988619904821e-05, + "loss": 5.781, + "step": 1619 + }, + { + "epoch": 0.10056490160779688, + "grad_norm": 0.4976381756720668, + "learning_rate": 3.35195530726257e-05, + "loss": 5.8143, + "step": 1620 + }, + { + "epoch": 0.10062697870755478, + "grad_norm": 0.4803810970089215, + "learning_rate": 3.354024415476929e-05, + "loss": 5.7834, + "step": 1621 + }, + { + "epoch": 0.10068905580731269, + "grad_norm": 0.4570771218778752, + "learning_rate": 3.3560935236912894e-05, + "loss": 5.8381, + "step": 1622 + }, + { + "epoch": 0.10075113290707058, + "grad_norm": 0.4991135637151566, + "learning_rate": 3.358162631905649e-05, + "loss": 5.7816, + "step": 1623 + }, + { + "epoch": 0.10081321000682848, + "grad_norm": 0.4426338881187621, + "learning_rate": 3.3602317401200084e-05, + "loss": 5.8926, + "step": 1624 + }, + { + "epoch": 0.10087528710658639, + "grad_norm": 0.5350571700771863, + "learning_rate": 3.362300848334368e-05, + "loss": 5.7241, + "step": 1625 + }, + { + "epoch": 0.10093736420634428, + "grad_norm": 0.4799808412940441, + "learning_rate": 3.3643699565487275e-05, + "loss": 5.7561, + "step": 1626 + }, + { + "epoch": 0.10099944130610218, + "grad_norm": 0.42973249373249406, + "learning_rate": 3.366439064763087e-05, + "loss": 5.7973, + "step": 1627 + }, + { + "epoch": 0.10106151840586008, + "grad_norm": 0.5541458171373744, + "learning_rate": 3.368508172977447e-05, + "loss": 5.7565, + "step": 1628 + }, + { + "epoch": 0.10112359550561797, + "grad_norm": 0.46750136010589216, + "learning_rate": 3.370577281191806e-05, + "loss": 5.8543, + "step": 1629 + }, + { + "epoch": 0.10118567260537588, + "grad_norm": 0.4541481240304858, + "learning_rate": 3.372646389406166e-05, + "loss": 5.8183, + "step": 1630 + }, + { + "epoch": 0.10124774970513378, + "grad_norm": 0.6595970584415731, + "learning_rate": 3.374715497620526e-05, + "loss": 5.7691, + "step": 1631 + }, + { + "epoch": 0.10130982680489167, + "grad_norm": 0.6240433285308524, + "learning_rate": 3.376784605834885e-05, + "loss": 5.7603, + "step": 1632 + }, + { + "epoch": 0.10137190390464958, + "grad_norm": 0.5908245244151908, + "learning_rate": 3.378853714049245e-05, + "loss": 5.7387, + "step": 1633 + }, + { + "epoch": 0.10143398100440748, + "grad_norm": 0.5465520177482706, + "learning_rate": 3.380922822263605e-05, + "loss": 5.7624, + "step": 1634 + }, + { + "epoch": 0.10149605810416537, + "grad_norm": 0.6450703995750134, + "learning_rate": 3.382991930477964e-05, + "loss": 5.7071, + "step": 1635 + }, + { + "epoch": 0.10155813520392327, + "grad_norm": 0.588821839568168, + "learning_rate": 3.385061038692324e-05, + "loss": 5.6744, + "step": 1636 + }, + { + "epoch": 0.10162021230368118, + "grad_norm": 0.9360157343242949, + "learning_rate": 3.3871301469066834e-05, + "loss": 5.787, + "step": 1637 + }, + { + "epoch": 0.10168228940343907, + "grad_norm": 0.9210974113255694, + "learning_rate": 3.389199255121043e-05, + "loss": 5.8102, + "step": 1638 + }, + { + "epoch": 0.10174436650319697, + "grad_norm": 0.8423168231705698, + "learning_rate": 3.3912683633354024e-05, + "loss": 5.7847, + "step": 1639 + }, + { + "epoch": 0.10180644360295486, + "grad_norm": 0.7389407971042478, + "learning_rate": 3.3933374715497626e-05, + "loss": 5.7533, + "step": 1640 + }, + { + "epoch": 0.10186852070271277, + "grad_norm": 0.7721812517136419, + "learning_rate": 3.3954065797641214e-05, + "loss": 5.7785, + "step": 1641 + }, + { + "epoch": 0.10193059780247067, + "grad_norm": 0.6624221738051379, + "learning_rate": 3.3974756879784816e-05, + "loss": 5.7984, + "step": 1642 + }, + { + "epoch": 0.10199267490222856, + "grad_norm": 0.5615467365241051, + "learning_rate": 3.399544796192841e-05, + "loss": 5.8065, + "step": 1643 + }, + { + "epoch": 0.10205475200198647, + "grad_norm": 0.9462471747009751, + "learning_rate": 3.4016139044072006e-05, + "loss": 5.7759, + "step": 1644 + }, + { + "epoch": 0.10211682910174437, + "grad_norm": 0.8112144570580371, + "learning_rate": 3.40368301262156e-05, + "loss": 5.7883, + "step": 1645 + }, + { + "epoch": 0.10217890620150226, + "grad_norm": 0.7045862907589732, + "learning_rate": 3.40575212083592e-05, + "loss": 5.7925, + "step": 1646 + }, + { + "epoch": 0.10224098330126016, + "grad_norm": 0.7097270008215768, + "learning_rate": 3.407821229050279e-05, + "loss": 5.7355, + "step": 1647 + }, + { + "epoch": 0.10230306040101807, + "grad_norm": 1.0579048533391682, + "learning_rate": 3.4098903372646393e-05, + "loss": 5.7113, + "step": 1648 + }, + { + "epoch": 0.10236513750077596, + "grad_norm": 0.7007423934455635, + "learning_rate": 3.411959445478999e-05, + "loss": 5.7717, + "step": 1649 + }, + { + "epoch": 0.10242721460053386, + "grad_norm": 0.6190544573874278, + "learning_rate": 3.4140285536933584e-05, + "loss": 5.7948, + "step": 1650 + }, + { + "epoch": 0.10248929170029177, + "grad_norm": 0.6597948608786863, + "learning_rate": 3.416097661907718e-05, + "loss": 5.7711, + "step": 1651 + }, + { + "epoch": 0.10255136880004966, + "grad_norm": 0.6599492984500661, + "learning_rate": 3.4181667701220774e-05, + "loss": 5.6869, + "step": 1652 + }, + { + "epoch": 0.10261344589980756, + "grad_norm": 0.6010515098713385, + "learning_rate": 3.420235878336437e-05, + "loss": 5.8121, + "step": 1653 + }, + { + "epoch": 0.10267552299956546, + "grad_norm": 0.6683487062477803, + "learning_rate": 3.4223049865507964e-05, + "loss": 5.8139, + "step": 1654 + }, + { + "epoch": 0.10273760009932335, + "grad_norm": 0.7115746462140881, + "learning_rate": 3.4243740947651566e-05, + "loss": 5.7883, + "step": 1655 + }, + { + "epoch": 0.10279967719908126, + "grad_norm": 0.5527811468709686, + "learning_rate": 3.426443202979516e-05, + "loss": 5.7981, + "step": 1656 + }, + { + "epoch": 0.10286175429883916, + "grad_norm": 0.7160305323404257, + "learning_rate": 3.4285123111938756e-05, + "loss": 5.7874, + "step": 1657 + }, + { + "epoch": 0.10292383139859705, + "grad_norm": 0.7039990608428521, + "learning_rate": 3.430581419408235e-05, + "loss": 5.6134, + "step": 1658 + }, + { + "epoch": 0.10298590849835496, + "grad_norm": 1.0459116588508406, + "learning_rate": 3.432650527622595e-05, + "loss": 5.8218, + "step": 1659 + }, + { + "epoch": 0.10304798559811286, + "grad_norm": 1.1755918432627035, + "learning_rate": 3.434719635836954e-05, + "loss": 5.7036, + "step": 1660 + }, + { + "epoch": 0.10311006269787075, + "grad_norm": 0.5654490307808302, + "learning_rate": 3.436788744051314e-05, + "loss": 5.7938, + "step": 1661 + }, + { + "epoch": 0.10317213979762865, + "grad_norm": 0.9715719958872278, + "learning_rate": 3.438857852265674e-05, + "loss": 5.749, + "step": 1662 + }, + { + "epoch": 0.10323421689738656, + "grad_norm": 0.5750787743221556, + "learning_rate": 3.440926960480033e-05, + "loss": 5.7226, + "step": 1663 + }, + { + "epoch": 0.10329629399714445, + "grad_norm": 0.6356086414899049, + "learning_rate": 3.442996068694393e-05, + "loss": 5.8212, + "step": 1664 + }, + { + "epoch": 0.10335837109690235, + "grad_norm": 0.8252604983614521, + "learning_rate": 3.445065176908753e-05, + "loss": 5.8074, + "step": 1665 + }, + { + "epoch": 0.10342044819666026, + "grad_norm": 1.36039257330052, + "learning_rate": 3.447134285123112e-05, + "loss": 5.8297, + "step": 1666 + }, + { + "epoch": 0.10348252529641815, + "grad_norm": 0.7410040806749342, + "learning_rate": 3.449203393337472e-05, + "loss": 5.7524, + "step": 1667 + }, + { + "epoch": 0.10354460239617605, + "grad_norm": 0.8294484341846173, + "learning_rate": 3.4512725015518315e-05, + "loss": 5.7645, + "step": 1668 + }, + { + "epoch": 0.10360667949593395, + "grad_norm": 0.6374068023773507, + "learning_rate": 3.453341609766191e-05, + "loss": 5.7244, + "step": 1669 + }, + { + "epoch": 0.10366875659569184, + "grad_norm": 0.5789362396420895, + "learning_rate": 3.4554107179805505e-05, + "loss": 5.7162, + "step": 1670 + }, + { + "epoch": 0.10373083369544975, + "grad_norm": 0.8943981754651167, + "learning_rate": 3.457479826194911e-05, + "loss": 5.8227, + "step": 1671 + }, + { + "epoch": 0.10379291079520765, + "grad_norm": 0.507408920721802, + "learning_rate": 3.4595489344092696e-05, + "loss": 5.8456, + "step": 1672 + }, + { + "epoch": 0.10385498789496554, + "grad_norm": 0.7225446047758266, + "learning_rate": 3.46161804262363e-05, + "loss": 5.7292, + "step": 1673 + }, + { + "epoch": 0.10391706499472345, + "grad_norm": 0.7132189686921164, + "learning_rate": 3.463687150837989e-05, + "loss": 5.7269, + "step": 1674 + }, + { + "epoch": 0.10397914209448135, + "grad_norm": 0.4813642298143144, + "learning_rate": 3.465756259052349e-05, + "loss": 5.6972, + "step": 1675 + }, + { + "epoch": 0.10404121919423924, + "grad_norm": 1.0189418916240855, + "learning_rate": 3.467825367266708e-05, + "loss": 5.6956, + "step": 1676 + }, + { + "epoch": 0.10410329629399714, + "grad_norm": 0.5089934189432832, + "learning_rate": 3.469894475481068e-05, + "loss": 5.7411, + "step": 1677 + }, + { + "epoch": 0.10416537339375505, + "grad_norm": 0.6553813544152125, + "learning_rate": 3.471963583695427e-05, + "loss": 5.6725, + "step": 1678 + }, + { + "epoch": 0.10422745049351294, + "grad_norm": 0.520268598199342, + "learning_rate": 3.474032691909787e-05, + "loss": 5.7551, + "step": 1679 + }, + { + "epoch": 0.10428952759327084, + "grad_norm": 0.48333593407566994, + "learning_rate": 3.476101800124147e-05, + "loss": 5.7126, + "step": 1680 + }, + { + "epoch": 0.10435160469302875, + "grad_norm": 0.5277526965232784, + "learning_rate": 3.478170908338506e-05, + "loss": 5.7593, + "step": 1681 + }, + { + "epoch": 0.10441368179278664, + "grad_norm": 0.6141340558630384, + "learning_rate": 3.480240016552866e-05, + "loss": 5.6629, + "step": 1682 + }, + { + "epoch": 0.10447575889254454, + "grad_norm": 0.5423851444467616, + "learning_rate": 3.4823091247672255e-05, + "loss": 5.7485, + "step": 1683 + }, + { + "epoch": 0.10453783599230244, + "grad_norm": 0.5321488909959908, + "learning_rate": 3.484378232981585e-05, + "loss": 5.7088, + "step": 1684 + }, + { + "epoch": 0.10459991309206033, + "grad_norm": 0.7960395921848012, + "learning_rate": 3.4864473411959445e-05, + "loss": 5.6875, + "step": 1685 + }, + { + "epoch": 0.10466199019181824, + "grad_norm": 0.49875360348960485, + "learning_rate": 3.488516449410305e-05, + "loss": 5.6283, + "step": 1686 + }, + { + "epoch": 0.10472406729157614, + "grad_norm": 0.6067134982895379, + "learning_rate": 3.4905855576246635e-05, + "loss": 5.7086, + "step": 1687 + }, + { + "epoch": 0.10478614439133403, + "grad_norm": 0.5962671330227646, + "learning_rate": 3.492654665839024e-05, + "loss": 5.7018, + "step": 1688 + }, + { + "epoch": 0.10484822149109194, + "grad_norm": 0.47647578591809775, + "learning_rate": 3.494723774053383e-05, + "loss": 5.7546, + "step": 1689 + }, + { + "epoch": 0.10491029859084984, + "grad_norm": 0.536196706608992, + "learning_rate": 3.496792882267743e-05, + "loss": 5.8102, + "step": 1690 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 0.49252894834459016, + "learning_rate": 3.498861990482102e-05, + "loss": 5.7973, + "step": 1691 + }, + { + "epoch": 0.10503445279036563, + "grad_norm": 0.5117215454957038, + "learning_rate": 3.5009310986964624e-05, + "loss": 5.663, + "step": 1692 + }, + { + "epoch": 0.10509652989012354, + "grad_norm": 0.4317763102344739, + "learning_rate": 3.503000206910821e-05, + "loss": 5.7251, + "step": 1693 + }, + { + "epoch": 0.10515860698988143, + "grad_norm": 0.5564211496373755, + "learning_rate": 3.5050693151251814e-05, + "loss": 5.5785, + "step": 1694 + }, + { + "epoch": 0.10522068408963933, + "grad_norm": 0.48911938260326004, + "learning_rate": 3.507138423339541e-05, + "loss": 5.7884, + "step": 1695 + }, + { + "epoch": 0.10528276118939724, + "grad_norm": 0.5139912239913722, + "learning_rate": 3.5092075315539005e-05, + "loss": 5.7569, + "step": 1696 + }, + { + "epoch": 0.10534483828915513, + "grad_norm": 0.4769510811176918, + "learning_rate": 3.51127663976826e-05, + "loss": 5.6936, + "step": 1697 + }, + { + "epoch": 0.10540691538891303, + "grad_norm": 0.46755310476057743, + "learning_rate": 3.51334574798262e-05, + "loss": 5.7006, + "step": 1698 + }, + { + "epoch": 0.10546899248867093, + "grad_norm": 0.5197552363052562, + "learning_rate": 3.515414856196979e-05, + "loss": 5.7501, + "step": 1699 + }, + { + "epoch": 0.10553106958842882, + "grad_norm": 0.5490070743626126, + "learning_rate": 3.517483964411339e-05, + "loss": 5.5948, + "step": 1700 + }, + { + "epoch": 0.10559314668818673, + "grad_norm": 0.4975344246786454, + "learning_rate": 3.519553072625699e-05, + "loss": 5.6471, + "step": 1701 + }, + { + "epoch": 0.10565522378794463, + "grad_norm": 0.48126601155385274, + "learning_rate": 3.521622180840058e-05, + "loss": 5.6249, + "step": 1702 + }, + { + "epoch": 0.10571730088770252, + "grad_norm": 0.5457724219350037, + "learning_rate": 3.523691289054418e-05, + "loss": 5.6281, + "step": 1703 + }, + { + "epoch": 0.10577937798746043, + "grad_norm": 0.529283474122619, + "learning_rate": 3.525760397268777e-05, + "loss": 5.6656, + "step": 1704 + }, + { + "epoch": 0.10584145508721833, + "grad_norm": 0.5730870831509118, + "learning_rate": 3.527829505483137e-05, + "loss": 5.7927, + "step": 1705 + }, + { + "epoch": 0.10590353218697622, + "grad_norm": 0.7258662918249031, + "learning_rate": 3.529898613697496e-05, + "loss": 5.7122, + "step": 1706 + }, + { + "epoch": 0.10596560928673412, + "grad_norm": 0.6361034439236564, + "learning_rate": 3.5319677219118564e-05, + "loss": 5.7292, + "step": 1707 + }, + { + "epoch": 0.10602768638649203, + "grad_norm": 0.5879559147974968, + "learning_rate": 3.534036830126215e-05, + "loss": 5.7438, + "step": 1708 + }, + { + "epoch": 0.10608976348624992, + "grad_norm": 0.534946539783634, + "learning_rate": 3.5361059383405754e-05, + "loss": 5.7821, + "step": 1709 + }, + { + "epoch": 0.10615184058600782, + "grad_norm": 0.6603150261443851, + "learning_rate": 3.538175046554935e-05, + "loss": 5.7061, + "step": 1710 + }, + { + "epoch": 0.10621391768576573, + "grad_norm": 0.5229359971484536, + "learning_rate": 3.5402441547692944e-05, + "loss": 5.6463, + "step": 1711 + }, + { + "epoch": 0.10627599478552362, + "grad_norm": 0.6958883451275453, + "learning_rate": 3.542313262983654e-05, + "loss": 5.6434, + "step": 1712 + }, + { + "epoch": 0.10633807188528152, + "grad_norm": 0.48565151839347354, + "learning_rate": 3.544382371198014e-05, + "loss": 5.767, + "step": 1713 + }, + { + "epoch": 0.10640014898503942, + "grad_norm": 0.5321481349451095, + "learning_rate": 3.546451479412373e-05, + "loss": 5.7531, + "step": 1714 + }, + { + "epoch": 0.10646222608479732, + "grad_norm": 0.6432391537857538, + "learning_rate": 3.548520587626733e-05, + "loss": 5.7016, + "step": 1715 + }, + { + "epoch": 0.10652430318455522, + "grad_norm": 0.6865488972833451, + "learning_rate": 3.5505896958410926e-05, + "loss": 5.7218, + "step": 1716 + }, + { + "epoch": 0.10658638028431312, + "grad_norm": 0.5932489087640923, + "learning_rate": 3.552658804055452e-05, + "loss": 5.7226, + "step": 1717 + }, + { + "epoch": 0.10664845738407101, + "grad_norm": 0.6793721443053082, + "learning_rate": 3.5547279122698117e-05, + "loss": 5.772, + "step": 1718 + }, + { + "epoch": 0.10671053448382892, + "grad_norm": 0.7363303510799201, + "learning_rate": 3.556797020484172e-05, + "loss": 5.7767, + "step": 1719 + }, + { + "epoch": 0.10677261158358682, + "grad_norm": 0.5526625515807995, + "learning_rate": 3.558866128698531e-05, + "loss": 5.7325, + "step": 1720 + }, + { + "epoch": 0.10683468868334471, + "grad_norm": 0.6454006458146843, + "learning_rate": 3.560935236912891e-05, + "loss": 5.7455, + "step": 1721 + }, + { + "epoch": 0.10689676578310262, + "grad_norm": 0.4989143894438381, + "learning_rate": 3.5630043451272504e-05, + "loss": 5.7156, + "step": 1722 + }, + { + "epoch": 0.10695884288286052, + "grad_norm": 0.5806040579198382, + "learning_rate": 3.56507345334161e-05, + "loss": 5.6402, + "step": 1723 + }, + { + "epoch": 0.10702091998261841, + "grad_norm": 0.5272881372585614, + "learning_rate": 3.5671425615559694e-05, + "loss": 5.6892, + "step": 1724 + }, + { + "epoch": 0.10708299708237631, + "grad_norm": 0.5648230426205839, + "learning_rate": 3.5692116697703296e-05, + "loss": 5.5932, + "step": 1725 + }, + { + "epoch": 0.10714507418213422, + "grad_norm": 0.5804994639181443, + "learning_rate": 3.5712807779846884e-05, + "loss": 5.6707, + "step": 1726 + }, + { + "epoch": 0.10720715128189211, + "grad_norm": 0.5398520340677225, + "learning_rate": 3.5733498861990486e-05, + "loss": 5.7864, + "step": 1727 + }, + { + "epoch": 0.10726922838165001, + "grad_norm": 0.6775055204849271, + "learning_rate": 3.575418994413408e-05, + "loss": 5.7102, + "step": 1728 + }, + { + "epoch": 0.10733130548140792, + "grad_norm": 0.5685071196030598, + "learning_rate": 3.5774881026277676e-05, + "loss": 5.766, + "step": 1729 + }, + { + "epoch": 0.1073933825811658, + "grad_norm": 0.8233736234207194, + "learning_rate": 3.579557210842127e-05, + "loss": 5.6886, + "step": 1730 + }, + { + "epoch": 0.10745545968092371, + "grad_norm": 0.5700214636258363, + "learning_rate": 3.5816263190564866e-05, + "loss": 5.6553, + "step": 1731 + }, + { + "epoch": 0.10751753678068161, + "grad_norm": 1.0861141143544837, + "learning_rate": 3.583695427270846e-05, + "loss": 5.7684, + "step": 1732 + }, + { + "epoch": 0.1075796138804395, + "grad_norm": 0.9305120706282642, + "learning_rate": 3.585764535485206e-05, + "loss": 5.7161, + "step": 1733 + }, + { + "epoch": 0.10764169098019741, + "grad_norm": 0.6199211952287472, + "learning_rate": 3.587833643699566e-05, + "loss": 5.6313, + "step": 1734 + }, + { + "epoch": 0.1077037680799553, + "grad_norm": 0.7570890532511779, + "learning_rate": 3.589902751913925e-05, + "loss": 5.6724, + "step": 1735 + }, + { + "epoch": 0.1077658451797132, + "grad_norm": 0.7266063061705906, + "learning_rate": 3.591971860128285e-05, + "loss": 5.6699, + "step": 1736 + }, + { + "epoch": 0.1078279222794711, + "grad_norm": 0.8646708907561211, + "learning_rate": 3.594040968342644e-05, + "loss": 5.6667, + "step": 1737 + }, + { + "epoch": 0.107889999379229, + "grad_norm": 0.535139710108084, + "learning_rate": 3.596110076557004e-05, + "loss": 5.7942, + "step": 1738 + }, + { + "epoch": 0.1079520764789869, + "grad_norm": 0.7192893699634815, + "learning_rate": 3.5981791847713633e-05, + "loss": 5.7226, + "step": 1739 + }, + { + "epoch": 0.1080141535787448, + "grad_norm": 0.547609299457952, + "learning_rate": 3.6002482929857235e-05, + "loss": 5.6711, + "step": 1740 + }, + { + "epoch": 0.1080762306785027, + "grad_norm": 0.6849709289535616, + "learning_rate": 3.602317401200083e-05, + "loss": 5.6998, + "step": 1741 + }, + { + "epoch": 0.1081383077782606, + "grad_norm": 0.6175529026042039, + "learning_rate": 3.6043865094144425e-05, + "loss": 5.6582, + "step": 1742 + }, + { + "epoch": 0.1082003848780185, + "grad_norm": 0.5945780150459249, + "learning_rate": 3.606455617628802e-05, + "loss": 5.6347, + "step": 1743 + }, + { + "epoch": 0.10826246197777639, + "grad_norm": 0.549038639624874, + "learning_rate": 3.608524725843162e-05, + "loss": 5.6099, + "step": 1744 + }, + { + "epoch": 0.1083245390775343, + "grad_norm": 0.5425424342984854, + "learning_rate": 3.610593834057521e-05, + "loss": 5.5899, + "step": 1745 + }, + { + "epoch": 0.1083866161772922, + "grad_norm": 0.653628768292856, + "learning_rate": 3.612662942271881e-05, + "loss": 5.7246, + "step": 1746 + }, + { + "epoch": 0.10844869327705009, + "grad_norm": 0.5919973455624095, + "learning_rate": 3.614732050486241e-05, + "loss": 5.6448, + "step": 1747 + }, + { + "epoch": 0.108510770376808, + "grad_norm": 0.6045238874809876, + "learning_rate": 3.6168011587006e-05, + "loss": 5.7453, + "step": 1748 + }, + { + "epoch": 0.1085728474765659, + "grad_norm": 0.5581229833756567, + "learning_rate": 3.61887026691496e-05, + "loss": 5.6292, + "step": 1749 + }, + { + "epoch": 0.10863492457632379, + "grad_norm": 0.6346345736863637, + "learning_rate": 3.62093937512932e-05, + "loss": 5.5735, + "step": 1750 + }, + { + "epoch": 0.10869700167608169, + "grad_norm": 0.49306014070781734, + "learning_rate": 3.623008483343679e-05, + "loss": 5.5861, + "step": 1751 + }, + { + "epoch": 0.1087590787758396, + "grad_norm": 0.5361468203856511, + "learning_rate": 3.625077591558039e-05, + "loss": 5.6839, + "step": 1752 + }, + { + "epoch": 0.10882115587559749, + "grad_norm": 0.5007594718303923, + "learning_rate": 3.6271466997723985e-05, + "loss": 5.6244, + "step": 1753 + }, + { + "epoch": 0.10888323297535539, + "grad_norm": 0.49080487981078746, + "learning_rate": 3.629215807986758e-05, + "loss": 5.7012, + "step": 1754 + }, + { + "epoch": 0.1089453100751133, + "grad_norm": 0.45898844940325056, + "learning_rate": 3.6312849162011175e-05, + "loss": 5.6599, + "step": 1755 + }, + { + "epoch": 0.10900738717487118, + "grad_norm": 0.49543413479806103, + "learning_rate": 3.633354024415477e-05, + "loss": 5.5917, + "step": 1756 + }, + { + "epoch": 0.10906946427462909, + "grad_norm": 0.5209565366620296, + "learning_rate": 3.6354231326298365e-05, + "loss": 5.7915, + "step": 1757 + }, + { + "epoch": 0.10913154137438699, + "grad_norm": 0.5719932715263638, + "learning_rate": 3.637492240844197e-05, + "loss": 5.7271, + "step": 1758 + }, + { + "epoch": 0.10919361847414488, + "grad_norm": 0.41256251439425506, + "learning_rate": 3.639561349058556e-05, + "loss": 5.7646, + "step": 1759 + }, + { + "epoch": 0.10925569557390279, + "grad_norm": 0.46350978534516984, + "learning_rate": 3.641630457272916e-05, + "loss": 5.4763, + "step": 1760 + }, + { + "epoch": 0.10931777267366069, + "grad_norm": 0.43515943392508233, + "learning_rate": 3.643699565487275e-05, + "loss": 5.6145, + "step": 1761 + }, + { + "epoch": 0.10937984977341858, + "grad_norm": 0.4575190589926775, + "learning_rate": 3.645768673701635e-05, + "loss": 5.6388, + "step": 1762 + }, + { + "epoch": 0.10944192687317648, + "grad_norm": 0.45169911809357594, + "learning_rate": 3.647837781915994e-05, + "loss": 5.6591, + "step": 1763 + }, + { + "epoch": 0.10950400397293439, + "grad_norm": 0.4738106777064035, + "learning_rate": 3.649906890130354e-05, + "loss": 5.7216, + "step": 1764 + }, + { + "epoch": 0.10956608107269228, + "grad_norm": 0.45223003005541695, + "learning_rate": 3.651975998344714e-05, + "loss": 5.6448, + "step": 1765 + }, + { + "epoch": 0.10962815817245018, + "grad_norm": 0.5302788047448648, + "learning_rate": 3.654045106559073e-05, + "loss": 5.7266, + "step": 1766 + }, + { + "epoch": 0.10969023527220809, + "grad_norm": 0.46345709089874976, + "learning_rate": 3.656114214773433e-05, + "loss": 5.6276, + "step": 1767 + }, + { + "epoch": 0.10975231237196598, + "grad_norm": 0.404778253843866, + "learning_rate": 3.6581833229877925e-05, + "loss": 5.5631, + "step": 1768 + }, + { + "epoch": 0.10981438947172388, + "grad_norm": 0.4523700311171263, + "learning_rate": 3.660252431202152e-05, + "loss": 5.7338, + "step": 1769 + }, + { + "epoch": 0.10987646657148178, + "grad_norm": 0.47623551204000847, + "learning_rate": 3.6623215394165115e-05, + "loss": 5.6041, + "step": 1770 + }, + { + "epoch": 0.10993854367123967, + "grad_norm": 0.5550916300989076, + "learning_rate": 3.6643906476308717e-05, + "loss": 5.6767, + "step": 1771 + }, + { + "epoch": 0.11000062077099758, + "grad_norm": 0.46505895809982617, + "learning_rate": 3.6664597558452305e-05, + "loss": 5.6962, + "step": 1772 + }, + { + "epoch": 0.11006269787075548, + "grad_norm": 0.6202171905862395, + "learning_rate": 3.668528864059591e-05, + "loss": 5.6655, + "step": 1773 + }, + { + "epoch": 0.11012477497051337, + "grad_norm": 0.49534598637452004, + "learning_rate": 3.67059797227395e-05, + "loss": 5.6016, + "step": 1774 + }, + { + "epoch": 0.11018685207027128, + "grad_norm": 0.8343849738735754, + "learning_rate": 3.67266708048831e-05, + "loss": 5.599, + "step": 1775 + }, + { + "epoch": 0.11024892917002918, + "grad_norm": 0.841462920962587, + "learning_rate": 3.674736188702669e-05, + "loss": 5.7512, + "step": 1776 + }, + { + "epoch": 0.11031100626978707, + "grad_norm": 0.8240111754039212, + "learning_rate": 3.6768052969170294e-05, + "loss": 5.7282, + "step": 1777 + }, + { + "epoch": 0.11037308336954497, + "grad_norm": 1.1201711972559392, + "learning_rate": 3.678874405131388e-05, + "loss": 5.6786, + "step": 1778 + }, + { + "epoch": 0.11043516046930288, + "grad_norm": 0.6791007699377588, + "learning_rate": 3.6809435133457484e-05, + "loss": 5.5574, + "step": 1779 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 0.7315839933931292, + "learning_rate": 3.683012621560108e-05, + "loss": 5.5327, + "step": 1780 + }, + { + "epoch": 0.11055931466881867, + "grad_norm": 1.0604509341855484, + "learning_rate": 3.6850817297744674e-05, + "loss": 5.6667, + "step": 1781 + }, + { + "epoch": 0.11062139176857658, + "grad_norm": 0.8632090266469022, + "learning_rate": 3.687150837988827e-05, + "loss": 5.6794, + "step": 1782 + }, + { + "epoch": 0.11068346886833447, + "grad_norm": 0.7435448531915075, + "learning_rate": 3.689219946203187e-05, + "loss": 5.5895, + "step": 1783 + }, + { + "epoch": 0.11074554596809237, + "grad_norm": 0.8794468891118924, + "learning_rate": 3.691289054417546e-05, + "loss": 5.6223, + "step": 1784 + }, + { + "epoch": 0.11080762306785027, + "grad_norm": 0.6400140780152248, + "learning_rate": 3.693358162631906e-05, + "loss": 5.6695, + "step": 1785 + }, + { + "epoch": 0.11086970016760817, + "grad_norm": 0.6121112572714683, + "learning_rate": 3.6954272708462656e-05, + "loss": 5.6597, + "step": 1786 + }, + { + "epoch": 0.11093177726736607, + "grad_norm": 0.6252270762875398, + "learning_rate": 3.697496379060625e-05, + "loss": 5.6973, + "step": 1787 + }, + { + "epoch": 0.11099385436712397, + "grad_norm": 0.5132457866059101, + "learning_rate": 3.6995654872749846e-05, + "loss": 5.6581, + "step": 1788 + }, + { + "epoch": 0.11105593146688186, + "grad_norm": 0.5979978046679947, + "learning_rate": 3.701634595489344e-05, + "loss": 5.6672, + "step": 1789 + }, + { + "epoch": 0.11111800856663977, + "grad_norm": 0.5058862980928155, + "learning_rate": 3.7037037037037037e-05, + "loss": 5.6752, + "step": 1790 + }, + { + "epoch": 0.11118008566639767, + "grad_norm": 0.7428760346365818, + "learning_rate": 3.705772811918063e-05, + "loss": 5.6289, + "step": 1791 + }, + { + "epoch": 0.11124216276615556, + "grad_norm": 0.5978241478941309, + "learning_rate": 3.7078419201324233e-05, + "loss": 5.6359, + "step": 1792 + }, + { + "epoch": 0.11130423986591347, + "grad_norm": 0.5458586376333163, + "learning_rate": 3.709911028346782e-05, + "loss": 5.6333, + "step": 1793 + }, + { + "epoch": 0.11136631696567137, + "grad_norm": 0.6736034516317456, + "learning_rate": 3.7119801365611424e-05, + "loss": 5.6084, + "step": 1794 + }, + { + "epoch": 0.11142839406542926, + "grad_norm": 0.5548536293656354, + "learning_rate": 3.714049244775502e-05, + "loss": 5.5964, + "step": 1795 + }, + { + "epoch": 0.11149047116518716, + "grad_norm": 0.6156219714714728, + "learning_rate": 3.7161183529898614e-05, + "loss": 5.5757, + "step": 1796 + }, + { + "epoch": 0.11155254826494507, + "grad_norm": 0.6207523007175489, + "learning_rate": 3.718187461204221e-05, + "loss": 5.6398, + "step": 1797 + }, + { + "epoch": 0.11161462536470296, + "grad_norm": 0.4761718336923658, + "learning_rate": 3.720256569418581e-05, + "loss": 5.6362, + "step": 1798 + }, + { + "epoch": 0.11167670246446086, + "grad_norm": 0.5542517280069892, + "learning_rate": 3.72232567763294e-05, + "loss": 5.6735, + "step": 1799 + }, + { + "epoch": 0.11173877956421877, + "grad_norm": 0.5393730872110374, + "learning_rate": 3.7243947858473e-05, + "loss": 5.6357, + "step": 1800 + }, + { + "epoch": 0.11180085666397666, + "grad_norm": 0.6087972641050668, + "learning_rate": 3.7264638940616596e-05, + "loss": 5.4878, + "step": 1801 + }, + { + "epoch": 0.11186293376373456, + "grad_norm": 0.4755264965721515, + "learning_rate": 3.728533002276019e-05, + "loss": 5.6883, + "step": 1802 + }, + { + "epoch": 0.11192501086349246, + "grad_norm": 0.48941226432470564, + "learning_rate": 3.7306021104903786e-05, + "loss": 5.5865, + "step": 1803 + }, + { + "epoch": 0.11198708796325035, + "grad_norm": 0.4870045259234068, + "learning_rate": 3.732671218704739e-05, + "loss": 5.6727, + "step": 1804 + }, + { + "epoch": 0.11204916506300826, + "grad_norm": 0.558137934450509, + "learning_rate": 3.7347403269190976e-05, + "loss": 5.7147, + "step": 1805 + }, + { + "epoch": 0.11211124216276616, + "grad_norm": 0.6157644985178701, + "learning_rate": 3.736809435133458e-05, + "loss": 5.5952, + "step": 1806 + }, + { + "epoch": 0.11217331926252405, + "grad_norm": 0.45284339770410714, + "learning_rate": 3.738878543347817e-05, + "loss": 5.6292, + "step": 1807 + }, + { + "epoch": 0.11223539636228196, + "grad_norm": 0.6595551050458953, + "learning_rate": 3.740947651562177e-05, + "loss": 5.5385, + "step": 1808 + }, + { + "epoch": 0.11229747346203986, + "grad_norm": 0.5561317125199566, + "learning_rate": 3.743016759776536e-05, + "loss": 5.6835, + "step": 1809 + }, + { + "epoch": 0.11235955056179775, + "grad_norm": 0.6687422172592639, + "learning_rate": 3.7450858679908965e-05, + "loss": 5.7061, + "step": 1810 + }, + { + "epoch": 0.11242162766155565, + "grad_norm": 0.8168373977190558, + "learning_rate": 3.7471549762052553e-05, + "loss": 5.6121, + "step": 1811 + }, + { + "epoch": 0.11248370476131356, + "grad_norm": 0.7451974168133768, + "learning_rate": 3.7492240844196155e-05, + "loss": 5.6955, + "step": 1812 + }, + { + "epoch": 0.11254578186107145, + "grad_norm": 0.5417257943037839, + "learning_rate": 3.751293192633975e-05, + "loss": 5.5934, + "step": 1813 + }, + { + "epoch": 0.11260785896082935, + "grad_norm": 0.552546142230721, + "learning_rate": 3.7533623008483345e-05, + "loss": 5.5236, + "step": 1814 + }, + { + "epoch": 0.11266993606058726, + "grad_norm": 0.5932138431524783, + "learning_rate": 3.755431409062694e-05, + "loss": 5.6045, + "step": 1815 + }, + { + "epoch": 0.11273201316034515, + "grad_norm": 0.5500289432539466, + "learning_rate": 3.7575005172770536e-05, + "loss": 5.4718, + "step": 1816 + }, + { + "epoch": 0.11279409026010305, + "grad_norm": 0.5399566366745637, + "learning_rate": 3.759569625491413e-05, + "loss": 5.6081, + "step": 1817 + }, + { + "epoch": 0.11285616735986095, + "grad_norm": 0.6464109119729423, + "learning_rate": 3.7616387337057726e-05, + "loss": 5.6356, + "step": 1818 + }, + { + "epoch": 0.11291824445961884, + "grad_norm": 0.5022723893282532, + "learning_rate": 3.763707841920133e-05, + "loss": 5.5652, + "step": 1819 + }, + { + "epoch": 0.11298032155937675, + "grad_norm": 0.7317948569339909, + "learning_rate": 3.765776950134492e-05, + "loss": 5.5168, + "step": 1820 + }, + { + "epoch": 0.11304239865913465, + "grad_norm": 0.6281378417339262, + "learning_rate": 3.767846058348852e-05, + "loss": 5.6267, + "step": 1821 + }, + { + "epoch": 0.11310447575889254, + "grad_norm": 0.560810731467338, + "learning_rate": 3.769915166563211e-05, + "loss": 5.6035, + "step": 1822 + }, + { + "epoch": 0.11316655285865045, + "grad_norm": 0.6658695541797911, + "learning_rate": 3.7719842747775715e-05, + "loss": 5.5952, + "step": 1823 + }, + { + "epoch": 0.11322862995840835, + "grad_norm": 0.5839118019352495, + "learning_rate": 3.77405338299193e-05, + "loss": 5.5587, + "step": 1824 + }, + { + "epoch": 0.11329070705816624, + "grad_norm": 0.7274842926505797, + "learning_rate": 3.7761224912062905e-05, + "loss": 5.634, + "step": 1825 + }, + { + "epoch": 0.11335278415792414, + "grad_norm": 0.8161611599792563, + "learning_rate": 3.77819159942065e-05, + "loss": 5.5583, + "step": 1826 + }, + { + "epoch": 0.11341486125768203, + "grad_norm": 0.5092850681790759, + "learning_rate": 3.7802607076350095e-05, + "loss": 5.6618, + "step": 1827 + }, + { + "epoch": 0.11347693835743994, + "grad_norm": 1.0119355900927682, + "learning_rate": 3.782329815849369e-05, + "loss": 5.5918, + "step": 1828 + }, + { + "epoch": 0.11353901545719784, + "grad_norm": 0.6397490444124039, + "learning_rate": 3.784398924063729e-05, + "loss": 5.6801, + "step": 1829 + }, + { + "epoch": 0.11360109255695573, + "grad_norm": 1.098050626994657, + "learning_rate": 3.786468032278088e-05, + "loss": 5.6103, + "step": 1830 + }, + { + "epoch": 0.11366316965671364, + "grad_norm": 1.0443566050070905, + "learning_rate": 3.788537140492448e-05, + "loss": 5.5884, + "step": 1831 + }, + { + "epoch": 0.11372524675647154, + "grad_norm": 0.8362845654036836, + "learning_rate": 3.790606248706808e-05, + "loss": 5.5344, + "step": 1832 + }, + { + "epoch": 0.11378732385622943, + "grad_norm": 1.1691171875097888, + "learning_rate": 3.792675356921167e-05, + "loss": 5.6812, + "step": 1833 + }, + { + "epoch": 0.11384940095598733, + "grad_norm": 0.8512034227845563, + "learning_rate": 3.794744465135527e-05, + "loss": 5.6489, + "step": 1834 + }, + { + "epoch": 0.11391147805574524, + "grad_norm": 1.2789153250789997, + "learning_rate": 3.796813573349887e-05, + "loss": 5.6024, + "step": 1835 + }, + { + "epoch": 0.11397355515550313, + "grad_norm": 0.6597053297010569, + "learning_rate": 3.798882681564246e-05, + "loss": 5.6005, + "step": 1836 + }, + { + "epoch": 0.11403563225526103, + "grad_norm": 0.6148954154074613, + "learning_rate": 3.800951789778606e-05, + "loss": 5.573, + "step": 1837 + }, + { + "epoch": 0.11409770935501894, + "grad_norm": 1.2855630248268664, + "learning_rate": 3.8030208979929654e-05, + "loss": 5.6526, + "step": 1838 + }, + { + "epoch": 0.11415978645477683, + "grad_norm": 0.6901957804970663, + "learning_rate": 3.805090006207325e-05, + "loss": 5.5889, + "step": 1839 + }, + { + "epoch": 0.11422186355453473, + "grad_norm": 0.6777227659083174, + "learning_rate": 3.8071591144216845e-05, + "loss": 5.6386, + "step": 1840 + }, + { + "epoch": 0.11428394065429263, + "grad_norm": 1.5059623633566683, + "learning_rate": 3.809228222636044e-05, + "loss": 5.5844, + "step": 1841 + }, + { + "epoch": 0.11434601775405052, + "grad_norm": 0.8802889311116557, + "learning_rate": 3.8112973308504035e-05, + "loss": 5.722, + "step": 1842 + }, + { + "epoch": 0.11440809485380843, + "grad_norm": 0.8462385284457699, + "learning_rate": 3.813366439064763e-05, + "loss": 5.6706, + "step": 1843 + }, + { + "epoch": 0.11447017195356633, + "grad_norm": 0.6870661146675605, + "learning_rate": 3.815435547279123e-05, + "loss": 5.6337, + "step": 1844 + }, + { + "epoch": 0.11453224905332422, + "grad_norm": 0.6642699422516772, + "learning_rate": 3.817504655493483e-05, + "loss": 5.5378, + "step": 1845 + }, + { + "epoch": 0.11459432615308213, + "grad_norm": 0.7401615801525278, + "learning_rate": 3.819573763707842e-05, + "loss": 5.5973, + "step": 1846 + }, + { + "epoch": 0.11465640325284003, + "grad_norm": 0.8401209883750895, + "learning_rate": 3.821642871922202e-05, + "loss": 5.5674, + "step": 1847 + }, + { + "epoch": 0.11471848035259792, + "grad_norm": 0.6776554600396806, + "learning_rate": 3.823711980136561e-05, + "loss": 5.5703, + "step": 1848 + }, + { + "epoch": 0.11478055745235582, + "grad_norm": 0.5534138900313044, + "learning_rate": 3.825781088350921e-05, + "loss": 5.5056, + "step": 1849 + }, + { + "epoch": 0.11484263455211373, + "grad_norm": 0.8447021128032278, + "learning_rate": 3.827850196565281e-05, + "loss": 5.5269, + "step": 1850 + }, + { + "epoch": 0.11490471165187162, + "grad_norm": 0.9610361498306297, + "learning_rate": 3.82991930477964e-05, + "loss": 5.5463, + "step": 1851 + }, + { + "epoch": 0.11496678875162952, + "grad_norm": 0.6472967293447921, + "learning_rate": 3.831988412994e-05, + "loss": 5.5839, + "step": 1852 + }, + { + "epoch": 0.11502886585138743, + "grad_norm": 0.6556680460454605, + "learning_rate": 3.8340575212083594e-05, + "loss": 5.6664, + "step": 1853 + }, + { + "epoch": 0.11509094295114532, + "grad_norm": 0.618556496196255, + "learning_rate": 3.836126629422719e-05, + "loss": 5.5111, + "step": 1854 + }, + { + "epoch": 0.11515302005090322, + "grad_norm": 0.6282567526579276, + "learning_rate": 3.8381957376370784e-05, + "loss": 5.5195, + "step": 1855 + }, + { + "epoch": 0.11521509715066112, + "grad_norm": 0.6051873075955113, + "learning_rate": 3.8402648458514386e-05, + "loss": 5.4194, + "step": 1856 + }, + { + "epoch": 0.11527717425041902, + "grad_norm": 0.6461292916685079, + "learning_rate": 3.8423339540657974e-05, + "loss": 5.5234, + "step": 1857 + }, + { + "epoch": 0.11533925135017692, + "grad_norm": 0.7443441678315106, + "learning_rate": 3.8444030622801576e-05, + "loss": 5.5318, + "step": 1858 + }, + { + "epoch": 0.11540132844993482, + "grad_norm": 0.6512971808975944, + "learning_rate": 3.846472170494517e-05, + "loss": 5.5891, + "step": 1859 + }, + { + "epoch": 0.11546340554969271, + "grad_norm": 0.6886212785280621, + "learning_rate": 3.8485412787088766e-05, + "loss": 5.63, + "step": 1860 + }, + { + "epoch": 0.11552548264945062, + "grad_norm": 0.5728140016526738, + "learning_rate": 3.850610386923236e-05, + "loss": 5.6823, + "step": 1861 + }, + { + "epoch": 0.11558755974920852, + "grad_norm": 0.6329617500906872, + "learning_rate": 3.852679495137596e-05, + "loss": 5.6081, + "step": 1862 + }, + { + "epoch": 0.11564963684896641, + "grad_norm": 0.4855734396654855, + "learning_rate": 3.854748603351955e-05, + "loss": 5.6327, + "step": 1863 + }, + { + "epoch": 0.11571171394872432, + "grad_norm": 0.5032635515779001, + "learning_rate": 3.8568177115663154e-05, + "loss": 5.6401, + "step": 1864 + }, + { + "epoch": 0.11577379104848222, + "grad_norm": 0.47031575008549364, + "learning_rate": 3.858886819780675e-05, + "loss": 5.6275, + "step": 1865 + }, + { + "epoch": 0.11583586814824011, + "grad_norm": 0.48472702401389994, + "learning_rate": 3.8609559279950344e-05, + "loss": 5.594, + "step": 1866 + }, + { + "epoch": 0.11589794524799801, + "grad_norm": 0.3919083494691831, + "learning_rate": 3.863025036209394e-05, + "loss": 5.5664, + "step": 1867 + }, + { + "epoch": 0.11596002234775592, + "grad_norm": 0.5151736277876453, + "learning_rate": 3.865094144423754e-05, + "loss": 5.6641, + "step": 1868 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 0.5109127773638161, + "learning_rate": 3.867163252638113e-05, + "loss": 5.607, + "step": 1869 + }, + { + "epoch": 0.11608417654727171, + "grad_norm": 0.5118465991868725, + "learning_rate": 3.869232360852473e-05, + "loss": 5.6176, + "step": 1870 + }, + { + "epoch": 0.11614625364702962, + "grad_norm": 0.5516812971956371, + "learning_rate": 3.8713014690668326e-05, + "loss": 5.561, + "step": 1871 + }, + { + "epoch": 0.1162083307467875, + "grad_norm": 0.4816391498577116, + "learning_rate": 3.873370577281192e-05, + "loss": 5.5158, + "step": 1872 + }, + { + "epoch": 0.11627040784654541, + "grad_norm": 0.4614094887738312, + "learning_rate": 3.8754396854955516e-05, + "loss": 5.5953, + "step": 1873 + }, + { + "epoch": 0.11633248494630331, + "grad_norm": 0.4275225807383374, + "learning_rate": 3.877508793709911e-05, + "loss": 5.512, + "step": 1874 + }, + { + "epoch": 0.1163945620460612, + "grad_norm": 0.476358714073304, + "learning_rate": 3.8795779019242706e-05, + "loss": 5.517, + "step": 1875 + }, + { + "epoch": 0.11645663914581911, + "grad_norm": 0.524391244857193, + "learning_rate": 3.88164701013863e-05, + "loss": 5.5975, + "step": 1876 + }, + { + "epoch": 0.11651871624557701, + "grad_norm": 0.5186519208362991, + "learning_rate": 3.88371611835299e-05, + "loss": 5.4946, + "step": 1877 + }, + { + "epoch": 0.1165807933453349, + "grad_norm": 0.4254511507270019, + "learning_rate": 3.885785226567349e-05, + "loss": 5.5756, + "step": 1878 + }, + { + "epoch": 0.1166428704450928, + "grad_norm": 0.649434965557329, + "learning_rate": 3.887854334781709e-05, + "loss": 5.5176, + "step": 1879 + }, + { + "epoch": 0.11670494754485071, + "grad_norm": 0.5072813337188861, + "learning_rate": 3.889923442996069e-05, + "loss": 5.6089, + "step": 1880 + }, + { + "epoch": 0.1167670246446086, + "grad_norm": 0.5003918066316064, + "learning_rate": 3.891992551210428e-05, + "loss": 5.547, + "step": 1881 + }, + { + "epoch": 0.1168291017443665, + "grad_norm": 0.5014830291630148, + "learning_rate": 3.894061659424788e-05, + "loss": 5.461, + "step": 1882 + }, + { + "epoch": 0.11689117884412441, + "grad_norm": 0.6996803251974535, + "learning_rate": 3.896130767639148e-05, + "loss": 5.6175, + "step": 1883 + }, + { + "epoch": 0.1169532559438823, + "grad_norm": 0.5669713588440747, + "learning_rate": 3.898199875853507e-05, + "loss": 5.5288, + "step": 1884 + }, + { + "epoch": 0.1170153330436402, + "grad_norm": 0.47271668229086194, + "learning_rate": 3.900268984067867e-05, + "loss": 5.5598, + "step": 1885 + }, + { + "epoch": 0.1170774101433981, + "grad_norm": 0.5415420450000518, + "learning_rate": 3.9023380922822266e-05, + "loss": 5.4556, + "step": 1886 + }, + { + "epoch": 0.117139487243156, + "grad_norm": 0.4299361983084213, + "learning_rate": 3.904407200496586e-05, + "loss": 5.5224, + "step": 1887 + }, + { + "epoch": 0.1172015643429139, + "grad_norm": 0.4572157528140045, + "learning_rate": 3.9064763087109456e-05, + "loss": 5.5096, + "step": 1888 + }, + { + "epoch": 0.1172636414426718, + "grad_norm": 0.40415583559972296, + "learning_rate": 3.908545416925306e-05, + "loss": 5.5024, + "step": 1889 + }, + { + "epoch": 0.1173257185424297, + "grad_norm": 0.49791814599023576, + "learning_rate": 3.9106145251396646e-05, + "loss": 5.5122, + "step": 1890 + }, + { + "epoch": 0.1173877956421876, + "grad_norm": 0.4907444431915341, + "learning_rate": 3.912683633354025e-05, + "loss": 5.5683, + "step": 1891 + }, + { + "epoch": 0.1174498727419455, + "grad_norm": 0.4616535573204414, + "learning_rate": 3.914752741568384e-05, + "loss": 5.445, + "step": 1892 + }, + { + "epoch": 0.11751194984170339, + "grad_norm": 0.4328359390963024, + "learning_rate": 3.916821849782744e-05, + "loss": 5.4922, + "step": 1893 + }, + { + "epoch": 0.1175740269414613, + "grad_norm": 0.41540252926772303, + "learning_rate": 3.918890957997103e-05, + "loss": 5.5457, + "step": 1894 + }, + { + "epoch": 0.1176361040412192, + "grad_norm": 0.5241522108620702, + "learning_rate": 3.9209600662114635e-05, + "loss": 5.6414, + "step": 1895 + }, + { + "epoch": 0.11769818114097709, + "grad_norm": 0.45861212061620665, + "learning_rate": 3.923029174425822e-05, + "loss": 5.5484, + "step": 1896 + }, + { + "epoch": 0.117760258240735, + "grad_norm": 0.47716412325234625, + "learning_rate": 3.9250982826401825e-05, + "loss": 5.5185, + "step": 1897 + }, + { + "epoch": 0.1178223353404929, + "grad_norm": 0.5826714342409282, + "learning_rate": 3.927167390854542e-05, + "loss": 5.623, + "step": 1898 + }, + { + "epoch": 0.11788441244025079, + "grad_norm": 0.8157617680843264, + "learning_rate": 3.9292364990689015e-05, + "loss": 5.4213, + "step": 1899 + }, + { + "epoch": 0.11794648954000869, + "grad_norm": 0.591746715270621, + "learning_rate": 3.931305607283261e-05, + "loss": 5.593, + "step": 1900 + }, + { + "epoch": 0.1180085666397666, + "grad_norm": 0.7446494778152241, + "learning_rate": 3.9333747154976205e-05, + "loss": 5.5134, + "step": 1901 + }, + { + "epoch": 0.11807064373952449, + "grad_norm": 0.5449267424772807, + "learning_rate": 3.93544382371198e-05, + "loss": 5.5856, + "step": 1902 + }, + { + "epoch": 0.11813272083928239, + "grad_norm": 0.6786639739330073, + "learning_rate": 3.9375129319263395e-05, + "loss": 5.5379, + "step": 1903 + }, + { + "epoch": 0.1181947979390403, + "grad_norm": 0.5170230696288388, + "learning_rate": 3.9395820401407e-05, + "loss": 5.5332, + "step": 1904 + }, + { + "epoch": 0.11825687503879818, + "grad_norm": 0.665294110969925, + "learning_rate": 3.9416511483550586e-05, + "loss": 5.4754, + "step": 1905 + }, + { + "epoch": 0.11831895213855609, + "grad_norm": 0.5968378235189011, + "learning_rate": 3.943720256569419e-05, + "loss": 5.5714, + "step": 1906 + }, + { + "epoch": 0.11838102923831399, + "grad_norm": 0.5206979830030831, + "learning_rate": 3.945789364783778e-05, + "loss": 5.506, + "step": 1907 + }, + { + "epoch": 0.11844310633807188, + "grad_norm": 0.5431665293593974, + "learning_rate": 3.9478584729981384e-05, + "loss": 5.5565, + "step": 1908 + }, + { + "epoch": 0.11850518343782979, + "grad_norm": 0.6359319681962692, + "learning_rate": 3.949927581212497e-05, + "loss": 5.6894, + "step": 1909 + }, + { + "epoch": 0.11856726053758769, + "grad_norm": 0.6246693690284622, + "learning_rate": 3.9519966894268574e-05, + "loss": 5.5629, + "step": 1910 + }, + { + "epoch": 0.11862933763734558, + "grad_norm": 0.7615560504576051, + "learning_rate": 3.954065797641217e-05, + "loss": 5.5384, + "step": 1911 + }, + { + "epoch": 0.11869141473710348, + "grad_norm": 0.5602360039494347, + "learning_rate": 3.9561349058555765e-05, + "loss": 5.5346, + "step": 1912 + }, + { + "epoch": 0.11875349183686139, + "grad_norm": 0.5184598501951171, + "learning_rate": 3.958204014069936e-05, + "loss": 5.5618, + "step": 1913 + }, + { + "epoch": 0.11881556893661928, + "grad_norm": 0.6186569924243878, + "learning_rate": 3.960273122284296e-05, + "loss": 5.5021, + "step": 1914 + }, + { + "epoch": 0.11887764603637718, + "grad_norm": 0.5742067608262893, + "learning_rate": 3.962342230498655e-05, + "loss": 5.5638, + "step": 1915 + }, + { + "epoch": 0.11893972313613509, + "grad_norm": 0.9497117069394059, + "learning_rate": 3.964411338713015e-05, + "loss": 5.5643, + "step": 1916 + }, + { + "epoch": 0.11900180023589298, + "grad_norm": 0.7761920851450623, + "learning_rate": 3.966480446927375e-05, + "loss": 5.5275, + "step": 1917 + }, + { + "epoch": 0.11906387733565088, + "grad_norm": 1.3210364619759494, + "learning_rate": 3.968549555141734e-05, + "loss": 5.5891, + "step": 1918 + }, + { + "epoch": 0.11912595443540878, + "grad_norm": 0.9097650366617858, + "learning_rate": 3.970618663356094e-05, + "loss": 5.5364, + "step": 1919 + }, + { + "epoch": 0.11918803153516667, + "grad_norm": 1.4273560906071443, + "learning_rate": 3.972687771570454e-05, + "loss": 5.5625, + "step": 1920 + }, + { + "epoch": 0.11925010863492458, + "grad_norm": 1.1459767974282182, + "learning_rate": 3.974756879784813e-05, + "loss": 5.5319, + "step": 1921 + }, + { + "epoch": 0.11931218573468247, + "grad_norm": 1.1159333345265496, + "learning_rate": 3.976825987999173e-05, + "loss": 5.484, + "step": 1922 + }, + { + "epoch": 0.11937426283444037, + "grad_norm": 1.0644673477762536, + "learning_rate": 3.9788950962135324e-05, + "loss": 5.5661, + "step": 1923 + }, + { + "epoch": 0.11943633993419828, + "grad_norm": 1.4892024195640923, + "learning_rate": 3.980964204427892e-05, + "loss": 5.5815, + "step": 1924 + }, + { + "epoch": 0.11949841703395617, + "grad_norm": 0.9869891076438663, + "learning_rate": 3.9830333126422514e-05, + "loss": 5.524, + "step": 1925 + }, + { + "epoch": 0.11956049413371407, + "grad_norm": 0.839740125783756, + "learning_rate": 3.985102420856611e-05, + "loss": 5.4848, + "step": 1926 + }, + { + "epoch": 0.11962257123347197, + "grad_norm": 1.7774735288084107, + "learning_rate": 3.9871715290709704e-05, + "loss": 5.5848, + "step": 1927 + }, + { + "epoch": 0.11968464833322986, + "grad_norm": 1.1713959286873505, + "learning_rate": 3.98924063728533e-05, + "loss": 5.617, + "step": 1928 + }, + { + "epoch": 0.11974672543298777, + "grad_norm": 1.2853964723127007, + "learning_rate": 3.99130974549969e-05, + "loss": 5.6175, + "step": 1929 + }, + { + "epoch": 0.11980880253274567, + "grad_norm": 1.1923211758965762, + "learning_rate": 3.9933788537140496e-05, + "loss": 5.6197, + "step": 1930 + }, + { + "epoch": 0.11987087963250356, + "grad_norm": 0.8484717458765025, + "learning_rate": 3.995447961928409e-05, + "loss": 5.5353, + "step": 1931 + }, + { + "epoch": 0.11993295673226147, + "grad_norm": 0.8095715641246733, + "learning_rate": 3.9975170701427686e-05, + "loss": 5.5312, + "step": 1932 + }, + { + "epoch": 0.11999503383201937, + "grad_norm": 1.009373778823728, + "learning_rate": 3.999586178357128e-05, + "loss": 5.6096, + "step": 1933 + }, + { + "epoch": 0.12005711093177726, + "grad_norm": 1.0296320505219207, + "learning_rate": 4.001655286571488e-05, + "loss": 5.5504, + "step": 1934 + }, + { + "epoch": 0.12011918803153517, + "grad_norm": 0.8158753051196171, + "learning_rate": 4.003724394785848e-05, + "loss": 5.4606, + "step": 1935 + }, + { + "epoch": 0.12018126513129307, + "grad_norm": 1.0771087611509265, + "learning_rate": 4.005793503000207e-05, + "loss": 5.5904, + "step": 1936 + }, + { + "epoch": 0.12024334223105096, + "grad_norm": 0.827746250702879, + "learning_rate": 4.007862611214567e-05, + "loss": 5.5239, + "step": 1937 + }, + { + "epoch": 0.12030541933080886, + "grad_norm": 1.0908693799463867, + "learning_rate": 4.0099317194289264e-05, + "loss": 5.5937, + "step": 1938 + }, + { + "epoch": 0.12036749643056677, + "grad_norm": 0.8179827713516086, + "learning_rate": 4.012000827643286e-05, + "loss": 5.5207, + "step": 1939 + }, + { + "epoch": 0.12042957353032466, + "grad_norm": 0.6079376091943758, + "learning_rate": 4.0140699358576454e-05, + "loss": 5.4577, + "step": 1940 + }, + { + "epoch": 0.12049165063008256, + "grad_norm": 0.5009399460169424, + "learning_rate": 4.0161390440720056e-05, + "loss": 5.5098, + "step": 1941 + }, + { + "epoch": 0.12055372772984047, + "grad_norm": 0.648697720444286, + "learning_rate": 4.0182081522863644e-05, + "loss": 5.4827, + "step": 1942 + }, + { + "epoch": 0.12061580482959836, + "grad_norm": 0.7338179646847282, + "learning_rate": 4.0202772605007246e-05, + "loss": 5.3991, + "step": 1943 + }, + { + "epoch": 0.12067788192935626, + "grad_norm": 0.5613977706273072, + "learning_rate": 4.022346368715084e-05, + "loss": 5.5269, + "step": 1944 + }, + { + "epoch": 0.12073995902911416, + "grad_norm": 0.6240135992047711, + "learning_rate": 4.0244154769294436e-05, + "loss": 5.562, + "step": 1945 + }, + { + "epoch": 0.12080203612887205, + "grad_norm": 0.5374656466661688, + "learning_rate": 4.026484585143803e-05, + "loss": 5.4626, + "step": 1946 + }, + { + "epoch": 0.12086411322862996, + "grad_norm": 0.5729942500440088, + "learning_rate": 4.028553693358163e-05, + "loss": 5.5578, + "step": 1947 + }, + { + "epoch": 0.12092619032838786, + "grad_norm": 0.5952481142404318, + "learning_rate": 4.030622801572522e-05, + "loss": 5.6356, + "step": 1948 + }, + { + "epoch": 0.12098826742814575, + "grad_norm": 0.4942463396097961, + "learning_rate": 4.032691909786882e-05, + "loss": 5.4892, + "step": 1949 + }, + { + "epoch": 0.12105034452790366, + "grad_norm": 0.5311131020477164, + "learning_rate": 4.034761018001242e-05, + "loss": 5.4937, + "step": 1950 + }, + { + "epoch": 0.12111242162766156, + "grad_norm": 0.6436149325847524, + "learning_rate": 4.036830126215601e-05, + "loss": 5.5226, + "step": 1951 + }, + { + "epoch": 0.12117449872741945, + "grad_norm": 0.5614580995444788, + "learning_rate": 4.038899234429961e-05, + "loss": 5.5126, + "step": 1952 + }, + { + "epoch": 0.12123657582717735, + "grad_norm": 0.5589769289599091, + "learning_rate": 4.0409683426443203e-05, + "loss": 5.5388, + "step": 1953 + }, + { + "epoch": 0.12129865292693526, + "grad_norm": 0.5731348720353427, + "learning_rate": 4.04303745085868e-05, + "loss": 5.5028, + "step": 1954 + }, + { + "epoch": 0.12136073002669315, + "grad_norm": 0.5575327366681395, + "learning_rate": 4.04510655907304e-05, + "loss": 5.5224, + "step": 1955 + }, + { + "epoch": 0.12142280712645105, + "grad_norm": 0.4764475762048488, + "learning_rate": 4.0471756672873995e-05, + "loss": 5.4712, + "step": 1956 + }, + { + "epoch": 0.12148488422620896, + "grad_norm": 0.5129958951927134, + "learning_rate": 4.049244775501759e-05, + "loss": 5.494, + "step": 1957 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 0.6355410959535532, + "learning_rate": 4.0513138837161186e-05, + "loss": 5.5236, + "step": 1958 + }, + { + "epoch": 0.12160903842572475, + "grad_norm": 0.4932504559373174, + "learning_rate": 4.053382991930478e-05, + "loss": 5.5178, + "step": 1959 + }, + { + "epoch": 0.12167111552548265, + "grad_norm": 0.7702288104358438, + "learning_rate": 4.0554521001448376e-05, + "loss": 5.4364, + "step": 1960 + }, + { + "epoch": 0.12173319262524054, + "grad_norm": 0.6541345651205346, + "learning_rate": 4.057521208359197e-05, + "loss": 5.4031, + "step": 1961 + }, + { + "epoch": 0.12179526972499845, + "grad_norm": 0.5324304312720264, + "learning_rate": 4.059590316573557e-05, + "loss": 5.4825, + "step": 1962 + }, + { + "epoch": 0.12185734682475635, + "grad_norm": 0.565585442735208, + "learning_rate": 4.061659424787916e-05, + "loss": 5.6146, + "step": 1963 + }, + { + "epoch": 0.12191942392451424, + "grad_norm": 0.6214214156393395, + "learning_rate": 4.063728533002276e-05, + "loss": 5.5051, + "step": 1964 + }, + { + "epoch": 0.12198150102427215, + "grad_norm": 0.45777102858671853, + "learning_rate": 4.065797641216636e-05, + "loss": 5.5691, + "step": 1965 + }, + { + "epoch": 0.12204357812403005, + "grad_norm": 0.6002391279062939, + "learning_rate": 4.067866749430995e-05, + "loss": 5.5542, + "step": 1966 + }, + { + "epoch": 0.12210565522378794, + "grad_norm": 0.6157860356502809, + "learning_rate": 4.069935857645355e-05, + "loss": 5.4119, + "step": 1967 + }, + { + "epoch": 0.12216773232354584, + "grad_norm": 0.47664738508831933, + "learning_rate": 4.072004965859715e-05, + "loss": 5.428, + "step": 1968 + }, + { + "epoch": 0.12222980942330375, + "grad_norm": 0.684737028753256, + "learning_rate": 4.074074074074074e-05, + "loss": 5.5455, + "step": 1969 + }, + { + "epoch": 0.12229188652306164, + "grad_norm": 0.4885383081896854, + "learning_rate": 4.076143182288434e-05, + "loss": 5.4772, + "step": 1970 + }, + { + "epoch": 0.12235396362281954, + "grad_norm": 0.6526960232675048, + "learning_rate": 4.0782122905027935e-05, + "loss": 5.4765, + "step": 1971 + }, + { + "epoch": 0.12241604072257745, + "grad_norm": 0.5728339540133953, + "learning_rate": 4.080281398717153e-05, + "loss": 5.4532, + "step": 1972 + }, + { + "epoch": 0.12247811782233534, + "grad_norm": 0.6160091305464765, + "learning_rate": 4.0823505069315125e-05, + "loss": 5.5206, + "step": 1973 + }, + { + "epoch": 0.12254019492209324, + "grad_norm": 0.6228435985956787, + "learning_rate": 4.084419615145873e-05, + "loss": 5.3867, + "step": 1974 + }, + { + "epoch": 0.12260227202185114, + "grad_norm": 0.6392020262080939, + "learning_rate": 4.0864887233602315e-05, + "loss": 5.4806, + "step": 1975 + }, + { + "epoch": 0.12266434912160903, + "grad_norm": 0.581830239628209, + "learning_rate": 4.088557831574592e-05, + "loss": 5.4443, + "step": 1976 + }, + { + "epoch": 0.12272642622136694, + "grad_norm": 0.4902542535771436, + "learning_rate": 4.090626939788951e-05, + "loss": 5.5314, + "step": 1977 + }, + { + "epoch": 0.12278850332112484, + "grad_norm": 0.4646685775105708, + "learning_rate": 4.092696048003311e-05, + "loss": 5.5468, + "step": 1978 + }, + { + "epoch": 0.12285058042088273, + "grad_norm": 0.535103398518614, + "learning_rate": 4.09476515621767e-05, + "loss": 5.5001, + "step": 1979 + }, + { + "epoch": 0.12291265752064064, + "grad_norm": 0.4646461611832689, + "learning_rate": 4.0968342644320304e-05, + "loss": 5.4812, + "step": 1980 + }, + { + "epoch": 0.12297473462039854, + "grad_norm": 0.5450872081575516, + "learning_rate": 4.098903372646389e-05, + "loss": 5.5813, + "step": 1981 + }, + { + "epoch": 0.12303681172015643, + "grad_norm": 0.567616058573205, + "learning_rate": 4.1009724808607494e-05, + "loss": 5.4944, + "step": 1982 + }, + { + "epoch": 0.12309888881991433, + "grad_norm": 0.5179384524775212, + "learning_rate": 4.103041589075109e-05, + "loss": 5.3885, + "step": 1983 + }, + { + "epoch": 0.12316096591967224, + "grad_norm": 0.4903411748900055, + "learning_rate": 4.1051106972894685e-05, + "loss": 5.4618, + "step": 1984 + }, + { + "epoch": 0.12322304301943013, + "grad_norm": 0.5552844475490878, + "learning_rate": 4.107179805503828e-05, + "loss": 5.5142, + "step": 1985 + }, + { + "epoch": 0.12328512011918803, + "grad_norm": 0.4955280562954232, + "learning_rate": 4.1092489137181875e-05, + "loss": 5.5436, + "step": 1986 + }, + { + "epoch": 0.12334719721894594, + "grad_norm": 0.5320132046634694, + "learning_rate": 4.111318021932547e-05, + "loss": 5.4799, + "step": 1987 + }, + { + "epoch": 0.12340927431870383, + "grad_norm": 0.4195734219515331, + "learning_rate": 4.1133871301469065e-05, + "loss": 5.4712, + "step": 1988 + }, + { + "epoch": 0.12347135141846173, + "grad_norm": 0.44002502527379367, + "learning_rate": 4.115456238361267e-05, + "loss": 5.4382, + "step": 1989 + }, + { + "epoch": 0.12353342851821963, + "grad_norm": 0.40424178849292314, + "learning_rate": 4.1175253465756255e-05, + "loss": 5.4417, + "step": 1990 + }, + { + "epoch": 0.12359550561797752, + "grad_norm": 0.5357762664180454, + "learning_rate": 4.119594454789986e-05, + "loss": 5.4169, + "step": 1991 + }, + { + "epoch": 0.12365758271773543, + "grad_norm": 0.37718393246686815, + "learning_rate": 4.121663563004345e-05, + "loss": 5.5309, + "step": 1992 + }, + { + "epoch": 0.12371965981749333, + "grad_norm": 0.447994571017468, + "learning_rate": 4.1237326712187054e-05, + "loss": 5.438, + "step": 1993 + }, + { + "epoch": 0.12378173691725122, + "grad_norm": 0.4975680574035597, + "learning_rate": 4.125801779433064e-05, + "loss": 5.5464, + "step": 1994 + }, + { + "epoch": 0.12384381401700913, + "grad_norm": 0.45255673060791585, + "learning_rate": 4.1278708876474244e-05, + "loss": 5.5521, + "step": 1995 + }, + { + "epoch": 0.12390589111676703, + "grad_norm": 0.8135629816393083, + "learning_rate": 4.129939995861784e-05, + "loss": 5.4598, + "step": 1996 + }, + { + "epoch": 0.12396796821652492, + "grad_norm": 0.4992425963626947, + "learning_rate": 4.1320091040761434e-05, + "loss": 5.4805, + "step": 1997 + }, + { + "epoch": 0.12403004531628282, + "grad_norm": 0.8619511226152617, + "learning_rate": 4.134078212290503e-05, + "loss": 5.4412, + "step": 1998 + }, + { + "epoch": 0.12409212241604073, + "grad_norm": 0.6957552496752821, + "learning_rate": 4.136147320504863e-05, + "loss": 5.4984, + "step": 1999 + }, + { + "epoch": 0.12415419951579862, + "grad_norm": 0.7598413660104996, + "learning_rate": 4.138216428719222e-05, + "loss": 5.5057, + "step": 2000 + }, + { + "epoch": 0.12421627661555652, + "grad_norm": 0.5298431224138688, + "learning_rate": 4.140285536933582e-05, + "loss": 5.4339, + "step": 2001 + }, + { + "epoch": 0.12427835371531443, + "grad_norm": 1.324818613124557, + "learning_rate": 4.1423546451479416e-05, + "loss": 5.4778, + "step": 2002 + }, + { + "epoch": 0.12434043081507232, + "grad_norm": 0.9688785529211164, + "learning_rate": 4.144423753362301e-05, + "loss": 5.4347, + "step": 2003 + }, + { + "epoch": 0.12440250791483022, + "grad_norm": 0.8784147795809603, + "learning_rate": 4.1464928615766607e-05, + "loss": 5.5356, + "step": 2004 + }, + { + "epoch": 0.12446458501458812, + "grad_norm": 0.8880796366414168, + "learning_rate": 4.148561969791021e-05, + "loss": 5.5033, + "step": 2005 + }, + { + "epoch": 0.12452666211434602, + "grad_norm": 0.6354022780076479, + "learning_rate": 4.15063107800538e-05, + "loss": 5.4599, + "step": 2006 + }, + { + "epoch": 0.12458873921410392, + "grad_norm": 0.6312440579200794, + "learning_rate": 4.15270018621974e-05, + "loss": 5.4201, + "step": 2007 + }, + { + "epoch": 0.12465081631386182, + "grad_norm": 0.5940380150787392, + "learning_rate": 4.1547692944340994e-05, + "loss": 5.548, + "step": 2008 + }, + { + "epoch": 0.12471289341361971, + "grad_norm": 0.6287693087834234, + "learning_rate": 4.156838402648459e-05, + "loss": 5.4235, + "step": 2009 + }, + { + "epoch": 0.12477497051337762, + "grad_norm": 0.973720412680153, + "learning_rate": 4.1589075108628184e-05, + "loss": 5.455, + "step": 2010 + }, + { + "epoch": 0.12483704761313552, + "grad_norm": 0.4585702366950305, + "learning_rate": 4.160976619077178e-05, + "loss": 5.4085, + "step": 2011 + }, + { + "epoch": 0.12489912471289341, + "grad_norm": 0.6352103685359144, + "learning_rate": 4.1630457272915374e-05, + "loss": 5.5159, + "step": 2012 + }, + { + "epoch": 0.12496120181265132, + "grad_norm": 0.5647290093471241, + "learning_rate": 4.165114835505897e-05, + "loss": 5.4123, + "step": 2013 + }, + { + "epoch": 0.12502327891240922, + "grad_norm": 0.5877005944818097, + "learning_rate": 4.167183943720257e-05, + "loss": 5.5054, + "step": 2014 + }, + { + "epoch": 0.1250853560121671, + "grad_norm": 0.4544824786884855, + "learning_rate": 4.169253051934616e-05, + "loss": 5.5414, + "step": 2015 + }, + { + "epoch": 0.125147433111925, + "grad_norm": 0.5846584346689032, + "learning_rate": 4.171322160148976e-05, + "loss": 5.5204, + "step": 2016 + }, + { + "epoch": 0.12520951021168292, + "grad_norm": 0.6707896626120656, + "learning_rate": 4.1733912683633356e-05, + "loss": 5.5129, + "step": 2017 + }, + { + "epoch": 0.1252715873114408, + "grad_norm": 0.5748251742497967, + "learning_rate": 4.175460376577695e-05, + "loss": 5.3633, + "step": 2018 + }, + { + "epoch": 0.1253336644111987, + "grad_norm": 0.6472787730251957, + "learning_rate": 4.1775294847920546e-05, + "loss": 5.4088, + "step": 2019 + }, + { + "epoch": 0.12539574151095662, + "grad_norm": 0.5403079420684022, + "learning_rate": 4.179598593006415e-05, + "loss": 5.3283, + "step": 2020 + }, + { + "epoch": 0.1254578186107145, + "grad_norm": 0.6204625887385963, + "learning_rate": 4.1816677012207736e-05, + "loss": 5.4243, + "step": 2021 + }, + { + "epoch": 0.1255198957104724, + "grad_norm": 0.4727471949828481, + "learning_rate": 4.183736809435134e-05, + "loss": 5.323, + "step": 2022 + }, + { + "epoch": 0.1255819728102303, + "grad_norm": 0.5152777745360277, + "learning_rate": 4.185805917649493e-05, + "loss": 5.3845, + "step": 2023 + }, + { + "epoch": 0.1256440499099882, + "grad_norm": 0.40442644337417005, + "learning_rate": 4.187875025863853e-05, + "loss": 5.5379, + "step": 2024 + }, + { + "epoch": 0.1257061270097461, + "grad_norm": 0.4242271267957832, + "learning_rate": 4.1899441340782123e-05, + "loss": 5.4513, + "step": 2025 + }, + { + "epoch": 0.125768204109504, + "grad_norm": 0.5108222096410792, + "learning_rate": 4.1920132422925725e-05, + "loss": 5.4413, + "step": 2026 + }, + { + "epoch": 0.1258302812092619, + "grad_norm": 0.5602347538417077, + "learning_rate": 4.1940823505069314e-05, + "loss": 5.4512, + "step": 2027 + }, + { + "epoch": 0.1258923583090198, + "grad_norm": 0.4739099380042775, + "learning_rate": 4.1961514587212915e-05, + "loss": 5.4301, + "step": 2028 + }, + { + "epoch": 0.1259544354087777, + "grad_norm": 0.5026512782758417, + "learning_rate": 4.198220566935651e-05, + "loss": 5.4436, + "step": 2029 + }, + { + "epoch": 0.1260165125085356, + "grad_norm": 0.5320994934141274, + "learning_rate": 4.2002896751500106e-05, + "loss": 5.4167, + "step": 2030 + }, + { + "epoch": 0.1260785896082935, + "grad_norm": 0.47965913496119744, + "learning_rate": 4.20235878336437e-05, + "loss": 5.3887, + "step": 2031 + }, + { + "epoch": 0.1261406667080514, + "grad_norm": 0.5639619108842164, + "learning_rate": 4.20442789157873e-05, + "loss": 5.4281, + "step": 2032 + }, + { + "epoch": 0.1262027438078093, + "grad_norm": 0.38216674601738015, + "learning_rate": 4.206496999793089e-05, + "loss": 5.4879, + "step": 2033 + }, + { + "epoch": 0.1262648209075672, + "grad_norm": 0.4662630708053784, + "learning_rate": 4.208566108007449e-05, + "loss": 5.3408, + "step": 2034 + }, + { + "epoch": 0.1263268980073251, + "grad_norm": 0.49622077517129537, + "learning_rate": 4.210635216221809e-05, + "loss": 5.4001, + "step": 2035 + }, + { + "epoch": 0.126388975107083, + "grad_norm": 0.4219133218384917, + "learning_rate": 4.212704324436168e-05, + "loss": 5.3476, + "step": 2036 + }, + { + "epoch": 0.12645105220684089, + "grad_norm": 0.42197830207012305, + "learning_rate": 4.214773432650528e-05, + "loss": 5.4863, + "step": 2037 + }, + { + "epoch": 0.1265131293065988, + "grad_norm": 0.4804674629256214, + "learning_rate": 4.216842540864887e-05, + "loss": 5.578, + "step": 2038 + }, + { + "epoch": 0.1265752064063567, + "grad_norm": 0.45346756843968694, + "learning_rate": 4.218911649079247e-05, + "loss": 5.4026, + "step": 2039 + }, + { + "epoch": 0.12663728350611458, + "grad_norm": 0.47921942510268123, + "learning_rate": 4.220980757293606e-05, + "loss": 5.3642, + "step": 2040 + }, + { + "epoch": 0.1266993606058725, + "grad_norm": 0.556459765582977, + "learning_rate": 4.2230498655079665e-05, + "loss": 5.4294, + "step": 2041 + }, + { + "epoch": 0.1267614377056304, + "grad_norm": 0.4803038058117653, + "learning_rate": 4.225118973722326e-05, + "loss": 5.3537, + "step": 2042 + }, + { + "epoch": 0.12682351480538828, + "grad_norm": 0.5005916432901264, + "learning_rate": 4.2271880819366855e-05, + "loss": 5.2435, + "step": 2043 + }, + { + "epoch": 0.1268855919051462, + "grad_norm": 0.5527232138649254, + "learning_rate": 4.229257190151045e-05, + "loss": 5.3902, + "step": 2044 + }, + { + "epoch": 0.1269476690049041, + "grad_norm": 0.48764145706165907, + "learning_rate": 4.2313262983654045e-05, + "loss": 5.3088, + "step": 2045 + }, + { + "epoch": 0.12700974610466198, + "grad_norm": 0.593319636527926, + "learning_rate": 4.233395406579764e-05, + "loss": 5.3803, + "step": 2046 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 0.4356970763269705, + "learning_rate": 4.235464514794124e-05, + "loss": 5.3454, + "step": 2047 + }, + { + "epoch": 0.1271339003041778, + "grad_norm": 0.3854482375780737, + "learning_rate": 4.237533623008483e-05, + "loss": 5.401, + "step": 2048 + }, + { + "epoch": 0.12719597740393568, + "grad_norm": 0.42735552449019115, + "learning_rate": 4.239602731222843e-05, + "loss": 5.4834, + "step": 2049 + }, + { + "epoch": 0.1272580545036936, + "grad_norm": 0.459923129137525, + "learning_rate": 4.241671839437203e-05, + "loss": 5.4068, + "step": 2050 + }, + { + "epoch": 0.1273201316034515, + "grad_norm": 0.48771929638010075, + "learning_rate": 4.243740947651562e-05, + "loss": 5.4779, + "step": 2051 + }, + { + "epoch": 0.12738220870320938, + "grad_norm": 0.546684940554126, + "learning_rate": 4.245810055865922e-05, + "loss": 5.387, + "step": 2052 + }, + { + "epoch": 0.1274442858029673, + "grad_norm": 0.4538383295189525, + "learning_rate": 4.247879164080282e-05, + "loss": 5.3506, + "step": 2053 + }, + { + "epoch": 0.12750636290272518, + "grad_norm": 0.505398485114043, + "learning_rate": 4.249948272294641e-05, + "loss": 5.3607, + "step": 2054 + }, + { + "epoch": 0.12756844000248307, + "grad_norm": 0.42841732250310094, + "learning_rate": 4.252017380509001e-05, + "loss": 5.3595, + "step": 2055 + }, + { + "epoch": 0.127630517102241, + "grad_norm": 0.5210540939408892, + "learning_rate": 4.2540864887233605e-05, + "loss": 5.4312, + "step": 2056 + }, + { + "epoch": 0.12769259420199888, + "grad_norm": 0.6825478536296974, + "learning_rate": 4.25615559693772e-05, + "loss": 5.3978, + "step": 2057 + }, + { + "epoch": 0.12775467130175677, + "grad_norm": 0.5041221038050355, + "learning_rate": 4.2582247051520795e-05, + "loss": 5.3316, + "step": 2058 + }, + { + "epoch": 0.1278167484015147, + "grad_norm": 0.4847598239204231, + "learning_rate": 4.26029381336644e-05, + "loss": 5.4355, + "step": 2059 + }, + { + "epoch": 0.12787882550127258, + "grad_norm": 0.5575368258796816, + "learning_rate": 4.2623629215807985e-05, + "loss": 5.2579, + "step": 2060 + }, + { + "epoch": 0.12794090260103047, + "grad_norm": 0.42767164562919247, + "learning_rate": 4.264432029795159e-05, + "loss": 5.3003, + "step": 2061 + }, + { + "epoch": 0.1280029797007884, + "grad_norm": 0.5832993690503006, + "learning_rate": 4.266501138009518e-05, + "loss": 5.3954, + "step": 2062 + }, + { + "epoch": 0.12806505680054628, + "grad_norm": 0.4471280081980768, + "learning_rate": 4.268570246223878e-05, + "loss": 5.4335, + "step": 2063 + }, + { + "epoch": 0.12812713390030417, + "grad_norm": 0.45283051491146814, + "learning_rate": 4.270639354438237e-05, + "loss": 5.3579, + "step": 2064 + }, + { + "epoch": 0.1281892110000621, + "grad_norm": 0.4886596129271756, + "learning_rate": 4.2727084626525974e-05, + "loss": 5.3097, + "step": 2065 + }, + { + "epoch": 0.12825128809981998, + "grad_norm": 0.43411953295300426, + "learning_rate": 4.274777570866956e-05, + "loss": 5.3537, + "step": 2066 + }, + { + "epoch": 0.12831336519957787, + "grad_norm": 0.5290116671337439, + "learning_rate": 4.2768466790813164e-05, + "loss": 5.4246, + "step": 2067 + }, + { + "epoch": 0.12837544229933578, + "grad_norm": 0.47395156673869054, + "learning_rate": 4.278915787295676e-05, + "loss": 5.3277, + "step": 2068 + }, + { + "epoch": 0.12843751939909367, + "grad_norm": 0.5597433252145334, + "learning_rate": 4.2809848955100354e-05, + "loss": 5.3666, + "step": 2069 + }, + { + "epoch": 0.12849959649885156, + "grad_norm": 0.7512303829964178, + "learning_rate": 4.283054003724395e-05, + "loss": 5.4043, + "step": 2070 + }, + { + "epoch": 0.12856167359860948, + "grad_norm": 0.567897438494398, + "learning_rate": 4.2851231119387544e-05, + "loss": 5.453, + "step": 2071 + }, + { + "epoch": 0.12862375069836737, + "grad_norm": 0.9887623878503535, + "learning_rate": 4.287192220153114e-05, + "loss": 5.4772, + "step": 2072 + }, + { + "epoch": 0.12868582779812526, + "grad_norm": 0.4882260100589078, + "learning_rate": 4.2892613283674735e-05, + "loss": 5.2696, + "step": 2073 + }, + { + "epoch": 0.12874790489788318, + "grad_norm": 0.7020886943586045, + "learning_rate": 4.2913304365818336e-05, + "loss": 5.4606, + "step": 2074 + }, + { + "epoch": 0.12880998199764107, + "grad_norm": 0.5733317316618807, + "learning_rate": 4.2933995447961925e-05, + "loss": 5.3842, + "step": 2075 + }, + { + "epoch": 0.12887205909739896, + "grad_norm": 0.5887852195766691, + "learning_rate": 4.2954686530105527e-05, + "loss": 5.3759, + "step": 2076 + }, + { + "epoch": 0.12893413619715688, + "grad_norm": 0.6284439567640164, + "learning_rate": 4.297537761224912e-05, + "loss": 5.4051, + "step": 2077 + }, + { + "epoch": 0.12899621329691477, + "grad_norm": 0.6719808938611285, + "learning_rate": 4.2996068694392723e-05, + "loss": 5.36, + "step": 2078 + }, + { + "epoch": 0.12905829039667266, + "grad_norm": 0.7431203902015766, + "learning_rate": 4.301675977653631e-05, + "loss": 5.4378, + "step": 2079 + }, + { + "epoch": 0.12912036749643058, + "grad_norm": 0.4158324792174064, + "learning_rate": 4.3037450858679914e-05, + "loss": 5.327, + "step": 2080 + }, + { + "epoch": 0.12918244459618847, + "grad_norm": 0.5631180387907103, + "learning_rate": 4.305814194082351e-05, + "loss": 5.3606, + "step": 2081 + }, + { + "epoch": 0.12924452169594636, + "grad_norm": 0.43751047326222287, + "learning_rate": 4.3078833022967104e-05, + "loss": 5.3246, + "step": 2082 + }, + { + "epoch": 0.12930659879570428, + "grad_norm": 0.5112733139840712, + "learning_rate": 4.30995241051107e-05, + "loss": 5.4266, + "step": 2083 + }, + { + "epoch": 0.12936867589546217, + "grad_norm": 0.5095657307487189, + "learning_rate": 4.31202151872543e-05, + "loss": 5.3203, + "step": 2084 + }, + { + "epoch": 0.12943075299522006, + "grad_norm": 0.457698172397311, + "learning_rate": 4.314090626939789e-05, + "loss": 5.4305, + "step": 2085 + }, + { + "epoch": 0.12949283009497797, + "grad_norm": 0.9867126621713951, + "learning_rate": 4.316159735154149e-05, + "loss": 5.3248, + "step": 2086 + }, + { + "epoch": 0.12955490719473586, + "grad_norm": 0.6951748786774438, + "learning_rate": 4.3182288433685086e-05, + "loss": 5.3552, + "step": 2087 + }, + { + "epoch": 0.12961698429449375, + "grad_norm": 0.5997190841062441, + "learning_rate": 4.320297951582868e-05, + "loss": 5.4248, + "step": 2088 + }, + { + "epoch": 0.12967906139425167, + "grad_norm": 0.7571368007083511, + "learning_rate": 4.3223670597972276e-05, + "loss": 5.3715, + "step": 2089 + }, + { + "epoch": 0.12974113849400956, + "grad_norm": 0.7159200185159197, + "learning_rate": 4.324436168011588e-05, + "loss": 5.4635, + "step": 2090 + }, + { + "epoch": 0.12980321559376745, + "grad_norm": 0.7058032209292602, + "learning_rate": 4.3265052762259466e-05, + "loss": 5.3014, + "step": 2091 + }, + { + "epoch": 0.12986529269352537, + "grad_norm": 0.6067409808192298, + "learning_rate": 4.328574384440307e-05, + "loss": 5.4014, + "step": 2092 + }, + { + "epoch": 0.12992736979328326, + "grad_norm": 0.7562412608248604, + "learning_rate": 4.330643492654666e-05, + "loss": 5.2481, + "step": 2093 + }, + { + "epoch": 0.12998944689304115, + "grad_norm": 0.6239673905731553, + "learning_rate": 4.332712600869026e-05, + "loss": 5.3414, + "step": 2094 + }, + { + "epoch": 0.13005152399279907, + "grad_norm": 0.7473939680725287, + "learning_rate": 4.334781709083385e-05, + "loss": 5.3911, + "step": 2095 + }, + { + "epoch": 0.13011360109255696, + "grad_norm": 0.642499136200558, + "learning_rate": 4.336850817297745e-05, + "loss": 5.3932, + "step": 2096 + }, + { + "epoch": 0.13017567819231485, + "grad_norm": 0.7243353641781145, + "learning_rate": 4.3389199255121043e-05, + "loss": 5.3771, + "step": 2097 + }, + { + "epoch": 0.13023775529207277, + "grad_norm": 0.5407386098130338, + "learning_rate": 4.340989033726464e-05, + "loss": 5.3964, + "step": 2098 + }, + { + "epoch": 0.13029983239183066, + "grad_norm": 0.6237942031303588, + "learning_rate": 4.343058141940824e-05, + "loss": 5.407, + "step": 2099 + }, + { + "epoch": 0.13036190949158855, + "grad_norm": 0.6780019666100284, + "learning_rate": 4.345127250155183e-05, + "loss": 5.3775, + "step": 2100 + }, + { + "epoch": 0.13042398659134646, + "grad_norm": 0.5458968815926368, + "learning_rate": 4.347196358369543e-05, + "loss": 5.379, + "step": 2101 + }, + { + "epoch": 0.13048606369110435, + "grad_norm": 0.6039887582888549, + "learning_rate": 4.3492654665839026e-05, + "loss": 5.3313, + "step": 2102 + }, + { + "epoch": 0.13054814079086224, + "grad_norm": 0.5449518174999788, + "learning_rate": 4.351334574798262e-05, + "loss": 5.4129, + "step": 2103 + }, + { + "epoch": 0.13061021789062016, + "grad_norm": 0.506670621691292, + "learning_rate": 4.3534036830126216e-05, + "loss": 5.3183, + "step": 2104 + }, + { + "epoch": 0.13067229499037805, + "grad_norm": 0.5310971937884386, + "learning_rate": 4.355472791226982e-05, + "loss": 5.3307, + "step": 2105 + }, + { + "epoch": 0.13073437209013594, + "grad_norm": 0.5141434336204311, + "learning_rate": 4.3575418994413406e-05, + "loss": 5.409, + "step": 2106 + }, + { + "epoch": 0.13079644918989386, + "grad_norm": 0.47778669618198083, + "learning_rate": 4.359611007655701e-05, + "loss": 5.3414, + "step": 2107 + }, + { + "epoch": 0.13085852628965175, + "grad_norm": 0.7570789365443062, + "learning_rate": 4.36168011587006e-05, + "loss": 5.3692, + "step": 2108 + }, + { + "epoch": 0.13092060338940964, + "grad_norm": 0.8005513303097965, + "learning_rate": 4.36374922408442e-05, + "loss": 5.454, + "step": 2109 + }, + { + "epoch": 0.13098268048916756, + "grad_norm": 0.5409105384758478, + "learning_rate": 4.365818332298779e-05, + "loss": 5.4094, + "step": 2110 + }, + { + "epoch": 0.13104475758892545, + "grad_norm": 0.7880866560052293, + "learning_rate": 4.3678874405131395e-05, + "loss": 5.4179, + "step": 2111 + }, + { + "epoch": 0.13110683468868334, + "grad_norm": 0.4825636466202529, + "learning_rate": 4.369956548727498e-05, + "loss": 5.1806, + "step": 2112 + }, + { + "epoch": 0.13116891178844126, + "grad_norm": 0.64564944284705, + "learning_rate": 4.3720256569418585e-05, + "loss": 5.4137, + "step": 2113 + }, + { + "epoch": 0.13123098888819915, + "grad_norm": 0.44334641828384064, + "learning_rate": 4.374094765156218e-05, + "loss": 5.3213, + "step": 2114 + }, + { + "epoch": 0.13129306598795704, + "grad_norm": 0.5621668040700172, + "learning_rate": 4.3761638733705775e-05, + "loss": 5.3293, + "step": 2115 + }, + { + "epoch": 0.13135514308771495, + "grad_norm": 0.7010472859067204, + "learning_rate": 4.378232981584937e-05, + "loss": 5.3332, + "step": 2116 + }, + { + "epoch": 0.13141722018747284, + "grad_norm": 0.5125589534533096, + "learning_rate": 4.380302089799297e-05, + "loss": 5.3243, + "step": 2117 + }, + { + "epoch": 0.13147929728723073, + "grad_norm": 0.557111895315728, + "learning_rate": 4.382371198013656e-05, + "loss": 5.4398, + "step": 2118 + }, + { + "epoch": 0.13154137438698865, + "grad_norm": 0.46102466405667325, + "learning_rate": 4.384440306228016e-05, + "loss": 5.3519, + "step": 2119 + }, + { + "epoch": 0.13160345148674654, + "grad_norm": 0.5109687433985276, + "learning_rate": 4.386509414442376e-05, + "loss": 5.3486, + "step": 2120 + }, + { + "epoch": 0.13166552858650443, + "grad_norm": 0.4424339079664581, + "learning_rate": 4.388578522656735e-05, + "loss": 5.1688, + "step": 2121 + }, + { + "epoch": 0.13172760568626235, + "grad_norm": 0.5062816433377301, + "learning_rate": 4.390647630871095e-05, + "loss": 5.3299, + "step": 2122 + }, + { + "epoch": 0.13178968278602024, + "grad_norm": 0.5668249490043615, + "learning_rate": 4.392716739085454e-05, + "loss": 5.3737, + "step": 2123 + }, + { + "epoch": 0.13185175988577813, + "grad_norm": 0.4764077451237695, + "learning_rate": 4.394785847299814e-05, + "loss": 5.3899, + "step": 2124 + }, + { + "epoch": 0.13191383698553605, + "grad_norm": 0.4883354462110272, + "learning_rate": 4.396854955514173e-05, + "loss": 5.3606, + "step": 2125 + }, + { + "epoch": 0.13197591408529394, + "grad_norm": 0.4484244414498368, + "learning_rate": 4.3989240637285335e-05, + "loss": 5.3375, + "step": 2126 + }, + { + "epoch": 0.13203799118505183, + "grad_norm": 0.7432962668447409, + "learning_rate": 4.400993171942892e-05, + "loss": 5.3423, + "step": 2127 + }, + { + "epoch": 0.13210006828480975, + "grad_norm": 0.5757958429731066, + "learning_rate": 4.4030622801572525e-05, + "loss": 5.3589, + "step": 2128 + }, + { + "epoch": 0.13216214538456764, + "grad_norm": 0.4770246359962941, + "learning_rate": 4.405131388371612e-05, + "loss": 5.3313, + "step": 2129 + }, + { + "epoch": 0.13222422248432553, + "grad_norm": 0.5442295667223387, + "learning_rate": 4.4072004965859715e-05, + "loss": 5.3329, + "step": 2130 + }, + { + "epoch": 0.13228629958408344, + "grad_norm": 0.5096331895699341, + "learning_rate": 4.409269604800331e-05, + "loss": 5.4064, + "step": 2131 + }, + { + "epoch": 0.13234837668384133, + "grad_norm": 0.6579343886723572, + "learning_rate": 4.411338713014691e-05, + "loss": 5.3784, + "step": 2132 + }, + { + "epoch": 0.13241045378359922, + "grad_norm": 0.6798849367164976, + "learning_rate": 4.41340782122905e-05, + "loss": 5.219, + "step": 2133 + }, + { + "epoch": 0.13247253088335714, + "grad_norm": 0.6076202068787233, + "learning_rate": 4.41547692944341e-05, + "loss": 5.3651, + "step": 2134 + }, + { + "epoch": 0.13253460798311503, + "grad_norm": 0.8173687750227382, + "learning_rate": 4.41754603765777e-05, + "loss": 5.3567, + "step": 2135 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 0.7006585412414825, + "learning_rate": 4.419615145872129e-05, + "loss": 5.3581, + "step": 2136 + }, + { + "epoch": 0.13265876218263084, + "grad_norm": 0.6465329154885227, + "learning_rate": 4.421684254086489e-05, + "loss": 5.3482, + "step": 2137 + }, + { + "epoch": 0.13272083928238873, + "grad_norm": 1.147929955259602, + "learning_rate": 4.423753362300849e-05, + "loss": 5.3842, + "step": 2138 + }, + { + "epoch": 0.13278291638214662, + "grad_norm": 0.7322562068495118, + "learning_rate": 4.425822470515208e-05, + "loss": 5.4051, + "step": 2139 + }, + { + "epoch": 0.13284499348190454, + "grad_norm": 0.749610680623232, + "learning_rate": 4.427891578729568e-05, + "loss": 5.3378, + "step": 2140 + }, + { + "epoch": 0.13290707058166243, + "grad_norm": 0.6461010629223923, + "learning_rate": 4.4299606869439274e-05, + "loss": 5.3351, + "step": 2141 + }, + { + "epoch": 0.13296914768142032, + "grad_norm": 0.667168165368976, + "learning_rate": 4.432029795158287e-05, + "loss": 5.2824, + "step": 2142 + }, + { + "epoch": 0.13303122478117824, + "grad_norm": 0.5690284539920305, + "learning_rate": 4.4340989033726464e-05, + "loss": 5.3839, + "step": 2143 + }, + { + "epoch": 0.13309330188093613, + "grad_norm": 0.62350647814826, + "learning_rate": 4.4361680115870066e-05, + "loss": 5.3096, + "step": 2144 + }, + { + "epoch": 0.13315537898069402, + "grad_norm": 0.5959327880401043, + "learning_rate": 4.4382371198013655e-05, + "loss": 5.295, + "step": 2145 + }, + { + "epoch": 0.13321745608045193, + "grad_norm": 0.48748844634327887, + "learning_rate": 4.4403062280157256e-05, + "loss": 5.3594, + "step": 2146 + }, + { + "epoch": 0.13327953318020982, + "grad_norm": 0.4775851486840284, + "learning_rate": 4.442375336230085e-05, + "loss": 5.3882, + "step": 2147 + }, + { + "epoch": 0.13334161027996771, + "grad_norm": 0.4633106466812288, + "learning_rate": 4.4444444444444447e-05, + "loss": 5.3894, + "step": 2148 + }, + { + "epoch": 0.13340368737972563, + "grad_norm": 0.4692238955321995, + "learning_rate": 4.446513552658804e-05, + "loss": 5.353, + "step": 2149 + }, + { + "epoch": 0.13346576447948352, + "grad_norm": 0.4556974363634125, + "learning_rate": 4.448582660873164e-05, + "loss": 5.3228, + "step": 2150 + }, + { + "epoch": 0.1335278415792414, + "grad_norm": 0.5162998464964262, + "learning_rate": 4.450651769087523e-05, + "loss": 5.3293, + "step": 2151 + }, + { + "epoch": 0.13358991867899933, + "grad_norm": 0.4855648623608009, + "learning_rate": 4.4527208773018834e-05, + "loss": 5.2545, + "step": 2152 + }, + { + "epoch": 0.13365199577875722, + "grad_norm": 0.501627924327183, + "learning_rate": 4.454789985516243e-05, + "loss": 5.2809, + "step": 2153 + }, + { + "epoch": 0.1337140728785151, + "grad_norm": 0.45894952195318545, + "learning_rate": 4.4568590937306024e-05, + "loss": 5.2607, + "step": 2154 + }, + { + "epoch": 0.133776149978273, + "grad_norm": 0.6355812583958825, + "learning_rate": 4.458928201944962e-05, + "loss": 5.3407, + "step": 2155 + }, + { + "epoch": 0.13383822707803092, + "grad_norm": 0.7221486138670498, + "learning_rate": 4.4609973101593214e-05, + "loss": 5.3185, + "step": 2156 + }, + { + "epoch": 0.1339003041777888, + "grad_norm": 1.0917946912712955, + "learning_rate": 4.463066418373681e-05, + "loss": 5.3722, + "step": 2157 + }, + { + "epoch": 0.1339623812775467, + "grad_norm": 0.5473694065101774, + "learning_rate": 4.4651355265880404e-05, + "loss": 5.2119, + "step": 2158 + }, + { + "epoch": 0.13402445837730462, + "grad_norm": 0.9417899525094089, + "learning_rate": 4.4672046348024006e-05, + "loss": 5.2607, + "step": 2159 + }, + { + "epoch": 0.1340865354770625, + "grad_norm": 0.612210523553095, + "learning_rate": 4.4692737430167594e-05, + "loss": 5.3943, + "step": 2160 + }, + { + "epoch": 0.1341486125768204, + "grad_norm": 0.7028868165845262, + "learning_rate": 4.4713428512311196e-05, + "loss": 5.3695, + "step": 2161 + }, + { + "epoch": 0.13421068967657832, + "grad_norm": 0.6437523499232983, + "learning_rate": 4.473411959445479e-05, + "loss": 5.2547, + "step": 2162 + }, + { + "epoch": 0.1342727667763362, + "grad_norm": 0.5196046418860715, + "learning_rate": 4.475481067659839e-05, + "loss": 5.2244, + "step": 2163 + }, + { + "epoch": 0.1343348438760941, + "grad_norm": 0.6164415172936037, + "learning_rate": 4.477550175874198e-05, + "loss": 5.3983, + "step": 2164 + }, + { + "epoch": 0.134396920975852, + "grad_norm": 0.5087278772272668, + "learning_rate": 4.479619284088558e-05, + "loss": 5.4213, + "step": 2165 + }, + { + "epoch": 0.1344589980756099, + "grad_norm": 0.6043217531198666, + "learning_rate": 4.481688392302918e-05, + "loss": 5.2951, + "step": 2166 + }, + { + "epoch": 0.1345210751753678, + "grad_norm": 0.5531568227115979, + "learning_rate": 4.483757500517277e-05, + "loss": 5.2822, + "step": 2167 + }, + { + "epoch": 0.1345831522751257, + "grad_norm": 0.5430714626725102, + "learning_rate": 4.485826608731637e-05, + "loss": 5.3626, + "step": 2168 + }, + { + "epoch": 0.1346452293748836, + "grad_norm": 0.45745551150966224, + "learning_rate": 4.487895716945997e-05, + "loss": 5.3149, + "step": 2169 + }, + { + "epoch": 0.1347073064746415, + "grad_norm": 0.5563466148909093, + "learning_rate": 4.489964825160356e-05, + "loss": 5.3079, + "step": 2170 + }, + { + "epoch": 0.1347693835743994, + "grad_norm": 0.5007974107250333, + "learning_rate": 4.492033933374716e-05, + "loss": 5.2271, + "step": 2171 + }, + { + "epoch": 0.1348314606741573, + "grad_norm": 0.43658948565057837, + "learning_rate": 4.4941030415890756e-05, + "loss": 5.3256, + "step": 2172 + }, + { + "epoch": 0.1348935377739152, + "grad_norm": 0.6370387765020128, + "learning_rate": 4.496172149803435e-05, + "loss": 5.1883, + "step": 2173 + }, + { + "epoch": 0.1349556148736731, + "grad_norm": 0.5200524834593289, + "learning_rate": 4.4982412580177946e-05, + "loss": 5.2282, + "step": 2174 + }, + { + "epoch": 0.135017691973431, + "grad_norm": 0.4656025919546739, + "learning_rate": 4.500310366232154e-05, + "loss": 5.361, + "step": 2175 + }, + { + "epoch": 0.1350797690731889, + "grad_norm": 0.5942809090413499, + "learning_rate": 4.5023794744465136e-05, + "loss": 5.2666, + "step": 2176 + }, + { + "epoch": 0.1351418461729468, + "grad_norm": 0.44148868027428884, + "learning_rate": 4.504448582660874e-05, + "loss": 5.2591, + "step": 2177 + }, + { + "epoch": 0.1352039232727047, + "grad_norm": 0.8058423763965669, + "learning_rate": 4.506517690875233e-05, + "loss": 5.2576, + "step": 2178 + }, + { + "epoch": 0.13526600037246259, + "grad_norm": 0.5797608660723788, + "learning_rate": 4.508586799089593e-05, + "loss": 5.3567, + "step": 2179 + }, + { + "epoch": 0.1353280774722205, + "grad_norm": 0.5097596909919379, + "learning_rate": 4.510655907303952e-05, + "loss": 5.319, + "step": 2180 + }, + { + "epoch": 0.1353901545719784, + "grad_norm": 0.7474415772868551, + "learning_rate": 4.512725015518312e-05, + "loss": 5.3731, + "step": 2181 + }, + { + "epoch": 0.13545223167173628, + "grad_norm": 0.6174416803318289, + "learning_rate": 4.514794123732671e-05, + "loss": 5.4416, + "step": 2182 + }, + { + "epoch": 0.1355143087714942, + "grad_norm": 0.5066556812128601, + "learning_rate": 4.516863231947031e-05, + "loss": 5.2957, + "step": 2183 + }, + { + "epoch": 0.1355763858712521, + "grad_norm": 0.7499512815546789, + "learning_rate": 4.518932340161391e-05, + "loss": 5.3779, + "step": 2184 + }, + { + "epoch": 0.13563846297100998, + "grad_norm": 0.5468480784737941, + "learning_rate": 4.52100144837575e-05, + "loss": 5.2951, + "step": 2185 + }, + { + "epoch": 0.1357005400707679, + "grad_norm": 0.5822105387594473, + "learning_rate": 4.52307055659011e-05, + "loss": 5.2699, + "step": 2186 + }, + { + "epoch": 0.1357626171705258, + "grad_norm": 0.49947281701828933, + "learning_rate": 4.5251396648044695e-05, + "loss": 5.2648, + "step": 2187 + }, + { + "epoch": 0.13582469427028368, + "grad_norm": 0.8019381470171096, + "learning_rate": 4.527208773018829e-05, + "loss": 5.318, + "step": 2188 + }, + { + "epoch": 0.1358867713700416, + "grad_norm": 0.6263923870711704, + "learning_rate": 4.5292778812331885e-05, + "loss": 5.3254, + "step": 2189 + }, + { + "epoch": 0.1359488484697995, + "grad_norm": 0.48362119532094505, + "learning_rate": 4.531346989447549e-05, + "loss": 5.2793, + "step": 2190 + }, + { + "epoch": 0.13601092556955738, + "grad_norm": 0.7341111703508945, + "learning_rate": 4.5334160976619076e-05, + "loss": 5.273, + "step": 2191 + }, + { + "epoch": 0.1360730026693153, + "grad_norm": 0.6238556160121491, + "learning_rate": 4.535485205876268e-05, + "loss": 5.354, + "step": 2192 + }, + { + "epoch": 0.1361350797690732, + "grad_norm": 0.5797588612930797, + "learning_rate": 4.537554314090627e-05, + "loss": 5.358, + "step": 2193 + }, + { + "epoch": 0.13619715686883108, + "grad_norm": 0.537254674505078, + "learning_rate": 4.539623422304987e-05, + "loss": 5.2899, + "step": 2194 + }, + { + "epoch": 0.136259233968589, + "grad_norm": 0.5694515052088878, + "learning_rate": 4.541692530519346e-05, + "loss": 5.3292, + "step": 2195 + }, + { + "epoch": 0.13632131106834688, + "grad_norm": 0.5817206917524654, + "learning_rate": 4.5437616387337064e-05, + "loss": 5.3139, + "step": 2196 + }, + { + "epoch": 0.13638338816810477, + "grad_norm": 0.5085286006423629, + "learning_rate": 4.545830746948065e-05, + "loss": 5.3553, + "step": 2197 + }, + { + "epoch": 0.1364454652678627, + "grad_norm": 0.43674255265282613, + "learning_rate": 4.5478998551624255e-05, + "loss": 5.2287, + "step": 2198 + }, + { + "epoch": 0.13650754236762058, + "grad_norm": 0.6960361872204927, + "learning_rate": 4.549968963376785e-05, + "loss": 5.2375, + "step": 2199 + }, + { + "epoch": 0.13656961946737847, + "grad_norm": 0.7357464017815701, + "learning_rate": 4.5520380715911445e-05, + "loss": 5.2678, + "step": 2200 + }, + { + "epoch": 0.1366316965671364, + "grad_norm": 0.45782206658793345, + "learning_rate": 4.554107179805504e-05, + "loss": 5.3531, + "step": 2201 + }, + { + "epoch": 0.13669377366689428, + "grad_norm": 0.4981955362489424, + "learning_rate": 4.556176288019864e-05, + "loss": 5.3647, + "step": 2202 + }, + { + "epoch": 0.13675585076665217, + "grad_norm": 0.8595919595292639, + "learning_rate": 4.558245396234223e-05, + "loss": 5.3386, + "step": 2203 + }, + { + "epoch": 0.1368179278664101, + "grad_norm": 0.6606449372486973, + "learning_rate": 4.560314504448583e-05, + "loss": 5.3106, + "step": 2204 + }, + { + "epoch": 0.13688000496616798, + "grad_norm": 0.7172309330439918, + "learning_rate": 4.562383612662943e-05, + "loss": 5.2752, + "step": 2205 + }, + { + "epoch": 0.13694208206592587, + "grad_norm": 0.6180000509444157, + "learning_rate": 4.564452720877302e-05, + "loss": 5.2589, + "step": 2206 + }, + { + "epoch": 0.1370041591656838, + "grad_norm": 0.5408106666991472, + "learning_rate": 4.566521829091662e-05, + "loss": 5.3243, + "step": 2207 + }, + { + "epoch": 0.13706623626544168, + "grad_norm": 0.5367443717160411, + "learning_rate": 4.568590937306021e-05, + "loss": 5.2756, + "step": 2208 + }, + { + "epoch": 0.13712831336519957, + "grad_norm": 1.0635978412395375, + "learning_rate": 4.570660045520381e-05, + "loss": 5.2888, + "step": 2209 + }, + { + "epoch": 0.13719039046495748, + "grad_norm": 0.8245966618947984, + "learning_rate": 4.57272915373474e-05, + "loss": 5.2106, + "step": 2210 + }, + { + "epoch": 0.13725246756471537, + "grad_norm": 0.635343834085071, + "learning_rate": 4.5747982619491004e-05, + "loss": 5.2613, + "step": 2211 + }, + { + "epoch": 0.13731454466447326, + "grad_norm": 0.6612893712269897, + "learning_rate": 4.576867370163459e-05, + "loss": 5.1564, + "step": 2212 + }, + { + "epoch": 0.13737662176423118, + "grad_norm": 0.6083129528847042, + "learning_rate": 4.5789364783778194e-05, + "loss": 5.3645, + "step": 2213 + }, + { + "epoch": 0.13743869886398907, + "grad_norm": 0.6443375729958972, + "learning_rate": 4.581005586592179e-05, + "loss": 5.2187, + "step": 2214 + }, + { + "epoch": 0.13750077596374696, + "grad_norm": 0.6782962108298701, + "learning_rate": 4.5830746948065384e-05, + "loss": 5.2787, + "step": 2215 + }, + { + "epoch": 0.13756285306350488, + "grad_norm": 0.7742186945694828, + "learning_rate": 4.585143803020898e-05, + "loss": 5.157, + "step": 2216 + }, + { + "epoch": 0.13762493016326277, + "grad_norm": 0.9885723605011769, + "learning_rate": 4.587212911235258e-05, + "loss": 5.2428, + "step": 2217 + }, + { + "epoch": 0.13768700726302066, + "grad_norm": 0.5780258867000848, + "learning_rate": 4.589282019449617e-05, + "loss": 5.1828, + "step": 2218 + }, + { + "epoch": 0.13774908436277858, + "grad_norm": 0.873029226513017, + "learning_rate": 4.591351127663977e-05, + "loss": 5.183, + "step": 2219 + }, + { + "epoch": 0.13781116146253647, + "grad_norm": 0.643218257878474, + "learning_rate": 4.5934202358783367e-05, + "loss": 5.3081, + "step": 2220 + }, + { + "epoch": 0.13787323856229436, + "grad_norm": 0.5478124213558647, + "learning_rate": 4.595489344092696e-05, + "loss": 5.2844, + "step": 2221 + }, + { + "epoch": 0.13793531566205228, + "grad_norm": 0.8270450604004783, + "learning_rate": 4.597558452307056e-05, + "loss": 5.3189, + "step": 2222 + }, + { + "epoch": 0.13799739276181017, + "grad_norm": 0.5969829631447962, + "learning_rate": 4.599627560521416e-05, + "loss": 5.1781, + "step": 2223 + }, + { + "epoch": 0.13805946986156806, + "grad_norm": 0.7987429473159644, + "learning_rate": 4.601696668735775e-05, + "loss": 5.2398, + "step": 2224 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 0.5560129474909351, + "learning_rate": 4.603765776950135e-05, + "loss": 5.3119, + "step": 2225 + }, + { + "epoch": 0.13818362406108387, + "grad_norm": 0.744571711110278, + "learning_rate": 4.6058348851644944e-05, + "loss": 5.3505, + "step": 2226 + }, + { + "epoch": 0.13824570116084176, + "grad_norm": 0.6049543595241205, + "learning_rate": 4.607903993378854e-05, + "loss": 5.2083, + "step": 2227 + }, + { + "epoch": 0.13830777826059967, + "grad_norm": 0.8959916699375634, + "learning_rate": 4.6099731015932134e-05, + "loss": 5.2084, + "step": 2228 + }, + { + "epoch": 0.13836985536035756, + "grad_norm": 0.7066073543208513, + "learning_rate": 4.6120422098075736e-05, + "loss": 5.2299, + "step": 2229 + }, + { + "epoch": 0.13843193246011545, + "grad_norm": 0.660303381600994, + "learning_rate": 4.6141113180219324e-05, + "loss": 5.2992, + "step": 2230 + }, + { + "epoch": 0.13849400955987337, + "grad_norm": 0.7009178700379115, + "learning_rate": 4.6161804262362926e-05, + "loss": 5.3698, + "step": 2231 + }, + { + "epoch": 0.13855608665963126, + "grad_norm": 0.4921141070186692, + "learning_rate": 4.618249534450652e-05, + "loss": 5.2701, + "step": 2232 + }, + { + "epoch": 0.13861816375938915, + "grad_norm": 0.4938857887758041, + "learning_rate": 4.6203186426650116e-05, + "loss": 5.245, + "step": 2233 + }, + { + "epoch": 0.13868024085914707, + "grad_norm": 1.028814850775942, + "learning_rate": 4.622387750879371e-05, + "loss": 5.3609, + "step": 2234 + }, + { + "epoch": 0.13874231795890496, + "grad_norm": 0.5116579733129734, + "learning_rate": 4.6244568590937306e-05, + "loss": 5.189, + "step": 2235 + }, + { + "epoch": 0.13880439505866285, + "grad_norm": 0.7894055593318522, + "learning_rate": 4.62652596730809e-05, + "loss": 5.2292, + "step": 2236 + }, + { + "epoch": 0.13886647215842077, + "grad_norm": 0.792790999010092, + "learning_rate": 4.6285950755224496e-05, + "loss": 5.2419, + "step": 2237 + }, + { + "epoch": 0.13892854925817866, + "grad_norm": 0.6443019574685197, + "learning_rate": 4.63066418373681e-05, + "loss": 5.2361, + "step": 2238 + }, + { + "epoch": 0.13899062635793655, + "grad_norm": 0.7548725872655022, + "learning_rate": 4.6327332919511693e-05, + "loss": 5.1343, + "step": 2239 + }, + { + "epoch": 0.13905270345769447, + "grad_norm": 0.9035060382339363, + "learning_rate": 4.634802400165529e-05, + "loss": 5.2686, + "step": 2240 + }, + { + "epoch": 0.13911478055745236, + "grad_norm": 0.7322296297177523, + "learning_rate": 4.6368715083798884e-05, + "loss": 5.2447, + "step": 2241 + }, + { + "epoch": 0.13917685765721025, + "grad_norm": 0.743521886191741, + "learning_rate": 4.638940616594248e-05, + "loss": 5.2187, + "step": 2242 + }, + { + "epoch": 0.13923893475696816, + "grad_norm": 0.5682078156158975, + "learning_rate": 4.6410097248086074e-05, + "loss": 5.1618, + "step": 2243 + }, + { + "epoch": 0.13930101185672605, + "grad_norm": 0.6059653184002844, + "learning_rate": 4.6430788330229676e-05, + "loss": 5.2196, + "step": 2244 + }, + { + "epoch": 0.13936308895648394, + "grad_norm": 0.8345154047641822, + "learning_rate": 4.6451479412373264e-05, + "loss": 5.2755, + "step": 2245 + }, + { + "epoch": 0.13942516605624186, + "grad_norm": 0.4880145145469946, + "learning_rate": 4.6472170494516866e-05, + "loss": 5.2306, + "step": 2246 + }, + { + "epoch": 0.13948724315599975, + "grad_norm": 0.6309877389043053, + "learning_rate": 4.649286157666046e-05, + "loss": 5.3297, + "step": 2247 + }, + { + "epoch": 0.13954932025575764, + "grad_norm": 0.5385638155585233, + "learning_rate": 4.651355265880406e-05, + "loss": 5.1747, + "step": 2248 + }, + { + "epoch": 0.13961139735551556, + "grad_norm": 0.630661712008881, + "learning_rate": 4.653424374094765e-05, + "loss": 5.2998, + "step": 2249 + }, + { + "epoch": 0.13967347445527345, + "grad_norm": 0.5357087617543934, + "learning_rate": 4.655493482309125e-05, + "loss": 5.126, + "step": 2250 + }, + { + "epoch": 0.13973555155503134, + "grad_norm": 0.45063156538478083, + "learning_rate": 4.657562590523485e-05, + "loss": 5.21, + "step": 2251 + }, + { + "epoch": 0.13979762865478926, + "grad_norm": 0.6064603745314663, + "learning_rate": 4.659631698737844e-05, + "loss": 5.1978, + "step": 2252 + }, + { + "epoch": 0.13985970575454715, + "grad_norm": 0.5213249905933769, + "learning_rate": 4.661700806952204e-05, + "loss": 5.2215, + "step": 2253 + }, + { + "epoch": 0.13992178285430504, + "grad_norm": 0.5184505954442857, + "learning_rate": 4.663769915166564e-05, + "loss": 5.2647, + "step": 2254 + }, + { + "epoch": 0.13998385995406296, + "grad_norm": 0.5674189229429111, + "learning_rate": 4.665839023380923e-05, + "loss": 5.2274, + "step": 2255 + }, + { + "epoch": 0.14004593705382085, + "grad_norm": 0.6613601000875109, + "learning_rate": 4.667908131595283e-05, + "loss": 5.1679, + "step": 2256 + }, + { + "epoch": 0.14010801415357874, + "grad_norm": 0.5673263968227826, + "learning_rate": 4.6699772398096425e-05, + "loss": 5.2681, + "step": 2257 + }, + { + "epoch": 0.14017009125333665, + "grad_norm": 0.5105145678407828, + "learning_rate": 4.672046348024002e-05, + "loss": 5.18, + "step": 2258 + }, + { + "epoch": 0.14023216835309454, + "grad_norm": 0.4896510936298309, + "learning_rate": 4.6741154562383615e-05, + "loss": 5.2306, + "step": 2259 + }, + { + "epoch": 0.14029424545285243, + "grad_norm": 0.6240986284735092, + "learning_rate": 4.676184564452721e-05, + "loss": 5.2544, + "step": 2260 + }, + { + "epoch": 0.14035632255261035, + "grad_norm": 0.41252605110327223, + "learning_rate": 4.6782536726670805e-05, + "loss": 5.2644, + "step": 2261 + }, + { + "epoch": 0.14041839965236824, + "grad_norm": 0.43172276263420734, + "learning_rate": 4.68032278088144e-05, + "loss": 5.3359, + "step": 2262 + }, + { + "epoch": 0.14048047675212613, + "grad_norm": 0.4049213396613858, + "learning_rate": 4.6823918890958e-05, + "loss": 5.192, + "step": 2263 + }, + { + "epoch": 0.14054255385188405, + "grad_norm": 0.4885787362822546, + "learning_rate": 4.68446099731016e-05, + "loss": 5.1752, + "step": 2264 + }, + { + "epoch": 0.14060463095164194, + "grad_norm": 0.3888501280908618, + "learning_rate": 4.686530105524519e-05, + "loss": 5.3076, + "step": 2265 + }, + { + "epoch": 0.14066670805139983, + "grad_norm": 0.3828825205389056, + "learning_rate": 4.688599213738879e-05, + "loss": 5.2055, + "step": 2266 + }, + { + "epoch": 0.14072878515115775, + "grad_norm": 0.4527914200556242, + "learning_rate": 4.690668321953238e-05, + "loss": 5.1936, + "step": 2267 + }, + { + "epoch": 0.14079086225091564, + "grad_norm": 0.4136996748879409, + "learning_rate": 4.692737430167598e-05, + "loss": 5.1934, + "step": 2268 + }, + { + "epoch": 0.14085293935067353, + "grad_norm": 0.388961837186119, + "learning_rate": 4.694806538381958e-05, + "loss": 5.1989, + "step": 2269 + }, + { + "epoch": 0.14091501645043145, + "grad_norm": 0.461979012630857, + "learning_rate": 4.696875646596317e-05, + "loss": 5.22, + "step": 2270 + }, + { + "epoch": 0.14097709355018934, + "grad_norm": 0.39472305243440087, + "learning_rate": 4.698944754810677e-05, + "loss": 5.1933, + "step": 2271 + }, + { + "epoch": 0.14103917064994723, + "grad_norm": 0.3592193930057248, + "learning_rate": 4.7010138630250365e-05, + "loss": 5.2276, + "step": 2272 + }, + { + "epoch": 0.14110124774970514, + "grad_norm": 0.41756414277636156, + "learning_rate": 4.703082971239396e-05, + "loss": 5.1393, + "step": 2273 + }, + { + "epoch": 0.14116332484946303, + "grad_norm": 0.3519451284399832, + "learning_rate": 4.7051520794537555e-05, + "loss": 5.1685, + "step": 2274 + }, + { + "epoch": 0.14122540194922092, + "grad_norm": 0.4262650392345916, + "learning_rate": 4.707221187668116e-05, + "loss": 5.1396, + "step": 2275 + }, + { + "epoch": 0.14128747904897884, + "grad_norm": 0.37994011689464774, + "learning_rate": 4.7092902958824745e-05, + "loss": 5.2337, + "step": 2276 + }, + { + "epoch": 0.14134955614873673, + "grad_norm": 0.5248321912150808, + "learning_rate": 4.711359404096835e-05, + "loss": 5.231, + "step": 2277 + }, + { + "epoch": 0.14141163324849462, + "grad_norm": 0.4134562210319423, + "learning_rate": 4.713428512311194e-05, + "loss": 5.1907, + "step": 2278 + }, + { + "epoch": 0.14147371034825254, + "grad_norm": 0.35279215765019667, + "learning_rate": 4.715497620525554e-05, + "loss": 5.184, + "step": 2279 + }, + { + "epoch": 0.14153578744801043, + "grad_norm": 0.3844131559336644, + "learning_rate": 4.717566728739913e-05, + "loss": 5.2026, + "step": 2280 + }, + { + "epoch": 0.14159786454776832, + "grad_norm": 0.34708236089519057, + "learning_rate": 4.7196358369542734e-05, + "loss": 5.1587, + "step": 2281 + }, + { + "epoch": 0.14165994164752624, + "grad_norm": 0.3370037346521186, + "learning_rate": 4.721704945168632e-05, + "loss": 5.1575, + "step": 2282 + }, + { + "epoch": 0.14172201874728413, + "grad_norm": 0.4656072805220515, + "learning_rate": 4.7237740533829924e-05, + "loss": 5.2034, + "step": 2283 + }, + { + "epoch": 0.14178409584704202, + "grad_norm": 0.49442385672034783, + "learning_rate": 4.725843161597352e-05, + "loss": 5.1961, + "step": 2284 + }, + { + "epoch": 0.14184617294679994, + "grad_norm": 0.45786320532484837, + "learning_rate": 4.7279122698117114e-05, + "loss": 5.2202, + "step": 2285 + }, + { + "epoch": 0.14190825004655783, + "grad_norm": 0.43276334407690253, + "learning_rate": 4.729981378026071e-05, + "loss": 5.209, + "step": 2286 + }, + { + "epoch": 0.14197032714631572, + "grad_norm": 0.4184457606040809, + "learning_rate": 4.732050486240431e-05, + "loss": 5.1964, + "step": 2287 + }, + { + "epoch": 0.14203240424607363, + "grad_norm": 0.3439340532183176, + "learning_rate": 4.73411959445479e-05, + "loss": 5.1664, + "step": 2288 + }, + { + "epoch": 0.14209448134583152, + "grad_norm": 0.44693467335319914, + "learning_rate": 4.73618870266915e-05, + "loss": 5.1347, + "step": 2289 + }, + { + "epoch": 0.14215655844558941, + "grad_norm": 0.3233788538483079, + "learning_rate": 4.7382578108835096e-05, + "loss": 5.1665, + "step": 2290 + }, + { + "epoch": 0.14221863554534733, + "grad_norm": 0.45318479801099026, + "learning_rate": 4.740326919097869e-05, + "loss": 5.1629, + "step": 2291 + }, + { + "epoch": 0.14228071264510522, + "grad_norm": 0.3427093206795303, + "learning_rate": 4.742396027312229e-05, + "loss": 5.1798, + "step": 2292 + }, + { + "epoch": 0.1423427897448631, + "grad_norm": 0.3624278884721017, + "learning_rate": 4.744465135526588e-05, + "loss": 5.1676, + "step": 2293 + }, + { + "epoch": 0.14240486684462103, + "grad_norm": 0.34848627382021197, + "learning_rate": 4.746534243740948e-05, + "loss": 5.1025, + "step": 2294 + }, + { + "epoch": 0.14246694394437892, + "grad_norm": 0.3941384995009064, + "learning_rate": 4.748603351955307e-05, + "loss": 5.2416, + "step": 2295 + }, + { + "epoch": 0.1425290210441368, + "grad_norm": 0.3968909110236482, + "learning_rate": 4.7506724601696674e-05, + "loss": 5.2218, + "step": 2296 + }, + { + "epoch": 0.14259109814389473, + "grad_norm": 0.3937455653894563, + "learning_rate": 4.752741568384026e-05, + "loss": 5.1008, + "step": 2297 + }, + { + "epoch": 0.14265317524365262, + "grad_norm": 0.37477303630219483, + "learning_rate": 4.7548106765983864e-05, + "loss": 5.132, + "step": 2298 + }, + { + "epoch": 0.1427152523434105, + "grad_norm": 0.4038929049797835, + "learning_rate": 4.756879784812746e-05, + "loss": 5.1595, + "step": 2299 + }, + { + "epoch": 0.14277732944316843, + "grad_norm": 0.35785502884835657, + "learning_rate": 4.7589488930271054e-05, + "loss": 5.2784, + "step": 2300 + }, + { + "epoch": 0.14283940654292632, + "grad_norm": 0.3774314573094509, + "learning_rate": 4.761018001241465e-05, + "loss": 5.2667, + "step": 2301 + }, + { + "epoch": 0.1429014836426842, + "grad_norm": 0.35726608466530263, + "learning_rate": 4.763087109455825e-05, + "loss": 5.2276, + "step": 2302 + }, + { + "epoch": 0.14296356074244213, + "grad_norm": 0.34742952304500024, + "learning_rate": 4.765156217670184e-05, + "loss": 5.275, + "step": 2303 + }, + { + "epoch": 0.14302563784220002, + "grad_norm": 0.40058280240547406, + "learning_rate": 4.767225325884544e-05, + "loss": 5.1097, + "step": 2304 + }, + { + "epoch": 0.1430877149419579, + "grad_norm": 0.3867509616576003, + "learning_rate": 4.7692944340989036e-05, + "loss": 5.1525, + "step": 2305 + }, + { + "epoch": 0.14314979204171582, + "grad_norm": 0.35260987561260887, + "learning_rate": 4.771363542313263e-05, + "loss": 5.2138, + "step": 2306 + }, + { + "epoch": 0.1432118691414737, + "grad_norm": 0.3493391516417228, + "learning_rate": 4.7734326505276226e-05, + "loss": 5.1487, + "step": 2307 + }, + { + "epoch": 0.1432739462412316, + "grad_norm": 0.4085229232407545, + "learning_rate": 4.775501758741983e-05, + "loss": 5.2124, + "step": 2308 + }, + { + "epoch": 0.14333602334098952, + "grad_norm": 0.3873794768161836, + "learning_rate": 4.7775708669563416e-05, + "loss": 5.1118, + "step": 2309 + }, + { + "epoch": 0.1433981004407474, + "grad_norm": 0.41206120937243074, + "learning_rate": 4.779639975170702e-05, + "loss": 5.0445, + "step": 2310 + }, + { + "epoch": 0.1434601775405053, + "grad_norm": 0.3606963123837381, + "learning_rate": 4.7817090833850613e-05, + "loss": 5.1933, + "step": 2311 + }, + { + "epoch": 0.14352225464026322, + "grad_norm": 0.35440665650300274, + "learning_rate": 4.783778191599421e-05, + "loss": 5.1939, + "step": 2312 + }, + { + "epoch": 0.1435843317400211, + "grad_norm": 0.36308268791332815, + "learning_rate": 4.7858472998137804e-05, + "loss": 5.1631, + "step": 2313 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 0.38238519058166126, + "learning_rate": 4.7879164080281405e-05, + "loss": 5.1468, + "step": 2314 + }, + { + "epoch": 0.14370848593953692, + "grad_norm": 0.49958406970351893, + "learning_rate": 4.7899855162424994e-05, + "loss": 5.1837, + "step": 2315 + }, + { + "epoch": 0.1437705630392948, + "grad_norm": 0.5686251618592004, + "learning_rate": 4.7920546244568596e-05, + "loss": 5.2394, + "step": 2316 + }, + { + "epoch": 0.1438326401390527, + "grad_norm": 0.45568725029866985, + "learning_rate": 4.794123732671219e-05, + "loss": 5.1501, + "step": 2317 + }, + { + "epoch": 0.14389471723881062, + "grad_norm": 0.48846520582661557, + "learning_rate": 4.7961928408855786e-05, + "loss": 5.0936, + "step": 2318 + }, + { + "epoch": 0.1439567943385685, + "grad_norm": 0.5288670895840745, + "learning_rate": 4.798261949099938e-05, + "loss": 5.1905, + "step": 2319 + }, + { + "epoch": 0.1440188714383264, + "grad_norm": 0.737187054073236, + "learning_rate": 4.8003310573142976e-05, + "loss": 5.1729, + "step": 2320 + }, + { + "epoch": 0.1440809485380843, + "grad_norm": 0.6107535053469562, + "learning_rate": 4.802400165528657e-05, + "loss": 5.2202, + "step": 2321 + }, + { + "epoch": 0.1441430256378422, + "grad_norm": 0.6002109752712963, + "learning_rate": 4.8044692737430166e-05, + "loss": 5.1455, + "step": 2322 + }, + { + "epoch": 0.1442051027376001, + "grad_norm": 0.6089035558016921, + "learning_rate": 4.806538381957377e-05, + "loss": 5.2294, + "step": 2323 + }, + { + "epoch": 0.144267179837358, + "grad_norm": 0.6575047441318642, + "learning_rate": 4.8086074901717356e-05, + "loss": 5.1864, + "step": 2324 + }, + { + "epoch": 0.1443292569371159, + "grad_norm": 0.8046516484542285, + "learning_rate": 4.810676598386096e-05, + "loss": 5.0579, + "step": 2325 + }, + { + "epoch": 0.1443913340368738, + "grad_norm": 0.5255468483702709, + "learning_rate": 4.812745706600455e-05, + "loss": 5.1587, + "step": 2326 + }, + { + "epoch": 0.1444534111366317, + "grad_norm": 0.5481073391025431, + "learning_rate": 4.814814814814815e-05, + "loss": 5.2424, + "step": 2327 + }, + { + "epoch": 0.1445154882363896, + "grad_norm": 0.566123839380747, + "learning_rate": 4.816883923029174e-05, + "loss": 5.2578, + "step": 2328 + }, + { + "epoch": 0.1445775653361475, + "grad_norm": 0.524564014783054, + "learning_rate": 4.8189530312435345e-05, + "loss": 5.1356, + "step": 2329 + }, + { + "epoch": 0.1446396424359054, + "grad_norm": 0.7772340936809224, + "learning_rate": 4.8210221394578933e-05, + "loss": 5.1919, + "step": 2330 + }, + { + "epoch": 0.1447017195356633, + "grad_norm": 0.6596658905696817, + "learning_rate": 4.8230912476722535e-05, + "loss": 5.1323, + "step": 2331 + }, + { + "epoch": 0.1447637966354212, + "grad_norm": 0.7080369404029847, + "learning_rate": 4.825160355886613e-05, + "loss": 5.1058, + "step": 2332 + }, + { + "epoch": 0.1448258737351791, + "grad_norm": 0.5537552050567621, + "learning_rate": 4.827229464100973e-05, + "loss": 5.0861, + "step": 2333 + }, + { + "epoch": 0.144887950834937, + "grad_norm": 0.5721175487183943, + "learning_rate": 4.829298572315332e-05, + "loss": 5.0962, + "step": 2334 + }, + { + "epoch": 0.14495002793469489, + "grad_norm": 0.6423310381634376, + "learning_rate": 4.831367680529692e-05, + "loss": 5.1424, + "step": 2335 + }, + { + "epoch": 0.1450121050344528, + "grad_norm": 0.6605159750038183, + "learning_rate": 4.833436788744052e-05, + "loss": 5.0819, + "step": 2336 + }, + { + "epoch": 0.1450741821342107, + "grad_norm": 0.47203156671699675, + "learning_rate": 4.835505896958411e-05, + "loss": 5.0271, + "step": 2337 + }, + { + "epoch": 0.14513625923396858, + "grad_norm": 0.666160769042873, + "learning_rate": 4.837575005172771e-05, + "loss": 5.1499, + "step": 2338 + }, + { + "epoch": 0.1451983363337265, + "grad_norm": 0.5329262150656441, + "learning_rate": 4.839644113387131e-05, + "loss": 5.1651, + "step": 2339 + }, + { + "epoch": 0.1452604134334844, + "grad_norm": 0.697324819750204, + "learning_rate": 4.84171322160149e-05, + "loss": 5.2045, + "step": 2340 + }, + { + "epoch": 0.14532249053324228, + "grad_norm": 0.4914205873517448, + "learning_rate": 4.84378232981585e-05, + "loss": 5.168, + "step": 2341 + }, + { + "epoch": 0.14538456763300017, + "grad_norm": 0.7009593886054354, + "learning_rate": 4.8458514380302095e-05, + "loss": 5.1609, + "step": 2342 + }, + { + "epoch": 0.1454466447327581, + "grad_norm": 0.617060056638483, + "learning_rate": 4.847920546244569e-05, + "loss": 5.1523, + "step": 2343 + }, + { + "epoch": 0.14550872183251598, + "grad_norm": 0.5598262447907034, + "learning_rate": 4.8499896544589285e-05, + "loss": 5.1224, + "step": 2344 + }, + { + "epoch": 0.14557079893227387, + "grad_norm": 0.6695301478765223, + "learning_rate": 4.852058762673288e-05, + "loss": 5.1143, + "step": 2345 + }, + { + "epoch": 0.1456328760320318, + "grad_norm": 0.5426784260637143, + "learning_rate": 4.8541278708876475e-05, + "loss": 5.2371, + "step": 2346 + }, + { + "epoch": 0.14569495313178968, + "grad_norm": 0.5670158263037959, + "learning_rate": 4.856196979102007e-05, + "loss": 5.1264, + "step": 2347 + }, + { + "epoch": 0.14575703023154757, + "grad_norm": 0.400359547967445, + "learning_rate": 4.858266087316367e-05, + "loss": 5.1084, + "step": 2348 + }, + { + "epoch": 0.1458191073313055, + "grad_norm": 0.5369340645182099, + "learning_rate": 4.860335195530727e-05, + "loss": 5.1524, + "step": 2349 + }, + { + "epoch": 0.14588118443106338, + "grad_norm": 0.46113199236359426, + "learning_rate": 4.862404303745086e-05, + "loss": 4.9838, + "step": 2350 + }, + { + "epoch": 0.14594326153082127, + "grad_norm": 0.47500494966939694, + "learning_rate": 4.864473411959446e-05, + "loss": 5.0928, + "step": 2351 + }, + { + "epoch": 0.14600533863057918, + "grad_norm": 0.4403089539687654, + "learning_rate": 4.866542520173805e-05, + "loss": 5.0699, + "step": 2352 + }, + { + "epoch": 0.14606741573033707, + "grad_norm": 0.424151494998314, + "learning_rate": 4.868611628388165e-05, + "loss": 5.0824, + "step": 2353 + }, + { + "epoch": 0.14612949283009496, + "grad_norm": 0.6136947744116883, + "learning_rate": 4.870680736602525e-05, + "loss": 5.2002, + "step": 2354 + }, + { + "epoch": 0.14619156992985288, + "grad_norm": 0.8516613264221263, + "learning_rate": 4.872749844816884e-05, + "loss": 5.0822, + "step": 2355 + }, + { + "epoch": 0.14625364702961077, + "grad_norm": 0.45876178619773267, + "learning_rate": 4.874818953031244e-05, + "loss": 5.1335, + "step": 2356 + }, + { + "epoch": 0.14631572412936866, + "grad_norm": 0.6286906234052301, + "learning_rate": 4.8768880612456034e-05, + "loss": 5.154, + "step": 2357 + }, + { + "epoch": 0.14637780122912658, + "grad_norm": 0.4850002836688677, + "learning_rate": 4.878957169459963e-05, + "loss": 4.9991, + "step": 2358 + }, + { + "epoch": 0.14643987832888447, + "grad_norm": 0.4761369556268963, + "learning_rate": 4.8810262776743225e-05, + "loss": 5.0931, + "step": 2359 + }, + { + "epoch": 0.14650195542864236, + "grad_norm": 1.038286134178781, + "learning_rate": 4.8830953858886826e-05, + "loss": 5.0844, + "step": 2360 + }, + { + "epoch": 0.14656403252840028, + "grad_norm": 0.5919133439981997, + "learning_rate": 4.8851644941030415e-05, + "loss": 5.0625, + "step": 2361 + }, + { + "epoch": 0.14662610962815817, + "grad_norm": 0.8802359123348599, + "learning_rate": 4.8872336023174017e-05, + "loss": 5.1988, + "step": 2362 + }, + { + "epoch": 0.14668818672791606, + "grad_norm": 0.8065194692443401, + "learning_rate": 4.889302710531761e-05, + "loss": 5.0999, + "step": 2363 + }, + { + "epoch": 0.14675026382767398, + "grad_norm": 0.5437195615908723, + "learning_rate": 4.891371818746121e-05, + "loss": 5.1212, + "step": 2364 + }, + { + "epoch": 0.14681234092743187, + "grad_norm": 0.5314652904272963, + "learning_rate": 4.89344092696048e-05, + "loss": 5.0741, + "step": 2365 + }, + { + "epoch": 0.14687441802718976, + "grad_norm": 0.7024314851032399, + "learning_rate": 4.8955100351748404e-05, + "loss": 5.0795, + "step": 2366 + }, + { + "epoch": 0.14693649512694767, + "grad_norm": 0.5312017250566594, + "learning_rate": 4.897579143389199e-05, + "loss": 5.1094, + "step": 2367 + }, + { + "epoch": 0.14699857222670557, + "grad_norm": 1.173891443458502, + "learning_rate": 4.8996482516035594e-05, + "loss": 5.1009, + "step": 2368 + }, + { + "epoch": 0.14706064932646346, + "grad_norm": 0.691557011089098, + "learning_rate": 4.901717359817919e-05, + "loss": 5.0859, + "step": 2369 + }, + { + "epoch": 0.14712272642622137, + "grad_norm": 0.9309001995295857, + "learning_rate": 4.9037864680322784e-05, + "loss": 5.148, + "step": 2370 + }, + { + "epoch": 0.14718480352597926, + "grad_norm": 0.7809083954698693, + "learning_rate": 4.905855576246638e-05, + "loss": 5.1533, + "step": 2371 + }, + { + "epoch": 0.14724688062573715, + "grad_norm": 0.8987000123342868, + "learning_rate": 4.9079246844609974e-05, + "loss": 5.2598, + "step": 2372 + }, + { + "epoch": 0.14730895772549507, + "grad_norm": 0.5799846638098883, + "learning_rate": 4.909993792675357e-05, + "loss": 5.1993, + "step": 2373 + }, + { + "epoch": 0.14737103482525296, + "grad_norm": 0.8137049177019555, + "learning_rate": 4.912062900889717e-05, + "loss": 5.1862, + "step": 2374 + }, + { + "epoch": 0.14743311192501085, + "grad_norm": 0.5876623325088902, + "learning_rate": 4.9141320091040766e-05, + "loss": 5.2261, + "step": 2375 + }, + { + "epoch": 0.14749518902476877, + "grad_norm": 0.8075665200886758, + "learning_rate": 4.916201117318436e-05, + "loss": 5.0329, + "step": 2376 + }, + { + "epoch": 0.14755726612452666, + "grad_norm": 0.6307437897133412, + "learning_rate": 4.9182702255327956e-05, + "loss": 5.0994, + "step": 2377 + }, + { + "epoch": 0.14761934322428455, + "grad_norm": 0.5876554100965783, + "learning_rate": 4.920339333747155e-05, + "loss": 5.2474, + "step": 2378 + }, + { + "epoch": 0.14768142032404247, + "grad_norm": 0.7432899719329588, + "learning_rate": 4.9224084419615146e-05, + "loss": 5.2265, + "step": 2379 + }, + { + "epoch": 0.14774349742380036, + "grad_norm": 0.6822005325100022, + "learning_rate": 4.924477550175874e-05, + "loss": 5.23, + "step": 2380 + }, + { + "epoch": 0.14780557452355825, + "grad_norm": 0.9239994398317662, + "learning_rate": 4.926546658390234e-05, + "loss": 5.0321, + "step": 2381 + }, + { + "epoch": 0.14786765162331617, + "grad_norm": 0.7274386547874909, + "learning_rate": 4.928615766604593e-05, + "loss": 5.0743, + "step": 2382 + }, + { + "epoch": 0.14792972872307406, + "grad_norm": 0.8143769372666709, + "learning_rate": 4.9306848748189533e-05, + "loss": 5.0834, + "step": 2383 + }, + { + "epoch": 0.14799180582283195, + "grad_norm": 0.7330590083608018, + "learning_rate": 4.932753983033313e-05, + "loss": 5.1834, + "step": 2384 + }, + { + "epoch": 0.14805388292258986, + "grad_norm": 0.7318422592618978, + "learning_rate": 4.9348230912476724e-05, + "loss": 4.9868, + "step": 2385 + }, + { + "epoch": 0.14811596002234775, + "grad_norm": 0.7401751492040084, + "learning_rate": 4.936892199462032e-05, + "loss": 5.0488, + "step": 2386 + }, + { + "epoch": 0.14817803712210564, + "grad_norm": 1.0140003414529681, + "learning_rate": 4.938961307676392e-05, + "loss": 5.109, + "step": 2387 + }, + { + "epoch": 0.14824011422186356, + "grad_norm": 0.6250604123482228, + "learning_rate": 4.941030415890751e-05, + "loss": 5.1592, + "step": 2388 + }, + { + "epoch": 0.14830219132162145, + "grad_norm": 0.7192557670088918, + "learning_rate": 4.943099524105111e-05, + "loss": 5.1081, + "step": 2389 + }, + { + "epoch": 0.14836426842137934, + "grad_norm": 0.7952512391484574, + "learning_rate": 4.9451686323194706e-05, + "loss": 5.0986, + "step": 2390 + }, + { + "epoch": 0.14842634552113726, + "grad_norm": 0.7661516752523843, + "learning_rate": 4.94723774053383e-05, + "loss": 5.2094, + "step": 2391 + }, + { + "epoch": 0.14848842262089515, + "grad_norm": 0.7924740756394545, + "learning_rate": 4.9493068487481896e-05, + "loss": 5.1687, + "step": 2392 + }, + { + "epoch": 0.14855049972065304, + "grad_norm": 0.7014057969949933, + "learning_rate": 4.95137595696255e-05, + "loss": 5.0898, + "step": 2393 + }, + { + "epoch": 0.14861257682041096, + "grad_norm": 0.6181384137031798, + "learning_rate": 4.9534450651769086e-05, + "loss": 5.0527, + "step": 2394 + }, + { + "epoch": 0.14867465392016885, + "grad_norm": 0.5294563081490007, + "learning_rate": 4.955514173391269e-05, + "loss": 5.0114, + "step": 2395 + }, + { + "epoch": 0.14873673101992674, + "grad_norm": 0.44444491548645704, + "learning_rate": 4.957583281605628e-05, + "loss": 5.0654, + "step": 2396 + }, + { + "epoch": 0.14879880811968466, + "grad_norm": 0.5380489939003592, + "learning_rate": 4.959652389819988e-05, + "loss": 5.0288, + "step": 2397 + }, + { + "epoch": 0.14886088521944255, + "grad_norm": 0.7251121647985564, + "learning_rate": 4.961721498034347e-05, + "loss": 5.1474, + "step": 2398 + }, + { + "epoch": 0.14892296231920044, + "grad_norm": 0.6940430829708402, + "learning_rate": 4.9637906062487075e-05, + "loss": 5.0666, + "step": 2399 + }, + { + "epoch": 0.14898503941895835, + "grad_norm": 0.43450942738098136, + "learning_rate": 4.965859714463066e-05, + "loss": 4.9439, + "step": 2400 + }, + { + "epoch": 0.14904711651871624, + "grad_norm": 0.552206940279049, + "learning_rate": 4.9679288226774265e-05, + "loss": 4.9804, + "step": 2401 + }, + { + "epoch": 0.14910919361847413, + "grad_norm": 0.5253581777997551, + "learning_rate": 4.969997930891786e-05, + "loss": 5.1046, + "step": 2402 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 0.5003906452497587, + "learning_rate": 4.9720670391061455e-05, + "loss": 5.0925, + "step": 2403 + }, + { + "epoch": 0.14923334781798994, + "grad_norm": 0.6055492470667375, + "learning_rate": 4.974136147320505e-05, + "loss": 5.1155, + "step": 2404 + }, + { + "epoch": 0.14929542491774783, + "grad_norm": 0.5524431411691646, + "learning_rate": 4.9762052555348645e-05, + "loss": 4.9855, + "step": 2405 + }, + { + "epoch": 0.14935750201750575, + "grad_norm": 0.830823962663501, + "learning_rate": 4.978274363749224e-05, + "loss": 5.0895, + "step": 2406 + }, + { + "epoch": 0.14941957911726364, + "grad_norm": 0.5659819493566275, + "learning_rate": 4.9803434719635836e-05, + "loss": 5.0619, + "step": 2407 + }, + { + "epoch": 0.14948165621702153, + "grad_norm": 1.1602714301581485, + "learning_rate": 4.982412580177944e-05, + "loss": 5.089, + "step": 2408 + }, + { + "epoch": 0.14954373331677945, + "grad_norm": 0.7527761339878968, + "learning_rate": 4.9844816883923026e-05, + "loss": 5.0035, + "step": 2409 + }, + { + "epoch": 0.14960581041653734, + "grad_norm": 0.5251949754332982, + "learning_rate": 4.986550796606663e-05, + "loss": 5.0528, + "step": 2410 + }, + { + "epoch": 0.14966788751629523, + "grad_norm": 0.4971478294979696, + "learning_rate": 4.988619904821022e-05, + "loss": 5.0977, + "step": 2411 + }, + { + "epoch": 0.14972996461605315, + "grad_norm": 0.5296203919607063, + "learning_rate": 4.990689013035382e-05, + "loss": 5.0293, + "step": 2412 + }, + { + "epoch": 0.14979204171581104, + "grad_norm": 0.6776780206738012, + "learning_rate": 4.992758121249741e-05, + "loss": 4.9924, + "step": 2413 + }, + { + "epoch": 0.14985411881556893, + "grad_norm": 0.5475300272019002, + "learning_rate": 4.9948272294641015e-05, + "loss": 5.1917, + "step": 2414 + }, + { + "epoch": 0.14991619591532684, + "grad_norm": 0.7145907261709779, + "learning_rate": 4.99689633767846e-05, + "loss": 4.9843, + "step": 2415 + }, + { + "epoch": 0.14997827301508473, + "grad_norm": 0.6419817815435749, + "learning_rate": 4.9989654458928205e-05, + "loss": 5.012, + "step": 2416 + }, + { + "epoch": 0.15004035011484262, + "grad_norm": 0.5527567774677619, + "learning_rate": 5.00103455410718e-05, + "loss": 4.9279, + "step": 2417 + }, + { + "epoch": 0.15010242721460054, + "grad_norm": 0.6658450492724108, + "learning_rate": 5.00310366232154e-05, + "loss": 4.9187, + "step": 2418 + }, + { + "epoch": 0.15016450431435843, + "grad_norm": 0.744855882415006, + "learning_rate": 5.0051727705359e-05, + "loss": 5.247, + "step": 2419 + }, + { + "epoch": 0.15022658141411632, + "grad_norm": 0.9807470466225074, + "learning_rate": 5.0072418787502585e-05, + "loss": 5.0328, + "step": 2420 + }, + { + "epoch": 0.15028865851387424, + "grad_norm": 0.9001172823782355, + "learning_rate": 5.009310986964618e-05, + "loss": 5.1581, + "step": 2421 + }, + { + "epoch": 0.15035073561363213, + "grad_norm": 0.9260844768216296, + "learning_rate": 5.011380095178978e-05, + "loss": 5.0046, + "step": 2422 + }, + { + "epoch": 0.15041281271339002, + "grad_norm": 0.7708500439318904, + "learning_rate": 5.013449203393338e-05, + "loss": 5.0796, + "step": 2423 + }, + { + "epoch": 0.15047488981314794, + "grad_norm": 0.6359538533125583, + "learning_rate": 5.015518311607698e-05, + "loss": 5.005, + "step": 2424 + }, + { + "epoch": 0.15053696691290583, + "grad_norm": 0.6227490661059727, + "learning_rate": 5.0175874198220574e-05, + "loss": 5.024, + "step": 2425 + }, + { + "epoch": 0.15059904401266372, + "grad_norm": 0.5225202580225434, + "learning_rate": 5.019656528036416e-05, + "loss": 4.9174, + "step": 2426 + }, + { + "epoch": 0.15066112111242164, + "grad_norm": 0.5442906487126447, + "learning_rate": 5.021725636250776e-05, + "loss": 5.0438, + "step": 2427 + }, + { + "epoch": 0.15072319821217953, + "grad_norm": 0.4442916009602335, + "learning_rate": 5.023794744465136e-05, + "loss": 5.0536, + "step": 2428 + }, + { + "epoch": 0.15078527531193742, + "grad_norm": 0.4336872859107145, + "learning_rate": 5.0258638526794954e-05, + "loss": 5.0972, + "step": 2429 + }, + { + "epoch": 0.15084735241169533, + "grad_norm": 0.5650376194270589, + "learning_rate": 5.027932960893855e-05, + "loss": 5.1376, + "step": 2430 + }, + { + "epoch": 0.15090942951145322, + "grad_norm": 0.4619445468126129, + "learning_rate": 5.030002069108215e-05, + "loss": 5.0383, + "step": 2431 + }, + { + "epoch": 0.15097150661121111, + "grad_norm": 0.43967952406534905, + "learning_rate": 5.032071177322574e-05, + "loss": 5.0377, + "step": 2432 + }, + { + "epoch": 0.15103358371096903, + "grad_norm": 0.41940280859060913, + "learning_rate": 5.0341402855369335e-05, + "loss": 4.931, + "step": 2433 + }, + { + "epoch": 0.15109566081072692, + "grad_norm": 0.48337303634335455, + "learning_rate": 5.036209393751293e-05, + "loss": 5.0836, + "step": 2434 + }, + { + "epoch": 0.1511577379104848, + "grad_norm": 0.534036006817238, + "learning_rate": 5.038278501965653e-05, + "loss": 5.0839, + "step": 2435 + }, + { + "epoch": 0.15121981501024273, + "grad_norm": 0.3351302761514842, + "learning_rate": 5.040347610180013e-05, + "loss": 5.1604, + "step": 2436 + }, + { + "epoch": 0.15128189211000062, + "grad_norm": 0.4039449588370889, + "learning_rate": 5.042416718394373e-05, + "loss": 5.0446, + "step": 2437 + }, + { + "epoch": 0.1513439692097585, + "grad_norm": 0.42742382603435186, + "learning_rate": 5.044485826608732e-05, + "loss": 4.9631, + "step": 2438 + }, + { + "epoch": 0.15140604630951643, + "grad_norm": 0.605953514709843, + "learning_rate": 5.046554934823091e-05, + "loss": 5.0741, + "step": 2439 + }, + { + "epoch": 0.15146812340927432, + "grad_norm": 0.35430651129053653, + "learning_rate": 5.048624043037451e-05, + "loss": 5.0506, + "step": 2440 + }, + { + "epoch": 0.1515302005090322, + "grad_norm": 0.47893736024157846, + "learning_rate": 5.050693151251811e-05, + "loss": 5.0722, + "step": 2441 + }, + { + "epoch": 0.15159227760879013, + "grad_norm": 0.4484739846995343, + "learning_rate": 5.0527622594661704e-05, + "loss": 5.074, + "step": 2442 + }, + { + "epoch": 0.15165435470854802, + "grad_norm": 0.4783272352920944, + "learning_rate": 5.0548313676805306e-05, + "loss": 5.0691, + "step": 2443 + }, + { + "epoch": 0.1517164318083059, + "grad_norm": 0.5106304330898622, + "learning_rate": 5.056900475894889e-05, + "loss": 5.004, + "step": 2444 + }, + { + "epoch": 0.15177850890806383, + "grad_norm": 0.5834340848654211, + "learning_rate": 5.058969584109249e-05, + "loss": 4.9421, + "step": 2445 + }, + { + "epoch": 0.15184058600782172, + "grad_norm": 0.39959252061846434, + "learning_rate": 5.0610386923236084e-05, + "loss": 5.0096, + "step": 2446 + }, + { + "epoch": 0.1519026631075796, + "grad_norm": 0.43822584226976613, + "learning_rate": 5.0631078005379686e-05, + "loss": 5.0192, + "step": 2447 + }, + { + "epoch": 0.15196474020733752, + "grad_norm": 0.46977422397089214, + "learning_rate": 5.065176908752328e-05, + "loss": 5.0344, + "step": 2448 + }, + { + "epoch": 0.1520268173070954, + "grad_norm": 0.4781952856175673, + "learning_rate": 5.067246016966688e-05, + "loss": 5.0058, + "step": 2449 + }, + { + "epoch": 0.1520888944068533, + "grad_norm": 0.6541468206258252, + "learning_rate": 5.0693151251810465e-05, + "loss": 4.9568, + "step": 2450 + }, + { + "epoch": 0.15215097150661122, + "grad_norm": 0.39108849683853963, + "learning_rate": 5.0713842333954066e-05, + "loss": 4.9784, + "step": 2451 + }, + { + "epoch": 0.1522130486063691, + "grad_norm": 0.7376409913116905, + "learning_rate": 5.073453341609766e-05, + "loss": 5.0314, + "step": 2452 + }, + { + "epoch": 0.152275125706127, + "grad_norm": 0.5512410132006469, + "learning_rate": 5.075522449824126e-05, + "loss": 4.9494, + "step": 2453 + }, + { + "epoch": 0.15233720280588492, + "grad_norm": 0.5978839415611961, + "learning_rate": 5.077591558038486e-05, + "loss": 4.9807, + "step": 2454 + }, + { + "epoch": 0.1523992799056428, + "grad_norm": 0.4610852150303258, + "learning_rate": 5.0796606662528453e-05, + "loss": 4.9218, + "step": 2455 + }, + { + "epoch": 0.1524613570054007, + "grad_norm": 0.4206268248440522, + "learning_rate": 5.081729774467204e-05, + "loss": 4.9326, + "step": 2456 + }, + { + "epoch": 0.15252343410515862, + "grad_norm": 0.4708398802338664, + "learning_rate": 5.0837988826815644e-05, + "loss": 5.0475, + "step": 2457 + }, + { + "epoch": 0.1525855112049165, + "grad_norm": 0.4946756293775696, + "learning_rate": 5.085867990895924e-05, + "loss": 4.974, + "step": 2458 + }, + { + "epoch": 0.1526475883046744, + "grad_norm": 0.5106126318265969, + "learning_rate": 5.0879370991102834e-05, + "loss": 4.9282, + "step": 2459 + }, + { + "epoch": 0.15270966540443232, + "grad_norm": 0.4826517985772489, + "learning_rate": 5.0900062073246436e-05, + "loss": 5.0333, + "step": 2460 + }, + { + "epoch": 0.1527717425041902, + "grad_norm": 0.4112691404068419, + "learning_rate": 5.092075315539003e-05, + "loss": 5.0703, + "step": 2461 + }, + { + "epoch": 0.1528338196039481, + "grad_norm": 0.5554378162896938, + "learning_rate": 5.094144423753363e-05, + "loss": 5.0506, + "step": 2462 + }, + { + "epoch": 0.152895896703706, + "grad_norm": 0.3798880294794609, + "learning_rate": 5.096213531967722e-05, + "loss": 4.9977, + "step": 2463 + }, + { + "epoch": 0.1529579738034639, + "grad_norm": 0.5111943243998157, + "learning_rate": 5.0982826401820816e-05, + "loss": 4.8788, + "step": 2464 + }, + { + "epoch": 0.1530200509032218, + "grad_norm": 0.40909185809810455, + "learning_rate": 5.100351748396441e-05, + "loss": 4.8976, + "step": 2465 + }, + { + "epoch": 0.1530821280029797, + "grad_norm": 0.3532080497811812, + "learning_rate": 5.102420856610801e-05, + "loss": 4.9882, + "step": 2466 + }, + { + "epoch": 0.1531442051027376, + "grad_norm": 0.4574056868453591, + "learning_rate": 5.104489964825161e-05, + "loss": 5.0646, + "step": 2467 + }, + { + "epoch": 0.1532062822024955, + "grad_norm": 0.3242202322133206, + "learning_rate": 5.106559073039521e-05, + "loss": 4.9602, + "step": 2468 + }, + { + "epoch": 0.1532683593022534, + "grad_norm": 0.40116102952421095, + "learning_rate": 5.108628181253879e-05, + "loss": 5.0142, + "step": 2469 + }, + { + "epoch": 0.1533304364020113, + "grad_norm": 0.4177379862406479, + "learning_rate": 5.110697289468239e-05, + "loss": 5.0293, + "step": 2470 + }, + { + "epoch": 0.1533925135017692, + "grad_norm": 0.4600720443492179, + "learning_rate": 5.112766397682599e-05, + "loss": 5.0384, + "step": 2471 + }, + { + "epoch": 0.1534545906015271, + "grad_norm": 0.6312805045660568, + "learning_rate": 5.114835505896959e-05, + "loss": 4.9564, + "step": 2472 + }, + { + "epoch": 0.153516667701285, + "grad_norm": 0.6292343468505629, + "learning_rate": 5.1169046141113185e-05, + "loss": 5.0643, + "step": 2473 + }, + { + "epoch": 0.1535787448010429, + "grad_norm": 0.6444093299919021, + "learning_rate": 5.118973722325679e-05, + "loss": 4.9513, + "step": 2474 + }, + { + "epoch": 0.1536408219008008, + "grad_norm": 0.4556755434005075, + "learning_rate": 5.121042830540037e-05, + "loss": 4.9329, + "step": 2475 + }, + { + "epoch": 0.1537028990005587, + "grad_norm": 0.8435830198291966, + "learning_rate": 5.123111938754397e-05, + "loss": 5.0196, + "step": 2476 + }, + { + "epoch": 0.15376497610031659, + "grad_norm": 0.5204865127413761, + "learning_rate": 5.1251810469687566e-05, + "loss": 4.9641, + "step": 2477 + }, + { + "epoch": 0.1538270532000745, + "grad_norm": 0.6835923985059185, + "learning_rate": 5.127250155183117e-05, + "loss": 5.0513, + "step": 2478 + }, + { + "epoch": 0.1538891302998324, + "grad_norm": 0.9154963861953521, + "learning_rate": 5.129319263397476e-05, + "loss": 5.0317, + "step": 2479 + }, + { + "epoch": 0.15395120739959028, + "grad_norm": 1.4239398090950113, + "learning_rate": 5.131388371611836e-05, + "loss": 5.1179, + "step": 2480 + }, + { + "epoch": 0.1540132844993482, + "grad_norm": 0.9044447050932567, + "learning_rate": 5.1334574798261946e-05, + "loss": 5.0222, + "step": 2481 + }, + { + "epoch": 0.1540753615991061, + "grad_norm": 0.7042540915427555, + "learning_rate": 5.135526588040555e-05, + "loss": 5.0194, + "step": 2482 + }, + { + "epoch": 0.15413743869886398, + "grad_norm": 0.6274397675607857, + "learning_rate": 5.137595696254914e-05, + "loss": 5.0685, + "step": 2483 + }, + { + "epoch": 0.1541995157986219, + "grad_norm": 0.8129121761957286, + "learning_rate": 5.139664804469274e-05, + "loss": 4.894, + "step": 2484 + }, + { + "epoch": 0.1542615928983798, + "grad_norm": 0.5102133994930279, + "learning_rate": 5.141733912683634e-05, + "loss": 4.8503, + "step": 2485 + }, + { + "epoch": 0.15432366999813768, + "grad_norm": 0.5679892419531668, + "learning_rate": 5.1438030208979935e-05, + "loss": 4.9956, + "step": 2486 + }, + { + "epoch": 0.1543857470978956, + "grad_norm": 0.5340661969176898, + "learning_rate": 5.145872129112352e-05, + "loss": 4.9426, + "step": 2487 + }, + { + "epoch": 0.1544478241976535, + "grad_norm": 0.4106143754371345, + "learning_rate": 5.1479412373267125e-05, + "loss": 4.9291, + "step": 2488 + }, + { + "epoch": 0.15450990129741138, + "grad_norm": 0.7791122179505273, + "learning_rate": 5.150010345541072e-05, + "loss": 4.9179, + "step": 2489 + }, + { + "epoch": 0.1545719783971693, + "grad_norm": 0.8274737982620977, + "learning_rate": 5.1520794537554315e-05, + "loss": 5.0198, + "step": 2490 + }, + { + "epoch": 0.1546340554969272, + "grad_norm": 0.5928278337183683, + "learning_rate": 5.154148561969792e-05, + "loss": 5.0356, + "step": 2491 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 0.6064653869353249, + "learning_rate": 5.156217670184151e-05, + "loss": 5.0142, + "step": 2492 + }, + { + "epoch": 0.154758209696443, + "grad_norm": 0.5843091336017763, + "learning_rate": 5.15828677839851e-05, + "loss": 5.1038, + "step": 2493 + }, + { + "epoch": 0.15482028679620088, + "grad_norm": 0.4259506903360117, + "learning_rate": 5.1603558866128695e-05, + "loss": 5.0353, + "step": 2494 + }, + { + "epoch": 0.15488236389595877, + "grad_norm": 0.5928471627807032, + "learning_rate": 5.16242499482723e-05, + "loss": 4.9338, + "step": 2495 + }, + { + "epoch": 0.1549444409957167, + "grad_norm": 0.6269905815938136, + "learning_rate": 5.164494103041589e-05, + "loss": 5.0165, + "step": 2496 + }, + { + "epoch": 0.15500651809547458, + "grad_norm": 0.6832590864326166, + "learning_rate": 5.1665632112559494e-05, + "loss": 5.0018, + "step": 2497 + }, + { + "epoch": 0.15506859519523247, + "grad_norm": 0.4626560018964551, + "learning_rate": 5.168632319470309e-05, + "loss": 4.9994, + "step": 2498 + }, + { + "epoch": 0.1551306722949904, + "grad_norm": 0.44853657880674186, + "learning_rate": 5.170701427684668e-05, + "loss": 4.9525, + "step": 2499 + }, + { + "epoch": 0.15519274939474828, + "grad_norm": 0.48396954948618875, + "learning_rate": 5.172770535899027e-05, + "loss": 4.9944, + "step": 2500 + }, + { + "epoch": 0.15525482649450617, + "grad_norm": 0.5927380672252853, + "learning_rate": 5.1748396441133874e-05, + "loss": 4.8366, + "step": 2501 + }, + { + "epoch": 0.1553169035942641, + "grad_norm": 0.43223344648268575, + "learning_rate": 5.176908752327747e-05, + "loss": 4.9653, + "step": 2502 + }, + { + "epoch": 0.15537898069402198, + "grad_norm": 0.548445897703733, + "learning_rate": 5.178977860542107e-05, + "loss": 4.9724, + "step": 2503 + }, + { + "epoch": 0.15544105779377987, + "grad_norm": 0.4216429284007932, + "learning_rate": 5.1810469687564666e-05, + "loss": 5.0244, + "step": 2504 + }, + { + "epoch": 0.1555031348935378, + "grad_norm": 0.5431844433825882, + "learning_rate": 5.1831160769708255e-05, + "loss": 5.0035, + "step": 2505 + }, + { + "epoch": 0.15556521199329568, + "grad_norm": 0.6442012982684332, + "learning_rate": 5.185185185185185e-05, + "loss": 5.1082, + "step": 2506 + }, + { + "epoch": 0.15562728909305357, + "grad_norm": 0.4547092924106471, + "learning_rate": 5.187254293399545e-05, + "loss": 4.8796, + "step": 2507 + }, + { + "epoch": 0.15568936619281148, + "grad_norm": 0.7383927356972251, + "learning_rate": 5.189323401613905e-05, + "loss": 4.9733, + "step": 2508 + }, + { + "epoch": 0.15575144329256937, + "grad_norm": 0.4692534762809079, + "learning_rate": 5.191392509828265e-05, + "loss": 4.8601, + "step": 2509 + }, + { + "epoch": 0.15581352039232726, + "grad_norm": 0.5210342464760104, + "learning_rate": 5.1934616180426244e-05, + "loss": 4.9221, + "step": 2510 + }, + { + "epoch": 0.15587559749208518, + "grad_norm": 0.4853897540227252, + "learning_rate": 5.195530726256983e-05, + "loss": 4.8622, + "step": 2511 + }, + { + "epoch": 0.15593767459184307, + "grad_norm": 0.5254044938650013, + "learning_rate": 5.197599834471343e-05, + "loss": 4.929, + "step": 2512 + }, + { + "epoch": 0.15599975169160096, + "grad_norm": 0.7054606533325192, + "learning_rate": 5.199668942685703e-05, + "loss": 4.9401, + "step": 2513 + }, + { + "epoch": 0.15606182879135888, + "grad_norm": 0.8385324576367786, + "learning_rate": 5.2017380509000624e-05, + "loss": 5.0392, + "step": 2514 + }, + { + "epoch": 0.15612390589111677, + "grad_norm": 0.5880923004464962, + "learning_rate": 5.203807159114422e-05, + "loss": 4.9911, + "step": 2515 + }, + { + "epoch": 0.15618598299087466, + "grad_norm": 0.5061765876570654, + "learning_rate": 5.205876267328782e-05, + "loss": 4.825, + "step": 2516 + }, + { + "epoch": 0.15624806009063258, + "grad_norm": 0.8186202623501365, + "learning_rate": 5.207945375543141e-05, + "loss": 4.9616, + "step": 2517 + }, + { + "epoch": 0.15631013719039047, + "grad_norm": 0.9526639041902849, + "learning_rate": 5.2100144837575004e-05, + "loss": 5.001, + "step": 2518 + }, + { + "epoch": 0.15637221429014836, + "grad_norm": 0.5876337548443406, + "learning_rate": 5.21208359197186e-05, + "loss": 5.0264, + "step": 2519 + }, + { + "epoch": 0.15643429138990628, + "grad_norm": 0.591731480173672, + "learning_rate": 5.21415270018622e-05, + "loss": 4.9304, + "step": 2520 + }, + { + "epoch": 0.15649636848966417, + "grad_norm": 0.6252158030828842, + "learning_rate": 5.2162218084005796e-05, + "loss": 4.9491, + "step": 2521 + }, + { + "epoch": 0.15655844558942206, + "grad_norm": 1.0669726310444656, + "learning_rate": 5.21829091661494e-05, + "loss": 5.0557, + "step": 2522 + }, + { + "epoch": 0.15662052268917998, + "grad_norm": 0.808333018555203, + "learning_rate": 5.2203600248292986e-05, + "loss": 4.8854, + "step": 2523 + }, + { + "epoch": 0.15668259978893787, + "grad_norm": 0.5773227641378569, + "learning_rate": 5.222429133043658e-05, + "loss": 5.0, + "step": 2524 + }, + { + "epoch": 0.15674467688869576, + "grad_norm": 0.6950033121922009, + "learning_rate": 5.2244982412580177e-05, + "loss": 4.96, + "step": 2525 + }, + { + "epoch": 0.15680675398845367, + "grad_norm": 0.5445291341250672, + "learning_rate": 5.226567349472378e-05, + "loss": 4.8521, + "step": 2526 + }, + { + "epoch": 0.15686883108821156, + "grad_norm": 0.5876906552293447, + "learning_rate": 5.2286364576867374e-05, + "loss": 4.9474, + "step": 2527 + }, + { + "epoch": 0.15693090818796945, + "grad_norm": 0.6180253438197216, + "learning_rate": 5.2307055659010975e-05, + "loss": 4.8958, + "step": 2528 + }, + { + "epoch": 0.15699298528772734, + "grad_norm": 0.588820548923014, + "learning_rate": 5.232774674115456e-05, + "loss": 4.9178, + "step": 2529 + }, + { + "epoch": 0.15705506238748526, + "grad_norm": 0.5627038639243719, + "learning_rate": 5.234843782329816e-05, + "loss": 4.968, + "step": 2530 + }, + { + "epoch": 0.15711713948724315, + "grad_norm": 0.437142055404553, + "learning_rate": 5.2369128905441754e-05, + "loss": 5.0381, + "step": 2531 + }, + { + "epoch": 0.15717921658700104, + "grad_norm": 0.6393479901050166, + "learning_rate": 5.2389819987585356e-05, + "loss": 4.9761, + "step": 2532 + }, + { + "epoch": 0.15724129368675896, + "grad_norm": 0.5508810723932827, + "learning_rate": 5.241051106972895e-05, + "loss": 4.9422, + "step": 2533 + }, + { + "epoch": 0.15730337078651685, + "grad_norm": 0.6439687181406961, + "learning_rate": 5.243120215187255e-05, + "loss": 4.928, + "step": 2534 + }, + { + "epoch": 0.15736544788627474, + "grad_norm": 0.5225334331092015, + "learning_rate": 5.2451893234016134e-05, + "loss": 4.8804, + "step": 2535 + }, + { + "epoch": 0.15742752498603266, + "grad_norm": 0.5235210465897793, + "learning_rate": 5.2472584316159736e-05, + "loss": 4.9011, + "step": 2536 + }, + { + "epoch": 0.15748960208579055, + "grad_norm": 0.5175561000528243, + "learning_rate": 5.249327539830333e-05, + "loss": 4.9234, + "step": 2537 + }, + { + "epoch": 0.15755167918554844, + "grad_norm": 0.7220361471928455, + "learning_rate": 5.251396648044693e-05, + "loss": 4.8972, + "step": 2538 + }, + { + "epoch": 0.15761375628530636, + "grad_norm": 0.4845376972014352, + "learning_rate": 5.253465756259053e-05, + "loss": 4.9047, + "step": 2539 + }, + { + "epoch": 0.15767583338506425, + "grad_norm": 0.6564353726470684, + "learning_rate": 5.255534864473412e-05, + "loss": 4.8549, + "step": 2540 + }, + { + "epoch": 0.15773791048482214, + "grad_norm": 0.44369537796396974, + "learning_rate": 5.257603972687771e-05, + "loss": 4.8656, + "step": 2541 + }, + { + "epoch": 0.15779998758458005, + "grad_norm": 0.5594301697360838, + "learning_rate": 5.259673080902131e-05, + "loss": 4.8887, + "step": 2542 + }, + { + "epoch": 0.15786206468433794, + "grad_norm": 0.586396482626695, + "learning_rate": 5.261742189116491e-05, + "loss": 4.9374, + "step": 2543 + }, + { + "epoch": 0.15792414178409583, + "grad_norm": 0.5582549394781079, + "learning_rate": 5.26381129733085e-05, + "loss": 5.0242, + "step": 2544 + }, + { + "epoch": 0.15798621888385375, + "grad_norm": 0.6085165530643495, + "learning_rate": 5.2658804055452105e-05, + "loss": 4.9601, + "step": 2545 + }, + { + "epoch": 0.15804829598361164, + "grad_norm": 0.4389665749996276, + "learning_rate": 5.26794951375957e-05, + "loss": 4.7787, + "step": 2546 + }, + { + "epoch": 0.15811037308336953, + "grad_norm": 0.8282559849056222, + "learning_rate": 5.27001862197393e-05, + "loss": 4.9416, + "step": 2547 + }, + { + "epoch": 0.15817245018312745, + "grad_norm": 1.2251071921958483, + "learning_rate": 5.272087730188289e-05, + "loss": 4.9073, + "step": 2548 + }, + { + "epoch": 0.15823452728288534, + "grad_norm": 0.6964802526329537, + "learning_rate": 5.2741568384026486e-05, + "loss": 4.9682, + "step": 2549 + }, + { + "epoch": 0.15829660438264323, + "grad_norm": 0.6567071503135078, + "learning_rate": 5.276225946617008e-05, + "loss": 4.8987, + "step": 2550 + }, + { + "epoch": 0.15835868148240115, + "grad_norm": 0.7231905637278437, + "learning_rate": 5.278295054831368e-05, + "loss": 4.8793, + "step": 2551 + }, + { + "epoch": 0.15842075858215904, + "grad_norm": 0.6613355858117758, + "learning_rate": 5.280364163045728e-05, + "loss": 4.9216, + "step": 2552 + }, + { + "epoch": 0.15848283568191693, + "grad_norm": 0.5858379533765499, + "learning_rate": 5.282433271260088e-05, + "loss": 5.0608, + "step": 2553 + }, + { + "epoch": 0.15854491278167485, + "grad_norm": 0.656612863583533, + "learning_rate": 5.284502379474446e-05, + "loss": 4.917, + "step": 2554 + }, + { + "epoch": 0.15860698988143274, + "grad_norm": 0.42885986730483583, + "learning_rate": 5.286571487688806e-05, + "loss": 4.9332, + "step": 2555 + }, + { + "epoch": 0.15866906698119063, + "grad_norm": 1.023278551644207, + "learning_rate": 5.288640595903166e-05, + "loss": 4.9627, + "step": 2556 + }, + { + "epoch": 0.15873114408094854, + "grad_norm": 0.6578916271035834, + "learning_rate": 5.290709704117526e-05, + "loss": 4.9098, + "step": 2557 + }, + { + "epoch": 0.15879322118070643, + "grad_norm": 0.5546912609563849, + "learning_rate": 5.2927788123318855e-05, + "loss": 4.9442, + "step": 2558 + }, + { + "epoch": 0.15885529828046432, + "grad_norm": 0.5999976028950809, + "learning_rate": 5.294847920546246e-05, + "loss": 4.9369, + "step": 2559 + }, + { + "epoch": 0.15891737538022224, + "grad_norm": 0.5312156385641099, + "learning_rate": 5.296917028760604e-05, + "loss": 4.9031, + "step": 2560 + }, + { + "epoch": 0.15897945247998013, + "grad_norm": 0.5918098674896505, + "learning_rate": 5.298986136974964e-05, + "loss": 4.819, + "step": 2561 + }, + { + "epoch": 0.15904152957973802, + "grad_norm": 0.6429347755846839, + "learning_rate": 5.3010552451893235e-05, + "loss": 4.7591, + "step": 2562 + }, + { + "epoch": 0.15910360667949594, + "grad_norm": 0.4548907897171122, + "learning_rate": 5.303124353403684e-05, + "loss": 4.8647, + "step": 2563 + }, + { + "epoch": 0.15916568377925383, + "grad_norm": 1.0823535645707523, + "learning_rate": 5.305193461618043e-05, + "loss": 4.8746, + "step": 2564 + }, + { + "epoch": 0.15922776087901172, + "grad_norm": 0.5351880196378717, + "learning_rate": 5.307262569832403e-05, + "loss": 4.8896, + "step": 2565 + }, + { + "epoch": 0.15928983797876964, + "grad_norm": 0.646520815280402, + "learning_rate": 5.3093316780467615e-05, + "loss": 4.907, + "step": 2566 + }, + { + "epoch": 0.15935191507852753, + "grad_norm": 0.5400661027150462, + "learning_rate": 5.311400786261122e-05, + "loss": 4.8886, + "step": 2567 + }, + { + "epoch": 0.15941399217828542, + "grad_norm": 0.9725097484639377, + "learning_rate": 5.313469894475481e-05, + "loss": 4.8716, + "step": 2568 + }, + { + "epoch": 0.15947606927804334, + "grad_norm": 0.9549223727986698, + "learning_rate": 5.315539002689841e-05, + "loss": 4.9351, + "step": 2569 + }, + { + "epoch": 0.15953814637780123, + "grad_norm": 0.5159630967461755, + "learning_rate": 5.317608110904201e-05, + "loss": 4.9465, + "step": 2570 + }, + { + "epoch": 0.15960022347755912, + "grad_norm": 0.7415539898808057, + "learning_rate": 5.3196772191185604e-05, + "loss": 4.8704, + "step": 2571 + }, + { + "epoch": 0.15966230057731703, + "grad_norm": 0.48904245170962957, + "learning_rate": 5.321746327332919e-05, + "loss": 4.9094, + "step": 2572 + }, + { + "epoch": 0.15972437767707492, + "grad_norm": 0.5795369304873292, + "learning_rate": 5.3238154355472794e-05, + "loss": 4.8685, + "step": 2573 + }, + { + "epoch": 0.15978645477683281, + "grad_norm": 0.6473534956632837, + "learning_rate": 5.325884543761639e-05, + "loss": 4.9266, + "step": 2574 + }, + { + "epoch": 0.15984853187659073, + "grad_norm": 0.6314122756258906, + "learning_rate": 5.3279536519759985e-05, + "loss": 4.8382, + "step": 2575 + }, + { + "epoch": 0.15991060897634862, + "grad_norm": 0.5503415456078634, + "learning_rate": 5.3300227601903586e-05, + "loss": 4.9742, + "step": 2576 + }, + { + "epoch": 0.1599726860761065, + "grad_norm": 0.5333547411536165, + "learning_rate": 5.332091868404718e-05, + "loss": 4.9587, + "step": 2577 + }, + { + "epoch": 0.16003476317586443, + "grad_norm": 0.5487112008978987, + "learning_rate": 5.334160976619077e-05, + "loss": 4.8971, + "step": 2578 + }, + { + "epoch": 0.16009684027562232, + "grad_norm": 0.4217081269731492, + "learning_rate": 5.3362300848334365e-05, + "loss": 4.7671, + "step": 2579 + }, + { + "epoch": 0.1601589173753802, + "grad_norm": 0.7785714580690871, + "learning_rate": 5.338299193047797e-05, + "loss": 4.8996, + "step": 2580 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 0.593216782112115, + "learning_rate": 5.340368301262156e-05, + "loss": 4.8976, + "step": 2581 + }, + { + "epoch": 0.16028307157489602, + "grad_norm": 0.5429899911362001, + "learning_rate": 5.3424374094765164e-05, + "loss": 4.8728, + "step": 2582 + }, + { + "epoch": 0.1603451486746539, + "grad_norm": 0.5799847151945592, + "learning_rate": 5.344506517690876e-05, + "loss": 4.843, + "step": 2583 + }, + { + "epoch": 0.16040722577441183, + "grad_norm": 0.6272507195953219, + "learning_rate": 5.346575625905235e-05, + "loss": 4.8758, + "step": 2584 + }, + { + "epoch": 0.16046930287416972, + "grad_norm": 0.7073974082755858, + "learning_rate": 5.348644734119594e-05, + "loss": 4.9477, + "step": 2585 + }, + { + "epoch": 0.1605313799739276, + "grad_norm": 0.49339881068845176, + "learning_rate": 5.3507138423339544e-05, + "loss": 4.8656, + "step": 2586 + }, + { + "epoch": 0.16059345707368552, + "grad_norm": 0.7070324513124829, + "learning_rate": 5.352782950548314e-05, + "loss": 4.8066, + "step": 2587 + }, + { + "epoch": 0.16065553417344342, + "grad_norm": 0.5668872500928698, + "learning_rate": 5.354852058762674e-05, + "loss": 4.8223, + "step": 2588 + }, + { + "epoch": 0.1607176112732013, + "grad_norm": 0.7379147058680234, + "learning_rate": 5.3569211669770336e-05, + "loss": 4.795, + "step": 2589 + }, + { + "epoch": 0.16077968837295922, + "grad_norm": 0.5174065647454907, + "learning_rate": 5.3589902751913924e-05, + "loss": 4.8815, + "step": 2590 + }, + { + "epoch": 0.1608417654727171, + "grad_norm": 0.5935237101950308, + "learning_rate": 5.361059383405752e-05, + "loss": 4.7954, + "step": 2591 + }, + { + "epoch": 0.160903842572475, + "grad_norm": 0.4893198031139812, + "learning_rate": 5.363128491620112e-05, + "loss": 4.9198, + "step": 2592 + }, + { + "epoch": 0.16096591967223292, + "grad_norm": 0.5666158880127634, + "learning_rate": 5.3651975998344716e-05, + "loss": 4.9077, + "step": 2593 + }, + { + "epoch": 0.1610279967719908, + "grad_norm": 0.7129080574308037, + "learning_rate": 5.367266708048831e-05, + "loss": 4.8874, + "step": 2594 + }, + { + "epoch": 0.1610900738717487, + "grad_norm": 0.5628443035097384, + "learning_rate": 5.369335816263191e-05, + "loss": 4.8425, + "step": 2595 + }, + { + "epoch": 0.16115215097150662, + "grad_norm": 0.5352494096071211, + "learning_rate": 5.37140492447755e-05, + "loss": 4.9307, + "step": 2596 + }, + { + "epoch": 0.1612142280712645, + "grad_norm": 0.42137605691198415, + "learning_rate": 5.37347403269191e-05, + "loss": 4.9247, + "step": 2597 + }, + { + "epoch": 0.1612763051710224, + "grad_norm": 0.5311635732471905, + "learning_rate": 5.37554314090627e-05, + "loss": 4.8545, + "step": 2598 + }, + { + "epoch": 0.16133838227078032, + "grad_norm": 0.5219325450762061, + "learning_rate": 5.3776122491206294e-05, + "loss": 4.8427, + "step": 2599 + }, + { + "epoch": 0.1614004593705382, + "grad_norm": 0.6651148367802315, + "learning_rate": 5.379681357334989e-05, + "loss": 4.9782, + "step": 2600 + }, + { + "epoch": 0.1614625364702961, + "grad_norm": 0.40428482258841336, + "learning_rate": 5.381750465549349e-05, + "loss": 4.8808, + "step": 2601 + }, + { + "epoch": 0.16152461357005402, + "grad_norm": 0.5190366240406988, + "learning_rate": 5.383819573763708e-05, + "loss": 4.8863, + "step": 2602 + }, + { + "epoch": 0.1615866906698119, + "grad_norm": 0.48899348384420044, + "learning_rate": 5.3858886819780674e-05, + "loss": 4.898, + "step": 2603 + }, + { + "epoch": 0.1616487677695698, + "grad_norm": 0.6114829357944687, + "learning_rate": 5.387957790192427e-05, + "loss": 4.8964, + "step": 2604 + }, + { + "epoch": 0.1617108448693277, + "grad_norm": 0.6417321198591396, + "learning_rate": 5.390026898406787e-05, + "loss": 4.8567, + "step": 2605 + }, + { + "epoch": 0.1617729219690856, + "grad_norm": 0.38925363143153074, + "learning_rate": 5.3920960066211466e-05, + "loss": 4.8944, + "step": 2606 + }, + { + "epoch": 0.1618349990688435, + "grad_norm": 0.6298302207105041, + "learning_rate": 5.394165114835507e-05, + "loss": 4.8185, + "step": 2607 + }, + { + "epoch": 0.1618970761686014, + "grad_norm": 0.5459725699818136, + "learning_rate": 5.396234223049865e-05, + "loss": 4.9338, + "step": 2608 + }, + { + "epoch": 0.1619591532683593, + "grad_norm": 0.44741232519280916, + "learning_rate": 5.398303331264225e-05, + "loss": 4.8097, + "step": 2609 + }, + { + "epoch": 0.1620212303681172, + "grad_norm": 0.48310346755273936, + "learning_rate": 5.4003724394785846e-05, + "loss": 4.8058, + "step": 2610 + }, + { + "epoch": 0.1620833074678751, + "grad_norm": 0.524987842782359, + "learning_rate": 5.402441547692945e-05, + "loss": 4.8103, + "step": 2611 + }, + { + "epoch": 0.162145384567633, + "grad_norm": 0.5206739436083521, + "learning_rate": 5.404510655907304e-05, + "loss": 4.8118, + "step": 2612 + }, + { + "epoch": 0.1622074616673909, + "grad_norm": 0.5377100689266453, + "learning_rate": 5.4065797641216645e-05, + "loss": 4.8744, + "step": 2613 + }, + { + "epoch": 0.1622695387671488, + "grad_norm": 0.5981451789696748, + "learning_rate": 5.4086488723360226e-05, + "loss": 4.7815, + "step": 2614 + }, + { + "epoch": 0.1623316158669067, + "grad_norm": 0.888076582843212, + "learning_rate": 5.410717980550383e-05, + "loss": 4.8694, + "step": 2615 + }, + { + "epoch": 0.1623936929666646, + "grad_norm": 0.7917683728455917, + "learning_rate": 5.4127870887647423e-05, + "loss": 4.7976, + "step": 2616 + }, + { + "epoch": 0.1624557700664225, + "grad_norm": 0.8185060865773636, + "learning_rate": 5.4148561969791025e-05, + "loss": 4.9219, + "step": 2617 + }, + { + "epoch": 0.1625178471661804, + "grad_norm": 0.736645900647955, + "learning_rate": 5.416925305193462e-05, + "loss": 4.8976, + "step": 2618 + }, + { + "epoch": 0.16257992426593829, + "grad_norm": 0.8143200662723855, + "learning_rate": 5.4189944134078215e-05, + "loss": 4.7888, + "step": 2619 + }, + { + "epoch": 0.1626420013656962, + "grad_norm": 0.6006351357754962, + "learning_rate": 5.4210635216221804e-05, + "loss": 4.8037, + "step": 2620 + }, + { + "epoch": 0.1627040784654541, + "grad_norm": 0.9398882645286002, + "learning_rate": 5.4231326298365406e-05, + "loss": 4.697, + "step": 2621 + }, + { + "epoch": 0.16276615556521198, + "grad_norm": 0.6298608110710568, + "learning_rate": 5.4252017380509e-05, + "loss": 4.7782, + "step": 2622 + }, + { + "epoch": 0.1628282326649699, + "grad_norm": 0.6199534042215092, + "learning_rate": 5.42727084626526e-05, + "loss": 4.8719, + "step": 2623 + }, + { + "epoch": 0.1628903097647278, + "grad_norm": 0.6494537570839702, + "learning_rate": 5.42933995447962e-05, + "loss": 4.8255, + "step": 2624 + }, + { + "epoch": 0.16295238686448568, + "grad_norm": 0.614324690387834, + "learning_rate": 5.431409062693979e-05, + "loss": 4.8986, + "step": 2625 + }, + { + "epoch": 0.1630144639642436, + "grad_norm": 0.5741748403771124, + "learning_rate": 5.433478170908338e-05, + "loss": 4.7808, + "step": 2626 + }, + { + "epoch": 0.1630765410640015, + "grad_norm": 0.9896712950044382, + "learning_rate": 5.435547279122698e-05, + "loss": 4.8989, + "step": 2627 + }, + { + "epoch": 0.16313861816375938, + "grad_norm": 0.9096167606799949, + "learning_rate": 5.437616387337058e-05, + "loss": 4.904, + "step": 2628 + }, + { + "epoch": 0.1632006952635173, + "grad_norm": 0.6993224424057196, + "learning_rate": 5.439685495551417e-05, + "loss": 4.771, + "step": 2629 + }, + { + "epoch": 0.1632627723632752, + "grad_norm": 0.7075543655023612, + "learning_rate": 5.4417546037657775e-05, + "loss": 4.8408, + "step": 2630 + }, + { + "epoch": 0.16332484946303308, + "grad_norm": 0.5652497685569886, + "learning_rate": 5.443823711980137e-05, + "loss": 4.8912, + "step": 2631 + }, + { + "epoch": 0.163386926562791, + "grad_norm": 0.6073791881416098, + "learning_rate": 5.445892820194497e-05, + "loss": 4.7867, + "step": 2632 + }, + { + "epoch": 0.1634490036625489, + "grad_norm": 0.6375651905910626, + "learning_rate": 5.447961928408855e-05, + "loss": 4.8461, + "step": 2633 + }, + { + "epoch": 0.16351108076230678, + "grad_norm": 0.47326323257360975, + "learning_rate": 5.4500310366232155e-05, + "loss": 4.9774, + "step": 2634 + }, + { + "epoch": 0.1635731578620647, + "grad_norm": 0.41911657429686766, + "learning_rate": 5.452100144837575e-05, + "loss": 4.8825, + "step": 2635 + }, + { + "epoch": 0.16363523496182258, + "grad_norm": 0.46217483998383213, + "learning_rate": 5.454169253051935e-05, + "loss": 4.8574, + "step": 2636 + }, + { + "epoch": 0.16369731206158047, + "grad_norm": 0.45265031483465545, + "learning_rate": 5.456238361266295e-05, + "loss": 4.7558, + "step": 2637 + }, + { + "epoch": 0.1637593891613384, + "grad_norm": 0.4863501547607571, + "learning_rate": 5.458307469480655e-05, + "loss": 4.8305, + "step": 2638 + }, + { + "epoch": 0.16382146626109628, + "grad_norm": 0.5798790586921171, + "learning_rate": 5.460376577695013e-05, + "loss": 4.8151, + "step": 2639 + }, + { + "epoch": 0.16388354336085417, + "grad_norm": 0.42033498980399703, + "learning_rate": 5.462445685909373e-05, + "loss": 4.827, + "step": 2640 + }, + { + "epoch": 0.1639456204606121, + "grad_norm": 0.43104928708550166, + "learning_rate": 5.464514794123733e-05, + "loss": 4.7689, + "step": 2641 + }, + { + "epoch": 0.16400769756036998, + "grad_norm": 0.40050642618460086, + "learning_rate": 5.466583902338093e-05, + "loss": 4.8683, + "step": 2642 + }, + { + "epoch": 0.16406977466012787, + "grad_norm": 0.5439559141047063, + "learning_rate": 5.4686530105524524e-05, + "loss": 4.8869, + "step": 2643 + }, + { + "epoch": 0.1641318517598858, + "grad_norm": 0.5738092376529822, + "learning_rate": 5.4707221187668126e-05, + "loss": 4.852, + "step": 2644 + }, + { + "epoch": 0.16419392885964368, + "grad_norm": 0.3709811391123374, + "learning_rate": 5.472791226981171e-05, + "loss": 4.8841, + "step": 2645 + }, + { + "epoch": 0.16425600595940157, + "grad_norm": 0.4195198441132402, + "learning_rate": 5.474860335195531e-05, + "loss": 4.8597, + "step": 2646 + }, + { + "epoch": 0.1643180830591595, + "grad_norm": 0.43881618541239725, + "learning_rate": 5.4769294434098905e-05, + "loss": 4.8653, + "step": 2647 + }, + { + "epoch": 0.16438016015891738, + "grad_norm": 0.521788376340438, + "learning_rate": 5.4789985516242507e-05, + "loss": 4.7448, + "step": 2648 + }, + { + "epoch": 0.16444223725867527, + "grad_norm": 0.39367302944576554, + "learning_rate": 5.48106765983861e-05, + "loss": 4.7847, + "step": 2649 + }, + { + "epoch": 0.16450431435843318, + "grad_norm": 0.3975424343227396, + "learning_rate": 5.48313676805297e-05, + "loss": 4.7368, + "step": 2650 + }, + { + "epoch": 0.16456639145819107, + "grad_norm": 0.3262731543359486, + "learning_rate": 5.4852058762673285e-05, + "loss": 4.8141, + "step": 2651 + }, + { + "epoch": 0.16462846855794896, + "grad_norm": 0.3972858352546559, + "learning_rate": 5.487274984481689e-05, + "loss": 4.8324, + "step": 2652 + }, + { + "epoch": 0.16469054565770688, + "grad_norm": 0.3886029743101369, + "learning_rate": 5.489344092696048e-05, + "loss": 4.7991, + "step": 2653 + }, + { + "epoch": 0.16475262275746477, + "grad_norm": 0.4126189096608617, + "learning_rate": 5.491413200910408e-05, + "loss": 4.7285, + "step": 2654 + }, + { + "epoch": 0.16481469985722266, + "grad_norm": 0.4378471019394128, + "learning_rate": 5.493482309124768e-05, + "loss": 4.7946, + "step": 2655 + }, + { + "epoch": 0.16487677695698058, + "grad_norm": 0.35148808433473017, + "learning_rate": 5.4955514173391274e-05, + "loss": 4.5922, + "step": 2656 + }, + { + "epoch": 0.16493885405673847, + "grad_norm": 0.34449620966148026, + "learning_rate": 5.497620525553486e-05, + "loss": 4.8483, + "step": 2657 + }, + { + "epoch": 0.16500093115649636, + "grad_norm": 0.32565188503753567, + "learning_rate": 5.4996896337678464e-05, + "loss": 4.8454, + "step": 2658 + }, + { + "epoch": 0.16506300825625428, + "grad_norm": 0.4390197481946843, + "learning_rate": 5.501758741982206e-05, + "loss": 4.8496, + "step": 2659 + }, + { + "epoch": 0.16512508535601217, + "grad_norm": 0.3834585266130756, + "learning_rate": 5.5038278501965654e-05, + "loss": 4.8621, + "step": 2660 + }, + { + "epoch": 0.16518716245577006, + "grad_norm": 0.49426622264655773, + "learning_rate": 5.5058969584109256e-05, + "loss": 4.8111, + "step": 2661 + }, + { + "epoch": 0.16524923955552798, + "grad_norm": 0.46869633685341466, + "learning_rate": 5.507966066625285e-05, + "loss": 4.8049, + "step": 2662 + }, + { + "epoch": 0.16531131665528587, + "grad_norm": 0.4060816599450461, + "learning_rate": 5.510035174839644e-05, + "loss": 4.8862, + "step": 2663 + }, + { + "epoch": 0.16537339375504376, + "grad_norm": 0.4611419935334419, + "learning_rate": 5.5121042830540035e-05, + "loss": 4.7326, + "step": 2664 + }, + { + "epoch": 0.16543547085480168, + "grad_norm": 0.3720748180206721, + "learning_rate": 5.5141733912683636e-05, + "loss": 4.8146, + "step": 2665 + }, + { + "epoch": 0.16549754795455957, + "grad_norm": 0.40802361223555006, + "learning_rate": 5.516242499482723e-05, + "loss": 4.7107, + "step": 2666 + }, + { + "epoch": 0.16555962505431746, + "grad_norm": 0.387041361941953, + "learning_rate": 5.518311607697083e-05, + "loss": 4.7931, + "step": 2667 + }, + { + "epoch": 0.16562170215407537, + "grad_norm": 0.46297090373470073, + "learning_rate": 5.520380715911443e-05, + "loss": 4.9155, + "step": 2668 + }, + { + "epoch": 0.16568377925383326, + "grad_norm": 0.34644529988900374, + "learning_rate": 5.522449824125802e-05, + "loss": 4.7196, + "step": 2669 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 0.6328558730343649, + "learning_rate": 5.524518932340161e-05, + "loss": 4.8538, + "step": 2670 + }, + { + "epoch": 0.16580793345334907, + "grad_norm": 0.7015769022459438, + "learning_rate": 5.5265880405545214e-05, + "loss": 4.8571, + "step": 2671 + }, + { + "epoch": 0.16587001055310696, + "grad_norm": 0.44312577668567166, + "learning_rate": 5.528657148768881e-05, + "loss": 4.7436, + "step": 2672 + }, + { + "epoch": 0.16593208765286485, + "grad_norm": 0.7418566719368608, + "learning_rate": 5.530726256983241e-05, + "loss": 4.7734, + "step": 2673 + }, + { + "epoch": 0.16599416475262277, + "grad_norm": 0.6926559318474629, + "learning_rate": 5.5327953651976006e-05, + "loss": 4.84, + "step": 2674 + }, + { + "epoch": 0.16605624185238066, + "grad_norm": 0.3784497450746173, + "learning_rate": 5.5348644734119594e-05, + "loss": 4.8382, + "step": 2675 + }, + { + "epoch": 0.16611831895213855, + "grad_norm": 0.6082032259412423, + "learning_rate": 5.536933581626319e-05, + "loss": 4.8228, + "step": 2676 + }, + { + "epoch": 0.16618039605189647, + "grad_norm": 0.41759586635545176, + "learning_rate": 5.539002689840679e-05, + "loss": 4.796, + "step": 2677 + }, + { + "epoch": 0.16624247315165436, + "grad_norm": 0.4198670107463841, + "learning_rate": 5.5410717980550386e-05, + "loss": 4.8149, + "step": 2678 + }, + { + "epoch": 0.16630455025141225, + "grad_norm": 0.436377993938105, + "learning_rate": 5.543140906269398e-05, + "loss": 4.8719, + "step": 2679 + }, + { + "epoch": 0.16636662735117017, + "grad_norm": 0.5274315372441442, + "learning_rate": 5.545210014483758e-05, + "loss": 4.7751, + "step": 2680 + }, + { + "epoch": 0.16642870445092806, + "grad_norm": 0.4534327021318035, + "learning_rate": 5.547279122698117e-05, + "loss": 4.696, + "step": 2681 + }, + { + "epoch": 0.16649078155068595, + "grad_norm": 0.4179879478252552, + "learning_rate": 5.5493482309124766e-05, + "loss": 4.7912, + "step": 2682 + }, + { + "epoch": 0.16655285865044386, + "grad_norm": 0.6179800377356203, + "learning_rate": 5.551417339126837e-05, + "loss": 4.7742, + "step": 2683 + }, + { + "epoch": 0.16661493575020175, + "grad_norm": 0.5036766061254427, + "learning_rate": 5.553486447341196e-05, + "loss": 4.8171, + "step": 2684 + }, + { + "epoch": 0.16667701284995964, + "grad_norm": 0.3344634674369358, + "learning_rate": 5.555555555555556e-05, + "loss": 4.7644, + "step": 2685 + }, + { + "epoch": 0.16673908994971756, + "grad_norm": 0.39115295970874153, + "learning_rate": 5.557624663769916e-05, + "loss": 4.8015, + "step": 2686 + }, + { + "epoch": 0.16680116704947545, + "grad_norm": 0.4175248507282038, + "learning_rate": 5.559693771984275e-05, + "loss": 4.8, + "step": 2687 + }, + { + "epoch": 0.16686324414923334, + "grad_norm": 0.643037069776811, + "learning_rate": 5.5617628801986343e-05, + "loss": 4.7977, + "step": 2688 + }, + { + "epoch": 0.16692532124899126, + "grad_norm": 0.9416304515243794, + "learning_rate": 5.563831988412994e-05, + "loss": 4.9031, + "step": 2689 + }, + { + "epoch": 0.16698739834874915, + "grad_norm": 0.5583519579025269, + "learning_rate": 5.565901096627354e-05, + "loss": 4.7604, + "step": 2690 + }, + { + "epoch": 0.16704947544850704, + "grad_norm": 0.6803199631680696, + "learning_rate": 5.5679702048417135e-05, + "loss": 4.8232, + "step": 2691 + }, + { + "epoch": 0.16711155254826496, + "grad_norm": 0.645901489250603, + "learning_rate": 5.570039313056074e-05, + "loss": 4.6477, + "step": 2692 + }, + { + "epoch": 0.16717362964802285, + "grad_norm": 0.4664225493990382, + "learning_rate": 5.572108421270432e-05, + "loss": 4.7743, + "step": 2693 + }, + { + "epoch": 0.16723570674778074, + "grad_norm": 0.4737467835148549, + "learning_rate": 5.574177529484792e-05, + "loss": 4.8494, + "step": 2694 + }, + { + "epoch": 0.16729778384753866, + "grad_norm": 0.6890960719781462, + "learning_rate": 5.5762466376991516e-05, + "loss": 4.9259, + "step": 2695 + }, + { + "epoch": 0.16735986094729655, + "grad_norm": 0.5322655619760238, + "learning_rate": 5.578315745913512e-05, + "loss": 4.8288, + "step": 2696 + }, + { + "epoch": 0.16742193804705444, + "grad_norm": 0.5255494852513001, + "learning_rate": 5.580384854127871e-05, + "loss": 4.8412, + "step": 2697 + }, + { + "epoch": 0.16748401514681235, + "grad_norm": 0.7526346660272708, + "learning_rate": 5.5824539623422315e-05, + "loss": 4.7756, + "step": 2698 + }, + { + "epoch": 0.16754609224657024, + "grad_norm": 0.5487518800603338, + "learning_rate": 5.5845230705565896e-05, + "loss": 4.894, + "step": 2699 + }, + { + "epoch": 0.16760816934632813, + "grad_norm": 1.1364751076210642, + "learning_rate": 5.58659217877095e-05, + "loss": 4.7997, + "step": 2700 + }, + { + "epoch": 0.16767024644608605, + "grad_norm": 0.6158805723315663, + "learning_rate": 5.588661286985309e-05, + "loss": 4.775, + "step": 2701 + }, + { + "epoch": 0.16773232354584394, + "grad_norm": 0.4395052391516801, + "learning_rate": 5.5907303951996695e-05, + "loss": 4.8553, + "step": 2702 + }, + { + "epoch": 0.16779440064560183, + "grad_norm": 0.7106903401097409, + "learning_rate": 5.592799503414029e-05, + "loss": 4.8224, + "step": 2703 + }, + { + "epoch": 0.16785647774535975, + "grad_norm": 0.44681616442510774, + "learning_rate": 5.5948686116283885e-05, + "loss": 4.7493, + "step": 2704 + }, + { + "epoch": 0.16791855484511764, + "grad_norm": 0.637234274329324, + "learning_rate": 5.596937719842747e-05, + "loss": 4.8554, + "step": 2705 + }, + { + "epoch": 0.16798063194487553, + "grad_norm": 0.646957339699298, + "learning_rate": 5.5990068280571075e-05, + "loss": 4.7305, + "step": 2706 + }, + { + "epoch": 0.16804270904463345, + "grad_norm": 0.6200768164762321, + "learning_rate": 5.601075936271467e-05, + "loss": 4.8045, + "step": 2707 + }, + { + "epoch": 0.16810478614439134, + "grad_norm": 0.4990564622794471, + "learning_rate": 5.603145044485827e-05, + "loss": 4.7833, + "step": 2708 + }, + { + "epoch": 0.16816686324414923, + "grad_norm": 0.5843058693126338, + "learning_rate": 5.605214152700187e-05, + "loss": 4.7402, + "step": 2709 + }, + { + "epoch": 0.16822894034390715, + "grad_norm": 0.47166714543002647, + "learning_rate": 5.607283260914546e-05, + "loss": 4.7324, + "step": 2710 + }, + { + "epoch": 0.16829101744366504, + "grad_norm": 0.6422401397366911, + "learning_rate": 5.609352369128905e-05, + "loss": 4.641, + "step": 2711 + }, + { + "epoch": 0.16835309454342293, + "grad_norm": 0.47011867026906246, + "learning_rate": 5.611421477343265e-05, + "loss": 4.8835, + "step": 2712 + }, + { + "epoch": 0.16841517164318084, + "grad_norm": 0.6832471568344706, + "learning_rate": 5.613490585557625e-05, + "loss": 4.8243, + "step": 2713 + }, + { + "epoch": 0.16847724874293873, + "grad_norm": 0.7149318786661549, + "learning_rate": 5.615559693771984e-05, + "loss": 4.8213, + "step": 2714 + }, + { + "epoch": 0.16853932584269662, + "grad_norm": 0.46809602258121547, + "learning_rate": 5.6176288019863444e-05, + "loss": 4.6766, + "step": 2715 + }, + { + "epoch": 0.16860140294245454, + "grad_norm": 0.5997129965844056, + "learning_rate": 5.619697910200704e-05, + "loss": 4.8271, + "step": 2716 + }, + { + "epoch": 0.16866348004221243, + "grad_norm": 0.6125697485466975, + "learning_rate": 5.621767018415064e-05, + "loss": 4.8084, + "step": 2717 + }, + { + "epoch": 0.16872555714197032, + "grad_norm": 0.604259218054879, + "learning_rate": 5.623836126629422e-05, + "loss": 4.8341, + "step": 2718 + }, + { + "epoch": 0.1687876342417282, + "grad_norm": 0.43380718304995236, + "learning_rate": 5.6259052348437825e-05, + "loss": 4.8333, + "step": 2719 + }, + { + "epoch": 0.16884971134148613, + "grad_norm": 0.4423081290759606, + "learning_rate": 5.627974343058142e-05, + "loss": 4.6948, + "step": 2720 + }, + { + "epoch": 0.16891178844124402, + "grad_norm": 0.45281522948796266, + "learning_rate": 5.630043451272502e-05, + "loss": 4.8185, + "step": 2721 + }, + { + "epoch": 0.1689738655410019, + "grad_norm": 0.5041852666508191, + "learning_rate": 5.632112559486862e-05, + "loss": 4.6918, + "step": 2722 + }, + { + "epoch": 0.16903594264075983, + "grad_norm": 0.40508297040905156, + "learning_rate": 5.634181667701222e-05, + "loss": 4.8232, + "step": 2723 + }, + { + "epoch": 0.16909801974051772, + "grad_norm": 0.45718410576729196, + "learning_rate": 5.63625077591558e-05, + "loss": 4.7662, + "step": 2724 + }, + { + "epoch": 0.1691600968402756, + "grad_norm": 0.5865183938972213, + "learning_rate": 5.63831988412994e-05, + "loss": 4.7103, + "step": 2725 + }, + { + "epoch": 0.16922217394003353, + "grad_norm": 0.45014816004148167, + "learning_rate": 5.6403889923443e-05, + "loss": 4.8676, + "step": 2726 + }, + { + "epoch": 0.16928425103979142, + "grad_norm": 0.5147785116491163, + "learning_rate": 5.64245810055866e-05, + "loss": 4.8247, + "step": 2727 + }, + { + "epoch": 0.1693463281395493, + "grad_norm": 0.4539449770454892, + "learning_rate": 5.6445272087730194e-05, + "loss": 4.7426, + "step": 2728 + }, + { + "epoch": 0.16940840523930722, + "grad_norm": 0.5213557452604446, + "learning_rate": 5.646596316987379e-05, + "loss": 4.7465, + "step": 2729 + }, + { + "epoch": 0.16947048233906511, + "grad_norm": 0.3959670426593298, + "learning_rate": 5.648665425201738e-05, + "loss": 4.763, + "step": 2730 + }, + { + "epoch": 0.169532559438823, + "grad_norm": 0.5630314224004919, + "learning_rate": 5.650734533416098e-05, + "loss": 4.777, + "step": 2731 + }, + { + "epoch": 0.16959463653858092, + "grad_norm": 0.4048162432284472, + "learning_rate": 5.6528036416304574e-05, + "loss": 4.7867, + "step": 2732 + }, + { + "epoch": 0.1696567136383388, + "grad_norm": 0.37481611432204104, + "learning_rate": 5.6548727498448176e-05, + "loss": 4.7379, + "step": 2733 + }, + { + "epoch": 0.1697187907380967, + "grad_norm": 0.4771921188590617, + "learning_rate": 5.656941858059177e-05, + "loss": 4.7749, + "step": 2734 + }, + { + "epoch": 0.16978086783785462, + "grad_norm": 0.462379124671749, + "learning_rate": 5.6590109662735366e-05, + "loss": 4.7179, + "step": 2735 + }, + { + "epoch": 0.1698429449376125, + "grad_norm": 0.36388691282018903, + "learning_rate": 5.6610800744878955e-05, + "loss": 4.7283, + "step": 2736 + }, + { + "epoch": 0.1699050220373704, + "grad_norm": 0.3741971401911687, + "learning_rate": 5.6631491827022556e-05, + "loss": 4.8188, + "step": 2737 + }, + { + "epoch": 0.16996709913712832, + "grad_norm": 0.3611309441393834, + "learning_rate": 5.665218290916615e-05, + "loss": 4.6689, + "step": 2738 + }, + { + "epoch": 0.1700291762368862, + "grad_norm": 0.44874707213060294, + "learning_rate": 5.6672873991309747e-05, + "loss": 4.7621, + "step": 2739 + }, + { + "epoch": 0.1700912533366441, + "grad_norm": 0.4405285709459568, + "learning_rate": 5.669356507345335e-05, + "loss": 4.687, + "step": 2740 + }, + { + "epoch": 0.17015333043640202, + "grad_norm": 0.40853353753270516, + "learning_rate": 5.6714256155596943e-05, + "loss": 4.7569, + "step": 2741 + }, + { + "epoch": 0.1702154075361599, + "grad_norm": 0.4549200249958107, + "learning_rate": 5.673494723774053e-05, + "loss": 4.7344, + "step": 2742 + }, + { + "epoch": 0.1702774846359178, + "grad_norm": 0.54915804466441, + "learning_rate": 5.675563831988413e-05, + "loss": 4.6548, + "step": 2743 + }, + { + "epoch": 0.17033956173567572, + "grad_norm": 0.6146436974142973, + "learning_rate": 5.677632940202773e-05, + "loss": 4.7062, + "step": 2744 + }, + { + "epoch": 0.1704016388354336, + "grad_norm": 0.5309680863899905, + "learning_rate": 5.6797020484171324e-05, + "loss": 4.7292, + "step": 2745 + }, + { + "epoch": 0.1704637159351915, + "grad_norm": 0.41269293593495526, + "learning_rate": 5.6817711566314926e-05, + "loss": 4.653, + "step": 2746 + }, + { + "epoch": 0.1705257930349494, + "grad_norm": 0.3681135876607741, + "learning_rate": 5.683840264845852e-05, + "loss": 4.6028, + "step": 2747 + }, + { + "epoch": 0.1705878701347073, + "grad_norm": 0.420063119243931, + "learning_rate": 5.685909373060211e-05, + "loss": 4.7156, + "step": 2748 + }, + { + "epoch": 0.1706499472344652, + "grad_norm": 0.37408084279454495, + "learning_rate": 5.6879784812745704e-05, + "loss": 4.6417, + "step": 2749 + }, + { + "epoch": 0.1707120243342231, + "grad_norm": 0.3743959966767482, + "learning_rate": 5.6900475894889306e-05, + "loss": 4.6746, + "step": 2750 + }, + { + "epoch": 0.170774101433981, + "grad_norm": 0.46217819307666297, + "learning_rate": 5.69211669770329e-05, + "loss": 4.6706, + "step": 2751 + }, + { + "epoch": 0.1708361785337389, + "grad_norm": 0.5009206104822451, + "learning_rate": 5.69418580591765e-05, + "loss": 4.8242, + "step": 2752 + }, + { + "epoch": 0.1708982556334968, + "grad_norm": 0.4560001022052232, + "learning_rate": 5.69625491413201e-05, + "loss": 4.6945, + "step": 2753 + }, + { + "epoch": 0.1709603327332547, + "grad_norm": 0.5259527439712037, + "learning_rate": 5.6983240223463686e-05, + "loss": 4.7031, + "step": 2754 + }, + { + "epoch": 0.1710224098330126, + "grad_norm": 0.47938146303338536, + "learning_rate": 5.700393130560728e-05, + "loss": 4.6804, + "step": 2755 + }, + { + "epoch": 0.1710844869327705, + "grad_norm": 0.3650990199634623, + "learning_rate": 5.702462238775088e-05, + "loss": 4.6797, + "step": 2756 + }, + { + "epoch": 0.1711465640325284, + "grad_norm": 0.38165450602874573, + "learning_rate": 5.704531346989448e-05, + "loss": 4.5039, + "step": 2757 + }, + { + "epoch": 0.1712086411322863, + "grad_norm": 0.40636029946905616, + "learning_rate": 5.706600455203808e-05, + "loss": 4.8604, + "step": 2758 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 0.4512137851818422, + "learning_rate": 5.7086695634181675e-05, + "loss": 4.7546, + "step": 2759 + }, + { + "epoch": 0.1713327953318021, + "grad_norm": 0.4131657588664601, + "learning_rate": 5.7107386716325263e-05, + "loss": 4.596, + "step": 2760 + }, + { + "epoch": 0.17139487243155999, + "grad_norm": 0.41877732543609375, + "learning_rate": 5.712807779846886e-05, + "loss": 4.6454, + "step": 2761 + }, + { + "epoch": 0.1714569495313179, + "grad_norm": 0.3216932473396515, + "learning_rate": 5.714876888061246e-05, + "loss": 4.7404, + "step": 2762 + }, + { + "epoch": 0.1715190266310758, + "grad_norm": 0.3556143074908755, + "learning_rate": 5.7169459962756055e-05, + "loss": 4.7764, + "step": 2763 + }, + { + "epoch": 0.17158110373083368, + "grad_norm": 0.3868408010524313, + "learning_rate": 5.719015104489965e-05, + "loss": 4.7531, + "step": 2764 + }, + { + "epoch": 0.1716431808305916, + "grad_norm": 0.32154924936306295, + "learning_rate": 5.721084212704325e-05, + "loss": 4.7307, + "step": 2765 + }, + { + "epoch": 0.1717052579303495, + "grad_norm": 0.38967449894610584, + "learning_rate": 5.723153320918684e-05, + "loss": 4.696, + "step": 2766 + }, + { + "epoch": 0.17176733503010738, + "grad_norm": 0.4230222115411627, + "learning_rate": 5.7252224291330436e-05, + "loss": 4.6766, + "step": 2767 + }, + { + "epoch": 0.1718294121298653, + "grad_norm": 0.5502886903151393, + "learning_rate": 5.727291537347403e-05, + "loss": 4.6982, + "step": 2768 + }, + { + "epoch": 0.1718914892296232, + "grad_norm": 0.34353162591594905, + "learning_rate": 5.729360645561763e-05, + "loss": 4.7485, + "step": 2769 + }, + { + "epoch": 0.17195356632938108, + "grad_norm": 0.4332234459557629, + "learning_rate": 5.731429753776123e-05, + "loss": 4.7715, + "step": 2770 + }, + { + "epoch": 0.172015643429139, + "grad_norm": 0.48314578457003704, + "learning_rate": 5.733498861990483e-05, + "loss": 4.7151, + "step": 2771 + }, + { + "epoch": 0.1720777205288969, + "grad_norm": 0.5352286686591002, + "learning_rate": 5.735567970204842e-05, + "loss": 4.7479, + "step": 2772 + }, + { + "epoch": 0.17213979762865478, + "grad_norm": 0.47871215832607905, + "learning_rate": 5.737637078419201e-05, + "loss": 4.656, + "step": 2773 + }, + { + "epoch": 0.1722018747284127, + "grad_norm": 0.5148798442222794, + "learning_rate": 5.739706186633561e-05, + "loss": 4.6392, + "step": 2774 + }, + { + "epoch": 0.1722639518281706, + "grad_norm": 0.4239295451382271, + "learning_rate": 5.741775294847921e-05, + "loss": 4.7233, + "step": 2775 + }, + { + "epoch": 0.17232602892792848, + "grad_norm": 0.4440899346109679, + "learning_rate": 5.7438444030622805e-05, + "loss": 4.6712, + "step": 2776 + }, + { + "epoch": 0.1723881060276864, + "grad_norm": 0.9175206815587056, + "learning_rate": 5.745913511276641e-05, + "loss": 4.712, + "step": 2777 + }, + { + "epoch": 0.17245018312744428, + "grad_norm": 0.6119930319872465, + "learning_rate": 5.747982619490999e-05, + "loss": 4.6436, + "step": 2778 + }, + { + "epoch": 0.17251226022720217, + "grad_norm": 0.5705844021965405, + "learning_rate": 5.750051727705359e-05, + "loss": 4.7046, + "step": 2779 + }, + { + "epoch": 0.1725743373269601, + "grad_norm": 0.8107025364696009, + "learning_rate": 5.7521208359197185e-05, + "loss": 4.7392, + "step": 2780 + }, + { + "epoch": 0.17263641442671798, + "grad_norm": 0.7559895563514346, + "learning_rate": 5.754189944134079e-05, + "loss": 4.7335, + "step": 2781 + }, + { + "epoch": 0.17269849152647587, + "grad_norm": 0.49008421098848576, + "learning_rate": 5.756259052348438e-05, + "loss": 4.7908, + "step": 2782 + }, + { + "epoch": 0.1727605686262338, + "grad_norm": 1.4876491375698946, + "learning_rate": 5.7583281605627984e-05, + "loss": 4.7584, + "step": 2783 + }, + { + "epoch": 0.17282264572599168, + "grad_norm": 1.0656311236342622, + "learning_rate": 5.7603972687771566e-05, + "loss": 4.6065, + "step": 2784 + }, + { + "epoch": 0.17288472282574957, + "grad_norm": 0.460578293573948, + "learning_rate": 5.762466376991517e-05, + "loss": 4.6126, + "step": 2785 + }, + { + "epoch": 0.1729467999255075, + "grad_norm": 1.1108916754734597, + "learning_rate": 5.764535485205876e-05, + "loss": 4.6266, + "step": 2786 + }, + { + "epoch": 0.17300887702526538, + "grad_norm": 0.6631382737552515, + "learning_rate": 5.7666045934202364e-05, + "loss": 4.6989, + "step": 2787 + }, + { + "epoch": 0.17307095412502327, + "grad_norm": 0.6874400676266972, + "learning_rate": 5.768673701634596e-05, + "loss": 4.7022, + "step": 2788 + }, + { + "epoch": 0.1731330312247812, + "grad_norm": 0.5611236210358065, + "learning_rate": 5.7707428098489555e-05, + "loss": 4.7305, + "step": 2789 + }, + { + "epoch": 0.17319510832453908, + "grad_norm": 0.6022647622810269, + "learning_rate": 5.772811918063314e-05, + "loss": 4.685, + "step": 2790 + }, + { + "epoch": 0.17325718542429697, + "grad_norm": 0.5027546698468607, + "learning_rate": 5.7748810262776745e-05, + "loss": 4.6943, + "step": 2791 + }, + { + "epoch": 0.17331926252405488, + "grad_norm": 0.41991541657915, + "learning_rate": 5.776950134492034e-05, + "loss": 4.6597, + "step": 2792 + }, + { + "epoch": 0.17338133962381277, + "grad_norm": 0.4489058690346662, + "learning_rate": 5.779019242706394e-05, + "loss": 4.7444, + "step": 2793 + }, + { + "epoch": 0.17344341672357066, + "grad_norm": 0.5286342231214698, + "learning_rate": 5.781088350920754e-05, + "loss": 4.6084, + "step": 2794 + }, + { + "epoch": 0.17350549382332858, + "grad_norm": 0.7890621978457033, + "learning_rate": 5.783157459135113e-05, + "loss": 4.7263, + "step": 2795 + }, + { + "epoch": 0.17356757092308647, + "grad_norm": 0.9234850772794131, + "learning_rate": 5.785226567349472e-05, + "loss": 4.7899, + "step": 2796 + }, + { + "epoch": 0.17362964802284436, + "grad_norm": 0.6245150114901712, + "learning_rate": 5.787295675563832e-05, + "loss": 4.7416, + "step": 2797 + }, + { + "epoch": 0.17369172512260228, + "grad_norm": 0.6324002606831712, + "learning_rate": 5.789364783778192e-05, + "loss": 4.695, + "step": 2798 + }, + { + "epoch": 0.17375380222236017, + "grad_norm": 1.1151108548092825, + "learning_rate": 5.791433891992551e-05, + "loss": 4.6507, + "step": 2799 + }, + { + "epoch": 0.17381587932211806, + "grad_norm": 0.8666814753905591, + "learning_rate": 5.7935030002069114e-05, + "loss": 4.7582, + "step": 2800 + }, + { + "epoch": 0.17387795642187598, + "grad_norm": 0.7872710334097665, + "learning_rate": 5.795572108421271e-05, + "loss": 4.6582, + "step": 2801 + }, + { + "epoch": 0.17394003352163387, + "grad_norm": 0.7650191186871996, + "learning_rate": 5.797641216635631e-05, + "loss": 4.6359, + "step": 2802 + }, + { + "epoch": 0.17400211062139176, + "grad_norm": 0.661841684426915, + "learning_rate": 5.799710324849989e-05, + "loss": 4.7504, + "step": 2803 + }, + { + "epoch": 0.17406418772114968, + "grad_norm": 0.6230394129769646, + "learning_rate": 5.8017794330643494e-05, + "loss": 4.7002, + "step": 2804 + }, + { + "epoch": 0.17412626482090757, + "grad_norm": 0.6834440666898416, + "learning_rate": 5.803848541278709e-05, + "loss": 4.7058, + "step": 2805 + }, + { + "epoch": 0.17418834192066546, + "grad_norm": 0.5833856071029346, + "learning_rate": 5.805917649493069e-05, + "loss": 4.6736, + "step": 2806 + }, + { + "epoch": 0.17425041902042337, + "grad_norm": 0.5372836021132673, + "learning_rate": 5.8079867577074286e-05, + "loss": 4.8451, + "step": 2807 + }, + { + "epoch": 0.17431249612018127, + "grad_norm": 0.5173965999484182, + "learning_rate": 5.810055865921789e-05, + "loss": 4.7053, + "step": 2808 + }, + { + "epoch": 0.17437457321993916, + "grad_norm": 0.5549432406742384, + "learning_rate": 5.812124974136147e-05, + "loss": 4.6686, + "step": 2809 + }, + { + "epoch": 0.17443665031969707, + "grad_norm": 0.47848839195764464, + "learning_rate": 5.814194082350507e-05, + "loss": 4.6421, + "step": 2810 + }, + { + "epoch": 0.17449872741945496, + "grad_norm": 0.40157027107170723, + "learning_rate": 5.8162631905648667e-05, + "loss": 4.7223, + "step": 2811 + }, + { + "epoch": 0.17456080451921285, + "grad_norm": 0.42896141781407404, + "learning_rate": 5.818332298779227e-05, + "loss": 4.6586, + "step": 2812 + }, + { + "epoch": 0.17462288161897077, + "grad_norm": 0.5633421263159003, + "learning_rate": 5.8204014069935864e-05, + "loss": 4.6257, + "step": 2813 + }, + { + "epoch": 0.17468495871872866, + "grad_norm": 0.5797820957457095, + "learning_rate": 5.822470515207946e-05, + "loss": 4.6015, + "step": 2814 + }, + { + "epoch": 0.17474703581848655, + "grad_norm": 0.3929721126829636, + "learning_rate": 5.824539623422305e-05, + "loss": 4.6087, + "step": 2815 + }, + { + "epoch": 0.17480911291824447, + "grad_norm": 0.427323601368027, + "learning_rate": 5.826608731636665e-05, + "loss": 4.7583, + "step": 2816 + }, + { + "epoch": 0.17487119001800236, + "grad_norm": 0.44203635621720194, + "learning_rate": 5.8286778398510244e-05, + "loss": 4.6451, + "step": 2817 + }, + { + "epoch": 0.17493326711776025, + "grad_norm": 0.4184903250661788, + "learning_rate": 5.8307469480653846e-05, + "loss": 4.6113, + "step": 2818 + }, + { + "epoch": 0.17499534421751817, + "grad_norm": 0.34511023937025037, + "learning_rate": 5.832816056279744e-05, + "loss": 4.6415, + "step": 2819 + }, + { + "epoch": 0.17505742131727606, + "grad_norm": 0.5212736239671577, + "learning_rate": 5.8348851644941036e-05, + "loss": 4.7483, + "step": 2820 + }, + { + "epoch": 0.17511949841703395, + "grad_norm": 0.5246227850659332, + "learning_rate": 5.8369542727084624e-05, + "loss": 4.6845, + "step": 2821 + }, + { + "epoch": 0.17518157551679187, + "grad_norm": 0.39518228957183016, + "learning_rate": 5.8390233809228226e-05, + "loss": 4.6515, + "step": 2822 + }, + { + "epoch": 0.17524365261654976, + "grad_norm": 0.548352806094355, + "learning_rate": 5.841092489137182e-05, + "loss": 4.6076, + "step": 2823 + }, + { + "epoch": 0.17530572971630765, + "grad_norm": 0.49952669391576837, + "learning_rate": 5.8431615973515416e-05, + "loss": 4.6399, + "step": 2824 + }, + { + "epoch": 0.17536780681606556, + "grad_norm": 0.4109331855982579, + "learning_rate": 5.845230705565902e-05, + "loss": 4.7858, + "step": 2825 + }, + { + "epoch": 0.17542988391582345, + "grad_norm": 0.4726722651519078, + "learning_rate": 5.847299813780261e-05, + "loss": 4.5735, + "step": 2826 + }, + { + "epoch": 0.17549196101558134, + "grad_norm": 0.9814104823674268, + "learning_rate": 5.84936892199462e-05, + "loss": 4.6721, + "step": 2827 + }, + { + "epoch": 0.17555403811533926, + "grad_norm": 0.9313768697250658, + "learning_rate": 5.8514380302089796e-05, + "loss": 4.6169, + "step": 2828 + }, + { + "epoch": 0.17561611521509715, + "grad_norm": 0.6541812851562234, + "learning_rate": 5.85350713842334e-05, + "loss": 4.6282, + "step": 2829 + }, + { + "epoch": 0.17567819231485504, + "grad_norm": 0.6818223945892117, + "learning_rate": 5.855576246637699e-05, + "loss": 4.6227, + "step": 2830 + }, + { + "epoch": 0.17574026941461296, + "grad_norm": 0.4020594731434414, + "learning_rate": 5.8576453548520595e-05, + "loss": 4.7955, + "step": 2831 + }, + { + "epoch": 0.17580234651437085, + "grad_norm": 0.556535168747228, + "learning_rate": 5.859714463066419e-05, + "loss": 4.7933, + "step": 2832 + }, + { + "epoch": 0.17586442361412874, + "grad_norm": 0.5536641096063337, + "learning_rate": 5.861783571280778e-05, + "loss": 4.6906, + "step": 2833 + }, + { + "epoch": 0.17592650071388666, + "grad_norm": 1.2801795767939481, + "learning_rate": 5.8638526794951374e-05, + "loss": 4.7336, + "step": 2834 + }, + { + "epoch": 0.17598857781364455, + "grad_norm": 0.8951096309881443, + "learning_rate": 5.8659217877094976e-05, + "loss": 4.6757, + "step": 2835 + }, + { + "epoch": 0.17605065491340244, + "grad_norm": 0.612082287372226, + "learning_rate": 5.867990895923857e-05, + "loss": 4.5671, + "step": 2836 + }, + { + "epoch": 0.17611273201316036, + "grad_norm": 0.6057602703563613, + "learning_rate": 5.870060004138217e-05, + "loss": 4.7131, + "step": 2837 + }, + { + "epoch": 0.17617480911291825, + "grad_norm": 0.5250868362183184, + "learning_rate": 5.872129112352577e-05, + "loss": 4.6231, + "step": 2838 + }, + { + "epoch": 0.17623688621267614, + "grad_norm": 0.4510504323828515, + "learning_rate": 5.8741982205669356e-05, + "loss": 4.6433, + "step": 2839 + }, + { + "epoch": 0.17629896331243405, + "grad_norm": 0.4675613591527361, + "learning_rate": 5.876267328781295e-05, + "loss": 4.7026, + "step": 2840 + }, + { + "epoch": 0.17636104041219194, + "grad_norm": 0.48353409838338274, + "learning_rate": 5.878336436995655e-05, + "loss": 4.7183, + "step": 2841 + }, + { + "epoch": 0.17642311751194983, + "grad_norm": 0.5238811264134967, + "learning_rate": 5.880405545210015e-05, + "loss": 4.7989, + "step": 2842 + }, + { + "epoch": 0.17648519461170775, + "grad_norm": 0.6775337603909474, + "learning_rate": 5.882474653424375e-05, + "loss": 4.6958, + "step": 2843 + }, + { + "epoch": 0.17654727171146564, + "grad_norm": 0.515393494162467, + "learning_rate": 5.8845437616387345e-05, + "loss": 4.6114, + "step": 2844 + }, + { + "epoch": 0.17660934881122353, + "grad_norm": 0.5835098584199548, + "learning_rate": 5.886612869853093e-05, + "loss": 4.5276, + "step": 2845 + }, + { + "epoch": 0.17667142591098145, + "grad_norm": 0.5299436270675322, + "learning_rate": 5.888681978067453e-05, + "loss": 4.4925, + "step": 2846 + }, + { + "epoch": 0.17673350301073934, + "grad_norm": 0.5760129547781592, + "learning_rate": 5.890751086281813e-05, + "loss": 4.7942, + "step": 2847 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 0.49128724068674445, + "learning_rate": 5.8928201944961725e-05, + "loss": 4.5852, + "step": 2848 + }, + { + "epoch": 0.17685765721025515, + "grad_norm": 0.7687395157137865, + "learning_rate": 5.894889302710532e-05, + "loss": 4.6867, + "step": 2849 + }, + { + "epoch": 0.17691973431001304, + "grad_norm": 0.6111045891842434, + "learning_rate": 5.896958410924892e-05, + "loss": 4.6783, + "step": 2850 + }, + { + "epoch": 0.17698181140977093, + "grad_norm": 0.5383048330602754, + "learning_rate": 5.899027519139251e-05, + "loss": 4.5489, + "step": 2851 + }, + { + "epoch": 0.17704388850952885, + "grad_norm": 0.48295600758295526, + "learning_rate": 5.9010966273536105e-05, + "loss": 4.7428, + "step": 2852 + }, + { + "epoch": 0.17710596560928674, + "grad_norm": 0.47750866877091697, + "learning_rate": 5.90316573556797e-05, + "loss": 4.5352, + "step": 2853 + }, + { + "epoch": 0.17716804270904463, + "grad_norm": 0.639461833729104, + "learning_rate": 5.90523484378233e-05, + "loss": 4.636, + "step": 2854 + }, + { + "epoch": 0.17723011980880254, + "grad_norm": 0.4373831422597915, + "learning_rate": 5.90730395199669e-05, + "loss": 4.6833, + "step": 2855 + }, + { + "epoch": 0.17729219690856043, + "grad_norm": 0.4188982174596609, + "learning_rate": 5.90937306021105e-05, + "loss": 4.6249, + "step": 2856 + }, + { + "epoch": 0.17735427400831832, + "grad_norm": 0.5521249005725885, + "learning_rate": 5.911442168425409e-05, + "loss": 4.6617, + "step": 2857 + }, + { + "epoch": 0.17741635110807624, + "grad_norm": 0.5293974552098422, + "learning_rate": 5.913511276639768e-05, + "loss": 4.6735, + "step": 2858 + }, + { + "epoch": 0.17747842820783413, + "grad_norm": 0.45741513424018576, + "learning_rate": 5.915580384854128e-05, + "loss": 4.7012, + "step": 2859 + }, + { + "epoch": 0.17754050530759202, + "grad_norm": 0.5053505418809473, + "learning_rate": 5.917649493068488e-05, + "loss": 4.6523, + "step": 2860 + }, + { + "epoch": 0.17760258240734994, + "grad_norm": 0.4632606154376868, + "learning_rate": 5.9197186012828475e-05, + "loss": 4.6847, + "step": 2861 + }, + { + "epoch": 0.17766465950710783, + "grad_norm": 0.627718117105189, + "learning_rate": 5.9217877094972076e-05, + "loss": 4.6535, + "step": 2862 + }, + { + "epoch": 0.17772673660686572, + "grad_norm": 0.6936385331517069, + "learning_rate": 5.923856817711566e-05, + "loss": 4.5191, + "step": 2863 + }, + { + "epoch": 0.17778881370662364, + "grad_norm": 0.4780138148262709, + "learning_rate": 5.925925925925926e-05, + "loss": 4.633, + "step": 2864 + }, + { + "epoch": 0.17785089080638153, + "grad_norm": 0.42931554474294487, + "learning_rate": 5.9279950341402855e-05, + "loss": 4.6192, + "step": 2865 + }, + { + "epoch": 0.17791296790613942, + "grad_norm": 0.45932685573815296, + "learning_rate": 5.930064142354646e-05, + "loss": 4.655, + "step": 2866 + }, + { + "epoch": 0.17797504500589734, + "grad_norm": 0.5150952652437557, + "learning_rate": 5.932133250569005e-05, + "loss": 4.6713, + "step": 2867 + }, + { + "epoch": 0.17803712210565523, + "grad_norm": 0.5811952821551242, + "learning_rate": 5.9342023587833654e-05, + "loss": 4.6998, + "step": 2868 + }, + { + "epoch": 0.17809919920541312, + "grad_norm": 0.4048545971105198, + "learning_rate": 5.9362714669977235e-05, + "loss": 4.6743, + "step": 2869 + }, + { + "epoch": 0.17816127630517103, + "grad_norm": 1.0438255762389301, + "learning_rate": 5.938340575212084e-05, + "loss": 4.5588, + "step": 2870 + }, + { + "epoch": 0.17822335340492892, + "grad_norm": 0.4964716515760331, + "learning_rate": 5.940409683426443e-05, + "loss": 4.7208, + "step": 2871 + }, + { + "epoch": 0.17828543050468681, + "grad_norm": 0.8092638270788071, + "learning_rate": 5.9424787916408034e-05, + "loss": 4.7335, + "step": 2872 + }, + { + "epoch": 0.17834750760444473, + "grad_norm": 0.8327439886544401, + "learning_rate": 5.944547899855163e-05, + "loss": 4.5186, + "step": 2873 + }, + { + "epoch": 0.17840958470420262, + "grad_norm": 0.688765812707899, + "learning_rate": 5.9466170080695224e-05, + "loss": 4.7577, + "step": 2874 + }, + { + "epoch": 0.1784716618039605, + "grad_norm": 0.6373848970134464, + "learning_rate": 5.948686116283881e-05, + "loss": 4.6479, + "step": 2875 + }, + { + "epoch": 0.17853373890371843, + "grad_norm": 0.5351940058177224, + "learning_rate": 5.9507552244982414e-05, + "loss": 4.5985, + "step": 2876 + }, + { + "epoch": 0.17859581600347632, + "grad_norm": 0.5368504680495431, + "learning_rate": 5.952824332712601e-05, + "loss": 4.6245, + "step": 2877 + }, + { + "epoch": 0.1786578931032342, + "grad_norm": 0.6300053440351744, + "learning_rate": 5.9548934409269604e-05, + "loss": 4.6565, + "step": 2878 + }, + { + "epoch": 0.17871997020299213, + "grad_norm": 0.5920997574756591, + "learning_rate": 5.9569625491413206e-05, + "loss": 4.6369, + "step": 2879 + }, + { + "epoch": 0.17878204730275002, + "grad_norm": 0.43732302355946906, + "learning_rate": 5.95903165735568e-05, + "loss": 4.5803, + "step": 2880 + }, + { + "epoch": 0.1788441244025079, + "grad_norm": 0.6578070696971318, + "learning_rate": 5.961100765570039e-05, + "loss": 4.7411, + "step": 2881 + }, + { + "epoch": 0.17890620150226583, + "grad_norm": 0.5487585872160733, + "learning_rate": 5.963169873784399e-05, + "loss": 4.5567, + "step": 2882 + }, + { + "epoch": 0.17896827860202372, + "grad_norm": 0.5091779397891317, + "learning_rate": 5.965238981998759e-05, + "loss": 4.5306, + "step": 2883 + }, + { + "epoch": 0.1790303557017816, + "grad_norm": 0.5218031119898985, + "learning_rate": 5.967308090213118e-05, + "loss": 4.8182, + "step": 2884 + }, + { + "epoch": 0.17909243280153953, + "grad_norm": 0.6374671843441405, + "learning_rate": 5.9693771984274784e-05, + "loss": 4.6197, + "step": 2885 + }, + { + "epoch": 0.17915450990129742, + "grad_norm": 0.378809198751259, + "learning_rate": 5.971446306641838e-05, + "loss": 4.5533, + "step": 2886 + }, + { + "epoch": 0.1792165870010553, + "grad_norm": 0.6627006253974764, + "learning_rate": 5.973515414856198e-05, + "loss": 4.6759, + "step": 2887 + }, + { + "epoch": 0.17927866410081322, + "grad_norm": 0.6060136619497718, + "learning_rate": 5.975584523070556e-05, + "loss": 4.598, + "step": 2888 + }, + { + "epoch": 0.1793407412005711, + "grad_norm": 0.6855309856779636, + "learning_rate": 5.9776536312849164e-05, + "loss": 4.6823, + "step": 2889 + }, + { + "epoch": 0.179402818300329, + "grad_norm": 0.5218005418484419, + "learning_rate": 5.979722739499276e-05, + "loss": 4.5862, + "step": 2890 + }, + { + "epoch": 0.17946489540008692, + "grad_norm": 0.48954918079430404, + "learning_rate": 5.981791847713636e-05, + "loss": 4.659, + "step": 2891 + }, + { + "epoch": 0.1795269724998448, + "grad_norm": 0.4700560494746074, + "learning_rate": 5.9838609559279956e-05, + "loss": 4.6245, + "step": 2892 + }, + { + "epoch": 0.1795890495996027, + "grad_norm": 0.3648333668507653, + "learning_rate": 5.985930064142356e-05, + "loss": 4.5896, + "step": 2893 + }, + { + "epoch": 0.17965112669936062, + "grad_norm": 0.4853213049439292, + "learning_rate": 5.987999172356714e-05, + "loss": 4.4585, + "step": 2894 + }, + { + "epoch": 0.1797132037991185, + "grad_norm": 0.5730924914509671, + "learning_rate": 5.990068280571074e-05, + "loss": 4.4868, + "step": 2895 + }, + { + "epoch": 0.1797752808988764, + "grad_norm": 0.4807386648108394, + "learning_rate": 5.9921373887854336e-05, + "loss": 4.6836, + "step": 2896 + }, + { + "epoch": 0.17983735799863432, + "grad_norm": 0.36691430256633767, + "learning_rate": 5.994206496999794e-05, + "loss": 4.5328, + "step": 2897 + }, + { + "epoch": 0.1798994350983922, + "grad_norm": 0.4223854367834266, + "learning_rate": 5.996275605214153e-05, + "loss": 4.7202, + "step": 2898 + }, + { + "epoch": 0.1799615121981501, + "grad_norm": 0.3792624421602785, + "learning_rate": 5.998344713428513e-05, + "loss": 4.558, + "step": 2899 + }, + { + "epoch": 0.18002358929790802, + "grad_norm": 0.94503445910645, + "learning_rate": 6.0004138216428716e-05, + "loss": 4.6883, + "step": 2900 + }, + { + "epoch": 0.1800856663976659, + "grad_norm": 1.0179112451080794, + "learning_rate": 6.002482929857232e-05, + "loss": 4.5529, + "step": 2901 + }, + { + "epoch": 0.1801477434974238, + "grad_norm": 0.5755549344507856, + "learning_rate": 6.0045520380715913e-05, + "loss": 4.5945, + "step": 2902 + }, + { + "epoch": 0.1802098205971817, + "grad_norm": 0.5752481329455326, + "learning_rate": 6.006621146285951e-05, + "loss": 4.611, + "step": 2903 + }, + { + "epoch": 0.1802718976969396, + "grad_norm": 0.6322698739164881, + "learning_rate": 6.008690254500311e-05, + "loss": 4.6155, + "step": 2904 + }, + { + "epoch": 0.1803339747966975, + "grad_norm": 0.5293691381695148, + "learning_rate": 6.0107593627146705e-05, + "loss": 4.6894, + "step": 2905 + }, + { + "epoch": 0.18039605189645538, + "grad_norm": 0.6348717938180907, + "learning_rate": 6.0128284709290294e-05, + "loss": 4.6834, + "step": 2906 + }, + { + "epoch": 0.1804581289962133, + "grad_norm": 0.5971298642294781, + "learning_rate": 6.0148975791433896e-05, + "loss": 4.6616, + "step": 2907 + }, + { + "epoch": 0.1805202060959712, + "grad_norm": 0.8445763250385443, + "learning_rate": 6.016966687357749e-05, + "loss": 4.6462, + "step": 2908 + }, + { + "epoch": 0.18058228319572908, + "grad_norm": 0.7714321456806194, + "learning_rate": 6.0190357955721086e-05, + "loss": 4.6077, + "step": 2909 + }, + { + "epoch": 0.180644360295487, + "grad_norm": 1.4051425811940945, + "learning_rate": 6.021104903786469e-05, + "loss": 4.7239, + "step": 2910 + }, + { + "epoch": 0.1807064373952449, + "grad_norm": 0.6539006869900973, + "learning_rate": 6.023174012000828e-05, + "loss": 4.4822, + "step": 2911 + }, + { + "epoch": 0.18076851449500278, + "grad_norm": 0.8309892388827954, + "learning_rate": 6.025243120215187e-05, + "loss": 4.5429, + "step": 2912 + }, + { + "epoch": 0.1808305915947607, + "grad_norm": 0.5083514068340167, + "learning_rate": 6.0273122284295466e-05, + "loss": 4.5286, + "step": 2913 + }, + { + "epoch": 0.1808926686945186, + "grad_norm": 0.6366874827565346, + "learning_rate": 6.029381336643907e-05, + "loss": 4.6349, + "step": 2914 + }, + { + "epoch": 0.18095474579427648, + "grad_norm": 0.5194505040800722, + "learning_rate": 6.031450444858266e-05, + "loss": 4.6129, + "step": 2915 + }, + { + "epoch": 0.1810168228940344, + "grad_norm": 0.4695357730285832, + "learning_rate": 6.0335195530726265e-05, + "loss": 4.6243, + "step": 2916 + }, + { + "epoch": 0.18107889999379229, + "grad_norm": 0.3856814952377338, + "learning_rate": 6.035588661286986e-05, + "loss": 4.6068, + "step": 2917 + }, + { + "epoch": 0.18114097709355018, + "grad_norm": 0.6855627204824415, + "learning_rate": 6.037657769501345e-05, + "loss": 4.606, + "step": 2918 + }, + { + "epoch": 0.1812030541933081, + "grad_norm": 0.5136984466526279, + "learning_rate": 6.039726877715704e-05, + "loss": 4.5575, + "step": 2919 + }, + { + "epoch": 0.18126513129306598, + "grad_norm": 0.7183497599901579, + "learning_rate": 6.0417959859300645e-05, + "loss": 4.4941, + "step": 2920 + }, + { + "epoch": 0.18132720839282387, + "grad_norm": 0.6091998900169829, + "learning_rate": 6.043865094144424e-05, + "loss": 4.6223, + "step": 2921 + }, + { + "epoch": 0.1813892854925818, + "grad_norm": 0.4989093717339419, + "learning_rate": 6.045934202358784e-05, + "loss": 4.6039, + "step": 2922 + }, + { + "epoch": 0.18145136259233968, + "grad_norm": 0.4224024230813026, + "learning_rate": 6.048003310573144e-05, + "loss": 4.5926, + "step": 2923 + }, + { + "epoch": 0.18151343969209757, + "grad_norm": 0.388248289463134, + "learning_rate": 6.0500724187875025e-05, + "loss": 4.659, + "step": 2924 + }, + { + "epoch": 0.1815755167918555, + "grad_norm": 0.4477585653979922, + "learning_rate": 6.052141527001862e-05, + "loss": 4.5834, + "step": 2925 + }, + { + "epoch": 0.18163759389161338, + "grad_norm": 0.40152889437640993, + "learning_rate": 6.054210635216222e-05, + "loss": 4.6589, + "step": 2926 + }, + { + "epoch": 0.18169967099137127, + "grad_norm": 0.5716616206414458, + "learning_rate": 6.056279743430582e-05, + "loss": 4.6702, + "step": 2927 + }, + { + "epoch": 0.1817617480911292, + "grad_norm": 0.35668980822326823, + "learning_rate": 6.058348851644942e-05, + "loss": 4.5495, + "step": 2928 + }, + { + "epoch": 0.18182382519088708, + "grad_norm": 0.6057056824802237, + "learning_rate": 6.0604179598593014e-05, + "loss": 4.5174, + "step": 2929 + }, + { + "epoch": 0.18188590229064497, + "grad_norm": 0.5349435035341049, + "learning_rate": 6.06248706807366e-05, + "loss": 4.5759, + "step": 2930 + }, + { + "epoch": 0.1819479793904029, + "grad_norm": 0.3932722607003179, + "learning_rate": 6.06455617628802e-05, + "loss": 4.5417, + "step": 2931 + }, + { + "epoch": 0.18201005649016078, + "grad_norm": 0.4342676373086056, + "learning_rate": 6.06662528450238e-05, + "loss": 4.6357, + "step": 2932 + }, + { + "epoch": 0.18207213358991867, + "grad_norm": 0.4184264057218546, + "learning_rate": 6.0686943927167395e-05, + "loss": 4.5421, + "step": 2933 + }, + { + "epoch": 0.18213421068967658, + "grad_norm": 0.4005801984877135, + "learning_rate": 6.070763500931099e-05, + "loss": 4.5896, + "step": 2934 + }, + { + "epoch": 0.18219628778943447, + "grad_norm": 0.44854822287202684, + "learning_rate": 6.072832609145459e-05, + "loss": 4.6846, + "step": 2935 + }, + { + "epoch": 0.18225836488919236, + "grad_norm": 0.5966288893033601, + "learning_rate": 6.074901717359818e-05, + "loss": 4.5204, + "step": 2936 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 0.34648611547813524, + "learning_rate": 6.0769708255741775e-05, + "loss": 4.5578, + "step": 2937 + }, + { + "epoch": 0.18238251908870817, + "grad_norm": 0.47810512763219676, + "learning_rate": 6.079039933788537e-05, + "loss": 4.6716, + "step": 2938 + }, + { + "epoch": 0.18244459618846606, + "grad_norm": 0.4774807697438282, + "learning_rate": 6.081109042002897e-05, + "loss": 4.535, + "step": 2939 + }, + { + "epoch": 0.18250667328822398, + "grad_norm": 0.5545889538547734, + "learning_rate": 6.083178150217257e-05, + "loss": 4.4076, + "step": 2940 + }, + { + "epoch": 0.18256875038798187, + "grad_norm": 0.7064314980351913, + "learning_rate": 6.085247258431617e-05, + "loss": 4.5308, + "step": 2941 + }, + { + "epoch": 0.18263082748773976, + "grad_norm": 0.44532494778552234, + "learning_rate": 6.087316366645976e-05, + "loss": 4.5828, + "step": 2942 + }, + { + "epoch": 0.18269290458749768, + "grad_norm": 0.9352391320183198, + "learning_rate": 6.089385474860335e-05, + "loss": 4.4701, + "step": 2943 + }, + { + "epoch": 0.18275498168725557, + "grad_norm": 0.7033028483639178, + "learning_rate": 6.091454583074695e-05, + "loss": 4.655, + "step": 2944 + }, + { + "epoch": 0.18281705878701346, + "grad_norm": 1.1268680745365307, + "learning_rate": 6.093523691289055e-05, + "loss": 4.5218, + "step": 2945 + }, + { + "epoch": 0.18287913588677138, + "grad_norm": 0.8024205246309437, + "learning_rate": 6.0955927995034144e-05, + "loss": 4.6761, + "step": 2946 + }, + { + "epoch": 0.18294121298652927, + "grad_norm": 1.3238831041253876, + "learning_rate": 6.0976619077177746e-05, + "loss": 4.6183, + "step": 2947 + }, + { + "epoch": 0.18300329008628716, + "grad_norm": 1.3905236710910398, + "learning_rate": 6.099731015932133e-05, + "loss": 4.7738, + "step": 2948 + }, + { + "epoch": 0.18306536718604507, + "grad_norm": 1.3408675661255254, + "learning_rate": 6.101800124146493e-05, + "loss": 4.6263, + "step": 2949 + }, + { + "epoch": 0.18312744428580296, + "grad_norm": 0.8915982199331512, + "learning_rate": 6.103869232360852e-05, + "loss": 4.6979, + "step": 2950 + }, + { + "epoch": 0.18318952138556086, + "grad_norm": 0.7051184898932066, + "learning_rate": 6.105938340575212e-05, + "loss": 4.5661, + "step": 2951 + }, + { + "epoch": 0.18325159848531877, + "grad_norm": 0.574784914506845, + "learning_rate": 6.108007448789573e-05, + "loss": 4.4977, + "step": 2952 + }, + { + "epoch": 0.18331367558507666, + "grad_norm": 0.804056642513237, + "learning_rate": 6.110076557003932e-05, + "loss": 4.6385, + "step": 2953 + }, + { + "epoch": 0.18337575268483455, + "grad_norm": 0.524309257960212, + "learning_rate": 6.11214566521829e-05, + "loss": 4.6251, + "step": 2954 + }, + { + "epoch": 0.18343782978459247, + "grad_norm": 0.5130086177656075, + "learning_rate": 6.11421477343265e-05, + "loss": 4.5471, + "step": 2955 + }, + { + "epoch": 0.18349990688435036, + "grad_norm": 0.5956667029080684, + "learning_rate": 6.116283881647011e-05, + "loss": 4.7063, + "step": 2956 + }, + { + "epoch": 0.18356198398410825, + "grad_norm": 0.7746419294771706, + "learning_rate": 6.11835298986137e-05, + "loss": 4.6574, + "step": 2957 + }, + { + "epoch": 0.18362406108386617, + "grad_norm": 0.8254132767334416, + "learning_rate": 6.12042209807573e-05, + "loss": 4.6785, + "step": 2958 + }, + { + "epoch": 0.18368613818362406, + "grad_norm": 1.1013107012019678, + "learning_rate": 6.12249120629009e-05, + "loss": 4.5809, + "step": 2959 + }, + { + "epoch": 0.18374821528338195, + "grad_norm": 0.4956514684817956, + "learning_rate": 6.124560314504449e-05, + "loss": 4.5599, + "step": 2960 + }, + { + "epoch": 0.18381029238313987, + "grad_norm": 0.7138911920313249, + "learning_rate": 6.126629422718808e-05, + "loss": 4.6768, + "step": 2961 + }, + { + "epoch": 0.18387236948289776, + "grad_norm": 1.0029266565382937, + "learning_rate": 6.128698530933168e-05, + "loss": 4.6829, + "step": 2962 + }, + { + "epoch": 0.18393444658265565, + "grad_norm": 0.859888027686388, + "learning_rate": 6.130767639147527e-05, + "loss": 4.6793, + "step": 2963 + }, + { + "epoch": 0.18399652368241357, + "grad_norm": 0.6926609658894572, + "learning_rate": 6.132836747361887e-05, + "loss": 4.691, + "step": 2964 + }, + { + "epoch": 0.18405860078217146, + "grad_norm": 0.9485211732961479, + "learning_rate": 6.134905855576248e-05, + "loss": 4.7216, + "step": 2965 + }, + { + "epoch": 0.18412067788192935, + "grad_norm": 0.7648178439092737, + "learning_rate": 6.136974963790606e-05, + "loss": 4.5327, + "step": 2966 + }, + { + "epoch": 0.18418275498168726, + "grad_norm": 0.6200385336668481, + "learning_rate": 6.139044072004965e-05, + "loss": 4.5967, + "step": 2967 + }, + { + "epoch": 0.18424483208144515, + "grad_norm": 0.7330448197197055, + "learning_rate": 6.141113180219325e-05, + "loss": 4.6345, + "step": 2968 + }, + { + "epoch": 0.18430690918120304, + "grad_norm": 0.7251998675198442, + "learning_rate": 6.143182288433686e-05, + "loss": 4.5409, + "step": 2969 + }, + { + "epoch": 0.18436898628096096, + "grad_norm": 0.6369242892223659, + "learning_rate": 6.145251396648045e-05, + "loss": 4.6827, + "step": 2970 + }, + { + "epoch": 0.18443106338071885, + "grad_norm": 0.7069589356592434, + "learning_rate": 6.147320504862405e-05, + "loss": 4.6272, + "step": 2971 + }, + { + "epoch": 0.18449314048047674, + "grad_norm": 0.4477898469933329, + "learning_rate": 6.149389613076764e-05, + "loss": 4.5915, + "step": 2972 + }, + { + "epoch": 0.18455521758023466, + "grad_norm": 0.6430279395112832, + "learning_rate": 6.151458721291124e-05, + "loss": 4.5822, + "step": 2973 + }, + { + "epoch": 0.18461729467999255, + "grad_norm": 0.6702679999459024, + "learning_rate": 6.153527829505483e-05, + "loss": 4.6712, + "step": 2974 + }, + { + "epoch": 0.18467937177975044, + "grad_norm": 0.5837275728250941, + "learning_rate": 6.155596937719843e-05, + "loss": 4.6457, + "step": 2975 + }, + { + "epoch": 0.18474144887950836, + "grad_norm": 0.4902845106286952, + "learning_rate": 6.157666045934202e-05, + "loss": 4.5006, + "step": 2976 + }, + { + "epoch": 0.18480352597926625, + "grad_norm": 1.131570859054878, + "learning_rate": 6.159735154148563e-05, + "loss": 4.5823, + "step": 2977 + }, + { + "epoch": 0.18486560307902414, + "grad_norm": 1.0191123949719727, + "learning_rate": 6.161804262362923e-05, + "loss": 4.6052, + "step": 2978 + }, + { + "epoch": 0.18492768017878206, + "grad_norm": 0.5360353586255937, + "learning_rate": 6.163873370577281e-05, + "loss": 4.4749, + "step": 2979 + }, + { + "epoch": 0.18498975727853995, + "grad_norm": 0.47135241035710285, + "learning_rate": 6.16594247879164e-05, + "loss": 4.6134, + "step": 2980 + }, + { + "epoch": 0.18505183437829784, + "grad_norm": 0.6596599722376868, + "learning_rate": 6.168011587006001e-05, + "loss": 4.6043, + "step": 2981 + }, + { + "epoch": 0.18511391147805575, + "grad_norm": 0.6026996417121588, + "learning_rate": 6.170080695220361e-05, + "loss": 4.555, + "step": 2982 + }, + { + "epoch": 0.18517598857781364, + "grad_norm": 0.4666660556476998, + "learning_rate": 6.17214980343472e-05, + "loss": 4.4873, + "step": 2983 + }, + { + "epoch": 0.18523806567757153, + "grad_norm": 0.6808621352331837, + "learning_rate": 6.17421891164908e-05, + "loss": 4.5573, + "step": 2984 + }, + { + "epoch": 0.18530014277732945, + "grad_norm": 0.5867201600648315, + "learning_rate": 6.176288019863439e-05, + "loss": 4.6002, + "step": 2985 + }, + { + "epoch": 0.18536221987708734, + "grad_norm": 0.5115861211227382, + "learning_rate": 6.178357128077799e-05, + "loss": 4.5834, + "step": 2986 + }, + { + "epoch": 0.18542429697684523, + "grad_norm": 0.5089109424893579, + "learning_rate": 6.180426236292158e-05, + "loss": 4.4763, + "step": 2987 + }, + { + "epoch": 0.18548637407660315, + "grad_norm": 0.6196064685699977, + "learning_rate": 6.182495344506518e-05, + "loss": 4.5988, + "step": 2988 + }, + { + "epoch": 0.18554845117636104, + "grad_norm": 0.8856913121931279, + "learning_rate": 6.184564452720877e-05, + "loss": 4.5611, + "step": 2989 + }, + { + "epoch": 0.18561052827611893, + "grad_norm": 0.5360822810138938, + "learning_rate": 6.186633560935238e-05, + "loss": 4.3891, + "step": 2990 + }, + { + "epoch": 0.18567260537587685, + "grad_norm": 0.469677722318986, + "learning_rate": 6.188702669149596e-05, + "loss": 4.5057, + "step": 2991 + }, + { + "epoch": 0.18573468247563474, + "grad_norm": 0.47032113621788935, + "learning_rate": 6.190771777363956e-05, + "loss": 4.6427, + "step": 2992 + }, + { + "epoch": 0.18579675957539263, + "grad_norm": 0.5892026081974677, + "learning_rate": 6.192840885578315e-05, + "loss": 4.5347, + "step": 2993 + }, + { + "epoch": 0.18585883667515055, + "grad_norm": 0.3508677824037044, + "learning_rate": 6.194909993792676e-05, + "loss": 4.5568, + "step": 2994 + }, + { + "epoch": 0.18592091377490844, + "grad_norm": 0.5034813383161366, + "learning_rate": 6.196979102007036e-05, + "loss": 4.5862, + "step": 2995 + }, + { + "epoch": 0.18598299087466633, + "grad_norm": 0.5418638060138424, + "learning_rate": 6.199048210221395e-05, + "loss": 4.5219, + "step": 2996 + }, + { + "epoch": 0.18604506797442424, + "grad_norm": 0.3238869083369251, + "learning_rate": 6.201117318435755e-05, + "loss": 4.569, + "step": 2997 + }, + { + "epoch": 0.18610714507418213, + "grad_norm": 0.3353096062036735, + "learning_rate": 6.203186426650114e-05, + "loss": 4.468, + "step": 2998 + }, + { + "epoch": 0.18616922217394002, + "grad_norm": 0.3599596036988756, + "learning_rate": 6.205255534864474e-05, + "loss": 4.5342, + "step": 2999 + }, + { + "epoch": 0.18623129927369794, + "grad_norm": 0.34197356970199516, + "learning_rate": 6.207324643078833e-05, + "loss": 4.5078, + "step": 3000 + }, + { + "epoch": 0.18629337637345583, + "grad_norm": 0.505581282245195, + "learning_rate": 6.209393751293193e-05, + "loss": 4.6341, + "step": 3001 + }, + { + "epoch": 0.18635545347321372, + "grad_norm": 0.38079616315500514, + "learning_rate": 6.211462859507554e-05, + "loss": 4.5081, + "step": 3002 + }, + { + "epoch": 0.18641753057297164, + "grad_norm": 0.397839209206083, + "learning_rate": 6.213531967721912e-05, + "loss": 4.49, + "step": 3003 + }, + { + "epoch": 0.18647960767272953, + "grad_norm": 0.37596229427065797, + "learning_rate": 6.215601075936271e-05, + "loss": 4.5653, + "step": 3004 + }, + { + "epoch": 0.18654168477248742, + "grad_norm": 0.6458177615923876, + "learning_rate": 6.217670184150631e-05, + "loss": 4.4936, + "step": 3005 + }, + { + "epoch": 0.18660376187224534, + "grad_norm": 0.40569744679844316, + "learning_rate": 6.219739292364992e-05, + "loss": 4.5165, + "step": 3006 + }, + { + "epoch": 0.18666583897200323, + "grad_norm": 0.9082856827815907, + "learning_rate": 6.221808400579351e-05, + "loss": 4.5489, + "step": 3007 + }, + { + "epoch": 0.18672791607176112, + "grad_norm": 0.7685213772487793, + "learning_rate": 6.22387750879371e-05, + "loss": 4.5467, + "step": 3008 + }, + { + "epoch": 0.18678999317151904, + "grad_norm": 0.46739377281132083, + "learning_rate": 6.225946617008069e-05, + "loss": 4.5431, + "step": 3009 + }, + { + "epoch": 0.18685207027127693, + "grad_norm": 0.4624656142520235, + "learning_rate": 6.22801572522243e-05, + "loss": 4.526, + "step": 3010 + }, + { + "epoch": 0.18691414737103482, + "grad_norm": 0.46329713829547403, + "learning_rate": 6.230084833436789e-05, + "loss": 4.5148, + "step": 3011 + }, + { + "epoch": 0.18697622447079273, + "grad_norm": 0.3884455534697812, + "learning_rate": 6.232153941651149e-05, + "loss": 4.5532, + "step": 3012 + }, + { + "epoch": 0.18703830157055062, + "grad_norm": 0.4694033519954323, + "learning_rate": 6.234223049865508e-05, + "loss": 4.6472, + "step": 3013 + }, + { + "epoch": 0.18710037867030851, + "grad_norm": 0.3569030748671474, + "learning_rate": 6.236292158079868e-05, + "loss": 4.4182, + "step": 3014 + }, + { + "epoch": 0.18716245577006643, + "grad_norm": 0.513718229457736, + "learning_rate": 6.238361266294227e-05, + "loss": 4.4203, + "step": 3015 + }, + { + "epoch": 0.18722453286982432, + "grad_norm": 0.44228757797390167, + "learning_rate": 6.240430374508587e-05, + "loss": 4.5271, + "step": 3016 + }, + { + "epoch": 0.1872866099695822, + "grad_norm": 0.4842625764386459, + "learning_rate": 6.242499482722946e-05, + "loss": 4.5403, + "step": 3017 + }, + { + "epoch": 0.18734868706934013, + "grad_norm": 0.586159778395416, + "learning_rate": 6.244568590937306e-05, + "loss": 4.5666, + "step": 3018 + }, + { + "epoch": 0.18741076416909802, + "grad_norm": 0.8671578067129212, + "learning_rate": 6.246637699151667e-05, + "loss": 4.6084, + "step": 3019 + }, + { + "epoch": 0.1874728412688559, + "grad_norm": 0.5833569221041423, + "learning_rate": 6.248706807366026e-05, + "loss": 4.5448, + "step": 3020 + }, + { + "epoch": 0.18753491836861383, + "grad_norm": 0.5054306511177319, + "learning_rate": 6.250775915580384e-05, + "loss": 4.5321, + "step": 3021 + }, + { + "epoch": 0.18759699546837172, + "grad_norm": 0.43331800748548494, + "learning_rate": 6.252845023794745e-05, + "loss": 4.4929, + "step": 3022 + }, + { + "epoch": 0.1876590725681296, + "grad_norm": 0.5015572200033763, + "learning_rate": 6.254914132009105e-05, + "loss": 4.6333, + "step": 3023 + }, + { + "epoch": 0.18772114966788753, + "grad_norm": 0.36514026020839363, + "learning_rate": 6.256983240223464e-05, + "loss": 4.5328, + "step": 3024 + }, + { + "epoch": 0.18778322676764542, + "grad_norm": 1.1270043743255127, + "learning_rate": 6.259052348437824e-05, + "loss": 4.582, + "step": 3025 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 1.1752978373522966, + "learning_rate": 6.261121456652183e-05, + "loss": 4.5024, + "step": 3026 + }, + { + "epoch": 0.18790738096716122, + "grad_norm": 0.5150916491132782, + "learning_rate": 6.263190564866543e-05, + "loss": 4.4596, + "step": 3027 + }, + { + "epoch": 0.18796945806691912, + "grad_norm": 0.4282196636378114, + "learning_rate": 6.265259673080902e-05, + "loss": 4.633, + "step": 3028 + }, + { + "epoch": 0.188031535166677, + "grad_norm": 0.44518609511595447, + "learning_rate": 6.267328781295262e-05, + "loss": 4.6234, + "step": 3029 + }, + { + "epoch": 0.18809361226643492, + "grad_norm": 0.33631036824527477, + "learning_rate": 6.269397889509621e-05, + "loss": 4.4931, + "step": 3030 + }, + { + "epoch": 0.1881556893661928, + "grad_norm": 0.4832886388135538, + "learning_rate": 6.271466997723982e-05, + "loss": 4.5719, + "step": 3031 + }, + { + "epoch": 0.1882177664659507, + "grad_norm": 0.39944230331901676, + "learning_rate": 6.273536105938342e-05, + "loss": 4.6458, + "step": 3032 + }, + { + "epoch": 0.18827984356570862, + "grad_norm": 0.5109148189631766, + "learning_rate": 6.2756052141527e-05, + "loss": 4.5421, + "step": 3033 + }, + { + "epoch": 0.1883419206654665, + "grad_norm": 0.5930064715825158, + "learning_rate": 6.277674322367059e-05, + "loss": 4.6054, + "step": 3034 + }, + { + "epoch": 0.1884039977652244, + "grad_norm": 0.388444939693242, + "learning_rate": 6.27974343058142e-05, + "loss": 4.5963, + "step": 3035 + }, + { + "epoch": 0.18846607486498232, + "grad_norm": 0.38718692299197155, + "learning_rate": 6.28181253879578e-05, + "loss": 4.4909, + "step": 3036 + }, + { + "epoch": 0.1885281519647402, + "grad_norm": 0.3590943443048426, + "learning_rate": 6.283881647010139e-05, + "loss": 4.5213, + "step": 3037 + }, + { + "epoch": 0.1885902290644981, + "grad_norm": 0.37385523427727735, + "learning_rate": 6.285950755224499e-05, + "loss": 4.5088, + "step": 3038 + }, + { + "epoch": 0.18865230616425602, + "grad_norm": 0.5351902192187143, + "learning_rate": 6.288019863438858e-05, + "loss": 4.5269, + "step": 3039 + }, + { + "epoch": 0.1887143832640139, + "grad_norm": 0.45228699771199665, + "learning_rate": 6.290088971653218e-05, + "loss": 4.5355, + "step": 3040 + }, + { + "epoch": 0.1887764603637718, + "grad_norm": 0.38533172014970146, + "learning_rate": 6.292158079867577e-05, + "loss": 4.4586, + "step": 3041 + }, + { + "epoch": 0.18883853746352972, + "grad_norm": 0.3858707262104647, + "learning_rate": 6.294227188081937e-05, + "loss": 4.4864, + "step": 3042 + }, + { + "epoch": 0.1889006145632876, + "grad_norm": 0.37615142323007805, + "learning_rate": 6.296296296296296e-05, + "loss": 4.5434, + "step": 3043 + }, + { + "epoch": 0.1889626916630455, + "grad_norm": 0.3620037875504225, + "learning_rate": 6.298365404510657e-05, + "loss": 4.4331, + "step": 3044 + }, + { + "epoch": 0.1890247687628034, + "grad_norm": 0.357898873311348, + "learning_rate": 6.300434512725015e-05, + "loss": 4.3816, + "step": 3045 + }, + { + "epoch": 0.1890868458625613, + "grad_norm": 0.5512038699460619, + "learning_rate": 6.302503620939375e-05, + "loss": 4.4257, + "step": 3046 + }, + { + "epoch": 0.1891489229623192, + "grad_norm": 0.36657574602862986, + "learning_rate": 6.304572729153736e-05, + "loss": 4.4891, + "step": 3047 + }, + { + "epoch": 0.1892110000620771, + "grad_norm": 0.4410874050709667, + "learning_rate": 6.306641837368095e-05, + "loss": 4.4749, + "step": 3048 + }, + { + "epoch": 0.189273077161835, + "grad_norm": 0.47361939401397174, + "learning_rate": 6.308710945582455e-05, + "loss": 4.5827, + "step": 3049 + }, + { + "epoch": 0.1893351542615929, + "grad_norm": 0.3395907348087543, + "learning_rate": 6.310780053796814e-05, + "loss": 4.5119, + "step": 3050 + }, + { + "epoch": 0.1893972313613508, + "grad_norm": 0.39190237996093424, + "learning_rate": 6.312849162011174e-05, + "loss": 4.5375, + "step": 3051 + }, + { + "epoch": 0.1894593084611087, + "grad_norm": 0.693748493880278, + "learning_rate": 6.314918270225533e-05, + "loss": 4.5533, + "step": 3052 + }, + { + "epoch": 0.1895213855608666, + "grad_norm": 0.715200449848887, + "learning_rate": 6.316987378439893e-05, + "loss": 4.6078, + "step": 3053 + }, + { + "epoch": 0.1895834626606245, + "grad_norm": 0.5003908834821523, + "learning_rate": 6.319056486654252e-05, + "loss": 4.5192, + "step": 3054 + }, + { + "epoch": 0.1896455397603824, + "grad_norm": 0.45931647440423246, + "learning_rate": 6.321125594868612e-05, + "loss": 4.5869, + "step": 3055 + }, + { + "epoch": 0.1897076168601403, + "grad_norm": 0.34537735368849587, + "learning_rate": 6.323194703082972e-05, + "loss": 4.5617, + "step": 3056 + }, + { + "epoch": 0.1897696939598982, + "grad_norm": 0.8109375896711061, + "learning_rate": 6.325263811297332e-05, + "loss": 4.5385, + "step": 3057 + }, + { + "epoch": 0.1898317710596561, + "grad_norm": 0.7529723950654236, + "learning_rate": 6.32733291951169e-05, + "loss": 4.4943, + "step": 3058 + }, + { + "epoch": 0.18989384815941399, + "grad_norm": 0.4798676056367948, + "learning_rate": 6.32940202772605e-05, + "loss": 4.5099, + "step": 3059 + }, + { + "epoch": 0.1899559252591719, + "grad_norm": 0.5320408768420088, + "learning_rate": 6.33147113594041e-05, + "loss": 4.3857, + "step": 3060 + }, + { + "epoch": 0.1900180023589298, + "grad_norm": 0.3348969935114784, + "learning_rate": 6.33354024415477e-05, + "loss": 4.5402, + "step": 3061 + }, + { + "epoch": 0.19008007945868768, + "grad_norm": 0.44559464802040544, + "learning_rate": 6.33560935236913e-05, + "loss": 4.4675, + "step": 3062 + }, + { + "epoch": 0.1901421565584456, + "grad_norm": 0.32342831649274073, + "learning_rate": 6.337678460583489e-05, + "loss": 4.4287, + "step": 3063 + }, + { + "epoch": 0.1902042336582035, + "grad_norm": 0.3956798038219254, + "learning_rate": 6.339747568797849e-05, + "loss": 4.4492, + "step": 3064 + }, + { + "epoch": 0.19026631075796138, + "grad_norm": 0.530791084438757, + "learning_rate": 6.341816677012208e-05, + "loss": 4.5653, + "step": 3065 + }, + { + "epoch": 0.1903283878577193, + "grad_norm": 0.422262720032756, + "learning_rate": 6.343885785226568e-05, + "loss": 4.6086, + "step": 3066 + }, + { + "epoch": 0.1903904649574772, + "grad_norm": 0.6963829629936147, + "learning_rate": 6.345954893440927e-05, + "loss": 4.4105, + "step": 3067 + }, + { + "epoch": 0.19045254205723508, + "grad_norm": 0.4834662804053833, + "learning_rate": 6.348024001655287e-05, + "loss": 4.513, + "step": 3068 + }, + { + "epoch": 0.190514619156993, + "grad_norm": 0.5274638286500888, + "learning_rate": 6.350093109869647e-05, + "loss": 4.3987, + "step": 3069 + }, + { + "epoch": 0.1905766962567509, + "grad_norm": 0.5262774389185719, + "learning_rate": 6.352162218084006e-05, + "loss": 4.4469, + "step": 3070 + }, + { + "epoch": 0.19063877335650878, + "grad_norm": 0.5127063277741534, + "learning_rate": 6.354231326298365e-05, + "loss": 4.5671, + "step": 3071 + }, + { + "epoch": 0.1907008504562667, + "grad_norm": 0.6576147647605705, + "learning_rate": 6.356300434512726e-05, + "loss": 4.4949, + "step": 3072 + }, + { + "epoch": 0.1907629275560246, + "grad_norm": 0.4180296915850407, + "learning_rate": 6.358369542727085e-05, + "loss": 4.4425, + "step": 3073 + }, + { + "epoch": 0.19082500465578248, + "grad_norm": 0.42075458732524995, + "learning_rate": 6.360438650941445e-05, + "loss": 4.4273, + "step": 3074 + }, + { + "epoch": 0.1908870817555404, + "grad_norm": 0.4117032440855976, + "learning_rate": 6.362507759155804e-05, + "loss": 4.4961, + "step": 3075 + }, + { + "epoch": 0.19094915885529828, + "grad_norm": 0.4262617007800254, + "learning_rate": 6.364576867370164e-05, + "loss": 4.5651, + "step": 3076 + }, + { + "epoch": 0.19101123595505617, + "grad_norm": 0.4186988585828042, + "learning_rate": 6.366645975584523e-05, + "loss": 4.3659, + "step": 3077 + }, + { + "epoch": 0.1910733130548141, + "grad_norm": 0.4714124882341881, + "learning_rate": 6.368715083798883e-05, + "loss": 4.4889, + "step": 3078 + }, + { + "epoch": 0.19113539015457198, + "grad_norm": 0.6848865600267577, + "learning_rate": 6.370784192013242e-05, + "loss": 4.5416, + "step": 3079 + }, + { + "epoch": 0.19119746725432987, + "grad_norm": 0.39969399489433904, + "learning_rate": 6.372853300227602e-05, + "loss": 4.5449, + "step": 3080 + }, + { + "epoch": 0.1912595443540878, + "grad_norm": 0.4687578200641642, + "learning_rate": 6.374922408441963e-05, + "loss": 4.5234, + "step": 3081 + }, + { + "epoch": 0.19132162145384568, + "grad_norm": 0.6922381509543103, + "learning_rate": 6.376991516656321e-05, + "loss": 4.4524, + "step": 3082 + }, + { + "epoch": 0.19138369855360357, + "grad_norm": 0.5615664789040824, + "learning_rate": 6.37906062487068e-05, + "loss": 4.5316, + "step": 3083 + }, + { + "epoch": 0.1914457756533615, + "grad_norm": 0.7246671998017054, + "learning_rate": 6.38112973308504e-05, + "loss": 4.5092, + "step": 3084 + }, + { + "epoch": 0.19150785275311938, + "grad_norm": 0.47865726165355754, + "learning_rate": 6.383198841299401e-05, + "loss": 4.4943, + "step": 3085 + }, + { + "epoch": 0.19156992985287727, + "grad_norm": 0.49048948299437095, + "learning_rate": 6.38526794951376e-05, + "loss": 4.5432, + "step": 3086 + }, + { + "epoch": 0.1916320069526352, + "grad_norm": 0.5820449213048058, + "learning_rate": 6.38733705772812e-05, + "loss": 4.4407, + "step": 3087 + }, + { + "epoch": 0.19169408405239308, + "grad_norm": 0.35775033002580925, + "learning_rate": 6.389406165942478e-05, + "loss": 4.57, + "step": 3088 + }, + { + "epoch": 0.19175616115215097, + "grad_norm": 0.4686636686298141, + "learning_rate": 6.391475274156839e-05, + "loss": 4.3879, + "step": 3089 + }, + { + "epoch": 0.19181823825190888, + "grad_norm": 0.4619042877388891, + "learning_rate": 6.393544382371198e-05, + "loss": 4.3885, + "step": 3090 + }, + { + "epoch": 0.19188031535166677, + "grad_norm": 0.5081044458264977, + "learning_rate": 6.395613490585558e-05, + "loss": 4.3882, + "step": 3091 + }, + { + "epoch": 0.19194239245142466, + "grad_norm": 0.3681440069670574, + "learning_rate": 6.397682598799917e-05, + "loss": 4.47, + "step": 3092 + }, + { + "epoch": 0.19200446955118255, + "grad_norm": 0.4512837919331776, + "learning_rate": 6.399751707014277e-05, + "loss": 4.4281, + "step": 3093 + }, + { + "epoch": 0.19206654665094047, + "grad_norm": 0.5194130956768875, + "learning_rate": 6.401820815228636e-05, + "loss": 4.5411, + "step": 3094 + }, + { + "epoch": 0.19212862375069836, + "grad_norm": 0.3808035393892018, + "learning_rate": 6.403889923442996e-05, + "loss": 4.452, + "step": 3095 + }, + { + "epoch": 0.19219070085045625, + "grad_norm": 0.41374837007806586, + "learning_rate": 6.405959031657355e-05, + "loss": 4.47, + "step": 3096 + }, + { + "epoch": 0.19225277795021417, + "grad_norm": 0.504442033626567, + "learning_rate": 6.408028139871716e-05, + "loss": 4.5293, + "step": 3097 + }, + { + "epoch": 0.19231485504997206, + "grad_norm": 0.4444901754110965, + "learning_rate": 6.410097248086076e-05, + "loss": 4.5211, + "step": 3098 + }, + { + "epoch": 0.19237693214972995, + "grad_norm": 0.5113715821429129, + "learning_rate": 6.412166356300435e-05, + "loss": 4.4207, + "step": 3099 + }, + { + "epoch": 0.19243900924948787, + "grad_norm": 0.41512267224245875, + "learning_rate": 6.414235464514794e-05, + "loss": 4.4169, + "step": 3100 + }, + { + "epoch": 0.19250108634924576, + "grad_norm": 0.46763027285978537, + "learning_rate": 6.416304572729154e-05, + "loss": 4.4857, + "step": 3101 + }, + { + "epoch": 0.19256316344900365, + "grad_norm": 0.3934310264448721, + "learning_rate": 6.418373680943514e-05, + "loss": 4.416, + "step": 3102 + }, + { + "epoch": 0.19262524054876157, + "grad_norm": 0.40488383858417754, + "learning_rate": 6.420442789157873e-05, + "loss": 4.5064, + "step": 3103 + }, + { + "epoch": 0.19268731764851946, + "grad_norm": 0.4478581930036525, + "learning_rate": 6.422511897372233e-05, + "loss": 4.5013, + "step": 3104 + }, + { + "epoch": 0.19274939474827735, + "grad_norm": 0.3271286232484904, + "learning_rate": 6.424581005586592e-05, + "loss": 4.4649, + "step": 3105 + }, + { + "epoch": 0.19281147184803527, + "grad_norm": 0.3578238181443887, + "learning_rate": 6.426650113800952e-05, + "loss": 4.5172, + "step": 3106 + }, + { + "epoch": 0.19287354894779316, + "grad_norm": 0.41847261023704774, + "learning_rate": 6.428719222015311e-05, + "loss": 4.5427, + "step": 3107 + }, + { + "epoch": 0.19293562604755105, + "grad_norm": 0.4838752167813824, + "learning_rate": 6.430788330229671e-05, + "loss": 4.4108, + "step": 3108 + }, + { + "epoch": 0.19299770314730896, + "grad_norm": 0.6205332163988827, + "learning_rate": 6.43285743844403e-05, + "loss": 4.438, + "step": 3109 + }, + { + "epoch": 0.19305978024706685, + "grad_norm": 0.6182413881460247, + "learning_rate": 6.434926546658391e-05, + "loss": 4.4922, + "step": 3110 + }, + { + "epoch": 0.19312185734682474, + "grad_norm": 0.4141749283365066, + "learning_rate": 6.436995654872751e-05, + "loss": 4.4412, + "step": 3111 + }, + { + "epoch": 0.19318393444658266, + "grad_norm": 0.4414946712278157, + "learning_rate": 6.439064763087109e-05, + "loss": 4.4991, + "step": 3112 + }, + { + "epoch": 0.19324601154634055, + "grad_norm": 0.4097477831742727, + "learning_rate": 6.441133871301468e-05, + "loss": 4.5619, + "step": 3113 + }, + { + "epoch": 0.19330808864609844, + "grad_norm": 0.38969675400306875, + "learning_rate": 6.443202979515829e-05, + "loss": 4.3093, + "step": 3114 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 0.4760165448589193, + "learning_rate": 6.445272087730189e-05, + "loss": 4.522, + "step": 3115 + }, + { + "epoch": 0.19343224284561425, + "grad_norm": 0.3560721128952932, + "learning_rate": 6.447341195944548e-05, + "loss": 4.4159, + "step": 3116 + }, + { + "epoch": 0.19349431994537214, + "grad_norm": 0.5130474225688003, + "learning_rate": 6.449410304158908e-05, + "loss": 4.5778, + "step": 3117 + }, + { + "epoch": 0.19355639704513006, + "grad_norm": 0.39308103925110877, + "learning_rate": 6.451479412373267e-05, + "loss": 4.4646, + "step": 3118 + }, + { + "epoch": 0.19361847414488795, + "grad_norm": 0.46393420889693193, + "learning_rate": 6.453548520587627e-05, + "loss": 4.457, + "step": 3119 + }, + { + "epoch": 0.19368055124464584, + "grad_norm": 0.4798460235591528, + "learning_rate": 6.455617628801986e-05, + "loss": 4.4587, + "step": 3120 + }, + { + "epoch": 0.19374262834440376, + "grad_norm": 0.4025614436561715, + "learning_rate": 6.457686737016346e-05, + "loss": 4.3988, + "step": 3121 + }, + { + "epoch": 0.19380470544416165, + "grad_norm": 0.39481653153651497, + "learning_rate": 6.459755845230707e-05, + "loss": 4.5264, + "step": 3122 + }, + { + "epoch": 0.19386678254391954, + "grad_norm": 0.5522761787834018, + "learning_rate": 6.461824953445066e-05, + "loss": 4.3624, + "step": 3123 + }, + { + "epoch": 0.19392885964367745, + "grad_norm": 0.36683219601479267, + "learning_rate": 6.463894061659424e-05, + "loss": 4.4677, + "step": 3124 + }, + { + "epoch": 0.19399093674343534, + "grad_norm": 0.34233159336598484, + "learning_rate": 6.465963169873784e-05, + "loss": 4.4553, + "step": 3125 + }, + { + "epoch": 0.19405301384319323, + "grad_norm": 0.384254154344189, + "learning_rate": 6.468032278088145e-05, + "loss": 4.3701, + "step": 3126 + }, + { + "epoch": 0.19411509094295115, + "grad_norm": 0.376447922144473, + "learning_rate": 6.470101386302504e-05, + "loss": 4.4419, + "step": 3127 + }, + { + "epoch": 0.19417716804270904, + "grad_norm": 0.3328210860922341, + "learning_rate": 6.472170494516864e-05, + "loss": 4.4329, + "step": 3128 + }, + { + "epoch": 0.19423924514246693, + "grad_norm": 0.7086721946758482, + "learning_rate": 6.474239602731223e-05, + "loss": 4.5074, + "step": 3129 + }, + { + "epoch": 0.19430132224222485, + "grad_norm": 0.5946169095983547, + "learning_rate": 6.476308710945583e-05, + "loss": 4.3914, + "step": 3130 + }, + { + "epoch": 0.19436339934198274, + "grad_norm": 0.4588847275370284, + "learning_rate": 6.478377819159942e-05, + "loss": 4.5314, + "step": 3131 + }, + { + "epoch": 0.19442547644174063, + "grad_norm": 0.48776504696347, + "learning_rate": 6.480446927374302e-05, + "loss": 4.4928, + "step": 3132 + }, + { + "epoch": 0.19448755354149855, + "grad_norm": 0.5598366255097926, + "learning_rate": 6.482516035588661e-05, + "loss": 4.6023, + "step": 3133 + }, + { + "epoch": 0.19454963064125644, + "grad_norm": 0.45426183353855, + "learning_rate": 6.484585143803021e-05, + "loss": 4.524, + "step": 3134 + }, + { + "epoch": 0.19461170774101433, + "grad_norm": 0.4037558210915827, + "learning_rate": 6.486654252017382e-05, + "loss": 4.4671, + "step": 3135 + }, + { + "epoch": 0.19467378484077225, + "grad_norm": 0.41194970531552627, + "learning_rate": 6.48872336023174e-05, + "loss": 4.4979, + "step": 3136 + }, + { + "epoch": 0.19473586194053014, + "grad_norm": 0.37322344770352095, + "learning_rate": 6.4907924684461e-05, + "loss": 4.4168, + "step": 3137 + }, + { + "epoch": 0.19479793904028803, + "grad_norm": 0.46918399111684567, + "learning_rate": 6.492861576660459e-05, + "loss": 4.3929, + "step": 3138 + }, + { + "epoch": 0.19486001614004594, + "grad_norm": 0.5000707159102861, + "learning_rate": 6.49493068487482e-05, + "loss": 4.4712, + "step": 3139 + }, + { + "epoch": 0.19492209323980383, + "grad_norm": 0.4507851202152392, + "learning_rate": 6.496999793089179e-05, + "loss": 4.3598, + "step": 3140 + }, + { + "epoch": 0.19498417033956172, + "grad_norm": 0.6182370978866754, + "learning_rate": 6.499068901303539e-05, + "loss": 4.4832, + "step": 3141 + }, + { + "epoch": 0.19504624743931964, + "grad_norm": 0.6559843933659366, + "learning_rate": 6.501138009517898e-05, + "loss": 4.4506, + "step": 3142 + }, + { + "epoch": 0.19510832453907753, + "grad_norm": 0.49867422945691015, + "learning_rate": 6.503207117732258e-05, + "loss": 4.4751, + "step": 3143 + }, + { + "epoch": 0.19517040163883542, + "grad_norm": 0.5304254134936046, + "learning_rate": 6.505276225946617e-05, + "loss": 4.4786, + "step": 3144 + }, + { + "epoch": 0.19523247873859334, + "grad_norm": 0.46647089984567314, + "learning_rate": 6.507345334160977e-05, + "loss": 4.4467, + "step": 3145 + }, + { + "epoch": 0.19529455583835123, + "grad_norm": 0.5771952447128963, + "learning_rate": 6.509414442375336e-05, + "loss": 4.496, + "step": 3146 + }, + { + "epoch": 0.19535663293810912, + "grad_norm": 0.4038322284982241, + "learning_rate": 6.511483550589697e-05, + "loss": 4.3748, + "step": 3147 + }, + { + "epoch": 0.19541871003786704, + "grad_norm": 0.5972689202506536, + "learning_rate": 6.513552658804057e-05, + "loss": 4.4423, + "step": 3148 + }, + { + "epoch": 0.19548078713762493, + "grad_norm": 0.42130413992283094, + "learning_rate": 6.515621767018415e-05, + "loss": 4.436, + "step": 3149 + }, + { + "epoch": 0.19554286423738282, + "grad_norm": 0.6001653671863234, + "learning_rate": 6.517690875232774e-05, + "loss": 4.3788, + "step": 3150 + }, + { + "epoch": 0.19560494133714074, + "grad_norm": 0.5061445279394717, + "learning_rate": 6.519759983447135e-05, + "loss": 4.4438, + "step": 3151 + }, + { + "epoch": 0.19566701843689863, + "grad_norm": 0.6679202056556208, + "learning_rate": 6.521829091661495e-05, + "loss": 4.3836, + "step": 3152 + }, + { + "epoch": 0.19572909553665652, + "grad_norm": 0.5017883386528765, + "learning_rate": 6.523898199875854e-05, + "loss": 4.4026, + "step": 3153 + }, + { + "epoch": 0.19579117263641443, + "grad_norm": 0.3542637785370178, + "learning_rate": 6.525967308090214e-05, + "loss": 4.4195, + "step": 3154 + }, + { + "epoch": 0.19585324973617232, + "grad_norm": 0.7501785939883018, + "learning_rate": 6.528036416304573e-05, + "loss": 4.5457, + "step": 3155 + }, + { + "epoch": 0.19591532683593021, + "grad_norm": 0.6094250780705123, + "learning_rate": 6.530105524518933e-05, + "loss": 4.4208, + "step": 3156 + }, + { + "epoch": 0.19597740393568813, + "grad_norm": 0.9998019737628608, + "learning_rate": 6.532174632733292e-05, + "loss": 4.4563, + "step": 3157 + }, + { + "epoch": 0.19603948103544602, + "grad_norm": 0.6881049485710967, + "learning_rate": 6.534243740947652e-05, + "loss": 4.4623, + "step": 3158 + }, + { + "epoch": 0.1961015581352039, + "grad_norm": 0.6193079917759798, + "learning_rate": 6.536312849162011e-05, + "loss": 4.519, + "step": 3159 + }, + { + "epoch": 0.19616363523496183, + "grad_norm": 0.5746298894584976, + "learning_rate": 6.538381957376372e-05, + "loss": 4.4572, + "step": 3160 + }, + { + "epoch": 0.19622571233471972, + "grad_norm": 0.47727258342166895, + "learning_rate": 6.54045106559073e-05, + "loss": 4.4324, + "step": 3161 + }, + { + "epoch": 0.1962877894344776, + "grad_norm": 0.5637339461973712, + "learning_rate": 6.54252017380509e-05, + "loss": 4.521, + "step": 3162 + }, + { + "epoch": 0.19634986653423553, + "grad_norm": 0.6861923960705546, + "learning_rate": 6.544589282019449e-05, + "loss": 4.5142, + "step": 3163 + }, + { + "epoch": 0.19641194363399342, + "grad_norm": 0.42162333152072173, + "learning_rate": 6.54665839023381e-05, + "loss": 4.426, + "step": 3164 + }, + { + "epoch": 0.1964740207337513, + "grad_norm": 0.3729013888285894, + "learning_rate": 6.54872749844817e-05, + "loss": 4.3685, + "step": 3165 + }, + { + "epoch": 0.19653609783350923, + "grad_norm": 0.40597479008053083, + "learning_rate": 6.550796606662529e-05, + "loss": 4.3848, + "step": 3166 + }, + { + "epoch": 0.19659817493326712, + "grad_norm": 0.414995868903591, + "learning_rate": 6.552865714876887e-05, + "loss": 4.5351, + "step": 3167 + }, + { + "epoch": 0.196660252033025, + "grad_norm": 0.43788717372327096, + "learning_rate": 6.554934823091248e-05, + "loss": 4.4449, + "step": 3168 + }, + { + "epoch": 0.19672232913278292, + "grad_norm": 0.6417317251152731, + "learning_rate": 6.557003931305608e-05, + "loss": 4.4653, + "step": 3169 + }, + { + "epoch": 0.19678440623254081, + "grad_norm": 0.4910417117379251, + "learning_rate": 6.559073039519967e-05, + "loss": 4.4578, + "step": 3170 + }, + { + "epoch": 0.1968464833322987, + "grad_norm": 0.3860167479908892, + "learning_rate": 6.561142147734327e-05, + "loss": 4.4913, + "step": 3171 + }, + { + "epoch": 0.19690856043205662, + "grad_norm": 0.95237777686725, + "learning_rate": 6.563211255948688e-05, + "loss": 4.4024, + "step": 3172 + }, + { + "epoch": 0.1969706375318145, + "grad_norm": 0.8892394548444547, + "learning_rate": 6.565280364163046e-05, + "loss": 4.3956, + "step": 3173 + }, + { + "epoch": 0.1970327146315724, + "grad_norm": 0.42838575678138413, + "learning_rate": 6.567349472377405e-05, + "loss": 4.3411, + "step": 3174 + }, + { + "epoch": 0.19709479173133032, + "grad_norm": 0.5470681258201819, + "learning_rate": 6.569418580591765e-05, + "loss": 4.4555, + "step": 3175 + }, + { + "epoch": 0.1971568688310882, + "grad_norm": 0.7623892547355561, + "learning_rate": 6.571487688806126e-05, + "loss": 4.5262, + "step": 3176 + }, + { + "epoch": 0.1972189459308461, + "grad_norm": 0.9051772378219819, + "learning_rate": 6.573556797020485e-05, + "loss": 4.4671, + "step": 3177 + }, + { + "epoch": 0.19728102303060402, + "grad_norm": 1.1242532901083808, + "learning_rate": 6.575625905234845e-05, + "loss": 4.5389, + "step": 3178 + }, + { + "epoch": 0.1973431001303619, + "grad_norm": 0.9302399540589154, + "learning_rate": 6.577695013449203e-05, + "loss": 4.468, + "step": 3179 + }, + { + "epoch": 0.1974051772301198, + "grad_norm": 0.6881132858346443, + "learning_rate": 6.579764121663564e-05, + "loss": 4.4565, + "step": 3180 + }, + { + "epoch": 0.19746725432987772, + "grad_norm": 0.7062792282470511, + "learning_rate": 6.581833229877923e-05, + "loss": 4.5492, + "step": 3181 + }, + { + "epoch": 0.1975293314296356, + "grad_norm": 0.5844257179731355, + "learning_rate": 6.583902338092283e-05, + "loss": 4.3713, + "step": 3182 + }, + { + "epoch": 0.1975914085293935, + "grad_norm": 0.8730112704328421, + "learning_rate": 6.585971446306642e-05, + "loss": 4.3855, + "step": 3183 + }, + { + "epoch": 0.19765348562915142, + "grad_norm": 1.0071089898785541, + "learning_rate": 6.588040554521002e-05, + "loss": 4.5282, + "step": 3184 + }, + { + "epoch": 0.1977155627289093, + "grad_norm": 0.5799638526438826, + "learning_rate": 6.590109662735361e-05, + "loss": 4.3301, + "step": 3185 + }, + { + "epoch": 0.1977776398286672, + "grad_norm": 0.5195848537699161, + "learning_rate": 6.59217877094972e-05, + "loss": 4.3993, + "step": 3186 + }, + { + "epoch": 0.1978397169284251, + "grad_norm": 0.9340869765841221, + "learning_rate": 6.59424787916408e-05, + "loss": 4.4379, + "step": 3187 + }, + { + "epoch": 0.197901794028183, + "grad_norm": 0.6108610666370887, + "learning_rate": 6.59631698737844e-05, + "loss": 4.4649, + "step": 3188 + }, + { + "epoch": 0.1979638711279409, + "grad_norm": 0.779468450564789, + "learning_rate": 6.5983860955928e-05, + "loss": 4.4504, + "step": 3189 + }, + { + "epoch": 0.1980259482276988, + "grad_norm": 0.5394606847323137, + "learning_rate": 6.60045520380716e-05, + "loss": 4.4045, + "step": 3190 + }, + { + "epoch": 0.1980880253274567, + "grad_norm": 0.5758547928736112, + "learning_rate": 6.602524312021518e-05, + "loss": 4.4653, + "step": 3191 + }, + { + "epoch": 0.1981501024272146, + "grad_norm": 0.48157661922285394, + "learning_rate": 6.604593420235878e-05, + "loss": 4.347, + "step": 3192 + }, + { + "epoch": 0.1982121795269725, + "grad_norm": 0.49435558845229505, + "learning_rate": 6.606662528450239e-05, + "loss": 4.3561, + "step": 3193 + }, + { + "epoch": 0.1982742566267304, + "grad_norm": 0.45767674955989135, + "learning_rate": 6.608731636664598e-05, + "loss": 4.4347, + "step": 3194 + }, + { + "epoch": 0.1983363337264883, + "grad_norm": 0.3203554124697328, + "learning_rate": 6.610800744878958e-05, + "loss": 4.3205, + "step": 3195 + }, + { + "epoch": 0.1983984108262462, + "grad_norm": 0.4182444581133786, + "learning_rate": 6.612869853093317e-05, + "loss": 4.4844, + "step": 3196 + }, + { + "epoch": 0.1984604879260041, + "grad_norm": 0.3968706926727844, + "learning_rate": 6.614938961307677e-05, + "loss": 4.4513, + "step": 3197 + }, + { + "epoch": 0.198522565025762, + "grad_norm": 0.38601188406167086, + "learning_rate": 6.617008069522036e-05, + "loss": 4.3649, + "step": 3198 + }, + { + "epoch": 0.1985846421255199, + "grad_norm": 0.30225099920134485, + "learning_rate": 6.619077177736396e-05, + "loss": 4.2572, + "step": 3199 + }, + { + "epoch": 0.1986467192252778, + "grad_norm": 0.4695800584505705, + "learning_rate": 6.621146285950755e-05, + "loss": 4.4131, + "step": 3200 + }, + { + "epoch": 0.19870879632503569, + "grad_norm": 0.42131816354976087, + "learning_rate": 6.623215394165116e-05, + "loss": 4.4461, + "step": 3201 + }, + { + "epoch": 0.1987708734247936, + "grad_norm": 0.37270291434126823, + "learning_rate": 6.625284502379475e-05, + "loss": 4.4122, + "step": 3202 + }, + { + "epoch": 0.1988329505245515, + "grad_norm": 0.5781078078316945, + "learning_rate": 6.627353610593834e-05, + "loss": 4.3586, + "step": 3203 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 0.4212425577655223, + "learning_rate": 6.629422718808193e-05, + "loss": 4.4965, + "step": 3204 + }, + { + "epoch": 0.1989571047240673, + "grad_norm": 0.39285498198930535, + "learning_rate": 6.631491827022554e-05, + "loss": 4.397, + "step": 3205 + }, + { + "epoch": 0.1990191818238252, + "grad_norm": 0.33695690601331646, + "learning_rate": 6.633560935236914e-05, + "loss": 4.3345, + "step": 3206 + }, + { + "epoch": 0.19908125892358308, + "grad_norm": 0.5581036264784637, + "learning_rate": 6.635630043451273e-05, + "loss": 4.3804, + "step": 3207 + }, + { + "epoch": 0.199143336023341, + "grad_norm": 0.32602485038526313, + "learning_rate": 6.637699151665633e-05, + "loss": 4.269, + "step": 3208 + }, + { + "epoch": 0.1992054131230989, + "grad_norm": 0.3753233945664467, + "learning_rate": 6.639768259879992e-05, + "loss": 4.4128, + "step": 3209 + }, + { + "epoch": 0.19926749022285678, + "grad_norm": 0.6249462104539024, + "learning_rate": 6.641837368094352e-05, + "loss": 4.4688, + "step": 3210 + }, + { + "epoch": 0.1993295673226147, + "grad_norm": 0.6567043366576104, + "learning_rate": 6.643906476308711e-05, + "loss": 4.4501, + "step": 3211 + }, + { + "epoch": 0.1993916444223726, + "grad_norm": 0.4367968153403413, + "learning_rate": 6.64597558452307e-05, + "loss": 4.3347, + "step": 3212 + }, + { + "epoch": 0.19945372152213048, + "grad_norm": 0.5077729723291639, + "learning_rate": 6.64804469273743e-05, + "loss": 4.3516, + "step": 3213 + }, + { + "epoch": 0.1995157986218884, + "grad_norm": 0.4388846896000185, + "learning_rate": 6.650113800951791e-05, + "loss": 4.4472, + "step": 3214 + }, + { + "epoch": 0.1995778757216463, + "grad_norm": 0.5002895947085132, + "learning_rate": 6.652182909166149e-05, + "loss": 4.4572, + "step": 3215 + }, + { + "epoch": 0.19963995282140418, + "grad_norm": 0.3490900732810961, + "learning_rate": 6.654252017380509e-05, + "loss": 4.3256, + "step": 3216 + }, + { + "epoch": 0.1997020299211621, + "grad_norm": 0.44549879560832245, + "learning_rate": 6.656321125594868e-05, + "loss": 4.3118, + "step": 3217 + }, + { + "epoch": 0.19976410702091998, + "grad_norm": 0.3481621543843118, + "learning_rate": 6.658390233809229e-05, + "loss": 4.4462, + "step": 3218 + }, + { + "epoch": 0.19982618412067787, + "grad_norm": 0.5780368170628163, + "learning_rate": 6.660459342023588e-05, + "loss": 4.4518, + "step": 3219 + }, + { + "epoch": 0.1998882612204358, + "grad_norm": 0.4374215702874086, + "learning_rate": 6.662528450237948e-05, + "loss": 4.4305, + "step": 3220 + }, + { + "epoch": 0.19995033832019368, + "grad_norm": 0.4076135932241134, + "learning_rate": 6.664597558452307e-05, + "loss": 4.4164, + "step": 3221 + }, + { + "epoch": 0.20001241541995157, + "grad_norm": 0.4519780870037748, + "learning_rate": 6.666666666666667e-05, + "loss": 4.4725, + "step": 3222 + }, + { + "epoch": 0.2000744925197095, + "grad_norm": 0.3869938498064994, + "learning_rate": 6.668735774881026e-05, + "loss": 4.3168, + "step": 3223 + }, + { + "epoch": 0.20013656961946738, + "grad_norm": 0.44603393073443337, + "learning_rate": 6.670804883095386e-05, + "loss": 4.5048, + "step": 3224 + }, + { + "epoch": 0.20019864671922527, + "grad_norm": 0.32415168712070414, + "learning_rate": 6.672873991309746e-05, + "loss": 4.4586, + "step": 3225 + }, + { + "epoch": 0.2002607238189832, + "grad_norm": 0.5726340608344181, + "learning_rate": 6.674943099524106e-05, + "loss": 4.3771, + "step": 3226 + }, + { + "epoch": 0.20032280091874108, + "grad_norm": 0.41028009997103604, + "learning_rate": 6.677012207738466e-05, + "loss": 4.4516, + "step": 3227 + }, + { + "epoch": 0.20038487801849897, + "grad_norm": 0.297562217878125, + "learning_rate": 6.679081315952824e-05, + "loss": 4.4265, + "step": 3228 + }, + { + "epoch": 0.2004469551182569, + "grad_norm": 0.4188295217703149, + "learning_rate": 6.681150424167184e-05, + "loss": 4.3485, + "step": 3229 + }, + { + "epoch": 0.20050903221801478, + "grad_norm": 0.4006960750442345, + "learning_rate": 6.683219532381544e-05, + "loss": 4.2953, + "step": 3230 + }, + { + "epoch": 0.20057110931777267, + "grad_norm": 0.706964668806115, + "learning_rate": 6.685288640595904e-05, + "loss": 4.3488, + "step": 3231 + }, + { + "epoch": 0.20063318641753058, + "grad_norm": 0.6488009893265176, + "learning_rate": 6.687357748810263e-05, + "loss": 4.424, + "step": 3232 + }, + { + "epoch": 0.20069526351728847, + "grad_norm": 0.48875710760251023, + "learning_rate": 6.689426857024623e-05, + "loss": 4.4828, + "step": 3233 + }, + { + "epoch": 0.20075734061704636, + "grad_norm": 0.5589105201410898, + "learning_rate": 6.691495965238982e-05, + "loss": 4.5041, + "step": 3234 + }, + { + "epoch": 0.20081941771680428, + "grad_norm": 0.5116438774486642, + "learning_rate": 6.693565073453342e-05, + "loss": 4.3969, + "step": 3235 + }, + { + "epoch": 0.20088149481656217, + "grad_norm": 0.4807879155452876, + "learning_rate": 6.695634181667701e-05, + "loss": 4.4462, + "step": 3236 + }, + { + "epoch": 0.20094357191632006, + "grad_norm": 0.7688291415903716, + "learning_rate": 6.697703289882061e-05, + "loss": 4.2826, + "step": 3237 + }, + { + "epoch": 0.20100564901607798, + "grad_norm": 0.682440411513003, + "learning_rate": 6.69977239809642e-05, + "loss": 4.4037, + "step": 3238 + }, + { + "epoch": 0.20106772611583587, + "grad_norm": 1.2642489358280071, + "learning_rate": 6.701841506310781e-05, + "loss": 4.422, + "step": 3239 + }, + { + "epoch": 0.20112980321559376, + "grad_norm": 0.7102859361869834, + "learning_rate": 6.70391061452514e-05, + "loss": 4.3413, + "step": 3240 + }, + { + "epoch": 0.20119188031535168, + "grad_norm": 0.6049440140474455, + "learning_rate": 6.705979722739499e-05, + "loss": 4.352, + "step": 3241 + }, + { + "epoch": 0.20125395741510957, + "grad_norm": 0.8385534265929592, + "learning_rate": 6.708048830953858e-05, + "loss": 4.3802, + "step": 3242 + }, + { + "epoch": 0.20131603451486746, + "grad_norm": 0.6924553794119167, + "learning_rate": 6.71011793916822e-05, + "loss": 4.3211, + "step": 3243 + }, + { + "epoch": 0.20137811161462538, + "grad_norm": 0.5351851518163606, + "learning_rate": 6.712187047382579e-05, + "loss": 4.3943, + "step": 3244 + }, + { + "epoch": 0.20144018871438327, + "grad_norm": 0.677394000942617, + "learning_rate": 6.714256155596938e-05, + "loss": 4.3649, + "step": 3245 + }, + { + "epoch": 0.20150226581414116, + "grad_norm": 0.9901142536567898, + "learning_rate": 6.716325263811298e-05, + "loss": 4.45, + "step": 3246 + }, + { + "epoch": 0.20156434291389907, + "grad_norm": 1.2723357226111391, + "learning_rate": 6.718394372025657e-05, + "loss": 4.5315, + "step": 3247 + }, + { + "epoch": 0.20162642001365697, + "grad_norm": 0.6491352830979923, + "learning_rate": 6.720463480240017e-05, + "loss": 4.4541, + "step": 3248 + }, + { + "epoch": 0.20168849711341486, + "grad_norm": 0.9350892542500964, + "learning_rate": 6.722532588454376e-05, + "loss": 4.4067, + "step": 3249 + }, + { + "epoch": 0.20175057421317277, + "grad_norm": 0.8039246433774836, + "learning_rate": 6.724601696668736e-05, + "loss": 4.4662, + "step": 3250 + }, + { + "epoch": 0.20181265131293066, + "grad_norm": 0.886857547064062, + "learning_rate": 6.726670804883097e-05, + "loss": 4.4797, + "step": 3251 + }, + { + "epoch": 0.20187472841268855, + "grad_norm": 0.5599804452053436, + "learning_rate": 6.728739913097455e-05, + "loss": 4.428, + "step": 3252 + }, + { + "epoch": 0.20193680551244647, + "grad_norm": 0.6974411003648182, + "learning_rate": 6.730809021311814e-05, + "loss": 4.4176, + "step": 3253 + }, + { + "epoch": 0.20199888261220436, + "grad_norm": 1.4307880439026275, + "learning_rate": 6.732878129526174e-05, + "loss": 4.5549, + "step": 3254 + }, + { + "epoch": 0.20206095971196225, + "grad_norm": 1.0985894088648496, + "learning_rate": 6.734947237740535e-05, + "loss": 4.5054, + "step": 3255 + }, + { + "epoch": 0.20212303681172017, + "grad_norm": 0.7343633529064134, + "learning_rate": 6.737016345954894e-05, + "loss": 4.5025, + "step": 3256 + }, + { + "epoch": 0.20218511391147806, + "grad_norm": 0.8879926160578411, + "learning_rate": 6.739085454169254e-05, + "loss": 4.401, + "step": 3257 + }, + { + "epoch": 0.20224719101123595, + "grad_norm": 0.6367232901756505, + "learning_rate": 6.741154562383612e-05, + "loss": 4.5072, + "step": 3258 + }, + { + "epoch": 0.20230926811099387, + "grad_norm": 0.6611531647517663, + "learning_rate": 6.743223670597973e-05, + "loss": 4.427, + "step": 3259 + }, + { + "epoch": 0.20237134521075176, + "grad_norm": 1.140635137643573, + "learning_rate": 6.745292778812332e-05, + "loss": 4.522, + "step": 3260 + }, + { + "epoch": 0.20243342231050965, + "grad_norm": 0.5671944494320867, + "learning_rate": 6.747361887026692e-05, + "loss": 4.4427, + "step": 3261 + }, + { + "epoch": 0.20249549941026757, + "grad_norm": 0.46115142901018336, + "learning_rate": 6.749430995241051e-05, + "loss": 4.4165, + "step": 3262 + }, + { + "epoch": 0.20255757651002546, + "grad_norm": 1.089714780990559, + "learning_rate": 6.751500103455411e-05, + "loss": 4.4888, + "step": 3263 + }, + { + "epoch": 0.20261965360978335, + "grad_norm": 0.8977588910284282, + "learning_rate": 6.75356921166977e-05, + "loss": 4.5091, + "step": 3264 + }, + { + "epoch": 0.20268173070954126, + "grad_norm": 0.7786579237459617, + "learning_rate": 6.75563831988413e-05, + "loss": 4.3971, + "step": 3265 + }, + { + "epoch": 0.20274380780929915, + "grad_norm": 0.972111254368895, + "learning_rate": 6.75770742809849e-05, + "loss": 4.3962, + "step": 3266 + }, + { + "epoch": 0.20280588490905704, + "grad_norm": 0.7616157266442631, + "learning_rate": 6.759776536312849e-05, + "loss": 4.3338, + "step": 3267 + }, + { + "epoch": 0.20286796200881496, + "grad_norm": 0.4188736718730766, + "learning_rate": 6.76184564452721e-05, + "loss": 4.2509, + "step": 3268 + }, + { + "epoch": 0.20293003910857285, + "grad_norm": 0.5230638394442891, + "learning_rate": 6.763914752741569e-05, + "loss": 4.4556, + "step": 3269 + }, + { + "epoch": 0.20299211620833074, + "grad_norm": 0.9829168267362453, + "learning_rate": 6.765983860955927e-05, + "loss": 4.424, + "step": 3270 + }, + { + "epoch": 0.20305419330808866, + "grad_norm": 1.0659384906801102, + "learning_rate": 6.768052969170288e-05, + "loss": 4.4006, + "step": 3271 + }, + { + "epoch": 0.20311627040784655, + "grad_norm": 0.8034290732583903, + "learning_rate": 6.770122077384648e-05, + "loss": 4.4711, + "step": 3272 + }, + { + "epoch": 0.20317834750760444, + "grad_norm": 1.4051632390611468, + "learning_rate": 6.772191185599007e-05, + "loss": 4.4573, + "step": 3273 + }, + { + "epoch": 0.20324042460736236, + "grad_norm": 0.5797059634576607, + "learning_rate": 6.774260293813367e-05, + "loss": 4.515, + "step": 3274 + }, + { + "epoch": 0.20330250170712025, + "grad_norm": 0.7310717944284653, + "learning_rate": 6.776329402027726e-05, + "loss": 4.3909, + "step": 3275 + }, + { + "epoch": 0.20336457880687814, + "grad_norm": 1.0157507011679676, + "learning_rate": 6.778398510242086e-05, + "loss": 4.4623, + "step": 3276 + }, + { + "epoch": 0.20342665590663606, + "grad_norm": 0.8373098427978868, + "learning_rate": 6.780467618456445e-05, + "loss": 4.3262, + "step": 3277 + }, + { + "epoch": 0.20348873300639395, + "grad_norm": 0.6679713823590081, + "learning_rate": 6.782536726670805e-05, + "loss": 4.4408, + "step": 3278 + }, + { + "epoch": 0.20355081010615184, + "grad_norm": 0.4856674732010864, + "learning_rate": 6.784605834885164e-05, + "loss": 4.4224, + "step": 3279 + }, + { + "epoch": 0.20361288720590973, + "grad_norm": 0.6588809318282698, + "learning_rate": 6.786674943099525e-05, + "loss": 4.4921, + "step": 3280 + }, + { + "epoch": 0.20367496430566764, + "grad_norm": 0.5400952660008997, + "learning_rate": 6.788744051313885e-05, + "loss": 4.3838, + "step": 3281 + }, + { + "epoch": 0.20373704140542553, + "grad_norm": 0.4081936885345343, + "learning_rate": 6.790813159528243e-05, + "loss": 4.3674, + "step": 3282 + }, + { + "epoch": 0.20379911850518342, + "grad_norm": 0.4793348190099341, + "learning_rate": 6.792882267742602e-05, + "loss": 4.4918, + "step": 3283 + }, + { + "epoch": 0.20386119560494134, + "grad_norm": 0.5898233119632106, + "learning_rate": 6.794951375956963e-05, + "loss": 4.4277, + "step": 3284 + }, + { + "epoch": 0.20392327270469923, + "grad_norm": 0.4087920435301472, + "learning_rate": 6.797020484171323e-05, + "loss": 4.3681, + "step": 3285 + }, + { + "epoch": 0.20398534980445712, + "grad_norm": 0.7770365966023811, + "learning_rate": 6.799089592385682e-05, + "loss": 4.5035, + "step": 3286 + }, + { + "epoch": 0.20404742690421504, + "grad_norm": 0.3046990783643352, + "learning_rate": 6.801158700600042e-05, + "loss": 4.3306, + "step": 3287 + }, + { + "epoch": 0.20410950400397293, + "grad_norm": 0.544373485856518, + "learning_rate": 6.803227808814401e-05, + "loss": 4.3665, + "step": 3288 + }, + { + "epoch": 0.20417158110373082, + "grad_norm": 0.5100118984910075, + "learning_rate": 6.805296917028761e-05, + "loss": 4.3114, + "step": 3289 + }, + { + "epoch": 0.20423365820348874, + "grad_norm": 0.49427439269970647, + "learning_rate": 6.80736602524312e-05, + "loss": 4.4518, + "step": 3290 + }, + { + "epoch": 0.20429573530324663, + "grad_norm": 0.3671714900970227, + "learning_rate": 6.80943513345748e-05, + "loss": 4.2914, + "step": 3291 + }, + { + "epoch": 0.20435781240300452, + "grad_norm": 0.5363618955378209, + "learning_rate": 6.81150424167184e-05, + "loss": 4.2285, + "step": 3292 + }, + { + "epoch": 0.20441988950276244, + "grad_norm": 0.493600678499434, + "learning_rate": 6.8135733498862e-05, + "loss": 4.3472, + "step": 3293 + }, + { + "epoch": 0.20448196660252033, + "grad_norm": 0.6068057649132379, + "learning_rate": 6.815642458100558e-05, + "loss": 4.3634, + "step": 3294 + }, + { + "epoch": 0.20454404370227822, + "grad_norm": 0.4989753296836927, + "learning_rate": 6.817711566314918e-05, + "loss": 4.4383, + "step": 3295 + }, + { + "epoch": 0.20460612080203613, + "grad_norm": 0.4375106776160652, + "learning_rate": 6.819780674529279e-05, + "loss": 4.4124, + "step": 3296 + }, + { + "epoch": 0.20466819790179402, + "grad_norm": 0.4395926360039383, + "learning_rate": 6.821849782743638e-05, + "loss": 4.4146, + "step": 3297 + }, + { + "epoch": 0.20473027500155191, + "grad_norm": 0.3877222930975066, + "learning_rate": 6.823918890957998e-05, + "loss": 4.2402, + "step": 3298 + }, + { + "epoch": 0.20479235210130983, + "grad_norm": 0.31322905612246293, + "learning_rate": 6.825987999172357e-05, + "loss": 4.1609, + "step": 3299 + }, + { + "epoch": 0.20485442920106772, + "grad_norm": 0.35973762792609326, + "learning_rate": 6.828057107386717e-05, + "loss": 4.337, + "step": 3300 + }, + { + "epoch": 0.2049165063008256, + "grad_norm": 0.386838682439018, + "learning_rate": 6.830126215601076e-05, + "loss": 4.3052, + "step": 3301 + }, + { + "epoch": 0.20497858340058353, + "grad_norm": 0.4345440288696158, + "learning_rate": 6.832195323815436e-05, + "loss": 4.273, + "step": 3302 + }, + { + "epoch": 0.20504066050034142, + "grad_norm": 0.3515216591742669, + "learning_rate": 6.834264432029795e-05, + "loss": 4.3417, + "step": 3303 + }, + { + "epoch": 0.2051027376000993, + "grad_norm": 0.4447069233280145, + "learning_rate": 6.836333540244155e-05, + "loss": 4.3995, + "step": 3304 + }, + { + "epoch": 0.20516481469985723, + "grad_norm": 0.36533617447107414, + "learning_rate": 6.838402648458516e-05, + "loss": 4.466, + "step": 3305 + }, + { + "epoch": 0.20522689179961512, + "grad_norm": 0.27338879015171047, + "learning_rate": 6.840471756672874e-05, + "loss": 4.3673, + "step": 3306 + }, + { + "epoch": 0.205288968899373, + "grad_norm": 0.39488978194209895, + "learning_rate": 6.842540864887233e-05, + "loss": 4.3927, + "step": 3307 + }, + { + "epoch": 0.20535104599913093, + "grad_norm": 0.46787348495633535, + "learning_rate": 6.844609973101593e-05, + "loss": 4.4389, + "step": 3308 + }, + { + "epoch": 0.20541312309888882, + "grad_norm": 0.43901803412575596, + "learning_rate": 6.846679081315954e-05, + "loss": 4.3975, + "step": 3309 + }, + { + "epoch": 0.2054752001986467, + "grad_norm": 0.35780126728933703, + "learning_rate": 6.848748189530313e-05, + "loss": 4.311, + "step": 3310 + }, + { + "epoch": 0.20553727729840462, + "grad_norm": 0.27372930487699354, + "learning_rate": 6.850817297744673e-05, + "loss": 4.2871, + "step": 3311 + }, + { + "epoch": 0.20559935439816251, + "grad_norm": 0.2965950775542817, + "learning_rate": 6.852886405959032e-05, + "loss": 4.3383, + "step": 3312 + }, + { + "epoch": 0.2056614314979204, + "grad_norm": 0.39184591227968524, + "learning_rate": 6.854955514173392e-05, + "loss": 4.3622, + "step": 3313 + }, + { + "epoch": 0.20572350859767832, + "grad_norm": 0.39823328656496754, + "learning_rate": 6.857024622387751e-05, + "loss": 4.3891, + "step": 3314 + }, + { + "epoch": 0.2057855856974362, + "grad_norm": 0.39732234703680336, + "learning_rate": 6.859093730602111e-05, + "loss": 4.3376, + "step": 3315 + }, + { + "epoch": 0.2058476627971941, + "grad_norm": 0.775152905054609, + "learning_rate": 6.86116283881647e-05, + "loss": 4.3884, + "step": 3316 + }, + { + "epoch": 0.20590973989695202, + "grad_norm": 0.4557605096847161, + "learning_rate": 6.863231947030831e-05, + "loss": 4.2545, + "step": 3317 + }, + { + "epoch": 0.2059718169967099, + "grad_norm": 0.6582208964951204, + "learning_rate": 6.86530105524519e-05, + "loss": 4.3777, + "step": 3318 + }, + { + "epoch": 0.2060338940964678, + "grad_norm": 0.9193665271837994, + "learning_rate": 6.867370163459549e-05, + "loss": 4.3365, + "step": 3319 + }, + { + "epoch": 0.20609597119622572, + "grad_norm": 0.5942659896621988, + "learning_rate": 6.869439271673908e-05, + "loss": 4.321, + "step": 3320 + }, + { + "epoch": 0.2061580482959836, + "grad_norm": 0.46888880264074023, + "learning_rate": 6.871508379888269e-05, + "loss": 4.3853, + "step": 3321 + }, + { + "epoch": 0.2062201253957415, + "grad_norm": 0.661692732982962, + "learning_rate": 6.873577488102629e-05, + "loss": 4.2625, + "step": 3322 + }, + { + "epoch": 0.20628220249549942, + "grad_norm": 0.543914684481209, + "learning_rate": 6.875646596316988e-05, + "loss": 4.3656, + "step": 3323 + }, + { + "epoch": 0.2063442795952573, + "grad_norm": 0.4793472226287735, + "learning_rate": 6.877715704531348e-05, + "loss": 4.2523, + "step": 3324 + }, + { + "epoch": 0.2064063566950152, + "grad_norm": 0.7754904733167372, + "learning_rate": 6.879784812745707e-05, + "loss": 4.2546, + "step": 3325 + }, + { + "epoch": 0.20646843379477312, + "grad_norm": 0.6784111403690922, + "learning_rate": 6.881853920960067e-05, + "loss": 4.4152, + "step": 3326 + }, + { + "epoch": 0.206530510894531, + "grad_norm": 0.41962054774513563, + "learning_rate": 6.883923029174426e-05, + "loss": 4.4156, + "step": 3327 + }, + { + "epoch": 0.2065925879942889, + "grad_norm": 0.4218360565012995, + "learning_rate": 6.885992137388786e-05, + "loss": 4.3294, + "step": 3328 + }, + { + "epoch": 0.2066546650940468, + "grad_norm": 0.37784453228629816, + "learning_rate": 6.888061245603145e-05, + "loss": 4.3416, + "step": 3329 + }, + { + "epoch": 0.2067167421938047, + "grad_norm": 0.39615910910663743, + "learning_rate": 6.890130353817506e-05, + "loss": 4.3927, + "step": 3330 + }, + { + "epoch": 0.2067788192935626, + "grad_norm": 0.29186747512674555, + "learning_rate": 6.892199462031864e-05, + "loss": 4.319, + "step": 3331 + }, + { + "epoch": 0.2068408963933205, + "grad_norm": 0.4633943094923264, + "learning_rate": 6.894268570246224e-05, + "loss": 4.3751, + "step": 3332 + }, + { + "epoch": 0.2069029734930784, + "grad_norm": 0.37183374608842096, + "learning_rate": 6.896337678460583e-05, + "loss": 4.2622, + "step": 3333 + }, + { + "epoch": 0.2069650505928363, + "grad_norm": 0.3178161010988281, + "learning_rate": 6.898406786674944e-05, + "loss": 4.3597, + "step": 3334 + }, + { + "epoch": 0.2070271276925942, + "grad_norm": 0.4011429951792822, + "learning_rate": 6.900475894889304e-05, + "loss": 4.3872, + "step": 3335 + }, + { + "epoch": 0.2070892047923521, + "grad_norm": 0.4377870639706599, + "learning_rate": 6.902545003103663e-05, + "loss": 4.3394, + "step": 3336 + }, + { + "epoch": 0.20715128189211, + "grad_norm": 0.40179149911137174, + "learning_rate": 6.904614111318021e-05, + "loss": 4.3476, + "step": 3337 + }, + { + "epoch": 0.2072133589918679, + "grad_norm": 0.43102551989444626, + "learning_rate": 6.906683219532382e-05, + "loss": 4.3438, + "step": 3338 + }, + { + "epoch": 0.2072754360916258, + "grad_norm": 0.3497931567431655, + "learning_rate": 6.908752327746742e-05, + "loss": 4.4762, + "step": 3339 + }, + { + "epoch": 0.2073375131913837, + "grad_norm": 0.39656092463812465, + "learning_rate": 6.910821435961101e-05, + "loss": 4.2786, + "step": 3340 + }, + { + "epoch": 0.2073995902911416, + "grad_norm": 0.4339642702900046, + "learning_rate": 6.91289054417546e-05, + "loss": 4.3761, + "step": 3341 + }, + { + "epoch": 0.2074616673908995, + "grad_norm": 0.342055043403237, + "learning_rate": 6.914959652389821e-05, + "loss": 4.38, + "step": 3342 + }, + { + "epoch": 0.20752374449065739, + "grad_norm": 0.45390724208283467, + "learning_rate": 6.91702876060418e-05, + "loss": 4.3037, + "step": 3343 + }, + { + "epoch": 0.2075858215904153, + "grad_norm": 0.41088025814877926, + "learning_rate": 6.919097868818539e-05, + "loss": 4.3184, + "step": 3344 + }, + { + "epoch": 0.2076478986901732, + "grad_norm": 0.5053756934799731, + "learning_rate": 6.921166977032899e-05, + "loss": 4.3506, + "step": 3345 + }, + { + "epoch": 0.20770997578993108, + "grad_norm": 0.5126809835702272, + "learning_rate": 6.92323608524726e-05, + "loss": 4.2517, + "step": 3346 + }, + { + "epoch": 0.207772052889689, + "grad_norm": 0.40545532557143793, + "learning_rate": 6.925305193461619e-05, + "loss": 4.2512, + "step": 3347 + }, + { + "epoch": 0.2078341299894469, + "grad_norm": 0.6383218768671236, + "learning_rate": 6.927374301675979e-05, + "loss": 4.2782, + "step": 3348 + }, + { + "epoch": 0.20789620708920478, + "grad_norm": 0.3981160568897669, + "learning_rate": 6.929443409890337e-05, + "loss": 4.3191, + "step": 3349 + }, + { + "epoch": 0.2079582841889627, + "grad_norm": 0.4432344134506191, + "learning_rate": 6.931512518104698e-05, + "loss": 4.3203, + "step": 3350 + }, + { + "epoch": 0.2080203612887206, + "grad_norm": 0.5484683938304837, + "learning_rate": 6.933581626319057e-05, + "loss": 4.3706, + "step": 3351 + }, + { + "epoch": 0.20808243838847848, + "grad_norm": 0.5341937466014377, + "learning_rate": 6.935650734533417e-05, + "loss": 4.3388, + "step": 3352 + }, + { + "epoch": 0.2081445154882364, + "grad_norm": 0.5619412136844033, + "learning_rate": 6.937719842747776e-05, + "loss": 4.3569, + "step": 3353 + }, + { + "epoch": 0.2082065925879943, + "grad_norm": 0.4444402551287991, + "learning_rate": 6.939788950962136e-05, + "loss": 4.4007, + "step": 3354 + }, + { + "epoch": 0.20826866968775218, + "grad_norm": 0.4070040966502598, + "learning_rate": 6.941858059176495e-05, + "loss": 4.3508, + "step": 3355 + }, + { + "epoch": 0.2083307467875101, + "grad_norm": 0.35374078903742395, + "learning_rate": 6.943927167390855e-05, + "loss": 4.2537, + "step": 3356 + }, + { + "epoch": 0.20839282388726799, + "grad_norm": 0.6169953590400913, + "learning_rate": 6.945996275605214e-05, + "loss": 4.2312, + "step": 3357 + }, + { + "epoch": 0.20845490098702588, + "grad_norm": 0.2960384402791654, + "learning_rate": 6.948065383819574e-05, + "loss": 4.4088, + "step": 3358 + }, + { + "epoch": 0.2085169780867838, + "grad_norm": 0.46879490001531865, + "learning_rate": 6.950134492033934e-05, + "loss": 4.1978, + "step": 3359 + }, + { + "epoch": 0.20857905518654168, + "grad_norm": 0.45520607127915896, + "learning_rate": 6.952203600248294e-05, + "loss": 4.2658, + "step": 3360 + }, + { + "epoch": 0.20864113228629957, + "grad_norm": 0.3305342981248987, + "learning_rate": 6.954272708462652e-05, + "loss": 4.4081, + "step": 3361 + }, + { + "epoch": 0.2087032093860575, + "grad_norm": 0.43945768565884336, + "learning_rate": 6.956341816677012e-05, + "loss": 4.3612, + "step": 3362 + }, + { + "epoch": 0.20876528648581538, + "grad_norm": 0.4274315131970622, + "learning_rate": 6.958410924891372e-05, + "loss": 4.4372, + "step": 3363 + }, + { + "epoch": 0.20882736358557327, + "grad_norm": 0.48295179598326543, + "learning_rate": 6.960480033105732e-05, + "loss": 4.2472, + "step": 3364 + }, + { + "epoch": 0.2088894406853312, + "grad_norm": 0.53058714746229, + "learning_rate": 6.962549141320091e-05, + "loss": 4.3437, + "step": 3365 + }, + { + "epoch": 0.20895151778508908, + "grad_norm": 0.5745473427550815, + "learning_rate": 6.964618249534451e-05, + "loss": 4.4309, + "step": 3366 + }, + { + "epoch": 0.20901359488484697, + "grad_norm": 0.569520295591145, + "learning_rate": 6.96668735774881e-05, + "loss": 4.3188, + "step": 3367 + }, + { + "epoch": 0.2090756719846049, + "grad_norm": 0.5122154678799059, + "learning_rate": 6.96875646596317e-05, + "loss": 4.2779, + "step": 3368 + }, + { + "epoch": 0.20913774908436278, + "grad_norm": 0.3145069881771148, + "learning_rate": 6.97082557417753e-05, + "loss": 4.3541, + "step": 3369 + }, + { + "epoch": 0.20919982618412067, + "grad_norm": 0.3897215886872564, + "learning_rate": 6.972894682391889e-05, + "loss": 4.261, + "step": 3370 + }, + { + "epoch": 0.2092619032838786, + "grad_norm": 0.5436393109927016, + "learning_rate": 6.97496379060625e-05, + "loss": 4.3661, + "step": 3371 + }, + { + "epoch": 0.20932398038363648, + "grad_norm": 0.48731485482351816, + "learning_rate": 6.97703289882061e-05, + "loss": 4.2781, + "step": 3372 + }, + { + "epoch": 0.20938605748339437, + "grad_norm": 0.6503333905671507, + "learning_rate": 6.979102007034968e-05, + "loss": 4.176, + "step": 3373 + }, + { + "epoch": 0.20944813458315228, + "grad_norm": 0.5586976441553719, + "learning_rate": 6.981171115249327e-05, + "loss": 4.2438, + "step": 3374 + }, + { + "epoch": 0.20951021168291017, + "grad_norm": 0.6837727121699051, + "learning_rate": 6.983240223463688e-05, + "loss": 4.4333, + "step": 3375 + }, + { + "epoch": 0.20957228878266806, + "grad_norm": 0.5633363863431525, + "learning_rate": 6.985309331678047e-05, + "loss": 4.2738, + "step": 3376 + }, + { + "epoch": 0.20963436588242598, + "grad_norm": 0.4049141635708805, + "learning_rate": 6.987378439892407e-05, + "loss": 4.4643, + "step": 3377 + }, + { + "epoch": 0.20969644298218387, + "grad_norm": 0.4713815414179165, + "learning_rate": 6.989447548106766e-05, + "loss": 4.2399, + "step": 3378 + }, + { + "epoch": 0.20975852008194176, + "grad_norm": 0.5434937991636937, + "learning_rate": 6.991516656321126e-05, + "loss": 4.311, + "step": 3379 + }, + { + "epoch": 0.20982059718169968, + "grad_norm": 0.5993602371784424, + "learning_rate": 6.993585764535485e-05, + "loss": 4.249, + "step": 3380 + }, + { + "epoch": 0.20988267428145757, + "grad_norm": 0.4111261616438463, + "learning_rate": 6.995654872749845e-05, + "loss": 4.2789, + "step": 3381 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 0.39847815063148123, + "learning_rate": 6.997723980964204e-05, + "loss": 4.3323, + "step": 3382 + }, + { + "epoch": 0.21000682848097338, + "grad_norm": 0.44566635662206927, + "learning_rate": 6.999793089178564e-05, + "loss": 4.2854, + "step": 3383 + }, + { + "epoch": 0.21006890558073127, + "grad_norm": 0.4382107275053232, + "learning_rate": 7.001862197392925e-05, + "loss": 4.243, + "step": 3384 + }, + { + "epoch": 0.21013098268048916, + "grad_norm": 0.26439723800577813, + "learning_rate": 7.003931305607283e-05, + "loss": 4.2314, + "step": 3385 + }, + { + "epoch": 0.21019305978024708, + "grad_norm": 0.4447857099963568, + "learning_rate": 7.006000413821643e-05, + "loss": 4.2556, + "step": 3386 + }, + { + "epoch": 0.21025513688000497, + "grad_norm": 0.36376750356633913, + "learning_rate": 7.008069522036002e-05, + "loss": 4.3564, + "step": 3387 + }, + { + "epoch": 0.21031721397976286, + "grad_norm": 0.4200570354733409, + "learning_rate": 7.010138630250363e-05, + "loss": 4.2809, + "step": 3388 + }, + { + "epoch": 0.21037929107952077, + "grad_norm": 0.37680155419582917, + "learning_rate": 7.012207738464722e-05, + "loss": 4.2896, + "step": 3389 + }, + { + "epoch": 0.21044136817927867, + "grad_norm": 0.3451007940590245, + "learning_rate": 7.014276846679082e-05, + "loss": 4.3184, + "step": 3390 + }, + { + "epoch": 0.21050344527903656, + "grad_norm": 0.3005907272740512, + "learning_rate": 7.01634595489344e-05, + "loss": 4.3035, + "step": 3391 + }, + { + "epoch": 0.21056552237879447, + "grad_norm": 0.38539349211363466, + "learning_rate": 7.018415063107801e-05, + "loss": 4.3008, + "step": 3392 + }, + { + "epoch": 0.21062759947855236, + "grad_norm": 0.3036892015339221, + "learning_rate": 7.02048417132216e-05, + "loss": 4.1828, + "step": 3393 + }, + { + "epoch": 0.21068967657831025, + "grad_norm": 0.402627641093022, + "learning_rate": 7.02255327953652e-05, + "loss": 4.1611, + "step": 3394 + }, + { + "epoch": 0.21075175367806817, + "grad_norm": 0.3396947358632007, + "learning_rate": 7.02462238775088e-05, + "loss": 4.271, + "step": 3395 + }, + { + "epoch": 0.21081383077782606, + "grad_norm": 0.37097232237368233, + "learning_rate": 7.02669149596524e-05, + "loss": 4.3331, + "step": 3396 + }, + { + "epoch": 0.21087590787758395, + "grad_norm": 0.40762030139240013, + "learning_rate": 7.0287606041796e-05, + "loss": 4.5014, + "step": 3397 + }, + { + "epoch": 0.21093798497734187, + "grad_norm": 0.5223715447217774, + "learning_rate": 7.030829712393958e-05, + "loss": 4.3792, + "step": 3398 + }, + { + "epoch": 0.21100006207709976, + "grad_norm": 0.2602869982552411, + "learning_rate": 7.032898820608317e-05, + "loss": 4.3021, + "step": 3399 + }, + { + "epoch": 0.21106213917685765, + "grad_norm": 0.5022533721070093, + "learning_rate": 7.034967928822678e-05, + "loss": 4.273, + "step": 3400 + }, + { + "epoch": 0.21112421627661557, + "grad_norm": 0.3412005972646548, + "learning_rate": 7.037037037037038e-05, + "loss": 4.2137, + "step": 3401 + }, + { + "epoch": 0.21118629337637346, + "grad_norm": 0.45179598029000406, + "learning_rate": 7.039106145251397e-05, + "loss": 4.2082, + "step": 3402 + }, + { + "epoch": 0.21124837047613135, + "grad_norm": 0.500197729114562, + "learning_rate": 7.041175253465757e-05, + "loss": 4.3272, + "step": 3403 + }, + { + "epoch": 0.21131044757588927, + "grad_norm": 0.5721250498102595, + "learning_rate": 7.043244361680116e-05, + "loss": 4.3496, + "step": 3404 + }, + { + "epoch": 0.21137252467564716, + "grad_norm": 0.4555467640198383, + "learning_rate": 7.045313469894476e-05, + "loss": 4.3491, + "step": 3405 + }, + { + "epoch": 0.21143460177540505, + "grad_norm": 0.41448528369966886, + "learning_rate": 7.047382578108835e-05, + "loss": 4.2559, + "step": 3406 + }, + { + "epoch": 0.21149667887516296, + "grad_norm": 0.602914257488521, + "learning_rate": 7.049451686323195e-05, + "loss": 4.3769, + "step": 3407 + }, + { + "epoch": 0.21155875597492085, + "grad_norm": 0.5174727998486698, + "learning_rate": 7.051520794537554e-05, + "loss": 4.2323, + "step": 3408 + }, + { + "epoch": 0.21162083307467874, + "grad_norm": 0.5546861836592408, + "learning_rate": 7.053589902751915e-05, + "loss": 4.2689, + "step": 3409 + }, + { + "epoch": 0.21168291017443666, + "grad_norm": 0.40744576169266195, + "learning_rate": 7.055659010966273e-05, + "loss": 4.1694, + "step": 3410 + }, + { + "epoch": 0.21174498727419455, + "grad_norm": 1.3599808318526374, + "learning_rate": 7.057728119180633e-05, + "loss": 4.3954, + "step": 3411 + }, + { + "epoch": 0.21180706437395244, + "grad_norm": 0.506457325196711, + "learning_rate": 7.059797227394992e-05, + "loss": 4.3512, + "step": 3412 + }, + { + "epoch": 0.21186914147371036, + "grad_norm": 0.5818511633082896, + "learning_rate": 7.061866335609353e-05, + "loss": 4.3127, + "step": 3413 + }, + { + "epoch": 0.21193121857346825, + "grad_norm": 0.49324893032331146, + "learning_rate": 7.063935443823713e-05, + "loss": 4.2363, + "step": 3414 + }, + { + "epoch": 0.21199329567322614, + "grad_norm": 0.5631035110319462, + "learning_rate": 7.066004552038072e-05, + "loss": 4.2927, + "step": 3415 + }, + { + "epoch": 0.21205537277298406, + "grad_norm": 0.5757339106111753, + "learning_rate": 7.06807366025243e-05, + "loss": 4.2479, + "step": 3416 + }, + { + "epoch": 0.21211744987274195, + "grad_norm": 0.6917534511384947, + "learning_rate": 7.070142768466791e-05, + "loss": 4.3242, + "step": 3417 + }, + { + "epoch": 0.21217952697249984, + "grad_norm": 0.5319275462263944, + "learning_rate": 7.072211876681151e-05, + "loss": 4.3168, + "step": 3418 + }, + { + "epoch": 0.21224160407225776, + "grad_norm": 0.6997957033671405, + "learning_rate": 7.07428098489551e-05, + "loss": 4.236, + "step": 3419 + }, + { + "epoch": 0.21230368117201565, + "grad_norm": 0.7394319477867554, + "learning_rate": 7.07635009310987e-05, + "loss": 4.3998, + "step": 3420 + }, + { + "epoch": 0.21236575827177354, + "grad_norm": 0.500775897492892, + "learning_rate": 7.078419201324231e-05, + "loss": 4.448, + "step": 3421 + }, + { + "epoch": 0.21242783537153145, + "grad_norm": 0.5240909710624416, + "learning_rate": 7.080488309538589e-05, + "loss": 4.2418, + "step": 3422 + }, + { + "epoch": 0.21248991247128934, + "grad_norm": 0.5807437353202693, + "learning_rate": 7.082557417752948e-05, + "loss": 4.331, + "step": 3423 + }, + { + "epoch": 0.21255198957104723, + "grad_norm": 0.4415675518397534, + "learning_rate": 7.084626525967308e-05, + "loss": 4.2815, + "step": 3424 + }, + { + "epoch": 0.21261406667080515, + "grad_norm": 0.38358241140230087, + "learning_rate": 7.086695634181669e-05, + "loss": 4.3373, + "step": 3425 + }, + { + "epoch": 0.21267614377056304, + "grad_norm": 0.3505441327260886, + "learning_rate": 7.088764742396028e-05, + "loss": 4.331, + "step": 3426 + }, + { + "epoch": 0.21273822087032093, + "grad_norm": 0.3782206359226234, + "learning_rate": 7.090833850610388e-05, + "loss": 4.2796, + "step": 3427 + }, + { + "epoch": 0.21280029797007885, + "grad_norm": 0.3633713559209755, + "learning_rate": 7.092902958824746e-05, + "loss": 4.2733, + "step": 3428 + }, + { + "epoch": 0.21286237506983674, + "grad_norm": 0.3104577926406961, + "learning_rate": 7.094972067039107e-05, + "loss": 4.2434, + "step": 3429 + }, + { + "epoch": 0.21292445216959463, + "grad_norm": 0.550277587892304, + "learning_rate": 7.097041175253466e-05, + "loss": 4.2758, + "step": 3430 + }, + { + "epoch": 0.21298652926935255, + "grad_norm": 0.6896339589999944, + "learning_rate": 7.099110283467826e-05, + "loss": 4.2385, + "step": 3431 + }, + { + "epoch": 0.21304860636911044, + "grad_norm": 0.4281033183267075, + "learning_rate": 7.101179391682185e-05, + "loss": 4.4072, + "step": 3432 + }, + { + "epoch": 0.21311068346886833, + "grad_norm": 0.32981791199463123, + "learning_rate": 7.103248499896545e-05, + "loss": 4.28, + "step": 3433 + }, + { + "epoch": 0.21317276056862625, + "grad_norm": 0.4248719962278458, + "learning_rate": 7.105317608110904e-05, + "loss": 4.2447, + "step": 3434 + }, + { + "epoch": 0.21323483766838414, + "grad_norm": 0.39583538289125103, + "learning_rate": 7.107386716325264e-05, + "loss": 4.332, + "step": 3435 + }, + { + "epoch": 0.21329691476814203, + "grad_norm": 0.38536064496867584, + "learning_rate": 7.109455824539623e-05, + "loss": 4.2745, + "step": 3436 + }, + { + "epoch": 0.21335899186789994, + "grad_norm": 0.47544456119541617, + "learning_rate": 7.111524932753983e-05, + "loss": 4.2331, + "step": 3437 + }, + { + "epoch": 0.21342106896765783, + "grad_norm": 0.6498966345761983, + "learning_rate": 7.113594040968344e-05, + "loss": 4.2451, + "step": 3438 + }, + { + "epoch": 0.21348314606741572, + "grad_norm": 0.33689072016696125, + "learning_rate": 7.115663149182703e-05, + "loss": 4.2505, + "step": 3439 + }, + { + "epoch": 0.21354522316717364, + "grad_norm": 0.4113065665048507, + "learning_rate": 7.117732257397061e-05, + "loss": 4.3452, + "step": 3440 + }, + { + "epoch": 0.21360730026693153, + "grad_norm": 0.39115246446714014, + "learning_rate": 7.119801365611422e-05, + "loss": 4.3433, + "step": 3441 + }, + { + "epoch": 0.21366937736668942, + "grad_norm": 0.45771611081048286, + "learning_rate": 7.121870473825782e-05, + "loss": 4.3728, + "step": 3442 + }, + { + "epoch": 0.21373145446644734, + "grad_norm": 0.2878879966253539, + "learning_rate": 7.123939582040141e-05, + "loss": 4.2461, + "step": 3443 + }, + { + "epoch": 0.21379353156620523, + "grad_norm": 0.34949218661315856, + "learning_rate": 7.126008690254501e-05, + "loss": 4.3044, + "step": 3444 + }, + { + "epoch": 0.21385560866596312, + "grad_norm": 0.34424739311549873, + "learning_rate": 7.12807779846886e-05, + "loss": 4.2929, + "step": 3445 + }, + { + "epoch": 0.21391768576572104, + "grad_norm": 0.3966198393819304, + "learning_rate": 7.13014690668322e-05, + "loss": 4.199, + "step": 3446 + }, + { + "epoch": 0.21397976286547893, + "grad_norm": 0.3641446426127409, + "learning_rate": 7.132216014897579e-05, + "loss": 4.3279, + "step": 3447 + }, + { + "epoch": 0.21404183996523682, + "grad_norm": 0.3125325186022543, + "learning_rate": 7.134285123111939e-05, + "loss": 4.2734, + "step": 3448 + }, + { + "epoch": 0.21410391706499474, + "grad_norm": 0.3477021733502696, + "learning_rate": 7.136354231326298e-05, + "loss": 4.3307, + "step": 3449 + }, + { + "epoch": 0.21416599416475263, + "grad_norm": 0.30465218755079715, + "learning_rate": 7.138423339540659e-05, + "loss": 4.297, + "step": 3450 + }, + { + "epoch": 0.21422807126451052, + "grad_norm": 0.3435413637771027, + "learning_rate": 7.140492447755019e-05, + "loss": 4.1912, + "step": 3451 + }, + { + "epoch": 0.21429014836426843, + "grad_norm": 0.2582576982591561, + "learning_rate": 7.142561555969377e-05, + "loss": 4.1873, + "step": 3452 + }, + { + "epoch": 0.21435222546402632, + "grad_norm": 0.37153482826570333, + "learning_rate": 7.144630664183736e-05, + "loss": 4.2779, + "step": 3453 + }, + { + "epoch": 0.21441430256378421, + "grad_norm": 0.3375986745309742, + "learning_rate": 7.146699772398097e-05, + "loss": 4.2365, + "step": 3454 + }, + { + "epoch": 0.21447637966354213, + "grad_norm": 0.3029192575298437, + "learning_rate": 7.148768880612457e-05, + "loss": 4.4114, + "step": 3455 + }, + { + "epoch": 0.21453845676330002, + "grad_norm": 0.5041047821474182, + "learning_rate": 7.150837988826816e-05, + "loss": 4.3532, + "step": 3456 + }, + { + "epoch": 0.2146005338630579, + "grad_norm": 0.4870679046916439, + "learning_rate": 7.152907097041176e-05, + "loss": 4.2786, + "step": 3457 + }, + { + "epoch": 0.21466261096281583, + "grad_norm": 0.32781944353006603, + "learning_rate": 7.154976205255535e-05, + "loss": 4.2538, + "step": 3458 + }, + { + "epoch": 0.21472468806257372, + "grad_norm": 0.3533985618539648, + "learning_rate": 7.157045313469895e-05, + "loss": 4.2526, + "step": 3459 + }, + { + "epoch": 0.2147867651623316, + "grad_norm": 0.5337664245204622, + "learning_rate": 7.159114421684254e-05, + "loss": 4.1712, + "step": 3460 + }, + { + "epoch": 0.21484884226208953, + "grad_norm": 0.9305205259483106, + "learning_rate": 7.161183529898614e-05, + "loss": 4.2999, + "step": 3461 + }, + { + "epoch": 0.21491091936184742, + "grad_norm": 0.5552116120786159, + "learning_rate": 7.163252638112973e-05, + "loss": 4.3107, + "step": 3462 + }, + { + "epoch": 0.2149729964616053, + "grad_norm": 1.1898894613681403, + "learning_rate": 7.165321746327334e-05, + "loss": 4.3146, + "step": 3463 + }, + { + "epoch": 0.21503507356136323, + "grad_norm": 0.7003641177776558, + "learning_rate": 7.167390854541692e-05, + "loss": 4.3007, + "step": 3464 + }, + { + "epoch": 0.21509715066112112, + "grad_norm": 0.7314554985365935, + "learning_rate": 7.169459962756052e-05, + "loss": 4.2625, + "step": 3465 + }, + { + "epoch": 0.215159227760879, + "grad_norm": 0.9917490433423155, + "learning_rate": 7.171529070970413e-05, + "loss": 4.3796, + "step": 3466 + }, + { + "epoch": 0.2152213048606369, + "grad_norm": 0.6131599695108328, + "learning_rate": 7.173598179184772e-05, + "loss": 4.3684, + "step": 3467 + }, + { + "epoch": 0.21528338196039482, + "grad_norm": 0.6799301503574332, + "learning_rate": 7.175667287399132e-05, + "loss": 4.2302, + "step": 3468 + }, + { + "epoch": 0.2153454590601527, + "grad_norm": 0.4137639102729556, + "learning_rate": 7.177736395613491e-05, + "loss": 4.2735, + "step": 3469 + }, + { + "epoch": 0.2154075361599106, + "grad_norm": 0.604489278709303, + "learning_rate": 7.17980550382785e-05, + "loss": 4.3301, + "step": 3470 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 0.4592866105400592, + "learning_rate": 7.18187461204221e-05, + "loss": 4.2359, + "step": 3471 + }, + { + "epoch": 0.2155316903594264, + "grad_norm": 0.4745619165373488, + "learning_rate": 7.18394372025657e-05, + "loss": 4.3527, + "step": 3472 + }, + { + "epoch": 0.2155937674591843, + "grad_norm": 0.4232423839559834, + "learning_rate": 7.186012828470929e-05, + "loss": 4.2701, + "step": 3473 + }, + { + "epoch": 0.2156558445589422, + "grad_norm": 0.3411782360947056, + "learning_rate": 7.188081936685289e-05, + "loss": 4.3354, + "step": 3474 + }, + { + "epoch": 0.2157179216587001, + "grad_norm": 0.4012207172380333, + "learning_rate": 7.19015104489965e-05, + "loss": 4.2953, + "step": 3475 + }, + { + "epoch": 0.215779998758458, + "grad_norm": 0.42944627406273533, + "learning_rate": 7.192220153114008e-05, + "loss": 4.1858, + "step": 3476 + }, + { + "epoch": 0.2158420758582159, + "grad_norm": 0.47083624069475266, + "learning_rate": 7.194289261328367e-05, + "loss": 4.2364, + "step": 3477 + }, + { + "epoch": 0.2159041529579738, + "grad_norm": 0.3418184001139268, + "learning_rate": 7.196358369542727e-05, + "loss": 4.3423, + "step": 3478 + }, + { + "epoch": 0.2159662300577317, + "grad_norm": 0.31739931503821717, + "learning_rate": 7.198427477757088e-05, + "loss": 4.1278, + "step": 3479 + }, + { + "epoch": 0.2160283071574896, + "grad_norm": 0.3685732320821134, + "learning_rate": 7.200496585971447e-05, + "loss": 4.1794, + "step": 3480 + }, + { + "epoch": 0.2160903842572475, + "grad_norm": 0.3987489305978145, + "learning_rate": 7.202565694185807e-05, + "loss": 4.3027, + "step": 3481 + }, + { + "epoch": 0.2161524613570054, + "grad_norm": 0.5106975470266447, + "learning_rate": 7.204634802400166e-05, + "loss": 4.4331, + "step": 3482 + }, + { + "epoch": 0.2162145384567633, + "grad_norm": 0.36619584116019543, + "learning_rate": 7.206703910614526e-05, + "loss": 4.1634, + "step": 3483 + }, + { + "epoch": 0.2162766155565212, + "grad_norm": 0.3446375269904729, + "learning_rate": 7.208773018828885e-05, + "loss": 4.2797, + "step": 3484 + }, + { + "epoch": 0.21633869265627909, + "grad_norm": 0.37669155948312966, + "learning_rate": 7.210842127043245e-05, + "loss": 4.2988, + "step": 3485 + }, + { + "epoch": 0.216400769756037, + "grad_norm": 0.30920452324213127, + "learning_rate": 7.212911235257604e-05, + "loss": 4.2804, + "step": 3486 + }, + { + "epoch": 0.2164628468557949, + "grad_norm": 0.2992293089554871, + "learning_rate": 7.214980343471964e-05, + "loss": 4.2847, + "step": 3487 + }, + { + "epoch": 0.21652492395555278, + "grad_norm": 0.4584360459141466, + "learning_rate": 7.217049451686324e-05, + "loss": 4.164, + "step": 3488 + }, + { + "epoch": 0.2165870010553107, + "grad_norm": 0.36082757869621784, + "learning_rate": 7.219118559900683e-05, + "loss": 4.3868, + "step": 3489 + }, + { + "epoch": 0.2166490781550686, + "grad_norm": 0.2664659875024414, + "learning_rate": 7.221187668115042e-05, + "loss": 4.2843, + "step": 3490 + }, + { + "epoch": 0.21671115525482648, + "grad_norm": 0.3787280971501436, + "learning_rate": 7.223256776329403e-05, + "loss": 4.2795, + "step": 3491 + }, + { + "epoch": 0.2167732323545844, + "grad_norm": 0.30286549835635396, + "learning_rate": 7.225325884543763e-05, + "loss": 4.1647, + "step": 3492 + }, + { + "epoch": 0.2168353094543423, + "grad_norm": 0.395605023529096, + "learning_rate": 7.227394992758122e-05, + "loss": 4.1949, + "step": 3493 + }, + { + "epoch": 0.21689738655410018, + "grad_norm": 0.4185061876587008, + "learning_rate": 7.229464100972482e-05, + "loss": 4.2412, + "step": 3494 + }, + { + "epoch": 0.2169594636538581, + "grad_norm": 0.35769692765842204, + "learning_rate": 7.231533209186841e-05, + "loss": 4.1512, + "step": 3495 + }, + { + "epoch": 0.217021540753616, + "grad_norm": 0.32443538000905586, + "learning_rate": 7.2336023174012e-05, + "loss": 4.2215, + "step": 3496 + }, + { + "epoch": 0.21708361785337388, + "grad_norm": 0.3522456631024602, + "learning_rate": 7.23567142561556e-05, + "loss": 4.3045, + "step": 3497 + }, + { + "epoch": 0.2171456949531318, + "grad_norm": 0.49278008648910826, + "learning_rate": 7.23774053382992e-05, + "loss": 4.257, + "step": 3498 + }, + { + "epoch": 0.21720777205288969, + "grad_norm": 0.4625180994822483, + "learning_rate": 7.239809642044279e-05, + "loss": 4.2236, + "step": 3499 + }, + { + "epoch": 0.21726984915264758, + "grad_norm": 0.3802747886096873, + "learning_rate": 7.24187875025864e-05, + "loss": 4.2128, + "step": 3500 + }, + { + "epoch": 0.2173319262524055, + "grad_norm": 0.36925000381727996, + "learning_rate": 7.243947858472998e-05, + "loss": 4.2082, + "step": 3501 + }, + { + "epoch": 0.21739400335216338, + "grad_norm": 0.40774249137289237, + "learning_rate": 7.246016966687358e-05, + "loss": 4.3203, + "step": 3502 + }, + { + "epoch": 0.21745608045192127, + "grad_norm": 0.5167735630120427, + "learning_rate": 7.248086074901717e-05, + "loss": 4.3109, + "step": 3503 + }, + { + "epoch": 0.2175181575516792, + "grad_norm": 0.40417729327726953, + "learning_rate": 7.250155183116078e-05, + "loss": 4.1555, + "step": 3504 + }, + { + "epoch": 0.21758023465143708, + "grad_norm": 0.32719197908704, + "learning_rate": 7.252224291330437e-05, + "loss": 4.2876, + "step": 3505 + }, + { + "epoch": 0.21764231175119497, + "grad_norm": 0.6778694479952542, + "learning_rate": 7.254293399544797e-05, + "loss": 4.1786, + "step": 3506 + }, + { + "epoch": 0.2177043888509529, + "grad_norm": 0.6004659393268589, + "learning_rate": 7.256362507759155e-05, + "loss": 4.2975, + "step": 3507 + }, + { + "epoch": 0.21776646595071078, + "grad_norm": 0.3860147406624487, + "learning_rate": 7.258431615973516e-05, + "loss": 4.2505, + "step": 3508 + }, + { + "epoch": 0.21782854305046867, + "grad_norm": 0.48443656960768244, + "learning_rate": 7.260500724187875e-05, + "loss": 4.2174, + "step": 3509 + }, + { + "epoch": 0.2178906201502266, + "grad_norm": 0.4975335411525541, + "learning_rate": 7.262569832402235e-05, + "loss": 4.199, + "step": 3510 + }, + { + "epoch": 0.21795269724998448, + "grad_norm": 0.44059677507079154, + "learning_rate": 7.264638940616595e-05, + "loss": 4.2119, + "step": 3511 + }, + { + "epoch": 0.21801477434974237, + "grad_norm": 0.491705310105718, + "learning_rate": 7.266708048830954e-05, + "loss": 4.2766, + "step": 3512 + }, + { + "epoch": 0.2180768514495003, + "grad_norm": 0.5031971340714155, + "learning_rate": 7.268777157045314e-05, + "loss": 4.2822, + "step": 3513 + }, + { + "epoch": 0.21813892854925818, + "grad_norm": 0.3820374486377433, + "learning_rate": 7.270846265259673e-05, + "loss": 4.2992, + "step": 3514 + }, + { + "epoch": 0.21820100564901607, + "grad_norm": 0.47497462907595245, + "learning_rate": 7.272915373474033e-05, + "loss": 4.3245, + "step": 3515 + }, + { + "epoch": 0.21826308274877398, + "grad_norm": 0.41400684126554893, + "learning_rate": 7.274984481688393e-05, + "loss": 4.2946, + "step": 3516 + }, + { + "epoch": 0.21832515984853187, + "grad_norm": 0.4402517733313454, + "learning_rate": 7.277053589902753e-05, + "loss": 4.3014, + "step": 3517 + }, + { + "epoch": 0.21838723694828976, + "grad_norm": 0.3436365807428449, + "learning_rate": 7.279122698117112e-05, + "loss": 4.1556, + "step": 3518 + }, + { + "epoch": 0.21844931404804768, + "grad_norm": 0.44092795528622536, + "learning_rate": 7.28119180633147e-05, + "loss": 4.1586, + "step": 3519 + }, + { + "epoch": 0.21851139114780557, + "grad_norm": 0.41511219053598347, + "learning_rate": 7.283260914545831e-05, + "loss": 4.2042, + "step": 3520 + }, + { + "epoch": 0.21857346824756346, + "grad_norm": 0.4803029216141391, + "learning_rate": 7.285330022760191e-05, + "loss": 4.2622, + "step": 3521 + }, + { + "epoch": 0.21863554534732138, + "grad_norm": 0.4904561851814578, + "learning_rate": 7.28739913097455e-05, + "loss": 4.1859, + "step": 3522 + }, + { + "epoch": 0.21869762244707927, + "grad_norm": 0.48629532362536615, + "learning_rate": 7.28946823918891e-05, + "loss": 4.1517, + "step": 3523 + }, + { + "epoch": 0.21875969954683716, + "grad_norm": 0.6281450534105644, + "learning_rate": 7.29153734740327e-05, + "loss": 4.2442, + "step": 3524 + }, + { + "epoch": 0.21882177664659508, + "grad_norm": 0.5394356859442058, + "learning_rate": 7.293606455617629e-05, + "loss": 4.3076, + "step": 3525 + }, + { + "epoch": 0.21888385374635297, + "grad_norm": 0.6309779383494932, + "learning_rate": 7.295675563831988e-05, + "loss": 4.296, + "step": 3526 + }, + { + "epoch": 0.21894593084611086, + "grad_norm": 0.34936356027603244, + "learning_rate": 7.297744672046348e-05, + "loss": 4.2629, + "step": 3527 + }, + { + "epoch": 0.21900800794586878, + "grad_norm": 0.7467558554262382, + "learning_rate": 7.299813780260707e-05, + "loss": 4.2767, + "step": 3528 + }, + { + "epoch": 0.21907008504562667, + "grad_norm": 0.5031779148874248, + "learning_rate": 7.301882888475068e-05, + "loss": 4.2973, + "step": 3529 + }, + { + "epoch": 0.21913216214538456, + "grad_norm": 0.547116907294778, + "learning_rate": 7.303951996689428e-05, + "loss": 4.2595, + "step": 3530 + }, + { + "epoch": 0.21919423924514247, + "grad_norm": 0.5710858627588772, + "learning_rate": 7.306021104903786e-05, + "loss": 4.27, + "step": 3531 + }, + { + "epoch": 0.21925631634490036, + "grad_norm": 0.47181240133935604, + "learning_rate": 7.308090213118146e-05, + "loss": 4.1661, + "step": 3532 + }, + { + "epoch": 0.21931839344465826, + "grad_norm": 0.4976450327472883, + "learning_rate": 7.310159321332506e-05, + "loss": 4.2996, + "step": 3533 + }, + { + "epoch": 0.21938047054441617, + "grad_norm": 0.46882871920499186, + "learning_rate": 7.312228429546866e-05, + "loss": 4.2009, + "step": 3534 + }, + { + "epoch": 0.21944254764417406, + "grad_norm": 0.38028722972182544, + "learning_rate": 7.314297537761225e-05, + "loss": 4.2763, + "step": 3535 + }, + { + "epoch": 0.21950462474393195, + "grad_norm": 0.36155967510178194, + "learning_rate": 7.316366645975585e-05, + "loss": 4.2507, + "step": 3536 + }, + { + "epoch": 0.21956670184368987, + "grad_norm": 0.3706739158900614, + "learning_rate": 7.318435754189944e-05, + "loss": 4.235, + "step": 3537 + }, + { + "epoch": 0.21962877894344776, + "grad_norm": 0.48228914543080575, + "learning_rate": 7.320504862404304e-05, + "loss": 4.3213, + "step": 3538 + }, + { + "epoch": 0.21969085604320565, + "grad_norm": 0.9420463317078888, + "learning_rate": 7.322573970618663e-05, + "loss": 4.2175, + "step": 3539 + }, + { + "epoch": 0.21975293314296357, + "grad_norm": 1.0426890823360484, + "learning_rate": 7.324643078833023e-05, + "loss": 4.2126, + "step": 3540 + }, + { + "epoch": 0.21981501024272146, + "grad_norm": 0.7131551173517787, + "learning_rate": 7.326712187047384e-05, + "loss": 4.1954, + "step": 3541 + }, + { + "epoch": 0.21987708734247935, + "grad_norm": 0.5208358732797361, + "learning_rate": 7.328781295261743e-05, + "loss": 4.2311, + "step": 3542 + }, + { + "epoch": 0.21993916444223727, + "grad_norm": 0.6040288477491723, + "learning_rate": 7.330850403476101e-05, + "loss": 4.252, + "step": 3543 + }, + { + "epoch": 0.22000124154199516, + "grad_norm": 0.45861439504082424, + "learning_rate": 7.332919511690461e-05, + "loss": 4.2944, + "step": 3544 + }, + { + "epoch": 0.22006331864175305, + "grad_norm": 0.5458527684088328, + "learning_rate": 7.334988619904822e-05, + "loss": 4.2451, + "step": 3545 + }, + { + "epoch": 0.22012539574151097, + "grad_norm": 0.6157813169048111, + "learning_rate": 7.337057728119181e-05, + "loss": 4.1064, + "step": 3546 + }, + { + "epoch": 0.22018747284126886, + "grad_norm": 0.3435778078211948, + "learning_rate": 7.339126836333541e-05, + "loss": 4.3306, + "step": 3547 + }, + { + "epoch": 0.22024954994102675, + "grad_norm": 0.7072321569457987, + "learning_rate": 7.3411959445479e-05, + "loss": 4.2965, + "step": 3548 + }, + { + "epoch": 0.22031162704078466, + "grad_norm": 0.43228956781207195, + "learning_rate": 7.34326505276226e-05, + "loss": 4.2852, + "step": 3549 + }, + { + "epoch": 0.22037370414054255, + "grad_norm": 0.4487967125745889, + "learning_rate": 7.34533416097662e-05, + "loss": 4.1917, + "step": 3550 + }, + { + "epoch": 0.22043578124030044, + "grad_norm": 0.3389322180008439, + "learning_rate": 7.347403269190979e-05, + "loss": 4.2266, + "step": 3551 + }, + { + "epoch": 0.22049785834005836, + "grad_norm": 0.39095155417766053, + "learning_rate": 7.349472377405338e-05, + "loss": 4.2088, + "step": 3552 + }, + { + "epoch": 0.22055993543981625, + "grad_norm": 0.48871061139526173, + "learning_rate": 7.351541485619698e-05, + "loss": 4.1953, + "step": 3553 + }, + { + "epoch": 0.22062201253957414, + "grad_norm": 0.5122278316554516, + "learning_rate": 7.353610593834059e-05, + "loss": 4.1748, + "step": 3554 + }, + { + "epoch": 0.22068408963933206, + "grad_norm": 0.49546987999601055, + "learning_rate": 7.355679702048417e-05, + "loss": 4.2407, + "step": 3555 + }, + { + "epoch": 0.22074616673908995, + "grad_norm": 0.5971315611426324, + "learning_rate": 7.357748810262776e-05, + "loss": 4.1295, + "step": 3556 + }, + { + "epoch": 0.22080824383884784, + "grad_norm": 0.36708300706687974, + "learning_rate": 7.359817918477136e-05, + "loss": 4.2257, + "step": 3557 + }, + { + "epoch": 0.22087032093860576, + "grad_norm": 0.6737850673021233, + "learning_rate": 7.361887026691497e-05, + "loss": 4.1972, + "step": 3558 + }, + { + "epoch": 0.22093239803836365, + "grad_norm": 0.41640414869053827, + "learning_rate": 7.363956134905856e-05, + "loss": 4.2324, + "step": 3559 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 0.36943587348591556, + "learning_rate": 7.366025243120216e-05, + "loss": 4.1768, + "step": 3560 + }, + { + "epoch": 0.22105655223787946, + "grad_norm": 0.34246556946058754, + "learning_rate": 7.368094351334574e-05, + "loss": 4.2178, + "step": 3561 + }, + { + "epoch": 0.22111862933763735, + "grad_norm": 0.877876017015192, + "learning_rate": 7.370163459548935e-05, + "loss": 4.3469, + "step": 3562 + }, + { + "epoch": 0.22118070643739524, + "grad_norm": 0.8062520788594731, + "learning_rate": 7.372232567763294e-05, + "loss": 4.2805, + "step": 3563 + }, + { + "epoch": 0.22124278353715315, + "grad_norm": 0.5957209600061324, + "learning_rate": 7.374301675977654e-05, + "loss": 4.179, + "step": 3564 + }, + { + "epoch": 0.22130486063691104, + "grad_norm": 0.700565447798813, + "learning_rate": 7.376370784192013e-05, + "loss": 4.2144, + "step": 3565 + }, + { + "epoch": 0.22136693773666893, + "grad_norm": 0.787420887984785, + "learning_rate": 7.378439892406374e-05, + "loss": 4.2508, + "step": 3566 + }, + { + "epoch": 0.22142901483642685, + "grad_norm": 0.5355547792392572, + "learning_rate": 7.380509000620734e-05, + "loss": 4.2623, + "step": 3567 + }, + { + "epoch": 0.22149109193618474, + "grad_norm": 0.5703644402249817, + "learning_rate": 7.382578108835092e-05, + "loss": 4.2416, + "step": 3568 + }, + { + "epoch": 0.22155316903594263, + "grad_norm": 0.5068004500179355, + "learning_rate": 7.384647217049451e-05, + "loss": 4.2846, + "step": 3569 + }, + { + "epoch": 0.22161524613570055, + "grad_norm": 0.5208606776370965, + "learning_rate": 7.386716325263812e-05, + "loss": 4.2417, + "step": 3570 + }, + { + "epoch": 0.22167732323545844, + "grad_norm": 0.37440654367012394, + "learning_rate": 7.388785433478172e-05, + "loss": 4.2566, + "step": 3571 + }, + { + "epoch": 0.22173940033521633, + "grad_norm": 0.6945510724117147, + "learning_rate": 7.390854541692531e-05, + "loss": 4.1985, + "step": 3572 + }, + { + "epoch": 0.22180147743497425, + "grad_norm": 0.502516552761606, + "learning_rate": 7.392923649906891e-05, + "loss": 4.2042, + "step": 3573 + }, + { + "epoch": 0.22186355453473214, + "grad_norm": 0.5961845578761817, + "learning_rate": 7.39499275812125e-05, + "loss": 4.2088, + "step": 3574 + }, + { + "epoch": 0.22192563163449003, + "grad_norm": 0.38276181566674233, + "learning_rate": 7.39706186633561e-05, + "loss": 4.2663, + "step": 3575 + }, + { + "epoch": 0.22198770873424795, + "grad_norm": 0.5735492566215374, + "learning_rate": 7.399130974549969e-05, + "loss": 4.2984, + "step": 3576 + }, + { + "epoch": 0.22204978583400584, + "grad_norm": 0.5120591931231062, + "learning_rate": 7.401200082764329e-05, + "loss": 4.2492, + "step": 3577 + }, + { + "epoch": 0.22211186293376373, + "grad_norm": 0.4365565481578904, + "learning_rate": 7.403269190978688e-05, + "loss": 4.2631, + "step": 3578 + }, + { + "epoch": 0.22217394003352164, + "grad_norm": 0.42173547556819285, + "learning_rate": 7.405338299193049e-05, + "loss": 4.3138, + "step": 3579 + }, + { + "epoch": 0.22223601713327953, + "grad_norm": 0.5053480944721261, + "learning_rate": 7.407407407407407e-05, + "loss": 4.1763, + "step": 3580 + }, + { + "epoch": 0.22229809423303742, + "grad_norm": 0.6692913858743644, + "learning_rate": 7.409476515621767e-05, + "loss": 4.2339, + "step": 3581 + }, + { + "epoch": 0.22236017133279534, + "grad_norm": 0.7056228562314875, + "learning_rate": 7.411545623836126e-05, + "loss": 4.0221, + "step": 3582 + }, + { + "epoch": 0.22242224843255323, + "grad_norm": 0.47966744508353176, + "learning_rate": 7.413614732050487e-05, + "loss": 4.2293, + "step": 3583 + }, + { + "epoch": 0.22248432553231112, + "grad_norm": 0.7981005575429758, + "learning_rate": 7.415683840264847e-05, + "loss": 4.305, + "step": 3584 + }, + { + "epoch": 0.22254640263206904, + "grad_norm": 0.6046227974984631, + "learning_rate": 7.417752948479206e-05, + "loss": 4.2844, + "step": 3585 + }, + { + "epoch": 0.22260847973182693, + "grad_norm": 0.45564812257489407, + "learning_rate": 7.419822056693564e-05, + "loss": 4.1914, + "step": 3586 + }, + { + "epoch": 0.22267055683158482, + "grad_norm": 0.7698366362806501, + "learning_rate": 7.421891164907925e-05, + "loss": 4.1925, + "step": 3587 + }, + { + "epoch": 0.22273263393134274, + "grad_norm": 0.4967594186238377, + "learning_rate": 7.423960273122285e-05, + "loss": 4.2656, + "step": 3588 + }, + { + "epoch": 0.22279471103110063, + "grad_norm": 0.6259957725683802, + "learning_rate": 7.426029381336644e-05, + "loss": 4.3158, + "step": 3589 + }, + { + "epoch": 0.22285678813085852, + "grad_norm": 0.7098896142953534, + "learning_rate": 7.428098489551004e-05, + "loss": 4.196, + "step": 3590 + }, + { + "epoch": 0.22291886523061644, + "grad_norm": 0.47405265211967446, + "learning_rate": 7.430167597765365e-05, + "loss": 4.2583, + "step": 3591 + }, + { + "epoch": 0.22298094233037433, + "grad_norm": 0.4094945201681796, + "learning_rate": 7.432236705979723e-05, + "loss": 4.2759, + "step": 3592 + }, + { + "epoch": 0.22304301943013222, + "grad_norm": 0.4805836268849031, + "learning_rate": 7.434305814194082e-05, + "loss": 4.2641, + "step": 3593 + }, + { + "epoch": 0.22310509652989013, + "grad_norm": 0.34024149540174314, + "learning_rate": 7.436374922408442e-05, + "loss": 4.1214, + "step": 3594 + }, + { + "epoch": 0.22316717362964802, + "grad_norm": 0.4633865596995391, + "learning_rate": 7.438444030622803e-05, + "loss": 4.2084, + "step": 3595 + }, + { + "epoch": 0.22322925072940591, + "grad_norm": 0.42630560548601615, + "learning_rate": 7.440513138837162e-05, + "loss": 4.161, + "step": 3596 + }, + { + "epoch": 0.22329132782916383, + "grad_norm": 0.4817976213860456, + "learning_rate": 7.442582247051522e-05, + "loss": 4.3148, + "step": 3597 + }, + { + "epoch": 0.22335340492892172, + "grad_norm": 0.4024803892535301, + "learning_rate": 7.44465135526588e-05, + "loss": 4.2201, + "step": 3598 + }, + { + "epoch": 0.2234154820286796, + "grad_norm": 0.39059896382347253, + "learning_rate": 7.44672046348024e-05, + "loss": 4.2046, + "step": 3599 + }, + { + "epoch": 0.22347755912843753, + "grad_norm": 0.4679579241587828, + "learning_rate": 7.4487895716946e-05, + "loss": 4.2337, + "step": 3600 + }, + { + "epoch": 0.22353963622819542, + "grad_norm": 0.872144979973331, + "learning_rate": 7.45085867990896e-05, + "loss": 4.14, + "step": 3601 + }, + { + "epoch": 0.2236017133279533, + "grad_norm": 0.4334350501708399, + "learning_rate": 7.452927788123319e-05, + "loss": 4.3044, + "step": 3602 + }, + { + "epoch": 0.22366379042771123, + "grad_norm": 0.5708165749876706, + "learning_rate": 7.454996896337679e-05, + "loss": 4.1093, + "step": 3603 + }, + { + "epoch": 0.22372586752746912, + "grad_norm": 0.5253244805090302, + "learning_rate": 7.457066004552038e-05, + "loss": 4.2449, + "step": 3604 + }, + { + "epoch": 0.223787944627227, + "grad_norm": 0.5809234983943321, + "learning_rate": 7.459135112766398e-05, + "loss": 4.324, + "step": 3605 + }, + { + "epoch": 0.22385002172698493, + "grad_norm": 0.3401320867581748, + "learning_rate": 7.461204220980757e-05, + "loss": 4.0886, + "step": 3606 + }, + { + "epoch": 0.22391209882674282, + "grad_norm": 0.4889162436000636, + "learning_rate": 7.463273329195117e-05, + "loss": 4.2583, + "step": 3607 + }, + { + "epoch": 0.2239741759265007, + "grad_norm": 0.5431226059539322, + "learning_rate": 7.465342437409478e-05, + "loss": 4.2526, + "step": 3608 + }, + { + "epoch": 0.22403625302625862, + "grad_norm": 0.4871712847954172, + "learning_rate": 7.467411545623837e-05, + "loss": 4.1465, + "step": 3609 + }, + { + "epoch": 0.22409833012601652, + "grad_norm": 0.5172136282875548, + "learning_rate": 7.469480653838195e-05, + "loss": 4.2309, + "step": 3610 + }, + { + "epoch": 0.2241604072257744, + "grad_norm": 0.41264646343050143, + "learning_rate": 7.471549762052555e-05, + "loss": 4.1597, + "step": 3611 + }, + { + "epoch": 0.22422248432553232, + "grad_norm": 0.5170852905879862, + "learning_rate": 7.473618870266916e-05, + "loss": 4.1415, + "step": 3612 + }, + { + "epoch": 0.2242845614252902, + "grad_norm": 0.48767016509840044, + "learning_rate": 7.475687978481275e-05, + "loss": 4.1949, + "step": 3613 + }, + { + "epoch": 0.2243466385250481, + "grad_norm": 0.38170170680757065, + "learning_rate": 7.477757086695635e-05, + "loss": 4.2045, + "step": 3614 + }, + { + "epoch": 0.22440871562480602, + "grad_norm": 0.48373664815352535, + "learning_rate": 7.479826194909994e-05, + "loss": 4.2266, + "step": 3615 + }, + { + "epoch": 0.2244707927245639, + "grad_norm": 0.5464917611201737, + "learning_rate": 7.481895303124354e-05, + "loss": 4.2495, + "step": 3616 + }, + { + "epoch": 0.2245328698243218, + "grad_norm": 0.5808960000698146, + "learning_rate": 7.483964411338713e-05, + "loss": 4.1427, + "step": 3617 + }, + { + "epoch": 0.22459494692407972, + "grad_norm": 0.46181524078551195, + "learning_rate": 7.486033519553073e-05, + "loss": 4.2777, + "step": 3618 + }, + { + "epoch": 0.2246570240238376, + "grad_norm": 0.30188943405868107, + "learning_rate": 7.488102627767432e-05, + "loss": 4.2524, + "step": 3619 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.4166397821177167, + "learning_rate": 7.490171735981793e-05, + "loss": 4.1301, + "step": 3620 + }, + { + "epoch": 0.22478117822335342, + "grad_norm": 0.30685445291369107, + "learning_rate": 7.492240844196153e-05, + "loss": 4.1131, + "step": 3621 + }, + { + "epoch": 0.2248432553231113, + "grad_norm": 0.3233814458141957, + "learning_rate": 7.494309952410511e-05, + "loss": 4.1573, + "step": 3622 + }, + { + "epoch": 0.2249053324228692, + "grad_norm": 0.42155621104713686, + "learning_rate": 7.49637906062487e-05, + "loss": 4.26, + "step": 3623 + }, + { + "epoch": 0.22496740952262712, + "grad_norm": 0.2862059073450649, + "learning_rate": 7.498448168839231e-05, + "loss": 4.1661, + "step": 3624 + }, + { + "epoch": 0.225029486622385, + "grad_norm": 0.530394089041723, + "learning_rate": 7.50051727705359e-05, + "loss": 4.1709, + "step": 3625 + }, + { + "epoch": 0.2250915637221429, + "grad_norm": 0.6638197286801457, + "learning_rate": 7.50258638526795e-05, + "loss": 4.2248, + "step": 3626 + }, + { + "epoch": 0.2251536408219008, + "grad_norm": 0.40132285659893485, + "learning_rate": 7.50465549348231e-05, + "loss": 4.0975, + "step": 3627 + }, + { + "epoch": 0.2252157179216587, + "grad_norm": 0.4956262894189136, + "learning_rate": 7.506724601696669e-05, + "loss": 4.133, + "step": 3628 + }, + { + "epoch": 0.2252777950214166, + "grad_norm": 0.3698402919051611, + "learning_rate": 7.508793709911029e-05, + "loss": 4.3242, + "step": 3629 + }, + { + "epoch": 0.2253398721211745, + "grad_norm": 0.3258076650215754, + "learning_rate": 7.510862818125388e-05, + "loss": 4.1137, + "step": 3630 + }, + { + "epoch": 0.2254019492209324, + "grad_norm": 0.4560244629869258, + "learning_rate": 7.512931926339748e-05, + "loss": 4.1924, + "step": 3631 + }, + { + "epoch": 0.2254640263206903, + "grad_norm": 0.3354454429003059, + "learning_rate": 7.515001034554107e-05, + "loss": 4.1761, + "step": 3632 + }, + { + "epoch": 0.2255261034204482, + "grad_norm": 0.3208612765285339, + "learning_rate": 7.517070142768468e-05, + "loss": 4.2187, + "step": 3633 + }, + { + "epoch": 0.2255881805202061, + "grad_norm": 0.354834924216054, + "learning_rate": 7.519139250982826e-05, + "loss": 4.2727, + "step": 3634 + }, + { + "epoch": 0.225650257619964, + "grad_norm": 0.2511756938210935, + "learning_rate": 7.521208359197186e-05, + "loss": 4.1995, + "step": 3635 + }, + { + "epoch": 0.2257123347197219, + "grad_norm": 0.31259557931264276, + "learning_rate": 7.523277467411545e-05, + "loss": 4.138, + "step": 3636 + }, + { + "epoch": 0.2257744118194798, + "grad_norm": 0.3114168229822779, + "learning_rate": 7.525346575625906e-05, + "loss": 4.2226, + "step": 3637 + }, + { + "epoch": 0.2258364889192377, + "grad_norm": 0.31518533412651367, + "learning_rate": 7.527415683840266e-05, + "loss": 4.1848, + "step": 3638 + }, + { + "epoch": 0.2258985660189956, + "grad_norm": 0.40617212135937014, + "learning_rate": 7.529484792054625e-05, + "loss": 4.2209, + "step": 3639 + }, + { + "epoch": 0.2259606431187535, + "grad_norm": 0.336195724513646, + "learning_rate": 7.531553900268985e-05, + "loss": 4.1773, + "step": 3640 + }, + { + "epoch": 0.22602272021851139, + "grad_norm": 0.28957462051832417, + "learning_rate": 7.533623008483344e-05, + "loss": 4.2131, + "step": 3641 + }, + { + "epoch": 0.2260847973182693, + "grad_norm": 0.39628776598963156, + "learning_rate": 7.535692116697704e-05, + "loss": 4.1122, + "step": 3642 + }, + { + "epoch": 0.2261468744180272, + "grad_norm": 0.3649332569775265, + "learning_rate": 7.537761224912063e-05, + "loss": 4.2566, + "step": 3643 + }, + { + "epoch": 0.22620895151778508, + "grad_norm": 0.3944622017142299, + "learning_rate": 7.539830333126423e-05, + "loss": 4.1179, + "step": 3644 + }, + { + "epoch": 0.226271028617543, + "grad_norm": 0.39236841246564075, + "learning_rate": 7.541899441340783e-05, + "loss": 4.214, + "step": 3645 + }, + { + "epoch": 0.2263331057173009, + "grad_norm": 0.5058523936139718, + "learning_rate": 7.543968549555143e-05, + "loss": 4.2686, + "step": 3646 + }, + { + "epoch": 0.22639518281705878, + "grad_norm": 0.32781634118490677, + "learning_rate": 7.546037657769501e-05, + "loss": 4.2058, + "step": 3647 + }, + { + "epoch": 0.2264572599168167, + "grad_norm": 0.6198000119285915, + "learning_rate": 7.54810676598386e-05, + "loss": 4.1092, + "step": 3648 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 0.3829649213072673, + "learning_rate": 7.550175874198221e-05, + "loss": 4.2161, + "step": 3649 + }, + { + "epoch": 0.22658141411633248, + "grad_norm": 0.4342624216537688, + "learning_rate": 7.552244982412581e-05, + "loss": 4.1067, + "step": 3650 + }, + { + "epoch": 0.2266434912160904, + "grad_norm": 0.39949991660540957, + "learning_rate": 7.55431409062694e-05, + "loss": 4.1217, + "step": 3651 + }, + { + "epoch": 0.2267055683158483, + "grad_norm": 0.33140493674155197, + "learning_rate": 7.5563831988413e-05, + "loss": 4.1908, + "step": 3652 + }, + { + "epoch": 0.22676764541560618, + "grad_norm": 0.39281947749578633, + "learning_rate": 7.55845230705566e-05, + "loss": 4.1794, + "step": 3653 + }, + { + "epoch": 0.22682972251536407, + "grad_norm": 0.42433370153299277, + "learning_rate": 7.560521415270019e-05, + "loss": 4.23, + "step": 3654 + }, + { + "epoch": 0.226891799615122, + "grad_norm": 0.36387991030651795, + "learning_rate": 7.562590523484379e-05, + "loss": 4.2278, + "step": 3655 + }, + { + "epoch": 0.22695387671487988, + "grad_norm": 0.4631288454665768, + "learning_rate": 7.564659631698738e-05, + "loss": 4.0975, + "step": 3656 + }, + { + "epoch": 0.22701595381463777, + "grad_norm": 0.255561976090116, + "learning_rate": 7.566728739913098e-05, + "loss": 4.123, + "step": 3657 + }, + { + "epoch": 0.22707803091439568, + "grad_norm": 0.319286004225129, + "learning_rate": 7.568797848127458e-05, + "loss": 4.1845, + "step": 3658 + }, + { + "epoch": 0.22714010801415357, + "grad_norm": 0.2792045200481058, + "learning_rate": 7.570866956341817e-05, + "loss": 4.139, + "step": 3659 + }, + { + "epoch": 0.22720218511391146, + "grad_norm": 0.3244249882887881, + "learning_rate": 7.572936064556176e-05, + "loss": 4.1643, + "step": 3660 + }, + { + "epoch": 0.22726426221366938, + "grad_norm": 0.3147434055782879, + "learning_rate": 7.575005172770536e-05, + "loss": 4.24, + "step": 3661 + }, + { + "epoch": 0.22732633931342727, + "grad_norm": 0.3861190500156011, + "learning_rate": 7.577074280984896e-05, + "loss": 4.1886, + "step": 3662 + }, + { + "epoch": 0.22738841641318516, + "grad_norm": 0.38408606687395047, + "learning_rate": 7.579143389199256e-05, + "loss": 4.1499, + "step": 3663 + }, + { + "epoch": 0.22745049351294308, + "grad_norm": 0.3968916243728341, + "learning_rate": 7.581212497413615e-05, + "loss": 4.1124, + "step": 3664 + }, + { + "epoch": 0.22751257061270097, + "grad_norm": 0.5083703718084963, + "learning_rate": 7.583281605627975e-05, + "loss": 4.136, + "step": 3665 + }, + { + "epoch": 0.22757464771245886, + "grad_norm": 0.4701155798408713, + "learning_rate": 7.585350713842334e-05, + "loss": 4.1025, + "step": 3666 + }, + { + "epoch": 0.22763672481221678, + "grad_norm": 0.48069749984785415, + "learning_rate": 7.587419822056694e-05, + "loss": 4.2951, + "step": 3667 + }, + { + "epoch": 0.22769880191197467, + "grad_norm": 0.6625263622724942, + "learning_rate": 7.589488930271053e-05, + "loss": 4.2413, + "step": 3668 + }, + { + "epoch": 0.22776087901173256, + "grad_norm": 0.4602792209629989, + "learning_rate": 7.591558038485413e-05, + "loss": 4.335, + "step": 3669 + }, + { + "epoch": 0.22782295611149048, + "grad_norm": 0.36040496430481517, + "learning_rate": 7.593627146699774e-05, + "loss": 4.2253, + "step": 3670 + }, + { + "epoch": 0.22788503321124837, + "grad_norm": 0.6419517919854554, + "learning_rate": 7.595696254914132e-05, + "loss": 4.1862, + "step": 3671 + }, + { + "epoch": 0.22794711031100626, + "grad_norm": 0.7313304856829405, + "learning_rate": 7.597765363128491e-05, + "loss": 4.2082, + "step": 3672 + }, + { + "epoch": 0.22800918741076417, + "grad_norm": 0.6209541979744204, + "learning_rate": 7.599834471342851e-05, + "loss": 4.2635, + "step": 3673 + }, + { + "epoch": 0.22807126451052206, + "grad_norm": 0.4153629251126122, + "learning_rate": 7.601903579557212e-05, + "loss": 4.2165, + "step": 3674 + }, + { + "epoch": 0.22813334161027995, + "grad_norm": 0.4740476384407755, + "learning_rate": 7.603972687771571e-05, + "loss": 4.1748, + "step": 3675 + }, + { + "epoch": 0.22819541871003787, + "grad_norm": 0.3868104845155669, + "learning_rate": 7.606041795985931e-05, + "loss": 4.1698, + "step": 3676 + }, + { + "epoch": 0.22825749580979576, + "grad_norm": 0.3632546897120758, + "learning_rate": 7.608110904200289e-05, + "loss": 4.2686, + "step": 3677 + }, + { + "epoch": 0.22831957290955365, + "grad_norm": 0.32071578991655847, + "learning_rate": 7.61018001241465e-05, + "loss": 4.1962, + "step": 3678 + }, + { + "epoch": 0.22838165000931157, + "grad_norm": 0.3781448855570088, + "learning_rate": 7.61224912062901e-05, + "loss": 4.2664, + "step": 3679 + }, + { + "epoch": 0.22844372710906946, + "grad_norm": 0.38630619584688486, + "learning_rate": 7.614318228843369e-05, + "loss": 4.2345, + "step": 3680 + }, + { + "epoch": 0.22850580420882735, + "grad_norm": 0.3865150146183472, + "learning_rate": 7.616387337057728e-05, + "loss": 4.258, + "step": 3681 + }, + { + "epoch": 0.22856788130858527, + "grad_norm": 0.6388228118515055, + "learning_rate": 7.618456445272088e-05, + "loss": 4.1986, + "step": 3682 + }, + { + "epoch": 0.22862995840834316, + "grad_norm": 0.4027760969689621, + "learning_rate": 7.620525553486447e-05, + "loss": 4.201, + "step": 3683 + }, + { + "epoch": 0.22869203550810105, + "grad_norm": 0.4692930096358663, + "learning_rate": 7.622594661700807e-05, + "loss": 4.1452, + "step": 3684 + }, + { + "epoch": 0.22875411260785897, + "grad_norm": 0.4106703488175292, + "learning_rate": 7.624663769915166e-05, + "loss": 4.1781, + "step": 3685 + }, + { + "epoch": 0.22881618970761686, + "grad_norm": 0.5737925394448775, + "learning_rate": 7.626732878129526e-05, + "loss": 4.1255, + "step": 3686 + }, + { + "epoch": 0.22887826680737475, + "grad_norm": 0.500504150139049, + "learning_rate": 7.628801986343887e-05, + "loss": 4.1194, + "step": 3687 + }, + { + "epoch": 0.22894034390713267, + "grad_norm": 0.32995395469208955, + "learning_rate": 7.630871094558246e-05, + "loss": 4.2167, + "step": 3688 + }, + { + "epoch": 0.22900242100689056, + "grad_norm": 0.6234272719383183, + "learning_rate": 7.632940202772604e-05, + "loss": 4.2027, + "step": 3689 + }, + { + "epoch": 0.22906449810664845, + "grad_norm": 0.5680305604739007, + "learning_rate": 7.635009310986965e-05, + "loss": 4.1783, + "step": 3690 + }, + { + "epoch": 0.22912657520640636, + "grad_norm": 0.3836724878367239, + "learning_rate": 7.637078419201325e-05, + "loss": 4.1558, + "step": 3691 + }, + { + "epoch": 0.22918865230616425, + "grad_norm": 0.4067358449654583, + "learning_rate": 7.639147527415684e-05, + "loss": 4.2559, + "step": 3692 + }, + { + "epoch": 0.22925072940592214, + "grad_norm": 0.46776689114698544, + "learning_rate": 7.641216635630044e-05, + "loss": 4.1954, + "step": 3693 + }, + { + "epoch": 0.22931280650568006, + "grad_norm": 0.4155982525185258, + "learning_rate": 7.643285743844403e-05, + "loss": 4.1572, + "step": 3694 + }, + { + "epoch": 0.22937488360543795, + "grad_norm": 0.3900877882503347, + "learning_rate": 7.645354852058763e-05, + "loss": 4.0608, + "step": 3695 + }, + { + "epoch": 0.22943696070519584, + "grad_norm": 0.5828274139459684, + "learning_rate": 7.647423960273122e-05, + "loss": 4.1338, + "step": 3696 + }, + { + "epoch": 0.22949903780495376, + "grad_norm": 1.0602033139424396, + "learning_rate": 7.649493068487482e-05, + "loss": 4.1581, + "step": 3697 + }, + { + "epoch": 0.22956111490471165, + "grad_norm": 0.5058690368436616, + "learning_rate": 7.651562176701841e-05, + "loss": 4.0934, + "step": 3698 + }, + { + "epoch": 0.22962319200446954, + "grad_norm": 0.6723205730141132, + "learning_rate": 7.653631284916202e-05, + "loss": 4.1257, + "step": 3699 + }, + { + "epoch": 0.22968526910422746, + "grad_norm": 0.43883902132979596, + "learning_rate": 7.655700393130562e-05, + "loss": 4.2321, + "step": 3700 + }, + { + "epoch": 0.22974734620398535, + "grad_norm": 0.37681128249522966, + "learning_rate": 7.65776950134492e-05, + "loss": 4.265, + "step": 3701 + }, + { + "epoch": 0.22980942330374324, + "grad_norm": 0.4324797160017127, + "learning_rate": 7.65983860955928e-05, + "loss": 4.2225, + "step": 3702 + }, + { + "epoch": 0.22987150040350116, + "grad_norm": 0.4488038178477865, + "learning_rate": 7.66190771777364e-05, + "loss": 4.1362, + "step": 3703 + }, + { + "epoch": 0.22993357750325905, + "grad_norm": 0.5288514237041267, + "learning_rate": 7.663976825988e-05, + "loss": 4.1454, + "step": 3704 + }, + { + "epoch": 0.22999565460301694, + "grad_norm": 0.3818293812376711, + "learning_rate": 7.666045934202359e-05, + "loss": 4.0741, + "step": 3705 + }, + { + "epoch": 0.23005773170277485, + "grad_norm": 0.3048601761600599, + "learning_rate": 7.668115042416719e-05, + "loss": 4.2167, + "step": 3706 + }, + { + "epoch": 0.23011980880253274, + "grad_norm": 0.33130655300773687, + "learning_rate": 7.670184150631078e-05, + "loss": 4.2832, + "step": 3707 + }, + { + "epoch": 0.23018188590229063, + "grad_norm": 0.5682078680655825, + "learning_rate": 7.672253258845438e-05, + "loss": 4.2141, + "step": 3708 + }, + { + "epoch": 0.23024396300204855, + "grad_norm": 0.565197305577599, + "learning_rate": 7.674322367059797e-05, + "loss": 4.0461, + "step": 3709 + }, + { + "epoch": 0.23030604010180644, + "grad_norm": 0.45626197368116683, + "learning_rate": 7.676391475274157e-05, + "loss": 4.1677, + "step": 3710 + }, + { + "epoch": 0.23036811720156433, + "grad_norm": 0.4062378037896105, + "learning_rate": 7.678460583488518e-05, + "loss": 4.061, + "step": 3711 + }, + { + "epoch": 0.23043019430132225, + "grad_norm": 0.8294329121543047, + "learning_rate": 7.680529691702877e-05, + "loss": 4.1907, + "step": 3712 + }, + { + "epoch": 0.23049227140108014, + "grad_norm": 0.8077936314162957, + "learning_rate": 7.682598799917235e-05, + "loss": 4.1754, + "step": 3713 + }, + { + "epoch": 0.23055434850083803, + "grad_norm": 0.8288564870208277, + "learning_rate": 7.684667908131595e-05, + "loss": 4.1634, + "step": 3714 + }, + { + "epoch": 0.23061642560059595, + "grad_norm": 0.4620740585181753, + "learning_rate": 7.686737016345956e-05, + "loss": 4.0611, + "step": 3715 + }, + { + "epoch": 0.23067850270035384, + "grad_norm": 0.5827017129115781, + "learning_rate": 7.688806124560315e-05, + "loss": 4.0992, + "step": 3716 + }, + { + "epoch": 0.23074057980011173, + "grad_norm": 0.47307857845045403, + "learning_rate": 7.690875232774675e-05, + "loss": 4.2659, + "step": 3717 + }, + { + "epoch": 0.23080265689986965, + "grad_norm": 0.47911050370617153, + "learning_rate": 7.692944340989034e-05, + "loss": 4.1485, + "step": 3718 + }, + { + "epoch": 0.23086473399962754, + "grad_norm": 0.3657093325786681, + "learning_rate": 7.695013449203394e-05, + "loss": 4.1566, + "step": 3719 + }, + { + "epoch": 0.23092681109938543, + "grad_norm": 0.3480928123119713, + "learning_rate": 7.697082557417753e-05, + "loss": 4.2583, + "step": 3720 + }, + { + "epoch": 0.23098888819914334, + "grad_norm": 0.4755307896028937, + "learning_rate": 7.699151665632113e-05, + "loss": 4.1454, + "step": 3721 + }, + { + "epoch": 0.23105096529890123, + "grad_norm": 0.4323225374191262, + "learning_rate": 7.701220773846472e-05, + "loss": 4.1662, + "step": 3722 + }, + { + "epoch": 0.23111304239865912, + "grad_norm": 0.5468094650238395, + "learning_rate": 7.703289882060832e-05, + "loss": 4.1397, + "step": 3723 + }, + { + "epoch": 0.23117511949841704, + "grad_norm": 0.3701159595479197, + "learning_rate": 7.705358990275193e-05, + "loss": 4.0874, + "step": 3724 + }, + { + "epoch": 0.23123719659817493, + "grad_norm": 0.44441611436449163, + "learning_rate": 7.707428098489551e-05, + "loss": 4.0675, + "step": 3725 + }, + { + "epoch": 0.23129927369793282, + "grad_norm": 0.37033475253691334, + "learning_rate": 7.70949720670391e-05, + "loss": 4.1044, + "step": 3726 + }, + { + "epoch": 0.23136135079769074, + "grad_norm": 0.5069955563435138, + "learning_rate": 7.71156631491827e-05, + "loss": 4.1915, + "step": 3727 + }, + { + "epoch": 0.23142342789744863, + "grad_norm": 0.5818681168277142, + "learning_rate": 7.713635423132631e-05, + "loss": 4.2004, + "step": 3728 + }, + { + "epoch": 0.23148550499720652, + "grad_norm": 0.5573069816933043, + "learning_rate": 7.71570453134699e-05, + "loss": 4.0759, + "step": 3729 + }, + { + "epoch": 0.23154758209696444, + "grad_norm": 0.43605632666512373, + "learning_rate": 7.71777363956135e-05, + "loss": 4.1289, + "step": 3730 + }, + { + "epoch": 0.23160965919672233, + "grad_norm": 0.3973823488413323, + "learning_rate": 7.719842747775709e-05, + "loss": 4.1303, + "step": 3731 + }, + { + "epoch": 0.23167173629648022, + "grad_norm": 0.5395769622112391, + "learning_rate": 7.721911855990069e-05, + "loss": 4.1879, + "step": 3732 + }, + { + "epoch": 0.23173381339623814, + "grad_norm": 0.3639606153172424, + "learning_rate": 7.723980964204428e-05, + "loss": 4.1659, + "step": 3733 + }, + { + "epoch": 0.23179589049599603, + "grad_norm": 0.4306674187285074, + "learning_rate": 7.726050072418788e-05, + "loss": 4.1744, + "step": 3734 + }, + { + "epoch": 0.23185796759575392, + "grad_norm": 0.3001881238067963, + "learning_rate": 7.728119180633147e-05, + "loss": 4.1102, + "step": 3735 + }, + { + "epoch": 0.23192004469551183, + "grad_norm": 0.49805659864320806, + "learning_rate": 7.730188288847508e-05, + "loss": 4.1145, + "step": 3736 + }, + { + "epoch": 0.23198212179526972, + "grad_norm": 0.5530998450556104, + "learning_rate": 7.732257397061868e-05, + "loss": 4.1417, + "step": 3737 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 0.4122189019181, + "learning_rate": 7.734326505276226e-05, + "loss": 4.0941, + "step": 3738 + }, + { + "epoch": 0.23210627599478553, + "grad_norm": 0.3490383877191306, + "learning_rate": 7.736395613490585e-05, + "loss": 4.2061, + "step": 3739 + }, + { + "epoch": 0.23216835309454342, + "grad_norm": 0.5104726222241405, + "learning_rate": 7.738464721704946e-05, + "loss": 4.2058, + "step": 3740 + }, + { + "epoch": 0.2322304301943013, + "grad_norm": 0.4349589463844855, + "learning_rate": 7.740533829919306e-05, + "loss": 4.1436, + "step": 3741 + }, + { + "epoch": 0.23229250729405923, + "grad_norm": 0.33371701609752885, + "learning_rate": 7.742602938133665e-05, + "loss": 4.1859, + "step": 3742 + }, + { + "epoch": 0.23235458439381712, + "grad_norm": 0.6521369240730712, + "learning_rate": 7.744672046348025e-05, + "loss": 4.1352, + "step": 3743 + }, + { + "epoch": 0.232416661493575, + "grad_norm": 0.3404907338160138, + "learning_rate": 7.746741154562384e-05, + "loss": 4.1258, + "step": 3744 + }, + { + "epoch": 0.23247873859333293, + "grad_norm": 0.4096701147957608, + "learning_rate": 7.748810262776744e-05, + "loss": 4.1836, + "step": 3745 + }, + { + "epoch": 0.23254081569309082, + "grad_norm": 0.4345860510558566, + "learning_rate": 7.750879370991103e-05, + "loss": 4.1798, + "step": 3746 + }, + { + "epoch": 0.2326028927928487, + "grad_norm": 0.5281931218110015, + "learning_rate": 7.752948479205463e-05, + "loss": 4.2158, + "step": 3747 + }, + { + "epoch": 0.23266496989260663, + "grad_norm": 0.4157649794028803, + "learning_rate": 7.755017587419822e-05, + "loss": 4.1341, + "step": 3748 + }, + { + "epoch": 0.23272704699236452, + "grad_norm": 0.2880743380005558, + "learning_rate": 7.757086695634183e-05, + "loss": 4.1121, + "step": 3749 + }, + { + "epoch": 0.2327891240921224, + "grad_norm": 0.4206423943197813, + "learning_rate": 7.759155803848541e-05, + "loss": 4.1236, + "step": 3750 + }, + { + "epoch": 0.23285120119188032, + "grad_norm": 0.3671259118232145, + "learning_rate": 7.761224912062901e-05, + "loss": 4.1496, + "step": 3751 + }, + { + "epoch": 0.23291327829163821, + "grad_norm": 0.4459432852569612, + "learning_rate": 7.76329402027726e-05, + "loss": 4.1608, + "step": 3752 + }, + { + "epoch": 0.2329753553913961, + "grad_norm": 0.27078685759094107, + "learning_rate": 7.765363128491621e-05, + "loss": 4.1765, + "step": 3753 + }, + { + "epoch": 0.23303743249115402, + "grad_norm": 0.4218267307218181, + "learning_rate": 7.76743223670598e-05, + "loss": 4.1448, + "step": 3754 + }, + { + "epoch": 0.2330995095909119, + "grad_norm": 0.39323736228570616, + "learning_rate": 7.76950134492034e-05, + "loss": 4.1746, + "step": 3755 + }, + { + "epoch": 0.2331615866906698, + "grad_norm": 0.25137079943216006, + "learning_rate": 7.771570453134698e-05, + "loss": 4.1071, + "step": 3756 + }, + { + "epoch": 0.23322366379042772, + "grad_norm": 0.49172170487235156, + "learning_rate": 7.773639561349059e-05, + "loss": 4.1611, + "step": 3757 + }, + { + "epoch": 0.2332857408901856, + "grad_norm": 0.31160746908932013, + "learning_rate": 7.775708669563419e-05, + "loss": 4.0799, + "step": 3758 + }, + { + "epoch": 0.2333478179899435, + "grad_norm": 0.43399621997123194, + "learning_rate": 7.777777777777778e-05, + "loss": 4.12, + "step": 3759 + }, + { + "epoch": 0.23340989508970142, + "grad_norm": 0.3821011765964845, + "learning_rate": 7.779846885992138e-05, + "loss": 4.1826, + "step": 3760 + }, + { + "epoch": 0.2334719721894593, + "grad_norm": 0.3555354171110385, + "learning_rate": 7.781915994206499e-05, + "loss": 4.1137, + "step": 3761 + }, + { + "epoch": 0.2335340492892172, + "grad_norm": 0.5189063250304148, + "learning_rate": 7.783985102420857e-05, + "loss": 4.1976, + "step": 3762 + }, + { + "epoch": 0.23359612638897512, + "grad_norm": 0.5618116617624063, + "learning_rate": 7.786054210635216e-05, + "loss": 4.1638, + "step": 3763 + }, + { + "epoch": 0.233658203488733, + "grad_norm": 0.3168818466868892, + "learning_rate": 7.788123318849576e-05, + "loss": 4.2136, + "step": 3764 + }, + { + "epoch": 0.2337202805884909, + "grad_norm": 0.4029844978098342, + "learning_rate": 7.790192427063937e-05, + "loss": 4.1002, + "step": 3765 + }, + { + "epoch": 0.23378235768824882, + "grad_norm": 0.30622654795866217, + "learning_rate": 7.792261535278296e-05, + "loss": 4.1004, + "step": 3766 + }, + { + "epoch": 0.2338444347880067, + "grad_norm": 0.6684128918783927, + "learning_rate": 7.794330643492656e-05, + "loss": 4.1688, + "step": 3767 + }, + { + "epoch": 0.2339065118877646, + "grad_norm": 0.3294374014503825, + "learning_rate": 7.796399751707014e-05, + "loss": 4.1469, + "step": 3768 + }, + { + "epoch": 0.2339685889875225, + "grad_norm": 0.6120674854154602, + "learning_rate": 7.798468859921375e-05, + "loss": 4.1113, + "step": 3769 + }, + { + "epoch": 0.2340306660872804, + "grad_norm": 0.4982865747817409, + "learning_rate": 7.800537968135734e-05, + "loss": 4.1067, + "step": 3770 + }, + { + "epoch": 0.2340927431870383, + "grad_norm": 0.7993344145874376, + "learning_rate": 7.802607076350094e-05, + "loss": 4.2695, + "step": 3771 + }, + { + "epoch": 0.2341548202867962, + "grad_norm": 0.6718882404176051, + "learning_rate": 7.804676184564453e-05, + "loss": 4.1351, + "step": 3772 + }, + { + "epoch": 0.2342168973865541, + "grad_norm": 0.6418192594296513, + "learning_rate": 7.806745292778813e-05, + "loss": 4.1334, + "step": 3773 + }, + { + "epoch": 0.234278974486312, + "grad_norm": 0.47490066255040075, + "learning_rate": 7.808814400993172e-05, + "loss": 4.1477, + "step": 3774 + }, + { + "epoch": 0.2343410515860699, + "grad_norm": 0.8971321906856269, + "learning_rate": 7.810883509207532e-05, + "loss": 4.1099, + "step": 3775 + }, + { + "epoch": 0.2344031286858278, + "grad_norm": 0.5524804977927823, + "learning_rate": 7.812952617421891e-05, + "loss": 4.2221, + "step": 3776 + }, + { + "epoch": 0.2344652057855857, + "grad_norm": 0.7213090453283725, + "learning_rate": 7.81502172563625e-05, + "loss": 4.2468, + "step": 3777 + }, + { + "epoch": 0.2345272828853436, + "grad_norm": 0.5877853755584065, + "learning_rate": 7.817090833850612e-05, + "loss": 4.0427, + "step": 3778 + }, + { + "epoch": 0.2345893599851015, + "grad_norm": 0.6770959094909212, + "learning_rate": 7.819159942064971e-05, + "loss": 4.104, + "step": 3779 + }, + { + "epoch": 0.2346514370848594, + "grad_norm": 0.575060273204199, + "learning_rate": 7.821229050279329e-05, + "loss": 4.08, + "step": 3780 + }, + { + "epoch": 0.2347135141846173, + "grad_norm": 0.4123152449144811, + "learning_rate": 7.823298158493689e-05, + "loss": 4.0438, + "step": 3781 + }, + { + "epoch": 0.2347755912843752, + "grad_norm": 0.48031207375640916, + "learning_rate": 7.82536726670805e-05, + "loss": 4.064, + "step": 3782 + }, + { + "epoch": 0.23483766838413309, + "grad_norm": 0.5416153431077068, + "learning_rate": 7.827436374922409e-05, + "loss": 4.0573, + "step": 3783 + }, + { + "epoch": 0.234899745483891, + "grad_norm": 0.532442045774286, + "learning_rate": 7.829505483136769e-05, + "loss": 4.1466, + "step": 3784 + }, + { + "epoch": 0.2349618225836489, + "grad_norm": 0.6146149438605982, + "learning_rate": 7.831574591351128e-05, + "loss": 4.0739, + "step": 3785 + }, + { + "epoch": 0.23502389968340678, + "grad_norm": 0.4830189149462121, + "learning_rate": 7.833643699565488e-05, + "loss": 4.1359, + "step": 3786 + }, + { + "epoch": 0.2350859767831647, + "grad_norm": 0.4964278140140687, + "learning_rate": 7.835712807779847e-05, + "loss": 4.1961, + "step": 3787 + }, + { + "epoch": 0.2351480538829226, + "grad_norm": 0.44397526449924546, + "learning_rate": 7.837781915994207e-05, + "loss": 4.042, + "step": 3788 + }, + { + "epoch": 0.23521013098268048, + "grad_norm": 0.33349069839514334, + "learning_rate": 7.839851024208566e-05, + "loss": 4.1503, + "step": 3789 + }, + { + "epoch": 0.2352722080824384, + "grad_norm": 0.6334027257557124, + "learning_rate": 7.841920132422927e-05, + "loss": 4.061, + "step": 3790 + }, + { + "epoch": 0.2353342851821963, + "grad_norm": 0.3916918681369689, + "learning_rate": 7.843989240637286e-05, + "loss": 4.0904, + "step": 3791 + }, + { + "epoch": 0.23539636228195418, + "grad_norm": 0.47293278194759575, + "learning_rate": 7.846058348851645e-05, + "loss": 4.1429, + "step": 3792 + }, + { + "epoch": 0.2354584393817121, + "grad_norm": 0.7791178026703839, + "learning_rate": 7.848127457066004e-05, + "loss": 4.0916, + "step": 3793 + }, + { + "epoch": 0.23552051648147, + "grad_norm": 0.9851309583232295, + "learning_rate": 7.850196565280365e-05, + "loss": 4.1804, + "step": 3794 + }, + { + "epoch": 0.23558259358122788, + "grad_norm": 0.5584083729700986, + "learning_rate": 7.852265673494724e-05, + "loss": 4.0758, + "step": 3795 + }, + { + "epoch": 0.2356446706809858, + "grad_norm": 0.5225148681186154, + "learning_rate": 7.854334781709084e-05, + "loss": 4.0813, + "step": 3796 + }, + { + "epoch": 0.2357067477807437, + "grad_norm": 0.4709766590364776, + "learning_rate": 7.856403889923444e-05, + "loss": 4.2476, + "step": 3797 + }, + { + "epoch": 0.23576882488050158, + "grad_norm": 0.4329941702256443, + "learning_rate": 7.858472998137803e-05, + "loss": 4.1193, + "step": 3798 + }, + { + "epoch": 0.2358309019802595, + "grad_norm": 0.36126089759414, + "learning_rate": 7.860542106352163e-05, + "loss": 4.1236, + "step": 3799 + }, + { + "epoch": 0.23589297908001738, + "grad_norm": 0.4759965256115391, + "learning_rate": 7.862611214566522e-05, + "loss": 4.0665, + "step": 3800 + }, + { + "epoch": 0.23595505617977527, + "grad_norm": 0.4616346260025754, + "learning_rate": 7.864680322780882e-05, + "loss": 4.1351, + "step": 3801 + }, + { + "epoch": 0.2360171332795332, + "grad_norm": 0.48973661586607964, + "learning_rate": 7.866749430995241e-05, + "loss": 4.1785, + "step": 3802 + }, + { + "epoch": 0.23607921037929108, + "grad_norm": 0.38365302938893214, + "learning_rate": 7.868818539209602e-05, + "loss": 4.1145, + "step": 3803 + }, + { + "epoch": 0.23614128747904897, + "grad_norm": 0.45482884074981755, + "learning_rate": 7.87088764742396e-05, + "loss": 4.0034, + "step": 3804 + }, + { + "epoch": 0.2362033645788069, + "grad_norm": 0.361212036454567, + "learning_rate": 7.87295675563832e-05, + "loss": 4.1567, + "step": 3805 + }, + { + "epoch": 0.23626544167856478, + "grad_norm": 0.36876518088951216, + "learning_rate": 7.875025863852679e-05, + "loss": 4.15, + "step": 3806 + }, + { + "epoch": 0.23632751877832267, + "grad_norm": 0.34407175784427735, + "learning_rate": 7.87709497206704e-05, + "loss": 4.1415, + "step": 3807 + }, + { + "epoch": 0.2363895958780806, + "grad_norm": 0.32065951925584646, + "learning_rate": 7.8791640802814e-05, + "loss": 4.0849, + "step": 3808 + }, + { + "epoch": 0.23645167297783848, + "grad_norm": 0.3257524569112091, + "learning_rate": 7.881233188495759e-05, + "loss": 4.1324, + "step": 3809 + }, + { + "epoch": 0.23651375007759637, + "grad_norm": 0.34378719128642454, + "learning_rate": 7.883302296710117e-05, + "loss": 4.093, + "step": 3810 + }, + { + "epoch": 0.2365758271773543, + "grad_norm": 0.3718252701726489, + "learning_rate": 7.885371404924478e-05, + "loss": 4.211, + "step": 3811 + }, + { + "epoch": 0.23663790427711218, + "grad_norm": 0.47602102129162843, + "learning_rate": 7.887440513138837e-05, + "loss": 4.0882, + "step": 3812 + }, + { + "epoch": 0.23669998137687007, + "grad_norm": 0.4589473952959544, + "learning_rate": 7.889509621353197e-05, + "loss": 4.021, + "step": 3813 + }, + { + "epoch": 0.23676205847662798, + "grad_norm": 0.43542773066917967, + "learning_rate": 7.891578729567556e-05, + "loss": 3.9583, + "step": 3814 + }, + { + "epoch": 0.23682413557638587, + "grad_norm": 0.40254617432110323, + "learning_rate": 7.893647837781917e-05, + "loss": 4.1308, + "step": 3815 + }, + { + "epoch": 0.23688621267614376, + "grad_norm": 0.35925935874938864, + "learning_rate": 7.895716945996277e-05, + "loss": 4.048, + "step": 3816 + }, + { + "epoch": 0.23694828977590168, + "grad_norm": 0.3092402313966975, + "learning_rate": 7.897786054210635e-05, + "loss": 4.1012, + "step": 3817 + }, + { + "epoch": 0.23701036687565957, + "grad_norm": 0.37145067417095406, + "learning_rate": 7.899855162424995e-05, + "loss": 4.1253, + "step": 3818 + }, + { + "epoch": 0.23707244397541746, + "grad_norm": 0.44997554553660263, + "learning_rate": 7.901924270639355e-05, + "loss": 4.0882, + "step": 3819 + }, + { + "epoch": 0.23713452107517538, + "grad_norm": 0.2614428931055501, + "learning_rate": 7.903993378853715e-05, + "loss": 4.2345, + "step": 3820 + }, + { + "epoch": 0.23719659817493327, + "grad_norm": 0.3657343293429258, + "learning_rate": 7.906062487068074e-05, + "loss": 4.1115, + "step": 3821 + }, + { + "epoch": 0.23725867527469116, + "grad_norm": 0.4484584511039589, + "learning_rate": 7.908131595282434e-05, + "loss": 4.2073, + "step": 3822 + }, + { + "epoch": 0.23732075237444908, + "grad_norm": 0.3509627100143133, + "learning_rate": 7.910200703496793e-05, + "loss": 4.1116, + "step": 3823 + }, + { + "epoch": 0.23738282947420697, + "grad_norm": 0.4893625417611941, + "learning_rate": 7.912269811711153e-05, + "loss": 4.0752, + "step": 3824 + }, + { + "epoch": 0.23744490657396486, + "grad_norm": 0.5284531403431564, + "learning_rate": 7.914338919925512e-05, + "loss": 4.0279, + "step": 3825 + }, + { + "epoch": 0.23750698367372278, + "grad_norm": 0.312100846958264, + "learning_rate": 7.916408028139872e-05, + "loss": 4.0863, + "step": 3826 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 0.5631122700716221, + "learning_rate": 7.918477136354231e-05, + "loss": 4.1674, + "step": 3827 + }, + { + "epoch": 0.23763113787323856, + "grad_norm": 0.4240170773978457, + "learning_rate": 7.920546244568592e-05, + "loss": 4.0675, + "step": 3828 + }, + { + "epoch": 0.23769321497299647, + "grad_norm": 0.5410845881277708, + "learning_rate": 7.92261535278295e-05, + "loss": 3.996, + "step": 3829 + }, + { + "epoch": 0.23775529207275437, + "grad_norm": 0.4129564339298315, + "learning_rate": 7.92468446099731e-05, + "loss": 4.2102, + "step": 3830 + }, + { + "epoch": 0.23781736917251226, + "grad_norm": 0.4084619130702127, + "learning_rate": 7.92675356921167e-05, + "loss": 4.0667, + "step": 3831 + }, + { + "epoch": 0.23787944627227017, + "grad_norm": 0.32302055943606733, + "learning_rate": 7.92882267742603e-05, + "loss": 4.159, + "step": 3832 + }, + { + "epoch": 0.23794152337202806, + "grad_norm": 0.37041262307264616, + "learning_rate": 7.93089178564039e-05, + "loss": 4.1472, + "step": 3833 + }, + { + "epoch": 0.23800360047178595, + "grad_norm": 0.2966209504156105, + "learning_rate": 7.93296089385475e-05, + "loss": 4.1027, + "step": 3834 + }, + { + "epoch": 0.23806567757154387, + "grad_norm": 0.42255197179977705, + "learning_rate": 7.935030002069108e-05, + "loss": 4.1204, + "step": 3835 + }, + { + "epoch": 0.23812775467130176, + "grad_norm": 0.34611896784682955, + "learning_rate": 7.937099110283468e-05, + "loss": 4.0913, + "step": 3836 + }, + { + "epoch": 0.23818983177105965, + "grad_norm": 0.3659258737517038, + "learning_rate": 7.939168218497828e-05, + "loss": 4.1283, + "step": 3837 + }, + { + "epoch": 0.23825190887081757, + "grad_norm": 0.3345269709014534, + "learning_rate": 7.941237326712187e-05, + "loss": 4.2221, + "step": 3838 + }, + { + "epoch": 0.23831398597057546, + "grad_norm": 0.3653717598603406, + "learning_rate": 7.943306434926547e-05, + "loss": 4.1632, + "step": 3839 + }, + { + "epoch": 0.23837606307033335, + "grad_norm": 0.4074043233570898, + "learning_rate": 7.945375543140908e-05, + "loss": 4.1457, + "step": 3840 + }, + { + "epoch": 0.23843814017009124, + "grad_norm": 0.3301389965382088, + "learning_rate": 7.947444651355266e-05, + "loss": 4.0597, + "step": 3841 + }, + { + "epoch": 0.23850021726984916, + "grad_norm": 0.46989304730787346, + "learning_rate": 7.949513759569625e-05, + "loss": 4.0227, + "step": 3842 + }, + { + "epoch": 0.23856229436960705, + "grad_norm": 0.41834386588448574, + "learning_rate": 7.951582867783985e-05, + "loss": 4.0137, + "step": 3843 + }, + { + "epoch": 0.23862437146936494, + "grad_norm": 0.4131741621166723, + "learning_rate": 7.953651975998346e-05, + "loss": 4.1739, + "step": 3844 + }, + { + "epoch": 0.23868644856912286, + "grad_norm": 0.39017915077203874, + "learning_rate": 7.955721084212705e-05, + "loss": 4.0843, + "step": 3845 + }, + { + "epoch": 0.23874852566888075, + "grad_norm": 0.46573569370929924, + "learning_rate": 7.957790192427065e-05, + "loss": 3.9394, + "step": 3846 + }, + { + "epoch": 0.23881060276863864, + "grad_norm": 0.503361962474737, + "learning_rate": 7.959859300641423e-05, + "loss": 4.1115, + "step": 3847 + }, + { + "epoch": 0.23887267986839655, + "grad_norm": 0.6202052496911494, + "learning_rate": 7.961928408855784e-05, + "loss": 4.1145, + "step": 3848 + }, + { + "epoch": 0.23893475696815444, + "grad_norm": 0.5304773546287874, + "learning_rate": 7.963997517070143e-05, + "loss": 4.2227, + "step": 3849 + }, + { + "epoch": 0.23899683406791233, + "grad_norm": 0.44811609360424653, + "learning_rate": 7.966066625284503e-05, + "loss": 4.0598, + "step": 3850 + }, + { + "epoch": 0.23905891116767025, + "grad_norm": 0.6237795596486115, + "learning_rate": 7.968135733498862e-05, + "loss": 3.986, + "step": 3851 + }, + { + "epoch": 0.23912098826742814, + "grad_norm": 0.7361286409158133, + "learning_rate": 7.970204841713222e-05, + "loss": 4.1704, + "step": 3852 + }, + { + "epoch": 0.23918306536718603, + "grad_norm": 0.5578362724303514, + "learning_rate": 7.972273949927581e-05, + "loss": 4.1206, + "step": 3853 + }, + { + "epoch": 0.23924514246694395, + "grad_norm": 0.4246117515462363, + "learning_rate": 7.974343058141941e-05, + "loss": 4.1605, + "step": 3854 + }, + { + "epoch": 0.23930721956670184, + "grad_norm": 0.7494117496187351, + "learning_rate": 7.9764121663563e-05, + "loss": 4.1024, + "step": 3855 + }, + { + "epoch": 0.23936929666645973, + "grad_norm": 0.3394318419216524, + "learning_rate": 7.97848127457066e-05, + "loss": 4.1582, + "step": 3856 + }, + { + "epoch": 0.23943137376621765, + "grad_norm": 0.49156673582353366, + "learning_rate": 7.980550382785021e-05, + "loss": 4.1251, + "step": 3857 + }, + { + "epoch": 0.23949345086597554, + "grad_norm": 0.6740540719953374, + "learning_rate": 7.98261949099938e-05, + "loss": 4.0578, + "step": 3858 + }, + { + "epoch": 0.23955552796573343, + "grad_norm": 0.890263835558442, + "learning_rate": 7.984688599213738e-05, + "loss": 4.0664, + "step": 3859 + }, + { + "epoch": 0.23961760506549135, + "grad_norm": 0.8723469772702798, + "learning_rate": 7.986757707428099e-05, + "loss": 4.1016, + "step": 3860 + }, + { + "epoch": 0.23967968216524924, + "grad_norm": 0.6607589227923317, + "learning_rate": 7.988826815642459e-05, + "loss": 4.1208, + "step": 3861 + }, + { + "epoch": 0.23974175926500713, + "grad_norm": 0.6474480951338776, + "learning_rate": 7.990895923856818e-05, + "loss": 4.1316, + "step": 3862 + }, + { + "epoch": 0.23980383636476504, + "grad_norm": 0.45429538979127654, + "learning_rate": 7.992965032071178e-05, + "loss": 4.1319, + "step": 3863 + }, + { + "epoch": 0.23986591346452293, + "grad_norm": 0.8565250651026203, + "learning_rate": 7.995034140285537e-05, + "loss": 4.1641, + "step": 3864 + }, + { + "epoch": 0.23992799056428082, + "grad_norm": 0.7543297008323554, + "learning_rate": 7.997103248499897e-05, + "loss": 4.0538, + "step": 3865 + }, + { + "epoch": 0.23999006766403874, + "grad_norm": 0.5747420074268884, + "learning_rate": 7.999172356714256e-05, + "loss": 4.2491, + "step": 3866 + }, + { + "epoch": 0.24005214476379663, + "grad_norm": 0.4725516022150996, + "learning_rate": 8.001241464928616e-05, + "loss": 4.159, + "step": 3867 + }, + { + "epoch": 0.24011422186355452, + "grad_norm": 0.711795624350968, + "learning_rate": 8.003310573142975e-05, + "loss": 4.1504, + "step": 3868 + }, + { + "epoch": 0.24017629896331244, + "grad_norm": 0.4788335250223858, + "learning_rate": 8.005379681357336e-05, + "loss": 4.0292, + "step": 3869 + }, + { + "epoch": 0.24023837606307033, + "grad_norm": 0.4465592816278915, + "learning_rate": 8.007448789571696e-05, + "loss": 4.0392, + "step": 3870 + }, + { + "epoch": 0.24030045316282822, + "grad_norm": 0.4134439851074916, + "learning_rate": 8.009517897786054e-05, + "loss": 4.0131, + "step": 3871 + }, + { + "epoch": 0.24036253026258614, + "grad_norm": 0.37200626813314547, + "learning_rate": 8.011587006000413e-05, + "loss": 4.0859, + "step": 3872 + }, + { + "epoch": 0.24042460736234403, + "grad_norm": 0.6107543836741578, + "learning_rate": 8.013656114214774e-05, + "loss": 4.0292, + "step": 3873 + }, + { + "epoch": 0.24048668446210192, + "grad_norm": 0.6959163957880872, + "learning_rate": 8.015725222429134e-05, + "loss": 4.1756, + "step": 3874 + }, + { + "epoch": 0.24054876156185984, + "grad_norm": 0.5172516278849264, + "learning_rate": 8.017794330643493e-05, + "loss": 4.1426, + "step": 3875 + }, + { + "epoch": 0.24061083866161773, + "grad_norm": 0.6186046988454624, + "learning_rate": 8.019863438857853e-05, + "loss": 4.1573, + "step": 3876 + }, + { + "epoch": 0.24067291576137562, + "grad_norm": 0.9865656198543485, + "learning_rate": 8.021932547072212e-05, + "loss": 4.2152, + "step": 3877 + }, + { + "epoch": 0.24073499286113353, + "grad_norm": 0.7834093007999098, + "learning_rate": 8.024001655286572e-05, + "loss": 4.0898, + "step": 3878 + }, + { + "epoch": 0.24079706996089142, + "grad_norm": 0.527572469207893, + "learning_rate": 8.026070763500931e-05, + "loss": 4.0005, + "step": 3879 + }, + { + "epoch": 0.24085914706064931, + "grad_norm": 0.5361131063428568, + "learning_rate": 8.028139871715291e-05, + "loss": 4.2003, + "step": 3880 + }, + { + "epoch": 0.24092122416040723, + "grad_norm": 0.33339579066615344, + "learning_rate": 8.03020897992965e-05, + "loss": 4.017, + "step": 3881 + }, + { + "epoch": 0.24098330126016512, + "grad_norm": 0.4692800862468783, + "learning_rate": 8.032278088144011e-05, + "loss": 4.1424, + "step": 3882 + }, + { + "epoch": 0.241045378359923, + "grad_norm": 0.4145186988089371, + "learning_rate": 8.034347196358369e-05, + "loss": 4.0542, + "step": 3883 + }, + { + "epoch": 0.24110745545968093, + "grad_norm": 0.4707220873040438, + "learning_rate": 8.036416304572729e-05, + "loss": 4.1686, + "step": 3884 + }, + { + "epoch": 0.24116953255943882, + "grad_norm": 0.47424940107606384, + "learning_rate": 8.03848541278709e-05, + "loss": 4.1104, + "step": 3885 + }, + { + "epoch": 0.2412316096591967, + "grad_norm": 0.5056448522096174, + "learning_rate": 8.040554521001449e-05, + "loss": 4.1366, + "step": 3886 + }, + { + "epoch": 0.24129368675895463, + "grad_norm": 0.33890259527519534, + "learning_rate": 8.042623629215809e-05, + "loss": 4.0514, + "step": 3887 + }, + { + "epoch": 0.24135576385871252, + "grad_norm": 0.8965777845067094, + "learning_rate": 8.044692737430168e-05, + "loss": 4.2996, + "step": 3888 + }, + { + "epoch": 0.2414178409584704, + "grad_norm": 0.5493526734989531, + "learning_rate": 8.046761845644528e-05, + "loss": 3.9993, + "step": 3889 + }, + { + "epoch": 0.24147991805822833, + "grad_norm": 0.36137167178468355, + "learning_rate": 8.048830953858887e-05, + "loss": 4.0945, + "step": 3890 + }, + { + "epoch": 0.24154199515798622, + "grad_norm": 0.37271005094177556, + "learning_rate": 8.050900062073247e-05, + "loss": 4.0166, + "step": 3891 + }, + { + "epoch": 0.2416040722577441, + "grad_norm": 1.2560784844368789, + "learning_rate": 8.052969170287606e-05, + "loss": 4.0947, + "step": 3892 + }, + { + "epoch": 0.24166614935750202, + "grad_norm": 0.9047235919822427, + "learning_rate": 8.055038278501966e-05, + "loss": 4.1059, + "step": 3893 + }, + { + "epoch": 0.24172822645725991, + "grad_norm": 0.5422219554056014, + "learning_rate": 8.057107386716327e-05, + "loss": 4.1728, + "step": 3894 + }, + { + "epoch": 0.2417903035570178, + "grad_norm": 0.48712708441778035, + "learning_rate": 8.059176494930685e-05, + "loss": 4.1187, + "step": 3895 + }, + { + "epoch": 0.24185238065677572, + "grad_norm": 0.47930942012221656, + "learning_rate": 8.061245603145044e-05, + "loss": 4.1413, + "step": 3896 + }, + { + "epoch": 0.2419144577565336, + "grad_norm": 0.4472342900789641, + "learning_rate": 8.063314711359404e-05, + "loss": 4.0841, + "step": 3897 + }, + { + "epoch": 0.2419765348562915, + "grad_norm": 0.30006263396965904, + "learning_rate": 8.065383819573765e-05, + "loss": 4.0555, + "step": 3898 + }, + { + "epoch": 0.24203861195604942, + "grad_norm": 0.5596238887895975, + "learning_rate": 8.067452927788124e-05, + "loss": 4.0342, + "step": 3899 + }, + { + "epoch": 0.2421006890558073, + "grad_norm": 0.3032690802033426, + "learning_rate": 8.069522036002484e-05, + "loss": 4.1759, + "step": 3900 + }, + { + "epoch": 0.2421627661555652, + "grad_norm": 0.48053198297502636, + "learning_rate": 8.071591144216843e-05, + "loss": 4.0728, + "step": 3901 + }, + { + "epoch": 0.24222484325532312, + "grad_norm": 0.32592213372265477, + "learning_rate": 8.073660252431203e-05, + "loss": 4.1768, + "step": 3902 + }, + { + "epoch": 0.242286920355081, + "grad_norm": 0.47214982196134314, + "learning_rate": 8.075729360645562e-05, + "loss": 4.1101, + "step": 3903 + }, + { + "epoch": 0.2423489974548389, + "grad_norm": 0.27803938222909247, + "learning_rate": 8.077798468859922e-05, + "loss": 4.0506, + "step": 3904 + }, + { + "epoch": 0.24241107455459682, + "grad_norm": 0.2920218105025397, + "learning_rate": 8.079867577074281e-05, + "loss": 4.1293, + "step": 3905 + }, + { + "epoch": 0.2424731516543547, + "grad_norm": 0.27942019854356626, + "learning_rate": 8.081936685288641e-05, + "loss": 4.1127, + "step": 3906 + }, + { + "epoch": 0.2425352287541126, + "grad_norm": 0.3441317454306397, + "learning_rate": 8.084005793503002e-05, + "loss": 4.0833, + "step": 3907 + }, + { + "epoch": 0.24259730585387052, + "grad_norm": 0.33403194378739665, + "learning_rate": 8.08607490171736e-05, + "loss": 4.0093, + "step": 3908 + }, + { + "epoch": 0.2426593829536284, + "grad_norm": 0.3160199880593491, + "learning_rate": 8.088144009931719e-05, + "loss": 4.0847, + "step": 3909 + }, + { + "epoch": 0.2427214600533863, + "grad_norm": 0.31012213348776746, + "learning_rate": 8.09021311814608e-05, + "loss": 4.0945, + "step": 3910 + }, + { + "epoch": 0.2427835371531442, + "grad_norm": 0.3832085565304574, + "learning_rate": 8.09228222636044e-05, + "loss": 4.1044, + "step": 3911 + }, + { + "epoch": 0.2428456142529021, + "grad_norm": 0.4183171504748351, + "learning_rate": 8.094351334574799e-05, + "loss": 4.0755, + "step": 3912 + }, + { + "epoch": 0.24290769135266, + "grad_norm": 0.37291779207169906, + "learning_rate": 8.096420442789159e-05, + "loss": 3.9886, + "step": 3913 + }, + { + "epoch": 0.2429697684524179, + "grad_norm": 0.3355000442136744, + "learning_rate": 8.098489551003518e-05, + "loss": 4.0939, + "step": 3914 + }, + { + "epoch": 0.2430318455521758, + "grad_norm": 0.4339571452418323, + "learning_rate": 8.100558659217878e-05, + "loss": 4.0707, + "step": 3915 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 0.49616283025893243, + "learning_rate": 8.102627767432237e-05, + "loss": 4.045, + "step": 3916 + }, + { + "epoch": 0.2431559997516916, + "grad_norm": 0.381607318935893, + "learning_rate": 8.104696875646597e-05, + "loss": 4.0973, + "step": 3917 + }, + { + "epoch": 0.2432180768514495, + "grad_norm": 0.43643798674482953, + "learning_rate": 8.106765983860956e-05, + "loss": 4.0893, + "step": 3918 + }, + { + "epoch": 0.2432801539512074, + "grad_norm": 0.33753155498987075, + "learning_rate": 8.108835092075317e-05, + "loss": 4.031, + "step": 3919 + }, + { + "epoch": 0.2433422310509653, + "grad_norm": 0.39940425049311373, + "learning_rate": 8.110904200289675e-05, + "loss": 4.0776, + "step": 3920 + }, + { + "epoch": 0.2434043081507232, + "grad_norm": 0.48325512235445806, + "learning_rate": 8.112973308504035e-05, + "loss": 4.1044, + "step": 3921 + }, + { + "epoch": 0.2434663852504811, + "grad_norm": 0.4080298936857411, + "learning_rate": 8.115042416718394e-05, + "loss": 4.0925, + "step": 3922 + }, + { + "epoch": 0.243528462350239, + "grad_norm": 0.30488576307219184, + "learning_rate": 8.117111524932755e-05, + "loss": 4.0948, + "step": 3923 + }, + { + "epoch": 0.2435905394499969, + "grad_norm": 0.4148989272554955, + "learning_rate": 8.119180633147115e-05, + "loss": 4.0074, + "step": 3924 + }, + { + "epoch": 0.24365261654975479, + "grad_norm": 0.3290178888734616, + "learning_rate": 8.121249741361474e-05, + "loss": 4.0769, + "step": 3925 + }, + { + "epoch": 0.2437146936495127, + "grad_norm": 0.616664616693062, + "learning_rate": 8.123318849575832e-05, + "loss": 4.0901, + "step": 3926 + }, + { + "epoch": 0.2437767707492706, + "grad_norm": 0.34980043060421817, + "learning_rate": 8.125387957790193e-05, + "loss": 3.9779, + "step": 3927 + }, + { + "epoch": 0.24383884784902848, + "grad_norm": 0.4770648840983723, + "learning_rate": 8.127457066004553e-05, + "loss": 3.9281, + "step": 3928 + }, + { + "epoch": 0.2439009249487864, + "grad_norm": 0.41826079317498605, + "learning_rate": 8.129526174218912e-05, + "loss": 3.9528, + "step": 3929 + }, + { + "epoch": 0.2439630020485443, + "grad_norm": 0.3104501729628354, + "learning_rate": 8.131595282433272e-05, + "loss": 4.0674, + "step": 3930 + }, + { + "epoch": 0.24402507914830218, + "grad_norm": 0.3859881811803242, + "learning_rate": 8.133664390647631e-05, + "loss": 4.043, + "step": 3931 + }, + { + "epoch": 0.2440871562480601, + "grad_norm": 0.4172819502136211, + "learning_rate": 8.13573349886199e-05, + "loss": 4.0988, + "step": 3932 + }, + { + "epoch": 0.244149233347818, + "grad_norm": 0.5123640752438735, + "learning_rate": 8.13780260707635e-05, + "loss": 4.0269, + "step": 3933 + }, + { + "epoch": 0.24421131044757588, + "grad_norm": 0.47482351487812763, + "learning_rate": 8.13987171529071e-05, + "loss": 4.0693, + "step": 3934 + }, + { + "epoch": 0.2442733875473338, + "grad_norm": 0.39865422431973585, + "learning_rate": 8.14194082350507e-05, + "loss": 4.0998, + "step": 3935 + }, + { + "epoch": 0.2443354646470917, + "grad_norm": 0.47621766105751145, + "learning_rate": 8.14400993171943e-05, + "loss": 3.9769, + "step": 3936 + }, + { + "epoch": 0.24439754174684958, + "grad_norm": 0.5065541448733639, + "learning_rate": 8.14607903993379e-05, + "loss": 3.9903, + "step": 3937 + }, + { + "epoch": 0.2444596188466075, + "grad_norm": 0.4411193793185457, + "learning_rate": 8.148148148148148e-05, + "loss": 4.0896, + "step": 3938 + }, + { + "epoch": 0.24452169594636539, + "grad_norm": 0.45293993294705825, + "learning_rate": 8.150217256362508e-05, + "loss": 4.1384, + "step": 3939 + }, + { + "epoch": 0.24458377304612328, + "grad_norm": 1.1090805307633265, + "learning_rate": 8.152286364576868e-05, + "loss": 4.1317, + "step": 3940 + }, + { + "epoch": 0.2446458501458812, + "grad_norm": 0.731373150554712, + "learning_rate": 8.154355472791228e-05, + "loss": 4.0458, + "step": 3941 + }, + { + "epoch": 0.24470792724563908, + "grad_norm": 0.8111590544003343, + "learning_rate": 8.156424581005587e-05, + "loss": 3.9963, + "step": 3942 + }, + { + "epoch": 0.24477000434539697, + "grad_norm": 0.6437188779844787, + "learning_rate": 8.158493689219947e-05, + "loss": 4.1537, + "step": 3943 + }, + { + "epoch": 0.2448320814451549, + "grad_norm": 0.5481313986932177, + "learning_rate": 8.160562797434306e-05, + "loss": 4.1167, + "step": 3944 + }, + { + "epoch": 0.24489415854491278, + "grad_norm": 0.4944332296847428, + "learning_rate": 8.162631905648666e-05, + "loss": 4.1691, + "step": 3945 + }, + { + "epoch": 0.24495623564467067, + "grad_norm": 0.434101529107238, + "learning_rate": 8.164701013863025e-05, + "loss": 4.0441, + "step": 3946 + }, + { + "epoch": 0.2450183127444286, + "grad_norm": 0.4367490010711391, + "learning_rate": 8.166770122077385e-05, + "loss": 4.0285, + "step": 3947 + }, + { + "epoch": 0.24508038984418648, + "grad_norm": 0.3772720805911552, + "learning_rate": 8.168839230291745e-05, + "loss": 4.0272, + "step": 3948 + }, + { + "epoch": 0.24514246694394437, + "grad_norm": 0.3841363437329449, + "learning_rate": 8.170908338506105e-05, + "loss": 4.0667, + "step": 3949 + }, + { + "epoch": 0.2452045440437023, + "grad_norm": 0.28894619277215605, + "learning_rate": 8.172977446720463e-05, + "loss": 4.102, + "step": 3950 + }, + { + "epoch": 0.24526662114346018, + "grad_norm": 0.4858968116677239, + "learning_rate": 8.175046554934823e-05, + "loss": 4.0667, + "step": 3951 + }, + { + "epoch": 0.24532869824321807, + "grad_norm": 0.3114445984192397, + "learning_rate": 8.177115663149183e-05, + "loss": 4.0824, + "step": 3952 + }, + { + "epoch": 0.245390775342976, + "grad_norm": 0.34946545243773347, + "learning_rate": 8.179184771363543e-05, + "loss": 4.0032, + "step": 3953 + }, + { + "epoch": 0.24545285244273388, + "grad_norm": 0.4978595007184608, + "learning_rate": 8.181253879577902e-05, + "loss": 4.0951, + "step": 3954 + }, + { + "epoch": 0.24551492954249177, + "grad_norm": 0.482655333582637, + "learning_rate": 8.183322987792262e-05, + "loss": 3.9485, + "step": 3955 + }, + { + "epoch": 0.24557700664224968, + "grad_norm": 0.30929254531238776, + "learning_rate": 8.185392096006621e-05, + "loss": 4.1341, + "step": 3956 + }, + { + "epoch": 0.24563908374200757, + "grad_norm": 0.3897074986873093, + "learning_rate": 8.187461204220981e-05, + "loss": 4.0859, + "step": 3957 + }, + { + "epoch": 0.24570116084176546, + "grad_norm": 0.40586260518026823, + "learning_rate": 8.18953031243534e-05, + "loss": 4.0974, + "step": 3958 + }, + { + "epoch": 0.24576323794152338, + "grad_norm": 0.44088631787764737, + "learning_rate": 8.1915994206497e-05, + "loss": 4.0938, + "step": 3959 + }, + { + "epoch": 0.24582531504128127, + "grad_norm": 0.4272573415093108, + "learning_rate": 8.193668528864061e-05, + "loss": 4.0237, + "step": 3960 + }, + { + "epoch": 0.24588739214103916, + "grad_norm": 0.412275940558374, + "learning_rate": 8.19573763707842e-05, + "loss": 3.9535, + "step": 3961 + }, + { + "epoch": 0.24594946924079708, + "grad_norm": 0.502763116899705, + "learning_rate": 8.197806745292779e-05, + "loss": 3.9208, + "step": 3962 + }, + { + "epoch": 0.24601154634055497, + "grad_norm": 0.44174191700567983, + "learning_rate": 8.199875853507138e-05, + "loss": 4.1508, + "step": 3963 + }, + { + "epoch": 0.24607362344031286, + "grad_norm": 0.5192584131630951, + "learning_rate": 8.201944961721499e-05, + "loss": 3.9365, + "step": 3964 + }, + { + "epoch": 0.24613570054007078, + "grad_norm": 0.3765561874936896, + "learning_rate": 8.204014069935858e-05, + "loss": 4.0737, + "step": 3965 + }, + { + "epoch": 0.24619777763982867, + "grad_norm": 0.7317550268310085, + "learning_rate": 8.206083178150218e-05, + "loss": 4.0281, + "step": 3966 + }, + { + "epoch": 0.24625985473958656, + "grad_norm": 0.9127472111521703, + "learning_rate": 8.208152286364577e-05, + "loss": 4.0296, + "step": 3967 + }, + { + "epoch": 0.24632193183934448, + "grad_norm": 0.49096869378693253, + "learning_rate": 8.210221394578937e-05, + "loss": 4.0656, + "step": 3968 + }, + { + "epoch": 0.24638400893910237, + "grad_norm": 1.243445425264767, + "learning_rate": 8.212290502793296e-05, + "loss": 4.1185, + "step": 3969 + }, + { + "epoch": 0.24644608603886026, + "grad_norm": 0.6023543459582471, + "learning_rate": 8.214359611007656e-05, + "loss": 4.1133, + "step": 3970 + }, + { + "epoch": 0.24650816313861817, + "grad_norm": 0.5530320838430037, + "learning_rate": 8.216428719222015e-05, + "loss": 4.0839, + "step": 3971 + }, + { + "epoch": 0.24657024023837606, + "grad_norm": 0.5233028295560385, + "learning_rate": 8.218497827436375e-05, + "loss": 4.0508, + "step": 3972 + }, + { + "epoch": 0.24663231733813396, + "grad_norm": 0.4675321016152816, + "learning_rate": 8.220566935650736e-05, + "loss": 4.0521, + "step": 3973 + }, + { + "epoch": 0.24669439443789187, + "grad_norm": 0.45536300845068184, + "learning_rate": 8.222636043865094e-05, + "loss": 3.8597, + "step": 3974 + }, + { + "epoch": 0.24675647153764976, + "grad_norm": 0.48797306253101463, + "learning_rate": 8.224705152079453e-05, + "loss": 4.1249, + "step": 3975 + }, + { + "epoch": 0.24681854863740765, + "grad_norm": 0.6466765103828087, + "learning_rate": 8.226774260293813e-05, + "loss": 4.1713, + "step": 3976 + }, + { + "epoch": 0.24688062573716557, + "grad_norm": 0.3981278843378477, + "learning_rate": 8.228843368508174e-05, + "loss": 4.0081, + "step": 3977 + }, + { + "epoch": 0.24694270283692346, + "grad_norm": 0.6607054282666209, + "learning_rate": 8.230912476722533e-05, + "loss": 4.0561, + "step": 3978 + }, + { + "epoch": 0.24700477993668135, + "grad_norm": 0.40068755885855506, + "learning_rate": 8.232981584936893e-05, + "loss": 3.986, + "step": 3979 + }, + { + "epoch": 0.24706685703643927, + "grad_norm": 0.6505656880251044, + "learning_rate": 8.235050693151251e-05, + "loss": 4.0141, + "step": 3980 + }, + { + "epoch": 0.24712893413619716, + "grad_norm": 0.5557183907872416, + "learning_rate": 8.237119801365612e-05, + "loss": 4.0233, + "step": 3981 + }, + { + "epoch": 0.24719101123595505, + "grad_norm": 0.5039414977528132, + "learning_rate": 8.239188909579971e-05, + "loss": 4.0424, + "step": 3982 + }, + { + "epoch": 0.24725308833571297, + "grad_norm": 0.5759281742212314, + "learning_rate": 8.241258017794331e-05, + "loss": 4.1395, + "step": 3983 + }, + { + "epoch": 0.24731516543547086, + "grad_norm": 0.35870427275527744, + "learning_rate": 8.24332712600869e-05, + "loss": 3.9589, + "step": 3984 + }, + { + "epoch": 0.24737724253522875, + "grad_norm": 0.40769097720892444, + "learning_rate": 8.245396234223051e-05, + "loss": 4.1316, + "step": 3985 + }, + { + "epoch": 0.24743931963498667, + "grad_norm": 0.3256680486306322, + "learning_rate": 8.247465342437411e-05, + "loss": 4.0919, + "step": 3986 + }, + { + "epoch": 0.24750139673474456, + "grad_norm": 0.3304482140258904, + "learning_rate": 8.249534450651769e-05, + "loss": 4.011, + "step": 3987 + }, + { + "epoch": 0.24756347383450245, + "grad_norm": 0.3684835287434682, + "learning_rate": 8.251603558866128e-05, + "loss": 4.0252, + "step": 3988 + }, + { + "epoch": 0.24762555093426036, + "grad_norm": 0.5777549074559618, + "learning_rate": 8.253672667080489e-05, + "loss": 4.0666, + "step": 3989 + }, + { + "epoch": 0.24768762803401825, + "grad_norm": 0.4796031194273649, + "learning_rate": 8.255741775294849e-05, + "loss": 4.0583, + "step": 3990 + }, + { + "epoch": 0.24774970513377614, + "grad_norm": 0.6987366959289686, + "learning_rate": 8.257810883509208e-05, + "loss": 4.0088, + "step": 3991 + }, + { + "epoch": 0.24781178223353406, + "grad_norm": 0.5546013133828933, + "learning_rate": 8.259879991723568e-05, + "loss": 4.0058, + "step": 3992 + }, + { + "epoch": 0.24787385933329195, + "grad_norm": 0.5725275943802429, + "learning_rate": 8.261949099937927e-05, + "loss": 4.1446, + "step": 3993 + }, + { + "epoch": 0.24793593643304984, + "grad_norm": 0.48633520282053117, + "learning_rate": 8.264018208152287e-05, + "loss": 4.1, + "step": 3994 + }, + { + "epoch": 0.24799801353280776, + "grad_norm": 0.4395020860312195, + "learning_rate": 8.266087316366646e-05, + "loss": 4.0674, + "step": 3995 + }, + { + "epoch": 0.24806009063256565, + "grad_norm": 0.5978263911992833, + "learning_rate": 8.268156424581006e-05, + "loss": 4.1871, + "step": 3996 + }, + { + "epoch": 0.24812216773232354, + "grad_norm": 0.48106880244720535, + "learning_rate": 8.270225532795365e-05, + "loss": 3.9566, + "step": 3997 + }, + { + "epoch": 0.24818424483208146, + "grad_norm": 0.6742003366513064, + "learning_rate": 8.272294641009726e-05, + "loss": 3.9695, + "step": 3998 + }, + { + "epoch": 0.24824632193183935, + "grad_norm": 0.36697775348987133, + "learning_rate": 8.274363749224084e-05, + "loss": 4.0345, + "step": 3999 + }, + { + "epoch": 0.24830839903159724, + "grad_norm": 0.37738877653718683, + "learning_rate": 8.276432857438444e-05, + "loss": 4.13, + "step": 4000 + }, + { + "epoch": 0.24837047613135516, + "grad_norm": 0.3338863608220858, + "learning_rate": 8.278501965652803e-05, + "loss": 4.0205, + "step": 4001 + }, + { + "epoch": 0.24843255323111305, + "grad_norm": 0.4470198509898988, + "learning_rate": 8.280571073867164e-05, + "loss": 3.9734, + "step": 4002 + }, + { + "epoch": 0.24849463033087094, + "grad_norm": 0.41847088322363873, + "learning_rate": 8.282640182081524e-05, + "loss": 4.0225, + "step": 4003 + }, + { + "epoch": 0.24855670743062885, + "grad_norm": 0.31011270366309746, + "learning_rate": 8.284709290295883e-05, + "loss": 4.0116, + "step": 4004 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 0.3758337170619373, + "learning_rate": 8.286778398510241e-05, + "loss": 4.0024, + "step": 4005 + }, + { + "epoch": 0.24868086163014463, + "grad_norm": 0.45405066420270557, + "learning_rate": 8.288847506724602e-05, + "loss": 4.13, + "step": 4006 + }, + { + "epoch": 0.24874293872990255, + "grad_norm": 0.3995411491137412, + "learning_rate": 8.290916614938962e-05, + "loss": 4.0437, + "step": 4007 + }, + { + "epoch": 0.24880501582966044, + "grad_norm": 0.34975139567674096, + "learning_rate": 8.292985723153321e-05, + "loss": 4.0685, + "step": 4008 + }, + { + "epoch": 0.24886709292941833, + "grad_norm": 0.28103341929411757, + "learning_rate": 8.295054831367681e-05, + "loss": 3.942, + "step": 4009 + }, + { + "epoch": 0.24892917002917625, + "grad_norm": 0.5308429617235629, + "learning_rate": 8.297123939582042e-05, + "loss": 4.0721, + "step": 4010 + }, + { + "epoch": 0.24899124712893414, + "grad_norm": 0.3292279665742697, + "learning_rate": 8.2991930477964e-05, + "loss": 4.0658, + "step": 4011 + }, + { + "epoch": 0.24905332422869203, + "grad_norm": 0.5060661511906073, + "learning_rate": 8.30126215601076e-05, + "loss": 4.1123, + "step": 4012 + }, + { + "epoch": 0.24911540132844995, + "grad_norm": 0.3021336409454079, + "learning_rate": 8.303331264225119e-05, + "loss": 4.1183, + "step": 4013 + }, + { + "epoch": 0.24917747842820784, + "grad_norm": 0.4256745650998246, + "learning_rate": 8.30540037243948e-05, + "loss": 4.1091, + "step": 4014 + }, + { + "epoch": 0.24923955552796573, + "grad_norm": 0.43323950859812044, + "learning_rate": 8.307469480653839e-05, + "loss": 4.1758, + "step": 4015 + }, + { + "epoch": 0.24930163262772365, + "grad_norm": 0.41839353420164876, + "learning_rate": 8.309538588868199e-05, + "loss": 4.1905, + "step": 4016 + }, + { + "epoch": 0.24936370972748154, + "grad_norm": 0.37276747860983106, + "learning_rate": 8.311607697082557e-05, + "loss": 4.0958, + "step": 4017 + }, + { + "epoch": 0.24942578682723943, + "grad_norm": 0.37021617521883277, + "learning_rate": 8.313676805296918e-05, + "loss": 4.1452, + "step": 4018 + }, + { + "epoch": 0.24948786392699734, + "grad_norm": 0.4366942035416582, + "learning_rate": 8.315745913511277e-05, + "loss": 4.0671, + "step": 4019 + }, + { + "epoch": 0.24954994102675523, + "grad_norm": 0.5344684106486346, + "learning_rate": 8.317815021725637e-05, + "loss": 3.9776, + "step": 4020 + }, + { + "epoch": 0.24961201812651312, + "grad_norm": 0.39149329003860134, + "learning_rate": 8.319884129939996e-05, + "loss": 3.9972, + "step": 4021 + }, + { + "epoch": 0.24967409522627104, + "grad_norm": 0.6187235152230953, + "learning_rate": 8.321953238154356e-05, + "loss": 4.0704, + "step": 4022 + }, + { + "epoch": 0.24973617232602893, + "grad_norm": 0.3370593656043693, + "learning_rate": 8.324022346368715e-05, + "loss": 4.0722, + "step": 4023 + }, + { + "epoch": 0.24979824942578682, + "grad_norm": 0.5353365963190728, + "learning_rate": 8.326091454583075e-05, + "loss": 4.01, + "step": 4024 + }, + { + "epoch": 0.24986032652554474, + "grad_norm": 0.5765034901264348, + "learning_rate": 8.328160562797434e-05, + "loss": 4.0199, + "step": 4025 + }, + { + "epoch": 0.24992240362530263, + "grad_norm": 0.835688794653214, + "learning_rate": 8.330229671011794e-05, + "loss": 4.1068, + "step": 4026 + }, + { + "epoch": 0.24998448072506052, + "grad_norm": 0.6491369817954968, + "learning_rate": 8.332298779226155e-05, + "loss": 4.0788, + "step": 4027 + }, + { + "epoch": 0.25004655782481844, + "grad_norm": 0.616853527054803, + "learning_rate": 8.334367887440514e-05, + "loss": 4.0147, + "step": 4028 + }, + { + "epoch": 0.2501086349245763, + "grad_norm": 0.3832272987634722, + "learning_rate": 8.336436995654872e-05, + "loss": 3.9452, + "step": 4029 + }, + { + "epoch": 0.2501707120243342, + "grad_norm": 1.0392708820395133, + "learning_rate": 8.338506103869232e-05, + "loss": 4.0554, + "step": 4030 + }, + { + "epoch": 0.25023278912409214, + "grad_norm": 0.6825295366107049, + "learning_rate": 8.340575212083593e-05, + "loss": 4.0002, + "step": 4031 + }, + { + "epoch": 0.25029486622385, + "grad_norm": 0.5971084527639064, + "learning_rate": 8.342644320297952e-05, + "loss": 3.9991, + "step": 4032 + }, + { + "epoch": 0.2503569433236079, + "grad_norm": 1.2033170571551741, + "learning_rate": 8.344713428512312e-05, + "loss": 4.0237, + "step": 4033 + }, + { + "epoch": 0.25041902042336583, + "grad_norm": 0.4828419728835551, + "learning_rate": 8.346782536726671e-05, + "loss": 4.1171, + "step": 4034 + }, + { + "epoch": 0.2504810975231237, + "grad_norm": 0.8524991405180684, + "learning_rate": 8.348851644941031e-05, + "loss": 4.0718, + "step": 4035 + }, + { + "epoch": 0.2505431746228816, + "grad_norm": 1.4922614753321544, + "learning_rate": 8.35092075315539e-05, + "loss": 4.1221, + "step": 4036 + }, + { + "epoch": 0.25060525172263953, + "grad_norm": 0.5489063410836266, + "learning_rate": 8.35298986136975e-05, + "loss": 4.0257, + "step": 4037 + }, + { + "epoch": 0.2506673288223974, + "grad_norm": 0.7943228449233743, + "learning_rate": 8.355058969584109e-05, + "loss": 3.9908, + "step": 4038 + }, + { + "epoch": 0.2507294059221553, + "grad_norm": 1.1911816995100113, + "learning_rate": 8.35712807779847e-05, + "loss": 4.1288, + "step": 4039 + }, + { + "epoch": 0.25079148302191323, + "grad_norm": 0.7582815625960222, + "learning_rate": 8.35919718601283e-05, + "loss": 4.1215, + "step": 4040 + }, + { + "epoch": 0.2508535601216711, + "grad_norm": 0.7344639095297351, + "learning_rate": 8.361266294227188e-05, + "loss": 4.0752, + "step": 4041 + }, + { + "epoch": 0.250915637221429, + "grad_norm": 0.6650512963064598, + "learning_rate": 8.363335402441547e-05, + "loss": 4.0734, + "step": 4042 + }, + { + "epoch": 0.25097771432118693, + "grad_norm": 0.5394204651201697, + "learning_rate": 8.365404510655908e-05, + "loss": 4.0485, + "step": 4043 + }, + { + "epoch": 0.2510397914209448, + "grad_norm": 0.5841441876725105, + "learning_rate": 8.367473618870268e-05, + "loss": 4.1818, + "step": 4044 + }, + { + "epoch": 0.2511018685207027, + "grad_norm": 0.5375316555105272, + "learning_rate": 8.369542727084627e-05, + "loss": 4.0684, + "step": 4045 + }, + { + "epoch": 0.2511639456204606, + "grad_norm": 0.5689646588279662, + "learning_rate": 8.371611835298987e-05, + "loss": 3.9366, + "step": 4046 + }, + { + "epoch": 0.2512260227202185, + "grad_norm": 0.37936521046182503, + "learning_rate": 8.373680943513346e-05, + "loss": 4.0206, + "step": 4047 + }, + { + "epoch": 0.2512880998199764, + "grad_norm": 0.47015126908701116, + "learning_rate": 8.375750051727706e-05, + "loss": 4.0774, + "step": 4048 + }, + { + "epoch": 0.2513501769197343, + "grad_norm": 0.33969833288317075, + "learning_rate": 8.377819159942065e-05, + "loss": 4.1259, + "step": 4049 + }, + { + "epoch": 0.2514122540194922, + "grad_norm": 0.4653047791833854, + "learning_rate": 8.379888268156425e-05, + "loss": 4.0671, + "step": 4050 + }, + { + "epoch": 0.2514743311192501, + "grad_norm": 0.5550085401307555, + "learning_rate": 8.381957376370784e-05, + "loss": 4.0918, + "step": 4051 + }, + { + "epoch": 0.251536408219008, + "grad_norm": 0.387912007459911, + "learning_rate": 8.384026484585145e-05, + "loss": 4.1576, + "step": 4052 + }, + { + "epoch": 0.2515984853187659, + "grad_norm": 0.4168431643350286, + "learning_rate": 8.386095592799503e-05, + "loss": 3.9812, + "step": 4053 + }, + { + "epoch": 0.2516605624185238, + "grad_norm": 0.4988831324706609, + "learning_rate": 8.388164701013863e-05, + "loss": 3.9687, + "step": 4054 + }, + { + "epoch": 0.2517226395182817, + "grad_norm": 0.5540001868908702, + "learning_rate": 8.390233809228222e-05, + "loss": 3.9839, + "step": 4055 + }, + { + "epoch": 0.2517847166180396, + "grad_norm": 0.5114965174678556, + "learning_rate": 8.392302917442583e-05, + "loss": 3.9883, + "step": 4056 + }, + { + "epoch": 0.2518467937177975, + "grad_norm": 0.41717698516167634, + "learning_rate": 8.394372025656943e-05, + "loss": 4.0368, + "step": 4057 + }, + { + "epoch": 0.2519088708175554, + "grad_norm": 0.3552946985058054, + "learning_rate": 8.396441133871302e-05, + "loss": 4.0097, + "step": 4058 + }, + { + "epoch": 0.2519709479173133, + "grad_norm": 0.38992961022132305, + "learning_rate": 8.398510242085662e-05, + "loss": 3.9269, + "step": 4059 + }, + { + "epoch": 0.2520330250170712, + "grad_norm": 0.4101095808816901, + "learning_rate": 8.400579350300021e-05, + "loss": 3.9595, + "step": 4060 + }, + { + "epoch": 0.2520951021168291, + "grad_norm": 0.3407032063148855, + "learning_rate": 8.40264845851438e-05, + "loss": 4.0172, + "step": 4061 + }, + { + "epoch": 0.252157179216587, + "grad_norm": 0.2849244913452286, + "learning_rate": 8.40471756672874e-05, + "loss": 4.1145, + "step": 4062 + }, + { + "epoch": 0.2522192563163449, + "grad_norm": 0.31547542512056237, + "learning_rate": 8.4067866749431e-05, + "loss": 3.9411, + "step": 4063 + }, + { + "epoch": 0.2522813334161028, + "grad_norm": 0.46941101832552284, + "learning_rate": 8.40885578315746e-05, + "loss": 4.0339, + "step": 4064 + }, + { + "epoch": 0.2523434105158607, + "grad_norm": 0.5260743581100005, + "learning_rate": 8.410924891371819e-05, + "loss": 4.0941, + "step": 4065 + }, + { + "epoch": 0.2524054876156186, + "grad_norm": 0.5016616687599625, + "learning_rate": 8.412993999586178e-05, + "loss": 3.9709, + "step": 4066 + }, + { + "epoch": 0.2524675647153765, + "grad_norm": 0.3415385441698824, + "learning_rate": 8.415063107800538e-05, + "loss": 4.0748, + "step": 4067 + }, + { + "epoch": 0.2525296418151344, + "grad_norm": 0.4197662242471211, + "learning_rate": 8.417132216014899e-05, + "loss": 3.9362, + "step": 4068 + }, + { + "epoch": 0.2525917189148923, + "grad_norm": 0.3530874557285225, + "learning_rate": 8.419201324229258e-05, + "loss": 3.9967, + "step": 4069 + }, + { + "epoch": 0.2526537960146502, + "grad_norm": 0.4964886692920875, + "learning_rate": 8.421270432443618e-05, + "loss": 4.2241, + "step": 4070 + }, + { + "epoch": 0.2527158731144081, + "grad_norm": 0.3907390046650281, + "learning_rate": 8.423339540657977e-05, + "loss": 4.0663, + "step": 4071 + }, + { + "epoch": 0.252777950214166, + "grad_norm": 0.3179789065864174, + "learning_rate": 8.425408648872337e-05, + "loss": 3.9717, + "step": 4072 + }, + { + "epoch": 0.2528400273139239, + "grad_norm": 0.35182243909975175, + "learning_rate": 8.427477757086696e-05, + "loss": 4.0221, + "step": 4073 + }, + { + "epoch": 0.25290210441368177, + "grad_norm": 0.4021668091366786, + "learning_rate": 8.429546865301056e-05, + "loss": 4.1157, + "step": 4074 + }, + { + "epoch": 0.2529641815134397, + "grad_norm": 0.2876791069609804, + "learning_rate": 8.431615973515415e-05, + "loss": 3.9854, + "step": 4075 + }, + { + "epoch": 0.2530262586131976, + "grad_norm": 0.3752601039372486, + "learning_rate": 8.433685081729775e-05, + "loss": 4.0293, + "step": 4076 + }, + { + "epoch": 0.25308833571295547, + "grad_norm": 0.39435010471751303, + "learning_rate": 8.435754189944135e-05, + "loss": 4.0601, + "step": 4077 + }, + { + "epoch": 0.2531504128127134, + "grad_norm": 0.3240597921238092, + "learning_rate": 8.437823298158494e-05, + "loss": 3.9596, + "step": 4078 + }, + { + "epoch": 0.2532124899124713, + "grad_norm": 0.32521287337992594, + "learning_rate": 8.439892406372853e-05, + "loss": 3.9352, + "step": 4079 + }, + { + "epoch": 0.25327456701222917, + "grad_norm": 0.5186704229882405, + "learning_rate": 8.441961514587213e-05, + "loss": 3.9533, + "step": 4080 + }, + { + "epoch": 0.2533366441119871, + "grad_norm": 0.36948241703542106, + "learning_rate": 8.444030622801573e-05, + "loss": 4.032, + "step": 4081 + }, + { + "epoch": 0.253398721211745, + "grad_norm": 0.4029748097171567, + "learning_rate": 8.446099731015933e-05, + "loss": 4.05, + "step": 4082 + }, + { + "epoch": 0.25346079831150287, + "grad_norm": 0.3101305059707618, + "learning_rate": 8.448168839230293e-05, + "loss": 3.8908, + "step": 4083 + }, + { + "epoch": 0.2535228754112608, + "grad_norm": 0.2824116400292645, + "learning_rate": 8.450237947444652e-05, + "loss": 3.9969, + "step": 4084 + }, + { + "epoch": 0.2535849525110187, + "grad_norm": 0.3478366940730047, + "learning_rate": 8.452307055659012e-05, + "loss": 4.0193, + "step": 4085 + }, + { + "epoch": 0.25364702961077656, + "grad_norm": 0.2639085061048576, + "learning_rate": 8.454376163873371e-05, + "loss": 4.0103, + "step": 4086 + }, + { + "epoch": 0.2537091067105345, + "grad_norm": 0.3788120506622565, + "learning_rate": 8.45644527208773e-05, + "loss": 3.9926, + "step": 4087 + }, + { + "epoch": 0.2537711838102924, + "grad_norm": 0.30897046285721463, + "learning_rate": 8.45851438030209e-05, + "loss": 4.0015, + "step": 4088 + }, + { + "epoch": 0.25383326091005026, + "grad_norm": 0.29953385835008384, + "learning_rate": 8.460583488516451e-05, + "loss": 4.0355, + "step": 4089 + }, + { + "epoch": 0.2538953380098082, + "grad_norm": 0.23153653339016786, + "learning_rate": 8.462652596730809e-05, + "loss": 3.9409, + "step": 4090 + }, + { + "epoch": 0.2539574151095661, + "grad_norm": 0.31040669522281855, + "learning_rate": 8.464721704945169e-05, + "loss": 4.0152, + "step": 4091 + }, + { + "epoch": 0.25401949220932396, + "grad_norm": 0.3789986173383106, + "learning_rate": 8.466790813159528e-05, + "loss": 4.0293, + "step": 4092 + }, + { + "epoch": 0.2540815693090819, + "grad_norm": 0.2648798223140219, + "learning_rate": 8.468859921373889e-05, + "loss": 4.0499, + "step": 4093 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 0.23891754867152293, + "learning_rate": 8.470929029588248e-05, + "loss": 3.8969, + "step": 4094 + }, + { + "epoch": 0.25420572350859766, + "grad_norm": 0.46417841057785153, + "learning_rate": 8.472998137802608e-05, + "loss": 3.9277, + "step": 4095 + }, + { + "epoch": 0.2542678006083556, + "grad_norm": 0.43130651117558017, + "learning_rate": 8.475067246016966e-05, + "loss": 3.9934, + "step": 4096 + }, + { + "epoch": 0.2543298777081135, + "grad_norm": 0.3080732325694425, + "learning_rate": 8.477136354231327e-05, + "loss": 3.9866, + "step": 4097 + }, + { + "epoch": 0.25439195480787136, + "grad_norm": 0.3058785365546791, + "learning_rate": 8.479205462445686e-05, + "loss": 4.1394, + "step": 4098 + }, + { + "epoch": 0.2544540319076293, + "grad_norm": 0.5231800015337915, + "learning_rate": 8.481274570660046e-05, + "loss": 3.9761, + "step": 4099 + }, + { + "epoch": 0.2545161090073872, + "grad_norm": 0.6323489858129191, + "learning_rate": 8.483343678874405e-05, + "loss": 3.9596, + "step": 4100 + }, + { + "epoch": 0.25457818610714505, + "grad_norm": 0.4117110438050936, + "learning_rate": 8.485412787088765e-05, + "loss": 4.0913, + "step": 4101 + }, + { + "epoch": 0.254640263206903, + "grad_norm": 0.43185823228583364, + "learning_rate": 8.487481895303125e-05, + "loss": 3.9533, + "step": 4102 + }, + { + "epoch": 0.2547023403066609, + "grad_norm": 0.49679877278276957, + "learning_rate": 8.489551003517484e-05, + "loss": 3.983, + "step": 4103 + }, + { + "epoch": 0.25476441740641875, + "grad_norm": 0.36538118074131015, + "learning_rate": 8.491620111731844e-05, + "loss": 4.1749, + "step": 4104 + }, + { + "epoch": 0.25482649450617667, + "grad_norm": 0.5384920944724502, + "learning_rate": 8.493689219946203e-05, + "loss": 4.0124, + "step": 4105 + }, + { + "epoch": 0.2548885716059346, + "grad_norm": 0.45474840261001587, + "learning_rate": 8.495758328160564e-05, + "loss": 3.9504, + "step": 4106 + }, + { + "epoch": 0.25495064870569245, + "grad_norm": 0.4216315308672092, + "learning_rate": 8.497827436374923e-05, + "loss": 4.0747, + "step": 4107 + }, + { + "epoch": 0.25501272580545037, + "grad_norm": 0.6073052886255226, + "learning_rate": 8.499896544589282e-05, + "loss": 4.0158, + "step": 4108 + }, + { + "epoch": 0.2550748029052083, + "grad_norm": 0.6741019977781048, + "learning_rate": 8.501965652803642e-05, + "loss": 3.9996, + "step": 4109 + }, + { + "epoch": 0.25513688000496615, + "grad_norm": 0.5869362012905457, + "learning_rate": 8.504034761018002e-05, + "loss": 3.8937, + "step": 4110 + }, + { + "epoch": 0.25519895710472407, + "grad_norm": 0.4590675765575481, + "learning_rate": 8.506103869232361e-05, + "loss": 3.8572, + "step": 4111 + }, + { + "epoch": 0.255261034204482, + "grad_norm": 0.6562578337065575, + "learning_rate": 8.508172977446721e-05, + "loss": 4.09, + "step": 4112 + }, + { + "epoch": 0.25532311130423985, + "grad_norm": 0.6406843111448014, + "learning_rate": 8.51024208566108e-05, + "loss": 3.9723, + "step": 4113 + }, + { + "epoch": 0.25538518840399776, + "grad_norm": 0.6969825747130144, + "learning_rate": 8.51231119387544e-05, + "loss": 3.9979, + "step": 4114 + }, + { + "epoch": 0.2554472655037557, + "grad_norm": 0.4449479468043964, + "learning_rate": 8.5143803020898e-05, + "loss": 4.0654, + "step": 4115 + }, + { + "epoch": 0.25550934260351355, + "grad_norm": 0.40817923224946523, + "learning_rate": 8.516449410304159e-05, + "loss": 4.0129, + "step": 4116 + }, + { + "epoch": 0.25557141970327146, + "grad_norm": 0.33337987882765147, + "learning_rate": 8.518518518518518e-05, + "loss": 4.0367, + "step": 4117 + }, + { + "epoch": 0.2556334968030294, + "grad_norm": 0.42273682409763913, + "learning_rate": 8.52058762673288e-05, + "loss": 4.1332, + "step": 4118 + }, + { + "epoch": 0.25569557390278724, + "grad_norm": 0.4464686093087521, + "learning_rate": 8.522656734947239e-05, + "loss": 4.0665, + "step": 4119 + }, + { + "epoch": 0.25575765100254516, + "grad_norm": 0.31752548532252894, + "learning_rate": 8.524725843161597e-05, + "loss": 3.9466, + "step": 4120 + }, + { + "epoch": 0.2558197281023031, + "grad_norm": 0.3735394446432245, + "learning_rate": 8.526794951375957e-05, + "loss": 4.0089, + "step": 4121 + }, + { + "epoch": 0.25588180520206094, + "grad_norm": 0.2862951449647112, + "learning_rate": 8.528864059590317e-05, + "loss": 4.0683, + "step": 4122 + }, + { + "epoch": 0.25594388230181886, + "grad_norm": 0.3002325105671264, + "learning_rate": 8.530933167804677e-05, + "loss": 4.0159, + "step": 4123 + }, + { + "epoch": 0.2560059594015768, + "grad_norm": 0.33987802573804193, + "learning_rate": 8.533002276019036e-05, + "loss": 3.886, + "step": 4124 + }, + { + "epoch": 0.25606803650133464, + "grad_norm": 0.356251473172806, + "learning_rate": 8.535071384233396e-05, + "loss": 3.9149, + "step": 4125 + }, + { + "epoch": 0.25613011360109256, + "grad_norm": 0.43252197320058844, + "learning_rate": 8.537140492447755e-05, + "loss": 3.9207, + "step": 4126 + }, + { + "epoch": 0.2561921907008505, + "grad_norm": 0.3771737199366278, + "learning_rate": 8.539209600662115e-05, + "loss": 4.0136, + "step": 4127 + }, + { + "epoch": 0.25625426780060834, + "grad_norm": 0.3263459112513499, + "learning_rate": 8.541278708876474e-05, + "loss": 3.9833, + "step": 4128 + }, + { + "epoch": 0.25631634490036626, + "grad_norm": 0.4271634790382466, + "learning_rate": 8.543347817090834e-05, + "loss": 3.9591, + "step": 4129 + }, + { + "epoch": 0.2563784220001242, + "grad_norm": 0.437119181968711, + "learning_rate": 8.545416925305195e-05, + "loss": 3.9253, + "step": 4130 + }, + { + "epoch": 0.25644049909988204, + "grad_norm": 0.4957721599006413, + "learning_rate": 8.547486033519554e-05, + "loss": 3.8566, + "step": 4131 + }, + { + "epoch": 0.25650257619963995, + "grad_norm": 0.43614663539084164, + "learning_rate": 8.549555141733912e-05, + "loss": 3.9499, + "step": 4132 + }, + { + "epoch": 0.25656465329939787, + "grad_norm": 0.48908079679833094, + "learning_rate": 8.551624249948272e-05, + "loss": 3.986, + "step": 4133 + }, + { + "epoch": 0.25662673039915573, + "grad_norm": 0.2942719023468495, + "learning_rate": 8.553693358162633e-05, + "loss": 4.0165, + "step": 4134 + }, + { + "epoch": 0.25668880749891365, + "grad_norm": 0.4160568503428454, + "learning_rate": 8.555762466376992e-05, + "loss": 4.0196, + "step": 4135 + }, + { + "epoch": 0.25675088459867157, + "grad_norm": 0.41087512641651797, + "learning_rate": 8.557831574591352e-05, + "loss": 4.0603, + "step": 4136 + }, + { + "epoch": 0.25681296169842943, + "grad_norm": 0.5152989715040305, + "learning_rate": 8.559900682805711e-05, + "loss": 4.0988, + "step": 4137 + }, + { + "epoch": 0.25687503879818735, + "grad_norm": 0.5663388442347084, + "learning_rate": 8.561969791020071e-05, + "loss": 4.0482, + "step": 4138 + }, + { + "epoch": 0.25693711589794527, + "grad_norm": 0.5051229706317301, + "learning_rate": 8.56403889923443e-05, + "loss": 3.9871, + "step": 4139 + }, + { + "epoch": 0.25699919299770313, + "grad_norm": 0.4412265006499854, + "learning_rate": 8.56610800744879e-05, + "loss": 4.009, + "step": 4140 + }, + { + "epoch": 0.25706127009746105, + "grad_norm": 0.33125953750555265, + "learning_rate": 8.56817711566315e-05, + "loss": 3.965, + "step": 4141 + }, + { + "epoch": 0.25712334719721897, + "grad_norm": 0.28120017934982716, + "learning_rate": 8.570246223877509e-05, + "loss": 3.977, + "step": 4142 + }, + { + "epoch": 0.25718542429697683, + "grad_norm": 0.7259387548586066, + "learning_rate": 8.57231533209187e-05, + "loss": 4.0366, + "step": 4143 + }, + { + "epoch": 0.25724750139673475, + "grad_norm": 0.5503078617764803, + "learning_rate": 8.574384440306228e-05, + "loss": 3.9862, + "step": 4144 + }, + { + "epoch": 0.25730957849649266, + "grad_norm": 0.5249781638100205, + "learning_rate": 8.576453548520587e-05, + "loss": 3.9694, + "step": 4145 + }, + { + "epoch": 0.2573716555962505, + "grad_norm": 0.7588807275260946, + "learning_rate": 8.578522656734947e-05, + "loss": 4.1116, + "step": 4146 + }, + { + "epoch": 0.25743373269600844, + "grad_norm": 0.5959360386510973, + "learning_rate": 8.580591764949308e-05, + "loss": 3.9547, + "step": 4147 + }, + { + "epoch": 0.25749580979576636, + "grad_norm": 0.6785484942884368, + "learning_rate": 8.582660873163667e-05, + "loss": 4.0629, + "step": 4148 + }, + { + "epoch": 0.2575578868955242, + "grad_norm": 0.6640899876908061, + "learning_rate": 8.584729981378027e-05, + "loss": 3.9216, + "step": 4149 + }, + { + "epoch": 0.25761996399528214, + "grad_norm": 0.4558408019164765, + "learning_rate": 8.586799089592385e-05, + "loss": 4.0688, + "step": 4150 + }, + { + "epoch": 0.25768204109504006, + "grad_norm": 0.3251929526486847, + "learning_rate": 8.588868197806746e-05, + "loss": 3.9805, + "step": 4151 + }, + { + "epoch": 0.2577441181947979, + "grad_norm": 0.4925013706991805, + "learning_rate": 8.590937306021105e-05, + "loss": 3.9974, + "step": 4152 + }, + { + "epoch": 0.25780619529455584, + "grad_norm": 0.3975506805404502, + "learning_rate": 8.593006414235465e-05, + "loss": 3.9192, + "step": 4153 + }, + { + "epoch": 0.25786827239431376, + "grad_norm": 0.5069215147357374, + "learning_rate": 8.595075522449824e-05, + "loss": 4.08, + "step": 4154 + }, + { + "epoch": 0.2579303494940716, + "grad_norm": 0.47743874648803236, + "learning_rate": 8.597144630664185e-05, + "loss": 3.9471, + "step": 4155 + }, + { + "epoch": 0.25799242659382954, + "grad_norm": 0.39928736460985903, + "learning_rate": 8.599213738878545e-05, + "loss": 3.9746, + "step": 4156 + }, + { + "epoch": 0.25805450369358746, + "grad_norm": 0.46816872477845767, + "learning_rate": 8.601282847092903e-05, + "loss": 3.983, + "step": 4157 + }, + { + "epoch": 0.2581165807933453, + "grad_norm": 0.41685505025732095, + "learning_rate": 8.603351955307262e-05, + "loss": 3.9443, + "step": 4158 + }, + { + "epoch": 0.25817865789310324, + "grad_norm": 0.3513936591023535, + "learning_rate": 8.605421063521623e-05, + "loss": 3.9474, + "step": 4159 + }, + { + "epoch": 0.25824073499286115, + "grad_norm": 0.5371824734970252, + "learning_rate": 8.607490171735983e-05, + "loss": 3.9001, + "step": 4160 + }, + { + "epoch": 0.258302812092619, + "grad_norm": 0.2576647103692053, + "learning_rate": 8.609559279950342e-05, + "loss": 4.0044, + "step": 4161 + }, + { + "epoch": 0.25836488919237693, + "grad_norm": 0.4874795053526717, + "learning_rate": 8.611628388164702e-05, + "loss": 3.8858, + "step": 4162 + }, + { + "epoch": 0.25842696629213485, + "grad_norm": 0.3389256231839272, + "learning_rate": 8.613697496379061e-05, + "loss": 3.9404, + "step": 4163 + }, + { + "epoch": 0.2584890433918927, + "grad_norm": 0.5032122660711147, + "learning_rate": 8.615766604593421e-05, + "loss": 3.9976, + "step": 4164 + }, + { + "epoch": 0.25855112049165063, + "grad_norm": 0.32016020154771746, + "learning_rate": 8.61783571280778e-05, + "loss": 4.0933, + "step": 4165 + }, + { + "epoch": 0.25861319759140855, + "grad_norm": 0.2674173303563992, + "learning_rate": 8.61990482102214e-05, + "loss": 3.9574, + "step": 4166 + }, + { + "epoch": 0.2586752746911664, + "grad_norm": 0.32769882818690027, + "learning_rate": 8.621973929236499e-05, + "loss": 3.9541, + "step": 4167 + }, + { + "epoch": 0.25873735179092433, + "grad_norm": 0.3623165735315495, + "learning_rate": 8.62404303745086e-05, + "loss": 3.8968, + "step": 4168 + }, + { + "epoch": 0.25879942889068225, + "grad_norm": 0.7214410823603201, + "learning_rate": 8.626112145665218e-05, + "loss": 4.0338, + "step": 4169 + }, + { + "epoch": 0.2588615059904401, + "grad_norm": 0.41686139523498383, + "learning_rate": 8.628181253879578e-05, + "loss": 3.9265, + "step": 4170 + }, + { + "epoch": 0.25892358309019803, + "grad_norm": 0.30222287188975155, + "learning_rate": 8.630250362093937e-05, + "loss": 3.9524, + "step": 4171 + }, + { + "epoch": 0.25898566018995595, + "grad_norm": 0.35503971492217595, + "learning_rate": 8.632319470308298e-05, + "loss": 3.9504, + "step": 4172 + }, + { + "epoch": 0.2590477372897138, + "grad_norm": 0.4534690964493536, + "learning_rate": 8.634388578522658e-05, + "loss": 4.0253, + "step": 4173 + }, + { + "epoch": 0.2591098143894717, + "grad_norm": 0.5320501192530658, + "learning_rate": 8.636457686737017e-05, + "loss": 4.04, + "step": 4174 + }, + { + "epoch": 0.25917189148922964, + "grad_norm": 0.8967396491128379, + "learning_rate": 8.638526794951375e-05, + "loss": 3.935, + "step": 4175 + }, + { + "epoch": 0.2592339685889875, + "grad_norm": 0.7081332391594578, + "learning_rate": 8.640595903165736e-05, + "loss": 4.0567, + "step": 4176 + }, + { + "epoch": 0.2592960456887454, + "grad_norm": 0.3903275501718385, + "learning_rate": 8.642665011380096e-05, + "loss": 4.0025, + "step": 4177 + }, + { + "epoch": 0.25935812278850334, + "grad_norm": 0.4800254047892944, + "learning_rate": 8.644734119594455e-05, + "loss": 3.8933, + "step": 4178 + }, + { + "epoch": 0.2594201998882612, + "grad_norm": 0.45108440921997406, + "learning_rate": 8.646803227808815e-05, + "loss": 3.9174, + "step": 4179 + }, + { + "epoch": 0.2594822769880191, + "grad_norm": 0.4218236044200536, + "learning_rate": 8.648872336023176e-05, + "loss": 3.8983, + "step": 4180 + }, + { + "epoch": 0.25954435408777704, + "grad_norm": 0.35764112158060923, + "learning_rate": 8.650941444237534e-05, + "loss": 4.0145, + "step": 4181 + }, + { + "epoch": 0.2596064311875349, + "grad_norm": 0.7672437565643656, + "learning_rate": 8.653010552451893e-05, + "loss": 3.9918, + "step": 4182 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 0.5386221649049273, + "learning_rate": 8.655079660666253e-05, + "loss": 4.0568, + "step": 4183 + }, + { + "epoch": 0.25973058538705074, + "grad_norm": 0.38524460388479104, + "learning_rate": 8.657148768880614e-05, + "loss": 4.0604, + "step": 4184 + }, + { + "epoch": 0.2597926624868086, + "grad_norm": 0.5635954468661644, + "learning_rate": 8.659217877094973e-05, + "loss": 4.0338, + "step": 4185 + }, + { + "epoch": 0.2598547395865665, + "grad_norm": 0.3594823759796635, + "learning_rate": 8.661286985309333e-05, + "loss": 4.0311, + "step": 4186 + }, + { + "epoch": 0.25991681668632444, + "grad_norm": 0.6314948928865102, + "learning_rate": 8.663356093523691e-05, + "loss": 4.1066, + "step": 4187 + }, + { + "epoch": 0.2599788937860823, + "grad_norm": 0.3851341573944496, + "learning_rate": 8.665425201738052e-05, + "loss": 3.9995, + "step": 4188 + }, + { + "epoch": 0.2600409708858402, + "grad_norm": 0.37447682284000583, + "learning_rate": 8.667494309952411e-05, + "loss": 3.9892, + "step": 4189 + }, + { + "epoch": 0.26010304798559813, + "grad_norm": 0.637475809872506, + "learning_rate": 8.66956341816677e-05, + "loss": 4.0532, + "step": 4190 + }, + { + "epoch": 0.260165125085356, + "grad_norm": 0.5530594318126747, + "learning_rate": 8.67163252638113e-05, + "loss": 3.9716, + "step": 4191 + }, + { + "epoch": 0.2602272021851139, + "grad_norm": 0.3782155141394959, + "learning_rate": 8.67370163459549e-05, + "loss": 3.9297, + "step": 4192 + }, + { + "epoch": 0.26028927928487183, + "grad_norm": 0.39259216890788723, + "learning_rate": 8.675770742809849e-05, + "loss": 3.9752, + "step": 4193 + }, + { + "epoch": 0.2603513563846297, + "grad_norm": 0.3358358184937059, + "learning_rate": 8.677839851024209e-05, + "loss": 4.0538, + "step": 4194 + }, + { + "epoch": 0.2604134334843876, + "grad_norm": 0.3343149189136776, + "learning_rate": 8.679908959238568e-05, + "loss": 3.9376, + "step": 4195 + }, + { + "epoch": 0.26047551058414553, + "grad_norm": 0.3272054252523619, + "learning_rate": 8.681978067452928e-05, + "loss": 3.9674, + "step": 4196 + }, + { + "epoch": 0.2605375876839034, + "grad_norm": 0.31990509175490295, + "learning_rate": 8.684047175667289e-05, + "loss": 3.9927, + "step": 4197 + }, + { + "epoch": 0.2605996647836613, + "grad_norm": 0.3693682861686163, + "learning_rate": 8.686116283881648e-05, + "loss": 4.0279, + "step": 4198 + }, + { + "epoch": 0.26066174188341923, + "grad_norm": 0.38625751309047507, + "learning_rate": 8.688185392096006e-05, + "loss": 4.0063, + "step": 4199 + }, + { + "epoch": 0.2607238189831771, + "grad_norm": 0.4072851783775485, + "learning_rate": 8.690254500310366e-05, + "loss": 3.8919, + "step": 4200 + }, + { + "epoch": 0.260785896082935, + "grad_norm": 0.2773086498776458, + "learning_rate": 8.692323608524727e-05, + "loss": 3.9221, + "step": 4201 + }, + { + "epoch": 0.2608479731826929, + "grad_norm": 0.3137027720510978, + "learning_rate": 8.694392716739086e-05, + "loss": 3.9312, + "step": 4202 + }, + { + "epoch": 0.2609100502824508, + "grad_norm": 0.3435168017154031, + "learning_rate": 8.696461824953446e-05, + "loss": 3.8664, + "step": 4203 + }, + { + "epoch": 0.2609721273822087, + "grad_norm": 0.26957309785030475, + "learning_rate": 8.698530933167805e-05, + "loss": 3.9852, + "step": 4204 + }, + { + "epoch": 0.2610342044819666, + "grad_norm": 0.5866594825606395, + "learning_rate": 8.700600041382165e-05, + "loss": 3.9906, + "step": 4205 + }, + { + "epoch": 0.2610962815817245, + "grad_norm": 0.36350967035062925, + "learning_rate": 8.702669149596524e-05, + "loss": 3.9739, + "step": 4206 + }, + { + "epoch": 0.2611583586814824, + "grad_norm": 0.5207223646840544, + "learning_rate": 8.704738257810884e-05, + "loss": 3.98, + "step": 4207 + }, + { + "epoch": 0.2612204357812403, + "grad_norm": 0.3175123683893278, + "learning_rate": 8.706807366025243e-05, + "loss": 3.959, + "step": 4208 + }, + { + "epoch": 0.2612825128809982, + "grad_norm": 0.3935154594767898, + "learning_rate": 8.708876474239604e-05, + "loss": 3.8918, + "step": 4209 + }, + { + "epoch": 0.2613445899807561, + "grad_norm": 0.30581981602861247, + "learning_rate": 8.710945582453964e-05, + "loss": 3.9096, + "step": 4210 + }, + { + "epoch": 0.261406667080514, + "grad_norm": 0.3277058762900624, + "learning_rate": 8.713014690668322e-05, + "loss": 3.8814, + "step": 4211 + }, + { + "epoch": 0.2614687441802719, + "grad_norm": 0.373308539369075, + "learning_rate": 8.715083798882681e-05, + "loss": 4.1393, + "step": 4212 + }, + { + "epoch": 0.2615308212800298, + "grad_norm": 0.4001611377576934, + "learning_rate": 8.717152907097042e-05, + "loss": 3.9805, + "step": 4213 + }, + { + "epoch": 0.2615928983797877, + "grad_norm": 0.3719961938599228, + "learning_rate": 8.719222015311402e-05, + "loss": 3.8626, + "step": 4214 + }, + { + "epoch": 0.2616549754795456, + "grad_norm": 0.2861326562666222, + "learning_rate": 8.721291123525761e-05, + "loss": 4.0188, + "step": 4215 + }, + { + "epoch": 0.2617170525793035, + "grad_norm": 0.48622022910291235, + "learning_rate": 8.72336023174012e-05, + "loss": 3.9531, + "step": 4216 + }, + { + "epoch": 0.2617791296790614, + "grad_norm": 0.37970980747452604, + "learning_rate": 8.72542933995448e-05, + "loss": 4.0012, + "step": 4217 + }, + { + "epoch": 0.2618412067788193, + "grad_norm": 0.381165151847567, + "learning_rate": 8.72749844816884e-05, + "loss": 3.9826, + "step": 4218 + }, + { + "epoch": 0.2619032838785772, + "grad_norm": 0.3671203714247151, + "learning_rate": 8.729567556383199e-05, + "loss": 3.9345, + "step": 4219 + }, + { + "epoch": 0.2619653609783351, + "grad_norm": 0.35199634485622827, + "learning_rate": 8.731636664597559e-05, + "loss": 4.0652, + "step": 4220 + }, + { + "epoch": 0.262027438078093, + "grad_norm": 0.7589827870790654, + "learning_rate": 8.733705772811918e-05, + "loss": 4.0204, + "step": 4221 + }, + { + "epoch": 0.2620895151778509, + "grad_norm": 0.5222151133406272, + "learning_rate": 8.735774881026279e-05, + "loss": 4.0099, + "step": 4222 + }, + { + "epoch": 0.2621515922776088, + "grad_norm": 0.5777578734699524, + "learning_rate": 8.737843989240637e-05, + "loss": 3.9423, + "step": 4223 + }, + { + "epoch": 0.2622136693773667, + "grad_norm": 0.4252036665523751, + "learning_rate": 8.739913097454997e-05, + "loss": 3.9938, + "step": 4224 + }, + { + "epoch": 0.2622757464771246, + "grad_norm": 0.4992131210511265, + "learning_rate": 8.741982205669356e-05, + "loss": 4.0776, + "step": 4225 + }, + { + "epoch": 0.2623378235768825, + "grad_norm": 0.46595546273313165, + "learning_rate": 8.744051313883717e-05, + "loss": 3.9617, + "step": 4226 + }, + { + "epoch": 0.2623999006766404, + "grad_norm": 0.3011872041514397, + "learning_rate": 8.746120422098077e-05, + "loss": 4.0614, + "step": 4227 + }, + { + "epoch": 0.2624619777763983, + "grad_norm": 0.4119372437756586, + "learning_rate": 8.748189530312436e-05, + "loss": 4.0194, + "step": 4228 + }, + { + "epoch": 0.2625240548761562, + "grad_norm": 0.5219878719956177, + "learning_rate": 8.750258638526794e-05, + "loss": 4.022, + "step": 4229 + }, + { + "epoch": 0.26258613197591407, + "grad_norm": 0.5628379230592649, + "learning_rate": 8.752327746741155e-05, + "loss": 3.9554, + "step": 4230 + }, + { + "epoch": 0.262648209075672, + "grad_norm": 0.45512144349177236, + "learning_rate": 8.754396854955515e-05, + "loss": 3.943, + "step": 4231 + }, + { + "epoch": 0.2627102861754299, + "grad_norm": 0.5501044694345522, + "learning_rate": 8.756465963169874e-05, + "loss": 3.9221, + "step": 4232 + }, + { + "epoch": 0.26277236327518777, + "grad_norm": 0.49672204128445735, + "learning_rate": 8.758535071384234e-05, + "loss": 3.9537, + "step": 4233 + }, + { + "epoch": 0.2628344403749457, + "grad_norm": 0.47374706659943794, + "learning_rate": 8.760604179598594e-05, + "loss": 3.9276, + "step": 4234 + }, + { + "epoch": 0.2628965174747036, + "grad_norm": 0.4779065985923119, + "learning_rate": 8.762673287812953e-05, + "loss": 3.9657, + "step": 4235 + }, + { + "epoch": 0.26295859457446147, + "grad_norm": 0.3712675791730288, + "learning_rate": 8.764742396027312e-05, + "loss": 3.912, + "step": 4236 + }, + { + "epoch": 0.2630206716742194, + "grad_norm": 0.2950913655908752, + "learning_rate": 8.766811504241672e-05, + "loss": 3.8574, + "step": 4237 + }, + { + "epoch": 0.2630827487739773, + "grad_norm": 0.41401742293949345, + "learning_rate": 8.768880612456032e-05, + "loss": 3.9631, + "step": 4238 + }, + { + "epoch": 0.26314482587373517, + "grad_norm": 0.40703715578213506, + "learning_rate": 8.770949720670392e-05, + "loss": 3.9974, + "step": 4239 + }, + { + "epoch": 0.2632069029734931, + "grad_norm": 0.42508767009128995, + "learning_rate": 8.773018828884751e-05, + "loss": 4.0043, + "step": 4240 + }, + { + "epoch": 0.263268980073251, + "grad_norm": 0.32478210040366257, + "learning_rate": 8.775087937099111e-05, + "loss": 4.028, + "step": 4241 + }, + { + "epoch": 0.26333105717300886, + "grad_norm": 0.39245520038422765, + "learning_rate": 8.77715704531347e-05, + "loss": 3.9047, + "step": 4242 + }, + { + "epoch": 0.2633931342727668, + "grad_norm": 0.3464167282660231, + "learning_rate": 8.77922615352783e-05, + "loss": 3.9148, + "step": 4243 + }, + { + "epoch": 0.2634552113725247, + "grad_norm": 0.4726201035730971, + "learning_rate": 8.78129526174219e-05, + "loss": 3.8964, + "step": 4244 + }, + { + "epoch": 0.26351728847228256, + "grad_norm": 0.300561849658855, + "learning_rate": 8.783364369956549e-05, + "loss": 4.0243, + "step": 4245 + }, + { + "epoch": 0.2635793655720405, + "grad_norm": 0.39516952552683715, + "learning_rate": 8.785433478170909e-05, + "loss": 4.0851, + "step": 4246 + }, + { + "epoch": 0.2636414426717984, + "grad_norm": 0.6694001593612103, + "learning_rate": 8.78750258638527e-05, + "loss": 3.9732, + "step": 4247 + }, + { + "epoch": 0.26370351977155626, + "grad_norm": 0.4340023658518757, + "learning_rate": 8.789571694599628e-05, + "loss": 3.9547, + "step": 4248 + }, + { + "epoch": 0.2637655968713142, + "grad_norm": 0.47456310972920385, + "learning_rate": 8.791640802813987e-05, + "loss": 3.9833, + "step": 4249 + }, + { + "epoch": 0.2638276739710721, + "grad_norm": 0.6312835967688436, + "learning_rate": 8.793709911028347e-05, + "loss": 3.9853, + "step": 4250 + }, + { + "epoch": 0.26388975107082996, + "grad_norm": 0.37539812650141907, + "learning_rate": 8.795779019242707e-05, + "loss": 3.9651, + "step": 4251 + }, + { + "epoch": 0.2639518281705879, + "grad_norm": 0.2905467179193608, + "learning_rate": 8.797848127457067e-05, + "loss": 3.9281, + "step": 4252 + }, + { + "epoch": 0.2640139052703458, + "grad_norm": 0.3556825183686459, + "learning_rate": 8.799917235671426e-05, + "loss": 3.9317, + "step": 4253 + }, + { + "epoch": 0.26407598237010366, + "grad_norm": 0.3159284987898354, + "learning_rate": 8.801986343885785e-05, + "loss": 4.0081, + "step": 4254 + }, + { + "epoch": 0.2641380594698616, + "grad_norm": 0.36381419886861, + "learning_rate": 8.804055452100145e-05, + "loss": 3.7688, + "step": 4255 + }, + { + "epoch": 0.2642001365696195, + "grad_norm": 0.28622658921172994, + "learning_rate": 8.806124560314505e-05, + "loss": 3.9275, + "step": 4256 + }, + { + "epoch": 0.26426221366937735, + "grad_norm": 0.5023537013371924, + "learning_rate": 8.808193668528864e-05, + "loss": 3.8295, + "step": 4257 + }, + { + "epoch": 0.2643242907691353, + "grad_norm": 0.336379636304467, + "learning_rate": 8.810262776743224e-05, + "loss": 4.0435, + "step": 4258 + }, + { + "epoch": 0.2643863678688932, + "grad_norm": 0.26647237645013794, + "learning_rate": 8.812331884957585e-05, + "loss": 3.8031, + "step": 4259 + }, + { + "epoch": 0.26444844496865105, + "grad_norm": 0.3781703409486071, + "learning_rate": 8.814400993171943e-05, + "loss": 3.952, + "step": 4260 + }, + { + "epoch": 0.26451052206840897, + "grad_norm": 0.3019267599974809, + "learning_rate": 8.816470101386302e-05, + "loss": 4.0604, + "step": 4261 + }, + { + "epoch": 0.2645725991681669, + "grad_norm": 0.39812882003867833, + "learning_rate": 8.818539209600662e-05, + "loss": 3.9831, + "step": 4262 + }, + { + "epoch": 0.26463467626792475, + "grad_norm": 0.487545680082241, + "learning_rate": 8.820608317815023e-05, + "loss": 3.9634, + "step": 4263 + }, + { + "epoch": 0.26469675336768267, + "grad_norm": 0.39909643027478137, + "learning_rate": 8.822677426029382e-05, + "loss": 3.8903, + "step": 4264 + }, + { + "epoch": 0.2647588304674406, + "grad_norm": 0.31304250357583824, + "learning_rate": 8.824746534243742e-05, + "loss": 3.9522, + "step": 4265 + }, + { + "epoch": 0.26482090756719845, + "grad_norm": 0.35931688336365947, + "learning_rate": 8.8268156424581e-05, + "loss": 3.9816, + "step": 4266 + }, + { + "epoch": 0.26488298466695637, + "grad_norm": 0.3422029644158559, + "learning_rate": 8.828884750672461e-05, + "loss": 3.8029, + "step": 4267 + }, + { + "epoch": 0.2649450617667143, + "grad_norm": 0.4548893811352331, + "learning_rate": 8.83095385888682e-05, + "loss": 4.04, + "step": 4268 + }, + { + "epoch": 0.26500713886647215, + "grad_norm": 0.4487048153191029, + "learning_rate": 8.83302296710118e-05, + "loss": 4.015, + "step": 4269 + }, + { + "epoch": 0.26506921596623007, + "grad_norm": 0.6899198729872963, + "learning_rate": 8.83509207531554e-05, + "loss": 3.9724, + "step": 4270 + }, + { + "epoch": 0.265131293065988, + "grad_norm": 0.5379570115952939, + "learning_rate": 8.837161183529899e-05, + "loss": 3.9577, + "step": 4271 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 0.369204843576448, + "learning_rate": 8.839230291744258e-05, + "loss": 3.893, + "step": 4272 + }, + { + "epoch": 0.26525544726550376, + "grad_norm": 0.4869566536525676, + "learning_rate": 8.841299399958618e-05, + "loss": 4.1253, + "step": 4273 + }, + { + "epoch": 0.2653175243652617, + "grad_norm": 0.3382408774337659, + "learning_rate": 8.843368508172977e-05, + "loss": 3.8901, + "step": 4274 + }, + { + "epoch": 0.26537960146501954, + "grad_norm": 0.4057049212314062, + "learning_rate": 8.845437616387337e-05, + "loss": 3.9765, + "step": 4275 + }, + { + "epoch": 0.26544167856477746, + "grad_norm": 0.5652089849436184, + "learning_rate": 8.847506724601698e-05, + "loss": 3.9244, + "step": 4276 + }, + { + "epoch": 0.2655037556645354, + "grad_norm": 0.3783621666896052, + "learning_rate": 8.849575832816057e-05, + "loss": 3.8783, + "step": 4277 + }, + { + "epoch": 0.26556583276429324, + "grad_norm": 0.3695886910052708, + "learning_rate": 8.851644941030415e-05, + "loss": 3.827, + "step": 4278 + }, + { + "epoch": 0.26562790986405116, + "grad_norm": 0.26544044197756955, + "learning_rate": 8.853714049244776e-05, + "loss": 3.8679, + "step": 4279 + }, + { + "epoch": 0.2656899869638091, + "grad_norm": 0.3299770426170132, + "learning_rate": 8.855783157459136e-05, + "loss": 4.0413, + "step": 4280 + }, + { + "epoch": 0.26575206406356694, + "grad_norm": 0.4142078468444738, + "learning_rate": 8.857852265673495e-05, + "loss": 3.979, + "step": 4281 + }, + { + "epoch": 0.26581414116332486, + "grad_norm": 0.44135559263967616, + "learning_rate": 8.859921373887855e-05, + "loss": 3.9923, + "step": 4282 + }, + { + "epoch": 0.2658762182630828, + "grad_norm": 0.6017923782856115, + "learning_rate": 8.861990482102214e-05, + "loss": 3.8581, + "step": 4283 + }, + { + "epoch": 0.26593829536284064, + "grad_norm": 0.3966239721030625, + "learning_rate": 8.864059590316574e-05, + "loss": 4.0001, + "step": 4284 + }, + { + "epoch": 0.26600037246259856, + "grad_norm": 0.4035185726091348, + "learning_rate": 8.866128698530933e-05, + "loss": 4.0173, + "step": 4285 + }, + { + "epoch": 0.2660624495623565, + "grad_norm": 0.28830272470520446, + "learning_rate": 8.868197806745293e-05, + "loss": 3.8999, + "step": 4286 + }, + { + "epoch": 0.26612452666211434, + "grad_norm": 0.45818403550600767, + "learning_rate": 8.870266914959652e-05, + "loss": 3.9461, + "step": 4287 + }, + { + "epoch": 0.26618660376187225, + "grad_norm": 0.5491043156098032, + "learning_rate": 8.872336023174013e-05, + "loss": 3.9048, + "step": 4288 + }, + { + "epoch": 0.26624868086163017, + "grad_norm": 0.5805808820275317, + "learning_rate": 8.874405131388373e-05, + "loss": 3.9752, + "step": 4289 + }, + { + "epoch": 0.26631075796138803, + "grad_norm": 0.3047352411085416, + "learning_rate": 8.876474239602731e-05, + "loss": 3.8841, + "step": 4290 + }, + { + "epoch": 0.26637283506114595, + "grad_norm": 0.284097417084971, + "learning_rate": 8.87854334781709e-05, + "loss": 3.9472, + "step": 4291 + }, + { + "epoch": 0.26643491216090387, + "grad_norm": 0.7622361602202405, + "learning_rate": 8.880612456031451e-05, + "loss": 3.9784, + "step": 4292 + }, + { + "epoch": 0.26649698926066173, + "grad_norm": 0.6092462403639671, + "learning_rate": 8.882681564245811e-05, + "loss": 3.9758, + "step": 4293 + }, + { + "epoch": 0.26655906636041965, + "grad_norm": 0.6265025435987442, + "learning_rate": 8.88475067246017e-05, + "loss": 3.9514, + "step": 4294 + }, + { + "epoch": 0.26662114346017757, + "grad_norm": 0.4478583593638828, + "learning_rate": 8.88681978067453e-05, + "loss": 3.8747, + "step": 4295 + }, + { + "epoch": 0.26668322055993543, + "grad_norm": 0.5132414896761622, + "learning_rate": 8.888888888888889e-05, + "loss": 3.989, + "step": 4296 + }, + { + "epoch": 0.26674529765969335, + "grad_norm": 0.6325609625586687, + "learning_rate": 8.890957997103249e-05, + "loss": 3.9575, + "step": 4297 + }, + { + "epoch": 0.26680737475945127, + "grad_norm": 0.5229667425558759, + "learning_rate": 8.893027105317608e-05, + "loss": 3.9127, + "step": 4298 + }, + { + "epoch": 0.26686945185920913, + "grad_norm": 0.6004889185007647, + "learning_rate": 8.895096213531968e-05, + "loss": 3.9666, + "step": 4299 + }, + { + "epoch": 0.26693152895896705, + "grad_norm": 0.33086708402153536, + "learning_rate": 8.897165321746327e-05, + "loss": 4.0694, + "step": 4300 + }, + { + "epoch": 0.26699360605872496, + "grad_norm": 0.4939489181224051, + "learning_rate": 8.899234429960688e-05, + "loss": 4.0504, + "step": 4301 + }, + { + "epoch": 0.2670556831584828, + "grad_norm": 0.6083100623639779, + "learning_rate": 8.901303538175046e-05, + "loss": 4.0349, + "step": 4302 + }, + { + "epoch": 0.26711776025824074, + "grad_norm": 0.34519249440409433, + "learning_rate": 8.903372646389406e-05, + "loss": 4.0574, + "step": 4303 + }, + { + "epoch": 0.26717983735799866, + "grad_norm": 0.5574301758477589, + "learning_rate": 8.905441754603767e-05, + "loss": 3.9641, + "step": 4304 + }, + { + "epoch": 0.2672419144577565, + "grad_norm": 0.2986391005717896, + "learning_rate": 8.907510862818126e-05, + "loss": 3.9071, + "step": 4305 + }, + { + "epoch": 0.26730399155751444, + "grad_norm": 0.6270546281355266, + "learning_rate": 8.909579971032486e-05, + "loss": 4.0404, + "step": 4306 + }, + { + "epoch": 0.26736606865727236, + "grad_norm": 0.32734137013009534, + "learning_rate": 8.911649079246845e-05, + "loss": 3.9705, + "step": 4307 + }, + { + "epoch": 0.2674281457570302, + "grad_norm": 0.3832418214195306, + "learning_rate": 8.913718187461205e-05, + "loss": 3.9234, + "step": 4308 + }, + { + "epoch": 0.26749022285678814, + "grad_norm": 0.3834337526245553, + "learning_rate": 8.915787295675564e-05, + "loss": 4.0283, + "step": 4309 + }, + { + "epoch": 0.267552299956546, + "grad_norm": 0.38045084635068516, + "learning_rate": 8.917856403889924e-05, + "loss": 3.8601, + "step": 4310 + }, + { + "epoch": 0.2676143770563039, + "grad_norm": 0.3763577003253283, + "learning_rate": 8.919925512104283e-05, + "loss": 3.9016, + "step": 4311 + }, + { + "epoch": 0.26767645415606184, + "grad_norm": 0.350541943523286, + "learning_rate": 8.921994620318643e-05, + "loss": 4.0103, + "step": 4312 + }, + { + "epoch": 0.2677385312558197, + "grad_norm": 0.3294571446912371, + "learning_rate": 8.924063728533004e-05, + "loss": 3.8992, + "step": 4313 + }, + { + "epoch": 0.2678006083555776, + "grad_norm": 0.3414333917579078, + "learning_rate": 8.926132836747362e-05, + "loss": 3.872, + "step": 4314 + }, + { + "epoch": 0.26786268545533554, + "grad_norm": 0.29407276672780686, + "learning_rate": 8.928201944961721e-05, + "loss": 3.9461, + "step": 4315 + }, + { + "epoch": 0.2679247625550934, + "grad_norm": 0.3275795785847571, + "learning_rate": 8.930271053176081e-05, + "loss": 3.9151, + "step": 4316 + }, + { + "epoch": 0.2679868396548513, + "grad_norm": 0.4841295974261005, + "learning_rate": 8.932340161390442e-05, + "loss": 3.9667, + "step": 4317 + }, + { + "epoch": 0.26804891675460923, + "grad_norm": 0.3703735188619691, + "learning_rate": 8.934409269604801e-05, + "loss": 3.9677, + "step": 4318 + }, + { + "epoch": 0.2681109938543671, + "grad_norm": 0.25364430927756093, + "learning_rate": 8.936478377819161e-05, + "loss": 3.9669, + "step": 4319 + }, + { + "epoch": 0.268173070954125, + "grad_norm": 0.616553089168631, + "learning_rate": 8.938547486033519e-05, + "loss": 3.9384, + "step": 4320 + }, + { + "epoch": 0.26823514805388293, + "grad_norm": 0.6949726517188396, + "learning_rate": 8.94061659424788e-05, + "loss": 3.8769, + "step": 4321 + }, + { + "epoch": 0.2682972251536408, + "grad_norm": 0.3739243019933524, + "learning_rate": 8.942685702462239e-05, + "loss": 3.9404, + "step": 4322 + }, + { + "epoch": 0.2683593022533987, + "grad_norm": 0.4431557544855853, + "learning_rate": 8.944754810676599e-05, + "loss": 3.9627, + "step": 4323 + }, + { + "epoch": 0.26842137935315663, + "grad_norm": 0.40404452865888807, + "learning_rate": 8.946823918890958e-05, + "loss": 3.8242, + "step": 4324 + }, + { + "epoch": 0.2684834564529145, + "grad_norm": 0.3835631232067787, + "learning_rate": 8.948893027105318e-05, + "loss": 3.954, + "step": 4325 + }, + { + "epoch": 0.2685455335526724, + "grad_norm": 0.5519181010263012, + "learning_rate": 8.950962135319679e-05, + "loss": 4.0007, + "step": 4326 + }, + { + "epoch": 0.26860761065243033, + "grad_norm": 0.43837531265278096, + "learning_rate": 8.953031243534037e-05, + "loss": 3.9363, + "step": 4327 + }, + { + "epoch": 0.2686696877521882, + "grad_norm": 0.4009899027556768, + "learning_rate": 8.955100351748396e-05, + "loss": 3.9275, + "step": 4328 + }, + { + "epoch": 0.2687317648519461, + "grad_norm": 0.3505769797538205, + "learning_rate": 8.957169459962757e-05, + "loss": 3.991, + "step": 4329 + }, + { + "epoch": 0.268793841951704, + "grad_norm": 0.5806148884183305, + "learning_rate": 8.959238568177117e-05, + "loss": 4.0468, + "step": 4330 + }, + { + "epoch": 0.2688559190514619, + "grad_norm": 0.5676769603728158, + "learning_rate": 8.961307676391476e-05, + "loss": 3.8899, + "step": 4331 + }, + { + "epoch": 0.2689179961512198, + "grad_norm": 0.48715727569806855, + "learning_rate": 8.963376784605836e-05, + "loss": 3.891, + "step": 4332 + }, + { + "epoch": 0.2689800732509777, + "grad_norm": 0.5841715330972375, + "learning_rate": 8.965445892820195e-05, + "loss": 4.0216, + "step": 4333 + }, + { + "epoch": 0.2690421503507356, + "grad_norm": 0.44618334029236173, + "learning_rate": 8.967515001034555e-05, + "loss": 4.0311, + "step": 4334 + }, + { + "epoch": 0.2691042274504935, + "grad_norm": 0.35687940110467403, + "learning_rate": 8.969584109248914e-05, + "loss": 3.8109, + "step": 4335 + }, + { + "epoch": 0.2691663045502514, + "grad_norm": 0.43377951345706905, + "learning_rate": 8.971653217463274e-05, + "loss": 3.9071, + "step": 4336 + }, + { + "epoch": 0.2692283816500093, + "grad_norm": 0.6328248623240901, + "learning_rate": 8.973722325677633e-05, + "loss": 3.9673, + "step": 4337 + }, + { + "epoch": 0.2692904587497672, + "grad_norm": 0.3004674389156436, + "learning_rate": 8.975791433891994e-05, + "loss": 3.981, + "step": 4338 + }, + { + "epoch": 0.2693525358495251, + "grad_norm": 0.39228823382590355, + "learning_rate": 8.977860542106352e-05, + "loss": 3.9487, + "step": 4339 + }, + { + "epoch": 0.269414612949283, + "grad_norm": 0.30619848168343633, + "learning_rate": 8.979929650320712e-05, + "loss": 3.8854, + "step": 4340 + }, + { + "epoch": 0.2694766900490409, + "grad_norm": 0.4286460045234397, + "learning_rate": 8.981998758535071e-05, + "loss": 3.9222, + "step": 4341 + }, + { + "epoch": 0.2695387671487988, + "grad_norm": 0.3367519375583214, + "learning_rate": 8.984067866749432e-05, + "loss": 3.913, + "step": 4342 + }, + { + "epoch": 0.2696008442485567, + "grad_norm": 0.3639818634319915, + "learning_rate": 8.986136974963792e-05, + "loss": 3.9769, + "step": 4343 + }, + { + "epoch": 0.2696629213483146, + "grad_norm": 0.4007786429099194, + "learning_rate": 8.988206083178151e-05, + "loss": 3.9404, + "step": 4344 + }, + { + "epoch": 0.2697249984480725, + "grad_norm": 0.263699692565636, + "learning_rate": 8.990275191392509e-05, + "loss": 3.882, + "step": 4345 + }, + { + "epoch": 0.2697870755478304, + "grad_norm": 0.2737745795907429, + "learning_rate": 8.99234429960687e-05, + "loss": 3.9338, + "step": 4346 + }, + { + "epoch": 0.2698491526475883, + "grad_norm": 0.6463312388330918, + "learning_rate": 8.99441340782123e-05, + "loss": 3.9284, + "step": 4347 + }, + { + "epoch": 0.2699112297473462, + "grad_norm": 0.34240577955841195, + "learning_rate": 8.996482516035589e-05, + "loss": 3.8585, + "step": 4348 + }, + { + "epoch": 0.2699733068471041, + "grad_norm": 0.6014364283842633, + "learning_rate": 8.998551624249949e-05, + "loss": 4.0379, + "step": 4349 + }, + { + "epoch": 0.270035383946862, + "grad_norm": 0.34702354893218523, + "learning_rate": 9.000620732464308e-05, + "loss": 3.8934, + "step": 4350 + }, + { + "epoch": 0.2700974610466199, + "grad_norm": 0.4156250351353681, + "learning_rate": 9.002689840678668e-05, + "loss": 3.8722, + "step": 4351 + }, + { + "epoch": 0.2701595381463778, + "grad_norm": 0.7610771019809751, + "learning_rate": 9.004758948893027e-05, + "loss": 3.9854, + "step": 4352 + }, + { + "epoch": 0.2702216152461357, + "grad_norm": 0.3715450754724058, + "learning_rate": 9.006828057107387e-05, + "loss": 3.8296, + "step": 4353 + }, + { + "epoch": 0.2702836923458936, + "grad_norm": 0.4738845628051744, + "learning_rate": 9.008897165321748e-05, + "loss": 4.0159, + "step": 4354 + }, + { + "epoch": 0.2703457694456515, + "grad_norm": 0.41438173187909527, + "learning_rate": 9.010966273536107e-05, + "loss": 4.0324, + "step": 4355 + }, + { + "epoch": 0.2704078465454094, + "grad_norm": 0.41989352108353356, + "learning_rate": 9.013035381750467e-05, + "loss": 3.9106, + "step": 4356 + }, + { + "epoch": 0.2704699236451673, + "grad_norm": 0.5786105771544804, + "learning_rate": 9.015104489964825e-05, + "loss": 3.9401, + "step": 4357 + }, + { + "epoch": 0.27053200074492517, + "grad_norm": 0.45207131306830806, + "learning_rate": 9.017173598179186e-05, + "loss": 3.9767, + "step": 4358 + }, + { + "epoch": 0.2705940778446831, + "grad_norm": 0.5799341761710353, + "learning_rate": 9.019242706393545e-05, + "loss": 3.9732, + "step": 4359 + }, + { + "epoch": 0.270656154944441, + "grad_norm": 0.5472805971416081, + "learning_rate": 9.021311814607905e-05, + "loss": 3.9502, + "step": 4360 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 0.6666618411565025, + "learning_rate": 9.023380922822264e-05, + "loss": 3.9171, + "step": 4361 + }, + { + "epoch": 0.2707803091439568, + "grad_norm": 0.5119896673895894, + "learning_rate": 9.025450031036624e-05, + "loss": 3.9213, + "step": 4362 + }, + { + "epoch": 0.2708423862437147, + "grad_norm": 0.8356834810003546, + "learning_rate": 9.027519139250983e-05, + "loss": 3.9958, + "step": 4363 + }, + { + "epoch": 0.27090446334347257, + "grad_norm": 0.504183995787148, + "learning_rate": 9.029588247465343e-05, + "loss": 3.8779, + "step": 4364 + }, + { + "epoch": 0.2709665404432305, + "grad_norm": 0.571524338488494, + "learning_rate": 9.031657355679702e-05, + "loss": 3.9312, + "step": 4365 + }, + { + "epoch": 0.2710286175429884, + "grad_norm": 0.4373009092466723, + "learning_rate": 9.033726463894062e-05, + "loss": 4.0439, + "step": 4366 + }, + { + "epoch": 0.27109069464274627, + "grad_norm": 0.5145999371659425, + "learning_rate": 9.035795572108422e-05, + "loss": 3.9403, + "step": 4367 + }, + { + "epoch": 0.2711527717425042, + "grad_norm": 0.6404477083463438, + "learning_rate": 9.037864680322782e-05, + "loss": 3.9354, + "step": 4368 + }, + { + "epoch": 0.2712148488422621, + "grad_norm": 0.38140593216573, + "learning_rate": 9.03993378853714e-05, + "loss": 3.8902, + "step": 4369 + }, + { + "epoch": 0.27127692594201996, + "grad_norm": 0.5649547789630406, + "learning_rate": 9.0420028967515e-05, + "loss": 4.0201, + "step": 4370 + }, + { + "epoch": 0.2713390030417779, + "grad_norm": 0.6337062270191584, + "learning_rate": 9.04407200496586e-05, + "loss": 3.9709, + "step": 4371 + }, + { + "epoch": 0.2714010801415358, + "grad_norm": 0.4286296480660493, + "learning_rate": 9.04614111318022e-05, + "loss": 3.9088, + "step": 4372 + }, + { + "epoch": 0.27146315724129366, + "grad_norm": 0.43926248441919596, + "learning_rate": 9.04821022139458e-05, + "loss": 3.9794, + "step": 4373 + }, + { + "epoch": 0.2715252343410516, + "grad_norm": 0.38635718657725043, + "learning_rate": 9.050279329608939e-05, + "loss": 3.8568, + "step": 4374 + }, + { + "epoch": 0.2715873114408095, + "grad_norm": 0.38680416666615164, + "learning_rate": 9.052348437823299e-05, + "loss": 3.9514, + "step": 4375 + }, + { + "epoch": 0.27164938854056736, + "grad_norm": 0.3001317529645493, + "learning_rate": 9.054417546037658e-05, + "loss": 3.8925, + "step": 4376 + }, + { + "epoch": 0.2717114656403253, + "grad_norm": 0.4499273599170082, + "learning_rate": 9.056486654252018e-05, + "loss": 3.8906, + "step": 4377 + }, + { + "epoch": 0.2717735427400832, + "grad_norm": 0.4938562502197179, + "learning_rate": 9.058555762466377e-05, + "loss": 3.9307, + "step": 4378 + }, + { + "epoch": 0.27183561983984106, + "grad_norm": 0.4936619426575108, + "learning_rate": 9.060624870680738e-05, + "loss": 3.8095, + "step": 4379 + }, + { + "epoch": 0.271897696939599, + "grad_norm": 0.4926377462293113, + "learning_rate": 9.062693978895097e-05, + "loss": 3.8434, + "step": 4380 + }, + { + "epoch": 0.2719597740393569, + "grad_norm": 0.38252151874398904, + "learning_rate": 9.064763087109456e-05, + "loss": 3.9929, + "step": 4381 + }, + { + "epoch": 0.27202185113911476, + "grad_norm": 0.6518989005603697, + "learning_rate": 9.066832195323815e-05, + "loss": 3.978, + "step": 4382 + }, + { + "epoch": 0.2720839282388727, + "grad_norm": 0.4699019582432812, + "learning_rate": 9.068901303538176e-05, + "loss": 3.8697, + "step": 4383 + }, + { + "epoch": 0.2721460053386306, + "grad_norm": 0.5876600504039513, + "learning_rate": 9.070970411752535e-05, + "loss": 3.8004, + "step": 4384 + }, + { + "epoch": 0.27220808243838845, + "grad_norm": 0.8071039822532768, + "learning_rate": 9.073039519966895e-05, + "loss": 3.9758, + "step": 4385 + }, + { + "epoch": 0.2722701595381464, + "grad_norm": 0.5009497919793634, + "learning_rate": 9.075108628181254e-05, + "loss": 3.8519, + "step": 4386 + }, + { + "epoch": 0.2723322366379043, + "grad_norm": 0.502245480193806, + "learning_rate": 9.077177736395614e-05, + "loss": 3.9163, + "step": 4387 + }, + { + "epoch": 0.27239431373766215, + "grad_norm": 0.4592147249121331, + "learning_rate": 9.079246844609974e-05, + "loss": 3.9251, + "step": 4388 + }, + { + "epoch": 0.27245639083742007, + "grad_norm": 0.4086973492954802, + "learning_rate": 9.081315952824333e-05, + "loss": 3.9375, + "step": 4389 + }, + { + "epoch": 0.272518467937178, + "grad_norm": 0.6457219078992159, + "learning_rate": 9.083385061038693e-05, + "loss": 3.9256, + "step": 4390 + }, + { + "epoch": 0.27258054503693585, + "grad_norm": 0.3916222621555372, + "learning_rate": 9.085454169253052e-05, + "loss": 3.922, + "step": 4391 + }, + { + "epoch": 0.27264262213669377, + "grad_norm": 0.6948438629568912, + "learning_rate": 9.087523277467413e-05, + "loss": 3.9903, + "step": 4392 + }, + { + "epoch": 0.2727046992364517, + "grad_norm": 0.47114630764989307, + "learning_rate": 9.089592385681771e-05, + "loss": 3.8258, + "step": 4393 + }, + { + "epoch": 0.27276677633620955, + "grad_norm": 0.5809112115240038, + "learning_rate": 9.09166149389613e-05, + "loss": 3.8903, + "step": 4394 + }, + { + "epoch": 0.27282885343596747, + "grad_norm": 0.5151164697219894, + "learning_rate": 9.09373060211049e-05, + "loss": 3.9315, + "step": 4395 + }, + { + "epoch": 0.2728909305357254, + "grad_norm": 0.52278093182282, + "learning_rate": 9.095799710324851e-05, + "loss": 4.0229, + "step": 4396 + }, + { + "epoch": 0.27295300763548325, + "grad_norm": 0.3507599571409372, + "learning_rate": 9.09786881853921e-05, + "loss": 4.0039, + "step": 4397 + }, + { + "epoch": 0.27301508473524116, + "grad_norm": 0.31794661695600485, + "learning_rate": 9.09993792675357e-05, + "loss": 3.915, + "step": 4398 + }, + { + "epoch": 0.2730771618349991, + "grad_norm": 0.35562876359124657, + "learning_rate": 9.102007034967928e-05, + "loss": 3.8244, + "step": 4399 + }, + { + "epoch": 0.27313923893475694, + "grad_norm": 0.4218496032417279, + "learning_rate": 9.104076143182289e-05, + "loss": 3.9753, + "step": 4400 + }, + { + "epoch": 0.27320131603451486, + "grad_norm": 0.5377935594839824, + "learning_rate": 9.106145251396648e-05, + "loss": 3.9246, + "step": 4401 + }, + { + "epoch": 0.2732633931342728, + "grad_norm": 0.28789656282303017, + "learning_rate": 9.108214359611008e-05, + "loss": 3.9004, + "step": 4402 + }, + { + "epoch": 0.27332547023403064, + "grad_norm": 0.39285253546849286, + "learning_rate": 9.110283467825367e-05, + "loss": 3.8268, + "step": 4403 + }, + { + "epoch": 0.27338754733378856, + "grad_norm": 0.43629640824244775, + "learning_rate": 9.112352576039728e-05, + "loss": 3.819, + "step": 4404 + }, + { + "epoch": 0.2734496244335465, + "grad_norm": 0.33175483866672184, + "learning_rate": 9.114421684254086e-05, + "loss": 3.9896, + "step": 4405 + }, + { + "epoch": 0.27351170153330434, + "grad_norm": 0.39002093338417043, + "learning_rate": 9.116490792468446e-05, + "loss": 3.9337, + "step": 4406 + }, + { + "epoch": 0.27357377863306226, + "grad_norm": 0.2498339682829802, + "learning_rate": 9.118559900682806e-05, + "loss": 3.9468, + "step": 4407 + }, + { + "epoch": 0.2736358557328202, + "grad_norm": 0.38706480783999975, + "learning_rate": 9.120629008897166e-05, + "loss": 3.9095, + "step": 4408 + }, + { + "epoch": 0.27369793283257804, + "grad_norm": 0.2703068120721822, + "learning_rate": 9.122698117111526e-05, + "loss": 3.9165, + "step": 4409 + }, + { + "epoch": 0.27376000993233596, + "grad_norm": 0.2903257423846339, + "learning_rate": 9.124767225325885e-05, + "loss": 3.8909, + "step": 4410 + }, + { + "epoch": 0.2738220870320939, + "grad_norm": 0.3333531041043856, + "learning_rate": 9.126836333540245e-05, + "loss": 3.9679, + "step": 4411 + }, + { + "epoch": 0.27388416413185174, + "grad_norm": 0.453428380626042, + "learning_rate": 9.128905441754604e-05, + "loss": 3.846, + "step": 4412 + }, + { + "epoch": 0.27394624123160966, + "grad_norm": 0.411457486292615, + "learning_rate": 9.130974549968964e-05, + "loss": 3.8773, + "step": 4413 + }, + { + "epoch": 0.2740083183313676, + "grad_norm": 0.4911970891194887, + "learning_rate": 9.133043658183323e-05, + "loss": 3.9107, + "step": 4414 + }, + { + "epoch": 0.27407039543112544, + "grad_norm": 0.3247675330220936, + "learning_rate": 9.135112766397683e-05, + "loss": 3.8298, + "step": 4415 + }, + { + "epoch": 0.27413247253088335, + "grad_norm": 0.3290570509152392, + "learning_rate": 9.137181874612042e-05, + "loss": 3.8271, + "step": 4416 + }, + { + "epoch": 0.27419454963064127, + "grad_norm": 0.3316157827070541, + "learning_rate": 9.139250982826403e-05, + "loss": 3.8866, + "step": 4417 + }, + { + "epoch": 0.27425662673039913, + "grad_norm": 0.32512613654753814, + "learning_rate": 9.141320091040761e-05, + "loss": 3.8923, + "step": 4418 + }, + { + "epoch": 0.27431870383015705, + "grad_norm": 0.2968915884503115, + "learning_rate": 9.143389199255121e-05, + "loss": 3.9251, + "step": 4419 + }, + { + "epoch": 0.27438078092991497, + "grad_norm": 0.45010207859547563, + "learning_rate": 9.14545830746948e-05, + "loss": 3.9752, + "step": 4420 + }, + { + "epoch": 0.27444285802967283, + "grad_norm": 0.43057407468359576, + "learning_rate": 9.147527415683841e-05, + "loss": 3.779, + "step": 4421 + }, + { + "epoch": 0.27450493512943075, + "grad_norm": 0.49404827958838143, + "learning_rate": 9.149596523898201e-05, + "loss": 3.8543, + "step": 4422 + }, + { + "epoch": 0.27456701222918867, + "grad_norm": 0.4320514329331374, + "learning_rate": 9.15166563211256e-05, + "loss": 3.759, + "step": 4423 + }, + { + "epoch": 0.27462908932894653, + "grad_norm": 0.2928911233421028, + "learning_rate": 9.153734740326918e-05, + "loss": 3.8859, + "step": 4424 + }, + { + "epoch": 0.27469116642870445, + "grad_norm": 0.38475253475016064, + "learning_rate": 9.15580384854128e-05, + "loss": 3.796, + "step": 4425 + }, + { + "epoch": 0.27475324352846237, + "grad_norm": 0.41399357779907353, + "learning_rate": 9.157872956755639e-05, + "loss": 3.9675, + "step": 4426 + }, + { + "epoch": 0.2748153206282202, + "grad_norm": 0.25805565897918187, + "learning_rate": 9.159942064969998e-05, + "loss": 3.7713, + "step": 4427 + }, + { + "epoch": 0.27487739772797815, + "grad_norm": 0.4173885489655549, + "learning_rate": 9.162011173184358e-05, + "loss": 3.8417, + "step": 4428 + }, + { + "epoch": 0.27493947482773606, + "grad_norm": 0.38147525386620607, + "learning_rate": 9.164080281398719e-05, + "loss": 4.0194, + "step": 4429 + }, + { + "epoch": 0.2750015519274939, + "grad_norm": 0.30998533089216995, + "learning_rate": 9.166149389613077e-05, + "loss": 3.8714, + "step": 4430 + }, + { + "epoch": 0.27506362902725184, + "grad_norm": 0.34016610346388804, + "learning_rate": 9.168218497827436e-05, + "loss": 3.8739, + "step": 4431 + }, + { + "epoch": 0.27512570612700976, + "grad_norm": 0.31765789142265244, + "learning_rate": 9.170287606041796e-05, + "loss": 3.8658, + "step": 4432 + }, + { + "epoch": 0.2751877832267676, + "grad_norm": 0.32179474385614637, + "learning_rate": 9.172356714256157e-05, + "loss": 3.9677, + "step": 4433 + }, + { + "epoch": 0.27524986032652554, + "grad_norm": 0.2657374536669367, + "learning_rate": 9.174425822470516e-05, + "loss": 3.9106, + "step": 4434 + }, + { + "epoch": 0.27531193742628346, + "grad_norm": 0.3104061911669397, + "learning_rate": 9.176494930684876e-05, + "loss": 3.8535, + "step": 4435 + }, + { + "epoch": 0.2753740145260413, + "grad_norm": 0.3267527450469566, + "learning_rate": 9.178564038899234e-05, + "loss": 3.9276, + "step": 4436 + }, + { + "epoch": 0.27543609162579924, + "grad_norm": 0.3617302178033635, + "learning_rate": 9.180633147113595e-05, + "loss": 3.9319, + "step": 4437 + }, + { + "epoch": 0.27549816872555716, + "grad_norm": 0.30035441940641666, + "learning_rate": 9.182702255327954e-05, + "loss": 3.9273, + "step": 4438 + }, + { + "epoch": 0.275560245825315, + "grad_norm": 0.3961128034655096, + "learning_rate": 9.184771363542314e-05, + "loss": 3.99, + "step": 4439 + }, + { + "epoch": 0.27562232292507294, + "grad_norm": 0.4752278634684596, + "learning_rate": 9.186840471756673e-05, + "loss": 4.0297, + "step": 4440 + }, + { + "epoch": 0.27568440002483086, + "grad_norm": 0.35154714550820215, + "learning_rate": 9.188909579971033e-05, + "loss": 3.8805, + "step": 4441 + }, + { + "epoch": 0.2757464771245887, + "grad_norm": 0.26954761400166255, + "learning_rate": 9.190978688185392e-05, + "loss": 3.9241, + "step": 4442 + }, + { + "epoch": 0.27580855422434664, + "grad_norm": 0.4219380614161553, + "learning_rate": 9.193047796399752e-05, + "loss": 3.8599, + "step": 4443 + }, + { + "epoch": 0.27587063132410455, + "grad_norm": 0.5262203825728151, + "learning_rate": 9.195116904614111e-05, + "loss": 3.8743, + "step": 4444 + }, + { + "epoch": 0.2759327084238624, + "grad_norm": 0.3758960786357157, + "learning_rate": 9.197186012828471e-05, + "loss": 3.7998, + "step": 4445 + }, + { + "epoch": 0.27599478552362033, + "grad_norm": 0.398372794488428, + "learning_rate": 9.199255121042832e-05, + "loss": 3.8985, + "step": 4446 + }, + { + "epoch": 0.27605686262337825, + "grad_norm": 0.2672225510879281, + "learning_rate": 9.201324229257191e-05, + "loss": 3.8625, + "step": 4447 + }, + { + "epoch": 0.2761189397231361, + "grad_norm": 0.5570816969532449, + "learning_rate": 9.20339333747155e-05, + "loss": 3.8113, + "step": 4448 + }, + { + "epoch": 0.27618101682289403, + "grad_norm": 0.5256747076908808, + "learning_rate": 9.205462445685909e-05, + "loss": 3.9137, + "step": 4449 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 0.36400323311711297, + "learning_rate": 9.20753155390027e-05, + "loss": 3.809, + "step": 4450 + }, + { + "epoch": 0.2763051710224098, + "grad_norm": 0.38548051674661443, + "learning_rate": 9.209600662114629e-05, + "loss": 3.8514, + "step": 4451 + }, + { + "epoch": 0.27636724812216773, + "grad_norm": 0.4296802693538908, + "learning_rate": 9.211669770328989e-05, + "loss": 4.0501, + "step": 4452 + }, + { + "epoch": 0.27642932522192565, + "grad_norm": 0.4485880197061374, + "learning_rate": 9.213738878543348e-05, + "loss": 3.9926, + "step": 4453 + }, + { + "epoch": 0.2764914023216835, + "grad_norm": 0.37371648239467253, + "learning_rate": 9.215807986757708e-05, + "loss": 3.9601, + "step": 4454 + }, + { + "epoch": 0.27655347942144143, + "grad_norm": 0.40494864444319734, + "learning_rate": 9.217877094972067e-05, + "loss": 3.9398, + "step": 4455 + }, + { + "epoch": 0.27661555652119935, + "grad_norm": 0.33388859228661205, + "learning_rate": 9.219946203186427e-05, + "loss": 3.9581, + "step": 4456 + }, + { + "epoch": 0.2766776336209572, + "grad_norm": 0.4890893429101886, + "learning_rate": 9.222015311400786e-05, + "loss": 3.8931, + "step": 4457 + }, + { + "epoch": 0.2767397107207151, + "grad_norm": 0.5413328542129779, + "learning_rate": 9.224084419615147e-05, + "loss": 4.0326, + "step": 4458 + }, + { + "epoch": 0.27680178782047304, + "grad_norm": 0.5035749425639845, + "learning_rate": 9.226153527829507e-05, + "loss": 3.8643, + "step": 4459 + }, + { + "epoch": 0.2768638649202309, + "grad_norm": 0.6591467729050953, + "learning_rate": 9.228222636043865e-05, + "loss": 3.9908, + "step": 4460 + }, + { + "epoch": 0.2769259420199888, + "grad_norm": 0.7713267877644998, + "learning_rate": 9.230291744258224e-05, + "loss": 3.9245, + "step": 4461 + }, + { + "epoch": 0.27698801911974674, + "grad_norm": 0.45882034617331346, + "learning_rate": 9.232360852472585e-05, + "loss": 3.883, + "step": 4462 + }, + { + "epoch": 0.2770500962195046, + "grad_norm": 0.6636096532257504, + "learning_rate": 9.234429960686945e-05, + "loss": 3.9261, + "step": 4463 + }, + { + "epoch": 0.2771121733192625, + "grad_norm": 0.46902689702720035, + "learning_rate": 9.236499068901304e-05, + "loss": 3.8967, + "step": 4464 + }, + { + "epoch": 0.27717425041902044, + "grad_norm": 0.3296409058752969, + "learning_rate": 9.238568177115664e-05, + "loss": 3.8401, + "step": 4465 + }, + { + "epoch": 0.2772363275187783, + "grad_norm": 0.42770416989214693, + "learning_rate": 9.240637285330023e-05, + "loss": 3.8844, + "step": 4466 + }, + { + "epoch": 0.2772984046185362, + "grad_norm": 0.4036856006188424, + "learning_rate": 9.242706393544383e-05, + "loss": 3.9367, + "step": 4467 + }, + { + "epoch": 0.27736048171829414, + "grad_norm": 0.4891802484548256, + "learning_rate": 9.244775501758742e-05, + "loss": 3.7837, + "step": 4468 + }, + { + "epoch": 0.277422558818052, + "grad_norm": 0.5809649229836219, + "learning_rate": 9.246844609973102e-05, + "loss": 3.9505, + "step": 4469 + }, + { + "epoch": 0.2774846359178099, + "grad_norm": 0.4872792300376499, + "learning_rate": 9.248913718187461e-05, + "loss": 3.8678, + "step": 4470 + }, + { + "epoch": 0.27754671301756784, + "grad_norm": 0.48370866165174087, + "learning_rate": 9.250982826401822e-05, + "loss": 3.8528, + "step": 4471 + }, + { + "epoch": 0.2776087901173257, + "grad_norm": 0.5024034728623437, + "learning_rate": 9.25305193461618e-05, + "loss": 3.963, + "step": 4472 + }, + { + "epoch": 0.2776708672170836, + "grad_norm": 0.3664951289869372, + "learning_rate": 9.25512104283054e-05, + "loss": 3.8723, + "step": 4473 + }, + { + "epoch": 0.27773294431684153, + "grad_norm": 0.36335704381962236, + "learning_rate": 9.257190151044899e-05, + "loss": 3.8158, + "step": 4474 + }, + { + "epoch": 0.2777950214165994, + "grad_norm": 0.41030352536894366, + "learning_rate": 9.25925925925926e-05, + "loss": 3.934, + "step": 4475 + }, + { + "epoch": 0.2778570985163573, + "grad_norm": 0.6426434701084297, + "learning_rate": 9.26132836747362e-05, + "loss": 3.8242, + "step": 4476 + }, + { + "epoch": 0.27791917561611523, + "grad_norm": 0.47664658789538367, + "learning_rate": 9.263397475687979e-05, + "loss": 3.8108, + "step": 4477 + }, + { + "epoch": 0.2779812527158731, + "grad_norm": 0.6888991553706322, + "learning_rate": 9.265466583902339e-05, + "loss": 3.7726, + "step": 4478 + }, + { + "epoch": 0.278043329815631, + "grad_norm": 0.8253586105297788, + "learning_rate": 9.267535692116698e-05, + "loss": 4.0322, + "step": 4479 + }, + { + "epoch": 0.27810540691538893, + "grad_norm": 0.5764551533164529, + "learning_rate": 9.269604800331058e-05, + "loss": 3.8695, + "step": 4480 + }, + { + "epoch": 0.2781674840151468, + "grad_norm": 0.4440422848389365, + "learning_rate": 9.271673908545417e-05, + "loss": 3.8274, + "step": 4481 + }, + { + "epoch": 0.2782295611149047, + "grad_norm": 0.42280793350358914, + "learning_rate": 9.273743016759777e-05, + "loss": 3.9114, + "step": 4482 + }, + { + "epoch": 0.27829163821466263, + "grad_norm": 0.4831495781157739, + "learning_rate": 9.275812124974138e-05, + "loss": 3.8752, + "step": 4483 + }, + { + "epoch": 0.2783537153144205, + "grad_norm": 0.3142385165097301, + "learning_rate": 9.277881233188496e-05, + "loss": 3.8267, + "step": 4484 + }, + { + "epoch": 0.2784157924141784, + "grad_norm": 0.3715673135456881, + "learning_rate": 9.279950341402855e-05, + "loss": 3.8061, + "step": 4485 + }, + { + "epoch": 0.2784778695139363, + "grad_norm": 0.3350340538188673, + "learning_rate": 9.282019449617215e-05, + "loss": 3.8012, + "step": 4486 + }, + { + "epoch": 0.2785399466136942, + "grad_norm": 0.4154050761448529, + "learning_rate": 9.284088557831576e-05, + "loss": 3.8752, + "step": 4487 + }, + { + "epoch": 0.2786020237134521, + "grad_norm": 0.39298434141258337, + "learning_rate": 9.286157666045935e-05, + "loss": 3.9376, + "step": 4488 + }, + { + "epoch": 0.27866410081321, + "grad_norm": 0.3260673309722895, + "learning_rate": 9.288226774260295e-05, + "loss": 3.9096, + "step": 4489 + }, + { + "epoch": 0.2787261779129679, + "grad_norm": 0.2964836728737299, + "learning_rate": 9.290295882474653e-05, + "loss": 3.8015, + "step": 4490 + }, + { + "epoch": 0.2787882550127258, + "grad_norm": 0.4177213453547686, + "learning_rate": 9.292364990689014e-05, + "loss": 3.7932, + "step": 4491 + }, + { + "epoch": 0.2788503321124837, + "grad_norm": 0.5972324938469905, + "learning_rate": 9.294434098903373e-05, + "loss": 3.9244, + "step": 4492 + }, + { + "epoch": 0.2789124092122416, + "grad_norm": 0.4133418307405107, + "learning_rate": 9.296503207117733e-05, + "loss": 3.8215, + "step": 4493 + }, + { + "epoch": 0.2789744863119995, + "grad_norm": 0.3736306261715531, + "learning_rate": 9.298572315332092e-05, + "loss": 3.9081, + "step": 4494 + }, + { + "epoch": 0.2790365634117574, + "grad_norm": 0.44575526996456194, + "learning_rate": 9.300641423546452e-05, + "loss": 3.8646, + "step": 4495 + }, + { + "epoch": 0.2790986405115153, + "grad_norm": 0.41165944709251606, + "learning_rate": 9.302710531760813e-05, + "loss": 3.8447, + "step": 4496 + }, + { + "epoch": 0.2791607176112732, + "grad_norm": 0.4285836867299416, + "learning_rate": 9.304779639975171e-05, + "loss": 3.9726, + "step": 4497 + }, + { + "epoch": 0.2792227947110311, + "grad_norm": 0.34040639056769556, + "learning_rate": 9.30684874818953e-05, + "loss": 3.8347, + "step": 4498 + }, + { + "epoch": 0.279284871810789, + "grad_norm": 0.33293105104895165, + "learning_rate": 9.30891785640389e-05, + "loss": 3.8802, + "step": 4499 + }, + { + "epoch": 0.2793469489105469, + "grad_norm": 0.4222738711586785, + "learning_rate": 9.31098696461825e-05, + "loss": 3.9072, + "step": 4500 + }, + { + "epoch": 0.2794090260103048, + "grad_norm": 0.4417146095160087, + "learning_rate": 9.31305607283261e-05, + "loss": 3.9063, + "step": 4501 + }, + { + "epoch": 0.2794711031100627, + "grad_norm": 0.28633178458796, + "learning_rate": 9.31512518104697e-05, + "loss": 3.906, + "step": 4502 + }, + { + "epoch": 0.2795331802098206, + "grad_norm": 0.30715902208130474, + "learning_rate": 9.317194289261329e-05, + "loss": 3.8172, + "step": 4503 + }, + { + "epoch": 0.2795952573095785, + "grad_norm": 0.38725722998930984, + "learning_rate": 9.319263397475689e-05, + "loss": 3.9236, + "step": 4504 + }, + { + "epoch": 0.2796573344093364, + "grad_norm": 0.6092353073809179, + "learning_rate": 9.321332505690048e-05, + "loss": 3.9101, + "step": 4505 + }, + { + "epoch": 0.2797194115090943, + "grad_norm": 0.505173737694917, + "learning_rate": 9.323401613904408e-05, + "loss": 3.8457, + "step": 4506 + }, + { + "epoch": 0.2797814886088522, + "grad_norm": 0.5159106474831152, + "learning_rate": 9.325470722118767e-05, + "loss": 3.937, + "step": 4507 + }, + { + "epoch": 0.2798435657086101, + "grad_norm": 0.5226196333256783, + "learning_rate": 9.327539830333128e-05, + "loss": 3.8721, + "step": 4508 + }, + { + "epoch": 0.279905642808368, + "grad_norm": 0.6615075496075535, + "learning_rate": 9.329608938547486e-05, + "loss": 3.9061, + "step": 4509 + }, + { + "epoch": 0.2799677199081259, + "grad_norm": 0.4431443722395221, + "learning_rate": 9.331678046761846e-05, + "loss": 3.9138, + "step": 4510 + }, + { + "epoch": 0.2800297970078838, + "grad_norm": 0.35335377945350305, + "learning_rate": 9.333747154976205e-05, + "loss": 3.7715, + "step": 4511 + }, + { + "epoch": 0.2800918741076417, + "grad_norm": 0.31525337086372063, + "learning_rate": 9.335816263190566e-05, + "loss": 3.8583, + "step": 4512 + }, + { + "epoch": 0.2801539512073996, + "grad_norm": 0.28346065782030067, + "learning_rate": 9.337885371404926e-05, + "loss": 4.0986, + "step": 4513 + }, + { + "epoch": 0.28021602830715747, + "grad_norm": 0.29837038473835586, + "learning_rate": 9.339954479619285e-05, + "loss": 3.968, + "step": 4514 + }, + { + "epoch": 0.2802781054069154, + "grad_norm": 0.2738134252327527, + "learning_rate": 9.342023587833643e-05, + "loss": 3.8745, + "step": 4515 + }, + { + "epoch": 0.2803401825066733, + "grad_norm": 0.2541948439008757, + "learning_rate": 9.344092696048004e-05, + "loss": 3.9952, + "step": 4516 + }, + { + "epoch": 0.28040225960643117, + "grad_norm": 0.3099672558434363, + "learning_rate": 9.346161804262364e-05, + "loss": 3.9809, + "step": 4517 + }, + { + "epoch": 0.2804643367061891, + "grad_norm": 0.3553580017278004, + "learning_rate": 9.348230912476723e-05, + "loss": 4.0551, + "step": 4518 + }, + { + "epoch": 0.280526413805947, + "grad_norm": 0.38022315854297534, + "learning_rate": 9.350300020691083e-05, + "loss": 3.8879, + "step": 4519 + }, + { + "epoch": 0.28058849090570487, + "grad_norm": 0.35903325624957105, + "learning_rate": 9.352369128905442e-05, + "loss": 3.862, + "step": 4520 + }, + { + "epoch": 0.2806505680054628, + "grad_norm": 0.40625271429475607, + "learning_rate": 9.354438237119802e-05, + "loss": 3.7932, + "step": 4521 + }, + { + "epoch": 0.2807126451052207, + "grad_norm": 0.3363131596229662, + "learning_rate": 9.356507345334161e-05, + "loss": 3.9138, + "step": 4522 + }, + { + "epoch": 0.28077472220497857, + "grad_norm": 0.34716071484763, + "learning_rate": 9.35857645354852e-05, + "loss": 3.8336, + "step": 4523 + }, + { + "epoch": 0.2808367993047365, + "grad_norm": 0.48409852474419685, + "learning_rate": 9.36064556176288e-05, + "loss": 3.9183, + "step": 4524 + }, + { + "epoch": 0.2808988764044944, + "grad_norm": 0.387993666532038, + "learning_rate": 9.362714669977241e-05, + "loss": 3.8867, + "step": 4525 + }, + { + "epoch": 0.28096095350425226, + "grad_norm": 0.3819690091970907, + "learning_rate": 9.3647837781916e-05, + "loss": 3.8611, + "step": 4526 + }, + { + "epoch": 0.2810230306040102, + "grad_norm": 0.34816439065026467, + "learning_rate": 9.366852886405959e-05, + "loss": 3.9166, + "step": 4527 + }, + { + "epoch": 0.2810851077037681, + "grad_norm": 0.27466883659971647, + "learning_rate": 9.36892199462032e-05, + "loss": 3.9034, + "step": 4528 + }, + { + "epoch": 0.28114718480352596, + "grad_norm": 0.34936421072308105, + "learning_rate": 9.370991102834679e-05, + "loss": 3.8269, + "step": 4529 + }, + { + "epoch": 0.2812092619032839, + "grad_norm": 0.3145262236366457, + "learning_rate": 9.373060211049038e-05, + "loss": 3.7435, + "step": 4530 + }, + { + "epoch": 0.2812713390030418, + "grad_norm": 0.3665601768972893, + "learning_rate": 9.375129319263398e-05, + "loss": 3.908, + "step": 4531 + }, + { + "epoch": 0.28133341610279966, + "grad_norm": 0.26330661198742067, + "learning_rate": 9.377198427477758e-05, + "loss": 3.8535, + "step": 4532 + }, + { + "epoch": 0.2813954932025576, + "grad_norm": 0.3009425882402496, + "learning_rate": 9.379267535692117e-05, + "loss": 3.9474, + "step": 4533 + }, + { + "epoch": 0.2814575703023155, + "grad_norm": 0.33998450291251336, + "learning_rate": 9.381336643906477e-05, + "loss": 3.9165, + "step": 4534 + }, + { + "epoch": 0.28151964740207336, + "grad_norm": 0.2706834381343087, + "learning_rate": 9.383405752120836e-05, + "loss": 3.8745, + "step": 4535 + }, + { + "epoch": 0.2815817245018313, + "grad_norm": 0.22477546853318003, + "learning_rate": 9.385474860335196e-05, + "loss": 3.8852, + "step": 4536 + }, + { + "epoch": 0.2816438016015892, + "grad_norm": 0.24017078829698094, + "learning_rate": 9.387543968549556e-05, + "loss": 3.8839, + "step": 4537 + }, + { + "epoch": 0.28170587870134706, + "grad_norm": 0.24109421349886967, + "learning_rate": 9.389613076763916e-05, + "loss": 3.8808, + "step": 4538 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 0.20894254283599104, + "learning_rate": 9.391682184978274e-05, + "loss": 3.8406, + "step": 4539 + }, + { + "epoch": 0.2818300329008629, + "grad_norm": 0.4149250547114478, + "learning_rate": 9.393751293192634e-05, + "loss": 3.9153, + "step": 4540 + }, + { + "epoch": 0.28189211000062075, + "grad_norm": 0.29018481923807554, + "learning_rate": 9.395820401406994e-05, + "loss": 3.8062, + "step": 4541 + }, + { + "epoch": 0.2819541871003787, + "grad_norm": 0.3118923000514118, + "learning_rate": 9.397889509621354e-05, + "loss": 3.8344, + "step": 4542 + }, + { + "epoch": 0.2820162642001366, + "grad_norm": 0.2930037541140827, + "learning_rate": 9.399958617835713e-05, + "loss": 3.8856, + "step": 4543 + }, + { + "epoch": 0.28207834129989445, + "grad_norm": 0.3844040851908848, + "learning_rate": 9.402027726050073e-05, + "loss": 3.8468, + "step": 4544 + }, + { + "epoch": 0.28214041839965237, + "grad_norm": 0.489242827591443, + "learning_rate": 9.404096834264432e-05, + "loss": 3.7814, + "step": 4545 + }, + { + "epoch": 0.2822024954994103, + "grad_norm": 0.23540701944845024, + "learning_rate": 9.406165942478792e-05, + "loss": 3.8629, + "step": 4546 + }, + { + "epoch": 0.28226457259916815, + "grad_norm": 0.43702157291831617, + "learning_rate": 9.408235050693151e-05, + "loss": 3.8091, + "step": 4547 + }, + { + "epoch": 0.28232664969892607, + "grad_norm": 0.3434723686587816, + "learning_rate": 9.410304158907511e-05, + "loss": 3.8048, + "step": 4548 + }, + { + "epoch": 0.282388726798684, + "grad_norm": 0.27376631992936185, + "learning_rate": 9.412373267121872e-05, + "loss": 3.8538, + "step": 4549 + }, + { + "epoch": 0.28245080389844185, + "grad_norm": 0.5587079925022095, + "learning_rate": 9.414442375336231e-05, + "loss": 3.9282, + "step": 4550 + }, + { + "epoch": 0.28251288099819977, + "grad_norm": 0.4825387110523966, + "learning_rate": 9.41651148355059e-05, + "loss": 3.8821, + "step": 4551 + }, + { + "epoch": 0.2825749580979577, + "grad_norm": 0.4272975345706498, + "learning_rate": 9.418580591764949e-05, + "loss": 3.7901, + "step": 4552 + }, + { + "epoch": 0.28263703519771555, + "grad_norm": 0.4423596037109232, + "learning_rate": 9.42064969997931e-05, + "loss": 3.8651, + "step": 4553 + }, + { + "epoch": 0.28269911229747346, + "grad_norm": 0.651933986778327, + "learning_rate": 9.42271880819367e-05, + "loss": 3.7878, + "step": 4554 + }, + { + "epoch": 0.2827611893972314, + "grad_norm": 0.6325393369753358, + "learning_rate": 9.424787916408029e-05, + "loss": 3.8473, + "step": 4555 + }, + { + "epoch": 0.28282326649698925, + "grad_norm": 0.4146496550388941, + "learning_rate": 9.426857024622388e-05, + "loss": 3.9389, + "step": 4556 + }, + { + "epoch": 0.28288534359674716, + "grad_norm": 0.3776193177263844, + "learning_rate": 9.428926132836748e-05, + "loss": 3.7887, + "step": 4557 + }, + { + "epoch": 0.2829474206965051, + "grad_norm": 0.7023995577797225, + "learning_rate": 9.430995241051107e-05, + "loss": 3.9388, + "step": 4558 + }, + { + "epoch": 0.28300949779626294, + "grad_norm": 0.7002655377146958, + "learning_rate": 9.433064349265467e-05, + "loss": 3.888, + "step": 4559 + }, + { + "epoch": 0.28307157489602086, + "grad_norm": 0.38020546359019947, + "learning_rate": 9.435133457479826e-05, + "loss": 3.8573, + "step": 4560 + }, + { + "epoch": 0.2831336519957788, + "grad_norm": 0.665685040268917, + "learning_rate": 9.437202565694186e-05, + "loss": 3.813, + "step": 4561 + }, + { + "epoch": 0.28319572909553664, + "grad_norm": 0.48893890342525614, + "learning_rate": 9.439271673908547e-05, + "loss": 3.818, + "step": 4562 + }, + { + "epoch": 0.28325780619529456, + "grad_norm": 0.42969776488094136, + "learning_rate": 9.441340782122905e-05, + "loss": 3.7789, + "step": 4563 + }, + { + "epoch": 0.2833198832950525, + "grad_norm": 0.5331632997500514, + "learning_rate": 9.443409890337264e-05, + "loss": 3.8321, + "step": 4564 + }, + { + "epoch": 0.28338196039481034, + "grad_norm": 0.402566089120781, + "learning_rate": 9.445478998551624e-05, + "loss": 3.8078, + "step": 4565 + }, + { + "epoch": 0.28344403749456826, + "grad_norm": 0.5107315675058044, + "learning_rate": 9.447548106765985e-05, + "loss": 3.8075, + "step": 4566 + }, + { + "epoch": 0.2835061145943262, + "grad_norm": 0.49716025327571384, + "learning_rate": 9.449617214980344e-05, + "loss": 3.7979, + "step": 4567 + }, + { + "epoch": 0.28356819169408404, + "grad_norm": 0.3789936043632488, + "learning_rate": 9.451686323194704e-05, + "loss": 3.822, + "step": 4568 + }, + { + "epoch": 0.28363026879384196, + "grad_norm": 0.5149792181840331, + "learning_rate": 9.453755431409062e-05, + "loss": 3.911, + "step": 4569 + }, + { + "epoch": 0.2836923458935999, + "grad_norm": 0.3959402475359577, + "learning_rate": 9.455824539623423e-05, + "loss": 3.8731, + "step": 4570 + }, + { + "epoch": 0.28375442299335774, + "grad_norm": 0.38836279997066453, + "learning_rate": 9.457893647837782e-05, + "loss": 3.8942, + "step": 4571 + }, + { + "epoch": 0.28381650009311565, + "grad_norm": 0.355432277946285, + "learning_rate": 9.459962756052142e-05, + "loss": 3.8099, + "step": 4572 + }, + { + "epoch": 0.28387857719287357, + "grad_norm": 0.400430463344145, + "learning_rate": 9.462031864266501e-05, + "loss": 3.989, + "step": 4573 + }, + { + "epoch": 0.28394065429263143, + "grad_norm": 0.5529507594268852, + "learning_rate": 9.464100972480862e-05, + "loss": 3.8948, + "step": 4574 + }, + { + "epoch": 0.28400273139238935, + "grad_norm": 0.5661603985983074, + "learning_rate": 9.46617008069522e-05, + "loss": 3.8547, + "step": 4575 + }, + { + "epoch": 0.28406480849214727, + "grad_norm": 0.41729319872918497, + "learning_rate": 9.46823918890958e-05, + "loss": 3.8705, + "step": 4576 + }, + { + "epoch": 0.28412688559190513, + "grad_norm": 0.3257214983110775, + "learning_rate": 9.47030829712394e-05, + "loss": 3.8071, + "step": 4577 + }, + { + "epoch": 0.28418896269166305, + "grad_norm": 0.3207203664106121, + "learning_rate": 9.4723774053383e-05, + "loss": 3.8424, + "step": 4578 + }, + { + "epoch": 0.28425103979142097, + "grad_norm": 0.5010232587619712, + "learning_rate": 9.47444651355266e-05, + "loss": 3.9444, + "step": 4579 + }, + { + "epoch": 0.28431311689117883, + "grad_norm": 0.487458061590667, + "learning_rate": 9.476515621767019e-05, + "loss": 3.8617, + "step": 4580 + }, + { + "epoch": 0.28437519399093675, + "grad_norm": 0.34376148724869215, + "learning_rate": 9.478584729981379e-05, + "loss": 3.9385, + "step": 4581 + }, + { + "epoch": 0.28443727109069467, + "grad_norm": 0.6958523915206908, + "learning_rate": 9.480653838195738e-05, + "loss": 3.8694, + "step": 4582 + }, + { + "epoch": 0.28449934819045253, + "grad_norm": 0.6045566210201131, + "learning_rate": 9.482722946410098e-05, + "loss": 3.8172, + "step": 4583 + }, + { + "epoch": 0.28456142529021045, + "grad_norm": 0.45242067384634505, + "learning_rate": 9.484792054624457e-05, + "loss": 3.932, + "step": 4584 + }, + { + "epoch": 0.28462350238996836, + "grad_norm": 0.6426259170893722, + "learning_rate": 9.486861162838817e-05, + "loss": 3.9334, + "step": 4585 + }, + { + "epoch": 0.2846855794897262, + "grad_norm": 0.3544529033993992, + "learning_rate": 9.488930271053176e-05, + "loss": 3.8743, + "step": 4586 + }, + { + "epoch": 0.28474765658948414, + "grad_norm": 0.47853477595406696, + "learning_rate": 9.490999379267537e-05, + "loss": 3.8979, + "step": 4587 + }, + { + "epoch": 0.28480973368924206, + "grad_norm": 0.5852196683246519, + "learning_rate": 9.493068487481895e-05, + "loss": 3.8854, + "step": 4588 + }, + { + "epoch": 0.2848718107889999, + "grad_norm": 0.3304726088639003, + "learning_rate": 9.495137595696255e-05, + "loss": 3.8836, + "step": 4589 + }, + { + "epoch": 0.28493388788875784, + "grad_norm": 0.5713660555089157, + "learning_rate": 9.497206703910614e-05, + "loss": 3.8189, + "step": 4590 + }, + { + "epoch": 0.28499596498851576, + "grad_norm": 0.3630862584351203, + "learning_rate": 9.499275812124975e-05, + "loss": 3.93, + "step": 4591 + }, + { + "epoch": 0.2850580420882736, + "grad_norm": 0.45745736822456234, + "learning_rate": 9.501344920339335e-05, + "loss": 3.8804, + "step": 4592 + }, + { + "epoch": 0.28512011918803154, + "grad_norm": 0.26560821199658996, + "learning_rate": 9.503414028553694e-05, + "loss": 3.8818, + "step": 4593 + }, + { + "epoch": 0.28518219628778946, + "grad_norm": 0.44389349739530826, + "learning_rate": 9.505483136768052e-05, + "loss": 3.8714, + "step": 4594 + }, + { + "epoch": 0.2852442733875473, + "grad_norm": 0.6136172886494268, + "learning_rate": 9.507552244982413e-05, + "loss": 3.8931, + "step": 4595 + }, + { + "epoch": 0.28530635048730524, + "grad_norm": 0.39707731436857324, + "learning_rate": 9.509621353196773e-05, + "loss": 3.8773, + "step": 4596 + }, + { + "epoch": 0.28536842758706316, + "grad_norm": 0.689160617235226, + "learning_rate": 9.511690461411132e-05, + "loss": 3.9366, + "step": 4597 + }, + { + "epoch": 0.285430504686821, + "grad_norm": 0.5351265947798891, + "learning_rate": 9.513759569625492e-05, + "loss": 3.8562, + "step": 4598 + }, + { + "epoch": 0.28549258178657894, + "grad_norm": 0.39784653163719835, + "learning_rate": 9.515828677839853e-05, + "loss": 3.856, + "step": 4599 + }, + { + "epoch": 0.28555465888633685, + "grad_norm": 0.7773616156130477, + "learning_rate": 9.517897786054211e-05, + "loss": 3.8045, + "step": 4600 + }, + { + "epoch": 0.2856167359860947, + "grad_norm": 0.5908995772226124, + "learning_rate": 9.51996689426857e-05, + "loss": 3.8991, + "step": 4601 + }, + { + "epoch": 0.28567881308585263, + "grad_norm": 0.5353536589789162, + "learning_rate": 9.52203600248293e-05, + "loss": 3.7852, + "step": 4602 + }, + { + "epoch": 0.28574089018561055, + "grad_norm": 0.46922589622134336, + "learning_rate": 9.524105110697291e-05, + "loss": 3.8112, + "step": 4603 + }, + { + "epoch": 0.2858029672853684, + "grad_norm": 0.3818971628728475, + "learning_rate": 9.52617421891165e-05, + "loss": 3.8492, + "step": 4604 + }, + { + "epoch": 0.28586504438512633, + "grad_norm": 0.4852615057982946, + "learning_rate": 9.52824332712601e-05, + "loss": 3.9495, + "step": 4605 + }, + { + "epoch": 0.28592712148488425, + "grad_norm": 0.35509989538483944, + "learning_rate": 9.530312435340368e-05, + "loss": 3.9235, + "step": 4606 + }, + { + "epoch": 0.2859891985846421, + "grad_norm": 0.39210586230096817, + "learning_rate": 9.532381543554729e-05, + "loss": 3.9862, + "step": 4607 + }, + { + "epoch": 0.28605127568440003, + "grad_norm": 0.32831984366179784, + "learning_rate": 9.534450651769088e-05, + "loss": 3.748, + "step": 4608 + }, + { + "epoch": 0.28611335278415795, + "grad_norm": 0.3845895277098249, + "learning_rate": 9.536519759983448e-05, + "loss": 3.886, + "step": 4609 + }, + { + "epoch": 0.2861754298839158, + "grad_norm": 0.2663987894216159, + "learning_rate": 9.538588868197807e-05, + "loss": 3.8355, + "step": 4610 + }, + { + "epoch": 0.28623750698367373, + "grad_norm": 0.3769463256855211, + "learning_rate": 9.540657976412167e-05, + "loss": 3.9276, + "step": 4611 + }, + { + "epoch": 0.28629958408343165, + "grad_norm": 0.2771093828939604, + "learning_rate": 9.542727084626526e-05, + "loss": 3.9417, + "step": 4612 + }, + { + "epoch": 0.2863616611831895, + "grad_norm": 0.2621192572105134, + "learning_rate": 9.544796192840886e-05, + "loss": 3.8267, + "step": 4613 + }, + { + "epoch": 0.2864237382829474, + "grad_norm": 0.3176498815308105, + "learning_rate": 9.546865301055245e-05, + "loss": 3.8664, + "step": 4614 + }, + { + "epoch": 0.28648581538270534, + "grad_norm": 0.3014717721590879, + "learning_rate": 9.548934409269605e-05, + "loss": 3.7766, + "step": 4615 + }, + { + "epoch": 0.2865478924824632, + "grad_norm": 0.31506757944148445, + "learning_rate": 9.551003517483966e-05, + "loss": 3.9226, + "step": 4616 + }, + { + "epoch": 0.2866099695822211, + "grad_norm": 0.3120841836132559, + "learning_rate": 9.553072625698325e-05, + "loss": 3.8524, + "step": 4617 + }, + { + "epoch": 0.28667204668197904, + "grad_norm": 0.2252441723796856, + "learning_rate": 9.555141733912683e-05, + "loss": 3.9097, + "step": 4618 + }, + { + "epoch": 0.2867341237817369, + "grad_norm": 0.3109995823133675, + "learning_rate": 9.557210842127043e-05, + "loss": 3.8276, + "step": 4619 + }, + { + "epoch": 0.2867962008814948, + "grad_norm": 0.2721355094483627, + "learning_rate": 9.559279950341404e-05, + "loss": 3.7824, + "step": 4620 + }, + { + "epoch": 0.28685827798125274, + "grad_norm": 0.2631472702297201, + "learning_rate": 9.561349058555763e-05, + "loss": 3.8299, + "step": 4621 + }, + { + "epoch": 0.2869203550810106, + "grad_norm": 0.2453703617691409, + "learning_rate": 9.563418166770123e-05, + "loss": 3.84, + "step": 4622 + }, + { + "epoch": 0.2869824321807685, + "grad_norm": 0.2799434114433732, + "learning_rate": 9.565487274984482e-05, + "loss": 3.8064, + "step": 4623 + }, + { + "epoch": 0.28704450928052644, + "grad_norm": 0.3487087562401367, + "learning_rate": 9.567556383198842e-05, + "loss": 3.7842, + "step": 4624 + }, + { + "epoch": 0.2871065863802843, + "grad_norm": 0.36339889180256396, + "learning_rate": 9.569625491413201e-05, + "loss": 3.7709, + "step": 4625 + }, + { + "epoch": 0.2871686634800422, + "grad_norm": 0.2863667023654253, + "learning_rate": 9.571694599627561e-05, + "loss": 3.7392, + "step": 4626 + }, + { + "epoch": 0.28723074057980014, + "grad_norm": 0.32278372991818566, + "learning_rate": 9.57376370784192e-05, + "loss": 3.8658, + "step": 4627 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 0.3089446957425646, + "learning_rate": 9.575832816056281e-05, + "loss": 3.7904, + "step": 4628 + }, + { + "epoch": 0.2873548947793159, + "grad_norm": 0.3369940952942574, + "learning_rate": 9.57790192427064e-05, + "loss": 3.8038, + "step": 4629 + }, + { + "epoch": 0.28741697187907383, + "grad_norm": 0.3114145620818696, + "learning_rate": 9.579971032484999e-05, + "loss": 3.7593, + "step": 4630 + }, + { + "epoch": 0.2874790489788317, + "grad_norm": 0.29986548040385275, + "learning_rate": 9.582040140699358e-05, + "loss": 3.8711, + "step": 4631 + }, + { + "epoch": 0.2875411260785896, + "grad_norm": 0.32073440910276363, + "learning_rate": 9.584109248913719e-05, + "loss": 3.8625, + "step": 4632 + }, + { + "epoch": 0.28760320317834753, + "grad_norm": 0.2889299604342762, + "learning_rate": 9.586178357128079e-05, + "loss": 3.8282, + "step": 4633 + }, + { + "epoch": 0.2876652802781054, + "grad_norm": 0.2952244198350474, + "learning_rate": 9.588247465342438e-05, + "loss": 3.8613, + "step": 4634 + }, + { + "epoch": 0.2877273573778633, + "grad_norm": 0.26593348877747175, + "learning_rate": 9.590316573556798e-05, + "loss": 3.9018, + "step": 4635 + }, + { + "epoch": 0.28778943447762123, + "grad_norm": 0.3814817185615079, + "learning_rate": 9.592385681771157e-05, + "loss": 3.8001, + "step": 4636 + }, + { + "epoch": 0.2878515115773791, + "grad_norm": 0.34531887704475767, + "learning_rate": 9.594454789985517e-05, + "loss": 3.795, + "step": 4637 + }, + { + "epoch": 0.287913588677137, + "grad_norm": 0.45887415759747857, + "learning_rate": 9.596523898199876e-05, + "loss": 3.8043, + "step": 4638 + }, + { + "epoch": 0.28797566577689493, + "grad_norm": 0.3196108273603943, + "learning_rate": 9.598593006414236e-05, + "loss": 3.8177, + "step": 4639 + }, + { + "epoch": 0.2880377428766528, + "grad_norm": 0.44808554977148807, + "learning_rate": 9.600662114628595e-05, + "loss": 3.8187, + "step": 4640 + }, + { + "epoch": 0.2880998199764107, + "grad_norm": 0.45283253866944256, + "learning_rate": 9.602731222842956e-05, + "loss": 3.7718, + "step": 4641 + }, + { + "epoch": 0.2881618970761686, + "grad_norm": 0.25558228084485146, + "learning_rate": 9.604800331057314e-05, + "loss": 3.8467, + "step": 4642 + }, + { + "epoch": 0.2882239741759265, + "grad_norm": 0.3527106715419812, + "learning_rate": 9.606869439271674e-05, + "loss": 3.952, + "step": 4643 + }, + { + "epoch": 0.2882860512756844, + "grad_norm": 0.24551714604009214, + "learning_rate": 9.608938547486033e-05, + "loss": 3.9363, + "step": 4644 + }, + { + "epoch": 0.2883481283754423, + "grad_norm": 0.4472485834731501, + "learning_rate": 9.611007655700394e-05, + "loss": 3.8418, + "step": 4645 + }, + { + "epoch": 0.2884102054752002, + "grad_norm": 0.33804383966164486, + "learning_rate": 9.613076763914754e-05, + "loss": 3.8238, + "step": 4646 + }, + { + "epoch": 0.2884722825749581, + "grad_norm": 0.24537405863516712, + "learning_rate": 9.615145872129113e-05, + "loss": 3.8218, + "step": 4647 + }, + { + "epoch": 0.288534359674716, + "grad_norm": 0.3121623480080224, + "learning_rate": 9.617214980343471e-05, + "loss": 3.7641, + "step": 4648 + }, + { + "epoch": 0.2885964367744739, + "grad_norm": 0.2672127365998861, + "learning_rate": 9.619284088557832e-05, + "loss": 3.8355, + "step": 4649 + }, + { + "epoch": 0.2886585138742318, + "grad_norm": 0.33826988632814503, + "learning_rate": 9.621353196772192e-05, + "loss": 3.7181, + "step": 4650 + }, + { + "epoch": 0.2887205909739897, + "grad_norm": 0.40122287132677315, + "learning_rate": 9.623422304986551e-05, + "loss": 3.9391, + "step": 4651 + }, + { + "epoch": 0.2887826680737476, + "grad_norm": 0.2828509014887383, + "learning_rate": 9.62549141320091e-05, + "loss": 3.8949, + "step": 4652 + }, + { + "epoch": 0.2888447451735055, + "grad_norm": 0.4479114583917669, + "learning_rate": 9.627560521415271e-05, + "loss": 3.864, + "step": 4653 + }, + { + "epoch": 0.2889068222732634, + "grad_norm": 0.44672684514388733, + "learning_rate": 9.62962962962963e-05, + "loss": 3.8555, + "step": 4654 + }, + { + "epoch": 0.2889688993730213, + "grad_norm": 0.43096674448525446, + "learning_rate": 9.631698737843989e-05, + "loss": 3.7911, + "step": 4655 + }, + { + "epoch": 0.2890309764727792, + "grad_norm": 0.5179864388461847, + "learning_rate": 9.633767846058349e-05, + "loss": 3.8983, + "step": 4656 + }, + { + "epoch": 0.2890930535725371, + "grad_norm": 0.3931239873754805, + "learning_rate": 9.63583695427271e-05, + "loss": 3.8463, + "step": 4657 + }, + { + "epoch": 0.289155130672295, + "grad_norm": 0.2608372804207997, + "learning_rate": 9.637906062487069e-05, + "loss": 3.8816, + "step": 4658 + }, + { + "epoch": 0.2892172077720529, + "grad_norm": 0.3396720452464622, + "learning_rate": 9.639975170701429e-05, + "loss": 3.8728, + "step": 4659 + }, + { + "epoch": 0.2892792848718108, + "grad_norm": 0.3054762193005644, + "learning_rate": 9.642044278915787e-05, + "loss": 3.9201, + "step": 4660 + }, + { + "epoch": 0.2893413619715687, + "grad_norm": 0.4065961280244821, + "learning_rate": 9.644113387130148e-05, + "loss": 3.8416, + "step": 4661 + }, + { + "epoch": 0.2894034390713266, + "grad_norm": 0.5466113681223103, + "learning_rate": 9.646182495344507e-05, + "loss": 3.9264, + "step": 4662 + }, + { + "epoch": 0.2894655161710845, + "grad_norm": 0.4526057721436197, + "learning_rate": 9.648251603558867e-05, + "loss": 3.9747, + "step": 4663 + }, + { + "epoch": 0.2895275932708424, + "grad_norm": 0.3932702335698585, + "learning_rate": 9.650320711773226e-05, + "loss": 3.9208, + "step": 4664 + }, + { + "epoch": 0.2895896703706003, + "grad_norm": 0.4189510567366888, + "learning_rate": 9.652389819987586e-05, + "loss": 3.9034, + "step": 4665 + }, + { + "epoch": 0.2896517474703582, + "grad_norm": 0.4630993561588442, + "learning_rate": 9.654458928201946e-05, + "loss": 3.9051, + "step": 4666 + }, + { + "epoch": 0.2897138245701161, + "grad_norm": 0.5332820121341673, + "learning_rate": 9.656528036416305e-05, + "loss": 3.9667, + "step": 4667 + }, + { + "epoch": 0.289775901669874, + "grad_norm": 0.3590257855295917, + "learning_rate": 9.658597144630664e-05, + "loss": 3.8385, + "step": 4668 + }, + { + "epoch": 0.2898379787696319, + "grad_norm": 0.3986974130233218, + "learning_rate": 9.660666252845024e-05, + "loss": 3.7619, + "step": 4669 + }, + { + "epoch": 0.28990005586938977, + "grad_norm": 0.43843419947234974, + "learning_rate": 9.662735361059384e-05, + "loss": 3.9025, + "step": 4670 + }, + { + "epoch": 0.2899621329691477, + "grad_norm": 0.4443641737465503, + "learning_rate": 9.664804469273744e-05, + "loss": 3.8564, + "step": 4671 + }, + { + "epoch": 0.2900242100689056, + "grad_norm": 0.5050290631910836, + "learning_rate": 9.666873577488103e-05, + "loss": 3.7876, + "step": 4672 + }, + { + "epoch": 0.29008628716866347, + "grad_norm": 0.7651777712892017, + "learning_rate": 9.668942685702462e-05, + "loss": 3.8843, + "step": 4673 + }, + { + "epoch": 0.2901483642684214, + "grad_norm": 0.41663178854240684, + "learning_rate": 9.671011793916823e-05, + "loss": 3.8043, + "step": 4674 + }, + { + "epoch": 0.2902104413681793, + "grad_norm": 0.7374070658562262, + "learning_rate": 9.673080902131182e-05, + "loss": 3.8339, + "step": 4675 + }, + { + "epoch": 0.29027251846793717, + "grad_norm": 0.4919377253283146, + "learning_rate": 9.675150010345542e-05, + "loss": 3.8584, + "step": 4676 + }, + { + "epoch": 0.2903345955676951, + "grad_norm": 0.4562151869464819, + "learning_rate": 9.677219118559901e-05, + "loss": 3.9178, + "step": 4677 + }, + { + "epoch": 0.290396672667453, + "grad_norm": 0.4906643353626898, + "learning_rate": 9.679288226774262e-05, + "loss": 3.8669, + "step": 4678 + }, + { + "epoch": 0.29045874976721087, + "grad_norm": 0.41248552051331516, + "learning_rate": 9.68135733498862e-05, + "loss": 3.8382, + "step": 4679 + }, + { + "epoch": 0.2905208268669688, + "grad_norm": 0.393822855491323, + "learning_rate": 9.68342644320298e-05, + "loss": 3.8357, + "step": 4680 + }, + { + "epoch": 0.2905829039667267, + "grad_norm": 0.4991875992489849, + "learning_rate": 9.685495551417339e-05, + "loss": 3.7995, + "step": 4681 + }, + { + "epoch": 0.29064498106648456, + "grad_norm": 0.4262816087687571, + "learning_rate": 9.6875646596317e-05, + "loss": 3.8033, + "step": 4682 + }, + { + "epoch": 0.2907070581662425, + "grad_norm": 0.4103810375581698, + "learning_rate": 9.68963376784606e-05, + "loss": 3.8432, + "step": 4683 + }, + { + "epoch": 0.29076913526600034, + "grad_norm": 0.3931822611978674, + "learning_rate": 9.691702876060419e-05, + "loss": 3.8049, + "step": 4684 + }, + { + "epoch": 0.29083121236575826, + "grad_norm": 0.4204303590392949, + "learning_rate": 9.693771984274777e-05, + "loss": 3.9441, + "step": 4685 + }, + { + "epoch": 0.2908932894655162, + "grad_norm": 0.4026856682496104, + "learning_rate": 9.695841092489138e-05, + "loss": 3.8077, + "step": 4686 + }, + { + "epoch": 0.29095536656527404, + "grad_norm": 0.46611733234018937, + "learning_rate": 9.697910200703497e-05, + "loss": 3.8821, + "step": 4687 + }, + { + "epoch": 0.29101744366503196, + "grad_norm": 0.4447201922282527, + "learning_rate": 9.699979308917857e-05, + "loss": 3.9879, + "step": 4688 + }, + { + "epoch": 0.2910795207647899, + "grad_norm": 0.43915987190778577, + "learning_rate": 9.702048417132216e-05, + "loss": 3.8227, + "step": 4689 + }, + { + "epoch": 0.29114159786454774, + "grad_norm": 0.3594266190982141, + "learning_rate": 9.704117525346576e-05, + "loss": 3.8095, + "step": 4690 + }, + { + "epoch": 0.29120367496430566, + "grad_norm": 0.4257885468583911, + "learning_rate": 9.706186633560935e-05, + "loss": 3.864, + "step": 4691 + }, + { + "epoch": 0.2912657520640636, + "grad_norm": 0.3462612904763245, + "learning_rate": 9.708255741775295e-05, + "loss": 3.7177, + "step": 4692 + }, + { + "epoch": 0.29132782916382144, + "grad_norm": 0.36132381539738856, + "learning_rate": 9.710324849989655e-05, + "loss": 3.8178, + "step": 4693 + }, + { + "epoch": 0.29138990626357936, + "grad_norm": 0.517279542413964, + "learning_rate": 9.712393958204014e-05, + "loss": 3.877, + "step": 4694 + }, + { + "epoch": 0.2914519833633373, + "grad_norm": 0.4266758017226401, + "learning_rate": 9.714463066418375e-05, + "loss": 3.8134, + "step": 4695 + }, + { + "epoch": 0.29151406046309514, + "grad_norm": 0.35754150723205314, + "learning_rate": 9.716532174632734e-05, + "loss": 3.8511, + "step": 4696 + }, + { + "epoch": 0.29157613756285305, + "grad_norm": 0.46290804029450855, + "learning_rate": 9.718601282847093e-05, + "loss": 3.7805, + "step": 4697 + }, + { + "epoch": 0.291638214662611, + "grad_norm": 0.4280740443068351, + "learning_rate": 9.720670391061453e-05, + "loss": 3.777, + "step": 4698 + }, + { + "epoch": 0.29170029176236884, + "grad_norm": 0.32514278457307916, + "learning_rate": 9.722739499275813e-05, + "loss": 3.7861, + "step": 4699 + }, + { + "epoch": 0.29176236886212675, + "grad_norm": 0.38696163911346315, + "learning_rate": 9.724808607490172e-05, + "loss": 3.8796, + "step": 4700 + }, + { + "epoch": 0.29182444596188467, + "grad_norm": 0.40822057367294334, + "learning_rate": 9.726877715704532e-05, + "loss": 3.8265, + "step": 4701 + }, + { + "epoch": 0.29188652306164253, + "grad_norm": 0.29084158179783587, + "learning_rate": 9.728946823918891e-05, + "loss": 3.938, + "step": 4702 + }, + { + "epoch": 0.29194860016140045, + "grad_norm": 0.3302306325144045, + "learning_rate": 9.731015932133251e-05, + "loss": 3.9164, + "step": 4703 + }, + { + "epoch": 0.29201067726115837, + "grad_norm": 0.30050774313825357, + "learning_rate": 9.73308504034761e-05, + "loss": 3.8658, + "step": 4704 + }, + { + "epoch": 0.29207275436091623, + "grad_norm": 0.30290869339406395, + "learning_rate": 9.73515414856197e-05, + "loss": 3.8591, + "step": 4705 + }, + { + "epoch": 0.29213483146067415, + "grad_norm": 0.33735475328742653, + "learning_rate": 9.73722325677633e-05, + "loss": 3.8102, + "step": 4706 + }, + { + "epoch": 0.29219690856043207, + "grad_norm": 0.303397872843732, + "learning_rate": 9.73929236499069e-05, + "loss": 3.7881, + "step": 4707 + }, + { + "epoch": 0.29225898566018993, + "grad_norm": 0.2931010265551342, + "learning_rate": 9.74136147320505e-05, + "loss": 3.825, + "step": 4708 + }, + { + "epoch": 0.29232106275994785, + "grad_norm": 0.3575809520820541, + "learning_rate": 9.743430581419408e-05, + "loss": 3.8366, + "step": 4709 + }, + { + "epoch": 0.29238313985970577, + "grad_norm": 0.34792390442741705, + "learning_rate": 9.745499689633767e-05, + "loss": 3.8438, + "step": 4710 + }, + { + "epoch": 0.2924452169594636, + "grad_norm": 0.382863527906355, + "learning_rate": 9.747568797848128e-05, + "loss": 3.7939, + "step": 4711 + }, + { + "epoch": 0.29250729405922155, + "grad_norm": 0.2786287572863234, + "learning_rate": 9.749637906062488e-05, + "loss": 3.7992, + "step": 4712 + }, + { + "epoch": 0.29256937115897946, + "grad_norm": 0.4110521069686469, + "learning_rate": 9.751707014276847e-05, + "loss": 3.7233, + "step": 4713 + }, + { + "epoch": 0.2926314482587373, + "grad_norm": 0.2678670645760892, + "learning_rate": 9.753776122491207e-05, + "loss": 3.7908, + "step": 4714 + }, + { + "epoch": 0.29269352535849524, + "grad_norm": 0.4264774929482946, + "learning_rate": 9.755845230705566e-05, + "loss": 3.739, + "step": 4715 + }, + { + "epoch": 0.29275560245825316, + "grad_norm": 0.6970465179597731, + "learning_rate": 9.757914338919926e-05, + "loss": 3.81, + "step": 4716 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 0.48013520487314026, + "learning_rate": 9.759983447134285e-05, + "loss": 3.9014, + "step": 4717 + }, + { + "epoch": 0.29287975665776894, + "grad_norm": 0.3885290563989811, + "learning_rate": 9.762052555348645e-05, + "loss": 3.7769, + "step": 4718 + }, + { + "epoch": 0.29294183375752686, + "grad_norm": 0.3247658124220699, + "learning_rate": 9.764121663563004e-05, + "loss": 3.7689, + "step": 4719 + }, + { + "epoch": 0.2930039108572847, + "grad_norm": 0.5286313761220371, + "learning_rate": 9.766190771777365e-05, + "loss": 3.7819, + "step": 4720 + }, + { + "epoch": 0.29306598795704264, + "grad_norm": 0.5822807135828499, + "learning_rate": 9.768259879991723e-05, + "loss": 3.8483, + "step": 4721 + }, + { + "epoch": 0.29312806505680056, + "grad_norm": 0.5798791871771807, + "learning_rate": 9.770328988206083e-05, + "loss": 3.8545, + "step": 4722 + }, + { + "epoch": 0.2931901421565584, + "grad_norm": 0.6004881244187112, + "learning_rate": 9.772398096420444e-05, + "loss": 3.8196, + "step": 4723 + }, + { + "epoch": 0.29325221925631634, + "grad_norm": 0.576697985436268, + "learning_rate": 9.774467204634803e-05, + "loss": 3.9484, + "step": 4724 + }, + { + "epoch": 0.29331429635607426, + "grad_norm": 0.38546795331967726, + "learning_rate": 9.776536312849163e-05, + "loss": 3.7072, + "step": 4725 + }, + { + "epoch": 0.2933763734558321, + "grad_norm": 0.6580384045512808, + "learning_rate": 9.778605421063522e-05, + "loss": 3.8902, + "step": 4726 + }, + { + "epoch": 0.29343845055559004, + "grad_norm": 0.4324100075793017, + "learning_rate": 9.780674529277882e-05, + "loss": 3.8145, + "step": 4727 + }, + { + "epoch": 0.29350052765534795, + "grad_norm": 0.3080035130134794, + "learning_rate": 9.782743637492241e-05, + "loss": 3.7493, + "step": 4728 + }, + { + "epoch": 0.2935626047551058, + "grad_norm": 0.6760420984894755, + "learning_rate": 9.784812745706601e-05, + "loss": 3.8587, + "step": 4729 + }, + { + "epoch": 0.29362468185486373, + "grad_norm": 0.3999954879029417, + "learning_rate": 9.78688185392096e-05, + "loss": 3.84, + "step": 4730 + }, + { + "epoch": 0.29368675895462165, + "grad_norm": 0.618665928379117, + "learning_rate": 9.78895096213532e-05, + "loss": 3.922, + "step": 4731 + }, + { + "epoch": 0.2937488360543795, + "grad_norm": 0.3362166331799213, + "learning_rate": 9.791020070349681e-05, + "loss": 3.7801, + "step": 4732 + }, + { + "epoch": 0.29381091315413743, + "grad_norm": 0.37955664824948704, + "learning_rate": 9.793089178564039e-05, + "loss": 3.9286, + "step": 4733 + }, + { + "epoch": 0.29387299025389535, + "grad_norm": 0.3890084577660835, + "learning_rate": 9.795158286778398e-05, + "loss": 3.846, + "step": 4734 + }, + { + "epoch": 0.2939350673536532, + "grad_norm": 0.5740495289690557, + "learning_rate": 9.797227394992758e-05, + "loss": 3.847, + "step": 4735 + }, + { + "epoch": 0.29399714445341113, + "grad_norm": 0.42310982585389767, + "learning_rate": 9.799296503207119e-05, + "loss": 3.921, + "step": 4736 + }, + { + "epoch": 0.29405922155316905, + "grad_norm": 0.6397540056312679, + "learning_rate": 9.801365611421478e-05, + "loss": 3.8651, + "step": 4737 + }, + { + "epoch": 0.2941212986529269, + "grad_norm": 0.6181231540926225, + "learning_rate": 9.803434719635838e-05, + "loss": 3.7701, + "step": 4738 + }, + { + "epoch": 0.29418337575268483, + "grad_norm": 0.38004754424690035, + "learning_rate": 9.805503827850196e-05, + "loss": 3.7715, + "step": 4739 + }, + { + "epoch": 0.29424545285244275, + "grad_norm": 0.4336607941352791, + "learning_rate": 9.807572936064557e-05, + "loss": 3.7304, + "step": 4740 + }, + { + "epoch": 0.2943075299522006, + "grad_norm": 0.5118253756957994, + "learning_rate": 9.809642044278916e-05, + "loss": 3.8679, + "step": 4741 + }, + { + "epoch": 0.2943696070519585, + "grad_norm": 0.4157199973338797, + "learning_rate": 9.811711152493276e-05, + "loss": 3.8894, + "step": 4742 + }, + { + "epoch": 0.29443168415171644, + "grad_norm": 0.4460416813541121, + "learning_rate": 9.813780260707635e-05, + "loss": 3.8479, + "step": 4743 + }, + { + "epoch": 0.2944937612514743, + "grad_norm": 0.47214851220961074, + "learning_rate": 9.815849368921995e-05, + "loss": 3.8994, + "step": 4744 + }, + { + "epoch": 0.2945558383512322, + "grad_norm": 0.4680443220302925, + "learning_rate": 9.817918477136354e-05, + "loss": 3.8007, + "step": 4745 + }, + { + "epoch": 0.29461791545099014, + "grad_norm": 0.37172641029498305, + "learning_rate": 9.819987585350714e-05, + "loss": 3.8159, + "step": 4746 + }, + { + "epoch": 0.294679992550748, + "grad_norm": 0.5450936323630603, + "learning_rate": 9.822056693565073e-05, + "loss": 3.7413, + "step": 4747 + }, + { + "epoch": 0.2947420696505059, + "grad_norm": 0.5374803406424726, + "learning_rate": 9.824125801779434e-05, + "loss": 3.8157, + "step": 4748 + }, + { + "epoch": 0.29480414675026384, + "grad_norm": 0.43854705651686876, + "learning_rate": 9.826194909993794e-05, + "loss": 3.7982, + "step": 4749 + }, + { + "epoch": 0.2948662238500217, + "grad_norm": 0.34829900024622695, + "learning_rate": 9.828264018208153e-05, + "loss": 3.9159, + "step": 4750 + }, + { + "epoch": 0.2949283009497796, + "grad_norm": 0.39381613932787934, + "learning_rate": 9.830333126422513e-05, + "loss": 3.7907, + "step": 4751 + }, + { + "epoch": 0.29499037804953754, + "grad_norm": 0.30355856897260486, + "learning_rate": 9.832402234636872e-05, + "loss": 3.8396, + "step": 4752 + }, + { + "epoch": 0.2950524551492954, + "grad_norm": 0.25126736488401735, + "learning_rate": 9.834471342851232e-05, + "loss": 3.8001, + "step": 4753 + }, + { + "epoch": 0.2951145322490533, + "grad_norm": 0.4455792481422014, + "learning_rate": 9.836540451065591e-05, + "loss": 3.8009, + "step": 4754 + }, + { + "epoch": 0.29517660934881124, + "grad_norm": 0.44938132619848253, + "learning_rate": 9.838609559279951e-05, + "loss": 3.7764, + "step": 4755 + }, + { + "epoch": 0.2952386864485691, + "grad_norm": 0.3559415000174996, + "learning_rate": 9.84067866749431e-05, + "loss": 3.7856, + "step": 4756 + }, + { + "epoch": 0.295300763548327, + "grad_norm": 0.5149223568635821, + "learning_rate": 9.842747775708671e-05, + "loss": 3.8465, + "step": 4757 + }, + { + "epoch": 0.29536284064808493, + "grad_norm": 0.3750158346329812, + "learning_rate": 9.844816883923029e-05, + "loss": 3.7968, + "step": 4758 + }, + { + "epoch": 0.2954249177478428, + "grad_norm": 0.5457149736432895, + "learning_rate": 9.846885992137389e-05, + "loss": 3.8226, + "step": 4759 + }, + { + "epoch": 0.2954869948476007, + "grad_norm": 0.43122089052778295, + "learning_rate": 9.848955100351748e-05, + "loss": 3.7591, + "step": 4760 + }, + { + "epoch": 0.29554907194735863, + "grad_norm": 0.3961568334481436, + "learning_rate": 9.851024208566109e-05, + "loss": 3.7583, + "step": 4761 + }, + { + "epoch": 0.2956111490471165, + "grad_norm": 0.42332625669787216, + "learning_rate": 9.853093316780469e-05, + "loss": 3.7644, + "step": 4762 + }, + { + "epoch": 0.2956732261468744, + "grad_norm": 0.31382008919032134, + "learning_rate": 9.855162424994828e-05, + "loss": 3.8643, + "step": 4763 + }, + { + "epoch": 0.29573530324663233, + "grad_norm": 0.3383898492450086, + "learning_rate": 9.857231533209186e-05, + "loss": 3.7617, + "step": 4764 + }, + { + "epoch": 0.2957973803463902, + "grad_norm": 0.3124622202924513, + "learning_rate": 9.859300641423547e-05, + "loss": 3.8619, + "step": 4765 + }, + { + "epoch": 0.2958594574461481, + "grad_norm": 0.32239158942901486, + "learning_rate": 9.861369749637907e-05, + "loss": 3.8, + "step": 4766 + }, + { + "epoch": 0.29592153454590603, + "grad_norm": 0.3597894227878832, + "learning_rate": 9.863438857852266e-05, + "loss": 3.8767, + "step": 4767 + }, + { + "epoch": 0.2959836116456639, + "grad_norm": 0.3920439696880663, + "learning_rate": 9.865507966066626e-05, + "loss": 3.8281, + "step": 4768 + }, + { + "epoch": 0.2960456887454218, + "grad_norm": 0.33046080622583285, + "learning_rate": 9.867577074280985e-05, + "loss": 3.7826, + "step": 4769 + }, + { + "epoch": 0.2961077658451797, + "grad_norm": 0.2897175249261664, + "learning_rate": 9.869646182495345e-05, + "loss": 3.7575, + "step": 4770 + }, + { + "epoch": 0.2961698429449376, + "grad_norm": 0.45759099933423025, + "learning_rate": 9.871715290709704e-05, + "loss": 3.9228, + "step": 4771 + }, + { + "epoch": 0.2962319200446955, + "grad_norm": 0.48339552444967443, + "learning_rate": 9.873784398924064e-05, + "loss": 3.7387, + "step": 4772 + }, + { + "epoch": 0.2962939971444534, + "grad_norm": 0.331554148602091, + "learning_rate": 9.875853507138425e-05, + "loss": 3.9178, + "step": 4773 + }, + { + "epoch": 0.2963560742442113, + "grad_norm": 0.3248839391202887, + "learning_rate": 9.877922615352784e-05, + "loss": 3.8439, + "step": 4774 + }, + { + "epoch": 0.2964181513439692, + "grad_norm": 0.28438148019554593, + "learning_rate": 9.879991723567144e-05, + "loss": 3.7145, + "step": 4775 + }, + { + "epoch": 0.2964802284437271, + "grad_norm": 0.28017149875754527, + "learning_rate": 9.882060831781502e-05, + "loss": 3.7891, + "step": 4776 + }, + { + "epoch": 0.296542305543485, + "grad_norm": 0.29266186217442736, + "learning_rate": 9.884129939995863e-05, + "loss": 3.7548, + "step": 4777 + }, + { + "epoch": 0.2966043826432429, + "grad_norm": 0.36980570063578644, + "learning_rate": 9.886199048210222e-05, + "loss": 3.9351, + "step": 4778 + }, + { + "epoch": 0.2966664597430008, + "grad_norm": 0.5032206758311749, + "learning_rate": 9.888268156424582e-05, + "loss": 3.788, + "step": 4779 + }, + { + "epoch": 0.2967285368427587, + "grad_norm": 0.3540961835887162, + "learning_rate": 9.890337264638941e-05, + "loss": 3.7936, + "step": 4780 + }, + { + "epoch": 0.2967906139425166, + "grad_norm": 0.45513655324288826, + "learning_rate": 9.8924063728533e-05, + "loss": 3.8514, + "step": 4781 + }, + { + "epoch": 0.2968526910422745, + "grad_norm": 0.3968152511857001, + "learning_rate": 9.89447548106766e-05, + "loss": 3.7788, + "step": 4782 + }, + { + "epoch": 0.2969147681420324, + "grad_norm": 0.30254067407601154, + "learning_rate": 9.89654458928202e-05, + "loss": 3.9122, + "step": 4783 + }, + { + "epoch": 0.2969768452417903, + "grad_norm": 0.43811660978960465, + "learning_rate": 9.898613697496379e-05, + "loss": 3.7833, + "step": 4784 + }, + { + "epoch": 0.2970389223415482, + "grad_norm": 0.43310011867930254, + "learning_rate": 9.900682805710739e-05, + "loss": 3.7932, + "step": 4785 + }, + { + "epoch": 0.2971009994413061, + "grad_norm": 0.26559617783747064, + "learning_rate": 9.9027519139251e-05, + "loss": 3.8, + "step": 4786 + }, + { + "epoch": 0.297163076541064, + "grad_norm": 0.40164809594944256, + "learning_rate": 9.904821022139459e-05, + "loss": 3.7149, + "step": 4787 + }, + { + "epoch": 0.2972251536408219, + "grad_norm": 0.3815554200623903, + "learning_rate": 9.906890130353817e-05, + "loss": 3.8231, + "step": 4788 + }, + { + "epoch": 0.2972872307405798, + "grad_norm": 0.41089913439335723, + "learning_rate": 9.908959238568177e-05, + "loss": 3.79, + "step": 4789 + }, + { + "epoch": 0.2973493078403377, + "grad_norm": 0.4193575771311299, + "learning_rate": 9.911028346782538e-05, + "loss": 3.8961, + "step": 4790 + }, + { + "epoch": 0.2974113849400956, + "grad_norm": 0.42586275073126956, + "learning_rate": 9.913097454996897e-05, + "loss": 3.9479, + "step": 4791 + }, + { + "epoch": 0.2974734620398535, + "grad_norm": 0.3134340631785285, + "learning_rate": 9.915166563211257e-05, + "loss": 3.8432, + "step": 4792 + }, + { + "epoch": 0.2975355391396114, + "grad_norm": 0.2860626030409804, + "learning_rate": 9.917235671425616e-05, + "loss": 3.7657, + "step": 4793 + }, + { + "epoch": 0.2975976162393693, + "grad_norm": 0.3703856086190106, + "learning_rate": 9.919304779639976e-05, + "loss": 3.8491, + "step": 4794 + }, + { + "epoch": 0.2976596933391272, + "grad_norm": 0.4649908169742048, + "learning_rate": 9.921373887854335e-05, + "loss": 3.9097, + "step": 4795 + }, + { + "epoch": 0.2977217704388851, + "grad_norm": 0.6110912022333916, + "learning_rate": 9.923442996068695e-05, + "loss": 3.8188, + "step": 4796 + }, + { + "epoch": 0.297783847538643, + "grad_norm": 0.3489935368483231, + "learning_rate": 9.925512104283054e-05, + "loss": 3.7689, + "step": 4797 + }, + { + "epoch": 0.29784592463840087, + "grad_norm": 0.4764907814216911, + "learning_rate": 9.927581212497415e-05, + "loss": 3.705, + "step": 4798 + }, + { + "epoch": 0.2979080017381588, + "grad_norm": 0.42187215662810434, + "learning_rate": 9.929650320711775e-05, + "loss": 3.7119, + "step": 4799 + }, + { + "epoch": 0.2979700788379167, + "grad_norm": 0.4421160737976261, + "learning_rate": 9.931719428926133e-05, + "loss": 3.8594, + "step": 4800 + }, + { + "epoch": 0.29803215593767457, + "grad_norm": 0.3883918251687858, + "learning_rate": 9.933788537140492e-05, + "loss": 3.8398, + "step": 4801 + }, + { + "epoch": 0.2980942330374325, + "grad_norm": 0.38472462934336976, + "learning_rate": 9.935857645354853e-05, + "loss": 3.7129, + "step": 4802 + }, + { + "epoch": 0.2981563101371904, + "grad_norm": 0.33839450596552556, + "learning_rate": 9.937926753569213e-05, + "loss": 3.8532, + "step": 4803 + }, + { + "epoch": 0.29821838723694827, + "grad_norm": 0.4118371036964946, + "learning_rate": 9.939995861783572e-05, + "loss": 3.8188, + "step": 4804 + }, + { + "epoch": 0.2982804643367062, + "grad_norm": 0.3818338496548678, + "learning_rate": 9.942064969997932e-05, + "loss": 3.6988, + "step": 4805 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 0.39921872835103256, + "learning_rate": 9.944134078212291e-05, + "loss": 3.839, + "step": 4806 + }, + { + "epoch": 0.29840461853622197, + "grad_norm": 0.2748800915779515, + "learning_rate": 9.94620318642665e-05, + "loss": 3.8645, + "step": 4807 + }, + { + "epoch": 0.2984666956359799, + "grad_norm": 0.3049386896398333, + "learning_rate": 9.94827229464101e-05, + "loss": 3.8365, + "step": 4808 + }, + { + "epoch": 0.2985287727357378, + "grad_norm": 0.31582547090997, + "learning_rate": 9.95034140285537e-05, + "loss": 3.8097, + "step": 4809 + }, + { + "epoch": 0.29859084983549566, + "grad_norm": 0.29524976931973973, + "learning_rate": 9.952410511069729e-05, + "loss": 3.8182, + "step": 4810 + }, + { + "epoch": 0.2986529269352536, + "grad_norm": 0.23735573302023663, + "learning_rate": 9.95447961928409e-05, + "loss": 3.7508, + "step": 4811 + }, + { + "epoch": 0.2987150040350115, + "grad_norm": 0.40529376978599957, + "learning_rate": 9.956548727498448e-05, + "loss": 3.9009, + "step": 4812 + }, + { + "epoch": 0.29877708113476936, + "grad_norm": 0.32702635579609324, + "learning_rate": 9.958617835712808e-05, + "loss": 3.8113, + "step": 4813 + }, + { + "epoch": 0.2988391582345273, + "grad_norm": 0.28697368763137465, + "learning_rate": 9.960686943927167e-05, + "loss": 3.7951, + "step": 4814 + }, + { + "epoch": 0.2989012353342852, + "grad_norm": 0.3101004985216077, + "learning_rate": 9.962756052141528e-05, + "loss": 3.7412, + "step": 4815 + }, + { + "epoch": 0.29896331243404306, + "grad_norm": 0.35152073718077786, + "learning_rate": 9.964825160355887e-05, + "loss": 3.8632, + "step": 4816 + }, + { + "epoch": 0.299025389533801, + "grad_norm": 0.613913192263153, + "learning_rate": 9.966894268570247e-05, + "loss": 3.8065, + "step": 4817 + }, + { + "epoch": 0.2990874666335589, + "grad_norm": 0.42274561869710015, + "learning_rate": 9.968963376784605e-05, + "loss": 3.8464, + "step": 4818 + }, + { + "epoch": 0.29914954373331676, + "grad_norm": 0.35445862077837603, + "learning_rate": 9.971032484998966e-05, + "loss": 3.7819, + "step": 4819 + }, + { + "epoch": 0.2992116208330747, + "grad_norm": 0.43185212490154823, + "learning_rate": 9.973101593213326e-05, + "loss": 3.8903, + "step": 4820 + }, + { + "epoch": 0.2992736979328326, + "grad_norm": 0.429129758732337, + "learning_rate": 9.975170701427685e-05, + "loss": 3.7952, + "step": 4821 + }, + { + "epoch": 0.29933577503259046, + "grad_norm": 0.4074288650161772, + "learning_rate": 9.977239809642045e-05, + "loss": 3.8146, + "step": 4822 + }, + { + "epoch": 0.2993978521323484, + "grad_norm": 0.43351932264886645, + "learning_rate": 9.979308917856405e-05, + "loss": 3.7266, + "step": 4823 + }, + { + "epoch": 0.2994599292321063, + "grad_norm": 0.2754213031969104, + "learning_rate": 9.981378026070764e-05, + "loss": 3.8717, + "step": 4824 + }, + { + "epoch": 0.29952200633186415, + "grad_norm": 0.3420400354016, + "learning_rate": 9.983447134285123e-05, + "loss": 3.8491, + "step": 4825 + }, + { + "epoch": 0.2995840834316221, + "grad_norm": 0.3217593638095691, + "learning_rate": 9.985516242499483e-05, + "loss": 3.8566, + "step": 4826 + }, + { + "epoch": 0.29964616053138, + "grad_norm": 0.3124193445071253, + "learning_rate": 9.987585350713843e-05, + "loss": 3.8258, + "step": 4827 + }, + { + "epoch": 0.29970823763113785, + "grad_norm": 0.26239346601703767, + "learning_rate": 9.989654458928203e-05, + "loss": 3.8543, + "step": 4828 + }, + { + "epoch": 0.29977031473089577, + "grad_norm": 0.28778241218748946, + "learning_rate": 9.991723567142562e-05, + "loss": 3.9069, + "step": 4829 + }, + { + "epoch": 0.2998323918306537, + "grad_norm": 0.28029151353132015, + "learning_rate": 9.99379267535692e-05, + "loss": 3.8143, + "step": 4830 + }, + { + "epoch": 0.29989446893041155, + "grad_norm": 0.3337567429272146, + "learning_rate": 9.995861783571281e-05, + "loss": 3.798, + "step": 4831 + }, + { + "epoch": 0.29995654603016947, + "grad_norm": 0.5113393235929179, + "learning_rate": 9.997930891785641e-05, + "loss": 3.8567, + "step": 4832 + }, + { + "epoch": 0.3000186231299274, + "grad_norm": 0.6249598728649877, + "learning_rate": 0.0001, + "loss": 3.8043, + "step": 4833 + }, + { + "epoch": 0.30008070022968525, + "grad_norm": 0.43314520514893734, + "learning_rate": 9.999999986956894e-05, + "loss": 3.7627, + "step": 4834 + }, + { + "epoch": 0.30014277732944317, + "grad_norm": 0.8212177912465061, + "learning_rate": 9.999999947827573e-05, + "loss": 3.7372, + "step": 4835 + }, + { + "epoch": 0.3002048544292011, + "grad_norm": 0.37551225007821154, + "learning_rate": 9.99999988261204e-05, + "loss": 3.8157, + "step": 4836 + }, + { + "epoch": 0.30026693152895895, + "grad_norm": 0.3256881347522688, + "learning_rate": 9.999999791310292e-05, + "loss": 3.7745, + "step": 4837 + }, + { + "epoch": 0.30032900862871686, + "grad_norm": 0.4483418405829457, + "learning_rate": 9.99999967392233e-05, + "loss": 3.757, + "step": 4838 + }, + { + "epoch": 0.3003910857284748, + "grad_norm": 0.5237942803448932, + "learning_rate": 9.999999530448159e-05, + "loss": 3.8611, + "step": 4839 + }, + { + "epoch": 0.30045316282823264, + "grad_norm": 0.7048909369444681, + "learning_rate": 9.999999360887775e-05, + "loss": 3.869, + "step": 4840 + }, + { + "epoch": 0.30051523992799056, + "grad_norm": 0.48541567895103893, + "learning_rate": 9.999999165241181e-05, + "loss": 3.8343, + "step": 4841 + }, + { + "epoch": 0.3005773170277485, + "grad_norm": 0.4459490325869338, + "learning_rate": 9.999998943508378e-05, + "loss": 3.7437, + "step": 4842 + }, + { + "epoch": 0.30063939412750634, + "grad_norm": 0.5056452647841164, + "learning_rate": 9.999998695689368e-05, + "loss": 3.6325, + "step": 4843 + }, + { + "epoch": 0.30070147122726426, + "grad_norm": 0.5300301965172581, + "learning_rate": 9.999998421784149e-05, + "loss": 3.7875, + "step": 4844 + }, + { + "epoch": 0.3007635483270222, + "grad_norm": 0.4276084017650354, + "learning_rate": 9.999998121792724e-05, + "loss": 3.8459, + "step": 4845 + }, + { + "epoch": 0.30082562542678004, + "grad_norm": 0.5681414628869722, + "learning_rate": 9.999997795715096e-05, + "loss": 3.7867, + "step": 4846 + }, + { + "epoch": 0.30088770252653796, + "grad_norm": 0.401847514818905, + "learning_rate": 9.999997443551266e-05, + "loss": 3.8826, + "step": 4847 + }, + { + "epoch": 0.3009497796262959, + "grad_norm": 0.5289449331091911, + "learning_rate": 9.999997065301234e-05, + "loss": 3.7492, + "step": 4848 + }, + { + "epoch": 0.30101185672605374, + "grad_norm": 0.3089570791772522, + "learning_rate": 9.999996660965006e-05, + "loss": 3.7864, + "step": 4849 + }, + { + "epoch": 0.30107393382581166, + "grad_norm": 0.5537287256791246, + "learning_rate": 9.999996230542579e-05, + "loss": 3.8122, + "step": 4850 + }, + { + "epoch": 0.3011360109255696, + "grad_norm": 0.39839647119244254, + "learning_rate": 9.99999577403396e-05, + "loss": 3.8376, + "step": 4851 + }, + { + "epoch": 0.30119808802532744, + "grad_norm": 0.40910930528965417, + "learning_rate": 9.999995291439148e-05, + "loss": 3.786, + "step": 4852 + }, + { + "epoch": 0.30126016512508536, + "grad_norm": 0.4382928408145756, + "learning_rate": 9.999994782758147e-05, + "loss": 3.7397, + "step": 4853 + }, + { + "epoch": 0.3013222422248433, + "grad_norm": 0.361130840983149, + "learning_rate": 9.99999424799096e-05, + "loss": 3.8102, + "step": 4854 + }, + { + "epoch": 0.30138431932460114, + "grad_norm": 0.3479163236483103, + "learning_rate": 9.999993687137588e-05, + "loss": 3.7984, + "step": 4855 + }, + { + "epoch": 0.30144639642435905, + "grad_norm": 0.37792416748431606, + "learning_rate": 9.999993100198036e-05, + "loss": 3.7079, + "step": 4856 + }, + { + "epoch": 0.30150847352411697, + "grad_norm": 0.3252979380108658, + "learning_rate": 9.999992487172307e-05, + "loss": 3.6904, + "step": 4857 + }, + { + "epoch": 0.30157055062387483, + "grad_norm": 0.35997154646591223, + "learning_rate": 9.999991848060402e-05, + "loss": 3.8518, + "step": 4858 + }, + { + "epoch": 0.30163262772363275, + "grad_norm": 0.3408331882263534, + "learning_rate": 9.999991182862326e-05, + "loss": 3.6475, + "step": 4859 + }, + { + "epoch": 0.30169470482339067, + "grad_norm": 0.3653589332054869, + "learning_rate": 9.999990491578083e-05, + "loss": 3.7637, + "step": 4860 + }, + { + "epoch": 0.30175678192314853, + "grad_norm": 0.2853288520062972, + "learning_rate": 9.999989774207674e-05, + "loss": 3.8546, + "step": 4861 + }, + { + "epoch": 0.30181885902290645, + "grad_norm": 0.47423709978426454, + "learning_rate": 9.999989030751107e-05, + "loss": 3.772, + "step": 4862 + }, + { + "epoch": 0.30188093612266437, + "grad_norm": 0.41793222356893744, + "learning_rate": 9.999988261208382e-05, + "loss": 3.7787, + "step": 4863 + }, + { + "epoch": 0.30194301322242223, + "grad_norm": 0.34526805116932024, + "learning_rate": 9.999987465579506e-05, + "loss": 3.7583, + "step": 4864 + }, + { + "epoch": 0.30200509032218015, + "grad_norm": 0.4315445219047792, + "learning_rate": 9.999986643864478e-05, + "loss": 3.809, + "step": 4865 + }, + { + "epoch": 0.30206716742193807, + "grad_norm": 0.3539842481745533, + "learning_rate": 9.99998579606331e-05, + "loss": 3.8038, + "step": 4866 + }, + { + "epoch": 0.3021292445216959, + "grad_norm": 0.4551922078044346, + "learning_rate": 9.999984922176e-05, + "loss": 3.6924, + "step": 4867 + }, + { + "epoch": 0.30219132162145385, + "grad_norm": 0.38308437580091, + "learning_rate": 9.999984022202556e-05, + "loss": 3.8129, + "step": 4868 + }, + { + "epoch": 0.30225339872121176, + "grad_norm": 0.38050228887538756, + "learning_rate": 9.99998309614298e-05, + "loss": 3.7323, + "step": 4869 + }, + { + "epoch": 0.3023154758209696, + "grad_norm": 0.35395122267593676, + "learning_rate": 9.99998214399728e-05, + "loss": 3.8003, + "step": 4870 + }, + { + "epoch": 0.30237755292072754, + "grad_norm": 0.34363797920126754, + "learning_rate": 9.999981165765458e-05, + "loss": 3.6897, + "step": 4871 + }, + { + "epoch": 0.30243963002048546, + "grad_norm": 0.38762387403037385, + "learning_rate": 9.999980161447521e-05, + "loss": 3.8758, + "step": 4872 + }, + { + "epoch": 0.3025017071202433, + "grad_norm": 0.4303102142505982, + "learning_rate": 9.999979131043476e-05, + "loss": 3.8847, + "step": 4873 + }, + { + "epoch": 0.30256378422000124, + "grad_norm": 0.29610707698411604, + "learning_rate": 9.999978074553322e-05, + "loss": 3.82, + "step": 4874 + }, + { + "epoch": 0.30262586131975916, + "grad_norm": 0.28364881763763217, + "learning_rate": 9.999976991977071e-05, + "loss": 3.7505, + "step": 4875 + }, + { + "epoch": 0.302687938419517, + "grad_norm": 0.2762031472722894, + "learning_rate": 9.999975883314727e-05, + "loss": 3.7761, + "step": 4876 + }, + { + "epoch": 0.30275001551927494, + "grad_norm": 0.3570569369301564, + "learning_rate": 9.999974748566293e-05, + "loss": 3.731, + "step": 4877 + }, + { + "epoch": 0.30281209261903286, + "grad_norm": 0.29499296885128584, + "learning_rate": 9.999973587731778e-05, + "loss": 3.796, + "step": 4878 + }, + { + "epoch": 0.3028741697187907, + "grad_norm": 0.3636505956601106, + "learning_rate": 9.999972400811186e-05, + "loss": 3.784, + "step": 4879 + }, + { + "epoch": 0.30293624681854864, + "grad_norm": 0.4389991619540929, + "learning_rate": 9.999971187804527e-05, + "loss": 3.784, + "step": 4880 + }, + { + "epoch": 0.30299832391830656, + "grad_norm": 0.2569204851240632, + "learning_rate": 9.999969948711801e-05, + "loss": 3.8333, + "step": 4881 + }, + { + "epoch": 0.3030604010180644, + "grad_norm": 0.24470666498520177, + "learning_rate": 9.99996868353302e-05, + "loss": 3.7438, + "step": 4882 + }, + { + "epoch": 0.30312247811782234, + "grad_norm": 0.30179209344696695, + "learning_rate": 9.999967392268188e-05, + "loss": 3.7833, + "step": 4883 + }, + { + "epoch": 0.30318455521758025, + "grad_norm": 0.35966270788873034, + "learning_rate": 9.999966074917312e-05, + "loss": 3.946, + "step": 4884 + }, + { + "epoch": 0.3032466323173381, + "grad_norm": 0.45195797560717443, + "learning_rate": 9.9999647314804e-05, + "loss": 3.7931, + "step": 4885 + }, + { + "epoch": 0.30330870941709603, + "grad_norm": 0.3127218889216339, + "learning_rate": 9.999963361957459e-05, + "loss": 3.7897, + "step": 4886 + }, + { + "epoch": 0.30337078651685395, + "grad_norm": 0.45131710737005826, + "learning_rate": 9.999961966348493e-05, + "loss": 3.7053, + "step": 4887 + }, + { + "epoch": 0.3034328636166118, + "grad_norm": 0.30573754448056156, + "learning_rate": 9.999960544653513e-05, + "loss": 3.7435, + "step": 4888 + }, + { + "epoch": 0.30349494071636973, + "grad_norm": 0.31959556967516684, + "learning_rate": 9.999959096872525e-05, + "loss": 3.6729, + "step": 4889 + }, + { + "epoch": 0.30355701781612765, + "grad_norm": 0.2713071807457645, + "learning_rate": 9.999957623005537e-05, + "loss": 3.6974, + "step": 4890 + }, + { + "epoch": 0.3036190949158855, + "grad_norm": 0.2806458607334927, + "learning_rate": 9.999956123052556e-05, + "loss": 3.6643, + "step": 4891 + }, + { + "epoch": 0.30368117201564343, + "grad_norm": 0.3698858579928614, + "learning_rate": 9.99995459701359e-05, + "loss": 3.7196, + "step": 4892 + }, + { + "epoch": 0.30374324911540135, + "grad_norm": 0.261551575859856, + "learning_rate": 9.999953044888646e-05, + "loss": 3.7604, + "step": 4893 + }, + { + "epoch": 0.3038053262151592, + "grad_norm": 0.42009082323079355, + "learning_rate": 9.999951466677735e-05, + "loss": 3.7575, + "step": 4894 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 0.4245419621581532, + "learning_rate": 9.999949862380863e-05, + "loss": 3.7779, + "step": 4895 + }, + { + "epoch": 0.30392948041467505, + "grad_norm": 0.29581756241941987, + "learning_rate": 9.999948231998037e-05, + "loss": 3.7778, + "step": 4896 + }, + { + "epoch": 0.3039915575144329, + "grad_norm": 0.3428794718571399, + "learning_rate": 9.99994657552927e-05, + "loss": 3.8042, + "step": 4897 + }, + { + "epoch": 0.3040536346141908, + "grad_norm": 0.35454682908981033, + "learning_rate": 9.999944892974566e-05, + "loss": 3.7832, + "step": 4898 + }, + { + "epoch": 0.30411571171394874, + "grad_norm": 0.2546526574950958, + "learning_rate": 9.999943184333937e-05, + "loss": 3.8083, + "step": 4899 + }, + { + "epoch": 0.3041777888137066, + "grad_norm": 0.49132783685652404, + "learning_rate": 9.99994144960739e-05, + "loss": 3.7354, + "step": 4900 + }, + { + "epoch": 0.3042398659134645, + "grad_norm": 0.39415645737696614, + "learning_rate": 9.999939688794935e-05, + "loss": 3.7642, + "step": 4901 + }, + { + "epoch": 0.30430194301322244, + "grad_norm": 0.38280021881333437, + "learning_rate": 9.99993790189658e-05, + "loss": 3.7715, + "step": 4902 + }, + { + "epoch": 0.3043640201129803, + "grad_norm": 0.3286537496920165, + "learning_rate": 9.999936088912336e-05, + "loss": 3.7944, + "step": 4903 + }, + { + "epoch": 0.3044260972127382, + "grad_norm": 0.3053017684276491, + "learning_rate": 9.999934249842211e-05, + "loss": 3.8041, + "step": 4904 + }, + { + "epoch": 0.30448817431249614, + "grad_norm": 0.5252396320498111, + "learning_rate": 9.999932384686217e-05, + "loss": 3.7887, + "step": 4905 + }, + { + "epoch": 0.304550251412254, + "grad_norm": 0.41888295667713316, + "learning_rate": 9.999930493444361e-05, + "loss": 3.7425, + "step": 4906 + }, + { + "epoch": 0.3046123285120119, + "grad_norm": 0.2792814290504014, + "learning_rate": 9.999928576116652e-05, + "loss": 3.8528, + "step": 4907 + }, + { + "epoch": 0.30467440561176984, + "grad_norm": 0.37105610305157904, + "learning_rate": 9.999926632703104e-05, + "loss": 3.8528, + "step": 4908 + }, + { + "epoch": 0.3047364827115277, + "grad_norm": 0.3356437729712349, + "learning_rate": 9.999924663203724e-05, + "loss": 3.8024, + "step": 4909 + }, + { + "epoch": 0.3047985598112856, + "grad_norm": 0.5825351863269901, + "learning_rate": 9.999922667618524e-05, + "loss": 3.8665, + "step": 4910 + }, + { + "epoch": 0.30486063691104354, + "grad_norm": 0.371056484559813, + "learning_rate": 9.999920645947514e-05, + "loss": 3.7604, + "step": 4911 + }, + { + "epoch": 0.3049227140108014, + "grad_norm": 0.40772056347152835, + "learning_rate": 9.999918598190703e-05, + "loss": 3.7175, + "step": 4912 + }, + { + "epoch": 0.3049847911105593, + "grad_norm": 0.4564813256084353, + "learning_rate": 9.999916524348102e-05, + "loss": 3.7562, + "step": 4913 + }, + { + "epoch": 0.30504686821031723, + "grad_norm": 0.48644215422796433, + "learning_rate": 9.999914424419725e-05, + "loss": 3.8163, + "step": 4914 + }, + { + "epoch": 0.3051089453100751, + "grad_norm": 0.3459847052430928, + "learning_rate": 9.999912298405579e-05, + "loss": 3.6857, + "step": 4915 + }, + { + "epoch": 0.305171022409833, + "grad_norm": 0.49451706611336294, + "learning_rate": 9.999910146305678e-05, + "loss": 3.9039, + "step": 4916 + }, + { + "epoch": 0.30523309950959093, + "grad_norm": 0.42647314288513577, + "learning_rate": 9.999907968120032e-05, + "loss": 3.6538, + "step": 4917 + }, + { + "epoch": 0.3052951766093488, + "grad_norm": 0.46618906456433806, + "learning_rate": 9.999905763848652e-05, + "loss": 3.8489, + "step": 4918 + }, + { + "epoch": 0.3053572537091067, + "grad_norm": 0.4305872773034819, + "learning_rate": 9.99990353349155e-05, + "loss": 3.6246, + "step": 4919 + }, + { + "epoch": 0.30541933080886463, + "grad_norm": 0.4744613322379326, + "learning_rate": 9.999901277048736e-05, + "loss": 3.7359, + "step": 4920 + }, + { + "epoch": 0.3054814079086225, + "grad_norm": 0.4680116082946477, + "learning_rate": 9.999898994520224e-05, + "loss": 3.8433, + "step": 4921 + }, + { + "epoch": 0.3055434850083804, + "grad_norm": 0.452990281697632, + "learning_rate": 9.999896685906028e-05, + "loss": 3.8319, + "step": 4922 + }, + { + "epoch": 0.30560556210813833, + "grad_norm": 0.34330756802352336, + "learning_rate": 9.999894351206154e-05, + "loss": 3.6498, + "step": 4923 + }, + { + "epoch": 0.3056676392078962, + "grad_norm": 0.46094018321145336, + "learning_rate": 9.999891990420618e-05, + "loss": 3.8514, + "step": 4924 + }, + { + "epoch": 0.3057297163076541, + "grad_norm": 0.49469929050316025, + "learning_rate": 9.999889603549431e-05, + "loss": 3.8703, + "step": 4925 + }, + { + "epoch": 0.305791793407412, + "grad_norm": 0.6246720168698147, + "learning_rate": 9.999887190592607e-05, + "loss": 3.7543, + "step": 4926 + }, + { + "epoch": 0.3058538705071699, + "grad_norm": 0.4561876025708689, + "learning_rate": 9.999884751550159e-05, + "loss": 3.76, + "step": 4927 + }, + { + "epoch": 0.3059159476069278, + "grad_norm": 0.7440779051293436, + "learning_rate": 9.999882286422098e-05, + "loss": 3.7466, + "step": 4928 + }, + { + "epoch": 0.3059780247066857, + "grad_norm": 0.34697435777474056, + "learning_rate": 9.999879795208436e-05, + "loss": 3.7132, + "step": 4929 + }, + { + "epoch": 0.3060401018064436, + "grad_norm": 0.5413521226300315, + "learning_rate": 9.999877277909188e-05, + "loss": 3.7611, + "step": 4930 + }, + { + "epoch": 0.3061021789062015, + "grad_norm": 0.41041774588247754, + "learning_rate": 9.999874734524367e-05, + "loss": 3.7328, + "step": 4931 + }, + { + "epoch": 0.3061642560059594, + "grad_norm": 0.4889417529725143, + "learning_rate": 9.999872165053985e-05, + "loss": 3.7594, + "step": 4932 + }, + { + "epoch": 0.3062263331057173, + "grad_norm": 0.4374575424028528, + "learning_rate": 9.999869569498057e-05, + "loss": 3.7385, + "step": 4933 + }, + { + "epoch": 0.3062884102054752, + "grad_norm": 0.4495740894885183, + "learning_rate": 9.999866947856596e-05, + "loss": 3.7981, + "step": 4934 + }, + { + "epoch": 0.3063504873052331, + "grad_norm": 0.37744756498654225, + "learning_rate": 9.999864300129613e-05, + "loss": 3.8474, + "step": 4935 + }, + { + "epoch": 0.306412564404991, + "grad_norm": 0.4348415257091704, + "learning_rate": 9.999861626317126e-05, + "loss": 3.6563, + "step": 4936 + }, + { + "epoch": 0.3064746415047489, + "grad_norm": 0.39263596749130675, + "learning_rate": 9.999858926419148e-05, + "loss": 3.728, + "step": 4937 + }, + { + "epoch": 0.3065367186045068, + "grad_norm": 0.3944152434864934, + "learning_rate": 9.999856200435691e-05, + "loss": 3.6648, + "step": 4938 + }, + { + "epoch": 0.3065987957042647, + "grad_norm": 0.41413151867563713, + "learning_rate": 9.999853448366769e-05, + "loss": 3.7237, + "step": 4939 + }, + { + "epoch": 0.3066608728040226, + "grad_norm": 0.32004125088930946, + "learning_rate": 9.999850670212401e-05, + "loss": 3.7753, + "step": 4940 + }, + { + "epoch": 0.3067229499037805, + "grad_norm": 0.43848803192974634, + "learning_rate": 9.999847865972596e-05, + "loss": 3.8135, + "step": 4941 + }, + { + "epoch": 0.3067850270035384, + "grad_norm": 0.31390604563165736, + "learning_rate": 9.999845035647371e-05, + "loss": 3.7372, + "step": 4942 + }, + { + "epoch": 0.3068471041032963, + "grad_norm": 0.35087028809221527, + "learning_rate": 9.999842179236742e-05, + "loss": 3.8097, + "step": 4943 + }, + { + "epoch": 0.3069091812030542, + "grad_norm": 0.41915308639616855, + "learning_rate": 9.999839296740722e-05, + "loss": 3.8601, + "step": 4944 + }, + { + "epoch": 0.3069712583028121, + "grad_norm": 0.2933252581429506, + "learning_rate": 9.999836388159327e-05, + "loss": 3.711, + "step": 4945 + }, + { + "epoch": 0.30703333540257, + "grad_norm": 0.34789875230905365, + "learning_rate": 9.999833453492571e-05, + "loss": 3.8073, + "step": 4946 + }, + { + "epoch": 0.3070954125023279, + "grad_norm": 0.5249890916690174, + "learning_rate": 9.99983049274047e-05, + "loss": 3.7611, + "step": 4947 + }, + { + "epoch": 0.3071574896020858, + "grad_norm": 0.74024291021011, + "learning_rate": 9.99982750590304e-05, + "loss": 3.822, + "step": 4948 + }, + { + "epoch": 0.3072195667018437, + "grad_norm": 0.5205582528071016, + "learning_rate": 9.999824492980296e-05, + "loss": 3.7494, + "step": 4949 + }, + { + "epoch": 0.3072816438016016, + "grad_norm": 0.45962825009501757, + "learning_rate": 9.999821453972255e-05, + "loss": 3.7316, + "step": 4950 + }, + { + "epoch": 0.3073437209013595, + "grad_norm": 0.5322391333539075, + "learning_rate": 9.99981838887893e-05, + "loss": 3.8425, + "step": 4951 + }, + { + "epoch": 0.3074057980011174, + "grad_norm": 0.37181815666631923, + "learning_rate": 9.999815297700339e-05, + "loss": 3.7512, + "step": 4952 + }, + { + "epoch": 0.3074678751008753, + "grad_norm": 0.3163076058608172, + "learning_rate": 9.999812180436499e-05, + "loss": 3.8577, + "step": 4953 + }, + { + "epoch": 0.30752995220063317, + "grad_norm": 0.4587382366792669, + "learning_rate": 9.999809037087424e-05, + "loss": 3.7895, + "step": 4954 + }, + { + "epoch": 0.3075920293003911, + "grad_norm": 0.3104485530072154, + "learning_rate": 9.999805867653133e-05, + "loss": 3.7038, + "step": 4955 + }, + { + "epoch": 0.307654106400149, + "grad_norm": 0.49312660905988875, + "learning_rate": 9.99980267213364e-05, + "loss": 3.8032, + "step": 4956 + }, + { + "epoch": 0.30771618349990687, + "grad_norm": 0.3229021202420645, + "learning_rate": 9.999799450528961e-05, + "loss": 3.8105, + "step": 4957 + }, + { + "epoch": 0.3077782605996648, + "grad_norm": 0.49450555526729534, + "learning_rate": 9.999796202839118e-05, + "loss": 3.8012, + "step": 4958 + }, + { + "epoch": 0.3078403376994227, + "grad_norm": 0.41631832661751295, + "learning_rate": 9.999792929064122e-05, + "loss": 3.7234, + "step": 4959 + }, + { + "epoch": 0.30790241479918057, + "grad_norm": 0.4011719128869623, + "learning_rate": 9.999789629203993e-05, + "loss": 3.7422, + "step": 4960 + }, + { + "epoch": 0.3079644918989385, + "grad_norm": 0.37097013321931077, + "learning_rate": 9.999786303258749e-05, + "loss": 3.7411, + "step": 4961 + }, + { + "epoch": 0.3080265689986964, + "grad_norm": 0.3504214525217255, + "learning_rate": 9.999782951228404e-05, + "loss": 3.7821, + "step": 4962 + }, + { + "epoch": 0.30808864609845427, + "grad_norm": 0.5017734309847177, + "learning_rate": 9.999779573112979e-05, + "loss": 3.7893, + "step": 4963 + }, + { + "epoch": 0.3081507231982122, + "grad_norm": 0.4703364863568244, + "learning_rate": 9.99977616891249e-05, + "loss": 3.8239, + "step": 4964 + }, + { + "epoch": 0.3082128002979701, + "grad_norm": 0.2854112503948631, + "learning_rate": 9.999772738626956e-05, + "loss": 3.708, + "step": 4965 + }, + { + "epoch": 0.30827487739772796, + "grad_norm": 0.39257721402646173, + "learning_rate": 9.999769282256393e-05, + "loss": 3.8452, + "step": 4966 + }, + { + "epoch": 0.3083369544974859, + "grad_norm": 0.29085956459746226, + "learning_rate": 9.999765799800818e-05, + "loss": 3.8893, + "step": 4967 + }, + { + "epoch": 0.3083990315972438, + "grad_norm": 0.3943918756238687, + "learning_rate": 9.999762291260253e-05, + "loss": 3.7391, + "step": 4968 + }, + { + "epoch": 0.30846110869700166, + "grad_norm": 0.44939231836534377, + "learning_rate": 9.999758756634714e-05, + "loss": 3.7334, + "step": 4969 + }, + { + "epoch": 0.3085231857967596, + "grad_norm": 0.34867337201972576, + "learning_rate": 9.999755195924221e-05, + "loss": 3.7775, + "step": 4970 + }, + { + "epoch": 0.3085852628965175, + "grad_norm": 0.49162207019998855, + "learning_rate": 9.999751609128791e-05, + "loss": 3.7861, + "step": 4971 + }, + { + "epoch": 0.30864733999627536, + "grad_norm": 0.3504841478751879, + "learning_rate": 9.999747996248441e-05, + "loss": 3.7481, + "step": 4972 + }, + { + "epoch": 0.3087094170960333, + "grad_norm": 0.3394226995601331, + "learning_rate": 9.999744357283193e-05, + "loss": 3.7686, + "step": 4973 + }, + { + "epoch": 0.3087714941957912, + "grad_norm": 0.3159449475898162, + "learning_rate": 9.999740692233067e-05, + "loss": 3.7727, + "step": 4974 + }, + { + "epoch": 0.30883357129554906, + "grad_norm": 0.4195140011045693, + "learning_rate": 9.999737001098078e-05, + "loss": 3.7726, + "step": 4975 + }, + { + "epoch": 0.308895648395307, + "grad_norm": 0.3849782017630482, + "learning_rate": 9.99973328387825e-05, + "loss": 3.757, + "step": 4976 + }, + { + "epoch": 0.3089577254950649, + "grad_norm": 0.396406175390108, + "learning_rate": 9.999729540573596e-05, + "loss": 3.8059, + "step": 4977 + }, + { + "epoch": 0.30901980259482276, + "grad_norm": 0.3083614126620876, + "learning_rate": 9.999725771184141e-05, + "loss": 3.6391, + "step": 4978 + }, + { + "epoch": 0.3090818796945807, + "grad_norm": 0.2532038730609111, + "learning_rate": 9.999721975709903e-05, + "loss": 3.6611, + "step": 4979 + }, + { + "epoch": 0.3091439567943386, + "grad_norm": 0.3786967376675242, + "learning_rate": 9.999718154150901e-05, + "loss": 3.7289, + "step": 4980 + }, + { + "epoch": 0.30920603389409645, + "grad_norm": 0.33921556401876113, + "learning_rate": 9.999714306507157e-05, + "loss": 3.7125, + "step": 4981 + }, + { + "epoch": 0.3092681109938544, + "grad_norm": 0.3778619040094717, + "learning_rate": 9.999710432778689e-05, + "loss": 3.7043, + "step": 4982 + }, + { + "epoch": 0.3093301880936123, + "grad_norm": 0.4537234300920757, + "learning_rate": 9.999706532965518e-05, + "loss": 3.6492, + "step": 4983 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 0.37190425100607477, + "learning_rate": 9.999702607067665e-05, + "loss": 3.7621, + "step": 4984 + }, + { + "epoch": 0.30945434229312807, + "grad_norm": 0.2901049709251848, + "learning_rate": 9.999698655085148e-05, + "loss": 3.7267, + "step": 4985 + }, + { + "epoch": 0.309516419392886, + "grad_norm": 0.27874513294692294, + "learning_rate": 9.99969467701799e-05, + "loss": 3.6377, + "step": 4986 + }, + { + "epoch": 0.30957849649264385, + "grad_norm": 0.20762931163911108, + "learning_rate": 9.999690672866212e-05, + "loss": 3.7386, + "step": 4987 + }, + { + "epoch": 0.30964057359240177, + "grad_norm": 0.3211602993611777, + "learning_rate": 9.999686642629832e-05, + "loss": 3.7779, + "step": 4988 + }, + { + "epoch": 0.3097026506921597, + "grad_norm": 0.33900891728825877, + "learning_rate": 9.999682586308875e-05, + "loss": 3.636, + "step": 4989 + }, + { + "epoch": 0.30976472779191755, + "grad_norm": 0.25047572887119285, + "learning_rate": 9.999678503903361e-05, + "loss": 3.7927, + "step": 4990 + }, + { + "epoch": 0.30982680489167547, + "grad_norm": 0.2218977406090628, + "learning_rate": 9.99967439541331e-05, + "loss": 3.8439, + "step": 4991 + }, + { + "epoch": 0.3098888819914334, + "grad_norm": 0.2883361634880886, + "learning_rate": 9.999670260838743e-05, + "loss": 3.8326, + "step": 4992 + }, + { + "epoch": 0.30995095909119125, + "grad_norm": 0.2788833811811563, + "learning_rate": 9.999666100179683e-05, + "loss": 3.7422, + "step": 4993 + }, + { + "epoch": 0.31001303619094916, + "grad_norm": 0.2509611251180063, + "learning_rate": 9.999661913436152e-05, + "loss": 3.6878, + "step": 4994 + }, + { + "epoch": 0.3100751132907071, + "grad_norm": 0.27207972064596697, + "learning_rate": 9.999657700608169e-05, + "loss": 3.7738, + "step": 4995 + }, + { + "epoch": 0.31013719039046495, + "grad_norm": 0.2318041309626298, + "learning_rate": 9.999653461695759e-05, + "loss": 3.8167, + "step": 4996 + }, + { + "epoch": 0.31019926749022286, + "grad_norm": 0.23790700633706582, + "learning_rate": 9.999649196698944e-05, + "loss": 3.7664, + "step": 4997 + }, + { + "epoch": 0.3102613445899808, + "grad_norm": 0.2829027753663142, + "learning_rate": 9.999644905617745e-05, + "loss": 3.8334, + "step": 4998 + }, + { + "epoch": 0.31032342168973864, + "grad_norm": 0.2442080442799713, + "learning_rate": 9.999640588452184e-05, + "loss": 3.7869, + "step": 4999 + }, + { + "epoch": 0.31038549878949656, + "grad_norm": 0.35363219641721677, + "learning_rate": 9.999636245202286e-05, + "loss": 3.7497, + "step": 5000 + }, + { + "epoch": 0.3104475758892545, + "grad_norm": 0.39757260715091003, + "learning_rate": 9.999631875868072e-05, + "loss": 3.7858, + "step": 5001 + }, + { + "epoch": 0.31050965298901234, + "grad_norm": 0.28586836953724243, + "learning_rate": 9.999627480449563e-05, + "loss": 3.7464, + "step": 5002 + }, + { + "epoch": 0.31057173008877026, + "grad_norm": 0.47570979328716123, + "learning_rate": 9.999623058946786e-05, + "loss": 3.89, + "step": 5003 + }, + { + "epoch": 0.3106338071885282, + "grad_norm": 0.32948863418191704, + "learning_rate": 9.99961861135976e-05, + "loss": 3.8105, + "step": 5004 + }, + { + "epoch": 0.31069588428828604, + "grad_norm": 0.4234396740903326, + "learning_rate": 9.99961413768851e-05, + "loss": 3.6557, + "step": 5005 + }, + { + "epoch": 0.31075796138804396, + "grad_norm": 0.35767080674852025, + "learning_rate": 9.999609637933059e-05, + "loss": 3.6823, + "step": 5006 + }, + { + "epoch": 0.3108200384878019, + "grad_norm": 0.3773553905705548, + "learning_rate": 9.999605112093433e-05, + "loss": 3.6502, + "step": 5007 + }, + { + "epoch": 0.31088211558755974, + "grad_norm": 0.5903923565479634, + "learning_rate": 9.999600560169651e-05, + "loss": 3.7879, + "step": 5008 + }, + { + "epoch": 0.31094419268731766, + "grad_norm": 0.4267110908125431, + "learning_rate": 9.99959598216174e-05, + "loss": 3.6475, + "step": 5009 + }, + { + "epoch": 0.3110062697870756, + "grad_norm": 0.3906093594280878, + "learning_rate": 9.999591378069723e-05, + "loss": 3.7103, + "step": 5010 + }, + { + "epoch": 0.31106834688683344, + "grad_norm": 0.4206699007564154, + "learning_rate": 9.999586747893625e-05, + "loss": 3.765, + "step": 5011 + }, + { + "epoch": 0.31113042398659135, + "grad_norm": 0.3780421205597299, + "learning_rate": 9.999582091633469e-05, + "loss": 3.6373, + "step": 5012 + }, + { + "epoch": 0.31119250108634927, + "grad_norm": 0.4714347092607703, + "learning_rate": 9.99957740928928e-05, + "loss": 3.7783, + "step": 5013 + }, + { + "epoch": 0.31125457818610713, + "grad_norm": 0.419537479205211, + "learning_rate": 9.999572700861079e-05, + "loss": 3.7623, + "step": 5014 + }, + { + "epoch": 0.31131665528586505, + "grad_norm": 0.3399121226667653, + "learning_rate": 9.999567966348898e-05, + "loss": 3.7807, + "step": 5015 + }, + { + "epoch": 0.31137873238562297, + "grad_norm": 0.362525892977203, + "learning_rate": 9.999563205752754e-05, + "loss": 3.6506, + "step": 5016 + }, + { + "epoch": 0.31144080948538083, + "grad_norm": 0.5263168792180023, + "learning_rate": 9.999558419072677e-05, + "loss": 3.7368, + "step": 5017 + }, + { + "epoch": 0.31150288658513875, + "grad_norm": 0.6700423996946272, + "learning_rate": 9.999553606308689e-05, + "loss": 3.6882, + "step": 5018 + }, + { + "epoch": 0.31156496368489667, + "grad_norm": 0.40289498501930354, + "learning_rate": 9.999548767460816e-05, + "loss": 3.7213, + "step": 5019 + }, + { + "epoch": 0.31162704078465453, + "grad_norm": 0.3787686992977016, + "learning_rate": 9.999543902529083e-05, + "loss": 3.7394, + "step": 5020 + }, + { + "epoch": 0.31168911788441245, + "grad_norm": 0.47673027024654474, + "learning_rate": 9.999539011513516e-05, + "loss": 3.732, + "step": 5021 + }, + { + "epoch": 0.31175119498417037, + "grad_norm": 0.3482165910745818, + "learning_rate": 9.99953409441414e-05, + "loss": 3.6665, + "step": 5022 + }, + { + "epoch": 0.31181327208392823, + "grad_norm": 0.3369081358457435, + "learning_rate": 9.999529151230982e-05, + "loss": 3.745, + "step": 5023 + }, + { + "epoch": 0.31187534918368615, + "grad_norm": 0.44364074987451996, + "learning_rate": 9.999524181964067e-05, + "loss": 3.757, + "step": 5024 + }, + { + "epoch": 0.31193742628344406, + "grad_norm": 0.5356633088290106, + "learning_rate": 9.99951918661342e-05, + "loss": 3.8289, + "step": 5025 + }, + { + "epoch": 0.3119995033832019, + "grad_norm": 0.27390130081536374, + "learning_rate": 9.999514165179067e-05, + "loss": 3.7256, + "step": 5026 + }, + { + "epoch": 0.31206158048295984, + "grad_norm": 0.38233349473401634, + "learning_rate": 9.999509117661036e-05, + "loss": 3.6792, + "step": 5027 + }, + { + "epoch": 0.31212365758271776, + "grad_norm": 0.2839039779261452, + "learning_rate": 9.999504044059351e-05, + "loss": 3.8582, + "step": 5028 + }, + { + "epoch": 0.3121857346824756, + "grad_norm": 0.2700640346833976, + "learning_rate": 9.999498944374042e-05, + "loss": 3.7064, + "step": 5029 + }, + { + "epoch": 0.31224781178223354, + "grad_norm": 0.5851362981310578, + "learning_rate": 9.999493818605133e-05, + "loss": 3.7778, + "step": 5030 + }, + { + "epoch": 0.31230988888199146, + "grad_norm": 0.6142816827014417, + "learning_rate": 9.999488666752649e-05, + "loss": 3.8063, + "step": 5031 + }, + { + "epoch": 0.3123719659817493, + "grad_norm": 0.4088056397847324, + "learning_rate": 9.99948348881662e-05, + "loss": 3.7769, + "step": 5032 + }, + { + "epoch": 0.31243404308150724, + "grad_norm": 0.4455568245665288, + "learning_rate": 9.999478284797073e-05, + "loss": 3.7144, + "step": 5033 + }, + { + "epoch": 0.31249612018126516, + "grad_norm": 0.3037539141995384, + "learning_rate": 9.999473054694032e-05, + "loss": 3.7471, + "step": 5034 + }, + { + "epoch": 0.312558197281023, + "grad_norm": 0.31391762813000285, + "learning_rate": 9.999467798507527e-05, + "loss": 3.6843, + "step": 5035 + }, + { + "epoch": 0.31262027438078094, + "grad_norm": 0.42144203927974144, + "learning_rate": 9.999462516237585e-05, + "loss": 3.6925, + "step": 5036 + }, + { + "epoch": 0.31268235148053886, + "grad_norm": 0.2537883484806716, + "learning_rate": 9.999457207884235e-05, + "loss": 3.7356, + "step": 5037 + }, + { + "epoch": 0.3127444285802967, + "grad_norm": 0.29519555990752155, + "learning_rate": 9.9994518734475e-05, + "loss": 3.6459, + "step": 5038 + }, + { + "epoch": 0.31280650568005464, + "grad_norm": 0.24593337449390734, + "learning_rate": 9.999446512927415e-05, + "loss": 3.7404, + "step": 5039 + }, + { + "epoch": 0.31286858277981255, + "grad_norm": 0.3010675256945962, + "learning_rate": 9.999441126324001e-05, + "loss": 3.6345, + "step": 5040 + }, + { + "epoch": 0.3129306598795704, + "grad_norm": 0.3235919938161838, + "learning_rate": 9.99943571363729e-05, + "loss": 3.7392, + "step": 5041 + }, + { + "epoch": 0.31299273697932833, + "grad_norm": 0.6980314777129852, + "learning_rate": 9.999430274867309e-05, + "loss": 3.7147, + "step": 5042 + }, + { + "epoch": 0.31305481407908625, + "grad_norm": 0.34177171887057584, + "learning_rate": 9.999424810014086e-05, + "loss": 3.7063, + "step": 5043 + }, + { + "epoch": 0.3131168911788441, + "grad_norm": 0.5576263801097573, + "learning_rate": 9.999419319077649e-05, + "loss": 3.7859, + "step": 5044 + }, + { + "epoch": 0.31317896827860203, + "grad_norm": 0.46641122377641364, + "learning_rate": 9.999413802058031e-05, + "loss": 3.7875, + "step": 5045 + }, + { + "epoch": 0.31324104537835995, + "grad_norm": 0.35574663220820024, + "learning_rate": 9.999408258955257e-05, + "loss": 3.7457, + "step": 5046 + }, + { + "epoch": 0.3133031224781178, + "grad_norm": 0.43154062001576937, + "learning_rate": 9.999402689769354e-05, + "loss": 3.8435, + "step": 5047 + }, + { + "epoch": 0.31336519957787573, + "grad_norm": 0.32880880991988387, + "learning_rate": 9.999397094500355e-05, + "loss": 3.7751, + "step": 5048 + }, + { + "epoch": 0.31342727667763365, + "grad_norm": 0.4466757069866128, + "learning_rate": 9.999391473148288e-05, + "loss": 3.6826, + "step": 5049 + }, + { + "epoch": 0.3134893537773915, + "grad_norm": 0.42982763259495066, + "learning_rate": 9.999385825713182e-05, + "loss": 3.7437, + "step": 5050 + }, + { + "epoch": 0.31355143087714943, + "grad_norm": 0.500425574863666, + "learning_rate": 9.999380152195066e-05, + "loss": 3.7111, + "step": 5051 + }, + { + "epoch": 0.31361350797690735, + "grad_norm": 0.3251998717758822, + "learning_rate": 9.99937445259397e-05, + "loss": 3.8016, + "step": 5052 + }, + { + "epoch": 0.3136755850766652, + "grad_norm": 0.525464871630732, + "learning_rate": 9.999368726909923e-05, + "loss": 3.7109, + "step": 5053 + }, + { + "epoch": 0.3137376621764231, + "grad_norm": 0.5376497913827405, + "learning_rate": 9.999362975142957e-05, + "loss": 3.7948, + "step": 5054 + }, + { + "epoch": 0.31379973927618104, + "grad_norm": 0.40239343058696375, + "learning_rate": 9.9993571972931e-05, + "loss": 3.7831, + "step": 5055 + }, + { + "epoch": 0.3138618163759389, + "grad_norm": 0.8058293583392858, + "learning_rate": 9.999351393360383e-05, + "loss": 3.7313, + "step": 5056 + }, + { + "epoch": 0.3139238934756968, + "grad_norm": 0.6556940221319401, + "learning_rate": 9.999345563344837e-05, + "loss": 3.7748, + "step": 5057 + }, + { + "epoch": 0.3139859705754547, + "grad_norm": 0.4547128644075639, + "learning_rate": 9.999339707246492e-05, + "loss": 3.7236, + "step": 5058 + }, + { + "epoch": 0.3140480476752126, + "grad_norm": 0.36337145844115804, + "learning_rate": 9.999333825065377e-05, + "loss": 3.7728, + "step": 5059 + }, + { + "epoch": 0.3141101247749705, + "grad_norm": 0.3538425472434144, + "learning_rate": 9.999327916801525e-05, + "loss": 3.7556, + "step": 5060 + }, + { + "epoch": 0.3141722018747284, + "grad_norm": 0.48006880016287656, + "learning_rate": 9.999321982454964e-05, + "loss": 3.7292, + "step": 5061 + }, + { + "epoch": 0.3142342789744863, + "grad_norm": 0.35434894452525223, + "learning_rate": 9.999316022025727e-05, + "loss": 3.6749, + "step": 5062 + }, + { + "epoch": 0.3142963560742442, + "grad_norm": 0.33820054278640344, + "learning_rate": 9.999310035513844e-05, + "loss": 3.6926, + "step": 5063 + }, + { + "epoch": 0.3143584331740021, + "grad_norm": 0.33814232332342425, + "learning_rate": 9.999304022919347e-05, + "loss": 3.7898, + "step": 5064 + }, + { + "epoch": 0.31442051027376, + "grad_norm": 0.33262023297190757, + "learning_rate": 9.99929798424227e-05, + "loss": 3.6822, + "step": 5065 + }, + { + "epoch": 0.3144825873735179, + "grad_norm": 0.327825784495874, + "learning_rate": 9.999291919482639e-05, + "loss": 3.8007, + "step": 5066 + }, + { + "epoch": 0.3145446644732758, + "grad_norm": 0.515957869388142, + "learning_rate": 9.99928582864049e-05, + "loss": 3.7675, + "step": 5067 + }, + { + "epoch": 0.3146067415730337, + "grad_norm": 0.34592349900236163, + "learning_rate": 9.999279711715852e-05, + "loss": 3.7414, + "step": 5068 + }, + { + "epoch": 0.3146688186727916, + "grad_norm": 0.4044773049335858, + "learning_rate": 9.999273568708759e-05, + "loss": 3.6866, + "step": 5069 + }, + { + "epoch": 0.3147308957725495, + "grad_norm": 0.5531269202764767, + "learning_rate": 9.999267399619241e-05, + "loss": 3.7509, + "step": 5070 + }, + { + "epoch": 0.3147929728723074, + "grad_norm": 0.2651146726768627, + "learning_rate": 9.999261204447332e-05, + "loss": 3.8197, + "step": 5071 + }, + { + "epoch": 0.3148550499720653, + "grad_norm": 0.35067556611584666, + "learning_rate": 9.999254983193065e-05, + "loss": 3.7391, + "step": 5072 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 0.30502935940500797, + "learning_rate": 9.99924873585647e-05, + "loss": 3.6661, + "step": 5073 + }, + { + "epoch": 0.3149792041715811, + "grad_norm": 0.44382737788006726, + "learning_rate": 9.999242462437581e-05, + "loss": 3.7555, + "step": 5074 + }, + { + "epoch": 0.315041281271339, + "grad_norm": 0.32231560521814906, + "learning_rate": 9.99923616293643e-05, + "loss": 3.824, + "step": 5075 + }, + { + "epoch": 0.3151033583710969, + "grad_norm": 0.3967228815705715, + "learning_rate": 9.99922983735305e-05, + "loss": 3.725, + "step": 5076 + }, + { + "epoch": 0.3151654354708548, + "grad_norm": 0.2781050669567207, + "learning_rate": 9.999223485687474e-05, + "loss": 3.6532, + "step": 5077 + }, + { + "epoch": 0.3152275125706127, + "grad_norm": 0.354480858824072, + "learning_rate": 9.999217107939738e-05, + "loss": 3.7864, + "step": 5078 + }, + { + "epoch": 0.3152895896703706, + "grad_norm": 0.29068430018709196, + "learning_rate": 9.999210704109871e-05, + "loss": 3.6792, + "step": 5079 + }, + { + "epoch": 0.3153516667701285, + "grad_norm": 0.44267798675048287, + "learning_rate": 9.999204274197909e-05, + "loss": 3.7995, + "step": 5080 + }, + { + "epoch": 0.3154137438698864, + "grad_norm": 0.46052861262408396, + "learning_rate": 9.999197818203883e-05, + "loss": 3.8182, + "step": 5081 + }, + { + "epoch": 0.31547582096964427, + "grad_norm": 0.36634205823203597, + "learning_rate": 9.999191336127829e-05, + "loss": 3.6791, + "step": 5082 + }, + { + "epoch": 0.3155378980694022, + "grad_norm": 0.3868702678360193, + "learning_rate": 9.999184827969781e-05, + "loss": 3.8208, + "step": 5083 + }, + { + "epoch": 0.3155999751691601, + "grad_norm": 0.44092859739079854, + "learning_rate": 9.999178293729772e-05, + "loss": 3.6995, + "step": 5084 + }, + { + "epoch": 0.31566205226891797, + "grad_norm": 0.2648100834381279, + "learning_rate": 9.999171733407835e-05, + "loss": 3.7182, + "step": 5085 + }, + { + "epoch": 0.3157241293686759, + "grad_norm": 0.2689419260691392, + "learning_rate": 9.999165147004008e-05, + "loss": 3.7506, + "step": 5086 + }, + { + "epoch": 0.3157862064684338, + "grad_norm": 0.33707471693686697, + "learning_rate": 9.99915853451832e-05, + "loss": 3.6428, + "step": 5087 + }, + { + "epoch": 0.31584828356819167, + "grad_norm": 0.3807587319447966, + "learning_rate": 9.99915189595081e-05, + "loss": 3.7891, + "step": 5088 + }, + { + "epoch": 0.3159103606679496, + "grad_norm": 0.40459712945232834, + "learning_rate": 9.99914523130151e-05, + "loss": 3.6931, + "step": 5089 + }, + { + "epoch": 0.3159724377677075, + "grad_norm": 0.4781766352478106, + "learning_rate": 9.999138540570457e-05, + "loss": 3.7196, + "step": 5090 + }, + { + "epoch": 0.31603451486746537, + "grad_norm": 0.4168198204696131, + "learning_rate": 9.999131823757683e-05, + "loss": 3.7498, + "step": 5091 + }, + { + "epoch": 0.3160965919672233, + "grad_norm": 0.4534557713745245, + "learning_rate": 9.999125080863227e-05, + "loss": 3.6915, + "step": 5092 + }, + { + "epoch": 0.3161586690669812, + "grad_norm": 0.43119391892158054, + "learning_rate": 9.999118311887121e-05, + "loss": 3.6414, + "step": 5093 + }, + { + "epoch": 0.31622074616673906, + "grad_norm": 0.47707291144713615, + "learning_rate": 9.999111516829401e-05, + "loss": 3.7986, + "step": 5094 + }, + { + "epoch": 0.316282823266497, + "grad_norm": 0.326655400673501, + "learning_rate": 9.999104695690101e-05, + "loss": 3.6769, + "step": 5095 + }, + { + "epoch": 0.3163449003662549, + "grad_norm": 0.47061507798433894, + "learning_rate": 9.99909784846926e-05, + "loss": 3.7137, + "step": 5096 + }, + { + "epoch": 0.31640697746601276, + "grad_norm": 0.44529213775124876, + "learning_rate": 9.999090975166911e-05, + "loss": 3.7693, + "step": 5097 + }, + { + "epoch": 0.3164690545657707, + "grad_norm": 0.37900895761860187, + "learning_rate": 9.999084075783092e-05, + "loss": 3.7203, + "step": 5098 + }, + { + "epoch": 0.3165311316655286, + "grad_norm": 0.5680775418828754, + "learning_rate": 9.999077150317837e-05, + "loss": 3.8149, + "step": 5099 + }, + { + "epoch": 0.31659320876528646, + "grad_norm": 0.47882927716291307, + "learning_rate": 9.999070198771184e-05, + "loss": 3.8021, + "step": 5100 + }, + { + "epoch": 0.3166552858650444, + "grad_norm": 0.3231458261002219, + "learning_rate": 9.999063221143165e-05, + "loss": 3.761, + "step": 5101 + }, + { + "epoch": 0.3167173629648023, + "grad_norm": 0.7630870950893647, + "learning_rate": 9.999056217433823e-05, + "loss": 3.6989, + "step": 5102 + }, + { + "epoch": 0.31677944006456016, + "grad_norm": 0.5379275107774246, + "learning_rate": 9.99904918764319e-05, + "loss": 3.7621, + "step": 5103 + }, + { + "epoch": 0.3168415171643181, + "grad_norm": 0.37596730802961964, + "learning_rate": 9.999042131771304e-05, + "loss": 3.7396, + "step": 5104 + }, + { + "epoch": 0.316903594264076, + "grad_norm": 0.4373356476387908, + "learning_rate": 9.999035049818202e-05, + "loss": 3.7527, + "step": 5105 + }, + { + "epoch": 0.31696567136383386, + "grad_norm": 0.4452543889996187, + "learning_rate": 9.99902794178392e-05, + "loss": 3.8067, + "step": 5106 + }, + { + "epoch": 0.3170277484635918, + "grad_norm": 0.41482624656403966, + "learning_rate": 9.999020807668496e-05, + "loss": 3.6941, + "step": 5107 + }, + { + "epoch": 0.3170898255633497, + "grad_norm": 0.6275352793727027, + "learning_rate": 9.999013647471966e-05, + "loss": 3.8131, + "step": 5108 + }, + { + "epoch": 0.31715190266310755, + "grad_norm": 0.5284994107372135, + "learning_rate": 9.99900646119437e-05, + "loss": 3.7249, + "step": 5109 + }, + { + "epoch": 0.3172139797628655, + "grad_norm": 0.46065578903859394, + "learning_rate": 9.998999248835742e-05, + "loss": 3.6257, + "step": 5110 + }, + { + "epoch": 0.3172760568626234, + "grad_norm": 0.3921732736568131, + "learning_rate": 9.998992010396122e-05, + "loss": 3.7132, + "step": 5111 + }, + { + "epoch": 0.31733813396238125, + "grad_norm": 0.31615100973668764, + "learning_rate": 9.998984745875548e-05, + "loss": 3.6975, + "step": 5112 + }, + { + "epoch": 0.31740021106213917, + "grad_norm": 0.3962247026352709, + "learning_rate": 9.998977455274056e-05, + "loss": 3.6852, + "step": 5113 + }, + { + "epoch": 0.3174622881618971, + "grad_norm": 0.3725354948669974, + "learning_rate": 9.998970138591686e-05, + "loss": 3.7514, + "step": 5114 + }, + { + "epoch": 0.31752436526165495, + "grad_norm": 0.28555051120264097, + "learning_rate": 9.998962795828473e-05, + "loss": 3.7298, + "step": 5115 + }, + { + "epoch": 0.31758644236141287, + "grad_norm": 0.3608493210060293, + "learning_rate": 9.99895542698446e-05, + "loss": 3.6737, + "step": 5116 + }, + { + "epoch": 0.3176485194611708, + "grad_norm": 0.4626402519832933, + "learning_rate": 9.998948032059682e-05, + "loss": 3.6506, + "step": 5117 + }, + { + "epoch": 0.31771059656092865, + "grad_norm": 0.48793310342407925, + "learning_rate": 9.998940611054178e-05, + "loss": 3.6916, + "step": 5118 + }, + { + "epoch": 0.31777267366068657, + "grad_norm": 0.5998933598005415, + "learning_rate": 9.998933163967988e-05, + "loss": 3.7228, + "step": 5119 + }, + { + "epoch": 0.3178347507604445, + "grad_norm": 0.47281439161205907, + "learning_rate": 9.998925690801151e-05, + "loss": 3.6963, + "step": 5120 + }, + { + "epoch": 0.31789682786020235, + "grad_norm": 0.5429573332320407, + "learning_rate": 9.998918191553703e-05, + "loss": 3.7803, + "step": 5121 + }, + { + "epoch": 0.31795890495996026, + "grad_norm": 0.4851029524959643, + "learning_rate": 9.998910666225687e-05, + "loss": 3.8088, + "step": 5122 + }, + { + "epoch": 0.3180209820597182, + "grad_norm": 0.63930093192676, + "learning_rate": 9.99890311481714e-05, + "loss": 3.7239, + "step": 5123 + }, + { + "epoch": 0.31808305915947604, + "grad_norm": 0.37260916630383206, + "learning_rate": 9.998895537328102e-05, + "loss": 3.7208, + "step": 5124 + }, + { + "epoch": 0.31814513625923396, + "grad_norm": 0.630193636594126, + "learning_rate": 9.998887933758612e-05, + "loss": 3.7874, + "step": 5125 + }, + { + "epoch": 0.3182072133589919, + "grad_norm": 0.4944112134852742, + "learning_rate": 9.998880304108711e-05, + "loss": 3.6721, + "step": 5126 + }, + { + "epoch": 0.31826929045874974, + "grad_norm": 0.4816427387027195, + "learning_rate": 9.998872648378438e-05, + "loss": 3.7294, + "step": 5127 + }, + { + "epoch": 0.31833136755850766, + "grad_norm": 0.5618212101089598, + "learning_rate": 9.998864966567833e-05, + "loss": 3.6998, + "step": 5128 + }, + { + "epoch": 0.3183934446582656, + "grad_norm": 0.889349576328001, + "learning_rate": 9.998857258676935e-05, + "loss": 3.6055, + "step": 5129 + }, + { + "epoch": 0.31845552175802344, + "grad_norm": 0.5164624840017039, + "learning_rate": 9.998849524705785e-05, + "loss": 3.6059, + "step": 5130 + }, + { + "epoch": 0.31851759885778136, + "grad_norm": 0.4382072760293036, + "learning_rate": 9.998841764654424e-05, + "loss": 3.8143, + "step": 5131 + }, + { + "epoch": 0.3185796759575393, + "grad_norm": 0.2911361982199126, + "learning_rate": 9.998833978522893e-05, + "loss": 3.651, + "step": 5132 + }, + { + "epoch": 0.31864175305729714, + "grad_norm": 0.4794029572353096, + "learning_rate": 9.99882616631123e-05, + "loss": 3.7118, + "step": 5133 + }, + { + "epoch": 0.31870383015705506, + "grad_norm": 0.6111112947415548, + "learning_rate": 9.998818328019476e-05, + "loss": 3.7333, + "step": 5134 + }, + { + "epoch": 0.318765907256813, + "grad_norm": 0.43776318603886244, + "learning_rate": 9.998810463647676e-05, + "loss": 3.7149, + "step": 5135 + }, + { + "epoch": 0.31882798435657084, + "grad_norm": 0.42760258216270064, + "learning_rate": 9.998802573195868e-05, + "loss": 3.77, + "step": 5136 + }, + { + "epoch": 0.31889006145632875, + "grad_norm": 0.3305679388870028, + "learning_rate": 9.998794656664095e-05, + "loss": 3.6119, + "step": 5137 + }, + { + "epoch": 0.3189521385560867, + "grad_norm": 0.3099989946410581, + "learning_rate": 9.998786714052393e-05, + "loss": 3.7341, + "step": 5138 + }, + { + "epoch": 0.31901421565584454, + "grad_norm": 0.4424753660683627, + "learning_rate": 9.99877874536081e-05, + "loss": 3.6567, + "step": 5139 + }, + { + "epoch": 0.31907629275560245, + "grad_norm": 0.34407030701333563, + "learning_rate": 9.998770750589384e-05, + "loss": 3.7578, + "step": 5140 + }, + { + "epoch": 0.31913836985536037, + "grad_norm": 0.2827287054534344, + "learning_rate": 9.998762729738156e-05, + "loss": 3.7074, + "step": 5141 + }, + { + "epoch": 0.31920044695511823, + "grad_norm": 0.34509868906872543, + "learning_rate": 9.998754682807172e-05, + "loss": 3.6851, + "step": 5142 + }, + { + "epoch": 0.31926252405487615, + "grad_norm": 0.46046263269685095, + "learning_rate": 9.99874660979647e-05, + "loss": 3.6368, + "step": 5143 + }, + { + "epoch": 0.31932460115463407, + "grad_norm": 0.3752153890660484, + "learning_rate": 9.998738510706094e-05, + "loss": 3.6939, + "step": 5144 + }, + { + "epoch": 0.31938667825439193, + "grad_norm": 0.308067670096541, + "learning_rate": 9.998730385536084e-05, + "loss": 3.718, + "step": 5145 + }, + { + "epoch": 0.31944875535414985, + "grad_norm": 0.3575615323404849, + "learning_rate": 9.998722234286487e-05, + "loss": 3.6766, + "step": 5146 + }, + { + "epoch": 0.31951083245390777, + "grad_norm": 0.40081840551116205, + "learning_rate": 9.998714056957339e-05, + "loss": 3.6516, + "step": 5147 + }, + { + "epoch": 0.31957290955366563, + "grad_norm": 0.3243836247945789, + "learning_rate": 9.998705853548689e-05, + "loss": 3.7785, + "step": 5148 + }, + { + "epoch": 0.31963498665342355, + "grad_norm": 0.38122490190535613, + "learning_rate": 9.998697624060576e-05, + "loss": 3.7565, + "step": 5149 + }, + { + "epoch": 0.31969706375318147, + "grad_norm": 0.3526688545519892, + "learning_rate": 9.998689368493044e-05, + "loss": 3.7756, + "step": 5150 + }, + { + "epoch": 0.3197591408529393, + "grad_norm": 0.4021422798385629, + "learning_rate": 9.998681086846137e-05, + "loss": 3.7501, + "step": 5151 + }, + { + "epoch": 0.31982121795269725, + "grad_norm": 0.38239331552786227, + "learning_rate": 9.998672779119896e-05, + "loss": 3.6484, + "step": 5152 + }, + { + "epoch": 0.31988329505245516, + "grad_norm": 0.24199942238547617, + "learning_rate": 9.998664445314367e-05, + "loss": 3.6466, + "step": 5153 + }, + { + "epoch": 0.319945372152213, + "grad_norm": 0.37794376317507883, + "learning_rate": 9.998656085429591e-05, + "loss": 3.8106, + "step": 5154 + }, + { + "epoch": 0.32000744925197094, + "grad_norm": 0.2831741449721542, + "learning_rate": 9.998647699465614e-05, + "loss": 3.7514, + "step": 5155 + }, + { + "epoch": 0.32006952635172886, + "grad_norm": 0.287247643944164, + "learning_rate": 9.998639287422477e-05, + "loss": 3.7077, + "step": 5156 + }, + { + "epoch": 0.3201316034514867, + "grad_norm": 0.34949717506063793, + "learning_rate": 9.998630849300226e-05, + "loss": 3.7663, + "step": 5157 + }, + { + "epoch": 0.32019368055124464, + "grad_norm": 0.2642472382746715, + "learning_rate": 9.998622385098903e-05, + "loss": 3.668, + "step": 5158 + }, + { + "epoch": 0.32025575765100256, + "grad_norm": 0.2891642288390158, + "learning_rate": 9.998613894818556e-05, + "loss": 3.7418, + "step": 5159 + }, + { + "epoch": 0.3203178347507604, + "grad_norm": 0.28939851356367147, + "learning_rate": 9.998605378459226e-05, + "loss": 3.6899, + "step": 5160 + }, + { + "epoch": 0.32037991185051834, + "grad_norm": 0.3916374817925059, + "learning_rate": 9.998596836020958e-05, + "loss": 3.6754, + "step": 5161 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 0.3410311705201982, + "learning_rate": 9.998588267503797e-05, + "loss": 3.6848, + "step": 5162 + }, + { + "epoch": 0.3205040660500341, + "grad_norm": 0.2782514654302609, + "learning_rate": 9.998579672907788e-05, + "loss": 3.6698, + "step": 5163 + }, + { + "epoch": 0.32056614314979204, + "grad_norm": 0.23010289523554994, + "learning_rate": 9.998571052232975e-05, + "loss": 3.7769, + "step": 5164 + }, + { + "epoch": 0.32062822024954996, + "grad_norm": 0.3760055728966051, + "learning_rate": 9.998562405479403e-05, + "loss": 3.7077, + "step": 5165 + }, + { + "epoch": 0.3206902973493078, + "grad_norm": 0.23451062887571864, + "learning_rate": 9.998553732647119e-05, + "loss": 3.5668, + "step": 5166 + }, + { + "epoch": 0.32075237444906574, + "grad_norm": 0.41299679202341744, + "learning_rate": 9.998545033736164e-05, + "loss": 3.7116, + "step": 5167 + }, + { + "epoch": 0.32081445154882365, + "grad_norm": 0.2900982035441236, + "learning_rate": 9.99853630874659e-05, + "loss": 3.7562, + "step": 5168 + }, + { + "epoch": 0.3208765286485815, + "grad_norm": 0.46525318482470646, + "learning_rate": 9.998527557678435e-05, + "loss": 3.7321, + "step": 5169 + }, + { + "epoch": 0.32093860574833943, + "grad_norm": 0.2600243506124284, + "learning_rate": 9.99851878053175e-05, + "loss": 3.676, + "step": 5170 + }, + { + "epoch": 0.32100068284809735, + "grad_norm": 0.2600602221894571, + "learning_rate": 9.998509977306578e-05, + "loss": 3.6921, + "step": 5171 + }, + { + "epoch": 0.3210627599478552, + "grad_norm": 0.4385348750635845, + "learning_rate": 9.998501148002967e-05, + "loss": 3.6632, + "step": 5172 + }, + { + "epoch": 0.32112483704761313, + "grad_norm": 0.3998762594992003, + "learning_rate": 9.998492292620962e-05, + "loss": 3.7401, + "step": 5173 + }, + { + "epoch": 0.32118691414737105, + "grad_norm": 0.5771139944678402, + "learning_rate": 9.998483411160609e-05, + "loss": 3.7651, + "step": 5174 + }, + { + "epoch": 0.3212489912471289, + "grad_norm": 0.3376759745861892, + "learning_rate": 9.998474503621954e-05, + "loss": 3.7756, + "step": 5175 + }, + { + "epoch": 0.32131106834688683, + "grad_norm": 0.4956439374454229, + "learning_rate": 9.998465570005045e-05, + "loss": 3.7355, + "step": 5176 + }, + { + "epoch": 0.32137314544664475, + "grad_norm": 0.27685097450228136, + "learning_rate": 9.998456610309928e-05, + "loss": 3.686, + "step": 5177 + }, + { + "epoch": 0.3214352225464026, + "grad_norm": 0.45548619650769373, + "learning_rate": 9.998447624536649e-05, + "loss": 3.7436, + "step": 5178 + }, + { + "epoch": 0.32149729964616053, + "grad_norm": 0.35052023141742206, + "learning_rate": 9.998438612685255e-05, + "loss": 3.6854, + "step": 5179 + }, + { + "epoch": 0.32155937674591845, + "grad_norm": 0.3395693976033304, + "learning_rate": 9.998429574755794e-05, + "loss": 3.7197, + "step": 5180 + }, + { + "epoch": 0.3216214538456763, + "grad_norm": 0.2399119252284816, + "learning_rate": 9.998420510748312e-05, + "loss": 3.5692, + "step": 5181 + }, + { + "epoch": 0.3216835309454342, + "grad_norm": 0.4122392892585456, + "learning_rate": 9.998411420662857e-05, + "loss": 3.6179, + "step": 5182 + }, + { + "epoch": 0.32174560804519214, + "grad_norm": 0.2807944104608297, + "learning_rate": 9.998402304499474e-05, + "loss": 3.5752, + "step": 5183 + }, + { + "epoch": 0.32180768514495, + "grad_norm": 0.26604180732301136, + "learning_rate": 9.998393162258216e-05, + "loss": 3.6463, + "step": 5184 + }, + { + "epoch": 0.3218697622447079, + "grad_norm": 0.3589137060697092, + "learning_rate": 9.998383993939127e-05, + "loss": 3.6585, + "step": 5185 + }, + { + "epoch": 0.32193183934446584, + "grad_norm": 0.2503221046138584, + "learning_rate": 9.998374799542254e-05, + "loss": 3.6726, + "step": 5186 + }, + { + "epoch": 0.3219939164442237, + "grad_norm": 0.3037624131519471, + "learning_rate": 9.998365579067648e-05, + "loss": 3.7867, + "step": 5187 + }, + { + "epoch": 0.3220559935439816, + "grad_norm": 0.2759930939795144, + "learning_rate": 9.998356332515353e-05, + "loss": 3.7019, + "step": 5188 + }, + { + "epoch": 0.32211807064373954, + "grad_norm": 0.27937178507497595, + "learning_rate": 9.998347059885422e-05, + "loss": 3.7309, + "step": 5189 + }, + { + "epoch": 0.3221801477434974, + "grad_norm": 0.2566116426961514, + "learning_rate": 9.9983377611779e-05, + "loss": 3.6963, + "step": 5190 + }, + { + "epoch": 0.3222422248432553, + "grad_norm": 0.33160434654775633, + "learning_rate": 9.998328436392837e-05, + "loss": 3.7871, + "step": 5191 + }, + { + "epoch": 0.32230430194301324, + "grad_norm": 0.3092115953262492, + "learning_rate": 9.998319085530279e-05, + "loss": 3.6886, + "step": 5192 + }, + { + "epoch": 0.3223663790427711, + "grad_norm": 0.25824953374885207, + "learning_rate": 9.998309708590279e-05, + "loss": 3.7134, + "step": 5193 + }, + { + "epoch": 0.322428456142529, + "grad_norm": 0.290209646185493, + "learning_rate": 9.998300305572883e-05, + "loss": 3.7311, + "step": 5194 + }, + { + "epoch": 0.32249053324228694, + "grad_norm": 0.26049906221887265, + "learning_rate": 9.998290876478142e-05, + "loss": 3.8181, + "step": 5195 + }, + { + "epoch": 0.3225526103420448, + "grad_norm": 0.2808477253414454, + "learning_rate": 9.998281421306103e-05, + "loss": 3.761, + "step": 5196 + }, + { + "epoch": 0.3226146874418027, + "grad_norm": 0.24069535601644393, + "learning_rate": 9.998271940056818e-05, + "loss": 3.677, + "step": 5197 + }, + { + "epoch": 0.32267676454156063, + "grad_norm": 0.2711149625146897, + "learning_rate": 9.998262432730334e-05, + "loss": 3.7087, + "step": 5198 + }, + { + "epoch": 0.3227388416413185, + "grad_norm": 0.25880709891175097, + "learning_rate": 9.9982528993267e-05, + "loss": 3.6883, + "step": 5199 + }, + { + "epoch": 0.3228009187410764, + "grad_norm": 0.3147252250189389, + "learning_rate": 9.998243339845969e-05, + "loss": 3.7358, + "step": 5200 + }, + { + "epoch": 0.32286299584083433, + "grad_norm": 0.28720834544327667, + "learning_rate": 9.998233754288189e-05, + "loss": 3.667, + "step": 5201 + }, + { + "epoch": 0.3229250729405922, + "grad_norm": 0.27178384195359195, + "learning_rate": 9.99822414265341e-05, + "loss": 3.5571, + "step": 5202 + }, + { + "epoch": 0.3229871500403501, + "grad_norm": 0.36800676623245193, + "learning_rate": 9.99821450494168e-05, + "loss": 3.6642, + "step": 5203 + }, + { + "epoch": 0.32304922714010803, + "grad_norm": 0.2673100987741945, + "learning_rate": 9.998204841153055e-05, + "loss": 3.6695, + "step": 5204 + }, + { + "epoch": 0.3231113042398659, + "grad_norm": 0.31880035750585634, + "learning_rate": 9.998195151287582e-05, + "loss": 3.6859, + "step": 5205 + }, + { + "epoch": 0.3231733813396238, + "grad_norm": 0.6337663502436859, + "learning_rate": 9.998185435345309e-05, + "loss": 3.6979, + "step": 5206 + }, + { + "epoch": 0.32323545843938173, + "grad_norm": 0.38478423330166206, + "learning_rate": 9.99817569332629e-05, + "loss": 3.8074, + "step": 5207 + }, + { + "epoch": 0.3232975355391396, + "grad_norm": 0.6109947050976496, + "learning_rate": 9.998165925230576e-05, + "loss": 3.6503, + "step": 5208 + }, + { + "epoch": 0.3233596126388975, + "grad_norm": 0.5083289821105003, + "learning_rate": 9.998156131058217e-05, + "loss": 3.776, + "step": 5209 + }, + { + "epoch": 0.3234216897386554, + "grad_norm": 0.48303506468121543, + "learning_rate": 9.998146310809265e-05, + "loss": 3.6521, + "step": 5210 + }, + { + "epoch": 0.3234837668384133, + "grad_norm": 0.4390837240010438, + "learning_rate": 9.998136464483767e-05, + "loss": 3.646, + "step": 5211 + }, + { + "epoch": 0.3235458439381712, + "grad_norm": 0.30178474873930866, + "learning_rate": 9.99812659208178e-05, + "loss": 3.6428, + "step": 5212 + }, + { + "epoch": 0.3236079210379291, + "grad_norm": 0.5731346120413117, + "learning_rate": 9.998116693603354e-05, + "loss": 3.7209, + "step": 5213 + }, + { + "epoch": 0.323669998137687, + "grad_norm": 0.45586742665323343, + "learning_rate": 9.998106769048538e-05, + "loss": 3.5708, + "step": 5214 + }, + { + "epoch": 0.3237320752374449, + "grad_norm": 0.3872887812516019, + "learning_rate": 9.998096818417387e-05, + "loss": 3.6818, + "step": 5215 + }, + { + "epoch": 0.3237941523372028, + "grad_norm": 0.3360551916267421, + "learning_rate": 9.99808684170995e-05, + "loss": 3.7298, + "step": 5216 + }, + { + "epoch": 0.3238562294369607, + "grad_norm": 0.40519511373704964, + "learning_rate": 9.998076838926282e-05, + "loss": 3.6684, + "step": 5217 + }, + { + "epoch": 0.3239183065367186, + "grad_norm": 0.29532310527347744, + "learning_rate": 9.998066810066434e-05, + "loss": 3.7118, + "step": 5218 + }, + { + "epoch": 0.3239803836364765, + "grad_norm": 0.5794377758168245, + "learning_rate": 9.998056755130458e-05, + "loss": 3.7255, + "step": 5219 + }, + { + "epoch": 0.3240424607362344, + "grad_norm": 0.5423653907162013, + "learning_rate": 9.998046674118406e-05, + "loss": 3.6694, + "step": 5220 + }, + { + "epoch": 0.3241045378359923, + "grad_norm": 0.8577998203806602, + "learning_rate": 9.998036567030331e-05, + "loss": 3.7619, + "step": 5221 + }, + { + "epoch": 0.3241666149357502, + "grad_norm": 0.5332115927222973, + "learning_rate": 9.998026433866287e-05, + "loss": 3.6911, + "step": 5222 + }, + { + "epoch": 0.3242286920355081, + "grad_norm": 0.953732843907193, + "learning_rate": 9.998016274626324e-05, + "loss": 3.7626, + "step": 5223 + }, + { + "epoch": 0.324290769135266, + "grad_norm": 0.7124632508019527, + "learning_rate": 9.998006089310499e-05, + "loss": 3.6547, + "step": 5224 + }, + { + "epoch": 0.3243528462350239, + "grad_norm": 0.4783434188202646, + "learning_rate": 9.997995877918862e-05, + "loss": 3.7661, + "step": 5225 + }, + { + "epoch": 0.3244149233347818, + "grad_norm": 0.4682731428135093, + "learning_rate": 9.997985640451466e-05, + "loss": 3.7321, + "step": 5226 + }, + { + "epoch": 0.3244770004345397, + "grad_norm": 0.4032810552301716, + "learning_rate": 9.997975376908368e-05, + "loss": 3.6982, + "step": 5227 + }, + { + "epoch": 0.3245390775342976, + "grad_norm": 0.5661914286520853, + "learning_rate": 9.997965087289617e-05, + "loss": 3.7999, + "step": 5228 + }, + { + "epoch": 0.3246011546340555, + "grad_norm": 0.39861916159358246, + "learning_rate": 9.997954771595271e-05, + "loss": 3.7382, + "step": 5229 + }, + { + "epoch": 0.3246632317338134, + "grad_norm": 0.5453486531687364, + "learning_rate": 9.997944429825379e-05, + "loss": 3.6877, + "step": 5230 + }, + { + "epoch": 0.3247253088335713, + "grad_norm": 0.5674699201048194, + "learning_rate": 9.997934061979998e-05, + "loss": 3.7636, + "step": 5231 + }, + { + "epoch": 0.3247873859333292, + "grad_norm": 0.390540075607312, + "learning_rate": 9.997923668059183e-05, + "loss": 3.7336, + "step": 5232 + }, + { + "epoch": 0.3248494630330871, + "grad_norm": 0.370945870900366, + "learning_rate": 9.997913248062987e-05, + "loss": 3.6804, + "step": 5233 + }, + { + "epoch": 0.324911540132845, + "grad_norm": 0.35629072625786284, + "learning_rate": 9.997902801991463e-05, + "loss": 3.7542, + "step": 5234 + }, + { + "epoch": 0.3249736172326029, + "grad_norm": 0.3037151078799536, + "learning_rate": 9.997892329844669e-05, + "loss": 3.609, + "step": 5235 + }, + { + "epoch": 0.3250356943323608, + "grad_norm": 0.3433018928123961, + "learning_rate": 9.997881831622656e-05, + "loss": 3.6204, + "step": 5236 + }, + { + "epoch": 0.3250977714321187, + "grad_norm": 0.35385774949559873, + "learning_rate": 9.997871307325479e-05, + "loss": 3.741, + "step": 5237 + }, + { + "epoch": 0.32515984853187657, + "grad_norm": 0.3576246843119951, + "learning_rate": 9.997860756953197e-05, + "loss": 3.6132, + "step": 5238 + }, + { + "epoch": 0.3252219256316345, + "grad_norm": 0.2942914730474476, + "learning_rate": 9.99785018050586e-05, + "loss": 3.7112, + "step": 5239 + }, + { + "epoch": 0.3252840027313924, + "grad_norm": 0.2664143949812934, + "learning_rate": 9.997839577983525e-05, + "loss": 3.6994, + "step": 5240 + }, + { + "epoch": 0.32534607983115027, + "grad_norm": 0.36844302670747764, + "learning_rate": 9.99782894938625e-05, + "loss": 3.7141, + "step": 5241 + }, + { + "epoch": 0.3254081569309082, + "grad_norm": 0.2581417841915595, + "learning_rate": 9.997818294714086e-05, + "loss": 3.6992, + "step": 5242 + }, + { + "epoch": 0.3254702340306661, + "grad_norm": 0.2932479545216227, + "learning_rate": 9.997807613967091e-05, + "loss": 3.684, + "step": 5243 + }, + { + "epoch": 0.32553231113042397, + "grad_norm": 0.2521166898921037, + "learning_rate": 9.99779690714532e-05, + "loss": 3.6426, + "step": 5244 + }, + { + "epoch": 0.3255943882301819, + "grad_norm": 0.2580774997159244, + "learning_rate": 9.997786174248832e-05, + "loss": 3.6585, + "step": 5245 + }, + { + "epoch": 0.3256564653299398, + "grad_norm": 0.2531827448688176, + "learning_rate": 9.997775415277678e-05, + "loss": 3.6162, + "step": 5246 + }, + { + "epoch": 0.32571854242969767, + "grad_norm": 0.4022077867716368, + "learning_rate": 9.997764630231917e-05, + "loss": 3.7435, + "step": 5247 + }, + { + "epoch": 0.3257806195294556, + "grad_norm": 0.4384297641172284, + "learning_rate": 9.997753819111604e-05, + "loss": 3.7386, + "step": 5248 + }, + { + "epoch": 0.3258426966292135, + "grad_norm": 0.26835198815630823, + "learning_rate": 9.997742981916795e-05, + "loss": 3.6972, + "step": 5249 + }, + { + "epoch": 0.32590477372897136, + "grad_norm": 0.5126524989099298, + "learning_rate": 9.99773211864755e-05, + "loss": 3.6208, + "step": 5250 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 0.7563413911563672, + "learning_rate": 9.997721229303922e-05, + "loss": 3.7048, + "step": 5251 + }, + { + "epoch": 0.3260289279284872, + "grad_norm": 0.3858976026908864, + "learning_rate": 9.997710313885971e-05, + "loss": 3.6466, + "step": 5252 + }, + { + "epoch": 0.32609100502824506, + "grad_norm": 0.585596519437202, + "learning_rate": 9.99769937239375e-05, + "loss": 3.5881, + "step": 5253 + }, + { + "epoch": 0.326153082128003, + "grad_norm": 0.4396526985521208, + "learning_rate": 9.997688404827319e-05, + "loss": 3.6561, + "step": 5254 + }, + { + "epoch": 0.3262151592277609, + "grad_norm": 0.38410835476582966, + "learning_rate": 9.997677411186733e-05, + "loss": 3.7455, + "step": 5255 + }, + { + "epoch": 0.32627723632751876, + "grad_norm": 0.3366735401470894, + "learning_rate": 9.997666391472052e-05, + "loss": 3.7231, + "step": 5256 + }, + { + "epoch": 0.3263393134272767, + "grad_norm": 0.3424343921132539, + "learning_rate": 9.997655345683332e-05, + "loss": 3.7748, + "step": 5257 + }, + { + "epoch": 0.3264013905270346, + "grad_norm": 0.7978587716382725, + "learning_rate": 9.99764427382063e-05, + "loss": 3.6805, + "step": 5258 + }, + { + "epoch": 0.32646346762679246, + "grad_norm": 0.6219012690991071, + "learning_rate": 9.997633175884006e-05, + "loss": 3.6772, + "step": 5259 + }, + { + "epoch": 0.3265255447265504, + "grad_norm": 0.35752427348847376, + "learning_rate": 9.997622051873517e-05, + "loss": 3.8036, + "step": 5260 + }, + { + "epoch": 0.3265876218263083, + "grad_norm": 0.45451023211885, + "learning_rate": 9.997610901789219e-05, + "loss": 3.7145, + "step": 5261 + }, + { + "epoch": 0.32664969892606616, + "grad_norm": 0.3619477809591364, + "learning_rate": 9.997599725631173e-05, + "loss": 3.722, + "step": 5262 + }, + { + "epoch": 0.3267117760258241, + "grad_norm": 0.35354639414719485, + "learning_rate": 9.997588523399435e-05, + "loss": 3.697, + "step": 5263 + }, + { + "epoch": 0.326773853125582, + "grad_norm": 0.3012415472601744, + "learning_rate": 9.997577295094064e-05, + "loss": 3.6379, + "step": 5264 + }, + { + "epoch": 0.32683593022533985, + "grad_norm": 0.2573729293437958, + "learning_rate": 9.99756604071512e-05, + "loss": 3.699, + "step": 5265 + }, + { + "epoch": 0.3268980073250978, + "grad_norm": 0.39908220453090076, + "learning_rate": 9.997554760262659e-05, + "loss": 3.7125, + "step": 5266 + }, + { + "epoch": 0.3269600844248557, + "grad_norm": 0.38575669273434743, + "learning_rate": 9.997543453736744e-05, + "loss": 3.7049, + "step": 5267 + }, + { + "epoch": 0.32702216152461355, + "grad_norm": 0.5673461219361904, + "learning_rate": 9.99753212113743e-05, + "loss": 3.6521, + "step": 5268 + }, + { + "epoch": 0.32708423862437147, + "grad_norm": 0.5502062562525103, + "learning_rate": 9.99752076246478e-05, + "loss": 3.6362, + "step": 5269 + }, + { + "epoch": 0.3271463157241294, + "grad_norm": 0.49079010912633514, + "learning_rate": 9.997509377718849e-05, + "loss": 3.6024, + "step": 5270 + }, + { + "epoch": 0.32720839282388725, + "grad_norm": 0.3430677926990825, + "learning_rate": 9.997497966899698e-05, + "loss": 3.692, + "step": 5271 + }, + { + "epoch": 0.32727046992364517, + "grad_norm": 0.3514001259412198, + "learning_rate": 9.997486530007388e-05, + "loss": 3.649, + "step": 5272 + }, + { + "epoch": 0.3273325470234031, + "grad_norm": 0.24004226850040103, + "learning_rate": 9.997475067041977e-05, + "loss": 3.7121, + "step": 5273 + }, + { + "epoch": 0.32739462412316095, + "grad_norm": 0.252290042086557, + "learning_rate": 9.997463578003527e-05, + "loss": 3.739, + "step": 5274 + }, + { + "epoch": 0.32745670122291887, + "grad_norm": 0.2270121880002732, + "learning_rate": 9.997452062892094e-05, + "loss": 3.7329, + "step": 5275 + }, + { + "epoch": 0.3275187783226768, + "grad_norm": 0.27877758007287295, + "learning_rate": 9.997440521707741e-05, + "loss": 3.6788, + "step": 5276 + }, + { + "epoch": 0.32758085542243465, + "grad_norm": 0.28367446612469815, + "learning_rate": 9.997428954450529e-05, + "loss": 3.672, + "step": 5277 + }, + { + "epoch": 0.32764293252219256, + "grad_norm": 0.2818376574130064, + "learning_rate": 9.997417361120515e-05, + "loss": 3.6683, + "step": 5278 + }, + { + "epoch": 0.3277050096219505, + "grad_norm": 0.24565101337759587, + "learning_rate": 9.997405741717764e-05, + "loss": 3.544, + "step": 5279 + }, + { + "epoch": 0.32776708672170835, + "grad_norm": 0.2471755184496102, + "learning_rate": 9.997394096242332e-05, + "loss": 3.7418, + "step": 5280 + }, + { + "epoch": 0.32782916382146626, + "grad_norm": 0.2795575132874012, + "learning_rate": 9.997382424694282e-05, + "loss": 3.7601, + "step": 5281 + }, + { + "epoch": 0.3278912409212242, + "grad_norm": 0.3401284067045709, + "learning_rate": 9.997370727073676e-05, + "loss": 3.8019, + "step": 5282 + }, + { + "epoch": 0.32795331802098204, + "grad_norm": 0.2837099210439145, + "learning_rate": 9.997359003380574e-05, + "loss": 3.6288, + "step": 5283 + }, + { + "epoch": 0.32801539512073996, + "grad_norm": 0.5101817686933611, + "learning_rate": 9.997347253615035e-05, + "loss": 3.8621, + "step": 5284 + }, + { + "epoch": 0.3280774722204979, + "grad_norm": 0.3831924382546282, + "learning_rate": 9.997335477777123e-05, + "loss": 3.7365, + "step": 5285 + }, + { + "epoch": 0.32813954932025574, + "grad_norm": 0.3949634714981901, + "learning_rate": 9.997323675866898e-05, + "loss": 3.6758, + "step": 5286 + }, + { + "epoch": 0.32820162642001366, + "grad_norm": 0.43836841226527573, + "learning_rate": 9.997311847884423e-05, + "loss": 3.6788, + "step": 5287 + }, + { + "epoch": 0.3282637035197716, + "grad_norm": 0.4233103459166968, + "learning_rate": 9.99729999382976e-05, + "loss": 3.7084, + "step": 5288 + }, + { + "epoch": 0.32832578061952944, + "grad_norm": 0.4108109290137781, + "learning_rate": 9.997288113702968e-05, + "loss": 3.7521, + "step": 5289 + }, + { + "epoch": 0.32838785771928736, + "grad_norm": 0.3774891140928213, + "learning_rate": 9.997276207504112e-05, + "loss": 3.682, + "step": 5290 + }, + { + "epoch": 0.3284499348190453, + "grad_norm": 0.33618393442206634, + "learning_rate": 9.997264275233253e-05, + "loss": 3.7823, + "step": 5291 + }, + { + "epoch": 0.32851201191880314, + "grad_norm": 0.3449092953419533, + "learning_rate": 9.997252316890452e-05, + "loss": 3.6642, + "step": 5292 + }, + { + "epoch": 0.32857408901856106, + "grad_norm": 0.24179318845831946, + "learning_rate": 9.997240332475772e-05, + "loss": 3.5813, + "step": 5293 + }, + { + "epoch": 0.328636166118319, + "grad_norm": 0.3163031657390059, + "learning_rate": 9.997228321989278e-05, + "loss": 3.6693, + "step": 5294 + }, + { + "epoch": 0.32869824321807684, + "grad_norm": 0.2262441272658367, + "learning_rate": 9.99721628543103e-05, + "loss": 3.6892, + "step": 5295 + }, + { + "epoch": 0.32876032031783475, + "grad_norm": 0.26628378908907413, + "learning_rate": 9.997204222801091e-05, + "loss": 3.634, + "step": 5296 + }, + { + "epoch": 0.32882239741759267, + "grad_norm": 0.3489660599076153, + "learning_rate": 9.997192134099526e-05, + "loss": 3.6392, + "step": 5297 + }, + { + "epoch": 0.32888447451735053, + "grad_norm": 0.33155494634597116, + "learning_rate": 9.997180019326395e-05, + "loss": 3.728, + "step": 5298 + }, + { + "epoch": 0.32894655161710845, + "grad_norm": 0.3927388787300665, + "learning_rate": 9.997167878481763e-05, + "loss": 3.5643, + "step": 5299 + }, + { + "epoch": 0.32900862871686637, + "grad_norm": 0.5428970618915577, + "learning_rate": 9.997155711565693e-05, + "loss": 3.7087, + "step": 5300 + }, + { + "epoch": 0.32907070581662423, + "grad_norm": 0.29142159429246933, + "learning_rate": 9.997143518578249e-05, + "loss": 3.659, + "step": 5301 + }, + { + "epoch": 0.32913278291638215, + "grad_norm": 0.29349106323130836, + "learning_rate": 9.997131299519493e-05, + "loss": 3.7589, + "step": 5302 + }, + { + "epoch": 0.32919486001614007, + "grad_norm": 0.2669753224893253, + "learning_rate": 9.997119054389492e-05, + "loss": 3.5971, + "step": 5303 + }, + { + "epoch": 0.32925693711589793, + "grad_norm": 0.2519405420990787, + "learning_rate": 9.997106783188306e-05, + "loss": 3.7532, + "step": 5304 + }, + { + "epoch": 0.32931901421565585, + "grad_norm": 0.26043507351991096, + "learning_rate": 9.997094485916002e-05, + "loss": 3.6639, + "step": 5305 + }, + { + "epoch": 0.32938109131541377, + "grad_norm": 0.3474097932053413, + "learning_rate": 9.997082162572642e-05, + "loss": 3.7134, + "step": 5306 + }, + { + "epoch": 0.3294431684151716, + "grad_norm": 0.21538721186355322, + "learning_rate": 9.997069813158293e-05, + "loss": 3.567, + "step": 5307 + }, + { + "epoch": 0.32950524551492955, + "grad_norm": 0.4346841213059359, + "learning_rate": 9.997057437673016e-05, + "loss": 3.7021, + "step": 5308 + }, + { + "epoch": 0.32956732261468746, + "grad_norm": 0.5470106774008079, + "learning_rate": 9.997045036116876e-05, + "loss": 3.6589, + "step": 5309 + }, + { + "epoch": 0.3296293997144453, + "grad_norm": 0.314091646445687, + "learning_rate": 9.997032608489942e-05, + "loss": 3.6877, + "step": 5310 + }, + { + "epoch": 0.32969147681420324, + "grad_norm": 0.382200686398064, + "learning_rate": 9.997020154792274e-05, + "loss": 3.7564, + "step": 5311 + }, + { + "epoch": 0.32975355391396116, + "grad_norm": 0.23767540471409754, + "learning_rate": 9.997007675023939e-05, + "loss": 3.6595, + "step": 5312 + }, + { + "epoch": 0.329815631013719, + "grad_norm": 0.36298052328182057, + "learning_rate": 9.996995169185003e-05, + "loss": 3.7112, + "step": 5313 + }, + { + "epoch": 0.32987770811347694, + "grad_norm": 0.3438340214434118, + "learning_rate": 9.996982637275529e-05, + "loss": 3.6949, + "step": 5314 + }, + { + "epoch": 0.32993978521323486, + "grad_norm": 0.34523638235882437, + "learning_rate": 9.996970079295584e-05, + "loss": 3.6273, + "step": 5315 + }, + { + "epoch": 0.3300018623129927, + "grad_norm": 0.36784162491938116, + "learning_rate": 9.996957495245232e-05, + "loss": 3.7108, + "step": 5316 + }, + { + "epoch": 0.33006393941275064, + "grad_norm": 0.34852184805736686, + "learning_rate": 9.996944885124541e-05, + "loss": 3.7469, + "step": 5317 + }, + { + "epoch": 0.33012601651250856, + "grad_norm": 0.2560401161639387, + "learning_rate": 9.996932248933575e-05, + "loss": 3.5388, + "step": 5318 + }, + { + "epoch": 0.3301880936122664, + "grad_norm": 0.38992970575881863, + "learning_rate": 9.996919586672401e-05, + "loss": 3.7331, + "step": 5319 + }, + { + "epoch": 0.33025017071202434, + "grad_norm": 0.3184957153819611, + "learning_rate": 9.996906898341084e-05, + "loss": 3.8263, + "step": 5320 + }, + { + "epoch": 0.33031224781178226, + "grad_norm": 0.24246606649582236, + "learning_rate": 9.996894183939691e-05, + "loss": 3.7964, + "step": 5321 + }, + { + "epoch": 0.3303743249115401, + "grad_norm": 0.39994336710800943, + "learning_rate": 9.996881443468286e-05, + "loss": 3.7443, + "step": 5322 + }, + { + "epoch": 0.33043640201129804, + "grad_norm": 0.3026062602016145, + "learning_rate": 9.99686867692694e-05, + "loss": 3.8185, + "step": 5323 + }, + { + "epoch": 0.33049847911105595, + "grad_norm": 0.3334203832428417, + "learning_rate": 9.996855884315716e-05, + "loss": 3.62, + "step": 5324 + }, + { + "epoch": 0.3305605562108138, + "grad_norm": 0.36432238049998267, + "learning_rate": 9.996843065634682e-05, + "loss": 3.6683, + "step": 5325 + }, + { + "epoch": 0.33062263331057173, + "grad_norm": 0.30890651739530517, + "learning_rate": 9.996830220883905e-05, + "loss": 3.6476, + "step": 5326 + }, + { + "epoch": 0.33068471041032965, + "grad_norm": 0.5902509483772489, + "learning_rate": 9.996817350063452e-05, + "loss": 3.6663, + "step": 5327 + }, + { + "epoch": 0.3307467875100875, + "grad_norm": 0.3646703230302897, + "learning_rate": 9.996804453173388e-05, + "loss": 3.6385, + "step": 5328 + }, + { + "epoch": 0.33080886460984543, + "grad_norm": 0.7139668740781212, + "learning_rate": 9.996791530213783e-05, + "loss": 3.7222, + "step": 5329 + }, + { + "epoch": 0.33087094170960335, + "grad_norm": 0.3995404218472046, + "learning_rate": 9.996778581184704e-05, + "loss": 3.6634, + "step": 5330 + }, + { + "epoch": 0.3309330188093612, + "grad_norm": 0.39501557035782653, + "learning_rate": 9.996765606086218e-05, + "loss": 3.658, + "step": 5331 + }, + { + "epoch": 0.33099509590911913, + "grad_norm": 0.4871815619556428, + "learning_rate": 9.996752604918394e-05, + "loss": 3.7075, + "step": 5332 + }, + { + "epoch": 0.33105717300887705, + "grad_norm": 0.4291394466576336, + "learning_rate": 9.996739577681297e-05, + "loss": 3.5931, + "step": 5333 + }, + { + "epoch": 0.3311192501086349, + "grad_norm": 0.47815951116836075, + "learning_rate": 9.996726524374998e-05, + "loss": 3.693, + "step": 5334 + }, + { + "epoch": 0.33118132720839283, + "grad_norm": 0.32626238726342155, + "learning_rate": 9.996713444999564e-05, + "loss": 3.6525, + "step": 5335 + }, + { + "epoch": 0.33124340430815075, + "grad_norm": 0.357933761838267, + "learning_rate": 9.99670033955506e-05, + "loss": 3.689, + "step": 5336 + }, + { + "epoch": 0.3313054814079086, + "grad_norm": 0.4208927729462213, + "learning_rate": 9.99668720804156e-05, + "loss": 3.6984, + "step": 5337 + }, + { + "epoch": 0.3313675585076665, + "grad_norm": 0.2797961274504148, + "learning_rate": 9.996674050459129e-05, + "loss": 3.5504, + "step": 5338 + }, + { + "epoch": 0.33142963560742444, + "grad_norm": 0.3559085933344556, + "learning_rate": 9.996660866807837e-05, + "loss": 3.7353, + "step": 5339 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 0.23973734542517092, + "learning_rate": 9.996647657087753e-05, + "loss": 3.6579, + "step": 5340 + }, + { + "epoch": 0.3315537898069402, + "grad_norm": 0.2995042444869301, + "learning_rate": 9.996634421298942e-05, + "loss": 3.7263, + "step": 5341 + }, + { + "epoch": 0.33161586690669814, + "grad_norm": 0.5349582667874274, + "learning_rate": 9.99662115944148e-05, + "loss": 3.6956, + "step": 5342 + }, + { + "epoch": 0.331677944006456, + "grad_norm": 0.3958386768013862, + "learning_rate": 9.996607871515429e-05, + "loss": 3.7353, + "step": 5343 + }, + { + "epoch": 0.3317400211062139, + "grad_norm": 0.31900354754260135, + "learning_rate": 9.996594557520864e-05, + "loss": 3.6269, + "step": 5344 + }, + { + "epoch": 0.33180209820597184, + "grad_norm": 0.32418679459059163, + "learning_rate": 9.996581217457852e-05, + "loss": 3.8095, + "step": 5345 + }, + { + "epoch": 0.3318641753057297, + "grad_norm": 0.2970402032063688, + "learning_rate": 9.996567851326463e-05, + "loss": 3.6989, + "step": 5346 + }, + { + "epoch": 0.3319262524054876, + "grad_norm": 0.390817937886632, + "learning_rate": 9.996554459126765e-05, + "loss": 3.5652, + "step": 5347 + }, + { + "epoch": 0.33198832950524554, + "grad_norm": 0.7203644574473013, + "learning_rate": 9.996541040858831e-05, + "loss": 3.7094, + "step": 5348 + }, + { + "epoch": 0.3320504066050034, + "grad_norm": 0.4660993975592278, + "learning_rate": 9.99652759652273e-05, + "loss": 3.6478, + "step": 5349 + }, + { + "epoch": 0.3321124837047613, + "grad_norm": 0.28486588470033025, + "learning_rate": 9.996514126118531e-05, + "loss": 3.6587, + "step": 5350 + }, + { + "epoch": 0.33217456080451924, + "grad_norm": 0.3266345870840627, + "learning_rate": 9.996500629646303e-05, + "loss": 3.7217, + "step": 5351 + }, + { + "epoch": 0.3322366379042771, + "grad_norm": 0.3152709065272982, + "learning_rate": 9.996487107106121e-05, + "loss": 3.6686, + "step": 5352 + }, + { + "epoch": 0.332298715004035, + "grad_norm": 0.5724697855509974, + "learning_rate": 9.99647355849805e-05, + "loss": 3.5702, + "step": 5353 + }, + { + "epoch": 0.33236079210379293, + "grad_norm": 0.4295654470319601, + "learning_rate": 9.996459983822165e-05, + "loss": 3.6679, + "step": 5354 + }, + { + "epoch": 0.3324228692035508, + "grad_norm": 0.352281735412025, + "learning_rate": 9.996446383078535e-05, + "loss": 3.6693, + "step": 5355 + }, + { + "epoch": 0.3324849463033087, + "grad_norm": 0.3827829933472491, + "learning_rate": 9.996432756267232e-05, + "loss": 3.7785, + "step": 5356 + }, + { + "epoch": 0.33254702340306663, + "grad_norm": 0.46432677941490474, + "learning_rate": 9.996419103388326e-05, + "loss": 3.6811, + "step": 5357 + }, + { + "epoch": 0.3326091005028245, + "grad_norm": 0.39088816361178996, + "learning_rate": 9.996405424441889e-05, + "loss": 3.6293, + "step": 5358 + }, + { + "epoch": 0.3326711776025824, + "grad_norm": 0.2812769664976059, + "learning_rate": 9.996391719427991e-05, + "loss": 3.6041, + "step": 5359 + }, + { + "epoch": 0.33273325470234033, + "grad_norm": 0.30687825902385335, + "learning_rate": 9.996377988346706e-05, + "loss": 3.6547, + "step": 5360 + }, + { + "epoch": 0.3327953318020982, + "grad_norm": 0.22582569014951515, + "learning_rate": 9.996364231198103e-05, + "loss": 3.5995, + "step": 5361 + }, + { + "epoch": 0.3328574089018561, + "grad_norm": 0.36055398440145237, + "learning_rate": 9.996350447982255e-05, + "loss": 3.5833, + "step": 5362 + }, + { + "epoch": 0.33291948600161403, + "grad_norm": 0.2482647127005922, + "learning_rate": 9.996336638699234e-05, + "loss": 3.6267, + "step": 5363 + }, + { + "epoch": 0.3329815631013719, + "grad_norm": 0.31219015257799615, + "learning_rate": 9.996322803349112e-05, + "loss": 3.6948, + "step": 5364 + }, + { + "epoch": 0.3330436402011298, + "grad_norm": 0.2112629728312308, + "learning_rate": 9.99630894193196e-05, + "loss": 3.7121, + "step": 5365 + }, + { + "epoch": 0.3331057173008877, + "grad_norm": 0.2875862184073258, + "learning_rate": 9.996295054447853e-05, + "loss": 3.6214, + "step": 5366 + }, + { + "epoch": 0.3331677944006456, + "grad_norm": 0.263663058663369, + "learning_rate": 9.996281140896861e-05, + "loss": 3.5835, + "step": 5367 + }, + { + "epoch": 0.3332298715004035, + "grad_norm": 0.2841322030053538, + "learning_rate": 9.996267201279056e-05, + "loss": 3.7193, + "step": 5368 + }, + { + "epoch": 0.3332919486001614, + "grad_norm": 0.2229084042699463, + "learning_rate": 9.996253235594515e-05, + "loss": 3.6265, + "step": 5369 + }, + { + "epoch": 0.3333540256999193, + "grad_norm": 0.3311412889305367, + "learning_rate": 9.996239243843306e-05, + "loss": 3.5111, + "step": 5370 + }, + { + "epoch": 0.3334161027996772, + "grad_norm": 0.30514821652855845, + "learning_rate": 9.996225226025505e-05, + "loss": 3.7361, + "step": 5371 + }, + { + "epoch": 0.3334781798994351, + "grad_norm": 0.28482169291376813, + "learning_rate": 9.996211182141185e-05, + "loss": 3.7396, + "step": 5372 + }, + { + "epoch": 0.333540256999193, + "grad_norm": 0.4024344592014346, + "learning_rate": 9.996197112190415e-05, + "loss": 3.602, + "step": 5373 + }, + { + "epoch": 0.3336023340989509, + "grad_norm": 0.2542258671829329, + "learning_rate": 9.996183016173273e-05, + "loss": 3.7451, + "step": 5374 + }, + { + "epoch": 0.3336644111987088, + "grad_norm": 0.3221487931681705, + "learning_rate": 9.996168894089834e-05, + "loss": 3.7032, + "step": 5375 + }, + { + "epoch": 0.3337264882984667, + "grad_norm": 0.2569014317077264, + "learning_rate": 9.996154745940168e-05, + "loss": 3.6545, + "step": 5376 + }, + { + "epoch": 0.3337885653982246, + "grad_norm": 0.3120060951549953, + "learning_rate": 9.996140571724347e-05, + "loss": 3.6875, + "step": 5377 + }, + { + "epoch": 0.3338506424979825, + "grad_norm": 0.413306589918166, + "learning_rate": 9.99612637144245e-05, + "loss": 3.6346, + "step": 5378 + }, + { + "epoch": 0.3339127195977404, + "grad_norm": 0.2605153643814364, + "learning_rate": 9.996112145094549e-05, + "loss": 3.693, + "step": 5379 + }, + { + "epoch": 0.3339747966974983, + "grad_norm": 0.40714511925981944, + "learning_rate": 9.996097892680718e-05, + "loss": 3.6766, + "step": 5380 + }, + { + "epoch": 0.3340368737972562, + "grad_norm": 0.27984123267186906, + "learning_rate": 9.996083614201031e-05, + "loss": 3.5989, + "step": 5381 + }, + { + "epoch": 0.3340989508970141, + "grad_norm": 0.25571832336050815, + "learning_rate": 9.996069309655562e-05, + "loss": 3.708, + "step": 5382 + }, + { + "epoch": 0.334161027996772, + "grad_norm": 0.2627747851558791, + "learning_rate": 9.99605497904439e-05, + "loss": 3.6126, + "step": 5383 + }, + { + "epoch": 0.3342231050965299, + "grad_norm": 0.3026282585958125, + "learning_rate": 9.996040622367582e-05, + "loss": 3.696, + "step": 5384 + }, + { + "epoch": 0.3342851821962878, + "grad_norm": 0.30254509453766015, + "learning_rate": 9.99602623962522e-05, + "loss": 3.6331, + "step": 5385 + }, + { + "epoch": 0.3343472592960457, + "grad_norm": 0.32727057610994487, + "learning_rate": 9.996011830817376e-05, + "loss": 3.6343, + "step": 5386 + }, + { + "epoch": 0.3344093363958036, + "grad_norm": 0.2909960900220993, + "learning_rate": 9.995997395944125e-05, + "loss": 3.6122, + "step": 5387 + }, + { + "epoch": 0.3344714134955615, + "grad_norm": 0.2896247627733137, + "learning_rate": 9.995982935005542e-05, + "loss": 3.6096, + "step": 5388 + }, + { + "epoch": 0.3345334905953194, + "grad_norm": 0.27143519113829223, + "learning_rate": 9.995968448001704e-05, + "loss": 3.6609, + "step": 5389 + }, + { + "epoch": 0.3345955676950773, + "grad_norm": 0.3827943019044742, + "learning_rate": 9.995953934932686e-05, + "loss": 3.6313, + "step": 5390 + }, + { + "epoch": 0.3346576447948352, + "grad_norm": 0.24722803682878255, + "learning_rate": 9.995939395798564e-05, + "loss": 3.6473, + "step": 5391 + }, + { + "epoch": 0.3347197218945931, + "grad_norm": 0.42569178770037314, + "learning_rate": 9.995924830599413e-05, + "loss": 3.6229, + "step": 5392 + }, + { + "epoch": 0.334781798994351, + "grad_norm": 0.439195090938755, + "learning_rate": 9.995910239335308e-05, + "loss": 3.6272, + "step": 5393 + }, + { + "epoch": 0.33484387609410887, + "grad_norm": 0.2617530515596735, + "learning_rate": 9.995895622006328e-05, + "loss": 3.6681, + "step": 5394 + }, + { + "epoch": 0.3349059531938668, + "grad_norm": 0.37296370133305795, + "learning_rate": 9.995880978612546e-05, + "loss": 3.6531, + "step": 5395 + }, + { + "epoch": 0.3349680302936247, + "grad_norm": 0.27976016320780284, + "learning_rate": 9.995866309154041e-05, + "loss": 3.8338, + "step": 5396 + }, + { + "epoch": 0.33503010739338257, + "grad_norm": 0.32850314104125866, + "learning_rate": 9.99585161363089e-05, + "loss": 3.699, + "step": 5397 + }, + { + "epoch": 0.3350921844931405, + "grad_norm": 0.36204698553944253, + "learning_rate": 9.995836892043166e-05, + "loss": 3.6761, + "step": 5398 + }, + { + "epoch": 0.3351542615928984, + "grad_norm": 0.28940344369967536, + "learning_rate": 9.995822144390952e-05, + "loss": 3.637, + "step": 5399 + }, + { + "epoch": 0.33521633869265627, + "grad_norm": 0.33029760027567195, + "learning_rate": 9.995807370674318e-05, + "loss": 3.5766, + "step": 5400 + }, + { + "epoch": 0.3352784157924142, + "grad_norm": 0.2808328289702855, + "learning_rate": 9.995792570893344e-05, + "loss": 3.6461, + "step": 5401 + }, + { + "epoch": 0.3353404928921721, + "grad_norm": 0.27366635418549595, + "learning_rate": 9.995777745048109e-05, + "loss": 3.5432, + "step": 5402 + }, + { + "epoch": 0.33540256999192997, + "grad_norm": 0.3403167392792442, + "learning_rate": 9.995762893138687e-05, + "loss": 3.7503, + "step": 5403 + }, + { + "epoch": 0.3354646470916879, + "grad_norm": 0.3235766475473739, + "learning_rate": 9.995748015165158e-05, + "loss": 3.6505, + "step": 5404 + }, + { + "epoch": 0.3355267241914458, + "grad_norm": 0.38734529775761917, + "learning_rate": 9.9957331111276e-05, + "loss": 3.6331, + "step": 5405 + }, + { + "epoch": 0.33558880129120366, + "grad_norm": 0.43541698486219155, + "learning_rate": 9.995718181026088e-05, + "loss": 3.5678, + "step": 5406 + }, + { + "epoch": 0.3356508783909616, + "grad_norm": 0.3965150605151968, + "learning_rate": 9.995703224860702e-05, + "loss": 3.6884, + "step": 5407 + }, + { + "epoch": 0.3357129554907195, + "grad_norm": 0.3234725891650528, + "learning_rate": 9.99568824263152e-05, + "loss": 3.7553, + "step": 5408 + }, + { + "epoch": 0.33577503259047736, + "grad_norm": 0.5108822107293873, + "learning_rate": 9.99567323433862e-05, + "loss": 3.7256, + "step": 5409 + }, + { + "epoch": 0.3358371096902353, + "grad_norm": 0.4723672219570059, + "learning_rate": 9.995658199982079e-05, + "loss": 3.6833, + "step": 5410 + }, + { + "epoch": 0.3358991867899932, + "grad_norm": 0.348864715980879, + "learning_rate": 9.995643139561976e-05, + "loss": 3.6795, + "step": 5411 + }, + { + "epoch": 0.33596126388975106, + "grad_norm": 0.5811495663411934, + "learning_rate": 9.995628053078392e-05, + "loss": 3.5881, + "step": 5412 + }, + { + "epoch": 0.336023340989509, + "grad_norm": 0.27516932422308477, + "learning_rate": 9.995612940531401e-05, + "loss": 3.6918, + "step": 5413 + }, + { + "epoch": 0.3360854180892669, + "grad_norm": 0.3673520835343332, + "learning_rate": 9.995597801921087e-05, + "loss": 3.6928, + "step": 5414 + }, + { + "epoch": 0.33614749518902476, + "grad_norm": 0.3278268526751757, + "learning_rate": 9.995582637247525e-05, + "loss": 3.5322, + "step": 5415 + }, + { + "epoch": 0.3362095722887827, + "grad_norm": 0.9221486800686283, + "learning_rate": 9.995567446510796e-05, + "loss": 3.6502, + "step": 5416 + }, + { + "epoch": 0.3362716493885406, + "grad_norm": 0.5866332691568159, + "learning_rate": 9.99555222971098e-05, + "loss": 3.7168, + "step": 5417 + }, + { + "epoch": 0.33633372648829846, + "grad_norm": 0.42999246352266984, + "learning_rate": 9.995536986848155e-05, + "loss": 3.7032, + "step": 5418 + }, + { + "epoch": 0.3363958035880564, + "grad_norm": 0.4186913136025118, + "learning_rate": 9.995521717922399e-05, + "loss": 3.6951, + "step": 5419 + }, + { + "epoch": 0.3364578806878143, + "grad_norm": 0.5506886343497834, + "learning_rate": 9.995506422933795e-05, + "loss": 3.6476, + "step": 5420 + }, + { + "epoch": 0.33651995778757215, + "grad_norm": 0.5257482906627575, + "learning_rate": 9.995491101882422e-05, + "loss": 3.6322, + "step": 5421 + }, + { + "epoch": 0.3365820348873301, + "grad_norm": 0.30645368429677616, + "learning_rate": 9.995475754768357e-05, + "loss": 3.6581, + "step": 5422 + }, + { + "epoch": 0.336644111987088, + "grad_norm": 0.3089793247046478, + "learning_rate": 9.995460381591683e-05, + "loss": 3.632, + "step": 5423 + }, + { + "epoch": 0.33670618908684585, + "grad_norm": 0.3961696785226699, + "learning_rate": 9.995444982352482e-05, + "loss": 3.6975, + "step": 5424 + }, + { + "epoch": 0.33676826618660377, + "grad_norm": 0.3674609201114359, + "learning_rate": 9.99542955705083e-05, + "loss": 3.6405, + "step": 5425 + }, + { + "epoch": 0.3368303432863617, + "grad_norm": 0.3768453691072745, + "learning_rate": 9.995414105686809e-05, + "loss": 3.6897, + "step": 5426 + }, + { + "epoch": 0.33689242038611955, + "grad_norm": 0.24050440769905054, + "learning_rate": 9.9953986282605e-05, + "loss": 3.6761, + "step": 5427 + }, + { + "epoch": 0.33695449748587747, + "grad_norm": 0.38763544499613783, + "learning_rate": 9.995383124771984e-05, + "loss": 3.6326, + "step": 5428 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 0.41793174223301593, + "learning_rate": 9.995367595221342e-05, + "loss": 3.6973, + "step": 5429 + }, + { + "epoch": 0.33707865168539325, + "grad_norm": 0.4297010939788564, + "learning_rate": 9.995352039608653e-05, + "loss": 3.665, + "step": 5430 + }, + { + "epoch": 0.33714072878515117, + "grad_norm": 0.3597725866516063, + "learning_rate": 9.995336457934001e-05, + "loss": 3.692, + "step": 5431 + }, + { + "epoch": 0.3372028058849091, + "grad_norm": 0.21793982645460827, + "learning_rate": 9.995320850197465e-05, + "loss": 3.6646, + "step": 5432 + }, + { + "epoch": 0.33726488298466695, + "grad_norm": 0.45012235567502107, + "learning_rate": 9.995305216399129e-05, + "loss": 3.7246, + "step": 5433 + }, + { + "epoch": 0.33732696008442487, + "grad_norm": 0.24506002876759692, + "learning_rate": 9.995289556539072e-05, + "loss": 3.6895, + "step": 5434 + }, + { + "epoch": 0.3373890371841827, + "grad_norm": 0.3151772732169548, + "learning_rate": 9.995273870617377e-05, + "loss": 3.6435, + "step": 5435 + }, + { + "epoch": 0.33745111428394065, + "grad_norm": 0.2586324109683761, + "learning_rate": 9.995258158634127e-05, + "loss": 3.6748, + "step": 5436 + }, + { + "epoch": 0.33751319138369856, + "grad_norm": 0.28633381420275716, + "learning_rate": 9.995242420589398e-05, + "loss": 3.6237, + "step": 5437 + }, + { + "epoch": 0.3375752684834564, + "grad_norm": 0.3069620209703647, + "learning_rate": 9.995226656483282e-05, + "loss": 3.6513, + "step": 5438 + }, + { + "epoch": 0.33763734558321434, + "grad_norm": 0.25405518491942586, + "learning_rate": 9.995210866315854e-05, + "loss": 3.6165, + "step": 5439 + }, + { + "epoch": 0.33769942268297226, + "grad_norm": 0.3372453581071323, + "learning_rate": 9.995195050087197e-05, + "loss": 3.7097, + "step": 5440 + }, + { + "epoch": 0.3377614997827301, + "grad_norm": 0.2769221876375426, + "learning_rate": 9.995179207797396e-05, + "loss": 3.603, + "step": 5441 + }, + { + "epoch": 0.33782357688248804, + "grad_norm": 0.35013681190120505, + "learning_rate": 9.995163339446531e-05, + "loss": 3.6602, + "step": 5442 + }, + { + "epoch": 0.33788565398224596, + "grad_norm": 0.3025509555515663, + "learning_rate": 9.99514744503469e-05, + "loss": 3.528, + "step": 5443 + }, + { + "epoch": 0.3379477310820038, + "grad_norm": 0.43769174869691696, + "learning_rate": 9.995131524561949e-05, + "loss": 3.6098, + "step": 5444 + }, + { + "epoch": 0.33800980818176174, + "grad_norm": 0.3614815875808097, + "learning_rate": 9.995115578028394e-05, + "loss": 3.6766, + "step": 5445 + }, + { + "epoch": 0.33807188528151966, + "grad_norm": 0.32594239848434325, + "learning_rate": 9.995099605434109e-05, + "loss": 3.6416, + "step": 5446 + }, + { + "epoch": 0.3381339623812775, + "grad_norm": 0.4309732966116475, + "learning_rate": 9.995083606779177e-05, + "loss": 3.7256, + "step": 5447 + }, + { + "epoch": 0.33819603948103544, + "grad_norm": 0.39736515550037793, + "learning_rate": 9.995067582063682e-05, + "loss": 3.6011, + "step": 5448 + }, + { + "epoch": 0.33825811658079336, + "grad_norm": 0.28278548917818896, + "learning_rate": 9.995051531287706e-05, + "loss": 3.5214, + "step": 5449 + }, + { + "epoch": 0.3383201936805512, + "grad_norm": 0.3364873976402307, + "learning_rate": 9.995035454451334e-05, + "loss": 3.6896, + "step": 5450 + }, + { + "epoch": 0.33838227078030914, + "grad_norm": 0.4535298840550757, + "learning_rate": 9.995019351554649e-05, + "loss": 3.5307, + "step": 5451 + }, + { + "epoch": 0.33844434788006705, + "grad_norm": 0.3899436730888085, + "learning_rate": 9.995003222597736e-05, + "loss": 3.5676, + "step": 5452 + }, + { + "epoch": 0.3385064249798249, + "grad_norm": 0.4623323123673541, + "learning_rate": 9.994987067580678e-05, + "loss": 3.6955, + "step": 5453 + }, + { + "epoch": 0.33856850207958283, + "grad_norm": 0.2836616618703531, + "learning_rate": 9.99497088650356e-05, + "loss": 3.5954, + "step": 5454 + }, + { + "epoch": 0.33863057917934075, + "grad_norm": 0.35433979804848476, + "learning_rate": 9.994954679366466e-05, + "loss": 3.6756, + "step": 5455 + }, + { + "epoch": 0.3386926562790986, + "grad_norm": 0.2956402473671084, + "learning_rate": 9.994938446169482e-05, + "loss": 3.6663, + "step": 5456 + }, + { + "epoch": 0.33875473337885653, + "grad_norm": 0.27130299279059356, + "learning_rate": 9.99492218691269e-05, + "loss": 3.5517, + "step": 5457 + }, + { + "epoch": 0.33881681047861445, + "grad_norm": 0.3884013398971389, + "learning_rate": 9.994905901596178e-05, + "loss": 3.7418, + "step": 5458 + }, + { + "epoch": 0.3388788875783723, + "grad_norm": 0.3869141364446547, + "learning_rate": 9.994889590220029e-05, + "loss": 3.6088, + "step": 5459 + }, + { + "epoch": 0.33894096467813023, + "grad_norm": 0.31539498719947007, + "learning_rate": 9.994873252784327e-05, + "loss": 3.7239, + "step": 5460 + }, + { + "epoch": 0.33900304177788815, + "grad_norm": 0.30722917578689796, + "learning_rate": 9.994856889289159e-05, + "loss": 3.8569, + "step": 5461 + }, + { + "epoch": 0.339065118877646, + "grad_norm": 0.420635893823768, + "learning_rate": 9.994840499734612e-05, + "loss": 3.6267, + "step": 5462 + }, + { + "epoch": 0.33912719597740393, + "grad_norm": 0.27501256166291593, + "learning_rate": 9.994824084120768e-05, + "loss": 3.629, + "step": 5463 + }, + { + "epoch": 0.33918927307716185, + "grad_norm": 0.31923319245696774, + "learning_rate": 9.994807642447714e-05, + "loss": 3.7148, + "step": 5464 + }, + { + "epoch": 0.3392513501769197, + "grad_norm": 0.324920091066923, + "learning_rate": 9.994791174715536e-05, + "loss": 3.548, + "step": 5465 + }, + { + "epoch": 0.3393134272766776, + "grad_norm": 0.395467591300964, + "learning_rate": 9.994774680924321e-05, + "loss": 3.6243, + "step": 5466 + }, + { + "epoch": 0.33937550437643554, + "grad_norm": 0.24865873970241775, + "learning_rate": 9.994758161074154e-05, + "loss": 3.5998, + "step": 5467 + }, + { + "epoch": 0.3394375814761934, + "grad_norm": 0.3337025485077762, + "learning_rate": 9.994741615165121e-05, + "loss": 3.6919, + "step": 5468 + }, + { + "epoch": 0.3394996585759513, + "grad_norm": 0.26734247063531874, + "learning_rate": 9.994725043197309e-05, + "loss": 3.6532, + "step": 5469 + }, + { + "epoch": 0.33956173567570924, + "grad_norm": 0.2820791499707161, + "learning_rate": 9.994708445170802e-05, + "loss": 3.6786, + "step": 5470 + }, + { + "epoch": 0.3396238127754671, + "grad_norm": 0.26700155413368193, + "learning_rate": 9.994691821085688e-05, + "loss": 3.6383, + "step": 5471 + }, + { + "epoch": 0.339685889875225, + "grad_norm": 0.3127122158459276, + "learning_rate": 9.994675170942058e-05, + "loss": 3.545, + "step": 5472 + }, + { + "epoch": 0.33974796697498294, + "grad_norm": 0.22670035443383432, + "learning_rate": 9.994658494739992e-05, + "loss": 3.6183, + "step": 5473 + }, + { + "epoch": 0.3398100440747408, + "grad_norm": 0.24937275340566925, + "learning_rate": 9.994641792479581e-05, + "loss": 3.6342, + "step": 5474 + }, + { + "epoch": 0.3398721211744987, + "grad_norm": 0.26739722771027785, + "learning_rate": 9.994625064160912e-05, + "loss": 3.6443, + "step": 5475 + }, + { + "epoch": 0.33993419827425664, + "grad_norm": 0.21130878832450503, + "learning_rate": 9.994608309784071e-05, + "loss": 3.6107, + "step": 5476 + }, + { + "epoch": 0.3399962753740145, + "grad_norm": 0.2114960463410219, + "learning_rate": 9.994591529349145e-05, + "loss": 3.4737, + "step": 5477 + }, + { + "epoch": 0.3400583524737724, + "grad_norm": 0.2762472478807163, + "learning_rate": 9.994574722856223e-05, + "loss": 3.7002, + "step": 5478 + }, + { + "epoch": 0.34012042957353034, + "grad_norm": 0.2160184994552886, + "learning_rate": 9.994557890305393e-05, + "loss": 3.6271, + "step": 5479 + }, + { + "epoch": 0.3401825066732882, + "grad_norm": 0.2524218671604487, + "learning_rate": 9.994541031696742e-05, + "loss": 3.6105, + "step": 5480 + }, + { + "epoch": 0.3402445837730461, + "grad_norm": 0.2345503310206299, + "learning_rate": 9.994524147030357e-05, + "loss": 3.7473, + "step": 5481 + }, + { + "epoch": 0.34030666087280403, + "grad_norm": 0.3011593733195398, + "learning_rate": 9.994507236306327e-05, + "loss": 3.632, + "step": 5482 + }, + { + "epoch": 0.3403687379725619, + "grad_norm": 0.2204443165239203, + "learning_rate": 9.994490299524741e-05, + "loss": 3.6587, + "step": 5483 + }, + { + "epoch": 0.3404308150723198, + "grad_norm": 0.32922473040246797, + "learning_rate": 9.994473336685688e-05, + "loss": 3.5912, + "step": 5484 + }, + { + "epoch": 0.34049289217207773, + "grad_norm": 0.2170188643626889, + "learning_rate": 9.994456347789254e-05, + "loss": 3.6594, + "step": 5485 + }, + { + "epoch": 0.3405549692718356, + "grad_norm": 0.2690405733033035, + "learning_rate": 9.994439332835528e-05, + "loss": 3.6429, + "step": 5486 + }, + { + "epoch": 0.3406170463715935, + "grad_norm": 0.2753568588009398, + "learning_rate": 9.9944222918246e-05, + "loss": 3.6282, + "step": 5487 + }, + { + "epoch": 0.34067912347135143, + "grad_norm": 0.23687600241275347, + "learning_rate": 9.994405224756557e-05, + "loss": 3.5615, + "step": 5488 + }, + { + "epoch": 0.3407412005711093, + "grad_norm": 0.18986272285378528, + "learning_rate": 9.994388131631492e-05, + "loss": 3.6468, + "step": 5489 + }, + { + "epoch": 0.3408032776708672, + "grad_norm": 0.25937616221615645, + "learning_rate": 9.994371012449492e-05, + "loss": 3.5847, + "step": 5490 + }, + { + "epoch": 0.34086535477062513, + "grad_norm": 0.20784704057563258, + "learning_rate": 9.994353867210643e-05, + "loss": 3.675, + "step": 5491 + }, + { + "epoch": 0.340927431870383, + "grad_norm": 0.3225880714780687, + "learning_rate": 9.99433669591504e-05, + "loss": 3.6801, + "step": 5492 + }, + { + "epoch": 0.3409895089701409, + "grad_norm": 0.2839396272269353, + "learning_rate": 9.99431949856277e-05, + "loss": 3.6538, + "step": 5493 + }, + { + "epoch": 0.3410515860698988, + "grad_norm": 0.3942571575492333, + "learning_rate": 9.994302275153922e-05, + "loss": 3.6535, + "step": 5494 + }, + { + "epoch": 0.3411136631696567, + "grad_norm": 0.38798429543257507, + "learning_rate": 9.994285025688585e-05, + "loss": 3.6322, + "step": 5495 + }, + { + "epoch": 0.3411757402694146, + "grad_norm": 0.2914542790122508, + "learning_rate": 9.994267750166854e-05, + "loss": 3.546, + "step": 5496 + }, + { + "epoch": 0.3412378173691725, + "grad_norm": 0.43331290076146617, + "learning_rate": 9.994250448588814e-05, + "loss": 3.5801, + "step": 5497 + }, + { + "epoch": 0.3412998944689304, + "grad_norm": 0.37110828571963583, + "learning_rate": 9.994233120954558e-05, + "loss": 3.6903, + "step": 5498 + }, + { + "epoch": 0.3413619715686883, + "grad_norm": 0.2750979146255401, + "learning_rate": 9.994215767264172e-05, + "loss": 3.6556, + "step": 5499 + }, + { + "epoch": 0.3414240486684462, + "grad_norm": 0.33155653059221135, + "learning_rate": 9.994198387517752e-05, + "loss": 3.7529, + "step": 5500 + }, + { + "epoch": 0.3414861257682041, + "grad_norm": 0.3835806826912139, + "learning_rate": 9.994180981715388e-05, + "loss": 3.6588, + "step": 5501 + }, + { + "epoch": 0.341548202867962, + "grad_norm": 0.4541814098094195, + "learning_rate": 9.994163549857168e-05, + "loss": 3.6007, + "step": 5502 + }, + { + "epoch": 0.3416102799677199, + "grad_norm": 0.2874488121572182, + "learning_rate": 9.994146091943186e-05, + "loss": 3.6751, + "step": 5503 + }, + { + "epoch": 0.3416723570674778, + "grad_norm": 0.40731916586815514, + "learning_rate": 9.99412860797353e-05, + "loss": 3.6176, + "step": 5504 + }, + { + "epoch": 0.3417344341672357, + "grad_norm": 0.5068813002320643, + "learning_rate": 9.994111097948294e-05, + "loss": 3.6144, + "step": 5505 + }, + { + "epoch": 0.3417965112669936, + "grad_norm": 0.24225401733992297, + "learning_rate": 9.994093561867566e-05, + "loss": 3.6563, + "step": 5506 + }, + { + "epoch": 0.3418585883667515, + "grad_norm": 0.35203695071667573, + "learning_rate": 9.99407599973144e-05, + "loss": 3.6642, + "step": 5507 + }, + { + "epoch": 0.3419206654665094, + "grad_norm": 0.35604811184702145, + "learning_rate": 9.994058411540009e-05, + "loss": 3.6194, + "step": 5508 + }, + { + "epoch": 0.3419827425662673, + "grad_norm": 0.3087108848481952, + "learning_rate": 9.99404079729336e-05, + "loss": 3.6307, + "step": 5509 + }, + { + "epoch": 0.3420448196660252, + "grad_norm": 0.3037211302981815, + "learning_rate": 9.99402315699159e-05, + "loss": 3.7111, + "step": 5510 + }, + { + "epoch": 0.3421068967657831, + "grad_norm": 0.25775144316420034, + "learning_rate": 9.994005490634788e-05, + "loss": 3.6451, + "step": 5511 + }, + { + "epoch": 0.342168973865541, + "grad_norm": 0.2610096363475589, + "learning_rate": 9.993987798223048e-05, + "loss": 3.6011, + "step": 5512 + }, + { + "epoch": 0.3422310509652989, + "grad_norm": 0.2873077226448652, + "learning_rate": 9.993970079756461e-05, + "loss": 3.6358, + "step": 5513 + }, + { + "epoch": 0.3422931280650568, + "grad_norm": 0.30114320551753737, + "learning_rate": 9.99395233523512e-05, + "loss": 3.624, + "step": 5514 + }, + { + "epoch": 0.3423552051648147, + "grad_norm": 0.22681345688152107, + "learning_rate": 9.993934564659115e-05, + "loss": 3.6993, + "step": 5515 + }, + { + "epoch": 0.3424172822645726, + "grad_norm": 0.26591090635292763, + "learning_rate": 9.993916768028543e-05, + "loss": 3.6349, + "step": 5516 + }, + { + "epoch": 0.3424793593643305, + "grad_norm": 0.28167661788721593, + "learning_rate": 9.993898945343495e-05, + "loss": 3.6333, + "step": 5517 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 0.24682799054691704, + "learning_rate": 9.993881096604064e-05, + "loss": 3.5453, + "step": 5518 + }, + { + "epoch": 0.3426035135638463, + "grad_norm": 0.26239318207000417, + "learning_rate": 9.993863221810342e-05, + "loss": 3.6843, + "step": 5519 + }, + { + "epoch": 0.3426655906636042, + "grad_norm": 0.28799420205462456, + "learning_rate": 9.993845320962424e-05, + "loss": 3.6347, + "step": 5520 + }, + { + "epoch": 0.3427276677633621, + "grad_norm": 0.2365182710645607, + "learning_rate": 9.993827394060402e-05, + "loss": 3.7225, + "step": 5521 + }, + { + "epoch": 0.34278974486311997, + "grad_norm": 0.28126952315492676, + "learning_rate": 9.993809441104371e-05, + "loss": 3.6406, + "step": 5522 + }, + { + "epoch": 0.3428518219628779, + "grad_norm": 0.2769388144041571, + "learning_rate": 9.993791462094424e-05, + "loss": 3.6058, + "step": 5523 + }, + { + "epoch": 0.3429138990626358, + "grad_norm": 0.22746264111658657, + "learning_rate": 9.993773457030653e-05, + "loss": 3.5313, + "step": 5524 + }, + { + "epoch": 0.34297597616239367, + "grad_norm": 0.2808730460329426, + "learning_rate": 9.993755425913155e-05, + "loss": 3.6174, + "step": 5525 + }, + { + "epoch": 0.3430380532621516, + "grad_norm": 0.2805802264529198, + "learning_rate": 9.993737368742023e-05, + "loss": 3.7035, + "step": 5526 + }, + { + "epoch": 0.3431001303619095, + "grad_norm": 0.26942419607455437, + "learning_rate": 9.993719285517351e-05, + "loss": 3.656, + "step": 5527 + }, + { + "epoch": 0.34316220746166737, + "grad_norm": 0.19432743571485003, + "learning_rate": 9.993701176239232e-05, + "loss": 3.6366, + "step": 5528 + }, + { + "epoch": 0.3432242845614253, + "grad_norm": 0.24949143508312702, + "learning_rate": 9.993683040907762e-05, + "loss": 3.6768, + "step": 5529 + }, + { + "epoch": 0.3432863616611832, + "grad_norm": 0.31127418663467665, + "learning_rate": 9.993664879523036e-05, + "loss": 3.6861, + "step": 5530 + }, + { + "epoch": 0.34334843876094107, + "grad_norm": 0.2592672859726769, + "learning_rate": 9.993646692085148e-05, + "loss": 3.6439, + "step": 5531 + }, + { + "epoch": 0.343410515860699, + "grad_norm": 0.2957021357974561, + "learning_rate": 9.993628478594193e-05, + "loss": 3.6674, + "step": 5532 + }, + { + "epoch": 0.3434725929604569, + "grad_norm": 0.28300573112612254, + "learning_rate": 9.993610239050265e-05, + "loss": 3.6374, + "step": 5533 + }, + { + "epoch": 0.34353467006021476, + "grad_norm": 0.29880958699089294, + "learning_rate": 9.99359197345346e-05, + "loss": 3.547, + "step": 5534 + }, + { + "epoch": 0.3435967471599727, + "grad_norm": 0.3566391718188258, + "learning_rate": 9.993573681803875e-05, + "loss": 3.6016, + "step": 5535 + }, + { + "epoch": 0.3436588242597306, + "grad_norm": 0.3728329309651184, + "learning_rate": 9.993555364101603e-05, + "loss": 3.4928, + "step": 5536 + }, + { + "epoch": 0.34372090135948846, + "grad_norm": 0.29170128355773695, + "learning_rate": 9.993537020346741e-05, + "loss": 3.6246, + "step": 5537 + }, + { + "epoch": 0.3437829784592464, + "grad_norm": 0.3148150168245006, + "learning_rate": 9.993518650539384e-05, + "loss": 3.6135, + "step": 5538 + }, + { + "epoch": 0.3438450555590043, + "grad_norm": 0.26410293664505646, + "learning_rate": 9.993500254679627e-05, + "loss": 3.7006, + "step": 5539 + }, + { + "epoch": 0.34390713265876216, + "grad_norm": 0.2136263601863587, + "learning_rate": 9.993481832767568e-05, + "loss": 3.5577, + "step": 5540 + }, + { + "epoch": 0.3439692097585201, + "grad_norm": 0.2665844866415748, + "learning_rate": 9.993463384803302e-05, + "loss": 3.5953, + "step": 5541 + }, + { + "epoch": 0.344031286858278, + "grad_norm": 0.37252447489524876, + "learning_rate": 9.993444910786924e-05, + "loss": 3.6142, + "step": 5542 + }, + { + "epoch": 0.34409336395803586, + "grad_norm": 0.3361530721212254, + "learning_rate": 9.993426410718532e-05, + "loss": 3.7246, + "step": 5543 + }, + { + "epoch": 0.3441554410577938, + "grad_norm": 0.3522019187527845, + "learning_rate": 9.993407884598223e-05, + "loss": 3.6028, + "step": 5544 + }, + { + "epoch": 0.3442175181575517, + "grad_norm": 0.2749927026690782, + "learning_rate": 9.993389332426093e-05, + "loss": 3.622, + "step": 5545 + }, + { + "epoch": 0.34427959525730956, + "grad_norm": 0.3036912746135984, + "learning_rate": 9.993370754202239e-05, + "loss": 3.7484, + "step": 5546 + }, + { + "epoch": 0.3443416723570675, + "grad_norm": 0.3586707263181153, + "learning_rate": 9.993352149926756e-05, + "loss": 3.5556, + "step": 5547 + }, + { + "epoch": 0.3444037494568254, + "grad_norm": 0.34264402853918235, + "learning_rate": 9.993333519599743e-05, + "loss": 3.6264, + "step": 5548 + }, + { + "epoch": 0.34446582655658325, + "grad_norm": 0.41235412995115944, + "learning_rate": 9.993314863221299e-05, + "loss": 3.67, + "step": 5549 + }, + { + "epoch": 0.3445279036563412, + "grad_norm": 0.38373534236711315, + "learning_rate": 9.993296180791518e-05, + "loss": 3.7621, + "step": 5550 + }, + { + "epoch": 0.3445899807560991, + "grad_norm": 0.3365779582363619, + "learning_rate": 9.993277472310498e-05, + "loss": 3.695, + "step": 5551 + }, + { + "epoch": 0.34465205785585695, + "grad_norm": 0.31059387139324623, + "learning_rate": 9.993258737778337e-05, + "loss": 3.7426, + "step": 5552 + }, + { + "epoch": 0.34471413495561487, + "grad_norm": 0.35410159111548856, + "learning_rate": 9.993239977195135e-05, + "loss": 3.7283, + "step": 5553 + }, + { + "epoch": 0.3447762120553728, + "grad_norm": 0.3522266789757988, + "learning_rate": 9.993221190560986e-05, + "loss": 3.7585, + "step": 5554 + }, + { + "epoch": 0.34483828915513065, + "grad_norm": 0.2629858912810168, + "learning_rate": 9.99320237787599e-05, + "loss": 3.6786, + "step": 5555 + }, + { + "epoch": 0.34490036625488857, + "grad_norm": 0.3413447195861954, + "learning_rate": 9.993183539140245e-05, + "loss": 3.6943, + "step": 5556 + }, + { + "epoch": 0.3449624433546465, + "grad_norm": 0.3127386969659127, + "learning_rate": 9.99316467435385e-05, + "loss": 3.6266, + "step": 5557 + }, + { + "epoch": 0.34502452045440435, + "grad_norm": 0.32577113128975055, + "learning_rate": 9.993145783516903e-05, + "loss": 3.6564, + "step": 5558 + }, + { + "epoch": 0.34508659755416227, + "grad_norm": 0.2515988483587572, + "learning_rate": 9.993126866629502e-05, + "loss": 3.6697, + "step": 5559 + }, + { + "epoch": 0.3451486746539202, + "grad_norm": 0.4524303570787032, + "learning_rate": 9.993107923691746e-05, + "loss": 3.546, + "step": 5560 + }, + { + "epoch": 0.34521075175367805, + "grad_norm": 0.2641314986332593, + "learning_rate": 9.993088954703734e-05, + "loss": 3.6984, + "step": 5561 + }, + { + "epoch": 0.34527282885343596, + "grad_norm": 0.3520502204202743, + "learning_rate": 9.993069959665565e-05, + "loss": 3.5828, + "step": 5562 + }, + { + "epoch": 0.3453349059531939, + "grad_norm": 0.23853878268673775, + "learning_rate": 9.993050938577338e-05, + "loss": 3.5555, + "step": 5563 + }, + { + "epoch": 0.34539698305295174, + "grad_norm": 0.2644248383803485, + "learning_rate": 9.99303189143915e-05, + "loss": 3.5974, + "step": 5564 + }, + { + "epoch": 0.34545906015270966, + "grad_norm": 0.28105392508970006, + "learning_rate": 9.993012818251105e-05, + "loss": 3.6476, + "step": 5565 + }, + { + "epoch": 0.3455211372524676, + "grad_norm": 0.22437105389209167, + "learning_rate": 9.992993719013299e-05, + "loss": 3.709, + "step": 5566 + }, + { + "epoch": 0.34558321435222544, + "grad_norm": 0.28615124743968073, + "learning_rate": 9.992974593725832e-05, + "loss": 3.6146, + "step": 5567 + }, + { + "epoch": 0.34564529145198336, + "grad_norm": 0.38437254912672764, + "learning_rate": 9.992955442388806e-05, + "loss": 3.6438, + "step": 5568 + }, + { + "epoch": 0.3457073685517413, + "grad_norm": 0.31550954982840795, + "learning_rate": 9.992936265002318e-05, + "loss": 3.7009, + "step": 5569 + }, + { + "epoch": 0.34576944565149914, + "grad_norm": 0.34436786999045577, + "learning_rate": 9.992917061566472e-05, + "loss": 3.6205, + "step": 5570 + }, + { + "epoch": 0.34583152275125706, + "grad_norm": 0.27816470966389717, + "learning_rate": 9.992897832081363e-05, + "loss": 3.6691, + "step": 5571 + }, + { + "epoch": 0.345893599851015, + "grad_norm": 0.25907739375594285, + "learning_rate": 9.992878576547093e-05, + "loss": 3.5574, + "step": 5572 + }, + { + "epoch": 0.34595567695077284, + "grad_norm": 0.3011685144852945, + "learning_rate": 9.992859294963766e-05, + "loss": 3.7198, + "step": 5573 + }, + { + "epoch": 0.34601775405053076, + "grad_norm": 0.293410616487204, + "learning_rate": 9.99283998733148e-05, + "loss": 3.629, + "step": 5574 + }, + { + "epoch": 0.3460798311502887, + "grad_norm": 0.3560762140459714, + "learning_rate": 9.992820653650335e-05, + "loss": 3.5637, + "step": 5575 + }, + { + "epoch": 0.34614190825004654, + "grad_norm": 0.30674634921039606, + "learning_rate": 9.992801293920431e-05, + "loss": 3.6721, + "step": 5576 + }, + { + "epoch": 0.34620398534980446, + "grad_norm": 0.30950647893357836, + "learning_rate": 9.992781908141873e-05, + "loss": 3.6362, + "step": 5577 + }, + { + "epoch": 0.3462660624495624, + "grad_norm": 0.7455782482521973, + "learning_rate": 9.992762496314758e-05, + "loss": 3.5987, + "step": 5578 + }, + { + "epoch": 0.34632813954932024, + "grad_norm": 0.43000887210143135, + "learning_rate": 9.99274305843919e-05, + "loss": 3.6695, + "step": 5579 + }, + { + "epoch": 0.34639021664907815, + "grad_norm": 0.29007510080155774, + "learning_rate": 9.992723594515269e-05, + "loss": 3.5713, + "step": 5580 + }, + { + "epoch": 0.34645229374883607, + "grad_norm": 0.43439300005824594, + "learning_rate": 9.992704104543098e-05, + "loss": 3.5992, + "step": 5581 + }, + { + "epoch": 0.34651437084859393, + "grad_norm": 0.30753246533355577, + "learning_rate": 9.992684588522777e-05, + "loss": 3.6215, + "step": 5582 + }, + { + "epoch": 0.34657644794835185, + "grad_norm": 0.4559972918904768, + "learning_rate": 9.992665046454408e-05, + "loss": 3.6514, + "step": 5583 + }, + { + "epoch": 0.34663852504810977, + "grad_norm": 0.33647375775147653, + "learning_rate": 9.992645478338094e-05, + "loss": 3.7079, + "step": 5584 + }, + { + "epoch": 0.34670060214786763, + "grad_norm": 0.8414598042367495, + "learning_rate": 9.992625884173936e-05, + "loss": 3.5548, + "step": 5585 + }, + { + "epoch": 0.34676267924762555, + "grad_norm": 0.46095591039583145, + "learning_rate": 9.992606263962037e-05, + "loss": 3.6377, + "step": 5586 + }, + { + "epoch": 0.34682475634738347, + "grad_norm": 0.6168618610687276, + "learning_rate": 9.992586617702499e-05, + "loss": 3.5838, + "step": 5587 + }, + { + "epoch": 0.34688683344714133, + "grad_norm": 0.4702480062830842, + "learning_rate": 9.992566945395426e-05, + "loss": 3.7186, + "step": 5588 + }, + { + "epoch": 0.34694891054689925, + "grad_norm": 0.8628309983063521, + "learning_rate": 9.992547247040918e-05, + "loss": 3.6392, + "step": 5589 + }, + { + "epoch": 0.34701098764665717, + "grad_norm": 0.5139629279174701, + "learning_rate": 9.992527522639078e-05, + "loss": 3.6646, + "step": 5590 + }, + { + "epoch": 0.347073064746415, + "grad_norm": 0.6396107902804743, + "learning_rate": 9.992507772190011e-05, + "loss": 3.6667, + "step": 5591 + }, + { + "epoch": 0.34713514184617295, + "grad_norm": 0.567366450430022, + "learning_rate": 9.99248799569382e-05, + "loss": 3.7018, + "step": 5592 + }, + { + "epoch": 0.34719721894593086, + "grad_norm": 1.08777490407867, + "learning_rate": 9.992468193150606e-05, + "loss": 3.6935, + "step": 5593 + }, + { + "epoch": 0.3472592960456887, + "grad_norm": 0.8725291174044202, + "learning_rate": 9.992448364560474e-05, + "loss": 3.692, + "step": 5594 + }, + { + "epoch": 0.34732137314544664, + "grad_norm": 0.5639195807982835, + "learning_rate": 9.992428509923526e-05, + "loss": 3.6472, + "step": 5595 + }, + { + "epoch": 0.34738345024520456, + "grad_norm": 0.4992917229892907, + "learning_rate": 9.992408629239867e-05, + "loss": 3.6351, + "step": 5596 + }, + { + "epoch": 0.3474455273449624, + "grad_norm": 0.8200909996711946, + "learning_rate": 9.9923887225096e-05, + "loss": 3.6006, + "step": 5597 + }, + { + "epoch": 0.34750760444472034, + "grad_norm": 0.5579741983566976, + "learning_rate": 9.992368789732829e-05, + "loss": 3.6119, + "step": 5598 + }, + { + "epoch": 0.34756968154447826, + "grad_norm": 0.6071035158009362, + "learning_rate": 9.99234883090966e-05, + "loss": 3.5592, + "step": 5599 + }, + { + "epoch": 0.3476317586442361, + "grad_norm": 0.5741313167102439, + "learning_rate": 9.992328846040193e-05, + "loss": 3.5754, + "step": 5600 + }, + { + "epoch": 0.34769383574399404, + "grad_norm": 0.4961577997387064, + "learning_rate": 9.992308835124536e-05, + "loss": 3.5878, + "step": 5601 + }, + { + "epoch": 0.34775591284375196, + "grad_norm": 0.5350619044987346, + "learning_rate": 9.99228879816279e-05, + "loss": 3.5709, + "step": 5602 + }, + { + "epoch": 0.3478179899435098, + "grad_norm": 0.41006666749226967, + "learning_rate": 9.992268735155063e-05, + "loss": 3.7018, + "step": 5603 + }, + { + "epoch": 0.34788006704326774, + "grad_norm": 0.3184118169173783, + "learning_rate": 9.992248646101456e-05, + "loss": 3.6444, + "step": 5604 + }, + { + "epoch": 0.34794214414302566, + "grad_norm": 0.4306845108679148, + "learning_rate": 9.992228531002079e-05, + "loss": 3.6541, + "step": 5605 + }, + { + "epoch": 0.3480042212427835, + "grad_norm": 0.43280281923670627, + "learning_rate": 9.992208389857031e-05, + "loss": 3.6693, + "step": 5606 + }, + { + "epoch": 0.34806629834254144, + "grad_norm": 0.42436130229096963, + "learning_rate": 9.992188222666422e-05, + "loss": 3.5995, + "step": 5607 + }, + { + "epoch": 0.34812837544229935, + "grad_norm": 0.27463028829498265, + "learning_rate": 9.992168029430355e-05, + "loss": 3.6694, + "step": 5608 + }, + { + "epoch": 0.3481904525420572, + "grad_norm": 0.8418473230809554, + "learning_rate": 9.992147810148936e-05, + "loss": 3.6929, + "step": 5609 + }, + { + "epoch": 0.34825252964181513, + "grad_norm": 0.6672482313259176, + "learning_rate": 9.992127564822267e-05, + "loss": 3.7142, + "step": 5610 + }, + { + "epoch": 0.34831460674157305, + "grad_norm": 0.36006775718390926, + "learning_rate": 9.99210729345046e-05, + "loss": 3.7173, + "step": 5611 + }, + { + "epoch": 0.3483766838413309, + "grad_norm": 0.41855307759853383, + "learning_rate": 9.992086996033616e-05, + "loss": 3.6879, + "step": 5612 + }, + { + "epoch": 0.34843876094108883, + "grad_norm": 0.7006573077332056, + "learning_rate": 9.992066672571841e-05, + "loss": 3.6697, + "step": 5613 + }, + { + "epoch": 0.34850083804084675, + "grad_norm": 0.5563957077094596, + "learning_rate": 9.992046323065242e-05, + "loss": 3.6803, + "step": 5614 + }, + { + "epoch": 0.3485629151406046, + "grad_norm": 0.34667922543776974, + "learning_rate": 9.992025947513928e-05, + "loss": 3.5362, + "step": 5615 + }, + { + "epoch": 0.34862499224036253, + "grad_norm": 0.40839819376795466, + "learning_rate": 9.992005545918002e-05, + "loss": 3.6513, + "step": 5616 + }, + { + "epoch": 0.34868706934012045, + "grad_norm": 0.3879527623250431, + "learning_rate": 9.991985118277571e-05, + "loss": 3.6589, + "step": 5617 + }, + { + "epoch": 0.3487491464398783, + "grad_norm": 0.4294930104607317, + "learning_rate": 9.991964664592741e-05, + "loss": 3.6187, + "step": 5618 + }, + { + "epoch": 0.34881122353963623, + "grad_norm": 0.330201210683224, + "learning_rate": 9.991944184863621e-05, + "loss": 3.7339, + "step": 5619 + }, + { + "epoch": 0.34887330063939415, + "grad_norm": 0.3899750070021789, + "learning_rate": 9.991923679090316e-05, + "loss": 3.6831, + "step": 5620 + }, + { + "epoch": 0.348935377739152, + "grad_norm": 0.32341759293333555, + "learning_rate": 9.991903147272932e-05, + "loss": 3.543, + "step": 5621 + }, + { + "epoch": 0.3489974548389099, + "grad_norm": 0.4901202095505894, + "learning_rate": 9.991882589411578e-05, + "loss": 3.7599, + "step": 5622 + }, + { + "epoch": 0.34905953193866784, + "grad_norm": 0.4852295689612754, + "learning_rate": 9.991862005506361e-05, + "loss": 3.6408, + "step": 5623 + }, + { + "epoch": 0.3491216090384257, + "grad_norm": 0.3070989815547891, + "learning_rate": 9.991841395557388e-05, + "loss": 3.6572, + "step": 5624 + }, + { + "epoch": 0.3491836861381836, + "grad_norm": 0.43454318872735653, + "learning_rate": 9.991820759564768e-05, + "loss": 3.6261, + "step": 5625 + }, + { + "epoch": 0.34924576323794154, + "grad_norm": 0.2910829632573178, + "learning_rate": 9.991800097528606e-05, + "loss": 3.6769, + "step": 5626 + }, + { + "epoch": 0.3493078403376994, + "grad_norm": 0.5165317538712051, + "learning_rate": 9.991779409449011e-05, + "loss": 3.6511, + "step": 5627 + }, + { + "epoch": 0.3493699174374573, + "grad_norm": 0.33588469999693055, + "learning_rate": 9.991758695326091e-05, + "loss": 3.6704, + "step": 5628 + }, + { + "epoch": 0.34943199453721524, + "grad_norm": 0.3938565859265884, + "learning_rate": 9.991737955159954e-05, + "loss": 3.6349, + "step": 5629 + }, + { + "epoch": 0.3494940716369731, + "grad_norm": 0.39085581635847705, + "learning_rate": 9.991717188950709e-05, + "loss": 3.5902, + "step": 5630 + }, + { + "epoch": 0.349556148736731, + "grad_norm": 0.4298626456064675, + "learning_rate": 9.991696396698463e-05, + "loss": 3.6288, + "step": 5631 + }, + { + "epoch": 0.34961822583648894, + "grad_norm": 0.33464804179004665, + "learning_rate": 9.991675578403325e-05, + "loss": 3.7122, + "step": 5632 + }, + { + "epoch": 0.3496803029362468, + "grad_norm": 0.3633791063652113, + "learning_rate": 9.991654734065405e-05, + "loss": 3.4823, + "step": 5633 + }, + { + "epoch": 0.3497423800360047, + "grad_norm": 0.26997695228435264, + "learning_rate": 9.991633863684809e-05, + "loss": 3.7543, + "step": 5634 + }, + { + "epoch": 0.34980445713576264, + "grad_norm": 0.3164341054828869, + "learning_rate": 9.991612967261647e-05, + "loss": 3.6328, + "step": 5635 + }, + { + "epoch": 0.3498665342355205, + "grad_norm": 0.3181630916124006, + "learning_rate": 9.99159204479603e-05, + "loss": 3.6474, + "step": 5636 + }, + { + "epoch": 0.3499286113352784, + "grad_norm": 0.45265882452488426, + "learning_rate": 9.991571096288066e-05, + "loss": 3.672, + "step": 5637 + }, + { + "epoch": 0.34999068843503633, + "grad_norm": 0.3065992801273664, + "learning_rate": 9.991550121737863e-05, + "loss": 3.6927, + "step": 5638 + }, + { + "epoch": 0.3500527655347942, + "grad_norm": 0.2559647234037741, + "learning_rate": 9.991529121145531e-05, + "loss": 3.6273, + "step": 5639 + }, + { + "epoch": 0.3501148426345521, + "grad_norm": 0.22488139722099457, + "learning_rate": 9.99150809451118e-05, + "loss": 3.6429, + "step": 5640 + }, + { + "epoch": 0.35017691973431003, + "grad_norm": 0.27006487612218627, + "learning_rate": 9.991487041834919e-05, + "loss": 3.7205, + "step": 5641 + }, + { + "epoch": 0.3502389968340679, + "grad_norm": 0.3009514760449083, + "learning_rate": 9.991465963116858e-05, + "loss": 3.6401, + "step": 5642 + }, + { + "epoch": 0.3503010739338258, + "grad_norm": 0.3384650645033818, + "learning_rate": 9.991444858357108e-05, + "loss": 3.6033, + "step": 5643 + }, + { + "epoch": 0.35036315103358373, + "grad_norm": 0.271947876710558, + "learning_rate": 9.99142372755578e-05, + "loss": 3.6231, + "step": 5644 + }, + { + "epoch": 0.3504252281333416, + "grad_norm": 0.23786997086886136, + "learning_rate": 9.99140257071298e-05, + "loss": 3.5603, + "step": 5645 + }, + { + "epoch": 0.3504873052330995, + "grad_norm": 0.24328911870874495, + "learning_rate": 9.991381387828823e-05, + "loss": 3.6289, + "step": 5646 + }, + { + "epoch": 0.35054938233285743, + "grad_norm": 0.24147701645903835, + "learning_rate": 9.991360178903418e-05, + "loss": 3.6333, + "step": 5647 + }, + { + "epoch": 0.3506114594326153, + "grad_norm": 0.347581523193567, + "learning_rate": 9.991338943936874e-05, + "loss": 3.6817, + "step": 5648 + }, + { + "epoch": 0.3506735365323732, + "grad_norm": 0.3659183808633129, + "learning_rate": 9.991317682929302e-05, + "loss": 3.6355, + "step": 5649 + }, + { + "epoch": 0.3507356136321311, + "grad_norm": 0.2559346532372225, + "learning_rate": 9.991296395880816e-05, + "loss": 3.6265, + "step": 5650 + }, + { + "epoch": 0.350797690731889, + "grad_norm": 0.2763068197253682, + "learning_rate": 9.991275082791524e-05, + "loss": 3.6004, + "step": 5651 + }, + { + "epoch": 0.3508597678316469, + "grad_norm": 0.27170439936168844, + "learning_rate": 9.991253743661538e-05, + "loss": 3.5928, + "step": 5652 + }, + { + "epoch": 0.3509218449314048, + "grad_norm": 0.30865173760769615, + "learning_rate": 9.99123237849097e-05, + "loss": 3.6507, + "step": 5653 + }, + { + "epoch": 0.3509839220311627, + "grad_norm": 0.25860720308465956, + "learning_rate": 9.991210987279931e-05, + "loss": 3.6062, + "step": 5654 + }, + { + "epoch": 0.3510459991309206, + "grad_norm": 0.2772978623804075, + "learning_rate": 9.991189570028533e-05, + "loss": 3.5461, + "step": 5655 + }, + { + "epoch": 0.3511080762306785, + "grad_norm": 0.35724354031504146, + "learning_rate": 9.991168126736888e-05, + "loss": 3.6618, + "step": 5656 + }, + { + "epoch": 0.3511701533304364, + "grad_norm": 0.1958129384364368, + "learning_rate": 9.991146657405105e-05, + "loss": 3.6673, + "step": 5657 + }, + { + "epoch": 0.3512322304301943, + "grad_norm": 0.30291760959810804, + "learning_rate": 9.9911251620333e-05, + "loss": 3.5698, + "step": 5658 + }, + { + "epoch": 0.3512943075299522, + "grad_norm": 0.2622447350812647, + "learning_rate": 9.991103640621582e-05, + "loss": 3.623, + "step": 5659 + }, + { + "epoch": 0.3513563846297101, + "grad_norm": 0.2396311124898596, + "learning_rate": 9.991082093170065e-05, + "loss": 3.6143, + "step": 5660 + }, + { + "epoch": 0.351418461729468, + "grad_norm": 0.2805011900521259, + "learning_rate": 9.991060519678864e-05, + "loss": 3.6753, + "step": 5661 + }, + { + "epoch": 0.3514805388292259, + "grad_norm": 0.2249808349659381, + "learning_rate": 9.991038920148086e-05, + "loss": 3.6007, + "step": 5662 + }, + { + "epoch": 0.3515426159289838, + "grad_norm": 0.36866976463132595, + "learning_rate": 9.991017294577846e-05, + "loss": 3.5821, + "step": 5663 + }, + { + "epoch": 0.3516046930287417, + "grad_norm": 0.32962810153030037, + "learning_rate": 9.990995642968257e-05, + "loss": 3.6328, + "step": 5664 + }, + { + "epoch": 0.3516667701284996, + "grad_norm": 0.22454001143476543, + "learning_rate": 9.990973965319433e-05, + "loss": 3.6739, + "step": 5665 + }, + { + "epoch": 0.3517288472282575, + "grad_norm": 0.3485269786573327, + "learning_rate": 9.990952261631487e-05, + "loss": 3.5806, + "step": 5666 + }, + { + "epoch": 0.3517909243280154, + "grad_norm": 0.2991060697174342, + "learning_rate": 9.99093053190453e-05, + "loss": 3.5132, + "step": 5667 + }, + { + "epoch": 0.3518530014277733, + "grad_norm": 0.33895130927239153, + "learning_rate": 9.990908776138678e-05, + "loss": 3.6255, + "step": 5668 + }, + { + "epoch": 0.3519150785275312, + "grad_norm": 0.4075345125970909, + "learning_rate": 9.990886994334042e-05, + "loss": 3.6142, + "step": 5669 + }, + { + "epoch": 0.3519771556272891, + "grad_norm": 0.29925090826146883, + "learning_rate": 9.990865186490738e-05, + "loss": 3.5829, + "step": 5670 + }, + { + "epoch": 0.352039232727047, + "grad_norm": 0.33858606952076775, + "learning_rate": 9.990843352608879e-05, + "loss": 3.5183, + "step": 5671 + }, + { + "epoch": 0.3521013098268049, + "grad_norm": 0.31414746895340756, + "learning_rate": 9.990821492688578e-05, + "loss": 3.6869, + "step": 5672 + }, + { + "epoch": 0.3521633869265628, + "grad_norm": 0.2576474326008958, + "learning_rate": 9.99079960672995e-05, + "loss": 3.6207, + "step": 5673 + }, + { + "epoch": 0.3522254640263207, + "grad_norm": 0.35773635537140247, + "learning_rate": 9.990777694733109e-05, + "loss": 3.6073, + "step": 5674 + }, + { + "epoch": 0.3522875411260786, + "grad_norm": 0.2568594773097008, + "learning_rate": 9.990755756698168e-05, + "loss": 3.6277, + "step": 5675 + }, + { + "epoch": 0.3523496182258365, + "grad_norm": 0.2900632726153108, + "learning_rate": 9.990733792625244e-05, + "loss": 3.6241, + "step": 5676 + }, + { + "epoch": 0.3524116953255944, + "grad_norm": 0.24846799651290646, + "learning_rate": 9.99071180251445e-05, + "loss": 3.6241, + "step": 5677 + }, + { + "epoch": 0.35247377242535227, + "grad_norm": 0.3412961179884416, + "learning_rate": 9.990689786365901e-05, + "loss": 3.566, + "step": 5678 + }, + { + "epoch": 0.3525358495251102, + "grad_norm": 0.2256832903567379, + "learning_rate": 9.990667744179712e-05, + "loss": 3.6461, + "step": 5679 + }, + { + "epoch": 0.3525979266248681, + "grad_norm": 0.27811555524926956, + "learning_rate": 9.990645675955996e-05, + "loss": 3.5542, + "step": 5680 + }, + { + "epoch": 0.35266000372462597, + "grad_norm": 0.31469299218803126, + "learning_rate": 9.990623581694872e-05, + "loss": 3.6468, + "step": 5681 + }, + { + "epoch": 0.3527220808243839, + "grad_norm": 0.3994750183898054, + "learning_rate": 9.990601461396454e-05, + "loss": 3.6232, + "step": 5682 + }, + { + "epoch": 0.3527841579241418, + "grad_norm": 0.25263187741483656, + "learning_rate": 9.990579315060855e-05, + "loss": 3.6278, + "step": 5683 + }, + { + "epoch": 0.35284623502389967, + "grad_norm": 0.4526389079776565, + "learning_rate": 9.990557142688193e-05, + "loss": 3.5058, + "step": 5684 + }, + { + "epoch": 0.3529083121236576, + "grad_norm": 0.23211806339782864, + "learning_rate": 9.990534944278582e-05, + "loss": 3.5719, + "step": 5685 + }, + { + "epoch": 0.3529703892234155, + "grad_norm": 0.35704097358935205, + "learning_rate": 9.990512719832139e-05, + "loss": 3.5549, + "step": 5686 + }, + { + "epoch": 0.35303246632317337, + "grad_norm": 0.2178667578443153, + "learning_rate": 9.990490469348981e-05, + "loss": 3.612, + "step": 5687 + }, + { + "epoch": 0.3530945434229313, + "grad_norm": 0.21749532586315018, + "learning_rate": 9.99046819282922e-05, + "loss": 3.571, + "step": 5688 + }, + { + "epoch": 0.3531566205226892, + "grad_norm": 0.3190508044880007, + "learning_rate": 9.990445890272977e-05, + "loss": 3.5819, + "step": 5689 + }, + { + "epoch": 0.35321869762244706, + "grad_norm": 0.2548493111177309, + "learning_rate": 9.990423561680367e-05, + "loss": 3.5202, + "step": 5690 + }, + { + "epoch": 0.353280774722205, + "grad_norm": 0.5698650642929582, + "learning_rate": 9.990401207051504e-05, + "loss": 3.5709, + "step": 5691 + }, + { + "epoch": 0.3533428518219629, + "grad_norm": 0.5266857421816695, + "learning_rate": 9.990378826386508e-05, + "loss": 3.6031, + "step": 5692 + }, + { + "epoch": 0.35340492892172076, + "grad_norm": 0.5282248024624486, + "learning_rate": 9.990356419685492e-05, + "loss": 3.6712, + "step": 5693 + }, + { + "epoch": 0.3534670060214787, + "grad_norm": 0.44996238193632815, + "learning_rate": 9.990333986948577e-05, + "loss": 3.6153, + "step": 5694 + }, + { + "epoch": 0.3535290831212366, + "grad_norm": 0.46429133248073934, + "learning_rate": 9.990311528175877e-05, + "loss": 3.6898, + "step": 5695 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 0.4988925560643818, + "learning_rate": 9.990289043367511e-05, + "loss": 3.6301, + "step": 5696 + }, + { + "epoch": 0.3536532373207524, + "grad_norm": 0.3219531654148683, + "learning_rate": 9.990266532523596e-05, + "loss": 3.5538, + "step": 5697 + }, + { + "epoch": 0.3537153144205103, + "grad_norm": 0.3831608024658376, + "learning_rate": 9.990243995644249e-05, + "loss": 3.5535, + "step": 5698 + }, + { + "epoch": 0.35377739152026816, + "grad_norm": 0.2968763426700145, + "learning_rate": 9.990221432729587e-05, + "loss": 3.6983, + "step": 5699 + }, + { + "epoch": 0.3538394686200261, + "grad_norm": 0.4719843384086071, + "learning_rate": 9.990198843779729e-05, + "loss": 3.6361, + "step": 5700 + }, + { + "epoch": 0.353901545719784, + "grad_norm": 0.2248773965983522, + "learning_rate": 9.990176228794792e-05, + "loss": 3.6119, + "step": 5701 + }, + { + "epoch": 0.35396362281954186, + "grad_norm": 0.2244404862719615, + "learning_rate": 9.990153587774895e-05, + "loss": 3.7046, + "step": 5702 + }, + { + "epoch": 0.3540256999192998, + "grad_norm": 0.32662407144034217, + "learning_rate": 9.990130920720153e-05, + "loss": 3.6145, + "step": 5703 + }, + { + "epoch": 0.3540877770190577, + "grad_norm": 0.4414086299595771, + "learning_rate": 9.990108227630688e-05, + "loss": 3.652, + "step": 5704 + }, + { + "epoch": 0.35414985411881555, + "grad_norm": 0.5742656143682945, + "learning_rate": 9.990085508506616e-05, + "loss": 3.6068, + "step": 5705 + }, + { + "epoch": 0.3542119312185735, + "grad_norm": 0.41210961002867447, + "learning_rate": 9.990062763348059e-05, + "loss": 3.6635, + "step": 5706 + }, + { + "epoch": 0.3542740083183314, + "grad_norm": 0.3883787995880955, + "learning_rate": 9.990039992155131e-05, + "loss": 3.5407, + "step": 5707 + }, + { + "epoch": 0.35433608541808925, + "grad_norm": 0.37526262544312383, + "learning_rate": 9.990017194927954e-05, + "loss": 3.5789, + "step": 5708 + }, + { + "epoch": 0.35439816251784717, + "grad_norm": 0.2920459327864197, + "learning_rate": 9.989994371666647e-05, + "loss": 3.6746, + "step": 5709 + }, + { + "epoch": 0.3544602396176051, + "grad_norm": 0.331703540419176, + "learning_rate": 9.989971522371326e-05, + "loss": 3.577, + "step": 5710 + }, + { + "epoch": 0.35452231671736295, + "grad_norm": 0.31950777429276894, + "learning_rate": 9.989948647042113e-05, + "loss": 3.6104, + "step": 5711 + }, + { + "epoch": 0.35458439381712087, + "grad_norm": 0.3178645893609258, + "learning_rate": 9.989925745679126e-05, + "loss": 3.5759, + "step": 5712 + }, + { + "epoch": 0.3546464709168788, + "grad_norm": 0.2645935761854494, + "learning_rate": 9.989902818282486e-05, + "loss": 3.6405, + "step": 5713 + }, + { + "epoch": 0.35470854801663665, + "grad_norm": 0.29160282878255855, + "learning_rate": 9.989879864852311e-05, + "loss": 3.6628, + "step": 5714 + }, + { + "epoch": 0.35477062511639457, + "grad_norm": 0.26523466039987753, + "learning_rate": 9.989856885388721e-05, + "loss": 3.5626, + "step": 5715 + }, + { + "epoch": 0.3548327022161525, + "grad_norm": 0.26498422751182754, + "learning_rate": 9.989833879891836e-05, + "loss": 3.6955, + "step": 5716 + }, + { + "epoch": 0.35489477931591035, + "grad_norm": 0.2084254160671101, + "learning_rate": 9.989810848361778e-05, + "loss": 3.629, + "step": 5717 + }, + { + "epoch": 0.35495685641566826, + "grad_norm": 0.31939492570340333, + "learning_rate": 9.989787790798665e-05, + "loss": 3.6736, + "step": 5718 + }, + { + "epoch": 0.3550189335154262, + "grad_norm": 0.24032308909843894, + "learning_rate": 9.989764707202616e-05, + "loss": 3.6914, + "step": 5719 + }, + { + "epoch": 0.35508101061518405, + "grad_norm": 0.22339635116635478, + "learning_rate": 9.989741597573756e-05, + "loss": 3.7231, + "step": 5720 + }, + { + "epoch": 0.35514308771494196, + "grad_norm": 0.2780912560984374, + "learning_rate": 9.989718461912201e-05, + "loss": 3.5967, + "step": 5721 + }, + { + "epoch": 0.3552051648146999, + "grad_norm": 0.20105989357904505, + "learning_rate": 9.989695300218073e-05, + "loss": 3.5468, + "step": 5722 + }, + { + "epoch": 0.35526724191445774, + "grad_norm": 0.2440420485803934, + "learning_rate": 9.989672112491494e-05, + "loss": 3.5613, + "step": 5723 + }, + { + "epoch": 0.35532931901421566, + "grad_norm": 0.26812465569889754, + "learning_rate": 9.989648898732586e-05, + "loss": 3.6009, + "step": 5724 + }, + { + "epoch": 0.3553913961139736, + "grad_norm": 0.1974885738212259, + "learning_rate": 9.989625658941467e-05, + "loss": 3.6538, + "step": 5725 + }, + { + "epoch": 0.35545347321373144, + "grad_norm": 0.2889412548341883, + "learning_rate": 9.989602393118259e-05, + "loss": 3.6149, + "step": 5726 + }, + { + "epoch": 0.35551555031348936, + "grad_norm": 0.2018206791087164, + "learning_rate": 9.989579101263085e-05, + "loss": 3.5858, + "step": 5727 + }, + { + "epoch": 0.3555776274132473, + "grad_norm": 0.3859275082982068, + "learning_rate": 9.989555783376066e-05, + "loss": 3.5607, + "step": 5728 + }, + { + "epoch": 0.35563970451300514, + "grad_norm": 0.21076331187411373, + "learning_rate": 9.98953243945732e-05, + "loss": 3.6079, + "step": 5729 + }, + { + "epoch": 0.35570178161276306, + "grad_norm": 0.28828424643429845, + "learning_rate": 9.989509069506976e-05, + "loss": 3.5596, + "step": 5730 + }, + { + "epoch": 0.355763858712521, + "grad_norm": 0.34488938892526494, + "learning_rate": 9.98948567352515e-05, + "loss": 3.5281, + "step": 5731 + }, + { + "epoch": 0.35582593581227884, + "grad_norm": 0.27705823950801545, + "learning_rate": 9.989462251511967e-05, + "loss": 3.6287, + "step": 5732 + }, + { + "epoch": 0.35588801291203676, + "grad_norm": 0.26824439416024676, + "learning_rate": 9.989438803467548e-05, + "loss": 3.6071, + "step": 5733 + }, + { + "epoch": 0.3559500900117947, + "grad_norm": 0.20046485804929767, + "learning_rate": 9.989415329392014e-05, + "loss": 3.645, + "step": 5734 + }, + { + "epoch": 0.35601216711155254, + "grad_norm": 0.28942961191900307, + "learning_rate": 9.989391829285491e-05, + "loss": 3.6319, + "step": 5735 + }, + { + "epoch": 0.35607424421131045, + "grad_norm": 0.29261535947465733, + "learning_rate": 9.989368303148098e-05, + "loss": 3.5975, + "step": 5736 + }, + { + "epoch": 0.35613632131106837, + "grad_norm": 0.25220180038407747, + "learning_rate": 9.98934475097996e-05, + "loss": 3.6299, + "step": 5737 + }, + { + "epoch": 0.35619839841082623, + "grad_norm": 0.2603681646320641, + "learning_rate": 9.989321172781199e-05, + "loss": 3.5418, + "step": 5738 + }, + { + "epoch": 0.35626047551058415, + "grad_norm": 0.2510916153034274, + "learning_rate": 9.989297568551938e-05, + "loss": 3.5288, + "step": 5739 + }, + { + "epoch": 0.35632255261034207, + "grad_norm": 0.2584792676060312, + "learning_rate": 9.989273938292302e-05, + "loss": 3.5608, + "step": 5740 + }, + { + "epoch": 0.35638462971009993, + "grad_norm": 0.19383129175713576, + "learning_rate": 9.989250282002411e-05, + "loss": 3.579, + "step": 5741 + }, + { + "epoch": 0.35644670680985785, + "grad_norm": 0.30791866727487915, + "learning_rate": 9.98922659968239e-05, + "loss": 3.7018, + "step": 5742 + }, + { + "epoch": 0.35650878390961577, + "grad_norm": 0.2312708974108, + "learning_rate": 9.989202891332364e-05, + "loss": 3.5708, + "step": 5743 + }, + { + "epoch": 0.35657086100937363, + "grad_norm": 0.2441069160078264, + "learning_rate": 9.989179156952455e-05, + "loss": 3.5893, + "step": 5744 + }, + { + "epoch": 0.35663293810913155, + "grad_norm": 0.2595636112906725, + "learning_rate": 9.989155396542786e-05, + "loss": 3.5248, + "step": 5745 + }, + { + "epoch": 0.35669501520888947, + "grad_norm": 0.20562351004515814, + "learning_rate": 9.989131610103484e-05, + "loss": 3.6486, + "step": 5746 + }, + { + "epoch": 0.35675709230864733, + "grad_norm": 0.2326892468921993, + "learning_rate": 9.98910779763467e-05, + "loss": 3.5485, + "step": 5747 + }, + { + "epoch": 0.35681916940840525, + "grad_norm": 0.2395139452063907, + "learning_rate": 9.989083959136469e-05, + "loss": 3.6208, + "step": 5748 + }, + { + "epoch": 0.35688124650816316, + "grad_norm": 0.20995171992450742, + "learning_rate": 9.989060094609007e-05, + "loss": 3.5727, + "step": 5749 + }, + { + "epoch": 0.356943323607921, + "grad_norm": 0.2820738012702704, + "learning_rate": 9.989036204052407e-05, + "loss": 3.3941, + "step": 5750 + }, + { + "epoch": 0.35700540070767894, + "grad_norm": 0.25184407202859455, + "learning_rate": 9.989012287466794e-05, + "loss": 3.5965, + "step": 5751 + }, + { + "epoch": 0.35706747780743686, + "grad_norm": 0.2723691778956891, + "learning_rate": 9.988988344852292e-05, + "loss": 3.6443, + "step": 5752 + }, + { + "epoch": 0.3571295549071947, + "grad_norm": 0.2932039080029224, + "learning_rate": 9.988964376209029e-05, + "loss": 3.4982, + "step": 5753 + }, + { + "epoch": 0.35719163200695264, + "grad_norm": 0.38287878435776024, + "learning_rate": 9.988940381537126e-05, + "loss": 3.6449, + "step": 5754 + }, + { + "epoch": 0.35725370910671056, + "grad_norm": 0.2706535580215678, + "learning_rate": 9.98891636083671e-05, + "loss": 3.5429, + "step": 5755 + }, + { + "epoch": 0.3573157862064684, + "grad_norm": 0.3219141575225627, + "learning_rate": 9.988892314107906e-05, + "loss": 3.618, + "step": 5756 + }, + { + "epoch": 0.35737786330622634, + "grad_norm": 0.29802161193411725, + "learning_rate": 9.98886824135084e-05, + "loss": 3.5822, + "step": 5757 + }, + { + "epoch": 0.35743994040598426, + "grad_norm": 0.24520232709131193, + "learning_rate": 9.988844142565637e-05, + "loss": 3.5457, + "step": 5758 + }, + { + "epoch": 0.3575020175057421, + "grad_norm": 0.3093110813232434, + "learning_rate": 9.988820017752423e-05, + "loss": 3.5581, + "step": 5759 + }, + { + "epoch": 0.35756409460550004, + "grad_norm": 0.22957638758302648, + "learning_rate": 9.988795866911325e-05, + "loss": 3.6256, + "step": 5760 + }, + { + "epoch": 0.35762617170525796, + "grad_norm": 0.24207620216826736, + "learning_rate": 9.988771690042467e-05, + "loss": 3.5427, + "step": 5761 + }, + { + "epoch": 0.3576882488050158, + "grad_norm": 0.24113296298676304, + "learning_rate": 9.988747487145976e-05, + "loss": 3.5719, + "step": 5762 + }, + { + "epoch": 0.35775032590477374, + "grad_norm": 0.3297134167758066, + "learning_rate": 9.988723258221977e-05, + "loss": 3.6066, + "step": 5763 + }, + { + "epoch": 0.35781240300453165, + "grad_norm": 0.3249298707785831, + "learning_rate": 9.9886990032706e-05, + "loss": 3.5939, + "step": 5764 + }, + { + "epoch": 0.3578744801042895, + "grad_norm": 0.35952801142182644, + "learning_rate": 9.988674722291969e-05, + "loss": 3.6453, + "step": 5765 + }, + { + "epoch": 0.35793655720404743, + "grad_norm": 0.37190401060281014, + "learning_rate": 9.988650415286209e-05, + "loss": 3.4949, + "step": 5766 + }, + { + "epoch": 0.35799863430380535, + "grad_norm": 0.3683487203924023, + "learning_rate": 9.988626082253451e-05, + "loss": 3.5048, + "step": 5767 + }, + { + "epoch": 0.3580607114035632, + "grad_norm": 0.37164055542607405, + "learning_rate": 9.988601723193818e-05, + "loss": 3.6932, + "step": 5768 + }, + { + "epoch": 0.35812278850332113, + "grad_norm": 0.3278385796914197, + "learning_rate": 9.98857733810744e-05, + "loss": 3.6155, + "step": 5769 + }, + { + "epoch": 0.35818486560307905, + "grad_norm": 0.27437341463678055, + "learning_rate": 9.988552926994444e-05, + "loss": 3.5722, + "step": 5770 + }, + { + "epoch": 0.3582469427028369, + "grad_norm": 0.29498526542444725, + "learning_rate": 9.988528489854953e-05, + "loss": 3.6354, + "step": 5771 + }, + { + "epoch": 0.35830901980259483, + "grad_norm": 0.2923154158243909, + "learning_rate": 9.9885040266891e-05, + "loss": 3.6563, + "step": 5772 + }, + { + "epoch": 0.35837109690235275, + "grad_norm": 0.31266361959942696, + "learning_rate": 9.98847953749701e-05, + "loss": 3.5677, + "step": 5773 + }, + { + "epoch": 0.3584331740021106, + "grad_norm": 0.2681452456302311, + "learning_rate": 9.988455022278812e-05, + "loss": 3.5369, + "step": 5774 + }, + { + "epoch": 0.35849525110186853, + "grad_norm": 0.2622730733150728, + "learning_rate": 9.988430481034632e-05, + "loss": 3.6956, + "step": 5775 + }, + { + "epoch": 0.35855732820162645, + "grad_norm": 0.413721916190649, + "learning_rate": 9.9884059137646e-05, + "loss": 3.5759, + "step": 5776 + }, + { + "epoch": 0.3586194053013843, + "grad_norm": 0.2872806738718315, + "learning_rate": 9.988381320468844e-05, + "loss": 3.615, + "step": 5777 + }, + { + "epoch": 0.3586814824011422, + "grad_norm": 0.33074519226215915, + "learning_rate": 9.98835670114749e-05, + "loss": 3.6866, + "step": 5778 + }, + { + "epoch": 0.35874355950090014, + "grad_norm": 0.2547738437911959, + "learning_rate": 9.988332055800669e-05, + "loss": 3.5686, + "step": 5779 + }, + { + "epoch": 0.358805636600658, + "grad_norm": 0.2861935677977512, + "learning_rate": 9.988307384428509e-05, + "loss": 3.5147, + "step": 5780 + }, + { + "epoch": 0.3588677137004159, + "grad_norm": 0.270546893109622, + "learning_rate": 9.988282687031139e-05, + "loss": 3.4595, + "step": 5781 + }, + { + "epoch": 0.35892979080017384, + "grad_norm": 0.27334150264082596, + "learning_rate": 9.988257963608685e-05, + "loss": 3.6166, + "step": 5782 + }, + { + "epoch": 0.3589918678999317, + "grad_norm": 0.29988484755483064, + "learning_rate": 9.98823321416128e-05, + "loss": 3.6232, + "step": 5783 + }, + { + "epoch": 0.3590539449996896, + "grad_norm": 0.24957334350904387, + "learning_rate": 9.988208438689052e-05, + "loss": 3.5994, + "step": 5784 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 0.2507403111539904, + "learning_rate": 9.988183637192128e-05, + "loss": 3.4805, + "step": 5785 + }, + { + "epoch": 0.3591780991992054, + "grad_norm": 0.3011587795671069, + "learning_rate": 9.98815880967064e-05, + "loss": 3.5937, + "step": 5786 + }, + { + "epoch": 0.3592401762989633, + "grad_norm": 0.2939892225958139, + "learning_rate": 9.988133956124716e-05, + "loss": 3.5752, + "step": 5787 + }, + { + "epoch": 0.35930225339872124, + "grad_norm": 0.3309124103541554, + "learning_rate": 9.988109076554486e-05, + "loss": 3.5494, + "step": 5788 + }, + { + "epoch": 0.3593643304984791, + "grad_norm": 0.34975346201355034, + "learning_rate": 9.988084170960081e-05, + "loss": 3.6375, + "step": 5789 + }, + { + "epoch": 0.359426407598237, + "grad_norm": 0.3045625185779587, + "learning_rate": 9.988059239341629e-05, + "loss": 3.5422, + "step": 5790 + }, + { + "epoch": 0.35948848469799494, + "grad_norm": 0.2622513831223051, + "learning_rate": 9.988034281699263e-05, + "loss": 3.5593, + "step": 5791 + }, + { + "epoch": 0.3595505617977528, + "grad_norm": 0.39571395964631195, + "learning_rate": 9.98800929803311e-05, + "loss": 3.6473, + "step": 5792 + }, + { + "epoch": 0.3596126388975107, + "grad_norm": 0.31969910715236205, + "learning_rate": 9.987984288343302e-05, + "loss": 3.6306, + "step": 5793 + }, + { + "epoch": 0.35967471599726863, + "grad_norm": 0.411379235145284, + "learning_rate": 9.987959252629968e-05, + "loss": 3.5617, + "step": 5794 + }, + { + "epoch": 0.3597367930970265, + "grad_norm": 0.24460482907160183, + "learning_rate": 9.98793419089324e-05, + "loss": 3.6776, + "step": 5795 + }, + { + "epoch": 0.3597988701967844, + "grad_norm": 0.22536399789805697, + "learning_rate": 9.987909103133248e-05, + "loss": 3.6492, + "step": 5796 + }, + { + "epoch": 0.35986094729654233, + "grad_norm": 0.2832989760679625, + "learning_rate": 9.987883989350125e-05, + "loss": 3.6337, + "step": 5797 + }, + { + "epoch": 0.3599230243963002, + "grad_norm": 0.4480657292517178, + "learning_rate": 9.987858849544e-05, + "loss": 3.6384, + "step": 5798 + }, + { + "epoch": 0.3599851014960581, + "grad_norm": 0.3748302671640103, + "learning_rate": 9.987833683715003e-05, + "loss": 3.4843, + "step": 5799 + }, + { + "epoch": 0.36004717859581603, + "grad_norm": 0.2750540766866194, + "learning_rate": 9.987808491863268e-05, + "loss": 3.6075, + "step": 5800 + }, + { + "epoch": 0.3601092556955739, + "grad_norm": 0.329444095740224, + "learning_rate": 9.987783273988926e-05, + "loss": 3.4646, + "step": 5801 + }, + { + "epoch": 0.3601713327953318, + "grad_norm": 0.2737378378348093, + "learning_rate": 9.987758030092106e-05, + "loss": 3.6199, + "step": 5802 + }, + { + "epoch": 0.36023340989508973, + "grad_norm": 0.2992210796454752, + "learning_rate": 9.987732760172943e-05, + "loss": 3.5967, + "step": 5803 + }, + { + "epoch": 0.3602954869948476, + "grad_norm": 0.2996770481439015, + "learning_rate": 9.987707464231568e-05, + "loss": 3.6106, + "step": 5804 + }, + { + "epoch": 0.3603575640946055, + "grad_norm": 0.26672730607006534, + "learning_rate": 9.98768214226811e-05, + "loss": 3.5413, + "step": 5805 + }, + { + "epoch": 0.3604196411943634, + "grad_norm": 0.24924477109522694, + "learning_rate": 9.987656794282705e-05, + "loss": 3.6038, + "step": 5806 + }, + { + "epoch": 0.3604817182941213, + "grad_norm": 0.3078002264025403, + "learning_rate": 9.987631420275483e-05, + "loss": 3.5514, + "step": 5807 + }, + { + "epoch": 0.3605437953938792, + "grad_norm": 0.30549148709100526, + "learning_rate": 9.987606020246578e-05, + "loss": 3.5131, + "step": 5808 + }, + { + "epoch": 0.36060587249363707, + "grad_norm": 0.34313329733525116, + "learning_rate": 9.987580594196121e-05, + "loss": 3.5693, + "step": 5809 + }, + { + "epoch": 0.360667949593395, + "grad_norm": 0.2868085959504012, + "learning_rate": 9.987555142124245e-05, + "loss": 3.5678, + "step": 5810 + }, + { + "epoch": 0.3607300266931529, + "grad_norm": 0.5017569191859019, + "learning_rate": 9.987529664031083e-05, + "loss": 3.6973, + "step": 5811 + }, + { + "epoch": 0.36079210379291077, + "grad_norm": 0.4880470019163721, + "learning_rate": 9.98750415991677e-05, + "loss": 3.5426, + "step": 5812 + }, + { + "epoch": 0.3608541808926687, + "grad_norm": 0.3490102418562184, + "learning_rate": 9.987478629781435e-05, + "loss": 3.6553, + "step": 5813 + }, + { + "epoch": 0.3609162579924266, + "grad_norm": 0.44072733630842614, + "learning_rate": 9.987453073625212e-05, + "loss": 3.5672, + "step": 5814 + }, + { + "epoch": 0.36097833509218447, + "grad_norm": 0.28892281740094833, + "learning_rate": 9.987427491448237e-05, + "loss": 3.6476, + "step": 5815 + }, + { + "epoch": 0.3610404121919424, + "grad_norm": 0.2992294210020193, + "learning_rate": 9.987401883250645e-05, + "loss": 3.5145, + "step": 5816 + }, + { + "epoch": 0.3611024892917003, + "grad_norm": 0.3208302639632769, + "learning_rate": 9.987376249032562e-05, + "loss": 3.597, + "step": 5817 + }, + { + "epoch": 0.36116456639145816, + "grad_norm": 0.37078177740201806, + "learning_rate": 9.987350588794129e-05, + "loss": 3.541, + "step": 5818 + }, + { + "epoch": 0.3612266434912161, + "grad_norm": 0.2535041559619809, + "learning_rate": 9.987324902535476e-05, + "loss": 3.4928, + "step": 5819 + }, + { + "epoch": 0.361288720590974, + "grad_norm": 0.2421027081246512, + "learning_rate": 9.987299190256739e-05, + "loss": 3.6681, + "step": 5820 + }, + { + "epoch": 0.36135079769073186, + "grad_norm": 0.3073876023728313, + "learning_rate": 9.987273451958052e-05, + "loss": 3.5965, + "step": 5821 + }, + { + "epoch": 0.3614128747904898, + "grad_norm": 0.23602961769282016, + "learning_rate": 9.987247687639548e-05, + "loss": 3.6439, + "step": 5822 + }, + { + "epoch": 0.3614749518902477, + "grad_norm": 0.29074860169688654, + "learning_rate": 9.987221897301363e-05, + "loss": 3.5414, + "step": 5823 + }, + { + "epoch": 0.36153702899000556, + "grad_norm": 0.29753771647023264, + "learning_rate": 9.987196080943628e-05, + "loss": 3.6218, + "step": 5824 + }, + { + "epoch": 0.3615991060897635, + "grad_norm": 0.2791602727646499, + "learning_rate": 9.987170238566483e-05, + "loss": 3.5856, + "step": 5825 + }, + { + "epoch": 0.3616611831895214, + "grad_norm": 0.5818344397908143, + "learning_rate": 9.987144370170059e-05, + "loss": 3.5383, + "step": 5826 + }, + { + "epoch": 0.36172326028927926, + "grad_norm": 0.5044860461396968, + "learning_rate": 9.987118475754492e-05, + "loss": 3.5353, + "step": 5827 + }, + { + "epoch": 0.3617853373890372, + "grad_norm": 0.33425861944770124, + "learning_rate": 9.987092555319918e-05, + "loss": 3.5316, + "step": 5828 + }, + { + "epoch": 0.3618474144887951, + "grad_norm": 0.3090176146094523, + "learning_rate": 9.987066608866471e-05, + "loss": 3.5988, + "step": 5829 + }, + { + "epoch": 0.36190949158855296, + "grad_norm": 0.4458933438558632, + "learning_rate": 9.987040636394287e-05, + "loss": 3.6583, + "step": 5830 + }, + { + "epoch": 0.3619715686883109, + "grad_norm": 0.3163618362714613, + "learning_rate": 9.987014637903501e-05, + "loss": 3.5401, + "step": 5831 + }, + { + "epoch": 0.3620336457880688, + "grad_norm": 0.2750263103990326, + "learning_rate": 9.98698861339425e-05, + "loss": 3.6004, + "step": 5832 + }, + { + "epoch": 0.36209572288782665, + "grad_norm": 0.3616879501577278, + "learning_rate": 9.986962562866669e-05, + "loss": 3.5707, + "step": 5833 + }, + { + "epoch": 0.36215779998758457, + "grad_norm": 0.2631555800665525, + "learning_rate": 9.986936486320892e-05, + "loss": 3.5996, + "step": 5834 + }, + { + "epoch": 0.3622198770873425, + "grad_norm": 0.36784006529185853, + "learning_rate": 9.986910383757058e-05, + "loss": 3.54, + "step": 5835 + }, + { + "epoch": 0.36228195418710035, + "grad_norm": 0.3445181500286418, + "learning_rate": 9.9868842551753e-05, + "loss": 3.5961, + "step": 5836 + }, + { + "epoch": 0.36234403128685827, + "grad_norm": 0.20999817628863315, + "learning_rate": 9.986858100575758e-05, + "loss": 3.64, + "step": 5837 + }, + { + "epoch": 0.3624061083866162, + "grad_norm": 0.3104451690726488, + "learning_rate": 9.986831919958565e-05, + "loss": 3.578, + "step": 5838 + }, + { + "epoch": 0.36246818548637405, + "grad_norm": 0.20253623401281734, + "learning_rate": 9.986805713323862e-05, + "loss": 3.6238, + "step": 5839 + }, + { + "epoch": 0.36253026258613197, + "grad_norm": 0.2269244344963978, + "learning_rate": 9.986779480671782e-05, + "loss": 3.6105, + "step": 5840 + }, + { + "epoch": 0.3625923396858899, + "grad_norm": 0.276488004770029, + "learning_rate": 9.986753222002463e-05, + "loss": 3.5926, + "step": 5841 + }, + { + "epoch": 0.36265441678564775, + "grad_norm": 0.2570962493244549, + "learning_rate": 9.986726937316041e-05, + "loss": 3.5936, + "step": 5842 + }, + { + "epoch": 0.36271649388540567, + "grad_norm": 0.28319473254166433, + "learning_rate": 9.986700626612654e-05, + "loss": 3.5587, + "step": 5843 + }, + { + "epoch": 0.3627785709851636, + "grad_norm": 0.2214259573511879, + "learning_rate": 9.98667428989244e-05, + "loss": 3.559, + "step": 5844 + }, + { + "epoch": 0.36284064808492145, + "grad_norm": 0.2081160554739443, + "learning_rate": 9.986647927155534e-05, + "loss": 3.7547, + "step": 5845 + }, + { + "epoch": 0.36290272518467936, + "grad_norm": 0.284461101660066, + "learning_rate": 9.986621538402078e-05, + "loss": 3.5417, + "step": 5846 + }, + { + "epoch": 0.3629648022844373, + "grad_norm": 0.284531680111505, + "learning_rate": 9.986595123632206e-05, + "loss": 3.6886, + "step": 5847 + }, + { + "epoch": 0.36302687938419514, + "grad_norm": 0.4306811028740173, + "learning_rate": 9.986568682846055e-05, + "loss": 3.5627, + "step": 5848 + }, + { + "epoch": 0.36308895648395306, + "grad_norm": 0.3644290961125975, + "learning_rate": 9.986542216043766e-05, + "loss": 3.5343, + "step": 5849 + }, + { + "epoch": 0.363151033583711, + "grad_norm": 0.35434129093782696, + "learning_rate": 9.986515723225476e-05, + "loss": 3.5713, + "step": 5850 + }, + { + "epoch": 0.36321311068346884, + "grad_norm": 0.26160906870489753, + "learning_rate": 9.986489204391323e-05, + "loss": 3.4801, + "step": 5851 + }, + { + "epoch": 0.36327518778322676, + "grad_norm": 0.2761230064007555, + "learning_rate": 9.986462659541445e-05, + "loss": 3.6082, + "step": 5852 + }, + { + "epoch": 0.3633372648829847, + "grad_norm": 0.31717607073557663, + "learning_rate": 9.986436088675978e-05, + "loss": 3.6273, + "step": 5853 + }, + { + "epoch": 0.36339934198274254, + "grad_norm": 0.3784789399280934, + "learning_rate": 9.986409491795067e-05, + "loss": 3.4834, + "step": 5854 + }, + { + "epoch": 0.36346141908250046, + "grad_norm": 0.48618841654059397, + "learning_rate": 9.986382868898847e-05, + "loss": 3.5448, + "step": 5855 + }, + { + "epoch": 0.3635234961822584, + "grad_norm": 0.4015738704570755, + "learning_rate": 9.986356219987456e-05, + "loss": 3.4641, + "step": 5856 + }, + { + "epoch": 0.36358557328201624, + "grad_norm": 0.3466749593922743, + "learning_rate": 9.986329545061034e-05, + "loss": 3.5957, + "step": 5857 + }, + { + "epoch": 0.36364765038177416, + "grad_norm": 0.3730494913538712, + "learning_rate": 9.986302844119722e-05, + "loss": 3.615, + "step": 5858 + }, + { + "epoch": 0.3637097274815321, + "grad_norm": 0.2893121002583473, + "learning_rate": 9.986276117163656e-05, + "loss": 3.6388, + "step": 5859 + }, + { + "epoch": 0.36377180458128994, + "grad_norm": 0.38206217665980774, + "learning_rate": 9.986249364192977e-05, + "loss": 3.7157, + "step": 5860 + }, + { + "epoch": 0.36383388168104785, + "grad_norm": 0.2288631584176136, + "learning_rate": 9.986222585207825e-05, + "loss": 3.5551, + "step": 5861 + }, + { + "epoch": 0.3638959587808058, + "grad_norm": 0.3343232092455188, + "learning_rate": 9.986195780208338e-05, + "loss": 3.6564, + "step": 5862 + }, + { + "epoch": 0.36395803588056364, + "grad_norm": 0.4220531758470811, + "learning_rate": 9.986168949194658e-05, + "loss": 3.6634, + "step": 5863 + }, + { + "epoch": 0.36402011298032155, + "grad_norm": 0.2728051639719758, + "learning_rate": 9.986142092166926e-05, + "loss": 3.556, + "step": 5864 + }, + { + "epoch": 0.36408219008007947, + "grad_norm": 0.4322443059079586, + "learning_rate": 9.986115209125278e-05, + "loss": 3.5192, + "step": 5865 + }, + { + "epoch": 0.36414426717983733, + "grad_norm": 0.3692693738096992, + "learning_rate": 9.986088300069858e-05, + "loss": 3.6218, + "step": 5866 + }, + { + "epoch": 0.36420634427959525, + "grad_norm": 0.2860145324377737, + "learning_rate": 9.986061365000804e-05, + "loss": 3.6499, + "step": 5867 + }, + { + "epoch": 0.36426842137935317, + "grad_norm": 0.24913015883703393, + "learning_rate": 9.986034403918258e-05, + "loss": 3.6598, + "step": 5868 + }, + { + "epoch": 0.36433049847911103, + "grad_norm": 0.4238165752753321, + "learning_rate": 9.98600741682236e-05, + "loss": 3.5133, + "step": 5869 + }, + { + "epoch": 0.36439257557886895, + "grad_norm": 0.4115434892473799, + "learning_rate": 9.985980403713251e-05, + "loss": 3.5924, + "step": 5870 + }, + { + "epoch": 0.36445465267862687, + "grad_norm": 0.5148069655109442, + "learning_rate": 9.985953364591073e-05, + "loss": 3.6012, + "step": 5871 + }, + { + "epoch": 0.36451672977838473, + "grad_norm": 0.3219882812040916, + "learning_rate": 9.985926299455964e-05, + "loss": 3.613, + "step": 5872 + }, + { + "epoch": 0.36457880687814265, + "grad_norm": 0.38705098686669015, + "learning_rate": 9.985899208308068e-05, + "loss": 3.6457, + "step": 5873 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 0.3610931663557987, + "learning_rate": 9.985872091147523e-05, + "loss": 3.536, + "step": 5874 + }, + { + "epoch": 0.3647029610776584, + "grad_norm": 0.24107247212350622, + "learning_rate": 9.985844947974476e-05, + "loss": 3.5231, + "step": 5875 + }, + { + "epoch": 0.36476503817741635, + "grad_norm": 0.2922344924754378, + "learning_rate": 9.985817778789064e-05, + "loss": 3.6129, + "step": 5876 + }, + { + "epoch": 0.36482711527717426, + "grad_norm": 0.33700037417334433, + "learning_rate": 9.985790583591431e-05, + "loss": 3.5479, + "step": 5877 + }, + { + "epoch": 0.3648891923769321, + "grad_norm": 0.24132009499658097, + "learning_rate": 9.985763362381718e-05, + "loss": 3.567, + "step": 5878 + }, + { + "epoch": 0.36495126947669004, + "grad_norm": 0.2710310629887234, + "learning_rate": 9.985736115160067e-05, + "loss": 3.5927, + "step": 5879 + }, + { + "epoch": 0.36501334657644796, + "grad_norm": 0.38147249998991806, + "learning_rate": 9.985708841926622e-05, + "loss": 3.6211, + "step": 5880 + }, + { + "epoch": 0.3650754236762058, + "grad_norm": 0.2788361303623739, + "learning_rate": 9.985681542681522e-05, + "loss": 3.6011, + "step": 5881 + }, + { + "epoch": 0.36513750077596374, + "grad_norm": 0.22114097576104105, + "learning_rate": 9.985654217424911e-05, + "loss": 3.5359, + "step": 5882 + }, + { + "epoch": 0.36519957787572166, + "grad_norm": 0.2598655479816834, + "learning_rate": 9.985626866156932e-05, + "loss": 3.5571, + "step": 5883 + }, + { + "epoch": 0.3652616549754795, + "grad_norm": 0.3364082741576805, + "learning_rate": 9.985599488877728e-05, + "loss": 3.6308, + "step": 5884 + }, + { + "epoch": 0.36532373207523744, + "grad_norm": 0.28012900003473656, + "learning_rate": 9.985572085587441e-05, + "loss": 3.5905, + "step": 5885 + }, + { + "epoch": 0.36538580917499536, + "grad_norm": 0.4066797331107472, + "learning_rate": 9.985544656286213e-05, + "loss": 3.6692, + "step": 5886 + }, + { + "epoch": 0.3654478862747532, + "grad_norm": 0.4214633947034894, + "learning_rate": 9.985517200974188e-05, + "loss": 3.6813, + "step": 5887 + }, + { + "epoch": 0.36550996337451114, + "grad_norm": 0.4445132059145911, + "learning_rate": 9.98548971965151e-05, + "loss": 3.5459, + "step": 5888 + }, + { + "epoch": 0.36557204047426906, + "grad_norm": 0.2751586153187593, + "learning_rate": 9.985462212318323e-05, + "loss": 3.6206, + "step": 5889 + }, + { + "epoch": 0.3656341175740269, + "grad_norm": 0.30558556471953924, + "learning_rate": 9.985434678974769e-05, + "loss": 3.6095, + "step": 5890 + }, + { + "epoch": 0.36569619467378484, + "grad_norm": 0.372812646960144, + "learning_rate": 9.985407119620991e-05, + "loss": 3.5617, + "step": 5891 + }, + { + "epoch": 0.36575827177354275, + "grad_norm": 0.39998790677982565, + "learning_rate": 9.985379534257135e-05, + "loss": 3.5784, + "step": 5892 + }, + { + "epoch": 0.3658203488733006, + "grad_norm": 0.264038156482054, + "learning_rate": 9.985351922883344e-05, + "loss": 3.5762, + "step": 5893 + }, + { + "epoch": 0.36588242597305853, + "grad_norm": 0.5351401556337577, + "learning_rate": 9.98532428549976e-05, + "loss": 3.6533, + "step": 5894 + }, + { + "epoch": 0.36594450307281645, + "grad_norm": 0.4883354767252175, + "learning_rate": 9.985296622106531e-05, + "loss": 3.5246, + "step": 5895 + }, + { + "epoch": 0.3660065801725743, + "grad_norm": 0.4405295180628805, + "learning_rate": 9.985268932703797e-05, + "loss": 3.5212, + "step": 5896 + }, + { + "epoch": 0.36606865727233223, + "grad_norm": 0.45852511540579866, + "learning_rate": 9.985241217291706e-05, + "loss": 3.6277, + "step": 5897 + }, + { + "epoch": 0.36613073437209015, + "grad_norm": 0.24180133858598915, + "learning_rate": 9.985213475870401e-05, + "loss": 3.6334, + "step": 5898 + }, + { + "epoch": 0.366192811471848, + "grad_norm": 0.42415824023022736, + "learning_rate": 9.985185708440028e-05, + "loss": 3.5807, + "step": 5899 + }, + { + "epoch": 0.36625488857160593, + "grad_norm": 0.3155274018382353, + "learning_rate": 9.98515791500073e-05, + "loss": 3.6076, + "step": 5900 + }, + { + "epoch": 0.36631696567136385, + "grad_norm": 0.4592709397508869, + "learning_rate": 9.985130095552654e-05, + "loss": 3.5997, + "step": 5901 + }, + { + "epoch": 0.3663790427711217, + "grad_norm": 0.30082606935360967, + "learning_rate": 9.985102250095943e-05, + "loss": 3.5802, + "step": 5902 + }, + { + "epoch": 0.36644111987087963, + "grad_norm": 0.5345929376220596, + "learning_rate": 9.985074378630745e-05, + "loss": 3.5657, + "step": 5903 + }, + { + "epoch": 0.36650319697063755, + "grad_norm": 0.37729427732045046, + "learning_rate": 9.985046481157202e-05, + "loss": 3.6103, + "step": 5904 + }, + { + "epoch": 0.3665652740703954, + "grad_norm": 0.39709050492611636, + "learning_rate": 9.985018557675462e-05, + "loss": 3.4983, + "step": 5905 + }, + { + "epoch": 0.3666273511701533, + "grad_norm": 0.4517369385484563, + "learning_rate": 9.984990608185669e-05, + "loss": 3.5545, + "step": 5906 + }, + { + "epoch": 0.36668942826991124, + "grad_norm": 0.44872115398448176, + "learning_rate": 9.984962632687972e-05, + "loss": 3.556, + "step": 5907 + }, + { + "epoch": 0.3667515053696691, + "grad_norm": 0.3807450929990528, + "learning_rate": 9.984934631182515e-05, + "loss": 3.5385, + "step": 5908 + }, + { + "epoch": 0.366813582469427, + "grad_norm": 0.354533715876296, + "learning_rate": 9.984906603669442e-05, + "loss": 3.5058, + "step": 5909 + }, + { + "epoch": 0.36687565956918494, + "grad_norm": 0.33866733517183445, + "learning_rate": 9.9848785501489e-05, + "loss": 3.5584, + "step": 5910 + }, + { + "epoch": 0.3669377366689428, + "grad_norm": 0.26664910923011304, + "learning_rate": 9.984850470621039e-05, + "loss": 3.4824, + "step": 5911 + }, + { + "epoch": 0.3669998137687007, + "grad_norm": 0.3271750823600236, + "learning_rate": 9.984822365086002e-05, + "loss": 3.7022, + "step": 5912 + }, + { + "epoch": 0.36706189086845864, + "grad_norm": 0.38105360116026554, + "learning_rate": 9.984794233543937e-05, + "loss": 3.5289, + "step": 5913 + }, + { + "epoch": 0.3671239679682165, + "grad_norm": 0.35518347642946485, + "learning_rate": 9.98476607599499e-05, + "loss": 3.6331, + "step": 5914 + }, + { + "epoch": 0.3671860450679744, + "grad_norm": 0.37321306690934875, + "learning_rate": 9.984737892439309e-05, + "loss": 3.6051, + "step": 5915 + }, + { + "epoch": 0.36724812216773234, + "grad_norm": 0.29850854177889324, + "learning_rate": 9.984709682877039e-05, + "loss": 3.594, + "step": 5916 + }, + { + "epoch": 0.3673101992674902, + "grad_norm": 0.3159753196108035, + "learning_rate": 9.98468144730833e-05, + "loss": 3.6054, + "step": 5917 + }, + { + "epoch": 0.3673722763672481, + "grad_norm": 0.27321750782999804, + "learning_rate": 9.984653185733327e-05, + "loss": 3.4863, + "step": 5918 + }, + { + "epoch": 0.36743435346700604, + "grad_norm": 0.3921532110597603, + "learning_rate": 9.984624898152178e-05, + "loss": 3.5201, + "step": 5919 + }, + { + "epoch": 0.3674964305667639, + "grad_norm": 0.32393951340906535, + "learning_rate": 9.984596584565032e-05, + "loss": 3.514, + "step": 5920 + }, + { + "epoch": 0.3675585076665218, + "grad_norm": 0.232497499488387, + "learning_rate": 9.984568244972034e-05, + "loss": 3.6536, + "step": 5921 + }, + { + "epoch": 0.36762058476627973, + "grad_norm": 0.33089631154810917, + "learning_rate": 9.984539879373335e-05, + "loss": 3.5594, + "step": 5922 + }, + { + "epoch": 0.3676826618660376, + "grad_norm": 0.25919106407465464, + "learning_rate": 9.984511487769079e-05, + "loss": 3.4896, + "step": 5923 + }, + { + "epoch": 0.3677447389657955, + "grad_norm": 0.46667989757491923, + "learning_rate": 9.98448307015942e-05, + "loss": 3.5985, + "step": 5924 + }, + { + "epoch": 0.36780681606555343, + "grad_norm": 0.38499660626686777, + "learning_rate": 9.9844546265445e-05, + "loss": 3.5908, + "step": 5925 + }, + { + "epoch": 0.3678688931653113, + "grad_norm": 0.3718447063833521, + "learning_rate": 9.984426156924471e-05, + "loss": 3.6495, + "step": 5926 + }, + { + "epoch": 0.3679309702650692, + "grad_norm": 0.3895169181956816, + "learning_rate": 9.98439766129948e-05, + "loss": 3.5311, + "step": 5927 + }, + { + "epoch": 0.36799304736482713, + "grad_norm": 0.2688146269518342, + "learning_rate": 9.984369139669678e-05, + "loss": 3.6126, + "step": 5928 + }, + { + "epoch": 0.368055124464585, + "grad_norm": 0.33571179148615615, + "learning_rate": 9.984340592035211e-05, + "loss": 3.5813, + "step": 5929 + }, + { + "epoch": 0.3681172015643429, + "grad_norm": 0.33288911063067594, + "learning_rate": 9.984312018396231e-05, + "loss": 3.6917, + "step": 5930 + }, + { + "epoch": 0.36817927866410083, + "grad_norm": 0.36166578444726344, + "learning_rate": 9.984283418752884e-05, + "loss": 3.6169, + "step": 5931 + }, + { + "epoch": 0.3682413557638587, + "grad_norm": 0.27020038253088613, + "learning_rate": 9.98425479310532e-05, + "loss": 3.5977, + "step": 5932 + }, + { + "epoch": 0.3683034328636166, + "grad_norm": 0.26000169911654686, + "learning_rate": 9.984226141453689e-05, + "loss": 3.6298, + "step": 5933 + }, + { + "epoch": 0.3683655099633745, + "grad_norm": 0.3090887325096757, + "learning_rate": 9.98419746379814e-05, + "loss": 3.5079, + "step": 5934 + }, + { + "epoch": 0.3684275870631324, + "grad_norm": 0.2707998716639675, + "learning_rate": 9.984168760138823e-05, + "loss": 3.7436, + "step": 5935 + }, + { + "epoch": 0.3684896641628903, + "grad_norm": 0.34394155930264403, + "learning_rate": 9.984140030475889e-05, + "loss": 3.4581, + "step": 5936 + }, + { + "epoch": 0.3685517412626482, + "grad_norm": 0.24336962456343378, + "learning_rate": 9.984111274809485e-05, + "loss": 3.5779, + "step": 5937 + }, + { + "epoch": 0.3686138183624061, + "grad_norm": 0.3053868410779224, + "learning_rate": 9.984082493139761e-05, + "loss": 3.51, + "step": 5938 + }, + { + "epoch": 0.368675895462164, + "grad_norm": 0.26659697923458475, + "learning_rate": 9.984053685466872e-05, + "loss": 3.65, + "step": 5939 + }, + { + "epoch": 0.3687379725619219, + "grad_norm": 0.2792592990380519, + "learning_rate": 9.984024851790963e-05, + "loss": 3.6018, + "step": 5940 + }, + { + "epoch": 0.3688000496616798, + "grad_norm": 0.3164262647538558, + "learning_rate": 9.983995992112187e-05, + "loss": 3.5515, + "step": 5941 + }, + { + "epoch": 0.3688621267614377, + "grad_norm": 0.25603189548805433, + "learning_rate": 9.983967106430694e-05, + "loss": 3.5718, + "step": 5942 + }, + { + "epoch": 0.3689242038611956, + "grad_norm": 0.2795934103292916, + "learning_rate": 9.983938194746635e-05, + "loss": 3.6179, + "step": 5943 + }, + { + "epoch": 0.3689862809609535, + "grad_norm": 0.24488635801911926, + "learning_rate": 9.98390925706016e-05, + "loss": 3.5952, + "step": 5944 + }, + { + "epoch": 0.3690483580607114, + "grad_norm": 0.22005251972824855, + "learning_rate": 9.983880293371422e-05, + "loss": 3.5512, + "step": 5945 + }, + { + "epoch": 0.3691104351604693, + "grad_norm": 0.21694817289772997, + "learning_rate": 9.98385130368057e-05, + "loss": 3.519, + "step": 5946 + }, + { + "epoch": 0.3691725122602272, + "grad_norm": 0.48017221309444774, + "learning_rate": 9.983822287987755e-05, + "loss": 3.6255, + "step": 5947 + }, + { + "epoch": 0.3692345893599851, + "grad_norm": 0.3743913200846076, + "learning_rate": 9.983793246293128e-05, + "loss": 3.6245, + "step": 5948 + }, + { + "epoch": 0.369296666459743, + "grad_norm": 0.3067544373608698, + "learning_rate": 9.983764178596844e-05, + "loss": 3.5596, + "step": 5949 + }, + { + "epoch": 0.3693587435595009, + "grad_norm": 0.29712594870021475, + "learning_rate": 9.983735084899051e-05, + "loss": 3.5873, + "step": 5950 + }, + { + "epoch": 0.3694208206592588, + "grad_norm": 0.31718798012532695, + "learning_rate": 9.983705965199904e-05, + "loss": 3.457, + "step": 5951 + }, + { + "epoch": 0.3694828977590167, + "grad_norm": 0.2818066333913916, + "learning_rate": 9.983676819499551e-05, + "loss": 3.5616, + "step": 5952 + }, + { + "epoch": 0.3695449748587746, + "grad_norm": 0.296311533911256, + "learning_rate": 9.983647647798147e-05, + "loss": 3.5166, + "step": 5953 + }, + { + "epoch": 0.3696070519585325, + "grad_norm": 0.2961023087887546, + "learning_rate": 9.983618450095844e-05, + "loss": 3.6144, + "step": 5954 + }, + { + "epoch": 0.3696691290582904, + "grad_norm": 0.26085113363851403, + "learning_rate": 9.983589226392793e-05, + "loss": 3.6201, + "step": 5955 + }, + { + "epoch": 0.3697312061580483, + "grad_norm": 0.28562684615725953, + "learning_rate": 9.983559976689149e-05, + "loss": 3.5366, + "step": 5956 + }, + { + "epoch": 0.3697932832578062, + "grad_norm": 0.2589314482909615, + "learning_rate": 9.983530700985062e-05, + "loss": 3.4989, + "step": 5957 + }, + { + "epoch": 0.3698553603575641, + "grad_norm": 0.34008254507756003, + "learning_rate": 9.983501399280684e-05, + "loss": 3.5429, + "step": 5958 + }, + { + "epoch": 0.369917437457322, + "grad_norm": 0.23204368789166094, + "learning_rate": 9.98347207157617e-05, + "loss": 3.5184, + "step": 5959 + }, + { + "epoch": 0.3699795145570799, + "grad_norm": 0.305313164877192, + "learning_rate": 9.983442717871672e-05, + "loss": 3.6137, + "step": 5960 + }, + { + "epoch": 0.3700415916568378, + "grad_norm": 0.28621199882417797, + "learning_rate": 9.983413338167346e-05, + "loss": 3.602, + "step": 5961 + }, + { + "epoch": 0.37010366875659567, + "grad_norm": 0.2689426463541425, + "learning_rate": 9.98338393246334e-05, + "loss": 3.4784, + "step": 5962 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 0.43622194669967657, + "learning_rate": 9.983354500759811e-05, + "loss": 3.5636, + "step": 5963 + }, + { + "epoch": 0.3702278229561115, + "grad_norm": 0.26538384935086134, + "learning_rate": 9.983325043056914e-05, + "loss": 3.6043, + "step": 5964 + }, + { + "epoch": 0.37028990005586937, + "grad_norm": 0.22366071302121882, + "learning_rate": 9.983295559354798e-05, + "loss": 3.5572, + "step": 5965 + }, + { + "epoch": 0.3703519771556273, + "grad_norm": 0.3347932267800525, + "learning_rate": 9.98326604965362e-05, + "loss": 3.5468, + "step": 5966 + }, + { + "epoch": 0.3704140542553852, + "grad_norm": 0.28663559833068963, + "learning_rate": 9.983236513953534e-05, + "loss": 3.4923, + "step": 5967 + }, + { + "epoch": 0.37047613135514307, + "grad_norm": 0.28791905111401084, + "learning_rate": 9.983206952254693e-05, + "loss": 3.5836, + "step": 5968 + }, + { + "epoch": 0.370538208454901, + "grad_norm": 0.35537083293740646, + "learning_rate": 9.983177364557252e-05, + "loss": 3.5939, + "step": 5969 + }, + { + "epoch": 0.3706002855546589, + "grad_norm": 0.4472141152918021, + "learning_rate": 9.983147750861364e-05, + "loss": 3.5676, + "step": 5970 + }, + { + "epoch": 0.37066236265441677, + "grad_norm": 0.5463520546997483, + "learning_rate": 9.983118111167184e-05, + "loss": 3.5408, + "step": 5971 + }, + { + "epoch": 0.3707244397541747, + "grad_norm": 0.3890104879519745, + "learning_rate": 9.983088445474868e-05, + "loss": 3.576, + "step": 5972 + }, + { + "epoch": 0.3707865168539326, + "grad_norm": 0.2722161263085561, + "learning_rate": 9.983058753784571e-05, + "loss": 3.5644, + "step": 5973 + }, + { + "epoch": 0.37084859395369046, + "grad_norm": 0.3261503909225203, + "learning_rate": 9.983029036096445e-05, + "loss": 3.6045, + "step": 5974 + }, + { + "epoch": 0.3709106710534484, + "grad_norm": 0.26478293122849933, + "learning_rate": 9.982999292410647e-05, + "loss": 3.5641, + "step": 5975 + }, + { + "epoch": 0.3709727481532063, + "grad_norm": 0.27223728256805424, + "learning_rate": 9.982969522727333e-05, + "loss": 3.6183, + "step": 5976 + }, + { + "epoch": 0.37103482525296416, + "grad_norm": 0.338721901037906, + "learning_rate": 9.982939727046656e-05, + "loss": 3.6481, + "step": 5977 + }, + { + "epoch": 0.3710969023527221, + "grad_norm": 0.42884360587263504, + "learning_rate": 9.982909905368775e-05, + "loss": 3.5406, + "step": 5978 + }, + { + "epoch": 0.37115897945248, + "grad_norm": 0.2334763384941294, + "learning_rate": 9.982880057693842e-05, + "loss": 3.5986, + "step": 5979 + }, + { + "epoch": 0.37122105655223786, + "grad_norm": 0.423883513597707, + "learning_rate": 9.982850184022014e-05, + "loss": 3.5853, + "step": 5980 + }, + { + "epoch": 0.3712831336519958, + "grad_norm": 0.27827067685482826, + "learning_rate": 9.982820284353447e-05, + "loss": 3.5574, + "step": 5981 + }, + { + "epoch": 0.3713452107517537, + "grad_norm": 0.35782390143207776, + "learning_rate": 9.982790358688296e-05, + "loss": 3.4743, + "step": 5982 + }, + { + "epoch": 0.37140728785151156, + "grad_norm": 0.3584494073780688, + "learning_rate": 9.982760407026721e-05, + "loss": 3.5482, + "step": 5983 + }, + { + "epoch": 0.3714693649512695, + "grad_norm": 0.582052601672249, + "learning_rate": 9.982730429368872e-05, + "loss": 3.4636, + "step": 5984 + }, + { + "epoch": 0.3715314420510274, + "grad_norm": 0.591489781435734, + "learning_rate": 9.982700425714911e-05, + "loss": 3.5089, + "step": 5985 + }, + { + "epoch": 0.37159351915078526, + "grad_norm": 0.4174178584036046, + "learning_rate": 9.982670396064992e-05, + "loss": 3.4937, + "step": 5986 + }, + { + "epoch": 0.3716555962505432, + "grad_norm": 0.49607925694238664, + "learning_rate": 9.982640340419272e-05, + "loss": 3.6541, + "step": 5987 + }, + { + "epoch": 0.3717176733503011, + "grad_norm": 0.37090413117821996, + "learning_rate": 9.982610258777907e-05, + "loss": 3.6053, + "step": 5988 + }, + { + "epoch": 0.37177975045005895, + "grad_norm": 0.3745173487432818, + "learning_rate": 9.982580151141055e-05, + "loss": 3.6038, + "step": 5989 + }, + { + "epoch": 0.3718418275498169, + "grad_norm": 0.4252922154451656, + "learning_rate": 9.982550017508873e-05, + "loss": 3.5862, + "step": 5990 + }, + { + "epoch": 0.3719039046495748, + "grad_norm": 0.4195090282671284, + "learning_rate": 9.982519857881518e-05, + "loss": 3.5951, + "step": 5991 + }, + { + "epoch": 0.37196598174933265, + "grad_norm": 0.41787818569255053, + "learning_rate": 9.982489672259147e-05, + "loss": 3.5988, + "step": 5992 + }, + { + "epoch": 0.37202805884909057, + "grad_norm": 0.44697131339453416, + "learning_rate": 9.982459460641918e-05, + "loss": 3.6205, + "step": 5993 + }, + { + "epoch": 0.3720901359488485, + "grad_norm": 0.5042883732653056, + "learning_rate": 9.982429223029988e-05, + "loss": 3.5534, + "step": 5994 + }, + { + "epoch": 0.37215221304860635, + "grad_norm": 0.41403757776075895, + "learning_rate": 9.982398959423515e-05, + "loss": 3.5071, + "step": 5995 + }, + { + "epoch": 0.37221429014836427, + "grad_norm": 0.3852434048109786, + "learning_rate": 9.98236866982266e-05, + "loss": 3.4606, + "step": 5996 + }, + { + "epoch": 0.3722763672481222, + "grad_norm": 0.28846392645825564, + "learning_rate": 9.982338354227573e-05, + "loss": 3.5763, + "step": 5997 + }, + { + "epoch": 0.37233844434788005, + "grad_norm": 0.3076490718726807, + "learning_rate": 9.982308012638422e-05, + "loss": 3.5962, + "step": 5998 + }, + { + "epoch": 0.37240052144763797, + "grad_norm": 0.29787522519031484, + "learning_rate": 9.982277645055357e-05, + "loss": 3.5077, + "step": 5999 + }, + { + "epoch": 0.3724625985473959, + "grad_norm": 0.4675845439728424, + "learning_rate": 9.982247251478543e-05, + "loss": 3.5281, + "step": 6000 + }, + { + "epoch": 0.37252467564715375, + "grad_norm": 0.34204792067469136, + "learning_rate": 9.982216831908135e-05, + "loss": 3.4086, + "step": 6001 + }, + { + "epoch": 0.37258675274691166, + "grad_norm": 0.28482918729818907, + "learning_rate": 9.982186386344292e-05, + "loss": 3.5247, + "step": 6002 + }, + { + "epoch": 0.3726488298466696, + "grad_norm": 0.33495557686211347, + "learning_rate": 9.982155914787173e-05, + "loss": 3.4967, + "step": 6003 + }, + { + "epoch": 0.37271090694642744, + "grad_norm": 0.42678991760024115, + "learning_rate": 9.982125417236935e-05, + "loss": 3.5686, + "step": 6004 + }, + { + "epoch": 0.37277298404618536, + "grad_norm": 0.3889671047871905, + "learning_rate": 9.982094893693743e-05, + "loss": 3.5024, + "step": 6005 + }, + { + "epoch": 0.3728350611459433, + "grad_norm": 0.38212572501515896, + "learning_rate": 9.982064344157752e-05, + "loss": 3.7241, + "step": 6006 + }, + { + "epoch": 0.37289713824570114, + "grad_norm": 0.34088081762807104, + "learning_rate": 9.98203376862912e-05, + "loss": 3.5956, + "step": 6007 + }, + { + "epoch": 0.37295921534545906, + "grad_norm": 0.29434975969428856, + "learning_rate": 9.982003167108008e-05, + "loss": 3.541, + "step": 6008 + }, + { + "epoch": 0.373021292445217, + "grad_norm": 0.4270826277688254, + "learning_rate": 9.981972539594577e-05, + "loss": 3.4825, + "step": 6009 + }, + { + "epoch": 0.37308336954497484, + "grad_norm": 0.2969790953791406, + "learning_rate": 9.981941886088986e-05, + "loss": 3.5573, + "step": 6010 + }, + { + "epoch": 0.37314544664473276, + "grad_norm": 0.5072075982380575, + "learning_rate": 9.981911206591395e-05, + "loss": 3.4837, + "step": 6011 + }, + { + "epoch": 0.3732075237444907, + "grad_norm": 0.3691429461047954, + "learning_rate": 9.981880501101964e-05, + "loss": 3.6687, + "step": 6012 + }, + { + "epoch": 0.37326960084424854, + "grad_norm": 0.27595356968953544, + "learning_rate": 9.981849769620852e-05, + "loss": 3.658, + "step": 6013 + }, + { + "epoch": 0.37333167794400646, + "grad_norm": 0.3894317330288649, + "learning_rate": 9.981819012148221e-05, + "loss": 3.5531, + "step": 6014 + }, + { + "epoch": 0.3733937550437644, + "grad_norm": 0.2606729565553225, + "learning_rate": 9.981788228684231e-05, + "loss": 3.6117, + "step": 6015 + }, + { + "epoch": 0.37345583214352224, + "grad_norm": 0.2777846820622283, + "learning_rate": 9.981757419229043e-05, + "loss": 3.6653, + "step": 6016 + }, + { + "epoch": 0.37351790924328016, + "grad_norm": 0.2692253062622768, + "learning_rate": 9.981726583782816e-05, + "loss": 3.4292, + "step": 6017 + }, + { + "epoch": 0.3735799863430381, + "grad_norm": 0.2679333938762731, + "learning_rate": 9.981695722345712e-05, + "loss": 3.5114, + "step": 6018 + }, + { + "epoch": 0.37364206344279594, + "grad_norm": 0.4159366908878794, + "learning_rate": 9.981664834917893e-05, + "loss": 3.5553, + "step": 6019 + }, + { + "epoch": 0.37370414054255385, + "grad_norm": 0.3117716885312926, + "learning_rate": 9.981633921499519e-05, + "loss": 3.5499, + "step": 6020 + }, + { + "epoch": 0.37376621764231177, + "grad_norm": 0.28820478929518406, + "learning_rate": 9.981602982090751e-05, + "loss": 3.4694, + "step": 6021 + }, + { + "epoch": 0.37382829474206963, + "grad_norm": 0.24552278283361748, + "learning_rate": 9.981572016691752e-05, + "loss": 3.5297, + "step": 6022 + }, + { + "epoch": 0.37389037184182755, + "grad_norm": 0.29369316819495755, + "learning_rate": 9.981541025302681e-05, + "loss": 3.5435, + "step": 6023 + }, + { + "epoch": 0.37395244894158547, + "grad_norm": 0.3137117377161615, + "learning_rate": 9.981510007923702e-05, + "loss": 3.6461, + "step": 6024 + }, + { + "epoch": 0.37401452604134333, + "grad_norm": 0.3364004892439806, + "learning_rate": 9.981478964554976e-05, + "loss": 3.4799, + "step": 6025 + }, + { + "epoch": 0.37407660314110125, + "grad_norm": 0.2588092724088281, + "learning_rate": 9.981447895196665e-05, + "loss": 3.5877, + "step": 6026 + }, + { + "epoch": 0.37413868024085917, + "grad_norm": 0.23585212470198041, + "learning_rate": 9.981416799848931e-05, + "loss": 3.5012, + "step": 6027 + }, + { + "epoch": 0.37420075734061703, + "grad_norm": 0.23408542864533838, + "learning_rate": 9.981385678511935e-05, + "loss": 3.5025, + "step": 6028 + }, + { + "epoch": 0.37426283444037495, + "grad_norm": 0.32645297928959643, + "learning_rate": 9.981354531185843e-05, + "loss": 3.4906, + "step": 6029 + }, + { + "epoch": 0.37432491154013287, + "grad_norm": 0.24782618519732336, + "learning_rate": 9.981323357870813e-05, + "loss": 3.6709, + "step": 6030 + }, + { + "epoch": 0.3743869886398907, + "grad_norm": 0.3927035914100218, + "learning_rate": 9.981292158567011e-05, + "loss": 3.5865, + "step": 6031 + }, + { + "epoch": 0.37444906573964865, + "grad_norm": 0.34038054067169665, + "learning_rate": 9.981260933274598e-05, + "loss": 3.5058, + "step": 6032 + }, + { + "epoch": 0.37451114283940656, + "grad_norm": 0.20904307565986632, + "learning_rate": 9.981229681993738e-05, + "loss": 3.528, + "step": 6033 + }, + { + "epoch": 0.3745732199391644, + "grad_norm": 0.3507070411910463, + "learning_rate": 9.981198404724592e-05, + "loss": 3.5181, + "step": 6034 + }, + { + "epoch": 0.37463529703892234, + "grad_norm": 0.3687668376246602, + "learning_rate": 9.981167101467325e-05, + "loss": 3.5115, + "step": 6035 + }, + { + "epoch": 0.37469737413868026, + "grad_norm": 0.29221646680399904, + "learning_rate": 9.981135772222102e-05, + "loss": 3.4188, + "step": 6036 + }, + { + "epoch": 0.3747594512384381, + "grad_norm": 0.25707245586286004, + "learning_rate": 9.981104416989083e-05, + "loss": 3.5023, + "step": 6037 + }, + { + "epoch": 0.37482152833819604, + "grad_norm": 0.25835697882377123, + "learning_rate": 9.981073035768433e-05, + "loss": 3.6131, + "step": 6038 + }, + { + "epoch": 0.37488360543795396, + "grad_norm": 0.22274929745934754, + "learning_rate": 9.981041628560316e-05, + "loss": 3.5226, + "step": 6039 + }, + { + "epoch": 0.3749456825377118, + "grad_norm": 0.3168575694389523, + "learning_rate": 9.981010195364897e-05, + "loss": 3.5776, + "step": 6040 + }, + { + "epoch": 0.37500775963746974, + "grad_norm": 0.32308088121319156, + "learning_rate": 9.980978736182338e-05, + "loss": 3.6008, + "step": 6041 + }, + { + "epoch": 0.37506983673722766, + "grad_norm": 0.3302674399563957, + "learning_rate": 9.980947251012802e-05, + "loss": 3.5302, + "step": 6042 + }, + { + "epoch": 0.3751319138369855, + "grad_norm": 0.27907813783426366, + "learning_rate": 9.980915739856457e-05, + "loss": 3.5729, + "step": 6043 + }, + { + "epoch": 0.37519399093674344, + "grad_norm": 0.27022821733131863, + "learning_rate": 9.980884202713464e-05, + "loss": 3.6012, + "step": 6044 + }, + { + "epoch": 0.37525606803650136, + "grad_norm": 0.34368707340988874, + "learning_rate": 9.980852639583991e-05, + "loss": 3.5235, + "step": 6045 + }, + { + "epoch": 0.3753181451362592, + "grad_norm": 0.3690605480391187, + "learning_rate": 9.980821050468198e-05, + "loss": 3.5413, + "step": 6046 + }, + { + "epoch": 0.37538022223601714, + "grad_norm": 0.357546737623528, + "learning_rate": 9.980789435366253e-05, + "loss": 3.586, + "step": 6047 + }, + { + "epoch": 0.37544229933577505, + "grad_norm": 0.3023504961232095, + "learning_rate": 9.980757794278322e-05, + "loss": 3.6012, + "step": 6048 + }, + { + "epoch": 0.3755043764355329, + "grad_norm": 0.23871729772849834, + "learning_rate": 9.980726127204567e-05, + "loss": 3.5684, + "step": 6049 + }, + { + "epoch": 0.37556645353529083, + "grad_norm": 0.27976921795090753, + "learning_rate": 9.980694434145155e-05, + "loss": 3.6157, + "step": 6050 + }, + { + "epoch": 0.37562853063504875, + "grad_norm": 0.258176344551803, + "learning_rate": 9.980662715100252e-05, + "loss": 3.5524, + "step": 6051 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 0.2580404141177685, + "learning_rate": 9.980630970070022e-05, + "loss": 3.5301, + "step": 6052 + }, + { + "epoch": 0.37575268483456453, + "grad_norm": 0.32261897280478863, + "learning_rate": 9.98059919905463e-05, + "loss": 3.5136, + "step": 6053 + }, + { + "epoch": 0.37581476193432245, + "grad_norm": 0.23841624266433303, + "learning_rate": 9.980567402054243e-05, + "loss": 3.6217, + "step": 6054 + }, + { + "epoch": 0.3758768390340803, + "grad_norm": 0.28171176884824545, + "learning_rate": 9.98053557906903e-05, + "loss": 3.5931, + "step": 6055 + }, + { + "epoch": 0.37593891613383823, + "grad_norm": 0.5186782660925867, + "learning_rate": 9.980503730099152e-05, + "loss": 3.4814, + "step": 6056 + }, + { + "epoch": 0.37600099323359615, + "grad_norm": 0.39259055578184454, + "learning_rate": 9.980471855144775e-05, + "loss": 3.6288, + "step": 6057 + }, + { + "epoch": 0.376063070333354, + "grad_norm": 0.30547530467227907, + "learning_rate": 9.980439954206069e-05, + "loss": 3.5822, + "step": 6058 + }, + { + "epoch": 0.37612514743311193, + "grad_norm": 0.49192498790428635, + "learning_rate": 9.980408027283198e-05, + "loss": 3.5425, + "step": 6059 + }, + { + "epoch": 0.37618722453286985, + "grad_norm": 0.2617377089280748, + "learning_rate": 9.980376074376329e-05, + "loss": 3.5475, + "step": 6060 + }, + { + "epoch": 0.3762493016326277, + "grad_norm": 0.38345079341437754, + "learning_rate": 9.98034409548563e-05, + "loss": 3.5227, + "step": 6061 + }, + { + "epoch": 0.3763113787323856, + "grad_norm": 0.2541590679054231, + "learning_rate": 9.980312090611267e-05, + "loss": 3.4689, + "step": 6062 + }, + { + "epoch": 0.37637345583214354, + "grad_norm": 0.29093941068175544, + "learning_rate": 9.980280059753406e-05, + "loss": 3.5853, + "step": 6063 + }, + { + "epoch": 0.3764355329319014, + "grad_norm": 0.3048308597656916, + "learning_rate": 9.980248002912216e-05, + "loss": 3.4951, + "step": 6064 + }, + { + "epoch": 0.3764976100316593, + "grad_norm": 0.2384203129819765, + "learning_rate": 9.980215920087861e-05, + "loss": 3.542, + "step": 6065 + }, + { + "epoch": 0.37655968713141724, + "grad_norm": 0.37171540642445616, + "learning_rate": 9.980183811280512e-05, + "loss": 3.4787, + "step": 6066 + }, + { + "epoch": 0.3766217642311751, + "grad_norm": 0.26490814585140643, + "learning_rate": 9.980151676490336e-05, + "loss": 3.5497, + "step": 6067 + }, + { + "epoch": 0.376683841330933, + "grad_norm": 0.24289699427074782, + "learning_rate": 9.980119515717498e-05, + "loss": 3.547, + "step": 6068 + }, + { + "epoch": 0.37674591843069094, + "grad_norm": 0.3552870023800097, + "learning_rate": 9.980087328962169e-05, + "loss": 3.5562, + "step": 6069 + }, + { + "epoch": 0.3768079955304488, + "grad_norm": 0.3466742609163481, + "learning_rate": 9.980055116224514e-05, + "loss": 3.5465, + "step": 6070 + }, + { + "epoch": 0.3768700726302067, + "grad_norm": 0.27969340206290905, + "learning_rate": 9.980022877504703e-05, + "loss": 3.561, + "step": 6071 + }, + { + "epoch": 0.37693214972996464, + "grad_norm": 0.3228523346694971, + "learning_rate": 9.979990612802905e-05, + "loss": 3.4799, + "step": 6072 + }, + { + "epoch": 0.3769942268297225, + "grad_norm": 0.21653029229856183, + "learning_rate": 9.979958322119285e-05, + "loss": 3.5818, + "step": 6073 + }, + { + "epoch": 0.3770563039294804, + "grad_norm": 0.2653839756871278, + "learning_rate": 9.979926005454014e-05, + "loss": 3.515, + "step": 6074 + }, + { + "epoch": 0.37711838102923834, + "grad_norm": 0.22502155631172052, + "learning_rate": 9.979893662807261e-05, + "loss": 3.5825, + "step": 6075 + }, + { + "epoch": 0.3771804581289962, + "grad_norm": 0.338480649242174, + "learning_rate": 9.979861294179194e-05, + "loss": 3.5914, + "step": 6076 + }, + { + "epoch": 0.3772425352287541, + "grad_norm": 0.3480746184315118, + "learning_rate": 9.97982889956998e-05, + "loss": 3.5806, + "step": 6077 + }, + { + "epoch": 0.37730461232851203, + "grad_norm": 0.29679990119726796, + "learning_rate": 9.979796478979793e-05, + "loss": 3.5736, + "step": 6078 + }, + { + "epoch": 0.3773666894282699, + "grad_norm": 0.2676563287880732, + "learning_rate": 9.979764032408796e-05, + "loss": 3.5048, + "step": 6079 + }, + { + "epoch": 0.3774287665280278, + "grad_norm": 0.3663799658744318, + "learning_rate": 9.979731559857162e-05, + "loss": 3.6117, + "step": 6080 + }, + { + "epoch": 0.37749084362778573, + "grad_norm": 0.49487670535752243, + "learning_rate": 9.97969906132506e-05, + "loss": 3.634, + "step": 6081 + }, + { + "epoch": 0.3775529207275436, + "grad_norm": 0.32868851229897494, + "learning_rate": 9.979666536812658e-05, + "loss": 3.6533, + "step": 6082 + }, + { + "epoch": 0.3776149978273015, + "grad_norm": 0.5142133636331598, + "learning_rate": 9.979633986320127e-05, + "loss": 3.5783, + "step": 6083 + }, + { + "epoch": 0.37767707492705943, + "grad_norm": 0.32304396991276846, + "learning_rate": 9.979601409847638e-05, + "loss": 3.4404, + "step": 6084 + }, + { + "epoch": 0.3777391520268173, + "grad_norm": 0.3104813704591733, + "learning_rate": 9.979568807395358e-05, + "loss": 3.557, + "step": 6085 + }, + { + "epoch": 0.3778012291265752, + "grad_norm": 0.3269779843048434, + "learning_rate": 9.97953617896346e-05, + "loss": 3.5633, + "step": 6086 + }, + { + "epoch": 0.37786330622633313, + "grad_norm": 0.36105800532626636, + "learning_rate": 9.979503524552112e-05, + "loss": 3.606, + "step": 6087 + }, + { + "epoch": 0.377925383326091, + "grad_norm": 0.33846933496928494, + "learning_rate": 9.979470844161486e-05, + "loss": 3.5586, + "step": 6088 + }, + { + "epoch": 0.3779874604258489, + "grad_norm": 0.26557872874085897, + "learning_rate": 9.979438137791751e-05, + "loss": 3.5279, + "step": 6089 + }, + { + "epoch": 0.3780495375256068, + "grad_norm": 0.26343392832406193, + "learning_rate": 9.979405405443079e-05, + "loss": 3.4976, + "step": 6090 + }, + { + "epoch": 0.3781116146253647, + "grad_norm": 0.3913581262726135, + "learning_rate": 9.97937264711564e-05, + "loss": 3.515, + "step": 6091 + }, + { + "epoch": 0.3781736917251226, + "grad_norm": 0.22330422394687072, + "learning_rate": 9.979339862809605e-05, + "loss": 3.5973, + "step": 6092 + }, + { + "epoch": 0.3782357688248805, + "grad_norm": 0.45362063957298776, + "learning_rate": 9.979307052525146e-05, + "loss": 3.459, + "step": 6093 + }, + { + "epoch": 0.3782978459246384, + "grad_norm": 0.31018720947949935, + "learning_rate": 9.979274216262433e-05, + "loss": 3.5671, + "step": 6094 + }, + { + "epoch": 0.3783599230243963, + "grad_norm": 0.3507983379832199, + "learning_rate": 9.979241354021636e-05, + "loss": 3.6226, + "step": 6095 + }, + { + "epoch": 0.3784220001241542, + "grad_norm": 0.3211203946941207, + "learning_rate": 9.979208465802931e-05, + "loss": 3.498, + "step": 6096 + }, + { + "epoch": 0.3784840772239121, + "grad_norm": 0.31073837853380437, + "learning_rate": 9.979175551606484e-05, + "loss": 3.4948, + "step": 6097 + }, + { + "epoch": 0.37854615432367, + "grad_norm": 0.2718534773495973, + "learning_rate": 9.97914261143247e-05, + "loss": 3.4262, + "step": 6098 + }, + { + "epoch": 0.3786082314234279, + "grad_norm": 0.3807696701436668, + "learning_rate": 9.979109645281062e-05, + "loss": 3.6257, + "step": 6099 + }, + { + "epoch": 0.3786703085231858, + "grad_norm": 0.2588296390607761, + "learning_rate": 9.979076653152428e-05, + "loss": 3.5379, + "step": 6100 + }, + { + "epoch": 0.3787323856229437, + "grad_norm": 0.3191790647480797, + "learning_rate": 9.979043635046744e-05, + "loss": 3.5292, + "step": 6101 + }, + { + "epoch": 0.3787944627227016, + "grad_norm": 0.3166241543032274, + "learning_rate": 9.97901059096418e-05, + "loss": 3.5628, + "step": 6102 + }, + { + "epoch": 0.3788565398224595, + "grad_norm": 0.294846897826097, + "learning_rate": 9.97897752090491e-05, + "loss": 3.5732, + "step": 6103 + }, + { + "epoch": 0.3789186169222174, + "grad_norm": 0.3690928473661401, + "learning_rate": 9.978944424869105e-05, + "loss": 3.587, + "step": 6104 + }, + { + "epoch": 0.3789806940219753, + "grad_norm": 0.3102255544414818, + "learning_rate": 9.978911302856938e-05, + "loss": 3.522, + "step": 6105 + }, + { + "epoch": 0.3790427711217332, + "grad_norm": 0.3129731054093435, + "learning_rate": 9.978878154868582e-05, + "loss": 3.4776, + "step": 6106 + }, + { + "epoch": 0.3791048482214911, + "grad_norm": 0.24863504527153993, + "learning_rate": 9.978844980904209e-05, + "loss": 3.5382, + "step": 6107 + }, + { + "epoch": 0.379166925321249, + "grad_norm": 0.2681641669665617, + "learning_rate": 9.978811780963995e-05, + "loss": 3.5916, + "step": 6108 + }, + { + "epoch": 0.3792290024210069, + "grad_norm": 0.24579247698208684, + "learning_rate": 9.978778555048111e-05, + "loss": 3.5258, + "step": 6109 + }, + { + "epoch": 0.3792910795207648, + "grad_norm": 0.23852794415484413, + "learning_rate": 9.97874530315673e-05, + "loss": 3.5111, + "step": 6110 + }, + { + "epoch": 0.3793531566205227, + "grad_norm": 0.262976570307771, + "learning_rate": 9.978712025290026e-05, + "loss": 3.5538, + "step": 6111 + }, + { + "epoch": 0.3794152337202806, + "grad_norm": 0.3104368170762773, + "learning_rate": 9.978678721448172e-05, + "loss": 3.5357, + "step": 6112 + }, + { + "epoch": 0.3794773108200385, + "grad_norm": 0.31889928192420874, + "learning_rate": 9.978645391631344e-05, + "loss": 3.632, + "step": 6113 + }, + { + "epoch": 0.3795393879197964, + "grad_norm": 0.22425914493570945, + "learning_rate": 9.978612035839714e-05, + "loss": 3.52, + "step": 6114 + }, + { + "epoch": 0.3796014650195543, + "grad_norm": 0.27075132020785186, + "learning_rate": 9.978578654073456e-05, + "loss": 3.562, + "step": 6115 + }, + { + "epoch": 0.3796635421193122, + "grad_norm": 0.32707408230854074, + "learning_rate": 9.978545246332745e-05, + "loss": 3.6014, + "step": 6116 + }, + { + "epoch": 0.3797256192190701, + "grad_norm": 0.28840470862258305, + "learning_rate": 9.978511812617755e-05, + "loss": 3.603, + "step": 6117 + }, + { + "epoch": 0.37978769631882797, + "grad_norm": 0.25650378604329277, + "learning_rate": 9.97847835292866e-05, + "loss": 3.6416, + "step": 6118 + }, + { + "epoch": 0.3798497734185859, + "grad_norm": 0.3420547275667346, + "learning_rate": 9.978444867265635e-05, + "loss": 3.5182, + "step": 6119 + }, + { + "epoch": 0.3799118505183438, + "grad_norm": 0.3260410868951357, + "learning_rate": 9.978411355628854e-05, + "loss": 3.5305, + "step": 6120 + }, + { + "epoch": 0.37997392761810167, + "grad_norm": 0.2592910504355002, + "learning_rate": 9.978377818018491e-05, + "loss": 3.6038, + "step": 6121 + }, + { + "epoch": 0.3800360047178596, + "grad_norm": 0.32742896507194225, + "learning_rate": 9.978344254434727e-05, + "loss": 3.6096, + "step": 6122 + }, + { + "epoch": 0.3800980818176175, + "grad_norm": 0.2895088358405307, + "learning_rate": 9.978310664877728e-05, + "loss": 3.6232, + "step": 6123 + }, + { + "epoch": 0.38016015891737537, + "grad_norm": 0.32624009840836804, + "learning_rate": 9.978277049347675e-05, + "loss": 3.6477, + "step": 6124 + }, + { + "epoch": 0.3802222360171333, + "grad_norm": 0.32344057155385375, + "learning_rate": 9.978243407844742e-05, + "loss": 3.5851, + "step": 6125 + }, + { + "epoch": 0.3802843131168912, + "grad_norm": 0.3329777074725106, + "learning_rate": 9.978209740369107e-05, + "loss": 3.5062, + "step": 6126 + }, + { + "epoch": 0.38034639021664907, + "grad_norm": 0.40861931715380523, + "learning_rate": 9.978176046920941e-05, + "loss": 3.515, + "step": 6127 + }, + { + "epoch": 0.380408467316407, + "grad_norm": 0.3752820623095048, + "learning_rate": 9.978142327500421e-05, + "loss": 3.5959, + "step": 6128 + }, + { + "epoch": 0.3804705444161649, + "grad_norm": 0.29511224526710994, + "learning_rate": 9.978108582107727e-05, + "loss": 3.4796, + "step": 6129 + }, + { + "epoch": 0.38053262151592276, + "grad_norm": 0.44396845114909944, + "learning_rate": 9.97807481074303e-05, + "loss": 3.3736, + "step": 6130 + }, + { + "epoch": 0.3805946986156807, + "grad_norm": 0.37076485766004635, + "learning_rate": 9.97804101340651e-05, + "loss": 3.6216, + "step": 6131 + }, + { + "epoch": 0.3806567757154386, + "grad_norm": 0.35973141452691365, + "learning_rate": 9.97800719009834e-05, + "loss": 3.6841, + "step": 6132 + }, + { + "epoch": 0.38071885281519646, + "grad_norm": 0.4789081753914542, + "learning_rate": 9.977973340818699e-05, + "loss": 3.5189, + "step": 6133 + }, + { + "epoch": 0.3807809299149544, + "grad_norm": 0.30478794936318415, + "learning_rate": 9.97793946556776e-05, + "loss": 3.5109, + "step": 6134 + }, + { + "epoch": 0.3808430070147123, + "grad_norm": 0.3552409479637607, + "learning_rate": 9.977905564345705e-05, + "loss": 3.5149, + "step": 6135 + }, + { + "epoch": 0.38090508411447016, + "grad_norm": 0.4543778266189157, + "learning_rate": 9.977871637152709e-05, + "loss": 3.4834, + "step": 6136 + }, + { + "epoch": 0.3809671612142281, + "grad_norm": 0.2729362389168717, + "learning_rate": 9.977837683988946e-05, + "loss": 3.5784, + "step": 6137 + }, + { + "epoch": 0.381029238313986, + "grad_norm": 0.7187646781417758, + "learning_rate": 9.977803704854596e-05, + "loss": 3.5862, + "step": 6138 + }, + { + "epoch": 0.38109131541374386, + "grad_norm": 0.6946161619444131, + "learning_rate": 9.977769699749836e-05, + "loss": 3.5334, + "step": 6139 + }, + { + "epoch": 0.3811533925135018, + "grad_norm": 0.4689058998256597, + "learning_rate": 9.977735668674843e-05, + "loss": 3.5921, + "step": 6140 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 0.4049714031543213, + "learning_rate": 9.977701611629796e-05, + "loss": 3.4433, + "step": 6141 + }, + { + "epoch": 0.38127754671301756, + "grad_norm": 0.42232046626945413, + "learning_rate": 9.97766752861487e-05, + "loss": 3.4835, + "step": 6142 + }, + { + "epoch": 0.3813396238127755, + "grad_norm": 0.33254705642847954, + "learning_rate": 9.977633419630243e-05, + "loss": 3.611, + "step": 6143 + }, + { + "epoch": 0.3814017009125334, + "grad_norm": 0.5914768323013042, + "learning_rate": 9.977599284676095e-05, + "loss": 3.4592, + "step": 6144 + }, + { + "epoch": 0.38146377801229125, + "grad_norm": 0.35838324075437555, + "learning_rate": 9.977565123752604e-05, + "loss": 3.4043, + "step": 6145 + }, + { + "epoch": 0.3815258551120492, + "grad_norm": 0.3452008256669464, + "learning_rate": 9.977530936859946e-05, + "loss": 3.5178, + "step": 6146 + }, + { + "epoch": 0.3815879322118071, + "grad_norm": 0.33018208734356724, + "learning_rate": 9.977496723998302e-05, + "loss": 3.5439, + "step": 6147 + }, + { + "epoch": 0.38165000931156495, + "grad_norm": 0.3482406611898524, + "learning_rate": 9.977462485167848e-05, + "loss": 3.5064, + "step": 6148 + }, + { + "epoch": 0.38171208641132287, + "grad_norm": 0.38944688522253024, + "learning_rate": 9.977428220368765e-05, + "loss": 3.649, + "step": 6149 + }, + { + "epoch": 0.3817741635110808, + "grad_norm": 0.405495144052981, + "learning_rate": 9.97739392960123e-05, + "loss": 3.585, + "step": 6150 + }, + { + "epoch": 0.38183624061083865, + "grad_norm": 0.31139472765930526, + "learning_rate": 9.977359612865423e-05, + "loss": 3.5995, + "step": 6151 + }, + { + "epoch": 0.38189831771059657, + "grad_norm": 0.44777706839287296, + "learning_rate": 9.977325270161524e-05, + "loss": 3.5327, + "step": 6152 + }, + { + "epoch": 0.3819603948103545, + "grad_norm": 0.3363656930178135, + "learning_rate": 9.977290901489709e-05, + "loss": 3.5681, + "step": 6153 + }, + { + "epoch": 0.38202247191011235, + "grad_norm": 0.4170616860286728, + "learning_rate": 9.977256506850159e-05, + "loss": 3.5987, + "step": 6154 + }, + { + "epoch": 0.38208454900987027, + "grad_norm": 0.3029259967394659, + "learning_rate": 9.977222086243054e-05, + "loss": 3.6118, + "step": 6155 + }, + { + "epoch": 0.3821466261096282, + "grad_norm": 0.34015198683443726, + "learning_rate": 9.977187639668572e-05, + "loss": 3.5394, + "step": 6156 + }, + { + "epoch": 0.38220870320938605, + "grad_norm": 0.259640297737014, + "learning_rate": 9.977153167126896e-05, + "loss": 3.4503, + "step": 6157 + }, + { + "epoch": 0.38227078030914396, + "grad_norm": 0.48715183100567216, + "learning_rate": 9.977118668618202e-05, + "loss": 3.6034, + "step": 6158 + }, + { + "epoch": 0.3823328574089019, + "grad_norm": 0.258977152290721, + "learning_rate": 9.977084144142672e-05, + "loss": 3.4931, + "step": 6159 + }, + { + "epoch": 0.38239493450865975, + "grad_norm": 0.31244762697040557, + "learning_rate": 9.977049593700487e-05, + "loss": 3.543, + "step": 6160 + }, + { + "epoch": 0.38245701160841766, + "grad_norm": 0.3138211456882636, + "learning_rate": 9.977015017291824e-05, + "loss": 3.5558, + "step": 6161 + }, + { + "epoch": 0.3825190887081756, + "grad_norm": 0.29075291956290983, + "learning_rate": 9.976980414916867e-05, + "loss": 3.4685, + "step": 6162 + }, + { + "epoch": 0.38258116580793344, + "grad_norm": 0.30592757739846854, + "learning_rate": 9.976945786575795e-05, + "loss": 3.53, + "step": 6163 + }, + { + "epoch": 0.38264324290769136, + "grad_norm": 0.4719833281257868, + "learning_rate": 9.976911132268787e-05, + "loss": 3.4568, + "step": 6164 + }, + { + "epoch": 0.3827053200074493, + "grad_norm": 0.5493208278901904, + "learning_rate": 9.976876451996026e-05, + "loss": 3.5998, + "step": 6165 + }, + { + "epoch": 0.38276739710720714, + "grad_norm": 0.3525151696390515, + "learning_rate": 9.976841745757694e-05, + "loss": 3.4994, + "step": 6166 + }, + { + "epoch": 0.38282947420696506, + "grad_norm": 0.2310534189403855, + "learning_rate": 9.97680701355397e-05, + "loss": 3.4536, + "step": 6167 + }, + { + "epoch": 0.382891551306723, + "grad_norm": 0.31363545606598703, + "learning_rate": 9.976772255385034e-05, + "loss": 3.5555, + "step": 6168 + }, + { + "epoch": 0.38295362840648084, + "grad_norm": 0.2623577004700084, + "learning_rate": 9.97673747125107e-05, + "loss": 3.5878, + "step": 6169 + }, + { + "epoch": 0.38301570550623876, + "grad_norm": 0.21592045527647816, + "learning_rate": 9.976702661152258e-05, + "loss": 3.5448, + "step": 6170 + }, + { + "epoch": 0.3830777826059967, + "grad_norm": 0.25392090681761287, + "learning_rate": 9.976667825088782e-05, + "loss": 3.4944, + "step": 6171 + }, + { + "epoch": 0.38313985970575454, + "grad_norm": 0.2986149495023513, + "learning_rate": 9.976632963060819e-05, + "loss": 3.6767, + "step": 6172 + }, + { + "epoch": 0.38320193680551246, + "grad_norm": 0.21553599932652084, + "learning_rate": 9.976598075068556e-05, + "loss": 3.5042, + "step": 6173 + }, + { + "epoch": 0.3832640139052704, + "grad_norm": 0.37261266553466177, + "learning_rate": 9.976563161112171e-05, + "loss": 3.5645, + "step": 6174 + }, + { + "epoch": 0.38332609100502824, + "grad_norm": 0.23431103151298271, + "learning_rate": 9.97652822119185e-05, + "loss": 3.4733, + "step": 6175 + }, + { + "epoch": 0.38338816810478615, + "grad_norm": 0.33482273464856405, + "learning_rate": 9.97649325530777e-05, + "loss": 3.4079, + "step": 6176 + }, + { + "epoch": 0.38345024520454407, + "grad_norm": 0.20780898607481196, + "learning_rate": 9.97645826346012e-05, + "loss": 3.5088, + "step": 6177 + }, + { + "epoch": 0.38351232230430193, + "grad_norm": 0.31448352264002216, + "learning_rate": 9.976423245649077e-05, + "loss": 3.5312, + "step": 6178 + }, + { + "epoch": 0.38357439940405985, + "grad_norm": 0.35349660369112906, + "learning_rate": 9.976388201874827e-05, + "loss": 3.5912, + "step": 6179 + }, + { + "epoch": 0.38363647650381777, + "grad_norm": 0.3073618722449331, + "learning_rate": 9.97635313213755e-05, + "loss": 3.5113, + "step": 6180 + }, + { + "epoch": 0.38369855360357563, + "grad_norm": 0.3008353568618397, + "learning_rate": 9.976318036437433e-05, + "loss": 3.5021, + "step": 6181 + }, + { + "epoch": 0.38376063070333355, + "grad_norm": 0.24247883378644125, + "learning_rate": 9.976282914774655e-05, + "loss": 3.4755, + "step": 6182 + }, + { + "epoch": 0.3838227078030914, + "grad_norm": 0.36163216248667107, + "learning_rate": 9.976247767149403e-05, + "loss": 3.4181, + "step": 6183 + }, + { + "epoch": 0.38388478490284933, + "grad_norm": 0.28459635074122897, + "learning_rate": 9.976212593561855e-05, + "loss": 3.5238, + "step": 6184 + }, + { + "epoch": 0.38394686200260725, + "grad_norm": 0.35366114366487217, + "learning_rate": 9.9761773940122e-05, + "loss": 3.3915, + "step": 6185 + }, + { + "epoch": 0.3840089391023651, + "grad_norm": 0.2849834258087094, + "learning_rate": 9.976142168500619e-05, + "loss": 3.5643, + "step": 6186 + }, + { + "epoch": 0.38407101620212303, + "grad_norm": 0.3371813170225888, + "learning_rate": 9.976106917027297e-05, + "loss": 3.5384, + "step": 6187 + }, + { + "epoch": 0.38413309330188095, + "grad_norm": 0.3042111095963952, + "learning_rate": 9.976071639592417e-05, + "loss": 3.438, + "step": 6188 + }, + { + "epoch": 0.3841951704016388, + "grad_norm": 0.21717088244693236, + "learning_rate": 9.976036336196163e-05, + "loss": 3.4352, + "step": 6189 + }, + { + "epoch": 0.3842572475013967, + "grad_norm": 0.24070483561511244, + "learning_rate": 9.97600100683872e-05, + "loss": 3.5607, + "step": 6190 + }, + { + "epoch": 0.38431932460115464, + "grad_norm": 0.22357122747680128, + "learning_rate": 9.975965651520272e-05, + "loss": 3.4521, + "step": 6191 + }, + { + "epoch": 0.3843814017009125, + "grad_norm": 0.2527461030057832, + "learning_rate": 9.975930270241003e-05, + "loss": 3.523, + "step": 6192 + }, + { + "epoch": 0.3844434788006704, + "grad_norm": 0.1950259872842588, + "learning_rate": 9.975894863001096e-05, + "loss": 3.503, + "step": 6193 + }, + { + "epoch": 0.38450555590042834, + "grad_norm": 0.2613699139766535, + "learning_rate": 9.97585942980074e-05, + "loss": 3.548, + "step": 6194 + }, + { + "epoch": 0.3845676330001862, + "grad_norm": 0.2407276702420581, + "learning_rate": 9.975823970640117e-05, + "loss": 3.4787, + "step": 6195 + }, + { + "epoch": 0.3846297100999441, + "grad_norm": 0.28538073234632294, + "learning_rate": 9.975788485519411e-05, + "loss": 3.5437, + "step": 6196 + }, + { + "epoch": 0.38469178719970204, + "grad_norm": 0.22470309748515804, + "learning_rate": 9.97575297443881e-05, + "loss": 3.6176, + "step": 6197 + }, + { + "epoch": 0.3847538642994599, + "grad_norm": 0.2516767923394918, + "learning_rate": 9.975717437398498e-05, + "loss": 3.6273, + "step": 6198 + }, + { + "epoch": 0.3848159413992178, + "grad_norm": 0.29121892343747846, + "learning_rate": 9.975681874398659e-05, + "loss": 3.5912, + "step": 6199 + }, + { + "epoch": 0.38487801849897574, + "grad_norm": 0.24085653578686866, + "learning_rate": 9.97564628543948e-05, + "loss": 3.4473, + "step": 6200 + }, + { + "epoch": 0.3849400955987336, + "grad_norm": 0.2584581382564596, + "learning_rate": 9.975610670521148e-05, + "loss": 3.4721, + "step": 6201 + }, + { + "epoch": 0.3850021726984915, + "grad_norm": 0.2639066710392522, + "learning_rate": 9.975575029643845e-05, + "loss": 3.4333, + "step": 6202 + }, + { + "epoch": 0.38506424979824944, + "grad_norm": 0.26264480899489906, + "learning_rate": 9.97553936280776e-05, + "loss": 3.5921, + "step": 6203 + }, + { + "epoch": 0.3851263268980073, + "grad_norm": 0.18818189880526323, + "learning_rate": 9.975503670013078e-05, + "loss": 3.5479, + "step": 6204 + }, + { + "epoch": 0.3851884039977652, + "grad_norm": 0.26176178991076526, + "learning_rate": 9.975467951259987e-05, + "loss": 3.5208, + "step": 6205 + }, + { + "epoch": 0.38525048109752313, + "grad_norm": 0.24797480580002798, + "learning_rate": 9.97543220654867e-05, + "loss": 3.4829, + "step": 6206 + }, + { + "epoch": 0.385312558197281, + "grad_norm": 0.19508737941632381, + "learning_rate": 9.975396435879316e-05, + "loss": 3.4547, + "step": 6207 + }, + { + "epoch": 0.3853746352970389, + "grad_norm": 0.3263115612316821, + "learning_rate": 9.975360639252111e-05, + "loss": 3.5046, + "step": 6208 + }, + { + "epoch": 0.38543671239679683, + "grad_norm": 0.19593559057348756, + "learning_rate": 9.975324816667242e-05, + "loss": 3.4956, + "step": 6209 + }, + { + "epoch": 0.3854987894965547, + "grad_norm": 0.22576047907394128, + "learning_rate": 9.975288968124896e-05, + "loss": 3.5394, + "step": 6210 + }, + { + "epoch": 0.3855608665963126, + "grad_norm": 0.25329503070728626, + "learning_rate": 9.975253093625258e-05, + "loss": 3.5246, + "step": 6211 + }, + { + "epoch": 0.38562294369607053, + "grad_norm": 0.24002565450445001, + "learning_rate": 9.975217193168519e-05, + "loss": 3.5127, + "step": 6212 + }, + { + "epoch": 0.3856850207958284, + "grad_norm": 0.31445479540647775, + "learning_rate": 9.975181266754863e-05, + "loss": 3.516, + "step": 6213 + }, + { + "epoch": 0.3857470978955863, + "grad_norm": 0.30786045688442343, + "learning_rate": 9.97514531438448e-05, + "loss": 3.5128, + "step": 6214 + }, + { + "epoch": 0.38580917499534423, + "grad_norm": 0.17961523427279136, + "learning_rate": 9.975109336057554e-05, + "loss": 3.4596, + "step": 6215 + }, + { + "epoch": 0.3858712520951021, + "grad_norm": 0.2861816703042674, + "learning_rate": 9.975073331774276e-05, + "loss": 3.4608, + "step": 6216 + }, + { + "epoch": 0.38593332919486, + "grad_norm": 0.20945875748627502, + "learning_rate": 9.975037301534832e-05, + "loss": 3.4216, + "step": 6217 + }, + { + "epoch": 0.3859954062946179, + "grad_norm": 0.18850563497114478, + "learning_rate": 9.975001245339411e-05, + "loss": 3.6008, + "step": 6218 + }, + { + "epoch": 0.3860574833943758, + "grad_norm": 0.26953987439496774, + "learning_rate": 9.974965163188201e-05, + "loss": 3.5335, + "step": 6219 + }, + { + "epoch": 0.3861195604941337, + "grad_norm": 0.29209070234258605, + "learning_rate": 9.97492905508139e-05, + "loss": 3.4823, + "step": 6220 + }, + { + "epoch": 0.3861816375938916, + "grad_norm": 0.3268577878771165, + "learning_rate": 9.974892921019166e-05, + "loss": 3.4707, + "step": 6221 + }, + { + "epoch": 0.3862437146936495, + "grad_norm": 0.2208996547594606, + "learning_rate": 9.974856761001718e-05, + "loss": 3.5181, + "step": 6222 + }, + { + "epoch": 0.3863057917934074, + "grad_norm": 0.2923129562070956, + "learning_rate": 9.974820575029235e-05, + "loss": 3.4781, + "step": 6223 + }, + { + "epoch": 0.3863678688931653, + "grad_norm": 0.23995109075891338, + "learning_rate": 9.974784363101905e-05, + "loss": 3.4917, + "step": 6224 + }, + { + "epoch": 0.3864299459929232, + "grad_norm": 0.2696966272049524, + "learning_rate": 9.974748125219918e-05, + "loss": 3.5208, + "step": 6225 + }, + { + "epoch": 0.3864920230926811, + "grad_norm": 0.25950391392337946, + "learning_rate": 9.974711861383461e-05, + "loss": 3.4817, + "step": 6226 + }, + { + "epoch": 0.386554100192439, + "grad_norm": 0.2838435725180359, + "learning_rate": 9.974675571592726e-05, + "loss": 3.5254, + "step": 6227 + }, + { + "epoch": 0.3866161772921969, + "grad_norm": 0.2886700156517182, + "learning_rate": 9.9746392558479e-05, + "loss": 3.4715, + "step": 6228 + }, + { + "epoch": 0.3866782543919548, + "grad_norm": 0.381713945262524, + "learning_rate": 9.974602914149174e-05, + "loss": 3.4278, + "step": 6229 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 0.2752619774280042, + "learning_rate": 9.974566546496737e-05, + "loss": 3.5395, + "step": 6230 + }, + { + "epoch": 0.3868024085914706, + "grad_norm": 0.2486841246224138, + "learning_rate": 9.974530152890777e-05, + "loss": 3.5168, + "step": 6231 + }, + { + "epoch": 0.3868644856912285, + "grad_norm": 0.27328657344651797, + "learning_rate": 9.974493733331489e-05, + "loss": 3.6332, + "step": 6232 + }, + { + "epoch": 0.3869265627909864, + "grad_norm": 0.22165841155945776, + "learning_rate": 9.974457287819056e-05, + "loss": 3.5179, + "step": 6233 + }, + { + "epoch": 0.3869886398907443, + "grad_norm": 0.3462518550465052, + "learning_rate": 9.974420816353674e-05, + "loss": 3.6246, + "step": 6234 + }, + { + "epoch": 0.3870507169905022, + "grad_norm": 0.25746740016775566, + "learning_rate": 9.97438431893553e-05, + "loss": 3.5132, + "step": 6235 + }, + { + "epoch": 0.3871127940902601, + "grad_norm": 0.22349438276688569, + "learning_rate": 9.974347795564816e-05, + "loss": 3.4856, + "step": 6236 + }, + { + "epoch": 0.387174871190018, + "grad_norm": 0.26684804847293886, + "learning_rate": 9.974311246241721e-05, + "loss": 3.5897, + "step": 6237 + }, + { + "epoch": 0.3872369482897759, + "grad_norm": 0.2990883709989113, + "learning_rate": 9.974274670966436e-05, + "loss": 3.5295, + "step": 6238 + }, + { + "epoch": 0.3872990253895338, + "grad_norm": 0.2736887594857411, + "learning_rate": 9.974238069739156e-05, + "loss": 3.523, + "step": 6239 + }, + { + "epoch": 0.3873611024892917, + "grad_norm": 0.5369356463976118, + "learning_rate": 9.974201442560066e-05, + "loss": 3.5128, + "step": 6240 + }, + { + "epoch": 0.3874231795890496, + "grad_norm": 0.5090531672306596, + "learning_rate": 9.97416478942936e-05, + "loss": 3.5007, + "step": 6241 + }, + { + "epoch": 0.3874852566888075, + "grad_norm": 0.2890325221420274, + "learning_rate": 9.974128110347228e-05, + "loss": 3.4153, + "step": 6242 + }, + { + "epoch": 0.3875473337885654, + "grad_norm": 0.4027903587287452, + "learning_rate": 9.974091405313863e-05, + "loss": 3.4912, + "step": 6243 + }, + { + "epoch": 0.3876094108883233, + "grad_norm": 0.40550918157015864, + "learning_rate": 9.974054674329456e-05, + "loss": 3.5532, + "step": 6244 + }, + { + "epoch": 0.3876714879880812, + "grad_norm": 0.3580826659753795, + "learning_rate": 9.974017917394198e-05, + "loss": 3.4914, + "step": 6245 + }, + { + "epoch": 0.38773356508783907, + "grad_norm": 0.4550993263643242, + "learning_rate": 9.97398113450828e-05, + "loss": 3.5723, + "step": 6246 + }, + { + "epoch": 0.387795642187597, + "grad_norm": 0.2968770955664308, + "learning_rate": 9.973944325671896e-05, + "loss": 3.3825, + "step": 6247 + }, + { + "epoch": 0.3878577192873549, + "grad_norm": 0.2980402489521836, + "learning_rate": 9.973907490885237e-05, + "loss": 3.523, + "step": 6248 + }, + { + "epoch": 0.38791979638711277, + "grad_norm": 0.521442368776484, + "learning_rate": 9.973870630148494e-05, + "loss": 3.6642, + "step": 6249 + }, + { + "epoch": 0.3879818734868707, + "grad_norm": 0.28476697662409506, + "learning_rate": 9.973833743461862e-05, + "loss": 3.6053, + "step": 6250 + }, + { + "epoch": 0.3880439505866286, + "grad_norm": 0.2562395576349773, + "learning_rate": 9.97379683082553e-05, + "loss": 3.5304, + "step": 6251 + }, + { + "epoch": 0.38810602768638647, + "grad_norm": 0.4315124770280098, + "learning_rate": 9.973759892239694e-05, + "loss": 3.4722, + "step": 6252 + }, + { + "epoch": 0.3881681047861444, + "grad_norm": 0.2570069762481657, + "learning_rate": 9.973722927704545e-05, + "loss": 3.4535, + "step": 6253 + }, + { + "epoch": 0.3882301818859023, + "grad_norm": 0.3962541672941744, + "learning_rate": 9.973685937220276e-05, + "loss": 3.5671, + "step": 6254 + }, + { + "epoch": 0.38829225898566017, + "grad_norm": 0.21887517651355276, + "learning_rate": 9.97364892078708e-05, + "loss": 3.4687, + "step": 6255 + }, + { + "epoch": 0.3883543360854181, + "grad_norm": 0.34392982894298035, + "learning_rate": 9.973611878405152e-05, + "loss": 3.5082, + "step": 6256 + }, + { + "epoch": 0.388416413185176, + "grad_norm": 0.3556738670376122, + "learning_rate": 9.97357481007468e-05, + "loss": 3.5241, + "step": 6257 + }, + { + "epoch": 0.38847849028493386, + "grad_norm": 0.23598050331900625, + "learning_rate": 9.973537715795863e-05, + "loss": 3.4918, + "step": 6258 + }, + { + "epoch": 0.3885405673846918, + "grad_norm": 0.2875073069182942, + "learning_rate": 9.973500595568892e-05, + "loss": 3.5736, + "step": 6259 + }, + { + "epoch": 0.3886026444844497, + "grad_norm": 0.37838599285266517, + "learning_rate": 9.973463449393961e-05, + "loss": 3.5225, + "step": 6260 + }, + { + "epoch": 0.38866472158420756, + "grad_norm": 0.3419337407335857, + "learning_rate": 9.973426277271264e-05, + "loss": 3.4968, + "step": 6261 + }, + { + "epoch": 0.3887267986839655, + "grad_norm": 0.48628430762775104, + "learning_rate": 9.973389079200994e-05, + "loss": 3.567, + "step": 6262 + }, + { + "epoch": 0.3887888757837234, + "grad_norm": 0.5029654777162067, + "learning_rate": 9.973351855183346e-05, + "loss": 3.4951, + "step": 6263 + }, + { + "epoch": 0.38885095288348126, + "grad_norm": 0.35176566929067765, + "learning_rate": 9.973314605218516e-05, + "loss": 3.5468, + "step": 6264 + }, + { + "epoch": 0.3889130299832392, + "grad_norm": 0.3927775203999101, + "learning_rate": 9.973277329306694e-05, + "loss": 3.5007, + "step": 6265 + }, + { + "epoch": 0.3889751070829971, + "grad_norm": 0.2977377126762393, + "learning_rate": 9.973240027448077e-05, + "loss": 3.5301, + "step": 6266 + }, + { + "epoch": 0.38903718418275496, + "grad_norm": 0.32445441723645896, + "learning_rate": 9.973202699642861e-05, + "loss": 3.6147, + "step": 6267 + }, + { + "epoch": 0.3890992612825129, + "grad_norm": 0.553866006441133, + "learning_rate": 9.973165345891237e-05, + "loss": 3.5426, + "step": 6268 + }, + { + "epoch": 0.3891613383822708, + "grad_norm": 0.5227424504881085, + "learning_rate": 9.973127966193406e-05, + "loss": 3.4855, + "step": 6269 + }, + { + "epoch": 0.38922341548202866, + "grad_norm": 0.40073697996376784, + "learning_rate": 9.973090560549555e-05, + "loss": 3.5217, + "step": 6270 + }, + { + "epoch": 0.3892854925817866, + "grad_norm": 0.5720818390471337, + "learning_rate": 9.973053128959884e-05, + "loss": 3.433, + "step": 6271 + }, + { + "epoch": 0.3893475696815445, + "grad_norm": 0.2592298819857464, + "learning_rate": 9.97301567142459e-05, + "loss": 3.5039, + "step": 6272 + }, + { + "epoch": 0.38940964678130235, + "grad_norm": 0.35817530699760897, + "learning_rate": 9.972978187943864e-05, + "loss": 3.5222, + "step": 6273 + }, + { + "epoch": 0.38947172388106027, + "grad_norm": 0.4111981736372213, + "learning_rate": 9.972940678517903e-05, + "loss": 3.4934, + "step": 6274 + }, + { + "epoch": 0.3895338009808182, + "grad_norm": 0.31162121712094326, + "learning_rate": 9.972903143146906e-05, + "loss": 3.4712, + "step": 6275 + }, + { + "epoch": 0.38959587808057605, + "grad_norm": 0.3271427324850918, + "learning_rate": 9.972865581831063e-05, + "loss": 3.4758, + "step": 6276 + }, + { + "epoch": 0.38965795518033397, + "grad_norm": 0.9789482576531353, + "learning_rate": 9.972827994570575e-05, + "loss": 3.5856, + "step": 6277 + }, + { + "epoch": 0.3897200322800919, + "grad_norm": 0.38352716648859264, + "learning_rate": 9.972790381365635e-05, + "loss": 3.533, + "step": 6278 + }, + { + "epoch": 0.38978210937984975, + "grad_norm": 0.8260114437377363, + "learning_rate": 9.972752742216443e-05, + "loss": 3.5627, + "step": 6279 + }, + { + "epoch": 0.38984418647960767, + "grad_norm": 0.8833086425701342, + "learning_rate": 9.972715077123189e-05, + "loss": 3.6229, + "step": 6280 + }, + { + "epoch": 0.3899062635793656, + "grad_norm": 0.8874876411007293, + "learning_rate": 9.972677386086075e-05, + "loss": 3.5153, + "step": 6281 + }, + { + "epoch": 0.38996834067912345, + "grad_norm": 0.6608548953694484, + "learning_rate": 9.972639669105297e-05, + "loss": 3.5504, + "step": 6282 + }, + { + "epoch": 0.39003041777888137, + "grad_norm": 0.6504183010135361, + "learning_rate": 9.97260192618105e-05, + "loss": 3.5606, + "step": 6283 + }, + { + "epoch": 0.3900924948786393, + "grad_norm": 0.7770198741565685, + "learning_rate": 9.972564157313531e-05, + "loss": 3.5378, + "step": 6284 + }, + { + "epoch": 0.39015457197839715, + "grad_norm": 0.5435664228789251, + "learning_rate": 9.972526362502938e-05, + "loss": 3.5744, + "step": 6285 + }, + { + "epoch": 0.39021664907815506, + "grad_norm": 0.48274564496213923, + "learning_rate": 9.972488541749468e-05, + "loss": 3.5448, + "step": 6286 + }, + { + "epoch": 0.390278726177913, + "grad_norm": 0.5028923243004577, + "learning_rate": 9.972450695053317e-05, + "loss": 3.6058, + "step": 6287 + }, + { + "epoch": 0.39034080327767084, + "grad_norm": 0.37878868402771176, + "learning_rate": 9.972412822414685e-05, + "loss": 3.6274, + "step": 6288 + }, + { + "epoch": 0.39040288037742876, + "grad_norm": 0.3303337123379007, + "learning_rate": 9.972374923833768e-05, + "loss": 3.5187, + "step": 6289 + }, + { + "epoch": 0.3904649574771867, + "grad_norm": 0.36031118027113707, + "learning_rate": 9.972336999310762e-05, + "loss": 3.5435, + "step": 6290 + }, + { + "epoch": 0.39052703457694454, + "grad_norm": 0.471668614480655, + "learning_rate": 9.97229904884587e-05, + "loss": 3.5717, + "step": 6291 + }, + { + "epoch": 0.39058911167670246, + "grad_norm": 0.4398018364729623, + "learning_rate": 9.972261072439286e-05, + "loss": 3.5848, + "step": 6292 + }, + { + "epoch": 0.3906511887764604, + "grad_norm": 0.3667775158145622, + "learning_rate": 9.972223070091207e-05, + "loss": 3.628, + "step": 6293 + }, + { + "epoch": 0.39071326587621824, + "grad_norm": 0.2872349211567209, + "learning_rate": 9.972185041801836e-05, + "loss": 3.4562, + "step": 6294 + }, + { + "epoch": 0.39077534297597616, + "grad_norm": 0.2947767165721159, + "learning_rate": 9.972146987571367e-05, + "loss": 3.5617, + "step": 6295 + }, + { + "epoch": 0.3908374200757341, + "grad_norm": 0.38850619748227905, + "learning_rate": 9.9721089074e-05, + "loss": 3.6322, + "step": 6296 + }, + { + "epoch": 0.39089949717549194, + "grad_norm": 0.33194778739748, + "learning_rate": 9.972070801287934e-05, + "loss": 3.5236, + "step": 6297 + }, + { + "epoch": 0.39096157427524986, + "grad_norm": 0.32753012866197767, + "learning_rate": 9.972032669235367e-05, + "loss": 3.5367, + "step": 6298 + }, + { + "epoch": 0.3910236513750078, + "grad_norm": 0.34713286746844996, + "learning_rate": 9.971994511242499e-05, + "loss": 3.5051, + "step": 6299 + }, + { + "epoch": 0.39108572847476564, + "grad_norm": 0.34645754722113015, + "learning_rate": 9.971956327309528e-05, + "loss": 3.5235, + "step": 6300 + }, + { + "epoch": 0.39114780557452355, + "grad_norm": 0.2753837395206311, + "learning_rate": 9.971918117436656e-05, + "loss": 3.5747, + "step": 6301 + }, + { + "epoch": 0.3912098826742815, + "grad_norm": 0.2813919954009723, + "learning_rate": 9.971879881624077e-05, + "loss": 3.4668, + "step": 6302 + }, + { + "epoch": 0.39127195977403934, + "grad_norm": 0.2902579846027273, + "learning_rate": 9.971841619871997e-05, + "loss": 3.5065, + "step": 6303 + }, + { + "epoch": 0.39133403687379725, + "grad_norm": 0.34280080224163634, + "learning_rate": 9.971803332180611e-05, + "loss": 3.4856, + "step": 6304 + }, + { + "epoch": 0.39139611397355517, + "grad_norm": 0.21098584927886604, + "learning_rate": 9.97176501855012e-05, + "loss": 3.4735, + "step": 6305 + }, + { + "epoch": 0.39145819107331303, + "grad_norm": 0.2319398176462406, + "learning_rate": 9.971726678980724e-05, + "loss": 3.4964, + "step": 6306 + }, + { + "epoch": 0.39152026817307095, + "grad_norm": 0.3026298219350991, + "learning_rate": 9.971688313472623e-05, + "loss": 3.5386, + "step": 6307 + }, + { + "epoch": 0.39158234527282887, + "grad_norm": 0.19475748358036177, + "learning_rate": 9.971649922026016e-05, + "loss": 3.4417, + "step": 6308 + }, + { + "epoch": 0.39164442237258673, + "grad_norm": 0.29939782931621384, + "learning_rate": 9.971611504641106e-05, + "loss": 3.4426, + "step": 6309 + }, + { + "epoch": 0.39170649947234465, + "grad_norm": 0.265641211968644, + "learning_rate": 9.97157306131809e-05, + "loss": 3.5279, + "step": 6310 + }, + { + "epoch": 0.39176857657210257, + "grad_norm": 0.31211987264116525, + "learning_rate": 9.971534592057172e-05, + "loss": 3.5133, + "step": 6311 + }, + { + "epoch": 0.39183065367186043, + "grad_norm": 0.33861942743128615, + "learning_rate": 9.971496096858552e-05, + "loss": 3.5313, + "step": 6312 + }, + { + "epoch": 0.39189273077161835, + "grad_norm": 0.3043530291161064, + "learning_rate": 9.971457575722429e-05, + "loss": 3.4274, + "step": 6313 + }, + { + "epoch": 0.39195480787137627, + "grad_norm": 0.2998343089626598, + "learning_rate": 9.971419028649004e-05, + "loss": 3.5864, + "step": 6314 + }, + { + "epoch": 0.3920168849711341, + "grad_norm": 0.30816121511440997, + "learning_rate": 9.971380455638481e-05, + "loss": 3.3932, + "step": 6315 + }, + { + "epoch": 0.39207896207089205, + "grad_norm": 0.23458003771185135, + "learning_rate": 9.971341856691058e-05, + "loss": 3.6346, + "step": 6316 + }, + { + "epoch": 0.39214103917064996, + "grad_norm": 0.24437837849596283, + "learning_rate": 9.971303231806939e-05, + "loss": 3.5613, + "step": 6317 + }, + { + "epoch": 0.3922031162704078, + "grad_norm": 0.23947248365052054, + "learning_rate": 9.971264580986323e-05, + "loss": 3.5062, + "step": 6318 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 0.2778470348738473, + "learning_rate": 9.971225904229414e-05, + "loss": 3.5759, + "step": 6319 + }, + { + "epoch": 0.39232727046992366, + "grad_norm": 0.20778051682353058, + "learning_rate": 9.97118720153641e-05, + "loss": 3.4647, + "step": 6320 + }, + { + "epoch": 0.3923893475696815, + "grad_norm": 0.24545642281556485, + "learning_rate": 9.971148472907516e-05, + "loss": 3.5313, + "step": 6321 + }, + { + "epoch": 0.39245142466943944, + "grad_norm": 0.21194630039250192, + "learning_rate": 9.971109718342938e-05, + "loss": 3.4865, + "step": 6322 + }, + { + "epoch": 0.39251350176919736, + "grad_norm": 0.25009292127363986, + "learning_rate": 9.971070937842868e-05, + "loss": 3.4628, + "step": 6323 + }, + { + "epoch": 0.3925755788689552, + "grad_norm": 0.3335566095765199, + "learning_rate": 9.971032131407518e-05, + "loss": 3.4851, + "step": 6324 + }, + { + "epoch": 0.39263765596871314, + "grad_norm": 0.2968513830729387, + "learning_rate": 9.970993299037086e-05, + "loss": 3.6424, + "step": 6325 + }, + { + "epoch": 0.39269973306847106, + "grad_norm": 0.32431362096916605, + "learning_rate": 9.970954440731775e-05, + "loss": 3.4904, + "step": 6326 + }, + { + "epoch": 0.3927618101682289, + "grad_norm": 0.2406957506847346, + "learning_rate": 9.970915556491787e-05, + "loss": 3.4931, + "step": 6327 + }, + { + "epoch": 0.39282388726798684, + "grad_norm": 0.29999276639244266, + "learning_rate": 9.970876646317326e-05, + "loss": 3.4685, + "step": 6328 + }, + { + "epoch": 0.39288596436774476, + "grad_norm": 0.2681690429536182, + "learning_rate": 9.970837710208596e-05, + "loss": 3.4423, + "step": 6329 + }, + { + "epoch": 0.3929480414675026, + "grad_norm": 0.2289470610188153, + "learning_rate": 9.970798748165797e-05, + "loss": 3.4729, + "step": 6330 + }, + { + "epoch": 0.39301011856726054, + "grad_norm": 0.2677496478686613, + "learning_rate": 9.970759760189137e-05, + "loss": 3.4764, + "step": 6331 + }, + { + "epoch": 0.39307219566701845, + "grad_norm": 0.29188014914240773, + "learning_rate": 9.970720746278815e-05, + "loss": 3.4813, + "step": 6332 + }, + { + "epoch": 0.3931342727667763, + "grad_norm": 0.18427348615513345, + "learning_rate": 9.970681706435035e-05, + "loss": 3.5227, + "step": 6333 + }, + { + "epoch": 0.39319634986653423, + "grad_norm": 0.2867336799414014, + "learning_rate": 9.970642640658003e-05, + "loss": 3.4937, + "step": 6334 + }, + { + "epoch": 0.39325842696629215, + "grad_norm": 0.2818898102576371, + "learning_rate": 9.970603548947923e-05, + "loss": 3.542, + "step": 6335 + }, + { + "epoch": 0.39332050406605, + "grad_norm": 0.32721798285695947, + "learning_rate": 9.970564431304996e-05, + "loss": 3.4594, + "step": 6336 + }, + { + "epoch": 0.39338258116580793, + "grad_norm": 0.2738492726255908, + "learning_rate": 9.970525287729428e-05, + "loss": 3.5638, + "step": 6337 + }, + { + "epoch": 0.39344465826556585, + "grad_norm": 0.21116983898901687, + "learning_rate": 9.970486118221422e-05, + "loss": 3.3143, + "step": 6338 + }, + { + "epoch": 0.3935067353653237, + "grad_norm": 0.3269824503664569, + "learning_rate": 9.970446922781186e-05, + "loss": 3.4331, + "step": 6339 + }, + { + "epoch": 0.39356881246508163, + "grad_norm": 0.30135894388332274, + "learning_rate": 9.970407701408919e-05, + "loss": 3.5083, + "step": 6340 + }, + { + "epoch": 0.39363088956483955, + "grad_norm": 0.3006391932929863, + "learning_rate": 9.970368454104831e-05, + "loss": 3.5225, + "step": 6341 + }, + { + "epoch": 0.3936929666645974, + "grad_norm": 0.30025016665712123, + "learning_rate": 9.970329180869123e-05, + "loss": 3.39, + "step": 6342 + }, + { + "epoch": 0.39375504376435533, + "grad_norm": 0.27765633872336115, + "learning_rate": 9.970289881702e-05, + "loss": 3.4307, + "step": 6343 + }, + { + "epoch": 0.39381712086411325, + "grad_norm": 0.30875694209182425, + "learning_rate": 9.970250556603668e-05, + "loss": 3.4145, + "step": 6344 + }, + { + "epoch": 0.3938791979638711, + "grad_norm": 0.4294067853021058, + "learning_rate": 9.970211205574333e-05, + "loss": 3.4606, + "step": 6345 + }, + { + "epoch": 0.393941275063629, + "grad_norm": 0.2929936843433883, + "learning_rate": 9.9701718286142e-05, + "loss": 3.498, + "step": 6346 + }, + { + "epoch": 0.39400335216338694, + "grad_norm": 0.31121127713092606, + "learning_rate": 9.970132425723475e-05, + "loss": 3.3544, + "step": 6347 + }, + { + "epoch": 0.3940654292631448, + "grad_norm": 0.24938890155488075, + "learning_rate": 9.970092996902361e-05, + "loss": 3.5298, + "step": 6348 + }, + { + "epoch": 0.3941275063629027, + "grad_norm": 0.32436658177633576, + "learning_rate": 9.970053542151066e-05, + "loss": 3.5809, + "step": 6349 + }, + { + "epoch": 0.39418958346266064, + "grad_norm": 0.20648314238579812, + "learning_rate": 9.970014061469794e-05, + "loss": 3.5897, + "step": 6350 + }, + { + "epoch": 0.3942516605624185, + "grad_norm": 0.27351138615313203, + "learning_rate": 9.969974554858754e-05, + "loss": 3.4405, + "step": 6351 + }, + { + "epoch": 0.3943137376621764, + "grad_norm": 0.23570324118576247, + "learning_rate": 9.969935022318149e-05, + "loss": 3.5244, + "step": 6352 + }, + { + "epoch": 0.39437581476193434, + "grad_norm": 0.23213954414896093, + "learning_rate": 9.969895463848186e-05, + "loss": 3.4986, + "step": 6353 + }, + { + "epoch": 0.3944378918616922, + "grad_norm": 0.25657136873907305, + "learning_rate": 9.969855879449072e-05, + "loss": 3.481, + "step": 6354 + }, + { + "epoch": 0.3944999689614501, + "grad_norm": 0.3432160478708014, + "learning_rate": 9.969816269121015e-05, + "loss": 3.4544, + "step": 6355 + }, + { + "epoch": 0.39456204606120804, + "grad_norm": 0.23498226015113421, + "learning_rate": 9.969776632864219e-05, + "loss": 3.554, + "step": 6356 + }, + { + "epoch": 0.3946241231609659, + "grad_norm": 0.2659741799310447, + "learning_rate": 9.969736970678892e-05, + "loss": 3.436, + "step": 6357 + }, + { + "epoch": 0.3946862002607238, + "grad_norm": 0.3412045387056753, + "learning_rate": 9.969697282565241e-05, + "loss": 3.5571, + "step": 6358 + }, + { + "epoch": 0.39474827736048174, + "grad_norm": 0.37647333395323507, + "learning_rate": 9.969657568523473e-05, + "loss": 3.5185, + "step": 6359 + }, + { + "epoch": 0.3948103544602396, + "grad_norm": 0.34823031659306797, + "learning_rate": 9.969617828553795e-05, + "loss": 3.5249, + "step": 6360 + }, + { + "epoch": 0.3948724315599975, + "grad_norm": 0.24296751849975534, + "learning_rate": 9.969578062656412e-05, + "loss": 3.3982, + "step": 6361 + }, + { + "epoch": 0.39493450865975543, + "grad_norm": 0.2958456187428459, + "learning_rate": 9.969538270831538e-05, + "loss": 3.4157, + "step": 6362 + }, + { + "epoch": 0.3949965857595133, + "grad_norm": 0.2655559478089718, + "learning_rate": 9.969498453079373e-05, + "loss": 3.5392, + "step": 6363 + }, + { + "epoch": 0.3950586628592712, + "grad_norm": 0.2854194080030864, + "learning_rate": 9.969458609400129e-05, + "loss": 3.578, + "step": 6364 + }, + { + "epoch": 0.39512073995902913, + "grad_norm": 0.2377040783058047, + "learning_rate": 9.969418739794014e-05, + "loss": 3.592, + "step": 6365 + }, + { + "epoch": 0.395182817058787, + "grad_norm": 0.3644251094079715, + "learning_rate": 9.969378844261234e-05, + "loss": 3.543, + "step": 6366 + }, + { + "epoch": 0.3952448941585449, + "grad_norm": 0.27055519597924776, + "learning_rate": 9.969338922802e-05, + "loss": 3.5825, + "step": 6367 + }, + { + "epoch": 0.39530697125830283, + "grad_norm": 0.23097042665891065, + "learning_rate": 9.969298975416516e-05, + "loss": 3.4649, + "step": 6368 + }, + { + "epoch": 0.3953690483580607, + "grad_norm": 0.47572614416447623, + "learning_rate": 9.969259002104994e-05, + "loss": 3.4491, + "step": 6369 + }, + { + "epoch": 0.3954311254578186, + "grad_norm": 0.2752903965575781, + "learning_rate": 9.969219002867642e-05, + "loss": 3.5755, + "step": 6370 + }, + { + "epoch": 0.39549320255757653, + "grad_norm": 0.42356961360596307, + "learning_rate": 9.969178977704668e-05, + "loss": 3.5697, + "step": 6371 + }, + { + "epoch": 0.3955552796573344, + "grad_norm": 0.27088868358079415, + "learning_rate": 9.969138926616282e-05, + "loss": 3.515, + "step": 6372 + }, + { + "epoch": 0.3956173567570923, + "grad_norm": 0.31291719245273864, + "learning_rate": 9.96909884960269e-05, + "loss": 3.44, + "step": 6373 + }, + { + "epoch": 0.3956794338568502, + "grad_norm": 0.29667788788159877, + "learning_rate": 9.969058746664102e-05, + "loss": 3.5508, + "step": 6374 + }, + { + "epoch": 0.3957415109566081, + "grad_norm": 0.21685757517553902, + "learning_rate": 9.969018617800731e-05, + "loss": 3.4921, + "step": 6375 + }, + { + "epoch": 0.395803588056366, + "grad_norm": 0.33814523178043004, + "learning_rate": 9.968978463012782e-05, + "loss": 3.5577, + "step": 6376 + }, + { + "epoch": 0.3958656651561239, + "grad_norm": 0.22888387857161452, + "learning_rate": 9.968938282300466e-05, + "loss": 3.4848, + "step": 6377 + }, + { + "epoch": 0.3959277422558818, + "grad_norm": 0.2773581084711976, + "learning_rate": 9.968898075663995e-05, + "loss": 3.4868, + "step": 6378 + }, + { + "epoch": 0.3959898193556397, + "grad_norm": 0.21209583309494504, + "learning_rate": 9.968857843103573e-05, + "loss": 3.5125, + "step": 6379 + }, + { + "epoch": 0.3960518964553976, + "grad_norm": 0.3434170606152383, + "learning_rate": 9.968817584619417e-05, + "loss": 3.4684, + "step": 6380 + }, + { + "epoch": 0.3961139735551555, + "grad_norm": 0.32864793478550325, + "learning_rate": 9.968777300211731e-05, + "loss": 3.6381, + "step": 6381 + }, + { + "epoch": 0.3961760506549134, + "grad_norm": 0.303274042809375, + "learning_rate": 9.968736989880729e-05, + "loss": 3.5627, + "step": 6382 + }, + { + "epoch": 0.3962381277546713, + "grad_norm": 0.2726680009818209, + "learning_rate": 9.968696653626619e-05, + "loss": 3.4544, + "step": 6383 + }, + { + "epoch": 0.3963002048544292, + "grad_norm": 0.31069159584438183, + "learning_rate": 9.968656291449612e-05, + "loss": 3.482, + "step": 6384 + }, + { + "epoch": 0.3963622819541871, + "grad_norm": 0.2487720941431178, + "learning_rate": 9.96861590334992e-05, + "loss": 3.5481, + "step": 6385 + }, + { + "epoch": 0.396424359053945, + "grad_norm": 0.3580216134049626, + "learning_rate": 9.968575489327752e-05, + "loss": 3.4169, + "step": 6386 + }, + { + "epoch": 0.3964864361537029, + "grad_norm": 0.3473459101621353, + "learning_rate": 9.968535049383318e-05, + "loss": 3.5267, + "step": 6387 + }, + { + "epoch": 0.3965485132534608, + "grad_norm": 0.3732041631520066, + "learning_rate": 9.968494583516833e-05, + "loss": 3.5002, + "step": 6388 + }, + { + "epoch": 0.3966105903532187, + "grad_norm": 0.2609218449950365, + "learning_rate": 9.968454091728505e-05, + "loss": 3.4756, + "step": 6389 + }, + { + "epoch": 0.3966726674529766, + "grad_norm": 0.41315149458908357, + "learning_rate": 9.968413574018545e-05, + "loss": 3.3674, + "step": 6390 + }, + { + "epoch": 0.3967347445527345, + "grad_norm": 0.3668782169389463, + "learning_rate": 9.968373030387166e-05, + "loss": 3.4508, + "step": 6391 + }, + { + "epoch": 0.3967968216524924, + "grad_norm": 0.2527795736469733, + "learning_rate": 9.968332460834579e-05, + "loss": 3.6405, + "step": 6392 + }, + { + "epoch": 0.3968588987522503, + "grad_norm": 0.3747139077826919, + "learning_rate": 9.968291865360994e-05, + "loss": 3.5883, + "step": 6393 + }, + { + "epoch": 0.3969209758520082, + "grad_norm": 0.2684410825941049, + "learning_rate": 9.968251243966626e-05, + "loss": 3.524, + "step": 6394 + }, + { + "epoch": 0.3969830529517661, + "grad_norm": 0.4021695139374838, + "learning_rate": 9.968210596651686e-05, + "loss": 3.4545, + "step": 6395 + }, + { + "epoch": 0.397045130051524, + "grad_norm": 0.37215628844615006, + "learning_rate": 9.968169923416383e-05, + "loss": 3.4872, + "step": 6396 + }, + { + "epoch": 0.3971072071512819, + "grad_norm": 0.29356966062977385, + "learning_rate": 9.968129224260931e-05, + "loss": 3.4708, + "step": 6397 + }, + { + "epoch": 0.3971692842510398, + "grad_norm": 0.38726153958158643, + "learning_rate": 9.968088499185545e-05, + "loss": 3.4442, + "step": 6398 + }, + { + "epoch": 0.3972313613507977, + "grad_norm": 0.33832061838035554, + "learning_rate": 9.968047748190434e-05, + "loss": 3.608, + "step": 6399 + }, + { + "epoch": 0.3972934384505556, + "grad_norm": 0.22454049256640082, + "learning_rate": 9.968006971275813e-05, + "loss": 3.4787, + "step": 6400 + }, + { + "epoch": 0.3973555155503135, + "grad_norm": 0.2569464529162313, + "learning_rate": 9.967966168441892e-05, + "loss": 3.4236, + "step": 6401 + }, + { + "epoch": 0.39741759265007137, + "grad_norm": 0.21390682880777498, + "learning_rate": 9.967925339688886e-05, + "loss": 3.4999, + "step": 6402 + }, + { + "epoch": 0.3974796697498293, + "grad_norm": 0.34134706599618914, + "learning_rate": 9.967884485017008e-05, + "loss": 3.4169, + "step": 6403 + }, + { + "epoch": 0.3975417468495872, + "grad_norm": 0.2343968381244512, + "learning_rate": 9.96784360442647e-05, + "loss": 3.5327, + "step": 6404 + }, + { + "epoch": 0.39760382394934507, + "grad_norm": 0.2906032815632099, + "learning_rate": 9.967802697917488e-05, + "loss": 3.3735, + "step": 6405 + }, + { + "epoch": 0.397665901049103, + "grad_norm": 0.2179401255855039, + "learning_rate": 9.967761765490272e-05, + "loss": 3.5093, + "step": 6406 + }, + { + "epoch": 0.3977279781488609, + "grad_norm": 0.2411780778997315, + "learning_rate": 9.967720807145036e-05, + "loss": 3.4681, + "step": 6407 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 0.1989161399636036, + "learning_rate": 9.967679822881994e-05, + "loss": 3.534, + "step": 6408 + }, + { + "epoch": 0.3978521323483767, + "grad_norm": 0.19387053347483935, + "learning_rate": 9.967638812701361e-05, + "loss": 3.4772, + "step": 6409 + }, + { + "epoch": 0.3979142094481346, + "grad_norm": 0.25908477011051434, + "learning_rate": 9.967597776603353e-05, + "loss": 3.633, + "step": 6410 + }, + { + "epoch": 0.39797628654789247, + "grad_norm": 0.31527068202024916, + "learning_rate": 9.967556714588178e-05, + "loss": 3.5319, + "step": 6411 + }, + { + "epoch": 0.3980383636476504, + "grad_norm": 0.18806239066876687, + "learning_rate": 9.967515626656055e-05, + "loss": 3.4596, + "step": 6412 + }, + { + "epoch": 0.3981004407474083, + "grad_norm": 0.24696821432768729, + "learning_rate": 9.967474512807198e-05, + "loss": 3.5713, + "step": 6413 + }, + { + "epoch": 0.39816251784716616, + "grad_norm": 0.26889978603859555, + "learning_rate": 9.967433373041819e-05, + "loss": 3.5309, + "step": 6414 + }, + { + "epoch": 0.3982245949469241, + "grad_norm": 0.31363585990995985, + "learning_rate": 9.967392207360136e-05, + "loss": 3.5476, + "step": 6415 + }, + { + "epoch": 0.398286672046682, + "grad_norm": 0.28737425749534734, + "learning_rate": 9.96735101576236e-05, + "loss": 3.4978, + "step": 6416 + }, + { + "epoch": 0.39834874914643986, + "grad_norm": 0.29259590583829237, + "learning_rate": 9.967309798248708e-05, + "loss": 3.4953, + "step": 6417 + }, + { + "epoch": 0.3984108262461978, + "grad_norm": 0.2758737884316849, + "learning_rate": 9.967268554819395e-05, + "loss": 3.4172, + "step": 6418 + }, + { + "epoch": 0.3984729033459557, + "grad_norm": 0.35122932965490006, + "learning_rate": 9.967227285474638e-05, + "loss": 3.4163, + "step": 6419 + }, + { + "epoch": 0.39853498044571356, + "grad_norm": 0.2682460467881845, + "learning_rate": 9.967185990214649e-05, + "loss": 3.4534, + "step": 6420 + }, + { + "epoch": 0.3985970575454715, + "grad_norm": 0.2889543408299332, + "learning_rate": 9.967144669039645e-05, + "loss": 3.4329, + "step": 6421 + }, + { + "epoch": 0.3986591346452294, + "grad_norm": 0.34499721330747346, + "learning_rate": 9.967103321949842e-05, + "loss": 3.5646, + "step": 6422 + }, + { + "epoch": 0.39872121174498726, + "grad_norm": 0.2918541623504774, + "learning_rate": 9.967061948945455e-05, + "loss": 3.5445, + "step": 6423 + }, + { + "epoch": 0.3987832888447452, + "grad_norm": 0.4494268184278981, + "learning_rate": 9.9670205500267e-05, + "loss": 3.4, + "step": 6424 + }, + { + "epoch": 0.3988453659445031, + "grad_norm": 0.26204103582371674, + "learning_rate": 9.966979125193791e-05, + "loss": 3.5247, + "step": 6425 + }, + { + "epoch": 0.39890744304426096, + "grad_norm": 0.2826733336339079, + "learning_rate": 9.96693767444695e-05, + "loss": 3.4071, + "step": 6426 + }, + { + "epoch": 0.3989695201440189, + "grad_norm": 0.3673281096029605, + "learning_rate": 9.966896197786386e-05, + "loss": 3.5846, + "step": 6427 + }, + { + "epoch": 0.3990315972437768, + "grad_norm": 0.2818717574252363, + "learning_rate": 9.96685469521232e-05, + "loss": 3.5769, + "step": 6428 + }, + { + "epoch": 0.39909367434353465, + "grad_norm": 0.29160041425597694, + "learning_rate": 9.966813166724969e-05, + "loss": 3.4809, + "step": 6429 + }, + { + "epoch": 0.3991557514432926, + "grad_norm": 0.21774888862497052, + "learning_rate": 9.966771612324545e-05, + "loss": 3.5217, + "step": 6430 + }, + { + "epoch": 0.3992178285430505, + "grad_norm": 0.30981023258012846, + "learning_rate": 9.966730032011268e-05, + "loss": 3.3859, + "step": 6431 + }, + { + "epoch": 0.39927990564280835, + "grad_norm": 0.2654086101098712, + "learning_rate": 9.966688425785358e-05, + "loss": 3.4988, + "step": 6432 + }, + { + "epoch": 0.39934198274256627, + "grad_norm": 0.2595648742749062, + "learning_rate": 9.966646793647026e-05, + "loss": 3.5008, + "step": 6433 + }, + { + "epoch": 0.3994040598423242, + "grad_norm": 0.26764348199501925, + "learning_rate": 9.966605135596492e-05, + "loss": 3.5812, + "step": 6434 + }, + { + "epoch": 0.39946613694208205, + "grad_norm": 0.259965934103531, + "learning_rate": 9.966563451633974e-05, + "loss": 3.4637, + "step": 6435 + }, + { + "epoch": 0.39952821404183997, + "grad_norm": 0.213861656988392, + "learning_rate": 9.96652174175969e-05, + "loss": 3.6412, + "step": 6436 + }, + { + "epoch": 0.3995902911415979, + "grad_norm": 0.2610294317152242, + "learning_rate": 9.966480005973854e-05, + "loss": 3.4256, + "step": 6437 + }, + { + "epoch": 0.39965236824135575, + "grad_norm": 0.2596850162575537, + "learning_rate": 9.966438244276687e-05, + "loss": 3.4496, + "step": 6438 + }, + { + "epoch": 0.39971444534111367, + "grad_norm": 0.2528517614087154, + "learning_rate": 9.966396456668406e-05, + "loss": 3.4744, + "step": 6439 + }, + { + "epoch": 0.3997765224408716, + "grad_norm": 0.24052339706249568, + "learning_rate": 9.96635464314923e-05, + "loss": 3.6358, + "step": 6440 + }, + { + "epoch": 0.39983859954062945, + "grad_norm": 0.2204632594981475, + "learning_rate": 9.966312803719376e-05, + "loss": 3.43, + "step": 6441 + }, + { + "epoch": 0.39990067664038736, + "grad_norm": 0.26622293995187285, + "learning_rate": 9.966270938379061e-05, + "loss": 3.5133, + "step": 6442 + }, + { + "epoch": 0.3999627537401453, + "grad_norm": 0.20491114585034362, + "learning_rate": 9.966229047128505e-05, + "loss": 3.5907, + "step": 6443 + }, + { + "epoch": 0.40002483083990314, + "grad_norm": 0.2801799950693315, + "learning_rate": 9.966187129967929e-05, + "loss": 3.4439, + "step": 6444 + }, + { + "epoch": 0.40008690793966106, + "grad_norm": 0.26412814187827355, + "learning_rate": 9.966145186897546e-05, + "loss": 3.5255, + "step": 6445 + }, + { + "epoch": 0.400148985039419, + "grad_norm": 0.3869169286152856, + "learning_rate": 9.966103217917581e-05, + "loss": 3.5488, + "step": 6446 + }, + { + "epoch": 0.40021106213917684, + "grad_norm": 0.37226910424382864, + "learning_rate": 9.966061223028248e-05, + "loss": 3.5683, + "step": 6447 + }, + { + "epoch": 0.40027313923893476, + "grad_norm": 0.3923812676300124, + "learning_rate": 9.966019202229768e-05, + "loss": 3.4707, + "step": 6448 + }, + { + "epoch": 0.4003352163386927, + "grad_norm": 0.3637616662320898, + "learning_rate": 9.965977155522362e-05, + "loss": 3.3855, + "step": 6449 + }, + { + "epoch": 0.40039729343845054, + "grad_norm": 0.3398050198464019, + "learning_rate": 9.965935082906246e-05, + "loss": 3.4387, + "step": 6450 + }, + { + "epoch": 0.40045937053820846, + "grad_norm": 0.39596168001643195, + "learning_rate": 9.965892984381642e-05, + "loss": 3.5109, + "step": 6451 + }, + { + "epoch": 0.4005214476379664, + "grad_norm": 0.3522086245980377, + "learning_rate": 9.965850859948768e-05, + "loss": 3.5217, + "step": 6452 + }, + { + "epoch": 0.40058352473772424, + "grad_norm": 0.3576338717757878, + "learning_rate": 9.965808709607845e-05, + "loss": 3.4496, + "step": 6453 + }, + { + "epoch": 0.40064560183748216, + "grad_norm": 0.3357615786506592, + "learning_rate": 9.965766533359093e-05, + "loss": 3.4738, + "step": 6454 + }, + { + "epoch": 0.4007076789372401, + "grad_norm": 0.2830611563692991, + "learning_rate": 9.96572433120273e-05, + "loss": 3.4725, + "step": 6455 + }, + { + "epoch": 0.40076975603699794, + "grad_norm": 0.3612894604590302, + "learning_rate": 9.965682103138979e-05, + "loss": 3.5465, + "step": 6456 + }, + { + "epoch": 0.40083183313675586, + "grad_norm": 0.23344097787240034, + "learning_rate": 9.965639849168058e-05, + "loss": 3.4565, + "step": 6457 + }, + { + "epoch": 0.4008939102365138, + "grad_norm": 0.39668916947317173, + "learning_rate": 9.96559756929019e-05, + "loss": 3.4803, + "step": 6458 + }, + { + "epoch": 0.40095598733627164, + "grad_norm": 0.2414272126413918, + "learning_rate": 9.965555263505593e-05, + "loss": 3.4489, + "step": 6459 + }, + { + "epoch": 0.40101806443602955, + "grad_norm": 0.46555602375636435, + "learning_rate": 9.965512931814489e-05, + "loss": 3.5456, + "step": 6460 + }, + { + "epoch": 0.40108014153578747, + "grad_norm": 0.35746712736903163, + "learning_rate": 9.965470574217098e-05, + "loss": 3.5509, + "step": 6461 + }, + { + "epoch": 0.40114221863554533, + "grad_norm": 0.27343117842860026, + "learning_rate": 9.965428190713642e-05, + "loss": 3.4858, + "step": 6462 + }, + { + "epoch": 0.40120429573530325, + "grad_norm": 0.3181485958566874, + "learning_rate": 9.965385781304342e-05, + "loss": 3.4631, + "step": 6463 + }, + { + "epoch": 0.40126637283506117, + "grad_norm": 0.24964045956380462, + "learning_rate": 9.96534334598942e-05, + "loss": 3.4975, + "step": 6464 + }, + { + "epoch": 0.40132844993481903, + "grad_norm": 0.3274689200628658, + "learning_rate": 9.965300884769095e-05, + "loss": 3.5351, + "step": 6465 + }, + { + "epoch": 0.40139052703457695, + "grad_norm": 0.3186613067011748, + "learning_rate": 9.965258397643588e-05, + "loss": 3.5255, + "step": 6466 + }, + { + "epoch": 0.40145260413433487, + "grad_norm": 0.2406230588933753, + "learning_rate": 9.965215884613126e-05, + "loss": 3.4516, + "step": 6467 + }, + { + "epoch": 0.40151468123409273, + "grad_norm": 0.3018582371387631, + "learning_rate": 9.965173345677926e-05, + "loss": 3.4991, + "step": 6468 + }, + { + "epoch": 0.40157675833385065, + "grad_norm": 0.44255837163250816, + "learning_rate": 9.965130780838212e-05, + "loss": 3.4845, + "step": 6469 + }, + { + "epoch": 0.40163883543360857, + "grad_norm": 0.23910800369877552, + "learning_rate": 9.965088190094205e-05, + "loss": 3.5664, + "step": 6470 + }, + { + "epoch": 0.4017009125333664, + "grad_norm": 0.299575220819669, + "learning_rate": 9.965045573446127e-05, + "loss": 3.3584, + "step": 6471 + }, + { + "epoch": 0.40176298963312435, + "grad_norm": 0.24640070980957904, + "learning_rate": 9.9650029308942e-05, + "loss": 3.4555, + "step": 6472 + }, + { + "epoch": 0.40182506673288226, + "grad_norm": 0.2745301079332786, + "learning_rate": 9.964960262438649e-05, + "loss": 3.5941, + "step": 6473 + }, + { + "epoch": 0.4018871438326401, + "grad_norm": 0.25838686801098076, + "learning_rate": 9.964917568079696e-05, + "loss": 3.5425, + "step": 6474 + }, + { + "epoch": 0.40194922093239804, + "grad_norm": 0.5000621935787819, + "learning_rate": 9.964874847817562e-05, + "loss": 3.459, + "step": 6475 + }, + { + "epoch": 0.40201129803215596, + "grad_norm": 0.26577443239145865, + "learning_rate": 9.964832101652469e-05, + "loss": 3.5786, + "step": 6476 + }, + { + "epoch": 0.4020733751319138, + "grad_norm": 0.3367393482926584, + "learning_rate": 9.964789329584642e-05, + "loss": 3.554, + "step": 6477 + }, + { + "epoch": 0.40213545223167174, + "grad_norm": 0.33911575523216547, + "learning_rate": 9.964746531614307e-05, + "loss": 3.4258, + "step": 6478 + }, + { + "epoch": 0.40219752933142966, + "grad_norm": 0.2675188931807552, + "learning_rate": 9.964703707741681e-05, + "loss": 3.4786, + "step": 6479 + }, + { + "epoch": 0.4022596064311875, + "grad_norm": 0.3420071200471766, + "learning_rate": 9.964660857966991e-05, + "loss": 3.553, + "step": 6480 + }, + { + "epoch": 0.40232168353094544, + "grad_norm": 0.30825434502094035, + "learning_rate": 9.96461798229046e-05, + "loss": 3.5705, + "step": 6481 + }, + { + "epoch": 0.40238376063070336, + "grad_norm": 0.4515937055213299, + "learning_rate": 9.964575080712312e-05, + "loss": 3.5147, + "step": 6482 + }, + { + "epoch": 0.4024458377304612, + "grad_norm": 0.3680697049514102, + "learning_rate": 9.964532153232771e-05, + "loss": 3.4776, + "step": 6483 + }, + { + "epoch": 0.40250791483021914, + "grad_norm": 0.2690540179910529, + "learning_rate": 9.96448919985206e-05, + "loss": 3.447, + "step": 6484 + }, + { + "epoch": 0.40256999192997706, + "grad_norm": 0.2743653630929982, + "learning_rate": 9.964446220570405e-05, + "loss": 3.471, + "step": 6485 + }, + { + "epoch": 0.4026320690297349, + "grad_norm": 0.24406501362100871, + "learning_rate": 9.964403215388028e-05, + "loss": 3.3984, + "step": 6486 + }, + { + "epoch": 0.40269414612949284, + "grad_norm": 0.47217546395918125, + "learning_rate": 9.964360184305155e-05, + "loss": 3.5215, + "step": 6487 + }, + { + "epoch": 0.40275622322925075, + "grad_norm": 0.37284200345350405, + "learning_rate": 9.964317127322009e-05, + "loss": 3.4597, + "step": 6488 + }, + { + "epoch": 0.4028183003290086, + "grad_norm": 0.34790511278662006, + "learning_rate": 9.964274044438815e-05, + "loss": 3.5079, + "step": 6489 + }, + { + "epoch": 0.40288037742876653, + "grad_norm": 0.45608525306494657, + "learning_rate": 9.964230935655798e-05, + "loss": 3.5389, + "step": 6490 + }, + { + "epoch": 0.40294245452852445, + "grad_norm": 0.35791188404868385, + "learning_rate": 9.964187800973183e-05, + "loss": 3.4773, + "step": 6491 + }, + { + "epoch": 0.4030045316282823, + "grad_norm": 0.28299655615493824, + "learning_rate": 9.964144640391198e-05, + "loss": 3.5255, + "step": 6492 + }, + { + "epoch": 0.40306660872804023, + "grad_norm": 0.26300718095806425, + "learning_rate": 9.964101453910061e-05, + "loss": 3.4514, + "step": 6493 + }, + { + "epoch": 0.40312868582779815, + "grad_norm": 0.3769871592598897, + "learning_rate": 9.964058241530006e-05, + "loss": 3.4898, + "step": 6494 + }, + { + "epoch": 0.403190762927556, + "grad_norm": 0.2977894452468843, + "learning_rate": 9.96401500325125e-05, + "loss": 3.4538, + "step": 6495 + }, + { + "epoch": 0.40325284002731393, + "grad_norm": 0.26282211703418634, + "learning_rate": 9.963971739074025e-05, + "loss": 3.453, + "step": 6496 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 0.21749387853054036, + "learning_rate": 9.963928448998554e-05, + "loss": 3.513, + "step": 6497 + }, + { + "epoch": 0.4033769942268297, + "grad_norm": 0.2538123910993405, + "learning_rate": 9.963885133025064e-05, + "loss": 3.5621, + "step": 6498 + }, + { + "epoch": 0.40343907132658763, + "grad_norm": 0.34381636065722, + "learning_rate": 9.963841791153778e-05, + "loss": 3.4807, + "step": 6499 + }, + { + "epoch": 0.40350114842634555, + "grad_norm": 0.23835157701107845, + "learning_rate": 9.963798423384927e-05, + "loss": 3.4408, + "step": 6500 + }, + { + "epoch": 0.4035632255261034, + "grad_norm": 0.2558070850083458, + "learning_rate": 9.963755029718733e-05, + "loss": 3.5897, + "step": 6501 + }, + { + "epoch": 0.4036253026258613, + "grad_norm": 0.2287509391327638, + "learning_rate": 9.963711610155424e-05, + "loss": 3.4612, + "step": 6502 + }, + { + "epoch": 0.40368737972561924, + "grad_norm": 0.3140021104779661, + "learning_rate": 9.963668164695228e-05, + "loss": 3.5654, + "step": 6503 + }, + { + "epoch": 0.4037494568253771, + "grad_norm": 0.23243633994602553, + "learning_rate": 9.963624693338369e-05, + "loss": 3.5349, + "step": 6504 + }, + { + "epoch": 0.403811533925135, + "grad_norm": 0.2186697489126413, + "learning_rate": 9.963581196085075e-05, + "loss": 3.4318, + "step": 6505 + }, + { + "epoch": 0.40387361102489294, + "grad_norm": 0.23047835928070853, + "learning_rate": 9.963537672935574e-05, + "loss": 3.5345, + "step": 6506 + }, + { + "epoch": 0.4039356881246508, + "grad_norm": 0.27553741114354985, + "learning_rate": 9.963494123890091e-05, + "loss": 3.5248, + "step": 6507 + }, + { + "epoch": 0.4039977652244087, + "grad_norm": 0.19570512408912524, + "learning_rate": 9.963450548948854e-05, + "loss": 3.4926, + "step": 6508 + }, + { + "epoch": 0.40405984232416664, + "grad_norm": 0.18597267883963442, + "learning_rate": 9.96340694811209e-05, + "loss": 3.445, + "step": 6509 + }, + { + "epoch": 0.4041219194239245, + "grad_norm": 0.3493153491816045, + "learning_rate": 9.96336332138003e-05, + "loss": 3.4343, + "step": 6510 + }, + { + "epoch": 0.4041839965236824, + "grad_norm": 0.36372416168547334, + "learning_rate": 9.963319668752895e-05, + "loss": 3.3975, + "step": 6511 + }, + { + "epoch": 0.40424607362344034, + "grad_norm": 0.33170586518299533, + "learning_rate": 9.963275990230918e-05, + "loss": 3.5126, + "step": 6512 + }, + { + "epoch": 0.4043081507231982, + "grad_norm": 0.29555516742847726, + "learning_rate": 9.963232285814325e-05, + "loss": 3.4561, + "step": 6513 + }, + { + "epoch": 0.4043702278229561, + "grad_norm": 0.20340246726612518, + "learning_rate": 9.963188555503346e-05, + "loss": 3.5373, + "step": 6514 + }, + { + "epoch": 0.40443230492271404, + "grad_norm": 0.2717951909578043, + "learning_rate": 9.963144799298205e-05, + "loss": 3.5057, + "step": 6515 + }, + { + "epoch": 0.4044943820224719, + "grad_norm": 0.22413121618212806, + "learning_rate": 9.963101017199135e-05, + "loss": 3.473, + "step": 6516 + }, + { + "epoch": 0.4045564591222298, + "grad_norm": 0.22445745720883253, + "learning_rate": 9.963057209206362e-05, + "loss": 3.4265, + "step": 6517 + }, + { + "epoch": 0.40461853622198773, + "grad_norm": 0.25177631532137806, + "learning_rate": 9.963013375320112e-05, + "loss": 3.4375, + "step": 6518 + }, + { + "epoch": 0.4046806133217456, + "grad_norm": 0.3327720232424264, + "learning_rate": 9.962969515540618e-05, + "loss": 3.5176, + "step": 6519 + }, + { + "epoch": 0.4047426904215035, + "grad_norm": 0.35273405478495684, + "learning_rate": 9.962925629868108e-05, + "loss": 3.476, + "step": 6520 + }, + { + "epoch": 0.40480476752126143, + "grad_norm": 0.2882313119077563, + "learning_rate": 9.96288171830281e-05, + "loss": 3.4131, + "step": 6521 + }, + { + "epoch": 0.4048668446210193, + "grad_norm": 0.24727111316152656, + "learning_rate": 9.962837780844953e-05, + "loss": 3.4789, + "step": 6522 + }, + { + "epoch": 0.4049289217207772, + "grad_norm": 0.27392764439348694, + "learning_rate": 9.962793817494767e-05, + "loss": 3.4441, + "step": 6523 + }, + { + "epoch": 0.40499099882053513, + "grad_norm": 0.4099713771703033, + "learning_rate": 9.96274982825248e-05, + "loss": 3.4791, + "step": 6524 + }, + { + "epoch": 0.405053075920293, + "grad_norm": 0.29626834522859674, + "learning_rate": 9.962705813118323e-05, + "loss": 3.4835, + "step": 6525 + }, + { + "epoch": 0.4051151530200509, + "grad_norm": 0.24379738139314042, + "learning_rate": 9.962661772092523e-05, + "loss": 3.4261, + "step": 6526 + }, + { + "epoch": 0.40517723011980883, + "grad_norm": 0.3735419179266989, + "learning_rate": 9.962617705175314e-05, + "loss": 3.4426, + "step": 6527 + }, + { + "epoch": 0.4052393072195667, + "grad_norm": 0.39415536102482596, + "learning_rate": 9.962573612366923e-05, + "loss": 3.6193, + "step": 6528 + }, + { + "epoch": 0.4053013843193246, + "grad_norm": 0.29444021087852834, + "learning_rate": 9.962529493667581e-05, + "loss": 3.4662, + "step": 6529 + }, + { + "epoch": 0.4053634614190825, + "grad_norm": 0.23553772049759783, + "learning_rate": 9.962485349077519e-05, + "loss": 3.5527, + "step": 6530 + }, + { + "epoch": 0.4054255385188404, + "grad_norm": 0.21753387788857523, + "learning_rate": 9.962441178596964e-05, + "loss": 3.6096, + "step": 6531 + }, + { + "epoch": 0.4054876156185983, + "grad_norm": 0.23848014878834223, + "learning_rate": 9.96239698222615e-05, + "loss": 3.521, + "step": 6532 + }, + { + "epoch": 0.4055496927183562, + "grad_norm": 0.22656594471121438, + "learning_rate": 9.962352759965307e-05, + "loss": 3.4454, + "step": 6533 + }, + { + "epoch": 0.4056117698181141, + "grad_norm": 0.2858074669051737, + "learning_rate": 9.962308511814664e-05, + "loss": 3.4256, + "step": 6534 + }, + { + "epoch": 0.405673846917872, + "grad_norm": 0.19981425235410633, + "learning_rate": 9.962264237774452e-05, + "loss": 3.4329, + "step": 6535 + }, + { + "epoch": 0.4057359240176299, + "grad_norm": 0.327050550249439, + "learning_rate": 9.962219937844904e-05, + "loss": 3.4854, + "step": 6536 + }, + { + "epoch": 0.4057980011173878, + "grad_norm": 0.26004104067975603, + "learning_rate": 9.96217561202625e-05, + "loss": 3.5179, + "step": 6537 + }, + { + "epoch": 0.4058600782171457, + "grad_norm": 0.4671449201693712, + "learning_rate": 9.96213126031872e-05, + "loss": 3.5096, + "step": 6538 + }, + { + "epoch": 0.4059221553169036, + "grad_norm": 0.37104843264491494, + "learning_rate": 9.96208688272255e-05, + "loss": 3.4567, + "step": 6539 + }, + { + "epoch": 0.4059842324166615, + "grad_norm": 0.25199567450300325, + "learning_rate": 9.962042479237966e-05, + "loss": 3.3494, + "step": 6540 + }, + { + "epoch": 0.4060463095164194, + "grad_norm": 0.3591497378700316, + "learning_rate": 9.961998049865199e-05, + "loss": 3.4712, + "step": 6541 + }, + { + "epoch": 0.4061083866161773, + "grad_norm": 0.27005010228302545, + "learning_rate": 9.961953594604488e-05, + "loss": 3.5777, + "step": 6542 + }, + { + "epoch": 0.4061704637159352, + "grad_norm": 0.38028372274085925, + "learning_rate": 9.961909113456059e-05, + "loss": 3.4146, + "step": 6543 + }, + { + "epoch": 0.4062325408156931, + "grad_norm": 0.301206449213774, + "learning_rate": 9.961864606420145e-05, + "loss": 3.4543, + "step": 6544 + }, + { + "epoch": 0.406294617915451, + "grad_norm": 0.2611397701640424, + "learning_rate": 9.961820073496979e-05, + "loss": 3.5533, + "step": 6545 + }, + { + "epoch": 0.4063566950152089, + "grad_norm": 0.21800711201902304, + "learning_rate": 9.961775514686793e-05, + "loss": 3.5289, + "step": 6546 + }, + { + "epoch": 0.4064187721149668, + "grad_norm": 0.22745197905440276, + "learning_rate": 9.961730929989821e-05, + "loss": 3.4867, + "step": 6547 + }, + { + "epoch": 0.4064808492147247, + "grad_norm": 0.3205327811735017, + "learning_rate": 9.961686319406293e-05, + "loss": 3.4777, + "step": 6548 + }, + { + "epoch": 0.4065429263144826, + "grad_norm": 0.32123524641231166, + "learning_rate": 9.961641682936442e-05, + "loss": 3.4313, + "step": 6549 + }, + { + "epoch": 0.4066050034142405, + "grad_norm": 0.2712760510218359, + "learning_rate": 9.961597020580505e-05, + "loss": 3.5433, + "step": 6550 + }, + { + "epoch": 0.4066670805139984, + "grad_norm": 0.28374762971030887, + "learning_rate": 9.961552332338708e-05, + "loss": 3.5007, + "step": 6551 + }, + { + "epoch": 0.4067291576137563, + "grad_norm": 0.1974239186376264, + "learning_rate": 9.961507618211289e-05, + "loss": 3.4361, + "step": 6552 + }, + { + "epoch": 0.4067912347135142, + "grad_norm": 0.21284950804213326, + "learning_rate": 9.961462878198482e-05, + "loss": 3.5159, + "step": 6553 + }, + { + "epoch": 0.4068533118132721, + "grad_norm": 0.24687530903857238, + "learning_rate": 9.961418112300518e-05, + "loss": 3.529, + "step": 6554 + }, + { + "epoch": 0.40691538891303, + "grad_norm": 0.2457653079164545, + "learning_rate": 9.961373320517632e-05, + "loss": 3.381, + "step": 6555 + }, + { + "epoch": 0.4069774660127879, + "grad_norm": 0.21089261955069272, + "learning_rate": 9.961328502850055e-05, + "loss": 3.5236, + "step": 6556 + }, + { + "epoch": 0.40703954311254575, + "grad_norm": 0.20455164769037518, + "learning_rate": 9.961283659298026e-05, + "loss": 3.464, + "step": 6557 + }, + { + "epoch": 0.40710162021230367, + "grad_norm": 0.22688976033591562, + "learning_rate": 9.961238789861773e-05, + "loss": 3.5378, + "step": 6558 + }, + { + "epoch": 0.4071636973120616, + "grad_norm": 0.24163449123921416, + "learning_rate": 9.961193894541533e-05, + "loss": 3.4453, + "step": 6559 + }, + { + "epoch": 0.40722577441181945, + "grad_norm": 0.2536224246075287, + "learning_rate": 9.961148973337541e-05, + "loss": 3.4171, + "step": 6560 + }, + { + "epoch": 0.40728785151157737, + "grad_norm": 0.25076011497352535, + "learning_rate": 9.96110402625003e-05, + "loss": 3.3652, + "step": 6561 + }, + { + "epoch": 0.4073499286113353, + "grad_norm": 0.1858116770019423, + "learning_rate": 9.961059053279236e-05, + "loss": 3.5784, + "step": 6562 + }, + { + "epoch": 0.40741200571109315, + "grad_norm": 0.20617806776950734, + "learning_rate": 9.96101405442539e-05, + "loss": 3.4195, + "step": 6563 + }, + { + "epoch": 0.40747408281085107, + "grad_norm": 0.2978216562734889, + "learning_rate": 9.960969029688732e-05, + "loss": 3.492, + "step": 6564 + }, + { + "epoch": 0.407536159910609, + "grad_norm": 0.2971214100007379, + "learning_rate": 9.960923979069494e-05, + "loss": 3.4678, + "step": 6565 + }, + { + "epoch": 0.40759823701036685, + "grad_norm": 0.2505798915927548, + "learning_rate": 9.960878902567909e-05, + "loss": 3.5492, + "step": 6566 + }, + { + "epoch": 0.40766031411012477, + "grad_norm": 0.26637816239121775, + "learning_rate": 9.960833800184217e-05, + "loss": 3.5438, + "step": 6567 + }, + { + "epoch": 0.4077223912098827, + "grad_norm": 0.1913580055159779, + "learning_rate": 9.96078867191865e-05, + "loss": 3.4969, + "step": 6568 + }, + { + "epoch": 0.40778446830964055, + "grad_norm": 0.30160187577833, + "learning_rate": 9.960743517771443e-05, + "loss": 3.3637, + "step": 6569 + }, + { + "epoch": 0.40784654540939846, + "grad_norm": 0.22301390850203287, + "learning_rate": 9.960698337742835e-05, + "loss": 3.4464, + "step": 6570 + }, + { + "epoch": 0.4079086225091564, + "grad_norm": 0.5550794962490897, + "learning_rate": 9.960653131833058e-05, + "loss": 3.4974, + "step": 6571 + }, + { + "epoch": 0.40797069960891424, + "grad_norm": 0.41643479770045266, + "learning_rate": 9.96060790004235e-05, + "loss": 3.4417, + "step": 6572 + }, + { + "epoch": 0.40803277670867216, + "grad_norm": 0.29211354365183806, + "learning_rate": 9.960562642370945e-05, + "loss": 3.5766, + "step": 6573 + }, + { + "epoch": 0.4080948538084301, + "grad_norm": 0.37561076811679656, + "learning_rate": 9.960517358819082e-05, + "loss": 3.3767, + "step": 6574 + }, + { + "epoch": 0.40815693090818794, + "grad_norm": 0.3250256803343634, + "learning_rate": 9.960472049386995e-05, + "loss": 3.4124, + "step": 6575 + }, + { + "epoch": 0.40821900800794586, + "grad_norm": 0.3097231153877753, + "learning_rate": 9.960426714074922e-05, + "loss": 3.4806, + "step": 6576 + }, + { + "epoch": 0.4082810851077038, + "grad_norm": 0.3631846186599569, + "learning_rate": 9.960381352883099e-05, + "loss": 3.5067, + "step": 6577 + }, + { + "epoch": 0.40834316220746164, + "grad_norm": 0.4615048295385195, + "learning_rate": 9.960335965811761e-05, + "loss": 3.4623, + "step": 6578 + }, + { + "epoch": 0.40840523930721956, + "grad_norm": 0.34030225670060926, + "learning_rate": 9.960290552861147e-05, + "loss": 3.4996, + "step": 6579 + }, + { + "epoch": 0.4084673164069775, + "grad_norm": 0.3104597605271114, + "learning_rate": 9.960245114031492e-05, + "loss": 3.5069, + "step": 6580 + }, + { + "epoch": 0.40852939350673534, + "grad_norm": 0.24764123173132163, + "learning_rate": 9.960199649323034e-05, + "loss": 3.4965, + "step": 6581 + }, + { + "epoch": 0.40859147060649326, + "grad_norm": 0.26481002716699736, + "learning_rate": 9.960154158736012e-05, + "loss": 3.4096, + "step": 6582 + }, + { + "epoch": 0.4086535477062512, + "grad_norm": 0.2102817214022877, + "learning_rate": 9.960108642270661e-05, + "loss": 3.4554, + "step": 6583 + }, + { + "epoch": 0.40871562480600904, + "grad_norm": 0.27884270347476636, + "learning_rate": 9.960063099927217e-05, + "loss": 3.4787, + "step": 6584 + }, + { + "epoch": 0.40877770190576695, + "grad_norm": 0.2351798939145306, + "learning_rate": 9.960017531705922e-05, + "loss": 3.569, + "step": 6585 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 0.32011380662232475, + "learning_rate": 9.959971937607011e-05, + "loss": 3.5514, + "step": 6586 + }, + { + "epoch": 0.40890185610528273, + "grad_norm": 0.4937994763175718, + "learning_rate": 9.959926317630721e-05, + "loss": 3.4271, + "step": 6587 + }, + { + "epoch": 0.40896393320504065, + "grad_norm": 0.30180120310493025, + "learning_rate": 9.959880671777294e-05, + "loss": 3.4366, + "step": 6588 + }, + { + "epoch": 0.40902601030479857, + "grad_norm": 0.3439052079673277, + "learning_rate": 9.959835000046964e-05, + "loss": 3.441, + "step": 6589 + }, + { + "epoch": 0.40908808740455643, + "grad_norm": 0.3925292520799456, + "learning_rate": 9.95978930243997e-05, + "loss": 3.4519, + "step": 6590 + }, + { + "epoch": 0.40915016450431435, + "grad_norm": 0.42094883097101043, + "learning_rate": 9.959743578956551e-05, + "loss": 3.445, + "step": 6591 + }, + { + "epoch": 0.40921224160407227, + "grad_norm": 0.5385528587885219, + "learning_rate": 9.959697829596948e-05, + "loss": 3.4978, + "step": 6592 + }, + { + "epoch": 0.40927431870383013, + "grad_norm": 0.3592210315434436, + "learning_rate": 9.959652054361395e-05, + "loss": 3.3816, + "step": 6593 + }, + { + "epoch": 0.40933639580358805, + "grad_norm": 0.4836605555867772, + "learning_rate": 9.959606253250134e-05, + "loss": 3.4715, + "step": 6594 + }, + { + "epoch": 0.40939847290334597, + "grad_norm": 1.0165176576843582, + "learning_rate": 9.959560426263404e-05, + "loss": 3.4889, + "step": 6595 + }, + { + "epoch": 0.40946055000310383, + "grad_norm": 0.8578956702653465, + "learning_rate": 9.959514573401443e-05, + "loss": 3.4986, + "step": 6596 + }, + { + "epoch": 0.40952262710286175, + "grad_norm": 0.6195324641597054, + "learning_rate": 9.95946869466449e-05, + "loss": 3.5094, + "step": 6597 + }, + { + "epoch": 0.40958470420261966, + "grad_norm": 0.46738345906921885, + "learning_rate": 9.959422790052786e-05, + "loss": 3.5065, + "step": 6598 + }, + { + "epoch": 0.4096467813023775, + "grad_norm": 0.5215171203619272, + "learning_rate": 9.959376859566568e-05, + "loss": 3.47, + "step": 6599 + }, + { + "epoch": 0.40970885840213545, + "grad_norm": 1.2972984082387988, + "learning_rate": 9.959330903206077e-05, + "loss": 3.2626, + "step": 6600 + }, + { + "epoch": 0.40977093550189336, + "grad_norm": 0.43258456759471786, + "learning_rate": 9.959284920971553e-05, + "loss": 3.4866, + "step": 6601 + }, + { + "epoch": 0.4098330126016512, + "grad_norm": 0.48208462745747777, + "learning_rate": 9.959238912863236e-05, + "loss": 3.379, + "step": 6602 + }, + { + "epoch": 0.40989508970140914, + "grad_norm": 1.0427650509911635, + "learning_rate": 9.959192878881366e-05, + "loss": 3.5491, + "step": 6603 + }, + { + "epoch": 0.40995716680116706, + "grad_norm": 0.47658136987067695, + "learning_rate": 9.959146819026182e-05, + "loss": 3.4125, + "step": 6604 + }, + { + "epoch": 0.4100192439009249, + "grad_norm": 0.3968991144618783, + "learning_rate": 9.959100733297926e-05, + "loss": 3.606, + "step": 6605 + }, + { + "epoch": 0.41008132100068284, + "grad_norm": 0.33241398014147466, + "learning_rate": 9.959054621696837e-05, + "loss": 3.4992, + "step": 6606 + }, + { + "epoch": 0.41014339810044076, + "grad_norm": 0.5131049563248626, + "learning_rate": 9.959008484223156e-05, + "loss": 3.5409, + "step": 6607 + }, + { + "epoch": 0.4102054752001986, + "grad_norm": 0.4311714384142778, + "learning_rate": 9.958962320877125e-05, + "loss": 3.5516, + "step": 6608 + }, + { + "epoch": 0.41026755229995654, + "grad_norm": 0.47713387740753416, + "learning_rate": 9.958916131658983e-05, + "loss": 3.4731, + "step": 6609 + }, + { + "epoch": 0.41032962939971446, + "grad_norm": 0.3391308267369113, + "learning_rate": 9.958869916568971e-05, + "loss": 3.4729, + "step": 6610 + }, + { + "epoch": 0.4103917064994723, + "grad_norm": 0.5841060498912912, + "learning_rate": 9.958823675607332e-05, + "loss": 3.4772, + "step": 6611 + }, + { + "epoch": 0.41045378359923024, + "grad_norm": 0.42913217205317955, + "learning_rate": 9.958777408774306e-05, + "loss": 3.4947, + "step": 6612 + }, + { + "epoch": 0.41051586069898816, + "grad_norm": 0.3464093726004067, + "learning_rate": 9.958731116070135e-05, + "loss": 3.4974, + "step": 6613 + }, + { + "epoch": 0.410577937798746, + "grad_norm": 0.3311908408185628, + "learning_rate": 9.95868479749506e-05, + "loss": 3.4627, + "step": 6614 + }, + { + "epoch": 0.41064001489850394, + "grad_norm": 0.29670193295583636, + "learning_rate": 9.95863845304932e-05, + "loss": 3.5833, + "step": 6615 + }, + { + "epoch": 0.41070209199826185, + "grad_norm": 0.4332325780021394, + "learning_rate": 9.958592082733161e-05, + "loss": 3.5007, + "step": 6616 + }, + { + "epoch": 0.4107641690980197, + "grad_norm": 0.4102042215540638, + "learning_rate": 9.958545686546824e-05, + "loss": 3.3724, + "step": 6617 + }, + { + "epoch": 0.41082624619777763, + "grad_norm": 0.3002833791302593, + "learning_rate": 9.958499264490551e-05, + "loss": 3.4567, + "step": 6618 + }, + { + "epoch": 0.41088832329753555, + "grad_norm": 0.580006116514182, + "learning_rate": 9.958452816564582e-05, + "loss": 3.5825, + "step": 6619 + }, + { + "epoch": 0.4109504003972934, + "grad_norm": 0.5346691671558345, + "learning_rate": 9.958406342769162e-05, + "loss": 3.4851, + "step": 6620 + }, + { + "epoch": 0.41101247749705133, + "grad_norm": 0.39019278456460693, + "learning_rate": 9.958359843104532e-05, + "loss": 3.487, + "step": 6621 + }, + { + "epoch": 0.41107455459680925, + "grad_norm": 0.4764264491007005, + "learning_rate": 9.958313317570935e-05, + "loss": 3.4879, + "step": 6622 + }, + { + "epoch": 0.4111366316965671, + "grad_norm": 0.3743504620950322, + "learning_rate": 9.958266766168613e-05, + "loss": 3.4372, + "step": 6623 + }, + { + "epoch": 0.41119870879632503, + "grad_norm": 0.3642144266835033, + "learning_rate": 9.95822018889781e-05, + "loss": 3.6179, + "step": 6624 + }, + { + "epoch": 0.41126078589608295, + "grad_norm": 0.31509444192849917, + "learning_rate": 9.95817358575877e-05, + "loss": 3.4836, + "step": 6625 + }, + { + "epoch": 0.4113228629958408, + "grad_norm": 0.36142327370401206, + "learning_rate": 9.958126956751735e-05, + "loss": 3.4421, + "step": 6626 + }, + { + "epoch": 0.41138494009559873, + "grad_norm": 0.33313000582213625, + "learning_rate": 9.958080301876946e-05, + "loss": 3.5541, + "step": 6627 + }, + { + "epoch": 0.41144701719535665, + "grad_norm": 0.5430835904313027, + "learning_rate": 9.958033621134647e-05, + "loss": 3.4996, + "step": 6628 + }, + { + "epoch": 0.4115090942951145, + "grad_norm": 0.4902131429383068, + "learning_rate": 9.957986914525086e-05, + "loss": 3.563, + "step": 6629 + }, + { + "epoch": 0.4115711713948724, + "grad_norm": 0.4346137721586536, + "learning_rate": 9.957940182048504e-05, + "loss": 3.4756, + "step": 6630 + }, + { + "epoch": 0.41163324849463034, + "grad_norm": 0.3562691867413258, + "learning_rate": 9.957893423705144e-05, + "loss": 3.5422, + "step": 6631 + }, + { + "epoch": 0.4116953255943882, + "grad_norm": 0.626524068362467, + "learning_rate": 9.95784663949525e-05, + "loss": 3.6092, + "step": 6632 + }, + { + "epoch": 0.4117574026941461, + "grad_norm": 0.2938368587722367, + "learning_rate": 9.957799829419065e-05, + "loss": 3.4065, + "step": 6633 + }, + { + "epoch": 0.41181947979390404, + "grad_norm": 0.3407037092841093, + "learning_rate": 9.957752993476836e-05, + "loss": 3.5335, + "step": 6634 + }, + { + "epoch": 0.4118815568936619, + "grad_norm": 0.33934285531513164, + "learning_rate": 9.957706131668807e-05, + "loss": 3.4687, + "step": 6635 + }, + { + "epoch": 0.4119436339934198, + "grad_norm": 0.27841439246507, + "learning_rate": 9.957659243995221e-05, + "loss": 3.6006, + "step": 6636 + }, + { + "epoch": 0.41200571109317774, + "grad_norm": 0.4879522664916636, + "learning_rate": 9.957612330456323e-05, + "loss": 3.506, + "step": 6637 + }, + { + "epoch": 0.4120677881929356, + "grad_norm": 0.2936371236395603, + "learning_rate": 9.957565391052358e-05, + "loss": 3.4309, + "step": 6638 + }, + { + "epoch": 0.4121298652926935, + "grad_norm": 0.5047391174186194, + "learning_rate": 9.957518425783572e-05, + "loss": 3.5873, + "step": 6639 + }, + { + "epoch": 0.41219194239245144, + "grad_norm": 0.4307554152431152, + "learning_rate": 9.957471434650207e-05, + "loss": 3.4919, + "step": 6640 + }, + { + "epoch": 0.4122540194922093, + "grad_norm": 0.2780822137003799, + "learning_rate": 9.95742441765251e-05, + "loss": 3.4923, + "step": 6641 + }, + { + "epoch": 0.4123160965919672, + "grad_norm": 0.38806816640173725, + "learning_rate": 9.957377374790729e-05, + "loss": 3.512, + "step": 6642 + }, + { + "epoch": 0.41237817369172514, + "grad_norm": 0.3608406490167245, + "learning_rate": 9.957330306065104e-05, + "loss": 3.5468, + "step": 6643 + }, + { + "epoch": 0.412440250791483, + "grad_norm": 0.3510331831547476, + "learning_rate": 9.957283211475884e-05, + "loss": 3.5245, + "step": 6644 + }, + { + "epoch": 0.4125023278912409, + "grad_norm": 0.30219265861408035, + "learning_rate": 9.957236091023314e-05, + "loss": 3.5802, + "step": 6645 + }, + { + "epoch": 0.41256440499099883, + "grad_norm": 0.260426419393387, + "learning_rate": 9.95718894470764e-05, + "loss": 3.3942, + "step": 6646 + }, + { + "epoch": 0.4126264820907567, + "grad_norm": 0.422515788686186, + "learning_rate": 9.957141772529108e-05, + "loss": 3.5674, + "step": 6647 + }, + { + "epoch": 0.4126885591905146, + "grad_norm": 0.2747047215380566, + "learning_rate": 9.957094574487965e-05, + "loss": 3.4933, + "step": 6648 + }, + { + "epoch": 0.41275063629027253, + "grad_norm": 0.29339042834464113, + "learning_rate": 9.957047350584457e-05, + "loss": 3.5074, + "step": 6649 + }, + { + "epoch": 0.4128127133900304, + "grad_norm": 0.3193009616934693, + "learning_rate": 9.957000100818827e-05, + "loss": 3.4703, + "step": 6650 + }, + { + "epoch": 0.4128747904897883, + "grad_norm": 0.24796985571890567, + "learning_rate": 9.956952825191325e-05, + "loss": 3.38, + "step": 6651 + }, + { + "epoch": 0.41293686758954623, + "grad_norm": 0.3315543733188711, + "learning_rate": 9.956905523702197e-05, + "loss": 3.4991, + "step": 6652 + }, + { + "epoch": 0.4129989446893041, + "grad_norm": 0.27994190771470123, + "learning_rate": 9.956858196351688e-05, + "loss": 3.3756, + "step": 6653 + }, + { + "epoch": 0.413061021789062, + "grad_norm": 0.28560402085702186, + "learning_rate": 9.95681084314005e-05, + "loss": 3.5358, + "step": 6654 + }, + { + "epoch": 0.41312309888881993, + "grad_norm": 0.39932012969063163, + "learning_rate": 9.956763464067523e-05, + "loss": 3.4706, + "step": 6655 + }, + { + "epoch": 0.4131851759885778, + "grad_norm": 0.2595291495424784, + "learning_rate": 9.956716059134359e-05, + "loss": 3.5436, + "step": 6656 + }, + { + "epoch": 0.4132472530883357, + "grad_norm": 0.2496244769236548, + "learning_rate": 9.956668628340803e-05, + "loss": 3.5098, + "step": 6657 + }, + { + "epoch": 0.4133093301880936, + "grad_norm": 0.24533467314064275, + "learning_rate": 9.956621171687106e-05, + "loss": 3.4756, + "step": 6658 + }, + { + "epoch": 0.4133714072878515, + "grad_norm": 0.3111356755452394, + "learning_rate": 9.956573689173512e-05, + "loss": 3.5525, + "step": 6659 + }, + { + "epoch": 0.4134334843876094, + "grad_norm": 0.29679901003791376, + "learning_rate": 9.95652618080027e-05, + "loss": 3.3296, + "step": 6660 + }, + { + "epoch": 0.4134955614873673, + "grad_norm": 0.2758504937318771, + "learning_rate": 9.956478646567626e-05, + "loss": 3.4184, + "step": 6661 + }, + { + "epoch": 0.4135576385871252, + "grad_norm": 0.306209260799948, + "learning_rate": 9.956431086475833e-05, + "loss": 3.4968, + "step": 6662 + }, + { + "epoch": 0.4136197156868831, + "grad_norm": 0.515821852820746, + "learning_rate": 9.956383500525133e-05, + "loss": 3.3901, + "step": 6663 + }, + { + "epoch": 0.413681792786641, + "grad_norm": 0.4270264502380161, + "learning_rate": 9.956335888715778e-05, + "loss": 3.4266, + "step": 6664 + }, + { + "epoch": 0.4137438698863989, + "grad_norm": 0.2913577353870718, + "learning_rate": 9.956288251048015e-05, + "loss": 3.4297, + "step": 6665 + }, + { + "epoch": 0.4138059469861568, + "grad_norm": 0.34401579897662565, + "learning_rate": 9.956240587522093e-05, + "loss": 3.532, + "step": 6666 + }, + { + "epoch": 0.4138680240859147, + "grad_norm": 0.25592856891126436, + "learning_rate": 9.956192898138261e-05, + "loss": 3.5086, + "step": 6667 + }, + { + "epoch": 0.4139301011856726, + "grad_norm": 0.23839406172767977, + "learning_rate": 9.956145182896768e-05, + "loss": 3.3549, + "step": 6668 + }, + { + "epoch": 0.4139921782854305, + "grad_norm": 0.33441162666366747, + "learning_rate": 9.956097441797861e-05, + "loss": 3.4611, + "step": 6669 + }, + { + "epoch": 0.4140542553851884, + "grad_norm": 0.4266122181182237, + "learning_rate": 9.956049674841791e-05, + "loss": 3.5164, + "step": 6670 + }, + { + "epoch": 0.4141163324849463, + "grad_norm": 0.3500960388571862, + "learning_rate": 9.956001882028808e-05, + "loss": 3.4623, + "step": 6671 + }, + { + "epoch": 0.4141784095847042, + "grad_norm": 0.2305329281948956, + "learning_rate": 9.955954063359159e-05, + "loss": 3.4895, + "step": 6672 + }, + { + "epoch": 0.4142404866844621, + "grad_norm": 0.3498788466403938, + "learning_rate": 9.955906218833096e-05, + "loss": 3.4664, + "step": 6673 + }, + { + "epoch": 0.41430256378422, + "grad_norm": 0.27807448389145545, + "learning_rate": 9.955858348450865e-05, + "loss": 3.4595, + "step": 6674 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 0.28571486951989744, + "learning_rate": 9.95581045221272e-05, + "loss": 3.4478, + "step": 6675 + }, + { + "epoch": 0.4144267179837358, + "grad_norm": 0.2463615413688347, + "learning_rate": 9.955762530118907e-05, + "loss": 3.4171, + "step": 6676 + }, + { + "epoch": 0.4144887950834937, + "grad_norm": 0.29571154644160086, + "learning_rate": 9.955714582169677e-05, + "loss": 3.5628, + "step": 6677 + }, + { + "epoch": 0.4145508721832516, + "grad_norm": 0.3243330673537351, + "learning_rate": 9.955666608365284e-05, + "loss": 3.5179, + "step": 6678 + }, + { + "epoch": 0.4146129492830095, + "grad_norm": 0.31852256945107943, + "learning_rate": 9.955618608705972e-05, + "loss": 3.5171, + "step": 6679 + }, + { + "epoch": 0.4146750263827674, + "grad_norm": 0.28816498787157246, + "learning_rate": 9.955570583191998e-05, + "loss": 3.5154, + "step": 6680 + }, + { + "epoch": 0.4147371034825253, + "grad_norm": 0.37432056706338085, + "learning_rate": 9.955522531823607e-05, + "loss": 3.4767, + "step": 6681 + }, + { + "epoch": 0.4147991805822832, + "grad_norm": 0.510894606770086, + "learning_rate": 9.955474454601053e-05, + "loss": 3.4449, + "step": 6682 + }, + { + "epoch": 0.4148612576820411, + "grad_norm": 0.4029922629068028, + "learning_rate": 9.955426351524585e-05, + "loss": 3.4453, + "step": 6683 + }, + { + "epoch": 0.414923334781799, + "grad_norm": 0.42991949234305166, + "learning_rate": 9.955378222594455e-05, + "loss": 3.4316, + "step": 6684 + }, + { + "epoch": 0.4149854118815569, + "grad_norm": 0.32760846262037135, + "learning_rate": 9.955330067810915e-05, + "loss": 3.4116, + "step": 6685 + }, + { + "epoch": 0.41504748898131477, + "grad_norm": 0.4484792676607669, + "learning_rate": 9.955281887174214e-05, + "loss": 3.4455, + "step": 6686 + }, + { + "epoch": 0.4151095660810727, + "grad_norm": 0.4262603898613972, + "learning_rate": 9.955233680684604e-05, + "loss": 3.5031, + "step": 6687 + }, + { + "epoch": 0.4151716431808306, + "grad_norm": 0.3068424095476512, + "learning_rate": 9.955185448342337e-05, + "loss": 3.5086, + "step": 6688 + }, + { + "epoch": 0.41523372028058847, + "grad_norm": 0.37181312703464336, + "learning_rate": 9.955137190147665e-05, + "loss": 3.4579, + "step": 6689 + }, + { + "epoch": 0.4152957973803464, + "grad_norm": 0.2917764306726027, + "learning_rate": 9.95508890610084e-05, + "loss": 3.5074, + "step": 6690 + }, + { + "epoch": 0.4153578744801043, + "grad_norm": 0.2476388323462234, + "learning_rate": 9.955040596202113e-05, + "loss": 3.4618, + "step": 6691 + }, + { + "epoch": 0.41541995157986217, + "grad_norm": 0.2890856578932664, + "learning_rate": 9.954992260451737e-05, + "loss": 3.4878, + "step": 6692 + }, + { + "epoch": 0.4154820286796201, + "grad_norm": 0.2275253913930937, + "learning_rate": 9.954943898849962e-05, + "loss": 3.5071, + "step": 6693 + }, + { + "epoch": 0.415544105779378, + "grad_norm": 0.3134971089032058, + "learning_rate": 9.954895511397042e-05, + "loss": 3.4028, + "step": 6694 + }, + { + "epoch": 0.41560618287913587, + "grad_norm": 0.24364489006918894, + "learning_rate": 9.954847098093229e-05, + "loss": 3.5421, + "step": 6695 + }, + { + "epoch": 0.4156682599788938, + "grad_norm": 0.574252847710118, + "learning_rate": 9.954798658938778e-05, + "loss": 3.4547, + "step": 6696 + }, + { + "epoch": 0.4157303370786517, + "grad_norm": 0.35115593607780626, + "learning_rate": 9.954750193933938e-05, + "loss": 3.4151, + "step": 6697 + }, + { + "epoch": 0.41579241417840956, + "grad_norm": 0.46076608153607795, + "learning_rate": 9.954701703078963e-05, + "loss": 3.4624, + "step": 6698 + }, + { + "epoch": 0.4158544912781675, + "grad_norm": 0.46142198672071566, + "learning_rate": 9.954653186374108e-05, + "loss": 3.5794, + "step": 6699 + }, + { + "epoch": 0.4159165683779254, + "grad_norm": 0.4208716010015068, + "learning_rate": 9.954604643819622e-05, + "loss": 3.458, + "step": 6700 + }, + { + "epoch": 0.41597864547768326, + "grad_norm": 0.32027497885917394, + "learning_rate": 9.954556075415764e-05, + "loss": 3.5119, + "step": 6701 + }, + { + "epoch": 0.4160407225774412, + "grad_norm": 0.33230460639002873, + "learning_rate": 9.954507481162782e-05, + "loss": 3.4666, + "step": 6702 + }, + { + "epoch": 0.4161027996771991, + "grad_norm": 0.22483392354661752, + "learning_rate": 9.954458861060933e-05, + "loss": 3.4919, + "step": 6703 + }, + { + "epoch": 0.41616487677695696, + "grad_norm": 0.2799546292705247, + "learning_rate": 9.954410215110468e-05, + "loss": 3.5044, + "step": 6704 + }, + { + "epoch": 0.4162269538767149, + "grad_norm": 0.3202275884709068, + "learning_rate": 9.954361543311643e-05, + "loss": 3.3987, + "step": 6705 + }, + { + "epoch": 0.4162890309764728, + "grad_norm": 0.3558612166021035, + "learning_rate": 9.95431284566471e-05, + "loss": 3.4859, + "step": 6706 + }, + { + "epoch": 0.41635110807623066, + "grad_norm": 0.4086170744210467, + "learning_rate": 9.954264122169928e-05, + "loss": 3.3803, + "step": 6707 + }, + { + "epoch": 0.4164131851759886, + "grad_norm": 0.293398478374293, + "learning_rate": 9.954215372827542e-05, + "loss": 3.4236, + "step": 6708 + }, + { + "epoch": 0.4164752622757465, + "grad_norm": 0.2514204856488523, + "learning_rate": 9.954166597637814e-05, + "loss": 3.4275, + "step": 6709 + }, + { + "epoch": 0.41653733937550436, + "grad_norm": 0.2271605359674285, + "learning_rate": 9.954117796600999e-05, + "loss": 3.4422, + "step": 6710 + }, + { + "epoch": 0.4165994164752623, + "grad_norm": 0.23743959210772708, + "learning_rate": 9.954068969717345e-05, + "loss": 3.42, + "step": 6711 + }, + { + "epoch": 0.4166614935750202, + "grad_norm": 0.2348274631905918, + "learning_rate": 9.954020116987111e-05, + "loss": 3.4599, + "step": 6712 + }, + { + "epoch": 0.41672357067477805, + "grad_norm": 0.28420216841769763, + "learning_rate": 9.953971238410553e-05, + "loss": 3.5072, + "step": 6713 + }, + { + "epoch": 0.41678564777453597, + "grad_norm": 0.29193300904184616, + "learning_rate": 9.953922333987923e-05, + "loss": 3.4622, + "step": 6714 + }, + { + "epoch": 0.4168477248742939, + "grad_norm": 0.2501556686690067, + "learning_rate": 9.953873403719479e-05, + "loss": 3.5132, + "step": 6715 + }, + { + "epoch": 0.41690980197405175, + "grad_norm": 0.2384156879707519, + "learning_rate": 9.953824447605474e-05, + "loss": 3.45, + "step": 6716 + }, + { + "epoch": 0.41697187907380967, + "grad_norm": 0.3196054540340959, + "learning_rate": 9.953775465646165e-05, + "loss": 3.4253, + "step": 6717 + }, + { + "epoch": 0.4170339561735676, + "grad_norm": 0.3708366306386936, + "learning_rate": 9.953726457841806e-05, + "loss": 3.4658, + "step": 6718 + }, + { + "epoch": 0.41709603327332545, + "grad_norm": 0.20317034948736046, + "learning_rate": 9.953677424192653e-05, + "loss": 3.3822, + "step": 6719 + }, + { + "epoch": 0.41715811037308337, + "grad_norm": 0.3760404061941788, + "learning_rate": 9.953628364698963e-05, + "loss": 3.4134, + "step": 6720 + }, + { + "epoch": 0.4172201874728413, + "grad_norm": 0.23453993556937455, + "learning_rate": 9.953579279360991e-05, + "loss": 3.38, + "step": 6721 + }, + { + "epoch": 0.41728226457259915, + "grad_norm": 0.30761558895604, + "learning_rate": 9.953530168178994e-05, + "loss": 3.5061, + "step": 6722 + }, + { + "epoch": 0.41734434167235707, + "grad_norm": 0.21603481283385603, + "learning_rate": 9.953481031153228e-05, + "loss": 3.4703, + "step": 6723 + }, + { + "epoch": 0.417406418772115, + "grad_norm": 0.27004210117373606, + "learning_rate": 9.953431868283949e-05, + "loss": 3.5644, + "step": 6724 + }, + { + "epoch": 0.41746849587187285, + "grad_norm": 0.3068505073065167, + "learning_rate": 9.953382679571412e-05, + "loss": 3.4041, + "step": 6725 + }, + { + "epoch": 0.41753057297163076, + "grad_norm": 0.19755939289257607, + "learning_rate": 9.953333465015875e-05, + "loss": 3.5301, + "step": 6726 + }, + { + "epoch": 0.4175926500713887, + "grad_norm": 0.36034655890570466, + "learning_rate": 9.953284224617596e-05, + "loss": 3.474, + "step": 6727 + }, + { + "epoch": 0.41765472717114654, + "grad_norm": 0.3244970574914157, + "learning_rate": 9.953234958376831e-05, + "loss": 3.4377, + "step": 6728 + }, + { + "epoch": 0.41771680427090446, + "grad_norm": 0.24563129054209173, + "learning_rate": 9.953185666293836e-05, + "loss": 3.3889, + "step": 6729 + }, + { + "epoch": 0.4177788813706624, + "grad_norm": 0.23320569974669625, + "learning_rate": 9.953136348368868e-05, + "loss": 3.3917, + "step": 6730 + }, + { + "epoch": 0.41784095847042024, + "grad_norm": 0.2675519638081059, + "learning_rate": 9.953087004602187e-05, + "loss": 3.5242, + "step": 6731 + }, + { + "epoch": 0.41790303557017816, + "grad_norm": 0.2900030967941679, + "learning_rate": 9.953037634994049e-05, + "loss": 3.4584, + "step": 6732 + }, + { + "epoch": 0.4179651126699361, + "grad_norm": 0.21679702105817944, + "learning_rate": 9.952988239544709e-05, + "loss": 3.3752, + "step": 6733 + }, + { + "epoch": 0.41802718976969394, + "grad_norm": 0.29739680359654835, + "learning_rate": 9.95293881825443e-05, + "loss": 3.449, + "step": 6734 + }, + { + "epoch": 0.41808926686945186, + "grad_norm": 0.24864835727999707, + "learning_rate": 9.952889371123464e-05, + "loss": 3.4816, + "step": 6735 + }, + { + "epoch": 0.4181513439692098, + "grad_norm": 0.2645779195491971, + "learning_rate": 9.952839898152073e-05, + "loss": 3.4836, + "step": 6736 + }, + { + "epoch": 0.41821342106896764, + "grad_norm": 0.28277430460850017, + "learning_rate": 9.952790399340514e-05, + "loss": 3.4327, + "step": 6737 + }, + { + "epoch": 0.41827549816872556, + "grad_norm": 0.2558640195701636, + "learning_rate": 9.952740874689046e-05, + "loss": 3.4653, + "step": 6738 + }, + { + "epoch": 0.4183375752684835, + "grad_norm": 0.259126680497509, + "learning_rate": 9.952691324197924e-05, + "loss": 3.5012, + "step": 6739 + }, + { + "epoch": 0.41839965236824134, + "grad_norm": 0.2402509280448836, + "learning_rate": 9.952641747867412e-05, + "loss": 3.4503, + "step": 6740 + }, + { + "epoch": 0.41846172946799925, + "grad_norm": 0.23113558361564282, + "learning_rate": 9.952592145697764e-05, + "loss": 3.3184, + "step": 6741 + }, + { + "epoch": 0.4185238065677572, + "grad_norm": 0.25002752092992186, + "learning_rate": 9.952542517689241e-05, + "loss": 3.5032, + "step": 6742 + }, + { + "epoch": 0.41858588366751504, + "grad_norm": 0.34739872684827444, + "learning_rate": 9.9524928638421e-05, + "loss": 3.448, + "step": 6743 + }, + { + "epoch": 0.41864796076727295, + "grad_norm": 0.23675029657742092, + "learning_rate": 9.952443184156604e-05, + "loss": 3.4014, + "step": 6744 + }, + { + "epoch": 0.41871003786703087, + "grad_norm": 0.28907688208190596, + "learning_rate": 9.952393478633008e-05, + "loss": 3.386, + "step": 6745 + }, + { + "epoch": 0.41877211496678873, + "grad_norm": 0.31230641567952067, + "learning_rate": 9.952343747271573e-05, + "loss": 3.5077, + "step": 6746 + }, + { + "epoch": 0.41883419206654665, + "grad_norm": 0.24626090412880425, + "learning_rate": 9.952293990072558e-05, + "loss": 3.4385, + "step": 6747 + }, + { + "epoch": 0.41889626916630457, + "grad_norm": 0.3436974246827059, + "learning_rate": 9.952244207036225e-05, + "loss": 3.4332, + "step": 6748 + }, + { + "epoch": 0.41895834626606243, + "grad_norm": 0.24276961847617473, + "learning_rate": 9.952194398162831e-05, + "loss": 3.5163, + "step": 6749 + }, + { + "epoch": 0.41902042336582035, + "grad_norm": 0.2635355137620657, + "learning_rate": 9.952144563452636e-05, + "loss": 3.5388, + "step": 6750 + }, + { + "epoch": 0.41908250046557827, + "grad_norm": 0.2570042801827125, + "learning_rate": 9.952094702905901e-05, + "loss": 3.4668, + "step": 6751 + }, + { + "epoch": 0.41914457756533613, + "grad_norm": 0.2376931938760291, + "learning_rate": 9.952044816522885e-05, + "loss": 3.4913, + "step": 6752 + }, + { + "epoch": 0.41920665466509405, + "grad_norm": 0.28906833797436277, + "learning_rate": 9.95199490430385e-05, + "loss": 3.4308, + "step": 6753 + }, + { + "epoch": 0.41926873176485197, + "grad_norm": 0.23601272122851732, + "learning_rate": 9.951944966249055e-05, + "loss": 3.4595, + "step": 6754 + }, + { + "epoch": 0.4193308088646098, + "grad_norm": 0.28971551901610443, + "learning_rate": 9.951895002358761e-05, + "loss": 3.4679, + "step": 6755 + }, + { + "epoch": 0.41939288596436775, + "grad_norm": 0.22208875145097154, + "learning_rate": 9.951845012633229e-05, + "loss": 3.4229, + "step": 6756 + }, + { + "epoch": 0.41945496306412566, + "grad_norm": 0.28403435100751695, + "learning_rate": 9.95179499707272e-05, + "loss": 3.4078, + "step": 6757 + }, + { + "epoch": 0.4195170401638835, + "grad_norm": 0.27536770880834854, + "learning_rate": 9.951744955677493e-05, + "loss": 3.4895, + "step": 6758 + }, + { + "epoch": 0.41957911726364144, + "grad_norm": 0.2786736370487614, + "learning_rate": 9.951694888447811e-05, + "loss": 3.3747, + "step": 6759 + }, + { + "epoch": 0.41964119436339936, + "grad_norm": 0.22656473619080966, + "learning_rate": 9.951644795383935e-05, + "loss": 3.3748, + "step": 6760 + }, + { + "epoch": 0.4197032714631572, + "grad_norm": 0.2362770939736296, + "learning_rate": 9.951594676486125e-05, + "loss": 3.4879, + "step": 6761 + }, + { + "epoch": 0.41976534856291514, + "grad_norm": 0.23287374983660314, + "learning_rate": 9.951544531754644e-05, + "loss": 3.4301, + "step": 6762 + }, + { + "epoch": 0.41982742566267306, + "grad_norm": 0.2752769993334671, + "learning_rate": 9.951494361189755e-05, + "loss": 3.4834, + "step": 6763 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 0.23719600653440825, + "learning_rate": 9.951444164791717e-05, + "loss": 3.485, + "step": 6764 + }, + { + "epoch": 0.41995157986218884, + "grad_norm": 0.28926833015808534, + "learning_rate": 9.951393942560792e-05, + "loss": 3.5797, + "step": 6765 + }, + { + "epoch": 0.42001365696194676, + "grad_norm": 0.3555350399038309, + "learning_rate": 9.951343694497241e-05, + "loss": 3.489, + "step": 6766 + }, + { + "epoch": 0.4200757340617046, + "grad_norm": 0.33270260872610696, + "learning_rate": 9.951293420601331e-05, + "loss": 3.5054, + "step": 6767 + }, + { + "epoch": 0.42013781116146254, + "grad_norm": 0.19763176053889853, + "learning_rate": 9.951243120873321e-05, + "loss": 3.4061, + "step": 6768 + }, + { + "epoch": 0.42019988826122046, + "grad_norm": 0.6093293686434493, + "learning_rate": 9.951192795313473e-05, + "loss": 3.4159, + "step": 6769 + }, + { + "epoch": 0.4202619653609783, + "grad_norm": 0.323655995912407, + "learning_rate": 9.951142443922051e-05, + "loss": 3.569, + "step": 6770 + }, + { + "epoch": 0.42032404246073624, + "grad_norm": 0.42358553223780593, + "learning_rate": 9.951092066699315e-05, + "loss": 3.4511, + "step": 6771 + }, + { + "epoch": 0.42038611956049415, + "grad_norm": 0.4139645478611325, + "learning_rate": 9.95104166364553e-05, + "loss": 3.3887, + "step": 6772 + }, + { + "epoch": 0.420448196660252, + "grad_norm": 0.30386517891966003, + "learning_rate": 9.95099123476096e-05, + "loss": 3.4931, + "step": 6773 + }, + { + "epoch": 0.42051027376000993, + "grad_norm": 0.25865953190744584, + "learning_rate": 9.950940780045867e-05, + "loss": 3.5139, + "step": 6774 + }, + { + "epoch": 0.42057235085976785, + "grad_norm": 0.32828829879855287, + "learning_rate": 9.950890299500513e-05, + "loss": 3.4777, + "step": 6775 + }, + { + "epoch": 0.4206344279595257, + "grad_norm": 0.27268259199937905, + "learning_rate": 9.950839793125164e-05, + "loss": 3.3391, + "step": 6776 + }, + { + "epoch": 0.42069650505928363, + "grad_norm": 0.30872960057995974, + "learning_rate": 9.950789260920081e-05, + "loss": 3.5311, + "step": 6777 + }, + { + "epoch": 0.42075858215904155, + "grad_norm": 0.22250945288155036, + "learning_rate": 9.950738702885527e-05, + "loss": 3.3865, + "step": 6778 + }, + { + "epoch": 0.4208206592587994, + "grad_norm": 0.21827858127936844, + "learning_rate": 9.95068811902177e-05, + "loss": 3.5071, + "step": 6779 + }, + { + "epoch": 0.42088273635855733, + "grad_norm": 0.22301134437788464, + "learning_rate": 9.95063750932907e-05, + "loss": 3.4557, + "step": 6780 + }, + { + "epoch": 0.42094481345831525, + "grad_norm": 0.24964951743123778, + "learning_rate": 9.950586873807692e-05, + "loss": 3.4257, + "step": 6781 + }, + { + "epoch": 0.4210068905580731, + "grad_norm": 0.25923952448252285, + "learning_rate": 9.950536212457901e-05, + "loss": 3.4309, + "step": 6782 + }, + { + "epoch": 0.42106896765783103, + "grad_norm": 0.31102571817448077, + "learning_rate": 9.95048552527996e-05, + "loss": 3.4434, + "step": 6783 + }, + { + "epoch": 0.42113104475758895, + "grad_norm": 0.23868972130965707, + "learning_rate": 9.950434812274137e-05, + "loss": 3.4396, + "step": 6784 + }, + { + "epoch": 0.4211931218573468, + "grad_norm": 0.3045270081406189, + "learning_rate": 9.950384073440691e-05, + "loss": 3.4237, + "step": 6785 + }, + { + "epoch": 0.4212551989571047, + "grad_norm": 0.4130163291919665, + "learning_rate": 9.95033330877989e-05, + "loss": 3.5152, + "step": 6786 + }, + { + "epoch": 0.42131727605686264, + "grad_norm": 0.3371253857396472, + "learning_rate": 9.950282518291998e-05, + "loss": 3.5111, + "step": 6787 + }, + { + "epoch": 0.4213793531566205, + "grad_norm": 0.3381392275519293, + "learning_rate": 9.950231701977282e-05, + "loss": 3.4299, + "step": 6788 + }, + { + "epoch": 0.4214414302563784, + "grad_norm": 0.29579742520413, + "learning_rate": 9.950180859836004e-05, + "loss": 3.3811, + "step": 6789 + }, + { + "epoch": 0.42150350735613634, + "grad_norm": 0.33096691532182976, + "learning_rate": 9.950129991868431e-05, + "loss": 3.4974, + "step": 6790 + }, + { + "epoch": 0.4215655844558942, + "grad_norm": 0.22358364081038742, + "learning_rate": 9.95007909807483e-05, + "loss": 3.5007, + "step": 6791 + }, + { + "epoch": 0.4216276615556521, + "grad_norm": 0.2533490155149145, + "learning_rate": 9.950028178455461e-05, + "loss": 3.4755, + "step": 6792 + }, + { + "epoch": 0.42168973865541004, + "grad_norm": 0.2749349462985005, + "learning_rate": 9.949977233010596e-05, + "loss": 3.4351, + "step": 6793 + }, + { + "epoch": 0.4217518157551679, + "grad_norm": 0.28526139607010925, + "learning_rate": 9.949926261740499e-05, + "loss": 3.2894, + "step": 6794 + }, + { + "epoch": 0.4218138928549258, + "grad_norm": 0.2898081492849564, + "learning_rate": 9.949875264645433e-05, + "loss": 3.4114, + "step": 6795 + }, + { + "epoch": 0.42187596995468374, + "grad_norm": 0.24881579399994766, + "learning_rate": 9.949824241725667e-05, + "loss": 3.4717, + "step": 6796 + }, + { + "epoch": 0.4219380470544416, + "grad_norm": 0.28350671741055766, + "learning_rate": 9.949773192981466e-05, + "loss": 3.5748, + "step": 6797 + }, + { + "epoch": 0.4220001241541995, + "grad_norm": 0.33155769910940996, + "learning_rate": 9.949722118413096e-05, + "loss": 3.5354, + "step": 6798 + }, + { + "epoch": 0.42206220125395744, + "grad_norm": 0.2757365036451701, + "learning_rate": 9.949671018020826e-05, + "loss": 3.4375, + "step": 6799 + }, + { + "epoch": 0.4221242783537153, + "grad_norm": 0.3058219842966176, + "learning_rate": 9.94961989180492e-05, + "loss": 3.533, + "step": 6800 + }, + { + "epoch": 0.4221863554534732, + "grad_norm": 0.24848936067463395, + "learning_rate": 9.949568739765645e-05, + "loss": 3.4481, + "step": 6801 + }, + { + "epoch": 0.42224843255323113, + "grad_norm": 0.3476854483526584, + "learning_rate": 9.949517561903267e-05, + "loss": 3.4179, + "step": 6802 + }, + { + "epoch": 0.422310509652989, + "grad_norm": 0.2104253113657889, + "learning_rate": 9.949466358218056e-05, + "loss": 3.4527, + "step": 6803 + }, + { + "epoch": 0.4223725867527469, + "grad_norm": 0.36767355220367226, + "learning_rate": 9.949415128710278e-05, + "loss": 3.4064, + "step": 6804 + }, + { + "epoch": 0.42243466385250483, + "grad_norm": 0.3168410386841345, + "learning_rate": 9.949363873380199e-05, + "loss": 3.3535, + "step": 6805 + }, + { + "epoch": 0.4224967409522627, + "grad_norm": 0.2516745868493415, + "learning_rate": 9.949312592228087e-05, + "loss": 3.4937, + "step": 6806 + }, + { + "epoch": 0.4225588180520206, + "grad_norm": 0.3023704433239753, + "learning_rate": 9.94926128525421e-05, + "loss": 3.4518, + "step": 6807 + }, + { + "epoch": 0.42262089515177853, + "grad_norm": 0.30874967860772706, + "learning_rate": 9.949209952458836e-05, + "loss": 3.4911, + "step": 6808 + }, + { + "epoch": 0.4226829722515364, + "grad_norm": 0.30070658165681086, + "learning_rate": 9.949158593842231e-05, + "loss": 3.4501, + "step": 6809 + }, + { + "epoch": 0.4227450493512943, + "grad_norm": 0.2621212611225107, + "learning_rate": 9.949107209404665e-05, + "loss": 3.4803, + "step": 6810 + }, + { + "epoch": 0.42280712645105223, + "grad_norm": 0.2683833182369028, + "learning_rate": 9.949055799146404e-05, + "loss": 3.481, + "step": 6811 + }, + { + "epoch": 0.4228692035508101, + "grad_norm": 0.3048903694658094, + "learning_rate": 9.949004363067718e-05, + "loss": 3.3388, + "step": 6812 + }, + { + "epoch": 0.422931280650568, + "grad_norm": 0.2464606033324381, + "learning_rate": 9.948952901168875e-05, + "loss": 3.5007, + "step": 6813 + }, + { + "epoch": 0.4229933577503259, + "grad_norm": 0.2334036487332756, + "learning_rate": 9.948901413450144e-05, + "loss": 3.4816, + "step": 6814 + }, + { + "epoch": 0.4230554348500838, + "grad_norm": 0.3900941675158406, + "learning_rate": 9.948849899911791e-05, + "loss": 3.4612, + "step": 6815 + }, + { + "epoch": 0.4231175119498417, + "grad_norm": 0.2940722093387963, + "learning_rate": 9.94879836055409e-05, + "loss": 3.4609, + "step": 6816 + }, + { + "epoch": 0.4231795890495996, + "grad_norm": 0.24602822536497518, + "learning_rate": 9.948746795377304e-05, + "loss": 3.4426, + "step": 6817 + }, + { + "epoch": 0.4232416661493575, + "grad_norm": 0.2828674563338849, + "learning_rate": 9.948695204381706e-05, + "loss": 3.5304, + "step": 6818 + }, + { + "epoch": 0.4233037432491154, + "grad_norm": 0.3430480726269376, + "learning_rate": 9.948643587567563e-05, + "loss": 3.3239, + "step": 6819 + }, + { + "epoch": 0.4233658203488733, + "grad_norm": 0.24906958358149217, + "learning_rate": 9.948591944935147e-05, + "loss": 3.4555, + "step": 6820 + }, + { + "epoch": 0.4234278974486312, + "grad_norm": 0.2383027692364362, + "learning_rate": 9.948540276484722e-05, + "loss": 3.5338, + "step": 6821 + }, + { + "epoch": 0.4234899745483891, + "grad_norm": 0.3292678842872083, + "learning_rate": 9.948488582216563e-05, + "loss": 3.4605, + "step": 6822 + }, + { + "epoch": 0.423552051648147, + "grad_norm": 0.2834843390350748, + "learning_rate": 9.948436862130938e-05, + "loss": 3.5053, + "step": 6823 + }, + { + "epoch": 0.4236141287479049, + "grad_norm": 0.24966549846838912, + "learning_rate": 9.948385116228116e-05, + "loss": 3.4309, + "step": 6824 + }, + { + "epoch": 0.4236762058476628, + "grad_norm": 0.23263937126097006, + "learning_rate": 9.948333344508367e-05, + "loss": 3.5167, + "step": 6825 + }, + { + "epoch": 0.4237382829474207, + "grad_norm": 0.23376620219936878, + "learning_rate": 9.948281546971963e-05, + "loss": 3.5002, + "step": 6826 + }, + { + "epoch": 0.4238003600471786, + "grad_norm": 0.21854070460202005, + "learning_rate": 9.948229723619172e-05, + "loss": 3.4286, + "step": 6827 + }, + { + "epoch": 0.4238624371469365, + "grad_norm": 0.34959862335551084, + "learning_rate": 9.948177874450266e-05, + "loss": 3.3455, + "step": 6828 + }, + { + "epoch": 0.4239245142466944, + "grad_norm": 0.24806855324759985, + "learning_rate": 9.948125999465514e-05, + "loss": 3.4488, + "step": 6829 + }, + { + "epoch": 0.4239865913464523, + "grad_norm": 0.2607971447808459, + "learning_rate": 9.948074098665188e-05, + "loss": 3.504, + "step": 6830 + }, + { + "epoch": 0.4240486684462102, + "grad_norm": 0.40284553274012175, + "learning_rate": 9.948022172049559e-05, + "loss": 3.4531, + "step": 6831 + }, + { + "epoch": 0.4241107455459681, + "grad_norm": 0.2972651101908694, + "learning_rate": 9.947970219618896e-05, + "loss": 3.3617, + "step": 6832 + }, + { + "epoch": 0.424172822645726, + "grad_norm": 0.2692589835923579, + "learning_rate": 9.947918241373471e-05, + "loss": 3.4289, + "step": 6833 + }, + { + "epoch": 0.4242348997454839, + "grad_norm": 0.22077598044443728, + "learning_rate": 9.947866237313557e-05, + "loss": 3.3912, + "step": 6834 + }, + { + "epoch": 0.4242969768452418, + "grad_norm": 0.2632408575207507, + "learning_rate": 9.947814207439423e-05, + "loss": 3.4875, + "step": 6835 + }, + { + "epoch": 0.4243590539449997, + "grad_norm": 0.2433872883495676, + "learning_rate": 9.947762151751341e-05, + "loss": 3.4015, + "step": 6836 + }, + { + "epoch": 0.4244211310447576, + "grad_norm": 0.28100629685514517, + "learning_rate": 9.947710070249583e-05, + "loss": 3.3844, + "step": 6837 + }, + { + "epoch": 0.4244832081445155, + "grad_norm": 0.24369243681838712, + "learning_rate": 9.947657962934419e-05, + "loss": 3.4334, + "step": 6838 + }, + { + "epoch": 0.4245452852442734, + "grad_norm": 0.2594598102943093, + "learning_rate": 9.947605829806124e-05, + "loss": 3.3851, + "step": 6839 + }, + { + "epoch": 0.4246073623440313, + "grad_norm": 0.22013071825874056, + "learning_rate": 9.947553670864967e-05, + "loss": 3.4086, + "step": 6840 + }, + { + "epoch": 0.4246694394437892, + "grad_norm": 0.2163830407294281, + "learning_rate": 9.947501486111222e-05, + "loss": 3.3795, + "step": 6841 + }, + { + "epoch": 0.42473151654354707, + "grad_norm": 0.2741248619437577, + "learning_rate": 9.947449275545163e-05, + "loss": 3.4259, + "step": 6842 + }, + { + "epoch": 0.424793593643305, + "grad_norm": 0.28379873586131843, + "learning_rate": 9.947397039167057e-05, + "loss": 3.5344, + "step": 6843 + }, + { + "epoch": 0.4248556707430629, + "grad_norm": 0.27260957444079226, + "learning_rate": 9.94734477697718e-05, + "loss": 3.4024, + "step": 6844 + }, + { + "epoch": 0.42491774784282077, + "grad_norm": 0.26655164537768133, + "learning_rate": 9.947292488975804e-05, + "loss": 3.4357, + "step": 6845 + }, + { + "epoch": 0.4249798249425787, + "grad_norm": 0.3039227939603783, + "learning_rate": 9.947240175163204e-05, + "loss": 3.3788, + "step": 6846 + }, + { + "epoch": 0.4250419020423366, + "grad_norm": 0.2932460489828059, + "learning_rate": 9.94718783553965e-05, + "loss": 3.4836, + "step": 6847 + }, + { + "epoch": 0.42510397914209447, + "grad_norm": 0.2678473990234863, + "learning_rate": 9.947135470105416e-05, + "loss": 3.3766, + "step": 6848 + }, + { + "epoch": 0.4251660562418524, + "grad_norm": 0.24287743891827956, + "learning_rate": 9.947083078860774e-05, + "loss": 3.3983, + "step": 6849 + }, + { + "epoch": 0.4252281333416103, + "grad_norm": 0.2488835558989311, + "learning_rate": 9.947030661806001e-05, + "loss": 3.3676, + "step": 6850 + }, + { + "epoch": 0.42529021044136817, + "grad_norm": 0.26999119201174626, + "learning_rate": 9.946978218941367e-05, + "loss": 3.3888, + "step": 6851 + }, + { + "epoch": 0.4253522875411261, + "grad_norm": 0.2068962790819508, + "learning_rate": 9.946925750267147e-05, + "loss": 3.3969, + "step": 6852 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 0.2176260000049243, + "learning_rate": 9.946873255783613e-05, + "loss": 3.3753, + "step": 6853 + }, + { + "epoch": 0.42547644174064186, + "grad_norm": 0.2765936580714266, + "learning_rate": 9.946820735491043e-05, + "loss": 3.4049, + "step": 6854 + }, + { + "epoch": 0.4255385188403998, + "grad_norm": 0.4284069052024849, + "learning_rate": 9.946768189389706e-05, + "loss": 3.4409, + "step": 6855 + }, + { + "epoch": 0.4256005959401577, + "grad_norm": 0.4266311143179346, + "learning_rate": 9.94671561747988e-05, + "loss": 3.5016, + "step": 6856 + }, + { + "epoch": 0.42566267303991556, + "grad_norm": 0.20858092000701617, + "learning_rate": 9.946663019761837e-05, + "loss": 3.3838, + "step": 6857 + }, + { + "epoch": 0.4257247501396735, + "grad_norm": 0.24301033829353055, + "learning_rate": 9.946610396235851e-05, + "loss": 3.4757, + "step": 6858 + }, + { + "epoch": 0.4257868272394314, + "grad_norm": 0.1811985769165706, + "learning_rate": 9.9465577469022e-05, + "loss": 3.4591, + "step": 6859 + }, + { + "epoch": 0.42584890433918926, + "grad_norm": 0.22150745507066386, + "learning_rate": 9.946505071761156e-05, + "loss": 3.4199, + "step": 6860 + }, + { + "epoch": 0.4259109814389472, + "grad_norm": 0.2854007822104216, + "learning_rate": 9.946452370812993e-05, + "loss": 3.4252, + "step": 6861 + }, + { + "epoch": 0.4259730585387051, + "grad_norm": 0.4703387674492665, + "learning_rate": 9.946399644057989e-05, + "loss": 3.5178, + "step": 6862 + }, + { + "epoch": 0.42603513563846296, + "grad_norm": 0.25856623143027146, + "learning_rate": 9.946346891496417e-05, + "loss": 3.3876, + "step": 6863 + }, + { + "epoch": 0.4260972127382209, + "grad_norm": 0.33912744339619155, + "learning_rate": 9.94629411312855e-05, + "loss": 3.4665, + "step": 6864 + }, + { + "epoch": 0.4261592898379788, + "grad_norm": 0.2589271177171198, + "learning_rate": 9.946241308954668e-05, + "loss": 3.3554, + "step": 6865 + }, + { + "epoch": 0.42622136693773666, + "grad_norm": 0.2637108809265248, + "learning_rate": 9.946188478975043e-05, + "loss": 3.4146, + "step": 6866 + }, + { + "epoch": 0.4262834440374946, + "grad_norm": 0.2778056690333144, + "learning_rate": 9.946135623189954e-05, + "loss": 3.4923, + "step": 6867 + }, + { + "epoch": 0.4263455211372525, + "grad_norm": 0.1961275926391857, + "learning_rate": 9.946082741599673e-05, + "loss": 3.5346, + "step": 6868 + }, + { + "epoch": 0.42640759823701035, + "grad_norm": 0.27106571162447346, + "learning_rate": 9.946029834204478e-05, + "loss": 3.4539, + "step": 6869 + }, + { + "epoch": 0.4264696753367683, + "grad_norm": 0.19838053005673517, + "learning_rate": 9.945976901004645e-05, + "loss": 3.4961, + "step": 6870 + }, + { + "epoch": 0.4265317524365262, + "grad_norm": 0.21317921772483348, + "learning_rate": 9.94592394200045e-05, + "loss": 3.4619, + "step": 6871 + }, + { + "epoch": 0.42659382953628405, + "grad_norm": 0.24426004922130357, + "learning_rate": 9.94587095719217e-05, + "loss": 3.4299, + "step": 6872 + }, + { + "epoch": 0.42665590663604197, + "grad_norm": 0.25797588777515973, + "learning_rate": 9.945817946580078e-05, + "loss": 3.4091, + "step": 6873 + }, + { + "epoch": 0.4267179837357999, + "grad_norm": 0.24160225975287566, + "learning_rate": 9.945764910164455e-05, + "loss": 3.4148, + "step": 6874 + }, + { + "epoch": 0.42678006083555775, + "grad_norm": 0.22367326295657197, + "learning_rate": 9.945711847945576e-05, + "loss": 3.4518, + "step": 6875 + }, + { + "epoch": 0.42684213793531567, + "grad_norm": 0.22822656036419783, + "learning_rate": 9.945658759923716e-05, + "loss": 3.3743, + "step": 6876 + }, + { + "epoch": 0.4269042150350736, + "grad_norm": 0.28949990557305044, + "learning_rate": 9.945605646099154e-05, + "loss": 3.4123, + "step": 6877 + }, + { + "epoch": 0.42696629213483145, + "grad_norm": 0.3395414432868655, + "learning_rate": 9.945552506472168e-05, + "loss": 3.456, + "step": 6878 + }, + { + "epoch": 0.42702836923458937, + "grad_norm": 0.3913252466864732, + "learning_rate": 9.945499341043035e-05, + "loss": 3.4052, + "step": 6879 + }, + { + "epoch": 0.4270904463343473, + "grad_norm": 0.4024427718000749, + "learning_rate": 9.945446149812029e-05, + "loss": 3.506, + "step": 6880 + }, + { + "epoch": 0.42715252343410515, + "grad_norm": 0.2547810084457497, + "learning_rate": 9.945392932779431e-05, + "loss": 3.5698, + "step": 6881 + }, + { + "epoch": 0.42721460053386306, + "grad_norm": 0.2700794560468877, + "learning_rate": 9.945339689945517e-05, + "loss": 3.3677, + "step": 6882 + }, + { + "epoch": 0.427276677633621, + "grad_norm": 0.20897580958130968, + "learning_rate": 9.945286421310566e-05, + "loss": 3.4162, + "step": 6883 + }, + { + "epoch": 0.42733875473337884, + "grad_norm": 0.25829910825825764, + "learning_rate": 9.945233126874854e-05, + "loss": 3.4379, + "step": 6884 + }, + { + "epoch": 0.42740083183313676, + "grad_norm": 0.33147431956442247, + "learning_rate": 9.945179806638661e-05, + "loss": 3.413, + "step": 6885 + }, + { + "epoch": 0.4274629089328947, + "grad_norm": 0.44688435924839415, + "learning_rate": 9.945126460602266e-05, + "loss": 3.4234, + "step": 6886 + }, + { + "epoch": 0.42752498603265254, + "grad_norm": 0.2767475328171778, + "learning_rate": 9.945073088765944e-05, + "loss": 3.4169, + "step": 6887 + }, + { + "epoch": 0.42758706313241046, + "grad_norm": 0.32689267309490017, + "learning_rate": 9.945019691129976e-05, + "loss": 3.3971, + "step": 6888 + }, + { + "epoch": 0.4276491402321684, + "grad_norm": 0.30073643635409864, + "learning_rate": 9.944966267694637e-05, + "loss": 3.4249, + "step": 6889 + }, + { + "epoch": 0.42771121733192624, + "grad_norm": 0.26561309282878587, + "learning_rate": 9.944912818460212e-05, + "loss": 3.4727, + "step": 6890 + }, + { + "epoch": 0.42777329443168416, + "grad_norm": 0.319225712222723, + "learning_rate": 9.944859343426976e-05, + "loss": 3.4091, + "step": 6891 + }, + { + "epoch": 0.4278353715314421, + "grad_norm": 0.21947567497635861, + "learning_rate": 9.944805842595208e-05, + "loss": 3.2927, + "step": 6892 + }, + { + "epoch": 0.42789744863119994, + "grad_norm": 0.23414198100194095, + "learning_rate": 9.944752315965187e-05, + "loss": 3.3579, + "step": 6893 + }, + { + "epoch": 0.42795952573095786, + "grad_norm": 0.32553430020768287, + "learning_rate": 9.944698763537192e-05, + "loss": 3.5143, + "step": 6894 + }, + { + "epoch": 0.4280216028307158, + "grad_norm": 0.23577780529562212, + "learning_rate": 9.944645185311505e-05, + "loss": 3.5236, + "step": 6895 + }, + { + "epoch": 0.42808367993047364, + "grad_norm": 0.37235991653775014, + "learning_rate": 9.944591581288403e-05, + "loss": 3.5066, + "step": 6896 + }, + { + "epoch": 0.42814575703023156, + "grad_norm": 0.315714456609271, + "learning_rate": 9.944537951468166e-05, + "loss": 3.3899, + "step": 6897 + }, + { + "epoch": 0.4282078341299895, + "grad_norm": 0.30628896095635183, + "learning_rate": 9.944484295851075e-05, + "loss": 3.556, + "step": 6898 + }, + { + "epoch": 0.42826991122974734, + "grad_norm": 0.2577988303057353, + "learning_rate": 9.944430614437407e-05, + "loss": 3.3677, + "step": 6899 + }, + { + "epoch": 0.42833198832950525, + "grad_norm": 0.24432026167119378, + "learning_rate": 9.944376907227447e-05, + "loss": 3.3171, + "step": 6900 + }, + { + "epoch": 0.42839406542926317, + "grad_norm": 0.2927750519344716, + "learning_rate": 9.944323174221471e-05, + "loss": 3.413, + "step": 6901 + }, + { + "epoch": 0.42845614252902103, + "grad_norm": 0.2548153375051721, + "learning_rate": 9.944269415419761e-05, + "loss": 3.4537, + "step": 6902 + }, + { + "epoch": 0.42851821962877895, + "grad_norm": 0.27509958024819736, + "learning_rate": 9.944215630822598e-05, + "loss": 3.385, + "step": 6903 + }, + { + "epoch": 0.42858029672853687, + "grad_norm": 0.29162367719635585, + "learning_rate": 9.94416182043026e-05, + "loss": 3.4113, + "step": 6904 + }, + { + "epoch": 0.42864237382829473, + "grad_norm": 0.3364150844252026, + "learning_rate": 9.944107984243031e-05, + "loss": 3.3378, + "step": 6905 + }, + { + "epoch": 0.42870445092805265, + "grad_norm": 0.24371379925820327, + "learning_rate": 9.94405412226119e-05, + "loss": 3.3922, + "step": 6906 + }, + { + "epoch": 0.42876652802781057, + "grad_norm": 0.3673219840328156, + "learning_rate": 9.944000234485018e-05, + "loss": 3.3497, + "step": 6907 + }, + { + "epoch": 0.42882860512756843, + "grad_norm": 0.3042312409276393, + "learning_rate": 9.943946320914797e-05, + "loss": 3.4737, + "step": 6908 + }, + { + "epoch": 0.42889068222732635, + "grad_norm": 0.3096071095062467, + "learning_rate": 9.943892381550808e-05, + "loss": 3.3657, + "step": 6909 + }, + { + "epoch": 0.42895275932708427, + "grad_norm": 0.2807593304076405, + "learning_rate": 9.943838416393332e-05, + "loss": 3.4116, + "step": 6910 + }, + { + "epoch": 0.4290148364268421, + "grad_norm": 0.2536051505411899, + "learning_rate": 9.94378442544265e-05, + "loss": 3.4513, + "step": 6911 + }, + { + "epoch": 0.42907691352660005, + "grad_norm": 0.2555032537648799, + "learning_rate": 9.943730408699045e-05, + "loss": 3.3991, + "step": 6912 + }, + { + "epoch": 0.42913899062635796, + "grad_norm": 0.3275270349504133, + "learning_rate": 9.9436763661628e-05, + "loss": 3.3494, + "step": 6913 + }, + { + "epoch": 0.4292010677261158, + "grad_norm": 0.23895746179924504, + "learning_rate": 9.943622297834194e-05, + "loss": 3.486, + "step": 6914 + }, + { + "epoch": 0.42926314482587374, + "grad_norm": 0.24852618517919728, + "learning_rate": 9.94356820371351e-05, + "loss": 3.4945, + "step": 6915 + }, + { + "epoch": 0.42932522192563166, + "grad_norm": 0.2626186244731799, + "learning_rate": 9.943514083801031e-05, + "loss": 3.5124, + "step": 6916 + }, + { + "epoch": 0.4293872990253895, + "grad_norm": 0.19649501453207407, + "learning_rate": 9.943459938097039e-05, + "loss": 3.4619, + "step": 6917 + }, + { + "epoch": 0.42944937612514744, + "grad_norm": 0.3112941603758345, + "learning_rate": 9.943405766601817e-05, + "loss": 3.4486, + "step": 6918 + }, + { + "epoch": 0.42951145322490536, + "grad_norm": 0.2380962904696822, + "learning_rate": 9.943351569315647e-05, + "loss": 3.5007, + "step": 6919 + }, + { + "epoch": 0.4295735303246632, + "grad_norm": 0.40415967004494474, + "learning_rate": 9.94329734623881e-05, + "loss": 3.5177, + "step": 6920 + }, + { + "epoch": 0.42963560742442114, + "grad_norm": 0.24814335005718138, + "learning_rate": 9.943243097371592e-05, + "loss": 3.428, + "step": 6921 + }, + { + "epoch": 0.42969768452417906, + "grad_norm": 0.2987630436655439, + "learning_rate": 9.943188822714275e-05, + "loss": 3.357, + "step": 6922 + }, + { + "epoch": 0.4297597616239369, + "grad_norm": 0.3704167867037418, + "learning_rate": 9.943134522267142e-05, + "loss": 3.3979, + "step": 6923 + }, + { + "epoch": 0.42982183872369484, + "grad_norm": 0.3281539154935994, + "learning_rate": 9.943080196030474e-05, + "loss": 3.3792, + "step": 6924 + }, + { + "epoch": 0.42988391582345276, + "grad_norm": 0.23915134318676592, + "learning_rate": 9.943025844004558e-05, + "loss": 3.5, + "step": 6925 + }, + { + "epoch": 0.4299459929232106, + "grad_norm": 0.44420314942565525, + "learning_rate": 9.942971466189679e-05, + "loss": 3.4298, + "step": 6926 + }, + { + "epoch": 0.43000807002296854, + "grad_norm": 0.37381385927474586, + "learning_rate": 9.942917062586115e-05, + "loss": 3.4727, + "step": 6927 + }, + { + "epoch": 0.43007014712272645, + "grad_norm": 0.2800460717282776, + "learning_rate": 9.942862633194152e-05, + "loss": 3.3512, + "step": 6928 + }, + { + "epoch": 0.4301322242224843, + "grad_norm": 0.29862451785077804, + "learning_rate": 9.942808178014078e-05, + "loss": 3.5271, + "step": 6929 + }, + { + "epoch": 0.43019430132224223, + "grad_norm": 0.38633419915371875, + "learning_rate": 9.942753697046172e-05, + "loss": 3.395, + "step": 6930 + }, + { + "epoch": 0.4302563784220001, + "grad_norm": 0.2737985543298208, + "learning_rate": 9.942699190290719e-05, + "loss": 3.4894, + "step": 6931 + }, + { + "epoch": 0.430318455521758, + "grad_norm": 0.39847990820095014, + "learning_rate": 9.942644657748005e-05, + "loss": 3.2797, + "step": 6932 + }, + { + "epoch": 0.43038053262151593, + "grad_norm": 0.2861589152957793, + "learning_rate": 9.942590099418314e-05, + "loss": 3.3427, + "step": 6933 + }, + { + "epoch": 0.4304426097212738, + "grad_norm": 0.40093291240633466, + "learning_rate": 9.942535515301932e-05, + "loss": 3.4336, + "step": 6934 + }, + { + "epoch": 0.4305046868210317, + "grad_norm": 0.2872991129569711, + "learning_rate": 9.94248090539914e-05, + "loss": 3.4197, + "step": 6935 + }, + { + "epoch": 0.43056676392078963, + "grad_norm": 0.26156207277178767, + "learning_rate": 9.942426269710227e-05, + "loss": 3.3092, + "step": 6936 + }, + { + "epoch": 0.4306288410205475, + "grad_norm": 0.21321877727874577, + "learning_rate": 9.942371608235476e-05, + "loss": 3.494, + "step": 6937 + }, + { + "epoch": 0.4306909181203054, + "grad_norm": 0.25570270604729534, + "learning_rate": 9.942316920975172e-05, + "loss": 3.4204, + "step": 6938 + }, + { + "epoch": 0.43075299522006333, + "grad_norm": 0.276420493764339, + "learning_rate": 9.942262207929602e-05, + "loss": 3.4837, + "step": 6939 + }, + { + "epoch": 0.4308150723198212, + "grad_norm": 0.19755318898733792, + "learning_rate": 9.942207469099049e-05, + "loss": 3.3833, + "step": 6940 + }, + { + "epoch": 0.4308771494195791, + "grad_norm": 0.23784918776548478, + "learning_rate": 9.9421527044838e-05, + "loss": 3.3116, + "step": 6941 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.24484887198694102, + "learning_rate": 9.942097914084142e-05, + "loss": 3.3981, + "step": 6942 + }, + { + "epoch": 0.4310013036190949, + "grad_norm": 0.3120539700789939, + "learning_rate": 9.942043097900358e-05, + "loss": 3.4951, + "step": 6943 + }, + { + "epoch": 0.4310633807188528, + "grad_norm": 0.3255761697147227, + "learning_rate": 9.941988255932736e-05, + "loss": 3.5773, + "step": 6944 + }, + { + "epoch": 0.4311254578186107, + "grad_norm": 0.24109813044654632, + "learning_rate": 9.941933388181562e-05, + "loss": 3.4359, + "step": 6945 + }, + { + "epoch": 0.4311875349183686, + "grad_norm": 0.3196559783818039, + "learning_rate": 9.941878494647121e-05, + "loss": 3.4329, + "step": 6946 + }, + { + "epoch": 0.4312496120181265, + "grad_norm": 0.210915573181401, + "learning_rate": 9.9418235753297e-05, + "loss": 3.4499, + "step": 6947 + }, + { + "epoch": 0.4313116891178844, + "grad_norm": 0.21314868684903768, + "learning_rate": 9.941768630229586e-05, + "loss": 3.4571, + "step": 6948 + }, + { + "epoch": 0.4313737662176423, + "grad_norm": 0.22724989497305076, + "learning_rate": 9.941713659347067e-05, + "loss": 3.4379, + "step": 6949 + }, + { + "epoch": 0.4314358433174002, + "grad_norm": 0.2553569286495114, + "learning_rate": 9.941658662682426e-05, + "loss": 3.3668, + "step": 6950 + }, + { + "epoch": 0.4314979204171581, + "grad_norm": 0.4315389107700909, + "learning_rate": 9.941603640235952e-05, + "loss": 3.3947, + "step": 6951 + }, + { + "epoch": 0.431559997516916, + "grad_norm": 0.347026468834023, + "learning_rate": 9.941548592007934e-05, + "loss": 3.4045, + "step": 6952 + }, + { + "epoch": 0.4316220746166739, + "grad_norm": 0.32352992501766376, + "learning_rate": 9.941493517998657e-05, + "loss": 3.4095, + "step": 6953 + }, + { + "epoch": 0.4316841517164318, + "grad_norm": 0.30864332501055985, + "learning_rate": 9.941438418208407e-05, + "loss": 3.4459, + "step": 6954 + }, + { + "epoch": 0.4317462288161897, + "grad_norm": 0.46053057019699356, + "learning_rate": 9.941383292637475e-05, + "loss": 3.43, + "step": 6955 + }, + { + "epoch": 0.4318083059159476, + "grad_norm": 0.3592267767369053, + "learning_rate": 9.941328141286146e-05, + "loss": 3.3677, + "step": 6956 + }, + { + "epoch": 0.4318703830157055, + "grad_norm": 0.23292928493170054, + "learning_rate": 9.941272964154708e-05, + "loss": 3.3921, + "step": 6957 + }, + { + "epoch": 0.4319324601154634, + "grad_norm": 0.3633360872616356, + "learning_rate": 9.941217761243451e-05, + "loss": 3.4801, + "step": 6958 + }, + { + "epoch": 0.4319945372152213, + "grad_norm": 0.3582382467808316, + "learning_rate": 9.941162532552662e-05, + "loss": 3.4093, + "step": 6959 + }, + { + "epoch": 0.4320566143149792, + "grad_norm": 0.5643132970459764, + "learning_rate": 9.941107278082626e-05, + "loss": 3.5519, + "step": 6960 + }, + { + "epoch": 0.4321186914147371, + "grad_norm": 0.4785239394107866, + "learning_rate": 9.941051997833635e-05, + "loss": 3.4309, + "step": 6961 + }, + { + "epoch": 0.432180768514495, + "grad_norm": 0.31556659715243784, + "learning_rate": 9.940996691805978e-05, + "loss": 3.4147, + "step": 6962 + }, + { + "epoch": 0.4322428456142529, + "grad_norm": 0.3993228537733311, + "learning_rate": 9.940941359999938e-05, + "loss": 3.429, + "step": 6963 + }, + { + "epoch": 0.4323049227140108, + "grad_norm": 0.29630645469560773, + "learning_rate": 9.94088600241581e-05, + "loss": 3.4397, + "step": 6964 + }, + { + "epoch": 0.4323669998137687, + "grad_norm": 0.2513457525520462, + "learning_rate": 9.94083061905388e-05, + "loss": 3.486, + "step": 6965 + }, + { + "epoch": 0.4324290769135266, + "grad_norm": 0.3640280195615676, + "learning_rate": 9.940775209914438e-05, + "loss": 3.4234, + "step": 6966 + }, + { + "epoch": 0.4324911540132845, + "grad_norm": 0.36855980930701426, + "learning_rate": 9.94071977499777e-05, + "loss": 3.4528, + "step": 6967 + }, + { + "epoch": 0.4325532311130424, + "grad_norm": 0.2137668147639883, + "learning_rate": 9.940664314304169e-05, + "loss": 3.3874, + "step": 6968 + }, + { + "epoch": 0.4326153082128003, + "grad_norm": 0.2511341624328246, + "learning_rate": 9.940608827833923e-05, + "loss": 3.3808, + "step": 6969 + }, + { + "epoch": 0.43267738531255817, + "grad_norm": 0.34364199025199416, + "learning_rate": 9.940553315587321e-05, + "loss": 3.4928, + "step": 6970 + }, + { + "epoch": 0.4327394624123161, + "grad_norm": 0.3362544693525727, + "learning_rate": 9.940497777564653e-05, + "loss": 3.3531, + "step": 6971 + }, + { + "epoch": 0.432801539512074, + "grad_norm": 0.19994184684201813, + "learning_rate": 9.940442213766209e-05, + "loss": 3.4015, + "step": 6972 + }, + { + "epoch": 0.43286361661183187, + "grad_norm": 0.4033795144733108, + "learning_rate": 9.940386624192278e-05, + "loss": 3.4105, + "step": 6973 + }, + { + "epoch": 0.4329256937115898, + "grad_norm": 0.22620216851948868, + "learning_rate": 9.94033100884315e-05, + "loss": 3.4048, + "step": 6974 + }, + { + "epoch": 0.4329877708113477, + "grad_norm": 0.33068167227795764, + "learning_rate": 9.940275367719118e-05, + "loss": 3.4444, + "step": 6975 + }, + { + "epoch": 0.43304984791110557, + "grad_norm": 0.2001597861425045, + "learning_rate": 9.940219700820471e-05, + "loss": 3.3512, + "step": 6976 + }, + { + "epoch": 0.4331119250108635, + "grad_norm": 0.39572128374737114, + "learning_rate": 9.940164008147494e-05, + "loss": 3.4451, + "step": 6977 + }, + { + "epoch": 0.4331740021106214, + "grad_norm": 0.320446777111244, + "learning_rate": 9.940108289700486e-05, + "loss": 3.4652, + "step": 6978 + }, + { + "epoch": 0.43323607921037927, + "grad_norm": 0.24438120623120102, + "learning_rate": 9.940052545479733e-05, + "loss": 3.3344, + "step": 6979 + }, + { + "epoch": 0.4332981563101372, + "grad_norm": 0.30716720851654455, + "learning_rate": 9.939996775485524e-05, + "loss": 3.5519, + "step": 6980 + }, + { + "epoch": 0.4333602334098951, + "grad_norm": 0.32732460630607774, + "learning_rate": 9.939940979718157e-05, + "loss": 3.4135, + "step": 6981 + }, + { + "epoch": 0.43342231050965296, + "grad_norm": 0.2370876764238805, + "learning_rate": 9.939885158177915e-05, + "loss": 3.3275, + "step": 6982 + }, + { + "epoch": 0.4334843876094109, + "grad_norm": 0.32131771203403015, + "learning_rate": 9.939829310865096e-05, + "loss": 3.4598, + "step": 6983 + }, + { + "epoch": 0.4335464647091688, + "grad_norm": 0.2677826760866528, + "learning_rate": 9.939773437779986e-05, + "loss": 3.4554, + "step": 6984 + }, + { + "epoch": 0.43360854180892666, + "grad_norm": 0.2173499107310417, + "learning_rate": 9.939717538922879e-05, + "loss": 3.4011, + "step": 6985 + }, + { + "epoch": 0.4336706189086846, + "grad_norm": 0.27801030612270483, + "learning_rate": 9.939661614294067e-05, + "loss": 3.3816, + "step": 6986 + }, + { + "epoch": 0.4337326960084425, + "grad_norm": 0.4375345182425496, + "learning_rate": 9.939605663893842e-05, + "loss": 3.4978, + "step": 6987 + }, + { + "epoch": 0.43379477310820036, + "grad_norm": 0.4503648153355704, + "learning_rate": 9.939549687722493e-05, + "loss": 3.3836, + "step": 6988 + }, + { + "epoch": 0.4338568502079583, + "grad_norm": 0.279186667193214, + "learning_rate": 9.939493685780317e-05, + "loss": 3.4277, + "step": 6989 + }, + { + "epoch": 0.4339189273077162, + "grad_norm": 0.3693006259573583, + "learning_rate": 9.9394376580676e-05, + "loss": 3.4048, + "step": 6990 + }, + { + "epoch": 0.43398100440747406, + "grad_norm": 0.5262704735622015, + "learning_rate": 9.93938160458464e-05, + "loss": 3.4508, + "step": 6991 + }, + { + "epoch": 0.434043081507232, + "grad_norm": 0.3671123549492024, + "learning_rate": 9.939325525331726e-05, + "loss": 3.3933, + "step": 6992 + }, + { + "epoch": 0.4341051586069899, + "grad_norm": 0.32808861076573026, + "learning_rate": 9.939269420309154e-05, + "loss": 3.3954, + "step": 6993 + }, + { + "epoch": 0.43416723570674776, + "grad_norm": 0.466340021003099, + "learning_rate": 9.939213289517212e-05, + "loss": 3.4519, + "step": 6994 + }, + { + "epoch": 0.4342293128065057, + "grad_norm": 0.3401226455883172, + "learning_rate": 9.939157132956196e-05, + "loss": 3.4, + "step": 6995 + }, + { + "epoch": 0.4342913899062636, + "grad_norm": 0.38888375339447495, + "learning_rate": 9.939100950626398e-05, + "loss": 3.3228, + "step": 6996 + }, + { + "epoch": 0.43435346700602145, + "grad_norm": 0.2690427472066919, + "learning_rate": 9.93904474252811e-05, + "loss": 3.4508, + "step": 6997 + }, + { + "epoch": 0.43441554410577937, + "grad_norm": 0.30414001476541164, + "learning_rate": 9.938988508661628e-05, + "loss": 3.4578, + "step": 6998 + }, + { + "epoch": 0.4344776212055373, + "grad_norm": 0.2585713460385056, + "learning_rate": 9.938932249027244e-05, + "loss": 3.4439, + "step": 6999 + }, + { + "epoch": 0.43453969830529515, + "grad_norm": 0.3321703226882305, + "learning_rate": 9.938875963625252e-05, + "loss": 3.4089, + "step": 7000 + }, + { + "epoch": 0.43460177540505307, + "grad_norm": 0.25485717527576723, + "learning_rate": 9.938819652455943e-05, + "loss": 3.4084, + "step": 7001 + }, + { + "epoch": 0.434663852504811, + "grad_norm": 0.2663837283506625, + "learning_rate": 9.938763315519616e-05, + "loss": 3.3659, + "step": 7002 + }, + { + "epoch": 0.43472592960456885, + "grad_norm": 0.3062359271416713, + "learning_rate": 9.938706952816562e-05, + "loss": 3.4609, + "step": 7003 + }, + { + "epoch": 0.43478800670432677, + "grad_norm": 0.3191952293308683, + "learning_rate": 9.938650564347071e-05, + "loss": 3.5185, + "step": 7004 + }, + { + "epoch": 0.4348500838040847, + "grad_norm": 0.27438898752572305, + "learning_rate": 9.938594150111444e-05, + "loss": 3.3662, + "step": 7005 + }, + { + "epoch": 0.43491216090384255, + "grad_norm": 0.23841444576495252, + "learning_rate": 9.938537710109972e-05, + "loss": 3.3915, + "step": 7006 + }, + { + "epoch": 0.43497423800360047, + "grad_norm": 0.2357048611929438, + "learning_rate": 9.93848124434295e-05, + "loss": 3.425, + "step": 7007 + }, + { + "epoch": 0.4350363151033584, + "grad_norm": 0.3289904987143027, + "learning_rate": 9.93842475281067e-05, + "loss": 3.4896, + "step": 7008 + }, + { + "epoch": 0.43509839220311625, + "grad_norm": 0.2610710869980345, + "learning_rate": 9.938368235513433e-05, + "loss": 3.4978, + "step": 7009 + }, + { + "epoch": 0.43516046930287416, + "grad_norm": 0.2811883885398608, + "learning_rate": 9.938311692451528e-05, + "loss": 3.4362, + "step": 7010 + }, + { + "epoch": 0.4352225464026321, + "grad_norm": 0.25089615895785145, + "learning_rate": 9.938255123625253e-05, + "loss": 3.4167, + "step": 7011 + }, + { + "epoch": 0.43528462350238994, + "grad_norm": 0.24303719455001527, + "learning_rate": 9.938198529034901e-05, + "loss": 3.4633, + "step": 7012 + }, + { + "epoch": 0.43534670060214786, + "grad_norm": 0.4342869381180393, + "learning_rate": 9.938141908680769e-05, + "loss": 3.4436, + "step": 7013 + }, + { + "epoch": 0.4354087777019058, + "grad_norm": 0.4694516652427828, + "learning_rate": 9.938085262563153e-05, + "loss": 3.4994, + "step": 7014 + }, + { + "epoch": 0.43547085480166364, + "grad_norm": 0.30195449542564906, + "learning_rate": 9.938028590682347e-05, + "loss": 3.375, + "step": 7015 + }, + { + "epoch": 0.43553293190142156, + "grad_norm": 0.2786499748041048, + "learning_rate": 9.937971893038647e-05, + "loss": 3.4042, + "step": 7016 + }, + { + "epoch": 0.4355950090011795, + "grad_norm": 0.30787598153800794, + "learning_rate": 9.937915169632348e-05, + "loss": 3.3919, + "step": 7017 + }, + { + "epoch": 0.43565708610093734, + "grad_norm": 0.23237300807874806, + "learning_rate": 9.937858420463748e-05, + "loss": 3.4695, + "step": 7018 + }, + { + "epoch": 0.43571916320069526, + "grad_norm": 0.44840638056051385, + "learning_rate": 9.93780164553314e-05, + "loss": 3.4057, + "step": 7019 + }, + { + "epoch": 0.4357812403004532, + "grad_norm": 0.43026451901592483, + "learning_rate": 9.937744844840824e-05, + "loss": 3.4874, + "step": 7020 + }, + { + "epoch": 0.43584331740021104, + "grad_norm": 0.36612456256766085, + "learning_rate": 9.937688018387093e-05, + "loss": 3.4588, + "step": 7021 + }, + { + "epoch": 0.43590539449996896, + "grad_norm": 0.35655764547214464, + "learning_rate": 9.937631166172246e-05, + "loss": 3.3815, + "step": 7022 + }, + { + "epoch": 0.4359674715997269, + "grad_norm": 0.41748071860144204, + "learning_rate": 9.937574288196578e-05, + "loss": 3.5162, + "step": 7023 + }, + { + "epoch": 0.43602954869948474, + "grad_norm": 0.36634140742281135, + "learning_rate": 9.937517384460388e-05, + "loss": 3.4537, + "step": 7024 + }, + { + "epoch": 0.43609162579924265, + "grad_norm": 0.42718981562937713, + "learning_rate": 9.937460454963969e-05, + "loss": 3.4168, + "step": 7025 + }, + { + "epoch": 0.4361537028990006, + "grad_norm": 0.31035400495294285, + "learning_rate": 9.937403499707622e-05, + "loss": 3.4673, + "step": 7026 + }, + { + "epoch": 0.43621577999875844, + "grad_norm": 0.5345203491990607, + "learning_rate": 9.937346518691641e-05, + "loss": 3.4934, + "step": 7027 + }, + { + "epoch": 0.43627785709851635, + "grad_norm": 0.40080290246962363, + "learning_rate": 9.937289511916325e-05, + "loss": 3.396, + "step": 7028 + }, + { + "epoch": 0.43633993419827427, + "grad_norm": 0.33646977196530514, + "learning_rate": 9.93723247938197e-05, + "loss": 3.4323, + "step": 7029 + }, + { + "epoch": 0.43640201129803213, + "grad_norm": 0.31530429721669084, + "learning_rate": 9.937175421088876e-05, + "loss": 3.3422, + "step": 7030 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.3915097897269026, + "learning_rate": 9.93711833703734e-05, + "loss": 3.3645, + "step": 7031 + }, + { + "epoch": 0.43652616549754797, + "grad_norm": 0.35936598144493404, + "learning_rate": 9.937061227227657e-05, + "loss": 3.4526, + "step": 7032 + }, + { + "epoch": 0.43658824259730583, + "grad_norm": 0.3802569393111646, + "learning_rate": 9.937004091660129e-05, + "loss": 3.3768, + "step": 7033 + }, + { + "epoch": 0.43665031969706375, + "grad_norm": 0.2377849554008057, + "learning_rate": 9.936946930335051e-05, + "loss": 3.3882, + "step": 7034 + }, + { + "epoch": 0.43671239679682167, + "grad_norm": 0.24358358540594252, + "learning_rate": 9.936889743252723e-05, + "loss": 3.4935, + "step": 7035 + }, + { + "epoch": 0.43677447389657953, + "grad_norm": 0.27454816860549647, + "learning_rate": 9.936832530413442e-05, + "loss": 3.5445, + "step": 7036 + }, + { + "epoch": 0.43683655099633745, + "grad_norm": 0.22344544736693325, + "learning_rate": 9.936775291817508e-05, + "loss": 3.3364, + "step": 7037 + }, + { + "epoch": 0.43689862809609536, + "grad_norm": 0.2557155262998999, + "learning_rate": 9.936718027465219e-05, + "loss": 3.4153, + "step": 7038 + }, + { + "epoch": 0.4369607051958532, + "grad_norm": 0.2525860134300563, + "learning_rate": 9.936660737356873e-05, + "loss": 3.4804, + "step": 7039 + }, + { + "epoch": 0.43702278229561115, + "grad_norm": 0.24359097214314493, + "learning_rate": 9.93660342149277e-05, + "loss": 3.3784, + "step": 7040 + }, + { + "epoch": 0.43708485939536906, + "grad_norm": 0.2822133757470579, + "learning_rate": 9.936546079873208e-05, + "loss": 3.4313, + "step": 7041 + }, + { + "epoch": 0.4371469364951269, + "grad_norm": 0.19466575354150747, + "learning_rate": 9.936488712498486e-05, + "loss": 3.292, + "step": 7042 + }, + { + "epoch": 0.43720901359488484, + "grad_norm": 0.20763440711168427, + "learning_rate": 9.936431319368905e-05, + "loss": 3.4527, + "step": 7043 + }, + { + "epoch": 0.43727109069464276, + "grad_norm": 0.2054542634302758, + "learning_rate": 9.936373900484764e-05, + "loss": 3.4103, + "step": 7044 + }, + { + "epoch": 0.4373331677944006, + "grad_norm": 0.646405425538, + "learning_rate": 9.936316455846362e-05, + "loss": 3.3557, + "step": 7045 + }, + { + "epoch": 0.43739524489415854, + "grad_norm": 0.3150711029087466, + "learning_rate": 9.936258985453997e-05, + "loss": 3.4807, + "step": 7046 + }, + { + "epoch": 0.43745732199391646, + "grad_norm": 0.3803104455119727, + "learning_rate": 9.936201489307972e-05, + "loss": 3.4211, + "step": 7047 + }, + { + "epoch": 0.4375193990936743, + "grad_norm": 0.5027975853319537, + "learning_rate": 9.936143967408585e-05, + "loss": 3.3993, + "step": 7048 + }, + { + "epoch": 0.43758147619343224, + "grad_norm": 0.4809711589771493, + "learning_rate": 9.936086419756137e-05, + "loss": 3.3494, + "step": 7049 + }, + { + "epoch": 0.43764355329319016, + "grad_norm": 0.4075911282817203, + "learning_rate": 9.936028846350928e-05, + "loss": 3.3821, + "step": 7050 + }, + { + "epoch": 0.437705630392948, + "grad_norm": 0.31481584515213396, + "learning_rate": 9.935971247193257e-05, + "loss": 3.4269, + "step": 7051 + }, + { + "epoch": 0.43776770749270594, + "grad_norm": 0.23720828794521037, + "learning_rate": 9.935913622283426e-05, + "loss": 3.344, + "step": 7052 + }, + { + "epoch": 0.43782978459246386, + "grad_norm": 0.22548142195982993, + "learning_rate": 9.935855971621736e-05, + "loss": 3.5172, + "step": 7053 + }, + { + "epoch": 0.4378918616922217, + "grad_norm": 0.26745648647397113, + "learning_rate": 9.935798295208488e-05, + "loss": 3.362, + "step": 7054 + }, + { + "epoch": 0.43795393879197964, + "grad_norm": 0.3467514393659007, + "learning_rate": 9.935740593043982e-05, + "loss": 3.446, + "step": 7055 + }, + { + "epoch": 0.43801601589173755, + "grad_norm": 0.417418965053393, + "learning_rate": 9.935682865128518e-05, + "loss": 3.4398, + "step": 7056 + }, + { + "epoch": 0.4380780929914954, + "grad_norm": 0.5694601142738036, + "learning_rate": 9.935625111462398e-05, + "loss": 3.3974, + "step": 7057 + }, + { + "epoch": 0.43814017009125333, + "grad_norm": 0.641441034821144, + "learning_rate": 9.935567332045926e-05, + "loss": 3.4949, + "step": 7058 + }, + { + "epoch": 0.43820224719101125, + "grad_norm": 0.2594104288791917, + "learning_rate": 9.935509526879398e-05, + "loss": 3.5245, + "step": 7059 + }, + { + "epoch": 0.4382643242907691, + "grad_norm": 0.43755292572326665, + "learning_rate": 9.93545169596312e-05, + "loss": 3.446, + "step": 7060 + }, + { + "epoch": 0.43832640139052703, + "grad_norm": 0.37053897949789194, + "learning_rate": 9.935393839297393e-05, + "loss": 3.3192, + "step": 7061 + }, + { + "epoch": 0.43838847849028495, + "grad_norm": 0.28580289184149704, + "learning_rate": 9.935335956882519e-05, + "loss": 3.2964, + "step": 7062 + }, + { + "epoch": 0.4384505555900428, + "grad_norm": 0.2377342684316506, + "learning_rate": 9.935278048718797e-05, + "loss": 3.3916, + "step": 7063 + }, + { + "epoch": 0.43851263268980073, + "grad_norm": 0.26366230982632216, + "learning_rate": 9.935220114806533e-05, + "loss": 3.3542, + "step": 7064 + }, + { + "epoch": 0.43857470978955865, + "grad_norm": 0.34210591105615024, + "learning_rate": 9.935162155146028e-05, + "loss": 3.4066, + "step": 7065 + }, + { + "epoch": 0.4386367868893165, + "grad_norm": 0.24125777496399858, + "learning_rate": 9.935104169737581e-05, + "loss": 3.4355, + "step": 7066 + }, + { + "epoch": 0.43869886398907443, + "grad_norm": 0.3805424472000692, + "learning_rate": 9.9350461585815e-05, + "loss": 3.3113, + "step": 7067 + }, + { + "epoch": 0.43876094108883235, + "grad_norm": 0.2519394479040816, + "learning_rate": 9.934988121678083e-05, + "loss": 3.3803, + "step": 7068 + }, + { + "epoch": 0.4388230181885902, + "grad_norm": 0.25282353121310963, + "learning_rate": 9.934930059027638e-05, + "loss": 3.5086, + "step": 7069 + }, + { + "epoch": 0.4388850952883481, + "grad_norm": 0.21656488773686522, + "learning_rate": 9.934871970630464e-05, + "loss": 3.3965, + "step": 7070 + }, + { + "epoch": 0.43894717238810604, + "grad_norm": 0.20218118227325743, + "learning_rate": 9.934813856486864e-05, + "loss": 3.459, + "step": 7071 + }, + { + "epoch": 0.4390092494878639, + "grad_norm": 0.23181080027101358, + "learning_rate": 9.934755716597142e-05, + "loss": 3.4277, + "step": 7072 + }, + { + "epoch": 0.4390713265876218, + "grad_norm": 0.2305758114439091, + "learning_rate": 9.934697550961602e-05, + "loss": 3.4083, + "step": 7073 + }, + { + "epoch": 0.43913340368737974, + "grad_norm": 0.23488023661043667, + "learning_rate": 9.934639359580548e-05, + "loss": 3.4321, + "step": 7074 + }, + { + "epoch": 0.4391954807871376, + "grad_norm": 0.2024713139908174, + "learning_rate": 9.93458114245428e-05, + "loss": 3.5108, + "step": 7075 + }, + { + "epoch": 0.4392575578868955, + "grad_norm": 0.20143785762843888, + "learning_rate": 9.934522899583106e-05, + "loss": 3.3535, + "step": 7076 + }, + { + "epoch": 0.43931963498665344, + "grad_norm": 0.2920381005975136, + "learning_rate": 9.934464630967328e-05, + "loss": 3.3706, + "step": 7077 + }, + { + "epoch": 0.4393817120864113, + "grad_norm": 0.29099496345651277, + "learning_rate": 9.934406336607251e-05, + "loss": 3.263, + "step": 7078 + }, + { + "epoch": 0.4394437891861692, + "grad_norm": 0.28182161047585846, + "learning_rate": 9.934348016503176e-05, + "loss": 3.5388, + "step": 7079 + }, + { + "epoch": 0.43950586628592714, + "grad_norm": 0.21031811518208726, + "learning_rate": 9.934289670655413e-05, + "loss": 3.3289, + "step": 7080 + }, + { + "epoch": 0.439567943385685, + "grad_norm": 0.22932316663475602, + "learning_rate": 9.93423129906426e-05, + "loss": 3.3734, + "step": 7081 + }, + { + "epoch": 0.4396300204854429, + "grad_norm": 0.23728285795049045, + "learning_rate": 9.934172901730025e-05, + "loss": 3.4793, + "step": 7082 + }, + { + "epoch": 0.43969209758520084, + "grad_norm": 0.2541217622930675, + "learning_rate": 9.934114478653012e-05, + "loss": 3.4986, + "step": 7083 + }, + { + "epoch": 0.4397541746849587, + "grad_norm": 0.22446701679997158, + "learning_rate": 9.934056029833526e-05, + "loss": 3.4052, + "step": 7084 + }, + { + "epoch": 0.4398162517847166, + "grad_norm": 0.3497444083855475, + "learning_rate": 9.933997555271872e-05, + "loss": 3.3331, + "step": 7085 + }, + { + "epoch": 0.43987832888447453, + "grad_norm": 0.266152531240331, + "learning_rate": 9.933939054968355e-05, + "loss": 3.3672, + "step": 7086 + }, + { + "epoch": 0.4399404059842324, + "grad_norm": 0.21391774803562472, + "learning_rate": 9.93388052892328e-05, + "loss": 3.4153, + "step": 7087 + }, + { + "epoch": 0.4400024830839903, + "grad_norm": 0.27635191480943977, + "learning_rate": 9.933821977136955e-05, + "loss": 3.3823, + "step": 7088 + }, + { + "epoch": 0.44006456018374823, + "grad_norm": 0.32030606844888204, + "learning_rate": 9.93376339960968e-05, + "loss": 3.3435, + "step": 7089 + }, + { + "epoch": 0.4401266372835061, + "grad_norm": 0.3688810737869913, + "learning_rate": 9.933704796341765e-05, + "loss": 3.329, + "step": 7090 + }, + { + "epoch": 0.440188714383264, + "grad_norm": 0.25827569965968983, + "learning_rate": 9.933646167333514e-05, + "loss": 3.3714, + "step": 7091 + }, + { + "epoch": 0.44025079148302193, + "grad_norm": 0.3014143930920179, + "learning_rate": 9.933587512585234e-05, + "loss": 3.4292, + "step": 7092 + }, + { + "epoch": 0.4403128685827798, + "grad_norm": 0.2151977829887435, + "learning_rate": 9.933528832097229e-05, + "loss": 3.4562, + "step": 7093 + }, + { + "epoch": 0.4403749456825377, + "grad_norm": 0.3794530678541499, + "learning_rate": 9.933470125869806e-05, + "loss": 3.4322, + "step": 7094 + }, + { + "epoch": 0.44043702278229563, + "grad_norm": 0.28102616834123606, + "learning_rate": 9.933411393903274e-05, + "loss": 3.4379, + "step": 7095 + }, + { + "epoch": 0.4404990998820535, + "grad_norm": 0.2688266001810266, + "learning_rate": 9.933352636197936e-05, + "loss": 3.4425, + "step": 7096 + }, + { + "epoch": 0.4405611769818114, + "grad_norm": 0.21599982205577867, + "learning_rate": 9.933293852754099e-05, + "loss": 3.3333, + "step": 7097 + }, + { + "epoch": 0.4406232540815693, + "grad_norm": 0.378991343588095, + "learning_rate": 9.933235043572072e-05, + "loss": 3.4021, + "step": 7098 + }, + { + "epoch": 0.4406853311813272, + "grad_norm": 0.4384112405016405, + "learning_rate": 9.933176208652158e-05, + "loss": 3.4643, + "step": 7099 + }, + { + "epoch": 0.4407474082810851, + "grad_norm": 0.26894509807902245, + "learning_rate": 9.933117347994668e-05, + "loss": 3.3394, + "step": 7100 + }, + { + "epoch": 0.440809485380843, + "grad_norm": 0.39887120446694874, + "learning_rate": 9.933058461599906e-05, + "loss": 3.3952, + "step": 7101 + }, + { + "epoch": 0.4408715624806009, + "grad_norm": 0.3619915207691748, + "learning_rate": 9.93299954946818e-05, + "loss": 3.5063, + "step": 7102 + }, + { + "epoch": 0.4409336395803588, + "grad_norm": 0.40457985598873347, + "learning_rate": 9.932940611599799e-05, + "loss": 3.372, + "step": 7103 + }, + { + "epoch": 0.4409957166801167, + "grad_norm": 0.43186440859227154, + "learning_rate": 9.932881647995069e-05, + "loss": 3.4355, + "step": 7104 + }, + { + "epoch": 0.4410577937798746, + "grad_norm": 0.4755536642227611, + "learning_rate": 9.932822658654298e-05, + "loss": 3.4729, + "step": 7105 + }, + { + "epoch": 0.4411198708796325, + "grad_norm": 0.3034407342842507, + "learning_rate": 9.932763643577793e-05, + "loss": 3.3967, + "step": 7106 + }, + { + "epoch": 0.4411819479793904, + "grad_norm": 0.28810310959149094, + "learning_rate": 9.932704602765865e-05, + "loss": 3.3188, + "step": 7107 + }, + { + "epoch": 0.4412440250791483, + "grad_norm": 0.2496853800768371, + "learning_rate": 9.932645536218817e-05, + "loss": 3.416, + "step": 7108 + }, + { + "epoch": 0.4413061021789062, + "grad_norm": 0.2505245071004296, + "learning_rate": 9.932586443936962e-05, + "loss": 3.3719, + "step": 7109 + }, + { + "epoch": 0.4413681792786641, + "grad_norm": 0.3759278501190114, + "learning_rate": 9.932527325920604e-05, + "loss": 3.4177, + "step": 7110 + }, + { + "epoch": 0.441430256378422, + "grad_norm": 0.29324665875657446, + "learning_rate": 9.932468182170054e-05, + "loss": 3.4456, + "step": 7111 + }, + { + "epoch": 0.4414923334781799, + "grad_norm": 0.41144973609086993, + "learning_rate": 9.932409012685621e-05, + "loss": 3.4576, + "step": 7112 + }, + { + "epoch": 0.4415544105779378, + "grad_norm": 0.2760771587634937, + "learning_rate": 9.932349817467612e-05, + "loss": 3.2951, + "step": 7113 + }, + { + "epoch": 0.4416164876776957, + "grad_norm": 0.236612338204424, + "learning_rate": 9.932290596516337e-05, + "loss": 3.2737, + "step": 7114 + }, + { + "epoch": 0.4416785647774536, + "grad_norm": 0.2841868448830867, + "learning_rate": 9.932231349832103e-05, + "loss": 3.3741, + "step": 7115 + }, + { + "epoch": 0.4417406418772115, + "grad_norm": 0.3960986586648428, + "learning_rate": 9.932172077415224e-05, + "loss": 3.4343, + "step": 7116 + }, + { + "epoch": 0.4418027189769694, + "grad_norm": 0.3199299877661191, + "learning_rate": 9.932112779266005e-05, + "loss": 3.4857, + "step": 7117 + }, + { + "epoch": 0.4418647960767273, + "grad_norm": 0.324628043068308, + "learning_rate": 9.932053455384755e-05, + "loss": 3.3792, + "step": 7118 + }, + { + "epoch": 0.4419268731764852, + "grad_norm": 0.27941934528047224, + "learning_rate": 9.931994105771785e-05, + "loss": 3.4698, + "step": 7119 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 0.2542462493381654, + "learning_rate": 9.931934730427405e-05, + "loss": 3.3156, + "step": 7120 + }, + { + "epoch": 0.442051027376001, + "grad_norm": 0.373407459247817, + "learning_rate": 9.931875329351922e-05, + "loss": 3.3855, + "step": 7121 + }, + { + "epoch": 0.4421131044757589, + "grad_norm": 0.2172832874689212, + "learning_rate": 9.931815902545651e-05, + "loss": 3.4048, + "step": 7122 + }, + { + "epoch": 0.4421751815755168, + "grad_norm": 0.3669180591665704, + "learning_rate": 9.931756450008899e-05, + "loss": 3.4854, + "step": 7123 + }, + { + "epoch": 0.4422372586752747, + "grad_norm": 0.28442048819877114, + "learning_rate": 9.931696971741976e-05, + "loss": 3.5306, + "step": 7124 + }, + { + "epoch": 0.4422993357750326, + "grad_norm": 0.22753758082091602, + "learning_rate": 9.931637467745192e-05, + "loss": 3.3602, + "step": 7125 + }, + { + "epoch": 0.44236141287479047, + "grad_norm": 0.29955070995542127, + "learning_rate": 9.93157793801886e-05, + "loss": 3.3044, + "step": 7126 + }, + { + "epoch": 0.4424234899745484, + "grad_norm": 0.33927560868307377, + "learning_rate": 9.931518382563286e-05, + "loss": 3.4995, + "step": 7127 + }, + { + "epoch": 0.4424855670743063, + "grad_norm": 0.20699061588771275, + "learning_rate": 9.931458801378784e-05, + "loss": 3.4039, + "step": 7128 + }, + { + "epoch": 0.44254764417406417, + "grad_norm": 0.2909304731075631, + "learning_rate": 9.931399194465665e-05, + "loss": 3.3199, + "step": 7129 + }, + { + "epoch": 0.4426097212738221, + "grad_norm": 0.40247463208572765, + "learning_rate": 9.93133956182424e-05, + "loss": 3.422, + "step": 7130 + }, + { + "epoch": 0.44267179837358, + "grad_norm": 0.26917039515557456, + "learning_rate": 9.931279903454819e-05, + "loss": 3.3883, + "step": 7131 + }, + { + "epoch": 0.44273387547333787, + "grad_norm": 0.27207350444906686, + "learning_rate": 9.931220219357714e-05, + "loss": 3.4518, + "step": 7132 + }, + { + "epoch": 0.4427959525730958, + "grad_norm": 0.30344197424154346, + "learning_rate": 9.931160509533235e-05, + "loss": 3.5015, + "step": 7133 + }, + { + "epoch": 0.4428580296728537, + "grad_norm": 0.2624065414450716, + "learning_rate": 9.931100773981694e-05, + "loss": 3.4067, + "step": 7134 + }, + { + "epoch": 0.44292010677261157, + "grad_norm": 0.24225673147470445, + "learning_rate": 9.931041012703404e-05, + "loss": 3.3733, + "step": 7135 + }, + { + "epoch": 0.4429821838723695, + "grad_norm": 0.6747626637955518, + "learning_rate": 9.930981225698676e-05, + "loss": 3.4039, + "step": 7136 + }, + { + "epoch": 0.4430442609721274, + "grad_norm": 0.2742915304169038, + "learning_rate": 9.930921412967822e-05, + "loss": 3.2889, + "step": 7137 + }, + { + "epoch": 0.44310633807188526, + "grad_norm": 0.31605306387090465, + "learning_rate": 9.930861574511155e-05, + "loss": 3.3685, + "step": 7138 + }, + { + "epoch": 0.4431684151716432, + "grad_norm": 0.37490209652494333, + "learning_rate": 9.930801710328985e-05, + "loss": 3.3529, + "step": 7139 + }, + { + "epoch": 0.4432304922714011, + "grad_norm": 0.27645834777379713, + "learning_rate": 9.930741820421626e-05, + "loss": 3.4663, + "step": 7140 + }, + { + "epoch": 0.44329256937115896, + "grad_norm": 0.274623370187027, + "learning_rate": 9.930681904789387e-05, + "loss": 3.3684, + "step": 7141 + }, + { + "epoch": 0.4433546464709169, + "grad_norm": 0.24106917288774446, + "learning_rate": 9.930621963432586e-05, + "loss": 3.3385, + "step": 7142 + }, + { + "epoch": 0.4434167235706748, + "grad_norm": 0.23050414637934338, + "learning_rate": 9.930561996351534e-05, + "loss": 3.4305, + "step": 7143 + }, + { + "epoch": 0.44347880067043266, + "grad_norm": 0.24104059049871088, + "learning_rate": 9.930502003546542e-05, + "loss": 3.4674, + "step": 7144 + }, + { + "epoch": 0.4435408777701906, + "grad_norm": 0.31671812440408614, + "learning_rate": 9.930441985017925e-05, + "loss": 3.4309, + "step": 7145 + }, + { + "epoch": 0.4436029548699485, + "grad_norm": 0.2644188648835411, + "learning_rate": 9.930381940765995e-05, + "loss": 3.4576, + "step": 7146 + }, + { + "epoch": 0.44366503196970636, + "grad_norm": 0.2329835835959482, + "learning_rate": 9.930321870791065e-05, + "loss": 3.4743, + "step": 7147 + }, + { + "epoch": 0.4437271090694643, + "grad_norm": 0.36002293516983713, + "learning_rate": 9.93026177509345e-05, + "loss": 3.4121, + "step": 7148 + }, + { + "epoch": 0.4437891861692222, + "grad_norm": 0.19755402812877143, + "learning_rate": 9.93020165367346e-05, + "loss": 3.4487, + "step": 7149 + }, + { + "epoch": 0.44385126326898006, + "grad_norm": 0.3364511267162873, + "learning_rate": 9.930141506531414e-05, + "loss": 3.2984, + "step": 7150 + }, + { + "epoch": 0.443913340368738, + "grad_norm": 0.25381372673360064, + "learning_rate": 9.930081333667622e-05, + "loss": 3.4601, + "step": 7151 + }, + { + "epoch": 0.4439754174684959, + "grad_norm": 0.3698065669679978, + "learning_rate": 9.9300211350824e-05, + "loss": 3.3783, + "step": 7152 + }, + { + "epoch": 0.44403749456825375, + "grad_norm": 0.26558791631002665, + "learning_rate": 9.929960910776057e-05, + "loss": 3.2922, + "step": 7153 + }, + { + "epoch": 0.4440995716680117, + "grad_norm": 0.27852184294063637, + "learning_rate": 9.929900660748915e-05, + "loss": 3.3948, + "step": 7154 + }, + { + "epoch": 0.4441616487677696, + "grad_norm": 0.2777250248103655, + "learning_rate": 9.929840385001284e-05, + "loss": 3.4612, + "step": 7155 + }, + { + "epoch": 0.44422372586752745, + "grad_norm": 0.35450332666135576, + "learning_rate": 9.929780083533477e-05, + "loss": 3.3837, + "step": 7156 + }, + { + "epoch": 0.44428580296728537, + "grad_norm": 0.24676547638980378, + "learning_rate": 9.929719756345813e-05, + "loss": 3.4768, + "step": 7157 + }, + { + "epoch": 0.4443478800670433, + "grad_norm": 0.28444065813688907, + "learning_rate": 9.929659403438602e-05, + "loss": 3.4878, + "step": 7158 + }, + { + "epoch": 0.44440995716680115, + "grad_norm": 0.2572781052792807, + "learning_rate": 9.929599024812162e-05, + "loss": 3.3866, + "step": 7159 + }, + { + "epoch": 0.44447203426655907, + "grad_norm": 0.2298693702417019, + "learning_rate": 9.929538620466808e-05, + "loss": 3.4132, + "step": 7160 + }, + { + "epoch": 0.444534111366317, + "grad_norm": 0.38713382832330356, + "learning_rate": 9.929478190402852e-05, + "loss": 3.3571, + "step": 7161 + }, + { + "epoch": 0.44459618846607485, + "grad_norm": 0.20026217948888403, + "learning_rate": 9.929417734620614e-05, + "loss": 3.4364, + "step": 7162 + }, + { + "epoch": 0.44465826556583277, + "grad_norm": 0.3035793203241702, + "learning_rate": 9.929357253120406e-05, + "loss": 3.4284, + "step": 7163 + }, + { + "epoch": 0.4447203426655907, + "grad_norm": 0.29523588983167837, + "learning_rate": 9.929296745902546e-05, + "loss": 3.3971, + "step": 7164 + }, + { + "epoch": 0.44478241976534855, + "grad_norm": 0.2872631543478334, + "learning_rate": 9.929236212967345e-05, + "loss": 3.4263, + "step": 7165 + }, + { + "epoch": 0.44484449686510646, + "grad_norm": 0.20598383223120598, + "learning_rate": 9.929175654315125e-05, + "loss": 3.4345, + "step": 7166 + }, + { + "epoch": 0.4449065739648644, + "grad_norm": 0.2640014566594454, + "learning_rate": 9.929115069946198e-05, + "loss": 3.3308, + "step": 7167 + }, + { + "epoch": 0.44496865106462224, + "grad_norm": 0.1763837343989275, + "learning_rate": 9.929054459860881e-05, + "loss": 3.3331, + "step": 7168 + }, + { + "epoch": 0.44503072816438016, + "grad_norm": 0.22685430085517483, + "learning_rate": 9.928993824059492e-05, + "loss": 3.4358, + "step": 7169 + }, + { + "epoch": 0.4450928052641381, + "grad_norm": 0.1869658969289752, + "learning_rate": 9.928933162542343e-05, + "loss": 3.3579, + "step": 7170 + }, + { + "epoch": 0.44515488236389594, + "grad_norm": 0.6462411336456602, + "learning_rate": 9.928872475309754e-05, + "loss": 3.4905, + "step": 7171 + }, + { + "epoch": 0.44521695946365386, + "grad_norm": 0.3913212674408837, + "learning_rate": 9.928811762362043e-05, + "loss": 3.4699, + "step": 7172 + }, + { + "epoch": 0.4452790365634118, + "grad_norm": 0.4272657291804926, + "learning_rate": 9.928751023699522e-05, + "loss": 3.3951, + "step": 7173 + }, + { + "epoch": 0.44534111366316964, + "grad_norm": 0.34539471894577284, + "learning_rate": 9.92869025932251e-05, + "loss": 3.2813, + "step": 7174 + }, + { + "epoch": 0.44540319076292756, + "grad_norm": 0.4289384381665989, + "learning_rate": 9.928629469231326e-05, + "loss": 3.3903, + "step": 7175 + }, + { + "epoch": 0.4454652678626855, + "grad_norm": 0.29227474664414954, + "learning_rate": 9.928568653426284e-05, + "loss": 3.3849, + "step": 7176 + }, + { + "epoch": 0.44552734496244334, + "grad_norm": 0.43006751855561737, + "learning_rate": 9.928507811907705e-05, + "loss": 3.4356, + "step": 7177 + }, + { + "epoch": 0.44558942206220126, + "grad_norm": 0.44621443176637887, + "learning_rate": 9.928446944675905e-05, + "loss": 3.3485, + "step": 7178 + }, + { + "epoch": 0.4456514991619592, + "grad_norm": 0.2664784297319063, + "learning_rate": 9.928386051731198e-05, + "loss": 3.4419, + "step": 7179 + }, + { + "epoch": 0.44571357626171704, + "grad_norm": 0.309924034897721, + "learning_rate": 9.928325133073906e-05, + "loss": 3.4435, + "step": 7180 + }, + { + "epoch": 0.44577565336147496, + "grad_norm": 0.20768310383286115, + "learning_rate": 9.928264188704345e-05, + "loss": 3.4298, + "step": 7181 + }, + { + "epoch": 0.4458377304612329, + "grad_norm": 0.6407209882698506, + "learning_rate": 9.928203218622834e-05, + "loss": 3.3955, + "step": 7182 + }, + { + "epoch": 0.44589980756099074, + "grad_norm": 0.28881870763117606, + "learning_rate": 9.92814222282969e-05, + "loss": 3.3853, + "step": 7183 + }, + { + "epoch": 0.44596188466074865, + "grad_norm": 0.31186949782401024, + "learning_rate": 9.92808120132523e-05, + "loss": 3.3031, + "step": 7184 + }, + { + "epoch": 0.44602396176050657, + "grad_norm": 0.30280790358953286, + "learning_rate": 9.928020154109776e-05, + "loss": 3.445, + "step": 7185 + }, + { + "epoch": 0.44608603886026443, + "grad_norm": 0.2954077980640961, + "learning_rate": 9.927959081183645e-05, + "loss": 3.2778, + "step": 7186 + }, + { + "epoch": 0.44614811596002235, + "grad_norm": 0.2667484647247265, + "learning_rate": 9.927897982547154e-05, + "loss": 3.3699, + "step": 7187 + }, + { + "epoch": 0.44621019305978027, + "grad_norm": 0.4305046115067789, + "learning_rate": 9.927836858200623e-05, + "loss": 3.4183, + "step": 7188 + }, + { + "epoch": 0.44627227015953813, + "grad_norm": 0.3299135441126858, + "learning_rate": 9.927775708144371e-05, + "loss": 3.3759, + "step": 7189 + }, + { + "epoch": 0.44633434725929605, + "grad_norm": 0.2814966153700234, + "learning_rate": 9.927714532378718e-05, + "loss": 3.3542, + "step": 7190 + }, + { + "epoch": 0.44639642435905397, + "grad_norm": 0.2939125879264469, + "learning_rate": 9.927653330903981e-05, + "loss": 3.3976, + "step": 7191 + }, + { + "epoch": 0.44645850145881183, + "grad_norm": 0.23973190669641536, + "learning_rate": 9.92759210372048e-05, + "loss": 3.3135, + "step": 7192 + }, + { + "epoch": 0.44652057855856975, + "grad_norm": 0.32821280576162354, + "learning_rate": 9.927530850828535e-05, + "loss": 3.3962, + "step": 7193 + }, + { + "epoch": 0.44658265565832767, + "grad_norm": 0.2883473648716202, + "learning_rate": 9.927469572228465e-05, + "loss": 3.4771, + "step": 7194 + }, + { + "epoch": 0.4466447327580855, + "grad_norm": 0.31901805115990867, + "learning_rate": 9.92740826792059e-05, + "loss": 3.4244, + "step": 7195 + }, + { + "epoch": 0.44670680985784345, + "grad_norm": 0.3895806083821894, + "learning_rate": 9.92734693790523e-05, + "loss": 3.35, + "step": 7196 + }, + { + "epoch": 0.44676888695760136, + "grad_norm": 0.2823802964877694, + "learning_rate": 9.927285582182707e-05, + "loss": 3.3577, + "step": 7197 + }, + { + "epoch": 0.4468309640573592, + "grad_norm": 0.2911122948784355, + "learning_rate": 9.927224200753338e-05, + "loss": 3.4016, + "step": 7198 + }, + { + "epoch": 0.44689304115711714, + "grad_norm": 0.33044001812856555, + "learning_rate": 9.927162793617443e-05, + "loss": 3.4021, + "step": 7199 + }, + { + "epoch": 0.44695511825687506, + "grad_norm": 0.2650761684454573, + "learning_rate": 9.927101360775344e-05, + "loss": 3.4563, + "step": 7200 + }, + { + "epoch": 0.4470171953566329, + "grad_norm": 0.35074407894577453, + "learning_rate": 9.927039902227361e-05, + "loss": 3.3648, + "step": 7201 + }, + { + "epoch": 0.44707927245639084, + "grad_norm": 0.34339638424857144, + "learning_rate": 9.926978417973815e-05, + "loss": 3.3484, + "step": 7202 + }, + { + "epoch": 0.44714134955614876, + "grad_norm": 0.2886964568123522, + "learning_rate": 9.926916908015027e-05, + "loss": 3.4236, + "step": 7203 + }, + { + "epoch": 0.4472034266559066, + "grad_norm": 0.28651821493162044, + "learning_rate": 9.926855372351315e-05, + "loss": 3.4968, + "step": 7204 + }, + { + "epoch": 0.44726550375566454, + "grad_norm": 0.2645148471494328, + "learning_rate": 9.926793810983006e-05, + "loss": 3.4736, + "step": 7205 + }, + { + "epoch": 0.44732758085542246, + "grad_norm": 0.31028358526443145, + "learning_rate": 9.926732223910416e-05, + "loss": 3.4343, + "step": 7206 + }, + { + "epoch": 0.4473896579551803, + "grad_norm": 0.30479111499088934, + "learning_rate": 9.926670611133868e-05, + "loss": 3.4186, + "step": 7207 + }, + { + "epoch": 0.44745173505493824, + "grad_norm": 0.3335094086207769, + "learning_rate": 9.926608972653684e-05, + "loss": 3.3348, + "step": 7208 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.23541660122462577, + "learning_rate": 9.926547308470184e-05, + "loss": 3.4004, + "step": 7209 + }, + { + "epoch": 0.447575889254454, + "grad_norm": 0.37472711011722715, + "learning_rate": 9.926485618583691e-05, + "loss": 3.385, + "step": 7210 + }, + { + "epoch": 0.44763796635421194, + "grad_norm": 0.219074783021649, + "learning_rate": 9.926423902994527e-05, + "loss": 3.3959, + "step": 7211 + }, + { + "epoch": 0.44770004345396985, + "grad_norm": 0.40745521547975144, + "learning_rate": 9.926362161703014e-05, + "loss": 3.4493, + "step": 7212 + }, + { + "epoch": 0.4477621205537277, + "grad_norm": 0.362153960181639, + "learning_rate": 9.926300394709474e-05, + "loss": 3.4258, + "step": 7213 + }, + { + "epoch": 0.44782419765348563, + "grad_norm": 0.3876452089311053, + "learning_rate": 9.926238602014226e-05, + "loss": 3.2943, + "step": 7214 + }, + { + "epoch": 0.44788627475324355, + "grad_norm": 0.2786037541046507, + "learning_rate": 9.926176783617597e-05, + "loss": 3.435, + "step": 7215 + }, + { + "epoch": 0.4479483518530014, + "grad_norm": 0.3459133758851713, + "learning_rate": 9.926114939519909e-05, + "loss": 3.4293, + "step": 7216 + }, + { + "epoch": 0.44801042895275933, + "grad_norm": 0.32626298100308265, + "learning_rate": 9.926053069721483e-05, + "loss": 3.4481, + "step": 7217 + }, + { + "epoch": 0.44807250605251725, + "grad_norm": 0.374459871087688, + "learning_rate": 9.925991174222641e-05, + "loss": 3.4225, + "step": 7218 + }, + { + "epoch": 0.4481345831522751, + "grad_norm": 0.375418925731917, + "learning_rate": 9.925929253023709e-05, + "loss": 3.5007, + "step": 7219 + }, + { + "epoch": 0.44819666025203303, + "grad_norm": 0.2926201717584565, + "learning_rate": 9.925867306125008e-05, + "loss": 3.3303, + "step": 7220 + }, + { + "epoch": 0.44825873735179095, + "grad_norm": 0.3105336672295428, + "learning_rate": 9.92580533352686e-05, + "loss": 3.3874, + "step": 7221 + }, + { + "epoch": 0.4483208144515488, + "grad_norm": 0.33393128848773646, + "learning_rate": 9.925743335229592e-05, + "loss": 3.4621, + "step": 7222 + }, + { + "epoch": 0.44838289155130673, + "grad_norm": 0.2780297485933586, + "learning_rate": 9.925681311233523e-05, + "loss": 3.3287, + "step": 7223 + }, + { + "epoch": 0.44844496865106465, + "grad_norm": 0.2887777137048594, + "learning_rate": 9.925619261538981e-05, + "loss": 3.3885, + "step": 7224 + }, + { + "epoch": 0.4485070457508225, + "grad_norm": 0.217028416889064, + "learning_rate": 9.925557186146286e-05, + "loss": 3.5432, + "step": 7225 + }, + { + "epoch": 0.4485691228505804, + "grad_norm": 0.2466310660074955, + "learning_rate": 9.925495085055764e-05, + "loss": 3.387, + "step": 7226 + }, + { + "epoch": 0.44863119995033834, + "grad_norm": 0.29819457517844217, + "learning_rate": 9.925432958267737e-05, + "loss": 3.4243, + "step": 7227 + }, + { + "epoch": 0.4486932770500962, + "grad_norm": 0.35609215786667103, + "learning_rate": 9.925370805782532e-05, + "loss": 3.4997, + "step": 7228 + }, + { + "epoch": 0.4487553541498541, + "grad_norm": 0.3010428145935629, + "learning_rate": 9.925308627600474e-05, + "loss": 3.2771, + "step": 7229 + }, + { + "epoch": 0.44881743124961204, + "grad_norm": 0.21709408878186315, + "learning_rate": 9.925246423721881e-05, + "loss": 3.2749, + "step": 7230 + }, + { + "epoch": 0.4488795083493699, + "grad_norm": 0.3234784049737238, + "learning_rate": 9.925184194147085e-05, + "loss": 3.4103, + "step": 7231 + }, + { + "epoch": 0.4489415854491278, + "grad_norm": 0.2801536942262763, + "learning_rate": 9.925121938876405e-05, + "loss": 3.4499, + "step": 7232 + }, + { + "epoch": 0.44900366254888574, + "grad_norm": 0.29658991527158346, + "learning_rate": 9.92505965791017e-05, + "loss": 3.4327, + "step": 7233 + }, + { + "epoch": 0.4490657396486436, + "grad_norm": 0.30875353961722574, + "learning_rate": 9.924997351248703e-05, + "loss": 3.4569, + "step": 7234 + }, + { + "epoch": 0.4491278167484015, + "grad_norm": 0.29343663047929824, + "learning_rate": 9.924935018892328e-05, + "loss": 3.4731, + "step": 7235 + }, + { + "epoch": 0.44918989384815944, + "grad_norm": 0.2555963751567963, + "learning_rate": 9.924872660841372e-05, + "loss": 3.4298, + "step": 7236 + }, + { + "epoch": 0.4492519709479173, + "grad_norm": 0.22842232519385014, + "learning_rate": 9.92481027709616e-05, + "loss": 3.4275, + "step": 7237 + }, + { + "epoch": 0.4493140480476752, + "grad_norm": 0.2323592766843722, + "learning_rate": 9.924747867657018e-05, + "loss": 3.373, + "step": 7238 + }, + { + "epoch": 0.44937612514743314, + "grad_norm": 0.23719462444325867, + "learning_rate": 9.92468543252427e-05, + "loss": 3.2869, + "step": 7239 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.27709120683071936, + "learning_rate": 9.924622971698243e-05, + "loss": 3.3936, + "step": 7240 + }, + { + "epoch": 0.4495002793469489, + "grad_norm": 0.21245747870077414, + "learning_rate": 9.924560485179262e-05, + "loss": 3.3938, + "step": 7241 + }, + { + "epoch": 0.44956235644670683, + "grad_norm": 0.23425015462292156, + "learning_rate": 9.924497972967652e-05, + "loss": 3.4459, + "step": 7242 + }, + { + "epoch": 0.4496244335464647, + "grad_norm": 0.1813654404539006, + "learning_rate": 9.924435435063744e-05, + "loss": 3.4804, + "step": 7243 + }, + { + "epoch": 0.4496865106462226, + "grad_norm": 0.19646154009351585, + "learning_rate": 9.924372871467859e-05, + "loss": 3.3881, + "step": 7244 + }, + { + "epoch": 0.44974858774598053, + "grad_norm": 0.2682796665293775, + "learning_rate": 9.924310282180325e-05, + "loss": 3.4668, + "step": 7245 + }, + { + "epoch": 0.4498106648457384, + "grad_norm": 0.194468381213501, + "learning_rate": 9.92424766720147e-05, + "loss": 3.4195, + "step": 7246 + }, + { + "epoch": 0.4498727419454963, + "grad_norm": 0.24919374663390828, + "learning_rate": 9.924185026531618e-05, + "loss": 3.4856, + "step": 7247 + }, + { + "epoch": 0.44993481904525423, + "grad_norm": 0.25233463117344673, + "learning_rate": 9.924122360171098e-05, + "loss": 3.3503, + "step": 7248 + }, + { + "epoch": 0.4499968961450121, + "grad_norm": 0.21232585187348307, + "learning_rate": 9.924059668120236e-05, + "loss": 3.4074, + "step": 7249 + }, + { + "epoch": 0.45005897324477, + "grad_norm": 0.2777385051436297, + "learning_rate": 9.92399695037936e-05, + "loss": 3.3449, + "step": 7250 + }, + { + "epoch": 0.45012105034452793, + "grad_norm": 0.25677707128059857, + "learning_rate": 9.923934206948796e-05, + "loss": 3.3746, + "step": 7251 + }, + { + "epoch": 0.4501831274442858, + "grad_norm": 0.23614583985242146, + "learning_rate": 9.923871437828872e-05, + "loss": 3.3036, + "step": 7252 + }, + { + "epoch": 0.4502452045440437, + "grad_norm": 0.4797728546447637, + "learning_rate": 9.923808643019917e-05, + "loss": 3.3243, + "step": 7253 + }, + { + "epoch": 0.4503072816438016, + "grad_norm": 0.353370731651158, + "learning_rate": 9.923745822522255e-05, + "loss": 3.4647, + "step": 7254 + }, + { + "epoch": 0.4503693587435595, + "grad_norm": 0.2804383513784354, + "learning_rate": 9.923682976336216e-05, + "loss": 3.4779, + "step": 7255 + }, + { + "epoch": 0.4504314358433174, + "grad_norm": 0.23703911917240617, + "learning_rate": 9.923620104462128e-05, + "loss": 3.3718, + "step": 7256 + }, + { + "epoch": 0.4504935129430753, + "grad_norm": 0.2739443033249567, + "learning_rate": 9.923557206900318e-05, + "loss": 3.4244, + "step": 7257 + }, + { + "epoch": 0.4505555900428332, + "grad_norm": 0.25417963129080773, + "learning_rate": 9.923494283651115e-05, + "loss": 3.4013, + "step": 7258 + }, + { + "epoch": 0.4506176671425911, + "grad_norm": 0.31255677422731243, + "learning_rate": 9.923431334714849e-05, + "loss": 3.5204, + "step": 7259 + }, + { + "epoch": 0.450679744242349, + "grad_norm": 0.2854255684742943, + "learning_rate": 9.923368360091844e-05, + "loss": 3.3836, + "step": 7260 + }, + { + "epoch": 0.4507418213421069, + "grad_norm": 0.37226718289828875, + "learning_rate": 9.923305359782433e-05, + "loss": 3.378, + "step": 7261 + }, + { + "epoch": 0.4508038984418648, + "grad_norm": 0.30027113424176805, + "learning_rate": 9.923242333786941e-05, + "loss": 3.418, + "step": 7262 + }, + { + "epoch": 0.4508659755416227, + "grad_norm": 0.39143436506034096, + "learning_rate": 9.923179282105701e-05, + "loss": 3.4452, + "step": 7263 + }, + { + "epoch": 0.4509280526413806, + "grad_norm": 0.27871701261319126, + "learning_rate": 9.923116204739038e-05, + "loss": 3.4739, + "step": 7264 + }, + { + "epoch": 0.4509901297411385, + "grad_norm": 0.2768181669749432, + "learning_rate": 9.923053101687283e-05, + "loss": 3.3587, + "step": 7265 + }, + { + "epoch": 0.4510522068408964, + "grad_norm": 0.3340129282385171, + "learning_rate": 9.922989972950765e-05, + "loss": 3.4299, + "step": 7266 + }, + { + "epoch": 0.4511142839406543, + "grad_norm": 0.29279406107816525, + "learning_rate": 9.922926818529811e-05, + "loss": 3.4408, + "step": 7267 + }, + { + "epoch": 0.4511763610404122, + "grad_norm": 0.27504952591642134, + "learning_rate": 9.922863638424755e-05, + "loss": 3.4325, + "step": 7268 + }, + { + "epoch": 0.4512384381401701, + "grad_norm": 0.2958835686199158, + "learning_rate": 9.922800432635924e-05, + "loss": 3.3451, + "step": 7269 + }, + { + "epoch": 0.451300515239928, + "grad_norm": 0.3617373237207914, + "learning_rate": 9.922737201163648e-05, + "loss": 3.4529, + "step": 7270 + }, + { + "epoch": 0.4513625923396859, + "grad_norm": 0.22932532717214527, + "learning_rate": 9.922673944008256e-05, + "loss": 3.3264, + "step": 7271 + }, + { + "epoch": 0.4514246694394438, + "grad_norm": 0.2285469881363767, + "learning_rate": 9.92261066117008e-05, + "loss": 3.4216, + "step": 7272 + }, + { + "epoch": 0.4514867465392017, + "grad_norm": 0.22437230743450265, + "learning_rate": 9.922547352649449e-05, + "loss": 3.3976, + "step": 7273 + }, + { + "epoch": 0.4515488236389596, + "grad_norm": 0.21840820687744084, + "learning_rate": 9.922484018446693e-05, + "loss": 3.3193, + "step": 7274 + }, + { + "epoch": 0.4516109007387175, + "grad_norm": 0.22644379399928738, + "learning_rate": 9.922420658562145e-05, + "loss": 3.4154, + "step": 7275 + }, + { + "epoch": 0.4516729778384754, + "grad_norm": 0.4337744293424155, + "learning_rate": 9.922357272996133e-05, + "loss": 3.4327, + "step": 7276 + }, + { + "epoch": 0.4517350549382333, + "grad_norm": 0.3099360306365162, + "learning_rate": 9.922293861748986e-05, + "loss": 3.344, + "step": 7277 + }, + { + "epoch": 0.4517971320379912, + "grad_norm": 0.2214973136446179, + "learning_rate": 9.922230424821037e-05, + "loss": 3.3563, + "step": 7278 + }, + { + "epoch": 0.4518592091377491, + "grad_norm": 0.3154090543913488, + "learning_rate": 9.922166962212619e-05, + "loss": 3.4282, + "step": 7279 + }, + { + "epoch": 0.451921286237507, + "grad_norm": 0.25694005908030343, + "learning_rate": 9.922103473924061e-05, + "loss": 3.3783, + "step": 7280 + }, + { + "epoch": 0.4519833633372649, + "grad_norm": 0.2730440715378464, + "learning_rate": 9.922039959955692e-05, + "loss": 3.4781, + "step": 7281 + }, + { + "epoch": 0.45204544043702277, + "grad_norm": 0.27114251107754833, + "learning_rate": 9.921976420307848e-05, + "loss": 3.4999, + "step": 7282 + }, + { + "epoch": 0.4521075175367807, + "grad_norm": 0.28512498605820935, + "learning_rate": 9.921912854980857e-05, + "loss": 3.4673, + "step": 7283 + }, + { + "epoch": 0.4521695946365386, + "grad_norm": 0.3330040872762621, + "learning_rate": 9.921849263975051e-05, + "loss": 3.4001, + "step": 7284 + }, + { + "epoch": 0.45223167173629647, + "grad_norm": 0.24857147940640448, + "learning_rate": 9.921785647290765e-05, + "loss": 3.3433, + "step": 7285 + }, + { + "epoch": 0.4522937488360544, + "grad_norm": 0.2690195118393502, + "learning_rate": 9.921722004928326e-05, + "loss": 3.4481, + "step": 7286 + }, + { + "epoch": 0.4523558259358123, + "grad_norm": 0.310727804487676, + "learning_rate": 9.92165833688807e-05, + "loss": 3.4399, + "step": 7287 + }, + { + "epoch": 0.45241790303557017, + "grad_norm": 0.2449653124582068, + "learning_rate": 9.921594643170326e-05, + "loss": 3.3464, + "step": 7288 + }, + { + "epoch": 0.4524799801353281, + "grad_norm": 0.4874411718624221, + "learning_rate": 9.92153092377543e-05, + "loss": 3.4825, + "step": 7289 + }, + { + "epoch": 0.452542057235086, + "grad_norm": 0.21423931059327928, + "learning_rate": 9.921467178703712e-05, + "loss": 3.3685, + "step": 7290 + }, + { + "epoch": 0.45260413433484387, + "grad_norm": 0.35125570686292773, + "learning_rate": 9.921403407955506e-05, + "loss": 3.3868, + "step": 7291 + }, + { + "epoch": 0.4526662114346018, + "grad_norm": 0.3096615750059657, + "learning_rate": 9.921339611531142e-05, + "loss": 3.3395, + "step": 7292 + }, + { + "epoch": 0.4527282885343597, + "grad_norm": 0.26372739419287017, + "learning_rate": 9.921275789430955e-05, + "loss": 3.3427, + "step": 7293 + }, + { + "epoch": 0.45279036563411756, + "grad_norm": 0.3039969419275364, + "learning_rate": 9.921211941655278e-05, + "loss": 3.4169, + "step": 7294 + }, + { + "epoch": 0.4528524427338755, + "grad_norm": 0.30564565877385297, + "learning_rate": 9.921148068204442e-05, + "loss": 3.3229, + "step": 7295 + }, + { + "epoch": 0.4529145198336334, + "grad_norm": 0.32872233062238804, + "learning_rate": 9.921084169078782e-05, + "loss": 3.3785, + "step": 7296 + }, + { + "epoch": 0.45297659693339126, + "grad_norm": 0.2868508193269824, + "learning_rate": 9.921020244278634e-05, + "loss": 3.3351, + "step": 7297 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.27755598893116695, + "learning_rate": 9.920956293804327e-05, + "loss": 3.4554, + "step": 7298 + }, + { + "epoch": 0.4531007511329071, + "grad_norm": 0.24902848631284707, + "learning_rate": 9.920892317656195e-05, + "loss": 3.33, + "step": 7299 + }, + { + "epoch": 0.45316282823266496, + "grad_norm": 0.36072834857859293, + "learning_rate": 9.920828315834575e-05, + "loss": 3.37, + "step": 7300 + }, + { + "epoch": 0.4532249053324229, + "grad_norm": 0.2502066681880847, + "learning_rate": 9.920764288339799e-05, + "loss": 3.3308, + "step": 7301 + }, + { + "epoch": 0.4532869824321808, + "grad_norm": 0.3608733950078121, + "learning_rate": 9.9207002351722e-05, + "loss": 3.3374, + "step": 7302 + }, + { + "epoch": 0.45334905953193866, + "grad_norm": 0.22053876174775983, + "learning_rate": 9.920636156332115e-05, + "loss": 3.4253, + "step": 7303 + }, + { + "epoch": 0.4534111366316966, + "grad_norm": 0.27255944552915484, + "learning_rate": 9.920572051819876e-05, + "loss": 3.4376, + "step": 7304 + }, + { + "epoch": 0.4534732137314545, + "grad_norm": 0.25248527514608093, + "learning_rate": 9.920507921635817e-05, + "loss": 3.2877, + "step": 7305 + }, + { + "epoch": 0.45353529083121236, + "grad_norm": 0.2507867503528564, + "learning_rate": 9.920443765780275e-05, + "loss": 3.2956, + "step": 7306 + }, + { + "epoch": 0.4535973679309703, + "grad_norm": 0.24513626784270334, + "learning_rate": 9.920379584253583e-05, + "loss": 3.3196, + "step": 7307 + }, + { + "epoch": 0.45365944503072814, + "grad_norm": 0.23888125467870056, + "learning_rate": 9.920315377056077e-05, + "loss": 3.4117, + "step": 7308 + }, + { + "epoch": 0.45372152213048605, + "grad_norm": 0.24436819532925583, + "learning_rate": 9.92025114418809e-05, + "loss": 3.3944, + "step": 7309 + }, + { + "epoch": 0.453783599230244, + "grad_norm": 0.22011832175178367, + "learning_rate": 9.920186885649958e-05, + "loss": 3.3317, + "step": 7310 + }, + { + "epoch": 0.45384567633000183, + "grad_norm": 0.21520218859489385, + "learning_rate": 9.920122601442018e-05, + "loss": 3.2852, + "step": 7311 + }, + { + "epoch": 0.45390775342975975, + "grad_norm": 0.27002357355717044, + "learning_rate": 9.920058291564604e-05, + "loss": 3.4124, + "step": 7312 + }, + { + "epoch": 0.45396983052951767, + "grad_norm": 0.23457528140024647, + "learning_rate": 9.91999395601805e-05, + "loss": 3.4157, + "step": 7313 + }, + { + "epoch": 0.45403190762927553, + "grad_norm": 0.21323183695977388, + "learning_rate": 9.919929594802695e-05, + "loss": 3.3623, + "step": 7314 + }, + { + "epoch": 0.45409398472903345, + "grad_norm": 0.2646687067018898, + "learning_rate": 9.919865207918871e-05, + "loss": 3.3717, + "step": 7315 + }, + { + "epoch": 0.45415606182879137, + "grad_norm": 0.1889040846814925, + "learning_rate": 9.919800795366918e-05, + "loss": 3.4422, + "step": 7316 + }, + { + "epoch": 0.45421813892854923, + "grad_norm": 0.24866033523107042, + "learning_rate": 9.919736357147168e-05, + "loss": 3.4499, + "step": 7317 + }, + { + "epoch": 0.45428021602830715, + "grad_norm": 0.23749733001061463, + "learning_rate": 9.919671893259961e-05, + "loss": 3.481, + "step": 7318 + }, + { + "epoch": 0.45434229312806507, + "grad_norm": 0.18196249715369128, + "learning_rate": 9.91960740370563e-05, + "loss": 3.386, + "step": 7319 + }, + { + "epoch": 0.45440437022782293, + "grad_norm": 0.21905248878937675, + "learning_rate": 9.919542888484512e-05, + "loss": 3.3963, + "step": 7320 + }, + { + "epoch": 0.45446644732758085, + "grad_norm": 0.273741376144403, + "learning_rate": 9.919478347596947e-05, + "loss": 3.4809, + "step": 7321 + }, + { + "epoch": 0.45452852442733876, + "grad_norm": 0.24494421885902706, + "learning_rate": 9.919413781043267e-05, + "loss": 3.4418, + "step": 7322 + }, + { + "epoch": 0.4545906015270966, + "grad_norm": 0.2130600403906423, + "learning_rate": 9.919349188823812e-05, + "loss": 3.4717, + "step": 7323 + }, + { + "epoch": 0.45465267862685455, + "grad_norm": 0.2693934710425135, + "learning_rate": 9.919284570938918e-05, + "loss": 3.3811, + "step": 7324 + }, + { + "epoch": 0.45471475572661246, + "grad_norm": 0.20308992193018438, + "learning_rate": 9.919219927388923e-05, + "loss": 3.3255, + "step": 7325 + }, + { + "epoch": 0.4547768328263703, + "grad_norm": 0.24405766412728314, + "learning_rate": 9.919155258174161e-05, + "loss": 3.3971, + "step": 7326 + }, + { + "epoch": 0.45483890992612824, + "grad_norm": 0.3259675990952822, + "learning_rate": 9.919090563294975e-05, + "loss": 3.3878, + "step": 7327 + }, + { + "epoch": 0.45490098702588616, + "grad_norm": 0.2118358819431441, + "learning_rate": 9.919025842751699e-05, + "loss": 3.3443, + "step": 7328 + }, + { + "epoch": 0.454963064125644, + "grad_norm": 0.33179185876541745, + "learning_rate": 9.91896109654467e-05, + "loss": 3.3861, + "step": 7329 + }, + { + "epoch": 0.45502514122540194, + "grad_norm": 0.2863653884725034, + "learning_rate": 9.918896324674226e-05, + "loss": 3.3661, + "step": 7330 + }, + { + "epoch": 0.45508721832515986, + "grad_norm": 0.2392724402490934, + "learning_rate": 9.918831527140707e-05, + "loss": 3.347, + "step": 7331 + }, + { + "epoch": 0.4551492954249177, + "grad_norm": 0.2618221391891815, + "learning_rate": 9.918766703944448e-05, + "loss": 3.338, + "step": 7332 + }, + { + "epoch": 0.45521137252467564, + "grad_norm": 0.2701455868833415, + "learning_rate": 9.918701855085792e-05, + "loss": 3.452, + "step": 7333 + }, + { + "epoch": 0.45527344962443356, + "grad_norm": 0.3319455653313121, + "learning_rate": 9.918636980565072e-05, + "loss": 3.3595, + "step": 7334 + }, + { + "epoch": 0.4553355267241914, + "grad_norm": 0.337784590648938, + "learning_rate": 9.918572080382631e-05, + "loss": 3.4866, + "step": 7335 + }, + { + "epoch": 0.45539760382394934, + "grad_norm": 0.2768038477868623, + "learning_rate": 9.918507154538803e-05, + "loss": 3.4649, + "step": 7336 + }, + { + "epoch": 0.45545968092370726, + "grad_norm": 0.3634471924490604, + "learning_rate": 9.918442203033931e-05, + "loss": 3.4392, + "step": 7337 + }, + { + "epoch": 0.4555217580234651, + "grad_norm": 0.3769591533593922, + "learning_rate": 9.918377225868352e-05, + "loss": 3.4054, + "step": 7338 + }, + { + "epoch": 0.45558383512322304, + "grad_norm": 0.20843854373458953, + "learning_rate": 9.918312223042404e-05, + "loss": 3.3147, + "step": 7339 + }, + { + "epoch": 0.45564591222298095, + "grad_norm": 0.46589424938047563, + "learning_rate": 9.918247194556427e-05, + "loss": 3.4446, + "step": 7340 + }, + { + "epoch": 0.4557079893227388, + "grad_norm": 0.43731010948183097, + "learning_rate": 9.918182140410761e-05, + "loss": 3.3522, + "step": 7341 + }, + { + "epoch": 0.45577006642249673, + "grad_norm": 0.2736674295729188, + "learning_rate": 9.918117060605745e-05, + "loss": 3.4582, + "step": 7342 + }, + { + "epoch": 0.45583214352225465, + "grad_norm": 0.2996584880616784, + "learning_rate": 9.918051955141718e-05, + "loss": 3.3481, + "step": 7343 + }, + { + "epoch": 0.4558942206220125, + "grad_norm": 0.32870129657825614, + "learning_rate": 9.917986824019021e-05, + "loss": 3.3982, + "step": 7344 + }, + { + "epoch": 0.45595629772177043, + "grad_norm": 0.2838394251620674, + "learning_rate": 9.917921667237993e-05, + "loss": 3.4286, + "step": 7345 + }, + { + "epoch": 0.45601837482152835, + "grad_norm": 0.24329450854149567, + "learning_rate": 9.917856484798972e-05, + "loss": 3.3452, + "step": 7346 + }, + { + "epoch": 0.4560804519212862, + "grad_norm": 0.298968724602975, + "learning_rate": 9.9177912767023e-05, + "loss": 3.3719, + "step": 7347 + }, + { + "epoch": 0.45614252902104413, + "grad_norm": 0.3051043129205706, + "learning_rate": 9.917726042948318e-05, + "loss": 3.3667, + "step": 7348 + }, + { + "epoch": 0.45620460612080205, + "grad_norm": 0.27677238065272486, + "learning_rate": 9.917660783537365e-05, + "loss": 3.3772, + "step": 7349 + }, + { + "epoch": 0.4562666832205599, + "grad_norm": 0.29938154149723284, + "learning_rate": 9.917595498469783e-05, + "loss": 3.4576, + "step": 7350 + }, + { + "epoch": 0.45632876032031783, + "grad_norm": 0.24126687733263608, + "learning_rate": 9.917530187745912e-05, + "loss": 3.4053, + "step": 7351 + }, + { + "epoch": 0.45639083742007575, + "grad_norm": 0.24507940996712985, + "learning_rate": 9.91746485136609e-05, + "loss": 3.4722, + "step": 7352 + }, + { + "epoch": 0.4564529145198336, + "grad_norm": 0.25686107265928476, + "learning_rate": 9.917399489330661e-05, + "loss": 3.3944, + "step": 7353 + }, + { + "epoch": 0.4565149916195915, + "grad_norm": 0.2284896379350038, + "learning_rate": 9.917334101639964e-05, + "loss": 3.3489, + "step": 7354 + }, + { + "epoch": 0.45657706871934944, + "grad_norm": 0.2648743372670803, + "learning_rate": 9.917268688294343e-05, + "loss": 3.3715, + "step": 7355 + }, + { + "epoch": 0.4566391458191073, + "grad_norm": 0.26069730736743374, + "learning_rate": 9.917203249294138e-05, + "loss": 3.4245, + "step": 7356 + }, + { + "epoch": 0.4567012229188652, + "grad_norm": 0.1853777566455313, + "learning_rate": 9.917137784639687e-05, + "loss": 3.3349, + "step": 7357 + }, + { + "epoch": 0.45676330001862314, + "grad_norm": 0.20490688258950895, + "learning_rate": 9.917072294331338e-05, + "loss": 3.3993, + "step": 7358 + }, + { + "epoch": 0.456825377118381, + "grad_norm": 0.23741580575037308, + "learning_rate": 9.917006778369427e-05, + "loss": 3.4927, + "step": 7359 + }, + { + "epoch": 0.4568874542181389, + "grad_norm": 0.2354843113496502, + "learning_rate": 9.916941236754298e-05, + "loss": 3.401, + "step": 7360 + }, + { + "epoch": 0.45694953131789684, + "grad_norm": 0.252598844386921, + "learning_rate": 9.916875669486295e-05, + "loss": 3.4224, + "step": 7361 + }, + { + "epoch": 0.4570116084176547, + "grad_norm": 0.24065114133202772, + "learning_rate": 9.916810076565756e-05, + "loss": 3.3418, + "step": 7362 + }, + { + "epoch": 0.4570736855174126, + "grad_norm": 0.19166936112320532, + "learning_rate": 9.916744457993025e-05, + "loss": 3.3778, + "step": 7363 + }, + { + "epoch": 0.45713576261717054, + "grad_norm": 0.21530491145620737, + "learning_rate": 9.916678813768445e-05, + "loss": 3.3285, + "step": 7364 + }, + { + "epoch": 0.4571978397169284, + "grad_norm": 0.2059965458419467, + "learning_rate": 9.91661314389236e-05, + "loss": 3.4648, + "step": 7365 + }, + { + "epoch": 0.4572599168166863, + "grad_norm": 0.22431041890558323, + "learning_rate": 9.916547448365108e-05, + "loss": 3.3019, + "step": 7366 + }, + { + "epoch": 0.45732199391644424, + "grad_norm": 0.22033990662463113, + "learning_rate": 9.916481727187035e-05, + "loss": 3.4345, + "step": 7367 + }, + { + "epoch": 0.4573840710162021, + "grad_norm": 0.18986019173211713, + "learning_rate": 9.916415980358485e-05, + "loss": 3.3816, + "step": 7368 + }, + { + "epoch": 0.45744614811596, + "grad_norm": 0.23215940224590534, + "learning_rate": 9.916350207879798e-05, + "loss": 3.3095, + "step": 7369 + }, + { + "epoch": 0.45750822521571793, + "grad_norm": 0.2051494111187223, + "learning_rate": 9.916284409751318e-05, + "loss": 3.3829, + "step": 7370 + }, + { + "epoch": 0.4575703023154758, + "grad_norm": 0.315978255260893, + "learning_rate": 9.91621858597339e-05, + "loss": 3.4066, + "step": 7371 + }, + { + "epoch": 0.4576323794152337, + "grad_norm": 0.2903615526574323, + "learning_rate": 9.916152736546356e-05, + "loss": 3.4278, + "step": 7372 + }, + { + "epoch": 0.45769445651499163, + "grad_norm": 0.222561916646217, + "learning_rate": 9.91608686147056e-05, + "loss": 3.3143, + "step": 7373 + }, + { + "epoch": 0.4577565336147495, + "grad_norm": 0.19573820450749355, + "learning_rate": 9.916020960746345e-05, + "loss": 3.2985, + "step": 7374 + }, + { + "epoch": 0.4578186107145074, + "grad_norm": 0.32727564146404303, + "learning_rate": 9.915955034374054e-05, + "loss": 3.4368, + "step": 7375 + }, + { + "epoch": 0.45788068781426533, + "grad_norm": 0.21387009640461624, + "learning_rate": 9.915889082354033e-05, + "loss": 3.3369, + "step": 7376 + }, + { + "epoch": 0.4579427649140232, + "grad_norm": 0.24393617079917795, + "learning_rate": 9.915823104686625e-05, + "loss": 3.3964, + "step": 7377 + }, + { + "epoch": 0.4580048420137811, + "grad_norm": 0.2816712350916893, + "learning_rate": 9.915757101372177e-05, + "loss": 3.47, + "step": 7378 + }, + { + "epoch": 0.45806691911353903, + "grad_norm": 0.32084879523130305, + "learning_rate": 9.915691072411027e-05, + "loss": 3.3752, + "step": 7379 + }, + { + "epoch": 0.4581289962132969, + "grad_norm": 0.2576233863807796, + "learning_rate": 9.915625017803525e-05, + "loss": 3.3077, + "step": 7380 + }, + { + "epoch": 0.4581910733130548, + "grad_norm": 0.27613719900545747, + "learning_rate": 9.915558937550014e-05, + "loss": 3.3399, + "step": 7381 + }, + { + "epoch": 0.4582531504128127, + "grad_norm": 0.30488683831088087, + "learning_rate": 9.915492831650839e-05, + "loss": 3.347, + "step": 7382 + }, + { + "epoch": 0.4583152275125706, + "grad_norm": 0.2735225136615913, + "learning_rate": 9.915426700106345e-05, + "loss": 3.4491, + "step": 7383 + }, + { + "epoch": 0.4583773046123285, + "grad_norm": 0.2682716820487596, + "learning_rate": 9.915360542916876e-05, + "loss": 3.3355, + "step": 7384 + }, + { + "epoch": 0.4584393817120864, + "grad_norm": 0.25437817365367743, + "learning_rate": 9.915294360082778e-05, + "loss": 3.328, + "step": 7385 + }, + { + "epoch": 0.4585014588118443, + "grad_norm": 0.3135379243967449, + "learning_rate": 9.915228151604397e-05, + "loss": 3.3524, + "step": 7386 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 0.208211548296676, + "learning_rate": 9.915161917482076e-05, + "loss": 3.3777, + "step": 7387 + }, + { + "epoch": 0.4586256130113601, + "grad_norm": 0.4051816905398233, + "learning_rate": 9.915095657716162e-05, + "loss": 3.3158, + "step": 7388 + }, + { + "epoch": 0.458687690111118, + "grad_norm": 0.27462973213980457, + "learning_rate": 9.915029372307002e-05, + "loss": 3.3875, + "step": 7389 + }, + { + "epoch": 0.4587497672108759, + "grad_norm": 0.2695111253384791, + "learning_rate": 9.91496306125494e-05, + "loss": 3.3805, + "step": 7390 + }, + { + "epoch": 0.4588118443106338, + "grad_norm": 0.2391513276096407, + "learning_rate": 9.914896724560323e-05, + "loss": 3.4855, + "step": 7391 + }, + { + "epoch": 0.4588739214103917, + "grad_norm": 0.37698328559665273, + "learning_rate": 9.914830362223495e-05, + "loss": 3.3866, + "step": 7392 + }, + { + "epoch": 0.4589359985101496, + "grad_norm": 0.2975717950389542, + "learning_rate": 9.914763974244807e-05, + "loss": 3.4256, + "step": 7393 + }, + { + "epoch": 0.4589980756099075, + "grad_norm": 0.2252829858282034, + "learning_rate": 9.914697560624599e-05, + "loss": 3.3928, + "step": 7394 + }, + { + "epoch": 0.4590601527096654, + "grad_norm": 0.31335130607432615, + "learning_rate": 9.914631121363222e-05, + "loss": 3.4433, + "step": 7395 + }, + { + "epoch": 0.4591222298094233, + "grad_norm": 0.24574242597455295, + "learning_rate": 9.914564656461022e-05, + "loss": 3.3824, + "step": 7396 + }, + { + "epoch": 0.4591843069091812, + "grad_norm": 0.22834938882297426, + "learning_rate": 9.914498165918345e-05, + "loss": 3.3317, + "step": 7397 + }, + { + "epoch": 0.4592463840089391, + "grad_norm": 0.28957591984999675, + "learning_rate": 9.914431649735537e-05, + "loss": 3.4687, + "step": 7398 + }, + { + "epoch": 0.459308461108697, + "grad_norm": 0.2190215094739846, + "learning_rate": 9.914365107912946e-05, + "loss": 3.4233, + "step": 7399 + }, + { + "epoch": 0.4593705382084549, + "grad_norm": 0.2585197054603483, + "learning_rate": 9.914298540450922e-05, + "loss": 3.4076, + "step": 7400 + }, + { + "epoch": 0.4594326153082128, + "grad_norm": 0.22481251528158977, + "learning_rate": 9.914231947349805e-05, + "loss": 3.353, + "step": 7401 + }, + { + "epoch": 0.4594946924079707, + "grad_norm": 0.2511627308321559, + "learning_rate": 9.91416532860995e-05, + "loss": 3.3034, + "step": 7402 + }, + { + "epoch": 0.4595567695077286, + "grad_norm": 0.2921371355355881, + "learning_rate": 9.914098684231701e-05, + "loss": 3.379, + "step": 7403 + }, + { + "epoch": 0.4596188466074865, + "grad_norm": 0.21081374211345486, + "learning_rate": 9.914032014215407e-05, + "loss": 3.4928, + "step": 7404 + }, + { + "epoch": 0.4596809237072444, + "grad_norm": 0.19384724307076262, + "learning_rate": 9.913965318561413e-05, + "loss": 3.4133, + "step": 7405 + }, + { + "epoch": 0.4597430008070023, + "grad_norm": 0.24188981681113592, + "learning_rate": 9.913898597270071e-05, + "loss": 3.3495, + "step": 7406 + }, + { + "epoch": 0.4598050779067602, + "grad_norm": 0.259566151604983, + "learning_rate": 9.913831850341726e-05, + "loss": 3.3966, + "step": 7407 + }, + { + "epoch": 0.4598671550065181, + "grad_norm": 0.2399426992379215, + "learning_rate": 9.913765077776728e-05, + "loss": 3.3022, + "step": 7408 + }, + { + "epoch": 0.459929232106276, + "grad_norm": 0.2256133818342919, + "learning_rate": 9.913698279575425e-05, + "loss": 3.364, + "step": 7409 + }, + { + "epoch": 0.45999130920603387, + "grad_norm": 0.22110229448717805, + "learning_rate": 9.913631455738166e-05, + "loss": 3.4179, + "step": 7410 + }, + { + "epoch": 0.4600533863057918, + "grad_norm": 0.7134083161502732, + "learning_rate": 9.913564606265297e-05, + "loss": 3.4341, + "step": 7411 + }, + { + "epoch": 0.4601154634055497, + "grad_norm": 0.42727812728683745, + "learning_rate": 9.913497731157168e-05, + "loss": 3.3763, + "step": 7412 + }, + { + "epoch": 0.46017754050530757, + "grad_norm": 0.2836913011980765, + "learning_rate": 9.91343083041413e-05, + "loss": 3.3498, + "step": 7413 + }, + { + "epoch": 0.4602396176050655, + "grad_norm": 0.34105383625040064, + "learning_rate": 9.91336390403653e-05, + "loss": 3.4261, + "step": 7414 + }, + { + "epoch": 0.4603016947048234, + "grad_norm": 0.28402320250070057, + "learning_rate": 9.913296952024719e-05, + "loss": 3.337, + "step": 7415 + }, + { + "epoch": 0.46036377180458127, + "grad_norm": 0.2635417334489901, + "learning_rate": 9.913229974379043e-05, + "loss": 3.4209, + "step": 7416 + }, + { + "epoch": 0.4604258489043392, + "grad_norm": 0.34006713234232266, + "learning_rate": 9.913162971099857e-05, + "loss": 3.4268, + "step": 7417 + }, + { + "epoch": 0.4604879260040971, + "grad_norm": 0.39773832946741683, + "learning_rate": 9.913095942187503e-05, + "loss": 3.3794, + "step": 7418 + }, + { + "epoch": 0.46055000310385497, + "grad_norm": 0.29435504985438066, + "learning_rate": 9.913028887642337e-05, + "loss": 3.4211, + "step": 7419 + }, + { + "epoch": 0.4606120802036129, + "grad_norm": 0.23033269001780748, + "learning_rate": 9.912961807464707e-05, + "loss": 3.3217, + "step": 7420 + }, + { + "epoch": 0.4606741573033708, + "grad_norm": 0.4063663866144656, + "learning_rate": 9.91289470165496e-05, + "loss": 3.212, + "step": 7421 + }, + { + "epoch": 0.46073623440312866, + "grad_norm": 0.24123181704451, + "learning_rate": 9.91282757021345e-05, + "loss": 3.347, + "step": 7422 + }, + { + "epoch": 0.4607983115028866, + "grad_norm": 0.34599625826752856, + "learning_rate": 9.912760413140528e-05, + "loss": 3.4147, + "step": 7423 + }, + { + "epoch": 0.4608603886026445, + "grad_norm": 0.2632764323441824, + "learning_rate": 9.912693230436541e-05, + "loss": 3.3439, + "step": 7424 + }, + { + "epoch": 0.46092246570240236, + "grad_norm": 0.2353017128983376, + "learning_rate": 9.91262602210184e-05, + "loss": 3.2511, + "step": 7425 + }, + { + "epoch": 0.4609845428021603, + "grad_norm": 0.27689591369692795, + "learning_rate": 9.912558788136777e-05, + "loss": 3.4001, + "step": 7426 + }, + { + "epoch": 0.4610466199019182, + "grad_norm": 0.28792554625432604, + "learning_rate": 9.912491528541702e-05, + "loss": 3.3893, + "step": 7427 + }, + { + "epoch": 0.46110869700167606, + "grad_norm": 0.26993889320263076, + "learning_rate": 9.912424243316967e-05, + "loss": 3.3614, + "step": 7428 + }, + { + "epoch": 0.461170774101434, + "grad_norm": 0.30129005696070627, + "learning_rate": 9.912356932462921e-05, + "loss": 3.3318, + "step": 7429 + }, + { + "epoch": 0.4612328512011919, + "grad_norm": 0.2976343206903179, + "learning_rate": 9.912289595979915e-05, + "loss": 3.3513, + "step": 7430 + }, + { + "epoch": 0.46129492830094976, + "grad_norm": 0.28127172174283527, + "learning_rate": 9.912222233868305e-05, + "loss": 3.2395, + "step": 7431 + }, + { + "epoch": 0.4613570054007077, + "grad_norm": 0.3782198085623537, + "learning_rate": 9.912154846128438e-05, + "loss": 3.4279, + "step": 7432 + }, + { + "epoch": 0.4614190825004656, + "grad_norm": 0.6352702321235157, + "learning_rate": 9.912087432760667e-05, + "loss": 3.3736, + "step": 7433 + }, + { + "epoch": 0.46148115960022346, + "grad_norm": 0.34753529448308645, + "learning_rate": 9.912019993765343e-05, + "loss": 3.4595, + "step": 7434 + }, + { + "epoch": 0.4615432366999814, + "grad_norm": 0.4212579983650893, + "learning_rate": 9.911952529142816e-05, + "loss": 3.3614, + "step": 7435 + }, + { + "epoch": 0.4616053137997393, + "grad_norm": 0.3527624632338915, + "learning_rate": 9.911885038893442e-05, + "loss": 3.4975, + "step": 7436 + }, + { + "epoch": 0.46166739089949715, + "grad_norm": 0.48764947837031947, + "learning_rate": 9.911817523017573e-05, + "loss": 3.5307, + "step": 7437 + }, + { + "epoch": 0.46172946799925507, + "grad_norm": 0.37607876267977997, + "learning_rate": 9.911749981515558e-05, + "loss": 3.3838, + "step": 7438 + }, + { + "epoch": 0.461791545099013, + "grad_norm": 0.3678619804904151, + "learning_rate": 9.911682414387751e-05, + "loss": 3.3507, + "step": 7439 + }, + { + "epoch": 0.46185362219877085, + "grad_norm": 0.33472172728621075, + "learning_rate": 9.911614821634504e-05, + "loss": 3.4095, + "step": 7440 + }, + { + "epoch": 0.46191569929852877, + "grad_norm": 0.43425209309979956, + "learning_rate": 9.911547203256172e-05, + "loss": 3.3964, + "step": 7441 + }, + { + "epoch": 0.4619777763982867, + "grad_norm": 0.27678721291272584, + "learning_rate": 9.911479559253105e-05, + "loss": 3.4519, + "step": 7442 + }, + { + "epoch": 0.46203985349804455, + "grad_norm": 0.42518553050138924, + "learning_rate": 9.911411889625656e-05, + "loss": 3.4106, + "step": 7443 + }, + { + "epoch": 0.46210193059780247, + "grad_norm": 0.37279472017698706, + "learning_rate": 9.911344194374179e-05, + "loss": 3.3561, + "step": 7444 + }, + { + "epoch": 0.4621640076975604, + "grad_norm": 0.4043063839886054, + "learning_rate": 9.911276473499028e-05, + "loss": 3.3135, + "step": 7445 + }, + { + "epoch": 0.46222608479731825, + "grad_norm": 0.27944135596853464, + "learning_rate": 9.911208727000555e-05, + "loss": 3.4391, + "step": 7446 + }, + { + "epoch": 0.46228816189707617, + "grad_norm": 0.38872891874895094, + "learning_rate": 9.911140954879113e-05, + "loss": 3.4112, + "step": 7447 + }, + { + "epoch": 0.4623502389968341, + "grad_norm": 0.31356389620650493, + "learning_rate": 9.911073157135058e-05, + "loss": 3.4085, + "step": 7448 + }, + { + "epoch": 0.46241231609659195, + "grad_norm": 0.306704678496886, + "learning_rate": 9.911005333768741e-05, + "loss": 3.4205, + "step": 7449 + }, + { + "epoch": 0.46247439319634986, + "grad_norm": 0.27068829626557706, + "learning_rate": 9.910937484780518e-05, + "loss": 3.3965, + "step": 7450 + }, + { + "epoch": 0.4625364702961078, + "grad_norm": 0.38601792536136775, + "learning_rate": 9.91086961017074e-05, + "loss": 3.3122, + "step": 7451 + }, + { + "epoch": 0.46259854739586564, + "grad_norm": 0.2824623548482417, + "learning_rate": 9.910801709939765e-05, + "loss": 3.3218, + "step": 7452 + }, + { + "epoch": 0.46266062449562356, + "grad_norm": 0.42551835719695114, + "learning_rate": 9.910733784087944e-05, + "loss": 3.3217, + "step": 7453 + }, + { + "epoch": 0.4627227015953815, + "grad_norm": 0.3180579882717984, + "learning_rate": 9.910665832615635e-05, + "loss": 3.3764, + "step": 7454 + }, + { + "epoch": 0.46278477869513934, + "grad_norm": 0.2948061229589484, + "learning_rate": 9.910597855523187e-05, + "loss": 3.413, + "step": 7455 + }, + { + "epoch": 0.46284685579489726, + "grad_norm": 0.23953971603969773, + "learning_rate": 9.91052985281096e-05, + "loss": 3.3702, + "step": 7456 + }, + { + "epoch": 0.4629089328946552, + "grad_norm": 0.340566786770051, + "learning_rate": 9.910461824479307e-05, + "loss": 3.4479, + "step": 7457 + }, + { + "epoch": 0.46297100999441304, + "grad_norm": 0.3431981707553365, + "learning_rate": 9.91039377052858e-05, + "loss": 3.3879, + "step": 7458 + }, + { + "epoch": 0.46303308709417096, + "grad_norm": 0.35734601075558575, + "learning_rate": 9.91032569095914e-05, + "loss": 3.4426, + "step": 7459 + }, + { + "epoch": 0.4630951641939289, + "grad_norm": 0.3116585847925323, + "learning_rate": 9.910257585771338e-05, + "loss": 3.4188, + "step": 7460 + }, + { + "epoch": 0.46315724129368674, + "grad_norm": 0.4528434635156446, + "learning_rate": 9.910189454965528e-05, + "loss": 3.4737, + "step": 7461 + }, + { + "epoch": 0.46321931839344466, + "grad_norm": 0.35899353513767507, + "learning_rate": 9.91012129854207e-05, + "loss": 3.3439, + "step": 7462 + }, + { + "epoch": 0.4632813954932026, + "grad_norm": 0.3501077967070314, + "learning_rate": 9.910053116501316e-05, + "loss": 3.3358, + "step": 7463 + }, + { + "epoch": 0.46334347259296044, + "grad_norm": 0.2871109449901603, + "learning_rate": 9.909984908843623e-05, + "loss": 3.404, + "step": 7464 + }, + { + "epoch": 0.46340554969271835, + "grad_norm": 0.552485487475465, + "learning_rate": 9.909916675569349e-05, + "loss": 3.3904, + "step": 7465 + }, + { + "epoch": 0.4634676267924763, + "grad_norm": 0.4839550628283385, + "learning_rate": 9.909848416678844e-05, + "loss": 3.3563, + "step": 7466 + }, + { + "epoch": 0.46352970389223414, + "grad_norm": 0.3567656867034288, + "learning_rate": 9.90978013217247e-05, + "loss": 3.3537, + "step": 7467 + }, + { + "epoch": 0.46359178099199205, + "grad_norm": 0.2943271172410003, + "learning_rate": 9.909711822050582e-05, + "loss": 3.3731, + "step": 7468 + }, + { + "epoch": 0.46365385809174997, + "grad_norm": 0.30183549146593347, + "learning_rate": 9.909643486313533e-05, + "loss": 3.3676, + "step": 7469 + }, + { + "epoch": 0.46371593519150783, + "grad_norm": 0.3258076535875584, + "learning_rate": 9.909575124961685e-05, + "loss": 3.3937, + "step": 7470 + }, + { + "epoch": 0.46377801229126575, + "grad_norm": 0.31421340666580677, + "learning_rate": 9.90950673799539e-05, + "loss": 3.4313, + "step": 7471 + }, + { + "epoch": 0.46384008939102367, + "grad_norm": 0.5284617969650377, + "learning_rate": 9.909438325415007e-05, + "loss": 3.3843, + "step": 7472 + }, + { + "epoch": 0.46390216649078153, + "grad_norm": 0.2825794323929718, + "learning_rate": 9.909369887220892e-05, + "loss": 3.2774, + "step": 7473 + }, + { + "epoch": 0.46396424359053945, + "grad_norm": 0.3609311990889776, + "learning_rate": 9.909301423413403e-05, + "loss": 3.3951, + "step": 7474 + }, + { + "epoch": 0.46402632069029737, + "grad_norm": 0.3095667023553544, + "learning_rate": 9.909232933992895e-05, + "loss": 3.345, + "step": 7475 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.2891654913660363, + "learning_rate": 9.90916441895973e-05, + "loss": 3.4047, + "step": 7476 + }, + { + "epoch": 0.46415047488981315, + "grad_norm": 0.37590763401490557, + "learning_rate": 9.90909587831426e-05, + "loss": 3.3715, + "step": 7477 + }, + { + "epoch": 0.46421255198957107, + "grad_norm": 0.2709518671461381, + "learning_rate": 9.909027312056847e-05, + "loss": 3.3495, + "step": 7478 + }, + { + "epoch": 0.4642746290893289, + "grad_norm": 0.32661641830239524, + "learning_rate": 9.908958720187846e-05, + "loss": 3.3058, + "step": 7479 + }, + { + "epoch": 0.46433670618908685, + "grad_norm": 0.2813978071709566, + "learning_rate": 9.908890102707615e-05, + "loss": 3.5246, + "step": 7480 + }, + { + "epoch": 0.46439878328884476, + "grad_norm": 0.28118886548109473, + "learning_rate": 9.908821459616514e-05, + "loss": 3.3634, + "step": 7481 + }, + { + "epoch": 0.4644608603886026, + "grad_norm": 0.29126402475291846, + "learning_rate": 9.908752790914898e-05, + "loss": 3.2994, + "step": 7482 + }, + { + "epoch": 0.46452293748836054, + "grad_norm": 0.2452644954168614, + "learning_rate": 9.908684096603128e-05, + "loss": 3.3432, + "step": 7483 + }, + { + "epoch": 0.46458501458811846, + "grad_norm": 0.2640879562341849, + "learning_rate": 9.908615376681563e-05, + "loss": 3.3035, + "step": 7484 + }, + { + "epoch": 0.4646470916878763, + "grad_norm": 0.3883560853065606, + "learning_rate": 9.908546631150557e-05, + "loss": 3.3739, + "step": 7485 + }, + { + "epoch": 0.46470916878763424, + "grad_norm": 0.3190349828901709, + "learning_rate": 9.908477860010473e-05, + "loss": 3.3855, + "step": 7486 + }, + { + "epoch": 0.46477124588739216, + "grad_norm": 0.41789453507136526, + "learning_rate": 9.908409063261669e-05, + "loss": 3.4865, + "step": 7487 + }, + { + "epoch": 0.46483332298715, + "grad_norm": 0.32868480612504464, + "learning_rate": 9.908340240904504e-05, + "loss": 3.4252, + "step": 7488 + }, + { + "epoch": 0.46489540008690794, + "grad_norm": 0.5130713255638188, + "learning_rate": 9.908271392939334e-05, + "loss": 3.3544, + "step": 7489 + }, + { + "epoch": 0.46495747718666586, + "grad_norm": 0.2617551578989981, + "learning_rate": 9.908202519366521e-05, + "loss": 3.344, + "step": 7490 + }, + { + "epoch": 0.4650195542864237, + "grad_norm": 0.55197606454143, + "learning_rate": 9.908133620186426e-05, + "loss": 3.4218, + "step": 7491 + }, + { + "epoch": 0.46508163138618164, + "grad_norm": 0.3410808037761857, + "learning_rate": 9.908064695399405e-05, + "loss": 3.3929, + "step": 7492 + }, + { + "epoch": 0.46514370848593956, + "grad_norm": 0.5222189940218652, + "learning_rate": 9.907995745005819e-05, + "loss": 3.4582, + "step": 7493 + }, + { + "epoch": 0.4652057855856974, + "grad_norm": 0.4154388297599032, + "learning_rate": 9.907926769006028e-05, + "loss": 3.283, + "step": 7494 + }, + { + "epoch": 0.46526786268545534, + "grad_norm": 0.326784426739097, + "learning_rate": 9.907857767400393e-05, + "loss": 3.3239, + "step": 7495 + }, + { + "epoch": 0.46532993978521325, + "grad_norm": 0.3535407779238846, + "learning_rate": 9.90778874018927e-05, + "loss": 3.362, + "step": 7496 + }, + { + "epoch": 0.4653920168849711, + "grad_norm": 0.28676812009339614, + "learning_rate": 9.907719687373022e-05, + "loss": 3.4652, + "step": 7497 + }, + { + "epoch": 0.46545409398472903, + "grad_norm": 0.3617409693112075, + "learning_rate": 9.90765060895201e-05, + "loss": 3.3191, + "step": 7498 + }, + { + "epoch": 0.46551617108448695, + "grad_norm": 0.2605989326961674, + "learning_rate": 9.907581504926595e-05, + "loss": 3.342, + "step": 7499 + }, + { + "epoch": 0.4655782481842448, + "grad_norm": 0.2895717130817797, + "learning_rate": 9.907512375297134e-05, + "loss": 3.4468, + "step": 7500 + }, + { + "epoch": 0.46564032528400273, + "grad_norm": 0.3069799938563988, + "learning_rate": 9.90744322006399e-05, + "loss": 3.3496, + "step": 7501 + }, + { + "epoch": 0.46570240238376065, + "grad_norm": 0.2818672638611804, + "learning_rate": 9.907374039227523e-05, + "loss": 3.4452, + "step": 7502 + }, + { + "epoch": 0.4657644794835185, + "grad_norm": 0.3328473775793291, + "learning_rate": 9.907304832788094e-05, + "loss": 3.3445, + "step": 7503 + }, + { + "epoch": 0.46582655658327643, + "grad_norm": 0.23686045600759686, + "learning_rate": 9.907235600746066e-05, + "loss": 3.3158, + "step": 7504 + }, + { + "epoch": 0.46588863368303435, + "grad_norm": 0.2934241760446447, + "learning_rate": 9.907166343101797e-05, + "loss": 3.3624, + "step": 7505 + }, + { + "epoch": 0.4659507107827922, + "grad_norm": 0.20846293808903105, + "learning_rate": 9.90709705985565e-05, + "loss": 3.3565, + "step": 7506 + }, + { + "epoch": 0.46601278788255013, + "grad_norm": 0.207737285583581, + "learning_rate": 9.907027751007988e-05, + "loss": 3.3871, + "step": 7507 + }, + { + "epoch": 0.46607486498230805, + "grad_norm": 0.297685345406657, + "learning_rate": 9.906958416559168e-05, + "loss": 3.3818, + "step": 7508 + }, + { + "epoch": 0.4661369420820659, + "grad_norm": 0.2493608022056538, + "learning_rate": 9.906889056509558e-05, + "loss": 3.3112, + "step": 7509 + }, + { + "epoch": 0.4661990191818238, + "grad_norm": 0.26653182686175164, + "learning_rate": 9.906819670859515e-05, + "loss": 3.5166, + "step": 7510 + }, + { + "epoch": 0.46626109628158174, + "grad_norm": 0.32452886755945337, + "learning_rate": 9.906750259609403e-05, + "loss": 3.4219, + "step": 7511 + }, + { + "epoch": 0.4663231733813396, + "grad_norm": 0.2689588384059672, + "learning_rate": 9.906680822759584e-05, + "loss": 3.3682, + "step": 7512 + }, + { + "epoch": 0.4663852504810975, + "grad_norm": 0.27963118119329233, + "learning_rate": 9.906611360310421e-05, + "loss": 3.3683, + "step": 7513 + }, + { + "epoch": 0.46644732758085544, + "grad_norm": 0.2448698596994144, + "learning_rate": 9.906541872262272e-05, + "loss": 3.4039, + "step": 7514 + }, + { + "epoch": 0.4665094046806133, + "grad_norm": 0.23685901691090863, + "learning_rate": 9.906472358615506e-05, + "loss": 3.3703, + "step": 7515 + }, + { + "epoch": 0.4665714817803712, + "grad_norm": 0.4013253257435519, + "learning_rate": 9.906402819370482e-05, + "loss": 3.3047, + "step": 7516 + }, + { + "epoch": 0.46663355888012914, + "grad_norm": 0.2327175583643068, + "learning_rate": 9.906333254527563e-05, + "loss": 3.2923, + "step": 7517 + }, + { + "epoch": 0.466695635979887, + "grad_norm": 0.21544747923272495, + "learning_rate": 9.906263664087112e-05, + "loss": 3.3952, + "step": 7518 + }, + { + "epoch": 0.4667577130796449, + "grad_norm": 0.2769835779532908, + "learning_rate": 9.906194048049493e-05, + "loss": 3.3161, + "step": 7519 + }, + { + "epoch": 0.46681979017940284, + "grad_norm": 0.2023660897843442, + "learning_rate": 9.906124406415068e-05, + "loss": 3.3044, + "step": 7520 + }, + { + "epoch": 0.4668818672791607, + "grad_norm": 0.19946872697973797, + "learning_rate": 9.906054739184202e-05, + "loss": 3.404, + "step": 7521 + }, + { + "epoch": 0.4669439443789186, + "grad_norm": 0.2163118745369894, + "learning_rate": 9.905985046357256e-05, + "loss": 3.3269, + "step": 7522 + }, + { + "epoch": 0.46700602147867654, + "grad_norm": 0.22023127763078257, + "learning_rate": 9.905915327934596e-05, + "loss": 3.3247, + "step": 7523 + }, + { + "epoch": 0.4670680985784344, + "grad_norm": 0.21336878876956294, + "learning_rate": 9.905845583916584e-05, + "loss": 3.3926, + "step": 7524 + }, + { + "epoch": 0.4671301756781923, + "grad_norm": 0.25967290844187446, + "learning_rate": 9.905775814303586e-05, + "loss": 3.3978, + "step": 7525 + }, + { + "epoch": 0.46719225277795023, + "grad_norm": 0.2340002401638021, + "learning_rate": 9.905706019095964e-05, + "loss": 3.3108, + "step": 7526 + }, + { + "epoch": 0.4672543298777081, + "grad_norm": 0.2282644097358962, + "learning_rate": 9.905636198294084e-05, + "loss": 3.2709, + "step": 7527 + }, + { + "epoch": 0.467316406977466, + "grad_norm": 0.25107968657158314, + "learning_rate": 9.905566351898308e-05, + "loss": 3.385, + "step": 7528 + }, + { + "epoch": 0.46737848407722393, + "grad_norm": 0.3180097052139807, + "learning_rate": 9.905496479909e-05, + "loss": 3.3283, + "step": 7529 + }, + { + "epoch": 0.4674405611769818, + "grad_norm": 0.3048240893430833, + "learning_rate": 9.905426582326527e-05, + "loss": 3.4853, + "step": 7530 + }, + { + "epoch": 0.4675026382767397, + "grad_norm": 0.2646753360831193, + "learning_rate": 9.905356659151253e-05, + "loss": 3.3983, + "step": 7531 + }, + { + "epoch": 0.46756471537649763, + "grad_norm": 0.31491921516720395, + "learning_rate": 9.905286710383543e-05, + "loss": 3.4133, + "step": 7532 + }, + { + "epoch": 0.4676267924762555, + "grad_norm": 0.28788260056636084, + "learning_rate": 9.90521673602376e-05, + "loss": 3.4083, + "step": 7533 + }, + { + "epoch": 0.4676888695760134, + "grad_norm": 0.3192420728195168, + "learning_rate": 9.905146736072272e-05, + "loss": 3.3907, + "step": 7534 + }, + { + "epoch": 0.46775094667577133, + "grad_norm": 0.2291441368096641, + "learning_rate": 9.905076710529443e-05, + "loss": 3.4547, + "step": 7535 + }, + { + "epoch": 0.4678130237755292, + "grad_norm": 0.21922968314457797, + "learning_rate": 9.905006659395636e-05, + "loss": 3.2468, + "step": 7536 + }, + { + "epoch": 0.4678751008752871, + "grad_norm": 0.24634774285719233, + "learning_rate": 9.90493658267122e-05, + "loss": 3.3102, + "step": 7537 + }, + { + "epoch": 0.467937177975045, + "grad_norm": 0.23089555240662085, + "learning_rate": 9.904866480356561e-05, + "loss": 3.4094, + "step": 7538 + }, + { + "epoch": 0.4679992550748029, + "grad_norm": 0.22138800736343808, + "learning_rate": 9.90479635245202e-05, + "loss": 3.3988, + "step": 7539 + }, + { + "epoch": 0.4680613321745608, + "grad_norm": 0.202885504897786, + "learning_rate": 9.904726198957965e-05, + "loss": 3.3643, + "step": 7540 + }, + { + "epoch": 0.4681234092743187, + "grad_norm": 0.2752677968219889, + "learning_rate": 9.904656019874765e-05, + "loss": 3.3012, + "step": 7541 + }, + { + "epoch": 0.4681854863740766, + "grad_norm": 0.2920587521211955, + "learning_rate": 9.904585815202784e-05, + "loss": 3.2767, + "step": 7542 + }, + { + "epoch": 0.4682475634738345, + "grad_norm": 0.2665216794348058, + "learning_rate": 9.904515584942386e-05, + "loss": 3.4138, + "step": 7543 + }, + { + "epoch": 0.4683096405735924, + "grad_norm": 0.2444112269874329, + "learning_rate": 9.904445329093941e-05, + "loss": 3.3164, + "step": 7544 + }, + { + "epoch": 0.4683717176733503, + "grad_norm": 0.20686207462972184, + "learning_rate": 9.904375047657815e-05, + "loss": 3.3383, + "step": 7545 + }, + { + "epoch": 0.4684337947731082, + "grad_norm": 0.22702838418074633, + "learning_rate": 9.904304740634372e-05, + "loss": 3.3782, + "step": 7546 + }, + { + "epoch": 0.4684958718728661, + "grad_norm": 0.2709362479448263, + "learning_rate": 9.904234408023982e-05, + "loss": 3.3905, + "step": 7547 + }, + { + "epoch": 0.468557948972624, + "grad_norm": 0.3046648555068085, + "learning_rate": 9.904164049827009e-05, + "loss": 3.3163, + "step": 7548 + }, + { + "epoch": 0.4686200260723819, + "grad_norm": 0.263289336581232, + "learning_rate": 9.904093666043823e-05, + "loss": 3.3559, + "step": 7549 + }, + { + "epoch": 0.4686821031721398, + "grad_norm": 0.18739097127355223, + "learning_rate": 9.904023256674791e-05, + "loss": 3.4355, + "step": 7550 + }, + { + "epoch": 0.4687441802718977, + "grad_norm": 0.22590522134546423, + "learning_rate": 9.903952821720276e-05, + "loss": 3.385, + "step": 7551 + }, + { + "epoch": 0.4688062573716556, + "grad_norm": 0.24212348768838227, + "learning_rate": 9.90388236118065e-05, + "loss": 3.3063, + "step": 7552 + }, + { + "epoch": 0.4688683344714135, + "grad_norm": 0.2688200731742858, + "learning_rate": 9.903811875056282e-05, + "loss": 3.4324, + "step": 7553 + }, + { + "epoch": 0.4689304115711714, + "grad_norm": 0.24810639365695825, + "learning_rate": 9.903741363347532e-05, + "loss": 3.4345, + "step": 7554 + }, + { + "epoch": 0.4689924886709293, + "grad_norm": 0.22996272283878053, + "learning_rate": 9.903670826054776e-05, + "loss": 3.3565, + "step": 7555 + }, + { + "epoch": 0.4690545657706872, + "grad_norm": 0.29974276372225933, + "learning_rate": 9.903600263178378e-05, + "loss": 3.3196, + "step": 7556 + }, + { + "epoch": 0.4691166428704451, + "grad_norm": 0.25405726710111726, + "learning_rate": 9.903529674718707e-05, + "loss": 3.3437, + "step": 7557 + }, + { + "epoch": 0.469178719970203, + "grad_norm": 0.20691293358562346, + "learning_rate": 9.903459060676132e-05, + "loss": 3.4689, + "step": 7558 + }, + { + "epoch": 0.4692407970699609, + "grad_norm": 0.22658984249472147, + "learning_rate": 9.90338842105102e-05, + "loss": 3.3521, + "step": 7559 + }, + { + "epoch": 0.4693028741697188, + "grad_norm": 0.2947173641076386, + "learning_rate": 9.903317755843742e-05, + "loss": 3.4386, + "step": 7560 + }, + { + "epoch": 0.4693649512694767, + "grad_norm": 0.2923697770040759, + "learning_rate": 9.903247065054664e-05, + "loss": 3.307, + "step": 7561 + }, + { + "epoch": 0.4694270283692346, + "grad_norm": 0.2759694583785674, + "learning_rate": 9.903176348684156e-05, + "loss": 3.3328, + "step": 7562 + }, + { + "epoch": 0.4694891054689925, + "grad_norm": 0.27953775062486685, + "learning_rate": 9.903105606732585e-05, + "loss": 3.361, + "step": 7563 + }, + { + "epoch": 0.4695511825687504, + "grad_norm": 0.3182086353681855, + "learning_rate": 9.903034839200323e-05, + "loss": 3.2733, + "step": 7564 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.25484096432022296, + "learning_rate": 9.902964046087737e-05, + "loss": 3.3366, + "step": 7565 + }, + { + "epoch": 0.46967533676826617, + "grad_norm": 0.24566941537968082, + "learning_rate": 9.9028932273952e-05, + "loss": 3.3444, + "step": 7566 + }, + { + "epoch": 0.4697374138680241, + "grad_norm": 0.2631163502356508, + "learning_rate": 9.902822383123076e-05, + "loss": 3.4257, + "step": 7567 + }, + { + "epoch": 0.469799490967782, + "grad_norm": 0.20244989630363694, + "learning_rate": 9.902751513271738e-05, + "loss": 3.3331, + "step": 7568 + }, + { + "epoch": 0.46986156806753987, + "grad_norm": 0.2546147658042804, + "learning_rate": 9.902680617841555e-05, + "loss": 3.3984, + "step": 7569 + }, + { + "epoch": 0.4699236451672978, + "grad_norm": 0.506693266852125, + "learning_rate": 9.902609696832898e-05, + "loss": 3.3787, + "step": 7570 + }, + { + "epoch": 0.4699857222670557, + "grad_norm": 0.2885930398912683, + "learning_rate": 9.902538750246136e-05, + "loss": 3.3678, + "step": 7571 + }, + { + "epoch": 0.47004779936681357, + "grad_norm": 0.22623495562995483, + "learning_rate": 9.90246777808164e-05, + "loss": 3.4361, + "step": 7572 + }, + { + "epoch": 0.4701098764665715, + "grad_norm": 0.24322764013629586, + "learning_rate": 9.902396780339778e-05, + "loss": 3.3487, + "step": 7573 + }, + { + "epoch": 0.4701719535663294, + "grad_norm": 0.23930369345938707, + "learning_rate": 9.902325757020922e-05, + "loss": 3.3972, + "step": 7574 + }, + { + "epoch": 0.47023403066608727, + "grad_norm": 0.21844333201578472, + "learning_rate": 9.902254708125442e-05, + "loss": 3.2386, + "step": 7575 + }, + { + "epoch": 0.4702961077658452, + "grad_norm": 0.2541784148289178, + "learning_rate": 9.902183633653709e-05, + "loss": 3.3676, + "step": 7576 + }, + { + "epoch": 0.4703581848656031, + "grad_norm": 0.40484622361234657, + "learning_rate": 9.902112533606096e-05, + "loss": 3.3311, + "step": 7577 + }, + { + "epoch": 0.47042026196536096, + "grad_norm": 0.31499789418924445, + "learning_rate": 9.902041407982972e-05, + "loss": 3.3538, + "step": 7578 + }, + { + "epoch": 0.4704823390651189, + "grad_norm": 0.19689646255731133, + "learning_rate": 9.901970256784705e-05, + "loss": 3.3262, + "step": 7579 + }, + { + "epoch": 0.4705444161648768, + "grad_norm": 0.30432772792181734, + "learning_rate": 9.901899080011672e-05, + "loss": 3.3746, + "step": 7580 + }, + { + "epoch": 0.47060649326463466, + "grad_norm": 0.248628692413019, + "learning_rate": 9.90182787766424e-05, + "loss": 3.3723, + "step": 7581 + }, + { + "epoch": 0.4706685703643926, + "grad_norm": 0.3002557622918806, + "learning_rate": 9.901756649742782e-05, + "loss": 3.369, + "step": 7582 + }, + { + "epoch": 0.4707306474641505, + "grad_norm": 0.28055347183107354, + "learning_rate": 9.90168539624767e-05, + "loss": 3.3416, + "step": 7583 + }, + { + "epoch": 0.47079272456390836, + "grad_norm": 0.23581661481171473, + "learning_rate": 9.901614117179275e-05, + "loss": 3.4206, + "step": 7584 + }, + { + "epoch": 0.4708548016636663, + "grad_norm": 0.20744271146785512, + "learning_rate": 9.90154281253797e-05, + "loss": 3.3045, + "step": 7585 + }, + { + "epoch": 0.4709168787634242, + "grad_norm": 0.4826310356867319, + "learning_rate": 9.901471482324126e-05, + "loss": 3.4608, + "step": 7586 + }, + { + "epoch": 0.47097895586318206, + "grad_norm": 0.2618122218593378, + "learning_rate": 9.901400126538115e-05, + "loss": 3.3716, + "step": 7587 + }, + { + "epoch": 0.47104103296294, + "grad_norm": 0.3204975640275777, + "learning_rate": 9.901328745180309e-05, + "loss": 3.3998, + "step": 7588 + }, + { + "epoch": 0.4711031100626979, + "grad_norm": 0.256645988164455, + "learning_rate": 9.901257338251083e-05, + "loss": 3.3447, + "step": 7589 + }, + { + "epoch": 0.47116518716245576, + "grad_norm": 0.35004680056682486, + "learning_rate": 9.901185905750808e-05, + "loss": 3.4492, + "step": 7590 + }, + { + "epoch": 0.4712272642622137, + "grad_norm": 0.2743813437508376, + "learning_rate": 9.901114447679853e-05, + "loss": 3.3597, + "step": 7591 + }, + { + "epoch": 0.4712893413619716, + "grad_norm": 0.4782351859291453, + "learning_rate": 9.901042964038596e-05, + "loss": 3.4169, + "step": 7592 + }, + { + "epoch": 0.47135141846172945, + "grad_norm": 0.24539335430902184, + "learning_rate": 9.900971454827409e-05, + "loss": 3.2768, + "step": 7593 + }, + { + "epoch": 0.4714134955614874, + "grad_norm": 0.2894770253310335, + "learning_rate": 9.900899920046663e-05, + "loss": 3.3814, + "step": 7594 + }, + { + "epoch": 0.4714755726612453, + "grad_norm": 0.36821857726730756, + "learning_rate": 9.900828359696733e-05, + "loss": 3.4036, + "step": 7595 + }, + { + "epoch": 0.47153764976100315, + "grad_norm": 0.3051233719645961, + "learning_rate": 9.900756773777992e-05, + "loss": 3.2799, + "step": 7596 + }, + { + "epoch": 0.47159972686076107, + "grad_norm": 0.3147501875101914, + "learning_rate": 9.900685162290812e-05, + "loss": 3.303, + "step": 7597 + }, + { + "epoch": 0.471661803960519, + "grad_norm": 0.3179899307057305, + "learning_rate": 9.90061352523557e-05, + "loss": 3.4294, + "step": 7598 + }, + { + "epoch": 0.47172388106027685, + "grad_norm": 0.36176661098545054, + "learning_rate": 9.900541862612635e-05, + "loss": 3.3577, + "step": 7599 + }, + { + "epoch": 0.47178595816003477, + "grad_norm": 0.47313656322907255, + "learning_rate": 9.900470174422385e-05, + "loss": 3.2048, + "step": 7600 + }, + { + "epoch": 0.4718480352597927, + "grad_norm": 0.3388924932787598, + "learning_rate": 9.900398460665192e-05, + "loss": 3.4283, + "step": 7601 + }, + { + "epoch": 0.47191011235955055, + "grad_norm": 0.312651847663476, + "learning_rate": 9.90032672134143e-05, + "loss": 3.3185, + "step": 7602 + }, + { + "epoch": 0.47197218945930847, + "grad_norm": 0.3921772252594864, + "learning_rate": 9.900254956451474e-05, + "loss": 3.3811, + "step": 7603 + }, + { + "epoch": 0.4720342665590664, + "grad_norm": 0.3087126587283123, + "learning_rate": 9.9001831659957e-05, + "loss": 3.3979, + "step": 7604 + }, + { + "epoch": 0.47209634365882425, + "grad_norm": 0.3163209729600293, + "learning_rate": 9.90011134997448e-05, + "loss": 3.4894, + "step": 7605 + }, + { + "epoch": 0.47215842075858216, + "grad_norm": 0.34525557821285136, + "learning_rate": 9.900039508388188e-05, + "loss": 3.3639, + "step": 7606 + }, + { + "epoch": 0.4722204978583401, + "grad_norm": 0.42191009022336995, + "learning_rate": 9.899967641237203e-05, + "loss": 3.4018, + "step": 7607 + }, + { + "epoch": 0.47228257495809794, + "grad_norm": 0.47001513778880916, + "learning_rate": 9.899895748521895e-05, + "loss": 3.3037, + "step": 7608 + }, + { + "epoch": 0.47234465205785586, + "grad_norm": 0.2176580679434871, + "learning_rate": 9.899823830242643e-05, + "loss": 3.3383, + "step": 7609 + }, + { + "epoch": 0.4724067291576138, + "grad_norm": 0.31439856535712246, + "learning_rate": 9.89975188639982e-05, + "loss": 3.4035, + "step": 7610 + }, + { + "epoch": 0.47246880625737164, + "grad_norm": 0.3136630231622547, + "learning_rate": 9.8996799169938e-05, + "loss": 3.3612, + "step": 7611 + }, + { + "epoch": 0.47253088335712956, + "grad_norm": 0.43255869726199686, + "learning_rate": 9.899607922024962e-05, + "loss": 3.3635, + "step": 7612 + }, + { + "epoch": 0.4725929604568875, + "grad_norm": 0.29291237606625015, + "learning_rate": 9.899535901493682e-05, + "loss": 3.4237, + "step": 7613 + }, + { + "epoch": 0.47265503755664534, + "grad_norm": 0.23591502000437736, + "learning_rate": 9.899463855400332e-05, + "loss": 3.3155, + "step": 7614 + }, + { + "epoch": 0.47271711465640326, + "grad_norm": 0.3233385086608071, + "learning_rate": 9.89939178374529e-05, + "loss": 3.4013, + "step": 7615 + }, + { + "epoch": 0.4727791917561612, + "grad_norm": 0.38528359109014276, + "learning_rate": 9.899319686528932e-05, + "loss": 3.3971, + "step": 7616 + }, + { + "epoch": 0.47284126885591904, + "grad_norm": 0.22799313143581915, + "learning_rate": 9.899247563751633e-05, + "loss": 3.4008, + "step": 7617 + }, + { + "epoch": 0.47290334595567696, + "grad_norm": 0.21438826047295273, + "learning_rate": 9.89917541541377e-05, + "loss": 3.3001, + "step": 7618 + }, + { + "epoch": 0.4729654230554349, + "grad_norm": 0.3061584034005198, + "learning_rate": 9.89910324151572e-05, + "loss": 3.3962, + "step": 7619 + }, + { + "epoch": 0.47302750015519274, + "grad_norm": 0.2717273228868491, + "learning_rate": 9.899031042057859e-05, + "loss": 3.421, + "step": 7620 + }, + { + "epoch": 0.47308957725495066, + "grad_norm": 0.18786375206521141, + "learning_rate": 9.898958817040565e-05, + "loss": 3.2707, + "step": 7621 + }, + { + "epoch": 0.4731516543547086, + "grad_norm": 0.23424188489455136, + "learning_rate": 9.898886566464212e-05, + "loss": 3.4025, + "step": 7622 + }, + { + "epoch": 0.47321373145446644, + "grad_norm": 0.22903652812847544, + "learning_rate": 9.898814290329179e-05, + "loss": 3.4082, + "step": 7623 + }, + { + "epoch": 0.47327580855422435, + "grad_norm": 0.290405978971236, + "learning_rate": 9.898741988635842e-05, + "loss": 3.4476, + "step": 7624 + }, + { + "epoch": 0.47333788565398227, + "grad_norm": 0.2562423780517279, + "learning_rate": 9.89866966138458e-05, + "loss": 3.3653, + "step": 7625 + }, + { + "epoch": 0.47339996275374013, + "grad_norm": 0.2578549061202869, + "learning_rate": 9.898597308575767e-05, + "loss": 3.5107, + "step": 7626 + }, + { + "epoch": 0.47346203985349805, + "grad_norm": 0.2845472600078696, + "learning_rate": 9.898524930209786e-05, + "loss": 3.3963, + "step": 7627 + }, + { + "epoch": 0.47352411695325597, + "grad_norm": 0.2830809494090738, + "learning_rate": 9.89845252628701e-05, + "loss": 3.4242, + "step": 7628 + }, + { + "epoch": 0.47358619405301383, + "grad_norm": 0.2736347440750813, + "learning_rate": 9.898380096807818e-05, + "loss": 3.3322, + "step": 7629 + }, + { + "epoch": 0.47364827115277175, + "grad_norm": 0.19762091226530404, + "learning_rate": 9.898307641772586e-05, + "loss": 3.3722, + "step": 7630 + }, + { + "epoch": 0.47371034825252967, + "grad_norm": 0.22573360547599772, + "learning_rate": 9.898235161181696e-05, + "loss": 3.3369, + "step": 7631 + }, + { + "epoch": 0.47377242535228753, + "grad_norm": 0.1913087362899366, + "learning_rate": 9.898162655035524e-05, + "loss": 3.2314, + "step": 7632 + }, + { + "epoch": 0.47383450245204545, + "grad_norm": 0.2396276224014747, + "learning_rate": 9.898090123334447e-05, + "loss": 3.4006, + "step": 7633 + }, + { + "epoch": 0.47389657955180337, + "grad_norm": 0.22694350138497746, + "learning_rate": 9.898017566078845e-05, + "loss": 3.3491, + "step": 7634 + }, + { + "epoch": 0.4739586566515612, + "grad_norm": 0.34044221821066906, + "learning_rate": 9.897944983269098e-05, + "loss": 3.3062, + "step": 7635 + }, + { + "epoch": 0.47402073375131915, + "grad_norm": 0.23950176647307328, + "learning_rate": 9.897872374905582e-05, + "loss": 3.3304, + "step": 7636 + }, + { + "epoch": 0.47408281085107706, + "grad_norm": 0.36641599890259335, + "learning_rate": 9.897799740988676e-05, + "loss": 3.4419, + "step": 7637 + }, + { + "epoch": 0.4741448879508349, + "grad_norm": 0.3630469807640465, + "learning_rate": 9.897727081518762e-05, + "loss": 3.3907, + "step": 7638 + }, + { + "epoch": 0.47420696505059284, + "grad_norm": 0.373804710715223, + "learning_rate": 9.897654396496214e-05, + "loss": 3.375, + "step": 7639 + }, + { + "epoch": 0.47426904215035076, + "grad_norm": 0.3112343550009429, + "learning_rate": 9.897581685921416e-05, + "loss": 3.4099, + "step": 7640 + }, + { + "epoch": 0.4743311192501086, + "grad_norm": 0.2612753856225746, + "learning_rate": 9.897508949794744e-05, + "loss": 3.3634, + "step": 7641 + }, + { + "epoch": 0.47439319634986654, + "grad_norm": 0.3557353015927301, + "learning_rate": 9.897436188116581e-05, + "loss": 3.4847, + "step": 7642 + }, + { + "epoch": 0.47445527344962446, + "grad_norm": 0.3032700260528353, + "learning_rate": 9.897363400887303e-05, + "loss": 3.3096, + "step": 7643 + }, + { + "epoch": 0.4745173505493823, + "grad_norm": 0.34825895336482926, + "learning_rate": 9.897290588107291e-05, + "loss": 3.4267, + "step": 7644 + }, + { + "epoch": 0.47457942764914024, + "grad_norm": 0.35118663616790413, + "learning_rate": 9.897217749776926e-05, + "loss": 3.4019, + "step": 7645 + }, + { + "epoch": 0.47464150474889816, + "grad_norm": 0.281105481210966, + "learning_rate": 9.897144885896588e-05, + "loss": 3.3988, + "step": 7646 + }, + { + "epoch": 0.474703581848656, + "grad_norm": 0.30339005129743646, + "learning_rate": 9.897071996466656e-05, + "loss": 3.4246, + "step": 7647 + }, + { + "epoch": 0.47476565894841394, + "grad_norm": 0.26937896462286615, + "learning_rate": 9.896999081487509e-05, + "loss": 3.3968, + "step": 7648 + }, + { + "epoch": 0.47482773604817186, + "grad_norm": 0.31837704381297965, + "learning_rate": 9.89692614095953e-05, + "loss": 3.4305, + "step": 7649 + }, + { + "epoch": 0.4748898131479297, + "grad_norm": 0.35571783372616417, + "learning_rate": 9.896853174883098e-05, + "loss": 3.4241, + "step": 7650 + }, + { + "epoch": 0.47495189024768764, + "grad_norm": 0.3383845539364829, + "learning_rate": 9.896780183258594e-05, + "loss": 3.4169, + "step": 7651 + }, + { + "epoch": 0.47501396734744555, + "grad_norm": 0.27691665861290654, + "learning_rate": 9.896707166086401e-05, + "loss": 3.2516, + "step": 7652 + }, + { + "epoch": 0.4750760444472034, + "grad_norm": 0.24151976117244062, + "learning_rate": 9.896634123366895e-05, + "loss": 3.3931, + "step": 7653 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.2848283763969978, + "learning_rate": 9.896561055100462e-05, + "loss": 3.3698, + "step": 7654 + }, + { + "epoch": 0.47520019864671925, + "grad_norm": 0.3066217574614098, + "learning_rate": 9.896487961287482e-05, + "loss": 3.3415, + "step": 7655 + }, + { + "epoch": 0.4752622757464771, + "grad_norm": 0.20418383569117962, + "learning_rate": 9.896414841928334e-05, + "loss": 3.4226, + "step": 7656 + }, + { + "epoch": 0.47532435284623503, + "grad_norm": 0.2405492843056298, + "learning_rate": 9.896341697023401e-05, + "loss": 3.3595, + "step": 7657 + }, + { + "epoch": 0.47538642994599295, + "grad_norm": 0.2891715720312129, + "learning_rate": 9.896268526573067e-05, + "loss": 3.3716, + "step": 7658 + }, + { + "epoch": 0.4754485070457508, + "grad_norm": 0.28858730848029013, + "learning_rate": 9.89619533057771e-05, + "loss": 3.425, + "step": 7659 + }, + { + "epoch": 0.47551058414550873, + "grad_norm": 0.24532495485262804, + "learning_rate": 9.896122109037713e-05, + "loss": 3.3593, + "step": 7660 + }, + { + "epoch": 0.47557266124526665, + "grad_norm": 0.2604812001695065, + "learning_rate": 9.896048861953459e-05, + "loss": 3.3207, + "step": 7661 + }, + { + "epoch": 0.4756347383450245, + "grad_norm": 0.17073000100552396, + "learning_rate": 9.895975589325328e-05, + "loss": 3.2901, + "step": 7662 + }, + { + "epoch": 0.47569681544478243, + "grad_norm": 0.42351604873306703, + "learning_rate": 9.895902291153706e-05, + "loss": 3.3662, + "step": 7663 + }, + { + "epoch": 0.47575889254454035, + "grad_norm": 0.22156682271197467, + "learning_rate": 9.895828967438971e-05, + "loss": 3.3152, + "step": 7664 + }, + { + "epoch": 0.4758209696442982, + "grad_norm": 0.25338887541953115, + "learning_rate": 9.89575561818151e-05, + "loss": 3.3438, + "step": 7665 + }, + { + "epoch": 0.4758830467440561, + "grad_norm": 0.32718678719370964, + "learning_rate": 9.895682243381703e-05, + "loss": 3.3373, + "step": 7666 + }, + { + "epoch": 0.47594512384381404, + "grad_norm": 0.3526125379589471, + "learning_rate": 9.895608843039933e-05, + "loss": 3.3424, + "step": 7667 + }, + { + "epoch": 0.4760072009435719, + "grad_norm": 0.24926455922778593, + "learning_rate": 9.895535417156583e-05, + "loss": 3.3108, + "step": 7668 + }, + { + "epoch": 0.4760692780433298, + "grad_norm": 0.25360663415940027, + "learning_rate": 9.895461965732036e-05, + "loss": 3.4225, + "step": 7669 + }, + { + "epoch": 0.47613135514308774, + "grad_norm": 0.27985628831592524, + "learning_rate": 9.895388488766676e-05, + "loss": 3.2729, + "step": 7670 + }, + { + "epoch": 0.4761934322428456, + "grad_norm": 0.3021794801927954, + "learning_rate": 9.895314986260887e-05, + "loss": 3.3765, + "step": 7671 + }, + { + "epoch": 0.4762555093426035, + "grad_norm": 0.40111088052270716, + "learning_rate": 9.89524145821505e-05, + "loss": 3.415, + "step": 7672 + }, + { + "epoch": 0.47631758644236144, + "grad_norm": 0.39527217297427614, + "learning_rate": 9.895167904629549e-05, + "loss": 3.3942, + "step": 7673 + }, + { + "epoch": 0.4763796635421193, + "grad_norm": 0.29798864743270925, + "learning_rate": 9.895094325504768e-05, + "loss": 3.3767, + "step": 7674 + }, + { + "epoch": 0.4764417406418772, + "grad_norm": 0.35195549086644556, + "learning_rate": 9.895020720841094e-05, + "loss": 3.4499, + "step": 7675 + }, + { + "epoch": 0.47650381774163514, + "grad_norm": 0.2963060020880083, + "learning_rate": 9.894947090638908e-05, + "loss": 3.3679, + "step": 7676 + }, + { + "epoch": 0.476565894841393, + "grad_norm": 0.5023480060687796, + "learning_rate": 9.894873434898595e-05, + "loss": 3.3498, + "step": 7677 + }, + { + "epoch": 0.4766279719411509, + "grad_norm": 0.40091752529561125, + "learning_rate": 9.894799753620538e-05, + "loss": 3.3647, + "step": 7678 + }, + { + "epoch": 0.47669004904090884, + "grad_norm": 0.30562040364528303, + "learning_rate": 9.894726046805123e-05, + "loss": 3.3902, + "step": 7679 + }, + { + "epoch": 0.4767521261406667, + "grad_norm": 0.26461310344329725, + "learning_rate": 9.894652314452734e-05, + "loss": 3.4046, + "step": 7680 + }, + { + "epoch": 0.4768142032404246, + "grad_norm": 0.23219388309961755, + "learning_rate": 9.894578556563756e-05, + "loss": 3.3588, + "step": 7681 + }, + { + "epoch": 0.4768762803401825, + "grad_norm": 0.2714421905017451, + "learning_rate": 9.894504773138573e-05, + "loss": 3.3552, + "step": 7682 + }, + { + "epoch": 0.4769383574399404, + "grad_norm": 0.2781972780499693, + "learning_rate": 9.89443096417757e-05, + "loss": 3.3535, + "step": 7683 + }, + { + "epoch": 0.4770004345396983, + "grad_norm": 0.21322654329918386, + "learning_rate": 9.894357129681134e-05, + "loss": 3.3546, + "step": 7684 + }, + { + "epoch": 0.4770625116394562, + "grad_norm": 0.24230993160345782, + "learning_rate": 9.894283269649647e-05, + "loss": 3.325, + "step": 7685 + }, + { + "epoch": 0.4771245887392141, + "grad_norm": 0.24951018838730396, + "learning_rate": 9.894209384083497e-05, + "loss": 3.3864, + "step": 7686 + }, + { + "epoch": 0.477186665838972, + "grad_norm": 0.3107239560210313, + "learning_rate": 9.894135472983069e-05, + "loss": 3.41, + "step": 7687 + }, + { + "epoch": 0.4772487429387299, + "grad_norm": 0.2778716637312007, + "learning_rate": 9.894061536348746e-05, + "loss": 3.4004, + "step": 7688 + }, + { + "epoch": 0.4773108200384878, + "grad_norm": 0.18734302506294648, + "learning_rate": 9.893987574180918e-05, + "loss": 3.3821, + "step": 7689 + }, + { + "epoch": 0.4773728971382457, + "grad_norm": 0.34853828708806206, + "learning_rate": 9.893913586479969e-05, + "loss": 3.3535, + "step": 7690 + }, + { + "epoch": 0.4774349742380036, + "grad_norm": 0.24367428300504906, + "learning_rate": 9.893839573246284e-05, + "loss": 3.2792, + "step": 7691 + }, + { + "epoch": 0.4774970513377615, + "grad_norm": 0.41361883974373076, + "learning_rate": 9.89376553448025e-05, + "loss": 3.3991, + "step": 7692 + }, + { + "epoch": 0.4775591284375194, + "grad_norm": 0.2917078094188023, + "learning_rate": 9.893691470182252e-05, + "loss": 3.3949, + "step": 7693 + }, + { + "epoch": 0.47762120553727727, + "grad_norm": 0.22602989635644977, + "learning_rate": 9.89361738035268e-05, + "loss": 3.3521, + "step": 7694 + }, + { + "epoch": 0.4776832826370352, + "grad_norm": 0.26911458690907364, + "learning_rate": 9.893543264991916e-05, + "loss": 3.2908, + "step": 7695 + }, + { + "epoch": 0.4777453597367931, + "grad_norm": 0.298528259017977, + "learning_rate": 9.89346912410035e-05, + "loss": 3.3328, + "step": 7696 + }, + { + "epoch": 0.47780743683655097, + "grad_norm": 0.3278594077471646, + "learning_rate": 9.893394957678367e-05, + "loss": 3.305, + "step": 7697 + }, + { + "epoch": 0.4778695139363089, + "grad_norm": 0.33578467774272486, + "learning_rate": 9.893320765726355e-05, + "loss": 3.3641, + "step": 7698 + }, + { + "epoch": 0.4779315910360668, + "grad_norm": 0.3171021262029031, + "learning_rate": 9.893246548244701e-05, + "loss": 3.3759, + "step": 7699 + }, + { + "epoch": 0.47799366813582467, + "grad_norm": 0.24763404854981294, + "learning_rate": 9.893172305233791e-05, + "loss": 3.261, + "step": 7700 + }, + { + "epoch": 0.4780557452355826, + "grad_norm": 0.266437913461678, + "learning_rate": 9.893098036694014e-05, + "loss": 3.3668, + "step": 7701 + }, + { + "epoch": 0.4781178223353405, + "grad_norm": 0.292715675402202, + "learning_rate": 9.893023742625756e-05, + "loss": 3.4021, + "step": 7702 + }, + { + "epoch": 0.47817989943509837, + "grad_norm": 0.24829329618083676, + "learning_rate": 9.892949423029405e-05, + "loss": 3.3014, + "step": 7703 + }, + { + "epoch": 0.4782419765348563, + "grad_norm": 0.3016271215586346, + "learning_rate": 9.892875077905351e-05, + "loss": 3.3857, + "step": 7704 + }, + { + "epoch": 0.4783040536346142, + "grad_norm": 0.3188057210314078, + "learning_rate": 9.892800707253979e-05, + "loss": 3.4157, + "step": 7705 + }, + { + "epoch": 0.47836613073437206, + "grad_norm": 0.31170702699497066, + "learning_rate": 9.892726311075677e-05, + "loss": 3.3836, + "step": 7706 + }, + { + "epoch": 0.47842820783413, + "grad_norm": 0.2927952443374719, + "learning_rate": 9.892651889370834e-05, + "loss": 3.3542, + "step": 7707 + }, + { + "epoch": 0.4784902849338879, + "grad_norm": 0.5552863796790873, + "learning_rate": 9.892577442139839e-05, + "loss": 3.3564, + "step": 7708 + }, + { + "epoch": 0.47855236203364576, + "grad_norm": 0.27608000591029636, + "learning_rate": 9.89250296938308e-05, + "loss": 3.3378, + "step": 7709 + }, + { + "epoch": 0.4786144391334037, + "grad_norm": 0.5374368397619596, + "learning_rate": 9.892428471100945e-05, + "loss": 3.3641, + "step": 7710 + }, + { + "epoch": 0.4786765162331616, + "grad_norm": 0.3740690318073512, + "learning_rate": 9.892353947293825e-05, + "loss": 3.4501, + "step": 7711 + }, + { + "epoch": 0.47873859333291946, + "grad_norm": 0.2593264775677508, + "learning_rate": 9.892279397962104e-05, + "loss": 3.3508, + "step": 7712 + }, + { + "epoch": 0.4788006704326774, + "grad_norm": 0.3415352064956409, + "learning_rate": 9.892204823106175e-05, + "loss": 3.3682, + "step": 7713 + }, + { + "epoch": 0.4788627475324353, + "grad_norm": 0.2816822518578601, + "learning_rate": 9.892130222726426e-05, + "loss": 3.3107, + "step": 7714 + }, + { + "epoch": 0.47892482463219316, + "grad_norm": 0.23268827029707034, + "learning_rate": 9.892055596823244e-05, + "loss": 3.3718, + "step": 7715 + }, + { + "epoch": 0.4789869017319511, + "grad_norm": 0.37073183983552443, + "learning_rate": 9.891980945397022e-05, + "loss": 3.2818, + "step": 7716 + }, + { + "epoch": 0.479048978831709, + "grad_norm": 0.2917107977291481, + "learning_rate": 9.891906268448147e-05, + "loss": 3.437, + "step": 7717 + }, + { + "epoch": 0.47911105593146686, + "grad_norm": 0.30002205886804195, + "learning_rate": 9.891831565977011e-05, + "loss": 3.4253, + "step": 7718 + }, + { + "epoch": 0.4791731330312248, + "grad_norm": 0.2601175147651874, + "learning_rate": 9.891756837984001e-05, + "loss": 3.3943, + "step": 7719 + }, + { + "epoch": 0.4792352101309827, + "grad_norm": 0.2940332655000891, + "learning_rate": 9.891682084469508e-05, + "loss": 3.3298, + "step": 7720 + }, + { + "epoch": 0.47929728723074055, + "grad_norm": 0.24961870143223897, + "learning_rate": 9.891607305433923e-05, + "loss": 3.2995, + "step": 7721 + }, + { + "epoch": 0.47935936433049847, + "grad_norm": 0.30652887288254055, + "learning_rate": 9.891532500877635e-05, + "loss": 3.3242, + "step": 7722 + }, + { + "epoch": 0.4794214414302564, + "grad_norm": 0.36276992580915385, + "learning_rate": 9.891457670801035e-05, + "loss": 3.32, + "step": 7723 + }, + { + "epoch": 0.47948351853001425, + "grad_norm": 0.2969703395744363, + "learning_rate": 9.89138281520451e-05, + "loss": 3.3597, + "step": 7724 + }, + { + "epoch": 0.47954559562977217, + "grad_norm": 0.3295077412387666, + "learning_rate": 9.891307934088456e-05, + "loss": 3.3772, + "step": 7725 + }, + { + "epoch": 0.4796076727295301, + "grad_norm": 0.39497773240496, + "learning_rate": 9.891233027453262e-05, + "loss": 3.28, + "step": 7726 + }, + { + "epoch": 0.47966974982928795, + "grad_norm": 0.3272954468151665, + "learning_rate": 9.891158095299315e-05, + "loss": 3.3181, + "step": 7727 + }, + { + "epoch": 0.47973182692904587, + "grad_norm": 0.319401503898368, + "learning_rate": 9.891083137627011e-05, + "loss": 3.3179, + "step": 7728 + }, + { + "epoch": 0.4797939040288038, + "grad_norm": 0.29009420981633205, + "learning_rate": 9.891008154436738e-05, + "loss": 3.3473, + "step": 7729 + }, + { + "epoch": 0.47985598112856165, + "grad_norm": 0.22987449131816529, + "learning_rate": 9.890933145728889e-05, + "loss": 3.2075, + "step": 7730 + }, + { + "epoch": 0.47991805822831957, + "grad_norm": 0.2508997966759832, + "learning_rate": 9.890858111503853e-05, + "loss": 3.3328, + "step": 7731 + }, + { + "epoch": 0.4799801353280775, + "grad_norm": 0.23138636228627807, + "learning_rate": 9.890783051762025e-05, + "loss": 3.4317, + "step": 7732 + }, + { + "epoch": 0.48004221242783535, + "grad_norm": 0.288593736947002, + "learning_rate": 9.890707966503793e-05, + "loss": 3.3194, + "step": 7733 + }, + { + "epoch": 0.48010428952759326, + "grad_norm": 0.25453172085357256, + "learning_rate": 9.89063285572955e-05, + "loss": 3.4136, + "step": 7734 + }, + { + "epoch": 0.4801663666273512, + "grad_norm": 0.2703654193344718, + "learning_rate": 9.89055771943969e-05, + "loss": 3.3321, + "step": 7735 + }, + { + "epoch": 0.48022844372710904, + "grad_norm": 0.19379811650820963, + "learning_rate": 9.890482557634602e-05, + "loss": 3.3102, + "step": 7736 + }, + { + "epoch": 0.48029052082686696, + "grad_norm": 0.3416162504045084, + "learning_rate": 9.890407370314678e-05, + "loss": 3.3184, + "step": 7737 + }, + { + "epoch": 0.4803525979266249, + "grad_norm": 0.2920496957340755, + "learning_rate": 9.890332157480313e-05, + "loss": 3.3449, + "step": 7738 + }, + { + "epoch": 0.48041467502638274, + "grad_norm": 0.20067825681546425, + "learning_rate": 9.890256919131898e-05, + "loss": 3.2828, + "step": 7739 + }, + { + "epoch": 0.48047675212614066, + "grad_norm": 0.2843769083902231, + "learning_rate": 9.890181655269825e-05, + "loss": 3.2596, + "step": 7740 + }, + { + "epoch": 0.4805388292258986, + "grad_norm": 0.3639194051773311, + "learning_rate": 9.890106365894488e-05, + "loss": 3.2873, + "step": 7741 + }, + { + "epoch": 0.48060090632565644, + "grad_norm": 0.25836883110041975, + "learning_rate": 9.890031051006278e-05, + "loss": 3.3603, + "step": 7742 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.24933957994451916, + "learning_rate": 9.88995571060559e-05, + "loss": 3.3549, + "step": 7743 + }, + { + "epoch": 0.4807250605251723, + "grad_norm": 0.29038792959650045, + "learning_rate": 9.889880344692814e-05, + "loss": 3.4362, + "step": 7744 + }, + { + "epoch": 0.48078713762493014, + "grad_norm": 0.30265505582212476, + "learning_rate": 9.889804953268347e-05, + "loss": 3.3728, + "step": 7745 + }, + { + "epoch": 0.48084921472468806, + "grad_norm": 0.26358706205212895, + "learning_rate": 9.88972953633258e-05, + "loss": 3.2846, + "step": 7746 + }, + { + "epoch": 0.480911291824446, + "grad_norm": 0.297055101977108, + "learning_rate": 9.889654093885908e-05, + "loss": 3.2902, + "step": 7747 + }, + { + "epoch": 0.48097336892420384, + "grad_norm": 0.18578475961666704, + "learning_rate": 9.889578625928722e-05, + "loss": 3.33, + "step": 7748 + }, + { + "epoch": 0.48103544602396175, + "grad_norm": 0.2696000025315879, + "learning_rate": 9.889503132461417e-05, + "loss": 3.3916, + "step": 7749 + }, + { + "epoch": 0.4810975231237197, + "grad_norm": 0.2672800924248651, + "learning_rate": 9.889427613484388e-05, + "loss": 3.3722, + "step": 7750 + }, + { + "epoch": 0.48115960022347753, + "grad_norm": 0.2491695199971039, + "learning_rate": 9.889352068998028e-05, + "loss": 3.3709, + "step": 7751 + }, + { + "epoch": 0.48122167732323545, + "grad_norm": 0.20376716488809996, + "learning_rate": 9.889276499002731e-05, + "loss": 3.3572, + "step": 7752 + }, + { + "epoch": 0.48128375442299337, + "grad_norm": 0.285794732144434, + "learning_rate": 9.889200903498893e-05, + "loss": 3.2472, + "step": 7753 + }, + { + "epoch": 0.48134583152275123, + "grad_norm": 0.23976748151610439, + "learning_rate": 9.889125282486906e-05, + "loss": 3.3388, + "step": 7754 + }, + { + "epoch": 0.48140790862250915, + "grad_norm": 0.26519962920303947, + "learning_rate": 9.889049635967166e-05, + "loss": 3.3019, + "step": 7755 + }, + { + "epoch": 0.48146998572226707, + "grad_norm": 0.2747712169834179, + "learning_rate": 9.888973963940067e-05, + "loss": 3.4146, + "step": 7756 + }, + { + "epoch": 0.48153206282202493, + "grad_norm": 0.2337551822189904, + "learning_rate": 9.888898266406003e-05, + "loss": 3.3258, + "step": 7757 + }, + { + "epoch": 0.48159413992178285, + "grad_norm": 0.2915906920709629, + "learning_rate": 9.888822543365371e-05, + "loss": 3.332, + "step": 7758 + }, + { + "epoch": 0.48165621702154077, + "grad_norm": 0.18717970909883727, + "learning_rate": 9.888746794818564e-05, + "loss": 3.3332, + "step": 7759 + }, + { + "epoch": 0.48171829412129863, + "grad_norm": 0.2789383968558521, + "learning_rate": 9.88867102076598e-05, + "loss": 3.2287, + "step": 7760 + }, + { + "epoch": 0.48178037122105655, + "grad_norm": 0.24789133226285942, + "learning_rate": 9.888595221208011e-05, + "loss": 3.3672, + "step": 7761 + }, + { + "epoch": 0.48184244832081446, + "grad_norm": 0.23020617025892495, + "learning_rate": 9.888519396145055e-05, + "loss": 3.3616, + "step": 7762 + }, + { + "epoch": 0.4819045254205723, + "grad_norm": 0.2064237859468724, + "learning_rate": 9.888443545577506e-05, + "loss": 3.2838, + "step": 7763 + }, + { + "epoch": 0.48196660252033025, + "grad_norm": 0.2453332913450429, + "learning_rate": 9.888367669505761e-05, + "loss": 3.2701, + "step": 7764 + }, + { + "epoch": 0.48202867962008816, + "grad_norm": 0.28684296217146404, + "learning_rate": 9.888291767930213e-05, + "loss": 3.3637, + "step": 7765 + }, + { + "epoch": 0.482090756719846, + "grad_norm": 0.22826585405719343, + "learning_rate": 9.888215840851263e-05, + "loss": 3.3661, + "step": 7766 + }, + { + "epoch": 0.48215283381960394, + "grad_norm": 0.22052592362799778, + "learning_rate": 9.888139888269302e-05, + "loss": 3.2252, + "step": 7767 + }, + { + "epoch": 0.48221491091936186, + "grad_norm": 0.3013704770712762, + "learning_rate": 9.88806391018473e-05, + "loss": 3.4005, + "step": 7768 + }, + { + "epoch": 0.4822769880191197, + "grad_norm": 0.2655611803017979, + "learning_rate": 9.887987906597942e-05, + "loss": 3.2789, + "step": 7769 + }, + { + "epoch": 0.48233906511887764, + "grad_norm": 0.26723416350380347, + "learning_rate": 9.887911877509333e-05, + "loss": 3.2537, + "step": 7770 + }, + { + "epoch": 0.48240114221863556, + "grad_norm": 0.28302716017273494, + "learning_rate": 9.887835822919302e-05, + "loss": 3.2453, + "step": 7771 + }, + { + "epoch": 0.4824632193183934, + "grad_norm": 0.2741108369556158, + "learning_rate": 9.887759742828246e-05, + "loss": 3.3464, + "step": 7772 + }, + { + "epoch": 0.48252529641815134, + "grad_norm": 0.22732092261003908, + "learning_rate": 9.88768363723656e-05, + "loss": 3.3217, + "step": 7773 + }, + { + "epoch": 0.48258737351790926, + "grad_norm": 0.19897083676746086, + "learning_rate": 9.887607506144642e-05, + "loss": 3.3524, + "step": 7774 + }, + { + "epoch": 0.4826494506176671, + "grad_norm": 0.25361155500915145, + "learning_rate": 9.887531349552888e-05, + "loss": 3.4354, + "step": 7775 + }, + { + "epoch": 0.48271152771742504, + "grad_norm": 0.2457502253255849, + "learning_rate": 9.887455167461697e-05, + "loss": 3.3536, + "step": 7776 + }, + { + "epoch": 0.48277360481718296, + "grad_norm": 0.21616544787713576, + "learning_rate": 9.887378959871467e-05, + "loss": 3.2911, + "step": 7777 + }, + { + "epoch": 0.4828356819169408, + "grad_norm": 0.23189215038860192, + "learning_rate": 9.887302726782595e-05, + "loss": 3.4138, + "step": 7778 + }, + { + "epoch": 0.48289775901669874, + "grad_norm": 0.25727180656561843, + "learning_rate": 9.887226468195476e-05, + "loss": 3.3822, + "step": 7779 + }, + { + "epoch": 0.48295983611645665, + "grad_norm": 0.29059046211930173, + "learning_rate": 9.887150184110511e-05, + "loss": 3.2575, + "step": 7780 + }, + { + "epoch": 0.4830219132162145, + "grad_norm": 0.2731659221763156, + "learning_rate": 9.887073874528099e-05, + "loss": 3.2982, + "step": 7781 + }, + { + "epoch": 0.48308399031597243, + "grad_norm": 0.21620225552957983, + "learning_rate": 9.886997539448634e-05, + "loss": 3.3938, + "step": 7782 + }, + { + "epoch": 0.48314606741573035, + "grad_norm": 0.2053065980065853, + "learning_rate": 9.886921178872517e-05, + "loss": 3.3305, + "step": 7783 + }, + { + "epoch": 0.4832081445154882, + "grad_norm": 0.17353484776801673, + "learning_rate": 9.886844792800146e-05, + "loss": 3.3366, + "step": 7784 + }, + { + "epoch": 0.48327022161524613, + "grad_norm": 0.18433145679703758, + "learning_rate": 9.886768381231919e-05, + "loss": 3.3101, + "step": 7785 + }, + { + "epoch": 0.48333229871500405, + "grad_norm": 0.28167967294162094, + "learning_rate": 9.886691944168236e-05, + "loss": 3.3738, + "step": 7786 + }, + { + "epoch": 0.4833943758147619, + "grad_norm": 0.33182706725606737, + "learning_rate": 9.886615481609493e-05, + "loss": 3.3799, + "step": 7787 + }, + { + "epoch": 0.48345645291451983, + "grad_norm": 0.2217529948687098, + "learning_rate": 9.886538993556093e-05, + "loss": 3.4096, + "step": 7788 + }, + { + "epoch": 0.48351853001427775, + "grad_norm": 0.25229538728683243, + "learning_rate": 9.886462480008432e-05, + "loss": 3.4138, + "step": 7789 + }, + { + "epoch": 0.4835806071140356, + "grad_norm": 0.25239840054488877, + "learning_rate": 9.886385940966909e-05, + "loss": 3.3506, + "step": 7790 + }, + { + "epoch": 0.48364268421379353, + "grad_norm": 0.26870257380551504, + "learning_rate": 9.886309376431926e-05, + "loss": 3.2869, + "step": 7791 + }, + { + "epoch": 0.48370476131355145, + "grad_norm": 0.22459953120678225, + "learning_rate": 9.886232786403879e-05, + "loss": 3.2904, + "step": 7792 + }, + { + "epoch": 0.4837668384133093, + "grad_norm": 0.2533991224157075, + "learning_rate": 9.886156170883171e-05, + "loss": 3.4255, + "step": 7793 + }, + { + "epoch": 0.4838289155130672, + "grad_norm": 0.20654057891400998, + "learning_rate": 9.886079529870199e-05, + "loss": 3.3139, + "step": 7794 + }, + { + "epoch": 0.48389099261282514, + "grad_norm": 0.2772559308990529, + "learning_rate": 9.886002863365365e-05, + "loss": 3.3491, + "step": 7795 + }, + { + "epoch": 0.483953069712583, + "grad_norm": 0.23670075729024959, + "learning_rate": 9.885926171369066e-05, + "loss": 3.2993, + "step": 7796 + }, + { + "epoch": 0.4840151468123409, + "grad_norm": 0.20617626093007863, + "learning_rate": 9.885849453881706e-05, + "loss": 3.3832, + "step": 7797 + }, + { + "epoch": 0.48407722391209884, + "grad_norm": 0.2585991936655226, + "learning_rate": 9.885772710903682e-05, + "loss": 3.4031, + "step": 7798 + }, + { + "epoch": 0.4841393010118567, + "grad_norm": 0.2089019862548038, + "learning_rate": 9.885695942435397e-05, + "loss": 3.2989, + "step": 7799 + }, + { + "epoch": 0.4842013781116146, + "grad_norm": 0.39156617268684085, + "learning_rate": 9.88561914847725e-05, + "loss": 3.35, + "step": 7800 + }, + { + "epoch": 0.48426345521137254, + "grad_norm": 0.30017208984932836, + "learning_rate": 9.885542329029642e-05, + "loss": 3.3161, + "step": 7801 + }, + { + "epoch": 0.4843255323111304, + "grad_norm": 0.29689374663743223, + "learning_rate": 9.885465484092973e-05, + "loss": 3.3638, + "step": 7802 + }, + { + "epoch": 0.4843876094108883, + "grad_norm": 0.3054197633104042, + "learning_rate": 9.885388613667646e-05, + "loss": 3.338, + "step": 7803 + }, + { + "epoch": 0.48444968651064624, + "grad_norm": 0.34221471042528784, + "learning_rate": 9.885311717754058e-05, + "loss": 3.2303, + "step": 7804 + }, + { + "epoch": 0.4845117636104041, + "grad_norm": 0.29060921677421486, + "learning_rate": 9.885234796352614e-05, + "loss": 3.2401, + "step": 7805 + }, + { + "epoch": 0.484573840710162, + "grad_norm": 0.3229132070150961, + "learning_rate": 9.885157849463716e-05, + "loss": 3.3991, + "step": 7806 + }, + { + "epoch": 0.48463591780991994, + "grad_norm": 0.25910960085629586, + "learning_rate": 9.88508087708776e-05, + "loss": 3.3059, + "step": 7807 + }, + { + "epoch": 0.4846979949096778, + "grad_norm": 0.3363647959319251, + "learning_rate": 9.885003879225155e-05, + "loss": 3.4188, + "step": 7808 + }, + { + "epoch": 0.4847600720094357, + "grad_norm": 0.2906746755209871, + "learning_rate": 9.884926855876296e-05, + "loss": 3.3443, + "step": 7809 + }, + { + "epoch": 0.48482214910919363, + "grad_norm": 0.2590803989627865, + "learning_rate": 9.884849807041588e-05, + "loss": 3.3239, + "step": 7810 + }, + { + "epoch": 0.4848842262089515, + "grad_norm": 0.29295211744778654, + "learning_rate": 9.884772732721435e-05, + "loss": 3.3538, + "step": 7811 + }, + { + "epoch": 0.4849463033087094, + "grad_norm": 0.29148854481974296, + "learning_rate": 9.884695632916234e-05, + "loss": 3.3419, + "step": 7812 + }, + { + "epoch": 0.48500838040846733, + "grad_norm": 0.2099841792527734, + "learning_rate": 9.884618507626391e-05, + "loss": 3.3496, + "step": 7813 + }, + { + "epoch": 0.4850704575082252, + "grad_norm": 0.29405147118313096, + "learning_rate": 9.884541356852308e-05, + "loss": 3.2982, + "step": 7814 + }, + { + "epoch": 0.4851325346079831, + "grad_norm": 0.29316747918828717, + "learning_rate": 9.884464180594387e-05, + "loss": 3.396, + "step": 7815 + }, + { + "epoch": 0.48519461170774103, + "grad_norm": 0.5685603779357646, + "learning_rate": 9.88438697885303e-05, + "loss": 3.3313, + "step": 7816 + }, + { + "epoch": 0.4852566888074989, + "grad_norm": 0.41614329926535254, + "learning_rate": 9.884309751628642e-05, + "loss": 3.354, + "step": 7817 + }, + { + "epoch": 0.4853187659072568, + "grad_norm": 0.34228829070872235, + "learning_rate": 9.884232498921622e-05, + "loss": 3.3392, + "step": 7818 + }, + { + "epoch": 0.48538084300701473, + "grad_norm": 0.3876717125462298, + "learning_rate": 9.884155220732376e-05, + "loss": 3.3375, + "step": 7819 + }, + { + "epoch": 0.4854429201067726, + "grad_norm": 0.3608028408378529, + "learning_rate": 9.884077917061308e-05, + "loss": 3.3108, + "step": 7820 + }, + { + "epoch": 0.4855049972065305, + "grad_norm": 0.32347725333674515, + "learning_rate": 9.884000587908819e-05, + "loss": 3.2613, + "step": 7821 + }, + { + "epoch": 0.4855670743062884, + "grad_norm": 0.2757059686156864, + "learning_rate": 9.883923233275313e-05, + "loss": 3.3569, + "step": 7822 + }, + { + "epoch": 0.4856291514060463, + "grad_norm": 0.3112964819857937, + "learning_rate": 9.883845853161195e-05, + "loss": 3.2454, + "step": 7823 + }, + { + "epoch": 0.4856912285058042, + "grad_norm": 0.2405188744463699, + "learning_rate": 9.883768447566865e-05, + "loss": 3.3608, + "step": 7824 + }, + { + "epoch": 0.4857533056055621, + "grad_norm": 0.3689103392164736, + "learning_rate": 9.883691016492733e-05, + "loss": 3.3347, + "step": 7825 + }, + { + "epoch": 0.48581538270532, + "grad_norm": 0.24119697561281586, + "learning_rate": 9.883613559939197e-05, + "loss": 3.349, + "step": 7826 + }, + { + "epoch": 0.4858774598050779, + "grad_norm": 0.2515848854719814, + "learning_rate": 9.883536077906663e-05, + "loss": 3.2574, + "step": 7827 + }, + { + "epoch": 0.4859395369048358, + "grad_norm": 0.24497263472131803, + "learning_rate": 9.883458570395537e-05, + "loss": 3.3094, + "step": 7828 + }, + { + "epoch": 0.4860016140045937, + "grad_norm": 0.2979271214435806, + "learning_rate": 9.883381037406223e-05, + "loss": 3.3475, + "step": 7829 + }, + { + "epoch": 0.4860636911043516, + "grad_norm": 0.26469881204710344, + "learning_rate": 9.883303478939124e-05, + "loss": 3.3923, + "step": 7830 + }, + { + "epoch": 0.4861257682041095, + "grad_norm": 0.300410500453005, + "learning_rate": 9.883225894994644e-05, + "loss": 3.4293, + "step": 7831 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 0.22745373153212575, + "learning_rate": 9.88314828557319e-05, + "loss": 3.3577, + "step": 7832 + }, + { + "epoch": 0.4862499224036253, + "grad_norm": 0.248837839413047, + "learning_rate": 9.883070650675166e-05, + "loss": 3.3836, + "step": 7833 + }, + { + "epoch": 0.4863119995033832, + "grad_norm": 0.29865333331939986, + "learning_rate": 9.882992990300976e-05, + "loss": 3.207, + "step": 7834 + }, + { + "epoch": 0.4863740766031411, + "grad_norm": 0.2379928469464526, + "learning_rate": 9.882915304451028e-05, + "loss": 3.341, + "step": 7835 + }, + { + "epoch": 0.486436153702899, + "grad_norm": 0.2002204879189947, + "learning_rate": 9.882837593125724e-05, + "loss": 3.2572, + "step": 7836 + }, + { + "epoch": 0.4864982308026569, + "grad_norm": 0.33075526152063894, + "learning_rate": 9.882759856325471e-05, + "loss": 3.305, + "step": 7837 + }, + { + "epoch": 0.4865603079024148, + "grad_norm": 0.27171004816108996, + "learning_rate": 9.882682094050674e-05, + "loss": 3.3423, + "step": 7838 + }, + { + "epoch": 0.4866223850021727, + "grad_norm": 0.25978921481986617, + "learning_rate": 9.882604306301742e-05, + "loss": 3.437, + "step": 7839 + }, + { + "epoch": 0.4866844621019306, + "grad_norm": 0.28320055993332577, + "learning_rate": 9.882526493079076e-05, + "loss": 3.3352, + "step": 7840 + }, + { + "epoch": 0.4867465392016885, + "grad_norm": 0.3256152651827147, + "learning_rate": 9.882448654383085e-05, + "loss": 3.306, + "step": 7841 + }, + { + "epoch": 0.4868086163014464, + "grad_norm": 0.27336412535249677, + "learning_rate": 9.882370790214173e-05, + "loss": 3.28, + "step": 7842 + }, + { + "epoch": 0.4868706934012043, + "grad_norm": 0.26670683015670826, + "learning_rate": 9.882292900572747e-05, + "loss": 3.3106, + "step": 7843 + }, + { + "epoch": 0.4869327705009622, + "grad_norm": 0.21916601098588642, + "learning_rate": 9.882214985459215e-05, + "loss": 3.3947, + "step": 7844 + }, + { + "epoch": 0.4869948476007201, + "grad_norm": 0.3328279361816547, + "learning_rate": 9.882137044873982e-05, + "loss": 3.1922, + "step": 7845 + }, + { + "epoch": 0.487056924700478, + "grad_norm": 0.31734526192001133, + "learning_rate": 9.882059078817455e-05, + "loss": 3.2904, + "step": 7846 + }, + { + "epoch": 0.4871190018002359, + "grad_norm": 0.23898331602322714, + "learning_rate": 9.88198108729004e-05, + "loss": 3.2937, + "step": 7847 + }, + { + "epoch": 0.4871810788999938, + "grad_norm": 0.23289472100956557, + "learning_rate": 9.881903070292145e-05, + "loss": 3.347, + "step": 7848 + }, + { + "epoch": 0.4872431559997517, + "grad_norm": 0.3424954713744774, + "learning_rate": 9.881825027824175e-05, + "loss": 3.3666, + "step": 7849 + }, + { + "epoch": 0.48730523309950957, + "grad_norm": 0.23028852398788674, + "learning_rate": 9.88174695988654e-05, + "loss": 3.3509, + "step": 7850 + }, + { + "epoch": 0.4873673101992675, + "grad_norm": 0.23434792998070933, + "learning_rate": 9.881668866479645e-05, + "loss": 3.2775, + "step": 7851 + }, + { + "epoch": 0.4874293872990254, + "grad_norm": 0.22163318364718304, + "learning_rate": 9.881590747603901e-05, + "loss": 3.2586, + "step": 7852 + }, + { + "epoch": 0.48749146439878327, + "grad_norm": 0.2651635397030974, + "learning_rate": 9.88151260325971e-05, + "loss": 3.4174, + "step": 7853 + }, + { + "epoch": 0.4875535414985412, + "grad_norm": 0.21967863619844602, + "learning_rate": 9.881434433447485e-05, + "loss": 3.4312, + "step": 7854 + }, + { + "epoch": 0.4876156185982991, + "grad_norm": 0.2276490563674496, + "learning_rate": 9.881356238167632e-05, + "loss": 3.379, + "step": 7855 + }, + { + "epoch": 0.48767769569805697, + "grad_norm": 0.2745453462797502, + "learning_rate": 9.881278017420557e-05, + "loss": 3.4012, + "step": 7856 + }, + { + "epoch": 0.4877397727978149, + "grad_norm": 0.2606530054857986, + "learning_rate": 9.881199771206669e-05, + "loss": 3.2671, + "step": 7857 + }, + { + "epoch": 0.4878018498975728, + "grad_norm": 0.434674042740916, + "learning_rate": 9.881121499526378e-05, + "loss": 3.3741, + "step": 7858 + }, + { + "epoch": 0.48786392699733067, + "grad_norm": 0.29830455415230844, + "learning_rate": 9.88104320238009e-05, + "loss": 3.3753, + "step": 7859 + }, + { + "epoch": 0.4879260040970886, + "grad_norm": 0.4185084486077806, + "learning_rate": 9.880964879768216e-05, + "loss": 3.3949, + "step": 7860 + }, + { + "epoch": 0.4879880811968465, + "grad_norm": 0.26933653335560703, + "learning_rate": 9.880886531691162e-05, + "loss": 3.3195, + "step": 7861 + }, + { + "epoch": 0.48805015829660436, + "grad_norm": 0.43065511819629715, + "learning_rate": 9.88080815814934e-05, + "loss": 3.3637, + "step": 7862 + }, + { + "epoch": 0.4881122353963623, + "grad_norm": 0.2756525647183331, + "learning_rate": 9.880729759143156e-05, + "loss": 3.3192, + "step": 7863 + }, + { + "epoch": 0.4881743124961202, + "grad_norm": 0.42407381170558694, + "learning_rate": 9.88065133467302e-05, + "loss": 3.4066, + "step": 7864 + }, + { + "epoch": 0.48823638959587806, + "grad_norm": 0.416570266857271, + "learning_rate": 9.880572884739341e-05, + "loss": 3.2918, + "step": 7865 + }, + { + "epoch": 0.488298466695636, + "grad_norm": 0.2969035837567472, + "learning_rate": 9.880494409342529e-05, + "loss": 3.2418, + "step": 7866 + }, + { + "epoch": 0.4883605437953939, + "grad_norm": 0.2932409674854239, + "learning_rate": 9.880415908482992e-05, + "loss": 3.3838, + "step": 7867 + }, + { + "epoch": 0.48842262089515176, + "grad_norm": 0.26189177757543497, + "learning_rate": 9.880337382161142e-05, + "loss": 3.3412, + "step": 7868 + }, + { + "epoch": 0.4884846979949097, + "grad_norm": 0.29715177533885323, + "learning_rate": 9.880258830377385e-05, + "loss": 3.3639, + "step": 7869 + }, + { + "epoch": 0.4885467750946676, + "grad_norm": 0.30844079826547144, + "learning_rate": 9.880180253132134e-05, + "loss": 3.3441, + "step": 7870 + }, + { + "epoch": 0.48860885219442546, + "grad_norm": 0.25258334391919146, + "learning_rate": 9.8801016504258e-05, + "loss": 3.4946, + "step": 7871 + }, + { + "epoch": 0.4886709292941834, + "grad_norm": 0.27675582465061616, + "learning_rate": 9.880023022258788e-05, + "loss": 3.3454, + "step": 7872 + }, + { + "epoch": 0.4887330063939413, + "grad_norm": 0.2652906389070253, + "learning_rate": 9.879944368631512e-05, + "loss": 3.3579, + "step": 7873 + }, + { + "epoch": 0.48879508349369916, + "grad_norm": 0.31970181051835783, + "learning_rate": 9.879865689544381e-05, + "loss": 3.3182, + "step": 7874 + }, + { + "epoch": 0.4888571605934571, + "grad_norm": 0.255965465652656, + "learning_rate": 9.879786984997808e-05, + "loss": 3.4639, + "step": 7875 + }, + { + "epoch": 0.488919237693215, + "grad_norm": 0.28458510643828827, + "learning_rate": 9.8797082549922e-05, + "loss": 3.3263, + "step": 7876 + }, + { + "epoch": 0.48898131479297285, + "grad_norm": 0.278811252668165, + "learning_rate": 9.879629499527969e-05, + "loss": 3.3222, + "step": 7877 + }, + { + "epoch": 0.48904339189273077, + "grad_norm": 0.35811564335075824, + "learning_rate": 9.879550718605529e-05, + "loss": 3.1983, + "step": 7878 + }, + { + "epoch": 0.4891054689924887, + "grad_norm": 0.26353660221787384, + "learning_rate": 9.879471912225286e-05, + "loss": 3.3864, + "step": 7879 + }, + { + "epoch": 0.48916754609224655, + "grad_norm": 0.2855855505931155, + "learning_rate": 9.879393080387654e-05, + "loss": 3.3713, + "step": 7880 + }, + { + "epoch": 0.48922962319200447, + "grad_norm": 0.2933417171441563, + "learning_rate": 9.879314223093043e-05, + "loss": 3.3381, + "step": 7881 + }, + { + "epoch": 0.4892917002917624, + "grad_norm": 0.30397811863602464, + "learning_rate": 9.879235340341867e-05, + "loss": 3.323, + "step": 7882 + }, + { + "epoch": 0.48935377739152025, + "grad_norm": 0.30426900194882195, + "learning_rate": 9.879156432134536e-05, + "loss": 3.3378, + "step": 7883 + }, + { + "epoch": 0.48941585449127817, + "grad_norm": 0.27481301246944045, + "learning_rate": 9.87907749847146e-05, + "loss": 3.2706, + "step": 7884 + }, + { + "epoch": 0.4894779315910361, + "grad_norm": 0.27082040951533265, + "learning_rate": 9.878998539353053e-05, + "loss": 3.2845, + "step": 7885 + }, + { + "epoch": 0.48954000869079395, + "grad_norm": 0.34699587305429136, + "learning_rate": 9.878919554779724e-05, + "loss": 3.3834, + "step": 7886 + }, + { + "epoch": 0.48960208579055187, + "grad_norm": 0.3034970551533308, + "learning_rate": 9.87884054475189e-05, + "loss": 3.3971, + "step": 7887 + }, + { + "epoch": 0.4896641628903098, + "grad_norm": 0.3172293306588572, + "learning_rate": 9.87876150926996e-05, + "loss": 3.4016, + "step": 7888 + }, + { + "epoch": 0.48972623999006765, + "grad_norm": 0.2598630966076917, + "learning_rate": 9.878682448334347e-05, + "loss": 3.207, + "step": 7889 + }, + { + "epoch": 0.48978831708982556, + "grad_norm": 0.33924238118251665, + "learning_rate": 9.878603361945462e-05, + "loss": 3.3103, + "step": 7890 + }, + { + "epoch": 0.4898503941895835, + "grad_norm": 0.3223584359355808, + "learning_rate": 9.878524250103719e-05, + "loss": 3.239, + "step": 7891 + }, + { + "epoch": 0.48991247128934134, + "grad_norm": 0.28090796600761464, + "learning_rate": 9.878445112809531e-05, + "loss": 3.2736, + "step": 7892 + }, + { + "epoch": 0.48997454838909926, + "grad_norm": 0.3234555672469107, + "learning_rate": 9.87836595006331e-05, + "loss": 3.3561, + "step": 7893 + }, + { + "epoch": 0.4900366254888572, + "grad_norm": 0.35821097977906663, + "learning_rate": 9.878286761865471e-05, + "loss": 3.3153, + "step": 7894 + }, + { + "epoch": 0.49009870258861504, + "grad_norm": 0.2558726969737819, + "learning_rate": 9.878207548216426e-05, + "loss": 3.3141, + "step": 7895 + }, + { + "epoch": 0.49016077968837296, + "grad_norm": 0.28277933706091196, + "learning_rate": 9.878128309116588e-05, + "loss": 3.318, + "step": 7896 + }, + { + "epoch": 0.4902228567881309, + "grad_norm": 0.2572989840503017, + "learning_rate": 9.878049044566368e-05, + "loss": 3.3678, + "step": 7897 + }, + { + "epoch": 0.49028493388788874, + "grad_norm": 0.2703068120721822, + "learning_rate": 9.877969754566185e-05, + "loss": 3.3449, + "step": 7898 + }, + { + "epoch": 0.49034701098764666, + "grad_norm": 0.277959703766762, + "learning_rate": 9.877890439116448e-05, + "loss": 3.3923, + "step": 7899 + }, + { + "epoch": 0.4904090880874046, + "grad_norm": 0.2895933251864058, + "learning_rate": 9.877811098217573e-05, + "loss": 3.3679, + "step": 7900 + }, + { + "epoch": 0.49047116518716244, + "grad_norm": 0.2814438072656728, + "learning_rate": 9.877731731869972e-05, + "loss": 3.3371, + "step": 7901 + }, + { + "epoch": 0.49053324228692036, + "grad_norm": 0.48582538218995786, + "learning_rate": 9.877652340074061e-05, + "loss": 3.2456, + "step": 7902 + }, + { + "epoch": 0.4905953193866783, + "grad_norm": 0.25118930868867745, + "learning_rate": 9.877572922830256e-05, + "loss": 3.3442, + "step": 7903 + }, + { + "epoch": 0.49065739648643614, + "grad_norm": 0.28052441727258415, + "learning_rate": 9.877493480138966e-05, + "loss": 3.3217, + "step": 7904 + }, + { + "epoch": 0.49071947358619405, + "grad_norm": 0.24935228662618464, + "learning_rate": 9.877414012000609e-05, + "loss": 3.1773, + "step": 7905 + }, + { + "epoch": 0.490781550685952, + "grad_norm": 0.27625260151203446, + "learning_rate": 9.8773345184156e-05, + "loss": 3.3587, + "step": 7906 + }, + { + "epoch": 0.49084362778570984, + "grad_norm": 0.22049832758680152, + "learning_rate": 9.877254999384353e-05, + "loss": 3.2377, + "step": 7907 + }, + { + "epoch": 0.49090570488546775, + "grad_norm": 0.40540002890997484, + "learning_rate": 9.877175454907282e-05, + "loss": 3.2765, + "step": 7908 + }, + { + "epoch": 0.49096778198522567, + "grad_norm": 0.29016662772195007, + "learning_rate": 9.877095884984802e-05, + "loss": 3.3931, + "step": 7909 + }, + { + "epoch": 0.49102985908498353, + "grad_norm": 0.36925020559327143, + "learning_rate": 9.877016289617331e-05, + "loss": 3.3479, + "step": 7910 + }, + { + "epoch": 0.49109193618474145, + "grad_norm": 0.2884881396285351, + "learning_rate": 9.87693666880528e-05, + "loss": 3.3422, + "step": 7911 + }, + { + "epoch": 0.49115401328449937, + "grad_norm": 0.2673607381732756, + "learning_rate": 9.876857022549066e-05, + "loss": 3.4601, + "step": 7912 + }, + { + "epoch": 0.49121609038425723, + "grad_norm": 0.26552773546117725, + "learning_rate": 9.876777350849106e-05, + "loss": 3.4373, + "step": 7913 + }, + { + "epoch": 0.49127816748401515, + "grad_norm": 0.22868516941347775, + "learning_rate": 9.876697653705815e-05, + "loss": 3.357, + "step": 7914 + }, + { + "epoch": 0.49134024458377307, + "grad_norm": 0.3235614387114585, + "learning_rate": 9.87661793111961e-05, + "loss": 3.3542, + "step": 7915 + }, + { + "epoch": 0.49140232168353093, + "grad_norm": 0.22295565301380366, + "learning_rate": 9.876538183090902e-05, + "loss": 3.3317, + "step": 7916 + }, + { + "epoch": 0.49146439878328885, + "grad_norm": 0.24686261067508444, + "learning_rate": 9.876458409620113e-05, + "loss": 3.394, + "step": 7917 + }, + { + "epoch": 0.49152647588304677, + "grad_norm": 0.24497465723932282, + "learning_rate": 9.876378610707656e-05, + "loss": 3.3421, + "step": 7918 + }, + { + "epoch": 0.4915885529828046, + "grad_norm": 0.2477618630063434, + "learning_rate": 9.876298786353948e-05, + "loss": 3.2743, + "step": 7919 + }, + { + "epoch": 0.49165063008256255, + "grad_norm": 0.22189335881425368, + "learning_rate": 9.876218936559405e-05, + "loss": 3.3594, + "step": 7920 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.26747781028693834, + "learning_rate": 9.876139061324445e-05, + "loss": 3.3947, + "step": 7921 + }, + { + "epoch": 0.4917747842820783, + "grad_norm": 0.25849042251412774, + "learning_rate": 9.876059160649483e-05, + "loss": 3.3783, + "step": 7922 + }, + { + "epoch": 0.49183686138183624, + "grad_norm": 0.20725206079827416, + "learning_rate": 9.875979234534936e-05, + "loss": 3.3333, + "step": 7923 + }, + { + "epoch": 0.49189893848159416, + "grad_norm": 0.29115998444716595, + "learning_rate": 9.875899282981222e-05, + "loss": 3.2946, + "step": 7924 + }, + { + "epoch": 0.491961015581352, + "grad_norm": 0.2159180829665953, + "learning_rate": 9.87581930598876e-05, + "loss": 3.3117, + "step": 7925 + }, + { + "epoch": 0.49202309268110994, + "grad_norm": 0.21463822157397885, + "learning_rate": 9.875739303557963e-05, + "loss": 3.2938, + "step": 7926 + }, + { + "epoch": 0.49208516978086786, + "grad_norm": 0.21549046023223253, + "learning_rate": 9.875659275689251e-05, + "loss": 3.2989, + "step": 7927 + }, + { + "epoch": 0.4921472468806257, + "grad_norm": 0.17980910415744375, + "learning_rate": 9.87557922238304e-05, + "loss": 3.3071, + "step": 7928 + }, + { + "epoch": 0.49220932398038364, + "grad_norm": 0.24323266375367275, + "learning_rate": 9.87549914363975e-05, + "loss": 3.2118, + "step": 7929 + }, + { + "epoch": 0.49227140108014156, + "grad_norm": 0.2516869906298833, + "learning_rate": 9.875419039459797e-05, + "loss": 3.349, + "step": 7930 + }, + { + "epoch": 0.4923334781798994, + "grad_norm": 0.35253490963336426, + "learning_rate": 9.875338909843599e-05, + "loss": 3.2368, + "step": 7931 + }, + { + "epoch": 0.49239555527965734, + "grad_norm": 0.2516935771144997, + "learning_rate": 9.875258754791575e-05, + "loss": 3.3963, + "step": 7932 + }, + { + "epoch": 0.49245763237941526, + "grad_norm": 0.2541110459946949, + "learning_rate": 9.875178574304141e-05, + "loss": 3.3613, + "step": 7933 + }, + { + "epoch": 0.4925197094791731, + "grad_norm": 0.2545840825700947, + "learning_rate": 9.875098368381717e-05, + "loss": 3.3559, + "step": 7934 + }, + { + "epoch": 0.49258178657893104, + "grad_norm": 0.26504451771394966, + "learning_rate": 9.875018137024723e-05, + "loss": 3.3902, + "step": 7935 + }, + { + "epoch": 0.49264386367868895, + "grad_norm": 0.4302068692598839, + "learning_rate": 9.874937880233574e-05, + "loss": 3.1575, + "step": 7936 + }, + { + "epoch": 0.4927059407784468, + "grad_norm": 0.31135669419220025, + "learning_rate": 9.874857598008693e-05, + "loss": 3.3909, + "step": 7937 + }, + { + "epoch": 0.49276801787820473, + "grad_norm": 0.3519039191631324, + "learning_rate": 9.874777290350495e-05, + "loss": 3.3253, + "step": 7938 + }, + { + "epoch": 0.49283009497796265, + "grad_norm": 0.23436043217207747, + "learning_rate": 9.874696957259401e-05, + "loss": 3.1854, + "step": 7939 + }, + { + "epoch": 0.4928921720777205, + "grad_norm": 0.2472965652006897, + "learning_rate": 9.874616598735829e-05, + "loss": 3.3624, + "step": 7940 + }, + { + "epoch": 0.49295424917747843, + "grad_norm": 0.31502993027091175, + "learning_rate": 9.874536214780199e-05, + "loss": 3.3555, + "step": 7941 + }, + { + "epoch": 0.49301632627723635, + "grad_norm": 0.28313279429605903, + "learning_rate": 9.874455805392931e-05, + "loss": 3.3274, + "step": 7942 + }, + { + "epoch": 0.4930784033769942, + "grad_norm": 0.37620085924457053, + "learning_rate": 9.874375370574442e-05, + "loss": 3.3482, + "step": 7943 + }, + { + "epoch": 0.49314048047675213, + "grad_norm": 0.448797692046504, + "learning_rate": 9.874294910325154e-05, + "loss": 3.2763, + "step": 7944 + }, + { + "epoch": 0.49320255757651005, + "grad_norm": 0.3193074484899878, + "learning_rate": 9.874214424645488e-05, + "loss": 3.3814, + "step": 7945 + }, + { + "epoch": 0.4932646346762679, + "grad_norm": 0.25882644383339026, + "learning_rate": 9.874133913535861e-05, + "loss": 3.2756, + "step": 7946 + }, + { + "epoch": 0.49332671177602583, + "grad_norm": 0.2443284266085104, + "learning_rate": 9.874053376996694e-05, + "loss": 3.2901, + "step": 7947 + }, + { + "epoch": 0.49338878887578375, + "grad_norm": 0.2243181662849594, + "learning_rate": 9.873972815028408e-05, + "loss": 3.3488, + "step": 7948 + }, + { + "epoch": 0.4934508659755416, + "grad_norm": 0.35167143511514776, + "learning_rate": 9.87389222763142e-05, + "loss": 3.1933, + "step": 7949 + }, + { + "epoch": 0.4935129430752995, + "grad_norm": 0.28569082553374253, + "learning_rate": 9.873811614806156e-05, + "loss": 3.4104, + "step": 7950 + }, + { + "epoch": 0.49357502017505744, + "grad_norm": 0.3685772143475647, + "learning_rate": 9.873730976553032e-05, + "loss": 3.18, + "step": 7951 + }, + { + "epoch": 0.4936370972748153, + "grad_norm": 0.31886497088354315, + "learning_rate": 9.873650312872472e-05, + "loss": 3.3509, + "step": 7952 + }, + { + "epoch": 0.4936991743745732, + "grad_norm": 0.28392919662866023, + "learning_rate": 9.873569623764894e-05, + "loss": 3.1978, + "step": 7953 + }, + { + "epoch": 0.49376125147433114, + "grad_norm": 0.2674291432646968, + "learning_rate": 9.87348890923072e-05, + "loss": 3.291, + "step": 7954 + }, + { + "epoch": 0.493823328574089, + "grad_norm": 0.25698724793271904, + "learning_rate": 9.873408169270372e-05, + "loss": 3.2692, + "step": 7955 + }, + { + "epoch": 0.4938854056738469, + "grad_norm": 0.25772344611736775, + "learning_rate": 9.873327403884269e-05, + "loss": 3.335, + "step": 7956 + }, + { + "epoch": 0.49394748277360484, + "grad_norm": 0.3060892335131089, + "learning_rate": 9.873246613072837e-05, + "loss": 3.2745, + "step": 7957 + }, + { + "epoch": 0.4940095598733627, + "grad_norm": 0.2684722496428928, + "learning_rate": 9.873165796836492e-05, + "loss": 3.318, + "step": 7958 + }, + { + "epoch": 0.4940716369731206, + "grad_norm": 0.3670607611889269, + "learning_rate": 9.873084955175658e-05, + "loss": 3.253, + "step": 7959 + }, + { + "epoch": 0.49413371407287854, + "grad_norm": 0.25282503415264695, + "learning_rate": 9.873004088090758e-05, + "loss": 3.4099, + "step": 7960 + }, + { + "epoch": 0.4941957911726364, + "grad_norm": 0.5117600984830226, + "learning_rate": 9.872923195582211e-05, + "loss": 3.3368, + "step": 7961 + }, + { + "epoch": 0.4942578682723943, + "grad_norm": 0.31595032421613994, + "learning_rate": 9.872842277650443e-05, + "loss": 3.4032, + "step": 7962 + }, + { + "epoch": 0.49431994537215224, + "grad_norm": 0.27240268535760803, + "learning_rate": 9.872761334295873e-05, + "loss": 3.3782, + "step": 7963 + }, + { + "epoch": 0.4943820224719101, + "grad_norm": 0.46409623765166336, + "learning_rate": 9.872680365518922e-05, + "loss": 3.2539, + "step": 7964 + }, + { + "epoch": 0.494444099571668, + "grad_norm": 0.2844873971259483, + "learning_rate": 9.872599371320017e-05, + "loss": 3.3608, + "step": 7965 + }, + { + "epoch": 0.49450617667142593, + "grad_norm": 0.3950346577203666, + "learning_rate": 9.872518351699578e-05, + "loss": 3.2792, + "step": 7966 + }, + { + "epoch": 0.4945682537711838, + "grad_norm": 0.2778801097065451, + "learning_rate": 9.872437306658027e-05, + "loss": 3.3523, + "step": 7967 + }, + { + "epoch": 0.4946303308709417, + "grad_norm": 0.3157584068949088, + "learning_rate": 9.872356236195789e-05, + "loss": 3.293, + "step": 7968 + }, + { + "epoch": 0.49469240797069963, + "grad_norm": 0.26729232951285475, + "learning_rate": 9.872275140313285e-05, + "loss": 3.2179, + "step": 7969 + }, + { + "epoch": 0.4947544850704575, + "grad_norm": 0.255722548029466, + "learning_rate": 9.872194019010938e-05, + "loss": 3.3516, + "step": 7970 + }, + { + "epoch": 0.4948165621702154, + "grad_norm": 0.25246048639379065, + "learning_rate": 9.872112872289174e-05, + "loss": 3.3556, + "step": 7971 + }, + { + "epoch": 0.49487863926997333, + "grad_norm": 0.23188888118816414, + "learning_rate": 9.872031700148413e-05, + "loss": 3.39, + "step": 7972 + }, + { + "epoch": 0.4949407163697312, + "grad_norm": 0.2905140208547437, + "learning_rate": 9.87195050258908e-05, + "loss": 3.2562, + "step": 7973 + }, + { + "epoch": 0.4950027934694891, + "grad_norm": 0.21793616847787572, + "learning_rate": 9.871869279611598e-05, + "loss": 3.2897, + "step": 7974 + }, + { + "epoch": 0.49506487056924703, + "grad_norm": 0.2187316920929802, + "learning_rate": 9.871788031216394e-05, + "loss": 3.3608, + "step": 7975 + }, + { + "epoch": 0.4951269476690049, + "grad_norm": 0.3552338798860112, + "learning_rate": 9.871706757403886e-05, + "loss": 3.3116, + "step": 7976 + }, + { + "epoch": 0.4951890247687628, + "grad_norm": 0.22052001108464642, + "learning_rate": 9.871625458174503e-05, + "loss": 3.364, + "step": 7977 + }, + { + "epoch": 0.4952511018685207, + "grad_norm": 0.42374901099075535, + "learning_rate": 9.871544133528667e-05, + "loss": 3.389, + "step": 7978 + }, + { + "epoch": 0.4953131789682786, + "grad_norm": 0.38381870533856466, + "learning_rate": 9.871462783466803e-05, + "loss": 3.2549, + "step": 7979 + }, + { + "epoch": 0.4953752560680365, + "grad_norm": 0.5162438233862743, + "learning_rate": 9.871381407989333e-05, + "loss": 3.3244, + "step": 7980 + }, + { + "epoch": 0.4954373331677944, + "grad_norm": 0.3152017507960437, + "learning_rate": 9.871300007096686e-05, + "loss": 3.3503, + "step": 7981 + }, + { + "epoch": 0.4954994102675523, + "grad_norm": 0.2953916811880032, + "learning_rate": 9.871218580789283e-05, + "loss": 3.3086, + "step": 7982 + }, + { + "epoch": 0.4955614873673102, + "grad_norm": 0.35057755356648684, + "learning_rate": 9.871137129067552e-05, + "loss": 3.3661, + "step": 7983 + }, + { + "epoch": 0.4956235644670681, + "grad_norm": 0.27979488921753604, + "learning_rate": 9.871055651931914e-05, + "loss": 3.2224, + "step": 7984 + }, + { + "epoch": 0.495685641566826, + "grad_norm": 0.32017803858967303, + "learning_rate": 9.870974149382797e-05, + "loss": 3.4148, + "step": 7985 + }, + { + "epoch": 0.4957477186665839, + "grad_norm": 0.3756087250127278, + "learning_rate": 9.870892621420625e-05, + "loss": 3.2493, + "step": 7986 + }, + { + "epoch": 0.4958097957663418, + "grad_norm": 0.25308418710194275, + "learning_rate": 9.870811068045824e-05, + "loss": 3.3321, + "step": 7987 + }, + { + "epoch": 0.4958718728660997, + "grad_norm": 0.2598047298740507, + "learning_rate": 9.870729489258819e-05, + "loss": 3.3465, + "step": 7988 + }, + { + "epoch": 0.4959339499658576, + "grad_norm": 0.24982241194107976, + "learning_rate": 9.870647885060035e-05, + "loss": 3.2903, + "step": 7989 + }, + { + "epoch": 0.4959960270656155, + "grad_norm": 0.5027759205828953, + "learning_rate": 9.8705662554499e-05, + "loss": 3.3497, + "step": 7990 + }, + { + "epoch": 0.4960581041653734, + "grad_norm": 0.2466623081177145, + "learning_rate": 9.870484600428839e-05, + "loss": 3.2856, + "step": 7991 + }, + { + "epoch": 0.4961201812651313, + "grad_norm": 0.3090419050206927, + "learning_rate": 9.870402919997276e-05, + "loss": 3.3279, + "step": 7992 + }, + { + "epoch": 0.4961822583648892, + "grad_norm": 0.24901529934618813, + "learning_rate": 9.87032121415564e-05, + "loss": 3.4827, + "step": 7993 + }, + { + "epoch": 0.4962443354646471, + "grad_norm": 0.27589489375688536, + "learning_rate": 9.870239482904354e-05, + "loss": 3.3049, + "step": 7994 + }, + { + "epoch": 0.496306412564405, + "grad_norm": 0.35190138907598784, + "learning_rate": 9.870157726243847e-05, + "loss": 3.3216, + "step": 7995 + }, + { + "epoch": 0.4963684896641629, + "grad_norm": 0.2762426763098844, + "learning_rate": 9.870075944174547e-05, + "loss": 3.3311, + "step": 7996 + }, + { + "epoch": 0.4964305667639208, + "grad_norm": 0.3103296131705099, + "learning_rate": 9.869994136696876e-05, + "loss": 3.2816, + "step": 7997 + }, + { + "epoch": 0.4964926438636787, + "grad_norm": 0.29155602939617564, + "learning_rate": 9.869912303811264e-05, + "loss": 3.4379, + "step": 7998 + }, + { + "epoch": 0.4965547209634366, + "grad_norm": 0.3042882479814667, + "learning_rate": 9.869830445518139e-05, + "loss": 3.3326, + "step": 7999 + }, + { + "epoch": 0.4966167980631945, + "grad_norm": 0.33847255979571655, + "learning_rate": 9.869748561817925e-05, + "loss": 3.3233, + "step": 8000 + }, + { + "epoch": 0.4966788751629524, + "grad_norm": 0.23620694576755774, + "learning_rate": 9.869666652711049e-05, + "loss": 3.2617, + "step": 8001 + }, + { + "epoch": 0.4967409522627103, + "grad_norm": 0.231424886067694, + "learning_rate": 9.869584718197941e-05, + "loss": 3.2919, + "step": 8002 + }, + { + "epoch": 0.4968030293624682, + "grad_norm": 0.2585059578760532, + "learning_rate": 9.869502758279028e-05, + "loss": 3.2726, + "step": 8003 + }, + { + "epoch": 0.4968651064622261, + "grad_norm": 0.3429794453232798, + "learning_rate": 9.869420772954738e-05, + "loss": 3.2667, + "step": 8004 + }, + { + "epoch": 0.496927183561984, + "grad_norm": 0.2526840431058062, + "learning_rate": 9.869338762225496e-05, + "loss": 3.4331, + "step": 8005 + }, + { + "epoch": 0.49698926066174187, + "grad_norm": 0.19536549802800096, + "learning_rate": 9.869256726091731e-05, + "loss": 3.2796, + "step": 8006 + }, + { + "epoch": 0.4970513377614998, + "grad_norm": 0.21520371192692003, + "learning_rate": 9.869174664553874e-05, + "loss": 3.4601, + "step": 8007 + }, + { + "epoch": 0.4971134148612577, + "grad_norm": 0.1847843563916102, + "learning_rate": 9.869092577612349e-05, + "loss": 3.3624, + "step": 8008 + }, + { + "epoch": 0.49717549196101557, + "grad_norm": 0.25646612427191134, + "learning_rate": 9.869010465267588e-05, + "loss": 3.3239, + "step": 8009 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.184778530000834, + "learning_rate": 9.868928327520015e-05, + "loss": 3.3484, + "step": 8010 + }, + { + "epoch": 0.4972996461605314, + "grad_norm": 0.2564981944985451, + "learning_rate": 9.868846164370062e-05, + "loss": 3.3077, + "step": 8011 + }, + { + "epoch": 0.49736172326028927, + "grad_norm": 0.2929443730703121, + "learning_rate": 9.868763975818156e-05, + "loss": 3.3367, + "step": 8012 + }, + { + "epoch": 0.4974238003600472, + "grad_norm": 0.23556259001631216, + "learning_rate": 9.868681761864726e-05, + "loss": 3.2889, + "step": 8013 + }, + { + "epoch": 0.4974858774598051, + "grad_norm": 0.21432051666943092, + "learning_rate": 9.868599522510202e-05, + "loss": 3.3047, + "step": 8014 + }, + { + "epoch": 0.49754795455956297, + "grad_norm": 0.2535596829822756, + "learning_rate": 9.868517257755015e-05, + "loss": 3.2597, + "step": 8015 + }, + { + "epoch": 0.4976100316593209, + "grad_norm": 0.17337830882363256, + "learning_rate": 9.868434967599587e-05, + "loss": 3.3678, + "step": 8016 + }, + { + "epoch": 0.4976721087590788, + "grad_norm": 0.23354418844386604, + "learning_rate": 9.868352652044355e-05, + "loss": 3.2691, + "step": 8017 + }, + { + "epoch": 0.49773418585883666, + "grad_norm": 0.22321709886549318, + "learning_rate": 9.868270311089744e-05, + "loss": 3.296, + "step": 8018 + }, + { + "epoch": 0.4977962629585946, + "grad_norm": 0.3500420966547404, + "learning_rate": 9.868187944736186e-05, + "loss": 3.3659, + "step": 8019 + }, + { + "epoch": 0.4978583400583525, + "grad_norm": 0.2820824251742561, + "learning_rate": 9.868105552984109e-05, + "loss": 3.3391, + "step": 8020 + }, + { + "epoch": 0.49792041715811036, + "grad_norm": 0.22304648785131628, + "learning_rate": 9.868023135833944e-05, + "loss": 3.2659, + "step": 8021 + }, + { + "epoch": 0.4979824942578683, + "grad_norm": 0.178742171512918, + "learning_rate": 9.86794069328612e-05, + "loss": 3.2862, + "step": 8022 + }, + { + "epoch": 0.4980445713576262, + "grad_norm": 0.2629910757523328, + "learning_rate": 9.867858225341068e-05, + "loss": 3.305, + "step": 8023 + }, + { + "epoch": 0.49810664845738406, + "grad_norm": 0.21627369032899696, + "learning_rate": 9.867775731999218e-05, + "loss": 3.3495, + "step": 8024 + }, + { + "epoch": 0.498168725557142, + "grad_norm": 0.2608504909797393, + "learning_rate": 9.867693213261e-05, + "loss": 3.3359, + "step": 8025 + }, + { + "epoch": 0.4982308026568999, + "grad_norm": 0.19542376210183104, + "learning_rate": 9.867610669126845e-05, + "loss": 3.2689, + "step": 8026 + }, + { + "epoch": 0.49829287975665776, + "grad_norm": 0.22208378632668058, + "learning_rate": 9.867528099597182e-05, + "loss": 3.2534, + "step": 8027 + }, + { + "epoch": 0.4983549568564157, + "grad_norm": 0.20990052352063507, + "learning_rate": 9.867445504672446e-05, + "loss": 3.2319, + "step": 8028 + }, + { + "epoch": 0.4984170339561736, + "grad_norm": 0.19610354448455708, + "learning_rate": 9.867362884353064e-05, + "loss": 3.3566, + "step": 8029 + }, + { + "epoch": 0.49847911105593146, + "grad_norm": 0.1849203776245348, + "learning_rate": 9.867280238639468e-05, + "loss": 3.3149, + "step": 8030 + }, + { + "epoch": 0.4985411881556894, + "grad_norm": 0.24538071588999447, + "learning_rate": 9.86719756753209e-05, + "loss": 3.283, + "step": 8031 + }, + { + "epoch": 0.4986032652554473, + "grad_norm": 0.18863691910448319, + "learning_rate": 9.86711487103136e-05, + "loss": 3.2634, + "step": 8032 + }, + { + "epoch": 0.49866534235520515, + "grad_norm": 0.21946221450608494, + "learning_rate": 9.867032149137711e-05, + "loss": 3.2606, + "step": 8033 + }, + { + "epoch": 0.4987274194549631, + "grad_norm": 0.1903062273504513, + "learning_rate": 9.866949401851572e-05, + "loss": 3.3463, + "step": 8034 + }, + { + "epoch": 0.498789496554721, + "grad_norm": 0.26361962259848604, + "learning_rate": 9.866866629173379e-05, + "loss": 3.3808, + "step": 8035 + }, + { + "epoch": 0.49885157365447885, + "grad_norm": 0.2133332995946182, + "learning_rate": 9.86678383110356e-05, + "loss": 3.3171, + "step": 8036 + }, + { + "epoch": 0.49891365075423677, + "grad_norm": 0.2933339957524175, + "learning_rate": 9.866701007642548e-05, + "loss": 3.2508, + "step": 8037 + }, + { + "epoch": 0.4989757278539947, + "grad_norm": 0.21393659850088576, + "learning_rate": 9.866618158790776e-05, + "loss": 3.3438, + "step": 8038 + }, + { + "epoch": 0.49903780495375255, + "grad_norm": 0.3292265521674305, + "learning_rate": 9.866535284548676e-05, + "loss": 3.3503, + "step": 8039 + }, + { + "epoch": 0.49909988205351047, + "grad_norm": 0.21999301437373717, + "learning_rate": 9.866452384916679e-05, + "loss": 3.425, + "step": 8040 + }, + { + "epoch": 0.4991619591532684, + "grad_norm": 0.3254874466622975, + "learning_rate": 9.86636945989522e-05, + "loss": 3.3255, + "step": 8041 + }, + { + "epoch": 0.49922403625302625, + "grad_norm": 0.1986174765999147, + "learning_rate": 9.866286509484728e-05, + "loss": 3.2954, + "step": 8042 + }, + { + "epoch": 0.49928611335278417, + "grad_norm": 0.22333188194958617, + "learning_rate": 9.866203533685639e-05, + "loss": 3.3242, + "step": 8043 + }, + { + "epoch": 0.4993481904525421, + "grad_norm": 0.22913048017787735, + "learning_rate": 9.866120532498386e-05, + "loss": 3.3582, + "step": 8044 + }, + { + "epoch": 0.49941026755229995, + "grad_norm": 0.3656750229092554, + "learning_rate": 9.8660375059234e-05, + "loss": 3.3517, + "step": 8045 + }, + { + "epoch": 0.49947234465205786, + "grad_norm": 0.2722867867169435, + "learning_rate": 9.865954453961115e-05, + "loss": 3.2489, + "step": 8046 + }, + { + "epoch": 0.4995344217518158, + "grad_norm": 0.3092005714790287, + "learning_rate": 9.865871376611963e-05, + "loss": 3.2764, + "step": 8047 + }, + { + "epoch": 0.49959649885157364, + "grad_norm": 0.24676096250073334, + "learning_rate": 9.865788273876381e-05, + "loss": 3.2873, + "step": 8048 + }, + { + "epoch": 0.49965857595133156, + "grad_norm": 0.2656775450626128, + "learning_rate": 9.8657051457548e-05, + "loss": 3.2714, + "step": 8049 + }, + { + "epoch": 0.4997206530510895, + "grad_norm": 0.2415660684057987, + "learning_rate": 9.865621992247654e-05, + "loss": 3.3807, + "step": 8050 + }, + { + "epoch": 0.49978273015084734, + "grad_norm": 0.27918896224298895, + "learning_rate": 9.865538813355375e-05, + "loss": 3.3018, + "step": 8051 + }, + { + "epoch": 0.49984480725060526, + "grad_norm": 0.2942178293052688, + "learning_rate": 9.8654556090784e-05, + "loss": 3.3225, + "step": 8052 + }, + { + "epoch": 0.4999068843503632, + "grad_norm": 0.21446660491046257, + "learning_rate": 9.865372379417163e-05, + "loss": 3.2358, + "step": 8053 + }, + { + "epoch": 0.49996896145012104, + "grad_norm": 0.25809451775298664, + "learning_rate": 9.865289124372097e-05, + "loss": 3.3868, + "step": 8054 + }, + { + "epoch": 0.500031038549879, + "grad_norm": 0.2980635092156879, + "learning_rate": 9.865205843943634e-05, + "loss": 3.3438, + "step": 8055 + }, + { + "epoch": 0.5000931156496369, + "grad_norm": 0.23024179307516104, + "learning_rate": 9.865122538132214e-05, + "loss": 3.3321, + "step": 8056 + }, + { + "epoch": 0.5001551927493948, + "grad_norm": 0.24962762577870706, + "learning_rate": 9.865039206938267e-05, + "loss": 3.3722, + "step": 8057 + }, + { + "epoch": 0.5002172698491526, + "grad_norm": 0.3666778029932573, + "learning_rate": 9.864955850362231e-05, + "loss": 3.399, + "step": 8058 + }, + { + "epoch": 0.5002793469489105, + "grad_norm": 0.29545364690273873, + "learning_rate": 9.864872468404538e-05, + "loss": 3.3514, + "step": 8059 + }, + { + "epoch": 0.5003414240486684, + "grad_norm": 0.3486398541249602, + "learning_rate": 9.864789061065625e-05, + "loss": 3.352, + "step": 8060 + }, + { + "epoch": 0.5004035011484264, + "grad_norm": 0.3035029346066523, + "learning_rate": 9.864705628345928e-05, + "loss": 3.3235, + "step": 8061 + }, + { + "epoch": 0.5004655782481843, + "grad_norm": 0.27805985427149943, + "learning_rate": 9.864622170245878e-05, + "loss": 3.3633, + "step": 8062 + }, + { + "epoch": 0.5005276553479422, + "grad_norm": 0.318040336873397, + "learning_rate": 9.864538686765914e-05, + "loss": 3.267, + "step": 8063 + }, + { + "epoch": 0.5005897324477, + "grad_norm": 0.3706205261951116, + "learning_rate": 9.864455177906471e-05, + "loss": 3.3465, + "step": 8064 + }, + { + "epoch": 0.5006518095474579, + "grad_norm": 0.46763782486591393, + "learning_rate": 9.864371643667985e-05, + "loss": 3.3145, + "step": 8065 + }, + { + "epoch": 0.5007138866472158, + "grad_norm": 0.2856873309031245, + "learning_rate": 9.86428808405089e-05, + "loss": 3.2862, + "step": 8066 + }, + { + "epoch": 0.5007759637469738, + "grad_norm": 0.4430197536701411, + "learning_rate": 9.864204499055624e-05, + "loss": 3.1899, + "step": 8067 + }, + { + "epoch": 0.5008380408467317, + "grad_norm": 0.23553966587099068, + "learning_rate": 9.864120888682622e-05, + "loss": 3.288, + "step": 8068 + }, + { + "epoch": 0.5009001179464896, + "grad_norm": 0.3394594321416718, + "learning_rate": 9.864037252932322e-05, + "loss": 3.2931, + "step": 8069 + }, + { + "epoch": 0.5009621950462474, + "grad_norm": 0.21415366403434247, + "learning_rate": 9.863953591805157e-05, + "loss": 3.3238, + "step": 8070 + }, + { + "epoch": 0.5010242721460053, + "grad_norm": 0.30504469841034654, + "learning_rate": 9.863869905301566e-05, + "loss": 3.283, + "step": 8071 + }, + { + "epoch": 0.5010863492457632, + "grad_norm": 0.244073728933173, + "learning_rate": 9.863786193421985e-05, + "loss": 3.1991, + "step": 8072 + }, + { + "epoch": 0.5011484263455211, + "grad_norm": 0.2827335674298599, + "learning_rate": 9.86370245616685e-05, + "loss": 3.2945, + "step": 8073 + }, + { + "epoch": 0.5012105034452791, + "grad_norm": 0.35627353657387184, + "learning_rate": 9.8636186935366e-05, + "loss": 3.2971, + "step": 8074 + }, + { + "epoch": 0.501272580545037, + "grad_norm": 0.26964648174630473, + "learning_rate": 9.86353490553167e-05, + "loss": 3.3101, + "step": 8075 + }, + { + "epoch": 0.5013346576447948, + "grad_norm": 0.3295184814055822, + "learning_rate": 9.863451092152497e-05, + "loss": 3.2575, + "step": 8076 + }, + { + "epoch": 0.5013967347445527, + "grad_norm": 0.25676878714890705, + "learning_rate": 9.86336725339952e-05, + "loss": 3.3852, + "step": 8077 + }, + { + "epoch": 0.5014588118443106, + "grad_norm": 0.24877356166050033, + "learning_rate": 9.863283389273175e-05, + "loss": 3.2264, + "step": 8078 + }, + { + "epoch": 0.5015208889440685, + "grad_norm": 0.3827556840423473, + "learning_rate": 9.8631994997739e-05, + "loss": 3.2766, + "step": 8079 + }, + { + "epoch": 0.5015829660438265, + "grad_norm": 0.3940715755775843, + "learning_rate": 9.863115584902132e-05, + "loss": 3.2727, + "step": 8080 + }, + { + "epoch": 0.5016450431435844, + "grad_norm": 0.40362422844277573, + "learning_rate": 9.86303164465831e-05, + "loss": 3.3104, + "step": 8081 + }, + { + "epoch": 0.5017071202433422, + "grad_norm": 0.32385880816129337, + "learning_rate": 9.86294767904287e-05, + "loss": 3.3091, + "step": 8082 + }, + { + "epoch": 0.5017691973431001, + "grad_norm": 0.3578680411797123, + "learning_rate": 9.862863688056255e-05, + "loss": 3.3404, + "step": 8083 + }, + { + "epoch": 0.501831274442858, + "grad_norm": 0.26464262389381815, + "learning_rate": 9.862779671698896e-05, + "loss": 3.2377, + "step": 8084 + }, + { + "epoch": 0.5018933515426159, + "grad_norm": 0.22290522892893383, + "learning_rate": 9.862695629971237e-05, + "loss": 3.3441, + "step": 8085 + }, + { + "epoch": 0.5019554286423739, + "grad_norm": 0.31806871684485133, + "learning_rate": 9.862611562873714e-05, + "loss": 3.2683, + "step": 8086 + }, + { + "epoch": 0.5020175057421318, + "grad_norm": 0.42191205038631535, + "learning_rate": 9.862527470406766e-05, + "loss": 3.2817, + "step": 8087 + }, + { + "epoch": 0.5020795828418896, + "grad_norm": 0.402205841725121, + "learning_rate": 9.862443352570832e-05, + "loss": 3.3832, + "step": 8088 + }, + { + "epoch": 0.5021416599416475, + "grad_norm": 0.2675645504591997, + "learning_rate": 9.86235920936635e-05, + "loss": 3.3435, + "step": 8089 + }, + { + "epoch": 0.5022037370414054, + "grad_norm": 0.25403280772068154, + "learning_rate": 9.86227504079376e-05, + "loss": 3.3688, + "step": 8090 + }, + { + "epoch": 0.5022658141411633, + "grad_norm": 0.38594604127521326, + "learning_rate": 9.862190846853501e-05, + "loss": 3.3898, + "step": 8091 + }, + { + "epoch": 0.5023278912409213, + "grad_norm": 0.3222419383273964, + "learning_rate": 9.86210662754601e-05, + "loss": 3.3618, + "step": 8092 + }, + { + "epoch": 0.5023899683406792, + "grad_norm": 0.29220114283406756, + "learning_rate": 9.862022382871732e-05, + "loss": 3.4215, + "step": 8093 + }, + { + "epoch": 0.502452045440437, + "grad_norm": 0.39972998848830604, + "learning_rate": 9.8619381128311e-05, + "loss": 3.2745, + "step": 8094 + }, + { + "epoch": 0.5025141225401949, + "grad_norm": 0.25260514164110515, + "learning_rate": 9.861853817424558e-05, + "loss": 3.3684, + "step": 8095 + }, + { + "epoch": 0.5025761996399528, + "grad_norm": 0.3224852536031156, + "learning_rate": 9.861769496652544e-05, + "loss": 3.265, + "step": 8096 + }, + { + "epoch": 0.5026382767397107, + "grad_norm": 0.4137710049071549, + "learning_rate": 9.861685150515498e-05, + "loss": 3.249, + "step": 8097 + }, + { + "epoch": 0.5027003538394686, + "grad_norm": 0.3733735975552692, + "learning_rate": 9.86160077901386e-05, + "loss": 3.2651, + "step": 8098 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.4625996185963333, + "learning_rate": 9.861516382148073e-05, + "loss": 3.3606, + "step": 8099 + }, + { + "epoch": 0.5028245080389844, + "grad_norm": 0.44900774768631546, + "learning_rate": 9.861431959918573e-05, + "loss": 3.3794, + "step": 8100 + }, + { + "epoch": 0.5028865851387423, + "grad_norm": 0.3444492207836054, + "learning_rate": 9.861347512325801e-05, + "loss": 3.2905, + "step": 8101 + }, + { + "epoch": 0.5029486622385002, + "grad_norm": 0.33017318530502154, + "learning_rate": 9.861263039370201e-05, + "loss": 3.3239, + "step": 8102 + }, + { + "epoch": 0.5030107393382581, + "grad_norm": 0.3634653547835365, + "learning_rate": 9.861178541052211e-05, + "loss": 3.3776, + "step": 8103 + }, + { + "epoch": 0.503072816438016, + "grad_norm": 0.3524730229538549, + "learning_rate": 9.861094017372272e-05, + "loss": 3.2892, + "step": 8104 + }, + { + "epoch": 0.503134893537774, + "grad_norm": 0.3440464865260766, + "learning_rate": 9.861009468330826e-05, + "loss": 3.2907, + "step": 8105 + }, + { + "epoch": 0.5031969706375318, + "grad_norm": 0.4382223909127917, + "learning_rate": 9.860924893928313e-05, + "loss": 3.3827, + "step": 8106 + }, + { + "epoch": 0.5032590477372897, + "grad_norm": 0.2643364055680699, + "learning_rate": 9.860840294165174e-05, + "loss": 3.3139, + "step": 8107 + }, + { + "epoch": 0.5033211248370476, + "grad_norm": 0.33360075530082745, + "learning_rate": 9.860755669041853e-05, + "loss": 3.3401, + "step": 8108 + }, + { + "epoch": 0.5033832019368055, + "grad_norm": 0.3945757019381868, + "learning_rate": 9.86067101855879e-05, + "loss": 3.2864, + "step": 8109 + }, + { + "epoch": 0.5034452790365634, + "grad_norm": 0.3730133600808185, + "learning_rate": 9.860586342716425e-05, + "loss": 3.3424, + "step": 8110 + }, + { + "epoch": 0.5035073561363214, + "grad_norm": 0.29776916614938403, + "learning_rate": 9.8605016415152e-05, + "loss": 3.3398, + "step": 8111 + }, + { + "epoch": 0.5035694332360792, + "grad_norm": 0.2592053070859532, + "learning_rate": 9.86041691495556e-05, + "loss": 3.2851, + "step": 8112 + }, + { + "epoch": 0.5036315103358371, + "grad_norm": 0.3208281042345806, + "learning_rate": 9.860332163037944e-05, + "loss": 3.1936, + "step": 8113 + }, + { + "epoch": 0.503693587435595, + "grad_norm": 0.21395901663787795, + "learning_rate": 9.860247385762797e-05, + "loss": 3.3745, + "step": 8114 + }, + { + "epoch": 0.5037556645353529, + "grad_norm": 0.217128525761323, + "learning_rate": 9.860162583130557e-05, + "loss": 3.4038, + "step": 8115 + }, + { + "epoch": 0.5038177416351108, + "grad_norm": 0.26957742323187384, + "learning_rate": 9.860077755141671e-05, + "loss": 3.2301, + "step": 8116 + }, + { + "epoch": 0.5038798187348688, + "grad_norm": 0.2601216679863302, + "learning_rate": 9.859992901796578e-05, + "loss": 3.2741, + "step": 8117 + }, + { + "epoch": 0.5039418958346266, + "grad_norm": 0.3219079431323197, + "learning_rate": 9.859908023095724e-05, + "loss": 3.3236, + "step": 8118 + }, + { + "epoch": 0.5040039729343845, + "grad_norm": 0.22067770347361026, + "learning_rate": 9.859823119039549e-05, + "loss": 3.3187, + "step": 8119 + }, + { + "epoch": 0.5040660500341424, + "grad_norm": 0.21264176303251497, + "learning_rate": 9.859738189628497e-05, + "loss": 3.3183, + "step": 8120 + }, + { + "epoch": 0.5041281271339003, + "grad_norm": 0.32559584961227106, + "learning_rate": 9.85965323486301e-05, + "loss": 3.2495, + "step": 8121 + }, + { + "epoch": 0.5041902042336582, + "grad_norm": 0.2678818336528341, + "learning_rate": 9.859568254743535e-05, + "loss": 3.3584, + "step": 8122 + }, + { + "epoch": 0.5042522813334162, + "grad_norm": 0.2781048392371696, + "learning_rate": 9.85948324927051e-05, + "loss": 3.3166, + "step": 8123 + }, + { + "epoch": 0.504314358433174, + "grad_norm": 0.27285389609079624, + "learning_rate": 9.859398218444383e-05, + "loss": 3.2951, + "step": 8124 + }, + { + "epoch": 0.5043764355329319, + "grad_norm": 0.27832558610169644, + "learning_rate": 9.859313162265595e-05, + "loss": 3.4372, + "step": 8125 + }, + { + "epoch": 0.5044385126326898, + "grad_norm": 0.290646261032286, + "learning_rate": 9.85922808073459e-05, + "loss": 3.228, + "step": 8126 + }, + { + "epoch": 0.5045005897324477, + "grad_norm": 0.27888422272622687, + "learning_rate": 9.859142973851813e-05, + "loss": 3.3762, + "step": 8127 + }, + { + "epoch": 0.5045626668322056, + "grad_norm": 0.39025955271713286, + "learning_rate": 9.859057841617709e-05, + "loss": 3.3537, + "step": 8128 + }, + { + "epoch": 0.5046247439319635, + "grad_norm": 0.26284782775502324, + "learning_rate": 9.858972684032718e-05, + "loss": 3.3199, + "step": 8129 + }, + { + "epoch": 0.5046868210317214, + "grad_norm": 0.28228495168734585, + "learning_rate": 9.85888750109729e-05, + "loss": 3.3606, + "step": 8130 + }, + { + "epoch": 0.5047488981314793, + "grad_norm": 0.26710259164797523, + "learning_rate": 9.858802292811864e-05, + "loss": 3.308, + "step": 8131 + }, + { + "epoch": 0.5048109752312372, + "grad_norm": 0.39724763212464553, + "learning_rate": 9.858717059176888e-05, + "loss": 3.2829, + "step": 8132 + }, + { + "epoch": 0.5048730523309951, + "grad_norm": 0.3668391827933406, + "learning_rate": 9.858631800192806e-05, + "loss": 3.3179, + "step": 8133 + }, + { + "epoch": 0.504935129430753, + "grad_norm": 0.25382093316641946, + "learning_rate": 9.858546515860061e-05, + "loss": 3.3677, + "step": 8134 + }, + { + "epoch": 0.504997206530511, + "grad_norm": 0.27151235257899, + "learning_rate": 9.858461206179101e-05, + "loss": 3.3326, + "step": 8135 + }, + { + "epoch": 0.5050592836302688, + "grad_norm": 0.3584058589322831, + "learning_rate": 9.858375871150369e-05, + "loss": 3.3858, + "step": 8136 + }, + { + "epoch": 0.5051213607300267, + "grad_norm": 0.42972961566174356, + "learning_rate": 9.85829051077431e-05, + "loss": 3.3673, + "step": 8137 + }, + { + "epoch": 0.5051834378297846, + "grad_norm": 0.29853469803674815, + "learning_rate": 9.858205125051369e-05, + "loss": 3.2438, + "step": 8138 + }, + { + "epoch": 0.5052455149295425, + "grad_norm": 0.3165219295634798, + "learning_rate": 9.858119713981994e-05, + "loss": 3.3913, + "step": 8139 + }, + { + "epoch": 0.5053075920293004, + "grad_norm": 0.2384764075376365, + "learning_rate": 9.858034277566628e-05, + "loss": 3.2825, + "step": 8140 + }, + { + "epoch": 0.5053696691290583, + "grad_norm": 0.2798521484273448, + "learning_rate": 9.85794881580572e-05, + "loss": 3.3777, + "step": 8141 + }, + { + "epoch": 0.5054317462288161, + "grad_norm": 0.2908882271890721, + "learning_rate": 9.857863328699712e-05, + "loss": 3.3306, + "step": 8142 + }, + { + "epoch": 0.5054938233285741, + "grad_norm": 0.5593618700415157, + "learning_rate": 9.857777816249051e-05, + "loss": 3.4189, + "step": 8143 + }, + { + "epoch": 0.505555900428332, + "grad_norm": 0.36865165659859384, + "learning_rate": 9.857692278454187e-05, + "loss": 3.3577, + "step": 8144 + }, + { + "epoch": 0.5056179775280899, + "grad_norm": 0.3611525031457625, + "learning_rate": 9.857606715315561e-05, + "loss": 3.3713, + "step": 8145 + }, + { + "epoch": 0.5056800546278478, + "grad_norm": 0.3537843220508887, + "learning_rate": 9.857521126833622e-05, + "loss": 3.3825, + "step": 8146 + }, + { + "epoch": 0.5057421317276057, + "grad_norm": 0.3302453650077821, + "learning_rate": 9.857435513008815e-05, + "loss": 3.3307, + "step": 8147 + }, + { + "epoch": 0.5058042088273635, + "grad_norm": 0.4905430555248092, + "learning_rate": 9.857349873841587e-05, + "loss": 3.3523, + "step": 8148 + }, + { + "epoch": 0.5058662859271215, + "grad_norm": 0.2678934731141547, + "learning_rate": 9.857264209332388e-05, + "loss": 3.3359, + "step": 8149 + }, + { + "epoch": 0.5059283630268794, + "grad_norm": 0.3339138067994261, + "learning_rate": 9.857178519481663e-05, + "loss": 3.3399, + "step": 8150 + }, + { + "epoch": 0.5059904401266373, + "grad_norm": 0.2673446304779447, + "learning_rate": 9.857092804289856e-05, + "loss": 3.2723, + "step": 8151 + }, + { + "epoch": 0.5060525172263952, + "grad_norm": 0.2788235715460615, + "learning_rate": 9.857007063757417e-05, + "loss": 3.3619, + "step": 8152 + }, + { + "epoch": 0.5061145943261531, + "grad_norm": 0.25202893439953805, + "learning_rate": 9.856921297884796e-05, + "loss": 3.2739, + "step": 8153 + }, + { + "epoch": 0.5061766714259109, + "grad_norm": 0.6656162261384633, + "learning_rate": 9.856835506672435e-05, + "loss": 3.3351, + "step": 8154 + }, + { + "epoch": 0.5062387485256689, + "grad_norm": 0.2252164597519892, + "learning_rate": 9.856749690120784e-05, + "loss": 3.2784, + "step": 8155 + }, + { + "epoch": 0.5063008256254268, + "grad_norm": 0.4596971212867454, + "learning_rate": 9.856663848230292e-05, + "loss": 3.2612, + "step": 8156 + }, + { + "epoch": 0.5063629027251847, + "grad_norm": 0.2669261451642677, + "learning_rate": 9.856577981001405e-05, + "loss": 3.3796, + "step": 8157 + }, + { + "epoch": 0.5064249798249426, + "grad_norm": 0.26399088740606547, + "learning_rate": 9.856492088434571e-05, + "loss": 3.3557, + "step": 8158 + }, + { + "epoch": 0.5064870569247005, + "grad_norm": 0.23181702746405203, + "learning_rate": 9.856406170530241e-05, + "loss": 3.3052, + "step": 8159 + }, + { + "epoch": 0.5065491340244583, + "grad_norm": 0.22400268219082778, + "learning_rate": 9.85632022728886e-05, + "loss": 3.3368, + "step": 8160 + }, + { + "epoch": 0.5066112111242163, + "grad_norm": 0.5383953997713052, + "learning_rate": 9.856234258710876e-05, + "loss": 3.2309, + "step": 8161 + }, + { + "epoch": 0.5066732882239742, + "grad_norm": 0.2693059504844188, + "learning_rate": 9.85614826479674e-05, + "loss": 3.3131, + "step": 8162 + }, + { + "epoch": 0.5067353653237321, + "grad_norm": 0.22308032324500687, + "learning_rate": 9.8560622455469e-05, + "loss": 3.3069, + "step": 8163 + }, + { + "epoch": 0.50679744242349, + "grad_norm": 0.2995448151105172, + "learning_rate": 9.855976200961805e-05, + "loss": 3.3128, + "step": 8164 + }, + { + "epoch": 0.5068595195232479, + "grad_norm": 0.26525282030197184, + "learning_rate": 9.855890131041901e-05, + "loss": 3.2498, + "step": 8165 + }, + { + "epoch": 0.5069215966230057, + "grad_norm": 0.23307451411007057, + "learning_rate": 9.855804035787641e-05, + "loss": 3.3356, + "step": 8166 + }, + { + "epoch": 0.5069836737227637, + "grad_norm": 0.36938030797433197, + "learning_rate": 9.855717915199472e-05, + "loss": 3.3868, + "step": 8167 + }, + { + "epoch": 0.5070457508225216, + "grad_norm": 0.22749270020028933, + "learning_rate": 9.855631769277844e-05, + "loss": 3.2366, + "step": 8168 + }, + { + "epoch": 0.5071078279222795, + "grad_norm": 0.4112664411035318, + "learning_rate": 9.855545598023208e-05, + "loss": 3.3615, + "step": 8169 + }, + { + "epoch": 0.5071699050220374, + "grad_norm": 0.3025199499921002, + "learning_rate": 9.85545940143601e-05, + "loss": 3.2972, + "step": 8170 + }, + { + "epoch": 0.5072319821217953, + "grad_norm": 0.20795125595389025, + "learning_rate": 9.855373179516703e-05, + "loss": 3.389, + "step": 8171 + }, + { + "epoch": 0.5072940592215531, + "grad_norm": 0.28241467394162817, + "learning_rate": 9.855286932265733e-05, + "loss": 3.2722, + "step": 8172 + }, + { + "epoch": 0.507356136321311, + "grad_norm": 0.3904356306729572, + "learning_rate": 9.855200659683555e-05, + "loss": 3.3473, + "step": 8173 + }, + { + "epoch": 0.507418213421069, + "grad_norm": 0.267917209373051, + "learning_rate": 9.855114361770616e-05, + "loss": 3.2589, + "step": 8174 + }, + { + "epoch": 0.5074802905208269, + "grad_norm": 0.25964219165151897, + "learning_rate": 9.855028038527366e-05, + "loss": 3.256, + "step": 8175 + }, + { + "epoch": 0.5075423676205848, + "grad_norm": 0.24905592762544906, + "learning_rate": 9.854941689954255e-05, + "loss": 3.2433, + "step": 8176 + }, + { + "epoch": 0.5076044447203427, + "grad_norm": 0.21093145997618717, + "learning_rate": 9.854855316051737e-05, + "loss": 3.2742, + "step": 8177 + }, + { + "epoch": 0.5076665218201005, + "grad_norm": 0.18459212966422195, + "learning_rate": 9.85476891682026e-05, + "loss": 3.3846, + "step": 8178 + }, + { + "epoch": 0.5077285989198584, + "grad_norm": 0.21131152971017292, + "learning_rate": 9.854682492260275e-05, + "loss": 3.3422, + "step": 8179 + }, + { + "epoch": 0.5077906760196164, + "grad_norm": 0.29665698026709675, + "learning_rate": 9.854596042372231e-05, + "loss": 3.3307, + "step": 8180 + }, + { + "epoch": 0.5078527531193743, + "grad_norm": 0.23914799408150164, + "learning_rate": 9.854509567156583e-05, + "loss": 3.2622, + "step": 8181 + }, + { + "epoch": 0.5079148302191322, + "grad_norm": 0.18285433013523053, + "learning_rate": 9.854423066613779e-05, + "loss": 3.19, + "step": 8182 + }, + { + "epoch": 0.5079769073188901, + "grad_norm": 0.22157311923857811, + "learning_rate": 9.854336540744272e-05, + "loss": 3.3874, + "step": 8183 + }, + { + "epoch": 0.5080389844186479, + "grad_norm": 0.22076265831843317, + "learning_rate": 9.854249989548512e-05, + "loss": 3.3187, + "step": 8184 + }, + { + "epoch": 0.5081010615184058, + "grad_norm": 0.2237377337071706, + "learning_rate": 9.854163413026951e-05, + "loss": 3.2435, + "step": 8185 + }, + { + "epoch": 0.5081631386181638, + "grad_norm": 0.2673901224634855, + "learning_rate": 9.854076811180042e-05, + "loss": 3.3642, + "step": 8186 + }, + { + "epoch": 0.5082252157179217, + "grad_norm": 0.19073543945225474, + "learning_rate": 9.853990184008236e-05, + "loss": 3.3332, + "step": 8187 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.24117914368674198, + "learning_rate": 9.853903531511986e-05, + "loss": 3.3861, + "step": 8188 + }, + { + "epoch": 0.5083493699174375, + "grad_norm": 0.2548785739122397, + "learning_rate": 9.853816853691741e-05, + "loss": 3.284, + "step": 8189 + }, + { + "epoch": 0.5084114470171953, + "grad_norm": 0.28664424094422236, + "learning_rate": 9.853730150547955e-05, + "loss": 3.3294, + "step": 8190 + }, + { + "epoch": 0.5084735241169532, + "grad_norm": 0.21348752327829593, + "learning_rate": 9.853643422081083e-05, + "loss": 3.297, + "step": 8191 + }, + { + "epoch": 0.5085356012167112, + "grad_norm": 0.20082719217865974, + "learning_rate": 9.853556668291573e-05, + "loss": 3.2665, + "step": 8192 + }, + { + "epoch": 0.5085976783164691, + "grad_norm": 0.3139751070756697, + "learning_rate": 9.853469889179879e-05, + "loss": 3.2834, + "step": 8193 + }, + { + "epoch": 0.508659755416227, + "grad_norm": 0.18942722654787372, + "learning_rate": 9.853383084746456e-05, + "loss": 3.3684, + "step": 8194 + }, + { + "epoch": 0.5087218325159849, + "grad_norm": 0.23290056732719985, + "learning_rate": 9.853296254991755e-05, + "loss": 3.2873, + "step": 8195 + }, + { + "epoch": 0.5087839096157427, + "grad_norm": 0.18819797264954538, + "learning_rate": 9.853209399916228e-05, + "loss": 3.2781, + "step": 8196 + }, + { + "epoch": 0.5088459867155006, + "grad_norm": 0.23977081419996366, + "learning_rate": 9.85312251952033e-05, + "loss": 3.2414, + "step": 8197 + }, + { + "epoch": 0.5089080638152585, + "grad_norm": 0.196661931877671, + "learning_rate": 9.853035613804514e-05, + "loss": 3.2717, + "step": 8198 + }, + { + "epoch": 0.5089701409150165, + "grad_norm": 0.18814664833369077, + "learning_rate": 9.852948682769234e-05, + "loss": 3.3459, + "step": 8199 + }, + { + "epoch": 0.5090322180147744, + "grad_norm": 0.35250570081267396, + "learning_rate": 9.852861726414941e-05, + "loss": 3.2552, + "step": 8200 + }, + { + "epoch": 0.5090942951145323, + "grad_norm": 0.24071235712083502, + "learning_rate": 9.852774744742092e-05, + "loss": 3.3665, + "step": 8201 + }, + { + "epoch": 0.5091563722142901, + "grad_norm": 0.1924918006414061, + "learning_rate": 9.852687737751137e-05, + "loss": 3.3812, + "step": 8202 + }, + { + "epoch": 0.509218449314048, + "grad_norm": 0.2391092500913785, + "learning_rate": 9.852600705442533e-05, + "loss": 3.2719, + "step": 8203 + }, + { + "epoch": 0.509280526413806, + "grad_norm": 0.241792371889745, + "learning_rate": 9.852513647816735e-05, + "loss": 3.3514, + "step": 8204 + }, + { + "epoch": 0.5093426035135639, + "grad_norm": 0.26535070505934466, + "learning_rate": 9.852426564874193e-05, + "loss": 3.3226, + "step": 8205 + }, + { + "epoch": 0.5094046806133218, + "grad_norm": 0.2354712834693382, + "learning_rate": 9.852339456615364e-05, + "loss": 3.3385, + "step": 8206 + }, + { + "epoch": 0.5094667577130797, + "grad_norm": 0.22363412015440984, + "learning_rate": 9.852252323040703e-05, + "loss": 3.2847, + "step": 8207 + }, + { + "epoch": 0.5095288348128375, + "grad_norm": 0.30885100204922167, + "learning_rate": 9.852165164150662e-05, + "loss": 3.2436, + "step": 8208 + }, + { + "epoch": 0.5095909119125954, + "grad_norm": 0.28039554767002195, + "learning_rate": 9.852077979945698e-05, + "loss": 3.3691, + "step": 8209 + }, + { + "epoch": 0.5096529890123533, + "grad_norm": 0.34326279315807473, + "learning_rate": 9.851990770426266e-05, + "loss": 3.3713, + "step": 8210 + }, + { + "epoch": 0.5097150661121113, + "grad_norm": 0.2417710245762542, + "learning_rate": 9.85190353559282e-05, + "loss": 3.3096, + "step": 8211 + }, + { + "epoch": 0.5097771432118692, + "grad_norm": 0.3573660468187185, + "learning_rate": 9.851816275445815e-05, + "loss": 3.2587, + "step": 8212 + }, + { + "epoch": 0.5098392203116271, + "grad_norm": 0.48919225013622775, + "learning_rate": 9.851728989985708e-05, + "loss": 3.2712, + "step": 8213 + }, + { + "epoch": 0.5099012974113849, + "grad_norm": 0.46984163964434755, + "learning_rate": 9.851641679212951e-05, + "loss": 3.3998, + "step": 8214 + }, + { + "epoch": 0.5099633745111428, + "grad_norm": 0.25542138736795217, + "learning_rate": 9.851554343128004e-05, + "loss": 3.349, + "step": 8215 + }, + { + "epoch": 0.5100254516109007, + "grad_norm": 0.29171054231920385, + "learning_rate": 9.851466981731319e-05, + "loss": 3.2977, + "step": 8216 + }, + { + "epoch": 0.5100875287106587, + "grad_norm": 0.3805139197574223, + "learning_rate": 9.851379595023354e-05, + "loss": 3.323, + "step": 8217 + }, + { + "epoch": 0.5101496058104166, + "grad_norm": 0.22475643319490332, + "learning_rate": 9.851292183004561e-05, + "loss": 3.2771, + "step": 8218 + }, + { + "epoch": 0.5102116829101745, + "grad_norm": 0.3222066767560488, + "learning_rate": 9.851204745675402e-05, + "loss": 3.3465, + "step": 8219 + }, + { + "epoch": 0.5102737600099323, + "grad_norm": 0.265902780693789, + "learning_rate": 9.851117283036328e-05, + "loss": 3.1999, + "step": 8220 + }, + { + "epoch": 0.5103358371096902, + "grad_norm": 0.2076911665276819, + "learning_rate": 9.851029795087799e-05, + "loss": 3.272, + "step": 8221 + }, + { + "epoch": 0.5103979142094481, + "grad_norm": 0.24526875585762054, + "learning_rate": 9.85094228183027e-05, + "loss": 3.3262, + "step": 8222 + }, + { + "epoch": 0.510459991309206, + "grad_norm": 0.23545857129692221, + "learning_rate": 9.850854743264197e-05, + "loss": 3.3957, + "step": 8223 + }, + { + "epoch": 0.510522068408964, + "grad_norm": 0.2662107236753572, + "learning_rate": 9.850767179390037e-05, + "loss": 3.3733, + "step": 8224 + }, + { + "epoch": 0.5105841455087219, + "grad_norm": 0.18985735644335727, + "learning_rate": 9.850679590208248e-05, + "loss": 3.3728, + "step": 8225 + }, + { + "epoch": 0.5106462226084797, + "grad_norm": 0.22899802552177825, + "learning_rate": 9.850591975719286e-05, + "loss": 3.3182, + "step": 8226 + }, + { + "epoch": 0.5107082997082376, + "grad_norm": 0.19848518324330597, + "learning_rate": 9.850504335923608e-05, + "loss": 3.3199, + "step": 8227 + }, + { + "epoch": 0.5107703768079955, + "grad_norm": 0.24840438040210813, + "learning_rate": 9.85041667082167e-05, + "loss": 3.3468, + "step": 8228 + }, + { + "epoch": 0.5108324539077534, + "grad_norm": 0.21090848157987685, + "learning_rate": 9.85032898041393e-05, + "loss": 3.3319, + "step": 8229 + }, + { + "epoch": 0.5108945310075114, + "grad_norm": 0.21608795210644885, + "learning_rate": 9.850241264700848e-05, + "loss": 3.2202, + "step": 8230 + }, + { + "epoch": 0.5109566081072693, + "grad_norm": 0.22089669507702947, + "learning_rate": 9.850153523682881e-05, + "loss": 3.2611, + "step": 8231 + }, + { + "epoch": 0.5110186852070271, + "grad_norm": 0.19453305163181156, + "learning_rate": 9.850065757360484e-05, + "loss": 3.4005, + "step": 8232 + }, + { + "epoch": 0.511080762306785, + "grad_norm": 0.21115010637991555, + "learning_rate": 9.849977965734116e-05, + "loss": 3.2682, + "step": 8233 + }, + { + "epoch": 0.5111428394065429, + "grad_norm": 0.2561984109120889, + "learning_rate": 9.849890148804237e-05, + "loss": 3.3271, + "step": 8234 + }, + { + "epoch": 0.5112049165063008, + "grad_norm": 0.2075308762983402, + "learning_rate": 9.849802306571303e-05, + "loss": 3.3493, + "step": 8235 + }, + { + "epoch": 0.5112669936060588, + "grad_norm": 0.20440788644608857, + "learning_rate": 9.849714439035775e-05, + "loss": 3.2125, + "step": 8236 + }, + { + "epoch": 0.5113290707058167, + "grad_norm": 0.26054444833795004, + "learning_rate": 9.849626546198108e-05, + "loss": 3.2495, + "step": 8237 + }, + { + "epoch": 0.5113911478055745, + "grad_norm": 0.21967170030421548, + "learning_rate": 9.84953862805876e-05, + "loss": 3.3991, + "step": 8238 + }, + { + "epoch": 0.5114532249053324, + "grad_norm": 0.2203480387491985, + "learning_rate": 9.849450684618195e-05, + "loss": 3.2577, + "step": 8239 + }, + { + "epoch": 0.5115153020050903, + "grad_norm": 0.26127190662327127, + "learning_rate": 9.849362715876866e-05, + "loss": 3.2013, + "step": 8240 + }, + { + "epoch": 0.5115773791048482, + "grad_norm": 0.21100519118488129, + "learning_rate": 9.849274721835236e-05, + "loss": 3.2418, + "step": 8241 + }, + { + "epoch": 0.5116394562046062, + "grad_norm": 0.2524888899691201, + "learning_rate": 9.849186702493762e-05, + "loss": 3.4089, + "step": 8242 + }, + { + "epoch": 0.511701533304364, + "grad_norm": 0.22403905027572865, + "learning_rate": 9.849098657852904e-05, + "loss": 3.3033, + "step": 8243 + }, + { + "epoch": 0.5117636104041219, + "grad_norm": 0.1979599526820923, + "learning_rate": 9.849010587913121e-05, + "loss": 3.2989, + "step": 8244 + }, + { + "epoch": 0.5118256875038798, + "grad_norm": 0.20180974220017178, + "learning_rate": 9.848922492674872e-05, + "loss": 3.343, + "step": 8245 + }, + { + "epoch": 0.5118877646036377, + "grad_norm": 0.23893997723988802, + "learning_rate": 9.848834372138617e-05, + "loss": 3.3531, + "step": 8246 + }, + { + "epoch": 0.5119498417033956, + "grad_norm": 0.26269219265471394, + "learning_rate": 9.848746226304819e-05, + "loss": 3.3233, + "step": 8247 + }, + { + "epoch": 0.5120119188031536, + "grad_norm": 0.1895167352129805, + "learning_rate": 9.848658055173931e-05, + "loss": 3.3143, + "step": 8248 + }, + { + "epoch": 0.5120739959029114, + "grad_norm": 0.25574211170713396, + "learning_rate": 9.848569858746419e-05, + "loss": 3.3541, + "step": 8249 + }, + { + "epoch": 0.5121360730026693, + "grad_norm": 0.19165871960466832, + "learning_rate": 9.848481637022741e-05, + "loss": 3.3262, + "step": 8250 + }, + { + "epoch": 0.5121981501024272, + "grad_norm": 0.3518943809572051, + "learning_rate": 9.848393390003356e-05, + "loss": 3.3112, + "step": 8251 + }, + { + "epoch": 0.5122602272021851, + "grad_norm": 0.2735483762194705, + "learning_rate": 9.848305117688727e-05, + "loss": 3.3604, + "step": 8252 + }, + { + "epoch": 0.512322304301943, + "grad_norm": 0.2413946679802868, + "learning_rate": 9.848216820079315e-05, + "loss": 3.2697, + "step": 8253 + }, + { + "epoch": 0.512384381401701, + "grad_norm": 0.33095717893019166, + "learning_rate": 9.848128497175575e-05, + "loss": 3.2959, + "step": 8254 + }, + { + "epoch": 0.5124464585014588, + "grad_norm": 0.34354168909063937, + "learning_rate": 9.848040148977974e-05, + "loss": 3.3377, + "step": 8255 + }, + { + "epoch": 0.5125085356012167, + "grad_norm": 0.2699163284504762, + "learning_rate": 9.84795177548697e-05, + "loss": 3.1687, + "step": 8256 + }, + { + "epoch": 0.5125706127009746, + "grad_norm": 0.2555889418523214, + "learning_rate": 9.847863376703024e-05, + "loss": 3.2854, + "step": 8257 + }, + { + "epoch": 0.5126326898007325, + "grad_norm": 0.23948420498694398, + "learning_rate": 9.847774952626599e-05, + "loss": 3.2753, + "step": 8258 + }, + { + "epoch": 0.5126947669004904, + "grad_norm": 0.20450797070181873, + "learning_rate": 9.847686503258156e-05, + "loss": 3.2068, + "step": 8259 + }, + { + "epoch": 0.5127568440002483, + "grad_norm": 0.24707761007395343, + "learning_rate": 9.847598028598155e-05, + "loss": 3.3267, + "step": 8260 + }, + { + "epoch": 0.5128189211000062, + "grad_norm": 0.27145688857755107, + "learning_rate": 9.847509528647056e-05, + "loss": 3.3413, + "step": 8261 + }, + { + "epoch": 0.5128809981997641, + "grad_norm": 0.319679751782383, + "learning_rate": 9.847421003405327e-05, + "loss": 3.3925, + "step": 8262 + }, + { + "epoch": 0.512943075299522, + "grad_norm": 0.2885490832540465, + "learning_rate": 9.847332452873424e-05, + "loss": 3.4057, + "step": 8263 + }, + { + "epoch": 0.5130051523992799, + "grad_norm": 0.22459855260898529, + "learning_rate": 9.84724387705181e-05, + "loss": 3.3549, + "step": 8264 + }, + { + "epoch": 0.5130672294990378, + "grad_norm": 0.3362256300437051, + "learning_rate": 9.847155275940949e-05, + "loss": 3.3123, + "step": 8265 + }, + { + "epoch": 0.5131293065987957, + "grad_norm": 0.35082510863031774, + "learning_rate": 9.847066649541302e-05, + "loss": 3.2934, + "step": 8266 + }, + { + "epoch": 0.5131913836985535, + "grad_norm": 0.31479363338395516, + "learning_rate": 9.846977997853331e-05, + "loss": 3.3004, + "step": 8267 + }, + { + "epoch": 0.5132534607983115, + "grad_norm": 0.3519989695100439, + "learning_rate": 9.846889320877499e-05, + "loss": 3.3328, + "step": 8268 + }, + { + "epoch": 0.5133155378980694, + "grad_norm": 0.2671511926577257, + "learning_rate": 9.84680061861427e-05, + "loss": 3.2512, + "step": 8269 + }, + { + "epoch": 0.5133776149978273, + "grad_norm": 0.3510349766389034, + "learning_rate": 9.846711891064105e-05, + "loss": 3.3703, + "step": 8270 + }, + { + "epoch": 0.5134396920975852, + "grad_norm": 0.24947322541066452, + "learning_rate": 9.846623138227467e-05, + "loss": 3.186, + "step": 8271 + }, + { + "epoch": 0.5135017691973431, + "grad_norm": 0.2953524824858929, + "learning_rate": 9.84653436010482e-05, + "loss": 3.266, + "step": 8272 + }, + { + "epoch": 0.513563846297101, + "grad_norm": 0.3372849233874916, + "learning_rate": 9.846445556696627e-05, + "loss": 3.342, + "step": 8273 + }, + { + "epoch": 0.5136259233968589, + "grad_norm": 0.3025749771344459, + "learning_rate": 9.84635672800335e-05, + "loss": 3.2526, + "step": 8274 + }, + { + "epoch": 0.5136880004966168, + "grad_norm": 0.2838786388778776, + "learning_rate": 9.846267874025456e-05, + "loss": 3.312, + "step": 8275 + }, + { + "epoch": 0.5137500775963747, + "grad_norm": 0.24119003297978747, + "learning_rate": 9.846178994763404e-05, + "loss": 3.3335, + "step": 8276 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.29517895186657467, + "learning_rate": 9.846090090217659e-05, + "loss": 3.3175, + "step": 8277 + }, + { + "epoch": 0.5138742317958905, + "grad_norm": 0.25966519013202555, + "learning_rate": 9.846001160388687e-05, + "loss": 3.3241, + "step": 8278 + }, + { + "epoch": 0.5139363088956483, + "grad_norm": 0.3826452395878883, + "learning_rate": 9.84591220527695e-05, + "loss": 3.3509, + "step": 8279 + }, + { + "epoch": 0.5139983859954063, + "grad_norm": 0.4540230796016335, + "learning_rate": 9.845823224882913e-05, + "loss": 3.3676, + "step": 8280 + }, + { + "epoch": 0.5140604630951642, + "grad_norm": 0.30370841071252197, + "learning_rate": 9.845734219207038e-05, + "loss": 3.2777, + "step": 8281 + }, + { + "epoch": 0.5141225401949221, + "grad_norm": 0.3302169485302043, + "learning_rate": 9.845645188249794e-05, + "loss": 3.3388, + "step": 8282 + }, + { + "epoch": 0.51418461729468, + "grad_norm": 0.3400005548837285, + "learning_rate": 9.845556132011642e-05, + "loss": 3.2674, + "step": 8283 + }, + { + "epoch": 0.5142466943944379, + "grad_norm": 0.30819370801726986, + "learning_rate": 9.845467050493047e-05, + "loss": 3.3192, + "step": 8284 + }, + { + "epoch": 0.5143087714941957, + "grad_norm": 0.25509000811857707, + "learning_rate": 9.845377943694474e-05, + "loss": 3.3068, + "step": 8285 + }, + { + "epoch": 0.5143708485939537, + "grad_norm": 0.3527724531707193, + "learning_rate": 9.845288811616389e-05, + "loss": 3.3648, + "step": 8286 + }, + { + "epoch": 0.5144329256937116, + "grad_norm": 0.3037700043672576, + "learning_rate": 9.845199654259254e-05, + "loss": 3.2326, + "step": 8287 + }, + { + "epoch": 0.5144950027934695, + "grad_norm": 0.3579221257684217, + "learning_rate": 9.845110471623538e-05, + "loss": 3.3633, + "step": 8288 + }, + { + "epoch": 0.5145570798932274, + "grad_norm": 0.3062158789408141, + "learning_rate": 9.845021263709704e-05, + "loss": 3.2954, + "step": 8289 + }, + { + "epoch": 0.5146191569929853, + "grad_norm": 0.22797897283263532, + "learning_rate": 9.844932030518216e-05, + "loss": 3.3087, + "step": 8290 + }, + { + "epoch": 0.5146812340927431, + "grad_norm": 0.28322703819602185, + "learning_rate": 9.844842772049545e-05, + "loss": 3.4073, + "step": 8291 + }, + { + "epoch": 0.514743311192501, + "grad_norm": 0.23773709684550673, + "learning_rate": 9.84475348830415e-05, + "loss": 3.2489, + "step": 8292 + }, + { + "epoch": 0.514805388292259, + "grad_norm": 0.24071835403776282, + "learning_rate": 9.8446641792825e-05, + "loss": 3.1772, + "step": 8293 + }, + { + "epoch": 0.5148674653920169, + "grad_norm": 0.32658160623755805, + "learning_rate": 9.844574844985064e-05, + "loss": 3.3941, + "step": 8294 + }, + { + "epoch": 0.5149295424917748, + "grad_norm": 0.2955901803283957, + "learning_rate": 9.844485485412303e-05, + "loss": 3.2982, + "step": 8295 + }, + { + "epoch": 0.5149916195915327, + "grad_norm": 0.2672195259219483, + "learning_rate": 9.844396100564685e-05, + "loss": 3.2538, + "step": 8296 + }, + { + "epoch": 0.5150536966912905, + "grad_norm": 0.24483207443531257, + "learning_rate": 9.844306690442677e-05, + "loss": 3.2822, + "step": 8297 + }, + { + "epoch": 0.5151157737910484, + "grad_norm": 0.31807322601699756, + "learning_rate": 9.844217255046743e-05, + "loss": 3.2344, + "step": 8298 + }, + { + "epoch": 0.5151778508908064, + "grad_norm": 0.27288315302734656, + "learning_rate": 9.844127794377355e-05, + "loss": 3.3133, + "step": 8299 + }, + { + "epoch": 0.5152399279905643, + "grad_norm": 0.2644225138044697, + "learning_rate": 9.844038308434975e-05, + "loss": 3.3166, + "step": 8300 + }, + { + "epoch": 0.5153020050903222, + "grad_norm": 0.317053826826878, + "learning_rate": 9.84394879722007e-05, + "loss": 3.3283, + "step": 8301 + }, + { + "epoch": 0.5153640821900801, + "grad_norm": 0.3860010172047331, + "learning_rate": 9.843859260733111e-05, + "loss": 3.3772, + "step": 8302 + }, + { + "epoch": 0.5154261592898379, + "grad_norm": 0.2486242947613978, + "learning_rate": 9.843769698974559e-05, + "loss": 3.2407, + "step": 8303 + }, + { + "epoch": 0.5154882363895958, + "grad_norm": 0.3688398882043121, + "learning_rate": 9.843680111944886e-05, + "loss": 3.3201, + "step": 8304 + }, + { + "epoch": 0.5155503134893538, + "grad_norm": 0.28634422225771655, + "learning_rate": 9.843590499644558e-05, + "loss": 3.2456, + "step": 8305 + }, + { + "epoch": 0.5156123905891117, + "grad_norm": 0.308896145961685, + "learning_rate": 9.843500862074044e-05, + "loss": 3.3351, + "step": 8306 + }, + { + "epoch": 0.5156744676888696, + "grad_norm": 0.3279196573510454, + "learning_rate": 9.843411199233808e-05, + "loss": 3.1954, + "step": 8307 + }, + { + "epoch": 0.5157365447886275, + "grad_norm": 0.23907561079238643, + "learning_rate": 9.843321511124321e-05, + "loss": 3.3631, + "step": 8308 + }, + { + "epoch": 0.5157986218883853, + "grad_norm": 0.2939639798443713, + "learning_rate": 9.843231797746049e-05, + "loss": 3.2588, + "step": 8309 + }, + { + "epoch": 0.5158606989881432, + "grad_norm": 0.33486214117755536, + "learning_rate": 9.843142059099462e-05, + "loss": 3.3533, + "step": 8310 + }, + { + "epoch": 0.5159227760879012, + "grad_norm": 0.32681621934733457, + "learning_rate": 9.843052295185027e-05, + "loss": 3.2596, + "step": 8311 + }, + { + "epoch": 0.5159848531876591, + "grad_norm": 0.357318988437536, + "learning_rate": 9.842962506003212e-05, + "loss": 3.3333, + "step": 8312 + }, + { + "epoch": 0.516046930287417, + "grad_norm": 0.3425982318227661, + "learning_rate": 9.842872691554486e-05, + "loss": 3.2734, + "step": 8313 + }, + { + "epoch": 0.5161090073871749, + "grad_norm": 0.3101115384291887, + "learning_rate": 9.842782851839319e-05, + "loss": 3.2848, + "step": 8314 + }, + { + "epoch": 0.5161710844869327, + "grad_norm": 0.32051947349893806, + "learning_rate": 9.842692986858177e-05, + "loss": 3.3553, + "step": 8315 + }, + { + "epoch": 0.5162331615866906, + "grad_norm": 0.37182787508458603, + "learning_rate": 9.84260309661153e-05, + "loss": 3.2439, + "step": 8316 + }, + { + "epoch": 0.5162952386864486, + "grad_norm": 0.30531831388296926, + "learning_rate": 9.842513181099847e-05, + "loss": 3.2373, + "step": 8317 + }, + { + "epoch": 0.5163573157862065, + "grad_norm": 0.26880121186712674, + "learning_rate": 9.842423240323597e-05, + "loss": 3.1996, + "step": 8318 + }, + { + "epoch": 0.5164193928859644, + "grad_norm": 0.30711745588005546, + "learning_rate": 9.84233327428325e-05, + "loss": 3.2154, + "step": 8319 + }, + { + "epoch": 0.5164814699857223, + "grad_norm": 0.47933968412675776, + "learning_rate": 9.842243282979276e-05, + "loss": 3.2551, + "step": 8320 + }, + { + "epoch": 0.5165435470854801, + "grad_norm": 0.2735297183797241, + "learning_rate": 9.842153266412141e-05, + "loss": 3.2126, + "step": 8321 + }, + { + "epoch": 0.516605624185238, + "grad_norm": 0.25983714789779017, + "learning_rate": 9.842063224582318e-05, + "loss": 3.2282, + "step": 8322 + }, + { + "epoch": 0.516667701284996, + "grad_norm": 0.34823002775296713, + "learning_rate": 9.841973157490276e-05, + "loss": 3.2652, + "step": 8323 + }, + { + "epoch": 0.5167297783847539, + "grad_norm": 0.25909224690277494, + "learning_rate": 9.841883065136484e-05, + "loss": 3.2542, + "step": 8324 + }, + { + "epoch": 0.5167918554845118, + "grad_norm": 0.278443038384428, + "learning_rate": 9.841792947521412e-05, + "loss": 3.2334, + "step": 8325 + }, + { + "epoch": 0.5168539325842697, + "grad_norm": 0.22430749592337118, + "learning_rate": 9.841702804645531e-05, + "loss": 3.3052, + "step": 8326 + }, + { + "epoch": 0.5169160096840275, + "grad_norm": 0.20750256637342016, + "learning_rate": 9.841612636509311e-05, + "loss": 3.2361, + "step": 8327 + }, + { + "epoch": 0.5169780867837854, + "grad_norm": 0.2188711852498837, + "learning_rate": 9.841522443113225e-05, + "loss": 3.2486, + "step": 8328 + }, + { + "epoch": 0.5170401638835433, + "grad_norm": 0.23765812365854855, + "learning_rate": 9.84143222445774e-05, + "loss": 3.2356, + "step": 8329 + }, + { + "epoch": 0.5171022409833013, + "grad_norm": 0.1916069710499904, + "learning_rate": 9.841341980543326e-05, + "loss": 3.3009, + "step": 8330 + }, + { + "epoch": 0.5171643180830592, + "grad_norm": 0.18570003211655903, + "learning_rate": 9.841251711370458e-05, + "loss": 3.1988, + "step": 8331 + }, + { + "epoch": 0.5172263951828171, + "grad_norm": 0.2673778062551005, + "learning_rate": 9.841161416939602e-05, + "loss": 3.281, + "step": 8332 + }, + { + "epoch": 0.5172884722825749, + "grad_norm": 0.22786685760816308, + "learning_rate": 9.841071097251234e-05, + "loss": 3.2834, + "step": 8333 + }, + { + "epoch": 0.5173505493823328, + "grad_norm": 0.4292032981565856, + "learning_rate": 9.840980752305821e-05, + "loss": 3.3868, + "step": 8334 + }, + { + "epoch": 0.5174126264820907, + "grad_norm": 0.3289649408126622, + "learning_rate": 9.840890382103838e-05, + "loss": 3.3458, + "step": 8335 + }, + { + "epoch": 0.5174747035818487, + "grad_norm": 0.196476263481749, + "learning_rate": 9.840799986645753e-05, + "loss": 3.3345, + "step": 8336 + }, + { + "epoch": 0.5175367806816066, + "grad_norm": 0.2471887357466296, + "learning_rate": 9.84070956593204e-05, + "loss": 3.2186, + "step": 8337 + }, + { + "epoch": 0.5175988577813645, + "grad_norm": 0.20567773626794936, + "learning_rate": 9.840619119963171e-05, + "loss": 3.3121, + "step": 8338 + }, + { + "epoch": 0.5176609348811223, + "grad_norm": 0.24793889856965756, + "learning_rate": 9.840528648739615e-05, + "loss": 3.36, + "step": 8339 + }, + { + "epoch": 0.5177230119808802, + "grad_norm": 0.28989490304419085, + "learning_rate": 9.840438152261849e-05, + "loss": 3.3622, + "step": 8340 + }, + { + "epoch": 0.5177850890806381, + "grad_norm": 0.23928673238552936, + "learning_rate": 9.840347630530338e-05, + "loss": 3.2609, + "step": 8341 + }, + { + "epoch": 0.5178471661803961, + "grad_norm": 0.3401242994548701, + "learning_rate": 9.840257083545562e-05, + "loss": 3.2509, + "step": 8342 + }, + { + "epoch": 0.517909243280154, + "grad_norm": 0.2256552024291884, + "learning_rate": 9.840166511307988e-05, + "loss": 3.2525, + "step": 8343 + }, + { + "epoch": 0.5179713203799119, + "grad_norm": 0.29425158339246993, + "learning_rate": 9.84007591381809e-05, + "loss": 3.317, + "step": 8344 + }, + { + "epoch": 0.5180333974796697, + "grad_norm": 0.2723680426727482, + "learning_rate": 9.839985291076341e-05, + "loss": 3.2778, + "step": 8345 + }, + { + "epoch": 0.5180954745794276, + "grad_norm": 0.3328269959816503, + "learning_rate": 9.839894643083216e-05, + "loss": 3.2644, + "step": 8346 + }, + { + "epoch": 0.5181575516791855, + "grad_norm": 0.4283257146199817, + "learning_rate": 9.839803969839183e-05, + "loss": 3.2877, + "step": 8347 + }, + { + "epoch": 0.5182196287789435, + "grad_norm": 0.26451501615117234, + "learning_rate": 9.839713271344719e-05, + "loss": 3.287, + "step": 8348 + }, + { + "epoch": 0.5182817058787014, + "grad_norm": 0.2763485582091486, + "learning_rate": 9.839622547600295e-05, + "loss": 3.3016, + "step": 8349 + }, + { + "epoch": 0.5183437829784593, + "grad_norm": 0.30880692519000796, + "learning_rate": 9.839531798606387e-05, + "loss": 3.3899, + "step": 8350 + }, + { + "epoch": 0.5184058600782171, + "grad_norm": 0.2379585018944437, + "learning_rate": 9.839441024363467e-05, + "loss": 3.2715, + "step": 8351 + }, + { + "epoch": 0.518467937177975, + "grad_norm": 0.23805373696182539, + "learning_rate": 9.839350224872008e-05, + "loss": 3.2281, + "step": 8352 + }, + { + "epoch": 0.5185300142777329, + "grad_norm": 0.24477652323666452, + "learning_rate": 9.839259400132483e-05, + "loss": 3.3551, + "step": 8353 + }, + { + "epoch": 0.5185920913774908, + "grad_norm": 0.269549991121966, + "learning_rate": 9.839168550145367e-05, + "loss": 3.3523, + "step": 8354 + }, + { + "epoch": 0.5186541684772488, + "grad_norm": 0.23272416147779063, + "learning_rate": 9.839077674911135e-05, + "loss": 3.2522, + "step": 8355 + }, + { + "epoch": 0.5187162455770067, + "grad_norm": 0.3133223442465473, + "learning_rate": 9.838986774430258e-05, + "loss": 3.3058, + "step": 8356 + }, + { + "epoch": 0.5187783226767645, + "grad_norm": 0.2128880089415357, + "learning_rate": 9.838895848703213e-05, + "loss": 3.2806, + "step": 8357 + }, + { + "epoch": 0.5188403997765224, + "grad_norm": 0.3446732174498622, + "learning_rate": 9.838804897730476e-05, + "loss": 3.2911, + "step": 8358 + }, + { + "epoch": 0.5189024768762803, + "grad_norm": 0.30925519289962516, + "learning_rate": 9.838713921512516e-05, + "loss": 3.2481, + "step": 8359 + }, + { + "epoch": 0.5189645539760382, + "grad_norm": 0.2697917674345518, + "learning_rate": 9.838622920049814e-05, + "loss": 3.364, + "step": 8360 + }, + { + "epoch": 0.5190266310757962, + "grad_norm": 0.34336085400572036, + "learning_rate": 9.838531893342839e-05, + "loss": 3.2583, + "step": 8361 + }, + { + "epoch": 0.5190887081755541, + "grad_norm": 0.2685971313228219, + "learning_rate": 9.83844084139207e-05, + "loss": 3.2744, + "step": 8362 + }, + { + "epoch": 0.5191507852753119, + "grad_norm": 0.22794376450766843, + "learning_rate": 9.83834976419798e-05, + "loss": 3.2036, + "step": 8363 + }, + { + "epoch": 0.5192128623750698, + "grad_norm": 0.2537071327587867, + "learning_rate": 9.838258661761044e-05, + "loss": 3.3301, + "step": 8364 + }, + { + "epoch": 0.5192749394748277, + "grad_norm": 0.375330342385609, + "learning_rate": 9.838167534081739e-05, + "loss": 3.3515, + "step": 8365 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 0.24990376467969758, + "learning_rate": 9.83807638116054e-05, + "loss": 3.3563, + "step": 8366 + }, + { + "epoch": 0.5193990936743436, + "grad_norm": 0.2563192878648483, + "learning_rate": 9.837985202997922e-05, + "loss": 3.3556, + "step": 8367 + }, + { + "epoch": 0.5194611707741015, + "grad_norm": 0.2426941095258197, + "learning_rate": 9.837893999594359e-05, + "loss": 3.3507, + "step": 8368 + }, + { + "epoch": 0.5195232478738593, + "grad_norm": 0.27162922624353875, + "learning_rate": 9.83780277095033e-05, + "loss": 3.3485, + "step": 8369 + }, + { + "epoch": 0.5195853249736172, + "grad_norm": 0.21032301267165412, + "learning_rate": 9.837711517066308e-05, + "loss": 3.1892, + "step": 8370 + }, + { + "epoch": 0.5196474020733751, + "grad_norm": 0.20270554626060872, + "learning_rate": 9.837620237942773e-05, + "loss": 3.3164, + "step": 8371 + }, + { + "epoch": 0.519709479173133, + "grad_norm": 0.21766018167684084, + "learning_rate": 9.837528933580197e-05, + "loss": 3.3608, + "step": 8372 + }, + { + "epoch": 0.519771556272891, + "grad_norm": 0.28677243294057864, + "learning_rate": 9.837437603979058e-05, + "loss": 3.3144, + "step": 8373 + }, + { + "epoch": 0.5198336333726489, + "grad_norm": 0.3663639410192125, + "learning_rate": 9.837346249139834e-05, + "loss": 3.3116, + "step": 8374 + }, + { + "epoch": 0.5198957104724067, + "grad_norm": 0.35419096115410975, + "learning_rate": 9.837254869063e-05, + "loss": 3.3627, + "step": 8375 + }, + { + "epoch": 0.5199577875721646, + "grad_norm": 0.33104263553793695, + "learning_rate": 9.837163463749033e-05, + "loss": 3.1707, + "step": 8376 + }, + { + "epoch": 0.5200198646719225, + "grad_norm": 0.31580758856512514, + "learning_rate": 9.837072033198409e-05, + "loss": 3.283, + "step": 8377 + }, + { + "epoch": 0.5200819417716804, + "grad_norm": 0.2583544266200618, + "learning_rate": 9.836980577411607e-05, + "loss": 3.2544, + "step": 8378 + }, + { + "epoch": 0.5201440188714384, + "grad_norm": 0.3713073918585648, + "learning_rate": 9.836889096389103e-05, + "loss": 3.3272, + "step": 8379 + }, + { + "epoch": 0.5202060959711963, + "grad_norm": 0.3274350291699734, + "learning_rate": 9.836797590131374e-05, + "loss": 3.2897, + "step": 8380 + }, + { + "epoch": 0.5202681730709541, + "grad_norm": 0.30268267528478227, + "learning_rate": 9.836706058638898e-05, + "loss": 3.2807, + "step": 8381 + }, + { + "epoch": 0.520330250170712, + "grad_norm": 0.29029642111904314, + "learning_rate": 9.836614501912153e-05, + "loss": 3.2377, + "step": 8382 + }, + { + "epoch": 0.5203923272704699, + "grad_norm": 0.3740119434313663, + "learning_rate": 9.836522919951615e-05, + "loss": 3.3399, + "step": 8383 + }, + { + "epoch": 0.5204544043702278, + "grad_norm": 0.37517571306245395, + "learning_rate": 9.836431312757763e-05, + "loss": 3.3301, + "step": 8384 + }, + { + "epoch": 0.5205164814699857, + "grad_norm": 0.27031086386477615, + "learning_rate": 9.836339680331074e-05, + "loss": 3.3765, + "step": 8385 + }, + { + "epoch": 0.5205785585697437, + "grad_norm": 0.2775023772378427, + "learning_rate": 9.83624802267203e-05, + "loss": 3.1745, + "step": 8386 + }, + { + "epoch": 0.5206406356695015, + "grad_norm": 0.21273480408646608, + "learning_rate": 9.836156339781103e-05, + "loss": 3.2845, + "step": 8387 + }, + { + "epoch": 0.5207027127692594, + "grad_norm": 0.21940921330539254, + "learning_rate": 9.836064631658776e-05, + "loss": 3.2764, + "step": 8388 + }, + { + "epoch": 0.5207647898690173, + "grad_norm": 0.21747408593296283, + "learning_rate": 9.835972898305525e-05, + "loss": 3.2743, + "step": 8389 + }, + { + "epoch": 0.5208268669687752, + "grad_norm": 0.2850563057065685, + "learning_rate": 9.835881139721829e-05, + "loss": 3.3553, + "step": 8390 + }, + { + "epoch": 0.5208889440685331, + "grad_norm": 0.21965101850778454, + "learning_rate": 9.835789355908167e-05, + "loss": 3.2897, + "step": 8391 + }, + { + "epoch": 0.5209510211682911, + "grad_norm": 0.2996942521767616, + "learning_rate": 9.83569754686502e-05, + "loss": 3.2811, + "step": 8392 + }, + { + "epoch": 0.5210130982680489, + "grad_norm": 0.2457125375397271, + "learning_rate": 9.835605712592863e-05, + "loss": 3.2549, + "step": 8393 + }, + { + "epoch": 0.5210751753678068, + "grad_norm": 0.2869253282302318, + "learning_rate": 9.835513853092178e-05, + "loss": 3.3898, + "step": 8394 + }, + { + "epoch": 0.5211372524675647, + "grad_norm": 0.23376452094888822, + "learning_rate": 9.835421968363443e-05, + "loss": 3.2385, + "step": 8395 + }, + { + "epoch": 0.5211993295673226, + "grad_norm": 0.7789220458239782, + "learning_rate": 9.835330058407137e-05, + "loss": 3.2132, + "step": 8396 + }, + { + "epoch": 0.5212614066670805, + "grad_norm": 0.3017954139367982, + "learning_rate": 9.835238123223741e-05, + "loss": 3.2914, + "step": 8397 + }, + { + "epoch": 0.5213234837668385, + "grad_norm": 0.45318359785355744, + "learning_rate": 9.835146162813735e-05, + "loss": 3.2781, + "step": 8398 + }, + { + "epoch": 0.5213855608665963, + "grad_norm": 0.3966264892848319, + "learning_rate": 9.835054177177597e-05, + "loss": 3.3211, + "step": 8399 + }, + { + "epoch": 0.5214476379663542, + "grad_norm": 0.33215806446661916, + "learning_rate": 9.834962166315806e-05, + "loss": 3.3113, + "step": 8400 + }, + { + "epoch": 0.5215097150661121, + "grad_norm": 0.4813493396979574, + "learning_rate": 9.834870130228845e-05, + "loss": 3.3796, + "step": 8401 + }, + { + "epoch": 0.52157179216587, + "grad_norm": 0.37921796345167474, + "learning_rate": 9.834778068917193e-05, + "loss": 3.3521, + "step": 8402 + }, + { + "epoch": 0.5216338692656279, + "grad_norm": 0.4214696879828659, + "learning_rate": 9.83468598238133e-05, + "loss": 3.3277, + "step": 8403 + }, + { + "epoch": 0.5216959463653859, + "grad_norm": 0.5413035098488684, + "learning_rate": 9.834593870621735e-05, + "loss": 3.2886, + "step": 8404 + }, + { + "epoch": 0.5217580234651437, + "grad_norm": 0.3279921512515148, + "learning_rate": 9.834501733638892e-05, + "loss": 3.3805, + "step": 8405 + }, + { + "epoch": 0.5218201005649016, + "grad_norm": 0.6995297598040098, + "learning_rate": 9.83440957143328e-05, + "loss": 3.309, + "step": 8406 + }, + { + "epoch": 0.5218821776646595, + "grad_norm": 0.3334820075571935, + "learning_rate": 9.834317384005379e-05, + "loss": 3.2732, + "step": 8407 + }, + { + "epoch": 0.5219442547644174, + "grad_norm": 0.3456109673029508, + "learning_rate": 9.834225171355671e-05, + "loss": 3.2639, + "step": 8408 + }, + { + "epoch": 0.5220063318641753, + "grad_norm": 0.44773585159359136, + "learning_rate": 9.834132933484637e-05, + "loss": 3.268, + "step": 8409 + }, + { + "epoch": 0.5220684089639333, + "grad_norm": 0.3519113928708104, + "learning_rate": 9.834040670392757e-05, + "loss": 3.2447, + "step": 8410 + }, + { + "epoch": 0.5221304860636911, + "grad_norm": 0.3059690252121085, + "learning_rate": 9.833948382080514e-05, + "loss": 3.336, + "step": 8411 + }, + { + "epoch": 0.522192563163449, + "grad_norm": 0.3180438156936574, + "learning_rate": 9.833856068548389e-05, + "loss": 3.2723, + "step": 8412 + }, + { + "epoch": 0.5222546402632069, + "grad_norm": 0.27306259884944917, + "learning_rate": 9.833763729796863e-05, + "loss": 3.273, + "step": 8413 + }, + { + "epoch": 0.5223167173629648, + "grad_norm": 0.46433738568993854, + "learning_rate": 9.833671365826417e-05, + "loss": 3.292, + "step": 8414 + }, + { + "epoch": 0.5223787944627227, + "grad_norm": 0.3578720176537868, + "learning_rate": 9.833578976637536e-05, + "loss": 3.3179, + "step": 8415 + }, + { + "epoch": 0.5224408715624806, + "grad_norm": 0.3425220208676554, + "learning_rate": 9.8334865622307e-05, + "loss": 3.2391, + "step": 8416 + }, + { + "epoch": 0.5225029486622385, + "grad_norm": 0.5194027677290409, + "learning_rate": 9.83339412260639e-05, + "loss": 3.3187, + "step": 8417 + }, + { + "epoch": 0.5225650257619964, + "grad_norm": 0.3464212985936191, + "learning_rate": 9.833301657765091e-05, + "loss": 3.2508, + "step": 8418 + }, + { + "epoch": 0.5226271028617543, + "grad_norm": 0.26796366072290023, + "learning_rate": 9.833209167707283e-05, + "loss": 3.2825, + "step": 8419 + }, + { + "epoch": 0.5226891799615122, + "grad_norm": 0.41202726947364704, + "learning_rate": 9.833116652433449e-05, + "loss": 3.3381, + "step": 8420 + }, + { + "epoch": 0.5227512570612701, + "grad_norm": 0.39640038638952574, + "learning_rate": 9.833024111944071e-05, + "loss": 3.4389, + "step": 8421 + }, + { + "epoch": 0.522813334161028, + "grad_norm": 0.4114271908656016, + "learning_rate": 9.832931546239636e-05, + "loss": 3.2718, + "step": 8422 + }, + { + "epoch": 0.5228754112607858, + "grad_norm": 0.355753061805465, + "learning_rate": 9.832838955320622e-05, + "loss": 3.2893, + "step": 8423 + }, + { + "epoch": 0.5229374883605438, + "grad_norm": 0.3875177202479942, + "learning_rate": 9.832746339187515e-05, + "loss": 3.1859, + "step": 8424 + }, + { + "epoch": 0.5229995654603017, + "grad_norm": 0.2440564506381839, + "learning_rate": 9.832653697840797e-05, + "loss": 3.2399, + "step": 8425 + }, + { + "epoch": 0.5230616425600596, + "grad_norm": 0.2701746269272563, + "learning_rate": 9.83256103128095e-05, + "loss": 3.2017, + "step": 8426 + }, + { + "epoch": 0.5231237196598175, + "grad_norm": 0.23132880614340492, + "learning_rate": 9.832468339508461e-05, + "loss": 3.1958, + "step": 8427 + }, + { + "epoch": 0.5231857967595754, + "grad_norm": 0.2893757797873353, + "learning_rate": 9.83237562252381e-05, + "loss": 3.2615, + "step": 8428 + }, + { + "epoch": 0.5232478738593332, + "grad_norm": 0.3235601837490884, + "learning_rate": 9.832282880327483e-05, + "loss": 3.3502, + "step": 8429 + }, + { + "epoch": 0.5233099509590912, + "grad_norm": 0.3058866111155699, + "learning_rate": 9.832190112919963e-05, + "loss": 3.2382, + "step": 8430 + }, + { + "epoch": 0.5233720280588491, + "grad_norm": 0.25016125306482156, + "learning_rate": 9.832097320301734e-05, + "loss": 3.2708, + "step": 8431 + }, + { + "epoch": 0.523434105158607, + "grad_norm": 0.4164703820746819, + "learning_rate": 9.83200450247328e-05, + "loss": 3.3138, + "step": 8432 + }, + { + "epoch": 0.5234961822583649, + "grad_norm": 0.2511372181904532, + "learning_rate": 9.831911659435085e-05, + "loss": 3.2333, + "step": 8433 + }, + { + "epoch": 0.5235582593581228, + "grad_norm": 0.27587527382408833, + "learning_rate": 9.831818791187636e-05, + "loss": 3.3351, + "step": 8434 + }, + { + "epoch": 0.5236203364578806, + "grad_norm": 0.39068801371629575, + "learning_rate": 9.831725897731414e-05, + "loss": 3.3421, + "step": 8435 + }, + { + "epoch": 0.5236824135576386, + "grad_norm": 0.23592047566767496, + "learning_rate": 9.831632979066904e-05, + "loss": 3.2407, + "step": 8436 + }, + { + "epoch": 0.5237444906573965, + "grad_norm": 0.38953220096476643, + "learning_rate": 9.831540035194594e-05, + "loss": 3.2963, + "step": 8437 + }, + { + "epoch": 0.5238065677571544, + "grad_norm": 0.35138418125790377, + "learning_rate": 9.831447066114966e-05, + "loss": 3.3378, + "step": 8438 + }, + { + "epoch": 0.5238686448569123, + "grad_norm": 0.358138007981296, + "learning_rate": 9.831354071828505e-05, + "loss": 3.3586, + "step": 8439 + }, + { + "epoch": 0.5239307219566702, + "grad_norm": 0.3356960892999794, + "learning_rate": 9.831261052335698e-05, + "loss": 3.3808, + "step": 8440 + }, + { + "epoch": 0.523992799056428, + "grad_norm": 0.28553803895985375, + "learning_rate": 9.831168007637029e-05, + "loss": 3.2982, + "step": 8441 + }, + { + "epoch": 0.524054876156186, + "grad_norm": 0.41150266271728564, + "learning_rate": 9.831074937732983e-05, + "loss": 3.238, + "step": 8442 + }, + { + "epoch": 0.5241169532559439, + "grad_norm": 0.5548377303729122, + "learning_rate": 9.830981842624047e-05, + "loss": 3.3213, + "step": 8443 + }, + { + "epoch": 0.5241790303557018, + "grad_norm": 0.32995379662737795, + "learning_rate": 9.830888722310704e-05, + "loss": 3.3514, + "step": 8444 + }, + { + "epoch": 0.5242411074554597, + "grad_norm": 0.30357362642213853, + "learning_rate": 9.830795576793446e-05, + "loss": 3.2928, + "step": 8445 + }, + { + "epoch": 0.5243031845552176, + "grad_norm": 0.25791840834381335, + "learning_rate": 9.830702406072751e-05, + "loss": 3.3331, + "step": 8446 + }, + { + "epoch": 0.5243652616549754, + "grad_norm": 0.30175755324861814, + "learning_rate": 9.830609210149111e-05, + "loss": 3.249, + "step": 8447 + }, + { + "epoch": 0.5244273387547334, + "grad_norm": 0.23298086536561047, + "learning_rate": 9.830515989023009e-05, + "loss": 3.2941, + "step": 8448 + }, + { + "epoch": 0.5244894158544913, + "grad_norm": 0.2528790603694502, + "learning_rate": 9.830422742694933e-05, + "loss": 3.2322, + "step": 8449 + }, + { + "epoch": 0.5245514929542492, + "grad_norm": 0.23423580963884286, + "learning_rate": 9.830329471165366e-05, + "loss": 3.2507, + "step": 8450 + }, + { + "epoch": 0.5246135700540071, + "grad_norm": 0.1793092291238802, + "learning_rate": 9.830236174434801e-05, + "loss": 3.2782, + "step": 8451 + }, + { + "epoch": 0.524675647153765, + "grad_norm": 0.24456033107176978, + "learning_rate": 9.830142852503721e-05, + "loss": 3.2211, + "step": 8452 + }, + { + "epoch": 0.5247377242535228, + "grad_norm": 0.2145885252015854, + "learning_rate": 9.830049505372613e-05, + "loss": 3.2369, + "step": 8453 + }, + { + "epoch": 0.5247998013532807, + "grad_norm": 0.2092150814056685, + "learning_rate": 9.829956133041965e-05, + "loss": 3.2299, + "step": 8454 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.2781716870116209, + "learning_rate": 9.82986273551226e-05, + "loss": 3.274, + "step": 8455 + }, + { + "epoch": 0.5249239555527966, + "grad_norm": 0.21986708492131218, + "learning_rate": 9.829769312783992e-05, + "loss": 3.2655, + "step": 8456 + }, + { + "epoch": 0.5249860326525545, + "grad_norm": 0.21365760741640244, + "learning_rate": 9.829675864857644e-05, + "loss": 3.2187, + "step": 8457 + }, + { + "epoch": 0.5250481097523124, + "grad_norm": 0.26637109988833346, + "learning_rate": 9.829582391733703e-05, + "loss": 3.3349, + "step": 8458 + }, + { + "epoch": 0.5251101868520702, + "grad_norm": 0.16489824235047654, + "learning_rate": 9.829488893412659e-05, + "loss": 3.3073, + "step": 8459 + }, + { + "epoch": 0.5251722639518281, + "grad_norm": 0.2849680919938722, + "learning_rate": 9.829395369895e-05, + "loss": 3.2768, + "step": 8460 + }, + { + "epoch": 0.5252343410515861, + "grad_norm": 0.2250157618962278, + "learning_rate": 9.829301821181213e-05, + "loss": 3.2254, + "step": 8461 + }, + { + "epoch": 0.525296418151344, + "grad_norm": 0.26995281754277656, + "learning_rate": 9.829208247271786e-05, + "loss": 3.1795, + "step": 8462 + }, + { + "epoch": 0.5253584952511019, + "grad_norm": 0.19014034297175184, + "learning_rate": 9.829114648167207e-05, + "loss": 3.2268, + "step": 8463 + }, + { + "epoch": 0.5254205723508598, + "grad_norm": 0.21156332990632906, + "learning_rate": 9.829021023867964e-05, + "loss": 3.2197, + "step": 8464 + }, + { + "epoch": 0.5254826494506176, + "grad_norm": 0.24573890140527171, + "learning_rate": 9.828927374374547e-05, + "loss": 3.2119, + "step": 8465 + }, + { + "epoch": 0.5255447265503755, + "grad_norm": 0.2315090751537565, + "learning_rate": 9.828833699687443e-05, + "loss": 3.2997, + "step": 8466 + }, + { + "epoch": 0.5256068036501335, + "grad_norm": 0.20441694396481352, + "learning_rate": 9.828739999807141e-05, + "loss": 3.303, + "step": 8467 + }, + { + "epoch": 0.5256688807498914, + "grad_norm": 0.1816207710818694, + "learning_rate": 9.828646274734131e-05, + "loss": 3.3545, + "step": 8468 + }, + { + "epoch": 0.5257309578496493, + "grad_norm": 0.19144513260747884, + "learning_rate": 9.828552524468901e-05, + "loss": 3.2443, + "step": 8469 + }, + { + "epoch": 0.5257930349494072, + "grad_norm": 0.17246740832345203, + "learning_rate": 9.82845874901194e-05, + "loss": 3.2527, + "step": 8470 + }, + { + "epoch": 0.525855112049165, + "grad_norm": 0.2487168462982156, + "learning_rate": 9.828364948363737e-05, + "loss": 3.1832, + "step": 8471 + }, + { + "epoch": 0.5259171891489229, + "grad_norm": 0.26634251232009587, + "learning_rate": 9.828271122524784e-05, + "loss": 3.4142, + "step": 8472 + }, + { + "epoch": 0.5259792662486809, + "grad_norm": 0.2093002659237125, + "learning_rate": 9.828177271495566e-05, + "loss": 3.2931, + "step": 8473 + }, + { + "epoch": 0.5260413433484388, + "grad_norm": 0.2690778872335646, + "learning_rate": 9.828083395276577e-05, + "loss": 3.2328, + "step": 8474 + }, + { + "epoch": 0.5261034204481967, + "grad_norm": 0.2432353439948884, + "learning_rate": 9.827989493868304e-05, + "loss": 3.3059, + "step": 8475 + }, + { + "epoch": 0.5261654975479546, + "grad_norm": 0.2853698583608409, + "learning_rate": 9.827895567271236e-05, + "loss": 3.3405, + "step": 8476 + }, + { + "epoch": 0.5262275746477124, + "grad_norm": 0.2666461194710416, + "learning_rate": 9.827801615485867e-05, + "loss": 3.3774, + "step": 8477 + }, + { + "epoch": 0.5262896517474703, + "grad_norm": 0.2734748269897717, + "learning_rate": 9.827707638512687e-05, + "loss": 3.3157, + "step": 8478 + }, + { + "epoch": 0.5263517288472283, + "grad_norm": 0.2456184671955242, + "learning_rate": 9.827613636352181e-05, + "loss": 3.3497, + "step": 8479 + }, + { + "epoch": 0.5264138059469862, + "grad_norm": 0.20264409063327252, + "learning_rate": 9.827519609004843e-05, + "loss": 3.3741, + "step": 8480 + }, + { + "epoch": 0.5264758830467441, + "grad_norm": 0.21736204522835723, + "learning_rate": 9.827425556471163e-05, + "loss": 3.3657, + "step": 8481 + }, + { + "epoch": 0.526537960146502, + "grad_norm": 0.19787080865097104, + "learning_rate": 9.827331478751634e-05, + "loss": 3.308, + "step": 8482 + }, + { + "epoch": 0.5266000372462598, + "grad_norm": 0.19315979030038855, + "learning_rate": 9.827237375846744e-05, + "loss": 3.2608, + "step": 8483 + }, + { + "epoch": 0.5266621143460177, + "grad_norm": 0.18633471423320122, + "learning_rate": 9.827143247756985e-05, + "loss": 3.2512, + "step": 8484 + }, + { + "epoch": 0.5267241914457756, + "grad_norm": 0.22991283904507784, + "learning_rate": 9.827049094482846e-05, + "loss": 3.3589, + "step": 8485 + }, + { + "epoch": 0.5267862685455336, + "grad_norm": 0.18762005498991335, + "learning_rate": 9.826954916024822e-05, + "loss": 3.3476, + "step": 8486 + }, + { + "epoch": 0.5268483456452915, + "grad_norm": 0.26128984293491075, + "learning_rate": 9.826860712383401e-05, + "loss": 3.2468, + "step": 8487 + }, + { + "epoch": 0.5269104227450494, + "grad_norm": 0.17924857434366875, + "learning_rate": 9.826766483559078e-05, + "loss": 3.2763, + "step": 8488 + }, + { + "epoch": 0.5269724998448072, + "grad_norm": 0.19713772508251473, + "learning_rate": 9.826672229552341e-05, + "loss": 3.3562, + "step": 8489 + }, + { + "epoch": 0.5270345769445651, + "grad_norm": 0.1962885405879887, + "learning_rate": 9.826577950363682e-05, + "loss": 3.1734, + "step": 8490 + }, + { + "epoch": 0.527096654044323, + "grad_norm": 0.3309505039898518, + "learning_rate": 9.826483645993596e-05, + "loss": 3.248, + "step": 8491 + }, + { + "epoch": 0.527158731144081, + "grad_norm": 0.18254893858446136, + "learning_rate": 9.826389316442574e-05, + "loss": 3.2837, + "step": 8492 + }, + { + "epoch": 0.5272208082438389, + "grad_norm": 0.2032804902698063, + "learning_rate": 9.826294961711105e-05, + "loss": 3.2954, + "step": 8493 + }, + { + "epoch": 0.5272828853435968, + "grad_norm": 0.287822965437889, + "learning_rate": 9.826200581799684e-05, + "loss": 3.2367, + "step": 8494 + }, + { + "epoch": 0.5273449624433546, + "grad_norm": 0.25501816177672415, + "learning_rate": 9.826106176708805e-05, + "loss": 3.3351, + "step": 8495 + }, + { + "epoch": 0.5274070395431125, + "grad_norm": 0.19632384717950865, + "learning_rate": 9.826011746438957e-05, + "loss": 3.2237, + "step": 8496 + }, + { + "epoch": 0.5274691166428704, + "grad_norm": 0.21771888733946146, + "learning_rate": 9.825917290990633e-05, + "loss": 3.1424, + "step": 8497 + }, + { + "epoch": 0.5275311937426284, + "grad_norm": 0.21140411653591076, + "learning_rate": 9.82582281036433e-05, + "loss": 3.2395, + "step": 8498 + }, + { + "epoch": 0.5275932708423863, + "grad_norm": 0.2391261692282751, + "learning_rate": 9.825728304560535e-05, + "loss": 3.3286, + "step": 8499 + }, + { + "epoch": 0.5276553479421442, + "grad_norm": 0.2058600460092224, + "learning_rate": 9.825633773579746e-05, + "loss": 3.2432, + "step": 8500 + }, + { + "epoch": 0.527717425041902, + "grad_norm": 0.2016069466726166, + "learning_rate": 9.825539217422452e-05, + "loss": 3.2999, + "step": 8501 + }, + { + "epoch": 0.5277795021416599, + "grad_norm": 0.3056650253134484, + "learning_rate": 9.82544463608915e-05, + "loss": 3.3059, + "step": 8502 + }, + { + "epoch": 0.5278415792414178, + "grad_norm": 0.23642021150579207, + "learning_rate": 9.825350029580332e-05, + "loss": 3.2935, + "step": 8503 + }, + { + "epoch": 0.5279036563411758, + "grad_norm": 0.1894971068936435, + "learning_rate": 9.825255397896491e-05, + "loss": 3.2705, + "step": 8504 + }, + { + "epoch": 0.5279657334409337, + "grad_norm": 0.22486292584305928, + "learning_rate": 9.825160741038121e-05, + "loss": 3.2259, + "step": 8505 + }, + { + "epoch": 0.5280278105406916, + "grad_norm": 0.2253910808459454, + "learning_rate": 9.825066059005716e-05, + "loss": 3.2693, + "step": 8506 + }, + { + "epoch": 0.5280898876404494, + "grad_norm": 0.21579536880719916, + "learning_rate": 9.82497135179977e-05, + "loss": 3.2566, + "step": 8507 + }, + { + "epoch": 0.5281519647402073, + "grad_norm": 0.215916366260643, + "learning_rate": 9.824876619420777e-05, + "loss": 3.2633, + "step": 8508 + }, + { + "epoch": 0.5282140418399652, + "grad_norm": 0.2322286074767295, + "learning_rate": 9.824781861869232e-05, + "loss": 3.4164, + "step": 8509 + }, + { + "epoch": 0.5282761189397231, + "grad_norm": 0.24442034147079372, + "learning_rate": 9.82468707914563e-05, + "loss": 3.1943, + "step": 8510 + }, + { + "epoch": 0.5283381960394811, + "grad_norm": 0.20714544349850367, + "learning_rate": 9.824592271250462e-05, + "loss": 3.1906, + "step": 8511 + }, + { + "epoch": 0.528400273139239, + "grad_norm": 0.28477729804835017, + "learning_rate": 9.824497438184226e-05, + "loss": 3.2528, + "step": 8512 + }, + { + "epoch": 0.5284623502389968, + "grad_norm": 0.1877105444722517, + "learning_rate": 9.824402579947416e-05, + "loss": 3.2413, + "step": 8513 + }, + { + "epoch": 0.5285244273387547, + "grad_norm": 0.2772063734430703, + "learning_rate": 9.824307696540527e-05, + "loss": 3.3004, + "step": 8514 + }, + { + "epoch": 0.5285865044385126, + "grad_norm": 0.22061134219108147, + "learning_rate": 9.824212787964051e-05, + "loss": 3.3161, + "step": 8515 + }, + { + "epoch": 0.5286485815382705, + "grad_norm": 0.5791728601421839, + "learning_rate": 9.824117854218489e-05, + "loss": 3.2345, + "step": 8516 + }, + { + "epoch": 0.5287106586380285, + "grad_norm": 0.24886816831415529, + "learning_rate": 9.82402289530433e-05, + "loss": 3.3713, + "step": 8517 + }, + { + "epoch": 0.5287727357377864, + "grad_norm": 0.2145324792986474, + "learning_rate": 9.823927911222075e-05, + "loss": 3.1281, + "step": 8518 + }, + { + "epoch": 0.5288348128375442, + "grad_norm": 0.21846015494802987, + "learning_rate": 9.823832901972217e-05, + "loss": 3.3238, + "step": 8519 + }, + { + "epoch": 0.5288968899373021, + "grad_norm": 0.22882508248194622, + "learning_rate": 9.823737867555249e-05, + "loss": 3.4203, + "step": 8520 + }, + { + "epoch": 0.52895896703706, + "grad_norm": 2.5962792439620537, + "learning_rate": 9.823642807971671e-05, + "loss": 3.2384, + "step": 8521 + }, + { + "epoch": 0.5290210441368179, + "grad_norm": 0.37563282499874456, + "learning_rate": 9.82354772322198e-05, + "loss": 3.3823, + "step": 8522 + }, + { + "epoch": 0.5290831212365759, + "grad_norm": 0.3985838527789045, + "learning_rate": 9.823452613306666e-05, + "loss": 3.3253, + "step": 8523 + }, + { + "epoch": 0.5291451983363338, + "grad_norm": 0.2943041186229865, + "learning_rate": 9.82335747822623e-05, + "loss": 3.2855, + "step": 8524 + }, + { + "epoch": 0.5292072754360916, + "grad_norm": 0.30757013596867805, + "learning_rate": 9.823262317981167e-05, + "loss": 3.2873, + "step": 8525 + }, + { + "epoch": 0.5292693525358495, + "grad_norm": 0.46729470684691266, + "learning_rate": 9.823167132571975e-05, + "loss": 3.1609, + "step": 8526 + }, + { + "epoch": 0.5293314296356074, + "grad_norm": 0.3204541358424028, + "learning_rate": 9.823071921999148e-05, + "loss": 3.2865, + "step": 8527 + }, + { + "epoch": 0.5293935067353653, + "grad_norm": 0.3730213496078474, + "learning_rate": 9.822976686263184e-05, + "loss": 3.2955, + "step": 8528 + }, + { + "epoch": 0.5294555838351233, + "grad_norm": 0.32907931878054636, + "learning_rate": 9.82288142536458e-05, + "loss": 3.2533, + "step": 8529 + }, + { + "epoch": 0.5295176609348812, + "grad_norm": 0.28252714313652544, + "learning_rate": 9.822786139303833e-05, + "loss": 3.2448, + "step": 8530 + }, + { + "epoch": 0.529579738034639, + "grad_norm": 0.2862536725179305, + "learning_rate": 9.822690828081439e-05, + "loss": 3.3084, + "step": 8531 + }, + { + "epoch": 0.5296418151343969, + "grad_norm": 0.22382378232780867, + "learning_rate": 9.822595491697898e-05, + "loss": 3.3271, + "step": 8532 + }, + { + "epoch": 0.5297038922341548, + "grad_norm": 0.2446293707136972, + "learning_rate": 9.822500130153703e-05, + "loss": 3.3385, + "step": 8533 + }, + { + "epoch": 0.5297659693339127, + "grad_norm": 0.3341682775479221, + "learning_rate": 9.822404743449356e-05, + "loss": 3.2421, + "step": 8534 + }, + { + "epoch": 0.5298280464336707, + "grad_norm": 0.2968692778989564, + "learning_rate": 9.822309331585352e-05, + "loss": 3.3969, + "step": 8535 + }, + { + "epoch": 0.5298901235334286, + "grad_norm": 0.3121062778705954, + "learning_rate": 9.82221389456219e-05, + "loss": 3.3434, + "step": 8536 + }, + { + "epoch": 0.5299522006331864, + "grad_norm": 0.4075370172350384, + "learning_rate": 9.822118432380368e-05, + "loss": 3.3387, + "step": 8537 + }, + { + "epoch": 0.5300142777329443, + "grad_norm": 0.3359634367223109, + "learning_rate": 9.822022945040382e-05, + "loss": 3.214, + "step": 8538 + }, + { + "epoch": 0.5300763548327022, + "grad_norm": 0.26344541078502715, + "learning_rate": 9.821927432542733e-05, + "loss": 3.2494, + "step": 8539 + }, + { + "epoch": 0.5301384319324601, + "grad_norm": 0.3126207952685647, + "learning_rate": 9.821831894887918e-05, + "loss": 3.3174, + "step": 8540 + }, + { + "epoch": 0.530200509032218, + "grad_norm": 0.3420346223395683, + "learning_rate": 9.821736332076434e-05, + "loss": 3.2929, + "step": 8541 + }, + { + "epoch": 0.530262586131976, + "grad_norm": 0.2465591646677173, + "learning_rate": 9.821640744108782e-05, + "loss": 3.2367, + "step": 8542 + }, + { + "epoch": 0.5303246632317338, + "grad_norm": 0.32572872644111467, + "learning_rate": 9.821545130985461e-05, + "loss": 3.307, + "step": 8543 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.31960653803127387, + "learning_rate": 9.821449492706967e-05, + "loss": 3.3435, + "step": 8544 + }, + { + "epoch": 0.5304488174312496, + "grad_norm": 0.40685518743909904, + "learning_rate": 9.821353829273801e-05, + "loss": 3.3203, + "step": 8545 + }, + { + "epoch": 0.5305108945310075, + "grad_norm": 0.2805349744643214, + "learning_rate": 9.821258140686462e-05, + "loss": 3.35, + "step": 8546 + }, + { + "epoch": 0.5305729716307654, + "grad_norm": 0.27554452261854817, + "learning_rate": 9.821162426945447e-05, + "loss": 3.3677, + "step": 8547 + }, + { + "epoch": 0.5306350487305234, + "grad_norm": 0.24382763445461691, + "learning_rate": 9.821066688051259e-05, + "loss": 3.2991, + "step": 8548 + }, + { + "epoch": 0.5306971258302812, + "grad_norm": 0.26293027218912485, + "learning_rate": 9.820970924004395e-05, + "loss": 3.1577, + "step": 8549 + }, + { + "epoch": 0.5307592029300391, + "grad_norm": 0.23131192061365352, + "learning_rate": 9.820875134805356e-05, + "loss": 3.2413, + "step": 8550 + }, + { + "epoch": 0.530821280029797, + "grad_norm": 0.2294250633606912, + "learning_rate": 9.820779320454643e-05, + "loss": 3.2706, + "step": 8551 + }, + { + "epoch": 0.5308833571295549, + "grad_norm": 0.22662775152359252, + "learning_rate": 9.820683480952751e-05, + "loss": 3.2672, + "step": 8552 + }, + { + "epoch": 0.5309454342293128, + "grad_norm": 0.22900155560358654, + "learning_rate": 9.820587616300184e-05, + "loss": 3.2658, + "step": 8553 + }, + { + "epoch": 0.5310075113290708, + "grad_norm": 0.28161922696288083, + "learning_rate": 9.820491726497441e-05, + "loss": 3.2959, + "step": 8554 + }, + { + "epoch": 0.5310695884288286, + "grad_norm": 0.24049265088978927, + "learning_rate": 9.820395811545023e-05, + "loss": 3.1839, + "step": 8555 + }, + { + "epoch": 0.5311316655285865, + "grad_norm": 0.28309391152094837, + "learning_rate": 9.820299871443429e-05, + "loss": 3.1548, + "step": 8556 + }, + { + "epoch": 0.5311937426283444, + "grad_norm": 0.2533725999267756, + "learning_rate": 9.820203906193162e-05, + "loss": 3.1771, + "step": 8557 + }, + { + "epoch": 0.5312558197281023, + "grad_norm": 0.20916290322720113, + "learning_rate": 9.820107915794718e-05, + "loss": 3.3222, + "step": 8558 + }, + { + "epoch": 0.5313178968278602, + "grad_norm": 0.21155367148219578, + "learning_rate": 9.820011900248604e-05, + "loss": 3.2959, + "step": 8559 + }, + { + "epoch": 0.5313799739276182, + "grad_norm": 0.5873789287158924, + "learning_rate": 9.819915859555317e-05, + "loss": 3.2305, + "step": 8560 + }, + { + "epoch": 0.531442051027376, + "grad_norm": 0.26555634060021627, + "learning_rate": 9.819819793715356e-05, + "loss": 3.342, + "step": 8561 + }, + { + "epoch": 0.5315041281271339, + "grad_norm": 0.36362543516372337, + "learning_rate": 9.819723702729229e-05, + "loss": 3.3462, + "step": 8562 + }, + { + "epoch": 0.5315662052268918, + "grad_norm": 0.31906703394658337, + "learning_rate": 9.819627586597433e-05, + "loss": 3.2637, + "step": 8563 + }, + { + "epoch": 0.5316282823266497, + "grad_norm": 0.23415022243906258, + "learning_rate": 9.819531445320469e-05, + "loss": 3.3397, + "step": 8564 + }, + { + "epoch": 0.5316903594264076, + "grad_norm": 0.242080695689983, + "learning_rate": 9.819435278898839e-05, + "loss": 3.3747, + "step": 8565 + }, + { + "epoch": 0.5317524365261656, + "grad_norm": 0.3068008368780057, + "learning_rate": 9.819339087333045e-05, + "loss": 3.3236, + "step": 8566 + }, + { + "epoch": 0.5318145136259234, + "grad_norm": 0.22217590820881913, + "learning_rate": 9.81924287062359e-05, + "loss": 3.2538, + "step": 8567 + }, + { + "epoch": 0.5318765907256813, + "grad_norm": 0.2622294638359714, + "learning_rate": 9.819146628770975e-05, + "loss": 3.4258, + "step": 8568 + }, + { + "epoch": 0.5319386678254392, + "grad_norm": 0.29457650583188755, + "learning_rate": 9.819050361775701e-05, + "loss": 3.3061, + "step": 8569 + }, + { + "epoch": 0.5320007449251971, + "grad_norm": 0.2346296595856095, + "learning_rate": 9.818954069638272e-05, + "loss": 3.1629, + "step": 8570 + }, + { + "epoch": 0.532062822024955, + "grad_norm": 0.23093540824335124, + "learning_rate": 9.81885775235919e-05, + "loss": 3.4037, + "step": 8571 + }, + { + "epoch": 0.532124899124713, + "grad_norm": 0.30460676933244457, + "learning_rate": 9.818761409938957e-05, + "loss": 3.2586, + "step": 8572 + }, + { + "epoch": 0.5321869762244708, + "grad_norm": 0.28852206047642404, + "learning_rate": 9.818665042378076e-05, + "loss": 3.3229, + "step": 8573 + }, + { + "epoch": 0.5322490533242287, + "grad_norm": 0.2746230717549196, + "learning_rate": 9.81856864967705e-05, + "loss": 3.3814, + "step": 8574 + }, + { + "epoch": 0.5323111304239866, + "grad_norm": 0.3115352401171745, + "learning_rate": 9.818472231836382e-05, + "loss": 3.3011, + "step": 8575 + }, + { + "epoch": 0.5323732075237445, + "grad_norm": 0.23810625685870482, + "learning_rate": 9.818375788856574e-05, + "loss": 3.3577, + "step": 8576 + }, + { + "epoch": 0.5324352846235024, + "grad_norm": 0.31647500127142514, + "learning_rate": 9.818279320738129e-05, + "loss": 3.3576, + "step": 8577 + }, + { + "epoch": 0.5324973617232603, + "grad_norm": 0.218694160350735, + "learning_rate": 9.818182827481553e-05, + "loss": 3.1955, + "step": 8578 + }, + { + "epoch": 0.5325594388230181, + "grad_norm": 0.24758196237033214, + "learning_rate": 9.818086309087347e-05, + "loss": 3.4033, + "step": 8579 + }, + { + "epoch": 0.5326215159227761, + "grad_norm": 0.3149975512235312, + "learning_rate": 9.817989765556014e-05, + "loss": 3.2703, + "step": 8580 + }, + { + "epoch": 0.532683593022534, + "grad_norm": 0.2634374211945874, + "learning_rate": 9.817893196888059e-05, + "loss": 3.2592, + "step": 8581 + }, + { + "epoch": 0.5327456701222919, + "grad_norm": 0.28563675829100926, + "learning_rate": 9.817796603083987e-05, + "loss": 3.2654, + "step": 8582 + }, + { + "epoch": 0.5328077472220498, + "grad_norm": 0.28085735516624327, + "learning_rate": 9.8176999841443e-05, + "loss": 3.2872, + "step": 8583 + }, + { + "epoch": 0.5328698243218077, + "grad_norm": 0.19298361863834418, + "learning_rate": 9.817603340069501e-05, + "loss": 3.2497, + "step": 8584 + }, + { + "epoch": 0.5329319014215655, + "grad_norm": 0.23026025360649277, + "learning_rate": 9.817506670860097e-05, + "loss": 3.3422, + "step": 8585 + }, + { + "epoch": 0.5329939785213235, + "grad_norm": 0.26172831503626737, + "learning_rate": 9.817409976516592e-05, + "loss": 3.3357, + "step": 8586 + }, + { + "epoch": 0.5330560556210814, + "grad_norm": 0.2807784896485637, + "learning_rate": 9.817313257039488e-05, + "loss": 3.179, + "step": 8587 + }, + { + "epoch": 0.5331181327208393, + "grad_norm": 0.29321026066371664, + "learning_rate": 9.817216512429292e-05, + "loss": 3.2445, + "step": 8588 + }, + { + "epoch": 0.5331802098205972, + "grad_norm": 0.28134034613050474, + "learning_rate": 9.817119742686509e-05, + "loss": 3.2611, + "step": 8589 + }, + { + "epoch": 0.5332422869203551, + "grad_norm": 0.22674717446935702, + "learning_rate": 9.817022947811643e-05, + "loss": 3.1757, + "step": 8590 + }, + { + "epoch": 0.5333043640201129, + "grad_norm": 0.23651988548931818, + "learning_rate": 9.816926127805197e-05, + "loss": 3.0623, + "step": 8591 + }, + { + "epoch": 0.5333664411198709, + "grad_norm": 0.21197595000267289, + "learning_rate": 9.81682928266768e-05, + "loss": 3.2538, + "step": 8592 + }, + { + "epoch": 0.5334285182196288, + "grad_norm": 0.21596913794624945, + "learning_rate": 9.816732412399594e-05, + "loss": 3.3096, + "step": 8593 + }, + { + "epoch": 0.5334905953193867, + "grad_norm": 0.195234025448856, + "learning_rate": 9.816635517001447e-05, + "loss": 3.3778, + "step": 8594 + }, + { + "epoch": 0.5335526724191446, + "grad_norm": 0.3583780225713472, + "learning_rate": 9.816538596473742e-05, + "loss": 3.2915, + "step": 8595 + }, + { + "epoch": 0.5336147495189025, + "grad_norm": 0.21634535982931774, + "learning_rate": 9.816441650816986e-05, + "loss": 3.3429, + "step": 8596 + }, + { + "epoch": 0.5336768266186603, + "grad_norm": 0.16867249020053043, + "learning_rate": 9.816344680031687e-05, + "loss": 3.2903, + "step": 8597 + }, + { + "epoch": 0.5337389037184183, + "grad_norm": 0.369468019007389, + "learning_rate": 9.816247684118345e-05, + "loss": 3.146, + "step": 8598 + }, + { + "epoch": 0.5338009808181762, + "grad_norm": 0.1962165888512229, + "learning_rate": 9.816150663077471e-05, + "loss": 3.2921, + "step": 8599 + }, + { + "epoch": 0.5338630579179341, + "grad_norm": 0.22031325955631997, + "learning_rate": 9.81605361690957e-05, + "loss": 3.241, + "step": 8600 + }, + { + "epoch": 0.533925135017692, + "grad_norm": 0.23963292358473468, + "learning_rate": 9.815956545615149e-05, + "loss": 3.2241, + "step": 8601 + }, + { + "epoch": 0.5339872121174499, + "grad_norm": 0.20427813945512802, + "learning_rate": 9.815859449194712e-05, + "loss": 3.223, + "step": 8602 + }, + { + "epoch": 0.5340492892172077, + "grad_norm": 0.22717419623418195, + "learning_rate": 9.815762327648767e-05, + "loss": 3.3213, + "step": 8603 + }, + { + "epoch": 0.5341113663169657, + "grad_norm": 0.19663278651234475, + "learning_rate": 9.815665180977823e-05, + "loss": 3.2998, + "step": 8604 + }, + { + "epoch": 0.5341734434167236, + "grad_norm": 0.25483597949990816, + "learning_rate": 9.815568009182384e-05, + "loss": 3.2564, + "step": 8605 + }, + { + "epoch": 0.5342355205164815, + "grad_norm": 0.31941105603602404, + "learning_rate": 9.815470812262957e-05, + "loss": 3.331, + "step": 8606 + }, + { + "epoch": 0.5342975976162394, + "grad_norm": 0.24571096835330739, + "learning_rate": 9.81537359022005e-05, + "loss": 3.2885, + "step": 8607 + }, + { + "epoch": 0.5343596747159973, + "grad_norm": 0.19900141810386335, + "learning_rate": 9.815276343054169e-05, + "loss": 3.2912, + "step": 8608 + }, + { + "epoch": 0.5344217518157551, + "grad_norm": 0.3144311008677989, + "learning_rate": 9.815179070765825e-05, + "loss": 3.1768, + "step": 8609 + }, + { + "epoch": 0.534483828915513, + "grad_norm": 0.2682616559817668, + "learning_rate": 9.81508177335552e-05, + "loss": 3.3265, + "step": 8610 + }, + { + "epoch": 0.534545906015271, + "grad_norm": 0.26688121618674143, + "learning_rate": 9.814984450823766e-05, + "loss": 3.3199, + "step": 8611 + }, + { + "epoch": 0.5346079831150289, + "grad_norm": 0.28761005212983154, + "learning_rate": 9.814887103171069e-05, + "loss": 3.3031, + "step": 8612 + }, + { + "epoch": 0.5346700602147868, + "grad_norm": 0.3226390524615317, + "learning_rate": 9.814789730397938e-05, + "loss": 3.2926, + "step": 8613 + }, + { + "epoch": 0.5347321373145447, + "grad_norm": 0.25311628844729267, + "learning_rate": 9.814692332504877e-05, + "loss": 3.3541, + "step": 8614 + }, + { + "epoch": 0.5347942144143025, + "grad_norm": 0.29469423160975183, + "learning_rate": 9.8145949094924e-05, + "loss": 3.3308, + "step": 8615 + }, + { + "epoch": 0.5348562915140604, + "grad_norm": 0.219309117825167, + "learning_rate": 9.814497461361013e-05, + "loss": 3.2476, + "step": 8616 + }, + { + "epoch": 0.5349183686138184, + "grad_norm": 0.303705197000989, + "learning_rate": 9.814399988111223e-05, + "loss": 3.2804, + "step": 8617 + }, + { + "epoch": 0.5349804457135763, + "grad_norm": 0.23898831974351648, + "learning_rate": 9.814302489743539e-05, + "loss": 3.3062, + "step": 8618 + }, + { + "epoch": 0.5350425228133342, + "grad_norm": 0.28804970206271924, + "learning_rate": 9.814204966258471e-05, + "loss": 3.3127, + "step": 8619 + }, + { + "epoch": 0.535104599913092, + "grad_norm": 0.2719384793461406, + "learning_rate": 9.814107417656528e-05, + "loss": 3.3272, + "step": 8620 + }, + { + "epoch": 0.5351666770128499, + "grad_norm": 0.4099493140489758, + "learning_rate": 9.814009843938217e-05, + "loss": 3.2944, + "step": 8621 + }, + { + "epoch": 0.5352287541126078, + "grad_norm": 0.28267735313390185, + "learning_rate": 9.813912245104047e-05, + "loss": 3.1776, + "step": 8622 + }, + { + "epoch": 0.5352908312123658, + "grad_norm": 0.2847046476783689, + "learning_rate": 9.813814621154529e-05, + "loss": 3.2877, + "step": 8623 + }, + { + "epoch": 0.5353529083121237, + "grad_norm": 0.22133450794811932, + "learning_rate": 9.81371697209017e-05, + "loss": 3.204, + "step": 8624 + }, + { + "epoch": 0.5354149854118816, + "grad_norm": 0.30639829597089563, + "learning_rate": 9.813619297911482e-05, + "loss": 3.2667, + "step": 8625 + }, + { + "epoch": 0.5354770625116394, + "grad_norm": 0.31076159941092163, + "learning_rate": 9.813521598618976e-05, + "loss": 3.2701, + "step": 8626 + }, + { + "epoch": 0.5355391396113973, + "grad_norm": 0.2880676910311735, + "learning_rate": 9.813423874213157e-05, + "loss": 3.3039, + "step": 8627 + }, + { + "epoch": 0.5356012167111552, + "grad_norm": 0.25429354253832603, + "learning_rate": 9.813326124694537e-05, + "loss": 3.2942, + "step": 8628 + }, + { + "epoch": 0.5356632938109132, + "grad_norm": 0.2827597205298167, + "learning_rate": 9.813228350063627e-05, + "loss": 3.2995, + "step": 8629 + }, + { + "epoch": 0.5357253709106711, + "grad_norm": 0.2530165121924795, + "learning_rate": 9.813130550320936e-05, + "loss": 3.3248, + "step": 8630 + }, + { + "epoch": 0.535787448010429, + "grad_norm": 0.25228200930742556, + "learning_rate": 9.813032725466974e-05, + "loss": 3.183, + "step": 8631 + }, + { + "epoch": 0.5358495251101868, + "grad_norm": 0.25505329145556044, + "learning_rate": 9.812934875502252e-05, + "loss": 3.2466, + "step": 8632 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.2385856997287854, + "learning_rate": 9.81283700042728e-05, + "loss": 3.2926, + "step": 8633 + }, + { + "epoch": 0.5359736793097026, + "grad_norm": 0.23143538119741838, + "learning_rate": 9.81273910024257e-05, + "loss": 3.2743, + "step": 8634 + }, + { + "epoch": 0.5360357564094606, + "grad_norm": 0.2324205286323596, + "learning_rate": 9.812641174948632e-05, + "loss": 3.279, + "step": 8635 + }, + { + "epoch": 0.5360978335092185, + "grad_norm": 0.29400392119150964, + "learning_rate": 9.812543224545976e-05, + "loss": 3.1689, + "step": 8636 + }, + { + "epoch": 0.5361599106089764, + "grad_norm": 0.31036263524751245, + "learning_rate": 9.812445249035113e-05, + "loss": 3.3457, + "step": 8637 + }, + { + "epoch": 0.5362219877087342, + "grad_norm": 0.4423591657975383, + "learning_rate": 9.812347248416556e-05, + "loss": 3.306, + "step": 8638 + }, + { + "epoch": 0.5362840648084921, + "grad_norm": 0.2716592184510384, + "learning_rate": 9.812249222690815e-05, + "loss": 3.2972, + "step": 8639 + }, + { + "epoch": 0.53634614190825, + "grad_norm": 0.2355932680310522, + "learning_rate": 9.812151171858402e-05, + "loss": 3.3186, + "step": 8640 + }, + { + "epoch": 0.536408219008008, + "grad_norm": 0.24722256699876163, + "learning_rate": 9.812053095919827e-05, + "loss": 3.2167, + "step": 8641 + }, + { + "epoch": 0.5364702961077659, + "grad_norm": 0.21860109949470197, + "learning_rate": 9.811954994875602e-05, + "loss": 3.1989, + "step": 8642 + }, + { + "epoch": 0.5365323732075238, + "grad_norm": 0.6279098721191234, + "learning_rate": 9.811856868726243e-05, + "loss": 3.3405, + "step": 8643 + }, + { + "epoch": 0.5365944503072816, + "grad_norm": 0.2560287380963638, + "learning_rate": 9.811758717472256e-05, + "loss": 3.3004, + "step": 8644 + }, + { + "epoch": 0.5366565274070395, + "grad_norm": 0.2654847728615582, + "learning_rate": 9.811660541114155e-05, + "loss": 3.2896, + "step": 8645 + }, + { + "epoch": 0.5367186045067974, + "grad_norm": 0.28228323608328143, + "learning_rate": 9.811562339652457e-05, + "loss": 3.3856, + "step": 8646 + }, + { + "epoch": 0.5367806816065553, + "grad_norm": 0.2619309205867528, + "learning_rate": 9.811464113087666e-05, + "loss": 3.364, + "step": 8647 + }, + { + "epoch": 0.5368427587063133, + "grad_norm": 0.3315239902283623, + "learning_rate": 9.811365861420301e-05, + "loss": 3.2176, + "step": 8648 + }, + { + "epoch": 0.5369048358060712, + "grad_norm": 0.25893198061563766, + "learning_rate": 9.81126758465087e-05, + "loss": 3.2856, + "step": 8649 + }, + { + "epoch": 0.536966912905829, + "grad_norm": 0.2864367072293995, + "learning_rate": 9.811169282779891e-05, + "loss": 3.2613, + "step": 8650 + }, + { + "epoch": 0.5370289900055869, + "grad_norm": 0.3117803990678511, + "learning_rate": 9.811070955807871e-05, + "loss": 3.3062, + "step": 8651 + }, + { + "epoch": 0.5370910671053448, + "grad_norm": 0.3172787187509702, + "learning_rate": 9.810972603735326e-05, + "loss": 3.2265, + "step": 8652 + }, + { + "epoch": 0.5371531442051027, + "grad_norm": 0.25977305351477437, + "learning_rate": 9.810874226562772e-05, + "loss": 3.3501, + "step": 8653 + }, + { + "epoch": 0.5372152213048607, + "grad_norm": 0.342397249725835, + "learning_rate": 9.810775824290717e-05, + "loss": 3.3225, + "step": 8654 + }, + { + "epoch": 0.5372772984046186, + "grad_norm": 0.32174283017659655, + "learning_rate": 9.810677396919677e-05, + "loss": 3.28, + "step": 8655 + }, + { + "epoch": 0.5373393755043764, + "grad_norm": 0.2828207921029629, + "learning_rate": 9.810578944450166e-05, + "loss": 3.2749, + "step": 8656 + }, + { + "epoch": 0.5374014526041343, + "grad_norm": 0.27590245509284134, + "learning_rate": 9.810480466882697e-05, + "loss": 3.2225, + "step": 8657 + }, + { + "epoch": 0.5374635297038922, + "grad_norm": 0.24048472749848945, + "learning_rate": 9.810381964217783e-05, + "loss": 3.2623, + "step": 8658 + }, + { + "epoch": 0.5375256068036501, + "grad_norm": 0.3438647793832093, + "learning_rate": 9.810283436455937e-05, + "loss": 3.2551, + "step": 8659 + }, + { + "epoch": 0.537587683903408, + "grad_norm": 0.23688247388306805, + "learning_rate": 9.810184883597676e-05, + "loss": 3.1873, + "step": 8660 + }, + { + "epoch": 0.537649761003166, + "grad_norm": 0.24660317358647552, + "learning_rate": 9.810086305643513e-05, + "loss": 3.1874, + "step": 8661 + }, + { + "epoch": 0.5377118381029238, + "grad_norm": 0.259783149070778, + "learning_rate": 9.809987702593961e-05, + "loss": 3.2514, + "step": 8662 + }, + { + "epoch": 0.5377739152026817, + "grad_norm": 0.2281424091176079, + "learning_rate": 9.809889074449537e-05, + "loss": 3.1623, + "step": 8663 + }, + { + "epoch": 0.5378359923024396, + "grad_norm": 0.276642059477583, + "learning_rate": 9.809790421210754e-05, + "loss": 3.2655, + "step": 8664 + }, + { + "epoch": 0.5378980694021975, + "grad_norm": 0.36740809773905786, + "learning_rate": 9.809691742878125e-05, + "loss": 3.2312, + "step": 8665 + }, + { + "epoch": 0.5379601465019554, + "grad_norm": 0.3089639640075517, + "learning_rate": 9.809593039452167e-05, + "loss": 3.1941, + "step": 8666 + }, + { + "epoch": 0.5380222236017134, + "grad_norm": 0.25818827726348254, + "learning_rate": 9.809494310933395e-05, + "loss": 3.265, + "step": 8667 + }, + { + "epoch": 0.5380843007014712, + "grad_norm": 0.2649607205970008, + "learning_rate": 9.809395557322325e-05, + "loss": 3.2867, + "step": 8668 + }, + { + "epoch": 0.5381463778012291, + "grad_norm": 0.27938021236822735, + "learning_rate": 9.80929677861947e-05, + "loss": 3.2613, + "step": 8669 + }, + { + "epoch": 0.538208454900987, + "grad_norm": 0.2520075450942747, + "learning_rate": 9.809197974825345e-05, + "loss": 3.2589, + "step": 8670 + }, + { + "epoch": 0.5382705320007449, + "grad_norm": 0.2197650875650473, + "learning_rate": 9.809099145940468e-05, + "loss": 3.3138, + "step": 8671 + }, + { + "epoch": 0.5383326091005028, + "grad_norm": 0.2013325003672915, + "learning_rate": 9.809000291965354e-05, + "loss": 3.2677, + "step": 8672 + }, + { + "epoch": 0.5383946862002608, + "grad_norm": 0.3591633463674371, + "learning_rate": 9.808901412900515e-05, + "loss": 3.2952, + "step": 8673 + }, + { + "epoch": 0.5384567633000186, + "grad_norm": 0.2669758806362536, + "learning_rate": 9.808802508746473e-05, + "loss": 3.333, + "step": 8674 + }, + { + "epoch": 0.5385188403997765, + "grad_norm": 0.3122952744316059, + "learning_rate": 9.80870357950374e-05, + "loss": 3.2793, + "step": 8675 + }, + { + "epoch": 0.5385809174995344, + "grad_norm": 0.23488381309936904, + "learning_rate": 9.808604625172833e-05, + "loss": 3.3383, + "step": 8676 + }, + { + "epoch": 0.5386429945992923, + "grad_norm": 0.2320212429470372, + "learning_rate": 9.808505645754268e-05, + "loss": 3.1957, + "step": 8677 + }, + { + "epoch": 0.5387050716990502, + "grad_norm": 0.20270325820649127, + "learning_rate": 9.808406641248562e-05, + "loss": 3.2375, + "step": 8678 + }, + { + "epoch": 0.5387671487988082, + "grad_norm": 0.2043858423769246, + "learning_rate": 9.808307611656232e-05, + "loss": 3.1005, + "step": 8679 + }, + { + "epoch": 0.538829225898566, + "grad_norm": 0.8548584594465074, + "learning_rate": 9.808208556977792e-05, + "loss": 3.2398, + "step": 8680 + }, + { + "epoch": 0.5388913029983239, + "grad_norm": 0.28152191739699456, + "learning_rate": 9.808109477213763e-05, + "loss": 3.2112, + "step": 8681 + }, + { + "epoch": 0.5389533800980818, + "grad_norm": 0.2663289447390227, + "learning_rate": 9.808010372364657e-05, + "loss": 3.2528, + "step": 8682 + }, + { + "epoch": 0.5390154571978397, + "grad_norm": 0.3238604300551668, + "learning_rate": 9.807911242430995e-05, + "loss": 3.326, + "step": 8683 + }, + { + "epoch": 0.5390775342975976, + "grad_norm": 0.31538393143455357, + "learning_rate": 9.807812087413293e-05, + "loss": 3.2624, + "step": 8684 + }, + { + "epoch": 0.5391396113973556, + "grad_norm": 0.25565804867122793, + "learning_rate": 9.80771290731207e-05, + "loss": 3.1883, + "step": 8685 + }, + { + "epoch": 0.5392016884971134, + "grad_norm": 0.2643442552469027, + "learning_rate": 9.807613702127838e-05, + "loss": 3.3094, + "step": 8686 + }, + { + "epoch": 0.5392637655968713, + "grad_norm": 0.2787514275569883, + "learning_rate": 9.807514471861121e-05, + "loss": 3.2682, + "step": 8687 + }, + { + "epoch": 0.5393258426966292, + "grad_norm": 0.2594683387290896, + "learning_rate": 9.807415216512433e-05, + "loss": 3.2327, + "step": 8688 + }, + { + "epoch": 0.5393879197963871, + "grad_norm": 0.2201842056242361, + "learning_rate": 9.807315936082293e-05, + "loss": 3.3067, + "step": 8689 + }, + { + "epoch": 0.539449996896145, + "grad_norm": 0.22286862567968857, + "learning_rate": 9.807216630571218e-05, + "loss": 3.2375, + "step": 8690 + }, + { + "epoch": 0.539512073995903, + "grad_norm": 0.18805244638821436, + "learning_rate": 9.807117299979728e-05, + "loss": 3.2401, + "step": 8691 + }, + { + "epoch": 0.5395741510956608, + "grad_norm": 0.17674606242036164, + "learning_rate": 9.807017944308341e-05, + "loss": 3.3085, + "step": 8692 + }, + { + "epoch": 0.5396362281954187, + "grad_norm": 0.30308512395903714, + "learning_rate": 9.806918563557572e-05, + "loss": 3.3751, + "step": 8693 + }, + { + "epoch": 0.5396983052951766, + "grad_norm": 0.2005208973244569, + "learning_rate": 9.806819157727943e-05, + "loss": 3.3155, + "step": 8694 + }, + { + "epoch": 0.5397603823949345, + "grad_norm": 0.25261519922500686, + "learning_rate": 9.806719726819972e-05, + "loss": 3.3299, + "step": 8695 + }, + { + "epoch": 0.5398224594946924, + "grad_norm": 0.23776728277887565, + "learning_rate": 9.806620270834178e-05, + "loss": 3.3051, + "step": 8696 + }, + { + "epoch": 0.5398845365944503, + "grad_norm": 0.2799421605546003, + "learning_rate": 9.806520789771077e-05, + "loss": 3.1759, + "step": 8697 + }, + { + "epoch": 0.5399466136942082, + "grad_norm": 0.23558668998910548, + "learning_rate": 9.80642128363119e-05, + "loss": 3.2818, + "step": 8698 + }, + { + "epoch": 0.5400086907939661, + "grad_norm": 0.21516381611260565, + "learning_rate": 9.806321752415038e-05, + "loss": 3.3066, + "step": 8699 + }, + { + "epoch": 0.540070767893724, + "grad_norm": 0.21123310851106764, + "learning_rate": 9.806222196123139e-05, + "loss": 3.1087, + "step": 8700 + }, + { + "epoch": 0.5401328449934819, + "grad_norm": 0.2332763751722142, + "learning_rate": 9.80612261475601e-05, + "loss": 3.1482, + "step": 8701 + }, + { + "epoch": 0.5401949220932398, + "grad_norm": 0.224129313066813, + "learning_rate": 9.806023008314175e-05, + "loss": 3.2978, + "step": 8702 + }, + { + "epoch": 0.5402569991929977, + "grad_norm": 0.2768910164861246, + "learning_rate": 9.80592337679815e-05, + "loss": 3.2305, + "step": 8703 + }, + { + "epoch": 0.5403190762927556, + "grad_norm": 0.23402848058878647, + "learning_rate": 9.805823720208456e-05, + "loss": 3.321, + "step": 8704 + }, + { + "epoch": 0.5403811533925135, + "grad_norm": 0.24465232630414402, + "learning_rate": 9.805724038545614e-05, + "loss": 3.3075, + "step": 8705 + }, + { + "epoch": 0.5404432304922714, + "grad_norm": 0.3547439311841146, + "learning_rate": 9.805624331810141e-05, + "loss": 3.2848, + "step": 8706 + }, + { + "epoch": 0.5405053075920293, + "grad_norm": 0.24624924061750098, + "learning_rate": 9.805524600002562e-05, + "loss": 3.3302, + "step": 8707 + }, + { + "epoch": 0.5405673846917872, + "grad_norm": 0.2781796685613939, + "learning_rate": 9.805424843123392e-05, + "loss": 3.2275, + "step": 8708 + }, + { + "epoch": 0.5406294617915451, + "grad_norm": 0.2516404510942751, + "learning_rate": 9.805325061173157e-05, + "loss": 3.3073, + "step": 8709 + }, + { + "epoch": 0.540691538891303, + "grad_norm": 0.2764739783909332, + "learning_rate": 9.805225254152372e-05, + "loss": 3.1773, + "step": 8710 + }, + { + "epoch": 0.5407536159910609, + "grad_norm": 0.3288675715884535, + "learning_rate": 9.80512542206156e-05, + "loss": 3.2636, + "step": 8711 + }, + { + "epoch": 0.5408156930908188, + "grad_norm": 0.3265290503276751, + "learning_rate": 9.805025564901244e-05, + "loss": 3.2309, + "step": 8712 + }, + { + "epoch": 0.5408777701905767, + "grad_norm": 0.29899905184448733, + "learning_rate": 9.804925682671942e-05, + "loss": 3.2272, + "step": 8713 + }, + { + "epoch": 0.5409398472903346, + "grad_norm": 0.3370593656043693, + "learning_rate": 9.804825775374177e-05, + "loss": 3.2365, + "step": 8714 + }, + { + "epoch": 0.5410019243900925, + "grad_norm": 0.2227786129466186, + "learning_rate": 9.804725843008468e-05, + "loss": 3.1344, + "step": 8715 + }, + { + "epoch": 0.5410640014898503, + "grad_norm": 0.25564937855376213, + "learning_rate": 9.80462588557534e-05, + "loss": 3.2306, + "step": 8716 + }, + { + "epoch": 0.5411260785896083, + "grad_norm": 0.28232242843603916, + "learning_rate": 9.804525903075312e-05, + "loss": 3.3497, + "step": 8717 + }, + { + "epoch": 0.5411881556893662, + "grad_norm": 0.2516365576068653, + "learning_rate": 9.804425895508905e-05, + "loss": 3.2183, + "step": 8718 + }, + { + "epoch": 0.5412502327891241, + "grad_norm": 0.3670621820426043, + "learning_rate": 9.804325862876642e-05, + "loss": 3.2946, + "step": 8719 + }, + { + "epoch": 0.541312309888882, + "grad_norm": 0.2581360838129103, + "learning_rate": 9.804225805179046e-05, + "loss": 3.2947, + "step": 8720 + }, + { + "epoch": 0.5413743869886399, + "grad_norm": 0.3599349096847197, + "learning_rate": 9.804125722416637e-05, + "loss": 3.3011, + "step": 8721 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.3591134944324799, + "learning_rate": 9.804025614589938e-05, + "loss": 3.2552, + "step": 8722 + }, + { + "epoch": 0.5414985411881557, + "grad_norm": 0.25822078280876193, + "learning_rate": 9.80392548169947e-05, + "loss": 3.262, + "step": 8723 + }, + { + "epoch": 0.5415606182879136, + "grad_norm": 0.2786168576561542, + "learning_rate": 9.803825323745757e-05, + "loss": 3.2462, + "step": 8724 + }, + { + "epoch": 0.5416226953876715, + "grad_norm": 0.2416466703993776, + "learning_rate": 9.803725140729321e-05, + "loss": 3.2765, + "step": 8725 + }, + { + "epoch": 0.5416847724874294, + "grad_norm": 0.3303066906918912, + "learning_rate": 9.803624932650686e-05, + "loss": 3.3349, + "step": 8726 + }, + { + "epoch": 0.5417468495871873, + "grad_norm": 0.4234770098338049, + "learning_rate": 9.803524699510374e-05, + "loss": 3.2147, + "step": 8727 + }, + { + "epoch": 0.5418089266869451, + "grad_norm": 0.2048586533869572, + "learning_rate": 9.803424441308906e-05, + "loss": 3.3156, + "step": 8728 + }, + { + "epoch": 0.541871003786703, + "grad_norm": 0.2762541657776696, + "learning_rate": 9.803324158046806e-05, + "loss": 3.318, + "step": 8729 + }, + { + "epoch": 0.541933080886461, + "grad_norm": 0.24541737689664064, + "learning_rate": 9.803223849724599e-05, + "loss": 3.2535, + "step": 8730 + }, + { + "epoch": 0.5419951579862189, + "grad_norm": 0.2578051161431149, + "learning_rate": 9.803123516342807e-05, + "loss": 3.3299, + "step": 8731 + }, + { + "epoch": 0.5420572350859768, + "grad_norm": 0.2157948854407314, + "learning_rate": 9.803023157901953e-05, + "loss": 3.1992, + "step": 8732 + }, + { + "epoch": 0.5421193121857347, + "grad_norm": 0.2708528019558931, + "learning_rate": 9.802922774402563e-05, + "loss": 3.3002, + "step": 8733 + }, + { + "epoch": 0.5421813892854925, + "grad_norm": 0.26057410090775474, + "learning_rate": 9.802822365845157e-05, + "loss": 3.283, + "step": 8734 + }, + { + "epoch": 0.5422434663852504, + "grad_norm": 0.5164233587203939, + "learning_rate": 9.802721932230262e-05, + "loss": 3.3373, + "step": 8735 + }, + { + "epoch": 0.5423055434850084, + "grad_norm": 0.3659711739530918, + "learning_rate": 9.8026214735584e-05, + "loss": 3.3659, + "step": 8736 + }, + { + "epoch": 0.5423676205847663, + "grad_norm": 0.4590658561942216, + "learning_rate": 9.802520989830097e-05, + "loss": 3.3046, + "step": 8737 + }, + { + "epoch": 0.5424296976845242, + "grad_norm": 0.30148765050132326, + "learning_rate": 9.802420481045875e-05, + "loss": 3.1835, + "step": 8738 + }, + { + "epoch": 0.5424917747842821, + "grad_norm": 0.6480894648449224, + "learning_rate": 9.80231994720626e-05, + "loss": 3.3251, + "step": 8739 + }, + { + "epoch": 0.5425538518840399, + "grad_norm": 0.46700729032989163, + "learning_rate": 9.802219388311775e-05, + "loss": 3.2444, + "step": 8740 + }, + { + "epoch": 0.5426159289837978, + "grad_norm": 0.384819937033919, + "learning_rate": 9.802118804362947e-05, + "loss": 3.2988, + "step": 8741 + }, + { + "epoch": 0.5426780060835558, + "grad_norm": 0.3135725450782934, + "learning_rate": 9.802018195360298e-05, + "loss": 3.2802, + "step": 8742 + }, + { + "epoch": 0.5427400831833137, + "grad_norm": 0.31613682237045476, + "learning_rate": 9.801917561304357e-05, + "loss": 3.3047, + "step": 8743 + }, + { + "epoch": 0.5428021602830716, + "grad_norm": 0.285982893378832, + "learning_rate": 9.801816902195645e-05, + "loss": 3.2364, + "step": 8744 + }, + { + "epoch": 0.5428642373828295, + "grad_norm": 0.22680728119758672, + "learning_rate": 9.801716218034688e-05, + "loss": 3.2801, + "step": 8745 + }, + { + "epoch": 0.5429263144825873, + "grad_norm": 0.2840439514886003, + "learning_rate": 9.801615508822012e-05, + "loss": 3.238, + "step": 8746 + }, + { + "epoch": 0.5429883915823452, + "grad_norm": 0.2552887326060468, + "learning_rate": 9.801514774558142e-05, + "loss": 3.3324, + "step": 8747 + }, + { + "epoch": 0.5430504686821032, + "grad_norm": 0.2400296044096468, + "learning_rate": 9.801414015243604e-05, + "loss": 3.2914, + "step": 8748 + }, + { + "epoch": 0.5431125457818611, + "grad_norm": 0.31760641584332916, + "learning_rate": 9.801313230878923e-05, + "loss": 3.2232, + "step": 8749 + }, + { + "epoch": 0.543174622881619, + "grad_norm": 0.31963856674033636, + "learning_rate": 9.801212421464627e-05, + "loss": 3.2423, + "step": 8750 + }, + { + "epoch": 0.5432366999813769, + "grad_norm": 0.3019176294503763, + "learning_rate": 9.801111587001239e-05, + "loss": 3.2275, + "step": 8751 + }, + { + "epoch": 0.5432987770811347, + "grad_norm": 0.215585278570347, + "learning_rate": 9.801010727489287e-05, + "loss": 3.322, + "step": 8752 + }, + { + "epoch": 0.5433608541808926, + "grad_norm": 0.28666022584346024, + "learning_rate": 9.800909842929296e-05, + "loss": 3.22, + "step": 8753 + }, + { + "epoch": 0.5434229312806506, + "grad_norm": 0.26969159926551256, + "learning_rate": 9.800808933321792e-05, + "loss": 3.284, + "step": 8754 + }, + { + "epoch": 0.5434850083804085, + "grad_norm": 0.22119714927375553, + "learning_rate": 9.800707998667305e-05, + "loss": 3.305, + "step": 8755 + }, + { + "epoch": 0.5435470854801664, + "grad_norm": 0.2472884606035992, + "learning_rate": 9.800607038966356e-05, + "loss": 3.3057, + "step": 8756 + }, + { + "epoch": 0.5436091625799243, + "grad_norm": 0.26189770914181365, + "learning_rate": 9.800506054219476e-05, + "loss": 3.24, + "step": 8757 + }, + { + "epoch": 0.5436712396796821, + "grad_norm": 0.31760979385412164, + "learning_rate": 9.800405044427191e-05, + "loss": 3.2519, + "step": 8758 + }, + { + "epoch": 0.54373331677944, + "grad_norm": 0.34180234632200124, + "learning_rate": 9.800304009590028e-05, + "loss": 3.2192, + "step": 8759 + }, + { + "epoch": 0.543795393879198, + "grad_norm": 0.32992711647706463, + "learning_rate": 9.800202949708513e-05, + "loss": 3.2594, + "step": 8760 + }, + { + "epoch": 0.5438574709789559, + "grad_norm": 0.24197566089124076, + "learning_rate": 9.800101864783173e-05, + "loss": 3.3164, + "step": 8761 + }, + { + "epoch": 0.5439195480787138, + "grad_norm": 0.3175784637712124, + "learning_rate": 9.800000754814537e-05, + "loss": 3.2179, + "step": 8762 + }, + { + "epoch": 0.5439816251784717, + "grad_norm": 0.2700528612312632, + "learning_rate": 9.799899619803133e-05, + "loss": 3.3343, + "step": 8763 + }, + { + "epoch": 0.5440437022782295, + "grad_norm": 0.2337800023387653, + "learning_rate": 9.799798459749485e-05, + "loss": 3.1799, + "step": 8764 + }, + { + "epoch": 0.5441057793779874, + "grad_norm": 0.2938614785926434, + "learning_rate": 9.799697274654125e-05, + "loss": 3.147, + "step": 8765 + }, + { + "epoch": 0.5441678564777453, + "grad_norm": 0.2843552273384357, + "learning_rate": 9.799596064517578e-05, + "loss": 3.2154, + "step": 8766 + }, + { + "epoch": 0.5442299335775033, + "grad_norm": 0.27872734422518225, + "learning_rate": 9.799494829340375e-05, + "loss": 3.2509, + "step": 8767 + }, + { + "epoch": 0.5442920106772612, + "grad_norm": 0.2553856810858201, + "learning_rate": 9.799393569123041e-05, + "loss": 3.234, + "step": 8768 + }, + { + "epoch": 0.5443540877770191, + "grad_norm": 0.20830075585445174, + "learning_rate": 9.799292283866106e-05, + "loss": 3.311, + "step": 8769 + }, + { + "epoch": 0.5444161648767769, + "grad_norm": 0.2729082844565646, + "learning_rate": 9.799190973570099e-05, + "loss": 3.2283, + "step": 8770 + }, + { + "epoch": 0.5444782419765348, + "grad_norm": 0.22232364607885258, + "learning_rate": 9.799089638235546e-05, + "loss": 3.2687, + "step": 8771 + }, + { + "epoch": 0.5445403190762927, + "grad_norm": 0.20409680776213995, + "learning_rate": 9.798988277862978e-05, + "loss": 3.3997, + "step": 8772 + }, + { + "epoch": 0.5446023961760507, + "grad_norm": 0.19800687081163285, + "learning_rate": 9.798886892452924e-05, + "loss": 3.2165, + "step": 8773 + }, + { + "epoch": 0.5446644732758086, + "grad_norm": 0.45326932713246854, + "learning_rate": 9.798785482005909e-05, + "loss": 3.2173, + "step": 8774 + }, + { + "epoch": 0.5447265503755665, + "grad_norm": 0.2686804043679661, + "learning_rate": 9.798684046522469e-05, + "loss": 3.3192, + "step": 8775 + }, + { + "epoch": 0.5447886274753243, + "grad_norm": 0.3562297012167735, + "learning_rate": 9.798582586003127e-05, + "loss": 3.2179, + "step": 8776 + }, + { + "epoch": 0.5448507045750822, + "grad_norm": 0.2253209989957945, + "learning_rate": 9.798481100448415e-05, + "loss": 3.2883, + "step": 8777 + }, + { + "epoch": 0.5449127816748401, + "grad_norm": 0.22994608530947588, + "learning_rate": 9.798379589858864e-05, + "loss": 3.1906, + "step": 8778 + }, + { + "epoch": 0.5449748587745981, + "grad_norm": 0.2315787964593573, + "learning_rate": 9.798278054234999e-05, + "loss": 3.3022, + "step": 8779 + }, + { + "epoch": 0.545036935874356, + "grad_norm": 0.2268601961008684, + "learning_rate": 9.798176493577356e-05, + "loss": 3.1768, + "step": 8780 + }, + { + "epoch": 0.5450990129741139, + "grad_norm": 0.2751713820178978, + "learning_rate": 9.798074907886459e-05, + "loss": 3.2353, + "step": 8781 + }, + { + "epoch": 0.5451610900738717, + "grad_norm": 0.34646617063159335, + "learning_rate": 9.797973297162841e-05, + "loss": 3.3254, + "step": 8782 + }, + { + "epoch": 0.5452231671736296, + "grad_norm": 0.2945761517364693, + "learning_rate": 9.797871661407032e-05, + "loss": 3.2106, + "step": 8783 + }, + { + "epoch": 0.5452852442733875, + "grad_norm": 0.2623423505999086, + "learning_rate": 9.797770000619561e-05, + "loss": 3.2677, + "step": 8784 + }, + { + "epoch": 0.5453473213731455, + "grad_norm": 0.2719746286536423, + "learning_rate": 9.797668314800961e-05, + "loss": 3.3344, + "step": 8785 + }, + { + "epoch": 0.5454093984729034, + "grad_norm": 0.25348021637635537, + "learning_rate": 9.797566603951759e-05, + "loss": 3.241, + "step": 8786 + }, + { + "epoch": 0.5454714755726613, + "grad_norm": 0.23225235567191702, + "learning_rate": 9.797464868072488e-05, + "loss": 3.1989, + "step": 8787 + }, + { + "epoch": 0.5455335526724191, + "grad_norm": 0.2686249242314746, + "learning_rate": 9.797363107163678e-05, + "loss": 3.2396, + "step": 8788 + }, + { + "epoch": 0.545595629772177, + "grad_norm": 0.19627333810557243, + "learning_rate": 9.797261321225859e-05, + "loss": 3.1354, + "step": 8789 + }, + { + "epoch": 0.5456577068719349, + "grad_norm": 0.35363369229374875, + "learning_rate": 9.797159510259565e-05, + "loss": 3.3077, + "step": 8790 + }, + { + "epoch": 0.5457197839716929, + "grad_norm": 0.3634467209542505, + "learning_rate": 9.797057674265324e-05, + "loss": 3.2492, + "step": 8791 + }, + { + "epoch": 0.5457818610714508, + "grad_norm": 0.21919325640211323, + "learning_rate": 9.79695581324367e-05, + "loss": 3.2416, + "step": 8792 + }, + { + "epoch": 0.5458439381712087, + "grad_norm": 0.4536297880357852, + "learning_rate": 9.796853927195131e-05, + "loss": 3.3316, + "step": 8793 + }, + { + "epoch": 0.5459060152709665, + "grad_norm": 0.4035290785076497, + "learning_rate": 9.796752016120242e-05, + "loss": 3.3303, + "step": 8794 + }, + { + "epoch": 0.5459680923707244, + "grad_norm": 0.30543754642658727, + "learning_rate": 9.796650080019533e-05, + "loss": 3.1695, + "step": 8795 + }, + { + "epoch": 0.5460301694704823, + "grad_norm": 0.2928613593050508, + "learning_rate": 9.796548118893536e-05, + "loss": 3.232, + "step": 8796 + }, + { + "epoch": 0.5460922465702402, + "grad_norm": 0.2888769250019068, + "learning_rate": 9.796446132742782e-05, + "loss": 3.1957, + "step": 8797 + }, + { + "epoch": 0.5461543236699982, + "grad_norm": 0.3710833497848753, + "learning_rate": 9.796344121567806e-05, + "loss": 3.3784, + "step": 8798 + }, + { + "epoch": 0.5462164007697561, + "grad_norm": 0.26254177215272156, + "learning_rate": 9.796242085369137e-05, + "loss": 3.2553, + "step": 8799 + }, + { + "epoch": 0.5462784778695139, + "grad_norm": 0.3751841530041323, + "learning_rate": 9.79614002414731e-05, + "loss": 3.1845, + "step": 8800 + }, + { + "epoch": 0.5463405549692718, + "grad_norm": 0.28194793886661507, + "learning_rate": 9.796037937902855e-05, + "loss": 3.3083, + "step": 8801 + }, + { + "epoch": 0.5464026320690297, + "grad_norm": 0.26088660594729784, + "learning_rate": 9.795935826636307e-05, + "loss": 3.1977, + "step": 8802 + }, + { + "epoch": 0.5464647091687876, + "grad_norm": 0.3017804776170273, + "learning_rate": 9.795833690348196e-05, + "loss": 3.294, + "step": 8803 + }, + { + "epoch": 0.5465267862685456, + "grad_norm": 0.27589729720399453, + "learning_rate": 9.795731529039058e-05, + "loss": 3.2514, + "step": 8804 + }, + { + "epoch": 0.5465888633683035, + "grad_norm": 0.41943448192071636, + "learning_rate": 9.795629342709422e-05, + "loss": 3.1998, + "step": 8805 + }, + { + "epoch": 0.5466509404680613, + "grad_norm": 0.24604229919041126, + "learning_rate": 9.795527131359826e-05, + "loss": 3.2542, + "step": 8806 + }, + { + "epoch": 0.5467130175678192, + "grad_norm": 0.30082180938724745, + "learning_rate": 9.7954248949908e-05, + "loss": 3.164, + "step": 8807 + }, + { + "epoch": 0.5467750946675771, + "grad_norm": 0.2510671637607368, + "learning_rate": 9.795322633602879e-05, + "loss": 3.2928, + "step": 8808 + }, + { + "epoch": 0.546837171767335, + "grad_norm": 0.395496810948455, + "learning_rate": 9.795220347196595e-05, + "loss": 3.2958, + "step": 8809 + }, + { + "epoch": 0.546899248867093, + "grad_norm": 0.5902446369766055, + "learning_rate": 9.795118035772482e-05, + "loss": 3.2944, + "step": 8810 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.33078265195683704, + "learning_rate": 9.795015699331074e-05, + "loss": 3.2556, + "step": 8811 + }, + { + "epoch": 0.5470234030666087, + "grad_norm": 0.2801366731534196, + "learning_rate": 9.794913337872906e-05, + "loss": 3.1935, + "step": 8812 + }, + { + "epoch": 0.5470854801663666, + "grad_norm": 0.2654341404690803, + "learning_rate": 9.79481095139851e-05, + "loss": 3.2713, + "step": 8813 + }, + { + "epoch": 0.5471475572661245, + "grad_norm": 0.26441258129550893, + "learning_rate": 9.794708539908421e-05, + "loss": 3.3301, + "step": 8814 + }, + { + "epoch": 0.5472096343658824, + "grad_norm": 0.28790924343323876, + "learning_rate": 9.794606103403175e-05, + "loss": 3.3366, + "step": 8815 + }, + { + "epoch": 0.5472717114656404, + "grad_norm": 0.2747086270988101, + "learning_rate": 9.794503641883305e-05, + "loss": 3.2342, + "step": 8816 + }, + { + "epoch": 0.5473337885653983, + "grad_norm": 0.22013566820470873, + "learning_rate": 9.794401155349346e-05, + "loss": 3.2993, + "step": 8817 + }, + { + "epoch": 0.5473958656651561, + "grad_norm": 0.24125855473930768, + "learning_rate": 9.79429864380183e-05, + "loss": 3.1648, + "step": 8818 + }, + { + "epoch": 0.547457942764914, + "grad_norm": 0.2649736271636668, + "learning_rate": 9.794196107241295e-05, + "loss": 3.266, + "step": 8819 + }, + { + "epoch": 0.5475200198646719, + "grad_norm": 0.20827204080814002, + "learning_rate": 9.794093545668277e-05, + "loss": 3.2646, + "step": 8820 + }, + { + "epoch": 0.5475820969644298, + "grad_norm": 0.2025282695988297, + "learning_rate": 9.793990959083307e-05, + "loss": 3.2441, + "step": 8821 + }, + { + "epoch": 0.5476441740641877, + "grad_norm": 0.3207224571729655, + "learning_rate": 9.793888347486924e-05, + "loss": 3.1913, + "step": 8822 + }, + { + "epoch": 0.5477062511639457, + "grad_norm": 0.2553227453615778, + "learning_rate": 9.793785710879661e-05, + "loss": 3.1989, + "step": 8823 + }, + { + "epoch": 0.5477683282637035, + "grad_norm": 0.26395670729730697, + "learning_rate": 9.793683049262053e-05, + "loss": 3.3074, + "step": 8824 + }, + { + "epoch": 0.5478304053634614, + "grad_norm": 0.23160756536210564, + "learning_rate": 9.793580362634638e-05, + "loss": 3.2418, + "step": 8825 + }, + { + "epoch": 0.5478924824632193, + "grad_norm": 0.21922008209476337, + "learning_rate": 9.793477650997951e-05, + "loss": 3.2333, + "step": 8826 + }, + { + "epoch": 0.5479545595629772, + "grad_norm": 0.22121943786714612, + "learning_rate": 9.793374914352526e-05, + "loss": 3.2833, + "step": 8827 + }, + { + "epoch": 0.5480166366627351, + "grad_norm": 0.4422196514720785, + "learning_rate": 9.793272152698902e-05, + "loss": 3.186, + "step": 8828 + }, + { + "epoch": 0.5480787137624931, + "grad_norm": 0.4039898688054002, + "learning_rate": 9.793169366037614e-05, + "loss": 3.3412, + "step": 8829 + }, + { + "epoch": 0.5481407908622509, + "grad_norm": 0.2916576165543605, + "learning_rate": 9.793066554369195e-05, + "loss": 3.323, + "step": 8830 + }, + { + "epoch": 0.5482028679620088, + "grad_norm": 0.31781920728854435, + "learning_rate": 9.792963717694186e-05, + "loss": 3.2369, + "step": 8831 + }, + { + "epoch": 0.5482649450617667, + "grad_norm": 0.263229932226636, + "learning_rate": 9.792860856013122e-05, + "loss": 3.3114, + "step": 8832 + }, + { + "epoch": 0.5483270221615246, + "grad_norm": 0.21856161929840245, + "learning_rate": 9.79275796932654e-05, + "loss": 3.1994, + "step": 8833 + }, + { + "epoch": 0.5483890992612825, + "grad_norm": 0.2921001910765368, + "learning_rate": 9.792655057634974e-05, + "loss": 3.3124, + "step": 8834 + }, + { + "epoch": 0.5484511763610405, + "grad_norm": 0.3588234774344415, + "learning_rate": 9.792552120938965e-05, + "loss": 3.353, + "step": 8835 + }, + { + "epoch": 0.5485132534607983, + "grad_norm": 0.24824738845959374, + "learning_rate": 9.792449159239047e-05, + "loss": 3.209, + "step": 8836 + }, + { + "epoch": 0.5485753305605562, + "grad_norm": 0.24692795915789606, + "learning_rate": 9.79234617253576e-05, + "loss": 3.2012, + "step": 8837 + }, + { + "epoch": 0.5486374076603141, + "grad_norm": 0.23142622213058464, + "learning_rate": 9.792243160829639e-05, + "loss": 3.2385, + "step": 8838 + }, + { + "epoch": 0.548699484760072, + "grad_norm": 0.3062731855724353, + "learning_rate": 9.792140124121221e-05, + "loss": 3.2922, + "step": 8839 + }, + { + "epoch": 0.5487615618598299, + "grad_norm": 0.3411546395405238, + "learning_rate": 9.792037062411046e-05, + "loss": 3.3412, + "step": 8840 + }, + { + "epoch": 0.5488236389595879, + "grad_norm": 0.2913485932774623, + "learning_rate": 9.791933975699649e-05, + "loss": 3.1997, + "step": 8841 + }, + { + "epoch": 0.5488857160593457, + "grad_norm": 0.23513429355455917, + "learning_rate": 9.79183086398757e-05, + "loss": 3.195, + "step": 8842 + }, + { + "epoch": 0.5489477931591036, + "grad_norm": 0.34117157550202004, + "learning_rate": 9.791727727275345e-05, + "loss": 3.2978, + "step": 8843 + }, + { + "epoch": 0.5490098702588615, + "grad_norm": 0.18319730363068212, + "learning_rate": 9.791624565563515e-05, + "loss": 3.3457, + "step": 8844 + }, + { + "epoch": 0.5490719473586194, + "grad_norm": 0.3093560256341257, + "learning_rate": 9.791521378852615e-05, + "loss": 3.2646, + "step": 8845 + }, + { + "epoch": 0.5491340244583773, + "grad_norm": 0.24339119135294887, + "learning_rate": 9.791418167143184e-05, + "loss": 3.3104, + "step": 8846 + }, + { + "epoch": 0.5491961015581353, + "grad_norm": 0.23893261821977438, + "learning_rate": 9.791314930435764e-05, + "loss": 3.1967, + "step": 8847 + }, + { + "epoch": 0.5492581786578931, + "grad_norm": 0.5432716183580735, + "learning_rate": 9.791211668730888e-05, + "loss": 3.2754, + "step": 8848 + }, + { + "epoch": 0.549320255757651, + "grad_norm": 0.3287173782234425, + "learning_rate": 9.7911083820291e-05, + "loss": 3.2448, + "step": 8849 + }, + { + "epoch": 0.5493823328574089, + "grad_norm": 0.27789663894404615, + "learning_rate": 9.791005070330937e-05, + "loss": 3.2661, + "step": 8850 + }, + { + "epoch": 0.5494444099571668, + "grad_norm": 0.3067732967392221, + "learning_rate": 9.790901733636935e-05, + "loss": 3.3202, + "step": 8851 + }, + { + "epoch": 0.5495064870569247, + "grad_norm": 0.23392776097299756, + "learning_rate": 9.790798371947636e-05, + "loss": 3.265, + "step": 8852 + }, + { + "epoch": 0.5495685641566826, + "grad_norm": 0.35745041113165615, + "learning_rate": 9.79069498526358e-05, + "loss": 3.4029, + "step": 8853 + }, + { + "epoch": 0.5496306412564405, + "grad_norm": 0.38301273869950575, + "learning_rate": 9.790591573585305e-05, + "loss": 3.3165, + "step": 8854 + }, + { + "epoch": 0.5496927183561984, + "grad_norm": 0.2584010688492057, + "learning_rate": 9.790488136913351e-05, + "loss": 3.3052, + "step": 8855 + }, + { + "epoch": 0.5497547954559563, + "grad_norm": 0.2814400481155936, + "learning_rate": 9.790384675248256e-05, + "loss": 3.2176, + "step": 8856 + }, + { + "epoch": 0.5498168725557142, + "grad_norm": 0.25011226994167723, + "learning_rate": 9.790281188590563e-05, + "loss": 3.2092, + "step": 8857 + }, + { + "epoch": 0.5498789496554721, + "grad_norm": 0.32226585619369075, + "learning_rate": 9.790177676940811e-05, + "loss": 3.208, + "step": 8858 + }, + { + "epoch": 0.54994102675523, + "grad_norm": 0.21330624877528198, + "learning_rate": 9.790074140299536e-05, + "loss": 3.1607, + "step": 8859 + }, + { + "epoch": 0.5500031038549879, + "grad_norm": 0.26964442323484333, + "learning_rate": 9.789970578667283e-05, + "loss": 3.283, + "step": 8860 + }, + { + "epoch": 0.5500651809547458, + "grad_norm": 0.25013501276700534, + "learning_rate": 9.789866992044592e-05, + "loss": 3.2553, + "step": 8861 + }, + { + "epoch": 0.5501272580545037, + "grad_norm": 0.2311972967223876, + "learning_rate": 9.789763380432e-05, + "loss": 3.3394, + "step": 8862 + }, + { + "epoch": 0.5501893351542616, + "grad_norm": 0.22942194573881236, + "learning_rate": 9.78965974383005e-05, + "loss": 3.2679, + "step": 8863 + }, + { + "epoch": 0.5502514122540195, + "grad_norm": 0.21474712106164948, + "learning_rate": 9.789556082239284e-05, + "loss": 3.093, + "step": 8864 + }, + { + "epoch": 0.5503134893537774, + "grad_norm": 0.30802414632483427, + "learning_rate": 9.78945239566024e-05, + "loss": 3.2406, + "step": 8865 + }, + { + "epoch": 0.5503755664535352, + "grad_norm": 0.23478096135930435, + "learning_rate": 9.78934868409346e-05, + "loss": 3.312, + "step": 8866 + }, + { + "epoch": 0.5504376435532932, + "grad_norm": 0.2305882838984625, + "learning_rate": 9.789244947539487e-05, + "loss": 3.3186, + "step": 8867 + }, + { + "epoch": 0.5504997206530511, + "grad_norm": 0.2229677330480426, + "learning_rate": 9.789141185998858e-05, + "loss": 3.2188, + "step": 8868 + }, + { + "epoch": 0.550561797752809, + "grad_norm": 0.2591885057112014, + "learning_rate": 9.789037399472119e-05, + "loss": 3.167, + "step": 8869 + }, + { + "epoch": 0.5506238748525669, + "grad_norm": 0.254559820151231, + "learning_rate": 9.788933587959807e-05, + "loss": 3.2604, + "step": 8870 + }, + { + "epoch": 0.5506859519523248, + "grad_norm": 0.3417312777106207, + "learning_rate": 9.788829751462467e-05, + "loss": 3.2995, + "step": 8871 + }, + { + "epoch": 0.5507480290520826, + "grad_norm": 0.22796985465155342, + "learning_rate": 9.78872588998064e-05, + "loss": 3.2175, + "step": 8872 + }, + { + "epoch": 0.5508101061518406, + "grad_norm": 0.21840464202959203, + "learning_rate": 9.788622003514866e-05, + "loss": 3.3099, + "step": 8873 + }, + { + "epoch": 0.5508721832515985, + "grad_norm": 0.28944214814205294, + "learning_rate": 9.78851809206569e-05, + "loss": 3.2451, + "step": 8874 + }, + { + "epoch": 0.5509342603513564, + "grad_norm": 0.24718001723266744, + "learning_rate": 9.788414155633654e-05, + "loss": 3.2098, + "step": 8875 + }, + { + "epoch": 0.5509963374511143, + "grad_norm": 0.23532953594542863, + "learning_rate": 9.788310194219297e-05, + "loss": 3.2911, + "step": 8876 + }, + { + "epoch": 0.5510584145508722, + "grad_norm": 0.24383531935668085, + "learning_rate": 9.788206207823164e-05, + "loss": 3.2426, + "step": 8877 + }, + { + "epoch": 0.55112049165063, + "grad_norm": 0.23532955969057878, + "learning_rate": 9.788102196445797e-05, + "loss": 3.2648, + "step": 8878 + }, + { + "epoch": 0.551182568750388, + "grad_norm": 0.22017148219005794, + "learning_rate": 9.787998160087737e-05, + "loss": 3.2372, + "step": 8879 + }, + { + "epoch": 0.5512446458501459, + "grad_norm": 0.2653994584431258, + "learning_rate": 9.787894098749531e-05, + "loss": 3.3058, + "step": 8880 + }, + { + "epoch": 0.5513067229499038, + "grad_norm": 0.20745961839058655, + "learning_rate": 9.787790012431716e-05, + "loss": 3.1758, + "step": 8881 + }, + { + "epoch": 0.5513688000496617, + "grad_norm": 0.20368440332718762, + "learning_rate": 9.78768590113484e-05, + "loss": 3.3207, + "step": 8882 + }, + { + "epoch": 0.5514308771494196, + "grad_norm": 0.2636010958016025, + "learning_rate": 9.787581764859444e-05, + "loss": 3.237, + "step": 8883 + }, + { + "epoch": 0.5514929542491774, + "grad_norm": 0.20421370022111593, + "learning_rate": 9.787477603606072e-05, + "loss": 3.3101, + "step": 8884 + }, + { + "epoch": 0.5515550313489354, + "grad_norm": 0.199496711095551, + "learning_rate": 9.787373417375268e-05, + "loss": 3.2535, + "step": 8885 + }, + { + "epoch": 0.5516171084486933, + "grad_norm": 0.33699439376474966, + "learning_rate": 9.787269206167574e-05, + "loss": 3.2377, + "step": 8886 + }, + { + "epoch": 0.5516791855484512, + "grad_norm": 0.23856280061784313, + "learning_rate": 9.787164969983534e-05, + "loss": 3.249, + "step": 8887 + }, + { + "epoch": 0.5517412626482091, + "grad_norm": 0.25357143628285433, + "learning_rate": 9.787060708823692e-05, + "loss": 3.2528, + "step": 8888 + }, + { + "epoch": 0.551803339747967, + "grad_norm": 0.200854597535621, + "learning_rate": 9.786956422688592e-05, + "loss": 3.2317, + "step": 8889 + }, + { + "epoch": 0.5518654168477248, + "grad_norm": 0.2440125472763881, + "learning_rate": 9.786852111578779e-05, + "loss": 3.1713, + "step": 8890 + }, + { + "epoch": 0.5519274939474828, + "grad_norm": 0.2050705499612431, + "learning_rate": 9.786747775494795e-05, + "loss": 3.1829, + "step": 8891 + }, + { + "epoch": 0.5519895710472407, + "grad_norm": 0.35638623727872665, + "learning_rate": 9.786643414437189e-05, + "loss": 3.357, + "step": 8892 + }, + { + "epoch": 0.5520516481469986, + "grad_norm": 0.20971655706851558, + "learning_rate": 9.786539028406501e-05, + "loss": 3.3736, + "step": 8893 + }, + { + "epoch": 0.5521137252467565, + "grad_norm": 0.22286728846088064, + "learning_rate": 9.786434617403275e-05, + "loss": 3.2485, + "step": 8894 + }, + { + "epoch": 0.5521758023465144, + "grad_norm": 0.2672600352113377, + "learning_rate": 9.78633018142806e-05, + "loss": 3.2554, + "step": 8895 + }, + { + "epoch": 0.5522378794462722, + "grad_norm": 0.23899205298060067, + "learning_rate": 9.786225720481397e-05, + "loss": 3.2804, + "step": 8896 + }, + { + "epoch": 0.5522999565460301, + "grad_norm": 0.18535585107052643, + "learning_rate": 9.786121234563832e-05, + "loss": 3.3593, + "step": 8897 + }, + { + "epoch": 0.5523620336457881, + "grad_norm": 0.20833836291122967, + "learning_rate": 9.786016723675912e-05, + "loss": 3.1763, + "step": 8898 + }, + { + "epoch": 0.552424110745546, + "grad_norm": 0.34350414587486855, + "learning_rate": 9.785912187818181e-05, + "loss": 3.1593, + "step": 8899 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.2883749724215729, + "learning_rate": 9.785807626991184e-05, + "loss": 3.3342, + "step": 8900 + }, + { + "epoch": 0.5525482649450618, + "grad_norm": 0.30359127226253807, + "learning_rate": 9.785703041195466e-05, + "loss": 3.2336, + "step": 8901 + }, + { + "epoch": 0.5526103420448196, + "grad_norm": 0.21335562397841845, + "learning_rate": 9.785598430431575e-05, + "loss": 3.2753, + "step": 8902 + }, + { + "epoch": 0.5526724191445775, + "grad_norm": 0.24094128691075878, + "learning_rate": 9.785493794700053e-05, + "loss": 3.2057, + "step": 8903 + }, + { + "epoch": 0.5527344962443355, + "grad_norm": 0.18506583046162825, + "learning_rate": 9.78538913400145e-05, + "loss": 3.226, + "step": 8904 + }, + { + "epoch": 0.5527965733440934, + "grad_norm": 0.30431557231847817, + "learning_rate": 9.78528444833631e-05, + "loss": 3.2648, + "step": 8905 + }, + { + "epoch": 0.5528586504438513, + "grad_norm": 0.20344535617630874, + "learning_rate": 9.785179737705179e-05, + "loss": 3.1356, + "step": 8906 + }, + { + "epoch": 0.5529207275436092, + "grad_norm": 0.2550246476201163, + "learning_rate": 9.785075002108604e-05, + "loss": 3.1815, + "step": 8907 + }, + { + "epoch": 0.552982804643367, + "grad_norm": 0.21999974540674044, + "learning_rate": 9.78497024154713e-05, + "loss": 3.216, + "step": 8908 + }, + { + "epoch": 0.5530448817431249, + "grad_norm": 0.2543068879437111, + "learning_rate": 9.784865456021307e-05, + "loss": 3.3036, + "step": 8909 + }, + { + "epoch": 0.5531069588428829, + "grad_norm": 0.2187910892860593, + "learning_rate": 9.784760645531679e-05, + "loss": 3.2275, + "step": 8910 + }, + { + "epoch": 0.5531690359426408, + "grad_norm": 0.2828353466900963, + "learning_rate": 9.784655810078791e-05, + "loss": 3.2667, + "step": 8911 + }, + { + "epoch": 0.5532311130423987, + "grad_norm": 0.1863302458636422, + "learning_rate": 9.784550949663193e-05, + "loss": 3.2548, + "step": 8912 + }, + { + "epoch": 0.5532931901421566, + "grad_norm": 0.22334744432574288, + "learning_rate": 9.784446064285431e-05, + "loss": 3.3137, + "step": 8913 + }, + { + "epoch": 0.5533552672419144, + "grad_norm": 0.22260445276729018, + "learning_rate": 9.784341153946053e-05, + "loss": 3.2642, + "step": 8914 + }, + { + "epoch": 0.5534173443416723, + "grad_norm": 0.2612483936470329, + "learning_rate": 9.784236218645605e-05, + "loss": 3.4056, + "step": 8915 + }, + { + "epoch": 0.5534794214414303, + "grad_norm": 0.2724960468600449, + "learning_rate": 9.784131258384637e-05, + "loss": 3.2058, + "step": 8916 + }, + { + "epoch": 0.5535414985411882, + "grad_norm": 0.27598795128261966, + "learning_rate": 9.784026273163693e-05, + "loss": 3.2685, + "step": 8917 + }, + { + "epoch": 0.5536035756409461, + "grad_norm": 0.21010062428124762, + "learning_rate": 9.783921262983324e-05, + "loss": 3.2777, + "step": 8918 + }, + { + "epoch": 0.553665652740704, + "grad_norm": 0.28255402729871654, + "learning_rate": 9.783816227844076e-05, + "loss": 3.2516, + "step": 8919 + }, + { + "epoch": 0.5537277298404618, + "grad_norm": 0.25000810610027174, + "learning_rate": 9.783711167746497e-05, + "loss": 3.3021, + "step": 8920 + }, + { + "epoch": 0.5537898069402197, + "grad_norm": 0.21776609023491245, + "learning_rate": 9.783606082691134e-05, + "loss": 3.1208, + "step": 8921 + }, + { + "epoch": 0.5538518840399776, + "grad_norm": 0.3448090819237294, + "learning_rate": 9.783500972678539e-05, + "loss": 3.1872, + "step": 8922 + }, + { + "epoch": 0.5539139611397356, + "grad_norm": 0.32079307047549205, + "learning_rate": 9.783395837709258e-05, + "loss": 3.2133, + "step": 8923 + }, + { + "epoch": 0.5539760382394935, + "grad_norm": 0.24103984092844127, + "learning_rate": 9.78329067778384e-05, + "loss": 3.1978, + "step": 8924 + }, + { + "epoch": 0.5540381153392514, + "grad_norm": 0.29804959827143884, + "learning_rate": 9.783185492902832e-05, + "loss": 3.2498, + "step": 8925 + }, + { + "epoch": 0.5541001924390092, + "grad_norm": 0.6715955152956558, + "learning_rate": 9.783080283066786e-05, + "loss": 3.2682, + "step": 8926 + }, + { + "epoch": 0.5541622695387671, + "grad_norm": 0.5243485905358883, + "learning_rate": 9.782975048276246e-05, + "loss": 3.2317, + "step": 8927 + }, + { + "epoch": 0.554224346638525, + "grad_norm": 0.2493549010894139, + "learning_rate": 9.782869788531766e-05, + "loss": 3.3199, + "step": 8928 + }, + { + "epoch": 0.554286423738283, + "grad_norm": 0.4526280275763491, + "learning_rate": 9.782764503833894e-05, + "loss": 3.2011, + "step": 8929 + }, + { + "epoch": 0.5543485008380409, + "grad_norm": 0.39889492629968437, + "learning_rate": 9.782659194183177e-05, + "loss": 3.4054, + "step": 8930 + }, + { + "epoch": 0.5544105779377988, + "grad_norm": 0.31880147929644104, + "learning_rate": 9.782553859580167e-05, + "loss": 3.2298, + "step": 8931 + }, + { + "epoch": 0.5544726550375566, + "grad_norm": 0.2578941562691637, + "learning_rate": 9.782448500025411e-05, + "loss": 3.2366, + "step": 8932 + }, + { + "epoch": 0.5545347321373145, + "grad_norm": 0.3902447184584681, + "learning_rate": 9.782343115519462e-05, + "loss": 3.2858, + "step": 8933 + }, + { + "epoch": 0.5545968092370724, + "grad_norm": 0.8193147677027094, + "learning_rate": 9.782237706062868e-05, + "loss": 3.255, + "step": 8934 + }, + { + "epoch": 0.5546588863368304, + "grad_norm": 0.3956751528283555, + "learning_rate": 9.782132271656178e-05, + "loss": 3.2089, + "step": 8935 + }, + { + "epoch": 0.5547209634365883, + "grad_norm": 0.4326798197786351, + "learning_rate": 9.782026812299942e-05, + "loss": 3.2537, + "step": 8936 + }, + { + "epoch": 0.5547830405363462, + "grad_norm": 0.33248019440899623, + "learning_rate": 9.781921327994714e-05, + "loss": 3.2725, + "step": 8937 + }, + { + "epoch": 0.554845117636104, + "grad_norm": 0.4298240964688885, + "learning_rate": 9.78181581874104e-05, + "loss": 3.2479, + "step": 8938 + }, + { + "epoch": 0.5549071947358619, + "grad_norm": 0.29162238698852333, + "learning_rate": 9.781710284539472e-05, + "loss": 3.3307, + "step": 8939 + }, + { + "epoch": 0.5549692718356198, + "grad_norm": 0.653935620230571, + "learning_rate": 9.781604725390561e-05, + "loss": 3.2519, + "step": 8940 + }, + { + "epoch": 0.5550313489353778, + "grad_norm": 0.28605432052203283, + "learning_rate": 9.781499141294857e-05, + "loss": 3.2956, + "step": 8941 + }, + { + "epoch": 0.5550934260351357, + "grad_norm": 0.425991408952087, + "learning_rate": 9.781393532252911e-05, + "loss": 3.2842, + "step": 8942 + }, + { + "epoch": 0.5551555031348936, + "grad_norm": 0.49892890467760853, + "learning_rate": 9.781287898265275e-05, + "loss": 3.2122, + "step": 8943 + }, + { + "epoch": 0.5552175802346514, + "grad_norm": 0.2835865187970957, + "learning_rate": 9.781182239332499e-05, + "loss": 3.2484, + "step": 8944 + }, + { + "epoch": 0.5552796573344093, + "grad_norm": 0.3479079075051423, + "learning_rate": 9.781076555455135e-05, + "loss": 3.2141, + "step": 8945 + }, + { + "epoch": 0.5553417344341672, + "grad_norm": 0.4042847857303345, + "learning_rate": 9.780970846633734e-05, + "loss": 3.2648, + "step": 8946 + }, + { + "epoch": 0.5554038115339252, + "grad_norm": 0.34606287713053546, + "learning_rate": 9.780865112868847e-05, + "loss": 3.2089, + "step": 8947 + }, + { + "epoch": 0.5554658886336831, + "grad_norm": 0.37040594507383634, + "learning_rate": 9.780759354161026e-05, + "loss": 3.2486, + "step": 8948 + }, + { + "epoch": 0.555527965733441, + "grad_norm": 0.2535233765344275, + "learning_rate": 9.780653570510822e-05, + "loss": 3.2299, + "step": 8949 + }, + { + "epoch": 0.5555900428331988, + "grad_norm": 0.2598960377011359, + "learning_rate": 9.78054776191879e-05, + "loss": 3.2855, + "step": 8950 + }, + { + "epoch": 0.5556521199329567, + "grad_norm": 0.3538346931924322, + "learning_rate": 9.780441928385477e-05, + "loss": 3.2719, + "step": 8951 + }, + { + "epoch": 0.5557141970327146, + "grad_norm": 0.27167243755877124, + "learning_rate": 9.78033606991144e-05, + "loss": 3.205, + "step": 8952 + }, + { + "epoch": 0.5557762741324725, + "grad_norm": 0.2809836663067144, + "learning_rate": 9.78023018649723e-05, + "loss": 3.2177, + "step": 8953 + }, + { + "epoch": 0.5558383512322305, + "grad_norm": 0.3024895447114313, + "learning_rate": 9.780124278143396e-05, + "loss": 3.2801, + "step": 8954 + }, + { + "epoch": 0.5559004283319884, + "grad_norm": 0.28040005153350794, + "learning_rate": 9.780018344850495e-05, + "loss": 3.2849, + "step": 8955 + }, + { + "epoch": 0.5559625054317462, + "grad_norm": 0.26066531073135474, + "learning_rate": 9.779912386619079e-05, + "loss": 3.2449, + "step": 8956 + }, + { + "epoch": 0.5560245825315041, + "grad_norm": 0.22272825749832903, + "learning_rate": 9.779806403449699e-05, + "loss": 3.3207, + "step": 8957 + }, + { + "epoch": 0.556086659631262, + "grad_norm": 1.0881682611813, + "learning_rate": 9.779700395342907e-05, + "loss": 3.2635, + "step": 8958 + }, + { + "epoch": 0.5561487367310199, + "grad_norm": 0.2776412040356196, + "learning_rate": 9.779594362299258e-05, + "loss": 3.1744, + "step": 8959 + }, + { + "epoch": 0.5562108138307779, + "grad_norm": 0.4936224289762186, + "learning_rate": 9.779488304319306e-05, + "loss": 3.2372, + "step": 8960 + }, + { + "epoch": 0.5562728909305358, + "grad_norm": 0.36065185770814767, + "learning_rate": 9.779382221403604e-05, + "loss": 3.2989, + "step": 8961 + }, + { + "epoch": 0.5563349680302936, + "grad_norm": 0.3670058511298765, + "learning_rate": 9.779276113552702e-05, + "loss": 3.2077, + "step": 8962 + }, + { + "epoch": 0.5563970451300515, + "grad_norm": 0.3382639724160343, + "learning_rate": 9.779169980767158e-05, + "loss": 3.327, + "step": 8963 + }, + { + "epoch": 0.5564591222298094, + "grad_norm": 0.34640311370930255, + "learning_rate": 9.779063823047525e-05, + "loss": 3.22, + "step": 8964 + }, + { + "epoch": 0.5565211993295673, + "grad_norm": 0.2602806006048031, + "learning_rate": 9.778957640394355e-05, + "loss": 3.2752, + "step": 8965 + }, + { + "epoch": 0.5565832764293253, + "grad_norm": 0.3755192936068999, + "learning_rate": 9.778851432808203e-05, + "loss": 3.2086, + "step": 8966 + }, + { + "epoch": 0.5566453535290832, + "grad_norm": 0.22396075262197204, + "learning_rate": 9.77874520028962e-05, + "loss": 3.2458, + "step": 8967 + }, + { + "epoch": 0.556707430628841, + "grad_norm": 0.2232570740015807, + "learning_rate": 9.778638942839166e-05, + "loss": 3.2003, + "step": 8968 + }, + { + "epoch": 0.5567695077285989, + "grad_norm": 0.24715137282790342, + "learning_rate": 9.778532660457394e-05, + "loss": 3.2605, + "step": 8969 + }, + { + "epoch": 0.5568315848283568, + "grad_norm": 0.5844296445134031, + "learning_rate": 9.778426353144856e-05, + "loss": 3.3442, + "step": 8970 + }, + { + "epoch": 0.5568936619281147, + "grad_norm": 0.29272585654609473, + "learning_rate": 9.778320020902107e-05, + "loss": 3.267, + "step": 8971 + }, + { + "epoch": 0.5569557390278727, + "grad_norm": 0.24946218995619177, + "learning_rate": 9.778213663729702e-05, + "loss": 3.2219, + "step": 8972 + }, + { + "epoch": 0.5570178161276306, + "grad_norm": 0.24145389784166704, + "learning_rate": 9.778107281628197e-05, + "loss": 3.282, + "step": 8973 + }, + { + "epoch": 0.5570798932273884, + "grad_norm": 0.23725699114171367, + "learning_rate": 9.778000874598147e-05, + "loss": 3.2377, + "step": 8974 + }, + { + "epoch": 0.5571419703271463, + "grad_norm": 0.20612708171420208, + "learning_rate": 9.777894442640105e-05, + "loss": 3.1735, + "step": 8975 + }, + { + "epoch": 0.5572040474269042, + "grad_norm": 0.2855123621674273, + "learning_rate": 9.77778798575463e-05, + "loss": 3.2683, + "step": 8976 + }, + { + "epoch": 0.5572661245266621, + "grad_norm": 0.23342627195755503, + "learning_rate": 9.777681503942273e-05, + "loss": 3.339, + "step": 8977 + }, + { + "epoch": 0.55732820162642, + "grad_norm": 0.3158641574789109, + "learning_rate": 9.777574997203594e-05, + "loss": 3.2139, + "step": 8978 + }, + { + "epoch": 0.557390278726178, + "grad_norm": 0.22065567249157153, + "learning_rate": 9.777468465539145e-05, + "loss": 3.3048, + "step": 8979 + }, + { + "epoch": 0.5574523558259358, + "grad_norm": 0.3098665017851186, + "learning_rate": 9.777361908949483e-05, + "loss": 3.1584, + "step": 8980 + }, + { + "epoch": 0.5575144329256937, + "grad_norm": 0.21751917548988434, + "learning_rate": 9.777255327435164e-05, + "loss": 3.2408, + "step": 8981 + }, + { + "epoch": 0.5575765100254516, + "grad_norm": 0.25587920483338156, + "learning_rate": 9.777148720996746e-05, + "loss": 3.2901, + "step": 8982 + }, + { + "epoch": 0.5576385871252095, + "grad_norm": 0.42705560415856364, + "learning_rate": 9.777042089634783e-05, + "loss": 3.2698, + "step": 8983 + }, + { + "epoch": 0.5577006642249674, + "grad_norm": 0.19400707771107742, + "learning_rate": 9.77693543334983e-05, + "loss": 3.2824, + "step": 8984 + }, + { + "epoch": 0.5577627413247254, + "grad_norm": 0.29460232835757255, + "learning_rate": 9.776828752142446e-05, + "loss": 3.2916, + "step": 8985 + }, + { + "epoch": 0.5578248184244832, + "grad_norm": 0.19689626389671908, + "learning_rate": 9.776722046013187e-05, + "loss": 3.3178, + "step": 8986 + }, + { + "epoch": 0.5578868955242411, + "grad_norm": 0.2585787800492388, + "learning_rate": 9.776615314962609e-05, + "loss": 3.2516, + "step": 8987 + }, + { + "epoch": 0.557948972623999, + "grad_norm": 0.506437611843164, + "learning_rate": 9.77650855899127e-05, + "loss": 3.2694, + "step": 8988 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.22042394382822564, + "learning_rate": 9.776401778099728e-05, + "loss": 3.2839, + "step": 8989 + }, + { + "epoch": 0.5580731268235148, + "grad_norm": 0.3909030878120717, + "learning_rate": 9.776294972288535e-05, + "loss": 3.2481, + "step": 8990 + }, + { + "epoch": 0.5581352039232728, + "grad_norm": 0.26844657802663724, + "learning_rate": 9.776188141558253e-05, + "loss": 3.1343, + "step": 8991 + }, + { + "epoch": 0.5581972810230306, + "grad_norm": 0.27352763461496876, + "learning_rate": 9.776081285909439e-05, + "loss": 3.2876, + "step": 8992 + }, + { + "epoch": 0.5582593581227885, + "grad_norm": 0.27929648819476466, + "learning_rate": 9.77597440534265e-05, + "loss": 3.2714, + "step": 8993 + }, + { + "epoch": 0.5583214352225464, + "grad_norm": 0.2958817052359407, + "learning_rate": 9.77586749985844e-05, + "loss": 3.2052, + "step": 8994 + }, + { + "epoch": 0.5583835123223043, + "grad_norm": 0.2409405756851783, + "learning_rate": 9.775760569457373e-05, + "loss": 3.281, + "step": 8995 + }, + { + "epoch": 0.5584455894220622, + "grad_norm": 0.24178992987104353, + "learning_rate": 9.775653614140002e-05, + "loss": 3.2717, + "step": 8996 + }, + { + "epoch": 0.55850766652182, + "grad_norm": 0.20464308781047338, + "learning_rate": 9.775546633906888e-05, + "loss": 3.3325, + "step": 8997 + }, + { + "epoch": 0.558569743621578, + "grad_norm": 0.18993766996734435, + "learning_rate": 9.775439628758588e-05, + "loss": 3.2036, + "step": 8998 + }, + { + "epoch": 0.5586318207213359, + "grad_norm": 0.1971698849694981, + "learning_rate": 9.77533259869566e-05, + "loss": 3.2976, + "step": 8999 + }, + { + "epoch": 0.5586938978210938, + "grad_norm": 0.1826539945993443, + "learning_rate": 9.77522554371866e-05, + "loss": 3.3544, + "step": 9000 + }, + { + "epoch": 0.5587559749208517, + "grad_norm": 0.2067822992875571, + "learning_rate": 9.775118463828152e-05, + "loss": 3.2913, + "step": 9001 + }, + { + "epoch": 0.5588180520206096, + "grad_norm": 0.1899001069267227, + "learning_rate": 9.77501135902469e-05, + "loss": 3.1799, + "step": 9002 + }, + { + "epoch": 0.5588801291203674, + "grad_norm": 0.23146975263489414, + "learning_rate": 9.774904229308836e-05, + "loss": 3.1645, + "step": 9003 + }, + { + "epoch": 0.5589422062201254, + "grad_norm": 0.1751806116900172, + "learning_rate": 9.774797074681146e-05, + "loss": 3.2655, + "step": 9004 + }, + { + "epoch": 0.5590042833198833, + "grad_norm": 0.22997669451748942, + "learning_rate": 9.774689895142181e-05, + "loss": 3.1955, + "step": 9005 + }, + { + "epoch": 0.5590663604196412, + "grad_norm": 0.25305682194266466, + "learning_rate": 9.7745826906925e-05, + "loss": 3.2042, + "step": 9006 + }, + { + "epoch": 0.5591284375193991, + "grad_norm": 0.23622504263958327, + "learning_rate": 9.774475461332664e-05, + "loss": 3.1624, + "step": 9007 + }, + { + "epoch": 0.559190514619157, + "grad_norm": 0.19660276520602024, + "learning_rate": 9.774368207063228e-05, + "loss": 3.2241, + "step": 9008 + }, + { + "epoch": 0.5592525917189148, + "grad_norm": 0.2185312863691778, + "learning_rate": 9.774260927884755e-05, + "loss": 3.2143, + "step": 9009 + }, + { + "epoch": 0.5593146688186728, + "grad_norm": 0.20972574945421188, + "learning_rate": 9.774153623797804e-05, + "loss": 3.292, + "step": 9010 + }, + { + "epoch": 0.5593767459184307, + "grad_norm": 0.20497941866056416, + "learning_rate": 9.774046294802933e-05, + "loss": 3.3082, + "step": 9011 + }, + { + "epoch": 0.5594388230181886, + "grad_norm": 0.40908103973712906, + "learning_rate": 9.773938940900704e-05, + "loss": 3.1652, + "step": 9012 + }, + { + "epoch": 0.5595009001179465, + "grad_norm": 0.329429101246721, + "learning_rate": 9.773831562091678e-05, + "loss": 3.2286, + "step": 9013 + }, + { + "epoch": 0.5595629772177044, + "grad_norm": 0.2195896654705955, + "learning_rate": 9.773724158376413e-05, + "loss": 3.0924, + "step": 9014 + }, + { + "epoch": 0.5596250543174622, + "grad_norm": 0.25884312478864674, + "learning_rate": 9.77361672975547e-05, + "loss": 3.1824, + "step": 9015 + }, + { + "epoch": 0.5596871314172202, + "grad_norm": 0.28810997555548745, + "learning_rate": 9.77350927622941e-05, + "loss": 3.3005, + "step": 9016 + }, + { + "epoch": 0.5597492085169781, + "grad_norm": 0.3621706444715976, + "learning_rate": 9.773401797798794e-05, + "loss": 3.3482, + "step": 9017 + }, + { + "epoch": 0.559811285616736, + "grad_norm": 0.2912603411864664, + "learning_rate": 9.773294294464181e-05, + "loss": 3.2503, + "step": 9018 + }, + { + "epoch": 0.5598733627164939, + "grad_norm": 0.2795097366890741, + "learning_rate": 9.773186766226132e-05, + "loss": 3.2763, + "step": 9019 + }, + { + "epoch": 0.5599354398162518, + "grad_norm": 0.33393426708874796, + "learning_rate": 9.773079213085211e-05, + "loss": 3.185, + "step": 9020 + }, + { + "epoch": 0.5599975169160096, + "grad_norm": 0.23944485418277145, + "learning_rate": 9.772971635041976e-05, + "loss": 3.2377, + "step": 9021 + }, + { + "epoch": 0.5600595940157675, + "grad_norm": 0.2480109631831996, + "learning_rate": 9.77286403209699e-05, + "loss": 3.2576, + "step": 9022 + }, + { + "epoch": 0.5601216711155255, + "grad_norm": 0.283832219643199, + "learning_rate": 9.772756404250813e-05, + "loss": 3.2209, + "step": 9023 + }, + { + "epoch": 0.5601837482152834, + "grad_norm": 0.27597900195167363, + "learning_rate": 9.772648751504008e-05, + "loss": 3.1885, + "step": 9024 + }, + { + "epoch": 0.5602458253150413, + "grad_norm": 0.2983129960636831, + "learning_rate": 9.772541073857135e-05, + "loss": 3.3117, + "step": 9025 + }, + { + "epoch": 0.5603079024147992, + "grad_norm": 0.20094613492752977, + "learning_rate": 9.772433371310756e-05, + "loss": 3.2211, + "step": 9026 + }, + { + "epoch": 0.560369979514557, + "grad_norm": 0.2271565180855771, + "learning_rate": 9.772325643865434e-05, + "loss": 3.2224, + "step": 9027 + }, + { + "epoch": 0.5604320566143149, + "grad_norm": 0.5528696386675452, + "learning_rate": 9.772217891521731e-05, + "loss": 3.3234, + "step": 9028 + }, + { + "epoch": 0.5604941337140729, + "grad_norm": 0.32795032892930404, + "learning_rate": 9.772110114280208e-05, + "loss": 3.1545, + "step": 9029 + }, + { + "epoch": 0.5605562108138308, + "grad_norm": 0.4404344579603239, + "learning_rate": 9.772002312141431e-05, + "loss": 3.2471, + "step": 9030 + }, + { + "epoch": 0.5606182879135887, + "grad_norm": 0.2631663952022366, + "learning_rate": 9.771894485105956e-05, + "loss": 3.2579, + "step": 9031 + }, + { + "epoch": 0.5606803650133466, + "grad_norm": 0.3249429749014571, + "learning_rate": 9.771786633174351e-05, + "loss": 3.1885, + "step": 9032 + }, + { + "epoch": 0.5607424421131044, + "grad_norm": 0.23557925784572448, + "learning_rate": 9.771678756347175e-05, + "loss": 3.1456, + "step": 9033 + }, + { + "epoch": 0.5608045192128623, + "grad_norm": 0.2758541264840911, + "learning_rate": 9.771570854624993e-05, + "loss": 3.2068, + "step": 9034 + }, + { + "epoch": 0.5608665963126203, + "grad_norm": 0.30155171142807846, + "learning_rate": 9.77146292800837e-05, + "loss": 3.2479, + "step": 9035 + }, + { + "epoch": 0.5609286734123782, + "grad_norm": 0.2683095888601456, + "learning_rate": 9.771354976497864e-05, + "loss": 3.2652, + "step": 9036 + }, + { + "epoch": 0.5609907505121361, + "grad_norm": 0.19754375075784628, + "learning_rate": 9.771247000094042e-05, + "loss": 3.254, + "step": 9037 + }, + { + "epoch": 0.561052827611894, + "grad_norm": 0.3129941133381291, + "learning_rate": 9.771138998797465e-05, + "loss": 3.278, + "step": 9038 + }, + { + "epoch": 0.5611149047116518, + "grad_norm": 0.25249722599639723, + "learning_rate": 9.771030972608699e-05, + "loss": 3.263, + "step": 9039 + }, + { + "epoch": 0.5611769818114097, + "grad_norm": 0.2592875879135391, + "learning_rate": 9.770922921528305e-05, + "loss": 3.0994, + "step": 9040 + }, + { + "epoch": 0.5612390589111677, + "grad_norm": 0.25224513497873735, + "learning_rate": 9.770814845556848e-05, + "loss": 3.2417, + "step": 9041 + }, + { + "epoch": 0.5613011360109256, + "grad_norm": 0.24814812403528397, + "learning_rate": 9.770706744694892e-05, + "loss": 3.2505, + "step": 9042 + }, + { + "epoch": 0.5613632131106835, + "grad_norm": 0.22202399482926902, + "learning_rate": 9.770598618943002e-05, + "loss": 3.2793, + "step": 9043 + }, + { + "epoch": 0.5614252902104414, + "grad_norm": 0.26925351857655244, + "learning_rate": 9.770490468301741e-05, + "loss": 3.2338, + "step": 9044 + }, + { + "epoch": 0.5614873673101992, + "grad_norm": 0.21003198530502346, + "learning_rate": 9.770382292771672e-05, + "loss": 3.3227, + "step": 9045 + }, + { + "epoch": 0.5615494444099571, + "grad_norm": 0.26795846124142886, + "learning_rate": 9.77027409235336e-05, + "loss": 3.2474, + "step": 9046 + }, + { + "epoch": 0.561611521509715, + "grad_norm": 0.22522909667002491, + "learning_rate": 9.77016586704737e-05, + "loss": 3.1609, + "step": 9047 + }, + { + "epoch": 0.561673598609473, + "grad_norm": 0.215224743193133, + "learning_rate": 9.770057616854268e-05, + "loss": 3.2294, + "step": 9048 + }, + { + "epoch": 0.5617356757092309, + "grad_norm": 0.21185533087644948, + "learning_rate": 9.769949341774617e-05, + "loss": 3.1477, + "step": 9049 + }, + { + "epoch": 0.5617977528089888, + "grad_norm": 0.49473513397369656, + "learning_rate": 9.769841041808983e-05, + "loss": 3.1964, + "step": 9050 + }, + { + "epoch": 0.5618598299087466, + "grad_norm": 0.3003679383519757, + "learning_rate": 9.76973271695793e-05, + "loss": 3.3243, + "step": 9051 + }, + { + "epoch": 0.5619219070085045, + "grad_norm": 0.23087000265687047, + "learning_rate": 9.769624367222021e-05, + "loss": 3.2945, + "step": 9052 + }, + { + "epoch": 0.5619839841082624, + "grad_norm": 0.2122120146993528, + "learning_rate": 9.769515992601825e-05, + "loss": 3.2926, + "step": 9053 + }, + { + "epoch": 0.5620460612080204, + "grad_norm": 0.23467301654188788, + "learning_rate": 9.769407593097908e-05, + "loss": 3.2027, + "step": 9054 + }, + { + "epoch": 0.5621081383077783, + "grad_norm": 0.25835073525257884, + "learning_rate": 9.769299168710833e-05, + "loss": 3.285, + "step": 9055 + }, + { + "epoch": 0.5621702154075362, + "grad_norm": 0.2730154185805465, + "learning_rate": 9.769190719441166e-05, + "loss": 3.2977, + "step": 9056 + }, + { + "epoch": 0.562232292507294, + "grad_norm": 0.19295145605948505, + "learning_rate": 9.769082245289474e-05, + "loss": 3.2006, + "step": 9057 + }, + { + "epoch": 0.5622943696070519, + "grad_norm": 0.21964223303080377, + "learning_rate": 9.768973746256322e-05, + "loss": 3.3281, + "step": 9058 + }, + { + "epoch": 0.5623564467068098, + "grad_norm": 0.2347384971902176, + "learning_rate": 9.768865222342276e-05, + "loss": 3.2808, + "step": 9059 + }, + { + "epoch": 0.5624185238065678, + "grad_norm": 0.3179444142057165, + "learning_rate": 9.768756673547902e-05, + "loss": 3.1048, + "step": 9060 + }, + { + "epoch": 0.5624806009063257, + "grad_norm": 0.19166745638339125, + "learning_rate": 9.768648099873768e-05, + "loss": 3.1566, + "step": 9061 + }, + { + "epoch": 0.5625426780060836, + "grad_norm": 0.20305597526350652, + "learning_rate": 9.768539501320437e-05, + "loss": 3.3073, + "step": 9062 + }, + { + "epoch": 0.5626047551058414, + "grad_norm": 0.20458714886258392, + "learning_rate": 9.76843087788848e-05, + "loss": 3.3107, + "step": 9063 + }, + { + "epoch": 0.5626668322055993, + "grad_norm": 0.1899411414764834, + "learning_rate": 9.768322229578461e-05, + "loss": 3.1951, + "step": 9064 + }, + { + "epoch": 0.5627289093053572, + "grad_norm": 0.24890886550841407, + "learning_rate": 9.768213556390945e-05, + "loss": 3.2716, + "step": 9065 + }, + { + "epoch": 0.5627909864051152, + "grad_norm": 0.22046147679876985, + "learning_rate": 9.768104858326506e-05, + "loss": 3.234, + "step": 9066 + }, + { + "epoch": 0.5628530635048731, + "grad_norm": 0.2833833844257427, + "learning_rate": 9.767996135385703e-05, + "loss": 3.2804, + "step": 9067 + }, + { + "epoch": 0.562915140604631, + "grad_norm": 0.3078193968760263, + "learning_rate": 9.767887387569108e-05, + "loss": 3.2069, + "step": 9068 + }, + { + "epoch": 0.5629772177043888, + "grad_norm": 0.2567883726684808, + "learning_rate": 9.767778614877286e-05, + "loss": 3.1901, + "step": 9069 + }, + { + "epoch": 0.5630392948041467, + "grad_norm": 0.23019642824284176, + "learning_rate": 9.767669817310808e-05, + "loss": 3.1608, + "step": 9070 + }, + { + "epoch": 0.5631013719039046, + "grad_norm": 0.27148492387674955, + "learning_rate": 9.767560994870236e-05, + "loss": 3.2112, + "step": 9071 + }, + { + "epoch": 0.5631634490036626, + "grad_norm": 0.3128337032056467, + "learning_rate": 9.767452147556142e-05, + "loss": 3.3101, + "step": 9072 + }, + { + "epoch": 0.5632255261034205, + "grad_norm": 0.27038447463726595, + "learning_rate": 9.767343275369094e-05, + "loss": 3.2506, + "step": 9073 + }, + { + "epoch": 0.5632876032031784, + "grad_norm": 0.24588868513455342, + "learning_rate": 9.767234378309658e-05, + "loss": 3.1887, + "step": 9074 + }, + { + "epoch": 0.5633496803029362, + "grad_norm": 0.2224803949542991, + "learning_rate": 9.767125456378403e-05, + "loss": 3.1957, + "step": 9075 + }, + { + "epoch": 0.5634117574026941, + "grad_norm": 0.20985754261717932, + "learning_rate": 9.767016509575897e-05, + "loss": 3.2578, + "step": 9076 + }, + { + "epoch": 0.563473834502452, + "grad_norm": 0.1859797097228064, + "learning_rate": 9.766907537902709e-05, + "loss": 3.2677, + "step": 9077 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.2454514902434274, + "learning_rate": 9.766798541359407e-05, + "loss": 3.2672, + "step": 9078 + }, + { + "epoch": 0.5635979887019679, + "grad_norm": 0.19950815758722162, + "learning_rate": 9.766689519946559e-05, + "loss": 3.1949, + "step": 9079 + }, + { + "epoch": 0.5636600658017258, + "grad_norm": 0.2414719717443486, + "learning_rate": 9.766580473664736e-05, + "loss": 3.2583, + "step": 9080 + }, + { + "epoch": 0.5637221429014836, + "grad_norm": 0.2923727840316775, + "learning_rate": 9.766471402514506e-05, + "loss": 3.2875, + "step": 9081 + }, + { + "epoch": 0.5637842200012415, + "grad_norm": 0.2104243111087345, + "learning_rate": 9.766362306496434e-05, + "loss": 3.1808, + "step": 9082 + }, + { + "epoch": 0.5638462971009994, + "grad_norm": 0.1914698339456191, + "learning_rate": 9.766253185611097e-05, + "loss": 3.3004, + "step": 9083 + }, + { + "epoch": 0.5639083742007573, + "grad_norm": 0.2096414930438389, + "learning_rate": 9.766144039859057e-05, + "loss": 3.2846, + "step": 9084 + }, + { + "epoch": 0.5639704513005153, + "grad_norm": 0.3702854589254781, + "learning_rate": 9.76603486924089e-05, + "loss": 3.1874, + "step": 9085 + }, + { + "epoch": 0.5640325284002732, + "grad_norm": 0.2073325986960966, + "learning_rate": 9.765925673757159e-05, + "loss": 3.2623, + "step": 9086 + }, + { + "epoch": 0.564094605500031, + "grad_norm": 0.1879383367602918, + "learning_rate": 9.765816453408439e-05, + "loss": 3.1874, + "step": 9087 + }, + { + "epoch": 0.5641566825997889, + "grad_norm": 0.25753292183460097, + "learning_rate": 9.765707208195296e-05, + "loss": 3.2748, + "step": 9088 + }, + { + "epoch": 0.5642187596995468, + "grad_norm": 0.1719842314412407, + "learning_rate": 9.765597938118302e-05, + "loss": 3.2762, + "step": 9089 + }, + { + "epoch": 0.5642808367993047, + "grad_norm": 0.23815146022625672, + "learning_rate": 9.765488643178027e-05, + "loss": 3.3251, + "step": 9090 + }, + { + "epoch": 0.5643429138990627, + "grad_norm": 0.21869089826814622, + "learning_rate": 9.76537932337504e-05, + "loss": 3.3614, + "step": 9091 + }, + { + "epoch": 0.5644049909988206, + "grad_norm": 0.21272979575897774, + "learning_rate": 9.765269978709912e-05, + "loss": 3.2752, + "step": 9092 + }, + { + "epoch": 0.5644670680985784, + "grad_norm": 0.2512013360242334, + "learning_rate": 9.765160609183215e-05, + "loss": 3.2578, + "step": 9093 + }, + { + "epoch": 0.5645291451983363, + "grad_norm": 0.19889265373550477, + "learning_rate": 9.765051214795518e-05, + "loss": 3.3167, + "step": 9094 + }, + { + "epoch": 0.5645912222980942, + "grad_norm": 0.2512174111197846, + "learning_rate": 9.764941795547391e-05, + "loss": 3.2198, + "step": 9095 + }, + { + "epoch": 0.5646532993978521, + "grad_norm": 0.23740923113229773, + "learning_rate": 9.764832351439408e-05, + "loss": 3.1498, + "step": 9096 + }, + { + "epoch": 0.56471537649761, + "grad_norm": 0.2179212282531817, + "learning_rate": 9.764722882472137e-05, + "loss": 3.2201, + "step": 9097 + }, + { + "epoch": 0.564777453597368, + "grad_norm": 0.20849282398384736, + "learning_rate": 9.76461338864615e-05, + "loss": 3.2647, + "step": 9098 + }, + { + "epoch": 0.5648395306971258, + "grad_norm": 0.23729766238767616, + "learning_rate": 9.764503869962017e-05, + "loss": 3.2393, + "step": 9099 + }, + { + "epoch": 0.5649016077968837, + "grad_norm": 0.22723278011572057, + "learning_rate": 9.764394326420311e-05, + "loss": 3.2816, + "step": 9100 + }, + { + "epoch": 0.5649636848966416, + "grad_norm": 0.4599438803844897, + "learning_rate": 9.764284758021605e-05, + "loss": 3.2689, + "step": 9101 + }, + { + "epoch": 0.5650257619963995, + "grad_norm": 0.2777920310350539, + "learning_rate": 9.764175164766466e-05, + "loss": 3.1696, + "step": 9102 + }, + { + "epoch": 0.5650878390961575, + "grad_norm": 0.20759758771535797, + "learning_rate": 9.764065546655472e-05, + "loss": 3.3407, + "step": 9103 + }, + { + "epoch": 0.5651499161959154, + "grad_norm": 0.2871609596051385, + "learning_rate": 9.76395590368919e-05, + "loss": 3.2253, + "step": 9104 + }, + { + "epoch": 0.5652119932956732, + "grad_norm": 0.23340980948991136, + "learning_rate": 9.763846235868194e-05, + "loss": 3.2679, + "step": 9105 + }, + { + "epoch": 0.5652740703954311, + "grad_norm": 0.31446957989400937, + "learning_rate": 9.763736543193055e-05, + "loss": 3.1934, + "step": 9106 + }, + { + "epoch": 0.565336147495189, + "grad_norm": 0.2056350231046759, + "learning_rate": 9.763626825664346e-05, + "loss": 3.2093, + "step": 9107 + }, + { + "epoch": 0.5653982245949469, + "grad_norm": 0.22722459928317962, + "learning_rate": 9.763517083282641e-05, + "loss": 3.2897, + "step": 9108 + }, + { + "epoch": 0.5654603016947048, + "grad_norm": 0.35675263416437863, + "learning_rate": 9.763407316048508e-05, + "loss": 3.3105, + "step": 9109 + }, + { + "epoch": 0.5655223787944628, + "grad_norm": 0.25648856513707924, + "learning_rate": 9.763297523962525e-05, + "loss": 3.1996, + "step": 9110 + }, + { + "epoch": 0.5655844558942206, + "grad_norm": 0.26179595776348574, + "learning_rate": 9.763187707025262e-05, + "loss": 3.2039, + "step": 9111 + }, + { + "epoch": 0.5656465329939785, + "grad_norm": 0.22243530621455693, + "learning_rate": 9.763077865237292e-05, + "loss": 3.1396, + "step": 9112 + }, + { + "epoch": 0.5657086100937364, + "grad_norm": 0.43714502102318975, + "learning_rate": 9.762967998599188e-05, + "loss": 3.2108, + "step": 9113 + }, + { + "epoch": 0.5657706871934943, + "grad_norm": 0.26318659453371024, + "learning_rate": 9.762858107111525e-05, + "loss": 3.2802, + "step": 9114 + }, + { + "epoch": 0.5658327642932522, + "grad_norm": 0.31168571717812005, + "learning_rate": 9.762748190774875e-05, + "loss": 3.2224, + "step": 9115 + }, + { + "epoch": 0.5658948413930102, + "grad_norm": 0.3507995486000195, + "learning_rate": 9.76263824958981e-05, + "loss": 3.1831, + "step": 9116 + }, + { + "epoch": 0.565956918492768, + "grad_norm": 0.26024464488772403, + "learning_rate": 9.762528283556907e-05, + "loss": 3.3412, + "step": 9117 + }, + { + "epoch": 0.5660189955925259, + "grad_norm": 0.3425755704047237, + "learning_rate": 9.762418292676736e-05, + "loss": 3.2733, + "step": 9118 + }, + { + "epoch": 0.5660810726922838, + "grad_norm": 0.2136069417059689, + "learning_rate": 9.762308276949874e-05, + "loss": 3.2786, + "step": 9119 + }, + { + "epoch": 0.5661431497920417, + "grad_norm": 0.2913454222409683, + "learning_rate": 9.762198236376894e-05, + "loss": 3.2289, + "step": 9120 + }, + { + "epoch": 0.5662052268917996, + "grad_norm": 0.2346796519418731, + "learning_rate": 9.762088170958368e-05, + "loss": 3.242, + "step": 9121 + }, + { + "epoch": 0.5662673039915576, + "grad_norm": 0.2569200501303199, + "learning_rate": 9.761978080694873e-05, + "loss": 3.2355, + "step": 9122 + }, + { + "epoch": 0.5663293810913154, + "grad_norm": 0.34151215820116676, + "learning_rate": 9.761867965586983e-05, + "loss": 3.164, + "step": 9123 + }, + { + "epoch": 0.5663914581910733, + "grad_norm": 0.25774843691565585, + "learning_rate": 9.761757825635271e-05, + "loss": 3.2994, + "step": 9124 + }, + { + "epoch": 0.5664535352908312, + "grad_norm": 0.25832453367925295, + "learning_rate": 9.761647660840312e-05, + "loss": 3.266, + "step": 9125 + }, + { + "epoch": 0.5665156123905891, + "grad_norm": 0.23632416840070716, + "learning_rate": 9.761537471202682e-05, + "loss": 3.2196, + "step": 9126 + }, + { + "epoch": 0.566577689490347, + "grad_norm": 0.22749346984409635, + "learning_rate": 9.761427256722955e-05, + "loss": 3.1374, + "step": 9127 + }, + { + "epoch": 0.566639766590105, + "grad_norm": 0.25749448467107144, + "learning_rate": 9.761317017401705e-05, + "loss": 3.2466, + "step": 9128 + }, + { + "epoch": 0.5667018436898628, + "grad_norm": 0.3875052228698688, + "learning_rate": 9.761206753239511e-05, + "loss": 3.2302, + "step": 9129 + }, + { + "epoch": 0.5667639207896207, + "grad_norm": 0.23937076315852868, + "learning_rate": 9.761096464236942e-05, + "loss": 3.2549, + "step": 9130 + }, + { + "epoch": 0.5668259978893786, + "grad_norm": 0.22733546631918025, + "learning_rate": 9.76098615039458e-05, + "loss": 3.263, + "step": 9131 + }, + { + "epoch": 0.5668880749891365, + "grad_norm": 0.19612768761025393, + "learning_rate": 9.760875811712995e-05, + "loss": 3.2006, + "step": 9132 + }, + { + "epoch": 0.5669501520888944, + "grad_norm": 0.19638010067935333, + "learning_rate": 9.760765448192768e-05, + "loss": 3.2141, + "step": 9133 + }, + { + "epoch": 0.5670122291886524, + "grad_norm": 0.2431346844923803, + "learning_rate": 9.76065505983447e-05, + "loss": 3.2099, + "step": 9134 + }, + { + "epoch": 0.5670743062884102, + "grad_norm": 0.3015592841732932, + "learning_rate": 9.760544646638678e-05, + "loss": 3.2083, + "step": 9135 + }, + { + "epoch": 0.5671363833881681, + "grad_norm": 0.256766103090946, + "learning_rate": 9.760434208605971e-05, + "loss": 3.2065, + "step": 9136 + }, + { + "epoch": 0.567198460487926, + "grad_norm": 0.29408748648251043, + "learning_rate": 9.760323745736922e-05, + "loss": 3.3393, + "step": 9137 + }, + { + "epoch": 0.5672605375876839, + "grad_norm": 0.2636951860083922, + "learning_rate": 9.760213258032109e-05, + "loss": 3.2184, + "step": 9138 + }, + { + "epoch": 0.5673226146874418, + "grad_norm": 0.23710895048100333, + "learning_rate": 9.760102745492108e-05, + "loss": 3.2493, + "step": 9139 + }, + { + "epoch": 0.5673846917871997, + "grad_norm": 0.2197625957189375, + "learning_rate": 9.759992208117495e-05, + "loss": 3.2103, + "step": 9140 + }, + { + "epoch": 0.5674467688869576, + "grad_norm": 0.1992488632630406, + "learning_rate": 9.759881645908846e-05, + "loss": 3.1689, + "step": 9141 + }, + { + "epoch": 0.5675088459867155, + "grad_norm": 0.21403829270643634, + "learning_rate": 9.75977105886674e-05, + "loss": 3.2249, + "step": 9142 + }, + { + "epoch": 0.5675709230864734, + "grad_norm": 0.3385512228375076, + "learning_rate": 9.759660446991753e-05, + "loss": 3.3307, + "step": 9143 + }, + { + "epoch": 0.5676330001862313, + "grad_norm": 0.1909733470007015, + "learning_rate": 9.759549810284462e-05, + "loss": 3.2634, + "step": 9144 + }, + { + "epoch": 0.5676950772859892, + "grad_norm": 0.27380500347565373, + "learning_rate": 9.759439148745443e-05, + "loss": 3.2425, + "step": 9145 + }, + { + "epoch": 0.5677571543857471, + "grad_norm": 0.18814678693314346, + "learning_rate": 9.759328462375276e-05, + "loss": 3.2012, + "step": 9146 + }, + { + "epoch": 0.567819231485505, + "grad_norm": 0.22624487645844554, + "learning_rate": 9.759217751174536e-05, + "loss": 3.2447, + "step": 9147 + }, + { + "epoch": 0.5678813085852629, + "grad_norm": 0.21116002143053766, + "learning_rate": 9.759107015143804e-05, + "loss": 3.2393, + "step": 9148 + }, + { + "epoch": 0.5679433856850208, + "grad_norm": 0.36495542838719625, + "learning_rate": 9.758996254283654e-05, + "loss": 3.1743, + "step": 9149 + }, + { + "epoch": 0.5680054627847787, + "grad_norm": 0.22087687008481094, + "learning_rate": 9.758885468594664e-05, + "loss": 3.2285, + "step": 9150 + }, + { + "epoch": 0.5680675398845366, + "grad_norm": 0.3209280983607328, + "learning_rate": 9.758774658077414e-05, + "loss": 3.2075, + "step": 9151 + }, + { + "epoch": 0.5681296169842945, + "grad_norm": 0.26782322538302994, + "learning_rate": 9.758663822732483e-05, + "loss": 3.2159, + "step": 9152 + }, + { + "epoch": 0.5681916940840523, + "grad_norm": 0.2515017139848507, + "learning_rate": 9.758552962560445e-05, + "loss": 3.1761, + "step": 9153 + }, + { + "epoch": 0.5682537711838103, + "grad_norm": 0.23522330782272838, + "learning_rate": 9.758442077561882e-05, + "loss": 3.0927, + "step": 9154 + }, + { + "epoch": 0.5683158482835682, + "grad_norm": 0.2478054929683084, + "learning_rate": 9.758331167737372e-05, + "loss": 3.1633, + "step": 9155 + }, + { + "epoch": 0.5683779253833261, + "grad_norm": 0.2865302804703129, + "learning_rate": 9.758220233087494e-05, + "loss": 3.3536, + "step": 9156 + }, + { + "epoch": 0.568440002483084, + "grad_norm": 0.331248212755528, + "learning_rate": 9.758109273612824e-05, + "loss": 3.3315, + "step": 9157 + }, + { + "epoch": 0.5685020795828419, + "grad_norm": 0.23657174597526615, + "learning_rate": 9.757998289313945e-05, + "loss": 3.2949, + "step": 9158 + }, + { + "epoch": 0.5685641566825997, + "grad_norm": 0.2280707062817915, + "learning_rate": 9.757887280191431e-05, + "loss": 3.1843, + "step": 9159 + }, + { + "epoch": 0.5686262337823577, + "grad_norm": 0.21477202171771262, + "learning_rate": 9.757776246245867e-05, + "loss": 3.1976, + "step": 9160 + }, + { + "epoch": 0.5686883108821156, + "grad_norm": 0.19613872293525356, + "learning_rate": 9.757665187477827e-05, + "loss": 3.1691, + "step": 9161 + }, + { + "epoch": 0.5687503879818735, + "grad_norm": 0.24843009121961993, + "learning_rate": 9.757554103887894e-05, + "loss": 3.245, + "step": 9162 + }, + { + "epoch": 0.5688124650816314, + "grad_norm": 0.22722234499399366, + "learning_rate": 9.757442995476646e-05, + "loss": 3.176, + "step": 9163 + }, + { + "epoch": 0.5688745421813893, + "grad_norm": 0.19774297048213188, + "learning_rate": 9.757331862244663e-05, + "loss": 3.121, + "step": 9164 + }, + { + "epoch": 0.5689366192811471, + "grad_norm": 0.38519000356226707, + "learning_rate": 9.757220704192525e-05, + "loss": 3.1645, + "step": 9165 + }, + { + "epoch": 0.5689986963809051, + "grad_norm": 0.21672665257749588, + "learning_rate": 9.757109521320812e-05, + "loss": 3.2332, + "step": 9166 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.32400089436130947, + "learning_rate": 9.756998313630103e-05, + "loss": 3.1815, + "step": 9167 + }, + { + "epoch": 0.5691228505804209, + "grad_norm": 0.2838472210912191, + "learning_rate": 9.75688708112098e-05, + "loss": 3.1708, + "step": 9168 + }, + { + "epoch": 0.5691849276801788, + "grad_norm": 0.246335516352029, + "learning_rate": 9.75677582379402e-05, + "loss": 3.2927, + "step": 9169 + }, + { + "epoch": 0.5692470047799367, + "grad_norm": 0.3178505136417009, + "learning_rate": 9.75666454164981e-05, + "loss": 3.2522, + "step": 9170 + }, + { + "epoch": 0.5693090818796945, + "grad_norm": 0.28427954685690215, + "learning_rate": 9.756553234688923e-05, + "loss": 3.2892, + "step": 9171 + }, + { + "epoch": 0.5693711589794525, + "grad_norm": 0.2412882769687508, + "learning_rate": 9.756441902911943e-05, + "loss": 3.2009, + "step": 9172 + }, + { + "epoch": 0.5694332360792104, + "grad_norm": 0.21338090524675501, + "learning_rate": 9.756330546319452e-05, + "loss": 3.1888, + "step": 9173 + }, + { + "epoch": 0.5694953131789683, + "grad_norm": 0.2120641625061211, + "learning_rate": 9.75621916491203e-05, + "loss": 3.2012, + "step": 9174 + }, + { + "epoch": 0.5695573902787262, + "grad_norm": 0.2915922123814594, + "learning_rate": 9.756107758690258e-05, + "loss": 3.2551, + "step": 9175 + }, + { + "epoch": 0.5696194673784841, + "grad_norm": 0.17735902745657536, + "learning_rate": 9.755996327654717e-05, + "loss": 3.1525, + "step": 9176 + }, + { + "epoch": 0.5696815444782419, + "grad_norm": 0.2078315364034831, + "learning_rate": 9.755884871805989e-05, + "loss": 3.1829, + "step": 9177 + }, + { + "epoch": 0.5697436215779998, + "grad_norm": 0.2401150275164751, + "learning_rate": 9.755773391144654e-05, + "loss": 3.2957, + "step": 9178 + }, + { + "epoch": 0.5698056986777578, + "grad_norm": 0.2568436828141636, + "learning_rate": 9.755661885671296e-05, + "loss": 3.3094, + "step": 9179 + }, + { + "epoch": 0.5698677757775157, + "grad_norm": 0.24153353474262287, + "learning_rate": 9.755550355386493e-05, + "loss": 3.2854, + "step": 9180 + }, + { + "epoch": 0.5699298528772736, + "grad_norm": 0.24574317636019558, + "learning_rate": 9.755438800290832e-05, + "loss": 3.0547, + "step": 9181 + }, + { + "epoch": 0.5699919299770315, + "grad_norm": 0.20906032536890976, + "learning_rate": 9.755327220384892e-05, + "loss": 3.2217, + "step": 9182 + }, + { + "epoch": 0.5700540070767893, + "grad_norm": 0.17218824041141617, + "learning_rate": 9.755215615669255e-05, + "loss": 3.2393, + "step": 9183 + }, + { + "epoch": 0.5701160841765472, + "grad_norm": 0.22284589186866632, + "learning_rate": 9.755103986144505e-05, + "loss": 3.3252, + "step": 9184 + }, + { + "epoch": 0.5701781612763052, + "grad_norm": 0.21686138877150785, + "learning_rate": 9.754992331811222e-05, + "loss": 3.2432, + "step": 9185 + }, + { + "epoch": 0.5702402383760631, + "grad_norm": 0.23289503292351477, + "learning_rate": 9.754880652669989e-05, + "loss": 3.2967, + "step": 9186 + }, + { + "epoch": 0.570302315475821, + "grad_norm": 0.2558688970060664, + "learning_rate": 9.754768948721388e-05, + "loss": 3.337, + "step": 9187 + }, + { + "epoch": 0.5703643925755789, + "grad_norm": 0.18810765823327386, + "learning_rate": 9.754657219966007e-05, + "loss": 3.2181, + "step": 9188 + }, + { + "epoch": 0.5704264696753367, + "grad_norm": 0.229245869359084, + "learning_rate": 9.754545466404424e-05, + "loss": 3.2798, + "step": 9189 + }, + { + "epoch": 0.5704885467750946, + "grad_norm": 0.19850671905092715, + "learning_rate": 9.754433688037222e-05, + "loss": 3.2856, + "step": 9190 + }, + { + "epoch": 0.5705506238748526, + "grad_norm": 0.1864691250801949, + "learning_rate": 9.754321884864985e-05, + "loss": 3.2435, + "step": 9191 + }, + { + "epoch": 0.5706127009746105, + "grad_norm": 0.3433659206803674, + "learning_rate": 9.754210056888298e-05, + "loss": 3.2332, + "step": 9192 + }, + { + "epoch": 0.5706747780743684, + "grad_norm": 0.2945264729889024, + "learning_rate": 9.754098204107741e-05, + "loss": 3.2391, + "step": 9193 + }, + { + "epoch": 0.5707368551741263, + "grad_norm": 0.2498709748031226, + "learning_rate": 9.753986326523901e-05, + "loss": 3.3028, + "step": 9194 + }, + { + "epoch": 0.5707989322738841, + "grad_norm": 0.23796693990298254, + "learning_rate": 9.753874424137361e-05, + "loss": 3.2331, + "step": 9195 + }, + { + "epoch": 0.570861009373642, + "grad_norm": 0.24379903929701432, + "learning_rate": 9.753762496948702e-05, + "loss": 3.2384, + "step": 9196 + }, + { + "epoch": 0.5709230864734, + "grad_norm": 0.26942201142201727, + "learning_rate": 9.753650544958511e-05, + "loss": 3.2031, + "step": 9197 + }, + { + "epoch": 0.5709851635731579, + "grad_norm": 0.24128181559033574, + "learning_rate": 9.753538568167371e-05, + "loss": 3.2683, + "step": 9198 + }, + { + "epoch": 0.5710472406729158, + "grad_norm": 0.32394265288196455, + "learning_rate": 9.753426566575867e-05, + "loss": 3.2247, + "step": 9199 + }, + { + "epoch": 0.5711093177726737, + "grad_norm": 0.2626790463541544, + "learning_rate": 9.753314540184581e-05, + "loss": 3.2185, + "step": 9200 + }, + { + "epoch": 0.5711713948724315, + "grad_norm": 0.3072439199735326, + "learning_rate": 9.753202488994102e-05, + "loss": 3.281, + "step": 9201 + }, + { + "epoch": 0.5712334719721894, + "grad_norm": 0.2550888690175429, + "learning_rate": 9.753090413005011e-05, + "loss": 3.2377, + "step": 9202 + }, + { + "epoch": 0.5712955490719474, + "grad_norm": 0.30409636986132865, + "learning_rate": 9.752978312217891e-05, + "loss": 3.2571, + "step": 9203 + }, + { + "epoch": 0.5713576261717053, + "grad_norm": 0.23614319746153073, + "learning_rate": 9.752866186633331e-05, + "loss": 3.232, + "step": 9204 + }, + { + "epoch": 0.5714197032714632, + "grad_norm": 0.3011923865867647, + "learning_rate": 9.752754036251914e-05, + "loss": 3.2529, + "step": 9205 + }, + { + "epoch": 0.5714817803712211, + "grad_norm": 0.27436893404292284, + "learning_rate": 9.752641861074225e-05, + "loss": 3.2329, + "step": 9206 + }, + { + "epoch": 0.5715438574709789, + "grad_norm": 0.28088015504663805, + "learning_rate": 9.752529661100851e-05, + "loss": 3.0976, + "step": 9207 + }, + { + "epoch": 0.5716059345707368, + "grad_norm": 0.23875183788810467, + "learning_rate": 9.752417436332376e-05, + "loss": 3.2493, + "step": 9208 + }, + { + "epoch": 0.5716680116704947, + "grad_norm": 0.24406760840497585, + "learning_rate": 9.752305186769384e-05, + "loss": 3.2147, + "step": 9209 + }, + { + "epoch": 0.5717300887702527, + "grad_norm": 0.308357051418259, + "learning_rate": 9.752192912412463e-05, + "loss": 3.2416, + "step": 9210 + }, + { + "epoch": 0.5717921658700106, + "grad_norm": 0.2946525253220695, + "learning_rate": 9.752080613262197e-05, + "loss": 3.1875, + "step": 9211 + }, + { + "epoch": 0.5718542429697685, + "grad_norm": 0.2599083787849945, + "learning_rate": 9.751968289319176e-05, + "loss": 3.1385, + "step": 9212 + }, + { + "epoch": 0.5719163200695263, + "grad_norm": 0.3540701126802072, + "learning_rate": 9.75185594058398e-05, + "loss": 3.2335, + "step": 9213 + }, + { + "epoch": 0.5719783971692842, + "grad_norm": 0.3861017603489015, + "learning_rate": 9.751743567057199e-05, + "loss": 3.3235, + "step": 9214 + }, + { + "epoch": 0.5720404742690421, + "grad_norm": 0.42064978032003797, + "learning_rate": 9.751631168739418e-05, + "loss": 3.2242, + "step": 9215 + }, + { + "epoch": 0.5721025513688001, + "grad_norm": 0.41883927005796023, + "learning_rate": 9.751518745631222e-05, + "loss": 3.2903, + "step": 9216 + }, + { + "epoch": 0.572164628468558, + "grad_norm": 0.3375074606530011, + "learning_rate": 9.751406297733202e-05, + "loss": 3.32, + "step": 9217 + }, + { + "epoch": 0.5722267055683159, + "grad_norm": 0.31012495637314474, + "learning_rate": 9.751293825045942e-05, + "loss": 3.2152, + "step": 9218 + }, + { + "epoch": 0.5722887826680737, + "grad_norm": 0.2871948556092949, + "learning_rate": 9.751181327570027e-05, + "loss": 3.187, + "step": 9219 + }, + { + "epoch": 0.5723508597678316, + "grad_norm": 0.27946909683139476, + "learning_rate": 9.751068805306046e-05, + "loss": 3.2495, + "step": 9220 + }, + { + "epoch": 0.5724129368675895, + "grad_norm": 0.33957997312619026, + "learning_rate": 9.750956258254586e-05, + "loss": 3.1431, + "step": 9221 + }, + { + "epoch": 0.5724750139673475, + "grad_norm": 0.35945742118073837, + "learning_rate": 9.750843686416233e-05, + "loss": 3.2409, + "step": 9222 + }, + { + "epoch": 0.5725370910671054, + "grad_norm": 0.2975448404907741, + "learning_rate": 9.750731089791576e-05, + "loss": 3.2352, + "step": 9223 + }, + { + "epoch": 0.5725991681668633, + "grad_norm": 0.27372467764350206, + "learning_rate": 9.750618468381201e-05, + "loss": 3.183, + "step": 9224 + }, + { + "epoch": 0.5726612452666211, + "grad_norm": 0.3469456042355272, + "learning_rate": 9.750505822185696e-05, + "loss": 3.2201, + "step": 9225 + }, + { + "epoch": 0.572723322366379, + "grad_norm": 0.2559452495170244, + "learning_rate": 9.75039315120565e-05, + "loss": 3.2166, + "step": 9226 + }, + { + "epoch": 0.5727853994661369, + "grad_norm": 0.30010912470157425, + "learning_rate": 9.75028045544165e-05, + "loss": 3.3311, + "step": 9227 + }, + { + "epoch": 0.5728474765658949, + "grad_norm": 0.28886320357706025, + "learning_rate": 9.750167734894282e-05, + "loss": 3.2491, + "step": 9228 + }, + { + "epoch": 0.5729095536656528, + "grad_norm": 0.2642271341249036, + "learning_rate": 9.750054989564136e-05, + "loss": 3.1506, + "step": 9229 + }, + { + "epoch": 0.5729716307654107, + "grad_norm": 0.3650121573814215, + "learning_rate": 9.749942219451802e-05, + "loss": 3.2828, + "step": 9230 + }, + { + "epoch": 0.5730337078651685, + "grad_norm": 0.3240977716517327, + "learning_rate": 9.749829424557866e-05, + "loss": 3.3076, + "step": 9231 + }, + { + "epoch": 0.5730957849649264, + "grad_norm": 0.47798900041333986, + "learning_rate": 9.749716604882915e-05, + "loss": 3.3527, + "step": 9232 + }, + { + "epoch": 0.5731578620646843, + "grad_norm": 0.3230452614758937, + "learning_rate": 9.749603760427541e-05, + "loss": 3.2511, + "step": 9233 + }, + { + "epoch": 0.5732199391644422, + "grad_norm": 0.2646868069131868, + "learning_rate": 9.74949089119233e-05, + "loss": 3.227, + "step": 9234 + }, + { + "epoch": 0.5732820162642002, + "grad_norm": 0.2389168937911768, + "learning_rate": 9.749377997177873e-05, + "loss": 3.1471, + "step": 9235 + }, + { + "epoch": 0.5733440933639581, + "grad_norm": 0.38702809840348706, + "learning_rate": 9.749265078384757e-05, + "loss": 3.2488, + "step": 9236 + }, + { + "epoch": 0.5734061704637159, + "grad_norm": 0.26897437853467926, + "learning_rate": 9.749152134813573e-05, + "loss": 3.2798, + "step": 9237 + }, + { + "epoch": 0.5734682475634738, + "grad_norm": 0.3526394666735297, + "learning_rate": 9.749039166464911e-05, + "loss": 3.2042, + "step": 9238 + }, + { + "epoch": 0.5735303246632317, + "grad_norm": 0.331724159494891, + "learning_rate": 9.748926173339357e-05, + "loss": 3.274, + "step": 9239 + }, + { + "epoch": 0.5735924017629896, + "grad_norm": 0.3908571888360984, + "learning_rate": 9.748813155437502e-05, + "loss": 3.2894, + "step": 9240 + }, + { + "epoch": 0.5736544788627476, + "grad_norm": 0.2240229372985098, + "learning_rate": 9.748700112759935e-05, + "loss": 3.1207, + "step": 9241 + }, + { + "epoch": 0.5737165559625055, + "grad_norm": 0.24845708134094938, + "learning_rate": 9.748587045307249e-05, + "loss": 3.2327, + "step": 9242 + }, + { + "epoch": 0.5737786330622633, + "grad_norm": 0.20521282132238547, + "learning_rate": 9.74847395308003e-05, + "loss": 3.1912, + "step": 9243 + }, + { + "epoch": 0.5738407101620212, + "grad_norm": 0.2713518573982447, + "learning_rate": 9.748360836078871e-05, + "loss": 3.2503, + "step": 9244 + }, + { + "epoch": 0.5739027872617791, + "grad_norm": 0.2116280748653151, + "learning_rate": 9.74824769430436e-05, + "loss": 3.1834, + "step": 9245 + }, + { + "epoch": 0.573964864361537, + "grad_norm": 0.20475175409590354, + "learning_rate": 9.748134527757088e-05, + "loss": 3.1947, + "step": 9246 + }, + { + "epoch": 0.574026941461295, + "grad_norm": 0.21399320086313442, + "learning_rate": 9.748021336437645e-05, + "loss": 3.2704, + "step": 9247 + }, + { + "epoch": 0.5740890185610529, + "grad_norm": 0.20818373355418796, + "learning_rate": 9.747908120346623e-05, + "loss": 3.1817, + "step": 9248 + }, + { + "epoch": 0.5741510956608107, + "grad_norm": 0.2947695003753216, + "learning_rate": 9.747794879484612e-05, + "loss": 3.2523, + "step": 9249 + }, + { + "epoch": 0.5742131727605686, + "grad_norm": 0.19871325968995313, + "learning_rate": 9.747681613852203e-05, + "loss": 3.2442, + "step": 9250 + }, + { + "epoch": 0.5742752498603265, + "grad_norm": 0.17264886598634416, + "learning_rate": 9.747568323449984e-05, + "loss": 3.2278, + "step": 9251 + }, + { + "epoch": 0.5743373269600844, + "grad_norm": 0.22532246218441174, + "learning_rate": 9.74745500827855e-05, + "loss": 3.3595, + "step": 9252 + }, + { + "epoch": 0.5743994040598424, + "grad_norm": 0.2044815650957335, + "learning_rate": 9.747341668338492e-05, + "loss": 3.284, + "step": 9253 + }, + { + "epoch": 0.5744614811596003, + "grad_norm": 0.18294351154481403, + "learning_rate": 9.747228303630398e-05, + "loss": 3.2719, + "step": 9254 + }, + { + "epoch": 0.5745235582593581, + "grad_norm": 0.26046664076364723, + "learning_rate": 9.747114914154863e-05, + "loss": 3.1609, + "step": 9255 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.20814141831634494, + "learning_rate": 9.747001499912476e-05, + "loss": 3.1943, + "step": 9256 + }, + { + "epoch": 0.5746477124588739, + "grad_norm": 0.2308298127534802, + "learning_rate": 9.746888060903831e-05, + "loss": 3.2309, + "step": 9257 + }, + { + "epoch": 0.5747097895586318, + "grad_norm": 0.22078081468402902, + "learning_rate": 9.746774597129517e-05, + "loss": 3.1807, + "step": 9258 + }, + { + "epoch": 0.5747718666583898, + "grad_norm": 0.20311883770325054, + "learning_rate": 9.74666110859013e-05, + "loss": 3.2148, + "step": 9259 + }, + { + "epoch": 0.5748339437581477, + "grad_norm": 0.24776742617865435, + "learning_rate": 9.746547595286257e-05, + "loss": 3.324, + "step": 9260 + }, + { + "epoch": 0.5748960208579055, + "grad_norm": 0.20240200191043037, + "learning_rate": 9.746434057218494e-05, + "loss": 3.2658, + "step": 9261 + }, + { + "epoch": 0.5749580979576634, + "grad_norm": 0.20775392646212065, + "learning_rate": 9.746320494387433e-05, + "loss": 3.1769, + "step": 9262 + }, + { + "epoch": 0.5750201750574213, + "grad_norm": 0.18036801594254656, + "learning_rate": 9.746206906793664e-05, + "loss": 3.0407, + "step": 9263 + }, + { + "epoch": 0.5750822521571792, + "grad_norm": 0.27518885911531826, + "learning_rate": 9.746093294437783e-05, + "loss": 3.2351, + "step": 9264 + }, + { + "epoch": 0.5751443292569371, + "grad_norm": 0.18151801082937502, + "learning_rate": 9.74597965732038e-05, + "loss": 3.3184, + "step": 9265 + }, + { + "epoch": 0.5752064063566951, + "grad_norm": 0.23020922871382452, + "learning_rate": 9.745865995442051e-05, + "loss": 3.2362, + "step": 9266 + }, + { + "epoch": 0.5752684834564529, + "grad_norm": 0.2108553797204494, + "learning_rate": 9.745752308803387e-05, + "loss": 3.153, + "step": 9267 + }, + { + "epoch": 0.5753305605562108, + "grad_norm": 0.33335907161208034, + "learning_rate": 9.74563859740498e-05, + "loss": 3.2352, + "step": 9268 + }, + { + "epoch": 0.5753926376559687, + "grad_norm": 0.23609720723658606, + "learning_rate": 9.745524861247424e-05, + "loss": 3.2453, + "step": 9269 + }, + { + "epoch": 0.5754547147557266, + "grad_norm": 0.2404544411628859, + "learning_rate": 9.745411100331311e-05, + "loss": 3.2315, + "step": 9270 + }, + { + "epoch": 0.5755167918554845, + "grad_norm": 0.23189954808971677, + "learning_rate": 9.74529731465724e-05, + "loss": 3.1554, + "step": 9271 + }, + { + "epoch": 0.5755788689552425, + "grad_norm": 0.25633193194891324, + "learning_rate": 9.745183504225798e-05, + "loss": 3.1828, + "step": 9272 + }, + { + "epoch": 0.5756409460550003, + "grad_norm": 0.24565048260306532, + "learning_rate": 9.745069669037584e-05, + "loss": 3.218, + "step": 9273 + }, + { + "epoch": 0.5757030231547582, + "grad_norm": 0.2299297463109564, + "learning_rate": 9.744955809093187e-05, + "loss": 3.3545, + "step": 9274 + }, + { + "epoch": 0.5757651002545161, + "grad_norm": 0.24060142977569948, + "learning_rate": 9.744841924393207e-05, + "loss": 3.2545, + "step": 9275 + }, + { + "epoch": 0.575827177354274, + "grad_norm": 0.24226669585610372, + "learning_rate": 9.744728014938233e-05, + "loss": 3.2933, + "step": 9276 + }, + { + "epoch": 0.5758892544540319, + "grad_norm": 0.20361791921039984, + "learning_rate": 9.744614080728863e-05, + "loss": 3.2809, + "step": 9277 + }, + { + "epoch": 0.5759513315537899, + "grad_norm": 0.2602344813442557, + "learning_rate": 9.744500121765686e-05, + "loss": 3.3414, + "step": 9278 + }, + { + "epoch": 0.5760134086535477, + "grad_norm": 0.19888645395746243, + "learning_rate": 9.744386138049301e-05, + "loss": 3.2383, + "step": 9279 + }, + { + "epoch": 0.5760754857533056, + "grad_norm": 0.20094234371724873, + "learning_rate": 9.744272129580304e-05, + "loss": 3.2349, + "step": 9280 + }, + { + "epoch": 0.5761375628530635, + "grad_norm": 0.21630756042095875, + "learning_rate": 9.744158096359286e-05, + "loss": 3.1922, + "step": 9281 + }, + { + "epoch": 0.5761996399528214, + "grad_norm": 0.35952488220093015, + "learning_rate": 9.744044038386844e-05, + "loss": 3.2475, + "step": 9282 + }, + { + "epoch": 0.5762617170525793, + "grad_norm": 0.21010445413836165, + "learning_rate": 9.743929955663574e-05, + "loss": 3.1014, + "step": 9283 + }, + { + "epoch": 0.5763237941523373, + "grad_norm": 0.20411589905247532, + "learning_rate": 9.743815848190068e-05, + "loss": 3.2981, + "step": 9284 + }, + { + "epoch": 0.5763858712520951, + "grad_norm": 0.23290763709667273, + "learning_rate": 9.743701715966922e-05, + "loss": 3.2621, + "step": 9285 + }, + { + "epoch": 0.576447948351853, + "grad_norm": 0.24274349227446074, + "learning_rate": 9.743587558994734e-05, + "loss": 3.3006, + "step": 9286 + }, + { + "epoch": 0.5765100254516109, + "grad_norm": 0.333162020331408, + "learning_rate": 9.743473377274098e-05, + "loss": 3.2504, + "step": 9287 + }, + { + "epoch": 0.5765721025513688, + "grad_norm": 0.213622157503729, + "learning_rate": 9.74335917080561e-05, + "loss": 3.1648, + "step": 9288 + }, + { + "epoch": 0.5766341796511267, + "grad_norm": 0.24840818208170312, + "learning_rate": 9.743244939589867e-05, + "loss": 3.1965, + "step": 9289 + }, + { + "epoch": 0.5766962567508847, + "grad_norm": 0.35357781384841397, + "learning_rate": 9.743130683627462e-05, + "loss": 3.2109, + "step": 9290 + }, + { + "epoch": 0.5767583338506425, + "grad_norm": 0.20052828196938371, + "learning_rate": 9.743016402918992e-05, + "loss": 3.206, + "step": 9291 + }, + { + "epoch": 0.5768204109504004, + "grad_norm": 0.22368394691862156, + "learning_rate": 9.742902097465056e-05, + "loss": 3.1695, + "step": 9292 + }, + { + "epoch": 0.5768824880501583, + "grad_norm": 0.23852924043170468, + "learning_rate": 9.742787767266247e-05, + "loss": 3.2647, + "step": 9293 + }, + { + "epoch": 0.5769445651499162, + "grad_norm": 0.27367105046659074, + "learning_rate": 9.742673412323163e-05, + "loss": 3.2799, + "step": 9294 + }, + { + "epoch": 0.5770066422496741, + "grad_norm": 0.27469719504041157, + "learning_rate": 9.742559032636401e-05, + "loss": 3.1924, + "step": 9295 + }, + { + "epoch": 0.577068719349432, + "grad_norm": 0.268518438493752, + "learning_rate": 9.742444628206558e-05, + "loss": 3.2612, + "step": 9296 + }, + { + "epoch": 0.5771307964491899, + "grad_norm": 0.29522058378513993, + "learning_rate": 9.742330199034228e-05, + "loss": 3.2428, + "step": 9297 + }, + { + "epoch": 0.5771928735489478, + "grad_norm": 0.3492257234871313, + "learning_rate": 9.742215745120011e-05, + "loss": 3.1584, + "step": 9298 + }, + { + "epoch": 0.5772549506487057, + "grad_norm": 0.26259812609998295, + "learning_rate": 9.742101266464505e-05, + "loss": 3.1827, + "step": 9299 + }, + { + "epoch": 0.5773170277484636, + "grad_norm": 0.3333671510449698, + "learning_rate": 9.741986763068303e-05, + "loss": 3.1847, + "step": 9300 + }, + { + "epoch": 0.5773791048482215, + "grad_norm": 0.31960516263637806, + "learning_rate": 9.741872234932006e-05, + "loss": 3.1772, + "step": 9301 + }, + { + "epoch": 0.5774411819479794, + "grad_norm": 0.3592288715318786, + "learning_rate": 9.741757682056212e-05, + "loss": 3.3149, + "step": 9302 + }, + { + "epoch": 0.5775032590477372, + "grad_norm": 0.3814942570237827, + "learning_rate": 9.741643104441514e-05, + "loss": 3.2867, + "step": 9303 + }, + { + "epoch": 0.5775653361474952, + "grad_norm": 0.3094120042036781, + "learning_rate": 9.741528502088515e-05, + "loss": 3.3142, + "step": 9304 + }, + { + "epoch": 0.5776274132472531, + "grad_norm": 0.35986508248653476, + "learning_rate": 9.741413874997809e-05, + "loss": 3.3047, + "step": 9305 + }, + { + "epoch": 0.577689490347011, + "grad_norm": 0.320807992514465, + "learning_rate": 9.741299223169997e-05, + "loss": 3.3302, + "step": 9306 + }, + { + "epoch": 0.5777515674467689, + "grad_norm": 0.22915802834586907, + "learning_rate": 9.741184546605678e-05, + "loss": 3.2682, + "step": 9307 + }, + { + "epoch": 0.5778136445465268, + "grad_norm": 0.2258264489784407, + "learning_rate": 9.741069845305444e-05, + "loss": 3.221, + "step": 9308 + }, + { + "epoch": 0.5778757216462846, + "grad_norm": 0.20648066164820789, + "learning_rate": 9.740955119269902e-05, + "loss": 3.2499, + "step": 9309 + }, + { + "epoch": 0.5779377987460426, + "grad_norm": 0.27232814273552924, + "learning_rate": 9.740840368499641e-05, + "loss": 3.2037, + "step": 9310 + }, + { + "epoch": 0.5779998758458005, + "grad_norm": 0.3158974972537923, + "learning_rate": 9.74072559299527e-05, + "loss": 3.2454, + "step": 9311 + }, + { + "epoch": 0.5780619529455584, + "grad_norm": 0.20542777998293066, + "learning_rate": 9.74061079275738e-05, + "loss": 3.1333, + "step": 9312 + }, + { + "epoch": 0.5781240300453163, + "grad_norm": 0.21749040146727458, + "learning_rate": 9.740495967786573e-05, + "loss": 3.2104, + "step": 9313 + }, + { + "epoch": 0.5781861071450742, + "grad_norm": 0.27235221744577987, + "learning_rate": 9.740381118083449e-05, + "loss": 3.3219, + "step": 9314 + }, + { + "epoch": 0.578248184244832, + "grad_norm": 0.20190469356622437, + "learning_rate": 9.740266243648604e-05, + "loss": 3.2405, + "step": 9315 + }, + { + "epoch": 0.57831026134459, + "grad_norm": 0.2129163201600507, + "learning_rate": 9.740151344482641e-05, + "loss": 3.248, + "step": 9316 + }, + { + "epoch": 0.5783723384443479, + "grad_norm": 0.28672206471369516, + "learning_rate": 9.740036420586156e-05, + "loss": 3.2533, + "step": 9317 + }, + { + "epoch": 0.5784344155441058, + "grad_norm": 0.23971331308171973, + "learning_rate": 9.739921471959752e-05, + "loss": 3.2671, + "step": 9318 + }, + { + "epoch": 0.5784964926438637, + "grad_norm": 0.20420809069296508, + "learning_rate": 9.739806498604026e-05, + "loss": 3.179, + "step": 9319 + }, + { + "epoch": 0.5785585697436216, + "grad_norm": 0.2563657045822813, + "learning_rate": 9.739691500519579e-05, + "loss": 3.2264, + "step": 9320 + }, + { + "epoch": 0.5786206468433794, + "grad_norm": 0.261705625973635, + "learning_rate": 9.739576477707011e-05, + "loss": 3.3004, + "step": 9321 + }, + { + "epoch": 0.5786827239431374, + "grad_norm": 0.206446189842417, + "learning_rate": 9.739461430166923e-05, + "loss": 3.2239, + "step": 9322 + }, + { + "epoch": 0.5787448010428953, + "grad_norm": 0.35286051289180215, + "learning_rate": 9.739346357899912e-05, + "loss": 3.2376, + "step": 9323 + }, + { + "epoch": 0.5788068781426532, + "grad_norm": 0.21655929710309607, + "learning_rate": 9.739231260906582e-05, + "loss": 3.1877, + "step": 9324 + }, + { + "epoch": 0.5788689552424111, + "grad_norm": 0.19399102431971693, + "learning_rate": 9.739116139187532e-05, + "loss": 3.3431, + "step": 9325 + }, + { + "epoch": 0.578931032342169, + "grad_norm": 0.24522841919967175, + "learning_rate": 9.739000992743363e-05, + "loss": 3.1937, + "step": 9326 + }, + { + "epoch": 0.5789931094419268, + "grad_norm": 0.23051243707426283, + "learning_rate": 9.738885821574674e-05, + "loss": 3.1995, + "step": 9327 + }, + { + "epoch": 0.5790551865416848, + "grad_norm": 0.22865594321617827, + "learning_rate": 9.738770625682068e-05, + "loss": 3.2511, + "step": 9328 + }, + { + "epoch": 0.5791172636414427, + "grad_norm": 0.19937130752719734, + "learning_rate": 9.738655405066146e-05, + "loss": 3.0953, + "step": 9329 + }, + { + "epoch": 0.5791793407412006, + "grad_norm": 0.24395510682112378, + "learning_rate": 9.738540159727508e-05, + "loss": 3.2877, + "step": 9330 + }, + { + "epoch": 0.5792414178409585, + "grad_norm": 0.18415830939161115, + "learning_rate": 9.738424889666755e-05, + "loss": 3.0601, + "step": 9331 + }, + { + "epoch": 0.5793034949407164, + "grad_norm": 0.23111626617894812, + "learning_rate": 9.73830959488449e-05, + "loss": 3.1566, + "step": 9332 + }, + { + "epoch": 0.5793655720404742, + "grad_norm": 0.21349515737953426, + "learning_rate": 9.738194275381312e-05, + "loss": 3.1935, + "step": 9333 + }, + { + "epoch": 0.5794276491402321, + "grad_norm": 0.20732727120407884, + "learning_rate": 9.738078931157826e-05, + "loss": 3.2946, + "step": 9334 + }, + { + "epoch": 0.5794897262399901, + "grad_norm": 0.21265769604843637, + "learning_rate": 9.73796356221463e-05, + "loss": 3.2469, + "step": 9335 + }, + { + "epoch": 0.579551803339748, + "grad_norm": 0.20171733165747605, + "learning_rate": 9.73784816855233e-05, + "loss": 3.215, + "step": 9336 + }, + { + "epoch": 0.5796138804395059, + "grad_norm": 0.20218574253421007, + "learning_rate": 9.737732750171524e-05, + "loss": 3.205, + "step": 9337 + }, + { + "epoch": 0.5796759575392638, + "grad_norm": 0.17669597618962374, + "learning_rate": 9.737617307072817e-05, + "loss": 3.2158, + "step": 9338 + }, + { + "epoch": 0.5797380346390216, + "grad_norm": 0.1868802497438959, + "learning_rate": 9.737501839256812e-05, + "loss": 3.2357, + "step": 9339 + }, + { + "epoch": 0.5798001117387795, + "grad_norm": 0.17920214991030248, + "learning_rate": 9.737386346724109e-05, + "loss": 3.2329, + "step": 9340 + }, + { + "epoch": 0.5798621888385375, + "grad_norm": 0.25711383942049276, + "learning_rate": 9.73727082947531e-05, + "loss": 3.1803, + "step": 9341 + }, + { + "epoch": 0.5799242659382954, + "grad_norm": 0.1815175080156966, + "learning_rate": 9.73715528751102e-05, + "loss": 3.1674, + "step": 9342 + }, + { + "epoch": 0.5799863430380533, + "grad_norm": 0.22832052754570695, + "learning_rate": 9.73703972083184e-05, + "loss": 3.271, + "step": 9343 + }, + { + "epoch": 0.5800484201378112, + "grad_norm": 0.20925188323566968, + "learning_rate": 9.736924129438375e-05, + "loss": 3.2467, + "step": 9344 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.3097855334240242, + "learning_rate": 9.736808513331227e-05, + "loss": 3.2185, + "step": 9345 + }, + { + "epoch": 0.5801725743373269, + "grad_norm": 0.24090038029700872, + "learning_rate": 9.736692872510999e-05, + "loss": 3.2308, + "step": 9346 + }, + { + "epoch": 0.5802346514370849, + "grad_norm": 0.19692747953109885, + "learning_rate": 9.736577206978293e-05, + "loss": 3.1988, + "step": 9347 + }, + { + "epoch": 0.5802967285368428, + "grad_norm": 0.18940930007105083, + "learning_rate": 9.736461516733715e-05, + "loss": 3.1709, + "step": 9348 + }, + { + "epoch": 0.5803588056366007, + "grad_norm": 0.4224911358428255, + "learning_rate": 9.736345801777867e-05, + "loss": 3.2226, + "step": 9349 + }, + { + "epoch": 0.5804208827363586, + "grad_norm": 0.23268640514908243, + "learning_rate": 9.736230062111353e-05, + "loss": 3.2848, + "step": 9350 + }, + { + "epoch": 0.5804829598361164, + "grad_norm": 0.2242597429504391, + "learning_rate": 9.736114297734778e-05, + "loss": 3.2218, + "step": 9351 + }, + { + "epoch": 0.5805450369358743, + "grad_norm": 0.33575882706787435, + "learning_rate": 9.735998508648745e-05, + "loss": 3.179, + "step": 9352 + }, + { + "epoch": 0.5806071140356323, + "grad_norm": 0.2696122863134818, + "learning_rate": 9.735882694853858e-05, + "loss": 3.2966, + "step": 9353 + }, + { + "epoch": 0.5806691911353902, + "grad_norm": 0.33641403244166185, + "learning_rate": 9.73576685635072e-05, + "loss": 3.2851, + "step": 9354 + }, + { + "epoch": 0.5807312682351481, + "grad_norm": 0.3481835855672421, + "learning_rate": 9.735650993139937e-05, + "loss": 3.3266, + "step": 9355 + }, + { + "epoch": 0.580793345334906, + "grad_norm": 0.19422601821262783, + "learning_rate": 9.735535105222114e-05, + "loss": 3.212, + "step": 9356 + }, + { + "epoch": 0.5808554224346638, + "grad_norm": 0.36211676235132456, + "learning_rate": 9.735419192597854e-05, + "loss": 3.2287, + "step": 9357 + }, + { + "epoch": 0.5809174995344217, + "grad_norm": 0.2693722020687649, + "learning_rate": 9.735303255267765e-05, + "loss": 3.2449, + "step": 9358 + }, + { + "epoch": 0.5809795766341797, + "grad_norm": 0.2665102316670797, + "learning_rate": 9.735187293232446e-05, + "loss": 3.255, + "step": 9359 + }, + { + "epoch": 0.5810416537339376, + "grad_norm": 0.20557930876409972, + "learning_rate": 9.735071306492507e-05, + "loss": 3.251, + "step": 9360 + }, + { + "epoch": 0.5811037308336955, + "grad_norm": 0.3648465999843204, + "learning_rate": 9.734955295048551e-05, + "loss": 3.2897, + "step": 9361 + }, + { + "epoch": 0.5811658079334534, + "grad_norm": 0.3414047826041455, + "learning_rate": 9.734839258901185e-05, + "loss": 3.1866, + "step": 9362 + }, + { + "epoch": 0.5812278850332112, + "grad_norm": 0.2636322556858736, + "learning_rate": 9.734723198051011e-05, + "loss": 3.2856, + "step": 9363 + }, + { + "epoch": 0.5812899621329691, + "grad_norm": 0.29007935163884285, + "learning_rate": 9.734607112498638e-05, + "loss": 3.2389, + "step": 9364 + }, + { + "epoch": 0.581352039232727, + "grad_norm": 0.23367766113922883, + "learning_rate": 9.73449100224467e-05, + "loss": 3.3196, + "step": 9365 + }, + { + "epoch": 0.581414116332485, + "grad_norm": 0.2650410600782327, + "learning_rate": 9.734374867289713e-05, + "loss": 3.3094, + "step": 9366 + }, + { + "epoch": 0.5814761934322429, + "grad_norm": 0.2679447252275076, + "learning_rate": 9.734258707634373e-05, + "loss": 3.1955, + "step": 9367 + }, + { + "epoch": 0.5815382705320007, + "grad_norm": 0.24959354589235905, + "learning_rate": 9.734142523279257e-05, + "loss": 3.2666, + "step": 9368 + }, + { + "epoch": 0.5816003476317586, + "grad_norm": 0.23891706530763143, + "learning_rate": 9.73402631422497e-05, + "loss": 3.2126, + "step": 9369 + }, + { + "epoch": 0.5816624247315165, + "grad_norm": 0.3083257356343378, + "learning_rate": 9.733910080472118e-05, + "loss": 3.3079, + "step": 9370 + }, + { + "epoch": 0.5817245018312744, + "grad_norm": 0.28870205702070323, + "learning_rate": 9.733793822021307e-05, + "loss": 3.225, + "step": 9371 + }, + { + "epoch": 0.5817865789310324, + "grad_norm": 0.2632323663987809, + "learning_rate": 9.733677538873146e-05, + "loss": 3.2203, + "step": 9372 + }, + { + "epoch": 0.5818486560307903, + "grad_norm": 0.28479218431066217, + "learning_rate": 9.73356123102824e-05, + "loss": 3.3458, + "step": 9373 + }, + { + "epoch": 0.5819107331305481, + "grad_norm": 0.2808098660525309, + "learning_rate": 9.733444898487195e-05, + "loss": 3.2914, + "step": 9374 + }, + { + "epoch": 0.581972810230306, + "grad_norm": 0.35342724364465006, + "learning_rate": 9.73332854125062e-05, + "loss": 3.1957, + "step": 9375 + }, + { + "epoch": 0.5820348873300639, + "grad_norm": 0.31063167445881285, + "learning_rate": 9.73321215931912e-05, + "loss": 3.2103, + "step": 9376 + }, + { + "epoch": 0.5820969644298218, + "grad_norm": 0.27207388783095876, + "learning_rate": 9.733095752693303e-05, + "loss": 3.2073, + "step": 9377 + }, + { + "epoch": 0.5821590415295798, + "grad_norm": 0.21065342282632094, + "learning_rate": 9.732979321373776e-05, + "loss": 3.2622, + "step": 9378 + }, + { + "epoch": 0.5822211186293377, + "grad_norm": 0.24854920802835306, + "learning_rate": 9.73286286536115e-05, + "loss": 3.1758, + "step": 9379 + }, + { + "epoch": 0.5822831957290955, + "grad_norm": 0.27502258543988867, + "learning_rate": 9.732746384656027e-05, + "loss": 3.2626, + "step": 9380 + }, + { + "epoch": 0.5823452728288534, + "grad_norm": 0.24071415234260252, + "learning_rate": 9.732629879259017e-05, + "loss": 3.262, + "step": 9381 + }, + { + "epoch": 0.5824073499286113, + "grad_norm": 0.24440067174711094, + "learning_rate": 9.732513349170729e-05, + "loss": 3.2163, + "step": 9382 + }, + { + "epoch": 0.5824694270283692, + "grad_norm": 0.3295470598497126, + "learning_rate": 9.732396794391769e-05, + "loss": 3.2184, + "step": 9383 + }, + { + "epoch": 0.5825315041281272, + "grad_norm": 0.5178328265273691, + "learning_rate": 9.732280214922747e-05, + "loss": 3.2418, + "step": 9384 + }, + { + "epoch": 0.5825935812278851, + "grad_norm": 0.34424056464986813, + "learning_rate": 9.732163610764269e-05, + "loss": 3.1333, + "step": 9385 + }, + { + "epoch": 0.5826556583276429, + "grad_norm": 0.24510614590643476, + "learning_rate": 9.732046981916946e-05, + "loss": 3.2193, + "step": 9386 + }, + { + "epoch": 0.5827177354274008, + "grad_norm": 0.31012817563234224, + "learning_rate": 9.731930328381384e-05, + "loss": 3.2403, + "step": 9387 + }, + { + "epoch": 0.5827798125271587, + "grad_norm": 0.37632113433800735, + "learning_rate": 9.731813650158195e-05, + "loss": 3.2811, + "step": 9388 + }, + { + "epoch": 0.5828418896269166, + "grad_norm": 0.43995221089803516, + "learning_rate": 9.731696947247984e-05, + "loss": 3.1768, + "step": 9389 + }, + { + "epoch": 0.5829039667266745, + "grad_norm": 0.295174055092547, + "learning_rate": 9.73158021965136e-05, + "loss": 3.2681, + "step": 9390 + }, + { + "epoch": 0.5829660438264325, + "grad_norm": 0.2569927708540461, + "learning_rate": 9.731463467368935e-05, + "loss": 3.2379, + "step": 9391 + }, + { + "epoch": 0.5830281209261903, + "grad_norm": 0.24025545570488538, + "learning_rate": 9.731346690401317e-05, + "loss": 3.2144, + "step": 9392 + }, + { + "epoch": 0.5830901980259482, + "grad_norm": 0.22739665441645873, + "learning_rate": 9.731229888749114e-05, + "loss": 3.2996, + "step": 9393 + }, + { + "epoch": 0.5831522751257061, + "grad_norm": 0.21401602212641083, + "learning_rate": 9.731113062412936e-05, + "loss": 3.2474, + "step": 9394 + }, + { + "epoch": 0.583214352225464, + "grad_norm": 0.22242967052619764, + "learning_rate": 9.730996211393393e-05, + "loss": 3.2879, + "step": 9395 + }, + { + "epoch": 0.583276429325222, + "grad_norm": 0.22288591686775694, + "learning_rate": 9.730879335691094e-05, + "loss": 3.2021, + "step": 9396 + }, + { + "epoch": 0.5833385064249799, + "grad_norm": 0.25948892642155186, + "learning_rate": 9.730762435306649e-05, + "loss": 3.2258, + "step": 9397 + }, + { + "epoch": 0.5834005835247377, + "grad_norm": 0.2536161673010364, + "learning_rate": 9.730645510240668e-05, + "loss": 3.3037, + "step": 9398 + }, + { + "epoch": 0.5834626606244956, + "grad_norm": 0.2734717347648383, + "learning_rate": 9.730528560493761e-05, + "loss": 3.2993, + "step": 9399 + }, + { + "epoch": 0.5835247377242535, + "grad_norm": 0.18864017757368082, + "learning_rate": 9.730411586066538e-05, + "loss": 3.1941, + "step": 9400 + }, + { + "epoch": 0.5835868148240114, + "grad_norm": 0.23953803643195073, + "learning_rate": 9.730294586959609e-05, + "loss": 3.2272, + "step": 9401 + }, + { + "epoch": 0.5836488919237693, + "grad_norm": 0.1915440647128509, + "learning_rate": 9.730177563173586e-05, + "loss": 3.1925, + "step": 9402 + }, + { + "epoch": 0.5837109690235273, + "grad_norm": 0.22994594760338283, + "learning_rate": 9.730060514709077e-05, + "loss": 3.1962, + "step": 9403 + }, + { + "epoch": 0.5837730461232851, + "grad_norm": 0.3469504897136286, + "learning_rate": 9.729943441566696e-05, + "loss": 3.2804, + "step": 9404 + }, + { + "epoch": 0.583835123223043, + "grad_norm": 0.27761044909660804, + "learning_rate": 9.729826343747049e-05, + "loss": 3.1717, + "step": 9405 + }, + { + "epoch": 0.5838972003228009, + "grad_norm": 0.21192443400374128, + "learning_rate": 9.729709221250751e-05, + "loss": 3.2882, + "step": 9406 + }, + { + "epoch": 0.5839592774225588, + "grad_norm": 0.24869383896818462, + "learning_rate": 9.729592074078414e-05, + "loss": 3.166, + "step": 9407 + }, + { + "epoch": 0.5840213545223167, + "grad_norm": 0.2448710843708357, + "learning_rate": 9.729474902230644e-05, + "loss": 3.1835, + "step": 9408 + }, + { + "epoch": 0.5840834316220747, + "grad_norm": 0.18935144783145924, + "learning_rate": 9.729357705708056e-05, + "loss": 3.1664, + "step": 9409 + }, + { + "epoch": 0.5841455087218325, + "grad_norm": 0.5815719481965959, + "learning_rate": 9.729240484511262e-05, + "loss": 3.1251, + "step": 9410 + }, + { + "epoch": 0.5842075858215904, + "grad_norm": 0.3140844233058668, + "learning_rate": 9.729123238640871e-05, + "loss": 3.2311, + "step": 9411 + }, + { + "epoch": 0.5842696629213483, + "grad_norm": 0.3787961780296371, + "learning_rate": 9.729005968097498e-05, + "loss": 3.3177, + "step": 9412 + }, + { + "epoch": 0.5843317400211062, + "grad_norm": 0.32315819562402465, + "learning_rate": 9.728888672881751e-05, + "loss": 3.2883, + "step": 9413 + }, + { + "epoch": 0.5843938171208641, + "grad_norm": 0.24430880285360576, + "learning_rate": 9.728771352994244e-05, + "loss": 3.2128, + "step": 9414 + }, + { + "epoch": 0.584455894220622, + "grad_norm": 0.46529249761871655, + "learning_rate": 9.728654008435589e-05, + "loss": 3.2915, + "step": 9415 + }, + { + "epoch": 0.5845179713203799, + "grad_norm": 0.2224509145674165, + "learning_rate": 9.728536639206399e-05, + "loss": 3.2015, + "step": 9416 + }, + { + "epoch": 0.5845800484201378, + "grad_norm": 0.3034847680783443, + "learning_rate": 9.728419245307283e-05, + "loss": 3.2098, + "step": 9417 + }, + { + "epoch": 0.5846421255198957, + "grad_norm": 0.24833900790537025, + "learning_rate": 9.728301826738858e-05, + "loss": 3.2244, + "step": 9418 + }, + { + "epoch": 0.5847042026196536, + "grad_norm": 0.23603312941704108, + "learning_rate": 9.728184383501736e-05, + "loss": 3.1979, + "step": 9419 + }, + { + "epoch": 0.5847662797194115, + "grad_norm": 0.3279707182615701, + "learning_rate": 9.728066915596525e-05, + "loss": 3.1807, + "step": 9420 + }, + { + "epoch": 0.5848283568191694, + "grad_norm": 0.3923655071781215, + "learning_rate": 9.727949423023843e-05, + "loss": 3.2468, + "step": 9421 + }, + { + "epoch": 0.5848904339189273, + "grad_norm": 0.29297506961364334, + "learning_rate": 9.7278319057843e-05, + "loss": 3.263, + "step": 9422 + }, + { + "epoch": 0.5849525110186852, + "grad_norm": 0.19403085771614093, + "learning_rate": 9.727714363878511e-05, + "loss": 3.2042, + "step": 9423 + }, + { + "epoch": 0.5850145881184431, + "grad_norm": 0.26515246883701793, + "learning_rate": 9.727596797307089e-05, + "loss": 3.2421, + "step": 9424 + }, + { + "epoch": 0.585076665218201, + "grad_norm": 0.25956579280503445, + "learning_rate": 9.727479206070646e-05, + "loss": 3.1851, + "step": 9425 + }, + { + "epoch": 0.5851387423179589, + "grad_norm": 0.21989606465306863, + "learning_rate": 9.727361590169797e-05, + "loss": 3.1918, + "step": 9426 + }, + { + "epoch": 0.5852008194177168, + "grad_norm": 0.3697131122736103, + "learning_rate": 9.727243949605153e-05, + "loss": 3.2243, + "step": 9427 + }, + { + "epoch": 0.5852628965174747, + "grad_norm": 0.2755731153850936, + "learning_rate": 9.727126284377333e-05, + "loss": 3.2305, + "step": 9428 + }, + { + "epoch": 0.5853249736172326, + "grad_norm": 0.1966933930365461, + "learning_rate": 9.727008594486945e-05, + "loss": 3.1155, + "step": 9429 + }, + { + "epoch": 0.5853870507169905, + "grad_norm": 0.2027112800659056, + "learning_rate": 9.726890879934607e-05, + "loss": 3.0717, + "step": 9430 + }, + { + "epoch": 0.5854491278167484, + "grad_norm": 0.19944919086226665, + "learning_rate": 9.726773140720931e-05, + "loss": 3.2618, + "step": 9431 + }, + { + "epoch": 0.5855112049165063, + "grad_norm": 0.2444317340970595, + "learning_rate": 9.726655376846531e-05, + "loss": 3.2605, + "step": 9432 + }, + { + "epoch": 0.5855732820162642, + "grad_norm": 0.26928492361030704, + "learning_rate": 9.726537588312025e-05, + "loss": 3.2202, + "step": 9433 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.23476508580049107, + "learning_rate": 9.726419775118024e-05, + "loss": 3.1647, + "step": 9434 + }, + { + "epoch": 0.58569743621578, + "grad_norm": 0.2995101401303297, + "learning_rate": 9.726301937265143e-05, + "loss": 3.2332, + "step": 9435 + }, + { + "epoch": 0.5857595133155379, + "grad_norm": 0.2947504037728104, + "learning_rate": 9.726184074753999e-05, + "loss": 3.2554, + "step": 9436 + }, + { + "epoch": 0.5858215904152958, + "grad_norm": 0.26546848117484706, + "learning_rate": 9.726066187585203e-05, + "loss": 3.2096, + "step": 9437 + }, + { + "epoch": 0.5858836675150537, + "grad_norm": 0.3016652949949205, + "learning_rate": 9.725948275759375e-05, + "loss": 3.2591, + "step": 9438 + }, + { + "epoch": 0.5859457446148116, + "grad_norm": 0.2199001050781443, + "learning_rate": 9.725830339277127e-05, + "loss": 3.1981, + "step": 9439 + }, + { + "epoch": 0.5860078217145694, + "grad_norm": 0.2649730507400053, + "learning_rate": 9.725712378139073e-05, + "loss": 3.1852, + "step": 9440 + }, + { + "epoch": 0.5860698988143274, + "grad_norm": 0.24545529971554733, + "learning_rate": 9.725594392345832e-05, + "loss": 3.2957, + "step": 9441 + }, + { + "epoch": 0.5861319759140853, + "grad_norm": 0.39306799846824253, + "learning_rate": 9.725476381898018e-05, + "loss": 3.3018, + "step": 9442 + }, + { + "epoch": 0.5861940530138432, + "grad_norm": 0.30627694400671407, + "learning_rate": 9.725358346796245e-05, + "loss": 3.2667, + "step": 9443 + }, + { + "epoch": 0.5862561301136011, + "grad_norm": 0.272869460133976, + "learning_rate": 9.72524028704113e-05, + "loss": 3.2732, + "step": 9444 + }, + { + "epoch": 0.586318207213359, + "grad_norm": 0.2768645377582773, + "learning_rate": 9.725122202633293e-05, + "loss": 3.1841, + "step": 9445 + }, + { + "epoch": 0.5863802843131168, + "grad_norm": 0.2823283925864564, + "learning_rate": 9.725004093573342e-05, + "loss": 3.2404, + "step": 9446 + }, + { + "epoch": 0.5864423614128748, + "grad_norm": 0.22679128276620283, + "learning_rate": 9.7248859598619e-05, + "loss": 3.2074, + "step": 9447 + }, + { + "epoch": 0.5865044385126327, + "grad_norm": 0.2196482116030502, + "learning_rate": 9.724767801499582e-05, + "loss": 3.1886, + "step": 9448 + }, + { + "epoch": 0.5865665156123906, + "grad_norm": 0.22886412691907662, + "learning_rate": 9.724649618487e-05, + "loss": 3.248, + "step": 9449 + }, + { + "epoch": 0.5866285927121485, + "grad_norm": 0.24690161664144963, + "learning_rate": 9.724531410824778e-05, + "loss": 3.2136, + "step": 9450 + }, + { + "epoch": 0.5866906698119064, + "grad_norm": 0.24728348925090177, + "learning_rate": 9.724413178513526e-05, + "loss": 3.2969, + "step": 9451 + }, + { + "epoch": 0.5867527469116642, + "grad_norm": 0.22090523673065443, + "learning_rate": 9.724294921553865e-05, + "loss": 3.2597, + "step": 9452 + }, + { + "epoch": 0.5868148240114222, + "grad_norm": 0.3191532230575422, + "learning_rate": 9.72417663994641e-05, + "loss": 3.1508, + "step": 9453 + }, + { + "epoch": 0.5868769011111801, + "grad_norm": 0.22253919335271555, + "learning_rate": 9.724058333691776e-05, + "loss": 3.1939, + "step": 9454 + }, + { + "epoch": 0.586938978210938, + "grad_norm": 0.24780074997281185, + "learning_rate": 9.723940002790587e-05, + "loss": 3.1968, + "step": 9455 + }, + { + "epoch": 0.5870010553106959, + "grad_norm": 0.2058164113794135, + "learning_rate": 9.723821647243453e-05, + "loss": 3.2604, + "step": 9456 + }, + { + "epoch": 0.5870631324104538, + "grad_norm": 0.25909957970099623, + "learning_rate": 9.723703267050998e-05, + "loss": 3.205, + "step": 9457 + }, + { + "epoch": 0.5871252095102116, + "grad_norm": 0.20300403073964784, + "learning_rate": 9.723584862213835e-05, + "loss": 3.1709, + "step": 9458 + }, + { + "epoch": 0.5871872866099695, + "grad_norm": 0.2312504555723497, + "learning_rate": 9.723466432732581e-05, + "loss": 3.1316, + "step": 9459 + }, + { + "epoch": 0.5872493637097275, + "grad_norm": 0.23981674458562793, + "learning_rate": 9.723347978607858e-05, + "loss": 3.1529, + "step": 9460 + }, + { + "epoch": 0.5873114408094854, + "grad_norm": 0.2971804828998689, + "learning_rate": 9.723229499840283e-05, + "loss": 3.2386, + "step": 9461 + }, + { + "epoch": 0.5873735179092433, + "grad_norm": 0.342261407439145, + "learning_rate": 9.723110996430472e-05, + "loss": 3.2511, + "step": 9462 + }, + { + "epoch": 0.5874355950090012, + "grad_norm": 0.23798124782733468, + "learning_rate": 9.722992468379043e-05, + "loss": 3.2135, + "step": 9463 + }, + { + "epoch": 0.587497672108759, + "grad_norm": 0.4614740094205294, + "learning_rate": 9.722873915686616e-05, + "loss": 3.1158, + "step": 9464 + }, + { + "epoch": 0.587559749208517, + "grad_norm": 0.25472993029208035, + "learning_rate": 9.72275533835381e-05, + "loss": 3.2004, + "step": 9465 + }, + { + "epoch": 0.5876218263082749, + "grad_norm": 0.2151864353245554, + "learning_rate": 9.722636736381243e-05, + "loss": 3.1508, + "step": 9466 + }, + { + "epoch": 0.5876839034080328, + "grad_norm": 0.2814850619622963, + "learning_rate": 9.722518109769534e-05, + "loss": 3.268, + "step": 9467 + }, + { + "epoch": 0.5877459805077907, + "grad_norm": 0.26402672801850596, + "learning_rate": 9.722399458519303e-05, + "loss": 3.1538, + "step": 9468 + }, + { + "epoch": 0.5878080576075486, + "grad_norm": 0.2542480222593222, + "learning_rate": 9.722280782631165e-05, + "loss": 3.234, + "step": 9469 + }, + { + "epoch": 0.5878701347073064, + "grad_norm": 0.23166293765286938, + "learning_rate": 9.722162082105743e-05, + "loss": 3.1946, + "step": 9470 + }, + { + "epoch": 0.5879322118070643, + "grad_norm": 0.24669738186570744, + "learning_rate": 9.722043356943653e-05, + "loss": 3.2494, + "step": 9471 + }, + { + "epoch": 0.5879942889068223, + "grad_norm": 0.3321518286377464, + "learning_rate": 9.72192460714552e-05, + "loss": 3.2053, + "step": 9472 + }, + { + "epoch": 0.5880563660065802, + "grad_norm": 0.22590228601720022, + "learning_rate": 9.721805832711957e-05, + "loss": 3.2027, + "step": 9473 + }, + { + "epoch": 0.5881184431063381, + "grad_norm": 0.2678216814227761, + "learning_rate": 9.721687033643588e-05, + "loss": 3.1962, + "step": 9474 + }, + { + "epoch": 0.588180520206096, + "grad_norm": 0.1914622653039255, + "learning_rate": 9.721568209941032e-05, + "loss": 3.1688, + "step": 9475 + }, + { + "epoch": 0.5882425973058538, + "grad_norm": 0.22971959311556697, + "learning_rate": 9.721449361604909e-05, + "loss": 3.2166, + "step": 9476 + }, + { + "epoch": 0.5883046744056117, + "grad_norm": 0.21172473730088326, + "learning_rate": 9.721330488635835e-05, + "loss": 3.2688, + "step": 9477 + }, + { + "epoch": 0.5883667515053697, + "grad_norm": 0.21379665622645624, + "learning_rate": 9.721211591034438e-05, + "loss": 3.2366, + "step": 9478 + }, + { + "epoch": 0.5884288286051276, + "grad_norm": 0.20054079344518586, + "learning_rate": 9.721092668801331e-05, + "loss": 3.2127, + "step": 9479 + }, + { + "epoch": 0.5884909057048855, + "grad_norm": 0.18168474502430002, + "learning_rate": 9.720973721937139e-05, + "loss": 3.1349, + "step": 9480 + }, + { + "epoch": 0.5885529828046434, + "grad_norm": 0.16865828834398924, + "learning_rate": 9.720854750442481e-05, + "loss": 3.205, + "step": 9481 + }, + { + "epoch": 0.5886150599044012, + "grad_norm": 0.20782713587569784, + "learning_rate": 9.720735754317978e-05, + "loss": 3.2809, + "step": 9482 + }, + { + "epoch": 0.5886771370041591, + "grad_norm": 0.2834520101355515, + "learning_rate": 9.72061673356425e-05, + "loss": 3.2205, + "step": 9483 + }, + { + "epoch": 0.588739214103917, + "grad_norm": 0.2415239797009523, + "learning_rate": 9.720497688181917e-05, + "loss": 3.2872, + "step": 9484 + }, + { + "epoch": 0.588801291203675, + "grad_norm": 0.25478918175461623, + "learning_rate": 9.720378618171604e-05, + "loss": 3.2727, + "step": 9485 + }, + { + "epoch": 0.5888633683034329, + "grad_norm": 0.2549153449683226, + "learning_rate": 9.720259523533928e-05, + "loss": 3.129, + "step": 9486 + }, + { + "epoch": 0.5889254454031908, + "grad_norm": 0.19708554385768898, + "learning_rate": 9.720140404269513e-05, + "loss": 3.2291, + "step": 9487 + }, + { + "epoch": 0.5889875225029486, + "grad_norm": 0.23134586757034467, + "learning_rate": 9.72002126037898e-05, + "loss": 3.118, + "step": 9488 + }, + { + "epoch": 0.5890495996027065, + "grad_norm": 0.24175423660629272, + "learning_rate": 9.719902091862952e-05, + "loss": 3.0899, + "step": 9489 + }, + { + "epoch": 0.5891116767024644, + "grad_norm": 0.21081062315832874, + "learning_rate": 9.719782898722045e-05, + "loss": 3.1381, + "step": 9490 + }, + { + "epoch": 0.5891737538022224, + "grad_norm": 0.20503561396722966, + "learning_rate": 9.719663680956887e-05, + "loss": 3.1799, + "step": 9491 + }, + { + "epoch": 0.5892358309019803, + "grad_norm": 0.2446932451077953, + "learning_rate": 9.719544438568098e-05, + "loss": 3.1841, + "step": 9492 + }, + { + "epoch": 0.5892979080017382, + "grad_norm": 0.23977686573824422, + "learning_rate": 9.7194251715563e-05, + "loss": 3.1722, + "step": 9493 + }, + { + "epoch": 0.589359985101496, + "grad_norm": 0.20046929010748113, + "learning_rate": 9.719305879922115e-05, + "loss": 3.3635, + "step": 9494 + }, + { + "epoch": 0.5894220622012539, + "grad_norm": 0.271492964793923, + "learning_rate": 9.719186563666165e-05, + "loss": 3.3029, + "step": 9495 + }, + { + "epoch": 0.5894841393010118, + "grad_norm": 0.29654713143662664, + "learning_rate": 9.719067222789074e-05, + "loss": 3.1086, + "step": 9496 + }, + { + "epoch": 0.5895462164007698, + "grad_norm": 0.2315915205244281, + "learning_rate": 9.718947857291462e-05, + "loss": 3.1049, + "step": 9497 + }, + { + "epoch": 0.5896082935005277, + "grad_norm": 0.2305798989936203, + "learning_rate": 9.718828467173956e-05, + "loss": 3.2557, + "step": 9498 + }, + { + "epoch": 0.5896703706002856, + "grad_norm": 0.2082647786992923, + "learning_rate": 9.718709052437175e-05, + "loss": 3.301, + "step": 9499 + }, + { + "epoch": 0.5897324477000434, + "grad_norm": 0.18663094702658653, + "learning_rate": 9.718589613081743e-05, + "loss": 3.1151, + "step": 9500 + }, + { + "epoch": 0.5897945247998013, + "grad_norm": 0.286655027600049, + "learning_rate": 9.718470149108286e-05, + "loss": 3.2235, + "step": 9501 + }, + { + "epoch": 0.5898566018995592, + "grad_norm": 0.24557513131160774, + "learning_rate": 9.718350660517421e-05, + "loss": 3.1625, + "step": 9502 + }, + { + "epoch": 0.5899186789993172, + "grad_norm": 0.23933769760866813, + "learning_rate": 9.718231147309778e-05, + "loss": 3.2443, + "step": 9503 + }, + { + "epoch": 0.5899807560990751, + "grad_norm": 0.22276807787616976, + "learning_rate": 9.718111609485976e-05, + "loss": 3.2383, + "step": 9504 + }, + { + "epoch": 0.590042833198833, + "grad_norm": 0.21789650796612636, + "learning_rate": 9.717992047046641e-05, + "loss": 3.1717, + "step": 9505 + }, + { + "epoch": 0.5901049102985908, + "grad_norm": 0.21667585345806922, + "learning_rate": 9.717872459992396e-05, + "loss": 3.1334, + "step": 9506 + }, + { + "epoch": 0.5901669873983487, + "grad_norm": 0.21020910988019323, + "learning_rate": 9.717752848323864e-05, + "loss": 3.2735, + "step": 9507 + }, + { + "epoch": 0.5902290644981066, + "grad_norm": 0.19137058120337785, + "learning_rate": 9.717633212041672e-05, + "loss": 3.1671, + "step": 9508 + }, + { + "epoch": 0.5902911415978646, + "grad_norm": 0.1868301486251639, + "learning_rate": 9.717513551146443e-05, + "loss": 3.1092, + "step": 9509 + }, + { + "epoch": 0.5903532186976225, + "grad_norm": 0.2614748373306663, + "learning_rate": 9.717393865638798e-05, + "loss": 3.2795, + "step": 9510 + }, + { + "epoch": 0.5904152957973804, + "grad_norm": 0.2240007447220778, + "learning_rate": 9.717274155519365e-05, + "loss": 3.178, + "step": 9511 + }, + { + "epoch": 0.5904773728971382, + "grad_norm": 0.19694901543414975, + "learning_rate": 9.717154420788768e-05, + "loss": 3.2135, + "step": 9512 + }, + { + "epoch": 0.5905394499968961, + "grad_norm": 0.2569953365802693, + "learning_rate": 9.717034661447632e-05, + "loss": 3.1846, + "step": 9513 + }, + { + "epoch": 0.590601527096654, + "grad_norm": 0.3388277080305514, + "learning_rate": 9.71691487749658e-05, + "loss": 3.2203, + "step": 9514 + }, + { + "epoch": 0.590663604196412, + "grad_norm": 0.2616294737040137, + "learning_rate": 9.716795068936237e-05, + "loss": 3.2527, + "step": 9515 + }, + { + "epoch": 0.5907256812961699, + "grad_norm": 0.3015261257549449, + "learning_rate": 9.716675235767231e-05, + "loss": 3.2119, + "step": 9516 + }, + { + "epoch": 0.5907877583959278, + "grad_norm": 0.313938335527709, + "learning_rate": 9.716555377990184e-05, + "loss": 3.236, + "step": 9517 + }, + { + "epoch": 0.5908498354956856, + "grad_norm": 0.4049138139632224, + "learning_rate": 9.716435495605723e-05, + "loss": 3.1324, + "step": 9518 + }, + { + "epoch": 0.5909119125954435, + "grad_norm": 0.2577462978328408, + "learning_rate": 9.716315588614474e-05, + "loss": 3.2227, + "step": 9519 + }, + { + "epoch": 0.5909739896952014, + "grad_norm": 0.43436806076325113, + "learning_rate": 9.71619565701706e-05, + "loss": 3.2168, + "step": 9520 + }, + { + "epoch": 0.5910360667949593, + "grad_norm": 0.30049910254669737, + "learning_rate": 9.71607570081411e-05, + "loss": 3.1668, + "step": 9521 + }, + { + "epoch": 0.5910981438947173, + "grad_norm": 0.2898325072213308, + "learning_rate": 9.715955720006247e-05, + "loss": 3.2244, + "step": 9522 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.34192027452967044, + "learning_rate": 9.715835714594099e-05, + "loss": 3.1342, + "step": 9523 + }, + { + "epoch": 0.591222298094233, + "grad_norm": 0.3030765937223306, + "learning_rate": 9.715715684578291e-05, + "loss": 3.1075, + "step": 9524 + }, + { + "epoch": 0.5912843751939909, + "grad_norm": 0.24393694964842078, + "learning_rate": 9.715595629959449e-05, + "loss": 3.0998, + "step": 9525 + }, + { + "epoch": 0.5913464522937488, + "grad_norm": 0.2695580759381939, + "learning_rate": 9.7154755507382e-05, + "loss": 3.3113, + "step": 9526 + }, + { + "epoch": 0.5914085293935067, + "grad_norm": 0.3054975842381938, + "learning_rate": 9.715355446915171e-05, + "loss": 3.3295, + "step": 9527 + }, + { + "epoch": 0.5914706064932647, + "grad_norm": 0.2631474967364175, + "learning_rate": 9.715235318490987e-05, + "loss": 3.3029, + "step": 9528 + }, + { + "epoch": 0.5915326835930226, + "grad_norm": 0.19344084400928702, + "learning_rate": 9.715115165466275e-05, + "loss": 3.1338, + "step": 9529 + }, + { + "epoch": 0.5915947606927804, + "grad_norm": 0.19004569831075518, + "learning_rate": 9.714994987841665e-05, + "loss": 3.2395, + "step": 9530 + }, + { + "epoch": 0.5916568377925383, + "grad_norm": 0.2623413707904725, + "learning_rate": 9.714874785617781e-05, + "loss": 3.1965, + "step": 9531 + }, + { + "epoch": 0.5917189148922962, + "grad_norm": 0.21258445036371712, + "learning_rate": 9.71475455879525e-05, + "loss": 3.2701, + "step": 9532 + }, + { + "epoch": 0.5917809919920541, + "grad_norm": 0.24478503820224254, + "learning_rate": 9.714634307374699e-05, + "loss": 3.1601, + "step": 9533 + }, + { + "epoch": 0.5918430690918121, + "grad_norm": 0.2174763213619947, + "learning_rate": 9.714514031356756e-05, + "loss": 3.2302, + "step": 9534 + }, + { + "epoch": 0.59190514619157, + "grad_norm": 0.2821355758834614, + "learning_rate": 9.714393730742051e-05, + "loss": 3.2689, + "step": 9535 + }, + { + "epoch": 0.5919672232913278, + "grad_norm": 0.318580597385716, + "learning_rate": 9.714273405531209e-05, + "loss": 3.2646, + "step": 9536 + }, + { + "epoch": 0.5920293003910857, + "grad_norm": 0.3125800864595036, + "learning_rate": 9.714153055724858e-05, + "loss": 3.1609, + "step": 9537 + }, + { + "epoch": 0.5920913774908436, + "grad_norm": 0.23778156354996052, + "learning_rate": 9.714032681323623e-05, + "loss": 3.1907, + "step": 9538 + }, + { + "epoch": 0.5921534545906015, + "grad_norm": 0.5299069312990495, + "learning_rate": 9.713912282328139e-05, + "loss": 3.171, + "step": 9539 + }, + { + "epoch": 0.5922155316903595, + "grad_norm": 0.26372863723694107, + "learning_rate": 9.713791858739029e-05, + "loss": 3.1861, + "step": 9540 + }, + { + "epoch": 0.5922776087901174, + "grad_norm": 0.318329920898159, + "learning_rate": 9.713671410556923e-05, + "loss": 3.1681, + "step": 9541 + }, + { + "epoch": 0.5923396858898752, + "grad_norm": 0.2381074146225348, + "learning_rate": 9.713550937782449e-05, + "loss": 3.2085, + "step": 9542 + }, + { + "epoch": 0.5924017629896331, + "grad_norm": 0.27727580245561145, + "learning_rate": 9.713430440416234e-05, + "loss": 3.18, + "step": 9543 + }, + { + "epoch": 0.592463840089391, + "grad_norm": 0.2630025775478521, + "learning_rate": 9.713309918458911e-05, + "loss": 3.2303, + "step": 9544 + }, + { + "epoch": 0.5925259171891489, + "grad_norm": 0.3739012077747171, + "learning_rate": 9.713189371911102e-05, + "loss": 3.2516, + "step": 9545 + }, + { + "epoch": 0.5925879942889068, + "grad_norm": 0.2807799358262867, + "learning_rate": 9.713068800773444e-05, + "loss": 3.2652, + "step": 9546 + }, + { + "epoch": 0.5926500713886648, + "grad_norm": 0.23087576309360944, + "learning_rate": 9.71294820504656e-05, + "loss": 3.2358, + "step": 9547 + }, + { + "epoch": 0.5927121484884226, + "grad_norm": 0.2080279864347045, + "learning_rate": 9.71282758473108e-05, + "loss": 3.2568, + "step": 9548 + }, + { + "epoch": 0.5927742255881805, + "grad_norm": 0.2578721988801361, + "learning_rate": 9.712706939827636e-05, + "loss": 3.2109, + "step": 9549 + }, + { + "epoch": 0.5928363026879384, + "grad_norm": 0.20288199781334848, + "learning_rate": 9.712586270336856e-05, + "loss": 3.2385, + "step": 9550 + }, + { + "epoch": 0.5928983797876963, + "grad_norm": 0.270745541330682, + "learning_rate": 9.712465576259368e-05, + "loss": 3.2335, + "step": 9551 + }, + { + "epoch": 0.5929604568874542, + "grad_norm": 0.27523452988337876, + "learning_rate": 9.712344857595805e-05, + "loss": 3.2374, + "step": 9552 + }, + { + "epoch": 0.5930225339872122, + "grad_norm": 0.19853441657638302, + "learning_rate": 9.712224114346793e-05, + "loss": 3.199, + "step": 9553 + }, + { + "epoch": 0.59308461108697, + "grad_norm": 0.24598604449030495, + "learning_rate": 9.712103346512964e-05, + "loss": 3.1936, + "step": 9554 + }, + { + "epoch": 0.5931466881867279, + "grad_norm": 0.2449709391388451, + "learning_rate": 9.71198255409495e-05, + "loss": 3.1183, + "step": 9555 + }, + { + "epoch": 0.5932087652864858, + "grad_norm": 0.3996952438200834, + "learning_rate": 9.711861737093377e-05, + "loss": 3.3346, + "step": 9556 + }, + { + "epoch": 0.5932708423862437, + "grad_norm": 0.2695578962782455, + "learning_rate": 9.711740895508879e-05, + "loss": 3.1584, + "step": 9557 + }, + { + "epoch": 0.5933329194860016, + "grad_norm": 0.33356223839757965, + "learning_rate": 9.711620029342083e-05, + "loss": 3.2281, + "step": 9558 + }, + { + "epoch": 0.5933949965857596, + "grad_norm": 0.2767234635557448, + "learning_rate": 9.711499138593623e-05, + "loss": 3.2536, + "step": 9559 + }, + { + "epoch": 0.5934570736855174, + "grad_norm": 0.2385725211021805, + "learning_rate": 9.711378223264128e-05, + "loss": 3.2053, + "step": 9560 + }, + { + "epoch": 0.5935191507852753, + "grad_norm": 0.2578343179891912, + "learning_rate": 9.71125728335423e-05, + "loss": 3.2175, + "step": 9561 + }, + { + "epoch": 0.5935812278850332, + "grad_norm": 0.2315766810760875, + "learning_rate": 9.711136318864558e-05, + "loss": 3.2392, + "step": 9562 + }, + { + "epoch": 0.5936433049847911, + "grad_norm": 0.2624896495912916, + "learning_rate": 9.711015329795743e-05, + "loss": 3.3051, + "step": 9563 + }, + { + "epoch": 0.593705382084549, + "grad_norm": 0.24381121728682398, + "learning_rate": 9.71089431614842e-05, + "loss": 3.1239, + "step": 9564 + }, + { + "epoch": 0.593767459184307, + "grad_norm": 0.22194032849029785, + "learning_rate": 9.710773277923215e-05, + "loss": 3.1165, + "step": 9565 + }, + { + "epoch": 0.5938295362840648, + "grad_norm": 0.2461895453486007, + "learning_rate": 9.710652215120764e-05, + "loss": 3.0385, + "step": 9566 + }, + { + "epoch": 0.5938916133838227, + "grad_norm": 0.34492304446409877, + "learning_rate": 9.710531127741697e-05, + "loss": 3.2572, + "step": 9567 + }, + { + "epoch": 0.5939536904835806, + "grad_norm": 0.28004609833311517, + "learning_rate": 9.710410015786644e-05, + "loss": 3.225, + "step": 9568 + }, + { + "epoch": 0.5940157675833385, + "grad_norm": 0.28747593582034836, + "learning_rate": 9.71028887925624e-05, + "loss": 3.216, + "step": 9569 + }, + { + "epoch": 0.5940778446830964, + "grad_norm": 0.2240648053528155, + "learning_rate": 9.710167718151115e-05, + "loss": 3.2315, + "step": 9570 + }, + { + "epoch": 0.5941399217828544, + "grad_norm": 0.19693253978147196, + "learning_rate": 9.710046532471901e-05, + "loss": 3.2812, + "step": 9571 + }, + { + "epoch": 0.5942019988826122, + "grad_norm": 0.2868951010881793, + "learning_rate": 9.709925322219232e-05, + "loss": 3.1079, + "step": 9572 + }, + { + "epoch": 0.5942640759823701, + "grad_norm": 0.19138090783337117, + "learning_rate": 9.709804087393738e-05, + "loss": 3.234, + "step": 9573 + }, + { + "epoch": 0.594326153082128, + "grad_norm": 0.2547160659169387, + "learning_rate": 9.709682827996054e-05, + "loss": 3.1739, + "step": 9574 + }, + { + "epoch": 0.5943882301818859, + "grad_norm": 0.21960154897722417, + "learning_rate": 9.70956154402681e-05, + "loss": 3.2702, + "step": 9575 + }, + { + "epoch": 0.5944503072816438, + "grad_norm": 0.23469676342344856, + "learning_rate": 9.709440235486643e-05, + "loss": 3.1904, + "step": 9576 + }, + { + "epoch": 0.5945123843814017, + "grad_norm": 0.23837006586506249, + "learning_rate": 9.709318902376179e-05, + "loss": 3.2264, + "step": 9577 + }, + { + "epoch": 0.5945744614811596, + "grad_norm": 0.30318949544183516, + "learning_rate": 9.709197544696057e-05, + "loss": 3.1811, + "step": 9578 + }, + { + "epoch": 0.5946365385809175, + "grad_norm": 0.25380687236018673, + "learning_rate": 9.709076162446908e-05, + "loss": 3.2437, + "step": 9579 + }, + { + "epoch": 0.5946986156806754, + "grad_norm": 0.24333726314061482, + "learning_rate": 9.708954755629366e-05, + "loss": 3.2344, + "step": 9580 + }, + { + "epoch": 0.5947606927804333, + "grad_norm": 0.21159090285328672, + "learning_rate": 9.708833324244063e-05, + "loss": 3.1173, + "step": 9581 + }, + { + "epoch": 0.5948227698801912, + "grad_norm": 0.44176181886856364, + "learning_rate": 9.708711868291635e-05, + "loss": 3.2303, + "step": 9582 + }, + { + "epoch": 0.5948848469799491, + "grad_norm": 0.2676125385959171, + "learning_rate": 9.708590387772712e-05, + "loss": 3.2702, + "step": 9583 + }, + { + "epoch": 0.594946924079707, + "grad_norm": 0.24299605820200074, + "learning_rate": 9.70846888268793e-05, + "loss": 3.1478, + "step": 9584 + }, + { + "epoch": 0.5950090011794649, + "grad_norm": 0.1997506576677651, + "learning_rate": 9.708347353037926e-05, + "loss": 3.2821, + "step": 9585 + }, + { + "epoch": 0.5950710782792228, + "grad_norm": 0.27915502827615185, + "learning_rate": 9.708225798823328e-05, + "loss": 3.1423, + "step": 9586 + }, + { + "epoch": 0.5951331553789807, + "grad_norm": 0.21466278777952036, + "learning_rate": 9.708104220044774e-05, + "loss": 3.144, + "step": 9587 + }, + { + "epoch": 0.5951952324787386, + "grad_norm": 0.24657930429291552, + "learning_rate": 9.707982616702896e-05, + "loss": 3.2368, + "step": 9588 + }, + { + "epoch": 0.5952573095784965, + "grad_norm": 0.2235608380698226, + "learning_rate": 9.707860988798332e-05, + "loss": 3.2231, + "step": 9589 + }, + { + "epoch": 0.5953193866782543, + "grad_norm": 0.20852504591742954, + "learning_rate": 9.707739336331713e-05, + "loss": 3.183, + "step": 9590 + }, + { + "epoch": 0.5953814637780123, + "grad_norm": 0.22486255308663056, + "learning_rate": 9.707617659303675e-05, + "loss": 3.2949, + "step": 9591 + }, + { + "epoch": 0.5954435408777702, + "grad_norm": 0.21267712237688174, + "learning_rate": 9.707495957714853e-05, + "loss": 3.14, + "step": 9592 + }, + { + "epoch": 0.5955056179775281, + "grad_norm": 0.22679741782170432, + "learning_rate": 9.707374231565882e-05, + "loss": 3.1196, + "step": 9593 + }, + { + "epoch": 0.595567695077286, + "grad_norm": 0.18958605202797046, + "learning_rate": 9.707252480857397e-05, + "loss": 3.2802, + "step": 9594 + }, + { + "epoch": 0.5956297721770439, + "grad_norm": 0.2339523478195539, + "learning_rate": 9.707130705590032e-05, + "loss": 3.1628, + "step": 9595 + }, + { + "epoch": 0.5956918492768017, + "grad_norm": 0.1957888801744036, + "learning_rate": 9.707008905764425e-05, + "loss": 3.2666, + "step": 9596 + }, + { + "epoch": 0.5957539263765597, + "grad_norm": 0.2281426540493797, + "learning_rate": 9.706887081381209e-05, + "loss": 3.2644, + "step": 9597 + }, + { + "epoch": 0.5958160034763176, + "grad_norm": 0.23558997112673696, + "learning_rate": 9.70676523244102e-05, + "loss": 3.2625, + "step": 9598 + }, + { + "epoch": 0.5958780805760755, + "grad_norm": 0.34177678730382716, + "learning_rate": 9.706643358944496e-05, + "loss": 3.21, + "step": 9599 + }, + { + "epoch": 0.5959401576758334, + "grad_norm": 0.19054521821925624, + "learning_rate": 9.706521460892268e-05, + "loss": 3.2325, + "step": 9600 + }, + { + "epoch": 0.5960022347755913, + "grad_norm": 0.17373960898737448, + "learning_rate": 9.706399538284978e-05, + "loss": 3.1264, + "step": 9601 + }, + { + "epoch": 0.5960643118753491, + "grad_norm": 0.28769195254267954, + "learning_rate": 9.706277591123258e-05, + "loss": 3.244, + "step": 9602 + }, + { + "epoch": 0.5961263889751071, + "grad_norm": 0.2626201280970831, + "learning_rate": 9.706155619407745e-05, + "loss": 3.2108, + "step": 9603 + }, + { + "epoch": 0.596188466074865, + "grad_norm": 0.35390926806108264, + "learning_rate": 9.706033623139075e-05, + "loss": 3.1892, + "step": 9604 + }, + { + "epoch": 0.5962505431746229, + "grad_norm": 0.24251467386318137, + "learning_rate": 9.705911602317886e-05, + "loss": 3.095, + "step": 9605 + }, + { + "epoch": 0.5963126202743808, + "grad_norm": 0.22865061563054, + "learning_rate": 9.705789556944814e-05, + "loss": 3.1363, + "step": 9606 + }, + { + "epoch": 0.5963746973741387, + "grad_norm": 0.23806739810051913, + "learning_rate": 9.705667487020495e-05, + "loss": 3.1852, + "step": 9607 + }, + { + "epoch": 0.5964367744738965, + "grad_norm": 0.1799323652607419, + "learning_rate": 9.705545392545568e-05, + "loss": 3.268, + "step": 9608 + }, + { + "epoch": 0.5964988515736545, + "grad_norm": 0.24256684220223232, + "learning_rate": 9.705423273520666e-05, + "loss": 3.2126, + "step": 9609 + }, + { + "epoch": 0.5965609286734124, + "grad_norm": 0.16554521784868861, + "learning_rate": 9.705301129946429e-05, + "loss": 3.1701, + "step": 9610 + }, + { + "epoch": 0.5966230057731703, + "grad_norm": 0.2739277531897953, + "learning_rate": 9.705178961823495e-05, + "loss": 3.1998, + "step": 9611 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.20690539871407126, + "learning_rate": 9.7050567691525e-05, + "loss": 3.1757, + "step": 9612 + }, + { + "epoch": 0.5967471599726861, + "grad_norm": 0.21502966206278473, + "learning_rate": 9.704934551934081e-05, + "loss": 3.2976, + "step": 9613 + }, + { + "epoch": 0.5968092370724439, + "grad_norm": 0.1920300361992239, + "learning_rate": 9.704812310168876e-05, + "loss": 3.2441, + "step": 9614 + }, + { + "epoch": 0.5968713141722019, + "grad_norm": 0.17702018906935993, + "learning_rate": 9.704690043857523e-05, + "loss": 3.3081, + "step": 9615 + }, + { + "epoch": 0.5969333912719598, + "grad_norm": 0.210061851912466, + "learning_rate": 9.70456775300066e-05, + "loss": 3.2385, + "step": 9616 + }, + { + "epoch": 0.5969954683717177, + "grad_norm": 0.20170226130920074, + "learning_rate": 9.704445437598925e-05, + "loss": 3.1016, + "step": 9617 + }, + { + "epoch": 0.5970575454714756, + "grad_norm": 0.22680227154320814, + "learning_rate": 9.704323097652958e-05, + "loss": 3.067, + "step": 9618 + }, + { + "epoch": 0.5971196225712335, + "grad_norm": 0.20409883378682536, + "learning_rate": 9.704200733163393e-05, + "loss": 3.2059, + "step": 9619 + }, + { + "epoch": 0.5971816996709913, + "grad_norm": 0.22968126989206392, + "learning_rate": 9.704078344130871e-05, + "loss": 3.2276, + "step": 9620 + }, + { + "epoch": 0.5972437767707492, + "grad_norm": 0.35988166586137277, + "learning_rate": 9.703955930556033e-05, + "loss": 3.2424, + "step": 9621 + }, + { + "epoch": 0.5973058538705072, + "grad_norm": 0.17252849841018114, + "learning_rate": 9.703833492439512e-05, + "loss": 3.1902, + "step": 9622 + }, + { + "epoch": 0.5973679309702651, + "grad_norm": 0.23760263553016228, + "learning_rate": 9.70371102978195e-05, + "loss": 3.1195, + "step": 9623 + }, + { + "epoch": 0.597430008070023, + "grad_norm": 0.20468425165758372, + "learning_rate": 9.703588542583987e-05, + "loss": 3.2306, + "step": 9624 + }, + { + "epoch": 0.5974920851697809, + "grad_norm": 0.24895823503968495, + "learning_rate": 9.70346603084626e-05, + "loss": 3.2474, + "step": 9625 + }, + { + "epoch": 0.5975541622695387, + "grad_norm": 0.20637675963367316, + "learning_rate": 9.703343494569409e-05, + "loss": 3.2785, + "step": 9626 + }, + { + "epoch": 0.5976162393692966, + "grad_norm": 0.2514845905143049, + "learning_rate": 9.703220933754074e-05, + "loss": 3.2213, + "step": 9627 + }, + { + "epoch": 0.5976783164690546, + "grad_norm": 0.32082729142969296, + "learning_rate": 9.703098348400892e-05, + "loss": 3.2754, + "step": 9628 + }, + { + "epoch": 0.5977403935688125, + "grad_norm": 0.2560161663441992, + "learning_rate": 9.702975738510504e-05, + "loss": 3.2588, + "step": 9629 + }, + { + "epoch": 0.5978024706685704, + "grad_norm": 0.34013835151974847, + "learning_rate": 9.702853104083551e-05, + "loss": 3.2576, + "step": 9630 + }, + { + "epoch": 0.5978645477683283, + "grad_norm": 0.32924688509595124, + "learning_rate": 9.702730445120672e-05, + "loss": 3.2018, + "step": 9631 + }, + { + "epoch": 0.5979266248680861, + "grad_norm": 0.2505960660152633, + "learning_rate": 9.702607761622507e-05, + "loss": 3.245, + "step": 9632 + }, + { + "epoch": 0.597988701967844, + "grad_norm": 0.4037338796592115, + "learning_rate": 9.702485053589695e-05, + "loss": 3.1003, + "step": 9633 + }, + { + "epoch": 0.598050779067602, + "grad_norm": 0.41482542036981596, + "learning_rate": 9.702362321022878e-05, + "loss": 3.2622, + "step": 9634 + }, + { + "epoch": 0.5981128561673599, + "grad_norm": 0.2966394996169622, + "learning_rate": 9.702239563922693e-05, + "loss": 3.0605, + "step": 9635 + }, + { + "epoch": 0.5981749332671178, + "grad_norm": 0.4392654696450593, + "learning_rate": 9.702116782289782e-05, + "loss": 3.1849, + "step": 9636 + }, + { + "epoch": 0.5982370103668757, + "grad_norm": 0.30882189562242496, + "learning_rate": 9.70199397612479e-05, + "loss": 3.236, + "step": 9637 + }, + { + "epoch": 0.5982990874666335, + "grad_norm": 0.36570929183275136, + "learning_rate": 9.701871145428352e-05, + "loss": 3.1897, + "step": 9638 + }, + { + "epoch": 0.5983611645663914, + "grad_norm": 0.40803102579729306, + "learning_rate": 9.701748290201111e-05, + "loss": 3.1812, + "step": 9639 + }, + { + "epoch": 0.5984232416661494, + "grad_norm": 0.3916541466166051, + "learning_rate": 9.701625410443707e-05, + "loss": 3.2191, + "step": 9640 + }, + { + "epoch": 0.5984853187659073, + "grad_norm": 0.2797515875632381, + "learning_rate": 9.701502506156783e-05, + "loss": 3.2315, + "step": 9641 + }, + { + "epoch": 0.5985473958656652, + "grad_norm": 0.29241187257651713, + "learning_rate": 9.701379577340979e-05, + "loss": 3.1669, + "step": 9642 + }, + { + "epoch": 0.5986094729654231, + "grad_norm": 0.25401987321146013, + "learning_rate": 9.701256623996935e-05, + "loss": 3.152, + "step": 9643 + }, + { + "epoch": 0.5986715500651809, + "grad_norm": 0.418370953567946, + "learning_rate": 9.701133646125295e-05, + "loss": 3.1991, + "step": 9644 + }, + { + "epoch": 0.5987336271649388, + "grad_norm": 0.27683711453641474, + "learning_rate": 9.7010106437267e-05, + "loss": 3.1698, + "step": 9645 + }, + { + "epoch": 0.5987957042646967, + "grad_norm": 0.28911026998106243, + "learning_rate": 9.70088761680179e-05, + "loss": 3.1037, + "step": 9646 + }, + { + "epoch": 0.5988577813644547, + "grad_norm": 0.3515883330284799, + "learning_rate": 9.70076456535121e-05, + "loss": 3.1194, + "step": 9647 + }, + { + "epoch": 0.5989198584642126, + "grad_norm": 0.24597018030812395, + "learning_rate": 9.700641489375599e-05, + "loss": 3.1445, + "step": 9648 + }, + { + "epoch": 0.5989819355639705, + "grad_norm": 0.2276709505884071, + "learning_rate": 9.7005183888756e-05, + "loss": 3.2251, + "step": 9649 + }, + { + "epoch": 0.5990440126637283, + "grad_norm": 0.26197211968339895, + "learning_rate": 9.700395263851856e-05, + "loss": 3.2327, + "step": 9650 + }, + { + "epoch": 0.5991060897634862, + "grad_norm": 0.31458481110400455, + "learning_rate": 9.700272114305009e-05, + "loss": 3.1593, + "step": 9651 + }, + { + "epoch": 0.5991681668632441, + "grad_norm": 0.3167193006182318, + "learning_rate": 9.700148940235701e-05, + "loss": 3.257, + "step": 9652 + }, + { + "epoch": 0.5992302439630021, + "grad_norm": 0.26403074920097513, + "learning_rate": 9.700025741644574e-05, + "loss": 3.2567, + "step": 9653 + }, + { + "epoch": 0.59929232106276, + "grad_norm": 0.36440591128115774, + "learning_rate": 9.699902518532274e-05, + "loss": 3.2146, + "step": 9654 + }, + { + "epoch": 0.5993543981625179, + "grad_norm": 0.24892859055141198, + "learning_rate": 9.69977927089944e-05, + "loss": 3.1597, + "step": 9655 + }, + { + "epoch": 0.5994164752622757, + "grad_norm": 0.3682135389357959, + "learning_rate": 9.699655998746717e-05, + "loss": 3.1219, + "step": 9656 + }, + { + "epoch": 0.5994785523620336, + "grad_norm": 0.2882967807752461, + "learning_rate": 9.699532702074748e-05, + "loss": 3.1587, + "step": 9657 + }, + { + "epoch": 0.5995406294617915, + "grad_norm": 0.25201703525195696, + "learning_rate": 9.699409380884174e-05, + "loss": 3.2356, + "step": 9658 + }, + { + "epoch": 0.5996027065615495, + "grad_norm": 0.2668462755034818, + "learning_rate": 9.699286035175643e-05, + "loss": 3.2145, + "step": 9659 + }, + { + "epoch": 0.5996647836613074, + "grad_norm": 0.24117771491322518, + "learning_rate": 9.699162664949795e-05, + "loss": 3.2112, + "step": 9660 + }, + { + "epoch": 0.5997268607610653, + "grad_norm": 0.25519587849697734, + "learning_rate": 9.699039270207276e-05, + "loss": 3.2215, + "step": 9661 + }, + { + "epoch": 0.5997889378608231, + "grad_norm": 0.19172465835271746, + "learning_rate": 9.698915850948725e-05, + "loss": 3.1601, + "step": 9662 + }, + { + "epoch": 0.599851014960581, + "grad_norm": 0.2716928820282093, + "learning_rate": 9.698792407174792e-05, + "loss": 3.3343, + "step": 9663 + }, + { + "epoch": 0.5999130920603389, + "grad_norm": 0.373886700926956, + "learning_rate": 9.698668938886118e-05, + "loss": 3.1776, + "step": 9664 + }, + { + "epoch": 0.5999751691600969, + "grad_norm": 0.2978056074867938, + "learning_rate": 9.698545446083347e-05, + "loss": 3.2228, + "step": 9665 + }, + { + "epoch": 0.6000372462598548, + "grad_norm": 0.18992093914935934, + "learning_rate": 9.698421928767124e-05, + "loss": 3.2138, + "step": 9666 + }, + { + "epoch": 0.6000993233596127, + "grad_norm": 0.1955619839856224, + "learning_rate": 9.698298386938093e-05, + "loss": 3.1825, + "step": 9667 + }, + { + "epoch": 0.6001614004593705, + "grad_norm": 0.2226931558105442, + "learning_rate": 9.6981748205969e-05, + "loss": 3.1941, + "step": 9668 + }, + { + "epoch": 0.6002234775591284, + "grad_norm": 0.2872035332771223, + "learning_rate": 9.698051229744188e-05, + "loss": 3.1842, + "step": 9669 + }, + { + "epoch": 0.6002855546588863, + "grad_norm": 0.23374681529257252, + "learning_rate": 9.6979276143806e-05, + "loss": 3.1858, + "step": 9670 + }, + { + "epoch": 0.6003476317586443, + "grad_norm": 0.25657383704373066, + "learning_rate": 9.697803974506785e-05, + "loss": 3.2146, + "step": 9671 + }, + { + "epoch": 0.6004097088584022, + "grad_norm": 0.23761117240570603, + "learning_rate": 9.697680310123387e-05, + "loss": 3.2298, + "step": 9672 + }, + { + "epoch": 0.6004717859581601, + "grad_norm": 0.2544015162235712, + "learning_rate": 9.69755662123105e-05, + "loss": 3.1612, + "step": 9673 + }, + { + "epoch": 0.6005338630579179, + "grad_norm": 0.21140437204974638, + "learning_rate": 9.69743290783042e-05, + "loss": 3.1448, + "step": 9674 + }, + { + "epoch": 0.6005959401576758, + "grad_norm": 0.21046987551676358, + "learning_rate": 9.69730916992214e-05, + "loss": 3.0738, + "step": 9675 + }, + { + "epoch": 0.6006580172574337, + "grad_norm": 0.28548312069070997, + "learning_rate": 9.697185407506859e-05, + "loss": 3.1439, + "step": 9676 + }, + { + "epoch": 0.6007200943571916, + "grad_norm": 0.2913402820197855, + "learning_rate": 9.697061620585221e-05, + "loss": 3.1998, + "step": 9677 + }, + { + "epoch": 0.6007821714569496, + "grad_norm": 0.24815409137876, + "learning_rate": 9.696937809157873e-05, + "loss": 3.2985, + "step": 9678 + }, + { + "epoch": 0.6008442485567075, + "grad_norm": 0.2622940657816258, + "learning_rate": 9.69681397322546e-05, + "loss": 3.2528, + "step": 9679 + }, + { + "epoch": 0.6009063256564653, + "grad_norm": 0.34085493813634077, + "learning_rate": 9.696690112788629e-05, + "loss": 3.242, + "step": 9680 + }, + { + "epoch": 0.6009684027562232, + "grad_norm": 0.22529329581830146, + "learning_rate": 9.696566227848026e-05, + "loss": 3.254, + "step": 9681 + }, + { + "epoch": 0.6010304798559811, + "grad_norm": 0.22954488920194535, + "learning_rate": 9.696442318404295e-05, + "loss": 3.2158, + "step": 9682 + }, + { + "epoch": 0.601092556955739, + "grad_norm": 0.2342222510719367, + "learning_rate": 9.696318384458085e-05, + "loss": 3.1171, + "step": 9683 + }, + { + "epoch": 0.601154634055497, + "grad_norm": 0.23916510518048337, + "learning_rate": 9.696194426010043e-05, + "loss": 3.2657, + "step": 9684 + }, + { + "epoch": 0.6012167111552549, + "grad_norm": 0.20974075837741996, + "learning_rate": 9.696070443060814e-05, + "loss": 3.1998, + "step": 9685 + }, + { + "epoch": 0.6012787882550127, + "grad_norm": 0.320131541518584, + "learning_rate": 9.695946435611045e-05, + "loss": 3.2067, + "step": 9686 + }, + { + "epoch": 0.6013408653547706, + "grad_norm": 0.306928790810729, + "learning_rate": 9.695822403661383e-05, + "loss": 3.1912, + "step": 9687 + }, + { + "epoch": 0.6014029424545285, + "grad_norm": 0.25180163005868594, + "learning_rate": 9.695698347212477e-05, + "loss": 3.175, + "step": 9688 + }, + { + "epoch": 0.6014650195542864, + "grad_norm": 0.20196329382671768, + "learning_rate": 9.695574266264972e-05, + "loss": 3.2389, + "step": 9689 + }, + { + "epoch": 0.6015270966540444, + "grad_norm": 0.28819581861947124, + "learning_rate": 9.695450160819516e-05, + "loss": 3.1646, + "step": 9690 + }, + { + "epoch": 0.6015891737538023, + "grad_norm": 0.30380568912188055, + "learning_rate": 9.695326030876757e-05, + "loss": 3.2598, + "step": 9691 + }, + { + "epoch": 0.6016512508535601, + "grad_norm": 0.22455811952331212, + "learning_rate": 9.695201876437343e-05, + "loss": 3.1747, + "step": 9692 + }, + { + "epoch": 0.601713327953318, + "grad_norm": 0.34178989948239197, + "learning_rate": 9.695077697501919e-05, + "loss": 3.2745, + "step": 9693 + }, + { + "epoch": 0.6017754050530759, + "grad_norm": 0.4487222996612575, + "learning_rate": 9.694953494071137e-05, + "loss": 3.1996, + "step": 9694 + }, + { + "epoch": 0.6018374821528338, + "grad_norm": 0.2910900660807447, + "learning_rate": 9.694829266145642e-05, + "loss": 3.3006, + "step": 9695 + }, + { + "epoch": 0.6018995592525918, + "grad_norm": 0.26089894301115213, + "learning_rate": 9.694705013726082e-05, + "loss": 3.243, + "step": 9696 + }, + { + "epoch": 0.6019616363523497, + "grad_norm": 0.29412375068642405, + "learning_rate": 9.694580736813108e-05, + "loss": 3.2385, + "step": 9697 + }, + { + "epoch": 0.6020237134521075, + "grad_norm": 0.23303476035497567, + "learning_rate": 9.694456435407366e-05, + "loss": 3.1675, + "step": 9698 + }, + { + "epoch": 0.6020857905518654, + "grad_norm": 0.2307214805981896, + "learning_rate": 9.694332109509504e-05, + "loss": 3.3107, + "step": 9699 + }, + { + "epoch": 0.6021478676516233, + "grad_norm": 0.24996607520475875, + "learning_rate": 9.694207759120173e-05, + "loss": 3.2174, + "step": 9700 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.21369475114953876, + "learning_rate": 9.69408338424002e-05, + "loss": 3.1335, + "step": 9701 + }, + { + "epoch": 0.6022720218511391, + "grad_norm": 0.25543735731236167, + "learning_rate": 9.693958984869696e-05, + "loss": 3.2459, + "step": 9702 + }, + { + "epoch": 0.6023340989508971, + "grad_norm": 0.1946681839069208, + "learning_rate": 9.693834561009847e-05, + "loss": 3.1817, + "step": 9703 + }, + { + "epoch": 0.6023961760506549, + "grad_norm": 0.22131138930517386, + "learning_rate": 9.693710112661123e-05, + "loss": 3.1979, + "step": 9704 + }, + { + "epoch": 0.6024582531504128, + "grad_norm": 0.2518953804261496, + "learning_rate": 9.693585639824176e-05, + "loss": 3.1088, + "step": 9705 + }, + { + "epoch": 0.6025203302501707, + "grad_norm": 0.22662434062819758, + "learning_rate": 9.69346114249965e-05, + "loss": 3.1601, + "step": 9706 + }, + { + "epoch": 0.6025824073499286, + "grad_norm": 0.21427792686832992, + "learning_rate": 9.6933366206882e-05, + "loss": 3.2014, + "step": 9707 + }, + { + "epoch": 0.6026444844496865, + "grad_norm": 0.3462529632117934, + "learning_rate": 9.693212074390474e-05, + "loss": 3.2115, + "step": 9708 + }, + { + "epoch": 0.6027065615494445, + "grad_norm": 0.22365191015798308, + "learning_rate": 9.693087503607121e-05, + "loss": 3.2206, + "step": 9709 + }, + { + "epoch": 0.6027686386492023, + "grad_norm": 0.21758655689276393, + "learning_rate": 9.692962908338791e-05, + "loss": 3.277, + "step": 9710 + }, + { + "epoch": 0.6028307157489602, + "grad_norm": 0.19032094736252905, + "learning_rate": 9.692838288586134e-05, + "loss": 3.2251, + "step": 9711 + }, + { + "epoch": 0.6028927928487181, + "grad_norm": 0.240399877403013, + "learning_rate": 9.692713644349801e-05, + "loss": 3.2943, + "step": 9712 + }, + { + "epoch": 0.602954869948476, + "grad_norm": 0.2011182676573111, + "learning_rate": 9.692588975630441e-05, + "loss": 3.0715, + "step": 9713 + }, + { + "epoch": 0.6030169470482339, + "grad_norm": 0.23606664205325417, + "learning_rate": 9.692464282428706e-05, + "loss": 3.2218, + "step": 9714 + }, + { + "epoch": 0.6030790241479919, + "grad_norm": 0.23940367631483633, + "learning_rate": 9.692339564745246e-05, + "loss": 3.1314, + "step": 9715 + }, + { + "epoch": 0.6031411012477497, + "grad_norm": 0.19306999284252174, + "learning_rate": 9.69221482258071e-05, + "loss": 3.2352, + "step": 9716 + }, + { + "epoch": 0.6032031783475076, + "grad_norm": 0.2001313363173562, + "learning_rate": 9.692090055935751e-05, + "loss": 3.1571, + "step": 9717 + }, + { + "epoch": 0.6032652554472655, + "grad_norm": 0.22352172557901495, + "learning_rate": 9.691965264811018e-05, + "loss": 3.2015, + "step": 9718 + }, + { + "epoch": 0.6033273325470234, + "grad_norm": 0.1915118257167985, + "learning_rate": 9.691840449207164e-05, + "loss": 3.1512, + "step": 9719 + }, + { + "epoch": 0.6033894096467813, + "grad_norm": 0.2590726630008567, + "learning_rate": 9.69171560912484e-05, + "loss": 3.1365, + "step": 9720 + }, + { + "epoch": 0.6034514867465393, + "grad_norm": 0.24254940280630433, + "learning_rate": 9.691590744564696e-05, + "loss": 3.0792, + "step": 9721 + }, + { + "epoch": 0.6035135638462971, + "grad_norm": 0.2061640914540547, + "learning_rate": 9.691465855527385e-05, + "loss": 3.1884, + "step": 9722 + }, + { + "epoch": 0.603575640946055, + "grad_norm": 0.2092987619210308, + "learning_rate": 9.691340942013557e-05, + "loss": 3.2918, + "step": 9723 + }, + { + "epoch": 0.6036377180458129, + "grad_norm": 0.22243756714895146, + "learning_rate": 9.691216004023864e-05, + "loss": 3.2415, + "step": 9724 + }, + { + "epoch": 0.6036997951455708, + "grad_norm": 0.1966526213644108, + "learning_rate": 9.69109104155896e-05, + "loss": 3.2556, + "step": 9725 + }, + { + "epoch": 0.6037618722453287, + "grad_norm": 0.20883612856100378, + "learning_rate": 9.690966054619493e-05, + "loss": 3.2078, + "step": 9726 + }, + { + "epoch": 0.6038239493450867, + "grad_norm": 0.20533918385877958, + "learning_rate": 9.69084104320612e-05, + "loss": 3.2214, + "step": 9727 + }, + { + "epoch": 0.6038860264448445, + "grad_norm": 0.24382924631796585, + "learning_rate": 9.690716007319488e-05, + "loss": 3.1788, + "step": 9728 + }, + { + "epoch": 0.6039481035446024, + "grad_norm": 0.24651855558210684, + "learning_rate": 9.690590946960253e-05, + "loss": 3.1482, + "step": 9729 + }, + { + "epoch": 0.6040101806443603, + "grad_norm": 0.24220561913422484, + "learning_rate": 9.690465862129065e-05, + "loss": 3.2331, + "step": 9730 + }, + { + "epoch": 0.6040722577441182, + "grad_norm": 0.2115950842603882, + "learning_rate": 9.69034075282658e-05, + "loss": 3.2741, + "step": 9731 + }, + { + "epoch": 0.6041343348438761, + "grad_norm": 0.1898399709788514, + "learning_rate": 9.690215619053448e-05, + "loss": 3.168, + "step": 9732 + }, + { + "epoch": 0.604196411943634, + "grad_norm": 0.2368464971872013, + "learning_rate": 9.690090460810321e-05, + "loss": 3.216, + "step": 9733 + }, + { + "epoch": 0.6042584890433919, + "grad_norm": 0.22567608505956743, + "learning_rate": 9.689965278097857e-05, + "loss": 3.2406, + "step": 9734 + }, + { + "epoch": 0.6043205661431498, + "grad_norm": 0.25676700261619934, + "learning_rate": 9.689840070916703e-05, + "loss": 3.1941, + "step": 9735 + }, + { + "epoch": 0.6043826432429077, + "grad_norm": 0.2632718477919116, + "learning_rate": 9.689714839267515e-05, + "loss": 3.1697, + "step": 9736 + }, + { + "epoch": 0.6044447203426656, + "grad_norm": 0.28755687690050824, + "learning_rate": 9.689589583150946e-05, + "loss": 3.1773, + "step": 9737 + }, + { + "epoch": 0.6045067974424235, + "grad_norm": 0.1832877409030341, + "learning_rate": 9.68946430256765e-05, + "loss": 3.1823, + "step": 9738 + }, + { + "epoch": 0.6045688745421814, + "grad_norm": 0.22431035247459233, + "learning_rate": 9.689338997518281e-05, + "loss": 3.1955, + "step": 9739 + }, + { + "epoch": 0.6046309516419393, + "grad_norm": 0.17814503900007878, + "learning_rate": 9.689213668003492e-05, + "loss": 3.1577, + "step": 9740 + }, + { + "epoch": 0.6046930287416972, + "grad_norm": 0.23441115736214008, + "learning_rate": 9.689088314023937e-05, + "loss": 3.1451, + "step": 9741 + }, + { + "epoch": 0.6047551058414551, + "grad_norm": 0.2034186478125949, + "learning_rate": 9.688962935580268e-05, + "loss": 3.1743, + "step": 9742 + }, + { + "epoch": 0.604817182941213, + "grad_norm": 0.23927374027718923, + "learning_rate": 9.688837532673143e-05, + "loss": 3.1843, + "step": 9743 + }, + { + "epoch": 0.6048792600409709, + "grad_norm": 0.2947065438865643, + "learning_rate": 9.688712105303215e-05, + "loss": 3.1285, + "step": 9744 + }, + { + "epoch": 0.6049413371407287, + "grad_norm": 0.23864749996253276, + "learning_rate": 9.688586653471137e-05, + "loss": 3.1498, + "step": 9745 + }, + { + "epoch": 0.6050034142404866, + "grad_norm": 0.28647595553669464, + "learning_rate": 9.688461177177563e-05, + "loss": 3.1959, + "step": 9746 + }, + { + "epoch": 0.6050654913402446, + "grad_norm": 0.22219383173047746, + "learning_rate": 9.688335676423151e-05, + "loss": 3.1679, + "step": 9747 + }, + { + "epoch": 0.6051275684400025, + "grad_norm": 0.3457524323726113, + "learning_rate": 9.68821015120855e-05, + "loss": 3.2904, + "step": 9748 + }, + { + "epoch": 0.6051896455397604, + "grad_norm": 0.22438514961461842, + "learning_rate": 9.688084601534421e-05, + "loss": 3.1872, + "step": 9749 + }, + { + "epoch": 0.6052517226395183, + "grad_norm": 0.3213557953795836, + "learning_rate": 9.687959027401418e-05, + "loss": 3.2589, + "step": 9750 + }, + { + "epoch": 0.6053137997392761, + "grad_norm": 0.2716774973997013, + "learning_rate": 9.687833428810194e-05, + "loss": 3.1661, + "step": 9751 + }, + { + "epoch": 0.605375876839034, + "grad_norm": 0.18645723775900522, + "learning_rate": 9.687707805761403e-05, + "loss": 3.1468, + "step": 9752 + }, + { + "epoch": 0.605437953938792, + "grad_norm": 0.2831611602209269, + "learning_rate": 9.687582158255704e-05, + "loss": 3.2265, + "step": 9753 + }, + { + "epoch": 0.6055000310385499, + "grad_norm": 0.22936937008218866, + "learning_rate": 9.68745648629375e-05, + "loss": 3.2429, + "step": 9754 + }, + { + "epoch": 0.6055621081383078, + "grad_norm": 0.21672358433915317, + "learning_rate": 9.6873307898762e-05, + "loss": 3.2265, + "step": 9755 + }, + { + "epoch": 0.6056241852380657, + "grad_norm": 0.1887374515183218, + "learning_rate": 9.687205069003704e-05, + "loss": 3.1643, + "step": 9756 + }, + { + "epoch": 0.6056862623378235, + "grad_norm": 0.2659914070118168, + "learning_rate": 9.687079323676922e-05, + "loss": 3.1506, + "step": 9757 + }, + { + "epoch": 0.6057483394375814, + "grad_norm": 0.2724913713418971, + "learning_rate": 9.68695355389651e-05, + "loss": 3.1382, + "step": 9758 + }, + { + "epoch": 0.6058104165373394, + "grad_norm": 0.24300500347829967, + "learning_rate": 9.686827759663124e-05, + "loss": 3.1418, + "step": 9759 + }, + { + "epoch": 0.6058724936370973, + "grad_norm": 0.30664640773495905, + "learning_rate": 9.68670194097742e-05, + "loss": 3.1862, + "step": 9760 + }, + { + "epoch": 0.6059345707368552, + "grad_norm": 0.27422809747919635, + "learning_rate": 9.686576097840053e-05, + "loss": 3.1396, + "step": 9761 + }, + { + "epoch": 0.6059966478366131, + "grad_norm": 0.23405386859171567, + "learning_rate": 9.686450230251683e-05, + "loss": 3.163, + "step": 9762 + }, + { + "epoch": 0.6060587249363709, + "grad_norm": 0.34286344601699775, + "learning_rate": 9.686324338212962e-05, + "loss": 3.2559, + "step": 9763 + }, + { + "epoch": 0.6061208020361288, + "grad_norm": 0.3093497034770333, + "learning_rate": 9.686198421724551e-05, + "loss": 3.187, + "step": 9764 + }, + { + "epoch": 0.6061828791358868, + "grad_norm": 0.24761924527359286, + "learning_rate": 9.686072480787103e-05, + "loss": 3.1846, + "step": 9765 + }, + { + "epoch": 0.6062449562356447, + "grad_norm": 0.22505761905516375, + "learning_rate": 9.685946515401279e-05, + "loss": 3.1264, + "step": 9766 + }, + { + "epoch": 0.6063070333354026, + "grad_norm": 0.20493926842331664, + "learning_rate": 9.685820525567735e-05, + "loss": 3.2062, + "step": 9767 + }, + { + "epoch": 0.6063691104351605, + "grad_norm": 0.4869425312752263, + "learning_rate": 9.685694511287126e-05, + "loss": 3.2612, + "step": 9768 + }, + { + "epoch": 0.6064311875349183, + "grad_norm": 0.3483439298475128, + "learning_rate": 9.685568472560114e-05, + "loss": 3.1944, + "step": 9769 + }, + { + "epoch": 0.6064932646346762, + "grad_norm": 0.31459274509489593, + "learning_rate": 9.685442409387351e-05, + "loss": 3.1513, + "step": 9770 + }, + { + "epoch": 0.6065553417344342, + "grad_norm": 0.22568229169812495, + "learning_rate": 9.685316321769499e-05, + "loss": 3.2133, + "step": 9771 + }, + { + "epoch": 0.6066174188341921, + "grad_norm": 0.2637976593645805, + "learning_rate": 9.685190209707213e-05, + "loss": 3.145, + "step": 9772 + }, + { + "epoch": 0.60667949593395, + "grad_norm": 0.22388992355197398, + "learning_rate": 9.685064073201154e-05, + "loss": 3.1434, + "step": 9773 + }, + { + "epoch": 0.6067415730337079, + "grad_norm": 0.30312630908722715, + "learning_rate": 9.684937912251976e-05, + "loss": 3.2706, + "step": 9774 + }, + { + "epoch": 0.6068036501334657, + "grad_norm": 0.2670621142733891, + "learning_rate": 9.684811726860343e-05, + "loss": 3.1085, + "step": 9775 + }, + { + "epoch": 0.6068657272332236, + "grad_norm": 0.23254605189693392, + "learning_rate": 9.684685517026909e-05, + "loss": 3.2733, + "step": 9776 + }, + { + "epoch": 0.6069278043329815, + "grad_norm": 0.2774649207607847, + "learning_rate": 9.684559282752332e-05, + "loss": 3.2301, + "step": 9777 + }, + { + "epoch": 0.6069898814327395, + "grad_norm": 0.26975640277582025, + "learning_rate": 9.684433024037271e-05, + "loss": 3.2871, + "step": 9778 + }, + { + "epoch": 0.6070519585324974, + "grad_norm": 0.26851158489776195, + "learning_rate": 9.684306740882388e-05, + "loss": 3.2182, + "step": 9779 + }, + { + "epoch": 0.6071140356322553, + "grad_norm": 0.20039668306410552, + "learning_rate": 9.68418043328834e-05, + "loss": 3.1311, + "step": 9780 + }, + { + "epoch": 0.6071761127320131, + "grad_norm": 0.23582170152089235, + "learning_rate": 9.684054101255783e-05, + "loss": 3.1972, + "step": 9781 + }, + { + "epoch": 0.607238189831771, + "grad_norm": 0.20014855373752488, + "learning_rate": 9.683927744785382e-05, + "loss": 3.1883, + "step": 9782 + }, + { + "epoch": 0.6073002669315289, + "grad_norm": 0.27918628023528863, + "learning_rate": 9.68380136387779e-05, + "loss": 3.2034, + "step": 9783 + }, + { + "epoch": 0.6073623440312869, + "grad_norm": 0.22855411107641438, + "learning_rate": 9.68367495853367e-05, + "loss": 3.2192, + "step": 9784 + }, + { + "epoch": 0.6074244211310448, + "grad_norm": 0.25938735036286353, + "learning_rate": 9.68354852875368e-05, + "loss": 3.1546, + "step": 9785 + }, + { + "epoch": 0.6074864982308027, + "grad_norm": 0.38028090145895416, + "learning_rate": 9.683422074538482e-05, + "loss": 3.1603, + "step": 9786 + }, + { + "epoch": 0.6075485753305605, + "grad_norm": 0.3773388643126185, + "learning_rate": 9.683295595888734e-05, + "loss": 3.205, + "step": 9787 + }, + { + "epoch": 0.6076106524303184, + "grad_norm": 0.3122100677199388, + "learning_rate": 9.683169092805096e-05, + "loss": 3.2619, + "step": 9788 + }, + { + "epoch": 0.6076727295300763, + "grad_norm": 0.26886325102283576, + "learning_rate": 9.683042565288228e-05, + "loss": 3.2375, + "step": 9789 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.2411247207443218, + "learning_rate": 9.682916013338789e-05, + "loss": 3.2794, + "step": 9790 + }, + { + "epoch": 0.6077968837295922, + "grad_norm": 0.19541102785204909, + "learning_rate": 9.68278943695744e-05, + "loss": 3.1943, + "step": 9791 + }, + { + "epoch": 0.6078589608293501, + "grad_norm": 0.238023094351538, + "learning_rate": 9.682662836144843e-05, + "loss": 3.163, + "step": 9792 + }, + { + "epoch": 0.6079210379291079, + "grad_norm": 0.23283780971665627, + "learning_rate": 9.682536210901655e-05, + "loss": 3.1966, + "step": 9793 + }, + { + "epoch": 0.6079831150288658, + "grad_norm": 0.21057223554757845, + "learning_rate": 9.682409561228543e-05, + "loss": 3.2216, + "step": 9794 + }, + { + "epoch": 0.6080451921286237, + "grad_norm": 0.19314720573969116, + "learning_rate": 9.68228288712616e-05, + "loss": 3.1889, + "step": 9795 + }, + { + "epoch": 0.6081072692283817, + "grad_norm": 0.2585635948462263, + "learning_rate": 9.682156188595173e-05, + "loss": 3.1386, + "step": 9796 + }, + { + "epoch": 0.6081693463281396, + "grad_norm": 0.25400724603143177, + "learning_rate": 9.682029465636239e-05, + "loss": 3.1991, + "step": 9797 + }, + { + "epoch": 0.6082314234278975, + "grad_norm": 0.18030893631360465, + "learning_rate": 9.68190271825002e-05, + "loss": 3.1986, + "step": 9798 + }, + { + "epoch": 0.6082935005276553, + "grad_norm": 0.26224814435413263, + "learning_rate": 9.68177594643718e-05, + "loss": 3.2025, + "step": 9799 + }, + { + "epoch": 0.6083555776274132, + "grad_norm": 0.19758315072808047, + "learning_rate": 9.681649150198377e-05, + "loss": 3.1695, + "step": 9800 + }, + { + "epoch": 0.6084176547271711, + "grad_norm": 0.18941021462794222, + "learning_rate": 9.681522329534273e-05, + "loss": 3.2, + "step": 9801 + }, + { + "epoch": 0.608479731826929, + "grad_norm": 0.41446281371445454, + "learning_rate": 9.681395484445532e-05, + "loss": 3.2237, + "step": 9802 + }, + { + "epoch": 0.608541808926687, + "grad_norm": 0.28355439861496773, + "learning_rate": 9.681268614932815e-05, + "loss": 3.2012, + "step": 9803 + }, + { + "epoch": 0.6086038860264449, + "grad_norm": 0.24304255931410634, + "learning_rate": 9.681141720996782e-05, + "loss": 3.1184, + "step": 9804 + }, + { + "epoch": 0.6086659631262027, + "grad_norm": 0.29331264657033773, + "learning_rate": 9.681014802638096e-05, + "loss": 3.1753, + "step": 9805 + }, + { + "epoch": 0.6087280402259606, + "grad_norm": 0.26050139320462257, + "learning_rate": 9.680887859857421e-05, + "loss": 3.2737, + "step": 9806 + }, + { + "epoch": 0.6087901173257185, + "grad_norm": 0.2809494717628128, + "learning_rate": 9.680760892655416e-05, + "loss": 3.0616, + "step": 9807 + }, + { + "epoch": 0.6088521944254764, + "grad_norm": 0.20384203458722422, + "learning_rate": 9.680633901032745e-05, + "loss": 3.2063, + "step": 9808 + }, + { + "epoch": 0.6089142715252344, + "grad_norm": 0.26036277054330287, + "learning_rate": 9.680506884990072e-05, + "loss": 3.2495, + "step": 9809 + }, + { + "epoch": 0.6089763486249923, + "grad_norm": 0.239183601236478, + "learning_rate": 9.680379844528058e-05, + "loss": 3.1812, + "step": 9810 + }, + { + "epoch": 0.6090384257247501, + "grad_norm": 0.3153723673512167, + "learning_rate": 9.680252779647365e-05, + "loss": 3.2265, + "step": 9811 + }, + { + "epoch": 0.609100502824508, + "grad_norm": 0.19224097740668997, + "learning_rate": 9.680125690348659e-05, + "loss": 3.0937, + "step": 9812 + }, + { + "epoch": 0.6091625799242659, + "grad_norm": 0.22207453513177847, + "learning_rate": 9.6799985766326e-05, + "loss": 3.2258, + "step": 9813 + }, + { + "epoch": 0.6092246570240238, + "grad_norm": 0.22626700533665872, + "learning_rate": 9.679871438499852e-05, + "loss": 3.263, + "step": 9814 + }, + { + "epoch": 0.6092867341237818, + "grad_norm": 0.23583364379820815, + "learning_rate": 9.679744275951079e-05, + "loss": 3.1835, + "step": 9815 + }, + { + "epoch": 0.6093488112235397, + "grad_norm": 0.25513115838340455, + "learning_rate": 9.679617088986945e-05, + "loss": 3.2108, + "step": 9816 + }, + { + "epoch": 0.6094108883232975, + "grad_norm": 0.2082569170915887, + "learning_rate": 9.679489877608112e-05, + "loss": 3.2239, + "step": 9817 + }, + { + "epoch": 0.6094729654230554, + "grad_norm": 0.21209789687655775, + "learning_rate": 9.679362641815242e-05, + "loss": 3.1343, + "step": 9818 + }, + { + "epoch": 0.6095350425228133, + "grad_norm": 0.20273765899409243, + "learning_rate": 9.679235381609004e-05, + "loss": 3.2046, + "step": 9819 + }, + { + "epoch": 0.6095971196225712, + "grad_norm": 0.696804503223636, + "learning_rate": 9.67910809699006e-05, + "loss": 3.2249, + "step": 9820 + }, + { + "epoch": 0.6096591967223292, + "grad_norm": 0.23625768447165602, + "learning_rate": 9.67898078795907e-05, + "loss": 3.162, + "step": 9821 + }, + { + "epoch": 0.6097212738220871, + "grad_norm": 0.22942284693073986, + "learning_rate": 9.678853454516704e-05, + "loss": 3.2281, + "step": 9822 + }, + { + "epoch": 0.6097833509218449, + "grad_norm": 0.280483990842233, + "learning_rate": 9.678726096663622e-05, + "loss": 3.2707, + "step": 9823 + }, + { + "epoch": 0.6098454280216028, + "grad_norm": 0.24325057486204493, + "learning_rate": 9.678598714400491e-05, + "loss": 3.1476, + "step": 9824 + }, + { + "epoch": 0.6099075051213607, + "grad_norm": 0.22829819793244763, + "learning_rate": 9.678471307727975e-05, + "loss": 3.2237, + "step": 9825 + }, + { + "epoch": 0.6099695822211186, + "grad_norm": 0.22514151685798714, + "learning_rate": 9.678343876646736e-05, + "loss": 3.0871, + "step": 9826 + }, + { + "epoch": 0.6100316593208766, + "grad_norm": 0.2322605679240888, + "learning_rate": 9.678216421157443e-05, + "loss": 3.2015, + "step": 9827 + }, + { + "epoch": 0.6100937364206345, + "grad_norm": 0.33873552738396456, + "learning_rate": 9.67808894126076e-05, + "loss": 3.1921, + "step": 9828 + }, + { + "epoch": 0.6101558135203923, + "grad_norm": 0.24355126006063385, + "learning_rate": 9.67796143695735e-05, + "loss": 3.2269, + "step": 9829 + }, + { + "epoch": 0.6102178906201502, + "grad_norm": 0.254857599173304, + "learning_rate": 9.677833908247881e-05, + "loss": 3.2117, + "step": 9830 + }, + { + "epoch": 0.6102799677199081, + "grad_norm": 0.2645714003707979, + "learning_rate": 9.677706355133016e-05, + "loss": 3.2567, + "step": 9831 + }, + { + "epoch": 0.610342044819666, + "grad_norm": 0.21636201874729422, + "learning_rate": 9.677578777613422e-05, + "loss": 3.1607, + "step": 9832 + }, + { + "epoch": 0.610404121919424, + "grad_norm": 0.2830237906021509, + "learning_rate": 9.677451175689763e-05, + "loss": 3.253, + "step": 9833 + }, + { + "epoch": 0.6104661990191819, + "grad_norm": 0.27878462225750844, + "learning_rate": 9.677323549362707e-05, + "loss": 3.1253, + "step": 9834 + }, + { + "epoch": 0.6105282761189397, + "grad_norm": 0.20337600971175257, + "learning_rate": 9.677195898632918e-05, + "loss": 3.2024, + "step": 9835 + }, + { + "epoch": 0.6105903532186976, + "grad_norm": 0.22440474767252444, + "learning_rate": 9.677068223501063e-05, + "loss": 3.1519, + "step": 9836 + }, + { + "epoch": 0.6106524303184555, + "grad_norm": 0.23831756502677084, + "learning_rate": 9.676940523967806e-05, + "loss": 3.0969, + "step": 9837 + }, + { + "epoch": 0.6107145074182134, + "grad_norm": 0.2497680825144477, + "learning_rate": 9.676812800033816e-05, + "loss": 3.1772, + "step": 9838 + }, + { + "epoch": 0.6107765845179713, + "grad_norm": 0.3069271765440735, + "learning_rate": 9.676685051699758e-05, + "loss": 3.1518, + "step": 9839 + }, + { + "epoch": 0.6108386616177293, + "grad_norm": 0.3274833104016061, + "learning_rate": 9.6765572789663e-05, + "loss": 3.1171, + "step": 9840 + }, + { + "epoch": 0.6109007387174871, + "grad_norm": 0.23986178091160976, + "learning_rate": 9.676429481834106e-05, + "loss": 3.0986, + "step": 9841 + }, + { + "epoch": 0.610962815817245, + "grad_norm": 0.20988036982997926, + "learning_rate": 9.676301660303846e-05, + "loss": 3.124, + "step": 9842 + }, + { + "epoch": 0.6110248929170029, + "grad_norm": 0.2135747974947818, + "learning_rate": 9.676173814376184e-05, + "loss": 3.2183, + "step": 9843 + }, + { + "epoch": 0.6110869700167608, + "grad_norm": 0.2518054470221114, + "learning_rate": 9.676045944051787e-05, + "loss": 3.2198, + "step": 9844 + }, + { + "epoch": 0.6111490471165187, + "grad_norm": 0.30376054904421584, + "learning_rate": 9.675918049331323e-05, + "loss": 3.2329, + "step": 9845 + }, + { + "epoch": 0.6112111242162767, + "grad_norm": 0.1970636069874352, + "learning_rate": 9.675790130215461e-05, + "loss": 3.13, + "step": 9846 + }, + { + "epoch": 0.6112732013160345, + "grad_norm": 0.21700546608579993, + "learning_rate": 9.675662186704865e-05, + "loss": 3.3145, + "step": 9847 + }, + { + "epoch": 0.6113352784157924, + "grad_norm": 0.23280954487096392, + "learning_rate": 9.675534218800206e-05, + "loss": 3.1758, + "step": 9848 + }, + { + "epoch": 0.6113973555155503, + "grad_norm": 0.2554448533620318, + "learning_rate": 9.675406226502151e-05, + "loss": 3.1625, + "step": 9849 + }, + { + "epoch": 0.6114594326153082, + "grad_norm": 0.2213269001865934, + "learning_rate": 9.675278209811365e-05, + "loss": 3.1393, + "step": 9850 + }, + { + "epoch": 0.6115215097150661, + "grad_norm": 0.2532918391994017, + "learning_rate": 9.675150168728518e-05, + "loss": 3.0356, + "step": 9851 + }, + { + "epoch": 0.611583586814824, + "grad_norm": 0.2674647738534117, + "learning_rate": 9.675022103254277e-05, + "loss": 3.2028, + "step": 9852 + }, + { + "epoch": 0.6116456639145819, + "grad_norm": 0.21936608308618508, + "learning_rate": 9.67489401338931e-05, + "loss": 3.1134, + "step": 9853 + }, + { + "epoch": 0.6117077410143398, + "grad_norm": 0.22201522774707458, + "learning_rate": 9.674765899134289e-05, + "loss": 3.1488, + "step": 9854 + }, + { + "epoch": 0.6117698181140977, + "grad_norm": 0.3132532459214161, + "learning_rate": 9.674637760489878e-05, + "loss": 3.2044, + "step": 9855 + }, + { + "epoch": 0.6118318952138556, + "grad_norm": 0.17835484658105621, + "learning_rate": 9.674509597456746e-05, + "loss": 3.2419, + "step": 9856 + }, + { + "epoch": 0.6118939723136135, + "grad_norm": 0.21769284352193252, + "learning_rate": 9.674381410035563e-05, + "loss": 3.1373, + "step": 9857 + }, + { + "epoch": 0.6119560494133715, + "grad_norm": 0.2333641026991457, + "learning_rate": 9.674253198226998e-05, + "loss": 3.1459, + "step": 9858 + }, + { + "epoch": 0.6120181265131293, + "grad_norm": 0.2287301010716509, + "learning_rate": 9.674124962031719e-05, + "loss": 3.2051, + "step": 9859 + }, + { + "epoch": 0.6120802036128872, + "grad_norm": 0.20273155842553653, + "learning_rate": 9.673996701450396e-05, + "loss": 3.1124, + "step": 9860 + }, + { + "epoch": 0.6121422807126451, + "grad_norm": 0.3709294808861879, + "learning_rate": 9.673868416483697e-05, + "loss": 3.2331, + "step": 9861 + }, + { + "epoch": 0.612204357812403, + "grad_norm": 0.20314080837063375, + "learning_rate": 9.673740107132291e-05, + "loss": 3.1583, + "step": 9862 + }, + { + "epoch": 0.6122664349121609, + "grad_norm": 0.2827252532676529, + "learning_rate": 9.673611773396849e-05, + "loss": 3.2471, + "step": 9863 + }, + { + "epoch": 0.6123285120119188, + "grad_norm": 0.28543722340883837, + "learning_rate": 9.67348341527804e-05, + "loss": 3.2035, + "step": 9864 + }, + { + "epoch": 0.6123905891116767, + "grad_norm": 0.4940185546875, + "learning_rate": 9.673355032776534e-05, + "loss": 3.1719, + "step": 9865 + }, + { + "epoch": 0.6124526662114346, + "grad_norm": 0.25115598210055523, + "learning_rate": 9.673226625893e-05, + "loss": 3.0319, + "step": 9866 + }, + { + "epoch": 0.6125147433111925, + "grad_norm": 0.4298038498536535, + "learning_rate": 9.673098194628109e-05, + "loss": 3.204, + "step": 9867 + }, + { + "epoch": 0.6125768204109504, + "grad_norm": 0.3242674986258032, + "learning_rate": 9.672969738982529e-05, + "loss": 3.2305, + "step": 9868 + }, + { + "epoch": 0.6126388975107083, + "grad_norm": 0.25239330844400804, + "learning_rate": 9.672841258956932e-05, + "loss": 3.2044, + "step": 9869 + }, + { + "epoch": 0.6127009746104662, + "grad_norm": 0.2977300177049867, + "learning_rate": 9.672712754551987e-05, + "loss": 3.278, + "step": 9870 + }, + { + "epoch": 0.612763051710224, + "grad_norm": 0.2647095219121935, + "learning_rate": 9.672584225768366e-05, + "loss": 3.228, + "step": 9871 + }, + { + "epoch": 0.612825128809982, + "grad_norm": 0.3422580332764147, + "learning_rate": 9.672455672606738e-05, + "loss": 3.0515, + "step": 9872 + }, + { + "epoch": 0.6128872059097399, + "grad_norm": 0.29540449405004215, + "learning_rate": 9.672327095067776e-05, + "loss": 3.0905, + "step": 9873 + }, + { + "epoch": 0.6129492830094978, + "grad_norm": 0.26759214434130296, + "learning_rate": 9.672198493152148e-05, + "loss": 3.1696, + "step": 9874 + }, + { + "epoch": 0.6130113601092557, + "grad_norm": 0.2602634106129791, + "learning_rate": 9.672069866860527e-05, + "loss": 3.1648, + "step": 9875 + }, + { + "epoch": 0.6130734372090136, + "grad_norm": 0.225810364558694, + "learning_rate": 9.671941216193583e-05, + "loss": 3.2231, + "step": 9876 + }, + { + "epoch": 0.6131355143087714, + "grad_norm": 0.2504657489837179, + "learning_rate": 9.671812541151988e-05, + "loss": 3.1722, + "step": 9877 + }, + { + "epoch": 0.6131975914085294, + "grad_norm": 0.24839640943332492, + "learning_rate": 9.671683841736411e-05, + "loss": 3.1827, + "step": 9878 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.2696341442578947, + "learning_rate": 9.671555117947527e-05, + "loss": 3.198, + "step": 9879 + }, + { + "epoch": 0.6133217456080452, + "grad_norm": 0.19869629287222884, + "learning_rate": 9.671426369786005e-05, + "loss": 3.2539, + "step": 9880 + }, + { + "epoch": 0.6133838227078031, + "grad_norm": 0.29583256054271356, + "learning_rate": 9.671297597252519e-05, + "loss": 3.2216, + "step": 9881 + }, + { + "epoch": 0.613445899807561, + "grad_norm": 0.278400904715578, + "learning_rate": 9.671168800347738e-05, + "loss": 3.2462, + "step": 9882 + }, + { + "epoch": 0.6135079769073188, + "grad_norm": 0.31130854450524365, + "learning_rate": 9.671039979072335e-05, + "loss": 3.1956, + "step": 9883 + }, + { + "epoch": 0.6135700540070768, + "grad_norm": 0.22321976075950928, + "learning_rate": 9.670911133426986e-05, + "loss": 3.2327, + "step": 9884 + }, + { + "epoch": 0.6136321311068347, + "grad_norm": 0.2464714860039984, + "learning_rate": 9.670782263412356e-05, + "loss": 3.1974, + "step": 9885 + }, + { + "epoch": 0.6136942082065926, + "grad_norm": 0.2871315876159789, + "learning_rate": 9.670653369029121e-05, + "loss": 3.1602, + "step": 9886 + }, + { + "epoch": 0.6137562853063505, + "grad_norm": 0.23406287708939658, + "learning_rate": 9.670524450277956e-05, + "loss": 3.2358, + "step": 9887 + }, + { + "epoch": 0.6138183624061084, + "grad_norm": 0.2533527797578097, + "learning_rate": 9.67039550715953e-05, + "loss": 3.1563, + "step": 9888 + }, + { + "epoch": 0.6138804395058662, + "grad_norm": 0.24019151010572945, + "learning_rate": 9.670266539674516e-05, + "loss": 3.1571, + "step": 9889 + }, + { + "epoch": 0.6139425166056242, + "grad_norm": 0.20746921602516563, + "learning_rate": 9.670137547823589e-05, + "loss": 3.2424, + "step": 9890 + }, + { + "epoch": 0.6140045937053821, + "grad_norm": 0.2144853637171197, + "learning_rate": 9.67000853160742e-05, + "loss": 3.1831, + "step": 9891 + }, + { + "epoch": 0.61406667080514, + "grad_norm": 0.2429508592084218, + "learning_rate": 9.669879491026683e-05, + "loss": 3.2469, + "step": 9892 + }, + { + "epoch": 0.6141287479048979, + "grad_norm": 0.3574511823468404, + "learning_rate": 9.669750426082052e-05, + "loss": 3.2057, + "step": 9893 + }, + { + "epoch": 0.6141908250046558, + "grad_norm": 0.20723251240751867, + "learning_rate": 9.669621336774197e-05, + "loss": 3.1637, + "step": 9894 + }, + { + "epoch": 0.6142529021044136, + "grad_norm": 0.2729019506262793, + "learning_rate": 9.669492223103795e-05, + "loss": 3.18, + "step": 9895 + }, + { + "epoch": 0.6143149792041716, + "grad_norm": 0.22902707794018398, + "learning_rate": 9.669363085071519e-05, + "loss": 3.1669, + "step": 9896 + }, + { + "epoch": 0.6143770563039295, + "grad_norm": 0.21163620729788934, + "learning_rate": 9.669233922678043e-05, + "loss": 3.1888, + "step": 9897 + }, + { + "epoch": 0.6144391334036874, + "grad_norm": 0.1889238633462994, + "learning_rate": 9.669104735924039e-05, + "loss": 3.123, + "step": 9898 + }, + { + "epoch": 0.6145012105034453, + "grad_norm": 0.24861823381592701, + "learning_rate": 9.668975524810182e-05, + "loss": 3.0929, + "step": 9899 + }, + { + "epoch": 0.6145632876032032, + "grad_norm": 0.28900027249663285, + "learning_rate": 9.668846289337147e-05, + "loss": 3.181, + "step": 9900 + }, + { + "epoch": 0.614625364702961, + "grad_norm": 0.22746987989700285, + "learning_rate": 9.668717029505607e-05, + "loss": 3.238, + "step": 9901 + }, + { + "epoch": 0.614687441802719, + "grad_norm": 0.20767356989772126, + "learning_rate": 9.668587745316238e-05, + "loss": 3.1329, + "step": 9902 + }, + { + "epoch": 0.6147495189024769, + "grad_norm": 0.2319309836792899, + "learning_rate": 9.668458436769712e-05, + "loss": 3.2557, + "step": 9903 + }, + { + "epoch": 0.6148115960022348, + "grad_norm": 0.2050392295646314, + "learning_rate": 9.668329103866704e-05, + "loss": 3.1337, + "step": 9904 + }, + { + "epoch": 0.6148736731019927, + "grad_norm": 0.1940044662450606, + "learning_rate": 9.668199746607892e-05, + "loss": 3.1769, + "step": 9905 + }, + { + "epoch": 0.6149357502017506, + "grad_norm": 0.2948975584603347, + "learning_rate": 9.668070364993949e-05, + "loss": 3.2168, + "step": 9906 + }, + { + "epoch": 0.6149978273015084, + "grad_norm": 0.22671163517562545, + "learning_rate": 9.667940959025547e-05, + "loss": 3.2034, + "step": 9907 + }, + { + "epoch": 0.6150599044012663, + "grad_norm": 0.30681575949191114, + "learning_rate": 9.667811528703366e-05, + "loss": 3.2539, + "step": 9908 + }, + { + "epoch": 0.6151219815010243, + "grad_norm": 0.21338488572600622, + "learning_rate": 9.667682074028079e-05, + "loss": 3.1766, + "step": 9909 + }, + { + "epoch": 0.6151840586007822, + "grad_norm": 0.21925773619244904, + "learning_rate": 9.667552595000359e-05, + "loss": 3.0586, + "step": 9910 + }, + { + "epoch": 0.6152461357005401, + "grad_norm": 0.17953545938110457, + "learning_rate": 9.667423091620887e-05, + "loss": 3.2735, + "step": 9911 + }, + { + "epoch": 0.615308212800298, + "grad_norm": 0.31297771180111056, + "learning_rate": 9.667293563890335e-05, + "loss": 3.2576, + "step": 9912 + }, + { + "epoch": 0.6153702899000558, + "grad_norm": 0.24992195043535873, + "learning_rate": 9.66716401180938e-05, + "loss": 3.2214, + "step": 9913 + }, + { + "epoch": 0.6154323669998137, + "grad_norm": 0.43931455334758657, + "learning_rate": 9.667034435378696e-05, + "loss": 3.2317, + "step": 9914 + }, + { + "epoch": 0.6154944440995717, + "grad_norm": 0.2925943525471707, + "learning_rate": 9.66690483459896e-05, + "loss": 3.1186, + "step": 9915 + }, + { + "epoch": 0.6155565211993296, + "grad_norm": 0.3191285933020696, + "learning_rate": 9.666775209470851e-05, + "loss": 3.3077, + "step": 9916 + }, + { + "epoch": 0.6156185982990875, + "grad_norm": 0.3267408992496673, + "learning_rate": 9.66664555999504e-05, + "loss": 3.1899, + "step": 9917 + }, + { + "epoch": 0.6156806753988454, + "grad_norm": 0.2630174781409059, + "learning_rate": 9.666515886172209e-05, + "loss": 3.1283, + "step": 9918 + }, + { + "epoch": 0.6157427524986032, + "grad_norm": 0.30245295334113625, + "learning_rate": 9.66638618800303e-05, + "loss": 3.1818, + "step": 9919 + }, + { + "epoch": 0.6158048295983611, + "grad_norm": 0.2488294032496702, + "learning_rate": 9.666256465488182e-05, + "loss": 3.1818, + "step": 9920 + }, + { + "epoch": 0.615866906698119, + "grad_norm": 0.2605874963575652, + "learning_rate": 9.66612671862834e-05, + "loss": 3.1671, + "step": 9921 + }, + { + "epoch": 0.615928983797877, + "grad_norm": 0.28095180545066556, + "learning_rate": 9.665996947424184e-05, + "loss": 3.1554, + "step": 9922 + }, + { + "epoch": 0.6159910608976349, + "grad_norm": 0.24285097917029885, + "learning_rate": 9.665867151876388e-05, + "loss": 3.1696, + "step": 9923 + }, + { + "epoch": 0.6160531379973928, + "grad_norm": 0.27820492410439607, + "learning_rate": 9.66573733198563e-05, + "loss": 3.1057, + "step": 9924 + }, + { + "epoch": 0.6161152150971506, + "grad_norm": 0.2656288146698742, + "learning_rate": 9.665607487752588e-05, + "loss": 3.2172, + "step": 9925 + }, + { + "epoch": 0.6161772921969085, + "grad_norm": 0.3047312191628298, + "learning_rate": 9.66547761917794e-05, + "loss": 3.2398, + "step": 9926 + }, + { + "epoch": 0.6162393692966665, + "grad_norm": 0.21910045854056068, + "learning_rate": 9.665347726262363e-05, + "loss": 3.263, + "step": 9927 + }, + { + "epoch": 0.6163014463964244, + "grad_norm": 0.20733981260285325, + "learning_rate": 9.665217809006533e-05, + "loss": 3.1544, + "step": 9928 + }, + { + "epoch": 0.6163635234961823, + "grad_norm": 0.41809825138811213, + "learning_rate": 9.66508786741113e-05, + "loss": 3.2804, + "step": 9929 + }, + { + "epoch": 0.6164256005959402, + "grad_norm": 0.2667506992041336, + "learning_rate": 9.66495790147683e-05, + "loss": 3.1084, + "step": 9930 + }, + { + "epoch": 0.616487677695698, + "grad_norm": 0.20424585854208202, + "learning_rate": 9.664827911204314e-05, + "loss": 3.2304, + "step": 9931 + }, + { + "epoch": 0.6165497547954559, + "grad_norm": 0.1895730042438426, + "learning_rate": 9.664697896594257e-05, + "loss": 3.1527, + "step": 9932 + }, + { + "epoch": 0.6166118318952138, + "grad_norm": 0.19631104796916155, + "learning_rate": 9.66456785764734e-05, + "loss": 3.1371, + "step": 9933 + }, + { + "epoch": 0.6166739089949718, + "grad_norm": 0.25784506736588053, + "learning_rate": 9.664437794364238e-05, + "loss": 3.1863, + "step": 9934 + }, + { + "epoch": 0.6167359860947297, + "grad_norm": 0.25894848213782146, + "learning_rate": 9.664307706745634e-05, + "loss": 3.2217, + "step": 9935 + }, + { + "epoch": 0.6167980631944876, + "grad_norm": 0.19805801932074776, + "learning_rate": 9.664177594792203e-05, + "loss": 3.2978, + "step": 9936 + }, + { + "epoch": 0.6168601402942454, + "grad_norm": 0.2998768349981355, + "learning_rate": 9.664047458504625e-05, + "loss": 3.2181, + "step": 9937 + }, + { + "epoch": 0.6169222173940033, + "grad_norm": 0.22887084120727882, + "learning_rate": 9.66391729788358e-05, + "loss": 3.2055, + "step": 9938 + }, + { + "epoch": 0.6169842944937612, + "grad_norm": 0.24523817928944805, + "learning_rate": 9.663787112929746e-05, + "loss": 3.1398, + "step": 9939 + }, + { + "epoch": 0.6170463715935192, + "grad_norm": 0.2560497478413099, + "learning_rate": 9.663656903643803e-05, + "loss": 3.1738, + "step": 9940 + }, + { + "epoch": 0.6171084486932771, + "grad_norm": 0.24212438006954365, + "learning_rate": 9.663526670026429e-05, + "loss": 3.2732, + "step": 9941 + }, + { + "epoch": 0.617170525793035, + "grad_norm": 0.2710988030526133, + "learning_rate": 9.663396412078305e-05, + "loss": 3.1541, + "step": 9942 + }, + { + "epoch": 0.6172326028927928, + "grad_norm": 0.21617914808671362, + "learning_rate": 9.663266129800109e-05, + "loss": 3.1259, + "step": 9943 + }, + { + "epoch": 0.6172946799925507, + "grad_norm": 0.2585797020821836, + "learning_rate": 9.663135823192522e-05, + "loss": 3.2559, + "step": 9944 + }, + { + "epoch": 0.6173567570923086, + "grad_norm": 0.20067744001991644, + "learning_rate": 9.663005492256223e-05, + "loss": 3.1387, + "step": 9945 + }, + { + "epoch": 0.6174188341920666, + "grad_norm": 0.269346934336786, + "learning_rate": 9.662875136991893e-05, + "loss": 3.1458, + "step": 9946 + }, + { + "epoch": 0.6174809112918245, + "grad_norm": 0.29837914939970844, + "learning_rate": 9.66274475740021e-05, + "loss": 3.063, + "step": 9947 + }, + { + "epoch": 0.6175429883915824, + "grad_norm": 0.21753460570428082, + "learning_rate": 9.662614353481859e-05, + "loss": 3.0827, + "step": 9948 + }, + { + "epoch": 0.6176050654913402, + "grad_norm": 0.20292617163306395, + "learning_rate": 9.662483925237513e-05, + "loss": 3.1707, + "step": 9949 + }, + { + "epoch": 0.6176671425910981, + "grad_norm": 0.21625101257373808, + "learning_rate": 9.662353472667859e-05, + "loss": 3.2033, + "step": 9950 + }, + { + "epoch": 0.617729219690856, + "grad_norm": 0.22375379230173176, + "learning_rate": 9.662222995773574e-05, + "loss": 3.2757, + "step": 9951 + }, + { + "epoch": 0.617791296790614, + "grad_norm": 0.18555312899038262, + "learning_rate": 9.662092494555341e-05, + "loss": 3.1625, + "step": 9952 + }, + { + "epoch": 0.6178533738903719, + "grad_norm": 0.18404999309981254, + "learning_rate": 9.661961969013839e-05, + "loss": 3.2164, + "step": 9953 + }, + { + "epoch": 0.6179154509901298, + "grad_norm": 0.20027948796340436, + "learning_rate": 9.661831419149749e-05, + "loss": 3.1094, + "step": 9954 + }, + { + "epoch": 0.6179775280898876, + "grad_norm": 0.20914542151712434, + "learning_rate": 9.661700844963754e-05, + "loss": 3.1549, + "step": 9955 + }, + { + "epoch": 0.6180396051896455, + "grad_norm": 0.22049618192803389, + "learning_rate": 9.661570246456533e-05, + "loss": 3.1626, + "step": 9956 + }, + { + "epoch": 0.6181016822894034, + "grad_norm": 0.21727021412424827, + "learning_rate": 9.661439623628768e-05, + "loss": 3.1813, + "step": 9957 + }, + { + "epoch": 0.6181637593891613, + "grad_norm": 0.2675353663452787, + "learning_rate": 9.661308976481142e-05, + "loss": 3.2054, + "step": 9958 + }, + { + "epoch": 0.6182258364889193, + "grad_norm": 0.2057476105852541, + "learning_rate": 9.661178305014335e-05, + "loss": 3.0954, + "step": 9959 + }, + { + "epoch": 0.6182879135886772, + "grad_norm": 0.20334819305722499, + "learning_rate": 9.66104760922903e-05, + "loss": 3.1206, + "step": 9960 + }, + { + "epoch": 0.618349990688435, + "grad_norm": 0.21685450020022112, + "learning_rate": 9.660916889125907e-05, + "loss": 3.155, + "step": 9961 + }, + { + "epoch": 0.6184120677881929, + "grad_norm": 0.23715903282216744, + "learning_rate": 9.66078614470565e-05, + "loss": 3.1898, + "step": 9962 + }, + { + "epoch": 0.6184741448879508, + "grad_norm": 0.19043187849300472, + "learning_rate": 9.66065537596894e-05, + "loss": 3.0737, + "step": 9963 + }, + { + "epoch": 0.6185362219877087, + "grad_norm": 0.20754552341817462, + "learning_rate": 9.660524582916459e-05, + "loss": 3.2607, + "step": 9964 + }, + { + "epoch": 0.6185982990874667, + "grad_norm": 0.18179586401225092, + "learning_rate": 9.660393765548891e-05, + "loss": 3.1124, + "step": 9965 + }, + { + "epoch": 0.6186603761872246, + "grad_norm": 0.18026236131227746, + "learning_rate": 9.660262923866917e-05, + "loss": 3.1325, + "step": 9966 + }, + { + "epoch": 0.6187224532869824, + "grad_norm": 0.18361718454341888, + "learning_rate": 9.660132057871218e-05, + "loss": 3.166, + "step": 9967 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.24013822849042651, + "learning_rate": 9.660001167562481e-05, + "loss": 3.2231, + "step": 9968 + }, + { + "epoch": 0.6188466074864982, + "grad_norm": 0.2481114311015351, + "learning_rate": 9.659870252941386e-05, + "loss": 3.1425, + "step": 9969 + }, + { + "epoch": 0.6189086845862561, + "grad_norm": 0.36886838935996397, + "learning_rate": 9.659739314008618e-05, + "loss": 3.2285, + "step": 9970 + }, + { + "epoch": 0.6189707616860141, + "grad_norm": 0.19566885863290248, + "learning_rate": 9.659608350764857e-05, + "loss": 3.1196, + "step": 9971 + }, + { + "epoch": 0.619032838785772, + "grad_norm": 0.25469115787055835, + "learning_rate": 9.659477363210788e-05, + "loss": 3.1871, + "step": 9972 + }, + { + "epoch": 0.6190949158855298, + "grad_norm": 0.23846784696031825, + "learning_rate": 9.659346351347096e-05, + "loss": 3.1974, + "step": 9973 + }, + { + "epoch": 0.6191569929852877, + "grad_norm": 0.29633369788539704, + "learning_rate": 9.659215315174461e-05, + "loss": 3.0673, + "step": 9974 + }, + { + "epoch": 0.6192190700850456, + "grad_norm": 0.21175006374473018, + "learning_rate": 9.65908425469357e-05, + "loss": 3.0501, + "step": 9975 + }, + { + "epoch": 0.6192811471848035, + "grad_norm": 0.3535068574102686, + "learning_rate": 9.658953169905105e-05, + "loss": 3.1118, + "step": 9976 + }, + { + "epoch": 0.6193432242845615, + "grad_norm": 0.3442580413557889, + "learning_rate": 9.658822060809751e-05, + "loss": 3.2469, + "step": 9977 + }, + { + "epoch": 0.6194053013843194, + "grad_norm": 0.2982912663995585, + "learning_rate": 9.65869092740819e-05, + "loss": 3.2646, + "step": 9978 + }, + { + "epoch": 0.6194673784840772, + "grad_norm": 0.3273099358330562, + "learning_rate": 9.658559769701109e-05, + "loss": 3.2858, + "step": 9979 + }, + { + "epoch": 0.6195294555838351, + "grad_norm": 0.3162456269583835, + "learning_rate": 9.65842858768919e-05, + "loss": 3.1154, + "step": 9980 + }, + { + "epoch": 0.619591532683593, + "grad_norm": 0.27126767408564845, + "learning_rate": 9.658297381373118e-05, + "loss": 3.287, + "step": 9981 + }, + { + "epoch": 0.6196536097833509, + "grad_norm": 0.4375370214329702, + "learning_rate": 9.658166150753578e-05, + "loss": 3.2123, + "step": 9982 + }, + { + "epoch": 0.6197156868831089, + "grad_norm": 0.2746750483115551, + "learning_rate": 9.658034895831254e-05, + "loss": 3.142, + "step": 9983 + }, + { + "epoch": 0.6197777639828668, + "grad_norm": 0.28599839421968565, + "learning_rate": 9.657903616606832e-05, + "loss": 3.2279, + "step": 9984 + }, + { + "epoch": 0.6198398410826246, + "grad_norm": 0.2431341635465263, + "learning_rate": 9.657772313080995e-05, + "loss": 3.1911, + "step": 9985 + }, + { + "epoch": 0.6199019181823825, + "grad_norm": 0.3123174730347357, + "learning_rate": 9.65764098525443e-05, + "loss": 3.2295, + "step": 9986 + }, + { + "epoch": 0.6199639952821404, + "grad_norm": 0.2687534498392513, + "learning_rate": 9.657509633127822e-05, + "loss": 3.2328, + "step": 9987 + }, + { + "epoch": 0.6200260723818983, + "grad_norm": 0.24831663316512498, + "learning_rate": 9.657378256701855e-05, + "loss": 3.1244, + "step": 9988 + }, + { + "epoch": 0.6200881494816562, + "grad_norm": 0.40919327087434887, + "learning_rate": 9.657246855977213e-05, + "loss": 3.1502, + "step": 9989 + }, + { + "epoch": 0.6201502265814142, + "grad_norm": 0.24165176542287606, + "learning_rate": 9.657115430954586e-05, + "loss": 3.1896, + "step": 9990 + }, + { + "epoch": 0.620212303681172, + "grad_norm": 0.26751609417370825, + "learning_rate": 9.656983981634657e-05, + "loss": 3.2319, + "step": 9991 + }, + { + "epoch": 0.6202743807809299, + "grad_norm": 0.29905688187873786, + "learning_rate": 9.65685250801811e-05, + "loss": 3.2298, + "step": 9992 + }, + { + "epoch": 0.6203364578806878, + "grad_norm": 0.23202522475514614, + "learning_rate": 9.656721010105634e-05, + "loss": 3.1713, + "step": 9993 + }, + { + "epoch": 0.6203985349804457, + "grad_norm": 0.29023013259436337, + "learning_rate": 9.656589487897915e-05, + "loss": 3.1393, + "step": 9994 + }, + { + "epoch": 0.6204606120802036, + "grad_norm": 0.25628719525092347, + "learning_rate": 9.656457941395638e-05, + "loss": 3.0981, + "step": 9995 + }, + { + "epoch": 0.6205226891799616, + "grad_norm": 0.27206049652904285, + "learning_rate": 9.656326370599488e-05, + "loss": 3.161, + "step": 9996 + }, + { + "epoch": 0.6205847662797194, + "grad_norm": 0.32571473894940717, + "learning_rate": 9.656194775510156e-05, + "loss": 3.2439, + "step": 9997 + }, + { + "epoch": 0.6206468433794773, + "grad_norm": 0.37204137574458374, + "learning_rate": 9.656063156128323e-05, + "loss": 3.1791, + "step": 9998 + }, + { + "epoch": 0.6207089204792352, + "grad_norm": 0.3217428880689505, + "learning_rate": 9.655931512454679e-05, + "loss": 3.1694, + "step": 9999 + }, + { + "epoch": 0.6207709975789931, + "grad_norm": 0.3202843653720644, + "learning_rate": 9.65579984448991e-05, + "loss": 3.22, + "step": 10000 + }, + { + "epoch": 0.620833074678751, + "grad_norm": 0.31512634997927963, + "learning_rate": 9.655668152234702e-05, + "loss": 3.1857, + "step": 10001 + }, + { + "epoch": 0.620895151778509, + "grad_norm": 0.25295792057544264, + "learning_rate": 9.655536435689745e-05, + "loss": 3.1228, + "step": 10002 + }, + { + "epoch": 0.6209572288782668, + "grad_norm": 0.29451124391480193, + "learning_rate": 9.655404694855724e-05, + "loss": 3.1423, + "step": 10003 + }, + { + "epoch": 0.6210193059780247, + "grad_norm": 0.3554068500993714, + "learning_rate": 9.655272929733325e-05, + "loss": 3.203, + "step": 10004 + }, + { + "epoch": 0.6210813830777826, + "grad_norm": 0.22648778867435254, + "learning_rate": 9.655141140323238e-05, + "loss": 3.2296, + "step": 10005 + }, + { + "epoch": 0.6211434601775405, + "grad_norm": 0.24633323279401545, + "learning_rate": 9.65500932662615e-05, + "loss": 3.2025, + "step": 10006 + }, + { + "epoch": 0.6212055372772984, + "grad_norm": 0.24966326028949407, + "learning_rate": 9.654877488642747e-05, + "loss": 3.2807, + "step": 10007 + }, + { + "epoch": 0.6212676143770564, + "grad_norm": 0.36551198761481635, + "learning_rate": 9.65474562637372e-05, + "loss": 3.1862, + "step": 10008 + }, + { + "epoch": 0.6213296914768142, + "grad_norm": 0.25555322982644313, + "learning_rate": 9.654613739819754e-05, + "loss": 3.2127, + "step": 10009 + }, + { + "epoch": 0.6213917685765721, + "grad_norm": 0.29363967365806415, + "learning_rate": 9.654481828981536e-05, + "loss": 3.1328, + "step": 10010 + }, + { + "epoch": 0.62145384567633, + "grad_norm": 0.22844983645525344, + "learning_rate": 9.65434989385976e-05, + "loss": 3.1209, + "step": 10011 + }, + { + "epoch": 0.6215159227760879, + "grad_norm": 0.1922784415681166, + "learning_rate": 9.654217934455109e-05, + "loss": 3.0375, + "step": 10012 + }, + { + "epoch": 0.6215779998758458, + "grad_norm": 0.29276539420594005, + "learning_rate": 9.654085950768273e-05, + "loss": 3.2152, + "step": 10013 + }, + { + "epoch": 0.6216400769756038, + "grad_norm": 0.2781011019308363, + "learning_rate": 9.653953942799941e-05, + "loss": 3.2395, + "step": 10014 + }, + { + "epoch": 0.6217021540753616, + "grad_norm": 0.493583591401325, + "learning_rate": 9.653821910550802e-05, + "loss": 3.2299, + "step": 10015 + }, + { + "epoch": 0.6217642311751195, + "grad_norm": 0.22177423653888845, + "learning_rate": 9.653689854021543e-05, + "loss": 3.2552, + "step": 10016 + }, + { + "epoch": 0.6218263082748774, + "grad_norm": 0.241412607436432, + "learning_rate": 9.653557773212856e-05, + "loss": 3.1715, + "step": 10017 + }, + { + "epoch": 0.6218883853746353, + "grad_norm": 0.2264320639327083, + "learning_rate": 9.653425668125429e-05, + "loss": 3.2055, + "step": 10018 + }, + { + "epoch": 0.6219504624743932, + "grad_norm": 0.2801897276169267, + "learning_rate": 9.653293538759949e-05, + "loss": 3.1927, + "step": 10019 + }, + { + "epoch": 0.6220125395741511, + "grad_norm": 0.3325549203467382, + "learning_rate": 9.653161385117107e-05, + "loss": 3.1494, + "step": 10020 + }, + { + "epoch": 0.622074616673909, + "grad_norm": 0.23092027658030245, + "learning_rate": 9.653029207197592e-05, + "loss": 3.2086, + "step": 10021 + }, + { + "epoch": 0.6221366937736669, + "grad_norm": 0.2091249723850558, + "learning_rate": 9.652897005002095e-05, + "loss": 3.1822, + "step": 10022 + }, + { + "epoch": 0.6221987708734248, + "grad_norm": 0.24524880481325695, + "learning_rate": 9.652764778531304e-05, + "loss": 3.0736, + "step": 10023 + }, + { + "epoch": 0.6222608479731827, + "grad_norm": 0.2769317252818925, + "learning_rate": 9.652632527785912e-05, + "loss": 3.2587, + "step": 10024 + }, + { + "epoch": 0.6223229250729406, + "grad_norm": 0.5842730978993681, + "learning_rate": 9.652500252766604e-05, + "loss": 3.2334, + "step": 10025 + }, + { + "epoch": 0.6223850021726985, + "grad_norm": 0.21371869371047814, + "learning_rate": 9.652367953474074e-05, + "loss": 3.0734, + "step": 10026 + }, + { + "epoch": 0.6224470792724563, + "grad_norm": 0.22759079242813227, + "learning_rate": 9.65223562990901e-05, + "loss": 3.1541, + "step": 10027 + }, + { + "epoch": 0.6225091563722143, + "grad_norm": 0.1986481779713945, + "learning_rate": 9.652103282072104e-05, + "loss": 3.1148, + "step": 10028 + }, + { + "epoch": 0.6225712334719722, + "grad_norm": 0.22321400301779148, + "learning_rate": 9.651970909964047e-05, + "loss": 3.2197, + "step": 10029 + }, + { + "epoch": 0.6226333105717301, + "grad_norm": 0.230725848121688, + "learning_rate": 9.651838513585527e-05, + "loss": 3.2525, + "step": 10030 + }, + { + "epoch": 0.622695387671488, + "grad_norm": 0.32020728779144114, + "learning_rate": 9.651706092937236e-05, + "loss": 3.1374, + "step": 10031 + }, + { + "epoch": 0.6227574647712459, + "grad_norm": 0.20540482973623078, + "learning_rate": 9.651573648019866e-05, + "loss": 3.2031, + "step": 10032 + }, + { + "epoch": 0.6228195418710037, + "grad_norm": 0.24877487193656234, + "learning_rate": 9.651441178834106e-05, + "loss": 3.2196, + "step": 10033 + }, + { + "epoch": 0.6228816189707617, + "grad_norm": 0.32024916743826165, + "learning_rate": 9.651308685380648e-05, + "loss": 3.1948, + "step": 10034 + }, + { + "epoch": 0.6229436960705196, + "grad_norm": 0.29143495239364886, + "learning_rate": 9.651176167660185e-05, + "loss": 3.1722, + "step": 10035 + }, + { + "epoch": 0.6230057731702775, + "grad_norm": 0.22370722836056775, + "learning_rate": 9.651043625673406e-05, + "loss": 3.1716, + "step": 10036 + }, + { + "epoch": 0.6230678502700354, + "grad_norm": 0.23620775010242237, + "learning_rate": 9.650911059421004e-05, + "loss": 3.1624, + "step": 10037 + }, + { + "epoch": 0.6231299273697933, + "grad_norm": 0.2093349172884423, + "learning_rate": 9.650778468903669e-05, + "loss": 3.2574, + "step": 10038 + }, + { + "epoch": 0.6231920044695511, + "grad_norm": 0.1892611130307535, + "learning_rate": 9.650645854122096e-05, + "loss": 3.2021, + "step": 10039 + }, + { + "epoch": 0.6232540815693091, + "grad_norm": 0.5093921155335863, + "learning_rate": 9.650513215076972e-05, + "loss": 3.1857, + "step": 10040 + }, + { + "epoch": 0.623316158669067, + "grad_norm": 0.26407939360261795, + "learning_rate": 9.650380551768991e-05, + "loss": 3.166, + "step": 10041 + }, + { + "epoch": 0.6233782357688249, + "grad_norm": 0.2625357558103628, + "learning_rate": 9.650247864198848e-05, + "loss": 3.2576, + "step": 10042 + }, + { + "epoch": 0.6234403128685828, + "grad_norm": 0.32483225537431637, + "learning_rate": 9.650115152367232e-05, + "loss": 3.1758, + "step": 10043 + }, + { + "epoch": 0.6235023899683407, + "grad_norm": 0.28101581519393276, + "learning_rate": 9.649982416274837e-05, + "loss": 3.2108, + "step": 10044 + }, + { + "epoch": 0.6235644670680985, + "grad_norm": 0.25859304237845143, + "learning_rate": 9.649849655922355e-05, + "loss": 3.258, + "step": 10045 + }, + { + "epoch": 0.6236265441678565, + "grad_norm": 0.24116025230000093, + "learning_rate": 9.649716871310477e-05, + "loss": 3.1366, + "step": 10046 + }, + { + "epoch": 0.6236886212676144, + "grad_norm": 0.2511718969508231, + "learning_rate": 9.649584062439898e-05, + "loss": 3.0952, + "step": 10047 + }, + { + "epoch": 0.6237506983673723, + "grad_norm": 0.27893487104920256, + "learning_rate": 9.649451229311311e-05, + "loss": 3.124, + "step": 10048 + }, + { + "epoch": 0.6238127754671302, + "grad_norm": 0.22469559548799423, + "learning_rate": 9.649318371925407e-05, + "loss": 3.1848, + "step": 10049 + }, + { + "epoch": 0.6238748525668881, + "grad_norm": 0.2513318941819687, + "learning_rate": 9.649185490282882e-05, + "loss": 3.1359, + "step": 10050 + }, + { + "epoch": 0.6239369296666459, + "grad_norm": 0.2598331478385149, + "learning_rate": 9.649052584384426e-05, + "loss": 3.1671, + "step": 10051 + }, + { + "epoch": 0.6239990067664039, + "grad_norm": 0.22086691897138028, + "learning_rate": 9.648919654230735e-05, + "loss": 3.2183, + "step": 10052 + }, + { + "epoch": 0.6240610838661618, + "grad_norm": 0.20036717922775735, + "learning_rate": 9.6487866998225e-05, + "loss": 3.224, + "step": 10053 + }, + { + "epoch": 0.6241231609659197, + "grad_norm": 0.18481793016113396, + "learning_rate": 9.648653721160419e-05, + "loss": 3.1814, + "step": 10054 + }, + { + "epoch": 0.6241852380656776, + "grad_norm": 0.3821968070929867, + "learning_rate": 9.648520718245181e-05, + "loss": 3.2807, + "step": 10055 + }, + { + "epoch": 0.6242473151654355, + "grad_norm": 0.3639392021856081, + "learning_rate": 9.648387691077483e-05, + "loss": 3.235, + "step": 10056 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.22167383934070778, + "learning_rate": 9.648254639658019e-05, + "loss": 3.1409, + "step": 10057 + }, + { + "epoch": 0.6243714693649512, + "grad_norm": 0.27933215205279266, + "learning_rate": 9.64812156398748e-05, + "loss": 3.1114, + "step": 10058 + }, + { + "epoch": 0.6244335464647092, + "grad_norm": 0.37798145343805256, + "learning_rate": 9.647988464066564e-05, + "loss": 3.1703, + "step": 10059 + }, + { + "epoch": 0.6244956235644671, + "grad_norm": 0.28178245422928383, + "learning_rate": 9.647855339895961e-05, + "loss": 3.2463, + "step": 10060 + }, + { + "epoch": 0.624557700664225, + "grad_norm": 0.2067280474451193, + "learning_rate": 9.647722191476371e-05, + "loss": 3.195, + "step": 10061 + }, + { + "epoch": 0.6246197777639829, + "grad_norm": 0.2700081903928454, + "learning_rate": 9.647589018808487e-05, + "loss": 3.1983, + "step": 10062 + }, + { + "epoch": 0.6246818548637407, + "grad_norm": 0.21330113161084632, + "learning_rate": 9.647455821893001e-05, + "loss": 3.0715, + "step": 10063 + }, + { + "epoch": 0.6247439319634986, + "grad_norm": 0.3042049989105243, + "learning_rate": 9.64732260073061e-05, + "loss": 3.2767, + "step": 10064 + }, + { + "epoch": 0.6248060090632566, + "grad_norm": 0.37356311967685685, + "learning_rate": 9.647189355322008e-05, + "loss": 3.1745, + "step": 10065 + }, + { + "epoch": 0.6248680861630145, + "grad_norm": 0.20810416054280895, + "learning_rate": 9.647056085667893e-05, + "loss": 3.1184, + "step": 10066 + }, + { + "epoch": 0.6249301632627724, + "grad_norm": 0.2620817343269374, + "learning_rate": 9.646922791768958e-05, + "loss": 3.172, + "step": 10067 + }, + { + "epoch": 0.6249922403625303, + "grad_norm": 0.259199127066174, + "learning_rate": 9.646789473625897e-05, + "loss": 3.1416, + "step": 10068 + }, + { + "epoch": 0.6250543174622881, + "grad_norm": 0.2532723363005602, + "learning_rate": 9.646656131239407e-05, + "loss": 3.221, + "step": 10069 + }, + { + "epoch": 0.625116394562046, + "grad_norm": 0.2428384232030721, + "learning_rate": 9.646522764610186e-05, + "loss": 3.0846, + "step": 10070 + }, + { + "epoch": 0.625178471661804, + "grad_norm": 0.24553217467320487, + "learning_rate": 9.646389373738926e-05, + "loss": 3.1899, + "step": 10071 + }, + { + "epoch": 0.6252405487615619, + "grad_norm": 0.3814871871004003, + "learning_rate": 9.646255958626327e-05, + "loss": 3.2313, + "step": 10072 + }, + { + "epoch": 0.6253026258613198, + "grad_norm": 0.4185820741011123, + "learning_rate": 9.64612251927308e-05, + "loss": 3.158, + "step": 10073 + }, + { + "epoch": 0.6253647029610777, + "grad_norm": 0.29346789754086294, + "learning_rate": 9.645989055679886e-05, + "loss": 3.1404, + "step": 10074 + }, + { + "epoch": 0.6254267800608355, + "grad_norm": 0.2703802723901806, + "learning_rate": 9.645855567847439e-05, + "loss": 3.1757, + "step": 10075 + }, + { + "epoch": 0.6254888571605934, + "grad_norm": 0.2512014250036082, + "learning_rate": 9.645722055776436e-05, + "loss": 3.1989, + "step": 10076 + }, + { + "epoch": 0.6255509342603514, + "grad_norm": 0.32783986372876445, + "learning_rate": 9.645588519467571e-05, + "loss": 3.1812, + "step": 10077 + }, + { + "epoch": 0.6256130113601093, + "grad_norm": 0.2981589685237564, + "learning_rate": 9.645454958921545e-05, + "loss": 3.1817, + "step": 10078 + }, + { + "epoch": 0.6256750884598672, + "grad_norm": 0.20844827501164748, + "learning_rate": 9.645321374139052e-05, + "loss": 3.124, + "step": 10079 + }, + { + "epoch": 0.6257371655596251, + "grad_norm": 0.22236835539284316, + "learning_rate": 9.64518776512079e-05, + "loss": 3.1253, + "step": 10080 + }, + { + "epoch": 0.6257992426593829, + "grad_norm": 0.21789329378087785, + "learning_rate": 9.645054131867455e-05, + "loss": 3.1728, + "step": 10081 + }, + { + "epoch": 0.6258613197591408, + "grad_norm": 0.3230707111352032, + "learning_rate": 9.644920474379746e-05, + "loss": 3.1477, + "step": 10082 + }, + { + "epoch": 0.6259233968588988, + "grad_norm": 0.16449148994611335, + "learning_rate": 9.644786792658358e-05, + "loss": 3.1293, + "step": 10083 + }, + { + "epoch": 0.6259854739586567, + "grad_norm": 0.22388851755739575, + "learning_rate": 9.644653086703992e-05, + "loss": 3.1966, + "step": 10084 + }, + { + "epoch": 0.6260475510584146, + "grad_norm": 0.2753267281716967, + "learning_rate": 9.644519356517341e-05, + "loss": 3.1248, + "step": 10085 + }, + { + "epoch": 0.6261096281581725, + "grad_norm": 0.30904039822690527, + "learning_rate": 9.644385602099107e-05, + "loss": 3.2018, + "step": 10086 + }, + { + "epoch": 0.6261717052579303, + "grad_norm": 0.20411735911589696, + "learning_rate": 9.644251823449984e-05, + "loss": 3.2696, + "step": 10087 + }, + { + "epoch": 0.6262337823576882, + "grad_norm": 0.2068937762893159, + "learning_rate": 9.644118020570673e-05, + "loss": 3.1075, + "step": 10088 + }, + { + "epoch": 0.6262958594574461, + "grad_norm": 0.27389546598889203, + "learning_rate": 9.64398419346187e-05, + "loss": 3.1939, + "step": 10089 + }, + { + "epoch": 0.6263579365572041, + "grad_norm": 0.20881293742862797, + "learning_rate": 9.643850342124275e-05, + "loss": 3.2783, + "step": 10090 + }, + { + "epoch": 0.626420013656962, + "grad_norm": 0.17441622763696335, + "learning_rate": 9.643716466558584e-05, + "loss": 3.1624, + "step": 10091 + }, + { + "epoch": 0.6264820907567199, + "grad_norm": 0.20798842469603684, + "learning_rate": 9.6435825667655e-05, + "loss": 3.1065, + "step": 10092 + }, + { + "epoch": 0.6265441678564777, + "grad_norm": 0.22936512290466904, + "learning_rate": 9.643448642745714e-05, + "loss": 3.2533, + "step": 10093 + }, + { + "epoch": 0.6266062449562356, + "grad_norm": 0.20236305233121674, + "learning_rate": 9.643314694499932e-05, + "loss": 3.1697, + "step": 10094 + }, + { + "epoch": 0.6266683220559935, + "grad_norm": 0.19934393184777358, + "learning_rate": 9.643180722028851e-05, + "loss": 3.1803, + "step": 10095 + }, + { + "epoch": 0.6267303991557515, + "grad_norm": 0.35090760586899844, + "learning_rate": 9.643046725333167e-05, + "loss": 3.067, + "step": 10096 + }, + { + "epoch": 0.6267924762555094, + "grad_norm": 0.23073572923757943, + "learning_rate": 9.642912704413581e-05, + "loss": 3.2039, + "step": 10097 + }, + { + "epoch": 0.6268545533552673, + "grad_norm": 0.1879438273395809, + "learning_rate": 9.642778659270794e-05, + "loss": 3.2403, + "step": 10098 + }, + { + "epoch": 0.6269166304550251, + "grad_norm": 0.21113347733610852, + "learning_rate": 9.642644589905503e-05, + "loss": 3.1816, + "step": 10099 + }, + { + "epoch": 0.626978707554783, + "grad_norm": 0.21757957142977474, + "learning_rate": 9.642510496318407e-05, + "loss": 3.0719, + "step": 10100 + }, + { + "epoch": 0.6270407846545409, + "grad_norm": 0.25166533542169384, + "learning_rate": 9.642376378510207e-05, + "loss": 3.0047, + "step": 10101 + }, + { + "epoch": 0.6271028617542989, + "grad_norm": 0.271204206956491, + "learning_rate": 9.642242236481603e-05, + "loss": 3.1189, + "step": 10102 + }, + { + "epoch": 0.6271649388540568, + "grad_norm": 0.2543065656699451, + "learning_rate": 9.642108070233295e-05, + "loss": 3.2118, + "step": 10103 + }, + { + "epoch": 0.6272270159538147, + "grad_norm": 0.21242368744346668, + "learning_rate": 9.641973879765982e-05, + "loss": 3.2118, + "step": 10104 + }, + { + "epoch": 0.6272890930535725, + "grad_norm": 0.22883101649360618, + "learning_rate": 9.641839665080363e-05, + "loss": 3.1012, + "step": 10105 + }, + { + "epoch": 0.6273511701533304, + "grad_norm": 0.17018375276922412, + "learning_rate": 9.641705426177141e-05, + "loss": 3.1552, + "step": 10106 + }, + { + "epoch": 0.6274132472530883, + "grad_norm": 0.19529258626508794, + "learning_rate": 9.641571163057015e-05, + "loss": 3.0909, + "step": 10107 + }, + { + "epoch": 0.6274753243528463, + "grad_norm": 0.18094015933175997, + "learning_rate": 9.641436875720685e-05, + "loss": 3.1279, + "step": 10108 + }, + { + "epoch": 0.6275374014526042, + "grad_norm": 0.23458572292607088, + "learning_rate": 9.641302564168852e-05, + "loss": 3.1411, + "step": 10109 + }, + { + "epoch": 0.6275994785523621, + "grad_norm": 0.199968423436848, + "learning_rate": 9.641168228402218e-05, + "loss": 3.1973, + "step": 10110 + }, + { + "epoch": 0.6276615556521199, + "grad_norm": 0.21409438097705882, + "learning_rate": 9.641033868421481e-05, + "loss": 3.2297, + "step": 10111 + }, + { + "epoch": 0.6277236327518778, + "grad_norm": 0.2571532896007989, + "learning_rate": 9.640899484227344e-05, + "loss": 3.1722, + "step": 10112 + }, + { + "epoch": 0.6277857098516357, + "grad_norm": 0.22197205845105775, + "learning_rate": 9.640765075820508e-05, + "loss": 3.1836, + "step": 10113 + }, + { + "epoch": 0.6278477869513936, + "grad_norm": 0.24223873150307396, + "learning_rate": 9.640630643201674e-05, + "loss": 3.2335, + "step": 10114 + }, + { + "epoch": 0.6279098640511516, + "grad_norm": 0.18942679389370012, + "learning_rate": 9.640496186371543e-05, + "loss": 3.197, + "step": 10115 + }, + { + "epoch": 0.6279719411509094, + "grad_norm": 0.22909547321263132, + "learning_rate": 9.640361705330818e-05, + "loss": 3.1885, + "step": 10116 + }, + { + "epoch": 0.6280340182506673, + "grad_norm": 0.2613413780284481, + "learning_rate": 9.640227200080198e-05, + "loss": 3.0563, + "step": 10117 + }, + { + "epoch": 0.6280960953504252, + "grad_norm": 0.22714001942564552, + "learning_rate": 9.640092670620388e-05, + "loss": 3.1665, + "step": 10118 + }, + { + "epoch": 0.6281581724501831, + "grad_norm": 0.2607329291939519, + "learning_rate": 9.639958116952087e-05, + "loss": 3.0442, + "step": 10119 + }, + { + "epoch": 0.628220249549941, + "grad_norm": 0.30078443302600605, + "learning_rate": 9.639823539075998e-05, + "loss": 3.0811, + "step": 10120 + }, + { + "epoch": 0.628282326649699, + "grad_norm": 0.2322481130958587, + "learning_rate": 9.639688936992824e-05, + "loss": 3.2121, + "step": 10121 + }, + { + "epoch": 0.6283444037494568, + "grad_norm": 0.30652866627898046, + "learning_rate": 9.639554310703266e-05, + "loss": 3.2235, + "step": 10122 + }, + { + "epoch": 0.6284064808492147, + "grad_norm": 0.24284948353324684, + "learning_rate": 9.639419660208026e-05, + "loss": 3.1833, + "step": 10123 + }, + { + "epoch": 0.6284685579489726, + "grad_norm": 0.24006792054863418, + "learning_rate": 9.639284985507808e-05, + "loss": 3.0772, + "step": 10124 + }, + { + "epoch": 0.6285306350487305, + "grad_norm": 0.3005051398297893, + "learning_rate": 9.639150286603316e-05, + "loss": 3.1727, + "step": 10125 + }, + { + "epoch": 0.6285927121484884, + "grad_norm": 0.24763941903187428, + "learning_rate": 9.639015563495248e-05, + "loss": 3.1267, + "step": 10126 + }, + { + "epoch": 0.6286547892482464, + "grad_norm": 0.2513411282397057, + "learning_rate": 9.63888081618431e-05, + "loss": 3.1871, + "step": 10127 + }, + { + "epoch": 0.6287168663480042, + "grad_norm": 0.26624980539216286, + "learning_rate": 9.638746044671205e-05, + "loss": 3.1378, + "step": 10128 + }, + { + "epoch": 0.6287789434477621, + "grad_norm": 0.24514920758841516, + "learning_rate": 9.638611248956638e-05, + "loss": 3.1063, + "step": 10129 + }, + { + "epoch": 0.62884102054752, + "grad_norm": 0.3157849038963571, + "learning_rate": 9.638476429041308e-05, + "loss": 3.1728, + "step": 10130 + }, + { + "epoch": 0.6289030976472779, + "grad_norm": 0.22498783899927385, + "learning_rate": 9.638341584925919e-05, + "loss": 3.1923, + "step": 10131 + }, + { + "epoch": 0.6289651747470358, + "grad_norm": 0.22647452290566097, + "learning_rate": 9.638206716611178e-05, + "loss": 3.0646, + "step": 10132 + }, + { + "epoch": 0.6290272518467938, + "grad_norm": 0.22082315404991285, + "learning_rate": 9.638071824097786e-05, + "loss": 3.169, + "step": 10133 + }, + { + "epoch": 0.6290893289465516, + "grad_norm": 0.2425582416881117, + "learning_rate": 9.637936907386448e-05, + "loss": 3.2406, + "step": 10134 + }, + { + "epoch": 0.6291514060463095, + "grad_norm": 0.23160952766195708, + "learning_rate": 9.637801966477867e-05, + "loss": 3.205, + "step": 10135 + }, + { + "epoch": 0.6292134831460674, + "grad_norm": 0.19218484016066537, + "learning_rate": 9.637667001372747e-05, + "loss": 3.0876, + "step": 10136 + }, + { + "epoch": 0.6292755602458253, + "grad_norm": 0.2540895827194103, + "learning_rate": 9.637532012071792e-05, + "loss": 3.2126, + "step": 10137 + }, + { + "epoch": 0.6293376373455832, + "grad_norm": 0.22519091910371902, + "learning_rate": 9.637396998575707e-05, + "loss": 3.1804, + "step": 10138 + }, + { + "epoch": 0.6293997144453412, + "grad_norm": 0.1800343338189574, + "learning_rate": 9.637261960885197e-05, + "loss": 3.14, + "step": 10139 + }, + { + "epoch": 0.629461791545099, + "grad_norm": 0.21115642242684296, + "learning_rate": 9.637126899000964e-05, + "loss": 3.0911, + "step": 10140 + }, + { + "epoch": 0.6295238686448569, + "grad_norm": 0.1914446947842424, + "learning_rate": 9.636991812923716e-05, + "loss": 3.1956, + "step": 10141 + }, + { + "epoch": 0.6295859457446148, + "grad_norm": 0.18739381405977795, + "learning_rate": 9.636856702654156e-05, + "loss": 3.1095, + "step": 10142 + }, + { + "epoch": 0.6296480228443727, + "grad_norm": 0.20247892187643876, + "learning_rate": 9.636721568192988e-05, + "loss": 3.1739, + "step": 10143 + }, + { + "epoch": 0.6297100999441306, + "grad_norm": 0.1783034050270274, + "learning_rate": 9.636586409540917e-05, + "loss": 3.1832, + "step": 10144 + }, + { + "epoch": 0.6297721770438885, + "grad_norm": 0.2397208656975539, + "learning_rate": 9.636451226698651e-05, + "loss": 3.2254, + "step": 10145 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 0.2429948087498815, + "learning_rate": 9.636316019666892e-05, + "loss": 3.1994, + "step": 10146 + }, + { + "epoch": 0.6298963312434043, + "grad_norm": 0.2214393489221038, + "learning_rate": 9.636180788446348e-05, + "loss": 3.1567, + "step": 10147 + }, + { + "epoch": 0.6299584083431622, + "grad_norm": 0.19548121794495368, + "learning_rate": 9.636045533037724e-05, + "loss": 3.0708, + "step": 10148 + }, + { + "epoch": 0.6300204854429201, + "grad_norm": 0.2405247832582879, + "learning_rate": 9.635910253441723e-05, + "loss": 3.1076, + "step": 10149 + }, + { + "epoch": 0.630082562542678, + "grad_norm": 0.21489600499772354, + "learning_rate": 9.635774949659054e-05, + "loss": 3.0234, + "step": 10150 + }, + { + "epoch": 0.6301446396424359, + "grad_norm": 0.17020574513649991, + "learning_rate": 9.63563962169042e-05, + "loss": 3.163, + "step": 10151 + }, + { + "epoch": 0.6302067167421938, + "grad_norm": 0.2971684736986537, + "learning_rate": 9.63550426953653e-05, + "loss": 3.1869, + "step": 10152 + }, + { + "epoch": 0.6302687938419517, + "grad_norm": 0.19337377590921379, + "learning_rate": 9.635368893198088e-05, + "loss": 3.1419, + "step": 10153 + }, + { + "epoch": 0.6303308709417096, + "grad_norm": 0.2379978558379647, + "learning_rate": 9.635233492675804e-05, + "loss": 3.2109, + "step": 10154 + }, + { + "epoch": 0.6303929480414675, + "grad_norm": 0.28596578937524086, + "learning_rate": 9.63509806797038e-05, + "loss": 3.1872, + "step": 10155 + }, + { + "epoch": 0.6304550251412254, + "grad_norm": 0.2647799344668872, + "learning_rate": 9.634962619082523e-05, + "loss": 3.1842, + "step": 10156 + }, + { + "epoch": 0.6305171022409833, + "grad_norm": 0.2508221162197369, + "learning_rate": 9.634827146012942e-05, + "loss": 3.2092, + "step": 10157 + }, + { + "epoch": 0.6305791793407411, + "grad_norm": 0.18956967337795566, + "learning_rate": 9.634691648762342e-05, + "loss": 3.1242, + "step": 10158 + }, + { + "epoch": 0.6306412564404991, + "grad_norm": 0.23833359471692467, + "learning_rate": 9.634556127331431e-05, + "loss": 3.2509, + "step": 10159 + }, + { + "epoch": 0.630703333540257, + "grad_norm": 0.22496654473702424, + "learning_rate": 9.634420581720917e-05, + "loss": 3.1343, + "step": 10160 + }, + { + "epoch": 0.6307654106400149, + "grad_norm": 0.34725453173465853, + "learning_rate": 9.634285011931504e-05, + "loss": 3.1264, + "step": 10161 + }, + { + "epoch": 0.6308274877397728, + "grad_norm": 0.24771778172620992, + "learning_rate": 9.634149417963901e-05, + "loss": 3.1333, + "step": 10162 + }, + { + "epoch": 0.6308895648395307, + "grad_norm": 0.19076386498261028, + "learning_rate": 9.634013799818817e-05, + "loss": 3.1624, + "step": 10163 + }, + { + "epoch": 0.6309516419392885, + "grad_norm": 0.21464854822685295, + "learning_rate": 9.633878157496956e-05, + "loss": 3.2381, + "step": 10164 + }, + { + "epoch": 0.6310137190390465, + "grad_norm": 0.20725038914592178, + "learning_rate": 9.633742490999029e-05, + "loss": 3.0954, + "step": 10165 + }, + { + "epoch": 0.6310757961388044, + "grad_norm": 0.23020654245402078, + "learning_rate": 9.633606800325742e-05, + "loss": 3.1544, + "step": 10166 + }, + { + "epoch": 0.6311378732385623, + "grad_norm": 0.21757493145172466, + "learning_rate": 9.633471085477805e-05, + "loss": 3.145, + "step": 10167 + }, + { + "epoch": 0.6311999503383202, + "grad_norm": 0.1820103666089737, + "learning_rate": 9.633335346455923e-05, + "loss": 3.1331, + "step": 10168 + }, + { + "epoch": 0.6312620274380781, + "grad_norm": 0.20243227653324405, + "learning_rate": 9.633199583260806e-05, + "loss": 3.1346, + "step": 10169 + }, + { + "epoch": 0.6313241045378359, + "grad_norm": 0.18401469003174722, + "learning_rate": 9.633063795893162e-05, + "loss": 3.2157, + "step": 10170 + }, + { + "epoch": 0.6313861816375939, + "grad_norm": 0.2544504788723497, + "learning_rate": 9.632927984353701e-05, + "loss": 3.034, + "step": 10171 + }, + { + "epoch": 0.6314482587373518, + "grad_norm": 0.19338540181403835, + "learning_rate": 9.632792148643128e-05, + "loss": 3.1598, + "step": 10172 + }, + { + "epoch": 0.6315103358371097, + "grad_norm": 0.2404949047100061, + "learning_rate": 9.632656288762154e-05, + "loss": 3.1278, + "step": 10173 + }, + { + "epoch": 0.6315724129368676, + "grad_norm": 0.2271271361894561, + "learning_rate": 9.632520404711487e-05, + "loss": 3.1444, + "step": 10174 + }, + { + "epoch": 0.6316344900366255, + "grad_norm": 0.21406665192357452, + "learning_rate": 9.632384496491838e-05, + "loss": 3.1806, + "step": 10175 + }, + { + "epoch": 0.6316965671363833, + "grad_norm": 0.2104831146062041, + "learning_rate": 9.632248564103915e-05, + "loss": 3.1299, + "step": 10176 + }, + { + "epoch": 0.6317586442361413, + "grad_norm": 0.23095233743103744, + "learning_rate": 9.632112607548424e-05, + "loss": 3.1156, + "step": 10177 + }, + { + "epoch": 0.6318207213358992, + "grad_norm": 0.204028285255808, + "learning_rate": 9.63197662682608e-05, + "loss": 3.1174, + "step": 10178 + }, + { + "epoch": 0.6318827984356571, + "grad_norm": 0.1891985884240784, + "learning_rate": 9.631840621937587e-05, + "loss": 3.2102, + "step": 10179 + }, + { + "epoch": 0.631944875535415, + "grad_norm": 0.1743102085924457, + "learning_rate": 9.631704592883659e-05, + "loss": 3.0343, + "step": 10180 + }, + { + "epoch": 0.6320069526351729, + "grad_norm": 0.1877603610687244, + "learning_rate": 9.631568539665003e-05, + "loss": 3.1798, + "step": 10181 + }, + { + "epoch": 0.6320690297349307, + "grad_norm": 0.17569094563528198, + "learning_rate": 9.631432462282328e-05, + "loss": 3.2021, + "step": 10182 + }, + { + "epoch": 0.6321311068346886, + "grad_norm": 0.17957966097848052, + "learning_rate": 9.631296360736348e-05, + "loss": 3.1081, + "step": 10183 + }, + { + "epoch": 0.6321931839344466, + "grad_norm": 0.17365300540628545, + "learning_rate": 9.63116023502777e-05, + "loss": 3.1222, + "step": 10184 + }, + { + "epoch": 0.6322552610342045, + "grad_norm": 0.17069586055321415, + "learning_rate": 9.631024085157302e-05, + "loss": 3.1514, + "step": 10185 + }, + { + "epoch": 0.6323173381339624, + "grad_norm": 0.19559454580310587, + "learning_rate": 9.630887911125661e-05, + "loss": 3.1581, + "step": 10186 + }, + { + "epoch": 0.6323794152337203, + "grad_norm": 0.2203256281870928, + "learning_rate": 9.630751712933553e-05, + "loss": 3.1901, + "step": 10187 + }, + { + "epoch": 0.6324414923334781, + "grad_norm": 0.16950850246247, + "learning_rate": 9.630615490581688e-05, + "loss": 3.1415, + "step": 10188 + }, + { + "epoch": 0.632503569433236, + "grad_norm": 0.1895212365639895, + "learning_rate": 9.630479244070778e-05, + "loss": 3.2075, + "step": 10189 + }, + { + "epoch": 0.632565646532994, + "grad_norm": 0.245548590563984, + "learning_rate": 9.630342973401534e-05, + "loss": 3.1738, + "step": 10190 + }, + { + "epoch": 0.6326277236327519, + "grad_norm": 0.16897628478226223, + "learning_rate": 9.630206678574665e-05, + "loss": 3.1491, + "step": 10191 + }, + { + "epoch": 0.6326898007325098, + "grad_norm": 0.16868879435686804, + "learning_rate": 9.630070359590885e-05, + "loss": 3.1686, + "step": 10192 + }, + { + "epoch": 0.6327518778322677, + "grad_norm": 0.2206370837222756, + "learning_rate": 9.629934016450905e-05, + "loss": 3.21, + "step": 10193 + }, + { + "epoch": 0.6328139549320255, + "grad_norm": 0.15808957934868226, + "learning_rate": 9.629797649155434e-05, + "loss": 3.0886, + "step": 10194 + }, + { + "epoch": 0.6328760320317834, + "grad_norm": 0.21224523411972768, + "learning_rate": 9.629661257705187e-05, + "loss": 3.2007, + "step": 10195 + }, + { + "epoch": 0.6329381091315414, + "grad_norm": 0.1741861984394558, + "learning_rate": 9.62952484210087e-05, + "loss": 3.1555, + "step": 10196 + }, + { + "epoch": 0.6330001862312993, + "grad_norm": 0.17885492079910367, + "learning_rate": 9.629388402343201e-05, + "loss": 3.1736, + "step": 10197 + }, + { + "epoch": 0.6330622633310572, + "grad_norm": 0.19100398006711225, + "learning_rate": 9.629251938432886e-05, + "loss": 3.1795, + "step": 10198 + }, + { + "epoch": 0.6331243404308151, + "grad_norm": 0.17857321138002902, + "learning_rate": 9.629115450370642e-05, + "loss": 3.2831, + "step": 10199 + }, + { + "epoch": 0.6331864175305729, + "grad_norm": 0.1973614035266491, + "learning_rate": 9.628978938157179e-05, + "loss": 3.1909, + "step": 10200 + }, + { + "epoch": 0.6332484946303308, + "grad_norm": 0.189561714434859, + "learning_rate": 9.628842401793208e-05, + "loss": 3.185, + "step": 10201 + }, + { + "epoch": 0.6333105717300888, + "grad_norm": 0.1772883551370879, + "learning_rate": 9.628705841279441e-05, + "loss": 3.2037, + "step": 10202 + }, + { + "epoch": 0.6333726488298467, + "grad_norm": 0.1726249567214116, + "learning_rate": 9.628569256616595e-05, + "loss": 3.1688, + "step": 10203 + }, + { + "epoch": 0.6334347259296046, + "grad_norm": 0.1838076643899518, + "learning_rate": 9.62843264780538e-05, + "loss": 3.1792, + "step": 10204 + }, + { + "epoch": 0.6334968030293625, + "grad_norm": 0.16426387984311175, + "learning_rate": 9.628296014846505e-05, + "loss": 3.1666, + "step": 10205 + }, + { + "epoch": 0.6335588801291203, + "grad_norm": 0.21725773156074824, + "learning_rate": 9.628159357740688e-05, + "loss": 3.1702, + "step": 10206 + }, + { + "epoch": 0.6336209572288782, + "grad_norm": 0.21402128755673566, + "learning_rate": 9.628022676488639e-05, + "loss": 3.1005, + "step": 10207 + }, + { + "epoch": 0.6336830343286362, + "grad_norm": 0.16476024121477753, + "learning_rate": 9.627885971091075e-05, + "loss": 3.0878, + "step": 10208 + }, + { + "epoch": 0.6337451114283941, + "grad_norm": 0.17606581330931087, + "learning_rate": 9.627749241548705e-05, + "loss": 3.143, + "step": 10209 + }, + { + "epoch": 0.633807188528152, + "grad_norm": 0.3350685547315593, + "learning_rate": 9.627612487862243e-05, + "loss": 3.2163, + "step": 10210 + }, + { + "epoch": 0.6338692656279099, + "grad_norm": 0.19078558895791276, + "learning_rate": 9.627475710032404e-05, + "loss": 3.1497, + "step": 10211 + }, + { + "epoch": 0.6339313427276677, + "grad_norm": 0.3156025769983959, + "learning_rate": 9.6273389080599e-05, + "loss": 3.2154, + "step": 10212 + }, + { + "epoch": 0.6339934198274256, + "grad_norm": 0.19953222482154878, + "learning_rate": 9.627202081945445e-05, + "loss": 3.1353, + "step": 10213 + }, + { + "epoch": 0.6340554969271835, + "grad_norm": 0.20030420644733538, + "learning_rate": 9.627065231689756e-05, + "loss": 3.1448, + "step": 10214 + }, + { + "epoch": 0.6341175740269415, + "grad_norm": 0.17863679609063257, + "learning_rate": 9.626928357293542e-05, + "loss": 3.1617, + "step": 10215 + }, + { + "epoch": 0.6341796511266994, + "grad_norm": 0.2213594501463191, + "learning_rate": 9.626791458757521e-05, + "loss": 3.1409, + "step": 10216 + }, + { + "epoch": 0.6342417282264573, + "grad_norm": 0.21810113516712845, + "learning_rate": 9.626654536082405e-05, + "loss": 3.1662, + "step": 10217 + }, + { + "epoch": 0.6343038053262151, + "grad_norm": 0.18286657389512886, + "learning_rate": 9.626517589268909e-05, + "loss": 3.1317, + "step": 10218 + }, + { + "epoch": 0.634365882425973, + "grad_norm": 0.2065335445192901, + "learning_rate": 9.626380618317748e-05, + "loss": 3.1257, + "step": 10219 + }, + { + "epoch": 0.634427959525731, + "grad_norm": 0.3175363374739334, + "learning_rate": 9.626243623229636e-05, + "loss": 3.1628, + "step": 10220 + }, + { + "epoch": 0.6344900366254889, + "grad_norm": 0.2479081779851252, + "learning_rate": 9.626106604005288e-05, + "loss": 3.1401, + "step": 10221 + }, + { + "epoch": 0.6345521137252468, + "grad_norm": 0.2176842784977267, + "learning_rate": 9.62596956064542e-05, + "loss": 3.1578, + "step": 10222 + }, + { + "epoch": 0.6346141908250047, + "grad_norm": 0.23194625819915854, + "learning_rate": 9.625832493150744e-05, + "loss": 3.1418, + "step": 10223 + }, + { + "epoch": 0.6346762679247625, + "grad_norm": 0.2439614897617971, + "learning_rate": 9.625695401521978e-05, + "loss": 3.1247, + "step": 10224 + }, + { + "epoch": 0.6347383450245204, + "grad_norm": 0.2224405649450458, + "learning_rate": 9.625558285759836e-05, + "loss": 3.2033, + "step": 10225 + }, + { + "epoch": 0.6348004221242783, + "grad_norm": 0.24594028925737843, + "learning_rate": 9.625421145865033e-05, + "loss": 3.1267, + "step": 10226 + }, + { + "epoch": 0.6348624992240363, + "grad_norm": 0.25040772092542624, + "learning_rate": 9.625283981838286e-05, + "loss": 3.1934, + "step": 10227 + }, + { + "epoch": 0.6349245763237942, + "grad_norm": 0.21553112522778156, + "learning_rate": 9.625146793680309e-05, + "loss": 3.2576, + "step": 10228 + }, + { + "epoch": 0.6349866534235521, + "grad_norm": 0.2445269084581572, + "learning_rate": 9.625009581391818e-05, + "loss": 3.1245, + "step": 10229 + }, + { + "epoch": 0.6350487305233099, + "grad_norm": 0.23245434574299745, + "learning_rate": 9.62487234497353e-05, + "loss": 3.2915, + "step": 10230 + }, + { + "epoch": 0.6351108076230678, + "grad_norm": 0.18081845053247517, + "learning_rate": 9.624735084426159e-05, + "loss": 3.1847, + "step": 10231 + }, + { + "epoch": 0.6351728847228257, + "grad_norm": 0.21382611895042875, + "learning_rate": 9.624597799750424e-05, + "loss": 3.1538, + "step": 10232 + }, + { + "epoch": 0.6352349618225837, + "grad_norm": 0.21997895050676458, + "learning_rate": 9.62446049094704e-05, + "loss": 3.1958, + "step": 10233 + }, + { + "epoch": 0.6352970389223416, + "grad_norm": 0.2073783394192277, + "learning_rate": 9.624323158016721e-05, + "loss": 3.176, + "step": 10234 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.24903998226480945, + "learning_rate": 9.624185800960186e-05, + "loss": 3.1916, + "step": 10235 + }, + { + "epoch": 0.6354211931218573, + "grad_norm": 0.2568248847932946, + "learning_rate": 9.624048419778152e-05, + "loss": 3.2092, + "step": 10236 + }, + { + "epoch": 0.6354832702216152, + "grad_norm": 0.22763143146254505, + "learning_rate": 9.623911014471333e-05, + "loss": 3.1833, + "step": 10237 + }, + { + "epoch": 0.6355453473213731, + "grad_norm": 0.20879067151811917, + "learning_rate": 9.62377358504045e-05, + "loss": 3.1483, + "step": 10238 + }, + { + "epoch": 0.635607424421131, + "grad_norm": 0.20227662228612778, + "learning_rate": 9.623636131486218e-05, + "loss": 3.0985, + "step": 10239 + }, + { + "epoch": 0.635669501520889, + "grad_norm": 0.30999915086337665, + "learning_rate": 9.623498653809353e-05, + "loss": 3.1341, + "step": 10240 + }, + { + "epoch": 0.6357315786206469, + "grad_norm": 0.2956570688963515, + "learning_rate": 9.623361152010572e-05, + "loss": 3.1641, + "step": 10241 + }, + { + "epoch": 0.6357936557204047, + "grad_norm": 0.31576126197701204, + "learning_rate": 9.623223626090594e-05, + "loss": 3.0865, + "step": 10242 + }, + { + "epoch": 0.6358557328201626, + "grad_norm": 0.28988773238450044, + "learning_rate": 9.623086076050136e-05, + "loss": 3.1251, + "step": 10243 + }, + { + "epoch": 0.6359178099199205, + "grad_norm": 0.32376127254715426, + "learning_rate": 9.622948501889916e-05, + "loss": 3.2288, + "step": 10244 + }, + { + "epoch": 0.6359798870196784, + "grad_norm": 0.34525786567459527, + "learning_rate": 9.622810903610653e-05, + "loss": 3.1664, + "step": 10245 + }, + { + "epoch": 0.6360419641194364, + "grad_norm": 0.2884118386546685, + "learning_rate": 9.622673281213062e-05, + "loss": 3.1424, + "step": 10246 + }, + { + "epoch": 0.6361040412191943, + "grad_norm": 0.21368531981575234, + "learning_rate": 9.622535634697861e-05, + "loss": 3.3077, + "step": 10247 + }, + { + "epoch": 0.6361661183189521, + "grad_norm": 0.25452580789798085, + "learning_rate": 9.62239796406577e-05, + "loss": 3.1, + "step": 10248 + }, + { + "epoch": 0.63622819541871, + "grad_norm": 0.2116851185202069, + "learning_rate": 9.622260269317509e-05, + "loss": 3.1072, + "step": 10249 + }, + { + "epoch": 0.6362902725184679, + "grad_norm": 0.25368559125608403, + "learning_rate": 9.622122550453792e-05, + "loss": 3.2184, + "step": 10250 + }, + { + "epoch": 0.6363523496182258, + "grad_norm": 0.1962141681718824, + "learning_rate": 9.621984807475339e-05, + "loss": 3.1185, + "step": 10251 + }, + { + "epoch": 0.6364144267179838, + "grad_norm": 0.3779339654567008, + "learning_rate": 9.62184704038287e-05, + "loss": 3.1818, + "step": 10252 + }, + { + "epoch": 0.6364765038177417, + "grad_norm": 0.2139525395611505, + "learning_rate": 9.621709249177104e-05, + "loss": 3.0886, + "step": 10253 + }, + { + "epoch": 0.6365385809174995, + "grad_norm": 0.2338886858194413, + "learning_rate": 9.621571433858758e-05, + "loss": 3.2026, + "step": 10254 + }, + { + "epoch": 0.6366006580172574, + "grad_norm": 0.22539163453720612, + "learning_rate": 9.621433594428552e-05, + "loss": 3.1626, + "step": 10255 + }, + { + "epoch": 0.6366627351170153, + "grad_norm": 0.39575033405215154, + "learning_rate": 9.621295730887205e-05, + "loss": 3.1861, + "step": 10256 + }, + { + "epoch": 0.6367248122167732, + "grad_norm": 0.2724697836256804, + "learning_rate": 9.621157843235436e-05, + "loss": 3.0806, + "step": 10257 + }, + { + "epoch": 0.6367868893165312, + "grad_norm": 0.2499151458382853, + "learning_rate": 9.621019931473964e-05, + "loss": 3.1522, + "step": 10258 + }, + { + "epoch": 0.6368489664162891, + "grad_norm": 0.3048222195067294, + "learning_rate": 9.62088199560351e-05, + "loss": 3.1863, + "step": 10259 + }, + { + "epoch": 0.6369110435160469, + "grad_norm": 0.3451998112493189, + "learning_rate": 9.620744035624794e-05, + "loss": 3.0889, + "step": 10260 + }, + { + "epoch": 0.6369731206158048, + "grad_norm": 0.3061637328735343, + "learning_rate": 9.620606051538534e-05, + "loss": 3.1104, + "step": 10261 + }, + { + "epoch": 0.6370351977155627, + "grad_norm": 0.2619283178718456, + "learning_rate": 9.62046804334545e-05, + "loss": 3.202, + "step": 10262 + }, + { + "epoch": 0.6370972748153206, + "grad_norm": 0.24867351105991026, + "learning_rate": 9.620330011046263e-05, + "loss": 3.243, + "step": 10263 + }, + { + "epoch": 0.6371593519150786, + "grad_norm": 0.21124876862793435, + "learning_rate": 9.620191954641693e-05, + "loss": 3.1353, + "step": 10264 + }, + { + "epoch": 0.6372214290148365, + "grad_norm": 0.21833915104983495, + "learning_rate": 9.620053874132459e-05, + "loss": 3.153, + "step": 10265 + }, + { + "epoch": 0.6372835061145943, + "grad_norm": 0.2078958756401273, + "learning_rate": 9.619915769519283e-05, + "loss": 3.1469, + "step": 10266 + }, + { + "epoch": 0.6373455832143522, + "grad_norm": 0.18878524091291227, + "learning_rate": 9.619777640802885e-05, + "loss": 3.2203, + "step": 10267 + }, + { + "epoch": 0.6374076603141101, + "grad_norm": 0.35343259816434314, + "learning_rate": 9.619639487983984e-05, + "loss": 3.1139, + "step": 10268 + }, + { + "epoch": 0.637469737413868, + "grad_norm": 0.20549511069551518, + "learning_rate": 9.619501311063303e-05, + "loss": 3.1157, + "step": 10269 + }, + { + "epoch": 0.637531814513626, + "grad_norm": 0.23390810078339094, + "learning_rate": 9.619363110041564e-05, + "loss": 3.2612, + "step": 10270 + }, + { + "epoch": 0.6375938916133839, + "grad_norm": 0.24985643824600703, + "learning_rate": 9.619224884919485e-05, + "loss": 3.1559, + "step": 10271 + }, + { + "epoch": 0.6376559687131417, + "grad_norm": 0.2549096455097102, + "learning_rate": 9.61908663569779e-05, + "loss": 3.0525, + "step": 10272 + }, + { + "epoch": 0.6377180458128996, + "grad_norm": 0.18711053612971942, + "learning_rate": 9.618948362377195e-05, + "loss": 3.1821, + "step": 10273 + }, + { + "epoch": 0.6377801229126575, + "grad_norm": 0.19904232628072654, + "learning_rate": 9.618810064958428e-05, + "loss": 3.1934, + "step": 10274 + }, + { + "epoch": 0.6378422000124154, + "grad_norm": 0.42364714313783275, + "learning_rate": 9.618671743442208e-05, + "loss": 3.1819, + "step": 10275 + }, + { + "epoch": 0.6379042771121733, + "grad_norm": 0.29312596235544314, + "learning_rate": 9.618533397829255e-05, + "loss": 3.0851, + "step": 10276 + }, + { + "epoch": 0.6379663542119313, + "grad_norm": 0.22640977842268112, + "learning_rate": 9.618395028120293e-05, + "loss": 3.2408, + "step": 10277 + }, + { + "epoch": 0.6380284313116891, + "grad_norm": 0.22370948476637753, + "learning_rate": 9.618256634316042e-05, + "loss": 3.1688, + "step": 10278 + }, + { + "epoch": 0.638090508411447, + "grad_norm": 0.21460185738732526, + "learning_rate": 9.618118216417226e-05, + "loss": 3.2533, + "step": 10279 + }, + { + "epoch": 0.6381525855112049, + "grad_norm": 0.23206629921885108, + "learning_rate": 9.617979774424566e-05, + "loss": 3.0583, + "step": 10280 + }, + { + "epoch": 0.6382146626109628, + "grad_norm": 0.22615740998284142, + "learning_rate": 9.617841308338784e-05, + "loss": 3.0613, + "step": 10281 + }, + { + "epoch": 0.6382767397107207, + "grad_norm": 0.2415971097173487, + "learning_rate": 9.617702818160603e-05, + "loss": 3.2375, + "step": 10282 + }, + { + "epoch": 0.6383388168104787, + "grad_norm": 0.20532411626646282, + "learning_rate": 9.617564303890745e-05, + "loss": 3.1586, + "step": 10283 + }, + { + "epoch": 0.6384008939102365, + "grad_norm": 0.2371523097097104, + "learning_rate": 9.617425765529932e-05, + "loss": 3.2286, + "step": 10284 + }, + { + "epoch": 0.6384629710099944, + "grad_norm": 0.21091476953741708, + "learning_rate": 9.61728720307889e-05, + "loss": 3.2249, + "step": 10285 + }, + { + "epoch": 0.6385250481097523, + "grad_norm": 0.20634482508611393, + "learning_rate": 9.617148616538338e-05, + "loss": 3.1515, + "step": 10286 + }, + { + "epoch": 0.6385871252095102, + "grad_norm": 0.18704506200845922, + "learning_rate": 9.617010005909002e-05, + "loss": 3.122, + "step": 10287 + }, + { + "epoch": 0.6386492023092681, + "grad_norm": 0.30248417513415266, + "learning_rate": 9.616871371191604e-05, + "loss": 3.1171, + "step": 10288 + }, + { + "epoch": 0.6387112794090261, + "grad_norm": 0.19089354727179472, + "learning_rate": 9.616732712386867e-05, + "loss": 3.0024, + "step": 10289 + }, + { + "epoch": 0.6387733565087839, + "grad_norm": 0.17160809448682776, + "learning_rate": 9.616594029495516e-05, + "loss": 3.2789, + "step": 10290 + }, + { + "epoch": 0.6388354336085418, + "grad_norm": 0.2343900039956557, + "learning_rate": 9.616455322518272e-05, + "loss": 3.1282, + "step": 10291 + }, + { + "epoch": 0.6388975107082997, + "grad_norm": 0.2065101399045022, + "learning_rate": 9.616316591455859e-05, + "loss": 3.1056, + "step": 10292 + }, + { + "epoch": 0.6389595878080576, + "grad_norm": 0.19651803852597086, + "learning_rate": 9.616177836309004e-05, + "loss": 3.0488, + "step": 10293 + }, + { + "epoch": 0.6390216649078155, + "grad_norm": 0.25061773929400233, + "learning_rate": 9.616039057078427e-05, + "loss": 3.1639, + "step": 10294 + }, + { + "epoch": 0.6390837420075735, + "grad_norm": 0.2961220981847014, + "learning_rate": 9.615900253764853e-05, + "loss": 3.2317, + "step": 10295 + }, + { + "epoch": 0.6391458191073313, + "grad_norm": 0.25033680758518356, + "learning_rate": 9.615761426369008e-05, + "loss": 3.2424, + "step": 10296 + }, + { + "epoch": 0.6392078962070892, + "grad_norm": 0.2101752141457968, + "learning_rate": 9.615622574891615e-05, + "loss": 3.3101, + "step": 10297 + }, + { + "epoch": 0.6392699733068471, + "grad_norm": 0.25636195551636665, + "learning_rate": 9.615483699333399e-05, + "loss": 3.1088, + "step": 10298 + }, + { + "epoch": 0.639332050406605, + "grad_norm": 0.18625478720753155, + "learning_rate": 9.615344799695083e-05, + "loss": 3.205, + "step": 10299 + }, + { + "epoch": 0.6393941275063629, + "grad_norm": 0.19686703022462695, + "learning_rate": 9.615205875977394e-05, + "loss": 3.1537, + "step": 10300 + }, + { + "epoch": 0.6394562046061208, + "grad_norm": 0.17194161424668242, + "learning_rate": 9.615066928181053e-05, + "loss": 3.0657, + "step": 10301 + }, + { + "epoch": 0.6395182817058787, + "grad_norm": 0.19794566899839755, + "learning_rate": 9.614927956306789e-05, + "loss": 3.1801, + "step": 10302 + }, + { + "epoch": 0.6395803588056366, + "grad_norm": 0.20597181416360577, + "learning_rate": 9.614788960355327e-05, + "loss": 3.2633, + "step": 10303 + }, + { + "epoch": 0.6396424359053945, + "grad_norm": 0.16446514334901768, + "learning_rate": 9.61464994032739e-05, + "loss": 3.2118, + "step": 10304 + }, + { + "epoch": 0.6397045130051524, + "grad_norm": 0.23915666271569383, + "learning_rate": 9.614510896223703e-05, + "loss": 3.101, + "step": 10305 + }, + { + "epoch": 0.6397665901049103, + "grad_norm": 0.1743865312844835, + "learning_rate": 9.614371828044993e-05, + "loss": 3.1843, + "step": 10306 + }, + { + "epoch": 0.6398286672046682, + "grad_norm": 0.23249604139752184, + "learning_rate": 9.614232735791985e-05, + "loss": 3.2283, + "step": 10307 + }, + { + "epoch": 0.639890744304426, + "grad_norm": 0.17672190643985905, + "learning_rate": 9.614093619465405e-05, + "loss": 3.1778, + "step": 10308 + }, + { + "epoch": 0.639952821404184, + "grad_norm": 0.23416419085339796, + "learning_rate": 9.613954479065979e-05, + "loss": 3.1669, + "step": 10309 + }, + { + "epoch": 0.6400148985039419, + "grad_norm": 0.1850321204743722, + "learning_rate": 9.61381531459443e-05, + "loss": 3.1594, + "step": 10310 + }, + { + "epoch": 0.6400769756036998, + "grad_norm": 0.17151682553180017, + "learning_rate": 9.613676126051489e-05, + "loss": 3.1246, + "step": 10311 + }, + { + "epoch": 0.6401390527034577, + "grad_norm": 0.20044532609297155, + "learning_rate": 9.613536913437879e-05, + "loss": 3.0665, + "step": 10312 + }, + { + "epoch": 0.6402011298032156, + "grad_norm": 0.20200330712129969, + "learning_rate": 9.613397676754325e-05, + "loss": 3.2011, + "step": 10313 + }, + { + "epoch": 0.6402632069029734, + "grad_norm": 0.24120065922016393, + "learning_rate": 9.613258416001559e-05, + "loss": 3.1666, + "step": 10314 + }, + { + "epoch": 0.6403252840027314, + "grad_norm": 0.30615062804403, + "learning_rate": 9.613119131180302e-05, + "loss": 3.1614, + "step": 10315 + }, + { + "epoch": 0.6403873611024893, + "grad_norm": 0.2848913320828548, + "learning_rate": 9.612979822291282e-05, + "loss": 3.2167, + "step": 10316 + }, + { + "epoch": 0.6404494382022472, + "grad_norm": 0.17628180970940122, + "learning_rate": 9.612840489335227e-05, + "loss": 3.2106, + "step": 10317 + }, + { + "epoch": 0.6405115153020051, + "grad_norm": 0.20508637184636777, + "learning_rate": 9.612701132312863e-05, + "loss": 3.0569, + "step": 10318 + }, + { + "epoch": 0.640573592401763, + "grad_norm": 0.3450008846181779, + "learning_rate": 9.612561751224917e-05, + "loss": 3.1375, + "step": 10319 + }, + { + "epoch": 0.6406356695015208, + "grad_norm": 0.1938693229039633, + "learning_rate": 9.612422346072119e-05, + "loss": 3.1182, + "step": 10320 + }, + { + "epoch": 0.6406977466012788, + "grad_norm": 0.1953451320090965, + "learning_rate": 9.61228291685519e-05, + "loss": 3.1572, + "step": 10321 + }, + { + "epoch": 0.6407598237010367, + "grad_norm": 0.22446183873864278, + "learning_rate": 9.612143463574865e-05, + "loss": 3.2126, + "step": 10322 + }, + { + "epoch": 0.6408219008007946, + "grad_norm": 0.21539872189904566, + "learning_rate": 9.612003986231865e-05, + "loss": 3.1723, + "step": 10323 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.24512393531782078, + "learning_rate": 9.611864484826924e-05, + "loss": 3.2495, + "step": 10324 + }, + { + "epoch": 0.6409460550003104, + "grad_norm": 0.23113638948113224, + "learning_rate": 9.611724959360764e-05, + "loss": 3.2224, + "step": 10325 + }, + { + "epoch": 0.6410081321000682, + "grad_norm": 0.24731668243457852, + "learning_rate": 9.611585409834116e-05, + "loss": 3.2258, + "step": 10326 + }, + { + "epoch": 0.6410702091998262, + "grad_norm": 0.24939264341433476, + "learning_rate": 9.611445836247708e-05, + "loss": 3.2337, + "step": 10327 + }, + { + "epoch": 0.6411322862995841, + "grad_norm": 0.24952648440153494, + "learning_rate": 9.611306238602266e-05, + "loss": 3.2473, + "step": 10328 + }, + { + "epoch": 0.641194363399342, + "grad_norm": 0.2619386716883346, + "learning_rate": 9.611166616898521e-05, + "loss": 3.0932, + "step": 10329 + }, + { + "epoch": 0.6412564404990999, + "grad_norm": 0.2072813934068027, + "learning_rate": 9.6110269711372e-05, + "loss": 3.168, + "step": 10330 + }, + { + "epoch": 0.6413185175988578, + "grad_norm": 0.21378849271325118, + "learning_rate": 9.610887301319033e-05, + "loss": 3.09, + "step": 10331 + }, + { + "epoch": 0.6413805946986156, + "grad_norm": 0.22910197746583996, + "learning_rate": 9.610747607444745e-05, + "loss": 3.1097, + "step": 10332 + }, + { + "epoch": 0.6414426717983736, + "grad_norm": 0.45411859295811774, + "learning_rate": 9.610607889515071e-05, + "loss": 3.1345, + "step": 10333 + }, + { + "epoch": 0.6415047488981315, + "grad_norm": 0.40084773696364334, + "learning_rate": 9.610468147530734e-05, + "loss": 3.1184, + "step": 10334 + }, + { + "epoch": 0.6415668259978894, + "grad_norm": 0.25641836008767543, + "learning_rate": 9.610328381492466e-05, + "loss": 3.1744, + "step": 10335 + }, + { + "epoch": 0.6416289030976473, + "grad_norm": 0.3146428903934906, + "learning_rate": 9.610188591400995e-05, + "loss": 3.1673, + "step": 10336 + }, + { + "epoch": 0.6416909801974052, + "grad_norm": 0.28265438190979497, + "learning_rate": 9.610048777257052e-05, + "loss": 3.1924, + "step": 10337 + }, + { + "epoch": 0.641753057297163, + "grad_norm": 0.24222408295254166, + "learning_rate": 9.609908939061365e-05, + "loss": 3.2648, + "step": 10338 + }, + { + "epoch": 0.641815134396921, + "grad_norm": 0.21910595883158232, + "learning_rate": 9.609769076814665e-05, + "loss": 3.2078, + "step": 10339 + }, + { + "epoch": 0.6418772114966789, + "grad_norm": 0.19842173521242865, + "learning_rate": 9.60962919051768e-05, + "loss": 3.2075, + "step": 10340 + }, + { + "epoch": 0.6419392885964368, + "grad_norm": 0.23393104945417814, + "learning_rate": 9.609489280171139e-05, + "loss": 3.1976, + "step": 10341 + }, + { + "epoch": 0.6420013656961947, + "grad_norm": 0.266686101232579, + "learning_rate": 9.609349345775774e-05, + "loss": 3.1551, + "step": 10342 + }, + { + "epoch": 0.6420634427959526, + "grad_norm": 0.2068708986411697, + "learning_rate": 9.609209387332316e-05, + "loss": 3.0466, + "step": 10343 + }, + { + "epoch": 0.6421255198957104, + "grad_norm": 0.22622664814512125, + "learning_rate": 9.609069404841493e-05, + "loss": 3.2445, + "step": 10344 + }, + { + "epoch": 0.6421875969954683, + "grad_norm": 0.2911692220384306, + "learning_rate": 9.608929398304035e-05, + "loss": 3.1985, + "step": 10345 + }, + { + "epoch": 0.6422496740952263, + "grad_norm": 0.21290904149182788, + "learning_rate": 9.608789367720674e-05, + "loss": 3.1975, + "step": 10346 + }, + { + "epoch": 0.6423117511949842, + "grad_norm": 0.3588299141907141, + "learning_rate": 9.608649313092141e-05, + "loss": 3.1919, + "step": 10347 + }, + { + "epoch": 0.6423738282947421, + "grad_norm": 0.2089954444591827, + "learning_rate": 9.608509234419165e-05, + "loss": 3.1772, + "step": 10348 + }, + { + "epoch": 0.6424359053945, + "grad_norm": 0.2566870776540562, + "learning_rate": 9.608369131702477e-05, + "loss": 3.1213, + "step": 10349 + }, + { + "epoch": 0.6424979824942578, + "grad_norm": 0.26657585048001453, + "learning_rate": 9.60822900494281e-05, + "loss": 3.0368, + "step": 10350 + }, + { + "epoch": 0.6425600595940157, + "grad_norm": 0.2911612767040188, + "learning_rate": 9.608088854140894e-05, + "loss": 3.179, + "step": 10351 + }, + { + "epoch": 0.6426221366937737, + "grad_norm": 0.32147899915202877, + "learning_rate": 9.607948679297457e-05, + "loss": 3.1739, + "step": 10352 + }, + { + "epoch": 0.6426842137935316, + "grad_norm": 0.23379787278684244, + "learning_rate": 9.607808480413234e-05, + "loss": 3.1452, + "step": 10353 + }, + { + "epoch": 0.6427462908932895, + "grad_norm": 0.20660050584995662, + "learning_rate": 9.607668257488957e-05, + "loss": 3.1609, + "step": 10354 + }, + { + "epoch": 0.6428083679930474, + "grad_norm": 0.5475033284003407, + "learning_rate": 9.607528010525356e-05, + "loss": 3.071, + "step": 10355 + }, + { + "epoch": 0.6428704450928052, + "grad_norm": 0.26446736746770017, + "learning_rate": 9.607387739523161e-05, + "loss": 3.2076, + "step": 10356 + }, + { + "epoch": 0.6429325221925631, + "grad_norm": 0.30403751343156715, + "learning_rate": 9.607247444483108e-05, + "loss": 3.173, + "step": 10357 + }, + { + "epoch": 0.6429945992923211, + "grad_norm": 0.32191170418357695, + "learning_rate": 9.607107125405925e-05, + "loss": 3.179, + "step": 10358 + }, + { + "epoch": 0.643056676392079, + "grad_norm": 0.3106627218226405, + "learning_rate": 9.606966782292346e-05, + "loss": 3.1783, + "step": 10359 + }, + { + "epoch": 0.6431187534918369, + "grad_norm": 0.2326284260120879, + "learning_rate": 9.606826415143104e-05, + "loss": 3.1266, + "step": 10360 + }, + { + "epoch": 0.6431808305915948, + "grad_norm": 0.23283249781955193, + "learning_rate": 9.606686023958929e-05, + "loss": 3.0475, + "step": 10361 + }, + { + "epoch": 0.6432429076913526, + "grad_norm": 0.20219251364103283, + "learning_rate": 9.606545608740555e-05, + "loss": 3.0632, + "step": 10362 + }, + { + "epoch": 0.6433049847911105, + "grad_norm": 0.1927422974701251, + "learning_rate": 9.606405169488715e-05, + "loss": 3.1612, + "step": 10363 + }, + { + "epoch": 0.6433670618908685, + "grad_norm": 0.21766501665773946, + "learning_rate": 9.60626470620414e-05, + "loss": 3.1808, + "step": 10364 + }, + { + "epoch": 0.6434291389906264, + "grad_norm": 0.3475410935062911, + "learning_rate": 9.606124218887564e-05, + "loss": 3.151, + "step": 10365 + }, + { + "epoch": 0.6434912160903843, + "grad_norm": 0.2342759081854969, + "learning_rate": 9.60598370753972e-05, + "loss": 3.0625, + "step": 10366 + }, + { + "epoch": 0.6435532931901422, + "grad_norm": 0.19288554077122472, + "learning_rate": 9.605843172161341e-05, + "loss": 3.1214, + "step": 10367 + }, + { + "epoch": 0.6436153702899, + "grad_norm": 0.2300538838800936, + "learning_rate": 9.605702612753161e-05, + "loss": 3.2211, + "step": 10368 + }, + { + "epoch": 0.6436774473896579, + "grad_norm": 0.22121209559406071, + "learning_rate": 9.60556202931591e-05, + "loss": 3.0695, + "step": 10369 + }, + { + "epoch": 0.6437395244894158, + "grad_norm": 0.21252458058569276, + "learning_rate": 9.605421421850325e-05, + "loss": 3.1753, + "step": 10370 + }, + { + "epoch": 0.6438016015891738, + "grad_norm": 0.21558124367784118, + "learning_rate": 9.605280790357139e-05, + "loss": 3.2212, + "step": 10371 + }, + { + "epoch": 0.6438636786889317, + "grad_norm": 0.23300081164194122, + "learning_rate": 9.605140134837086e-05, + "loss": 3.1296, + "step": 10372 + }, + { + "epoch": 0.6439257557886896, + "grad_norm": 0.2489156976141285, + "learning_rate": 9.604999455290896e-05, + "loss": 3.0837, + "step": 10373 + }, + { + "epoch": 0.6439878328884474, + "grad_norm": 0.2540503313169776, + "learning_rate": 9.604858751719308e-05, + "loss": 3.1868, + "step": 10374 + }, + { + "epoch": 0.6440499099882053, + "grad_norm": 0.20707853245175178, + "learning_rate": 9.604718024123053e-05, + "loss": 3.1213, + "step": 10375 + }, + { + "epoch": 0.6441119870879632, + "grad_norm": 0.20010139309180378, + "learning_rate": 9.604577272502868e-05, + "loss": 3.2501, + "step": 10376 + }, + { + "epoch": 0.6441740641877212, + "grad_norm": 0.20296238845946177, + "learning_rate": 9.604436496859483e-05, + "loss": 3.1984, + "step": 10377 + }, + { + "epoch": 0.6442361412874791, + "grad_norm": 0.2031444393539551, + "learning_rate": 9.604295697193637e-05, + "loss": 3.1831, + "step": 10378 + }, + { + "epoch": 0.644298218387237, + "grad_norm": 0.19687885666657368, + "learning_rate": 9.604154873506063e-05, + "loss": 3.1578, + "step": 10379 + }, + { + "epoch": 0.6443602954869948, + "grad_norm": 0.17041281230791286, + "learning_rate": 9.604014025797495e-05, + "loss": 3.1743, + "step": 10380 + }, + { + "epoch": 0.6444223725867527, + "grad_norm": 0.1799240317735093, + "learning_rate": 9.603873154068666e-05, + "loss": 3.1282, + "step": 10381 + }, + { + "epoch": 0.6444844496865106, + "grad_norm": 0.18118093629970616, + "learning_rate": 9.603732258320315e-05, + "loss": 3.163, + "step": 10382 + }, + { + "epoch": 0.6445465267862686, + "grad_norm": 0.21135106870023673, + "learning_rate": 9.603591338553174e-05, + "loss": 3.2196, + "step": 10383 + }, + { + "epoch": 0.6446086038860265, + "grad_norm": 0.1870591225566125, + "learning_rate": 9.60345039476798e-05, + "loss": 3.1733, + "step": 10384 + }, + { + "epoch": 0.6446706809857844, + "grad_norm": 0.19807397818717679, + "learning_rate": 9.60330942696547e-05, + "loss": 3.0981, + "step": 10385 + }, + { + "epoch": 0.6447327580855422, + "grad_norm": 0.17864276023275324, + "learning_rate": 9.603168435146375e-05, + "loss": 3.1817, + "step": 10386 + }, + { + "epoch": 0.6447948351853001, + "grad_norm": 0.40990834691659406, + "learning_rate": 9.603027419311433e-05, + "loss": 3.1568, + "step": 10387 + }, + { + "epoch": 0.644856912285058, + "grad_norm": 0.29557387173984284, + "learning_rate": 9.60288637946138e-05, + "loss": 3.1282, + "step": 10388 + }, + { + "epoch": 0.644918989384816, + "grad_norm": 0.2555086483844754, + "learning_rate": 9.60274531559695e-05, + "loss": 3.1931, + "step": 10389 + }, + { + "epoch": 0.6449810664845739, + "grad_norm": 0.20227810483397074, + "learning_rate": 9.602604227718884e-05, + "loss": 3.0982, + "step": 10390 + }, + { + "epoch": 0.6450431435843318, + "grad_norm": 0.24941166553226757, + "learning_rate": 9.602463115827911e-05, + "loss": 3.1138, + "step": 10391 + }, + { + "epoch": 0.6451052206840896, + "grad_norm": 0.2791863869823561, + "learning_rate": 9.602321979924772e-05, + "loss": 3.2348, + "step": 10392 + }, + { + "epoch": 0.6451672977838475, + "grad_norm": 0.1940858756067842, + "learning_rate": 9.6021808200102e-05, + "loss": 3.1761, + "step": 10393 + }, + { + "epoch": 0.6452293748836054, + "grad_norm": 0.2558246763739342, + "learning_rate": 9.602039636084935e-05, + "loss": 3.1084, + "step": 10394 + }, + { + "epoch": 0.6452914519833634, + "grad_norm": 0.1884639221854535, + "learning_rate": 9.601898428149712e-05, + "loss": 3.1746, + "step": 10395 + }, + { + "epoch": 0.6453535290831213, + "grad_norm": 0.26229456287650477, + "learning_rate": 9.601757196205267e-05, + "loss": 3.1031, + "step": 10396 + }, + { + "epoch": 0.6454156061828792, + "grad_norm": 0.19211101211712828, + "learning_rate": 9.60161594025234e-05, + "loss": 3.0033, + "step": 10397 + }, + { + "epoch": 0.645477683282637, + "grad_norm": 0.2138287148207926, + "learning_rate": 9.601474660291662e-05, + "loss": 3.217, + "step": 10398 + }, + { + "epoch": 0.6455397603823949, + "grad_norm": 0.21211914826809294, + "learning_rate": 9.601333356323976e-05, + "loss": 3.2051, + "step": 10399 + }, + { + "epoch": 0.6456018374821528, + "grad_norm": 0.22075750305583933, + "learning_rate": 9.601192028350016e-05, + "loss": 3.13, + "step": 10400 + }, + { + "epoch": 0.6456639145819107, + "grad_norm": 0.19901101183076417, + "learning_rate": 9.601050676370522e-05, + "loss": 3.0993, + "step": 10401 + }, + { + "epoch": 0.6457259916816687, + "grad_norm": 0.21589433255131515, + "learning_rate": 9.600909300386227e-05, + "loss": 3.1123, + "step": 10402 + }, + { + "epoch": 0.6457880687814266, + "grad_norm": 0.23167909814106483, + "learning_rate": 9.600767900397874e-05, + "loss": 3.0738, + "step": 10403 + }, + { + "epoch": 0.6458501458811844, + "grad_norm": 0.19331999098295682, + "learning_rate": 9.600626476406195e-05, + "loss": 3.1716, + "step": 10404 + }, + { + "epoch": 0.6459122229809423, + "grad_norm": 0.19631620001258812, + "learning_rate": 9.600485028411933e-05, + "loss": 3.1932, + "step": 10405 + }, + { + "epoch": 0.6459743000807002, + "grad_norm": 0.22367905884031145, + "learning_rate": 9.600343556415822e-05, + "loss": 3.1927, + "step": 10406 + }, + { + "epoch": 0.6460363771804581, + "grad_norm": 0.19417800428343562, + "learning_rate": 9.600202060418603e-05, + "loss": 3.1423, + "step": 10407 + }, + { + "epoch": 0.6460984542802161, + "grad_norm": 0.22363265424831072, + "learning_rate": 9.600060540421014e-05, + "loss": 3.1012, + "step": 10408 + }, + { + "epoch": 0.646160531379974, + "grad_norm": 0.21591198385390403, + "learning_rate": 9.599918996423792e-05, + "loss": 3.2018, + "step": 10409 + }, + { + "epoch": 0.6462226084797318, + "grad_norm": 0.17223727453587892, + "learning_rate": 9.599777428427676e-05, + "loss": 3.0887, + "step": 10410 + }, + { + "epoch": 0.6462846855794897, + "grad_norm": 0.26902582629746624, + "learning_rate": 9.599635836433404e-05, + "loss": 3.1462, + "step": 10411 + }, + { + "epoch": 0.6463467626792476, + "grad_norm": 0.1963728351133739, + "learning_rate": 9.599494220441715e-05, + "loss": 3.249, + "step": 10412 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.2050871710825738, + "learning_rate": 9.599352580453349e-05, + "loss": 3.173, + "step": 10413 + }, + { + "epoch": 0.6464709168787635, + "grad_norm": 0.23376909455934186, + "learning_rate": 9.599210916469043e-05, + "loss": 3.1876, + "step": 10414 + }, + { + "epoch": 0.6465329939785214, + "grad_norm": 0.278654172659653, + "learning_rate": 9.599069228489536e-05, + "loss": 3.069, + "step": 10415 + }, + { + "epoch": 0.6465950710782792, + "grad_norm": 0.20682297432741045, + "learning_rate": 9.598927516515571e-05, + "loss": 3.1563, + "step": 10416 + }, + { + "epoch": 0.6466571481780371, + "grad_norm": 0.2197513057764336, + "learning_rate": 9.598785780547883e-05, + "loss": 3.1302, + "step": 10417 + }, + { + "epoch": 0.646719225277795, + "grad_norm": 0.17058048098052597, + "learning_rate": 9.598644020587215e-05, + "loss": 3.1872, + "step": 10418 + }, + { + "epoch": 0.6467813023775529, + "grad_norm": 0.21078368159391012, + "learning_rate": 9.598502236634304e-05, + "loss": 3.2333, + "step": 10419 + }, + { + "epoch": 0.6468433794773109, + "grad_norm": 0.20501221097073702, + "learning_rate": 9.59836042868989e-05, + "loss": 3.1812, + "step": 10420 + }, + { + "epoch": 0.6469054565770688, + "grad_norm": 0.1737863511885563, + "learning_rate": 9.598218596754714e-05, + "loss": 3.1801, + "step": 10421 + }, + { + "epoch": 0.6469675336768266, + "grad_norm": 0.2090129743597473, + "learning_rate": 9.598076740829514e-05, + "loss": 3.2542, + "step": 10422 + }, + { + "epoch": 0.6470296107765845, + "grad_norm": 0.1913406590553643, + "learning_rate": 9.597934860915032e-05, + "loss": 3.1916, + "step": 10423 + }, + { + "epoch": 0.6470916878763424, + "grad_norm": 0.20654172423508593, + "learning_rate": 9.597792957012009e-05, + "loss": 3.1175, + "step": 10424 + }, + { + "epoch": 0.6471537649761003, + "grad_norm": 0.18568026115048628, + "learning_rate": 9.597651029121182e-05, + "loss": 3.1672, + "step": 10425 + }, + { + "epoch": 0.6472158420758582, + "grad_norm": 0.20382893071978553, + "learning_rate": 9.597509077243293e-05, + "loss": 3.1931, + "step": 10426 + }, + { + "epoch": 0.6472779191756162, + "grad_norm": 0.18352791437709437, + "learning_rate": 9.597367101379086e-05, + "loss": 3.1781, + "step": 10427 + }, + { + "epoch": 0.647339996275374, + "grad_norm": 0.17452517597929768, + "learning_rate": 9.597225101529296e-05, + "loss": 3.2567, + "step": 10428 + }, + { + "epoch": 0.6474020733751319, + "grad_norm": 0.16848357723994967, + "learning_rate": 9.597083077694668e-05, + "loss": 3.138, + "step": 10429 + }, + { + "epoch": 0.6474641504748898, + "grad_norm": 0.17189153678408792, + "learning_rate": 9.596941029875942e-05, + "loss": 3.1176, + "step": 10430 + }, + { + "epoch": 0.6475262275746477, + "grad_norm": 0.18551092301454067, + "learning_rate": 9.596798958073859e-05, + "loss": 3.0407, + "step": 10431 + }, + { + "epoch": 0.6475883046744056, + "grad_norm": 0.1880933987889431, + "learning_rate": 9.596656862289158e-05, + "loss": 3.116, + "step": 10432 + }, + { + "epoch": 0.6476503817741636, + "grad_norm": 0.19660952963660389, + "learning_rate": 9.596514742522584e-05, + "loss": 3.1559, + "step": 10433 + }, + { + "epoch": 0.6477124588739214, + "grad_norm": 0.20685596062961287, + "learning_rate": 9.596372598774875e-05, + "loss": 3.1385, + "step": 10434 + }, + { + "epoch": 0.6477745359736793, + "grad_norm": 0.1716861445891068, + "learning_rate": 9.596230431046775e-05, + "loss": 3.1802, + "step": 10435 + }, + { + "epoch": 0.6478366130734372, + "grad_norm": 0.2257039723259908, + "learning_rate": 9.596088239339027e-05, + "loss": 3.2055, + "step": 10436 + }, + { + "epoch": 0.6478986901731951, + "grad_norm": 0.1902652715388515, + "learning_rate": 9.595946023652369e-05, + "loss": 3.164, + "step": 10437 + }, + { + "epoch": 0.647960767272953, + "grad_norm": 0.20536621385437065, + "learning_rate": 9.595803783987544e-05, + "loss": 3.1379, + "step": 10438 + }, + { + "epoch": 0.648022844372711, + "grad_norm": 0.1966622444304894, + "learning_rate": 9.595661520345299e-05, + "loss": 2.9953, + "step": 10439 + }, + { + "epoch": 0.6480849214724688, + "grad_norm": 0.21761964918807733, + "learning_rate": 9.595519232726369e-05, + "loss": 3.1144, + "step": 10440 + }, + { + "epoch": 0.6481469985722267, + "grad_norm": 0.1799132649576578, + "learning_rate": 9.595376921131499e-05, + "loss": 3.1043, + "step": 10441 + }, + { + "epoch": 0.6482090756719846, + "grad_norm": 0.19747483141397268, + "learning_rate": 9.595234585561432e-05, + "loss": 3.1629, + "step": 10442 + }, + { + "epoch": 0.6482711527717425, + "grad_norm": 0.21706375672327802, + "learning_rate": 9.595092226016913e-05, + "loss": 3.1212, + "step": 10443 + }, + { + "epoch": 0.6483332298715004, + "grad_norm": 0.1723550032662179, + "learning_rate": 9.594949842498682e-05, + "loss": 3.1905, + "step": 10444 + }, + { + "epoch": 0.6483953069712584, + "grad_norm": 0.17407308955485779, + "learning_rate": 9.59480743500748e-05, + "loss": 3.1284, + "step": 10445 + }, + { + "epoch": 0.6484573840710162, + "grad_norm": 0.1717257339213198, + "learning_rate": 9.594665003544053e-05, + "loss": 3.206, + "step": 10446 + }, + { + "epoch": 0.6485194611707741, + "grad_norm": 0.19438285836283295, + "learning_rate": 9.594522548109144e-05, + "loss": 3.0971, + "step": 10447 + }, + { + "epoch": 0.648581538270532, + "grad_norm": 0.16117191996090696, + "learning_rate": 9.594380068703496e-05, + "loss": 3.1969, + "step": 10448 + }, + { + "epoch": 0.6486436153702899, + "grad_norm": 0.19858535417206735, + "learning_rate": 9.59423756532785e-05, + "loss": 3.1832, + "step": 10449 + }, + { + "epoch": 0.6487056924700478, + "grad_norm": 0.1580808367046488, + "learning_rate": 9.594095037982953e-05, + "loss": 3.1734, + "step": 10450 + }, + { + "epoch": 0.6487677695698058, + "grad_norm": 0.19539886472488388, + "learning_rate": 9.593952486669547e-05, + "loss": 3.1265, + "step": 10451 + }, + { + "epoch": 0.6488298466695636, + "grad_norm": 0.1709985385373611, + "learning_rate": 9.593809911388374e-05, + "loss": 3.206, + "step": 10452 + }, + { + "epoch": 0.6488919237693215, + "grad_norm": 0.32689969299584387, + "learning_rate": 9.593667312140181e-05, + "loss": 3.14, + "step": 10453 + }, + { + "epoch": 0.6489540008690794, + "grad_norm": 0.20953519372265378, + "learning_rate": 9.593524688925711e-05, + "loss": 3.1524, + "step": 10454 + }, + { + "epoch": 0.6490160779688373, + "grad_norm": 0.1821466289007527, + "learning_rate": 9.593382041745705e-05, + "loss": 3.1109, + "step": 10455 + }, + { + "epoch": 0.6490781550685952, + "grad_norm": 0.2822587677951261, + "learning_rate": 9.593239370600912e-05, + "loss": 3.0435, + "step": 10456 + }, + { + "epoch": 0.6491402321683531, + "grad_norm": 0.17777789285299267, + "learning_rate": 9.593096675492073e-05, + "loss": 3.1613, + "step": 10457 + }, + { + "epoch": 0.649202309268111, + "grad_norm": 0.22882990947430032, + "learning_rate": 9.592953956419935e-05, + "loss": 3.1846, + "step": 10458 + }, + { + "epoch": 0.6492643863678689, + "grad_norm": 0.19166460895643134, + "learning_rate": 9.59281121338524e-05, + "loss": 3.1988, + "step": 10459 + }, + { + "epoch": 0.6493264634676268, + "grad_norm": 0.29200383549042663, + "learning_rate": 9.592668446388735e-05, + "loss": 3.131, + "step": 10460 + }, + { + "epoch": 0.6493885405673847, + "grad_norm": 0.29264084585518474, + "learning_rate": 9.592525655431164e-05, + "loss": 3.1724, + "step": 10461 + }, + { + "epoch": 0.6494506176671426, + "grad_norm": 0.1954938523436945, + "learning_rate": 9.59238284051327e-05, + "loss": 3.2375, + "step": 10462 + }, + { + "epoch": 0.6495126947669005, + "grad_norm": 0.25854015255888646, + "learning_rate": 9.5922400016358e-05, + "loss": 3.0936, + "step": 10463 + }, + { + "epoch": 0.6495747718666584, + "grad_norm": 0.30351171061423093, + "learning_rate": 9.5920971387995e-05, + "loss": 3.1723, + "step": 10464 + }, + { + "epoch": 0.6496368489664163, + "grad_norm": 0.26137296408850513, + "learning_rate": 9.591954252005116e-05, + "loss": 2.9953, + "step": 10465 + }, + { + "epoch": 0.6496989260661742, + "grad_norm": 0.22417350438802683, + "learning_rate": 9.591811341253389e-05, + "loss": 3.0635, + "step": 10466 + }, + { + "epoch": 0.6497610031659321, + "grad_norm": 0.2820684524684004, + "learning_rate": 9.59166840654507e-05, + "loss": 3.1298, + "step": 10467 + }, + { + "epoch": 0.64982308026569, + "grad_norm": 0.22987505041612463, + "learning_rate": 9.5915254478809e-05, + "loss": 3.1575, + "step": 10468 + }, + { + "epoch": 0.6498851573654479, + "grad_norm": 0.3213710738216362, + "learning_rate": 9.59138246526163e-05, + "loss": 3.248, + "step": 10469 + }, + { + "epoch": 0.6499472344652057, + "grad_norm": 0.2586256699007189, + "learning_rate": 9.591239458687999e-05, + "loss": 3.1967, + "step": 10470 + }, + { + "epoch": 0.6500093115649637, + "grad_norm": 0.3018261606778815, + "learning_rate": 9.59109642816076e-05, + "loss": 3.0975, + "step": 10471 + }, + { + "epoch": 0.6500713886647216, + "grad_norm": 0.25201469970113216, + "learning_rate": 9.590953373680656e-05, + "loss": 3.1793, + "step": 10472 + }, + { + "epoch": 0.6501334657644795, + "grad_norm": 0.23771178095242884, + "learning_rate": 9.590810295248434e-05, + "loss": 3.1585, + "step": 10473 + }, + { + "epoch": 0.6501955428642374, + "grad_norm": 0.21041503416189797, + "learning_rate": 9.59066719286484e-05, + "loss": 3.1198, + "step": 10474 + }, + { + "epoch": 0.6502576199639953, + "grad_norm": 0.24569169004240057, + "learning_rate": 9.59052406653062e-05, + "loss": 3.1521, + "step": 10475 + }, + { + "epoch": 0.6503196970637531, + "grad_norm": 0.2041038440047517, + "learning_rate": 9.590380916246523e-05, + "loss": 3.1765, + "step": 10476 + }, + { + "epoch": 0.6503817741635111, + "grad_norm": 0.215273202485171, + "learning_rate": 9.590237742013294e-05, + "loss": 3.1776, + "step": 10477 + }, + { + "epoch": 0.650443851263269, + "grad_norm": 0.2117522012670466, + "learning_rate": 9.59009454383168e-05, + "loss": 3.0639, + "step": 10478 + }, + { + "epoch": 0.6505059283630269, + "grad_norm": 0.2369790452740257, + "learning_rate": 9.589951321702428e-05, + "loss": 3.2092, + "step": 10479 + }, + { + "epoch": 0.6505680054627848, + "grad_norm": 0.21260394471259886, + "learning_rate": 9.589808075626286e-05, + "loss": 3.2312, + "step": 10480 + }, + { + "epoch": 0.6506300825625427, + "grad_norm": 0.2160651945112192, + "learning_rate": 9.589664805604002e-05, + "loss": 3.0783, + "step": 10481 + }, + { + "epoch": 0.6506921596623005, + "grad_norm": 0.3383346022189044, + "learning_rate": 9.589521511636322e-05, + "loss": 3.2238, + "step": 10482 + }, + { + "epoch": 0.6507542367620585, + "grad_norm": 0.3314185267129974, + "learning_rate": 9.589378193723993e-05, + "loss": 3.1252, + "step": 10483 + }, + { + "epoch": 0.6508163138618164, + "grad_norm": 0.19603169587819055, + "learning_rate": 9.589234851867766e-05, + "loss": 3.1302, + "step": 10484 + }, + { + "epoch": 0.6508783909615743, + "grad_norm": 0.32048655642286455, + "learning_rate": 9.589091486068385e-05, + "loss": 3.2142, + "step": 10485 + }, + { + "epoch": 0.6509404680613322, + "grad_norm": 0.212208731969697, + "learning_rate": 9.5889480963266e-05, + "loss": 3.1053, + "step": 10486 + }, + { + "epoch": 0.6510025451610901, + "grad_norm": 0.1998681721740785, + "learning_rate": 9.588804682643158e-05, + "loss": 3.1899, + "step": 10487 + }, + { + "epoch": 0.6510646222608479, + "grad_norm": 0.2261217203350527, + "learning_rate": 9.588661245018811e-05, + "loss": 3.2152, + "step": 10488 + }, + { + "epoch": 0.6511266993606059, + "grad_norm": 0.2797904288708842, + "learning_rate": 9.588517783454301e-05, + "loss": 3.1028, + "step": 10489 + }, + { + "epoch": 0.6511887764603638, + "grad_norm": 0.2701330101691399, + "learning_rate": 9.588374297950382e-05, + "loss": 3.1257, + "step": 10490 + }, + { + "epoch": 0.6512508535601217, + "grad_norm": 0.2672875211567173, + "learning_rate": 9.588230788507799e-05, + "loss": 3.1618, + "step": 10491 + }, + { + "epoch": 0.6513129306598796, + "grad_norm": 0.23481173356088256, + "learning_rate": 9.588087255127303e-05, + "loss": 3.1095, + "step": 10492 + }, + { + "epoch": 0.6513750077596374, + "grad_norm": 0.2956243321852772, + "learning_rate": 9.587943697809642e-05, + "loss": 3.2046, + "step": 10493 + }, + { + "epoch": 0.6514370848593953, + "grad_norm": 0.2474703659754403, + "learning_rate": 9.587800116555564e-05, + "loss": 3.1463, + "step": 10494 + }, + { + "epoch": 0.6514991619591533, + "grad_norm": 0.23477167099516003, + "learning_rate": 9.58765651136582e-05, + "loss": 3.1656, + "step": 10495 + }, + { + "epoch": 0.6515612390589112, + "grad_norm": 0.23038987248538098, + "learning_rate": 9.587512882241159e-05, + "loss": 3.1627, + "step": 10496 + }, + { + "epoch": 0.6516233161586691, + "grad_norm": 0.2670668569433049, + "learning_rate": 9.587369229182328e-05, + "loss": 3.2183, + "step": 10497 + }, + { + "epoch": 0.651685393258427, + "grad_norm": 0.29651699313941504, + "learning_rate": 9.587225552190078e-05, + "loss": 3.121, + "step": 10498 + }, + { + "epoch": 0.6517474703581848, + "grad_norm": 0.2557764865475967, + "learning_rate": 9.587081851265161e-05, + "loss": 3.3207, + "step": 10499 + }, + { + "epoch": 0.6518095474579427, + "grad_norm": 0.2047294650120449, + "learning_rate": 9.586938126408322e-05, + "loss": 3.0787, + "step": 10500 + }, + { + "epoch": 0.6518716245577006, + "grad_norm": 0.24335947572513167, + "learning_rate": 9.586794377620314e-05, + "loss": 3.0883, + "step": 10501 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.2038009748145188, + "learning_rate": 9.586650604901887e-05, + "loss": 3.0958, + "step": 10502 + }, + { + "epoch": 0.6519957787572165, + "grad_norm": 0.2722968013870806, + "learning_rate": 9.58650680825379e-05, + "loss": 3.2292, + "step": 10503 + }, + { + "epoch": 0.6520578558569744, + "grad_norm": 0.2390244883816393, + "learning_rate": 9.586362987676773e-05, + "loss": 3.1402, + "step": 10504 + }, + { + "epoch": 0.6521199329567322, + "grad_norm": 0.4093230913463368, + "learning_rate": 9.586219143171587e-05, + "loss": 3.1233, + "step": 10505 + }, + { + "epoch": 0.6521820100564901, + "grad_norm": 0.3057098476256638, + "learning_rate": 9.586075274738983e-05, + "loss": 3.188, + "step": 10506 + }, + { + "epoch": 0.652244087156248, + "grad_norm": 0.2970851731455164, + "learning_rate": 9.58593138237971e-05, + "loss": 3.2091, + "step": 10507 + }, + { + "epoch": 0.652306164256006, + "grad_norm": 0.31534213808649686, + "learning_rate": 9.585787466094521e-05, + "loss": 3.2008, + "step": 10508 + }, + { + "epoch": 0.6523682413557639, + "grad_norm": 0.2641394813330058, + "learning_rate": 9.585643525884165e-05, + "loss": 3.2017, + "step": 10509 + }, + { + "epoch": 0.6524303184555218, + "grad_norm": 0.3196361542091772, + "learning_rate": 9.585499561749392e-05, + "loss": 3.147, + "step": 10510 + }, + { + "epoch": 0.6524923955552796, + "grad_norm": 0.2383666901542774, + "learning_rate": 9.585355573690957e-05, + "loss": 3.201, + "step": 10511 + }, + { + "epoch": 0.6525544726550375, + "grad_norm": 0.2636183083834897, + "learning_rate": 9.585211561709609e-05, + "loss": 3.1143, + "step": 10512 + }, + { + "epoch": 0.6526165497547954, + "grad_norm": 0.22367634412036114, + "learning_rate": 9.585067525806097e-05, + "loss": 3.1629, + "step": 10513 + }, + { + "epoch": 0.6526786268545534, + "grad_norm": 0.27843931899670005, + "learning_rate": 9.584923465981175e-05, + "loss": 3.1601, + "step": 10514 + }, + { + "epoch": 0.6527407039543113, + "grad_norm": 0.2981566820564334, + "learning_rate": 9.584779382235596e-05, + "loss": 3.1414, + "step": 10515 + }, + { + "epoch": 0.6528027810540692, + "grad_norm": 0.24544123017585254, + "learning_rate": 9.584635274570109e-05, + "loss": 3.2385, + "step": 10516 + }, + { + "epoch": 0.652864858153827, + "grad_norm": 0.25277039213767444, + "learning_rate": 9.584491142985466e-05, + "loss": 3.2241, + "step": 10517 + }, + { + "epoch": 0.6529269352535849, + "grad_norm": 0.24895120208826962, + "learning_rate": 9.58434698748242e-05, + "loss": 3.1817, + "step": 10518 + }, + { + "epoch": 0.6529890123533428, + "grad_norm": 0.22498771481618435, + "learning_rate": 9.584202808061724e-05, + "loss": 3.0985, + "step": 10519 + }, + { + "epoch": 0.6530510894531008, + "grad_norm": 0.24665449984905954, + "learning_rate": 9.584058604724128e-05, + "loss": 3.1487, + "step": 10520 + }, + { + "epoch": 0.6531131665528587, + "grad_norm": 0.2752041963506494, + "learning_rate": 9.583914377470387e-05, + "loss": 3.1956, + "step": 10521 + }, + { + "epoch": 0.6531752436526166, + "grad_norm": 0.2585565638239961, + "learning_rate": 9.58377012630125e-05, + "loss": 3.1544, + "step": 10522 + }, + { + "epoch": 0.6532373207523744, + "grad_norm": 0.20846658359351738, + "learning_rate": 9.583625851217473e-05, + "loss": 3.2685, + "step": 10523 + }, + { + "epoch": 0.6532993978521323, + "grad_norm": 0.32789047124866794, + "learning_rate": 9.583481552219806e-05, + "loss": 3.1723, + "step": 10524 + }, + { + "epoch": 0.6533614749518902, + "grad_norm": 0.3506863166110615, + "learning_rate": 9.583337229309004e-05, + "loss": 3.1977, + "step": 10525 + }, + { + "epoch": 0.6534235520516481, + "grad_norm": 0.24439715831329029, + "learning_rate": 9.583192882485818e-05, + "loss": 3.0609, + "step": 10526 + }, + { + "epoch": 0.6534856291514061, + "grad_norm": 0.3948135687792613, + "learning_rate": 9.583048511751003e-05, + "loss": 3.1504, + "step": 10527 + }, + { + "epoch": 0.653547706251164, + "grad_norm": 0.2367041646285018, + "learning_rate": 9.582904117105312e-05, + "loss": 3.1527, + "step": 10528 + }, + { + "epoch": 0.6536097833509218, + "grad_norm": 0.302406712070375, + "learning_rate": 9.582759698549496e-05, + "loss": 3.0737, + "step": 10529 + }, + { + "epoch": 0.6536718604506797, + "grad_norm": 0.2895006390492068, + "learning_rate": 9.582615256084311e-05, + "loss": 3.1896, + "step": 10530 + }, + { + "epoch": 0.6537339375504376, + "grad_norm": 0.32760301578645046, + "learning_rate": 9.58247078971051e-05, + "loss": 3.2014, + "step": 10531 + }, + { + "epoch": 0.6537960146501955, + "grad_norm": 0.2692877457335381, + "learning_rate": 9.582326299428845e-05, + "loss": 3.1099, + "step": 10532 + }, + { + "epoch": 0.6538580917499535, + "grad_norm": 0.21845252382405678, + "learning_rate": 9.582181785240073e-05, + "loss": 3.114, + "step": 10533 + }, + { + "epoch": 0.6539201688497114, + "grad_norm": 0.2549705792733257, + "learning_rate": 9.582037247144946e-05, + "loss": 3.1683, + "step": 10534 + }, + { + "epoch": 0.6539822459494692, + "grad_norm": 0.2028536726711271, + "learning_rate": 9.581892685144218e-05, + "loss": 3.1978, + "step": 10535 + }, + { + "epoch": 0.6540443230492271, + "grad_norm": 0.3057443677815466, + "learning_rate": 9.581748099238643e-05, + "loss": 3.1546, + "step": 10536 + }, + { + "epoch": 0.654106400148985, + "grad_norm": 0.2559105043065779, + "learning_rate": 9.581603489428976e-05, + "loss": 3.1217, + "step": 10537 + }, + { + "epoch": 0.6541684772487429, + "grad_norm": 0.2284208003174825, + "learning_rate": 9.581458855715973e-05, + "loss": 3.0817, + "step": 10538 + }, + { + "epoch": 0.6542305543485009, + "grad_norm": 0.20333145726685567, + "learning_rate": 9.581314198100386e-05, + "loss": 3.2447, + "step": 10539 + }, + { + "epoch": 0.6542926314482588, + "grad_norm": 0.22961193775097988, + "learning_rate": 9.581169516582971e-05, + "loss": 3.1003, + "step": 10540 + }, + { + "epoch": 0.6543547085480166, + "grad_norm": 0.24405336728024818, + "learning_rate": 9.58102481116448e-05, + "loss": 3.1613, + "step": 10541 + }, + { + "epoch": 0.6544167856477745, + "grad_norm": 0.3238054996956773, + "learning_rate": 9.580880081845673e-05, + "loss": 3.1092, + "step": 10542 + }, + { + "epoch": 0.6544788627475324, + "grad_norm": 0.1998425143056788, + "learning_rate": 9.580735328627304e-05, + "loss": 3.1066, + "step": 10543 + }, + { + "epoch": 0.6545409398472903, + "grad_norm": 0.2502716198231967, + "learning_rate": 9.580590551510125e-05, + "loss": 3.0743, + "step": 10544 + }, + { + "epoch": 0.6546030169470483, + "grad_norm": 0.2133150681733101, + "learning_rate": 9.580445750494892e-05, + "loss": 3.158, + "step": 10545 + }, + { + "epoch": 0.6546650940468062, + "grad_norm": 0.19927772883103495, + "learning_rate": 9.580300925582362e-05, + "loss": 3.0915, + "step": 10546 + }, + { + "epoch": 0.654727171146564, + "grad_norm": 0.22957833479085876, + "learning_rate": 9.580156076773291e-05, + "loss": 3.1327, + "step": 10547 + }, + { + "epoch": 0.6547892482463219, + "grad_norm": 0.2962372982947913, + "learning_rate": 9.580011204068433e-05, + "loss": 3.1913, + "step": 10548 + }, + { + "epoch": 0.6548513253460798, + "grad_norm": 0.27429563199910334, + "learning_rate": 9.579866307468546e-05, + "loss": 3.0706, + "step": 10549 + }, + { + "epoch": 0.6549134024458377, + "grad_norm": 0.21572050315435212, + "learning_rate": 9.579721386974385e-05, + "loss": 3.1662, + "step": 10550 + }, + { + "epoch": 0.6549754795455957, + "grad_norm": 0.22300488799986146, + "learning_rate": 9.579576442586703e-05, + "loss": 3.0933, + "step": 10551 + }, + { + "epoch": 0.6550375566453536, + "grad_norm": 0.18844857279823363, + "learning_rate": 9.579431474306262e-05, + "loss": 3.1087, + "step": 10552 + }, + { + "epoch": 0.6550996337451114, + "grad_norm": 0.22920353368948573, + "learning_rate": 9.579286482133814e-05, + "loss": 3.1546, + "step": 10553 + }, + { + "epoch": 0.6551617108448693, + "grad_norm": 0.23371531312019372, + "learning_rate": 9.579141466070117e-05, + "loss": 3.2328, + "step": 10554 + }, + { + "epoch": 0.6552237879446272, + "grad_norm": 0.19205868715113333, + "learning_rate": 9.578996426115927e-05, + "loss": 3.1244, + "step": 10555 + }, + { + "epoch": 0.6552858650443851, + "grad_norm": 0.18390491170728446, + "learning_rate": 9.578851362272001e-05, + "loss": 3.1834, + "step": 10556 + }, + { + "epoch": 0.655347942144143, + "grad_norm": 0.1855897753460606, + "learning_rate": 9.578706274539095e-05, + "loss": 3.1614, + "step": 10557 + }, + { + "epoch": 0.655410019243901, + "grad_norm": 0.19562598819300261, + "learning_rate": 9.57856116291797e-05, + "loss": 3.0765, + "step": 10558 + }, + { + "epoch": 0.6554720963436588, + "grad_norm": 0.1949687603233446, + "learning_rate": 9.578416027409376e-05, + "loss": 3.1511, + "step": 10559 + }, + { + "epoch": 0.6555341734434167, + "grad_norm": 0.23131464235127447, + "learning_rate": 9.578270868014076e-05, + "loss": 3.2453, + "step": 10560 + }, + { + "epoch": 0.6555962505431746, + "grad_norm": 0.19548764005661237, + "learning_rate": 9.578125684732826e-05, + "loss": 3.1534, + "step": 10561 + }, + { + "epoch": 0.6556583276429325, + "grad_norm": 0.18723621290212616, + "learning_rate": 9.577980477566381e-05, + "loss": 3.1006, + "step": 10562 + }, + { + "epoch": 0.6557204047426904, + "grad_norm": 0.25099707493654455, + "learning_rate": 9.577835246515503e-05, + "loss": 3.3049, + "step": 10563 + }, + { + "epoch": 0.6557824818424484, + "grad_norm": 0.18336954021191926, + "learning_rate": 9.577689991580947e-05, + "loss": 3.1333, + "step": 10564 + }, + { + "epoch": 0.6558445589422062, + "grad_norm": 0.24111506452553746, + "learning_rate": 9.57754471276347e-05, + "loss": 3.1489, + "step": 10565 + }, + { + "epoch": 0.6559066360419641, + "grad_norm": 0.1801871216055833, + "learning_rate": 9.577399410063831e-05, + "loss": 3.1837, + "step": 10566 + }, + { + "epoch": 0.655968713141722, + "grad_norm": 0.25516625790612063, + "learning_rate": 9.577254083482787e-05, + "loss": 3.2431, + "step": 10567 + }, + { + "epoch": 0.6560307902414799, + "grad_norm": 0.2006477546711467, + "learning_rate": 9.577108733021099e-05, + "loss": 3.1496, + "step": 10568 + }, + { + "epoch": 0.6560928673412378, + "grad_norm": 0.17878244355684636, + "learning_rate": 9.576963358679522e-05, + "loss": 3.1598, + "step": 10569 + }, + { + "epoch": 0.6561549444409958, + "grad_norm": 0.24807990597645863, + "learning_rate": 9.57681796045882e-05, + "loss": 3.0479, + "step": 10570 + }, + { + "epoch": 0.6562170215407536, + "grad_norm": 0.2235820662713401, + "learning_rate": 9.576672538359743e-05, + "loss": 3.1408, + "step": 10571 + }, + { + "epoch": 0.6562790986405115, + "grad_norm": 0.18157972319782392, + "learning_rate": 9.576527092383056e-05, + "loss": 3.1199, + "step": 10572 + }, + { + "epoch": 0.6563411757402694, + "grad_norm": 0.2907838090131139, + "learning_rate": 9.576381622529518e-05, + "loss": 3.1236, + "step": 10573 + }, + { + "epoch": 0.6564032528400273, + "grad_norm": 0.1919564300178059, + "learning_rate": 9.576236128799884e-05, + "loss": 3.1428, + "step": 10574 + }, + { + "epoch": 0.6564653299397852, + "grad_norm": 0.19962954708526856, + "learning_rate": 9.576090611194915e-05, + "loss": 3.222, + "step": 10575 + }, + { + "epoch": 0.6565274070395432, + "grad_norm": 0.22098202089551283, + "learning_rate": 9.575945069715371e-05, + "loss": 3.2099, + "step": 10576 + }, + { + "epoch": 0.656589484139301, + "grad_norm": 0.2380231726063329, + "learning_rate": 9.57579950436201e-05, + "loss": 3.1577, + "step": 10577 + }, + { + "epoch": 0.6566515612390589, + "grad_norm": 0.18059783959036493, + "learning_rate": 9.575653915135593e-05, + "loss": 3.132, + "step": 10578 + }, + { + "epoch": 0.6567136383388168, + "grad_norm": 0.2253154189519913, + "learning_rate": 9.57550830203688e-05, + "loss": 3.197, + "step": 10579 + }, + { + "epoch": 0.6567757154385747, + "grad_norm": 0.19879937441843049, + "learning_rate": 9.575362665066627e-05, + "loss": 3.1269, + "step": 10580 + }, + { + "epoch": 0.6568377925383326, + "grad_norm": 0.18015737878953664, + "learning_rate": 9.575217004225598e-05, + "loss": 3.1316, + "step": 10581 + }, + { + "epoch": 0.6568998696380905, + "grad_norm": 0.19521375067110494, + "learning_rate": 9.57507131951455e-05, + "loss": 3.0852, + "step": 10582 + }, + { + "epoch": 0.6569619467378484, + "grad_norm": 0.19788393055239542, + "learning_rate": 9.574925610934245e-05, + "loss": 3.1922, + "step": 10583 + }, + { + "epoch": 0.6570240238376063, + "grad_norm": 0.18969696123449778, + "learning_rate": 9.574779878485443e-05, + "loss": 3.1871, + "step": 10584 + }, + { + "epoch": 0.6570861009373642, + "grad_norm": 0.1605013757354089, + "learning_rate": 9.574634122168902e-05, + "loss": 3.1304, + "step": 10585 + }, + { + "epoch": 0.6571481780371221, + "grad_norm": 0.23598662837534487, + "learning_rate": 9.574488341985385e-05, + "loss": 3.1786, + "step": 10586 + }, + { + "epoch": 0.65721025513688, + "grad_norm": 0.3123298539452521, + "learning_rate": 9.574342537935653e-05, + "loss": 3.126, + "step": 10587 + }, + { + "epoch": 0.657272332236638, + "grad_norm": 0.23690717078592172, + "learning_rate": 9.574196710020465e-05, + "loss": 3.0764, + "step": 10588 + }, + { + "epoch": 0.6573344093363958, + "grad_norm": 0.22358360748692455, + "learning_rate": 9.574050858240582e-05, + "loss": 3.1944, + "step": 10589 + }, + { + "epoch": 0.6573964864361537, + "grad_norm": 0.212137947827611, + "learning_rate": 9.573904982596766e-05, + "loss": 3.2376, + "step": 10590 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.20611743062141683, + "learning_rate": 9.573759083089778e-05, + "loss": 3.1752, + "step": 10591 + }, + { + "epoch": 0.6575206406356695, + "grad_norm": 0.2029481723359369, + "learning_rate": 9.573613159720376e-05, + "loss": 3.1127, + "step": 10592 + }, + { + "epoch": 0.6575827177354274, + "grad_norm": 0.2285753400329035, + "learning_rate": 9.573467212489325e-05, + "loss": 3.078, + "step": 10593 + }, + { + "epoch": 0.6576447948351853, + "grad_norm": 0.23739218178156207, + "learning_rate": 9.573321241397385e-05, + "loss": 3.1683, + "step": 10594 + }, + { + "epoch": 0.6577068719349431, + "grad_norm": 0.22238214251137403, + "learning_rate": 9.573175246445318e-05, + "loss": 3.0752, + "step": 10595 + }, + { + "epoch": 0.6577689490347011, + "grad_norm": 0.18763258339311636, + "learning_rate": 9.573029227633887e-05, + "loss": 3.164, + "step": 10596 + }, + { + "epoch": 0.657831026134459, + "grad_norm": 0.24889982558853918, + "learning_rate": 9.57288318496385e-05, + "loss": 3.0559, + "step": 10597 + }, + { + "epoch": 0.6578931032342169, + "grad_norm": 0.2147180970084452, + "learning_rate": 9.572737118435972e-05, + "loss": 3.2125, + "step": 10598 + }, + { + "epoch": 0.6579551803339748, + "grad_norm": 0.19778631439270716, + "learning_rate": 9.572591028051017e-05, + "loss": 3.0462, + "step": 10599 + }, + { + "epoch": 0.6580172574337327, + "grad_norm": 0.24981769232642356, + "learning_rate": 9.572444913809742e-05, + "loss": 3.022, + "step": 10600 + }, + { + "epoch": 0.6580793345334905, + "grad_norm": 0.23497974735859836, + "learning_rate": 9.572298775712912e-05, + "loss": 3.2036, + "step": 10601 + }, + { + "epoch": 0.6581414116332485, + "grad_norm": 0.20796814840331504, + "learning_rate": 9.572152613761289e-05, + "loss": 3.0491, + "step": 10602 + }, + { + "epoch": 0.6582034887330064, + "grad_norm": 0.18162294527376235, + "learning_rate": 9.572006427955638e-05, + "loss": 3.1783, + "step": 10603 + }, + { + "epoch": 0.6582655658327643, + "grad_norm": 0.3505684362096947, + "learning_rate": 9.571860218296718e-05, + "loss": 3.2036, + "step": 10604 + }, + { + "epoch": 0.6583276429325222, + "grad_norm": 0.30251077577613344, + "learning_rate": 9.571713984785293e-05, + "loss": 3.2187, + "step": 10605 + }, + { + "epoch": 0.6583897200322801, + "grad_norm": 0.22137537835905718, + "learning_rate": 9.571567727422127e-05, + "loss": 3.1003, + "step": 10606 + }, + { + "epoch": 0.6584517971320379, + "grad_norm": 0.2839123100553552, + "learning_rate": 9.571421446207982e-05, + "loss": 3.1929, + "step": 10607 + }, + { + "epoch": 0.6585138742317959, + "grad_norm": 0.21935208092213965, + "learning_rate": 9.571275141143623e-05, + "loss": 3.1625, + "step": 10608 + }, + { + "epoch": 0.6585759513315538, + "grad_norm": 0.2703655984578151, + "learning_rate": 9.571128812229811e-05, + "loss": 3.0894, + "step": 10609 + }, + { + "epoch": 0.6586380284313117, + "grad_norm": 0.27642208403351465, + "learning_rate": 9.570982459467309e-05, + "loss": 3.208, + "step": 10610 + }, + { + "epoch": 0.6587001055310696, + "grad_norm": 0.2161349510226341, + "learning_rate": 9.570836082856882e-05, + "loss": 3.1857, + "step": 10611 + }, + { + "epoch": 0.6587621826308275, + "grad_norm": 0.25235184458690885, + "learning_rate": 9.570689682399295e-05, + "loss": 3.155, + "step": 10612 + }, + { + "epoch": 0.6588242597305853, + "grad_norm": 0.22424613772078247, + "learning_rate": 9.57054325809531e-05, + "loss": 3.0336, + "step": 10613 + }, + { + "epoch": 0.6588863368303433, + "grad_norm": 0.2892102173381738, + "learning_rate": 9.570396809945692e-05, + "loss": 3.226, + "step": 10614 + }, + { + "epoch": 0.6589484139301012, + "grad_norm": 0.2027536629264804, + "learning_rate": 9.570250337951204e-05, + "loss": 3.1779, + "step": 10615 + }, + { + "epoch": 0.6590104910298591, + "grad_norm": 0.21841696523978057, + "learning_rate": 9.570103842112609e-05, + "loss": 3.1484, + "step": 10616 + }, + { + "epoch": 0.659072568129617, + "grad_norm": 0.24094443328569362, + "learning_rate": 9.569957322430673e-05, + "loss": 3.1854, + "step": 10617 + }, + { + "epoch": 0.6591346452293749, + "grad_norm": 0.25668034355046065, + "learning_rate": 9.56981077890616e-05, + "loss": 3.1869, + "step": 10618 + }, + { + "epoch": 0.6591967223291327, + "grad_norm": 0.17873403264468213, + "learning_rate": 9.569664211539837e-05, + "loss": 3.0568, + "step": 10619 + }, + { + "epoch": 0.6592587994288907, + "grad_norm": 0.19794189560023284, + "learning_rate": 9.569517620332466e-05, + "loss": 3.1352, + "step": 10620 + }, + { + "epoch": 0.6593208765286486, + "grad_norm": 0.23258788327013158, + "learning_rate": 9.569371005284811e-05, + "loss": 3.178, + "step": 10621 + }, + { + "epoch": 0.6593829536284065, + "grad_norm": 0.22431694565451987, + "learning_rate": 9.569224366397638e-05, + "loss": 3.105, + "step": 10622 + }, + { + "epoch": 0.6594450307281644, + "grad_norm": 0.20100036272566602, + "learning_rate": 9.569077703671714e-05, + "loss": 3.1215, + "step": 10623 + }, + { + "epoch": 0.6595071078279223, + "grad_norm": 0.2686308319342733, + "learning_rate": 9.568931017107802e-05, + "loss": 3.1119, + "step": 10624 + }, + { + "epoch": 0.6595691849276801, + "grad_norm": 0.19511014946662952, + "learning_rate": 9.568784306706666e-05, + "loss": 3.1207, + "step": 10625 + }, + { + "epoch": 0.659631262027438, + "grad_norm": 0.18321835916130363, + "learning_rate": 9.568637572469074e-05, + "loss": 3.1336, + "step": 10626 + }, + { + "epoch": 0.659693339127196, + "grad_norm": 0.19441245599764725, + "learning_rate": 9.568490814395792e-05, + "loss": 3.0237, + "step": 10627 + }, + { + "epoch": 0.6597554162269539, + "grad_norm": 0.19119631679228907, + "learning_rate": 9.568344032487584e-05, + "loss": 3.1858, + "step": 10628 + }, + { + "epoch": 0.6598174933267118, + "grad_norm": 0.1943819097075796, + "learning_rate": 9.568197226745214e-05, + "loss": 3.0196, + "step": 10629 + }, + { + "epoch": 0.6598795704264697, + "grad_norm": 0.20596729251193677, + "learning_rate": 9.568050397169453e-05, + "loss": 3.1514, + "step": 10630 + }, + { + "epoch": 0.6599416475262275, + "grad_norm": 0.19095079573932208, + "learning_rate": 9.567903543761063e-05, + "loss": 3.0836, + "step": 10631 + }, + { + "epoch": 0.6600037246259854, + "grad_norm": 0.18070427730537594, + "learning_rate": 9.567756666520811e-05, + "loss": 3.1303, + "step": 10632 + }, + { + "epoch": 0.6600658017257434, + "grad_norm": 0.2386943956354454, + "learning_rate": 9.567609765449464e-05, + "loss": 3.1072, + "step": 10633 + }, + { + "epoch": 0.6601278788255013, + "grad_norm": 0.20191910307724023, + "learning_rate": 9.567462840547787e-05, + "loss": 3.0863, + "step": 10634 + }, + { + "epoch": 0.6601899559252592, + "grad_norm": 0.1903365078395238, + "learning_rate": 9.56731589181655e-05, + "loss": 3.2168, + "step": 10635 + }, + { + "epoch": 0.6602520330250171, + "grad_norm": 0.1894126337571403, + "learning_rate": 9.567168919256515e-05, + "loss": 3.093, + "step": 10636 + }, + { + "epoch": 0.6603141101247749, + "grad_norm": 0.192420278086446, + "learning_rate": 9.56702192286845e-05, + "loss": 3.1117, + "step": 10637 + }, + { + "epoch": 0.6603761872245328, + "grad_norm": 0.1885971019892616, + "learning_rate": 9.566874902653126e-05, + "loss": 3.1729, + "step": 10638 + }, + { + "epoch": 0.6604382643242908, + "grad_norm": 0.21987784370071156, + "learning_rate": 9.566727858611306e-05, + "loss": 3.1064, + "step": 10639 + }, + { + "epoch": 0.6605003414240487, + "grad_norm": 0.1746215500309904, + "learning_rate": 9.566580790743758e-05, + "loss": 3.0647, + "step": 10640 + }, + { + "epoch": 0.6605624185238066, + "grad_norm": 0.19139475690629568, + "learning_rate": 9.566433699051249e-05, + "loss": 3.0793, + "step": 10641 + }, + { + "epoch": 0.6606244956235645, + "grad_norm": 0.20140527869997282, + "learning_rate": 9.566286583534547e-05, + "loss": 3.1912, + "step": 10642 + }, + { + "epoch": 0.6606865727233223, + "grad_norm": 0.1866153271002236, + "learning_rate": 9.56613944419442e-05, + "loss": 3.0928, + "step": 10643 + }, + { + "epoch": 0.6607486498230802, + "grad_norm": 0.21278296381909653, + "learning_rate": 9.565992281031635e-05, + "loss": 3.1774, + "step": 10644 + }, + { + "epoch": 0.6608107269228382, + "grad_norm": 0.16767985983827585, + "learning_rate": 9.565845094046959e-05, + "loss": 3.1131, + "step": 10645 + }, + { + "epoch": 0.6608728040225961, + "grad_norm": 0.1996363395797637, + "learning_rate": 9.56569788324116e-05, + "loss": 3.097, + "step": 10646 + }, + { + "epoch": 0.660934881122354, + "grad_norm": 0.20830884829323984, + "learning_rate": 9.565550648615008e-05, + "loss": 3.0913, + "step": 10647 + }, + { + "epoch": 0.6609969582221119, + "grad_norm": 0.19905436970802054, + "learning_rate": 9.56540339016927e-05, + "loss": 3.1587, + "step": 10648 + }, + { + "epoch": 0.6610590353218697, + "grad_norm": 0.19396133356679665, + "learning_rate": 9.565256107904714e-05, + "loss": 3.0839, + "step": 10649 + }, + { + "epoch": 0.6611211124216276, + "grad_norm": 0.177808243209489, + "learning_rate": 9.565108801822109e-05, + "loss": 3.0607, + "step": 10650 + }, + { + "epoch": 0.6611831895213856, + "grad_norm": 0.18181278929584105, + "learning_rate": 9.564961471922223e-05, + "loss": 3.2115, + "step": 10651 + }, + { + "epoch": 0.6612452666211435, + "grad_norm": 0.19374933319592327, + "learning_rate": 9.564814118205825e-05, + "loss": 3.107, + "step": 10652 + }, + { + "epoch": 0.6613073437209014, + "grad_norm": 0.1982072320196084, + "learning_rate": 9.564666740673682e-05, + "loss": 3.1066, + "step": 10653 + }, + { + "epoch": 0.6613694208206593, + "grad_norm": 0.21964079136491738, + "learning_rate": 9.564519339326566e-05, + "loss": 3.0687, + "step": 10654 + }, + { + "epoch": 0.6614314979204171, + "grad_norm": 0.20335372554978853, + "learning_rate": 9.564371914165244e-05, + "loss": 3.1982, + "step": 10655 + }, + { + "epoch": 0.661493575020175, + "grad_norm": 0.20041966776243206, + "learning_rate": 9.564224465190487e-05, + "loss": 3.0119, + "step": 10656 + }, + { + "epoch": 0.661555652119933, + "grad_norm": 0.19033015657835572, + "learning_rate": 9.564076992403062e-05, + "loss": 3.1308, + "step": 10657 + }, + { + "epoch": 0.6616177292196909, + "grad_norm": 0.20281944768493582, + "learning_rate": 9.563929495803739e-05, + "loss": 3.1714, + "step": 10658 + }, + { + "epoch": 0.6616798063194488, + "grad_norm": 0.21913434221913647, + "learning_rate": 9.563781975393287e-05, + "loss": 3.1176, + "step": 10659 + }, + { + "epoch": 0.6617418834192067, + "grad_norm": 0.18421661961353356, + "learning_rate": 9.563634431172479e-05, + "loss": 3.1187, + "step": 10660 + }, + { + "epoch": 0.6618039605189645, + "grad_norm": 0.2205745187286121, + "learning_rate": 9.56348686314208e-05, + "loss": 3.0838, + "step": 10661 + }, + { + "epoch": 0.6618660376187224, + "grad_norm": 0.21013980609701052, + "learning_rate": 9.563339271302865e-05, + "loss": 3.1044, + "step": 10662 + }, + { + "epoch": 0.6619281147184803, + "grad_norm": 0.25238315345494333, + "learning_rate": 9.563191655655599e-05, + "loss": 3.1252, + "step": 10663 + }, + { + "epoch": 0.6619901918182383, + "grad_norm": 0.21065803841767003, + "learning_rate": 9.563044016201055e-05, + "loss": 3.1533, + "step": 10664 + }, + { + "epoch": 0.6620522689179962, + "grad_norm": 0.26881252049523857, + "learning_rate": 9.562896352940001e-05, + "loss": 3.0606, + "step": 10665 + }, + { + "epoch": 0.6621143460177541, + "grad_norm": 0.1982562713130219, + "learning_rate": 9.562748665873211e-05, + "loss": 3.0225, + "step": 10666 + }, + { + "epoch": 0.6621764231175119, + "grad_norm": 0.2515344912177625, + "learning_rate": 9.562600955001453e-05, + "loss": 3.1719, + "step": 10667 + }, + { + "epoch": 0.6622385002172698, + "grad_norm": 0.2208414656938083, + "learning_rate": 9.562453220325498e-05, + "loss": 3.1468, + "step": 10668 + }, + { + "epoch": 0.6623005773170277, + "grad_norm": 0.25052293087933075, + "learning_rate": 9.562305461846118e-05, + "loss": 3.2288, + "step": 10669 + }, + { + "epoch": 0.6623626544167857, + "grad_norm": 0.2069520798941548, + "learning_rate": 9.562157679564082e-05, + "loss": 3.2157, + "step": 10670 + }, + { + "epoch": 0.6624247315165436, + "grad_norm": 0.20526685658802873, + "learning_rate": 9.562009873480162e-05, + "loss": 3.1554, + "step": 10671 + }, + { + "epoch": 0.6624868086163015, + "grad_norm": 0.21084676663163204, + "learning_rate": 9.561862043595129e-05, + "loss": 3.1324, + "step": 10672 + }, + { + "epoch": 0.6625488857160593, + "grad_norm": 0.8902462605732875, + "learning_rate": 9.561714189909754e-05, + "loss": 3.1976, + "step": 10673 + }, + { + "epoch": 0.6626109628158172, + "grad_norm": 0.5262471700941096, + "learning_rate": 9.56156631242481e-05, + "loss": 3.1251, + "step": 10674 + }, + { + "epoch": 0.6626730399155751, + "grad_norm": 0.32379149815364333, + "learning_rate": 9.561418411141066e-05, + "loss": 3.1694, + "step": 10675 + }, + { + "epoch": 0.662735117015333, + "grad_norm": 0.30499293838987407, + "learning_rate": 9.561270486059295e-05, + "loss": 3.1487, + "step": 10676 + }, + { + "epoch": 0.662797194115091, + "grad_norm": 0.2206947021227464, + "learning_rate": 9.56112253718027e-05, + "loss": 3.195, + "step": 10677 + }, + { + "epoch": 0.6628592712148489, + "grad_norm": 0.22881969371252098, + "learning_rate": 9.560974564504759e-05, + "loss": 3.1219, + "step": 10678 + }, + { + "epoch": 0.6629213483146067, + "grad_norm": 0.28661848128783723, + "learning_rate": 9.560826568033537e-05, + "loss": 3.069, + "step": 10679 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.21409186662985064, + "learning_rate": 9.560678547767377e-05, + "loss": 3.2301, + "step": 10680 + }, + { + "epoch": 0.6630455025141225, + "grad_norm": 0.3337522000872501, + "learning_rate": 9.560530503707048e-05, + "loss": 3.1533, + "step": 10681 + }, + { + "epoch": 0.6631075796138804, + "grad_norm": 0.29555263393679176, + "learning_rate": 9.560382435853326e-05, + "loss": 3.1573, + "step": 10682 + }, + { + "epoch": 0.6631696567136384, + "grad_norm": 0.24865579581282124, + "learning_rate": 9.560234344206981e-05, + "loss": 3.1137, + "step": 10683 + }, + { + "epoch": 0.6632317338133963, + "grad_norm": 0.23085190555991503, + "learning_rate": 9.560086228768786e-05, + "loss": 3.2581, + "step": 10684 + }, + { + "epoch": 0.6632938109131541, + "grad_norm": 0.25520821331306803, + "learning_rate": 9.559938089539516e-05, + "loss": 3.2086, + "step": 10685 + }, + { + "epoch": 0.663355888012912, + "grad_norm": 0.31902172950875013, + "learning_rate": 9.55978992651994e-05, + "loss": 3.204, + "step": 10686 + }, + { + "epoch": 0.6634179651126699, + "grad_norm": 0.2503948223220453, + "learning_rate": 9.559641739710834e-05, + "loss": 3.1119, + "step": 10687 + }, + { + "epoch": 0.6634800422124278, + "grad_norm": 0.20312937401683775, + "learning_rate": 9.559493529112968e-05, + "loss": 3.0433, + "step": 10688 + }, + { + "epoch": 0.6635421193121858, + "grad_norm": 0.2726114602450019, + "learning_rate": 9.55934529472712e-05, + "loss": 3.1627, + "step": 10689 + }, + { + "epoch": 0.6636041964119437, + "grad_norm": 0.3606097735821731, + "learning_rate": 9.55919703655406e-05, + "loss": 3.2163, + "step": 10690 + }, + { + "epoch": 0.6636662735117015, + "grad_norm": 0.23689722471477442, + "learning_rate": 9.559048754594563e-05, + "loss": 3.1768, + "step": 10691 + }, + { + "epoch": 0.6637283506114594, + "grad_norm": 0.37086809224020395, + "learning_rate": 9.5589004488494e-05, + "loss": 3.0805, + "step": 10692 + }, + { + "epoch": 0.6637904277112173, + "grad_norm": 0.33583297877770224, + "learning_rate": 9.558752119319348e-05, + "loss": 3.1576, + "step": 10693 + }, + { + "epoch": 0.6638525048109752, + "grad_norm": 0.3012412380986192, + "learning_rate": 9.558603766005181e-05, + "loss": 3.1058, + "step": 10694 + }, + { + "epoch": 0.6639145819107332, + "grad_norm": 0.1959599064756993, + "learning_rate": 9.558455388907669e-05, + "loss": 3.1745, + "step": 10695 + }, + { + "epoch": 0.6639766590104911, + "grad_norm": 0.2211451703183092, + "learning_rate": 9.55830698802759e-05, + "loss": 3.0883, + "step": 10696 + }, + { + "epoch": 0.6640387361102489, + "grad_norm": 0.18560004225478122, + "learning_rate": 9.558158563365716e-05, + "loss": 3.2724, + "step": 10697 + }, + { + "epoch": 0.6641008132100068, + "grad_norm": 0.38721214954894695, + "learning_rate": 9.558010114922822e-05, + "loss": 3.2147, + "step": 10698 + }, + { + "epoch": 0.6641628903097647, + "grad_norm": 0.2573290395556565, + "learning_rate": 9.557861642699683e-05, + "loss": 3.061, + "step": 10699 + }, + { + "epoch": 0.6642249674095226, + "grad_norm": 0.3141550582457876, + "learning_rate": 9.557713146697074e-05, + "loss": 3.1746, + "step": 10700 + }, + { + "epoch": 0.6642870445092806, + "grad_norm": 0.24784405744295696, + "learning_rate": 9.557564626915769e-05, + "loss": 3.1274, + "step": 10701 + }, + { + "epoch": 0.6643491216090385, + "grad_norm": 0.2293759964965462, + "learning_rate": 9.557416083356542e-05, + "loss": 3.1806, + "step": 10702 + }, + { + "epoch": 0.6644111987087963, + "grad_norm": 0.20945880194956157, + "learning_rate": 9.557267516020171e-05, + "loss": 3.0731, + "step": 10703 + }, + { + "epoch": 0.6644732758085542, + "grad_norm": 0.22385777481784164, + "learning_rate": 9.557118924907427e-05, + "loss": 3.1859, + "step": 10704 + }, + { + "epoch": 0.6645353529083121, + "grad_norm": 0.2084607579039239, + "learning_rate": 9.55697031001909e-05, + "loss": 3.1224, + "step": 10705 + }, + { + "epoch": 0.66459743000807, + "grad_norm": 0.22007608376864157, + "learning_rate": 9.556821671355929e-05, + "loss": 3.2473, + "step": 10706 + }, + { + "epoch": 0.664659507107828, + "grad_norm": 0.2633905959089998, + "learning_rate": 9.556673008918725e-05, + "loss": 3.1211, + "step": 10707 + }, + { + "epoch": 0.6647215842075859, + "grad_norm": 0.30871590478186245, + "learning_rate": 9.556524322708252e-05, + "loss": 3.1794, + "step": 10708 + }, + { + "epoch": 0.6647836613073437, + "grad_norm": 0.1947650676218956, + "learning_rate": 9.556375612725284e-05, + "loss": 3.1556, + "step": 10709 + }, + { + "epoch": 0.6648457384071016, + "grad_norm": 0.21209445430199708, + "learning_rate": 9.5562268789706e-05, + "loss": 3.2048, + "step": 10710 + }, + { + "epoch": 0.6649078155068595, + "grad_norm": 0.20053456102586015, + "learning_rate": 9.556078121444974e-05, + "loss": 3.1249, + "step": 10711 + }, + { + "epoch": 0.6649698926066174, + "grad_norm": 0.2980477234314608, + "learning_rate": 9.555929340149181e-05, + "loss": 3.1394, + "step": 10712 + }, + { + "epoch": 0.6650319697063753, + "grad_norm": 0.2001801833972268, + "learning_rate": 9.555780535084e-05, + "loss": 3.1622, + "step": 10713 + }, + { + "epoch": 0.6650940468061333, + "grad_norm": 0.24980951294104517, + "learning_rate": 9.555631706250206e-05, + "loss": 3.195, + "step": 10714 + }, + { + "epoch": 0.6651561239058911, + "grad_norm": 0.2550918189866734, + "learning_rate": 9.555482853648576e-05, + "loss": 3.1516, + "step": 10715 + }, + { + "epoch": 0.665218201005649, + "grad_norm": 0.23498837952087276, + "learning_rate": 9.555333977279884e-05, + "loss": 3.0754, + "step": 10716 + }, + { + "epoch": 0.6652802781054069, + "grad_norm": 0.2008456390384615, + "learning_rate": 9.55518507714491e-05, + "loss": 3.1293, + "step": 10717 + }, + { + "epoch": 0.6653423552051648, + "grad_norm": 0.2361655030446224, + "learning_rate": 9.55503615324443e-05, + "loss": 3.195, + "step": 10718 + }, + { + "epoch": 0.6654044323049227, + "grad_norm": 0.22346836190396038, + "learning_rate": 9.55488720557922e-05, + "loss": 3.1728, + "step": 10719 + }, + { + "epoch": 0.6654665094046807, + "grad_norm": 0.2362079235860537, + "learning_rate": 9.554738234150058e-05, + "loss": 3.117, + "step": 10720 + }, + { + "epoch": 0.6655285865044385, + "grad_norm": 0.18152646609973722, + "learning_rate": 9.554589238957721e-05, + "loss": 3.1469, + "step": 10721 + }, + { + "epoch": 0.6655906636041964, + "grad_norm": 0.22204470725890807, + "learning_rate": 9.554440220002986e-05, + "loss": 3.2192, + "step": 10722 + }, + { + "epoch": 0.6656527407039543, + "grad_norm": 0.25273717085693637, + "learning_rate": 9.554291177286632e-05, + "loss": 3.1114, + "step": 10723 + }, + { + "epoch": 0.6657148178037122, + "grad_norm": 0.17987118537428787, + "learning_rate": 9.554142110809434e-05, + "loss": 3.1638, + "step": 10724 + }, + { + "epoch": 0.6657768949034701, + "grad_norm": 0.22723487855836705, + "learning_rate": 9.553993020572172e-05, + "loss": 3.1851, + "step": 10725 + }, + { + "epoch": 0.6658389720032281, + "grad_norm": 0.19468263151593523, + "learning_rate": 9.553843906575621e-05, + "loss": 3.1363, + "step": 10726 + }, + { + "epoch": 0.6659010491029859, + "grad_norm": 0.2751426391874451, + "learning_rate": 9.553694768820562e-05, + "loss": 3.2132, + "step": 10727 + }, + { + "epoch": 0.6659631262027438, + "grad_norm": 0.2638790729367395, + "learning_rate": 9.553545607307773e-05, + "loss": 3.1623, + "step": 10728 + }, + { + "epoch": 0.6660252033025017, + "grad_norm": 0.20174404366881726, + "learning_rate": 9.55339642203803e-05, + "loss": 3.0806, + "step": 10729 + }, + { + "epoch": 0.6660872804022596, + "grad_norm": 0.26704494231394443, + "learning_rate": 9.553247213012114e-05, + "loss": 3.2078, + "step": 10730 + }, + { + "epoch": 0.6661493575020175, + "grad_norm": 0.2320077396131561, + "learning_rate": 9.553097980230799e-05, + "loss": 3.1139, + "step": 10731 + }, + { + "epoch": 0.6662114346017755, + "grad_norm": 0.2520366352875265, + "learning_rate": 9.552948723694868e-05, + "loss": 3.021, + "step": 10732 + }, + { + "epoch": 0.6662735117015333, + "grad_norm": 0.1843274855409004, + "learning_rate": 9.552799443405098e-05, + "loss": 3.1986, + "step": 10733 + }, + { + "epoch": 0.6663355888012912, + "grad_norm": 0.23232535754418165, + "learning_rate": 9.552650139362269e-05, + "loss": 3.1304, + "step": 10734 + }, + { + "epoch": 0.6663976659010491, + "grad_norm": 0.19191425406556883, + "learning_rate": 9.552500811567159e-05, + "loss": 3.1415, + "step": 10735 + }, + { + "epoch": 0.666459743000807, + "grad_norm": 0.20701484795141725, + "learning_rate": 9.552351460020547e-05, + "loss": 3.1352, + "step": 10736 + }, + { + "epoch": 0.6665218201005649, + "grad_norm": 0.20038864290919128, + "learning_rate": 9.552202084723211e-05, + "loss": 3.1333, + "step": 10737 + }, + { + "epoch": 0.6665838972003229, + "grad_norm": 0.1758459713783509, + "learning_rate": 9.552052685675933e-05, + "loss": 3.0296, + "step": 10738 + }, + { + "epoch": 0.6666459743000807, + "grad_norm": 0.2000371913304683, + "learning_rate": 9.551903262879491e-05, + "loss": 3.0483, + "step": 10739 + }, + { + "epoch": 0.6667080513998386, + "grad_norm": 0.17088689492619186, + "learning_rate": 9.551753816334664e-05, + "loss": 3.1446, + "step": 10740 + }, + { + "epoch": 0.6667701284995965, + "grad_norm": 0.17494944753390634, + "learning_rate": 9.551604346042234e-05, + "loss": 3.1316, + "step": 10741 + }, + { + "epoch": 0.6668322055993544, + "grad_norm": 0.19997943310488536, + "learning_rate": 9.551454852002977e-05, + "loss": 3.1197, + "step": 10742 + }, + { + "epoch": 0.6668942826991123, + "grad_norm": 0.2556547555183847, + "learning_rate": 9.551305334217675e-05, + "loss": 3.2167, + "step": 10743 + }, + { + "epoch": 0.6669563597988702, + "grad_norm": 0.28203812746660284, + "learning_rate": 9.55115579268711e-05, + "loss": 3.0481, + "step": 10744 + }, + { + "epoch": 0.667018436898628, + "grad_norm": 0.29263861811325165, + "learning_rate": 9.551006227412059e-05, + "loss": 3.0927, + "step": 10745 + }, + { + "epoch": 0.667080513998386, + "grad_norm": 0.29264765627535333, + "learning_rate": 9.550856638393303e-05, + "loss": 3.0728, + "step": 10746 + }, + { + "epoch": 0.6671425910981439, + "grad_norm": 0.21247192646159577, + "learning_rate": 9.550707025631627e-05, + "loss": 3.1001, + "step": 10747 + }, + { + "epoch": 0.6672046681979018, + "grad_norm": 0.21908013091921485, + "learning_rate": 9.550557389127805e-05, + "loss": 3.1545, + "step": 10748 + }, + { + "epoch": 0.6672667452976597, + "grad_norm": 0.25897181553831267, + "learning_rate": 9.55040772888262e-05, + "loss": 3.1852, + "step": 10749 + }, + { + "epoch": 0.6673288223974176, + "grad_norm": 0.19450424803578786, + "learning_rate": 9.550258044896853e-05, + "loss": 3.1483, + "step": 10750 + }, + { + "epoch": 0.6673908994971754, + "grad_norm": 0.20005312266266442, + "learning_rate": 9.550108337171286e-05, + "loss": 3.1031, + "step": 10751 + }, + { + "epoch": 0.6674529765969334, + "grad_norm": 0.1654836715205687, + "learning_rate": 9.5499586057067e-05, + "loss": 3.1151, + "step": 10752 + }, + { + "epoch": 0.6675150536966913, + "grad_norm": 0.2100962446831512, + "learning_rate": 9.549808850503875e-05, + "loss": 3.1726, + "step": 10753 + }, + { + "epoch": 0.6675771307964492, + "grad_norm": 0.20267906206160421, + "learning_rate": 9.549659071563592e-05, + "loss": 3.0991, + "step": 10754 + }, + { + "epoch": 0.6676392078962071, + "grad_norm": 0.2108504857623751, + "learning_rate": 9.549509268886634e-05, + "loss": 3.0635, + "step": 10755 + }, + { + "epoch": 0.667701284995965, + "grad_norm": 0.18090161351443287, + "learning_rate": 9.549359442473781e-05, + "loss": 3.1686, + "step": 10756 + }, + { + "epoch": 0.6677633620957228, + "grad_norm": 0.2122054404117029, + "learning_rate": 9.549209592325816e-05, + "loss": 3.1409, + "step": 10757 + }, + { + "epoch": 0.6678254391954808, + "grad_norm": 0.19709859518419512, + "learning_rate": 9.54905971844352e-05, + "loss": 3.2609, + "step": 10758 + }, + { + "epoch": 0.6678875162952387, + "grad_norm": 0.24126531011297642, + "learning_rate": 9.548909820827674e-05, + "loss": 3.0484, + "step": 10759 + }, + { + "epoch": 0.6679495933949966, + "grad_norm": 0.242287899556864, + "learning_rate": 9.548759899479063e-05, + "loss": 3.0717, + "step": 10760 + }, + { + "epoch": 0.6680116704947545, + "grad_norm": 0.25400988591190654, + "learning_rate": 9.548609954398467e-05, + "loss": 3.1364, + "step": 10761 + }, + { + "epoch": 0.6680737475945124, + "grad_norm": 0.2438754969092722, + "learning_rate": 9.548459985586668e-05, + "loss": 3.2581, + "step": 10762 + }, + { + "epoch": 0.6681358246942702, + "grad_norm": 0.328197868521189, + "learning_rate": 9.548309993044449e-05, + "loss": 3.1399, + "step": 10763 + }, + { + "epoch": 0.6681979017940282, + "grad_norm": 0.2201892220421401, + "learning_rate": 9.548159976772592e-05, + "loss": 3.1758, + "step": 10764 + }, + { + "epoch": 0.6682599788937861, + "grad_norm": 0.2322859326468874, + "learning_rate": 9.548009936771882e-05, + "loss": 3.1873, + "step": 10765 + }, + { + "epoch": 0.668322055993544, + "grad_norm": 0.21653522992698426, + "learning_rate": 9.5478598730431e-05, + "loss": 3.1073, + "step": 10766 + }, + { + "epoch": 0.6683841330933019, + "grad_norm": 0.18228010186341115, + "learning_rate": 9.547709785587027e-05, + "loss": 3.1156, + "step": 10767 + }, + { + "epoch": 0.6684462101930598, + "grad_norm": 0.19814968299842772, + "learning_rate": 9.54755967440445e-05, + "loss": 3.0633, + "step": 10768 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.1961784240771906, + "learning_rate": 9.547409539496148e-05, + "loss": 3.1613, + "step": 10769 + }, + { + "epoch": 0.6685703643925756, + "grad_norm": 0.21198571220027368, + "learning_rate": 9.54725938086291e-05, + "loss": 3.1165, + "step": 10770 + }, + { + "epoch": 0.6686324414923335, + "grad_norm": 0.20954397627993251, + "learning_rate": 9.547109198505513e-05, + "loss": 3.1204, + "step": 10771 + }, + { + "epoch": 0.6686945185920914, + "grad_norm": 0.1874800015751792, + "learning_rate": 9.546958992424744e-05, + "loss": 3.1027, + "step": 10772 + }, + { + "epoch": 0.6687565956918493, + "grad_norm": 0.204375963237954, + "learning_rate": 9.546808762621386e-05, + "loss": 3.0833, + "step": 10773 + }, + { + "epoch": 0.6688186727916072, + "grad_norm": 0.17517826716113438, + "learning_rate": 9.546658509096225e-05, + "loss": 3.1239, + "step": 10774 + }, + { + "epoch": 0.668880749891365, + "grad_norm": 0.20282187218310851, + "learning_rate": 9.54650823185004e-05, + "loss": 3.1741, + "step": 10775 + }, + { + "epoch": 0.668942826991123, + "grad_norm": 0.2069223855454552, + "learning_rate": 9.54635793088362e-05, + "loss": 3.0978, + "step": 10776 + }, + { + "epoch": 0.6690049040908809, + "grad_norm": 0.1827838973142071, + "learning_rate": 9.546207606197745e-05, + "loss": 3.1409, + "step": 10777 + }, + { + "epoch": 0.6690669811906388, + "grad_norm": 0.2388269320337451, + "learning_rate": 9.546057257793205e-05, + "loss": 3.161, + "step": 10778 + }, + { + "epoch": 0.6691290582903967, + "grad_norm": 0.16714057687247144, + "learning_rate": 9.545906885670777e-05, + "loss": 3.0936, + "step": 10779 + }, + { + "epoch": 0.6691911353901546, + "grad_norm": 0.16519809712435135, + "learning_rate": 9.54575648983125e-05, + "loss": 3.1132, + "step": 10780 + }, + { + "epoch": 0.6692532124899124, + "grad_norm": 0.21000918344161104, + "learning_rate": 9.545606070275408e-05, + "loss": 3.2135, + "step": 10781 + }, + { + "epoch": 0.6693152895896703, + "grad_norm": 0.19642130823609683, + "learning_rate": 9.545455627004036e-05, + "loss": 3.0118, + "step": 10782 + }, + { + "epoch": 0.6693773666894283, + "grad_norm": 0.18097718399899546, + "learning_rate": 9.545305160017919e-05, + "loss": 3.1225, + "step": 10783 + }, + { + "epoch": 0.6694394437891862, + "grad_norm": 0.2066809732025479, + "learning_rate": 9.545154669317839e-05, + "loss": 3.1709, + "step": 10784 + }, + { + "epoch": 0.6695015208889441, + "grad_norm": 0.24410716018204295, + "learning_rate": 9.545004154904586e-05, + "loss": 3.0929, + "step": 10785 + }, + { + "epoch": 0.669563597988702, + "grad_norm": 0.2476405848774398, + "learning_rate": 9.544853616778943e-05, + "loss": 2.9973, + "step": 10786 + }, + { + "epoch": 0.6696256750884598, + "grad_norm": 0.17982531528463822, + "learning_rate": 9.544703054941696e-05, + "loss": 3.2021, + "step": 10787 + }, + { + "epoch": 0.6696877521882177, + "grad_norm": 0.20154964753507884, + "learning_rate": 9.544552469393628e-05, + "loss": 3.2068, + "step": 10788 + }, + { + "epoch": 0.6697498292879757, + "grad_norm": 0.19364029293298088, + "learning_rate": 9.544401860135528e-05, + "loss": 3.0898, + "step": 10789 + }, + { + "epoch": 0.6698119063877336, + "grad_norm": 0.20003309423105733, + "learning_rate": 9.544251227168179e-05, + "loss": 3.0832, + "step": 10790 + }, + { + "epoch": 0.6698739834874915, + "grad_norm": 0.19260812529592267, + "learning_rate": 9.544100570492369e-05, + "loss": 3.1122, + "step": 10791 + }, + { + "epoch": 0.6699360605872494, + "grad_norm": 0.17516906418321831, + "learning_rate": 9.543949890108884e-05, + "loss": 3.1071, + "step": 10792 + }, + { + "epoch": 0.6699981376870072, + "grad_norm": 0.17381243688533532, + "learning_rate": 9.543799186018509e-05, + "loss": 3.089, + "step": 10793 + }, + { + "epoch": 0.6700602147867651, + "grad_norm": 0.30563866262964823, + "learning_rate": 9.54364845822203e-05, + "loss": 3.1506, + "step": 10794 + }, + { + "epoch": 0.6701222918865231, + "grad_norm": 0.26475087959694565, + "learning_rate": 9.543497706720234e-05, + "loss": 3.159, + "step": 10795 + }, + { + "epoch": 0.670184368986281, + "grad_norm": 0.2306771548413111, + "learning_rate": 9.543346931513908e-05, + "loss": 3.2058, + "step": 10796 + }, + { + "epoch": 0.6702464460860389, + "grad_norm": 0.2232334535747551, + "learning_rate": 9.543196132603837e-05, + "loss": 3.168, + "step": 10797 + }, + { + "epoch": 0.6703085231857968, + "grad_norm": 0.22904142385582918, + "learning_rate": 9.54304530999081e-05, + "loss": 3.0531, + "step": 10798 + }, + { + "epoch": 0.6703706002855546, + "grad_norm": 0.2844316080363681, + "learning_rate": 9.542894463675611e-05, + "loss": 3.2489, + "step": 10799 + }, + { + "epoch": 0.6704326773853125, + "grad_norm": 0.2422662037975061, + "learning_rate": 9.542743593659031e-05, + "loss": 3.0382, + "step": 10800 + }, + { + "epoch": 0.6704947544850705, + "grad_norm": 0.26102993121842133, + "learning_rate": 9.542592699941854e-05, + "loss": 3.2433, + "step": 10801 + }, + { + "epoch": 0.6705568315848284, + "grad_norm": 0.2811652559188661, + "learning_rate": 9.54244178252487e-05, + "loss": 3.1531, + "step": 10802 + }, + { + "epoch": 0.6706189086845863, + "grad_norm": 0.217333464663657, + "learning_rate": 9.54229084140886e-05, + "loss": 3.1703, + "step": 10803 + }, + { + "epoch": 0.6706809857843442, + "grad_norm": 0.37921285513880637, + "learning_rate": 9.54213987659462e-05, + "loss": 3.0948, + "step": 10804 + }, + { + "epoch": 0.670743062884102, + "grad_norm": 0.27621083504102123, + "learning_rate": 9.541988888082932e-05, + "loss": 3.1689, + "step": 10805 + }, + { + "epoch": 0.6708051399838599, + "grad_norm": 0.1989783351057914, + "learning_rate": 9.541837875874586e-05, + "loss": 3.1346, + "step": 10806 + }, + { + "epoch": 0.6708672170836179, + "grad_norm": 0.2100683159486989, + "learning_rate": 9.541686839970369e-05, + "loss": 3.1633, + "step": 10807 + }, + { + "epoch": 0.6709292941833758, + "grad_norm": 0.2662363590184687, + "learning_rate": 9.54153578037107e-05, + "loss": 3.1914, + "step": 10808 + }, + { + "epoch": 0.6709913712831337, + "grad_norm": 0.25297869940189477, + "learning_rate": 9.541384697077474e-05, + "loss": 3.177, + "step": 10809 + }, + { + "epoch": 0.6710534483828916, + "grad_norm": 0.3142129798522495, + "learning_rate": 9.541233590090373e-05, + "loss": 3.1542, + "step": 10810 + }, + { + "epoch": 0.6711155254826494, + "grad_norm": 0.2390223142095967, + "learning_rate": 9.541082459410555e-05, + "loss": 3.141, + "step": 10811 + }, + { + "epoch": 0.6711776025824073, + "grad_norm": 0.38802909415580805, + "learning_rate": 9.540931305038806e-05, + "loss": 3.1475, + "step": 10812 + }, + { + "epoch": 0.6712396796821652, + "grad_norm": 0.28946187807920615, + "learning_rate": 9.540780126975915e-05, + "loss": 3.1467, + "step": 10813 + }, + { + "epoch": 0.6713017567819232, + "grad_norm": 0.2203584951044944, + "learning_rate": 9.540628925222675e-05, + "loss": 3.1908, + "step": 10814 + }, + { + "epoch": 0.6713638338816811, + "grad_norm": 0.25970527119888953, + "learning_rate": 9.540477699779869e-05, + "loss": 3.1384, + "step": 10815 + }, + { + "epoch": 0.671425910981439, + "grad_norm": 0.3383934491267744, + "learning_rate": 9.540326450648289e-05, + "loss": 3.1834, + "step": 10816 + }, + { + "epoch": 0.6714879880811968, + "grad_norm": 0.30577291433427556, + "learning_rate": 9.540175177828724e-05, + "loss": 3.0721, + "step": 10817 + }, + { + "epoch": 0.6715500651809547, + "grad_norm": 0.25339826974029656, + "learning_rate": 9.540023881321962e-05, + "loss": 3.1444, + "step": 10818 + }, + { + "epoch": 0.6716121422807126, + "grad_norm": 0.25982180683387485, + "learning_rate": 9.539872561128796e-05, + "loss": 3.1036, + "step": 10819 + }, + { + "epoch": 0.6716742193804706, + "grad_norm": 0.251494707723036, + "learning_rate": 9.53972121725001e-05, + "loss": 3.1822, + "step": 10820 + }, + { + "epoch": 0.6717362964802285, + "grad_norm": 0.2296264499183485, + "learning_rate": 9.539569849686398e-05, + "loss": 3.0771, + "step": 10821 + }, + { + "epoch": 0.6717983735799864, + "grad_norm": 0.24853412200772498, + "learning_rate": 9.539418458438746e-05, + "loss": 3.065, + "step": 10822 + }, + { + "epoch": 0.6718604506797442, + "grad_norm": 0.26430678044316686, + "learning_rate": 9.539267043507849e-05, + "loss": 3.1634, + "step": 10823 + }, + { + "epoch": 0.6719225277795021, + "grad_norm": 0.186424159047426, + "learning_rate": 9.539115604894492e-05, + "loss": 3.1156, + "step": 10824 + }, + { + "epoch": 0.67198460487926, + "grad_norm": 0.19502539513663694, + "learning_rate": 9.538964142599467e-05, + "loss": 3.1035, + "step": 10825 + }, + { + "epoch": 0.672046681979018, + "grad_norm": 0.1820859372584205, + "learning_rate": 9.538812656623564e-05, + "loss": 3.1694, + "step": 10826 + }, + { + "epoch": 0.6721087590787759, + "grad_norm": 0.1878730519854276, + "learning_rate": 9.538661146967574e-05, + "loss": 3.1943, + "step": 10827 + }, + { + "epoch": 0.6721708361785338, + "grad_norm": 0.228146368815681, + "learning_rate": 9.538509613632287e-05, + "loss": 3.1944, + "step": 10828 + }, + { + "epoch": 0.6722329132782916, + "grad_norm": 0.1885613660813055, + "learning_rate": 9.538358056618495e-05, + "loss": 3.1191, + "step": 10829 + }, + { + "epoch": 0.6722949903780495, + "grad_norm": 0.22377263824391538, + "learning_rate": 9.538206475926985e-05, + "loss": 3.1925, + "step": 10830 + }, + { + "epoch": 0.6723570674778074, + "grad_norm": 0.19389953687729597, + "learning_rate": 9.53805487155855e-05, + "loss": 3.1367, + "step": 10831 + }, + { + "epoch": 0.6724191445775654, + "grad_norm": 0.2971126082367551, + "learning_rate": 9.537903243513984e-05, + "loss": 3.1632, + "step": 10832 + }, + { + "epoch": 0.6724812216773233, + "grad_norm": 0.18244897775852378, + "learning_rate": 9.537751591794073e-05, + "loss": 3.1143, + "step": 10833 + }, + { + "epoch": 0.6725432987770812, + "grad_norm": 0.4020429431150319, + "learning_rate": 9.53759991639961e-05, + "loss": 3.1732, + "step": 10834 + }, + { + "epoch": 0.672605375876839, + "grad_norm": 0.2766249704658296, + "learning_rate": 9.537448217331387e-05, + "loss": 3.0803, + "step": 10835 + }, + { + "epoch": 0.6726674529765969, + "grad_norm": 0.21541149377299543, + "learning_rate": 9.537296494590196e-05, + "loss": 3.0923, + "step": 10836 + }, + { + "epoch": 0.6727295300763548, + "grad_norm": 0.22520922297767518, + "learning_rate": 9.537144748176827e-05, + "loss": 3.1384, + "step": 10837 + }, + { + "epoch": 0.6727916071761127, + "grad_norm": 0.18978736338323765, + "learning_rate": 9.536992978092071e-05, + "loss": 3.1714, + "step": 10838 + }, + { + "epoch": 0.6728536842758707, + "grad_norm": 0.24652680637872673, + "learning_rate": 9.536841184336724e-05, + "loss": 3.1141, + "step": 10839 + }, + { + "epoch": 0.6729157613756286, + "grad_norm": 0.19672564449757374, + "learning_rate": 9.536689366911574e-05, + "loss": 3.1734, + "step": 10840 + }, + { + "epoch": 0.6729778384753864, + "grad_norm": 0.38647933253416195, + "learning_rate": 9.536537525817415e-05, + "loss": 3.1656, + "step": 10841 + }, + { + "epoch": 0.6730399155751443, + "grad_norm": 0.22681335833469343, + "learning_rate": 9.536385661055037e-05, + "loss": 3.1348, + "step": 10842 + }, + { + "epoch": 0.6731019926749022, + "grad_norm": 0.24464663900036807, + "learning_rate": 9.536233772625235e-05, + "loss": 3.176, + "step": 10843 + }, + { + "epoch": 0.6731640697746601, + "grad_norm": 0.2769976592206001, + "learning_rate": 9.536081860528799e-05, + "loss": 3.0305, + "step": 10844 + }, + { + "epoch": 0.6732261468744181, + "grad_norm": 0.2600954299720336, + "learning_rate": 9.535929924766524e-05, + "loss": 3.2113, + "step": 10845 + }, + { + "epoch": 0.673288223974176, + "grad_norm": 0.20860753888906555, + "learning_rate": 9.5357779653392e-05, + "loss": 3.0289, + "step": 10846 + }, + { + "epoch": 0.6733503010739338, + "grad_norm": 0.3109157099349114, + "learning_rate": 9.535625982247622e-05, + "loss": 3.1643, + "step": 10847 + }, + { + "epoch": 0.6734123781736917, + "grad_norm": 0.23547385429818835, + "learning_rate": 9.535473975492584e-05, + "loss": 3.1097, + "step": 10848 + }, + { + "epoch": 0.6734744552734496, + "grad_norm": 0.4097800391383024, + "learning_rate": 9.535321945074876e-05, + "loss": 3.2276, + "step": 10849 + }, + { + "epoch": 0.6735365323732075, + "grad_norm": 0.2275358535474836, + "learning_rate": 9.53516989099529e-05, + "loss": 3.067, + "step": 10850 + }, + { + "epoch": 0.6735986094729655, + "grad_norm": 0.2535924439505311, + "learning_rate": 9.535017813254626e-05, + "loss": 3.1247, + "step": 10851 + }, + { + "epoch": 0.6736606865727234, + "grad_norm": 0.2693689382804829, + "learning_rate": 9.534865711853669e-05, + "loss": 3.0833, + "step": 10852 + }, + { + "epoch": 0.6737227636724812, + "grad_norm": 0.23557247383356625, + "learning_rate": 9.534713586793219e-05, + "loss": 3.0947, + "step": 10853 + }, + { + "epoch": 0.6737848407722391, + "grad_norm": 0.21766036138601727, + "learning_rate": 9.534561438074068e-05, + "loss": 3.1607, + "step": 10854 + }, + { + "epoch": 0.673846917871997, + "grad_norm": 0.1931195746897385, + "learning_rate": 9.534409265697007e-05, + "loss": 3.1258, + "step": 10855 + }, + { + "epoch": 0.6739089949717549, + "grad_norm": 0.22585049923505282, + "learning_rate": 9.534257069662832e-05, + "loss": 3.1935, + "step": 10856 + }, + { + "epoch": 0.6739710720715129, + "grad_norm": 0.23708663152625237, + "learning_rate": 9.534104849972341e-05, + "loss": 3.176, + "step": 10857 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.2549417800216723, + "learning_rate": 9.533952606626321e-05, + "loss": 3.158, + "step": 10858 + }, + { + "epoch": 0.6740952262710286, + "grad_norm": 0.21622780697010957, + "learning_rate": 9.533800339625569e-05, + "loss": 3.0328, + "step": 10859 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.2161525309723791, + "learning_rate": 9.533648048970882e-05, + "loss": 3.0244, + "step": 10860 + }, + { + "epoch": 0.6742193804705444, + "grad_norm": 0.2935094675449318, + "learning_rate": 9.533495734663051e-05, + "loss": 3.1121, + "step": 10861 + }, + { + "epoch": 0.6742814575703023, + "grad_norm": 0.34180102754522707, + "learning_rate": 9.533343396702874e-05, + "loss": 3.1027, + "step": 10862 + }, + { + "epoch": 0.6743435346700603, + "grad_norm": 0.24870428689650625, + "learning_rate": 9.533191035091142e-05, + "loss": 3.0954, + "step": 10863 + }, + { + "epoch": 0.6744056117698182, + "grad_norm": 0.2941409502369576, + "learning_rate": 9.533038649828654e-05, + "loss": 3.108, + "step": 10864 + }, + { + "epoch": 0.674467688869576, + "grad_norm": 0.22018279288558415, + "learning_rate": 9.532886240916201e-05, + "loss": 3.1821, + "step": 10865 + }, + { + "epoch": 0.6745297659693339, + "grad_norm": 0.2565614372026963, + "learning_rate": 9.532733808354581e-05, + "loss": 3.2226, + "step": 10866 + }, + { + "epoch": 0.6745918430690918, + "grad_norm": 0.2030035444423522, + "learning_rate": 9.53258135214459e-05, + "loss": 3.1659, + "step": 10867 + }, + { + "epoch": 0.6746539201688497, + "grad_norm": 0.21923922430230688, + "learning_rate": 9.53242887228702e-05, + "loss": 3.1278, + "step": 10868 + }, + { + "epoch": 0.6747159972686076, + "grad_norm": 0.21008087990551474, + "learning_rate": 9.532276368782667e-05, + "loss": 3.1828, + "step": 10869 + }, + { + "epoch": 0.6747780743683655, + "grad_norm": 0.3241663281447957, + "learning_rate": 9.53212384163233e-05, + "loss": 3.1792, + "step": 10870 + }, + { + "epoch": 0.6748401514681234, + "grad_norm": 0.24269862998071573, + "learning_rate": 9.531971290836801e-05, + "loss": 3.2182, + "step": 10871 + }, + { + "epoch": 0.6749022285678813, + "grad_norm": 0.1844161565557108, + "learning_rate": 9.531818716396878e-05, + "loss": 3.1751, + "step": 10872 + }, + { + "epoch": 0.6749643056676392, + "grad_norm": 0.20756851515745045, + "learning_rate": 9.531666118313359e-05, + "loss": 3.1133, + "step": 10873 + }, + { + "epoch": 0.6750263827673971, + "grad_norm": 0.18135963774230124, + "learning_rate": 9.531513496587035e-05, + "loss": 3.1583, + "step": 10874 + }, + { + "epoch": 0.675088459867155, + "grad_norm": 0.21105776113658067, + "learning_rate": 9.531360851218706e-05, + "loss": 3.0243, + "step": 10875 + }, + { + "epoch": 0.6751505369669129, + "grad_norm": 0.1774222074377052, + "learning_rate": 9.531208182209168e-05, + "loss": 3.0931, + "step": 10876 + }, + { + "epoch": 0.6752126140666708, + "grad_norm": 0.46508111423151444, + "learning_rate": 9.531055489559217e-05, + "loss": 3.2469, + "step": 10877 + }, + { + "epoch": 0.6752746911664287, + "grad_norm": 0.28281614238016306, + "learning_rate": 9.530902773269647e-05, + "loss": 3.1381, + "step": 10878 + }, + { + "epoch": 0.6753367682661866, + "grad_norm": 0.29696765507830697, + "learning_rate": 9.53075003334126e-05, + "loss": 3.1775, + "step": 10879 + }, + { + "epoch": 0.6753988453659445, + "grad_norm": 0.21396049658756747, + "learning_rate": 9.53059726977485e-05, + "loss": 3.0837, + "step": 10880 + }, + { + "epoch": 0.6754609224657024, + "grad_norm": 0.2655118673289121, + "learning_rate": 9.530444482571213e-05, + "loss": 3.1532, + "step": 10881 + }, + { + "epoch": 0.6755229995654602, + "grad_norm": 0.21973450540385336, + "learning_rate": 9.530291671731148e-05, + "loss": 3.1745, + "step": 10882 + }, + { + "epoch": 0.6755850766652182, + "grad_norm": 0.19930688927828535, + "learning_rate": 9.530138837255452e-05, + "loss": 3.185, + "step": 10883 + }, + { + "epoch": 0.6756471537649761, + "grad_norm": 0.21928832538601398, + "learning_rate": 9.529985979144922e-05, + "loss": 3.1134, + "step": 10884 + }, + { + "epoch": 0.675709230864734, + "grad_norm": 0.23112952341715104, + "learning_rate": 9.529833097400354e-05, + "loss": 3.0127, + "step": 10885 + }, + { + "epoch": 0.6757713079644919, + "grad_norm": 0.3040723092005348, + "learning_rate": 9.529680192022549e-05, + "loss": 3.2063, + "step": 10886 + }, + { + "epoch": 0.6758333850642498, + "grad_norm": 0.27840445066541325, + "learning_rate": 9.529527263012301e-05, + "loss": 3.1731, + "step": 10887 + }, + { + "epoch": 0.6758954621640076, + "grad_norm": 0.2118622764913699, + "learning_rate": 9.529374310370411e-05, + "loss": 3.0102, + "step": 10888 + }, + { + "epoch": 0.6759575392637656, + "grad_norm": 0.1993519954504027, + "learning_rate": 9.529221334097676e-05, + "loss": 3.1084, + "step": 10889 + }, + { + "epoch": 0.6760196163635235, + "grad_norm": 0.1956581776031554, + "learning_rate": 9.529068334194892e-05, + "loss": 3.1239, + "step": 10890 + }, + { + "epoch": 0.6760816934632814, + "grad_norm": 0.25890789542541515, + "learning_rate": 9.52891531066286e-05, + "loss": 3.0289, + "step": 10891 + }, + { + "epoch": 0.6761437705630393, + "grad_norm": 0.28019366308663063, + "learning_rate": 9.528762263502378e-05, + "loss": 3.0833, + "step": 10892 + }, + { + "epoch": 0.6762058476627972, + "grad_norm": 0.21602074994704193, + "learning_rate": 9.528609192714243e-05, + "loss": 3.0904, + "step": 10893 + }, + { + "epoch": 0.676267924762555, + "grad_norm": 0.31037216550260177, + "learning_rate": 9.528456098299254e-05, + "loss": 3.2423, + "step": 10894 + }, + { + "epoch": 0.676330001862313, + "grad_norm": 0.23295436895336308, + "learning_rate": 9.528302980258213e-05, + "loss": 3.1474, + "step": 10895 + }, + { + "epoch": 0.6763920789620709, + "grad_norm": 0.20780402037163775, + "learning_rate": 9.528149838591913e-05, + "loss": 3.0881, + "step": 10896 + }, + { + "epoch": 0.6764541560618288, + "grad_norm": 0.22941174822133173, + "learning_rate": 9.527996673301156e-05, + "loss": 3.1659, + "step": 10897 + }, + { + "epoch": 0.6765162331615867, + "grad_norm": 0.2866985859653754, + "learning_rate": 9.527843484386742e-05, + "loss": 3.2177, + "step": 10898 + }, + { + "epoch": 0.6765783102613446, + "grad_norm": 0.2308173372106292, + "learning_rate": 9.527690271849471e-05, + "loss": 3.0764, + "step": 10899 + }, + { + "epoch": 0.6766403873611024, + "grad_norm": 0.20887408510358185, + "learning_rate": 9.52753703569014e-05, + "loss": 3.0386, + "step": 10900 + }, + { + "epoch": 0.6767024644608604, + "grad_norm": 0.2719254786666833, + "learning_rate": 9.527383775909547e-05, + "loss": 3.0191, + "step": 10901 + }, + { + "epoch": 0.6767645415606183, + "grad_norm": 0.18971366271410248, + "learning_rate": 9.527230492508497e-05, + "loss": 3.0951, + "step": 10902 + }, + { + "epoch": 0.6768266186603762, + "grad_norm": 0.23407249001547897, + "learning_rate": 9.527077185487784e-05, + "loss": 3.1222, + "step": 10903 + }, + { + "epoch": 0.6768886957601341, + "grad_norm": 0.2224497339309444, + "learning_rate": 9.526923854848211e-05, + "loss": 3.0696, + "step": 10904 + }, + { + "epoch": 0.676950772859892, + "grad_norm": 0.19319462741685758, + "learning_rate": 9.526770500590577e-05, + "loss": 3.1562, + "step": 10905 + }, + { + "epoch": 0.6770128499596498, + "grad_norm": 0.19525277178160885, + "learning_rate": 9.526617122715684e-05, + "loss": 2.9689, + "step": 10906 + }, + { + "epoch": 0.6770749270594077, + "grad_norm": 0.3144518810745182, + "learning_rate": 9.526463721224329e-05, + "loss": 3.1553, + "step": 10907 + }, + { + "epoch": 0.6771370041591657, + "grad_norm": 0.37775255335052776, + "learning_rate": 9.526310296117313e-05, + "loss": 3.189, + "step": 10908 + }, + { + "epoch": 0.6771990812589236, + "grad_norm": 0.2193761447322355, + "learning_rate": 9.52615684739544e-05, + "loss": 3.2291, + "step": 10909 + }, + { + "epoch": 0.6772611583586815, + "grad_norm": 0.22574292124879652, + "learning_rate": 9.526003375059507e-05, + "loss": 3.1105, + "step": 10910 + }, + { + "epoch": 0.6773232354584394, + "grad_norm": 0.20003365293137296, + "learning_rate": 9.525849879110315e-05, + "loss": 3.0831, + "step": 10911 + }, + { + "epoch": 0.6773853125581972, + "grad_norm": 0.2632719751416638, + "learning_rate": 9.525696359548666e-05, + "loss": 3.1123, + "step": 10912 + }, + { + "epoch": 0.6774473896579551, + "grad_norm": 0.20995477179412111, + "learning_rate": 9.525542816375359e-05, + "loss": 3.1007, + "step": 10913 + }, + { + "epoch": 0.6775094667577131, + "grad_norm": 0.2371006940481179, + "learning_rate": 9.525389249591199e-05, + "loss": 3.1134, + "step": 10914 + }, + { + "epoch": 0.677571543857471, + "grad_norm": 0.2093106512567267, + "learning_rate": 9.525235659196982e-05, + "loss": 3.109, + "step": 10915 + }, + { + "epoch": 0.6776336209572289, + "grad_norm": 0.2055305848432058, + "learning_rate": 9.525082045193513e-05, + "loss": 3.2005, + "step": 10916 + }, + { + "epoch": 0.6776956980569868, + "grad_norm": 0.18237029945520367, + "learning_rate": 9.524928407581595e-05, + "loss": 3.209, + "step": 10917 + }, + { + "epoch": 0.6777577751567446, + "grad_norm": 0.1966905236692118, + "learning_rate": 9.524774746362025e-05, + "loss": 3.09, + "step": 10918 + }, + { + "epoch": 0.6778198522565025, + "grad_norm": 0.21074346165453053, + "learning_rate": 9.524621061535606e-05, + "loss": 3.1167, + "step": 10919 + }, + { + "epoch": 0.6778819293562605, + "grad_norm": 0.33755326380575007, + "learning_rate": 9.524467353103142e-05, + "loss": 3.0684, + "step": 10920 + }, + { + "epoch": 0.6779440064560184, + "grad_norm": 0.23258752289410736, + "learning_rate": 9.524313621065432e-05, + "loss": 3.18, + "step": 10921 + }, + { + "epoch": 0.6780060835557763, + "grad_norm": 0.2242105758320076, + "learning_rate": 9.524159865423281e-05, + "loss": 3.0458, + "step": 10922 + }, + { + "epoch": 0.6780681606555342, + "grad_norm": 0.2212161877625616, + "learning_rate": 9.524006086177488e-05, + "loss": 3.14, + "step": 10923 + }, + { + "epoch": 0.678130237755292, + "grad_norm": 0.20726399564480052, + "learning_rate": 9.523852283328858e-05, + "loss": 3.1422, + "step": 10924 + }, + { + "epoch": 0.6781923148550499, + "grad_norm": 0.194426903449104, + "learning_rate": 9.523698456878194e-05, + "loss": 3.076, + "step": 10925 + }, + { + "epoch": 0.6782543919548079, + "grad_norm": 0.2143123383458847, + "learning_rate": 9.523544606826295e-05, + "loss": 3.1593, + "step": 10926 + }, + { + "epoch": 0.6783164690545658, + "grad_norm": 0.33527993007588186, + "learning_rate": 9.523390733173966e-05, + "loss": 3.0759, + "step": 10927 + }, + { + "epoch": 0.6783785461543237, + "grad_norm": 0.22745998793123234, + "learning_rate": 9.523236835922009e-05, + "loss": 3.0522, + "step": 10928 + }, + { + "epoch": 0.6784406232540816, + "grad_norm": 0.2573413445059771, + "learning_rate": 9.523082915071229e-05, + "loss": 3.1634, + "step": 10929 + }, + { + "epoch": 0.6785027003538394, + "grad_norm": 0.2327346300301078, + "learning_rate": 9.522928970622425e-05, + "loss": 3.1877, + "step": 10930 + }, + { + "epoch": 0.6785647774535973, + "grad_norm": 0.2248868472300751, + "learning_rate": 9.522775002576405e-05, + "loss": 3.1952, + "step": 10931 + }, + { + "epoch": 0.6786268545533553, + "grad_norm": 0.23427457247291955, + "learning_rate": 9.522621010933969e-05, + "loss": 3.1435, + "step": 10932 + }, + { + "epoch": 0.6786889316531132, + "grad_norm": 0.24674786571644444, + "learning_rate": 9.52246699569592e-05, + "loss": 3.1523, + "step": 10933 + }, + { + "epoch": 0.6787510087528711, + "grad_norm": 0.23579456060465714, + "learning_rate": 9.522312956863064e-05, + "loss": 3.2264, + "step": 10934 + }, + { + "epoch": 0.678813085852629, + "grad_norm": 0.21097768294869082, + "learning_rate": 9.522158894436204e-05, + "loss": 3.1135, + "step": 10935 + }, + { + "epoch": 0.6788751629523868, + "grad_norm": 0.23176915814655133, + "learning_rate": 9.522004808416142e-05, + "loss": 3.0843, + "step": 10936 + }, + { + "epoch": 0.6789372400521447, + "grad_norm": 0.2296985672211251, + "learning_rate": 9.521850698803685e-05, + "loss": 3.059, + "step": 10937 + }, + { + "epoch": 0.6789993171519026, + "grad_norm": 0.23944360175805623, + "learning_rate": 9.521696565599633e-05, + "loss": 3.0935, + "step": 10938 + }, + { + "epoch": 0.6790613942516606, + "grad_norm": 0.19229429890686361, + "learning_rate": 9.521542408804794e-05, + "loss": 3.0746, + "step": 10939 + }, + { + "epoch": 0.6791234713514185, + "grad_norm": 0.21905447002391795, + "learning_rate": 9.521388228419969e-05, + "loss": 3.1826, + "step": 10940 + }, + { + "epoch": 0.6791855484511764, + "grad_norm": 0.1830588217763317, + "learning_rate": 9.521234024445965e-05, + "loss": 3.0626, + "step": 10941 + }, + { + "epoch": 0.6792476255509342, + "grad_norm": 0.19263548161297853, + "learning_rate": 9.521079796883586e-05, + "loss": 3.1625, + "step": 10942 + }, + { + "epoch": 0.6793097026506921, + "grad_norm": 0.19863673818604327, + "learning_rate": 9.520925545733636e-05, + "loss": 3.0659, + "step": 10943 + }, + { + "epoch": 0.67937177975045, + "grad_norm": 0.19958665012963023, + "learning_rate": 9.520771270996919e-05, + "loss": 3.0971, + "step": 10944 + }, + { + "epoch": 0.679433856850208, + "grad_norm": 0.22816386414349304, + "learning_rate": 9.520616972674241e-05, + "loss": 3.1678, + "step": 10945 + }, + { + "epoch": 0.6794959339499659, + "grad_norm": 0.3215313841146285, + "learning_rate": 9.520462650766407e-05, + "loss": 3.1347, + "step": 10946 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.229657223386281, + "learning_rate": 9.520308305274223e-05, + "loss": 3.2018, + "step": 10947 + }, + { + "epoch": 0.6796200881494816, + "grad_norm": 0.24431625915127692, + "learning_rate": 9.520153936198492e-05, + "loss": 3.2761, + "step": 10948 + }, + { + "epoch": 0.6796821652492395, + "grad_norm": 0.288349612847722, + "learning_rate": 9.519999543540021e-05, + "loss": 3.0708, + "step": 10949 + }, + { + "epoch": 0.6797442423489974, + "grad_norm": 0.297609750114265, + "learning_rate": 9.519845127299616e-05, + "loss": 3.2558, + "step": 10950 + }, + { + "epoch": 0.6798063194487554, + "grad_norm": 0.18750875174442644, + "learning_rate": 9.51969068747808e-05, + "loss": 3.2423, + "step": 10951 + }, + { + "epoch": 0.6798683965485133, + "grad_norm": 0.24261343325853527, + "learning_rate": 9.519536224076222e-05, + "loss": 3.1273, + "step": 10952 + }, + { + "epoch": 0.6799304736482712, + "grad_norm": 0.22365026947250866, + "learning_rate": 9.519381737094845e-05, + "loss": 3.078, + "step": 10953 + }, + { + "epoch": 0.679992550748029, + "grad_norm": 0.2347665615348733, + "learning_rate": 9.519227226534758e-05, + "loss": 3.216, + "step": 10954 + }, + { + "epoch": 0.6800546278477869, + "grad_norm": 0.2936472982081619, + "learning_rate": 9.519072692396764e-05, + "loss": 3.1541, + "step": 10955 + }, + { + "epoch": 0.6801167049475448, + "grad_norm": 0.24389852345390348, + "learning_rate": 9.518918134681671e-05, + "loss": 3.1976, + "step": 10956 + }, + { + "epoch": 0.6801787820473028, + "grad_norm": 0.2700114740476803, + "learning_rate": 9.518763553390286e-05, + "loss": 3.0987, + "step": 10957 + }, + { + "epoch": 0.6802408591470607, + "grad_norm": 0.37163089225427076, + "learning_rate": 9.518608948523413e-05, + "loss": 3.1413, + "step": 10958 + }, + { + "epoch": 0.6803029362468186, + "grad_norm": 0.26956827489892365, + "learning_rate": 9.518454320081862e-05, + "loss": 3.0668, + "step": 10959 + }, + { + "epoch": 0.6803650133465764, + "grad_norm": 0.2704914239539302, + "learning_rate": 9.518299668066438e-05, + "loss": 3.0653, + "step": 10960 + }, + { + "epoch": 0.6804270904463343, + "grad_norm": 0.24098480682937862, + "learning_rate": 9.518144992477945e-05, + "loss": 3.1589, + "step": 10961 + }, + { + "epoch": 0.6804891675460922, + "grad_norm": 0.2502717240180977, + "learning_rate": 9.517990293317193e-05, + "loss": 3.1765, + "step": 10962 + }, + { + "epoch": 0.6805512446458502, + "grad_norm": 0.33957464152408967, + "learning_rate": 9.517835570584992e-05, + "loss": 3.1767, + "step": 10963 + }, + { + "epoch": 0.6806133217456081, + "grad_norm": 0.3770536180066919, + "learning_rate": 9.517680824282145e-05, + "loss": 3.1042, + "step": 10964 + }, + { + "epoch": 0.680675398845366, + "grad_norm": 0.2888961777445117, + "learning_rate": 9.51752605440946e-05, + "loss": 3.1424, + "step": 10965 + }, + { + "epoch": 0.6807374759451238, + "grad_norm": 0.25987065135277027, + "learning_rate": 9.517371260967745e-05, + "loss": 3.1401, + "step": 10966 + }, + { + "epoch": 0.6807995530448817, + "grad_norm": 0.2427637643030148, + "learning_rate": 9.517216443957807e-05, + "loss": 3.1749, + "step": 10967 + }, + { + "epoch": 0.6808616301446396, + "grad_norm": 0.22002825053039332, + "learning_rate": 9.517061603380455e-05, + "loss": 3.1446, + "step": 10968 + }, + { + "epoch": 0.6809237072443975, + "grad_norm": 0.2022699276645884, + "learning_rate": 9.516906739236495e-05, + "loss": 3.114, + "step": 10969 + }, + { + "epoch": 0.6809857843441555, + "grad_norm": 0.2876965493604872, + "learning_rate": 9.516751851526738e-05, + "loss": 3.2187, + "step": 10970 + }, + { + "epoch": 0.6810478614439134, + "grad_norm": 0.3074696626080697, + "learning_rate": 9.516596940251987e-05, + "loss": 3.1072, + "step": 10971 + }, + { + "epoch": 0.6811099385436712, + "grad_norm": 0.25980961936177804, + "learning_rate": 9.516442005413056e-05, + "loss": 3.1496, + "step": 10972 + }, + { + "epoch": 0.6811720156434291, + "grad_norm": 0.24662948756043185, + "learning_rate": 9.516287047010749e-05, + "loss": 3.0786, + "step": 10973 + }, + { + "epoch": 0.681234092743187, + "grad_norm": 0.22629032558073509, + "learning_rate": 9.516132065045877e-05, + "loss": 3.0814, + "step": 10974 + }, + { + "epoch": 0.6812961698429449, + "grad_norm": 0.2498895803623026, + "learning_rate": 9.515977059519248e-05, + "loss": 3.0848, + "step": 10975 + }, + { + "epoch": 0.6813582469427029, + "grad_norm": 0.24845220084450617, + "learning_rate": 9.515822030431669e-05, + "loss": 3.0564, + "step": 10976 + }, + { + "epoch": 0.6814203240424608, + "grad_norm": 0.2738817148921936, + "learning_rate": 9.515666977783952e-05, + "loss": 3.1182, + "step": 10977 + }, + { + "epoch": 0.6814824011422186, + "grad_norm": 0.2694004957962713, + "learning_rate": 9.515511901576903e-05, + "loss": 3.0192, + "step": 10978 + }, + { + "epoch": 0.6815444782419765, + "grad_norm": 0.22120519936104896, + "learning_rate": 9.515356801811333e-05, + "loss": 3.2118, + "step": 10979 + }, + { + "epoch": 0.6816065553417344, + "grad_norm": 0.2225421730401309, + "learning_rate": 9.515201678488049e-05, + "loss": 3.1276, + "step": 10980 + }, + { + "epoch": 0.6816686324414923, + "grad_norm": 0.2126069672717965, + "learning_rate": 9.515046531607863e-05, + "loss": 3.0673, + "step": 10981 + }, + { + "epoch": 0.6817307095412503, + "grad_norm": 0.22929060990776715, + "learning_rate": 9.514891361171583e-05, + "loss": 3.1172, + "step": 10982 + }, + { + "epoch": 0.6817927866410082, + "grad_norm": 0.2037884075731192, + "learning_rate": 9.514736167180018e-05, + "loss": 3.148, + "step": 10983 + }, + { + "epoch": 0.681854863740766, + "grad_norm": 0.2156190149195326, + "learning_rate": 9.51458094963398e-05, + "loss": 3.1749, + "step": 10984 + }, + { + "epoch": 0.6819169408405239, + "grad_norm": 0.2987374560863013, + "learning_rate": 9.514425708534276e-05, + "loss": 3.1185, + "step": 10985 + }, + { + "epoch": 0.6819790179402818, + "grad_norm": 0.2066532769323176, + "learning_rate": 9.514270443881718e-05, + "loss": 3.0678, + "step": 10986 + }, + { + "epoch": 0.6820410950400397, + "grad_norm": 0.22603043200148254, + "learning_rate": 9.514115155677115e-05, + "loss": 3.1734, + "step": 10987 + }, + { + "epoch": 0.6821031721397977, + "grad_norm": 0.18674368113305162, + "learning_rate": 9.513959843921276e-05, + "loss": 3.1942, + "step": 10988 + }, + { + "epoch": 0.6821652492395556, + "grad_norm": 0.21140468042810223, + "learning_rate": 9.513804508615014e-05, + "loss": 3.0449, + "step": 10989 + }, + { + "epoch": 0.6822273263393134, + "grad_norm": 0.17594660293311173, + "learning_rate": 9.513649149759138e-05, + "loss": 3.0789, + "step": 10990 + }, + { + "epoch": 0.6822894034390713, + "grad_norm": 0.2490235721363377, + "learning_rate": 9.513493767354458e-05, + "loss": 3.1264, + "step": 10991 + }, + { + "epoch": 0.6823514805388292, + "grad_norm": 0.20387858206749665, + "learning_rate": 9.513338361401787e-05, + "loss": 3.1346, + "step": 10992 + }, + { + "epoch": 0.6824135576385871, + "grad_norm": 0.19168072114257237, + "learning_rate": 9.513182931901932e-05, + "loss": 3.1008, + "step": 10993 + }, + { + "epoch": 0.682475634738345, + "grad_norm": 0.21747916486303853, + "learning_rate": 9.513027478855707e-05, + "loss": 3.0934, + "step": 10994 + }, + { + "epoch": 0.682537711838103, + "grad_norm": 0.23376732568228661, + "learning_rate": 9.512872002263922e-05, + "loss": 3.253, + "step": 10995 + }, + { + "epoch": 0.6825997889378608, + "grad_norm": 0.18554588116831394, + "learning_rate": 9.512716502127388e-05, + "loss": 3.1149, + "step": 10996 + }, + { + "epoch": 0.6826618660376187, + "grad_norm": 0.20060102708735197, + "learning_rate": 9.512560978446917e-05, + "loss": 3.1237, + "step": 10997 + }, + { + "epoch": 0.6827239431373766, + "grad_norm": 0.17830265287799474, + "learning_rate": 9.51240543122332e-05, + "loss": 3.2103, + "step": 10998 + }, + { + "epoch": 0.6827860202371345, + "grad_norm": 0.31721507408290345, + "learning_rate": 9.512249860457407e-05, + "loss": 3.1277, + "step": 10999 + }, + { + "epoch": 0.6828480973368924, + "grad_norm": 0.190544719676035, + "learning_rate": 9.512094266149994e-05, + "loss": 3.0766, + "step": 11000 + }, + { + "epoch": 0.6829101744366504, + "grad_norm": 0.20804378037366097, + "learning_rate": 9.511938648301888e-05, + "loss": 3.2049, + "step": 11001 + }, + { + "epoch": 0.6829722515364082, + "grad_norm": 0.19090572425231625, + "learning_rate": 9.511783006913903e-05, + "loss": 3.2395, + "step": 11002 + }, + { + "epoch": 0.6830343286361661, + "grad_norm": 0.17649724100336656, + "learning_rate": 9.51162734198685e-05, + "loss": 3.0181, + "step": 11003 + }, + { + "epoch": 0.683096405735924, + "grad_norm": 0.19755316070160953, + "learning_rate": 9.511471653521542e-05, + "loss": 3.0237, + "step": 11004 + }, + { + "epoch": 0.6831584828356819, + "grad_norm": 0.19778886650851124, + "learning_rate": 9.511315941518793e-05, + "loss": 3.1605, + "step": 11005 + }, + { + "epoch": 0.6832205599354398, + "grad_norm": 0.21796350780459056, + "learning_rate": 9.511160205979411e-05, + "loss": 3.1215, + "step": 11006 + }, + { + "epoch": 0.6832826370351978, + "grad_norm": 0.18097822350362044, + "learning_rate": 9.511004446904213e-05, + "loss": 3.2121, + "step": 11007 + }, + { + "epoch": 0.6833447141349556, + "grad_norm": 0.17779882541025535, + "learning_rate": 9.51084866429401e-05, + "loss": 3.147, + "step": 11008 + }, + { + "epoch": 0.6834067912347135, + "grad_norm": 0.41216894144152144, + "learning_rate": 9.510692858149613e-05, + "loss": 3.1443, + "step": 11009 + }, + { + "epoch": 0.6834688683344714, + "grad_norm": 0.2701816175827748, + "learning_rate": 9.510537028471837e-05, + "loss": 3.074, + "step": 11010 + }, + { + "epoch": 0.6835309454342293, + "grad_norm": 0.1826632640461092, + "learning_rate": 9.510381175261493e-05, + "loss": 3.1694, + "step": 11011 + }, + { + "epoch": 0.6835930225339872, + "grad_norm": 0.20186332277650804, + "learning_rate": 9.510225298519398e-05, + "loss": 3.1116, + "step": 11012 + }, + { + "epoch": 0.6836550996337452, + "grad_norm": 0.1861165086584678, + "learning_rate": 9.510069398246361e-05, + "loss": 3.1969, + "step": 11013 + }, + { + "epoch": 0.683717176733503, + "grad_norm": 0.23145906592601365, + "learning_rate": 9.509913474443197e-05, + "loss": 3.1163, + "step": 11014 + }, + { + "epoch": 0.6837792538332609, + "grad_norm": 0.2049784372637049, + "learning_rate": 9.509757527110721e-05, + "loss": 3.1571, + "step": 11015 + }, + { + "epoch": 0.6838413309330188, + "grad_norm": 0.1779278446273756, + "learning_rate": 9.509601556249744e-05, + "loss": 3.1247, + "step": 11016 + }, + { + "epoch": 0.6839034080327767, + "grad_norm": 0.19282444256934783, + "learning_rate": 9.509445561861081e-05, + "loss": 3.0728, + "step": 11017 + }, + { + "epoch": 0.6839654851325346, + "grad_norm": 0.20727388990238615, + "learning_rate": 9.509289543945547e-05, + "loss": 3.1751, + "step": 11018 + }, + { + "epoch": 0.6840275622322926, + "grad_norm": 0.2337633735482533, + "learning_rate": 9.509133502503953e-05, + "loss": 3.1843, + "step": 11019 + }, + { + "epoch": 0.6840896393320504, + "grad_norm": 0.19740768076656104, + "learning_rate": 9.508977437537117e-05, + "loss": 3.1656, + "step": 11020 + }, + { + "epoch": 0.6841517164318083, + "grad_norm": 0.19817924439851428, + "learning_rate": 9.508821349045849e-05, + "loss": 3.1929, + "step": 11021 + }, + { + "epoch": 0.6842137935315662, + "grad_norm": 0.3626357260192376, + "learning_rate": 9.508665237030967e-05, + "loss": 3.1214, + "step": 11022 + }, + { + "epoch": 0.6842758706313241, + "grad_norm": 0.30443151431071913, + "learning_rate": 9.508509101493283e-05, + "loss": 3.1759, + "step": 11023 + }, + { + "epoch": 0.684337947731082, + "grad_norm": 0.2387964120411218, + "learning_rate": 9.508352942433613e-05, + "loss": 3.1776, + "step": 11024 + }, + { + "epoch": 0.68440002483084, + "grad_norm": 0.226202909616667, + "learning_rate": 9.508196759852772e-05, + "loss": 3.09, + "step": 11025 + }, + { + "epoch": 0.6844621019305978, + "grad_norm": 0.20979257087329567, + "learning_rate": 9.508040553751571e-05, + "loss": 3.1266, + "step": 11026 + }, + { + "epoch": 0.6845241790303557, + "grad_norm": 0.2461595825666123, + "learning_rate": 9.507884324130831e-05, + "loss": 3.0929, + "step": 11027 + }, + { + "epoch": 0.6845862561301136, + "grad_norm": 0.1974644461602096, + "learning_rate": 9.507728070991364e-05, + "loss": 3.1898, + "step": 11028 + }, + { + "epoch": 0.6846483332298715, + "grad_norm": 0.24638135694792396, + "learning_rate": 9.507571794333985e-05, + "loss": 3.1098, + "step": 11029 + }, + { + "epoch": 0.6847104103296294, + "grad_norm": 0.19587497779892318, + "learning_rate": 9.507415494159508e-05, + "loss": 3.119, + "step": 11030 + }, + { + "epoch": 0.6847724874293873, + "grad_norm": 0.23046205801865818, + "learning_rate": 9.507259170468751e-05, + "loss": 3.1575, + "step": 11031 + }, + { + "epoch": 0.6848345645291452, + "grad_norm": 0.19615447711249753, + "learning_rate": 9.50710282326253e-05, + "loss": 3.0968, + "step": 11032 + }, + { + "epoch": 0.6848966416289031, + "grad_norm": 0.1956385085131025, + "learning_rate": 9.50694645254166e-05, + "loss": 3.0791, + "step": 11033 + }, + { + "epoch": 0.684958718728661, + "grad_norm": 0.1697820603445941, + "learning_rate": 9.506790058306955e-05, + "loss": 3.1625, + "step": 11034 + }, + { + "epoch": 0.6850207958284189, + "grad_norm": 0.18652995832139485, + "learning_rate": 9.506633640559232e-05, + "loss": 3.1451, + "step": 11035 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.15798998827654018, + "learning_rate": 9.506477199299307e-05, + "loss": 3.0933, + "step": 11036 + }, + { + "epoch": 0.6851449500279347, + "grad_norm": 0.1751544958004663, + "learning_rate": 9.506320734527998e-05, + "loss": 3.1162, + "step": 11037 + }, + { + "epoch": 0.6852070271276925, + "grad_norm": 0.1813797153613166, + "learning_rate": 9.506164246246119e-05, + "loss": 3.1517, + "step": 11038 + }, + { + "epoch": 0.6852691042274505, + "grad_norm": 0.20269258027910309, + "learning_rate": 9.506007734454486e-05, + "loss": 3.122, + "step": 11039 + }, + { + "epoch": 0.6853311813272084, + "grad_norm": 0.18830281486319192, + "learning_rate": 9.505851199153918e-05, + "loss": 3.1416, + "step": 11040 + }, + { + "epoch": 0.6853932584269663, + "grad_norm": 0.21602228475172544, + "learning_rate": 9.50569464034523e-05, + "loss": 3.141, + "step": 11041 + }, + { + "epoch": 0.6854553355267242, + "grad_norm": 0.2214777358471217, + "learning_rate": 9.50553805802924e-05, + "loss": 3.0879, + "step": 11042 + }, + { + "epoch": 0.6855174126264821, + "grad_norm": 0.21871288870315644, + "learning_rate": 9.505381452206763e-05, + "loss": 3.1127, + "step": 11043 + }, + { + "epoch": 0.6855794897262399, + "grad_norm": 0.2476007248881734, + "learning_rate": 9.505224822878618e-05, + "loss": 3.1097, + "step": 11044 + }, + { + "epoch": 0.6856415668259979, + "grad_norm": 0.19655009131995937, + "learning_rate": 9.505068170045621e-05, + "loss": 3.0349, + "step": 11045 + }, + { + "epoch": 0.6857036439257558, + "grad_norm": 0.18386679494316252, + "learning_rate": 9.504911493708591e-05, + "loss": 3.1605, + "step": 11046 + }, + { + "epoch": 0.6857657210255137, + "grad_norm": 0.18729841800575064, + "learning_rate": 9.504754793868343e-05, + "loss": 3.1945, + "step": 11047 + }, + { + "epoch": 0.6858277981252716, + "grad_norm": 0.21905617914318135, + "learning_rate": 9.504598070525696e-05, + "loss": 3.2322, + "step": 11048 + }, + { + "epoch": 0.6858898752250295, + "grad_norm": 0.23365555650466055, + "learning_rate": 9.504441323681467e-05, + "loss": 3.0406, + "step": 11049 + }, + { + "epoch": 0.6859519523247873, + "grad_norm": 0.1749740611164669, + "learning_rate": 9.504284553336475e-05, + "loss": 3.1175, + "step": 11050 + }, + { + "epoch": 0.6860140294245453, + "grad_norm": 0.2070803134289938, + "learning_rate": 9.504127759491536e-05, + "loss": 3.1545, + "step": 11051 + }, + { + "epoch": 0.6860761065243032, + "grad_norm": 0.22429911705020603, + "learning_rate": 9.503970942147469e-05, + "loss": 3.1028, + "step": 11052 + }, + { + "epoch": 0.6861381836240611, + "grad_norm": 0.22286957008563743, + "learning_rate": 9.503814101305093e-05, + "loss": 3.0923, + "step": 11053 + }, + { + "epoch": 0.686200260723819, + "grad_norm": 0.2692036365333908, + "learning_rate": 9.503657236965225e-05, + "loss": 3.1183, + "step": 11054 + }, + { + "epoch": 0.6862623378235769, + "grad_norm": 0.2779225770063537, + "learning_rate": 9.503500349128685e-05, + "loss": 3.1376, + "step": 11055 + }, + { + "epoch": 0.6863244149233347, + "grad_norm": 0.27165312976596456, + "learning_rate": 9.503343437796288e-05, + "loss": 3.043, + "step": 11056 + }, + { + "epoch": 0.6863864920230927, + "grad_norm": 0.23139883129944508, + "learning_rate": 9.503186502968858e-05, + "loss": 3.1019, + "step": 11057 + }, + { + "epoch": 0.6864485691228506, + "grad_norm": 0.21884840216588267, + "learning_rate": 9.503029544647209e-05, + "loss": 3.1023, + "step": 11058 + }, + { + "epoch": 0.6865106462226085, + "grad_norm": 0.3202675462207229, + "learning_rate": 9.502872562832162e-05, + "loss": 3.2361, + "step": 11059 + }, + { + "epoch": 0.6865727233223664, + "grad_norm": 0.2061122614982521, + "learning_rate": 9.502715557524535e-05, + "loss": 3.0699, + "step": 11060 + }, + { + "epoch": 0.6866348004221243, + "grad_norm": 0.2280747325592915, + "learning_rate": 9.502558528725149e-05, + "loss": 3.0602, + "step": 11061 + }, + { + "epoch": 0.6866968775218821, + "grad_norm": 0.2378797979662786, + "learning_rate": 9.502401476434821e-05, + "loss": 3.1156, + "step": 11062 + }, + { + "epoch": 0.68675895462164, + "grad_norm": 0.20471081309741865, + "learning_rate": 9.502244400654372e-05, + "loss": 3.2011, + "step": 11063 + }, + { + "epoch": 0.686821031721398, + "grad_norm": 0.28164219002679064, + "learning_rate": 9.502087301384623e-05, + "loss": 3.0582, + "step": 11064 + }, + { + "epoch": 0.6868831088211559, + "grad_norm": 0.22126042213515962, + "learning_rate": 9.501930178626389e-05, + "loss": 3.164, + "step": 11065 + }, + { + "epoch": 0.6869451859209138, + "grad_norm": 0.19819114290977766, + "learning_rate": 9.501773032380493e-05, + "loss": 3.0815, + "step": 11066 + }, + { + "epoch": 0.6870072630206717, + "grad_norm": 0.22314118397412105, + "learning_rate": 9.501615862647755e-05, + "loss": 3.1419, + "step": 11067 + }, + { + "epoch": 0.6870693401204295, + "grad_norm": 0.20327547808464147, + "learning_rate": 9.501458669428993e-05, + "loss": 3.1279, + "step": 11068 + }, + { + "epoch": 0.6871314172201874, + "grad_norm": 0.17550590991261017, + "learning_rate": 9.50130145272503e-05, + "loss": 3.1177, + "step": 11069 + }, + { + "epoch": 0.6871934943199454, + "grad_norm": 0.1841306242818654, + "learning_rate": 9.501144212536684e-05, + "loss": 3.1154, + "step": 11070 + }, + { + "epoch": 0.6872555714197033, + "grad_norm": 0.2122916448354164, + "learning_rate": 9.500986948864775e-05, + "loss": 3.1137, + "step": 11071 + }, + { + "epoch": 0.6873176485194612, + "grad_norm": 0.17985056651277803, + "learning_rate": 9.500829661710124e-05, + "loss": 3.0477, + "step": 11072 + }, + { + "epoch": 0.6873797256192191, + "grad_norm": 0.23518474887756394, + "learning_rate": 9.500672351073554e-05, + "loss": 3.0855, + "step": 11073 + }, + { + "epoch": 0.6874418027189769, + "grad_norm": 0.2654872986149443, + "learning_rate": 9.500515016955884e-05, + "loss": 3.1286, + "step": 11074 + }, + { + "epoch": 0.6875038798187348, + "grad_norm": 0.18347059339657312, + "learning_rate": 9.500357659357932e-05, + "loss": 3.0553, + "step": 11075 + }, + { + "epoch": 0.6875659569184928, + "grad_norm": 0.19942128414038723, + "learning_rate": 9.500200278280525e-05, + "loss": 3.1474, + "step": 11076 + }, + { + "epoch": 0.6876280340182507, + "grad_norm": 0.2050925658454986, + "learning_rate": 9.500042873724477e-05, + "loss": 3.0936, + "step": 11077 + }, + { + "epoch": 0.6876901111180086, + "grad_norm": 0.18125613341970453, + "learning_rate": 9.499885445690615e-05, + "loss": 3.1532, + "step": 11078 + }, + { + "epoch": 0.6877521882177665, + "grad_norm": 0.20966990505083086, + "learning_rate": 9.499727994179759e-05, + "loss": 3.1759, + "step": 11079 + }, + { + "epoch": 0.6878142653175243, + "grad_norm": 0.2107942500958692, + "learning_rate": 9.499570519192728e-05, + "loss": 3.1321, + "step": 11080 + }, + { + "epoch": 0.6878763424172822, + "grad_norm": 0.22313429727604922, + "learning_rate": 9.499413020730345e-05, + "loss": 3.1246, + "step": 11081 + }, + { + "epoch": 0.6879384195170402, + "grad_norm": 0.23682191193463067, + "learning_rate": 9.499255498793434e-05, + "loss": 3.0101, + "step": 11082 + }, + { + "epoch": 0.6880004966167981, + "grad_norm": 0.21944955955316892, + "learning_rate": 9.499097953382814e-05, + "loss": 3.1605, + "step": 11083 + }, + { + "epoch": 0.688062573716556, + "grad_norm": 0.18339315575417364, + "learning_rate": 9.498940384499307e-05, + "loss": 3.0671, + "step": 11084 + }, + { + "epoch": 0.6881246508163139, + "grad_norm": 0.21401493421143547, + "learning_rate": 9.498782792143737e-05, + "loss": 3.0647, + "step": 11085 + }, + { + "epoch": 0.6881867279160717, + "grad_norm": 0.2581972661125536, + "learning_rate": 9.498625176316925e-05, + "loss": 3.1548, + "step": 11086 + }, + { + "epoch": 0.6882488050158296, + "grad_norm": 0.20131512513709673, + "learning_rate": 9.498467537019691e-05, + "loss": 3.0822, + "step": 11087 + }, + { + "epoch": 0.6883108821155876, + "grad_norm": 0.18399988056456276, + "learning_rate": 9.498309874252862e-05, + "loss": 3.1171, + "step": 11088 + }, + { + "epoch": 0.6883729592153455, + "grad_norm": 0.27174217004734547, + "learning_rate": 9.498152188017259e-05, + "loss": 3.1426, + "step": 11089 + }, + { + "epoch": 0.6884350363151034, + "grad_norm": 0.21418766935657388, + "learning_rate": 9.497994478313703e-05, + "loss": 3.0717, + "step": 11090 + }, + { + "epoch": 0.6884971134148613, + "grad_norm": 0.3513957581876867, + "learning_rate": 9.497836745143019e-05, + "loss": 3.1551, + "step": 11091 + }, + { + "epoch": 0.6885591905146191, + "grad_norm": 0.2924025468403248, + "learning_rate": 9.497678988506027e-05, + "loss": 3.1512, + "step": 11092 + }, + { + "epoch": 0.688621267614377, + "grad_norm": 0.275709832970035, + "learning_rate": 9.497521208403552e-05, + "loss": 3.0771, + "step": 11093 + }, + { + "epoch": 0.688683344714135, + "grad_norm": 0.21906557480943772, + "learning_rate": 9.497363404836417e-05, + "loss": 3.0853, + "step": 11094 + }, + { + "epoch": 0.6887454218138929, + "grad_norm": 0.30022677800823144, + "learning_rate": 9.497205577805448e-05, + "loss": 3.232, + "step": 11095 + }, + { + "epoch": 0.6888074989136508, + "grad_norm": 0.3159737751413734, + "learning_rate": 9.497047727311463e-05, + "loss": 3.1548, + "step": 11096 + }, + { + "epoch": 0.6888695760134087, + "grad_norm": 0.2763970834043372, + "learning_rate": 9.496889853355289e-05, + "loss": 3.176, + "step": 11097 + }, + { + "epoch": 0.6889316531131665, + "grad_norm": 0.22307875350315334, + "learning_rate": 9.49673195593775e-05, + "loss": 3.099, + "step": 11098 + }, + { + "epoch": 0.6889937302129244, + "grad_norm": 0.22873073625748447, + "learning_rate": 9.496574035059666e-05, + "loss": 3.1984, + "step": 11099 + }, + { + "epoch": 0.6890558073126823, + "grad_norm": 0.25830667990167133, + "learning_rate": 9.496416090721867e-05, + "loss": 3.1081, + "step": 11100 + }, + { + "epoch": 0.6891178844124403, + "grad_norm": 0.2327523246545586, + "learning_rate": 9.496258122925173e-05, + "loss": 3.0975, + "step": 11101 + }, + { + "epoch": 0.6891799615121982, + "grad_norm": 0.18902319862207154, + "learning_rate": 9.496100131670407e-05, + "loss": 3.1483, + "step": 11102 + }, + { + "epoch": 0.6892420386119561, + "grad_norm": 0.217605543206808, + "learning_rate": 9.495942116958396e-05, + "loss": 3.1632, + "step": 11103 + }, + { + "epoch": 0.6893041157117139, + "grad_norm": 0.26507862782451724, + "learning_rate": 9.495784078789965e-05, + "loss": 3.1227, + "step": 11104 + }, + { + "epoch": 0.6893661928114718, + "grad_norm": 0.1865455055164182, + "learning_rate": 9.495626017165935e-05, + "loss": 3.1956, + "step": 11105 + }, + { + "epoch": 0.6894282699112297, + "grad_norm": 0.2115348464129225, + "learning_rate": 9.495467932087134e-05, + "loss": 3.1117, + "step": 11106 + }, + { + "epoch": 0.6894903470109877, + "grad_norm": 0.1936337614546027, + "learning_rate": 9.495309823554386e-05, + "loss": 3.0737, + "step": 11107 + }, + { + "epoch": 0.6895524241107456, + "grad_norm": 0.255676393379114, + "learning_rate": 9.495151691568513e-05, + "loss": 3.2224, + "step": 11108 + }, + { + "epoch": 0.6896145012105035, + "grad_norm": 0.2719666704652609, + "learning_rate": 9.494993536130345e-05, + "loss": 3.1118, + "step": 11109 + }, + { + "epoch": 0.6896765783102613, + "grad_norm": 0.18000523836076693, + "learning_rate": 9.494835357240703e-05, + "loss": 3.0781, + "step": 11110 + }, + { + "epoch": 0.6897386554100192, + "grad_norm": 0.2384440147564815, + "learning_rate": 9.494677154900414e-05, + "loss": 3.117, + "step": 11111 + }, + { + "epoch": 0.6898007325097771, + "grad_norm": 0.2075057350509402, + "learning_rate": 9.494518929110304e-05, + "loss": 3.1046, + "step": 11112 + }, + { + "epoch": 0.689862809609535, + "grad_norm": 0.32269926084682615, + "learning_rate": 9.494360679871197e-05, + "loss": 3.0855, + "step": 11113 + }, + { + "epoch": 0.689924886709293, + "grad_norm": 0.2991720974959655, + "learning_rate": 9.494202407183918e-05, + "loss": 3.1041, + "step": 11114 + }, + { + "epoch": 0.6899869638090509, + "grad_norm": 0.2773966805897197, + "learning_rate": 9.494044111049296e-05, + "loss": 3.1873, + "step": 11115 + }, + { + "epoch": 0.6900490409088087, + "grad_norm": 0.27689483739428733, + "learning_rate": 9.493885791468155e-05, + "loss": 3.1737, + "step": 11116 + }, + { + "epoch": 0.6901111180085666, + "grad_norm": 0.2724754165575879, + "learning_rate": 9.49372744844132e-05, + "loss": 3.0933, + "step": 11117 + }, + { + "epoch": 0.6901731951083245, + "grad_norm": 0.18777576671256405, + "learning_rate": 9.493569081969618e-05, + "loss": 3.1334, + "step": 11118 + }, + { + "epoch": 0.6902352722080825, + "grad_norm": 0.20232459239039577, + "learning_rate": 9.493410692053874e-05, + "loss": 3.2002, + "step": 11119 + }, + { + "epoch": 0.6902973493078404, + "grad_norm": 0.17748264679569845, + "learning_rate": 9.493252278694918e-05, + "loss": 3.1061, + "step": 11120 + }, + { + "epoch": 0.6903594264075983, + "grad_norm": 0.3188223018175216, + "learning_rate": 9.493093841893574e-05, + "loss": 3.2189, + "step": 11121 + }, + { + "epoch": 0.6904215035073561, + "grad_norm": 0.35581217933344167, + "learning_rate": 9.492935381650668e-05, + "loss": 3.1815, + "step": 11122 + }, + { + "epoch": 0.690483580607114, + "grad_norm": 0.20102778153738465, + "learning_rate": 9.492776897967026e-05, + "loss": 3.0588, + "step": 11123 + }, + { + "epoch": 0.6905456577068719, + "grad_norm": 0.23665379712687207, + "learning_rate": 9.492618390843478e-05, + "loss": 3.1452, + "step": 11124 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.2241789965328955, + "learning_rate": 9.492459860280849e-05, + "loss": 3.1217, + "step": 11125 + }, + { + "epoch": 0.6906698119063878, + "grad_norm": 0.23710297225527513, + "learning_rate": 9.492301306279965e-05, + "loss": 3.1269, + "step": 11126 + }, + { + "epoch": 0.6907318890061457, + "grad_norm": 0.23359725177653518, + "learning_rate": 9.492142728841656e-05, + "loss": 3.1205, + "step": 11127 + }, + { + "epoch": 0.6907939661059035, + "grad_norm": 0.27383538316801886, + "learning_rate": 9.491984127966747e-05, + "loss": 3.1025, + "step": 11128 + }, + { + "epoch": 0.6908560432056614, + "grad_norm": 0.3147895624371091, + "learning_rate": 9.491825503656067e-05, + "loss": 3.1391, + "step": 11129 + }, + { + "epoch": 0.6909181203054193, + "grad_norm": 0.22527531297120315, + "learning_rate": 9.491666855910442e-05, + "loss": 3.1023, + "step": 11130 + }, + { + "epoch": 0.6909801974051772, + "grad_norm": 0.211550281684596, + "learning_rate": 9.491508184730702e-05, + "loss": 3.1296, + "step": 11131 + }, + { + "epoch": 0.6910422745049352, + "grad_norm": 0.20619924274746296, + "learning_rate": 9.491349490117674e-05, + "loss": 3.0416, + "step": 11132 + }, + { + "epoch": 0.6911043516046931, + "grad_norm": 0.2096557794873537, + "learning_rate": 9.491190772072183e-05, + "loss": 3.1202, + "step": 11133 + }, + { + "epoch": 0.6911664287044509, + "grad_norm": 0.18128119561967723, + "learning_rate": 9.491032030595061e-05, + "loss": 3.1514, + "step": 11134 + }, + { + "epoch": 0.6912285058042088, + "grad_norm": 0.19454278911699244, + "learning_rate": 9.490873265687135e-05, + "loss": 3.0971, + "step": 11135 + }, + { + "epoch": 0.6912905829039667, + "grad_norm": 0.18534422398662403, + "learning_rate": 9.490714477349231e-05, + "loss": 3.1334, + "step": 11136 + }, + { + "epoch": 0.6913526600037246, + "grad_norm": 0.2573401864180879, + "learning_rate": 9.490555665582182e-05, + "loss": 2.9921, + "step": 11137 + }, + { + "epoch": 0.6914147371034826, + "grad_norm": 0.20616243808451437, + "learning_rate": 9.490396830386811e-05, + "loss": 3.1313, + "step": 11138 + }, + { + "epoch": 0.6914768142032405, + "grad_norm": 0.2223562176654283, + "learning_rate": 9.490237971763953e-05, + "loss": 3.0918, + "step": 11139 + }, + { + "epoch": 0.6915388913029983, + "grad_norm": 0.2334585551324903, + "learning_rate": 9.490079089714431e-05, + "loss": 3.0803, + "step": 11140 + }, + { + "epoch": 0.6916009684027562, + "grad_norm": 0.2159506115334824, + "learning_rate": 9.489920184239077e-05, + "loss": 3.1338, + "step": 11141 + }, + { + "epoch": 0.6916630455025141, + "grad_norm": 0.21985955346247787, + "learning_rate": 9.489761255338719e-05, + "loss": 3.0902, + "step": 11142 + }, + { + "epoch": 0.691725122602272, + "grad_norm": 0.20417636431479574, + "learning_rate": 9.489602303014187e-05, + "loss": 3.1181, + "step": 11143 + }, + { + "epoch": 0.69178719970203, + "grad_norm": 0.1844284481230422, + "learning_rate": 9.489443327266311e-05, + "loss": 3.1058, + "step": 11144 + }, + { + "epoch": 0.6918492768017879, + "grad_norm": 0.19961392720326362, + "learning_rate": 9.489284328095918e-05, + "loss": 3.1891, + "step": 11145 + }, + { + "epoch": 0.6919113539015457, + "grad_norm": 0.25258658862483546, + "learning_rate": 9.48912530550384e-05, + "loss": 3.1117, + "step": 11146 + }, + { + "epoch": 0.6919734310013036, + "grad_norm": 0.19195752650777864, + "learning_rate": 9.488966259490904e-05, + "loss": 3.0043, + "step": 11147 + }, + { + "epoch": 0.6920355081010615, + "grad_norm": 0.1796802332694133, + "learning_rate": 9.488807190057942e-05, + "loss": 3.1004, + "step": 11148 + }, + { + "epoch": 0.6920975852008194, + "grad_norm": 0.3287490858855792, + "learning_rate": 9.488648097205783e-05, + "loss": 3.0912, + "step": 11149 + }, + { + "epoch": 0.6921596623005773, + "grad_norm": 0.23348000828799578, + "learning_rate": 9.488488980935258e-05, + "loss": 3.0931, + "step": 11150 + }, + { + "epoch": 0.6922217394003353, + "grad_norm": 0.29487336625070676, + "learning_rate": 9.488329841247197e-05, + "loss": 3.0451, + "step": 11151 + }, + { + "epoch": 0.6922838165000931, + "grad_norm": 0.2661362524354793, + "learning_rate": 9.488170678142427e-05, + "loss": 3.0769, + "step": 11152 + }, + { + "epoch": 0.692345893599851, + "grad_norm": 0.24310405404393415, + "learning_rate": 9.488011491621782e-05, + "loss": 3.0178, + "step": 11153 + }, + { + "epoch": 0.6924079706996089, + "grad_norm": 0.27306596855672555, + "learning_rate": 9.487852281686094e-05, + "loss": 3.1039, + "step": 11154 + }, + { + "epoch": 0.6924700477993668, + "grad_norm": 0.22773855084472502, + "learning_rate": 9.487693048336189e-05, + "loss": 3.0624, + "step": 11155 + }, + { + "epoch": 0.6925321248991247, + "grad_norm": 0.21829489641396685, + "learning_rate": 9.4875337915729e-05, + "loss": 3.1047, + "step": 11156 + }, + { + "epoch": 0.6925942019988827, + "grad_norm": 0.21700939724830343, + "learning_rate": 9.487374511397058e-05, + "loss": 3.0492, + "step": 11157 + }, + { + "epoch": 0.6926562790986405, + "grad_norm": 0.20510176567782765, + "learning_rate": 9.487215207809493e-05, + "loss": 3.1758, + "step": 11158 + }, + { + "epoch": 0.6927183561983984, + "grad_norm": 0.35564535595604824, + "learning_rate": 9.487055880811037e-05, + "loss": 3.0498, + "step": 11159 + }, + { + "epoch": 0.6927804332981563, + "grad_norm": 0.24495645397610813, + "learning_rate": 9.486896530402523e-05, + "loss": 3.1309, + "step": 11160 + }, + { + "epoch": 0.6928425103979142, + "grad_norm": 0.2119811430908468, + "learning_rate": 9.486737156584779e-05, + "loss": 3.1233, + "step": 11161 + }, + { + "epoch": 0.6929045874976721, + "grad_norm": 0.21235429066761966, + "learning_rate": 9.48657775935864e-05, + "loss": 3.2471, + "step": 11162 + }, + { + "epoch": 0.6929666645974301, + "grad_norm": 0.20540817586233048, + "learning_rate": 9.486418338724933e-05, + "loss": 3.1451, + "step": 11163 + }, + { + "epoch": 0.6930287416971879, + "grad_norm": 0.21729971161857004, + "learning_rate": 9.486258894684495e-05, + "loss": 3.1576, + "step": 11164 + }, + { + "epoch": 0.6930908187969458, + "grad_norm": 0.1990187051976017, + "learning_rate": 9.486099427238153e-05, + "loss": 3.1302, + "step": 11165 + }, + { + "epoch": 0.6931528958967037, + "grad_norm": 0.1849694855274671, + "learning_rate": 9.485939936386742e-05, + "loss": 3.0488, + "step": 11166 + }, + { + "epoch": 0.6932149729964616, + "grad_norm": 0.30355386873556245, + "learning_rate": 9.485780422131094e-05, + "loss": 3.1063, + "step": 11167 + }, + { + "epoch": 0.6932770500962195, + "grad_norm": 0.17576174627821659, + "learning_rate": 9.48562088447204e-05, + "loss": 3.0995, + "step": 11168 + }, + { + "epoch": 0.6933391271959775, + "grad_norm": 0.24093953976590296, + "learning_rate": 9.485461323410413e-05, + "loss": 3.0332, + "step": 11169 + }, + { + "epoch": 0.6934012042957353, + "grad_norm": 0.23575110967734975, + "learning_rate": 9.485301738947046e-05, + "loss": 3.1313, + "step": 11170 + }, + { + "epoch": 0.6934632813954932, + "grad_norm": 0.23800367854184667, + "learning_rate": 9.485142131082769e-05, + "loss": 3.0882, + "step": 11171 + }, + { + "epoch": 0.6935253584952511, + "grad_norm": 0.19198756595610192, + "learning_rate": 9.484982499818419e-05, + "loss": 3.1748, + "step": 11172 + }, + { + "epoch": 0.693587435595009, + "grad_norm": 0.17878297489963124, + "learning_rate": 9.484822845154826e-05, + "loss": 3.1375, + "step": 11173 + }, + { + "epoch": 0.6936495126947669, + "grad_norm": 0.2101144185320807, + "learning_rate": 9.484663167092822e-05, + "loss": 3.0308, + "step": 11174 + }, + { + "epoch": 0.6937115897945249, + "grad_norm": 0.20545498870774845, + "learning_rate": 9.484503465633243e-05, + "loss": 3.1331, + "step": 11175 + }, + { + "epoch": 0.6937736668942827, + "grad_norm": 0.22732983739959015, + "learning_rate": 9.484343740776921e-05, + "loss": 3.1049, + "step": 11176 + }, + { + "epoch": 0.6938357439940406, + "grad_norm": 0.21801064064803838, + "learning_rate": 9.484183992524689e-05, + "loss": 3.1103, + "step": 11177 + }, + { + "epoch": 0.6938978210937985, + "grad_norm": 0.202646443696657, + "learning_rate": 9.484024220877378e-05, + "loss": 3.0762, + "step": 11178 + }, + { + "epoch": 0.6939598981935564, + "grad_norm": 0.23914788504007675, + "learning_rate": 9.483864425835828e-05, + "loss": 3.1218, + "step": 11179 + }, + { + "epoch": 0.6940219752933143, + "grad_norm": 0.21480975315567546, + "learning_rate": 9.483704607400868e-05, + "loss": 3.1379, + "step": 11180 + }, + { + "epoch": 0.6940840523930722, + "grad_norm": 0.22738784873587134, + "learning_rate": 9.48354476557333e-05, + "loss": 3.0663, + "step": 11181 + }, + { + "epoch": 0.69414612949283, + "grad_norm": 0.2713568408465761, + "learning_rate": 9.483384900354053e-05, + "loss": 3.1411, + "step": 11182 + }, + { + "epoch": 0.694208206592588, + "grad_norm": 0.20247262952231554, + "learning_rate": 9.483225011743867e-05, + "loss": 3.0946, + "step": 11183 + }, + { + "epoch": 0.6942702836923459, + "grad_norm": 0.22338101733491686, + "learning_rate": 9.483065099743609e-05, + "loss": 3.1385, + "step": 11184 + }, + { + "epoch": 0.6943323607921038, + "grad_norm": 0.21730893465726356, + "learning_rate": 9.482905164354111e-05, + "loss": 3.1019, + "step": 11185 + }, + { + "epoch": 0.6943944378918617, + "grad_norm": 0.19250677926647897, + "learning_rate": 9.482745205576209e-05, + "loss": 3.1263, + "step": 11186 + }, + { + "epoch": 0.6944565149916196, + "grad_norm": 0.22073549691508731, + "learning_rate": 9.482585223410738e-05, + "loss": 3.1034, + "step": 11187 + }, + { + "epoch": 0.6945185920913775, + "grad_norm": 0.18845096473603787, + "learning_rate": 9.48242521785853e-05, + "loss": 3.0615, + "step": 11188 + }, + { + "epoch": 0.6945806691911354, + "grad_norm": 0.23944259047866115, + "learning_rate": 9.482265188920423e-05, + "loss": 3.1341, + "step": 11189 + }, + { + "epoch": 0.6946427462908933, + "grad_norm": 0.23665905473589235, + "learning_rate": 9.48210513659725e-05, + "loss": 3.1403, + "step": 11190 + }, + { + "epoch": 0.6947048233906512, + "grad_norm": 0.1895279392509907, + "learning_rate": 9.481945060889845e-05, + "loss": 3.1813, + "step": 11191 + }, + { + "epoch": 0.6947669004904091, + "grad_norm": 0.17569702037758245, + "learning_rate": 9.481784961799046e-05, + "loss": 3.1385, + "step": 11192 + }, + { + "epoch": 0.694828977590167, + "grad_norm": 0.24144074462065285, + "learning_rate": 9.481624839325687e-05, + "loss": 3.1598, + "step": 11193 + }, + { + "epoch": 0.6948910546899248, + "grad_norm": 0.2165942060977359, + "learning_rate": 9.481464693470603e-05, + "loss": 3.1645, + "step": 11194 + }, + { + "epoch": 0.6949531317896828, + "grad_norm": 0.18368209580022246, + "learning_rate": 9.481304524234629e-05, + "loss": 3.1987, + "step": 11195 + }, + { + "epoch": 0.6950152088894407, + "grad_norm": 0.21503085745282371, + "learning_rate": 9.481144331618603e-05, + "loss": 3.1907, + "step": 11196 + }, + { + "epoch": 0.6950772859891986, + "grad_norm": 0.19123627452902772, + "learning_rate": 9.480984115623357e-05, + "loss": 3.1482, + "step": 11197 + }, + { + "epoch": 0.6951393630889565, + "grad_norm": 0.19255654482994897, + "learning_rate": 9.480823876249731e-05, + "loss": 3.1389, + "step": 11198 + }, + { + "epoch": 0.6952014401887144, + "grad_norm": 0.18803669689988092, + "learning_rate": 9.480663613498559e-05, + "loss": 3.1411, + "step": 11199 + }, + { + "epoch": 0.6952635172884722, + "grad_norm": 0.17183185165830525, + "learning_rate": 9.480503327370676e-05, + "loss": 3.0893, + "step": 11200 + }, + { + "epoch": 0.6953255943882302, + "grad_norm": 0.1969030466235361, + "learning_rate": 9.48034301786692e-05, + "loss": 3.1642, + "step": 11201 + }, + { + "epoch": 0.6953876714879881, + "grad_norm": 0.19027951505912863, + "learning_rate": 9.480182684988126e-05, + "loss": 3.0713, + "step": 11202 + }, + { + "epoch": 0.695449748587746, + "grad_norm": 0.19890095102124533, + "learning_rate": 9.480022328735134e-05, + "loss": 3.0832, + "step": 11203 + }, + { + "epoch": 0.6955118256875039, + "grad_norm": 0.2123086481505758, + "learning_rate": 9.479861949108775e-05, + "loss": 3.1363, + "step": 11204 + }, + { + "epoch": 0.6955739027872618, + "grad_norm": 0.17775478874601952, + "learning_rate": 9.479701546109891e-05, + "loss": 3.126, + "step": 11205 + }, + { + "epoch": 0.6956359798870196, + "grad_norm": 0.1734163411009603, + "learning_rate": 9.479541119739314e-05, + "loss": 3.1249, + "step": 11206 + }, + { + "epoch": 0.6956980569867776, + "grad_norm": 0.19188347510405182, + "learning_rate": 9.479380669997886e-05, + "loss": 3.0761, + "step": 11207 + }, + { + "epoch": 0.6957601340865355, + "grad_norm": 0.3077210992572543, + "learning_rate": 9.47922019688644e-05, + "loss": 3.0926, + "step": 11208 + }, + { + "epoch": 0.6958222111862934, + "grad_norm": 0.27055638011568034, + "learning_rate": 9.479059700405816e-05, + "loss": 3.1083, + "step": 11209 + }, + { + "epoch": 0.6958842882860513, + "grad_norm": 0.23547100660917256, + "learning_rate": 9.47889918055685e-05, + "loss": 3.1069, + "step": 11210 + }, + { + "epoch": 0.6959463653858092, + "grad_norm": 0.2129923638639471, + "learning_rate": 9.478738637340378e-05, + "loss": 3.0539, + "step": 11211 + }, + { + "epoch": 0.696008442485567, + "grad_norm": 0.24666365981417238, + "learning_rate": 9.478578070757241e-05, + "loss": 3.0894, + "step": 11212 + }, + { + "epoch": 0.696070519585325, + "grad_norm": 0.2599005097992239, + "learning_rate": 9.478417480808274e-05, + "loss": 3.2463, + "step": 11213 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.20535475828729205, + "learning_rate": 9.478256867494317e-05, + "loss": 3.1409, + "step": 11214 + }, + { + "epoch": 0.6961946737848408, + "grad_norm": 0.2575363934825716, + "learning_rate": 9.478096230816207e-05, + "loss": 3.1512, + "step": 11215 + }, + { + "epoch": 0.6962567508845987, + "grad_norm": 0.27888594588201954, + "learning_rate": 9.477935570774781e-05, + "loss": 3.0465, + "step": 11216 + }, + { + "epoch": 0.6963188279843566, + "grad_norm": 0.18851514036050684, + "learning_rate": 9.477774887370878e-05, + "loss": 3.1082, + "step": 11217 + }, + { + "epoch": 0.6963809050841144, + "grad_norm": 0.23092576151942498, + "learning_rate": 9.477614180605337e-05, + "loss": 3.1464, + "step": 11218 + }, + { + "epoch": 0.6964429821838723, + "grad_norm": 0.22192180536101586, + "learning_rate": 9.477453450478994e-05, + "loss": 3.06, + "step": 11219 + }, + { + "epoch": 0.6965050592836303, + "grad_norm": 0.2278464128424725, + "learning_rate": 9.477292696992692e-05, + "loss": 3.0736, + "step": 11220 + }, + { + "epoch": 0.6965671363833882, + "grad_norm": 0.21596450649205265, + "learning_rate": 9.477131920147266e-05, + "loss": 3.1468, + "step": 11221 + }, + { + "epoch": 0.6966292134831461, + "grad_norm": 0.22476777835115433, + "learning_rate": 9.476971119943557e-05, + "loss": 3.0564, + "step": 11222 + }, + { + "epoch": 0.696691290582904, + "grad_norm": 0.1981644595499571, + "learning_rate": 9.476810296382402e-05, + "loss": 3.0912, + "step": 11223 + }, + { + "epoch": 0.6967533676826618, + "grad_norm": 0.20588482730168983, + "learning_rate": 9.476649449464638e-05, + "loss": 3.1542, + "step": 11224 + }, + { + "epoch": 0.6968154447824197, + "grad_norm": 0.19390234187857816, + "learning_rate": 9.476488579191111e-05, + "loss": 3.095, + "step": 11225 + }, + { + "epoch": 0.6968775218821777, + "grad_norm": 0.18414787106667393, + "learning_rate": 9.476327685562655e-05, + "loss": 3.1714, + "step": 11226 + }, + { + "epoch": 0.6969395989819356, + "grad_norm": 0.2153957644580328, + "learning_rate": 9.476166768580111e-05, + "loss": 3.2627, + "step": 11227 + }, + { + "epoch": 0.6970016760816935, + "grad_norm": 0.19370178768751464, + "learning_rate": 9.476005828244317e-05, + "loss": 3.0054, + "step": 11228 + }, + { + "epoch": 0.6970637531814514, + "grad_norm": 0.22556902669298962, + "learning_rate": 9.475844864556116e-05, + "loss": 3.0474, + "step": 11229 + }, + { + "epoch": 0.6971258302812092, + "grad_norm": 0.296623625490498, + "learning_rate": 9.475683877516345e-05, + "loss": 3.1053, + "step": 11230 + }, + { + "epoch": 0.6971879073809671, + "grad_norm": 0.17531664165153704, + "learning_rate": 9.475522867125844e-05, + "loss": 3.1875, + "step": 11231 + }, + { + "epoch": 0.6972499844807251, + "grad_norm": 0.1845261859118179, + "learning_rate": 9.475361833385454e-05, + "loss": 3.1933, + "step": 11232 + }, + { + "epoch": 0.697312061580483, + "grad_norm": 0.22274611148801043, + "learning_rate": 9.475200776296016e-05, + "loss": 3.1389, + "step": 11233 + }, + { + "epoch": 0.6973741386802409, + "grad_norm": 0.17457625845246988, + "learning_rate": 9.475039695858369e-05, + "loss": 3.1678, + "step": 11234 + }, + { + "epoch": 0.6974362157799988, + "grad_norm": 0.21042184142714893, + "learning_rate": 9.474878592073353e-05, + "loss": 3.092, + "step": 11235 + }, + { + "epoch": 0.6974982928797566, + "grad_norm": 0.24014799379342328, + "learning_rate": 9.474717464941809e-05, + "loss": 3.1135, + "step": 11236 + }, + { + "epoch": 0.6975603699795145, + "grad_norm": 0.20051874225731656, + "learning_rate": 9.474556314464577e-05, + "loss": 3.1424, + "step": 11237 + }, + { + "epoch": 0.6976224470792725, + "grad_norm": 0.2131273108857687, + "learning_rate": 9.474395140642501e-05, + "loss": 3.1169, + "step": 11238 + }, + { + "epoch": 0.6976845241790304, + "grad_norm": 0.21330051160476948, + "learning_rate": 9.474233943476416e-05, + "loss": 3.1204, + "step": 11239 + }, + { + "epoch": 0.6977466012787883, + "grad_norm": 0.22814266221376756, + "learning_rate": 9.474072722967168e-05, + "loss": 3.1055, + "step": 11240 + }, + { + "epoch": 0.6978086783785461, + "grad_norm": 0.2132352611718723, + "learning_rate": 9.473911479115597e-05, + "loss": 3.2319, + "step": 11241 + }, + { + "epoch": 0.697870755478304, + "grad_norm": 0.21565945702437214, + "learning_rate": 9.473750211922543e-05, + "loss": 3.0305, + "step": 11242 + }, + { + "epoch": 0.6979328325780619, + "grad_norm": 0.28331169152118174, + "learning_rate": 9.473588921388848e-05, + "loss": 3.2219, + "step": 11243 + }, + { + "epoch": 0.6979949096778199, + "grad_norm": 0.2981534959661699, + "learning_rate": 9.473427607515353e-05, + "loss": 3.1778, + "step": 11244 + }, + { + "epoch": 0.6980569867775778, + "grad_norm": 0.2746874306280506, + "learning_rate": 9.473266270302902e-05, + "loss": 3.154, + "step": 11245 + }, + { + "epoch": 0.6981190638773357, + "grad_norm": 0.2789184434079271, + "learning_rate": 9.473104909752335e-05, + "loss": 3.1816, + "step": 11246 + }, + { + "epoch": 0.6981811409770935, + "grad_norm": 0.2134184026063517, + "learning_rate": 9.472943525864493e-05, + "loss": 3.0763, + "step": 11247 + }, + { + "epoch": 0.6982432180768514, + "grad_norm": 0.2616191218900966, + "learning_rate": 9.472782118640218e-05, + "loss": 3.152, + "step": 11248 + }, + { + "epoch": 0.6983052951766093, + "grad_norm": 0.2493954516392425, + "learning_rate": 9.472620688080353e-05, + "loss": 3.0521, + "step": 11249 + }, + { + "epoch": 0.6983673722763672, + "grad_norm": 0.27200855449677536, + "learning_rate": 9.47245923418574e-05, + "loss": 3.1055, + "step": 11250 + }, + { + "epoch": 0.6984294493761252, + "grad_norm": 0.22742869602459126, + "learning_rate": 9.472297756957223e-05, + "loss": 3.177, + "step": 11251 + }, + { + "epoch": 0.6984915264758831, + "grad_norm": 0.2804185046612349, + "learning_rate": 9.472136256395641e-05, + "loss": 3.0937, + "step": 11252 + }, + { + "epoch": 0.6985536035756409, + "grad_norm": 0.35838708677689485, + "learning_rate": 9.471974732501841e-05, + "loss": 2.9871, + "step": 11253 + }, + { + "epoch": 0.6986156806753988, + "grad_norm": 0.20625181847550944, + "learning_rate": 9.471813185276662e-05, + "loss": 3.0947, + "step": 11254 + }, + { + "epoch": 0.6986777577751567, + "grad_norm": 0.24671285954426156, + "learning_rate": 9.471651614720946e-05, + "loss": 3.1311, + "step": 11255 + }, + { + "epoch": 0.6987398348749146, + "grad_norm": 0.21019861829448303, + "learning_rate": 9.471490020835541e-05, + "loss": 3.1116, + "step": 11256 + }, + { + "epoch": 0.6988019119746726, + "grad_norm": 0.25124506558844123, + "learning_rate": 9.471328403621284e-05, + "loss": 3.1204, + "step": 11257 + }, + { + "epoch": 0.6988639890744305, + "grad_norm": 0.1907301952553776, + "learning_rate": 9.471166763079025e-05, + "loss": 3.1228, + "step": 11258 + }, + { + "epoch": 0.6989260661741883, + "grad_norm": 0.40902590536233374, + "learning_rate": 9.4710050992096e-05, + "loss": 3.1528, + "step": 11259 + }, + { + "epoch": 0.6989881432739462, + "grad_norm": 0.31614188936005483, + "learning_rate": 9.470843412013859e-05, + "loss": 3.1058, + "step": 11260 + }, + { + "epoch": 0.6990502203737041, + "grad_norm": 0.20735193104188573, + "learning_rate": 9.470681701492641e-05, + "loss": 3.136, + "step": 11261 + }, + { + "epoch": 0.699112297473462, + "grad_norm": 0.18887860402708737, + "learning_rate": 9.470519967646791e-05, + "loss": 3.0898, + "step": 11262 + }, + { + "epoch": 0.69917437457322, + "grad_norm": 0.23227748072006404, + "learning_rate": 9.470358210477153e-05, + "loss": 3.1722, + "step": 11263 + }, + { + "epoch": 0.6992364516729779, + "grad_norm": 0.16306621180952374, + "learning_rate": 9.470196429984573e-05, + "loss": 3.124, + "step": 11264 + }, + { + "epoch": 0.6992985287727357, + "grad_norm": 0.1992676151409269, + "learning_rate": 9.47003462616989e-05, + "loss": 3.2653, + "step": 11265 + }, + { + "epoch": 0.6993606058724936, + "grad_norm": 0.17969886080445005, + "learning_rate": 9.469872799033954e-05, + "loss": 3.1206, + "step": 11266 + }, + { + "epoch": 0.6994226829722515, + "grad_norm": 0.19554028579577393, + "learning_rate": 9.469710948577605e-05, + "loss": 3.1107, + "step": 11267 + }, + { + "epoch": 0.6994847600720094, + "grad_norm": 0.21945166451880435, + "learning_rate": 9.469549074801689e-05, + "loss": 3.1797, + "step": 11268 + }, + { + "epoch": 0.6995468371717674, + "grad_norm": 0.20230862817132209, + "learning_rate": 9.469387177707052e-05, + "loss": 3.1153, + "step": 11269 + }, + { + "epoch": 0.6996089142715253, + "grad_norm": 0.20041573647846966, + "learning_rate": 9.469225257294535e-05, + "loss": 3.1017, + "step": 11270 + }, + { + "epoch": 0.6996709913712831, + "grad_norm": 0.21598605010398567, + "learning_rate": 9.469063313564985e-05, + "loss": 3.1447, + "step": 11271 + }, + { + "epoch": 0.699733068471041, + "grad_norm": 0.25691046558128283, + "learning_rate": 9.468901346519248e-05, + "loss": 3.1284, + "step": 11272 + }, + { + "epoch": 0.6997951455707989, + "grad_norm": 0.21981580829164496, + "learning_rate": 9.468739356158168e-05, + "loss": 3.2423, + "step": 11273 + }, + { + "epoch": 0.6998572226705568, + "grad_norm": 0.26546368188269365, + "learning_rate": 9.46857734248259e-05, + "loss": 3.2113, + "step": 11274 + }, + { + "epoch": 0.6999192997703148, + "grad_norm": 0.2243172944067518, + "learning_rate": 9.468415305493357e-05, + "loss": 3.1013, + "step": 11275 + }, + { + "epoch": 0.6999813768700727, + "grad_norm": 0.23852495332051135, + "learning_rate": 9.468253245191318e-05, + "loss": 3.0894, + "step": 11276 + }, + { + "epoch": 0.7000434539698305, + "grad_norm": 0.277993971252422, + "learning_rate": 9.468091161577317e-05, + "loss": 3.0393, + "step": 11277 + }, + { + "epoch": 0.7001055310695884, + "grad_norm": 0.25983929844184006, + "learning_rate": 9.4679290546522e-05, + "loss": 3.0258, + "step": 11278 + }, + { + "epoch": 0.7001676081693463, + "grad_norm": 0.24043720490814577, + "learning_rate": 9.467766924416813e-05, + "loss": 3.0205, + "step": 11279 + }, + { + "epoch": 0.7002296852691042, + "grad_norm": 0.2627715103081012, + "learning_rate": 9.467604770872e-05, + "loss": 3.0262, + "step": 11280 + }, + { + "epoch": 0.7002917623688621, + "grad_norm": 0.23817221695309143, + "learning_rate": 9.46744259401861e-05, + "loss": 3.2264, + "step": 11281 + }, + { + "epoch": 0.7003538394686201, + "grad_norm": 0.32783654567506737, + "learning_rate": 9.467280393857487e-05, + "loss": 3.0632, + "step": 11282 + }, + { + "epoch": 0.7004159165683779, + "grad_norm": 0.2803587966289634, + "learning_rate": 9.467118170389477e-05, + "loss": 3.0652, + "step": 11283 + }, + { + "epoch": 0.7004779936681358, + "grad_norm": 0.25323010428906406, + "learning_rate": 9.466955923615428e-05, + "loss": 3.0491, + "step": 11284 + }, + { + "epoch": 0.7005400707678937, + "grad_norm": 0.23491141605215415, + "learning_rate": 9.466793653536184e-05, + "loss": 3.1744, + "step": 11285 + }, + { + "epoch": 0.7006021478676516, + "grad_norm": 0.22058442392399094, + "learning_rate": 9.466631360152595e-05, + "loss": 3.1371, + "step": 11286 + }, + { + "epoch": 0.7006642249674095, + "grad_norm": 0.22522687202841746, + "learning_rate": 9.466469043465505e-05, + "loss": 3.0969, + "step": 11287 + }, + { + "epoch": 0.7007263020671675, + "grad_norm": 0.19428051105424393, + "learning_rate": 9.466306703475763e-05, + "loss": 3.1269, + "step": 11288 + }, + { + "epoch": 0.7007883791669253, + "grad_norm": 0.19289666501272545, + "learning_rate": 9.466144340184212e-05, + "loss": 3.1873, + "step": 11289 + }, + { + "epoch": 0.7008504562666832, + "grad_norm": 0.20463472297442786, + "learning_rate": 9.465981953591702e-05, + "loss": 3.0931, + "step": 11290 + }, + { + "epoch": 0.7009125333664411, + "grad_norm": 0.22948055643678716, + "learning_rate": 9.465819543699082e-05, + "loss": 3.0587, + "step": 11291 + }, + { + "epoch": 0.700974610466199, + "grad_norm": 0.19871337217230836, + "learning_rate": 9.465657110507198e-05, + "loss": 3.0749, + "step": 11292 + }, + { + "epoch": 0.7010366875659569, + "grad_norm": 0.2911956665415177, + "learning_rate": 9.465494654016895e-05, + "loss": 3.1225, + "step": 11293 + }, + { + "epoch": 0.7010987646657149, + "grad_norm": 0.274057924217207, + "learning_rate": 9.465332174229023e-05, + "loss": 3.1146, + "step": 11294 + }, + { + "epoch": 0.7011608417654727, + "grad_norm": 0.19184759405596977, + "learning_rate": 9.465169671144429e-05, + "loss": 3.054, + "step": 11295 + }, + { + "epoch": 0.7012229188652306, + "grad_norm": 0.25330251662099396, + "learning_rate": 9.46500714476396e-05, + "loss": 3.0654, + "step": 11296 + }, + { + "epoch": 0.7012849959649885, + "grad_norm": 0.1870647185997336, + "learning_rate": 9.464844595088465e-05, + "loss": 3.1033, + "step": 11297 + }, + { + "epoch": 0.7013470730647464, + "grad_norm": 0.22016859731981586, + "learning_rate": 9.464682022118791e-05, + "loss": 3.0681, + "step": 11298 + }, + { + "epoch": 0.7014091501645043, + "grad_norm": 0.21768945520099592, + "learning_rate": 9.464519425855789e-05, + "loss": 3.0608, + "step": 11299 + }, + { + "epoch": 0.7014712272642623, + "grad_norm": 0.21528301416372794, + "learning_rate": 9.464356806300306e-05, + "loss": 3.0833, + "step": 11300 + }, + { + "epoch": 0.7015333043640201, + "grad_norm": 0.2386799977944735, + "learning_rate": 9.464194163453188e-05, + "loss": 3.0992, + "step": 11301 + }, + { + "epoch": 0.701595381463778, + "grad_norm": 0.16980911766856496, + "learning_rate": 9.464031497315287e-05, + "loss": 3.1294, + "step": 11302 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.19886780663241999, + "learning_rate": 9.463868807887449e-05, + "loss": 3.1684, + "step": 11303 + }, + { + "epoch": 0.7017195356632938, + "grad_norm": 0.190461747820779, + "learning_rate": 9.463706095170521e-05, + "loss": 3.0724, + "step": 11304 + }, + { + "epoch": 0.7017816127630517, + "grad_norm": 0.24749442026803808, + "learning_rate": 9.46354335916536e-05, + "loss": 3.0567, + "step": 11305 + }, + { + "epoch": 0.7018436898628096, + "grad_norm": 0.24119902979072838, + "learning_rate": 9.463380599872806e-05, + "loss": 3.1315, + "step": 11306 + }, + { + "epoch": 0.7019057669625675, + "grad_norm": 0.16672760375948853, + "learning_rate": 9.463217817293712e-05, + "loss": 3.0738, + "step": 11307 + }, + { + "epoch": 0.7019678440623254, + "grad_norm": 0.1834510898267637, + "learning_rate": 9.463055011428928e-05, + "loss": 3.1026, + "step": 11308 + }, + { + "epoch": 0.7020299211620833, + "grad_norm": 0.2091844082681649, + "learning_rate": 9.462892182279304e-05, + "loss": 3.0879, + "step": 11309 + }, + { + "epoch": 0.7020919982618412, + "grad_norm": 0.24987791477905885, + "learning_rate": 9.462729329845685e-05, + "loss": 3.1108, + "step": 11310 + }, + { + "epoch": 0.7021540753615991, + "grad_norm": 0.23605112125190625, + "learning_rate": 9.462566454128928e-05, + "loss": 3.1927, + "step": 11311 + }, + { + "epoch": 0.702216152461357, + "grad_norm": 0.2411207887787058, + "learning_rate": 9.462403555129876e-05, + "loss": 3.1224, + "step": 11312 + }, + { + "epoch": 0.7022782295611149, + "grad_norm": 0.22746318157647571, + "learning_rate": 9.462240632849379e-05, + "loss": 3.1439, + "step": 11313 + }, + { + "epoch": 0.7023403066608728, + "grad_norm": 0.18895668189586412, + "learning_rate": 9.462077687288292e-05, + "loss": 3.1961, + "step": 11314 + }, + { + "epoch": 0.7024023837606307, + "grad_norm": 0.21692998768914173, + "learning_rate": 9.461914718447462e-05, + "loss": 3.1739, + "step": 11315 + }, + { + "epoch": 0.7024644608603886, + "grad_norm": 0.24564848839708012, + "learning_rate": 9.46175172632774e-05, + "loss": 3.1134, + "step": 11316 + }, + { + "epoch": 0.7025265379601465, + "grad_norm": 0.24377342839010716, + "learning_rate": 9.461588710929976e-05, + "loss": 3.1221, + "step": 11317 + }, + { + "epoch": 0.7025886150599044, + "grad_norm": 0.20277513120087565, + "learning_rate": 9.461425672255021e-05, + "loss": 3.1998, + "step": 11318 + }, + { + "epoch": 0.7026506921596622, + "grad_norm": 0.20476024150502903, + "learning_rate": 9.461262610303725e-05, + "loss": 3.0977, + "step": 11319 + }, + { + "epoch": 0.7027127692594202, + "grad_norm": 0.17624208125627794, + "learning_rate": 9.461099525076939e-05, + "loss": 3.102, + "step": 11320 + }, + { + "epoch": 0.7027748463591781, + "grad_norm": 0.19772170004732245, + "learning_rate": 9.460936416575513e-05, + "loss": 3.134, + "step": 11321 + }, + { + "epoch": 0.702836923458936, + "grad_norm": 0.1652314853190079, + "learning_rate": 9.4607732848003e-05, + "loss": 3.0499, + "step": 11322 + }, + { + "epoch": 0.7028990005586939, + "grad_norm": 0.26262134801203746, + "learning_rate": 9.460610129752149e-05, + "loss": 3.0521, + "step": 11323 + }, + { + "epoch": 0.7029610776584518, + "grad_norm": 0.1744409164567736, + "learning_rate": 9.460446951431911e-05, + "loss": 3.0508, + "step": 11324 + }, + { + "epoch": 0.7030231547582096, + "grad_norm": 0.32678146276716047, + "learning_rate": 9.460283749840442e-05, + "loss": 3.0133, + "step": 11325 + }, + { + "epoch": 0.7030852318579676, + "grad_norm": 0.20695709303914392, + "learning_rate": 9.460120524978586e-05, + "loss": 3.0543, + "step": 11326 + }, + { + "epoch": 0.7031473089577255, + "grad_norm": 0.21830190164934332, + "learning_rate": 9.459957276847201e-05, + "loss": 3.1113, + "step": 11327 + }, + { + "epoch": 0.7032093860574834, + "grad_norm": 0.2038847853518394, + "learning_rate": 9.459794005447136e-05, + "loss": 2.9518, + "step": 11328 + }, + { + "epoch": 0.7032714631572413, + "grad_norm": 0.27338684157726145, + "learning_rate": 9.459630710779242e-05, + "loss": 3.1636, + "step": 11329 + }, + { + "epoch": 0.7033335402569992, + "grad_norm": 0.27219886891811845, + "learning_rate": 9.459467392844373e-05, + "loss": 3.097, + "step": 11330 + }, + { + "epoch": 0.703395617356757, + "grad_norm": 0.20960627920030722, + "learning_rate": 9.459304051643379e-05, + "loss": 3.161, + "step": 11331 + }, + { + "epoch": 0.703457694456515, + "grad_norm": 0.23628381045674218, + "learning_rate": 9.459140687177113e-05, + "loss": 3.0714, + "step": 11332 + }, + { + "epoch": 0.7035197715562729, + "grad_norm": 0.2287521198179695, + "learning_rate": 9.45897729944643e-05, + "loss": 3.1099, + "step": 11333 + }, + { + "epoch": 0.7035818486560308, + "grad_norm": 0.21544258584000173, + "learning_rate": 9.458813888452176e-05, + "loss": 3.1443, + "step": 11334 + }, + { + "epoch": 0.7036439257557887, + "grad_norm": 0.2333749815195584, + "learning_rate": 9.45865045419521e-05, + "loss": 3.1086, + "step": 11335 + }, + { + "epoch": 0.7037060028555466, + "grad_norm": 0.22691321358544092, + "learning_rate": 9.458486996676383e-05, + "loss": 3.0909, + "step": 11336 + }, + { + "epoch": 0.7037680799553044, + "grad_norm": 0.21296530472395872, + "learning_rate": 9.458323515896546e-05, + "loss": 3.106, + "step": 11337 + }, + { + "epoch": 0.7038301570550624, + "grad_norm": 0.22404927617717843, + "learning_rate": 9.458160011856551e-05, + "loss": 3.136, + "step": 11338 + }, + { + "epoch": 0.7038922341548203, + "grad_norm": 0.25138098049779606, + "learning_rate": 9.457996484557257e-05, + "loss": 3.0491, + "step": 11339 + }, + { + "epoch": 0.7039543112545782, + "grad_norm": 0.31330882543269223, + "learning_rate": 9.45783293399951e-05, + "loss": 3.169, + "step": 11340 + }, + { + "epoch": 0.7040163883543361, + "grad_norm": 0.21945409199442878, + "learning_rate": 9.457669360184168e-05, + "loss": 3.1187, + "step": 11341 + }, + { + "epoch": 0.704078465454094, + "grad_norm": 0.24254685321668426, + "learning_rate": 9.457505763112082e-05, + "loss": 3.174, + "step": 11342 + }, + { + "epoch": 0.7041405425538518, + "grad_norm": 0.2149677092400417, + "learning_rate": 9.457342142784106e-05, + "loss": 3.1592, + "step": 11343 + }, + { + "epoch": 0.7042026196536098, + "grad_norm": 0.22490379276711583, + "learning_rate": 9.457178499201094e-05, + "loss": 3.2449, + "step": 11344 + }, + { + "epoch": 0.7042646967533677, + "grad_norm": 0.2966521832372123, + "learning_rate": 9.457014832363901e-05, + "loss": 3.1288, + "step": 11345 + }, + { + "epoch": 0.7043267738531256, + "grad_norm": 0.20144575419663255, + "learning_rate": 9.456851142273379e-05, + "loss": 3.0484, + "step": 11346 + }, + { + "epoch": 0.7043888509528835, + "grad_norm": 0.2688703034849558, + "learning_rate": 9.456687428930383e-05, + "loss": 3.2369, + "step": 11347 + }, + { + "epoch": 0.7044509280526414, + "grad_norm": 0.230081507685459, + "learning_rate": 9.456523692335766e-05, + "loss": 3.0285, + "step": 11348 + }, + { + "epoch": 0.7045130051523992, + "grad_norm": 0.2255619994174078, + "learning_rate": 9.456359932490383e-05, + "loss": 3.1465, + "step": 11349 + }, + { + "epoch": 0.7045750822521571, + "grad_norm": 0.20268106549812856, + "learning_rate": 9.45619614939509e-05, + "loss": 3.1495, + "step": 11350 + }, + { + "epoch": 0.7046371593519151, + "grad_norm": 0.2502524323369051, + "learning_rate": 9.456032343050737e-05, + "loss": 3.1495, + "step": 11351 + }, + { + "epoch": 0.704699236451673, + "grad_norm": 0.22858691937351003, + "learning_rate": 9.455868513458184e-05, + "loss": 3.0661, + "step": 11352 + }, + { + "epoch": 0.7047613135514309, + "grad_norm": 0.20740512162419333, + "learning_rate": 9.45570466061828e-05, + "loss": 3.1802, + "step": 11353 + }, + { + "epoch": 0.7048233906511888, + "grad_norm": 0.19957686008263667, + "learning_rate": 9.455540784531887e-05, + "loss": 3.0845, + "step": 11354 + }, + { + "epoch": 0.7048854677509466, + "grad_norm": 0.19805580924082372, + "learning_rate": 9.455376885199855e-05, + "loss": 3.0661, + "step": 11355 + }, + { + "epoch": 0.7049475448507045, + "grad_norm": 0.17800090005397945, + "learning_rate": 9.455212962623039e-05, + "loss": 3.0641, + "step": 11356 + }, + { + "epoch": 0.7050096219504625, + "grad_norm": 0.18294669833645788, + "learning_rate": 9.455049016802297e-05, + "loss": 3.1145, + "step": 11357 + }, + { + "epoch": 0.7050716990502204, + "grad_norm": 0.19648780061378587, + "learning_rate": 9.454885047738482e-05, + "loss": 3.0814, + "step": 11358 + }, + { + "epoch": 0.7051337761499783, + "grad_norm": 0.2015202662851945, + "learning_rate": 9.45472105543245e-05, + "loss": 3.1478, + "step": 11359 + }, + { + "epoch": 0.7051958532497362, + "grad_norm": 0.18739768057742, + "learning_rate": 9.454557039885059e-05, + "loss": 3.0214, + "step": 11360 + }, + { + "epoch": 0.705257930349494, + "grad_norm": 0.2008763058595637, + "learning_rate": 9.454393001097159e-05, + "loss": 3.0646, + "step": 11361 + }, + { + "epoch": 0.7053200074492519, + "grad_norm": 0.2668790246762763, + "learning_rate": 9.454228939069612e-05, + "loss": 3.1304, + "step": 11362 + }, + { + "epoch": 0.7053820845490099, + "grad_norm": 0.16488798551345543, + "learning_rate": 9.454064853803271e-05, + "loss": 3.0808, + "step": 11363 + }, + { + "epoch": 0.7054441616487678, + "grad_norm": 0.20337895876847972, + "learning_rate": 9.453900745298991e-05, + "loss": 3.0104, + "step": 11364 + }, + { + "epoch": 0.7055062387485257, + "grad_norm": 0.2511802173611405, + "learning_rate": 9.45373661355763e-05, + "loss": 3.0641, + "step": 11365 + }, + { + "epoch": 0.7055683158482836, + "grad_norm": 0.17620216413748252, + "learning_rate": 9.453572458580045e-05, + "loss": 3.0887, + "step": 11366 + }, + { + "epoch": 0.7056303929480414, + "grad_norm": 0.21109272402465332, + "learning_rate": 9.453408280367092e-05, + "loss": 3.1781, + "step": 11367 + }, + { + "epoch": 0.7056924700477993, + "grad_norm": 0.18054372542905348, + "learning_rate": 9.453244078919624e-05, + "loss": 3.0862, + "step": 11368 + }, + { + "epoch": 0.7057545471475573, + "grad_norm": 0.22517747767048713, + "learning_rate": 9.453079854238503e-05, + "loss": 3.2198, + "step": 11369 + }, + { + "epoch": 0.7058166242473152, + "grad_norm": 0.19530134169175797, + "learning_rate": 9.452915606324581e-05, + "loss": 3.0302, + "step": 11370 + }, + { + "epoch": 0.7058787013470731, + "grad_norm": 0.1882162322170957, + "learning_rate": 9.45275133517872e-05, + "loss": 3.0429, + "step": 11371 + }, + { + "epoch": 0.705940778446831, + "grad_norm": 0.21776708242954132, + "learning_rate": 9.452587040801772e-05, + "loss": 3.1446, + "step": 11372 + }, + { + "epoch": 0.7060028555465888, + "grad_norm": 0.1912405406102283, + "learning_rate": 9.452422723194598e-05, + "loss": 3.0913, + "step": 11373 + }, + { + "epoch": 0.7060649326463467, + "grad_norm": 0.1898027909602761, + "learning_rate": 9.452258382358053e-05, + "loss": 3.0576, + "step": 11374 + }, + { + "epoch": 0.7061270097461047, + "grad_norm": 0.2066792969311557, + "learning_rate": 9.452094018292996e-05, + "loss": 3.0087, + "step": 11375 + }, + { + "epoch": 0.7061890868458626, + "grad_norm": 0.1716128593545397, + "learning_rate": 9.451929631000284e-05, + "loss": 3.0395, + "step": 11376 + }, + { + "epoch": 0.7062511639456205, + "grad_norm": 0.17729647632317713, + "learning_rate": 9.451765220480773e-05, + "loss": 3.1129, + "step": 11377 + }, + { + "epoch": 0.7063132410453784, + "grad_norm": 0.17195014503007836, + "learning_rate": 9.451600786735323e-05, + "loss": 3.0093, + "step": 11378 + }, + { + "epoch": 0.7063753181451362, + "grad_norm": 0.16965412003013974, + "learning_rate": 9.451436329764792e-05, + "loss": 3.1404, + "step": 11379 + }, + { + "epoch": 0.7064373952448941, + "grad_norm": 0.17756039624510372, + "learning_rate": 9.451271849570035e-05, + "loss": 3.1254, + "step": 11380 + }, + { + "epoch": 0.706499472344652, + "grad_norm": 0.192187292207435, + "learning_rate": 9.451107346151913e-05, + "loss": 3.1873, + "step": 11381 + }, + { + "epoch": 0.70656154944441, + "grad_norm": 0.16281874692285087, + "learning_rate": 9.450942819511284e-05, + "loss": 3.0892, + "step": 11382 + }, + { + "epoch": 0.7066236265441679, + "grad_norm": 0.1736783496801218, + "learning_rate": 9.450778269649005e-05, + "loss": 3.0642, + "step": 11383 + }, + { + "epoch": 0.7066857036439258, + "grad_norm": 0.23683440149895438, + "learning_rate": 9.450613696565935e-05, + "loss": 3.1307, + "step": 11384 + }, + { + "epoch": 0.7067477807436836, + "grad_norm": 0.19807615044965188, + "learning_rate": 9.450449100262934e-05, + "loss": 3.0039, + "step": 11385 + }, + { + "epoch": 0.7068098578434415, + "grad_norm": 0.2958402922756547, + "learning_rate": 9.45028448074086e-05, + "loss": 3.0961, + "step": 11386 + }, + { + "epoch": 0.7068719349431994, + "grad_norm": 0.19543988841896642, + "learning_rate": 9.450119838000571e-05, + "loss": 3.1534, + "step": 11387 + }, + { + "epoch": 0.7069340120429574, + "grad_norm": 0.3045628855260722, + "learning_rate": 9.449955172042926e-05, + "loss": 3.0167, + "step": 11388 + }, + { + "epoch": 0.7069960891427153, + "grad_norm": 0.24037638397978855, + "learning_rate": 9.449790482868785e-05, + "loss": 3.0991, + "step": 11389 + }, + { + "epoch": 0.7070581662424732, + "grad_norm": 0.2044734122794051, + "learning_rate": 9.449625770479005e-05, + "loss": 3.1069, + "step": 11390 + }, + { + "epoch": 0.707120243342231, + "grad_norm": 0.2157950925979216, + "learning_rate": 9.44946103487445e-05, + "loss": 3.1689, + "step": 11391 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.21045667099647056, + "learning_rate": 9.449296276055975e-05, + "loss": 3.1487, + "step": 11392 + }, + { + "epoch": 0.7072443975417468, + "grad_norm": 0.21478377285323033, + "learning_rate": 9.449131494024442e-05, + "loss": 3.1574, + "step": 11393 + }, + { + "epoch": 0.7073064746415048, + "grad_norm": 0.16696340190119502, + "learning_rate": 9.44896668878071e-05, + "loss": 3.047, + "step": 11394 + }, + { + "epoch": 0.7073685517412627, + "grad_norm": 0.20795545680253344, + "learning_rate": 9.448801860325638e-05, + "loss": 3.0735, + "step": 11395 + }, + { + "epoch": 0.7074306288410206, + "grad_norm": 0.2239006304518209, + "learning_rate": 9.448637008660087e-05, + "loss": 3.1456, + "step": 11396 + }, + { + "epoch": 0.7074927059407784, + "grad_norm": 0.22723573104515685, + "learning_rate": 9.448472133784916e-05, + "loss": 3.077, + "step": 11397 + }, + { + "epoch": 0.7075547830405363, + "grad_norm": 0.18561751376253857, + "learning_rate": 9.448307235700988e-05, + "loss": 3.0775, + "step": 11398 + }, + { + "epoch": 0.7076168601402942, + "grad_norm": 0.19064125632713128, + "learning_rate": 9.44814231440916e-05, + "loss": 3.1272, + "step": 11399 + }, + { + "epoch": 0.7076789372400522, + "grad_norm": 0.211766169400199, + "learning_rate": 9.447977369910294e-05, + "loss": 3.0963, + "step": 11400 + }, + { + "epoch": 0.7077410143398101, + "grad_norm": 0.20674470649144405, + "learning_rate": 9.44781240220525e-05, + "loss": 3.1413, + "step": 11401 + }, + { + "epoch": 0.707803091439568, + "grad_norm": 0.3267500430116574, + "learning_rate": 9.447647411294889e-05, + "loss": 3.2042, + "step": 11402 + }, + { + "epoch": 0.7078651685393258, + "grad_norm": 0.1915553154772692, + "learning_rate": 9.447482397180072e-05, + "loss": 3.1779, + "step": 11403 + }, + { + "epoch": 0.7079272456390837, + "grad_norm": 0.19107059319737538, + "learning_rate": 9.44731735986166e-05, + "loss": 3.1052, + "step": 11404 + }, + { + "epoch": 0.7079893227388416, + "grad_norm": 0.3077745791860825, + "learning_rate": 9.447152299340513e-05, + "loss": 3.1322, + "step": 11405 + }, + { + "epoch": 0.7080513998385995, + "grad_norm": 0.1881075592130821, + "learning_rate": 9.446987215617494e-05, + "loss": 3.0409, + "step": 11406 + }, + { + "epoch": 0.7081134769383575, + "grad_norm": 0.18740572148589327, + "learning_rate": 9.446822108693461e-05, + "loss": 3.0238, + "step": 11407 + }, + { + "epoch": 0.7081755540381154, + "grad_norm": 0.2045167869921314, + "learning_rate": 9.446656978569279e-05, + "loss": 2.9859, + "step": 11408 + }, + { + "epoch": 0.7082376311378732, + "grad_norm": 0.1795296078964579, + "learning_rate": 9.446491825245809e-05, + "loss": 3.0921, + "step": 11409 + }, + { + "epoch": 0.7082997082376311, + "grad_norm": 0.3373872391981306, + "learning_rate": 9.44632664872391e-05, + "loss": 3.1029, + "step": 11410 + }, + { + "epoch": 0.708361785337389, + "grad_norm": 0.22848478744971523, + "learning_rate": 9.446161449004446e-05, + "loss": 3.0601, + "step": 11411 + }, + { + "epoch": 0.708423862437147, + "grad_norm": 0.21887582327863797, + "learning_rate": 9.445996226088279e-05, + "loss": 3.1616, + "step": 11412 + }, + { + "epoch": 0.7084859395369049, + "grad_norm": 0.21831874401152257, + "learning_rate": 9.445830979976269e-05, + "loss": 3.0889, + "step": 11413 + }, + { + "epoch": 0.7085480166366628, + "grad_norm": 0.33590962050323797, + "learning_rate": 9.445665710669281e-05, + "loss": 3.054, + "step": 11414 + }, + { + "epoch": 0.7086100937364206, + "grad_norm": 0.22778460961139657, + "learning_rate": 9.445500418168175e-05, + "loss": 3.068, + "step": 11415 + }, + { + "epoch": 0.7086721708361785, + "grad_norm": 0.19711071012440154, + "learning_rate": 9.445335102473815e-05, + "loss": 3.0519, + "step": 11416 + }, + { + "epoch": 0.7087342479359364, + "grad_norm": 0.2373559998345186, + "learning_rate": 9.44516976358706e-05, + "loss": 3.1055, + "step": 11417 + }, + { + "epoch": 0.7087963250356943, + "grad_norm": 0.2234534748075455, + "learning_rate": 9.445004401508779e-05, + "loss": 3.1124, + "step": 11418 + }, + { + "epoch": 0.7088584021354523, + "grad_norm": 0.20640725440936764, + "learning_rate": 9.444839016239828e-05, + "loss": 3.1071, + "step": 11419 + }, + { + "epoch": 0.7089204792352102, + "grad_norm": 0.2966865393102926, + "learning_rate": 9.444673607781073e-05, + "loss": 3.1705, + "step": 11420 + }, + { + "epoch": 0.708982556334968, + "grad_norm": 0.21765781977101809, + "learning_rate": 9.44450817613338e-05, + "loss": 3.1131, + "step": 11421 + }, + { + "epoch": 0.7090446334347259, + "grad_norm": 0.19810396458843516, + "learning_rate": 9.444342721297606e-05, + "loss": 3.0613, + "step": 11422 + }, + { + "epoch": 0.7091067105344838, + "grad_norm": 0.20520660372229463, + "learning_rate": 9.444177243274618e-05, + "loss": 3.0705, + "step": 11423 + }, + { + "epoch": 0.7091687876342417, + "grad_norm": 0.23301556039544663, + "learning_rate": 9.444011742065277e-05, + "loss": 3.1066, + "step": 11424 + }, + { + "epoch": 0.7092308647339997, + "grad_norm": 0.22302917570582256, + "learning_rate": 9.44384621767045e-05, + "loss": 3.0543, + "step": 11425 + }, + { + "epoch": 0.7092929418337576, + "grad_norm": 0.18263532171122, + "learning_rate": 9.443680670090997e-05, + "loss": 3.0969, + "step": 11426 + }, + { + "epoch": 0.7093550189335154, + "grad_norm": 0.2009895479921058, + "learning_rate": 9.443515099327784e-05, + "loss": 3.105, + "step": 11427 + }, + { + "epoch": 0.7094170960332733, + "grad_norm": 0.24185463118483222, + "learning_rate": 9.443349505381676e-05, + "loss": 3.1002, + "step": 11428 + }, + { + "epoch": 0.7094791731330312, + "grad_norm": 0.21944329546340569, + "learning_rate": 9.443183888253531e-05, + "loss": 3.1409, + "step": 11429 + }, + { + "epoch": 0.7095412502327891, + "grad_norm": 0.20200959565309187, + "learning_rate": 9.44301824794422e-05, + "loss": 3.1258, + "step": 11430 + }, + { + "epoch": 0.709603327332547, + "grad_norm": 0.2023535346828371, + "learning_rate": 9.442852584454602e-05, + "loss": 3.0925, + "step": 11431 + }, + { + "epoch": 0.709665404432305, + "grad_norm": 0.23235884380760832, + "learning_rate": 9.442686897785546e-05, + "loss": 3.0427, + "step": 11432 + }, + { + "epoch": 0.7097274815320628, + "grad_norm": 0.18383864039132675, + "learning_rate": 9.442521187937912e-05, + "loss": 3.1301, + "step": 11433 + }, + { + "epoch": 0.7097895586318207, + "grad_norm": 0.3050109662367086, + "learning_rate": 9.442355454912567e-05, + "loss": 2.9809, + "step": 11434 + }, + { + "epoch": 0.7098516357315786, + "grad_norm": 0.1847130763228513, + "learning_rate": 9.442189698710376e-05, + "loss": 3.1339, + "step": 11435 + }, + { + "epoch": 0.7099137128313365, + "grad_norm": 0.19654199805631742, + "learning_rate": 9.442023919332202e-05, + "loss": 3.0388, + "step": 11436 + }, + { + "epoch": 0.7099757899310944, + "grad_norm": 0.18065892805072012, + "learning_rate": 9.441858116778911e-05, + "loss": 3.0461, + "step": 11437 + }, + { + "epoch": 0.7100378670308524, + "grad_norm": 0.1717364229405197, + "learning_rate": 9.441692291051368e-05, + "loss": 3.0459, + "step": 11438 + }, + { + "epoch": 0.7100999441306102, + "grad_norm": 0.20262690141175194, + "learning_rate": 9.441526442150438e-05, + "loss": 3.2071, + "step": 11439 + }, + { + "epoch": 0.7101620212303681, + "grad_norm": 0.21032448278189186, + "learning_rate": 9.441360570076987e-05, + "loss": 3.1023, + "step": 11440 + }, + { + "epoch": 0.710224098330126, + "grad_norm": 0.1723043919320556, + "learning_rate": 9.44119467483188e-05, + "loss": 3.0663, + "step": 11441 + }, + { + "epoch": 0.7102861754298839, + "grad_norm": 0.204978800744571, + "learning_rate": 9.44102875641598e-05, + "loss": 3.066, + "step": 11442 + }, + { + "epoch": 0.7103482525296418, + "grad_norm": 0.18857644947536226, + "learning_rate": 9.440862814830156e-05, + "loss": 3.1417, + "step": 11443 + }, + { + "epoch": 0.7104103296293998, + "grad_norm": 0.27981561891045803, + "learning_rate": 9.440696850075273e-05, + "loss": 3.1342, + "step": 11444 + }, + { + "epoch": 0.7104724067291576, + "grad_norm": 0.22500164773125775, + "learning_rate": 9.440530862152198e-05, + "loss": 3.0196, + "step": 11445 + }, + { + "epoch": 0.7105344838289155, + "grad_norm": 0.20455070977130982, + "learning_rate": 9.440364851061793e-05, + "loss": 3.1238, + "step": 11446 + }, + { + "epoch": 0.7105965609286734, + "grad_norm": 0.18495068385736946, + "learning_rate": 9.440198816804927e-05, + "loss": 3.1055, + "step": 11447 + }, + { + "epoch": 0.7106586380284313, + "grad_norm": 0.23723181245278221, + "learning_rate": 9.440032759382467e-05, + "loss": 3.0704, + "step": 11448 + }, + { + "epoch": 0.7107207151281892, + "grad_norm": 0.2011125995750022, + "learning_rate": 9.439866678795277e-05, + "loss": 2.9476, + "step": 11449 + }, + { + "epoch": 0.7107827922279472, + "grad_norm": 0.19425380830762876, + "learning_rate": 9.439700575044226e-05, + "loss": 3.0606, + "step": 11450 + }, + { + "epoch": 0.710844869327705, + "grad_norm": 0.3341414879570789, + "learning_rate": 9.439534448130178e-05, + "loss": 3.0504, + "step": 11451 + }, + { + "epoch": 0.7109069464274629, + "grad_norm": 0.28322098774316895, + "learning_rate": 9.439368298054003e-05, + "loss": 3.0689, + "step": 11452 + }, + { + "epoch": 0.7109690235272208, + "grad_norm": 0.2692595093348652, + "learning_rate": 9.439202124816563e-05, + "loss": 3.1207, + "step": 11453 + }, + { + "epoch": 0.7110311006269787, + "grad_norm": 0.1706434254601195, + "learning_rate": 9.439035928418732e-05, + "loss": 3.089, + "step": 11454 + }, + { + "epoch": 0.7110931777267366, + "grad_norm": 0.4016905176071038, + "learning_rate": 9.438869708861368e-05, + "loss": 3.162, + "step": 11455 + }, + { + "epoch": 0.7111552548264946, + "grad_norm": 0.22718154259260154, + "learning_rate": 9.438703466145347e-05, + "loss": 3.2049, + "step": 11456 + }, + { + "epoch": 0.7112173319262524, + "grad_norm": 0.21692263761607664, + "learning_rate": 9.43853720027153e-05, + "loss": 3.1447, + "step": 11457 + }, + { + "epoch": 0.7112794090260103, + "grad_norm": 0.21786676639378438, + "learning_rate": 9.438370911240789e-05, + "loss": 3.0704, + "step": 11458 + }, + { + "epoch": 0.7113414861257682, + "grad_norm": 0.22114816878839452, + "learning_rate": 9.438204599053989e-05, + "loss": 3.1675, + "step": 11459 + }, + { + "epoch": 0.7114035632255261, + "grad_norm": 0.23638079190382674, + "learning_rate": 9.438038263711997e-05, + "loss": 3.1372, + "step": 11460 + }, + { + "epoch": 0.711465640325284, + "grad_norm": 0.24667130921936198, + "learning_rate": 9.437871905215683e-05, + "loss": 3.1434, + "step": 11461 + }, + { + "epoch": 0.711527717425042, + "grad_norm": 0.2846815652364782, + "learning_rate": 9.437705523565914e-05, + "loss": 3.1183, + "step": 11462 + }, + { + "epoch": 0.7115897945247998, + "grad_norm": 0.17857050980604053, + "learning_rate": 9.437539118763558e-05, + "loss": 3.0652, + "step": 11463 + }, + { + "epoch": 0.7116518716245577, + "grad_norm": 0.35777144717127163, + "learning_rate": 9.437372690809483e-05, + "loss": 3.0206, + "step": 11464 + }, + { + "epoch": 0.7117139487243156, + "grad_norm": 0.22715010572691735, + "learning_rate": 9.437206239704556e-05, + "loss": 3.0485, + "step": 11465 + }, + { + "epoch": 0.7117760258240735, + "grad_norm": 0.2440337137739562, + "learning_rate": 9.43703976544965e-05, + "loss": 3.1859, + "step": 11466 + }, + { + "epoch": 0.7118381029238314, + "grad_norm": 0.26471084478267637, + "learning_rate": 9.436873268045628e-05, + "loss": 3.2128, + "step": 11467 + }, + { + "epoch": 0.7119001800235893, + "grad_norm": 0.2552156139165687, + "learning_rate": 9.436706747493361e-05, + "loss": 3.1174, + "step": 11468 + }, + { + "epoch": 0.7119622571233472, + "grad_norm": 0.29422240013414985, + "learning_rate": 9.436540203793718e-05, + "loss": 3.1429, + "step": 11469 + }, + { + "epoch": 0.7120243342231051, + "grad_norm": 0.17951775909345313, + "learning_rate": 9.436373636947568e-05, + "loss": 3.1303, + "step": 11470 + }, + { + "epoch": 0.712086411322863, + "grad_norm": 0.28100332727956845, + "learning_rate": 9.43620704695578e-05, + "loss": 3.0536, + "step": 11471 + }, + { + "epoch": 0.7121484884226209, + "grad_norm": 0.22354707365319812, + "learning_rate": 9.436040433819224e-05, + "loss": 3.0988, + "step": 11472 + }, + { + "epoch": 0.7122105655223788, + "grad_norm": 0.22088813622804915, + "learning_rate": 9.435873797538767e-05, + "loss": 3.1463, + "step": 11473 + }, + { + "epoch": 0.7122726426221367, + "grad_norm": 0.27101208064031573, + "learning_rate": 9.43570713811528e-05, + "loss": 3.1668, + "step": 11474 + }, + { + "epoch": 0.7123347197218945, + "grad_norm": 0.2377158633323006, + "learning_rate": 9.435540455549632e-05, + "loss": 3.0885, + "step": 11475 + }, + { + "epoch": 0.7123967968216525, + "grad_norm": 0.2131141399178368, + "learning_rate": 9.435373749842693e-05, + "loss": 3.1142, + "step": 11476 + }, + { + "epoch": 0.7124588739214104, + "grad_norm": 0.2236132634043197, + "learning_rate": 9.435207020995331e-05, + "loss": 3.1688, + "step": 11477 + }, + { + "epoch": 0.7125209510211683, + "grad_norm": 0.1935677898295102, + "learning_rate": 9.435040269008419e-05, + "loss": 3.044, + "step": 11478 + }, + { + "epoch": 0.7125830281209262, + "grad_norm": 0.2038610400946682, + "learning_rate": 9.434873493882823e-05, + "loss": 3.1216, + "step": 11479 + }, + { + "epoch": 0.7126451052206841, + "grad_norm": 0.20486409053593543, + "learning_rate": 9.434706695619418e-05, + "loss": 3.1988, + "step": 11480 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.20603285578919658, + "learning_rate": 9.43453987421907e-05, + "loss": 3.0786, + "step": 11481 + }, + { + "epoch": 0.7127692594201999, + "grad_norm": 0.22010813324659861, + "learning_rate": 9.434373029682651e-05, + "loss": 3.0836, + "step": 11482 + }, + { + "epoch": 0.7128313365199578, + "grad_norm": 0.19180561752749556, + "learning_rate": 9.434206162011033e-05, + "loss": 3.1773, + "step": 11483 + }, + { + "epoch": 0.7128934136197157, + "grad_norm": 0.23614931829803598, + "learning_rate": 9.434039271205083e-05, + "loss": 3.1448, + "step": 11484 + }, + { + "epoch": 0.7129554907194736, + "grad_norm": 0.18118718678466106, + "learning_rate": 9.433872357265674e-05, + "loss": 3.0912, + "step": 11485 + }, + { + "epoch": 0.7130175678192315, + "grad_norm": 0.22166653732753747, + "learning_rate": 9.433705420193676e-05, + "loss": 3.0995, + "step": 11486 + }, + { + "epoch": 0.7130796449189893, + "grad_norm": 0.22034864737846652, + "learning_rate": 9.433538459989964e-05, + "loss": 3.1477, + "step": 11487 + }, + { + "epoch": 0.7131417220187473, + "grad_norm": 0.2632061978684741, + "learning_rate": 9.433371476655403e-05, + "loss": 3.0523, + "step": 11488 + }, + { + "epoch": 0.7132037991185052, + "grad_norm": 0.22743914624766506, + "learning_rate": 9.433204470190866e-05, + "loss": 3.0927, + "step": 11489 + }, + { + "epoch": 0.7132658762182631, + "grad_norm": 0.195476920538763, + "learning_rate": 9.433037440597226e-05, + "loss": 3.1698, + "step": 11490 + }, + { + "epoch": 0.713327953318021, + "grad_norm": 0.3494123084598396, + "learning_rate": 9.432870387875355e-05, + "loss": 3.1529, + "step": 11491 + }, + { + "epoch": 0.7133900304177789, + "grad_norm": 0.2731481656217001, + "learning_rate": 9.43270331202612e-05, + "loss": 3.1303, + "step": 11492 + }, + { + "epoch": 0.7134521075175367, + "grad_norm": 0.3008785957253106, + "learning_rate": 9.432536213050398e-05, + "loss": 3.1496, + "step": 11493 + }, + { + "epoch": 0.7135141846172947, + "grad_norm": 0.30261120905645117, + "learning_rate": 9.432369090949059e-05, + "loss": 3.0986, + "step": 11494 + }, + { + "epoch": 0.7135762617170526, + "grad_norm": 0.21238071726175414, + "learning_rate": 9.432201945722971e-05, + "loss": 3.1129, + "step": 11495 + }, + { + "epoch": 0.7136383388168105, + "grad_norm": 0.20815392855907608, + "learning_rate": 9.432034777373013e-05, + "loss": 3.094, + "step": 11496 + }, + { + "epoch": 0.7137004159165684, + "grad_norm": 0.24803039916580952, + "learning_rate": 9.431867585900052e-05, + "loss": 3.1727, + "step": 11497 + }, + { + "epoch": 0.7137624930163263, + "grad_norm": 0.29954532500674824, + "learning_rate": 9.431700371304963e-05, + "loss": 3.2249, + "step": 11498 + }, + { + "epoch": 0.7138245701160841, + "grad_norm": 0.23588829253696647, + "learning_rate": 9.431533133588617e-05, + "loss": 3.1749, + "step": 11499 + }, + { + "epoch": 0.713886647215842, + "grad_norm": 0.2522093187463419, + "learning_rate": 9.431365872751886e-05, + "loss": 3.1443, + "step": 11500 + }, + { + "epoch": 0.7139487243156, + "grad_norm": 0.22618359089024523, + "learning_rate": 9.431198588795645e-05, + "loss": 3.0512, + "step": 11501 + }, + { + "epoch": 0.7140108014153579, + "grad_norm": 0.22001869279385913, + "learning_rate": 9.431031281720764e-05, + "loss": 3.0502, + "step": 11502 + }, + { + "epoch": 0.7140728785151158, + "grad_norm": 0.24716609108587329, + "learning_rate": 9.430863951528118e-05, + "loss": 3.1257, + "step": 11503 + }, + { + "epoch": 0.7141349556148737, + "grad_norm": 0.21995405500372142, + "learning_rate": 9.430696598218578e-05, + "loss": 3.0729, + "step": 11504 + }, + { + "epoch": 0.7141970327146315, + "grad_norm": 0.2096675242049289, + "learning_rate": 9.430529221793019e-05, + "loss": 3.1232, + "step": 11505 + }, + { + "epoch": 0.7142591098143894, + "grad_norm": 0.17027326950096458, + "learning_rate": 9.430361822252312e-05, + "loss": 3.035, + "step": 11506 + }, + { + "epoch": 0.7143211869141474, + "grad_norm": 0.20503264331562238, + "learning_rate": 9.430194399597334e-05, + "loss": 3.1631, + "step": 11507 + }, + { + "epoch": 0.7143832640139053, + "grad_norm": 0.25744235314751523, + "learning_rate": 9.430026953828956e-05, + "loss": 3.081, + "step": 11508 + }, + { + "epoch": 0.7144453411136632, + "grad_norm": 0.16576708533583806, + "learning_rate": 9.429859484948052e-05, + "loss": 3.1326, + "step": 11509 + }, + { + "epoch": 0.7145074182134211, + "grad_norm": 0.17544930162611516, + "learning_rate": 9.429691992955494e-05, + "loss": 3.1673, + "step": 11510 + }, + { + "epoch": 0.7145694953131789, + "grad_norm": 0.1989780542742432, + "learning_rate": 9.429524477852158e-05, + "loss": 2.9916, + "step": 11511 + }, + { + "epoch": 0.7146315724129368, + "grad_norm": 0.1637051528021524, + "learning_rate": 9.42935693963892e-05, + "loss": 2.9848, + "step": 11512 + }, + { + "epoch": 0.7146936495126948, + "grad_norm": 0.320268279023728, + "learning_rate": 9.429189378316649e-05, + "loss": 3.1374, + "step": 11513 + }, + { + "epoch": 0.7147557266124527, + "grad_norm": 0.24475448491728402, + "learning_rate": 9.429021793886224e-05, + "loss": 3.1065, + "step": 11514 + }, + { + "epoch": 0.7148178037122106, + "grad_norm": 0.19722289387401273, + "learning_rate": 9.428854186348514e-05, + "loss": 3.1061, + "step": 11515 + }, + { + "epoch": 0.7148798808119685, + "grad_norm": 0.19661113070725944, + "learning_rate": 9.428686555704399e-05, + "loss": 3.0335, + "step": 11516 + }, + { + "epoch": 0.7149419579117263, + "grad_norm": 0.19325795066363768, + "learning_rate": 9.42851890195475e-05, + "loss": 3.1737, + "step": 11517 + }, + { + "epoch": 0.7150040350114842, + "grad_norm": 0.2719661088631364, + "learning_rate": 9.428351225100444e-05, + "loss": 3.0659, + "step": 11518 + }, + { + "epoch": 0.7150661121112422, + "grad_norm": 0.21510871673802473, + "learning_rate": 9.428183525142353e-05, + "loss": 3.0556, + "step": 11519 + }, + { + "epoch": 0.7151281892110001, + "grad_norm": 0.2263839445585097, + "learning_rate": 9.428015802081354e-05, + "loss": 3.2002, + "step": 11520 + }, + { + "epoch": 0.715190266310758, + "grad_norm": 0.176978589850869, + "learning_rate": 9.427848055918322e-05, + "loss": 3.1615, + "step": 11521 + }, + { + "epoch": 0.7152523434105159, + "grad_norm": 0.23508316976573554, + "learning_rate": 9.427680286654132e-05, + "loss": 3.0881, + "step": 11522 + }, + { + "epoch": 0.7153144205102737, + "grad_norm": 0.3949063095209853, + "learning_rate": 9.427512494289659e-05, + "loss": 3.0621, + "step": 11523 + }, + { + "epoch": 0.7153764976100316, + "grad_norm": 0.2678190942261809, + "learning_rate": 9.42734467882578e-05, + "loss": 3.0456, + "step": 11524 + }, + { + "epoch": 0.7154385747097896, + "grad_norm": 0.1892754714805516, + "learning_rate": 9.427176840263367e-05, + "loss": 3.1144, + "step": 11525 + }, + { + "epoch": 0.7155006518095475, + "grad_norm": 0.2528986377878484, + "learning_rate": 9.427008978603298e-05, + "loss": 3.0837, + "step": 11526 + }, + { + "epoch": 0.7155627289093054, + "grad_norm": 0.20869601489607092, + "learning_rate": 9.426841093846448e-05, + "loss": 3.0569, + "step": 11527 + }, + { + "epoch": 0.7156248060090633, + "grad_norm": 0.23996943308964905, + "learning_rate": 9.426673185993695e-05, + "loss": 3.1287, + "step": 11528 + }, + { + "epoch": 0.7156868831088211, + "grad_norm": 0.23260364316877696, + "learning_rate": 9.426505255045912e-05, + "loss": 3.1012, + "step": 11529 + }, + { + "epoch": 0.715748960208579, + "grad_norm": 0.22362577435608347, + "learning_rate": 9.426337301003977e-05, + "loss": 3.0728, + "step": 11530 + }, + { + "epoch": 0.715811037308337, + "grad_norm": 0.2886696672161575, + "learning_rate": 9.426169323868766e-05, + "loss": 3.0871, + "step": 11531 + }, + { + "epoch": 0.7158731144080949, + "grad_norm": 0.23588359419060473, + "learning_rate": 9.426001323641154e-05, + "loss": 3.149, + "step": 11532 + }, + { + "epoch": 0.7159351915078528, + "grad_norm": 0.2282640996544261, + "learning_rate": 9.42583330032202e-05, + "loss": 3.0334, + "step": 11533 + }, + { + "epoch": 0.7159972686076107, + "grad_norm": 0.2853756152282981, + "learning_rate": 9.425665253912239e-05, + "loss": 3.1643, + "step": 11534 + }, + { + "epoch": 0.7160593457073685, + "grad_norm": 0.24579487923472754, + "learning_rate": 9.425497184412688e-05, + "loss": 3.1486, + "step": 11535 + }, + { + "epoch": 0.7161214228071264, + "grad_norm": 0.25034810209509134, + "learning_rate": 9.425329091824244e-05, + "loss": 3.0872, + "step": 11536 + }, + { + "epoch": 0.7161834999068843, + "grad_norm": 0.3119069432429674, + "learning_rate": 9.425160976147782e-05, + "loss": 3.1716, + "step": 11537 + }, + { + "epoch": 0.7162455770066423, + "grad_norm": 0.2389318074664096, + "learning_rate": 9.424992837384183e-05, + "loss": 3.0373, + "step": 11538 + }, + { + "epoch": 0.7163076541064002, + "grad_norm": 0.22792139798268285, + "learning_rate": 9.424824675534322e-05, + "loss": 3.0511, + "step": 11539 + }, + { + "epoch": 0.7163697312061581, + "grad_norm": 0.19415524003647106, + "learning_rate": 9.424656490599075e-05, + "loss": 3.0516, + "step": 11540 + }, + { + "epoch": 0.7164318083059159, + "grad_norm": 0.2712130116588469, + "learning_rate": 9.424488282579322e-05, + "loss": 2.9496, + "step": 11541 + }, + { + "epoch": 0.7164938854056738, + "grad_norm": 0.21227116535076063, + "learning_rate": 9.42432005147594e-05, + "loss": 3.1757, + "step": 11542 + }, + { + "epoch": 0.7165559625054317, + "grad_norm": 0.22247399014703403, + "learning_rate": 9.424151797289806e-05, + "loss": 3.0943, + "step": 11543 + }, + { + "epoch": 0.7166180396051897, + "grad_norm": 0.2406732380806177, + "learning_rate": 9.423983520021798e-05, + "loss": 3.0873, + "step": 11544 + }, + { + "epoch": 0.7166801167049476, + "grad_norm": 0.23226888413745775, + "learning_rate": 9.423815219672793e-05, + "loss": 3.166, + "step": 11545 + }, + { + "epoch": 0.7167421938047055, + "grad_norm": 0.21896583263598227, + "learning_rate": 9.423646896243672e-05, + "loss": 3.1439, + "step": 11546 + }, + { + "epoch": 0.7168042709044633, + "grad_norm": 0.26239470118305785, + "learning_rate": 9.42347854973531e-05, + "loss": 3.0595, + "step": 11547 + }, + { + "epoch": 0.7168663480042212, + "grad_norm": 0.32084892294936135, + "learning_rate": 9.423310180148586e-05, + "loss": 3.1093, + "step": 11548 + }, + { + "epoch": 0.7169284251039791, + "grad_norm": 0.18416186961518782, + "learning_rate": 9.42314178748438e-05, + "loss": 3.0346, + "step": 11549 + }, + { + "epoch": 0.7169905022037371, + "grad_norm": 0.22092261413972902, + "learning_rate": 9.422973371743571e-05, + "loss": 3.148, + "step": 11550 + }, + { + "epoch": 0.717052579303495, + "grad_norm": 0.19208200047920784, + "learning_rate": 9.422804932927034e-05, + "loss": 3.0847, + "step": 11551 + }, + { + "epoch": 0.7171146564032529, + "grad_norm": 0.19964452198945626, + "learning_rate": 9.422636471035651e-05, + "loss": 3.1121, + "step": 11552 + }, + { + "epoch": 0.7171767335030107, + "grad_norm": 0.1992807852703895, + "learning_rate": 9.422467986070298e-05, + "loss": 3.0364, + "step": 11553 + }, + { + "epoch": 0.7172388106027686, + "grad_norm": 0.17166812866496534, + "learning_rate": 9.422299478031858e-05, + "loss": 3.1059, + "step": 11554 + }, + { + "epoch": 0.7173008877025265, + "grad_norm": 0.2947860430236108, + "learning_rate": 9.422130946921209e-05, + "loss": 3.0393, + "step": 11555 + }, + { + "epoch": 0.7173629648022845, + "grad_norm": 0.1979090705886227, + "learning_rate": 9.421962392739227e-05, + "loss": 3.1261, + "step": 11556 + }, + { + "epoch": 0.7174250419020424, + "grad_norm": 0.27704293768035587, + "learning_rate": 9.421793815486794e-05, + "loss": 3.0541, + "step": 11557 + }, + { + "epoch": 0.7174871190018003, + "grad_norm": 0.190106229683817, + "learning_rate": 9.421625215164789e-05, + "loss": 3.0406, + "step": 11558 + }, + { + "epoch": 0.7175491961015581, + "grad_norm": 0.22879955389487436, + "learning_rate": 9.421456591774094e-05, + "loss": 3.1327, + "step": 11559 + }, + { + "epoch": 0.717611273201316, + "grad_norm": 0.2465766603999125, + "learning_rate": 9.421287945315585e-05, + "loss": 3.0906, + "step": 11560 + }, + { + "epoch": 0.7176733503010739, + "grad_norm": 0.22728130164552982, + "learning_rate": 9.421119275790142e-05, + "loss": 3.1304, + "step": 11561 + }, + { + "epoch": 0.7177354274008318, + "grad_norm": 0.21629824301450945, + "learning_rate": 9.420950583198649e-05, + "loss": 3.0687, + "step": 11562 + }, + { + "epoch": 0.7177975045005898, + "grad_norm": 0.1838600175771266, + "learning_rate": 9.420781867541983e-05, + "loss": 3.2198, + "step": 11563 + }, + { + "epoch": 0.7178595816003477, + "grad_norm": 0.18080516150951043, + "learning_rate": 9.420613128821026e-05, + "loss": 3.0777, + "step": 11564 + }, + { + "epoch": 0.7179216587001055, + "grad_norm": 0.19014969807404494, + "learning_rate": 9.420444367036654e-05, + "loss": 3.1397, + "step": 11565 + }, + { + "epoch": 0.7179837357998634, + "grad_norm": 0.17787293912064978, + "learning_rate": 9.420275582189753e-05, + "loss": 3.1537, + "step": 11566 + }, + { + "epoch": 0.7180458128996213, + "grad_norm": 0.3145946160665762, + "learning_rate": 9.420106774281201e-05, + "loss": 3.1593, + "step": 11567 + }, + { + "epoch": 0.7181078899993792, + "grad_norm": 0.18269260909368387, + "learning_rate": 9.419937943311878e-05, + "loss": 3.1548, + "step": 11568 + }, + { + "epoch": 0.7181699670991372, + "grad_norm": 0.2438736103928441, + "learning_rate": 9.419769089282667e-05, + "loss": 3.0121, + "step": 11569 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.206434821274598, + "learning_rate": 9.419600212194446e-05, + "loss": 3.0557, + "step": 11570 + }, + { + "epoch": 0.7182941212986529, + "grad_norm": 0.27885178800919025, + "learning_rate": 9.419431312048099e-05, + "loss": 3.1751, + "step": 11571 + }, + { + "epoch": 0.7183561983984108, + "grad_norm": 0.2406794449283716, + "learning_rate": 9.419262388844507e-05, + "loss": 3.0546, + "step": 11572 + }, + { + "epoch": 0.7184182754981687, + "grad_norm": 0.21593009089996, + "learning_rate": 9.419093442584547e-05, + "loss": 3.0861, + "step": 11573 + }, + { + "epoch": 0.7184803525979266, + "grad_norm": 0.32813173241747584, + "learning_rate": 9.418924473269108e-05, + "loss": 3.0532, + "step": 11574 + }, + { + "epoch": 0.7185424296976846, + "grad_norm": 0.2597926132982307, + "learning_rate": 9.418755480899064e-05, + "loss": 3.0432, + "step": 11575 + }, + { + "epoch": 0.7186045067974425, + "grad_norm": 0.22383009025949238, + "learning_rate": 9.418586465475301e-05, + "loss": 3.0815, + "step": 11576 + }, + { + "epoch": 0.7186665838972003, + "grad_norm": 0.18811919372887997, + "learning_rate": 9.4184174269987e-05, + "loss": 3.1426, + "step": 11577 + }, + { + "epoch": 0.7187286609969582, + "grad_norm": 0.20805610848255587, + "learning_rate": 9.418248365470144e-05, + "loss": 2.9867, + "step": 11578 + }, + { + "epoch": 0.7187907380967161, + "grad_norm": 0.18424647547263603, + "learning_rate": 9.418079280890511e-05, + "loss": 3.0592, + "step": 11579 + }, + { + "epoch": 0.718852815196474, + "grad_norm": 0.17887144749723838, + "learning_rate": 9.417910173260687e-05, + "loss": 3.1438, + "step": 11580 + }, + { + "epoch": 0.718914892296232, + "grad_norm": 0.17980399709068012, + "learning_rate": 9.417741042581554e-05, + "loss": 3.081, + "step": 11581 + }, + { + "epoch": 0.7189769693959899, + "grad_norm": 0.24981455332322808, + "learning_rate": 9.417571888853994e-05, + "loss": 3.1379, + "step": 11582 + }, + { + "epoch": 0.7190390464957477, + "grad_norm": 0.20665860377306228, + "learning_rate": 9.417402712078886e-05, + "loss": 3.0341, + "step": 11583 + }, + { + "epoch": 0.7191011235955056, + "grad_norm": 0.20456869339243014, + "learning_rate": 9.417233512257118e-05, + "loss": 3.0513, + "step": 11584 + }, + { + "epoch": 0.7191632006952635, + "grad_norm": 0.20204026096661615, + "learning_rate": 9.41706428938957e-05, + "loss": 3.1521, + "step": 11585 + }, + { + "epoch": 0.7192252777950214, + "grad_norm": 0.18452967847369714, + "learning_rate": 9.416895043477125e-05, + "loss": 3.0484, + "step": 11586 + }, + { + "epoch": 0.7192873548947794, + "grad_norm": 0.1865632179540369, + "learning_rate": 9.416725774520667e-05, + "loss": 3.0721, + "step": 11587 + }, + { + "epoch": 0.7193494319945373, + "grad_norm": 0.20379616736497067, + "learning_rate": 9.416556482521079e-05, + "loss": 3.1695, + "step": 11588 + }, + { + "epoch": 0.7194115090942951, + "grad_norm": 0.19122856027755183, + "learning_rate": 9.416387167479242e-05, + "loss": 3.0357, + "step": 11589 + }, + { + "epoch": 0.719473586194053, + "grad_norm": 0.23109374727890086, + "learning_rate": 9.416217829396042e-05, + "loss": 3.0728, + "step": 11590 + }, + { + "epoch": 0.7195356632938109, + "grad_norm": 0.19107147055728588, + "learning_rate": 9.416048468272361e-05, + "loss": 3.1436, + "step": 11591 + }, + { + "epoch": 0.7195977403935688, + "grad_norm": 0.2088901360773064, + "learning_rate": 9.415879084109084e-05, + "loss": 3.0906, + "step": 11592 + }, + { + "epoch": 0.7196598174933267, + "grad_norm": 0.1997636560873392, + "learning_rate": 9.415709676907092e-05, + "loss": 3.1863, + "step": 11593 + }, + { + "epoch": 0.7197218945930847, + "grad_norm": 0.20699752676909902, + "learning_rate": 9.415540246667272e-05, + "loss": 3.108, + "step": 11594 + }, + { + "epoch": 0.7197839716928425, + "grad_norm": 0.20752149693175184, + "learning_rate": 9.415370793390508e-05, + "loss": 3.1289, + "step": 11595 + }, + { + "epoch": 0.7198460487926004, + "grad_norm": 0.21596004743764885, + "learning_rate": 9.41520131707768e-05, + "loss": 3.0554, + "step": 11596 + }, + { + "epoch": 0.7199081258923583, + "grad_norm": 0.22235073075960382, + "learning_rate": 9.415031817729676e-05, + "loss": 3.0885, + "step": 11597 + }, + { + "epoch": 0.7199702029921162, + "grad_norm": 0.21034541745361576, + "learning_rate": 9.414862295347379e-05, + "loss": 3.1034, + "step": 11598 + }, + { + "epoch": 0.7200322800918741, + "grad_norm": 0.23428629150095223, + "learning_rate": 9.414692749931675e-05, + "loss": 3.1538, + "step": 11599 + }, + { + "epoch": 0.7200943571916321, + "grad_norm": 0.294152614455541, + "learning_rate": 9.414523181483447e-05, + "loss": 3.068, + "step": 11600 + }, + { + "epoch": 0.7201564342913899, + "grad_norm": 0.29684781903767976, + "learning_rate": 9.414353590003578e-05, + "loss": 3.0556, + "step": 11601 + }, + { + "epoch": 0.7202185113911478, + "grad_norm": 0.21802580541313887, + "learning_rate": 9.414183975492957e-05, + "loss": 3.1247, + "step": 11602 + }, + { + "epoch": 0.7202805884909057, + "grad_norm": 0.23499648823295435, + "learning_rate": 9.414014337952465e-05, + "loss": 3.1516, + "step": 11603 + }, + { + "epoch": 0.7203426655906636, + "grad_norm": 0.22778782323667707, + "learning_rate": 9.413844677382989e-05, + "loss": 3.0965, + "step": 11604 + }, + { + "epoch": 0.7204047426904215, + "grad_norm": 0.3813170397553424, + "learning_rate": 9.413674993785413e-05, + "loss": 3.1597, + "step": 11605 + }, + { + "epoch": 0.7204668197901795, + "grad_norm": 0.25636082206704086, + "learning_rate": 9.413505287160624e-05, + "loss": 3.1043, + "step": 11606 + }, + { + "epoch": 0.7205288968899373, + "grad_norm": 0.1993872827226317, + "learning_rate": 9.413335557509507e-05, + "loss": 3.1236, + "step": 11607 + }, + { + "epoch": 0.7205909739896952, + "grad_norm": 0.21092316790740556, + "learning_rate": 9.413165804832946e-05, + "loss": 3.1147, + "step": 11608 + }, + { + "epoch": 0.7206530510894531, + "grad_norm": 0.22353355003836445, + "learning_rate": 9.412996029131829e-05, + "loss": 3.009, + "step": 11609 + }, + { + "epoch": 0.720715128189211, + "grad_norm": 0.2845686515178394, + "learning_rate": 9.41282623040704e-05, + "loss": 3.1229, + "step": 11610 + }, + { + "epoch": 0.7207772052889689, + "grad_norm": 0.2197778514765817, + "learning_rate": 9.412656408659465e-05, + "loss": 3.1381, + "step": 11611 + }, + { + "epoch": 0.7208392823887269, + "grad_norm": 0.24344601045797806, + "learning_rate": 9.412486563889991e-05, + "loss": 3.1301, + "step": 11612 + }, + { + "epoch": 0.7209013594884847, + "grad_norm": 0.21186671629046971, + "learning_rate": 9.412316696099502e-05, + "loss": 3.1238, + "step": 11613 + }, + { + "epoch": 0.7209634365882426, + "grad_norm": 0.19308980781616733, + "learning_rate": 9.412146805288886e-05, + "loss": 3.0375, + "step": 11614 + }, + { + "epoch": 0.7210255136880005, + "grad_norm": 0.24941492909495241, + "learning_rate": 9.41197689145903e-05, + "loss": 3.0664, + "step": 11615 + }, + { + "epoch": 0.7210875907877584, + "grad_norm": 0.1947394834495901, + "learning_rate": 9.411806954610818e-05, + "loss": 2.9996, + "step": 11616 + }, + { + "epoch": 0.7211496678875163, + "grad_norm": 0.24664732568995387, + "learning_rate": 9.41163699474514e-05, + "loss": 3.1229, + "step": 11617 + }, + { + "epoch": 0.7212117449872741, + "grad_norm": 0.22071756467591402, + "learning_rate": 9.41146701186288e-05, + "loss": 3.0664, + "step": 11618 + }, + { + "epoch": 0.7212738220870321, + "grad_norm": 0.24440692111412027, + "learning_rate": 9.411297005964925e-05, + "loss": 3.0261, + "step": 11619 + }, + { + "epoch": 0.72133589918679, + "grad_norm": 0.22558786135427022, + "learning_rate": 9.411126977052163e-05, + "loss": 3.0252, + "step": 11620 + }, + { + "epoch": 0.7213979762865479, + "grad_norm": 0.18394363827043708, + "learning_rate": 9.41095692512548e-05, + "loss": 3.0748, + "step": 11621 + }, + { + "epoch": 0.7214600533863058, + "grad_norm": 0.22703726123221157, + "learning_rate": 9.410786850185767e-05, + "loss": 3.0159, + "step": 11622 + }, + { + "epoch": 0.7215221304860637, + "grad_norm": 0.26722232803538476, + "learning_rate": 9.410616752233906e-05, + "loss": 3.0867, + "step": 11623 + }, + { + "epoch": 0.7215842075858215, + "grad_norm": 0.18062486996695515, + "learning_rate": 9.410446631270787e-05, + "loss": 3.1112, + "step": 11624 + }, + { + "epoch": 0.7216462846855795, + "grad_norm": 0.24763526707276462, + "learning_rate": 9.410276487297296e-05, + "loss": 3.1507, + "step": 11625 + }, + { + "epoch": 0.7217083617853374, + "grad_norm": 0.29096919207764177, + "learning_rate": 9.410106320314323e-05, + "loss": 3.1346, + "step": 11626 + }, + { + "epoch": 0.7217704388850953, + "grad_norm": 0.2368517662470494, + "learning_rate": 9.409936130322754e-05, + "loss": 3.0501, + "step": 11627 + }, + { + "epoch": 0.7218325159848532, + "grad_norm": 0.2244061753337031, + "learning_rate": 9.409765917323479e-05, + "loss": 3.1134, + "step": 11628 + }, + { + "epoch": 0.7218945930846111, + "grad_norm": 0.2167337342839193, + "learning_rate": 9.409595681317383e-05, + "loss": 3.1609, + "step": 11629 + }, + { + "epoch": 0.7219566701843689, + "grad_norm": 0.16699040283677205, + "learning_rate": 9.409425422305358e-05, + "loss": 2.9982, + "step": 11630 + }, + { + "epoch": 0.7220187472841268, + "grad_norm": 0.21998495379657035, + "learning_rate": 9.409255140288289e-05, + "loss": 3.1477, + "step": 11631 + }, + { + "epoch": 0.7220808243838848, + "grad_norm": 0.19348448741940075, + "learning_rate": 9.409084835267065e-05, + "loss": 3.1567, + "step": 11632 + }, + { + "epoch": 0.7221429014836427, + "grad_norm": 0.19534909859125404, + "learning_rate": 9.408914507242577e-05, + "loss": 3.152, + "step": 11633 + }, + { + "epoch": 0.7222049785834006, + "grad_norm": 0.21703998578302497, + "learning_rate": 9.40874415621571e-05, + "loss": 3.2198, + "step": 11634 + }, + { + "epoch": 0.7222670556831585, + "grad_norm": 0.1940574183697113, + "learning_rate": 9.408573782187355e-05, + "loss": 3.12, + "step": 11635 + }, + { + "epoch": 0.7223291327829163, + "grad_norm": 0.26797622803688614, + "learning_rate": 9.4084033851584e-05, + "loss": 3.1445, + "step": 11636 + }, + { + "epoch": 0.7223912098826742, + "grad_norm": 0.2183077889449332, + "learning_rate": 9.408232965129735e-05, + "loss": 3.0817, + "step": 11637 + }, + { + "epoch": 0.7224532869824322, + "grad_norm": 0.1830267062861506, + "learning_rate": 9.408062522102246e-05, + "loss": 3.0827, + "step": 11638 + }, + { + "epoch": 0.7225153640821901, + "grad_norm": 0.2019321740681016, + "learning_rate": 9.407892056076828e-05, + "loss": 3.117, + "step": 11639 + }, + { + "epoch": 0.722577441181948, + "grad_norm": 0.23563863728721163, + "learning_rate": 9.407721567054367e-05, + "loss": 3.1634, + "step": 11640 + }, + { + "epoch": 0.7226395182817059, + "grad_norm": 0.18767593990386314, + "learning_rate": 9.407551055035751e-05, + "loss": 3.0547, + "step": 11641 + }, + { + "epoch": 0.7227015953814637, + "grad_norm": 0.19839341169474528, + "learning_rate": 9.407380520021871e-05, + "loss": 3.1046, + "step": 11642 + }, + { + "epoch": 0.7227636724812216, + "grad_norm": 0.17919431258717303, + "learning_rate": 9.407209962013617e-05, + "loss": 3.0738, + "step": 11643 + }, + { + "epoch": 0.7228257495809796, + "grad_norm": 0.19687092829838324, + "learning_rate": 9.407039381011879e-05, + "loss": 3.0895, + "step": 11644 + }, + { + "epoch": 0.7228878266807375, + "grad_norm": 0.19997173944126403, + "learning_rate": 9.406868777017548e-05, + "loss": 3.0488, + "step": 11645 + }, + { + "epoch": 0.7229499037804954, + "grad_norm": 0.1828957946646366, + "learning_rate": 9.406698150031512e-05, + "loss": 3.1096, + "step": 11646 + }, + { + "epoch": 0.7230119808802533, + "grad_norm": 0.21168663196537796, + "learning_rate": 9.40652750005466e-05, + "loss": 3.1541, + "step": 11647 + }, + { + "epoch": 0.7230740579800111, + "grad_norm": 0.19906024611198683, + "learning_rate": 9.406356827087888e-05, + "loss": 3.1391, + "step": 11648 + }, + { + "epoch": 0.723136135079769, + "grad_norm": 0.18982028776954382, + "learning_rate": 9.40618613113208e-05, + "loss": 3.1974, + "step": 11649 + }, + { + "epoch": 0.723198212179527, + "grad_norm": 0.19210229551124341, + "learning_rate": 9.406015412188129e-05, + "loss": 3.0587, + "step": 11650 + }, + { + "epoch": 0.7232602892792849, + "grad_norm": 0.2845541070370244, + "learning_rate": 9.405844670256928e-05, + "loss": 3.1804, + "step": 11651 + }, + { + "epoch": 0.7233223663790428, + "grad_norm": 0.18508612999083462, + "learning_rate": 9.405673905339365e-05, + "loss": 3.1389, + "step": 11652 + }, + { + "epoch": 0.7233844434788007, + "grad_norm": 0.18225174307741834, + "learning_rate": 9.405503117436332e-05, + "loss": 2.9744, + "step": 11653 + }, + { + "epoch": 0.7234465205785585, + "grad_norm": 0.17670424582290947, + "learning_rate": 9.405332306548719e-05, + "loss": 3.0824, + "step": 11654 + }, + { + "epoch": 0.7235085976783164, + "grad_norm": 0.17588233160721523, + "learning_rate": 9.405161472677419e-05, + "loss": 3.174, + "step": 11655 + }, + { + "epoch": 0.7235706747780744, + "grad_norm": 0.18229667565866264, + "learning_rate": 9.404990615823321e-05, + "loss": 3.1838, + "step": 11656 + }, + { + "epoch": 0.7236327518778323, + "grad_norm": 0.18109809700765434, + "learning_rate": 9.404819735987319e-05, + "loss": 2.9638, + "step": 11657 + }, + { + "epoch": 0.7236948289775902, + "grad_norm": 0.17699756485333418, + "learning_rate": 9.404648833170302e-05, + "loss": 3.1246, + "step": 11658 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.18644556946639843, + "learning_rate": 9.404477907373163e-05, + "loss": 3.1049, + "step": 11659 + }, + { + "epoch": 0.7238189831771059, + "grad_norm": 0.18888500409673864, + "learning_rate": 9.404306958596794e-05, + "loss": 3.1381, + "step": 11660 + }, + { + "epoch": 0.7238810602768638, + "grad_norm": 0.1763357635080751, + "learning_rate": 9.404135986842086e-05, + "loss": 3.1752, + "step": 11661 + }, + { + "epoch": 0.7239431373766217, + "grad_norm": 0.20227135500963572, + "learning_rate": 9.40396499210993e-05, + "loss": 3.0004, + "step": 11662 + }, + { + "epoch": 0.7240052144763797, + "grad_norm": 0.22250641415758904, + "learning_rate": 9.403793974401222e-05, + "loss": 3.0891, + "step": 11663 + }, + { + "epoch": 0.7240672915761376, + "grad_norm": 0.1900566359546429, + "learning_rate": 9.40362293371685e-05, + "loss": 3.0891, + "step": 11664 + }, + { + "epoch": 0.7241293686758955, + "grad_norm": 0.18508915912976634, + "learning_rate": 9.403451870057709e-05, + "loss": 3.075, + "step": 11665 + }, + { + "epoch": 0.7241914457756533, + "grad_norm": 0.18196685781777475, + "learning_rate": 9.403280783424691e-05, + "loss": 3.1311, + "step": 11666 + }, + { + "epoch": 0.7242535228754112, + "grad_norm": 0.2370773686609565, + "learning_rate": 9.403109673818687e-05, + "loss": 3.1399, + "step": 11667 + }, + { + "epoch": 0.7243155999751691, + "grad_norm": 0.17022470367991024, + "learning_rate": 9.402938541240592e-05, + "loss": 3.0768, + "step": 11668 + }, + { + "epoch": 0.7243776770749271, + "grad_norm": 0.17040527577229278, + "learning_rate": 9.402767385691298e-05, + "loss": 3.0688, + "step": 11669 + }, + { + "epoch": 0.724439754174685, + "grad_norm": 0.2506466043583191, + "learning_rate": 9.402596207171697e-05, + "loss": 3.0975, + "step": 11670 + }, + { + "epoch": 0.7245018312744429, + "grad_norm": 0.15212356109377462, + "learning_rate": 9.402425005682683e-05, + "loss": 3.1066, + "step": 11671 + }, + { + "epoch": 0.7245639083742007, + "grad_norm": 0.16576070286967745, + "learning_rate": 9.402253781225148e-05, + "loss": 3.1189, + "step": 11672 + }, + { + "epoch": 0.7246259854739586, + "grad_norm": 0.2569045638732184, + "learning_rate": 9.402082533799987e-05, + "loss": 3.1465, + "step": 11673 + }, + { + "epoch": 0.7246880625737165, + "grad_norm": 0.1750302910134011, + "learning_rate": 9.401911263408094e-05, + "loss": 3.017, + "step": 11674 + }, + { + "epoch": 0.7247501396734745, + "grad_norm": 0.23767084357795415, + "learning_rate": 9.40173997005036e-05, + "loss": 3.1204, + "step": 11675 + }, + { + "epoch": 0.7248122167732324, + "grad_norm": 0.26882697433584607, + "learning_rate": 9.401568653727682e-05, + "loss": 3.1962, + "step": 11676 + }, + { + "epoch": 0.7248742938729903, + "grad_norm": 0.18402411361761228, + "learning_rate": 9.401397314440949e-05, + "loss": 3.238, + "step": 11677 + }, + { + "epoch": 0.7249363709727481, + "grad_norm": 0.18451313364312816, + "learning_rate": 9.40122595219106e-05, + "loss": 3.0652, + "step": 11678 + }, + { + "epoch": 0.724998448072506, + "grad_norm": 0.23036681361667286, + "learning_rate": 9.401054566978905e-05, + "loss": 3.1557, + "step": 11679 + }, + { + "epoch": 0.7250605251722639, + "grad_norm": 0.19621478521062866, + "learning_rate": 9.40088315880538e-05, + "loss": 3.0828, + "step": 11680 + }, + { + "epoch": 0.7251226022720219, + "grad_norm": 0.17841173358413992, + "learning_rate": 9.400711727671381e-05, + "loss": 3.0985, + "step": 11681 + }, + { + "epoch": 0.7251846793717798, + "grad_norm": 0.22876557105645887, + "learning_rate": 9.400540273577799e-05, + "loss": 3.1261, + "step": 11682 + }, + { + "epoch": 0.7252467564715377, + "grad_norm": 0.21322076030523188, + "learning_rate": 9.400368796525532e-05, + "loss": 3.0279, + "step": 11683 + }, + { + "epoch": 0.7253088335712955, + "grad_norm": 0.24615309014979425, + "learning_rate": 9.40019729651547e-05, + "loss": 3.0408, + "step": 11684 + }, + { + "epoch": 0.7253709106710534, + "grad_norm": 0.17773640809625507, + "learning_rate": 9.400025773548512e-05, + "loss": 3.0669, + "step": 11685 + }, + { + "epoch": 0.7254329877708113, + "grad_norm": 0.2857030172871988, + "learning_rate": 9.39985422762555e-05, + "loss": 3.092, + "step": 11686 + }, + { + "epoch": 0.7254950648705693, + "grad_norm": 0.3172111516506041, + "learning_rate": 9.399682658747484e-05, + "loss": 3.1657, + "step": 11687 + }, + { + "epoch": 0.7255571419703272, + "grad_norm": 0.20310185373916873, + "learning_rate": 9.399511066915204e-05, + "loss": 3.092, + "step": 11688 + }, + { + "epoch": 0.7256192190700851, + "grad_norm": 0.26615408488332776, + "learning_rate": 9.399339452129605e-05, + "loss": 3.0742, + "step": 11689 + }, + { + "epoch": 0.7256812961698429, + "grad_norm": 0.363277004586301, + "learning_rate": 9.399167814391585e-05, + "loss": 3.1137, + "step": 11690 + }, + { + "epoch": 0.7257433732696008, + "grad_norm": 0.2138516233171888, + "learning_rate": 9.398996153702037e-05, + "loss": 3.0326, + "step": 11691 + }, + { + "epoch": 0.7258054503693587, + "grad_norm": 0.2149994805934088, + "learning_rate": 9.398824470061861e-05, + "loss": 3.1152, + "step": 11692 + }, + { + "epoch": 0.7258675274691166, + "grad_norm": 0.2329353782704567, + "learning_rate": 9.398652763471949e-05, + "loss": 3.138, + "step": 11693 + }, + { + "epoch": 0.7259296045688746, + "grad_norm": 0.21510334803692555, + "learning_rate": 9.398481033933197e-05, + "loss": 2.9738, + "step": 11694 + }, + { + "epoch": 0.7259916816686325, + "grad_norm": 0.2213917429202052, + "learning_rate": 9.398309281446503e-05, + "loss": 3.0881, + "step": 11695 + }, + { + "epoch": 0.7260537587683903, + "grad_norm": 0.2851441916437569, + "learning_rate": 9.39813750601276e-05, + "loss": 3.1167, + "step": 11696 + }, + { + "epoch": 0.7261158358681482, + "grad_norm": 0.19333165865993585, + "learning_rate": 9.397965707632868e-05, + "loss": 2.9863, + "step": 11697 + }, + { + "epoch": 0.7261779129679061, + "grad_norm": 0.20055320197758858, + "learning_rate": 9.397793886307719e-05, + "loss": 3.0198, + "step": 11698 + }, + { + "epoch": 0.726239990067664, + "grad_norm": 0.19707527986131826, + "learning_rate": 9.397622042038212e-05, + "loss": 3.1348, + "step": 11699 + }, + { + "epoch": 0.726302067167422, + "grad_norm": 0.28596259773046956, + "learning_rate": 9.397450174825245e-05, + "loss": 3.0046, + "step": 11700 + }, + { + "epoch": 0.7263641442671799, + "grad_norm": 0.2267901329381373, + "learning_rate": 9.397278284669712e-05, + "loss": 3.1097, + "step": 11701 + }, + { + "epoch": 0.7264262213669377, + "grad_norm": 0.26633326685790837, + "learning_rate": 9.39710637157251e-05, + "loss": 3.1417, + "step": 11702 + }, + { + "epoch": 0.7264882984666956, + "grad_norm": 0.21243685737909962, + "learning_rate": 9.396934435534538e-05, + "loss": 3.0396, + "step": 11703 + }, + { + "epoch": 0.7265503755664535, + "grad_norm": 0.19768899864933653, + "learning_rate": 9.396762476556691e-05, + "loss": 3.0946, + "step": 11704 + }, + { + "epoch": 0.7266124526662114, + "grad_norm": 0.22240262058422064, + "learning_rate": 9.396590494639867e-05, + "loss": 3.1392, + "step": 11705 + }, + { + "epoch": 0.7266745297659694, + "grad_norm": 0.4293234409932385, + "learning_rate": 9.396418489784963e-05, + "loss": 3.061, + "step": 11706 + }, + { + "epoch": 0.7267366068657273, + "grad_norm": 0.20787136993633584, + "learning_rate": 9.396246461992876e-05, + "loss": 3.0818, + "step": 11707 + }, + { + "epoch": 0.7267986839654851, + "grad_norm": 0.21296844460009973, + "learning_rate": 9.396074411264504e-05, + "loss": 3.0956, + "step": 11708 + }, + { + "epoch": 0.726860761065243, + "grad_norm": 0.22636139922357038, + "learning_rate": 9.395902337600746e-05, + "loss": 3.0635, + "step": 11709 + }, + { + "epoch": 0.7269228381650009, + "grad_norm": 0.29702882796839686, + "learning_rate": 9.395730241002495e-05, + "loss": 3.1398, + "step": 11710 + }, + { + "epoch": 0.7269849152647588, + "grad_norm": 0.2310992845203973, + "learning_rate": 9.395558121470656e-05, + "loss": 3.0413, + "step": 11711 + }, + { + "epoch": 0.7270469923645168, + "grad_norm": 0.24540464104353638, + "learning_rate": 9.395385979006122e-05, + "loss": 3.0503, + "step": 11712 + }, + { + "epoch": 0.7271090694642747, + "grad_norm": 0.22294240265234036, + "learning_rate": 9.395213813609791e-05, + "loss": 3.1847, + "step": 11713 + }, + { + "epoch": 0.7271711465640325, + "grad_norm": 0.23516621552702763, + "learning_rate": 9.395041625282563e-05, + "loss": 3.1072, + "step": 11714 + }, + { + "epoch": 0.7272332236637904, + "grad_norm": 0.23637967296249876, + "learning_rate": 9.394869414025338e-05, + "loss": 3.1147, + "step": 11715 + }, + { + "epoch": 0.7272953007635483, + "grad_norm": 0.2884585928308037, + "learning_rate": 9.39469717983901e-05, + "loss": 3.0343, + "step": 11716 + }, + { + "epoch": 0.7273573778633062, + "grad_norm": 0.2653788800766733, + "learning_rate": 9.394524922724481e-05, + "loss": 3.1229, + "step": 11717 + }, + { + "epoch": 0.7274194549630641, + "grad_norm": 0.3918787671931738, + "learning_rate": 9.394352642682649e-05, + "loss": 3.0894, + "step": 11718 + }, + { + "epoch": 0.7274815320628221, + "grad_norm": 0.30638173588367756, + "learning_rate": 9.394180339714411e-05, + "loss": 3.1555, + "step": 11719 + }, + { + "epoch": 0.7275436091625799, + "grad_norm": 0.21242003970039047, + "learning_rate": 9.39400801382067e-05, + "loss": 3.0329, + "step": 11720 + }, + { + "epoch": 0.7276056862623378, + "grad_norm": 0.2742466127200357, + "learning_rate": 9.39383566500232e-05, + "loss": 3.1201, + "step": 11721 + }, + { + "epoch": 0.7276677633620957, + "grad_norm": 0.25035372683880686, + "learning_rate": 9.393663293260264e-05, + "loss": 3.1389, + "step": 11722 + }, + { + "epoch": 0.7277298404618536, + "grad_norm": 0.3245930521839912, + "learning_rate": 9.3934908985954e-05, + "loss": 3.1148, + "step": 11723 + }, + { + "epoch": 0.7277919175616115, + "grad_norm": 0.28475331880328403, + "learning_rate": 9.393318481008626e-05, + "loss": 2.9686, + "step": 11724 + }, + { + "epoch": 0.7278539946613695, + "grad_norm": 0.344275246657777, + "learning_rate": 9.393146040500845e-05, + "loss": 2.9965, + "step": 11725 + }, + { + "epoch": 0.7279160717611273, + "grad_norm": 0.2944011765279662, + "learning_rate": 9.392973577072953e-05, + "loss": 3.0947, + "step": 11726 + }, + { + "epoch": 0.7279781488608852, + "grad_norm": 0.2508846069971194, + "learning_rate": 9.392801090725852e-05, + "loss": 3.2028, + "step": 11727 + }, + { + "epoch": 0.7280402259606431, + "grad_norm": 0.2939368085169704, + "learning_rate": 9.392628581460441e-05, + "loss": 3.0655, + "step": 11728 + }, + { + "epoch": 0.728102303060401, + "grad_norm": 0.2253922791312887, + "learning_rate": 9.392456049277622e-05, + "loss": 3.0947, + "step": 11729 + }, + { + "epoch": 0.7281643801601589, + "grad_norm": 0.21456705829479827, + "learning_rate": 9.392283494178292e-05, + "loss": 3.121, + "step": 11730 + }, + { + "epoch": 0.7282264572599169, + "grad_norm": 0.2814296704829106, + "learning_rate": 9.392110916163352e-05, + "loss": 3.0966, + "step": 11731 + }, + { + "epoch": 0.7282885343596747, + "grad_norm": 0.21106121179528142, + "learning_rate": 9.391938315233705e-05, + "loss": 3.0478, + "step": 11732 + }, + { + "epoch": 0.7283506114594326, + "grad_norm": 0.2210887729772043, + "learning_rate": 9.391765691390248e-05, + "loss": 3.0052, + "step": 11733 + }, + { + "epoch": 0.7284126885591905, + "grad_norm": 0.27245493509660795, + "learning_rate": 9.391593044633886e-05, + "loss": 3.1408, + "step": 11734 + }, + { + "epoch": 0.7284747656589484, + "grad_norm": 0.21930110005928202, + "learning_rate": 9.391420374965514e-05, + "loss": 3.1612, + "step": 11735 + }, + { + "epoch": 0.7285368427587063, + "grad_norm": 0.32836249249049626, + "learning_rate": 9.391247682386038e-05, + "loss": 3.0767, + "step": 11736 + }, + { + "epoch": 0.7285989198584643, + "grad_norm": 0.25413451571176554, + "learning_rate": 9.391074966896354e-05, + "loss": 3.1466, + "step": 11737 + }, + { + "epoch": 0.7286609969582221, + "grad_norm": 0.33439093935044295, + "learning_rate": 9.390902228497368e-05, + "loss": 3.1465, + "step": 11738 + }, + { + "epoch": 0.72872307405798, + "grad_norm": 0.2638551851898748, + "learning_rate": 9.390729467189978e-05, + "loss": 3.0879, + "step": 11739 + }, + { + "epoch": 0.7287851511577379, + "grad_norm": 0.24992495393898584, + "learning_rate": 9.390556682975088e-05, + "loss": 3.0565, + "step": 11740 + }, + { + "epoch": 0.7288472282574958, + "grad_norm": 0.21741968340905146, + "learning_rate": 9.390383875853597e-05, + "loss": 3.1143, + "step": 11741 + }, + { + "epoch": 0.7289093053572537, + "grad_norm": 0.19363633944293093, + "learning_rate": 9.390211045826409e-05, + "loss": 2.9963, + "step": 11742 + }, + { + "epoch": 0.7289713824570117, + "grad_norm": 0.27523987614344747, + "learning_rate": 9.390038192894422e-05, + "loss": 3.1024, + "step": 11743 + }, + { + "epoch": 0.7290334595567695, + "grad_norm": 0.1907698991259051, + "learning_rate": 9.389865317058541e-05, + "loss": 3.2224, + "step": 11744 + }, + { + "epoch": 0.7290955366565274, + "grad_norm": 0.3496870348895732, + "learning_rate": 9.389692418319667e-05, + "loss": 3.1484, + "step": 11745 + }, + { + "epoch": 0.7291576137562853, + "grad_norm": 0.2686175186176931, + "learning_rate": 9.389519496678701e-05, + "loss": 3.184, + "step": 11746 + }, + { + "epoch": 0.7292196908560432, + "grad_norm": 0.20454074753055956, + "learning_rate": 9.389346552136548e-05, + "loss": 3.1717, + "step": 11747 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.1999015606826182, + "learning_rate": 9.389173584694106e-05, + "loss": 2.9979, + "step": 11748 + }, + { + "epoch": 0.729343845055559, + "grad_norm": 0.25220962892936194, + "learning_rate": 9.389000594352282e-05, + "loss": 3.1072, + "step": 11749 + }, + { + "epoch": 0.7294059221553169, + "grad_norm": 0.2276409150419653, + "learning_rate": 9.388827581111976e-05, + "loss": 3.0328, + "step": 11750 + }, + { + "epoch": 0.7294679992550748, + "grad_norm": 0.18200207709801353, + "learning_rate": 9.388654544974092e-05, + "loss": 3.0698, + "step": 11751 + }, + { + "epoch": 0.7295300763548327, + "grad_norm": 0.21079766972342306, + "learning_rate": 9.38848148593953e-05, + "loss": 3.1357, + "step": 11752 + }, + { + "epoch": 0.7295921534545906, + "grad_norm": 0.16382581804695023, + "learning_rate": 9.388308404009197e-05, + "loss": 3.082, + "step": 11753 + }, + { + "epoch": 0.7296542305543485, + "grad_norm": 0.20944301688971606, + "learning_rate": 9.38813529918399e-05, + "loss": 3.0598, + "step": 11754 + }, + { + "epoch": 0.7297163076541064, + "grad_norm": 0.17992510841965836, + "learning_rate": 9.387962171464819e-05, + "loss": 3.1148, + "step": 11755 + }, + { + "epoch": 0.7297783847538643, + "grad_norm": 0.19621517441868602, + "learning_rate": 9.387789020852584e-05, + "loss": 3.0996, + "step": 11756 + }, + { + "epoch": 0.7298404618536222, + "grad_norm": 0.1786223019782563, + "learning_rate": 9.387615847348188e-05, + "loss": 3.0721, + "step": 11757 + }, + { + "epoch": 0.7299025389533801, + "grad_norm": 0.2462018471924531, + "learning_rate": 9.387442650952535e-05, + "loss": 3.0653, + "step": 11758 + }, + { + "epoch": 0.729964616053138, + "grad_norm": 0.21964649861033075, + "learning_rate": 9.387269431666528e-05, + "loss": 3.0625, + "step": 11759 + }, + { + "epoch": 0.7300266931528959, + "grad_norm": 0.285721232231879, + "learning_rate": 9.387096189491072e-05, + "loss": 3.131, + "step": 11760 + }, + { + "epoch": 0.7300887702526538, + "grad_norm": 0.2502727957345621, + "learning_rate": 9.386922924427071e-05, + "loss": 3.1304, + "step": 11761 + }, + { + "epoch": 0.7301508473524116, + "grad_norm": 0.20607630900983398, + "learning_rate": 9.386749636475427e-05, + "loss": 3.0513, + "step": 11762 + }, + { + "epoch": 0.7302129244521696, + "grad_norm": 0.19854565586560272, + "learning_rate": 9.386576325637044e-05, + "loss": 3.0736, + "step": 11763 + }, + { + "epoch": 0.7302750015519275, + "grad_norm": 0.20069240170197844, + "learning_rate": 9.38640299191283e-05, + "loss": 3.0887, + "step": 11764 + }, + { + "epoch": 0.7303370786516854, + "grad_norm": 0.2135410783728583, + "learning_rate": 9.386229635303685e-05, + "loss": 3.1203, + "step": 11765 + }, + { + "epoch": 0.7303991557514433, + "grad_norm": 0.22974934880555592, + "learning_rate": 9.386056255810516e-05, + "loss": 3.1593, + "step": 11766 + }, + { + "epoch": 0.7304612328512012, + "grad_norm": 0.20849181445552203, + "learning_rate": 9.385882853434227e-05, + "loss": 3.1312, + "step": 11767 + }, + { + "epoch": 0.730523309950959, + "grad_norm": 0.2114322476356242, + "learning_rate": 9.385709428175721e-05, + "loss": 3.0722, + "step": 11768 + }, + { + "epoch": 0.730585387050717, + "grad_norm": 0.3970666940564557, + "learning_rate": 9.385535980035905e-05, + "loss": 3.0338, + "step": 11769 + }, + { + "epoch": 0.7306474641504749, + "grad_norm": 0.18157612260345135, + "learning_rate": 9.385362509015684e-05, + "loss": 3.0869, + "step": 11770 + }, + { + "epoch": 0.7307095412502328, + "grad_norm": 0.26922686984701993, + "learning_rate": 9.385189015115962e-05, + "loss": 3.0715, + "step": 11771 + }, + { + "epoch": 0.7307716183499907, + "grad_norm": 0.34748811620865333, + "learning_rate": 9.385015498337644e-05, + "loss": 3.0416, + "step": 11772 + }, + { + "epoch": 0.7308336954497486, + "grad_norm": 0.19787672961666544, + "learning_rate": 9.384841958681635e-05, + "loss": 3.0216, + "step": 11773 + }, + { + "epoch": 0.7308957725495064, + "grad_norm": 0.20843251172320726, + "learning_rate": 9.384668396148841e-05, + "loss": 3.0692, + "step": 11774 + }, + { + "epoch": 0.7309578496492644, + "grad_norm": 0.20925390385503465, + "learning_rate": 9.384494810740169e-05, + "loss": 3.0353, + "step": 11775 + }, + { + "epoch": 0.7310199267490223, + "grad_norm": 0.21282073282290145, + "learning_rate": 9.384321202456523e-05, + "loss": 3.0667, + "step": 11776 + }, + { + "epoch": 0.7310820038487802, + "grad_norm": 0.2106563407406557, + "learning_rate": 9.384147571298807e-05, + "loss": 3.1434, + "step": 11777 + }, + { + "epoch": 0.7311440809485381, + "grad_norm": 0.22408780617438992, + "learning_rate": 9.38397391726793e-05, + "loss": 3.1133, + "step": 11778 + }, + { + "epoch": 0.731206158048296, + "grad_norm": 0.2158261639246271, + "learning_rate": 9.383800240364798e-05, + "loss": 3.0425, + "step": 11779 + }, + { + "epoch": 0.7312682351480538, + "grad_norm": 0.22695522963948145, + "learning_rate": 9.383626540590315e-05, + "loss": 3.1077, + "step": 11780 + }, + { + "epoch": 0.7313303122478118, + "grad_norm": 0.21224648029534057, + "learning_rate": 9.383452817945388e-05, + "loss": 3.0726, + "step": 11781 + }, + { + "epoch": 0.7313923893475697, + "grad_norm": 0.18096386546092577, + "learning_rate": 9.383279072430924e-05, + "loss": 3.1447, + "step": 11782 + }, + { + "epoch": 0.7314544664473276, + "grad_norm": 0.20469148610143104, + "learning_rate": 9.383105304047828e-05, + "loss": 3.0746, + "step": 11783 + }, + { + "epoch": 0.7315165435470855, + "grad_norm": 0.19361788875531497, + "learning_rate": 9.382931512797009e-05, + "loss": 3.2037, + "step": 11784 + }, + { + "epoch": 0.7315786206468434, + "grad_norm": 0.1686380988877028, + "learning_rate": 9.38275769867937e-05, + "loss": 3.114, + "step": 11785 + }, + { + "epoch": 0.7316406977466012, + "grad_norm": 0.19220748890458975, + "learning_rate": 9.382583861695822e-05, + "loss": 3.13, + "step": 11786 + }, + { + "epoch": 0.7317027748463591, + "grad_norm": 0.1952872069193947, + "learning_rate": 9.38241000184727e-05, + "loss": 2.9996, + "step": 11787 + }, + { + "epoch": 0.7317648519461171, + "grad_norm": 0.42890388821644715, + "learning_rate": 9.382236119134622e-05, + "loss": 3.1595, + "step": 11788 + }, + { + "epoch": 0.731826929045875, + "grad_norm": 0.22555769706426151, + "learning_rate": 9.382062213558782e-05, + "loss": 3.0487, + "step": 11789 + }, + { + "epoch": 0.7318890061456329, + "grad_norm": 0.35826183075147405, + "learning_rate": 9.381888285120661e-05, + "loss": 3.0736, + "step": 11790 + }, + { + "epoch": 0.7319510832453908, + "grad_norm": 0.25885799137771964, + "learning_rate": 9.381714333821166e-05, + "loss": 3.0628, + "step": 11791 + }, + { + "epoch": 0.7320131603451486, + "grad_norm": 0.25972862266931723, + "learning_rate": 9.381540359661202e-05, + "loss": 3.011, + "step": 11792 + }, + { + "epoch": 0.7320752374449065, + "grad_norm": 0.3219535935381614, + "learning_rate": 9.381366362641679e-05, + "loss": 3.0514, + "step": 11793 + }, + { + "epoch": 0.7321373145446645, + "grad_norm": 0.21509662831254028, + "learning_rate": 9.381192342763504e-05, + "loss": 3.1147, + "step": 11794 + }, + { + "epoch": 0.7321993916444224, + "grad_norm": 0.2118869008179403, + "learning_rate": 9.381018300027585e-05, + "loss": 3.1241, + "step": 11795 + }, + { + "epoch": 0.7322614687441803, + "grad_norm": 0.1843362969655491, + "learning_rate": 9.380844234434829e-05, + "loss": 3.0619, + "step": 11796 + }, + { + "epoch": 0.7323235458439382, + "grad_norm": 0.28819557302067605, + "learning_rate": 9.380670145986147e-05, + "loss": 3.0292, + "step": 11797 + }, + { + "epoch": 0.732385622943696, + "grad_norm": 0.2079935919686979, + "learning_rate": 9.380496034682444e-05, + "loss": 3.0555, + "step": 11798 + }, + { + "epoch": 0.7324477000434539, + "grad_norm": 0.4761121216382065, + "learning_rate": 9.380321900524631e-05, + "loss": 2.9994, + "step": 11799 + }, + { + "epoch": 0.7325097771432119, + "grad_norm": 0.22610319376205756, + "learning_rate": 9.380147743513614e-05, + "loss": 3.0378, + "step": 11800 + }, + { + "epoch": 0.7325718542429698, + "grad_norm": 0.28963309753501076, + "learning_rate": 9.379973563650304e-05, + "loss": 3.0766, + "step": 11801 + }, + { + "epoch": 0.7326339313427277, + "grad_norm": 0.2536704068483585, + "learning_rate": 9.379799360935608e-05, + "loss": 3.1265, + "step": 11802 + }, + { + "epoch": 0.7326960084424856, + "grad_norm": 0.2684104254528062, + "learning_rate": 9.379625135370435e-05, + "loss": 3.06, + "step": 11803 + }, + { + "epoch": 0.7327580855422434, + "grad_norm": 0.21796967769411713, + "learning_rate": 9.379450886955696e-05, + "loss": 3.0815, + "step": 11804 + }, + { + "epoch": 0.7328201626420013, + "grad_norm": 0.214582310175694, + "learning_rate": 9.379276615692296e-05, + "loss": 3.0723, + "step": 11805 + }, + { + "epoch": 0.7328822397417593, + "grad_norm": 0.1896289324330681, + "learning_rate": 9.379102321581149e-05, + "loss": 3.1692, + "step": 11806 + }, + { + "epoch": 0.7329443168415172, + "grad_norm": 0.26909004256538926, + "learning_rate": 9.378928004623163e-05, + "loss": 3.0699, + "step": 11807 + }, + { + "epoch": 0.7330063939412751, + "grad_norm": 0.19106172188752332, + "learning_rate": 9.378753664819244e-05, + "loss": 3.1287, + "step": 11808 + }, + { + "epoch": 0.733068471041033, + "grad_norm": 0.21626201153896205, + "learning_rate": 9.378579302170304e-05, + "loss": 3.0658, + "step": 11809 + }, + { + "epoch": 0.7331305481407908, + "grad_norm": 0.20268211315888413, + "learning_rate": 9.378404916677255e-05, + "loss": 3.1291, + "step": 11810 + }, + { + "epoch": 0.7331926252405487, + "grad_norm": 0.20911763301033326, + "learning_rate": 9.378230508341004e-05, + "loss": 3.0318, + "step": 11811 + }, + { + "epoch": 0.7332547023403067, + "grad_norm": 0.16831595000206276, + "learning_rate": 9.378056077162462e-05, + "loss": 3.1397, + "step": 11812 + }, + { + "epoch": 0.7333167794400646, + "grad_norm": 0.199328140070401, + "learning_rate": 9.377881623142537e-05, + "loss": 3.034, + "step": 11813 + }, + { + "epoch": 0.7333788565398225, + "grad_norm": 0.19235509101473156, + "learning_rate": 9.377707146282143e-05, + "loss": 3.1477, + "step": 11814 + }, + { + "epoch": 0.7334409336395804, + "grad_norm": 0.1982078616486361, + "learning_rate": 9.377532646582185e-05, + "loss": 3.0722, + "step": 11815 + }, + { + "epoch": 0.7335030107393382, + "grad_norm": 0.19665813384766984, + "learning_rate": 9.37735812404358e-05, + "loss": 3.0425, + "step": 11816 + }, + { + "epoch": 0.7335650878390961, + "grad_norm": 0.18589623337906083, + "learning_rate": 9.377183578667233e-05, + "loss": 2.9474, + "step": 11817 + }, + { + "epoch": 0.733627164938854, + "grad_norm": 0.21690890708112578, + "learning_rate": 9.377009010454056e-05, + "loss": 3.0235, + "step": 11818 + }, + { + "epoch": 0.733689242038612, + "grad_norm": 0.19763815046829114, + "learning_rate": 9.376834419404963e-05, + "loss": 3.0016, + "step": 11819 + }, + { + "epoch": 0.7337513191383699, + "grad_norm": 0.24902452954819526, + "learning_rate": 9.376659805520861e-05, + "loss": 3.0958, + "step": 11820 + }, + { + "epoch": 0.7338133962381278, + "grad_norm": 0.2353977063161868, + "learning_rate": 9.376485168802663e-05, + "loss": 3.0933, + "step": 11821 + }, + { + "epoch": 0.7338754733378856, + "grad_norm": 0.22518273853622206, + "learning_rate": 9.376310509251278e-05, + "loss": 3.0724, + "step": 11822 + }, + { + "epoch": 0.7339375504376435, + "grad_norm": 0.19944696818082203, + "learning_rate": 9.376135826867619e-05, + "loss": 3.1131, + "step": 11823 + }, + { + "epoch": 0.7339996275374014, + "grad_norm": 0.20010895145645766, + "learning_rate": 9.375961121652597e-05, + "loss": 3.044, + "step": 11824 + }, + { + "epoch": 0.7340617046371594, + "grad_norm": 0.23038925804335458, + "learning_rate": 9.375786393607125e-05, + "loss": 3.1522, + "step": 11825 + }, + { + "epoch": 0.7341237817369173, + "grad_norm": 0.17773833637224537, + "learning_rate": 9.375611642732111e-05, + "loss": 3.0726, + "step": 11826 + }, + { + "epoch": 0.7341858588366752, + "grad_norm": 0.20876982183629805, + "learning_rate": 9.375436869028469e-05, + "loss": 3.1235, + "step": 11827 + }, + { + "epoch": 0.734247935936433, + "grad_norm": 0.18407797366256587, + "learning_rate": 9.375262072497113e-05, + "loss": 3.0905, + "step": 11828 + }, + { + "epoch": 0.7343100130361909, + "grad_norm": 0.17413705526931378, + "learning_rate": 9.375087253138951e-05, + "loss": 3.1256, + "step": 11829 + }, + { + "epoch": 0.7343720901359488, + "grad_norm": 0.5358715150211273, + "learning_rate": 9.374912410954897e-05, + "loss": 3.005, + "step": 11830 + }, + { + "epoch": 0.7344341672357068, + "grad_norm": 0.24843320273035674, + "learning_rate": 9.374737545945862e-05, + "loss": 3.0562, + "step": 11831 + }, + { + "epoch": 0.7344962443354647, + "grad_norm": 0.24743225515529954, + "learning_rate": 9.37456265811276e-05, + "loss": 2.9855, + "step": 11832 + }, + { + "epoch": 0.7345583214352226, + "grad_norm": 0.26425465358472877, + "learning_rate": 9.374387747456503e-05, + "loss": 3.16, + "step": 11833 + }, + { + "epoch": 0.7346203985349804, + "grad_norm": 0.21524998665354353, + "learning_rate": 9.374212813978002e-05, + "loss": 3.0419, + "step": 11834 + }, + { + "epoch": 0.7346824756347383, + "grad_norm": 0.3123590628384974, + "learning_rate": 9.374037857678173e-05, + "loss": 3.0499, + "step": 11835 + }, + { + "epoch": 0.7347445527344962, + "grad_norm": 0.2616096383007852, + "learning_rate": 9.373862878557926e-05, + "loss": 3.0372, + "step": 11836 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.2295458953994394, + "learning_rate": 9.373687876618174e-05, + "loss": 3.0729, + "step": 11837 + }, + { + "epoch": 0.7348687069340121, + "grad_norm": 0.21154873204474514, + "learning_rate": 9.37351285185983e-05, + "loss": 3.0259, + "step": 11838 + }, + { + "epoch": 0.73493078403377, + "grad_norm": 0.22505396917574588, + "learning_rate": 9.373337804283809e-05, + "loss": 2.9932, + "step": 11839 + }, + { + "epoch": 0.7349928611335278, + "grad_norm": 0.22839874147512695, + "learning_rate": 9.37316273389102e-05, + "loss": 3.127, + "step": 11840 + }, + { + "epoch": 0.7350549382332857, + "grad_norm": 0.22131964563944387, + "learning_rate": 9.372987640682382e-05, + "loss": 3.1999, + "step": 11841 + }, + { + "epoch": 0.7351170153330436, + "grad_norm": 0.27645010091966404, + "learning_rate": 9.372812524658808e-05, + "loss": 3.0275, + "step": 11842 + }, + { + "epoch": 0.7351790924328016, + "grad_norm": 0.23638335284115766, + "learning_rate": 9.372637385821205e-05, + "loss": 3.1975, + "step": 11843 + }, + { + "epoch": 0.7352411695325595, + "grad_norm": 0.33756589996724756, + "learning_rate": 9.372462224170494e-05, + "loss": 3.0985, + "step": 11844 + }, + { + "epoch": 0.7353032466323174, + "grad_norm": 0.2769009991703212, + "learning_rate": 9.372287039707586e-05, + "loss": 3.024, + "step": 11845 + }, + { + "epoch": 0.7353653237320752, + "grad_norm": 0.27425344524810163, + "learning_rate": 9.372111832433394e-05, + "loss": 3.0846, + "step": 11846 + }, + { + "epoch": 0.7354274008318331, + "grad_norm": 0.361100511772334, + "learning_rate": 9.371936602348835e-05, + "loss": 3.0949, + "step": 11847 + }, + { + "epoch": 0.735489477931591, + "grad_norm": 0.2757582949413594, + "learning_rate": 9.37176134945482e-05, + "loss": 3.1743, + "step": 11848 + }, + { + "epoch": 0.735551555031349, + "grad_norm": 0.22229320643272163, + "learning_rate": 9.371586073752264e-05, + "loss": 3.1331, + "step": 11849 + }, + { + "epoch": 0.7356136321311069, + "grad_norm": 0.2703136477142841, + "learning_rate": 9.371410775242084e-05, + "loss": 3.1472, + "step": 11850 + }, + { + "epoch": 0.7356757092308648, + "grad_norm": 0.24593112507110976, + "learning_rate": 9.37123545392519e-05, + "loss": 3.0334, + "step": 11851 + }, + { + "epoch": 0.7357377863306226, + "grad_norm": 0.24496041562014603, + "learning_rate": 9.371060109802503e-05, + "loss": 3.0494, + "step": 11852 + }, + { + "epoch": 0.7357998634303805, + "grad_norm": 0.24200531813840934, + "learning_rate": 9.370884742874933e-05, + "loss": 3.0911, + "step": 11853 + }, + { + "epoch": 0.7358619405301384, + "grad_norm": 0.18811855013616938, + "learning_rate": 9.370709353143397e-05, + "loss": 2.9776, + "step": 11854 + }, + { + "epoch": 0.7359240176298963, + "grad_norm": 0.22771474089482346, + "learning_rate": 9.370533940608807e-05, + "loss": 3.1585, + "step": 11855 + }, + { + "epoch": 0.7359860947296543, + "grad_norm": 0.18299971514755237, + "learning_rate": 9.370358505272082e-05, + "loss": 3.078, + "step": 11856 + }, + { + "epoch": 0.7360481718294122, + "grad_norm": 0.20071976050381324, + "learning_rate": 9.370183047134136e-05, + "loss": 3.1542, + "step": 11857 + }, + { + "epoch": 0.73611024892917, + "grad_norm": 0.21716025545228002, + "learning_rate": 9.370007566195884e-05, + "loss": 3.0191, + "step": 11858 + }, + { + "epoch": 0.7361723260289279, + "grad_norm": 0.195607125270036, + "learning_rate": 9.369832062458241e-05, + "loss": 3.1007, + "step": 11859 + }, + { + "epoch": 0.7362344031286858, + "grad_norm": 0.2025463132378909, + "learning_rate": 9.369656535922124e-05, + "loss": 3.0259, + "step": 11860 + }, + { + "epoch": 0.7362964802284437, + "grad_norm": 0.23708196478439197, + "learning_rate": 9.369480986588447e-05, + "loss": 3.1371, + "step": 11861 + }, + { + "epoch": 0.7363585573282017, + "grad_norm": 0.23734403213423935, + "learning_rate": 9.369305414458127e-05, + "loss": 3.0594, + "step": 11862 + }, + { + "epoch": 0.7364206344279596, + "grad_norm": 0.18254789782055283, + "learning_rate": 9.36912981953208e-05, + "loss": 3.0704, + "step": 11863 + }, + { + "epoch": 0.7364827115277174, + "grad_norm": 0.1952560724762339, + "learning_rate": 9.368954201811224e-05, + "loss": 3.0976, + "step": 11864 + }, + { + "epoch": 0.7365447886274753, + "grad_norm": 0.2218338434570914, + "learning_rate": 9.368778561296472e-05, + "loss": 3.0372, + "step": 11865 + }, + { + "epoch": 0.7366068657272332, + "grad_norm": 0.1803429507784997, + "learning_rate": 9.36860289798874e-05, + "loss": 3.0761, + "step": 11866 + }, + { + "epoch": 0.7366689428269911, + "grad_norm": 0.19831982863320033, + "learning_rate": 9.36842721188895e-05, + "loss": 3.0112, + "step": 11867 + }, + { + "epoch": 0.736731019926749, + "grad_norm": 0.16344229089414516, + "learning_rate": 9.368251502998012e-05, + "loss": 3.0952, + "step": 11868 + }, + { + "epoch": 0.736793097026507, + "grad_norm": 0.1945495294309105, + "learning_rate": 9.368075771316847e-05, + "loss": 3.0118, + "step": 11869 + }, + { + "epoch": 0.7368551741262648, + "grad_norm": 0.3037103119380678, + "learning_rate": 9.36790001684637e-05, + "loss": 3.161, + "step": 11870 + }, + { + "epoch": 0.7369172512260227, + "grad_norm": 0.199490119261559, + "learning_rate": 9.367724239587497e-05, + "loss": 3.1095, + "step": 11871 + }, + { + "epoch": 0.7369793283257806, + "grad_norm": 0.1824367161825762, + "learning_rate": 9.367548439541148e-05, + "loss": 3.1087, + "step": 11872 + }, + { + "epoch": 0.7370414054255385, + "grad_norm": 0.35751276997120257, + "learning_rate": 9.367372616708237e-05, + "loss": 3.0742, + "step": 11873 + }, + { + "epoch": 0.7371034825252964, + "grad_norm": 0.22124332382883977, + "learning_rate": 9.367196771089683e-05, + "loss": 3.1334, + "step": 11874 + }, + { + "epoch": 0.7371655596250544, + "grad_norm": 0.24842338821830331, + "learning_rate": 9.367020902686403e-05, + "loss": 3.1315, + "step": 11875 + }, + { + "epoch": 0.7372276367248122, + "grad_norm": 0.18885426403119593, + "learning_rate": 9.366845011499315e-05, + "loss": 3.0573, + "step": 11876 + }, + { + "epoch": 0.7372897138245701, + "grad_norm": 0.21078298348858548, + "learning_rate": 9.366669097529337e-05, + "loss": 3.0542, + "step": 11877 + }, + { + "epoch": 0.737351790924328, + "grad_norm": 0.2932759010018194, + "learning_rate": 9.366493160777388e-05, + "loss": 3.0767, + "step": 11878 + }, + { + "epoch": 0.7374138680240859, + "grad_norm": 0.2323035171220161, + "learning_rate": 9.366317201244381e-05, + "loss": 3.1824, + "step": 11879 + }, + { + "epoch": 0.7374759451238438, + "grad_norm": 0.29511136163481244, + "learning_rate": 9.36614121893124e-05, + "loss": 3.0901, + "step": 11880 + }, + { + "epoch": 0.7375380222236018, + "grad_norm": 0.24883021169652858, + "learning_rate": 9.365965213838879e-05, + "loss": 2.991, + "step": 11881 + }, + { + "epoch": 0.7376000993233596, + "grad_norm": 0.19701968810291945, + "learning_rate": 9.365789185968216e-05, + "loss": 3.0968, + "step": 11882 + }, + { + "epoch": 0.7376621764231175, + "grad_norm": 0.20215798323202724, + "learning_rate": 9.365613135320174e-05, + "loss": 3.1058, + "step": 11883 + }, + { + "epoch": 0.7377242535228754, + "grad_norm": 0.19692838754821512, + "learning_rate": 9.365437061895666e-05, + "loss": 3.0003, + "step": 11884 + }, + { + "epoch": 0.7377863306226333, + "grad_norm": 0.19747702913153334, + "learning_rate": 9.365260965695615e-05, + "loss": 3.0935, + "step": 11885 + }, + { + "epoch": 0.7378484077223912, + "grad_norm": 0.21260249912561832, + "learning_rate": 9.365084846720937e-05, + "loss": 3.0831, + "step": 11886 + }, + { + "epoch": 0.7379104848221492, + "grad_norm": 0.20708200444371408, + "learning_rate": 9.36490870497255e-05, + "loss": 3.1168, + "step": 11887 + }, + { + "epoch": 0.737972561921907, + "grad_norm": 0.20493347879730173, + "learning_rate": 9.364732540451378e-05, + "loss": 3.1058, + "step": 11888 + }, + { + "epoch": 0.7380346390216649, + "grad_norm": 0.24083236763165464, + "learning_rate": 9.364556353158335e-05, + "loss": 3.0715, + "step": 11889 + }, + { + "epoch": 0.7380967161214228, + "grad_norm": 0.21062583327482304, + "learning_rate": 9.364380143094342e-05, + "loss": 3.1759, + "step": 11890 + }, + { + "epoch": 0.7381587932211807, + "grad_norm": 0.2105646812397321, + "learning_rate": 9.364203910260319e-05, + "loss": 3.065, + "step": 11891 + }, + { + "epoch": 0.7382208703209386, + "grad_norm": 0.17934710976962706, + "learning_rate": 9.364027654657183e-05, + "loss": 3.073, + "step": 11892 + }, + { + "epoch": 0.7382829474206966, + "grad_norm": 0.1884447772643661, + "learning_rate": 9.363851376285858e-05, + "loss": 3.1286, + "step": 11893 + }, + { + "epoch": 0.7383450245204544, + "grad_norm": 0.18780918259634374, + "learning_rate": 9.363675075147258e-05, + "loss": 3.0636, + "step": 11894 + }, + { + "epoch": 0.7384071016202123, + "grad_norm": 0.2652133669279386, + "learning_rate": 9.363498751242308e-05, + "loss": 3.07, + "step": 11895 + }, + { + "epoch": 0.7384691787199702, + "grad_norm": 0.2037732344526442, + "learning_rate": 9.363322404571925e-05, + "loss": 3.0902, + "step": 11896 + }, + { + "epoch": 0.7385312558197281, + "grad_norm": 0.20367772754239616, + "learning_rate": 9.363146035137029e-05, + "loss": 3.1187, + "step": 11897 + }, + { + "epoch": 0.738593332919486, + "grad_norm": 0.2130911783610604, + "learning_rate": 9.362969642938541e-05, + "loss": 3.0787, + "step": 11898 + }, + { + "epoch": 0.738655410019244, + "grad_norm": 0.31387345569478314, + "learning_rate": 9.362793227977384e-05, + "loss": 3.1387, + "step": 11899 + }, + { + "epoch": 0.7387174871190018, + "grad_norm": 0.27623256194626994, + "learning_rate": 9.362616790254472e-05, + "loss": 3.03, + "step": 11900 + }, + { + "epoch": 0.7387795642187597, + "grad_norm": 0.2350174126489546, + "learning_rate": 9.362440329770732e-05, + "loss": 3.0717, + "step": 11901 + }, + { + "epoch": 0.7388416413185176, + "grad_norm": 0.2568464675891364, + "learning_rate": 9.362263846527081e-05, + "loss": 3.0309, + "step": 11902 + }, + { + "epoch": 0.7389037184182755, + "grad_norm": 0.21377604210193282, + "learning_rate": 9.36208734052444e-05, + "loss": 3.1122, + "step": 11903 + }, + { + "epoch": 0.7389657955180334, + "grad_norm": 0.26441910438954763, + "learning_rate": 9.36191081176373e-05, + "loss": 3.121, + "step": 11904 + }, + { + "epoch": 0.7390278726177913, + "grad_norm": 0.2099713522523711, + "learning_rate": 9.361734260245874e-05, + "loss": 3.021, + "step": 11905 + }, + { + "epoch": 0.7390899497175492, + "grad_norm": 0.22771189433091263, + "learning_rate": 9.361557685971789e-05, + "loss": 3.1996, + "step": 11906 + }, + { + "epoch": 0.7391520268173071, + "grad_norm": 0.222712660219026, + "learning_rate": 9.361381088942401e-05, + "loss": 3.2056, + "step": 11907 + }, + { + "epoch": 0.739214103917065, + "grad_norm": 0.18573013090039198, + "learning_rate": 9.361204469158627e-05, + "loss": 3.0654, + "step": 11908 + }, + { + "epoch": 0.7392761810168229, + "grad_norm": 0.2601996789373166, + "learning_rate": 9.361027826621392e-05, + "loss": 2.9688, + "step": 11909 + }, + { + "epoch": 0.7393382581165808, + "grad_norm": 0.2060147106568433, + "learning_rate": 9.360851161331617e-05, + "loss": 3.0456, + "step": 11910 + }, + { + "epoch": 0.7394003352163387, + "grad_norm": 0.18707478508149958, + "learning_rate": 9.36067447329022e-05, + "loss": 3.0887, + "step": 11911 + }, + { + "epoch": 0.7394624123160966, + "grad_norm": 0.23061703905978082, + "learning_rate": 9.360497762498129e-05, + "loss": 3.1053, + "step": 11912 + }, + { + "epoch": 0.7395244894158545, + "grad_norm": 0.22551696515601916, + "learning_rate": 9.360321028956261e-05, + "loss": 3.0808, + "step": 11913 + }, + { + "epoch": 0.7395865665156124, + "grad_norm": 0.19840018079171673, + "learning_rate": 9.36014427266554e-05, + "loss": 3.076, + "step": 11914 + }, + { + "epoch": 0.7396486436153703, + "grad_norm": 0.20401501988241377, + "learning_rate": 9.359967493626887e-05, + "loss": 3.1213, + "step": 11915 + }, + { + "epoch": 0.7397107207151282, + "grad_norm": 0.19357985632166302, + "learning_rate": 9.359790691841225e-05, + "loss": 3.0746, + "step": 11916 + }, + { + "epoch": 0.7397727978148861, + "grad_norm": 0.2682058808796007, + "learning_rate": 9.359613867309478e-05, + "loss": 3.1472, + "step": 11917 + }, + { + "epoch": 0.739834874914644, + "grad_norm": 0.21310317077320728, + "learning_rate": 9.359437020032567e-05, + "loss": 3.0472, + "step": 11918 + }, + { + "epoch": 0.7398969520144019, + "grad_norm": 0.22990626051076887, + "learning_rate": 9.359260150011416e-05, + "loss": 3.0461, + "step": 11919 + }, + { + "epoch": 0.7399590291141598, + "grad_norm": 0.21925849226671734, + "learning_rate": 9.359083257246944e-05, + "loss": 3.083, + "step": 11920 + }, + { + "epoch": 0.7400211062139177, + "grad_norm": 0.19550858191644596, + "learning_rate": 9.358906341740077e-05, + "loss": 3.0986, + "step": 11921 + }, + { + "epoch": 0.7400831833136756, + "grad_norm": 0.18250257802148312, + "learning_rate": 9.358729403491737e-05, + "loss": 3.0616, + "step": 11922 + }, + { + "epoch": 0.7401452604134335, + "grad_norm": 0.2311495005179316, + "learning_rate": 9.358552442502849e-05, + "loss": 3.123, + "step": 11923 + }, + { + "epoch": 0.7402073375131913, + "grad_norm": 0.17279193949012875, + "learning_rate": 9.358375458774334e-05, + "loss": 3.1033, + "step": 11924 + }, + { + "epoch": 0.7402694146129493, + "grad_norm": 0.2605659946821271, + "learning_rate": 9.358198452307117e-05, + "loss": 3.0783, + "step": 11925 + }, + { + "epoch": 0.7403314917127072, + "grad_norm": 0.21816197618871758, + "learning_rate": 9.35802142310212e-05, + "loss": 2.9491, + "step": 11926 + }, + { + "epoch": 0.7403935688124651, + "grad_norm": 0.20824628243099078, + "learning_rate": 9.357844371160267e-05, + "loss": 3.0131, + "step": 11927 + }, + { + "epoch": 0.740455645912223, + "grad_norm": 0.21631021262223993, + "learning_rate": 9.357667296482482e-05, + "loss": 3.1021, + "step": 11928 + }, + { + "epoch": 0.7405177230119809, + "grad_norm": 0.20810206610530652, + "learning_rate": 9.357490199069689e-05, + "loss": 3.1719, + "step": 11929 + }, + { + "epoch": 0.7405798001117387, + "grad_norm": 0.18182511343530888, + "learning_rate": 9.357313078922814e-05, + "loss": 3.0913, + "step": 11930 + }, + { + "epoch": 0.7406418772114967, + "grad_norm": 0.23089187380413045, + "learning_rate": 9.357135936042776e-05, + "loss": 3.0169, + "step": 11931 + }, + { + "epoch": 0.7407039543112546, + "grad_norm": 0.1906984926260448, + "learning_rate": 9.356958770430503e-05, + "loss": 3.0914, + "step": 11932 + }, + { + "epoch": 0.7407660314110125, + "grad_norm": 0.2546204426020035, + "learning_rate": 9.356781582086917e-05, + "loss": 3.1344, + "step": 11933 + }, + { + "epoch": 0.7408281085107704, + "grad_norm": 0.2171088285463312, + "learning_rate": 9.356604371012947e-05, + "loss": 3.0194, + "step": 11934 + }, + { + "epoch": 0.7408901856105283, + "grad_norm": 0.18976752748631126, + "learning_rate": 9.356427137209512e-05, + "loss": 3.0757, + "step": 11935 + }, + { + "epoch": 0.7409522627102861, + "grad_norm": 0.22149647270950473, + "learning_rate": 9.35624988067754e-05, + "loss": 3.1675, + "step": 11936 + }, + { + "epoch": 0.741014339810044, + "grad_norm": 0.21522414603737997, + "learning_rate": 9.356072601417954e-05, + "loss": 3.0922, + "step": 11937 + }, + { + "epoch": 0.741076416909802, + "grad_norm": 0.21340961366282513, + "learning_rate": 9.35589529943168e-05, + "loss": 3.1681, + "step": 11938 + }, + { + "epoch": 0.7411384940095599, + "grad_norm": 0.20232380985999748, + "learning_rate": 9.355717974719644e-05, + "loss": 3.1019, + "step": 11939 + }, + { + "epoch": 0.7412005711093178, + "grad_norm": 0.2059199086752519, + "learning_rate": 9.35554062728277e-05, + "loss": 3.0757, + "step": 11940 + }, + { + "epoch": 0.7412626482090757, + "grad_norm": 0.2805078303915193, + "learning_rate": 9.355363257121981e-05, + "loss": 3.0586, + "step": 11941 + }, + { + "epoch": 0.7413247253088335, + "grad_norm": 0.20275778773070305, + "learning_rate": 9.355185864238206e-05, + "loss": 3.0567, + "step": 11942 + }, + { + "epoch": 0.7413868024085914, + "grad_norm": 0.2392626781358131, + "learning_rate": 9.35500844863237e-05, + "loss": 3.1417, + "step": 11943 + }, + { + "epoch": 0.7414488795083494, + "grad_norm": 0.17356839760550227, + "learning_rate": 9.354831010305396e-05, + "loss": 3.0051, + "step": 11944 + }, + { + "epoch": 0.7415109566081073, + "grad_norm": 0.2018741091540259, + "learning_rate": 9.354653549258212e-05, + "loss": 3.0097, + "step": 11945 + }, + { + "epoch": 0.7415730337078652, + "grad_norm": 0.20326395966977123, + "learning_rate": 9.354476065491744e-05, + "loss": 3.0324, + "step": 11946 + }, + { + "epoch": 0.7416351108076231, + "grad_norm": 0.2211602633280419, + "learning_rate": 9.354298559006916e-05, + "loss": 3.1326, + "step": 11947 + }, + { + "epoch": 0.7416971879073809, + "grad_norm": 0.17509847067427065, + "learning_rate": 9.354121029804657e-05, + "loss": 3.0581, + "step": 11948 + }, + { + "epoch": 0.7417592650071388, + "grad_norm": 0.2153882928472262, + "learning_rate": 9.35394347788589e-05, + "loss": 3.0423, + "step": 11949 + }, + { + "epoch": 0.7418213421068968, + "grad_norm": 0.1908358033685318, + "learning_rate": 9.353765903251543e-05, + "loss": 3.0537, + "step": 11950 + }, + { + "epoch": 0.7418834192066547, + "grad_norm": 0.21137552346785668, + "learning_rate": 9.353588305902543e-05, + "loss": 3.0284, + "step": 11951 + }, + { + "epoch": 0.7419454963064126, + "grad_norm": 0.18644350146507604, + "learning_rate": 9.353410685839816e-05, + "loss": 3.009, + "step": 11952 + }, + { + "epoch": 0.7420075734061705, + "grad_norm": 0.2205291416742586, + "learning_rate": 9.353233043064289e-05, + "loss": 3.0249, + "step": 11953 + }, + { + "epoch": 0.7420696505059283, + "grad_norm": 0.1885688536012624, + "learning_rate": 9.353055377576889e-05, + "loss": 3.0925, + "step": 11954 + }, + { + "epoch": 0.7421317276056862, + "grad_norm": 0.22885331039691026, + "learning_rate": 9.35287768937854e-05, + "loss": 2.994, + "step": 11955 + }, + { + "epoch": 0.7421938047054442, + "grad_norm": 0.16314656164457908, + "learning_rate": 9.352699978470174e-05, + "loss": 3.0732, + "step": 11956 + }, + { + "epoch": 0.7422558818052021, + "grad_norm": 0.17018880376831239, + "learning_rate": 9.352522244852712e-05, + "loss": 2.9044, + "step": 11957 + }, + { + "epoch": 0.74231795890496, + "grad_norm": 0.20005708900398955, + "learning_rate": 9.352344488527088e-05, + "loss": 3.0951, + "step": 11958 + }, + { + "epoch": 0.7423800360047179, + "grad_norm": 0.2150268901182824, + "learning_rate": 9.352166709494226e-05, + "loss": 3.1422, + "step": 11959 + }, + { + "epoch": 0.7424421131044757, + "grad_norm": 0.18700814305963245, + "learning_rate": 9.351988907755054e-05, + "loss": 3.1044, + "step": 11960 + }, + { + "epoch": 0.7425041902042336, + "grad_norm": 0.21037460208057343, + "learning_rate": 9.351811083310498e-05, + "loss": 3.021, + "step": 11961 + }, + { + "epoch": 0.7425662673039916, + "grad_norm": 0.20609346359383418, + "learning_rate": 9.351633236161488e-05, + "loss": 3.0631, + "step": 11962 + }, + { + "epoch": 0.7426283444037495, + "grad_norm": 0.2765452212523419, + "learning_rate": 9.35145536630895e-05, + "loss": 3.0623, + "step": 11963 + }, + { + "epoch": 0.7426904215035074, + "grad_norm": 0.38585900577976245, + "learning_rate": 9.351277473753813e-05, + "loss": 3.1285, + "step": 11964 + }, + { + "epoch": 0.7427524986032653, + "grad_norm": 0.21180662604792774, + "learning_rate": 9.351099558497006e-05, + "loss": 3.0848, + "step": 11965 + }, + { + "epoch": 0.7428145757030231, + "grad_norm": 0.2319967565361157, + "learning_rate": 9.350921620539457e-05, + "loss": 3.0711, + "step": 11966 + }, + { + "epoch": 0.742876652802781, + "grad_norm": 0.24239806281718687, + "learning_rate": 9.350743659882092e-05, + "loss": 2.99, + "step": 11967 + }, + { + "epoch": 0.742938729902539, + "grad_norm": 0.21102461958481342, + "learning_rate": 9.350565676525842e-05, + "loss": 3.0528, + "step": 11968 + }, + { + "epoch": 0.7430008070022969, + "grad_norm": 0.18549164401755078, + "learning_rate": 9.350387670471634e-05, + "loss": 3.0434, + "step": 11969 + }, + { + "epoch": 0.7430628841020548, + "grad_norm": 0.25582070094760745, + "learning_rate": 9.350209641720399e-05, + "loss": 2.9401, + "step": 11970 + }, + { + "epoch": 0.7431249612018127, + "grad_norm": 0.20880482884428414, + "learning_rate": 9.350031590273063e-05, + "loss": 3.1049, + "step": 11971 + }, + { + "epoch": 0.7431870383015705, + "grad_norm": 0.2180824140660035, + "learning_rate": 9.349853516130556e-05, + "loss": 3.0766, + "step": 11972 + }, + { + "epoch": 0.7432491154013284, + "grad_norm": 0.18286051323458283, + "learning_rate": 9.349675419293808e-05, + "loss": 3.0954, + "step": 11973 + }, + { + "epoch": 0.7433111925010863, + "grad_norm": 0.22078458582361898, + "learning_rate": 9.349497299763748e-05, + "loss": 3.1102, + "step": 11974 + }, + { + "epoch": 0.7433732696008443, + "grad_norm": 0.2505402985040673, + "learning_rate": 9.349319157541303e-05, + "loss": 3.1125, + "step": 11975 + }, + { + "epoch": 0.7434353467006022, + "grad_norm": 0.18634020208409835, + "learning_rate": 9.349140992627405e-05, + "loss": 2.9861, + "step": 11976 + }, + { + "epoch": 0.7434974238003601, + "grad_norm": 0.6149327825626442, + "learning_rate": 9.348962805022982e-05, + "loss": 3.1557, + "step": 11977 + }, + { + "epoch": 0.7435595009001179, + "grad_norm": 0.3508596387924599, + "learning_rate": 9.348784594728966e-05, + "loss": 3.0327, + "step": 11978 + }, + { + "epoch": 0.7436215779998758, + "grad_norm": 0.22554318737190615, + "learning_rate": 9.348606361746284e-05, + "loss": 2.9924, + "step": 11979 + }, + { + "epoch": 0.7436836550996337, + "grad_norm": 0.30432163181890043, + "learning_rate": 9.348428106075867e-05, + "loss": 3.0748, + "step": 11980 + }, + { + "epoch": 0.7437457321993917, + "grad_norm": 0.25637562917039763, + "learning_rate": 9.348249827718645e-05, + "loss": 3.0824, + "step": 11981 + }, + { + "epoch": 0.7438078092991496, + "grad_norm": 0.31308405656065136, + "learning_rate": 9.348071526675549e-05, + "loss": 3.0796, + "step": 11982 + }, + { + "epoch": 0.7438698863989075, + "grad_norm": 0.27119210517526743, + "learning_rate": 9.347893202947506e-05, + "loss": 2.9745, + "step": 11983 + }, + { + "epoch": 0.7439319634986653, + "grad_norm": 0.2223156448690912, + "learning_rate": 9.34771485653545e-05, + "loss": 3.0004, + "step": 11984 + }, + { + "epoch": 0.7439940405984232, + "grad_norm": 0.31819416510248116, + "learning_rate": 9.34753648744031e-05, + "loss": 3.2411, + "step": 11985 + }, + { + "epoch": 0.7440561176981811, + "grad_norm": 0.4633747096620344, + "learning_rate": 9.347358095663017e-05, + "loss": 3.1434, + "step": 11986 + }, + { + "epoch": 0.7441181947979391, + "grad_norm": 0.3315281815524857, + "learning_rate": 9.347179681204502e-05, + "loss": 3.1459, + "step": 11987 + }, + { + "epoch": 0.744180271897697, + "grad_norm": 0.4010277122282656, + "learning_rate": 9.347001244065694e-05, + "loss": 3.016, + "step": 11988 + }, + { + "epoch": 0.7442423489974548, + "grad_norm": 0.28224108173549617, + "learning_rate": 9.346822784247527e-05, + "loss": 3.1305, + "step": 11989 + }, + { + "epoch": 0.7443044260972127, + "grad_norm": 0.32366844952552587, + "learning_rate": 9.346644301750929e-05, + "loss": 3.1216, + "step": 11990 + }, + { + "epoch": 0.7443665031969706, + "grad_norm": 0.2392256111740771, + "learning_rate": 9.346465796576831e-05, + "loss": 3.0111, + "step": 11991 + }, + { + "epoch": 0.7444285802967285, + "grad_norm": 0.255956922383378, + "learning_rate": 9.346287268726168e-05, + "loss": 3.066, + "step": 11992 + }, + { + "epoch": 0.7444906573964865, + "grad_norm": 0.20390739513674908, + "learning_rate": 9.346108718199869e-05, + "loss": 3.0439, + "step": 11993 + }, + { + "epoch": 0.7445527344962444, + "grad_norm": 0.39905498378071047, + "learning_rate": 9.345930144998864e-05, + "loss": 3.1251, + "step": 11994 + }, + { + "epoch": 0.7446148115960022, + "grad_norm": 0.2331322544723106, + "learning_rate": 9.345751549124089e-05, + "loss": 3.1405, + "step": 11995 + }, + { + "epoch": 0.7446768886957601, + "grad_norm": 0.2744027810815296, + "learning_rate": 9.345572930576471e-05, + "loss": 3.1006, + "step": 11996 + }, + { + "epoch": 0.744738965795518, + "grad_norm": 0.18419136018465077, + "learning_rate": 9.345394289356946e-05, + "loss": 2.9839, + "step": 11997 + }, + { + "epoch": 0.7448010428952759, + "grad_norm": 0.22504388824887925, + "learning_rate": 9.345215625466444e-05, + "loss": 3.1527, + "step": 11998 + }, + { + "epoch": 0.7448631199950339, + "grad_norm": 0.2513916649864186, + "learning_rate": 9.345036938905895e-05, + "loss": 3.1428, + "step": 11999 + }, + { + "epoch": 0.7449251970947918, + "grad_norm": 0.22783799243360403, + "learning_rate": 9.344858229676237e-05, + "loss": 3.1327, + "step": 12000 + }, + { + "epoch": 0.7449872741945496, + "grad_norm": 0.19492845925503766, + "learning_rate": 9.344679497778396e-05, + "loss": 3.0538, + "step": 12001 + }, + { + "epoch": 0.7450493512943075, + "grad_norm": 0.22590425664909075, + "learning_rate": 9.344500743213307e-05, + "loss": 3.0584, + "step": 12002 + }, + { + "epoch": 0.7451114283940654, + "grad_norm": 0.3096070493446654, + "learning_rate": 9.344321965981905e-05, + "loss": 3.0207, + "step": 12003 + }, + { + "epoch": 0.7451735054938233, + "grad_norm": 0.22140715563204594, + "learning_rate": 9.34414316608512e-05, + "loss": 3.1401, + "step": 12004 + }, + { + "epoch": 0.7452355825935812, + "grad_norm": 0.2761245714005563, + "learning_rate": 9.343964343523886e-05, + "loss": 3.0942, + "step": 12005 + }, + { + "epoch": 0.7452976596933392, + "grad_norm": 0.22945098507084288, + "learning_rate": 9.343785498299134e-05, + "loss": 3.1783, + "step": 12006 + }, + { + "epoch": 0.745359736793097, + "grad_norm": 0.22940694159094446, + "learning_rate": 9.3436066304118e-05, + "loss": 3.1198, + "step": 12007 + }, + { + "epoch": 0.7454218138928549, + "grad_norm": 0.1800211317685092, + "learning_rate": 9.343427739862816e-05, + "loss": 3.0714, + "step": 12008 + }, + { + "epoch": 0.7454838909926128, + "grad_norm": 0.237366695701231, + "learning_rate": 9.343248826653113e-05, + "loss": 3.0739, + "step": 12009 + }, + { + "epoch": 0.7455459680923707, + "grad_norm": 0.26738857600240046, + "learning_rate": 9.343069890783627e-05, + "loss": 3.0551, + "step": 12010 + }, + { + "epoch": 0.7456080451921286, + "grad_norm": 0.17593125195021297, + "learning_rate": 9.342890932255293e-05, + "loss": 3.1361, + "step": 12011 + }, + { + "epoch": 0.7456701222918866, + "grad_norm": 0.20045217457405562, + "learning_rate": 9.34271195106904e-05, + "loss": 2.9438, + "step": 12012 + }, + { + "epoch": 0.7457321993916444, + "grad_norm": 0.19970178947005143, + "learning_rate": 9.342532947225808e-05, + "loss": 3.096, + "step": 12013 + }, + { + "epoch": 0.7457942764914023, + "grad_norm": 0.180604244329923, + "learning_rate": 9.342353920726525e-05, + "loss": 3.0554, + "step": 12014 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.1891946897952097, + "learning_rate": 9.342174871572127e-05, + "loss": 3.1083, + "step": 12015 + }, + { + "epoch": 0.7459184306909181, + "grad_norm": 0.18831067864353526, + "learning_rate": 9.341995799763549e-05, + "loss": 3.0807, + "step": 12016 + }, + { + "epoch": 0.745980507790676, + "grad_norm": 0.19576522808779143, + "learning_rate": 9.341816705301727e-05, + "loss": 3.0051, + "step": 12017 + }, + { + "epoch": 0.746042584890434, + "grad_norm": 0.23437398274518825, + "learning_rate": 9.34163758818759e-05, + "loss": 3.0402, + "step": 12018 + }, + { + "epoch": 0.7461046619901918, + "grad_norm": 0.1865150090255594, + "learning_rate": 9.341458448422079e-05, + "loss": 3.1282, + "step": 12019 + }, + { + "epoch": 0.7461667390899497, + "grad_norm": 0.20249313414966968, + "learning_rate": 9.341279286006123e-05, + "loss": 3.0511, + "step": 12020 + }, + { + "epoch": 0.7462288161897076, + "grad_norm": 0.16270031709518493, + "learning_rate": 9.34110010094066e-05, + "loss": 3.1482, + "step": 12021 + }, + { + "epoch": 0.7462908932894655, + "grad_norm": 0.18749049281812763, + "learning_rate": 9.340920893226624e-05, + "loss": 3.0686, + "step": 12022 + }, + { + "epoch": 0.7463529703892234, + "grad_norm": 0.17148158173252157, + "learning_rate": 9.340741662864949e-05, + "loss": 3.1164, + "step": 12023 + }, + { + "epoch": 0.7464150474889814, + "grad_norm": 0.17260588411889874, + "learning_rate": 9.340562409856572e-05, + "loss": 3.0937, + "step": 12024 + }, + { + "epoch": 0.7464771245887392, + "grad_norm": 0.23248406387647733, + "learning_rate": 9.340383134202429e-05, + "loss": 3.049, + "step": 12025 + }, + { + "epoch": 0.7465392016884971, + "grad_norm": 0.15325267782096474, + "learning_rate": 9.340203835903449e-05, + "loss": 3.0517, + "step": 12026 + }, + { + "epoch": 0.746601278788255, + "grad_norm": 0.16605196162471958, + "learning_rate": 9.340024514960574e-05, + "loss": 3.1261, + "step": 12027 + }, + { + "epoch": 0.7466633558880129, + "grad_norm": 0.15523506386138017, + "learning_rate": 9.33984517137474e-05, + "loss": 3.0673, + "step": 12028 + }, + { + "epoch": 0.7467254329877708, + "grad_norm": 0.3024085722046993, + "learning_rate": 9.339665805146878e-05, + "loss": 3.0937, + "step": 12029 + }, + { + "epoch": 0.7467875100875287, + "grad_norm": 0.18156859292461494, + "learning_rate": 9.339486416277925e-05, + "loss": 3.0385, + "step": 12030 + }, + { + "epoch": 0.7468495871872866, + "grad_norm": 0.1768596840350318, + "learning_rate": 9.33930700476882e-05, + "loss": 3.0243, + "step": 12031 + }, + { + "epoch": 0.7469116642870445, + "grad_norm": 0.2202997910777152, + "learning_rate": 9.339127570620495e-05, + "loss": 3.1346, + "step": 12032 + }, + { + "epoch": 0.7469737413868024, + "grad_norm": 0.22734195538104163, + "learning_rate": 9.33894811383389e-05, + "loss": 3.0967, + "step": 12033 + }, + { + "epoch": 0.7470358184865603, + "grad_norm": 0.1742227126213994, + "learning_rate": 9.338768634409938e-05, + "loss": 3.0534, + "step": 12034 + }, + { + "epoch": 0.7470978955863182, + "grad_norm": 0.20471982081958198, + "learning_rate": 9.338589132349577e-05, + "loss": 3.151, + "step": 12035 + }, + { + "epoch": 0.7471599726860761, + "grad_norm": 0.19415881841137428, + "learning_rate": 9.338409607653743e-05, + "loss": 3.0576, + "step": 12036 + }, + { + "epoch": 0.747222049785834, + "grad_norm": 0.2272545833010048, + "learning_rate": 9.338230060323375e-05, + "loss": 3.011, + "step": 12037 + }, + { + "epoch": 0.7472841268855919, + "grad_norm": 0.19605901152073443, + "learning_rate": 9.338050490359405e-05, + "loss": 3.0091, + "step": 12038 + }, + { + "epoch": 0.7473462039853498, + "grad_norm": 0.22060116801263951, + "learning_rate": 9.337870897762773e-05, + "loss": 3.0901, + "step": 12039 + }, + { + "epoch": 0.7474082810851077, + "grad_norm": 0.19238628829682042, + "learning_rate": 9.337691282534416e-05, + "loss": 3.0855, + "step": 12040 + }, + { + "epoch": 0.7474703581848656, + "grad_norm": 0.1717878738567601, + "learning_rate": 9.33751164467527e-05, + "loss": 2.9781, + "step": 12041 + }, + { + "epoch": 0.7475324352846235, + "grad_norm": 0.1763399939626298, + "learning_rate": 9.337331984186274e-05, + "loss": 2.8765, + "step": 12042 + }, + { + "epoch": 0.7475945123843813, + "grad_norm": 0.19313318335531215, + "learning_rate": 9.337152301068364e-05, + "loss": 3.1009, + "step": 12043 + }, + { + "epoch": 0.7476565894841393, + "grad_norm": 0.2074092616899593, + "learning_rate": 9.336972595322476e-05, + "loss": 3.0217, + "step": 12044 + }, + { + "epoch": 0.7477186665838972, + "grad_norm": 0.18171345880993764, + "learning_rate": 9.336792866949551e-05, + "loss": 2.941, + "step": 12045 + }, + { + "epoch": 0.7477807436836551, + "grad_norm": 0.20154621886508758, + "learning_rate": 9.336613115950523e-05, + "loss": 3.1119, + "step": 12046 + }, + { + "epoch": 0.747842820783413, + "grad_norm": 0.1660045620093205, + "learning_rate": 9.336433342326333e-05, + "loss": 3.1364, + "step": 12047 + }, + { + "epoch": 0.7479048978831709, + "grad_norm": 0.16254102734599968, + "learning_rate": 9.336253546077918e-05, + "loss": 2.9851, + "step": 12048 + }, + { + "epoch": 0.7479669749829287, + "grad_norm": 0.1730752564313312, + "learning_rate": 9.336073727206214e-05, + "loss": 3.1414, + "step": 12049 + }, + { + "epoch": 0.7480290520826867, + "grad_norm": 0.19466285428638536, + "learning_rate": 9.335893885712161e-05, + "loss": 3.0529, + "step": 12050 + }, + { + "epoch": 0.7480911291824446, + "grad_norm": 0.21191322748397645, + "learning_rate": 9.335714021596698e-05, + "loss": 3.101, + "step": 12051 + }, + { + "epoch": 0.7481532062822025, + "grad_norm": 0.21756224377405278, + "learning_rate": 9.335534134860761e-05, + "loss": 3.1683, + "step": 12052 + }, + { + "epoch": 0.7482152833819604, + "grad_norm": 0.20674638223250452, + "learning_rate": 9.335354225505291e-05, + "loss": 3.1626, + "step": 12053 + }, + { + "epoch": 0.7482773604817183, + "grad_norm": 0.19987192784430077, + "learning_rate": 9.335174293531226e-05, + "loss": 3.0521, + "step": 12054 + }, + { + "epoch": 0.7483394375814761, + "grad_norm": 0.1936558463914355, + "learning_rate": 9.334994338939504e-05, + "loss": 3.0673, + "step": 12055 + }, + { + "epoch": 0.7484015146812341, + "grad_norm": 0.16687838820614592, + "learning_rate": 9.334814361731063e-05, + "loss": 2.9412, + "step": 12056 + }, + { + "epoch": 0.748463591780992, + "grad_norm": 0.2240232781939285, + "learning_rate": 9.334634361906844e-05, + "loss": 3.11, + "step": 12057 + }, + { + "epoch": 0.7485256688807499, + "grad_norm": 0.20045498080401444, + "learning_rate": 9.334454339467787e-05, + "loss": 3.1312, + "step": 12058 + }, + { + "epoch": 0.7485877459805078, + "grad_norm": 0.209668101651381, + "learning_rate": 9.334274294414826e-05, + "loss": 3.0474, + "step": 12059 + }, + { + "epoch": 0.7486498230802657, + "grad_norm": 0.3150448651379505, + "learning_rate": 9.334094226748906e-05, + "loss": 3.0067, + "step": 12060 + }, + { + "epoch": 0.7487119001800235, + "grad_norm": 0.6976780032603636, + "learning_rate": 9.333914136470965e-05, + "loss": 3.037, + "step": 12061 + }, + { + "epoch": 0.7487739772797815, + "grad_norm": 0.20256745407063295, + "learning_rate": 9.33373402358194e-05, + "loss": 3.1554, + "step": 12062 + }, + { + "epoch": 0.7488360543795394, + "grad_norm": 0.26945171011854385, + "learning_rate": 9.333553888082774e-05, + "loss": 3.0145, + "step": 12063 + }, + { + "epoch": 0.7488981314792973, + "grad_norm": 0.2374084779424697, + "learning_rate": 9.333373729974404e-05, + "loss": 3.1874, + "step": 12064 + }, + { + "epoch": 0.7489602085790552, + "grad_norm": 0.27449771515619714, + "learning_rate": 9.333193549257772e-05, + "loss": 3.2054, + "step": 12065 + }, + { + "epoch": 0.7490222856788131, + "grad_norm": 0.25142703466518584, + "learning_rate": 9.333013345933816e-05, + "loss": 3.0235, + "step": 12066 + }, + { + "epoch": 0.7490843627785709, + "grad_norm": 0.23609728612971911, + "learning_rate": 9.332833120003479e-05, + "loss": 2.9666, + "step": 12067 + }, + { + "epoch": 0.7491464398783289, + "grad_norm": 0.2681235163987736, + "learning_rate": 9.3326528714677e-05, + "loss": 3.1701, + "step": 12068 + }, + { + "epoch": 0.7492085169780868, + "grad_norm": 0.22263899953841662, + "learning_rate": 9.332472600327418e-05, + "loss": 3.0832, + "step": 12069 + }, + { + "epoch": 0.7492705940778447, + "grad_norm": 0.2312282639031116, + "learning_rate": 9.332292306583576e-05, + "loss": 3.1284, + "step": 12070 + }, + { + "epoch": 0.7493326711776026, + "grad_norm": 0.24801857103477296, + "learning_rate": 9.332111990237114e-05, + "loss": 3.0644, + "step": 12071 + }, + { + "epoch": 0.7493947482773605, + "grad_norm": 0.3299726735346113, + "learning_rate": 9.33193165128897e-05, + "loss": 3.0758, + "step": 12072 + }, + { + "epoch": 0.7494568253771183, + "grad_norm": 0.25577419989021416, + "learning_rate": 9.331751289740088e-05, + "loss": 3.0069, + "step": 12073 + }, + { + "epoch": 0.7495189024768762, + "grad_norm": 0.25980477289047293, + "learning_rate": 9.331570905591406e-05, + "loss": 3.1118, + "step": 12074 + }, + { + "epoch": 0.7495809795766342, + "grad_norm": 0.2417824033680337, + "learning_rate": 9.33139049884387e-05, + "loss": 3.063, + "step": 12075 + }, + { + "epoch": 0.7496430566763921, + "grad_norm": 0.20323797898679824, + "learning_rate": 9.331210069498417e-05, + "loss": 3.0179, + "step": 12076 + }, + { + "epoch": 0.74970513377615, + "grad_norm": 0.21518384717698913, + "learning_rate": 9.331029617555991e-05, + "loss": 3.1491, + "step": 12077 + }, + { + "epoch": 0.7497672108759079, + "grad_norm": 0.21414681883200867, + "learning_rate": 9.33084914301753e-05, + "loss": 2.9536, + "step": 12078 + }, + { + "epoch": 0.7498292879756657, + "grad_norm": 0.2756699707686778, + "learning_rate": 9.330668645883978e-05, + "loss": 3.0339, + "step": 12079 + }, + { + "epoch": 0.7498913650754236, + "grad_norm": 0.19434161153973328, + "learning_rate": 9.330488126156279e-05, + "loss": 3.0435, + "step": 12080 + }, + { + "epoch": 0.7499534421751816, + "grad_norm": 0.23531572377705653, + "learning_rate": 9.330307583835369e-05, + "loss": 3.0872, + "step": 12081 + }, + { + "epoch": 0.7500155192749395, + "grad_norm": 0.18662338175586227, + "learning_rate": 9.330127018922194e-05, + "loss": 3.0457, + "step": 12082 + }, + { + "epoch": 0.7500775963746974, + "grad_norm": 0.22552483627049946, + "learning_rate": 9.329946431417695e-05, + "loss": 3.0119, + "step": 12083 + }, + { + "epoch": 0.7501396734744553, + "grad_norm": 0.18514977174201944, + "learning_rate": 9.329765821322814e-05, + "loss": 3.1141, + "step": 12084 + }, + { + "epoch": 0.7502017505742131, + "grad_norm": 0.5924533438521308, + "learning_rate": 9.329585188638494e-05, + "loss": 3.1456, + "step": 12085 + }, + { + "epoch": 0.750263827673971, + "grad_norm": 0.4002326281433287, + "learning_rate": 9.329404533365678e-05, + "loss": 3.0857, + "step": 12086 + }, + { + "epoch": 0.750325904773729, + "grad_norm": 0.3360708659498029, + "learning_rate": 9.329223855505307e-05, + "loss": 3.0454, + "step": 12087 + }, + { + "epoch": 0.7503879818734869, + "grad_norm": 0.22140467385408774, + "learning_rate": 9.329043155058324e-05, + "loss": 3.1325, + "step": 12088 + }, + { + "epoch": 0.7504500589732448, + "grad_norm": 0.30836675238355726, + "learning_rate": 9.328862432025673e-05, + "loss": 3.0979, + "step": 12089 + }, + { + "epoch": 0.7505121360730027, + "grad_norm": 0.20876399568650417, + "learning_rate": 9.328681686408294e-05, + "loss": 2.9378, + "step": 12090 + }, + { + "epoch": 0.7505742131727605, + "grad_norm": 0.22499841086833103, + "learning_rate": 9.328500918207132e-05, + "loss": 3.0215, + "step": 12091 + }, + { + "epoch": 0.7506362902725184, + "grad_norm": 0.22925073624111, + "learning_rate": 9.328320127423131e-05, + "loss": 3.0694, + "step": 12092 + }, + { + "epoch": 0.7506983673722764, + "grad_norm": 0.21183154701439666, + "learning_rate": 9.328139314057233e-05, + "loss": 3.1048, + "step": 12093 + }, + { + "epoch": 0.7507604444720343, + "grad_norm": 0.25474389627835675, + "learning_rate": 9.327958478110382e-05, + "loss": 3.0646, + "step": 12094 + }, + { + "epoch": 0.7508225215717922, + "grad_norm": 0.2170533218585228, + "learning_rate": 9.32777761958352e-05, + "loss": 3.0461, + "step": 12095 + }, + { + "epoch": 0.7508845986715501, + "grad_norm": 0.24387513793763482, + "learning_rate": 9.327596738477591e-05, + "loss": 2.9943, + "step": 12096 + }, + { + "epoch": 0.7509466757713079, + "grad_norm": 0.36786755022651674, + "learning_rate": 9.32741583479354e-05, + "loss": 3.1182, + "step": 12097 + }, + { + "epoch": 0.7510087528710658, + "grad_norm": 0.22768318948950847, + "learning_rate": 9.32723490853231e-05, + "loss": 3.0461, + "step": 12098 + }, + { + "epoch": 0.7510708299708238, + "grad_norm": 0.251312624590012, + "learning_rate": 9.327053959694846e-05, + "loss": 3.0249, + "step": 12099 + }, + { + "epoch": 0.7511329070705817, + "grad_norm": 0.21984642150546266, + "learning_rate": 9.326872988282089e-05, + "loss": 3.0362, + "step": 12100 + }, + { + "epoch": 0.7511949841703396, + "grad_norm": 0.20996335050393936, + "learning_rate": 9.326691994294987e-05, + "loss": 3.0251, + "step": 12101 + }, + { + "epoch": 0.7512570612700975, + "grad_norm": 0.2254768702418774, + "learning_rate": 9.326510977734482e-05, + "loss": 3.0534, + "step": 12102 + }, + { + "epoch": 0.7513191383698553, + "grad_norm": 0.23841210977551086, + "learning_rate": 9.32632993860152e-05, + "loss": 3.1025, + "step": 12103 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.3272506668012629, + "learning_rate": 9.326148876897043e-05, + "loss": 3.0554, + "step": 12104 + }, + { + "epoch": 0.7514432925693711, + "grad_norm": 0.2615467759103768, + "learning_rate": 9.325967792621997e-05, + "loss": 2.9746, + "step": 12105 + }, + { + "epoch": 0.7515053696691291, + "grad_norm": 0.23589993929459946, + "learning_rate": 9.325786685777327e-05, + "loss": 3.1777, + "step": 12106 + }, + { + "epoch": 0.751567446768887, + "grad_norm": 0.34891546636111814, + "learning_rate": 9.325605556363979e-05, + "loss": 3.0225, + "step": 12107 + }, + { + "epoch": 0.7516295238686449, + "grad_norm": 0.21864001029337166, + "learning_rate": 9.325424404382894e-05, + "loss": 3.1532, + "step": 12108 + }, + { + "epoch": 0.7516916009684027, + "grad_norm": 0.2515704775664847, + "learning_rate": 9.325243229835023e-05, + "loss": 3.0219, + "step": 12109 + }, + { + "epoch": 0.7517536780681606, + "grad_norm": 0.22558459161794553, + "learning_rate": 9.325062032721306e-05, + "loss": 3.1151, + "step": 12110 + }, + { + "epoch": 0.7518157551679185, + "grad_norm": 0.19016492974274873, + "learning_rate": 9.324880813042692e-05, + "loss": 3.0893, + "step": 12111 + }, + { + "epoch": 0.7518778322676765, + "grad_norm": 0.26169019514854786, + "learning_rate": 9.324699570800123e-05, + "loss": 3.1028, + "step": 12112 + }, + { + "epoch": 0.7519399093674344, + "grad_norm": 0.23772742838869382, + "learning_rate": 9.324518305994548e-05, + "loss": 3.1485, + "step": 12113 + }, + { + "epoch": 0.7520019864671923, + "grad_norm": 0.2944259262984466, + "learning_rate": 9.32433701862691e-05, + "loss": 3.0177, + "step": 12114 + }, + { + "epoch": 0.7520640635669501, + "grad_norm": 0.20858472418813517, + "learning_rate": 9.324155708698156e-05, + "loss": 3.0089, + "step": 12115 + }, + { + "epoch": 0.752126140666708, + "grad_norm": 0.2310793111390112, + "learning_rate": 9.323974376209234e-05, + "loss": 3.0953, + "step": 12116 + }, + { + "epoch": 0.7521882177664659, + "grad_norm": 0.21805176675107432, + "learning_rate": 9.323793021161087e-05, + "loss": 3.1106, + "step": 12117 + }, + { + "epoch": 0.7522502948662239, + "grad_norm": 0.22685546677942886, + "learning_rate": 9.323611643554662e-05, + "loss": 2.9388, + "step": 12118 + }, + { + "epoch": 0.7523123719659818, + "grad_norm": 0.21393301138096063, + "learning_rate": 9.323430243390905e-05, + "loss": 3.0049, + "step": 12119 + }, + { + "epoch": 0.7523744490657397, + "grad_norm": 0.24730637170931843, + "learning_rate": 9.323248820670762e-05, + "loss": 2.9726, + "step": 12120 + }, + { + "epoch": 0.7524365261654975, + "grad_norm": 0.2638273557787868, + "learning_rate": 9.323067375395183e-05, + "loss": 3.1262, + "step": 12121 + }, + { + "epoch": 0.7524986032652554, + "grad_norm": 0.20957516560619063, + "learning_rate": 9.322885907565109e-05, + "loss": 3.0801, + "step": 12122 + }, + { + "epoch": 0.7525606803650133, + "grad_norm": 0.21695946275838215, + "learning_rate": 9.322704417181492e-05, + "loss": 3.1184, + "step": 12123 + }, + { + "epoch": 0.7526227574647713, + "grad_norm": 0.2312224478030875, + "learning_rate": 9.322522904245277e-05, + "loss": 3.0394, + "step": 12124 + }, + { + "epoch": 0.7526848345645292, + "grad_norm": 0.244746425519759, + "learning_rate": 9.322341368757409e-05, + "loss": 3.048, + "step": 12125 + }, + { + "epoch": 0.7527469116642871, + "grad_norm": 0.21494240663079864, + "learning_rate": 9.322159810718838e-05, + "loss": 2.9971, + "step": 12126 + }, + { + "epoch": 0.7528089887640449, + "grad_norm": 0.21749017879630453, + "learning_rate": 9.321978230130509e-05, + "loss": 3.0323, + "step": 12127 + }, + { + "epoch": 0.7528710658638028, + "grad_norm": 0.23038590283759455, + "learning_rate": 9.321796626993371e-05, + "loss": 3.0423, + "step": 12128 + }, + { + "epoch": 0.7529331429635607, + "grad_norm": 0.19374835259752393, + "learning_rate": 9.32161500130837e-05, + "loss": 3.112, + "step": 12129 + }, + { + "epoch": 0.7529952200633186, + "grad_norm": 0.30766997111553174, + "learning_rate": 9.321433353076456e-05, + "loss": 3.1022, + "step": 12130 + }, + { + "epoch": 0.7530572971630766, + "grad_norm": 0.3286452256613714, + "learning_rate": 9.321251682298575e-05, + "loss": 3.0969, + "step": 12131 + }, + { + "epoch": 0.7531193742628345, + "grad_norm": 0.25268550264662204, + "learning_rate": 9.321069988975674e-05, + "loss": 3.0162, + "step": 12132 + }, + { + "epoch": 0.7531814513625923, + "grad_norm": 0.2733542996938683, + "learning_rate": 9.320888273108702e-05, + "loss": 3.0408, + "step": 12133 + }, + { + "epoch": 0.7532435284623502, + "grad_norm": 0.23901430311109906, + "learning_rate": 9.320706534698607e-05, + "loss": 3.0876, + "step": 12134 + }, + { + "epoch": 0.7533056055621081, + "grad_norm": 0.23824977666898428, + "learning_rate": 9.320524773746337e-05, + "loss": 3.2062, + "step": 12135 + }, + { + "epoch": 0.753367682661866, + "grad_norm": 0.22820558463719898, + "learning_rate": 9.320342990252842e-05, + "loss": 3.0492, + "step": 12136 + }, + { + "epoch": 0.753429759761624, + "grad_norm": 0.24977308644594642, + "learning_rate": 9.320161184219067e-05, + "loss": 3.007, + "step": 12137 + }, + { + "epoch": 0.7534918368613819, + "grad_norm": 0.2083811476595224, + "learning_rate": 9.319979355645963e-05, + "loss": 2.9992, + "step": 12138 + }, + { + "epoch": 0.7535539139611397, + "grad_norm": 0.22777320210211294, + "learning_rate": 9.319797504534478e-05, + "loss": 2.9717, + "step": 12139 + }, + { + "epoch": 0.7536159910608976, + "grad_norm": 0.2236158372858056, + "learning_rate": 9.31961563088556e-05, + "loss": 3.0525, + "step": 12140 + }, + { + "epoch": 0.7536780681606555, + "grad_norm": 0.2900281704999235, + "learning_rate": 9.319433734700159e-05, + "loss": 3.0261, + "step": 12141 + }, + { + "epoch": 0.7537401452604134, + "grad_norm": 0.28290356545084233, + "learning_rate": 9.319251815979224e-05, + "loss": 3.0692, + "step": 12142 + }, + { + "epoch": 0.7538022223601714, + "grad_norm": 0.2388481291609426, + "learning_rate": 9.319069874723703e-05, + "loss": 3.0557, + "step": 12143 + }, + { + "epoch": 0.7538642994599293, + "grad_norm": 0.380719144387453, + "learning_rate": 9.318887910934546e-05, + "loss": 3.0428, + "step": 12144 + }, + { + "epoch": 0.7539263765596871, + "grad_norm": 0.2389857009870862, + "learning_rate": 9.318705924612703e-05, + "loss": 3.0679, + "step": 12145 + }, + { + "epoch": 0.753988453659445, + "grad_norm": 0.21796225158684687, + "learning_rate": 9.318523915759123e-05, + "loss": 3.0388, + "step": 12146 + }, + { + "epoch": 0.7540505307592029, + "grad_norm": 0.20716372336238925, + "learning_rate": 9.318341884374755e-05, + "loss": 3.0103, + "step": 12147 + }, + { + "epoch": 0.7541126078589608, + "grad_norm": 0.26048812203775323, + "learning_rate": 9.318159830460549e-05, + "loss": 3.1066, + "step": 12148 + }, + { + "epoch": 0.7541746849587188, + "grad_norm": 0.23610246146165914, + "learning_rate": 9.317977754017455e-05, + "loss": 3.011, + "step": 12149 + }, + { + "epoch": 0.7542367620584767, + "grad_norm": 0.21238418150673333, + "learning_rate": 9.317795655046424e-05, + "loss": 3.1028, + "step": 12150 + }, + { + "epoch": 0.7542988391582345, + "grad_norm": 0.23720556316023508, + "learning_rate": 9.317613533548403e-05, + "loss": 3.0233, + "step": 12151 + }, + { + "epoch": 0.7543609162579924, + "grad_norm": 0.25387549580873564, + "learning_rate": 9.317431389524344e-05, + "loss": 3.1089, + "step": 12152 + }, + { + "epoch": 0.7544229933577503, + "grad_norm": 0.2193127104366029, + "learning_rate": 9.317249222975199e-05, + "loss": 3.07, + "step": 12153 + }, + { + "epoch": 0.7544850704575082, + "grad_norm": 0.19178888455778573, + "learning_rate": 9.317067033901916e-05, + "loss": 3.0738, + "step": 12154 + }, + { + "epoch": 0.7545471475572662, + "grad_norm": 0.19720109507310357, + "learning_rate": 9.316884822305446e-05, + "loss": 3.1616, + "step": 12155 + }, + { + "epoch": 0.7546092246570241, + "grad_norm": 0.19539609073954528, + "learning_rate": 9.316702588186741e-05, + "loss": 3.118, + "step": 12156 + }, + { + "epoch": 0.7546713017567819, + "grad_norm": 0.20853200420081472, + "learning_rate": 9.316520331546748e-05, + "loss": 3.1131, + "step": 12157 + }, + { + "epoch": 0.7547333788565398, + "grad_norm": 0.2146442701023684, + "learning_rate": 9.316338052386422e-05, + "loss": 3.0906, + "step": 12158 + }, + { + "epoch": 0.7547954559562977, + "grad_norm": 0.1866990605750812, + "learning_rate": 9.316155750706715e-05, + "loss": 3.0872, + "step": 12159 + }, + { + "epoch": 0.7548575330560556, + "grad_norm": 0.18133859236852257, + "learning_rate": 9.315973426508573e-05, + "loss": 3.1329, + "step": 12160 + }, + { + "epoch": 0.7549196101558135, + "grad_norm": 0.22622188087780915, + "learning_rate": 9.315791079792951e-05, + "loss": 3.0627, + "step": 12161 + }, + { + "epoch": 0.7549816872555715, + "grad_norm": 0.19085393744088036, + "learning_rate": 9.315608710560796e-05, + "loss": 3.0794, + "step": 12162 + }, + { + "epoch": 0.7550437643553293, + "grad_norm": 0.4684091600392211, + "learning_rate": 9.315426318813067e-05, + "loss": 3.0121, + "step": 12163 + }, + { + "epoch": 0.7551058414550872, + "grad_norm": 0.22228017635346273, + "learning_rate": 9.31524390455071e-05, + "loss": 3.1205, + "step": 12164 + }, + { + "epoch": 0.7551679185548451, + "grad_norm": 0.20018825983789965, + "learning_rate": 9.315061467774677e-05, + "loss": 3.0824, + "step": 12165 + }, + { + "epoch": 0.755229995654603, + "grad_norm": 0.18791487610641258, + "learning_rate": 9.314879008485921e-05, + "loss": 3.0122, + "step": 12166 + }, + { + "epoch": 0.7552920727543609, + "grad_norm": 0.19650754583408386, + "learning_rate": 9.314696526685396e-05, + "loss": 3.009, + "step": 12167 + }, + { + "epoch": 0.7553541498541189, + "grad_norm": 0.23809947443993815, + "learning_rate": 9.314514022374049e-05, + "loss": 2.9788, + "step": 12168 + }, + { + "epoch": 0.7554162269538767, + "grad_norm": 0.24744789010857854, + "learning_rate": 9.314331495552836e-05, + "loss": 3.0508, + "step": 12169 + }, + { + "epoch": 0.7554783040536346, + "grad_norm": 0.18851110902729268, + "learning_rate": 9.314148946222708e-05, + "loss": 2.9837, + "step": 12170 + }, + { + "epoch": 0.7555403811533925, + "grad_norm": 0.20478918520011627, + "learning_rate": 9.313966374384618e-05, + "loss": 3.013, + "step": 12171 + }, + { + "epoch": 0.7556024582531504, + "grad_norm": 0.16334258232662685, + "learning_rate": 9.313783780039518e-05, + "loss": 2.9856, + "step": 12172 + }, + { + "epoch": 0.7556645353529083, + "grad_norm": 0.2924853851465576, + "learning_rate": 9.313601163188362e-05, + "loss": 3.0797, + "step": 12173 + }, + { + "epoch": 0.7557266124526663, + "grad_norm": 0.22493446707052098, + "learning_rate": 9.313418523832099e-05, + "loss": 2.9516, + "step": 12174 + }, + { + "epoch": 0.7557886895524241, + "grad_norm": 0.1832975981755551, + "learning_rate": 9.313235861971685e-05, + "loss": 3.0672, + "step": 12175 + }, + { + "epoch": 0.755850766652182, + "grad_norm": 0.19020825741367, + "learning_rate": 9.313053177608076e-05, + "loss": 3.0343, + "step": 12176 + }, + { + "epoch": 0.7559128437519399, + "grad_norm": 0.20885956682721635, + "learning_rate": 9.312870470742217e-05, + "loss": 2.9666, + "step": 12177 + }, + { + "epoch": 0.7559749208516978, + "grad_norm": 0.20291614799381003, + "learning_rate": 9.312687741375068e-05, + "loss": 3.1066, + "step": 12178 + }, + { + "epoch": 0.7560369979514557, + "grad_norm": 0.22458082929912632, + "learning_rate": 9.31250498950758e-05, + "loss": 3.0879, + "step": 12179 + }, + { + "epoch": 0.7560990750512137, + "grad_norm": 0.2497556058075066, + "learning_rate": 9.312322215140705e-05, + "loss": 3.0861, + "step": 12180 + }, + { + "epoch": 0.7561611521509715, + "grad_norm": 0.25101317795416445, + "learning_rate": 9.3121394182754e-05, + "loss": 3.0794, + "step": 12181 + }, + { + "epoch": 0.7562232292507294, + "grad_norm": 0.23470595358364432, + "learning_rate": 9.311956598912617e-05, + "loss": 3.2053, + "step": 12182 + }, + { + "epoch": 0.7562853063504873, + "grad_norm": 0.21879764446746267, + "learning_rate": 9.311773757053309e-05, + "loss": 3.1035, + "step": 12183 + }, + { + "epoch": 0.7563473834502452, + "grad_norm": 0.33953453115044374, + "learning_rate": 9.31159089269843e-05, + "loss": 3.0647, + "step": 12184 + }, + { + "epoch": 0.7564094605500031, + "grad_norm": 0.2147106798920885, + "learning_rate": 9.311408005848936e-05, + "loss": 3.1186, + "step": 12185 + }, + { + "epoch": 0.756471537649761, + "grad_norm": 0.19922365855733398, + "learning_rate": 9.311225096505778e-05, + "loss": 3.1432, + "step": 12186 + }, + { + "epoch": 0.7565336147495189, + "grad_norm": 0.18185552579632844, + "learning_rate": 9.311042164669913e-05, + "loss": 3.0802, + "step": 12187 + }, + { + "epoch": 0.7565956918492768, + "grad_norm": 0.19533525334383994, + "learning_rate": 9.310859210342295e-05, + "loss": 3.0218, + "step": 12188 + }, + { + "epoch": 0.7566577689490347, + "grad_norm": 0.19281735214731233, + "learning_rate": 9.310676233523877e-05, + "loss": 3.0683, + "step": 12189 + }, + { + "epoch": 0.7567198460487926, + "grad_norm": 0.17047664908124632, + "learning_rate": 9.310493234215617e-05, + "loss": 2.9855, + "step": 12190 + }, + { + "epoch": 0.7567819231485505, + "grad_norm": 0.22763430358050632, + "learning_rate": 9.310310212418464e-05, + "loss": 3.0083, + "step": 12191 + }, + { + "epoch": 0.7568440002483084, + "grad_norm": 0.19181916404538485, + "learning_rate": 9.310127168133378e-05, + "loss": 3.0802, + "step": 12192 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.18192584706725598, + "learning_rate": 9.309944101361312e-05, + "loss": 3.1024, + "step": 12193 + }, + { + "epoch": 0.7569681544478242, + "grad_norm": 0.18583168456888577, + "learning_rate": 9.309761012103222e-05, + "loss": 3.0777, + "step": 12194 + }, + { + "epoch": 0.7570302315475821, + "grad_norm": 0.16872472088050675, + "learning_rate": 9.309577900360061e-05, + "loss": 3.0424, + "step": 12195 + }, + { + "epoch": 0.75709230864734, + "grad_norm": 0.28363188810293755, + "learning_rate": 9.309394766132788e-05, + "loss": 3.0865, + "step": 12196 + }, + { + "epoch": 0.7571543857470979, + "grad_norm": 0.20290769360544447, + "learning_rate": 9.309211609422355e-05, + "loss": 3.0677, + "step": 12197 + }, + { + "epoch": 0.7572164628468558, + "grad_norm": 0.2015125575033645, + "learning_rate": 9.309028430229718e-05, + "loss": 3.0609, + "step": 12198 + }, + { + "epoch": 0.7572785399466136, + "grad_norm": 0.22372943340113569, + "learning_rate": 9.308845228555836e-05, + "loss": 3.0572, + "step": 12199 + }, + { + "epoch": 0.7573406170463716, + "grad_norm": 0.22799133408541003, + "learning_rate": 9.30866200440166e-05, + "loss": 3.1091, + "step": 12200 + }, + { + "epoch": 0.7574026941461295, + "grad_norm": 0.2687608378465083, + "learning_rate": 9.30847875776815e-05, + "loss": 3.0862, + "step": 12201 + }, + { + "epoch": 0.7574647712458874, + "grad_norm": 0.1878628894693235, + "learning_rate": 9.30829548865626e-05, + "loss": 3.1348, + "step": 12202 + }, + { + "epoch": 0.7575268483456453, + "grad_norm": 0.2793731451933049, + "learning_rate": 9.308112197066947e-05, + "loss": 3.0708, + "step": 12203 + }, + { + "epoch": 0.7575889254454032, + "grad_norm": 0.1970383118128544, + "learning_rate": 9.307928883001167e-05, + "loss": 3.0982, + "step": 12204 + }, + { + "epoch": 0.757651002545161, + "grad_norm": 0.22074801064531283, + "learning_rate": 9.307745546459875e-05, + "loss": 3.063, + "step": 12205 + }, + { + "epoch": 0.757713079644919, + "grad_norm": 0.1649588666565051, + "learning_rate": 9.30756218744403e-05, + "loss": 3.1079, + "step": 12206 + }, + { + "epoch": 0.7577751567446769, + "grad_norm": 0.21456388972593785, + "learning_rate": 9.307378805954585e-05, + "loss": 3.0181, + "step": 12207 + }, + { + "epoch": 0.7578372338444348, + "grad_norm": 0.2933702261145843, + "learning_rate": 9.307195401992501e-05, + "loss": 3.1018, + "step": 12208 + }, + { + "epoch": 0.7578993109441927, + "grad_norm": 0.20241954149898356, + "learning_rate": 9.307011975558734e-05, + "loss": 3.0601, + "step": 12209 + }, + { + "epoch": 0.7579613880439506, + "grad_norm": 0.21408354906327676, + "learning_rate": 9.306828526654239e-05, + "loss": 3.0757, + "step": 12210 + }, + { + "epoch": 0.7580234651437084, + "grad_norm": 0.18972851705693008, + "learning_rate": 9.306645055279973e-05, + "loss": 3.0426, + "step": 12211 + }, + { + "epoch": 0.7580855422434664, + "grad_norm": 0.15716839537735078, + "learning_rate": 9.306461561436896e-05, + "loss": 3.0078, + "step": 12212 + }, + { + "epoch": 0.7581476193432243, + "grad_norm": 0.2020610307073108, + "learning_rate": 9.306278045125962e-05, + "loss": 3.0266, + "step": 12213 + }, + { + "epoch": 0.7582096964429822, + "grad_norm": 0.19636848133602636, + "learning_rate": 9.306094506348131e-05, + "loss": 3.1111, + "step": 12214 + }, + { + "epoch": 0.7582717735427401, + "grad_norm": 0.3282057571941213, + "learning_rate": 9.30591094510436e-05, + "loss": 3.1251, + "step": 12215 + }, + { + "epoch": 0.758333850642498, + "grad_norm": 0.232208298076415, + "learning_rate": 9.305727361395605e-05, + "loss": 3.066, + "step": 12216 + }, + { + "epoch": 0.7583959277422558, + "grad_norm": 0.23316404312294758, + "learning_rate": 9.305543755222825e-05, + "loss": 3.1198, + "step": 12217 + }, + { + "epoch": 0.7584580048420138, + "grad_norm": 0.17624219222732282, + "learning_rate": 9.30536012658698e-05, + "loss": 3.0807, + "step": 12218 + }, + { + "epoch": 0.7585200819417717, + "grad_norm": 0.230990151415746, + "learning_rate": 9.305176475489025e-05, + "loss": 3.1852, + "step": 12219 + }, + { + "epoch": 0.7585821590415296, + "grad_norm": 0.17620250241135021, + "learning_rate": 9.30499280192992e-05, + "loss": 3.0435, + "step": 12220 + }, + { + "epoch": 0.7586442361412875, + "grad_norm": 0.17923250850623143, + "learning_rate": 9.304809105910621e-05, + "loss": 3.0549, + "step": 12221 + }, + { + "epoch": 0.7587063132410454, + "grad_norm": 0.19857087158852027, + "learning_rate": 9.304625387432089e-05, + "loss": 2.9907, + "step": 12222 + }, + { + "epoch": 0.7587683903408032, + "grad_norm": 0.18992879477557825, + "learning_rate": 9.30444164649528e-05, + "loss": 3.0704, + "step": 12223 + }, + { + "epoch": 0.7588304674405612, + "grad_norm": 0.1784530822243492, + "learning_rate": 9.304257883101155e-05, + "loss": 3.0752, + "step": 12224 + }, + { + "epoch": 0.7588925445403191, + "grad_norm": 0.17340134080491654, + "learning_rate": 9.304074097250672e-05, + "loss": 3.0566, + "step": 12225 + }, + { + "epoch": 0.758954621640077, + "grad_norm": 0.1762789990532115, + "learning_rate": 9.303890288944787e-05, + "loss": 3.1023, + "step": 12226 + }, + { + "epoch": 0.7590166987398349, + "grad_norm": 0.1732880850872241, + "learning_rate": 9.303706458184464e-05, + "loss": 3.0405, + "step": 12227 + }, + { + "epoch": 0.7590787758395928, + "grad_norm": 0.19682678661662092, + "learning_rate": 9.30352260497066e-05, + "loss": 3.0758, + "step": 12228 + }, + { + "epoch": 0.7591408529393506, + "grad_norm": 0.19326236487652146, + "learning_rate": 9.303338729304334e-05, + "loss": 3.0614, + "step": 12229 + }, + { + "epoch": 0.7592029300391085, + "grad_norm": 0.17603451709129736, + "learning_rate": 9.303154831186444e-05, + "loss": 3.0323, + "step": 12230 + }, + { + "epoch": 0.7592650071388665, + "grad_norm": 0.19653184783657796, + "learning_rate": 9.30297091061795e-05, + "loss": 3.1292, + "step": 12231 + }, + { + "epoch": 0.7593270842386244, + "grad_norm": 0.215884729843673, + "learning_rate": 9.302786967599813e-05, + "loss": 2.9937, + "step": 12232 + }, + { + "epoch": 0.7593891613383823, + "grad_norm": 0.24692688046741876, + "learning_rate": 9.302603002132992e-05, + "loss": 3.1222, + "step": 12233 + }, + { + "epoch": 0.7594512384381402, + "grad_norm": 0.2485930744428105, + "learning_rate": 9.302419014218448e-05, + "loss": 3.0184, + "step": 12234 + }, + { + "epoch": 0.759513315537898, + "grad_norm": 0.20494296751788216, + "learning_rate": 9.302235003857138e-05, + "loss": 3.0165, + "step": 12235 + }, + { + "epoch": 0.7595753926376559, + "grad_norm": 0.20232427937859965, + "learning_rate": 9.302050971050023e-05, + "loss": 3.1463, + "step": 12236 + }, + { + "epoch": 0.7596374697374139, + "grad_norm": 0.20919019599683988, + "learning_rate": 9.301866915798065e-05, + "loss": 3.1357, + "step": 12237 + }, + { + "epoch": 0.7596995468371718, + "grad_norm": 0.2051163864773925, + "learning_rate": 9.301682838102223e-05, + "loss": 2.9856, + "step": 12238 + }, + { + "epoch": 0.7597616239369297, + "grad_norm": 0.20769010826077897, + "learning_rate": 9.301498737963457e-05, + "loss": 3.1234, + "step": 12239 + }, + { + "epoch": 0.7598237010366876, + "grad_norm": 0.19130557195624362, + "learning_rate": 9.301314615382727e-05, + "loss": 2.9899, + "step": 12240 + }, + { + "epoch": 0.7598857781364454, + "grad_norm": 0.2029973601072115, + "learning_rate": 9.301130470360995e-05, + "loss": 3.0056, + "step": 12241 + }, + { + "epoch": 0.7599478552362033, + "grad_norm": 0.18891365876037736, + "learning_rate": 9.300946302899221e-05, + "loss": 3.0609, + "step": 12242 + }, + { + "epoch": 0.7600099323359613, + "grad_norm": 0.18960609356178715, + "learning_rate": 9.300762112998367e-05, + "loss": 3.0644, + "step": 12243 + }, + { + "epoch": 0.7600720094357192, + "grad_norm": 0.17949816846378008, + "learning_rate": 9.300577900659393e-05, + "loss": 2.9815, + "step": 12244 + }, + { + "epoch": 0.7601340865354771, + "grad_norm": 0.20002544919163667, + "learning_rate": 9.300393665883258e-05, + "loss": 2.9614, + "step": 12245 + }, + { + "epoch": 0.760196163635235, + "grad_norm": 0.4610356371876463, + "learning_rate": 9.300209408670928e-05, + "loss": 3.0148, + "step": 12246 + }, + { + "epoch": 0.7602582407349928, + "grad_norm": 0.27206008574249074, + "learning_rate": 9.30002512902336e-05, + "loss": 3.0565, + "step": 12247 + }, + { + "epoch": 0.7603203178347507, + "grad_norm": 0.2818057741346891, + "learning_rate": 9.299840826941518e-05, + "loss": 3.071, + "step": 12248 + }, + { + "epoch": 0.7603823949345087, + "grad_norm": 0.24942212073109626, + "learning_rate": 9.299656502426361e-05, + "loss": 3.0211, + "step": 12249 + }, + { + "epoch": 0.7604444720342666, + "grad_norm": 0.224406756356001, + "learning_rate": 9.299472155478855e-05, + "loss": 3.0664, + "step": 12250 + }, + { + "epoch": 0.7605065491340245, + "grad_norm": 0.20321742119340752, + "learning_rate": 9.299287786099957e-05, + "loss": 3.0835, + "step": 12251 + }, + { + "epoch": 0.7605686262337824, + "grad_norm": 0.19661901271117038, + "learning_rate": 9.299103394290631e-05, + "loss": 3.081, + "step": 12252 + }, + { + "epoch": 0.7606307033335402, + "grad_norm": 0.23349082586647232, + "learning_rate": 9.298918980051839e-05, + "loss": 3.1204, + "step": 12253 + }, + { + "epoch": 0.7606927804332981, + "grad_norm": 0.1942833968404867, + "learning_rate": 9.298734543384544e-05, + "loss": 3.047, + "step": 12254 + }, + { + "epoch": 0.760754857533056, + "grad_norm": 0.24952307300152238, + "learning_rate": 9.298550084289705e-05, + "loss": 3.1262, + "step": 12255 + }, + { + "epoch": 0.760816934632814, + "grad_norm": 0.2050490313283378, + "learning_rate": 9.298365602768289e-05, + "loss": 3.098, + "step": 12256 + }, + { + "epoch": 0.7608790117325719, + "grad_norm": 0.2204167187161255, + "learning_rate": 9.298181098821256e-05, + "loss": 3.1183, + "step": 12257 + }, + { + "epoch": 0.7609410888323298, + "grad_norm": 0.3556716884610369, + "learning_rate": 9.297996572449567e-05, + "loss": 2.9892, + "step": 12258 + }, + { + "epoch": 0.7610031659320876, + "grad_norm": 0.2189163273895354, + "learning_rate": 9.297812023654189e-05, + "loss": 3.0672, + "step": 12259 + }, + { + "epoch": 0.7610652430318455, + "grad_norm": 0.34406849887964364, + "learning_rate": 9.297627452436082e-05, + "loss": 3.0898, + "step": 12260 + }, + { + "epoch": 0.7611273201316034, + "grad_norm": 0.30395929417884265, + "learning_rate": 9.297442858796208e-05, + "loss": 3.085, + "step": 12261 + }, + { + "epoch": 0.7611893972313614, + "grad_norm": 0.2548202788714362, + "learning_rate": 9.297258242735532e-05, + "loss": 3.0293, + "step": 12262 + }, + { + "epoch": 0.7612514743311193, + "grad_norm": 0.21192061066948223, + "learning_rate": 9.297073604255015e-05, + "loss": 3.1003, + "step": 12263 + }, + { + "epoch": 0.7613135514308772, + "grad_norm": 0.2553451991765495, + "learning_rate": 9.296888943355623e-05, + "loss": 3.0327, + "step": 12264 + }, + { + "epoch": 0.761375628530635, + "grad_norm": 0.22622523197257452, + "learning_rate": 9.296704260038319e-05, + "loss": 3.1031, + "step": 12265 + }, + { + "epoch": 0.7614377056303929, + "grad_norm": 0.21613345149014881, + "learning_rate": 9.296519554304066e-05, + "loss": 3.0052, + "step": 12266 + }, + { + "epoch": 0.7614997827301508, + "grad_norm": 0.2231791279557059, + "learning_rate": 9.296334826153826e-05, + "loss": 2.9669, + "step": 12267 + }, + { + "epoch": 0.7615618598299088, + "grad_norm": 0.2338589470607675, + "learning_rate": 9.296150075588566e-05, + "loss": 3.1202, + "step": 12268 + }, + { + "epoch": 0.7616239369296667, + "grad_norm": 0.22856193464489333, + "learning_rate": 9.295965302609246e-05, + "loss": 3.0305, + "step": 12269 + }, + { + "epoch": 0.7616860140294246, + "grad_norm": 0.2494509150702567, + "learning_rate": 9.295780507216834e-05, + "loss": 2.9682, + "step": 12270 + }, + { + "epoch": 0.7617480911291824, + "grad_norm": 0.2212952376444271, + "learning_rate": 9.29559568941229e-05, + "loss": 3.0247, + "step": 12271 + }, + { + "epoch": 0.7618101682289403, + "grad_norm": 0.2639107081667411, + "learning_rate": 9.295410849196584e-05, + "loss": 3.024, + "step": 12272 + }, + { + "epoch": 0.7618722453286982, + "grad_norm": 0.22722795198463946, + "learning_rate": 9.295225986570675e-05, + "loss": 3.1053, + "step": 12273 + }, + { + "epoch": 0.7619343224284562, + "grad_norm": 0.22013304517028126, + "learning_rate": 9.295041101535529e-05, + "loss": 3.1475, + "step": 12274 + }, + { + "epoch": 0.7619963995282141, + "grad_norm": 0.1870352628145149, + "learning_rate": 9.294856194092111e-05, + "loss": 3.0805, + "step": 12275 + }, + { + "epoch": 0.762058476627972, + "grad_norm": 0.19553305570607854, + "learning_rate": 9.294671264241386e-05, + "loss": 3.0256, + "step": 12276 + }, + { + "epoch": 0.7621205537277298, + "grad_norm": 0.23499113794113513, + "learning_rate": 9.294486311984317e-05, + "loss": 2.9723, + "step": 12277 + }, + { + "epoch": 0.7621826308274877, + "grad_norm": 0.20962312713678094, + "learning_rate": 9.294301337321872e-05, + "loss": 3.0612, + "step": 12278 + }, + { + "epoch": 0.7622447079272456, + "grad_norm": 0.19331212863337616, + "learning_rate": 9.294116340255015e-05, + "loss": 3.0991, + "step": 12279 + }, + { + "epoch": 0.7623067850270036, + "grad_norm": 0.22296074074285674, + "learning_rate": 9.293931320784708e-05, + "loss": 3.0667, + "step": 12280 + }, + { + "epoch": 0.7623688621267615, + "grad_norm": 0.21278415432338832, + "learning_rate": 9.293746278911921e-05, + "loss": 3.0261, + "step": 12281 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.23592628648143826, + "learning_rate": 9.293561214637617e-05, + "loss": 3.1263, + "step": 12282 + }, + { + "epoch": 0.7624930163262772, + "grad_norm": 0.26385178256297465, + "learning_rate": 9.293376127962762e-05, + "loss": 3.0595, + "step": 12283 + }, + { + "epoch": 0.7625550934260351, + "grad_norm": 0.24524374654411116, + "learning_rate": 9.293191018888322e-05, + "loss": 3.0206, + "step": 12284 + }, + { + "epoch": 0.762617170525793, + "grad_norm": 0.19397816720678482, + "learning_rate": 9.29300588741526e-05, + "loss": 3.0645, + "step": 12285 + }, + { + "epoch": 0.762679247625551, + "grad_norm": 0.20222165905506656, + "learning_rate": 9.292820733544546e-05, + "loss": 3.0538, + "step": 12286 + }, + { + "epoch": 0.7627413247253089, + "grad_norm": 0.23157745323343676, + "learning_rate": 9.292635557277143e-05, + "loss": 3.0885, + "step": 12287 + }, + { + "epoch": 0.7628034018250668, + "grad_norm": 0.19688889439311688, + "learning_rate": 9.29245035861402e-05, + "loss": 3.0847, + "step": 12288 + }, + { + "epoch": 0.7628654789248246, + "grad_norm": 0.20160565320946108, + "learning_rate": 9.29226513755614e-05, + "loss": 3.0644, + "step": 12289 + }, + { + "epoch": 0.7629275560245825, + "grad_norm": 0.22982394000169662, + "learning_rate": 9.292079894104471e-05, + "loss": 3.1856, + "step": 12290 + }, + { + "epoch": 0.7629896331243404, + "grad_norm": 0.2541410387647364, + "learning_rate": 9.29189462825998e-05, + "loss": 3.0665, + "step": 12291 + }, + { + "epoch": 0.7630517102240983, + "grad_norm": 0.24445210233226342, + "learning_rate": 9.291709340023632e-05, + "loss": 3.138, + "step": 12292 + }, + { + "epoch": 0.7631137873238563, + "grad_norm": 0.2618647495530602, + "learning_rate": 9.291524029396394e-05, + "loss": 3.0556, + "step": 12293 + }, + { + "epoch": 0.7631758644236142, + "grad_norm": 0.18058881481336553, + "learning_rate": 9.291338696379235e-05, + "loss": 3.0813, + "step": 12294 + }, + { + "epoch": 0.763237941523372, + "grad_norm": 0.2052892778638423, + "learning_rate": 9.29115334097312e-05, + "loss": 3.1053, + "step": 12295 + }, + { + "epoch": 0.7633000186231299, + "grad_norm": 0.16414283307027866, + "learning_rate": 9.290967963179015e-05, + "loss": 2.9881, + "step": 12296 + }, + { + "epoch": 0.7633620957228878, + "grad_norm": 0.27733300416572304, + "learning_rate": 9.290782562997888e-05, + "loss": 3.1357, + "step": 12297 + }, + { + "epoch": 0.7634241728226457, + "grad_norm": 0.39281145440131654, + "learning_rate": 9.290597140430708e-05, + "loss": 3.205, + "step": 12298 + }, + { + "epoch": 0.7634862499224037, + "grad_norm": 0.2780287436747169, + "learning_rate": 9.290411695478442e-05, + "loss": 3.0083, + "step": 12299 + }, + { + "epoch": 0.7635483270221616, + "grad_norm": 0.2356777621447375, + "learning_rate": 9.290226228142057e-05, + "loss": 3.0084, + "step": 12300 + }, + { + "epoch": 0.7636104041219194, + "grad_norm": 0.2191298371649909, + "learning_rate": 9.290040738422517e-05, + "loss": 3.03, + "step": 12301 + }, + { + "epoch": 0.7636724812216773, + "grad_norm": 0.1944948342337243, + "learning_rate": 9.289855226320796e-05, + "loss": 3.0369, + "step": 12302 + }, + { + "epoch": 0.7637345583214352, + "grad_norm": 0.21586579926602742, + "learning_rate": 9.289669691837857e-05, + "loss": 3.0962, + "step": 12303 + }, + { + "epoch": 0.7637966354211931, + "grad_norm": 0.22146794629936856, + "learning_rate": 9.28948413497467e-05, + "loss": 3.1195, + "step": 12304 + }, + { + "epoch": 0.763858712520951, + "grad_norm": 0.1967239970173284, + "learning_rate": 9.289298555732204e-05, + "loss": 3.0566, + "step": 12305 + }, + { + "epoch": 0.763920789620709, + "grad_norm": 0.19172451262452528, + "learning_rate": 9.289112954111426e-05, + "loss": 3.0452, + "step": 12306 + }, + { + "epoch": 0.7639828667204668, + "grad_norm": 0.1909439674100006, + "learning_rate": 9.288927330113304e-05, + "loss": 3.0571, + "step": 12307 + }, + { + "epoch": 0.7640449438202247, + "grad_norm": 0.17808728237025007, + "learning_rate": 9.288741683738809e-05, + "loss": 3.0074, + "step": 12308 + }, + { + "epoch": 0.7641070209199826, + "grad_norm": 0.18033535928133174, + "learning_rate": 9.288556014988905e-05, + "loss": 3.0959, + "step": 12309 + }, + { + "epoch": 0.7641690980197405, + "grad_norm": 0.24239541173952098, + "learning_rate": 9.288370323864563e-05, + "loss": 3.1386, + "step": 12310 + }, + { + "epoch": 0.7642311751194985, + "grad_norm": 0.20283644611253882, + "learning_rate": 9.288184610366755e-05, + "loss": 3.1078, + "step": 12311 + }, + { + "epoch": 0.7642932522192564, + "grad_norm": 0.18191682673789597, + "learning_rate": 9.287998874496445e-05, + "loss": 3.1043, + "step": 12312 + }, + { + "epoch": 0.7643553293190142, + "grad_norm": 0.17126172825233119, + "learning_rate": 9.287813116254602e-05, + "loss": 3.0611, + "step": 12313 + }, + { + "epoch": 0.7644174064187721, + "grad_norm": 0.19553163632901438, + "learning_rate": 9.2876273356422e-05, + "loss": 3.0292, + "step": 12314 + }, + { + "epoch": 0.76447948351853, + "grad_norm": 0.18672409049979782, + "learning_rate": 9.287441532660204e-05, + "loss": 3.0554, + "step": 12315 + }, + { + "epoch": 0.7645415606182879, + "grad_norm": 0.2578205338584611, + "learning_rate": 9.287255707309585e-05, + "loss": 3.0121, + "step": 12316 + }, + { + "epoch": 0.7646036377180458, + "grad_norm": 0.2624617321639955, + "learning_rate": 9.28706985959131e-05, + "loss": 3.0964, + "step": 12317 + }, + { + "epoch": 0.7646657148178038, + "grad_norm": 0.22621104505717862, + "learning_rate": 9.286883989506354e-05, + "loss": 3.0759, + "step": 12318 + }, + { + "epoch": 0.7647277919175616, + "grad_norm": 0.20333755815786203, + "learning_rate": 9.286698097055684e-05, + "loss": 3.0797, + "step": 12319 + }, + { + "epoch": 0.7647898690173195, + "grad_norm": 0.2025770812002199, + "learning_rate": 9.286512182240268e-05, + "loss": 3.1025, + "step": 12320 + }, + { + "epoch": 0.7648519461170774, + "grad_norm": 0.1622349356224898, + "learning_rate": 9.286326245061077e-05, + "loss": 3.1093, + "step": 12321 + }, + { + "epoch": 0.7649140232168353, + "grad_norm": 0.20612470512935813, + "learning_rate": 9.28614028551908e-05, + "loss": 3.1451, + "step": 12322 + }, + { + "epoch": 0.7649761003165932, + "grad_norm": 0.3117730745863128, + "learning_rate": 9.285954303615252e-05, + "loss": 3.1115, + "step": 12323 + }, + { + "epoch": 0.7650381774163512, + "grad_norm": 0.23558835032952194, + "learning_rate": 9.285768299350557e-05, + "loss": 3.0427, + "step": 12324 + }, + { + "epoch": 0.765100254516109, + "grad_norm": 0.19950540339351616, + "learning_rate": 9.28558227272597e-05, + "loss": 3.039, + "step": 12325 + }, + { + "epoch": 0.7651623316158669, + "grad_norm": 0.18346631923618845, + "learning_rate": 9.28539622374246e-05, + "loss": 3.0664, + "step": 12326 + }, + { + "epoch": 0.7652244087156248, + "grad_norm": 0.18828758094558945, + "learning_rate": 9.285210152400995e-05, + "loss": 3.0216, + "step": 12327 + }, + { + "epoch": 0.7652864858153827, + "grad_norm": 0.27082303681730346, + "learning_rate": 9.285024058702553e-05, + "loss": 3.0043, + "step": 12328 + }, + { + "epoch": 0.7653485629151406, + "grad_norm": 0.19355228703585656, + "learning_rate": 9.284837942648097e-05, + "loss": 3.105, + "step": 12329 + }, + { + "epoch": 0.7654106400148986, + "grad_norm": 0.1727056911501845, + "learning_rate": 9.284651804238602e-05, + "loss": 3.0759, + "step": 12330 + }, + { + "epoch": 0.7654727171146564, + "grad_norm": 0.17881700880332174, + "learning_rate": 9.284465643475038e-05, + "loss": 3.0681, + "step": 12331 + }, + { + "epoch": 0.7655347942144143, + "grad_norm": 0.16506918231664414, + "learning_rate": 9.284279460358378e-05, + "loss": 3.0755, + "step": 12332 + }, + { + "epoch": 0.7655968713141722, + "grad_norm": 0.18336531448659124, + "learning_rate": 9.28409325488959e-05, + "loss": 3.0795, + "step": 12333 + }, + { + "epoch": 0.7656589484139301, + "grad_norm": 0.29759824643217686, + "learning_rate": 9.28390702706965e-05, + "loss": 3.0444, + "step": 12334 + }, + { + "epoch": 0.765721025513688, + "grad_norm": 0.2223903842201878, + "learning_rate": 9.283720776899525e-05, + "loss": 3.1045, + "step": 12335 + }, + { + "epoch": 0.765783102613446, + "grad_norm": 0.1754408507446711, + "learning_rate": 9.28353450438019e-05, + "loss": 3.0447, + "step": 12336 + }, + { + "epoch": 0.7658451797132038, + "grad_norm": 0.26402750404094444, + "learning_rate": 9.283348209512615e-05, + "loss": 3.1154, + "step": 12337 + }, + { + "epoch": 0.7659072568129617, + "grad_norm": 0.18028006081693929, + "learning_rate": 9.283161892297771e-05, + "loss": 3.0369, + "step": 12338 + }, + { + "epoch": 0.7659693339127196, + "grad_norm": 0.23987950887860446, + "learning_rate": 9.282975552736635e-05, + "loss": 3.0482, + "step": 12339 + }, + { + "epoch": 0.7660314110124775, + "grad_norm": 0.17436322350198422, + "learning_rate": 9.282789190830173e-05, + "loss": 2.9775, + "step": 12340 + }, + { + "epoch": 0.7660934881122354, + "grad_norm": 0.16930020273174512, + "learning_rate": 9.282602806579361e-05, + "loss": 2.9802, + "step": 12341 + }, + { + "epoch": 0.7661555652119934, + "grad_norm": 0.197578569082151, + "learning_rate": 9.282416399985172e-05, + "loss": 3.0181, + "step": 12342 + }, + { + "epoch": 0.7662176423117512, + "grad_norm": 0.24382086604007017, + "learning_rate": 9.282229971048575e-05, + "loss": 3.0391, + "step": 12343 + }, + { + "epoch": 0.7662797194115091, + "grad_norm": 0.19710945330484825, + "learning_rate": 9.282043519770545e-05, + "loss": 3.0612, + "step": 12344 + }, + { + "epoch": 0.766341796511267, + "grad_norm": 0.2882764671156353, + "learning_rate": 9.281857046152056e-05, + "loss": 3.0402, + "step": 12345 + }, + { + "epoch": 0.7664038736110249, + "grad_norm": 0.22480929226637614, + "learning_rate": 9.281670550194078e-05, + "loss": 3.1401, + "step": 12346 + }, + { + "epoch": 0.7664659507107828, + "grad_norm": 0.22740991553754109, + "learning_rate": 9.281484031897584e-05, + "loss": 3.0387, + "step": 12347 + }, + { + "epoch": 0.7665280278105407, + "grad_norm": 0.22584212812287627, + "learning_rate": 9.281297491263552e-05, + "loss": 3.0676, + "step": 12348 + }, + { + "epoch": 0.7665901049102986, + "grad_norm": 0.24920763421456216, + "learning_rate": 9.281110928292948e-05, + "loss": 3.1552, + "step": 12349 + }, + { + "epoch": 0.7666521820100565, + "grad_norm": 0.40317124389840314, + "learning_rate": 9.280924342986752e-05, + "loss": 2.9968, + "step": 12350 + }, + { + "epoch": 0.7667142591098144, + "grad_norm": 0.20687171799456364, + "learning_rate": 9.280737735345933e-05, + "loss": 3.0132, + "step": 12351 + }, + { + "epoch": 0.7667763362095723, + "grad_norm": 0.24915481542520923, + "learning_rate": 9.280551105371466e-05, + "loss": 3.2547, + "step": 12352 + }, + { + "epoch": 0.7668384133093302, + "grad_norm": 0.22345662568672958, + "learning_rate": 9.280364453064325e-05, + "loss": 3.0662, + "step": 12353 + }, + { + "epoch": 0.7669004904090881, + "grad_norm": 0.25924826133829226, + "learning_rate": 9.280177778425484e-05, + "loss": 3.1267, + "step": 12354 + }, + { + "epoch": 0.766962567508846, + "grad_norm": 0.18759151053501869, + "learning_rate": 9.279991081455917e-05, + "loss": 3.0135, + "step": 12355 + }, + { + "epoch": 0.7670246446086039, + "grad_norm": 0.25259652896359086, + "learning_rate": 9.279804362156596e-05, + "loss": 3.0473, + "step": 12356 + }, + { + "epoch": 0.7670867217083618, + "grad_norm": 0.17158471866906524, + "learning_rate": 9.279617620528497e-05, + "loss": 3.089, + "step": 12357 + }, + { + "epoch": 0.7671487988081197, + "grad_norm": 0.1842806121908286, + "learning_rate": 9.279430856572594e-05, + "loss": 3.0582, + "step": 12358 + }, + { + "epoch": 0.7672108759078776, + "grad_norm": 0.22015258179553615, + "learning_rate": 9.279244070289861e-05, + "loss": 3.0705, + "step": 12359 + }, + { + "epoch": 0.7672729530076355, + "grad_norm": 0.17617919699430726, + "learning_rate": 9.279057261681274e-05, + "loss": 3.0442, + "step": 12360 + }, + { + "epoch": 0.7673350301073933, + "grad_norm": 0.16675530748771145, + "learning_rate": 9.278870430747805e-05, + "loss": 2.9461, + "step": 12361 + }, + { + "epoch": 0.7673971072071513, + "grad_norm": 0.18640950103380502, + "learning_rate": 9.278683577490432e-05, + "loss": 2.9651, + "step": 12362 + }, + { + "epoch": 0.7674591843069092, + "grad_norm": 0.22521086057667672, + "learning_rate": 9.278496701910127e-05, + "loss": 3.1202, + "step": 12363 + }, + { + "epoch": 0.7675212614066671, + "grad_norm": 0.23499155011609485, + "learning_rate": 9.278309804007867e-05, + "loss": 3.1569, + "step": 12364 + }, + { + "epoch": 0.767583338506425, + "grad_norm": 0.25482099521539775, + "learning_rate": 9.278122883784626e-05, + "loss": 3.2479, + "step": 12365 + }, + { + "epoch": 0.7676454156061828, + "grad_norm": 0.276998170275102, + "learning_rate": 9.277935941241379e-05, + "loss": 3.0169, + "step": 12366 + }, + { + "epoch": 0.7677074927059407, + "grad_norm": 0.1980597591535517, + "learning_rate": 9.2777489763791e-05, + "loss": 2.9348, + "step": 12367 + }, + { + "epoch": 0.7677695698056987, + "grad_norm": 0.24077581627251604, + "learning_rate": 9.277561989198768e-05, + "loss": 3.1236, + "step": 12368 + }, + { + "epoch": 0.7678316469054566, + "grad_norm": 0.32055402581803755, + "learning_rate": 9.277374979701357e-05, + "loss": 3.0881, + "step": 12369 + }, + { + "epoch": 0.7678937240052145, + "grad_norm": 0.19487083087790014, + "learning_rate": 9.277187947887839e-05, + "loss": 3.0668, + "step": 12370 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.24744170248881217, + "learning_rate": 9.277000893759198e-05, + "loss": 3.098, + "step": 12371 + }, + { + "epoch": 0.7680178782047302, + "grad_norm": 0.2801699829565279, + "learning_rate": 9.276813817316404e-05, + "loss": 3.0458, + "step": 12372 + }, + { + "epoch": 0.7680799553044881, + "grad_norm": 0.20306514811880505, + "learning_rate": 9.276626718560432e-05, + "loss": 3.0971, + "step": 12373 + }, + { + "epoch": 0.7681420324042461, + "grad_norm": 0.2893319549430661, + "learning_rate": 9.276439597492262e-05, + "loss": 3.1281, + "step": 12374 + }, + { + "epoch": 0.768204109504004, + "grad_norm": 0.23853387104994747, + "learning_rate": 9.276252454112866e-05, + "loss": 3.0065, + "step": 12375 + }, + { + "epoch": 0.7682661866037619, + "grad_norm": 0.26226490596778357, + "learning_rate": 9.276065288423225e-05, + "loss": 2.972, + "step": 12376 + }, + { + "epoch": 0.7683282637035198, + "grad_norm": 0.22958740531635727, + "learning_rate": 9.275878100424314e-05, + "loss": 3.0779, + "step": 12377 + }, + { + "epoch": 0.7683903408032776, + "grad_norm": 0.23630816004888536, + "learning_rate": 9.275690890117106e-05, + "loss": 3.0278, + "step": 12378 + }, + { + "epoch": 0.7684524179030355, + "grad_norm": 0.2725781834504712, + "learning_rate": 9.275503657502582e-05, + "loss": 3.1006, + "step": 12379 + }, + { + "epoch": 0.7685144950027935, + "grad_norm": 0.27859668060490045, + "learning_rate": 9.275316402581719e-05, + "loss": 3.1188, + "step": 12380 + }, + { + "epoch": 0.7685765721025514, + "grad_norm": 0.29732102217538064, + "learning_rate": 9.27512912535549e-05, + "loss": 3.0613, + "step": 12381 + }, + { + "epoch": 0.7686386492023093, + "grad_norm": 0.23964726417459464, + "learning_rate": 9.274941825824877e-05, + "loss": 3.1267, + "step": 12382 + }, + { + "epoch": 0.7687007263020672, + "grad_norm": 0.21695814921544235, + "learning_rate": 9.274754503990855e-05, + "loss": 3.054, + "step": 12383 + }, + { + "epoch": 0.768762803401825, + "grad_norm": 0.20805261693544896, + "learning_rate": 9.2745671598544e-05, + "loss": 3.0068, + "step": 12384 + }, + { + "epoch": 0.7688248805015829, + "grad_norm": 0.20905779502546576, + "learning_rate": 9.274379793416489e-05, + "loss": 3.0519, + "step": 12385 + }, + { + "epoch": 0.7688869576013408, + "grad_norm": 0.23562858233523887, + "learning_rate": 9.274192404678101e-05, + "loss": 3.1234, + "step": 12386 + }, + { + "epoch": 0.7689490347010988, + "grad_norm": 0.21003942575080992, + "learning_rate": 9.274004993640215e-05, + "loss": 3.1086, + "step": 12387 + }, + { + "epoch": 0.7690111118008567, + "grad_norm": 0.19940802054074266, + "learning_rate": 9.273817560303808e-05, + "loss": 3.1363, + "step": 12388 + }, + { + "epoch": 0.7690731889006146, + "grad_norm": 0.22359924397626516, + "learning_rate": 9.273630104669856e-05, + "loss": 3.003, + "step": 12389 + }, + { + "epoch": 0.7691352660003724, + "grad_norm": 0.19810778191054776, + "learning_rate": 9.27344262673934e-05, + "loss": 2.9448, + "step": 12390 + }, + { + "epoch": 0.7691973431001303, + "grad_norm": 0.21624481948810342, + "learning_rate": 9.273255126513234e-05, + "loss": 3.0223, + "step": 12391 + }, + { + "epoch": 0.7692594201998882, + "grad_norm": 0.2707001728595975, + "learning_rate": 9.27306760399252e-05, + "loss": 3.097, + "step": 12392 + }, + { + "epoch": 0.7693214972996462, + "grad_norm": 0.21379380730901892, + "learning_rate": 9.272880059178175e-05, + "loss": 3.0335, + "step": 12393 + }, + { + "epoch": 0.7693835743994041, + "grad_norm": 0.20144166725267684, + "learning_rate": 9.272692492071176e-05, + "loss": 3.1131, + "step": 12394 + }, + { + "epoch": 0.769445651499162, + "grad_norm": 0.18061090666821683, + "learning_rate": 9.272504902672504e-05, + "loss": 3.142, + "step": 12395 + }, + { + "epoch": 0.7695077285989198, + "grad_norm": 0.19590499653965157, + "learning_rate": 9.272317290983136e-05, + "loss": 3.0749, + "step": 12396 + }, + { + "epoch": 0.7695698056986777, + "grad_norm": 0.22101175621047364, + "learning_rate": 9.272129657004053e-05, + "loss": 3.0283, + "step": 12397 + }, + { + "epoch": 0.7696318827984356, + "grad_norm": 0.18284604832533632, + "learning_rate": 9.271942000736234e-05, + "loss": 3.0316, + "step": 12398 + }, + { + "epoch": 0.7696939598981936, + "grad_norm": 0.2335036770727136, + "learning_rate": 9.271754322180654e-05, + "loss": 3.0506, + "step": 12399 + }, + { + "epoch": 0.7697560369979515, + "grad_norm": 0.1910737029326697, + "learning_rate": 9.271566621338295e-05, + "loss": 3.0214, + "step": 12400 + }, + { + "epoch": 0.7698181140977094, + "grad_norm": 0.22362181790573607, + "learning_rate": 9.271378898210136e-05, + "loss": 3.0246, + "step": 12401 + }, + { + "epoch": 0.7698801911974672, + "grad_norm": 0.17153435784615526, + "learning_rate": 9.271191152797157e-05, + "loss": 2.9893, + "step": 12402 + }, + { + "epoch": 0.7699422682972251, + "grad_norm": 0.20970885647006707, + "learning_rate": 9.271003385100337e-05, + "loss": 3.0151, + "step": 12403 + }, + { + "epoch": 0.770004345396983, + "grad_norm": 0.31242466256876716, + "learning_rate": 9.270815595120655e-05, + "loss": 3.1357, + "step": 12404 + }, + { + "epoch": 0.770066422496741, + "grad_norm": 0.15917417013517593, + "learning_rate": 9.270627782859089e-05, + "loss": 3.0258, + "step": 12405 + }, + { + "epoch": 0.7701284995964989, + "grad_norm": 0.3093018917439055, + "learning_rate": 9.270439948316625e-05, + "loss": 3.0795, + "step": 12406 + }, + { + "epoch": 0.7701905766962568, + "grad_norm": 0.18381662233082458, + "learning_rate": 9.270252091494237e-05, + "loss": 3.0391, + "step": 12407 + }, + { + "epoch": 0.7702526537960146, + "grad_norm": 0.18326036132582982, + "learning_rate": 9.270064212392908e-05, + "loss": 3.0375, + "step": 12408 + }, + { + "epoch": 0.7703147308957725, + "grad_norm": 0.1866921565538609, + "learning_rate": 9.269876311013617e-05, + "loss": 3.0409, + "step": 12409 + }, + { + "epoch": 0.7703768079955304, + "grad_norm": 0.27397003088419225, + "learning_rate": 9.269688387357345e-05, + "loss": 3.0619, + "step": 12410 + }, + { + "epoch": 0.7704388850952884, + "grad_norm": 0.1851351939272472, + "learning_rate": 9.269500441425072e-05, + "loss": 2.9937, + "step": 12411 + }, + { + "epoch": 0.7705009621950463, + "grad_norm": 0.19178044468258565, + "learning_rate": 9.269312473217777e-05, + "loss": 2.9407, + "step": 12412 + }, + { + "epoch": 0.7705630392948042, + "grad_norm": 0.18230643326123924, + "learning_rate": 9.269124482736445e-05, + "loss": 3.0164, + "step": 12413 + }, + { + "epoch": 0.770625116394562, + "grad_norm": 0.19003305455087954, + "learning_rate": 9.268936469982053e-05, + "loss": 2.9803, + "step": 12414 + }, + { + "epoch": 0.7706871934943199, + "grad_norm": 0.22463571974458746, + "learning_rate": 9.268748434955584e-05, + "loss": 3.0463, + "step": 12415 + }, + { + "epoch": 0.7707492705940778, + "grad_norm": 0.18492520237883617, + "learning_rate": 9.268560377658018e-05, + "loss": 3.0424, + "step": 12416 + }, + { + "epoch": 0.7708113476938357, + "grad_norm": 0.18787734486365942, + "learning_rate": 9.268372298090336e-05, + "loss": 3.0774, + "step": 12417 + }, + { + "epoch": 0.7708734247935937, + "grad_norm": 0.16241998100736116, + "learning_rate": 9.26818419625352e-05, + "loss": 3.0206, + "step": 12418 + }, + { + "epoch": 0.7709355018933516, + "grad_norm": 0.19203792194170374, + "learning_rate": 9.267996072148549e-05, + "loss": 3.1772, + "step": 12419 + }, + { + "epoch": 0.7709975789931094, + "grad_norm": 0.1704311740451064, + "learning_rate": 9.267807925776408e-05, + "loss": 3.1304, + "step": 12420 + }, + { + "epoch": 0.7710596560928673, + "grad_norm": 0.216968665921155, + "learning_rate": 9.267619757138077e-05, + "loss": 3.0036, + "step": 12421 + }, + { + "epoch": 0.7711217331926252, + "grad_norm": 0.275754971643557, + "learning_rate": 9.267431566234537e-05, + "loss": 3.092, + "step": 12422 + }, + { + "epoch": 0.7711838102923831, + "grad_norm": 0.18102181556992153, + "learning_rate": 9.267243353066771e-05, + "loss": 3.1348, + "step": 12423 + }, + { + "epoch": 0.7712458873921411, + "grad_norm": 0.2208304670810627, + "learning_rate": 9.267055117635761e-05, + "loss": 3.0616, + "step": 12424 + }, + { + "epoch": 0.771307964491899, + "grad_norm": 0.21787452062410723, + "learning_rate": 9.266866859942488e-05, + "loss": 3.0069, + "step": 12425 + }, + { + "epoch": 0.7713700415916568, + "grad_norm": 0.20424176379048475, + "learning_rate": 9.266678579987935e-05, + "loss": 2.9401, + "step": 12426 + }, + { + "epoch": 0.7714321186914147, + "grad_norm": 0.17347224377257953, + "learning_rate": 9.266490277773083e-05, + "loss": 3.0615, + "step": 12427 + }, + { + "epoch": 0.7714941957911726, + "grad_norm": 0.18991803611412558, + "learning_rate": 9.266301953298918e-05, + "loss": 3.1625, + "step": 12428 + }, + { + "epoch": 0.7715562728909305, + "grad_norm": 0.211490806112597, + "learning_rate": 9.266113606566417e-05, + "loss": 3.0259, + "step": 12429 + }, + { + "epoch": 0.7716183499906885, + "grad_norm": 0.25745362533166033, + "learning_rate": 9.265925237576568e-05, + "loss": 3.0803, + "step": 12430 + }, + { + "epoch": 0.7716804270904464, + "grad_norm": 0.19824510957415312, + "learning_rate": 9.26573684633035e-05, + "loss": 3.118, + "step": 12431 + }, + { + "epoch": 0.7717425041902042, + "grad_norm": 0.18284366456145365, + "learning_rate": 9.265548432828748e-05, + "loss": 3.0385, + "step": 12432 + }, + { + "epoch": 0.7718045812899621, + "grad_norm": 0.17333525530692528, + "learning_rate": 9.265359997072745e-05, + "loss": 2.9992, + "step": 12433 + }, + { + "epoch": 0.77186665838972, + "grad_norm": 0.21981107146281867, + "learning_rate": 9.265171539063323e-05, + "loss": 2.9461, + "step": 12434 + }, + { + "epoch": 0.7719287354894779, + "grad_norm": 0.17575967444358456, + "learning_rate": 9.264983058801465e-05, + "loss": 3.0367, + "step": 12435 + }, + { + "epoch": 0.7719908125892359, + "grad_norm": 0.17821486991700022, + "learning_rate": 9.264794556288156e-05, + "loss": 3.1189, + "step": 12436 + }, + { + "epoch": 0.7720528896889938, + "grad_norm": 0.1789675157916417, + "learning_rate": 9.264606031524376e-05, + "loss": 3.0544, + "step": 12437 + }, + { + "epoch": 0.7721149667887516, + "grad_norm": 0.17774272731397056, + "learning_rate": 9.264417484511114e-05, + "loss": 3.0921, + "step": 12438 + }, + { + "epoch": 0.7721770438885095, + "grad_norm": 0.20117946721537058, + "learning_rate": 9.26422891524935e-05, + "loss": 3.1193, + "step": 12439 + }, + { + "epoch": 0.7722391209882674, + "grad_norm": 0.17242665517115452, + "learning_rate": 9.264040323740069e-05, + "loss": 3.1357, + "step": 12440 + }, + { + "epoch": 0.7723011980880253, + "grad_norm": 0.17827636745497077, + "learning_rate": 9.263851709984252e-05, + "loss": 3.1582, + "step": 12441 + }, + { + "epoch": 0.7723632751877832, + "grad_norm": 0.195537837692571, + "learning_rate": 9.26366307398289e-05, + "loss": 3.0633, + "step": 12442 + }, + { + "epoch": 0.7724253522875412, + "grad_norm": 0.20500706849472547, + "learning_rate": 9.263474415736959e-05, + "loss": 3.1039, + "step": 12443 + }, + { + "epoch": 0.772487429387299, + "grad_norm": 0.21656702074063597, + "learning_rate": 9.263285735247447e-05, + "loss": 2.9794, + "step": 12444 + }, + { + "epoch": 0.7725495064870569, + "grad_norm": 0.17936311341819586, + "learning_rate": 9.26309703251534e-05, + "loss": 3.0862, + "step": 12445 + }, + { + "epoch": 0.7726115835868148, + "grad_norm": 0.21618764350381253, + "learning_rate": 9.26290830754162e-05, + "loss": 3.0771, + "step": 12446 + }, + { + "epoch": 0.7726736606865727, + "grad_norm": 0.1859856787540299, + "learning_rate": 9.262719560327273e-05, + "loss": 3.0546, + "step": 12447 + }, + { + "epoch": 0.7727357377863306, + "grad_norm": 0.15686551690410142, + "learning_rate": 9.262530790873283e-05, + "loss": 3.0644, + "step": 12448 + }, + { + "epoch": 0.7727978148860886, + "grad_norm": 0.20090503031315474, + "learning_rate": 9.262341999180634e-05, + "loss": 2.9808, + "step": 12449 + }, + { + "epoch": 0.7728598919858464, + "grad_norm": 0.21052844510072144, + "learning_rate": 9.262153185250315e-05, + "loss": 3.0316, + "step": 12450 + }, + { + "epoch": 0.7729219690856043, + "grad_norm": 0.20938819124885283, + "learning_rate": 9.261964349083305e-05, + "loss": 3.0744, + "step": 12451 + }, + { + "epoch": 0.7729840461853622, + "grad_norm": 0.18071720269662342, + "learning_rate": 9.261775490680594e-05, + "loss": 3.0908, + "step": 12452 + }, + { + "epoch": 0.7730461232851201, + "grad_norm": 0.1853223043971311, + "learning_rate": 9.261586610043166e-05, + "loss": 2.9426, + "step": 12453 + }, + { + "epoch": 0.773108200384878, + "grad_norm": 0.21491484766085958, + "learning_rate": 9.261397707172006e-05, + "loss": 3.1296, + "step": 12454 + }, + { + "epoch": 0.773170277484636, + "grad_norm": 0.3097534720885883, + "learning_rate": 9.261208782068098e-05, + "loss": 3.1064, + "step": 12455 + }, + { + "epoch": 0.7732323545843938, + "grad_norm": 0.20364729965312967, + "learning_rate": 9.261019834732432e-05, + "loss": 3.0765, + "step": 12456 + }, + { + "epoch": 0.7732944316841517, + "grad_norm": 0.19389069893668257, + "learning_rate": 9.260830865165989e-05, + "loss": 3.0161, + "step": 12457 + }, + { + "epoch": 0.7733565087839096, + "grad_norm": 0.27318161844586825, + "learning_rate": 9.260641873369758e-05, + "loss": 3.1155, + "step": 12458 + }, + { + "epoch": 0.7734185858836675, + "grad_norm": 0.19782343454557483, + "learning_rate": 9.260452859344723e-05, + "loss": 3.1381, + "step": 12459 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.19199052501436004, + "learning_rate": 9.260263823091872e-05, + "loss": 3.0418, + "step": 12460 + }, + { + "epoch": 0.7735427400831834, + "grad_norm": 0.191603899133649, + "learning_rate": 9.26007476461219e-05, + "loss": 3.0539, + "step": 12461 + }, + { + "epoch": 0.7736048171829412, + "grad_norm": 0.24460479099702045, + "learning_rate": 9.259885683906664e-05, + "loss": 3.1332, + "step": 12462 + }, + { + "epoch": 0.7736668942826991, + "grad_norm": 0.21541681156085937, + "learning_rate": 9.25969658097628e-05, + "loss": 2.9832, + "step": 12463 + }, + { + "epoch": 0.773728971382457, + "grad_norm": 0.18895032368549222, + "learning_rate": 9.259507455822025e-05, + "loss": 3.051, + "step": 12464 + }, + { + "epoch": 0.7737910484822149, + "grad_norm": 0.18892238445495663, + "learning_rate": 9.259318308444886e-05, + "loss": 3.0647, + "step": 12465 + }, + { + "epoch": 0.7738531255819728, + "grad_norm": 0.1970286976605847, + "learning_rate": 9.25912913884585e-05, + "loss": 3.1767, + "step": 12466 + }, + { + "epoch": 0.7739152026817308, + "grad_norm": 0.5612500926911888, + "learning_rate": 9.258939947025901e-05, + "loss": 2.9957, + "step": 12467 + }, + { + "epoch": 0.7739772797814886, + "grad_norm": 0.20370648672740696, + "learning_rate": 9.258750732986032e-05, + "loss": 3.0944, + "step": 12468 + }, + { + "epoch": 0.7740393568812465, + "grad_norm": 0.3808614486246635, + "learning_rate": 9.258561496727224e-05, + "loss": 3.0958, + "step": 12469 + }, + { + "epoch": 0.7741014339810044, + "grad_norm": 0.2863845238819058, + "learning_rate": 9.258372238250468e-05, + "loss": 3.0987, + "step": 12470 + }, + { + "epoch": 0.7741635110807623, + "grad_norm": 0.3196649984978185, + "learning_rate": 9.258182957556749e-05, + "loss": 3.0234, + "step": 12471 + }, + { + "epoch": 0.7742255881805202, + "grad_norm": 0.3206337271724006, + "learning_rate": 9.257993654647057e-05, + "loss": 2.9944, + "step": 12472 + }, + { + "epoch": 0.7742876652802781, + "grad_norm": 0.38599507215175816, + "learning_rate": 9.257804329522378e-05, + "loss": 3.0102, + "step": 12473 + }, + { + "epoch": 0.774349742380036, + "grad_norm": 0.26017430789811835, + "learning_rate": 9.2576149821837e-05, + "loss": 3.0154, + "step": 12474 + }, + { + "epoch": 0.7744118194797939, + "grad_norm": 0.2439324292338341, + "learning_rate": 9.257425612632012e-05, + "loss": 3.1385, + "step": 12475 + }, + { + "epoch": 0.7744738965795518, + "grad_norm": 0.27313237195573953, + "learning_rate": 9.257236220868299e-05, + "loss": 3.0721, + "step": 12476 + }, + { + "epoch": 0.7745359736793097, + "grad_norm": 0.23141744099768946, + "learning_rate": 9.257046806893553e-05, + "loss": 3.0263, + "step": 12477 + }, + { + "epoch": 0.7745980507790676, + "grad_norm": 0.3376270655456202, + "learning_rate": 9.256857370708759e-05, + "loss": 2.9911, + "step": 12478 + }, + { + "epoch": 0.7746601278788255, + "grad_norm": 0.29510754936228384, + "learning_rate": 9.256667912314907e-05, + "loss": 3.0421, + "step": 12479 + }, + { + "epoch": 0.7747222049785834, + "grad_norm": 0.25686160927464513, + "learning_rate": 9.256478431712986e-05, + "loss": 2.9953, + "step": 12480 + }, + { + "epoch": 0.7747842820783413, + "grad_norm": 0.21734730549638995, + "learning_rate": 9.256288928903984e-05, + "loss": 2.9872, + "step": 12481 + }, + { + "epoch": 0.7748463591780992, + "grad_norm": 0.24709192565952726, + "learning_rate": 9.256099403888888e-05, + "loss": 3.0636, + "step": 12482 + }, + { + "epoch": 0.7749084362778571, + "grad_norm": 0.2573424736366513, + "learning_rate": 9.25590985666869e-05, + "loss": 3.0445, + "step": 12483 + }, + { + "epoch": 0.774970513377615, + "grad_norm": 0.1974063503530228, + "learning_rate": 9.255720287244375e-05, + "loss": 3.025, + "step": 12484 + }, + { + "epoch": 0.7750325904773729, + "grad_norm": 0.2644126658290743, + "learning_rate": 9.255530695616936e-05, + "loss": 3.0945, + "step": 12485 + }, + { + "epoch": 0.7750946675771307, + "grad_norm": 0.2064921269168659, + "learning_rate": 9.255341081787358e-05, + "loss": 2.9179, + "step": 12486 + }, + { + "epoch": 0.7751567446768887, + "grad_norm": 0.2640346574494598, + "learning_rate": 9.255151445756635e-05, + "loss": 3.0345, + "step": 12487 + }, + { + "epoch": 0.7752188217766466, + "grad_norm": 0.20219864889711311, + "learning_rate": 9.254961787525753e-05, + "loss": 3.0708, + "step": 12488 + }, + { + "epoch": 0.7752808988764045, + "grad_norm": 0.28371140480134327, + "learning_rate": 9.254772107095702e-05, + "loss": 3.1588, + "step": 12489 + }, + { + "epoch": 0.7753429759761624, + "grad_norm": 0.20256243343758248, + "learning_rate": 9.254582404467473e-05, + "loss": 3.0318, + "step": 12490 + }, + { + "epoch": 0.7754050530759203, + "grad_norm": 0.1723118616191923, + "learning_rate": 9.254392679642054e-05, + "loss": 3.0474, + "step": 12491 + }, + { + "epoch": 0.7754671301756781, + "grad_norm": 0.23255520690722586, + "learning_rate": 9.254202932620436e-05, + "loss": 3.0588, + "step": 12492 + }, + { + "epoch": 0.7755292072754361, + "grad_norm": 0.16904136925816043, + "learning_rate": 9.254013163403607e-05, + "loss": 3.011, + "step": 12493 + }, + { + "epoch": 0.775591284375194, + "grad_norm": 0.21110147707707905, + "learning_rate": 9.25382337199256e-05, + "loss": 3.0916, + "step": 12494 + }, + { + "epoch": 0.7756533614749519, + "grad_norm": 0.17556095624505041, + "learning_rate": 9.253633558388285e-05, + "loss": 3.0688, + "step": 12495 + }, + { + "epoch": 0.7757154385747098, + "grad_norm": 0.27319526842147446, + "learning_rate": 9.253443722591771e-05, + "loss": 3.0535, + "step": 12496 + }, + { + "epoch": 0.7757775156744677, + "grad_norm": 0.21164801813155665, + "learning_rate": 9.253253864604006e-05, + "loss": 3.1314, + "step": 12497 + }, + { + "epoch": 0.7758395927742255, + "grad_norm": 0.19249092007872393, + "learning_rate": 9.253063984425985e-05, + "loss": 3.1141, + "step": 12498 + }, + { + "epoch": 0.7759016698739835, + "grad_norm": 0.21210183117906511, + "learning_rate": 9.252874082058696e-05, + "loss": 2.984, + "step": 12499 + }, + { + "epoch": 0.7759637469737414, + "grad_norm": 0.17791917646329297, + "learning_rate": 9.25268415750313e-05, + "loss": 3.0378, + "step": 12500 + }, + { + "epoch": 0.7760258240734993, + "grad_norm": 0.22001303753368615, + "learning_rate": 9.252494210760281e-05, + "loss": 3.0858, + "step": 12501 + }, + { + "epoch": 0.7760879011732572, + "grad_norm": 0.26508932234468285, + "learning_rate": 9.252304241831134e-05, + "loss": 2.9663, + "step": 12502 + }, + { + "epoch": 0.7761499782730151, + "grad_norm": 0.19810086178474207, + "learning_rate": 9.252114250716685e-05, + "loss": 3.0637, + "step": 12503 + }, + { + "epoch": 0.7762120553727729, + "grad_norm": 0.18555479534711872, + "learning_rate": 9.251924237417924e-05, + "loss": 3.1268, + "step": 12504 + }, + { + "epoch": 0.7762741324725309, + "grad_norm": 0.17872329835928746, + "learning_rate": 9.251734201935843e-05, + "loss": 3.0475, + "step": 12505 + }, + { + "epoch": 0.7763362095722888, + "grad_norm": 0.17068812917665324, + "learning_rate": 9.251544144271431e-05, + "loss": 3.1045, + "step": 12506 + }, + { + "epoch": 0.7763982866720467, + "grad_norm": 0.18274142888983033, + "learning_rate": 9.251354064425681e-05, + "loss": 3.1183, + "step": 12507 + }, + { + "epoch": 0.7764603637718046, + "grad_norm": 0.23550194965049562, + "learning_rate": 9.251163962399585e-05, + "loss": 3.0837, + "step": 12508 + }, + { + "epoch": 0.7765224408715625, + "grad_norm": 0.19822527432758902, + "learning_rate": 9.250973838194134e-05, + "loss": 2.9956, + "step": 12509 + }, + { + "epoch": 0.7765845179713203, + "grad_norm": 0.23504426292028527, + "learning_rate": 9.250783691810322e-05, + "loss": 3.0473, + "step": 12510 + }, + { + "epoch": 0.7766465950710782, + "grad_norm": 0.22417730985219128, + "learning_rate": 9.25059352324914e-05, + "loss": 3.0428, + "step": 12511 + }, + { + "epoch": 0.7767086721708362, + "grad_norm": 0.19066014161364167, + "learning_rate": 9.250403332511579e-05, + "loss": 3.0804, + "step": 12512 + }, + { + "epoch": 0.7767707492705941, + "grad_norm": 0.17613236540405353, + "learning_rate": 9.25021311959863e-05, + "loss": 3.0393, + "step": 12513 + }, + { + "epoch": 0.776832826370352, + "grad_norm": 0.21208430189449878, + "learning_rate": 9.250022884511289e-05, + "loss": 3.0562, + "step": 12514 + }, + { + "epoch": 0.7768949034701099, + "grad_norm": 0.1893544087372926, + "learning_rate": 9.249832627250548e-05, + "loss": 3.1002, + "step": 12515 + }, + { + "epoch": 0.7769569805698677, + "grad_norm": 0.18279619674041594, + "learning_rate": 9.249642347817397e-05, + "loss": 3.1083, + "step": 12516 + }, + { + "epoch": 0.7770190576696256, + "grad_norm": 0.3033574491170051, + "learning_rate": 9.24945204621283e-05, + "loss": 3.0865, + "step": 12517 + }, + { + "epoch": 0.7770811347693836, + "grad_norm": 0.21919114045708654, + "learning_rate": 9.249261722437841e-05, + "loss": 3.1573, + "step": 12518 + }, + { + "epoch": 0.7771432118691415, + "grad_norm": 0.21383918509209346, + "learning_rate": 9.24907137649342e-05, + "loss": 3.1281, + "step": 12519 + }, + { + "epoch": 0.7772052889688994, + "grad_norm": 0.19289515864257728, + "learning_rate": 9.248881008380566e-05, + "loss": 2.9648, + "step": 12520 + }, + { + "epoch": 0.7772673660686573, + "grad_norm": 0.1910466202159907, + "learning_rate": 9.248690618100267e-05, + "loss": 3.0654, + "step": 12521 + }, + { + "epoch": 0.7773294431684151, + "grad_norm": 0.3089884033133439, + "learning_rate": 9.248500205653517e-05, + "loss": 3.015, + "step": 12522 + }, + { + "epoch": 0.777391520268173, + "grad_norm": 0.1796603596964245, + "learning_rate": 9.248309771041309e-05, + "loss": 3.0783, + "step": 12523 + }, + { + "epoch": 0.777453597367931, + "grad_norm": 0.2831636861766283, + "learning_rate": 9.248119314264638e-05, + "loss": 3.0622, + "step": 12524 + }, + { + "epoch": 0.7775156744676889, + "grad_norm": 0.19179107944721285, + "learning_rate": 9.247928835324497e-05, + "loss": 3.0345, + "step": 12525 + }, + { + "epoch": 0.7775777515674468, + "grad_norm": 0.20700555318027122, + "learning_rate": 9.247738334221881e-05, + "loss": 3.0436, + "step": 12526 + }, + { + "epoch": 0.7776398286672047, + "grad_norm": 0.2202694945600629, + "learning_rate": 9.247547810957785e-05, + "loss": 3.0909, + "step": 12527 + }, + { + "epoch": 0.7777019057669625, + "grad_norm": 0.21445185725214333, + "learning_rate": 9.247357265533197e-05, + "loss": 3.0489, + "step": 12528 + }, + { + "epoch": 0.7777639828667204, + "grad_norm": 0.21551409958530715, + "learning_rate": 9.247166697949117e-05, + "loss": 3.0473, + "step": 12529 + }, + { + "epoch": 0.7778260599664784, + "grad_norm": 0.20010412046566664, + "learning_rate": 9.246976108206537e-05, + "loss": 3.0692, + "step": 12530 + }, + { + "epoch": 0.7778881370662363, + "grad_norm": 0.19159660799579334, + "learning_rate": 9.246785496306452e-05, + "loss": 3.0363, + "step": 12531 + }, + { + "epoch": 0.7779502141659942, + "grad_norm": 0.17881412341298597, + "learning_rate": 9.246594862249855e-05, + "loss": 3.0812, + "step": 12532 + }, + { + "epoch": 0.7780122912657521, + "grad_norm": 0.1805358638071506, + "learning_rate": 9.246404206037743e-05, + "loss": 3.0677, + "step": 12533 + }, + { + "epoch": 0.7780743683655099, + "grad_norm": 0.1683881591212096, + "learning_rate": 9.24621352767111e-05, + "loss": 2.944, + "step": 12534 + }, + { + "epoch": 0.7781364454652678, + "grad_norm": 0.1637587687641572, + "learning_rate": 9.246022827150949e-05, + "loss": 2.9282, + "step": 12535 + }, + { + "epoch": 0.7781985225650258, + "grad_norm": 0.2206866924953893, + "learning_rate": 9.245832104478256e-05, + "loss": 3.0074, + "step": 12536 + }, + { + "epoch": 0.7782605996647837, + "grad_norm": 0.1767294265547372, + "learning_rate": 9.245641359654027e-05, + "loss": 3.0779, + "step": 12537 + }, + { + "epoch": 0.7783226767645416, + "grad_norm": 0.16039025842259616, + "learning_rate": 9.245450592679256e-05, + "loss": 3.0901, + "step": 12538 + }, + { + "epoch": 0.7783847538642995, + "grad_norm": 0.17285959580714316, + "learning_rate": 9.245259803554937e-05, + "loss": 2.9746, + "step": 12539 + }, + { + "epoch": 0.7784468309640573, + "grad_norm": 0.16640189057670848, + "learning_rate": 9.24506899228207e-05, + "loss": 3.0693, + "step": 12540 + }, + { + "epoch": 0.7785089080638152, + "grad_norm": 0.1746782452924585, + "learning_rate": 9.244878158861645e-05, + "loss": 2.9565, + "step": 12541 + }, + { + "epoch": 0.7785709851635731, + "grad_norm": 0.1991319373724915, + "learning_rate": 9.244687303294661e-05, + "loss": 3.0988, + "step": 12542 + }, + { + "epoch": 0.7786330622633311, + "grad_norm": 0.17072686983902258, + "learning_rate": 9.244496425582114e-05, + "loss": 2.9973, + "step": 12543 + }, + { + "epoch": 0.778695139363089, + "grad_norm": 0.1840893367056372, + "learning_rate": 9.244305525724998e-05, + "loss": 3.1678, + "step": 12544 + }, + { + "epoch": 0.7787572164628469, + "grad_norm": 0.16921542189927796, + "learning_rate": 9.24411460372431e-05, + "loss": 3.0567, + "step": 12545 + }, + { + "epoch": 0.7788192935626047, + "grad_norm": 0.19861406295942144, + "learning_rate": 9.243923659581045e-05, + "loss": 3.0686, + "step": 12546 + }, + { + "epoch": 0.7788813706623626, + "grad_norm": 0.21763649299703675, + "learning_rate": 9.243732693296201e-05, + "loss": 3.1056, + "step": 12547 + }, + { + "epoch": 0.7789434477621205, + "grad_norm": 0.2073532156088051, + "learning_rate": 9.243541704870773e-05, + "loss": 3.071, + "step": 12548 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.20391921517380485, + "learning_rate": 9.243350694305757e-05, + "loss": 3.0464, + "step": 12549 + }, + { + "epoch": 0.7790676019616364, + "grad_norm": 0.18277715112998402, + "learning_rate": 9.243159661602152e-05, + "loss": 3.1002, + "step": 12550 + }, + { + "epoch": 0.7791296790613943, + "grad_norm": 0.17364811416845075, + "learning_rate": 9.242968606760951e-05, + "loss": 2.9632, + "step": 12551 + }, + { + "epoch": 0.7791917561611521, + "grad_norm": 0.22993460681854708, + "learning_rate": 9.242777529783154e-05, + "loss": 3.0391, + "step": 12552 + }, + { + "epoch": 0.77925383326091, + "grad_norm": 0.22727367166572207, + "learning_rate": 9.242586430669757e-05, + "loss": 3.0581, + "step": 12553 + }, + { + "epoch": 0.7793159103606679, + "grad_norm": 0.18090078979623436, + "learning_rate": 9.242395309421755e-05, + "loss": 3.0305, + "step": 12554 + }, + { + "epoch": 0.7793779874604259, + "grad_norm": 0.18249852614264875, + "learning_rate": 9.242204166040148e-05, + "loss": 2.9723, + "step": 12555 + }, + { + "epoch": 0.7794400645601838, + "grad_norm": 0.19662192101502063, + "learning_rate": 9.242013000525932e-05, + "loss": 3.0071, + "step": 12556 + }, + { + "epoch": 0.7795021416599417, + "grad_norm": 0.2805826561507696, + "learning_rate": 9.241821812880105e-05, + "loss": 3.0378, + "step": 12557 + }, + { + "epoch": 0.7795642187596995, + "grad_norm": 0.2250401553773225, + "learning_rate": 9.241630603103663e-05, + "loss": 2.9383, + "step": 12558 + }, + { + "epoch": 0.7796262958594574, + "grad_norm": 0.3023240292448299, + "learning_rate": 9.241439371197604e-05, + "loss": 2.9897, + "step": 12559 + }, + { + "epoch": 0.7796883729592153, + "grad_norm": 0.2467278908382487, + "learning_rate": 9.241248117162926e-05, + "loss": 3.0318, + "step": 12560 + }, + { + "epoch": 0.7797504500589733, + "grad_norm": 0.21113554170240237, + "learning_rate": 9.241056841000628e-05, + "loss": 3.0923, + "step": 12561 + }, + { + "epoch": 0.7798125271587312, + "grad_norm": 0.19606511070269256, + "learning_rate": 9.240865542711708e-05, + "loss": 2.969, + "step": 12562 + }, + { + "epoch": 0.7798746042584891, + "grad_norm": 0.20348895002458095, + "learning_rate": 9.240674222297161e-05, + "loss": 3.0466, + "step": 12563 + }, + { + "epoch": 0.7799366813582469, + "grad_norm": 0.1880232047797058, + "learning_rate": 9.240482879757987e-05, + "loss": 3.1333, + "step": 12564 + }, + { + "epoch": 0.7799987584580048, + "grad_norm": 0.21204243125191743, + "learning_rate": 9.240291515095185e-05, + "loss": 3.0482, + "step": 12565 + }, + { + "epoch": 0.7800608355577627, + "grad_norm": 0.23705843326400305, + "learning_rate": 9.240100128309753e-05, + "loss": 3.0435, + "step": 12566 + }, + { + "epoch": 0.7801229126575207, + "grad_norm": 0.17571820082045272, + "learning_rate": 9.23990871940269e-05, + "loss": 3.1259, + "step": 12567 + }, + { + "epoch": 0.7801849897572786, + "grad_norm": 0.2263967960403075, + "learning_rate": 9.239717288374992e-05, + "loss": 3.0232, + "step": 12568 + }, + { + "epoch": 0.7802470668570365, + "grad_norm": 0.23018210575962136, + "learning_rate": 9.239525835227661e-05, + "loss": 3.1417, + "step": 12569 + }, + { + "epoch": 0.7803091439567943, + "grad_norm": 0.190256891347613, + "learning_rate": 9.239334359961695e-05, + "loss": 3.1242, + "step": 12570 + }, + { + "epoch": 0.7803712210565522, + "grad_norm": 0.24474966757559874, + "learning_rate": 9.239142862578092e-05, + "loss": 3.0371, + "step": 12571 + }, + { + "epoch": 0.7804332981563101, + "grad_norm": 0.2327573422883115, + "learning_rate": 9.238951343077851e-05, + "loss": 3.0655, + "step": 12572 + }, + { + "epoch": 0.780495375256068, + "grad_norm": 0.2587525971830561, + "learning_rate": 9.238759801461971e-05, + "loss": 3.0855, + "step": 12573 + }, + { + "epoch": 0.780557452355826, + "grad_norm": 0.2102019501385798, + "learning_rate": 9.238568237731456e-05, + "loss": 3.0609, + "step": 12574 + }, + { + "epoch": 0.7806195294555839, + "grad_norm": 0.24940212851511226, + "learning_rate": 9.238376651887298e-05, + "loss": 3.0872, + "step": 12575 + }, + { + "epoch": 0.7806816065553417, + "grad_norm": 0.29754435220658604, + "learning_rate": 9.2381850439305e-05, + "loss": 3.1479, + "step": 12576 + }, + { + "epoch": 0.7807436836550996, + "grad_norm": 0.22389066398343926, + "learning_rate": 9.237993413862063e-05, + "loss": 3.1071, + "step": 12577 + }, + { + "epoch": 0.7808057607548575, + "grad_norm": 0.34183747186709823, + "learning_rate": 9.237801761682985e-05, + "loss": 3.0799, + "step": 12578 + }, + { + "epoch": 0.7808678378546154, + "grad_norm": 0.2624096646865267, + "learning_rate": 9.237610087394267e-05, + "loss": 3.0669, + "step": 12579 + }, + { + "epoch": 0.7809299149543734, + "grad_norm": 0.21443073280872732, + "learning_rate": 9.237418390996907e-05, + "loss": 2.9882, + "step": 12580 + }, + { + "epoch": 0.7809919920541313, + "grad_norm": 0.22454000313938335, + "learning_rate": 9.23722667249191e-05, + "loss": 3.102, + "step": 12581 + }, + { + "epoch": 0.7810540691538891, + "grad_norm": 0.2612469962052321, + "learning_rate": 9.237034931880271e-05, + "loss": 3.118, + "step": 12582 + }, + { + "epoch": 0.781116146253647, + "grad_norm": 0.20099617405471776, + "learning_rate": 9.23684316916299e-05, + "loss": 3.0326, + "step": 12583 + }, + { + "epoch": 0.7811782233534049, + "grad_norm": 0.3413573681822178, + "learning_rate": 9.236651384341074e-05, + "loss": 2.9944, + "step": 12584 + }, + { + "epoch": 0.7812403004531628, + "grad_norm": 0.22684644302627835, + "learning_rate": 9.236459577415515e-05, + "loss": 2.9768, + "step": 12585 + }, + { + "epoch": 0.7813023775529208, + "grad_norm": 0.19528340122895593, + "learning_rate": 9.23626774838732e-05, + "loss": 3.097, + "step": 12586 + }, + { + "epoch": 0.7813644546526787, + "grad_norm": 0.3112688848212668, + "learning_rate": 9.236075897257488e-05, + "loss": 2.9838, + "step": 12587 + }, + { + "epoch": 0.7814265317524365, + "grad_norm": 0.1952699137989961, + "learning_rate": 9.23588402402702e-05, + "loss": 3.0055, + "step": 12588 + }, + { + "epoch": 0.7814886088521944, + "grad_norm": 0.21763839297903181, + "learning_rate": 9.235692128696914e-05, + "loss": 3.1123, + "step": 12589 + }, + { + "epoch": 0.7815506859519523, + "grad_norm": 0.19784369608387734, + "learning_rate": 9.235500211268177e-05, + "loss": 3.0944, + "step": 12590 + }, + { + "epoch": 0.7816127630517102, + "grad_norm": 0.20837515868116263, + "learning_rate": 9.235308271741805e-05, + "loss": 3.0845, + "step": 12591 + }, + { + "epoch": 0.7816748401514682, + "grad_norm": 0.18669218648513977, + "learning_rate": 9.235116310118803e-05, + "loss": 3.0146, + "step": 12592 + }, + { + "epoch": 0.7817369172512261, + "grad_norm": 0.1797478823172302, + "learning_rate": 9.234924326400171e-05, + "loss": 2.9985, + "step": 12593 + }, + { + "epoch": 0.7817989943509839, + "grad_norm": 0.1805806147677783, + "learning_rate": 9.23473232058691e-05, + "loss": 3.0751, + "step": 12594 + }, + { + "epoch": 0.7818610714507418, + "grad_norm": 0.2276355391528686, + "learning_rate": 9.234540292680023e-05, + "loss": 3.1459, + "step": 12595 + }, + { + "epoch": 0.7819231485504997, + "grad_norm": 0.1910637496518044, + "learning_rate": 9.234348242680512e-05, + "loss": 3.085, + "step": 12596 + }, + { + "epoch": 0.7819852256502576, + "grad_norm": 0.17994640191448608, + "learning_rate": 9.234156170589377e-05, + "loss": 3.0349, + "step": 12597 + }, + { + "epoch": 0.7820473027500155, + "grad_norm": 0.18038911259588394, + "learning_rate": 9.233964076407622e-05, + "loss": 3.0483, + "step": 12598 + }, + { + "epoch": 0.7821093798497735, + "grad_norm": 0.18505503064753978, + "learning_rate": 9.233771960136249e-05, + "loss": 3.1573, + "step": 12599 + }, + { + "epoch": 0.7821714569495313, + "grad_norm": 0.3353331696893534, + "learning_rate": 9.233579821776259e-05, + "loss": 2.9963, + "step": 12600 + }, + { + "epoch": 0.7822335340492892, + "grad_norm": 0.1867085382230855, + "learning_rate": 9.233387661328656e-05, + "loss": 3.0419, + "step": 12601 + }, + { + "epoch": 0.7822956111490471, + "grad_norm": 0.20928532331084676, + "learning_rate": 9.233195478794441e-05, + "loss": 3.058, + "step": 12602 + }, + { + "epoch": 0.782357688248805, + "grad_norm": 0.19534398778773696, + "learning_rate": 9.233003274174618e-05, + "loss": 3.1135, + "step": 12603 + }, + { + "epoch": 0.782419765348563, + "grad_norm": 0.2128424283687975, + "learning_rate": 9.232811047470188e-05, + "loss": 3.0544, + "step": 12604 + }, + { + "epoch": 0.7824818424483209, + "grad_norm": 0.25271220043606335, + "learning_rate": 9.232618798682158e-05, + "loss": 3.0783, + "step": 12605 + }, + { + "epoch": 0.7825439195480787, + "grad_norm": 0.20893640948155637, + "learning_rate": 9.232426527811528e-05, + "loss": 3.0499, + "step": 12606 + }, + { + "epoch": 0.7826059966478366, + "grad_norm": 0.25807335690262884, + "learning_rate": 9.2322342348593e-05, + "loss": 3.1786, + "step": 12607 + }, + { + "epoch": 0.7826680737475945, + "grad_norm": 0.18979091615524998, + "learning_rate": 9.232041919826477e-05, + "loss": 2.9626, + "step": 12608 + }, + { + "epoch": 0.7827301508473524, + "grad_norm": 0.24422127726657286, + "learning_rate": 9.231849582714068e-05, + "loss": 2.9838, + "step": 12609 + }, + { + "epoch": 0.7827922279471103, + "grad_norm": 0.24682690387319206, + "learning_rate": 9.231657223523069e-05, + "loss": 3.0453, + "step": 12610 + }, + { + "epoch": 0.7828543050468683, + "grad_norm": 0.18645467039769076, + "learning_rate": 9.231464842254489e-05, + "loss": 2.9787, + "step": 12611 + }, + { + "epoch": 0.7829163821466261, + "grad_norm": 0.23772509348361312, + "learning_rate": 9.231272438909329e-05, + "loss": 3.1084, + "step": 12612 + }, + { + "epoch": 0.782978459246384, + "grad_norm": 0.17858104467045394, + "learning_rate": 9.231080013488594e-05, + "loss": 3.0658, + "step": 12613 + }, + { + "epoch": 0.7830405363461419, + "grad_norm": 0.19056857008632616, + "learning_rate": 9.230887565993285e-05, + "loss": 2.9772, + "step": 12614 + }, + { + "epoch": 0.7831026134458998, + "grad_norm": 0.2081471545081052, + "learning_rate": 9.230695096424411e-05, + "loss": 2.9801, + "step": 12615 + }, + { + "epoch": 0.7831646905456577, + "grad_norm": 0.2057308889067466, + "learning_rate": 9.230502604782974e-05, + "loss": 3.0625, + "step": 12616 + }, + { + "epoch": 0.7832267676454157, + "grad_norm": 0.1905561271914316, + "learning_rate": 9.230310091069976e-05, + "loss": 3.0413, + "step": 12617 + }, + { + "epoch": 0.7832888447451735, + "grad_norm": 0.18554668426503867, + "learning_rate": 9.230117555286423e-05, + "loss": 3.038, + "step": 12618 + }, + { + "epoch": 0.7833509218449314, + "grad_norm": 0.16286987565894892, + "learning_rate": 9.229924997433322e-05, + "loss": 3.099, + "step": 12619 + }, + { + "epoch": 0.7834129989446893, + "grad_norm": 0.1958666284052661, + "learning_rate": 9.229732417511674e-05, + "loss": 2.9402, + "step": 12620 + }, + { + "epoch": 0.7834750760444472, + "grad_norm": 0.1668070497469276, + "learning_rate": 9.229539815522485e-05, + "loss": 3.1226, + "step": 12621 + }, + { + "epoch": 0.7835371531442051, + "grad_norm": 0.192511297783071, + "learning_rate": 9.229347191466761e-05, + "loss": 3.0396, + "step": 12622 + }, + { + "epoch": 0.783599230243963, + "grad_norm": 0.1773542069196422, + "learning_rate": 9.229154545345507e-05, + "loss": 3.0262, + "step": 12623 + }, + { + "epoch": 0.7836613073437209, + "grad_norm": 0.1635100281436913, + "learning_rate": 9.228961877159726e-05, + "loss": 3.0928, + "step": 12624 + }, + { + "epoch": 0.7837233844434788, + "grad_norm": 0.17479360001847066, + "learning_rate": 9.228769186910424e-05, + "loss": 3.025, + "step": 12625 + }, + { + "epoch": 0.7837854615432367, + "grad_norm": 0.1821971796026056, + "learning_rate": 9.228576474598605e-05, + "loss": 3.0208, + "step": 12626 + }, + { + "epoch": 0.7838475386429946, + "grad_norm": 0.1916708965629494, + "learning_rate": 9.228383740225278e-05, + "loss": 2.9839, + "step": 12627 + }, + { + "epoch": 0.7839096157427525, + "grad_norm": 0.1739716687249693, + "learning_rate": 9.228190983791446e-05, + "loss": 3.1065, + "step": 12628 + }, + { + "epoch": 0.7839716928425104, + "grad_norm": 0.22147128522295725, + "learning_rate": 9.227998205298116e-05, + "loss": 3.0635, + "step": 12629 + }, + { + "epoch": 0.7840337699422683, + "grad_norm": 0.21519357637296074, + "learning_rate": 9.227805404746293e-05, + "loss": 3.1546, + "step": 12630 + }, + { + "epoch": 0.7840958470420262, + "grad_norm": 0.18788924147320563, + "learning_rate": 9.227612582136981e-05, + "loss": 3.0334, + "step": 12631 + }, + { + "epoch": 0.7841579241417841, + "grad_norm": 0.17858293253593605, + "learning_rate": 9.227419737471189e-05, + "loss": 3.0842, + "step": 12632 + }, + { + "epoch": 0.784220001241542, + "grad_norm": 0.2738666164537515, + "learning_rate": 9.227226870749922e-05, + "loss": 3.0444, + "step": 12633 + }, + { + "epoch": 0.7842820783412999, + "grad_norm": 0.22008666306665475, + "learning_rate": 9.227033981974187e-05, + "loss": 3.0965, + "step": 12634 + }, + { + "epoch": 0.7843441554410578, + "grad_norm": 0.20342882981700602, + "learning_rate": 9.226841071144988e-05, + "loss": 3.0492, + "step": 12635 + }, + { + "epoch": 0.7844062325408157, + "grad_norm": 0.18832411060566334, + "learning_rate": 9.226648138263334e-05, + "loss": 3.0191, + "step": 12636 + }, + { + "epoch": 0.7844683096405736, + "grad_norm": 0.17282784832714213, + "learning_rate": 9.22645518333023e-05, + "loss": 3.0027, + "step": 12637 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.21193267810818145, + "learning_rate": 9.226262206346684e-05, + "loss": 3.1177, + "step": 12638 + }, + { + "epoch": 0.7845924638400894, + "grad_norm": 0.22473568889003365, + "learning_rate": 9.226069207313703e-05, + "loss": 3.1305, + "step": 12639 + }, + { + "epoch": 0.7846545409398473, + "grad_norm": 0.19788326224128724, + "learning_rate": 9.225876186232291e-05, + "loss": 3.0194, + "step": 12640 + }, + { + "epoch": 0.7847166180396052, + "grad_norm": 0.22583611557596822, + "learning_rate": 9.225683143103458e-05, + "loss": 3.0143, + "step": 12641 + }, + { + "epoch": 0.784778695139363, + "grad_norm": 0.19202131591752236, + "learning_rate": 9.22549007792821e-05, + "loss": 3.0717, + "step": 12642 + }, + { + "epoch": 0.784840772239121, + "grad_norm": 0.18436025342320603, + "learning_rate": 9.225296990707555e-05, + "loss": 3.0541, + "step": 12643 + }, + { + "epoch": 0.7849028493388789, + "grad_norm": 0.18266905593687216, + "learning_rate": 9.225103881442499e-05, + "loss": 3.0892, + "step": 12644 + }, + { + "epoch": 0.7849649264386368, + "grad_norm": 0.19273192781055232, + "learning_rate": 9.224910750134052e-05, + "loss": 3.1004, + "step": 12645 + }, + { + "epoch": 0.7850270035383947, + "grad_norm": 0.19755232155649144, + "learning_rate": 9.224717596783218e-05, + "loss": 3.0848, + "step": 12646 + }, + { + "epoch": 0.7850890806381526, + "grad_norm": 0.273154793779406, + "learning_rate": 9.224524421391008e-05, + "loss": 3.0152, + "step": 12647 + }, + { + "epoch": 0.7851511577379104, + "grad_norm": 0.21296311815334124, + "learning_rate": 9.224331223958429e-05, + "loss": 2.9987, + "step": 12648 + }, + { + "epoch": 0.7852132348376684, + "grad_norm": 0.20809483386844616, + "learning_rate": 9.224138004486487e-05, + "loss": 3.0555, + "step": 12649 + }, + { + "epoch": 0.7852753119374263, + "grad_norm": 0.20652357871662447, + "learning_rate": 9.223944762976192e-05, + "loss": 3.0736, + "step": 12650 + }, + { + "epoch": 0.7853373890371842, + "grad_norm": 0.1879294266160745, + "learning_rate": 9.223751499428553e-05, + "loss": 3.1187, + "step": 12651 + }, + { + "epoch": 0.7853994661369421, + "grad_norm": 0.21229455778070636, + "learning_rate": 9.223558213844575e-05, + "loss": 3.1024, + "step": 12652 + }, + { + "epoch": 0.7854615432367, + "grad_norm": 0.21952578378300036, + "learning_rate": 9.22336490622527e-05, + "loss": 3.0751, + "step": 12653 + }, + { + "epoch": 0.7855236203364578, + "grad_norm": 0.20307765005334547, + "learning_rate": 9.223171576571645e-05, + "loss": 3.055, + "step": 12654 + }, + { + "epoch": 0.7855856974362158, + "grad_norm": 0.17430355120790278, + "learning_rate": 9.222978224884708e-05, + "loss": 2.9754, + "step": 12655 + }, + { + "epoch": 0.7856477745359737, + "grad_norm": 0.1878130108101193, + "learning_rate": 9.222784851165469e-05, + "loss": 3.0649, + "step": 12656 + }, + { + "epoch": 0.7857098516357316, + "grad_norm": 0.1951279817636826, + "learning_rate": 9.222591455414935e-05, + "loss": 3.0469, + "step": 12657 + }, + { + "epoch": 0.7857719287354895, + "grad_norm": 0.18342662874920404, + "learning_rate": 9.222398037634119e-05, + "loss": 3.0032, + "step": 12658 + }, + { + "epoch": 0.7858340058352474, + "grad_norm": 0.18078922369540842, + "learning_rate": 9.222204597824024e-05, + "loss": 3.0498, + "step": 12659 + }, + { + "epoch": 0.7858960829350052, + "grad_norm": 0.1897349375430814, + "learning_rate": 9.222011135985665e-05, + "loss": 3.0433, + "step": 12660 + }, + { + "epoch": 0.7859581600347632, + "grad_norm": 0.16761415778866617, + "learning_rate": 9.221817652120048e-05, + "loss": 2.9718, + "step": 12661 + }, + { + "epoch": 0.7860202371345211, + "grad_norm": 0.19156758651701178, + "learning_rate": 9.221624146228185e-05, + "loss": 3.0767, + "step": 12662 + }, + { + "epoch": 0.786082314234279, + "grad_norm": 0.1981576918005246, + "learning_rate": 9.221430618311082e-05, + "loss": 3.0987, + "step": 12663 + }, + { + "epoch": 0.7861443913340369, + "grad_norm": 0.2800861224491798, + "learning_rate": 9.221237068369753e-05, + "loss": 3.0271, + "step": 12664 + }, + { + "epoch": 0.7862064684337948, + "grad_norm": 0.22461634086090293, + "learning_rate": 9.221043496405203e-05, + "loss": 3.036, + "step": 12665 + }, + { + "epoch": 0.7862685455335526, + "grad_norm": 0.19411307164319963, + "learning_rate": 9.220849902418445e-05, + "loss": 2.9539, + "step": 12666 + }, + { + "epoch": 0.7863306226333105, + "grad_norm": 0.19305084158057678, + "learning_rate": 9.22065628641049e-05, + "loss": 3.0309, + "step": 12667 + }, + { + "epoch": 0.7863926997330685, + "grad_norm": 0.16616439885017495, + "learning_rate": 9.220462648382344e-05, + "loss": 3.0784, + "step": 12668 + }, + { + "epoch": 0.7864547768328264, + "grad_norm": 0.2067046468310918, + "learning_rate": 9.220268988335022e-05, + "loss": 3.0783, + "step": 12669 + }, + { + "epoch": 0.7865168539325843, + "grad_norm": 0.21913303321294955, + "learning_rate": 9.220075306269531e-05, + "loss": 3.0437, + "step": 12670 + }, + { + "epoch": 0.7865789310323422, + "grad_norm": 0.18172823936108906, + "learning_rate": 9.219881602186882e-05, + "loss": 3.0312, + "step": 12671 + }, + { + "epoch": 0.7866410081321, + "grad_norm": 0.18745338337282583, + "learning_rate": 9.219687876088087e-05, + "loss": 3.0864, + "step": 12672 + }, + { + "epoch": 0.786703085231858, + "grad_norm": 0.18943289029318902, + "learning_rate": 9.219494127974157e-05, + "loss": 3.0228, + "step": 12673 + }, + { + "epoch": 0.7867651623316159, + "grad_norm": 0.17597776660232042, + "learning_rate": 9.219300357846102e-05, + "loss": 3.1813, + "step": 12674 + }, + { + "epoch": 0.7868272394313738, + "grad_norm": 0.16452085522839155, + "learning_rate": 9.219106565704931e-05, + "loss": 3.0323, + "step": 12675 + }, + { + "epoch": 0.7868893165311317, + "grad_norm": 0.1799367130139105, + "learning_rate": 9.218912751551659e-05, + "loss": 3.1102, + "step": 12676 + }, + { + "epoch": 0.7869513936308896, + "grad_norm": 0.16987579641723888, + "learning_rate": 9.218718915387292e-05, + "loss": 3.1112, + "step": 12677 + }, + { + "epoch": 0.7870134707306474, + "grad_norm": 0.18086509860314925, + "learning_rate": 9.218525057212846e-05, + "loss": 3.1217, + "step": 12678 + }, + { + "epoch": 0.7870755478304053, + "grad_norm": 0.16930927915949032, + "learning_rate": 9.21833117702933e-05, + "loss": 3.0587, + "step": 12679 + }, + { + "epoch": 0.7871376249301633, + "grad_norm": 0.15437930355946072, + "learning_rate": 9.218137274837758e-05, + "loss": 2.9381, + "step": 12680 + }, + { + "epoch": 0.7871997020299212, + "grad_norm": 0.22704531755333907, + "learning_rate": 9.217943350639139e-05, + "loss": 3.0227, + "step": 12681 + }, + { + "epoch": 0.7872617791296791, + "grad_norm": 0.17644290376023622, + "learning_rate": 9.217749404434484e-05, + "loss": 3.0191, + "step": 12682 + }, + { + "epoch": 0.787323856229437, + "grad_norm": 0.19363083712778617, + "learning_rate": 9.217555436224809e-05, + "loss": 3.1576, + "step": 12683 + }, + { + "epoch": 0.7873859333291948, + "grad_norm": 0.16521297974644908, + "learning_rate": 9.217361446011121e-05, + "loss": 2.9728, + "step": 12684 + }, + { + "epoch": 0.7874480104289527, + "grad_norm": 0.21928028137680255, + "learning_rate": 9.217167433794437e-05, + "loss": 3.0644, + "step": 12685 + }, + { + "epoch": 0.7875100875287107, + "grad_norm": 0.22697956239812583, + "learning_rate": 9.216973399575765e-05, + "loss": 3.0454, + "step": 12686 + }, + { + "epoch": 0.7875721646284686, + "grad_norm": 0.3414541651712064, + "learning_rate": 9.21677934335612e-05, + "loss": 3.0115, + "step": 12687 + }, + { + "epoch": 0.7876342417282265, + "grad_norm": 0.18670209347478828, + "learning_rate": 9.216585265136513e-05, + "loss": 2.9225, + "step": 12688 + }, + { + "epoch": 0.7876963188279844, + "grad_norm": 0.20068775181975793, + "learning_rate": 9.216391164917957e-05, + "loss": 3.1007, + "step": 12689 + }, + { + "epoch": 0.7877583959277422, + "grad_norm": 0.23305876207614942, + "learning_rate": 9.216197042701464e-05, + "loss": 3.0081, + "step": 12690 + }, + { + "epoch": 0.7878204730275001, + "grad_norm": 0.1966429409941005, + "learning_rate": 9.216002898488048e-05, + "loss": 3.0787, + "step": 12691 + }, + { + "epoch": 0.787882550127258, + "grad_norm": 0.23572980789763762, + "learning_rate": 9.215808732278722e-05, + "loss": 3.1312, + "step": 12692 + }, + { + "epoch": 0.787944627227016, + "grad_norm": 0.19556844154212405, + "learning_rate": 9.2156145440745e-05, + "loss": 3.0788, + "step": 12693 + }, + { + "epoch": 0.7880067043267739, + "grad_norm": 0.17645480069839511, + "learning_rate": 9.21542033387639e-05, + "loss": 3.0935, + "step": 12694 + }, + { + "epoch": 0.7880687814265318, + "grad_norm": 0.17287183093188668, + "learning_rate": 9.215226101685411e-05, + "loss": 3.0627, + "step": 12695 + }, + { + "epoch": 0.7881308585262896, + "grad_norm": 0.16722917193395387, + "learning_rate": 9.215031847502573e-05, + "loss": 3.1572, + "step": 12696 + }, + { + "epoch": 0.7881929356260475, + "grad_norm": 0.17953737870953074, + "learning_rate": 9.214837571328892e-05, + "loss": 3.0648, + "step": 12697 + }, + { + "epoch": 0.7882550127258054, + "grad_norm": 0.1748507827847761, + "learning_rate": 9.21464327316538e-05, + "loss": 2.9787, + "step": 12698 + }, + { + "epoch": 0.7883170898255634, + "grad_norm": 0.1792529386820844, + "learning_rate": 9.21444895301305e-05, + "loss": 3.0312, + "step": 12699 + }, + { + "epoch": 0.7883791669253213, + "grad_norm": 0.16894199941396446, + "learning_rate": 9.214254610872916e-05, + "loss": 3.0543, + "step": 12700 + }, + { + "epoch": 0.7884412440250792, + "grad_norm": 0.16970138384992944, + "learning_rate": 9.214060246745995e-05, + "loss": 2.9946, + "step": 12701 + }, + { + "epoch": 0.788503321124837, + "grad_norm": 0.1992745695264065, + "learning_rate": 9.213865860633297e-05, + "loss": 3.0763, + "step": 12702 + }, + { + "epoch": 0.7885653982245949, + "grad_norm": 0.16922723807852577, + "learning_rate": 9.21367145253584e-05, + "loss": 3.1051, + "step": 12703 + }, + { + "epoch": 0.7886274753243528, + "grad_norm": 0.22553833133013837, + "learning_rate": 9.213477022454633e-05, + "loss": 3.0844, + "step": 12704 + }, + { + "epoch": 0.7886895524241108, + "grad_norm": 0.21499259301112744, + "learning_rate": 9.213282570390696e-05, + "loss": 3.1368, + "step": 12705 + }, + { + "epoch": 0.7887516295238687, + "grad_norm": 0.18608009616590301, + "learning_rate": 9.21308809634504e-05, + "loss": 3.098, + "step": 12706 + }, + { + "epoch": 0.7888137066236266, + "grad_norm": 0.21463337915723618, + "learning_rate": 9.212893600318679e-05, + "loss": 3.0107, + "step": 12707 + }, + { + "epoch": 0.7888757837233844, + "grad_norm": 0.26177094067258766, + "learning_rate": 9.212699082312631e-05, + "loss": 3.1682, + "step": 12708 + }, + { + "epoch": 0.7889378608231423, + "grad_norm": 0.19578048905318357, + "learning_rate": 9.212504542327909e-05, + "loss": 3.1085, + "step": 12709 + }, + { + "epoch": 0.7889999379229002, + "grad_norm": 0.22815441862917105, + "learning_rate": 9.212309980365529e-05, + "loss": 3.0845, + "step": 12710 + }, + { + "epoch": 0.7890620150226582, + "grad_norm": 0.21187528791661236, + "learning_rate": 9.212115396426505e-05, + "loss": 3.0444, + "step": 12711 + }, + { + "epoch": 0.7891240921224161, + "grad_norm": 0.23450724526550965, + "learning_rate": 9.211920790511853e-05, + "loss": 3.0024, + "step": 12712 + }, + { + "epoch": 0.789186169222174, + "grad_norm": 0.2107404035292979, + "learning_rate": 9.211726162622586e-05, + "loss": 3.0676, + "step": 12713 + }, + { + "epoch": 0.7892482463219318, + "grad_norm": 0.22825659229028014, + "learning_rate": 9.211531512759722e-05, + "loss": 3.0725, + "step": 12714 + }, + { + "epoch": 0.7893103234216897, + "grad_norm": 0.3421061397311529, + "learning_rate": 9.211336840924275e-05, + "loss": 3.0666, + "step": 12715 + }, + { + "epoch": 0.7893724005214476, + "grad_norm": 0.1764258750696523, + "learning_rate": 9.211142147117262e-05, + "loss": 3.1124, + "step": 12716 + }, + { + "epoch": 0.7894344776212056, + "grad_norm": 0.21826384370524685, + "learning_rate": 9.210947431339698e-05, + "loss": 3.0243, + "step": 12717 + }, + { + "epoch": 0.7894965547209635, + "grad_norm": 0.21141599319741933, + "learning_rate": 9.2107526935926e-05, + "loss": 3.0709, + "step": 12718 + }, + { + "epoch": 0.7895586318207214, + "grad_norm": 0.2602054486421512, + "learning_rate": 9.210557933876981e-05, + "loss": 3.0798, + "step": 12719 + }, + { + "epoch": 0.7896207089204792, + "grad_norm": 0.1828953567440302, + "learning_rate": 9.210363152193862e-05, + "loss": 3.014, + "step": 12720 + }, + { + "epoch": 0.7896827860202371, + "grad_norm": 0.19383515478607038, + "learning_rate": 9.210168348544255e-05, + "loss": 3.0434, + "step": 12721 + }, + { + "epoch": 0.789744863119995, + "grad_norm": 0.1841688583766476, + "learning_rate": 9.209973522929176e-05, + "loss": 3.05, + "step": 12722 + }, + { + "epoch": 0.789806940219753, + "grad_norm": 0.18755401389637877, + "learning_rate": 9.209778675349645e-05, + "loss": 3.1056, + "step": 12723 + }, + { + "epoch": 0.7898690173195109, + "grad_norm": 0.24832611436053298, + "learning_rate": 9.209583805806677e-05, + "loss": 3.0454, + "step": 12724 + }, + { + "epoch": 0.7899310944192688, + "grad_norm": 0.25244148003523614, + "learning_rate": 9.209388914301287e-05, + "loss": 3.0747, + "step": 12725 + }, + { + "epoch": 0.7899931715190266, + "grad_norm": 0.1877019609113368, + "learning_rate": 9.209194000834494e-05, + "loss": 2.9966, + "step": 12726 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.22507427036171795, + "learning_rate": 9.208999065407313e-05, + "loss": 2.9918, + "step": 12727 + }, + { + "epoch": 0.7901173257185424, + "grad_norm": 0.19677633091213925, + "learning_rate": 9.208804108020763e-05, + "loss": 2.9858, + "step": 12728 + }, + { + "epoch": 0.7901794028183003, + "grad_norm": 0.1942358573965638, + "learning_rate": 9.20860912867586e-05, + "loss": 3.0587, + "step": 12729 + }, + { + "epoch": 0.7902414799180583, + "grad_norm": 0.3136295408799726, + "learning_rate": 9.208414127373621e-05, + "loss": 3.0422, + "step": 12730 + }, + { + "epoch": 0.7903035570178162, + "grad_norm": 0.18428942585663138, + "learning_rate": 9.208219104115066e-05, + "loss": 3.038, + "step": 12731 + }, + { + "epoch": 0.790365634117574, + "grad_norm": 0.18310505167595728, + "learning_rate": 9.208024058901208e-05, + "loss": 3.1152, + "step": 12732 + }, + { + "epoch": 0.7904277112173319, + "grad_norm": 0.18481887751616133, + "learning_rate": 9.207828991733067e-05, + "loss": 3.0328, + "step": 12733 + }, + { + "epoch": 0.7904897883170898, + "grad_norm": 0.23627576957414242, + "learning_rate": 9.207633902611663e-05, + "loss": 3.1445, + "step": 12734 + }, + { + "epoch": 0.7905518654168477, + "grad_norm": 0.26604655417899453, + "learning_rate": 9.20743879153801e-05, + "loss": 3.1269, + "step": 12735 + }, + { + "epoch": 0.7906139425166057, + "grad_norm": 0.18291003160528446, + "learning_rate": 9.207243658513127e-05, + "loss": 2.97, + "step": 12736 + }, + { + "epoch": 0.7906760196163636, + "grad_norm": 0.2932300673765145, + "learning_rate": 9.207048503538033e-05, + "loss": 3.0966, + "step": 12737 + }, + { + "epoch": 0.7907380967161214, + "grad_norm": 0.18749730783755034, + "learning_rate": 9.206853326613745e-05, + "loss": 3.1854, + "step": 12738 + }, + { + "epoch": 0.7908001738158793, + "grad_norm": 0.19282319645138968, + "learning_rate": 9.206658127741284e-05, + "loss": 3.0183, + "step": 12739 + }, + { + "epoch": 0.7908622509156372, + "grad_norm": 0.21627275157051187, + "learning_rate": 9.206462906921663e-05, + "loss": 2.9947, + "step": 12740 + }, + { + "epoch": 0.7909243280153951, + "grad_norm": 0.19289177892315834, + "learning_rate": 9.206267664155907e-05, + "loss": 3.1092, + "step": 12741 + }, + { + "epoch": 0.7909864051151531, + "grad_norm": 0.1838295518014822, + "learning_rate": 9.20607239944503e-05, + "loss": 2.9972, + "step": 12742 + }, + { + "epoch": 0.7910484822149109, + "grad_norm": 0.17137860553372408, + "learning_rate": 9.205877112790051e-05, + "loss": 3.1144, + "step": 12743 + }, + { + "epoch": 0.7911105593146688, + "grad_norm": 0.2170007022454066, + "learning_rate": 9.205681804191992e-05, + "loss": 3.0134, + "step": 12744 + }, + { + "epoch": 0.7911726364144267, + "grad_norm": 0.1733343096642171, + "learning_rate": 9.20548647365187e-05, + "loss": 3.0209, + "step": 12745 + }, + { + "epoch": 0.7912347135141846, + "grad_norm": 0.16673367551475085, + "learning_rate": 9.205291121170703e-05, + "loss": 3.0447, + "step": 12746 + }, + { + "epoch": 0.7912967906139425, + "grad_norm": 0.1639567670258575, + "learning_rate": 9.205095746749513e-05, + "loss": 3.1491, + "step": 12747 + }, + { + "epoch": 0.7913588677137005, + "grad_norm": 0.21233790505387953, + "learning_rate": 9.204900350389317e-05, + "loss": 3.0415, + "step": 12748 + }, + { + "epoch": 0.7914209448134583, + "grad_norm": 0.19887203076529938, + "learning_rate": 9.204704932091135e-05, + "loss": 3.1246, + "step": 12749 + }, + { + "epoch": 0.7914830219132162, + "grad_norm": 0.20850440194150904, + "learning_rate": 9.204509491855986e-05, + "loss": 3.0612, + "step": 12750 + }, + { + "epoch": 0.7915450990129741, + "grad_norm": 0.22311145695956477, + "learning_rate": 9.204314029684891e-05, + "loss": 3.075, + "step": 12751 + }, + { + "epoch": 0.791607176112732, + "grad_norm": 0.16109046379533734, + "learning_rate": 9.204118545578869e-05, + "loss": 3.0397, + "step": 12752 + }, + { + "epoch": 0.7916692532124899, + "grad_norm": 0.1631951223578024, + "learning_rate": 9.20392303953894e-05, + "loss": 2.9696, + "step": 12753 + }, + { + "epoch": 0.7917313303122478, + "grad_norm": 0.1705918586686908, + "learning_rate": 9.203727511566122e-05, + "loss": 3.0497, + "step": 12754 + }, + { + "epoch": 0.7917934074120057, + "grad_norm": 0.23543551835111473, + "learning_rate": 9.20353196166144e-05, + "loss": 3.0237, + "step": 12755 + }, + { + "epoch": 0.7918554845117636, + "grad_norm": 0.21771324078414736, + "learning_rate": 9.203336389825909e-05, + "loss": 3.0957, + "step": 12756 + }, + { + "epoch": 0.7919175616115215, + "grad_norm": 0.19160522123026327, + "learning_rate": 9.203140796060554e-05, + "loss": 3.0477, + "step": 12757 + }, + { + "epoch": 0.7919796387112794, + "grad_norm": 0.19876819037020496, + "learning_rate": 9.202945180366391e-05, + "loss": 3.0343, + "step": 12758 + }, + { + "epoch": 0.7920417158110373, + "grad_norm": 0.18019092568532716, + "learning_rate": 9.202749542744445e-05, + "loss": 3.0165, + "step": 12759 + }, + { + "epoch": 0.7921037929107952, + "grad_norm": 0.18541380477332509, + "learning_rate": 9.202553883195733e-05, + "loss": 3.0133, + "step": 12760 + }, + { + "epoch": 0.792165870010553, + "grad_norm": 0.17729934438927403, + "learning_rate": 9.202358201721277e-05, + "loss": 2.9967, + "step": 12761 + }, + { + "epoch": 0.792227947110311, + "grad_norm": 0.1903327499540489, + "learning_rate": 9.202162498322098e-05, + "loss": 2.9881, + "step": 12762 + }, + { + "epoch": 0.7922900242100689, + "grad_norm": 0.18121661668032585, + "learning_rate": 9.201966772999218e-05, + "loss": 3.0641, + "step": 12763 + }, + { + "epoch": 0.7923521013098268, + "grad_norm": 0.19808671050314886, + "learning_rate": 9.201771025753657e-05, + "loss": 3.1069, + "step": 12764 + }, + { + "epoch": 0.7924141784095847, + "grad_norm": 0.26587240534127476, + "learning_rate": 9.201575256586435e-05, + "loss": 3.0728, + "step": 12765 + }, + { + "epoch": 0.7924762555093426, + "grad_norm": 0.181813701085398, + "learning_rate": 9.201379465498576e-05, + "loss": 2.9961, + "step": 12766 + }, + { + "epoch": 0.7925383326091004, + "grad_norm": 0.17077787779002923, + "learning_rate": 9.201183652491101e-05, + "loss": 3.0499, + "step": 12767 + }, + { + "epoch": 0.7926004097088584, + "grad_norm": 0.25093649755921377, + "learning_rate": 9.200987817565031e-05, + "loss": 3.0777, + "step": 12768 + }, + { + "epoch": 0.7926624868086163, + "grad_norm": 0.22302884999411407, + "learning_rate": 9.200791960721388e-05, + "loss": 3.0602, + "step": 12769 + }, + { + "epoch": 0.7927245639083742, + "grad_norm": 0.16940859299605004, + "learning_rate": 9.200596081961193e-05, + "loss": 3.0866, + "step": 12770 + }, + { + "epoch": 0.7927866410081321, + "grad_norm": 0.22517210914462524, + "learning_rate": 9.200400181285468e-05, + "loss": 3.0268, + "step": 12771 + }, + { + "epoch": 0.79284871810789, + "grad_norm": 0.25431350911433126, + "learning_rate": 9.200204258695235e-05, + "loss": 3.0606, + "step": 12772 + }, + { + "epoch": 0.7929107952076478, + "grad_norm": 0.19735382487780456, + "learning_rate": 9.20000831419152e-05, + "loss": 3.135, + "step": 12773 + }, + { + "epoch": 0.7929728723074058, + "grad_norm": 0.21943676805768, + "learning_rate": 9.199812347775338e-05, + "loss": 3.0404, + "step": 12774 + }, + { + "epoch": 0.7930349494071637, + "grad_norm": 0.27132074653045735, + "learning_rate": 9.199616359447718e-05, + "loss": 3.0489, + "step": 12775 + }, + { + "epoch": 0.7930970265069216, + "grad_norm": 0.2175490844494907, + "learning_rate": 9.199420349209679e-05, + "loss": 3.068, + "step": 12776 + }, + { + "epoch": 0.7931591036066795, + "grad_norm": 0.24270437828137736, + "learning_rate": 9.199224317062246e-05, + "loss": 3.0156, + "step": 12777 + }, + { + "epoch": 0.7932211807064374, + "grad_norm": 0.22729583150023996, + "learning_rate": 9.199028263006438e-05, + "loss": 3.0584, + "step": 12778 + }, + { + "epoch": 0.7932832578061952, + "grad_norm": 0.1978455225332318, + "learning_rate": 9.19883218704328e-05, + "loss": 3.003, + "step": 12779 + }, + { + "epoch": 0.7933453349059532, + "grad_norm": 0.22258326522177946, + "learning_rate": 9.198636089173797e-05, + "loss": 3.0485, + "step": 12780 + }, + { + "epoch": 0.7934074120057111, + "grad_norm": 0.2005760015833055, + "learning_rate": 9.19843996939901e-05, + "loss": 3.0408, + "step": 12781 + }, + { + "epoch": 0.793469489105469, + "grad_norm": 0.195044333419003, + "learning_rate": 9.198243827719942e-05, + "loss": 2.9934, + "step": 12782 + }, + { + "epoch": 0.7935315662052269, + "grad_norm": 0.20471545348863746, + "learning_rate": 9.198047664137617e-05, + "loss": 3.0331, + "step": 12783 + }, + { + "epoch": 0.7935936433049848, + "grad_norm": 0.19545717603050408, + "learning_rate": 9.197851478653059e-05, + "loss": 3.0082, + "step": 12784 + }, + { + "epoch": 0.7936557204047426, + "grad_norm": 0.19410852324191352, + "learning_rate": 9.197655271267289e-05, + "loss": 3.0407, + "step": 12785 + }, + { + "epoch": 0.7937177975045006, + "grad_norm": 0.19424000965114652, + "learning_rate": 9.197459041981334e-05, + "loss": 3.1223, + "step": 12786 + }, + { + "epoch": 0.7937798746042585, + "grad_norm": 0.19103427671084353, + "learning_rate": 9.197262790796216e-05, + "loss": 3.0246, + "step": 12787 + }, + { + "epoch": 0.7938419517040164, + "grad_norm": 0.2354734825185214, + "learning_rate": 9.197066517712958e-05, + "loss": 2.9803, + "step": 12788 + }, + { + "epoch": 0.7939040288037743, + "grad_norm": 0.18321941644910753, + "learning_rate": 9.196870222732586e-05, + "loss": 3.0782, + "step": 12789 + }, + { + "epoch": 0.7939661059035322, + "grad_norm": 0.23420321208777398, + "learning_rate": 9.196673905856124e-05, + "loss": 3.0732, + "step": 12790 + }, + { + "epoch": 0.79402818300329, + "grad_norm": 0.17301475235367475, + "learning_rate": 9.196477567084594e-05, + "loss": 3.0561, + "step": 12791 + }, + { + "epoch": 0.794090260103048, + "grad_norm": 0.19223663663367024, + "learning_rate": 9.196281206419022e-05, + "loss": 3.0212, + "step": 12792 + }, + { + "epoch": 0.7941523372028059, + "grad_norm": 0.1958106174043659, + "learning_rate": 9.196084823860434e-05, + "loss": 3.0954, + "step": 12793 + }, + { + "epoch": 0.7942144143025638, + "grad_norm": 0.16952857178009656, + "learning_rate": 9.19588841940985e-05, + "loss": 3.077, + "step": 12794 + }, + { + "epoch": 0.7942764914023217, + "grad_norm": 0.1885251096354349, + "learning_rate": 9.1956919930683e-05, + "loss": 3.0815, + "step": 12795 + }, + { + "epoch": 0.7943385685020796, + "grad_norm": 0.1782463058039383, + "learning_rate": 9.195495544836805e-05, + "loss": 3.0136, + "step": 12796 + }, + { + "epoch": 0.7944006456018374, + "grad_norm": 0.2060571462475491, + "learning_rate": 9.195299074716392e-05, + "loss": 2.9155, + "step": 12797 + }, + { + "epoch": 0.7944627227015953, + "grad_norm": 0.19692929556114463, + "learning_rate": 9.195102582708085e-05, + "loss": 3.0575, + "step": 12798 + }, + { + "epoch": 0.7945247998013533, + "grad_norm": 0.1993841998898513, + "learning_rate": 9.194906068812908e-05, + "loss": 3.0672, + "step": 12799 + }, + { + "epoch": 0.7945868769011112, + "grad_norm": 0.24483276674847995, + "learning_rate": 9.19470953303189e-05, + "loss": 3.0691, + "step": 12800 + }, + { + "epoch": 0.7946489540008691, + "grad_norm": 0.2064947879228168, + "learning_rate": 9.194512975366052e-05, + "loss": 3.1321, + "step": 12801 + }, + { + "epoch": 0.794711031100627, + "grad_norm": 0.1738563418257057, + "learning_rate": 9.194316395816424e-05, + "loss": 3.0974, + "step": 12802 + }, + { + "epoch": 0.7947731082003848, + "grad_norm": 0.284147739253162, + "learning_rate": 9.194119794384027e-05, + "loss": 3.0291, + "step": 12803 + }, + { + "epoch": 0.7948351853001427, + "grad_norm": 0.1942003150685708, + "learning_rate": 9.193923171069889e-05, + "loss": 3.0017, + "step": 12804 + }, + { + "epoch": 0.7948972623999007, + "grad_norm": 0.18571030295962762, + "learning_rate": 9.193726525875037e-05, + "loss": 2.9895, + "step": 12805 + }, + { + "epoch": 0.7949593394996586, + "grad_norm": 0.19603864153805894, + "learning_rate": 9.193529858800495e-05, + "loss": 3.1103, + "step": 12806 + }, + { + "epoch": 0.7950214165994165, + "grad_norm": 0.20020783548031876, + "learning_rate": 9.193333169847289e-05, + "loss": 3.1195, + "step": 12807 + }, + { + "epoch": 0.7950834936991744, + "grad_norm": 0.18937626488895815, + "learning_rate": 9.193136459016447e-05, + "loss": 3.1143, + "step": 12808 + }, + { + "epoch": 0.7951455707989322, + "grad_norm": 0.21837580555907546, + "learning_rate": 9.192939726308993e-05, + "loss": 3.0255, + "step": 12809 + }, + { + "epoch": 0.7952076478986901, + "grad_norm": 0.19585551117721375, + "learning_rate": 9.192742971725957e-05, + "loss": 2.9922, + "step": 12810 + }, + { + "epoch": 0.7952697249984481, + "grad_norm": 0.18399867591433328, + "learning_rate": 9.192546195268361e-05, + "loss": 3.0599, + "step": 12811 + }, + { + "epoch": 0.795331802098206, + "grad_norm": 0.19637765356052084, + "learning_rate": 9.192349396937234e-05, + "loss": 3.0248, + "step": 12812 + }, + { + "epoch": 0.7953938791979639, + "grad_norm": 0.17042576409597118, + "learning_rate": 9.192152576733602e-05, + "loss": 2.951, + "step": 12813 + }, + { + "epoch": 0.7954559562977218, + "grad_norm": 0.20167741863268243, + "learning_rate": 9.191955734658493e-05, + "loss": 3.0618, + "step": 12814 + }, + { + "epoch": 0.7955180333974796, + "grad_norm": 0.33875925945281665, + "learning_rate": 9.191758870712933e-05, + "loss": 3.1906, + "step": 12815 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.21657968931800758, + "learning_rate": 9.19156198489795e-05, + "loss": 3.0666, + "step": 12816 + }, + { + "epoch": 0.7956421875969955, + "grad_norm": 0.22263436460852518, + "learning_rate": 9.191365077214571e-05, + "loss": 3.0302, + "step": 12817 + }, + { + "epoch": 0.7957042646967534, + "grad_norm": 0.2461306377799749, + "learning_rate": 9.191168147663822e-05, + "loss": 3.0393, + "step": 12818 + }, + { + "epoch": 0.7957663417965113, + "grad_norm": 0.24564641834864787, + "learning_rate": 9.190971196246731e-05, + "loss": 2.9843, + "step": 12819 + }, + { + "epoch": 0.7958284188962692, + "grad_norm": 0.22193562022019783, + "learning_rate": 9.190774222964327e-05, + "loss": 3.1631, + "step": 12820 + }, + { + "epoch": 0.795890495996027, + "grad_norm": 0.30445329516853664, + "learning_rate": 9.190577227817635e-05, + "loss": 3.0089, + "step": 12821 + }, + { + "epoch": 0.7959525730957849, + "grad_norm": 0.353658278540773, + "learning_rate": 9.190380210807685e-05, + "loss": 3.0814, + "step": 12822 + }, + { + "epoch": 0.7960146501955428, + "grad_norm": 0.21732909369124243, + "learning_rate": 9.190183171935505e-05, + "loss": 3.1246, + "step": 12823 + }, + { + "epoch": 0.7960767272953008, + "grad_norm": 0.3193399271584561, + "learning_rate": 9.189986111202122e-05, + "loss": 3.0809, + "step": 12824 + }, + { + "epoch": 0.7961388043950587, + "grad_norm": 0.3250601547763911, + "learning_rate": 9.189789028608563e-05, + "loss": 3.1081, + "step": 12825 + }, + { + "epoch": 0.7962008814948166, + "grad_norm": 0.33526525214331365, + "learning_rate": 9.189591924155858e-05, + "loss": 3.1246, + "step": 12826 + }, + { + "epoch": 0.7962629585945744, + "grad_norm": 0.2502024248290016, + "learning_rate": 9.189394797845036e-05, + "loss": 3.0338, + "step": 12827 + }, + { + "epoch": 0.7963250356943323, + "grad_norm": 0.20963756587820329, + "learning_rate": 9.189197649677124e-05, + "loss": 3.0396, + "step": 12828 + }, + { + "epoch": 0.7963871127940902, + "grad_norm": 0.2816302722186822, + "learning_rate": 9.189000479653149e-05, + "loss": 3.078, + "step": 12829 + }, + { + "epoch": 0.7964491898938482, + "grad_norm": 0.21150522303332056, + "learning_rate": 9.188803287774143e-05, + "loss": 3.0459, + "step": 12830 + }, + { + "epoch": 0.7965112669936061, + "grad_norm": 0.20921807279754961, + "learning_rate": 9.188606074041134e-05, + "loss": 3.0961, + "step": 12831 + }, + { + "epoch": 0.796573344093364, + "grad_norm": 0.23662274494295618, + "learning_rate": 9.18840883845515e-05, + "loss": 3.0609, + "step": 12832 + }, + { + "epoch": 0.7966354211931218, + "grad_norm": 0.20555256051668144, + "learning_rate": 9.18821158101722e-05, + "loss": 3.0297, + "step": 12833 + }, + { + "epoch": 0.7966974982928797, + "grad_norm": 0.19085933438588357, + "learning_rate": 9.188014301728373e-05, + "loss": 2.9514, + "step": 12834 + }, + { + "epoch": 0.7967595753926376, + "grad_norm": 0.28323970426902667, + "learning_rate": 9.187817000589641e-05, + "loss": 3.1054, + "step": 12835 + }, + { + "epoch": 0.7968216524923956, + "grad_norm": 0.25782424726599185, + "learning_rate": 9.187619677602049e-05, + "loss": 3.0236, + "step": 12836 + }, + { + "epoch": 0.7968837295921535, + "grad_norm": 0.1909341439422002, + "learning_rate": 9.187422332766628e-05, + "loss": 3.0985, + "step": 12837 + }, + { + "epoch": 0.7969458066919114, + "grad_norm": 0.22010149861758874, + "learning_rate": 9.187224966084409e-05, + "loss": 2.9606, + "step": 12838 + }, + { + "epoch": 0.7970078837916692, + "grad_norm": 0.21342239984126204, + "learning_rate": 9.187027577556421e-05, + "loss": 3.1162, + "step": 12839 + }, + { + "epoch": 0.7970699608914271, + "grad_norm": 0.22335876931758497, + "learning_rate": 9.186830167183693e-05, + "loss": 3.0066, + "step": 12840 + }, + { + "epoch": 0.797132037991185, + "grad_norm": 0.30966923814776237, + "learning_rate": 9.186632734967256e-05, + "loss": 3.0282, + "step": 12841 + }, + { + "epoch": 0.797194115090943, + "grad_norm": 0.20493379691286784, + "learning_rate": 9.18643528090814e-05, + "loss": 3.0989, + "step": 12842 + }, + { + "epoch": 0.7972561921907009, + "grad_norm": 0.34316611552614923, + "learning_rate": 9.186237805007374e-05, + "loss": 3.0953, + "step": 12843 + }, + { + "epoch": 0.7973182692904588, + "grad_norm": 0.2184025184442476, + "learning_rate": 9.18604030726599e-05, + "loss": 3.063, + "step": 12844 + }, + { + "epoch": 0.7973803463902166, + "grad_norm": 0.24068442112586133, + "learning_rate": 9.185842787685019e-05, + "loss": 3.1375, + "step": 12845 + }, + { + "epoch": 0.7974424234899745, + "grad_norm": 0.2028084544261781, + "learning_rate": 9.185645246265488e-05, + "loss": 3.0377, + "step": 12846 + }, + { + "epoch": 0.7975045005897324, + "grad_norm": 0.29182268409009404, + "learning_rate": 9.18544768300843e-05, + "loss": 2.9652, + "step": 12847 + }, + { + "epoch": 0.7975665776894904, + "grad_norm": 0.1957214746609903, + "learning_rate": 9.185250097914875e-05, + "loss": 3.0397, + "step": 12848 + }, + { + "epoch": 0.7976286547892483, + "grad_norm": 0.20485864429461398, + "learning_rate": 9.185052490985855e-05, + "loss": 3.0543, + "step": 12849 + }, + { + "epoch": 0.7976907318890062, + "grad_norm": 0.19161859722414423, + "learning_rate": 9.184854862222399e-05, + "loss": 3.0834, + "step": 12850 + }, + { + "epoch": 0.797752808988764, + "grad_norm": 0.17810617062948775, + "learning_rate": 9.18465721162554e-05, + "loss": 3.0619, + "step": 12851 + }, + { + "epoch": 0.7978148860885219, + "grad_norm": 0.2363502949175761, + "learning_rate": 9.184459539196309e-05, + "loss": 3.1428, + "step": 12852 + }, + { + "epoch": 0.7978769631882798, + "grad_norm": 0.18499523275908766, + "learning_rate": 9.184261844935738e-05, + "loss": 3.0136, + "step": 12853 + }, + { + "epoch": 0.7979390402880377, + "grad_norm": 0.2808256259001117, + "learning_rate": 9.184064128844853e-05, + "loss": 3.026, + "step": 12854 + }, + { + "epoch": 0.7980011173877957, + "grad_norm": 0.1901109718167277, + "learning_rate": 9.183866390924694e-05, + "loss": 3.0179, + "step": 12855 + }, + { + "epoch": 0.7980631944875536, + "grad_norm": 0.22190128291632943, + "learning_rate": 9.183668631176287e-05, + "loss": 3.1011, + "step": 12856 + }, + { + "epoch": 0.7981252715873114, + "grad_norm": 0.19301818839839668, + "learning_rate": 9.183470849600664e-05, + "loss": 2.9182, + "step": 12857 + }, + { + "epoch": 0.7981873486870693, + "grad_norm": 0.2573877216426156, + "learning_rate": 9.18327304619886e-05, + "loss": 2.9709, + "step": 12858 + }, + { + "epoch": 0.7982494257868272, + "grad_norm": 0.2159990459515995, + "learning_rate": 9.183075220971903e-05, + "loss": 3.0538, + "step": 12859 + }, + { + "epoch": 0.7983115028865851, + "grad_norm": 0.2485827492222609, + "learning_rate": 9.182877373920827e-05, + "loss": 2.9573, + "step": 12860 + }, + { + "epoch": 0.7983735799863431, + "grad_norm": 0.2029990943092193, + "learning_rate": 9.182679505046665e-05, + "loss": 3.1267, + "step": 12861 + }, + { + "epoch": 0.798435657086101, + "grad_norm": 0.19032539054673198, + "learning_rate": 9.182481614350448e-05, + "loss": 3.0302, + "step": 12862 + }, + { + "epoch": 0.7984977341858588, + "grad_norm": 0.17803008244302193, + "learning_rate": 9.182283701833212e-05, + "loss": 3.2019, + "step": 12863 + }, + { + "epoch": 0.7985598112856167, + "grad_norm": 0.18114745963779452, + "learning_rate": 9.182085767495985e-05, + "loss": 3.0463, + "step": 12864 + }, + { + "epoch": 0.7986218883853746, + "grad_norm": 0.16181584290778772, + "learning_rate": 9.1818878113398e-05, + "loss": 3.0115, + "step": 12865 + }, + { + "epoch": 0.7986839654851325, + "grad_norm": 0.2826472648006449, + "learning_rate": 9.181689833365692e-05, + "loss": 3.0879, + "step": 12866 + }, + { + "epoch": 0.7987460425848905, + "grad_norm": 0.18942096279899792, + "learning_rate": 9.181491833574694e-05, + "loss": 2.9754, + "step": 12867 + }, + { + "epoch": 0.7988081196846484, + "grad_norm": 0.24609702728753496, + "learning_rate": 9.181293811967836e-05, + "loss": 2.9482, + "step": 12868 + }, + { + "epoch": 0.7988701967844062, + "grad_norm": 0.21018557397117063, + "learning_rate": 9.181095768546154e-05, + "loss": 3.0864, + "step": 12869 + }, + { + "epoch": 0.7989322738841641, + "grad_norm": 0.3283997475353042, + "learning_rate": 9.18089770331068e-05, + "loss": 3.1195, + "step": 12870 + }, + { + "epoch": 0.798994350983922, + "grad_norm": 0.24017506929620733, + "learning_rate": 9.18069961626245e-05, + "loss": 2.9817, + "step": 12871 + }, + { + "epoch": 0.7990564280836799, + "grad_norm": 0.2469541555574438, + "learning_rate": 9.180501507402492e-05, + "loss": 3.1266, + "step": 12872 + }, + { + "epoch": 0.7991185051834379, + "grad_norm": 0.187155944348804, + "learning_rate": 9.180303376731845e-05, + "loss": 2.9604, + "step": 12873 + }, + { + "epoch": 0.7991805822831958, + "grad_norm": 0.2284730317287604, + "learning_rate": 9.180105224251539e-05, + "loss": 2.9467, + "step": 12874 + }, + { + "epoch": 0.7992426593829536, + "grad_norm": 0.22254467561193822, + "learning_rate": 9.179907049962611e-05, + "loss": 3.0987, + "step": 12875 + }, + { + "epoch": 0.7993047364827115, + "grad_norm": 0.20240441300378817, + "learning_rate": 9.179708853866092e-05, + "loss": 3.0441, + "step": 12876 + }, + { + "epoch": 0.7993668135824694, + "grad_norm": 0.2207021375650918, + "learning_rate": 9.179510635963017e-05, + "loss": 3.128, + "step": 12877 + }, + { + "epoch": 0.7994288906822273, + "grad_norm": 0.19421458648797796, + "learning_rate": 9.17931239625442e-05, + "loss": 3.0422, + "step": 12878 + }, + { + "epoch": 0.7994909677819853, + "grad_norm": 0.21133040992784877, + "learning_rate": 9.179114134741338e-05, + "loss": 3.0291, + "step": 12879 + }, + { + "epoch": 0.7995530448817432, + "grad_norm": 0.17555470703370704, + "learning_rate": 9.1789158514248e-05, + "loss": 3.025, + "step": 12880 + }, + { + "epoch": 0.799615121981501, + "grad_norm": 0.2813293954258576, + "learning_rate": 9.178717546305846e-05, + "loss": 3.0218, + "step": 12881 + }, + { + "epoch": 0.7996771990812589, + "grad_norm": 0.18356426489231276, + "learning_rate": 9.178519219385507e-05, + "loss": 3.058, + "step": 12882 + }, + { + "epoch": 0.7997392761810168, + "grad_norm": 0.1846287851707508, + "learning_rate": 9.17832087066482e-05, + "loss": 3.0697, + "step": 12883 + }, + { + "epoch": 0.7998013532807747, + "grad_norm": 0.1865207911598257, + "learning_rate": 9.178122500144817e-05, + "loss": 3.0648, + "step": 12884 + }, + { + "epoch": 0.7998634303805326, + "grad_norm": 0.29343905528735986, + "learning_rate": 9.177924107826536e-05, + "loss": 3.0487, + "step": 12885 + }, + { + "epoch": 0.7999255074802906, + "grad_norm": 0.20116999541950842, + "learning_rate": 9.17772569371101e-05, + "loss": 3.0422, + "step": 12886 + }, + { + "epoch": 0.7999875845800484, + "grad_norm": 0.17252844982745755, + "learning_rate": 9.177527257799274e-05, + "loss": 2.9416, + "step": 12887 + }, + { + "epoch": 0.8000496616798063, + "grad_norm": 0.16986911327240828, + "learning_rate": 9.177328800092366e-05, + "loss": 2.9743, + "step": 12888 + }, + { + "epoch": 0.8001117387795642, + "grad_norm": 0.22720343267620524, + "learning_rate": 9.177130320591319e-05, + "loss": 3.0719, + "step": 12889 + }, + { + "epoch": 0.8001738158793221, + "grad_norm": 0.18356930793227255, + "learning_rate": 9.176931819297168e-05, + "loss": 3.0896, + "step": 12890 + }, + { + "epoch": 0.80023589297908, + "grad_norm": 0.1826444595290759, + "learning_rate": 9.176733296210951e-05, + "loss": 2.9839, + "step": 12891 + }, + { + "epoch": 0.800297970078838, + "grad_norm": 0.19171341752624127, + "learning_rate": 9.176534751333704e-05, + "loss": 3.0937, + "step": 12892 + }, + { + "epoch": 0.8003600471785958, + "grad_norm": 0.22010695696854046, + "learning_rate": 9.176336184666458e-05, + "loss": 3.0537, + "step": 12893 + }, + { + "epoch": 0.8004221242783537, + "grad_norm": 0.1933722347260306, + "learning_rate": 9.176137596210252e-05, + "loss": 3.0312, + "step": 12894 + }, + { + "epoch": 0.8004842013781116, + "grad_norm": 0.18849969632974242, + "learning_rate": 9.175938985966125e-05, + "loss": 3.0591, + "step": 12895 + }, + { + "epoch": 0.8005462784778695, + "grad_norm": 0.46485101069061185, + "learning_rate": 9.17574035393511e-05, + "loss": 3.1086, + "step": 12896 + }, + { + "epoch": 0.8006083555776274, + "grad_norm": 0.2206475854719506, + "learning_rate": 9.175541700118244e-05, + "loss": 3.0972, + "step": 12897 + }, + { + "epoch": 0.8006704326773854, + "grad_norm": 0.18673502320438926, + "learning_rate": 9.175343024516561e-05, + "loss": 3.0468, + "step": 12898 + }, + { + "epoch": 0.8007325097771432, + "grad_norm": 0.21528659609368034, + "learning_rate": 9.175144327131102e-05, + "loss": 2.9783, + "step": 12899 + }, + { + "epoch": 0.8007945868769011, + "grad_norm": 0.21773791344172677, + "learning_rate": 9.174945607962901e-05, + "loss": 3.1141, + "step": 12900 + }, + { + "epoch": 0.800856663976659, + "grad_norm": 0.19129710102075737, + "learning_rate": 9.174746867012995e-05, + "loss": 2.962, + "step": 12901 + }, + { + "epoch": 0.8009187410764169, + "grad_norm": 0.29184765252644124, + "learning_rate": 9.174548104282422e-05, + "loss": 3.1585, + "step": 12902 + }, + { + "epoch": 0.8009808181761748, + "grad_norm": 0.22928221816573027, + "learning_rate": 9.174349319772217e-05, + "loss": 3.0492, + "step": 12903 + }, + { + "epoch": 0.8010428952759328, + "grad_norm": 0.21888412042435276, + "learning_rate": 9.174150513483419e-05, + "loss": 3.073, + "step": 12904 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.34506353923086286, + "learning_rate": 9.173951685417065e-05, + "loss": 3.0802, + "step": 12905 + }, + { + "epoch": 0.8011670494754485, + "grad_norm": 0.18073102380272932, + "learning_rate": 9.173752835574191e-05, + "loss": 3.0022, + "step": 12906 + }, + { + "epoch": 0.8012291265752064, + "grad_norm": 0.2944488774471744, + "learning_rate": 9.173553963955836e-05, + "loss": 2.9948, + "step": 12907 + }, + { + "epoch": 0.8012912036749643, + "grad_norm": 0.26739657292092234, + "learning_rate": 9.173355070563036e-05, + "loss": 3.0333, + "step": 12908 + }, + { + "epoch": 0.8013532807747222, + "grad_norm": 0.24066923682333266, + "learning_rate": 9.173156155396829e-05, + "loss": 3.0398, + "step": 12909 + }, + { + "epoch": 0.8014153578744801, + "grad_norm": 0.26621784640643686, + "learning_rate": 9.172957218458254e-05, + "loss": 2.975, + "step": 12910 + }, + { + "epoch": 0.801477434974238, + "grad_norm": 0.19867136495797003, + "learning_rate": 9.172758259748349e-05, + "loss": 3.0464, + "step": 12911 + }, + { + "epoch": 0.8015395120739959, + "grad_norm": 0.24282835955234464, + "learning_rate": 9.172559279268149e-05, + "loss": 3.0753, + "step": 12912 + }, + { + "epoch": 0.8016015891737538, + "grad_norm": 0.2218041343750593, + "learning_rate": 9.172360277018696e-05, + "loss": 3.0456, + "step": 12913 + }, + { + "epoch": 0.8016636662735117, + "grad_norm": 0.2094230415256534, + "learning_rate": 9.172161253001026e-05, + "loss": 3.077, + "step": 12914 + }, + { + "epoch": 0.8017257433732696, + "grad_norm": 0.20959206936573221, + "learning_rate": 9.171962207216179e-05, + "loss": 3.0523, + "step": 12915 + }, + { + "epoch": 0.8017878204730275, + "grad_norm": 0.23652384668971607, + "learning_rate": 9.17176313966519e-05, + "loss": 3.086, + "step": 12916 + }, + { + "epoch": 0.8018498975727854, + "grad_norm": 0.3338137185067895, + "learning_rate": 9.171564050349102e-05, + "loss": 3.0067, + "step": 12917 + }, + { + "epoch": 0.8019119746725433, + "grad_norm": 0.20314851037929188, + "learning_rate": 9.171364939268952e-05, + "loss": 3.0941, + "step": 12918 + }, + { + "epoch": 0.8019740517723012, + "grad_norm": 0.2107246525976006, + "learning_rate": 9.171165806425777e-05, + "loss": 3.0013, + "step": 12919 + }, + { + "epoch": 0.8020361288720591, + "grad_norm": 0.2382684375882868, + "learning_rate": 9.170966651820617e-05, + "loss": 2.925, + "step": 12920 + }, + { + "epoch": 0.802098205971817, + "grad_norm": 0.17955102091225197, + "learning_rate": 9.170767475454512e-05, + "loss": 3.1173, + "step": 12921 + }, + { + "epoch": 0.8021602830715749, + "grad_norm": 0.18365159036704345, + "learning_rate": 9.170568277328502e-05, + "loss": 3.1044, + "step": 12922 + }, + { + "epoch": 0.8022223601713327, + "grad_norm": 0.1790612544603951, + "learning_rate": 9.170369057443624e-05, + "loss": 3.0823, + "step": 12923 + }, + { + "epoch": 0.8022844372710907, + "grad_norm": 0.23513699481237185, + "learning_rate": 9.170169815800917e-05, + "loss": 3.1263, + "step": 12924 + }, + { + "epoch": 0.8023465143708486, + "grad_norm": 0.3356119064000443, + "learning_rate": 9.169970552401423e-05, + "loss": 3.0045, + "step": 12925 + }, + { + "epoch": 0.8024085914706065, + "grad_norm": 0.22593195086638543, + "learning_rate": 9.169771267246182e-05, + "loss": 3.0274, + "step": 12926 + }, + { + "epoch": 0.8024706685703644, + "grad_norm": 0.1912121469773099, + "learning_rate": 9.169571960336229e-05, + "loss": 3.0893, + "step": 12927 + }, + { + "epoch": 0.8025327456701223, + "grad_norm": 0.2408591187358511, + "learning_rate": 9.169372631672609e-05, + "loss": 3.025, + "step": 12928 + }, + { + "epoch": 0.8025948227698801, + "grad_norm": 0.2198854084170004, + "learning_rate": 9.169173281256357e-05, + "loss": 3.1528, + "step": 12929 + }, + { + "epoch": 0.8026568998696381, + "grad_norm": 0.2008175739130538, + "learning_rate": 9.168973909088518e-05, + "loss": 3.0717, + "step": 12930 + }, + { + "epoch": 0.802718976969396, + "grad_norm": 0.21110885335962118, + "learning_rate": 9.168774515170129e-05, + "loss": 3.0821, + "step": 12931 + }, + { + "epoch": 0.8027810540691539, + "grad_norm": 0.24280243145749997, + "learning_rate": 9.168575099502233e-05, + "loss": 2.9988, + "step": 12932 + }, + { + "epoch": 0.8028431311689118, + "grad_norm": 0.2578104770465437, + "learning_rate": 9.168375662085868e-05, + "loss": 3.1128, + "step": 12933 + }, + { + "epoch": 0.8029052082686697, + "grad_norm": 0.2044540172902899, + "learning_rate": 9.168176202922075e-05, + "loss": 3.0847, + "step": 12934 + }, + { + "epoch": 0.8029672853684275, + "grad_norm": 0.17287770844039038, + "learning_rate": 9.167976722011895e-05, + "loss": 3.0041, + "step": 12935 + }, + { + "epoch": 0.8030293624681855, + "grad_norm": 0.2690080041891063, + "learning_rate": 9.16777721935637e-05, + "loss": 3.0445, + "step": 12936 + }, + { + "epoch": 0.8030914395679434, + "grad_norm": 0.2268888819207043, + "learning_rate": 9.167577694956537e-05, + "loss": 3.0314, + "step": 12937 + }, + { + "epoch": 0.8031535166677013, + "grad_norm": 0.19217721244756258, + "learning_rate": 9.167378148813442e-05, + "loss": 3.0864, + "step": 12938 + }, + { + "epoch": 0.8032155937674592, + "grad_norm": 0.21496256230672775, + "learning_rate": 9.167178580928122e-05, + "loss": 3.024, + "step": 12939 + }, + { + "epoch": 0.8032776708672171, + "grad_norm": 0.27464931897523936, + "learning_rate": 9.166978991301619e-05, + "loss": 3.0689, + "step": 12940 + }, + { + "epoch": 0.8033397479669749, + "grad_norm": 0.29632910933709483, + "learning_rate": 9.166779379934975e-05, + "loss": 2.9867, + "step": 12941 + }, + { + "epoch": 0.8034018250667329, + "grad_norm": 0.21227205160757764, + "learning_rate": 9.166579746829234e-05, + "loss": 2.99, + "step": 12942 + }, + { + "epoch": 0.8034639021664908, + "grad_norm": 0.30815550916176704, + "learning_rate": 9.166380091985433e-05, + "loss": 3.0104, + "step": 12943 + }, + { + "epoch": 0.8035259792662487, + "grad_norm": 0.22379942272227307, + "learning_rate": 9.166180415404616e-05, + "loss": 2.9343, + "step": 12944 + }, + { + "epoch": 0.8035880563660066, + "grad_norm": 0.2669400452358736, + "learning_rate": 9.165980717087824e-05, + "loss": 3.0543, + "step": 12945 + }, + { + "epoch": 0.8036501334657645, + "grad_norm": 0.23800453941339755, + "learning_rate": 9.1657809970361e-05, + "loss": 2.9944, + "step": 12946 + }, + { + "epoch": 0.8037122105655223, + "grad_norm": 0.3358357519381369, + "learning_rate": 9.165581255250484e-05, + "loss": 3.0548, + "step": 12947 + }, + { + "epoch": 0.8037742876652803, + "grad_norm": 0.26434700328404714, + "learning_rate": 9.165381491732021e-05, + "loss": 3.0409, + "step": 12948 + }, + { + "epoch": 0.8038363647650382, + "grad_norm": 0.2872467879944941, + "learning_rate": 9.165181706481752e-05, + "loss": 3.0816, + "step": 12949 + }, + { + "epoch": 0.8038984418647961, + "grad_norm": 0.2218614077098475, + "learning_rate": 9.164981899500718e-05, + "loss": 3.0282, + "step": 12950 + }, + { + "epoch": 0.803960518964554, + "grad_norm": 0.28348611307528593, + "learning_rate": 9.164782070789962e-05, + "loss": 3.0233, + "step": 12951 + }, + { + "epoch": 0.8040225960643119, + "grad_norm": 0.34200640114571607, + "learning_rate": 9.164582220350527e-05, + "loss": 3.0288, + "step": 12952 + }, + { + "epoch": 0.8040846731640697, + "grad_norm": 0.24168202502814412, + "learning_rate": 9.164382348183455e-05, + "loss": 3.0113, + "step": 12953 + }, + { + "epoch": 0.8041467502638276, + "grad_norm": 0.264158717802013, + "learning_rate": 9.164182454289789e-05, + "loss": 3.0694, + "step": 12954 + }, + { + "epoch": 0.8042088273635856, + "grad_norm": 0.19098023279805965, + "learning_rate": 9.163982538670574e-05, + "loss": 3.0145, + "step": 12955 + }, + { + "epoch": 0.8042709044633435, + "grad_norm": 0.22574738509225709, + "learning_rate": 9.16378260132685e-05, + "loss": 3.0188, + "step": 12956 + }, + { + "epoch": 0.8043329815631014, + "grad_norm": 0.21972961423662002, + "learning_rate": 9.163582642259663e-05, + "loss": 3.0497, + "step": 12957 + }, + { + "epoch": 0.8043950586628593, + "grad_norm": 0.1929951039746994, + "learning_rate": 9.163382661470053e-05, + "loss": 2.9699, + "step": 12958 + }, + { + "epoch": 0.8044571357626171, + "grad_norm": 0.21011026084873724, + "learning_rate": 9.163182658959064e-05, + "loss": 2.9971, + "step": 12959 + }, + { + "epoch": 0.804519212862375, + "grad_norm": 0.19557134642057417, + "learning_rate": 9.162982634727743e-05, + "loss": 2.997, + "step": 12960 + }, + { + "epoch": 0.804581289962133, + "grad_norm": 0.21550320073956983, + "learning_rate": 9.162782588777131e-05, + "loss": 3.0375, + "step": 12961 + }, + { + "epoch": 0.8046433670618909, + "grad_norm": 0.21132863832647625, + "learning_rate": 9.16258252110827e-05, + "loss": 3.0662, + "step": 12962 + }, + { + "epoch": 0.8047054441616488, + "grad_norm": 0.19295904351351345, + "learning_rate": 9.162382431722205e-05, + "loss": 3.1244, + "step": 12963 + }, + { + "epoch": 0.8047675212614067, + "grad_norm": 0.19721514934202025, + "learning_rate": 9.16218232061998e-05, + "loss": 3.041, + "step": 12964 + }, + { + "epoch": 0.8048295983611645, + "grad_norm": 0.18205114345966736, + "learning_rate": 9.161982187802643e-05, + "loss": 3.0424, + "step": 12965 + }, + { + "epoch": 0.8048916754609224, + "grad_norm": 0.2897711135866925, + "learning_rate": 9.161782033271231e-05, + "loss": 3.0666, + "step": 12966 + }, + { + "epoch": 0.8049537525606804, + "grad_norm": 0.19261436276210647, + "learning_rate": 9.161581857026793e-05, + "loss": 3.0512, + "step": 12967 + }, + { + "epoch": 0.8050158296604383, + "grad_norm": 0.1934225191222246, + "learning_rate": 9.161381659070373e-05, + "loss": 3.0936, + "step": 12968 + }, + { + "epoch": 0.8050779067601962, + "grad_norm": 0.19827066414293315, + "learning_rate": 9.161181439403015e-05, + "loss": 3.0225, + "step": 12969 + }, + { + "epoch": 0.8051399838599541, + "grad_norm": 0.18436897235651212, + "learning_rate": 9.160981198025762e-05, + "loss": 3.0219, + "step": 12970 + }, + { + "epoch": 0.8052020609597119, + "grad_norm": 0.16311462515508424, + "learning_rate": 9.16078093493966e-05, + "loss": 3.0121, + "step": 12971 + }, + { + "epoch": 0.8052641380594698, + "grad_norm": 0.16574187428635656, + "learning_rate": 9.160580650145754e-05, + "loss": 3.0526, + "step": 12972 + }, + { + "epoch": 0.8053262151592278, + "grad_norm": 0.2000591652498999, + "learning_rate": 9.160380343645088e-05, + "loss": 2.9843, + "step": 12973 + }, + { + "epoch": 0.8053882922589857, + "grad_norm": 0.1565231677196549, + "learning_rate": 9.160180015438709e-05, + "loss": 3.0408, + "step": 12974 + }, + { + "epoch": 0.8054503693587436, + "grad_norm": 0.1639663494011441, + "learning_rate": 9.15997966552766e-05, + "loss": 2.9961, + "step": 12975 + }, + { + "epoch": 0.8055124464585015, + "grad_norm": 0.17335077707974977, + "learning_rate": 9.15977929391299e-05, + "loss": 3.0794, + "step": 12976 + }, + { + "epoch": 0.8055745235582593, + "grad_norm": 0.15391884742534223, + "learning_rate": 9.159578900595738e-05, + "loss": 3.0461, + "step": 12977 + }, + { + "epoch": 0.8056366006580172, + "grad_norm": 0.18184859152912627, + "learning_rate": 9.159378485576954e-05, + "loss": 2.9788, + "step": 12978 + }, + { + "epoch": 0.8056986777577752, + "grad_norm": 0.1571429427132358, + "learning_rate": 9.159178048857684e-05, + "loss": 3.0208, + "step": 12979 + }, + { + "epoch": 0.8057607548575331, + "grad_norm": 0.177244916759608, + "learning_rate": 9.158977590438971e-05, + "loss": 3.031, + "step": 12980 + }, + { + "epoch": 0.805822831957291, + "grad_norm": 0.1625727098819733, + "learning_rate": 9.158777110321863e-05, + "loss": 3.0991, + "step": 12981 + }, + { + "epoch": 0.8058849090570489, + "grad_norm": 0.1701890773826176, + "learning_rate": 9.158576608507405e-05, + "loss": 2.9546, + "step": 12982 + }, + { + "epoch": 0.8059469861568067, + "grad_norm": 0.18168381208332168, + "learning_rate": 9.158376084996645e-05, + "loss": 3.0213, + "step": 12983 + }, + { + "epoch": 0.8060090632565646, + "grad_norm": 0.17658273445920158, + "learning_rate": 9.158175539790626e-05, + "loss": 3.0418, + "step": 12984 + }, + { + "epoch": 0.8060711403563225, + "grad_norm": 0.19975948810661617, + "learning_rate": 9.157974972890396e-05, + "loss": 3.1236, + "step": 12985 + }, + { + "epoch": 0.8061332174560805, + "grad_norm": 0.23079386909711744, + "learning_rate": 9.157774384297e-05, + "loss": 3.0325, + "step": 12986 + }, + { + "epoch": 0.8061952945558384, + "grad_norm": 0.22920625608389752, + "learning_rate": 9.157573774011486e-05, + "loss": 3.1682, + "step": 12987 + }, + { + "epoch": 0.8062573716555963, + "grad_norm": 0.17234982662261375, + "learning_rate": 9.157373142034902e-05, + "loss": 2.9841, + "step": 12988 + }, + { + "epoch": 0.8063194487553541, + "grad_norm": 0.17699563903069934, + "learning_rate": 9.157172488368291e-05, + "loss": 3.0069, + "step": 12989 + }, + { + "epoch": 0.806381525855112, + "grad_norm": 0.1956308155139934, + "learning_rate": 9.156971813012702e-05, + "loss": 3.0488, + "step": 12990 + }, + { + "epoch": 0.8064436029548699, + "grad_norm": 0.2078872116135494, + "learning_rate": 9.156771115969184e-05, + "loss": 3.1043, + "step": 12991 + }, + { + "epoch": 0.8065056800546279, + "grad_norm": 0.21684785192018088, + "learning_rate": 9.156570397238781e-05, + "loss": 2.9878, + "step": 12992 + }, + { + "epoch": 0.8065677571543858, + "grad_norm": 0.19099820687540792, + "learning_rate": 9.156369656822541e-05, + "loss": 3.06, + "step": 12993 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.18034613188230098, + "learning_rate": 9.156168894721511e-05, + "loss": 3.0438, + "step": 12994 + }, + { + "epoch": 0.8066919113539015, + "grad_norm": 0.1933550400681511, + "learning_rate": 9.15596811093674e-05, + "loss": 3.0654, + "step": 12995 + }, + { + "epoch": 0.8067539884536594, + "grad_norm": 0.18643278145773495, + "learning_rate": 9.155767305469274e-05, + "loss": 3.0295, + "step": 12996 + }, + { + "epoch": 0.8068160655534173, + "grad_norm": 0.16555242432838588, + "learning_rate": 9.15556647832016e-05, + "loss": 2.9386, + "step": 12997 + }, + { + "epoch": 0.8068781426531753, + "grad_norm": 0.21254110955442837, + "learning_rate": 9.155365629490449e-05, + "loss": 3.0407, + "step": 12998 + }, + { + "epoch": 0.8069402197529332, + "grad_norm": 0.192161888261314, + "learning_rate": 9.155164758981186e-05, + "loss": 3.0737, + "step": 12999 + }, + { + "epoch": 0.8070022968526911, + "grad_norm": 0.18103992439005265, + "learning_rate": 9.15496386679342e-05, + "loss": 3.0266, + "step": 13000 + }, + { + "epoch": 0.8070643739524489, + "grad_norm": 0.16113618789374196, + "learning_rate": 9.154762952928197e-05, + "loss": 3.0622, + "step": 13001 + }, + { + "epoch": 0.8071264510522068, + "grad_norm": 0.21619503579593963, + "learning_rate": 9.154562017386569e-05, + "loss": 3.0078, + "step": 13002 + }, + { + "epoch": 0.8071885281519647, + "grad_norm": 0.1692088282556849, + "learning_rate": 9.154361060169582e-05, + "loss": 3.1221, + "step": 13003 + }, + { + "epoch": 0.8072506052517227, + "grad_norm": 0.2263425795548421, + "learning_rate": 9.154160081278285e-05, + "loss": 3.0123, + "step": 13004 + }, + { + "epoch": 0.8073126823514806, + "grad_norm": 0.196360655676541, + "learning_rate": 9.153959080713725e-05, + "loss": 3.0327, + "step": 13005 + }, + { + "epoch": 0.8073747594512385, + "grad_norm": 0.16872178985427658, + "learning_rate": 9.153758058476954e-05, + "loss": 3.0378, + "step": 13006 + }, + { + "epoch": 0.8074368365509963, + "grad_norm": 0.17561818612785154, + "learning_rate": 9.153557014569019e-05, + "loss": 3.0513, + "step": 13007 + }, + { + "epoch": 0.8074989136507542, + "grad_norm": 0.17662128950886674, + "learning_rate": 9.153355948990968e-05, + "loss": 3.0131, + "step": 13008 + }, + { + "epoch": 0.8075609907505121, + "grad_norm": 0.20475775808732213, + "learning_rate": 9.15315486174385e-05, + "loss": 3.0427, + "step": 13009 + }, + { + "epoch": 0.80762306785027, + "grad_norm": 0.24305728115120157, + "learning_rate": 9.152953752828715e-05, + "loss": 3.0601, + "step": 13010 + }, + { + "epoch": 0.807685144950028, + "grad_norm": 0.17137149732329646, + "learning_rate": 9.152752622246612e-05, + "loss": 3.0211, + "step": 13011 + }, + { + "epoch": 0.8077472220497859, + "grad_norm": 0.2427001725954606, + "learning_rate": 9.152551469998591e-05, + "loss": 3.0813, + "step": 13012 + }, + { + "epoch": 0.8078092991495437, + "grad_norm": 0.18100087498814285, + "learning_rate": 9.152350296085702e-05, + "loss": 3.1576, + "step": 13013 + }, + { + "epoch": 0.8078713762493016, + "grad_norm": 0.1756683198778957, + "learning_rate": 9.15214910050899e-05, + "loss": 3.0473, + "step": 13014 + }, + { + "epoch": 0.8079334533490595, + "grad_norm": 0.19562186535709541, + "learning_rate": 9.151947883269512e-05, + "loss": 3.0804, + "step": 13015 + }, + { + "epoch": 0.8079955304488174, + "grad_norm": 0.16551439690384123, + "learning_rate": 9.151746644368311e-05, + "loss": 2.9469, + "step": 13016 + }, + { + "epoch": 0.8080576075485754, + "grad_norm": 0.17460852010462177, + "learning_rate": 9.151545383806441e-05, + "loss": 3.0887, + "step": 13017 + }, + { + "epoch": 0.8081196846483333, + "grad_norm": 0.17001260878105995, + "learning_rate": 9.151344101584952e-05, + "loss": 3.0079, + "step": 13018 + }, + { + "epoch": 0.8081817617480911, + "grad_norm": 0.20588605769339322, + "learning_rate": 9.151142797704893e-05, + "loss": 3.0842, + "step": 13019 + }, + { + "epoch": 0.808243838847849, + "grad_norm": 0.1830844511049494, + "learning_rate": 9.150941472167313e-05, + "loss": 3.0207, + "step": 13020 + }, + { + "epoch": 0.8083059159476069, + "grad_norm": 0.1710087066216425, + "learning_rate": 9.150740124973265e-05, + "loss": 3.1085, + "step": 13021 + }, + { + "epoch": 0.8083679930473648, + "grad_norm": 0.18022831096790715, + "learning_rate": 9.150538756123797e-05, + "loss": 3.0241, + "step": 13022 + }, + { + "epoch": 0.8084300701471228, + "grad_norm": 0.19375950420818244, + "learning_rate": 9.15033736561996e-05, + "loss": 3.0479, + "step": 13023 + }, + { + "epoch": 0.8084921472468807, + "grad_norm": 0.19512911770597838, + "learning_rate": 9.150135953462808e-05, + "loss": 3.0198, + "step": 13024 + }, + { + "epoch": 0.8085542243466385, + "grad_norm": 0.19221435956199945, + "learning_rate": 9.149934519653387e-05, + "loss": 2.9703, + "step": 13025 + }, + { + "epoch": 0.8086163014463964, + "grad_norm": 0.22240963043606862, + "learning_rate": 9.149733064192751e-05, + "loss": 3.0406, + "step": 13026 + }, + { + "epoch": 0.8086783785461543, + "grad_norm": 0.2359188808269099, + "learning_rate": 9.14953158708195e-05, + "loss": 3.0059, + "step": 13027 + }, + { + "epoch": 0.8087404556459122, + "grad_norm": 0.28113634409136606, + "learning_rate": 9.149330088322037e-05, + "loss": 3.0116, + "step": 13028 + }, + { + "epoch": 0.8088025327456702, + "grad_norm": 0.17049026246000765, + "learning_rate": 9.149128567914059e-05, + "loss": 2.9922, + "step": 13029 + }, + { + "epoch": 0.8088646098454281, + "grad_norm": 0.23918042391050473, + "learning_rate": 9.148927025859071e-05, + "loss": 3.0193, + "step": 13030 + }, + { + "epoch": 0.8089266869451859, + "grad_norm": 0.21130231816071351, + "learning_rate": 9.148725462158124e-05, + "loss": 3.1169, + "step": 13031 + }, + { + "epoch": 0.8089887640449438, + "grad_norm": 0.2050413825408382, + "learning_rate": 9.14852387681227e-05, + "loss": 3.0017, + "step": 13032 + }, + { + "epoch": 0.8090508411447017, + "grad_norm": 0.22541391328398108, + "learning_rate": 9.148322269822557e-05, + "loss": 3.1184, + "step": 13033 + }, + { + "epoch": 0.8091129182444596, + "grad_norm": 0.28667142773757237, + "learning_rate": 9.148120641190042e-05, + "loss": 3.0705, + "step": 13034 + }, + { + "epoch": 0.8091749953442176, + "grad_norm": 0.18795238003435277, + "learning_rate": 9.147918990915775e-05, + "loss": 3.0438, + "step": 13035 + }, + { + "epoch": 0.8092370724439755, + "grad_norm": 0.24039991614356934, + "learning_rate": 9.147717319000806e-05, + "loss": 2.9607, + "step": 13036 + }, + { + "epoch": 0.8092991495437333, + "grad_norm": 0.20701222062376234, + "learning_rate": 9.14751562544619e-05, + "loss": 3.0346, + "step": 13037 + }, + { + "epoch": 0.8093612266434912, + "grad_norm": 0.18977627281070683, + "learning_rate": 9.147313910252977e-05, + "loss": 3.0431, + "step": 13038 + }, + { + "epoch": 0.8094233037432491, + "grad_norm": 0.18058055287922606, + "learning_rate": 9.147112173422222e-05, + "loss": 3.0939, + "step": 13039 + }, + { + "epoch": 0.809485380843007, + "grad_norm": 0.17758612685629577, + "learning_rate": 9.146910414954974e-05, + "loss": 3.0247, + "step": 13040 + }, + { + "epoch": 0.809547457942765, + "grad_norm": 0.235338875519628, + "learning_rate": 9.14670863485229e-05, + "loss": 3.0289, + "step": 13041 + }, + { + "epoch": 0.8096095350425229, + "grad_norm": 0.26106933187358844, + "learning_rate": 9.14650683311522e-05, + "loss": 2.9814, + "step": 13042 + }, + { + "epoch": 0.8096716121422807, + "grad_norm": 0.19541020810394896, + "learning_rate": 9.146305009744815e-05, + "loss": 3.1247, + "step": 13043 + }, + { + "epoch": 0.8097336892420386, + "grad_norm": 0.19131221212847094, + "learning_rate": 9.146103164742132e-05, + "loss": 2.9826, + "step": 13044 + }, + { + "epoch": 0.8097957663417965, + "grad_norm": 0.2218428947834189, + "learning_rate": 9.145901298108224e-05, + "loss": 2.9564, + "step": 13045 + }, + { + "epoch": 0.8098578434415544, + "grad_norm": 0.20874705164077875, + "learning_rate": 9.14569940984414e-05, + "loss": 2.9288, + "step": 13046 + }, + { + "epoch": 0.8099199205413123, + "grad_norm": 0.24392467102975918, + "learning_rate": 9.145497499950937e-05, + "loss": 3.0103, + "step": 13047 + }, + { + "epoch": 0.8099819976410703, + "grad_norm": 0.19487275209837096, + "learning_rate": 9.145295568429666e-05, + "loss": 3.039, + "step": 13048 + }, + { + "epoch": 0.8100440747408281, + "grad_norm": 0.32593886681450246, + "learning_rate": 9.145093615281381e-05, + "loss": 2.9934, + "step": 13049 + }, + { + "epoch": 0.810106151840586, + "grad_norm": 0.22458833512986248, + "learning_rate": 9.144891640507138e-05, + "loss": 2.9652, + "step": 13050 + }, + { + "epoch": 0.8101682289403439, + "grad_norm": 0.18647816493062527, + "learning_rate": 9.144689644107989e-05, + "loss": 3.0426, + "step": 13051 + }, + { + "epoch": 0.8102303060401018, + "grad_norm": 0.28981039878440534, + "learning_rate": 9.144487626084988e-05, + "loss": 3.0872, + "step": 13052 + }, + { + "epoch": 0.8102923831398597, + "grad_norm": 0.20300880189661502, + "learning_rate": 9.144285586439189e-05, + "loss": 3.0686, + "step": 13053 + }, + { + "epoch": 0.8103544602396177, + "grad_norm": 0.18621827172123354, + "learning_rate": 9.144083525171646e-05, + "loss": 3.0662, + "step": 13054 + }, + { + "epoch": 0.8104165373393755, + "grad_norm": 0.18511132769739264, + "learning_rate": 9.143881442283413e-05, + "loss": 3.0261, + "step": 13055 + }, + { + "epoch": 0.8104786144391334, + "grad_norm": 0.40090065082859755, + "learning_rate": 9.143679337775543e-05, + "loss": 3.0398, + "step": 13056 + }, + { + "epoch": 0.8105406915388913, + "grad_norm": 0.2701005865748115, + "learning_rate": 9.143477211649093e-05, + "loss": 3.0249, + "step": 13057 + }, + { + "epoch": 0.8106027686386492, + "grad_norm": 0.23203602185464117, + "learning_rate": 9.143275063905119e-05, + "loss": 2.9743, + "step": 13058 + }, + { + "epoch": 0.8106648457384071, + "grad_norm": 0.3581848547549427, + "learning_rate": 9.143072894544671e-05, + "loss": 3.1012, + "step": 13059 + }, + { + "epoch": 0.810726922838165, + "grad_norm": 0.3053734711254696, + "learning_rate": 9.142870703568806e-05, + "loss": 3.0447, + "step": 13060 + }, + { + "epoch": 0.8107889999379229, + "grad_norm": 0.2682252701534959, + "learning_rate": 9.142668490978579e-05, + "loss": 3.0888, + "step": 13061 + }, + { + "epoch": 0.8108510770376808, + "grad_norm": 0.22315669287464152, + "learning_rate": 9.142466256775045e-05, + "loss": 3.0178, + "step": 13062 + }, + { + "epoch": 0.8109131541374387, + "grad_norm": 0.18258562682266838, + "learning_rate": 9.142264000959257e-05, + "loss": 3.0404, + "step": 13063 + }, + { + "epoch": 0.8109752312371966, + "grad_norm": 0.2260309264419246, + "learning_rate": 9.142061723532275e-05, + "loss": 3.0286, + "step": 13064 + }, + { + "epoch": 0.8110373083369545, + "grad_norm": 0.18873089840530272, + "learning_rate": 9.141859424495149e-05, + "loss": 3.0764, + "step": 13065 + }, + { + "epoch": 0.8110993854367125, + "grad_norm": 0.16991528136515885, + "learning_rate": 9.141657103848938e-05, + "loss": 3.0177, + "step": 13066 + }, + { + "epoch": 0.8111614625364703, + "grad_norm": 0.29876278181545163, + "learning_rate": 9.141454761594698e-05, + "loss": 3.0608, + "step": 13067 + }, + { + "epoch": 0.8112235396362282, + "grad_norm": 0.1877491507587198, + "learning_rate": 9.141252397733481e-05, + "loss": 2.9975, + "step": 13068 + }, + { + "epoch": 0.8112856167359861, + "grad_norm": 0.2874011289549497, + "learning_rate": 9.141050012266345e-05, + "loss": 3.0231, + "step": 13069 + }, + { + "epoch": 0.811347693835744, + "grad_norm": 0.23243897639598343, + "learning_rate": 9.140847605194347e-05, + "loss": 3.0633, + "step": 13070 + }, + { + "epoch": 0.8114097709355019, + "grad_norm": 0.2524436788253758, + "learning_rate": 9.140645176518541e-05, + "loss": 3.0822, + "step": 13071 + }, + { + "epoch": 0.8114718480352598, + "grad_norm": 0.20446875728541283, + "learning_rate": 9.140442726239985e-05, + "loss": 3.0321, + "step": 13072 + }, + { + "epoch": 0.8115339251350177, + "grad_norm": 0.18893813898216363, + "learning_rate": 9.140240254359734e-05, + "loss": 2.9827, + "step": 13073 + }, + { + "epoch": 0.8115960022347756, + "grad_norm": 0.2297371307910352, + "learning_rate": 9.140037760878843e-05, + "loss": 2.972, + "step": 13074 + }, + { + "epoch": 0.8116580793345335, + "grad_norm": 0.20812562283717714, + "learning_rate": 9.139835245798372e-05, + "loss": 2.9458, + "step": 13075 + }, + { + "epoch": 0.8117201564342914, + "grad_norm": 0.1854229061187725, + "learning_rate": 9.139632709119375e-05, + "loss": 3.1104, + "step": 13076 + }, + { + "epoch": 0.8117822335340493, + "grad_norm": 0.17940834378481033, + "learning_rate": 9.139430150842908e-05, + "loss": 3.0289, + "step": 13077 + }, + { + "epoch": 0.8118443106338072, + "grad_norm": 0.18746365751283162, + "learning_rate": 9.139227570970029e-05, + "loss": 3.0101, + "step": 13078 + }, + { + "epoch": 0.811906387733565, + "grad_norm": 0.19285411518548573, + "learning_rate": 9.139024969501796e-05, + "loss": 2.983, + "step": 13079 + }, + { + "epoch": 0.811968464833323, + "grad_norm": 0.2703034217500755, + "learning_rate": 9.138822346439265e-05, + "loss": 2.9834, + "step": 13080 + }, + { + "epoch": 0.8120305419330809, + "grad_norm": 0.22173054989686142, + "learning_rate": 9.138619701783493e-05, + "loss": 3.0772, + "step": 13081 + }, + { + "epoch": 0.8120926190328388, + "grad_norm": 0.2197351326860549, + "learning_rate": 9.138417035535536e-05, + "loss": 3.0495, + "step": 13082 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.22658283865828938, + "learning_rate": 9.138214347696455e-05, + "loss": 2.9999, + "step": 13083 + }, + { + "epoch": 0.8122167732323546, + "grad_norm": 0.21784098821992604, + "learning_rate": 9.138011638267303e-05, + "loss": 3.0356, + "step": 13084 + }, + { + "epoch": 0.8122788503321124, + "grad_norm": 0.19466428956659532, + "learning_rate": 9.13780890724914e-05, + "loss": 3.0389, + "step": 13085 + }, + { + "epoch": 0.8123409274318704, + "grad_norm": 0.1680355050263174, + "learning_rate": 9.137606154643025e-05, + "loss": 3.0348, + "step": 13086 + }, + { + "epoch": 0.8124030045316283, + "grad_norm": 0.1603651545127153, + "learning_rate": 9.137403380450013e-05, + "loss": 2.9961, + "step": 13087 + }, + { + "epoch": 0.8124650816313862, + "grad_norm": 0.16500425253792234, + "learning_rate": 9.137200584671164e-05, + "loss": 2.9439, + "step": 13088 + }, + { + "epoch": 0.8125271587311441, + "grad_norm": 0.15910284258662102, + "learning_rate": 9.136997767307534e-05, + "loss": 3.0255, + "step": 13089 + }, + { + "epoch": 0.812589235830902, + "grad_norm": 0.162262516763347, + "learning_rate": 9.136794928360183e-05, + "loss": 3.0284, + "step": 13090 + }, + { + "epoch": 0.8126513129306598, + "grad_norm": 0.17198002380844493, + "learning_rate": 9.136592067830168e-05, + "loss": 3.0066, + "step": 13091 + }, + { + "epoch": 0.8127133900304178, + "grad_norm": 0.16748012954188216, + "learning_rate": 9.136389185718549e-05, + "loss": 3.0295, + "step": 13092 + }, + { + "epoch": 0.8127754671301757, + "grad_norm": 0.15794032250468587, + "learning_rate": 9.136186282026384e-05, + "loss": 3.0808, + "step": 13093 + }, + { + "epoch": 0.8128375442299336, + "grad_norm": 0.16683288694686252, + "learning_rate": 9.13598335675473e-05, + "loss": 3.002, + "step": 13094 + }, + { + "epoch": 0.8128996213296915, + "grad_norm": 0.15513121491300352, + "learning_rate": 9.135780409904646e-05, + "loss": 3.0398, + "step": 13095 + }, + { + "epoch": 0.8129616984294494, + "grad_norm": 0.20591448131359047, + "learning_rate": 9.135577441477193e-05, + "loss": 3.0688, + "step": 13096 + }, + { + "epoch": 0.8130237755292072, + "grad_norm": 0.14788621247001696, + "learning_rate": 9.135374451473429e-05, + "loss": 3.0317, + "step": 13097 + }, + { + "epoch": 0.8130858526289652, + "grad_norm": 0.20133960546076038, + "learning_rate": 9.135171439894411e-05, + "loss": 2.969, + "step": 13098 + }, + { + "epoch": 0.8131479297287231, + "grad_norm": 0.20636860045671945, + "learning_rate": 9.1349684067412e-05, + "loss": 3.0139, + "step": 13099 + }, + { + "epoch": 0.813210006828481, + "grad_norm": 0.19765318200968315, + "learning_rate": 9.134765352014855e-05, + "loss": 2.9849, + "step": 13100 + }, + { + "epoch": 0.8132720839282389, + "grad_norm": 0.20117971719807745, + "learning_rate": 9.134562275716436e-05, + "loss": 3.0941, + "step": 13101 + }, + { + "epoch": 0.8133341610279968, + "grad_norm": 0.21819451169975385, + "learning_rate": 9.134359177847003e-05, + "loss": 2.9948, + "step": 13102 + }, + { + "epoch": 0.8133962381277546, + "grad_norm": 0.22995012735158832, + "learning_rate": 9.134156058407613e-05, + "loss": 3.0282, + "step": 13103 + }, + { + "epoch": 0.8134583152275126, + "grad_norm": 0.294328408249388, + "learning_rate": 9.133952917399328e-05, + "loss": 2.9927, + "step": 13104 + }, + { + "epoch": 0.8135203923272705, + "grad_norm": 0.19357519916820762, + "learning_rate": 9.133749754823206e-05, + "loss": 2.9735, + "step": 13105 + }, + { + "epoch": 0.8135824694270284, + "grad_norm": 0.1853118512269318, + "learning_rate": 9.13354657068031e-05, + "loss": 3.0501, + "step": 13106 + }, + { + "epoch": 0.8136445465267863, + "grad_norm": 0.2185476252528719, + "learning_rate": 9.133343364971695e-05, + "loss": 2.9754, + "step": 13107 + }, + { + "epoch": 0.8137066236265442, + "grad_norm": 0.18661152422377278, + "learning_rate": 9.133140137698426e-05, + "loss": 3.0914, + "step": 13108 + }, + { + "epoch": 0.813768700726302, + "grad_norm": 0.19156901581839028, + "learning_rate": 9.132936888861562e-05, + "loss": 2.9911, + "step": 13109 + }, + { + "epoch": 0.81383077782606, + "grad_norm": 0.16074873492601863, + "learning_rate": 9.132733618462162e-05, + "loss": 3.0622, + "step": 13110 + }, + { + "epoch": 0.8138928549258179, + "grad_norm": 0.18355883611296267, + "learning_rate": 9.132530326501288e-05, + "loss": 3.1074, + "step": 13111 + }, + { + "epoch": 0.8139549320255758, + "grad_norm": 0.18511625815723945, + "learning_rate": 9.13232701298e-05, + "loss": 3.0022, + "step": 13112 + }, + { + "epoch": 0.8140170091253337, + "grad_norm": 0.1823187546812851, + "learning_rate": 9.13212367789936e-05, + "loss": 3.0368, + "step": 13113 + }, + { + "epoch": 0.8140790862250915, + "grad_norm": 0.16776961343335092, + "learning_rate": 9.131920321260425e-05, + "loss": 3.113, + "step": 13114 + }, + { + "epoch": 0.8141411633248494, + "grad_norm": 0.17093554502621058, + "learning_rate": 9.131716943064262e-05, + "loss": 3.0321, + "step": 13115 + }, + { + "epoch": 0.8142032404246073, + "grad_norm": 0.1630183610992742, + "learning_rate": 9.131513543311926e-05, + "loss": 2.9706, + "step": 13116 + }, + { + "epoch": 0.8142653175243653, + "grad_norm": 0.17240367124397976, + "learning_rate": 9.131310122004482e-05, + "loss": 2.8752, + "step": 13117 + }, + { + "epoch": 0.8143273946241232, + "grad_norm": 0.20566936822986634, + "learning_rate": 9.131106679142988e-05, + "loss": 3.0383, + "step": 13118 + }, + { + "epoch": 0.8143894717238811, + "grad_norm": 0.2819194376569039, + "learning_rate": 9.13090321472851e-05, + "loss": 3.1095, + "step": 13119 + }, + { + "epoch": 0.8144515488236389, + "grad_norm": 0.2234270156721408, + "learning_rate": 9.130699728762108e-05, + "loss": 2.9812, + "step": 13120 + }, + { + "epoch": 0.8145136259233968, + "grad_norm": 0.2392570962700536, + "learning_rate": 9.130496221244841e-05, + "loss": 2.9589, + "step": 13121 + }, + { + "epoch": 0.8145757030231547, + "grad_norm": 0.22551432211787711, + "learning_rate": 9.130292692177774e-05, + "loss": 3.1254, + "step": 13122 + }, + { + "epoch": 0.8146377801229127, + "grad_norm": 0.29008961246674864, + "learning_rate": 9.130089141561966e-05, + "loss": 3.0542, + "step": 13123 + }, + { + "epoch": 0.8146998572226706, + "grad_norm": 0.21740801477476782, + "learning_rate": 9.129885569398479e-05, + "loss": 3.0464, + "step": 13124 + }, + { + "epoch": 0.8147619343224285, + "grad_norm": 0.49702819166108975, + "learning_rate": 9.129681975688378e-05, + "loss": 2.9795, + "step": 13125 + }, + { + "epoch": 0.8148240114221863, + "grad_norm": 0.2205561004692863, + "learning_rate": 9.129478360432724e-05, + "loss": 3.1102, + "step": 13126 + }, + { + "epoch": 0.8148860885219442, + "grad_norm": 0.20493412411693496, + "learning_rate": 9.129274723632579e-05, + "loss": 3.0623, + "step": 13127 + }, + { + "epoch": 0.8149481656217021, + "grad_norm": 0.2010443848259814, + "learning_rate": 9.129071065289004e-05, + "loss": 3.0376, + "step": 13128 + }, + { + "epoch": 0.81501024272146, + "grad_norm": 0.1907883128234298, + "learning_rate": 9.128867385403064e-05, + "loss": 3.0069, + "step": 13129 + }, + { + "epoch": 0.815072319821218, + "grad_norm": 0.19499998840001878, + "learning_rate": 9.12866368397582e-05, + "loss": 3.0524, + "step": 13130 + }, + { + "epoch": 0.8151343969209759, + "grad_norm": 0.21157996038006063, + "learning_rate": 9.128459961008336e-05, + "loss": 3.0347, + "step": 13131 + }, + { + "epoch": 0.8151964740207337, + "grad_norm": 0.19898050685637994, + "learning_rate": 9.128256216501674e-05, + "loss": 3.1003, + "step": 13132 + }, + { + "epoch": 0.8152585511204916, + "grad_norm": 0.24997348942861536, + "learning_rate": 9.128052450456896e-05, + "loss": 3.1175, + "step": 13133 + }, + { + "epoch": 0.8153206282202495, + "grad_norm": 0.2826073397678443, + "learning_rate": 9.127848662875069e-05, + "loss": 3.0147, + "step": 13134 + }, + { + "epoch": 0.8153827053200075, + "grad_norm": 0.19147287882889932, + "learning_rate": 9.12764485375725e-05, + "loss": 3.0648, + "step": 13135 + }, + { + "epoch": 0.8154447824197654, + "grad_norm": 0.20868031495056275, + "learning_rate": 9.127441023104508e-05, + "loss": 3.0029, + "step": 13136 + }, + { + "epoch": 0.8155068595195233, + "grad_norm": 0.19568881022472479, + "learning_rate": 9.127237170917904e-05, + "loss": 3.085, + "step": 13137 + }, + { + "epoch": 0.8155689366192811, + "grad_norm": 0.17061312155545022, + "learning_rate": 9.127033297198501e-05, + "loss": 3.0801, + "step": 13138 + }, + { + "epoch": 0.815631013719039, + "grad_norm": 0.27253558049076476, + "learning_rate": 9.126829401947364e-05, + "loss": 3.0605, + "step": 13139 + }, + { + "epoch": 0.8156930908187969, + "grad_norm": 0.18220927329191344, + "learning_rate": 9.126625485165557e-05, + "loss": 3.0594, + "step": 13140 + }, + { + "epoch": 0.8157551679185548, + "grad_norm": 0.20621565547285556, + "learning_rate": 9.12642154685414e-05, + "loss": 3.0513, + "step": 13141 + }, + { + "epoch": 0.8158172450183128, + "grad_norm": 0.19978803749852295, + "learning_rate": 9.126217587014184e-05, + "loss": 3.0606, + "step": 13142 + }, + { + "epoch": 0.8158793221180707, + "grad_norm": 0.24580385905639882, + "learning_rate": 9.126013605646746e-05, + "loss": 3.0412, + "step": 13143 + }, + { + "epoch": 0.8159413992178285, + "grad_norm": 0.1859924087071238, + "learning_rate": 9.125809602752895e-05, + "loss": 3.0076, + "step": 13144 + }, + { + "epoch": 0.8160034763175864, + "grad_norm": 0.2415530755152625, + "learning_rate": 9.125605578333693e-05, + "loss": 3.0464, + "step": 13145 + }, + { + "epoch": 0.8160655534173443, + "grad_norm": 0.19099934787476644, + "learning_rate": 9.125401532390205e-05, + "loss": 3.0107, + "step": 13146 + }, + { + "epoch": 0.8161276305171022, + "grad_norm": 0.28558429832859444, + "learning_rate": 9.125197464923497e-05, + "loss": 3.0008, + "step": 13147 + }, + { + "epoch": 0.8161897076168602, + "grad_norm": 0.2123785860652496, + "learning_rate": 9.124993375934631e-05, + "loss": 2.9983, + "step": 13148 + }, + { + "epoch": 0.8162517847166181, + "grad_norm": 0.28358680779653456, + "learning_rate": 9.124789265424675e-05, + "loss": 3.0461, + "step": 13149 + }, + { + "epoch": 0.8163138618163759, + "grad_norm": 0.29749424247619993, + "learning_rate": 9.124585133394691e-05, + "loss": 3.0366, + "step": 13150 + }, + { + "epoch": 0.8163759389161338, + "grad_norm": 0.22182481695219425, + "learning_rate": 9.124380979845745e-05, + "loss": 3.0987, + "step": 13151 + }, + { + "epoch": 0.8164380160158917, + "grad_norm": 0.22536447727055997, + "learning_rate": 9.124176804778903e-05, + "loss": 3.0767, + "step": 13152 + }, + { + "epoch": 0.8165000931156496, + "grad_norm": 0.23384691986857598, + "learning_rate": 9.123972608195228e-05, + "loss": 3.0222, + "step": 13153 + }, + { + "epoch": 0.8165621702154076, + "grad_norm": 0.20710804252814632, + "learning_rate": 9.123768390095787e-05, + "loss": 3.0835, + "step": 13154 + }, + { + "epoch": 0.8166242473151655, + "grad_norm": 0.20590530874720633, + "learning_rate": 9.123564150481646e-05, + "loss": 3.0882, + "step": 13155 + }, + { + "epoch": 0.8166863244149233, + "grad_norm": 0.256254313595449, + "learning_rate": 9.123359889353868e-05, + "loss": 3.0194, + "step": 13156 + }, + { + "epoch": 0.8167484015146812, + "grad_norm": 0.2268542926444194, + "learning_rate": 9.123155606713523e-05, + "loss": 3.0078, + "step": 13157 + }, + { + "epoch": 0.8168104786144391, + "grad_norm": 0.20062382125805311, + "learning_rate": 9.122951302561673e-05, + "loss": 3.0835, + "step": 13158 + }, + { + "epoch": 0.816872555714197, + "grad_norm": 0.20505747009893818, + "learning_rate": 9.122746976899386e-05, + "loss": 2.9244, + "step": 13159 + }, + { + "epoch": 0.816934632813955, + "grad_norm": 0.19747573691429937, + "learning_rate": 9.122542629727726e-05, + "loss": 2.9505, + "step": 13160 + }, + { + "epoch": 0.8169967099137129, + "grad_norm": 0.21325183113570859, + "learning_rate": 9.122338261047762e-05, + "loss": 3.0141, + "step": 13161 + }, + { + "epoch": 0.8170587870134707, + "grad_norm": 0.18907166450924698, + "learning_rate": 9.122133870860557e-05, + "loss": 3.0754, + "step": 13162 + }, + { + "epoch": 0.8171208641132286, + "grad_norm": 0.26349569015127056, + "learning_rate": 9.121929459167178e-05, + "loss": 3.0554, + "step": 13163 + }, + { + "epoch": 0.8171829412129865, + "grad_norm": 0.20362993899479787, + "learning_rate": 9.121725025968695e-05, + "loss": 3.0067, + "step": 13164 + }, + { + "epoch": 0.8172450183127444, + "grad_norm": 0.2343173114987746, + "learning_rate": 9.121520571266171e-05, + "loss": 3.0111, + "step": 13165 + }, + { + "epoch": 0.8173070954125023, + "grad_norm": 0.2405666441566658, + "learning_rate": 9.121316095060672e-05, + "loss": 3.0368, + "step": 13166 + }, + { + "epoch": 0.8173691725122603, + "grad_norm": 0.17322341339843936, + "learning_rate": 9.121111597353269e-05, + "loss": 2.9688, + "step": 13167 + }, + { + "epoch": 0.8174312496120181, + "grad_norm": 0.18774293023601013, + "learning_rate": 9.120907078145024e-05, + "loss": 3.044, + "step": 13168 + }, + { + "epoch": 0.817493326711776, + "grad_norm": 0.21387522608311987, + "learning_rate": 9.12070253743701e-05, + "loss": 3.0471, + "step": 13169 + }, + { + "epoch": 0.8175554038115339, + "grad_norm": 0.21765527812633984, + "learning_rate": 9.120497975230286e-05, + "loss": 3.0724, + "step": 13170 + }, + { + "epoch": 0.8176174809112918, + "grad_norm": 0.18745631464039206, + "learning_rate": 9.120293391525927e-05, + "loss": 3.0976, + "step": 13171 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.2069959430458272, + "learning_rate": 9.120088786324996e-05, + "loss": 3.0308, + "step": 13172 + }, + { + "epoch": 0.8177416351108077, + "grad_norm": 0.1670067931363302, + "learning_rate": 9.11988415962856e-05, + "loss": 2.9957, + "step": 13173 + }, + { + "epoch": 0.8178037122105655, + "grad_norm": 0.15830157007377393, + "learning_rate": 9.119679511437691e-05, + "loss": 2.9913, + "step": 13174 + }, + { + "epoch": 0.8178657893103234, + "grad_norm": 0.18200403181918323, + "learning_rate": 9.119474841753452e-05, + "loss": 2.928, + "step": 13175 + }, + { + "epoch": 0.8179278664100813, + "grad_norm": 0.18244947800515882, + "learning_rate": 9.119270150576914e-05, + "loss": 2.9844, + "step": 13176 + }, + { + "epoch": 0.8179899435098392, + "grad_norm": 0.2294655398866795, + "learning_rate": 9.119065437909143e-05, + "loss": 3.0978, + "step": 13177 + }, + { + "epoch": 0.8180520206095971, + "grad_norm": 0.1906285836711532, + "learning_rate": 9.118860703751208e-05, + "loss": 3.1061, + "step": 13178 + }, + { + "epoch": 0.8181140977093551, + "grad_norm": 0.23730571573921636, + "learning_rate": 9.118655948104175e-05, + "loss": 3.0194, + "step": 13179 + }, + { + "epoch": 0.8181761748091129, + "grad_norm": 0.21901663640217311, + "learning_rate": 9.118451170969117e-05, + "loss": 3.036, + "step": 13180 + }, + { + "epoch": 0.8182382519088708, + "grad_norm": 0.20347771831556935, + "learning_rate": 9.118246372347098e-05, + "loss": 3.0343, + "step": 13181 + }, + { + "epoch": 0.8183003290086287, + "grad_norm": 0.17533685352705025, + "learning_rate": 9.118041552239188e-05, + "loss": 3.0634, + "step": 13182 + }, + { + "epoch": 0.8183624061083866, + "grad_norm": 0.20272447455309045, + "learning_rate": 9.117836710646456e-05, + "loss": 3.0127, + "step": 13183 + }, + { + "epoch": 0.8184244832081445, + "grad_norm": 0.19315169963096276, + "learning_rate": 9.117631847569969e-05, + "loss": 3.1562, + "step": 13184 + }, + { + "epoch": 0.8184865603079025, + "grad_norm": 0.1729431233120821, + "learning_rate": 9.117426963010797e-05, + "loss": 2.9588, + "step": 13185 + }, + { + "epoch": 0.8185486374076603, + "grad_norm": 0.20961521872764247, + "learning_rate": 9.117222056970012e-05, + "loss": 3.0455, + "step": 13186 + }, + { + "epoch": 0.8186107145074182, + "grad_norm": 0.1615986822902988, + "learning_rate": 9.117017129448677e-05, + "loss": 2.8859, + "step": 13187 + }, + { + "epoch": 0.8186727916071761, + "grad_norm": 0.17867982312110117, + "learning_rate": 9.116812180447865e-05, + "loss": 3.1032, + "step": 13188 + }, + { + "epoch": 0.818734868706934, + "grad_norm": 0.18326119476604472, + "learning_rate": 9.116607209968644e-05, + "loss": 3.0349, + "step": 13189 + }, + { + "epoch": 0.8187969458066919, + "grad_norm": 0.17668960372251294, + "learning_rate": 9.116402218012084e-05, + "loss": 2.9553, + "step": 13190 + }, + { + "epoch": 0.8188590229064499, + "grad_norm": 0.2190703107527024, + "learning_rate": 9.116197204579255e-05, + "loss": 3.1098, + "step": 13191 + }, + { + "epoch": 0.8189211000062077, + "grad_norm": 0.17685308049325618, + "learning_rate": 9.115992169671225e-05, + "loss": 2.9869, + "step": 13192 + }, + { + "epoch": 0.8189831771059656, + "grad_norm": 0.1929047277555188, + "learning_rate": 9.115787113289067e-05, + "loss": 2.9943, + "step": 13193 + }, + { + "epoch": 0.8190452542057235, + "grad_norm": 0.2089326295339195, + "learning_rate": 9.115582035433847e-05, + "loss": 3.0952, + "step": 13194 + }, + { + "epoch": 0.8191073313054814, + "grad_norm": 0.1983131601104426, + "learning_rate": 9.115376936106635e-05, + "loss": 3.06, + "step": 13195 + }, + { + "epoch": 0.8191694084052393, + "grad_norm": 0.20468990273557805, + "learning_rate": 9.115171815308504e-05, + "loss": 3.128, + "step": 13196 + }, + { + "epoch": 0.8192314855049972, + "grad_norm": 0.17871421018845318, + "learning_rate": 9.114966673040522e-05, + "loss": 3.026, + "step": 13197 + }, + { + "epoch": 0.819293562604755, + "grad_norm": 0.17121198501109805, + "learning_rate": 9.11476150930376e-05, + "loss": 2.9269, + "step": 13198 + }, + { + "epoch": 0.819355639704513, + "grad_norm": 0.16781170844845056, + "learning_rate": 9.114556324099289e-05, + "loss": 3.1017, + "step": 13199 + }, + { + "epoch": 0.8194177168042709, + "grad_norm": 0.21905303299292417, + "learning_rate": 9.11435111742818e-05, + "loss": 3.0944, + "step": 13200 + }, + { + "epoch": 0.8194797939040288, + "grad_norm": 0.17336354160845835, + "learning_rate": 9.1141458892915e-05, + "loss": 2.9781, + "step": 13201 + }, + { + "epoch": 0.8195418710037867, + "grad_norm": 0.19499773410862245, + "learning_rate": 9.113940639690324e-05, + "loss": 2.8996, + "step": 13202 + }, + { + "epoch": 0.8196039481035446, + "grad_norm": 0.1977428762868445, + "learning_rate": 9.11373536862572e-05, + "loss": 2.977, + "step": 13203 + }, + { + "epoch": 0.8196660252033025, + "grad_norm": 0.21703713652809878, + "learning_rate": 9.11353007609876e-05, + "loss": 3.0677, + "step": 13204 + }, + { + "epoch": 0.8197281023030604, + "grad_norm": 0.16441088539474297, + "learning_rate": 9.113324762110514e-05, + "loss": 2.9195, + "step": 13205 + }, + { + "epoch": 0.8197901794028183, + "grad_norm": 0.1628777551483846, + "learning_rate": 9.113119426662055e-05, + "loss": 2.9333, + "step": 13206 + }, + { + "epoch": 0.8198522565025762, + "grad_norm": 0.19301010144375563, + "learning_rate": 9.112914069754452e-05, + "loss": 3.1009, + "step": 13207 + }, + { + "epoch": 0.8199143336023341, + "grad_norm": 0.20495711798332453, + "learning_rate": 9.11270869138878e-05, + "loss": 3.1095, + "step": 13208 + }, + { + "epoch": 0.819976410702092, + "grad_norm": 0.20790961011507578, + "learning_rate": 9.112503291566107e-05, + "loss": 2.9416, + "step": 13209 + }, + { + "epoch": 0.8200384878018498, + "grad_norm": 0.17051380470294913, + "learning_rate": 9.112297870287509e-05, + "loss": 3.0339, + "step": 13210 + }, + { + "epoch": 0.8201005649016078, + "grad_norm": 0.18863184367974017, + "learning_rate": 9.112092427554053e-05, + "loss": 3.0234, + "step": 13211 + }, + { + "epoch": 0.8201626420013657, + "grad_norm": 0.19021045095727998, + "learning_rate": 9.111886963366811e-05, + "loss": 3.0347, + "step": 13212 + }, + { + "epoch": 0.8202247191011236, + "grad_norm": 0.19067445332403904, + "learning_rate": 9.111681477726858e-05, + "loss": 2.9726, + "step": 13213 + }, + { + "epoch": 0.8202867962008815, + "grad_norm": 0.1790190058206902, + "learning_rate": 9.111475970635263e-05, + "loss": 3.0533, + "step": 13214 + }, + { + "epoch": 0.8203488733006394, + "grad_norm": 0.19272817797337302, + "learning_rate": 9.1112704420931e-05, + "loss": 3.0217, + "step": 13215 + }, + { + "epoch": 0.8204109504003972, + "grad_norm": 0.1716582220847245, + "learning_rate": 9.111064892101443e-05, + "loss": 3.0957, + "step": 13216 + }, + { + "epoch": 0.8204730275001552, + "grad_norm": 0.18405122777492536, + "learning_rate": 9.110859320661363e-05, + "loss": 3.0379, + "step": 13217 + }, + { + "epoch": 0.8205351045999131, + "grad_norm": 0.18508180256350376, + "learning_rate": 9.110653727773928e-05, + "loss": 3.0994, + "step": 13218 + }, + { + "epoch": 0.820597181699671, + "grad_norm": 0.20434357928012098, + "learning_rate": 9.110448113440218e-05, + "loss": 2.9856, + "step": 13219 + }, + { + "epoch": 0.8206592587994289, + "grad_norm": 0.18379410504711463, + "learning_rate": 9.110242477661301e-05, + "loss": 3.0335, + "step": 13220 + }, + { + "epoch": 0.8207213358991868, + "grad_norm": 0.19880334703481695, + "learning_rate": 9.110036820438253e-05, + "loss": 2.9576, + "step": 13221 + }, + { + "epoch": 0.8207834129989446, + "grad_norm": 0.1991869957243919, + "learning_rate": 9.109831141772143e-05, + "loss": 3.0834, + "step": 13222 + }, + { + "epoch": 0.8208454900987026, + "grad_norm": 0.21078302767253534, + "learning_rate": 9.109625441664048e-05, + "loss": 3.0056, + "step": 13223 + }, + { + "epoch": 0.8209075671984605, + "grad_norm": 0.3763610261295, + "learning_rate": 9.109419720115037e-05, + "loss": 2.984, + "step": 13224 + }, + { + "epoch": 0.8209696442982184, + "grad_norm": 0.24069271713928303, + "learning_rate": 9.109213977126188e-05, + "loss": 3.0393, + "step": 13225 + }, + { + "epoch": 0.8210317213979763, + "grad_norm": 0.2200497856622106, + "learning_rate": 9.109008212698571e-05, + "loss": 3.0753, + "step": 13226 + }, + { + "epoch": 0.8210937984977342, + "grad_norm": 0.2006910745010739, + "learning_rate": 9.10880242683326e-05, + "loss": 3.0151, + "step": 13227 + }, + { + "epoch": 0.821155875597492, + "grad_norm": 0.22886074935418121, + "learning_rate": 9.10859661953133e-05, + "loss": 3.0317, + "step": 13228 + }, + { + "epoch": 0.82121795269725, + "grad_norm": 0.21916629994443906, + "learning_rate": 9.108390790793855e-05, + "loss": 3.0102, + "step": 13229 + }, + { + "epoch": 0.8212800297970079, + "grad_norm": 0.21777876603659582, + "learning_rate": 9.108184940621907e-05, + "loss": 3.0232, + "step": 13230 + }, + { + "epoch": 0.8213421068967658, + "grad_norm": 0.20856527391291768, + "learning_rate": 9.107979069016562e-05, + "loss": 3.1, + "step": 13231 + }, + { + "epoch": 0.8214041839965237, + "grad_norm": 0.17594612125020975, + "learning_rate": 9.107773175978891e-05, + "loss": 2.9731, + "step": 13232 + }, + { + "epoch": 0.8214662610962816, + "grad_norm": 0.17653822563207008, + "learning_rate": 9.10756726150997e-05, + "loss": 3.0081, + "step": 13233 + }, + { + "epoch": 0.8215283381960394, + "grad_norm": 0.20641764994117448, + "learning_rate": 9.107361325610875e-05, + "loss": 3.0815, + "step": 13234 + }, + { + "epoch": 0.8215904152957973, + "grad_norm": 0.17282607542594822, + "learning_rate": 9.107155368282678e-05, + "loss": 3.0744, + "step": 13235 + }, + { + "epoch": 0.8216524923955553, + "grad_norm": 0.18000139931293807, + "learning_rate": 9.106949389526456e-05, + "loss": 3.0159, + "step": 13236 + }, + { + "epoch": 0.8217145694953132, + "grad_norm": 0.16424588331342918, + "learning_rate": 9.10674338934328e-05, + "loss": 3.0331, + "step": 13237 + }, + { + "epoch": 0.8217766465950711, + "grad_norm": 0.20180852387493547, + "learning_rate": 9.106537367734228e-05, + "loss": 3.0979, + "step": 13238 + }, + { + "epoch": 0.821838723694829, + "grad_norm": 0.18686961459689597, + "learning_rate": 9.106331324700373e-05, + "loss": 3.0265, + "step": 13239 + }, + { + "epoch": 0.8219008007945868, + "grad_norm": 0.18496232560955123, + "learning_rate": 9.106125260242791e-05, + "loss": 3.0174, + "step": 13240 + }, + { + "epoch": 0.8219628778943447, + "grad_norm": 0.16870828218836947, + "learning_rate": 9.105919174362556e-05, + "loss": 2.9652, + "step": 13241 + }, + { + "epoch": 0.8220249549941027, + "grad_norm": 0.20743115506722143, + "learning_rate": 9.105713067060745e-05, + "loss": 3.1005, + "step": 13242 + }, + { + "epoch": 0.8220870320938606, + "grad_norm": 0.19621853486590157, + "learning_rate": 9.105506938338432e-05, + "loss": 3.1159, + "step": 13243 + }, + { + "epoch": 0.8221491091936185, + "grad_norm": 0.21417968598330475, + "learning_rate": 9.105300788196693e-05, + "loss": 3.0264, + "step": 13244 + }, + { + "epoch": 0.8222111862933764, + "grad_norm": 0.269752618855623, + "learning_rate": 9.105094616636603e-05, + "loss": 3.1569, + "step": 13245 + }, + { + "epoch": 0.8222732633931342, + "grad_norm": 0.24805196621693604, + "learning_rate": 9.104888423659237e-05, + "loss": 3.1296, + "step": 13246 + }, + { + "epoch": 0.8223353404928921, + "grad_norm": 0.26228253291595904, + "learning_rate": 9.104682209265671e-05, + "loss": 3.0039, + "step": 13247 + }, + { + "epoch": 0.8223974175926501, + "grad_norm": 0.1921746439626896, + "learning_rate": 9.104475973456983e-05, + "loss": 3.072, + "step": 13248 + }, + { + "epoch": 0.822459494692408, + "grad_norm": 0.2757538233402906, + "learning_rate": 9.104269716234248e-05, + "loss": 3.0098, + "step": 13249 + }, + { + "epoch": 0.8225215717921659, + "grad_norm": 0.1863611423980812, + "learning_rate": 9.10406343759854e-05, + "loss": 3.0479, + "step": 13250 + }, + { + "epoch": 0.8225836488919238, + "grad_norm": 0.190774458780722, + "learning_rate": 9.103857137550937e-05, + "loss": 3.0692, + "step": 13251 + }, + { + "epoch": 0.8226457259916816, + "grad_norm": 0.16077987280126985, + "learning_rate": 9.103650816092515e-05, + "loss": 3.0612, + "step": 13252 + }, + { + "epoch": 0.8227078030914395, + "grad_norm": 0.30808342614497947, + "learning_rate": 9.103444473224351e-05, + "loss": 2.9476, + "step": 13253 + }, + { + "epoch": 0.8227698801911975, + "grad_norm": 0.2794476882196681, + "learning_rate": 9.103238108947521e-05, + "loss": 3.0003, + "step": 13254 + }, + { + "epoch": 0.8228319572909554, + "grad_norm": 0.1694159649739258, + "learning_rate": 9.103031723263101e-05, + "loss": 2.8897, + "step": 13255 + }, + { + "epoch": 0.8228940343907133, + "grad_norm": 0.204324299592657, + "learning_rate": 9.10282531617217e-05, + "loss": 3.0284, + "step": 13256 + }, + { + "epoch": 0.8229561114904712, + "grad_norm": 0.17250439969961842, + "learning_rate": 9.102618887675802e-05, + "loss": 3.0725, + "step": 13257 + }, + { + "epoch": 0.823018188590229, + "grad_norm": 0.24481437033347106, + "learning_rate": 9.102412437775076e-05, + "loss": 3.0612, + "step": 13258 + }, + { + "epoch": 0.8230802656899869, + "grad_norm": 0.17148558435904301, + "learning_rate": 9.102205966471068e-05, + "loss": 2.9786, + "step": 13259 + }, + { + "epoch": 0.8231423427897449, + "grad_norm": 0.1953738974389409, + "learning_rate": 9.101999473764855e-05, + "loss": 2.9953, + "step": 13260 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.18244077964375358, + "learning_rate": 9.101792959657516e-05, + "loss": 3.0655, + "step": 13261 + }, + { + "epoch": 0.8232664969892607, + "grad_norm": 0.22439737681217292, + "learning_rate": 9.101586424150128e-05, + "loss": 2.9841, + "step": 13262 + }, + { + "epoch": 0.8233285740890186, + "grad_norm": 0.17519683114268617, + "learning_rate": 9.101379867243766e-05, + "loss": 3.0375, + "step": 13263 + }, + { + "epoch": 0.8233906511887764, + "grad_norm": 0.1810555315049467, + "learning_rate": 9.101173288939511e-05, + "loss": 3.1151, + "step": 13264 + }, + { + "epoch": 0.8234527282885343, + "grad_norm": 0.14901767459590623, + "learning_rate": 9.10096668923844e-05, + "loss": 3.0228, + "step": 13265 + }, + { + "epoch": 0.8235148053882922, + "grad_norm": 0.17902048328683143, + "learning_rate": 9.100760068141627e-05, + "loss": 3.0928, + "step": 13266 + }, + { + "epoch": 0.8235768824880502, + "grad_norm": 0.15910184747345876, + "learning_rate": 9.100553425650156e-05, + "loss": 3.0269, + "step": 13267 + }, + { + "epoch": 0.8236389595878081, + "grad_norm": 0.2376782259916072, + "learning_rate": 9.100346761765102e-05, + "loss": 2.9969, + "step": 13268 + }, + { + "epoch": 0.823701036687566, + "grad_norm": 0.1609054811347792, + "learning_rate": 9.100140076487543e-05, + "loss": 3.0417, + "step": 13269 + }, + { + "epoch": 0.8237631137873238, + "grad_norm": 0.20742668318698304, + "learning_rate": 9.099933369818558e-05, + "loss": 3.1127, + "step": 13270 + }, + { + "epoch": 0.8238251908870817, + "grad_norm": 0.1933234595649425, + "learning_rate": 9.099726641759225e-05, + "loss": 2.9395, + "step": 13271 + }, + { + "epoch": 0.8238872679868396, + "grad_norm": 0.18111888237146057, + "learning_rate": 9.099519892310624e-05, + "loss": 2.9537, + "step": 13272 + }, + { + "epoch": 0.8239493450865976, + "grad_norm": 0.17966463109318873, + "learning_rate": 9.099313121473831e-05, + "loss": 3.0749, + "step": 13273 + }, + { + "epoch": 0.8240114221863555, + "grad_norm": 0.22519390506089518, + "learning_rate": 9.099106329249929e-05, + "loss": 3.0914, + "step": 13274 + }, + { + "epoch": 0.8240734992861134, + "grad_norm": 0.2837882607035852, + "learning_rate": 9.098899515639992e-05, + "loss": 3.0626, + "step": 13275 + }, + { + "epoch": 0.8241355763858712, + "grad_norm": 0.22671498724708886, + "learning_rate": 9.098692680645101e-05, + "loss": 3.0183, + "step": 13276 + }, + { + "epoch": 0.8241976534856291, + "grad_norm": 0.23394963288342588, + "learning_rate": 9.098485824266336e-05, + "loss": 3.0732, + "step": 13277 + }, + { + "epoch": 0.824259730585387, + "grad_norm": 0.17415528103545014, + "learning_rate": 9.098278946504774e-05, + "loss": 3.0063, + "step": 13278 + }, + { + "epoch": 0.824321807685145, + "grad_norm": 0.20377802416209823, + "learning_rate": 9.098072047361499e-05, + "loss": 2.9587, + "step": 13279 + }, + { + "epoch": 0.8243838847849029, + "grad_norm": 0.23256073337940902, + "learning_rate": 9.097865126837585e-05, + "loss": 3.0838, + "step": 13280 + }, + { + "epoch": 0.8244459618846608, + "grad_norm": 0.21713070469885487, + "learning_rate": 9.097658184934115e-05, + "loss": 2.9575, + "step": 13281 + }, + { + "epoch": 0.8245080389844186, + "grad_norm": 0.22379057537625532, + "learning_rate": 9.097451221652167e-05, + "loss": 3.0113, + "step": 13282 + }, + { + "epoch": 0.8245701160841765, + "grad_norm": 0.23023112218065045, + "learning_rate": 9.09724423699282e-05, + "loss": 3.0098, + "step": 13283 + }, + { + "epoch": 0.8246321931839344, + "grad_norm": 0.20073010724699702, + "learning_rate": 9.097037230957158e-05, + "loss": 2.9282, + "step": 13284 + }, + { + "epoch": 0.8246942702836924, + "grad_norm": 0.2063853155922142, + "learning_rate": 9.096830203546255e-05, + "loss": 3.0186, + "step": 13285 + }, + { + "epoch": 0.8247563473834503, + "grad_norm": 0.17136185620590677, + "learning_rate": 9.096623154761196e-05, + "loss": 2.9461, + "step": 13286 + }, + { + "epoch": 0.8248184244832082, + "grad_norm": 0.2053287337256183, + "learning_rate": 9.096416084603059e-05, + "loss": 3.0452, + "step": 13287 + }, + { + "epoch": 0.824880501582966, + "grad_norm": 0.18396345412412624, + "learning_rate": 9.096208993072926e-05, + "loss": 3.0863, + "step": 13288 + }, + { + "epoch": 0.8249425786827239, + "grad_norm": 0.1812623607597082, + "learning_rate": 9.096001880171874e-05, + "loss": 3.0167, + "step": 13289 + }, + { + "epoch": 0.8250046557824818, + "grad_norm": 0.16836969627198853, + "learning_rate": 9.095794745900988e-05, + "loss": 3.0774, + "step": 13290 + }, + { + "epoch": 0.8250667328822398, + "grad_norm": 0.18340609476599962, + "learning_rate": 9.095587590261345e-05, + "loss": 3.1526, + "step": 13291 + }, + { + "epoch": 0.8251288099819977, + "grad_norm": 0.22742869602459126, + "learning_rate": 9.09538041325403e-05, + "loss": 3.0757, + "step": 13292 + }, + { + "epoch": 0.8251908870817556, + "grad_norm": 0.17371528155979418, + "learning_rate": 9.095173214880117e-05, + "loss": 3.0366, + "step": 13293 + }, + { + "epoch": 0.8252529641815134, + "grad_norm": 0.17217222977686597, + "learning_rate": 9.094965995140694e-05, + "loss": 2.9743, + "step": 13294 + }, + { + "epoch": 0.8253150412812713, + "grad_norm": 0.2170521376069775, + "learning_rate": 9.094758754036839e-05, + "loss": 2.9453, + "step": 13295 + }, + { + "epoch": 0.8253771183810292, + "grad_norm": 0.19298815494529745, + "learning_rate": 9.094551491569634e-05, + "loss": 3.058, + "step": 13296 + }, + { + "epoch": 0.8254391954807871, + "grad_norm": 0.19625916893915746, + "learning_rate": 9.094344207740158e-05, + "loss": 3.0237, + "step": 13297 + }, + { + "epoch": 0.8255012725805451, + "grad_norm": 0.21263875848726138, + "learning_rate": 9.094136902549496e-05, + "loss": 3.0349, + "step": 13298 + }, + { + "epoch": 0.825563349680303, + "grad_norm": 0.1569281168221129, + "learning_rate": 9.093929575998728e-05, + "loss": 3.0647, + "step": 13299 + }, + { + "epoch": 0.8256254267800608, + "grad_norm": 0.18207994268222558, + "learning_rate": 9.093722228088935e-05, + "loss": 2.9623, + "step": 13300 + }, + { + "epoch": 0.8256875038798187, + "grad_norm": 0.26886573118286244, + "learning_rate": 9.093514858821199e-05, + "loss": 3.0351, + "step": 13301 + }, + { + "epoch": 0.8257495809795766, + "grad_norm": 0.24120801082179213, + "learning_rate": 9.093307468196603e-05, + "loss": 2.9868, + "step": 13302 + }, + { + "epoch": 0.8258116580793345, + "grad_norm": 0.15851956967502606, + "learning_rate": 9.093100056216228e-05, + "loss": 3.0252, + "step": 13303 + }, + { + "epoch": 0.8258737351790925, + "grad_norm": 0.17925521433068123, + "learning_rate": 9.092892622881155e-05, + "loss": 3.0675, + "step": 13304 + }, + { + "epoch": 0.8259358122788504, + "grad_norm": 0.24699684980879766, + "learning_rate": 9.09268516819247e-05, + "loss": 3.0056, + "step": 13305 + }, + { + "epoch": 0.8259978893786082, + "grad_norm": 0.23122924666456757, + "learning_rate": 9.092477692151252e-05, + "loss": 3.0773, + "step": 13306 + }, + { + "epoch": 0.8260599664783661, + "grad_norm": 0.1715971695241929, + "learning_rate": 9.092270194758583e-05, + "loss": 3.0456, + "step": 13307 + }, + { + "epoch": 0.826122043578124, + "grad_norm": 0.21439987626336837, + "learning_rate": 9.09206267601555e-05, + "loss": 3.0547, + "step": 13308 + }, + { + "epoch": 0.8261841206778819, + "grad_norm": 0.17135422553454685, + "learning_rate": 9.091855135923229e-05, + "loss": 3.0479, + "step": 13309 + }, + { + "epoch": 0.8262461977776399, + "grad_norm": 0.17937714269733981, + "learning_rate": 9.091647574482709e-05, + "loss": 3.1098, + "step": 13310 + }, + { + "epoch": 0.8263082748773978, + "grad_norm": 0.18767250589352502, + "learning_rate": 9.09143999169507e-05, + "loss": 2.9832, + "step": 13311 + }, + { + "epoch": 0.8263703519771556, + "grad_norm": 0.24489958477649013, + "learning_rate": 9.091232387561394e-05, + "loss": 3.0514, + "step": 13312 + }, + { + "epoch": 0.8264324290769135, + "grad_norm": 0.19455874911282228, + "learning_rate": 9.091024762082766e-05, + "loss": 3.0241, + "step": 13313 + }, + { + "epoch": 0.8264945061766714, + "grad_norm": 0.18163696408428015, + "learning_rate": 9.090817115260269e-05, + "loss": 3.1557, + "step": 13314 + }, + { + "epoch": 0.8265565832764293, + "grad_norm": 0.18855152714658888, + "learning_rate": 9.090609447094986e-05, + "loss": 3.0236, + "step": 13315 + }, + { + "epoch": 0.8266186603761873, + "grad_norm": 0.19649334615927197, + "learning_rate": 9.090401757588001e-05, + "loss": 3.0701, + "step": 13316 + }, + { + "epoch": 0.8266807374759452, + "grad_norm": 0.303655134759802, + "learning_rate": 9.090194046740396e-05, + "loss": 3.094, + "step": 13317 + }, + { + "epoch": 0.826742814575703, + "grad_norm": 0.18102525226937366, + "learning_rate": 9.089986314553256e-05, + "loss": 3.0786, + "step": 13318 + }, + { + "epoch": 0.8268048916754609, + "grad_norm": 0.2017933772629831, + "learning_rate": 9.089778561027666e-05, + "loss": 3.0466, + "step": 13319 + }, + { + "epoch": 0.8268669687752188, + "grad_norm": 0.19630976705339995, + "learning_rate": 9.089570786164707e-05, + "loss": 3.0033, + "step": 13320 + }, + { + "epoch": 0.8269290458749767, + "grad_norm": 0.20502885499103965, + "learning_rate": 9.089362989965464e-05, + "loss": 2.9724, + "step": 13321 + }, + { + "epoch": 0.8269911229747346, + "grad_norm": 0.16934396863325965, + "learning_rate": 9.089155172431022e-05, + "loss": 2.9339, + "step": 13322 + }, + { + "epoch": 0.8270532000744926, + "grad_norm": 0.1722759101029048, + "learning_rate": 9.088947333562466e-05, + "loss": 2.9605, + "step": 13323 + }, + { + "epoch": 0.8271152771742504, + "grad_norm": 0.21084613057419957, + "learning_rate": 9.088739473360877e-05, + "loss": 3.045, + "step": 13324 + }, + { + "epoch": 0.8271773542740083, + "grad_norm": 0.16737828599112956, + "learning_rate": 9.088531591827343e-05, + "loss": 2.9812, + "step": 13325 + }, + { + "epoch": 0.8272394313737662, + "grad_norm": 0.1709666906029529, + "learning_rate": 9.088323688962947e-05, + "loss": 3.0546, + "step": 13326 + }, + { + "epoch": 0.8273015084735241, + "grad_norm": 0.19731313303331455, + "learning_rate": 9.088115764768774e-05, + "loss": 3.0097, + "step": 13327 + }, + { + "epoch": 0.827363585573282, + "grad_norm": 0.1995875460489358, + "learning_rate": 9.08790781924591e-05, + "loss": 3.0196, + "step": 13328 + }, + { + "epoch": 0.82742566267304, + "grad_norm": 0.16944171215472728, + "learning_rate": 9.087699852395435e-05, + "loss": 3.116, + "step": 13329 + }, + { + "epoch": 0.8274877397727978, + "grad_norm": 0.19644538386342286, + "learning_rate": 9.087491864218439e-05, + "loss": 3.0405, + "step": 13330 + }, + { + "epoch": 0.8275498168725557, + "grad_norm": 0.19484765048126546, + "learning_rate": 9.087283854716006e-05, + "loss": 3.056, + "step": 13331 + }, + { + "epoch": 0.8276118939723136, + "grad_norm": 0.17631685461787688, + "learning_rate": 9.087075823889222e-05, + "loss": 2.9855, + "step": 13332 + }, + { + "epoch": 0.8276739710720715, + "grad_norm": 0.1704666131617311, + "learning_rate": 9.08686777173917e-05, + "loss": 3.097, + "step": 13333 + }, + { + "epoch": 0.8277360481718294, + "grad_norm": 0.17740405481595414, + "learning_rate": 9.086659698266937e-05, + "loss": 3.0447, + "step": 13334 + }, + { + "epoch": 0.8277981252715874, + "grad_norm": 0.2602205809822777, + "learning_rate": 9.086451603473608e-05, + "loss": 3.0539, + "step": 13335 + }, + { + "epoch": 0.8278602023713452, + "grad_norm": 0.16276466109781487, + "learning_rate": 9.086243487360269e-05, + "loss": 2.9759, + "step": 13336 + }, + { + "epoch": 0.8279222794711031, + "grad_norm": 0.23640304355249553, + "learning_rate": 9.086035349928005e-05, + "loss": 3.0819, + "step": 13337 + }, + { + "epoch": 0.827984356570861, + "grad_norm": 0.17497321481589795, + "learning_rate": 9.085827191177905e-05, + "loss": 3.0472, + "step": 13338 + }, + { + "epoch": 0.8280464336706189, + "grad_norm": 0.17432237395855624, + "learning_rate": 9.08561901111105e-05, + "loss": 2.9564, + "step": 13339 + }, + { + "epoch": 0.8281085107703768, + "grad_norm": 0.19007669646654904, + "learning_rate": 9.085410809728532e-05, + "loss": 3.0056, + "step": 13340 + }, + { + "epoch": 0.8281705878701348, + "grad_norm": 0.2662998910061767, + "learning_rate": 9.085202587031432e-05, + "loss": 3.0661, + "step": 13341 + }, + { + "epoch": 0.8282326649698926, + "grad_norm": 0.2027388717358472, + "learning_rate": 9.084994343020838e-05, + "loss": 3.0867, + "step": 13342 + }, + { + "epoch": 0.8282947420696505, + "grad_norm": 0.20582082774774185, + "learning_rate": 9.084786077697837e-05, + "loss": 3.0585, + "step": 13343 + }, + { + "epoch": 0.8283568191694084, + "grad_norm": 0.21198353309883117, + "learning_rate": 9.084577791063516e-05, + "loss": 3.0334, + "step": 13344 + }, + { + "epoch": 0.8284188962691663, + "grad_norm": 0.1683508994763752, + "learning_rate": 9.084369483118961e-05, + "loss": 3.0446, + "step": 13345 + }, + { + "epoch": 0.8284809733689242, + "grad_norm": 0.2182232697008033, + "learning_rate": 9.084161153865258e-05, + "loss": 3.0423, + "step": 13346 + }, + { + "epoch": 0.8285430504686822, + "grad_norm": 0.18984236500866705, + "learning_rate": 9.083952803303497e-05, + "loss": 3.0782, + "step": 13347 + }, + { + "epoch": 0.82860512756844, + "grad_norm": 0.20979723203854114, + "learning_rate": 9.083744431434761e-05, + "loss": 3.009, + "step": 13348 + }, + { + "epoch": 0.8286672046681979, + "grad_norm": 0.19165078910251668, + "learning_rate": 9.083536038260139e-05, + "loss": 3.0473, + "step": 13349 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.1945405486768472, + "learning_rate": 9.083327623780718e-05, + "loss": 3.0478, + "step": 13350 + }, + { + "epoch": 0.8287913588677137, + "grad_norm": 0.21155038734144518, + "learning_rate": 9.083119187997587e-05, + "loss": 2.9527, + "step": 13351 + }, + { + "epoch": 0.8288534359674716, + "grad_norm": 0.22600198337330257, + "learning_rate": 9.08291073091183e-05, + "loss": 3.0311, + "step": 13352 + }, + { + "epoch": 0.8289155130672295, + "grad_norm": 0.18185368214695394, + "learning_rate": 9.082702252524537e-05, + "loss": 3.0659, + "step": 13353 + }, + { + "epoch": 0.8289775901669874, + "grad_norm": 0.21576790151666533, + "learning_rate": 9.082493752836796e-05, + "loss": 3.0084, + "step": 13354 + }, + { + "epoch": 0.8290396672667453, + "grad_norm": 0.19755108640382185, + "learning_rate": 9.082285231849694e-05, + "loss": 2.9654, + "step": 13355 + }, + { + "epoch": 0.8291017443665032, + "grad_norm": 0.18652139033447404, + "learning_rate": 9.08207668956432e-05, + "loss": 3.0466, + "step": 13356 + }, + { + "epoch": 0.8291638214662611, + "grad_norm": 0.2027258619457855, + "learning_rate": 9.081868125981758e-05, + "loss": 2.9833, + "step": 13357 + }, + { + "epoch": 0.829225898566019, + "grad_norm": 0.1782390639185873, + "learning_rate": 9.081659541103102e-05, + "loss": 3.0205, + "step": 13358 + }, + { + "epoch": 0.8292879756657769, + "grad_norm": 0.21849199815824372, + "learning_rate": 9.081450934929437e-05, + "loss": 3.0892, + "step": 13359 + }, + { + "epoch": 0.8293500527655348, + "grad_norm": 0.23849191108567883, + "learning_rate": 9.081242307461851e-05, + "loss": 3.0812, + "step": 13360 + }, + { + "epoch": 0.8294121298652927, + "grad_norm": 0.20135734859668544, + "learning_rate": 9.081033658701432e-05, + "loss": 3.0116, + "step": 13361 + }, + { + "epoch": 0.8294742069650506, + "grad_norm": 0.2587814473885119, + "learning_rate": 9.08082498864927e-05, + "loss": 3.0664, + "step": 13362 + }, + { + "epoch": 0.8295362840648085, + "grad_norm": 0.2106490193518026, + "learning_rate": 9.080616297306455e-05, + "loss": 3.0534, + "step": 13363 + }, + { + "epoch": 0.8295983611645664, + "grad_norm": 0.2153787281153187, + "learning_rate": 9.080407584674073e-05, + "loss": 3.0175, + "step": 13364 + }, + { + "epoch": 0.8296604382643243, + "grad_norm": 0.20301026074722564, + "learning_rate": 9.080198850753214e-05, + "loss": 3.0221, + "step": 13365 + }, + { + "epoch": 0.8297225153640821, + "grad_norm": 0.178364370779885, + "learning_rate": 9.079990095544969e-05, + "loss": 2.9759, + "step": 13366 + }, + { + "epoch": 0.8297845924638401, + "grad_norm": 0.20399074192403457, + "learning_rate": 9.079781319050422e-05, + "loss": 3.0566, + "step": 13367 + }, + { + "epoch": 0.829846669563598, + "grad_norm": 0.20951718299520294, + "learning_rate": 9.079572521270667e-05, + "loss": 3.0944, + "step": 13368 + }, + { + "epoch": 0.8299087466633559, + "grad_norm": 0.17542415481061788, + "learning_rate": 9.079363702206792e-05, + "loss": 3.0354, + "step": 13369 + }, + { + "epoch": 0.8299708237631138, + "grad_norm": 0.18125826060445774, + "learning_rate": 9.079154861859885e-05, + "loss": 3.0414, + "step": 13370 + }, + { + "epoch": 0.8300329008628717, + "grad_norm": 0.1810423113266764, + "learning_rate": 9.078946000231039e-05, + "loss": 3.0276, + "step": 13371 + }, + { + "epoch": 0.8300949779626295, + "grad_norm": 0.19015044254408964, + "learning_rate": 9.07873711732134e-05, + "loss": 3.0932, + "step": 13372 + }, + { + "epoch": 0.8301570550623875, + "grad_norm": 0.21161735436304815, + "learning_rate": 9.07852821313188e-05, + "loss": 2.9791, + "step": 13373 + }, + { + "epoch": 0.8302191321621454, + "grad_norm": 0.2376314903468557, + "learning_rate": 9.078319287663747e-05, + "loss": 3.0371, + "step": 13374 + }, + { + "epoch": 0.8302812092619033, + "grad_norm": 0.16916376114234724, + "learning_rate": 9.078110340918033e-05, + "loss": 3.0102, + "step": 13375 + }, + { + "epoch": 0.8303432863616612, + "grad_norm": 0.24402711136716143, + "learning_rate": 9.077901372895828e-05, + "loss": 3.0119, + "step": 13376 + }, + { + "epoch": 0.8304053634614191, + "grad_norm": 0.17663875279368674, + "learning_rate": 9.077692383598221e-05, + "loss": 3.1037, + "step": 13377 + }, + { + "epoch": 0.8304674405611769, + "grad_norm": 0.17962818618037787, + "learning_rate": 9.077483373026303e-05, + "loss": 3.0215, + "step": 13378 + }, + { + "epoch": 0.8305295176609349, + "grad_norm": 0.1711074259705079, + "learning_rate": 9.077274341181166e-05, + "loss": 2.9823, + "step": 13379 + }, + { + "epoch": 0.8305915947606928, + "grad_norm": 0.19711703189972074, + "learning_rate": 9.077065288063897e-05, + "loss": 3.0318, + "step": 13380 + }, + { + "epoch": 0.8306536718604507, + "grad_norm": 0.18942832784494748, + "learning_rate": 9.076856213675588e-05, + "loss": 3.0675, + "step": 13381 + }, + { + "epoch": 0.8307157489602086, + "grad_norm": 0.19379452186157886, + "learning_rate": 9.076647118017332e-05, + "loss": 3.0454, + "step": 13382 + }, + { + "epoch": 0.8307778260599665, + "grad_norm": 0.2569045058705008, + "learning_rate": 9.076438001090217e-05, + "loss": 2.9602, + "step": 13383 + }, + { + "epoch": 0.8308399031597243, + "grad_norm": 0.20201211285508885, + "learning_rate": 9.076228862895338e-05, + "loss": 3.092, + "step": 13384 + }, + { + "epoch": 0.8309019802594823, + "grad_norm": 0.2976684883184927, + "learning_rate": 9.076019703433781e-05, + "loss": 3.1127, + "step": 13385 + }, + { + "epoch": 0.8309640573592402, + "grad_norm": 0.1746444020565507, + "learning_rate": 9.07581052270664e-05, + "loss": 2.9903, + "step": 13386 + }, + { + "epoch": 0.8310261344589981, + "grad_norm": 0.17908954642452524, + "learning_rate": 9.075601320715006e-05, + "loss": 2.9353, + "step": 13387 + }, + { + "epoch": 0.831088211558756, + "grad_norm": 0.1880491381145698, + "learning_rate": 9.075392097459971e-05, + "loss": 3.0484, + "step": 13388 + }, + { + "epoch": 0.8311502886585139, + "grad_norm": 0.188625711515537, + "learning_rate": 9.075182852942627e-05, + "loss": 2.9647, + "step": 13389 + }, + { + "epoch": 0.8312123657582717, + "grad_norm": 0.17887158287031282, + "learning_rate": 9.074973587164064e-05, + "loss": 2.9397, + "step": 13390 + }, + { + "epoch": 0.8312744428580296, + "grad_norm": 0.19093197822390717, + "learning_rate": 9.074764300125374e-05, + "loss": 3.1251, + "step": 13391 + }, + { + "epoch": 0.8313365199577876, + "grad_norm": 0.18066534093456033, + "learning_rate": 9.074554991827651e-05, + "loss": 3.0462, + "step": 13392 + }, + { + "epoch": 0.8313985970575455, + "grad_norm": 0.168084681224982, + "learning_rate": 9.074345662271984e-05, + "loss": 3.051, + "step": 13393 + }, + { + "epoch": 0.8314606741573034, + "grad_norm": 0.16602325982882257, + "learning_rate": 9.074136311459467e-05, + "loss": 3.0041, + "step": 13394 + }, + { + "epoch": 0.8315227512570613, + "grad_norm": 0.17272421892831263, + "learning_rate": 9.073926939391191e-05, + "loss": 3.0176, + "step": 13395 + }, + { + "epoch": 0.8315848283568191, + "grad_norm": 0.1753795484345907, + "learning_rate": 9.07371754606825e-05, + "loss": 2.9866, + "step": 13396 + }, + { + "epoch": 0.831646905456577, + "grad_norm": 0.16399683092198458, + "learning_rate": 9.073508131491736e-05, + "loss": 3.1349, + "step": 13397 + }, + { + "epoch": 0.831708982556335, + "grad_norm": 0.17750649987663844, + "learning_rate": 9.073298695662743e-05, + "loss": 3.084, + "step": 13398 + }, + { + "epoch": 0.8317710596560929, + "grad_norm": 0.17300215050602422, + "learning_rate": 9.073089238582357e-05, + "loss": 3.0322, + "step": 13399 + }, + { + "epoch": 0.8318331367558508, + "grad_norm": 0.2167261626936157, + "learning_rate": 9.072879760251679e-05, + "loss": 2.9993, + "step": 13400 + }, + { + "epoch": 0.8318952138556087, + "grad_norm": 0.18075731293735833, + "learning_rate": 9.072670260671799e-05, + "loss": 3.0087, + "step": 13401 + }, + { + "epoch": 0.8319572909553665, + "grad_norm": 0.2831762893064105, + "learning_rate": 9.072460739843807e-05, + "loss": 2.9953, + "step": 13402 + }, + { + "epoch": 0.8320193680551244, + "grad_norm": 0.20467546078168908, + "learning_rate": 9.0722511977688e-05, + "loss": 3.0765, + "step": 13403 + }, + { + "epoch": 0.8320814451548824, + "grad_norm": 0.17451497263295057, + "learning_rate": 9.072041634447871e-05, + "loss": 3.0385, + "step": 13404 + }, + { + "epoch": 0.8321435222546403, + "grad_norm": 0.19902258920535568, + "learning_rate": 9.071832049882112e-05, + "loss": 3.01, + "step": 13405 + }, + { + "epoch": 0.8322055993543982, + "grad_norm": 0.15658885452270552, + "learning_rate": 9.071622444072615e-05, + "loss": 2.9716, + "step": 13406 + }, + { + "epoch": 0.8322676764541561, + "grad_norm": 0.19304983813714507, + "learning_rate": 9.071412817020477e-05, + "loss": 3.1042, + "step": 13407 + }, + { + "epoch": 0.8323297535539139, + "grad_norm": 0.17273128225524514, + "learning_rate": 9.071203168726789e-05, + "loss": 2.9737, + "step": 13408 + }, + { + "epoch": 0.8323918306536718, + "grad_norm": 0.17957533569811734, + "learning_rate": 9.070993499192645e-05, + "loss": 3.0737, + "step": 13409 + }, + { + "epoch": 0.8324539077534298, + "grad_norm": 0.1606023147442174, + "learning_rate": 9.070783808419143e-05, + "loss": 3.0986, + "step": 13410 + }, + { + "epoch": 0.8325159848531877, + "grad_norm": 0.20548821273699597, + "learning_rate": 9.07057409640737e-05, + "loss": 3.0444, + "step": 13411 + }, + { + "epoch": 0.8325780619529456, + "grad_norm": 0.15753540957051504, + "learning_rate": 9.070364363158426e-05, + "loss": 3.1018, + "step": 13412 + }, + { + "epoch": 0.8326401390527035, + "grad_norm": 0.1855017657408, + "learning_rate": 9.070154608673403e-05, + "loss": 2.9894, + "step": 13413 + }, + { + "epoch": 0.8327022161524613, + "grad_norm": 0.16801131064119804, + "learning_rate": 9.069944832953393e-05, + "loss": 2.9739, + "step": 13414 + }, + { + "epoch": 0.8327642932522192, + "grad_norm": 0.200077943589539, + "learning_rate": 9.069735035999495e-05, + "loss": 3.0438, + "step": 13415 + }, + { + "epoch": 0.8328263703519772, + "grad_norm": 0.18088144163631203, + "learning_rate": 9.069525217812801e-05, + "loss": 3.0426, + "step": 13416 + }, + { + "epoch": 0.8328884474517351, + "grad_norm": 0.2660829719291508, + "learning_rate": 9.069315378394406e-05, + "loss": 2.9795, + "step": 13417 + }, + { + "epoch": 0.832950524551493, + "grad_norm": 0.19369170019341542, + "learning_rate": 9.069105517745404e-05, + "loss": 2.9153, + "step": 13418 + }, + { + "epoch": 0.8330126016512509, + "grad_norm": 0.18230437961026796, + "learning_rate": 9.06889563586689e-05, + "loss": 3.0454, + "step": 13419 + }, + { + "epoch": 0.8330746787510087, + "grad_norm": 0.2263365885317766, + "learning_rate": 9.068685732759961e-05, + "loss": 3.0953, + "step": 13420 + }, + { + "epoch": 0.8331367558507666, + "grad_norm": 0.23098623240354435, + "learning_rate": 9.068475808425711e-05, + "loss": 3.0129, + "step": 13421 + }, + { + "epoch": 0.8331988329505245, + "grad_norm": 0.1782669952838776, + "learning_rate": 9.068265862865235e-05, + "loss": 3.0637, + "step": 13422 + }, + { + "epoch": 0.8332609100502825, + "grad_norm": 0.1815452017397982, + "learning_rate": 9.068055896079627e-05, + "loss": 3.0385, + "step": 13423 + }, + { + "epoch": 0.8333229871500404, + "grad_norm": 0.23010801920954732, + "learning_rate": 9.067845908069988e-05, + "loss": 2.9813, + "step": 13424 + }, + { + "epoch": 0.8333850642497983, + "grad_norm": 0.2740334012102311, + "learning_rate": 9.067635898837406e-05, + "loss": 3.056, + "step": 13425 + }, + { + "epoch": 0.8334471413495561, + "grad_norm": 0.26473306520402906, + "learning_rate": 9.067425868382979e-05, + "loss": 3.1403, + "step": 13426 + }, + { + "epoch": 0.833509218449314, + "grad_norm": 0.32003993556302696, + "learning_rate": 9.067215816707806e-05, + "loss": 3.03, + "step": 13427 + }, + { + "epoch": 0.8335712955490719, + "grad_norm": 0.21509481845398026, + "learning_rate": 9.06700574381298e-05, + "loss": 3.042, + "step": 13428 + }, + { + "epoch": 0.8336333726488299, + "grad_norm": 0.2588860815753384, + "learning_rate": 9.066795649699598e-05, + "loss": 3.0762, + "step": 13429 + }, + { + "epoch": 0.8336954497485878, + "grad_norm": 0.19514354073439003, + "learning_rate": 9.066585534368756e-05, + "loss": 3.0461, + "step": 13430 + }, + { + "epoch": 0.8337575268483457, + "grad_norm": 0.23402526510916602, + "learning_rate": 9.06637539782155e-05, + "loss": 2.9588, + "step": 13431 + }, + { + "epoch": 0.8338196039481035, + "grad_norm": 0.19268306789936998, + "learning_rate": 9.066165240059076e-05, + "loss": 3.1339, + "step": 13432 + }, + { + "epoch": 0.8338816810478614, + "grad_norm": 0.3750825036207875, + "learning_rate": 9.065955061082431e-05, + "loss": 2.9511, + "step": 13433 + }, + { + "epoch": 0.8339437581476193, + "grad_norm": 0.17602517899118258, + "learning_rate": 9.065744860892712e-05, + "loss": 2.9766, + "step": 13434 + }, + { + "epoch": 0.8340058352473773, + "grad_norm": 0.27908638712027706, + "learning_rate": 9.065534639491013e-05, + "loss": 3.018, + "step": 13435 + }, + { + "epoch": 0.8340679123471352, + "grad_norm": 0.20312990586151927, + "learning_rate": 9.065324396878434e-05, + "loss": 2.9889, + "step": 13436 + }, + { + "epoch": 0.8341299894468931, + "grad_norm": 0.24304223743160028, + "learning_rate": 9.065114133056072e-05, + "loss": 3.0319, + "step": 13437 + }, + { + "epoch": 0.8341920665466509, + "grad_norm": 0.27298526150616964, + "learning_rate": 9.064903848025021e-05, + "loss": 2.9748, + "step": 13438 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.2184218005240199, + "learning_rate": 9.064693541786381e-05, + "loss": 3.0301, + "step": 13439 + }, + { + "epoch": 0.8343162207461667, + "grad_norm": 0.22961044511159534, + "learning_rate": 9.064483214341248e-05, + "loss": 3.0059, + "step": 13440 + }, + { + "epoch": 0.8343782978459247, + "grad_norm": 0.2481455418950097, + "learning_rate": 9.064272865690718e-05, + "loss": 3.1036, + "step": 13441 + }, + { + "epoch": 0.8344403749456826, + "grad_norm": 0.1936999029334824, + "learning_rate": 9.064062495835892e-05, + "loss": 3.0386, + "step": 13442 + }, + { + "epoch": 0.8345024520454405, + "grad_norm": 0.21565414521987158, + "learning_rate": 9.063852104777864e-05, + "loss": 3.0704, + "step": 13443 + }, + { + "epoch": 0.8345645291451983, + "grad_norm": 0.20301334356686218, + "learning_rate": 9.063641692517733e-05, + "loss": 3.0517, + "step": 13444 + }, + { + "epoch": 0.8346266062449562, + "grad_norm": 0.19896679260252104, + "learning_rate": 9.063431259056597e-05, + "loss": 3.0267, + "step": 13445 + }, + { + "epoch": 0.8346886833447141, + "grad_norm": 0.21929016009230057, + "learning_rate": 9.063220804395554e-05, + "loss": 2.9539, + "step": 13446 + }, + { + "epoch": 0.834750760444472, + "grad_norm": 0.26758459876923635, + "learning_rate": 9.063010328535701e-05, + "loss": 2.9, + "step": 13447 + }, + { + "epoch": 0.83481283754423, + "grad_norm": 0.25402283559484734, + "learning_rate": 9.062799831478137e-05, + "loss": 3.0532, + "step": 13448 + }, + { + "epoch": 0.8348749146439879, + "grad_norm": 0.18202506165361296, + "learning_rate": 9.06258931322396e-05, + "loss": 3.0169, + "step": 13449 + }, + { + "epoch": 0.8349369917437457, + "grad_norm": 0.17282882907318095, + "learning_rate": 9.062378773774269e-05, + "loss": 2.9853, + "step": 13450 + }, + { + "epoch": 0.8349990688435036, + "grad_norm": 0.18224967858988675, + "learning_rate": 9.062168213130162e-05, + "loss": 2.9624, + "step": 13451 + }, + { + "epoch": 0.8350611459432615, + "grad_norm": 0.17400542336163408, + "learning_rate": 9.061957631292737e-05, + "loss": 2.9386, + "step": 13452 + }, + { + "epoch": 0.8351232230430194, + "grad_norm": 0.2087744790703486, + "learning_rate": 9.061747028263093e-05, + "loss": 3.0347, + "step": 13453 + }, + { + "epoch": 0.8351853001427774, + "grad_norm": 0.17458822391081616, + "learning_rate": 9.061536404042327e-05, + "loss": 2.9648, + "step": 13454 + }, + { + "epoch": 0.8352473772425353, + "grad_norm": 0.17455134867729266, + "learning_rate": 9.06132575863154e-05, + "loss": 3.0054, + "step": 13455 + }, + { + "epoch": 0.8353094543422931, + "grad_norm": 0.1738685871565872, + "learning_rate": 9.061115092031832e-05, + "loss": 2.9616, + "step": 13456 + }, + { + "epoch": 0.835371531442051, + "grad_norm": 0.20310786981976467, + "learning_rate": 9.0609044042443e-05, + "loss": 3.0306, + "step": 13457 + }, + { + "epoch": 0.8354336085418089, + "grad_norm": 0.19177794858388592, + "learning_rate": 9.060693695270044e-05, + "loss": 3.0865, + "step": 13458 + }, + { + "epoch": 0.8354956856415668, + "grad_norm": 0.17659338790648518, + "learning_rate": 9.060482965110165e-05, + "loss": 2.98, + "step": 13459 + }, + { + "epoch": 0.8355577627413248, + "grad_norm": 0.19213728558865464, + "learning_rate": 9.060272213765758e-05, + "loss": 3.0056, + "step": 13460 + }, + { + "epoch": 0.8356198398410827, + "grad_norm": 0.17056935369735518, + "learning_rate": 9.060061441237926e-05, + "loss": 3.0295, + "step": 13461 + }, + { + "epoch": 0.8356819169408405, + "grad_norm": 0.20354794522021166, + "learning_rate": 9.059850647527768e-05, + "loss": 3.0214, + "step": 13462 + }, + { + "epoch": 0.8357439940405984, + "grad_norm": 0.19115472343275547, + "learning_rate": 9.059639832636383e-05, + "loss": 3.0408, + "step": 13463 + }, + { + "epoch": 0.8358060711403563, + "grad_norm": 0.1802980065264388, + "learning_rate": 9.059428996564872e-05, + "loss": 3.1085, + "step": 13464 + }, + { + "epoch": 0.8358681482401142, + "grad_norm": 0.16896588966434833, + "learning_rate": 9.059218139314336e-05, + "loss": 3.0789, + "step": 13465 + }, + { + "epoch": 0.8359302253398722, + "grad_norm": 0.17836117521723435, + "learning_rate": 9.059007260885872e-05, + "loss": 3.0447, + "step": 13466 + }, + { + "epoch": 0.8359923024396301, + "grad_norm": 0.17406484473034345, + "learning_rate": 9.058796361280582e-05, + "loss": 3.0803, + "step": 13467 + }, + { + "epoch": 0.8360543795393879, + "grad_norm": 0.1850237851382234, + "learning_rate": 9.058585440499565e-05, + "loss": 2.9953, + "step": 13468 + }, + { + "epoch": 0.8361164566391458, + "grad_norm": 0.18630539297595056, + "learning_rate": 9.058374498543925e-05, + "loss": 3.0179, + "step": 13469 + }, + { + "epoch": 0.8361785337389037, + "grad_norm": 0.17442116674839683, + "learning_rate": 9.058163535414758e-05, + "loss": 3.0503, + "step": 13470 + }, + { + "epoch": 0.8362406108386616, + "grad_norm": 0.21729261406947678, + "learning_rate": 9.057952551113167e-05, + "loss": 3.0538, + "step": 13471 + }, + { + "epoch": 0.8363026879384196, + "grad_norm": 0.21989402323606844, + "learning_rate": 9.057741545640253e-05, + "loss": 3.0176, + "step": 13472 + }, + { + "epoch": 0.8363647650381775, + "grad_norm": 0.23332122100034983, + "learning_rate": 9.057530518997115e-05, + "loss": 2.99, + "step": 13473 + }, + { + "epoch": 0.8364268421379353, + "grad_norm": 0.2305574408100472, + "learning_rate": 9.057319471184856e-05, + "loss": 2.9287, + "step": 13474 + }, + { + "epoch": 0.8364889192376932, + "grad_norm": 0.16408638553224933, + "learning_rate": 9.057108402204577e-05, + "loss": 2.9963, + "step": 13475 + }, + { + "epoch": 0.8365509963374511, + "grad_norm": 0.16682646151752378, + "learning_rate": 9.056897312057378e-05, + "loss": 2.9721, + "step": 13476 + }, + { + "epoch": 0.836613073437209, + "grad_norm": 0.1714643481906048, + "learning_rate": 9.056686200744361e-05, + "loss": 3.0166, + "step": 13477 + }, + { + "epoch": 0.836675150536967, + "grad_norm": 0.16734743534521532, + "learning_rate": 9.056475068266628e-05, + "loss": 2.9713, + "step": 13478 + }, + { + "epoch": 0.8367372276367249, + "grad_norm": 0.1833256122921184, + "learning_rate": 9.056263914625279e-05, + "loss": 2.8557, + "step": 13479 + }, + { + "epoch": 0.8367993047364827, + "grad_norm": 0.17304981306694756, + "learning_rate": 9.056052739821417e-05, + "loss": 2.9935, + "step": 13480 + }, + { + "epoch": 0.8368613818362406, + "grad_norm": 0.1778000825424507, + "learning_rate": 9.055841543856142e-05, + "loss": 2.9804, + "step": 13481 + }, + { + "epoch": 0.8369234589359985, + "grad_norm": 0.1814975174781382, + "learning_rate": 9.055630326730557e-05, + "loss": 3.0581, + "step": 13482 + }, + { + "epoch": 0.8369855360357564, + "grad_norm": 0.1547179873603142, + "learning_rate": 9.055419088445767e-05, + "loss": 3.0829, + "step": 13483 + }, + { + "epoch": 0.8370476131355143, + "grad_norm": 0.23535100849767038, + "learning_rate": 9.055207829002867e-05, + "loss": 3.0123, + "step": 13484 + }, + { + "epoch": 0.8371096902352723, + "grad_norm": 0.16532262447520357, + "learning_rate": 9.054996548402967e-05, + "loss": 2.9968, + "step": 13485 + }, + { + "epoch": 0.8371717673350301, + "grad_norm": 0.37283804675663984, + "learning_rate": 9.054785246647164e-05, + "loss": 3.0742, + "step": 13486 + }, + { + "epoch": 0.837233844434788, + "grad_norm": 0.27683116664895985, + "learning_rate": 9.054573923736562e-05, + "loss": 2.9989, + "step": 13487 + }, + { + "epoch": 0.8372959215345459, + "grad_norm": 0.16319446036719748, + "learning_rate": 9.054362579672263e-05, + "loss": 2.9878, + "step": 13488 + }, + { + "epoch": 0.8373579986343038, + "grad_norm": 0.18869901781728665, + "learning_rate": 9.054151214455371e-05, + "loss": 3.0079, + "step": 13489 + }, + { + "epoch": 0.8374200757340617, + "grad_norm": 0.18953979122291753, + "learning_rate": 9.053939828086988e-05, + "loss": 3.0591, + "step": 13490 + }, + { + "epoch": 0.8374821528338195, + "grad_norm": 0.19079389711429878, + "learning_rate": 9.053728420568217e-05, + "loss": 2.9797, + "step": 13491 + }, + { + "epoch": 0.8375442299335775, + "grad_norm": 0.17184208424703062, + "learning_rate": 9.05351699190016e-05, + "loss": 2.972, + "step": 13492 + }, + { + "epoch": 0.8376063070333354, + "grad_norm": 0.2296270015086294, + "learning_rate": 9.05330554208392e-05, + "loss": 2.9797, + "step": 13493 + }, + { + "epoch": 0.8376683841330933, + "grad_norm": 0.22250872459611762, + "learning_rate": 9.053094071120602e-05, + "loss": 3.0199, + "step": 13494 + }, + { + "epoch": 0.8377304612328512, + "grad_norm": 0.1996585721865905, + "learning_rate": 9.052882579011309e-05, + "loss": 3.1355, + "step": 13495 + }, + { + "epoch": 0.8377925383326091, + "grad_norm": 0.22802207499590507, + "learning_rate": 9.052671065757144e-05, + "loss": 3.0783, + "step": 13496 + }, + { + "epoch": 0.837854615432367, + "grad_norm": 0.18714047772721115, + "learning_rate": 9.052459531359209e-05, + "loss": 2.9974, + "step": 13497 + }, + { + "epoch": 0.8379166925321249, + "grad_norm": 0.21018712479929086, + "learning_rate": 9.052247975818608e-05, + "loss": 3.1031, + "step": 13498 + }, + { + "epoch": 0.8379787696318828, + "grad_norm": 0.19652613277630498, + "learning_rate": 9.052036399136448e-05, + "loss": 2.9624, + "step": 13499 + }, + { + "epoch": 0.8380408467316407, + "grad_norm": 0.21644072447414703, + "learning_rate": 9.051824801313828e-05, + "loss": 2.9519, + "step": 13500 + }, + { + "epoch": 0.8381029238313986, + "grad_norm": 0.20154777147759267, + "learning_rate": 9.051613182351857e-05, + "loss": 2.9803, + "step": 13501 + }, + { + "epoch": 0.8381650009311565, + "grad_norm": 0.17825164559154452, + "learning_rate": 9.051401542251634e-05, + "loss": 3.0091, + "step": 13502 + }, + { + "epoch": 0.8382270780309143, + "grad_norm": 0.18862323291916527, + "learning_rate": 9.051189881014267e-05, + "loss": 2.9944, + "step": 13503 + }, + { + "epoch": 0.8382891551306723, + "grad_norm": 0.1901987093287017, + "learning_rate": 9.05097819864086e-05, + "loss": 2.9934, + "step": 13504 + }, + { + "epoch": 0.8383512322304302, + "grad_norm": 0.20276084682975218, + "learning_rate": 9.050766495132514e-05, + "loss": 3.065, + "step": 13505 + }, + { + "epoch": 0.8384133093301881, + "grad_norm": 0.17178555892375644, + "learning_rate": 9.050554770490336e-05, + "loss": 3.0513, + "step": 13506 + }, + { + "epoch": 0.838475386429946, + "grad_norm": 0.1653296096993948, + "learning_rate": 9.050343024715433e-05, + "loss": 3.0576, + "step": 13507 + }, + { + "epoch": 0.8385374635297039, + "grad_norm": 0.23401975730379881, + "learning_rate": 9.050131257808904e-05, + "loss": 2.9918, + "step": 13508 + }, + { + "epoch": 0.8385995406294617, + "grad_norm": 0.18348205496461398, + "learning_rate": 9.049919469771859e-05, + "loss": 3.1452, + "step": 13509 + }, + { + "epoch": 0.8386616177292197, + "grad_norm": 0.17843826003171662, + "learning_rate": 9.0497076606054e-05, + "loss": 3.0919, + "step": 13510 + }, + { + "epoch": 0.8387236948289776, + "grad_norm": 0.18201919810465209, + "learning_rate": 9.049495830310635e-05, + "loss": 3.0687, + "step": 13511 + }, + { + "epoch": 0.8387857719287355, + "grad_norm": 0.18123061553294645, + "learning_rate": 9.049283978888665e-05, + "loss": 2.9275, + "step": 13512 + }, + { + "epoch": 0.8388478490284934, + "grad_norm": 0.16755219305574592, + "learning_rate": 9.049072106340599e-05, + "loss": 3.095, + "step": 13513 + }, + { + "epoch": 0.8389099261282513, + "grad_norm": 0.24399844798098672, + "learning_rate": 9.04886021266754e-05, + "loss": 3.0685, + "step": 13514 + }, + { + "epoch": 0.8389720032280091, + "grad_norm": 0.1797948910275622, + "learning_rate": 9.048648297870594e-05, + "loss": 3.0656, + "step": 13515 + }, + { + "epoch": 0.839034080327767, + "grad_norm": 0.190396692400816, + "learning_rate": 9.048436361950867e-05, + "loss": 2.9966, + "step": 13516 + }, + { + "epoch": 0.839096157427525, + "grad_norm": 0.18595584169850135, + "learning_rate": 9.048224404909466e-05, + "loss": 3.0532, + "step": 13517 + }, + { + "epoch": 0.8391582345272829, + "grad_norm": 0.19924487148395165, + "learning_rate": 9.048012426747495e-05, + "loss": 3.0231, + "step": 13518 + }, + { + "epoch": 0.8392203116270408, + "grad_norm": 0.22463390382262752, + "learning_rate": 9.04780042746606e-05, + "loss": 3.0988, + "step": 13519 + }, + { + "epoch": 0.8392823887267987, + "grad_norm": 0.2758381096036459, + "learning_rate": 9.047588407066268e-05, + "loss": 2.9623, + "step": 13520 + }, + { + "epoch": 0.8393444658265565, + "grad_norm": 0.17548168936328412, + "learning_rate": 9.047376365549223e-05, + "loss": 2.9046, + "step": 13521 + }, + { + "epoch": 0.8394065429263144, + "grad_norm": 0.25898986799073626, + "learning_rate": 9.047164302916036e-05, + "loss": 3.0872, + "step": 13522 + }, + { + "epoch": 0.8394686200260724, + "grad_norm": 0.2751532945577108, + "learning_rate": 9.046952219167809e-05, + "loss": 3.1137, + "step": 13523 + }, + { + "epoch": 0.8395306971258303, + "grad_norm": 0.19935667648839078, + "learning_rate": 9.046740114305649e-05, + "loss": 3.0487, + "step": 13524 + }, + { + "epoch": 0.8395927742255882, + "grad_norm": 0.21312314206093794, + "learning_rate": 9.046527988330664e-05, + "loss": 2.8938, + "step": 13525 + }, + { + "epoch": 0.8396548513253461, + "grad_norm": 0.1876311836685008, + "learning_rate": 9.04631584124396e-05, + "loss": 3.1045, + "step": 13526 + }, + { + "epoch": 0.8397169284251039, + "grad_norm": 0.17730171865064887, + "learning_rate": 9.046103673046645e-05, + "loss": 3.0183, + "step": 13527 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.1774204751971115, + "learning_rate": 9.045891483739824e-05, + "loss": 3.0534, + "step": 13528 + }, + { + "epoch": 0.8398410826246198, + "grad_norm": 0.17652085264592188, + "learning_rate": 9.045679273324603e-05, + "loss": 3.0578, + "step": 13529 + }, + { + "epoch": 0.8399031597243777, + "grad_norm": 0.161111177090128, + "learning_rate": 9.045467041802094e-05, + "loss": 3.0086, + "step": 13530 + }, + { + "epoch": 0.8399652368241356, + "grad_norm": 0.2230337272683917, + "learning_rate": 9.045254789173401e-05, + "loss": 3.035, + "step": 13531 + }, + { + "epoch": 0.8400273139238935, + "grad_norm": 0.1750302910134011, + "learning_rate": 9.045042515439631e-05, + "loss": 3.0165, + "step": 13532 + }, + { + "epoch": 0.8400893910236513, + "grad_norm": 0.20584092645661725, + "learning_rate": 9.044830220601892e-05, + "loss": 3.1749, + "step": 13533 + }, + { + "epoch": 0.8401514681234092, + "grad_norm": 0.17970850033984115, + "learning_rate": 9.044617904661293e-05, + "loss": 3.0549, + "step": 13534 + }, + { + "epoch": 0.8402135452231672, + "grad_norm": 0.1906869471062791, + "learning_rate": 9.044405567618938e-05, + "loss": 2.9621, + "step": 13535 + }, + { + "epoch": 0.8402756223229251, + "grad_norm": 0.29027078022855174, + "learning_rate": 9.044193209475939e-05, + "loss": 3.0281, + "step": 13536 + }, + { + "epoch": 0.840337699422683, + "grad_norm": 0.1842495386345624, + "learning_rate": 9.043980830233403e-05, + "loss": 3.0511, + "step": 13537 + }, + { + "epoch": 0.8403997765224409, + "grad_norm": 0.19737184138847902, + "learning_rate": 9.043768429892437e-05, + "loss": 3.029, + "step": 13538 + }, + { + "epoch": 0.8404618536221987, + "grad_norm": 0.19714260042036363, + "learning_rate": 9.04355600845415e-05, + "loss": 3.1122, + "step": 13539 + }, + { + "epoch": 0.8405239307219566, + "grad_norm": 0.20413658535183474, + "learning_rate": 9.043343565919648e-05, + "loss": 3.0512, + "step": 13540 + }, + { + "epoch": 0.8405860078217146, + "grad_norm": 0.18033607196608667, + "learning_rate": 9.043131102290043e-05, + "loss": 3.0386, + "step": 13541 + }, + { + "epoch": 0.8406480849214725, + "grad_norm": 0.19368596864718424, + "learning_rate": 9.04291861756644e-05, + "loss": 2.9825, + "step": 13542 + }, + { + "epoch": 0.8407101620212304, + "grad_norm": 0.20740178077334678, + "learning_rate": 9.04270611174995e-05, + "loss": 3.0834, + "step": 13543 + }, + { + "epoch": 0.8407722391209883, + "grad_norm": 0.17789560906586355, + "learning_rate": 9.04249358484168e-05, + "loss": 3.0778, + "step": 13544 + }, + { + "epoch": 0.8408343162207461, + "grad_norm": 0.19651260741935606, + "learning_rate": 9.04228103684274e-05, + "loss": 2.9566, + "step": 13545 + }, + { + "epoch": 0.840896393320504, + "grad_norm": 0.21242445030489213, + "learning_rate": 9.042068467754238e-05, + "loss": 3.0192, + "step": 13546 + }, + { + "epoch": 0.840958470420262, + "grad_norm": 0.19237598658811275, + "learning_rate": 9.041855877577285e-05, + "loss": 2.9432, + "step": 13547 + }, + { + "epoch": 0.8410205475200199, + "grad_norm": 0.19577080361580962, + "learning_rate": 9.041643266312987e-05, + "loss": 2.9946, + "step": 13548 + }, + { + "epoch": 0.8410826246197778, + "grad_norm": 0.30912680399561526, + "learning_rate": 9.041430633962456e-05, + "loss": 3.009, + "step": 13549 + }, + { + "epoch": 0.8411447017195357, + "grad_norm": 0.28922657562912824, + "learning_rate": 9.041217980526799e-05, + "loss": 3.0264, + "step": 13550 + }, + { + "epoch": 0.8412067788192935, + "grad_norm": 0.21569615240547332, + "learning_rate": 9.041005306007128e-05, + "loss": 3.0189, + "step": 13551 + }, + { + "epoch": 0.8412688559190514, + "grad_norm": 0.26251431437473366, + "learning_rate": 9.040792610404552e-05, + "loss": 2.9908, + "step": 13552 + }, + { + "epoch": 0.8413309330188093, + "grad_norm": 0.19773984317460705, + "learning_rate": 9.04057989372018e-05, + "loss": 2.99, + "step": 13553 + }, + { + "epoch": 0.8413930101185673, + "grad_norm": 0.22262554626247893, + "learning_rate": 9.040367155955121e-05, + "loss": 2.8883, + "step": 13554 + }, + { + "epoch": 0.8414550872183252, + "grad_norm": 0.20408941533934127, + "learning_rate": 9.040154397110485e-05, + "loss": 3.0542, + "step": 13555 + }, + { + "epoch": 0.8415171643180831, + "grad_norm": 0.20299898420159576, + "learning_rate": 9.039941617187383e-05, + "loss": 3.0426, + "step": 13556 + }, + { + "epoch": 0.8415792414178409, + "grad_norm": 0.19111949499259434, + "learning_rate": 9.039728816186925e-05, + "loss": 3.0119, + "step": 13557 + }, + { + "epoch": 0.8416413185175988, + "grad_norm": 0.18887208539732384, + "learning_rate": 9.039515994110221e-05, + "loss": 3.0127, + "step": 13558 + }, + { + "epoch": 0.8417033956173567, + "grad_norm": 0.1924723208796528, + "learning_rate": 9.039303150958383e-05, + "loss": 3.0026, + "step": 13559 + }, + { + "epoch": 0.8417654727171147, + "grad_norm": 0.19636772249785212, + "learning_rate": 9.039090286732518e-05, + "loss": 3.1126, + "step": 13560 + }, + { + "epoch": 0.8418275498168726, + "grad_norm": 0.18148270787275134, + "learning_rate": 9.038877401433738e-05, + "loss": 2.9178, + "step": 13561 + }, + { + "epoch": 0.8418896269166305, + "grad_norm": 0.18470959731291312, + "learning_rate": 9.038664495063155e-05, + "loss": 3.0018, + "step": 13562 + }, + { + "epoch": 0.8419517040163883, + "grad_norm": 0.19470414785476622, + "learning_rate": 9.03845156762188e-05, + "loss": 2.9939, + "step": 13563 + }, + { + "epoch": 0.8420137811161462, + "grad_norm": 0.2050581059448707, + "learning_rate": 9.038238619111022e-05, + "loss": 2.9299, + "step": 13564 + }, + { + "epoch": 0.8420758582159041, + "grad_norm": 0.16425870901944942, + "learning_rate": 9.038025649531693e-05, + "loss": 2.9553, + "step": 13565 + }, + { + "epoch": 0.8421379353156621, + "grad_norm": 0.2044088796945954, + "learning_rate": 9.037812658885003e-05, + "loss": 3.0711, + "step": 13566 + }, + { + "epoch": 0.84220001241542, + "grad_norm": 0.18189880521724264, + "learning_rate": 9.037599647172066e-05, + "loss": 3.049, + "step": 13567 + }, + { + "epoch": 0.8422620895151779, + "grad_norm": 0.18334657181459885, + "learning_rate": 9.03738661439399e-05, + "loss": 3.0928, + "step": 13568 + }, + { + "epoch": 0.8423241666149357, + "grad_norm": 0.2710681578899087, + "learning_rate": 9.037173560551889e-05, + "loss": 2.945, + "step": 13569 + }, + { + "epoch": 0.8423862437146936, + "grad_norm": 0.23762823739823916, + "learning_rate": 9.036960485646872e-05, + "loss": 3.074, + "step": 13570 + }, + { + "epoch": 0.8424483208144515, + "grad_norm": 0.22277247591060306, + "learning_rate": 9.036747389680053e-05, + "loss": 2.9506, + "step": 13571 + }, + { + "epoch": 0.8425103979142095, + "grad_norm": 0.2143768525808156, + "learning_rate": 9.036534272652542e-05, + "loss": 3.0842, + "step": 13572 + }, + { + "epoch": 0.8425724750139674, + "grad_norm": 0.19979795703757527, + "learning_rate": 9.036321134565453e-05, + "loss": 2.9562, + "step": 13573 + }, + { + "epoch": 0.8426345521137253, + "grad_norm": 0.18008402940913334, + "learning_rate": 9.036107975419896e-05, + "loss": 2.9815, + "step": 13574 + }, + { + "epoch": 0.8426966292134831, + "grad_norm": 0.20191101284458313, + "learning_rate": 9.035894795216984e-05, + "loss": 3.026, + "step": 13575 + }, + { + "epoch": 0.842758706313241, + "grad_norm": 0.1855327600832424, + "learning_rate": 9.03568159395783e-05, + "loss": 3.1105, + "step": 13576 + }, + { + "epoch": 0.8428207834129989, + "grad_norm": 0.17582051580349944, + "learning_rate": 9.035468371643545e-05, + "loss": 2.9431, + "step": 13577 + }, + { + "epoch": 0.8428828605127568, + "grad_norm": 0.17578417987501044, + "learning_rate": 9.03525512827524e-05, + "loss": 2.9949, + "step": 13578 + }, + { + "epoch": 0.8429449376125148, + "grad_norm": 0.16616818766948413, + "learning_rate": 9.035041863854032e-05, + "loss": 3.0569, + "step": 13579 + }, + { + "epoch": 0.8430070147122727, + "grad_norm": 0.2310576270231673, + "learning_rate": 9.03482857838103e-05, + "loss": 3.0448, + "step": 13580 + }, + { + "epoch": 0.8430690918120305, + "grad_norm": 0.17369562092245255, + "learning_rate": 9.034615271857347e-05, + "loss": 3.0581, + "step": 13581 + }, + { + "epoch": 0.8431311689117884, + "grad_norm": 0.19072665997984242, + "learning_rate": 9.034401944284099e-05, + "loss": 3.0542, + "step": 13582 + }, + { + "epoch": 0.8431932460115463, + "grad_norm": 0.17228384051389745, + "learning_rate": 9.034188595662394e-05, + "loss": 3.0188, + "step": 13583 + }, + { + "epoch": 0.8432553231113042, + "grad_norm": 0.20463434978008602, + "learning_rate": 9.03397522599335e-05, + "loss": 3.0016, + "step": 13584 + }, + { + "epoch": 0.8433174002110622, + "grad_norm": 0.17710253784349983, + "learning_rate": 9.033761835278076e-05, + "loss": 2.9544, + "step": 13585 + }, + { + "epoch": 0.8433794773108201, + "grad_norm": 0.17886470995401582, + "learning_rate": 9.033548423517688e-05, + "loss": 3.0561, + "step": 13586 + }, + { + "epoch": 0.8434415544105779, + "grad_norm": 0.21536443211382192, + "learning_rate": 9.0333349907133e-05, + "loss": 3.044, + "step": 13587 + }, + { + "epoch": 0.8435036315103358, + "grad_norm": 0.1861887120370384, + "learning_rate": 9.033121536866023e-05, + "loss": 3.0624, + "step": 13588 + }, + { + "epoch": 0.8435657086100937, + "grad_norm": 0.2056330212735898, + "learning_rate": 9.032908061976973e-05, + "loss": 3.0308, + "step": 13589 + }, + { + "epoch": 0.8436277857098516, + "grad_norm": 0.17087377097728015, + "learning_rate": 9.032694566047259e-05, + "loss": 2.9456, + "step": 13590 + }, + { + "epoch": 0.8436898628096096, + "grad_norm": 0.18082197350501195, + "learning_rate": 9.032481049078004e-05, + "loss": 3.0517, + "step": 13591 + }, + { + "epoch": 0.8437519399093675, + "grad_norm": 0.17791249708259949, + "learning_rate": 9.032267511070312e-05, + "loss": 2.9543, + "step": 13592 + }, + { + "epoch": 0.8438140170091253, + "grad_norm": 0.18052189362382035, + "learning_rate": 9.032053952025303e-05, + "loss": 3.0164, + "step": 13593 + }, + { + "epoch": 0.8438760941088832, + "grad_norm": 0.18404415358152193, + "learning_rate": 9.03184037194409e-05, + "loss": 2.9467, + "step": 13594 + }, + { + "epoch": 0.8439381712086411, + "grad_norm": 0.24228257958588456, + "learning_rate": 9.031626770827786e-05, + "loss": 3.0692, + "step": 13595 + }, + { + "epoch": 0.844000248308399, + "grad_norm": 0.1715979130732657, + "learning_rate": 9.031413148677508e-05, + "loss": 3.0268, + "step": 13596 + }, + { + "epoch": 0.844062325408157, + "grad_norm": 0.18796581778137877, + "learning_rate": 9.031199505494368e-05, + "loss": 2.9831, + "step": 13597 + }, + { + "epoch": 0.8441244025079149, + "grad_norm": 0.17538409932492122, + "learning_rate": 9.030985841279482e-05, + "loss": 3.0737, + "step": 13598 + }, + { + "epoch": 0.8441864796076727, + "grad_norm": 0.1752546582024641, + "learning_rate": 9.030772156033964e-05, + "loss": 3.0206, + "step": 13599 + }, + { + "epoch": 0.8442485567074306, + "grad_norm": 0.2492315880987512, + "learning_rate": 9.03055844975893e-05, + "loss": 3.0576, + "step": 13600 + }, + { + "epoch": 0.8443106338071885, + "grad_norm": 0.24429845667303549, + "learning_rate": 9.030344722455492e-05, + "loss": 2.9727, + "step": 13601 + }, + { + "epoch": 0.8443727109069464, + "grad_norm": 0.19062422572432064, + "learning_rate": 9.030130974124768e-05, + "loss": 2.9731, + "step": 13602 + }, + { + "epoch": 0.8444347880067044, + "grad_norm": 0.3191630043951797, + "learning_rate": 9.029917204767875e-05, + "loss": 2.9682, + "step": 13603 + }, + { + "epoch": 0.8444968651064623, + "grad_norm": 0.23524876482433565, + "learning_rate": 9.029703414385924e-05, + "loss": 3.0514, + "step": 13604 + }, + { + "epoch": 0.8445589422062201, + "grad_norm": 0.22283513429019042, + "learning_rate": 9.02948960298003e-05, + "loss": 3.0193, + "step": 13605 + }, + { + "epoch": 0.844621019305978, + "grad_norm": 0.26449253797191, + "learning_rate": 9.029275770551313e-05, + "loss": 3.0663, + "step": 13606 + }, + { + "epoch": 0.8446830964057359, + "grad_norm": 0.33922475588984635, + "learning_rate": 9.029061917100885e-05, + "loss": 2.9587, + "step": 13607 + }, + { + "epoch": 0.8447451735054938, + "grad_norm": 0.22746432800229258, + "learning_rate": 9.028848042629862e-05, + "loss": 3.0415, + "step": 13608 + }, + { + "epoch": 0.8448072506052517, + "grad_norm": 0.2236279899377029, + "learning_rate": 9.028634147139364e-05, + "loss": 2.9715, + "step": 13609 + }, + { + "epoch": 0.8448693277050097, + "grad_norm": 0.29070467625824065, + "learning_rate": 9.028420230630501e-05, + "loss": 2.9913, + "step": 13610 + }, + { + "epoch": 0.8449314048047675, + "grad_norm": 0.21955936428342548, + "learning_rate": 9.028206293104392e-05, + "loss": 3.0584, + "step": 13611 + }, + { + "epoch": 0.8449934819045254, + "grad_norm": 0.21093415328302548, + "learning_rate": 9.027992334562153e-05, + "loss": 2.9544, + "step": 13612 + }, + { + "epoch": 0.8450555590042833, + "grad_norm": 0.21720408095043853, + "learning_rate": 9.027778355004901e-05, + "loss": 3.0671, + "step": 13613 + }, + { + "epoch": 0.8451176361040412, + "grad_norm": 0.201842145106362, + "learning_rate": 9.02756435443375e-05, + "loss": 2.9613, + "step": 13614 + }, + { + "epoch": 0.8451797132037991, + "grad_norm": 0.21666459180376405, + "learning_rate": 9.02735033284982e-05, + "loss": 3.0006, + "step": 13615 + }, + { + "epoch": 0.8452417903035571, + "grad_norm": 0.511037649902698, + "learning_rate": 9.027136290254224e-05, + "loss": 3.1007, + "step": 13616 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.3758988535447738, + "learning_rate": 9.02692222664808e-05, + "loss": 2.9302, + "step": 13617 + }, + { + "epoch": 0.8453659445030728, + "grad_norm": 0.26639593669360023, + "learning_rate": 9.026708142032507e-05, + "loss": 3.0166, + "step": 13618 + }, + { + "epoch": 0.8454280216028307, + "grad_norm": 0.260750445403014, + "learning_rate": 9.026494036408619e-05, + "loss": 2.9616, + "step": 13619 + }, + { + "epoch": 0.8454900987025886, + "grad_norm": 0.2336235636564405, + "learning_rate": 9.026279909777535e-05, + "loss": 3.0525, + "step": 13620 + }, + { + "epoch": 0.8455521758023465, + "grad_norm": 0.40515568872568114, + "learning_rate": 9.02606576214037e-05, + "loss": 3.015, + "step": 13621 + }, + { + "epoch": 0.8456142529021045, + "grad_norm": 0.3352668299685557, + "learning_rate": 9.025851593498243e-05, + "loss": 3.0356, + "step": 13622 + }, + { + "epoch": 0.8456763300018623, + "grad_norm": 0.2569052309035299, + "learning_rate": 9.025637403852271e-05, + "loss": 3.0116, + "step": 13623 + }, + { + "epoch": 0.8457384071016202, + "grad_norm": 0.2197647909181825, + "learning_rate": 9.025423193203573e-05, + "loss": 3.0309, + "step": 13624 + }, + { + "epoch": 0.8458004842013781, + "grad_norm": 0.21313771948236793, + "learning_rate": 9.025208961553262e-05, + "loss": 3.0429, + "step": 13625 + }, + { + "epoch": 0.845862561301136, + "grad_norm": 0.2036667897563017, + "learning_rate": 9.02499470890246e-05, + "loss": 3.0548, + "step": 13626 + }, + { + "epoch": 0.8459246384008939, + "grad_norm": 0.2962463146897952, + "learning_rate": 9.024780435252285e-05, + "loss": 3.0026, + "step": 13627 + }, + { + "epoch": 0.8459867155006519, + "grad_norm": 0.19626090573726945, + "learning_rate": 9.024566140603851e-05, + "loss": 3.1043, + "step": 13628 + }, + { + "epoch": 0.8460487926004097, + "grad_norm": 0.196253645249547, + "learning_rate": 9.02435182495828e-05, + "loss": 2.9874, + "step": 13629 + }, + { + "epoch": 0.8461108697001676, + "grad_norm": 0.21737549864605946, + "learning_rate": 9.024137488316689e-05, + "loss": 3.0816, + "step": 13630 + }, + { + "epoch": 0.8461729467999255, + "grad_norm": 0.28845435685302384, + "learning_rate": 9.023923130680195e-05, + "loss": 3.0924, + "step": 13631 + }, + { + "epoch": 0.8462350238996834, + "grad_norm": 0.20129264058464647, + "learning_rate": 9.023708752049917e-05, + "loss": 3.0625, + "step": 13632 + }, + { + "epoch": 0.8462971009994413, + "grad_norm": 0.21156752066240295, + "learning_rate": 9.023494352426973e-05, + "loss": 3.0147, + "step": 13633 + }, + { + "epoch": 0.8463591780991992, + "grad_norm": 0.22550545119465537, + "learning_rate": 9.023279931812484e-05, + "loss": 3.002, + "step": 13634 + }, + { + "epoch": 0.8464212551989571, + "grad_norm": 0.18925535556706197, + "learning_rate": 9.023065490207565e-05, + "loss": 2.9604, + "step": 13635 + }, + { + "epoch": 0.846483332298715, + "grad_norm": 0.17392028015964137, + "learning_rate": 9.022851027613338e-05, + "loss": 3.0093, + "step": 13636 + }, + { + "epoch": 0.8465454093984729, + "grad_norm": 0.18880927411997903, + "learning_rate": 9.02263654403092e-05, + "loss": 3.0457, + "step": 13637 + }, + { + "epoch": 0.8466074864982308, + "grad_norm": 0.18463514087658198, + "learning_rate": 9.022422039461431e-05, + "loss": 3.0655, + "step": 13638 + }, + { + "epoch": 0.8466695635979887, + "grad_norm": 0.4334426994809461, + "learning_rate": 9.022207513905991e-05, + "loss": 2.9625, + "step": 13639 + }, + { + "epoch": 0.8467316406977466, + "grad_norm": 0.18459154440855524, + "learning_rate": 9.021992967365716e-05, + "loss": 2.9949, + "step": 13640 + }, + { + "epoch": 0.8467937177975045, + "grad_norm": 0.2612949755161113, + "learning_rate": 9.021778399841729e-05, + "loss": 2.9994, + "step": 13641 + }, + { + "epoch": 0.8468557948972624, + "grad_norm": 0.20948802118036822, + "learning_rate": 9.021563811335145e-05, + "loss": 3.0872, + "step": 13642 + }, + { + "epoch": 0.8469178719970203, + "grad_norm": 0.17743176070320585, + "learning_rate": 9.021349201847089e-05, + "loss": 2.9721, + "step": 13643 + }, + { + "epoch": 0.8469799490967782, + "grad_norm": 0.18709917737531742, + "learning_rate": 9.021134571378677e-05, + "loss": 3.024, + "step": 13644 + }, + { + "epoch": 0.8470420261965361, + "grad_norm": 0.15803719873266117, + "learning_rate": 9.02091991993103e-05, + "loss": 3.0095, + "step": 13645 + }, + { + "epoch": 0.847104103296294, + "grad_norm": 0.2257312372776035, + "learning_rate": 9.020705247505268e-05, + "loss": 3.0193, + "step": 13646 + }, + { + "epoch": 0.8471661803960518, + "grad_norm": 0.18094746810915757, + "learning_rate": 9.020490554102509e-05, + "loss": 2.9386, + "step": 13647 + }, + { + "epoch": 0.8472282574958098, + "grad_norm": 0.15564023485260237, + "learning_rate": 9.020275839723877e-05, + "loss": 3.0203, + "step": 13648 + }, + { + "epoch": 0.8472903345955677, + "grad_norm": 0.22491464187847968, + "learning_rate": 9.020061104370488e-05, + "loss": 3.0142, + "step": 13649 + }, + { + "epoch": 0.8473524116953256, + "grad_norm": 0.1933739781885947, + "learning_rate": 9.019846348043466e-05, + "loss": 3.0073, + "step": 13650 + }, + { + "epoch": 0.8474144887950835, + "grad_norm": 0.23068589148403756, + "learning_rate": 9.019631570743928e-05, + "loss": 3.0478, + "step": 13651 + }, + { + "epoch": 0.8474765658948414, + "grad_norm": 0.19198747863893598, + "learning_rate": 9.019416772472998e-05, + "loss": 2.9928, + "step": 13652 + }, + { + "epoch": 0.8475386429945992, + "grad_norm": 0.21043862406886737, + "learning_rate": 9.019201953231794e-05, + "loss": 2.9878, + "step": 13653 + }, + { + "epoch": 0.8476007200943572, + "grad_norm": 0.26351172210490237, + "learning_rate": 9.018987113021438e-05, + "loss": 3.0908, + "step": 13654 + }, + { + "epoch": 0.8476627971941151, + "grad_norm": 0.2654596824062853, + "learning_rate": 9.018772251843051e-05, + "loss": 3.0704, + "step": 13655 + }, + { + "epoch": 0.847724874293873, + "grad_norm": 0.21570902755879315, + "learning_rate": 9.018557369697753e-05, + "loss": 3.0439, + "step": 13656 + }, + { + "epoch": 0.8477869513936309, + "grad_norm": 0.185676118108979, + "learning_rate": 9.018342466586665e-05, + "loss": 3.0681, + "step": 13657 + }, + { + "epoch": 0.8478490284933888, + "grad_norm": 0.20349752671498506, + "learning_rate": 9.018127542510911e-05, + "loss": 2.9479, + "step": 13658 + }, + { + "epoch": 0.8479111055931466, + "grad_norm": 0.2105364784547288, + "learning_rate": 9.017912597471607e-05, + "loss": 3.0452, + "step": 13659 + }, + { + "epoch": 0.8479731826929046, + "grad_norm": 0.20661644496676843, + "learning_rate": 9.01769763146988e-05, + "loss": 2.9922, + "step": 13660 + }, + { + "epoch": 0.8480352597926625, + "grad_norm": 0.25515635927832203, + "learning_rate": 9.017482644506849e-05, + "loss": 3.0617, + "step": 13661 + }, + { + "epoch": 0.8480973368924204, + "grad_norm": 0.2430628753687657, + "learning_rate": 9.017267636583635e-05, + "loss": 3.126, + "step": 13662 + }, + { + "epoch": 0.8481594139921783, + "grad_norm": 0.2070542448697089, + "learning_rate": 9.01705260770136e-05, + "loss": 3.0209, + "step": 13663 + }, + { + "epoch": 0.8482214910919362, + "grad_norm": 0.2297232823817983, + "learning_rate": 9.016837557861149e-05, + "loss": 3.0238, + "step": 13664 + }, + { + "epoch": 0.848283568191694, + "grad_norm": 0.16768829085753234, + "learning_rate": 9.01662248706412e-05, + "loss": 2.9149, + "step": 13665 + }, + { + "epoch": 0.848345645291452, + "grad_norm": 0.17661971287734363, + "learning_rate": 9.016407395311395e-05, + "loss": 3.0308, + "step": 13666 + }, + { + "epoch": 0.8484077223912099, + "grad_norm": 0.3889566461349097, + "learning_rate": 9.016192282604097e-05, + "loss": 3.0528, + "step": 13667 + }, + { + "epoch": 0.8484697994909678, + "grad_norm": 0.30593002497237165, + "learning_rate": 9.015977148943351e-05, + "loss": 3.1449, + "step": 13668 + }, + { + "epoch": 0.8485318765907257, + "grad_norm": 0.18029828546119295, + "learning_rate": 9.015761994330277e-05, + "loss": 3.087, + "step": 13669 + }, + { + "epoch": 0.8485939536904836, + "grad_norm": 0.2776415931469668, + "learning_rate": 9.015546818765998e-05, + "loss": 3.0556, + "step": 13670 + }, + { + "epoch": 0.8486560307902414, + "grad_norm": 0.20368363516583105, + "learning_rate": 9.015331622251635e-05, + "loss": 2.9575, + "step": 13671 + }, + { + "epoch": 0.8487181078899994, + "grad_norm": 0.22715033532856183, + "learning_rate": 9.015116404788313e-05, + "loss": 3.0901, + "step": 13672 + }, + { + "epoch": 0.8487801849897573, + "grad_norm": 0.19159272899738292, + "learning_rate": 9.014901166377156e-05, + "loss": 3.0938, + "step": 13673 + }, + { + "epoch": 0.8488422620895152, + "grad_norm": 0.2001860453623963, + "learning_rate": 9.014685907019282e-05, + "loss": 3.0311, + "step": 13674 + }, + { + "epoch": 0.8489043391892731, + "grad_norm": 0.18411993147561187, + "learning_rate": 9.014470626715818e-05, + "loss": 3.0562, + "step": 13675 + }, + { + "epoch": 0.848966416289031, + "grad_norm": 0.17120259058573778, + "learning_rate": 9.014255325467886e-05, + "loss": 2.9767, + "step": 13676 + }, + { + "epoch": 0.8490284933887888, + "grad_norm": 0.18583569384224669, + "learning_rate": 9.014040003276611e-05, + "loss": 3.0171, + "step": 13677 + }, + { + "epoch": 0.8490905704885467, + "grad_norm": 0.19796219205946183, + "learning_rate": 9.013824660143113e-05, + "loss": 2.964, + "step": 13678 + }, + { + "epoch": 0.8491526475883047, + "grad_norm": 0.18587011991711969, + "learning_rate": 9.013609296068519e-05, + "loss": 3.0109, + "step": 13679 + }, + { + "epoch": 0.8492147246880626, + "grad_norm": 0.19527812655291002, + "learning_rate": 9.01339391105395e-05, + "loss": 3.0274, + "step": 13680 + }, + { + "epoch": 0.8492768017878205, + "grad_norm": 0.16618436764601757, + "learning_rate": 9.01317850510053e-05, + "loss": 2.9763, + "step": 13681 + }, + { + "epoch": 0.8493388788875784, + "grad_norm": 0.16623325668720376, + "learning_rate": 9.012963078209384e-05, + "loss": 2.957, + "step": 13682 + }, + { + "epoch": 0.8494009559873362, + "grad_norm": 0.16519407745895498, + "learning_rate": 9.012747630381636e-05, + "loss": 3.0733, + "step": 13683 + }, + { + "epoch": 0.8494630330870941, + "grad_norm": 0.17255823386845087, + "learning_rate": 9.01253216161841e-05, + "loss": 2.9984, + "step": 13684 + }, + { + "epoch": 0.8495251101868521, + "grad_norm": 0.16780326697986483, + "learning_rate": 9.012316671920828e-05, + "loss": 3.0376, + "step": 13685 + }, + { + "epoch": 0.84958718728661, + "grad_norm": 0.16839922590431766, + "learning_rate": 9.012101161290016e-05, + "loss": 3.02, + "step": 13686 + }, + { + "epoch": 0.8496492643863679, + "grad_norm": 0.1709210244647655, + "learning_rate": 9.0118856297271e-05, + "loss": 3.1078, + "step": 13687 + }, + { + "epoch": 0.8497113414861258, + "grad_norm": 0.16081347176932412, + "learning_rate": 9.011670077233201e-05, + "loss": 3.0037, + "step": 13688 + }, + { + "epoch": 0.8497734185858836, + "grad_norm": 0.15833493028533926, + "learning_rate": 9.011454503809446e-05, + "loss": 3.0187, + "step": 13689 + }, + { + "epoch": 0.8498354956856415, + "grad_norm": 0.17418880761295286, + "learning_rate": 9.011238909456959e-05, + "loss": 3.0409, + "step": 13690 + }, + { + "epoch": 0.8498975727853995, + "grad_norm": 0.17725048637726934, + "learning_rate": 9.011023294176866e-05, + "loss": 2.9901, + "step": 13691 + }, + { + "epoch": 0.8499596498851574, + "grad_norm": 0.16867544416898056, + "learning_rate": 9.010807657970288e-05, + "loss": 3.1244, + "step": 13692 + }, + { + "epoch": 0.8500217269849153, + "grad_norm": 0.1556944507531195, + "learning_rate": 9.010592000838355e-05, + "loss": 2.9724, + "step": 13693 + }, + { + "epoch": 0.8500838040846732, + "grad_norm": 0.23693617318953414, + "learning_rate": 9.01037632278219e-05, + "loss": 3.0594, + "step": 13694 + }, + { + "epoch": 0.850145881184431, + "grad_norm": 0.15250057316109555, + "learning_rate": 9.010160623802916e-05, + "loss": 3.0768, + "step": 13695 + }, + { + "epoch": 0.8502079582841889, + "grad_norm": 0.1731570825045596, + "learning_rate": 9.009944903901664e-05, + "loss": 2.9829, + "step": 13696 + }, + { + "epoch": 0.8502700353839469, + "grad_norm": 0.15748531066801275, + "learning_rate": 9.009729163079554e-05, + "loss": 2.9105, + "step": 13697 + }, + { + "epoch": 0.8503321124837048, + "grad_norm": 0.15776587330950798, + "learning_rate": 9.009513401337712e-05, + "loss": 3.0726, + "step": 13698 + }, + { + "epoch": 0.8503941895834627, + "grad_norm": 0.15883771167537483, + "learning_rate": 9.009297618677268e-05, + "loss": 3.0293, + "step": 13699 + }, + { + "epoch": 0.8504562666832206, + "grad_norm": 0.16421706434704098, + "learning_rate": 9.009081815099345e-05, + "loss": 3.0669, + "step": 13700 + }, + { + "epoch": 0.8505183437829784, + "grad_norm": 0.20907325280766426, + "learning_rate": 9.008865990605067e-05, + "loss": 3.0363, + "step": 13701 + }, + { + "epoch": 0.8505804208827363, + "grad_norm": 0.1814095966020562, + "learning_rate": 9.008650145195563e-05, + "loss": 3.0442, + "step": 13702 + }, + { + "epoch": 0.8506424979824943, + "grad_norm": 0.15211896329302374, + "learning_rate": 9.008434278871958e-05, + "loss": 2.9382, + "step": 13703 + }, + { + "epoch": 0.8507045750822522, + "grad_norm": 0.16567520132038197, + "learning_rate": 9.008218391635377e-05, + "loss": 3.052, + "step": 13704 + }, + { + "epoch": 0.8507666521820101, + "grad_norm": 0.3724203790787073, + "learning_rate": 9.008002483486949e-05, + "loss": 2.9828, + "step": 13705 + }, + { + "epoch": 0.850828729281768, + "grad_norm": 0.3482734366627673, + "learning_rate": 9.0077865544278e-05, + "loss": 2.965, + "step": 13706 + }, + { + "epoch": 0.8508908063815258, + "grad_norm": 0.20505553529847312, + "learning_rate": 9.007570604459053e-05, + "loss": 2.9911, + "step": 13707 + }, + { + "epoch": 0.8509528834812837, + "grad_norm": 0.22223239006599524, + "learning_rate": 9.00735463358184e-05, + "loss": 3.0164, + "step": 13708 + }, + { + "epoch": 0.8510149605810416, + "grad_norm": 0.3310189016799863, + "learning_rate": 9.007138641797284e-05, + "loss": 3.0598, + "step": 13709 + }, + { + "epoch": 0.8510770376807996, + "grad_norm": 0.2269320433865976, + "learning_rate": 9.006922629106512e-05, + "loss": 2.9154, + "step": 13710 + }, + { + "epoch": 0.8511391147805575, + "grad_norm": 0.1921592226392521, + "learning_rate": 9.006706595510652e-05, + "loss": 2.9941, + "step": 13711 + }, + { + "epoch": 0.8512011918803154, + "grad_norm": 0.19018135506903688, + "learning_rate": 9.006490541010832e-05, + "loss": 3.0534, + "step": 13712 + }, + { + "epoch": 0.8512632689800732, + "grad_norm": 0.20259975417921042, + "learning_rate": 9.006274465608177e-05, + "loss": 3.0273, + "step": 13713 + }, + { + "epoch": 0.8513253460798311, + "grad_norm": 0.20126828411221204, + "learning_rate": 9.006058369303819e-05, + "loss": 3.0106, + "step": 13714 + }, + { + "epoch": 0.851387423179589, + "grad_norm": 0.19827132175380455, + "learning_rate": 9.005842252098877e-05, + "loss": 3.016, + "step": 13715 + }, + { + "epoch": 0.851449500279347, + "grad_norm": 0.3131874153229967, + "learning_rate": 9.005626113994488e-05, + "loss": 3.0133, + "step": 13716 + }, + { + "epoch": 0.8515115773791049, + "grad_norm": 0.20620898033928806, + "learning_rate": 9.005409954991772e-05, + "loss": 3.0097, + "step": 13717 + }, + { + "epoch": 0.8515736544788628, + "grad_norm": 0.20356922908093086, + "learning_rate": 9.00519377509186e-05, + "loss": 2.9777, + "step": 13718 + }, + { + "epoch": 0.8516357315786206, + "grad_norm": 0.21407651890146026, + "learning_rate": 9.004977574295882e-05, + "loss": 3.0998, + "step": 13719 + }, + { + "epoch": 0.8516978086783785, + "grad_norm": 0.18132494084209855, + "learning_rate": 9.004761352604963e-05, + "loss": 3.0798, + "step": 13720 + }, + { + "epoch": 0.8517598857781364, + "grad_norm": 0.20391016295029227, + "learning_rate": 9.00454511002023e-05, + "loss": 2.9934, + "step": 13721 + }, + { + "epoch": 0.8518219628778944, + "grad_norm": 0.20914036285874157, + "learning_rate": 9.004328846542816e-05, + "loss": 3.1043, + "step": 13722 + }, + { + "epoch": 0.8518840399776523, + "grad_norm": 0.19108065334927268, + "learning_rate": 9.004112562173845e-05, + "loss": 3.1072, + "step": 13723 + }, + { + "epoch": 0.8519461170774102, + "grad_norm": 0.17640397183558085, + "learning_rate": 9.003896256914446e-05, + "loss": 3.0433, + "step": 13724 + }, + { + "epoch": 0.852008194177168, + "grad_norm": 0.2611248337547463, + "learning_rate": 9.003679930765749e-05, + "loss": 3.002, + "step": 13725 + }, + { + "epoch": 0.8520702712769259, + "grad_norm": 0.1687594808458321, + "learning_rate": 9.003463583728881e-05, + "loss": 2.9032, + "step": 13726 + }, + { + "epoch": 0.8521323483766838, + "grad_norm": 0.23915258935349631, + "learning_rate": 9.003247215804971e-05, + "loss": 3.1061, + "step": 13727 + }, + { + "epoch": 0.8521944254764418, + "grad_norm": 0.16732518418132517, + "learning_rate": 9.003030826995151e-05, + "loss": 2.9427, + "step": 13728 + }, + { + "epoch": 0.8522565025761997, + "grad_norm": 0.22194178039657822, + "learning_rate": 9.002814417300547e-05, + "loss": 3.0077, + "step": 13729 + }, + { + "epoch": 0.8523185796759576, + "grad_norm": 0.18798762730075988, + "learning_rate": 9.002597986722286e-05, + "loss": 2.9097, + "step": 13730 + }, + { + "epoch": 0.8523806567757154, + "grad_norm": 0.18023770516522322, + "learning_rate": 9.002381535261502e-05, + "loss": 2.8918, + "step": 13731 + }, + { + "epoch": 0.8524427338754733, + "grad_norm": 0.1824443938020193, + "learning_rate": 9.00216506291932e-05, + "loss": 3.0077, + "step": 13732 + }, + { + "epoch": 0.8525048109752312, + "grad_norm": 0.16374614277875804, + "learning_rate": 9.001948569696873e-05, + "loss": 2.9723, + "step": 13733 + }, + { + "epoch": 0.8525668880749891, + "grad_norm": 0.26207825181721806, + "learning_rate": 9.001732055595289e-05, + "loss": 3.0094, + "step": 13734 + }, + { + "epoch": 0.8526289651747471, + "grad_norm": 0.19316421639925205, + "learning_rate": 9.001515520615695e-05, + "loss": 3.0802, + "step": 13735 + }, + { + "epoch": 0.852691042274505, + "grad_norm": 0.22179498908842452, + "learning_rate": 9.001298964759226e-05, + "loss": 3.0292, + "step": 13736 + }, + { + "epoch": 0.8527531193742628, + "grad_norm": 0.2875302143391292, + "learning_rate": 9.001082388027008e-05, + "loss": 2.8955, + "step": 13737 + }, + { + "epoch": 0.8528151964740207, + "grad_norm": 0.2541359522636499, + "learning_rate": 9.000865790420171e-05, + "loss": 3.0373, + "step": 13738 + }, + { + "epoch": 0.8528772735737786, + "grad_norm": 0.21449570641299698, + "learning_rate": 9.000649171939847e-05, + "loss": 3.0249, + "step": 13739 + }, + { + "epoch": 0.8529393506735365, + "grad_norm": 0.21052259683853464, + "learning_rate": 9.000432532587164e-05, + "loss": 3.0027, + "step": 13740 + }, + { + "epoch": 0.8530014277732945, + "grad_norm": 0.21887223200624517, + "learning_rate": 9.000215872363254e-05, + "loss": 3.0236, + "step": 13741 + }, + { + "epoch": 0.8530635048730524, + "grad_norm": 0.23919436333585076, + "learning_rate": 8.999999191269248e-05, + "loss": 2.8984, + "step": 13742 + }, + { + "epoch": 0.8531255819728102, + "grad_norm": 0.19917457689221424, + "learning_rate": 8.999782489306272e-05, + "loss": 3.004, + "step": 13743 + }, + { + "epoch": 0.8531876590725681, + "grad_norm": 0.2039777663207536, + "learning_rate": 8.999565766475461e-05, + "loss": 2.9869, + "step": 13744 + }, + { + "epoch": 0.853249736172326, + "grad_norm": 0.19685636689550537, + "learning_rate": 8.999349022777946e-05, + "loss": 3.0608, + "step": 13745 + }, + { + "epoch": 0.8533118132720839, + "grad_norm": 0.16689896909685065, + "learning_rate": 8.999132258214855e-05, + "loss": 3.0572, + "step": 13746 + }, + { + "epoch": 0.8533738903718419, + "grad_norm": 0.18618810178768946, + "learning_rate": 8.998915472787321e-05, + "loss": 3.0166, + "step": 13747 + }, + { + "epoch": 0.8534359674715998, + "grad_norm": 0.4673191006408454, + "learning_rate": 8.998698666496474e-05, + "loss": 3.0484, + "step": 13748 + }, + { + "epoch": 0.8534980445713576, + "grad_norm": 0.32560072363566844, + "learning_rate": 8.998481839343444e-05, + "loss": 3.0919, + "step": 13749 + }, + { + "epoch": 0.8535601216711155, + "grad_norm": 0.32565395557798527, + "learning_rate": 8.998264991329365e-05, + "loss": 3.0103, + "step": 13750 + }, + { + "epoch": 0.8536221987708734, + "grad_norm": 0.2587628045346164, + "learning_rate": 8.998048122455367e-05, + "loss": 2.9714, + "step": 13751 + }, + { + "epoch": 0.8536842758706313, + "grad_norm": 0.30440790846419563, + "learning_rate": 8.997831232722582e-05, + "loss": 3.0673, + "step": 13752 + }, + { + "epoch": 0.8537463529703893, + "grad_norm": 0.2228846800365976, + "learning_rate": 8.997614322132139e-05, + "loss": 3.0766, + "step": 13753 + }, + { + "epoch": 0.8538084300701472, + "grad_norm": 0.21777484876235284, + "learning_rate": 8.997397390685174e-05, + "loss": 3.0688, + "step": 13754 + }, + { + "epoch": 0.853870507169905, + "grad_norm": 0.22854178841296988, + "learning_rate": 8.997180438382814e-05, + "loss": 2.9922, + "step": 13755 + }, + { + "epoch": 0.8539325842696629, + "grad_norm": 0.18144126945072753, + "learning_rate": 8.996963465226194e-05, + "loss": 2.9592, + "step": 13756 + }, + { + "epoch": 0.8539946613694208, + "grad_norm": 0.3183557358048147, + "learning_rate": 8.996746471216446e-05, + "loss": 3.0501, + "step": 13757 + }, + { + "epoch": 0.8540567384691787, + "grad_norm": 0.21953398848756708, + "learning_rate": 8.996529456354701e-05, + "loss": 2.9605, + "step": 13758 + }, + { + "epoch": 0.8541188155689367, + "grad_norm": 0.26754837149154914, + "learning_rate": 8.996312420642091e-05, + "loss": 3.091, + "step": 13759 + }, + { + "epoch": 0.8541808926686946, + "grad_norm": 0.18207740566904101, + "learning_rate": 8.996095364079751e-05, + "loss": 3.0029, + "step": 13760 + }, + { + "epoch": 0.8542429697684524, + "grad_norm": 0.2758946237053399, + "learning_rate": 8.995878286668808e-05, + "loss": 3.0282, + "step": 13761 + }, + { + "epoch": 0.8543050468682103, + "grad_norm": 0.1558314439417627, + "learning_rate": 8.9956611884104e-05, + "loss": 2.988, + "step": 13762 + }, + { + "epoch": 0.8543671239679682, + "grad_norm": 0.1793918873151337, + "learning_rate": 8.995444069305659e-05, + "loss": 3.0562, + "step": 13763 + }, + { + "epoch": 0.8544292010677261, + "grad_norm": 0.19000354917248044, + "learning_rate": 8.995226929355715e-05, + "loss": 2.9652, + "step": 13764 + }, + { + "epoch": 0.854491278167484, + "grad_norm": 0.1897470612631342, + "learning_rate": 8.995009768561702e-05, + "loss": 2.9436, + "step": 13765 + }, + { + "epoch": 0.854553355267242, + "grad_norm": 0.18407044513374285, + "learning_rate": 8.994792586924753e-05, + "loss": 3.0166, + "step": 13766 + }, + { + "epoch": 0.8546154323669998, + "grad_norm": 0.21909224610253927, + "learning_rate": 8.994575384446002e-05, + "loss": 2.9054, + "step": 13767 + }, + { + "epoch": 0.8546775094667577, + "grad_norm": 0.19685653721053611, + "learning_rate": 8.994358161126581e-05, + "loss": 3.0807, + "step": 13768 + }, + { + "epoch": 0.8547395865665156, + "grad_norm": 0.20525456059531524, + "learning_rate": 8.994140916967623e-05, + "loss": 2.9653, + "step": 13769 + }, + { + "epoch": 0.8548016636662735, + "grad_norm": 0.2068169852589197, + "learning_rate": 8.993923651970263e-05, + "loss": 3.0425, + "step": 13770 + }, + { + "epoch": 0.8548637407660314, + "grad_norm": 0.19334716951013914, + "learning_rate": 8.993706366135634e-05, + "loss": 3.0324, + "step": 13771 + }, + { + "epoch": 0.8549258178657894, + "grad_norm": 0.2190031307080512, + "learning_rate": 8.99348905946487e-05, + "loss": 3.0368, + "step": 13772 + }, + { + "epoch": 0.8549878949655472, + "grad_norm": 0.24748881332807732, + "learning_rate": 8.993271731959103e-05, + "loss": 3.0182, + "step": 13773 + }, + { + "epoch": 0.8550499720653051, + "grad_norm": 0.28546448600520186, + "learning_rate": 8.993054383619469e-05, + "loss": 3.0314, + "step": 13774 + }, + { + "epoch": 0.855112049165063, + "grad_norm": 0.19095617043823107, + "learning_rate": 8.992837014447099e-05, + "loss": 3.0777, + "step": 13775 + }, + { + "epoch": 0.8551741262648209, + "grad_norm": 0.3046277060891393, + "learning_rate": 8.992619624443132e-05, + "loss": 3.0248, + "step": 13776 + }, + { + "epoch": 0.8552362033645788, + "grad_norm": 0.19786084897991674, + "learning_rate": 8.992402213608698e-05, + "loss": 3.0309, + "step": 13777 + }, + { + "epoch": 0.8552982804643368, + "grad_norm": 0.2196521124277106, + "learning_rate": 8.992184781944931e-05, + "loss": 3.0458, + "step": 13778 + }, + { + "epoch": 0.8553603575640946, + "grad_norm": 0.18134739495229307, + "learning_rate": 8.991967329452969e-05, + "loss": 3.0568, + "step": 13779 + }, + { + "epoch": 0.8554224346638525, + "grad_norm": 0.22984214234973052, + "learning_rate": 8.991749856133942e-05, + "loss": 3.0656, + "step": 13780 + }, + { + "epoch": 0.8554845117636104, + "grad_norm": 0.21571305142628544, + "learning_rate": 8.991532361988987e-05, + "loss": 3.1015, + "step": 13781 + }, + { + "epoch": 0.8555465888633683, + "grad_norm": 0.17388736594048737, + "learning_rate": 8.99131484701924e-05, + "loss": 2.956, + "step": 13782 + }, + { + "epoch": 0.8556086659631262, + "grad_norm": 0.17345060114308253, + "learning_rate": 8.991097311225836e-05, + "loss": 3.0366, + "step": 13783 + }, + { + "epoch": 0.8556707430628842, + "grad_norm": 0.2629911324126569, + "learning_rate": 8.990879754609906e-05, + "loss": 3.0246, + "step": 13784 + }, + { + "epoch": 0.855732820162642, + "grad_norm": 0.20255374358629608, + "learning_rate": 8.990662177172588e-05, + "loss": 3.1158, + "step": 13785 + }, + { + "epoch": 0.8557948972623999, + "grad_norm": 0.18645324185060128, + "learning_rate": 8.990444578915018e-05, + "loss": 2.9772, + "step": 13786 + }, + { + "epoch": 0.8558569743621578, + "grad_norm": 0.18395254908706227, + "learning_rate": 8.990226959838328e-05, + "loss": 3.0398, + "step": 13787 + }, + { + "epoch": 0.8559190514619157, + "grad_norm": 0.24300478885684554, + "learning_rate": 8.990009319943657e-05, + "loss": 3.0104, + "step": 13788 + }, + { + "epoch": 0.8559811285616736, + "grad_norm": 0.2331332611660545, + "learning_rate": 8.989791659232137e-05, + "loss": 3.0249, + "step": 13789 + }, + { + "epoch": 0.8560432056614315, + "grad_norm": 0.18543020897248133, + "learning_rate": 8.989573977704907e-05, + "loss": 2.9235, + "step": 13790 + }, + { + "epoch": 0.8561052827611894, + "grad_norm": 0.18474407191308997, + "learning_rate": 8.989356275363101e-05, + "loss": 2.9901, + "step": 13791 + }, + { + "epoch": 0.8561673598609473, + "grad_norm": 0.17160083296473339, + "learning_rate": 8.989138552207855e-05, + "loss": 3.0782, + "step": 13792 + }, + { + "epoch": 0.8562294369607052, + "grad_norm": 0.1737575494889366, + "learning_rate": 8.988920808240303e-05, + "loss": 3.0252, + "step": 13793 + }, + { + "epoch": 0.8562915140604631, + "grad_norm": 0.1516188099070105, + "learning_rate": 8.988703043461583e-05, + "loss": 2.9734, + "step": 13794 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.17241245465283214, + "learning_rate": 8.988485257872831e-05, + "loss": 2.9649, + "step": 13795 + }, + { + "epoch": 0.856415668259979, + "grad_norm": 0.14654928041024093, + "learning_rate": 8.988267451475185e-05, + "loss": 2.9826, + "step": 13796 + }, + { + "epoch": 0.8564777453597368, + "grad_norm": 0.1792184471187676, + "learning_rate": 8.988049624269779e-05, + "loss": 2.9975, + "step": 13797 + }, + { + "epoch": 0.8565398224594947, + "grad_norm": 0.20007536481437707, + "learning_rate": 8.987831776257749e-05, + "loss": 3.0408, + "step": 13798 + }, + { + "epoch": 0.8566018995592526, + "grad_norm": 0.17299617493919744, + "learning_rate": 8.987613907440234e-05, + "loss": 2.9634, + "step": 13799 + }, + { + "epoch": 0.8566639766590105, + "grad_norm": 0.21067010744583092, + "learning_rate": 8.987396017818368e-05, + "loss": 2.9396, + "step": 13800 + }, + { + "epoch": 0.8567260537587684, + "grad_norm": 0.1858120479032917, + "learning_rate": 8.987178107393287e-05, + "loss": 3.0386, + "step": 13801 + }, + { + "epoch": 0.8567881308585263, + "grad_norm": 0.16172042974806747, + "learning_rate": 8.986960176166134e-05, + "loss": 2.9848, + "step": 13802 + }, + { + "epoch": 0.8568502079582841, + "grad_norm": 0.17061451351338164, + "learning_rate": 8.98674222413804e-05, + "loss": 3.0031, + "step": 13803 + }, + { + "epoch": 0.8569122850580421, + "grad_norm": 0.16295440874975786, + "learning_rate": 8.986524251310143e-05, + "loss": 2.9654, + "step": 13804 + }, + { + "epoch": 0.8569743621578, + "grad_norm": 0.22657983812221566, + "learning_rate": 8.986306257683583e-05, + "loss": 3.0115, + "step": 13805 + }, + { + "epoch": 0.8570364392575579, + "grad_norm": 0.16910398892613446, + "learning_rate": 8.986088243259494e-05, + "loss": 3.0206, + "step": 13806 + }, + { + "epoch": 0.8570985163573158, + "grad_norm": 0.16449083883456897, + "learning_rate": 8.985870208039016e-05, + "loss": 2.9663, + "step": 13807 + }, + { + "epoch": 0.8571605934570737, + "grad_norm": 0.2547285555881633, + "learning_rate": 8.985652152023284e-05, + "loss": 3.1008, + "step": 13808 + }, + { + "epoch": 0.8572226705568315, + "grad_norm": 0.16489943404512636, + "learning_rate": 8.985434075213439e-05, + "loss": 3.0277, + "step": 13809 + }, + { + "epoch": 0.8572847476565895, + "grad_norm": 0.17378302857137604, + "learning_rate": 8.985215977610617e-05, + "loss": 2.9492, + "step": 13810 + }, + { + "epoch": 0.8573468247563474, + "grad_norm": 0.25478559956744146, + "learning_rate": 8.984997859215955e-05, + "loss": 3.0811, + "step": 13811 + }, + { + "epoch": 0.8574089018561053, + "grad_norm": 0.18622895406698156, + "learning_rate": 8.984779720030591e-05, + "loss": 2.9995, + "step": 13812 + }, + { + "epoch": 0.8574709789558632, + "grad_norm": 0.1904965894505186, + "learning_rate": 8.984561560055666e-05, + "loss": 3.1362, + "step": 13813 + }, + { + "epoch": 0.8575330560556211, + "grad_norm": 0.1841459087573505, + "learning_rate": 8.984343379292315e-05, + "loss": 3.0582, + "step": 13814 + }, + { + "epoch": 0.8575951331553789, + "grad_norm": 0.208080297028074, + "learning_rate": 8.984125177741677e-05, + "loss": 2.9892, + "step": 13815 + }, + { + "epoch": 0.8576572102551369, + "grad_norm": 0.16701206289770512, + "learning_rate": 8.98390695540489e-05, + "loss": 3.066, + "step": 13816 + }, + { + "epoch": 0.8577192873548948, + "grad_norm": 0.2049319881920711, + "learning_rate": 8.983688712283096e-05, + "loss": 3.0101, + "step": 13817 + }, + { + "epoch": 0.8577813644546527, + "grad_norm": 0.24789569783422782, + "learning_rate": 8.983470448377429e-05, + "loss": 2.9499, + "step": 13818 + }, + { + "epoch": 0.8578434415544106, + "grad_norm": 0.17849718662256733, + "learning_rate": 8.983252163689029e-05, + "loss": 2.9009, + "step": 13819 + }, + { + "epoch": 0.8579055186541685, + "grad_norm": 0.23026949140615047, + "learning_rate": 8.983033858219036e-05, + "loss": 3.0217, + "step": 13820 + }, + { + "epoch": 0.8579675957539263, + "grad_norm": 0.19559339351776983, + "learning_rate": 8.982815531968589e-05, + "loss": 3.074, + "step": 13821 + }, + { + "epoch": 0.8580296728536843, + "grad_norm": 0.19228165770099528, + "learning_rate": 8.982597184938825e-05, + "loss": 3.1204, + "step": 13822 + }, + { + "epoch": 0.8580917499534422, + "grad_norm": 0.20645202726850223, + "learning_rate": 8.982378817130888e-05, + "loss": 3.0237, + "step": 13823 + }, + { + "epoch": 0.8581538270532001, + "grad_norm": 0.1841522204406363, + "learning_rate": 8.982160428545912e-05, + "loss": 3.0659, + "step": 13824 + }, + { + "epoch": 0.858215904152958, + "grad_norm": 0.20600811038671507, + "learning_rate": 8.981942019185036e-05, + "loss": 3.0681, + "step": 13825 + }, + { + "epoch": 0.8582779812527159, + "grad_norm": 0.1891361810224373, + "learning_rate": 8.981723589049405e-05, + "loss": 3.0423, + "step": 13826 + }, + { + "epoch": 0.8583400583524737, + "grad_norm": 0.18158985782089604, + "learning_rate": 8.981505138140155e-05, + "loss": 3.0061, + "step": 13827 + }, + { + "epoch": 0.8584021354522317, + "grad_norm": 0.19605142052925553, + "learning_rate": 8.981286666458426e-05, + "loss": 3.1157, + "step": 13828 + }, + { + "epoch": 0.8584642125519896, + "grad_norm": 0.16053715047598505, + "learning_rate": 8.981068174005357e-05, + "loss": 3.0733, + "step": 13829 + }, + { + "epoch": 0.8585262896517475, + "grad_norm": 0.19565962462172226, + "learning_rate": 8.98084966078209e-05, + "loss": 2.9023, + "step": 13830 + }, + { + "epoch": 0.8585883667515054, + "grad_norm": 0.21251648214877086, + "learning_rate": 8.980631126789762e-05, + "loss": 2.961, + "step": 13831 + }, + { + "epoch": 0.8586504438512633, + "grad_norm": 0.19781076061394032, + "learning_rate": 8.980412572029517e-05, + "loss": 3.0683, + "step": 13832 + }, + { + "epoch": 0.8587125209510211, + "grad_norm": 0.20001364117533976, + "learning_rate": 8.980193996502493e-05, + "loss": 3.0152, + "step": 13833 + }, + { + "epoch": 0.858774598050779, + "grad_norm": 0.2199500579121525, + "learning_rate": 8.97997540020983e-05, + "loss": 2.9566, + "step": 13834 + }, + { + "epoch": 0.858836675150537, + "grad_norm": 0.2368216051927111, + "learning_rate": 8.97975678315267e-05, + "loss": 2.9643, + "step": 13835 + }, + { + "epoch": 0.8588987522502949, + "grad_norm": 0.19108432829205915, + "learning_rate": 8.979538145332154e-05, + "loss": 3.0247, + "step": 13836 + }, + { + "epoch": 0.8589608293500528, + "grad_norm": 0.22691334492337975, + "learning_rate": 8.97931948674942e-05, + "loss": 3.097, + "step": 13837 + }, + { + "epoch": 0.8590229064498107, + "grad_norm": 0.21345351985668412, + "learning_rate": 8.97910080740561e-05, + "loss": 2.9833, + "step": 13838 + }, + { + "epoch": 0.8590849835495685, + "grad_norm": 0.21288711649952174, + "learning_rate": 8.978882107301865e-05, + "loss": 3.0989, + "step": 13839 + }, + { + "epoch": 0.8591470606493264, + "grad_norm": 0.2521321303976845, + "learning_rate": 8.978663386439325e-05, + "loss": 3.0522, + "step": 13840 + }, + { + "epoch": 0.8592091377490844, + "grad_norm": 0.19974895121271616, + "learning_rate": 8.978444644819136e-05, + "loss": 2.9789, + "step": 13841 + }, + { + "epoch": 0.8592712148488423, + "grad_norm": 0.21495850706034905, + "learning_rate": 8.978225882442431e-05, + "loss": 3.0075, + "step": 13842 + }, + { + "epoch": 0.8593332919486002, + "grad_norm": 0.21541401001093244, + "learning_rate": 8.978007099310358e-05, + "loss": 3.0315, + "step": 13843 + }, + { + "epoch": 0.8593953690483581, + "grad_norm": 0.19146286847143298, + "learning_rate": 8.977788295424057e-05, + "loss": 2.9855, + "step": 13844 + }, + { + "epoch": 0.8594574461481159, + "grad_norm": 0.19734835070384055, + "learning_rate": 8.977569470784668e-05, + "loss": 3.0127, + "step": 13845 + }, + { + "epoch": 0.8595195232478738, + "grad_norm": 0.28921369516251283, + "learning_rate": 8.977350625393333e-05, + "loss": 3.0438, + "step": 13846 + }, + { + "epoch": 0.8595816003476318, + "grad_norm": 0.2206888953828951, + "learning_rate": 8.977131759251195e-05, + "loss": 2.9847, + "step": 13847 + }, + { + "epoch": 0.8596436774473897, + "grad_norm": 0.1930555596244897, + "learning_rate": 8.976912872359396e-05, + "loss": 3.0999, + "step": 13848 + }, + { + "epoch": 0.8597057545471476, + "grad_norm": 0.20132849439025316, + "learning_rate": 8.976693964719075e-05, + "loss": 3.1029, + "step": 13849 + }, + { + "epoch": 0.8597678316469055, + "grad_norm": 0.2025729527164386, + "learning_rate": 8.976475036331378e-05, + "loss": 3.0666, + "step": 13850 + }, + { + "epoch": 0.8598299087466633, + "grad_norm": 0.24571735874545297, + "learning_rate": 8.976256087197443e-05, + "loss": 2.9855, + "step": 13851 + }, + { + "epoch": 0.8598919858464212, + "grad_norm": 0.21364004884908738, + "learning_rate": 8.976037117318417e-05, + "loss": 3.0186, + "step": 13852 + }, + { + "epoch": 0.8599540629461792, + "grad_norm": 0.26171146211867924, + "learning_rate": 8.975818126695439e-05, + "loss": 3.0409, + "step": 13853 + }, + { + "epoch": 0.8600161400459371, + "grad_norm": 0.20263865826149655, + "learning_rate": 8.975599115329653e-05, + "loss": 3.0018, + "step": 13854 + }, + { + "epoch": 0.860078217145695, + "grad_norm": 0.20912825008085395, + "learning_rate": 8.975380083222202e-05, + "loss": 2.8752, + "step": 13855 + }, + { + "epoch": 0.8601402942454529, + "grad_norm": 0.1736262198999913, + "learning_rate": 8.975161030374228e-05, + "loss": 2.9121, + "step": 13856 + }, + { + "epoch": 0.8602023713452107, + "grad_norm": 0.19788139848852637, + "learning_rate": 8.974941956786873e-05, + "loss": 3.0375, + "step": 13857 + }, + { + "epoch": 0.8602644484449686, + "grad_norm": 0.17897137701691923, + "learning_rate": 8.974722862461282e-05, + "loss": 3.1057, + "step": 13858 + }, + { + "epoch": 0.8603265255447266, + "grad_norm": 0.1945462837691266, + "learning_rate": 8.974503747398597e-05, + "loss": 2.9717, + "step": 13859 + }, + { + "epoch": 0.8603886026444845, + "grad_norm": 0.1789213411554573, + "learning_rate": 8.97428461159996e-05, + "loss": 2.9731, + "step": 13860 + }, + { + "epoch": 0.8604506797442424, + "grad_norm": 0.17915587979565745, + "learning_rate": 8.974065455066515e-05, + "loss": 2.9451, + "step": 13861 + }, + { + "epoch": 0.8605127568440002, + "grad_norm": 0.20634116014978152, + "learning_rate": 8.973846277799407e-05, + "loss": 2.9426, + "step": 13862 + }, + { + "epoch": 0.8605748339437581, + "grad_norm": 0.16953162619553414, + "learning_rate": 8.973627079799778e-05, + "loss": 2.9838, + "step": 13863 + }, + { + "epoch": 0.860636911043516, + "grad_norm": 0.189060443283748, + "learning_rate": 8.973407861068771e-05, + "loss": 3.0492, + "step": 13864 + }, + { + "epoch": 0.860698988143274, + "grad_norm": 0.17257311856462632, + "learning_rate": 8.973188621607531e-05, + "loss": 2.9989, + "step": 13865 + }, + { + "epoch": 0.8607610652430319, + "grad_norm": 0.1834847654344832, + "learning_rate": 8.972969361417202e-05, + "loss": 2.8867, + "step": 13866 + }, + { + "epoch": 0.8608231423427898, + "grad_norm": 0.16427752618050484, + "learning_rate": 8.972750080498927e-05, + "loss": 3.0632, + "step": 13867 + }, + { + "epoch": 0.8608852194425476, + "grad_norm": 0.18982211291866904, + "learning_rate": 8.97253077885385e-05, + "loss": 2.9818, + "step": 13868 + }, + { + "epoch": 0.8609472965423055, + "grad_norm": 0.21356443636538008, + "learning_rate": 8.972311456483117e-05, + "loss": 3.0148, + "step": 13869 + }, + { + "epoch": 0.8610093736420634, + "grad_norm": 0.16618603207103794, + "learning_rate": 8.972092113387869e-05, + "loss": 3.0637, + "step": 13870 + }, + { + "epoch": 0.8610714507418213, + "grad_norm": 0.2002774326021968, + "learning_rate": 8.971872749569254e-05, + "loss": 3.1009, + "step": 13871 + }, + { + "epoch": 0.8611335278415793, + "grad_norm": 0.200137869804693, + "learning_rate": 8.971653365028413e-05, + "loss": 3.0157, + "step": 13872 + }, + { + "epoch": 0.8611956049413372, + "grad_norm": 0.19077993608064459, + "learning_rate": 8.971433959766493e-05, + "loss": 2.9924, + "step": 13873 + }, + { + "epoch": 0.861257682041095, + "grad_norm": 0.25911108171151315, + "learning_rate": 8.971214533784637e-05, + "loss": 2.9925, + "step": 13874 + }, + { + "epoch": 0.8613197591408529, + "grad_norm": 0.18178946027073267, + "learning_rate": 8.970995087083993e-05, + "loss": 3.1207, + "step": 13875 + }, + { + "epoch": 0.8613818362406108, + "grad_norm": 0.20566448671708368, + "learning_rate": 8.970775619665702e-05, + "loss": 2.9816, + "step": 13876 + }, + { + "epoch": 0.8614439133403687, + "grad_norm": 0.2019443772005919, + "learning_rate": 8.97055613153091e-05, + "loss": 3.0045, + "step": 13877 + }, + { + "epoch": 0.8615059904401267, + "grad_norm": 0.19535124394467687, + "learning_rate": 8.970336622680763e-05, + "loss": 3.0451, + "step": 13878 + }, + { + "epoch": 0.8615680675398846, + "grad_norm": 0.2109104951597177, + "learning_rate": 8.970117093116408e-05, + "loss": 2.9384, + "step": 13879 + }, + { + "epoch": 0.8616301446396424, + "grad_norm": 0.18893087313378248, + "learning_rate": 8.969897542838987e-05, + "loss": 3.0722, + "step": 13880 + }, + { + "epoch": 0.8616922217394003, + "grad_norm": 0.19254327269099447, + "learning_rate": 8.969677971849648e-05, + "loss": 2.9854, + "step": 13881 + }, + { + "epoch": 0.8617542988391582, + "grad_norm": 0.24926384186069536, + "learning_rate": 8.969458380149533e-05, + "loss": 3.0305, + "step": 13882 + }, + { + "epoch": 0.8618163759389161, + "grad_norm": 0.20690366124358747, + "learning_rate": 8.969238767739791e-05, + "loss": 2.9436, + "step": 13883 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.25997943253504446, + "learning_rate": 8.969019134621568e-05, + "loss": 3.0033, + "step": 13884 + }, + { + "epoch": 0.861940530138432, + "grad_norm": 0.20569631861557605, + "learning_rate": 8.968799480796006e-05, + "loss": 3.016, + "step": 13885 + }, + { + "epoch": 0.8620026072381898, + "grad_norm": 0.19480776418547643, + "learning_rate": 8.968579806264256e-05, + "loss": 2.9886, + "step": 13886 + }, + { + "epoch": 0.8620646843379477, + "grad_norm": 0.16779819967477166, + "learning_rate": 8.968360111027461e-05, + "loss": 3.0592, + "step": 13887 + }, + { + "epoch": 0.8621267614377056, + "grad_norm": 0.18318826456538367, + "learning_rate": 8.968140395086768e-05, + "loss": 3.1058, + "step": 13888 + }, + { + "epoch": 0.8621888385374635, + "grad_norm": 0.1866325538585876, + "learning_rate": 8.967920658443324e-05, + "loss": 3.0674, + "step": 13889 + }, + { + "epoch": 0.8622509156372214, + "grad_norm": 0.2867551941538401, + "learning_rate": 8.967700901098273e-05, + "loss": 3.0517, + "step": 13890 + }, + { + "epoch": 0.8623129927369794, + "grad_norm": 0.20893745252110624, + "learning_rate": 8.967481123052765e-05, + "loss": 3.0122, + "step": 13891 + }, + { + "epoch": 0.8623750698367372, + "grad_norm": 0.18220214802858162, + "learning_rate": 8.967261324307944e-05, + "loss": 2.9271, + "step": 13892 + }, + { + "epoch": 0.8624371469364951, + "grad_norm": 0.20118199480398796, + "learning_rate": 8.967041504864957e-05, + "loss": 3.1259, + "step": 13893 + }, + { + "epoch": 0.862499224036253, + "grad_norm": 0.1954570712039505, + "learning_rate": 8.966821664724952e-05, + "loss": 3.0464, + "step": 13894 + }, + { + "epoch": 0.8625613011360109, + "grad_norm": 0.20625902502948246, + "learning_rate": 8.966601803889075e-05, + "loss": 3.0938, + "step": 13895 + }, + { + "epoch": 0.8626233782357688, + "grad_norm": 0.17117149337315118, + "learning_rate": 8.966381922358475e-05, + "loss": 2.9233, + "step": 13896 + }, + { + "epoch": 0.8626854553355268, + "grad_norm": 0.19719010975557097, + "learning_rate": 8.966162020134297e-05, + "loss": 2.9338, + "step": 13897 + }, + { + "epoch": 0.8627475324352846, + "grad_norm": 0.22711650758453672, + "learning_rate": 8.965942097217688e-05, + "loss": 3.031, + "step": 13898 + }, + { + "epoch": 0.8628096095350425, + "grad_norm": 0.2185030036611792, + "learning_rate": 8.965722153609796e-05, + "loss": 3.0281, + "step": 13899 + }, + { + "epoch": 0.8628716866348004, + "grad_norm": 0.22533444840010244, + "learning_rate": 8.96550218931177e-05, + "loss": 3.0439, + "step": 13900 + }, + { + "epoch": 0.8629337637345583, + "grad_norm": 0.22219764595098107, + "learning_rate": 8.965282204324756e-05, + "loss": 3.0187, + "step": 13901 + }, + { + "epoch": 0.8629958408343162, + "grad_norm": 0.16875506587725117, + "learning_rate": 8.965062198649902e-05, + "loss": 3.0005, + "step": 13902 + }, + { + "epoch": 0.8630579179340742, + "grad_norm": 0.1621847781649367, + "learning_rate": 8.964842172288356e-05, + "loss": 2.8636, + "step": 13903 + }, + { + "epoch": 0.863119995033832, + "grad_norm": 0.18537878152188902, + "learning_rate": 8.964622125241266e-05, + "loss": 3.0157, + "step": 13904 + }, + { + "epoch": 0.8631820721335899, + "grad_norm": 0.20220942659892793, + "learning_rate": 8.96440205750978e-05, + "loss": 3.0748, + "step": 13905 + }, + { + "epoch": 0.8632441492333478, + "grad_norm": 0.17278639324582754, + "learning_rate": 8.964181969095046e-05, + "loss": 3.019, + "step": 13906 + }, + { + "epoch": 0.8633062263331057, + "grad_norm": 0.18311679042146686, + "learning_rate": 8.963961859998212e-05, + "loss": 2.9044, + "step": 13907 + }, + { + "epoch": 0.8633683034328636, + "grad_norm": 0.19060768220751612, + "learning_rate": 8.963741730220428e-05, + "loss": 3.0164, + "step": 13908 + }, + { + "epoch": 0.8634303805326216, + "grad_norm": 0.17771116042226928, + "learning_rate": 8.963521579762839e-05, + "loss": 2.9662, + "step": 13909 + }, + { + "epoch": 0.8634924576323794, + "grad_norm": 0.19433672344106345, + "learning_rate": 8.963301408626597e-05, + "loss": 3.0453, + "step": 13910 + }, + { + "epoch": 0.8635545347321373, + "grad_norm": 0.17071987632946914, + "learning_rate": 8.96308121681285e-05, + "loss": 3.0113, + "step": 13911 + }, + { + "epoch": 0.8636166118318952, + "grad_norm": 0.16816635484716935, + "learning_rate": 8.962861004322744e-05, + "loss": 2.9756, + "step": 13912 + }, + { + "epoch": 0.8636786889316531, + "grad_norm": 0.18046636332135105, + "learning_rate": 8.962640771157433e-05, + "loss": 2.9376, + "step": 13913 + }, + { + "epoch": 0.863740766031411, + "grad_norm": 0.1680477699440394, + "learning_rate": 8.96242051731806e-05, + "loss": 3.0, + "step": 13914 + }, + { + "epoch": 0.863802843131169, + "grad_norm": 0.21666957795192385, + "learning_rate": 8.96220024280578e-05, + "loss": 3.0631, + "step": 13915 + }, + { + "epoch": 0.8638649202309268, + "grad_norm": 0.18376125648300431, + "learning_rate": 8.961979947621739e-05, + "loss": 3.0098, + "step": 13916 + }, + { + "epoch": 0.8639269973306847, + "grad_norm": 0.19857654656090476, + "learning_rate": 8.961759631767085e-05, + "loss": 2.948, + "step": 13917 + }, + { + "epoch": 0.8639890744304426, + "grad_norm": 0.2601553495924206, + "learning_rate": 8.961539295242971e-05, + "loss": 3.0313, + "step": 13918 + }, + { + "epoch": 0.8640511515302005, + "grad_norm": 0.20621441801515375, + "learning_rate": 8.961318938050544e-05, + "loss": 3.048, + "step": 13919 + }, + { + "epoch": 0.8641132286299584, + "grad_norm": 0.23682779501046158, + "learning_rate": 8.961098560190955e-05, + "loss": 3.0901, + "step": 13920 + }, + { + "epoch": 0.8641753057297163, + "grad_norm": 0.17612195904308284, + "learning_rate": 8.960878161665353e-05, + "loss": 2.9341, + "step": 13921 + }, + { + "epoch": 0.8642373828294742, + "grad_norm": 0.20434448168901123, + "learning_rate": 8.960657742474888e-05, + "loss": 2.9843, + "step": 13922 + }, + { + "epoch": 0.8642994599292321, + "grad_norm": 0.20365384838952905, + "learning_rate": 8.96043730262071e-05, + "loss": 3.0633, + "step": 13923 + }, + { + "epoch": 0.86436153702899, + "grad_norm": 0.1764590546883345, + "learning_rate": 8.96021684210397e-05, + "loss": 3.0534, + "step": 13924 + }, + { + "epoch": 0.8644236141287479, + "grad_norm": 0.18194210505579123, + "learning_rate": 8.959996360925818e-05, + "loss": 2.9948, + "step": 13925 + }, + { + "epoch": 0.8644856912285058, + "grad_norm": 0.1800063145212066, + "learning_rate": 8.9597758590874e-05, + "loss": 2.9518, + "step": 13926 + }, + { + "epoch": 0.8645477683282637, + "grad_norm": 0.20755927210235953, + "learning_rate": 8.959555336589873e-05, + "loss": 3.0311, + "step": 13927 + }, + { + "epoch": 0.8646098454280216, + "grad_norm": 0.16330618462612898, + "learning_rate": 8.959334793434385e-05, + "loss": 3.0324, + "step": 13928 + }, + { + "epoch": 0.8646719225277795, + "grad_norm": 0.20283618898864303, + "learning_rate": 8.959114229622086e-05, + "loss": 3.0465, + "step": 13929 + }, + { + "epoch": 0.8647339996275374, + "grad_norm": 0.21072746345622434, + "learning_rate": 8.958893645154125e-05, + "loss": 2.9494, + "step": 13930 + }, + { + "epoch": 0.8647960767272953, + "grad_norm": 0.1881471334313285, + "learning_rate": 8.958673040031656e-05, + "loss": 3.0317, + "step": 13931 + }, + { + "epoch": 0.8648581538270532, + "grad_norm": 0.17401246144352062, + "learning_rate": 8.95845241425583e-05, + "loss": 2.9789, + "step": 13932 + }, + { + "epoch": 0.8649202309268111, + "grad_norm": 0.17608052843819577, + "learning_rate": 8.958231767827795e-05, + "loss": 3.1003, + "step": 13933 + }, + { + "epoch": 0.864982308026569, + "grad_norm": 0.17556886027695476, + "learning_rate": 8.958011100748705e-05, + "loss": 3.0125, + "step": 13934 + }, + { + "epoch": 0.8650443851263269, + "grad_norm": 0.19945920196073386, + "learning_rate": 8.957790413019709e-05, + "loss": 3.0983, + "step": 13935 + }, + { + "epoch": 0.8651064622260848, + "grad_norm": 0.24577101480618996, + "learning_rate": 8.95756970464196e-05, + "loss": 3.0842, + "step": 13936 + }, + { + "epoch": 0.8651685393258427, + "grad_norm": 0.1709290122883583, + "learning_rate": 8.95734897561661e-05, + "loss": 3.0324, + "step": 13937 + }, + { + "epoch": 0.8652306164256006, + "grad_norm": 0.17167273996679827, + "learning_rate": 8.957128225944809e-05, + "loss": 2.995, + "step": 13938 + }, + { + "epoch": 0.8652926935253585, + "grad_norm": 0.201314801303343, + "learning_rate": 8.95690745562771e-05, + "loss": 2.9946, + "step": 13939 + }, + { + "epoch": 0.8653547706251163, + "grad_norm": 0.17166362574481883, + "learning_rate": 8.956686664666463e-05, + "loss": 2.9573, + "step": 13940 + }, + { + "epoch": 0.8654168477248743, + "grad_norm": 0.15699287486536903, + "learning_rate": 8.956465853062223e-05, + "loss": 2.984, + "step": 13941 + }, + { + "epoch": 0.8654789248246322, + "grad_norm": 0.17252348891954797, + "learning_rate": 8.956245020816138e-05, + "loss": 3.0496, + "step": 13942 + }, + { + "epoch": 0.8655410019243901, + "grad_norm": 0.17595789299092382, + "learning_rate": 8.956024167929365e-05, + "loss": 2.9616, + "step": 13943 + }, + { + "epoch": 0.865603079024148, + "grad_norm": 0.18106711508558038, + "learning_rate": 8.955803294403052e-05, + "loss": 2.9905, + "step": 13944 + }, + { + "epoch": 0.8656651561239059, + "grad_norm": 0.18249941409356638, + "learning_rate": 8.955582400238353e-05, + "loss": 3.0396, + "step": 13945 + }, + { + "epoch": 0.8657272332236637, + "grad_norm": 0.1815205864449485, + "learning_rate": 8.95536148543642e-05, + "loss": 3.0592, + "step": 13946 + }, + { + "epoch": 0.8657893103234217, + "grad_norm": 0.17292978381095428, + "learning_rate": 8.955140549998407e-05, + "loss": 2.9855, + "step": 13947 + }, + { + "epoch": 0.8658513874231796, + "grad_norm": 0.15540053006646987, + "learning_rate": 8.954919593925464e-05, + "loss": 2.9027, + "step": 13948 + }, + { + "epoch": 0.8659134645229375, + "grad_norm": 0.25618846491743386, + "learning_rate": 8.954698617218746e-05, + "loss": 3.0166, + "step": 13949 + }, + { + "epoch": 0.8659755416226954, + "grad_norm": 0.17621850625156754, + "learning_rate": 8.954477619879406e-05, + "loss": 3.0545, + "step": 13950 + }, + { + "epoch": 0.8660376187224533, + "grad_norm": 0.18652957886177185, + "learning_rate": 8.954256601908596e-05, + "loss": 3.0265, + "step": 13951 + }, + { + "epoch": 0.8660996958222111, + "grad_norm": 0.16872721028087678, + "learning_rate": 8.954035563307469e-05, + "loss": 3.0757, + "step": 13952 + }, + { + "epoch": 0.866161772921969, + "grad_norm": 0.18135722417041622, + "learning_rate": 8.953814504077179e-05, + "loss": 3.0264, + "step": 13953 + }, + { + "epoch": 0.866223850021727, + "grad_norm": 0.1659376746785597, + "learning_rate": 8.95359342421888e-05, + "loss": 2.9046, + "step": 13954 + }, + { + "epoch": 0.8662859271214849, + "grad_norm": 0.1847224743641381, + "learning_rate": 8.953372323733722e-05, + "loss": 3.0023, + "step": 13955 + }, + { + "epoch": 0.8663480042212428, + "grad_norm": 0.1731140923248544, + "learning_rate": 8.953151202622862e-05, + "loss": 2.9185, + "step": 13956 + }, + { + "epoch": 0.8664100813210007, + "grad_norm": 0.19094043609247188, + "learning_rate": 8.952930060887452e-05, + "loss": 3.0248, + "step": 13957 + }, + { + "epoch": 0.8664721584207585, + "grad_norm": 0.20866221258086184, + "learning_rate": 8.952708898528648e-05, + "loss": 3.0152, + "step": 13958 + }, + { + "epoch": 0.8665342355205164, + "grad_norm": 0.18488234931957095, + "learning_rate": 8.9524877155476e-05, + "loss": 3.0049, + "step": 13959 + }, + { + "epoch": 0.8665963126202744, + "grad_norm": 0.222448059258534, + "learning_rate": 8.952266511945464e-05, + "loss": 3.1265, + "step": 13960 + }, + { + "epoch": 0.8666583897200323, + "grad_norm": 0.19394809033266758, + "learning_rate": 8.952045287723396e-05, + "loss": 3.0467, + "step": 13961 + }, + { + "epoch": 0.8667204668197902, + "grad_norm": 0.17415542007445678, + "learning_rate": 8.951824042882547e-05, + "loss": 3.0744, + "step": 13962 + }, + { + "epoch": 0.8667825439195481, + "grad_norm": 0.2087408322331305, + "learning_rate": 8.951602777424073e-05, + "loss": 2.989, + "step": 13963 + }, + { + "epoch": 0.8668446210193059, + "grad_norm": 0.19349540395354795, + "learning_rate": 8.95138149134913e-05, + "loss": 2.988, + "step": 13964 + }, + { + "epoch": 0.8669066981190638, + "grad_norm": 0.18023638235957726, + "learning_rate": 8.951160184658867e-05, + "loss": 3.0365, + "step": 13965 + }, + { + "epoch": 0.8669687752188218, + "grad_norm": 0.18860157590872126, + "learning_rate": 8.950938857354443e-05, + "loss": 2.9568, + "step": 13966 + }, + { + "epoch": 0.8670308523185797, + "grad_norm": 0.19421544005442212, + "learning_rate": 8.950717509437014e-05, + "loss": 2.979, + "step": 13967 + }, + { + "epoch": 0.8670929294183376, + "grad_norm": 0.18629602479632706, + "learning_rate": 8.950496140907731e-05, + "loss": 3.0019, + "step": 13968 + }, + { + "epoch": 0.8671550065180955, + "grad_norm": 0.2283203317529623, + "learning_rate": 8.950274751767752e-05, + "loss": 2.9417, + "step": 13969 + }, + { + "epoch": 0.8672170836178533, + "grad_norm": 0.21889576142783318, + "learning_rate": 8.950053342018229e-05, + "loss": 3.0088, + "step": 13970 + }, + { + "epoch": 0.8672791607176112, + "grad_norm": 0.1730245866479416, + "learning_rate": 8.94983191166032e-05, + "loss": 3.0232, + "step": 13971 + }, + { + "epoch": 0.8673412378173692, + "grad_norm": 0.19807347978593287, + "learning_rate": 8.949610460695177e-05, + "loss": 3.0185, + "step": 13972 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.17220696988255568, + "learning_rate": 8.949388989123959e-05, + "loss": 3.059, + "step": 13973 + }, + { + "epoch": 0.867465392016885, + "grad_norm": 0.16693049958988865, + "learning_rate": 8.94916749694782e-05, + "loss": 2.9072, + "step": 13974 + }, + { + "epoch": 0.8675274691166429, + "grad_norm": 0.2861569625505685, + "learning_rate": 8.948945984167915e-05, + "loss": 3.079, + "step": 13975 + }, + { + "epoch": 0.8675895462164007, + "grad_norm": 0.21408568068801695, + "learning_rate": 8.948724450785399e-05, + "loss": 2.9697, + "step": 13976 + }, + { + "epoch": 0.8676516233161586, + "grad_norm": 0.24668302073155948, + "learning_rate": 8.948502896801432e-05, + "loss": 2.9677, + "step": 13977 + }, + { + "epoch": 0.8677137004159166, + "grad_norm": 0.18903204735707416, + "learning_rate": 8.948281322217165e-05, + "loss": 2.9212, + "step": 13978 + }, + { + "epoch": 0.8677757775156745, + "grad_norm": 0.29783073727706066, + "learning_rate": 8.948059727033756e-05, + "loss": 3.0409, + "step": 13979 + }, + { + "epoch": 0.8678378546154324, + "grad_norm": 0.23598983291621758, + "learning_rate": 8.947838111252362e-05, + "loss": 2.982, + "step": 13980 + }, + { + "epoch": 0.8678999317151903, + "grad_norm": 0.22371107506085858, + "learning_rate": 8.947616474874136e-05, + "loss": 3.1022, + "step": 13981 + }, + { + "epoch": 0.8679620088149481, + "grad_norm": 0.17980024698174712, + "learning_rate": 8.947394817900238e-05, + "loss": 2.9693, + "step": 13982 + }, + { + "epoch": 0.868024085914706, + "grad_norm": 0.19016067871164907, + "learning_rate": 8.947173140331822e-05, + "loss": 3.0678, + "step": 13983 + }, + { + "epoch": 0.868086163014464, + "grad_norm": 0.21536322127760635, + "learning_rate": 8.946951442170048e-05, + "loss": 2.9695, + "step": 13984 + }, + { + "epoch": 0.8681482401142219, + "grad_norm": 0.16546574574553846, + "learning_rate": 8.946729723416067e-05, + "loss": 2.9906, + "step": 13985 + }, + { + "epoch": 0.8682103172139798, + "grad_norm": 0.20345413611696703, + "learning_rate": 8.946507984071042e-05, + "loss": 3.0655, + "step": 13986 + }, + { + "epoch": 0.8682723943137377, + "grad_norm": 0.16685517027188682, + "learning_rate": 8.946286224136124e-05, + "loss": 3.0165, + "step": 13987 + }, + { + "epoch": 0.8683344714134955, + "grad_norm": 0.34441201445533154, + "learning_rate": 8.946064443612474e-05, + "loss": 2.9371, + "step": 13988 + }, + { + "epoch": 0.8683965485132534, + "grad_norm": 0.19549339500413795, + "learning_rate": 8.945842642501247e-05, + "loss": 2.9539, + "step": 13989 + }, + { + "epoch": 0.8684586256130113, + "grad_norm": 0.16960102240836536, + "learning_rate": 8.945620820803604e-05, + "loss": 2.9809, + "step": 13990 + }, + { + "epoch": 0.8685207027127693, + "grad_norm": 0.1777961120696377, + "learning_rate": 8.945398978520696e-05, + "loss": 3.0822, + "step": 13991 + }, + { + "epoch": 0.8685827798125272, + "grad_norm": 0.2011814115171256, + "learning_rate": 8.945177115653684e-05, + "loss": 2.9492, + "step": 13992 + }, + { + "epoch": 0.8686448569122851, + "grad_norm": 0.17901752834235513, + "learning_rate": 8.944955232203727e-05, + "loss": 3.0432, + "step": 13993 + }, + { + "epoch": 0.8687069340120429, + "grad_norm": 0.20889917758277132, + "learning_rate": 8.944733328171981e-05, + "loss": 3.0291, + "step": 13994 + }, + { + "epoch": 0.8687690111118008, + "grad_norm": 0.19170833610660964, + "learning_rate": 8.944511403559602e-05, + "loss": 3.1132, + "step": 13995 + }, + { + "epoch": 0.8688310882115587, + "grad_norm": 0.1863388526275514, + "learning_rate": 8.944289458367751e-05, + "loss": 2.9385, + "step": 13996 + }, + { + "epoch": 0.8688931653113167, + "grad_norm": 0.1739095272242107, + "learning_rate": 8.944067492597584e-05, + "loss": 3.042, + "step": 13997 + }, + { + "epoch": 0.8689552424110746, + "grad_norm": 0.27792129021382167, + "learning_rate": 8.943845506250258e-05, + "loss": 2.9232, + "step": 13998 + }, + { + "epoch": 0.8690173195108325, + "grad_norm": 0.22054871077656918, + "learning_rate": 8.943623499326934e-05, + "loss": 2.9441, + "step": 13999 + }, + { + "epoch": 0.8690793966105903, + "grad_norm": 0.2058540198771459, + "learning_rate": 8.943401471828768e-05, + "loss": 3.1141, + "step": 14000 + }, + { + "epoch": 0.8691414737103482, + "grad_norm": 0.19863189014381955, + "learning_rate": 8.94317942375692e-05, + "loss": 2.9856, + "step": 14001 + }, + { + "epoch": 0.8692035508101061, + "grad_norm": 0.22638811602213513, + "learning_rate": 8.942957355112547e-05, + "loss": 3.0465, + "step": 14002 + }, + { + "epoch": 0.8692656279098641, + "grad_norm": 0.2216182657257885, + "learning_rate": 8.942735265896809e-05, + "loss": 2.9614, + "step": 14003 + }, + { + "epoch": 0.869327705009622, + "grad_norm": 0.18137023653732923, + "learning_rate": 8.942513156110864e-05, + "loss": 3.0549, + "step": 14004 + }, + { + "epoch": 0.8693897821093799, + "grad_norm": 0.1953730966026201, + "learning_rate": 8.942291025755869e-05, + "loss": 2.9603, + "step": 14005 + }, + { + "epoch": 0.8694518592091377, + "grad_norm": 0.17254793580003863, + "learning_rate": 8.942068874832987e-05, + "loss": 3.0425, + "step": 14006 + }, + { + "epoch": 0.8695139363088956, + "grad_norm": 0.25738472561943726, + "learning_rate": 8.941846703343373e-05, + "loss": 2.9564, + "step": 14007 + }, + { + "epoch": 0.8695760134086535, + "grad_norm": 0.3036692060152596, + "learning_rate": 8.94162451128819e-05, + "loss": 3.0638, + "step": 14008 + }, + { + "epoch": 0.8696380905084115, + "grad_norm": 0.18513045513569304, + "learning_rate": 8.941402298668593e-05, + "loss": 3.0368, + "step": 14009 + }, + { + "epoch": 0.8697001676081694, + "grad_norm": 0.19209017499033107, + "learning_rate": 8.941180065485744e-05, + "loss": 3.1048, + "step": 14010 + }, + { + "epoch": 0.8697622447079273, + "grad_norm": 0.21350986643233916, + "learning_rate": 8.940957811740803e-05, + "loss": 2.9522, + "step": 14011 + }, + { + "epoch": 0.8698243218076851, + "grad_norm": 0.19032660408561664, + "learning_rate": 8.940735537434926e-05, + "loss": 2.9744, + "step": 14012 + }, + { + "epoch": 0.869886398907443, + "grad_norm": 0.37584422370980386, + "learning_rate": 8.940513242569277e-05, + "loss": 2.9163, + "step": 14013 + }, + { + "epoch": 0.8699484760072009, + "grad_norm": 0.20793313490284943, + "learning_rate": 8.940290927145013e-05, + "loss": 3.0054, + "step": 14014 + }, + { + "epoch": 0.8700105531069589, + "grad_norm": 0.20588530679344863, + "learning_rate": 8.940068591163296e-05, + "loss": 3.0368, + "step": 14015 + }, + { + "epoch": 0.8700726302067168, + "grad_norm": 0.22299214174233875, + "learning_rate": 8.939846234625282e-05, + "loss": 3.0241, + "step": 14016 + }, + { + "epoch": 0.8701347073064747, + "grad_norm": 0.17870626808278214, + "learning_rate": 8.939623857532136e-05, + "loss": 2.9405, + "step": 14017 + }, + { + "epoch": 0.8701967844062325, + "grad_norm": 0.24039720428954808, + "learning_rate": 8.939401459885016e-05, + "loss": 3.105, + "step": 14018 + }, + { + "epoch": 0.8702588615059904, + "grad_norm": 0.16045852954524462, + "learning_rate": 8.939179041685081e-05, + "loss": 2.9915, + "step": 14019 + }, + { + "epoch": 0.8703209386057483, + "grad_norm": 0.24350149802093765, + "learning_rate": 8.938956602933493e-05, + "loss": 3.0715, + "step": 14020 + }, + { + "epoch": 0.8703830157055062, + "grad_norm": 0.18584752069473895, + "learning_rate": 8.938734143631412e-05, + "loss": 2.9083, + "step": 14021 + }, + { + "epoch": 0.8704450928052642, + "grad_norm": 0.280591445373832, + "learning_rate": 8.93851166378e-05, + "loss": 3.0072, + "step": 14022 + }, + { + "epoch": 0.8705071699050221, + "grad_norm": 0.2010207488159019, + "learning_rate": 8.938289163380415e-05, + "loss": 3.0458, + "step": 14023 + }, + { + "epoch": 0.8705692470047799, + "grad_norm": 0.25633271673357366, + "learning_rate": 8.93806664243382e-05, + "loss": 2.9808, + "step": 14024 + }, + { + "epoch": 0.8706313241045378, + "grad_norm": 0.24923806758381503, + "learning_rate": 8.937844100941376e-05, + "loss": 3.0948, + "step": 14025 + }, + { + "epoch": 0.8706934012042957, + "grad_norm": 0.22552480323386692, + "learning_rate": 8.937621538904242e-05, + "loss": 3.0915, + "step": 14026 + }, + { + "epoch": 0.8707554783040536, + "grad_norm": 0.30044489793138573, + "learning_rate": 8.937398956323581e-05, + "loss": 3.067, + "step": 14027 + }, + { + "epoch": 0.8708175554038116, + "grad_norm": 0.28216079420789936, + "learning_rate": 8.937176353200556e-05, + "loss": 2.9605, + "step": 14028 + }, + { + "epoch": 0.8708796325035695, + "grad_norm": 0.21352481000164025, + "learning_rate": 8.936953729536323e-05, + "loss": 3.0492, + "step": 14029 + }, + { + "epoch": 0.8709417096033273, + "grad_norm": 0.24541563125860405, + "learning_rate": 8.936731085332049e-05, + "loss": 3.009, + "step": 14030 + }, + { + "epoch": 0.8710037867030852, + "grad_norm": 0.30564756013728567, + "learning_rate": 8.936508420588892e-05, + "loss": 2.9733, + "step": 14031 + }, + { + "epoch": 0.8710658638028431, + "grad_norm": 0.3735306882902722, + "learning_rate": 8.936285735308016e-05, + "loss": 3.0601, + "step": 14032 + }, + { + "epoch": 0.871127940902601, + "grad_norm": 0.2532748955900839, + "learning_rate": 8.93606302949058e-05, + "loss": 3.0498, + "step": 14033 + }, + { + "epoch": 0.871190018002359, + "grad_norm": 0.19874438675265257, + "learning_rate": 8.935840303137748e-05, + "loss": 2.9497, + "step": 14034 + }, + { + "epoch": 0.8712520951021169, + "grad_norm": 0.27640556096001323, + "learning_rate": 8.935617556250684e-05, + "loss": 3.0531, + "step": 14035 + }, + { + "epoch": 0.8713141722018747, + "grad_norm": 0.21123373458531586, + "learning_rate": 8.935394788830547e-05, + "loss": 3.0155, + "step": 14036 + }, + { + "epoch": 0.8713762493016326, + "grad_norm": 0.20858706381436656, + "learning_rate": 8.935172000878498e-05, + "loss": 2.9643, + "step": 14037 + }, + { + "epoch": 0.8714383264013905, + "grad_norm": 0.19944737909858573, + "learning_rate": 8.934949192395703e-05, + "loss": 2.9728, + "step": 14038 + }, + { + "epoch": 0.8715004035011484, + "grad_norm": 0.23349115293884162, + "learning_rate": 8.934726363383323e-05, + "loss": 3.0143, + "step": 14039 + }, + { + "epoch": 0.8715624806009064, + "grad_norm": 0.20204877930760715, + "learning_rate": 8.934503513842518e-05, + "loss": 2.9852, + "step": 14040 + }, + { + "epoch": 0.8716245577006643, + "grad_norm": 0.18612446481052616, + "learning_rate": 8.934280643774456e-05, + "loss": 3.0236, + "step": 14041 + }, + { + "epoch": 0.8716866348004221, + "grad_norm": 0.20118392056447734, + "learning_rate": 8.934057753180296e-05, + "loss": 3.025, + "step": 14042 + }, + { + "epoch": 0.87174871190018, + "grad_norm": 0.23433653197902252, + "learning_rate": 8.9338348420612e-05, + "loss": 3.0606, + "step": 14043 + }, + { + "epoch": 0.8718107889999379, + "grad_norm": 0.1784770665168951, + "learning_rate": 8.933611910418334e-05, + "loss": 2.998, + "step": 14044 + }, + { + "epoch": 0.8718728660996958, + "grad_norm": 0.17108138510036364, + "learning_rate": 8.933388958252859e-05, + "loss": 2.9654, + "step": 14045 + }, + { + "epoch": 0.8719349431994537, + "grad_norm": 0.1821685216123728, + "learning_rate": 8.933165985565941e-05, + "loss": 2.9577, + "step": 14046 + }, + { + "epoch": 0.8719970202992117, + "grad_norm": 0.18534609321266896, + "learning_rate": 8.93294299235874e-05, + "loss": 3.0393, + "step": 14047 + }, + { + "epoch": 0.8720590973989695, + "grad_norm": 0.2164721677760081, + "learning_rate": 8.93271997863242e-05, + "loss": 3.0443, + "step": 14048 + }, + { + "epoch": 0.8721211744987274, + "grad_norm": 0.2486777430569168, + "learning_rate": 8.932496944388147e-05, + "loss": 2.9908, + "step": 14049 + }, + { + "epoch": 0.8721832515984853, + "grad_norm": 0.43773991954532054, + "learning_rate": 8.932273889627081e-05, + "loss": 2.9345, + "step": 14050 + }, + { + "epoch": 0.8722453286982432, + "grad_norm": 0.1925130200162975, + "learning_rate": 8.932050814350388e-05, + "loss": 3.0368, + "step": 14051 + }, + { + "epoch": 0.8723074057980011, + "grad_norm": 0.19707724575099733, + "learning_rate": 8.931827718559231e-05, + "loss": 3.0705, + "step": 14052 + }, + { + "epoch": 0.8723694828977591, + "grad_norm": 0.184356242373347, + "learning_rate": 8.931604602254775e-05, + "loss": 3.0334, + "step": 14053 + }, + { + "epoch": 0.8724315599975169, + "grad_norm": 0.18300954722230647, + "learning_rate": 8.931381465438183e-05, + "loss": 3.0849, + "step": 14054 + }, + { + "epoch": 0.8724936370972748, + "grad_norm": 0.24891451529161793, + "learning_rate": 8.931158308110618e-05, + "loss": 3.0005, + "step": 14055 + }, + { + "epoch": 0.8725557141970327, + "grad_norm": 0.20934971400488064, + "learning_rate": 8.930935130273248e-05, + "loss": 3.1315, + "step": 14056 + }, + { + "epoch": 0.8726177912967906, + "grad_norm": 0.24698891637090875, + "learning_rate": 8.930711931927236e-05, + "loss": 2.9794, + "step": 14057 + }, + { + "epoch": 0.8726798683965485, + "grad_norm": 0.2614510861313131, + "learning_rate": 8.930488713073744e-05, + "loss": 2.9337, + "step": 14058 + }, + { + "epoch": 0.8727419454963065, + "grad_norm": 0.1931120321247024, + "learning_rate": 8.930265473713938e-05, + "loss": 2.9908, + "step": 14059 + }, + { + "epoch": 0.8728040225960643, + "grad_norm": 0.1944663219027536, + "learning_rate": 8.930042213848984e-05, + "loss": 3.1152, + "step": 14060 + }, + { + "epoch": 0.8728660996958222, + "grad_norm": 0.20867354013513434, + "learning_rate": 8.929818933480044e-05, + "loss": 3.0707, + "step": 14061 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.18186413948824567, + "learning_rate": 8.929595632608285e-05, + "loss": 3.001, + "step": 14062 + }, + { + "epoch": 0.872990253895338, + "grad_norm": 0.18046787022297753, + "learning_rate": 8.929372311234874e-05, + "loss": 3.0405, + "step": 14063 + }, + { + "epoch": 0.8730523309950959, + "grad_norm": 0.18049248460858222, + "learning_rate": 8.929148969360971e-05, + "loss": 2.9697, + "step": 14064 + }, + { + "epoch": 0.8731144080948539, + "grad_norm": 0.17148131561157082, + "learning_rate": 8.928925606987745e-05, + "loss": 2.9255, + "step": 14065 + }, + { + "epoch": 0.8731764851946117, + "grad_norm": 0.1616140865408855, + "learning_rate": 8.928702224116361e-05, + "loss": 3.0193, + "step": 14066 + }, + { + "epoch": 0.8732385622943696, + "grad_norm": 0.16915805740032164, + "learning_rate": 8.928478820747983e-05, + "loss": 2.9623, + "step": 14067 + }, + { + "epoch": 0.8733006393941275, + "grad_norm": 0.20564870025266696, + "learning_rate": 8.928255396883778e-05, + "loss": 3.0492, + "step": 14068 + }, + { + "epoch": 0.8733627164938854, + "grad_norm": 0.19178540764613144, + "learning_rate": 8.928031952524909e-05, + "loss": 3.0557, + "step": 14069 + }, + { + "epoch": 0.8734247935936433, + "grad_norm": 0.1739136988851647, + "learning_rate": 8.927808487672546e-05, + "loss": 3.0356, + "step": 14070 + }, + { + "epoch": 0.8734868706934013, + "grad_norm": 0.2758619050045235, + "learning_rate": 8.927585002327852e-05, + "loss": 2.9748, + "step": 14071 + }, + { + "epoch": 0.8735489477931591, + "grad_norm": 0.20632668030654458, + "learning_rate": 8.927361496491993e-05, + "loss": 3.0189, + "step": 14072 + }, + { + "epoch": 0.873611024892917, + "grad_norm": 0.21140481259011704, + "learning_rate": 8.927137970166137e-05, + "loss": 2.989, + "step": 14073 + }, + { + "epoch": 0.8736731019926749, + "grad_norm": 0.16937027663334658, + "learning_rate": 8.926914423351447e-05, + "loss": 2.9024, + "step": 14074 + }, + { + "epoch": 0.8737351790924328, + "grad_norm": 0.1756208164507577, + "learning_rate": 8.926690856049092e-05, + "loss": 3.0356, + "step": 14075 + }, + { + "epoch": 0.8737972561921907, + "grad_norm": 0.17558004201766053, + "learning_rate": 8.926467268260237e-05, + "loss": 3.0025, + "step": 14076 + }, + { + "epoch": 0.8738593332919486, + "grad_norm": 0.18343442740458385, + "learning_rate": 8.926243659986051e-05, + "loss": 2.9355, + "step": 14077 + }, + { + "epoch": 0.8739214103917065, + "grad_norm": 0.18806700606254936, + "learning_rate": 8.926020031227697e-05, + "loss": 3.0449, + "step": 14078 + }, + { + "epoch": 0.8739834874914644, + "grad_norm": 0.16648530716368817, + "learning_rate": 8.925796381986344e-05, + "loss": 2.9996, + "step": 14079 + }, + { + "epoch": 0.8740455645912223, + "grad_norm": 0.20019798277027748, + "learning_rate": 8.925572712263157e-05, + "loss": 3.0446, + "step": 14080 + }, + { + "epoch": 0.8741076416909802, + "grad_norm": 0.16768759106574213, + "learning_rate": 8.925349022059306e-05, + "loss": 2.9063, + "step": 14081 + }, + { + "epoch": 0.8741697187907381, + "grad_norm": 0.21888832419146986, + "learning_rate": 8.925125311375954e-05, + "loss": 3.0882, + "step": 14082 + }, + { + "epoch": 0.874231795890496, + "grad_norm": 0.2010265955357257, + "learning_rate": 8.924901580214273e-05, + "loss": 3.0682, + "step": 14083 + }, + { + "epoch": 0.8742938729902539, + "grad_norm": 0.1764208548288882, + "learning_rate": 8.924677828575428e-05, + "loss": 2.9638, + "step": 14084 + }, + { + "epoch": 0.8743559500900118, + "grad_norm": 0.20675435533052988, + "learning_rate": 8.924454056460584e-05, + "loss": 3.0188, + "step": 14085 + }, + { + "epoch": 0.8744180271897697, + "grad_norm": 0.25796231338631753, + "learning_rate": 8.924230263870911e-05, + "loss": 3.0105, + "step": 14086 + }, + { + "epoch": 0.8744801042895276, + "grad_norm": 0.20396357530716153, + "learning_rate": 8.924006450807575e-05, + "loss": 2.9972, + "step": 14087 + }, + { + "epoch": 0.8745421813892855, + "grad_norm": 0.17730236999113225, + "learning_rate": 8.923782617271748e-05, + "loss": 3.0057, + "step": 14088 + }, + { + "epoch": 0.8746042584890434, + "grad_norm": 0.1925453042001319, + "learning_rate": 8.923558763264592e-05, + "loss": 2.919, + "step": 14089 + }, + { + "epoch": 0.8746663355888012, + "grad_norm": 0.2927110683182158, + "learning_rate": 8.923334888787277e-05, + "loss": 2.9468, + "step": 14090 + }, + { + "epoch": 0.8747284126885592, + "grad_norm": 0.4371705858668396, + "learning_rate": 8.923110993840973e-05, + "loss": 3.03, + "step": 14091 + }, + { + "epoch": 0.8747904897883171, + "grad_norm": 0.2007453340846823, + "learning_rate": 8.922887078426844e-05, + "loss": 2.99, + "step": 14092 + }, + { + "epoch": 0.874852566888075, + "grad_norm": 0.25522813752594603, + "learning_rate": 8.922663142546064e-05, + "loss": 3.0222, + "step": 14093 + }, + { + "epoch": 0.8749146439878329, + "grad_norm": 0.2156189285335743, + "learning_rate": 8.922439186199798e-05, + "loss": 3.0046, + "step": 14094 + }, + { + "epoch": 0.8749767210875908, + "grad_norm": 0.3284625519690436, + "learning_rate": 8.922215209389213e-05, + "loss": 3.0472, + "step": 14095 + }, + { + "epoch": 0.8750387981873486, + "grad_norm": 0.2045063494830975, + "learning_rate": 8.92199121211548e-05, + "loss": 3.0135, + "step": 14096 + }, + { + "epoch": 0.8751008752871066, + "grad_norm": 0.2367441676754768, + "learning_rate": 8.921767194379766e-05, + "loss": 3.0557, + "step": 14097 + }, + { + "epoch": 0.8751629523868645, + "grad_norm": 0.22477855116158577, + "learning_rate": 8.92154315618324e-05, + "loss": 2.9652, + "step": 14098 + }, + { + "epoch": 0.8752250294866224, + "grad_norm": 0.27416547871876285, + "learning_rate": 8.921319097527073e-05, + "loss": 2.951, + "step": 14099 + }, + { + "epoch": 0.8752871065863803, + "grad_norm": 0.20310247736555795, + "learning_rate": 8.921095018412432e-05, + "loss": 2.9883, + "step": 14100 + }, + { + "epoch": 0.8753491836861382, + "grad_norm": 0.2434877822251777, + "learning_rate": 8.920870918840485e-05, + "loss": 2.9599, + "step": 14101 + }, + { + "epoch": 0.875411260785896, + "grad_norm": 0.2746385897944981, + "learning_rate": 8.920646798812404e-05, + "loss": 2.9882, + "step": 14102 + }, + { + "epoch": 0.875473337885654, + "grad_norm": 0.23666439880260562, + "learning_rate": 8.920422658329357e-05, + "loss": 2.9197, + "step": 14103 + }, + { + "epoch": 0.8755354149854119, + "grad_norm": 0.18359000628286487, + "learning_rate": 8.920198497392511e-05, + "loss": 3.096, + "step": 14104 + }, + { + "epoch": 0.8755974920851698, + "grad_norm": 0.17181898418026323, + "learning_rate": 8.91997431600304e-05, + "loss": 3.0061, + "step": 14105 + }, + { + "epoch": 0.8756595691849277, + "grad_norm": 0.26536913778323035, + "learning_rate": 8.919750114162111e-05, + "loss": 2.9942, + "step": 14106 + }, + { + "epoch": 0.8757216462846856, + "grad_norm": 0.18396198598130598, + "learning_rate": 8.919525891870894e-05, + "loss": 3.0174, + "step": 14107 + }, + { + "epoch": 0.8757837233844434, + "grad_norm": 0.3108527398677598, + "learning_rate": 8.919301649130558e-05, + "loss": 3.0614, + "step": 14108 + }, + { + "epoch": 0.8758458004842014, + "grad_norm": 0.2180459152550455, + "learning_rate": 8.919077385942276e-05, + "loss": 3.0137, + "step": 14109 + }, + { + "epoch": 0.8759078775839593, + "grad_norm": 0.21793408306179235, + "learning_rate": 8.918853102307214e-05, + "loss": 3.0541, + "step": 14110 + }, + { + "epoch": 0.8759699546837172, + "grad_norm": 0.29811518525016256, + "learning_rate": 8.918628798226544e-05, + "loss": 3.1098, + "step": 14111 + }, + { + "epoch": 0.8760320317834751, + "grad_norm": 0.19764168463254103, + "learning_rate": 8.918404473701438e-05, + "loss": 2.9766, + "step": 14112 + }, + { + "epoch": 0.876094108883233, + "grad_norm": 0.1915038404994698, + "learning_rate": 8.918180128733063e-05, + "loss": 3.0205, + "step": 14113 + }, + { + "epoch": 0.8761561859829908, + "grad_norm": 0.2064119829974263, + "learning_rate": 8.917955763322591e-05, + "loss": 3.0999, + "step": 14114 + }, + { + "epoch": 0.8762182630827487, + "grad_norm": 0.1820188808741042, + "learning_rate": 8.917731377471195e-05, + "loss": 2.8859, + "step": 14115 + }, + { + "epoch": 0.8762803401825067, + "grad_norm": 0.17404927426913996, + "learning_rate": 8.91750697118004e-05, + "loss": 3.0159, + "step": 14116 + }, + { + "epoch": 0.8763424172822646, + "grad_norm": 0.16743589314077384, + "learning_rate": 8.917282544450303e-05, + "loss": 3.0667, + "step": 14117 + }, + { + "epoch": 0.8764044943820225, + "grad_norm": 0.2242554654488789, + "learning_rate": 8.91705809728315e-05, + "loss": 2.9939, + "step": 14118 + }, + { + "epoch": 0.8764665714817804, + "grad_norm": 0.22505623690824336, + "learning_rate": 8.916833629679755e-05, + "loss": 2.997, + "step": 14119 + }, + { + "epoch": 0.8765286485815382, + "grad_norm": 0.17123171325837025, + "learning_rate": 8.916609141641288e-05, + "loss": 3.0499, + "step": 14120 + }, + { + "epoch": 0.8765907256812961, + "grad_norm": 0.21038008260112068, + "learning_rate": 8.91638463316892e-05, + "loss": 3.0068, + "step": 14121 + }, + { + "epoch": 0.8766528027810541, + "grad_norm": 0.1970687866151864, + "learning_rate": 8.916160104263825e-05, + "loss": 3.0793, + "step": 14122 + }, + { + "epoch": 0.876714879880812, + "grad_norm": 0.20278683353061305, + "learning_rate": 8.915935554927168e-05, + "loss": 3.008, + "step": 14123 + }, + { + "epoch": 0.8767769569805699, + "grad_norm": 0.18840582908827586, + "learning_rate": 8.915710985160126e-05, + "loss": 2.9976, + "step": 14124 + }, + { + "epoch": 0.8768390340803278, + "grad_norm": 0.1719501721112565, + "learning_rate": 8.91548639496387e-05, + "loss": 3.0669, + "step": 14125 + }, + { + "epoch": 0.8769011111800856, + "grad_norm": 0.18183657628652966, + "learning_rate": 8.91526178433957e-05, + "loss": 2.9906, + "step": 14126 + }, + { + "epoch": 0.8769631882798435, + "grad_norm": 0.18339014938525527, + "learning_rate": 8.9150371532884e-05, + "loss": 3.0115, + "step": 14127 + }, + { + "epoch": 0.8770252653796015, + "grad_norm": 0.17745319593918044, + "learning_rate": 8.914812501811532e-05, + "loss": 2.9728, + "step": 14128 + }, + { + "epoch": 0.8770873424793594, + "grad_norm": 0.18023794285583378, + "learning_rate": 8.914587829910135e-05, + "loss": 3.005, + "step": 14129 + }, + { + "epoch": 0.8771494195791173, + "grad_norm": 0.1866200082249313, + "learning_rate": 8.914363137585382e-05, + "loss": 3.0172, + "step": 14130 + }, + { + "epoch": 0.8772114966788752, + "grad_norm": 0.1792226355114018, + "learning_rate": 8.914138424838447e-05, + "loss": 2.9577, + "step": 14131 + }, + { + "epoch": 0.877273573778633, + "grad_norm": 0.16002686438001107, + "learning_rate": 8.913913691670503e-05, + "loss": 2.9927, + "step": 14132 + }, + { + "epoch": 0.8773356508783909, + "grad_norm": 0.17246714372365543, + "learning_rate": 8.91368893808272e-05, + "loss": 2.8573, + "step": 14133 + }, + { + "epoch": 0.8773977279781489, + "grad_norm": 0.1541254684993371, + "learning_rate": 8.91346416407627e-05, + "loss": 2.9745, + "step": 14134 + }, + { + "epoch": 0.8774598050779068, + "grad_norm": 0.16402599518960537, + "learning_rate": 8.913239369652331e-05, + "loss": 3.03, + "step": 14135 + }, + { + "epoch": 0.8775218821776647, + "grad_norm": 0.16606220825975124, + "learning_rate": 8.91301455481207e-05, + "loss": 3.0059, + "step": 14136 + }, + { + "epoch": 0.8775839592774226, + "grad_norm": 0.19488093380033006, + "learning_rate": 8.912789719556662e-05, + "loss": 3.0078, + "step": 14137 + }, + { + "epoch": 0.8776460363771804, + "grad_norm": 0.17701409659987286, + "learning_rate": 8.912564863887282e-05, + "loss": 2.9433, + "step": 14138 + }, + { + "epoch": 0.8777081134769383, + "grad_norm": 0.1628793161340717, + "learning_rate": 8.9123399878051e-05, + "loss": 3.048, + "step": 14139 + }, + { + "epoch": 0.8777701905766963, + "grad_norm": 0.18757787716367363, + "learning_rate": 8.91211509131129e-05, + "loss": 3.0232, + "step": 14140 + }, + { + "epoch": 0.8778322676764542, + "grad_norm": 0.19827446886135056, + "learning_rate": 8.911890174407027e-05, + "loss": 3.0879, + "step": 14141 + }, + { + "epoch": 0.8778943447762121, + "grad_norm": 0.20226682430543877, + "learning_rate": 8.911665237093483e-05, + "loss": 3.0287, + "step": 14142 + }, + { + "epoch": 0.87795642187597, + "grad_norm": 0.17346744944621453, + "learning_rate": 8.911440279371832e-05, + "loss": 3.0476, + "step": 14143 + }, + { + "epoch": 0.8780184989757278, + "grad_norm": 0.2363208816931575, + "learning_rate": 8.911215301243246e-05, + "loss": 2.9407, + "step": 14144 + }, + { + "epoch": 0.8780805760754857, + "grad_norm": 0.19333470312371823, + "learning_rate": 8.910990302708901e-05, + "loss": 2.8889, + "step": 14145 + }, + { + "epoch": 0.8781426531752436, + "grad_norm": 0.1976539170651475, + "learning_rate": 8.910765283769972e-05, + "loss": 3.0075, + "step": 14146 + }, + { + "epoch": 0.8782047302750016, + "grad_norm": 0.2333818214279767, + "learning_rate": 8.91054024442763e-05, + "loss": 3.086, + "step": 14147 + }, + { + "epoch": 0.8782668073747595, + "grad_norm": 0.1924651303849824, + "learning_rate": 8.91031518468305e-05, + "loss": 3.0496, + "step": 14148 + }, + { + "epoch": 0.8783288844745174, + "grad_norm": 0.21249905719267437, + "learning_rate": 8.910090104537407e-05, + "loss": 2.9746, + "step": 14149 + }, + { + "epoch": 0.8783909615742752, + "grad_norm": 0.19205947271283205, + "learning_rate": 8.909865003991874e-05, + "loss": 3.0767, + "step": 14150 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.2555599936299182, + "learning_rate": 8.909639883047626e-05, + "loss": 2.9566, + "step": 14151 + }, + { + "epoch": 0.878515115773791, + "grad_norm": 0.15540820696918156, + "learning_rate": 8.909414741705838e-05, + "loss": 2.9964, + "step": 14152 + }, + { + "epoch": 0.878577192873549, + "grad_norm": 0.1924090585378175, + "learning_rate": 8.909189579967683e-05, + "loss": 3.0424, + "step": 14153 + }, + { + "epoch": 0.8786392699733069, + "grad_norm": 0.1702308148295864, + "learning_rate": 8.908964397834339e-05, + "loss": 3.0063, + "step": 14154 + }, + { + "epoch": 0.8787013470730648, + "grad_norm": 0.2145304215728479, + "learning_rate": 8.908739195306978e-05, + "loss": 3.0929, + "step": 14155 + }, + { + "epoch": 0.8787634241728226, + "grad_norm": 0.16918568242262508, + "learning_rate": 8.908513972386776e-05, + "loss": 2.9382, + "step": 14156 + }, + { + "epoch": 0.8788255012725805, + "grad_norm": 0.18942784602826554, + "learning_rate": 8.908288729074908e-05, + "loss": 3.0187, + "step": 14157 + }, + { + "epoch": 0.8788875783723384, + "grad_norm": 0.23688415659365294, + "learning_rate": 8.908063465372547e-05, + "loss": 3.0336, + "step": 14158 + }, + { + "epoch": 0.8789496554720964, + "grad_norm": 0.2003693359250299, + "learning_rate": 8.907838181280872e-05, + "loss": 3.0128, + "step": 14159 + }, + { + "epoch": 0.8790117325718543, + "grad_norm": 0.1985338067470006, + "learning_rate": 8.907612876801056e-05, + "loss": 2.9389, + "step": 14160 + }, + { + "epoch": 0.8790738096716122, + "grad_norm": 0.17482686035356168, + "learning_rate": 8.907387551934277e-05, + "loss": 2.9988, + "step": 14161 + }, + { + "epoch": 0.87913588677137, + "grad_norm": 0.17800556704637047, + "learning_rate": 8.907162206681705e-05, + "loss": 2.969, + "step": 14162 + }, + { + "epoch": 0.8791979638711279, + "grad_norm": 0.2218631036013773, + "learning_rate": 8.906936841044521e-05, + "loss": 3.0998, + "step": 14163 + }, + { + "epoch": 0.8792600409708858, + "grad_norm": 0.18495032129968814, + "learning_rate": 8.9067114550239e-05, + "loss": 2.9199, + "step": 14164 + }, + { + "epoch": 0.8793221180706438, + "grad_norm": 0.2061571616689969, + "learning_rate": 8.906486048621016e-05, + "loss": 3.0429, + "step": 14165 + }, + { + "epoch": 0.8793841951704017, + "grad_norm": 0.19401888647515514, + "learning_rate": 8.906260621837046e-05, + "loss": 3.0193, + "step": 14166 + }, + { + "epoch": 0.8794462722701596, + "grad_norm": 0.17430562966424532, + "learning_rate": 8.906035174673167e-05, + "loss": 3.0099, + "step": 14167 + }, + { + "epoch": 0.8795083493699174, + "grad_norm": 0.20982548980058044, + "learning_rate": 8.905809707130553e-05, + "loss": 2.9485, + "step": 14168 + }, + { + "epoch": 0.8795704264696753, + "grad_norm": 0.16429002062783776, + "learning_rate": 8.905584219210381e-05, + "loss": 3.1038, + "step": 14169 + }, + { + "epoch": 0.8796325035694332, + "grad_norm": 0.18983621307607995, + "learning_rate": 8.905358710913829e-05, + "loss": 2.9736, + "step": 14170 + }, + { + "epoch": 0.8796945806691912, + "grad_norm": 0.23202473506065657, + "learning_rate": 8.905133182242072e-05, + "loss": 2.9816, + "step": 14171 + }, + { + "epoch": 0.8797566577689491, + "grad_norm": 0.23705635106373904, + "learning_rate": 8.904907633196288e-05, + "loss": 2.9974, + "step": 14172 + }, + { + "epoch": 0.879818734868707, + "grad_norm": 0.20076662745089302, + "learning_rate": 8.904682063777652e-05, + "loss": 3.0064, + "step": 14173 + }, + { + "epoch": 0.8798808119684648, + "grad_norm": 0.1915343011976214, + "learning_rate": 8.904456473987343e-05, + "loss": 3.0764, + "step": 14174 + }, + { + "epoch": 0.8799428890682227, + "grad_norm": 0.1872919935553148, + "learning_rate": 8.904230863826535e-05, + "loss": 3.0193, + "step": 14175 + }, + { + "epoch": 0.8800049661679806, + "grad_norm": 0.18602306096363797, + "learning_rate": 8.904005233296408e-05, + "loss": 2.8942, + "step": 14176 + }, + { + "epoch": 0.8800670432677385, + "grad_norm": 0.17725697003247207, + "learning_rate": 8.90377958239814e-05, + "loss": 2.9865, + "step": 14177 + }, + { + "epoch": 0.8801291203674965, + "grad_norm": 0.17343406802725417, + "learning_rate": 8.903553911132903e-05, + "loss": 2.9339, + "step": 14178 + }, + { + "epoch": 0.8801911974672544, + "grad_norm": 0.2180335539873361, + "learning_rate": 8.903328219501879e-05, + "loss": 2.9218, + "step": 14179 + }, + { + "epoch": 0.8802532745670122, + "grad_norm": 0.19501946401045725, + "learning_rate": 8.903102507506244e-05, + "loss": 3.0522, + "step": 14180 + }, + { + "epoch": 0.8803153516667701, + "grad_norm": 0.2196895144276625, + "learning_rate": 8.902876775147175e-05, + "loss": 2.9479, + "step": 14181 + }, + { + "epoch": 0.880377428766528, + "grad_norm": 0.2047227869120559, + "learning_rate": 8.902651022425852e-05, + "loss": 3.0729, + "step": 14182 + }, + { + "epoch": 0.8804395058662859, + "grad_norm": 0.19671970782300544, + "learning_rate": 8.902425249343449e-05, + "loss": 3.0227, + "step": 14183 + }, + { + "epoch": 0.8805015829660439, + "grad_norm": 0.21001112582048506, + "learning_rate": 8.902199455901148e-05, + "loss": 2.9055, + "step": 14184 + }, + { + "epoch": 0.8805636600658018, + "grad_norm": 0.18743499980224468, + "learning_rate": 8.901973642100124e-05, + "loss": 2.9526, + "step": 14185 + }, + { + "epoch": 0.8806257371655596, + "grad_norm": 0.2340382700195913, + "learning_rate": 8.901747807941557e-05, + "loss": 3.0849, + "step": 14186 + }, + { + "epoch": 0.8806878142653175, + "grad_norm": 0.20643668901086115, + "learning_rate": 8.901521953426624e-05, + "loss": 2.9696, + "step": 14187 + }, + { + "epoch": 0.8807498913650754, + "grad_norm": 0.2128061074292467, + "learning_rate": 8.901296078556503e-05, + "loss": 3.0468, + "step": 14188 + }, + { + "epoch": 0.8808119684648333, + "grad_norm": 0.21585670440546031, + "learning_rate": 8.901070183332375e-05, + "loss": 3.0043, + "step": 14189 + }, + { + "epoch": 0.8808740455645913, + "grad_norm": 0.18270605648243096, + "learning_rate": 8.900844267755417e-05, + "loss": 2.9252, + "step": 14190 + }, + { + "epoch": 0.8809361226643492, + "grad_norm": 0.18027040018186644, + "learning_rate": 8.900618331826807e-05, + "loss": 3.0524, + "step": 14191 + }, + { + "epoch": 0.880998199764107, + "grad_norm": 0.18728780660445687, + "learning_rate": 8.900392375547724e-05, + "loss": 2.9702, + "step": 14192 + }, + { + "epoch": 0.8810602768638649, + "grad_norm": 0.1797037220971427, + "learning_rate": 8.900166398919346e-05, + "loss": 3.0275, + "step": 14193 + }, + { + "epoch": 0.8811223539636228, + "grad_norm": 0.24930945034461802, + "learning_rate": 8.899940401942854e-05, + "loss": 2.9717, + "step": 14194 + }, + { + "epoch": 0.8811844310633807, + "grad_norm": 0.22303906375281451, + "learning_rate": 8.899714384619426e-05, + "loss": 2.9725, + "step": 14195 + }, + { + "epoch": 0.8812465081631387, + "grad_norm": 0.20972174393277931, + "learning_rate": 8.899488346950243e-05, + "loss": 3.0071, + "step": 14196 + }, + { + "epoch": 0.8813085852628966, + "grad_norm": 0.20001087531681605, + "learning_rate": 8.899262288936481e-05, + "loss": 2.9908, + "step": 14197 + }, + { + "epoch": 0.8813706623626544, + "grad_norm": 0.19853529848017668, + "learning_rate": 8.899036210579321e-05, + "loss": 2.9701, + "step": 14198 + }, + { + "epoch": 0.8814327394624123, + "grad_norm": 0.17221279437067724, + "learning_rate": 8.898810111879943e-05, + "loss": 2.991, + "step": 14199 + }, + { + "epoch": 0.8814948165621702, + "grad_norm": 0.24686167505978182, + "learning_rate": 8.898583992839527e-05, + "loss": 2.9899, + "step": 14200 + }, + { + "epoch": 0.8815568936619281, + "grad_norm": 0.168073200307945, + "learning_rate": 8.898357853459252e-05, + "loss": 2.9595, + "step": 14201 + }, + { + "epoch": 0.881618970761686, + "grad_norm": 0.17263807701683936, + "learning_rate": 8.898131693740295e-05, + "loss": 3.0566, + "step": 14202 + }, + { + "epoch": 0.881681047861444, + "grad_norm": 0.24570270532229535, + "learning_rate": 8.89790551368384e-05, + "loss": 2.9414, + "step": 14203 + }, + { + "epoch": 0.8817431249612018, + "grad_norm": 0.1820144600553824, + "learning_rate": 8.897679313291068e-05, + "loss": 3.0424, + "step": 14204 + }, + { + "epoch": 0.8818052020609597, + "grad_norm": 0.1748203451720258, + "learning_rate": 8.897453092563154e-05, + "loss": 2.9097, + "step": 14205 + }, + { + "epoch": 0.8818672791607176, + "grad_norm": 0.20922158050616663, + "learning_rate": 8.897226851501281e-05, + "loss": 2.9661, + "step": 14206 + }, + { + "epoch": 0.8819293562604755, + "grad_norm": 0.17686185356591955, + "learning_rate": 8.897000590106631e-05, + "loss": 2.9819, + "step": 14207 + }, + { + "epoch": 0.8819914333602334, + "grad_norm": 0.1690032341512905, + "learning_rate": 8.896774308380382e-05, + "loss": 3.074, + "step": 14208 + }, + { + "epoch": 0.8820535104599914, + "grad_norm": 0.21059715222736425, + "learning_rate": 8.896548006323717e-05, + "loss": 3.0535, + "step": 14209 + }, + { + "epoch": 0.8821155875597492, + "grad_norm": 0.1921387785119969, + "learning_rate": 8.896321683937814e-05, + "loss": 3.0548, + "step": 14210 + }, + { + "epoch": 0.8821776646595071, + "grad_norm": 0.19127793779638694, + "learning_rate": 8.896095341223855e-05, + "loss": 3.025, + "step": 14211 + }, + { + "epoch": 0.882239741759265, + "grad_norm": 0.17850750668886955, + "learning_rate": 8.895868978183019e-05, + "loss": 2.9369, + "step": 14212 + }, + { + "epoch": 0.8823018188590229, + "grad_norm": 0.18077738532746704, + "learning_rate": 8.895642594816492e-05, + "loss": 3.0077, + "step": 14213 + }, + { + "epoch": 0.8823638959587808, + "grad_norm": 0.16243592658827533, + "learning_rate": 8.89541619112545e-05, + "loss": 3.0647, + "step": 14214 + }, + { + "epoch": 0.8824259730585388, + "grad_norm": 0.2319009617386426, + "learning_rate": 8.895189767111077e-05, + "loss": 2.9432, + "step": 14215 + }, + { + "epoch": 0.8824880501582966, + "grad_norm": 0.17095907497938562, + "learning_rate": 8.894963322774553e-05, + "loss": 3.0218, + "step": 14216 + }, + { + "epoch": 0.8825501272580545, + "grad_norm": 0.170089031083589, + "learning_rate": 8.894736858117059e-05, + "loss": 3.0506, + "step": 14217 + }, + { + "epoch": 0.8826122043578124, + "grad_norm": 0.16167921421509857, + "learning_rate": 8.894510373139778e-05, + "loss": 2.9615, + "step": 14218 + }, + { + "epoch": 0.8826742814575703, + "grad_norm": 0.18383408095425952, + "learning_rate": 8.894283867843892e-05, + "loss": 3.0412, + "step": 14219 + }, + { + "epoch": 0.8827363585573282, + "grad_norm": 0.19722769155347253, + "learning_rate": 8.89405734223058e-05, + "loss": 2.9932, + "step": 14220 + }, + { + "epoch": 0.8827984356570862, + "grad_norm": 0.18781705712741792, + "learning_rate": 8.893830796301024e-05, + "loss": 2.9363, + "step": 14221 + }, + { + "epoch": 0.882860512756844, + "grad_norm": 0.18276945691503732, + "learning_rate": 8.893604230056411e-05, + "loss": 3.0815, + "step": 14222 + }, + { + "epoch": 0.8829225898566019, + "grad_norm": 0.17734374621891236, + "learning_rate": 8.893377643497917e-05, + "loss": 2.9967, + "step": 14223 + }, + { + "epoch": 0.8829846669563598, + "grad_norm": 0.19060547369085182, + "learning_rate": 8.893151036626728e-05, + "loss": 3.0293, + "step": 14224 + }, + { + "epoch": 0.8830467440561177, + "grad_norm": 0.18504838739196, + "learning_rate": 8.892924409444026e-05, + "loss": 3.0847, + "step": 14225 + }, + { + "epoch": 0.8831088211558756, + "grad_norm": 0.19211147750876764, + "learning_rate": 8.89269776195099e-05, + "loss": 3.0521, + "step": 14226 + }, + { + "epoch": 0.8831708982556336, + "grad_norm": 0.18870264043395948, + "learning_rate": 8.892471094148807e-05, + "loss": 2.9344, + "step": 14227 + }, + { + "epoch": 0.8832329753553914, + "grad_norm": 0.24834452814806132, + "learning_rate": 8.892244406038655e-05, + "loss": 3.0011, + "step": 14228 + }, + { + "epoch": 0.8832950524551493, + "grad_norm": 0.24735993911286552, + "learning_rate": 8.89201769762172e-05, + "loss": 2.9988, + "step": 14229 + }, + { + "epoch": 0.8833571295549072, + "grad_norm": 0.19856920189634514, + "learning_rate": 8.891790968899185e-05, + "loss": 3.0465, + "step": 14230 + }, + { + "epoch": 0.8834192066546651, + "grad_norm": 0.1845467365688444, + "learning_rate": 8.89156421987223e-05, + "loss": 2.9453, + "step": 14231 + }, + { + "epoch": 0.883481283754423, + "grad_norm": 0.2164293214914471, + "learning_rate": 8.891337450542042e-05, + "loss": 2.9582, + "step": 14232 + }, + { + "epoch": 0.883543360854181, + "grad_norm": 0.19957892265868232, + "learning_rate": 8.8911106609098e-05, + "loss": 3.0391, + "step": 14233 + }, + { + "epoch": 0.8836054379539388, + "grad_norm": 0.18496223497590247, + "learning_rate": 8.89088385097669e-05, + "loss": 2.968, + "step": 14234 + }, + { + "epoch": 0.8836675150536967, + "grad_norm": 0.21837282872935673, + "learning_rate": 8.890657020743893e-05, + "loss": 3.0123, + "step": 14235 + }, + { + "epoch": 0.8837295921534546, + "grad_norm": 0.18251133467147626, + "learning_rate": 8.890430170212596e-05, + "loss": 3.0262, + "step": 14236 + }, + { + "epoch": 0.8837916692532125, + "grad_norm": 0.21721006660408948, + "learning_rate": 8.89020329938398e-05, + "loss": 3.0068, + "step": 14237 + }, + { + "epoch": 0.8838537463529704, + "grad_norm": 0.16368881879881905, + "learning_rate": 8.889976408259226e-05, + "loss": 3.0158, + "step": 14238 + }, + { + "epoch": 0.8839158234527282, + "grad_norm": 0.18975716213548494, + "learning_rate": 8.889749496839524e-05, + "loss": 2.9854, + "step": 14239 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.1611822977084426, + "learning_rate": 8.889522565126053e-05, + "loss": 3.122, + "step": 14240 + }, + { + "epoch": 0.8840399776522441, + "grad_norm": 0.3778208100167793, + "learning_rate": 8.889295613119999e-05, + "loss": 3.0259, + "step": 14241 + }, + { + "epoch": 0.884102054752002, + "grad_norm": 0.18020356747126323, + "learning_rate": 8.889068640822546e-05, + "loss": 3.0749, + "step": 14242 + }, + { + "epoch": 0.8841641318517599, + "grad_norm": 0.22026807391216602, + "learning_rate": 8.888841648234878e-05, + "loss": 3.0022, + "step": 14243 + }, + { + "epoch": 0.8842262089515178, + "grad_norm": 0.2531166416719126, + "learning_rate": 8.888614635358178e-05, + "loss": 2.9634, + "step": 14244 + }, + { + "epoch": 0.8842882860512756, + "grad_norm": 0.22915254986516073, + "learning_rate": 8.888387602193632e-05, + "loss": 3.072, + "step": 14245 + }, + { + "epoch": 0.8843503631510335, + "grad_norm": 0.2688581659304573, + "learning_rate": 8.888160548742424e-05, + "loss": 2.9698, + "step": 14246 + }, + { + "epoch": 0.8844124402507915, + "grad_norm": 0.24707402916325463, + "learning_rate": 8.887933475005739e-05, + "loss": 2.9566, + "step": 14247 + }, + { + "epoch": 0.8844745173505494, + "grad_norm": 0.18581295009267257, + "learning_rate": 8.88770638098476e-05, + "loss": 3.0591, + "step": 14248 + }, + { + "epoch": 0.8845365944503073, + "grad_norm": 0.2600936396180468, + "learning_rate": 8.887479266680673e-05, + "loss": 2.9476, + "step": 14249 + }, + { + "epoch": 0.8845986715500652, + "grad_norm": 0.17226640610290733, + "learning_rate": 8.887252132094666e-05, + "loss": 2.9971, + "step": 14250 + }, + { + "epoch": 0.884660748649823, + "grad_norm": 0.1960473826480786, + "learning_rate": 8.887024977227917e-05, + "loss": 2.993, + "step": 14251 + }, + { + "epoch": 0.8847228257495809, + "grad_norm": 0.26343412630176705, + "learning_rate": 8.886797802081616e-05, + "loss": 3.0131, + "step": 14252 + }, + { + "epoch": 0.8847849028493389, + "grad_norm": 0.18654490641922683, + "learning_rate": 8.88657060665695e-05, + "loss": 3.0637, + "step": 14253 + }, + { + "epoch": 0.8848469799490968, + "grad_norm": 0.2086084674972328, + "learning_rate": 8.886343390955098e-05, + "loss": 3.055, + "step": 14254 + }, + { + "epoch": 0.8849090570488547, + "grad_norm": 0.19533157255509648, + "learning_rate": 8.886116154977252e-05, + "loss": 2.9797, + "step": 14255 + }, + { + "epoch": 0.8849711341486126, + "grad_norm": 0.26202330732505846, + "learning_rate": 8.885888898724594e-05, + "loss": 2.9323, + "step": 14256 + }, + { + "epoch": 0.8850332112483704, + "grad_norm": 0.19844089377497348, + "learning_rate": 8.885661622198309e-05, + "loss": 2.9858, + "step": 14257 + }, + { + "epoch": 0.8850952883481283, + "grad_norm": 0.18490084565223555, + "learning_rate": 8.885434325399585e-05, + "loss": 3.0307, + "step": 14258 + }, + { + "epoch": 0.8851573654478863, + "grad_norm": 0.17329107861210036, + "learning_rate": 8.885207008329607e-05, + "loss": 2.9306, + "step": 14259 + }, + { + "epoch": 0.8852194425476442, + "grad_norm": 0.18394012445475794, + "learning_rate": 8.884979670989561e-05, + "loss": 3.0004, + "step": 14260 + }, + { + "epoch": 0.8852815196474021, + "grad_norm": 0.2034367864423388, + "learning_rate": 8.884752313380632e-05, + "loss": 2.9999, + "step": 14261 + }, + { + "epoch": 0.88534359674716, + "grad_norm": 0.20479446957520833, + "learning_rate": 8.884524935504008e-05, + "loss": 2.9029, + "step": 14262 + }, + { + "epoch": 0.8854056738469178, + "grad_norm": 0.2508095954015795, + "learning_rate": 8.884297537360874e-05, + "loss": 2.9415, + "step": 14263 + }, + { + "epoch": 0.8854677509466757, + "grad_norm": 0.2192530382792962, + "learning_rate": 8.884070118952416e-05, + "loss": 3.0198, + "step": 14264 + }, + { + "epoch": 0.8855298280464337, + "grad_norm": 0.18658453255335153, + "learning_rate": 8.883842680279824e-05, + "loss": 2.9601, + "step": 14265 + }, + { + "epoch": 0.8855919051461916, + "grad_norm": 0.28599912365032193, + "learning_rate": 8.883615221344279e-05, + "loss": 3.0675, + "step": 14266 + }, + { + "epoch": 0.8856539822459495, + "grad_norm": 0.21905485266371752, + "learning_rate": 8.883387742146972e-05, + "loss": 3.0025, + "step": 14267 + }, + { + "epoch": 0.8857160593457074, + "grad_norm": 0.287781985008253, + "learning_rate": 8.883160242689089e-05, + "loss": 3.0047, + "step": 14268 + }, + { + "epoch": 0.8857781364454652, + "grad_norm": 0.22119754504849307, + "learning_rate": 8.882932722971813e-05, + "loss": 2.9528, + "step": 14269 + }, + { + "epoch": 0.8858402135452231, + "grad_norm": 0.2423292483547796, + "learning_rate": 8.882705182996338e-05, + "loss": 2.9634, + "step": 14270 + }, + { + "epoch": 0.885902290644981, + "grad_norm": 0.2597948502470693, + "learning_rate": 8.882477622763846e-05, + "loss": 3.006, + "step": 14271 + }, + { + "epoch": 0.885964367744739, + "grad_norm": 0.26281772299893796, + "learning_rate": 8.882250042275526e-05, + "loss": 3.0617, + "step": 14272 + }, + { + "epoch": 0.8860264448444969, + "grad_norm": 0.26916270006335624, + "learning_rate": 8.882022441532564e-05, + "loss": 3.0008, + "step": 14273 + }, + { + "epoch": 0.8860885219442548, + "grad_norm": 0.20268674485733182, + "learning_rate": 8.881794820536152e-05, + "loss": 2.9303, + "step": 14274 + }, + { + "epoch": 0.8861505990440126, + "grad_norm": 0.2592506035702999, + "learning_rate": 8.881567179287472e-05, + "loss": 3.0053, + "step": 14275 + }, + { + "epoch": 0.8862126761437705, + "grad_norm": 0.20408503451799423, + "learning_rate": 8.881339517787712e-05, + "loss": 2.9164, + "step": 14276 + }, + { + "epoch": 0.8862747532435284, + "grad_norm": 0.31744525001830753, + "learning_rate": 8.881111836038063e-05, + "loss": 2.9232, + "step": 14277 + }, + { + "epoch": 0.8863368303432864, + "grad_norm": 0.29715997420033324, + "learning_rate": 8.880884134039711e-05, + "loss": 3.0161, + "step": 14278 + }, + { + "epoch": 0.8863989074430443, + "grad_norm": 0.24128413923618897, + "learning_rate": 8.880656411793845e-05, + "loss": 3.0395, + "step": 14279 + }, + { + "epoch": 0.8864609845428022, + "grad_norm": 0.3480730879617342, + "learning_rate": 8.880428669301652e-05, + "loss": 3.047, + "step": 14280 + }, + { + "epoch": 0.88652306164256, + "grad_norm": 0.29193806226345853, + "learning_rate": 8.880200906564322e-05, + "loss": 3.0415, + "step": 14281 + }, + { + "epoch": 0.8865851387423179, + "grad_norm": 0.22168776207556987, + "learning_rate": 8.87997312358304e-05, + "loss": 3.0697, + "step": 14282 + }, + { + "epoch": 0.8866472158420758, + "grad_norm": 0.21085955804602757, + "learning_rate": 8.879745320358997e-05, + "loss": 2.9757, + "step": 14283 + }, + { + "epoch": 0.8867092929418338, + "grad_norm": 0.19871413142654074, + "learning_rate": 8.879517496893381e-05, + "loss": 3.0419, + "step": 14284 + }, + { + "epoch": 0.8867713700415917, + "grad_norm": 0.2160492713896294, + "learning_rate": 8.87928965318738e-05, + "loss": 2.9825, + "step": 14285 + }, + { + "epoch": 0.8868334471413496, + "grad_norm": 0.2072625128134575, + "learning_rate": 8.879061789242183e-05, + "loss": 3.0548, + "step": 14286 + }, + { + "epoch": 0.8868955242411074, + "grad_norm": 0.18217891998436886, + "learning_rate": 8.87883390505898e-05, + "loss": 3.0353, + "step": 14287 + }, + { + "epoch": 0.8869576013408653, + "grad_norm": 0.21756099380085758, + "learning_rate": 8.878606000638959e-05, + "loss": 3.0314, + "step": 14288 + }, + { + "epoch": 0.8870196784406232, + "grad_norm": 0.26785155755986406, + "learning_rate": 8.878378075983307e-05, + "loss": 3.0092, + "step": 14289 + }, + { + "epoch": 0.8870817555403812, + "grad_norm": 0.238944646670075, + "learning_rate": 8.878150131093217e-05, + "loss": 3.0053, + "step": 14290 + }, + { + "epoch": 0.8871438326401391, + "grad_norm": 0.1793310008265881, + "learning_rate": 8.877922165969876e-05, + "loss": 3.0197, + "step": 14291 + }, + { + "epoch": 0.887205909739897, + "grad_norm": 0.1760302369697903, + "learning_rate": 8.877694180614473e-05, + "loss": 3.0258, + "step": 14292 + }, + { + "epoch": 0.8872679868396548, + "grad_norm": 0.2660999399743125, + "learning_rate": 8.877466175028197e-05, + "loss": 3.1133, + "step": 14293 + }, + { + "epoch": 0.8873300639394127, + "grad_norm": 0.19434055725434093, + "learning_rate": 8.877238149212241e-05, + "loss": 2.9915, + "step": 14294 + }, + { + "epoch": 0.8873921410391706, + "grad_norm": 0.2075800100721191, + "learning_rate": 8.877010103167792e-05, + "loss": 2.964, + "step": 14295 + }, + { + "epoch": 0.8874542181389286, + "grad_norm": 0.22533449799691174, + "learning_rate": 8.876782036896038e-05, + "loss": 2.9866, + "step": 14296 + }, + { + "epoch": 0.8875162952386865, + "grad_norm": 0.19279853325064286, + "learning_rate": 8.876553950398173e-05, + "loss": 3.0631, + "step": 14297 + }, + { + "epoch": 0.8875783723384444, + "grad_norm": 0.19055696782107637, + "learning_rate": 8.876325843675385e-05, + "loss": 3.0844, + "step": 14298 + }, + { + "epoch": 0.8876404494382022, + "grad_norm": 0.1969709839794793, + "learning_rate": 8.876097716728864e-05, + "loss": 3.0417, + "step": 14299 + }, + { + "epoch": 0.8877025265379601, + "grad_norm": 0.22298039716682644, + "learning_rate": 8.875869569559799e-05, + "loss": 2.9946, + "step": 14300 + }, + { + "epoch": 0.887764603637718, + "grad_norm": 0.18345537449508878, + "learning_rate": 8.875641402169383e-05, + "loss": 3.0092, + "step": 14301 + }, + { + "epoch": 0.887826680737476, + "grad_norm": 0.273184673045099, + "learning_rate": 8.875413214558805e-05, + "loss": 3.0558, + "step": 14302 + }, + { + "epoch": 0.8878887578372339, + "grad_norm": 0.166523291838097, + "learning_rate": 8.875185006729254e-05, + "loss": 3.0655, + "step": 14303 + }, + { + "epoch": 0.8879508349369918, + "grad_norm": 0.16319793578791575, + "learning_rate": 8.874956778681922e-05, + "loss": 3.0419, + "step": 14304 + }, + { + "epoch": 0.8880129120367496, + "grad_norm": 0.16103577437319777, + "learning_rate": 8.874728530418002e-05, + "loss": 2.9434, + "step": 14305 + }, + { + "epoch": 0.8880749891365075, + "grad_norm": 0.1753064578080978, + "learning_rate": 8.874500261938681e-05, + "loss": 2.9287, + "step": 14306 + }, + { + "epoch": 0.8881370662362654, + "grad_norm": 0.1739766632823917, + "learning_rate": 8.874271973245152e-05, + "loss": 3.008, + "step": 14307 + }, + { + "epoch": 0.8881991433360233, + "grad_norm": 0.18671046362330362, + "learning_rate": 8.874043664338605e-05, + "loss": 3.0246, + "step": 14308 + }, + { + "epoch": 0.8882612204357813, + "grad_norm": 0.14989266379846836, + "learning_rate": 8.873815335220232e-05, + "loss": 2.931, + "step": 14309 + }, + { + "epoch": 0.8883232975355392, + "grad_norm": 0.18620913923006216, + "learning_rate": 8.873586985891225e-05, + "loss": 2.9507, + "step": 14310 + }, + { + "epoch": 0.888385374635297, + "grad_norm": 0.16484017210056048, + "learning_rate": 8.873358616352773e-05, + "loss": 2.9712, + "step": 14311 + }, + { + "epoch": 0.8884474517350549, + "grad_norm": 0.17738370571319012, + "learning_rate": 8.87313022660607e-05, + "loss": 3.0244, + "step": 14312 + }, + { + "epoch": 0.8885095288348128, + "grad_norm": 0.15848504954633327, + "learning_rate": 8.872901816652305e-05, + "loss": 3.0342, + "step": 14313 + }, + { + "epoch": 0.8885716059345707, + "grad_norm": 0.21329923665703265, + "learning_rate": 8.872673386492672e-05, + "loss": 2.9106, + "step": 14314 + }, + { + "epoch": 0.8886336830343287, + "grad_norm": 0.20757133286131407, + "learning_rate": 8.872444936128362e-05, + "loss": 3.0195, + "step": 14315 + }, + { + "epoch": 0.8886957601340866, + "grad_norm": 0.1747814887652377, + "learning_rate": 8.872216465560566e-05, + "loss": 2.8263, + "step": 14316 + }, + { + "epoch": 0.8887578372338444, + "grad_norm": 0.16964144421559105, + "learning_rate": 8.871987974790476e-05, + "loss": 2.9842, + "step": 14317 + }, + { + "epoch": 0.8888199143336023, + "grad_norm": 0.17628398106912005, + "learning_rate": 8.871759463819286e-05, + "loss": 2.9119, + "step": 14318 + }, + { + "epoch": 0.8888819914333602, + "grad_norm": 0.20111489646420821, + "learning_rate": 8.871530932648186e-05, + "loss": 3.1059, + "step": 14319 + }, + { + "epoch": 0.8889440685331181, + "grad_norm": 0.18086045390522024, + "learning_rate": 8.871302381278371e-05, + "loss": 3.1293, + "step": 14320 + }, + { + "epoch": 0.889006145632876, + "grad_norm": 0.1731942653497876, + "learning_rate": 8.871073809711031e-05, + "loss": 3.0273, + "step": 14321 + }, + { + "epoch": 0.889068222732634, + "grad_norm": 0.25828924315558094, + "learning_rate": 8.870845217947358e-05, + "loss": 2.9808, + "step": 14322 + }, + { + "epoch": 0.8891302998323918, + "grad_norm": 0.193596607676698, + "learning_rate": 8.870616605988547e-05, + "loss": 2.9854, + "step": 14323 + }, + { + "epoch": 0.8891923769321497, + "grad_norm": 0.173943133294491, + "learning_rate": 8.87038797383579e-05, + "loss": 2.9386, + "step": 14324 + }, + { + "epoch": 0.8892544540319076, + "grad_norm": 0.17620139773334856, + "learning_rate": 8.87015932149028e-05, + "loss": 2.8898, + "step": 14325 + }, + { + "epoch": 0.8893165311316655, + "grad_norm": 0.22423454189181355, + "learning_rate": 8.869930648953206e-05, + "loss": 2.9323, + "step": 14326 + }, + { + "epoch": 0.8893786082314235, + "grad_norm": 0.20914845843459473, + "learning_rate": 8.869701956225767e-05, + "loss": 2.9994, + "step": 14327 + }, + { + "epoch": 0.8894406853311814, + "grad_norm": 0.2663530581892209, + "learning_rate": 8.869473243309151e-05, + "loss": 2.9926, + "step": 14328 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.2388375776252734, + "learning_rate": 8.869244510204558e-05, + "loss": 3.0138, + "step": 14329 + }, + { + "epoch": 0.8895648395306971, + "grad_norm": 0.212885314105223, + "learning_rate": 8.869015756913174e-05, + "loss": 3.0071, + "step": 14330 + }, + { + "epoch": 0.889626916630455, + "grad_norm": 0.25120513244954334, + "learning_rate": 8.868786983436195e-05, + "loss": 2.9976, + "step": 14331 + }, + { + "epoch": 0.8896889937302129, + "grad_norm": 0.311988877488681, + "learning_rate": 8.868558189774818e-05, + "loss": 3.0901, + "step": 14332 + }, + { + "epoch": 0.8897510708299708, + "grad_norm": 0.2108461394083437, + "learning_rate": 8.868329375930232e-05, + "loss": 2.9708, + "step": 14333 + }, + { + "epoch": 0.8898131479297288, + "grad_norm": 0.2621324884311882, + "learning_rate": 8.868100541903633e-05, + "loss": 2.9535, + "step": 14334 + }, + { + "epoch": 0.8898752250294866, + "grad_norm": 0.2154463380273985, + "learning_rate": 8.867871687696214e-05, + "loss": 3.0628, + "step": 14335 + }, + { + "epoch": 0.8899373021292445, + "grad_norm": 0.23040306642403263, + "learning_rate": 8.86764281330917e-05, + "loss": 2.9621, + "step": 14336 + }, + { + "epoch": 0.8899993792290024, + "grad_norm": 0.18211376955677888, + "learning_rate": 8.867413918743694e-05, + "loss": 3.0154, + "step": 14337 + }, + { + "epoch": 0.8900614563287603, + "grad_norm": 0.18689001719951065, + "learning_rate": 8.86718500400098e-05, + "loss": 3.0534, + "step": 14338 + }, + { + "epoch": 0.8901235334285182, + "grad_norm": 0.2526336174277989, + "learning_rate": 8.866956069082224e-05, + "loss": 3.0733, + "step": 14339 + }, + { + "epoch": 0.8901856105282762, + "grad_norm": 0.2108451146451539, + "learning_rate": 8.86672711398862e-05, + "loss": 2.9824, + "step": 14340 + }, + { + "epoch": 0.890247687628034, + "grad_norm": 0.18082789647664668, + "learning_rate": 8.86649813872136e-05, + "loss": 2.9549, + "step": 14341 + }, + { + "epoch": 0.8903097647277919, + "grad_norm": 0.18482207228401715, + "learning_rate": 8.866269143281642e-05, + "loss": 3.0009, + "step": 14342 + }, + { + "epoch": 0.8903718418275498, + "grad_norm": 0.18533497807518493, + "learning_rate": 8.86604012767066e-05, + "loss": 3.0348, + "step": 14343 + }, + { + "epoch": 0.8904339189273077, + "grad_norm": 0.2068461814555343, + "learning_rate": 8.865811091889606e-05, + "loss": 3.0385, + "step": 14344 + }, + { + "epoch": 0.8904959960270656, + "grad_norm": 0.19002002765795742, + "learning_rate": 8.865582035939678e-05, + "loss": 3.0345, + "step": 14345 + }, + { + "epoch": 0.8905580731268236, + "grad_norm": 0.22034525763017931, + "learning_rate": 8.865352959822073e-05, + "loss": 2.96, + "step": 14346 + }, + { + "epoch": 0.8906201502265814, + "grad_norm": 0.1997755068504397, + "learning_rate": 8.865123863537981e-05, + "loss": 2.9556, + "step": 14347 + }, + { + "epoch": 0.8906822273263393, + "grad_norm": 0.2247521900090458, + "learning_rate": 8.864894747088599e-05, + "loss": 3.0083, + "step": 14348 + }, + { + "epoch": 0.8907443044260972, + "grad_norm": 0.25579233237593807, + "learning_rate": 8.864665610475122e-05, + "loss": 3.0524, + "step": 14349 + }, + { + "epoch": 0.8908063815258551, + "grad_norm": 0.2001217404528379, + "learning_rate": 8.864436453698748e-05, + "loss": 2.9949, + "step": 14350 + }, + { + "epoch": 0.890868458625613, + "grad_norm": 0.2504078101868026, + "learning_rate": 8.86420727676067e-05, + "loss": 2.9895, + "step": 14351 + }, + { + "epoch": 0.890930535725371, + "grad_norm": 0.20980883567060174, + "learning_rate": 8.863978079662086e-05, + "loss": 3.0253, + "step": 14352 + }, + { + "epoch": 0.8909926128251288, + "grad_norm": 0.22290038226859837, + "learning_rate": 8.863748862404189e-05, + "loss": 2.9554, + "step": 14353 + }, + { + "epoch": 0.8910546899248867, + "grad_norm": 0.2386902207411541, + "learning_rate": 8.863519624988176e-05, + "loss": 3.0003, + "step": 14354 + }, + { + "epoch": 0.8911167670246446, + "grad_norm": 0.20253749394338225, + "learning_rate": 8.863290367415245e-05, + "loss": 2.9623, + "step": 14355 + }, + { + "epoch": 0.8911788441244025, + "grad_norm": 0.216200825386695, + "learning_rate": 8.863061089686589e-05, + "loss": 2.9811, + "step": 14356 + }, + { + "epoch": 0.8912409212241604, + "grad_norm": 0.19723357516246479, + "learning_rate": 8.862831791803405e-05, + "loss": 2.9797, + "step": 14357 + }, + { + "epoch": 0.8913029983239183, + "grad_norm": 0.1959516462515573, + "learning_rate": 8.86260247376689e-05, + "loss": 2.9159, + "step": 14358 + }, + { + "epoch": 0.8913650754236762, + "grad_norm": 0.19276465847196378, + "learning_rate": 8.862373135578241e-05, + "loss": 2.9789, + "step": 14359 + }, + { + "epoch": 0.8914271525234341, + "grad_norm": 0.1714852150568448, + "learning_rate": 8.862143777238654e-05, + "loss": 2.9017, + "step": 14360 + }, + { + "epoch": 0.891489229623192, + "grad_norm": 0.1641772867735627, + "learning_rate": 8.861914398749325e-05, + "loss": 3.0107, + "step": 14361 + }, + { + "epoch": 0.8915513067229499, + "grad_norm": 0.19084032240782112, + "learning_rate": 8.86168500011145e-05, + "loss": 3.1096, + "step": 14362 + }, + { + "epoch": 0.8916133838227078, + "grad_norm": 0.18438591399460763, + "learning_rate": 8.861455581326228e-05, + "loss": 2.9172, + "step": 14363 + }, + { + "epoch": 0.8916754609224657, + "grad_norm": 0.15508375638062474, + "learning_rate": 8.861226142394854e-05, + "loss": 3.0095, + "step": 14364 + }, + { + "epoch": 0.8917375380222236, + "grad_norm": 0.16583724817211018, + "learning_rate": 8.860996683318527e-05, + "loss": 3.0261, + "step": 14365 + }, + { + "epoch": 0.8917996151219815, + "grad_norm": 0.17074244873369002, + "learning_rate": 8.860767204098441e-05, + "loss": 2.9752, + "step": 14366 + }, + { + "epoch": 0.8918616922217394, + "grad_norm": 0.18480800279777337, + "learning_rate": 8.860537704735797e-05, + "loss": 2.9999, + "step": 14367 + }, + { + "epoch": 0.8919237693214973, + "grad_norm": 0.15961910515029526, + "learning_rate": 8.86030818523179e-05, + "loss": 2.9332, + "step": 14368 + }, + { + "epoch": 0.8919858464212552, + "grad_norm": 0.1493019418839741, + "learning_rate": 8.86007864558762e-05, + "loss": 2.9679, + "step": 14369 + }, + { + "epoch": 0.8920479235210131, + "grad_norm": 0.1559026254193223, + "learning_rate": 8.85984908580448e-05, + "loss": 2.9715, + "step": 14370 + }, + { + "epoch": 0.892110000620771, + "grad_norm": 0.1734907326325738, + "learning_rate": 8.859619505883572e-05, + "loss": 2.9878, + "step": 14371 + }, + { + "epoch": 0.8921720777205289, + "grad_norm": 0.16343892893704462, + "learning_rate": 8.85938990582609e-05, + "loss": 2.942, + "step": 14372 + }, + { + "epoch": 0.8922341548202868, + "grad_norm": 0.17576089847160062, + "learning_rate": 8.859160285633235e-05, + "loss": 2.9737, + "step": 14373 + }, + { + "epoch": 0.8922962319200447, + "grad_norm": 0.1516261131952561, + "learning_rate": 8.858930645306205e-05, + "loss": 2.9679, + "step": 14374 + }, + { + "epoch": 0.8923583090198026, + "grad_norm": 0.1562990588419152, + "learning_rate": 8.858700984846197e-05, + "loss": 3.0202, + "step": 14375 + }, + { + "epoch": 0.8924203861195605, + "grad_norm": 0.15778495127937275, + "learning_rate": 8.858471304254408e-05, + "loss": 3.063, + "step": 14376 + }, + { + "epoch": 0.8924824632193183, + "grad_norm": 0.23413270506066697, + "learning_rate": 8.858241603532038e-05, + "loss": 2.9489, + "step": 14377 + }, + { + "epoch": 0.8925445403190763, + "grad_norm": 0.20529687205078812, + "learning_rate": 8.858011882680284e-05, + "loss": 2.9998, + "step": 14378 + }, + { + "epoch": 0.8926066174188342, + "grad_norm": 0.18196522002192203, + "learning_rate": 8.857782141700348e-05, + "loss": 2.9212, + "step": 14379 + }, + { + "epoch": 0.8926686945185921, + "grad_norm": 0.165148129060532, + "learning_rate": 8.857552380593424e-05, + "loss": 2.9366, + "step": 14380 + }, + { + "epoch": 0.89273077161835, + "grad_norm": 0.198496012433835, + "learning_rate": 8.857322599360714e-05, + "loss": 3.009, + "step": 14381 + }, + { + "epoch": 0.8927928487181079, + "grad_norm": 0.18409670257790808, + "learning_rate": 8.857092798003415e-05, + "loss": 2.9601, + "step": 14382 + }, + { + "epoch": 0.8928549258178657, + "grad_norm": 0.1842546235731121, + "learning_rate": 8.856862976522727e-05, + "loss": 2.9706, + "step": 14383 + }, + { + "epoch": 0.8929170029176237, + "grad_norm": 0.19205489507354453, + "learning_rate": 8.856633134919848e-05, + "loss": 3.0656, + "step": 14384 + }, + { + "epoch": 0.8929790800173816, + "grad_norm": 0.2278562962069586, + "learning_rate": 8.856403273195976e-05, + "loss": 2.8713, + "step": 14385 + }, + { + "epoch": 0.8930411571171395, + "grad_norm": 0.18712679158743792, + "learning_rate": 8.856173391352314e-05, + "loss": 2.9848, + "step": 14386 + }, + { + "epoch": 0.8931032342168974, + "grad_norm": 0.17193160967130017, + "learning_rate": 8.85594348939006e-05, + "loss": 2.8841, + "step": 14387 + }, + { + "epoch": 0.8931653113166553, + "grad_norm": 0.19159604413624096, + "learning_rate": 8.855713567310411e-05, + "loss": 3.0793, + "step": 14388 + }, + { + "epoch": 0.8932273884164131, + "grad_norm": 0.1755576459944237, + "learning_rate": 8.855483625114569e-05, + "loss": 3.0519, + "step": 14389 + }, + { + "epoch": 0.893289465516171, + "grad_norm": 0.20866811298334026, + "learning_rate": 8.855253662803732e-05, + "loss": 3.0248, + "step": 14390 + }, + { + "epoch": 0.893351542615929, + "grad_norm": 0.21208287033081727, + "learning_rate": 8.855023680379102e-05, + "loss": 3.1038, + "step": 14391 + }, + { + "epoch": 0.8934136197156869, + "grad_norm": 0.23481196360340237, + "learning_rate": 8.854793677841877e-05, + "loss": 2.9305, + "step": 14392 + }, + { + "epoch": 0.8934756968154448, + "grad_norm": 0.1659798864156137, + "learning_rate": 8.854563655193257e-05, + "loss": 3.0761, + "step": 14393 + }, + { + "epoch": 0.8935377739152027, + "grad_norm": 0.19367607267569456, + "learning_rate": 8.854333612434445e-05, + "loss": 2.8089, + "step": 14394 + }, + { + "epoch": 0.8935998510149605, + "grad_norm": 0.21345138191889204, + "learning_rate": 8.854103549566637e-05, + "loss": 3.0389, + "step": 14395 + }, + { + "epoch": 0.8936619281147185, + "grad_norm": 0.1883600101911166, + "learning_rate": 8.853873466591037e-05, + "loss": 2.9848, + "step": 14396 + }, + { + "epoch": 0.8937240052144764, + "grad_norm": 0.18517649978451617, + "learning_rate": 8.853643363508842e-05, + "loss": 2.987, + "step": 14397 + }, + { + "epoch": 0.8937860823142343, + "grad_norm": 0.2135063419354908, + "learning_rate": 8.853413240321253e-05, + "loss": 2.9678, + "step": 14398 + }, + { + "epoch": 0.8938481594139922, + "grad_norm": 0.20006496558953718, + "learning_rate": 8.853183097029474e-05, + "loss": 3.0389, + "step": 14399 + }, + { + "epoch": 0.8939102365137501, + "grad_norm": 0.17990320154336964, + "learning_rate": 8.852952933634702e-05, + "loss": 2.9808, + "step": 14400 + }, + { + "epoch": 0.8939723136135079, + "grad_norm": 0.19069843402111059, + "learning_rate": 8.852722750138138e-05, + "loss": 3.011, + "step": 14401 + }, + { + "epoch": 0.8940343907132658, + "grad_norm": 0.17038143429637576, + "learning_rate": 8.852492546540987e-05, + "loss": 3.0763, + "step": 14402 + }, + { + "epoch": 0.8940964678130238, + "grad_norm": 0.17067757094434555, + "learning_rate": 8.852262322844446e-05, + "loss": 2.968, + "step": 14403 + }, + { + "epoch": 0.8941585449127817, + "grad_norm": 0.2108637539556238, + "learning_rate": 8.852032079049717e-05, + "loss": 2.9111, + "step": 14404 + }, + { + "epoch": 0.8942206220125396, + "grad_norm": 0.16221555426780013, + "learning_rate": 8.851801815158002e-05, + "loss": 2.9264, + "step": 14405 + }, + { + "epoch": 0.8942826991122975, + "grad_norm": 0.21074945403715034, + "learning_rate": 8.851571531170501e-05, + "loss": 2.9786, + "step": 14406 + }, + { + "epoch": 0.8943447762120553, + "grad_norm": 0.1904228893925051, + "learning_rate": 8.851341227088416e-05, + "loss": 2.9654, + "step": 14407 + }, + { + "epoch": 0.8944068533118132, + "grad_norm": 0.16156618620432175, + "learning_rate": 8.851110902912951e-05, + "loss": 2.9467, + "step": 14408 + }, + { + "epoch": 0.8944689304115712, + "grad_norm": 0.16470484214295084, + "learning_rate": 8.850880558645304e-05, + "loss": 3.0576, + "step": 14409 + }, + { + "epoch": 0.8945310075113291, + "grad_norm": 0.18838888312973526, + "learning_rate": 8.850650194286678e-05, + "loss": 3.0113, + "step": 14410 + }, + { + "epoch": 0.894593084611087, + "grad_norm": 0.17056142547474434, + "learning_rate": 8.850419809838277e-05, + "loss": 3.0632, + "step": 14411 + }, + { + "epoch": 0.8946551617108449, + "grad_norm": 0.16389749949091517, + "learning_rate": 8.850189405301299e-05, + "loss": 2.9584, + "step": 14412 + }, + { + "epoch": 0.8947172388106027, + "grad_norm": 0.17740043246843873, + "learning_rate": 8.849958980676949e-05, + "loss": 2.9945, + "step": 14413 + }, + { + "epoch": 0.8947793159103606, + "grad_norm": 0.17362379001308414, + "learning_rate": 8.849728535966429e-05, + "loss": 2.9782, + "step": 14414 + }, + { + "epoch": 0.8948413930101186, + "grad_norm": 0.19096857751393131, + "learning_rate": 8.84949807117094e-05, + "loss": 2.9761, + "step": 14415 + }, + { + "epoch": 0.8949034701098765, + "grad_norm": 0.1727210646012332, + "learning_rate": 8.849267586291685e-05, + "loss": 3.0386, + "step": 14416 + }, + { + "epoch": 0.8949655472096344, + "grad_norm": 0.1874504719328873, + "learning_rate": 8.849037081329866e-05, + "loss": 3.0076, + "step": 14417 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.1741192127068412, + "learning_rate": 8.848806556286687e-05, + "loss": 2.929, + "step": 14418 + }, + { + "epoch": 0.8950897014091501, + "grad_norm": 0.162876359969029, + "learning_rate": 8.84857601116335e-05, + "loss": 2.9875, + "step": 14419 + }, + { + "epoch": 0.895151778508908, + "grad_norm": 0.17829771159782035, + "learning_rate": 8.848345445961057e-05, + "loss": 2.9763, + "step": 14420 + }, + { + "epoch": 0.895213855608666, + "grad_norm": 0.15677225333000544, + "learning_rate": 8.848114860681012e-05, + "loss": 3.0468, + "step": 14421 + }, + { + "epoch": 0.8952759327084239, + "grad_norm": 0.18961223331367025, + "learning_rate": 8.847884255324419e-05, + "loss": 2.9476, + "step": 14422 + }, + { + "epoch": 0.8953380098081818, + "grad_norm": 0.160839715898529, + "learning_rate": 8.847653629892478e-05, + "loss": 3.011, + "step": 14423 + }, + { + "epoch": 0.8954000869079397, + "grad_norm": 0.2063170475786492, + "learning_rate": 8.847422984386395e-05, + "loss": 3.019, + "step": 14424 + }, + { + "epoch": 0.8954621640076975, + "grad_norm": 0.18005695921990497, + "learning_rate": 8.847192318807372e-05, + "loss": 2.9574, + "step": 14425 + }, + { + "epoch": 0.8955242411074554, + "grad_norm": 0.1721767464410846, + "learning_rate": 8.84696163315661e-05, + "loss": 2.998, + "step": 14426 + }, + { + "epoch": 0.8955863182072133, + "grad_norm": 0.19797013317669704, + "learning_rate": 8.846730927435319e-05, + "loss": 3.0233, + "step": 14427 + }, + { + "epoch": 0.8956483953069713, + "grad_norm": 0.18283422088371395, + "learning_rate": 8.846500201644697e-05, + "loss": 2.97, + "step": 14428 + }, + { + "epoch": 0.8957104724067292, + "grad_norm": 0.20652028673949405, + "learning_rate": 8.84626945578595e-05, + "loss": 2.902, + "step": 14429 + }, + { + "epoch": 0.8957725495064871, + "grad_norm": 0.20531251319467161, + "learning_rate": 8.846038689860283e-05, + "loss": 2.9154, + "step": 14430 + }, + { + "epoch": 0.8958346266062449, + "grad_norm": 0.1908037082784598, + "learning_rate": 8.845807903868896e-05, + "loss": 2.8832, + "step": 14431 + }, + { + "epoch": 0.8958967037060028, + "grad_norm": 0.24142591649411144, + "learning_rate": 8.845577097812998e-05, + "loss": 3.009, + "step": 14432 + }, + { + "epoch": 0.8959587808057607, + "grad_norm": 0.22933220664244755, + "learning_rate": 8.845346271693789e-05, + "loss": 3.0384, + "step": 14433 + }, + { + "epoch": 0.8960208579055187, + "grad_norm": 0.1805834100452742, + "learning_rate": 8.845115425512476e-05, + "loss": 2.9654, + "step": 14434 + }, + { + "epoch": 0.8960829350052766, + "grad_norm": 0.21671016780634486, + "learning_rate": 8.84488455927026e-05, + "loss": 3.0525, + "step": 14435 + }, + { + "epoch": 0.8961450121050345, + "grad_norm": 0.22638698060177959, + "learning_rate": 8.844653672968351e-05, + "loss": 2.9996, + "step": 14436 + }, + { + "epoch": 0.8962070892047923, + "grad_norm": 0.1859111823362101, + "learning_rate": 8.844422766607949e-05, + "loss": 2.9563, + "step": 14437 + }, + { + "epoch": 0.8962691663045502, + "grad_norm": 0.23852824089354235, + "learning_rate": 8.844191840190261e-05, + "loss": 2.9935, + "step": 14438 + }, + { + "epoch": 0.8963312434043081, + "grad_norm": 0.19546048280292624, + "learning_rate": 8.843960893716489e-05, + "loss": 2.9901, + "step": 14439 + }, + { + "epoch": 0.8963933205040661, + "grad_norm": 0.1888334818195804, + "learning_rate": 8.843729927187843e-05, + "loss": 2.9539, + "step": 14440 + }, + { + "epoch": 0.896455397603824, + "grad_norm": 0.18487240526481846, + "learning_rate": 8.843498940605524e-05, + "loss": 2.9896, + "step": 14441 + }, + { + "epoch": 0.8965174747035819, + "grad_norm": 0.16857067642317602, + "learning_rate": 8.843267933970735e-05, + "loss": 3.0, + "step": 14442 + }, + { + "epoch": 0.8965795518033397, + "grad_norm": 0.17860115306258834, + "learning_rate": 8.843036907284687e-05, + "loss": 2.9534, + "step": 14443 + }, + { + "epoch": 0.8966416289030976, + "grad_norm": 0.1904307145311097, + "learning_rate": 8.842805860548582e-05, + "loss": 2.9889, + "step": 14444 + }, + { + "epoch": 0.8967037060028555, + "grad_norm": 0.18813587686392189, + "learning_rate": 8.842574793763626e-05, + "loss": 3.0585, + "step": 14445 + }, + { + "epoch": 0.8967657831026135, + "grad_norm": 0.16577034389457473, + "learning_rate": 8.842343706931025e-05, + "loss": 3.0403, + "step": 14446 + }, + { + "epoch": 0.8968278602023714, + "grad_norm": 0.1712375056536408, + "learning_rate": 8.842112600051983e-05, + "loss": 2.8973, + "step": 14447 + }, + { + "epoch": 0.8968899373021293, + "grad_norm": 0.1792638386727401, + "learning_rate": 8.841881473127707e-05, + "loss": 2.9223, + "step": 14448 + }, + { + "epoch": 0.8969520144018871, + "grad_norm": 0.206808303053191, + "learning_rate": 8.841650326159403e-05, + "loss": 3.071, + "step": 14449 + }, + { + "epoch": 0.897014091501645, + "grad_norm": 0.16811854314636418, + "learning_rate": 8.841419159148276e-05, + "loss": 2.9445, + "step": 14450 + }, + { + "epoch": 0.8970761686014029, + "grad_norm": 0.1798023913906293, + "learning_rate": 8.841187972095535e-05, + "loss": 2.9946, + "step": 14451 + }, + { + "epoch": 0.8971382457011609, + "grad_norm": 0.18626280745372176, + "learning_rate": 8.840956765002382e-05, + "loss": 2.9764, + "step": 14452 + }, + { + "epoch": 0.8972003228009188, + "grad_norm": 0.166960863889946, + "learning_rate": 8.840725537870025e-05, + "loss": 3.0199, + "step": 14453 + }, + { + "epoch": 0.8972623999006767, + "grad_norm": 0.1879265621974453, + "learning_rate": 8.84049429069967e-05, + "loss": 2.9941, + "step": 14454 + }, + { + "epoch": 0.8973244770004345, + "grad_norm": 0.18723950569529685, + "learning_rate": 8.840263023492525e-05, + "loss": 3.045, + "step": 14455 + }, + { + "epoch": 0.8973865541001924, + "grad_norm": 0.20856279114853493, + "learning_rate": 8.840031736249797e-05, + "loss": 3.017, + "step": 14456 + }, + { + "epoch": 0.8974486311999503, + "grad_norm": 0.22365057762247514, + "learning_rate": 8.839800428972689e-05, + "loss": 2.9649, + "step": 14457 + }, + { + "epoch": 0.8975107082997082, + "grad_norm": 0.2572174429815679, + "learning_rate": 8.83956910166241e-05, + "loss": 3.0616, + "step": 14458 + }, + { + "epoch": 0.8975727853994662, + "grad_norm": 0.1607806316211846, + "learning_rate": 8.839337754320168e-05, + "loss": 2.9883, + "step": 14459 + }, + { + "epoch": 0.8976348624992241, + "grad_norm": 0.35195220964081986, + "learning_rate": 8.83910638694717e-05, + "loss": 2.9082, + "step": 14460 + }, + { + "epoch": 0.8976969395989819, + "grad_norm": 0.1666640800533644, + "learning_rate": 8.83887499954462e-05, + "loss": 2.9639, + "step": 14461 + }, + { + "epoch": 0.8977590166987398, + "grad_norm": 0.19400239240877, + "learning_rate": 8.838643592113727e-05, + "loss": 3.0556, + "step": 14462 + }, + { + "epoch": 0.8978210937984977, + "grad_norm": 0.1914731414842939, + "learning_rate": 8.838412164655699e-05, + "loss": 2.9283, + "step": 14463 + }, + { + "epoch": 0.8978831708982556, + "grad_norm": 0.15915749986158922, + "learning_rate": 8.838180717171742e-05, + "loss": 2.8797, + "step": 14464 + }, + { + "epoch": 0.8979452479980136, + "grad_norm": 0.22265006776224422, + "learning_rate": 8.837949249663066e-05, + "loss": 2.9871, + "step": 14465 + }, + { + "epoch": 0.8980073250977715, + "grad_norm": 0.18587002972612818, + "learning_rate": 8.837717762130877e-05, + "loss": 3.0094, + "step": 14466 + }, + { + "epoch": 0.8980694021975293, + "grad_norm": 0.2226580653200267, + "learning_rate": 8.837486254576382e-05, + "loss": 2.9418, + "step": 14467 + }, + { + "epoch": 0.8981314792972872, + "grad_norm": 0.17671456519213524, + "learning_rate": 8.83725472700079e-05, + "loss": 2.9597, + "step": 14468 + }, + { + "epoch": 0.8981935563970451, + "grad_norm": 0.16036205327126915, + "learning_rate": 8.837023179405309e-05, + "loss": 3.0574, + "step": 14469 + }, + { + "epoch": 0.898255633496803, + "grad_norm": 0.1723023650074034, + "learning_rate": 8.836791611791146e-05, + "loss": 2.9749, + "step": 14470 + }, + { + "epoch": 0.898317710596561, + "grad_norm": 0.2142060178121305, + "learning_rate": 8.83656002415951e-05, + "loss": 2.9258, + "step": 14471 + }, + { + "epoch": 0.8983797876963189, + "grad_norm": 0.19375751426953455, + "learning_rate": 8.83632841651161e-05, + "loss": 2.9524, + "step": 14472 + }, + { + "epoch": 0.8984418647960767, + "grad_norm": 0.15975730333364166, + "learning_rate": 8.836096788848651e-05, + "loss": 3.044, + "step": 14473 + }, + { + "epoch": 0.8985039418958346, + "grad_norm": 0.1959445834362453, + "learning_rate": 8.835865141171847e-05, + "loss": 2.9156, + "step": 14474 + }, + { + "epoch": 0.8985660189955925, + "grad_norm": 0.17351233805154279, + "learning_rate": 8.8356334734824e-05, + "loss": 2.9734, + "step": 14475 + }, + { + "epoch": 0.8986280960953504, + "grad_norm": 0.16900895413930242, + "learning_rate": 8.835401785781523e-05, + "loss": 2.8651, + "step": 14476 + }, + { + "epoch": 0.8986901731951084, + "grad_norm": 0.22579165569996906, + "learning_rate": 8.835170078070425e-05, + "loss": 3.0646, + "step": 14477 + }, + { + "epoch": 0.8987522502948663, + "grad_norm": 0.2141357808085337, + "learning_rate": 8.834938350350313e-05, + "loss": 2.9771, + "step": 14478 + }, + { + "epoch": 0.8988143273946241, + "grad_norm": 0.22532434695574668, + "learning_rate": 8.834706602622397e-05, + "loss": 2.975, + "step": 14479 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.178603478731582, + "learning_rate": 8.834474834887884e-05, + "loss": 3.0274, + "step": 14480 + }, + { + "epoch": 0.8989384815941399, + "grad_norm": 0.22364369828161992, + "learning_rate": 8.834243047147988e-05, + "loss": 3.0338, + "step": 14481 + }, + { + "epoch": 0.8990005586938978, + "grad_norm": 0.18447166915152877, + "learning_rate": 8.834011239403913e-05, + "loss": 2.9945, + "step": 14482 + }, + { + "epoch": 0.8990626357936558, + "grad_norm": 0.18741202277254632, + "learning_rate": 8.83377941165687e-05, + "loss": 3.0364, + "step": 14483 + }, + { + "epoch": 0.8991247128934137, + "grad_norm": 0.15246561262667876, + "learning_rate": 8.833547563908071e-05, + "loss": 3.016, + "step": 14484 + }, + { + "epoch": 0.8991867899931715, + "grad_norm": 0.1858076170874145, + "learning_rate": 8.833315696158724e-05, + "loss": 2.9744, + "step": 14485 + }, + { + "epoch": 0.8992488670929294, + "grad_norm": 0.1711234763637004, + "learning_rate": 8.833083808410037e-05, + "loss": 2.9259, + "step": 14486 + }, + { + "epoch": 0.8993109441926873, + "grad_norm": 0.19772533634225675, + "learning_rate": 8.832851900663221e-05, + "loss": 2.9864, + "step": 14487 + }, + { + "epoch": 0.8993730212924452, + "grad_norm": 0.2166872176792098, + "learning_rate": 8.832619972919487e-05, + "loss": 3.0077, + "step": 14488 + }, + { + "epoch": 0.8994350983922031, + "grad_norm": 0.19223907833055429, + "learning_rate": 8.832388025180045e-05, + "loss": 2.9945, + "step": 14489 + }, + { + "epoch": 0.8994971754919611, + "grad_norm": 0.18188517525645986, + "learning_rate": 8.832156057446103e-05, + "loss": 2.9943, + "step": 14490 + }, + { + "epoch": 0.8995592525917189, + "grad_norm": 0.1780508284320268, + "learning_rate": 8.831924069718874e-05, + "loss": 2.9864, + "step": 14491 + }, + { + "epoch": 0.8996213296914768, + "grad_norm": 0.21089635553690697, + "learning_rate": 8.831692061999566e-05, + "loss": 2.9713, + "step": 14492 + }, + { + "epoch": 0.8996834067912347, + "grad_norm": 0.18318154344144424, + "learning_rate": 8.83146003428939e-05, + "loss": 3.0578, + "step": 14493 + }, + { + "epoch": 0.8997454838909926, + "grad_norm": 0.16570181639719797, + "learning_rate": 8.831227986589556e-05, + "loss": 2.9228, + "step": 14494 + }, + { + "epoch": 0.8998075609907505, + "grad_norm": 0.20050631301292438, + "learning_rate": 8.830995918901279e-05, + "loss": 2.9657, + "step": 14495 + }, + { + "epoch": 0.8998696380905085, + "grad_norm": 0.16870327518990302, + "learning_rate": 8.830763831225763e-05, + "loss": 2.9701, + "step": 14496 + }, + { + "epoch": 0.8999317151902663, + "grad_norm": 0.16599458673351672, + "learning_rate": 8.830531723564223e-05, + "loss": 2.9904, + "step": 14497 + }, + { + "epoch": 0.8999937922900242, + "grad_norm": 0.15182140250347906, + "learning_rate": 8.830299595917869e-05, + "loss": 2.9121, + "step": 14498 + }, + { + "epoch": 0.9000558693897821, + "grad_norm": 0.20608331382015838, + "learning_rate": 8.830067448287913e-05, + "loss": 3.0453, + "step": 14499 + }, + { + "epoch": 0.90011794648954, + "grad_norm": 0.16365739216605346, + "learning_rate": 8.829835280675564e-05, + "loss": 2.9393, + "step": 14500 + }, + { + "epoch": 0.9001800235892979, + "grad_norm": 0.1779753131342857, + "learning_rate": 8.829603093082035e-05, + "loss": 3.0291, + "step": 14501 + }, + { + "epoch": 0.9002421006890559, + "grad_norm": 0.16707906080686868, + "learning_rate": 8.829370885508538e-05, + "loss": 2.9432, + "step": 14502 + }, + { + "epoch": 0.9003041777888137, + "grad_norm": 0.18877055902492373, + "learning_rate": 8.829138657956283e-05, + "loss": 2.9581, + "step": 14503 + }, + { + "epoch": 0.9003662548885716, + "grad_norm": 0.16446161542708063, + "learning_rate": 8.828906410426482e-05, + "loss": 2.9413, + "step": 14504 + }, + { + "epoch": 0.9004283319883295, + "grad_norm": 0.21118598889411153, + "learning_rate": 8.828674142920346e-05, + "loss": 2.8984, + "step": 14505 + }, + { + "epoch": 0.9004904090880874, + "grad_norm": 0.19487950967362924, + "learning_rate": 8.828441855439089e-05, + "loss": 2.9852, + "step": 14506 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.1950534437681323, + "learning_rate": 8.82820954798392e-05, + "loss": 2.8691, + "step": 14507 + }, + { + "epoch": 0.9006145632876033, + "grad_norm": 0.16167727873897925, + "learning_rate": 8.827977220556054e-05, + "loss": 2.9772, + "step": 14508 + }, + { + "epoch": 0.9006766403873611, + "grad_norm": 0.22115905053496193, + "learning_rate": 8.827744873156701e-05, + "loss": 3.0174, + "step": 14509 + }, + { + "epoch": 0.900738717487119, + "grad_norm": 0.17513630488867818, + "learning_rate": 8.827512505787072e-05, + "loss": 2.9253, + "step": 14510 + }, + { + "epoch": 0.9008007945868769, + "grad_norm": 0.20412363726960528, + "learning_rate": 8.827280118448383e-05, + "loss": 2.9602, + "step": 14511 + }, + { + "epoch": 0.9008628716866348, + "grad_norm": 0.17519080816494, + "learning_rate": 8.827047711141845e-05, + "loss": 3.039, + "step": 14512 + }, + { + "epoch": 0.9009249487863927, + "grad_norm": 0.19111963143614646, + "learning_rate": 8.826815283868668e-05, + "loss": 2.8959, + "step": 14513 + }, + { + "epoch": 0.9009870258861506, + "grad_norm": 0.1902212126641238, + "learning_rate": 8.826582836630068e-05, + "loss": 2.894, + "step": 14514 + }, + { + "epoch": 0.9010491029859085, + "grad_norm": 0.19026284367095664, + "learning_rate": 8.826350369427256e-05, + "loss": 3.0445, + "step": 14515 + }, + { + "epoch": 0.9011111800856664, + "grad_norm": 0.175929954995469, + "learning_rate": 8.826117882261446e-05, + "loss": 3.0405, + "step": 14516 + }, + { + "epoch": 0.9011732571854243, + "grad_norm": 0.17616680565114737, + "learning_rate": 8.825885375133849e-05, + "loss": 2.9745, + "step": 14517 + }, + { + "epoch": 0.9012353342851822, + "grad_norm": 0.184991386509467, + "learning_rate": 8.82565284804568e-05, + "loss": 2.9567, + "step": 14518 + }, + { + "epoch": 0.9012974113849401, + "grad_norm": 0.18594444245913466, + "learning_rate": 8.82542030099815e-05, + "loss": 2.9224, + "step": 14519 + }, + { + "epoch": 0.901359488484698, + "grad_norm": 0.1841168965073398, + "learning_rate": 8.825187733992476e-05, + "loss": 2.9742, + "step": 14520 + }, + { + "epoch": 0.9014215655844559, + "grad_norm": 0.18729875612861674, + "learning_rate": 8.824955147029868e-05, + "loss": 2.9811, + "step": 14521 + }, + { + "epoch": 0.9014836426842138, + "grad_norm": 0.19998816030445649, + "learning_rate": 8.824722540111541e-05, + "loss": 2.9917, + "step": 14522 + }, + { + "epoch": 0.9015457197839717, + "grad_norm": 0.18905739895583318, + "learning_rate": 8.824489913238707e-05, + "loss": 2.9424, + "step": 14523 + }, + { + "epoch": 0.9016077968837296, + "grad_norm": 0.17506565029124835, + "learning_rate": 8.824257266412581e-05, + "loss": 3.0234, + "step": 14524 + }, + { + "epoch": 0.9016698739834875, + "grad_norm": 0.22490893580804905, + "learning_rate": 8.824024599634376e-05, + "loss": 2.9796, + "step": 14525 + }, + { + "epoch": 0.9017319510832454, + "grad_norm": 0.16928415551701725, + "learning_rate": 8.823791912905308e-05, + "loss": 2.9946, + "step": 14526 + }, + { + "epoch": 0.9017940281830032, + "grad_norm": 0.19590840984607588, + "learning_rate": 8.823559206226587e-05, + "loss": 2.9612, + "step": 14527 + }, + { + "epoch": 0.9018561052827612, + "grad_norm": 0.190362967516038, + "learning_rate": 8.82332647959943e-05, + "loss": 3.0113, + "step": 14528 + }, + { + "epoch": 0.9019181823825191, + "grad_norm": 0.1626668673924311, + "learning_rate": 8.823093733025051e-05, + "loss": 2.9846, + "step": 14529 + }, + { + "epoch": 0.901980259482277, + "grad_norm": 0.17965558017334457, + "learning_rate": 8.822860966504665e-05, + "loss": 3.0172, + "step": 14530 + }, + { + "epoch": 0.9020423365820349, + "grad_norm": 0.187795584864255, + "learning_rate": 8.822628180039485e-05, + "loss": 3.0386, + "step": 14531 + }, + { + "epoch": 0.9021044136817928, + "grad_norm": 0.16575000494565711, + "learning_rate": 8.822395373630725e-05, + "loss": 3.0428, + "step": 14532 + }, + { + "epoch": 0.9021664907815506, + "grad_norm": 0.18864369270764417, + "learning_rate": 8.8221625472796e-05, + "loss": 3.0621, + "step": 14533 + }, + { + "epoch": 0.9022285678813086, + "grad_norm": 0.15916237415168535, + "learning_rate": 8.821929700987326e-05, + "loss": 2.9819, + "step": 14534 + }, + { + "epoch": 0.9022906449810665, + "grad_norm": 0.17946683768997693, + "learning_rate": 8.821696834755118e-05, + "loss": 3.0109, + "step": 14535 + }, + { + "epoch": 0.9023527220808244, + "grad_norm": 0.18418275419981037, + "learning_rate": 8.821463948584188e-05, + "loss": 2.9994, + "step": 14536 + }, + { + "epoch": 0.9024147991805823, + "grad_norm": 0.17281608971873544, + "learning_rate": 8.821231042475753e-05, + "loss": 3.0066, + "step": 14537 + }, + { + "epoch": 0.9024768762803402, + "grad_norm": 0.2723641856148779, + "learning_rate": 8.820998116431029e-05, + "loss": 3.0038, + "step": 14538 + }, + { + "epoch": 0.902538953380098, + "grad_norm": 0.1867219358036763, + "learning_rate": 8.82076517045123e-05, + "loss": 2.9907, + "step": 14539 + }, + { + "epoch": 0.902601030479856, + "grad_norm": 0.18450438114071466, + "learning_rate": 8.82053220453757e-05, + "loss": 3.0894, + "step": 14540 + }, + { + "epoch": 0.9026631075796139, + "grad_norm": 0.20761011278473984, + "learning_rate": 8.820299218691269e-05, + "loss": 2.9936, + "step": 14541 + }, + { + "epoch": 0.9027251846793718, + "grad_norm": 0.28086717035036163, + "learning_rate": 8.820066212913536e-05, + "loss": 3.1118, + "step": 14542 + }, + { + "epoch": 0.9027872617791297, + "grad_norm": 0.1983465286548279, + "learning_rate": 8.819833187205593e-05, + "loss": 3.0906, + "step": 14543 + }, + { + "epoch": 0.9028493388788876, + "grad_norm": 0.2114205216597118, + "learning_rate": 8.819600141568652e-05, + "loss": 2.9891, + "step": 14544 + }, + { + "epoch": 0.9029114159786454, + "grad_norm": 0.1912809273078738, + "learning_rate": 8.819367076003931e-05, + "loss": 3.038, + "step": 14545 + }, + { + "epoch": 0.9029734930784034, + "grad_norm": 0.21501965690731784, + "learning_rate": 8.819133990512642e-05, + "loss": 2.9105, + "step": 14546 + }, + { + "epoch": 0.9030355701781613, + "grad_norm": 0.23515991068680656, + "learning_rate": 8.818900885096006e-05, + "loss": 2.9679, + "step": 14547 + }, + { + "epoch": 0.9030976472779192, + "grad_norm": 0.2184458303841385, + "learning_rate": 8.818667759755237e-05, + "loss": 2.9905, + "step": 14548 + }, + { + "epoch": 0.9031597243776771, + "grad_norm": 0.1795204567881128, + "learning_rate": 8.818434614491551e-05, + "loss": 3.0655, + "step": 14549 + }, + { + "epoch": 0.903221801477435, + "grad_norm": 0.20841133127840622, + "learning_rate": 8.818201449306165e-05, + "loss": 2.9685, + "step": 14550 + }, + { + "epoch": 0.9032838785771928, + "grad_norm": 0.19642769012934666, + "learning_rate": 8.817968264200294e-05, + "loss": 3.0481, + "step": 14551 + }, + { + "epoch": 0.9033459556769508, + "grad_norm": 0.19816039892803416, + "learning_rate": 8.817735059175156e-05, + "loss": 2.9267, + "step": 14552 + }, + { + "epoch": 0.9034080327767087, + "grad_norm": 0.15963572138580395, + "learning_rate": 8.817501834231968e-05, + "loss": 3.0175, + "step": 14553 + }, + { + "epoch": 0.9034701098764666, + "grad_norm": 0.2983884506461975, + "learning_rate": 8.817268589371945e-05, + "loss": 3.0117, + "step": 14554 + }, + { + "epoch": 0.9035321869762245, + "grad_norm": 0.19892441754255255, + "learning_rate": 8.817035324596308e-05, + "loss": 2.9856, + "step": 14555 + }, + { + "epoch": 0.9035942640759824, + "grad_norm": 0.18966963283220098, + "learning_rate": 8.816802039906269e-05, + "loss": 2.9765, + "step": 14556 + }, + { + "epoch": 0.9036563411757402, + "grad_norm": 0.17767530025447684, + "learning_rate": 8.816568735303047e-05, + "loss": 2.9823, + "step": 14557 + }, + { + "epoch": 0.9037184182754981, + "grad_norm": 0.17367580254936144, + "learning_rate": 8.816335410787859e-05, + "loss": 2.976, + "step": 14558 + }, + { + "epoch": 0.9037804953752561, + "grad_norm": 0.19139055265558716, + "learning_rate": 8.816102066361924e-05, + "loss": 3.0153, + "step": 14559 + }, + { + "epoch": 0.903842572475014, + "grad_norm": 0.1749364580019126, + "learning_rate": 8.815868702026458e-05, + "loss": 2.9577, + "step": 14560 + }, + { + "epoch": 0.9039046495747719, + "grad_norm": 0.19875729166606423, + "learning_rate": 8.815635317782677e-05, + "loss": 2.9151, + "step": 14561 + }, + { + "epoch": 0.9039667266745298, + "grad_norm": 0.16623039938533404, + "learning_rate": 8.815401913631803e-05, + "loss": 3.0029, + "step": 14562 + }, + { + "epoch": 0.9040288037742876, + "grad_norm": 0.18688750561682335, + "learning_rate": 8.815168489575048e-05, + "loss": 3.0228, + "step": 14563 + }, + { + "epoch": 0.9040908808740455, + "grad_norm": 0.18659926665620427, + "learning_rate": 8.814935045613635e-05, + "loss": 2.992, + "step": 14564 + }, + { + "epoch": 0.9041529579738035, + "grad_norm": 0.1583294364225865, + "learning_rate": 8.814701581748779e-05, + "loss": 2.9796, + "step": 14565 + }, + { + "epoch": 0.9042150350735614, + "grad_norm": 0.1856901318502587, + "learning_rate": 8.814468097981698e-05, + "loss": 2.9576, + "step": 14566 + }, + { + "epoch": 0.9042771121733193, + "grad_norm": 0.18668384544976754, + "learning_rate": 8.814234594313613e-05, + "loss": 3.1053, + "step": 14567 + }, + { + "epoch": 0.9043391892730772, + "grad_norm": 0.17979253932995998, + "learning_rate": 8.814001070745739e-05, + "loss": 2.9663, + "step": 14568 + }, + { + "epoch": 0.904401266372835, + "grad_norm": 0.19590232480148628, + "learning_rate": 8.813767527279295e-05, + "loss": 3.0286, + "step": 14569 + }, + { + "epoch": 0.9044633434725929, + "grad_norm": 0.18925719600612512, + "learning_rate": 8.813533963915501e-05, + "loss": 2.9861, + "step": 14570 + }, + { + "epoch": 0.9045254205723509, + "grad_norm": 0.22295455859385901, + "learning_rate": 8.813300380655574e-05, + "loss": 2.9908, + "step": 14571 + }, + { + "epoch": 0.9045874976721088, + "grad_norm": 0.17894706343831138, + "learning_rate": 8.813066777500734e-05, + "loss": 2.9599, + "step": 14572 + }, + { + "epoch": 0.9046495747718667, + "grad_norm": 0.20878114355216176, + "learning_rate": 8.812833154452199e-05, + "loss": 2.9256, + "step": 14573 + }, + { + "epoch": 0.9047116518716246, + "grad_norm": 0.18635620489404278, + "learning_rate": 8.812599511511185e-05, + "loss": 2.9761, + "step": 14574 + }, + { + "epoch": 0.9047737289713824, + "grad_norm": 0.1894434405339009, + "learning_rate": 8.812365848678917e-05, + "loss": 3.0263, + "step": 14575 + }, + { + "epoch": 0.9048358060711403, + "grad_norm": 0.18503898577117778, + "learning_rate": 8.81213216595661e-05, + "loss": 3.0385, + "step": 14576 + }, + { + "epoch": 0.9048978831708983, + "grad_norm": 0.2084461125448954, + "learning_rate": 8.811898463345482e-05, + "loss": 2.8962, + "step": 14577 + }, + { + "epoch": 0.9049599602706562, + "grad_norm": 0.1872472549709651, + "learning_rate": 8.811664740846757e-05, + "loss": 2.9474, + "step": 14578 + }, + { + "epoch": 0.9050220373704141, + "grad_norm": 0.16555077040996233, + "learning_rate": 8.811430998461651e-05, + "loss": 2.9612, + "step": 14579 + }, + { + "epoch": 0.905084114470172, + "grad_norm": 0.19903320198701072, + "learning_rate": 8.811197236191384e-05, + "loss": 3.1133, + "step": 14580 + }, + { + "epoch": 0.9051461915699298, + "grad_norm": 0.15213341742872155, + "learning_rate": 8.810963454037175e-05, + "loss": 2.9795, + "step": 14581 + }, + { + "epoch": 0.9052082686696877, + "grad_norm": 0.17302578696413817, + "learning_rate": 8.810729652000247e-05, + "loss": 3.0294, + "step": 14582 + }, + { + "epoch": 0.9052703457694457, + "grad_norm": 0.1664086458096393, + "learning_rate": 8.810495830081816e-05, + "loss": 3.0137, + "step": 14583 + }, + { + "epoch": 0.9053324228692036, + "grad_norm": 0.15911560289791957, + "learning_rate": 8.810261988283102e-05, + "loss": 2.981, + "step": 14584 + }, + { + "epoch": 0.9053944999689615, + "grad_norm": 0.16774371508928387, + "learning_rate": 8.810028126605328e-05, + "loss": 2.9934, + "step": 14585 + }, + { + "epoch": 0.9054565770687194, + "grad_norm": 0.16925000404924193, + "learning_rate": 8.809794245049711e-05, + "loss": 2.8642, + "step": 14586 + }, + { + "epoch": 0.9055186541684772, + "grad_norm": 0.16854560845280145, + "learning_rate": 8.809560343617473e-05, + "loss": 3.0384, + "step": 14587 + }, + { + "epoch": 0.9055807312682351, + "grad_norm": 0.1718203013252765, + "learning_rate": 8.809326422309836e-05, + "loss": 3.0189, + "step": 14588 + }, + { + "epoch": 0.905642808367993, + "grad_norm": 0.1574819457306474, + "learning_rate": 8.809092481128015e-05, + "loss": 2.9123, + "step": 14589 + }, + { + "epoch": 0.905704885467751, + "grad_norm": 0.16053390172068172, + "learning_rate": 8.808858520073236e-05, + "loss": 2.9988, + "step": 14590 + }, + { + "epoch": 0.9057669625675089, + "grad_norm": 0.18018813465640438, + "learning_rate": 8.808624539146715e-05, + "loss": 3.0396, + "step": 14591 + }, + { + "epoch": 0.9058290396672668, + "grad_norm": 0.23229737513560117, + "learning_rate": 8.808390538349677e-05, + "loss": 3.0632, + "step": 14592 + }, + { + "epoch": 0.9058911167670246, + "grad_norm": 0.18400845461189047, + "learning_rate": 8.808156517683342e-05, + "loss": 2.974, + "step": 14593 + }, + { + "epoch": 0.9059531938667825, + "grad_norm": 0.16289864138447338, + "learning_rate": 8.807922477148928e-05, + "loss": 2.9703, + "step": 14594 + }, + { + "epoch": 0.9060152709665404, + "grad_norm": 0.15923521910368732, + "learning_rate": 8.80768841674766e-05, + "loss": 2.9596, + "step": 14595 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.22651228677137428, + "learning_rate": 8.807454336480756e-05, + "loss": 2.9735, + "step": 14596 + }, + { + "epoch": 0.9061394251660563, + "grad_norm": 0.23420032508912642, + "learning_rate": 8.807220236349438e-05, + "loss": 2.9838, + "step": 14597 + }, + { + "epoch": 0.9062015022658142, + "grad_norm": 0.21790531253927606, + "learning_rate": 8.806986116354928e-05, + "loss": 2.9518, + "step": 14598 + }, + { + "epoch": 0.906263579365572, + "grad_norm": 0.22429059668576345, + "learning_rate": 8.806751976498448e-05, + "loss": 2.9091, + "step": 14599 + }, + { + "epoch": 0.9063256564653299, + "grad_norm": 0.17963438700329895, + "learning_rate": 8.806517816781219e-05, + "loss": 2.9716, + "step": 14600 + }, + { + "epoch": 0.9063877335650878, + "grad_norm": 0.21118487758029975, + "learning_rate": 8.806283637204462e-05, + "loss": 2.9646, + "step": 14601 + }, + { + "epoch": 0.9064498106648458, + "grad_norm": 0.17881252965756878, + "learning_rate": 8.8060494377694e-05, + "loss": 3.0161, + "step": 14602 + }, + { + "epoch": 0.9065118877646037, + "grad_norm": 0.1749021801716153, + "learning_rate": 8.805815218477252e-05, + "loss": 2.9135, + "step": 14603 + }, + { + "epoch": 0.9065739648643616, + "grad_norm": 0.179834875539104, + "learning_rate": 8.805580979329244e-05, + "loss": 3.0379, + "step": 14604 + }, + { + "epoch": 0.9066360419641194, + "grad_norm": 0.23052006489665436, + "learning_rate": 8.805346720326595e-05, + "loss": 3.0059, + "step": 14605 + }, + { + "epoch": 0.9066981190638773, + "grad_norm": 0.20033285486885646, + "learning_rate": 8.805112441470529e-05, + "loss": 3.0063, + "step": 14606 + }, + { + "epoch": 0.9067601961636352, + "grad_norm": 0.23523613564533305, + "learning_rate": 8.804878142762268e-05, + "loss": 3.0216, + "step": 14607 + }, + { + "epoch": 0.9068222732633932, + "grad_norm": 0.1962593397724402, + "learning_rate": 8.804643824203033e-05, + "loss": 3.0405, + "step": 14608 + }, + { + "epoch": 0.9068843503631511, + "grad_norm": 0.2718819513199588, + "learning_rate": 8.804409485794047e-05, + "loss": 3.0052, + "step": 14609 + }, + { + "epoch": 0.906946427462909, + "grad_norm": 0.17601845419413611, + "learning_rate": 8.804175127536536e-05, + "loss": 2.9227, + "step": 14610 + }, + { + "epoch": 0.9070085045626668, + "grad_norm": 0.23966289405153682, + "learning_rate": 8.803940749431718e-05, + "loss": 2.9794, + "step": 14611 + }, + { + "epoch": 0.9070705816624247, + "grad_norm": 0.20023513956031885, + "learning_rate": 8.803706351480818e-05, + "loss": 3.0005, + "step": 14612 + }, + { + "epoch": 0.9071326587621826, + "grad_norm": 0.1895410196187476, + "learning_rate": 8.803471933685059e-05, + "loss": 3.0566, + "step": 14613 + }, + { + "epoch": 0.9071947358619405, + "grad_norm": 0.18878235001315372, + "learning_rate": 8.803237496045662e-05, + "loss": 2.9737, + "step": 14614 + }, + { + "epoch": 0.9072568129616985, + "grad_norm": 0.2065627897955958, + "learning_rate": 8.803003038563853e-05, + "loss": 2.9988, + "step": 14615 + }, + { + "epoch": 0.9073188900614563, + "grad_norm": 0.187059092684047, + "learning_rate": 8.802768561240854e-05, + "loss": 2.9212, + "step": 14616 + }, + { + "epoch": 0.9073809671612142, + "grad_norm": 0.25390316887966785, + "learning_rate": 8.802534064077887e-05, + "loss": 2.8826, + "step": 14617 + }, + { + "epoch": 0.9074430442609721, + "grad_norm": 0.26491988784013265, + "learning_rate": 8.802299547076178e-05, + "loss": 2.9641, + "step": 14618 + }, + { + "epoch": 0.90750512136073, + "grad_norm": 0.1759005459653374, + "learning_rate": 8.802065010236951e-05, + "loss": 3.0723, + "step": 14619 + }, + { + "epoch": 0.907567198460488, + "grad_norm": 0.1959847991076259, + "learning_rate": 8.801830453561425e-05, + "loss": 2.996, + "step": 14620 + }, + { + "epoch": 0.9076292755602459, + "grad_norm": 0.1648858278817148, + "learning_rate": 8.801595877050827e-05, + "loss": 3.0358, + "step": 14621 + }, + { + "epoch": 0.9076913526600037, + "grad_norm": 0.24632970155501188, + "learning_rate": 8.801361280706381e-05, + "loss": 3.0417, + "step": 14622 + }, + { + "epoch": 0.9077534297597616, + "grad_norm": 0.17259268583121268, + "learning_rate": 8.80112666452931e-05, + "loss": 3.085, + "step": 14623 + }, + { + "epoch": 0.9078155068595195, + "grad_norm": 0.16460145685046873, + "learning_rate": 8.800892028520839e-05, + "loss": 3.0262, + "step": 14624 + }, + { + "epoch": 0.9078775839592774, + "grad_norm": 0.1652616376908867, + "learning_rate": 8.800657372682191e-05, + "loss": 3.0228, + "step": 14625 + }, + { + "epoch": 0.9079396610590353, + "grad_norm": 0.15991588325997722, + "learning_rate": 8.800422697014592e-05, + "loss": 2.9711, + "step": 14626 + }, + { + "epoch": 0.9080017381587933, + "grad_norm": 0.16142947313318207, + "learning_rate": 8.800188001519263e-05, + "loss": 2.8974, + "step": 14627 + }, + { + "epoch": 0.9080638152585511, + "grad_norm": 0.18795933686403368, + "learning_rate": 8.799953286197433e-05, + "loss": 2.9366, + "step": 14628 + }, + { + "epoch": 0.908125892358309, + "grad_norm": 0.22139842301548404, + "learning_rate": 8.799718551050323e-05, + "loss": 3.1431, + "step": 14629 + }, + { + "epoch": 0.9081879694580669, + "grad_norm": 0.18715300837671453, + "learning_rate": 8.799483796079159e-05, + "loss": 2.9361, + "step": 14630 + }, + { + "epoch": 0.9082500465578248, + "grad_norm": 0.1879892225365791, + "learning_rate": 8.799249021285166e-05, + "loss": 2.9248, + "step": 14631 + }, + { + "epoch": 0.9083121236575827, + "grad_norm": 0.17651191488950674, + "learning_rate": 8.799014226669569e-05, + "loss": 2.9338, + "step": 14632 + }, + { + "epoch": 0.9083742007573407, + "grad_norm": 0.23609020931080715, + "learning_rate": 8.79877941223359e-05, + "loss": 3.0897, + "step": 14633 + }, + { + "epoch": 0.9084362778570985, + "grad_norm": 0.1928788581927041, + "learning_rate": 8.798544577978459e-05, + "loss": 3.0678, + "step": 14634 + }, + { + "epoch": 0.9084983549568564, + "grad_norm": 0.22027673295707165, + "learning_rate": 8.798309723905398e-05, + "loss": 2.8782, + "step": 14635 + }, + { + "epoch": 0.9085604320566143, + "grad_norm": 0.21326590192856335, + "learning_rate": 8.798074850015634e-05, + "loss": 3.0135, + "step": 14636 + }, + { + "epoch": 0.9086225091563722, + "grad_norm": 0.19621859182213805, + "learning_rate": 8.797839956310392e-05, + "loss": 3.0259, + "step": 14637 + }, + { + "epoch": 0.9086845862561301, + "grad_norm": 0.2059240695556529, + "learning_rate": 8.797605042790895e-05, + "loss": 3.0404, + "step": 14638 + }, + { + "epoch": 0.908746663355888, + "grad_norm": 0.20685865297298828, + "learning_rate": 8.797370109458372e-05, + "loss": 2.945, + "step": 14639 + }, + { + "epoch": 0.9088087404556459, + "grad_norm": 0.18689113344764866, + "learning_rate": 8.797135156314047e-05, + "loss": 3.0103, + "step": 14640 + }, + { + "epoch": 0.9088708175554038, + "grad_norm": 0.1854372604234856, + "learning_rate": 8.796900183359145e-05, + "loss": 3.0094, + "step": 14641 + }, + { + "epoch": 0.9089328946551617, + "grad_norm": 0.20797316392474874, + "learning_rate": 8.796665190594895e-05, + "loss": 3.0212, + "step": 14642 + }, + { + "epoch": 0.9089949717549196, + "grad_norm": 0.1761513892652396, + "learning_rate": 8.796430178022519e-05, + "loss": 3.0386, + "step": 14643 + }, + { + "epoch": 0.9090570488546775, + "grad_norm": 0.1829839175816883, + "learning_rate": 8.796195145643247e-05, + "loss": 2.9521, + "step": 14644 + }, + { + "epoch": 0.9091191259544354, + "grad_norm": 0.16376412598271725, + "learning_rate": 8.795960093458302e-05, + "loss": 2.967, + "step": 14645 + }, + { + "epoch": 0.9091812030541933, + "grad_norm": 0.17301737919306323, + "learning_rate": 8.795725021468912e-05, + "loss": 2.9706, + "step": 14646 + }, + { + "epoch": 0.9092432801539512, + "grad_norm": 0.33093736758423087, + "learning_rate": 8.795489929676304e-05, + "loss": 2.9998, + "step": 14647 + }, + { + "epoch": 0.9093053572537091, + "grad_norm": 0.23674978518602158, + "learning_rate": 8.795254818081702e-05, + "loss": 2.9582, + "step": 14648 + }, + { + "epoch": 0.909367434353467, + "grad_norm": 0.2225661597445105, + "learning_rate": 8.795019686686334e-05, + "loss": 3.0586, + "step": 14649 + }, + { + "epoch": 0.9094295114532249, + "grad_norm": 0.26630303853042286, + "learning_rate": 8.794784535491429e-05, + "loss": 2.9336, + "step": 14650 + }, + { + "epoch": 0.9094915885529828, + "grad_norm": 0.2469953038641799, + "learning_rate": 8.794549364498211e-05, + "loss": 2.9707, + "step": 14651 + }, + { + "epoch": 0.9095536656527407, + "grad_norm": 0.23014495215879335, + "learning_rate": 8.794314173707909e-05, + "loss": 3.019, + "step": 14652 + }, + { + "epoch": 0.9096157427524986, + "grad_norm": 0.24404730729989776, + "learning_rate": 8.794078963121747e-05, + "loss": 2.9076, + "step": 14653 + }, + { + "epoch": 0.9096778198522565, + "grad_norm": 0.20089739991153194, + "learning_rate": 8.793843732740954e-05, + "loss": 3.0588, + "step": 14654 + }, + { + "epoch": 0.9097398969520144, + "grad_norm": 0.19443130070298403, + "learning_rate": 8.793608482566759e-05, + "loss": 2.9544, + "step": 14655 + }, + { + "epoch": 0.9098019740517723, + "grad_norm": 0.2107693832286368, + "learning_rate": 8.793373212600386e-05, + "loss": 2.9702, + "step": 14656 + }, + { + "epoch": 0.9098640511515302, + "grad_norm": 0.17768750253865886, + "learning_rate": 8.793137922843065e-05, + "loss": 2.9321, + "step": 14657 + }, + { + "epoch": 0.909926128251288, + "grad_norm": 0.22337013542938755, + "learning_rate": 8.792902613296022e-05, + "loss": 2.9654, + "step": 14658 + }, + { + "epoch": 0.909988205351046, + "grad_norm": 0.1891598249992814, + "learning_rate": 8.792667283960486e-05, + "loss": 2.9485, + "step": 14659 + }, + { + "epoch": 0.9100502824508039, + "grad_norm": 0.17657681149739426, + "learning_rate": 8.792431934837684e-05, + "loss": 3.0012, + "step": 14660 + }, + { + "epoch": 0.9101123595505618, + "grad_norm": 0.17934667356972675, + "learning_rate": 8.792196565928843e-05, + "loss": 2.9591, + "step": 14661 + }, + { + "epoch": 0.9101744366503197, + "grad_norm": 0.1870046967863874, + "learning_rate": 8.791961177235193e-05, + "loss": 2.9651, + "step": 14662 + }, + { + "epoch": 0.9102365137500776, + "grad_norm": 0.1814906721627809, + "learning_rate": 8.791725768757961e-05, + "loss": 3.0362, + "step": 14663 + }, + { + "epoch": 0.9102985908498354, + "grad_norm": 0.2212709700619876, + "learning_rate": 8.791490340498374e-05, + "loss": 2.871, + "step": 14664 + }, + { + "epoch": 0.9103606679495934, + "grad_norm": 0.17373154668275229, + "learning_rate": 8.791254892457663e-05, + "loss": 2.9735, + "step": 14665 + }, + { + "epoch": 0.9104227450493513, + "grad_norm": 0.15751255277724055, + "learning_rate": 8.791019424637054e-05, + "loss": 2.9754, + "step": 14666 + }, + { + "epoch": 0.9104848221491092, + "grad_norm": 0.1862501069043, + "learning_rate": 8.790783937037777e-05, + "loss": 2.9526, + "step": 14667 + }, + { + "epoch": 0.9105468992488671, + "grad_norm": 0.15948410997530366, + "learning_rate": 8.790548429661058e-05, + "loss": 3.024, + "step": 14668 + }, + { + "epoch": 0.910608976348625, + "grad_norm": 0.17312425988517685, + "learning_rate": 8.79031290250813e-05, + "loss": 2.9789, + "step": 14669 + }, + { + "epoch": 0.9106710534483828, + "grad_norm": 0.1552475241864479, + "learning_rate": 8.790077355580219e-05, + "loss": 2.9986, + "step": 14670 + }, + { + "epoch": 0.9107331305481408, + "grad_norm": 0.15627367913112525, + "learning_rate": 8.789841788878552e-05, + "loss": 2.9204, + "step": 14671 + }, + { + "epoch": 0.9107952076478987, + "grad_norm": 0.16736689016667164, + "learning_rate": 8.789606202404363e-05, + "loss": 2.983, + "step": 14672 + }, + { + "epoch": 0.9108572847476566, + "grad_norm": 0.4748471924034731, + "learning_rate": 8.78937059615888e-05, + "loss": 2.9303, + "step": 14673 + }, + { + "epoch": 0.9109193618474145, + "grad_norm": 0.20530612623578015, + "learning_rate": 8.789134970143327e-05, + "loss": 2.9758, + "step": 14674 + }, + { + "epoch": 0.9109814389471724, + "grad_norm": 0.18645112398442532, + "learning_rate": 8.78889932435894e-05, + "loss": 2.909, + "step": 14675 + }, + { + "epoch": 0.9110435160469302, + "grad_norm": 0.22888931465646217, + "learning_rate": 8.788663658806941e-05, + "loss": 3.0296, + "step": 14676 + }, + { + "epoch": 0.9111055931466882, + "grad_norm": 0.22102093390530422, + "learning_rate": 8.788427973488569e-05, + "loss": 3.007, + "step": 14677 + }, + { + "epoch": 0.9111676702464461, + "grad_norm": 0.2641854547313597, + "learning_rate": 8.788192268405046e-05, + "loss": 2.9159, + "step": 14678 + }, + { + "epoch": 0.911229747346204, + "grad_norm": 0.18970652475429847, + "learning_rate": 8.787956543557605e-05, + "loss": 2.9179, + "step": 14679 + }, + { + "epoch": 0.9112918244459619, + "grad_norm": 0.199825149338589, + "learning_rate": 8.787720798947474e-05, + "loss": 2.9676, + "step": 14680 + }, + { + "epoch": 0.9113539015457198, + "grad_norm": 0.2848871345981981, + "learning_rate": 8.787485034575886e-05, + "loss": 3.0288, + "step": 14681 + }, + { + "epoch": 0.9114159786454776, + "grad_norm": 0.20043637718228766, + "learning_rate": 8.787249250444067e-05, + "loss": 2.9905, + "step": 14682 + }, + { + "epoch": 0.9114780557452355, + "grad_norm": 0.16857654920409523, + "learning_rate": 8.787013446553251e-05, + "loss": 2.9688, + "step": 14683 + }, + { + "epoch": 0.9115401328449935, + "grad_norm": 0.2029188467146252, + "learning_rate": 8.786777622904666e-05, + "loss": 2.981, + "step": 14684 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.17727686086917177, + "learning_rate": 8.786541779499543e-05, + "loss": 3.0385, + "step": 14685 + }, + { + "epoch": 0.9116642870445093, + "grad_norm": 0.23550424332349326, + "learning_rate": 8.786305916339113e-05, + "loss": 3.0013, + "step": 14686 + }, + { + "epoch": 0.9117263641442672, + "grad_norm": 0.2338198046441837, + "learning_rate": 8.786070033424604e-05, + "loss": 3.0078, + "step": 14687 + }, + { + "epoch": 0.911788441244025, + "grad_norm": 0.17092261007199544, + "learning_rate": 8.78583413075725e-05, + "loss": 2.9946, + "step": 14688 + }, + { + "epoch": 0.911850518343783, + "grad_norm": 0.17593079140003132, + "learning_rate": 8.785598208338279e-05, + "loss": 2.9549, + "step": 14689 + }, + { + "epoch": 0.9119125954435409, + "grad_norm": 0.18320935163510627, + "learning_rate": 8.785362266168925e-05, + "loss": 3.062, + "step": 14690 + }, + { + "epoch": 0.9119746725432988, + "grad_norm": 0.1956317295495257, + "learning_rate": 8.785126304250416e-05, + "loss": 2.8971, + "step": 14691 + }, + { + "epoch": 0.9120367496430567, + "grad_norm": 0.20249889236606444, + "learning_rate": 8.784890322583984e-05, + "loss": 2.9423, + "step": 14692 + }, + { + "epoch": 0.9120988267428146, + "grad_norm": 0.16680950635122577, + "learning_rate": 8.78465432117086e-05, + "loss": 2.9759, + "step": 14693 + }, + { + "epoch": 0.9121609038425724, + "grad_norm": 0.1881141835490159, + "learning_rate": 8.784418300012277e-05, + "loss": 3.0063, + "step": 14694 + }, + { + "epoch": 0.9122229809423303, + "grad_norm": 0.19278881394302677, + "learning_rate": 8.784182259109465e-05, + "loss": 2.957, + "step": 14695 + }, + { + "epoch": 0.9122850580420883, + "grad_norm": 0.22649192532984105, + "learning_rate": 8.783946198463655e-05, + "loss": 2.9798, + "step": 14696 + }, + { + "epoch": 0.9123471351418462, + "grad_norm": 0.19932468253208124, + "learning_rate": 8.783710118076078e-05, + "loss": 3.0184, + "step": 14697 + }, + { + "epoch": 0.9124092122416041, + "grad_norm": 0.2616674315968852, + "learning_rate": 8.783474017947968e-05, + "loss": 3.0102, + "step": 14698 + }, + { + "epoch": 0.912471289341362, + "grad_norm": 0.1819115945371354, + "learning_rate": 8.783237898080557e-05, + "loss": 3.097, + "step": 14699 + }, + { + "epoch": 0.9125333664411198, + "grad_norm": 0.1832647114354573, + "learning_rate": 8.783001758475073e-05, + "loss": 3.0414, + "step": 14700 + }, + { + "epoch": 0.9125954435408777, + "grad_norm": 0.17145201262538465, + "learning_rate": 8.782765599132751e-05, + "loss": 3.0461, + "step": 14701 + }, + { + "epoch": 0.9126575206406357, + "grad_norm": 0.2051248769602851, + "learning_rate": 8.782529420054823e-05, + "loss": 2.946, + "step": 14702 + }, + { + "epoch": 0.9127195977403936, + "grad_norm": 0.1913935501400792, + "learning_rate": 8.782293221242521e-05, + "loss": 2.9987, + "step": 14703 + }, + { + "epoch": 0.9127816748401515, + "grad_norm": 0.15569791413778464, + "learning_rate": 8.782057002697076e-05, + "loss": 2.9964, + "step": 14704 + }, + { + "epoch": 0.9128437519399094, + "grad_norm": 0.34753012780419185, + "learning_rate": 8.781820764419721e-05, + "loss": 2.9404, + "step": 14705 + }, + { + "epoch": 0.9129058290396672, + "grad_norm": 0.1762337370904159, + "learning_rate": 8.781584506411691e-05, + "loss": 3.0941, + "step": 14706 + }, + { + "epoch": 0.9129679061394251, + "grad_norm": 0.17550471063891565, + "learning_rate": 8.781348228674217e-05, + "loss": 3.0287, + "step": 14707 + }, + { + "epoch": 0.913029983239183, + "grad_norm": 0.2662519880901131, + "learning_rate": 8.78111193120853e-05, + "loss": 2.8837, + "step": 14708 + }, + { + "epoch": 0.913092060338941, + "grad_norm": 0.2100294134567419, + "learning_rate": 8.780875614015865e-05, + "loss": 2.9756, + "step": 14709 + }, + { + "epoch": 0.9131541374386989, + "grad_norm": 0.20148759899244653, + "learning_rate": 8.780639277097454e-05, + "loss": 3.0191, + "step": 14710 + }, + { + "epoch": 0.9132162145384568, + "grad_norm": 0.31183345758808406, + "learning_rate": 8.780402920454529e-05, + "loss": 3.0214, + "step": 14711 + }, + { + "epoch": 0.9132782916382146, + "grad_norm": 0.20354275659863472, + "learning_rate": 8.780166544088325e-05, + "loss": 2.9753, + "step": 14712 + }, + { + "epoch": 0.9133403687379725, + "grad_norm": 0.17358394681820147, + "learning_rate": 8.779930148000073e-05, + "loss": 3.0217, + "step": 14713 + }, + { + "epoch": 0.9134024458377304, + "grad_norm": 0.1800472038515652, + "learning_rate": 8.77969373219101e-05, + "loss": 2.9879, + "step": 14714 + }, + { + "epoch": 0.9134645229374884, + "grad_norm": 0.18701726641282443, + "learning_rate": 8.779457296662366e-05, + "loss": 3.0206, + "step": 14715 + }, + { + "epoch": 0.9135266000372463, + "grad_norm": 0.25283693947564195, + "learning_rate": 8.779220841415375e-05, + "loss": 2.9934, + "step": 14716 + }, + { + "epoch": 0.9135886771370042, + "grad_norm": 0.18524375047129318, + "learning_rate": 8.778984366451273e-05, + "loss": 3.0297, + "step": 14717 + }, + { + "epoch": 0.913650754236762, + "grad_norm": 0.18139595041928058, + "learning_rate": 8.778747871771292e-05, + "loss": 2.8796, + "step": 14718 + }, + { + "epoch": 0.9137128313365199, + "grad_norm": 0.171960576384305, + "learning_rate": 8.778511357376667e-05, + "loss": 2.9323, + "step": 14719 + }, + { + "epoch": 0.9137749084362778, + "grad_norm": 0.21193365366861672, + "learning_rate": 8.778274823268628e-05, + "loss": 2.9196, + "step": 14720 + }, + { + "epoch": 0.9138369855360358, + "grad_norm": 0.18002079032332655, + "learning_rate": 8.778038269448414e-05, + "loss": 2.9753, + "step": 14721 + }, + { + "epoch": 0.9138990626357937, + "grad_norm": 0.18376968962737247, + "learning_rate": 8.777801695917258e-05, + "loss": 2.9292, + "step": 14722 + }, + { + "epoch": 0.9139611397355516, + "grad_norm": 0.16622758125322523, + "learning_rate": 8.777565102676392e-05, + "loss": 3.0071, + "step": 14723 + }, + { + "epoch": 0.9140232168353094, + "grad_norm": 0.17811029105775386, + "learning_rate": 8.777328489727053e-05, + "loss": 2.9364, + "step": 14724 + }, + { + "epoch": 0.9140852939350673, + "grad_norm": 0.22816429681542238, + "learning_rate": 8.777091857070472e-05, + "loss": 3.0355, + "step": 14725 + }, + { + "epoch": 0.9141473710348252, + "grad_norm": 0.18429435809115938, + "learning_rate": 8.776855204707888e-05, + "loss": 2.8964, + "step": 14726 + }, + { + "epoch": 0.9142094481345832, + "grad_norm": 0.2577093380379321, + "learning_rate": 8.776618532640533e-05, + "loss": 2.9886, + "step": 14727 + }, + { + "epoch": 0.9142715252343411, + "grad_norm": 0.21513800843588624, + "learning_rate": 8.776381840869642e-05, + "loss": 3.1125, + "step": 14728 + }, + { + "epoch": 0.914333602334099, + "grad_norm": 0.1685089473612753, + "learning_rate": 8.776145129396451e-05, + "loss": 2.991, + "step": 14729 + }, + { + "epoch": 0.9143956794338568, + "grad_norm": 0.17719410956298068, + "learning_rate": 8.775908398222192e-05, + "loss": 3.0546, + "step": 14730 + }, + { + "epoch": 0.9144577565336147, + "grad_norm": 0.21471699530093136, + "learning_rate": 8.775671647348105e-05, + "loss": 3.0237, + "step": 14731 + }, + { + "epoch": 0.9145198336333726, + "grad_norm": 0.17378647980471842, + "learning_rate": 8.77543487677542e-05, + "loss": 3.0069, + "step": 14732 + }, + { + "epoch": 0.9145819107331306, + "grad_norm": 0.19763607706251718, + "learning_rate": 8.775198086505376e-05, + "loss": 3.0065, + "step": 14733 + }, + { + "epoch": 0.9146439878328885, + "grad_norm": 0.17653914356191844, + "learning_rate": 8.774961276539209e-05, + "loss": 3.0551, + "step": 14734 + }, + { + "epoch": 0.9147060649326464, + "grad_norm": 0.18965993977953635, + "learning_rate": 8.77472444687815e-05, + "loss": 3.0202, + "step": 14735 + }, + { + "epoch": 0.9147681420324042, + "grad_norm": 0.18310892737618967, + "learning_rate": 8.774487597523437e-05, + "loss": 3.0062, + "step": 14736 + }, + { + "epoch": 0.9148302191321621, + "grad_norm": 0.18070763758373773, + "learning_rate": 8.774250728476308e-05, + "loss": 2.9378, + "step": 14737 + }, + { + "epoch": 0.91489229623192, + "grad_norm": 0.17017917773708877, + "learning_rate": 8.774013839737995e-05, + "loss": 3.0667, + "step": 14738 + }, + { + "epoch": 0.914954373331678, + "grad_norm": 0.18132709803654062, + "learning_rate": 8.773776931309736e-05, + "loss": 2.9437, + "step": 14739 + }, + { + "epoch": 0.9150164504314359, + "grad_norm": 0.1629256528703733, + "learning_rate": 8.773540003192767e-05, + "loss": 2.9858, + "step": 14740 + }, + { + "epoch": 0.9150785275311938, + "grad_norm": 0.18240699298936713, + "learning_rate": 8.773303055388324e-05, + "loss": 2.9628, + "step": 14741 + }, + { + "epoch": 0.9151406046309516, + "grad_norm": 0.16908868318092876, + "learning_rate": 8.773066087897644e-05, + "loss": 3.0645, + "step": 14742 + }, + { + "epoch": 0.9152026817307095, + "grad_norm": 0.16998866292372103, + "learning_rate": 8.77282910072196e-05, + "loss": 2.9936, + "step": 14743 + }, + { + "epoch": 0.9152647588304674, + "grad_norm": 0.1678324578935813, + "learning_rate": 8.77259209386251e-05, + "loss": 3.052, + "step": 14744 + }, + { + "epoch": 0.9153268359302253, + "grad_norm": 0.20650145381769025, + "learning_rate": 8.772355067320535e-05, + "loss": 3.0413, + "step": 14745 + }, + { + "epoch": 0.9153889130299833, + "grad_norm": 0.17743730346101771, + "learning_rate": 8.772118021097265e-05, + "loss": 2.9896, + "step": 14746 + }, + { + "epoch": 0.9154509901297412, + "grad_norm": 0.18095666027905996, + "learning_rate": 8.771880955193939e-05, + "loss": 3.024, + "step": 14747 + }, + { + "epoch": 0.915513067229499, + "grad_norm": 0.15624325856924587, + "learning_rate": 8.771643869611796e-05, + "loss": 3.0051, + "step": 14748 + }, + { + "epoch": 0.9155751443292569, + "grad_norm": 0.19155797977932923, + "learning_rate": 8.77140676435207e-05, + "loss": 2.9719, + "step": 14749 + }, + { + "epoch": 0.9156372214290148, + "grad_norm": 0.14756239628010603, + "learning_rate": 8.771169639416002e-05, + "loss": 3.0258, + "step": 14750 + }, + { + "epoch": 0.9156992985287727, + "grad_norm": 0.16704639871058574, + "learning_rate": 8.770932494804824e-05, + "loss": 3.0081, + "step": 14751 + }, + { + "epoch": 0.9157613756285307, + "grad_norm": 0.2048928559262628, + "learning_rate": 8.770695330519777e-05, + "loss": 3.0079, + "step": 14752 + }, + { + "epoch": 0.9158234527282886, + "grad_norm": 0.15928216572566226, + "learning_rate": 8.770458146562096e-05, + "loss": 2.9769, + "step": 14753 + }, + { + "epoch": 0.9158855298280464, + "grad_norm": 0.15896344855549643, + "learning_rate": 8.770220942933021e-05, + "loss": 3.034, + "step": 14754 + }, + { + "epoch": 0.9159476069278043, + "grad_norm": 0.15939153721038563, + "learning_rate": 8.769983719633788e-05, + "loss": 2.9914, + "step": 14755 + }, + { + "epoch": 0.9160096840275622, + "grad_norm": 0.2029765118207986, + "learning_rate": 8.769746476665632e-05, + "loss": 3.0781, + "step": 14756 + }, + { + "epoch": 0.9160717611273201, + "grad_norm": 0.15899088843872455, + "learning_rate": 8.769509214029796e-05, + "loss": 2.9352, + "step": 14757 + }, + { + "epoch": 0.9161338382270781, + "grad_norm": 0.16987847179446847, + "learning_rate": 8.769271931727515e-05, + "loss": 2.9793, + "step": 14758 + }, + { + "epoch": 0.916195915326836, + "grad_norm": 0.20489661040269444, + "learning_rate": 8.769034629760026e-05, + "loss": 2.991, + "step": 14759 + }, + { + "epoch": 0.9162579924265938, + "grad_norm": 0.17242237731383372, + "learning_rate": 8.768797308128572e-05, + "loss": 2.9709, + "step": 14760 + }, + { + "epoch": 0.9163200695263517, + "grad_norm": 0.16633056052034262, + "learning_rate": 8.768559966834384e-05, + "loss": 3.0097, + "step": 14761 + }, + { + "epoch": 0.9163821466261096, + "grad_norm": 0.183385406128669, + "learning_rate": 8.768322605878705e-05, + "loss": 2.9717, + "step": 14762 + }, + { + "epoch": 0.9164442237258675, + "grad_norm": 0.14039303270715525, + "learning_rate": 8.768085225262772e-05, + "loss": 2.8984, + "step": 14763 + }, + { + "epoch": 0.9165063008256255, + "grad_norm": 0.17412249681695882, + "learning_rate": 8.767847824987823e-05, + "loss": 3.0255, + "step": 14764 + }, + { + "epoch": 0.9165683779253834, + "grad_norm": 0.18483447794098337, + "learning_rate": 8.767610405055098e-05, + "loss": 3.0626, + "step": 14765 + }, + { + "epoch": 0.9166304550251412, + "grad_norm": 0.16925627144508823, + "learning_rate": 8.767372965465835e-05, + "loss": 2.941, + "step": 14766 + }, + { + "epoch": 0.9166925321248991, + "grad_norm": 0.16023009621400106, + "learning_rate": 8.767135506221273e-05, + "loss": 2.9692, + "step": 14767 + }, + { + "epoch": 0.916754609224657, + "grad_norm": 0.17748853428169403, + "learning_rate": 8.76689802732265e-05, + "loss": 2.9741, + "step": 14768 + }, + { + "epoch": 0.9168166863244149, + "grad_norm": 0.1438727650220234, + "learning_rate": 8.766660528771204e-05, + "loss": 2.919, + "step": 14769 + }, + { + "epoch": 0.9168787634241728, + "grad_norm": 0.16542965192017517, + "learning_rate": 8.766423010568179e-05, + "loss": 2.9475, + "step": 14770 + }, + { + "epoch": 0.9169408405239308, + "grad_norm": 0.15827935936938717, + "learning_rate": 8.766185472714808e-05, + "loss": 2.9758, + "step": 14771 + }, + { + "epoch": 0.9170029176236886, + "grad_norm": 0.20425785049376843, + "learning_rate": 8.765947915212334e-05, + "loss": 3.0293, + "step": 14772 + }, + { + "epoch": 0.9170649947234465, + "grad_norm": 0.17857201184449986, + "learning_rate": 8.765710338061996e-05, + "loss": 3.0212, + "step": 14773 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.18624899681512394, + "learning_rate": 8.765472741265032e-05, + "loss": 2.9256, + "step": 14774 + }, + { + "epoch": 0.9171891489229623, + "grad_norm": 0.21260053661604827, + "learning_rate": 8.765235124822684e-05, + "loss": 2.9483, + "step": 14775 + }, + { + "epoch": 0.9172512260227202, + "grad_norm": 0.16592564668902968, + "learning_rate": 8.764997488736189e-05, + "loss": 2.9873, + "step": 14776 + }, + { + "epoch": 0.9173133031224782, + "grad_norm": 0.17860069418261781, + "learning_rate": 8.764759833006789e-05, + "loss": 2.9317, + "step": 14777 + }, + { + "epoch": 0.917375380222236, + "grad_norm": 0.16389921555092815, + "learning_rate": 8.764522157635723e-05, + "loss": 2.8824, + "step": 14778 + }, + { + "epoch": 0.9174374573219939, + "grad_norm": 0.1734015986082884, + "learning_rate": 8.76428446262423e-05, + "loss": 2.9356, + "step": 14779 + }, + { + "epoch": 0.9174995344217518, + "grad_norm": 0.2119246801010549, + "learning_rate": 8.764046747973554e-05, + "loss": 3.0653, + "step": 14780 + }, + { + "epoch": 0.9175616115215097, + "grad_norm": 0.21192229821862438, + "learning_rate": 8.76380901368493e-05, + "loss": 3.0396, + "step": 14781 + }, + { + "epoch": 0.9176236886212676, + "grad_norm": 0.23089025229626414, + "learning_rate": 8.7635712597596e-05, + "loss": 2.8817, + "step": 14782 + }, + { + "epoch": 0.9176857657210256, + "grad_norm": 0.1784722239951576, + "learning_rate": 8.763333486198805e-05, + "loss": 2.9725, + "step": 14783 + }, + { + "epoch": 0.9177478428207834, + "grad_norm": 0.18122111863194884, + "learning_rate": 8.763095693003788e-05, + "loss": 2.9745, + "step": 14784 + }, + { + "epoch": 0.9178099199205413, + "grad_norm": 0.19436438270838036, + "learning_rate": 8.762857880175786e-05, + "loss": 3.0688, + "step": 14785 + }, + { + "epoch": 0.9178719970202992, + "grad_norm": 0.19058709119419026, + "learning_rate": 8.76262004771604e-05, + "loss": 2.9883, + "step": 14786 + }, + { + "epoch": 0.9179340741200571, + "grad_norm": 0.22157367406411993, + "learning_rate": 8.762382195625794e-05, + "loss": 3.0154, + "step": 14787 + }, + { + "epoch": 0.917996151219815, + "grad_norm": 0.30694829485037944, + "learning_rate": 8.762144323906285e-05, + "loss": 2.9776, + "step": 14788 + }, + { + "epoch": 0.918058228319573, + "grad_norm": 0.1707821695734031, + "learning_rate": 8.761906432558756e-05, + "loss": 2.9026, + "step": 14789 + }, + { + "epoch": 0.9181203054193308, + "grad_norm": 0.1971426571096295, + "learning_rate": 8.761668521584447e-05, + "loss": 2.9775, + "step": 14790 + }, + { + "epoch": 0.9181823825190887, + "grad_norm": 0.19188909547207234, + "learning_rate": 8.761430590984603e-05, + "loss": 2.986, + "step": 14791 + }, + { + "epoch": 0.9182444596188466, + "grad_norm": 0.20786695233031113, + "learning_rate": 8.76119264076046e-05, + "loss": 2.9436, + "step": 14792 + }, + { + "epoch": 0.9183065367186045, + "grad_norm": 0.1677117266420274, + "learning_rate": 8.760954670913262e-05, + "loss": 2.9589, + "step": 14793 + }, + { + "epoch": 0.9183686138183624, + "grad_norm": 0.17490065726655088, + "learning_rate": 8.760716681444253e-05, + "loss": 2.8604, + "step": 14794 + }, + { + "epoch": 0.9184306909181204, + "grad_norm": 0.2164113250715731, + "learning_rate": 8.760478672354668e-05, + "loss": 2.9915, + "step": 14795 + }, + { + "epoch": 0.9184927680178782, + "grad_norm": 0.21081110911754464, + "learning_rate": 8.760240643645756e-05, + "loss": 2.8753, + "step": 14796 + }, + { + "epoch": 0.9185548451176361, + "grad_norm": 0.2194509345733305, + "learning_rate": 8.760002595318754e-05, + "loss": 3.1327, + "step": 14797 + }, + { + "epoch": 0.918616922217394, + "grad_norm": 0.18587956968627806, + "learning_rate": 8.759764527374907e-05, + "loss": 2.8779, + "step": 14798 + }, + { + "epoch": 0.9186789993171519, + "grad_norm": 0.1814566470117824, + "learning_rate": 8.759526439815455e-05, + "loss": 3.0032, + "step": 14799 + }, + { + "epoch": 0.9187410764169098, + "grad_norm": 0.17992833831945135, + "learning_rate": 8.759288332641642e-05, + "loss": 2.9446, + "step": 14800 + }, + { + "epoch": 0.9188031535166677, + "grad_norm": 0.20808235587877255, + "learning_rate": 8.759050205854709e-05, + "loss": 2.951, + "step": 14801 + }, + { + "epoch": 0.9188652306164256, + "grad_norm": 0.19357120585856097, + "learning_rate": 8.758812059455896e-05, + "loss": 2.9257, + "step": 14802 + }, + { + "epoch": 0.9189273077161835, + "grad_norm": 0.1755751514198289, + "learning_rate": 8.758573893446452e-05, + "loss": 3.0387, + "step": 14803 + }, + { + "epoch": 0.9189893848159414, + "grad_norm": 0.2784670125260472, + "learning_rate": 8.758335707827612e-05, + "loss": 3.0988, + "step": 14804 + }, + { + "epoch": 0.9190514619156993, + "grad_norm": 0.19463440484884775, + "learning_rate": 8.758097502600623e-05, + "loss": 3.0489, + "step": 14805 + }, + { + "epoch": 0.9191135390154572, + "grad_norm": 0.19585843081830645, + "learning_rate": 8.757859277766729e-05, + "loss": 2.9723, + "step": 14806 + }, + { + "epoch": 0.9191756161152151, + "grad_norm": 0.20860436908966468, + "learning_rate": 8.75762103332717e-05, + "loss": 3.0308, + "step": 14807 + }, + { + "epoch": 0.919237693214973, + "grad_norm": 0.18808186171127367, + "learning_rate": 8.757382769283189e-05, + "loss": 2.9896, + "step": 14808 + }, + { + "epoch": 0.9192997703147309, + "grad_norm": 0.197272621635015, + "learning_rate": 8.75714448563603e-05, + "loss": 2.9471, + "step": 14809 + }, + { + "epoch": 0.9193618474144888, + "grad_norm": 0.1973183910691518, + "learning_rate": 8.756906182386938e-05, + "loss": 3.0278, + "step": 14810 + }, + { + "epoch": 0.9194239245142467, + "grad_norm": 0.20965431357248682, + "learning_rate": 8.756667859537153e-05, + "loss": 2.9564, + "step": 14811 + }, + { + "epoch": 0.9194860016140046, + "grad_norm": 0.19397866652804605, + "learning_rate": 8.756429517087918e-05, + "loss": 2.914, + "step": 14812 + }, + { + "epoch": 0.9195480787137625, + "grad_norm": 0.19463354354976511, + "learning_rate": 8.75619115504048e-05, + "loss": 3.0044, + "step": 14813 + }, + { + "epoch": 0.9196101558135203, + "grad_norm": 0.18985717984957706, + "learning_rate": 8.755952773396083e-05, + "loss": 2.9388, + "step": 14814 + }, + { + "epoch": 0.9196722329132783, + "grad_norm": 0.18607644251262107, + "learning_rate": 8.755714372155965e-05, + "loss": 3.0394, + "step": 14815 + }, + { + "epoch": 0.9197343100130362, + "grad_norm": 0.1737426429758689, + "learning_rate": 8.755475951321376e-05, + "loss": 2.9049, + "step": 14816 + }, + { + "epoch": 0.9197963871127941, + "grad_norm": 0.19952802400064704, + "learning_rate": 8.755237510893556e-05, + "loss": 2.9054, + "step": 14817 + }, + { + "epoch": 0.919858464212552, + "grad_norm": 0.2739703844173322, + "learning_rate": 8.754999050873752e-05, + "loss": 2.9816, + "step": 14818 + }, + { + "epoch": 0.9199205413123099, + "grad_norm": 0.15307567298235963, + "learning_rate": 8.754760571263204e-05, + "loss": 2.932, + "step": 14819 + }, + { + "epoch": 0.9199826184120677, + "grad_norm": 0.1778873058138435, + "learning_rate": 8.754522072063161e-05, + "loss": 2.9822, + "step": 14820 + }, + { + "epoch": 0.9200446955118257, + "grad_norm": 0.18044223053851913, + "learning_rate": 8.754283553274863e-05, + "loss": 2.9888, + "step": 14821 + }, + { + "epoch": 0.9201067726115836, + "grad_norm": 0.2630289362927677, + "learning_rate": 8.754045014899557e-05, + "loss": 2.8799, + "step": 14822 + }, + { + "epoch": 0.9201688497113415, + "grad_norm": 0.1682318799909223, + "learning_rate": 8.753806456938486e-05, + "loss": 2.9785, + "step": 14823 + }, + { + "epoch": 0.9202309268110994, + "grad_norm": 0.16649155556970235, + "learning_rate": 8.753567879392896e-05, + "loss": 2.9933, + "step": 14824 + }, + { + "epoch": 0.9202930039108573, + "grad_norm": 0.21998000892659517, + "learning_rate": 8.753329282264031e-05, + "loss": 3.0118, + "step": 14825 + }, + { + "epoch": 0.9203550810106151, + "grad_norm": 0.18707923566781182, + "learning_rate": 8.753090665553137e-05, + "loss": 2.9603, + "step": 14826 + }, + { + "epoch": 0.9204171581103731, + "grad_norm": 0.2021970091622151, + "learning_rate": 8.752852029261457e-05, + "loss": 3.0091, + "step": 14827 + }, + { + "epoch": 0.920479235210131, + "grad_norm": 0.202073880511997, + "learning_rate": 8.752613373390237e-05, + "loss": 2.9626, + "step": 14828 + }, + { + "epoch": 0.9205413123098889, + "grad_norm": 0.20689889887921908, + "learning_rate": 8.752374697940721e-05, + "loss": 3.0262, + "step": 14829 + }, + { + "epoch": 0.9206033894096468, + "grad_norm": 0.18174043599798162, + "learning_rate": 8.752136002914157e-05, + "loss": 3.0474, + "step": 14830 + }, + { + "epoch": 0.9206654665094047, + "grad_norm": 0.1931593949358608, + "learning_rate": 8.751897288311788e-05, + "loss": 2.9356, + "step": 14831 + }, + { + "epoch": 0.9207275436091625, + "grad_norm": 0.20631798649599511, + "learning_rate": 8.751658554134861e-05, + "loss": 2.9831, + "step": 14832 + }, + { + "epoch": 0.9207896207089205, + "grad_norm": 0.20440708455386467, + "learning_rate": 8.751419800384619e-05, + "loss": 3.032, + "step": 14833 + }, + { + "epoch": 0.9208516978086784, + "grad_norm": 0.2121423818573529, + "learning_rate": 8.75118102706231e-05, + "loss": 3.0382, + "step": 14834 + }, + { + "epoch": 0.9209137749084363, + "grad_norm": 0.1848996367981182, + "learning_rate": 8.750942234169179e-05, + "loss": 2.9433, + "step": 14835 + }, + { + "epoch": 0.9209758520081942, + "grad_norm": 0.18797946265207663, + "learning_rate": 8.750703421706472e-05, + "loss": 3.0528, + "step": 14836 + }, + { + "epoch": 0.9210379291079521, + "grad_norm": 0.18872277578224908, + "learning_rate": 8.750464589675433e-05, + "loss": 2.861, + "step": 14837 + }, + { + "epoch": 0.9211000062077099, + "grad_norm": 0.2065175629269644, + "learning_rate": 8.750225738077311e-05, + "loss": 2.9867, + "step": 14838 + }, + { + "epoch": 0.9211620833074678, + "grad_norm": 0.22817007658062324, + "learning_rate": 8.749986866913352e-05, + "loss": 3.0648, + "step": 14839 + }, + { + "epoch": 0.9212241604072258, + "grad_norm": 0.17189991835379478, + "learning_rate": 8.7497479761848e-05, + "loss": 2.9359, + "step": 14840 + }, + { + "epoch": 0.9212862375069837, + "grad_norm": 0.2962810950317557, + "learning_rate": 8.749509065892903e-05, + "loss": 3.0608, + "step": 14841 + }, + { + "epoch": 0.9213483146067416, + "grad_norm": 0.18839072214645966, + "learning_rate": 8.749270136038907e-05, + "loss": 3.0046, + "step": 14842 + }, + { + "epoch": 0.9214103917064995, + "grad_norm": 0.2041076129947369, + "learning_rate": 8.749031186624059e-05, + "loss": 3.0732, + "step": 14843 + }, + { + "epoch": 0.9214724688062573, + "grad_norm": 0.24144212554966196, + "learning_rate": 8.748792217649604e-05, + "loss": 2.9905, + "step": 14844 + }, + { + "epoch": 0.9215345459060152, + "grad_norm": 0.2202391093010122, + "learning_rate": 8.74855322911679e-05, + "loss": 3.0698, + "step": 14845 + }, + { + "epoch": 0.9215966230057732, + "grad_norm": 0.17966170747821425, + "learning_rate": 8.748314221026864e-05, + "loss": 3.0008, + "step": 14846 + }, + { + "epoch": 0.9216587001055311, + "grad_norm": 0.21466197213290325, + "learning_rate": 8.748075193381073e-05, + "loss": 2.9689, + "step": 14847 + }, + { + "epoch": 0.921720777205289, + "grad_norm": 0.1647491108833454, + "learning_rate": 8.747836146180663e-05, + "loss": 3.0088, + "step": 14848 + }, + { + "epoch": 0.9217828543050469, + "grad_norm": 0.17840376757232565, + "learning_rate": 8.747597079426884e-05, + "loss": 2.9722, + "step": 14849 + }, + { + "epoch": 0.9218449314048047, + "grad_norm": 0.16448127569070922, + "learning_rate": 8.747357993120979e-05, + "loss": 3.0174, + "step": 14850 + }, + { + "epoch": 0.9219070085045626, + "grad_norm": 0.18370603617724915, + "learning_rate": 8.747118887264199e-05, + "loss": 2.9467, + "step": 14851 + }, + { + "epoch": 0.9219690856043206, + "grad_norm": 0.17833154565666792, + "learning_rate": 8.74687976185779e-05, + "loss": 2.9448, + "step": 14852 + }, + { + "epoch": 0.9220311627040785, + "grad_norm": 0.1760272159561138, + "learning_rate": 8.746640616903e-05, + "loss": 3.0765, + "step": 14853 + }, + { + "epoch": 0.9220932398038364, + "grad_norm": 0.18730014839281048, + "learning_rate": 8.746401452401077e-05, + "loss": 2.9978, + "step": 14854 + }, + { + "epoch": 0.9221553169035943, + "grad_norm": 0.18185839365820497, + "learning_rate": 8.746162268353266e-05, + "loss": 3.0342, + "step": 14855 + }, + { + "epoch": 0.9222173940033521, + "grad_norm": 0.17715805010807123, + "learning_rate": 8.745923064760818e-05, + "loss": 2.995, + "step": 14856 + }, + { + "epoch": 0.92227947110311, + "grad_norm": 0.190596688231815, + "learning_rate": 8.745683841624981e-05, + "loss": 2.918, + "step": 14857 + }, + { + "epoch": 0.922341548202868, + "grad_norm": 0.18805984522073071, + "learning_rate": 8.745444598947001e-05, + "loss": 2.9736, + "step": 14858 + }, + { + "epoch": 0.9224036253026259, + "grad_norm": 0.20709451572466034, + "learning_rate": 8.745205336728127e-05, + "loss": 2.9797, + "step": 14859 + }, + { + "epoch": 0.9224657024023838, + "grad_norm": 0.17898614465445575, + "learning_rate": 8.744966054969607e-05, + "loss": 3.0734, + "step": 14860 + }, + { + "epoch": 0.9225277795021417, + "grad_norm": 0.16683431044463745, + "learning_rate": 8.744726753672692e-05, + "loss": 2.9372, + "step": 14861 + }, + { + "epoch": 0.9225898566018995, + "grad_norm": 0.2289955365373781, + "learning_rate": 8.744487432838629e-05, + "loss": 3.0502, + "step": 14862 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.1754657828588808, + "learning_rate": 8.744248092468664e-05, + "loss": 2.9674, + "step": 14863 + }, + { + "epoch": 0.9227140108014154, + "grad_norm": 0.1740627366463304, + "learning_rate": 8.744008732564048e-05, + "loss": 2.9859, + "step": 14864 + }, + { + "epoch": 0.9227760879011733, + "grad_norm": 0.16486832290928727, + "learning_rate": 8.74376935312603e-05, + "loss": 3.0111, + "step": 14865 + }, + { + "epoch": 0.9228381650009312, + "grad_norm": 0.19336628179063334, + "learning_rate": 8.743529954155858e-05, + "loss": 3.0803, + "step": 14866 + }, + { + "epoch": 0.9229002421006891, + "grad_norm": 0.16307415034476155, + "learning_rate": 8.743290535654783e-05, + "loss": 2.9159, + "step": 14867 + }, + { + "epoch": 0.9229623192004469, + "grad_norm": 0.18686081298190613, + "learning_rate": 8.743051097624051e-05, + "loss": 2.949, + "step": 14868 + }, + { + "epoch": 0.9230243963002048, + "grad_norm": 0.23446648719485796, + "learning_rate": 8.742811640064912e-05, + "loss": 3.0743, + "step": 14869 + }, + { + "epoch": 0.9230864733999627, + "grad_norm": 0.16522401118862123, + "learning_rate": 8.742572162978617e-05, + "loss": 2.9995, + "step": 14870 + }, + { + "epoch": 0.9231485504997207, + "grad_norm": 0.15490376954846805, + "learning_rate": 8.742332666366416e-05, + "loss": 2.9936, + "step": 14871 + }, + { + "epoch": 0.9232106275994786, + "grad_norm": 0.18810920294151673, + "learning_rate": 8.742093150229556e-05, + "loss": 3.0161, + "step": 14872 + }, + { + "epoch": 0.9232727046992365, + "grad_norm": 0.1853575593965078, + "learning_rate": 8.741853614569286e-05, + "loss": 3.016, + "step": 14873 + }, + { + "epoch": 0.9233347817989943, + "grad_norm": 0.1767943855511569, + "learning_rate": 8.741614059386858e-05, + "loss": 3.0359, + "step": 14874 + }, + { + "epoch": 0.9233968588987522, + "grad_norm": 0.1860701860904004, + "learning_rate": 8.741374484683522e-05, + "loss": 2.9626, + "step": 14875 + }, + { + "epoch": 0.9234589359985101, + "grad_norm": 0.16136458518322305, + "learning_rate": 8.741134890460527e-05, + "loss": 2.9489, + "step": 14876 + }, + { + "epoch": 0.9235210130982681, + "grad_norm": 0.17096405949239374, + "learning_rate": 8.740895276719123e-05, + "loss": 2.9879, + "step": 14877 + }, + { + "epoch": 0.923583090198026, + "grad_norm": 0.16035535113264898, + "learning_rate": 8.74065564346056e-05, + "loss": 3.0231, + "step": 14878 + }, + { + "epoch": 0.9236451672977839, + "grad_norm": 0.15949876084563847, + "learning_rate": 8.74041599068609e-05, + "loss": 2.9303, + "step": 14879 + }, + { + "epoch": 0.9237072443975417, + "grad_norm": 0.2101071935072218, + "learning_rate": 8.740176318396959e-05, + "loss": 3.0112, + "step": 14880 + }, + { + "epoch": 0.9237693214972996, + "grad_norm": 0.15884174561535194, + "learning_rate": 8.739936626594423e-05, + "loss": 2.9782, + "step": 14881 + }, + { + "epoch": 0.9238313985970575, + "grad_norm": 0.16096441543836498, + "learning_rate": 8.739696915279727e-05, + "loss": 2.994, + "step": 14882 + }, + { + "epoch": 0.9238934756968155, + "grad_norm": 0.16933904092798185, + "learning_rate": 8.739457184454127e-05, + "loss": 3.0079, + "step": 14883 + }, + { + "epoch": 0.9239555527965734, + "grad_norm": 0.16470038634124926, + "learning_rate": 8.739217434118868e-05, + "loss": 2.9947, + "step": 14884 + }, + { + "epoch": 0.9240176298963313, + "grad_norm": 0.2117717106587228, + "learning_rate": 8.738977664275205e-05, + "loss": 2.9746, + "step": 14885 + }, + { + "epoch": 0.9240797069960891, + "grad_norm": 0.19371772081914332, + "learning_rate": 8.73873787492439e-05, + "loss": 2.8741, + "step": 14886 + }, + { + "epoch": 0.924141784095847, + "grad_norm": 0.18950450829638774, + "learning_rate": 8.73849806606767e-05, + "loss": 3.0243, + "step": 14887 + }, + { + "epoch": 0.9242038611956049, + "grad_norm": 0.18088788782048784, + "learning_rate": 8.738258237706298e-05, + "loss": 3.0388, + "step": 14888 + }, + { + "epoch": 0.9242659382953629, + "grad_norm": 0.21211687394404505, + "learning_rate": 8.738018389841527e-05, + "loss": 2.8576, + "step": 14889 + }, + { + "epoch": 0.9243280153951208, + "grad_norm": 0.19638079307565884, + "learning_rate": 8.737778522474605e-05, + "loss": 2.9751, + "step": 14890 + }, + { + "epoch": 0.9243900924948787, + "grad_norm": 0.17678491374105654, + "learning_rate": 8.737538635606785e-05, + "loss": 2.9818, + "step": 14891 + }, + { + "epoch": 0.9244521695946365, + "grad_norm": 0.16867833735148408, + "learning_rate": 8.737298729239319e-05, + "loss": 2.9753, + "step": 14892 + }, + { + "epoch": 0.9245142466943944, + "grad_norm": 0.19849285007098516, + "learning_rate": 8.737058803373456e-05, + "loss": 3.0345, + "step": 14893 + }, + { + "epoch": 0.9245763237941523, + "grad_norm": 0.20313489413006333, + "learning_rate": 8.736818858010454e-05, + "loss": 3.0336, + "step": 14894 + }, + { + "epoch": 0.9246384008939103, + "grad_norm": 0.1770629672418981, + "learning_rate": 8.736578893151559e-05, + "loss": 2.9123, + "step": 14895 + }, + { + "epoch": 0.9247004779936682, + "grad_norm": 0.14985403096249328, + "learning_rate": 8.736338908798026e-05, + "loss": 2.9869, + "step": 14896 + }, + { + "epoch": 0.9247625550934261, + "grad_norm": 0.22429709079403862, + "learning_rate": 8.736098904951105e-05, + "loss": 2.9473, + "step": 14897 + }, + { + "epoch": 0.9248246321931839, + "grad_norm": 0.24330810508903722, + "learning_rate": 8.735858881612048e-05, + "loss": 2.9829, + "step": 14898 + }, + { + "epoch": 0.9248867092929418, + "grad_norm": 0.18457310790513426, + "learning_rate": 8.73561883878211e-05, + "loss": 2.9451, + "step": 14899 + }, + { + "epoch": 0.9249487863926997, + "grad_norm": 0.1815019611503587, + "learning_rate": 8.73537877646254e-05, + "loss": 3.0032, + "step": 14900 + }, + { + "epoch": 0.9250108634924576, + "grad_norm": 0.18042066519742853, + "learning_rate": 8.735138694654592e-05, + "loss": 3.0143, + "step": 14901 + }, + { + "epoch": 0.9250729405922156, + "grad_norm": 0.2430698411391537, + "learning_rate": 8.73489859335952e-05, + "loss": 3.0547, + "step": 14902 + }, + { + "epoch": 0.9251350176919735, + "grad_norm": 0.16391663652063249, + "learning_rate": 8.734658472578576e-05, + "loss": 2.9551, + "step": 14903 + }, + { + "epoch": 0.9251970947917313, + "grad_norm": 0.20192929612440333, + "learning_rate": 8.73441833231301e-05, + "loss": 2.9314, + "step": 14904 + }, + { + "epoch": 0.9252591718914892, + "grad_norm": 0.18290809674995104, + "learning_rate": 8.734178172564079e-05, + "loss": 2.9204, + "step": 14905 + }, + { + "epoch": 0.9253212489912471, + "grad_norm": 0.16480190686151602, + "learning_rate": 8.733937993333033e-05, + "loss": 3.007, + "step": 14906 + }, + { + "epoch": 0.925383326091005, + "grad_norm": 0.1919947064254852, + "learning_rate": 8.733697794621127e-05, + "loss": 3.0402, + "step": 14907 + }, + { + "epoch": 0.925445403190763, + "grad_norm": 0.16875970710986046, + "learning_rate": 8.733457576429612e-05, + "loss": 3.008, + "step": 14908 + }, + { + "epoch": 0.9255074802905209, + "grad_norm": 0.21352339681919688, + "learning_rate": 8.733217338759742e-05, + "loss": 2.8803, + "step": 14909 + }, + { + "epoch": 0.9255695573902787, + "grad_norm": 0.1824577676068555, + "learning_rate": 8.732977081612773e-05, + "loss": 2.9763, + "step": 14910 + }, + { + "epoch": 0.9256316344900366, + "grad_norm": 0.1768073860749019, + "learning_rate": 8.732736804989954e-05, + "loss": 3.0335, + "step": 14911 + }, + { + "epoch": 0.9256937115897945, + "grad_norm": 0.1955066193078336, + "learning_rate": 8.732496508892542e-05, + "loss": 2.9685, + "step": 14912 + }, + { + "epoch": 0.9257557886895524, + "grad_norm": 0.17734668704050144, + "learning_rate": 8.732256193321791e-05, + "loss": 2.921, + "step": 14913 + }, + { + "epoch": 0.9258178657893104, + "grad_norm": 0.21460914807607945, + "learning_rate": 8.732015858278953e-05, + "loss": 3.0241, + "step": 14914 + }, + { + "epoch": 0.9258799428890683, + "grad_norm": 0.17191413953935855, + "learning_rate": 8.73177550376528e-05, + "loss": 2.9078, + "step": 14915 + }, + { + "epoch": 0.9259420199888261, + "grad_norm": 0.17471703404235692, + "learning_rate": 8.731535129782029e-05, + "loss": 2.9917, + "step": 14916 + }, + { + "epoch": 0.926004097088584, + "grad_norm": 0.16831796406780594, + "learning_rate": 8.731294736330454e-05, + "loss": 2.9148, + "step": 14917 + }, + { + "epoch": 0.9260661741883419, + "grad_norm": 0.17646060108854145, + "learning_rate": 8.731054323411809e-05, + "loss": 2.9462, + "step": 14918 + }, + { + "epoch": 0.9261282512880998, + "grad_norm": 0.17408537313753897, + "learning_rate": 8.730813891027347e-05, + "loss": 2.9611, + "step": 14919 + }, + { + "epoch": 0.9261903283878578, + "grad_norm": 0.18879180200579368, + "learning_rate": 8.730573439178324e-05, + "loss": 2.9406, + "step": 14920 + }, + { + "epoch": 0.9262524054876157, + "grad_norm": 0.18988195042825493, + "learning_rate": 8.730332967865993e-05, + "loss": 3.0365, + "step": 14921 + }, + { + "epoch": 0.9263144825873735, + "grad_norm": 0.21137540009955333, + "learning_rate": 8.730092477091609e-05, + "loss": 2.9626, + "step": 14922 + }, + { + "epoch": 0.9263765596871314, + "grad_norm": 0.19636673600384158, + "learning_rate": 8.729851966856429e-05, + "loss": 3.0155, + "step": 14923 + }, + { + "epoch": 0.9264386367868893, + "grad_norm": 0.21257962249822832, + "learning_rate": 8.729611437161705e-05, + "loss": 3.0276, + "step": 14924 + }, + { + "epoch": 0.9265007138866472, + "grad_norm": 0.18557410793223647, + "learning_rate": 8.729370888008692e-05, + "loss": 3.0636, + "step": 14925 + }, + { + "epoch": 0.9265627909864051, + "grad_norm": 0.18501599307136787, + "learning_rate": 8.729130319398647e-05, + "loss": 3.0497, + "step": 14926 + }, + { + "epoch": 0.9266248680861631, + "grad_norm": 0.24575434091834028, + "learning_rate": 8.728889731332821e-05, + "loss": 3.0719, + "step": 14927 + }, + { + "epoch": 0.9266869451859209, + "grad_norm": 0.19252081825032824, + "learning_rate": 8.728649123812474e-05, + "loss": 2.9382, + "step": 14928 + }, + { + "epoch": 0.9267490222856788, + "grad_norm": 0.18518664879077018, + "learning_rate": 8.728408496838861e-05, + "loss": 2.9271, + "step": 14929 + }, + { + "epoch": 0.9268110993854367, + "grad_norm": 0.24441970895491352, + "learning_rate": 8.728167850413233e-05, + "loss": 2.9382, + "step": 14930 + }, + { + "epoch": 0.9268731764851946, + "grad_norm": 0.21170533795825724, + "learning_rate": 8.727927184536851e-05, + "loss": 2.9831, + "step": 14931 + }, + { + "epoch": 0.9269352535849525, + "grad_norm": 0.1908854190953052, + "learning_rate": 8.727686499210965e-05, + "loss": 2.8831, + "step": 14932 + }, + { + "epoch": 0.9269973306847105, + "grad_norm": 0.17896304041282707, + "learning_rate": 8.727445794436834e-05, + "loss": 2.9249, + "step": 14933 + }, + { + "epoch": 0.9270594077844683, + "grad_norm": 0.1940523695336395, + "learning_rate": 8.727205070215715e-05, + "loss": 2.9141, + "step": 14934 + }, + { + "epoch": 0.9271214848842262, + "grad_norm": 0.20733552741079841, + "learning_rate": 8.726964326548863e-05, + "loss": 3.0429, + "step": 14935 + }, + { + "epoch": 0.9271835619839841, + "grad_norm": 0.2098446723707525, + "learning_rate": 8.726723563437531e-05, + "loss": 2.9268, + "step": 14936 + }, + { + "epoch": 0.927245639083742, + "grad_norm": 0.21672936840488624, + "learning_rate": 8.726482780882978e-05, + "loss": 2.9586, + "step": 14937 + }, + { + "epoch": 0.9273077161834999, + "grad_norm": 0.2535078738163518, + "learning_rate": 8.72624197888646e-05, + "loss": 3.0093, + "step": 14938 + }, + { + "epoch": 0.9273697932832579, + "grad_norm": 0.2031068335259027, + "learning_rate": 8.726001157449234e-05, + "loss": 2.965, + "step": 14939 + }, + { + "epoch": 0.9274318703830157, + "grad_norm": 0.20342110179552292, + "learning_rate": 8.725760316572554e-05, + "loss": 2.9504, + "step": 14940 + }, + { + "epoch": 0.9274939474827736, + "grad_norm": 0.18954490129724505, + "learning_rate": 8.725519456257679e-05, + "loss": 2.9325, + "step": 14941 + }, + { + "epoch": 0.9275560245825315, + "grad_norm": 0.20737219572959212, + "learning_rate": 8.725278576505865e-05, + "loss": 2.9546, + "step": 14942 + }, + { + "epoch": 0.9276181016822894, + "grad_norm": 0.17407026998651978, + "learning_rate": 8.725037677318366e-05, + "loss": 3.0245, + "step": 14943 + }, + { + "epoch": 0.9276801787820473, + "grad_norm": 0.19363994664510703, + "learning_rate": 8.724796758696444e-05, + "loss": 3.0087, + "step": 14944 + }, + { + "epoch": 0.9277422558818053, + "grad_norm": 0.23509754227491841, + "learning_rate": 8.724555820641352e-05, + "loss": 3.0291, + "step": 14945 + }, + { + "epoch": 0.9278043329815631, + "grad_norm": 0.17277069139637674, + "learning_rate": 8.724314863154347e-05, + "loss": 2.908, + "step": 14946 + }, + { + "epoch": 0.927866410081321, + "grad_norm": 0.18865774271540195, + "learning_rate": 8.724073886236688e-05, + "loss": 2.9358, + "step": 14947 + }, + { + "epoch": 0.9279284871810789, + "grad_norm": 0.18500874434244874, + "learning_rate": 8.723832889889631e-05, + "loss": 3.0333, + "step": 14948 + }, + { + "epoch": 0.9279905642808368, + "grad_norm": 0.16107077126660382, + "learning_rate": 8.723591874114435e-05, + "loss": 2.8831, + "step": 14949 + }, + { + "epoch": 0.9280526413805947, + "grad_norm": 0.19517631551302392, + "learning_rate": 8.723350838912356e-05, + "loss": 3.0115, + "step": 14950 + }, + { + "epoch": 0.9281147184803527, + "grad_norm": 0.16569985484053273, + "learning_rate": 8.723109784284651e-05, + "loss": 2.9594, + "step": 14951 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.18886057616282886, + "learning_rate": 8.722868710232579e-05, + "loss": 2.824, + "step": 14952 + }, + { + "epoch": 0.9282388726798684, + "grad_norm": 0.17225512820764, + "learning_rate": 8.722627616757398e-05, + "loss": 2.8563, + "step": 14953 + }, + { + "epoch": 0.9283009497796263, + "grad_norm": 0.17638955824569377, + "learning_rate": 8.722386503860363e-05, + "loss": 2.944, + "step": 14954 + }, + { + "epoch": 0.9283630268793842, + "grad_norm": 0.17920725334096013, + "learning_rate": 8.722145371542735e-05, + "loss": 2.9885, + "step": 14955 + }, + { + "epoch": 0.9284251039791421, + "grad_norm": 0.16959650853614347, + "learning_rate": 8.721904219805772e-05, + "loss": 2.9697, + "step": 14956 + }, + { + "epoch": 0.9284871810789, + "grad_norm": 0.15868919371016188, + "learning_rate": 8.72166304865073e-05, + "loss": 2.8947, + "step": 14957 + }, + { + "epoch": 0.9285492581786579, + "grad_norm": 0.26454532205068293, + "learning_rate": 8.721421858078869e-05, + "loss": 2.9348, + "step": 14958 + }, + { + "epoch": 0.9286113352784158, + "grad_norm": 0.203569384629761, + "learning_rate": 8.721180648091447e-05, + "loss": 3.0159, + "step": 14959 + }, + { + "epoch": 0.9286734123781737, + "grad_norm": 0.2023151926192026, + "learning_rate": 8.72093941868972e-05, + "loss": 3.0219, + "step": 14960 + }, + { + "epoch": 0.9287354894779316, + "grad_norm": 0.17424620489006237, + "learning_rate": 8.720698169874952e-05, + "loss": 2.9463, + "step": 14961 + }, + { + "epoch": 0.9287975665776895, + "grad_norm": 0.19268384124865556, + "learning_rate": 8.720456901648396e-05, + "loss": 2.9536, + "step": 14962 + }, + { + "epoch": 0.9288596436774474, + "grad_norm": 0.19612469599951285, + "learning_rate": 8.720215614011315e-05, + "loss": 2.9871, + "step": 14963 + }, + { + "epoch": 0.9289217207772053, + "grad_norm": 0.149266531668886, + "learning_rate": 8.719974306964964e-05, + "loss": 2.9674, + "step": 14964 + }, + { + "epoch": 0.9289837978769632, + "grad_norm": 0.16729028764059756, + "learning_rate": 8.719732980510605e-05, + "loss": 2.9834, + "step": 14965 + }, + { + "epoch": 0.9290458749767211, + "grad_norm": 0.1605471341857303, + "learning_rate": 8.719491634649497e-05, + "loss": 3.0733, + "step": 14966 + }, + { + "epoch": 0.929107952076479, + "grad_norm": 0.1587613934460849, + "learning_rate": 8.719250269382897e-05, + "loss": 2.9695, + "step": 14967 + }, + { + "epoch": 0.9291700291762369, + "grad_norm": 0.16412487433792614, + "learning_rate": 8.719008884712066e-05, + "loss": 3.0253, + "step": 14968 + }, + { + "epoch": 0.9292321062759948, + "grad_norm": 0.1624773227475581, + "learning_rate": 8.718767480638261e-05, + "loss": 2.9953, + "step": 14969 + }, + { + "epoch": 0.9292941833757526, + "grad_norm": 0.17040172326500735, + "learning_rate": 8.718526057162745e-05, + "loss": 2.8854, + "step": 14970 + }, + { + "epoch": 0.9293562604755106, + "grad_norm": 0.16115973276617604, + "learning_rate": 8.718284614286777e-05, + "loss": 2.9783, + "step": 14971 + }, + { + "epoch": 0.9294183375752685, + "grad_norm": 0.16839652702576913, + "learning_rate": 8.718043152011615e-05, + "loss": 2.9597, + "step": 14972 + }, + { + "epoch": 0.9294804146750264, + "grad_norm": 0.1718167780752561, + "learning_rate": 8.717801670338517e-05, + "loss": 3.0308, + "step": 14973 + }, + { + "epoch": 0.9295424917747843, + "grad_norm": 0.16892525107284762, + "learning_rate": 8.717560169268747e-05, + "loss": 2.9777, + "step": 14974 + }, + { + "epoch": 0.9296045688745422, + "grad_norm": 0.18524014065383773, + "learning_rate": 8.717318648803563e-05, + "loss": 2.9948, + "step": 14975 + }, + { + "epoch": 0.9296666459743, + "grad_norm": 0.1656531359508125, + "learning_rate": 8.717077108944225e-05, + "loss": 2.9055, + "step": 14976 + }, + { + "epoch": 0.929728723074058, + "grad_norm": 0.15225352770143208, + "learning_rate": 8.716835549691992e-05, + "loss": 3.0029, + "step": 14977 + }, + { + "epoch": 0.9297908001738159, + "grad_norm": 0.2138879495933132, + "learning_rate": 8.716593971048129e-05, + "loss": 2.9714, + "step": 14978 + }, + { + "epoch": 0.9298528772735738, + "grad_norm": 0.16013114430039296, + "learning_rate": 8.716352373013891e-05, + "loss": 2.9039, + "step": 14979 + }, + { + "epoch": 0.9299149543733317, + "grad_norm": 0.15959007509667886, + "learning_rate": 8.716110755590541e-05, + "loss": 2.9527, + "step": 14980 + }, + { + "epoch": 0.9299770314730896, + "grad_norm": 0.17330495456368367, + "learning_rate": 8.715869118779338e-05, + "loss": 2.9535, + "step": 14981 + }, + { + "epoch": 0.9300391085728474, + "grad_norm": 0.1682345372239354, + "learning_rate": 8.715627462581543e-05, + "loss": 2.9313, + "step": 14982 + }, + { + "epoch": 0.9301011856726054, + "grad_norm": 0.16815316256864932, + "learning_rate": 8.715385786998421e-05, + "loss": 2.9192, + "step": 14983 + }, + { + "epoch": 0.9301632627723633, + "grad_norm": 0.16245406056235895, + "learning_rate": 8.715144092031227e-05, + "loss": 2.9444, + "step": 14984 + }, + { + "epoch": 0.9302253398721212, + "grad_norm": 0.167790884280964, + "learning_rate": 8.714902377681224e-05, + "loss": 2.9316, + "step": 14985 + }, + { + "epoch": 0.9302874169718791, + "grad_norm": 0.1679508621094607, + "learning_rate": 8.714660643949675e-05, + "loss": 2.9585, + "step": 14986 + }, + { + "epoch": 0.9303494940716369, + "grad_norm": 0.165280120911026, + "learning_rate": 8.714418890837838e-05, + "loss": 2.9764, + "step": 14987 + }, + { + "epoch": 0.9304115711713948, + "grad_norm": 0.15334698876271774, + "learning_rate": 8.714177118346977e-05, + "loss": 2.9848, + "step": 14988 + }, + { + "epoch": 0.9304736482711528, + "grad_norm": 0.15760104676978554, + "learning_rate": 8.713935326478352e-05, + "loss": 3.0249, + "step": 14989 + }, + { + "epoch": 0.9305357253709107, + "grad_norm": 0.1614328480931593, + "learning_rate": 8.713693515233224e-05, + "loss": 3.0066, + "step": 14990 + }, + { + "epoch": 0.9305978024706686, + "grad_norm": 0.15154063224343287, + "learning_rate": 8.713451684612857e-05, + "loss": 2.93, + "step": 14991 + }, + { + "epoch": 0.9306598795704265, + "grad_norm": 0.17553008997946273, + "learning_rate": 8.71320983461851e-05, + "loss": 3.0315, + "step": 14992 + }, + { + "epoch": 0.9307219566701843, + "grad_norm": 0.17210792342624487, + "learning_rate": 8.712967965251446e-05, + "loss": 2.9522, + "step": 14993 + }, + { + "epoch": 0.9307840337699422, + "grad_norm": 0.1655286265635787, + "learning_rate": 8.712726076512926e-05, + "loss": 2.9989, + "step": 14994 + }, + { + "epoch": 0.9308461108697001, + "grad_norm": 0.15080181485599897, + "learning_rate": 8.712484168404215e-05, + "loss": 2.9695, + "step": 14995 + }, + { + "epoch": 0.9309081879694581, + "grad_norm": 0.18419523325226433, + "learning_rate": 8.712242240926571e-05, + "loss": 3.0832, + "step": 14996 + }, + { + "epoch": 0.930970265069216, + "grad_norm": 0.2134210994389942, + "learning_rate": 8.71200029408126e-05, + "loss": 3.0294, + "step": 14997 + }, + { + "epoch": 0.9310323421689739, + "grad_norm": 0.17225380898085846, + "learning_rate": 8.711758327869542e-05, + "loss": 2.9067, + "step": 14998 + }, + { + "epoch": 0.9310944192687317, + "grad_norm": 0.171200169819356, + "learning_rate": 8.711516342292677e-05, + "loss": 2.9323, + "step": 14999 + }, + { + "epoch": 0.9311564963684896, + "grad_norm": 0.21613321018509893, + "learning_rate": 8.711274337351932e-05, + "loss": 2.9527, + "step": 15000 + }, + { + "epoch": 0.9312185734682475, + "grad_norm": 0.17159667020475913, + "learning_rate": 8.711032313048568e-05, + "loss": 3.0067, + "step": 15001 + }, + { + "epoch": 0.9312806505680055, + "grad_norm": 0.15587594438976482, + "learning_rate": 8.710790269383848e-05, + "loss": 2.909, + "step": 15002 + }, + { + "epoch": 0.9313427276677634, + "grad_norm": 0.16028461894248655, + "learning_rate": 8.710548206359034e-05, + "loss": 2.8218, + "step": 15003 + }, + { + "epoch": 0.9314048047675213, + "grad_norm": 0.16214803441529888, + "learning_rate": 8.710306123975389e-05, + "loss": 3.0646, + "step": 15004 + }, + { + "epoch": 0.9314668818672791, + "grad_norm": 0.17591831903093477, + "learning_rate": 8.710064022234176e-05, + "loss": 2.9681, + "step": 15005 + }, + { + "epoch": 0.931528958967037, + "grad_norm": 0.18241766367264145, + "learning_rate": 8.709821901136659e-05, + "loss": 2.9707, + "step": 15006 + }, + { + "epoch": 0.9315910360667949, + "grad_norm": 0.16590755535495283, + "learning_rate": 8.709579760684099e-05, + "loss": 2.9124, + "step": 15007 + }, + { + "epoch": 0.9316531131665529, + "grad_norm": 0.1858329174296539, + "learning_rate": 8.709337600877762e-05, + "loss": 2.9846, + "step": 15008 + }, + { + "epoch": 0.9317151902663108, + "grad_norm": 0.17094205027352355, + "learning_rate": 8.709095421718912e-05, + "loss": 3.0544, + "step": 15009 + }, + { + "epoch": 0.9317772673660687, + "grad_norm": 0.14952766861607866, + "learning_rate": 8.708853223208808e-05, + "loss": 2.968, + "step": 15010 + }, + { + "epoch": 0.9318393444658265, + "grad_norm": 0.23621089644484544, + "learning_rate": 8.708611005348717e-05, + "loss": 3.0152, + "step": 15011 + }, + { + "epoch": 0.9319014215655844, + "grad_norm": 0.15891105709288036, + "learning_rate": 8.708368768139903e-05, + "loss": 3.0741, + "step": 15012 + }, + { + "epoch": 0.9319634986653423, + "grad_norm": 0.20199443645475734, + "learning_rate": 8.708126511583627e-05, + "loss": 2.9332, + "step": 15013 + }, + { + "epoch": 0.9320255757651003, + "grad_norm": 0.1722094089447314, + "learning_rate": 8.707884235681158e-05, + "loss": 3.0356, + "step": 15014 + }, + { + "epoch": 0.9320876528648582, + "grad_norm": 0.14782728703915235, + "learning_rate": 8.707641940433754e-05, + "loss": 2.9958, + "step": 15015 + }, + { + "epoch": 0.9321497299646161, + "grad_norm": 0.17931422563096808, + "learning_rate": 8.707399625842683e-05, + "loss": 3.1594, + "step": 15016 + }, + { + "epoch": 0.9322118070643739, + "grad_norm": 0.25148749388378067, + "learning_rate": 8.707157291909208e-05, + "loss": 2.9953, + "step": 15017 + }, + { + "epoch": 0.9322738841641318, + "grad_norm": 0.1987175152280298, + "learning_rate": 8.706914938634593e-05, + "loss": 3.0017, + "step": 15018 + }, + { + "epoch": 0.9323359612638897, + "grad_norm": 0.18591796507917036, + "learning_rate": 8.706672566020105e-05, + "loss": 3.0169, + "step": 15019 + }, + { + "epoch": 0.9323980383636477, + "grad_norm": 0.20515583927319705, + "learning_rate": 8.706430174067004e-05, + "loss": 3.0064, + "step": 15020 + }, + { + "epoch": 0.9324601154634056, + "grad_norm": 0.2477432104319785, + "learning_rate": 8.706187762776558e-05, + "loss": 2.9392, + "step": 15021 + }, + { + "epoch": 0.9325221925631635, + "grad_norm": 0.20695555400754975, + "learning_rate": 8.705945332150031e-05, + "loss": 2.918, + "step": 15022 + }, + { + "epoch": 0.9325842696629213, + "grad_norm": 0.16356886669911416, + "learning_rate": 8.705702882188687e-05, + "loss": 2.9198, + "step": 15023 + }, + { + "epoch": 0.9326463467626792, + "grad_norm": 0.1892410152764199, + "learning_rate": 8.705460412893791e-05, + "loss": 2.9891, + "step": 15024 + }, + { + "epoch": 0.9327084238624371, + "grad_norm": 0.20441100286099845, + "learning_rate": 8.705217924266609e-05, + "loss": 2.9976, + "step": 15025 + }, + { + "epoch": 0.932770500962195, + "grad_norm": 0.25705144275319014, + "learning_rate": 8.704975416308406e-05, + "loss": 2.9676, + "step": 15026 + }, + { + "epoch": 0.932832578061953, + "grad_norm": 0.20669891566463663, + "learning_rate": 8.704732889020445e-05, + "loss": 2.94, + "step": 15027 + }, + { + "epoch": 0.9328946551617109, + "grad_norm": 0.17749550246195608, + "learning_rate": 8.704490342403997e-05, + "loss": 2.9178, + "step": 15028 + }, + { + "epoch": 0.9329567322614687, + "grad_norm": 0.16670076696909378, + "learning_rate": 8.704247776460321e-05, + "loss": 3.0709, + "step": 15029 + }, + { + "epoch": 0.9330188093612266, + "grad_norm": 0.17262427154700835, + "learning_rate": 8.704005191190685e-05, + "loss": 2.8951, + "step": 15030 + }, + { + "epoch": 0.9330808864609845, + "grad_norm": 0.20052049789565793, + "learning_rate": 8.703762586596356e-05, + "loss": 2.9001, + "step": 15031 + }, + { + "epoch": 0.9331429635607424, + "grad_norm": 0.265187955791649, + "learning_rate": 8.703519962678596e-05, + "loss": 3.0198, + "step": 15032 + }, + { + "epoch": 0.9332050406605004, + "grad_norm": 0.18493643279120614, + "learning_rate": 8.703277319438676e-05, + "loss": 2.8631, + "step": 15033 + }, + { + "epoch": 0.9332671177602583, + "grad_norm": 0.19565089474855354, + "learning_rate": 8.703034656877857e-05, + "loss": 3.0367, + "step": 15034 + }, + { + "epoch": 0.9333291948600161, + "grad_norm": 0.23955255376359857, + "learning_rate": 8.702791974997409e-05, + "loss": 2.9332, + "step": 15035 + }, + { + "epoch": 0.933391271959774, + "grad_norm": 0.24230127584640643, + "learning_rate": 8.702549273798597e-05, + "loss": 2.9448, + "step": 15036 + }, + { + "epoch": 0.9334533490595319, + "grad_norm": 0.2192029096110305, + "learning_rate": 8.702306553282684e-05, + "loss": 2.9831, + "step": 15037 + }, + { + "epoch": 0.9335154261592898, + "grad_norm": 0.16677006789724172, + "learning_rate": 8.70206381345094e-05, + "loss": 2.9478, + "step": 15038 + }, + { + "epoch": 0.9335775032590478, + "grad_norm": 0.20604368605329654, + "learning_rate": 8.701821054304632e-05, + "loss": 3.0582, + "step": 15039 + }, + { + "epoch": 0.9336395803588057, + "grad_norm": 0.17634328952731215, + "learning_rate": 8.701578275845024e-05, + "loss": 2.9842, + "step": 15040 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.2853609813308058, + "learning_rate": 8.701335478073382e-05, + "loss": 3.072, + "step": 15041 + }, + { + "epoch": 0.9337637345583214, + "grad_norm": 0.24241160975564074, + "learning_rate": 8.701092660990976e-05, + "loss": 3.0045, + "step": 15042 + }, + { + "epoch": 0.9338258116580793, + "grad_norm": 0.2262031401797384, + "learning_rate": 8.700849824599071e-05, + "loss": 2.8497, + "step": 15043 + }, + { + "epoch": 0.9338878887578372, + "grad_norm": 0.2352647819237919, + "learning_rate": 8.700606968898933e-05, + "loss": 3.0131, + "step": 15044 + }, + { + "epoch": 0.9339499658575952, + "grad_norm": 0.2004929260964504, + "learning_rate": 8.70036409389183e-05, + "loss": 2.9815, + "step": 15045 + }, + { + "epoch": 0.9340120429573531, + "grad_norm": 0.21874628745062136, + "learning_rate": 8.700121199579031e-05, + "loss": 3.0836, + "step": 15046 + }, + { + "epoch": 0.9340741200571109, + "grad_norm": 0.19523465512588756, + "learning_rate": 8.6998782859618e-05, + "loss": 2.9808, + "step": 15047 + }, + { + "epoch": 0.9341361971568688, + "grad_norm": 0.18774587682509036, + "learning_rate": 8.699635353041407e-05, + "loss": 2.9727, + "step": 15048 + }, + { + "epoch": 0.9341982742566267, + "grad_norm": 0.17361384482350728, + "learning_rate": 8.699392400819118e-05, + "loss": 2.9268, + "step": 15049 + }, + { + "epoch": 0.9342603513563846, + "grad_norm": 0.17236666365376463, + "learning_rate": 8.6991494292962e-05, + "loss": 2.9755, + "step": 15050 + }, + { + "epoch": 0.9343224284561426, + "grad_norm": 0.24657221104169053, + "learning_rate": 8.698906438473922e-05, + "loss": 3.0763, + "step": 15051 + }, + { + "epoch": 0.9343845055559005, + "grad_norm": 0.16757137956359106, + "learning_rate": 8.698663428353551e-05, + "loss": 3.1154, + "step": 15052 + }, + { + "epoch": 0.9344465826556583, + "grad_norm": 0.18082274607753202, + "learning_rate": 8.698420398936353e-05, + "loss": 3.0099, + "step": 15053 + }, + { + "epoch": 0.9345086597554162, + "grad_norm": 0.1969683361572634, + "learning_rate": 8.6981773502236e-05, + "loss": 2.9631, + "step": 15054 + }, + { + "epoch": 0.9345707368551741, + "grad_norm": 0.18503671078901798, + "learning_rate": 8.697934282216558e-05, + "loss": 2.9739, + "step": 15055 + }, + { + "epoch": 0.934632813954932, + "grad_norm": 0.18084103963036072, + "learning_rate": 8.697691194916495e-05, + "loss": 2.9678, + "step": 15056 + }, + { + "epoch": 0.93469489105469, + "grad_norm": 0.18702170840840499, + "learning_rate": 8.69744808832468e-05, + "loss": 2.9988, + "step": 15057 + }, + { + "epoch": 0.9347569681544479, + "grad_norm": 0.18176118889793508, + "learning_rate": 8.69720496244238e-05, + "loss": 2.8832, + "step": 15058 + }, + { + "epoch": 0.9348190452542057, + "grad_norm": 0.16268806120797316, + "learning_rate": 8.696961817270863e-05, + "loss": 2.8741, + "step": 15059 + }, + { + "epoch": 0.9348811223539636, + "grad_norm": 0.208255503938373, + "learning_rate": 8.696718652811401e-05, + "loss": 3.0327, + "step": 15060 + }, + { + "epoch": 0.9349431994537215, + "grad_norm": 0.2316220570370617, + "learning_rate": 8.69647546906526e-05, + "loss": 2.9739, + "step": 15061 + }, + { + "epoch": 0.9350052765534794, + "grad_norm": 0.16113238479095104, + "learning_rate": 8.696232266033709e-05, + "loss": 3.1202, + "step": 15062 + }, + { + "epoch": 0.9350673536532373, + "grad_norm": 0.17151184079316212, + "learning_rate": 8.695989043718016e-05, + "loss": 3.0233, + "step": 15063 + }, + { + "epoch": 0.9351294307529953, + "grad_norm": 0.18407726534636318, + "learning_rate": 8.69574580211945e-05, + "loss": 2.9834, + "step": 15064 + }, + { + "epoch": 0.9351915078527531, + "grad_norm": 0.1638297121119355, + "learning_rate": 8.695502541239284e-05, + "loss": 2.9838, + "step": 15065 + }, + { + "epoch": 0.935253584952511, + "grad_norm": 0.16538419131675017, + "learning_rate": 8.695259261078782e-05, + "loss": 2.9275, + "step": 15066 + }, + { + "epoch": 0.9353156620522689, + "grad_norm": 0.15678491819567839, + "learning_rate": 8.695015961639215e-05, + "loss": 2.8882, + "step": 15067 + }, + { + "epoch": 0.9353777391520268, + "grad_norm": 0.1733283025633548, + "learning_rate": 8.694772642921853e-05, + "loss": 3.0596, + "step": 15068 + }, + { + "epoch": 0.9354398162517847, + "grad_norm": 0.20669764505432683, + "learning_rate": 8.694529304927967e-05, + "loss": 2.9141, + "step": 15069 + }, + { + "epoch": 0.9355018933515427, + "grad_norm": 0.17137870878537287, + "learning_rate": 8.694285947658823e-05, + "loss": 3.037, + "step": 15070 + }, + { + "epoch": 0.9355639704513005, + "grad_norm": 0.17315201051999118, + "learning_rate": 8.694042571115694e-05, + "loss": 3.1085, + "step": 15071 + }, + { + "epoch": 0.9356260475510584, + "grad_norm": 0.17147283754226852, + "learning_rate": 8.693799175299846e-05, + "loss": 3.0259, + "step": 15072 + }, + { + "epoch": 0.9356881246508163, + "grad_norm": 0.18573062230957327, + "learning_rate": 8.693555760212553e-05, + "loss": 2.9289, + "step": 15073 + }, + { + "epoch": 0.9357502017505742, + "grad_norm": 0.16467200322141215, + "learning_rate": 8.693312325855082e-05, + "loss": 2.9133, + "step": 15074 + }, + { + "epoch": 0.9358122788503321, + "grad_norm": 0.22631131421494632, + "learning_rate": 8.693068872228704e-05, + "loss": 2.9497, + "step": 15075 + }, + { + "epoch": 0.93587435595009, + "grad_norm": 0.17671615151772443, + "learning_rate": 8.69282539933469e-05, + "loss": 2.8952, + "step": 15076 + }, + { + "epoch": 0.9359364330498479, + "grad_norm": 0.18224062318515205, + "learning_rate": 8.692581907174307e-05, + "loss": 2.9672, + "step": 15077 + }, + { + "epoch": 0.9359985101496058, + "grad_norm": 0.17652882976530748, + "learning_rate": 8.69233839574883e-05, + "loss": 2.9358, + "step": 15078 + }, + { + "epoch": 0.9360605872493637, + "grad_norm": 0.16177106508687505, + "learning_rate": 8.692094865059527e-05, + "loss": 2.9301, + "step": 15079 + }, + { + "epoch": 0.9361226643491216, + "grad_norm": 0.16122675382515259, + "learning_rate": 8.691851315107668e-05, + "loss": 3.0402, + "step": 15080 + }, + { + "epoch": 0.9361847414488795, + "grad_norm": 0.1836489533563062, + "learning_rate": 8.691607745894526e-05, + "loss": 3.0109, + "step": 15081 + }, + { + "epoch": 0.9362468185486374, + "grad_norm": 0.1595428921825639, + "learning_rate": 8.691364157421369e-05, + "loss": 2.9648, + "step": 15082 + }, + { + "epoch": 0.9363088956483953, + "grad_norm": 0.18958377266053938, + "learning_rate": 8.691120549689468e-05, + "loss": 3.0027, + "step": 15083 + }, + { + "epoch": 0.9363709727481532, + "grad_norm": 0.18247266140061294, + "learning_rate": 8.690876922700097e-05, + "loss": 3.0098, + "step": 15084 + }, + { + "epoch": 0.9364330498479111, + "grad_norm": 0.16265424826102104, + "learning_rate": 8.690633276454525e-05, + "loss": 2.9474, + "step": 15085 + }, + { + "epoch": 0.936495126947669, + "grad_norm": 0.156218948855181, + "learning_rate": 8.690389610954021e-05, + "loss": 2.8992, + "step": 15086 + }, + { + "epoch": 0.9365572040474269, + "grad_norm": 0.19705697157195123, + "learning_rate": 8.690145926199862e-05, + "loss": 2.9403, + "step": 15087 + }, + { + "epoch": 0.9366192811471848, + "grad_norm": 0.16488330307783372, + "learning_rate": 8.689902222193314e-05, + "loss": 3.0407, + "step": 15088 + }, + { + "epoch": 0.9366813582469427, + "grad_norm": 0.18551335282927497, + "learning_rate": 8.689658498935649e-05, + "loss": 3.0963, + "step": 15089 + }, + { + "epoch": 0.9367434353467006, + "grad_norm": 0.1721172143273746, + "learning_rate": 8.689414756428143e-05, + "loss": 2.9613, + "step": 15090 + }, + { + "epoch": 0.9368055124464585, + "grad_norm": 0.21088327487929773, + "learning_rate": 8.689170994672063e-05, + "loss": 2.9221, + "step": 15091 + }, + { + "epoch": 0.9368675895462164, + "grad_norm": 0.16468367601113593, + "learning_rate": 8.68892721366868e-05, + "loss": 2.8916, + "step": 15092 + }, + { + "epoch": 0.9369296666459743, + "grad_norm": 0.20434274067434752, + "learning_rate": 8.688683413419272e-05, + "loss": 2.9158, + "step": 15093 + }, + { + "epoch": 0.9369917437457322, + "grad_norm": 0.1652841103104864, + "learning_rate": 8.688439593925105e-05, + "loss": 2.9652, + "step": 15094 + }, + { + "epoch": 0.93705382084549, + "grad_norm": 0.1622815768684667, + "learning_rate": 8.688195755187454e-05, + "loss": 2.8695, + "step": 15095 + }, + { + "epoch": 0.937115897945248, + "grad_norm": 0.21279073699158987, + "learning_rate": 8.687951897207592e-05, + "loss": 2.9681, + "step": 15096 + }, + { + "epoch": 0.9371779750450059, + "grad_norm": 0.19698675669714988, + "learning_rate": 8.687708019986787e-05, + "loss": 2.9597, + "step": 15097 + }, + { + "epoch": 0.9372400521447638, + "grad_norm": 0.19839849088622666, + "learning_rate": 8.687464123526314e-05, + "loss": 3.0008, + "step": 15098 + }, + { + "epoch": 0.9373021292445217, + "grad_norm": 0.16898405590018425, + "learning_rate": 8.687220207827446e-05, + "loss": 3.0188, + "step": 15099 + }, + { + "epoch": 0.9373642063442796, + "grad_norm": 0.1715360572288653, + "learning_rate": 8.686976272891456e-05, + "loss": 3.0425, + "step": 15100 + }, + { + "epoch": 0.9374262834440374, + "grad_norm": 0.1722063858004555, + "learning_rate": 8.686732318719616e-05, + "loss": 2.9597, + "step": 15101 + }, + { + "epoch": 0.9374883605437954, + "grad_norm": 0.1688441210542704, + "learning_rate": 8.686488345313197e-05, + "loss": 2.9555, + "step": 15102 + }, + { + "epoch": 0.9375504376435533, + "grad_norm": 0.17705172135050373, + "learning_rate": 8.686244352673474e-05, + "loss": 2.9888, + "step": 15103 + }, + { + "epoch": 0.9376125147433112, + "grad_norm": 0.17967048854742457, + "learning_rate": 8.686000340801721e-05, + "loss": 3.0678, + "step": 15104 + }, + { + "epoch": 0.9376745918430691, + "grad_norm": 0.23098178916319304, + "learning_rate": 8.685756309699206e-05, + "loss": 2.9636, + "step": 15105 + }, + { + "epoch": 0.937736668942827, + "grad_norm": 0.15983102504498248, + "learning_rate": 8.685512259367208e-05, + "loss": 2.892, + "step": 15106 + }, + { + "epoch": 0.9377987460425848, + "grad_norm": 0.18778396997499042, + "learning_rate": 8.685268189807e-05, + "loss": 3.0761, + "step": 15107 + }, + { + "epoch": 0.9378608231423428, + "grad_norm": 0.16651355458157516, + "learning_rate": 8.685024101019849e-05, + "loss": 2.906, + "step": 15108 + }, + { + "epoch": 0.9379229002421007, + "grad_norm": 0.2105339393109881, + "learning_rate": 8.684779993007033e-05, + "loss": 2.946, + "step": 15109 + }, + { + "epoch": 0.9379849773418586, + "grad_norm": 0.18308645531166254, + "learning_rate": 8.684535865769827e-05, + "loss": 2.9753, + "step": 15110 + }, + { + "epoch": 0.9380470544416165, + "grad_norm": 0.25671193715747226, + "learning_rate": 8.684291719309503e-05, + "loss": 2.9137, + "step": 15111 + }, + { + "epoch": 0.9381091315413744, + "grad_norm": 0.21772644151702533, + "learning_rate": 8.684047553627335e-05, + "loss": 3.0199, + "step": 15112 + }, + { + "epoch": 0.9381712086411322, + "grad_norm": 0.18935347423712962, + "learning_rate": 8.683803368724594e-05, + "loss": 3.0464, + "step": 15113 + }, + { + "epoch": 0.9382332857408902, + "grad_norm": 0.20655051684283748, + "learning_rate": 8.683559164602559e-05, + "loss": 2.9336, + "step": 15114 + }, + { + "epoch": 0.9382953628406481, + "grad_norm": 0.17102146080345149, + "learning_rate": 8.683314941262501e-05, + "loss": 2.9203, + "step": 15115 + }, + { + "epoch": 0.938357439940406, + "grad_norm": 0.229191059020938, + "learning_rate": 8.683070698705696e-05, + "loss": 2.9841, + "step": 15116 + }, + { + "epoch": 0.9384195170401639, + "grad_norm": 0.19489757332839794, + "learning_rate": 8.682826436933415e-05, + "loss": 2.9663, + "step": 15117 + }, + { + "epoch": 0.9384815941399218, + "grad_norm": 0.16717291416287708, + "learning_rate": 8.682582155946935e-05, + "loss": 2.9551, + "step": 15118 + }, + { + "epoch": 0.9385436712396796, + "grad_norm": 0.20366847253085718, + "learning_rate": 8.682337855747532e-05, + "loss": 2.9092, + "step": 15119 + }, + { + "epoch": 0.9386057483394376, + "grad_norm": 0.2592422404116609, + "learning_rate": 8.682093536336475e-05, + "loss": 2.9709, + "step": 15120 + }, + { + "epoch": 0.9386678254391955, + "grad_norm": 0.2002231578535209, + "learning_rate": 8.681849197715045e-05, + "loss": 2.9927, + "step": 15121 + }, + { + "epoch": 0.9387299025389534, + "grad_norm": 0.18857317014982478, + "learning_rate": 8.681604839884512e-05, + "loss": 3.0048, + "step": 15122 + }, + { + "epoch": 0.9387919796387113, + "grad_norm": 0.20052187267050628, + "learning_rate": 8.681360462846153e-05, + "loss": 3.0031, + "step": 15123 + }, + { + "epoch": 0.9388540567384692, + "grad_norm": 0.1689277154655396, + "learning_rate": 8.681116066601244e-05, + "loss": 2.971, + "step": 15124 + }, + { + "epoch": 0.938916133838227, + "grad_norm": 0.23176006049218933, + "learning_rate": 8.680871651151058e-05, + "loss": 3.0165, + "step": 15125 + }, + { + "epoch": 0.938978210937985, + "grad_norm": 0.17768670585151614, + "learning_rate": 8.68062721649687e-05, + "loss": 2.9624, + "step": 15126 + }, + { + "epoch": 0.9390402880377429, + "grad_norm": 0.16987853209961715, + "learning_rate": 8.680382762639959e-05, + "loss": 3.0753, + "step": 15127 + }, + { + "epoch": 0.9391023651375008, + "grad_norm": 0.1737794593666648, + "learning_rate": 8.680138289581596e-05, + "loss": 3.0156, + "step": 15128 + }, + { + "epoch": 0.9391644422372587, + "grad_norm": 0.19971709470879181, + "learning_rate": 8.679893797323058e-05, + "loss": 2.8827, + "step": 15129 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.1800833467566335, + "learning_rate": 8.67964928586562e-05, + "loss": 2.9938, + "step": 15130 + }, + { + "epoch": 0.9392885964367744, + "grad_norm": 0.208906765368674, + "learning_rate": 8.67940475521056e-05, + "loss": 3.0314, + "step": 15131 + }, + { + "epoch": 0.9393506735365323, + "grad_norm": 0.17231295339872685, + "learning_rate": 8.679160205359153e-05, + "loss": 3.0173, + "step": 15132 + }, + { + "epoch": 0.9394127506362903, + "grad_norm": 0.20385715690184408, + "learning_rate": 8.678915636312672e-05, + "loss": 2.9619, + "step": 15133 + }, + { + "epoch": 0.9394748277360482, + "grad_norm": 0.22479364051488454, + "learning_rate": 8.678671048072397e-05, + "loss": 3.0435, + "step": 15134 + }, + { + "epoch": 0.9395369048358061, + "grad_norm": 0.1945535888254834, + "learning_rate": 8.6784264406396e-05, + "loss": 2.9318, + "step": 15135 + }, + { + "epoch": 0.939598981935564, + "grad_norm": 0.21079964018129135, + "learning_rate": 8.67818181401556e-05, + "loss": 3.0428, + "step": 15136 + }, + { + "epoch": 0.9396610590353218, + "grad_norm": 0.16523736969057973, + "learning_rate": 8.677937168201553e-05, + "loss": 2.9735, + "step": 15137 + }, + { + "epoch": 0.9397231361350797, + "grad_norm": 0.18269592260585504, + "learning_rate": 8.677692503198856e-05, + "loss": 2.97, + "step": 15138 + }, + { + "epoch": 0.9397852132348377, + "grad_norm": 0.16652831964139067, + "learning_rate": 8.677447819008741e-05, + "loss": 3.0951, + "step": 15139 + }, + { + "epoch": 0.9398472903345956, + "grad_norm": 0.27901672771770225, + "learning_rate": 8.67720311563249e-05, + "loss": 2.9994, + "step": 15140 + }, + { + "epoch": 0.9399093674343535, + "grad_norm": 0.19459449398767836, + "learning_rate": 8.67695839307138e-05, + "loss": 3.0081, + "step": 15141 + }, + { + "epoch": 0.9399714445341114, + "grad_norm": 0.2355144538518739, + "learning_rate": 8.676713651326683e-05, + "loss": 2.9555, + "step": 15142 + }, + { + "epoch": 0.9400335216338692, + "grad_norm": 0.21481242384977214, + "learning_rate": 8.676468890399678e-05, + "loss": 2.9099, + "step": 15143 + }, + { + "epoch": 0.9400955987336271, + "grad_norm": 0.18457638765862586, + "learning_rate": 8.676224110291643e-05, + "loss": 2.9138, + "step": 15144 + }, + { + "epoch": 0.940157675833385, + "grad_norm": 0.20160118145861405, + "learning_rate": 8.675979311003854e-05, + "loss": 2.9244, + "step": 15145 + }, + { + "epoch": 0.940219752933143, + "grad_norm": 0.1751276208669876, + "learning_rate": 8.67573449253759e-05, + "loss": 3.0355, + "step": 15146 + }, + { + "epoch": 0.9402818300329009, + "grad_norm": 0.1894578244914374, + "learning_rate": 8.675489654894125e-05, + "loss": 2.9918, + "step": 15147 + }, + { + "epoch": 0.9403439071326588, + "grad_norm": 0.22104540589525853, + "learning_rate": 8.675244798074739e-05, + "loss": 2.9922, + "step": 15148 + }, + { + "epoch": 0.9404059842324166, + "grad_norm": 0.2205187694304018, + "learning_rate": 8.674999922080708e-05, + "loss": 3.0582, + "step": 15149 + }, + { + "epoch": 0.9404680613321745, + "grad_norm": 0.20528309887363985, + "learning_rate": 8.674755026913312e-05, + "loss": 2.8614, + "step": 15150 + }, + { + "epoch": 0.9405301384319324, + "grad_norm": 0.2568702239731866, + "learning_rate": 8.674510112573826e-05, + "loss": 2.9895, + "step": 15151 + }, + { + "epoch": 0.9405922155316904, + "grad_norm": 0.20271014985750296, + "learning_rate": 8.674265179063528e-05, + "loss": 2.9781, + "step": 15152 + }, + { + "epoch": 0.9406542926314483, + "grad_norm": 0.1829100621554677, + "learning_rate": 8.674020226383697e-05, + "loss": 2.8724, + "step": 15153 + }, + { + "epoch": 0.9407163697312062, + "grad_norm": 0.1906093141480796, + "learning_rate": 8.67377525453561e-05, + "loss": 2.9163, + "step": 15154 + }, + { + "epoch": 0.940778446830964, + "grad_norm": 0.162234103236301, + "learning_rate": 8.673530263520547e-05, + "loss": 2.9029, + "step": 15155 + }, + { + "epoch": 0.9408405239307219, + "grad_norm": 0.25439882183300716, + "learning_rate": 8.673285253339785e-05, + "loss": 2.9906, + "step": 15156 + }, + { + "epoch": 0.9409026010304798, + "grad_norm": 0.2612910690606965, + "learning_rate": 8.673040223994601e-05, + "loss": 3.0114, + "step": 15157 + }, + { + "epoch": 0.9409646781302378, + "grad_norm": 0.17777735850620832, + "learning_rate": 8.672795175486275e-05, + "loss": 3.018, + "step": 15158 + }, + { + "epoch": 0.9410267552299957, + "grad_norm": 0.20256616674068798, + "learning_rate": 8.672550107816084e-05, + "loss": 2.9359, + "step": 15159 + }, + { + "epoch": 0.9410888323297536, + "grad_norm": 0.24368170521174656, + "learning_rate": 8.672305020985308e-05, + "loss": 2.9803, + "step": 15160 + }, + { + "epoch": 0.9411509094295114, + "grad_norm": 0.2103018454875455, + "learning_rate": 8.672059914995226e-05, + "loss": 2.8611, + "step": 15161 + }, + { + "epoch": 0.9412129865292693, + "grad_norm": 0.18191166621213506, + "learning_rate": 8.671814789847116e-05, + "loss": 2.987, + "step": 15162 + }, + { + "epoch": 0.9412750636290272, + "grad_norm": 0.2704052506894244, + "learning_rate": 8.671569645542257e-05, + "loss": 2.9021, + "step": 15163 + }, + { + "epoch": 0.9413371407287852, + "grad_norm": 0.1797820858172584, + "learning_rate": 8.671324482081927e-05, + "loss": 2.9396, + "step": 15164 + }, + { + "epoch": 0.9413992178285431, + "grad_norm": 0.2811464940450018, + "learning_rate": 8.671079299467406e-05, + "loss": 2.9895, + "step": 15165 + }, + { + "epoch": 0.941461294928301, + "grad_norm": 0.1892167514314324, + "learning_rate": 8.670834097699974e-05, + "loss": 2.9563, + "step": 15166 + }, + { + "epoch": 0.9415233720280588, + "grad_norm": 0.2043611071761333, + "learning_rate": 8.670588876780907e-05, + "loss": 2.9555, + "step": 15167 + }, + { + "epoch": 0.9415854491278167, + "grad_norm": 0.18124491133616913, + "learning_rate": 8.670343636711489e-05, + "loss": 2.9866, + "step": 15168 + }, + { + "epoch": 0.9416475262275746, + "grad_norm": 0.19871796514743345, + "learning_rate": 8.670098377492996e-05, + "loss": 3.1225, + "step": 15169 + }, + { + "epoch": 0.9417096033273326, + "grad_norm": 0.17796108961155366, + "learning_rate": 8.66985309912671e-05, + "loss": 2.9998, + "step": 15170 + }, + { + "epoch": 0.9417716804270905, + "grad_norm": 0.21855394910426854, + "learning_rate": 8.669607801613909e-05, + "loss": 3.0492, + "step": 15171 + }, + { + "epoch": 0.9418337575268484, + "grad_norm": 0.2000610180251419, + "learning_rate": 8.669362484955874e-05, + "loss": 2.9784, + "step": 15172 + }, + { + "epoch": 0.9418958346266062, + "grad_norm": 0.19635898616215294, + "learning_rate": 8.669117149153882e-05, + "loss": 3.0046, + "step": 15173 + }, + { + "epoch": 0.9419579117263641, + "grad_norm": 0.19729803782155372, + "learning_rate": 8.668871794209218e-05, + "loss": 2.9404, + "step": 15174 + }, + { + "epoch": 0.942019988826122, + "grad_norm": 0.2118808439033978, + "learning_rate": 8.668626420123159e-05, + "loss": 3.0062, + "step": 15175 + }, + { + "epoch": 0.94208206592588, + "grad_norm": 0.1981654746924368, + "learning_rate": 8.668381026896984e-05, + "loss": 2.9698, + "step": 15176 + }, + { + "epoch": 0.9421441430256379, + "grad_norm": 0.21893245036503345, + "learning_rate": 8.668135614531973e-05, + "loss": 2.9, + "step": 15177 + }, + { + "epoch": 0.9422062201253958, + "grad_norm": 0.17961609499698097, + "learning_rate": 8.66789018302941e-05, + "loss": 2.9285, + "step": 15178 + }, + { + "epoch": 0.9422682972251536, + "grad_norm": 0.17615941483613784, + "learning_rate": 8.667644732390574e-05, + "loss": 2.945, + "step": 15179 + }, + { + "epoch": 0.9423303743249115, + "grad_norm": 0.16969845871638478, + "learning_rate": 8.667399262616745e-05, + "loss": 2.972, + "step": 15180 + }, + { + "epoch": 0.9423924514246694, + "grad_norm": 0.21845827916572894, + "learning_rate": 8.667153773709202e-05, + "loss": 2.9497, + "step": 15181 + }, + { + "epoch": 0.9424545285244273, + "grad_norm": 0.18148992295966798, + "learning_rate": 8.666908265669229e-05, + "loss": 2.9552, + "step": 15182 + }, + { + "epoch": 0.9425166056241853, + "grad_norm": 0.16877782932149576, + "learning_rate": 8.666662738498106e-05, + "loss": 2.9099, + "step": 15183 + }, + { + "epoch": 0.9425786827239432, + "grad_norm": 0.1599124005729994, + "learning_rate": 8.666417192197112e-05, + "loss": 3.0031, + "step": 15184 + }, + { + "epoch": 0.942640759823701, + "grad_norm": 0.2042892450982975, + "learning_rate": 8.66617162676753e-05, + "loss": 3.0693, + "step": 15185 + }, + { + "epoch": 0.9427028369234589, + "grad_norm": 0.19747925512095418, + "learning_rate": 8.66592604221064e-05, + "loss": 3.0096, + "step": 15186 + }, + { + "epoch": 0.9427649140232168, + "grad_norm": 0.1782745181358147, + "learning_rate": 8.665680438527724e-05, + "loss": 3.017, + "step": 15187 + }, + { + "epoch": 0.9428269911229747, + "grad_norm": 0.15820044056063048, + "learning_rate": 8.665434815720063e-05, + "loss": 3.0391, + "step": 15188 + }, + { + "epoch": 0.9428890682227327, + "grad_norm": 0.15804932029065677, + "learning_rate": 8.665189173788942e-05, + "loss": 2.9902, + "step": 15189 + }, + { + "epoch": 0.9429511453224906, + "grad_norm": 0.15059076343526767, + "learning_rate": 8.664943512735635e-05, + "loss": 3.0023, + "step": 15190 + }, + { + "epoch": 0.9430132224222484, + "grad_norm": 0.16377816652337007, + "learning_rate": 8.66469783256143e-05, + "loss": 2.9657, + "step": 15191 + }, + { + "epoch": 0.9430752995220063, + "grad_norm": 0.15933360515887857, + "learning_rate": 8.664452133267606e-05, + "loss": 3.0991, + "step": 15192 + }, + { + "epoch": 0.9431373766217642, + "grad_norm": 0.2307500092856673, + "learning_rate": 8.664206414855447e-05, + "loss": 2.9411, + "step": 15193 + }, + { + "epoch": 0.9431994537215221, + "grad_norm": 0.20765994538692423, + "learning_rate": 8.663960677326233e-05, + "loss": 2.9481, + "step": 15194 + }, + { + "epoch": 0.9432615308212801, + "grad_norm": 0.16894570389189614, + "learning_rate": 8.663714920681246e-05, + "loss": 3.0376, + "step": 15195 + }, + { + "epoch": 0.943323607921038, + "grad_norm": 0.20223208553442118, + "learning_rate": 8.66346914492177e-05, + "loss": 2.9758, + "step": 15196 + }, + { + "epoch": 0.9433856850207958, + "grad_norm": 0.1702341083036087, + "learning_rate": 8.663223350049085e-05, + "loss": 2.9357, + "step": 15197 + }, + { + "epoch": 0.9434477621205537, + "grad_norm": 0.17258605393120058, + "learning_rate": 8.662977536064476e-05, + "loss": 2.9309, + "step": 15198 + }, + { + "epoch": 0.9435098392203116, + "grad_norm": 0.16853188772698036, + "learning_rate": 8.662731702969222e-05, + "loss": 3.0307, + "step": 15199 + }, + { + "epoch": 0.9435719163200695, + "grad_norm": 0.1992326899868531, + "learning_rate": 8.662485850764609e-05, + "loss": 2.9186, + "step": 15200 + }, + { + "epoch": 0.9436339934198275, + "grad_norm": 0.1656092382074693, + "learning_rate": 8.662239979451918e-05, + "loss": 3.0048, + "step": 15201 + }, + { + "epoch": 0.9436960705195854, + "grad_norm": 0.20684682080792785, + "learning_rate": 8.661994089032433e-05, + "loss": 3.0001, + "step": 15202 + }, + { + "epoch": 0.9437581476193432, + "grad_norm": 0.20180994525366278, + "learning_rate": 8.661748179507432e-05, + "loss": 3.0301, + "step": 15203 + }, + { + "epoch": 0.9438202247191011, + "grad_norm": 0.29825424739534406, + "learning_rate": 8.661502250878207e-05, + "loss": 2.8662, + "step": 15204 + }, + { + "epoch": 0.943882301818859, + "grad_norm": 0.1596365556525897, + "learning_rate": 8.661256303146033e-05, + "loss": 3.0282, + "step": 15205 + }, + { + "epoch": 0.9439443789186169, + "grad_norm": 0.20494843878350325, + "learning_rate": 8.661010336312195e-05, + "loss": 2.9208, + "step": 15206 + }, + { + "epoch": 0.9440064560183749, + "grad_norm": 0.1671813540354, + "learning_rate": 8.66076435037798e-05, + "loss": 2.9735, + "step": 15207 + }, + { + "epoch": 0.9440685331181328, + "grad_norm": 0.20282201912146022, + "learning_rate": 8.660518345344668e-05, + "loss": 2.9509, + "step": 15208 + }, + { + "epoch": 0.9441306102178906, + "grad_norm": 0.19556265070486678, + "learning_rate": 8.660272321213541e-05, + "loss": 3.0224, + "step": 15209 + }, + { + "epoch": 0.9441926873176485, + "grad_norm": 0.23103584412638364, + "learning_rate": 8.660026277985887e-05, + "loss": 2.9879, + "step": 15210 + }, + { + "epoch": 0.9442547644174064, + "grad_norm": 0.232881188340618, + "learning_rate": 8.659780215662987e-05, + "loss": 3.0014, + "step": 15211 + }, + { + "epoch": 0.9443168415171643, + "grad_norm": 0.2507506663845103, + "learning_rate": 8.659534134246123e-05, + "loss": 2.8827, + "step": 15212 + }, + { + "epoch": 0.9443789186169222, + "grad_norm": 0.2747970840280651, + "learning_rate": 8.659288033736583e-05, + "loss": 2.9223, + "step": 15213 + }, + { + "epoch": 0.9444409957166802, + "grad_norm": 0.20112706583466425, + "learning_rate": 8.65904191413565e-05, + "loss": 2.9765, + "step": 15214 + }, + { + "epoch": 0.944503072816438, + "grad_norm": 0.19183034041622246, + "learning_rate": 8.658795775444605e-05, + "loss": 2.9986, + "step": 15215 + }, + { + "epoch": 0.9445651499161959, + "grad_norm": 0.19873852912621498, + "learning_rate": 8.658549617664736e-05, + "loss": 3.0291, + "step": 15216 + }, + { + "epoch": 0.9446272270159538, + "grad_norm": 0.18079353023748715, + "learning_rate": 8.658303440797326e-05, + "loss": 2.998, + "step": 15217 + }, + { + "epoch": 0.9446893041157117, + "grad_norm": 0.24058216785092007, + "learning_rate": 8.658057244843657e-05, + "loss": 2.8759, + "step": 15218 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.18741019402978723, + "learning_rate": 8.657811029805016e-05, + "loss": 2.9612, + "step": 15219 + }, + { + "epoch": 0.9448134583152276, + "grad_norm": 0.18710052134804644, + "learning_rate": 8.657564795682687e-05, + "loss": 2.9685, + "step": 15220 + }, + { + "epoch": 0.9448755354149854, + "grad_norm": 0.18556060740198585, + "learning_rate": 8.657318542477955e-05, + "loss": 2.9393, + "step": 15221 + }, + { + "epoch": 0.9449376125147433, + "grad_norm": 0.1973255179856221, + "learning_rate": 8.657072270192103e-05, + "loss": 3.0506, + "step": 15222 + }, + { + "epoch": 0.9449996896145012, + "grad_norm": 0.18433938894821808, + "learning_rate": 8.656825978826417e-05, + "loss": 2.965, + "step": 15223 + }, + { + "epoch": 0.9450617667142591, + "grad_norm": 0.15447466301178248, + "learning_rate": 8.656579668382183e-05, + "loss": 2.9907, + "step": 15224 + }, + { + "epoch": 0.945123843814017, + "grad_norm": 0.17219247540799076, + "learning_rate": 8.656333338860686e-05, + "loss": 2.942, + "step": 15225 + }, + { + "epoch": 0.945185920913775, + "grad_norm": 0.16399500230408226, + "learning_rate": 8.65608699026321e-05, + "loss": 2.8891, + "step": 15226 + }, + { + "epoch": 0.9452479980135328, + "grad_norm": 0.1759744747770481, + "learning_rate": 8.65584062259104e-05, + "loss": 2.8607, + "step": 15227 + }, + { + "epoch": 0.9453100751132907, + "grad_norm": 0.16792577371164905, + "learning_rate": 8.655594235845462e-05, + "loss": 3.0315, + "step": 15228 + }, + { + "epoch": 0.9453721522130486, + "grad_norm": 0.17762048446853795, + "learning_rate": 8.655347830027762e-05, + "loss": 3.0913, + "step": 15229 + }, + { + "epoch": 0.9454342293128065, + "grad_norm": 0.168006094396268, + "learning_rate": 8.655101405139226e-05, + "loss": 2.9764, + "step": 15230 + }, + { + "epoch": 0.9454963064125644, + "grad_norm": 0.17678348080901737, + "learning_rate": 8.654854961181136e-05, + "loss": 2.9584, + "step": 15231 + }, + { + "epoch": 0.9455583835123224, + "grad_norm": 0.14810694102567043, + "learning_rate": 8.654608498154782e-05, + "loss": 2.9407, + "step": 15232 + }, + { + "epoch": 0.9456204606120802, + "grad_norm": 0.20594534300426257, + "learning_rate": 8.654362016061449e-05, + "loss": 3.0186, + "step": 15233 + }, + { + "epoch": 0.9456825377118381, + "grad_norm": 0.1854585738632241, + "learning_rate": 8.65411551490242e-05, + "loss": 2.974, + "step": 15234 + }, + { + "epoch": 0.945744614811596, + "grad_norm": 0.18891539407185648, + "learning_rate": 8.653868994678985e-05, + "loss": 2.9512, + "step": 15235 + }, + { + "epoch": 0.9458066919113539, + "grad_norm": 0.15799929002899596, + "learning_rate": 8.653622455392427e-05, + "loss": 2.9541, + "step": 15236 + }, + { + "epoch": 0.9458687690111118, + "grad_norm": 0.17329573808922574, + "learning_rate": 8.653375897044036e-05, + "loss": 2.971, + "step": 15237 + }, + { + "epoch": 0.9459308461108697, + "grad_norm": 0.15469951245838934, + "learning_rate": 8.653129319635093e-05, + "loss": 2.9989, + "step": 15238 + }, + { + "epoch": 0.9459929232106276, + "grad_norm": 0.18937423872715994, + "learning_rate": 8.65288272316689e-05, + "loss": 2.9679, + "step": 15239 + }, + { + "epoch": 0.9460550003103855, + "grad_norm": 0.16410691364029345, + "learning_rate": 8.65263610764071e-05, + "loss": 2.9072, + "step": 15240 + }, + { + "epoch": 0.9461170774101434, + "grad_norm": 0.1646624901907285, + "learning_rate": 8.652389473057843e-05, + "loss": 2.9457, + "step": 15241 + }, + { + "epoch": 0.9461791545099013, + "grad_norm": 0.17686563438381667, + "learning_rate": 8.652142819419571e-05, + "loss": 3.0093, + "step": 15242 + }, + { + "epoch": 0.9462412316096592, + "grad_norm": 0.1634846228432055, + "learning_rate": 8.651896146727183e-05, + "loss": 2.944, + "step": 15243 + }, + { + "epoch": 0.9463033087094171, + "grad_norm": 0.1616098336622023, + "learning_rate": 8.651649454981967e-05, + "loss": 2.9263, + "step": 15244 + }, + { + "epoch": 0.946365385809175, + "grad_norm": 0.1517575736516499, + "learning_rate": 8.651402744185211e-05, + "loss": 2.9668, + "step": 15245 + }, + { + "epoch": 0.9464274629089329, + "grad_norm": 0.1768621273883149, + "learning_rate": 8.651156014338198e-05, + "loss": 2.9981, + "step": 15246 + }, + { + "epoch": 0.9464895400086908, + "grad_norm": 0.1624367808740336, + "learning_rate": 8.65090926544222e-05, + "loss": 2.971, + "step": 15247 + }, + { + "epoch": 0.9465516171084487, + "grad_norm": 0.1723631299588239, + "learning_rate": 8.65066249749856e-05, + "loss": 3.0683, + "step": 15248 + }, + { + "epoch": 0.9466136942082066, + "grad_norm": 0.16866473233611515, + "learning_rate": 8.650415710508508e-05, + "loss": 2.8416, + "step": 15249 + }, + { + "epoch": 0.9466757713079645, + "grad_norm": 0.17298222576326086, + "learning_rate": 8.650168904473352e-05, + "loss": 2.9024, + "step": 15250 + }, + { + "epoch": 0.9467378484077223, + "grad_norm": 0.1585943893833144, + "learning_rate": 8.649922079394377e-05, + "loss": 2.9774, + "step": 15251 + }, + { + "epoch": 0.9467999255074803, + "grad_norm": 0.1603754334796985, + "learning_rate": 8.649675235272873e-05, + "loss": 2.9215, + "step": 15252 + }, + { + "epoch": 0.9468620026072382, + "grad_norm": 0.15724343748340203, + "learning_rate": 8.649428372110129e-05, + "loss": 2.9179, + "step": 15253 + }, + { + "epoch": 0.9469240797069961, + "grad_norm": 0.1615463844798823, + "learning_rate": 8.64918148990743e-05, + "loss": 3.0306, + "step": 15254 + }, + { + "epoch": 0.946986156806754, + "grad_norm": 0.25071044647281876, + "learning_rate": 8.648934588666064e-05, + "loss": 3.0359, + "step": 15255 + }, + { + "epoch": 0.9470482339065119, + "grad_norm": 0.16831734435783646, + "learning_rate": 8.648687668387323e-05, + "loss": 2.8808, + "step": 15256 + }, + { + "epoch": 0.9471103110062697, + "grad_norm": 0.18148197916381453, + "learning_rate": 8.648440729072492e-05, + "loss": 2.9142, + "step": 15257 + }, + { + "epoch": 0.9471723881060277, + "grad_norm": 0.20836773926030866, + "learning_rate": 8.648193770722858e-05, + "loss": 2.9419, + "step": 15258 + }, + { + "epoch": 0.9472344652057856, + "grad_norm": 0.16549352564853945, + "learning_rate": 8.647946793339715e-05, + "loss": 2.9408, + "step": 15259 + }, + { + "epoch": 0.9472965423055435, + "grad_norm": 0.16625314999048496, + "learning_rate": 8.647699796924345e-05, + "loss": 2.9722, + "step": 15260 + }, + { + "epoch": 0.9473586194053014, + "grad_norm": 0.16087591326172015, + "learning_rate": 8.64745278147804e-05, + "loss": 3.0103, + "step": 15261 + }, + { + "epoch": 0.9474206965050593, + "grad_norm": 0.17143143507965444, + "learning_rate": 8.64720574700209e-05, + "loss": 2.9655, + "step": 15262 + }, + { + "epoch": 0.9474827736048171, + "grad_norm": 0.14518146913805777, + "learning_rate": 8.646958693497781e-05, + "loss": 2.9339, + "step": 15263 + }, + { + "epoch": 0.9475448507045751, + "grad_norm": 0.15736647120795055, + "learning_rate": 8.646711620966404e-05, + "loss": 2.9463, + "step": 15264 + }, + { + "epoch": 0.947606927804333, + "grad_norm": 0.15056714931497173, + "learning_rate": 8.646464529409246e-05, + "loss": 2.9471, + "step": 15265 + }, + { + "epoch": 0.9476690049040909, + "grad_norm": 0.1596435213183753, + "learning_rate": 8.646217418827599e-05, + "loss": 2.9246, + "step": 15266 + }, + { + "epoch": 0.9477310820038488, + "grad_norm": 0.17696310735171067, + "learning_rate": 8.645970289222749e-05, + "loss": 2.976, + "step": 15267 + }, + { + "epoch": 0.9477931591036067, + "grad_norm": 0.1847830762142294, + "learning_rate": 8.645723140595987e-05, + "loss": 3.0279, + "step": 15268 + }, + { + "epoch": 0.9478552362033645, + "grad_norm": 0.22948266679210233, + "learning_rate": 8.645475972948602e-05, + "loss": 2.9954, + "step": 15269 + }, + { + "epoch": 0.9479173133031225, + "grad_norm": 0.17308998905537445, + "learning_rate": 8.645228786281886e-05, + "loss": 3.0117, + "step": 15270 + }, + { + "epoch": 0.9479793904028804, + "grad_norm": 0.24445215566997047, + "learning_rate": 8.644981580597125e-05, + "loss": 2.9967, + "step": 15271 + }, + { + "epoch": 0.9480414675026383, + "grad_norm": 0.2370957290518549, + "learning_rate": 8.64473435589561e-05, + "loss": 2.9428, + "step": 15272 + }, + { + "epoch": 0.9481035446023962, + "grad_norm": 0.20890195950572446, + "learning_rate": 8.644487112178632e-05, + "loss": 3.007, + "step": 15273 + }, + { + "epoch": 0.9481656217021541, + "grad_norm": 0.20696375304394182, + "learning_rate": 8.64423984944748e-05, + "loss": 3.0187, + "step": 15274 + }, + { + "epoch": 0.9482276988019119, + "grad_norm": 0.2073826955801066, + "learning_rate": 8.643992567703443e-05, + "loss": 2.9559, + "step": 15275 + }, + { + "epoch": 0.9482897759016699, + "grad_norm": 0.21813937519656224, + "learning_rate": 8.643745266947813e-05, + "loss": 2.9862, + "step": 15276 + }, + { + "epoch": 0.9483518530014278, + "grad_norm": 0.2236083821108117, + "learning_rate": 8.643497947181878e-05, + "loss": 2.8483, + "step": 15277 + }, + { + "epoch": 0.9484139301011857, + "grad_norm": 0.19977205705488355, + "learning_rate": 8.643250608406932e-05, + "loss": 2.9955, + "step": 15278 + }, + { + "epoch": 0.9484760072009436, + "grad_norm": 0.2376193954237264, + "learning_rate": 8.643003250624262e-05, + "loss": 3.058, + "step": 15279 + }, + { + "epoch": 0.9485380843007015, + "grad_norm": 0.1784794042389851, + "learning_rate": 8.64275587383516e-05, + "loss": 2.9553, + "step": 15280 + }, + { + "epoch": 0.9486001614004593, + "grad_norm": 0.19773372029075917, + "learning_rate": 8.642508478040916e-05, + "loss": 2.9276, + "step": 15281 + }, + { + "epoch": 0.9486622385002172, + "grad_norm": 0.18577317942999208, + "learning_rate": 8.64226106324282e-05, + "loss": 2.9079, + "step": 15282 + }, + { + "epoch": 0.9487243155999752, + "grad_norm": 0.17369621072033423, + "learning_rate": 8.642013629442165e-05, + "loss": 2.9248, + "step": 15283 + }, + { + "epoch": 0.9487863926997331, + "grad_norm": 0.18819325159507358, + "learning_rate": 8.64176617664024e-05, + "loss": 2.9178, + "step": 15284 + }, + { + "epoch": 0.948848469799491, + "grad_norm": 0.1652209786018935, + "learning_rate": 8.641518704838338e-05, + "loss": 2.9775, + "step": 15285 + }, + { + "epoch": 0.9489105468992489, + "grad_norm": 0.17264635221669491, + "learning_rate": 8.641271214037747e-05, + "loss": 2.9809, + "step": 15286 + }, + { + "epoch": 0.9489726239990067, + "grad_norm": 0.17989811785568444, + "learning_rate": 8.641023704239764e-05, + "loss": 2.9664, + "step": 15287 + }, + { + "epoch": 0.9490347010987646, + "grad_norm": 0.174233547788238, + "learning_rate": 8.640776175445673e-05, + "loss": 2.9147, + "step": 15288 + }, + { + "epoch": 0.9490967781985226, + "grad_norm": 0.204563649032424, + "learning_rate": 8.64052862765677e-05, + "loss": 2.9084, + "step": 15289 + }, + { + "epoch": 0.9491588552982805, + "grad_norm": 0.1829593431414794, + "learning_rate": 8.640281060874345e-05, + "loss": 3.0377, + "step": 15290 + }, + { + "epoch": 0.9492209323980384, + "grad_norm": 0.22503422072899368, + "learning_rate": 8.640033475099689e-05, + "loss": 3.0077, + "step": 15291 + }, + { + "epoch": 0.9492830094977963, + "grad_norm": 0.18438947992524465, + "learning_rate": 8.639785870334096e-05, + "loss": 3.0039, + "step": 15292 + }, + { + "epoch": 0.9493450865975541, + "grad_norm": 0.19969049399255165, + "learning_rate": 8.639538246578856e-05, + "loss": 3.0184, + "step": 15293 + }, + { + "epoch": 0.949407163697312, + "grad_norm": 0.19510513741950472, + "learning_rate": 8.639290603835262e-05, + "loss": 3.029, + "step": 15294 + }, + { + "epoch": 0.94946924079707, + "grad_norm": 0.17147062698381163, + "learning_rate": 8.639042942104605e-05, + "loss": 2.9732, + "step": 15295 + }, + { + "epoch": 0.9495313178968279, + "grad_norm": 0.16273308460897887, + "learning_rate": 8.638795261388178e-05, + "loss": 2.9591, + "step": 15296 + }, + { + "epoch": 0.9495933949965858, + "grad_norm": 0.16201109952158718, + "learning_rate": 8.638547561687273e-05, + "loss": 2.8407, + "step": 15297 + }, + { + "epoch": 0.9496554720963437, + "grad_norm": 0.16554680993950624, + "learning_rate": 8.638299843003181e-05, + "loss": 3.0217, + "step": 15298 + }, + { + "epoch": 0.9497175491961015, + "grad_norm": 0.18184801792945202, + "learning_rate": 8.638052105337198e-05, + "loss": 2.9907, + "step": 15299 + }, + { + "epoch": 0.9497796262958594, + "grad_norm": 0.19306403058779992, + "learning_rate": 8.63780434869061e-05, + "loss": 2.9215, + "step": 15300 + }, + { + "epoch": 0.9498417033956174, + "grad_norm": 0.18699716655795362, + "learning_rate": 8.637556573064717e-05, + "loss": 2.9593, + "step": 15301 + }, + { + "epoch": 0.9499037804953753, + "grad_norm": 0.1914681704266323, + "learning_rate": 8.637308778460807e-05, + "loss": 2.9198, + "step": 15302 + }, + { + "epoch": 0.9499658575951332, + "grad_norm": 0.19168436514645243, + "learning_rate": 8.637060964880174e-05, + "loss": 2.8733, + "step": 15303 + }, + { + "epoch": 0.9500279346948911, + "grad_norm": 0.17295537406699027, + "learning_rate": 8.636813132324111e-05, + "loss": 2.8899, + "step": 15304 + }, + { + "epoch": 0.9500900117946489, + "grad_norm": 0.186282046636482, + "learning_rate": 8.636565280793911e-05, + "loss": 3.0022, + "step": 15305 + }, + { + "epoch": 0.9501520888944068, + "grad_norm": 0.20934754305482664, + "learning_rate": 8.636317410290868e-05, + "loss": 3.0337, + "step": 15306 + }, + { + "epoch": 0.9502141659941648, + "grad_norm": 0.18808817005516001, + "learning_rate": 8.636069520816273e-05, + "loss": 2.906, + "step": 15307 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.17219892235478532, + "learning_rate": 8.635821612371422e-05, + "loss": 2.9426, + "step": 15308 + }, + { + "epoch": 0.9503383201936806, + "grad_norm": 0.17355121566146947, + "learning_rate": 8.635573684957605e-05, + "loss": 3.041, + "step": 15309 + }, + { + "epoch": 0.9504003972934385, + "grad_norm": 0.1761088550371433, + "learning_rate": 8.635325738576119e-05, + "loss": 2.9585, + "step": 15310 + }, + { + "epoch": 0.9504624743931963, + "grad_norm": 0.19807017902151147, + "learning_rate": 8.635077773228256e-05, + "loss": 2.9916, + "step": 15311 + }, + { + "epoch": 0.9505245514929542, + "grad_norm": 0.15758018532022147, + "learning_rate": 8.63482978891531e-05, + "loss": 2.9751, + "step": 15312 + }, + { + "epoch": 0.9505866285927121, + "grad_norm": 0.18406722720087715, + "learning_rate": 8.634581785638574e-05, + "loss": 2.9141, + "step": 15313 + }, + { + "epoch": 0.9506487056924701, + "grad_norm": 0.16362001717362828, + "learning_rate": 8.634333763399343e-05, + "loss": 2.9767, + "step": 15314 + }, + { + "epoch": 0.950710782792228, + "grad_norm": 0.15981418435936323, + "learning_rate": 8.63408572219891e-05, + "loss": 2.9911, + "step": 15315 + }, + { + "epoch": 0.9507728598919859, + "grad_norm": 0.16643450009174007, + "learning_rate": 8.63383766203857e-05, + "loss": 2.918, + "step": 15316 + }, + { + "epoch": 0.9508349369917437, + "grad_norm": 0.15473653830035192, + "learning_rate": 8.633589582919616e-05, + "loss": 3.0551, + "step": 15317 + }, + { + "epoch": 0.9508970140915016, + "grad_norm": 0.16469025855676617, + "learning_rate": 8.633341484843345e-05, + "loss": 3.0245, + "step": 15318 + }, + { + "epoch": 0.9509590911912595, + "grad_norm": 0.18611064390599108, + "learning_rate": 8.633093367811048e-05, + "loss": 2.9108, + "step": 15319 + }, + { + "epoch": 0.9510211682910175, + "grad_norm": 0.2388551866537598, + "learning_rate": 8.63284523182402e-05, + "loss": 2.8741, + "step": 15320 + }, + { + "epoch": 0.9510832453907754, + "grad_norm": 0.15874754279592876, + "learning_rate": 8.63259707688356e-05, + "loss": 3.0201, + "step": 15321 + }, + { + "epoch": 0.9511453224905333, + "grad_norm": 0.1820108168925851, + "learning_rate": 8.632348902990956e-05, + "loss": 2.975, + "step": 15322 + }, + { + "epoch": 0.9512073995902911, + "grad_norm": 0.16930470250037966, + "learning_rate": 8.632100710147506e-05, + "loss": 2.9119, + "step": 15323 + }, + { + "epoch": 0.951269476690049, + "grad_norm": 0.19552207193573384, + "learning_rate": 8.631852498354506e-05, + "loss": 3.047, + "step": 15324 + }, + { + "epoch": 0.9513315537898069, + "grad_norm": 0.2164118931306544, + "learning_rate": 8.63160426761325e-05, + "loss": 2.9847, + "step": 15325 + }, + { + "epoch": 0.9513936308895649, + "grad_norm": 0.22168014123004742, + "learning_rate": 8.631356017925032e-05, + "loss": 3.0284, + "step": 15326 + }, + { + "epoch": 0.9514557079893228, + "grad_norm": 0.19520858861491944, + "learning_rate": 8.63110774929115e-05, + "loss": 2.9962, + "step": 15327 + }, + { + "epoch": 0.9515177850890807, + "grad_norm": 0.2586105738811351, + "learning_rate": 8.630859461712895e-05, + "loss": 3.0126, + "step": 15328 + }, + { + "epoch": 0.9515798621888385, + "grad_norm": 0.18413948558831955, + "learning_rate": 8.630611155191566e-05, + "loss": 2.8388, + "step": 15329 + }, + { + "epoch": 0.9516419392885964, + "grad_norm": 0.296072980811589, + "learning_rate": 8.630362829728456e-05, + "loss": 2.9479, + "step": 15330 + }, + { + "epoch": 0.9517040163883543, + "grad_norm": 0.25393396373035665, + "learning_rate": 8.630114485324864e-05, + "loss": 2.9494, + "step": 15331 + }, + { + "epoch": 0.9517660934881123, + "grad_norm": 0.20670717894679, + "learning_rate": 8.629866121982082e-05, + "loss": 2.9429, + "step": 15332 + }, + { + "epoch": 0.9518281705878702, + "grad_norm": 0.24989678815336203, + "learning_rate": 8.629617739701407e-05, + "loss": 2.9814, + "step": 15333 + }, + { + "epoch": 0.9518902476876281, + "grad_norm": 0.20013062897544706, + "learning_rate": 8.629369338484137e-05, + "loss": 2.9866, + "step": 15334 + }, + { + "epoch": 0.9519523247873859, + "grad_norm": 0.24034272828490677, + "learning_rate": 8.629120918331564e-05, + "loss": 3.0114, + "step": 15335 + }, + { + "epoch": 0.9520144018871438, + "grad_norm": 0.1875626836742149, + "learning_rate": 8.628872479244986e-05, + "loss": 3.0261, + "step": 15336 + }, + { + "epoch": 0.9520764789869017, + "grad_norm": 0.3092472544789541, + "learning_rate": 8.628624021225702e-05, + "loss": 3.0319, + "step": 15337 + }, + { + "epoch": 0.9521385560866596, + "grad_norm": 0.2015865089843938, + "learning_rate": 8.628375544275004e-05, + "loss": 2.9973, + "step": 15338 + }, + { + "epoch": 0.9522006331864176, + "grad_norm": 0.25757068795692817, + "learning_rate": 8.62812704839419e-05, + "loss": 3.0427, + "step": 15339 + }, + { + "epoch": 0.9522627102861755, + "grad_norm": 0.18440754086033426, + "learning_rate": 8.627878533584555e-05, + "loss": 2.9824, + "step": 15340 + }, + { + "epoch": 0.9523247873859333, + "grad_norm": 0.25112425321541043, + "learning_rate": 8.627629999847399e-05, + "loss": 3.043, + "step": 15341 + }, + { + "epoch": 0.9523868644856912, + "grad_norm": 0.18561989200833767, + "learning_rate": 8.627381447184017e-05, + "loss": 2.9742, + "step": 15342 + }, + { + "epoch": 0.9524489415854491, + "grad_norm": 0.2706890394281103, + "learning_rate": 8.627132875595703e-05, + "loss": 2.9696, + "step": 15343 + }, + { + "epoch": 0.952511018685207, + "grad_norm": 0.17855284947576144, + "learning_rate": 8.626884285083758e-05, + "loss": 3.0285, + "step": 15344 + }, + { + "epoch": 0.952573095784965, + "grad_norm": 0.1891960582624731, + "learning_rate": 8.626635675649477e-05, + "loss": 2.9994, + "step": 15345 + }, + { + "epoch": 0.9526351728847229, + "grad_norm": 0.1770065938406994, + "learning_rate": 8.626387047294158e-05, + "loss": 3.0314, + "step": 15346 + }, + { + "epoch": 0.9526972499844807, + "grad_norm": 0.4154099366909686, + "learning_rate": 8.626138400019096e-05, + "loss": 2.9741, + "step": 15347 + }, + { + "epoch": 0.9527593270842386, + "grad_norm": 0.19091864193068359, + "learning_rate": 8.625889733825589e-05, + "loss": 3.0357, + "step": 15348 + }, + { + "epoch": 0.9528214041839965, + "grad_norm": 0.23856119221035782, + "learning_rate": 8.625641048714938e-05, + "loss": 3.0154, + "step": 15349 + }, + { + "epoch": 0.9528834812837544, + "grad_norm": 0.3289332652833227, + "learning_rate": 8.625392344688434e-05, + "loss": 2.8986, + "step": 15350 + }, + { + "epoch": 0.9529455583835124, + "grad_norm": 0.2189885354573969, + "learning_rate": 8.62514362174738e-05, + "loss": 2.9118, + "step": 15351 + }, + { + "epoch": 0.9530076354832703, + "grad_norm": 0.1917266013846955, + "learning_rate": 8.624894879893073e-05, + "loss": 3.0514, + "step": 15352 + }, + { + "epoch": 0.9530697125830281, + "grad_norm": 0.20631035766887573, + "learning_rate": 8.624646119126806e-05, + "loss": 2.9609, + "step": 15353 + }, + { + "epoch": 0.953131789682786, + "grad_norm": 0.23917410807103825, + "learning_rate": 8.624397339449883e-05, + "loss": 3.0971, + "step": 15354 + }, + { + "epoch": 0.9531938667825439, + "grad_norm": 0.19518409322696537, + "learning_rate": 8.624148540863599e-05, + "loss": 2.9382, + "step": 15355 + }, + { + "epoch": 0.9532559438823018, + "grad_norm": 0.19457540662032666, + "learning_rate": 8.623899723369252e-05, + "loss": 2.9108, + "step": 15356 + }, + { + "epoch": 0.9533180209820598, + "grad_norm": 0.1822216830566262, + "learning_rate": 8.62365088696814e-05, + "loss": 2.9318, + "step": 15357 + }, + { + "epoch": 0.9533800980818177, + "grad_norm": 0.19136713562509364, + "learning_rate": 8.623402031661563e-05, + "loss": 2.9272, + "step": 15358 + }, + { + "epoch": 0.9534421751815755, + "grad_norm": 0.19221653990264928, + "learning_rate": 8.623153157450816e-05, + "loss": 2.9625, + "step": 15359 + }, + { + "epoch": 0.9535042522813334, + "grad_norm": 0.15368699004789485, + "learning_rate": 8.6229042643372e-05, + "loss": 2.9279, + "step": 15360 + }, + { + "epoch": 0.9535663293810913, + "grad_norm": 0.23091645317803128, + "learning_rate": 8.622655352322014e-05, + "loss": 3.0016, + "step": 15361 + }, + { + "epoch": 0.9536284064808492, + "grad_norm": 0.16603079333853749, + "learning_rate": 8.622406421406555e-05, + "loss": 2.9771, + "step": 15362 + }, + { + "epoch": 0.9536904835806072, + "grad_norm": 0.17423941677832278, + "learning_rate": 8.622157471592122e-05, + "loss": 2.9705, + "step": 15363 + }, + { + "epoch": 0.953752560680365, + "grad_norm": 0.16365422242412597, + "learning_rate": 8.621908502880014e-05, + "loss": 2.9699, + "step": 15364 + }, + { + "epoch": 0.9538146377801229, + "grad_norm": 0.14746660259556318, + "learning_rate": 8.62165951527153e-05, + "loss": 2.9246, + "step": 15365 + }, + { + "epoch": 0.9538767148798808, + "grad_norm": 0.2031698819239972, + "learning_rate": 8.62141050876797e-05, + "loss": 2.8978, + "step": 15366 + }, + { + "epoch": 0.9539387919796387, + "grad_norm": 0.1631108738914135, + "learning_rate": 8.621161483370632e-05, + "loss": 3.0135, + "step": 15367 + }, + { + "epoch": 0.9540008690793966, + "grad_norm": 0.18479761125026506, + "learning_rate": 8.620912439080815e-05, + "loss": 2.973, + "step": 15368 + }, + { + "epoch": 0.9540629461791545, + "grad_norm": 0.1893595336509233, + "learning_rate": 8.62066337589982e-05, + "loss": 2.933, + "step": 15369 + }, + { + "epoch": 0.9541250232789124, + "grad_norm": 0.16020921668671873, + "learning_rate": 8.620414293828944e-05, + "loss": 2.8981, + "step": 15370 + }, + { + "epoch": 0.9541871003786703, + "grad_norm": 0.21630896402503844, + "learning_rate": 8.620165192869488e-05, + "loss": 3.022, + "step": 15371 + }, + { + "epoch": 0.9542491774784282, + "grad_norm": 0.1912344239162629, + "learning_rate": 8.619916073022751e-05, + "loss": 3.0004, + "step": 15372 + }, + { + "epoch": 0.9543112545781861, + "grad_norm": 0.19300639561386101, + "learning_rate": 8.619666934290033e-05, + "loss": 2.9024, + "step": 15373 + }, + { + "epoch": 0.954373331677944, + "grad_norm": 0.1795346708942915, + "learning_rate": 8.619417776672634e-05, + "loss": 2.954, + "step": 15374 + }, + { + "epoch": 0.9544354087777019, + "grad_norm": 0.19866117351840168, + "learning_rate": 8.619168600171854e-05, + "loss": 2.9682, + "step": 15375 + }, + { + "epoch": 0.9544974858774598, + "grad_norm": 0.16509726023042667, + "learning_rate": 8.618919404788994e-05, + "loss": 3.0031, + "step": 15376 + }, + { + "epoch": 0.9545595629772177, + "grad_norm": 0.17058529639030795, + "learning_rate": 8.618670190525352e-05, + "loss": 2.9595, + "step": 15377 + }, + { + "epoch": 0.9546216400769756, + "grad_norm": 0.18910258593299065, + "learning_rate": 8.618420957382229e-05, + "loss": 2.9711, + "step": 15378 + }, + { + "epoch": 0.9546837171767335, + "grad_norm": 0.16711553401752302, + "learning_rate": 8.618171705360925e-05, + "loss": 2.9475, + "step": 15379 + }, + { + "epoch": 0.9547457942764914, + "grad_norm": 0.2017676964967145, + "learning_rate": 8.617922434462742e-05, + "loss": 2.9623, + "step": 15380 + }, + { + "epoch": 0.9548078713762493, + "grad_norm": 0.22044639509212163, + "learning_rate": 8.617673144688977e-05, + "loss": 2.8587, + "step": 15381 + }, + { + "epoch": 0.9548699484760071, + "grad_norm": 0.3017317998346668, + "learning_rate": 8.617423836040936e-05, + "loss": 3.0444, + "step": 15382 + }, + { + "epoch": 0.9549320255757651, + "grad_norm": 0.22982299175439025, + "learning_rate": 8.617174508519916e-05, + "loss": 3.0562, + "step": 15383 + }, + { + "epoch": 0.954994102675523, + "grad_norm": 0.20953377141153712, + "learning_rate": 8.616925162127218e-05, + "loss": 3.0598, + "step": 15384 + }, + { + "epoch": 0.9550561797752809, + "grad_norm": 0.26338529200692695, + "learning_rate": 8.616675796864143e-05, + "loss": 3.0386, + "step": 15385 + }, + { + "epoch": 0.9551182568750388, + "grad_norm": 0.2723616005348824, + "learning_rate": 8.616426412731992e-05, + "loss": 3.0376, + "step": 15386 + }, + { + "epoch": 0.9551803339747967, + "grad_norm": 0.1677024971102461, + "learning_rate": 8.616177009732067e-05, + "loss": 2.9029, + "step": 15387 + }, + { + "epoch": 0.9552424110745545, + "grad_norm": 0.3138074228088372, + "learning_rate": 8.61592758786567e-05, + "loss": 3.0075, + "step": 15388 + }, + { + "epoch": 0.9553044881743125, + "grad_norm": 0.18653124648119898, + "learning_rate": 8.6156781471341e-05, + "loss": 3.0157, + "step": 15389 + }, + { + "epoch": 0.9553665652740704, + "grad_norm": 0.18088065901684425, + "learning_rate": 8.61542868753866e-05, + "loss": 2.951, + "step": 15390 + }, + { + "epoch": 0.9554286423738283, + "grad_norm": 0.20723749180483178, + "learning_rate": 8.615179209080649e-05, + "loss": 3.0267, + "step": 15391 + }, + { + "epoch": 0.9554907194735862, + "grad_norm": 0.23315035831659897, + "learning_rate": 8.614929711761372e-05, + "loss": 2.9451, + "step": 15392 + }, + { + "epoch": 0.9555527965733441, + "grad_norm": 0.17255910280732226, + "learning_rate": 8.614680195582127e-05, + "loss": 3.0733, + "step": 15393 + }, + { + "epoch": 0.9556148736731019, + "grad_norm": 0.1993645620603341, + "learning_rate": 8.61443066054422e-05, + "loss": 2.8106, + "step": 15394 + }, + { + "epoch": 0.9556769507728599, + "grad_norm": 0.18520499404676014, + "learning_rate": 8.614181106648951e-05, + "loss": 2.93, + "step": 15395 + }, + { + "epoch": 0.9557390278726178, + "grad_norm": 0.3081284890741422, + "learning_rate": 8.613931533897621e-05, + "loss": 3.0131, + "step": 15396 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.2551402695388369, + "learning_rate": 8.613681942291533e-05, + "loss": 3.016, + "step": 15397 + }, + { + "epoch": 0.9558631820721336, + "grad_norm": 0.19713098822540903, + "learning_rate": 8.61343233183199e-05, + "loss": 2.9564, + "step": 15398 + }, + { + "epoch": 0.9559252591718915, + "grad_norm": 0.19215755539502036, + "learning_rate": 8.613182702520292e-05, + "loss": 3.002, + "step": 15399 + }, + { + "epoch": 0.9559873362716493, + "grad_norm": 0.1882531319195584, + "learning_rate": 8.612933054357744e-05, + "loss": 3.0312, + "step": 15400 + }, + { + "epoch": 0.9560494133714073, + "grad_norm": 0.2006221593673536, + "learning_rate": 8.612683387345646e-05, + "loss": 2.9901, + "step": 15401 + }, + { + "epoch": 0.9561114904711652, + "grad_norm": 0.21177705827549706, + "learning_rate": 8.612433701485301e-05, + "loss": 3.0404, + "step": 15402 + }, + { + "epoch": 0.9561735675709231, + "grad_norm": 0.20193998674049893, + "learning_rate": 8.612183996778014e-05, + "loss": 2.9775, + "step": 15403 + }, + { + "epoch": 0.956235644670681, + "grad_norm": 0.21090076267905813, + "learning_rate": 8.611934273225086e-05, + "loss": 2.9815, + "step": 15404 + }, + { + "epoch": 0.9562977217704389, + "grad_norm": 0.20732367754024206, + "learning_rate": 8.611684530827821e-05, + "loss": 2.9593, + "step": 15405 + }, + { + "epoch": 0.9563597988701967, + "grad_norm": 0.22100800580308178, + "learning_rate": 8.611434769587518e-05, + "loss": 2.9913, + "step": 15406 + }, + { + "epoch": 0.9564218759699546, + "grad_norm": 0.1817272143963348, + "learning_rate": 8.611184989505486e-05, + "loss": 2.9594, + "step": 15407 + }, + { + "epoch": 0.9564839530697126, + "grad_norm": 0.20062499107230097, + "learning_rate": 8.610935190583025e-05, + "loss": 2.8315, + "step": 15408 + }, + { + "epoch": 0.9565460301694705, + "grad_norm": 0.18642726635828466, + "learning_rate": 8.610685372821436e-05, + "loss": 2.8677, + "step": 15409 + }, + { + "epoch": 0.9566081072692284, + "grad_norm": 0.27771128895201014, + "learning_rate": 8.610435536222028e-05, + "loss": 2.9207, + "step": 15410 + }, + { + "epoch": 0.9566701843689863, + "grad_norm": 0.22876487897208428, + "learning_rate": 8.6101856807861e-05, + "loss": 3.0432, + "step": 15411 + }, + { + "epoch": 0.9567322614687441, + "grad_norm": 0.18695060388735854, + "learning_rate": 8.609935806514957e-05, + "loss": 3.1116, + "step": 15412 + }, + { + "epoch": 0.956794338568502, + "grad_norm": 0.17407931705440266, + "learning_rate": 8.609685913409901e-05, + "loss": 2.9651, + "step": 15413 + }, + { + "epoch": 0.95685641566826, + "grad_norm": 0.19273042015282743, + "learning_rate": 8.60943600147224e-05, + "loss": 2.9941, + "step": 15414 + }, + { + "epoch": 0.9569184927680179, + "grad_norm": 0.17425683015924015, + "learning_rate": 8.609186070703275e-05, + "loss": 2.8778, + "step": 15415 + }, + { + "epoch": 0.9569805698677758, + "grad_norm": 0.2935073352421862, + "learning_rate": 8.608936121104307e-05, + "loss": 2.9848, + "step": 15416 + }, + { + "epoch": 0.9570426469675337, + "grad_norm": 0.2034630255353074, + "learning_rate": 8.608686152676646e-05, + "loss": 3.0255, + "step": 15417 + }, + { + "epoch": 0.9571047240672915, + "grad_norm": 0.2135683698162888, + "learning_rate": 8.608436165421592e-05, + "loss": 2.9717, + "step": 15418 + }, + { + "epoch": 0.9571668011670494, + "grad_norm": 0.18873321768065324, + "learning_rate": 8.608186159340452e-05, + "loss": 2.9678, + "step": 15419 + }, + { + "epoch": 0.9572288782668074, + "grad_norm": 0.20054098849542196, + "learning_rate": 8.607936134434527e-05, + "loss": 3.0098, + "step": 15420 + }, + { + "epoch": 0.9572909553665653, + "grad_norm": 0.2299082940433216, + "learning_rate": 8.607686090705125e-05, + "loss": 2.9503, + "step": 15421 + }, + { + "epoch": 0.9573530324663232, + "grad_norm": 0.20843673861763456, + "learning_rate": 8.607436028153548e-05, + "loss": 2.9851, + "step": 15422 + }, + { + "epoch": 0.9574151095660811, + "grad_norm": 0.19255325590118352, + "learning_rate": 8.607185946781102e-05, + "loss": 3.0385, + "step": 15423 + }, + { + "epoch": 0.9574771866658389, + "grad_norm": 0.19343815749492663, + "learning_rate": 8.60693584658909e-05, + "loss": 2.9951, + "step": 15424 + }, + { + "epoch": 0.9575392637655968, + "grad_norm": 0.200180555590595, + "learning_rate": 8.60668572757882e-05, + "loss": 2.9978, + "step": 15425 + }, + { + "epoch": 0.9576013408653548, + "grad_norm": 0.1894088280399598, + "learning_rate": 8.606435589751593e-05, + "loss": 2.9255, + "step": 15426 + }, + { + "epoch": 0.9576634179651127, + "grad_norm": 0.2160748754024312, + "learning_rate": 8.606185433108717e-05, + "loss": 2.8978, + "step": 15427 + }, + { + "epoch": 0.9577254950648706, + "grad_norm": 0.25863288629343845, + "learning_rate": 8.605935257651496e-05, + "loss": 3.0332, + "step": 15428 + }, + { + "epoch": 0.9577875721646285, + "grad_norm": 0.18896007285391658, + "learning_rate": 8.605685063381236e-05, + "loss": 2.9388, + "step": 15429 + }, + { + "epoch": 0.9578496492643863, + "grad_norm": 0.17805669713096578, + "learning_rate": 8.605434850299241e-05, + "loss": 2.9876, + "step": 15430 + }, + { + "epoch": 0.9579117263641442, + "grad_norm": 0.17092848922265538, + "learning_rate": 8.605184618406819e-05, + "loss": 3.0006, + "step": 15431 + }, + { + "epoch": 0.9579738034639022, + "grad_norm": 0.20289554841153135, + "learning_rate": 8.604934367705271e-05, + "loss": 2.9405, + "step": 15432 + }, + { + "epoch": 0.9580358805636601, + "grad_norm": 0.1901813746571301, + "learning_rate": 8.604684098195905e-05, + "loss": 2.9519, + "step": 15433 + }, + { + "epoch": 0.958097957663418, + "grad_norm": 0.1861874715280655, + "learning_rate": 8.604433809880029e-05, + "loss": 3.0612, + "step": 15434 + }, + { + "epoch": 0.9581600347631759, + "grad_norm": 0.20823453805784978, + "learning_rate": 8.604183502758947e-05, + "loss": 2.929, + "step": 15435 + }, + { + "epoch": 0.9582221118629337, + "grad_norm": 0.3113466915400198, + "learning_rate": 8.603933176833963e-05, + "loss": 2.9538, + "step": 15436 + }, + { + "epoch": 0.9582841889626916, + "grad_norm": 0.2477395714750454, + "learning_rate": 8.603682832106385e-05, + "loss": 2.8115, + "step": 15437 + }, + { + "epoch": 0.9583462660624495, + "grad_norm": 0.2586608425127648, + "learning_rate": 8.60343246857752e-05, + "loss": 2.9909, + "step": 15438 + }, + { + "epoch": 0.9584083431622075, + "grad_norm": 0.21322420216644905, + "learning_rate": 8.603182086248674e-05, + "loss": 3.0034, + "step": 15439 + }, + { + "epoch": 0.9584704202619654, + "grad_norm": 0.21215027507898557, + "learning_rate": 8.60293168512115e-05, + "loss": 2.9379, + "step": 15440 + }, + { + "epoch": 0.9585324973617233, + "grad_norm": 0.22395296792693056, + "learning_rate": 8.602681265196257e-05, + "loss": 2.9317, + "step": 15441 + }, + { + "epoch": 0.9585945744614811, + "grad_norm": 0.3592452768799769, + "learning_rate": 8.602430826475303e-05, + "loss": 2.9852, + "step": 15442 + }, + { + "epoch": 0.958656651561239, + "grad_norm": 0.3105301882501205, + "learning_rate": 8.602180368959592e-05, + "loss": 2.9085, + "step": 15443 + }, + { + "epoch": 0.9587187286609969, + "grad_norm": 0.2501699942554504, + "learning_rate": 8.60192989265043e-05, + "loss": 2.9869, + "step": 15444 + }, + { + "epoch": 0.9587808057607549, + "grad_norm": 0.24173482767451931, + "learning_rate": 8.601679397549127e-05, + "loss": 2.906, + "step": 15445 + }, + { + "epoch": 0.9588428828605128, + "grad_norm": 0.28078463253558766, + "learning_rate": 8.601428883656988e-05, + "loss": 3.0179, + "step": 15446 + }, + { + "epoch": 0.9589049599602707, + "grad_norm": 0.21913781019772782, + "learning_rate": 8.60117835097532e-05, + "loss": 3.079, + "step": 15447 + }, + { + "epoch": 0.9589670370600285, + "grad_norm": 0.206597476557478, + "learning_rate": 8.60092779950543e-05, + "loss": 2.9663, + "step": 15448 + }, + { + "epoch": 0.9590291141597864, + "grad_norm": 0.18151155623776513, + "learning_rate": 8.600677229248626e-05, + "loss": 2.9513, + "step": 15449 + }, + { + "epoch": 0.9590911912595443, + "grad_norm": 0.19976547430672031, + "learning_rate": 8.600426640206214e-05, + "loss": 3.0168, + "step": 15450 + }, + { + "epoch": 0.9591532683593023, + "grad_norm": 0.27619351702849904, + "learning_rate": 8.600176032379503e-05, + "loss": 2.9329, + "step": 15451 + }, + { + "epoch": 0.9592153454590602, + "grad_norm": 0.19226367766546226, + "learning_rate": 8.5999254057698e-05, + "loss": 3.0267, + "step": 15452 + }, + { + "epoch": 0.9592774225588181, + "grad_norm": 0.19915453492974486, + "learning_rate": 8.599674760378412e-05, + "loss": 2.9733, + "step": 15453 + }, + { + "epoch": 0.9593394996585759, + "grad_norm": 0.20422395203377902, + "learning_rate": 8.599424096206648e-05, + "loss": 2.9194, + "step": 15454 + }, + { + "epoch": 0.9594015767583338, + "grad_norm": 0.18733948552398008, + "learning_rate": 8.599173413255813e-05, + "loss": 2.8741, + "step": 15455 + }, + { + "epoch": 0.9594636538580917, + "grad_norm": 0.19084943824350095, + "learning_rate": 8.598922711527217e-05, + "loss": 2.9685, + "step": 15456 + }, + { + "epoch": 0.9595257309578497, + "grad_norm": 0.1805639350354775, + "learning_rate": 8.598671991022167e-05, + "loss": 2.9384, + "step": 15457 + }, + { + "epoch": 0.9595878080576076, + "grad_norm": 0.2738818509103785, + "learning_rate": 8.598421251741973e-05, + "loss": 3.0278, + "step": 15458 + }, + { + "epoch": 0.9596498851573655, + "grad_norm": 0.21425651577719693, + "learning_rate": 8.598170493687941e-05, + "loss": 2.9514, + "step": 15459 + }, + { + "epoch": 0.9597119622571233, + "grad_norm": 0.16940415646100868, + "learning_rate": 8.597919716861379e-05, + "loss": 2.9652, + "step": 15460 + }, + { + "epoch": 0.9597740393568812, + "grad_norm": 0.3042365429361992, + "learning_rate": 8.597668921263597e-05, + "loss": 3.0507, + "step": 15461 + }, + { + "epoch": 0.9598361164566391, + "grad_norm": 0.19994052397494272, + "learning_rate": 8.597418106895903e-05, + "loss": 2.9712, + "step": 15462 + }, + { + "epoch": 0.959898193556397, + "grad_norm": 0.1955813845863936, + "learning_rate": 8.597167273759607e-05, + "loss": 3.0055, + "step": 15463 + }, + { + "epoch": 0.959960270656155, + "grad_norm": 0.19735039881773944, + "learning_rate": 8.596916421856014e-05, + "loss": 2.8895, + "step": 15464 + }, + { + "epoch": 0.9600223477559129, + "grad_norm": 0.1837371712377856, + "learning_rate": 8.596665551186436e-05, + "loss": 3.0196, + "step": 15465 + }, + { + "epoch": 0.9600844248556707, + "grad_norm": 0.19325833618843186, + "learning_rate": 8.59641466175218e-05, + "loss": 2.9152, + "step": 15466 + }, + { + "epoch": 0.9601465019554286, + "grad_norm": 0.18346268459866208, + "learning_rate": 8.596163753554555e-05, + "loss": 2.9602, + "step": 15467 + }, + { + "epoch": 0.9602085790551865, + "grad_norm": 0.19489225953816713, + "learning_rate": 8.595912826594873e-05, + "loss": 3.062, + "step": 15468 + }, + { + "epoch": 0.9602706561549444, + "grad_norm": 0.19796659546713163, + "learning_rate": 8.595661880874439e-05, + "loss": 3.0037, + "step": 15469 + }, + { + "epoch": 0.9603327332547024, + "grad_norm": 0.17593670434924324, + "learning_rate": 8.595410916394565e-05, + "loss": 2.9311, + "step": 15470 + }, + { + "epoch": 0.9603948103544603, + "grad_norm": 0.17749046525061282, + "learning_rate": 8.59515993315656e-05, + "loss": 2.9618, + "step": 15471 + }, + { + "epoch": 0.9604568874542181, + "grad_norm": 0.16009132883734453, + "learning_rate": 8.594908931161732e-05, + "loss": 2.8884, + "step": 15472 + }, + { + "epoch": 0.960518964553976, + "grad_norm": 0.18256551858181885, + "learning_rate": 8.59465791041139e-05, + "loss": 2.9477, + "step": 15473 + }, + { + "epoch": 0.9605810416537339, + "grad_norm": 0.21878668783976796, + "learning_rate": 8.594406870906847e-05, + "loss": 2.9645, + "step": 15474 + }, + { + "epoch": 0.9606431187534918, + "grad_norm": 0.19597094174392624, + "learning_rate": 8.59415581264941e-05, + "loss": 2.9349, + "step": 15475 + }, + { + "epoch": 0.9607051958532498, + "grad_norm": 0.19419592217597018, + "learning_rate": 8.59390473564039e-05, + "loss": 3.0171, + "step": 15476 + }, + { + "epoch": 0.9607672729530077, + "grad_norm": 0.16228625402883845, + "learning_rate": 8.593653639881096e-05, + "loss": 3.0044, + "step": 15477 + }, + { + "epoch": 0.9608293500527655, + "grad_norm": 0.2807156466954088, + "learning_rate": 8.593402525372838e-05, + "loss": 3.0701, + "step": 15478 + }, + { + "epoch": 0.9608914271525234, + "grad_norm": 0.19017963130893525, + "learning_rate": 8.59315139211693e-05, + "loss": 3.0049, + "step": 15479 + }, + { + "epoch": 0.9609535042522813, + "grad_norm": 0.19013749226793375, + "learning_rate": 8.592900240114676e-05, + "loss": 2.9347, + "step": 15480 + }, + { + "epoch": 0.9610155813520392, + "grad_norm": 0.1975476731936933, + "learning_rate": 8.592649069367389e-05, + "loss": 2.9456, + "step": 15481 + }, + { + "epoch": 0.9610776584517972, + "grad_norm": 0.17783954149923067, + "learning_rate": 8.59239787987638e-05, + "loss": 2.9786, + "step": 15482 + }, + { + "epoch": 0.9611397355515551, + "grad_norm": 0.23019532778986024, + "learning_rate": 8.59214667164296e-05, + "loss": 3.0029, + "step": 15483 + }, + { + "epoch": 0.9612018126513129, + "grad_norm": 0.18765060019266253, + "learning_rate": 8.591895444668438e-05, + "loss": 2.9768, + "step": 15484 + }, + { + "epoch": 0.9612638897510708, + "grad_norm": 0.2944962923467431, + "learning_rate": 8.591644198954126e-05, + "loss": 3.0151, + "step": 15485 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 0.1911182864883088, + "learning_rate": 8.591392934501334e-05, + "loss": 2.9328, + "step": 15486 + }, + { + "epoch": 0.9613880439505866, + "grad_norm": 0.19116137858868637, + "learning_rate": 8.591141651311374e-05, + "loss": 2.9433, + "step": 15487 + }, + { + "epoch": 0.9614501210503446, + "grad_norm": 0.262936704545102, + "learning_rate": 8.590890349385557e-05, + "loss": 3.0148, + "step": 15488 + }, + { + "epoch": 0.9615121981501025, + "grad_norm": 0.23466093582237954, + "learning_rate": 8.590639028725191e-05, + "loss": 2.9417, + "step": 15489 + }, + { + "epoch": 0.9615742752498603, + "grad_norm": 0.18278662832704856, + "learning_rate": 8.590387689331589e-05, + "loss": 2.9715, + "step": 15490 + }, + { + "epoch": 0.9616363523496182, + "grad_norm": 0.20763537595499681, + "learning_rate": 8.590136331206064e-05, + "loss": 3.0239, + "step": 15491 + }, + { + "epoch": 0.9616984294493761, + "grad_norm": 0.18517600690483713, + "learning_rate": 8.589884954349927e-05, + "loss": 2.9588, + "step": 15492 + }, + { + "epoch": 0.961760506549134, + "grad_norm": 0.19052141376849802, + "learning_rate": 8.589633558764489e-05, + "loss": 2.9536, + "step": 15493 + }, + { + "epoch": 0.961822583648892, + "grad_norm": 0.2135973582489708, + "learning_rate": 8.589382144451061e-05, + "loss": 2.9163, + "step": 15494 + }, + { + "epoch": 0.9618846607486499, + "grad_norm": 0.21177394471241645, + "learning_rate": 8.589130711410956e-05, + "loss": 2.9914, + "step": 15495 + }, + { + "epoch": 0.9619467378484077, + "grad_norm": 0.22060058541062444, + "learning_rate": 8.588879259645482e-05, + "loss": 2.985, + "step": 15496 + }, + { + "epoch": 0.9620088149481656, + "grad_norm": 0.19870184239966981, + "learning_rate": 8.588627789155955e-05, + "loss": 2.9845, + "step": 15497 + }, + { + "epoch": 0.9620708920479235, + "grad_norm": 0.21736367339325843, + "learning_rate": 8.588376299943686e-05, + "loss": 2.9491, + "step": 15498 + }, + { + "epoch": 0.9621329691476814, + "grad_norm": 0.23940439988658666, + "learning_rate": 8.588124792009985e-05, + "loss": 2.9913, + "step": 15499 + }, + { + "epoch": 0.9621950462474393, + "grad_norm": 0.22524787706227828, + "learning_rate": 8.587873265356167e-05, + "loss": 2.9323, + "step": 15500 + }, + { + "epoch": 0.9622571233471973, + "grad_norm": 0.1947843372186608, + "learning_rate": 8.587621719983546e-05, + "loss": 2.9076, + "step": 15501 + }, + { + "epoch": 0.9623192004469551, + "grad_norm": 0.18081063176049308, + "learning_rate": 8.587370155893429e-05, + "loss": 2.9543, + "step": 15502 + }, + { + "epoch": 0.962381277546713, + "grad_norm": 0.17642455535668852, + "learning_rate": 8.587118573087131e-05, + "loss": 2.9848, + "step": 15503 + }, + { + "epoch": 0.9624433546464709, + "grad_norm": 0.19023913117089242, + "learning_rate": 8.586866971565965e-05, + "loss": 2.9645, + "step": 15504 + }, + { + "epoch": 0.9625054317462288, + "grad_norm": 0.230560591550453, + "learning_rate": 8.586615351331242e-05, + "loss": 2.9581, + "step": 15505 + }, + { + "epoch": 0.9625675088459867, + "grad_norm": 0.172368624985289, + "learning_rate": 8.586363712384277e-05, + "loss": 3.0641, + "step": 15506 + }, + { + "epoch": 0.9626295859457447, + "grad_norm": 0.1977873032237715, + "learning_rate": 8.586112054726382e-05, + "loss": 2.9762, + "step": 15507 + }, + { + "epoch": 0.9626916630455025, + "grad_norm": 0.1804567126598643, + "learning_rate": 8.58586037835887e-05, + "loss": 2.9727, + "step": 15508 + }, + { + "epoch": 0.9627537401452604, + "grad_norm": 0.1560203473881194, + "learning_rate": 8.585608683283053e-05, + "loss": 2.9554, + "step": 15509 + }, + { + "epoch": 0.9628158172450183, + "grad_norm": 0.17240998065058924, + "learning_rate": 8.585356969500247e-05, + "loss": 2.9696, + "step": 15510 + }, + { + "epoch": 0.9628778943447762, + "grad_norm": 0.19397308750013617, + "learning_rate": 8.585105237011762e-05, + "loss": 2.8923, + "step": 15511 + }, + { + "epoch": 0.9629399714445341, + "grad_norm": 0.23957093438254848, + "learning_rate": 8.584853485818912e-05, + "loss": 2.8984, + "step": 15512 + }, + { + "epoch": 0.963002048544292, + "grad_norm": 0.1842951767484615, + "learning_rate": 8.584601715923012e-05, + "loss": 2.9704, + "step": 15513 + }, + { + "epoch": 0.9630641256440499, + "grad_norm": 0.18438050940735892, + "learning_rate": 8.584349927325375e-05, + "loss": 3.0337, + "step": 15514 + }, + { + "epoch": 0.9631262027438078, + "grad_norm": 0.20558225339574437, + "learning_rate": 8.584098120027314e-05, + "loss": 2.9433, + "step": 15515 + }, + { + "epoch": 0.9631882798435657, + "grad_norm": 0.21042556806816007, + "learning_rate": 8.583846294030142e-05, + "loss": 2.9226, + "step": 15516 + }, + { + "epoch": 0.9632503569433236, + "grad_norm": 0.2148215802631787, + "learning_rate": 8.583594449335175e-05, + "loss": 3.0316, + "step": 15517 + }, + { + "epoch": 0.9633124340430815, + "grad_norm": 0.1592124834970683, + "learning_rate": 8.583342585943725e-05, + "loss": 3.0219, + "step": 15518 + }, + { + "epoch": 0.9633745111428395, + "grad_norm": 0.1965792678964731, + "learning_rate": 8.583090703857108e-05, + "loss": 2.9541, + "step": 15519 + }, + { + "epoch": 0.9634365882425973, + "grad_norm": 0.19263053088135168, + "learning_rate": 8.582838803076636e-05, + "loss": 2.946, + "step": 15520 + }, + { + "epoch": 0.9634986653423552, + "grad_norm": 0.1937377964298829, + "learning_rate": 8.582586883603625e-05, + "loss": 3.0746, + "step": 15521 + }, + { + "epoch": 0.9635607424421131, + "grad_norm": 0.1558394342826592, + "learning_rate": 8.582334945439387e-05, + "loss": 2.8324, + "step": 15522 + }, + { + "epoch": 0.963622819541871, + "grad_norm": 0.20475689388728277, + "learning_rate": 8.58208298858524e-05, + "loss": 2.9043, + "step": 15523 + }, + { + "epoch": 0.9636848966416289, + "grad_norm": 0.19385551609660817, + "learning_rate": 8.581831013042495e-05, + "loss": 2.9932, + "step": 15524 + }, + { + "epoch": 0.9637469737413868, + "grad_norm": 0.169136116020591, + "learning_rate": 8.58157901881247e-05, + "loss": 3.001, + "step": 15525 + }, + { + "epoch": 0.9638090508411447, + "grad_norm": 0.17186129580396395, + "learning_rate": 8.581327005896475e-05, + "loss": 3.0528, + "step": 15526 + }, + { + "epoch": 0.9638711279409026, + "grad_norm": 0.19860860476398126, + "learning_rate": 8.581074974295829e-05, + "loss": 3.0481, + "step": 15527 + }, + { + "epoch": 0.9639332050406605, + "grad_norm": 0.18964341036167073, + "learning_rate": 8.580822924011844e-05, + "loss": 3.0935, + "step": 15528 + }, + { + "epoch": 0.9639952821404184, + "grad_norm": 0.22802893660517926, + "learning_rate": 8.58057085504584e-05, + "loss": 2.93, + "step": 15529 + }, + { + "epoch": 0.9640573592401763, + "grad_norm": 0.16834427197150428, + "learning_rate": 8.580318767399126e-05, + "loss": 2.9804, + "step": 15530 + }, + { + "epoch": 0.9641194363399342, + "grad_norm": 0.18264085952818138, + "learning_rate": 8.580066661073021e-05, + "loss": 2.9998, + "step": 15531 + }, + { + "epoch": 0.964181513439692, + "grad_norm": 0.17154272429301048, + "learning_rate": 8.579814536068838e-05, + "loss": 2.9113, + "step": 15532 + }, + { + "epoch": 0.96424359053945, + "grad_norm": 0.15751863681565287, + "learning_rate": 8.579562392387894e-05, + "loss": 2.8656, + "step": 15533 + }, + { + "epoch": 0.9643056676392079, + "grad_norm": 0.17674566195545233, + "learning_rate": 8.579310230031504e-05, + "loss": 3.0414, + "step": 15534 + }, + { + "epoch": 0.9643677447389658, + "grad_norm": 0.16772241604854848, + "learning_rate": 8.579058049000984e-05, + "loss": 3.0476, + "step": 15535 + }, + { + "epoch": 0.9644298218387237, + "grad_norm": 0.19381918325713743, + "learning_rate": 8.578805849297648e-05, + "loss": 2.9307, + "step": 15536 + }, + { + "epoch": 0.9644918989384816, + "grad_norm": 0.1976549065585713, + "learning_rate": 8.578553630922815e-05, + "loss": 2.8991, + "step": 15537 + }, + { + "epoch": 0.9645539760382394, + "grad_norm": 0.17879178870841544, + "learning_rate": 8.578301393877799e-05, + "loss": 2.9945, + "step": 15538 + }, + { + "epoch": 0.9646160531379974, + "grad_norm": 0.1833491928563168, + "learning_rate": 8.578049138163915e-05, + "loss": 2.9617, + "step": 15539 + }, + { + "epoch": 0.9646781302377553, + "grad_norm": 0.19574150653234212, + "learning_rate": 8.57779686378248e-05, + "loss": 2.9626, + "step": 15540 + }, + { + "epoch": 0.9647402073375132, + "grad_norm": 0.16592133031968176, + "learning_rate": 8.57754457073481e-05, + "loss": 3.0183, + "step": 15541 + }, + { + "epoch": 0.9648022844372711, + "grad_norm": 0.18678872979760053, + "learning_rate": 8.577292259022223e-05, + "loss": 2.8991, + "step": 15542 + }, + { + "epoch": 0.964864361537029, + "grad_norm": 0.16606952689769647, + "learning_rate": 8.577039928646033e-05, + "loss": 2.8992, + "step": 15543 + }, + { + "epoch": 0.9649264386367868, + "grad_norm": 0.2905359090616329, + "learning_rate": 8.576787579607557e-05, + "loss": 2.8746, + "step": 15544 + }, + { + "epoch": 0.9649885157365448, + "grad_norm": 0.23752059847197743, + "learning_rate": 8.576535211908112e-05, + "loss": 2.948, + "step": 15545 + }, + { + "epoch": 0.9650505928363027, + "grad_norm": 0.17757850141774875, + "learning_rate": 8.576282825549014e-05, + "loss": 2.9086, + "step": 15546 + }, + { + "epoch": 0.9651126699360606, + "grad_norm": 0.20730474688764705, + "learning_rate": 8.576030420531581e-05, + "loss": 2.8891, + "step": 15547 + }, + { + "epoch": 0.9651747470358185, + "grad_norm": 0.257289471533262, + "learning_rate": 8.575777996857128e-05, + "loss": 2.9413, + "step": 15548 + }, + { + "epoch": 0.9652368241355764, + "grad_norm": 0.1869005315939424, + "learning_rate": 8.575525554526975e-05, + "loss": 2.8999, + "step": 15549 + }, + { + "epoch": 0.9652989012353342, + "grad_norm": 0.19481267870997926, + "learning_rate": 8.575273093542437e-05, + "loss": 2.9133, + "step": 15550 + }, + { + "epoch": 0.9653609783350922, + "grad_norm": 0.15850868269118232, + "learning_rate": 8.57502061390483e-05, + "loss": 2.9964, + "step": 15551 + }, + { + "epoch": 0.9654230554348501, + "grad_norm": 0.1775761937927146, + "learning_rate": 8.574768115615475e-05, + "loss": 2.9328, + "step": 15552 + }, + { + "epoch": 0.965485132534608, + "grad_norm": 0.24024440775445083, + "learning_rate": 8.574515598675684e-05, + "loss": 3.0114, + "step": 15553 + }, + { + "epoch": 0.9655472096343659, + "grad_norm": 0.17750803190514683, + "learning_rate": 8.574263063086778e-05, + "loss": 2.9115, + "step": 15554 + }, + { + "epoch": 0.9656092867341238, + "grad_norm": 0.1731169005678709, + "learning_rate": 8.574010508850076e-05, + "loss": 3.0063, + "step": 15555 + }, + { + "epoch": 0.9656713638338816, + "grad_norm": 0.23231848653141243, + "learning_rate": 8.573757935966892e-05, + "loss": 2.8744, + "step": 15556 + }, + { + "epoch": 0.9657334409336396, + "grad_norm": 0.18973801026768575, + "learning_rate": 8.573505344438546e-05, + "loss": 2.8765, + "step": 15557 + }, + { + "epoch": 0.9657955180333975, + "grad_norm": 0.17330684616429032, + "learning_rate": 8.573252734266354e-05, + "loss": 2.9004, + "step": 15558 + }, + { + "epoch": 0.9658575951331554, + "grad_norm": 0.1712797706110692, + "learning_rate": 8.573000105451636e-05, + "loss": 2.9074, + "step": 15559 + }, + { + "epoch": 0.9659196722329133, + "grad_norm": 0.22072859423030444, + "learning_rate": 8.572747457995708e-05, + "loss": 2.9339, + "step": 15560 + }, + { + "epoch": 0.9659817493326712, + "grad_norm": 0.15675720504654453, + "learning_rate": 8.57249479189989e-05, + "loss": 2.9991, + "step": 15561 + }, + { + "epoch": 0.966043826432429, + "grad_norm": 0.16571404609499893, + "learning_rate": 8.5722421071655e-05, + "loss": 3.0354, + "step": 15562 + }, + { + "epoch": 0.966105903532187, + "grad_norm": 0.23834544239566186, + "learning_rate": 8.571989403793854e-05, + "loss": 3.0071, + "step": 15563 + }, + { + "epoch": 0.9661679806319449, + "grad_norm": 0.1698650890042293, + "learning_rate": 8.571736681786273e-05, + "loss": 3.0071, + "step": 15564 + }, + { + "epoch": 0.9662300577317028, + "grad_norm": 0.20976790492632008, + "learning_rate": 8.571483941144076e-05, + "loss": 2.9944, + "step": 15565 + }, + { + "epoch": 0.9662921348314607, + "grad_norm": 0.15960792555783404, + "learning_rate": 8.571231181868579e-05, + "loss": 2.9645, + "step": 15566 + }, + { + "epoch": 0.9663542119312186, + "grad_norm": 0.25470155723954674, + "learning_rate": 8.570978403961101e-05, + "loss": 2.8563, + "step": 15567 + }, + { + "epoch": 0.9664162890309764, + "grad_norm": 0.17124931826943074, + "learning_rate": 8.570725607422964e-05, + "loss": 2.8812, + "step": 15568 + }, + { + "epoch": 0.9664783661307343, + "grad_norm": 0.180322964313392, + "learning_rate": 8.570472792255484e-05, + "loss": 2.9688, + "step": 15569 + }, + { + "epoch": 0.9665404432304923, + "grad_norm": 0.20864163574177333, + "learning_rate": 8.57021995845998e-05, + "loss": 3.013, + "step": 15570 + }, + { + "epoch": 0.9666025203302502, + "grad_norm": 0.19773570789383782, + "learning_rate": 8.569967106037771e-05, + "loss": 2.9162, + "step": 15571 + }, + { + "epoch": 0.9666645974300081, + "grad_norm": 0.19697200527282174, + "learning_rate": 8.569714234990178e-05, + "loss": 2.9466, + "step": 15572 + }, + { + "epoch": 0.966726674529766, + "grad_norm": 0.1997700897123658, + "learning_rate": 8.569461345318519e-05, + "loss": 2.9422, + "step": 15573 + }, + { + "epoch": 0.9667887516295238, + "grad_norm": 0.1622950111470171, + "learning_rate": 8.569208437024115e-05, + "loss": 2.8934, + "step": 15574 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.17227490999107076, + "learning_rate": 8.568955510108283e-05, + "loss": 2.8809, + "step": 15575 + }, + { + "epoch": 0.9669129058290397, + "grad_norm": 0.17061669149535877, + "learning_rate": 8.568702564572344e-05, + "loss": 2.9252, + "step": 15576 + }, + { + "epoch": 0.9669749829287976, + "grad_norm": 0.22926596184586648, + "learning_rate": 8.568449600417617e-05, + "loss": 2.9271, + "step": 15577 + }, + { + "epoch": 0.9670370600285555, + "grad_norm": 0.16895590732406926, + "learning_rate": 8.568196617645423e-05, + "loss": 2.8384, + "step": 15578 + }, + { + "epoch": 0.9670991371283134, + "grad_norm": 0.2127581893138177, + "learning_rate": 8.567943616257079e-05, + "loss": 2.997, + "step": 15579 + }, + { + "epoch": 0.9671612142280712, + "grad_norm": 0.1843699725322317, + "learning_rate": 8.567690596253908e-05, + "loss": 2.9774, + "step": 15580 + }, + { + "epoch": 0.9672232913278291, + "grad_norm": 0.16803921286132703, + "learning_rate": 8.56743755763723e-05, + "loss": 2.9615, + "step": 15581 + }, + { + "epoch": 0.9672853684275871, + "grad_norm": 0.17152530142688552, + "learning_rate": 8.567184500408363e-05, + "loss": 3.024, + "step": 15582 + }, + { + "epoch": 0.967347445527345, + "grad_norm": 0.1886422609896383, + "learning_rate": 8.56693142456863e-05, + "loss": 2.9849, + "step": 15583 + }, + { + "epoch": 0.9674095226271029, + "grad_norm": 0.16018569489532516, + "learning_rate": 8.566678330119348e-05, + "loss": 2.9596, + "step": 15584 + }, + { + "epoch": 0.9674715997268608, + "grad_norm": 0.1652335877095398, + "learning_rate": 8.566425217061841e-05, + "loss": 2.8721, + "step": 15585 + }, + { + "epoch": 0.9675336768266186, + "grad_norm": 0.15478779161778966, + "learning_rate": 8.566172085397427e-05, + "loss": 2.9898, + "step": 15586 + }, + { + "epoch": 0.9675957539263765, + "grad_norm": 0.15411024637255383, + "learning_rate": 8.565918935127429e-05, + "loss": 2.9301, + "step": 15587 + }, + { + "epoch": 0.9676578310261345, + "grad_norm": 0.1879991404381845, + "learning_rate": 8.565665766253163e-05, + "loss": 3.0245, + "step": 15588 + }, + { + "epoch": 0.9677199081258924, + "grad_norm": 0.21203288249861083, + "learning_rate": 8.565412578775955e-05, + "loss": 2.9242, + "step": 15589 + }, + { + "epoch": 0.9677819852256503, + "grad_norm": 0.17136624749236487, + "learning_rate": 8.565159372697124e-05, + "loss": 2.9751, + "step": 15590 + }, + { + "epoch": 0.9678440623254082, + "grad_norm": 0.17294577277965326, + "learning_rate": 8.564906148017992e-05, + "loss": 2.9732, + "step": 15591 + }, + { + "epoch": 0.967906139425166, + "grad_norm": 0.17319508270189235, + "learning_rate": 8.564652904739878e-05, + "loss": 3.0045, + "step": 15592 + }, + { + "epoch": 0.9679682165249239, + "grad_norm": 0.16714436585169126, + "learning_rate": 8.564399642864105e-05, + "loss": 3.0187, + "step": 15593 + }, + { + "epoch": 0.9680302936246818, + "grad_norm": 0.1850196777330284, + "learning_rate": 8.564146362391994e-05, + "loss": 3.0608, + "step": 15594 + }, + { + "epoch": 0.9680923707244398, + "grad_norm": 0.16825398910577455, + "learning_rate": 8.563893063324865e-05, + "loss": 3.0545, + "step": 15595 + }, + { + "epoch": 0.9681544478241977, + "grad_norm": 0.1914601152881581, + "learning_rate": 8.563639745664041e-05, + "loss": 3.0313, + "step": 15596 + }, + { + "epoch": 0.9682165249239556, + "grad_norm": 0.18097025724641988, + "learning_rate": 8.563386409410845e-05, + "loss": 2.9609, + "step": 15597 + }, + { + "epoch": 0.9682786020237134, + "grad_norm": 0.1774273095753671, + "learning_rate": 8.563133054566596e-05, + "loss": 2.8881, + "step": 15598 + }, + { + "epoch": 0.9683406791234713, + "grad_norm": 0.16570454230415074, + "learning_rate": 8.562879681132616e-05, + "loss": 2.9155, + "step": 15599 + }, + { + "epoch": 0.9684027562232292, + "grad_norm": 0.20943471035797082, + "learning_rate": 8.562626289110229e-05, + "loss": 2.9213, + "step": 15600 + }, + { + "epoch": 0.9684648333229872, + "grad_norm": 0.17250655382083385, + "learning_rate": 8.562372878500755e-05, + "loss": 2.9548, + "step": 15601 + }, + { + "epoch": 0.9685269104227451, + "grad_norm": 0.16121042867570518, + "learning_rate": 8.562119449305518e-05, + "loss": 2.9972, + "step": 15602 + }, + { + "epoch": 0.968588987522503, + "grad_norm": 0.23589370144168734, + "learning_rate": 8.561866001525838e-05, + "loss": 2.8791, + "step": 15603 + }, + { + "epoch": 0.9686510646222608, + "grad_norm": 0.18004421403052867, + "learning_rate": 8.561612535163039e-05, + "loss": 2.8959, + "step": 15604 + }, + { + "epoch": 0.9687131417220187, + "grad_norm": 0.17606372918372573, + "learning_rate": 8.561359050218443e-05, + "loss": 2.9869, + "step": 15605 + }, + { + "epoch": 0.9687752188217766, + "grad_norm": 0.21166965788194503, + "learning_rate": 8.56110554669337e-05, + "loss": 3.0159, + "step": 15606 + }, + { + "epoch": 0.9688372959215346, + "grad_norm": 0.17100856502419143, + "learning_rate": 8.560852024589148e-05, + "loss": 2.8728, + "step": 15607 + }, + { + "epoch": 0.9688993730212925, + "grad_norm": 0.2533964467694488, + "learning_rate": 8.560598483907095e-05, + "loss": 3.0224, + "step": 15608 + }, + { + "epoch": 0.9689614501210504, + "grad_norm": 0.1752490623550391, + "learning_rate": 8.560344924648536e-05, + "loss": 2.9128, + "step": 15609 + }, + { + "epoch": 0.9690235272208082, + "grad_norm": 0.18014099074667467, + "learning_rate": 8.560091346814793e-05, + "loss": 2.9772, + "step": 15610 + }, + { + "epoch": 0.9690856043205661, + "grad_norm": 0.18301170491544222, + "learning_rate": 8.55983775040719e-05, + "loss": 3.003, + "step": 15611 + }, + { + "epoch": 0.969147681420324, + "grad_norm": 0.19179044817627836, + "learning_rate": 8.559584135427048e-05, + "loss": 3.0901, + "step": 15612 + }, + { + "epoch": 0.969209758520082, + "grad_norm": 0.19639377742546563, + "learning_rate": 8.559330501875691e-05, + "loss": 2.9728, + "step": 15613 + }, + { + "epoch": 0.9692718356198399, + "grad_norm": 0.19170607225678918, + "learning_rate": 8.559076849754444e-05, + "loss": 3.0456, + "step": 15614 + }, + { + "epoch": 0.9693339127195978, + "grad_norm": 0.2078616564557433, + "learning_rate": 8.558823179064628e-05, + "loss": 2.9668, + "step": 15615 + }, + { + "epoch": 0.9693959898193556, + "grad_norm": 0.18677322279434733, + "learning_rate": 8.558569489807569e-05, + "loss": 3.0014, + "step": 15616 + }, + { + "epoch": 0.9694580669191135, + "grad_norm": 0.19625551496944957, + "learning_rate": 8.558315781984587e-05, + "loss": 2.9733, + "step": 15617 + }, + { + "epoch": 0.9695201440188714, + "grad_norm": 0.19743526831461794, + "learning_rate": 8.55806205559701e-05, + "loss": 2.9574, + "step": 15618 + }, + { + "epoch": 0.9695822211186294, + "grad_norm": 0.18087798160884666, + "learning_rate": 8.557808310646159e-05, + "loss": 2.8727, + "step": 15619 + }, + { + "epoch": 0.9696442982183873, + "grad_norm": 0.18940782496998537, + "learning_rate": 8.557554547133357e-05, + "loss": 2.9372, + "step": 15620 + }, + { + "epoch": 0.9697063753181452, + "grad_norm": 0.175585924345871, + "learning_rate": 8.557300765059929e-05, + "loss": 3.0807, + "step": 15621 + }, + { + "epoch": 0.969768452417903, + "grad_norm": 0.18598582897885443, + "learning_rate": 8.5570469644272e-05, + "loss": 2.9501, + "step": 15622 + }, + { + "epoch": 0.9698305295176609, + "grad_norm": 0.18229931179545536, + "learning_rate": 8.556793145236495e-05, + "loss": 2.8634, + "step": 15623 + }, + { + "epoch": 0.9698926066174188, + "grad_norm": 0.16704782038892046, + "learning_rate": 8.556539307489136e-05, + "loss": 2.8469, + "step": 15624 + }, + { + "epoch": 0.9699546837171767, + "grad_norm": 0.1921316821754906, + "learning_rate": 8.556285451186447e-05, + "loss": 2.9057, + "step": 15625 + }, + { + "epoch": 0.9700167608169347, + "grad_norm": 0.17347660848079657, + "learning_rate": 8.556031576329755e-05, + "loss": 2.8639, + "step": 15626 + }, + { + "epoch": 0.9700788379166926, + "grad_norm": 0.16436916595701387, + "learning_rate": 8.555777682920382e-05, + "loss": 3.0485, + "step": 15627 + }, + { + "epoch": 0.9701409150164504, + "grad_norm": 0.17880518569906884, + "learning_rate": 8.555523770959653e-05, + "loss": 2.947, + "step": 15628 + }, + { + "epoch": 0.9702029921162083, + "grad_norm": 0.19441060687629552, + "learning_rate": 8.555269840448894e-05, + "loss": 3.0104, + "step": 15629 + }, + { + "epoch": 0.9702650692159662, + "grad_norm": 0.1712568666045596, + "learning_rate": 8.555015891389431e-05, + "loss": 2.9828, + "step": 15630 + }, + { + "epoch": 0.9703271463157241, + "grad_norm": 0.1905839441988398, + "learning_rate": 8.554761923782584e-05, + "loss": 2.8666, + "step": 15631 + }, + { + "epoch": 0.9703892234154821, + "grad_norm": 0.19095820907755479, + "learning_rate": 8.554507937629683e-05, + "loss": 2.9327, + "step": 15632 + }, + { + "epoch": 0.97045130051524, + "grad_norm": 0.17029839487899961, + "learning_rate": 8.55425393293205e-05, + "loss": 2.9029, + "step": 15633 + }, + { + "epoch": 0.9705133776149978, + "grad_norm": 0.17952531253634502, + "learning_rate": 8.553999909691013e-05, + "loss": 2.8906, + "step": 15634 + }, + { + "epoch": 0.9705754547147557, + "grad_norm": 0.1694025896271907, + "learning_rate": 8.553745867907894e-05, + "loss": 2.9777, + "step": 15635 + }, + { + "epoch": 0.9706375318145136, + "grad_norm": 0.20025344562840297, + "learning_rate": 8.553491807584022e-05, + "loss": 2.9479, + "step": 15636 + }, + { + "epoch": 0.9706996089142715, + "grad_norm": 0.15869148841268907, + "learning_rate": 8.55323772872072e-05, + "loss": 2.9494, + "step": 15637 + }, + { + "epoch": 0.9707616860140295, + "grad_norm": 0.23262614401895243, + "learning_rate": 8.552983631319314e-05, + "loss": 3.0394, + "step": 15638 + }, + { + "epoch": 0.9708237631137874, + "grad_norm": 0.1936922675687998, + "learning_rate": 8.552729515381129e-05, + "loss": 2.8893, + "step": 15639 + }, + { + "epoch": 0.9708858402135452, + "grad_norm": 0.20341060806413824, + "learning_rate": 8.552475380907492e-05, + "loss": 2.9456, + "step": 15640 + }, + { + "epoch": 0.9709479173133031, + "grad_norm": 0.18678778246133407, + "learning_rate": 8.552221227899728e-05, + "loss": 2.9302, + "step": 15641 + }, + { + "epoch": 0.971009994413061, + "grad_norm": 0.2483614706267207, + "learning_rate": 8.551967056359165e-05, + "loss": 2.8872, + "step": 15642 + }, + { + "epoch": 0.9710720715128189, + "grad_norm": 0.18620646841689198, + "learning_rate": 8.551712866287127e-05, + "loss": 2.9829, + "step": 15643 + }, + { + "epoch": 0.9711341486125769, + "grad_norm": 0.17676186425217663, + "learning_rate": 8.551458657684941e-05, + "loss": 3.0282, + "step": 15644 + }, + { + "epoch": 0.9711962257123348, + "grad_norm": 0.1833477401130163, + "learning_rate": 8.551204430553932e-05, + "loss": 3.0247, + "step": 15645 + }, + { + "epoch": 0.9712583028120926, + "grad_norm": 0.19597634984400836, + "learning_rate": 8.550950184895429e-05, + "loss": 3.0478, + "step": 15646 + }, + { + "epoch": 0.9713203799118505, + "grad_norm": 0.1601935785102529, + "learning_rate": 8.550695920710756e-05, + "loss": 3.0245, + "step": 15647 + }, + { + "epoch": 0.9713824570116084, + "grad_norm": 0.21751568170446317, + "learning_rate": 8.55044163800124e-05, + "loss": 2.9453, + "step": 15648 + }, + { + "epoch": 0.9714445341113663, + "grad_norm": 0.3787509167222615, + "learning_rate": 8.55018733676821e-05, + "loss": 2.9372, + "step": 15649 + }, + { + "epoch": 0.9715066112111242, + "grad_norm": 0.1912454591473824, + "learning_rate": 8.549933017012987e-05, + "loss": 2.7854, + "step": 15650 + }, + { + "epoch": 0.9715686883108822, + "grad_norm": 0.18240326576151014, + "learning_rate": 8.549678678736906e-05, + "loss": 2.8477, + "step": 15651 + }, + { + "epoch": 0.97163076541064, + "grad_norm": 0.1848666219515247, + "learning_rate": 8.549424321941286e-05, + "loss": 3.0706, + "step": 15652 + }, + { + "epoch": 0.9716928425103979, + "grad_norm": 0.1859831649739629, + "learning_rate": 8.54916994662746e-05, + "loss": 2.9762, + "step": 15653 + }, + { + "epoch": 0.9717549196101558, + "grad_norm": 0.22540899661342392, + "learning_rate": 8.548915552796752e-05, + "loss": 3.0053, + "step": 15654 + }, + { + "epoch": 0.9718169967099137, + "grad_norm": 0.17829626993001937, + "learning_rate": 8.54866114045049e-05, + "loss": 2.9426, + "step": 15655 + }, + { + "epoch": 0.9718790738096716, + "grad_norm": 0.1732183272263517, + "learning_rate": 8.548406709590003e-05, + "loss": 3.0442, + "step": 15656 + }, + { + "epoch": 0.9719411509094296, + "grad_norm": 0.21952718378014113, + "learning_rate": 8.548152260216614e-05, + "loss": 2.9906, + "step": 15657 + }, + { + "epoch": 0.9720032280091874, + "grad_norm": 0.2687071211563699, + "learning_rate": 8.547897792331655e-05, + "loss": 2.9534, + "step": 15658 + }, + { + "epoch": 0.9720653051089453, + "grad_norm": 0.18348348634133185, + "learning_rate": 8.547643305936451e-05, + "loss": 2.9901, + "step": 15659 + }, + { + "epoch": 0.9721273822087032, + "grad_norm": 0.16108168166435066, + "learning_rate": 8.547388801032333e-05, + "loss": 2.9843, + "step": 15660 + }, + { + "epoch": 0.9721894593084611, + "grad_norm": 0.15318075741388248, + "learning_rate": 8.547134277620623e-05, + "loss": 2.9594, + "step": 15661 + }, + { + "epoch": 0.972251536408219, + "grad_norm": 0.16025818511625003, + "learning_rate": 8.546879735702654e-05, + "loss": 2.9407, + "step": 15662 + }, + { + "epoch": 0.972313613507977, + "grad_norm": 0.16377368550954438, + "learning_rate": 8.546625175279752e-05, + "loss": 2.9304, + "step": 15663 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.18611358631344022, + "learning_rate": 8.546370596353246e-05, + "loss": 3.0257, + "step": 15664 + }, + { + "epoch": 0.9724377677074927, + "grad_norm": 0.20701232859678872, + "learning_rate": 8.546115998924464e-05, + "loss": 2.9582, + "step": 15665 + }, + { + "epoch": 0.9724998448072506, + "grad_norm": 0.16172504251557382, + "learning_rate": 8.545861382994733e-05, + "loss": 2.9116, + "step": 15666 + }, + { + "epoch": 0.9725619219070085, + "grad_norm": 0.15907241272344302, + "learning_rate": 8.545606748565384e-05, + "loss": 3.0066, + "step": 15667 + }, + { + "epoch": 0.9726239990067664, + "grad_norm": 0.1565042394274849, + "learning_rate": 8.545352095637743e-05, + "loss": 2.9787, + "step": 15668 + }, + { + "epoch": 0.9726860761065244, + "grad_norm": 0.18493364287798916, + "learning_rate": 8.545097424213139e-05, + "loss": 2.9505, + "step": 15669 + }, + { + "epoch": 0.9727481532062822, + "grad_norm": 0.17855042925897624, + "learning_rate": 8.544842734292901e-05, + "loss": 2.9369, + "step": 15670 + }, + { + "epoch": 0.9728102303060401, + "grad_norm": 0.17703588752362398, + "learning_rate": 8.544588025878358e-05, + "loss": 3.0058, + "step": 15671 + }, + { + "epoch": 0.972872307405798, + "grad_norm": 0.16788030114257768, + "learning_rate": 8.544333298970839e-05, + "loss": 2.9364, + "step": 15672 + }, + { + "epoch": 0.9729343845055559, + "grad_norm": 0.1563912587598433, + "learning_rate": 8.544078553571672e-05, + "loss": 3.0208, + "step": 15673 + }, + { + "epoch": 0.9729964616053138, + "grad_norm": 0.17334254625389717, + "learning_rate": 8.543823789682189e-05, + "loss": 2.9405, + "step": 15674 + }, + { + "epoch": 0.9730585387050718, + "grad_norm": 0.18706389214838534, + "learning_rate": 8.543569007303715e-05, + "loss": 3.0583, + "step": 15675 + }, + { + "epoch": 0.9731206158048296, + "grad_norm": 0.15470244426980148, + "learning_rate": 8.543314206437583e-05, + "loss": 2.9073, + "step": 15676 + }, + { + "epoch": 0.9731826929045875, + "grad_norm": 0.19442927931975992, + "learning_rate": 8.543059387085119e-05, + "loss": 2.9709, + "step": 15677 + }, + { + "epoch": 0.9732447700043454, + "grad_norm": 0.1742122563404964, + "learning_rate": 8.542804549247654e-05, + "loss": 2.9396, + "step": 15678 + }, + { + "epoch": 0.9733068471041033, + "grad_norm": 0.1939775046438962, + "learning_rate": 8.542549692926518e-05, + "loss": 2.8511, + "step": 15679 + }, + { + "epoch": 0.9733689242038612, + "grad_norm": 0.16878884296716362, + "learning_rate": 8.542294818123043e-05, + "loss": 3.0552, + "step": 15680 + }, + { + "epoch": 0.9734310013036191, + "grad_norm": 0.17823761132606877, + "learning_rate": 8.542039924838552e-05, + "loss": 3.0102, + "step": 15681 + }, + { + "epoch": 0.973493078403377, + "grad_norm": 0.15054433572050005, + "learning_rate": 8.541785013074381e-05, + "loss": 2.9312, + "step": 15682 + }, + { + "epoch": 0.9735551555031349, + "grad_norm": 0.17214591176582417, + "learning_rate": 8.541530082831856e-05, + "loss": 2.8412, + "step": 15683 + }, + { + "epoch": 0.9736172326028928, + "grad_norm": 0.17524889229799887, + "learning_rate": 8.541275134112311e-05, + "loss": 2.9367, + "step": 15684 + }, + { + "epoch": 0.9736793097026507, + "grad_norm": 0.17344990312198882, + "learning_rate": 8.541020166917073e-05, + "loss": 3.0355, + "step": 15685 + }, + { + "epoch": 0.9737413868024086, + "grad_norm": 0.16609088650313766, + "learning_rate": 8.540765181247473e-05, + "loss": 2.8953, + "step": 15686 + }, + { + "epoch": 0.9738034639021665, + "grad_norm": 0.15724913512076982, + "learning_rate": 8.540510177104842e-05, + "loss": 2.9201, + "step": 15687 + }, + { + "epoch": 0.9738655410019244, + "grad_norm": 0.15903571117790502, + "learning_rate": 8.540255154490509e-05, + "loss": 2.9433, + "step": 15688 + }, + { + "epoch": 0.9739276181016823, + "grad_norm": 0.15720658145725977, + "learning_rate": 8.540000113405806e-05, + "loss": 2.9817, + "step": 15689 + }, + { + "epoch": 0.9739896952014402, + "grad_norm": 0.14015629595072152, + "learning_rate": 8.539745053852064e-05, + "loss": 2.9773, + "step": 15690 + }, + { + "epoch": 0.9740517723011981, + "grad_norm": 0.16150626629816445, + "learning_rate": 8.53948997583061e-05, + "loss": 2.9061, + "step": 15691 + }, + { + "epoch": 0.974113849400956, + "grad_norm": 0.15568540012860796, + "learning_rate": 8.53923487934278e-05, + "loss": 2.9605, + "step": 15692 + }, + { + "epoch": 0.9741759265007139, + "grad_norm": 0.15465178294079057, + "learning_rate": 8.538979764389903e-05, + "loss": 2.92, + "step": 15693 + }, + { + "epoch": 0.9742380036004717, + "grad_norm": 0.1560258151142501, + "learning_rate": 8.538724630973307e-05, + "loss": 2.9661, + "step": 15694 + }, + { + "epoch": 0.9743000807002297, + "grad_norm": 0.1582684558619376, + "learning_rate": 8.538469479094327e-05, + "loss": 2.9715, + "step": 15695 + }, + { + "epoch": 0.9743621577999876, + "grad_norm": 0.15335840614545274, + "learning_rate": 8.538214308754292e-05, + "loss": 3.0486, + "step": 15696 + }, + { + "epoch": 0.9744242348997455, + "grad_norm": 0.16579305089966456, + "learning_rate": 8.537959119954534e-05, + "loss": 3.0339, + "step": 15697 + }, + { + "epoch": 0.9744863119995034, + "grad_norm": 0.23504048282852855, + "learning_rate": 8.537703912696387e-05, + "loss": 3.0368, + "step": 15698 + }, + { + "epoch": 0.9745483890992613, + "grad_norm": 0.1847809694463186, + "learning_rate": 8.537448686981177e-05, + "loss": 2.93, + "step": 15699 + }, + { + "epoch": 0.9746104661990191, + "grad_norm": 0.16101903077049134, + "learning_rate": 8.537193442810238e-05, + "loss": 3.0083, + "step": 15700 + }, + { + "epoch": 0.9746725432987771, + "grad_norm": 0.19961946055096136, + "learning_rate": 8.536938180184905e-05, + "loss": 2.8814, + "step": 15701 + }, + { + "epoch": 0.974734620398535, + "grad_norm": 0.19860815459683673, + "learning_rate": 8.536682899106505e-05, + "loss": 2.9322, + "step": 15702 + }, + { + "epoch": 0.9747966974982929, + "grad_norm": 0.17599086978514042, + "learning_rate": 8.536427599576371e-05, + "loss": 3.0785, + "step": 15703 + }, + { + "epoch": 0.9748587745980508, + "grad_norm": 0.17847170216385225, + "learning_rate": 8.536172281595838e-05, + "loss": 2.9027, + "step": 15704 + }, + { + "epoch": 0.9749208516978087, + "grad_norm": 0.2000697416659146, + "learning_rate": 8.535916945166233e-05, + "loss": 2.9221, + "step": 15705 + }, + { + "epoch": 0.9749829287975665, + "grad_norm": 0.20518177682794486, + "learning_rate": 8.535661590288893e-05, + "loss": 2.9209, + "step": 15706 + }, + { + "epoch": 0.9750450058973245, + "grad_norm": 0.19121385168829844, + "learning_rate": 8.535406216965149e-05, + "loss": 2.9657, + "step": 15707 + }, + { + "epoch": 0.9751070829970824, + "grad_norm": 0.17192508227934483, + "learning_rate": 8.535150825196329e-05, + "loss": 2.9527, + "step": 15708 + }, + { + "epoch": 0.9751691600968403, + "grad_norm": 0.18635104736146896, + "learning_rate": 8.534895414983772e-05, + "loss": 3.1047, + "step": 15709 + }, + { + "epoch": 0.9752312371965982, + "grad_norm": 0.18918306234666765, + "learning_rate": 8.534639986328808e-05, + "loss": 2.9642, + "step": 15710 + }, + { + "epoch": 0.9752933142963561, + "grad_norm": 0.168342490577625, + "learning_rate": 8.534384539232767e-05, + "loss": 2.9315, + "step": 15711 + }, + { + "epoch": 0.9753553913961139, + "grad_norm": 0.2667832227260635, + "learning_rate": 8.534129073696984e-05, + "loss": 2.9723, + "step": 15712 + }, + { + "epoch": 0.9754174684958719, + "grad_norm": 0.20709948944593426, + "learning_rate": 8.533873589722792e-05, + "loss": 2.9556, + "step": 15713 + }, + { + "epoch": 0.9754795455956298, + "grad_norm": 0.1927143474018451, + "learning_rate": 8.533618087311524e-05, + "loss": 3.0167, + "step": 15714 + }, + { + "epoch": 0.9755416226953877, + "grad_norm": 0.18000492792867523, + "learning_rate": 8.533362566464514e-05, + "loss": 2.8854, + "step": 15715 + }, + { + "epoch": 0.9756036997951456, + "grad_norm": 0.3422366882052373, + "learning_rate": 8.533107027183092e-05, + "loss": 2.9859, + "step": 15716 + }, + { + "epoch": 0.9756657768949035, + "grad_norm": 0.34664210800875045, + "learning_rate": 8.532851469468593e-05, + "loss": 3.006, + "step": 15717 + }, + { + "epoch": 0.9757278539946613, + "grad_norm": 0.2047375803389899, + "learning_rate": 8.532595893322349e-05, + "loss": 2.9094, + "step": 15718 + }, + { + "epoch": 0.9757899310944192, + "grad_norm": 0.22356978615247186, + "learning_rate": 8.532340298745697e-05, + "loss": 2.9619, + "step": 15719 + }, + { + "epoch": 0.9758520081941772, + "grad_norm": 0.28358944818686155, + "learning_rate": 8.532084685739967e-05, + "loss": 2.9895, + "step": 15720 + }, + { + "epoch": 0.9759140852939351, + "grad_norm": 0.24401763106720448, + "learning_rate": 8.531829054306493e-05, + "loss": 2.9668, + "step": 15721 + }, + { + "epoch": 0.975976162393693, + "grad_norm": 0.2188184069212372, + "learning_rate": 8.531573404446611e-05, + "loss": 2.8797, + "step": 15722 + }, + { + "epoch": 0.9760382394934509, + "grad_norm": 0.2083974153588352, + "learning_rate": 8.531317736161652e-05, + "loss": 2.9955, + "step": 15723 + }, + { + "epoch": 0.9761003165932087, + "grad_norm": 0.2396494870835181, + "learning_rate": 8.531062049452953e-05, + "loss": 3.0094, + "step": 15724 + }, + { + "epoch": 0.9761623936929666, + "grad_norm": 0.25799786521800977, + "learning_rate": 8.530806344321844e-05, + "loss": 2.9455, + "step": 15725 + }, + { + "epoch": 0.9762244707927246, + "grad_norm": 0.2621750625643015, + "learning_rate": 8.530550620769663e-05, + "loss": 3.0254, + "step": 15726 + }, + { + "epoch": 0.9762865478924825, + "grad_norm": 0.18661710374560145, + "learning_rate": 8.530294878797743e-05, + "loss": 2.9758, + "step": 15727 + }, + { + "epoch": 0.9763486249922404, + "grad_norm": 0.2238800065259803, + "learning_rate": 8.530039118407415e-05, + "loss": 2.94, + "step": 15728 + }, + { + "epoch": 0.9764107020919983, + "grad_norm": 0.17981519515490882, + "learning_rate": 8.529783339600017e-05, + "loss": 2.939, + "step": 15729 + }, + { + "epoch": 0.9764727791917561, + "grad_norm": 0.1892680709625389, + "learning_rate": 8.529527542376882e-05, + "loss": 2.9999, + "step": 15730 + }, + { + "epoch": 0.976534856291514, + "grad_norm": 0.18832178629195823, + "learning_rate": 8.529271726739345e-05, + "loss": 2.9945, + "step": 15731 + }, + { + "epoch": 0.976596933391272, + "grad_norm": 0.33587261061178747, + "learning_rate": 8.52901589268874e-05, + "loss": 2.9947, + "step": 15732 + }, + { + "epoch": 0.9766590104910299, + "grad_norm": 0.18745000490588987, + "learning_rate": 8.528760040226405e-05, + "loss": 3.0386, + "step": 15733 + }, + { + "epoch": 0.9767210875907878, + "grad_norm": 0.20208937477801392, + "learning_rate": 8.52850416935367e-05, + "loss": 2.9884, + "step": 15734 + }, + { + "epoch": 0.9767831646905456, + "grad_norm": 0.1878435742160531, + "learning_rate": 8.528248280071873e-05, + "loss": 2.9632, + "step": 15735 + }, + { + "epoch": 0.9768452417903035, + "grad_norm": 0.1857953165085359, + "learning_rate": 8.527992372382347e-05, + "loss": 2.9542, + "step": 15736 + }, + { + "epoch": 0.9769073188900614, + "grad_norm": 0.19446629316803074, + "learning_rate": 8.52773644628643e-05, + "loss": 2.8963, + "step": 15737 + }, + { + "epoch": 0.9769693959898194, + "grad_norm": 0.24052280076352422, + "learning_rate": 8.527480501785454e-05, + "loss": 2.9673, + "step": 15738 + }, + { + "epoch": 0.9770314730895773, + "grad_norm": 0.26059197088481234, + "learning_rate": 8.527224538880759e-05, + "loss": 2.9628, + "step": 15739 + }, + { + "epoch": 0.9770935501893352, + "grad_norm": 0.23326920478880106, + "learning_rate": 8.526968557573674e-05, + "loss": 3.0403, + "step": 15740 + }, + { + "epoch": 0.977155627289093, + "grad_norm": 0.18701017493664038, + "learning_rate": 8.52671255786554e-05, + "loss": 2.94, + "step": 15741 + }, + { + "epoch": 0.9772177043888509, + "grad_norm": 0.2024776247843522, + "learning_rate": 8.526456539757688e-05, + "loss": 2.9222, + "step": 15742 + }, + { + "epoch": 0.9772797814886088, + "grad_norm": 0.22285446746831786, + "learning_rate": 8.526200503251458e-05, + "loss": 2.9809, + "step": 15743 + }, + { + "epoch": 0.9773418585883668, + "grad_norm": 0.16997721746421698, + "learning_rate": 8.525944448348185e-05, + "loss": 3.0007, + "step": 15744 + }, + { + "epoch": 0.9774039356881247, + "grad_norm": 0.17894631399471056, + "learning_rate": 8.525688375049202e-05, + "loss": 2.9579, + "step": 15745 + }, + { + "epoch": 0.9774660127878826, + "grad_norm": 0.22060861505069118, + "learning_rate": 8.525432283355846e-05, + "loss": 2.9596, + "step": 15746 + }, + { + "epoch": 0.9775280898876404, + "grad_norm": 0.4107439644231376, + "learning_rate": 8.525176173269455e-05, + "loss": 2.9512, + "step": 15747 + }, + { + "epoch": 0.9775901669873983, + "grad_norm": 0.20976193778208926, + "learning_rate": 8.524920044791365e-05, + "loss": 3.0505, + "step": 15748 + }, + { + "epoch": 0.9776522440871562, + "grad_norm": 0.19215159390922773, + "learning_rate": 8.524663897922911e-05, + "loss": 3.002, + "step": 15749 + }, + { + "epoch": 0.9777143211869141, + "grad_norm": 0.28875611796234985, + "learning_rate": 8.524407732665429e-05, + "loss": 2.9751, + "step": 15750 + }, + { + "epoch": 0.9777763982866721, + "grad_norm": 0.20194824183332225, + "learning_rate": 8.524151549020257e-05, + "loss": 2.9885, + "step": 15751 + }, + { + "epoch": 0.97783847538643, + "grad_norm": 0.1541853151340569, + "learning_rate": 8.523895346988731e-05, + "loss": 2.897, + "step": 15752 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.20571436506857127, + "learning_rate": 8.523639126572187e-05, + "loss": 2.9564, + "step": 15753 + }, + { + "epoch": 0.9779626295859457, + "grad_norm": 0.1762436612659987, + "learning_rate": 8.523382887771962e-05, + "loss": 2.9691, + "step": 15754 + }, + { + "epoch": 0.9780247066857036, + "grad_norm": 0.16171416976450276, + "learning_rate": 8.523126630589395e-05, + "loss": 2.9278, + "step": 15755 + }, + { + "epoch": 0.9780867837854615, + "grad_norm": 0.585078589018593, + "learning_rate": 8.522870355025818e-05, + "loss": 2.9563, + "step": 15756 + }, + { + "epoch": 0.9781488608852195, + "grad_norm": 0.25722642230911313, + "learning_rate": 8.522614061082574e-05, + "loss": 2.9463, + "step": 15757 + }, + { + "epoch": 0.9782109379849774, + "grad_norm": 0.19669595933292602, + "learning_rate": 8.522357748760996e-05, + "loss": 2.9181, + "step": 15758 + }, + { + "epoch": 0.9782730150847352, + "grad_norm": 0.17369505256987178, + "learning_rate": 8.522101418062422e-05, + "loss": 2.9518, + "step": 15759 + }, + { + "epoch": 0.9783350921844931, + "grad_norm": 0.22517991786720407, + "learning_rate": 8.521845068988191e-05, + "loss": 3.0422, + "step": 15760 + }, + { + "epoch": 0.978397169284251, + "grad_norm": 0.17950825458463374, + "learning_rate": 8.52158870153964e-05, + "loss": 2.9121, + "step": 15761 + }, + { + "epoch": 0.9784592463840089, + "grad_norm": 0.18479665370678588, + "learning_rate": 8.521332315718104e-05, + "loss": 2.9575, + "step": 15762 + }, + { + "epoch": 0.9785213234837669, + "grad_norm": 0.20088749757308252, + "learning_rate": 8.521075911524924e-05, + "loss": 2.9087, + "step": 15763 + }, + { + "epoch": 0.9785834005835248, + "grad_norm": 0.18653006816482587, + "learning_rate": 8.520819488961435e-05, + "loss": 2.9594, + "step": 15764 + }, + { + "epoch": 0.9786454776832826, + "grad_norm": 0.20946730315653322, + "learning_rate": 8.520563048028976e-05, + "loss": 2.9156, + "step": 15765 + }, + { + "epoch": 0.9787075547830405, + "grad_norm": 0.20423065556537942, + "learning_rate": 8.520306588728886e-05, + "loss": 2.9366, + "step": 15766 + }, + { + "epoch": 0.9787696318827984, + "grad_norm": 0.19207477597753966, + "learning_rate": 8.520050111062501e-05, + "loss": 2.8923, + "step": 15767 + }, + { + "epoch": 0.9788317089825563, + "grad_norm": 0.1944481990277634, + "learning_rate": 8.519793615031161e-05, + "loss": 3.0277, + "step": 15768 + }, + { + "epoch": 0.9788937860823143, + "grad_norm": 0.1726392908095684, + "learning_rate": 8.519537100636203e-05, + "loss": 2.8967, + "step": 15769 + }, + { + "epoch": 0.9789558631820722, + "grad_norm": 0.30385647442115366, + "learning_rate": 8.519280567878964e-05, + "loss": 2.9137, + "step": 15770 + }, + { + "epoch": 0.97901794028183, + "grad_norm": 0.19749629819998205, + "learning_rate": 8.519024016760786e-05, + "loss": 2.94, + "step": 15771 + }, + { + "epoch": 0.9790800173815879, + "grad_norm": 0.23250136899288715, + "learning_rate": 8.518767447283005e-05, + "loss": 2.9076, + "step": 15772 + }, + { + "epoch": 0.9791420944813458, + "grad_norm": 0.17653152037903885, + "learning_rate": 8.518510859446959e-05, + "loss": 2.9972, + "step": 15773 + }, + { + "epoch": 0.9792041715811037, + "grad_norm": 0.22390820899686723, + "learning_rate": 8.518254253253987e-05, + "loss": 3.0289, + "step": 15774 + }, + { + "epoch": 0.9792662486808617, + "grad_norm": 0.16891550892693635, + "learning_rate": 8.517997628705431e-05, + "loss": 3.0043, + "step": 15775 + }, + { + "epoch": 0.9793283257806196, + "grad_norm": 0.221641394378865, + "learning_rate": 8.517740985802626e-05, + "loss": 2.7941, + "step": 15776 + }, + { + "epoch": 0.9793904028803774, + "grad_norm": 0.24279859570542153, + "learning_rate": 8.517484324546913e-05, + "loss": 2.9496, + "step": 15777 + }, + { + "epoch": 0.9794524799801353, + "grad_norm": 0.17034651324778244, + "learning_rate": 8.51722764493963e-05, + "loss": 3.0173, + "step": 15778 + }, + { + "epoch": 0.9795145570798932, + "grad_norm": 0.2470791555035911, + "learning_rate": 8.516970946982116e-05, + "loss": 2.9156, + "step": 15779 + }, + { + "epoch": 0.9795766341796511, + "grad_norm": 0.17683442705207483, + "learning_rate": 8.516714230675711e-05, + "loss": 2.989, + "step": 15780 + }, + { + "epoch": 0.979638711279409, + "grad_norm": 0.2353148451532475, + "learning_rate": 8.516457496021754e-05, + "loss": 2.9122, + "step": 15781 + }, + { + "epoch": 0.979700788379167, + "grad_norm": 0.15445279440219753, + "learning_rate": 8.516200743021585e-05, + "loss": 2.9575, + "step": 15782 + }, + { + "epoch": 0.9797628654789248, + "grad_norm": 0.1884445005036979, + "learning_rate": 8.515943971676543e-05, + "loss": 2.9812, + "step": 15783 + }, + { + "epoch": 0.9798249425786827, + "grad_norm": 0.1839087401669559, + "learning_rate": 8.515687181987969e-05, + "loss": 3.0004, + "step": 15784 + }, + { + "epoch": 0.9798870196784406, + "grad_norm": 0.1845169494830751, + "learning_rate": 8.515430373957198e-05, + "loss": 3.0941, + "step": 15785 + }, + { + "epoch": 0.9799490967781985, + "grad_norm": 0.19698326751768025, + "learning_rate": 8.515173547585577e-05, + "loss": 2.985, + "step": 15786 + }, + { + "epoch": 0.9800111738779564, + "grad_norm": 0.18664212470172648, + "learning_rate": 8.51491670287444e-05, + "loss": 2.9813, + "step": 15787 + }, + { + "epoch": 0.9800732509777144, + "grad_norm": 0.21690195131899276, + "learning_rate": 8.514659839825131e-05, + "loss": 2.9593, + "step": 15788 + }, + { + "epoch": 0.9801353280774722, + "grad_norm": 0.24886444850714293, + "learning_rate": 8.514402958438987e-05, + "loss": 3.0002, + "step": 15789 + }, + { + "epoch": 0.9801974051772301, + "grad_norm": 0.19445884115324322, + "learning_rate": 8.514146058717352e-05, + "loss": 2.9952, + "step": 15790 + }, + { + "epoch": 0.980259482276988, + "grad_norm": 0.21674405562852464, + "learning_rate": 8.513889140661562e-05, + "loss": 2.9572, + "step": 15791 + }, + { + "epoch": 0.9803215593767459, + "grad_norm": 0.2856136468346334, + "learning_rate": 8.51363220427296e-05, + "loss": 2.9159, + "step": 15792 + }, + { + "epoch": 0.9803836364765038, + "grad_norm": 0.17087667054111097, + "learning_rate": 8.513375249552885e-05, + "loss": 2.9747, + "step": 15793 + }, + { + "epoch": 0.9804457135762618, + "grad_norm": 0.28695424099151345, + "learning_rate": 8.513118276502677e-05, + "loss": 3.0062, + "step": 15794 + }, + { + "epoch": 0.9805077906760196, + "grad_norm": 0.24573656682402384, + "learning_rate": 8.512861285123682e-05, + "loss": 2.9922, + "step": 15795 + }, + { + "epoch": 0.9805698677757775, + "grad_norm": 0.21768081303253384, + "learning_rate": 8.512604275417232e-05, + "loss": 2.9144, + "step": 15796 + }, + { + "epoch": 0.9806319448755354, + "grad_norm": 0.16714056015619633, + "learning_rate": 8.512347247384675e-05, + "loss": 2.9331, + "step": 15797 + }, + { + "epoch": 0.9806940219752933, + "grad_norm": 0.18468901435115018, + "learning_rate": 8.512090201027352e-05, + "loss": 2.9643, + "step": 15798 + }, + { + "epoch": 0.9807560990750512, + "grad_norm": 0.20167764952666348, + "learning_rate": 8.5118331363466e-05, + "loss": 2.9617, + "step": 15799 + }, + { + "epoch": 0.9808181761748092, + "grad_norm": 0.20989610425081273, + "learning_rate": 8.511576053343762e-05, + "loss": 3.0043, + "step": 15800 + }, + { + "epoch": 0.980880253274567, + "grad_norm": 0.2832120302049857, + "learning_rate": 8.51131895202018e-05, + "loss": 2.9554, + "step": 15801 + }, + { + "epoch": 0.9809423303743249, + "grad_norm": 0.21730816322850824, + "learning_rate": 8.511061832377193e-05, + "loss": 2.8514, + "step": 15802 + }, + { + "epoch": 0.9810044074740828, + "grad_norm": 0.17682546301025648, + "learning_rate": 8.510804694416146e-05, + "loss": 3.0082, + "step": 15803 + }, + { + "epoch": 0.9810664845738407, + "grad_norm": 0.16781479966241325, + "learning_rate": 8.510547538138376e-05, + "loss": 2.9358, + "step": 15804 + }, + { + "epoch": 0.9811285616735986, + "grad_norm": 0.1663656639697538, + "learning_rate": 8.51029036354523e-05, + "loss": 2.9571, + "step": 15805 + }, + { + "epoch": 0.9811906387733565, + "grad_norm": 0.2888637323282496, + "learning_rate": 8.510033170638046e-05, + "loss": 2.97, + "step": 15806 + }, + { + "epoch": 0.9812527158731144, + "grad_norm": 0.18965107123356262, + "learning_rate": 8.509775959418166e-05, + "loss": 2.9796, + "step": 15807 + }, + { + "epoch": 0.9813147929728723, + "grad_norm": 0.22694935327701654, + "learning_rate": 8.509518729886934e-05, + "loss": 2.9041, + "step": 15808 + }, + { + "epoch": 0.9813768700726302, + "grad_norm": 0.23031697690745567, + "learning_rate": 8.509261482045692e-05, + "loss": 2.956, + "step": 15809 + }, + { + "epoch": 0.9814389471723881, + "grad_norm": 0.17274974798585752, + "learning_rate": 8.509004215895779e-05, + "loss": 3.0284, + "step": 15810 + }, + { + "epoch": 0.981501024272146, + "grad_norm": 0.19708137594572306, + "learning_rate": 8.508746931438539e-05, + "loss": 2.9716, + "step": 15811 + }, + { + "epoch": 0.981563101371904, + "grad_norm": 0.1793718156465466, + "learning_rate": 8.508489628675316e-05, + "loss": 2.9058, + "step": 15812 + }, + { + "epoch": 0.9816251784716618, + "grad_norm": 0.18199925243766493, + "learning_rate": 8.508232307607449e-05, + "loss": 2.882, + "step": 15813 + }, + { + "epoch": 0.9816872555714197, + "grad_norm": 0.16565021804243396, + "learning_rate": 8.507974968236284e-05, + "loss": 3.0028, + "step": 15814 + }, + { + "epoch": 0.9817493326711776, + "grad_norm": 0.1983736851726146, + "learning_rate": 8.507717610563161e-05, + "loss": 2.8997, + "step": 15815 + }, + { + "epoch": 0.9818114097709355, + "grad_norm": 0.1695044751272804, + "learning_rate": 8.507460234589424e-05, + "loss": 2.9446, + "step": 15816 + }, + { + "epoch": 0.9818734868706934, + "grad_norm": 0.19080018412730593, + "learning_rate": 8.507202840316415e-05, + "loss": 2.9625, + "step": 15817 + }, + { + "epoch": 0.9819355639704513, + "grad_norm": 0.16916703685319107, + "learning_rate": 8.506945427745479e-05, + "loss": 2.9422, + "step": 15818 + }, + { + "epoch": 0.9819976410702091, + "grad_norm": 0.16689134084688909, + "learning_rate": 8.506687996877955e-05, + "loss": 2.8976, + "step": 15819 + }, + { + "epoch": 0.9820597181699671, + "grad_norm": 0.17746260059273497, + "learning_rate": 8.50643054771519e-05, + "loss": 2.9707, + "step": 15820 + }, + { + "epoch": 0.982121795269725, + "grad_norm": 0.1709092000569634, + "learning_rate": 8.506173080258525e-05, + "loss": 2.9964, + "step": 15821 + }, + { + "epoch": 0.9821838723694829, + "grad_norm": 0.16403120173778613, + "learning_rate": 8.505915594509303e-05, + "loss": 2.9125, + "step": 15822 + }, + { + "epoch": 0.9822459494692408, + "grad_norm": 0.17711050979728596, + "learning_rate": 8.505658090468871e-05, + "loss": 2.9588, + "step": 15823 + }, + { + "epoch": 0.9823080265689987, + "grad_norm": 0.1717859926372326, + "learning_rate": 8.505400568138569e-05, + "loss": 2.9415, + "step": 15824 + }, + { + "epoch": 0.9823701036687565, + "grad_norm": 0.15133157741919417, + "learning_rate": 8.50514302751974e-05, + "loss": 2.9749, + "step": 15825 + }, + { + "epoch": 0.9824321807685145, + "grad_norm": 0.15062905892763834, + "learning_rate": 8.50488546861373e-05, + "loss": 2.8372, + "step": 15826 + }, + { + "epoch": 0.9824942578682724, + "grad_norm": 0.15406662625944412, + "learning_rate": 8.50462789142188e-05, + "loss": 2.8715, + "step": 15827 + }, + { + "epoch": 0.9825563349680303, + "grad_norm": 0.1554901961290267, + "learning_rate": 8.504370295945537e-05, + "loss": 2.9407, + "step": 15828 + }, + { + "epoch": 0.9826184120677882, + "grad_norm": 0.1869495776645331, + "learning_rate": 8.504112682186043e-05, + "loss": 3.0635, + "step": 15829 + }, + { + "epoch": 0.9826804891675461, + "grad_norm": 0.1686035403713691, + "learning_rate": 8.503855050144743e-05, + "loss": 2.8636, + "step": 15830 + }, + { + "epoch": 0.9827425662673039, + "grad_norm": 0.1775844486585822, + "learning_rate": 8.50359739982298e-05, + "loss": 2.9358, + "step": 15831 + }, + { + "epoch": 0.9828046433670619, + "grad_norm": 0.15900625247065409, + "learning_rate": 8.503339731222098e-05, + "loss": 2.9711, + "step": 15832 + }, + { + "epoch": 0.9828667204668198, + "grad_norm": 0.1638204685308494, + "learning_rate": 8.503082044343443e-05, + "loss": 2.8359, + "step": 15833 + }, + { + "epoch": 0.9829287975665777, + "grad_norm": 0.16478337000165497, + "learning_rate": 8.502824339188359e-05, + "loss": 2.8714, + "step": 15834 + }, + { + "epoch": 0.9829908746663356, + "grad_norm": 0.15539070115967385, + "learning_rate": 8.502566615758189e-05, + "loss": 2.9668, + "step": 15835 + }, + { + "epoch": 0.9830529517660935, + "grad_norm": 0.16067669787018063, + "learning_rate": 8.50230887405428e-05, + "loss": 2.9477, + "step": 15836 + }, + { + "epoch": 0.9831150288658513, + "grad_norm": 0.20370272860349847, + "learning_rate": 8.502051114077976e-05, + "loss": 2.9381, + "step": 15837 + }, + { + "epoch": 0.9831771059656093, + "grad_norm": 0.18741979473020137, + "learning_rate": 8.501793335830619e-05, + "loss": 2.9712, + "step": 15838 + }, + { + "epoch": 0.9832391830653672, + "grad_norm": 0.1657890625118642, + "learning_rate": 8.501535539313556e-05, + "loss": 2.937, + "step": 15839 + }, + { + "epoch": 0.9833012601651251, + "grad_norm": 0.16220294021692933, + "learning_rate": 8.501277724528134e-05, + "loss": 2.9988, + "step": 15840 + }, + { + "epoch": 0.983363337264883, + "grad_norm": 0.16627603190837006, + "learning_rate": 8.501019891475695e-05, + "loss": 2.9026, + "step": 15841 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.15132871569595496, + "learning_rate": 8.500762040157586e-05, + "loss": 2.9028, + "step": 15842 + }, + { + "epoch": 0.9834874914643987, + "grad_norm": 0.17207906811160523, + "learning_rate": 8.500504170575153e-05, + "loss": 2.9723, + "step": 15843 + }, + { + "epoch": 0.9835495685641567, + "grad_norm": 0.15828638476095092, + "learning_rate": 8.500246282729739e-05, + "loss": 2.9705, + "step": 15844 + }, + { + "epoch": 0.9836116456639146, + "grad_norm": 0.15781981928064576, + "learning_rate": 8.499988376622687e-05, + "loss": 3.0611, + "step": 15845 + }, + { + "epoch": 0.9836737227636725, + "grad_norm": 0.16864227392973402, + "learning_rate": 8.49973045225535e-05, + "loss": 2.9297, + "step": 15846 + }, + { + "epoch": 0.9837357998634304, + "grad_norm": 0.1513772591962526, + "learning_rate": 8.499472509629067e-05, + "loss": 2.9742, + "step": 15847 + }, + { + "epoch": 0.9837978769631883, + "grad_norm": 0.2122589767504437, + "learning_rate": 8.499214548745188e-05, + "loss": 2.9225, + "step": 15848 + }, + { + "epoch": 0.9838599540629461, + "grad_norm": 0.1704781295615536, + "learning_rate": 8.498956569605057e-05, + "loss": 2.9577, + "step": 15849 + }, + { + "epoch": 0.983922031162704, + "grad_norm": 0.20559136789591217, + "learning_rate": 8.498698572210019e-05, + "loss": 2.8824, + "step": 15850 + }, + { + "epoch": 0.983984108262462, + "grad_norm": 0.17056014775149514, + "learning_rate": 8.498440556561424e-05, + "loss": 2.9021, + "step": 15851 + }, + { + "epoch": 0.9840461853622199, + "grad_norm": 0.17563483710416136, + "learning_rate": 8.498182522660613e-05, + "loss": 2.8995, + "step": 15852 + }, + { + "epoch": 0.9841082624619778, + "grad_norm": 0.18480609789245428, + "learning_rate": 8.497924470508935e-05, + "loss": 2.9307, + "step": 15853 + }, + { + "epoch": 0.9841703395617357, + "grad_norm": 0.16154998759436728, + "learning_rate": 8.497666400107736e-05, + "loss": 2.8761, + "step": 15854 + }, + { + "epoch": 0.9842324166614935, + "grad_norm": 0.18001082604687332, + "learning_rate": 8.497408311458362e-05, + "loss": 2.964, + "step": 15855 + }, + { + "epoch": 0.9842944937612514, + "grad_norm": 0.16843520313317614, + "learning_rate": 8.497150204562161e-05, + "loss": 2.9271, + "step": 15856 + }, + { + "epoch": 0.9843565708610094, + "grad_norm": 0.1626069120688119, + "learning_rate": 8.496892079420478e-05, + "loss": 2.962, + "step": 15857 + }, + { + "epoch": 0.9844186479607673, + "grad_norm": 0.17124093204875018, + "learning_rate": 8.496633936034659e-05, + "loss": 3.008, + "step": 15858 + }, + { + "epoch": 0.9844807250605252, + "grad_norm": 0.15455884062310354, + "learning_rate": 8.496375774406052e-05, + "loss": 2.9435, + "step": 15859 + }, + { + "epoch": 0.9845428021602831, + "grad_norm": 0.16213579996588381, + "learning_rate": 8.496117594536006e-05, + "loss": 2.8578, + "step": 15860 + }, + { + "epoch": 0.9846048792600409, + "grad_norm": 0.1793718052622775, + "learning_rate": 8.495859396425863e-05, + "loss": 3.0207, + "step": 15861 + }, + { + "epoch": 0.9846669563597988, + "grad_norm": 0.18956823882771398, + "learning_rate": 8.495601180076976e-05, + "loss": 3.0095, + "step": 15862 + }, + { + "epoch": 0.9847290334595568, + "grad_norm": 0.19741292684887285, + "learning_rate": 8.495342945490686e-05, + "loss": 2.9565, + "step": 15863 + }, + { + "epoch": 0.9847911105593147, + "grad_norm": 0.1981232290910824, + "learning_rate": 8.495084692668344e-05, + "loss": 2.9392, + "step": 15864 + }, + { + "epoch": 0.9848531876590726, + "grad_norm": 0.20376915762651418, + "learning_rate": 8.494826421611298e-05, + "loss": 3.0128, + "step": 15865 + }, + { + "epoch": 0.9849152647588305, + "grad_norm": 0.17447600551607362, + "learning_rate": 8.494568132320893e-05, + "loss": 2.8677, + "step": 15866 + }, + { + "epoch": 0.9849773418585883, + "grad_norm": 0.19963444554203602, + "learning_rate": 8.494309824798477e-05, + "loss": 2.879, + "step": 15867 + }, + { + "epoch": 0.9850394189583462, + "grad_norm": 0.15923049911484885, + "learning_rate": 8.4940514990454e-05, + "loss": 2.9397, + "step": 15868 + }, + { + "epoch": 0.9851014960581042, + "grad_norm": 0.1866865692977515, + "learning_rate": 8.493793155063008e-05, + "loss": 2.9581, + "step": 15869 + }, + { + "epoch": 0.9851635731578621, + "grad_norm": 0.1865766459362396, + "learning_rate": 8.493534792852648e-05, + "loss": 2.9978, + "step": 15870 + }, + { + "epoch": 0.98522565025762, + "grad_norm": 0.18133131989997173, + "learning_rate": 8.49327641241567e-05, + "loss": 2.9716, + "step": 15871 + }, + { + "epoch": 0.9852877273573779, + "grad_norm": 0.23012210347913284, + "learning_rate": 8.49301801375342e-05, + "loss": 3.0275, + "step": 15872 + }, + { + "epoch": 0.9853498044571357, + "grad_norm": 0.2005506757517966, + "learning_rate": 8.492759596867248e-05, + "loss": 2.9852, + "step": 15873 + }, + { + "epoch": 0.9854118815568936, + "grad_norm": 0.16949881581790213, + "learning_rate": 8.4925011617585e-05, + "loss": 2.9655, + "step": 15874 + }, + { + "epoch": 0.9854739586566515, + "grad_norm": 0.18375569161209304, + "learning_rate": 8.492242708428527e-05, + "loss": 3.0486, + "step": 15875 + }, + { + "epoch": 0.9855360357564095, + "grad_norm": 0.17715607345930243, + "learning_rate": 8.491984236878675e-05, + "loss": 2.971, + "step": 15876 + }, + { + "epoch": 0.9855981128561674, + "grad_norm": 0.18373420091100595, + "learning_rate": 8.491725747110294e-05, + "loss": 2.9973, + "step": 15877 + }, + { + "epoch": 0.9856601899559253, + "grad_norm": 0.29983668203208513, + "learning_rate": 8.491467239124733e-05, + "loss": 3.0066, + "step": 15878 + }, + { + "epoch": 0.9857222670556831, + "grad_norm": 0.19684221130898555, + "learning_rate": 8.491208712923337e-05, + "loss": 2.9689, + "step": 15879 + }, + { + "epoch": 0.985784344155441, + "grad_norm": 0.18536685442375073, + "learning_rate": 8.49095016850746e-05, + "loss": 2.8492, + "step": 15880 + }, + { + "epoch": 0.985846421255199, + "grad_norm": 0.23657587952112225, + "learning_rate": 8.490691605878447e-05, + "loss": 2.9272, + "step": 15881 + }, + { + "epoch": 0.9859084983549569, + "grad_norm": 0.18604040263630453, + "learning_rate": 8.49043302503765e-05, + "loss": 2.9753, + "step": 15882 + }, + { + "epoch": 0.9859705754547148, + "grad_norm": 0.18275971384167888, + "learning_rate": 8.490174425986417e-05, + "loss": 2.9424, + "step": 15883 + }, + { + "epoch": 0.9860326525544727, + "grad_norm": 0.20484009507422687, + "learning_rate": 8.489915808726095e-05, + "loss": 2.9628, + "step": 15884 + }, + { + "epoch": 0.9860947296542305, + "grad_norm": 0.26959986435313027, + "learning_rate": 8.489657173258037e-05, + "loss": 2.8945, + "step": 15885 + }, + { + "epoch": 0.9861568067539884, + "grad_norm": 0.2025773018740442, + "learning_rate": 8.489398519583588e-05, + "loss": 2.9632, + "step": 15886 + }, + { + "epoch": 0.9862188838537463, + "grad_norm": 0.21568736991834186, + "learning_rate": 8.489139847704102e-05, + "loss": 2.9819, + "step": 15887 + }, + { + "epoch": 0.9862809609535043, + "grad_norm": 0.1713513449221081, + "learning_rate": 8.488881157620925e-05, + "loss": 2.9838, + "step": 15888 + }, + { + "epoch": 0.9863430380532622, + "grad_norm": 0.20735324255767623, + "learning_rate": 8.488622449335408e-05, + "loss": 2.899, + "step": 15889 + }, + { + "epoch": 0.9864051151530201, + "grad_norm": 0.1787329905093082, + "learning_rate": 8.488363722848902e-05, + "loss": 3.0805, + "step": 15890 + }, + { + "epoch": 0.9864671922527779, + "grad_norm": 0.1849977297579163, + "learning_rate": 8.488104978162754e-05, + "loss": 2.8696, + "step": 15891 + }, + { + "epoch": 0.9865292693525358, + "grad_norm": 0.15516326402482072, + "learning_rate": 8.487846215278316e-05, + "loss": 2.9985, + "step": 15892 + }, + { + "epoch": 0.9865913464522937, + "grad_norm": 0.1982555290956824, + "learning_rate": 8.48758743419694e-05, + "loss": 2.9184, + "step": 15893 + }, + { + "epoch": 0.9866534235520517, + "grad_norm": 0.15790376479236995, + "learning_rate": 8.487328634919971e-05, + "loss": 2.9448, + "step": 15894 + }, + { + "epoch": 0.9867155006518096, + "grad_norm": 0.20411441160212485, + "learning_rate": 8.487069817448763e-05, + "loss": 3.0019, + "step": 15895 + }, + { + "epoch": 0.9867775777515675, + "grad_norm": 0.16947458854119737, + "learning_rate": 8.486810981784664e-05, + "loss": 2.9807, + "step": 15896 + }, + { + "epoch": 0.9868396548513253, + "grad_norm": 0.17750500980882952, + "learning_rate": 8.486552127929026e-05, + "loss": 3.0068, + "step": 15897 + }, + { + "epoch": 0.9869017319510832, + "grad_norm": 0.16386828962098646, + "learning_rate": 8.4862932558832e-05, + "loss": 2.9602, + "step": 15898 + }, + { + "epoch": 0.9869638090508411, + "grad_norm": 0.17687227958096838, + "learning_rate": 8.486034365648536e-05, + "loss": 2.9637, + "step": 15899 + }, + { + "epoch": 0.987025886150599, + "grad_norm": 0.1685657482120671, + "learning_rate": 8.485775457226383e-05, + "loss": 2.8554, + "step": 15900 + }, + { + "epoch": 0.987087963250357, + "grad_norm": 0.22124904866741418, + "learning_rate": 8.485516530618095e-05, + "loss": 2.9383, + "step": 15901 + }, + { + "epoch": 0.9871500403501149, + "grad_norm": 0.1747090968036035, + "learning_rate": 8.485257585825021e-05, + "loss": 2.9982, + "step": 15902 + }, + { + "epoch": 0.9872121174498727, + "grad_norm": 0.15614773861009176, + "learning_rate": 8.484998622848511e-05, + "loss": 2.9745, + "step": 15903 + }, + { + "epoch": 0.9872741945496306, + "grad_norm": 0.1801834621704767, + "learning_rate": 8.484739641689918e-05, + "loss": 2.9831, + "step": 15904 + }, + { + "epoch": 0.9873362716493885, + "grad_norm": 0.192326038716525, + "learning_rate": 8.484480642350592e-05, + "loss": 2.9817, + "step": 15905 + }, + { + "epoch": 0.9873983487491464, + "grad_norm": 0.157705052783446, + "learning_rate": 8.484221624831885e-05, + "loss": 2.8691, + "step": 15906 + }, + { + "epoch": 0.9874604258489044, + "grad_norm": 0.1780086329635654, + "learning_rate": 8.483962589135146e-05, + "loss": 2.9359, + "step": 15907 + }, + { + "epoch": 0.9875225029486623, + "grad_norm": 0.16309658746723435, + "learning_rate": 8.48370353526173e-05, + "loss": 2.9035, + "step": 15908 + }, + { + "epoch": 0.9875845800484201, + "grad_norm": 0.207528031119009, + "learning_rate": 8.483444463212988e-05, + "loss": 2.9273, + "step": 15909 + }, + { + "epoch": 0.987646657148178, + "grad_norm": 0.27266059587835345, + "learning_rate": 8.483185372990269e-05, + "loss": 2.9714, + "step": 15910 + }, + { + "epoch": 0.9877087342479359, + "grad_norm": 0.20328312001405446, + "learning_rate": 8.482926264594927e-05, + "loss": 2.9044, + "step": 15911 + }, + { + "epoch": 0.9877708113476938, + "grad_norm": 0.1979028023498244, + "learning_rate": 8.482667138028313e-05, + "loss": 3.0337, + "step": 15912 + }, + { + "epoch": 0.9878328884474518, + "grad_norm": 0.19104664946505573, + "learning_rate": 8.482407993291779e-05, + "loss": 2.9711, + "step": 15913 + }, + { + "epoch": 0.9878949655472097, + "grad_norm": 0.2241291302339232, + "learning_rate": 8.482148830386676e-05, + "loss": 2.997, + "step": 15914 + }, + { + "epoch": 0.9879570426469675, + "grad_norm": 0.2084224669198344, + "learning_rate": 8.481889649314358e-05, + "loss": 2.8144, + "step": 15915 + }, + { + "epoch": 0.9880191197467254, + "grad_norm": 0.18471236035804506, + "learning_rate": 8.481630450076177e-05, + "loss": 2.9315, + "step": 15916 + }, + { + "epoch": 0.9880811968464833, + "grad_norm": 0.17959093526038675, + "learning_rate": 8.481371232673485e-05, + "loss": 2.9762, + "step": 15917 + }, + { + "epoch": 0.9881432739462412, + "grad_norm": 0.18578301511524356, + "learning_rate": 8.481111997107634e-05, + "loss": 3.0266, + "step": 15918 + }, + { + "epoch": 0.9882053510459992, + "grad_norm": 0.17373337467158467, + "learning_rate": 8.480852743379977e-05, + "loss": 2.9914, + "step": 15919 + }, + { + "epoch": 0.9882674281457571, + "grad_norm": 0.1809454093284903, + "learning_rate": 8.480593471491865e-05, + "loss": 2.8775, + "step": 15920 + }, + { + "epoch": 0.9883295052455149, + "grad_norm": 0.17693112765041188, + "learning_rate": 8.480334181444653e-05, + "loss": 2.9534, + "step": 15921 + }, + { + "epoch": 0.9883915823452728, + "grad_norm": 0.1626262925966749, + "learning_rate": 8.480074873239691e-05, + "loss": 2.9022, + "step": 15922 + }, + { + "epoch": 0.9884536594450307, + "grad_norm": 0.2088223301475596, + "learning_rate": 8.479815546878335e-05, + "loss": 3.0439, + "step": 15923 + }, + { + "epoch": 0.9885157365447886, + "grad_norm": 0.17716086784584756, + "learning_rate": 8.479556202361938e-05, + "loss": 2.9373, + "step": 15924 + }, + { + "epoch": 0.9885778136445466, + "grad_norm": 0.2042627109094162, + "learning_rate": 8.479296839691849e-05, + "loss": 2.9122, + "step": 15925 + }, + { + "epoch": 0.9886398907443045, + "grad_norm": 0.16982476978530678, + "learning_rate": 8.479037458869426e-05, + "loss": 2.9673, + "step": 15926 + }, + { + "epoch": 0.9887019678440623, + "grad_norm": 0.16253431191775514, + "learning_rate": 8.478778059896017e-05, + "loss": 3.0165, + "step": 15927 + }, + { + "epoch": 0.9887640449438202, + "grad_norm": 0.23568882658278334, + "learning_rate": 8.478518642772982e-05, + "loss": 2.9308, + "step": 15928 + }, + { + "epoch": 0.9888261220435781, + "grad_norm": 0.16501682176779975, + "learning_rate": 8.478259207501668e-05, + "loss": 3.0377, + "step": 15929 + }, + { + "epoch": 0.988888199143336, + "grad_norm": 0.1744219463142865, + "learning_rate": 8.477999754083433e-05, + "loss": 3.0398, + "step": 15930 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.17902136767974391, + "learning_rate": 8.477740282519626e-05, + "loss": 2.9005, + "step": 15931 + }, + { + "epoch": 0.9890123533428519, + "grad_norm": 0.17676985683819596, + "learning_rate": 8.477480792811608e-05, + "loss": 2.9651, + "step": 15932 + }, + { + "epoch": 0.9890744304426097, + "grad_norm": 0.1929280257399661, + "learning_rate": 8.477221284960726e-05, + "loss": 2.9479, + "step": 15933 + }, + { + "epoch": 0.9891365075423676, + "grad_norm": 0.1582036312707406, + "learning_rate": 8.476961758968336e-05, + "loss": 3.037, + "step": 15934 + }, + { + "epoch": 0.9891985846421255, + "grad_norm": 0.2152857135947321, + "learning_rate": 8.476702214835793e-05, + "loss": 2.9434, + "step": 15935 + }, + { + "epoch": 0.9892606617418834, + "grad_norm": 0.22629850726417036, + "learning_rate": 8.476442652564452e-05, + "loss": 2.8506, + "step": 15936 + }, + { + "epoch": 0.9893227388416413, + "grad_norm": 0.2021340627673273, + "learning_rate": 8.476183072155663e-05, + "loss": 2.9357, + "step": 15937 + }, + { + "epoch": 0.9893848159413993, + "grad_norm": 0.15117307337204616, + "learning_rate": 8.475923473610785e-05, + "loss": 2.9481, + "step": 15938 + }, + { + "epoch": 0.9894468930411571, + "grad_norm": 0.20846499316067396, + "learning_rate": 8.475663856931171e-05, + "loss": 2.974, + "step": 15939 + }, + { + "epoch": 0.989508970140915, + "grad_norm": 0.16374186564763818, + "learning_rate": 8.475404222118173e-05, + "loss": 2.8881, + "step": 15940 + }, + { + "epoch": 0.9895710472406729, + "grad_norm": 0.19409992512891153, + "learning_rate": 8.475144569173148e-05, + "loss": 2.9409, + "step": 15941 + }, + { + "epoch": 0.9896331243404308, + "grad_norm": 0.1733440774749804, + "learning_rate": 8.474884898097452e-05, + "loss": 2.9501, + "step": 15942 + }, + { + "epoch": 0.9896952014401887, + "grad_norm": 0.186475847574275, + "learning_rate": 8.474625208892437e-05, + "loss": 2.8795, + "step": 15943 + }, + { + "epoch": 0.9897572785399467, + "grad_norm": 0.20284744703678778, + "learning_rate": 8.474365501559456e-05, + "loss": 2.9949, + "step": 15944 + }, + { + "epoch": 0.9898193556397045, + "grad_norm": 0.18171916822219703, + "learning_rate": 8.47410577609987e-05, + "loss": 2.9322, + "step": 15945 + }, + { + "epoch": 0.9898814327394624, + "grad_norm": 0.17055710083434983, + "learning_rate": 8.47384603251503e-05, + "loss": 2.9671, + "step": 15946 + }, + { + "epoch": 0.9899435098392203, + "grad_norm": 0.17318364479904824, + "learning_rate": 8.473586270806291e-05, + "loss": 2.9201, + "step": 15947 + }, + { + "epoch": 0.9900055869389782, + "grad_norm": 0.21667394503570664, + "learning_rate": 8.47332649097501e-05, + "loss": 3.0543, + "step": 15948 + }, + { + "epoch": 0.9900676640387361, + "grad_norm": 0.18592622025520109, + "learning_rate": 8.473066693022543e-05, + "loss": 2.8927, + "step": 15949 + }, + { + "epoch": 0.9901297411384941, + "grad_norm": 0.16180905133731469, + "learning_rate": 8.472806876950241e-05, + "loss": 2.8887, + "step": 15950 + }, + { + "epoch": 0.9901918182382519, + "grad_norm": 0.17748322400884275, + "learning_rate": 8.472547042759464e-05, + "loss": 2.9736, + "step": 15951 + }, + { + "epoch": 0.9902538953380098, + "grad_norm": 0.16983321496739753, + "learning_rate": 8.472287190451566e-05, + "loss": 3.0214, + "step": 15952 + }, + { + "epoch": 0.9903159724377677, + "grad_norm": 0.22213387699346407, + "learning_rate": 8.472027320027904e-05, + "loss": 3.0385, + "step": 15953 + }, + { + "epoch": 0.9903780495375256, + "grad_norm": 0.15305151124740554, + "learning_rate": 8.47176743148983e-05, + "loss": 2.8906, + "step": 15954 + }, + { + "epoch": 0.9904401266372835, + "grad_norm": 0.19083363653419874, + "learning_rate": 8.471507524838705e-05, + "loss": 2.9063, + "step": 15955 + }, + { + "epoch": 0.9905022037370415, + "grad_norm": 0.193947101135304, + "learning_rate": 8.47124760007588e-05, + "loss": 2.9627, + "step": 15956 + }, + { + "epoch": 0.9905642808367993, + "grad_norm": 0.1751410588492129, + "learning_rate": 8.470987657202715e-05, + "loss": 3.0159, + "step": 15957 + }, + { + "epoch": 0.9906263579365572, + "grad_norm": 0.1656301904710743, + "learning_rate": 8.470727696220567e-05, + "loss": 2.8932, + "step": 15958 + }, + { + "epoch": 0.9906884350363151, + "grad_norm": 0.17959273990953073, + "learning_rate": 8.470467717130787e-05, + "loss": 2.966, + "step": 15959 + }, + { + "epoch": 0.990750512136073, + "grad_norm": 0.16053337959316144, + "learning_rate": 8.470207719934735e-05, + "loss": 2.9927, + "step": 15960 + }, + { + "epoch": 0.9908125892358309, + "grad_norm": 0.14712982392161006, + "learning_rate": 8.469947704633768e-05, + "loss": 2.9015, + "step": 15961 + }, + { + "epoch": 0.9908746663355888, + "grad_norm": 0.2087059037381352, + "learning_rate": 8.469687671229238e-05, + "loss": 2.9711, + "step": 15962 + }, + { + "epoch": 0.9909367434353467, + "grad_norm": 0.19989268061060175, + "learning_rate": 8.469427619722508e-05, + "loss": 3.0445, + "step": 15963 + }, + { + "epoch": 0.9909988205351046, + "grad_norm": 0.15400615296209638, + "learning_rate": 8.469167550114933e-05, + "loss": 2.8575, + "step": 15964 + }, + { + "epoch": 0.9910608976348625, + "grad_norm": 0.17476048256918564, + "learning_rate": 8.468907462407867e-05, + "loss": 2.9901, + "step": 15965 + }, + { + "epoch": 0.9911229747346204, + "grad_norm": 0.15264397772907237, + "learning_rate": 8.468647356602669e-05, + "loss": 2.9809, + "step": 15966 + }, + { + "epoch": 0.9911850518343783, + "grad_norm": 0.19328321055016984, + "learning_rate": 8.468387232700696e-05, + "loss": 2.8884, + "step": 15967 + }, + { + "epoch": 0.9912471289341362, + "grad_norm": 0.16063382887046784, + "learning_rate": 8.468127090703304e-05, + "loss": 3.0261, + "step": 15968 + }, + { + "epoch": 0.991309206033894, + "grad_norm": 0.17347416039229455, + "learning_rate": 8.467866930611852e-05, + "loss": 3.0517, + "step": 15969 + }, + { + "epoch": 0.991371283133652, + "grad_norm": 0.1864978313345524, + "learning_rate": 8.467606752427696e-05, + "loss": 2.8628, + "step": 15970 + }, + { + "epoch": 0.9914333602334099, + "grad_norm": 0.17145096425153536, + "learning_rate": 8.467346556152193e-05, + "loss": 2.9765, + "step": 15971 + }, + { + "epoch": 0.9914954373331678, + "grad_norm": 0.16441424447065647, + "learning_rate": 8.467086341786703e-05, + "loss": 2.8658, + "step": 15972 + }, + { + "epoch": 0.9915575144329257, + "grad_norm": 0.1623032570969845, + "learning_rate": 8.46682610933258e-05, + "loss": 2.9587, + "step": 15973 + }, + { + "epoch": 0.9916195915326836, + "grad_norm": 0.2684093012448117, + "learning_rate": 8.466565858791186e-05, + "loss": 3.0036, + "step": 15974 + }, + { + "epoch": 0.9916816686324414, + "grad_norm": 0.15534131347941715, + "learning_rate": 8.466305590163875e-05, + "loss": 3.0145, + "step": 15975 + }, + { + "epoch": 0.9917437457321994, + "grad_norm": 0.18210204798271085, + "learning_rate": 8.466045303452006e-05, + "loss": 3.0026, + "step": 15976 + }, + { + "epoch": 0.9918058228319573, + "grad_norm": 0.16794323727447583, + "learning_rate": 8.465784998656936e-05, + "loss": 3.0057, + "step": 15977 + }, + { + "epoch": 0.9918678999317152, + "grad_norm": 0.1703317620382772, + "learning_rate": 8.465524675780025e-05, + "loss": 2.8885, + "step": 15978 + }, + { + "epoch": 0.9919299770314731, + "grad_norm": 0.1793487610882108, + "learning_rate": 8.465264334822632e-05, + "loss": 3.0, + "step": 15979 + }, + { + "epoch": 0.991992054131231, + "grad_norm": 0.1754208048149662, + "learning_rate": 8.465003975786113e-05, + "loss": 2.9923, + "step": 15980 + }, + { + "epoch": 0.9920541312309888, + "grad_norm": 0.1629609811362977, + "learning_rate": 8.464743598671828e-05, + "loss": 2.9505, + "step": 15981 + }, + { + "epoch": 0.9921162083307468, + "grad_norm": 0.23301870187925117, + "learning_rate": 8.464483203481133e-05, + "loss": 3.0103, + "step": 15982 + }, + { + "epoch": 0.9921782854305047, + "grad_norm": 0.23873555540408078, + "learning_rate": 8.464222790215388e-05, + "loss": 3.0069, + "step": 15983 + }, + { + "epoch": 0.9922403625302626, + "grad_norm": 0.1808710201625692, + "learning_rate": 8.463962358875953e-05, + "loss": 2.992, + "step": 15984 + }, + { + "epoch": 0.9923024396300205, + "grad_norm": 0.17655089686254255, + "learning_rate": 8.463701909464185e-05, + "loss": 2.9772, + "step": 15985 + }, + { + "epoch": 0.9923645167297784, + "grad_norm": 0.17093369801388514, + "learning_rate": 8.463441441981444e-05, + "loss": 2.9435, + "step": 15986 + }, + { + "epoch": 0.9924265938295362, + "grad_norm": 0.18338940794210135, + "learning_rate": 8.463180956429086e-05, + "loss": 2.9619, + "step": 15987 + }, + { + "epoch": 0.9924886709292942, + "grad_norm": 0.15280273571100952, + "learning_rate": 8.462920452808474e-05, + "loss": 3.0154, + "step": 15988 + }, + { + "epoch": 0.9925507480290521, + "grad_norm": 0.17267290677984415, + "learning_rate": 8.462659931120965e-05, + "loss": 2.9336, + "step": 15989 + }, + { + "epoch": 0.99261282512881, + "grad_norm": 0.1710540553427769, + "learning_rate": 8.462399391367919e-05, + "loss": 2.9982, + "step": 15990 + }, + { + "epoch": 0.9926749022285679, + "grad_norm": 0.16598835328441033, + "learning_rate": 8.462138833550694e-05, + "loss": 3.0603, + "step": 15991 + }, + { + "epoch": 0.9927369793283258, + "grad_norm": 0.18917181819008427, + "learning_rate": 8.46187825767065e-05, + "loss": 2.9794, + "step": 15992 + }, + { + "epoch": 0.9927990564280836, + "grad_norm": 0.1575222019884389, + "learning_rate": 8.46161766372915e-05, + "loss": 2.8678, + "step": 15993 + }, + { + "epoch": 0.9928611335278416, + "grad_norm": 0.15988922537798048, + "learning_rate": 8.461357051727546e-05, + "loss": 2.9734, + "step": 15994 + }, + { + "epoch": 0.9929232106275995, + "grad_norm": 0.19254162812027928, + "learning_rate": 8.461096421667203e-05, + "loss": 2.9429, + "step": 15995 + }, + { + "epoch": 0.9929852877273574, + "grad_norm": 0.1571723180988944, + "learning_rate": 8.46083577354948e-05, + "loss": 2.9571, + "step": 15996 + }, + { + "epoch": 0.9930473648271153, + "grad_norm": 0.18273461997874826, + "learning_rate": 8.460575107375738e-05, + "loss": 3.0105, + "step": 15997 + }, + { + "epoch": 0.9931094419268732, + "grad_norm": 0.17902139889353158, + "learning_rate": 8.460314423147334e-05, + "loss": 2.883, + "step": 15998 + }, + { + "epoch": 0.993171519026631, + "grad_norm": 0.16972725234862718, + "learning_rate": 8.46005372086563e-05, + "loss": 3.0144, + "step": 15999 + }, + { + "epoch": 0.993233596126389, + "grad_norm": 0.19472394958702044, + "learning_rate": 8.459793000531986e-05, + "loss": 2.9603, + "step": 16000 + }, + { + "epoch": 0.9932956732261469, + "grad_norm": 0.15764398429763055, + "learning_rate": 8.459532262147763e-05, + "loss": 2.9691, + "step": 16001 + }, + { + "epoch": 0.9933577503259048, + "grad_norm": 0.1698806372841177, + "learning_rate": 8.45927150571432e-05, + "loss": 2.8819, + "step": 16002 + }, + { + "epoch": 0.9934198274256627, + "grad_norm": 0.24036056026812985, + "learning_rate": 8.459010731233017e-05, + "loss": 2.9046, + "step": 16003 + }, + { + "epoch": 0.9934819045254206, + "grad_norm": 0.17215122437910066, + "learning_rate": 8.458749938705215e-05, + "loss": 2.9442, + "step": 16004 + }, + { + "epoch": 0.9935439816251784, + "grad_norm": 0.16702174878730983, + "learning_rate": 8.458489128132275e-05, + "loss": 2.9267, + "step": 16005 + }, + { + "epoch": 0.9936060587249363, + "grad_norm": 0.17497364595066153, + "learning_rate": 8.458228299515559e-05, + "loss": 2.9746, + "step": 16006 + }, + { + "epoch": 0.9936681358246943, + "grad_norm": 0.24188686754501698, + "learning_rate": 8.457967452856426e-05, + "loss": 2.8906, + "step": 16007 + }, + { + "epoch": 0.9937302129244522, + "grad_norm": 0.17596808147421983, + "learning_rate": 8.457706588156236e-05, + "loss": 2.9957, + "step": 16008 + }, + { + "epoch": 0.9937922900242101, + "grad_norm": 0.15266760003016805, + "learning_rate": 8.457445705416351e-05, + "loss": 2.9314, + "step": 16009 + }, + { + "epoch": 0.993854367123968, + "grad_norm": 0.19834761799198672, + "learning_rate": 8.457184804638134e-05, + "loss": 3.0053, + "step": 16010 + }, + { + "epoch": 0.9939164442237258, + "grad_norm": 0.15868378254110813, + "learning_rate": 8.456923885822944e-05, + "loss": 2.9693, + "step": 16011 + }, + { + "epoch": 0.9939785213234837, + "grad_norm": 0.1860581831810909, + "learning_rate": 8.456662948972143e-05, + "loss": 2.9676, + "step": 16012 + }, + { + "epoch": 0.9940405984232417, + "grad_norm": 0.19340017639623525, + "learning_rate": 8.456401994087093e-05, + "loss": 2.914, + "step": 16013 + }, + { + "epoch": 0.9941026755229996, + "grad_norm": 0.19819916883226024, + "learning_rate": 8.456141021169154e-05, + "loss": 2.989, + "step": 16014 + }, + { + "epoch": 0.9941647526227575, + "grad_norm": 0.17405841338909533, + "learning_rate": 8.455880030219688e-05, + "loss": 2.9456, + "step": 16015 + }, + { + "epoch": 0.9942268297225154, + "grad_norm": 0.18373409953381792, + "learning_rate": 8.455619021240058e-05, + "loss": 2.9294, + "step": 16016 + }, + { + "epoch": 0.9942889068222732, + "grad_norm": 0.20262589942819578, + "learning_rate": 8.455357994231624e-05, + "loss": 3.0295, + "step": 16017 + }, + { + "epoch": 0.9943509839220311, + "grad_norm": 0.24195228198369265, + "learning_rate": 8.45509694919575e-05, + "loss": 2.9534, + "step": 16018 + }, + { + "epoch": 0.9944130610217891, + "grad_norm": 0.3587350745535526, + "learning_rate": 8.454835886133793e-05, + "loss": 2.8653, + "step": 16019 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.1531458426906974, + "learning_rate": 8.454574805047121e-05, + "loss": 2.9009, + "step": 16020 + }, + { + "epoch": 0.9945372152213049, + "grad_norm": 0.19930767430827084, + "learning_rate": 8.454313705937094e-05, + "loss": 2.9459, + "step": 16021 + }, + { + "epoch": 0.9945992923210628, + "grad_norm": 0.15940533774667717, + "learning_rate": 8.454052588805073e-05, + "loss": 3.0002, + "step": 16022 + }, + { + "epoch": 0.9946613694208206, + "grad_norm": 0.1510529794786324, + "learning_rate": 8.45379145365242e-05, + "loss": 2.8723, + "step": 16023 + }, + { + "epoch": 0.9947234465205785, + "grad_norm": 0.20465710428828204, + "learning_rate": 8.4535303004805e-05, + "loss": 3.0258, + "step": 16024 + }, + { + "epoch": 0.9947855236203365, + "grad_norm": 0.18765156302526612, + "learning_rate": 8.453269129290674e-05, + "loss": 2.904, + "step": 16025 + }, + { + "epoch": 0.9948476007200944, + "grad_norm": 0.16571903663148646, + "learning_rate": 8.453007940084303e-05, + "loss": 2.9398, + "step": 16026 + }, + { + "epoch": 0.9949096778198523, + "grad_norm": 0.16872465464318817, + "learning_rate": 8.452746732862754e-05, + "loss": 3.0063, + "step": 16027 + }, + { + "epoch": 0.9949717549196102, + "grad_norm": 0.18060654420512864, + "learning_rate": 8.452485507627386e-05, + "loss": 2.9209, + "step": 16028 + }, + { + "epoch": 0.995033832019368, + "grad_norm": 0.18435622216632577, + "learning_rate": 8.452224264379563e-05, + "loss": 2.9323, + "step": 16029 + }, + { + "epoch": 0.9950959091191259, + "grad_norm": 0.16951736446067972, + "learning_rate": 8.451963003120647e-05, + "loss": 2.9804, + "step": 16030 + }, + { + "epoch": 0.9951579862188838, + "grad_norm": 0.15551632052741005, + "learning_rate": 8.451701723852002e-05, + "loss": 2.9835, + "step": 16031 + }, + { + "epoch": 0.9952200633186418, + "grad_norm": 0.16742827266622023, + "learning_rate": 8.451440426574993e-05, + "loss": 2.937, + "step": 16032 + }, + { + "epoch": 0.9952821404183997, + "grad_norm": 0.19087046937243346, + "learning_rate": 8.45117911129098e-05, + "loss": 2.9459, + "step": 16033 + }, + { + "epoch": 0.9953442175181576, + "grad_norm": 0.20184288336317824, + "learning_rate": 8.450917778001329e-05, + "loss": 2.9645, + "step": 16034 + }, + { + "epoch": 0.9954062946179154, + "grad_norm": 0.188302676378583, + "learning_rate": 8.450656426707399e-05, + "loss": 2.8691, + "step": 16035 + }, + { + "epoch": 0.9954683717176733, + "grad_norm": 0.16398000346041228, + "learning_rate": 8.450395057410561e-05, + "loss": 2.893, + "step": 16036 + }, + { + "epoch": 0.9955304488174312, + "grad_norm": 0.19862395672857264, + "learning_rate": 8.450133670112172e-05, + "loss": 2.845, + "step": 16037 + }, + { + "epoch": 0.9955925259171892, + "grad_norm": 0.1652804984430661, + "learning_rate": 8.449872264813598e-05, + "loss": 2.8668, + "step": 16038 + }, + { + "epoch": 0.9956546030169471, + "grad_norm": 0.17674673161627721, + "learning_rate": 8.449610841516205e-05, + "loss": 2.9421, + "step": 16039 + }, + { + "epoch": 0.995716680116705, + "grad_norm": 0.15666205789904974, + "learning_rate": 8.449349400221353e-05, + "loss": 2.9147, + "step": 16040 + }, + { + "epoch": 0.9957787572164628, + "grad_norm": 0.26545416723998, + "learning_rate": 8.449087940930407e-05, + "loss": 2.9051, + "step": 16041 + }, + { + "epoch": 0.9958408343162207, + "grad_norm": 0.1589844038211131, + "learning_rate": 8.448826463644734e-05, + "loss": 3.0113, + "step": 16042 + }, + { + "epoch": 0.9959029114159786, + "grad_norm": 0.1755966170545856, + "learning_rate": 8.448564968365694e-05, + "loss": 2.9492, + "step": 16043 + }, + { + "epoch": 0.9959649885157366, + "grad_norm": 0.16191438069789713, + "learning_rate": 8.448303455094655e-05, + "loss": 2.993, + "step": 16044 + }, + { + "epoch": 0.9960270656154945, + "grad_norm": 0.18413218212234236, + "learning_rate": 8.448041923832979e-05, + "loss": 2.8529, + "step": 16045 + }, + { + "epoch": 0.9960891427152524, + "grad_norm": 0.2104466785534276, + "learning_rate": 8.447780374582032e-05, + "loss": 2.9561, + "step": 16046 + }, + { + "epoch": 0.9961512198150102, + "grad_norm": 0.18891425034562695, + "learning_rate": 8.447518807343178e-05, + "loss": 2.8508, + "step": 16047 + }, + { + "epoch": 0.9962132969147681, + "grad_norm": 0.15824006085083844, + "learning_rate": 8.447257222117779e-05, + "loss": 2.909, + "step": 16048 + }, + { + "epoch": 0.996275374014526, + "grad_norm": 0.18743553642931102, + "learning_rate": 8.446995618907205e-05, + "loss": 3.0182, + "step": 16049 + }, + { + "epoch": 0.996337451114284, + "grad_norm": 0.16512214668539055, + "learning_rate": 8.446733997712817e-05, + "loss": 3.0311, + "step": 16050 + }, + { + "epoch": 0.9963995282140419, + "grad_norm": 0.19217964520723965, + "learning_rate": 8.446472358535981e-05, + "loss": 2.9421, + "step": 16051 + }, + { + "epoch": 0.9964616053137998, + "grad_norm": 0.16250414636713909, + "learning_rate": 8.446210701378063e-05, + "loss": 2.9999, + "step": 16052 + }, + { + "epoch": 0.9965236824135576, + "grad_norm": 0.16021615745510867, + "learning_rate": 8.445949026240425e-05, + "loss": 2.9148, + "step": 16053 + }, + { + "epoch": 0.9965857595133155, + "grad_norm": 0.1644611793865302, + "learning_rate": 8.445687333124436e-05, + "loss": 2.9743, + "step": 16054 + }, + { + "epoch": 0.9966478366130734, + "grad_norm": 0.19201857074132822, + "learning_rate": 8.445425622031459e-05, + "loss": 2.9373, + "step": 16055 + }, + { + "epoch": 0.9967099137128314, + "grad_norm": 0.14541855577095286, + "learning_rate": 8.445163892962861e-05, + "loss": 2.9499, + "step": 16056 + }, + { + "epoch": 0.9967719908125893, + "grad_norm": 0.15528438337599745, + "learning_rate": 8.444902145920006e-05, + "loss": 2.8238, + "step": 16057 + }, + { + "epoch": 0.9968340679123472, + "grad_norm": 0.189431798855441, + "learning_rate": 8.444640380904261e-05, + "loss": 2.9261, + "step": 16058 + }, + { + "epoch": 0.996896145012105, + "grad_norm": 0.1899483392622019, + "learning_rate": 8.44437859791699e-05, + "loss": 2.9728, + "step": 16059 + }, + { + "epoch": 0.9969582221118629, + "grad_norm": 0.1689654046167191, + "learning_rate": 8.44411679695956e-05, + "loss": 2.8705, + "step": 16060 + }, + { + "epoch": 0.9970202992116208, + "grad_norm": 0.19029640032565515, + "learning_rate": 8.443854978033337e-05, + "loss": 2.8903, + "step": 16061 + }, + { + "epoch": 0.9970823763113787, + "grad_norm": 0.16470392045781737, + "learning_rate": 8.443593141139685e-05, + "loss": 3.0235, + "step": 16062 + }, + { + "epoch": 0.9971444534111367, + "grad_norm": 0.20573754333793692, + "learning_rate": 8.443331286279974e-05, + "loss": 2.8614, + "step": 16063 + }, + { + "epoch": 0.9972065305108946, + "grad_norm": 0.16226901386489165, + "learning_rate": 8.443069413455566e-05, + "loss": 3.0089, + "step": 16064 + }, + { + "epoch": 0.9972686076106524, + "grad_norm": 0.23100405288986556, + "learning_rate": 8.44280752266783e-05, + "loss": 2.9962, + "step": 16065 + }, + { + "epoch": 0.9973306847104103, + "grad_norm": 0.19728624594900798, + "learning_rate": 8.442545613918131e-05, + "loss": 2.9863, + "step": 16066 + }, + { + "epoch": 0.9973927618101682, + "grad_norm": 0.18600042023854826, + "learning_rate": 8.442283687207835e-05, + "loss": 2.8424, + "step": 16067 + }, + { + "epoch": 0.9974548389099261, + "grad_norm": 0.22571149032159066, + "learning_rate": 8.442021742538311e-05, + "loss": 2.9802, + "step": 16068 + }, + { + "epoch": 0.9975169160096841, + "grad_norm": 0.21866030216428542, + "learning_rate": 8.441759779910922e-05, + "loss": 2.9287, + "step": 16069 + }, + { + "epoch": 0.997578993109442, + "grad_norm": 0.1953105354210277, + "learning_rate": 8.441497799327039e-05, + "loss": 2.9958, + "step": 16070 + }, + { + "epoch": 0.9976410702091998, + "grad_norm": 0.2986699976202264, + "learning_rate": 8.441235800788025e-05, + "loss": 3.0231, + "step": 16071 + }, + { + "epoch": 0.9977031473089577, + "grad_norm": 0.2289593293293008, + "learning_rate": 8.440973784295249e-05, + "loss": 2.9038, + "step": 16072 + }, + { + "epoch": 0.9977652244087156, + "grad_norm": 0.25133917177425685, + "learning_rate": 8.440711749850076e-05, + "loss": 2.9551, + "step": 16073 + }, + { + "epoch": 0.9978273015084735, + "grad_norm": 0.2096909850181993, + "learning_rate": 8.440449697453878e-05, + "loss": 2.9099, + "step": 16074 + }, + { + "epoch": 0.9978893786082315, + "grad_norm": 0.21196011513116766, + "learning_rate": 8.440187627108016e-05, + "loss": 3.0544, + "step": 16075 + }, + { + "epoch": 0.9979514557079894, + "grad_norm": 0.20383794999924745, + "learning_rate": 8.43992553881386e-05, + "loss": 2.9196, + "step": 16076 + }, + { + "epoch": 0.9980135328077472, + "grad_norm": 0.182429558961761, + "learning_rate": 8.439663432572778e-05, + "loss": 2.8212, + "step": 16077 + }, + { + "epoch": 0.9980756099075051, + "grad_norm": 0.21209757194206133, + "learning_rate": 8.439401308386137e-05, + "loss": 3.0392, + "step": 16078 + }, + { + "epoch": 0.998137687007263, + "grad_norm": 0.18260035717768378, + "learning_rate": 8.439139166255303e-05, + "loss": 2.9066, + "step": 16079 + }, + { + "epoch": 0.9981997641070209, + "grad_norm": 0.26680768606865546, + "learning_rate": 8.438877006181647e-05, + "loss": 2.8548, + "step": 16080 + }, + { + "epoch": 0.9982618412067789, + "grad_norm": 0.25249116211711325, + "learning_rate": 8.438614828166535e-05, + "loss": 2.9933, + "step": 16081 + }, + { + "epoch": 0.9983239183065368, + "grad_norm": 0.20762679076929655, + "learning_rate": 8.438352632211334e-05, + "loss": 2.9617, + "step": 16082 + }, + { + "epoch": 0.9983859954062946, + "grad_norm": 0.1806735268253152, + "learning_rate": 8.438090418317412e-05, + "loss": 2.9468, + "step": 16083 + }, + { + "epoch": 0.9984480725060525, + "grad_norm": 0.18986783404699906, + "learning_rate": 8.437828186486139e-05, + "loss": 3.0638, + "step": 16084 + }, + { + "epoch": 0.9985101496058104, + "grad_norm": 0.26031625083245186, + "learning_rate": 8.437565936718881e-05, + "loss": 3.0215, + "step": 16085 + }, + { + "epoch": 0.9985722267055683, + "grad_norm": 0.20082340799797857, + "learning_rate": 8.437303669017007e-05, + "loss": 3.0441, + "step": 16086 + }, + { + "epoch": 0.9986343038053263, + "grad_norm": 0.20333502072242562, + "learning_rate": 8.437041383381885e-05, + "loss": 3.0303, + "step": 16087 + }, + { + "epoch": 0.9986963809050842, + "grad_norm": 0.19096909445792587, + "learning_rate": 8.436779079814883e-05, + "loss": 2.9291, + "step": 16088 + }, + { + "epoch": 0.998758458004842, + "grad_norm": 0.18568244799710565, + "learning_rate": 8.436516758317372e-05, + "loss": 2.9439, + "step": 16089 + }, + { + "epoch": 0.9988205351045999, + "grad_norm": 0.18121986467492593, + "learning_rate": 8.436254418890718e-05, + "loss": 2.8601, + "step": 16090 + }, + { + "epoch": 0.9988826122043578, + "grad_norm": 0.18025166637348783, + "learning_rate": 8.435992061536291e-05, + "loss": 2.961, + "step": 16091 + }, + { + "epoch": 0.9989446893041157, + "grad_norm": 0.18080763395982974, + "learning_rate": 8.435729686255458e-05, + "loss": 2.8984, + "step": 16092 + }, + { + "epoch": 0.9990067664038736, + "grad_norm": 0.20370491399299107, + "learning_rate": 8.43546729304959e-05, + "loss": 2.8699, + "step": 16093 + }, + { + "epoch": 0.9990688435036316, + "grad_norm": 0.18025604775847523, + "learning_rate": 8.435204881920056e-05, + "loss": 2.9868, + "step": 16094 + }, + { + "epoch": 0.9991309206033894, + "grad_norm": 0.19431600037100163, + "learning_rate": 8.434942452868222e-05, + "loss": 2.9805, + "step": 16095 + }, + { + "epoch": 0.9991929977031473, + "grad_norm": 0.1839439420553837, + "learning_rate": 8.434680005895461e-05, + "loss": 3.0011, + "step": 16096 + }, + { + "epoch": 0.9992550748029052, + "grad_norm": 0.22901202352164102, + "learning_rate": 8.434417541003141e-05, + "loss": 2.909, + "step": 16097 + }, + { + "epoch": 0.9993171519026631, + "grad_norm": 0.17897157475936762, + "learning_rate": 8.43415505819263e-05, + "loss": 2.9434, + "step": 16098 + }, + { + "epoch": 0.999379229002421, + "grad_norm": 0.18978327074298892, + "learning_rate": 8.433892557465299e-05, + "loss": 2.9581, + "step": 16099 + }, + { + "epoch": 0.999441306102179, + "grad_norm": 0.19559477435476247, + "learning_rate": 8.433630038822517e-05, + "loss": 2.9924, + "step": 16100 + }, + { + "epoch": 0.9995033832019368, + "grad_norm": 0.18038252467442648, + "learning_rate": 8.433367502265654e-05, + "loss": 2.9911, + "step": 16101 + }, + { + "epoch": 0.9995654603016947, + "grad_norm": 0.25839223126190547, + "learning_rate": 8.433104947796078e-05, + "loss": 3.0152, + "step": 16102 + }, + { + "epoch": 0.9996275374014526, + "grad_norm": 0.18338960092074505, + "learning_rate": 8.43284237541516e-05, + "loss": 3.0596, + "step": 16103 + }, + { + "epoch": 0.9996896145012105, + "grad_norm": 0.18164690068386327, + "learning_rate": 8.43257978512427e-05, + "loss": 2.921, + "step": 16104 + }, + { + "epoch": 0.9997516916009684, + "grad_norm": 0.18435263538497004, + "learning_rate": 8.43231717692478e-05, + "loss": 2.9822, + "step": 16105 + }, + { + "epoch": 0.9998137687007264, + "grad_norm": 0.16679165048032948, + "learning_rate": 8.432054550818056e-05, + "loss": 2.879, + "step": 16106 + }, + { + "epoch": 0.9998758458004842, + "grad_norm": 0.19171917889647322, + "learning_rate": 8.43179190680547e-05, + "loss": 2.9466, + "step": 16107 + }, + { + "epoch": 0.9999379229002421, + "grad_norm": 0.2206504640797148, + "learning_rate": 8.431529244888394e-05, + "loss": 2.9484, + "step": 16108 + }, + { + "epoch": 1.0, + "grad_norm": 0.1816549295514905, + "learning_rate": 8.431266565068196e-05, + "loss": 2.9861, + "step": 16109 + }, + { + "epoch": 1.000062077099758, + "grad_norm": 0.15666335385726857, + "learning_rate": 8.431003867346247e-05, + "loss": 2.9933, + "step": 16110 + }, + { + "epoch": 1.0001241541995158, + "grad_norm": 0.17626325432334852, + "learning_rate": 8.43074115172392e-05, + "loss": 2.9469, + "step": 16111 + }, + { + "epoch": 1.0001862312992738, + "grad_norm": 0.19107271835110834, + "learning_rate": 8.43047841820258e-05, + "loss": 2.9522, + "step": 16112 + }, + { + "epoch": 1.0002483083990317, + "grad_norm": 0.23317095312570116, + "learning_rate": 8.430215666783605e-05, + "loss": 2.9796, + "step": 16113 + }, + { + "epoch": 1.0003103854987896, + "grad_norm": 0.16534601254952835, + "learning_rate": 8.429952897468361e-05, + "loss": 2.9322, + "step": 16114 + }, + { + "epoch": 1.0003724625985473, + "grad_norm": 0.16144492237220218, + "learning_rate": 8.429690110258218e-05, + "loss": 2.9542, + "step": 16115 + }, + { + "epoch": 1.0004345396983052, + "grad_norm": 0.1873055681798635, + "learning_rate": 8.429427305154552e-05, + "loss": 2.9844, + "step": 16116 + }, + { + "epoch": 1.0004966167980631, + "grad_norm": 0.16642652598504173, + "learning_rate": 8.429164482158729e-05, + "loss": 2.9815, + "step": 16117 + }, + { + "epoch": 1.000558693897821, + "grad_norm": 0.18224882008323662, + "learning_rate": 8.428901641272124e-05, + "loss": 2.9893, + "step": 16118 + }, + { + "epoch": 1.000620770997579, + "grad_norm": 0.16470385825802136, + "learning_rate": 8.428638782496106e-05, + "loss": 3.0285, + "step": 16119 + }, + { + "epoch": 1.0006828480973369, + "grad_norm": 0.1934610061198646, + "learning_rate": 8.428375905832046e-05, + "loss": 2.9651, + "step": 16120 + }, + { + "epoch": 1.0007449251970948, + "grad_norm": 0.1873869555160249, + "learning_rate": 8.42811301128132e-05, + "loss": 2.9081, + "step": 16121 + }, + { + "epoch": 1.0008070022968527, + "grad_norm": 0.17446021017369084, + "learning_rate": 8.427850098845293e-05, + "loss": 3.0501, + "step": 16122 + }, + { + "epoch": 1.0008690793966106, + "grad_norm": 0.2275958583822648, + "learning_rate": 8.427587168525341e-05, + "loss": 2.9046, + "step": 16123 + }, + { + "epoch": 1.0009311564963685, + "grad_norm": 0.21671256582544404, + "learning_rate": 8.427324220322835e-05, + "loss": 2.9382, + "step": 16124 + }, + { + "epoch": 1.0009932335961265, + "grad_norm": 0.2066832172220005, + "learning_rate": 8.427061254239146e-05, + "loss": 2.9321, + "step": 16125 + }, + { + "epoch": 1.0010553106958844, + "grad_norm": 0.16900725138854164, + "learning_rate": 8.426798270275646e-05, + "loss": 2.9597, + "step": 16126 + }, + { + "epoch": 1.001117387795642, + "grad_norm": 0.18042471212137098, + "learning_rate": 8.426535268433708e-05, + "loss": 2.9362, + "step": 16127 + }, + { + "epoch": 1.0011794648954, + "grad_norm": 0.1651453657723129, + "learning_rate": 8.426272248714705e-05, + "loss": 2.9496, + "step": 16128 + }, + { + "epoch": 1.001241541995158, + "grad_norm": 0.19729127811533267, + "learning_rate": 8.426009211120008e-05, + "loss": 2.7956, + "step": 16129 + }, + { + "epoch": 1.0013036190949158, + "grad_norm": 0.18885052596628252, + "learning_rate": 8.425746155650987e-05, + "loss": 2.9317, + "step": 16130 + }, + { + "epoch": 1.0013656961946737, + "grad_norm": 0.15258707275171435, + "learning_rate": 8.425483082309019e-05, + "loss": 2.9165, + "step": 16131 + }, + { + "epoch": 1.0014277732944317, + "grad_norm": 0.16461402854805027, + "learning_rate": 8.425219991095473e-05, + "loss": 2.9851, + "step": 16132 + }, + { + "epoch": 1.0014898503941896, + "grad_norm": 0.19319712449445442, + "learning_rate": 8.424956882011721e-05, + "loss": 2.8208, + "step": 16133 + }, + { + "epoch": 1.0015519274939475, + "grad_norm": 0.15232405168387453, + "learning_rate": 8.42469375505914e-05, + "loss": 2.9846, + "step": 16134 + }, + { + "epoch": 1.0016140045937054, + "grad_norm": 0.1721583815408239, + "learning_rate": 8.4244306102391e-05, + "loss": 2.9503, + "step": 16135 + }, + { + "epoch": 1.0016760816934633, + "grad_norm": 0.18121073725089645, + "learning_rate": 8.424167447552973e-05, + "loss": 2.9178, + "step": 16136 + }, + { + "epoch": 1.0017381587932213, + "grad_norm": 0.16185744368962962, + "learning_rate": 8.423904267002134e-05, + "loss": 2.8574, + "step": 16137 + }, + { + "epoch": 1.0018002358929792, + "grad_norm": 0.1604739910094458, + "learning_rate": 8.423641068587955e-05, + "loss": 2.9589, + "step": 16138 + }, + { + "epoch": 1.0018623129927369, + "grad_norm": 0.15293111422909733, + "learning_rate": 8.42337785231181e-05, + "loss": 2.9211, + "step": 16139 + }, + { + "epoch": 1.0019243900924948, + "grad_norm": 0.16905461893204116, + "learning_rate": 8.423114618175072e-05, + "loss": 2.835, + "step": 16140 + }, + { + "epoch": 1.0019864671922527, + "grad_norm": 0.1636959989062987, + "learning_rate": 8.422851366179111e-05, + "loss": 2.9008, + "step": 16141 + }, + { + "epoch": 1.0020485442920106, + "grad_norm": 0.16128133229129957, + "learning_rate": 8.422588096325307e-05, + "loss": 2.9097, + "step": 16142 + }, + { + "epoch": 1.0021106213917685, + "grad_norm": 0.16674348369738554, + "learning_rate": 8.422324808615029e-05, + "loss": 2.9634, + "step": 16143 + }, + { + "epoch": 1.0021726984915265, + "grad_norm": 0.14674801942806404, + "learning_rate": 8.422061503049652e-05, + "loss": 2.957, + "step": 16144 + }, + { + "epoch": 1.0022347755912844, + "grad_norm": 0.16312539314354185, + "learning_rate": 8.421798179630549e-05, + "loss": 2.9566, + "step": 16145 + }, + { + "epoch": 1.0022968526910423, + "grad_norm": 0.2346198948303231, + "learning_rate": 8.421534838359094e-05, + "loss": 3.0345, + "step": 16146 + }, + { + "epoch": 1.0023589297908002, + "grad_norm": 0.16933397008155546, + "learning_rate": 8.421271479236661e-05, + "loss": 2.8634, + "step": 16147 + }, + { + "epoch": 1.0024210068905581, + "grad_norm": 0.19457429616419233, + "learning_rate": 8.421008102264624e-05, + "loss": 2.8642, + "step": 16148 + }, + { + "epoch": 1.002483083990316, + "grad_norm": 0.16912540032493764, + "learning_rate": 8.420744707444359e-05, + "loss": 2.9721, + "step": 16149 + }, + { + "epoch": 1.002545161090074, + "grad_norm": 0.16930562114260114, + "learning_rate": 8.420481294777237e-05, + "loss": 2.9821, + "step": 16150 + }, + { + "epoch": 1.0026072381898317, + "grad_norm": 0.1826883779061594, + "learning_rate": 8.420217864264635e-05, + "loss": 2.8572, + "step": 16151 + }, + { + "epoch": 1.0026693152895896, + "grad_norm": 0.17906984653681057, + "learning_rate": 8.419954415907924e-05, + "loss": 2.8634, + "step": 16152 + }, + { + "epoch": 1.0027313923893475, + "grad_norm": 0.1693628311800564, + "learning_rate": 8.419690949708481e-05, + "loss": 2.8938, + "step": 16153 + }, + { + "epoch": 1.0027934694891054, + "grad_norm": 0.16554669742475692, + "learning_rate": 8.419427465667682e-05, + "loss": 2.8994, + "step": 16154 + }, + { + "epoch": 1.0028555465888633, + "grad_norm": 0.16442146087046214, + "learning_rate": 8.419163963786899e-05, + "loss": 2.83, + "step": 16155 + }, + { + "epoch": 1.0029176236886213, + "grad_norm": 0.1551827879872851, + "learning_rate": 8.418900444067507e-05, + "loss": 2.9673, + "step": 16156 + }, + { + "epoch": 1.0029797007883792, + "grad_norm": 0.16092089965536588, + "learning_rate": 8.418636906510882e-05, + "loss": 2.8859, + "step": 16157 + }, + { + "epoch": 1.003041777888137, + "grad_norm": 0.17795974988343075, + "learning_rate": 8.418373351118397e-05, + "loss": 3.0041, + "step": 16158 + }, + { + "epoch": 1.003103854987895, + "grad_norm": 0.1874624453128461, + "learning_rate": 8.418109777891429e-05, + "loss": 2.8821, + "step": 16159 + }, + { + "epoch": 1.003165932087653, + "grad_norm": 0.18099480330992163, + "learning_rate": 8.417846186831353e-05, + "loss": 2.9316, + "step": 16160 + }, + { + "epoch": 1.0032280091874108, + "grad_norm": 0.16552938049323657, + "learning_rate": 8.417582577939545e-05, + "loss": 3.039, + "step": 16161 + }, + { + "epoch": 1.0032900862871688, + "grad_norm": 0.18856012142515846, + "learning_rate": 8.417318951217377e-05, + "loss": 2.9466, + "step": 16162 + }, + { + "epoch": 1.0033521633869265, + "grad_norm": 0.18324083537959096, + "learning_rate": 8.417055306666227e-05, + "loss": 2.9079, + "step": 16163 + }, + { + "epoch": 1.0034142404866844, + "grad_norm": 0.2359239969115798, + "learning_rate": 8.41679164428747e-05, + "loss": 2.9421, + "step": 16164 + }, + { + "epoch": 1.0034763175864423, + "grad_norm": 0.17414270824180603, + "learning_rate": 8.416527964082481e-05, + "loss": 2.9359, + "step": 16165 + }, + { + "epoch": 1.0035383946862002, + "grad_norm": 0.1940768542025853, + "learning_rate": 8.416264266052638e-05, + "loss": 2.9597, + "step": 16166 + }, + { + "epoch": 1.0036004717859581, + "grad_norm": 0.19609321009041067, + "learning_rate": 8.416000550199313e-05, + "loss": 3.0419, + "step": 16167 + }, + { + "epoch": 1.003662548885716, + "grad_norm": 0.17574881678294182, + "learning_rate": 8.415736816523884e-05, + "loss": 3.0497, + "step": 16168 + }, + { + "epoch": 1.003724625985474, + "grad_norm": 0.1582327626570806, + "learning_rate": 8.415473065027728e-05, + "loss": 2.9132, + "step": 16169 + }, + { + "epoch": 1.0037867030852319, + "grad_norm": 0.20826508278282113, + "learning_rate": 8.41520929571222e-05, + "loss": 3.0205, + "step": 16170 + }, + { + "epoch": 1.0038487801849898, + "grad_norm": 0.15870727459724732, + "learning_rate": 8.414945508578736e-05, + "loss": 2.9299, + "step": 16171 + }, + { + "epoch": 1.0039108572847477, + "grad_norm": 0.1655100360475208, + "learning_rate": 8.41468170362865e-05, + "loss": 2.9732, + "step": 16172 + }, + { + "epoch": 1.0039729343845056, + "grad_norm": 0.2619610277093666, + "learning_rate": 8.414417880863344e-05, + "loss": 2.8701, + "step": 16173 + }, + { + "epoch": 1.0040350114842636, + "grad_norm": 0.17499567499095547, + "learning_rate": 8.414154040284188e-05, + "loss": 2.8967, + "step": 16174 + }, + { + "epoch": 1.0040970885840212, + "grad_norm": 0.1858056923561976, + "learning_rate": 8.413890181892563e-05, + "loss": 3.0419, + "step": 16175 + }, + { + "epoch": 1.0041591656837792, + "grad_norm": 0.2249422674815928, + "learning_rate": 8.413626305689843e-05, + "loss": 3.0166, + "step": 16176 + }, + { + "epoch": 1.004221242783537, + "grad_norm": 0.20562367309849472, + "learning_rate": 8.413362411677408e-05, + "loss": 2.8895, + "step": 16177 + }, + { + "epoch": 1.004283319883295, + "grad_norm": 0.18618219927272026, + "learning_rate": 8.413098499856631e-05, + "loss": 2.9021, + "step": 16178 + }, + { + "epoch": 1.004345396983053, + "grad_norm": 0.1732892835782562, + "learning_rate": 8.412834570228893e-05, + "loss": 2.9542, + "step": 16179 + }, + { + "epoch": 1.0044074740828108, + "grad_norm": 0.18559985157445819, + "learning_rate": 8.412570622795566e-05, + "loss": 2.987, + "step": 16180 + }, + { + "epoch": 1.0044695511825688, + "grad_norm": 0.20927290740408636, + "learning_rate": 8.412306657558032e-05, + "loss": 2.9589, + "step": 16181 + }, + { + "epoch": 1.0045316282823267, + "grad_norm": 0.16574588628557566, + "learning_rate": 8.412042674517663e-05, + "loss": 2.9399, + "step": 16182 + }, + { + "epoch": 1.0045937053820846, + "grad_norm": 0.1675597912200566, + "learning_rate": 8.41177867367584e-05, + "loss": 2.9336, + "step": 16183 + }, + { + "epoch": 1.0046557824818425, + "grad_norm": 0.16248632749478834, + "learning_rate": 8.41151465503394e-05, + "loss": 2.8776, + "step": 16184 + }, + { + "epoch": 1.0047178595816004, + "grad_norm": 0.15488146243763729, + "learning_rate": 8.411250618593338e-05, + "loss": 2.9454, + "step": 16185 + }, + { + "epoch": 1.0047799366813583, + "grad_norm": 0.16326871782787428, + "learning_rate": 8.410986564355417e-05, + "loss": 2.8818, + "step": 16186 + }, + { + "epoch": 1.004842013781116, + "grad_norm": 0.16520655332410536, + "learning_rate": 8.410722492321548e-05, + "loss": 3.014, + "step": 16187 + }, + { + "epoch": 1.004904090880874, + "grad_norm": 0.1462909947918347, + "learning_rate": 8.410458402493114e-05, + "loss": 2.886, + "step": 16188 + }, + { + "epoch": 1.0049661679806319, + "grad_norm": 0.1847322752455016, + "learning_rate": 8.410194294871488e-05, + "loss": 2.8821, + "step": 16189 + }, + { + "epoch": 1.0050282450803898, + "grad_norm": 0.22746303417845148, + "learning_rate": 8.409930169458051e-05, + "loss": 2.9202, + "step": 16190 + }, + { + "epoch": 1.0050903221801477, + "grad_norm": 0.17905680222957915, + "learning_rate": 8.409666026254181e-05, + "loss": 2.9388, + "step": 16191 + }, + { + "epoch": 1.0051523992799056, + "grad_norm": 0.21478602761014423, + "learning_rate": 8.409401865261257e-05, + "loss": 2.9524, + "step": 16192 + }, + { + "epoch": 1.0052144763796635, + "grad_norm": 0.17150630202475584, + "learning_rate": 8.409137686480655e-05, + "loss": 2.828, + "step": 16193 + }, + { + "epoch": 1.0052765534794215, + "grad_norm": 0.1952335102570469, + "learning_rate": 8.408873489913753e-05, + "loss": 2.9192, + "step": 16194 + }, + { + "epoch": 1.0053386305791794, + "grad_norm": 0.18152727671783275, + "learning_rate": 8.408609275561931e-05, + "loss": 3.0169, + "step": 16195 + }, + { + "epoch": 1.0054007076789373, + "grad_norm": 0.2083923296046739, + "learning_rate": 8.408345043426569e-05, + "loss": 2.866, + "step": 16196 + }, + { + "epoch": 1.0054627847786952, + "grad_norm": 0.18470780231984082, + "learning_rate": 8.40808079350904e-05, + "loss": 2.9773, + "step": 16197 + }, + { + "epoch": 1.0055248618784531, + "grad_norm": 0.19033570538062644, + "learning_rate": 8.40781652581073e-05, + "loss": 2.9366, + "step": 16198 + }, + { + "epoch": 1.0055869389782108, + "grad_norm": 0.18514322243100773, + "learning_rate": 8.407552240333012e-05, + "loss": 2.8308, + "step": 16199 + }, + { + "epoch": 1.0056490160779687, + "grad_norm": 0.1717713758896895, + "learning_rate": 8.407287937077266e-05, + "loss": 2.8909, + "step": 16200 + }, + { + "epoch": 1.0057110931777267, + "grad_norm": 0.17306168494991805, + "learning_rate": 8.407023616044873e-05, + "loss": 2.9008, + "step": 16201 + }, + { + "epoch": 1.0057731702774846, + "grad_norm": 0.17649634923921298, + "learning_rate": 8.406759277237209e-05, + "loss": 2.9524, + "step": 16202 + }, + { + "epoch": 1.0058352473772425, + "grad_norm": 0.21461398236050894, + "learning_rate": 8.406494920655656e-05, + "loss": 2.9447, + "step": 16203 + }, + { + "epoch": 1.0058973244770004, + "grad_norm": 0.19479904394578532, + "learning_rate": 8.406230546301592e-05, + "loss": 2.8158, + "step": 16204 + }, + { + "epoch": 1.0059594015767583, + "grad_norm": 0.17436048340447255, + "learning_rate": 8.405966154176397e-05, + "loss": 2.964, + "step": 16205 + }, + { + "epoch": 1.0060214786765163, + "grad_norm": 0.17023790502017563, + "learning_rate": 8.405701744281448e-05, + "loss": 2.9371, + "step": 16206 + }, + { + "epoch": 1.0060835557762742, + "grad_norm": 0.17892018559582484, + "learning_rate": 8.405437316618128e-05, + "loss": 2.9489, + "step": 16207 + }, + { + "epoch": 1.006145632876032, + "grad_norm": 0.1797400169750529, + "learning_rate": 8.405172871187814e-05, + "loss": 2.9882, + "step": 16208 + }, + { + "epoch": 1.00620770997579, + "grad_norm": 0.18778574548280155, + "learning_rate": 8.404908407991886e-05, + "loss": 2.9193, + "step": 16209 + }, + { + "epoch": 1.006269787075548, + "grad_norm": 0.19674058480922724, + "learning_rate": 8.404643927031725e-05, + "loss": 2.9989, + "step": 16210 + }, + { + "epoch": 1.0063318641753056, + "grad_norm": 0.19076345488827526, + "learning_rate": 8.404379428308709e-05, + "loss": 2.8798, + "step": 16211 + }, + { + "epoch": 1.0063939412750635, + "grad_norm": 0.17502344285246138, + "learning_rate": 8.404114911824221e-05, + "loss": 2.9072, + "step": 16212 + }, + { + "epoch": 1.0064560183748215, + "grad_norm": 0.18045731132553797, + "learning_rate": 8.403850377579639e-05, + "loss": 2.9666, + "step": 16213 + }, + { + "epoch": 1.0065180954745794, + "grad_norm": 0.22226849471990048, + "learning_rate": 8.40358582557634e-05, + "loss": 2.8935, + "step": 16214 + }, + { + "epoch": 1.0065801725743373, + "grad_norm": 0.21031814175105443, + "learning_rate": 8.403321255815711e-05, + "loss": 3.0314, + "step": 16215 + }, + { + "epoch": 1.0066422496740952, + "grad_norm": 0.16629798662695314, + "learning_rate": 8.403056668299128e-05, + "loss": 2.9966, + "step": 16216 + }, + { + "epoch": 1.0067043267738531, + "grad_norm": 0.1739860685066534, + "learning_rate": 8.402792063027973e-05, + "loss": 2.9219, + "step": 16217 + }, + { + "epoch": 1.006766403873611, + "grad_norm": 0.17222920138681455, + "learning_rate": 8.402527440003624e-05, + "loss": 2.8124, + "step": 16218 + }, + { + "epoch": 1.006828480973369, + "grad_norm": 0.20053465390983527, + "learning_rate": 8.402262799227465e-05, + "loss": 2.9729, + "step": 16219 + }, + { + "epoch": 1.0068905580731269, + "grad_norm": 0.16560935630333948, + "learning_rate": 8.401998140700873e-05, + "loss": 2.8795, + "step": 16220 + }, + { + "epoch": 1.0069526351728848, + "grad_norm": 0.19196037929305174, + "learning_rate": 8.401733464425232e-05, + "loss": 2.8889, + "step": 16221 + }, + { + "epoch": 1.0070147122726427, + "grad_norm": 0.2070458694910321, + "learning_rate": 8.401468770401922e-05, + "loss": 2.9894, + "step": 16222 + }, + { + "epoch": 1.0070767893724004, + "grad_norm": 0.16753478879273914, + "learning_rate": 8.401204058632324e-05, + "loss": 2.9269, + "step": 16223 + }, + { + "epoch": 1.0071388664721583, + "grad_norm": 0.21076700596509332, + "learning_rate": 8.400939329117818e-05, + "loss": 3.0068, + "step": 16224 + }, + { + "epoch": 1.0072009435719163, + "grad_norm": 0.1830293522619649, + "learning_rate": 8.400674581859785e-05, + "loss": 2.9011, + "step": 16225 + }, + { + "epoch": 1.0072630206716742, + "grad_norm": 0.25923538586912215, + "learning_rate": 8.400409816859609e-05, + "loss": 2.8449, + "step": 16226 + }, + { + "epoch": 1.007325097771432, + "grad_norm": 0.17609393072415194, + "learning_rate": 8.400145034118669e-05, + "loss": 3.0482, + "step": 16227 + }, + { + "epoch": 1.00738717487119, + "grad_norm": 0.17504655099661895, + "learning_rate": 8.399880233638347e-05, + "loss": 2.855, + "step": 16228 + }, + { + "epoch": 1.007449251970948, + "grad_norm": 0.17076454367074065, + "learning_rate": 8.399615415420023e-05, + "loss": 2.9299, + "step": 16229 + }, + { + "epoch": 1.0075113290707058, + "grad_norm": 0.15384958720015462, + "learning_rate": 8.399350579465081e-05, + "loss": 2.9849, + "step": 16230 + }, + { + "epoch": 1.0075734061704638, + "grad_norm": 0.17848057308854787, + "learning_rate": 8.399085725774903e-05, + "loss": 2.9567, + "step": 16231 + }, + { + "epoch": 1.0076354832702217, + "grad_norm": 0.1641589687197802, + "learning_rate": 8.398820854350868e-05, + "loss": 2.9242, + "step": 16232 + }, + { + "epoch": 1.0076975603699796, + "grad_norm": 0.15821458045081063, + "learning_rate": 8.39855596519436e-05, + "loss": 2.8991, + "step": 16233 + }, + { + "epoch": 1.0077596374697375, + "grad_norm": 0.15912905864978436, + "learning_rate": 8.398291058306762e-05, + "loss": 2.9459, + "step": 16234 + }, + { + "epoch": 1.0078217145694952, + "grad_norm": 0.18389223063145005, + "learning_rate": 8.398026133689451e-05, + "loss": 2.8829, + "step": 16235 + }, + { + "epoch": 1.0078837916692531, + "grad_norm": 0.15441145445254223, + "learning_rate": 8.397761191343816e-05, + "loss": 2.9555, + "step": 16236 + }, + { + "epoch": 1.007945868769011, + "grad_norm": 0.18765917617714392, + "learning_rate": 8.397496231271235e-05, + "loss": 2.9285, + "step": 16237 + }, + { + "epoch": 1.008007945868769, + "grad_norm": 0.1774533114011188, + "learning_rate": 8.397231253473093e-05, + "loss": 2.886, + "step": 16238 + }, + { + "epoch": 1.0080700229685269, + "grad_norm": 0.17101660321697826, + "learning_rate": 8.396966257950767e-05, + "loss": 2.9142, + "step": 16239 + }, + { + "epoch": 1.0081321000682848, + "grad_norm": 0.18026219598471913, + "learning_rate": 8.396701244705646e-05, + "loss": 2.9546, + "step": 16240 + }, + { + "epoch": 1.0081941771680427, + "grad_norm": 0.17760305478893681, + "learning_rate": 8.396436213739109e-05, + "loss": 2.882, + "step": 16241 + }, + { + "epoch": 1.0082562542678006, + "grad_norm": 0.16825425479604747, + "learning_rate": 8.396171165052542e-05, + "loss": 2.8857, + "step": 16242 + }, + { + "epoch": 1.0083183313675586, + "grad_norm": 0.1497264849420989, + "learning_rate": 8.395906098647324e-05, + "loss": 2.8947, + "step": 16243 + }, + { + "epoch": 1.0083804084673165, + "grad_norm": 0.16952695665328707, + "learning_rate": 8.395641014524839e-05, + "loss": 2.9973, + "step": 16244 + }, + { + "epoch": 1.0084424855670744, + "grad_norm": 0.15235121415266015, + "learning_rate": 8.395375912686472e-05, + "loss": 2.9505, + "step": 16245 + }, + { + "epoch": 1.0085045626668323, + "grad_norm": 0.15821988994165884, + "learning_rate": 8.395110793133602e-05, + "loss": 2.9497, + "step": 16246 + }, + { + "epoch": 1.00856663976659, + "grad_norm": 0.17263197555611873, + "learning_rate": 8.394845655867618e-05, + "loss": 3.0024, + "step": 16247 + }, + { + "epoch": 1.008628716866348, + "grad_norm": 0.16212909647465484, + "learning_rate": 8.394580500889897e-05, + "loss": 2.9527, + "step": 16248 + }, + { + "epoch": 1.0086907939661058, + "grad_norm": 0.1568856957292575, + "learning_rate": 8.394315328201827e-05, + "loss": 2.906, + "step": 16249 + }, + { + "epoch": 1.0087528710658638, + "grad_norm": 0.1609665388463836, + "learning_rate": 8.394050137804791e-05, + "loss": 2.8907, + "step": 16250 + }, + { + "epoch": 1.0088149481656217, + "grad_norm": 0.22974023602817492, + "learning_rate": 8.39378492970017e-05, + "loss": 2.9061, + "step": 16251 + }, + { + "epoch": 1.0088770252653796, + "grad_norm": 0.18398707442520468, + "learning_rate": 8.39351970388935e-05, + "loss": 2.9332, + "step": 16252 + }, + { + "epoch": 1.0089391023651375, + "grad_norm": 0.15532856088716632, + "learning_rate": 8.393254460373714e-05, + "loss": 2.9495, + "step": 16253 + }, + { + "epoch": 1.0090011794648954, + "grad_norm": 0.17297161911443093, + "learning_rate": 8.392989199154645e-05, + "loss": 2.9831, + "step": 16254 + }, + { + "epoch": 1.0090632565646533, + "grad_norm": 0.15697991234181, + "learning_rate": 8.392723920233528e-05, + "loss": 2.8467, + "step": 16255 + }, + { + "epoch": 1.0091253336644113, + "grad_norm": 0.18349062275508668, + "learning_rate": 8.392458623611745e-05, + "loss": 2.9776, + "step": 16256 + }, + { + "epoch": 1.0091874107641692, + "grad_norm": 0.17545802280623057, + "learning_rate": 8.392193309290684e-05, + "loss": 3.0576, + "step": 16257 + }, + { + "epoch": 1.009249487863927, + "grad_norm": 0.15349238756110173, + "learning_rate": 8.391927977271726e-05, + "loss": 2.9029, + "step": 16258 + }, + { + "epoch": 1.0093115649636848, + "grad_norm": 0.17897875578446135, + "learning_rate": 8.391662627556258e-05, + "loss": 3.0223, + "step": 16259 + }, + { + "epoch": 1.0093736420634427, + "grad_norm": 0.15359525265545568, + "learning_rate": 8.391397260145662e-05, + "loss": 2.8669, + "step": 16260 + }, + { + "epoch": 1.0094357191632006, + "grad_norm": 0.28183452476311294, + "learning_rate": 8.391131875041322e-05, + "loss": 2.911, + "step": 16261 + }, + { + "epoch": 1.0094977962629585, + "grad_norm": 0.20113283537781024, + "learning_rate": 8.390866472244625e-05, + "loss": 2.9017, + "step": 16262 + }, + { + "epoch": 1.0095598733627165, + "grad_norm": 0.16708980740892918, + "learning_rate": 8.390601051756953e-05, + "loss": 2.9829, + "step": 16263 + }, + { + "epoch": 1.0096219504624744, + "grad_norm": 0.17224912131871586, + "learning_rate": 8.390335613579693e-05, + "loss": 2.8899, + "step": 16264 + }, + { + "epoch": 1.0096840275622323, + "grad_norm": 0.1841424898385747, + "learning_rate": 8.39007015771423e-05, + "loss": 2.9617, + "step": 16265 + }, + { + "epoch": 1.0097461046619902, + "grad_norm": 0.25219305575938583, + "learning_rate": 8.389804684161947e-05, + "loss": 3.0033, + "step": 16266 + }, + { + "epoch": 1.0098081817617481, + "grad_norm": 0.22676729121674044, + "learning_rate": 8.38953919292423e-05, + "loss": 2.9551, + "step": 16267 + }, + { + "epoch": 1.009870258861506, + "grad_norm": 0.1711803998526234, + "learning_rate": 8.389273684002463e-05, + "loss": 2.9438, + "step": 16268 + }, + { + "epoch": 1.009932335961264, + "grad_norm": 0.18694542289901842, + "learning_rate": 8.389008157398036e-05, + "loss": 2.9182, + "step": 16269 + }, + { + "epoch": 1.009994413061022, + "grad_norm": 0.2287479426077786, + "learning_rate": 8.388742613112329e-05, + "loss": 3.0062, + "step": 16270 + }, + { + "epoch": 1.0100564901607796, + "grad_norm": 0.19536895889600064, + "learning_rate": 8.388477051146728e-05, + "loss": 2.8182, + "step": 16271 + }, + { + "epoch": 1.0101185672605375, + "grad_norm": 0.18142902190268637, + "learning_rate": 8.38821147150262e-05, + "loss": 2.9254, + "step": 16272 + }, + { + "epoch": 1.0101806443602954, + "grad_norm": 0.16829367194201358, + "learning_rate": 8.38794587418139e-05, + "loss": 2.9404, + "step": 16273 + }, + { + "epoch": 1.0102427214600533, + "grad_norm": 0.18292439981166342, + "learning_rate": 8.387680259184426e-05, + "loss": 2.9196, + "step": 16274 + }, + { + "epoch": 1.0103047985598113, + "grad_norm": 0.16020198494652416, + "learning_rate": 8.387414626513109e-05, + "loss": 3.0413, + "step": 16275 + }, + { + "epoch": 1.0103668756595692, + "grad_norm": 0.15741896851461934, + "learning_rate": 8.387148976168829e-05, + "loss": 2.9042, + "step": 16276 + }, + { + "epoch": 1.010428952759327, + "grad_norm": 0.17160471883988848, + "learning_rate": 8.386883308152971e-05, + "loss": 2.9307, + "step": 16277 + }, + { + "epoch": 1.010491029859085, + "grad_norm": 0.20737353406345452, + "learning_rate": 8.386617622466919e-05, + "loss": 2.9117, + "step": 16278 + }, + { + "epoch": 1.010553106958843, + "grad_norm": 0.17371570509417206, + "learning_rate": 8.386351919112062e-05, + "loss": 2.9591, + "step": 16279 + }, + { + "epoch": 1.0106151840586008, + "grad_norm": 0.1999476505849793, + "learning_rate": 8.386086198089785e-05, + "loss": 2.9595, + "step": 16280 + }, + { + "epoch": 1.0106772611583588, + "grad_norm": 0.1950204000133711, + "learning_rate": 8.385820459401473e-05, + "loss": 2.9022, + "step": 16281 + }, + { + "epoch": 1.0107393382581167, + "grad_norm": 0.18523526376923707, + "learning_rate": 8.385554703048515e-05, + "loss": 2.9623, + "step": 16282 + }, + { + "epoch": 1.0108014153578744, + "grad_norm": 0.18849498283225669, + "learning_rate": 8.385288929032295e-05, + "loss": 2.8954, + "step": 16283 + }, + { + "epoch": 1.0108634924576323, + "grad_norm": 0.1721844704737063, + "learning_rate": 8.385023137354202e-05, + "loss": 2.9327, + "step": 16284 + }, + { + "epoch": 1.0109255695573902, + "grad_norm": 0.20437533438360642, + "learning_rate": 8.384757328015623e-05, + "loss": 2.9498, + "step": 16285 + }, + { + "epoch": 1.0109876466571481, + "grad_norm": 0.17755092333631706, + "learning_rate": 8.384491501017942e-05, + "loss": 2.9106, + "step": 16286 + }, + { + "epoch": 1.011049723756906, + "grad_norm": 0.18172308372978144, + "learning_rate": 8.384225656362545e-05, + "loss": 2.951, + "step": 16287 + }, + { + "epoch": 1.011111800856664, + "grad_norm": 0.21907772480657603, + "learning_rate": 8.383959794050823e-05, + "loss": 2.9404, + "step": 16288 + }, + { + "epoch": 1.0111738779564219, + "grad_norm": 0.20207664578636095, + "learning_rate": 8.38369391408416e-05, + "loss": 2.9429, + "step": 16289 + }, + { + "epoch": 1.0112359550561798, + "grad_norm": 0.1545152205744398, + "learning_rate": 8.383428016463948e-05, + "loss": 2.9866, + "step": 16290 + }, + { + "epoch": 1.0112980321559377, + "grad_norm": 0.16882404760249964, + "learning_rate": 8.383162101191568e-05, + "loss": 2.9135, + "step": 16291 + }, + { + "epoch": 1.0113601092556956, + "grad_norm": 0.15763011817881234, + "learning_rate": 8.38289616826841e-05, + "loss": 3.0624, + "step": 16292 + }, + { + "epoch": 1.0114221863554536, + "grad_norm": 0.24479387245131332, + "learning_rate": 8.382630217695862e-05, + "loss": 2.942, + "step": 16293 + }, + { + "epoch": 1.0114842634552115, + "grad_norm": 0.24861784423217784, + "learning_rate": 8.382364249475312e-05, + "loss": 3.0383, + "step": 16294 + }, + { + "epoch": 1.0115463405549692, + "grad_norm": 0.1769908506923446, + "learning_rate": 8.382098263608145e-05, + "loss": 2.937, + "step": 16295 + }, + { + "epoch": 1.011608417654727, + "grad_norm": 0.22881586777199617, + "learning_rate": 8.38183226009575e-05, + "loss": 3.0203, + "step": 16296 + }, + { + "epoch": 1.011670494754485, + "grad_norm": 0.15895564454688982, + "learning_rate": 8.381566238939517e-05, + "loss": 2.8578, + "step": 16297 + }, + { + "epoch": 1.011732571854243, + "grad_norm": 0.23672536299378852, + "learning_rate": 8.381300200140831e-05, + "loss": 2.9353, + "step": 16298 + }, + { + "epoch": 1.0117946489540008, + "grad_norm": 0.2146602106727205, + "learning_rate": 8.381034143701081e-05, + "loss": 2.9071, + "step": 16299 + }, + { + "epoch": 1.0118567260537588, + "grad_norm": 0.22422698268330662, + "learning_rate": 8.380768069621655e-05, + "loss": 2.9625, + "step": 16300 + }, + { + "epoch": 1.0119188031535167, + "grad_norm": 0.16994562750360007, + "learning_rate": 8.380501977903942e-05, + "loss": 2.9279, + "step": 16301 + }, + { + "epoch": 1.0119808802532746, + "grad_norm": 0.2090668470988475, + "learning_rate": 8.380235868549329e-05, + "loss": 2.924, + "step": 16302 + }, + { + "epoch": 1.0120429573530325, + "grad_norm": 0.202526411803636, + "learning_rate": 8.379969741559203e-05, + "loss": 2.9544, + "step": 16303 + }, + { + "epoch": 1.0121050344527904, + "grad_norm": 0.23560638403523015, + "learning_rate": 8.379703596934958e-05, + "loss": 2.9756, + "step": 16304 + }, + { + "epoch": 1.0121671115525483, + "grad_norm": 0.208908833906447, + "learning_rate": 8.379437434677978e-05, + "loss": 2.9753, + "step": 16305 + }, + { + "epoch": 1.0122291886523063, + "grad_norm": 0.3094239114379289, + "learning_rate": 8.379171254789651e-05, + "loss": 2.9032, + "step": 16306 + }, + { + "epoch": 1.012291265752064, + "grad_norm": 0.2087673682708281, + "learning_rate": 8.378905057271367e-05, + "loss": 2.9246, + "step": 16307 + }, + { + "epoch": 1.0123533428518219, + "grad_norm": 0.21283912910807584, + "learning_rate": 8.378638842124516e-05, + "loss": 2.9539, + "step": 16308 + }, + { + "epoch": 1.0124154199515798, + "grad_norm": 0.2513115276611006, + "learning_rate": 8.378372609350485e-05, + "loss": 2.9361, + "step": 16309 + }, + { + "epoch": 1.0124774970513377, + "grad_norm": 0.1958411405829907, + "learning_rate": 8.378106358950666e-05, + "loss": 2.914, + "step": 16310 + }, + { + "epoch": 1.0125395741510956, + "grad_norm": 0.22088335493355302, + "learning_rate": 8.377840090926444e-05, + "loss": 2.9472, + "step": 16311 + }, + { + "epoch": 1.0126016512508536, + "grad_norm": 0.20830252638173882, + "learning_rate": 8.37757380527921e-05, + "loss": 2.9406, + "step": 16312 + }, + { + "epoch": 1.0126637283506115, + "grad_norm": 0.29131180457437494, + "learning_rate": 8.377307502010356e-05, + "loss": 2.9361, + "step": 16313 + }, + { + "epoch": 1.0127258054503694, + "grad_norm": 0.2120835026791564, + "learning_rate": 8.377041181121267e-05, + "loss": 2.9162, + "step": 16314 + }, + { + "epoch": 1.0127878825501273, + "grad_norm": 0.2140311044098488, + "learning_rate": 8.376774842613335e-05, + "loss": 2.9536, + "step": 16315 + }, + { + "epoch": 1.0128499596498852, + "grad_norm": 0.19430034637417845, + "learning_rate": 8.37650848648795e-05, + "loss": 2.8058, + "step": 16316 + }, + { + "epoch": 1.0129120367496431, + "grad_norm": 0.2101478365805834, + "learning_rate": 8.3762421127465e-05, + "loss": 2.8525, + "step": 16317 + }, + { + "epoch": 1.012974113849401, + "grad_norm": 0.3052992059981888, + "learning_rate": 8.375975721390374e-05, + "loss": 2.8781, + "step": 16318 + }, + { + "epoch": 1.0130361909491588, + "grad_norm": 0.21369596272324562, + "learning_rate": 8.375709312420963e-05, + "loss": 2.9411, + "step": 16319 + }, + { + "epoch": 1.0130982680489167, + "grad_norm": 0.17113679884268973, + "learning_rate": 8.375442885839659e-05, + "loss": 2.8601, + "step": 16320 + }, + { + "epoch": 1.0131603451486746, + "grad_norm": 0.31601497805685963, + "learning_rate": 8.375176441647847e-05, + "loss": 2.8071, + "step": 16321 + }, + { + "epoch": 1.0132224222484325, + "grad_norm": 0.18036036353628138, + "learning_rate": 8.374909979846923e-05, + "loss": 2.9073, + "step": 16322 + }, + { + "epoch": 1.0132844993481904, + "grad_norm": 0.1887736967824096, + "learning_rate": 8.374643500438273e-05, + "loss": 3.0006, + "step": 16323 + }, + { + "epoch": 1.0133465764479483, + "grad_norm": 0.19304733915439137, + "learning_rate": 8.374377003423288e-05, + "loss": 2.9546, + "step": 16324 + }, + { + "epoch": 1.0134086535477063, + "grad_norm": 0.2028484938382251, + "learning_rate": 8.374110488803361e-05, + "loss": 2.8743, + "step": 16325 + }, + { + "epoch": 1.0134707306474642, + "grad_norm": 0.17942916917811164, + "learning_rate": 8.373843956579879e-05, + "loss": 3.0047, + "step": 16326 + }, + { + "epoch": 1.013532807747222, + "grad_norm": 0.18561770442471773, + "learning_rate": 8.373577406754235e-05, + "loss": 2.9649, + "step": 16327 + }, + { + "epoch": 1.01359488484698, + "grad_norm": 0.2344781013056717, + "learning_rate": 8.373310839327818e-05, + "loss": 2.9879, + "step": 16328 + }, + { + "epoch": 1.013656961946738, + "grad_norm": 0.18825912780910126, + "learning_rate": 8.37304425430202e-05, + "loss": 2.9128, + "step": 16329 + }, + { + "epoch": 1.0137190390464959, + "grad_norm": 0.23477996966417697, + "learning_rate": 8.37277765167823e-05, + "loss": 2.9881, + "step": 16330 + }, + { + "epoch": 1.0137811161462535, + "grad_norm": 0.16443404066571302, + "learning_rate": 8.372511031457841e-05, + "loss": 2.9091, + "step": 16331 + }, + { + "epoch": 1.0138431932460115, + "grad_norm": 0.2049089006046274, + "learning_rate": 8.372244393642246e-05, + "loss": 2.9763, + "step": 16332 + }, + { + "epoch": 1.0139052703457694, + "grad_norm": 0.16591413425013443, + "learning_rate": 8.37197773823283e-05, + "loss": 2.892, + "step": 16333 + }, + { + "epoch": 1.0139673474455273, + "grad_norm": 0.18618458030975055, + "learning_rate": 8.371711065230988e-05, + "loss": 2.9295, + "step": 16334 + }, + { + "epoch": 1.0140294245452852, + "grad_norm": 0.17347325845624292, + "learning_rate": 8.371444374638112e-05, + "loss": 2.9901, + "step": 16335 + }, + { + "epoch": 1.0140915016450431, + "grad_norm": 0.16950126088251533, + "learning_rate": 8.371177666455591e-05, + "loss": 2.9114, + "step": 16336 + }, + { + "epoch": 1.014153578744801, + "grad_norm": 0.178676455980696, + "learning_rate": 8.370910940684817e-05, + "loss": 2.8269, + "step": 16337 + }, + { + "epoch": 1.014215655844559, + "grad_norm": 0.16250717234371953, + "learning_rate": 8.370644197327184e-05, + "loss": 2.9581, + "step": 16338 + }, + { + "epoch": 1.014277732944317, + "grad_norm": 0.16657294802434516, + "learning_rate": 8.370377436384081e-05, + "loss": 2.9214, + "step": 16339 + }, + { + "epoch": 1.0143398100440748, + "grad_norm": 0.21129451667797075, + "learning_rate": 8.370110657856903e-05, + "loss": 2.9576, + "step": 16340 + }, + { + "epoch": 1.0144018871438327, + "grad_norm": 0.18176263382643562, + "learning_rate": 8.369843861747037e-05, + "loss": 2.9507, + "step": 16341 + }, + { + "epoch": 1.0144639642435906, + "grad_norm": 0.18070587499153, + "learning_rate": 8.369577048055878e-05, + "loss": 3.0337, + "step": 16342 + }, + { + "epoch": 1.0145260413433483, + "grad_norm": 0.2011879479366837, + "learning_rate": 8.369310216784818e-05, + "loss": 2.9148, + "step": 16343 + }, + { + "epoch": 1.0145881184431063, + "grad_norm": 0.19924517063644467, + "learning_rate": 8.369043367935247e-05, + "loss": 2.8786, + "step": 16344 + }, + { + "epoch": 1.0146501955428642, + "grad_norm": 0.17206639232348359, + "learning_rate": 8.368776501508562e-05, + "loss": 2.9915, + "step": 16345 + }, + { + "epoch": 1.014712272642622, + "grad_norm": 0.2062626372425022, + "learning_rate": 8.368509617506151e-05, + "loss": 2.8925, + "step": 16346 + }, + { + "epoch": 1.01477434974238, + "grad_norm": 0.1705811033880793, + "learning_rate": 8.368242715929406e-05, + "loss": 2.9546, + "step": 16347 + }, + { + "epoch": 1.014836426842138, + "grad_norm": 0.16108572300726878, + "learning_rate": 8.367975796779722e-05, + "loss": 2.8533, + "step": 16348 + }, + { + "epoch": 1.0148985039418958, + "grad_norm": 0.15690701751620034, + "learning_rate": 8.367708860058492e-05, + "loss": 2.9279, + "step": 16349 + }, + { + "epoch": 1.0149605810416538, + "grad_norm": 0.2028830903409179, + "learning_rate": 8.367441905767106e-05, + "loss": 2.8901, + "step": 16350 + }, + { + "epoch": 1.0150226581414117, + "grad_norm": 0.16347632824004787, + "learning_rate": 8.367174933906959e-05, + "loss": 2.9174, + "step": 16351 + }, + { + "epoch": 1.0150847352411696, + "grad_norm": 0.2001842309629487, + "learning_rate": 8.366907944479441e-05, + "loss": 2.9919, + "step": 16352 + }, + { + "epoch": 1.0151468123409275, + "grad_norm": 0.20762499653726077, + "learning_rate": 8.366640937485949e-05, + "loss": 2.9022, + "step": 16353 + }, + { + "epoch": 1.0152088894406854, + "grad_norm": 0.17174659072033968, + "learning_rate": 8.366373912927871e-05, + "loss": 3.0053, + "step": 16354 + }, + { + "epoch": 1.0152709665404431, + "grad_norm": 0.2138829595594064, + "learning_rate": 8.366106870806608e-05, + "loss": 2.8319, + "step": 16355 + }, + { + "epoch": 1.015333043640201, + "grad_norm": 0.19339947332891866, + "learning_rate": 8.365839811123544e-05, + "loss": 2.9253, + "step": 16356 + }, + { + "epoch": 1.015395120739959, + "grad_norm": 0.1750354947980318, + "learning_rate": 8.365572733880078e-05, + "loss": 2.9335, + "step": 16357 + }, + { + "epoch": 1.0154571978397169, + "grad_norm": 0.2044919947611457, + "learning_rate": 8.365305639077603e-05, + "loss": 2.8848, + "step": 16358 + }, + { + "epoch": 1.0155192749394748, + "grad_norm": 0.1682082010841826, + "learning_rate": 8.36503852671751e-05, + "loss": 2.9421, + "step": 16359 + }, + { + "epoch": 1.0155813520392327, + "grad_norm": 0.1983211435267146, + "learning_rate": 8.364771396801197e-05, + "loss": 2.8674, + "step": 16360 + }, + { + "epoch": 1.0156434291389906, + "grad_norm": 0.17456511378149647, + "learning_rate": 8.364504249330052e-05, + "loss": 2.9297, + "step": 16361 + }, + { + "epoch": 1.0157055062387486, + "grad_norm": 0.19264669764366688, + "learning_rate": 8.364237084305472e-05, + "loss": 2.908, + "step": 16362 + }, + { + "epoch": 1.0157675833385065, + "grad_norm": 0.19914404085326803, + "learning_rate": 8.36396990172885e-05, + "loss": 2.951, + "step": 16363 + }, + { + "epoch": 1.0158296604382644, + "grad_norm": 0.19061914458427714, + "learning_rate": 8.363702701601581e-05, + "loss": 2.9781, + "step": 16364 + }, + { + "epoch": 1.0158917375380223, + "grad_norm": 0.20118973476795818, + "learning_rate": 8.36343548392506e-05, + "loss": 2.9531, + "step": 16365 + }, + { + "epoch": 1.0159538146377802, + "grad_norm": 0.1870502999849114, + "learning_rate": 8.363168248700677e-05, + "loss": 3.0364, + "step": 16366 + }, + { + "epoch": 1.016015891737538, + "grad_norm": 0.18189931721690242, + "learning_rate": 8.36290099592983e-05, + "loss": 2.9345, + "step": 16367 + }, + { + "epoch": 1.0160779688372958, + "grad_norm": 0.1724500896805065, + "learning_rate": 8.362633725613911e-05, + "loss": 2.9903, + "step": 16368 + }, + { + "epoch": 1.0161400459370538, + "grad_norm": 0.21254814668384475, + "learning_rate": 8.362366437754317e-05, + "loss": 2.9312, + "step": 16369 + }, + { + "epoch": 1.0162021230368117, + "grad_norm": 0.18535735841779777, + "learning_rate": 8.362099132352441e-05, + "loss": 2.9268, + "step": 16370 + }, + { + "epoch": 1.0162642001365696, + "grad_norm": 0.1641994767353186, + "learning_rate": 8.361831809409677e-05, + "loss": 2.9114, + "step": 16371 + }, + { + "epoch": 1.0163262772363275, + "grad_norm": 0.19161977341150085, + "learning_rate": 8.36156446892742e-05, + "loss": 2.8919, + "step": 16372 + }, + { + "epoch": 1.0163883543360854, + "grad_norm": 0.17381429617233865, + "learning_rate": 8.361297110907067e-05, + "loss": 2.943, + "step": 16373 + }, + { + "epoch": 1.0164504314358433, + "grad_norm": 0.176706765111652, + "learning_rate": 8.361029735350008e-05, + "loss": 2.9443, + "step": 16374 + }, + { + "epoch": 1.0165125085356013, + "grad_norm": 0.15877127764812823, + "learning_rate": 8.360762342257645e-05, + "loss": 2.8729, + "step": 16375 + }, + { + "epoch": 1.0165745856353592, + "grad_norm": 0.1796301459953727, + "learning_rate": 8.360494931631365e-05, + "loss": 2.977, + "step": 16376 + }, + { + "epoch": 1.016636662735117, + "grad_norm": 0.15891387604200416, + "learning_rate": 8.360227503472568e-05, + "loss": 2.9412, + "step": 16377 + }, + { + "epoch": 1.016698739834875, + "grad_norm": 0.17567154321908093, + "learning_rate": 8.35996005778265e-05, + "loss": 2.951, + "step": 16378 + }, + { + "epoch": 1.0167608169346327, + "grad_norm": 0.1678357873330068, + "learning_rate": 8.359692594563004e-05, + "loss": 3.055, + "step": 16379 + }, + { + "epoch": 1.0168228940343906, + "grad_norm": 0.16684306329341853, + "learning_rate": 8.359425113815027e-05, + "loss": 2.9663, + "step": 16380 + }, + { + "epoch": 1.0168849711341486, + "grad_norm": 0.15962660834160114, + "learning_rate": 8.359157615540112e-05, + "loss": 2.9825, + "step": 16381 + }, + { + "epoch": 1.0169470482339065, + "grad_norm": 0.16739390947375413, + "learning_rate": 8.358890099739656e-05, + "loss": 2.969, + "step": 16382 + }, + { + "epoch": 1.0170091253336644, + "grad_norm": 0.15922563275930077, + "learning_rate": 8.358622566415057e-05, + "loss": 2.9765, + "step": 16383 + }, + { + "epoch": 1.0170712024334223, + "grad_norm": 0.16100913416455015, + "learning_rate": 8.358355015567707e-05, + "loss": 2.9749, + "step": 16384 + }, + { + "epoch": 1.0171332795331802, + "grad_norm": 0.1624451572232359, + "learning_rate": 8.358087447199004e-05, + "loss": 2.8841, + "step": 16385 + }, + { + "epoch": 1.0171953566329381, + "grad_norm": 0.1576126405232617, + "learning_rate": 8.357819861310345e-05, + "loss": 2.8999, + "step": 16386 + }, + { + "epoch": 1.017257433732696, + "grad_norm": 0.15940714890533764, + "learning_rate": 8.357552257903123e-05, + "loss": 2.8348, + "step": 16387 + }, + { + "epoch": 1.017319510832454, + "grad_norm": 0.14840182703921345, + "learning_rate": 8.357284636978736e-05, + "loss": 2.9498, + "step": 16388 + }, + { + "epoch": 1.017381587932212, + "grad_norm": 0.17241465853759588, + "learning_rate": 8.357016998538581e-05, + "loss": 2.9318, + "step": 16389 + }, + { + "epoch": 1.0174436650319698, + "grad_norm": 0.15067146138703413, + "learning_rate": 8.356749342584052e-05, + "loss": 2.8408, + "step": 16390 + }, + { + "epoch": 1.0175057421317275, + "grad_norm": 0.2147215929405293, + "learning_rate": 8.356481669116548e-05, + "loss": 3.051, + "step": 16391 + }, + { + "epoch": 1.0175678192314854, + "grad_norm": 0.18088628144464367, + "learning_rate": 8.356213978137464e-05, + "loss": 2.9427, + "step": 16392 + }, + { + "epoch": 1.0176298963312433, + "grad_norm": 0.15885873042779078, + "learning_rate": 8.355946269648196e-05, + "loss": 2.9315, + "step": 16393 + }, + { + "epoch": 1.0176919734310013, + "grad_norm": 0.18851566403226927, + "learning_rate": 8.355678543650143e-05, + "loss": 2.9039, + "step": 16394 + }, + { + "epoch": 1.0177540505307592, + "grad_norm": 0.16199584800423966, + "learning_rate": 8.3554108001447e-05, + "loss": 2.8151, + "step": 16395 + }, + { + "epoch": 1.017816127630517, + "grad_norm": 0.1682625795454168, + "learning_rate": 8.355143039133264e-05, + "loss": 2.9095, + "step": 16396 + }, + { + "epoch": 1.017878204730275, + "grad_norm": 0.18156041661181727, + "learning_rate": 8.354875260617232e-05, + "loss": 2.9264, + "step": 16397 + }, + { + "epoch": 1.017940281830033, + "grad_norm": 0.16137358276725872, + "learning_rate": 8.354607464598e-05, + "loss": 2.9029, + "step": 16398 + }, + { + "epoch": 1.0180023589297909, + "grad_norm": 0.16441474860934793, + "learning_rate": 8.35433965107697e-05, + "loss": 2.8175, + "step": 16399 + }, + { + "epoch": 1.0180644360295488, + "grad_norm": 0.3098038836180046, + "learning_rate": 8.354071820055533e-05, + "loss": 2.9297, + "step": 16400 + }, + { + "epoch": 1.0181265131293067, + "grad_norm": 0.18540141778741256, + "learning_rate": 8.353803971535091e-05, + "loss": 2.9518, + "step": 16401 + }, + { + "epoch": 1.0181885902290646, + "grad_norm": 0.3346421084000856, + "learning_rate": 8.353536105517039e-05, + "loss": 2.9973, + "step": 16402 + }, + { + "epoch": 1.0182506673288223, + "grad_norm": 0.19212105656959264, + "learning_rate": 8.353268222002773e-05, + "loss": 2.8726, + "step": 16403 + }, + { + "epoch": 1.0183127444285802, + "grad_norm": 0.29294531410575997, + "learning_rate": 8.353000320993695e-05, + "loss": 3.0389, + "step": 16404 + }, + { + "epoch": 1.0183748215283381, + "grad_norm": 0.1988963810014908, + "learning_rate": 8.352732402491199e-05, + "loss": 2.9238, + "step": 16405 + }, + { + "epoch": 1.018436898628096, + "grad_norm": 0.19957072822615607, + "learning_rate": 8.352464466496686e-05, + "loss": 2.9702, + "step": 16406 + }, + { + "epoch": 1.018498975727854, + "grad_norm": 0.1637601621128114, + "learning_rate": 8.35219651301155e-05, + "loss": 2.9612, + "step": 16407 + }, + { + "epoch": 1.018561052827612, + "grad_norm": 0.2392706964877419, + "learning_rate": 8.351928542037192e-05, + "loss": 3.0113, + "step": 16408 + }, + { + "epoch": 1.0186231299273698, + "grad_norm": 0.19630313462255647, + "learning_rate": 8.351660553575009e-05, + "loss": 2.8859, + "step": 16409 + }, + { + "epoch": 1.0186852070271277, + "grad_norm": 0.1675785155836413, + "learning_rate": 8.351392547626398e-05, + "loss": 2.8461, + "step": 16410 + }, + { + "epoch": 1.0187472841268856, + "grad_norm": 0.19467829734581155, + "learning_rate": 8.351124524192762e-05, + "loss": 2.9548, + "step": 16411 + }, + { + "epoch": 1.0188093612266436, + "grad_norm": 0.3115667833169644, + "learning_rate": 8.350856483275494e-05, + "loss": 2.907, + "step": 16412 + }, + { + "epoch": 1.0188714383264015, + "grad_norm": 0.179495916649535, + "learning_rate": 8.350588424875993e-05, + "loss": 2.9657, + "step": 16413 + }, + { + "epoch": 1.0189335154261594, + "grad_norm": 0.21235855352688984, + "learning_rate": 8.350320348995659e-05, + "loss": 3.0152, + "step": 16414 + }, + { + "epoch": 1.018995592525917, + "grad_norm": 0.22922103771480076, + "learning_rate": 8.350052255635891e-05, + "loss": 2.9316, + "step": 16415 + }, + { + "epoch": 1.019057669625675, + "grad_norm": 0.1898963109790056, + "learning_rate": 8.349784144798088e-05, + "loss": 2.9343, + "step": 16416 + }, + { + "epoch": 1.019119746725433, + "grad_norm": 0.1675178163664603, + "learning_rate": 8.349516016483646e-05, + "loss": 2.9147, + "step": 16417 + }, + { + "epoch": 1.0191818238251908, + "grad_norm": 0.18419225008728793, + "learning_rate": 8.349247870693967e-05, + "loss": 2.8849, + "step": 16418 + }, + { + "epoch": 1.0192439009249488, + "grad_norm": 0.21639256106988392, + "learning_rate": 8.348979707430448e-05, + "loss": 2.8352, + "step": 16419 + }, + { + "epoch": 1.0193059780247067, + "grad_norm": 0.19908012915419007, + "learning_rate": 8.348711526694491e-05, + "loss": 3.0395, + "step": 16420 + }, + { + "epoch": 1.0193680551244646, + "grad_norm": 0.1836832315429896, + "learning_rate": 8.34844332848749e-05, + "loss": 2.9508, + "step": 16421 + }, + { + "epoch": 1.0194301322242225, + "grad_norm": 0.17147719339960896, + "learning_rate": 8.348175112810849e-05, + "loss": 2.9626, + "step": 16422 + }, + { + "epoch": 1.0194922093239804, + "grad_norm": 0.18103853542537754, + "learning_rate": 8.347906879665966e-05, + "loss": 2.893, + "step": 16423 + }, + { + "epoch": 1.0195542864237384, + "grad_norm": 0.23086582342652792, + "learning_rate": 8.34763862905424e-05, + "loss": 2.9848, + "step": 16424 + }, + { + "epoch": 1.0196163635234963, + "grad_norm": 0.1796938335297635, + "learning_rate": 8.34737036097707e-05, + "loss": 2.9448, + "step": 16425 + }, + { + "epoch": 1.0196784406232542, + "grad_norm": 0.1975130474633671, + "learning_rate": 8.347102075435856e-05, + "loss": 2.9562, + "step": 16426 + }, + { + "epoch": 1.0197405177230119, + "grad_norm": 0.25592153827792513, + "learning_rate": 8.346833772432e-05, + "loss": 2.7842, + "step": 16427 + }, + { + "epoch": 1.0198025948227698, + "grad_norm": 0.24469352675770692, + "learning_rate": 8.346565451966897e-05, + "loss": 2.8389, + "step": 16428 + }, + { + "epoch": 1.0198646719225277, + "grad_norm": 0.21909141293945406, + "learning_rate": 8.34629711404195e-05, + "loss": 2.9286, + "step": 16429 + }, + { + "epoch": 1.0199267490222856, + "grad_norm": 0.29195701106554955, + "learning_rate": 8.34602875865856e-05, + "loss": 2.9727, + "step": 16430 + }, + { + "epoch": 1.0199888261220436, + "grad_norm": 0.2177970086585878, + "learning_rate": 8.345760385818124e-05, + "loss": 2.9644, + "step": 16431 + }, + { + "epoch": 1.0200509032218015, + "grad_norm": 0.2495937772363734, + "learning_rate": 8.345491995522046e-05, + "loss": 2.8313, + "step": 16432 + }, + { + "epoch": 1.0201129803215594, + "grad_norm": 0.21185869820933526, + "learning_rate": 8.345223587771722e-05, + "loss": 2.9167, + "step": 16433 + }, + { + "epoch": 1.0201750574213173, + "grad_norm": 0.29109005328302057, + "learning_rate": 8.344955162568555e-05, + "loss": 2.8524, + "step": 16434 + }, + { + "epoch": 1.0202371345210752, + "grad_norm": 0.26927323361820515, + "learning_rate": 8.344686719913946e-05, + "loss": 2.9886, + "step": 16435 + }, + { + "epoch": 1.0202992116208331, + "grad_norm": 0.25512864691797194, + "learning_rate": 8.344418259809293e-05, + "loss": 3.0028, + "step": 16436 + }, + { + "epoch": 1.020361288720591, + "grad_norm": 0.22080186306242577, + "learning_rate": 8.344149782255999e-05, + "loss": 3.0173, + "step": 16437 + }, + { + "epoch": 1.020423365820349, + "grad_norm": 0.23396776105331124, + "learning_rate": 8.343881287255464e-05, + "loss": 2.8776, + "step": 16438 + }, + { + "epoch": 1.0204854429201067, + "grad_norm": 0.24781863908511473, + "learning_rate": 8.343612774809089e-05, + "loss": 2.994, + "step": 16439 + }, + { + "epoch": 1.0205475200198646, + "grad_norm": 0.2433081280555355, + "learning_rate": 8.343344244918273e-05, + "loss": 2.9455, + "step": 16440 + }, + { + "epoch": 1.0206095971196225, + "grad_norm": 0.1896022131360636, + "learning_rate": 8.34307569758442e-05, + "loss": 2.9398, + "step": 16441 + }, + { + "epoch": 1.0206716742193804, + "grad_norm": 0.21675239140985386, + "learning_rate": 8.342807132808927e-05, + "loss": 2.916, + "step": 16442 + }, + { + "epoch": 1.0207337513191383, + "grad_norm": 0.2098151654610228, + "learning_rate": 8.3425385505932e-05, + "loss": 2.8845, + "step": 16443 + }, + { + "epoch": 1.0207958284188963, + "grad_norm": 0.23155184197447942, + "learning_rate": 8.342269950938637e-05, + "loss": 2.9336, + "step": 16444 + }, + { + "epoch": 1.0208579055186542, + "grad_norm": 0.1887908153898514, + "learning_rate": 8.342001333846641e-05, + "loss": 2.9324, + "step": 16445 + }, + { + "epoch": 1.020919982618412, + "grad_norm": 0.20436115274852637, + "learning_rate": 8.341732699318611e-05, + "loss": 2.9244, + "step": 16446 + }, + { + "epoch": 1.02098205971817, + "grad_norm": 0.17285556573057542, + "learning_rate": 8.341464047355952e-05, + "loss": 2.8879, + "step": 16447 + }, + { + "epoch": 1.021044136817928, + "grad_norm": 0.18199815735827368, + "learning_rate": 8.341195377960065e-05, + "loss": 2.8738, + "step": 16448 + }, + { + "epoch": 1.0211062139176859, + "grad_norm": 0.18413664313807035, + "learning_rate": 8.340926691132349e-05, + "loss": 2.9381, + "step": 16449 + }, + { + "epoch": 1.0211682910174438, + "grad_norm": 0.17939258298194946, + "learning_rate": 8.340657986874207e-05, + "loss": 2.907, + "step": 16450 + }, + { + "epoch": 1.0212303681172015, + "grad_norm": 0.1976395735660294, + "learning_rate": 8.340389265187042e-05, + "loss": 2.9192, + "step": 16451 + }, + { + "epoch": 1.0212924452169594, + "grad_norm": 0.18925198958408065, + "learning_rate": 8.340120526072255e-05, + "loss": 2.8938, + "step": 16452 + }, + { + "epoch": 1.0213545223167173, + "grad_norm": 0.17328515063024166, + "learning_rate": 8.339851769531247e-05, + "loss": 2.8862, + "step": 16453 + }, + { + "epoch": 1.0214165994164752, + "grad_norm": 0.16762875367910754, + "learning_rate": 8.339582995565423e-05, + "loss": 2.8633, + "step": 16454 + }, + { + "epoch": 1.0214786765162331, + "grad_norm": 0.158274016568065, + "learning_rate": 8.339314204176185e-05, + "loss": 2.8116, + "step": 16455 + }, + { + "epoch": 1.021540753615991, + "grad_norm": 0.2500623237887646, + "learning_rate": 8.339045395364933e-05, + "loss": 2.9595, + "step": 16456 + }, + { + "epoch": 1.021602830715749, + "grad_norm": 0.17688863349577258, + "learning_rate": 8.338776569133071e-05, + "loss": 2.9116, + "step": 16457 + }, + { + "epoch": 1.021664907815507, + "grad_norm": 0.20212634974897753, + "learning_rate": 8.338507725482e-05, + "loss": 2.9462, + "step": 16458 + }, + { + "epoch": 1.0217269849152648, + "grad_norm": 0.1677795386842592, + "learning_rate": 8.338238864413124e-05, + "loss": 2.7872, + "step": 16459 + }, + { + "epoch": 1.0217890620150227, + "grad_norm": 0.16550826916463954, + "learning_rate": 8.337969985927846e-05, + "loss": 2.9241, + "step": 16460 + }, + { + "epoch": 1.0218511391147806, + "grad_norm": 0.18884908595319697, + "learning_rate": 8.337701090027568e-05, + "loss": 2.9498, + "step": 16461 + }, + { + "epoch": 1.0219132162145383, + "grad_norm": 0.28432109762069796, + "learning_rate": 8.337432176713693e-05, + "loss": 2.7842, + "step": 16462 + }, + { + "epoch": 1.0219752933142963, + "grad_norm": 0.18300541496538417, + "learning_rate": 8.337163245987624e-05, + "loss": 2.9952, + "step": 16463 + }, + { + "epoch": 1.0220373704140542, + "grad_norm": 0.16998833967813984, + "learning_rate": 8.336894297850766e-05, + "loss": 2.8206, + "step": 16464 + }, + { + "epoch": 1.022099447513812, + "grad_norm": 0.20581690913058234, + "learning_rate": 8.33662533230452e-05, + "loss": 2.9275, + "step": 16465 + }, + { + "epoch": 1.02216152461357, + "grad_norm": 0.22843775278226172, + "learning_rate": 8.336356349350287e-05, + "loss": 2.8768, + "step": 16466 + }, + { + "epoch": 1.022223601713328, + "grad_norm": 0.2605042532813404, + "learning_rate": 8.336087348989477e-05, + "loss": 2.89, + "step": 16467 + }, + { + "epoch": 1.0222856788130859, + "grad_norm": 0.2356304162892891, + "learning_rate": 8.335818331223485e-05, + "loss": 2.8563, + "step": 16468 + }, + { + "epoch": 1.0223477559128438, + "grad_norm": 0.16283779339537335, + "learning_rate": 8.335549296053722e-05, + "loss": 2.9978, + "step": 16469 + }, + { + "epoch": 1.0224098330126017, + "grad_norm": 0.19078049258894852, + "learning_rate": 8.335280243481587e-05, + "loss": 2.942, + "step": 16470 + }, + { + "epoch": 1.0224719101123596, + "grad_norm": 0.27471689911011526, + "learning_rate": 8.335011173508485e-05, + "loss": 2.9062, + "step": 16471 + }, + { + "epoch": 1.0225339872121175, + "grad_norm": 0.18678578805348914, + "learning_rate": 8.334742086135821e-05, + "loss": 2.9193, + "step": 16472 + }, + { + "epoch": 1.0225960643118754, + "grad_norm": 0.17524908892643673, + "learning_rate": 8.334472981364999e-05, + "loss": 2.9824, + "step": 16473 + }, + { + "epoch": 1.0226581414116334, + "grad_norm": 0.26121612224222346, + "learning_rate": 8.334203859197421e-05, + "loss": 2.891, + "step": 16474 + }, + { + "epoch": 1.022720218511391, + "grad_norm": 0.20953402031667948, + "learning_rate": 8.333934719634493e-05, + "loss": 2.984, + "step": 16475 + }, + { + "epoch": 1.022782295611149, + "grad_norm": 0.1758647402534251, + "learning_rate": 8.333665562677617e-05, + "loss": 2.9986, + "step": 16476 + }, + { + "epoch": 1.022844372710907, + "grad_norm": 0.19880769433536608, + "learning_rate": 8.333396388328198e-05, + "loss": 3.0055, + "step": 16477 + }, + { + "epoch": 1.0229064498106648, + "grad_norm": 0.15432122798870793, + "learning_rate": 8.333127196587642e-05, + "loss": 2.8886, + "step": 16478 + }, + { + "epoch": 1.0229685269104227, + "grad_norm": 0.189765024541515, + "learning_rate": 8.332857987457352e-05, + "loss": 2.9102, + "step": 16479 + }, + { + "epoch": 1.0230306040101806, + "grad_norm": 0.16934731785109058, + "learning_rate": 8.332588760938734e-05, + "loss": 2.9503, + "step": 16480 + }, + { + "epoch": 1.0230926811099386, + "grad_norm": 0.16224028001414242, + "learning_rate": 8.33231951703319e-05, + "loss": 2.9225, + "step": 16481 + }, + { + "epoch": 1.0231547582096965, + "grad_norm": 0.16771219310307361, + "learning_rate": 8.332050255742125e-05, + "loss": 2.8898, + "step": 16482 + }, + { + "epoch": 1.0232168353094544, + "grad_norm": 0.1768900234561609, + "learning_rate": 8.331780977066948e-05, + "loss": 2.889, + "step": 16483 + }, + { + "epoch": 1.0232789124092123, + "grad_norm": 0.17875524919978705, + "learning_rate": 8.331511681009058e-05, + "loss": 2.9489, + "step": 16484 + }, + { + "epoch": 1.0233409895089702, + "grad_norm": 0.16010387657547107, + "learning_rate": 8.331242367569863e-05, + "loss": 2.9311, + "step": 16485 + }, + { + "epoch": 1.023403066608728, + "grad_norm": 0.16100799465655022, + "learning_rate": 8.330973036750768e-05, + "loss": 2.9745, + "step": 16486 + }, + { + "epoch": 1.0234651437084858, + "grad_norm": 0.18083191368612248, + "learning_rate": 8.33070368855318e-05, + "loss": 2.8959, + "step": 16487 + }, + { + "epoch": 1.0235272208082438, + "grad_norm": 0.17797132563883242, + "learning_rate": 8.330434322978501e-05, + "loss": 2.8988, + "step": 16488 + }, + { + "epoch": 1.0235892979080017, + "grad_norm": 0.16234319553702783, + "learning_rate": 8.330164940028137e-05, + "loss": 2.9578, + "step": 16489 + }, + { + "epoch": 1.0236513750077596, + "grad_norm": 0.17691876791577527, + "learning_rate": 8.329895539703496e-05, + "loss": 2.9615, + "step": 16490 + }, + { + "epoch": 1.0237134521075175, + "grad_norm": 0.16496054344663136, + "learning_rate": 8.329626122005981e-05, + "loss": 2.9554, + "step": 16491 + }, + { + "epoch": 1.0237755292072754, + "grad_norm": 0.17970590911770282, + "learning_rate": 8.329356686936997e-05, + "loss": 2.9352, + "step": 16492 + }, + { + "epoch": 1.0238376063070334, + "grad_norm": 0.1659066571920273, + "learning_rate": 8.329087234497952e-05, + "loss": 2.9179, + "step": 16493 + }, + { + "epoch": 1.0238996834067913, + "grad_norm": 0.16049058256789195, + "learning_rate": 8.32881776469025e-05, + "loss": 2.9624, + "step": 16494 + }, + { + "epoch": 1.0239617605065492, + "grad_norm": 0.18248467559031917, + "learning_rate": 8.328548277515298e-05, + "loss": 2.9818, + "step": 16495 + }, + { + "epoch": 1.024023837606307, + "grad_norm": 0.17222793603644854, + "learning_rate": 8.328278772974502e-05, + "loss": 2.8921, + "step": 16496 + }, + { + "epoch": 1.024085914706065, + "grad_norm": 0.1716341802457551, + "learning_rate": 8.328009251069266e-05, + "loss": 2.9306, + "step": 16497 + }, + { + "epoch": 1.024147991805823, + "grad_norm": 0.18504143184910957, + "learning_rate": 8.327739711801e-05, + "loss": 3.0145, + "step": 16498 + }, + { + "epoch": 1.0242100689055806, + "grad_norm": 0.23929920227782223, + "learning_rate": 8.327470155171107e-05, + "loss": 2.905, + "step": 16499 + }, + { + "epoch": 1.0242721460053386, + "grad_norm": 0.1849253131754731, + "learning_rate": 8.327200581180993e-05, + "loss": 2.9188, + "step": 16500 + }, + { + "epoch": 1.0243342231050965, + "grad_norm": 0.17819134162767694, + "learning_rate": 8.326930989832069e-05, + "loss": 2.973, + "step": 16501 + }, + { + "epoch": 1.0243963002048544, + "grad_norm": 0.18143740945775017, + "learning_rate": 8.326661381125737e-05, + "loss": 2.9929, + "step": 16502 + }, + { + "epoch": 1.0244583773046123, + "grad_norm": 0.1605236967206026, + "learning_rate": 8.326391755063404e-05, + "loss": 3.0045, + "step": 16503 + }, + { + "epoch": 1.0245204544043702, + "grad_norm": 0.1958044817567679, + "learning_rate": 8.326122111646479e-05, + "loss": 2.939, + "step": 16504 + }, + { + "epoch": 1.0245825315041281, + "grad_norm": 0.15840411076104938, + "learning_rate": 8.325852450876367e-05, + "loss": 3.0001, + "step": 16505 + }, + { + "epoch": 1.024644608603886, + "grad_norm": 0.18326481307229825, + "learning_rate": 8.325582772754474e-05, + "loss": 2.8998, + "step": 16506 + }, + { + "epoch": 1.024706685703644, + "grad_norm": 0.17873224016807646, + "learning_rate": 8.32531307728221e-05, + "loss": 2.8974, + "step": 16507 + }, + { + "epoch": 1.024768762803402, + "grad_norm": 0.1785427824101997, + "learning_rate": 8.325043364460981e-05, + "loss": 2.9106, + "step": 16508 + }, + { + "epoch": 1.0248308399031598, + "grad_norm": 0.19575960482617094, + "learning_rate": 8.324773634292194e-05, + "loss": 2.8656, + "step": 16509 + }, + { + "epoch": 1.0248929170029175, + "grad_norm": 0.17929976550329974, + "learning_rate": 8.324503886777255e-05, + "loss": 2.9086, + "step": 16510 + }, + { + "epoch": 1.0249549941026754, + "grad_norm": 0.17127872662041785, + "learning_rate": 8.324234121917572e-05, + "loss": 2.8691, + "step": 16511 + }, + { + "epoch": 1.0250170712024333, + "grad_norm": 0.1667945875060126, + "learning_rate": 8.323964339714552e-05, + "loss": 2.9727, + "step": 16512 + }, + { + "epoch": 1.0250791483021913, + "grad_norm": 0.17414690640442892, + "learning_rate": 8.323694540169602e-05, + "loss": 3.0324, + "step": 16513 + }, + { + "epoch": 1.0251412254019492, + "grad_norm": 0.18757507688753874, + "learning_rate": 8.323424723284132e-05, + "loss": 2.9724, + "step": 16514 + }, + { + "epoch": 1.025203302501707, + "grad_norm": 0.21548531714103356, + "learning_rate": 8.32315488905955e-05, + "loss": 2.8553, + "step": 16515 + }, + { + "epoch": 1.025265379601465, + "grad_norm": 0.2086844654205359, + "learning_rate": 8.322885037497261e-05, + "loss": 2.9931, + "step": 16516 + }, + { + "epoch": 1.025327456701223, + "grad_norm": 0.16180463090779304, + "learning_rate": 8.322615168598673e-05, + "loss": 2.9309, + "step": 16517 + }, + { + "epoch": 1.0253895338009809, + "grad_norm": 0.18310902909950041, + "learning_rate": 8.322345282365196e-05, + "loss": 2.8592, + "step": 16518 + }, + { + "epoch": 1.0254516109007388, + "grad_norm": 0.1956985282738812, + "learning_rate": 8.322075378798237e-05, + "loss": 2.9257, + "step": 16519 + }, + { + "epoch": 1.0255136880004967, + "grad_norm": 0.20515117252571688, + "learning_rate": 8.321805457899206e-05, + "loss": 2.9421, + "step": 16520 + }, + { + "epoch": 1.0255757651002546, + "grad_norm": 0.23548049877196867, + "learning_rate": 8.321535519669506e-05, + "loss": 2.9327, + "step": 16521 + }, + { + "epoch": 1.0256378422000123, + "grad_norm": 0.17294944534846948, + "learning_rate": 8.321265564110552e-05, + "loss": 2.8693, + "step": 16522 + }, + { + "epoch": 1.0256999192997702, + "grad_norm": 0.1566197431348875, + "learning_rate": 8.320995591223748e-05, + "loss": 2.8787, + "step": 16523 + }, + { + "epoch": 1.0257619963995281, + "grad_norm": 0.17318765648614867, + "learning_rate": 8.320725601010503e-05, + "loss": 2.9695, + "step": 16524 + }, + { + "epoch": 1.025824073499286, + "grad_norm": 0.15887347414115366, + "learning_rate": 8.320455593472229e-05, + "loss": 2.9571, + "step": 16525 + }, + { + "epoch": 1.025886150599044, + "grad_norm": 0.1527868271053795, + "learning_rate": 8.32018556861033e-05, + "loss": 2.8299, + "step": 16526 + }, + { + "epoch": 1.025948227698802, + "grad_norm": 0.15506230400923338, + "learning_rate": 8.319915526426217e-05, + "loss": 2.9465, + "step": 16527 + }, + { + "epoch": 1.0260103047985598, + "grad_norm": 0.15056763796315523, + "learning_rate": 8.3196454669213e-05, + "loss": 2.9535, + "step": 16528 + }, + { + "epoch": 1.0260723818983177, + "grad_norm": 0.5280606236243325, + "learning_rate": 8.319375390096985e-05, + "loss": 2.9002, + "step": 16529 + }, + { + "epoch": 1.0261344589980756, + "grad_norm": 0.17841281935637127, + "learning_rate": 8.319105295954683e-05, + "loss": 2.9635, + "step": 16530 + }, + { + "epoch": 1.0261965360978336, + "grad_norm": 0.2105056173776782, + "learning_rate": 8.318835184495804e-05, + "loss": 3.0071, + "step": 16531 + }, + { + "epoch": 1.0262586131975915, + "grad_norm": 0.2235428325089992, + "learning_rate": 8.318565055721756e-05, + "loss": 2.8875, + "step": 16532 + }, + { + "epoch": 1.0263206902973494, + "grad_norm": 0.1820970359101193, + "learning_rate": 8.318294909633947e-05, + "loss": 2.9507, + "step": 16533 + }, + { + "epoch": 1.026382767397107, + "grad_norm": 0.2323550440769622, + "learning_rate": 8.31802474623379e-05, + "loss": 2.9911, + "step": 16534 + }, + { + "epoch": 1.026444844496865, + "grad_norm": 0.21129479877050172, + "learning_rate": 8.317754565522691e-05, + "loss": 2.9528, + "step": 16535 + }, + { + "epoch": 1.026506921596623, + "grad_norm": 0.2052417102689249, + "learning_rate": 8.317484367502062e-05, + "loss": 2.8939, + "step": 16536 + }, + { + "epoch": 1.0265689986963809, + "grad_norm": 0.18720770941717443, + "learning_rate": 8.317214152173312e-05, + "loss": 2.8667, + "step": 16537 + }, + { + "epoch": 1.0266310757961388, + "grad_norm": 0.2308664930766341, + "learning_rate": 8.31694391953785e-05, + "loss": 2.9886, + "step": 16538 + }, + { + "epoch": 1.0266931528958967, + "grad_norm": 0.21859700949681202, + "learning_rate": 8.316673669597087e-05, + "loss": 3.0677, + "step": 16539 + }, + { + "epoch": 1.0267552299956546, + "grad_norm": 0.18639185393413807, + "learning_rate": 8.31640340235243e-05, + "loss": 2.9444, + "step": 16540 + }, + { + "epoch": 1.0268173070954125, + "grad_norm": 0.2022844769034673, + "learning_rate": 8.316133117805293e-05, + "loss": 2.8664, + "step": 16541 + }, + { + "epoch": 1.0268793841951704, + "grad_norm": 0.1950177161562544, + "learning_rate": 8.315862815957083e-05, + "loss": 2.9146, + "step": 16542 + }, + { + "epoch": 1.0269414612949284, + "grad_norm": 0.18549622296076512, + "learning_rate": 8.315592496809215e-05, + "loss": 2.9405, + "step": 16543 + }, + { + "epoch": 1.0270035383946863, + "grad_norm": 0.31773095623622366, + "learning_rate": 8.315322160363092e-05, + "loss": 2.915, + "step": 16544 + }, + { + "epoch": 1.0270656154944442, + "grad_norm": 0.19675283539690844, + "learning_rate": 8.315051806620131e-05, + "loss": 2.892, + "step": 16545 + }, + { + "epoch": 1.027127692594202, + "grad_norm": 0.17561761869415535, + "learning_rate": 8.31478143558174e-05, + "loss": 2.8781, + "step": 16546 + }, + { + "epoch": 1.0271897696939598, + "grad_norm": 0.1757836447658597, + "learning_rate": 8.31451104724933e-05, + "loss": 3.0397, + "step": 16547 + }, + { + "epoch": 1.0272518467937177, + "grad_norm": 0.19780394309641433, + "learning_rate": 8.31424064162431e-05, + "loss": 2.8833, + "step": 16548 + }, + { + "epoch": 1.0273139238934756, + "grad_norm": 0.17865722140798265, + "learning_rate": 8.313970218708092e-05, + "loss": 2.9156, + "step": 16549 + }, + { + "epoch": 1.0273760009932336, + "grad_norm": 0.19193800222703172, + "learning_rate": 8.313699778502086e-05, + "loss": 2.9381, + "step": 16550 + }, + { + "epoch": 1.0274380780929915, + "grad_norm": 0.17264959421749523, + "learning_rate": 8.313429321007706e-05, + "loss": 2.9012, + "step": 16551 + }, + { + "epoch": 1.0275001551927494, + "grad_norm": 0.16104391130899806, + "learning_rate": 8.31315884622636e-05, + "loss": 2.9455, + "step": 16552 + }, + { + "epoch": 1.0275622322925073, + "grad_norm": 0.17838316704846768, + "learning_rate": 8.31288835415946e-05, + "loss": 2.9441, + "step": 16553 + }, + { + "epoch": 1.0276243093922652, + "grad_norm": 0.20675576072610302, + "learning_rate": 8.312617844808417e-05, + "loss": 2.8765, + "step": 16554 + }, + { + "epoch": 1.0276863864920232, + "grad_norm": 0.14764074390474008, + "learning_rate": 8.312347318174643e-05, + "loss": 2.9837, + "step": 16555 + }, + { + "epoch": 1.027748463591781, + "grad_norm": 0.1804250115075464, + "learning_rate": 8.312076774259548e-05, + "loss": 2.942, + "step": 16556 + }, + { + "epoch": 1.027810540691539, + "grad_norm": 0.15717850418589638, + "learning_rate": 8.311806213064546e-05, + "loss": 3.0278, + "step": 16557 + }, + { + "epoch": 1.0278726177912967, + "grad_norm": 0.174058830737917, + "learning_rate": 8.311535634591045e-05, + "loss": 2.8796, + "step": 16558 + }, + { + "epoch": 1.0279346948910546, + "grad_norm": 0.18842059870976557, + "learning_rate": 8.311265038840462e-05, + "loss": 2.8996, + "step": 16559 + }, + { + "epoch": 1.0279967719908125, + "grad_norm": 0.15568749982465563, + "learning_rate": 8.310994425814201e-05, + "loss": 2.9347, + "step": 16560 + }, + { + "epoch": 1.0280588490905704, + "grad_norm": 0.1772290791733698, + "learning_rate": 8.310723795513681e-05, + "loss": 2.9066, + "step": 16561 + }, + { + "epoch": 1.0281209261903284, + "grad_norm": 0.146880960850792, + "learning_rate": 8.310453147940311e-05, + "loss": 2.8856, + "step": 16562 + }, + { + "epoch": 1.0281830032900863, + "grad_norm": 0.16939099460649804, + "learning_rate": 8.310182483095503e-05, + "loss": 2.9069, + "step": 16563 + }, + { + "epoch": 1.0282450803898442, + "grad_norm": 0.14955324036189446, + "learning_rate": 8.309911800980669e-05, + "loss": 2.937, + "step": 16564 + }, + { + "epoch": 1.028307157489602, + "grad_norm": 0.15043066160668692, + "learning_rate": 8.309641101597223e-05, + "loss": 2.9008, + "step": 16565 + }, + { + "epoch": 1.02836923458936, + "grad_norm": 0.18019073961807727, + "learning_rate": 8.309370384946576e-05, + "loss": 2.9529, + "step": 16566 + }, + { + "epoch": 1.028431311689118, + "grad_norm": 0.15261709925519873, + "learning_rate": 8.309099651030138e-05, + "loss": 2.9301, + "step": 16567 + }, + { + "epoch": 1.0284933887888759, + "grad_norm": 0.16907272605853133, + "learning_rate": 8.308828899849324e-05, + "loss": 2.9776, + "step": 16568 + }, + { + "epoch": 1.0285554658886338, + "grad_norm": 0.17467546215105428, + "learning_rate": 8.308558131405548e-05, + "loss": 2.9707, + "step": 16569 + }, + { + "epoch": 1.0286175429883915, + "grad_norm": 0.16415384565270477, + "learning_rate": 8.30828734570022e-05, + "loss": 2.9555, + "step": 16570 + }, + { + "epoch": 1.0286796200881494, + "grad_norm": 0.14063113252195683, + "learning_rate": 8.308016542734756e-05, + "loss": 2.9729, + "step": 16571 + }, + { + "epoch": 1.0287416971879073, + "grad_norm": 0.16944936849282835, + "learning_rate": 8.307745722510565e-05, + "loss": 2.9653, + "step": 16572 + }, + { + "epoch": 1.0288037742876652, + "grad_norm": 0.17406786234932264, + "learning_rate": 8.307474885029061e-05, + "loss": 2.9805, + "step": 16573 + }, + { + "epoch": 1.0288658513874231, + "grad_norm": 0.16192577489956375, + "learning_rate": 8.307204030291658e-05, + "loss": 2.8431, + "step": 16574 + }, + { + "epoch": 1.028927928487181, + "grad_norm": 0.1688003469348626, + "learning_rate": 8.30693315829977e-05, + "loss": 2.8804, + "step": 16575 + }, + { + "epoch": 1.028990005586939, + "grad_norm": 0.16433571024108942, + "learning_rate": 8.306662269054808e-05, + "loss": 2.8822, + "step": 16576 + }, + { + "epoch": 1.029052082686697, + "grad_norm": 0.15722642626075292, + "learning_rate": 8.306391362558186e-05, + "loss": 2.8164, + "step": 16577 + }, + { + "epoch": 1.0291141597864548, + "grad_norm": 0.20848254979474018, + "learning_rate": 8.306120438811317e-05, + "loss": 2.8628, + "step": 16578 + }, + { + "epoch": 1.0291762368862127, + "grad_norm": 0.16696431669139503, + "learning_rate": 8.305849497815616e-05, + "loss": 2.8884, + "step": 16579 + }, + { + "epoch": 1.0292383139859707, + "grad_norm": 0.20565571962771317, + "learning_rate": 8.305578539572496e-05, + "loss": 2.9825, + "step": 16580 + }, + { + "epoch": 1.0293003910857286, + "grad_norm": 0.19646881185973272, + "learning_rate": 8.305307564083369e-05, + "loss": 2.9203, + "step": 16581 + }, + { + "epoch": 1.0293624681854863, + "grad_norm": 0.17212725140014065, + "learning_rate": 8.305036571349651e-05, + "loss": 2.9457, + "step": 16582 + }, + { + "epoch": 1.0294245452852442, + "grad_norm": 0.17145289803457503, + "learning_rate": 8.304765561372754e-05, + "loss": 2.8957, + "step": 16583 + }, + { + "epoch": 1.029486622385002, + "grad_norm": 0.16516776963211954, + "learning_rate": 8.304494534154094e-05, + "loss": 2.9111, + "step": 16584 + }, + { + "epoch": 1.02954869948476, + "grad_norm": 0.19874961629566604, + "learning_rate": 8.304223489695083e-05, + "loss": 2.9786, + "step": 16585 + }, + { + "epoch": 1.029610776584518, + "grad_norm": 0.19870369845703004, + "learning_rate": 8.303952427997137e-05, + "loss": 2.9422, + "step": 16586 + }, + { + "epoch": 1.0296728536842759, + "grad_norm": 0.15641898911240998, + "learning_rate": 8.303681349061668e-05, + "loss": 2.9185, + "step": 16587 + }, + { + "epoch": 1.0297349307840338, + "grad_norm": 0.1769916399878034, + "learning_rate": 8.303410252890092e-05, + "loss": 2.9399, + "step": 16588 + }, + { + "epoch": 1.0297970078837917, + "grad_norm": 0.16622981671382586, + "learning_rate": 8.303139139483822e-05, + "loss": 2.8955, + "step": 16589 + }, + { + "epoch": 1.0298590849835496, + "grad_norm": 0.1944338489571031, + "learning_rate": 8.302868008844273e-05, + "loss": 3.0152, + "step": 16590 + }, + { + "epoch": 1.0299211620833075, + "grad_norm": 0.17037894720107677, + "learning_rate": 8.302596860972862e-05, + "loss": 2.947, + "step": 16591 + }, + { + "epoch": 1.0299832391830654, + "grad_norm": 0.212175427825171, + "learning_rate": 8.302325695870999e-05, + "loss": 2.961, + "step": 16592 + }, + { + "epoch": 1.0300453162828234, + "grad_norm": 0.15501132170028867, + "learning_rate": 8.302054513540103e-05, + "loss": 2.8688, + "step": 16593 + }, + { + "epoch": 1.030107393382581, + "grad_norm": 0.19431436121795545, + "learning_rate": 8.301783313981585e-05, + "loss": 2.8628, + "step": 16594 + }, + { + "epoch": 1.030169470482339, + "grad_norm": 0.23570390499492955, + "learning_rate": 8.301512097196864e-05, + "loss": 2.946, + "step": 16595 + }, + { + "epoch": 1.030231547582097, + "grad_norm": 0.20642864048096807, + "learning_rate": 8.301240863187352e-05, + "loss": 2.9129, + "step": 16596 + }, + { + "epoch": 1.0302936246818548, + "grad_norm": 0.16316238484651452, + "learning_rate": 8.300969611954466e-05, + "loss": 2.8433, + "step": 16597 + }, + { + "epoch": 1.0303557017816127, + "grad_norm": 0.22449601671448838, + "learning_rate": 8.300698343499618e-05, + "loss": 2.9321, + "step": 16598 + }, + { + "epoch": 1.0304177788813706, + "grad_norm": 0.15382455409968174, + "learning_rate": 8.300427057824227e-05, + "loss": 2.9987, + "step": 16599 + }, + { + "epoch": 1.0304798559811286, + "grad_norm": 0.154746613381864, + "learning_rate": 8.300155754929706e-05, + "loss": 2.8561, + "step": 16600 + }, + { + "epoch": 1.0305419330808865, + "grad_norm": 0.18292567263165824, + "learning_rate": 8.299884434817472e-05, + "loss": 2.9708, + "step": 16601 + }, + { + "epoch": 1.0306040101806444, + "grad_norm": 0.17974635901696304, + "learning_rate": 8.29961309748894e-05, + "loss": 2.8978, + "step": 16602 + }, + { + "epoch": 1.0306660872804023, + "grad_norm": 0.19997412394977807, + "learning_rate": 8.299341742945526e-05, + "loss": 2.9566, + "step": 16603 + }, + { + "epoch": 1.0307281643801602, + "grad_norm": 0.19499604337297255, + "learning_rate": 8.299070371188643e-05, + "loss": 2.906, + "step": 16604 + }, + { + "epoch": 1.0307902414799182, + "grad_norm": 0.20717233672832497, + "learning_rate": 8.29879898221971e-05, + "loss": 2.9685, + "step": 16605 + }, + { + "epoch": 1.0308523185796759, + "grad_norm": 0.16692200797801554, + "learning_rate": 8.298527576040143e-05, + "loss": 2.9541, + "step": 16606 + }, + { + "epoch": 1.0309143956794338, + "grad_norm": 0.16203091342710818, + "learning_rate": 8.298256152651356e-05, + "loss": 2.8865, + "step": 16607 + }, + { + "epoch": 1.0309764727791917, + "grad_norm": 0.15937822048822933, + "learning_rate": 8.297984712054767e-05, + "loss": 2.9414, + "step": 16608 + }, + { + "epoch": 1.0310385498789496, + "grad_norm": 0.16110583570823464, + "learning_rate": 8.297713254251788e-05, + "loss": 2.8901, + "step": 16609 + }, + { + "epoch": 1.0311006269787075, + "grad_norm": 0.17063431083860053, + "learning_rate": 8.297441779243842e-05, + "loss": 2.9652, + "step": 16610 + }, + { + "epoch": 1.0311627040784654, + "grad_norm": 0.1661293257511788, + "learning_rate": 8.297170287032342e-05, + "loss": 2.9188, + "step": 16611 + }, + { + "epoch": 1.0312247811782234, + "grad_norm": 0.19639417576301968, + "learning_rate": 8.2968987776187e-05, + "loss": 2.9647, + "step": 16612 + }, + { + "epoch": 1.0312868582779813, + "grad_norm": 0.17010846799978502, + "learning_rate": 8.296627251004341e-05, + "loss": 3.0051, + "step": 16613 + }, + { + "epoch": 1.0313489353777392, + "grad_norm": 0.1947058602580161, + "learning_rate": 8.296355707190675e-05, + "loss": 2.9189, + "step": 16614 + }, + { + "epoch": 1.0314110124774971, + "grad_norm": 0.20232507111338255, + "learning_rate": 8.296084146179121e-05, + "loss": 2.9081, + "step": 16615 + }, + { + "epoch": 1.031473089577255, + "grad_norm": 0.25675971927280117, + "learning_rate": 8.295812567971097e-05, + "loss": 2.9098, + "step": 16616 + }, + { + "epoch": 1.031535166677013, + "grad_norm": 0.1802774881422387, + "learning_rate": 8.295540972568018e-05, + "loss": 2.907, + "step": 16617 + }, + { + "epoch": 1.0315972437767706, + "grad_norm": 0.16593429031608423, + "learning_rate": 8.295269359971303e-05, + "loss": 2.8114, + "step": 16618 + }, + { + "epoch": 1.0316593208765286, + "grad_norm": 0.20085331777477589, + "learning_rate": 8.294997730182365e-05, + "loss": 2.9199, + "step": 16619 + }, + { + "epoch": 1.0317213979762865, + "grad_norm": 0.23034367966227118, + "learning_rate": 8.294726083202627e-05, + "loss": 3.0209, + "step": 16620 + }, + { + "epoch": 1.0317834750760444, + "grad_norm": 0.20516434628097427, + "learning_rate": 8.294454419033502e-05, + "loss": 2.8717, + "step": 16621 + }, + { + "epoch": 1.0318455521758023, + "grad_norm": 0.20925940482562913, + "learning_rate": 8.294182737676408e-05, + "loss": 2.8958, + "step": 16622 + }, + { + "epoch": 1.0319076292755602, + "grad_norm": 0.3273000565094379, + "learning_rate": 8.293911039132763e-05, + "loss": 2.9266, + "step": 16623 + }, + { + "epoch": 1.0319697063753182, + "grad_norm": 0.26256803527805217, + "learning_rate": 8.293639323403986e-05, + "loss": 2.9441, + "step": 16624 + }, + { + "epoch": 1.032031783475076, + "grad_norm": 0.21220627427789013, + "learning_rate": 8.293367590491492e-05, + "loss": 2.9634, + "step": 16625 + }, + { + "epoch": 1.032093860574834, + "grad_norm": 0.17157961648838707, + "learning_rate": 8.2930958403967e-05, + "loss": 2.841, + "step": 16626 + }, + { + "epoch": 1.032155937674592, + "grad_norm": 0.23681659501844926, + "learning_rate": 8.292824073121028e-05, + "loss": 2.8983, + "step": 16627 + }, + { + "epoch": 1.0322180147743498, + "grad_norm": 0.2143424514606594, + "learning_rate": 8.292552288665891e-05, + "loss": 2.9689, + "step": 16628 + }, + { + "epoch": 1.0322800918741077, + "grad_norm": 0.21036530523293576, + "learning_rate": 8.292280487032713e-05, + "loss": 2.8795, + "step": 16629 + }, + { + "epoch": 1.0323421689738654, + "grad_norm": 0.1849737652335398, + "learning_rate": 8.292008668222906e-05, + "loss": 2.861, + "step": 16630 + }, + { + "epoch": 1.0324042460736234, + "grad_norm": 0.23872660619654984, + "learning_rate": 8.291736832237891e-05, + "loss": 2.8616, + "step": 16631 + }, + { + "epoch": 1.0324663231733813, + "grad_norm": 0.17981409713319563, + "learning_rate": 8.291464979079086e-05, + "loss": 2.844, + "step": 16632 + }, + { + "epoch": 1.0325284002731392, + "grad_norm": 0.19826372151795504, + "learning_rate": 8.29119310874791e-05, + "loss": 2.9848, + "step": 16633 + }, + { + "epoch": 1.032590477372897, + "grad_norm": 0.18199139228249137, + "learning_rate": 8.29092122124578e-05, + "loss": 2.9603, + "step": 16634 + }, + { + "epoch": 1.032652554472655, + "grad_norm": 0.18602160907590304, + "learning_rate": 8.290649316574115e-05, + "loss": 2.9539, + "step": 16635 + }, + { + "epoch": 1.032714631572413, + "grad_norm": 0.19732007134549656, + "learning_rate": 8.290377394734334e-05, + "loss": 2.992, + "step": 16636 + }, + { + "epoch": 1.0327767086721709, + "grad_norm": 0.17215778106476726, + "learning_rate": 8.290105455727855e-05, + "loss": 2.9248, + "step": 16637 + }, + { + "epoch": 1.0328387857719288, + "grad_norm": 0.17262816135928655, + "learning_rate": 8.289833499556097e-05, + "loss": 2.9375, + "step": 16638 + }, + { + "epoch": 1.0329008628716867, + "grad_norm": 0.18185577161483282, + "learning_rate": 8.289561526220479e-05, + "loss": 2.8881, + "step": 16639 + }, + { + "epoch": 1.0329629399714446, + "grad_norm": 0.17299473215879554, + "learning_rate": 8.28928953572242e-05, + "loss": 2.9116, + "step": 16640 + }, + { + "epoch": 1.0330250170712025, + "grad_norm": 0.1825039558439154, + "learning_rate": 8.289017528063339e-05, + "loss": 3.0123, + "step": 16641 + }, + { + "epoch": 1.0330870941709602, + "grad_norm": 0.15455581570064908, + "learning_rate": 8.288745503244654e-05, + "loss": 2.9278, + "step": 16642 + }, + { + "epoch": 1.0331491712707181, + "grad_norm": 0.16569589231282536, + "learning_rate": 8.288473461267789e-05, + "loss": 2.9738, + "step": 16643 + }, + { + "epoch": 1.033211248370476, + "grad_norm": 0.1906666186504648, + "learning_rate": 8.288201402134157e-05, + "loss": 2.9771, + "step": 16644 + }, + { + "epoch": 1.033273325470234, + "grad_norm": 0.17387275979980021, + "learning_rate": 8.28792932584518e-05, + "loss": 2.96, + "step": 16645 + }, + { + "epoch": 1.033335402569992, + "grad_norm": 0.16507602593729812, + "learning_rate": 8.287657232402278e-05, + "loss": 2.8988, + "step": 16646 + }, + { + "epoch": 1.0333974796697498, + "grad_norm": 0.1924299676101775, + "learning_rate": 8.287385121806869e-05, + "loss": 2.9289, + "step": 16647 + }, + { + "epoch": 1.0334595567695077, + "grad_norm": 0.1593379538619611, + "learning_rate": 8.287112994060375e-05, + "loss": 2.8555, + "step": 16648 + }, + { + "epoch": 1.0335216338692657, + "grad_norm": 0.18754265220621574, + "learning_rate": 8.286840849164214e-05, + "loss": 2.9397, + "step": 16649 + }, + { + "epoch": 1.0335837109690236, + "grad_norm": 0.19039138995718577, + "learning_rate": 8.286568687119806e-05, + "loss": 2.9629, + "step": 16650 + }, + { + "epoch": 1.0336457880687815, + "grad_norm": 0.1678443214944426, + "learning_rate": 8.286296507928574e-05, + "loss": 2.9841, + "step": 16651 + }, + { + "epoch": 1.0337078651685394, + "grad_norm": 0.18630150378887786, + "learning_rate": 8.286024311591934e-05, + "loss": 2.9637, + "step": 16652 + }, + { + "epoch": 1.0337699422682973, + "grad_norm": 0.20137251875560522, + "learning_rate": 8.285752098111305e-05, + "loss": 2.9935, + "step": 16653 + }, + { + "epoch": 1.033832019368055, + "grad_norm": 0.18173286188030466, + "learning_rate": 8.285479867488112e-05, + "loss": 2.9346, + "step": 16654 + }, + { + "epoch": 1.033894096467813, + "grad_norm": 0.19174081406480095, + "learning_rate": 8.285207619723771e-05, + "loss": 2.956, + "step": 16655 + }, + { + "epoch": 1.0339561735675709, + "grad_norm": 0.18355446253095253, + "learning_rate": 8.284935354819707e-05, + "loss": 2.9317, + "step": 16656 + }, + { + "epoch": 1.0340182506673288, + "grad_norm": 0.1810701600246246, + "learning_rate": 8.284663072777337e-05, + "loss": 2.9274, + "step": 16657 + }, + { + "epoch": 1.0340803277670867, + "grad_norm": 0.19785199970223416, + "learning_rate": 8.28439077359808e-05, + "loss": 2.9581, + "step": 16658 + }, + { + "epoch": 1.0341424048668446, + "grad_norm": 0.17324834208431436, + "learning_rate": 8.284118457283364e-05, + "loss": 2.9932, + "step": 16659 + }, + { + "epoch": 1.0342044819666025, + "grad_norm": 0.15860534094749038, + "learning_rate": 8.2838461238346e-05, + "loss": 2.9141, + "step": 16660 + }, + { + "epoch": 1.0342665590663604, + "grad_norm": 0.19170699528891383, + "learning_rate": 8.283573773253214e-05, + "loss": 2.9304, + "step": 16661 + }, + { + "epoch": 1.0343286361661184, + "grad_norm": 0.1774112572930106, + "learning_rate": 8.28330140554063e-05, + "loss": 2.9897, + "step": 16662 + }, + { + "epoch": 1.0343907132658763, + "grad_norm": 0.1778515649465003, + "learning_rate": 8.283029020698264e-05, + "loss": 2.9256, + "step": 16663 + }, + { + "epoch": 1.0344527903656342, + "grad_norm": 0.1505991369615006, + "learning_rate": 8.282756618727537e-05, + "loss": 2.9492, + "step": 16664 + }, + { + "epoch": 1.0345148674653921, + "grad_norm": 0.17028246907914252, + "learning_rate": 8.282484199629873e-05, + "loss": 3.0152, + "step": 16665 + }, + { + "epoch": 1.0345769445651498, + "grad_norm": 0.2086535982292248, + "learning_rate": 8.28221176340669e-05, + "loss": 2.9996, + "step": 16666 + }, + { + "epoch": 1.0346390216649077, + "grad_norm": 0.15674818013623865, + "learning_rate": 8.281939310059414e-05, + "loss": 2.8873, + "step": 16667 + }, + { + "epoch": 1.0347010987646656, + "grad_norm": 0.1651565427064783, + "learning_rate": 8.281666839589464e-05, + "loss": 2.993, + "step": 16668 + }, + { + "epoch": 1.0347631758644236, + "grad_norm": 0.24842805184686267, + "learning_rate": 8.28139435199826e-05, + "loss": 2.9165, + "step": 16669 + }, + { + "epoch": 1.0348252529641815, + "grad_norm": 0.22293387221102925, + "learning_rate": 8.281121847287226e-05, + "loss": 2.917, + "step": 16670 + }, + { + "epoch": 1.0348873300639394, + "grad_norm": 0.2585278758480479, + "learning_rate": 8.280849325457782e-05, + "loss": 2.9482, + "step": 16671 + }, + { + "epoch": 1.0349494071636973, + "grad_norm": 0.18254818352104016, + "learning_rate": 8.280576786511351e-05, + "loss": 3.0069, + "step": 16672 + }, + { + "epoch": 1.0350114842634552, + "grad_norm": 0.19498790471149718, + "learning_rate": 8.280304230449355e-05, + "loss": 2.9262, + "step": 16673 + }, + { + "epoch": 1.0350735613632132, + "grad_norm": 0.17265560874992494, + "learning_rate": 8.280031657273215e-05, + "loss": 2.8997, + "step": 16674 + }, + { + "epoch": 1.035135638462971, + "grad_norm": 0.16350097724603543, + "learning_rate": 8.279759066984354e-05, + "loss": 2.8382, + "step": 16675 + }, + { + "epoch": 1.035197715562729, + "grad_norm": 0.1932823817775837, + "learning_rate": 8.279486459584194e-05, + "loss": 2.9192, + "step": 16676 + }, + { + "epoch": 1.035259792662487, + "grad_norm": 0.17353343093349285, + "learning_rate": 8.279213835074155e-05, + "loss": 2.8726, + "step": 16677 + }, + { + "epoch": 1.0353218697622446, + "grad_norm": 0.1662656080028897, + "learning_rate": 8.278941193455664e-05, + "loss": 2.789, + "step": 16678 + }, + { + "epoch": 1.0353839468620025, + "grad_norm": 0.26245472040660656, + "learning_rate": 8.27866853473014e-05, + "loss": 2.8824, + "step": 16679 + }, + { + "epoch": 1.0354460239617604, + "grad_norm": 0.16576013540260343, + "learning_rate": 8.278395858899006e-05, + "loss": 2.9062, + "step": 16680 + }, + { + "epoch": 1.0355081010615184, + "grad_norm": 0.1644434706330015, + "learning_rate": 8.278123165963683e-05, + "loss": 3.0036, + "step": 16681 + }, + { + "epoch": 1.0355701781612763, + "grad_norm": 0.17200967540807255, + "learning_rate": 8.277850455925599e-05, + "loss": 2.8936, + "step": 16682 + }, + { + "epoch": 1.0356322552610342, + "grad_norm": 0.17895047753055293, + "learning_rate": 8.277577728786173e-05, + "loss": 2.9506, + "step": 16683 + }, + { + "epoch": 1.0356943323607921, + "grad_norm": 0.20267235316207718, + "learning_rate": 8.277304984546825e-05, + "loss": 2.8319, + "step": 16684 + }, + { + "epoch": 1.03575640946055, + "grad_norm": 0.17459011227648635, + "learning_rate": 8.277032223208984e-05, + "loss": 2.9438, + "step": 16685 + }, + { + "epoch": 1.035818486560308, + "grad_norm": 0.15688935245536928, + "learning_rate": 8.276759444774071e-05, + "loss": 2.9372, + "step": 16686 + }, + { + "epoch": 1.0358805636600659, + "grad_norm": 0.17496856807379743, + "learning_rate": 8.276486649243506e-05, + "loss": 2.9809, + "step": 16687 + }, + { + "epoch": 1.0359426407598238, + "grad_norm": 0.15730189064936448, + "learning_rate": 8.276213836618718e-05, + "loss": 2.8913, + "step": 16688 + }, + { + "epoch": 1.0360047178595817, + "grad_norm": 0.16321522618995704, + "learning_rate": 8.275941006901124e-05, + "loss": 2.9984, + "step": 16689 + }, + { + "epoch": 1.0360667949593394, + "grad_norm": 0.1624984897029824, + "learning_rate": 8.275668160092154e-05, + "loss": 2.9238, + "step": 16690 + }, + { + "epoch": 1.0361288720590973, + "grad_norm": 0.1522107583099581, + "learning_rate": 8.275395296193225e-05, + "loss": 2.9618, + "step": 16691 + }, + { + "epoch": 1.0361909491588552, + "grad_norm": 0.19044446643310126, + "learning_rate": 8.275122415205765e-05, + "loss": 2.9387, + "step": 16692 + }, + { + "epoch": 1.0362530262586132, + "grad_norm": 0.18647148247569603, + "learning_rate": 8.274849517131195e-05, + "loss": 2.927, + "step": 16693 + }, + { + "epoch": 1.036315103358371, + "grad_norm": 0.16797692256828753, + "learning_rate": 8.274576601970941e-05, + "loss": 2.9378, + "step": 16694 + }, + { + "epoch": 1.036377180458129, + "grad_norm": 0.15864272911602337, + "learning_rate": 8.274303669726426e-05, + "loss": 2.9462, + "step": 16695 + }, + { + "epoch": 1.036439257557887, + "grad_norm": 0.16814956248082125, + "learning_rate": 8.274030720399073e-05, + "loss": 2.871, + "step": 16696 + }, + { + "epoch": 1.0365013346576448, + "grad_norm": 0.14522146044435127, + "learning_rate": 8.273757753990307e-05, + "loss": 2.9414, + "step": 16697 + }, + { + "epoch": 1.0365634117574027, + "grad_norm": 0.17806040027460943, + "learning_rate": 8.273484770501554e-05, + "loss": 3.0158, + "step": 16698 + }, + { + "epoch": 1.0366254888571607, + "grad_norm": 0.1579734760222687, + "learning_rate": 8.273211769934234e-05, + "loss": 2.9488, + "step": 16699 + }, + { + "epoch": 1.0366875659569186, + "grad_norm": 0.1742472364461009, + "learning_rate": 8.272938752289775e-05, + "loss": 2.8905, + "step": 16700 + }, + { + "epoch": 1.0367496430566765, + "grad_norm": 0.18518547197712662, + "learning_rate": 8.2726657175696e-05, + "loss": 2.8924, + "step": 16701 + }, + { + "epoch": 1.0368117201564342, + "grad_norm": 0.17276450297717622, + "learning_rate": 8.272392665775133e-05, + "loss": 3.0174, + "step": 16702 + }, + { + "epoch": 1.036873797256192, + "grad_norm": 0.19394417192905491, + "learning_rate": 8.2721195969078e-05, + "loss": 2.9869, + "step": 16703 + }, + { + "epoch": 1.03693587435595, + "grad_norm": 0.15341484897095484, + "learning_rate": 8.271846510969024e-05, + "loss": 2.8588, + "step": 16704 + }, + { + "epoch": 1.036997951455708, + "grad_norm": 0.18332716681303887, + "learning_rate": 8.271573407960229e-05, + "loss": 2.9373, + "step": 16705 + }, + { + "epoch": 1.0370600285554659, + "grad_norm": 0.2293424645484155, + "learning_rate": 8.271300287882843e-05, + "loss": 2.9269, + "step": 16706 + }, + { + "epoch": 1.0371221056552238, + "grad_norm": 0.17689665720756054, + "learning_rate": 8.271027150738289e-05, + "loss": 2.9154, + "step": 16707 + }, + { + "epoch": 1.0371841827549817, + "grad_norm": 0.18386922622435384, + "learning_rate": 8.270753996527992e-05, + "loss": 2.9276, + "step": 16708 + }, + { + "epoch": 1.0372462598547396, + "grad_norm": 0.18931290264659842, + "learning_rate": 8.270480825253379e-05, + "loss": 2.9053, + "step": 16709 + }, + { + "epoch": 1.0373083369544975, + "grad_norm": 0.17935422383254068, + "learning_rate": 8.270207636915871e-05, + "loss": 2.9331, + "step": 16710 + }, + { + "epoch": 1.0373704140542555, + "grad_norm": 0.1873093171885051, + "learning_rate": 8.269934431516898e-05, + "loss": 2.9463, + "step": 16711 + }, + { + "epoch": 1.0374324911540134, + "grad_norm": 0.19869288996033097, + "learning_rate": 8.269661209057882e-05, + "loss": 2.8962, + "step": 16712 + }, + { + "epoch": 1.0374945682537713, + "grad_norm": 0.19677976696708957, + "learning_rate": 8.26938796954025e-05, + "loss": 2.8926, + "step": 16713 + }, + { + "epoch": 1.037556645353529, + "grad_norm": 0.17551238903274163, + "learning_rate": 8.269114712965427e-05, + "loss": 2.9468, + "step": 16714 + }, + { + "epoch": 1.037618722453287, + "grad_norm": 0.2027269277509436, + "learning_rate": 8.268841439334839e-05, + "loss": 2.9718, + "step": 16715 + }, + { + "epoch": 1.0376807995530448, + "grad_norm": 0.1980194663469334, + "learning_rate": 8.268568148649912e-05, + "loss": 2.9934, + "step": 16716 + }, + { + "epoch": 1.0377428766528027, + "grad_norm": 0.18181154966450228, + "learning_rate": 8.268294840912072e-05, + "loss": 2.9462, + "step": 16717 + }, + { + "epoch": 1.0378049537525607, + "grad_norm": 0.23194776792883096, + "learning_rate": 8.268021516122743e-05, + "loss": 2.8862, + "step": 16718 + }, + { + "epoch": 1.0378670308523186, + "grad_norm": 0.17223543066846567, + "learning_rate": 8.267748174283354e-05, + "loss": 2.8721, + "step": 16719 + }, + { + "epoch": 1.0379291079520765, + "grad_norm": 0.17855578081509374, + "learning_rate": 8.267474815395329e-05, + "loss": 2.886, + "step": 16720 + }, + { + "epoch": 1.0379911850518344, + "grad_norm": 0.20447933335150908, + "learning_rate": 8.267201439460095e-05, + "loss": 2.9284, + "step": 16721 + }, + { + "epoch": 1.0380532621515923, + "grad_norm": 0.17254954423906407, + "learning_rate": 8.266928046479076e-05, + "loss": 2.9856, + "step": 16722 + }, + { + "epoch": 1.0381153392513502, + "grad_norm": 0.17230415410746447, + "learning_rate": 8.266654636453703e-05, + "loss": 2.9244, + "step": 16723 + }, + { + "epoch": 1.0381774163511082, + "grad_norm": 0.1538150483245434, + "learning_rate": 8.266381209385399e-05, + "loss": 2.9624, + "step": 16724 + }, + { + "epoch": 1.038239493450866, + "grad_norm": 0.18369785360391644, + "learning_rate": 8.26610776527559e-05, + "loss": 2.9728, + "step": 16725 + }, + { + "epoch": 1.0383015705506238, + "grad_norm": 0.16919955378955764, + "learning_rate": 8.265834304125704e-05, + "loss": 2.9908, + "step": 16726 + }, + { + "epoch": 1.0383636476503817, + "grad_norm": 0.17324539082693277, + "learning_rate": 8.265560825937169e-05, + "loss": 2.9617, + "step": 16727 + }, + { + "epoch": 1.0384257247501396, + "grad_norm": 0.20398578371564527, + "learning_rate": 8.26528733071141e-05, + "loss": 2.8885, + "step": 16728 + }, + { + "epoch": 1.0384878018498975, + "grad_norm": 0.17055817654428937, + "learning_rate": 8.265013818449854e-05, + "loss": 3.0003, + "step": 16729 + }, + { + "epoch": 1.0385498789496554, + "grad_norm": 0.18163302621021174, + "learning_rate": 8.264740289153927e-05, + "loss": 2.9353, + "step": 16730 + }, + { + "epoch": 1.0386119560494134, + "grad_norm": 0.1692123067374125, + "learning_rate": 8.26446674282506e-05, + "loss": 2.7256, + "step": 16731 + }, + { + "epoch": 1.0386740331491713, + "grad_norm": 0.16883604562441826, + "learning_rate": 8.264193179464677e-05, + "loss": 2.9216, + "step": 16732 + }, + { + "epoch": 1.0387361102489292, + "grad_norm": 0.20034354699802423, + "learning_rate": 8.263919599074203e-05, + "loss": 3.0389, + "step": 16733 + }, + { + "epoch": 1.0387981873486871, + "grad_norm": 0.1473064560785292, + "learning_rate": 8.26364600165507e-05, + "loss": 2.8369, + "step": 16734 + }, + { + "epoch": 1.038860264448445, + "grad_norm": 0.20500536036461325, + "learning_rate": 8.263372387208705e-05, + "loss": 2.8454, + "step": 16735 + }, + { + "epoch": 1.038922341548203, + "grad_norm": 0.1973845151929344, + "learning_rate": 8.263098755736532e-05, + "loss": 2.969, + "step": 16736 + }, + { + "epoch": 1.0389844186479609, + "grad_norm": 0.17611884970617314, + "learning_rate": 8.262825107239982e-05, + "loss": 2.762, + "step": 16737 + }, + { + "epoch": 1.0390464957477186, + "grad_norm": 0.19853792540443352, + "learning_rate": 8.262551441720481e-05, + "loss": 2.9076, + "step": 16738 + }, + { + "epoch": 1.0391085728474765, + "grad_norm": 0.1568299735373555, + "learning_rate": 8.262277759179457e-05, + "loss": 2.8987, + "step": 16739 + }, + { + "epoch": 1.0391706499472344, + "grad_norm": 0.1802415805165053, + "learning_rate": 8.262004059618338e-05, + "loss": 2.8924, + "step": 16740 + }, + { + "epoch": 1.0392327270469923, + "grad_norm": 0.16173148634878168, + "learning_rate": 8.261730343038551e-05, + "loss": 2.9302, + "step": 16741 + }, + { + "epoch": 1.0392948041467502, + "grad_norm": 0.30238443884481797, + "learning_rate": 8.261456609441527e-05, + "loss": 3.0061, + "step": 16742 + }, + { + "epoch": 1.0393568812465082, + "grad_norm": 0.18080087584886304, + "learning_rate": 8.26118285882869e-05, + "loss": 2.9242, + "step": 16743 + }, + { + "epoch": 1.039418958346266, + "grad_norm": 0.2367493367341931, + "learning_rate": 8.260909091201471e-05, + "loss": 2.8298, + "step": 16744 + }, + { + "epoch": 1.039481035446024, + "grad_norm": 0.2161858944746418, + "learning_rate": 8.260635306561299e-05, + "loss": 2.9057, + "step": 16745 + }, + { + "epoch": 1.039543112545782, + "grad_norm": 0.1641767762330142, + "learning_rate": 8.2603615049096e-05, + "loss": 2.9394, + "step": 16746 + }, + { + "epoch": 1.0396051896455398, + "grad_norm": 0.18963011113887193, + "learning_rate": 8.260087686247802e-05, + "loss": 2.9733, + "step": 16747 + }, + { + "epoch": 1.0396672667452977, + "grad_norm": 0.19340453920888237, + "learning_rate": 8.259813850577336e-05, + "loss": 2.8966, + "step": 16748 + }, + { + "epoch": 1.0397293438450557, + "grad_norm": 0.18194731590461696, + "learning_rate": 8.259539997899631e-05, + "loss": 2.8861, + "step": 16749 + }, + { + "epoch": 1.0397914209448134, + "grad_norm": 0.18875253070941042, + "learning_rate": 8.259266128216115e-05, + "loss": 2.923, + "step": 16750 + }, + { + "epoch": 1.0398534980445713, + "grad_norm": 0.16746917441852288, + "learning_rate": 8.258992241528214e-05, + "loss": 2.9169, + "step": 16751 + }, + { + "epoch": 1.0399155751443292, + "grad_norm": 0.1610096547492775, + "learning_rate": 8.258718337837362e-05, + "loss": 2.9095, + "step": 16752 + }, + { + "epoch": 1.0399776522440871, + "grad_norm": 0.16634438443748203, + "learning_rate": 8.258444417144983e-05, + "loss": 2.9614, + "step": 16753 + }, + { + "epoch": 1.040039729343845, + "grad_norm": 0.19171310662106897, + "learning_rate": 8.25817047945251e-05, + "loss": 2.9718, + "step": 16754 + }, + { + "epoch": 1.040101806443603, + "grad_norm": 0.16129128725970868, + "learning_rate": 8.25789652476137e-05, + "loss": 2.9522, + "step": 16755 + }, + { + "epoch": 1.0401638835433609, + "grad_norm": 0.19171455426898562, + "learning_rate": 8.257622553072992e-05, + "loss": 2.9778, + "step": 16756 + }, + { + "epoch": 1.0402259606431188, + "grad_norm": 0.18178384528455968, + "learning_rate": 8.257348564388808e-05, + "loss": 2.8663, + "step": 16757 + }, + { + "epoch": 1.0402880377428767, + "grad_norm": 0.18066554713283572, + "learning_rate": 8.257074558710245e-05, + "loss": 2.8491, + "step": 16758 + }, + { + "epoch": 1.0403501148426346, + "grad_norm": 0.20377237521804628, + "learning_rate": 8.256800536038733e-05, + "loss": 2.8375, + "step": 16759 + }, + { + "epoch": 1.0404121919423925, + "grad_norm": 0.21750872821943257, + "learning_rate": 8.256526496375705e-05, + "loss": 2.9586, + "step": 16760 + }, + { + "epoch": 1.0404742690421505, + "grad_norm": 0.2079829080206235, + "learning_rate": 8.256252439722585e-05, + "loss": 2.9613, + "step": 16761 + }, + { + "epoch": 1.0405363461419082, + "grad_norm": 0.1569721461799577, + "learning_rate": 8.255978366080806e-05, + "loss": 2.893, + "step": 16762 + }, + { + "epoch": 1.040598423241666, + "grad_norm": 0.1715771304772977, + "learning_rate": 8.255704275451796e-05, + "loss": 2.9916, + "step": 16763 + }, + { + "epoch": 1.040660500341424, + "grad_norm": 0.1493621374422111, + "learning_rate": 8.255430167836988e-05, + "loss": 2.8091, + "step": 16764 + }, + { + "epoch": 1.040722577441182, + "grad_norm": 0.18572735290975334, + "learning_rate": 8.255156043237809e-05, + "loss": 2.9729, + "step": 16765 + }, + { + "epoch": 1.0407846545409398, + "grad_norm": 0.19057705385676962, + "learning_rate": 8.254881901655694e-05, + "loss": 2.9423, + "step": 16766 + }, + { + "epoch": 1.0408467316406977, + "grad_norm": 0.184895053154459, + "learning_rate": 8.254607743092066e-05, + "loss": 2.9158, + "step": 16767 + }, + { + "epoch": 1.0409088087404557, + "grad_norm": 0.15435788602214126, + "learning_rate": 8.254333567548362e-05, + "loss": 2.9123, + "step": 16768 + }, + { + "epoch": 1.0409708858402136, + "grad_norm": 0.15983413658990217, + "learning_rate": 8.254059375026009e-05, + "loss": 2.8563, + "step": 16769 + }, + { + "epoch": 1.0410329629399715, + "grad_norm": 0.15326256484528183, + "learning_rate": 8.253785165526438e-05, + "loss": 2.9423, + "step": 16770 + }, + { + "epoch": 1.0410950400397294, + "grad_norm": 0.1760594550622026, + "learning_rate": 8.253510939051082e-05, + "loss": 2.9301, + "step": 16771 + }, + { + "epoch": 1.0411571171394873, + "grad_norm": 0.1608189792221059, + "learning_rate": 8.253236695601366e-05, + "loss": 2.8962, + "step": 16772 + }, + { + "epoch": 1.0412191942392452, + "grad_norm": 0.19734056390167182, + "learning_rate": 8.252962435178727e-05, + "loss": 2.9091, + "step": 16773 + }, + { + "epoch": 1.041281271339003, + "grad_norm": 0.21325258230018795, + "learning_rate": 8.252688157784591e-05, + "loss": 2.9225, + "step": 16774 + }, + { + "epoch": 1.0413433484387609, + "grad_norm": 0.19594599031612733, + "learning_rate": 8.252413863420394e-05, + "loss": 2.8893, + "step": 16775 + }, + { + "epoch": 1.0414054255385188, + "grad_norm": 0.1876205116658939, + "learning_rate": 8.252139552087563e-05, + "loss": 2.882, + "step": 16776 + }, + { + "epoch": 1.0414675026382767, + "grad_norm": 0.15845809217709328, + "learning_rate": 8.251865223787531e-05, + "loss": 2.9017, + "step": 16777 + }, + { + "epoch": 1.0415295797380346, + "grad_norm": 0.16314895919806266, + "learning_rate": 8.251590878521728e-05, + "loss": 2.9994, + "step": 16778 + }, + { + "epoch": 1.0415916568377925, + "grad_norm": 0.1629080115903047, + "learning_rate": 8.251316516291586e-05, + "loss": 3.0001, + "step": 16779 + }, + { + "epoch": 1.0416537339375505, + "grad_norm": 0.1604998670567054, + "learning_rate": 8.251042137098537e-05, + "loss": 2.8738, + "step": 16780 + }, + { + "epoch": 1.0417158110373084, + "grad_norm": 0.1557157741697114, + "learning_rate": 8.250767740944011e-05, + "loss": 2.9879, + "step": 16781 + }, + { + "epoch": 1.0417778881370663, + "grad_norm": 0.16053246296518497, + "learning_rate": 8.25049332782944e-05, + "loss": 2.8051, + "step": 16782 + }, + { + "epoch": 1.0418399652368242, + "grad_norm": 0.3309821778055983, + "learning_rate": 8.250218897756259e-05, + "loss": 2.9972, + "step": 16783 + }, + { + "epoch": 1.0419020423365821, + "grad_norm": 0.18604580906282053, + "learning_rate": 8.249944450725895e-05, + "loss": 2.9024, + "step": 16784 + }, + { + "epoch": 1.04196411943634, + "grad_norm": 0.20748009699713027, + "learning_rate": 8.249669986739783e-05, + "loss": 2.9606, + "step": 16785 + }, + { + "epoch": 1.0420261965360977, + "grad_norm": 0.19525603431823332, + "learning_rate": 8.249395505799353e-05, + "loss": 2.9687, + "step": 16786 + }, + { + "epoch": 1.0420882736358557, + "grad_norm": 0.17955565798769849, + "learning_rate": 8.249121007906038e-05, + "loss": 3.0101, + "step": 16787 + }, + { + "epoch": 1.0421503507356136, + "grad_norm": 0.18321598024143695, + "learning_rate": 8.24884649306127e-05, + "loss": 2.9947, + "step": 16788 + }, + { + "epoch": 1.0422124278353715, + "grad_norm": 0.23767918992041398, + "learning_rate": 8.24857196126648e-05, + "loss": 2.9303, + "step": 16789 + }, + { + "epoch": 1.0422745049351294, + "grad_norm": 0.19395710812259198, + "learning_rate": 8.248297412523103e-05, + "loss": 2.9584, + "step": 16790 + }, + { + "epoch": 1.0423365820348873, + "grad_norm": 0.17902296998048048, + "learning_rate": 8.24802284683257e-05, + "loss": 2.9928, + "step": 16791 + }, + { + "epoch": 1.0423986591346452, + "grad_norm": 0.17535714273800293, + "learning_rate": 8.247748264196313e-05, + "loss": 2.8133, + "step": 16792 + }, + { + "epoch": 1.0424607362344032, + "grad_norm": 0.19082035193545385, + "learning_rate": 8.247473664615763e-05, + "loss": 2.9542, + "step": 16793 + }, + { + "epoch": 1.042522813334161, + "grad_norm": 0.16633066690562265, + "learning_rate": 8.247199048092357e-05, + "loss": 3.0261, + "step": 16794 + }, + { + "epoch": 1.042584890433919, + "grad_norm": 0.166877880348393, + "learning_rate": 8.246924414627523e-05, + "loss": 2.8253, + "step": 16795 + }, + { + "epoch": 1.042646967533677, + "grad_norm": 0.16425853892380046, + "learning_rate": 8.246649764222698e-05, + "loss": 2.8946, + "step": 16796 + }, + { + "epoch": 1.0427090446334346, + "grad_norm": 0.17349293355609152, + "learning_rate": 8.246375096879313e-05, + "loss": 2.9282, + "step": 16797 + }, + { + "epoch": 1.0427711217331925, + "grad_norm": 0.17211132167073695, + "learning_rate": 8.246100412598801e-05, + "loss": 2.9278, + "step": 16798 + }, + { + "epoch": 1.0428331988329504, + "grad_norm": 0.16347408361162044, + "learning_rate": 8.245825711382595e-05, + "loss": 2.9148, + "step": 16799 + }, + { + "epoch": 1.0428952759327084, + "grad_norm": 0.18073371369379487, + "learning_rate": 8.245550993232127e-05, + "loss": 2.8885, + "step": 16800 + }, + { + "epoch": 1.0429573530324663, + "grad_norm": 0.17247436337253294, + "learning_rate": 8.245276258148833e-05, + "loss": 2.9647, + "step": 16801 + }, + { + "epoch": 1.0430194301322242, + "grad_norm": 0.16986966701302017, + "learning_rate": 8.245001506134144e-05, + "loss": 2.9374, + "step": 16802 + }, + { + "epoch": 1.0430815072319821, + "grad_norm": 0.16427610320449532, + "learning_rate": 8.244726737189495e-05, + "loss": 2.9642, + "step": 16803 + }, + { + "epoch": 1.04314358433174, + "grad_norm": 0.17250385981539887, + "learning_rate": 8.244451951316319e-05, + "loss": 2.9871, + "step": 16804 + }, + { + "epoch": 1.043205661431498, + "grad_norm": 0.203886840890438, + "learning_rate": 8.24417714851605e-05, + "loss": 2.9585, + "step": 16805 + }, + { + "epoch": 1.0432677385312559, + "grad_norm": 0.16570379479186115, + "learning_rate": 8.24390232879012e-05, + "loss": 2.9991, + "step": 16806 + }, + { + "epoch": 1.0433298156310138, + "grad_norm": 0.1557174488164841, + "learning_rate": 8.243627492139965e-05, + "loss": 2.9505, + "step": 16807 + }, + { + "epoch": 1.0433918927307717, + "grad_norm": 0.17034305792706259, + "learning_rate": 8.243352638567017e-05, + "loss": 2.879, + "step": 16808 + }, + { + "epoch": 1.0434539698305296, + "grad_norm": 0.16209648264899443, + "learning_rate": 8.243077768072713e-05, + "loss": 2.9313, + "step": 16809 + }, + { + "epoch": 1.0435160469302873, + "grad_norm": 0.17580607556831418, + "learning_rate": 8.242802880658483e-05, + "loss": 2.888, + "step": 16810 + }, + { + "epoch": 1.0435781240300452, + "grad_norm": 0.1668776292093726, + "learning_rate": 8.242527976325764e-05, + "loss": 2.8881, + "step": 16811 + }, + { + "epoch": 1.0436402011298032, + "grad_norm": 0.17482406892366267, + "learning_rate": 8.242253055075988e-05, + "loss": 2.9538, + "step": 16812 + }, + { + "epoch": 1.043702278229561, + "grad_norm": 0.15961003200229604, + "learning_rate": 8.241978116910592e-05, + "loss": 3.0418, + "step": 16813 + }, + { + "epoch": 1.043764355329319, + "grad_norm": 0.1887244536309691, + "learning_rate": 8.24170316183101e-05, + "loss": 2.9425, + "step": 16814 + }, + { + "epoch": 1.043826432429077, + "grad_norm": 0.2039375010837341, + "learning_rate": 8.241428189838674e-05, + "loss": 3.0165, + "step": 16815 + }, + { + "epoch": 1.0438885095288348, + "grad_norm": 0.19559833591690412, + "learning_rate": 8.241153200935021e-05, + "loss": 2.8763, + "step": 16816 + }, + { + "epoch": 1.0439505866285927, + "grad_norm": 0.1820936808130183, + "learning_rate": 8.240878195121485e-05, + "loss": 2.8981, + "step": 16817 + }, + { + "epoch": 1.0440126637283507, + "grad_norm": 0.16301777266004766, + "learning_rate": 8.240603172399501e-05, + "loss": 2.9346, + "step": 16818 + }, + { + "epoch": 1.0440747408281086, + "grad_norm": 0.19941270026359098, + "learning_rate": 8.240328132770503e-05, + "loss": 2.9542, + "step": 16819 + }, + { + "epoch": 1.0441368179278665, + "grad_norm": 0.17457519149705583, + "learning_rate": 8.240053076235926e-05, + "loss": 2.9557, + "step": 16820 + }, + { + "epoch": 1.0441988950276242, + "grad_norm": 0.15937770041814622, + "learning_rate": 8.239778002797207e-05, + "loss": 2.938, + "step": 16821 + }, + { + "epoch": 1.0442609721273821, + "grad_norm": 0.20373033228246012, + "learning_rate": 8.239502912455779e-05, + "loss": 2.8686, + "step": 16822 + }, + { + "epoch": 1.04432304922714, + "grad_norm": 0.23258262172474164, + "learning_rate": 8.239227805213078e-05, + "loss": 2.9583, + "step": 16823 + }, + { + "epoch": 1.044385126326898, + "grad_norm": 0.18042997711614278, + "learning_rate": 8.238952681070538e-05, + "loss": 2.9733, + "step": 16824 + }, + { + "epoch": 1.0444472034266559, + "grad_norm": 0.1629257043165495, + "learning_rate": 8.238677540029598e-05, + "loss": 2.8861, + "step": 16825 + }, + { + "epoch": 1.0445092805264138, + "grad_norm": 0.17132954853021368, + "learning_rate": 8.23840238209169e-05, + "loss": 2.9837, + "step": 16826 + }, + { + "epoch": 1.0445713576261717, + "grad_norm": 0.143115533199148, + "learning_rate": 8.238127207258249e-05, + "loss": 2.9354, + "step": 16827 + }, + { + "epoch": 1.0446334347259296, + "grad_norm": 0.16730438295631025, + "learning_rate": 8.237852015530714e-05, + "loss": 2.9697, + "step": 16828 + }, + { + "epoch": 1.0446955118256875, + "grad_norm": 0.1897441948340672, + "learning_rate": 8.237576806910518e-05, + "loss": 2.8902, + "step": 16829 + }, + { + "epoch": 1.0447575889254455, + "grad_norm": 0.16939395803721596, + "learning_rate": 8.237301581399098e-05, + "loss": 2.8616, + "step": 16830 + }, + { + "epoch": 1.0448196660252034, + "grad_norm": 0.1741301773277858, + "learning_rate": 8.237026338997891e-05, + "loss": 2.9569, + "step": 16831 + }, + { + "epoch": 1.0448817431249613, + "grad_norm": 0.16606567413769532, + "learning_rate": 8.236751079708331e-05, + "loss": 2.9037, + "step": 16832 + }, + { + "epoch": 1.0449438202247192, + "grad_norm": 0.17898589489481734, + "learning_rate": 8.236475803531853e-05, + "loss": 2.933, + "step": 16833 + }, + { + "epoch": 1.045005897324477, + "grad_norm": 0.15483297096866352, + "learning_rate": 8.236200510469898e-05, + "loss": 2.9768, + "step": 16834 + }, + { + "epoch": 1.0450679744242348, + "grad_norm": 0.17433392946871965, + "learning_rate": 8.235925200523897e-05, + "loss": 2.9962, + "step": 16835 + }, + { + "epoch": 1.0451300515239927, + "grad_norm": 0.15375687471387453, + "learning_rate": 8.235649873695291e-05, + "loss": 3.0186, + "step": 16836 + }, + { + "epoch": 1.0451921286237507, + "grad_norm": 0.1499253482128048, + "learning_rate": 8.235374529985513e-05, + "loss": 2.9987, + "step": 16837 + }, + { + "epoch": 1.0452542057235086, + "grad_norm": 0.16281107622757807, + "learning_rate": 8.235099169396e-05, + "loss": 2.9445, + "step": 16838 + }, + { + "epoch": 1.0453162828232665, + "grad_norm": 0.1608406655175363, + "learning_rate": 8.23482379192819e-05, + "loss": 2.869, + "step": 16839 + }, + { + "epoch": 1.0453783599230244, + "grad_norm": 0.1428260150645493, + "learning_rate": 8.23454839758352e-05, + "loss": 2.9024, + "step": 16840 + }, + { + "epoch": 1.0454404370227823, + "grad_norm": 0.16852255942694322, + "learning_rate": 8.234272986363424e-05, + "loss": 2.943, + "step": 16841 + }, + { + "epoch": 1.0455025141225402, + "grad_norm": 0.16572927575274637, + "learning_rate": 8.233997558269342e-05, + "loss": 2.896, + "step": 16842 + }, + { + "epoch": 1.0455645912222982, + "grad_norm": 0.15731066474329, + "learning_rate": 8.233722113302709e-05, + "loss": 2.8223, + "step": 16843 + }, + { + "epoch": 1.045626668322056, + "grad_norm": 0.1571697345651986, + "learning_rate": 8.233446651464963e-05, + "loss": 2.8855, + "step": 16844 + }, + { + "epoch": 1.0456887454218138, + "grad_norm": 0.15607036158560028, + "learning_rate": 8.233171172757539e-05, + "loss": 2.941, + "step": 16845 + }, + { + "epoch": 1.0457508225215717, + "grad_norm": 0.3896762006233443, + "learning_rate": 8.232895677181878e-05, + "loss": 2.9348, + "step": 16846 + }, + { + "epoch": 1.0458128996213296, + "grad_norm": 0.16824989299430676, + "learning_rate": 8.232620164739415e-05, + "loss": 2.9332, + "step": 16847 + }, + { + "epoch": 1.0458749767210875, + "grad_norm": 0.16833828597073316, + "learning_rate": 8.232344635431589e-05, + "loss": 2.8501, + "step": 16848 + }, + { + "epoch": 1.0459370538208455, + "grad_norm": 0.1566771509863487, + "learning_rate": 8.232069089259835e-05, + "loss": 2.9481, + "step": 16849 + }, + { + "epoch": 1.0459991309206034, + "grad_norm": 0.1660208084194825, + "learning_rate": 8.231793526225592e-05, + "loss": 2.9476, + "step": 16850 + }, + { + "epoch": 1.0460612080203613, + "grad_norm": 0.15493522252782185, + "learning_rate": 8.231517946330298e-05, + "loss": 2.9011, + "step": 16851 + }, + { + "epoch": 1.0461232851201192, + "grad_norm": 0.16357921763510755, + "learning_rate": 8.231242349575391e-05, + "loss": 2.9396, + "step": 16852 + }, + { + "epoch": 1.0461853622198771, + "grad_norm": 0.15318279416440905, + "learning_rate": 8.230966735962306e-05, + "loss": 3.0123, + "step": 16853 + }, + { + "epoch": 1.046247439319635, + "grad_norm": 0.1852803775936028, + "learning_rate": 8.230691105492487e-05, + "loss": 2.908, + "step": 16854 + }, + { + "epoch": 1.046309516419393, + "grad_norm": 0.16374970886517626, + "learning_rate": 8.230415458167365e-05, + "loss": 2.917, + "step": 16855 + }, + { + "epoch": 1.0463715935191509, + "grad_norm": 0.16846361562051376, + "learning_rate": 8.230139793988383e-05, + "loss": 3.0506, + "step": 16856 + }, + { + "epoch": 1.0464336706189088, + "grad_norm": 0.16526920591782765, + "learning_rate": 8.229864112956977e-05, + "loss": 2.9581, + "step": 16857 + }, + { + "epoch": 1.0464957477186665, + "grad_norm": 0.23430016435512796, + "learning_rate": 8.229588415074585e-05, + "loss": 2.9213, + "step": 16858 + }, + { + "epoch": 1.0465578248184244, + "grad_norm": 0.20284205684210863, + "learning_rate": 8.229312700342649e-05, + "loss": 2.9909, + "step": 16859 + }, + { + "epoch": 1.0466199019181823, + "grad_norm": 0.21037421250663418, + "learning_rate": 8.229036968762602e-05, + "loss": 2.9184, + "step": 16860 + }, + { + "epoch": 1.0466819790179402, + "grad_norm": 0.20602881463496425, + "learning_rate": 8.228761220335884e-05, + "loss": 2.8834, + "step": 16861 + }, + { + "epoch": 1.0467440561176982, + "grad_norm": 0.20644215677929706, + "learning_rate": 8.228485455063937e-05, + "loss": 2.954, + "step": 16862 + }, + { + "epoch": 1.046806133217456, + "grad_norm": 0.20846661039846132, + "learning_rate": 8.228209672948198e-05, + "loss": 2.9787, + "step": 16863 + }, + { + "epoch": 1.046868210317214, + "grad_norm": 0.184865584160133, + "learning_rate": 8.227933873990105e-05, + "loss": 3.0281, + "step": 16864 + }, + { + "epoch": 1.046930287416972, + "grad_norm": 0.27714138976391245, + "learning_rate": 8.227658058191098e-05, + "loss": 2.9806, + "step": 16865 + }, + { + "epoch": 1.0469923645167298, + "grad_norm": 0.18262919218211376, + "learning_rate": 8.227382225552615e-05, + "loss": 2.9294, + "step": 16866 + }, + { + "epoch": 1.0470544416164878, + "grad_norm": 0.19352983417174804, + "learning_rate": 8.227106376076095e-05, + "loss": 2.9873, + "step": 16867 + }, + { + "epoch": 1.0471165187162457, + "grad_norm": 0.1792009857672694, + "learning_rate": 8.226830509762977e-05, + "loss": 2.9723, + "step": 16868 + }, + { + "epoch": 1.0471785958160034, + "grad_norm": 0.19389838412527846, + "learning_rate": 8.226554626614702e-05, + "loss": 3.0036, + "step": 16869 + }, + { + "epoch": 1.0472406729157613, + "grad_norm": 0.19006306495695435, + "learning_rate": 8.226278726632707e-05, + "loss": 2.9367, + "step": 16870 + }, + { + "epoch": 1.0473027500155192, + "grad_norm": 0.1885539968069769, + "learning_rate": 8.226002809818434e-05, + "loss": 2.9504, + "step": 16871 + }, + { + "epoch": 1.0473648271152771, + "grad_norm": 0.1979892507685345, + "learning_rate": 8.225726876173321e-05, + "loss": 2.9473, + "step": 16872 + }, + { + "epoch": 1.047426904215035, + "grad_norm": 0.24266624048156527, + "learning_rate": 8.225450925698808e-05, + "loss": 2.8721, + "step": 16873 + }, + { + "epoch": 1.047488981314793, + "grad_norm": 0.20392106028005252, + "learning_rate": 8.225174958396335e-05, + "loss": 2.899, + "step": 16874 + }, + { + "epoch": 1.0475510584145509, + "grad_norm": 0.19973924372385954, + "learning_rate": 8.22489897426734e-05, + "loss": 2.9355, + "step": 16875 + }, + { + "epoch": 1.0476131355143088, + "grad_norm": 0.19726772118389488, + "learning_rate": 8.224622973313263e-05, + "loss": 2.8649, + "step": 16876 + }, + { + "epoch": 1.0476752126140667, + "grad_norm": 0.18268786811791526, + "learning_rate": 8.224346955535549e-05, + "loss": 2.8671, + "step": 16877 + }, + { + "epoch": 1.0477372897138246, + "grad_norm": 0.2102321824377998, + "learning_rate": 8.224070920935631e-05, + "loss": 3.0123, + "step": 16878 + }, + { + "epoch": 1.0477993668135825, + "grad_norm": 0.16501677097355724, + "learning_rate": 8.223794869514953e-05, + "loss": 2.935, + "step": 16879 + }, + { + "epoch": 1.0478614439133405, + "grad_norm": 0.22093414773515638, + "learning_rate": 8.223518801274955e-05, + "loss": 2.9357, + "step": 16880 + }, + { + "epoch": 1.0479235210130984, + "grad_norm": 0.16597045957244716, + "learning_rate": 8.223242716217077e-05, + "loss": 2.8657, + "step": 16881 + }, + { + "epoch": 1.047985598112856, + "grad_norm": 0.2384095550050011, + "learning_rate": 8.222966614342758e-05, + "loss": 2.8798, + "step": 16882 + }, + { + "epoch": 1.048047675212614, + "grad_norm": 0.1999285059787364, + "learning_rate": 8.222690495653443e-05, + "loss": 2.8698, + "step": 16883 + }, + { + "epoch": 1.048109752312372, + "grad_norm": 0.2022300776499786, + "learning_rate": 8.222414360150565e-05, + "loss": 2.9075, + "step": 16884 + }, + { + "epoch": 1.0481718294121298, + "grad_norm": 0.17226875782408052, + "learning_rate": 8.222138207835573e-05, + "loss": 2.9076, + "step": 16885 + }, + { + "epoch": 1.0482339065118877, + "grad_norm": 0.16452001742523395, + "learning_rate": 8.221862038709901e-05, + "loss": 2.9951, + "step": 16886 + }, + { + "epoch": 1.0482959836116457, + "grad_norm": 0.17825400716870587, + "learning_rate": 8.221585852774993e-05, + "loss": 2.9306, + "step": 16887 + }, + { + "epoch": 1.0483580607114036, + "grad_norm": 0.1822093550723716, + "learning_rate": 8.22130965003229e-05, + "loss": 2.9029, + "step": 16888 + }, + { + "epoch": 1.0484201378111615, + "grad_norm": 1.2023962712110163, + "learning_rate": 8.221033430483234e-05, + "loss": 2.9045, + "step": 16889 + }, + { + "epoch": 1.0484822149109194, + "grad_norm": 0.22190664663415016, + "learning_rate": 8.220757194129263e-05, + "loss": 2.9256, + "step": 16890 + }, + { + "epoch": 1.0485442920106773, + "grad_norm": 0.17558955230216863, + "learning_rate": 8.220480940971819e-05, + "loss": 2.9597, + "step": 16891 + }, + { + "epoch": 1.0486063691104353, + "grad_norm": 0.17872940551721694, + "learning_rate": 8.220204671012346e-05, + "loss": 2.9645, + "step": 16892 + }, + { + "epoch": 1.048668446210193, + "grad_norm": 0.17849029928983437, + "learning_rate": 8.219928384252283e-05, + "loss": 3.0159, + "step": 16893 + }, + { + "epoch": 1.0487305233099509, + "grad_norm": 0.19864273946706867, + "learning_rate": 8.219652080693071e-05, + "loss": 2.9007, + "step": 16894 + }, + { + "epoch": 1.0487926004097088, + "grad_norm": 0.26476203760692313, + "learning_rate": 8.219375760336153e-05, + "loss": 2.9906, + "step": 16895 + }, + { + "epoch": 1.0488546775094667, + "grad_norm": 0.18694454610217057, + "learning_rate": 8.21909942318297e-05, + "loss": 2.9683, + "step": 16896 + }, + { + "epoch": 1.0489167546092246, + "grad_norm": 0.187496294541302, + "learning_rate": 8.218823069234964e-05, + "loss": 3.0437, + "step": 16897 + }, + { + "epoch": 1.0489788317089825, + "grad_norm": 0.1840438904446194, + "learning_rate": 8.218546698493576e-05, + "loss": 2.9908, + "step": 16898 + }, + { + "epoch": 1.0490409088087405, + "grad_norm": 0.3082827316573394, + "learning_rate": 8.218270310960252e-05, + "loss": 2.9029, + "step": 16899 + }, + { + "epoch": 1.0491029859084984, + "grad_norm": 0.24520690765360728, + "learning_rate": 8.217993906636427e-05, + "loss": 2.962, + "step": 16900 + }, + { + "epoch": 1.0491650630082563, + "grad_norm": 0.18389025546505233, + "learning_rate": 8.217717485523547e-05, + "loss": 2.9476, + "step": 16901 + }, + { + "epoch": 1.0492271401080142, + "grad_norm": 0.2930624239891574, + "learning_rate": 8.217441047623055e-05, + "loss": 2.8868, + "step": 16902 + }, + { + "epoch": 1.0492892172077721, + "grad_norm": 0.21393963705784713, + "learning_rate": 8.217164592936391e-05, + "loss": 2.863, + "step": 16903 + }, + { + "epoch": 1.04935129430753, + "grad_norm": 0.2561692843432715, + "learning_rate": 8.216888121464999e-05, + "loss": 2.9584, + "step": 16904 + }, + { + "epoch": 1.049413371407288, + "grad_norm": 0.18334786202198747, + "learning_rate": 8.216611633210321e-05, + "loss": 2.8872, + "step": 16905 + }, + { + "epoch": 1.0494754485070457, + "grad_norm": 0.20956085589031578, + "learning_rate": 8.216335128173797e-05, + "loss": 2.9597, + "step": 16906 + }, + { + "epoch": 1.0495375256068036, + "grad_norm": 0.21129940916692813, + "learning_rate": 8.216058606356873e-05, + "loss": 2.914, + "step": 16907 + }, + { + "epoch": 1.0495996027065615, + "grad_norm": 0.17944590248848963, + "learning_rate": 8.215782067760991e-05, + "loss": 2.9552, + "step": 16908 + }, + { + "epoch": 1.0496616798063194, + "grad_norm": 0.17884383966236042, + "learning_rate": 8.215505512387594e-05, + "loss": 2.8542, + "step": 16909 + }, + { + "epoch": 1.0497237569060773, + "grad_norm": 0.20900157608117853, + "learning_rate": 8.215228940238123e-05, + "loss": 2.9177, + "step": 16910 + }, + { + "epoch": 1.0497858340058352, + "grad_norm": 0.2571267052196582, + "learning_rate": 8.214952351314023e-05, + "loss": 2.9188, + "step": 16911 + }, + { + "epoch": 1.0498479111055932, + "grad_norm": 0.1693306921000212, + "learning_rate": 8.214675745616735e-05, + "loss": 2.8893, + "step": 16912 + }, + { + "epoch": 1.049909988205351, + "grad_norm": 0.1899495748219673, + "learning_rate": 8.214399123147704e-05, + "loss": 2.9316, + "step": 16913 + }, + { + "epoch": 1.049972065305109, + "grad_norm": 0.19974689971740336, + "learning_rate": 8.214122483908372e-05, + "loss": 2.9567, + "step": 16914 + }, + { + "epoch": 1.050034142404867, + "grad_norm": 0.14434552502826006, + "learning_rate": 8.213845827900184e-05, + "loss": 2.9, + "step": 16915 + }, + { + "epoch": 1.0500962195046248, + "grad_norm": 0.1953915531391123, + "learning_rate": 8.21356915512458e-05, + "loss": 2.9697, + "step": 16916 + }, + { + "epoch": 1.0501582966043825, + "grad_norm": 0.16310387929571998, + "learning_rate": 8.213292465583007e-05, + "loss": 2.9188, + "step": 16917 + }, + { + "epoch": 1.0502203737041405, + "grad_norm": 0.24898813034782935, + "learning_rate": 8.213015759276908e-05, + "loss": 2.8454, + "step": 16918 + }, + { + "epoch": 1.0502824508038984, + "grad_norm": 0.19375911968056816, + "learning_rate": 8.212739036207724e-05, + "loss": 2.9356, + "step": 16919 + }, + { + "epoch": 1.0503445279036563, + "grad_norm": 0.16163859315851678, + "learning_rate": 8.212462296376902e-05, + "loss": 2.9361, + "step": 16920 + }, + { + "epoch": 1.0504066050034142, + "grad_norm": 0.15318411955766903, + "learning_rate": 8.212185539785884e-05, + "loss": 2.9294, + "step": 16921 + }, + { + "epoch": 1.0504686821031721, + "grad_norm": 0.1897158325221071, + "learning_rate": 8.211908766436114e-05, + "loss": 2.7931, + "step": 16922 + }, + { + "epoch": 1.05053075920293, + "grad_norm": 0.19358408037841524, + "learning_rate": 8.211631976329035e-05, + "loss": 2.9104, + "step": 16923 + }, + { + "epoch": 1.050592836302688, + "grad_norm": 0.21244758036547023, + "learning_rate": 8.211355169466093e-05, + "loss": 3.0077, + "step": 16924 + }, + { + "epoch": 1.0506549134024459, + "grad_norm": 0.22304191149585278, + "learning_rate": 8.211078345848733e-05, + "loss": 2.8027, + "step": 16925 + }, + { + "epoch": 1.0507169905022038, + "grad_norm": 0.21655228710450922, + "learning_rate": 8.210801505478396e-05, + "loss": 2.9395, + "step": 16926 + }, + { + "epoch": 1.0507790676019617, + "grad_norm": 0.26469367509865044, + "learning_rate": 8.210524648356528e-05, + "loss": 2.9373, + "step": 16927 + }, + { + "epoch": 1.0508411447017196, + "grad_norm": 0.1674801573458521, + "learning_rate": 8.210247774484574e-05, + "loss": 2.9209, + "step": 16928 + }, + { + "epoch": 1.0509032218014773, + "grad_norm": 0.16541907333858122, + "learning_rate": 8.20997088386398e-05, + "loss": 2.9022, + "step": 16929 + }, + { + "epoch": 1.0509652989012352, + "grad_norm": 0.2622210394102789, + "learning_rate": 8.209693976496186e-05, + "loss": 2.8951, + "step": 16930 + }, + { + "epoch": 1.0510273760009932, + "grad_norm": 0.4508262565530973, + "learning_rate": 8.20941705238264e-05, + "loss": 2.9326, + "step": 16931 + }, + { + "epoch": 1.051089453100751, + "grad_norm": 0.2040133673563516, + "learning_rate": 8.209140111524787e-05, + "loss": 2.9325, + "step": 16932 + }, + { + "epoch": 1.051151530200509, + "grad_norm": 0.2846777703248325, + "learning_rate": 8.208863153924071e-05, + "loss": 2.9722, + "step": 16933 + }, + { + "epoch": 1.051213607300267, + "grad_norm": 0.203928595812939, + "learning_rate": 8.208586179581937e-05, + "loss": 2.9215, + "step": 16934 + }, + { + "epoch": 1.0512756844000248, + "grad_norm": 0.453725926073241, + "learning_rate": 8.208309188499827e-05, + "loss": 3.0021, + "step": 16935 + }, + { + "epoch": 1.0513377614997828, + "grad_norm": 0.21429151306509048, + "learning_rate": 8.208032180679192e-05, + "loss": 2.9279, + "step": 16936 + }, + { + "epoch": 1.0513998385995407, + "grad_norm": 0.3707173898442513, + "learning_rate": 8.207755156121474e-05, + "loss": 2.8906, + "step": 16937 + }, + { + "epoch": 1.0514619156992986, + "grad_norm": 0.19915358094564234, + "learning_rate": 8.20747811482812e-05, + "loss": 2.9331, + "step": 16938 + }, + { + "epoch": 1.0515239927990565, + "grad_norm": 0.18656225948861369, + "learning_rate": 8.207201056800572e-05, + "loss": 2.8487, + "step": 16939 + }, + { + "epoch": 1.0515860698988144, + "grad_norm": 0.6759846552089118, + "learning_rate": 8.206923982040277e-05, + "loss": 2.9558, + "step": 16940 + }, + { + "epoch": 1.0516481469985721, + "grad_norm": 0.4082213767319878, + "learning_rate": 8.206646890548684e-05, + "loss": 2.9433, + "step": 16941 + }, + { + "epoch": 1.05171022409833, + "grad_norm": 0.4196488313336634, + "learning_rate": 8.206369782327233e-05, + "loss": 2.8135, + "step": 16942 + }, + { + "epoch": 1.051772301198088, + "grad_norm": 0.25228583376909325, + "learning_rate": 8.206092657377372e-05, + "loss": 2.9587, + "step": 16943 + }, + { + "epoch": 1.0518343782978459, + "grad_norm": 0.21287884810884547, + "learning_rate": 8.20581551570055e-05, + "loss": 2.9573, + "step": 16944 + }, + { + "epoch": 1.0518964553976038, + "grad_norm": 0.6035654979456362, + "learning_rate": 8.205538357298208e-05, + "loss": 2.87, + "step": 16945 + }, + { + "epoch": 1.0519585324973617, + "grad_norm": 0.26910024542118677, + "learning_rate": 8.205261182171796e-05, + "loss": 2.8965, + "step": 16946 + }, + { + "epoch": 1.0520206095971196, + "grad_norm": 0.2623673558023493, + "learning_rate": 8.204983990322756e-05, + "loss": 2.9555, + "step": 16947 + }, + { + "epoch": 1.0520826866968775, + "grad_norm": 0.2173254511470757, + "learning_rate": 8.204706781752538e-05, + "loss": 2.9648, + "step": 16948 + }, + { + "epoch": 1.0521447637966355, + "grad_norm": 0.19255721228216496, + "learning_rate": 8.204429556462588e-05, + "loss": 3.0042, + "step": 16949 + }, + { + "epoch": 1.0522068408963934, + "grad_norm": 0.17889326206688888, + "learning_rate": 8.204152314454348e-05, + "loss": 2.9111, + "step": 16950 + }, + { + "epoch": 1.0522689179961513, + "grad_norm": 0.28381162581301805, + "learning_rate": 8.203875055729271e-05, + "loss": 2.8949, + "step": 16951 + }, + { + "epoch": 1.0523309950959092, + "grad_norm": 0.2045933670796659, + "learning_rate": 8.203597780288797e-05, + "loss": 2.9995, + "step": 16952 + }, + { + "epoch": 1.052393072195667, + "grad_norm": 0.1663331641405458, + "learning_rate": 8.203320488134378e-05, + "loss": 2.9471, + "step": 16953 + }, + { + "epoch": 1.0524551492954248, + "grad_norm": 0.18210199683980963, + "learning_rate": 8.203043179267457e-05, + "loss": 2.9295, + "step": 16954 + }, + { + "epoch": 1.0525172263951827, + "grad_norm": 0.17281840162564588, + "learning_rate": 8.202765853689482e-05, + "loss": 2.925, + "step": 16955 + }, + { + "epoch": 1.0525793034949407, + "grad_norm": 0.19942471198359696, + "learning_rate": 8.202488511401901e-05, + "loss": 2.9653, + "step": 16956 + }, + { + "epoch": 1.0526413805946986, + "grad_norm": 0.23475305742592464, + "learning_rate": 8.20221115240616e-05, + "loss": 2.8045, + "step": 16957 + }, + { + "epoch": 1.0527034576944565, + "grad_norm": 0.18379154102304138, + "learning_rate": 8.201933776703705e-05, + "loss": 2.8784, + "step": 16958 + }, + { + "epoch": 1.0527655347942144, + "grad_norm": 0.2762529251539271, + "learning_rate": 8.201656384295985e-05, + "loss": 2.8899, + "step": 16959 + }, + { + "epoch": 1.0528276118939723, + "grad_norm": 0.19254871902195736, + "learning_rate": 8.201378975184446e-05, + "loss": 2.9573, + "step": 16960 + }, + { + "epoch": 1.0528896889937303, + "grad_norm": 0.23600715720509288, + "learning_rate": 8.201101549370536e-05, + "loss": 2.9516, + "step": 16961 + }, + { + "epoch": 1.0529517660934882, + "grad_norm": 0.3750722934656523, + "learning_rate": 8.200824106855702e-05, + "loss": 2.9382, + "step": 16962 + }, + { + "epoch": 1.053013843193246, + "grad_norm": 0.16763469279261808, + "learning_rate": 8.200546647641392e-05, + "loss": 2.9537, + "step": 16963 + }, + { + "epoch": 1.053075920293004, + "grad_norm": 0.19273754276537655, + "learning_rate": 8.200269171729052e-05, + "loss": 2.9204, + "step": 16964 + }, + { + "epoch": 1.0531379973927617, + "grad_norm": 0.19336508732844618, + "learning_rate": 8.199991679120133e-05, + "loss": 2.8964, + "step": 16965 + }, + { + "epoch": 1.0532000744925196, + "grad_norm": 0.18663586728214296, + "learning_rate": 8.19971416981608e-05, + "loss": 2.8184, + "step": 16966 + }, + { + "epoch": 1.0532621515922775, + "grad_norm": 0.1885283403973534, + "learning_rate": 8.199436643818339e-05, + "loss": 2.8862, + "step": 16967 + }, + { + "epoch": 1.0533242286920355, + "grad_norm": 0.19413512124256088, + "learning_rate": 8.199159101128362e-05, + "loss": 2.9188, + "step": 16968 + }, + { + "epoch": 1.0533863057917934, + "grad_norm": 0.18016694210049375, + "learning_rate": 8.198881541747596e-05, + "loss": 2.7565, + "step": 16969 + }, + { + "epoch": 1.0534483828915513, + "grad_norm": 0.19256199078552905, + "learning_rate": 8.198603965677488e-05, + "loss": 2.9335, + "step": 16970 + }, + { + "epoch": 1.0535104599913092, + "grad_norm": 0.24611641007502139, + "learning_rate": 8.198326372919487e-05, + "loss": 2.9021, + "step": 16971 + }, + { + "epoch": 1.0535725370910671, + "grad_norm": 0.18023975136108303, + "learning_rate": 8.19804876347504e-05, + "loss": 2.9655, + "step": 16972 + }, + { + "epoch": 1.053634614190825, + "grad_norm": 0.2056142701448195, + "learning_rate": 8.197771137345596e-05, + "loss": 2.93, + "step": 16973 + }, + { + "epoch": 1.053696691290583, + "grad_norm": 0.21594264158208598, + "learning_rate": 8.197493494532605e-05, + "loss": 2.8984, + "step": 16974 + }, + { + "epoch": 1.0537587683903409, + "grad_norm": 0.19765411496422855, + "learning_rate": 8.197215835037512e-05, + "loss": 2.9149, + "step": 16975 + }, + { + "epoch": 1.0538208454900988, + "grad_norm": 0.1705167103878528, + "learning_rate": 8.19693815886177e-05, + "loss": 2.9416, + "step": 16976 + }, + { + "epoch": 1.0538829225898565, + "grad_norm": 0.17101994146107596, + "learning_rate": 8.196660466006825e-05, + "loss": 2.9983, + "step": 16977 + }, + { + "epoch": 1.0539449996896144, + "grad_norm": 0.16601321835736937, + "learning_rate": 8.196382756474126e-05, + "loss": 2.938, + "step": 16978 + }, + { + "epoch": 1.0540070767893723, + "grad_norm": 0.18090654544868798, + "learning_rate": 8.196105030265121e-05, + "loss": 2.9409, + "step": 16979 + }, + { + "epoch": 1.0540691538891303, + "grad_norm": 0.21500652389276054, + "learning_rate": 8.195827287381263e-05, + "loss": 2.9978, + "step": 16980 + }, + { + "epoch": 1.0541312309888882, + "grad_norm": 0.23187949101707894, + "learning_rate": 8.195549527823998e-05, + "loss": 2.8145, + "step": 16981 + }, + { + "epoch": 1.054193308088646, + "grad_norm": 0.18469396616981265, + "learning_rate": 8.195271751594773e-05, + "loss": 3.0099, + "step": 16982 + }, + { + "epoch": 1.054255385188404, + "grad_norm": 0.17645008741112056, + "learning_rate": 8.194993958695041e-05, + "loss": 2.8899, + "step": 16983 + }, + { + "epoch": 1.054317462288162, + "grad_norm": 0.19062384464351545, + "learning_rate": 8.19471614912625e-05, + "loss": 2.9729, + "step": 16984 + }, + { + "epoch": 1.0543795393879198, + "grad_norm": 0.20412729639512464, + "learning_rate": 8.194438322889848e-05, + "loss": 3.0017, + "step": 16985 + }, + { + "epoch": 1.0544416164876778, + "grad_norm": 0.1702203978410954, + "learning_rate": 8.194160479987287e-05, + "loss": 2.8808, + "step": 16986 + }, + { + "epoch": 1.0545036935874357, + "grad_norm": 0.177162297725488, + "learning_rate": 8.193882620420016e-05, + "loss": 2.9474, + "step": 16987 + }, + { + "epoch": 1.0545657706871936, + "grad_norm": 0.189225394244923, + "learning_rate": 8.193604744189482e-05, + "loss": 3.0113, + "step": 16988 + }, + { + "epoch": 1.0546278477869513, + "grad_norm": 0.17294228861368727, + "learning_rate": 8.193326851297138e-05, + "loss": 2.9769, + "step": 16989 + }, + { + "epoch": 1.0546899248867092, + "grad_norm": 0.1666534010008619, + "learning_rate": 8.193048941744434e-05, + "loss": 2.9963, + "step": 16990 + }, + { + "epoch": 1.0547520019864671, + "grad_norm": 0.1747127323062709, + "learning_rate": 8.192771015532817e-05, + "loss": 2.945, + "step": 16991 + }, + { + "epoch": 1.054814079086225, + "grad_norm": 0.16197552957979588, + "learning_rate": 8.192493072663739e-05, + "loss": 2.9577, + "step": 16992 + }, + { + "epoch": 1.054876156185983, + "grad_norm": 0.3803708978733249, + "learning_rate": 8.192215113138648e-05, + "loss": 2.9238, + "step": 16993 + }, + { + "epoch": 1.0549382332857409, + "grad_norm": 0.17449849232832548, + "learning_rate": 8.191937136958999e-05, + "loss": 2.8839, + "step": 16994 + }, + { + "epoch": 1.0550003103854988, + "grad_norm": 0.20441541314476158, + "learning_rate": 8.191659144126236e-05, + "loss": 2.9567, + "step": 16995 + }, + { + "epoch": 1.0550623874852567, + "grad_norm": 0.29076523217590783, + "learning_rate": 8.191381134641814e-05, + "loss": 2.8586, + "step": 16996 + }, + { + "epoch": 1.0551244645850146, + "grad_norm": 0.21308710498297678, + "learning_rate": 8.191103108507181e-05, + "loss": 2.816, + "step": 16997 + }, + { + "epoch": 1.0551865416847725, + "grad_norm": 0.18682076687865826, + "learning_rate": 8.190825065723789e-05, + "loss": 2.8278, + "step": 16998 + }, + { + "epoch": 1.0552486187845305, + "grad_norm": 0.20191326375143917, + "learning_rate": 8.190547006293088e-05, + "loss": 2.8669, + "step": 16999 + }, + { + "epoch": 1.0553106958842884, + "grad_norm": 0.23874184383571198, + "learning_rate": 8.190268930216529e-05, + "loss": 2.8874, + "step": 17000 + }, + { + "epoch": 1.055372772984046, + "grad_norm": 0.1783115113207888, + "learning_rate": 8.189990837495563e-05, + "loss": 3.0183, + "step": 17001 + }, + { + "epoch": 1.055434850083804, + "grad_norm": 0.29158004967553225, + "learning_rate": 8.189712728131639e-05, + "loss": 2.9135, + "step": 17002 + }, + { + "epoch": 1.055496927183562, + "grad_norm": 0.20188594674762406, + "learning_rate": 8.18943460212621e-05, + "loss": 2.9255, + "step": 17003 + }, + { + "epoch": 1.0555590042833198, + "grad_norm": 0.23046272884197597, + "learning_rate": 8.189156459480727e-05, + "loss": 3.069, + "step": 17004 + }, + { + "epoch": 1.0556210813830778, + "grad_norm": 0.20043475090951182, + "learning_rate": 8.188878300196639e-05, + "loss": 2.8099, + "step": 17005 + }, + { + "epoch": 1.0556831584828357, + "grad_norm": 0.19527131600094583, + "learning_rate": 8.1886001242754e-05, + "loss": 2.9129, + "step": 17006 + }, + { + "epoch": 1.0557452355825936, + "grad_norm": 0.1944585633733923, + "learning_rate": 8.188321931718459e-05, + "loss": 2.8338, + "step": 17007 + }, + { + "epoch": 1.0558073126823515, + "grad_norm": 0.17506703876595847, + "learning_rate": 8.18804372252727e-05, + "loss": 2.9472, + "step": 17008 + }, + { + "epoch": 1.0558693897821094, + "grad_norm": 0.19120730551522874, + "learning_rate": 8.187765496703281e-05, + "loss": 2.8499, + "step": 17009 + }, + { + "epoch": 1.0559314668818673, + "grad_norm": 0.20558409263657204, + "learning_rate": 8.187487254247948e-05, + "loss": 3.0026, + "step": 17010 + }, + { + "epoch": 1.0559935439816253, + "grad_norm": 0.18836098917487423, + "learning_rate": 8.187208995162718e-05, + "loss": 2.9145, + "step": 17011 + }, + { + "epoch": 1.0560556210813832, + "grad_norm": 0.19811991975390175, + "learning_rate": 8.186930719449047e-05, + "loss": 2.9003, + "step": 17012 + }, + { + "epoch": 1.0561176981811409, + "grad_norm": 0.15908017587721382, + "learning_rate": 8.186652427108383e-05, + "loss": 2.9454, + "step": 17013 + }, + { + "epoch": 1.0561797752808988, + "grad_norm": 0.18811991653037607, + "learning_rate": 8.186374118142182e-05, + "loss": 2.8558, + "step": 17014 + }, + { + "epoch": 1.0562418523806567, + "grad_norm": 0.2327722104944934, + "learning_rate": 8.186095792551892e-05, + "loss": 2.9972, + "step": 17015 + }, + { + "epoch": 1.0563039294804146, + "grad_norm": 0.19789521619769668, + "learning_rate": 8.185817450338966e-05, + "loss": 2.9546, + "step": 17016 + }, + { + "epoch": 1.0563660065801725, + "grad_norm": 0.200364232323695, + "learning_rate": 8.185539091504859e-05, + "loss": 2.9787, + "step": 17017 + }, + { + "epoch": 1.0564280836799305, + "grad_norm": 0.2044597385023408, + "learning_rate": 8.18526071605102e-05, + "loss": 2.9519, + "step": 17018 + }, + { + "epoch": 1.0564901607796884, + "grad_norm": 0.19484621655173243, + "learning_rate": 8.184982323978902e-05, + "loss": 2.9337, + "step": 17019 + }, + { + "epoch": 1.0565522378794463, + "grad_norm": 0.20579291615477235, + "learning_rate": 8.18470391528996e-05, + "loss": 2.9881, + "step": 17020 + }, + { + "epoch": 1.0566143149792042, + "grad_norm": 0.17698551496271805, + "learning_rate": 8.184425489985643e-05, + "loss": 2.9666, + "step": 17021 + }, + { + "epoch": 1.0566763920789621, + "grad_norm": 0.17618355807454633, + "learning_rate": 8.184147048067405e-05, + "loss": 2.825, + "step": 17022 + }, + { + "epoch": 1.05673846917872, + "grad_norm": 0.18189449412295228, + "learning_rate": 8.1838685895367e-05, + "loss": 2.948, + "step": 17023 + }, + { + "epoch": 1.056800546278478, + "grad_norm": 0.24987708735918038, + "learning_rate": 8.183590114394979e-05, + "loss": 2.8504, + "step": 17024 + }, + { + "epoch": 1.0568626233782357, + "grad_norm": 0.2784679489740669, + "learning_rate": 8.183311622643696e-05, + "loss": 3.0431, + "step": 17025 + }, + { + "epoch": 1.0569247004779936, + "grad_norm": 0.1917739178500813, + "learning_rate": 8.183033114284303e-05, + "loss": 2.8479, + "step": 17026 + }, + { + "epoch": 1.0569867775777515, + "grad_norm": 0.22305549832057364, + "learning_rate": 8.182754589318253e-05, + "loss": 2.9704, + "step": 17027 + }, + { + "epoch": 1.0570488546775094, + "grad_norm": 0.1638332820639121, + "learning_rate": 8.182476047747001e-05, + "loss": 2.808, + "step": 17028 + }, + { + "epoch": 1.0571109317772673, + "grad_norm": 0.1987316028459511, + "learning_rate": 8.182197489571998e-05, + "loss": 2.9369, + "step": 17029 + }, + { + "epoch": 1.0571730088770253, + "grad_norm": 0.18149164714850202, + "learning_rate": 8.181918914794699e-05, + "loss": 2.8806, + "step": 17030 + }, + { + "epoch": 1.0572350859767832, + "grad_norm": 0.17183715230204955, + "learning_rate": 8.181640323416554e-05, + "loss": 2.9214, + "step": 17031 + }, + { + "epoch": 1.057297163076541, + "grad_norm": 0.18297501049195472, + "learning_rate": 8.181361715439023e-05, + "loss": 2.9132, + "step": 17032 + }, + { + "epoch": 1.057359240176299, + "grad_norm": 0.16938940016035647, + "learning_rate": 8.181083090863553e-05, + "loss": 2.8363, + "step": 17033 + }, + { + "epoch": 1.057421317276057, + "grad_norm": 0.18748300197164364, + "learning_rate": 8.180804449691601e-05, + "loss": 2.9069, + "step": 17034 + }, + { + "epoch": 1.0574833943758148, + "grad_norm": 0.17038244005647316, + "learning_rate": 8.18052579192462e-05, + "loss": 2.9502, + "step": 17035 + }, + { + "epoch": 1.0575454714755728, + "grad_norm": 0.178484371796763, + "learning_rate": 8.180247117564063e-05, + "loss": 2.9442, + "step": 17036 + }, + { + "epoch": 1.0576075485753305, + "grad_norm": 0.1613299753022942, + "learning_rate": 8.179968426611386e-05, + "loss": 2.8768, + "step": 17037 + }, + { + "epoch": 1.0576696256750884, + "grad_norm": 0.19377537491973387, + "learning_rate": 8.179689719068042e-05, + "loss": 2.8958, + "step": 17038 + }, + { + "epoch": 1.0577317027748463, + "grad_norm": 0.18211575376253825, + "learning_rate": 8.179410994935483e-05, + "loss": 2.8752, + "step": 17039 + }, + { + "epoch": 1.0577937798746042, + "grad_norm": 0.16757780421833202, + "learning_rate": 8.179132254215167e-05, + "loss": 2.9271, + "step": 17040 + }, + { + "epoch": 1.0578558569743621, + "grad_norm": 0.17604557403106774, + "learning_rate": 8.178853496908545e-05, + "loss": 2.9153, + "step": 17041 + }, + { + "epoch": 1.05791793407412, + "grad_norm": 0.14871195466896375, + "learning_rate": 8.178574723017073e-05, + "loss": 2.9149, + "step": 17042 + }, + { + "epoch": 1.057980011173878, + "grad_norm": 0.1836469350032756, + "learning_rate": 8.178295932542206e-05, + "loss": 2.8713, + "step": 17043 + }, + { + "epoch": 1.0580420882736359, + "grad_norm": 0.17848512318001916, + "learning_rate": 8.178017125485396e-05, + "loss": 2.9161, + "step": 17044 + }, + { + "epoch": 1.0581041653733938, + "grad_norm": 0.1662225107361742, + "learning_rate": 8.1777383018481e-05, + "loss": 2.9975, + "step": 17045 + }, + { + "epoch": 1.0581662424731517, + "grad_norm": 0.15910893605559512, + "learning_rate": 8.177459461631771e-05, + "loss": 2.7998, + "step": 17046 + }, + { + "epoch": 1.0582283195729096, + "grad_norm": 0.1540987698648852, + "learning_rate": 8.177180604837866e-05, + "loss": 2.9422, + "step": 17047 + }, + { + "epoch": 1.0582903966726676, + "grad_norm": 0.15924644825453982, + "learning_rate": 8.176901731467839e-05, + "loss": 2.8369, + "step": 17048 + }, + { + "epoch": 1.0583524737724253, + "grad_norm": 0.1876517913250558, + "learning_rate": 8.176622841523142e-05, + "loss": 2.9385, + "step": 17049 + }, + { + "epoch": 1.0584145508721832, + "grad_norm": 0.15241453775864328, + "learning_rate": 8.176343935005233e-05, + "loss": 2.9373, + "step": 17050 + }, + { + "epoch": 1.058476627971941, + "grad_norm": 0.1590232841342907, + "learning_rate": 8.176065011915568e-05, + "loss": 2.7698, + "step": 17051 + }, + { + "epoch": 1.058538705071699, + "grad_norm": 0.1637649733398767, + "learning_rate": 8.1757860722556e-05, + "loss": 2.9673, + "step": 17052 + }, + { + "epoch": 1.058600782171457, + "grad_norm": 0.16748255403071297, + "learning_rate": 8.175507116026786e-05, + "loss": 2.9172, + "step": 17053 + }, + { + "epoch": 1.0586628592712148, + "grad_norm": 0.18403104688231908, + "learning_rate": 8.17522814323058e-05, + "loss": 2.9234, + "step": 17054 + }, + { + "epoch": 1.0587249363709728, + "grad_norm": 0.15889760046367615, + "learning_rate": 8.174949153868438e-05, + "loss": 2.8424, + "step": 17055 + }, + { + "epoch": 1.0587870134707307, + "grad_norm": 0.17093021097455702, + "learning_rate": 8.174670147941816e-05, + "loss": 2.9241, + "step": 17056 + }, + { + "epoch": 1.0588490905704886, + "grad_norm": 0.1739806406248137, + "learning_rate": 8.174391125452168e-05, + "loss": 2.9153, + "step": 17057 + }, + { + "epoch": 1.0589111676702465, + "grad_norm": 0.24552357940907638, + "learning_rate": 8.17411208640095e-05, + "loss": 2.8773, + "step": 17058 + }, + { + "epoch": 1.0589732447700044, + "grad_norm": 0.20875008543092727, + "learning_rate": 8.17383303078962e-05, + "loss": 2.92, + "step": 17059 + }, + { + "epoch": 1.0590353218697623, + "grad_norm": 0.23160935073396388, + "learning_rate": 8.173553958619632e-05, + "loss": 2.9126, + "step": 17060 + }, + { + "epoch": 1.05909739896952, + "grad_norm": 0.1593855597232763, + "learning_rate": 8.173274869892444e-05, + "loss": 2.8912, + "step": 17061 + }, + { + "epoch": 1.059159476069278, + "grad_norm": 0.1772756315477758, + "learning_rate": 8.17299576460951e-05, + "loss": 2.8805, + "step": 17062 + }, + { + "epoch": 1.0592215531690359, + "grad_norm": 0.17491090733337292, + "learning_rate": 8.172716642772287e-05, + "loss": 2.9247, + "step": 17063 + }, + { + "epoch": 1.0592836302687938, + "grad_norm": 0.15105487228630843, + "learning_rate": 8.172437504382231e-05, + "loss": 2.9167, + "step": 17064 + }, + { + "epoch": 1.0593457073685517, + "grad_norm": 0.1564942001366465, + "learning_rate": 8.172158349440797e-05, + "loss": 2.9007, + "step": 17065 + }, + { + "epoch": 1.0594077844683096, + "grad_norm": 0.20811060483368476, + "learning_rate": 8.171879177949445e-05, + "loss": 2.8818, + "step": 17066 + }, + { + "epoch": 1.0594698615680676, + "grad_norm": 0.164659944994205, + "learning_rate": 8.171599989909629e-05, + "loss": 2.9249, + "step": 17067 + }, + { + "epoch": 1.0595319386678255, + "grad_norm": 0.1709072492235122, + "learning_rate": 8.171320785322805e-05, + "loss": 2.8834, + "step": 17068 + }, + { + "epoch": 1.0595940157675834, + "grad_norm": 0.2207799625831643, + "learning_rate": 8.17104156419043e-05, + "loss": 2.9093, + "step": 17069 + }, + { + "epoch": 1.0596560928673413, + "grad_norm": 0.175117590897602, + "learning_rate": 8.170762326513964e-05, + "loss": 2.9315, + "step": 17070 + }, + { + "epoch": 1.0597181699670992, + "grad_norm": 0.24069825023554764, + "learning_rate": 8.170483072294858e-05, + "loss": 2.9642, + "step": 17071 + }, + { + "epoch": 1.0597802470668571, + "grad_norm": 0.2275749063706187, + "learning_rate": 8.170203801534573e-05, + "loss": 2.9514, + "step": 17072 + }, + { + "epoch": 1.0598423241666148, + "grad_norm": 0.16631280442360158, + "learning_rate": 8.169924514234566e-05, + "loss": 2.9521, + "step": 17073 + }, + { + "epoch": 1.0599044012663728, + "grad_norm": 0.20433033437599868, + "learning_rate": 8.169645210396293e-05, + "loss": 2.9301, + "step": 17074 + }, + { + "epoch": 1.0599664783661307, + "grad_norm": 0.1782316022744035, + "learning_rate": 8.169365890021211e-05, + "loss": 2.9306, + "step": 17075 + }, + { + "epoch": 1.0600285554658886, + "grad_norm": 0.20559018104054289, + "learning_rate": 8.169086553110779e-05, + "loss": 2.9479, + "step": 17076 + }, + { + "epoch": 1.0600906325656465, + "grad_norm": 0.23024418768862248, + "learning_rate": 8.168807199666451e-05, + "loss": 2.903, + "step": 17077 + }, + { + "epoch": 1.0601527096654044, + "grad_norm": 0.17823556304599894, + "learning_rate": 8.16852782968969e-05, + "loss": 2.9571, + "step": 17078 + }, + { + "epoch": 1.0602147867651623, + "grad_norm": 0.19464947696586613, + "learning_rate": 8.168248443181947e-05, + "loss": 2.8586, + "step": 17079 + }, + { + "epoch": 1.0602768638649203, + "grad_norm": 0.1836202176271752, + "learning_rate": 8.167969040144686e-05, + "loss": 2.9389, + "step": 17080 + }, + { + "epoch": 1.0603389409646782, + "grad_norm": 0.2144605774270724, + "learning_rate": 8.167689620579357e-05, + "loss": 2.9906, + "step": 17081 + }, + { + "epoch": 1.060401018064436, + "grad_norm": 0.1698922811146961, + "learning_rate": 8.167410184487425e-05, + "loss": 2.9287, + "step": 17082 + }, + { + "epoch": 1.060463095164194, + "grad_norm": 0.18806143984395962, + "learning_rate": 8.167130731870346e-05, + "loss": 2.783, + "step": 17083 + }, + { + "epoch": 1.060525172263952, + "grad_norm": 0.17246373629000053, + "learning_rate": 8.166851262729576e-05, + "loss": 2.8086, + "step": 17084 + }, + { + "epoch": 1.0605872493637096, + "grad_norm": 0.17560693256768095, + "learning_rate": 8.166571777066572e-05, + "loss": 2.9664, + "step": 17085 + }, + { + "epoch": 1.0606493264634675, + "grad_norm": 0.16106946451334245, + "learning_rate": 8.166292274882797e-05, + "loss": 3.0272, + "step": 17086 + }, + { + "epoch": 1.0607114035632255, + "grad_norm": 0.20162408427605796, + "learning_rate": 8.166012756179706e-05, + "loss": 2.8931, + "step": 17087 + }, + { + "epoch": 1.0607734806629834, + "grad_norm": 0.2790901779725304, + "learning_rate": 8.165733220958758e-05, + "loss": 2.9369, + "step": 17088 + }, + { + "epoch": 1.0608355577627413, + "grad_norm": 0.1678945244309212, + "learning_rate": 8.165453669221411e-05, + "loss": 3.0114, + "step": 17089 + }, + { + "epoch": 1.0608976348624992, + "grad_norm": 0.19368972879144813, + "learning_rate": 8.165174100969124e-05, + "loss": 2.9246, + "step": 17090 + }, + { + "epoch": 1.0609597119622571, + "grad_norm": 0.20272999650346762, + "learning_rate": 8.164894516203355e-05, + "loss": 2.8724, + "step": 17091 + }, + { + "epoch": 1.061021789062015, + "grad_norm": 0.15649761606722457, + "learning_rate": 8.164614914925563e-05, + "loss": 2.9568, + "step": 17092 + }, + { + "epoch": 1.061083866161773, + "grad_norm": 0.1617347974232724, + "learning_rate": 8.164335297137206e-05, + "loss": 2.8228, + "step": 17093 + }, + { + "epoch": 1.0611459432615309, + "grad_norm": 0.33523591657509466, + "learning_rate": 8.164055662839745e-05, + "loss": 2.8792, + "step": 17094 + }, + { + "epoch": 1.0612080203612888, + "grad_norm": 0.24175073093996133, + "learning_rate": 8.163776012034638e-05, + "loss": 2.9389, + "step": 17095 + }, + { + "epoch": 1.0612700974610467, + "grad_norm": 0.19179127368400545, + "learning_rate": 8.163496344723342e-05, + "loss": 2.7869, + "step": 17096 + }, + { + "epoch": 1.0613321745608044, + "grad_norm": 0.1701372579303948, + "learning_rate": 8.163216660907318e-05, + "loss": 2.9645, + "step": 17097 + }, + { + "epoch": 1.0613942516605623, + "grad_norm": 0.20886835104006649, + "learning_rate": 8.162936960588026e-05, + "loss": 2.8394, + "step": 17098 + }, + { + "epoch": 1.0614563287603203, + "grad_norm": 0.19857170642934296, + "learning_rate": 8.162657243766923e-05, + "loss": 2.8978, + "step": 17099 + }, + { + "epoch": 1.0615184058600782, + "grad_norm": 0.1855715584649478, + "learning_rate": 8.162377510445469e-05, + "loss": 2.8677, + "step": 17100 + }, + { + "epoch": 1.061580482959836, + "grad_norm": 0.1862520770462761, + "learning_rate": 8.162097760625125e-05, + "loss": 2.9202, + "step": 17101 + }, + { + "epoch": 1.061642560059594, + "grad_norm": 0.17603238497950063, + "learning_rate": 8.161817994307348e-05, + "loss": 2.8825, + "step": 17102 + }, + { + "epoch": 1.061704637159352, + "grad_norm": 0.23013455196067312, + "learning_rate": 8.1615382114936e-05, + "loss": 2.9025, + "step": 17103 + }, + { + "epoch": 1.0617667142591098, + "grad_norm": 0.18074193771011968, + "learning_rate": 8.161258412185339e-05, + "loss": 2.8932, + "step": 17104 + }, + { + "epoch": 1.0618287913588678, + "grad_norm": 0.22740106122439036, + "learning_rate": 8.160978596384026e-05, + "loss": 2.877, + "step": 17105 + }, + { + "epoch": 1.0618908684586257, + "grad_norm": 0.16568086194915324, + "learning_rate": 8.16069876409112e-05, + "loss": 2.8549, + "step": 17106 + }, + { + "epoch": 1.0619529455583836, + "grad_norm": 0.1891886251017269, + "learning_rate": 8.16041891530808e-05, + "loss": 3.081, + "step": 17107 + }, + { + "epoch": 1.0620150226581415, + "grad_norm": 0.18036840836488544, + "learning_rate": 8.160139050036369e-05, + "loss": 2.9117, + "step": 17108 + }, + { + "epoch": 1.0620770997578992, + "grad_norm": 0.19578199225100068, + "learning_rate": 8.159859168277444e-05, + "loss": 2.8926, + "step": 17109 + }, + { + "epoch": 1.0621391768576571, + "grad_norm": 0.18623162455767714, + "learning_rate": 8.159579270032768e-05, + "loss": 2.8538, + "step": 17110 + }, + { + "epoch": 1.062201253957415, + "grad_norm": 0.9237673459211788, + "learning_rate": 8.159299355303799e-05, + "loss": 2.9529, + "step": 17111 + }, + { + "epoch": 1.062263331057173, + "grad_norm": 0.18024319263809777, + "learning_rate": 8.159019424091999e-05, + "loss": 3.0441, + "step": 17112 + }, + { + "epoch": 1.0623254081569309, + "grad_norm": 0.18968558060682061, + "learning_rate": 8.158739476398828e-05, + "loss": 3.0809, + "step": 17113 + }, + { + "epoch": 1.0623874852566888, + "grad_norm": 0.18243479672723004, + "learning_rate": 8.158459512225746e-05, + "loss": 2.9533, + "step": 17114 + }, + { + "epoch": 1.0624495623564467, + "grad_norm": 0.18333949075097766, + "learning_rate": 8.158179531574214e-05, + "loss": 2.8487, + "step": 17115 + }, + { + "epoch": 1.0625116394562046, + "grad_norm": 0.1937066437294801, + "learning_rate": 8.157899534445693e-05, + "loss": 2.9784, + "step": 17116 + }, + { + "epoch": 1.0625737165559626, + "grad_norm": 0.269964629898575, + "learning_rate": 8.157619520841644e-05, + "loss": 2.9453, + "step": 17117 + }, + { + "epoch": 1.0626357936557205, + "grad_norm": 0.21158634282461242, + "learning_rate": 8.157339490763527e-05, + "loss": 2.8696, + "step": 17118 + }, + { + "epoch": 1.0626978707554784, + "grad_norm": 0.22070952212084521, + "learning_rate": 8.157059444212803e-05, + "loss": 2.9955, + "step": 17119 + }, + { + "epoch": 1.0627599478552363, + "grad_norm": 0.30488874440651026, + "learning_rate": 8.156779381190933e-05, + "loss": 2.8883, + "step": 17120 + }, + { + "epoch": 1.062822024954994, + "grad_norm": 0.44255720999876935, + "learning_rate": 8.156499301699378e-05, + "loss": 2.9841, + "step": 17121 + }, + { + "epoch": 1.062884102054752, + "grad_norm": 0.2589412889270137, + "learning_rate": 8.156219205739602e-05, + "loss": 2.8646, + "step": 17122 + }, + { + "epoch": 1.0629461791545098, + "grad_norm": 0.21081283205482892, + "learning_rate": 8.155939093313063e-05, + "loss": 3.008, + "step": 17123 + }, + { + "epoch": 1.0630082562542678, + "grad_norm": 0.2607654175429245, + "learning_rate": 8.155658964421222e-05, + "loss": 2.9255, + "step": 17124 + }, + { + "epoch": 1.0630703333540257, + "grad_norm": 0.4363908502338515, + "learning_rate": 8.155378819065544e-05, + "loss": 2.9281, + "step": 17125 + }, + { + "epoch": 1.0631324104537836, + "grad_norm": 0.24293097849464668, + "learning_rate": 8.15509865724749e-05, + "loss": 2.942, + "step": 17126 + }, + { + "epoch": 1.0631944875535415, + "grad_norm": 0.19081824349381796, + "learning_rate": 8.154818478968518e-05, + "loss": 3.0103, + "step": 17127 + }, + { + "epoch": 1.0632565646532994, + "grad_norm": 0.24210351595293159, + "learning_rate": 8.154538284230093e-05, + "loss": 2.8865, + "step": 17128 + }, + { + "epoch": 1.0633186417530573, + "grad_norm": 0.34973304308780967, + "learning_rate": 8.154258073033676e-05, + "loss": 2.9715, + "step": 17129 + }, + { + "epoch": 1.0633807188528153, + "grad_norm": 0.23276487253726463, + "learning_rate": 8.15397784538073e-05, + "loss": 2.9898, + "step": 17130 + }, + { + "epoch": 1.0634427959525732, + "grad_norm": 0.3487986642970804, + "learning_rate": 8.153697601272714e-05, + "loss": 3.073, + "step": 17131 + }, + { + "epoch": 1.0635048730523309, + "grad_norm": 0.36228669319811907, + "learning_rate": 8.153417340711093e-05, + "loss": 2.8932, + "step": 17132 + }, + { + "epoch": 1.0635669501520888, + "grad_norm": 0.20375456813652754, + "learning_rate": 8.153137063697329e-05, + "loss": 2.935, + "step": 17133 + }, + { + "epoch": 1.0636290272518467, + "grad_norm": 0.2402125945723751, + "learning_rate": 8.152856770232883e-05, + "loss": 3.0049, + "step": 17134 + }, + { + "epoch": 1.0636911043516046, + "grad_norm": 0.2327336376187468, + "learning_rate": 8.152576460319216e-05, + "loss": 2.971, + "step": 17135 + }, + { + "epoch": 1.0637531814513626, + "grad_norm": 0.1795051208917215, + "learning_rate": 8.152296133957795e-05, + "loss": 2.8597, + "step": 17136 + }, + { + "epoch": 1.0638152585511205, + "grad_norm": 0.20736623150588931, + "learning_rate": 8.152015791150079e-05, + "loss": 2.9296, + "step": 17137 + }, + { + "epoch": 1.0638773356508784, + "grad_norm": 0.265725831920934, + "learning_rate": 8.151735431897531e-05, + "loss": 2.9401, + "step": 17138 + }, + { + "epoch": 1.0639394127506363, + "grad_norm": 0.23642530886363786, + "learning_rate": 8.151455056201615e-05, + "loss": 2.9966, + "step": 17139 + }, + { + "epoch": 1.0640014898503942, + "grad_norm": 0.1923007305884208, + "learning_rate": 8.15117466406379e-05, + "loss": 2.9461, + "step": 17140 + }, + { + "epoch": 1.0640635669501521, + "grad_norm": 0.2117107752322124, + "learning_rate": 8.150894255485524e-05, + "loss": 2.9098, + "step": 17141 + }, + { + "epoch": 1.06412564404991, + "grad_norm": 0.1914450353135128, + "learning_rate": 8.150613830468279e-05, + "loss": 2.9917, + "step": 17142 + }, + { + "epoch": 1.064187721149668, + "grad_norm": 0.17759210530676892, + "learning_rate": 8.150333389013515e-05, + "loss": 2.9308, + "step": 17143 + }, + { + "epoch": 1.064249798249426, + "grad_norm": 0.23871607269771272, + "learning_rate": 8.150052931122699e-05, + "loss": 2.8898, + "step": 17144 + }, + { + "epoch": 1.0643118753491836, + "grad_norm": 0.29586687328828304, + "learning_rate": 8.149772456797289e-05, + "loss": 2.9676, + "step": 17145 + }, + { + "epoch": 1.0643739524489415, + "grad_norm": 0.17561789975967326, + "learning_rate": 8.149491966038753e-05, + "loss": 2.9105, + "step": 17146 + }, + { + "epoch": 1.0644360295486994, + "grad_norm": 0.19985395031175052, + "learning_rate": 8.149211458848554e-05, + "loss": 2.9183, + "step": 17147 + }, + { + "epoch": 1.0644981066484573, + "grad_norm": 0.1692433402157754, + "learning_rate": 8.148930935228154e-05, + "loss": 2.9937, + "step": 17148 + }, + { + "epoch": 1.0645601837482153, + "grad_norm": 0.1666398374500434, + "learning_rate": 8.148650395179015e-05, + "loss": 2.8937, + "step": 17149 + }, + { + "epoch": 1.0646222608479732, + "grad_norm": 0.2012143414312575, + "learning_rate": 8.148369838702603e-05, + "loss": 3.0155, + "step": 17150 + }, + { + "epoch": 1.064684337947731, + "grad_norm": 0.17571788281466402, + "learning_rate": 8.148089265800383e-05, + "loss": 2.8775, + "step": 17151 + }, + { + "epoch": 1.064746415047489, + "grad_norm": 0.20660249830680635, + "learning_rate": 8.147808676473816e-05, + "loss": 2.8835, + "step": 17152 + }, + { + "epoch": 1.064808492147247, + "grad_norm": 0.17821156715488193, + "learning_rate": 8.147528070724367e-05, + "loss": 2.968, + "step": 17153 + }, + { + "epoch": 1.0648705692470049, + "grad_norm": 0.1946567207363804, + "learning_rate": 8.1472474485535e-05, + "loss": 2.9097, + "step": 17154 + }, + { + "epoch": 1.0649326463467628, + "grad_norm": 0.1840317654977355, + "learning_rate": 8.14696680996268e-05, + "loss": 2.8561, + "step": 17155 + }, + { + "epoch": 1.0649947234465205, + "grad_norm": 0.17075089216243577, + "learning_rate": 8.146686154953367e-05, + "loss": 2.8827, + "step": 17156 + }, + { + "epoch": 1.0650568005462784, + "grad_norm": 0.18529118437826142, + "learning_rate": 8.146405483527031e-05, + "loss": 2.8234, + "step": 17157 + }, + { + "epoch": 1.0651188776460363, + "grad_norm": 0.1796601005064596, + "learning_rate": 8.146124795685133e-05, + "loss": 2.956, + "step": 17158 + }, + { + "epoch": 1.0651809547457942, + "grad_norm": 0.1762954080032593, + "learning_rate": 8.14584409142914e-05, + "loss": 2.9156, + "step": 17159 + }, + { + "epoch": 1.0652430318455521, + "grad_norm": 0.18207890946741306, + "learning_rate": 8.145563370760513e-05, + "loss": 2.9869, + "step": 17160 + }, + { + "epoch": 1.06530510894531, + "grad_norm": 0.20405207494887953, + "learning_rate": 8.145282633680717e-05, + "loss": 2.9208, + "step": 17161 + }, + { + "epoch": 1.065367186045068, + "grad_norm": 0.1947380582884576, + "learning_rate": 8.14500188019122e-05, + "loss": 2.9966, + "step": 17162 + }, + { + "epoch": 1.0654292631448259, + "grad_norm": 0.19169928054691976, + "learning_rate": 8.144721110293485e-05, + "loss": 2.963, + "step": 17163 + }, + { + "epoch": 1.0654913402445838, + "grad_norm": 0.1780110291555995, + "learning_rate": 8.144440323988975e-05, + "loss": 2.9854, + "step": 17164 + }, + { + "epoch": 1.0655534173443417, + "grad_norm": 0.18598022050310767, + "learning_rate": 8.144159521279158e-05, + "loss": 2.9041, + "step": 17165 + }, + { + "epoch": 1.0656154944440996, + "grad_norm": 0.22103727414683158, + "learning_rate": 8.143878702165497e-05, + "loss": 2.9206, + "step": 17166 + }, + { + "epoch": 1.0656775715438576, + "grad_norm": 0.16808391659368607, + "learning_rate": 8.143597866649457e-05, + "loss": 2.9766, + "step": 17167 + }, + { + "epoch": 1.0657396486436155, + "grad_norm": 0.19950438573266271, + "learning_rate": 8.143317014732504e-05, + "loss": 2.8934, + "step": 17168 + }, + { + "epoch": 1.0658017257433732, + "grad_norm": 0.21380231038746333, + "learning_rate": 8.143036146416104e-05, + "loss": 2.894, + "step": 17169 + }, + { + "epoch": 1.065863802843131, + "grad_norm": 0.17723804382035244, + "learning_rate": 8.14275526170172e-05, + "loss": 2.8677, + "step": 17170 + }, + { + "epoch": 1.065925879942889, + "grad_norm": 0.18612209301209798, + "learning_rate": 8.142474360590821e-05, + "loss": 2.9458, + "step": 17171 + }, + { + "epoch": 1.065987957042647, + "grad_norm": 0.1727980945888025, + "learning_rate": 8.142193443084869e-05, + "loss": 2.8608, + "step": 17172 + }, + { + "epoch": 1.0660500341424048, + "grad_norm": 0.26986146132101296, + "learning_rate": 8.14191250918533e-05, + "loss": 2.8335, + "step": 17173 + }, + { + "epoch": 1.0661121112421628, + "grad_norm": 0.17441080246256938, + "learning_rate": 8.141631558893672e-05, + "loss": 2.921, + "step": 17174 + }, + { + "epoch": 1.0661741883419207, + "grad_norm": 0.17566439135085718, + "learning_rate": 8.141350592211358e-05, + "loss": 2.9942, + "step": 17175 + }, + { + "epoch": 1.0662362654416786, + "grad_norm": 0.16868979916696467, + "learning_rate": 8.141069609139857e-05, + "loss": 2.887, + "step": 17176 + }, + { + "epoch": 1.0662983425414365, + "grad_norm": 0.15559974906965202, + "learning_rate": 8.140788609680634e-05, + "loss": 2.8125, + "step": 17177 + }, + { + "epoch": 1.0663604196411944, + "grad_norm": 0.16640653028151142, + "learning_rate": 8.140507593835152e-05, + "loss": 2.8706, + "step": 17178 + }, + { + "epoch": 1.0664224967409524, + "grad_norm": 0.16248600078766817, + "learning_rate": 8.140226561604881e-05, + "loss": 2.8934, + "step": 17179 + }, + { + "epoch": 1.06648457384071, + "grad_norm": 0.2136499355287497, + "learning_rate": 8.139945512991285e-05, + "loss": 2.921, + "step": 17180 + }, + { + "epoch": 1.066546650940468, + "grad_norm": 0.24992695128617917, + "learning_rate": 8.139664447995832e-05, + "loss": 2.9015, + "step": 17181 + }, + { + "epoch": 1.0666087280402259, + "grad_norm": 0.1634409404213828, + "learning_rate": 8.139383366619987e-05, + "loss": 2.827, + "step": 17182 + }, + { + "epoch": 1.0666708051399838, + "grad_norm": 0.18221511027993156, + "learning_rate": 8.139102268865218e-05, + "loss": 2.8975, + "step": 17183 + }, + { + "epoch": 1.0667328822397417, + "grad_norm": 0.17918534186160961, + "learning_rate": 8.138821154732988e-05, + "loss": 2.9846, + "step": 17184 + }, + { + "epoch": 1.0667949593394996, + "grad_norm": 0.1802981408284113, + "learning_rate": 8.138540024224768e-05, + "loss": 2.8542, + "step": 17185 + }, + { + "epoch": 1.0668570364392576, + "grad_norm": 0.19369539291188761, + "learning_rate": 8.138258877342022e-05, + "loss": 2.9533, + "step": 17186 + }, + { + "epoch": 1.0669191135390155, + "grad_norm": 0.18042429917410696, + "learning_rate": 8.137977714086217e-05, + "loss": 2.8593, + "step": 17187 + }, + { + "epoch": 1.0669811906387734, + "grad_norm": 0.16072890202440127, + "learning_rate": 8.13769653445882e-05, + "loss": 2.939, + "step": 17188 + }, + { + "epoch": 1.0670432677385313, + "grad_norm": 0.1664985141272586, + "learning_rate": 8.1374153384613e-05, + "loss": 2.8943, + "step": 17189 + }, + { + "epoch": 1.0671053448382892, + "grad_norm": 0.16921711705160505, + "learning_rate": 8.137134126095122e-05, + "loss": 2.9367, + "step": 17190 + }, + { + "epoch": 1.0671674219380471, + "grad_norm": 0.15767718240433606, + "learning_rate": 8.136852897361754e-05, + "loss": 2.9119, + "step": 17191 + }, + { + "epoch": 1.067229499037805, + "grad_norm": 0.15988758860112845, + "learning_rate": 8.136571652262664e-05, + "loss": 2.8267, + "step": 17192 + }, + { + "epoch": 1.0672915761375628, + "grad_norm": 0.1842469304008946, + "learning_rate": 8.136290390799316e-05, + "loss": 2.8672, + "step": 17193 + }, + { + "epoch": 1.0673536532373207, + "grad_norm": 0.15823842467215593, + "learning_rate": 8.13600911297318e-05, + "loss": 2.9053, + "step": 17194 + }, + { + "epoch": 1.0674157303370786, + "grad_norm": 0.15898396447430968, + "learning_rate": 8.135727818785724e-05, + "loss": 2.836, + "step": 17195 + }, + { + "epoch": 1.0674778074368365, + "grad_norm": 0.26534110212773865, + "learning_rate": 8.135446508238415e-05, + "loss": 2.9298, + "step": 17196 + }, + { + "epoch": 1.0675398845365944, + "grad_norm": 0.1850156507766689, + "learning_rate": 8.13516518133272e-05, + "loss": 2.9105, + "step": 17197 + }, + { + "epoch": 1.0676019616363523, + "grad_norm": 0.21838847155852423, + "learning_rate": 8.134883838070108e-05, + "loss": 2.9951, + "step": 17198 + }, + { + "epoch": 1.0676640387361103, + "grad_norm": 0.19110131789136792, + "learning_rate": 8.134602478452044e-05, + "loss": 2.9787, + "step": 17199 + }, + { + "epoch": 1.0677261158358682, + "grad_norm": 0.1999189655976241, + "learning_rate": 8.134321102480001e-05, + "loss": 2.8979, + "step": 17200 + }, + { + "epoch": 1.067788192935626, + "grad_norm": 0.19654281308390584, + "learning_rate": 8.134039710155442e-05, + "loss": 2.9547, + "step": 17201 + }, + { + "epoch": 1.067850270035384, + "grad_norm": 0.19229220662673188, + "learning_rate": 8.133758301479836e-05, + "loss": 2.8738, + "step": 17202 + }, + { + "epoch": 1.067912347135142, + "grad_norm": 0.19618834573381755, + "learning_rate": 8.133476876454653e-05, + "loss": 2.9445, + "step": 17203 + }, + { + "epoch": 1.0679744242348996, + "grad_norm": 0.19098996612209482, + "learning_rate": 8.133195435081361e-05, + "loss": 2.8646, + "step": 17204 + }, + { + "epoch": 1.0680365013346576, + "grad_norm": 0.1822623921911712, + "learning_rate": 8.132913977361427e-05, + "loss": 3.0259, + "step": 17205 + }, + { + "epoch": 1.0680985784344155, + "grad_norm": 0.175664322428546, + "learning_rate": 8.13263250329632e-05, + "loss": 2.9225, + "step": 17206 + }, + { + "epoch": 1.0681606555341734, + "grad_norm": 0.1654400327976439, + "learning_rate": 8.13235101288751e-05, + "loss": 2.9277, + "step": 17207 + }, + { + "epoch": 1.0682227326339313, + "grad_norm": 0.23275046802262678, + "learning_rate": 8.132069506136464e-05, + "loss": 2.962, + "step": 17208 + }, + { + "epoch": 1.0682848097336892, + "grad_norm": 0.24095734304262756, + "learning_rate": 8.13178798304465e-05, + "loss": 2.9149, + "step": 17209 + }, + { + "epoch": 1.0683468868334471, + "grad_norm": 0.201733527332954, + "learning_rate": 8.131506443613539e-05, + "loss": 2.9533, + "step": 17210 + }, + { + "epoch": 1.068408963933205, + "grad_norm": 0.20042174954587763, + "learning_rate": 8.131224887844599e-05, + "loss": 2.9255, + "step": 17211 + }, + { + "epoch": 1.068471041032963, + "grad_norm": 0.18526516660453593, + "learning_rate": 8.130943315739299e-05, + "loss": 2.8657, + "step": 17212 + }, + { + "epoch": 1.068533118132721, + "grad_norm": 0.24514472472388996, + "learning_rate": 8.130661727299106e-05, + "loss": 2.9484, + "step": 17213 + }, + { + "epoch": 1.0685951952324788, + "grad_norm": 0.17259809800665926, + "learning_rate": 8.130380122525492e-05, + "loss": 2.8932, + "step": 17214 + }, + { + "epoch": 1.0686572723322367, + "grad_norm": 0.1615092302414333, + "learning_rate": 8.130098501419925e-05, + "loss": 2.8511, + "step": 17215 + }, + { + "epoch": 1.0687193494319946, + "grad_norm": 0.15949309101918574, + "learning_rate": 8.129816863983873e-05, + "loss": 2.9462, + "step": 17216 + }, + { + "epoch": 1.0687814265317523, + "grad_norm": 0.16019617722749277, + "learning_rate": 8.129535210218808e-05, + "loss": 2.8474, + "step": 17217 + }, + { + "epoch": 1.0688435036315103, + "grad_norm": 0.1561598458077443, + "learning_rate": 8.129253540126196e-05, + "loss": 2.8428, + "step": 17218 + }, + { + "epoch": 1.0689055807312682, + "grad_norm": 0.16293604466820313, + "learning_rate": 8.12897185370751e-05, + "loss": 2.9296, + "step": 17219 + }, + { + "epoch": 1.068967657831026, + "grad_norm": 0.21715692744064996, + "learning_rate": 8.128690150964219e-05, + "loss": 2.838, + "step": 17220 + }, + { + "epoch": 1.069029734930784, + "grad_norm": 0.17483721062922708, + "learning_rate": 8.128408431897791e-05, + "loss": 2.9746, + "step": 17221 + }, + { + "epoch": 1.069091812030542, + "grad_norm": 0.17828305409861805, + "learning_rate": 8.128126696509697e-05, + "loss": 2.8603, + "step": 17222 + }, + { + "epoch": 1.0691538891302999, + "grad_norm": 0.1857179756308877, + "learning_rate": 8.127844944801406e-05, + "loss": 2.9311, + "step": 17223 + }, + { + "epoch": 1.0692159662300578, + "grad_norm": 0.16279996709306313, + "learning_rate": 8.12756317677439e-05, + "loss": 2.9263, + "step": 17224 + }, + { + "epoch": 1.0692780433298157, + "grad_norm": 0.1774684047786299, + "learning_rate": 8.127281392430117e-05, + "loss": 2.8922, + "step": 17225 + }, + { + "epoch": 1.0693401204295736, + "grad_norm": 0.17121069038437683, + "learning_rate": 8.126999591770057e-05, + "loss": 3.0127, + "step": 17226 + }, + { + "epoch": 1.0694021975293315, + "grad_norm": 0.17859154762452006, + "learning_rate": 8.126717774795682e-05, + "loss": 3.0343, + "step": 17227 + }, + { + "epoch": 1.0694642746290892, + "grad_norm": 0.16585116936163882, + "learning_rate": 8.126435941508461e-05, + "loss": 2.9292, + "step": 17228 + }, + { + "epoch": 1.0695263517288471, + "grad_norm": 0.1947943968165074, + "learning_rate": 8.126154091909865e-05, + "loss": 2.9078, + "step": 17229 + }, + { + "epoch": 1.069588428828605, + "grad_norm": 0.1760995050131309, + "learning_rate": 8.125872226001364e-05, + "loss": 2.9393, + "step": 17230 + }, + { + "epoch": 1.069650505928363, + "grad_norm": 0.16100529913851955, + "learning_rate": 8.125590343784428e-05, + "loss": 2.9039, + "step": 17231 + }, + { + "epoch": 1.069712583028121, + "grad_norm": 0.14634653599848815, + "learning_rate": 8.125308445260531e-05, + "loss": 2.8855, + "step": 17232 + }, + { + "epoch": 1.0697746601278788, + "grad_norm": 0.18231102068656926, + "learning_rate": 8.12502653043114e-05, + "loss": 2.8825, + "step": 17233 + }, + { + "epoch": 1.0698367372276367, + "grad_norm": 0.17434490195340108, + "learning_rate": 8.124744599297725e-05, + "loss": 2.7804, + "step": 17234 + }, + { + "epoch": 1.0698988143273946, + "grad_norm": 0.17945098860734865, + "learning_rate": 8.124462651861761e-05, + "loss": 2.8554, + "step": 17235 + }, + { + "epoch": 1.0699608914271526, + "grad_norm": 0.15187404433094961, + "learning_rate": 8.124180688124716e-05, + "loss": 2.8486, + "step": 17236 + }, + { + "epoch": 1.0700229685269105, + "grad_norm": 0.1569621902172036, + "learning_rate": 8.123898708088063e-05, + "loss": 3.0045, + "step": 17237 + }, + { + "epoch": 1.0700850456266684, + "grad_norm": 0.1507052375740308, + "learning_rate": 8.12361671175327e-05, + "loss": 2.8984, + "step": 17238 + }, + { + "epoch": 1.0701471227264263, + "grad_norm": 0.17374027904239545, + "learning_rate": 8.123334699121811e-05, + "loss": 2.907, + "step": 17239 + }, + { + "epoch": 1.0702091998261842, + "grad_norm": 0.1642813131641762, + "learning_rate": 8.123052670195156e-05, + "loss": 2.8724, + "step": 17240 + }, + { + "epoch": 1.070271276925942, + "grad_norm": 0.15189722842352485, + "learning_rate": 8.122770624974778e-05, + "loss": 2.8924, + "step": 17241 + }, + { + "epoch": 1.0703333540256998, + "grad_norm": 0.1706596231789063, + "learning_rate": 8.122488563462149e-05, + "loss": 2.8811, + "step": 17242 + }, + { + "epoch": 1.0703954311254578, + "grad_norm": 0.14513917604927035, + "learning_rate": 8.122206485658737e-05, + "loss": 3.0108, + "step": 17243 + }, + { + "epoch": 1.0704575082252157, + "grad_norm": 0.16620426119743892, + "learning_rate": 8.121924391566017e-05, + "loss": 2.8218, + "step": 17244 + }, + { + "epoch": 1.0705195853249736, + "grad_norm": 0.1651169294143656, + "learning_rate": 8.12164228118546e-05, + "loss": 2.9661, + "step": 17245 + }, + { + "epoch": 1.0705816624247315, + "grad_norm": 0.19526234936582293, + "learning_rate": 8.121360154518536e-05, + "loss": 2.9642, + "step": 17246 + }, + { + "epoch": 1.0706437395244894, + "grad_norm": 0.16967382083468757, + "learning_rate": 8.121078011566719e-05, + "loss": 2.8176, + "step": 17247 + }, + { + "epoch": 1.0707058166242474, + "grad_norm": 0.16054669911544336, + "learning_rate": 8.12079585233148e-05, + "loss": 2.9715, + "step": 17248 + }, + { + "epoch": 1.0707678937240053, + "grad_norm": 0.17766364230676632, + "learning_rate": 8.120513676814292e-05, + "loss": 2.8731, + "step": 17249 + }, + { + "epoch": 1.0708299708237632, + "grad_norm": 0.14716509010623238, + "learning_rate": 8.120231485016627e-05, + "loss": 2.7954, + "step": 17250 + }, + { + "epoch": 1.070892047923521, + "grad_norm": 0.16913831303978813, + "learning_rate": 8.119949276939955e-05, + "loss": 2.9487, + "step": 17251 + }, + { + "epoch": 1.0709541250232788, + "grad_norm": 0.1510254786849112, + "learning_rate": 8.119667052585753e-05, + "loss": 2.9026, + "step": 17252 + }, + { + "epoch": 1.0710162021230367, + "grad_norm": 0.18269917488705248, + "learning_rate": 8.119384811955489e-05, + "loss": 2.963, + "step": 17253 + }, + { + "epoch": 1.0710782792227946, + "grad_norm": 0.1939124184149125, + "learning_rate": 8.119102555050637e-05, + "loss": 2.8324, + "step": 17254 + }, + { + "epoch": 1.0711403563225526, + "grad_norm": 0.16261308041292385, + "learning_rate": 8.11882028187267e-05, + "loss": 2.8674, + "step": 17255 + }, + { + "epoch": 1.0712024334223105, + "grad_norm": 0.16204469033709723, + "learning_rate": 8.118537992423061e-05, + "loss": 2.8688, + "step": 17256 + }, + { + "epoch": 1.0712645105220684, + "grad_norm": 0.17699817521897662, + "learning_rate": 8.118255686703282e-05, + "loss": 2.9671, + "step": 17257 + }, + { + "epoch": 1.0713265876218263, + "grad_norm": 0.1622661441524688, + "learning_rate": 8.117973364714805e-05, + "loss": 2.7271, + "step": 17258 + }, + { + "epoch": 1.0713886647215842, + "grad_norm": 0.1663645107683616, + "learning_rate": 8.117691026459105e-05, + "loss": 2.9814, + "step": 17259 + }, + { + "epoch": 1.0714507418213421, + "grad_norm": 0.16987644333608706, + "learning_rate": 8.117408671937654e-05, + "loss": 2.9851, + "step": 17260 + }, + { + "epoch": 1.0715128189211, + "grad_norm": 0.17220132367260027, + "learning_rate": 8.117126301151925e-05, + "loss": 2.8958, + "step": 17261 + }, + { + "epoch": 1.071574896020858, + "grad_norm": 0.19905601661580052, + "learning_rate": 8.116843914103393e-05, + "loss": 2.8995, + "step": 17262 + }, + { + "epoch": 1.071636973120616, + "grad_norm": 0.17970017720159603, + "learning_rate": 8.116561510793528e-05, + "loss": 2.8936, + "step": 17263 + }, + { + "epoch": 1.0716990502203738, + "grad_norm": 0.21199651950945886, + "learning_rate": 8.116279091223804e-05, + "loss": 2.9022, + "step": 17264 + }, + { + "epoch": 1.0717611273201315, + "grad_norm": 0.17394264606357027, + "learning_rate": 8.115996655395698e-05, + "loss": 2.8716, + "step": 17265 + }, + { + "epoch": 1.0718232044198894, + "grad_norm": 0.2470418891087551, + "learning_rate": 8.115714203310679e-05, + "loss": 2.9047, + "step": 17266 + }, + { + "epoch": 1.0718852815196473, + "grad_norm": 0.1850500683594706, + "learning_rate": 8.115431734970224e-05, + "loss": 2.8871, + "step": 17267 + }, + { + "epoch": 1.0719473586194053, + "grad_norm": 0.21433032850765377, + "learning_rate": 8.115149250375804e-05, + "loss": 2.9119, + "step": 17268 + }, + { + "epoch": 1.0720094357191632, + "grad_norm": 0.20358052893664244, + "learning_rate": 8.114866749528896e-05, + "loss": 2.9323, + "step": 17269 + }, + { + "epoch": 1.072071512818921, + "grad_norm": 0.1888221182003898, + "learning_rate": 8.11458423243097e-05, + "loss": 2.9051, + "step": 17270 + }, + { + "epoch": 1.072133589918679, + "grad_norm": 0.17820698917452218, + "learning_rate": 8.114301699083503e-05, + "loss": 2.7894, + "step": 17271 + }, + { + "epoch": 1.072195667018437, + "grad_norm": 0.1843501498561102, + "learning_rate": 8.114019149487969e-05, + "loss": 2.9747, + "step": 17272 + }, + { + "epoch": 1.0722577441181949, + "grad_norm": 0.1653762623024396, + "learning_rate": 8.11373658364584e-05, + "loss": 2.9162, + "step": 17273 + }, + { + "epoch": 1.0723198212179528, + "grad_norm": 0.17373837071889534, + "learning_rate": 8.11345400155859e-05, + "loss": 2.9503, + "step": 17274 + }, + { + "epoch": 1.0723818983177107, + "grad_norm": 0.1747057970663353, + "learning_rate": 8.113171403227697e-05, + "loss": 2.9068, + "step": 17275 + }, + { + "epoch": 1.0724439754174684, + "grad_norm": 0.16355274685805524, + "learning_rate": 8.112888788654631e-05, + "loss": 2.9569, + "step": 17276 + }, + { + "epoch": 1.0725060525172263, + "grad_norm": 0.17552519268391514, + "learning_rate": 8.11260615784087e-05, + "loss": 3.0215, + "step": 17277 + }, + { + "epoch": 1.0725681296169842, + "grad_norm": 0.17098913783157632, + "learning_rate": 8.112323510787887e-05, + "loss": 2.8277, + "step": 17278 + }, + { + "epoch": 1.0726302067167421, + "grad_norm": 0.19424119873310736, + "learning_rate": 8.112040847497156e-05, + "loss": 2.969, + "step": 17279 + }, + { + "epoch": 1.0726922838165, + "grad_norm": 0.1588893184437462, + "learning_rate": 8.111758167970153e-05, + "loss": 2.9661, + "step": 17280 + }, + { + "epoch": 1.072754360916258, + "grad_norm": 0.168472007417472, + "learning_rate": 8.111475472208352e-05, + "loss": 2.927, + "step": 17281 + }, + { + "epoch": 1.072816438016016, + "grad_norm": 0.14847945574054364, + "learning_rate": 8.111192760213229e-05, + "loss": 2.9307, + "step": 17282 + }, + { + "epoch": 1.0728785151157738, + "grad_norm": 0.16739334197960853, + "learning_rate": 8.110910031986256e-05, + "loss": 2.9526, + "step": 17283 + }, + { + "epoch": 1.0729405922155317, + "grad_norm": 0.1542192155025147, + "learning_rate": 8.110627287528912e-05, + "loss": 2.9046, + "step": 17284 + }, + { + "epoch": 1.0730026693152896, + "grad_norm": 0.16184146413129796, + "learning_rate": 8.11034452684267e-05, + "loss": 2.9292, + "step": 17285 + }, + { + "epoch": 1.0730647464150476, + "grad_norm": 0.1524722034593338, + "learning_rate": 8.110061749929006e-05, + "loss": 2.9125, + "step": 17286 + }, + { + "epoch": 1.0731268235148055, + "grad_norm": 0.14757997874140247, + "learning_rate": 8.109778956789394e-05, + "loss": 2.8646, + "step": 17287 + }, + { + "epoch": 1.0731889006145634, + "grad_norm": 0.1818012224898032, + "learning_rate": 8.109496147425309e-05, + "loss": 2.9346, + "step": 17288 + }, + { + "epoch": 1.073250977714321, + "grad_norm": 0.22906264830736717, + "learning_rate": 8.109213321838229e-05, + "loss": 3.0139, + "step": 17289 + }, + { + "epoch": 1.073313054814079, + "grad_norm": 0.1981718662108266, + "learning_rate": 8.108930480029629e-05, + "loss": 2.9285, + "step": 17290 + }, + { + "epoch": 1.073375131913837, + "grad_norm": 0.24468860925591085, + "learning_rate": 8.108647622000983e-05, + "loss": 2.9244, + "step": 17291 + }, + { + "epoch": 1.0734372090135949, + "grad_norm": 0.16403912763780054, + "learning_rate": 8.108364747753767e-05, + "loss": 2.818, + "step": 17292 + }, + { + "epoch": 1.0734992861133528, + "grad_norm": 0.1744035668541793, + "learning_rate": 8.10808185728946e-05, + "loss": 2.9355, + "step": 17293 + }, + { + "epoch": 1.0735613632131107, + "grad_norm": 0.25844728465163974, + "learning_rate": 8.107798950609534e-05, + "loss": 2.8471, + "step": 17294 + }, + { + "epoch": 1.0736234403128686, + "grad_norm": 0.22732386420148792, + "learning_rate": 8.107516027715467e-05, + "loss": 2.7755, + "step": 17295 + }, + { + "epoch": 1.0736855174126265, + "grad_norm": 0.17998665288276372, + "learning_rate": 8.107233088608734e-05, + "loss": 2.9697, + "step": 17296 + }, + { + "epoch": 1.0737475945123844, + "grad_norm": 0.17117338678692628, + "learning_rate": 8.106950133290811e-05, + "loss": 2.9007, + "step": 17297 + }, + { + "epoch": 1.0738096716121424, + "grad_norm": 0.22824506147730655, + "learning_rate": 8.106667161763175e-05, + "loss": 2.8797, + "step": 17298 + }, + { + "epoch": 1.0738717487119003, + "grad_norm": 0.31421086948780735, + "learning_rate": 8.106384174027304e-05, + "loss": 2.9709, + "step": 17299 + }, + { + "epoch": 1.073933825811658, + "grad_norm": 0.1820433674000389, + "learning_rate": 8.106101170084671e-05, + "loss": 2.8946, + "step": 17300 + }, + { + "epoch": 1.073995902911416, + "grad_norm": 0.18810417269114768, + "learning_rate": 8.105818149936755e-05, + "loss": 2.846, + "step": 17301 + }, + { + "epoch": 1.0740579800111738, + "grad_norm": 0.2002620771774944, + "learning_rate": 8.105535113585031e-05, + "loss": 2.8818, + "step": 17302 + }, + { + "epoch": 1.0741200571109317, + "grad_norm": 0.17882044621118062, + "learning_rate": 8.105252061030977e-05, + "loss": 2.9382, + "step": 17303 + }, + { + "epoch": 1.0741821342106896, + "grad_norm": 0.20496896836004128, + "learning_rate": 8.104968992276069e-05, + "loss": 2.832, + "step": 17304 + }, + { + "epoch": 1.0742442113104476, + "grad_norm": 0.18419878265002412, + "learning_rate": 8.104685907321783e-05, + "loss": 2.885, + "step": 17305 + }, + { + "epoch": 1.0743062884102055, + "grad_norm": 0.20417531519854176, + "learning_rate": 8.104402806169598e-05, + "loss": 2.9064, + "step": 17306 + }, + { + "epoch": 1.0743683655099634, + "grad_norm": 0.2039845053366658, + "learning_rate": 8.10411968882099e-05, + "loss": 2.9947, + "step": 17307 + }, + { + "epoch": 1.0744304426097213, + "grad_norm": 0.24719083055550434, + "learning_rate": 8.103836555277436e-05, + "loss": 2.8221, + "step": 17308 + }, + { + "epoch": 1.0744925197094792, + "grad_norm": 0.15684070389893146, + "learning_rate": 8.103553405540413e-05, + "loss": 2.8635, + "step": 17309 + }, + { + "epoch": 1.0745545968092372, + "grad_norm": 0.195822307896868, + "learning_rate": 8.103270239611399e-05, + "loss": 2.9608, + "step": 17310 + }, + { + "epoch": 1.074616673908995, + "grad_norm": 0.18121123063692204, + "learning_rate": 8.102987057491871e-05, + "loss": 2.9138, + "step": 17311 + }, + { + "epoch": 1.074678751008753, + "grad_norm": 0.1878781776508477, + "learning_rate": 8.102703859183305e-05, + "loss": 2.9214, + "step": 17312 + }, + { + "epoch": 1.0747408281085107, + "grad_norm": 0.1699462741566287, + "learning_rate": 8.102420644687181e-05, + "loss": 2.9198, + "step": 17313 + }, + { + "epoch": 1.0748029052082686, + "grad_norm": 0.2420615899516796, + "learning_rate": 8.102137414004976e-05, + "loss": 2.902, + "step": 17314 + }, + { + "epoch": 1.0748649823080265, + "grad_norm": 0.22908240723593298, + "learning_rate": 8.101854167138165e-05, + "loss": 2.963, + "step": 17315 + }, + { + "epoch": 1.0749270594077844, + "grad_norm": 0.20927264928781458, + "learning_rate": 8.101570904088228e-05, + "loss": 2.9062, + "step": 17316 + }, + { + "epoch": 1.0749891365075424, + "grad_norm": 0.17425617812415736, + "learning_rate": 8.101287624856644e-05, + "loss": 2.8511, + "step": 17317 + }, + { + "epoch": 1.0750512136073003, + "grad_norm": 0.2591149203938859, + "learning_rate": 8.101004329444888e-05, + "loss": 2.9079, + "step": 17318 + }, + { + "epoch": 1.0751132907070582, + "grad_norm": 0.2090287295181121, + "learning_rate": 8.10072101785444e-05, + "loss": 2.8854, + "step": 17319 + }, + { + "epoch": 1.075175367806816, + "grad_norm": 0.1786771961315511, + "learning_rate": 8.100437690086779e-05, + "loss": 2.9026, + "step": 17320 + }, + { + "epoch": 1.075237444906574, + "grad_norm": 0.18969769766229358, + "learning_rate": 8.10015434614338e-05, + "loss": 2.8826, + "step": 17321 + }, + { + "epoch": 1.075299522006332, + "grad_norm": 0.17817138561478302, + "learning_rate": 8.099870986025724e-05, + "loss": 3.0184, + "step": 17322 + }, + { + "epoch": 1.0753615991060899, + "grad_norm": 0.1666414246716562, + "learning_rate": 8.09958760973529e-05, + "loss": 2.8042, + "step": 17323 + }, + { + "epoch": 1.0754236762058476, + "grad_norm": 0.18547572729392578, + "learning_rate": 8.099304217273553e-05, + "loss": 2.887, + "step": 17324 + }, + { + "epoch": 1.0754857533056055, + "grad_norm": 0.16179115591067633, + "learning_rate": 8.099020808641994e-05, + "loss": 2.9197, + "step": 17325 + }, + { + "epoch": 1.0755478304053634, + "grad_norm": 0.18695952082914943, + "learning_rate": 8.098737383842092e-05, + "loss": 2.9377, + "step": 17326 + }, + { + "epoch": 1.0756099075051213, + "grad_norm": 0.2020783602374631, + "learning_rate": 8.098453942875325e-05, + "loss": 2.8803, + "step": 17327 + }, + { + "epoch": 1.0756719846048792, + "grad_norm": 0.19749259166765962, + "learning_rate": 8.09817048574317e-05, + "loss": 2.9209, + "step": 17328 + }, + { + "epoch": 1.0757340617046371, + "grad_norm": 0.19221318701302337, + "learning_rate": 8.09788701244711e-05, + "loss": 2.8874, + "step": 17329 + }, + { + "epoch": 1.075796138804395, + "grad_norm": 0.20141984415750552, + "learning_rate": 8.097603522988619e-05, + "loss": 2.9508, + "step": 17330 + }, + { + "epoch": 1.075858215904153, + "grad_norm": 0.1683967593080011, + "learning_rate": 8.09732001736918e-05, + "loss": 2.8691, + "step": 17331 + }, + { + "epoch": 1.075920293003911, + "grad_norm": 0.25545065754510693, + "learning_rate": 8.097036495590269e-05, + "loss": 2.968, + "step": 17332 + }, + { + "epoch": 1.0759823701036688, + "grad_norm": 0.17587216992903978, + "learning_rate": 8.096752957653368e-05, + "loss": 2.8271, + "step": 17333 + }, + { + "epoch": 1.0760444472034267, + "grad_norm": 0.18324972954856306, + "learning_rate": 8.096469403559955e-05, + "loss": 2.8864, + "step": 17334 + }, + { + "epoch": 1.0761065243031847, + "grad_norm": 0.21347371140484367, + "learning_rate": 8.096185833311509e-05, + "loss": 2.8995, + "step": 17335 + }, + { + "epoch": 1.0761686014029426, + "grad_norm": 0.17670899449411928, + "learning_rate": 8.095902246909509e-05, + "loss": 2.9247, + "step": 17336 + }, + { + "epoch": 1.0762306785027003, + "grad_norm": 0.200790042907309, + "learning_rate": 8.095618644355438e-05, + "loss": 2.9565, + "step": 17337 + }, + { + "epoch": 1.0762927556024582, + "grad_norm": 0.2094147964632606, + "learning_rate": 8.09533502565077e-05, + "loss": 2.9356, + "step": 17338 + }, + { + "epoch": 1.076354832702216, + "grad_norm": 0.1820726895949553, + "learning_rate": 8.095051390796989e-05, + "loss": 2.8517, + "step": 17339 + }, + { + "epoch": 1.076416909801974, + "grad_norm": 0.1846307120824546, + "learning_rate": 8.094767739795574e-05, + "loss": 2.841, + "step": 17340 + }, + { + "epoch": 1.076478986901732, + "grad_norm": 0.20008271935612496, + "learning_rate": 8.094484072648004e-05, + "loss": 2.9318, + "step": 17341 + }, + { + "epoch": 1.0765410640014899, + "grad_norm": 0.1938461764877301, + "learning_rate": 8.09420038935576e-05, + "loss": 2.8863, + "step": 17342 + }, + { + "epoch": 1.0766031411012478, + "grad_norm": 0.19659778173712064, + "learning_rate": 8.093916689920322e-05, + "loss": 2.9238, + "step": 17343 + }, + { + "epoch": 1.0766652182010057, + "grad_norm": 0.16371739512445585, + "learning_rate": 8.093632974343167e-05, + "loss": 2.9227, + "step": 17344 + }, + { + "epoch": 1.0767272953007636, + "grad_norm": 0.16480995392234143, + "learning_rate": 8.093349242625779e-05, + "loss": 2.909, + "step": 17345 + }, + { + "epoch": 1.0767893724005215, + "grad_norm": 0.24517076978117702, + "learning_rate": 8.093065494769638e-05, + "loss": 2.9395, + "step": 17346 + }, + { + "epoch": 1.0768514495002794, + "grad_norm": 0.182028622669693, + "learning_rate": 8.092781730776222e-05, + "loss": 3.0092, + "step": 17347 + }, + { + "epoch": 1.0769135266000371, + "grad_norm": 0.1836909888839239, + "learning_rate": 8.092497950647014e-05, + "loss": 2.9531, + "step": 17348 + }, + { + "epoch": 1.076975603699795, + "grad_norm": 0.1601750548883162, + "learning_rate": 8.092214154383494e-05, + "loss": 2.8557, + "step": 17349 + }, + { + "epoch": 1.077037680799553, + "grad_norm": 0.19114257205983978, + "learning_rate": 8.091930341987141e-05, + "loss": 3.0272, + "step": 17350 + }, + { + "epoch": 1.077099757899311, + "grad_norm": 0.17153903789461886, + "learning_rate": 8.091646513459438e-05, + "loss": 2.9023, + "step": 17351 + }, + { + "epoch": 1.0771618349990688, + "grad_norm": 0.18211734929375628, + "learning_rate": 8.091362668801863e-05, + "loss": 2.9631, + "step": 17352 + }, + { + "epoch": 1.0772239120988267, + "grad_norm": 0.18325945673398994, + "learning_rate": 8.091078808015899e-05, + "loss": 2.8616, + "step": 17353 + }, + { + "epoch": 1.0772859891985846, + "grad_norm": 0.18246528100911952, + "learning_rate": 8.090794931103026e-05, + "loss": 2.9064, + "step": 17354 + }, + { + "epoch": 1.0773480662983426, + "grad_norm": 0.15431922436868722, + "learning_rate": 8.090511038064728e-05, + "loss": 2.9416, + "step": 17355 + }, + { + "epoch": 1.0774101433981005, + "grad_norm": 0.16857552162032002, + "learning_rate": 8.09022712890248e-05, + "loss": 2.8797, + "step": 17356 + }, + { + "epoch": 1.0774722204978584, + "grad_norm": 0.15866195992388613, + "learning_rate": 8.089943203617769e-05, + "loss": 2.8772, + "step": 17357 + }, + { + "epoch": 1.0775342975976163, + "grad_norm": 0.1920271068495412, + "learning_rate": 8.089659262212075e-05, + "loss": 2.9741, + "step": 17358 + }, + { + "epoch": 1.0775963746973742, + "grad_norm": 0.1607654835076293, + "learning_rate": 8.089375304686877e-05, + "loss": 2.8807, + "step": 17359 + }, + { + "epoch": 1.0776584517971322, + "grad_norm": 0.17553080625717363, + "learning_rate": 8.089091331043659e-05, + "loss": 2.8865, + "step": 17360 + }, + { + "epoch": 1.0777205288968899, + "grad_norm": 0.16494176468208896, + "learning_rate": 8.0888073412839e-05, + "loss": 2.9418, + "step": 17361 + }, + { + "epoch": 1.0777826059966478, + "grad_norm": 0.15880943604005374, + "learning_rate": 8.088523335409085e-05, + "loss": 2.8807, + "step": 17362 + }, + { + "epoch": 1.0778446830964057, + "grad_norm": 0.16789890655853923, + "learning_rate": 8.088239313420694e-05, + "loss": 2.8545, + "step": 17363 + }, + { + "epoch": 1.0779067601961636, + "grad_norm": 0.17116924083638088, + "learning_rate": 8.087955275320208e-05, + "loss": 2.8955, + "step": 17364 + }, + { + "epoch": 1.0779688372959215, + "grad_norm": 0.17955621816320222, + "learning_rate": 8.08767122110911e-05, + "loss": 2.9631, + "step": 17365 + }, + { + "epoch": 1.0780309143956794, + "grad_norm": 0.17908481407270796, + "learning_rate": 8.087387150788881e-05, + "loss": 2.9459, + "step": 17366 + }, + { + "epoch": 1.0780929914954374, + "grad_norm": 0.1694145796602785, + "learning_rate": 8.087103064361005e-05, + "loss": 2.9438, + "step": 17367 + }, + { + "epoch": 1.0781550685951953, + "grad_norm": 0.16877182007049768, + "learning_rate": 8.086818961826963e-05, + "loss": 2.9626, + "step": 17368 + }, + { + "epoch": 1.0782171456949532, + "grad_norm": 0.15510698909851656, + "learning_rate": 8.086534843188237e-05, + "loss": 2.9304, + "step": 17369 + }, + { + "epoch": 1.0782792227947111, + "grad_norm": 0.14857061591104606, + "learning_rate": 8.086250708446308e-05, + "loss": 2.9407, + "step": 17370 + }, + { + "epoch": 1.078341299894469, + "grad_norm": 0.17063725813474834, + "learning_rate": 8.08596655760266e-05, + "loss": 2.9523, + "step": 17371 + }, + { + "epoch": 1.0784033769942267, + "grad_norm": 0.15120705779175245, + "learning_rate": 8.085682390658776e-05, + "loss": 2.8677, + "step": 17372 + }, + { + "epoch": 1.0784654540939846, + "grad_norm": 0.2107215058057274, + "learning_rate": 8.085398207616138e-05, + "loss": 2.9822, + "step": 17373 + }, + { + "epoch": 1.0785275311937426, + "grad_norm": 0.16249075230547372, + "learning_rate": 8.085114008476229e-05, + "loss": 2.9319, + "step": 17374 + }, + { + "epoch": 1.0785896082935005, + "grad_norm": 0.21213876439846624, + "learning_rate": 8.08482979324053e-05, + "loss": 2.8191, + "step": 17375 + }, + { + "epoch": 1.0786516853932584, + "grad_norm": 0.17523125329810693, + "learning_rate": 8.084545561910526e-05, + "loss": 2.9221, + "step": 17376 + }, + { + "epoch": 1.0787137624930163, + "grad_norm": 0.17526279923027346, + "learning_rate": 8.0842613144877e-05, + "loss": 2.9106, + "step": 17377 + }, + { + "epoch": 1.0787758395927742, + "grad_norm": 0.16330571698578433, + "learning_rate": 8.083977050973532e-05, + "loss": 2.8167, + "step": 17378 + }, + { + "epoch": 1.0788379166925322, + "grad_norm": 0.18436430479807753, + "learning_rate": 8.083692771369509e-05, + "loss": 2.8915, + "step": 17379 + }, + { + "epoch": 1.07889999379229, + "grad_norm": 0.20435987671767847, + "learning_rate": 8.08340847567711e-05, + "loss": 2.9293, + "step": 17380 + }, + { + "epoch": 1.078962070892048, + "grad_norm": 0.18556697134307912, + "learning_rate": 8.083124163897822e-05, + "loss": 2.9243, + "step": 17381 + }, + { + "epoch": 1.079024147991806, + "grad_norm": 0.17336360607338977, + "learning_rate": 8.082839836033126e-05, + "loss": 2.8699, + "step": 17382 + }, + { + "epoch": 1.0790862250915638, + "grad_norm": 0.15546104205974526, + "learning_rate": 8.082555492084507e-05, + "loss": 2.8323, + "step": 17383 + }, + { + "epoch": 1.0791483021913215, + "grad_norm": 0.1483449584701974, + "learning_rate": 8.082271132053448e-05, + "loss": 2.8239, + "step": 17384 + }, + { + "epoch": 1.0792103792910794, + "grad_norm": 0.1586849915601801, + "learning_rate": 8.081986755941431e-05, + "loss": 2.8991, + "step": 17385 + }, + { + "epoch": 1.0792724563908374, + "grad_norm": 0.16261810884345282, + "learning_rate": 8.08170236374994e-05, + "loss": 2.9245, + "step": 17386 + }, + { + "epoch": 1.0793345334905953, + "grad_norm": 0.16197960036813583, + "learning_rate": 8.081417955480462e-05, + "loss": 2.8794, + "step": 17387 + }, + { + "epoch": 1.0793966105903532, + "grad_norm": 0.16105610726799577, + "learning_rate": 8.081133531134477e-05, + "loss": 2.8536, + "step": 17388 + }, + { + "epoch": 1.079458687690111, + "grad_norm": 0.222041477622829, + "learning_rate": 8.080849090713472e-05, + "loss": 2.9844, + "step": 17389 + }, + { + "epoch": 1.079520764789869, + "grad_norm": 0.18198664327422817, + "learning_rate": 8.080564634218928e-05, + "loss": 2.9916, + "step": 17390 + }, + { + "epoch": 1.079582841889627, + "grad_norm": 0.1695942021316988, + "learning_rate": 8.08028016165233e-05, + "loss": 2.9675, + "step": 17391 + }, + { + "epoch": 1.0796449189893849, + "grad_norm": 0.1746752648768963, + "learning_rate": 8.079995673015164e-05, + "loss": 2.8981, + "step": 17392 + }, + { + "epoch": 1.0797069960891428, + "grad_norm": 0.15125469827054236, + "learning_rate": 8.079711168308912e-05, + "loss": 2.889, + "step": 17393 + }, + { + "epoch": 1.0797690731889007, + "grad_norm": 0.1561338708386927, + "learning_rate": 8.07942664753506e-05, + "loss": 2.8657, + "step": 17394 + }, + { + "epoch": 1.0798311502886586, + "grad_norm": 0.18121698670796535, + "learning_rate": 8.07914211069509e-05, + "loss": 2.9579, + "step": 17395 + }, + { + "epoch": 1.0798932273884163, + "grad_norm": 0.17316374093956738, + "learning_rate": 8.07885755779049e-05, + "loss": 2.9583, + "step": 17396 + }, + { + "epoch": 1.0799553044881742, + "grad_norm": 0.17757655042763354, + "learning_rate": 8.078572988822742e-05, + "loss": 2.8727, + "step": 17397 + }, + { + "epoch": 1.0800173815879321, + "grad_norm": 0.16448549396065074, + "learning_rate": 8.078288403793331e-05, + "loss": 2.9715, + "step": 17398 + }, + { + "epoch": 1.08007945868769, + "grad_norm": 0.2595790106677268, + "learning_rate": 8.078003802703743e-05, + "loss": 2.7821, + "step": 17399 + }, + { + "epoch": 1.080141535787448, + "grad_norm": 0.1966429125774392, + "learning_rate": 8.077719185555461e-05, + "loss": 2.8974, + "step": 17400 + }, + { + "epoch": 1.080203612887206, + "grad_norm": 0.1738735096865532, + "learning_rate": 8.077434552349971e-05, + "loss": 2.8866, + "step": 17401 + }, + { + "epoch": 1.0802656899869638, + "grad_norm": 0.18110932818825123, + "learning_rate": 8.077149903088759e-05, + "loss": 2.8448, + "step": 17402 + }, + { + "epoch": 1.0803277670867217, + "grad_norm": 0.19115117652001148, + "learning_rate": 8.076865237773308e-05, + "loss": 2.9493, + "step": 17403 + }, + { + "epoch": 1.0803898441864797, + "grad_norm": 0.15582980637927984, + "learning_rate": 8.076580556405104e-05, + "loss": 2.9008, + "step": 17404 + }, + { + "epoch": 1.0804519212862376, + "grad_norm": 0.16799217437820727, + "learning_rate": 8.076295858985632e-05, + "loss": 2.8502, + "step": 17405 + }, + { + "epoch": 1.0805139983859955, + "grad_norm": 0.15943329156475222, + "learning_rate": 8.076011145516379e-05, + "loss": 2.9124, + "step": 17406 + }, + { + "epoch": 1.0805760754857534, + "grad_norm": 0.17153497136326082, + "learning_rate": 8.075726415998828e-05, + "loss": 2.9444, + "step": 17407 + }, + { + "epoch": 1.080638152585511, + "grad_norm": 0.14448116697724941, + "learning_rate": 8.075441670434467e-05, + "loss": 2.8849, + "step": 17408 + }, + { + "epoch": 1.080700229685269, + "grad_norm": 0.19552192903786472, + "learning_rate": 8.075156908824779e-05, + "loss": 2.8444, + "step": 17409 + }, + { + "epoch": 1.080762306785027, + "grad_norm": 0.15181675261017896, + "learning_rate": 8.074872131171251e-05, + "loss": 2.8798, + "step": 17410 + }, + { + "epoch": 1.0808243838847849, + "grad_norm": 0.18375830681831948, + "learning_rate": 8.074587337475369e-05, + "loss": 2.8318, + "step": 17411 + }, + { + "epoch": 1.0808864609845428, + "grad_norm": 0.16178746606388308, + "learning_rate": 8.074302527738618e-05, + "loss": 2.9277, + "step": 17412 + }, + { + "epoch": 1.0809485380843007, + "grad_norm": 0.18092624095613982, + "learning_rate": 8.074017701962485e-05, + "loss": 2.9187, + "step": 17413 + }, + { + "epoch": 1.0810106151840586, + "grad_norm": 0.17097158773818233, + "learning_rate": 8.073732860148455e-05, + "loss": 3.0024, + "step": 17414 + }, + { + "epoch": 1.0810726922838165, + "grad_norm": 0.18328336085204003, + "learning_rate": 8.073448002298014e-05, + "loss": 2.9338, + "step": 17415 + }, + { + "epoch": 1.0811347693835744, + "grad_norm": 0.17032705975339313, + "learning_rate": 8.073163128412649e-05, + "loss": 2.9174, + "step": 17416 + }, + { + "epoch": 1.0811968464833324, + "grad_norm": 0.16155828886428303, + "learning_rate": 8.072878238493846e-05, + "loss": 2.8386, + "step": 17417 + }, + { + "epoch": 1.0812589235830903, + "grad_norm": 0.1818832397396932, + "learning_rate": 8.072593332543092e-05, + "loss": 2.8559, + "step": 17418 + }, + { + "epoch": 1.0813210006828482, + "grad_norm": 0.17031845861038145, + "learning_rate": 8.072308410561872e-05, + "loss": 2.878, + "step": 17419 + }, + { + "epoch": 1.081383077782606, + "grad_norm": 0.1864293844954869, + "learning_rate": 8.072023472551673e-05, + "loss": 2.8628, + "step": 17420 + }, + { + "epoch": 1.0814451548823638, + "grad_norm": 0.1633354265965243, + "learning_rate": 8.071738518513983e-05, + "loss": 2.9263, + "step": 17421 + }, + { + "epoch": 1.0815072319821217, + "grad_norm": 0.16677825453049744, + "learning_rate": 8.071453548450286e-05, + "loss": 2.7843, + "step": 17422 + }, + { + "epoch": 1.0815693090818796, + "grad_norm": 0.16853411472700314, + "learning_rate": 8.07116856236207e-05, + "loss": 2.8608, + "step": 17423 + }, + { + "epoch": 1.0816313861816376, + "grad_norm": 0.18509404992718978, + "learning_rate": 8.070883560250823e-05, + "loss": 2.8913, + "step": 17424 + }, + { + "epoch": 1.0816934632813955, + "grad_norm": 0.16448907798956966, + "learning_rate": 8.07059854211803e-05, + "loss": 3.0182, + "step": 17425 + }, + { + "epoch": 1.0817555403811534, + "grad_norm": 0.16074403041001, + "learning_rate": 8.070313507965181e-05, + "loss": 2.9807, + "step": 17426 + }, + { + "epoch": 1.0818176174809113, + "grad_norm": 0.22960040199214687, + "learning_rate": 8.07002845779376e-05, + "loss": 3.0428, + "step": 17427 + }, + { + "epoch": 1.0818796945806692, + "grad_norm": 0.2999459222889671, + "learning_rate": 8.069743391605256e-05, + "loss": 2.9083, + "step": 17428 + }, + { + "epoch": 1.0819417716804272, + "grad_norm": 0.18322054489378264, + "learning_rate": 8.069458309401155e-05, + "loss": 3.0211, + "step": 17429 + }, + { + "epoch": 1.082003848780185, + "grad_norm": 0.27758054966811063, + "learning_rate": 8.069173211182943e-05, + "loss": 2.895, + "step": 17430 + }, + { + "epoch": 1.082065925879943, + "grad_norm": 0.19622483792241802, + "learning_rate": 8.068888096952111e-05, + "loss": 2.9085, + "step": 17431 + }, + { + "epoch": 1.0821280029797007, + "grad_norm": 0.20661174810368393, + "learning_rate": 8.068602966710146e-05, + "loss": 2.9132, + "step": 17432 + }, + { + "epoch": 1.0821900800794586, + "grad_norm": 0.18616371018422295, + "learning_rate": 8.068317820458532e-05, + "loss": 2.878, + "step": 17433 + }, + { + "epoch": 1.0822521571792165, + "grad_norm": 0.17998441752895167, + "learning_rate": 8.06803265819876e-05, + "loss": 2.9713, + "step": 17434 + }, + { + "epoch": 1.0823142342789744, + "grad_norm": 0.19344919218677054, + "learning_rate": 8.067747479932318e-05, + "loss": 2.9492, + "step": 17435 + }, + { + "epoch": 1.0823763113787324, + "grad_norm": 0.1745158211557285, + "learning_rate": 8.06746228566069e-05, + "loss": 2.8212, + "step": 17436 + }, + { + "epoch": 1.0824383884784903, + "grad_norm": 0.16509336221603552, + "learning_rate": 8.067177075385369e-05, + "loss": 2.8578, + "step": 17437 + }, + { + "epoch": 1.0825004655782482, + "grad_norm": 0.172836949652112, + "learning_rate": 8.06689184910784e-05, + "loss": 2.9093, + "step": 17438 + }, + { + "epoch": 1.0825625426780061, + "grad_norm": 0.16948337538178124, + "learning_rate": 8.066606606829591e-05, + "loss": 2.9005, + "step": 17439 + }, + { + "epoch": 1.082624619777764, + "grad_norm": 0.15552559057941284, + "learning_rate": 8.066321348552111e-05, + "loss": 2.7689, + "step": 17440 + }, + { + "epoch": 1.082686696877522, + "grad_norm": 0.21846133156688946, + "learning_rate": 8.06603607427689e-05, + "loss": 2.8504, + "step": 17441 + }, + { + "epoch": 1.0827487739772799, + "grad_norm": 0.17157475842003517, + "learning_rate": 8.065750784005413e-05, + "loss": 2.9615, + "step": 17442 + }, + { + "epoch": 1.0828108510770378, + "grad_norm": 0.19360861464877488, + "learning_rate": 8.06546547773917e-05, + "loss": 3.012, + "step": 17443 + }, + { + "epoch": 1.0828729281767955, + "grad_norm": 0.171150082193765, + "learning_rate": 8.065180155479648e-05, + "loss": 2.9933, + "step": 17444 + }, + { + "epoch": 1.0829350052765534, + "grad_norm": 0.17133664217407374, + "learning_rate": 8.06489481722834e-05, + "loss": 2.7857, + "step": 17445 + }, + { + "epoch": 1.0829970823763113, + "grad_norm": 0.18588744579699432, + "learning_rate": 8.06460946298673e-05, + "loss": 2.8622, + "step": 17446 + }, + { + "epoch": 1.0830591594760692, + "grad_norm": 0.2114927348840415, + "learning_rate": 8.064324092756308e-05, + "loss": 2.9185, + "step": 17447 + }, + { + "epoch": 1.0831212365758272, + "grad_norm": 0.1968571711596334, + "learning_rate": 8.064038706538563e-05, + "loss": 2.8562, + "step": 17448 + }, + { + "epoch": 1.083183313675585, + "grad_norm": 0.15670226327961767, + "learning_rate": 8.063753304334986e-05, + "loss": 2.9382, + "step": 17449 + }, + { + "epoch": 1.083245390775343, + "grad_norm": 0.17209419448473726, + "learning_rate": 8.063467886147063e-05, + "loss": 2.8563, + "step": 17450 + }, + { + "epoch": 1.083307467875101, + "grad_norm": 0.17563093434180513, + "learning_rate": 8.063182451976284e-05, + "loss": 2.8924, + "step": 17451 + }, + { + "epoch": 1.0833695449748588, + "grad_norm": 0.17166941984196432, + "learning_rate": 8.06289700182414e-05, + "loss": 2.9041, + "step": 17452 + }, + { + "epoch": 1.0834316220746167, + "grad_norm": 0.1696386058891352, + "learning_rate": 8.062611535692118e-05, + "loss": 2.8599, + "step": 17453 + }, + { + "epoch": 1.0834936991743747, + "grad_norm": 0.17169067946518368, + "learning_rate": 8.062326053581709e-05, + "loss": 2.9455, + "step": 17454 + }, + { + "epoch": 1.0835557762741326, + "grad_norm": 0.15696430250045623, + "learning_rate": 8.062040555494401e-05, + "loss": 2.9376, + "step": 17455 + }, + { + "epoch": 1.0836178533738903, + "grad_norm": 0.15901509651267937, + "learning_rate": 8.061755041431683e-05, + "loss": 2.9835, + "step": 17456 + }, + { + "epoch": 1.0836799304736482, + "grad_norm": 0.18171950647618157, + "learning_rate": 8.061469511395047e-05, + "loss": 3.0179, + "step": 17457 + }, + { + "epoch": 1.083742007573406, + "grad_norm": 0.18201244404453118, + "learning_rate": 8.06118396538598e-05, + "loss": 2.848, + "step": 17458 + }, + { + "epoch": 1.083804084673164, + "grad_norm": 0.16922501469654466, + "learning_rate": 8.060898403405975e-05, + "loss": 2.891, + "step": 17459 + }, + { + "epoch": 1.083866161772922, + "grad_norm": 0.19318345284025365, + "learning_rate": 8.060612825456519e-05, + "loss": 2.8494, + "step": 17460 + }, + { + "epoch": 1.0839282388726799, + "grad_norm": 0.15419209218000082, + "learning_rate": 8.060327231539104e-05, + "loss": 2.9092, + "step": 17461 + }, + { + "epoch": 1.0839903159724378, + "grad_norm": 0.19961841547849082, + "learning_rate": 8.060041621655217e-05, + "loss": 2.8552, + "step": 17462 + }, + { + "epoch": 1.0840523930721957, + "grad_norm": 0.15916688551646394, + "learning_rate": 8.05975599580635e-05, + "loss": 2.9047, + "step": 17463 + }, + { + "epoch": 1.0841144701719536, + "grad_norm": 0.16001475294091763, + "learning_rate": 8.059470353993994e-05, + "loss": 3.0209, + "step": 17464 + }, + { + "epoch": 1.0841765472717115, + "grad_norm": 0.16602224448975722, + "learning_rate": 8.05918469621964e-05, + "loss": 2.9495, + "step": 17465 + }, + { + "epoch": 1.0842386243714695, + "grad_norm": 0.19610402889592016, + "learning_rate": 8.058899022484777e-05, + "loss": 2.8043, + "step": 17466 + }, + { + "epoch": 1.0843007014712274, + "grad_norm": 0.1648115022583533, + "learning_rate": 8.058613332790891e-05, + "loss": 2.9259, + "step": 17467 + }, + { + "epoch": 1.084362778570985, + "grad_norm": 0.1618807743489688, + "learning_rate": 8.05832762713948e-05, + "loss": 2.8822, + "step": 17468 + }, + { + "epoch": 1.084424855670743, + "grad_norm": 0.17804590108620996, + "learning_rate": 8.05804190553203e-05, + "loss": 2.9397, + "step": 17469 + }, + { + "epoch": 1.084486932770501, + "grad_norm": 0.19015143190108583, + "learning_rate": 8.057756167970035e-05, + "loss": 2.845, + "step": 17470 + }, + { + "epoch": 1.0845490098702588, + "grad_norm": 0.18917042985469537, + "learning_rate": 8.057470414454982e-05, + "loss": 2.9618, + "step": 17471 + }, + { + "epoch": 1.0846110869700167, + "grad_norm": 0.15128167105539123, + "learning_rate": 8.057184644988364e-05, + "loss": 2.8982, + "step": 17472 + }, + { + "epoch": 1.0846731640697747, + "grad_norm": 0.1763925044158744, + "learning_rate": 8.056898859571671e-05, + "loss": 2.9021, + "step": 17473 + }, + { + "epoch": 1.0847352411695326, + "grad_norm": 0.15889086586030546, + "learning_rate": 8.056613058206395e-05, + "loss": 2.8942, + "step": 17474 + }, + { + "epoch": 1.0847973182692905, + "grad_norm": 0.19291719296165946, + "learning_rate": 8.056327240894027e-05, + "loss": 2.8939, + "step": 17475 + }, + { + "epoch": 1.0848593953690484, + "grad_norm": 0.16831471056501113, + "learning_rate": 8.056041407636056e-05, + "loss": 2.853, + "step": 17476 + }, + { + "epoch": 1.0849214724688063, + "grad_norm": 0.1598940074663155, + "learning_rate": 8.055755558433979e-05, + "loss": 2.8687, + "step": 17477 + }, + { + "epoch": 1.0849835495685642, + "grad_norm": 0.1998422719708412, + "learning_rate": 8.05546969328928e-05, + "loss": 2.9295, + "step": 17478 + }, + { + "epoch": 1.0850456266683222, + "grad_norm": 0.18165448863948874, + "learning_rate": 8.055183812203455e-05, + "loss": 2.7484, + "step": 17479 + }, + { + "epoch": 1.0851077037680799, + "grad_norm": 0.19670568444007053, + "learning_rate": 8.054897915177995e-05, + "loss": 2.9172, + "step": 17480 + }, + { + "epoch": 1.0851697808678378, + "grad_norm": 0.1950470169098648, + "learning_rate": 8.05461200221439e-05, + "loss": 2.8512, + "step": 17481 + }, + { + "epoch": 1.0852318579675957, + "grad_norm": 0.2059972511259714, + "learning_rate": 8.054326073314134e-05, + "loss": 2.9565, + "step": 17482 + }, + { + "epoch": 1.0852939350673536, + "grad_norm": 0.1673617818255283, + "learning_rate": 8.054040128478718e-05, + "loss": 2.7854, + "step": 17483 + }, + { + "epoch": 1.0853560121671115, + "grad_norm": 0.17819025450590847, + "learning_rate": 8.053754167709629e-05, + "loss": 2.8732, + "step": 17484 + }, + { + "epoch": 1.0854180892668694, + "grad_norm": 0.1918980450102222, + "learning_rate": 8.053468191008367e-05, + "loss": 2.8934, + "step": 17485 + }, + { + "epoch": 1.0854801663666274, + "grad_norm": 0.20393131767030287, + "learning_rate": 8.05318219837642e-05, + "loss": 2.9074, + "step": 17486 + }, + { + "epoch": 1.0855422434663853, + "grad_norm": 0.15846305850751238, + "learning_rate": 8.052896189815278e-05, + "loss": 2.8841, + "step": 17487 + }, + { + "epoch": 1.0856043205661432, + "grad_norm": 0.1607517012495725, + "learning_rate": 8.052610165326436e-05, + "loss": 2.8605, + "step": 17488 + }, + { + "epoch": 1.0856663976659011, + "grad_norm": 0.172007764127823, + "learning_rate": 8.052324124911387e-05, + "loss": 2.9169, + "step": 17489 + }, + { + "epoch": 1.085728474765659, + "grad_norm": 0.16774788463185433, + "learning_rate": 8.052038068571621e-05, + "loss": 2.9446, + "step": 17490 + }, + { + "epoch": 1.0857905518654167, + "grad_norm": 0.15945528313028123, + "learning_rate": 8.051751996308633e-05, + "loss": 2.9419, + "step": 17491 + }, + { + "epoch": 1.0858526289651746, + "grad_norm": 0.16340070602990803, + "learning_rate": 8.051465908123912e-05, + "loss": 2.9186, + "step": 17492 + }, + { + "epoch": 1.0859147060649326, + "grad_norm": 0.1625140493536803, + "learning_rate": 8.051179804018956e-05, + "loss": 2.8783, + "step": 17493 + }, + { + "epoch": 1.0859767831646905, + "grad_norm": 0.17772997337172836, + "learning_rate": 8.05089368399525e-05, + "loss": 2.9348, + "step": 17494 + }, + { + "epoch": 1.0860388602644484, + "grad_norm": 0.18917113879318828, + "learning_rate": 8.050607548054294e-05, + "loss": 2.902, + "step": 17495 + }, + { + "epoch": 1.0861009373642063, + "grad_norm": 0.1811529709632894, + "learning_rate": 8.050321396197577e-05, + "loss": 2.8381, + "step": 17496 + }, + { + "epoch": 1.0861630144639642, + "grad_norm": 0.1991904369505192, + "learning_rate": 8.050035228426592e-05, + "loss": 2.8839, + "step": 17497 + }, + { + "epoch": 1.0862250915637222, + "grad_norm": 0.22802325128644477, + "learning_rate": 8.049749044742834e-05, + "loss": 2.8994, + "step": 17498 + }, + { + "epoch": 1.08628716866348, + "grad_norm": 0.17672040975653014, + "learning_rate": 8.049462845147794e-05, + "loss": 2.8811, + "step": 17499 + }, + { + "epoch": 1.086349245763238, + "grad_norm": 0.1945541345395216, + "learning_rate": 8.049176629642966e-05, + "loss": 3.0012, + "step": 17500 + }, + { + "epoch": 1.086411322862996, + "grad_norm": 0.17296808700985816, + "learning_rate": 8.048890398229843e-05, + "loss": 2.9288, + "step": 17501 + }, + { + "epoch": 1.0864733999627538, + "grad_norm": 0.1896182746354667, + "learning_rate": 8.048604150909919e-05, + "loss": 2.9688, + "step": 17502 + }, + { + "epoch": 1.0865354770625117, + "grad_norm": 0.1661217014033336, + "learning_rate": 8.048317887684688e-05, + "loss": 2.9328, + "step": 17503 + }, + { + "epoch": 1.0865975541622694, + "grad_norm": 0.2228059098391328, + "learning_rate": 8.048031608555641e-05, + "loss": 2.9945, + "step": 17504 + }, + { + "epoch": 1.0866596312620274, + "grad_norm": 0.1629102754518141, + "learning_rate": 8.047745313524276e-05, + "loss": 2.8746, + "step": 17505 + }, + { + "epoch": 1.0867217083617853, + "grad_norm": 0.17354822125732267, + "learning_rate": 8.047459002592081e-05, + "loss": 2.9509, + "step": 17506 + }, + { + "epoch": 1.0867837854615432, + "grad_norm": 0.16549693027938334, + "learning_rate": 8.047172675760553e-05, + "loss": 2.9625, + "step": 17507 + }, + { + "epoch": 1.0868458625613011, + "grad_norm": 0.18718969974276095, + "learning_rate": 8.046886333031188e-05, + "loss": 2.9513, + "step": 17508 + }, + { + "epoch": 1.086907939661059, + "grad_norm": 0.20828440906335335, + "learning_rate": 8.046599974405475e-05, + "loss": 2.9409, + "step": 17509 + }, + { + "epoch": 1.086970016760817, + "grad_norm": 0.18018194255270348, + "learning_rate": 8.04631359988491e-05, + "loss": 2.9404, + "step": 17510 + }, + { + "epoch": 1.0870320938605749, + "grad_norm": 0.19039600759167732, + "learning_rate": 8.046027209470988e-05, + "loss": 2.816, + "step": 17511 + }, + { + "epoch": 1.0870941709603328, + "grad_norm": 0.15481177260621076, + "learning_rate": 8.045740803165202e-05, + "loss": 2.9086, + "step": 17512 + }, + { + "epoch": 1.0871562480600907, + "grad_norm": 0.1758039936610888, + "learning_rate": 8.045454380969049e-05, + "loss": 2.9285, + "step": 17513 + }, + { + "epoch": 1.0872183251598486, + "grad_norm": 0.1560380392062518, + "learning_rate": 8.04516794288402e-05, + "loss": 2.8889, + "step": 17514 + }, + { + "epoch": 1.0872804022596063, + "grad_norm": 0.1714208031042695, + "learning_rate": 8.044881488911609e-05, + "loss": 2.8896, + "step": 17515 + }, + { + "epoch": 1.0873424793593642, + "grad_norm": 0.2337129221925494, + "learning_rate": 8.044595019053314e-05, + "loss": 2.8447, + "step": 17516 + }, + { + "epoch": 1.0874045564591222, + "grad_norm": 0.1521926827791719, + "learning_rate": 8.044308533310627e-05, + "loss": 2.9345, + "step": 17517 + }, + { + "epoch": 1.08746663355888, + "grad_norm": 0.14739726125432134, + "learning_rate": 8.044022031685044e-05, + "loss": 2.8002, + "step": 17518 + }, + { + "epoch": 1.087528710658638, + "grad_norm": 0.15010572050690313, + "learning_rate": 8.043735514178057e-05, + "loss": 2.8947, + "step": 17519 + }, + { + "epoch": 1.087590787758396, + "grad_norm": 0.18367558542492926, + "learning_rate": 8.043448980791165e-05, + "loss": 2.8998, + "step": 17520 + }, + { + "epoch": 1.0876528648581538, + "grad_norm": 0.16409308852282356, + "learning_rate": 8.04316243152586e-05, + "loss": 2.851, + "step": 17521 + }, + { + "epoch": 1.0877149419579117, + "grad_norm": 0.14868270547920837, + "learning_rate": 8.04287586638364e-05, + "loss": 2.9662, + "step": 17522 + }, + { + "epoch": 1.0877770190576697, + "grad_norm": 0.15559147107826085, + "learning_rate": 8.042589285365995e-05, + "loss": 2.9109, + "step": 17523 + }, + { + "epoch": 1.0878390961574276, + "grad_norm": 0.1584219067328301, + "learning_rate": 8.042302688474424e-05, + "loss": 2.9655, + "step": 17524 + }, + { + "epoch": 1.0879011732571855, + "grad_norm": 0.1475426908314036, + "learning_rate": 8.042016075710421e-05, + "loss": 2.9007, + "step": 17525 + }, + { + "epoch": 1.0879632503569434, + "grad_norm": 0.15961201005087164, + "learning_rate": 8.041729447075483e-05, + "loss": 2.9325, + "step": 17526 + }, + { + "epoch": 1.0880253274567013, + "grad_norm": 0.15883312646221762, + "learning_rate": 8.041442802571103e-05, + "loss": 2.9198, + "step": 17527 + }, + { + "epoch": 1.088087404556459, + "grad_norm": 0.1470823480613153, + "learning_rate": 8.041156142198778e-05, + "loss": 2.8339, + "step": 17528 + }, + { + "epoch": 1.088149481656217, + "grad_norm": 0.15111221866300636, + "learning_rate": 8.040869465960003e-05, + "loss": 2.8759, + "step": 17529 + }, + { + "epoch": 1.0882115587559749, + "grad_norm": 0.15669344917448944, + "learning_rate": 8.040582773856274e-05, + "loss": 2.8569, + "step": 17530 + }, + { + "epoch": 1.0882736358557328, + "grad_norm": 0.15471207611330734, + "learning_rate": 8.040296065889086e-05, + "loss": 2.892, + "step": 17531 + }, + { + "epoch": 1.0883357129554907, + "grad_norm": 0.1448417048531087, + "learning_rate": 8.040009342059936e-05, + "loss": 2.9187, + "step": 17532 + }, + { + "epoch": 1.0883977900552486, + "grad_norm": 0.14731225987901567, + "learning_rate": 8.039722602370318e-05, + "loss": 2.8627, + "step": 17533 + }, + { + "epoch": 1.0884598671550065, + "grad_norm": 0.15018377693830465, + "learning_rate": 8.03943584682173e-05, + "loss": 2.9491, + "step": 17534 + }, + { + "epoch": 1.0885219442547645, + "grad_norm": 0.1990525449931478, + "learning_rate": 8.039149075415667e-05, + "loss": 2.8899, + "step": 17535 + }, + { + "epoch": 1.0885840213545224, + "grad_norm": 0.15063296646206054, + "learning_rate": 8.038862288153627e-05, + "loss": 2.9547, + "step": 17536 + }, + { + "epoch": 1.0886460984542803, + "grad_norm": 0.21745131065476264, + "learning_rate": 8.038575485037104e-05, + "loss": 3.0126, + "step": 17537 + }, + { + "epoch": 1.0887081755540382, + "grad_norm": 0.15258097518508945, + "learning_rate": 8.038288666067596e-05, + "loss": 2.9206, + "step": 17538 + }, + { + "epoch": 1.088770252653796, + "grad_norm": 0.20379479639850073, + "learning_rate": 8.038001831246595e-05, + "loss": 2.9571, + "step": 17539 + }, + { + "epoch": 1.0888323297535538, + "grad_norm": 0.21345387763193732, + "learning_rate": 8.037714980575604e-05, + "loss": 2.8252, + "step": 17540 + }, + { + "epoch": 1.0888944068533117, + "grad_norm": 0.1652702709643889, + "learning_rate": 8.037428114056116e-05, + "loss": 2.9451, + "step": 17541 + }, + { + "epoch": 1.0889564839530697, + "grad_norm": 0.15508705925368835, + "learning_rate": 8.037141231689626e-05, + "loss": 2.9564, + "step": 17542 + }, + { + "epoch": 1.0890185610528276, + "grad_norm": 0.19612051716526246, + "learning_rate": 8.036854333477635e-05, + "loss": 2.8413, + "step": 17543 + }, + { + "epoch": 1.0890806381525855, + "grad_norm": 0.17191763370777094, + "learning_rate": 8.036567419421639e-05, + "loss": 2.9381, + "step": 17544 + }, + { + "epoch": 1.0891427152523434, + "grad_norm": 0.17170878522872549, + "learning_rate": 8.036280489523132e-05, + "loss": 2.8379, + "step": 17545 + }, + { + "epoch": 1.0892047923521013, + "grad_norm": 0.23105094403598456, + "learning_rate": 8.035993543783613e-05, + "loss": 2.9322, + "step": 17546 + }, + { + "epoch": 1.0892668694518592, + "grad_norm": 0.14726189575735138, + "learning_rate": 8.035706582204579e-05, + "loss": 2.9393, + "step": 17547 + }, + { + "epoch": 1.0893289465516172, + "grad_norm": 0.15557882878381013, + "learning_rate": 8.035419604787525e-05, + "loss": 2.9736, + "step": 17548 + }, + { + "epoch": 1.089391023651375, + "grad_norm": 0.1566640256112092, + "learning_rate": 8.03513261153395e-05, + "loss": 2.89, + "step": 17549 + }, + { + "epoch": 1.089453100751133, + "grad_norm": 0.1506153322912201, + "learning_rate": 8.034845602445353e-05, + "loss": 2.8444, + "step": 17550 + }, + { + "epoch": 1.089515177850891, + "grad_norm": 0.28381503853316825, + "learning_rate": 8.03455857752323e-05, + "loss": 2.8984, + "step": 17551 + }, + { + "epoch": 1.0895772549506486, + "grad_norm": 0.20825436804244415, + "learning_rate": 8.034271536769078e-05, + "loss": 2.8877, + "step": 17552 + }, + { + "epoch": 1.0896393320504065, + "grad_norm": 0.160049402835379, + "learning_rate": 8.033984480184395e-05, + "loss": 2.8491, + "step": 17553 + }, + { + "epoch": 1.0897014091501644, + "grad_norm": 0.18493484143792144, + "learning_rate": 8.033697407770677e-05, + "loss": 2.9361, + "step": 17554 + }, + { + "epoch": 1.0897634862499224, + "grad_norm": 0.16894050547205228, + "learning_rate": 8.033410319529424e-05, + "loss": 2.8677, + "step": 17555 + }, + { + "epoch": 1.0898255633496803, + "grad_norm": 0.24917935744683975, + "learning_rate": 8.033123215462133e-05, + "loss": 2.9383, + "step": 17556 + }, + { + "epoch": 1.0898876404494382, + "grad_norm": 0.23532811123203506, + "learning_rate": 8.032836095570302e-05, + "loss": 2.9018, + "step": 17557 + }, + { + "epoch": 1.0899497175491961, + "grad_norm": 0.16220480626260816, + "learning_rate": 8.032548959855429e-05, + "loss": 2.8023, + "step": 17558 + }, + { + "epoch": 1.090011794648954, + "grad_norm": 0.15808260412451022, + "learning_rate": 8.032261808319012e-05, + "loss": 2.8459, + "step": 17559 + }, + { + "epoch": 1.090073871748712, + "grad_norm": 0.1582915565208011, + "learning_rate": 8.031974640962547e-05, + "loss": 2.9271, + "step": 17560 + }, + { + "epoch": 1.0901359488484699, + "grad_norm": 0.21425186468890897, + "learning_rate": 8.031687457787537e-05, + "loss": 2.9183, + "step": 17561 + }, + { + "epoch": 1.0901980259482278, + "grad_norm": 0.1899072081858843, + "learning_rate": 8.031400258795476e-05, + "loss": 2.812, + "step": 17562 + }, + { + "epoch": 1.0902601030479855, + "grad_norm": 0.16478738838213144, + "learning_rate": 8.031113043987866e-05, + "loss": 3.01, + "step": 17563 + }, + { + "epoch": 1.0903221801477434, + "grad_norm": 0.1561936276788169, + "learning_rate": 8.0308258133662e-05, + "loss": 3.0075, + "step": 17564 + }, + { + "epoch": 1.0903842572475013, + "grad_norm": 0.16099493312081825, + "learning_rate": 8.030538566931981e-05, + "loss": 2.8751, + "step": 17565 + }, + { + "epoch": 1.0904463343472592, + "grad_norm": 0.18595423903571098, + "learning_rate": 8.030251304686709e-05, + "loss": 2.9644, + "step": 17566 + }, + { + "epoch": 1.0905084114470172, + "grad_norm": 0.17274799584800915, + "learning_rate": 8.029964026631878e-05, + "loss": 2.8115, + "step": 17567 + }, + { + "epoch": 1.090570488546775, + "grad_norm": 0.1665684303709458, + "learning_rate": 8.02967673276899e-05, + "loss": 2.8994, + "step": 17568 + }, + { + "epoch": 1.090632565646533, + "grad_norm": 0.19786554646757568, + "learning_rate": 8.029389423099543e-05, + "loss": 2.9005, + "step": 17569 + }, + { + "epoch": 1.090694642746291, + "grad_norm": 0.1773298396734256, + "learning_rate": 8.029102097625037e-05, + "loss": 2.9099, + "step": 17570 + }, + { + "epoch": 1.0907567198460488, + "grad_norm": 0.2665691006638143, + "learning_rate": 8.028814756346969e-05, + "loss": 2.8863, + "step": 17571 + }, + { + "epoch": 1.0908187969458067, + "grad_norm": 0.2661186287718874, + "learning_rate": 8.028527399266839e-05, + "loss": 2.8925, + "step": 17572 + }, + { + "epoch": 1.0908808740455647, + "grad_norm": 0.1611175876707353, + "learning_rate": 8.028240026386146e-05, + "loss": 2.8468, + "step": 17573 + }, + { + "epoch": 1.0909429511453226, + "grad_norm": 0.2532074040631301, + "learning_rate": 8.027952637706389e-05, + "loss": 2.8734, + "step": 17574 + }, + { + "epoch": 1.0910050282450805, + "grad_norm": 0.1615614017305362, + "learning_rate": 8.027665233229069e-05, + "loss": 2.9357, + "step": 17575 + }, + { + "epoch": 1.0910671053448382, + "grad_norm": 0.1764740114516966, + "learning_rate": 8.027377812955685e-05, + "loss": 2.9152, + "step": 17576 + }, + { + "epoch": 1.0911291824445961, + "grad_norm": 0.17507702907540973, + "learning_rate": 8.027090376887737e-05, + "loss": 2.9099, + "step": 17577 + }, + { + "epoch": 1.091191259544354, + "grad_norm": 0.23246360051651035, + "learning_rate": 8.026802925026722e-05, + "loss": 2.8516, + "step": 17578 + }, + { + "epoch": 1.091253336644112, + "grad_norm": 0.1664433578970896, + "learning_rate": 8.026515457374141e-05, + "loss": 2.966, + "step": 17579 + }, + { + "epoch": 1.0913154137438699, + "grad_norm": 0.19558475592309452, + "learning_rate": 8.026227973931495e-05, + "loss": 2.9146, + "step": 17580 + }, + { + "epoch": 1.0913774908436278, + "grad_norm": 0.19058297663392432, + "learning_rate": 8.025940474700282e-05, + "loss": 2.8964, + "step": 17581 + }, + { + "epoch": 1.0914395679433857, + "grad_norm": 0.16188139568677393, + "learning_rate": 8.025652959682004e-05, + "loss": 2.8755, + "step": 17582 + }, + { + "epoch": 1.0915016450431436, + "grad_norm": 0.2128798455825326, + "learning_rate": 8.02536542887816e-05, + "loss": 2.9474, + "step": 17583 + }, + { + "epoch": 1.0915637221429015, + "grad_norm": 0.2843451788221063, + "learning_rate": 8.025077882290249e-05, + "loss": 2.9194, + "step": 17584 + }, + { + "epoch": 1.0916257992426595, + "grad_norm": 0.17396332272971451, + "learning_rate": 8.024790319919773e-05, + "loss": 2.982, + "step": 17585 + }, + { + "epoch": 1.0916878763424174, + "grad_norm": 0.16551665325020054, + "learning_rate": 8.024502741768232e-05, + "loss": 3.0178, + "step": 17586 + }, + { + "epoch": 1.091749953442175, + "grad_norm": 0.18559312745886075, + "learning_rate": 8.024215147837126e-05, + "loss": 2.8745, + "step": 17587 + }, + { + "epoch": 1.091812030541933, + "grad_norm": 0.17942133139003305, + "learning_rate": 8.023927538127955e-05, + "loss": 2.9092, + "step": 17588 + }, + { + "epoch": 1.091874107641691, + "grad_norm": 0.22132935758948344, + "learning_rate": 8.02363991264222e-05, + "loss": 2.8429, + "step": 17589 + }, + { + "epoch": 1.0919361847414488, + "grad_norm": 0.16514082034457966, + "learning_rate": 8.023352271381421e-05, + "loss": 2.8605, + "step": 17590 + }, + { + "epoch": 1.0919982618412067, + "grad_norm": 0.17158152168211285, + "learning_rate": 8.02306461434706e-05, + "loss": 2.8494, + "step": 17591 + }, + { + "epoch": 1.0920603389409647, + "grad_norm": 0.15085600398291293, + "learning_rate": 8.022776941540638e-05, + "loss": 2.7812, + "step": 17592 + }, + { + "epoch": 1.0921224160407226, + "grad_norm": 0.17406077835252018, + "learning_rate": 8.022489252963654e-05, + "loss": 2.8839, + "step": 17593 + }, + { + "epoch": 1.0921844931404805, + "grad_norm": 0.154132369023838, + "learning_rate": 8.02220154861761e-05, + "loss": 2.9582, + "step": 17594 + }, + { + "epoch": 1.0922465702402384, + "grad_norm": 0.18831619793096757, + "learning_rate": 8.021913828504006e-05, + "loss": 2.894, + "step": 17595 + }, + { + "epoch": 1.0923086473399963, + "grad_norm": 0.15934554042809024, + "learning_rate": 8.021626092624345e-05, + "loss": 2.946, + "step": 17596 + }, + { + "epoch": 1.0923707244397542, + "grad_norm": 0.18121080920310884, + "learning_rate": 8.021338340980126e-05, + "loss": 2.9498, + "step": 17597 + }, + { + "epoch": 1.0924328015395122, + "grad_norm": 0.15670817077048965, + "learning_rate": 8.021050573572853e-05, + "loss": 2.924, + "step": 17598 + }, + { + "epoch": 1.09249487863927, + "grad_norm": 0.17115705812502688, + "learning_rate": 8.020762790404024e-05, + "loss": 2.9513, + "step": 17599 + }, + { + "epoch": 1.0925569557390278, + "grad_norm": 0.1830891309442969, + "learning_rate": 8.020474991475142e-05, + "loss": 2.9149, + "step": 17600 + }, + { + "epoch": 1.0926190328387857, + "grad_norm": 0.18143692695285396, + "learning_rate": 8.02018717678771e-05, + "loss": 2.9918, + "step": 17601 + }, + { + "epoch": 1.0926811099385436, + "grad_norm": 0.18747655404048696, + "learning_rate": 8.01989934634323e-05, + "loss": 2.8473, + "step": 17602 + }, + { + "epoch": 1.0927431870383015, + "grad_norm": 0.2524068723923355, + "learning_rate": 8.019611500143198e-05, + "loss": 2.9216, + "step": 17603 + }, + { + "epoch": 1.0928052641380595, + "grad_norm": 0.15744285041460016, + "learning_rate": 8.019323638189123e-05, + "loss": 2.9829, + "step": 17604 + }, + { + "epoch": 1.0928673412378174, + "grad_norm": 0.1743768539125068, + "learning_rate": 8.019035760482503e-05, + "loss": 2.8417, + "step": 17605 + }, + { + "epoch": 1.0929294183375753, + "grad_norm": 0.15303963889930136, + "learning_rate": 8.01874786702484e-05, + "loss": 2.7741, + "step": 17606 + }, + { + "epoch": 1.0929914954373332, + "grad_norm": 0.18391649812391908, + "learning_rate": 8.018459957817636e-05, + "loss": 2.9009, + "step": 17607 + }, + { + "epoch": 1.0930535725370911, + "grad_norm": 0.1781172662863061, + "learning_rate": 8.018172032862395e-05, + "loss": 2.8569, + "step": 17608 + }, + { + "epoch": 1.093115649636849, + "grad_norm": 0.15791753611308515, + "learning_rate": 8.017884092160617e-05, + "loss": 2.9862, + "step": 17609 + }, + { + "epoch": 1.093177726736607, + "grad_norm": 0.19851877620617464, + "learning_rate": 8.017596135713806e-05, + "loss": 2.8979, + "step": 17610 + }, + { + "epoch": 1.0932398038363647, + "grad_norm": 0.18394655458521467, + "learning_rate": 8.01730816352346e-05, + "loss": 2.8228, + "step": 17611 + }, + { + "epoch": 1.0933018809361226, + "grad_norm": 0.28552387003981067, + "learning_rate": 8.017020175591087e-05, + "loss": 3.0193, + "step": 17612 + }, + { + "epoch": 1.0933639580358805, + "grad_norm": 0.23144998827367882, + "learning_rate": 8.016732171918189e-05, + "loss": 2.8505, + "step": 17613 + }, + { + "epoch": 1.0934260351356384, + "grad_norm": 0.15548883049643525, + "learning_rate": 8.016444152506266e-05, + "loss": 2.8776, + "step": 17614 + }, + { + "epoch": 1.0934881122353963, + "grad_norm": 0.2067966751714475, + "learning_rate": 8.01615611735682e-05, + "loss": 2.9017, + "step": 17615 + }, + { + "epoch": 1.0935501893351542, + "grad_norm": 0.165310681311441, + "learning_rate": 8.015868066471357e-05, + "loss": 2.8937, + "step": 17616 + }, + { + "epoch": 1.0936122664349122, + "grad_norm": 0.18779284734617882, + "learning_rate": 8.015579999851379e-05, + "loss": 2.8821, + "step": 17617 + }, + { + "epoch": 1.09367434353467, + "grad_norm": 0.18055602271272264, + "learning_rate": 8.015291917498386e-05, + "loss": 2.9076, + "step": 17618 + }, + { + "epoch": 1.093736420634428, + "grad_norm": 0.20884502078521208, + "learning_rate": 8.015003819413885e-05, + "loss": 2.8699, + "step": 17619 + }, + { + "epoch": 1.093798497734186, + "grad_norm": 0.18996176328642336, + "learning_rate": 8.014715705599374e-05, + "loss": 2.9512, + "step": 17620 + }, + { + "epoch": 1.0938605748339438, + "grad_norm": 0.18876950322687264, + "learning_rate": 8.014427576056362e-05, + "loss": 2.9748, + "step": 17621 + }, + { + "epoch": 1.0939226519337018, + "grad_norm": 0.16636767925396675, + "learning_rate": 8.014139430786349e-05, + "loss": 2.7926, + "step": 17622 + }, + { + "epoch": 1.0939847290334597, + "grad_norm": 0.17469566824280444, + "learning_rate": 8.013851269790839e-05, + "loss": 2.9222, + "step": 17623 + }, + { + "epoch": 1.0940468061332174, + "grad_norm": 0.18340059021162683, + "learning_rate": 8.013563093071335e-05, + "loss": 2.9101, + "step": 17624 + }, + { + "epoch": 1.0941088832329753, + "grad_norm": 0.18510600466576874, + "learning_rate": 8.013274900629341e-05, + "loss": 2.9222, + "step": 17625 + }, + { + "epoch": 1.0941709603327332, + "grad_norm": 0.18088689928319504, + "learning_rate": 8.01298669246636e-05, + "loss": 2.8299, + "step": 17626 + }, + { + "epoch": 1.0942330374324911, + "grad_norm": 0.16552828335565672, + "learning_rate": 8.012698468583897e-05, + "loss": 2.9573, + "step": 17627 + }, + { + "epoch": 1.094295114532249, + "grad_norm": 0.16863416122027952, + "learning_rate": 8.012410228983453e-05, + "loss": 2.8374, + "step": 17628 + }, + { + "epoch": 1.094357191632007, + "grad_norm": 0.198133655022105, + "learning_rate": 8.012121973666536e-05, + "loss": 2.9643, + "step": 17629 + }, + { + "epoch": 1.0944192687317649, + "grad_norm": 0.1623746872484838, + "learning_rate": 8.011833702634644e-05, + "loss": 2.9423, + "step": 17630 + }, + { + "epoch": 1.0944813458315228, + "grad_norm": 0.19601100946604924, + "learning_rate": 8.011545415889287e-05, + "loss": 2.9909, + "step": 17631 + }, + { + "epoch": 1.0945434229312807, + "grad_norm": 0.16209435680636922, + "learning_rate": 8.011257113431966e-05, + "loss": 2.8676, + "step": 17632 + }, + { + "epoch": 1.0946055000310386, + "grad_norm": 0.24699280770878687, + "learning_rate": 8.010968795264185e-05, + "loss": 2.8617, + "step": 17633 + }, + { + "epoch": 1.0946675771307965, + "grad_norm": 0.154737940665811, + "learning_rate": 8.010680461387449e-05, + "loss": 2.9574, + "step": 17634 + }, + { + "epoch": 1.0947296542305542, + "grad_norm": 0.16894511404742454, + "learning_rate": 8.010392111803263e-05, + "loss": 2.9377, + "step": 17635 + }, + { + "epoch": 1.0947917313303122, + "grad_norm": 0.14477359584062524, + "learning_rate": 8.01010374651313e-05, + "loss": 2.851, + "step": 17636 + }, + { + "epoch": 1.09485380843007, + "grad_norm": 0.1625559527095417, + "learning_rate": 8.009815365518554e-05, + "loss": 2.8605, + "step": 17637 + }, + { + "epoch": 1.094915885529828, + "grad_norm": 0.1792398869156869, + "learning_rate": 8.009526968821043e-05, + "loss": 2.9585, + "step": 17638 + }, + { + "epoch": 1.094977962629586, + "grad_norm": 0.1733054865784411, + "learning_rate": 8.009238556422098e-05, + "loss": 2.9392, + "step": 17639 + }, + { + "epoch": 1.0950400397293438, + "grad_norm": 0.15752441909500095, + "learning_rate": 8.008950128323225e-05, + "loss": 2.8939, + "step": 17640 + }, + { + "epoch": 1.0951021168291017, + "grad_norm": 0.17560696438837764, + "learning_rate": 8.008661684525927e-05, + "loss": 2.9931, + "step": 17641 + }, + { + "epoch": 1.0951641939288597, + "grad_norm": 0.1534148732534153, + "learning_rate": 8.008373225031713e-05, + "loss": 2.9132, + "step": 17642 + }, + { + "epoch": 1.0952262710286176, + "grad_norm": 0.20093063593422975, + "learning_rate": 8.008084749842085e-05, + "loss": 2.97, + "step": 17643 + }, + { + "epoch": 1.0952883481283755, + "grad_norm": 0.17859010832870903, + "learning_rate": 8.007796258958549e-05, + "loss": 2.9126, + "step": 17644 + }, + { + "epoch": 1.0953504252281334, + "grad_norm": 0.17684590794604463, + "learning_rate": 8.007507752382608e-05, + "loss": 3.0071, + "step": 17645 + }, + { + "epoch": 1.0954125023278913, + "grad_norm": 0.21609645972825078, + "learning_rate": 8.00721923011577e-05, + "loss": 2.8473, + "step": 17646 + }, + { + "epoch": 1.0954745794276493, + "grad_norm": 0.1850523230464065, + "learning_rate": 8.006930692159539e-05, + "loss": 2.8648, + "step": 17647 + }, + { + "epoch": 1.095536656527407, + "grad_norm": 0.18824692805064713, + "learning_rate": 8.006642138515422e-05, + "loss": 2.9603, + "step": 17648 + }, + { + "epoch": 1.0955987336271649, + "grad_norm": 0.16277953734936873, + "learning_rate": 8.00635356918492e-05, + "loss": 2.8172, + "step": 17649 + }, + { + "epoch": 1.0956608107269228, + "grad_norm": 0.20514758613985337, + "learning_rate": 8.006064984169546e-05, + "loss": 2.8511, + "step": 17650 + }, + { + "epoch": 1.0957228878266807, + "grad_norm": 0.1666064799433872, + "learning_rate": 8.005776383470798e-05, + "loss": 2.8209, + "step": 17651 + }, + { + "epoch": 1.0957849649264386, + "grad_norm": 0.20925253304050137, + "learning_rate": 8.005487767090185e-05, + "loss": 2.9482, + "step": 17652 + }, + { + "epoch": 1.0958470420261965, + "grad_norm": 0.19424028774354757, + "learning_rate": 8.005199135029215e-05, + "loss": 2.9687, + "step": 17653 + }, + { + "epoch": 1.0959091191259545, + "grad_norm": 0.15967368486070965, + "learning_rate": 8.00491048728939e-05, + "loss": 3.0324, + "step": 17654 + }, + { + "epoch": 1.0959711962257124, + "grad_norm": 0.17326364588598345, + "learning_rate": 8.004621823872216e-05, + "loss": 2.8679, + "step": 17655 + }, + { + "epoch": 1.0960332733254703, + "grad_norm": 0.1676725726078897, + "learning_rate": 8.004333144779203e-05, + "loss": 2.8859, + "step": 17656 + }, + { + "epoch": 1.0960953504252282, + "grad_norm": 0.1609591964800084, + "learning_rate": 8.004044450011854e-05, + "loss": 2.9004, + "step": 17657 + }, + { + "epoch": 1.0961574275249861, + "grad_norm": 0.1953225610049051, + "learning_rate": 8.003755739571676e-05, + "loss": 2.9337, + "step": 17658 + }, + { + "epoch": 1.0962195046247438, + "grad_norm": 0.2050276194551753, + "learning_rate": 8.003467013460174e-05, + "loss": 2.8731, + "step": 17659 + }, + { + "epoch": 1.0962815817245017, + "grad_norm": 0.1893933388671602, + "learning_rate": 8.003178271678857e-05, + "loss": 2.8417, + "step": 17660 + }, + { + "epoch": 1.0963436588242597, + "grad_norm": 0.15987257141076133, + "learning_rate": 8.002889514229228e-05, + "loss": 2.9311, + "step": 17661 + }, + { + "epoch": 1.0964057359240176, + "grad_norm": 0.18458982899332477, + "learning_rate": 8.002600741112798e-05, + "loss": 2.9447, + "step": 17662 + }, + { + "epoch": 1.0964678130237755, + "grad_norm": 0.17510443832275027, + "learning_rate": 8.00231195233107e-05, + "loss": 2.8223, + "step": 17663 + }, + { + "epoch": 1.0965298901235334, + "grad_norm": 0.19036416124609318, + "learning_rate": 8.002023147885549e-05, + "loss": 2.9235, + "step": 17664 + }, + { + "epoch": 1.0965919672232913, + "grad_norm": 0.18050541484654486, + "learning_rate": 8.001734327777748e-05, + "loss": 2.8234, + "step": 17665 + }, + { + "epoch": 1.0966540443230492, + "grad_norm": 0.18391604237831483, + "learning_rate": 8.001445492009168e-05, + "loss": 2.9846, + "step": 17666 + }, + { + "epoch": 1.0967161214228072, + "grad_norm": 0.17451945001162392, + "learning_rate": 8.001156640581319e-05, + "loss": 2.9425, + "step": 17667 + }, + { + "epoch": 1.096778198522565, + "grad_norm": 0.1695694447486185, + "learning_rate": 8.000867773495709e-05, + "loss": 2.857, + "step": 17668 + }, + { + "epoch": 1.096840275622323, + "grad_norm": 0.1960399622307632, + "learning_rate": 8.000578890753842e-05, + "loss": 2.9755, + "step": 17669 + }, + { + "epoch": 1.096902352722081, + "grad_norm": 0.17433303198080233, + "learning_rate": 8.000289992357226e-05, + "loss": 2.8851, + "step": 17670 + }, + { + "epoch": 1.0969644298218388, + "grad_norm": 0.15855814106505314, + "learning_rate": 8.000001078307369e-05, + "loss": 2.8768, + "step": 17671 + }, + { + "epoch": 1.0970265069215965, + "grad_norm": 0.1792024201567107, + "learning_rate": 7.999712148605779e-05, + "loss": 2.9509, + "step": 17672 + }, + { + "epoch": 1.0970885840213545, + "grad_norm": 0.16171930101088855, + "learning_rate": 7.999423203253962e-05, + "loss": 2.824, + "step": 17673 + }, + { + "epoch": 1.0971506611211124, + "grad_norm": 0.16319330758404477, + "learning_rate": 7.999134242253426e-05, + "loss": 2.991, + "step": 17674 + }, + { + "epoch": 1.0972127382208703, + "grad_norm": 0.17263252043450114, + "learning_rate": 7.998845265605677e-05, + "loss": 2.8731, + "step": 17675 + }, + { + "epoch": 1.0972748153206282, + "grad_norm": 0.16696805389197789, + "learning_rate": 7.998556273312225e-05, + "loss": 2.8002, + "step": 17676 + }, + { + "epoch": 1.0973368924203861, + "grad_norm": 0.17843772766245733, + "learning_rate": 7.99826726537458e-05, + "loss": 2.8317, + "step": 17677 + }, + { + "epoch": 1.097398969520144, + "grad_norm": 0.1842249409166765, + "learning_rate": 7.997978241794243e-05, + "loss": 2.9738, + "step": 17678 + }, + { + "epoch": 1.097461046619902, + "grad_norm": 0.16337595068794342, + "learning_rate": 7.997689202572729e-05, + "loss": 2.9861, + "step": 17679 + }, + { + "epoch": 1.0975231237196599, + "grad_norm": 0.170936999738683, + "learning_rate": 7.997400147711539e-05, + "loss": 2.9359, + "step": 17680 + }, + { + "epoch": 1.0975852008194178, + "grad_norm": 0.1675084315942789, + "learning_rate": 7.997111077212187e-05, + "loss": 2.8453, + "step": 17681 + }, + { + "epoch": 1.0976472779191757, + "grad_norm": 0.15273030463451126, + "learning_rate": 7.996821991076178e-05, + "loss": 2.9288, + "step": 17682 + }, + { + "epoch": 1.0977093550189334, + "grad_norm": 0.16933527355657915, + "learning_rate": 7.996532889305022e-05, + "loss": 2.9584, + "step": 17683 + }, + { + "epoch": 1.0977714321186913, + "grad_norm": 0.1607190860635002, + "learning_rate": 7.996243771900227e-05, + "loss": 2.9345, + "step": 17684 + }, + { + "epoch": 1.0978335092184492, + "grad_norm": 0.1609221265912925, + "learning_rate": 7.995954638863299e-05, + "loss": 2.9286, + "step": 17685 + }, + { + "epoch": 1.0978955863182072, + "grad_norm": 0.1569365082750847, + "learning_rate": 7.995665490195751e-05, + "loss": 2.8566, + "step": 17686 + }, + { + "epoch": 1.097957663417965, + "grad_norm": 0.1583748050750972, + "learning_rate": 7.995376325899087e-05, + "loss": 2.8645, + "step": 17687 + }, + { + "epoch": 1.098019740517723, + "grad_norm": 0.15719614267762777, + "learning_rate": 7.995087145974817e-05, + "loss": 2.9162, + "step": 17688 + }, + { + "epoch": 1.098081817617481, + "grad_norm": 0.2137191992034185, + "learning_rate": 7.994797950424453e-05, + "loss": 2.9535, + "step": 17689 + }, + { + "epoch": 1.0981438947172388, + "grad_norm": 0.15639197932295418, + "learning_rate": 7.994508739249498e-05, + "loss": 2.9237, + "step": 17690 + }, + { + "epoch": 1.0982059718169968, + "grad_norm": 0.14235795017224848, + "learning_rate": 7.994219512451466e-05, + "loss": 2.8745, + "step": 17691 + }, + { + "epoch": 1.0982680489167547, + "grad_norm": 0.15644786346870324, + "learning_rate": 7.993930270031864e-05, + "loss": 2.9343, + "step": 17692 + }, + { + "epoch": 1.0983301260165126, + "grad_norm": 0.20217046755843204, + "learning_rate": 7.993641011992199e-05, + "loss": 2.9008, + "step": 17693 + }, + { + "epoch": 1.0983922031162705, + "grad_norm": 0.15261020344673076, + "learning_rate": 7.993351738333983e-05, + "loss": 2.9281, + "step": 17694 + }, + { + "epoch": 1.0984542802160284, + "grad_norm": 0.1597009677209419, + "learning_rate": 7.993062449058726e-05, + "loss": 2.8731, + "step": 17695 + }, + { + "epoch": 1.0985163573157861, + "grad_norm": 0.18893105059338125, + "learning_rate": 7.992773144167933e-05, + "loss": 2.8966, + "step": 17696 + }, + { + "epoch": 1.098578434415544, + "grad_norm": 0.17027698877197, + "learning_rate": 7.992483823663118e-05, + "loss": 2.9251, + "step": 17697 + }, + { + "epoch": 1.098640511515302, + "grad_norm": 0.17344363154492876, + "learning_rate": 7.992194487545787e-05, + "loss": 2.93, + "step": 17698 + }, + { + "epoch": 1.0987025886150599, + "grad_norm": 0.15835285753935135, + "learning_rate": 7.991905135817453e-05, + "loss": 2.8132, + "step": 17699 + }, + { + "epoch": 1.0987646657148178, + "grad_norm": 0.17709486002899805, + "learning_rate": 7.991615768479621e-05, + "loss": 2.9212, + "step": 17700 + }, + { + "epoch": 1.0988267428145757, + "grad_norm": 0.17132602605999517, + "learning_rate": 7.991326385533804e-05, + "loss": 2.9555, + "step": 17701 + }, + { + "epoch": 1.0988888199143336, + "grad_norm": 0.1676879409620023, + "learning_rate": 7.991036986981513e-05, + "loss": 2.9323, + "step": 17702 + }, + { + "epoch": 1.0989508970140915, + "grad_norm": 0.1605967360707157, + "learning_rate": 7.990747572824253e-05, + "loss": 2.9164, + "step": 17703 + }, + { + "epoch": 1.0990129741138495, + "grad_norm": 0.16902183166968276, + "learning_rate": 7.99045814306354e-05, + "loss": 2.9174, + "step": 17704 + }, + { + "epoch": 1.0990750512136074, + "grad_norm": 0.17667954645819625, + "learning_rate": 7.99016869770088e-05, + "loss": 2.9229, + "step": 17705 + }, + { + "epoch": 1.0991371283133653, + "grad_norm": 0.15212881380392546, + "learning_rate": 7.989879236737783e-05, + "loss": 2.8406, + "step": 17706 + }, + { + "epoch": 1.099199205413123, + "grad_norm": 0.1740005473828475, + "learning_rate": 7.989589760175759e-05, + "loss": 2.9336, + "step": 17707 + }, + { + "epoch": 1.099261282512881, + "grad_norm": 0.2106706202536919, + "learning_rate": 7.989300268016322e-05, + "loss": 2.9409, + "step": 17708 + }, + { + "epoch": 1.0993233596126388, + "grad_norm": 0.17751449566067126, + "learning_rate": 7.98901076026098e-05, + "loss": 2.9485, + "step": 17709 + }, + { + "epoch": 1.0993854367123967, + "grad_norm": 0.1601679437136458, + "learning_rate": 7.988721236911241e-05, + "loss": 2.9428, + "step": 17710 + }, + { + "epoch": 1.0994475138121547, + "grad_norm": 0.18425236923397395, + "learning_rate": 7.988431697968619e-05, + "loss": 2.8998, + "step": 17711 + }, + { + "epoch": 1.0995095909119126, + "grad_norm": 0.1753333690703181, + "learning_rate": 7.988142143434623e-05, + "loss": 2.9516, + "step": 17712 + }, + { + "epoch": 1.0995716680116705, + "grad_norm": 0.17173907475552275, + "learning_rate": 7.987852573310764e-05, + "loss": 2.783, + "step": 17713 + }, + { + "epoch": 1.0996337451114284, + "grad_norm": 0.1707065485909948, + "learning_rate": 7.987562987598553e-05, + "loss": 2.9014, + "step": 17714 + }, + { + "epoch": 1.0996958222111863, + "grad_norm": 0.15812447441337898, + "learning_rate": 7.987273386299501e-05, + "loss": 2.9412, + "step": 17715 + }, + { + "epoch": 1.0997578993109443, + "grad_norm": 0.16361886169672835, + "learning_rate": 7.986983769415118e-05, + "loss": 2.8537, + "step": 17716 + }, + { + "epoch": 1.0998199764107022, + "grad_norm": 0.1712914770080134, + "learning_rate": 7.986694136946918e-05, + "loss": 2.8985, + "step": 17717 + }, + { + "epoch": 1.09988205351046, + "grad_norm": 0.1876189033672873, + "learning_rate": 7.986404488896407e-05, + "loss": 2.8642, + "step": 17718 + }, + { + "epoch": 1.099944130610218, + "grad_norm": 0.15831113589648269, + "learning_rate": 7.986114825265099e-05, + "loss": 2.913, + "step": 17719 + }, + { + "epoch": 1.1000062077099757, + "grad_norm": 0.19051342614675057, + "learning_rate": 7.985825146054505e-05, + "loss": 2.7784, + "step": 17720 + }, + { + "epoch": 1.1000682848097336, + "grad_norm": 0.1712281507019917, + "learning_rate": 7.985535451266136e-05, + "loss": 2.9642, + "step": 17721 + }, + { + "epoch": 1.1001303619094915, + "grad_norm": 0.18114301755192624, + "learning_rate": 7.985245740901506e-05, + "loss": 2.8302, + "step": 17722 + }, + { + "epoch": 1.1001924390092495, + "grad_norm": 0.1743873003241223, + "learning_rate": 7.984956014962121e-05, + "loss": 2.895, + "step": 17723 + }, + { + "epoch": 1.1002545161090074, + "grad_norm": 0.14883946903679066, + "learning_rate": 7.984666273449499e-05, + "loss": 2.8754, + "step": 17724 + }, + { + "epoch": 1.1003165932087653, + "grad_norm": 0.18032016499853243, + "learning_rate": 7.984376516365146e-05, + "loss": 2.861, + "step": 17725 + }, + { + "epoch": 1.1003786703085232, + "grad_norm": 0.2140744915965649, + "learning_rate": 7.98408674371058e-05, + "loss": 2.8841, + "step": 17726 + }, + { + "epoch": 1.1004407474082811, + "grad_norm": 0.18388736864523117, + "learning_rate": 7.983796955487305e-05, + "loss": 2.8998, + "step": 17727 + }, + { + "epoch": 1.100502824508039, + "grad_norm": 0.16871607117063883, + "learning_rate": 7.983507151696838e-05, + "loss": 2.8025, + "step": 17728 + }, + { + "epoch": 1.100564901607797, + "grad_norm": 0.23184360565384668, + "learning_rate": 7.98321733234069e-05, + "loss": 2.9978, + "step": 17729 + }, + { + "epoch": 1.1006269787075549, + "grad_norm": 0.16622555306496525, + "learning_rate": 7.982927497420374e-05, + "loss": 2.9915, + "step": 17730 + }, + { + "epoch": 1.1006890558073126, + "grad_norm": 0.19564265956023347, + "learning_rate": 7.9826376469374e-05, + "loss": 2.7807, + "step": 17731 + }, + { + "epoch": 1.1007511329070705, + "grad_norm": 0.17053239040052862, + "learning_rate": 7.982347780893283e-05, + "loss": 2.8153, + "step": 17732 + }, + { + "epoch": 1.1008132100068284, + "grad_norm": 0.1952227386206368, + "learning_rate": 7.982057899289531e-05, + "loss": 2.8754, + "step": 17733 + }, + { + "epoch": 1.1008752871065863, + "grad_norm": 0.1716182644429481, + "learning_rate": 7.98176800212766e-05, + "loss": 2.8749, + "step": 17734 + }, + { + "epoch": 1.1009373642063442, + "grad_norm": 0.1828231364196282, + "learning_rate": 7.981478089409183e-05, + "loss": 2.9165, + "step": 17735 + }, + { + "epoch": 1.1009994413061022, + "grad_norm": 0.20517760089428688, + "learning_rate": 7.981188161135609e-05, + "loss": 2.8961, + "step": 17736 + }, + { + "epoch": 1.10106151840586, + "grad_norm": 0.24333312197625984, + "learning_rate": 7.980898217308454e-05, + "loss": 2.9996, + "step": 17737 + }, + { + "epoch": 1.101123595505618, + "grad_norm": 0.1992176560764572, + "learning_rate": 7.980608257929228e-05, + "loss": 2.8667, + "step": 17738 + }, + { + "epoch": 1.101185672605376, + "grad_norm": 0.17783899686390797, + "learning_rate": 7.980318282999447e-05, + "loss": 2.9043, + "step": 17739 + }, + { + "epoch": 1.1012477497051338, + "grad_norm": 0.17950877340268526, + "learning_rate": 7.98002829252062e-05, + "loss": 2.987, + "step": 17740 + }, + { + "epoch": 1.1013098268048918, + "grad_norm": 0.18762960524310981, + "learning_rate": 7.979738286494265e-05, + "loss": 2.9684, + "step": 17741 + }, + { + "epoch": 1.1013719039046497, + "grad_norm": 0.18132440667570104, + "learning_rate": 7.979448264921889e-05, + "loss": 2.9717, + "step": 17742 + }, + { + "epoch": 1.1014339810044076, + "grad_norm": 0.161907271138078, + "learning_rate": 7.979158227805009e-05, + "loss": 2.9008, + "step": 17743 + }, + { + "epoch": 1.1014960581041653, + "grad_norm": 0.169774056956835, + "learning_rate": 7.978868175145138e-05, + "loss": 2.8994, + "step": 17744 + }, + { + "epoch": 1.1015581352039232, + "grad_norm": 0.16684440855663285, + "learning_rate": 7.978578106943788e-05, + "loss": 2.8135, + "step": 17745 + }, + { + "epoch": 1.1016202123036811, + "grad_norm": 0.16446129830679498, + "learning_rate": 7.978288023202471e-05, + "loss": 2.7609, + "step": 17746 + }, + { + "epoch": 1.101682289403439, + "grad_norm": 0.15199618550701469, + "learning_rate": 7.977997923922707e-05, + "loss": 2.9141, + "step": 17747 + }, + { + "epoch": 1.101744366503197, + "grad_norm": 0.15453345233555163, + "learning_rate": 7.977707809106e-05, + "loss": 2.8236, + "step": 17748 + }, + { + "epoch": 1.1018064436029549, + "grad_norm": 0.15804470041903212, + "learning_rate": 7.977417678753871e-05, + "loss": 2.9685, + "step": 17749 + }, + { + "epoch": 1.1018685207027128, + "grad_norm": 0.1810970890471311, + "learning_rate": 7.977127532867832e-05, + "loss": 2.9343, + "step": 17750 + }, + { + "epoch": 1.1019305978024707, + "grad_norm": 0.19671217073402208, + "learning_rate": 7.976837371449395e-05, + "loss": 2.9964, + "step": 17751 + }, + { + "epoch": 1.1019926749022286, + "grad_norm": 0.17738659336989285, + "learning_rate": 7.976547194500075e-05, + "loss": 2.927, + "step": 17752 + }, + { + "epoch": 1.1020547520019865, + "grad_norm": 0.1843756400921589, + "learning_rate": 7.976257002021385e-05, + "loss": 2.9692, + "step": 17753 + }, + { + "epoch": 1.1021168291017445, + "grad_norm": 0.23074245364391915, + "learning_rate": 7.97596679401484e-05, + "loss": 2.8066, + "step": 17754 + }, + { + "epoch": 1.1021789062015022, + "grad_norm": 0.24876075054065522, + "learning_rate": 7.975676570481954e-05, + "loss": 2.9479, + "step": 17755 + }, + { + "epoch": 1.10224098330126, + "grad_norm": 0.20126669232523986, + "learning_rate": 7.975386331424241e-05, + "loss": 2.7794, + "step": 17756 + }, + { + "epoch": 1.102303060401018, + "grad_norm": 0.22501029017547752, + "learning_rate": 7.975096076843215e-05, + "loss": 3.0045, + "step": 17757 + }, + { + "epoch": 1.102365137500776, + "grad_norm": 0.17680869239558453, + "learning_rate": 7.974805806740391e-05, + "loss": 2.856, + "step": 17758 + }, + { + "epoch": 1.1024272146005338, + "grad_norm": 0.20731719081932107, + "learning_rate": 7.974515521117283e-05, + "loss": 2.9052, + "step": 17759 + }, + { + "epoch": 1.1024892917002918, + "grad_norm": 0.20336793164172082, + "learning_rate": 7.974225219975404e-05, + "loss": 2.9151, + "step": 17760 + }, + { + "epoch": 1.1025513688000497, + "grad_norm": 0.17545593676911614, + "learning_rate": 7.973934903316271e-05, + "loss": 2.8957, + "step": 17761 + }, + { + "epoch": 1.1026134458998076, + "grad_norm": 0.19640985254945645, + "learning_rate": 7.973644571141397e-05, + "loss": 2.8458, + "step": 17762 + }, + { + "epoch": 1.1026755229995655, + "grad_norm": 0.19445019147720777, + "learning_rate": 7.973354223452298e-05, + "loss": 2.9069, + "step": 17763 + }, + { + "epoch": 1.1027376000993234, + "grad_norm": 0.16532602135899285, + "learning_rate": 7.973063860250489e-05, + "loss": 2.8898, + "step": 17764 + }, + { + "epoch": 1.1027996771990813, + "grad_norm": 0.2087497820538965, + "learning_rate": 7.972773481537484e-05, + "loss": 2.8459, + "step": 17765 + }, + { + "epoch": 1.1028617542988393, + "grad_norm": 0.24928429348577225, + "learning_rate": 7.972483087314798e-05, + "loss": 2.9303, + "step": 17766 + }, + { + "epoch": 1.102923831398597, + "grad_norm": 0.16939688293098393, + "learning_rate": 7.972192677583944e-05, + "loss": 2.936, + "step": 17767 + }, + { + "epoch": 1.1029859084983549, + "grad_norm": 0.21332092719556187, + "learning_rate": 7.971902252346442e-05, + "loss": 2.8937, + "step": 17768 + }, + { + "epoch": 1.1030479855981128, + "grad_norm": 0.19308466615129063, + "learning_rate": 7.971611811603804e-05, + "loss": 2.8398, + "step": 17769 + }, + { + "epoch": 1.1031100626978707, + "grad_norm": 0.20030880943981094, + "learning_rate": 7.971321355357545e-05, + "loss": 2.8896, + "step": 17770 + }, + { + "epoch": 1.1031721397976286, + "grad_norm": 0.19190477144907603, + "learning_rate": 7.97103088360918e-05, + "loss": 2.9348, + "step": 17771 + }, + { + "epoch": 1.1032342168973865, + "grad_norm": 0.18709788317023177, + "learning_rate": 7.970740396360226e-05, + "loss": 2.9218, + "step": 17772 + }, + { + "epoch": 1.1032962939971445, + "grad_norm": 0.19331956704089057, + "learning_rate": 7.9704498936122e-05, + "loss": 2.922, + "step": 17773 + }, + { + "epoch": 1.1033583710969024, + "grad_norm": 0.1849991997495303, + "learning_rate": 7.970159375366615e-05, + "loss": 2.9046, + "step": 17774 + }, + { + "epoch": 1.1034204481966603, + "grad_norm": 0.21288915511157558, + "learning_rate": 7.969868841624988e-05, + "loss": 2.8699, + "step": 17775 + }, + { + "epoch": 1.1034825252964182, + "grad_norm": 0.18994587792474896, + "learning_rate": 7.969578292388833e-05, + "loss": 2.9375, + "step": 17776 + }, + { + "epoch": 1.1035446023961761, + "grad_norm": 0.19022969234272627, + "learning_rate": 7.969287727659669e-05, + "loss": 2.86, + "step": 17777 + }, + { + "epoch": 1.103606679495934, + "grad_norm": 0.22052382891648578, + "learning_rate": 7.968997147439008e-05, + "loss": 2.952, + "step": 17778 + }, + { + "epoch": 1.1036687565956917, + "grad_norm": 0.2026151346867894, + "learning_rate": 7.96870655172837e-05, + "loss": 2.8893, + "step": 17779 + }, + { + "epoch": 1.1037308336954497, + "grad_norm": 0.17557746412732161, + "learning_rate": 7.968415940529267e-05, + "loss": 2.8309, + "step": 17780 + }, + { + "epoch": 1.1037929107952076, + "grad_norm": 0.1743769447071237, + "learning_rate": 7.968125313843218e-05, + "loss": 2.9506, + "step": 17781 + }, + { + "epoch": 1.1038549878949655, + "grad_norm": 0.16249891381634315, + "learning_rate": 7.96783467167174e-05, + "loss": 2.8588, + "step": 17782 + }, + { + "epoch": 1.1039170649947234, + "grad_norm": 0.21406006498413682, + "learning_rate": 7.967544014016348e-05, + "loss": 2.9381, + "step": 17783 + }, + { + "epoch": 1.1039791420944813, + "grad_norm": 0.15813843263897306, + "learning_rate": 7.967253340878557e-05, + "loss": 2.9203, + "step": 17784 + }, + { + "epoch": 1.1040412191942393, + "grad_norm": 0.17964572006131177, + "learning_rate": 7.966962652259887e-05, + "loss": 2.9174, + "step": 17785 + }, + { + "epoch": 1.1041032962939972, + "grad_norm": 0.16763159825739096, + "learning_rate": 7.966671948161852e-05, + "loss": 2.9154, + "step": 17786 + }, + { + "epoch": 1.104165373393755, + "grad_norm": 0.16855146553080824, + "learning_rate": 7.966381228585968e-05, + "loss": 2.8473, + "step": 17787 + }, + { + "epoch": 1.104227450493513, + "grad_norm": 0.1672943070420258, + "learning_rate": 7.966090493533755e-05, + "loss": 2.8986, + "step": 17788 + }, + { + "epoch": 1.104289527593271, + "grad_norm": 0.19995561531914566, + "learning_rate": 7.965799743006726e-05, + "loss": 2.9018, + "step": 17789 + }, + { + "epoch": 1.1043516046930288, + "grad_norm": 0.16064119770336444, + "learning_rate": 7.965508977006402e-05, + "loss": 2.841, + "step": 17790 + }, + { + "epoch": 1.1044136817927865, + "grad_norm": 0.1553660662285421, + "learning_rate": 7.965218195534295e-05, + "loss": 2.8249, + "step": 17791 + }, + { + "epoch": 1.1044757588925445, + "grad_norm": 0.16149563832674732, + "learning_rate": 7.964927398591926e-05, + "loss": 2.8895, + "step": 17792 + }, + { + "epoch": 1.1045378359923024, + "grad_norm": 0.14715798311839187, + "learning_rate": 7.964636586180811e-05, + "loss": 2.8968, + "step": 17793 + }, + { + "epoch": 1.1045999130920603, + "grad_norm": 0.16437227091862588, + "learning_rate": 7.964345758302468e-05, + "loss": 2.9333, + "step": 17794 + }, + { + "epoch": 1.1046619901918182, + "grad_norm": 0.15254340171747635, + "learning_rate": 7.964054914958413e-05, + "loss": 2.8741, + "step": 17795 + }, + { + "epoch": 1.1047240672915761, + "grad_norm": 0.15249957160811367, + "learning_rate": 7.963764056150162e-05, + "loss": 2.8329, + "step": 17796 + }, + { + "epoch": 1.104786144391334, + "grad_norm": 0.16024799161662687, + "learning_rate": 7.963473181879238e-05, + "loss": 2.949, + "step": 17797 + }, + { + "epoch": 1.104848221491092, + "grad_norm": 0.17035773162238085, + "learning_rate": 7.963182292147154e-05, + "loss": 2.8593, + "step": 17798 + }, + { + "epoch": 1.1049102985908499, + "grad_norm": 0.15869224548117933, + "learning_rate": 7.962891386955427e-05, + "loss": 2.8205, + "step": 17799 + }, + { + "epoch": 1.1049723756906078, + "grad_norm": 0.18550119339618176, + "learning_rate": 7.962600466305576e-05, + "loss": 2.9231, + "step": 17800 + }, + { + "epoch": 1.1050344527903657, + "grad_norm": 0.17229461382759612, + "learning_rate": 7.962309530199121e-05, + "loss": 2.9651, + "step": 17801 + }, + { + "epoch": 1.1050965298901236, + "grad_norm": 0.14664195803451482, + "learning_rate": 7.962018578637577e-05, + "loss": 2.9452, + "step": 17802 + }, + { + "epoch": 1.1051586069898813, + "grad_norm": 0.17021437386093896, + "learning_rate": 7.961727611622465e-05, + "loss": 2.8646, + "step": 17803 + }, + { + "epoch": 1.1052206840896392, + "grad_norm": 0.18677842850426465, + "learning_rate": 7.9614366291553e-05, + "loss": 2.97, + "step": 17804 + }, + { + "epoch": 1.1052827611893972, + "grad_norm": 0.18142684538684503, + "learning_rate": 7.961145631237601e-05, + "loss": 2.8908, + "step": 17805 + }, + { + "epoch": 1.105344838289155, + "grad_norm": 0.1564449167425172, + "learning_rate": 7.960854617870886e-05, + "loss": 2.8777, + "step": 17806 + }, + { + "epoch": 1.105406915388913, + "grad_norm": 0.16370532347278915, + "learning_rate": 7.960563589056673e-05, + "loss": 2.8327, + "step": 17807 + }, + { + "epoch": 1.105468992488671, + "grad_norm": 0.16730283542438912, + "learning_rate": 7.960272544796483e-05, + "loss": 2.9673, + "step": 17808 + }, + { + "epoch": 1.1055310695884288, + "grad_norm": 0.1738216313210032, + "learning_rate": 7.959981485091832e-05, + "loss": 2.9109, + "step": 17809 + }, + { + "epoch": 1.1055931466881868, + "grad_norm": 0.16388972585494918, + "learning_rate": 7.959690409944238e-05, + "loss": 2.888, + "step": 17810 + }, + { + "epoch": 1.1056552237879447, + "grad_norm": 0.14683184420884693, + "learning_rate": 7.95939931935522e-05, + "loss": 2.9585, + "step": 17811 + }, + { + "epoch": 1.1057173008877026, + "grad_norm": 0.18791139690075648, + "learning_rate": 7.959108213326299e-05, + "loss": 2.9564, + "step": 17812 + }, + { + "epoch": 1.1057793779874605, + "grad_norm": 0.15909819476943984, + "learning_rate": 7.958817091858992e-05, + "loss": 2.9418, + "step": 17813 + }, + { + "epoch": 1.1058414550872184, + "grad_norm": 0.19084807187189, + "learning_rate": 7.958525954954816e-05, + "loss": 2.9133, + "step": 17814 + }, + { + "epoch": 1.1059035321869761, + "grad_norm": 0.20415671301601943, + "learning_rate": 7.958234802615294e-05, + "loss": 2.8463, + "step": 17815 + }, + { + "epoch": 1.105965609286734, + "grad_norm": 0.16732582426364004, + "learning_rate": 7.957943634841942e-05, + "loss": 2.9314, + "step": 17816 + }, + { + "epoch": 1.106027686386492, + "grad_norm": 0.17780808607566279, + "learning_rate": 7.957652451636281e-05, + "loss": 2.9089, + "step": 17817 + }, + { + "epoch": 1.1060897634862499, + "grad_norm": 0.1695278795847767, + "learning_rate": 7.957361252999829e-05, + "loss": 2.9781, + "step": 17818 + }, + { + "epoch": 1.1061518405860078, + "grad_norm": 0.1980476270281416, + "learning_rate": 7.957070038934105e-05, + "loss": 2.94, + "step": 17819 + }, + { + "epoch": 1.1062139176857657, + "grad_norm": 0.2248523807393447, + "learning_rate": 7.956778809440628e-05, + "loss": 2.9125, + "step": 17820 + }, + { + "epoch": 1.1062759947855236, + "grad_norm": 0.17575624076105556, + "learning_rate": 7.95648756452092e-05, + "loss": 2.8409, + "step": 17821 + }, + { + "epoch": 1.1063380718852815, + "grad_norm": 0.18246394372781669, + "learning_rate": 7.956196304176498e-05, + "loss": 2.8948, + "step": 17822 + }, + { + "epoch": 1.1064001489850395, + "grad_norm": 0.16786702535602535, + "learning_rate": 7.955905028408881e-05, + "loss": 2.9719, + "step": 17823 + }, + { + "epoch": 1.1064622260847974, + "grad_norm": 0.18112064094444896, + "learning_rate": 7.95561373721959e-05, + "loss": 2.8967, + "step": 17824 + }, + { + "epoch": 1.1065243031845553, + "grad_norm": 0.18107399698638257, + "learning_rate": 7.955322430610146e-05, + "loss": 2.9507, + "step": 17825 + }, + { + "epoch": 1.1065863802843132, + "grad_norm": 0.18872197633143545, + "learning_rate": 7.955031108582066e-05, + "loss": 2.9302, + "step": 17826 + }, + { + "epoch": 1.106648457384071, + "grad_norm": 0.19351690791678505, + "learning_rate": 7.954739771136872e-05, + "loss": 2.8089, + "step": 17827 + }, + { + "epoch": 1.1067105344838288, + "grad_norm": 0.16331555996220526, + "learning_rate": 7.954448418276084e-05, + "loss": 2.8412, + "step": 17828 + }, + { + "epoch": 1.1067726115835868, + "grad_norm": 0.15606181017456247, + "learning_rate": 7.954157050001221e-05, + "loss": 2.8299, + "step": 17829 + }, + { + "epoch": 1.1068346886833447, + "grad_norm": 0.17454410822366445, + "learning_rate": 7.953865666313802e-05, + "loss": 2.9141, + "step": 17830 + }, + { + "epoch": 1.1068967657831026, + "grad_norm": 0.18520156450707614, + "learning_rate": 7.95357426721535e-05, + "loss": 2.9584, + "step": 17831 + }, + { + "epoch": 1.1069588428828605, + "grad_norm": 0.16904775457367574, + "learning_rate": 7.953282852707385e-05, + "loss": 2.9486, + "step": 17832 + }, + { + "epoch": 1.1070209199826184, + "grad_norm": 0.15835320453638413, + "learning_rate": 7.952991422791424e-05, + "loss": 2.8353, + "step": 17833 + }, + { + "epoch": 1.1070829970823763, + "grad_norm": 0.16333197690946283, + "learning_rate": 7.952699977468993e-05, + "loss": 2.8872, + "step": 17834 + }, + { + "epoch": 1.1071450741821343, + "grad_norm": 0.16405128826705467, + "learning_rate": 7.952408516741607e-05, + "loss": 2.9931, + "step": 17835 + }, + { + "epoch": 1.1072071512818922, + "grad_norm": 0.16770852246382417, + "learning_rate": 7.95211704061079e-05, + "loss": 2.8999, + "step": 17836 + }, + { + "epoch": 1.10726922838165, + "grad_norm": 0.18474105728080512, + "learning_rate": 7.951825549078061e-05, + "loss": 2.9476, + "step": 17837 + }, + { + "epoch": 1.107331305481408, + "grad_norm": 0.18270653563571632, + "learning_rate": 7.951534042144943e-05, + "loss": 2.9095, + "step": 17838 + }, + { + "epoch": 1.1073933825811657, + "grad_norm": 0.1556705698450411, + "learning_rate": 7.951242519812955e-05, + "loss": 2.8681, + "step": 17839 + }, + { + "epoch": 1.1074554596809236, + "grad_norm": 0.1754644453093056, + "learning_rate": 7.950950982083618e-05, + "loss": 2.8535, + "step": 17840 + }, + { + "epoch": 1.1075175367806815, + "grad_norm": 0.15555592996747603, + "learning_rate": 7.950659428958454e-05, + "loss": 2.8531, + "step": 17841 + }, + { + "epoch": 1.1075796138804395, + "grad_norm": 0.17629144590471127, + "learning_rate": 7.95036786043898e-05, + "loss": 2.9632, + "step": 17842 + }, + { + "epoch": 1.1076416909801974, + "grad_norm": 0.17231536934172043, + "learning_rate": 7.950076276526724e-05, + "loss": 2.9131, + "step": 17843 + }, + { + "epoch": 1.1077037680799553, + "grad_norm": 0.15535422089057266, + "learning_rate": 7.949784677223203e-05, + "loss": 2.9448, + "step": 17844 + }, + { + "epoch": 1.1077658451797132, + "grad_norm": 0.1653314686206273, + "learning_rate": 7.94949306252994e-05, + "loss": 2.7957, + "step": 17845 + }, + { + "epoch": 1.1078279222794711, + "grad_norm": 0.1648551435284561, + "learning_rate": 7.949201432448454e-05, + "loss": 2.8584, + "step": 17846 + }, + { + "epoch": 1.107889999379229, + "grad_norm": 0.17241415618398886, + "learning_rate": 7.94890978698027e-05, + "loss": 2.9179, + "step": 17847 + }, + { + "epoch": 1.107952076478987, + "grad_norm": 0.16246515895479713, + "learning_rate": 7.948618126126907e-05, + "loss": 2.9518, + "step": 17848 + }, + { + "epoch": 1.1080141535787449, + "grad_norm": 0.16767127842486404, + "learning_rate": 7.948326449889888e-05, + "loss": 2.9455, + "step": 17849 + }, + { + "epoch": 1.1080762306785026, + "grad_norm": 0.17856762043264324, + "learning_rate": 7.948034758270733e-05, + "loss": 2.9004, + "step": 17850 + }, + { + "epoch": 1.1081383077782605, + "grad_norm": 0.1971918478001285, + "learning_rate": 7.947743051270967e-05, + "loss": 2.942, + "step": 17851 + }, + { + "epoch": 1.1082003848780184, + "grad_norm": 0.146556613911695, + "learning_rate": 7.947451328892108e-05, + "loss": 2.8504, + "step": 17852 + }, + { + "epoch": 1.1082624619777763, + "grad_norm": 0.17073849414189984, + "learning_rate": 7.94715959113568e-05, + "loss": 2.9458, + "step": 17853 + }, + { + "epoch": 1.1083245390775343, + "grad_norm": 0.18051119342437022, + "learning_rate": 7.946867838003207e-05, + "loss": 3.0132, + "step": 17854 + }, + { + "epoch": 1.1083866161772922, + "grad_norm": 0.15266967412790958, + "learning_rate": 7.946576069496207e-05, + "loss": 2.8777, + "step": 17855 + }, + { + "epoch": 1.10844869327705, + "grad_norm": 0.17038623346776355, + "learning_rate": 7.946284285616204e-05, + "loss": 2.9721, + "step": 17856 + }, + { + "epoch": 1.108510770376808, + "grad_norm": 0.15346214391759191, + "learning_rate": 7.945992486364723e-05, + "loss": 2.9448, + "step": 17857 + }, + { + "epoch": 1.108572847476566, + "grad_norm": 0.1571148007203865, + "learning_rate": 7.945700671743283e-05, + "loss": 2.9261, + "step": 17858 + }, + { + "epoch": 1.1086349245763238, + "grad_norm": 0.15349918913277602, + "learning_rate": 7.945408841753407e-05, + "loss": 2.9604, + "step": 17859 + }, + { + "epoch": 1.1086970016760818, + "grad_norm": 0.18136943548724926, + "learning_rate": 7.945116996396618e-05, + "loss": 2.8239, + "step": 17860 + }, + { + "epoch": 1.1087590787758397, + "grad_norm": 0.16132344615515462, + "learning_rate": 7.944825135674438e-05, + "loss": 2.9777, + "step": 17861 + }, + { + "epoch": 1.1088211558755976, + "grad_norm": 0.15097933886345236, + "learning_rate": 7.944533259588391e-05, + "loss": 2.9065, + "step": 17862 + }, + { + "epoch": 1.1088832329753553, + "grad_norm": 0.20662858780124707, + "learning_rate": 7.94424136814e-05, + "loss": 2.9091, + "step": 17863 + }, + { + "epoch": 1.1089453100751132, + "grad_norm": 0.17671531356013437, + "learning_rate": 7.943949461330787e-05, + "loss": 2.8694, + "step": 17864 + }, + { + "epoch": 1.1090073871748711, + "grad_norm": 0.16516771888429824, + "learning_rate": 7.943657539162274e-05, + "loss": 2.8241, + "step": 17865 + }, + { + "epoch": 1.109069464274629, + "grad_norm": 0.15593914462084213, + "learning_rate": 7.943365601635986e-05, + "loss": 2.9865, + "step": 17866 + }, + { + "epoch": 1.109131541374387, + "grad_norm": 0.1733453078147522, + "learning_rate": 7.943073648753444e-05, + "loss": 2.9276, + "step": 17867 + }, + { + "epoch": 1.1091936184741449, + "grad_norm": 0.17586547635076197, + "learning_rate": 7.942781680516174e-05, + "loss": 2.8974, + "step": 17868 + }, + { + "epoch": 1.1092556955739028, + "grad_norm": 0.15902615380175197, + "learning_rate": 7.942489696925694e-05, + "loss": 2.9393, + "step": 17869 + }, + { + "epoch": 1.1093177726736607, + "grad_norm": 0.15317060973677898, + "learning_rate": 7.942197697983535e-05, + "loss": 2.8681, + "step": 17870 + }, + { + "epoch": 1.1093798497734186, + "grad_norm": 0.15882202641477447, + "learning_rate": 7.941905683691215e-05, + "loss": 2.9371, + "step": 17871 + }, + { + "epoch": 1.1094419268731766, + "grad_norm": 0.15571767010774618, + "learning_rate": 7.941613654050258e-05, + "loss": 2.921, + "step": 17872 + }, + { + "epoch": 1.1095040039729345, + "grad_norm": 0.15339157246497845, + "learning_rate": 7.941321609062188e-05, + "loss": 2.9557, + "step": 17873 + }, + { + "epoch": 1.1095660810726922, + "grad_norm": 0.16971291925096563, + "learning_rate": 7.941029548728532e-05, + "loss": 2.9455, + "step": 17874 + }, + { + "epoch": 1.10962815817245, + "grad_norm": 0.20208257256361395, + "learning_rate": 7.94073747305081e-05, + "loss": 2.8631, + "step": 17875 + }, + { + "epoch": 1.109690235272208, + "grad_norm": 0.18675104205161247, + "learning_rate": 7.940445382030546e-05, + "loss": 2.8969, + "step": 17876 + }, + { + "epoch": 1.109752312371966, + "grad_norm": 0.15493937610415176, + "learning_rate": 7.940153275669264e-05, + "loss": 2.8623, + "step": 17877 + }, + { + "epoch": 1.1098143894717238, + "grad_norm": 0.15968621875861086, + "learning_rate": 7.939861153968489e-05, + "loss": 2.9329, + "step": 17878 + }, + { + "epoch": 1.1098764665714818, + "grad_norm": 0.15109322273651563, + "learning_rate": 7.939569016929744e-05, + "loss": 2.9781, + "step": 17879 + }, + { + "epoch": 1.1099385436712397, + "grad_norm": 0.16335240027645093, + "learning_rate": 7.939276864554554e-05, + "loss": 2.8975, + "step": 17880 + }, + { + "epoch": 1.1100006207709976, + "grad_norm": 0.15236332963175714, + "learning_rate": 7.938984696844444e-05, + "loss": 2.89, + "step": 17881 + }, + { + "epoch": 1.1100626978707555, + "grad_norm": 0.17706323023361722, + "learning_rate": 7.938692513800939e-05, + "loss": 2.8728, + "step": 17882 + }, + { + "epoch": 1.1101247749705134, + "grad_norm": 0.17574649043262158, + "learning_rate": 7.938400315425558e-05, + "loss": 2.8251, + "step": 17883 + }, + { + "epoch": 1.1101868520702713, + "grad_norm": 0.15808626851206728, + "learning_rate": 7.938108101719833e-05, + "loss": 2.915, + "step": 17884 + }, + { + "epoch": 1.1102489291700293, + "grad_norm": 0.16117977267214023, + "learning_rate": 7.937815872685281e-05, + "loss": 2.8831, + "step": 17885 + }, + { + "epoch": 1.1103110062697872, + "grad_norm": 0.1520752680667024, + "learning_rate": 7.937523628323433e-05, + "loss": 2.8905, + "step": 17886 + }, + { + "epoch": 1.1103730833695449, + "grad_norm": 0.14814337652632742, + "learning_rate": 7.93723136863581e-05, + "loss": 2.9537, + "step": 17887 + }, + { + "epoch": 1.1104351604693028, + "grad_norm": 0.1499815678695939, + "learning_rate": 7.936939093623939e-05, + "loss": 2.9146, + "step": 17888 + }, + { + "epoch": 1.1104972375690607, + "grad_norm": 0.1872638466933205, + "learning_rate": 7.936646803289343e-05, + "loss": 2.9124, + "step": 17889 + }, + { + "epoch": 1.1105593146688186, + "grad_norm": 0.16020933876313243, + "learning_rate": 7.936354497633547e-05, + "loss": 2.8654, + "step": 17890 + }, + { + "epoch": 1.1106213917685765, + "grad_norm": 0.22308856420852535, + "learning_rate": 7.936062176658078e-05, + "loss": 2.877, + "step": 17891 + }, + { + "epoch": 1.1106834688683345, + "grad_norm": 0.1558754544580346, + "learning_rate": 7.93576984036446e-05, + "loss": 2.8383, + "step": 17892 + }, + { + "epoch": 1.1107455459680924, + "grad_norm": 0.15030748372818184, + "learning_rate": 7.935477488754217e-05, + "loss": 2.8284, + "step": 17893 + }, + { + "epoch": 1.1108076230678503, + "grad_norm": 0.15755789656725427, + "learning_rate": 7.935185121828875e-05, + "loss": 2.8616, + "step": 17894 + }, + { + "epoch": 1.1108697001676082, + "grad_norm": 0.14425184332043375, + "learning_rate": 7.93489273958996e-05, + "loss": 2.9213, + "step": 17895 + }, + { + "epoch": 1.1109317772673661, + "grad_norm": 0.14915091585491302, + "learning_rate": 7.934600342038998e-05, + "loss": 2.9664, + "step": 17896 + }, + { + "epoch": 1.110993854367124, + "grad_norm": 0.15260162902019717, + "learning_rate": 7.934307929177513e-05, + "loss": 2.8963, + "step": 17897 + }, + { + "epoch": 1.1110559314668818, + "grad_norm": 0.1895430636517637, + "learning_rate": 7.934015501007033e-05, + "loss": 2.8826, + "step": 17898 + }, + { + "epoch": 1.1111180085666397, + "grad_norm": 0.1677280242439024, + "learning_rate": 7.933723057529078e-05, + "loss": 2.9385, + "step": 17899 + }, + { + "epoch": 1.1111800856663976, + "grad_norm": 0.14876365591925042, + "learning_rate": 7.933430598745182e-05, + "loss": 2.9306, + "step": 17900 + }, + { + "epoch": 1.1112421627661555, + "grad_norm": 0.16294707593957217, + "learning_rate": 7.933138124656865e-05, + "loss": 2.9027, + "step": 17901 + }, + { + "epoch": 1.1113042398659134, + "grad_norm": 0.14383112960735467, + "learning_rate": 7.932845635265653e-05, + "loss": 2.9567, + "step": 17902 + }, + { + "epoch": 1.1113663169656713, + "grad_norm": 0.1834229730153529, + "learning_rate": 7.932553130573075e-05, + "loss": 2.9541, + "step": 17903 + }, + { + "epoch": 1.1114283940654293, + "grad_norm": 0.14955079298752957, + "learning_rate": 7.932260610580655e-05, + "loss": 2.8888, + "step": 17904 + }, + { + "epoch": 1.1114904711651872, + "grad_norm": 0.1596308148754354, + "learning_rate": 7.931968075289918e-05, + "loss": 2.9591, + "step": 17905 + }, + { + "epoch": 1.111552548264945, + "grad_norm": 0.1460749244290355, + "learning_rate": 7.931675524702393e-05, + "loss": 2.7927, + "step": 17906 + }, + { + "epoch": 1.111614625364703, + "grad_norm": 0.16845422825347192, + "learning_rate": 7.931382958819605e-05, + "loss": 2.9258, + "step": 17907 + }, + { + "epoch": 1.111676702464461, + "grad_norm": 0.1470682207341272, + "learning_rate": 7.93109037764308e-05, + "loss": 2.8389, + "step": 17908 + }, + { + "epoch": 1.1117387795642188, + "grad_norm": 0.18509167498820303, + "learning_rate": 7.930797781174346e-05, + "loss": 2.9704, + "step": 17909 + }, + { + "epoch": 1.1118008566639768, + "grad_norm": 0.16365303304372059, + "learning_rate": 7.930505169414927e-05, + "loss": 2.9509, + "step": 17910 + }, + { + "epoch": 1.1118629337637345, + "grad_norm": 0.1721588089053903, + "learning_rate": 7.930212542366351e-05, + "loss": 2.9642, + "step": 17911 + }, + { + "epoch": 1.1119250108634924, + "grad_norm": 0.17659983240270277, + "learning_rate": 7.929919900030147e-05, + "loss": 3.0184, + "step": 17912 + }, + { + "epoch": 1.1119870879632503, + "grad_norm": 0.17802071823204194, + "learning_rate": 7.929627242407838e-05, + "loss": 2.8771, + "step": 17913 + }, + { + "epoch": 1.1120491650630082, + "grad_norm": 0.17300547735378577, + "learning_rate": 7.929334569500954e-05, + "loss": 2.838, + "step": 17914 + }, + { + "epoch": 1.1121112421627661, + "grad_norm": 0.16824986531753874, + "learning_rate": 7.929041881311019e-05, + "loss": 2.9969, + "step": 17915 + }, + { + "epoch": 1.112173319262524, + "grad_norm": 0.14668499877294164, + "learning_rate": 7.928749177839562e-05, + "loss": 2.8348, + "step": 17916 + }, + { + "epoch": 1.112235396362282, + "grad_norm": 0.16350977752771897, + "learning_rate": 7.92845645908811e-05, + "loss": 2.9954, + "step": 17917 + }, + { + "epoch": 1.1122974734620399, + "grad_norm": 0.1651450950803682, + "learning_rate": 7.928163725058188e-05, + "loss": 3.0225, + "step": 17918 + }, + { + "epoch": 1.1123595505617978, + "grad_norm": 0.1529638313188449, + "learning_rate": 7.927870975751327e-05, + "loss": 2.8116, + "step": 17919 + }, + { + "epoch": 1.1124216276615557, + "grad_norm": 0.16394910413071798, + "learning_rate": 7.92757821116905e-05, + "loss": 2.8894, + "step": 17920 + }, + { + "epoch": 1.1124837047613136, + "grad_norm": 0.15200052355649463, + "learning_rate": 7.92728543131289e-05, + "loss": 2.9246, + "step": 17921 + }, + { + "epoch": 1.1125457818610713, + "grad_norm": 0.15395672643319327, + "learning_rate": 7.92699263618437e-05, + "loss": 2.919, + "step": 17922 + }, + { + "epoch": 1.1126078589608293, + "grad_norm": 0.1568720950469403, + "learning_rate": 7.926699825785017e-05, + "loss": 2.8484, + "step": 17923 + }, + { + "epoch": 1.1126699360605872, + "grad_norm": 0.1476082539637486, + "learning_rate": 7.926407000116363e-05, + "loss": 2.8711, + "step": 17924 + }, + { + "epoch": 1.112732013160345, + "grad_norm": 0.1555840785734131, + "learning_rate": 7.926114159179932e-05, + "loss": 2.9413, + "step": 17925 + }, + { + "epoch": 1.112794090260103, + "grad_norm": 0.16593595163696953, + "learning_rate": 7.925821302977254e-05, + "loss": 2.9832, + "step": 17926 + }, + { + "epoch": 1.112856167359861, + "grad_norm": 0.159285802517174, + "learning_rate": 7.925528431509855e-05, + "loss": 2.9463, + "step": 17927 + }, + { + "epoch": 1.1129182444596188, + "grad_norm": 0.15680590920492424, + "learning_rate": 7.925235544779264e-05, + "loss": 2.8763, + "step": 17928 + }, + { + "epoch": 1.1129803215593768, + "grad_norm": 0.1827164752342992, + "learning_rate": 7.924942642787009e-05, + "loss": 2.8876, + "step": 17929 + }, + { + "epoch": 1.1130423986591347, + "grad_norm": 0.1598698975272252, + "learning_rate": 7.924649725534619e-05, + "loss": 2.8637, + "step": 17930 + }, + { + "epoch": 1.1131044757588926, + "grad_norm": 0.19141305210716725, + "learning_rate": 7.92435679302362e-05, + "loss": 2.9748, + "step": 17931 + }, + { + "epoch": 1.1131665528586505, + "grad_norm": 0.16777512013276594, + "learning_rate": 7.924063845255542e-05, + "loss": 2.8762, + "step": 17932 + }, + { + "epoch": 1.1132286299584084, + "grad_norm": 0.17329516842515752, + "learning_rate": 7.923770882231915e-05, + "loss": 2.8276, + "step": 17933 + }, + { + "epoch": 1.1132907070581664, + "grad_norm": 0.16572804506854646, + "learning_rate": 7.923477903954264e-05, + "loss": 2.947, + "step": 17934 + }, + { + "epoch": 1.113352784157924, + "grad_norm": 0.1707241532055328, + "learning_rate": 7.92318491042412e-05, + "loss": 2.842, + "step": 17935 + }, + { + "epoch": 1.113414861257682, + "grad_norm": 0.16629206137879943, + "learning_rate": 7.922891901643009e-05, + "loss": 2.8734, + "step": 17936 + }, + { + "epoch": 1.1134769383574399, + "grad_norm": 0.16550355363326708, + "learning_rate": 7.922598877612463e-05, + "loss": 2.9107, + "step": 17937 + }, + { + "epoch": 1.1135390154571978, + "grad_norm": 0.172304337881041, + "learning_rate": 7.922305838334008e-05, + "loss": 2.8541, + "step": 17938 + }, + { + "epoch": 1.1136010925569557, + "grad_norm": 0.18161739693064816, + "learning_rate": 7.922012783809176e-05, + "loss": 2.8569, + "step": 17939 + }, + { + "epoch": 1.1136631696567136, + "grad_norm": 0.16916079918448845, + "learning_rate": 7.921719714039492e-05, + "loss": 2.9375, + "step": 17940 + }, + { + "epoch": 1.1137252467564716, + "grad_norm": 0.1623769356033714, + "learning_rate": 7.921426629026487e-05, + "loss": 2.9599, + "step": 17941 + }, + { + "epoch": 1.1137873238562295, + "grad_norm": 0.15542146835971926, + "learning_rate": 7.92113352877169e-05, + "loss": 2.8669, + "step": 17942 + }, + { + "epoch": 1.1138494009559874, + "grad_norm": 0.16457557491433572, + "learning_rate": 7.92084041327663e-05, + "loss": 2.8205, + "step": 17943 + }, + { + "epoch": 1.1139114780557453, + "grad_norm": 0.16305109896852457, + "learning_rate": 7.920547282542839e-05, + "loss": 2.9092, + "step": 17944 + }, + { + "epoch": 1.1139735551555032, + "grad_norm": 0.17048959601973981, + "learning_rate": 7.920254136571843e-05, + "loss": 2.8767, + "step": 17945 + }, + { + "epoch": 1.114035632255261, + "grad_norm": 0.18877706142667042, + "learning_rate": 7.91996097536517e-05, + "loss": 2.8839, + "step": 17946 + }, + { + "epoch": 1.1140977093550188, + "grad_norm": 0.15257293627891988, + "learning_rate": 7.919667798924354e-05, + "loss": 2.8045, + "step": 17947 + }, + { + "epoch": 1.1141597864547768, + "grad_norm": 0.18987757534518168, + "learning_rate": 7.919374607250922e-05, + "loss": 2.9621, + "step": 17948 + }, + { + "epoch": 1.1142218635545347, + "grad_norm": 0.15769028250945352, + "learning_rate": 7.919081400346403e-05, + "loss": 3.0219, + "step": 17949 + }, + { + "epoch": 1.1142839406542926, + "grad_norm": 0.16710057004012943, + "learning_rate": 7.918788178212328e-05, + "loss": 2.9413, + "step": 17950 + }, + { + "epoch": 1.1143460177540505, + "grad_norm": 0.19753269020078168, + "learning_rate": 7.918494940850227e-05, + "loss": 2.8088, + "step": 17951 + }, + { + "epoch": 1.1144080948538084, + "grad_norm": 0.22539311379524643, + "learning_rate": 7.918201688261628e-05, + "loss": 2.8624, + "step": 17952 + }, + { + "epoch": 1.1144701719535663, + "grad_norm": 0.17158465896357292, + "learning_rate": 7.917908420448065e-05, + "loss": 2.9283, + "step": 17953 + }, + { + "epoch": 1.1145322490533243, + "grad_norm": 0.17950320121838031, + "learning_rate": 7.917615137411064e-05, + "loss": 2.9573, + "step": 17954 + }, + { + "epoch": 1.1145943261530822, + "grad_norm": 0.18178256446962515, + "learning_rate": 7.917321839152156e-05, + "loss": 2.9051, + "step": 17955 + }, + { + "epoch": 1.11465640325284, + "grad_norm": 0.17941210208538536, + "learning_rate": 7.917028525672872e-05, + "loss": 2.9435, + "step": 17956 + }, + { + "epoch": 1.114718480352598, + "grad_norm": 0.2244761776927971, + "learning_rate": 7.916735196974742e-05, + "loss": 2.8682, + "step": 17957 + }, + { + "epoch": 1.114780557452356, + "grad_norm": 0.18680952012591143, + "learning_rate": 7.916441853059297e-05, + "loss": 2.8672, + "step": 17958 + }, + { + "epoch": 1.1148426345521136, + "grad_norm": 0.19093422198564408, + "learning_rate": 7.916148493928066e-05, + "loss": 2.9487, + "step": 17959 + }, + { + "epoch": 1.1149047116518715, + "grad_norm": 0.1734072325904301, + "learning_rate": 7.915855119582582e-05, + "loss": 2.8859, + "step": 17960 + }, + { + "epoch": 1.1149667887516295, + "grad_norm": 0.17233134506071335, + "learning_rate": 7.915561730024373e-05, + "loss": 2.9027, + "step": 17961 + }, + { + "epoch": 1.1150288658513874, + "grad_norm": 0.17608376010086918, + "learning_rate": 7.91526832525497e-05, + "loss": 2.8896, + "step": 17962 + }, + { + "epoch": 1.1150909429511453, + "grad_norm": 0.15581121226389513, + "learning_rate": 7.914974905275905e-05, + "loss": 2.9395, + "step": 17963 + }, + { + "epoch": 1.1151530200509032, + "grad_norm": 0.19261461419061668, + "learning_rate": 7.914681470088708e-05, + "loss": 2.8059, + "step": 17964 + }, + { + "epoch": 1.1152150971506611, + "grad_norm": 0.19983955966472408, + "learning_rate": 7.914388019694911e-05, + "loss": 2.9656, + "step": 17965 + }, + { + "epoch": 1.115277174250419, + "grad_norm": 0.16462181889770505, + "learning_rate": 7.914094554096043e-05, + "loss": 2.8288, + "step": 17966 + }, + { + "epoch": 1.115339251350177, + "grad_norm": 0.16859183515428222, + "learning_rate": 7.913801073293639e-05, + "loss": 2.8542, + "step": 17967 + }, + { + "epoch": 1.115401328449935, + "grad_norm": 0.1600444682732736, + "learning_rate": 7.913507577289225e-05, + "loss": 2.9627, + "step": 17968 + }, + { + "epoch": 1.1154634055496928, + "grad_norm": 0.2067162078043696, + "learning_rate": 7.913214066084335e-05, + "loss": 2.8368, + "step": 17969 + }, + { + "epoch": 1.1155254826494505, + "grad_norm": 0.14962710980112595, + "learning_rate": 7.9129205396805e-05, + "loss": 2.9968, + "step": 17970 + }, + { + "epoch": 1.1155875597492084, + "grad_norm": 0.14628622003908187, + "learning_rate": 7.912626998079251e-05, + "loss": 2.9192, + "step": 17971 + }, + { + "epoch": 1.1156496368489663, + "grad_norm": 0.15562861983176027, + "learning_rate": 7.91233344128212e-05, + "loss": 2.8848, + "step": 17972 + }, + { + "epoch": 1.1157117139487243, + "grad_norm": 0.16033187401506277, + "learning_rate": 7.912039869290641e-05, + "loss": 2.9489, + "step": 17973 + }, + { + "epoch": 1.1157737910484822, + "grad_norm": 0.16839939734801027, + "learning_rate": 7.911746282106341e-05, + "loss": 2.9935, + "step": 17974 + }, + { + "epoch": 1.11583586814824, + "grad_norm": 0.15021290744262136, + "learning_rate": 7.911452679730753e-05, + "loss": 2.9051, + "step": 17975 + }, + { + "epoch": 1.115897945247998, + "grad_norm": 0.24299454046177876, + "learning_rate": 7.911159062165409e-05, + "loss": 2.841, + "step": 17976 + }, + { + "epoch": 1.115960022347756, + "grad_norm": 0.1595577711315328, + "learning_rate": 7.910865429411844e-05, + "loss": 2.9691, + "step": 17977 + }, + { + "epoch": 1.1160220994475138, + "grad_norm": 0.1746743104933641, + "learning_rate": 7.910571781471585e-05, + "loss": 2.8619, + "step": 17978 + }, + { + "epoch": 1.1160841765472718, + "grad_norm": 0.17869937838500702, + "learning_rate": 7.910278118346168e-05, + "loss": 2.9034, + "step": 17979 + }, + { + "epoch": 1.1161462536470297, + "grad_norm": 0.1680460629960096, + "learning_rate": 7.909984440037124e-05, + "loss": 2.9364, + "step": 17980 + }, + { + "epoch": 1.1162083307467876, + "grad_norm": 0.17935840905450567, + "learning_rate": 7.909690746545985e-05, + "loss": 2.8488, + "step": 17981 + }, + { + "epoch": 1.1162704078465455, + "grad_norm": 0.15563414919244573, + "learning_rate": 7.909397037874282e-05, + "loss": 2.8827, + "step": 17982 + }, + { + "epoch": 1.1163324849463032, + "grad_norm": 0.17954139367074928, + "learning_rate": 7.909103314023549e-05, + "loss": 2.943, + "step": 17983 + }, + { + "epoch": 1.1163945620460611, + "grad_norm": 0.17969167745958708, + "learning_rate": 7.908809574995316e-05, + "loss": 2.9121, + "step": 17984 + }, + { + "epoch": 1.116456639145819, + "grad_norm": 0.17199330159939138, + "learning_rate": 7.908515820791119e-05, + "loss": 2.9446, + "step": 17985 + }, + { + "epoch": 1.116518716245577, + "grad_norm": 0.14870077551873276, + "learning_rate": 7.908222051412488e-05, + "loss": 2.8626, + "step": 17986 + }, + { + "epoch": 1.1165807933453349, + "grad_norm": 0.15855071066347043, + "learning_rate": 7.907928266860958e-05, + "loss": 2.9807, + "step": 17987 + }, + { + "epoch": 1.1166428704450928, + "grad_norm": 0.16117632307348595, + "learning_rate": 7.907634467138059e-05, + "loss": 2.8533, + "step": 17988 + }, + { + "epoch": 1.1167049475448507, + "grad_norm": 0.15527162011000656, + "learning_rate": 7.907340652245325e-05, + "loss": 2.9368, + "step": 17989 + }, + { + "epoch": 1.1167670246446086, + "grad_norm": 0.18137064733086963, + "learning_rate": 7.90704682218429e-05, + "loss": 2.9095, + "step": 17990 + }, + { + "epoch": 1.1168291017443666, + "grad_norm": 0.18082320961945966, + "learning_rate": 7.906752976956484e-05, + "loss": 2.8618, + "step": 17991 + }, + { + "epoch": 1.1168911788441245, + "grad_norm": 0.1843008668215771, + "learning_rate": 7.906459116563444e-05, + "loss": 2.9293, + "step": 17992 + }, + { + "epoch": 1.1169532559438824, + "grad_norm": 0.15620447330020779, + "learning_rate": 7.9061652410067e-05, + "loss": 2.8858, + "step": 17993 + }, + { + "epoch": 1.11701533304364, + "grad_norm": 0.15738133104791194, + "learning_rate": 7.905871350287788e-05, + "loss": 2.9812, + "step": 17994 + }, + { + "epoch": 1.117077410143398, + "grad_norm": 0.1713682691854299, + "learning_rate": 7.905577444408238e-05, + "loss": 2.9049, + "step": 17995 + }, + { + "epoch": 1.117139487243156, + "grad_norm": 0.14607986547412716, + "learning_rate": 7.905283523369586e-05, + "loss": 2.9971, + "step": 17996 + }, + { + "epoch": 1.1172015643429138, + "grad_norm": 0.18627003737968167, + "learning_rate": 7.904989587173366e-05, + "loss": 2.8577, + "step": 17997 + }, + { + "epoch": 1.1172636414426718, + "grad_norm": 0.1727540069545282, + "learning_rate": 7.904695635821109e-05, + "loss": 2.8893, + "step": 17998 + }, + { + "epoch": 1.1173257185424297, + "grad_norm": 0.16250195135265424, + "learning_rate": 7.90440166931435e-05, + "loss": 2.9635, + "step": 17999 + }, + { + "epoch": 1.1173877956421876, + "grad_norm": 0.1585048517991062, + "learning_rate": 7.904107687654622e-05, + "loss": 2.9184, + "step": 18000 + }, + { + "epoch": 1.1174498727419455, + "grad_norm": 0.15658037901153318, + "learning_rate": 7.903813690843459e-05, + "loss": 2.8986, + "step": 18001 + }, + { + "epoch": 1.1175119498417034, + "grad_norm": 0.15786497452285847, + "learning_rate": 7.903519678882396e-05, + "loss": 2.9315, + "step": 18002 + }, + { + "epoch": 1.1175740269414614, + "grad_norm": 0.19598684246313744, + "learning_rate": 7.903225651772964e-05, + "loss": 2.8575, + "step": 18003 + }, + { + "epoch": 1.1176361040412193, + "grad_norm": 0.17805262776848416, + "learning_rate": 7.9029316095167e-05, + "loss": 2.991, + "step": 18004 + }, + { + "epoch": 1.1176981811409772, + "grad_norm": 0.19735101230394084, + "learning_rate": 7.902637552115138e-05, + "loss": 2.9054, + "step": 18005 + }, + { + "epoch": 1.117760258240735, + "grad_norm": 0.15154591130387698, + "learning_rate": 7.902343479569811e-05, + "loss": 2.8574, + "step": 18006 + }, + { + "epoch": 1.1178223353404928, + "grad_norm": 0.1886177621170719, + "learning_rate": 7.902049391882252e-05, + "loss": 2.9226, + "step": 18007 + }, + { + "epoch": 1.1178844124402507, + "grad_norm": 0.15262423273271722, + "learning_rate": 7.901755289053998e-05, + "loss": 2.8895, + "step": 18008 + }, + { + "epoch": 1.1179464895400086, + "grad_norm": 0.1526437702856506, + "learning_rate": 7.901461171086581e-05, + "loss": 2.8819, + "step": 18009 + }, + { + "epoch": 1.1180085666397666, + "grad_norm": 0.15272276143948227, + "learning_rate": 7.901167037981539e-05, + "loss": 2.8436, + "step": 18010 + }, + { + "epoch": 1.1180706437395245, + "grad_norm": 0.1514603973857576, + "learning_rate": 7.900872889740402e-05, + "loss": 2.8546, + "step": 18011 + }, + { + "epoch": 1.1181327208392824, + "grad_norm": 0.17941725145226356, + "learning_rate": 7.900578726364709e-05, + "loss": 2.8351, + "step": 18012 + }, + { + "epoch": 1.1181947979390403, + "grad_norm": 0.15612125457023127, + "learning_rate": 7.900284547855991e-05, + "loss": 2.9611, + "step": 18013 + }, + { + "epoch": 1.1182568750387982, + "grad_norm": 0.16917867474813242, + "learning_rate": 7.899990354215785e-05, + "loss": 2.9723, + "step": 18014 + }, + { + "epoch": 1.1183189521385561, + "grad_norm": 0.16977212599390434, + "learning_rate": 7.899696145445625e-05, + "loss": 3.0412, + "step": 18015 + }, + { + "epoch": 1.118381029238314, + "grad_norm": 0.20712841201140533, + "learning_rate": 7.899401921547047e-05, + "loss": 2.8662, + "step": 18016 + }, + { + "epoch": 1.118443106338072, + "grad_norm": 0.16727534486377044, + "learning_rate": 7.899107682521586e-05, + "loss": 2.8798, + "step": 18017 + }, + { + "epoch": 1.1185051834378297, + "grad_norm": 0.17211458457165352, + "learning_rate": 7.898813428370775e-05, + "loss": 2.8748, + "step": 18018 + }, + { + "epoch": 1.1185672605375876, + "grad_norm": 0.16033765358488145, + "learning_rate": 7.898519159096151e-05, + "loss": 2.9325, + "step": 18019 + }, + { + "epoch": 1.1186293376373455, + "grad_norm": 0.20518388291876585, + "learning_rate": 7.89822487469925e-05, + "loss": 2.8992, + "step": 18020 + }, + { + "epoch": 1.1186914147371034, + "grad_norm": 0.19055787686989087, + "learning_rate": 7.897930575181604e-05, + "loss": 2.9012, + "step": 18021 + }, + { + "epoch": 1.1187534918368613, + "grad_norm": 0.1638696365719039, + "learning_rate": 7.897636260544752e-05, + "loss": 2.845, + "step": 18022 + }, + { + "epoch": 1.1188155689366193, + "grad_norm": 0.19712476138836454, + "learning_rate": 7.897341930790229e-05, + "loss": 2.9581, + "step": 18023 + }, + { + "epoch": 1.1188776460363772, + "grad_norm": 0.14339262709228448, + "learning_rate": 7.897047585919569e-05, + "loss": 2.8734, + "step": 18024 + }, + { + "epoch": 1.118939723136135, + "grad_norm": 0.1570981668356382, + "learning_rate": 7.896753225934308e-05, + "loss": 2.8688, + "step": 18025 + }, + { + "epoch": 1.119001800235893, + "grad_norm": 0.14404521875725487, + "learning_rate": 7.896458850835981e-05, + "loss": 2.8865, + "step": 18026 + }, + { + "epoch": 1.119063877335651, + "grad_norm": 0.17844393853832322, + "learning_rate": 7.896164460626128e-05, + "loss": 2.9537, + "step": 18027 + }, + { + "epoch": 1.1191259544354089, + "grad_norm": 0.15832825409957274, + "learning_rate": 7.89587005530628e-05, + "loss": 2.9325, + "step": 18028 + }, + { + "epoch": 1.1191880315351668, + "grad_norm": 0.16874540221167236, + "learning_rate": 7.895575634877977e-05, + "loss": 2.8448, + "step": 18029 + }, + { + "epoch": 1.1192501086349247, + "grad_norm": 0.14992838580538623, + "learning_rate": 7.89528119934275e-05, + "loss": 2.9611, + "step": 18030 + }, + { + "epoch": 1.1193121857346824, + "grad_norm": 0.16131882190435975, + "learning_rate": 7.89498674870214e-05, + "loss": 2.9191, + "step": 18031 + }, + { + "epoch": 1.1193742628344403, + "grad_norm": 0.2092173694681854, + "learning_rate": 7.89469228295768e-05, + "loss": 2.8846, + "step": 18032 + }, + { + "epoch": 1.1194363399341982, + "grad_norm": 0.15176871787106508, + "learning_rate": 7.894397802110909e-05, + "loss": 2.8779, + "step": 18033 + }, + { + "epoch": 1.1194984170339561, + "grad_norm": 0.19608873611666572, + "learning_rate": 7.894103306163363e-05, + "loss": 2.8703, + "step": 18034 + }, + { + "epoch": 1.119560494133714, + "grad_norm": 0.16049391924288642, + "learning_rate": 7.893808795116575e-05, + "loss": 2.947, + "step": 18035 + }, + { + "epoch": 1.119622571233472, + "grad_norm": 0.16868836924310232, + "learning_rate": 7.893514268972087e-05, + "loss": 2.938, + "step": 18036 + }, + { + "epoch": 1.11968464833323, + "grad_norm": 0.17411119472501332, + "learning_rate": 7.893219727731429e-05, + "loss": 2.975, + "step": 18037 + }, + { + "epoch": 1.1197467254329878, + "grad_norm": 0.16054887445508817, + "learning_rate": 7.892925171396144e-05, + "loss": 2.919, + "step": 18038 + }, + { + "epoch": 1.1198088025327457, + "grad_norm": 0.16574117189555154, + "learning_rate": 7.892630599967765e-05, + "loss": 2.9808, + "step": 18039 + }, + { + "epoch": 1.1198708796325036, + "grad_norm": 0.19600616299965526, + "learning_rate": 7.89233601344783e-05, + "loss": 2.8073, + "step": 18040 + }, + { + "epoch": 1.1199329567322616, + "grad_norm": 0.1505364602687723, + "learning_rate": 7.892041411837876e-05, + "loss": 2.8124, + "step": 18041 + }, + { + "epoch": 1.1199950338320193, + "grad_norm": 0.1709758692120098, + "learning_rate": 7.89174679513944e-05, + "loss": 2.8872, + "step": 18042 + }, + { + "epoch": 1.1200571109317772, + "grad_norm": 0.23737473886384106, + "learning_rate": 7.891452163354059e-05, + "loss": 2.896, + "step": 18043 + }, + { + "epoch": 1.120119188031535, + "grad_norm": 0.2230109267652967, + "learning_rate": 7.891157516483269e-05, + "loss": 2.9688, + "step": 18044 + }, + { + "epoch": 1.120181265131293, + "grad_norm": 0.1763211700376006, + "learning_rate": 7.89086285452861e-05, + "loss": 2.9141, + "step": 18045 + }, + { + "epoch": 1.120243342231051, + "grad_norm": 0.18263195610506483, + "learning_rate": 7.890568177491618e-05, + "loss": 2.9496, + "step": 18046 + }, + { + "epoch": 1.1203054193308088, + "grad_norm": 0.15779153240771643, + "learning_rate": 7.890273485373828e-05, + "loss": 2.9388, + "step": 18047 + }, + { + "epoch": 1.1203674964305668, + "grad_norm": 0.1887059569920581, + "learning_rate": 7.88997877817678e-05, + "loss": 2.9143, + "step": 18048 + }, + { + "epoch": 1.1204295735303247, + "grad_norm": 0.15205703555886566, + "learning_rate": 7.889684055902013e-05, + "loss": 2.8599, + "step": 18049 + }, + { + "epoch": 1.1204916506300826, + "grad_norm": 0.19992534764125947, + "learning_rate": 7.889389318551061e-05, + "loss": 2.8949, + "step": 18050 + }, + { + "epoch": 1.1205537277298405, + "grad_norm": 0.1645131450139718, + "learning_rate": 7.889094566125465e-05, + "loss": 2.9481, + "step": 18051 + }, + { + "epoch": 1.1206158048295984, + "grad_norm": 0.1780395403192757, + "learning_rate": 7.88879979862676e-05, + "loss": 2.9453, + "step": 18052 + }, + { + "epoch": 1.1206778819293564, + "grad_norm": 0.17242917214367112, + "learning_rate": 7.888505016056487e-05, + "loss": 2.9386, + "step": 18053 + }, + { + "epoch": 1.1207399590291143, + "grad_norm": 0.2033951961316937, + "learning_rate": 7.888210218416182e-05, + "loss": 2.9284, + "step": 18054 + }, + { + "epoch": 1.120802036128872, + "grad_norm": 0.1848576443594152, + "learning_rate": 7.88791540570738e-05, + "loss": 2.8935, + "step": 18055 + }, + { + "epoch": 1.1208641132286299, + "grad_norm": 0.18478673528035516, + "learning_rate": 7.887620577931624e-05, + "loss": 2.8462, + "step": 18056 + }, + { + "epoch": 1.1209261903283878, + "grad_norm": 0.18651169345492902, + "learning_rate": 7.88732573509045e-05, + "loss": 2.933, + "step": 18057 + }, + { + "epoch": 1.1209882674281457, + "grad_norm": 0.15770376538557015, + "learning_rate": 7.887030877185397e-05, + "loss": 2.8805, + "step": 18058 + }, + { + "epoch": 1.1210503445279036, + "grad_norm": 0.18044653503952504, + "learning_rate": 7.886736004218005e-05, + "loss": 2.9066, + "step": 18059 + }, + { + "epoch": 1.1211124216276616, + "grad_norm": 0.2037778139291219, + "learning_rate": 7.886441116189809e-05, + "loss": 2.8858, + "step": 18060 + }, + { + "epoch": 1.1211744987274195, + "grad_norm": 0.24738761821291233, + "learning_rate": 7.886146213102349e-05, + "loss": 3.0069, + "step": 18061 + }, + { + "epoch": 1.1212365758271774, + "grad_norm": 0.17321287528620422, + "learning_rate": 7.885851294957162e-05, + "loss": 2.8351, + "step": 18062 + }, + { + "epoch": 1.1212986529269353, + "grad_norm": 0.18875952712387403, + "learning_rate": 7.88555636175579e-05, + "loss": 2.9338, + "step": 18063 + }, + { + "epoch": 1.1213607300266932, + "grad_norm": 0.20475751247294513, + "learning_rate": 7.885261413499772e-05, + "loss": 2.9705, + "step": 18064 + }, + { + "epoch": 1.1214228071264511, + "grad_norm": 0.18202907290814455, + "learning_rate": 7.884966450190641e-05, + "loss": 2.8782, + "step": 18065 + }, + { + "epoch": 1.1214848842262088, + "grad_norm": 0.14900628712409922, + "learning_rate": 7.88467147182994e-05, + "loss": 2.9399, + "step": 18066 + }, + { + "epoch": 1.1215469613259668, + "grad_norm": 0.17599827296963463, + "learning_rate": 7.884376478419212e-05, + "loss": 2.9248, + "step": 18067 + }, + { + "epoch": 1.1216090384257247, + "grad_norm": 0.14391568912982372, + "learning_rate": 7.884081469959988e-05, + "loss": 2.9033, + "step": 18068 + }, + { + "epoch": 1.1216711155254826, + "grad_norm": 0.15868187508937556, + "learning_rate": 7.88378644645381e-05, + "loss": 2.9746, + "step": 18069 + }, + { + "epoch": 1.1217331926252405, + "grad_norm": 0.16600315944770158, + "learning_rate": 7.883491407902219e-05, + "loss": 2.9776, + "step": 18070 + }, + { + "epoch": 1.1217952697249984, + "grad_norm": 0.16398253081340167, + "learning_rate": 7.883196354306755e-05, + "loss": 2.9393, + "step": 18071 + }, + { + "epoch": 1.1218573468247564, + "grad_norm": 0.2251845748472102, + "learning_rate": 7.882901285668955e-05, + "loss": 2.8687, + "step": 18072 + }, + { + "epoch": 1.1219194239245143, + "grad_norm": 0.1600275918519744, + "learning_rate": 7.882606201990357e-05, + "loss": 2.9124, + "step": 18073 + }, + { + "epoch": 1.1219815010242722, + "grad_norm": 0.16617106285488595, + "learning_rate": 7.882311103272503e-05, + "loss": 2.8466, + "step": 18074 + }, + { + "epoch": 1.12204357812403, + "grad_norm": 0.15043311323942163, + "learning_rate": 7.882015989516935e-05, + "loss": 2.9089, + "step": 18075 + }, + { + "epoch": 1.122105655223788, + "grad_norm": 0.16486320493796003, + "learning_rate": 7.881720860725188e-05, + "loss": 2.8952, + "step": 18076 + }, + { + "epoch": 1.122167732323546, + "grad_norm": 0.15349303678746612, + "learning_rate": 7.881425716898804e-05, + "loss": 2.8837, + "step": 18077 + }, + { + "epoch": 1.1222298094233039, + "grad_norm": 0.16352351527307654, + "learning_rate": 7.881130558039325e-05, + "loss": 2.9095, + "step": 18078 + }, + { + "epoch": 1.1222918865230616, + "grad_norm": 0.14736576672297705, + "learning_rate": 7.880835384148286e-05, + "loss": 2.8553, + "step": 18079 + }, + { + "epoch": 1.1223539636228195, + "grad_norm": 0.14414262718526213, + "learning_rate": 7.880540195227229e-05, + "loss": 2.8706, + "step": 18080 + }, + { + "epoch": 1.1224160407225774, + "grad_norm": 0.41340936575132203, + "learning_rate": 7.880244991277694e-05, + "loss": 2.9263, + "step": 18081 + }, + { + "epoch": 1.1224781178223353, + "grad_norm": 0.146334145122221, + "learning_rate": 7.879949772301223e-05, + "loss": 2.9393, + "step": 18082 + }, + { + "epoch": 1.1225401949220932, + "grad_norm": 0.23290014345411156, + "learning_rate": 7.879654538299356e-05, + "loss": 2.8889, + "step": 18083 + }, + { + "epoch": 1.1226022720218511, + "grad_norm": 0.15400179883909534, + "learning_rate": 7.87935928927363e-05, + "loss": 2.9361, + "step": 18084 + }, + { + "epoch": 1.122664349121609, + "grad_norm": 0.19393167668683864, + "learning_rate": 7.879064025225589e-05, + "loss": 2.9257, + "step": 18085 + }, + { + "epoch": 1.122726426221367, + "grad_norm": 0.1461675014076009, + "learning_rate": 7.878768746156771e-05, + "loss": 2.9365, + "step": 18086 + }, + { + "epoch": 1.122788503321125, + "grad_norm": 0.15449901208862632, + "learning_rate": 7.87847345206872e-05, + "loss": 2.9433, + "step": 18087 + }, + { + "epoch": 1.1228505804208828, + "grad_norm": 0.15624805687649798, + "learning_rate": 7.87817814296297e-05, + "loss": 2.953, + "step": 18088 + }, + { + "epoch": 1.1229126575206407, + "grad_norm": 0.16511794468002397, + "learning_rate": 7.87788281884107e-05, + "loss": 2.8697, + "step": 18089 + }, + { + "epoch": 1.1229747346203984, + "grad_norm": 0.15786284479320611, + "learning_rate": 7.877587479704554e-05, + "loss": 2.9209, + "step": 18090 + }, + { + "epoch": 1.1230368117201563, + "grad_norm": 0.1562067568067639, + "learning_rate": 7.877292125554967e-05, + "loss": 2.8707, + "step": 18091 + }, + { + "epoch": 1.1230988888199143, + "grad_norm": 0.2481498129206384, + "learning_rate": 7.876996756393848e-05, + "loss": 2.9746, + "step": 18092 + }, + { + "epoch": 1.1231609659196722, + "grad_norm": 0.17953835392800482, + "learning_rate": 7.876701372222739e-05, + "loss": 2.9293, + "step": 18093 + }, + { + "epoch": 1.12322304301943, + "grad_norm": 0.16606045846865058, + "learning_rate": 7.87640597304318e-05, + "loss": 2.9223, + "step": 18094 + }, + { + "epoch": 1.123285120119188, + "grad_norm": 0.1962484725813742, + "learning_rate": 7.876110558856713e-05, + "loss": 2.943, + "step": 18095 + }, + { + "epoch": 1.123347197218946, + "grad_norm": 0.17111956323265798, + "learning_rate": 7.87581512966488e-05, + "loss": 3.0084, + "step": 18096 + }, + { + "epoch": 1.1234092743187039, + "grad_norm": 0.1711692571592177, + "learning_rate": 7.87551968546922e-05, + "loss": 2.9281, + "step": 18097 + }, + { + "epoch": 1.1234713514184618, + "grad_norm": 0.16996922874523532, + "learning_rate": 7.875224226271276e-05, + "loss": 2.8282, + "step": 18098 + }, + { + "epoch": 1.1235334285182197, + "grad_norm": 0.35104825238163795, + "learning_rate": 7.87492875207259e-05, + "loss": 2.9927, + "step": 18099 + }, + { + "epoch": 1.1235955056179776, + "grad_norm": 0.21849278245720025, + "learning_rate": 7.874633262874702e-05, + "loss": 2.8889, + "step": 18100 + }, + { + "epoch": 1.1236575827177355, + "grad_norm": 0.24559319012324837, + "learning_rate": 7.874337758679155e-05, + "loss": 2.8405, + "step": 18101 + }, + { + "epoch": 1.1237196598174934, + "grad_norm": 0.21830119345671198, + "learning_rate": 7.874042239487489e-05, + "loss": 2.9194, + "step": 18102 + }, + { + "epoch": 1.1237817369172511, + "grad_norm": 0.20986553952443013, + "learning_rate": 7.873746705301248e-05, + "loss": 2.874, + "step": 18103 + }, + { + "epoch": 1.123843814017009, + "grad_norm": 0.20930585466738433, + "learning_rate": 7.873451156121972e-05, + "loss": 2.8233, + "step": 18104 + }, + { + "epoch": 1.123905891116767, + "grad_norm": 0.19961048394658457, + "learning_rate": 7.873155591951205e-05, + "loss": 2.8283, + "step": 18105 + }, + { + "epoch": 1.123967968216525, + "grad_norm": 0.2571032333451876, + "learning_rate": 7.872860012790486e-05, + "loss": 2.8832, + "step": 18106 + }, + { + "epoch": 1.1240300453162828, + "grad_norm": 0.22651650520948888, + "learning_rate": 7.872564418641359e-05, + "loss": 2.9755, + "step": 18107 + }, + { + "epoch": 1.1240921224160407, + "grad_norm": 0.1976379996758774, + "learning_rate": 7.872268809505367e-05, + "loss": 2.9202, + "step": 18108 + }, + { + "epoch": 1.1241541995157986, + "grad_norm": 0.2309617007793636, + "learning_rate": 7.871973185384051e-05, + "loss": 2.8965, + "step": 18109 + }, + { + "epoch": 1.1242162766155566, + "grad_norm": 0.2169822724941575, + "learning_rate": 7.871677546278955e-05, + "loss": 2.9966, + "step": 18110 + }, + { + "epoch": 1.1242783537153145, + "grad_norm": 0.20118019864537848, + "learning_rate": 7.871381892191619e-05, + "loss": 2.8574, + "step": 18111 + }, + { + "epoch": 1.1243404308150724, + "grad_norm": 0.2049827717310481, + "learning_rate": 7.871086223123587e-05, + "loss": 3.0898, + "step": 18112 + }, + { + "epoch": 1.1244025079148303, + "grad_norm": 0.1941316287836559, + "learning_rate": 7.870790539076402e-05, + "loss": 2.975, + "step": 18113 + }, + { + "epoch": 1.124464585014588, + "grad_norm": 0.17213086568517255, + "learning_rate": 7.870494840051604e-05, + "loss": 2.8882, + "step": 18114 + }, + { + "epoch": 1.124526662114346, + "grad_norm": 0.19401605435741934, + "learning_rate": 7.87019912605074e-05, + "loss": 2.9871, + "step": 18115 + }, + { + "epoch": 1.1245887392141038, + "grad_norm": 0.1728848139789972, + "learning_rate": 7.869903397075348e-05, + "loss": 2.9224, + "step": 18116 + }, + { + "epoch": 1.1246508163138618, + "grad_norm": 0.16543812443140157, + "learning_rate": 7.869607653126973e-05, + "loss": 2.9095, + "step": 18117 + }, + { + "epoch": 1.1247128934136197, + "grad_norm": 0.26667565235009155, + "learning_rate": 7.86931189420716e-05, + "loss": 2.8691, + "step": 18118 + }, + { + "epoch": 1.1247749705133776, + "grad_norm": 0.16940006616962958, + "learning_rate": 7.86901612031745e-05, + "loss": 2.9326, + "step": 18119 + }, + { + "epoch": 1.1248370476131355, + "grad_norm": 0.16691080977156086, + "learning_rate": 7.868720331459387e-05, + "loss": 2.928, + "step": 18120 + }, + { + "epoch": 1.1248991247128934, + "grad_norm": 0.170916970474245, + "learning_rate": 7.868424527634511e-05, + "loss": 2.7864, + "step": 18121 + }, + { + "epoch": 1.1249612018126514, + "grad_norm": 0.1902957836727162, + "learning_rate": 7.86812870884437e-05, + "loss": 2.9047, + "step": 18122 + }, + { + "epoch": 1.1250232789124093, + "grad_norm": 0.17029684174153467, + "learning_rate": 7.867832875090503e-05, + "loss": 2.8994, + "step": 18123 + }, + { + "epoch": 1.1250853560121672, + "grad_norm": 0.17520228513453243, + "learning_rate": 7.867537026374458e-05, + "loss": 2.8916, + "step": 18124 + }, + { + "epoch": 1.125147433111925, + "grad_norm": 0.16262001021402814, + "learning_rate": 7.867241162697776e-05, + "loss": 2.9678, + "step": 18125 + }, + { + "epoch": 1.125209510211683, + "grad_norm": 0.17028872582100038, + "learning_rate": 7.866945284062001e-05, + "loss": 2.9895, + "step": 18126 + }, + { + "epoch": 1.1252715873114407, + "grad_norm": 0.20287048460212556, + "learning_rate": 7.866649390468677e-05, + "loss": 2.9337, + "step": 18127 + }, + { + "epoch": 1.1253336644111986, + "grad_norm": 0.17115497952314104, + "learning_rate": 7.866353481919346e-05, + "loss": 2.9074, + "step": 18128 + }, + { + "epoch": 1.1253957415109566, + "grad_norm": 0.17420479329403799, + "learning_rate": 7.866057558415554e-05, + "loss": 3.0217, + "step": 18129 + }, + { + "epoch": 1.1254578186107145, + "grad_norm": 0.16770432417581307, + "learning_rate": 7.865761619958843e-05, + "loss": 2.915, + "step": 18130 + }, + { + "epoch": 1.1255198957104724, + "grad_norm": 0.1503687012780996, + "learning_rate": 7.865465666550757e-05, + "loss": 2.8453, + "step": 18131 + }, + { + "epoch": 1.1255819728102303, + "grad_norm": 0.15822819522909373, + "learning_rate": 7.865169698192842e-05, + "loss": 2.9503, + "step": 18132 + }, + { + "epoch": 1.1256440499099882, + "grad_norm": 0.1897597437104646, + "learning_rate": 7.864873714886643e-05, + "loss": 2.9181, + "step": 18133 + }, + { + "epoch": 1.1257061270097461, + "grad_norm": 0.18513740733397208, + "learning_rate": 7.864577716633701e-05, + "loss": 2.948, + "step": 18134 + }, + { + "epoch": 1.125768204109504, + "grad_norm": 0.23030508016489876, + "learning_rate": 7.864281703435561e-05, + "loss": 2.9596, + "step": 18135 + }, + { + "epoch": 1.125830281209262, + "grad_norm": 0.1998449842401468, + "learning_rate": 7.863985675293768e-05, + "loss": 2.9437, + "step": 18136 + }, + { + "epoch": 1.1258923583090197, + "grad_norm": 0.1742807079273923, + "learning_rate": 7.863689632209868e-05, + "loss": 2.86, + "step": 18137 + }, + { + "epoch": 1.1259544354087776, + "grad_norm": 0.1869121017273488, + "learning_rate": 7.863393574185403e-05, + "loss": 3.0414, + "step": 18138 + }, + { + "epoch": 1.1260165125085355, + "grad_norm": 0.16346840353855938, + "learning_rate": 7.863097501221917e-05, + "loss": 2.9371, + "step": 18139 + }, + { + "epoch": 1.1260785896082934, + "grad_norm": 0.16925632646948496, + "learning_rate": 7.862801413320958e-05, + "loss": 2.9356, + "step": 18140 + }, + { + "epoch": 1.1261406667080514, + "grad_norm": 0.19022247582727525, + "learning_rate": 7.86250531048407e-05, + "loss": 2.9517, + "step": 18141 + }, + { + "epoch": 1.1262027438078093, + "grad_norm": 0.1536154245468363, + "learning_rate": 7.862209192712795e-05, + "loss": 2.8362, + "step": 18142 + }, + { + "epoch": 1.1262648209075672, + "grad_norm": 0.202763409823152, + "learning_rate": 7.861913060008682e-05, + "loss": 2.8608, + "step": 18143 + }, + { + "epoch": 1.126326898007325, + "grad_norm": 0.2009950064007979, + "learning_rate": 7.861616912373271e-05, + "loss": 2.9691, + "step": 18144 + }, + { + "epoch": 1.126388975107083, + "grad_norm": 0.15995684864188686, + "learning_rate": 7.861320749808111e-05, + "loss": 2.9577, + "step": 18145 + }, + { + "epoch": 1.126451052206841, + "grad_norm": 0.17389888073472992, + "learning_rate": 7.861024572314746e-05, + "loss": 2.8829, + "step": 18146 + }, + { + "epoch": 1.1265131293065989, + "grad_norm": 0.17212196510107716, + "learning_rate": 7.860728379894722e-05, + "loss": 2.9104, + "step": 18147 + }, + { + "epoch": 1.1265752064063568, + "grad_norm": 0.16284508537392084, + "learning_rate": 7.860432172549583e-05, + "loss": 2.9005, + "step": 18148 + }, + { + "epoch": 1.1266372835061147, + "grad_norm": 0.14718008769863936, + "learning_rate": 7.860135950280876e-05, + "loss": 2.8893, + "step": 18149 + }, + { + "epoch": 1.1266993606058726, + "grad_norm": 0.1874303390321788, + "learning_rate": 7.859839713090144e-05, + "loss": 2.8891, + "step": 18150 + }, + { + "epoch": 1.1267614377056303, + "grad_norm": 0.17829378354801745, + "learning_rate": 7.859543460978935e-05, + "loss": 2.9033, + "step": 18151 + }, + { + "epoch": 1.1268235148053882, + "grad_norm": 0.18217880751744364, + "learning_rate": 7.859247193948793e-05, + "loss": 2.989, + "step": 18152 + }, + { + "epoch": 1.1268855919051461, + "grad_norm": 0.16295862082276638, + "learning_rate": 7.858950912001264e-05, + "loss": 2.8785, + "step": 18153 + }, + { + "epoch": 1.126947669004904, + "grad_norm": 0.15987088785967524, + "learning_rate": 7.858654615137895e-05, + "loss": 2.91, + "step": 18154 + }, + { + "epoch": 1.127009746104662, + "grad_norm": 0.15290968875031505, + "learning_rate": 7.858358303360229e-05, + "loss": 2.8723, + "step": 18155 + }, + { + "epoch": 1.12707182320442, + "grad_norm": 0.1520825494349757, + "learning_rate": 7.858061976669815e-05, + "loss": 2.9074, + "step": 18156 + }, + { + "epoch": 1.1271339003041778, + "grad_norm": 0.1579895343747326, + "learning_rate": 7.857765635068198e-05, + "loss": 2.8052, + "step": 18157 + }, + { + "epoch": 1.1271959774039357, + "grad_norm": 0.1406201653179268, + "learning_rate": 7.857469278556924e-05, + "loss": 2.9129, + "step": 18158 + }, + { + "epoch": 1.1272580545036937, + "grad_norm": 0.18339336904198816, + "learning_rate": 7.85717290713754e-05, + "loss": 2.941, + "step": 18159 + }, + { + "epoch": 1.1273201316034516, + "grad_norm": 0.1528810538257494, + "learning_rate": 7.85687652081159e-05, + "loss": 2.8917, + "step": 18160 + }, + { + "epoch": 1.1273822087032093, + "grad_norm": 0.1586531607684201, + "learning_rate": 7.856580119580622e-05, + "loss": 2.8634, + "step": 18161 + }, + { + "epoch": 1.1274442858029672, + "grad_norm": 0.1629973359332728, + "learning_rate": 7.856283703446183e-05, + "loss": 2.8071, + "step": 18162 + }, + { + "epoch": 1.127506362902725, + "grad_norm": 0.16078176694810986, + "learning_rate": 7.855987272409817e-05, + "loss": 2.8992, + "step": 18163 + }, + { + "epoch": 1.127568440002483, + "grad_norm": 0.1694520341144234, + "learning_rate": 7.855690826473072e-05, + "loss": 2.9128, + "step": 18164 + }, + { + "epoch": 1.127630517102241, + "grad_norm": 0.15445621930208012, + "learning_rate": 7.855394365637496e-05, + "loss": 2.8985, + "step": 18165 + }, + { + "epoch": 1.1276925942019989, + "grad_norm": 0.21185388018129533, + "learning_rate": 7.855097889904635e-05, + "loss": 3.0211, + "step": 18166 + }, + { + "epoch": 1.1277546713017568, + "grad_norm": 0.20289149067150192, + "learning_rate": 7.854801399276033e-05, + "loss": 2.9442, + "step": 18167 + }, + { + "epoch": 1.1278167484015147, + "grad_norm": 0.17426857171773436, + "learning_rate": 7.854504893753241e-05, + "loss": 2.8961, + "step": 18168 + }, + { + "epoch": 1.1278788255012726, + "grad_norm": 0.2045767513629059, + "learning_rate": 7.854208373337804e-05, + "loss": 2.9024, + "step": 18169 + }, + { + "epoch": 1.1279409026010305, + "grad_norm": 0.24022697034139526, + "learning_rate": 7.853911838031267e-05, + "loss": 2.9308, + "step": 18170 + }, + { + "epoch": 1.1280029797007884, + "grad_norm": 0.19689282987480294, + "learning_rate": 7.85361528783518e-05, + "loss": 2.9325, + "step": 18171 + }, + { + "epoch": 1.1280650568005464, + "grad_norm": 0.15792352788454028, + "learning_rate": 7.853318722751088e-05, + "loss": 2.8614, + "step": 18172 + }, + { + "epoch": 1.1281271339003043, + "grad_norm": 0.21777433557710088, + "learning_rate": 7.853022142780542e-05, + "loss": 2.942, + "step": 18173 + }, + { + "epoch": 1.1281892110000622, + "grad_norm": 0.16128587677595652, + "learning_rate": 7.852725547925086e-05, + "loss": 2.8703, + "step": 18174 + }, + { + "epoch": 1.12825128809982, + "grad_norm": 0.1677237376005791, + "learning_rate": 7.852428938186269e-05, + "loss": 2.8454, + "step": 18175 + }, + { + "epoch": 1.1283133651995778, + "grad_norm": 0.15940565324031103, + "learning_rate": 7.852132313565636e-05, + "loss": 2.8594, + "step": 18176 + }, + { + "epoch": 1.1283754422993357, + "grad_norm": 0.20385446146568642, + "learning_rate": 7.851835674064738e-05, + "loss": 2.8613, + "step": 18177 + }, + { + "epoch": 1.1284375193990936, + "grad_norm": 0.21443087179212927, + "learning_rate": 7.851539019685122e-05, + "loss": 2.8497, + "step": 18178 + }, + { + "epoch": 1.1284995964988516, + "grad_norm": 0.19217745475568423, + "learning_rate": 7.851242350428332e-05, + "loss": 2.8886, + "step": 18179 + }, + { + "epoch": 1.1285616735986095, + "grad_norm": 0.26140438957031153, + "learning_rate": 7.85094566629592e-05, + "loss": 2.9034, + "step": 18180 + }, + { + "epoch": 1.1286237506983674, + "grad_norm": 0.17136535076603096, + "learning_rate": 7.850648967289432e-05, + "loss": 2.8627, + "step": 18181 + }, + { + "epoch": 1.1286858277981253, + "grad_norm": 0.18641470690886394, + "learning_rate": 7.850352253410415e-05, + "loss": 2.9322, + "step": 18182 + }, + { + "epoch": 1.1287479048978832, + "grad_norm": 0.22649089734099911, + "learning_rate": 7.850055524660421e-05, + "loss": 2.8677, + "step": 18183 + }, + { + "epoch": 1.1288099819976412, + "grad_norm": 0.17394983659673813, + "learning_rate": 7.849758781040994e-05, + "loss": 2.9562, + "step": 18184 + }, + { + "epoch": 1.1288720590973989, + "grad_norm": 0.19938448949132284, + "learning_rate": 7.849462022553685e-05, + "loss": 2.9088, + "step": 18185 + }, + { + "epoch": 1.1289341361971568, + "grad_norm": 0.1594750291469097, + "learning_rate": 7.849165249200039e-05, + "loss": 2.9143, + "step": 18186 + }, + { + "epoch": 1.1289962132969147, + "grad_norm": 0.17692107358766315, + "learning_rate": 7.848868460981606e-05, + "loss": 2.9118, + "step": 18187 + }, + { + "epoch": 1.1290582903966726, + "grad_norm": 0.2056130381267351, + "learning_rate": 7.848571657899936e-05, + "loss": 2.8093, + "step": 18188 + }, + { + "epoch": 1.1291203674964305, + "grad_norm": 0.16907276461742507, + "learning_rate": 7.848274839956575e-05, + "loss": 2.9341, + "step": 18189 + }, + { + "epoch": 1.1291824445961884, + "grad_norm": 0.19350577120282006, + "learning_rate": 7.847978007153075e-05, + "loss": 2.9748, + "step": 18190 + }, + { + "epoch": 1.1292445216959464, + "grad_norm": 0.21968017936273523, + "learning_rate": 7.84768115949098e-05, + "loss": 2.9138, + "step": 18191 + }, + { + "epoch": 1.1293065987957043, + "grad_norm": 0.19571918109503408, + "learning_rate": 7.847384296971842e-05, + "loss": 2.9119, + "step": 18192 + }, + { + "epoch": 1.1293686758954622, + "grad_norm": 0.19798071770145934, + "learning_rate": 7.847087419597208e-05, + "loss": 2.8707, + "step": 18193 + }, + { + "epoch": 1.12943075299522, + "grad_norm": 0.20561948802192542, + "learning_rate": 7.846790527368629e-05, + "loss": 2.9331, + "step": 18194 + }, + { + "epoch": 1.129492830094978, + "grad_norm": 0.17969068234154945, + "learning_rate": 7.846493620287652e-05, + "loss": 2.9171, + "step": 18195 + }, + { + "epoch": 1.129554907194736, + "grad_norm": 0.1525804014275387, + "learning_rate": 7.846196698355828e-05, + "loss": 2.9448, + "step": 18196 + }, + { + "epoch": 1.1296169842944939, + "grad_norm": 0.16499961291253318, + "learning_rate": 7.845899761574704e-05, + "loss": 2.8347, + "step": 18197 + }, + { + "epoch": 1.1296790613942518, + "grad_norm": 0.2168498618868053, + "learning_rate": 7.84560280994583e-05, + "loss": 2.909, + "step": 18198 + }, + { + "epoch": 1.1297411384940095, + "grad_norm": 0.17535144924874832, + "learning_rate": 7.845305843470756e-05, + "loss": 2.9191, + "step": 18199 + }, + { + "epoch": 1.1298032155937674, + "grad_norm": 0.2125323369161473, + "learning_rate": 7.845008862151029e-05, + "loss": 2.9519, + "step": 18200 + }, + { + "epoch": 1.1298652926935253, + "grad_norm": 0.17981459435140584, + "learning_rate": 7.844711865988201e-05, + "loss": 2.9199, + "step": 18201 + }, + { + "epoch": 1.1299273697932832, + "grad_norm": 0.19822433466458672, + "learning_rate": 7.844414854983821e-05, + "loss": 2.9415, + "step": 18202 + }, + { + "epoch": 1.1299894468930411, + "grad_norm": 0.20987181435371724, + "learning_rate": 7.844117829139439e-05, + "loss": 2.8106, + "step": 18203 + }, + { + "epoch": 1.130051523992799, + "grad_norm": 0.17750514622400823, + "learning_rate": 7.843820788456602e-05, + "loss": 2.8948, + "step": 18204 + }, + { + "epoch": 1.130113601092557, + "grad_norm": 0.16157977794845324, + "learning_rate": 7.843523732936862e-05, + "loss": 2.9853, + "step": 18205 + }, + { + "epoch": 1.130175678192315, + "grad_norm": 0.17073379760425314, + "learning_rate": 7.843226662581768e-05, + "loss": 2.8543, + "step": 18206 + }, + { + "epoch": 1.1302377552920728, + "grad_norm": 0.1968890457592517, + "learning_rate": 7.842929577392872e-05, + "loss": 2.874, + "step": 18207 + }, + { + "epoch": 1.1302998323918307, + "grad_norm": 0.20001819498029053, + "learning_rate": 7.842632477371722e-05, + "loss": 2.9549, + "step": 18208 + }, + { + "epoch": 1.1303619094915884, + "grad_norm": 0.17769363481354786, + "learning_rate": 7.842335362519868e-05, + "loss": 2.8156, + "step": 18209 + }, + { + "epoch": 1.1304239865913464, + "grad_norm": 0.17497766982389573, + "learning_rate": 7.842038232838859e-05, + "loss": 2.8908, + "step": 18210 + }, + { + "epoch": 1.1304860636911043, + "grad_norm": 0.23474614638577032, + "learning_rate": 7.841741088330248e-05, + "loss": 2.8672, + "step": 18211 + }, + { + "epoch": 1.1305481407908622, + "grad_norm": 0.18267654025612376, + "learning_rate": 7.841443928995584e-05, + "loss": 2.9653, + "step": 18212 + }, + { + "epoch": 1.13061021789062, + "grad_norm": 0.14541520621687637, + "learning_rate": 7.841146754836417e-05, + "loss": 2.9107, + "step": 18213 + }, + { + "epoch": 1.130672294990378, + "grad_norm": 0.1661437269653286, + "learning_rate": 7.840849565854298e-05, + "loss": 2.9343, + "step": 18214 + }, + { + "epoch": 1.130734372090136, + "grad_norm": 0.16008055456578857, + "learning_rate": 7.840552362050776e-05, + "loss": 2.9349, + "step": 18215 + }, + { + "epoch": 1.1307964491898939, + "grad_norm": 0.19174339807527133, + "learning_rate": 7.840255143427404e-05, + "loss": 2.9099, + "step": 18216 + }, + { + "epoch": 1.1308585262896518, + "grad_norm": 0.17184497831148066, + "learning_rate": 7.839957909985731e-05, + "loss": 2.9221, + "step": 18217 + }, + { + "epoch": 1.1309206033894097, + "grad_norm": 0.1763718643497764, + "learning_rate": 7.839660661727307e-05, + "loss": 2.8928, + "step": 18218 + }, + { + "epoch": 1.1309826804891676, + "grad_norm": 0.18461449916668143, + "learning_rate": 7.839363398653685e-05, + "loss": 2.9214, + "step": 18219 + }, + { + "epoch": 1.1310447575889255, + "grad_norm": 0.16884166647334925, + "learning_rate": 7.839066120766416e-05, + "loss": 2.8868, + "step": 18220 + }, + { + "epoch": 1.1311068346886834, + "grad_norm": 0.1757665151142347, + "learning_rate": 7.838768828067047e-05, + "loss": 2.8326, + "step": 18221 + }, + { + "epoch": 1.1311689117884414, + "grad_norm": 0.1618831906492593, + "learning_rate": 7.838471520557133e-05, + "loss": 2.9391, + "step": 18222 + }, + { + "epoch": 1.131230988888199, + "grad_norm": 0.1693614399327708, + "learning_rate": 7.838174198238227e-05, + "loss": 2.8875, + "step": 18223 + }, + { + "epoch": 1.131293065987957, + "grad_norm": 0.16764767032900887, + "learning_rate": 7.837876861111874e-05, + "loss": 2.8502, + "step": 18224 + }, + { + "epoch": 1.131355143087715, + "grad_norm": 0.19043686682054692, + "learning_rate": 7.837579509179628e-05, + "loss": 2.8393, + "step": 18225 + }, + { + "epoch": 1.1314172201874728, + "grad_norm": 0.16583886553851315, + "learning_rate": 7.837282142443043e-05, + "loss": 2.9618, + "step": 18226 + }, + { + "epoch": 1.1314792972872307, + "grad_norm": 0.18340002146596482, + "learning_rate": 7.836984760903668e-05, + "loss": 2.894, + "step": 18227 + }, + { + "epoch": 1.1315413743869887, + "grad_norm": 0.16146155259296754, + "learning_rate": 7.836687364563053e-05, + "loss": 2.8683, + "step": 18228 + }, + { + "epoch": 1.1316034514867466, + "grad_norm": 0.17955059558166245, + "learning_rate": 7.836389953422754e-05, + "loss": 2.9364, + "step": 18229 + }, + { + "epoch": 1.1316655285865045, + "grad_norm": 0.16909978673103448, + "learning_rate": 7.836092527484318e-05, + "loss": 2.9074, + "step": 18230 + }, + { + "epoch": 1.1317276056862624, + "grad_norm": 0.16830598998202137, + "learning_rate": 7.8357950867493e-05, + "loss": 2.918, + "step": 18231 + }, + { + "epoch": 1.1317896827860203, + "grad_norm": 0.17590626403759743, + "learning_rate": 7.835497631219251e-05, + "loss": 2.9263, + "step": 18232 + }, + { + "epoch": 1.131851759885778, + "grad_norm": 0.1687971579038987, + "learning_rate": 7.83520016089572e-05, + "loss": 2.9311, + "step": 18233 + }, + { + "epoch": 1.131913836985536, + "grad_norm": 0.16404588365423314, + "learning_rate": 7.834902675780263e-05, + "loss": 2.9128, + "step": 18234 + }, + { + "epoch": 1.1319759140852939, + "grad_norm": 0.2403147260689286, + "learning_rate": 7.834605175874432e-05, + "loss": 2.8881, + "step": 18235 + }, + { + "epoch": 1.1320379911850518, + "grad_norm": 0.1556474511677695, + "learning_rate": 7.834307661179776e-05, + "loss": 2.9249, + "step": 18236 + }, + { + "epoch": 1.1321000682848097, + "grad_norm": 0.16335114028221492, + "learning_rate": 7.834010131697849e-05, + "loss": 2.8656, + "step": 18237 + }, + { + "epoch": 1.1321621453845676, + "grad_norm": 0.18496405771074817, + "learning_rate": 7.833712587430204e-05, + "loss": 2.9331, + "step": 18238 + }, + { + "epoch": 1.1322242224843255, + "grad_norm": 0.18753400136692983, + "learning_rate": 7.83341502837839e-05, + "loss": 2.9452, + "step": 18239 + }, + { + "epoch": 1.1322862995840834, + "grad_norm": 0.15767822785394844, + "learning_rate": 7.833117454543964e-05, + "loss": 2.9255, + "step": 18240 + }, + { + "epoch": 1.1323483766838414, + "grad_norm": 0.17908888078259652, + "learning_rate": 7.832819865928477e-05, + "loss": 2.8759, + "step": 18241 + }, + { + "epoch": 1.1324104537835993, + "grad_norm": 0.15094141606568043, + "learning_rate": 7.83252226253348e-05, + "loss": 2.8385, + "step": 18242 + }, + { + "epoch": 1.1324725308833572, + "grad_norm": 0.17249163098523876, + "learning_rate": 7.832224644360527e-05, + "loss": 2.9099, + "step": 18243 + }, + { + "epoch": 1.1325346079831151, + "grad_norm": 0.1735251067970364, + "learning_rate": 7.831927011411171e-05, + "loss": 2.7919, + "step": 18244 + }, + { + "epoch": 1.132596685082873, + "grad_norm": 0.17047169402226298, + "learning_rate": 7.831629363686965e-05, + "loss": 2.8684, + "step": 18245 + }, + { + "epoch": 1.132658762182631, + "grad_norm": 0.15827595247175805, + "learning_rate": 7.83133170118946e-05, + "loss": 2.8489, + "step": 18246 + }, + { + "epoch": 1.1327208392823886, + "grad_norm": 0.17353449892725664, + "learning_rate": 7.83103402392021e-05, + "loss": 2.8459, + "step": 18247 + }, + { + "epoch": 1.1327829163821466, + "grad_norm": 0.15455235685222452, + "learning_rate": 7.830736331880768e-05, + "loss": 2.9056, + "step": 18248 + }, + { + "epoch": 1.1328449934819045, + "grad_norm": 0.16172285996157934, + "learning_rate": 7.830438625072689e-05, + "loss": 2.9291, + "step": 18249 + }, + { + "epoch": 1.1329070705816624, + "grad_norm": 0.1765886624963127, + "learning_rate": 7.830140903497522e-05, + "loss": 2.8294, + "step": 18250 + }, + { + "epoch": 1.1329691476814203, + "grad_norm": 0.1557966749136247, + "learning_rate": 7.829843167156826e-05, + "loss": 2.9187, + "step": 18251 + }, + { + "epoch": 1.1330312247811782, + "grad_norm": 0.17299428532511132, + "learning_rate": 7.829545416052148e-05, + "loss": 2.8986, + "step": 18252 + }, + { + "epoch": 1.1330933018809362, + "grad_norm": 0.15901379629363485, + "learning_rate": 7.829247650185047e-05, + "loss": 2.9015, + "step": 18253 + }, + { + "epoch": 1.133155378980694, + "grad_norm": 0.1657488362230466, + "learning_rate": 7.828949869557073e-05, + "loss": 2.8437, + "step": 18254 + }, + { + "epoch": 1.133217456080452, + "grad_norm": 0.18266017428278714, + "learning_rate": 7.828652074169781e-05, + "loss": 2.9352, + "step": 18255 + }, + { + "epoch": 1.13327953318021, + "grad_norm": 0.15710620538330233, + "learning_rate": 7.828354264024725e-05, + "loss": 2.867, + "step": 18256 + }, + { + "epoch": 1.1333416102799676, + "grad_norm": 0.16596470220130002, + "learning_rate": 7.828056439123458e-05, + "loss": 2.9293, + "step": 18257 + }, + { + "epoch": 1.1334036873797255, + "grad_norm": 0.14738339791787014, + "learning_rate": 7.827758599467533e-05, + "loss": 2.8467, + "step": 18258 + }, + { + "epoch": 1.1334657644794834, + "grad_norm": 0.17137081257186068, + "learning_rate": 7.827460745058508e-05, + "loss": 2.9039, + "step": 18259 + }, + { + "epoch": 1.1335278415792414, + "grad_norm": 0.15827415778980375, + "learning_rate": 7.827162875897931e-05, + "loss": 2.8565, + "step": 18260 + }, + { + "epoch": 1.1335899186789993, + "grad_norm": 0.16208830662058382, + "learning_rate": 7.826864991987359e-05, + "loss": 3.0012, + "step": 18261 + }, + { + "epoch": 1.1336519957787572, + "grad_norm": 0.16334766812110904, + "learning_rate": 7.826567093328347e-05, + "loss": 3.0565, + "step": 18262 + }, + { + "epoch": 1.1337140728785151, + "grad_norm": 0.16359659298136173, + "learning_rate": 7.826269179922448e-05, + "loss": 2.8519, + "step": 18263 + }, + { + "epoch": 1.133776149978273, + "grad_norm": 0.1811591915676843, + "learning_rate": 7.825971251771217e-05, + "loss": 2.8372, + "step": 18264 + }, + { + "epoch": 1.133838227078031, + "grad_norm": 0.17691797829520767, + "learning_rate": 7.825673308876206e-05, + "loss": 2.8872, + "step": 18265 + }, + { + "epoch": 1.1339003041777889, + "grad_norm": 0.17017040493310648, + "learning_rate": 7.825375351238974e-05, + "loss": 2.8365, + "step": 18266 + }, + { + "epoch": 1.1339623812775468, + "grad_norm": 0.15589203958327405, + "learning_rate": 7.825077378861073e-05, + "loss": 2.8954, + "step": 18267 + }, + { + "epoch": 1.1340244583773047, + "grad_norm": 0.1744385513081267, + "learning_rate": 7.824779391744058e-05, + "loss": 2.8828, + "step": 18268 + }, + { + "epoch": 1.1340865354770626, + "grad_norm": 0.15928424724350634, + "learning_rate": 7.824481389889479e-05, + "loss": 2.917, + "step": 18269 + }, + { + "epoch": 1.1341486125768203, + "grad_norm": 0.1786330110510034, + "learning_rate": 7.824183373298898e-05, + "loss": 2.9609, + "step": 18270 + }, + { + "epoch": 1.1342106896765782, + "grad_norm": 0.2113606041930121, + "learning_rate": 7.823885341973866e-05, + "loss": 2.9455, + "step": 18271 + }, + { + "epoch": 1.1342727667763361, + "grad_norm": 0.17694074953774538, + "learning_rate": 7.82358729591594e-05, + "loss": 2.7999, + "step": 18272 + }, + { + "epoch": 1.134334843876094, + "grad_norm": 0.1700289378009861, + "learning_rate": 7.823289235126672e-05, + "loss": 2.864, + "step": 18273 + }, + { + "epoch": 1.134396920975852, + "grad_norm": 0.18239280869375982, + "learning_rate": 7.822991159607619e-05, + "loss": 2.8282, + "step": 18274 + }, + { + "epoch": 1.13445899807561, + "grad_norm": 0.18704049110687598, + "learning_rate": 7.822693069360337e-05, + "loss": 2.9345, + "step": 18275 + }, + { + "epoch": 1.1345210751753678, + "grad_norm": 0.15781714602514862, + "learning_rate": 7.822394964386378e-05, + "loss": 2.8274, + "step": 18276 + }, + { + "epoch": 1.1345831522751257, + "grad_norm": 0.17972849295362273, + "learning_rate": 7.822096844687301e-05, + "loss": 2.8562, + "step": 18277 + }, + { + "epoch": 1.1346452293748837, + "grad_norm": 0.15968390335690907, + "learning_rate": 7.821798710264658e-05, + "loss": 2.8631, + "step": 18278 + }, + { + "epoch": 1.1347073064746416, + "grad_norm": 0.1663363612184238, + "learning_rate": 7.821500561120007e-05, + "loss": 2.9623, + "step": 18279 + }, + { + "epoch": 1.1347693835743995, + "grad_norm": 0.2857887230172905, + "learning_rate": 7.821202397254903e-05, + "loss": 2.9526, + "step": 18280 + }, + { + "epoch": 1.1348314606741572, + "grad_norm": 0.24175462184125104, + "learning_rate": 7.820904218670901e-05, + "loss": 2.82, + "step": 18281 + }, + { + "epoch": 1.134893537773915, + "grad_norm": 0.16987853209961715, + "learning_rate": 7.820606025369558e-05, + "loss": 2.8644, + "step": 18282 + }, + { + "epoch": 1.134955614873673, + "grad_norm": 0.18041952956470583, + "learning_rate": 7.820307817352428e-05, + "loss": 2.8671, + "step": 18283 + }, + { + "epoch": 1.135017691973431, + "grad_norm": 0.1823677869086732, + "learning_rate": 7.820009594621066e-05, + "loss": 2.7921, + "step": 18284 + }, + { + "epoch": 1.1350797690731889, + "grad_norm": 0.20187083362555278, + "learning_rate": 7.819711357177031e-05, + "loss": 2.8184, + "step": 18285 + }, + { + "epoch": 1.1351418461729468, + "grad_norm": 0.153467520731215, + "learning_rate": 7.819413105021875e-05, + "loss": 2.835, + "step": 18286 + }, + { + "epoch": 1.1352039232727047, + "grad_norm": 0.17911029449109733, + "learning_rate": 7.81911483815716e-05, + "loss": 2.942, + "step": 18287 + }, + { + "epoch": 1.1352660003724626, + "grad_norm": 0.15167219749149954, + "learning_rate": 7.818816556584436e-05, + "loss": 2.8599, + "step": 18288 + }, + { + "epoch": 1.1353280774722205, + "grad_norm": 0.20664803107678392, + "learning_rate": 7.818518260305262e-05, + "loss": 2.962, + "step": 18289 + }, + { + "epoch": 1.1353901545719784, + "grad_norm": 0.17687877709969163, + "learning_rate": 7.818219949321195e-05, + "loss": 2.8627, + "step": 18290 + }, + { + "epoch": 1.1354522316717364, + "grad_norm": 0.17199473653581307, + "learning_rate": 7.817921623633789e-05, + "loss": 2.7938, + "step": 18291 + }, + { + "epoch": 1.1355143087714943, + "grad_norm": 0.1719076548246599, + "learning_rate": 7.817623283244604e-05, + "loss": 2.9192, + "step": 18292 + }, + { + "epoch": 1.1355763858712522, + "grad_norm": 0.18787438050787866, + "learning_rate": 7.817324928155191e-05, + "loss": 2.9392, + "step": 18293 + }, + { + "epoch": 1.13563846297101, + "grad_norm": 0.19130389727230904, + "learning_rate": 7.817026558367114e-05, + "loss": 2.9835, + "step": 18294 + }, + { + "epoch": 1.1357005400707678, + "grad_norm": 0.17323945052985704, + "learning_rate": 7.816728173881922e-05, + "loss": 2.9702, + "step": 18295 + }, + { + "epoch": 1.1357626171705257, + "grad_norm": 0.173330860173787, + "learning_rate": 7.816429774701177e-05, + "loss": 2.9479, + "step": 18296 + }, + { + "epoch": 1.1358246942702837, + "grad_norm": 0.1734874097219733, + "learning_rate": 7.816131360826434e-05, + "loss": 2.8546, + "step": 18297 + }, + { + "epoch": 1.1358867713700416, + "grad_norm": 0.1667184538154867, + "learning_rate": 7.81583293225925e-05, + "loss": 2.9122, + "step": 18298 + }, + { + "epoch": 1.1359488484697995, + "grad_norm": 0.1679968089686907, + "learning_rate": 7.815534489001181e-05, + "loss": 2.8698, + "step": 18299 + }, + { + "epoch": 1.1360109255695574, + "grad_norm": 0.19787840515190144, + "learning_rate": 7.815236031053788e-05, + "loss": 2.9626, + "step": 18300 + }, + { + "epoch": 1.1360730026693153, + "grad_norm": 0.16883235528916193, + "learning_rate": 7.814937558418623e-05, + "loss": 2.7947, + "step": 18301 + }, + { + "epoch": 1.1361350797690732, + "grad_norm": 0.17258935102644501, + "learning_rate": 7.814639071097245e-05, + "loss": 2.9209, + "step": 18302 + }, + { + "epoch": 1.1361971568688312, + "grad_norm": 0.18114574245547851, + "learning_rate": 7.814340569091213e-05, + "loss": 2.8696, + "step": 18303 + }, + { + "epoch": 1.136259233968589, + "grad_norm": 0.21596272978121564, + "learning_rate": 7.814042052402082e-05, + "loss": 2.8259, + "step": 18304 + }, + { + "epoch": 1.1363213110683468, + "grad_norm": 0.17288223360867935, + "learning_rate": 7.81374352103141e-05, + "loss": 2.8653, + "step": 18305 + }, + { + "epoch": 1.1363833881681047, + "grad_norm": 0.1682575426784544, + "learning_rate": 7.813444974980757e-05, + "loss": 2.9311, + "step": 18306 + }, + { + "epoch": 1.1364454652678626, + "grad_norm": 0.20864128756955366, + "learning_rate": 7.813146414251678e-05, + "loss": 2.9333, + "step": 18307 + }, + { + "epoch": 1.1365075423676205, + "grad_norm": 0.20762012511908734, + "learning_rate": 7.812847838845729e-05, + "loss": 2.9019, + "step": 18308 + }, + { + "epoch": 1.1365696194673784, + "grad_norm": 0.16308682264441857, + "learning_rate": 7.812549248764472e-05, + "loss": 2.9066, + "step": 18309 + }, + { + "epoch": 1.1366316965671364, + "grad_norm": 0.16825641351396228, + "learning_rate": 7.812250644009462e-05, + "loss": 2.9724, + "step": 18310 + }, + { + "epoch": 1.1366937736668943, + "grad_norm": 0.16138248174433745, + "learning_rate": 7.811952024582259e-05, + "loss": 2.8786, + "step": 18311 + }, + { + "epoch": 1.1367558507666522, + "grad_norm": 0.18091064327926368, + "learning_rate": 7.811653390484419e-05, + "loss": 2.9135, + "step": 18312 + }, + { + "epoch": 1.1368179278664101, + "grad_norm": 0.18942858350228106, + "learning_rate": 7.8113547417175e-05, + "loss": 2.9742, + "step": 18313 + }, + { + "epoch": 1.136880004966168, + "grad_norm": 0.19081583242320924, + "learning_rate": 7.811056078283062e-05, + "loss": 2.9241, + "step": 18314 + }, + { + "epoch": 1.136942082065926, + "grad_norm": 0.18255753996489724, + "learning_rate": 7.810757400182661e-05, + "loss": 2.9511, + "step": 18315 + }, + { + "epoch": 1.1370041591656839, + "grad_norm": 0.1679612645987492, + "learning_rate": 7.810458707417857e-05, + "loss": 2.8036, + "step": 18316 + }, + { + "epoch": 1.1370662362654418, + "grad_norm": 0.17020078219068444, + "learning_rate": 7.810159999990207e-05, + "loss": 2.9301, + "step": 18317 + }, + { + "epoch": 1.1371283133651995, + "grad_norm": 0.16590744308485303, + "learning_rate": 7.809861277901271e-05, + "loss": 2.915, + "step": 18318 + }, + { + "epoch": 1.1371903904649574, + "grad_norm": 0.16321509494957998, + "learning_rate": 7.809562541152606e-05, + "loss": 2.9249, + "step": 18319 + }, + { + "epoch": 1.1372524675647153, + "grad_norm": 0.18137940728165167, + "learning_rate": 7.809263789745771e-05, + "loss": 2.9186, + "step": 18320 + }, + { + "epoch": 1.1373145446644732, + "grad_norm": 0.3191433597106681, + "learning_rate": 7.808965023682324e-05, + "loss": 2.9291, + "step": 18321 + }, + { + "epoch": 1.1373766217642312, + "grad_norm": 0.14938668392885118, + "learning_rate": 7.808666242963827e-05, + "loss": 2.9742, + "step": 18322 + }, + { + "epoch": 1.137438698863989, + "grad_norm": 0.16719051763895057, + "learning_rate": 7.808367447591834e-05, + "loss": 2.9075, + "step": 18323 + }, + { + "epoch": 1.137500775963747, + "grad_norm": 0.22001211473043175, + "learning_rate": 7.808068637567908e-05, + "loss": 2.848, + "step": 18324 + }, + { + "epoch": 1.137562853063505, + "grad_norm": 0.2055345451690558, + "learning_rate": 7.807769812893607e-05, + "loss": 2.9383, + "step": 18325 + }, + { + "epoch": 1.1376249301632628, + "grad_norm": 0.17082918671386463, + "learning_rate": 7.807470973570487e-05, + "loss": 2.9382, + "step": 18326 + }, + { + "epoch": 1.1376870072630207, + "grad_norm": 0.15660763581037263, + "learning_rate": 7.80717211960011e-05, + "loss": 2.8986, + "step": 18327 + }, + { + "epoch": 1.1377490843627787, + "grad_norm": 0.25095310917972957, + "learning_rate": 7.806873250984035e-05, + "loss": 2.891, + "step": 18328 + }, + { + "epoch": 1.1378111614625364, + "grad_norm": 0.18367084953966187, + "learning_rate": 7.80657436772382e-05, + "loss": 2.977, + "step": 18329 + }, + { + "epoch": 1.1378732385622943, + "grad_norm": 0.19460464954190396, + "learning_rate": 7.806275469821026e-05, + "loss": 2.874, + "step": 18330 + }, + { + "epoch": 1.1379353156620522, + "grad_norm": 0.220431937652898, + "learning_rate": 7.805976557277212e-05, + "loss": 2.9155, + "step": 18331 + }, + { + "epoch": 1.1379973927618101, + "grad_norm": 0.15761493908154128, + "learning_rate": 7.805677630093937e-05, + "loss": 2.9248, + "step": 18332 + }, + { + "epoch": 1.138059469861568, + "grad_norm": 0.18600083082023797, + "learning_rate": 7.80537868827276e-05, + "loss": 2.9202, + "step": 18333 + }, + { + "epoch": 1.138121546961326, + "grad_norm": 0.17578199704329378, + "learning_rate": 7.805079731815242e-05, + "loss": 2.7958, + "step": 18334 + }, + { + "epoch": 1.1381836240610839, + "grad_norm": 0.16651917552572193, + "learning_rate": 7.80478076072294e-05, + "loss": 2.8944, + "step": 18335 + }, + { + "epoch": 1.1382457011608418, + "grad_norm": 0.22680121211032056, + "learning_rate": 7.804481774997418e-05, + "loss": 2.901, + "step": 18336 + }, + { + "epoch": 1.1383077782605997, + "grad_norm": 0.18455309510029486, + "learning_rate": 7.804182774640234e-05, + "loss": 2.8905, + "step": 18337 + }, + { + "epoch": 1.1383698553603576, + "grad_norm": 0.17288838549274038, + "learning_rate": 7.803883759652946e-05, + "loss": 2.9454, + "step": 18338 + }, + { + "epoch": 1.1384319324601155, + "grad_norm": 0.18800778968948806, + "learning_rate": 7.803584730037118e-05, + "loss": 2.8689, + "step": 18339 + }, + { + "epoch": 1.1384940095598735, + "grad_norm": 0.16149954244309675, + "learning_rate": 7.803285685794307e-05, + "loss": 2.8843, + "step": 18340 + }, + { + "epoch": 1.1385560866596314, + "grad_norm": 0.1603565998192224, + "learning_rate": 7.802986626926073e-05, + "loss": 2.8948, + "step": 18341 + }, + { + "epoch": 1.138618163759389, + "grad_norm": 0.17005244532999841, + "learning_rate": 7.802687553433979e-05, + "loss": 2.9305, + "step": 18342 + }, + { + "epoch": 1.138680240859147, + "grad_norm": 0.23474379769664025, + "learning_rate": 7.802388465319583e-05, + "loss": 2.8941, + "step": 18343 + }, + { + "epoch": 1.138742317958905, + "grad_norm": 0.17371448273897985, + "learning_rate": 7.802089362584445e-05, + "loss": 2.8167, + "step": 18344 + }, + { + "epoch": 1.1388043950586628, + "grad_norm": 0.17536613403811463, + "learning_rate": 7.801790245230128e-05, + "loss": 2.9608, + "step": 18345 + }, + { + "epoch": 1.1388664721584207, + "grad_norm": 0.22710597690209233, + "learning_rate": 7.80149111325819e-05, + "loss": 2.8356, + "step": 18346 + }, + { + "epoch": 1.1389285492581787, + "grad_norm": 0.16992576638512352, + "learning_rate": 7.801191966670193e-05, + "loss": 2.8713, + "step": 18347 + }, + { + "epoch": 1.1389906263579366, + "grad_norm": 0.18409343451435833, + "learning_rate": 7.800892805467699e-05, + "loss": 2.8468, + "step": 18348 + }, + { + "epoch": 1.1390527034576945, + "grad_norm": 0.1805146914378483, + "learning_rate": 7.800593629652266e-05, + "loss": 2.9136, + "step": 18349 + }, + { + "epoch": 1.1391147805574524, + "grad_norm": 0.19538939869529026, + "learning_rate": 7.800294439225456e-05, + "loss": 2.9041, + "step": 18350 + }, + { + "epoch": 1.1391768576572103, + "grad_norm": 0.24147181747006732, + "learning_rate": 7.79999523418883e-05, + "loss": 2.9751, + "step": 18351 + }, + { + "epoch": 1.1392389347569682, + "grad_norm": 0.17935050586113843, + "learning_rate": 7.799696014543949e-05, + "loss": 2.9482, + "step": 18352 + }, + { + "epoch": 1.139301011856726, + "grad_norm": 0.19810709555017297, + "learning_rate": 7.799396780292375e-05, + "loss": 2.7613, + "step": 18353 + }, + { + "epoch": 1.1393630889564839, + "grad_norm": 0.2196638568836917, + "learning_rate": 7.799097531435668e-05, + "loss": 2.9099, + "step": 18354 + }, + { + "epoch": 1.1394251660562418, + "grad_norm": 0.20317919631683312, + "learning_rate": 7.798798267975392e-05, + "loss": 2.8631, + "step": 18355 + }, + { + "epoch": 1.1394872431559997, + "grad_norm": 0.1726568278134719, + "learning_rate": 7.798498989913103e-05, + "loss": 2.8568, + "step": 18356 + }, + { + "epoch": 1.1395493202557576, + "grad_norm": 0.17542953272056505, + "learning_rate": 7.798199697250366e-05, + "loss": 3.008, + "step": 18357 + }, + { + "epoch": 1.1396113973555155, + "grad_norm": 0.19927151299171444, + "learning_rate": 7.797900389988742e-05, + "loss": 3.024, + "step": 18358 + }, + { + "epoch": 1.1396734744552734, + "grad_norm": 0.1777123867309042, + "learning_rate": 7.797601068129791e-05, + "loss": 2.767, + "step": 18359 + }, + { + "epoch": 1.1397355515550314, + "grad_norm": 0.17267195750976613, + "learning_rate": 7.79730173167508e-05, + "loss": 2.837, + "step": 18360 + }, + { + "epoch": 1.1397976286547893, + "grad_norm": 0.18294574128665272, + "learning_rate": 7.797002380626164e-05, + "loss": 2.9493, + "step": 18361 + }, + { + "epoch": 1.1398597057545472, + "grad_norm": 0.16731008866075495, + "learning_rate": 7.796703014984607e-05, + "loss": 2.8546, + "step": 18362 + }, + { + "epoch": 1.1399217828543051, + "grad_norm": 0.157051284897396, + "learning_rate": 7.796403634751975e-05, + "loss": 2.91, + "step": 18363 + }, + { + "epoch": 1.139983859954063, + "grad_norm": 0.3573210735656986, + "learning_rate": 7.796104239929822e-05, + "loss": 2.8627, + "step": 18364 + }, + { + "epoch": 1.140045937053821, + "grad_norm": 0.24538315253674609, + "learning_rate": 7.795804830519718e-05, + "loss": 2.8862, + "step": 18365 + }, + { + "epoch": 1.1401080141535787, + "grad_norm": 0.18452088637809025, + "learning_rate": 7.795505406523219e-05, + "loss": 2.8739, + "step": 18366 + }, + { + "epoch": 1.1401700912533366, + "grad_norm": 0.16102002560313455, + "learning_rate": 7.795205967941889e-05, + "loss": 2.7387, + "step": 18367 + }, + { + "epoch": 1.1402321683530945, + "grad_norm": 0.25536959120707736, + "learning_rate": 7.794906514777294e-05, + "loss": 2.9504, + "step": 18368 + }, + { + "epoch": 1.1402942454528524, + "grad_norm": 0.2062919480118913, + "learning_rate": 7.79460704703099e-05, + "loss": 2.9446, + "step": 18369 + }, + { + "epoch": 1.1403563225526103, + "grad_norm": 0.1874479380490382, + "learning_rate": 7.794307564704545e-05, + "loss": 2.8819, + "step": 18370 + }, + { + "epoch": 1.1404183996523682, + "grad_norm": 0.24769250832970516, + "learning_rate": 7.794008067799518e-05, + "loss": 2.9304, + "step": 18371 + }, + { + "epoch": 1.1404804767521262, + "grad_norm": 0.1951878722251741, + "learning_rate": 7.793708556317473e-05, + "loss": 2.8963, + "step": 18372 + }, + { + "epoch": 1.140542553851884, + "grad_norm": 0.16848367121041025, + "learning_rate": 7.793409030259972e-05, + "loss": 2.903, + "step": 18373 + }, + { + "epoch": 1.140604630951642, + "grad_norm": 0.254109374739166, + "learning_rate": 7.79310948962858e-05, + "loss": 2.8905, + "step": 18374 + }, + { + "epoch": 1.1406667080514, + "grad_norm": 0.23284469740278993, + "learning_rate": 7.792809934424856e-05, + "loss": 2.8703, + "step": 18375 + }, + { + "epoch": 1.1407287851511578, + "grad_norm": 0.1869091120974182, + "learning_rate": 7.792510364650364e-05, + "loss": 2.9243, + "step": 18376 + }, + { + "epoch": 1.1407908622509155, + "grad_norm": 0.1929486758489662, + "learning_rate": 7.792210780306668e-05, + "loss": 2.8507, + "step": 18377 + }, + { + "epoch": 1.1408529393506734, + "grad_norm": 0.15953573531665016, + "learning_rate": 7.79191118139533e-05, + "loss": 2.9181, + "step": 18378 + }, + { + "epoch": 1.1409150164504314, + "grad_norm": 0.16268403677088086, + "learning_rate": 7.791611567917914e-05, + "loss": 2.8596, + "step": 18379 + }, + { + "epoch": 1.1409770935501893, + "grad_norm": 0.17297040765376367, + "learning_rate": 7.791311939875984e-05, + "loss": 2.8907, + "step": 18380 + }, + { + "epoch": 1.1410391706499472, + "grad_norm": 0.2361097193580324, + "learning_rate": 7.791012297271101e-05, + "loss": 2.9531, + "step": 18381 + }, + { + "epoch": 1.1411012477497051, + "grad_norm": 0.17945223416550318, + "learning_rate": 7.790712640104829e-05, + "loss": 2.9086, + "step": 18382 + }, + { + "epoch": 1.141163324849463, + "grad_norm": 0.16772801313875038, + "learning_rate": 7.790412968378731e-05, + "loss": 2.8998, + "step": 18383 + }, + { + "epoch": 1.141225401949221, + "grad_norm": 0.19622866332335218, + "learning_rate": 7.790113282094372e-05, + "loss": 2.8806, + "step": 18384 + }, + { + "epoch": 1.1412874790489789, + "grad_norm": 0.18105167357601756, + "learning_rate": 7.789813581253315e-05, + "loss": 2.9002, + "step": 18385 + }, + { + "epoch": 1.1413495561487368, + "grad_norm": 0.19813345760199677, + "learning_rate": 7.789513865857124e-05, + "loss": 2.8915, + "step": 18386 + }, + { + "epoch": 1.1414116332484947, + "grad_norm": 0.1785591710941636, + "learning_rate": 7.789214135907361e-05, + "loss": 2.8157, + "step": 18387 + }, + { + "epoch": 1.1414737103482526, + "grad_norm": 0.16022655060634303, + "learning_rate": 7.788914391405591e-05, + "loss": 2.9323, + "step": 18388 + }, + { + "epoch": 1.1415357874480105, + "grad_norm": 0.15997206750512288, + "learning_rate": 7.788614632353377e-05, + "loss": 2.9465, + "step": 18389 + }, + { + "epoch": 1.1415978645477682, + "grad_norm": 0.15694532653209997, + "learning_rate": 7.788314858752283e-05, + "loss": 2.8042, + "step": 18390 + }, + { + "epoch": 1.1416599416475262, + "grad_norm": 0.1594405698529306, + "learning_rate": 7.788015070603876e-05, + "loss": 2.9771, + "step": 18391 + }, + { + "epoch": 1.141722018747284, + "grad_norm": 0.15586001484483827, + "learning_rate": 7.787715267909715e-05, + "loss": 2.8485, + "step": 18392 + }, + { + "epoch": 1.141784095847042, + "grad_norm": 0.17192552647487147, + "learning_rate": 7.787415450671368e-05, + "loss": 2.8253, + "step": 18393 + }, + { + "epoch": 1.1418461729468, + "grad_norm": 0.1636474957665986, + "learning_rate": 7.787115618890397e-05, + "loss": 2.9442, + "step": 18394 + }, + { + "epoch": 1.1419082500465578, + "grad_norm": 0.19023297248652316, + "learning_rate": 7.786815772568369e-05, + "loss": 2.8692, + "step": 18395 + }, + { + "epoch": 1.1419703271463157, + "grad_norm": 0.15678503699820512, + "learning_rate": 7.786515911706844e-05, + "loss": 2.8868, + "step": 18396 + }, + { + "epoch": 1.1420324042460737, + "grad_norm": 0.17074669777230836, + "learning_rate": 7.786216036307392e-05, + "loss": 2.9441, + "step": 18397 + }, + { + "epoch": 1.1420944813458316, + "grad_norm": 0.1577487235984913, + "learning_rate": 7.785916146371573e-05, + "loss": 2.8637, + "step": 18398 + }, + { + "epoch": 1.1421565584455895, + "grad_norm": 0.16937696296532717, + "learning_rate": 7.785616241900952e-05, + "loss": 2.8731, + "step": 18399 + }, + { + "epoch": 1.1422186355453474, + "grad_norm": 0.1649766104097871, + "learning_rate": 7.785316322897096e-05, + "loss": 2.8751, + "step": 18400 + }, + { + "epoch": 1.1422807126451051, + "grad_norm": 0.1520848152227797, + "learning_rate": 7.785016389361569e-05, + "loss": 2.9163, + "step": 18401 + }, + { + "epoch": 1.142342789744863, + "grad_norm": 0.18807256221253651, + "learning_rate": 7.784716441295935e-05, + "loss": 2.9257, + "step": 18402 + }, + { + "epoch": 1.142404866844621, + "grad_norm": 0.16820728752205474, + "learning_rate": 7.784416478701759e-05, + "loss": 2.8289, + "step": 18403 + }, + { + "epoch": 1.1424669439443789, + "grad_norm": 0.15504006175716104, + "learning_rate": 7.784116501580608e-05, + "loss": 2.8345, + "step": 18404 + }, + { + "epoch": 1.1425290210441368, + "grad_norm": 0.20394376650788518, + "learning_rate": 7.783816509934043e-05, + "loss": 2.8843, + "step": 18405 + }, + { + "epoch": 1.1425910981438947, + "grad_norm": 0.15717224106754063, + "learning_rate": 7.783516503763632e-05, + "loss": 2.9433, + "step": 18406 + }, + { + "epoch": 1.1426531752436526, + "grad_norm": 0.1746296939079049, + "learning_rate": 7.783216483070941e-05, + "loss": 2.867, + "step": 18407 + }, + { + "epoch": 1.1427152523434105, + "grad_norm": 0.15547724010537814, + "learning_rate": 7.782916447857532e-05, + "loss": 2.8959, + "step": 18408 + }, + { + "epoch": 1.1427773294431685, + "grad_norm": 0.21634098611267666, + "learning_rate": 7.782616398124973e-05, + "loss": 2.8239, + "step": 18409 + }, + { + "epoch": 1.1428394065429264, + "grad_norm": 0.16798748421987203, + "learning_rate": 7.78231633387483e-05, + "loss": 2.7646, + "step": 18410 + }, + { + "epoch": 1.1429014836426843, + "grad_norm": 0.16290665097153442, + "learning_rate": 7.782016255108666e-05, + "loss": 2.9142, + "step": 18411 + }, + { + "epoch": 1.1429635607424422, + "grad_norm": 0.15552751877662424, + "learning_rate": 7.781716161828049e-05, + "loss": 2.8781, + "step": 18412 + }, + { + "epoch": 1.1430256378422001, + "grad_norm": 0.16003786340310378, + "learning_rate": 7.781416054034543e-05, + "loss": 2.881, + "step": 18413 + }, + { + "epoch": 1.1430877149419578, + "grad_norm": 0.1846545294718462, + "learning_rate": 7.781115931729714e-05, + "loss": 2.9335, + "step": 18414 + }, + { + "epoch": 1.1431497920417157, + "grad_norm": 0.31455007681520514, + "learning_rate": 7.780815794915127e-05, + "loss": 2.8318, + "step": 18415 + }, + { + "epoch": 1.1432118691414737, + "grad_norm": 0.17250204039313707, + "learning_rate": 7.78051564359235e-05, + "loss": 2.8941, + "step": 18416 + }, + { + "epoch": 1.1432739462412316, + "grad_norm": 0.23187074311608305, + "learning_rate": 7.780215477762947e-05, + "loss": 3.0515, + "step": 18417 + }, + { + "epoch": 1.1433360233409895, + "grad_norm": 0.20127778829918275, + "learning_rate": 7.779915297428486e-05, + "loss": 2.88, + "step": 18418 + }, + { + "epoch": 1.1433981004407474, + "grad_norm": 0.1687508053230567, + "learning_rate": 7.779615102590532e-05, + "loss": 2.9266, + "step": 18419 + }, + { + "epoch": 1.1434601775405053, + "grad_norm": 0.15323202664938987, + "learning_rate": 7.779314893250651e-05, + "loss": 2.9066, + "step": 18420 + }, + { + "epoch": 1.1435222546402632, + "grad_norm": 0.2206345341867413, + "learning_rate": 7.779014669410409e-05, + "loss": 2.8799, + "step": 18421 + }, + { + "epoch": 1.1435843317400212, + "grad_norm": 0.18883724981316535, + "learning_rate": 7.778714431071375e-05, + "loss": 2.8635, + "step": 18422 + }, + { + "epoch": 1.143646408839779, + "grad_norm": 0.24646411002352428, + "learning_rate": 7.77841417823511e-05, + "loss": 3.0128, + "step": 18423 + }, + { + "epoch": 1.143708485939537, + "grad_norm": 0.20150951642798062, + "learning_rate": 7.778113910903186e-05, + "loss": 2.8236, + "step": 18424 + }, + { + "epoch": 1.1437705630392947, + "grad_norm": 0.18967306996594535, + "learning_rate": 7.777813629077166e-05, + "loss": 2.8368, + "step": 18425 + }, + { + "epoch": 1.1438326401390526, + "grad_norm": 0.17209749017997458, + "learning_rate": 7.777513332758619e-05, + "loss": 2.9763, + "step": 18426 + }, + { + "epoch": 1.1438947172388105, + "grad_norm": 0.19456838957979963, + "learning_rate": 7.777213021949111e-05, + "loss": 2.8844, + "step": 18427 + }, + { + "epoch": 1.1439567943385685, + "grad_norm": 0.17101172915465424, + "learning_rate": 7.776912696650208e-05, + "loss": 2.9008, + "step": 18428 + }, + { + "epoch": 1.1440188714383264, + "grad_norm": 0.16990805712235785, + "learning_rate": 7.776612356863478e-05, + "loss": 2.8789, + "step": 18429 + }, + { + "epoch": 1.1440809485380843, + "grad_norm": 0.1665913416599499, + "learning_rate": 7.776312002590486e-05, + "loss": 2.9436, + "step": 18430 + }, + { + "epoch": 1.1441430256378422, + "grad_norm": 0.23198568461859057, + "learning_rate": 7.776011633832801e-05, + "loss": 2.8819, + "step": 18431 + }, + { + "epoch": 1.1442051027376001, + "grad_norm": 0.17432678683277564, + "learning_rate": 7.775711250591988e-05, + "loss": 2.9307, + "step": 18432 + }, + { + "epoch": 1.144267179837358, + "grad_norm": 0.17979977044296674, + "learning_rate": 7.775410852869618e-05, + "loss": 2.9403, + "step": 18433 + }, + { + "epoch": 1.144329256937116, + "grad_norm": 0.1622813243553598, + "learning_rate": 7.775110440667252e-05, + "loss": 2.9634, + "step": 18434 + }, + { + "epoch": 1.1443913340368739, + "grad_norm": 0.17384970456022272, + "learning_rate": 7.774810013986465e-05, + "loss": 2.9088, + "step": 18435 + }, + { + "epoch": 1.1444534111366318, + "grad_norm": 0.20361806557429452, + "learning_rate": 7.774509572828819e-05, + "loss": 2.8568, + "step": 18436 + }, + { + "epoch": 1.1445154882363897, + "grad_norm": 0.16393366364401607, + "learning_rate": 7.774209117195883e-05, + "loss": 2.8936, + "step": 18437 + }, + { + "epoch": 1.1445775653361474, + "grad_norm": 0.1517115521221341, + "learning_rate": 7.773908647089225e-05, + "loss": 2.8065, + "step": 18438 + }, + { + "epoch": 1.1446396424359053, + "grad_norm": 0.15899067756101168, + "learning_rate": 7.773608162510413e-05, + "loss": 2.9176, + "step": 18439 + }, + { + "epoch": 1.1447017195356632, + "grad_norm": 0.18120593694590706, + "learning_rate": 7.773307663461013e-05, + "loss": 2.7766, + "step": 18440 + }, + { + "epoch": 1.1447637966354212, + "grad_norm": 0.15839962470836205, + "learning_rate": 7.773007149942593e-05, + "loss": 2.8741, + "step": 18441 + }, + { + "epoch": 1.144825873735179, + "grad_norm": 0.17192088945039222, + "learning_rate": 7.772706621956722e-05, + "loss": 2.904, + "step": 18442 + }, + { + "epoch": 1.144887950834937, + "grad_norm": 0.17246180304097317, + "learning_rate": 7.772406079504968e-05, + "loss": 2.871, + "step": 18443 + }, + { + "epoch": 1.144950027934695, + "grad_norm": 0.17291757971645114, + "learning_rate": 7.772105522588898e-05, + "loss": 2.9545, + "step": 18444 + }, + { + "epoch": 1.1450121050344528, + "grad_norm": 0.16616084535237358, + "learning_rate": 7.771804951210082e-05, + "loss": 2.9524, + "step": 18445 + }, + { + "epoch": 1.1450741821342107, + "grad_norm": 0.16841456665863966, + "learning_rate": 7.771504365370084e-05, + "loss": 2.9179, + "step": 18446 + }, + { + "epoch": 1.1451362592339687, + "grad_norm": 0.2509018901959766, + "learning_rate": 7.771203765070476e-05, + "loss": 2.9382, + "step": 18447 + }, + { + "epoch": 1.1451983363337266, + "grad_norm": 0.16948301270694335, + "learning_rate": 7.770903150312826e-05, + "loss": 2.8729, + "step": 18448 + }, + { + "epoch": 1.1452604134334843, + "grad_norm": 0.16728704755418755, + "learning_rate": 7.7706025210987e-05, + "loss": 2.835, + "step": 18449 + }, + { + "epoch": 1.1453224905332422, + "grad_norm": 0.16765741948682422, + "learning_rate": 7.770301877429669e-05, + "loss": 2.8395, + "step": 18450 + }, + { + "epoch": 1.1453845676330001, + "grad_norm": 0.1638062496797628, + "learning_rate": 7.770001219307301e-05, + "loss": 2.9189, + "step": 18451 + }, + { + "epoch": 1.145446644732758, + "grad_norm": 0.20766944407044197, + "learning_rate": 7.769700546733164e-05, + "loss": 2.9192, + "step": 18452 + }, + { + "epoch": 1.145508721832516, + "grad_norm": 0.16886282434061725, + "learning_rate": 7.769399859708827e-05, + "loss": 2.7819, + "step": 18453 + }, + { + "epoch": 1.1455707989322739, + "grad_norm": 0.1932658344927348, + "learning_rate": 7.769099158235858e-05, + "loss": 2.9148, + "step": 18454 + }, + { + "epoch": 1.1456328760320318, + "grad_norm": 0.19721887997771217, + "learning_rate": 7.768798442315828e-05, + "loss": 2.8668, + "step": 18455 + }, + { + "epoch": 1.1456949531317897, + "grad_norm": 0.17782565277743487, + "learning_rate": 7.768497711950302e-05, + "loss": 2.8668, + "step": 18456 + }, + { + "epoch": 1.1457570302315476, + "grad_norm": 0.1690104419657292, + "learning_rate": 7.768196967140853e-05, + "loss": 2.8906, + "step": 18457 + }, + { + "epoch": 1.1458191073313055, + "grad_norm": 0.1850220636633221, + "learning_rate": 7.767896207889047e-05, + "loss": 2.909, + "step": 18458 + }, + { + "epoch": 1.1458811844310635, + "grad_norm": 0.15524764416545817, + "learning_rate": 7.767595434196456e-05, + "loss": 2.8561, + "step": 18459 + }, + { + "epoch": 1.1459432615308214, + "grad_norm": 0.17201213892494152, + "learning_rate": 7.767294646064648e-05, + "loss": 2.9334, + "step": 18460 + }, + { + "epoch": 1.1460053386305793, + "grad_norm": 0.19599573793934913, + "learning_rate": 7.766993843495192e-05, + "loss": 2.9247, + "step": 18461 + }, + { + "epoch": 1.146067415730337, + "grad_norm": 0.16361855432711495, + "learning_rate": 7.766693026489655e-05, + "loss": 2.8799, + "step": 18462 + }, + { + "epoch": 1.146129492830095, + "grad_norm": 0.1522366746554345, + "learning_rate": 7.766392195049612e-05, + "loss": 2.9104, + "step": 18463 + }, + { + "epoch": 1.1461915699298528, + "grad_norm": 0.18556028618781653, + "learning_rate": 7.766091349176628e-05, + "loss": 2.8738, + "step": 18464 + }, + { + "epoch": 1.1462536470296107, + "grad_norm": 0.16661167851958958, + "learning_rate": 7.765790488872273e-05, + "loss": 2.9572, + "step": 18465 + }, + { + "epoch": 1.1463157241293687, + "grad_norm": 0.161156288517564, + "learning_rate": 7.765489614138118e-05, + "loss": 2.8521, + "step": 18466 + }, + { + "epoch": 1.1463778012291266, + "grad_norm": 0.16260835537921145, + "learning_rate": 7.765188724975734e-05, + "loss": 2.9938, + "step": 18467 + }, + { + "epoch": 1.1464398783288845, + "grad_norm": 0.19196656988533892, + "learning_rate": 7.764887821386689e-05, + "loss": 2.9056, + "step": 18468 + }, + { + "epoch": 1.1465019554286424, + "grad_norm": 0.15989865544921644, + "learning_rate": 7.764586903372551e-05, + "loss": 2.9571, + "step": 18469 + }, + { + "epoch": 1.1465640325284003, + "grad_norm": 0.1636252423318157, + "learning_rate": 7.764285970934893e-05, + "loss": 2.773, + "step": 18470 + }, + { + "epoch": 1.1466261096281583, + "grad_norm": 0.1533420936031682, + "learning_rate": 7.763985024075284e-05, + "loss": 2.8742, + "step": 18471 + }, + { + "epoch": 1.146688186727916, + "grad_norm": 0.20220604596284378, + "learning_rate": 7.763684062795295e-05, + "loss": 2.9799, + "step": 18472 + }, + { + "epoch": 1.1467502638276739, + "grad_norm": 0.16902695044967891, + "learning_rate": 7.763383087096495e-05, + "loss": 2.962, + "step": 18473 + }, + { + "epoch": 1.1468123409274318, + "grad_norm": 0.23666801916215052, + "learning_rate": 7.763082096980454e-05, + "loss": 2.8839, + "step": 18474 + }, + { + "epoch": 1.1468744180271897, + "grad_norm": 0.16800203658896068, + "learning_rate": 7.762781092448743e-05, + "loss": 2.8877, + "step": 18475 + }, + { + "epoch": 1.1469364951269476, + "grad_norm": 0.15393578250854634, + "learning_rate": 7.762480073502931e-05, + "loss": 2.9437, + "step": 18476 + }, + { + "epoch": 1.1469985722267055, + "grad_norm": 0.1874227762140993, + "learning_rate": 7.762179040144592e-05, + "loss": 2.9565, + "step": 18477 + }, + { + "epoch": 1.1470606493264635, + "grad_norm": 0.19765905295333394, + "learning_rate": 7.761877992375294e-05, + "loss": 2.877, + "step": 18478 + }, + { + "epoch": 1.1471227264262214, + "grad_norm": 0.2709405378151572, + "learning_rate": 7.761576930196608e-05, + "loss": 2.9469, + "step": 18479 + }, + { + "epoch": 1.1471848035259793, + "grad_norm": 0.16291049268939561, + "learning_rate": 7.761275853610104e-05, + "loss": 2.9345, + "step": 18480 + }, + { + "epoch": 1.1472468806257372, + "grad_norm": 0.1535958650662619, + "learning_rate": 7.760974762617354e-05, + "loss": 2.9076, + "step": 18481 + }, + { + "epoch": 1.1473089577254951, + "grad_norm": 0.16592861588182026, + "learning_rate": 7.760673657219929e-05, + "loss": 2.9247, + "step": 18482 + }, + { + "epoch": 1.147371034825253, + "grad_norm": 0.15651493261701482, + "learning_rate": 7.760372537419397e-05, + "loss": 2.9382, + "step": 18483 + }, + { + "epoch": 1.147433111925011, + "grad_norm": 0.14759225871560205, + "learning_rate": 7.760071403217334e-05, + "loss": 2.9661, + "step": 18484 + }, + { + "epoch": 1.1474951890247689, + "grad_norm": 0.16906299791950227, + "learning_rate": 7.759770254615308e-05, + "loss": 2.9251, + "step": 18485 + }, + { + "epoch": 1.1475572661245266, + "grad_norm": 0.1618827764289138, + "learning_rate": 7.759469091614891e-05, + "loss": 2.8912, + "step": 18486 + }, + { + "epoch": 1.1476193432242845, + "grad_norm": 0.17708076564011824, + "learning_rate": 7.759167914217652e-05, + "loss": 2.9735, + "step": 18487 + }, + { + "epoch": 1.1476814203240424, + "grad_norm": 0.16470336631335258, + "learning_rate": 7.758866722425167e-05, + "loss": 2.8788, + "step": 18488 + }, + { + "epoch": 1.1477434974238003, + "grad_norm": 0.15702656055679212, + "learning_rate": 7.758565516239002e-05, + "loss": 2.8259, + "step": 18489 + }, + { + "epoch": 1.1478055745235582, + "grad_norm": 0.21910113864584962, + "learning_rate": 7.758264295660731e-05, + "loss": 2.9329, + "step": 18490 + }, + { + "epoch": 1.1478676516233162, + "grad_norm": 0.18692130948709557, + "learning_rate": 7.757963060691927e-05, + "loss": 2.9214, + "step": 18491 + }, + { + "epoch": 1.147929728723074, + "grad_norm": 0.17407797955180873, + "learning_rate": 7.75766181133416e-05, + "loss": 2.8481, + "step": 18492 + }, + { + "epoch": 1.147991805822832, + "grad_norm": 0.20751820283352387, + "learning_rate": 7.757360547589e-05, + "loss": 2.9446, + "step": 18493 + }, + { + "epoch": 1.14805388292259, + "grad_norm": 0.1487381049617317, + "learning_rate": 7.757059269458024e-05, + "loss": 2.9322, + "step": 18494 + }, + { + "epoch": 1.1481159600223478, + "grad_norm": 0.20416387492009844, + "learning_rate": 7.7567579769428e-05, + "loss": 2.9095, + "step": 18495 + }, + { + "epoch": 1.1481780371221055, + "grad_norm": 0.1596073537204676, + "learning_rate": 7.756456670044898e-05, + "loss": 2.9293, + "step": 18496 + }, + { + "epoch": 1.1482401142218635, + "grad_norm": 0.15700082389272163, + "learning_rate": 7.756155348765893e-05, + "loss": 2.8652, + "step": 18497 + }, + { + "epoch": 1.1483021913216214, + "grad_norm": 0.17639145900628161, + "learning_rate": 7.755854013107358e-05, + "loss": 2.8502, + "step": 18498 + }, + { + "epoch": 1.1483642684213793, + "grad_norm": 0.18570185763994224, + "learning_rate": 7.755552663070862e-05, + "loss": 2.9535, + "step": 18499 + }, + { + "epoch": 1.1484263455211372, + "grad_norm": 0.16082888173846546, + "learning_rate": 7.755251298657979e-05, + "loss": 2.873, + "step": 18500 + }, + { + "epoch": 1.1484884226208951, + "grad_norm": 0.159707883926099, + "learning_rate": 7.754949919870281e-05, + "loss": 2.9503, + "step": 18501 + }, + { + "epoch": 1.148550499720653, + "grad_norm": 0.15233047745426231, + "learning_rate": 7.75464852670934e-05, + "loss": 2.8563, + "step": 18502 + }, + { + "epoch": 1.148612576820411, + "grad_norm": 0.15846816573113343, + "learning_rate": 7.754347119176731e-05, + "loss": 2.8662, + "step": 18503 + }, + { + "epoch": 1.1486746539201689, + "grad_norm": 0.15233333258590587, + "learning_rate": 7.754045697274024e-05, + "loss": 2.9088, + "step": 18504 + }, + { + "epoch": 1.1487367310199268, + "grad_norm": 0.17705525615105783, + "learning_rate": 7.75374426100279e-05, + "loss": 2.8829, + "step": 18505 + }, + { + "epoch": 1.1487988081196847, + "grad_norm": 0.16055243034682515, + "learning_rate": 7.753442810364606e-05, + "loss": 2.9242, + "step": 18506 + }, + { + "epoch": 1.1488608852194426, + "grad_norm": 0.18377972375410812, + "learning_rate": 7.753141345361042e-05, + "loss": 2.8902, + "step": 18507 + }, + { + "epoch": 1.1489229623192005, + "grad_norm": 0.19755833692246402, + "learning_rate": 7.752839865993671e-05, + "loss": 2.9502, + "step": 18508 + }, + { + "epoch": 1.1489850394189585, + "grad_norm": 0.17450921430962743, + "learning_rate": 7.752538372264067e-05, + "loss": 2.952, + "step": 18509 + }, + { + "epoch": 1.1490471165187162, + "grad_norm": 0.1608617004571043, + "learning_rate": 7.752236864173799e-05, + "loss": 2.7941, + "step": 18510 + }, + { + "epoch": 1.149109193618474, + "grad_norm": 0.1550182548855863, + "learning_rate": 7.751935341724446e-05, + "loss": 2.8834, + "step": 18511 + }, + { + "epoch": 1.149171270718232, + "grad_norm": 0.17207899234109428, + "learning_rate": 7.751633804917578e-05, + "loss": 2.9298, + "step": 18512 + }, + { + "epoch": 1.14923334781799, + "grad_norm": 0.17569037313564875, + "learning_rate": 7.751332253754767e-05, + "loss": 2.9254, + "step": 18513 + }, + { + "epoch": 1.1492954249177478, + "grad_norm": 0.2187578761862669, + "learning_rate": 7.75103068823759e-05, + "loss": 2.9223, + "step": 18514 + }, + { + "epoch": 1.1493575020175057, + "grad_norm": 0.15582236541329056, + "learning_rate": 7.750729108367615e-05, + "loss": 2.9264, + "step": 18515 + }, + { + "epoch": 1.1494195791172637, + "grad_norm": 0.1754492750850393, + "learning_rate": 7.750427514146421e-05, + "loss": 2.9537, + "step": 18516 + }, + { + "epoch": 1.1494816562170216, + "grad_norm": 0.1591709930292134, + "learning_rate": 7.750125905575579e-05, + "loss": 2.8125, + "step": 18517 + }, + { + "epoch": 1.1495437333167795, + "grad_norm": 0.18924589719948548, + "learning_rate": 7.74982428265666e-05, + "loss": 2.9118, + "step": 18518 + }, + { + "epoch": 1.1496058104165374, + "grad_norm": 0.21551236237560875, + "learning_rate": 7.749522645391243e-05, + "loss": 2.9222, + "step": 18519 + }, + { + "epoch": 1.1496678875162951, + "grad_norm": 0.19048314444621878, + "learning_rate": 7.749220993780897e-05, + "loss": 2.8105, + "step": 18520 + }, + { + "epoch": 1.149729964616053, + "grad_norm": 0.15410639679146734, + "learning_rate": 7.748919327827198e-05, + "loss": 2.8691, + "step": 18521 + }, + { + "epoch": 1.149792041715811, + "grad_norm": 0.17998503846335093, + "learning_rate": 7.748617647531721e-05, + "loss": 2.9159, + "step": 18522 + }, + { + "epoch": 1.1498541188155689, + "grad_norm": 0.17363608392609406, + "learning_rate": 7.748315952896037e-05, + "loss": 2.9297, + "step": 18523 + }, + { + "epoch": 1.1499161959153268, + "grad_norm": 0.16262482653820454, + "learning_rate": 7.748014243921722e-05, + "loss": 2.8727, + "step": 18524 + }, + { + "epoch": 1.1499782730150847, + "grad_norm": 0.1633404955086345, + "learning_rate": 7.74771252061035e-05, + "loss": 2.9683, + "step": 18525 + }, + { + "epoch": 1.1500403501148426, + "grad_norm": 0.19660982332509705, + "learning_rate": 7.747410782963495e-05, + "loss": 2.8455, + "step": 18526 + }, + { + "epoch": 1.1501024272146005, + "grad_norm": 0.17982171063638472, + "learning_rate": 7.747109030982729e-05, + "loss": 2.9642, + "step": 18527 + }, + { + "epoch": 1.1501645043143585, + "grad_norm": 0.15728117308339057, + "learning_rate": 7.74680726466963e-05, + "loss": 2.9828, + "step": 18528 + }, + { + "epoch": 1.1502265814141164, + "grad_norm": 0.18823350047632464, + "learning_rate": 7.74650548402577e-05, + "loss": 2.8459, + "step": 18529 + }, + { + "epoch": 1.1502886585138743, + "grad_norm": 0.20156689165934827, + "learning_rate": 7.746203689052725e-05, + "loss": 2.8704, + "step": 18530 + }, + { + "epoch": 1.1503507356136322, + "grad_norm": 0.1731305053761714, + "learning_rate": 7.745901879752067e-05, + "loss": 2.8995, + "step": 18531 + }, + { + "epoch": 1.1504128127133901, + "grad_norm": 0.16268918894844742, + "learning_rate": 7.745600056125373e-05, + "loss": 2.8825, + "step": 18532 + }, + { + "epoch": 1.150474889813148, + "grad_norm": 0.17469879757886883, + "learning_rate": 7.745298218174219e-05, + "loss": 2.9451, + "step": 18533 + }, + { + "epoch": 1.1505369669129057, + "grad_norm": 0.15804382828525612, + "learning_rate": 7.744996365900175e-05, + "loss": 2.8828, + "step": 18534 + }, + { + "epoch": 1.1505990440126637, + "grad_norm": 0.1774557465808636, + "learning_rate": 7.744694499304821e-05, + "loss": 2.9121, + "step": 18535 + }, + { + "epoch": 1.1506611211124216, + "grad_norm": 0.15671343027063767, + "learning_rate": 7.744392618389727e-05, + "loss": 2.8547, + "step": 18536 + }, + { + "epoch": 1.1507231982121795, + "grad_norm": 0.1566770915441936, + "learning_rate": 7.744090723156472e-05, + "loss": 2.7901, + "step": 18537 + }, + { + "epoch": 1.1507852753119374, + "grad_norm": 0.17660637158564735, + "learning_rate": 7.743788813606629e-05, + "loss": 2.9786, + "step": 18538 + }, + { + "epoch": 1.1508473524116953, + "grad_norm": 0.177252745700575, + "learning_rate": 7.743486889741774e-05, + "loss": 2.9335, + "step": 18539 + }, + { + "epoch": 1.1509094295114533, + "grad_norm": 0.16698952165226963, + "learning_rate": 7.743184951563482e-05, + "loss": 2.8711, + "step": 18540 + }, + { + "epoch": 1.1509715066112112, + "grad_norm": 0.19909771814315227, + "learning_rate": 7.742882999073329e-05, + "loss": 2.7939, + "step": 18541 + }, + { + "epoch": 1.151033583710969, + "grad_norm": 0.26015857145631177, + "learning_rate": 7.742581032272889e-05, + "loss": 2.9398, + "step": 18542 + }, + { + "epoch": 1.151095660810727, + "grad_norm": 0.16286809157067852, + "learning_rate": 7.742279051163737e-05, + "loss": 2.8717, + "step": 18543 + }, + { + "epoch": 1.1511577379104847, + "grad_norm": 0.2742706684402297, + "learning_rate": 7.741977055747449e-05, + "loss": 2.9006, + "step": 18544 + }, + { + "epoch": 1.1512198150102426, + "grad_norm": 0.23049191988841647, + "learning_rate": 7.741675046025602e-05, + "loss": 2.7859, + "step": 18545 + }, + { + "epoch": 1.1512818921100005, + "grad_norm": 0.1894392126421911, + "learning_rate": 7.741373021999772e-05, + "loss": 2.9419, + "step": 18546 + }, + { + "epoch": 1.1513439692097585, + "grad_norm": 0.1889531232959568, + "learning_rate": 7.741070983671532e-05, + "loss": 2.8983, + "step": 18547 + }, + { + "epoch": 1.1514060463095164, + "grad_norm": 0.1749846528340016, + "learning_rate": 7.740768931042459e-05, + "loss": 2.8596, + "step": 18548 + }, + { + "epoch": 1.1514681234092743, + "grad_norm": 0.1720909907270634, + "learning_rate": 7.740466864114128e-05, + "loss": 2.9977, + "step": 18549 + }, + { + "epoch": 1.1515302005090322, + "grad_norm": 0.36700229844015436, + "learning_rate": 7.74016478288812e-05, + "loss": 2.8295, + "step": 18550 + }, + { + "epoch": 1.1515922776087901, + "grad_norm": 0.1892205314804371, + "learning_rate": 7.739862687366004e-05, + "loss": 2.9216, + "step": 18551 + }, + { + "epoch": 1.151654354708548, + "grad_norm": 0.19030392724571274, + "learning_rate": 7.739560577549361e-05, + "loss": 2.8802, + "step": 18552 + }, + { + "epoch": 1.151716431808306, + "grad_norm": 0.16970684433017008, + "learning_rate": 7.739258453439764e-05, + "loss": 2.8884, + "step": 18553 + }, + { + "epoch": 1.1517785089080639, + "grad_norm": 0.2538746887545084, + "learning_rate": 7.738956315038791e-05, + "loss": 2.9442, + "step": 18554 + }, + { + "epoch": 1.1518405860078218, + "grad_norm": 0.1825139677078459, + "learning_rate": 7.738654162348018e-05, + "loss": 2.8922, + "step": 18555 + }, + { + "epoch": 1.1519026631075797, + "grad_norm": 0.15658246075853327, + "learning_rate": 7.738351995369023e-05, + "loss": 2.8355, + "step": 18556 + }, + { + "epoch": 1.1519647402073376, + "grad_norm": 0.34217852402153975, + "learning_rate": 7.738049814103379e-05, + "loss": 2.9224, + "step": 18557 + }, + { + "epoch": 1.1520268173070953, + "grad_norm": 0.1859757135696762, + "learning_rate": 7.737747618552664e-05, + "loss": 2.9124, + "step": 18558 + }, + { + "epoch": 1.1520888944068532, + "grad_norm": 0.18559683077057396, + "learning_rate": 7.737445408718457e-05, + "loss": 2.9309, + "step": 18559 + }, + { + "epoch": 1.1521509715066112, + "grad_norm": 0.21206790420308697, + "learning_rate": 7.737143184602331e-05, + "loss": 2.9269, + "step": 18560 + }, + { + "epoch": 1.152213048606369, + "grad_norm": 0.211906397808357, + "learning_rate": 7.736840946205866e-05, + "loss": 2.9661, + "step": 18561 + }, + { + "epoch": 1.152275125706127, + "grad_norm": 0.18773916012193606, + "learning_rate": 7.736538693530638e-05, + "loss": 2.9414, + "step": 18562 + }, + { + "epoch": 1.152337202805885, + "grad_norm": 0.2051741148324044, + "learning_rate": 7.73623642657822e-05, + "loss": 2.9408, + "step": 18563 + }, + { + "epoch": 1.1523992799056428, + "grad_norm": 0.1953097724724102, + "learning_rate": 7.735934145350195e-05, + "loss": 2.9315, + "step": 18564 + }, + { + "epoch": 1.1524613570054008, + "grad_norm": 0.2080326602762494, + "learning_rate": 7.735631849848136e-05, + "loss": 2.8341, + "step": 18565 + }, + { + "epoch": 1.1525234341051587, + "grad_norm": 0.17711330725802032, + "learning_rate": 7.735329540073621e-05, + "loss": 2.8357, + "step": 18566 + }, + { + "epoch": 1.1525855112049166, + "grad_norm": 0.19405259990164353, + "learning_rate": 7.73502721602823e-05, + "loss": 2.922, + "step": 18567 + }, + { + "epoch": 1.1526475883046743, + "grad_norm": 0.17986279727283705, + "learning_rate": 7.734724877713537e-05, + "loss": 2.8458, + "step": 18568 + }, + { + "epoch": 1.1527096654044322, + "grad_norm": 0.20323999524331796, + "learning_rate": 7.734422525131121e-05, + "loss": 2.8854, + "step": 18569 + }, + { + "epoch": 1.1527717425041901, + "grad_norm": 0.1956189040980108, + "learning_rate": 7.734120158282559e-05, + "loss": 2.8736, + "step": 18570 + }, + { + "epoch": 1.152833819603948, + "grad_norm": 0.16993871145037573, + "learning_rate": 7.733817777169428e-05, + "loss": 2.9382, + "step": 18571 + }, + { + "epoch": 1.152895896703706, + "grad_norm": 0.17753829200501897, + "learning_rate": 7.733515381793304e-05, + "loss": 2.8686, + "step": 18572 + }, + { + "epoch": 1.1529579738034639, + "grad_norm": 0.15247241724431573, + "learning_rate": 7.73321297215577e-05, + "loss": 2.9693, + "step": 18573 + }, + { + "epoch": 1.1530200509032218, + "grad_norm": 0.1602033278425758, + "learning_rate": 7.732910548258399e-05, + "loss": 2.8774, + "step": 18574 + }, + { + "epoch": 1.1530821280029797, + "grad_norm": 0.19858692993350688, + "learning_rate": 7.73260811010277e-05, + "loss": 2.8505, + "step": 18575 + }, + { + "epoch": 1.1531442051027376, + "grad_norm": 0.15865417630626988, + "learning_rate": 7.732305657690461e-05, + "loss": 2.9711, + "step": 18576 + }, + { + "epoch": 1.1532062822024955, + "grad_norm": 0.1880367959573888, + "learning_rate": 7.732003191023049e-05, + "loss": 2.886, + "step": 18577 + }, + { + "epoch": 1.1532683593022535, + "grad_norm": 0.1679774271032303, + "learning_rate": 7.731700710102115e-05, + "loss": 2.8679, + "step": 18578 + }, + { + "epoch": 1.1533304364020114, + "grad_norm": 0.18566449101689747, + "learning_rate": 7.731398214929235e-05, + "loss": 2.9598, + "step": 18579 + }, + { + "epoch": 1.1533925135017693, + "grad_norm": 0.17829197619816115, + "learning_rate": 7.731095705505988e-05, + "loss": 2.867, + "step": 18580 + }, + { + "epoch": 1.1534545906015272, + "grad_norm": 0.1560766152169389, + "learning_rate": 7.73079318183395e-05, + "loss": 2.9155, + "step": 18581 + }, + { + "epoch": 1.153516667701285, + "grad_norm": 0.1683391988219439, + "learning_rate": 7.730490643914701e-05, + "loss": 2.81, + "step": 18582 + }, + { + "epoch": 1.1535787448010428, + "grad_norm": 0.18510506884287592, + "learning_rate": 7.730188091749822e-05, + "loss": 2.8874, + "step": 18583 + }, + { + "epoch": 1.1536408219008008, + "grad_norm": 0.18159610448796804, + "learning_rate": 7.729885525340887e-05, + "loss": 2.9658, + "step": 18584 + }, + { + "epoch": 1.1537028990005587, + "grad_norm": 0.15700915214337957, + "learning_rate": 7.729582944689476e-05, + "loss": 2.9285, + "step": 18585 + }, + { + "epoch": 1.1537649761003166, + "grad_norm": 0.17330212249804333, + "learning_rate": 7.72928034979717e-05, + "loss": 2.8373, + "step": 18586 + }, + { + "epoch": 1.1538270532000745, + "grad_norm": 0.16828667139619916, + "learning_rate": 7.728977740665543e-05, + "loss": 2.9237, + "step": 18587 + }, + { + "epoch": 1.1538891302998324, + "grad_norm": 0.14905804248558824, + "learning_rate": 7.72867511729618e-05, + "loss": 2.9721, + "step": 18588 + }, + { + "epoch": 1.1539512073995903, + "grad_norm": 0.16473815505387215, + "learning_rate": 7.728372479690656e-05, + "loss": 2.8524, + "step": 18589 + }, + { + "epoch": 1.1540132844993483, + "grad_norm": 0.15150675953013434, + "learning_rate": 7.728069827850548e-05, + "loss": 2.8486, + "step": 18590 + }, + { + "epoch": 1.1540753615991062, + "grad_norm": 0.17177783864065493, + "learning_rate": 7.72776716177744e-05, + "loss": 2.8746, + "step": 18591 + }, + { + "epoch": 1.1541374386988639, + "grad_norm": 0.1620933283481736, + "learning_rate": 7.727464481472907e-05, + "loss": 2.7487, + "step": 18592 + }, + { + "epoch": 1.1541995157986218, + "grad_norm": 0.18606631200830184, + "learning_rate": 7.727161786938531e-05, + "loss": 2.8719, + "step": 18593 + }, + { + "epoch": 1.1542615928983797, + "grad_norm": 0.23252926277005634, + "learning_rate": 7.726859078175887e-05, + "loss": 2.9447, + "step": 18594 + }, + { + "epoch": 1.1543236699981376, + "grad_norm": 0.16808304114198921, + "learning_rate": 7.72655635518656e-05, + "loss": 2.9246, + "step": 18595 + }, + { + "epoch": 1.1543857470978955, + "grad_norm": 0.17074737957225483, + "learning_rate": 7.726253617972126e-05, + "loss": 2.916, + "step": 18596 + }, + { + "epoch": 1.1544478241976535, + "grad_norm": 0.17727063011813518, + "learning_rate": 7.725950866534165e-05, + "loss": 2.9178, + "step": 18597 + }, + { + "epoch": 1.1545099012974114, + "grad_norm": 0.17227128791583798, + "learning_rate": 7.725648100874257e-05, + "loss": 2.921, + "step": 18598 + }, + { + "epoch": 1.1545719783971693, + "grad_norm": 0.16459236977814934, + "learning_rate": 7.725345320993979e-05, + "loss": 2.8614, + "step": 18599 + }, + { + "epoch": 1.1546340554969272, + "grad_norm": 0.19547400472821583, + "learning_rate": 7.725042526894915e-05, + "loss": 2.9504, + "step": 18600 + }, + { + "epoch": 1.1546961325966851, + "grad_norm": 0.1629057419803937, + "learning_rate": 7.724739718578642e-05, + "loss": 2.8846, + "step": 18601 + }, + { + "epoch": 1.154758209696443, + "grad_norm": 0.1929481642086382, + "learning_rate": 7.724436896046741e-05, + "loss": 2.8346, + "step": 18602 + }, + { + "epoch": 1.154820286796201, + "grad_norm": 0.17143174473895806, + "learning_rate": 7.72413405930079e-05, + "loss": 2.9678, + "step": 18603 + }, + { + "epoch": 1.1548823638959589, + "grad_norm": 0.14939637172910322, + "learning_rate": 7.723831208342371e-05, + "loss": 2.9161, + "step": 18604 + }, + { + "epoch": 1.1549444409957168, + "grad_norm": 0.17338026408267243, + "learning_rate": 7.723528343173066e-05, + "loss": 2.8906, + "step": 18605 + }, + { + "epoch": 1.1550065180954745, + "grad_norm": 0.15924372291882116, + "learning_rate": 7.723225463794449e-05, + "loss": 2.8595, + "step": 18606 + }, + { + "epoch": 1.1550685951952324, + "grad_norm": 0.20206123350827926, + "learning_rate": 7.722922570208104e-05, + "loss": 2.9169, + "step": 18607 + }, + { + "epoch": 1.1551306722949903, + "grad_norm": 0.15887681546850968, + "learning_rate": 7.722619662415612e-05, + "loss": 2.8695, + "step": 18608 + }, + { + "epoch": 1.1551927493947483, + "grad_norm": 0.1705503952407479, + "learning_rate": 7.722316740418551e-05, + "loss": 2.8983, + "step": 18609 + }, + { + "epoch": 1.1552548264945062, + "grad_norm": 0.1520104430095021, + "learning_rate": 7.722013804218505e-05, + "loss": 2.8952, + "step": 18610 + }, + { + "epoch": 1.155316903594264, + "grad_norm": 0.1870030035074783, + "learning_rate": 7.72171085381705e-05, + "loss": 2.8636, + "step": 18611 + }, + { + "epoch": 1.155378980694022, + "grad_norm": 0.2146790393261634, + "learning_rate": 7.721407889215771e-05, + "loss": 2.8901, + "step": 18612 + }, + { + "epoch": 1.15544105779378, + "grad_norm": 0.18083801143355327, + "learning_rate": 7.721104910416243e-05, + "loss": 2.954, + "step": 18613 + }, + { + "epoch": 1.1555031348935378, + "grad_norm": 0.20928672951047997, + "learning_rate": 7.720801917420053e-05, + "loss": 2.8703, + "step": 18614 + }, + { + "epoch": 1.1555652119932958, + "grad_norm": 0.15562299452297623, + "learning_rate": 7.720498910228777e-05, + "loss": 2.8798, + "step": 18615 + }, + { + "epoch": 1.1556272890930535, + "grad_norm": 0.18894749445953662, + "learning_rate": 7.720195888844e-05, + "loss": 2.9301, + "step": 18616 + }, + { + "epoch": 1.1556893661928114, + "grad_norm": 0.17255788845054126, + "learning_rate": 7.7198928532673e-05, + "loss": 2.9601, + "step": 18617 + }, + { + "epoch": 1.1557514432925693, + "grad_norm": 0.18282039576376508, + "learning_rate": 7.719589803500257e-05, + "loss": 2.8822, + "step": 18618 + }, + { + "epoch": 1.1558135203923272, + "grad_norm": 0.182046201607609, + "learning_rate": 7.719286739544455e-05, + "loss": 2.9078, + "step": 18619 + }, + { + "epoch": 1.1558755974920851, + "grad_norm": 0.1657074985979107, + "learning_rate": 7.718983661401473e-05, + "loss": 3.0147, + "step": 18620 + }, + { + "epoch": 1.155937674591843, + "grad_norm": 0.15956619355431445, + "learning_rate": 7.718680569072895e-05, + "loss": 2.919, + "step": 18621 + }, + { + "epoch": 1.155999751691601, + "grad_norm": 0.16352904535801585, + "learning_rate": 7.7183774625603e-05, + "loss": 2.8915, + "step": 18622 + }, + { + "epoch": 1.1560618287913589, + "grad_norm": 0.18573363090886608, + "learning_rate": 7.718074341865269e-05, + "loss": 2.8962, + "step": 18623 + }, + { + "epoch": 1.1561239058911168, + "grad_norm": 0.16621571431989074, + "learning_rate": 7.717771206989385e-05, + "loss": 2.8741, + "step": 18624 + }, + { + "epoch": 1.1561859829908747, + "grad_norm": 0.1755128825185756, + "learning_rate": 7.717468057934229e-05, + "loss": 2.8422, + "step": 18625 + }, + { + "epoch": 1.1562480600906326, + "grad_norm": 0.15587414597427995, + "learning_rate": 7.717164894701381e-05, + "loss": 2.9126, + "step": 18626 + }, + { + "epoch": 1.1563101371903906, + "grad_norm": 0.16017802020720953, + "learning_rate": 7.716861717292425e-05, + "loss": 2.9075, + "step": 18627 + }, + { + "epoch": 1.1563722142901485, + "grad_norm": 0.16301986932215579, + "learning_rate": 7.716558525708944e-05, + "loss": 2.9333, + "step": 18628 + }, + { + "epoch": 1.1564342913899064, + "grad_norm": 0.15857357639107933, + "learning_rate": 7.716255319952515e-05, + "loss": 2.8875, + "step": 18629 + }, + { + "epoch": 1.156496368489664, + "grad_norm": 0.17576996979017714, + "learning_rate": 7.715952100024723e-05, + "loss": 2.847, + "step": 18630 + }, + { + "epoch": 1.156558445589422, + "grad_norm": 0.1827549236465384, + "learning_rate": 7.71564886592715e-05, + "loss": 2.8409, + "step": 18631 + }, + { + "epoch": 1.15662052268918, + "grad_norm": 0.2074155928671575, + "learning_rate": 7.715345617661377e-05, + "loss": 2.9229, + "step": 18632 + }, + { + "epoch": 1.1566825997889378, + "grad_norm": 0.22115917686788475, + "learning_rate": 7.715042355228987e-05, + "loss": 2.8641, + "step": 18633 + }, + { + "epoch": 1.1567446768886958, + "grad_norm": 0.1887795084030121, + "learning_rate": 7.714739078631563e-05, + "loss": 2.8726, + "step": 18634 + }, + { + "epoch": 1.1568067539884537, + "grad_norm": 0.24182971552610258, + "learning_rate": 7.714435787870685e-05, + "loss": 2.9345, + "step": 18635 + }, + { + "epoch": 1.1568688310882116, + "grad_norm": 0.1972570228589561, + "learning_rate": 7.714132482947937e-05, + "loss": 2.8613, + "step": 18636 + }, + { + "epoch": 1.1569309081879695, + "grad_norm": 0.21255894292262767, + "learning_rate": 7.713829163864901e-05, + "loss": 2.9334, + "step": 18637 + }, + { + "epoch": 1.1569929852877274, + "grad_norm": 0.17822420302464487, + "learning_rate": 7.713525830623159e-05, + "loss": 2.7898, + "step": 18638 + }, + { + "epoch": 1.1570550623874853, + "grad_norm": 0.17535314350600895, + "learning_rate": 7.713222483224294e-05, + "loss": 2.9023, + "step": 18639 + }, + { + "epoch": 1.157117139487243, + "grad_norm": 0.174834920090909, + "learning_rate": 7.712919121669889e-05, + "loss": 2.9014, + "step": 18640 + }, + { + "epoch": 1.157179216587001, + "grad_norm": 0.16576026462794952, + "learning_rate": 7.712615745961527e-05, + "loss": 2.7882, + "step": 18641 + }, + { + "epoch": 1.1572412936867589, + "grad_norm": 0.19915389894085092, + "learning_rate": 7.71231235610079e-05, + "loss": 2.8876, + "step": 18642 + }, + { + "epoch": 1.1573033707865168, + "grad_norm": 0.17486851874707435, + "learning_rate": 7.71200895208926e-05, + "loss": 2.8647, + "step": 18643 + }, + { + "epoch": 1.1573654478862747, + "grad_norm": 0.16867713370524928, + "learning_rate": 7.711705533928521e-05, + "loss": 2.9393, + "step": 18644 + }, + { + "epoch": 1.1574275249860326, + "grad_norm": 0.17639531854323348, + "learning_rate": 7.711402101620158e-05, + "loss": 2.8659, + "step": 18645 + }, + { + "epoch": 1.1574896020857905, + "grad_norm": 0.18209183957492348, + "learning_rate": 7.711098655165749e-05, + "loss": 2.8877, + "step": 18646 + }, + { + "epoch": 1.1575516791855485, + "grad_norm": 0.16418005501022792, + "learning_rate": 7.710795194566882e-05, + "loss": 2.9279, + "step": 18647 + }, + { + "epoch": 1.1576137562853064, + "grad_norm": 0.17253328104245305, + "learning_rate": 7.710491719825138e-05, + "loss": 2.9126, + "step": 18648 + }, + { + "epoch": 1.1576758333850643, + "grad_norm": 0.181600812425088, + "learning_rate": 7.710188230942102e-05, + "loss": 2.9184, + "step": 18649 + }, + { + "epoch": 1.1577379104848222, + "grad_norm": 0.1694398433621051, + "learning_rate": 7.709884727919353e-05, + "loss": 2.8477, + "step": 18650 + }, + { + "epoch": 1.1577999875845801, + "grad_norm": 0.1709219562151261, + "learning_rate": 7.70958121075848e-05, + "loss": 2.8751, + "step": 18651 + }, + { + "epoch": 1.157862064684338, + "grad_norm": 0.16534192888767082, + "learning_rate": 7.709277679461064e-05, + "loss": 2.9286, + "step": 18652 + }, + { + "epoch": 1.1579241417840958, + "grad_norm": 0.20815468022353478, + "learning_rate": 7.708974134028687e-05, + "loss": 2.9472, + "step": 18653 + }, + { + "epoch": 1.1579862188838537, + "grad_norm": 0.18085887818237353, + "learning_rate": 7.708670574462935e-05, + "loss": 2.9152, + "step": 18654 + }, + { + "epoch": 1.1580482959836116, + "grad_norm": 0.18477899369956216, + "learning_rate": 7.708367000765393e-05, + "loss": 2.839, + "step": 18655 + }, + { + "epoch": 1.1581103730833695, + "grad_norm": 0.15128766709050326, + "learning_rate": 7.708063412937641e-05, + "loss": 2.8848, + "step": 18656 + }, + { + "epoch": 1.1581724501831274, + "grad_norm": 0.18973501607979323, + "learning_rate": 7.707759810981265e-05, + "loss": 2.9672, + "step": 18657 + }, + { + "epoch": 1.1582345272828853, + "grad_norm": 0.21697535342233748, + "learning_rate": 7.707456194897848e-05, + "loss": 2.8358, + "step": 18658 + }, + { + "epoch": 1.1582966043826433, + "grad_norm": 0.16016894383251476, + "learning_rate": 7.707152564688975e-05, + "loss": 2.8891, + "step": 18659 + }, + { + "epoch": 1.1583586814824012, + "grad_norm": 0.1596617157904518, + "learning_rate": 7.70684892035623e-05, + "loss": 2.8946, + "step": 18660 + }, + { + "epoch": 1.158420758582159, + "grad_norm": 0.19014746464642227, + "learning_rate": 7.7065452619012e-05, + "loss": 3.0138, + "step": 18661 + }, + { + "epoch": 1.158482835681917, + "grad_norm": 0.20625287509128065, + "learning_rate": 7.706241589325462e-05, + "loss": 2.8531, + "step": 18662 + }, + { + "epoch": 1.158544912781675, + "grad_norm": 0.2121643925895417, + "learning_rate": 7.705937902630606e-05, + "loss": 2.88, + "step": 18663 + }, + { + "epoch": 1.1586069898814326, + "grad_norm": 0.17625615816822407, + "learning_rate": 7.705634201818216e-05, + "loss": 2.9849, + "step": 18664 + }, + { + "epoch": 1.1586690669811905, + "grad_norm": 0.17678684185922158, + "learning_rate": 7.705330486889874e-05, + "loss": 3.0463, + "step": 18665 + }, + { + "epoch": 1.1587311440809485, + "grad_norm": 0.21972961423662002, + "learning_rate": 7.705026757847168e-05, + "loss": 2.8522, + "step": 18666 + }, + { + "epoch": 1.1587932211807064, + "grad_norm": 0.17961765051216225, + "learning_rate": 7.70472301469168e-05, + "loss": 2.9004, + "step": 18667 + }, + { + "epoch": 1.1588552982804643, + "grad_norm": 0.1828155664040079, + "learning_rate": 7.704419257424995e-05, + "loss": 2.9173, + "step": 18668 + }, + { + "epoch": 1.1589173753802222, + "grad_norm": 0.17758292777820547, + "learning_rate": 7.704115486048699e-05, + "loss": 2.9281, + "step": 18669 + }, + { + "epoch": 1.1589794524799801, + "grad_norm": 0.2012935474176679, + "learning_rate": 7.703811700564376e-05, + "loss": 2.9608, + "step": 18670 + }, + { + "epoch": 1.159041529579738, + "grad_norm": 0.1659553867333582, + "learning_rate": 7.703507900973611e-05, + "loss": 2.8876, + "step": 18671 + }, + { + "epoch": 1.159103606679496, + "grad_norm": 0.18037521365860876, + "learning_rate": 7.703204087277988e-05, + "loss": 2.9367, + "step": 18672 + }, + { + "epoch": 1.1591656837792539, + "grad_norm": 0.17243166747500505, + "learning_rate": 7.702900259479094e-05, + "loss": 2.8425, + "step": 18673 + }, + { + "epoch": 1.1592277608790118, + "grad_norm": 0.21227015624195888, + "learning_rate": 7.702596417578514e-05, + "loss": 2.903, + "step": 18674 + }, + { + "epoch": 1.1592898379787697, + "grad_norm": 0.18416354856037875, + "learning_rate": 7.70229256157783e-05, + "loss": 2.9335, + "step": 18675 + }, + { + "epoch": 1.1593519150785276, + "grad_norm": 0.18805970655720222, + "learning_rate": 7.701988691478633e-05, + "loss": 2.7503, + "step": 18676 + }, + { + "epoch": 1.1594139921782853, + "grad_norm": 0.16640594263003622, + "learning_rate": 7.701684807282502e-05, + "loss": 2.8169, + "step": 18677 + }, + { + "epoch": 1.1594760692780433, + "grad_norm": 0.14939519351668307, + "learning_rate": 7.701380908991027e-05, + "loss": 2.9452, + "step": 18678 + }, + { + "epoch": 1.1595381463778012, + "grad_norm": 0.16203727037801544, + "learning_rate": 7.701076996605791e-05, + "loss": 2.8319, + "step": 18679 + }, + { + "epoch": 1.159600223477559, + "grad_norm": 0.15607764155234433, + "learning_rate": 7.70077307012838e-05, + "loss": 2.8125, + "step": 18680 + }, + { + "epoch": 1.159662300577317, + "grad_norm": 0.17066980054119008, + "learning_rate": 7.700469129560382e-05, + "loss": 2.9082, + "step": 18681 + }, + { + "epoch": 1.159724377677075, + "grad_norm": 0.15363025923100215, + "learning_rate": 7.70016517490338e-05, + "loss": 2.9181, + "step": 18682 + }, + { + "epoch": 1.1597864547768328, + "grad_norm": 0.16620101114614177, + "learning_rate": 7.699861206158963e-05, + "loss": 2.8395, + "step": 18683 + }, + { + "epoch": 1.1598485318765908, + "grad_norm": 0.16256589837135532, + "learning_rate": 7.69955722332871e-05, + "loss": 2.9485, + "step": 18684 + }, + { + "epoch": 1.1599106089763487, + "grad_norm": 0.15490966746854856, + "learning_rate": 7.699253226414216e-05, + "loss": 2.7945, + "step": 18685 + }, + { + "epoch": 1.1599726860761066, + "grad_norm": 0.16480883504273902, + "learning_rate": 7.698949215417061e-05, + "loss": 2.9733, + "step": 18686 + }, + { + "epoch": 1.1600347631758645, + "grad_norm": 0.16233139461022839, + "learning_rate": 7.698645190338834e-05, + "loss": 2.8904, + "step": 18687 + }, + { + "epoch": 1.1600968402756222, + "grad_norm": 0.16918926596803208, + "learning_rate": 7.698341151181118e-05, + "loss": 2.8699, + "step": 18688 + }, + { + "epoch": 1.1601589173753801, + "grad_norm": 0.1567257255207446, + "learning_rate": 7.698037097945503e-05, + "loss": 2.8523, + "step": 18689 + }, + { + "epoch": 1.160220994475138, + "grad_norm": 0.1819172465361322, + "learning_rate": 7.697733030633571e-05, + "loss": 2.9811, + "step": 18690 + }, + { + "epoch": 1.160283071574896, + "grad_norm": 0.3500484820404236, + "learning_rate": 7.697428949246913e-05, + "loss": 2.9508, + "step": 18691 + }, + { + "epoch": 1.1603451486746539, + "grad_norm": 0.1916296200521915, + "learning_rate": 7.697124853787113e-05, + "loss": 2.8859, + "step": 18692 + }, + { + "epoch": 1.1604072257744118, + "grad_norm": 0.18559982146702084, + "learning_rate": 7.696820744255757e-05, + "loss": 2.9446, + "step": 18693 + }, + { + "epoch": 1.1604693028741697, + "grad_norm": 0.1593835963941457, + "learning_rate": 7.696516620654434e-05, + "loss": 2.9042, + "step": 18694 + }, + { + "epoch": 1.1605313799739276, + "grad_norm": 0.23946017835226102, + "learning_rate": 7.696212482984728e-05, + "loss": 2.9103, + "step": 18695 + }, + { + "epoch": 1.1605934570736856, + "grad_norm": 0.17158650982417445, + "learning_rate": 7.695908331248227e-05, + "loss": 3.0124, + "step": 18696 + }, + { + "epoch": 1.1606555341734435, + "grad_norm": 0.19934392250389651, + "learning_rate": 7.695604165446518e-05, + "loss": 2.7866, + "step": 18697 + }, + { + "epoch": 1.1607176112732014, + "grad_norm": 0.14652263141797822, + "learning_rate": 7.695299985581188e-05, + "loss": 2.8396, + "step": 18698 + }, + { + "epoch": 1.1607796883729593, + "grad_norm": 0.16733951030742414, + "learning_rate": 7.694995791653823e-05, + "loss": 2.8662, + "step": 18699 + }, + { + "epoch": 1.1608417654727172, + "grad_norm": 0.14563359194282421, + "learning_rate": 7.694691583666012e-05, + "loss": 2.8951, + "step": 18700 + }, + { + "epoch": 1.160903842572475, + "grad_norm": 0.214256176729313, + "learning_rate": 7.69438736161934e-05, + "loss": 2.9757, + "step": 18701 + }, + { + "epoch": 1.1609659196722328, + "grad_norm": 0.2053559827845842, + "learning_rate": 7.694083125515395e-05, + "loss": 2.8639, + "step": 18702 + }, + { + "epoch": 1.1610279967719908, + "grad_norm": 0.15479067362389362, + "learning_rate": 7.693778875355764e-05, + "loss": 2.8699, + "step": 18703 + }, + { + "epoch": 1.1610900738717487, + "grad_norm": 0.15154616941387472, + "learning_rate": 7.693474611142035e-05, + "loss": 2.9614, + "step": 18704 + }, + { + "epoch": 1.1611521509715066, + "grad_norm": 0.15640615170768218, + "learning_rate": 7.693170332875794e-05, + "loss": 2.9712, + "step": 18705 + }, + { + "epoch": 1.1612142280712645, + "grad_norm": 0.15001149506034828, + "learning_rate": 7.69286604055863e-05, + "loss": 2.8491, + "step": 18706 + }, + { + "epoch": 1.1612763051710224, + "grad_norm": 0.16320535433658623, + "learning_rate": 7.692561734192132e-05, + "loss": 2.8446, + "step": 18707 + }, + { + "epoch": 1.1613383822707803, + "grad_norm": 0.19633131379081625, + "learning_rate": 7.692257413777886e-05, + "loss": 2.7804, + "step": 18708 + }, + { + "epoch": 1.1614004593705383, + "grad_norm": 0.16955487310830633, + "learning_rate": 7.691953079317478e-05, + "loss": 2.7922, + "step": 18709 + }, + { + "epoch": 1.1614625364702962, + "grad_norm": 0.16581143556519917, + "learning_rate": 7.691648730812497e-05, + "loss": 2.9826, + "step": 18710 + }, + { + "epoch": 1.161524613570054, + "grad_norm": 0.14750922593463459, + "learning_rate": 7.691344368264531e-05, + "loss": 2.8234, + "step": 18711 + }, + { + "epoch": 1.1615866906698118, + "grad_norm": 0.1613367581770788, + "learning_rate": 7.691039991675169e-05, + "loss": 2.8849, + "step": 18712 + }, + { + "epoch": 1.1616487677695697, + "grad_norm": 0.16009730905958475, + "learning_rate": 7.690735601045998e-05, + "loss": 2.9409, + "step": 18713 + }, + { + "epoch": 1.1617108448693276, + "grad_norm": 0.1904863713337979, + "learning_rate": 7.690431196378604e-05, + "loss": 2.8643, + "step": 18714 + }, + { + "epoch": 1.1617729219690855, + "grad_norm": 0.17890996221164243, + "learning_rate": 7.69012677767458e-05, + "loss": 2.9377, + "step": 18715 + }, + { + "epoch": 1.1618349990688435, + "grad_norm": 0.17871108341732217, + "learning_rate": 7.689822344935511e-05, + "loss": 2.8433, + "step": 18716 + }, + { + "epoch": 1.1618970761686014, + "grad_norm": 0.19423732460058732, + "learning_rate": 7.689517898162985e-05, + "loss": 2.8629, + "step": 18717 + }, + { + "epoch": 1.1619591532683593, + "grad_norm": 0.16368654294624504, + "learning_rate": 7.689213437358593e-05, + "loss": 2.9405, + "step": 18718 + }, + { + "epoch": 1.1620212303681172, + "grad_norm": 0.17442071288993855, + "learning_rate": 7.688908962523921e-05, + "loss": 2.8211, + "step": 18719 + }, + { + "epoch": 1.1620833074678751, + "grad_norm": 0.16868879435686804, + "learning_rate": 7.688604473660557e-05, + "loss": 2.8709, + "step": 18720 + }, + { + "epoch": 1.162145384567633, + "grad_norm": 0.18854060098090475, + "learning_rate": 7.688299970770093e-05, + "loss": 2.8741, + "step": 18721 + }, + { + "epoch": 1.162207461667391, + "grad_norm": 0.22997650823402327, + "learning_rate": 7.687995453854113e-05, + "loss": 2.8802, + "step": 18722 + }, + { + "epoch": 1.162269538767149, + "grad_norm": 0.17741048036349125, + "learning_rate": 7.687690922914209e-05, + "loss": 2.9801, + "step": 18723 + }, + { + "epoch": 1.1623316158669068, + "grad_norm": 0.18764728483435827, + "learning_rate": 7.687386377951968e-05, + "loss": 2.8625, + "step": 18724 + }, + { + "epoch": 1.1623936929666645, + "grad_norm": 0.17660317585391072, + "learning_rate": 7.687081818968982e-05, + "loss": 2.8229, + "step": 18725 + }, + { + "epoch": 1.1624557700664224, + "grad_norm": 0.17787242600300293, + "learning_rate": 7.686777245966837e-05, + "loss": 2.8865, + "step": 18726 + }, + { + "epoch": 1.1625178471661803, + "grad_norm": 0.1729967186704808, + "learning_rate": 7.686472658947122e-05, + "loss": 2.7727, + "step": 18727 + }, + { + "epoch": 1.1625799242659383, + "grad_norm": 0.15831367727249304, + "learning_rate": 7.686168057911429e-05, + "loss": 2.8885, + "step": 18728 + }, + { + "epoch": 1.1626420013656962, + "grad_norm": 0.18199819829593958, + "learning_rate": 7.685863442861343e-05, + "loss": 2.8466, + "step": 18729 + }, + { + "epoch": 1.162704078465454, + "grad_norm": 0.16423569914826852, + "learning_rate": 7.685558813798456e-05, + "loss": 2.9187, + "step": 18730 + }, + { + "epoch": 1.162766155565212, + "grad_norm": 0.15304200005471197, + "learning_rate": 7.685254170724356e-05, + "loss": 2.9456, + "step": 18731 + }, + { + "epoch": 1.16282823266497, + "grad_norm": 0.16407429561899725, + "learning_rate": 7.684949513640636e-05, + "loss": 2.8605, + "step": 18732 + }, + { + "epoch": 1.1628903097647278, + "grad_norm": 0.15773730516664278, + "learning_rate": 7.68464484254888e-05, + "loss": 2.91, + "step": 18733 + }, + { + "epoch": 1.1629523868644858, + "grad_norm": 0.15657580501876917, + "learning_rate": 7.68434015745068e-05, + "loss": 2.9469, + "step": 18734 + }, + { + "epoch": 1.1630144639642437, + "grad_norm": 0.1662529203151258, + "learning_rate": 7.684035458347627e-05, + "loss": 2.873, + "step": 18735 + }, + { + "epoch": 1.1630765410640014, + "grad_norm": 0.15199410221890877, + "learning_rate": 7.68373074524131e-05, + "loss": 2.9619, + "step": 18736 + }, + { + "epoch": 1.1631386181637593, + "grad_norm": 0.1455452314049578, + "learning_rate": 7.683426018133317e-05, + "loss": 2.8109, + "step": 18737 + }, + { + "epoch": 1.1632006952635172, + "grad_norm": 0.15243623444937826, + "learning_rate": 7.683121277025238e-05, + "loss": 2.9059, + "step": 18738 + }, + { + "epoch": 1.1632627723632751, + "grad_norm": 0.18442710487741212, + "learning_rate": 7.682816521918666e-05, + "loss": 2.911, + "step": 18739 + }, + { + "epoch": 1.163324849463033, + "grad_norm": 0.1602071471866935, + "learning_rate": 7.682511752815188e-05, + "loss": 2.9241, + "step": 18740 + }, + { + "epoch": 1.163386926562791, + "grad_norm": 0.14983064881611, + "learning_rate": 7.682206969716395e-05, + "loss": 2.9554, + "step": 18741 + }, + { + "epoch": 1.1634490036625489, + "grad_norm": 0.16440629700139964, + "learning_rate": 7.681902172623877e-05, + "loss": 2.8876, + "step": 18742 + }, + { + "epoch": 1.1635110807623068, + "grad_norm": 0.16705046858100972, + "learning_rate": 7.681597361539226e-05, + "loss": 2.9055, + "step": 18743 + }, + { + "epoch": 1.1635731578620647, + "grad_norm": 0.16149518851111921, + "learning_rate": 7.681292536464028e-05, + "loss": 2.9323, + "step": 18744 + }, + { + "epoch": 1.1636352349618226, + "grad_norm": 0.21591008593361927, + "learning_rate": 7.680987697399877e-05, + "loss": 2.9279, + "step": 18745 + }, + { + "epoch": 1.1636973120615806, + "grad_norm": 0.16095215466822224, + "learning_rate": 7.680682844348362e-05, + "loss": 2.8915, + "step": 18746 + }, + { + "epoch": 1.1637593891613385, + "grad_norm": 0.16564678845050054, + "learning_rate": 7.680377977311074e-05, + "loss": 2.8124, + "step": 18747 + }, + { + "epoch": 1.1638214662610964, + "grad_norm": 0.1608517362717895, + "learning_rate": 7.680073096289604e-05, + "loss": 2.9233, + "step": 18748 + }, + { + "epoch": 1.163883543360854, + "grad_norm": 0.1657603264313403, + "learning_rate": 7.679768201285541e-05, + "loss": 2.8558, + "step": 18749 + }, + { + "epoch": 1.163945620460612, + "grad_norm": 0.18050354708808147, + "learning_rate": 7.679463292300479e-05, + "loss": 2.9164, + "step": 18750 + }, + { + "epoch": 1.16400769756037, + "grad_norm": 0.1512852046818434, + "learning_rate": 7.679158369336003e-05, + "loss": 2.791, + "step": 18751 + }, + { + "epoch": 1.1640697746601278, + "grad_norm": 0.17267883420622154, + "learning_rate": 7.678853432393712e-05, + "loss": 2.8853, + "step": 18752 + }, + { + "epoch": 1.1641318517598858, + "grad_norm": 0.14892296710247785, + "learning_rate": 7.678548481475189e-05, + "loss": 2.9128, + "step": 18753 + }, + { + "epoch": 1.1641939288596437, + "grad_norm": 0.16434882927469782, + "learning_rate": 7.678243516582029e-05, + "loss": 2.8341, + "step": 18754 + }, + { + "epoch": 1.1642560059594016, + "grad_norm": 0.1433638387499636, + "learning_rate": 7.677938537715824e-05, + "loss": 2.8026, + "step": 18755 + }, + { + "epoch": 1.1643180830591595, + "grad_norm": 0.1856850561291227, + "learning_rate": 7.67763354487816e-05, + "loss": 2.9438, + "step": 18756 + }, + { + "epoch": 1.1643801601589174, + "grad_norm": 0.14161204595234575, + "learning_rate": 7.677328538070634e-05, + "loss": 2.9476, + "step": 18757 + }, + { + "epoch": 1.1644422372586754, + "grad_norm": 0.17893168880727697, + "learning_rate": 7.677023517294835e-05, + "loss": 2.9302, + "step": 18758 + }, + { + "epoch": 1.1645043143584333, + "grad_norm": 0.15959448100248097, + "learning_rate": 7.676718482552354e-05, + "loss": 2.9807, + "step": 18759 + }, + { + "epoch": 1.164566391458191, + "grad_norm": 0.1913186086746795, + "learning_rate": 7.676413433844782e-05, + "loss": 2.9386, + "step": 18760 + }, + { + "epoch": 1.1646284685579489, + "grad_norm": 0.16612873151311144, + "learning_rate": 7.676108371173713e-05, + "loss": 2.8856, + "step": 18761 + }, + { + "epoch": 1.1646905456577068, + "grad_norm": 0.16147585102131082, + "learning_rate": 7.675803294540737e-05, + "loss": 2.9345, + "step": 18762 + }, + { + "epoch": 1.1647526227574647, + "grad_norm": 0.1619391639311862, + "learning_rate": 7.675498203947444e-05, + "loss": 2.9043, + "step": 18763 + }, + { + "epoch": 1.1648146998572226, + "grad_norm": 0.15889470502949535, + "learning_rate": 7.67519309939543e-05, + "loss": 2.8516, + "step": 18764 + }, + { + "epoch": 1.1648767769569806, + "grad_norm": 0.1618823622075084, + "learning_rate": 7.674887980886283e-05, + "loss": 2.9357, + "step": 18765 + }, + { + "epoch": 1.1649388540567385, + "grad_norm": 0.17185513966864088, + "learning_rate": 7.674582848421596e-05, + "loss": 2.9212, + "step": 18766 + }, + { + "epoch": 1.1650009311564964, + "grad_norm": 0.16793337161061944, + "learning_rate": 7.67427770200296e-05, + "loss": 2.8251, + "step": 18767 + }, + { + "epoch": 1.1650630082562543, + "grad_norm": 0.17955945469856102, + "learning_rate": 7.673972541631969e-05, + "loss": 2.8572, + "step": 18768 + }, + { + "epoch": 1.1651250853560122, + "grad_norm": 0.1736458293983064, + "learning_rate": 7.673667367310213e-05, + "loss": 2.9211, + "step": 18769 + }, + { + "epoch": 1.1651871624557701, + "grad_norm": 0.16051598597474542, + "learning_rate": 7.673362179039287e-05, + "loss": 2.935, + "step": 18770 + }, + { + "epoch": 1.165249239555528, + "grad_norm": 0.17745609296147466, + "learning_rate": 7.673056976820781e-05, + "loss": 2.9521, + "step": 18771 + }, + { + "epoch": 1.165311316655286, + "grad_norm": 0.15086231942960543, + "learning_rate": 7.672751760656288e-05, + "loss": 2.8948, + "step": 18772 + }, + { + "epoch": 1.1653733937550437, + "grad_norm": 0.17633835143864454, + "learning_rate": 7.672446530547401e-05, + "loss": 2.9295, + "step": 18773 + }, + { + "epoch": 1.1654354708548016, + "grad_norm": 0.3225996886917594, + "learning_rate": 7.672141286495709e-05, + "loss": 2.8552, + "step": 18774 + }, + { + "epoch": 1.1654975479545595, + "grad_norm": 0.17025060753987004, + "learning_rate": 7.67183602850281e-05, + "loss": 2.9017, + "step": 18775 + }, + { + "epoch": 1.1655596250543174, + "grad_norm": 0.14892817010683324, + "learning_rate": 7.671530756570291e-05, + "loss": 2.8853, + "step": 18776 + }, + { + "epoch": 1.1656217021540753, + "grad_norm": 0.19649215174731502, + "learning_rate": 7.67122547069975e-05, + "loss": 2.8782, + "step": 18777 + }, + { + "epoch": 1.1656837792538333, + "grad_norm": 0.15361953499511688, + "learning_rate": 7.670920170892776e-05, + "loss": 2.8517, + "step": 18778 + }, + { + "epoch": 1.1657458563535912, + "grad_norm": 0.21351544095089708, + "learning_rate": 7.670614857150964e-05, + "loss": 2.8863, + "step": 18779 + }, + { + "epoch": 1.165807933453349, + "grad_norm": 0.16671180610532352, + "learning_rate": 7.670309529475905e-05, + "loss": 2.9321, + "step": 18780 + }, + { + "epoch": 1.165870010553107, + "grad_norm": 0.22151461097166064, + "learning_rate": 7.670004187869193e-05, + "loss": 2.9006, + "step": 18781 + }, + { + "epoch": 1.165932087652865, + "grad_norm": 0.18384570222953153, + "learning_rate": 7.669698832332422e-05, + "loss": 2.8595, + "step": 18782 + }, + { + "epoch": 1.1659941647526229, + "grad_norm": 0.16622699857183873, + "learning_rate": 7.669393462867183e-05, + "loss": 2.851, + "step": 18783 + }, + { + "epoch": 1.1660562418523805, + "grad_norm": 0.16133540162493518, + "learning_rate": 7.66908807947507e-05, + "loss": 2.8485, + "step": 18784 + }, + { + "epoch": 1.1661183189521385, + "grad_norm": 0.17691585156628356, + "learning_rate": 7.668782682157677e-05, + "loss": 2.8761, + "step": 18785 + }, + { + "epoch": 1.1661803960518964, + "grad_norm": 0.16982598174958513, + "learning_rate": 7.668477270916598e-05, + "loss": 2.8718, + "step": 18786 + }, + { + "epoch": 1.1662424731516543, + "grad_norm": 0.16678078973651952, + "learning_rate": 7.668171845753425e-05, + "loss": 2.8438, + "step": 18787 + }, + { + "epoch": 1.1663045502514122, + "grad_norm": 0.1823245677321475, + "learning_rate": 7.66786640666975e-05, + "loss": 2.8785, + "step": 18788 + }, + { + "epoch": 1.1663666273511701, + "grad_norm": 0.1679576493077161, + "learning_rate": 7.66756095366717e-05, + "loss": 2.8576, + "step": 18789 + }, + { + "epoch": 1.166428704450928, + "grad_norm": 0.16457379799897454, + "learning_rate": 7.667255486747277e-05, + "loss": 2.8139, + "step": 18790 + }, + { + "epoch": 1.166490781550686, + "grad_norm": 0.1601099552141422, + "learning_rate": 7.666950005911664e-05, + "loss": 2.913, + "step": 18791 + }, + { + "epoch": 1.166552858650444, + "grad_norm": 0.2108180626294488, + "learning_rate": 7.666644511161925e-05, + "loss": 2.859, + "step": 18792 + }, + { + "epoch": 1.1666149357502018, + "grad_norm": 0.1730466539110274, + "learning_rate": 7.666339002499656e-05, + "loss": 2.9153, + "step": 18793 + }, + { + "epoch": 1.1666770128499597, + "grad_norm": 0.1875036557159082, + "learning_rate": 7.666033479926447e-05, + "loss": 2.9045, + "step": 18794 + }, + { + "epoch": 1.1667390899497176, + "grad_norm": 0.16090031814680067, + "learning_rate": 7.665727943443895e-05, + "loss": 2.9012, + "step": 18795 + }, + { + "epoch": 1.1668011670494756, + "grad_norm": 0.1521481884916371, + "learning_rate": 7.665422393053593e-05, + "loss": 2.8091, + "step": 18796 + }, + { + "epoch": 1.1668632441492333, + "grad_norm": 0.15324476531128925, + "learning_rate": 7.665116828757135e-05, + "loss": 2.9115, + "step": 18797 + }, + { + "epoch": 1.1669253212489912, + "grad_norm": 0.18449548688075043, + "learning_rate": 7.664811250556117e-05, + "loss": 2.8621, + "step": 18798 + }, + { + "epoch": 1.166987398348749, + "grad_norm": 0.1743686768622744, + "learning_rate": 7.66450565845213e-05, + "loss": 2.8879, + "step": 18799 + }, + { + "epoch": 1.167049475448507, + "grad_norm": 0.14964548277819392, + "learning_rate": 7.66420005244677e-05, + "loss": 2.8961, + "step": 18800 + }, + { + "epoch": 1.167111552548265, + "grad_norm": 0.16166166159433729, + "learning_rate": 7.663894432541635e-05, + "loss": 2.9384, + "step": 18801 + }, + { + "epoch": 1.1671736296480228, + "grad_norm": 0.1585321536971762, + "learning_rate": 7.663588798738313e-05, + "loss": 2.9846, + "step": 18802 + }, + { + "epoch": 1.1672357067477808, + "grad_norm": 0.18476122110937307, + "learning_rate": 7.6632831510384e-05, + "loss": 2.9196, + "step": 18803 + }, + { + "epoch": 1.1672977838475387, + "grad_norm": 0.16467929315657792, + "learning_rate": 7.662977489443496e-05, + "loss": 2.8742, + "step": 18804 + }, + { + "epoch": 1.1673598609472966, + "grad_norm": 0.17452202752078827, + "learning_rate": 7.66267181395519e-05, + "loss": 2.9217, + "step": 18805 + }, + { + "epoch": 1.1674219380470545, + "grad_norm": 0.1532540816106569, + "learning_rate": 7.66236612457508e-05, + "loss": 2.8521, + "step": 18806 + }, + { + "epoch": 1.1674840151468124, + "grad_norm": 0.15995439159707533, + "learning_rate": 7.662060421304759e-05, + "loss": 2.8642, + "step": 18807 + }, + { + "epoch": 1.1675460922465701, + "grad_norm": 0.15022402366172363, + "learning_rate": 7.661754704145821e-05, + "loss": 2.9784, + "step": 18808 + }, + { + "epoch": 1.167608169346328, + "grad_norm": 0.14875991214255904, + "learning_rate": 7.661448973099866e-05, + "loss": 2.8672, + "step": 18809 + }, + { + "epoch": 1.167670246446086, + "grad_norm": 0.1580435572156697, + "learning_rate": 7.661143228168482e-05, + "loss": 2.8521, + "step": 18810 + }, + { + "epoch": 1.1677323235458439, + "grad_norm": 0.16798543292743134, + "learning_rate": 7.66083746935327e-05, + "loss": 2.8672, + "step": 18811 + }, + { + "epoch": 1.1677944006456018, + "grad_norm": 0.16324385689390483, + "learning_rate": 7.660531696655821e-05, + "loss": 2.8219, + "step": 18812 + }, + { + "epoch": 1.1678564777453597, + "grad_norm": 0.16547529702026653, + "learning_rate": 7.660225910077734e-05, + "loss": 2.9366, + "step": 18813 + }, + { + "epoch": 1.1679185548451176, + "grad_norm": 0.18812746122404111, + "learning_rate": 7.659920109620602e-05, + "loss": 2.8351, + "step": 18814 + }, + { + "epoch": 1.1679806319448756, + "grad_norm": 0.15882394977491354, + "learning_rate": 7.659614295286021e-05, + "loss": 2.8839, + "step": 18815 + }, + { + "epoch": 1.1680427090446335, + "grad_norm": 0.21590233016779725, + "learning_rate": 7.659308467075588e-05, + "loss": 2.8928, + "step": 18816 + }, + { + "epoch": 1.1681047861443914, + "grad_norm": 0.1554864286310684, + "learning_rate": 7.659002624990895e-05, + "loss": 2.8495, + "step": 18817 + }, + { + "epoch": 1.1681668632441493, + "grad_norm": 0.19518139253442743, + "learning_rate": 7.658696769033542e-05, + "loss": 2.8747, + "step": 18818 + }, + { + "epoch": 1.1682289403439072, + "grad_norm": 0.1604642000825509, + "learning_rate": 7.658390899205121e-05, + "loss": 2.9125, + "step": 18819 + }, + { + "epoch": 1.1682910174436651, + "grad_norm": 0.1938015669218047, + "learning_rate": 7.65808501550723e-05, + "loss": 2.8723, + "step": 18820 + }, + { + "epoch": 1.1683530945434228, + "grad_norm": 0.15935952027261222, + "learning_rate": 7.657779117941464e-05, + "loss": 2.9677, + "step": 18821 + }, + { + "epoch": 1.1684151716431808, + "grad_norm": 0.16099071016558006, + "learning_rate": 7.657473206509418e-05, + "loss": 2.8751, + "step": 18822 + }, + { + "epoch": 1.1684772487429387, + "grad_norm": 0.16616725728694606, + "learning_rate": 7.657167281212689e-05, + "loss": 2.908, + "step": 18823 + }, + { + "epoch": 1.1685393258426966, + "grad_norm": 0.16381725077815384, + "learning_rate": 7.656861342052877e-05, + "loss": 2.9489, + "step": 18824 + }, + { + "epoch": 1.1686014029424545, + "grad_norm": 0.1914628587429399, + "learning_rate": 7.656555389031572e-05, + "loss": 2.9835, + "step": 18825 + }, + { + "epoch": 1.1686634800422124, + "grad_norm": 0.16028489203246865, + "learning_rate": 7.65624942215037e-05, + "loss": 2.8701, + "step": 18826 + }, + { + "epoch": 1.1687255571419704, + "grad_norm": 0.17151936128551398, + "learning_rate": 7.655943441410873e-05, + "loss": 2.8335, + "step": 18827 + }, + { + "epoch": 1.1687876342417283, + "grad_norm": 0.15188264756904793, + "learning_rate": 7.655637446814673e-05, + "loss": 2.8354, + "step": 18828 + }, + { + "epoch": 1.1688497113414862, + "grad_norm": 0.16104016385872105, + "learning_rate": 7.655331438363369e-05, + "loss": 2.8924, + "step": 18829 + }, + { + "epoch": 1.168911788441244, + "grad_norm": 0.1600631466509512, + "learning_rate": 7.655025416058555e-05, + "loss": 2.9052, + "step": 18830 + }, + { + "epoch": 1.1689738655410018, + "grad_norm": 0.16682724865987536, + "learning_rate": 7.654719379901831e-05, + "loss": 2.9331, + "step": 18831 + }, + { + "epoch": 1.1690359426407597, + "grad_norm": 0.1465002114954356, + "learning_rate": 7.654413329894791e-05, + "loss": 2.8496, + "step": 18832 + }, + { + "epoch": 1.1690980197405176, + "grad_norm": 0.17031480039565375, + "learning_rate": 7.65410726603903e-05, + "loss": 2.8612, + "step": 18833 + }, + { + "epoch": 1.1691600968402756, + "grad_norm": 0.16080540425561052, + "learning_rate": 7.653801188336149e-05, + "loss": 2.9651, + "step": 18834 + }, + { + "epoch": 1.1692221739400335, + "grad_norm": 0.1526063648484351, + "learning_rate": 7.653495096787741e-05, + "loss": 2.9366, + "step": 18835 + }, + { + "epoch": 1.1692842510397914, + "grad_norm": 0.15905679159469818, + "learning_rate": 7.653188991395408e-05, + "loss": 2.8399, + "step": 18836 + }, + { + "epoch": 1.1693463281395493, + "grad_norm": 0.1635310955994539, + "learning_rate": 7.652882872160742e-05, + "loss": 2.9212, + "step": 18837 + }, + { + "epoch": 1.1694084052393072, + "grad_norm": 0.1569696542819004, + "learning_rate": 7.652576739085343e-05, + "loss": 3.0089, + "step": 18838 + }, + { + "epoch": 1.1694704823390651, + "grad_norm": 0.15628847006237628, + "learning_rate": 7.652270592170806e-05, + "loss": 2.8969, + "step": 18839 + }, + { + "epoch": 1.169532559438823, + "grad_norm": 0.14939569223206914, + "learning_rate": 7.65196443141873e-05, + "loss": 2.9249, + "step": 18840 + }, + { + "epoch": 1.169594636538581, + "grad_norm": 0.1507172443742587, + "learning_rate": 7.651658256830711e-05, + "loss": 2.9458, + "step": 18841 + }, + { + "epoch": 1.169656713638339, + "grad_norm": 0.1442623084839899, + "learning_rate": 7.65135206840835e-05, + "loss": 2.8251, + "step": 18842 + }, + { + "epoch": 1.1697187907380968, + "grad_norm": 0.1715359377839853, + "learning_rate": 7.651045866153239e-05, + "loss": 2.9508, + "step": 18843 + }, + { + "epoch": 1.1697808678378547, + "grad_norm": 0.1632907460841684, + "learning_rate": 7.650739650066979e-05, + "loss": 2.8611, + "step": 18844 + }, + { + "epoch": 1.1698429449376124, + "grad_norm": 0.14589587570658094, + "learning_rate": 7.650433420151166e-05, + "loss": 2.9124, + "step": 18845 + }, + { + "epoch": 1.1699050220373703, + "grad_norm": 0.1584032170883411, + "learning_rate": 7.650127176407399e-05, + "loss": 2.7942, + "step": 18846 + }, + { + "epoch": 1.1699670991371283, + "grad_norm": 0.14868807348106272, + "learning_rate": 7.649820918837276e-05, + "loss": 2.9171, + "step": 18847 + }, + { + "epoch": 1.1700291762368862, + "grad_norm": 0.15923770479125768, + "learning_rate": 7.649514647442393e-05, + "loss": 2.8811, + "step": 18848 + }, + { + "epoch": 1.170091253336644, + "grad_norm": 0.16517140088019178, + "learning_rate": 7.649208362224349e-05, + "loss": 2.9012, + "step": 18849 + }, + { + "epoch": 1.170153330436402, + "grad_norm": 0.1627313505286331, + "learning_rate": 7.648902063184742e-05, + "loss": 2.8512, + "step": 18850 + }, + { + "epoch": 1.17021540753616, + "grad_norm": 0.1640734555355863, + "learning_rate": 7.648595750325169e-05, + "loss": 2.9386, + "step": 18851 + }, + { + "epoch": 1.1702774846359179, + "grad_norm": 0.14462028802286064, + "learning_rate": 7.64828942364723e-05, + "loss": 2.8663, + "step": 18852 + }, + { + "epoch": 1.1703395617356758, + "grad_norm": 0.14825835208942376, + "learning_rate": 7.647983083152523e-05, + "loss": 2.8731, + "step": 18853 + }, + { + "epoch": 1.1704016388354337, + "grad_norm": 0.1539242082855975, + "learning_rate": 7.647676728842643e-05, + "loss": 2.8439, + "step": 18854 + }, + { + "epoch": 1.1704637159351914, + "grad_norm": 0.15772393144342928, + "learning_rate": 7.647370360719193e-05, + "loss": 2.9014, + "step": 18855 + }, + { + "epoch": 1.1705257930349493, + "grad_norm": 0.19860602566755933, + "learning_rate": 7.647063978783769e-05, + "loss": 2.8852, + "step": 18856 + }, + { + "epoch": 1.1705878701347072, + "grad_norm": 0.16648301919271394, + "learning_rate": 7.646757583037967e-05, + "loss": 2.8628, + "step": 18857 + }, + { + "epoch": 1.1706499472344651, + "grad_norm": 0.14983155632525572, + "learning_rate": 7.646451173483392e-05, + "loss": 2.8415, + "step": 18858 + }, + { + "epoch": 1.170712024334223, + "grad_norm": 0.18602378189678562, + "learning_rate": 7.646144750121635e-05, + "loss": 2.8751, + "step": 18859 + }, + { + "epoch": 1.170774101433981, + "grad_norm": 0.18970418792230767, + "learning_rate": 7.645838312954302e-05, + "loss": 2.8833, + "step": 18860 + }, + { + "epoch": 1.170836178533739, + "grad_norm": 0.17094793965158564, + "learning_rate": 7.645531861982986e-05, + "loss": 2.818, + "step": 18861 + }, + { + "epoch": 1.1708982556334968, + "grad_norm": 0.19240121704986052, + "learning_rate": 7.645225397209287e-05, + "loss": 2.9542, + "step": 18862 + }, + { + "epoch": 1.1709603327332547, + "grad_norm": 0.16741820977097813, + "learning_rate": 7.644918918634809e-05, + "loss": 2.9945, + "step": 18863 + }, + { + "epoch": 1.1710224098330126, + "grad_norm": 0.15934160106980713, + "learning_rate": 7.644612426261144e-05, + "loss": 2.8537, + "step": 18864 + }, + { + "epoch": 1.1710844869327706, + "grad_norm": 0.21729166256882348, + "learning_rate": 7.644305920089893e-05, + "loss": 2.9931, + "step": 18865 + }, + { + "epoch": 1.1711465640325285, + "grad_norm": 0.15953403653542347, + "learning_rate": 7.643999400122659e-05, + "loss": 2.9157, + "step": 18866 + }, + { + "epoch": 1.1712086411322864, + "grad_norm": 0.15341966289359465, + "learning_rate": 7.643692866361036e-05, + "loss": 2.7693, + "step": 18867 + }, + { + "epoch": 1.1712707182320443, + "grad_norm": 0.158401623751465, + "learning_rate": 7.643386318806628e-05, + "loss": 2.8666, + "step": 18868 + }, + { + "epoch": 1.171332795331802, + "grad_norm": 0.1764431571195413, + "learning_rate": 7.643079757461029e-05, + "loss": 2.8918, + "step": 18869 + }, + { + "epoch": 1.17139487243156, + "grad_norm": 0.15688436598953936, + "learning_rate": 7.642773182325844e-05, + "loss": 2.8834, + "step": 18870 + }, + { + "epoch": 1.1714569495313178, + "grad_norm": 0.1628595368246259, + "learning_rate": 7.642466593402667e-05, + "loss": 2.8585, + "step": 18871 + }, + { + "epoch": 1.1715190266310758, + "grad_norm": 0.15977086242821095, + "learning_rate": 7.642159990693104e-05, + "loss": 2.8019, + "step": 18872 + }, + { + "epoch": 1.1715811037308337, + "grad_norm": 0.18057825267299052, + "learning_rate": 7.641853374198747e-05, + "loss": 2.8901, + "step": 18873 + }, + { + "epoch": 1.1716431808305916, + "grad_norm": 0.15513511710677078, + "learning_rate": 7.641546743921202e-05, + "loss": 2.8277, + "step": 18874 + }, + { + "epoch": 1.1717052579303495, + "grad_norm": 0.20228083048364023, + "learning_rate": 7.641240099862063e-05, + "loss": 2.9778, + "step": 18875 + }, + { + "epoch": 1.1717673350301074, + "grad_norm": 0.15877361222441347, + "learning_rate": 7.640933442022936e-05, + "loss": 2.8422, + "step": 18876 + }, + { + "epoch": 1.1718294121298654, + "grad_norm": 0.15767593612483732, + "learning_rate": 7.64062677040542e-05, + "loss": 2.7756, + "step": 18877 + }, + { + "epoch": 1.1718914892296233, + "grad_norm": 0.3106431751767264, + "learning_rate": 7.640320085011109e-05, + "loss": 2.9011, + "step": 18878 + }, + { + "epoch": 1.171953566329381, + "grad_norm": 0.16514599174573563, + "learning_rate": 7.640013385841609e-05, + "loss": 2.8647, + "step": 18879 + }, + { + "epoch": 1.1720156434291389, + "grad_norm": 0.15674188793940722, + "learning_rate": 7.639706672898519e-05, + "loss": 2.8331, + "step": 18880 + }, + { + "epoch": 1.1720777205288968, + "grad_norm": 0.17771044769199484, + "learning_rate": 7.639399946183439e-05, + "loss": 2.8475, + "step": 18881 + }, + { + "epoch": 1.1721397976286547, + "grad_norm": 0.164780996228751, + "learning_rate": 7.639093205697967e-05, + "loss": 2.904, + "step": 18882 + }, + { + "epoch": 1.1722018747284126, + "grad_norm": 0.17750135804046346, + "learning_rate": 7.638786451443705e-05, + "loss": 2.8491, + "step": 18883 + }, + { + "epoch": 1.1722639518281706, + "grad_norm": 0.1689558081040329, + "learning_rate": 7.638479683422254e-05, + "loss": 2.8876, + "step": 18884 + }, + { + "epoch": 1.1723260289279285, + "grad_norm": 0.1635094984322068, + "learning_rate": 7.638172901635214e-05, + "loss": 2.9569, + "step": 18885 + }, + { + "epoch": 1.1723881060276864, + "grad_norm": 0.15176045797195029, + "learning_rate": 7.637866106084186e-05, + "loss": 2.9302, + "step": 18886 + }, + { + "epoch": 1.1724501831274443, + "grad_norm": 0.1884226153553957, + "learning_rate": 7.63755929677077e-05, + "loss": 2.8478, + "step": 18887 + }, + { + "epoch": 1.1725122602272022, + "grad_norm": 0.15633227447195744, + "learning_rate": 7.637252473696568e-05, + "loss": 2.9253, + "step": 18888 + }, + { + "epoch": 1.1725743373269601, + "grad_norm": 0.15680778008378587, + "learning_rate": 7.636945636863178e-05, + "loss": 2.936, + "step": 18889 + }, + { + "epoch": 1.172636414426718, + "grad_norm": 0.15576369185677819, + "learning_rate": 7.636638786272204e-05, + "loss": 2.8719, + "step": 18890 + }, + { + "epoch": 1.172698491526476, + "grad_norm": 0.1440226589044229, + "learning_rate": 7.636331921925242e-05, + "loss": 2.8482, + "step": 18891 + }, + { + "epoch": 1.172760568626234, + "grad_norm": 0.14708050544341855, + "learning_rate": 7.636025043823897e-05, + "loss": 2.7776, + "step": 18892 + }, + { + "epoch": 1.1728226457259916, + "grad_norm": 0.18154922359663517, + "learning_rate": 7.635718151969772e-05, + "loss": 2.8906, + "step": 18893 + }, + { + "epoch": 1.1728847228257495, + "grad_norm": 0.14555675528341677, + "learning_rate": 7.635411246364466e-05, + "loss": 2.8639, + "step": 18894 + }, + { + "epoch": 1.1729467999255074, + "grad_norm": 0.16135154096976076, + "learning_rate": 7.635104327009577e-05, + "loss": 2.9662, + "step": 18895 + }, + { + "epoch": 1.1730088770252654, + "grad_norm": 0.15286834577861563, + "learning_rate": 7.634797393906711e-05, + "loss": 2.9226, + "step": 18896 + }, + { + "epoch": 1.1730709541250233, + "grad_norm": 0.15299718056836056, + "learning_rate": 7.634490447057468e-05, + "loss": 2.9335, + "step": 18897 + }, + { + "epoch": 1.1731330312247812, + "grad_norm": 0.1600954591703423, + "learning_rate": 7.634183486463446e-05, + "loss": 2.7773, + "step": 18898 + }, + { + "epoch": 1.173195108324539, + "grad_norm": 0.1532341599628709, + "learning_rate": 7.633876512126252e-05, + "loss": 2.8935, + "step": 18899 + }, + { + "epoch": 1.173257185424297, + "grad_norm": 0.17514455244935967, + "learning_rate": 7.633569524047483e-05, + "loss": 2.9393, + "step": 18900 + }, + { + "epoch": 1.173319262524055, + "grad_norm": 0.14775564415836823, + "learning_rate": 7.633262522228744e-05, + "loss": 2.7397, + "step": 18901 + }, + { + "epoch": 1.1733813396238129, + "grad_norm": 0.1657087968775528, + "learning_rate": 7.632955506671633e-05, + "loss": 2.8539, + "step": 18902 + }, + { + "epoch": 1.1734434167235706, + "grad_norm": 0.15878118469666383, + "learning_rate": 7.632648477377754e-05, + "loss": 2.8582, + "step": 18903 + }, + { + "epoch": 1.1735054938233285, + "grad_norm": 0.1619451104226858, + "learning_rate": 7.632341434348711e-05, + "loss": 2.9269, + "step": 18904 + }, + { + "epoch": 1.1735675709230864, + "grad_norm": 0.1511076578861767, + "learning_rate": 7.632034377586104e-05, + "loss": 2.8123, + "step": 18905 + }, + { + "epoch": 1.1736296480228443, + "grad_norm": 0.16267920502499195, + "learning_rate": 7.631727307091533e-05, + "loss": 2.8154, + "step": 18906 + }, + { + "epoch": 1.1736917251226022, + "grad_norm": 0.1705877313416934, + "learning_rate": 7.631420222866602e-05, + "loss": 2.9375, + "step": 18907 + }, + { + "epoch": 1.1737538022223601, + "grad_norm": 0.15273877427772847, + "learning_rate": 7.631113124912911e-05, + "loss": 2.8664, + "step": 18908 + }, + { + "epoch": 1.173815879322118, + "grad_norm": 0.16812156778327816, + "learning_rate": 7.630806013232067e-05, + "loss": 2.9946, + "step": 18909 + }, + { + "epoch": 1.173877956421876, + "grad_norm": 0.1734327792478873, + "learning_rate": 7.63049888782567e-05, + "loss": 2.8913, + "step": 18910 + }, + { + "epoch": 1.173940033521634, + "grad_norm": 0.163895289042765, + "learning_rate": 7.63019174869532e-05, + "loss": 2.9594, + "step": 18911 + }, + { + "epoch": 1.1740021106213918, + "grad_norm": 0.18299594909908493, + "learning_rate": 7.629884595842621e-05, + "loss": 2.9165, + "step": 18912 + }, + { + "epoch": 1.1740641877211497, + "grad_norm": 0.19149555341759789, + "learning_rate": 7.629577429269177e-05, + "loss": 2.8678, + "step": 18913 + }, + { + "epoch": 1.1741262648209077, + "grad_norm": 0.17038430397660917, + "learning_rate": 7.629270248976588e-05, + "loss": 2.8349, + "step": 18914 + }, + { + "epoch": 1.1741883419206656, + "grad_norm": 0.17849705096554888, + "learning_rate": 7.628963054966458e-05, + "loss": 2.8728, + "step": 18915 + }, + { + "epoch": 1.1742504190204235, + "grad_norm": 0.14715635662678508, + "learning_rate": 7.62865584724039e-05, + "loss": 2.8978, + "step": 18916 + }, + { + "epoch": 1.1743124961201812, + "grad_norm": 0.17150485757143172, + "learning_rate": 7.628348625799986e-05, + "loss": 2.8074, + "step": 18917 + }, + { + "epoch": 1.174374573219939, + "grad_norm": 0.25856804676759254, + "learning_rate": 7.62804139064685e-05, + "loss": 3.0098, + "step": 18918 + }, + { + "epoch": 1.174436650319697, + "grad_norm": 0.2142648874816909, + "learning_rate": 7.627734141782583e-05, + "loss": 2.7747, + "step": 18919 + }, + { + "epoch": 1.174498727419455, + "grad_norm": 0.17960638827793718, + "learning_rate": 7.627426879208788e-05, + "loss": 2.8312, + "step": 18920 + }, + { + "epoch": 1.1745608045192129, + "grad_norm": 0.1960399622307632, + "learning_rate": 7.627119602927071e-05, + "loss": 2.9412, + "step": 18921 + }, + { + "epoch": 1.1746228816189708, + "grad_norm": 0.17813261710159936, + "learning_rate": 7.626812312939033e-05, + "loss": 2.9482, + "step": 18922 + }, + { + "epoch": 1.1746849587187287, + "grad_norm": 0.21267826968343578, + "learning_rate": 7.626505009246277e-05, + "loss": 2.8714, + "step": 18923 + }, + { + "epoch": 1.1747470358184866, + "grad_norm": 0.15893929121115924, + "learning_rate": 7.626197691850408e-05, + "loss": 2.946, + "step": 18924 + }, + { + "epoch": 1.1748091129182445, + "grad_norm": 0.17867338067318128, + "learning_rate": 7.625890360753025e-05, + "loss": 2.8335, + "step": 18925 + }, + { + "epoch": 1.1748711900180024, + "grad_norm": 0.16319775887994614, + "learning_rate": 7.625583015955737e-05, + "loss": 2.9097, + "step": 18926 + }, + { + "epoch": 1.1749332671177601, + "grad_norm": 0.1587208764515304, + "learning_rate": 7.625275657460144e-05, + "loss": 2.8887, + "step": 18927 + }, + { + "epoch": 1.174995344217518, + "grad_norm": 0.16663727215498156, + "learning_rate": 7.624968285267853e-05, + "loss": 2.8517, + "step": 18928 + }, + { + "epoch": 1.175057421317276, + "grad_norm": 0.16698059242549687, + "learning_rate": 7.624660899380463e-05, + "loss": 2.9061, + "step": 18929 + }, + { + "epoch": 1.175119498417034, + "grad_norm": 0.1740996083831847, + "learning_rate": 7.62435349979958e-05, + "loss": 2.893, + "step": 18930 + }, + { + "epoch": 1.1751815755167918, + "grad_norm": 0.18739167700422682, + "learning_rate": 7.624046086526808e-05, + "loss": 2.8752, + "step": 18931 + }, + { + "epoch": 1.1752436526165497, + "grad_norm": 0.19204154946849267, + "learning_rate": 7.623738659563751e-05, + "loss": 2.8427, + "step": 18932 + }, + { + "epoch": 1.1753057297163076, + "grad_norm": 0.1986986645642392, + "learning_rate": 7.623431218912012e-05, + "loss": 2.9113, + "step": 18933 + }, + { + "epoch": 1.1753678068160656, + "grad_norm": 0.17236229785498175, + "learning_rate": 7.623123764573196e-05, + "loss": 2.9281, + "step": 18934 + }, + { + "epoch": 1.1754298839158235, + "grad_norm": 0.1650999735551463, + "learning_rate": 7.622816296548907e-05, + "loss": 2.7916, + "step": 18935 + }, + { + "epoch": 1.1754919610155814, + "grad_norm": 0.16745356351982404, + "learning_rate": 7.622508814840747e-05, + "loss": 2.9105, + "step": 18936 + }, + { + "epoch": 1.1755540381153393, + "grad_norm": 0.1589223794712532, + "learning_rate": 7.622201319450323e-05, + "loss": 2.8747, + "step": 18937 + }, + { + "epoch": 1.1756161152150972, + "grad_norm": 0.24106998417941328, + "learning_rate": 7.621893810379237e-05, + "loss": 2.8842, + "step": 18938 + }, + { + "epoch": 1.1756781923148552, + "grad_norm": 0.19832369815204867, + "learning_rate": 7.621586287629097e-05, + "loss": 2.7652, + "step": 18939 + }, + { + "epoch": 1.175740269414613, + "grad_norm": 0.178588356126848, + "learning_rate": 7.621278751201501e-05, + "loss": 2.8542, + "step": 18940 + }, + { + "epoch": 1.1758023465143708, + "grad_norm": 0.16646393662567738, + "learning_rate": 7.62097120109806e-05, + "loss": 2.8083, + "step": 18941 + }, + { + "epoch": 1.1758644236141287, + "grad_norm": 0.1638373521690345, + "learning_rate": 7.620663637320376e-05, + "loss": 2.8669, + "step": 18942 + }, + { + "epoch": 1.1759265007138866, + "grad_norm": 0.16245072975018265, + "learning_rate": 7.620356059870053e-05, + "loss": 2.91, + "step": 18943 + }, + { + "epoch": 1.1759885778136445, + "grad_norm": 0.1654942628555393, + "learning_rate": 7.620048468748697e-05, + "loss": 2.8224, + "step": 18944 + }, + { + "epoch": 1.1760506549134024, + "grad_norm": 0.2464770329586476, + "learning_rate": 7.619740863957911e-05, + "loss": 2.8882, + "step": 18945 + }, + { + "epoch": 1.1761127320131604, + "grad_norm": 0.1585144758582353, + "learning_rate": 7.619433245499303e-05, + "loss": 2.8819, + "step": 18946 + }, + { + "epoch": 1.1761748091129183, + "grad_norm": 0.17275065909051548, + "learning_rate": 7.619125613374474e-05, + "loss": 2.845, + "step": 18947 + }, + { + "epoch": 1.1762368862126762, + "grad_norm": 0.15799474532482632, + "learning_rate": 7.618817967585031e-05, + "loss": 2.8527, + "step": 18948 + }, + { + "epoch": 1.176298963312434, + "grad_norm": 0.19722482051522058, + "learning_rate": 7.61851030813258e-05, + "loss": 2.8722, + "step": 18949 + }, + { + "epoch": 1.176361040412192, + "grad_norm": 0.16701069110281624, + "learning_rate": 7.618202635018725e-05, + "loss": 2.9013, + "step": 18950 + }, + { + "epoch": 1.1764231175119497, + "grad_norm": 0.17964462100373493, + "learning_rate": 7.61789494824507e-05, + "loss": 2.8664, + "step": 18951 + }, + { + "epoch": 1.1764851946117076, + "grad_norm": 0.1513433499178343, + "learning_rate": 7.617587247813223e-05, + "loss": 2.8625, + "step": 18952 + }, + { + "epoch": 1.1765472717114656, + "grad_norm": 0.17140606284674664, + "learning_rate": 7.617279533724786e-05, + "loss": 2.8167, + "step": 18953 + }, + { + "epoch": 1.1766093488112235, + "grad_norm": 0.15957714262297373, + "learning_rate": 7.616971805981367e-05, + "loss": 2.8355, + "step": 18954 + }, + { + "epoch": 1.1766714259109814, + "grad_norm": 0.16328692471987577, + "learning_rate": 7.616664064584572e-05, + "loss": 2.9246, + "step": 18955 + }, + { + "epoch": 1.1767335030107393, + "grad_norm": 0.1808205313608094, + "learning_rate": 7.616356309536004e-05, + "loss": 2.9204, + "step": 18956 + }, + { + "epoch": 1.1767955801104972, + "grad_norm": 0.1877244559535801, + "learning_rate": 7.61604854083727e-05, + "loss": 2.9009, + "step": 18957 + }, + { + "epoch": 1.1768576572102551, + "grad_norm": 0.1518363327284238, + "learning_rate": 7.615740758489977e-05, + "loss": 2.8917, + "step": 18958 + }, + { + "epoch": 1.176919734310013, + "grad_norm": 0.16834650698561018, + "learning_rate": 7.615432962495728e-05, + "loss": 2.9203, + "step": 18959 + }, + { + "epoch": 1.176981811409771, + "grad_norm": 0.14670087709531976, + "learning_rate": 7.615125152856131e-05, + "loss": 2.7933, + "step": 18960 + }, + { + "epoch": 1.177043888509529, + "grad_norm": 0.1679753202439702, + "learning_rate": 7.614817329572792e-05, + "loss": 2.8509, + "step": 18961 + }, + { + "epoch": 1.1771059656092868, + "grad_norm": 0.15585587982809612, + "learning_rate": 7.614509492647315e-05, + "loss": 2.8362, + "step": 18962 + }, + { + "epoch": 1.1771680427090447, + "grad_norm": 0.16023314769875607, + "learning_rate": 7.614201642081306e-05, + "loss": 2.8989, + "step": 18963 + }, + { + "epoch": 1.1772301198088027, + "grad_norm": 0.15423830345127565, + "learning_rate": 7.613893777876375e-05, + "loss": 2.8554, + "step": 18964 + }, + { + "epoch": 1.1772921969085604, + "grad_norm": 0.16808072505158722, + "learning_rate": 7.613585900034126e-05, + "loss": 2.9586, + "step": 18965 + }, + { + "epoch": 1.1773542740083183, + "grad_norm": 0.16845431671166874, + "learning_rate": 7.613278008556164e-05, + "loss": 2.8861, + "step": 18966 + }, + { + "epoch": 1.1774163511080762, + "grad_norm": 0.15301931198260607, + "learning_rate": 7.612970103444096e-05, + "loss": 2.8939, + "step": 18967 + }, + { + "epoch": 1.177478428207834, + "grad_norm": 0.1916904966339668, + "learning_rate": 7.612662184699528e-05, + "loss": 2.8114, + "step": 18968 + }, + { + "epoch": 1.177540505307592, + "grad_norm": 0.2078621493093244, + "learning_rate": 7.612354252324068e-05, + "loss": 2.8054, + "step": 18969 + }, + { + "epoch": 1.17760258240735, + "grad_norm": 0.1881440446249974, + "learning_rate": 7.61204630631932e-05, + "loss": 2.7833, + "step": 18970 + }, + { + "epoch": 1.1776646595071079, + "grad_norm": 0.1527192366822358, + "learning_rate": 7.611738346686895e-05, + "loss": 2.779, + "step": 18971 + }, + { + "epoch": 1.1777267366068658, + "grad_norm": 0.18903961476620804, + "learning_rate": 7.611430373428395e-05, + "loss": 2.8971, + "step": 18972 + }, + { + "epoch": 1.1777888137066237, + "grad_norm": 0.19910436974414275, + "learning_rate": 7.611122386545431e-05, + "loss": 2.8621, + "step": 18973 + }, + { + "epoch": 1.1778508908063816, + "grad_norm": 0.1815550510270385, + "learning_rate": 7.610814386039605e-05, + "loss": 2.8781, + "step": 18974 + }, + { + "epoch": 1.1779129679061393, + "grad_norm": 0.17021033586933365, + "learning_rate": 7.610506371912526e-05, + "loss": 2.8043, + "step": 18975 + }, + { + "epoch": 1.1779750450058972, + "grad_norm": 0.17199257600434156, + "learning_rate": 7.610198344165805e-05, + "loss": 2.9205, + "step": 18976 + }, + { + "epoch": 1.1780371221056551, + "grad_norm": 0.15803240170963412, + "learning_rate": 7.609890302801044e-05, + "loss": 2.9357, + "step": 18977 + }, + { + "epoch": 1.178099199205413, + "grad_norm": 0.18790017577579798, + "learning_rate": 7.609582247819851e-05, + "loss": 2.9855, + "step": 18978 + }, + { + "epoch": 1.178161276305171, + "grad_norm": 0.1663692186907386, + "learning_rate": 7.609274179223834e-05, + "loss": 2.919, + "step": 18979 + }, + { + "epoch": 1.178223353404929, + "grad_norm": 0.2064055578666362, + "learning_rate": 7.6089660970146e-05, + "loss": 2.7818, + "step": 18980 + }, + { + "epoch": 1.1782854305046868, + "grad_norm": 0.16617031183659872, + "learning_rate": 7.608658001193758e-05, + "loss": 2.8953, + "step": 18981 + }, + { + "epoch": 1.1783475076044447, + "grad_norm": 0.1688930452674353, + "learning_rate": 7.608349891762913e-05, + "loss": 2.9363, + "step": 18982 + }, + { + "epoch": 1.1784095847042027, + "grad_norm": 0.17872761299420536, + "learning_rate": 7.608041768723673e-05, + "loss": 2.9718, + "step": 18983 + }, + { + "epoch": 1.1784716618039606, + "grad_norm": 0.16923339075065205, + "learning_rate": 7.607733632077647e-05, + "loss": 2.8825, + "step": 18984 + }, + { + "epoch": 1.1785337389037185, + "grad_norm": 0.18592598983657094, + "learning_rate": 7.607425481826442e-05, + "loss": 2.9815, + "step": 18985 + }, + { + "epoch": 1.1785958160034764, + "grad_norm": 0.1616079376951225, + "learning_rate": 7.607117317971663e-05, + "loss": 2.9435, + "step": 18986 + }, + { + "epoch": 1.1786578931032343, + "grad_norm": 0.16911041593473586, + "learning_rate": 7.606809140514923e-05, + "loss": 2.9788, + "step": 18987 + }, + { + "epoch": 1.1787199702029922, + "grad_norm": 0.17114982099892875, + "learning_rate": 7.606500949457824e-05, + "loss": 2.8092, + "step": 18988 + }, + { + "epoch": 1.17878204730275, + "grad_norm": 0.17075118123884436, + "learning_rate": 7.60619274480198e-05, + "loss": 2.822, + "step": 18989 + }, + { + "epoch": 1.1788441244025079, + "grad_norm": 0.20193376983330877, + "learning_rate": 7.605884526548992e-05, + "loss": 2.7979, + "step": 18990 + }, + { + "epoch": 1.1789062015022658, + "grad_norm": 0.21136160883171085, + "learning_rate": 7.605576294700474e-05, + "loss": 2.9283, + "step": 18991 + }, + { + "epoch": 1.1789682786020237, + "grad_norm": 0.16325226029033257, + "learning_rate": 7.605268049258032e-05, + "loss": 2.9412, + "step": 18992 + }, + { + "epoch": 1.1790303557017816, + "grad_norm": 0.16947403350970341, + "learning_rate": 7.604959790223273e-05, + "loss": 2.9001, + "step": 18993 + }, + { + "epoch": 1.1790924328015395, + "grad_norm": 0.18333794649417903, + "learning_rate": 7.604651517597807e-05, + "loss": 2.9089, + "step": 18994 + }, + { + "epoch": 1.1791545099012974, + "grad_norm": 0.16788325240756433, + "learning_rate": 7.604343231383241e-05, + "loss": 2.9108, + "step": 18995 + }, + { + "epoch": 1.1792165870010554, + "grad_norm": 0.16562602947023128, + "learning_rate": 7.604034931581187e-05, + "loss": 2.9629, + "step": 18996 + }, + { + "epoch": 1.1792786641008133, + "grad_norm": 0.1681017072222867, + "learning_rate": 7.603726618193247e-05, + "loss": 2.9562, + "step": 18997 + }, + { + "epoch": 1.1793407412005712, + "grad_norm": 0.1884519432472983, + "learning_rate": 7.603418291221036e-05, + "loss": 2.9334, + "step": 18998 + }, + { + "epoch": 1.179402818300329, + "grad_norm": 0.16711301503579679, + "learning_rate": 7.603109950666158e-05, + "loss": 2.8414, + "step": 18999 + }, + { + "epoch": 1.1794648954000868, + "grad_norm": 0.1852333833691244, + "learning_rate": 7.602801596530225e-05, + "loss": 2.7787, + "step": 19000 + }, + { + "epoch": 1.1795269724998447, + "grad_norm": 0.1535165284399574, + "learning_rate": 7.602493228814842e-05, + "loss": 2.8731, + "step": 19001 + }, + { + "epoch": 1.1795890495996026, + "grad_norm": 0.236126285459615, + "learning_rate": 7.602184847521623e-05, + "loss": 2.9236, + "step": 19002 + }, + { + "epoch": 1.1796511266993606, + "grad_norm": 0.19519299665889955, + "learning_rate": 7.601876452652172e-05, + "loss": 2.8977, + "step": 19003 + }, + { + "epoch": 1.1797132037991185, + "grad_norm": 0.22314312055880592, + "learning_rate": 7.6015680442081e-05, + "loss": 2.9357, + "step": 19004 + }, + { + "epoch": 1.1797752808988764, + "grad_norm": 0.18542573889672187, + "learning_rate": 7.601259622191017e-05, + "loss": 2.7955, + "step": 19005 + }, + { + "epoch": 1.1798373579986343, + "grad_norm": 0.16989521935638718, + "learning_rate": 7.60095118660253e-05, + "loss": 2.9279, + "step": 19006 + }, + { + "epoch": 1.1798994350983922, + "grad_norm": 0.2115777770996614, + "learning_rate": 7.600642737444248e-05, + "loss": 2.924, + "step": 19007 + }, + { + "epoch": 1.1799615121981502, + "grad_norm": 0.16969734462787348, + "learning_rate": 7.600334274717782e-05, + "loss": 2.8977, + "step": 19008 + }, + { + "epoch": 1.180023589297908, + "grad_norm": 0.19500420079841635, + "learning_rate": 7.600025798424744e-05, + "loss": 2.8713, + "step": 19009 + }, + { + "epoch": 1.180085666397666, + "grad_norm": 0.19367765952731916, + "learning_rate": 7.599717308566738e-05, + "loss": 2.9111, + "step": 19010 + }, + { + "epoch": 1.180147743497424, + "grad_norm": 0.2538850188548721, + "learning_rate": 7.599408805145373e-05, + "loss": 2.867, + "step": 19011 + }, + { + "epoch": 1.1802098205971818, + "grad_norm": 0.18808656575542668, + "learning_rate": 7.599100288162266e-05, + "loss": 2.8581, + "step": 19012 + }, + { + "epoch": 1.1802718976969395, + "grad_norm": 0.19712159592538225, + "learning_rate": 7.598791757619019e-05, + "loss": 2.9806, + "step": 19013 + }, + { + "epoch": 1.1803339747966974, + "grad_norm": 0.19026669104403476, + "learning_rate": 7.598483213517246e-05, + "loss": 2.7474, + "step": 19014 + }, + { + "epoch": 1.1803960518964554, + "grad_norm": 0.18100782114987402, + "learning_rate": 7.598174655858554e-05, + "loss": 2.8287, + "step": 19015 + }, + { + "epoch": 1.1804581289962133, + "grad_norm": 0.17928420294234285, + "learning_rate": 7.597866084644556e-05, + "loss": 2.8937, + "step": 19016 + }, + { + "epoch": 1.1805202060959712, + "grad_norm": 0.20837001874904673, + "learning_rate": 7.597557499876857e-05, + "loss": 2.9079, + "step": 19017 + }, + { + "epoch": 1.180582283195729, + "grad_norm": 0.16678479908059454, + "learning_rate": 7.597248901557072e-05, + "loss": 2.856, + "step": 19018 + }, + { + "epoch": 1.180644360295487, + "grad_norm": 0.18559791465270645, + "learning_rate": 7.59694028968681e-05, + "loss": 2.9305, + "step": 19019 + }, + { + "epoch": 1.180706437395245, + "grad_norm": 0.18926943889923736, + "learning_rate": 7.596631664267678e-05, + "loss": 2.8669, + "step": 19020 + }, + { + "epoch": 1.1807685144950029, + "grad_norm": 0.18724240052001032, + "learning_rate": 7.59632302530129e-05, + "loss": 2.8681, + "step": 19021 + }, + { + "epoch": 1.1808305915947608, + "grad_norm": 0.18736521525534552, + "learning_rate": 7.596014372789253e-05, + "loss": 2.9493, + "step": 19022 + }, + { + "epoch": 1.1808926686945185, + "grad_norm": 0.20639378997365232, + "learning_rate": 7.595705706733178e-05, + "loss": 2.8572, + "step": 19023 + }, + { + "epoch": 1.1809547457942764, + "grad_norm": 0.1815658846259003, + "learning_rate": 7.595397027134677e-05, + "loss": 2.8925, + "step": 19024 + }, + { + "epoch": 1.1810168228940343, + "grad_norm": 0.17046432945680715, + "learning_rate": 7.595088333995361e-05, + "loss": 2.9289, + "step": 19025 + }, + { + "epoch": 1.1810788999937922, + "grad_norm": 0.17150550377573845, + "learning_rate": 7.594779627316837e-05, + "loss": 2.974, + "step": 19026 + }, + { + "epoch": 1.1811409770935501, + "grad_norm": 0.17096038785428677, + "learning_rate": 7.594470907100721e-05, + "loss": 2.8971, + "step": 19027 + }, + { + "epoch": 1.181203054193308, + "grad_norm": 0.19715306876169555, + "learning_rate": 7.594162173348618e-05, + "loss": 2.9625, + "step": 19028 + }, + { + "epoch": 1.181265131293066, + "grad_norm": 0.1768470875920051, + "learning_rate": 7.593853426062141e-05, + "loss": 2.8449, + "step": 19029 + }, + { + "epoch": 1.181327208392824, + "grad_norm": 0.17425516265482555, + "learning_rate": 7.593544665242902e-05, + "loss": 2.8318, + "step": 19030 + }, + { + "epoch": 1.1813892854925818, + "grad_norm": 0.17898867345116642, + "learning_rate": 7.59323589089251e-05, + "loss": 2.8614, + "step": 19031 + }, + { + "epoch": 1.1814513625923397, + "grad_norm": 0.16462792306077612, + "learning_rate": 7.592927103012578e-05, + "loss": 2.8713, + "step": 19032 + }, + { + "epoch": 1.1815134396920977, + "grad_norm": 0.16829535424322162, + "learning_rate": 7.592618301604715e-05, + "loss": 2.8672, + "step": 19033 + }, + { + "epoch": 1.1815755167918556, + "grad_norm": 0.1637821583892809, + "learning_rate": 7.592309486670534e-05, + "loss": 2.8924, + "step": 19034 + }, + { + "epoch": 1.1816375938916135, + "grad_norm": 0.17656913192075513, + "learning_rate": 7.592000658211645e-05, + "loss": 2.866, + "step": 19035 + }, + { + "epoch": 1.1816996709913712, + "grad_norm": 0.15270947914443944, + "learning_rate": 7.591691816229658e-05, + "loss": 2.8632, + "step": 19036 + }, + { + "epoch": 1.181761748091129, + "grad_norm": 0.20384328644679472, + "learning_rate": 7.591382960726185e-05, + "loss": 2.8822, + "step": 19037 + }, + { + "epoch": 1.181823825190887, + "grad_norm": 0.15667196754566204, + "learning_rate": 7.59107409170284e-05, + "loss": 2.8854, + "step": 19038 + }, + { + "epoch": 1.181885902290645, + "grad_norm": 0.17701582229472615, + "learning_rate": 7.590765209161232e-05, + "loss": 2.8474, + "step": 19039 + }, + { + "epoch": 1.1819479793904029, + "grad_norm": 0.20452558468810977, + "learning_rate": 7.590456313102972e-05, + "loss": 2.8482, + "step": 19040 + }, + { + "epoch": 1.1820100564901608, + "grad_norm": 0.17640319574920252, + "learning_rate": 7.590147403529672e-05, + "loss": 2.8676, + "step": 19041 + }, + { + "epoch": 1.1820721335899187, + "grad_norm": 0.16186001569295405, + "learning_rate": 7.589838480442946e-05, + "loss": 2.8663, + "step": 19042 + }, + { + "epoch": 1.1821342106896766, + "grad_norm": 0.1794356156319663, + "learning_rate": 7.589529543844404e-05, + "loss": 2.8548, + "step": 19043 + }, + { + "epoch": 1.1821962877894345, + "grad_norm": 0.16290499305935366, + "learning_rate": 7.589220593735658e-05, + "loss": 2.9556, + "step": 19044 + }, + { + "epoch": 1.1822583648891924, + "grad_norm": 0.18785193316541476, + "learning_rate": 7.588911630118318e-05, + "loss": 2.8132, + "step": 19045 + }, + { + "epoch": 1.1823204419889504, + "grad_norm": 0.18339644645284514, + "learning_rate": 7.588602652993999e-05, + "loss": 2.9276, + "step": 19046 + }, + { + "epoch": 1.182382519088708, + "grad_norm": 0.15931968149660589, + "learning_rate": 7.58829366236431e-05, + "loss": 2.8632, + "step": 19047 + }, + { + "epoch": 1.182444596188466, + "grad_norm": 0.15094915316400415, + "learning_rate": 7.587984658230866e-05, + "loss": 2.8573, + "step": 19048 + }, + { + "epoch": 1.182506673288224, + "grad_norm": 0.18362253044667268, + "learning_rate": 7.587675640595277e-05, + "loss": 2.8135, + "step": 19049 + }, + { + "epoch": 1.1825687503879818, + "grad_norm": 0.15398066145097775, + "learning_rate": 7.587366609459156e-05, + "loss": 2.9247, + "step": 19050 + }, + { + "epoch": 1.1826308274877397, + "grad_norm": 0.1771670078356243, + "learning_rate": 7.587057564824116e-05, + "loss": 2.8158, + "step": 19051 + }, + { + "epoch": 1.1826929045874977, + "grad_norm": 0.147229075512854, + "learning_rate": 7.586748506691769e-05, + "loss": 2.8835, + "step": 19052 + }, + { + "epoch": 1.1827549816872556, + "grad_norm": 0.1875659409442223, + "learning_rate": 7.586439435063726e-05, + "loss": 2.9695, + "step": 19053 + }, + { + "epoch": 1.1828170587870135, + "grad_norm": 0.1753879438590403, + "learning_rate": 7.5861303499416e-05, + "loss": 2.8753, + "step": 19054 + }, + { + "epoch": 1.1828791358867714, + "grad_norm": 0.15653794102677077, + "learning_rate": 7.585821251327006e-05, + "loss": 2.9432, + "step": 19055 + }, + { + "epoch": 1.1829412129865293, + "grad_norm": 0.1583175716167598, + "learning_rate": 7.585512139221553e-05, + "loss": 2.8939, + "step": 19056 + }, + { + "epoch": 1.1830032900862872, + "grad_norm": 0.1511863304836492, + "learning_rate": 7.585203013626857e-05, + "loss": 2.8936, + "step": 19057 + }, + { + "epoch": 1.1830653671860452, + "grad_norm": 0.17676345015329056, + "learning_rate": 7.584893874544528e-05, + "loss": 2.9585, + "step": 19058 + }, + { + "epoch": 1.183127444285803, + "grad_norm": 0.16819368316436717, + "learning_rate": 7.584584721976181e-05, + "loss": 2.8703, + "step": 19059 + }, + { + "epoch": 1.1831895213855608, + "grad_norm": 0.15942478617778985, + "learning_rate": 7.584275555923427e-05, + "loss": 2.9348, + "step": 19060 + }, + { + "epoch": 1.1832515984853187, + "grad_norm": 0.16073408790103655, + "learning_rate": 7.583966376387883e-05, + "loss": 2.9505, + "step": 19061 + }, + { + "epoch": 1.1833136755850766, + "grad_norm": 0.18953226343939056, + "learning_rate": 7.583657183371155e-05, + "loss": 2.7725, + "step": 19062 + }, + { + "epoch": 1.1833757526848345, + "grad_norm": 0.1572863186838644, + "learning_rate": 7.583347976874865e-05, + "loss": 2.9687, + "step": 19063 + }, + { + "epoch": 1.1834378297845924, + "grad_norm": 0.2304074157308696, + "learning_rate": 7.583038756900618e-05, + "loss": 2.9609, + "step": 19064 + }, + { + "epoch": 1.1834999068843504, + "grad_norm": 0.15170440642365438, + "learning_rate": 7.582729523450032e-05, + "loss": 2.8866, + "step": 19065 + }, + { + "epoch": 1.1835619839841083, + "grad_norm": 0.17251747517940652, + "learning_rate": 7.582420276524719e-05, + "loss": 2.8268, + "step": 19066 + }, + { + "epoch": 1.1836240610838662, + "grad_norm": 0.16307559523082737, + "learning_rate": 7.582111016126292e-05, + "loss": 2.854, + "step": 19067 + }, + { + "epoch": 1.1836861381836241, + "grad_norm": 0.22508525193598486, + "learning_rate": 7.581801742256365e-05, + "loss": 2.9385, + "step": 19068 + }, + { + "epoch": 1.183748215283382, + "grad_norm": 0.1595365350802433, + "learning_rate": 7.581492454916551e-05, + "loss": 2.8027, + "step": 19069 + }, + { + "epoch": 1.18381029238314, + "grad_norm": 0.1743269257349933, + "learning_rate": 7.581183154108466e-05, + "loss": 2.8597, + "step": 19070 + }, + { + "epoch": 1.1838723694828976, + "grad_norm": 0.15487518460129582, + "learning_rate": 7.58087383983372e-05, + "loss": 2.8711, + "step": 19071 + }, + { + "epoch": 1.1839344465826556, + "grad_norm": 0.1550904461268366, + "learning_rate": 7.580564512093928e-05, + "loss": 2.7955, + "step": 19072 + }, + { + "epoch": 1.1839965236824135, + "grad_norm": 0.14971949954035374, + "learning_rate": 7.580255170890708e-05, + "loss": 2.9066, + "step": 19073 + }, + { + "epoch": 1.1840586007821714, + "grad_norm": 0.15868138208273727, + "learning_rate": 7.579945816225668e-05, + "loss": 2.9136, + "step": 19074 + }, + { + "epoch": 1.1841206778819293, + "grad_norm": 0.15983069873713704, + "learning_rate": 7.579636448100425e-05, + "loss": 2.8851, + "step": 19075 + }, + { + "epoch": 1.1841827549816872, + "grad_norm": 0.1692609924732554, + "learning_rate": 7.579327066516591e-05, + "loss": 2.9327, + "step": 19076 + }, + { + "epoch": 1.1842448320814452, + "grad_norm": 0.16022104604030857, + "learning_rate": 7.579017671475784e-05, + "loss": 2.7986, + "step": 19077 + }, + { + "epoch": 1.184306909181203, + "grad_norm": 0.15034129832458595, + "learning_rate": 7.578708262979614e-05, + "loss": 2.9385, + "step": 19078 + }, + { + "epoch": 1.184368986280961, + "grad_norm": 0.1838777658447704, + "learning_rate": 7.578398841029697e-05, + "loss": 2.8347, + "step": 19079 + }, + { + "epoch": 1.184431063380719, + "grad_norm": 0.15966971860154244, + "learning_rate": 7.578089405627648e-05, + "loss": 2.912, + "step": 19080 + }, + { + "epoch": 1.1844931404804768, + "grad_norm": 0.19267856307808126, + "learning_rate": 7.577779956775081e-05, + "loss": 2.9659, + "step": 19081 + }, + { + "epoch": 1.1845552175802347, + "grad_norm": 0.18255579523366003, + "learning_rate": 7.57747049447361e-05, + "loss": 2.9343, + "step": 19082 + }, + { + "epoch": 1.1846172946799927, + "grad_norm": 0.1638449577648635, + "learning_rate": 7.577161018724848e-05, + "loss": 2.8131, + "step": 19083 + }, + { + "epoch": 1.1846793717797504, + "grad_norm": 0.17971002396101493, + "learning_rate": 7.576851529530413e-05, + "loss": 2.9379, + "step": 19084 + }, + { + "epoch": 1.1847414488795083, + "grad_norm": 0.17078542514908202, + "learning_rate": 7.576542026891917e-05, + "loss": 2.8907, + "step": 19085 + }, + { + "epoch": 1.1848035259792662, + "grad_norm": 0.1661366918843257, + "learning_rate": 7.576232510810978e-05, + "loss": 2.873, + "step": 19086 + }, + { + "epoch": 1.184865603079024, + "grad_norm": 0.17272568553598583, + "learning_rate": 7.575922981289207e-05, + "loss": 2.8605, + "step": 19087 + }, + { + "epoch": 1.184927680178782, + "grad_norm": 0.16769631048510236, + "learning_rate": 7.575613438328221e-05, + "loss": 2.8848, + "step": 19088 + }, + { + "epoch": 1.18498975727854, + "grad_norm": 0.16535395427896243, + "learning_rate": 7.575303881929633e-05, + "loss": 2.8999, + "step": 19089 + }, + { + "epoch": 1.1850518343782979, + "grad_norm": 0.2634228271932378, + "learning_rate": 7.574994312095061e-05, + "loss": 2.9358, + "step": 19090 + }, + { + "epoch": 1.1851139114780558, + "grad_norm": 0.22765003003412002, + "learning_rate": 7.574684728826117e-05, + "loss": 2.8942, + "step": 19091 + }, + { + "epoch": 1.1851759885778137, + "grad_norm": 0.17778983665606532, + "learning_rate": 7.574375132124419e-05, + "loss": 2.9329, + "step": 19092 + }, + { + "epoch": 1.1852380656775716, + "grad_norm": 0.2768197684170308, + "learning_rate": 7.574065521991582e-05, + "loss": 2.8839, + "step": 19093 + }, + { + "epoch": 1.1853001427773295, + "grad_norm": 0.16667105934692336, + "learning_rate": 7.573755898429217e-05, + "loss": 2.8314, + "step": 19094 + }, + { + "epoch": 1.1853622198770872, + "grad_norm": 0.19480497222139134, + "learning_rate": 7.573446261438945e-05, + "loss": 2.8873, + "step": 19095 + }, + { + "epoch": 1.1854242969768451, + "grad_norm": 0.1790642606938264, + "learning_rate": 7.57313661102238e-05, + "loss": 2.772, + "step": 19096 + }, + { + "epoch": 1.185486374076603, + "grad_norm": 0.19379982730591563, + "learning_rate": 7.572826947181135e-05, + "loss": 2.9568, + "step": 19097 + }, + { + "epoch": 1.185548451176361, + "grad_norm": 0.21251645585464188, + "learning_rate": 7.572517269916828e-05, + "loss": 2.8875, + "step": 19098 + }, + { + "epoch": 1.185610528276119, + "grad_norm": 0.18771465253477881, + "learning_rate": 7.572207579231075e-05, + "loss": 2.7956, + "step": 19099 + }, + { + "epoch": 1.1856726053758768, + "grad_norm": 0.1634142249786303, + "learning_rate": 7.571897875125487e-05, + "loss": 2.8035, + "step": 19100 + }, + { + "epoch": 1.1857346824756347, + "grad_norm": 0.17210935199579822, + "learning_rate": 7.571588157601687e-05, + "loss": 2.8639, + "step": 19101 + }, + { + "epoch": 1.1857967595753927, + "grad_norm": 0.17653252802970482, + "learning_rate": 7.571278426661286e-05, + "loss": 2.8708, + "step": 19102 + }, + { + "epoch": 1.1858588366751506, + "grad_norm": 0.1706902680258144, + "learning_rate": 7.570968682305901e-05, + "loss": 2.8048, + "step": 19103 + }, + { + "epoch": 1.1859209137749085, + "grad_norm": 0.1661683165774972, + "learning_rate": 7.57065892453715e-05, + "loss": 2.7474, + "step": 19104 + }, + { + "epoch": 1.1859829908746664, + "grad_norm": 0.16387776350918162, + "learning_rate": 7.570349153356644e-05, + "loss": 2.8196, + "step": 19105 + }, + { + "epoch": 1.1860450679744243, + "grad_norm": 0.1540811878171107, + "learning_rate": 7.570039368766004e-05, + "loss": 2.984, + "step": 19106 + }, + { + "epoch": 1.1861071450741822, + "grad_norm": 0.158344011798911, + "learning_rate": 7.569729570766845e-05, + "loss": 2.9367, + "step": 19107 + }, + { + "epoch": 1.18616922217394, + "grad_norm": 0.1666388929342327, + "learning_rate": 7.569419759360784e-05, + "loss": 2.8742, + "step": 19108 + }, + { + "epoch": 1.1862312992736979, + "grad_norm": 0.169527099488251, + "learning_rate": 7.569109934549436e-05, + "loss": 2.9255, + "step": 19109 + }, + { + "epoch": 1.1862933763734558, + "grad_norm": 0.19990243655502185, + "learning_rate": 7.568800096334416e-05, + "loss": 2.886, + "step": 19110 + }, + { + "epoch": 1.1863554534732137, + "grad_norm": 0.1851492787911886, + "learning_rate": 7.568490244717346e-05, + "loss": 2.9223, + "step": 19111 + }, + { + "epoch": 1.1864175305729716, + "grad_norm": 0.17572360153088148, + "learning_rate": 7.568180379699834e-05, + "loss": 2.918, + "step": 19112 + }, + { + "epoch": 1.1864796076727295, + "grad_norm": 0.1997535483507044, + "learning_rate": 7.567870501283505e-05, + "loss": 2.9923, + "step": 19113 + }, + { + "epoch": 1.1865416847724874, + "grad_norm": 0.18424791102020954, + "learning_rate": 7.567560609469971e-05, + "loss": 2.9048, + "step": 19114 + }, + { + "epoch": 1.1866037618722454, + "grad_norm": 0.18310678100115715, + "learning_rate": 7.56725070426085e-05, + "loss": 2.843, + "step": 19115 + }, + { + "epoch": 1.1866658389720033, + "grad_norm": 0.16161674309259738, + "learning_rate": 7.56694078565776e-05, + "loss": 2.914, + "step": 19116 + }, + { + "epoch": 1.1867279160717612, + "grad_norm": 0.15784595342656596, + "learning_rate": 7.566630853662315e-05, + "loss": 2.9014, + "step": 19117 + }, + { + "epoch": 1.1867899931715191, + "grad_norm": 0.15178164684030931, + "learning_rate": 7.566320908276134e-05, + "loss": 2.9007, + "step": 19118 + }, + { + "epoch": 1.1868520702712768, + "grad_norm": 0.15228335088012215, + "learning_rate": 7.566010949500835e-05, + "loss": 2.9058, + "step": 19119 + }, + { + "epoch": 1.1869141473710347, + "grad_norm": 0.21578616739535678, + "learning_rate": 7.565700977338034e-05, + "loss": 2.8572, + "step": 19120 + }, + { + "epoch": 1.1869762244707927, + "grad_norm": 0.1644430968427893, + "learning_rate": 7.565390991789346e-05, + "loss": 2.8993, + "step": 19121 + }, + { + "epoch": 1.1870383015705506, + "grad_norm": 0.14873272624215536, + "learning_rate": 7.565080992856392e-05, + "loss": 2.9184, + "step": 19122 + }, + { + "epoch": 1.1871003786703085, + "grad_norm": 0.1576119964482198, + "learning_rate": 7.564770980540788e-05, + "loss": 2.9026, + "step": 19123 + }, + { + "epoch": 1.1871624557700664, + "grad_norm": 0.16000661023276236, + "learning_rate": 7.564460954844152e-05, + "loss": 2.8913, + "step": 19124 + }, + { + "epoch": 1.1872245328698243, + "grad_norm": 0.1682251702907719, + "learning_rate": 7.5641509157681e-05, + "loss": 2.955, + "step": 19125 + }, + { + "epoch": 1.1872866099695822, + "grad_norm": 0.1500982563748893, + "learning_rate": 7.563840863314248e-05, + "loss": 2.8904, + "step": 19126 + }, + { + "epoch": 1.1873486870693402, + "grad_norm": 0.15317364985437693, + "learning_rate": 7.563530797484218e-05, + "loss": 2.8542, + "step": 19127 + }, + { + "epoch": 1.187410764169098, + "grad_norm": 0.1405680130356162, + "learning_rate": 7.563220718279624e-05, + "loss": 2.8685, + "step": 19128 + }, + { + "epoch": 1.187472841268856, + "grad_norm": 0.1662071805858858, + "learning_rate": 7.562910625702089e-05, + "loss": 2.9365, + "step": 19129 + }, + { + "epoch": 1.187534918368614, + "grad_norm": 0.21546726780935574, + "learning_rate": 7.562600519753222e-05, + "loss": 2.917, + "step": 19130 + }, + { + "epoch": 1.1875969954683718, + "grad_norm": 0.17386298419445603, + "learning_rate": 7.562290400434651e-05, + "loss": 2.922, + "step": 19131 + }, + { + "epoch": 1.1876590725681295, + "grad_norm": 0.14280617778189747, + "learning_rate": 7.561980267747986e-05, + "loss": 2.9079, + "step": 19132 + }, + { + "epoch": 1.1877211496678874, + "grad_norm": 0.15287791651059296, + "learning_rate": 7.561670121694848e-05, + "loss": 2.8561, + "step": 19133 + }, + { + "epoch": 1.1877832267676454, + "grad_norm": 0.1507316043140815, + "learning_rate": 7.561359962276855e-05, + "loss": 2.8338, + "step": 19134 + }, + { + "epoch": 1.1878453038674033, + "grad_norm": 0.1809860659104281, + "learning_rate": 7.561049789495626e-05, + "loss": 2.8341, + "step": 19135 + }, + { + "epoch": 1.1879073809671612, + "grad_norm": 0.15202101732810716, + "learning_rate": 7.560739603352777e-05, + "loss": 2.9375, + "step": 19136 + }, + { + "epoch": 1.1879694580669191, + "grad_norm": 0.15109665598634023, + "learning_rate": 7.560429403849931e-05, + "loss": 2.7479, + "step": 19137 + }, + { + "epoch": 1.188031535166677, + "grad_norm": 0.15479854922911618, + "learning_rate": 7.5601191909887e-05, + "loss": 2.9075, + "step": 19138 + }, + { + "epoch": 1.188093612266435, + "grad_norm": 0.19730120989435299, + "learning_rate": 7.559808964770707e-05, + "loss": 2.8868, + "step": 19139 + }, + { + "epoch": 1.1881556893661929, + "grad_norm": 0.16005535553035732, + "learning_rate": 7.55949872519757e-05, + "loss": 2.902, + "step": 19140 + }, + { + "epoch": 1.1882177664659508, + "grad_norm": 0.15147402899687934, + "learning_rate": 7.559188472270906e-05, + "loss": 2.814, + "step": 19141 + }, + { + "epoch": 1.1882798435657087, + "grad_norm": 0.15053916382849306, + "learning_rate": 7.558878205992334e-05, + "loss": 2.9072, + "step": 19142 + }, + { + "epoch": 1.1883419206654664, + "grad_norm": 0.16521346453659105, + "learning_rate": 7.558567926363474e-05, + "loss": 2.8377, + "step": 19143 + }, + { + "epoch": 1.1884039977652243, + "grad_norm": 0.1535049104712793, + "learning_rate": 7.558257633385943e-05, + "loss": 2.8534, + "step": 19144 + }, + { + "epoch": 1.1884660748649822, + "grad_norm": 0.15227283147568288, + "learning_rate": 7.557947327061363e-05, + "loss": 2.8734, + "step": 19145 + }, + { + "epoch": 1.1885281519647402, + "grad_norm": 0.14923398379511338, + "learning_rate": 7.55763700739135e-05, + "loss": 2.7957, + "step": 19146 + }, + { + "epoch": 1.188590229064498, + "grad_norm": 0.19876949292685114, + "learning_rate": 7.557326674377522e-05, + "loss": 2.8817, + "step": 19147 + }, + { + "epoch": 1.188652306164256, + "grad_norm": 0.16504706414141548, + "learning_rate": 7.557016328021503e-05, + "loss": 2.8868, + "step": 19148 + }, + { + "epoch": 1.188714383264014, + "grad_norm": 0.14668774157277886, + "learning_rate": 7.556705968324907e-05, + "loss": 2.8564, + "step": 19149 + }, + { + "epoch": 1.1887764603637718, + "grad_norm": 0.15877985323468213, + "learning_rate": 7.556395595289355e-05, + "loss": 2.9461, + "step": 19150 + }, + { + "epoch": 1.1888385374635297, + "grad_norm": 0.14846170378665244, + "learning_rate": 7.556085208916468e-05, + "loss": 2.8905, + "step": 19151 + }, + { + "epoch": 1.1889006145632877, + "grad_norm": 0.16182976473619995, + "learning_rate": 7.555774809207862e-05, + "loss": 2.8312, + "step": 19152 + }, + { + "epoch": 1.1889626916630456, + "grad_norm": 0.1552604393946765, + "learning_rate": 7.555464396165161e-05, + "loss": 2.9075, + "step": 19153 + }, + { + "epoch": 1.1890247687628035, + "grad_norm": 0.19977687742835495, + "learning_rate": 7.55515396978998e-05, + "loss": 2.8881, + "step": 19154 + }, + { + "epoch": 1.1890868458625614, + "grad_norm": 0.15849438688119474, + "learning_rate": 7.55484353008394e-05, + "loss": 2.9067, + "step": 19155 + }, + { + "epoch": 1.189148922962319, + "grad_norm": 0.16806008938273098, + "learning_rate": 7.554533077048663e-05, + "loss": 2.869, + "step": 19156 + }, + { + "epoch": 1.189211000062077, + "grad_norm": 0.158412571004856, + "learning_rate": 7.554222610685765e-05, + "loss": 2.8662, + "step": 19157 + }, + { + "epoch": 1.189273077161835, + "grad_norm": 0.16981210122467044, + "learning_rate": 7.55391213099687e-05, + "loss": 2.9278, + "step": 19158 + }, + { + "epoch": 1.1893351542615929, + "grad_norm": 0.15208879557907626, + "learning_rate": 7.553601637983593e-05, + "loss": 2.9415, + "step": 19159 + }, + { + "epoch": 1.1893972313613508, + "grad_norm": 0.15718418047681773, + "learning_rate": 7.553291131647558e-05, + "loss": 2.8635, + "step": 19160 + }, + { + "epoch": 1.1894593084611087, + "grad_norm": 0.18705736006708465, + "learning_rate": 7.552980611990381e-05, + "loss": 2.8979, + "step": 19161 + }, + { + "epoch": 1.1895213855608666, + "grad_norm": 0.16502615635535664, + "learning_rate": 7.552670079013687e-05, + "loss": 2.9069, + "step": 19162 + }, + { + "epoch": 1.1895834626606245, + "grad_norm": 0.1533987306398253, + "learning_rate": 7.552359532719093e-05, + "loss": 2.8144, + "step": 19163 + }, + { + "epoch": 1.1896455397603825, + "grad_norm": 0.17611545475522525, + "learning_rate": 7.55204897310822e-05, + "loss": 2.9229, + "step": 19164 + }, + { + "epoch": 1.1897076168601404, + "grad_norm": 0.16425680960803773, + "learning_rate": 7.551738400182687e-05, + "loss": 2.9617, + "step": 19165 + }, + { + "epoch": 1.1897696939598983, + "grad_norm": 0.16880215660037912, + "learning_rate": 7.551427813944113e-05, + "loss": 2.8771, + "step": 19166 + }, + { + "epoch": 1.189831771059656, + "grad_norm": 0.15718791913266736, + "learning_rate": 7.551117214394125e-05, + "loss": 2.8305, + "step": 19167 + }, + { + "epoch": 1.189893848159414, + "grad_norm": 0.176742231620173, + "learning_rate": 7.550806601534337e-05, + "loss": 2.8226, + "step": 19168 + }, + { + "epoch": 1.1899559252591718, + "grad_norm": 0.16110768555750668, + "learning_rate": 7.550495975366372e-05, + "loss": 2.8935, + "step": 19169 + }, + { + "epoch": 1.1900180023589297, + "grad_norm": 0.15951402925247488, + "learning_rate": 7.55018533589185e-05, + "loss": 2.8763, + "step": 19170 + }, + { + "epoch": 1.1900800794586877, + "grad_norm": 0.24268676456641583, + "learning_rate": 7.549874683112392e-05, + "loss": 2.8381, + "step": 19171 + }, + { + "epoch": 1.1901421565584456, + "grad_norm": 0.15563677018440025, + "learning_rate": 7.54956401702962e-05, + "loss": 2.8958, + "step": 19172 + }, + { + "epoch": 1.1902042336582035, + "grad_norm": 0.18629980411864777, + "learning_rate": 7.549253337645153e-05, + "loss": 2.9248, + "step": 19173 + }, + { + "epoch": 1.1902663107579614, + "grad_norm": 0.20213884524378462, + "learning_rate": 7.548942644960612e-05, + "loss": 2.8734, + "step": 19174 + }, + { + "epoch": 1.1903283878577193, + "grad_norm": 0.1681220386467672, + "learning_rate": 7.548631938977617e-05, + "loss": 2.8847, + "step": 19175 + }, + { + "epoch": 1.1903904649574772, + "grad_norm": 0.15788712554745593, + "learning_rate": 7.548321219697793e-05, + "loss": 2.8638, + "step": 19176 + }, + { + "epoch": 1.1904525420572352, + "grad_norm": 0.1670610665043283, + "learning_rate": 7.548010487122759e-05, + "loss": 2.9565, + "step": 19177 + }, + { + "epoch": 1.190514619156993, + "grad_norm": 0.16331964867198515, + "learning_rate": 7.547699741254133e-05, + "loss": 2.8606, + "step": 19178 + }, + { + "epoch": 1.190576696256751, + "grad_norm": 0.16238718472472852, + "learning_rate": 7.547388982093541e-05, + "loss": 2.8685, + "step": 19179 + }, + { + "epoch": 1.1906387733565087, + "grad_norm": 0.15754005620043815, + "learning_rate": 7.547078209642601e-05, + "loss": 2.8746, + "step": 19180 + }, + { + "epoch": 1.1907008504562666, + "grad_norm": 0.16794011515124035, + "learning_rate": 7.546767423902938e-05, + "loss": 2.8627, + "step": 19181 + }, + { + "epoch": 1.1907629275560245, + "grad_norm": 0.15976168713928074, + "learning_rate": 7.546456624876169e-05, + "loss": 2.9127, + "step": 19182 + }, + { + "epoch": 1.1908250046557824, + "grad_norm": 0.16524748648709967, + "learning_rate": 7.546145812563918e-05, + "loss": 2.9016, + "step": 19183 + }, + { + "epoch": 1.1908870817555404, + "grad_norm": 0.21107987614520218, + "learning_rate": 7.545834986967806e-05, + "loss": 2.8658, + "step": 19184 + }, + { + "epoch": 1.1909491588552983, + "grad_norm": 0.15409696883980678, + "learning_rate": 7.545524148089455e-05, + "loss": 2.8601, + "step": 19185 + }, + { + "epoch": 1.1910112359550562, + "grad_norm": 0.16693528639226593, + "learning_rate": 7.545213295930488e-05, + "loss": 2.8423, + "step": 19186 + }, + { + "epoch": 1.1910733130548141, + "grad_norm": 0.18477356028704964, + "learning_rate": 7.544902430492523e-05, + "loss": 2.9328, + "step": 19187 + }, + { + "epoch": 1.191135390154572, + "grad_norm": 0.19183632159481073, + "learning_rate": 7.544591551777186e-05, + "loss": 2.8287, + "step": 19188 + }, + { + "epoch": 1.19119746725433, + "grad_norm": 0.1805371431495221, + "learning_rate": 7.544280659786096e-05, + "loss": 2.8609, + "step": 19189 + }, + { + "epoch": 1.1912595443540879, + "grad_norm": 0.18057080516748802, + "learning_rate": 7.543969754520876e-05, + "loss": 2.8682, + "step": 19190 + }, + { + "epoch": 1.1913216214538456, + "grad_norm": 0.18002939868300238, + "learning_rate": 7.543658835983149e-05, + "loss": 2.8438, + "step": 19191 + }, + { + "epoch": 1.1913836985536035, + "grad_norm": 0.19283253731009667, + "learning_rate": 7.543347904174538e-05, + "loss": 2.9345, + "step": 19192 + }, + { + "epoch": 1.1914457756533614, + "grad_norm": 0.17993852454672576, + "learning_rate": 7.543036959096661e-05, + "loss": 2.8689, + "step": 19193 + }, + { + "epoch": 1.1915078527531193, + "grad_norm": 0.18620338743126844, + "learning_rate": 7.542726000751143e-05, + "loss": 2.8809, + "step": 19194 + }, + { + "epoch": 1.1915699298528772, + "grad_norm": 0.1871448869394273, + "learning_rate": 7.542415029139607e-05, + "loss": 2.943, + "step": 19195 + }, + { + "epoch": 1.1916320069526352, + "grad_norm": 0.22255795802200934, + "learning_rate": 7.542104044263676e-05, + "loss": 2.9263, + "step": 19196 + }, + { + "epoch": 1.191694084052393, + "grad_norm": 0.23277582737412328, + "learning_rate": 7.541793046124967e-05, + "loss": 2.9998, + "step": 19197 + }, + { + "epoch": 1.191756161152151, + "grad_norm": 0.1845448592462411, + "learning_rate": 7.541482034725111e-05, + "loss": 2.966, + "step": 19198 + }, + { + "epoch": 1.191818238251909, + "grad_norm": 0.1830092215305668, + "learning_rate": 7.541171010065724e-05, + "loss": 2.8805, + "step": 19199 + }, + { + "epoch": 1.1918803153516668, + "grad_norm": 0.21563477977743828, + "learning_rate": 7.540859972148431e-05, + "loss": 2.8529, + "step": 19200 + }, + { + "epoch": 1.1919423924514247, + "grad_norm": 0.1850758748302166, + "learning_rate": 7.540548920974856e-05, + "loss": 2.8312, + "step": 19201 + }, + { + "epoch": 1.1920044695511827, + "grad_norm": 0.1838243132462339, + "learning_rate": 7.540237856546619e-05, + "loss": 2.781, + "step": 19202 + }, + { + "epoch": 1.1920665466509406, + "grad_norm": 0.1697211175798042, + "learning_rate": 7.539926778865345e-05, + "loss": 2.7875, + "step": 19203 + }, + { + "epoch": 1.1921286237506983, + "grad_norm": 0.17538567644372546, + "learning_rate": 7.539615687932655e-05, + "loss": 2.7588, + "step": 19204 + }, + { + "epoch": 1.1921907008504562, + "grad_norm": 0.15655882832986365, + "learning_rate": 7.539304583750176e-05, + "loss": 2.8916, + "step": 19205 + }, + { + "epoch": 1.1922527779502141, + "grad_norm": 0.1677236598625126, + "learning_rate": 7.538993466319529e-05, + "loss": 2.8787, + "step": 19206 + }, + { + "epoch": 1.192314855049972, + "grad_norm": 0.1995090445228637, + "learning_rate": 7.538682335642334e-05, + "loss": 2.8794, + "step": 19207 + }, + { + "epoch": 1.19237693214973, + "grad_norm": 0.1628530804386524, + "learning_rate": 7.538371191720218e-05, + "loss": 2.889, + "step": 19208 + }, + { + "epoch": 1.1924390092494879, + "grad_norm": 0.17353917333518393, + "learning_rate": 7.538060034554804e-05, + "loss": 2.9375, + "step": 19209 + }, + { + "epoch": 1.1925010863492458, + "grad_norm": 0.1792129906291835, + "learning_rate": 7.537748864147713e-05, + "loss": 2.9232, + "step": 19210 + }, + { + "epoch": 1.1925631634490037, + "grad_norm": 0.16220701678124994, + "learning_rate": 7.537437680500572e-05, + "loss": 2.8447, + "step": 19211 + }, + { + "epoch": 1.1926252405487616, + "grad_norm": 0.1592905851808958, + "learning_rate": 7.537126483615002e-05, + "loss": 2.864, + "step": 19212 + }, + { + "epoch": 1.1926873176485195, + "grad_norm": 0.1624566288528278, + "learning_rate": 7.536815273492627e-05, + "loss": 2.8594, + "step": 19213 + }, + { + "epoch": 1.1927493947482772, + "grad_norm": 0.15804361025106015, + "learning_rate": 7.536504050135072e-05, + "loss": 2.8763, + "step": 19214 + }, + { + "epoch": 1.1928114718480352, + "grad_norm": 0.15297539905986052, + "learning_rate": 7.536192813543957e-05, + "loss": 2.8946, + "step": 19215 + }, + { + "epoch": 1.192873548947793, + "grad_norm": 0.15837340551061443, + "learning_rate": 7.53588156372091e-05, + "loss": 2.8352, + "step": 19216 + }, + { + "epoch": 1.192935626047551, + "grad_norm": 0.17621786147500923, + "learning_rate": 7.535570300667554e-05, + "loss": 2.7913, + "step": 19217 + }, + { + "epoch": 1.192997703147309, + "grad_norm": 0.1595372764633509, + "learning_rate": 7.53525902438551e-05, + "loss": 2.8366, + "step": 19218 + }, + { + "epoch": 1.1930597802470668, + "grad_norm": 0.15293224084118373, + "learning_rate": 7.534947734876407e-05, + "loss": 2.8339, + "step": 19219 + }, + { + "epoch": 1.1931218573468247, + "grad_norm": 0.1678078346385341, + "learning_rate": 7.534636432141863e-05, + "loss": 2.7901, + "step": 19220 + }, + { + "epoch": 1.1931839344465827, + "grad_norm": 0.16842708044964175, + "learning_rate": 7.534325116183509e-05, + "loss": 2.9595, + "step": 19221 + }, + { + "epoch": 1.1932460115463406, + "grad_norm": 0.16429854623885082, + "learning_rate": 7.534013787002962e-05, + "loss": 2.9328, + "step": 19222 + }, + { + "epoch": 1.1933080886460985, + "grad_norm": 0.15371865564573353, + "learning_rate": 7.533702444601853e-05, + "loss": 2.8608, + "step": 19223 + }, + { + "epoch": 1.1933701657458564, + "grad_norm": 0.19030619798758727, + "learning_rate": 7.533391088981803e-05, + "loss": 2.8594, + "step": 19224 + }, + { + "epoch": 1.1934322428456143, + "grad_norm": 0.14563944961831507, + "learning_rate": 7.533079720144436e-05, + "loss": 2.7967, + "step": 19225 + }, + { + "epoch": 1.1934943199453723, + "grad_norm": 0.16597337185079347, + "learning_rate": 7.532768338091377e-05, + "loss": 2.8611, + "step": 19226 + }, + { + "epoch": 1.1935563970451302, + "grad_norm": 0.15859958633244733, + "learning_rate": 7.532456942824251e-05, + "loss": 2.8681, + "step": 19227 + }, + { + "epoch": 1.1936184741448879, + "grad_norm": 0.1688684773886711, + "learning_rate": 7.532145534344682e-05, + "loss": 2.8396, + "step": 19228 + }, + { + "epoch": 1.1936805512446458, + "grad_norm": 0.16166179409582357, + "learning_rate": 7.531834112654295e-05, + "loss": 2.9, + "step": 19229 + }, + { + "epoch": 1.1937426283444037, + "grad_norm": 0.1751747795499109, + "learning_rate": 7.531522677754716e-05, + "loss": 2.8432, + "step": 19230 + }, + { + "epoch": 1.1938047054441616, + "grad_norm": 0.17587863026141382, + "learning_rate": 7.531211229647567e-05, + "loss": 2.8275, + "step": 19231 + }, + { + "epoch": 1.1938667825439195, + "grad_norm": 0.15608597729397589, + "learning_rate": 7.530899768334476e-05, + "loss": 2.9435, + "step": 19232 + }, + { + "epoch": 1.1939288596436775, + "grad_norm": 0.1603856072660594, + "learning_rate": 7.530588293817064e-05, + "loss": 2.8612, + "step": 19233 + }, + { + "epoch": 1.1939909367434354, + "grad_norm": 0.1946731593635187, + "learning_rate": 7.530276806096961e-05, + "loss": 2.9107, + "step": 19234 + }, + { + "epoch": 1.1940530138431933, + "grad_norm": 0.16458985744796303, + "learning_rate": 7.52996530517579e-05, + "loss": 2.8384, + "step": 19235 + }, + { + "epoch": 1.1941150909429512, + "grad_norm": 0.18412651718470188, + "learning_rate": 7.529653791055173e-05, + "loss": 2.9578, + "step": 19236 + }, + { + "epoch": 1.1941771680427091, + "grad_norm": 0.17595890392653876, + "learning_rate": 7.52934226373674e-05, + "loss": 2.9672, + "step": 19237 + }, + { + "epoch": 1.1942392451424668, + "grad_norm": 0.18297188527314603, + "learning_rate": 7.529030723222114e-05, + "loss": 2.834, + "step": 19238 + }, + { + "epoch": 1.1943013222422247, + "grad_norm": 0.16324301824200857, + "learning_rate": 7.528719169512921e-05, + "loss": 2.837, + "step": 19239 + }, + { + "epoch": 1.1943633993419827, + "grad_norm": 0.15809706676794474, + "learning_rate": 7.528407602610786e-05, + "loss": 2.908, + "step": 19240 + }, + { + "epoch": 1.1944254764417406, + "grad_norm": 0.22550538511570561, + "learning_rate": 7.528096022517335e-05, + "loss": 2.8929, + "step": 19241 + }, + { + "epoch": 1.1944875535414985, + "grad_norm": 0.17529948761701097, + "learning_rate": 7.527784429234195e-05, + "loss": 2.9344, + "step": 19242 + }, + { + "epoch": 1.1945496306412564, + "grad_norm": 0.17019996140289934, + "learning_rate": 7.527472822762986e-05, + "loss": 2.8381, + "step": 19243 + }, + { + "epoch": 1.1946117077410143, + "grad_norm": 0.16098928706255922, + "learning_rate": 7.527161203105342e-05, + "loss": 2.9229, + "step": 19244 + }, + { + "epoch": 1.1946737848407722, + "grad_norm": 0.1559828859227769, + "learning_rate": 7.526849570262882e-05, + "loss": 2.9424, + "step": 19245 + }, + { + "epoch": 1.1947358619405302, + "grad_norm": 0.1546602256408047, + "learning_rate": 7.526537924237236e-05, + "loss": 2.8423, + "step": 19246 + }, + { + "epoch": 1.194797939040288, + "grad_norm": 0.16643377264521172, + "learning_rate": 7.526226265030026e-05, + "loss": 2.872, + "step": 19247 + }, + { + "epoch": 1.194860016140046, + "grad_norm": 0.16680779231522375, + "learning_rate": 7.525914592642882e-05, + "loss": 2.9839, + "step": 19248 + }, + { + "epoch": 1.194922093239804, + "grad_norm": 0.18608889466941347, + "learning_rate": 7.525602907077427e-05, + "loss": 2.9294, + "step": 19249 + }, + { + "epoch": 1.1949841703395618, + "grad_norm": 0.14996051591524254, + "learning_rate": 7.52529120833529e-05, + "loss": 2.9238, + "step": 19250 + }, + { + "epoch": 1.1950462474393198, + "grad_norm": 0.15476850065037084, + "learning_rate": 7.524979496418095e-05, + "loss": 2.9594, + "step": 19251 + }, + { + "epoch": 1.1951083245390774, + "grad_norm": 0.15285513096969205, + "learning_rate": 7.524667771327468e-05, + "loss": 2.9898, + "step": 19252 + }, + { + "epoch": 1.1951704016388354, + "grad_norm": 0.15817223353252494, + "learning_rate": 7.524356033065039e-05, + "loss": 2.8521, + "step": 19253 + }, + { + "epoch": 1.1952324787385933, + "grad_norm": 0.15772067198008385, + "learning_rate": 7.52404428163243e-05, + "loss": 2.9291, + "step": 19254 + }, + { + "epoch": 1.1952945558383512, + "grad_norm": 0.15350396400794408, + "learning_rate": 7.52373251703127e-05, + "loss": 2.9077, + "step": 19255 + }, + { + "epoch": 1.1953566329381091, + "grad_norm": 0.15521933254741657, + "learning_rate": 7.523420739263183e-05, + "loss": 2.8899, + "step": 19256 + }, + { + "epoch": 1.195418710037867, + "grad_norm": 0.1489884416678302, + "learning_rate": 7.5231089483298e-05, + "loss": 2.8343, + "step": 19257 + }, + { + "epoch": 1.195480787137625, + "grad_norm": 0.15753007700952004, + "learning_rate": 7.522797144232742e-05, + "loss": 2.7805, + "step": 19258 + }, + { + "epoch": 1.1955428642373829, + "grad_norm": 0.15467125104082163, + "learning_rate": 7.522485326973641e-05, + "loss": 2.8774, + "step": 19259 + }, + { + "epoch": 1.1956049413371408, + "grad_norm": 0.15608716466833603, + "learning_rate": 7.522173496554123e-05, + "loss": 2.9192, + "step": 19260 + }, + { + "epoch": 1.1956670184368987, + "grad_norm": 0.15315926348506703, + "learning_rate": 7.521861652975811e-05, + "loss": 2.7732, + "step": 19261 + }, + { + "epoch": 1.1957290955366564, + "grad_norm": 0.16987354314676212, + "learning_rate": 7.521549796240337e-05, + "loss": 2.8992, + "step": 19262 + }, + { + "epoch": 1.1957911726364143, + "grad_norm": 0.15375895834620318, + "learning_rate": 7.521237926349322e-05, + "loss": 2.9281, + "step": 19263 + }, + { + "epoch": 1.1958532497361722, + "grad_norm": 0.17033829034416328, + "learning_rate": 7.5209260433044e-05, + "loss": 2.8175, + "step": 19264 + }, + { + "epoch": 1.1959153268359302, + "grad_norm": 0.16093944738191324, + "learning_rate": 7.520614147107193e-05, + "loss": 2.8381, + "step": 19265 + }, + { + "epoch": 1.195977403935688, + "grad_norm": 0.1952630743431112, + "learning_rate": 7.520302237759331e-05, + "loss": 2.9747, + "step": 19266 + }, + { + "epoch": 1.196039481035446, + "grad_norm": 0.16166150028803347, + "learning_rate": 7.51999031526244e-05, + "loss": 2.9259, + "step": 19267 + }, + { + "epoch": 1.196101558135204, + "grad_norm": 0.19101530164757569, + "learning_rate": 7.519678379618149e-05, + "loss": 2.9426, + "step": 19268 + }, + { + "epoch": 1.1961636352349618, + "grad_norm": 0.16255451466315632, + "learning_rate": 7.519366430828083e-05, + "loss": 2.9424, + "step": 19269 + }, + { + "epoch": 1.1962257123347197, + "grad_norm": 0.14895857778164562, + "learning_rate": 7.51905446889387e-05, + "loss": 2.9153, + "step": 19270 + }, + { + "epoch": 1.1962877894344777, + "grad_norm": 0.17380404037613692, + "learning_rate": 7.518742493817139e-05, + "loss": 2.9554, + "step": 19271 + }, + { + "epoch": 1.1963498665342356, + "grad_norm": 0.14249008543675903, + "learning_rate": 7.518430505599517e-05, + "loss": 2.9517, + "step": 19272 + }, + { + "epoch": 1.1964119436339935, + "grad_norm": 0.16950871124395017, + "learning_rate": 7.518118504242633e-05, + "loss": 2.8799, + "step": 19273 + }, + { + "epoch": 1.1964740207337514, + "grad_norm": 0.18935126092887303, + "learning_rate": 7.517806489748111e-05, + "loss": 2.9344, + "step": 19274 + }, + { + "epoch": 1.1965360978335093, + "grad_norm": 0.19259888960680643, + "learning_rate": 7.517494462117584e-05, + "loss": 2.8069, + "step": 19275 + }, + { + "epoch": 1.196598174933267, + "grad_norm": 0.15549838967271443, + "learning_rate": 7.517182421352675e-05, + "loss": 2.7698, + "step": 19276 + }, + { + "epoch": 1.196660252033025, + "grad_norm": 0.16759894381763057, + "learning_rate": 7.516870367455016e-05, + "loss": 2.8834, + "step": 19277 + }, + { + "epoch": 1.1967223291327829, + "grad_norm": 0.16216617761041102, + "learning_rate": 7.516558300426233e-05, + "loss": 2.9118, + "step": 19278 + }, + { + "epoch": 1.1967844062325408, + "grad_norm": 0.1512200902101772, + "learning_rate": 7.516246220267953e-05, + "loss": 2.9556, + "step": 19279 + }, + { + "epoch": 1.1968464833322987, + "grad_norm": 0.1574197021220826, + "learning_rate": 7.515934126981807e-05, + "loss": 2.8854, + "step": 19280 + }, + { + "epoch": 1.1969085604320566, + "grad_norm": 0.1471239876035733, + "learning_rate": 7.515622020569421e-05, + "loss": 2.8423, + "step": 19281 + }, + { + "epoch": 1.1969706375318145, + "grad_norm": 0.1726219300643476, + "learning_rate": 7.515309901032426e-05, + "loss": 2.9056, + "step": 19282 + }, + { + "epoch": 1.1970327146315725, + "grad_norm": 0.15702817377711714, + "learning_rate": 7.514997768372447e-05, + "loss": 2.8372, + "step": 19283 + }, + { + "epoch": 1.1970947917313304, + "grad_norm": 0.1572029321102699, + "learning_rate": 7.514685622591114e-05, + "loss": 2.9139, + "step": 19284 + }, + { + "epoch": 1.1971568688310883, + "grad_norm": 0.19004491422608158, + "learning_rate": 7.514373463690057e-05, + "loss": 2.8274, + "step": 19285 + }, + { + "epoch": 1.197218945930846, + "grad_norm": 0.16535859522653235, + "learning_rate": 7.514061291670899e-05, + "loss": 2.8783, + "step": 19286 + }, + { + "epoch": 1.197281023030604, + "grad_norm": 0.15077717138120597, + "learning_rate": 7.513749106535278e-05, + "loss": 2.849, + "step": 19287 + }, + { + "epoch": 1.1973431001303618, + "grad_norm": 0.15176981629332187, + "learning_rate": 7.513436908284815e-05, + "loss": 2.935, + "step": 19288 + }, + { + "epoch": 1.1974051772301197, + "grad_norm": 0.16556177934502006, + "learning_rate": 7.513124696921144e-05, + "loss": 2.8053, + "step": 19289 + }, + { + "epoch": 1.1974672543298777, + "grad_norm": 0.18017385838420716, + "learning_rate": 7.512812472445889e-05, + "loss": 2.8373, + "step": 19290 + }, + { + "epoch": 1.1975293314296356, + "grad_norm": 0.18752154584115757, + "learning_rate": 7.512500234860682e-05, + "loss": 2.9217, + "step": 19291 + }, + { + "epoch": 1.1975914085293935, + "grad_norm": 0.1772091724624188, + "learning_rate": 7.512187984167151e-05, + "loss": 2.8961, + "step": 19292 + }, + { + "epoch": 1.1976534856291514, + "grad_norm": 0.20031595084302126, + "learning_rate": 7.511875720366927e-05, + "loss": 2.8221, + "step": 19293 + }, + { + "epoch": 1.1977155627289093, + "grad_norm": 0.1474066375394138, + "learning_rate": 7.511563443461636e-05, + "loss": 2.9245, + "step": 19294 + }, + { + "epoch": 1.1977776398286673, + "grad_norm": 0.16120651178036652, + "learning_rate": 7.51125115345291e-05, + "loss": 2.8813, + "step": 19295 + }, + { + "epoch": 1.1978397169284252, + "grad_norm": 0.15839122257193322, + "learning_rate": 7.510938850342376e-05, + "loss": 2.8434, + "step": 19296 + }, + { + "epoch": 1.197901794028183, + "grad_norm": 0.18115433848637483, + "learning_rate": 7.510626534131664e-05, + "loss": 2.8895, + "step": 19297 + }, + { + "epoch": 1.197963871127941, + "grad_norm": 0.17073901233500427, + "learning_rate": 7.510314204822406e-05, + "loss": 2.9986, + "step": 19298 + }, + { + "epoch": 1.198025948227699, + "grad_norm": 0.18033254982993452, + "learning_rate": 7.510001862416227e-05, + "loss": 2.8149, + "step": 19299 + }, + { + "epoch": 1.1980880253274566, + "grad_norm": 0.15480808485347472, + "learning_rate": 7.509689506914761e-05, + "loss": 2.8556, + "step": 19300 + }, + { + "epoch": 1.1981501024272145, + "grad_norm": 0.16709985106669592, + "learning_rate": 7.509377138319633e-05, + "loss": 2.8956, + "step": 19301 + }, + { + "epoch": 1.1982121795269725, + "grad_norm": 0.21070426833111663, + "learning_rate": 7.509064756632478e-05, + "loss": 2.8695, + "step": 19302 + }, + { + "epoch": 1.1982742566267304, + "grad_norm": 0.15353081460600243, + "learning_rate": 7.508752361854923e-05, + "loss": 2.8235, + "step": 19303 + }, + { + "epoch": 1.1983363337264883, + "grad_norm": 0.1687724988120452, + "learning_rate": 7.508439953988596e-05, + "loss": 2.8941, + "step": 19304 + }, + { + "epoch": 1.1983984108262462, + "grad_norm": 0.18353503891463233, + "learning_rate": 7.50812753303513e-05, + "loss": 2.913, + "step": 19305 + }, + { + "epoch": 1.1984604879260041, + "grad_norm": 0.158594583170915, + "learning_rate": 7.507815098996153e-05, + "loss": 2.9073, + "step": 19306 + }, + { + "epoch": 1.198522565025762, + "grad_norm": 0.17194041177841762, + "learning_rate": 7.507502651873296e-05, + "loss": 2.8478, + "step": 19307 + }, + { + "epoch": 1.19858464212552, + "grad_norm": 0.16403438122791777, + "learning_rate": 7.50719019166819e-05, + "loss": 2.8928, + "step": 19308 + }, + { + "epoch": 1.1986467192252779, + "grad_norm": 0.1807217067824253, + "learning_rate": 7.506877718382464e-05, + "loss": 2.8715, + "step": 19309 + }, + { + "epoch": 1.1987087963250356, + "grad_norm": 0.16779368725785906, + "learning_rate": 7.506565232017748e-05, + "loss": 2.9816, + "step": 19310 + }, + { + "epoch": 1.1987708734247935, + "grad_norm": 0.15055618214801877, + "learning_rate": 7.506252732575672e-05, + "loss": 2.8145, + "step": 19311 + }, + { + "epoch": 1.1988329505245514, + "grad_norm": 0.15411495397131483, + "learning_rate": 7.505940220057868e-05, + "loss": 2.8569, + "step": 19312 + }, + { + "epoch": 1.1988950276243093, + "grad_norm": 0.16398008865272046, + "learning_rate": 7.505627694465964e-05, + "loss": 2.8242, + "step": 19313 + }, + { + "epoch": 1.1989571047240672, + "grad_norm": 0.17548440133944498, + "learning_rate": 7.505315155801594e-05, + "loss": 2.9739, + "step": 19314 + }, + { + "epoch": 1.1990191818238252, + "grad_norm": 0.16044234100293142, + "learning_rate": 7.505002604066386e-05, + "loss": 2.918, + "step": 19315 + }, + { + "epoch": 1.199081258923583, + "grad_norm": 0.16110266201109705, + "learning_rate": 7.504690039261971e-05, + "loss": 2.8702, + "step": 19316 + }, + { + "epoch": 1.199143336023341, + "grad_norm": 0.14805767754671656, + "learning_rate": 7.50437746138998e-05, + "loss": 2.7386, + "step": 19317 + }, + { + "epoch": 1.199205413123099, + "grad_norm": 0.15669151749384777, + "learning_rate": 7.504064870452044e-05, + "loss": 2.9454, + "step": 19318 + }, + { + "epoch": 1.1992674902228568, + "grad_norm": 0.17174487715269038, + "learning_rate": 7.503752266449792e-05, + "loss": 2.9747, + "step": 19319 + }, + { + "epoch": 1.1993295673226148, + "grad_norm": 0.1580568626480028, + "learning_rate": 7.50343964938486e-05, + "loss": 2.8708, + "step": 19320 + }, + { + "epoch": 1.1993916444223727, + "grad_norm": 0.1999684141221514, + "learning_rate": 7.503127019258873e-05, + "loss": 2.9548, + "step": 19321 + }, + { + "epoch": 1.1994537215221306, + "grad_norm": 0.15922618257134644, + "learning_rate": 7.502814376073465e-05, + "loss": 2.8532, + "step": 19322 + }, + { + "epoch": 1.1995157986218885, + "grad_norm": 0.1687245884058436, + "learning_rate": 7.502501719830269e-05, + "loss": 2.8242, + "step": 19323 + }, + { + "epoch": 1.1995778757216462, + "grad_norm": 0.16450322083607213, + "learning_rate": 7.502189050530913e-05, + "loss": 2.8311, + "step": 19324 + }, + { + "epoch": 1.1996399528214041, + "grad_norm": 0.16700354197309406, + "learning_rate": 7.501876368177028e-05, + "loss": 2.8479, + "step": 19325 + }, + { + "epoch": 1.199702029921162, + "grad_norm": 0.14441835014782184, + "learning_rate": 7.501563672770246e-05, + "loss": 2.8429, + "step": 19326 + }, + { + "epoch": 1.19976410702092, + "grad_norm": 0.169521517847817, + "learning_rate": 7.501250964312202e-05, + "loss": 2.8925, + "step": 19327 + }, + { + "epoch": 1.1998261841206779, + "grad_norm": 0.15041280558122427, + "learning_rate": 7.500938242804523e-05, + "loss": 2.8557, + "step": 19328 + }, + { + "epoch": 1.1998882612204358, + "grad_norm": 0.16127388298609938, + "learning_rate": 7.500625508248842e-05, + "loss": 2.8404, + "step": 19329 + }, + { + "epoch": 1.1999503383201937, + "grad_norm": 0.15270363042137497, + "learning_rate": 7.50031276064679e-05, + "loss": 2.9343, + "step": 19330 + }, + { + "epoch": 1.2000124154199516, + "grad_norm": 0.17227332061221104, + "learning_rate": 7.500000000000001e-05, + "loss": 2.872, + "step": 19331 + }, + { + "epoch": 1.2000744925197095, + "grad_norm": 0.16619321076371385, + "learning_rate": 7.499687226310103e-05, + "loss": 2.8883, + "step": 19332 + }, + { + "epoch": 1.2001365696194675, + "grad_norm": 0.15454260055428323, + "learning_rate": 7.499374439578733e-05, + "loss": 2.8196, + "step": 19333 + }, + { + "epoch": 1.2001986467192252, + "grad_norm": 0.1596072778643365, + "learning_rate": 7.499061639807516e-05, + "loss": 2.9268, + "step": 19334 + }, + { + "epoch": 1.200260723818983, + "grad_norm": 0.148247207838046, + "learning_rate": 7.49874882699809e-05, + "loss": 2.8615, + "step": 19335 + }, + { + "epoch": 1.200322800918741, + "grad_norm": 0.18252067259405633, + "learning_rate": 7.498436001152085e-05, + "loss": 2.807, + "step": 19336 + }, + { + "epoch": 1.200384878018499, + "grad_norm": 0.16409696489452855, + "learning_rate": 7.49812316227113e-05, + "loss": 2.8664, + "step": 19337 + }, + { + "epoch": 1.2004469551182568, + "grad_norm": 0.16209161615189835, + "learning_rate": 7.497810310356864e-05, + "loss": 2.9401, + "step": 19338 + }, + { + "epoch": 1.2005090322180147, + "grad_norm": 0.15667932060957687, + "learning_rate": 7.497497445410912e-05, + "loss": 2.8092, + "step": 19339 + }, + { + "epoch": 1.2005711093177727, + "grad_norm": 0.19854102137768634, + "learning_rate": 7.497184567434912e-05, + "loss": 2.8643, + "step": 19340 + }, + { + "epoch": 1.2006331864175306, + "grad_norm": 0.16462814934606654, + "learning_rate": 7.496871676430495e-05, + "loss": 2.8725, + "step": 19341 + }, + { + "epoch": 1.2006952635172885, + "grad_norm": 0.14389984002681028, + "learning_rate": 7.49655877239929e-05, + "loss": 2.8329, + "step": 19342 + }, + { + "epoch": 1.2007573406170464, + "grad_norm": 0.15652660086785994, + "learning_rate": 7.496245855342931e-05, + "loss": 2.7238, + "step": 19343 + }, + { + "epoch": 1.2008194177168043, + "grad_norm": 0.15539927751474142, + "learning_rate": 7.495932925263053e-05, + "loss": 2.9046, + "step": 19344 + }, + { + "epoch": 1.2008814948165623, + "grad_norm": 0.1789127003035833, + "learning_rate": 7.495619982161287e-05, + "loss": 2.8209, + "step": 19345 + }, + { + "epoch": 1.2009435719163202, + "grad_norm": 0.18607037628873627, + "learning_rate": 7.495307026039265e-05, + "loss": 2.8834, + "step": 19346 + }, + { + "epoch": 1.201005649016078, + "grad_norm": 0.171373921104015, + "learning_rate": 7.494994056898621e-05, + "loss": 2.9539, + "step": 19347 + }, + { + "epoch": 1.2010677261158358, + "grad_norm": 0.18639357275114504, + "learning_rate": 7.494681074740985e-05, + "loss": 2.8506, + "step": 19348 + }, + { + "epoch": 1.2011298032155937, + "grad_norm": 0.16699963826294514, + "learning_rate": 7.494368079567997e-05, + "loss": 2.838, + "step": 19349 + }, + { + "epoch": 1.2011918803153516, + "grad_norm": 0.21375177483072774, + "learning_rate": 7.49405507138128e-05, + "loss": 2.8943, + "step": 19350 + }, + { + "epoch": 1.2012539574151095, + "grad_norm": 0.1639197670987681, + "learning_rate": 7.493742050182476e-05, + "loss": 2.8791, + "step": 19351 + }, + { + "epoch": 1.2013160345148675, + "grad_norm": 0.15011289267311131, + "learning_rate": 7.493429015973212e-05, + "loss": 2.9466, + "step": 19352 + }, + { + "epoch": 1.2013781116146254, + "grad_norm": 0.14477851053771315, + "learning_rate": 7.493115968755126e-05, + "loss": 2.8277, + "step": 19353 + }, + { + "epoch": 1.2014401887143833, + "grad_norm": 0.14630587192214245, + "learning_rate": 7.492802908529847e-05, + "loss": 2.8409, + "step": 19354 + }, + { + "epoch": 1.2015022658141412, + "grad_norm": 0.22508560777306177, + "learning_rate": 7.49248983529901e-05, + "loss": 2.9092, + "step": 19355 + }, + { + "epoch": 1.2015643429138991, + "grad_norm": 0.15161361938317772, + "learning_rate": 7.492176749064249e-05, + "loss": 2.8711, + "step": 19356 + }, + { + "epoch": 1.201626420013657, + "grad_norm": 0.15578819815159314, + "learning_rate": 7.491863649827197e-05, + "loss": 2.8494, + "step": 19357 + }, + { + "epoch": 1.2016884971134147, + "grad_norm": 0.19771234522997538, + "learning_rate": 7.491550537589487e-05, + "loss": 2.8644, + "step": 19358 + }, + { + "epoch": 1.2017505742131727, + "grad_norm": 0.16636200280764066, + "learning_rate": 7.491237412352754e-05, + "loss": 2.8026, + "step": 19359 + }, + { + "epoch": 1.2018126513129306, + "grad_norm": 0.2216004133270339, + "learning_rate": 7.490924274118629e-05, + "loss": 2.8718, + "step": 19360 + }, + { + "epoch": 1.2018747284126885, + "grad_norm": 0.14535105024707298, + "learning_rate": 7.490611122888749e-05, + "loss": 2.8533, + "step": 19361 + }, + { + "epoch": 1.2019368055124464, + "grad_norm": 0.15279667722660165, + "learning_rate": 7.490297958664744e-05, + "loss": 2.8512, + "step": 19362 + }, + { + "epoch": 1.2019988826122043, + "grad_norm": 0.2216324945033591, + "learning_rate": 7.489984781448253e-05, + "loss": 2.9127, + "step": 19363 + }, + { + "epoch": 1.2020609597119623, + "grad_norm": 0.1892527671117824, + "learning_rate": 7.489671591240905e-05, + "loss": 2.8673, + "step": 19364 + }, + { + "epoch": 1.2021230368117202, + "grad_norm": 0.18516779875652448, + "learning_rate": 7.489358388044336e-05, + "loss": 2.8558, + "step": 19365 + }, + { + "epoch": 1.202185113911478, + "grad_norm": 0.19151892556950942, + "learning_rate": 7.489045171860181e-05, + "loss": 2.9288, + "step": 19366 + }, + { + "epoch": 1.202247191011236, + "grad_norm": 0.1491551868010007, + "learning_rate": 7.488731942690072e-05, + "loss": 2.8767, + "step": 19367 + }, + { + "epoch": 1.202309268110994, + "grad_norm": 0.14769037346903943, + "learning_rate": 7.488418700535644e-05, + "loss": 2.8863, + "step": 19368 + }, + { + "epoch": 1.2023713452107518, + "grad_norm": 0.17144125698296656, + "learning_rate": 7.488105445398532e-05, + "loss": 2.9559, + "step": 19369 + }, + { + "epoch": 1.2024334223105098, + "grad_norm": 0.18013841608582332, + "learning_rate": 7.487792177280371e-05, + "loss": 2.9348, + "step": 19370 + }, + { + "epoch": 1.2024954994102677, + "grad_norm": 0.16170750062857198, + "learning_rate": 7.487478896182792e-05, + "loss": 2.9642, + "step": 19371 + }, + { + "epoch": 1.2025575765100254, + "grad_norm": 0.18651253233981485, + "learning_rate": 7.487165602107435e-05, + "loss": 2.9857, + "step": 19372 + }, + { + "epoch": 1.2026196536097833, + "grad_norm": 0.163626483140071, + "learning_rate": 7.486852295055928e-05, + "loss": 2.915, + "step": 19373 + }, + { + "epoch": 1.2026817307095412, + "grad_norm": 0.16945357850692513, + "learning_rate": 7.486538975029911e-05, + "loss": 2.7727, + "step": 19374 + }, + { + "epoch": 1.2027438078092991, + "grad_norm": 0.16579902206711714, + "learning_rate": 7.486225642031015e-05, + "loss": 2.9051, + "step": 19375 + }, + { + "epoch": 1.202805884909057, + "grad_norm": 0.1493240845965759, + "learning_rate": 7.485912296060877e-05, + "loss": 2.8815, + "step": 19376 + }, + { + "epoch": 1.202867962008815, + "grad_norm": 0.16461768898338314, + "learning_rate": 7.485598937121133e-05, + "loss": 2.856, + "step": 19377 + }, + { + "epoch": 1.2029300391085729, + "grad_norm": 0.1508739433583424, + "learning_rate": 7.485285565213413e-05, + "loss": 2.825, + "step": 19378 + }, + { + "epoch": 1.2029921162083308, + "grad_norm": 0.17904760615003704, + "learning_rate": 7.484972180339356e-05, + "loss": 2.9399, + "step": 19379 + }, + { + "epoch": 1.2030541933080887, + "grad_norm": 0.15024728260344017, + "learning_rate": 7.484658782500595e-05, + "loss": 2.9047, + "step": 19380 + }, + { + "epoch": 1.2031162704078466, + "grad_norm": 0.15748602031170383, + "learning_rate": 7.484345371698767e-05, + "loss": 2.8003, + "step": 19381 + }, + { + "epoch": 1.2031783475076043, + "grad_norm": 0.15228273930673653, + "learning_rate": 7.484031947935506e-05, + "loss": 2.8345, + "step": 19382 + }, + { + "epoch": 1.2032404246073622, + "grad_norm": 0.2021527773626224, + "learning_rate": 7.483718511212447e-05, + "loss": 2.8544, + "step": 19383 + }, + { + "epoch": 1.2033025017071202, + "grad_norm": 0.18355343761615228, + "learning_rate": 7.483405061531227e-05, + "loss": 2.8715, + "step": 19384 + }, + { + "epoch": 1.203364578806878, + "grad_norm": 0.15059927919610785, + "learning_rate": 7.483091598893478e-05, + "loss": 2.8186, + "step": 19385 + }, + { + "epoch": 1.203426655906636, + "grad_norm": 0.1704988494737043, + "learning_rate": 7.482778123300837e-05, + "loss": 2.9355, + "step": 19386 + }, + { + "epoch": 1.203488733006394, + "grad_norm": 0.1794607348690655, + "learning_rate": 7.482464634754942e-05, + "loss": 2.9357, + "step": 19387 + }, + { + "epoch": 1.2035508101061518, + "grad_norm": 0.16395410862690046, + "learning_rate": 7.482151133257426e-05, + "loss": 2.7595, + "step": 19388 + }, + { + "epoch": 1.2036128872059098, + "grad_norm": 0.15894298272383622, + "learning_rate": 7.481837618809924e-05, + "loss": 2.8123, + "step": 19389 + }, + { + "epoch": 1.2036749643056677, + "grad_norm": 0.1638527676281513, + "learning_rate": 7.481524091414072e-05, + "loss": 2.9249, + "step": 19390 + }, + { + "epoch": 1.2037370414054256, + "grad_norm": 0.15662323957505758, + "learning_rate": 7.481210551071508e-05, + "loss": 2.811, + "step": 19391 + }, + { + "epoch": 1.2037991185051835, + "grad_norm": 0.1697425663539528, + "learning_rate": 7.480896997783866e-05, + "loss": 2.8023, + "step": 19392 + }, + { + "epoch": 1.2038611956049414, + "grad_norm": 0.15606081357349527, + "learning_rate": 7.48058343155278e-05, + "loss": 2.9502, + "step": 19393 + }, + { + "epoch": 1.2039232727046993, + "grad_norm": 0.1587919826168683, + "learning_rate": 7.480269852379891e-05, + "loss": 2.9237, + "step": 19394 + }, + { + "epoch": 1.203985349804457, + "grad_norm": 0.15865407651382119, + "learning_rate": 7.479956260266831e-05, + "loss": 2.8781, + "step": 19395 + }, + { + "epoch": 1.204047426904215, + "grad_norm": 0.18669821255154662, + "learning_rate": 7.479642655215236e-05, + "loss": 2.891, + "step": 19396 + }, + { + "epoch": 1.2041095040039729, + "grad_norm": 0.26261655342993506, + "learning_rate": 7.479329037226745e-05, + "loss": 2.968, + "step": 19397 + }, + { + "epoch": 1.2041715811037308, + "grad_norm": 0.16669148704865294, + "learning_rate": 7.479015406302992e-05, + "loss": 2.8807, + "step": 19398 + }, + { + "epoch": 1.2042336582034887, + "grad_norm": 0.1559172663815598, + "learning_rate": 7.478701762445612e-05, + "loss": 2.8456, + "step": 19399 + }, + { + "epoch": 1.2042957353032466, + "grad_norm": 0.19969996134361728, + "learning_rate": 7.478388105656244e-05, + "loss": 2.8367, + "step": 19400 + }, + { + "epoch": 1.2043578124030045, + "grad_norm": 0.1651467925371055, + "learning_rate": 7.478074435936526e-05, + "loss": 2.854, + "step": 19401 + }, + { + "epoch": 1.2044198895027625, + "grad_norm": 0.17118921882376512, + "learning_rate": 7.47776075328809e-05, + "loss": 2.904, + "step": 19402 + }, + { + "epoch": 1.2044819666025204, + "grad_norm": 0.17936208532276002, + "learning_rate": 7.477447057712574e-05, + "loss": 2.8614, + "step": 19403 + }, + { + "epoch": 1.2045440437022783, + "grad_norm": 0.17198246067396072, + "learning_rate": 7.477133349211619e-05, + "loss": 2.9308, + "step": 19404 + }, + { + "epoch": 1.2046061208020362, + "grad_norm": 0.16424761274423172, + "learning_rate": 7.476819627786852e-05, + "loss": 2.8729, + "step": 19405 + }, + { + "epoch": 1.204668197901794, + "grad_norm": 0.1755349712215078, + "learning_rate": 7.476505893439921e-05, + "loss": 2.7956, + "step": 19406 + }, + { + "epoch": 1.2047302750015518, + "grad_norm": 0.17239768573730016, + "learning_rate": 7.476192146172455e-05, + "loss": 2.916, + "step": 19407 + }, + { + "epoch": 1.2047923521013097, + "grad_norm": 0.19185810859710203, + "learning_rate": 7.475878385986095e-05, + "loss": 2.9064, + "step": 19408 + }, + { + "epoch": 1.2048544292010677, + "grad_norm": 0.1737718115896314, + "learning_rate": 7.475564612882476e-05, + "loss": 2.8047, + "step": 19409 + }, + { + "epoch": 1.2049165063008256, + "grad_norm": 0.15465590799802195, + "learning_rate": 7.475250826863233e-05, + "loss": 2.9009, + "step": 19410 + }, + { + "epoch": 1.2049785834005835, + "grad_norm": 0.17047462227739366, + "learning_rate": 7.474937027930009e-05, + "loss": 2.8146, + "step": 19411 + }, + { + "epoch": 1.2050406605003414, + "grad_norm": 0.167143368466961, + "learning_rate": 7.474623216084436e-05, + "loss": 2.8584, + "step": 19412 + }, + { + "epoch": 1.2051027376000993, + "grad_norm": 0.15859209914828087, + "learning_rate": 7.474309391328154e-05, + "loss": 2.8266, + "step": 19413 + }, + { + "epoch": 1.2051648146998573, + "grad_norm": 0.16131158217435412, + "learning_rate": 7.473995553662798e-05, + "loss": 2.9199, + "step": 19414 + }, + { + "epoch": 1.2052268917996152, + "grad_norm": 0.17763744058404676, + "learning_rate": 7.473681703090007e-05, + "loss": 2.9041, + "step": 19415 + }, + { + "epoch": 1.205288968899373, + "grad_norm": 0.1484794431957426, + "learning_rate": 7.473367839611416e-05, + "loss": 2.9423, + "step": 19416 + }, + { + "epoch": 1.205351045999131, + "grad_norm": 0.17866356018428156, + "learning_rate": 7.473053963228668e-05, + "loss": 2.9025, + "step": 19417 + }, + { + "epoch": 1.205413123098889, + "grad_norm": 0.1697523652756055, + "learning_rate": 7.472740073943395e-05, + "loss": 2.9171, + "step": 19418 + }, + { + "epoch": 1.2054752001986466, + "grad_norm": 0.1549768494325822, + "learning_rate": 7.472426171757239e-05, + "loss": 2.8758, + "step": 19419 + }, + { + "epoch": 1.2055372772984045, + "grad_norm": 0.2021934809234971, + "learning_rate": 7.472112256671832e-05, + "loss": 2.9356, + "step": 19420 + }, + { + "epoch": 1.2055993543981625, + "grad_norm": 0.1716648030257065, + "learning_rate": 7.471798328688819e-05, + "loss": 2.8273, + "step": 19421 + }, + { + "epoch": 1.2056614314979204, + "grad_norm": 0.15237585365269085, + "learning_rate": 7.471484387809832e-05, + "loss": 2.8948, + "step": 19422 + }, + { + "epoch": 1.2057235085976783, + "grad_norm": 0.15368602046626703, + "learning_rate": 7.471170434036511e-05, + "loss": 2.9024, + "step": 19423 + }, + { + "epoch": 1.2057855856974362, + "grad_norm": 0.15317247029591288, + "learning_rate": 7.470856467370493e-05, + "loss": 2.8858, + "step": 19424 + }, + { + "epoch": 1.2058476627971941, + "grad_norm": 0.1788662511763982, + "learning_rate": 7.470542487813419e-05, + "loss": 2.9007, + "step": 19425 + }, + { + "epoch": 1.205909739896952, + "grad_norm": 0.15070445892194517, + "learning_rate": 7.470228495366924e-05, + "loss": 2.8097, + "step": 19426 + }, + { + "epoch": 1.20597181699671, + "grad_norm": 0.194450938640486, + "learning_rate": 7.469914490032648e-05, + "loss": 2.8122, + "step": 19427 + }, + { + "epoch": 1.2060338940964679, + "grad_norm": 0.16845512389056913, + "learning_rate": 7.469600471812229e-05, + "loss": 2.8499, + "step": 19428 + }, + { + "epoch": 1.2060959711962258, + "grad_norm": 0.1625606907245573, + "learning_rate": 7.469286440707303e-05, + "loss": 2.9257, + "step": 19429 + }, + { + "epoch": 1.2061580482959835, + "grad_norm": 0.1804579822416053, + "learning_rate": 7.468972396719512e-05, + "loss": 2.9715, + "step": 19430 + }, + { + "epoch": 1.2062201253957414, + "grad_norm": 0.17082514144767946, + "learning_rate": 7.468658339850492e-05, + "loss": 2.893, + "step": 19431 + }, + { + "epoch": 1.2062822024954993, + "grad_norm": 0.17738393672745625, + "learning_rate": 7.468344270101879e-05, + "loss": 2.8866, + "step": 19432 + }, + { + "epoch": 1.2063442795952573, + "grad_norm": 0.18442139849932096, + "learning_rate": 7.468030187475319e-05, + "loss": 2.8248, + "step": 19433 + }, + { + "epoch": 1.2064063566950152, + "grad_norm": 0.17068456073089408, + "learning_rate": 7.467716091972445e-05, + "loss": 2.8802, + "step": 19434 + }, + { + "epoch": 1.206468433794773, + "grad_norm": 0.16512971566553944, + "learning_rate": 7.467401983594897e-05, + "loss": 2.9295, + "step": 19435 + }, + { + "epoch": 1.206530510894531, + "grad_norm": 0.1685597203498991, + "learning_rate": 7.467087862344313e-05, + "loss": 2.7899, + "step": 19436 + }, + { + "epoch": 1.206592587994289, + "grad_norm": 0.15838904699860554, + "learning_rate": 7.466773728222333e-05, + "loss": 2.8016, + "step": 19437 + }, + { + "epoch": 1.2066546650940468, + "grad_norm": 0.17467070085038916, + "learning_rate": 7.466459581230599e-05, + "loss": 2.8688, + "step": 19438 + }, + { + "epoch": 1.2067167421938048, + "grad_norm": 0.15778460303264774, + "learning_rate": 7.466145421370743e-05, + "loss": 2.9244, + "step": 19439 + }, + { + "epoch": 1.2067788192935627, + "grad_norm": 0.17320545521030276, + "learning_rate": 7.465831248644409e-05, + "loss": 2.9716, + "step": 19440 + }, + { + "epoch": 1.2068408963933206, + "grad_norm": 0.18371222102799484, + "learning_rate": 7.465517063053234e-05, + "loss": 2.9102, + "step": 19441 + }, + { + "epoch": 1.2069029734930785, + "grad_norm": 0.16069424216608394, + "learning_rate": 7.465202864598858e-05, + "loss": 2.8649, + "step": 19442 + }, + { + "epoch": 1.2069650505928362, + "grad_norm": 0.17579995691576386, + "learning_rate": 7.46488865328292e-05, + "loss": 2.8353, + "step": 19443 + }, + { + "epoch": 1.2070271276925941, + "grad_norm": 0.15469879003226059, + "learning_rate": 7.46457442910706e-05, + "loss": 2.8649, + "step": 19444 + }, + { + "epoch": 1.207089204792352, + "grad_norm": 0.18731177339486094, + "learning_rate": 7.464260192072917e-05, + "loss": 2.8597, + "step": 19445 + }, + { + "epoch": 1.20715128189211, + "grad_norm": 0.1686543898024859, + "learning_rate": 7.463945942182131e-05, + "loss": 2.9433, + "step": 19446 + }, + { + "epoch": 1.2072133589918679, + "grad_norm": 0.24483323843325297, + "learning_rate": 7.463631679436339e-05, + "loss": 2.9785, + "step": 19447 + }, + { + "epoch": 1.2072754360916258, + "grad_norm": 0.15998453145578825, + "learning_rate": 7.463317403837185e-05, + "loss": 2.8595, + "step": 19448 + }, + { + "epoch": 1.2073375131913837, + "grad_norm": 0.2201003984684481, + "learning_rate": 7.463003115386304e-05, + "loss": 2.7783, + "step": 19449 + }, + { + "epoch": 1.2073995902911416, + "grad_norm": 0.17725657072182502, + "learning_rate": 7.462688814085338e-05, + "loss": 2.8471, + "step": 19450 + }, + { + "epoch": 1.2074616673908996, + "grad_norm": 0.18137826738234145, + "learning_rate": 7.462374499935928e-05, + "loss": 2.8558, + "step": 19451 + }, + { + "epoch": 1.2075237444906575, + "grad_norm": 0.19027284863701366, + "learning_rate": 7.46206017293971e-05, + "loss": 3.0203, + "step": 19452 + }, + { + "epoch": 1.2075858215904154, + "grad_norm": 0.17618012736714997, + "learning_rate": 7.46174583309833e-05, + "loss": 2.9923, + "step": 19453 + }, + { + "epoch": 1.207647898690173, + "grad_norm": 0.1743902108948262, + "learning_rate": 7.46143148041342e-05, + "loss": 2.8445, + "step": 19454 + }, + { + "epoch": 1.207709975789931, + "grad_norm": 0.17001560519975326, + "learning_rate": 7.461117114886629e-05, + "loss": 2.8887, + "step": 19455 + }, + { + "epoch": 1.207772052889689, + "grad_norm": 0.17588007056546145, + "learning_rate": 7.46080273651959e-05, + "loss": 2.8701, + "step": 19456 + }, + { + "epoch": 1.2078341299894468, + "grad_norm": 0.2002159294119078, + "learning_rate": 7.460488345313947e-05, + "loss": 2.9098, + "step": 19457 + }, + { + "epoch": 1.2078962070892048, + "grad_norm": 0.1754871822656048, + "learning_rate": 7.460173941271338e-05, + "loss": 2.8878, + "step": 19458 + }, + { + "epoch": 1.2079582841889627, + "grad_norm": 0.15699308842658902, + "learning_rate": 7.459859524393404e-05, + "loss": 2.9586, + "step": 19459 + }, + { + "epoch": 1.2080203612887206, + "grad_norm": 0.17380052518865394, + "learning_rate": 7.459545094681785e-05, + "loss": 2.9687, + "step": 19460 + }, + { + "epoch": 1.2080824383884785, + "grad_norm": 0.19787999595593408, + "learning_rate": 7.459230652138124e-05, + "loss": 2.8616, + "step": 19461 + }, + { + "epoch": 1.2081445154882364, + "grad_norm": 0.15632741321704796, + "learning_rate": 7.45891619676406e-05, + "loss": 2.8744, + "step": 19462 + }, + { + "epoch": 1.2082065925879943, + "grad_norm": 0.17888103790542528, + "learning_rate": 7.458601728561232e-05, + "loss": 2.8858, + "step": 19463 + }, + { + "epoch": 1.2082686696877523, + "grad_norm": 0.1812147151374825, + "learning_rate": 7.458287247531282e-05, + "loss": 2.8721, + "step": 19464 + }, + { + "epoch": 1.2083307467875102, + "grad_norm": 0.1534130520582154, + "learning_rate": 7.45797275367585e-05, + "loss": 2.9041, + "step": 19465 + }, + { + "epoch": 1.208392823887268, + "grad_norm": 0.17055700254579373, + "learning_rate": 7.457658246996579e-05, + "loss": 2.8379, + "step": 19466 + }, + { + "epoch": 1.2084549009870258, + "grad_norm": 0.24895143402919717, + "learning_rate": 7.457343727495108e-05, + "loss": 2.8759, + "step": 19467 + }, + { + "epoch": 1.2085169780867837, + "grad_norm": 0.1857037032057796, + "learning_rate": 7.457029195173078e-05, + "loss": 2.8274, + "step": 19468 + }, + { + "epoch": 1.2085790551865416, + "grad_norm": 0.17342361252833097, + "learning_rate": 7.45671465003213e-05, + "loss": 2.8406, + "step": 19469 + }, + { + "epoch": 1.2086411322862995, + "grad_norm": 0.1691233848613071, + "learning_rate": 7.456400092073906e-05, + "loss": 2.9708, + "step": 19470 + }, + { + "epoch": 1.2087032093860575, + "grad_norm": 0.17927082096863045, + "learning_rate": 7.456085521300045e-05, + "loss": 2.97, + "step": 19471 + }, + { + "epoch": 1.2087652864858154, + "grad_norm": 0.16931012076723015, + "learning_rate": 7.45577093771219e-05, + "loss": 2.9255, + "step": 19472 + }, + { + "epoch": 1.2088273635855733, + "grad_norm": 0.1785160419450895, + "learning_rate": 7.455456341311983e-05, + "loss": 2.9187, + "step": 19473 + }, + { + "epoch": 1.2088894406853312, + "grad_norm": 0.15403807339072836, + "learning_rate": 7.455141732101063e-05, + "loss": 2.839, + "step": 19474 + }, + { + "epoch": 1.2089515177850891, + "grad_norm": 0.1779119736095664, + "learning_rate": 7.454827110081072e-05, + "loss": 2.8626, + "step": 19475 + }, + { + "epoch": 1.209013594884847, + "grad_norm": 0.18534858548473843, + "learning_rate": 7.454512475253655e-05, + "loss": 2.9028, + "step": 19476 + }, + { + "epoch": 1.209075671984605, + "grad_norm": 0.20807779057364084, + "learning_rate": 7.454197827620449e-05, + "loss": 2.8293, + "step": 19477 + }, + { + "epoch": 1.2091377490843627, + "grad_norm": 0.1500899665962473, + "learning_rate": 7.453883167183096e-05, + "loss": 2.8992, + "step": 19478 + }, + { + "epoch": 1.2091998261841206, + "grad_norm": 0.1839737206713017, + "learning_rate": 7.453568493943239e-05, + "loss": 2.8609, + "step": 19479 + }, + { + "epoch": 1.2092619032838785, + "grad_norm": 0.193564441100264, + "learning_rate": 7.453253807902521e-05, + "loss": 2.9529, + "step": 19480 + }, + { + "epoch": 1.2093239803836364, + "grad_norm": 0.15411949222628382, + "learning_rate": 7.452939109062582e-05, + "loss": 2.8107, + "step": 19481 + }, + { + "epoch": 1.2093860574833943, + "grad_norm": 0.24721255373190604, + "learning_rate": 7.452624397425062e-05, + "loss": 2.8911, + "step": 19482 + }, + { + "epoch": 1.2094481345831523, + "grad_norm": 0.1785403098938029, + "learning_rate": 7.452309672991607e-05, + "loss": 2.809, + "step": 19483 + }, + { + "epoch": 1.2095102116829102, + "grad_norm": 0.16161500855944258, + "learning_rate": 7.451994935763856e-05, + "loss": 2.8838, + "step": 19484 + }, + { + "epoch": 1.209572288782668, + "grad_norm": 0.15558868172093285, + "learning_rate": 7.451680185743455e-05, + "loss": 2.8686, + "step": 19485 + }, + { + "epoch": 1.209634365882426, + "grad_norm": 0.19628674709795382, + "learning_rate": 7.45136542293204e-05, + "loss": 2.8525, + "step": 19486 + }, + { + "epoch": 1.209696442982184, + "grad_norm": 0.18606508069399808, + "learning_rate": 7.451050647331259e-05, + "loss": 2.9203, + "step": 19487 + }, + { + "epoch": 1.2097585200819418, + "grad_norm": 0.16417433128904743, + "learning_rate": 7.45073585894275e-05, + "loss": 2.8001, + "step": 19488 + }, + { + "epoch": 1.2098205971816998, + "grad_norm": 0.19255083753536897, + "learning_rate": 7.450421057768157e-05, + "loss": 2.9138, + "step": 19489 + }, + { + "epoch": 1.2098826742814577, + "grad_norm": 0.1849905407265732, + "learning_rate": 7.450106243809123e-05, + "loss": 3.0205, + "step": 19490 + }, + { + "epoch": 1.2099447513812154, + "grad_norm": 0.16558347442371685, + "learning_rate": 7.44979141706729e-05, + "loss": 2.8913, + "step": 19491 + }, + { + "epoch": 1.2100068284809733, + "grad_norm": 0.18650555150421624, + "learning_rate": 7.449476577544301e-05, + "loss": 2.8388, + "step": 19492 + }, + { + "epoch": 1.2100689055807312, + "grad_norm": 0.21551891357085562, + "learning_rate": 7.449161725241799e-05, + "loss": 2.9537, + "step": 19493 + }, + { + "epoch": 1.2101309826804891, + "grad_norm": 0.17962905721190425, + "learning_rate": 7.448846860161425e-05, + "loss": 2.8289, + "step": 19494 + }, + { + "epoch": 1.210193059780247, + "grad_norm": 0.15645837598305673, + "learning_rate": 7.44853198230482e-05, + "loss": 2.8031, + "step": 19495 + }, + { + "epoch": 1.210255136880005, + "grad_norm": 0.20502771030365421, + "learning_rate": 7.448217091673633e-05, + "loss": 2.8126, + "step": 19496 + }, + { + "epoch": 1.2103172139797629, + "grad_norm": 0.16255588968785512, + "learning_rate": 7.4479021882695e-05, + "loss": 2.9197, + "step": 19497 + }, + { + "epoch": 1.2103792910795208, + "grad_norm": 0.17563266832437313, + "learning_rate": 7.447587272094067e-05, + "loss": 2.911, + "step": 19498 + }, + { + "epoch": 1.2104413681792787, + "grad_norm": 0.15553569837849782, + "learning_rate": 7.44727234314898e-05, + "loss": 2.8871, + "step": 19499 + }, + { + "epoch": 1.2105034452790366, + "grad_norm": 0.1962426353700664, + "learning_rate": 7.446957401435877e-05, + "loss": 2.9422, + "step": 19500 + }, + { + "epoch": 1.2105655223787946, + "grad_norm": 0.21770644761292476, + "learning_rate": 7.446642446956405e-05, + "loss": 2.9411, + "step": 19501 + }, + { + "epoch": 1.2106275994785523, + "grad_norm": 0.18028407990192988, + "learning_rate": 7.446327479712202e-05, + "loss": 2.8508, + "step": 19502 + }, + { + "epoch": 1.2106896765783102, + "grad_norm": 0.1598336005228118, + "learning_rate": 7.446012499704917e-05, + "loss": 2.8532, + "step": 19503 + }, + { + "epoch": 1.210751753678068, + "grad_norm": 0.17268181132632968, + "learning_rate": 7.445697506936192e-05, + "loss": 2.944, + "step": 19504 + }, + { + "epoch": 1.210813830777826, + "grad_norm": 0.16632535875706905, + "learning_rate": 7.445382501407667e-05, + "loss": 2.8795, + "step": 19505 + }, + { + "epoch": 1.210875907877584, + "grad_norm": 0.16379935300721665, + "learning_rate": 7.445067483120988e-05, + "loss": 2.8154, + "step": 19506 + }, + { + "epoch": 1.2109379849773418, + "grad_norm": 0.19415072140450346, + "learning_rate": 7.444752452077801e-05, + "loss": 2.8739, + "step": 19507 + }, + { + "epoch": 1.2110000620770998, + "grad_norm": 0.16986229279445345, + "learning_rate": 7.444437408279745e-05, + "loss": 2.9624, + "step": 19508 + }, + { + "epoch": 1.2110621391768577, + "grad_norm": 0.17463233911954157, + "learning_rate": 7.444122351728467e-05, + "loss": 2.7505, + "step": 19509 + }, + { + "epoch": 1.2111242162766156, + "grad_norm": 0.16101262205008368, + "learning_rate": 7.443807282425606e-05, + "loss": 2.8926, + "step": 19510 + }, + { + "epoch": 1.2111862933763735, + "grad_norm": 0.17508289106827749, + "learning_rate": 7.443492200372813e-05, + "loss": 2.7827, + "step": 19511 + }, + { + "epoch": 1.2112483704761314, + "grad_norm": 0.18015298466564394, + "learning_rate": 7.443177105571726e-05, + "loss": 2.8644, + "step": 19512 + }, + { + "epoch": 1.2113104475758893, + "grad_norm": 0.19066098178468388, + "learning_rate": 7.442861998023992e-05, + "loss": 2.8641, + "step": 19513 + }, + { + "epoch": 1.2113725246756473, + "grad_norm": 0.19385995513566023, + "learning_rate": 7.442546877731252e-05, + "loss": 2.8557, + "step": 19514 + }, + { + "epoch": 1.211434601775405, + "grad_norm": 0.16748010729870286, + "learning_rate": 7.442231744695152e-05, + "loss": 2.8602, + "step": 19515 + }, + { + "epoch": 1.2114966788751629, + "grad_norm": 0.18401064108186957, + "learning_rate": 7.44191659891734e-05, + "loss": 2.9081, + "step": 19516 + }, + { + "epoch": 1.2115587559749208, + "grad_norm": 0.16271317877333738, + "learning_rate": 7.441601440399453e-05, + "loss": 2.8374, + "step": 19517 + }, + { + "epoch": 1.2116208330746787, + "grad_norm": 0.16111782466657187, + "learning_rate": 7.44128626914314e-05, + "loss": 2.8548, + "step": 19518 + }, + { + "epoch": 1.2116829101744366, + "grad_norm": 0.1914801357676274, + "learning_rate": 7.440971085150042e-05, + "loss": 2.9056, + "step": 19519 + }, + { + "epoch": 1.2117449872741946, + "grad_norm": 0.15848199966201607, + "learning_rate": 7.440655888421806e-05, + "loss": 2.8462, + "step": 19520 + }, + { + "epoch": 1.2118070643739525, + "grad_norm": 0.17629346922749548, + "learning_rate": 7.440340678960077e-05, + "loss": 2.8839, + "step": 19521 + }, + { + "epoch": 1.2118691414737104, + "grad_norm": 0.17348890208711137, + "learning_rate": 7.440025456766497e-05, + "loss": 2.7975, + "step": 19522 + }, + { + "epoch": 1.2119312185734683, + "grad_norm": 0.17196345762295487, + "learning_rate": 7.439710221842712e-05, + "loss": 2.7999, + "step": 19523 + }, + { + "epoch": 1.2119932956732262, + "grad_norm": 0.18680155326636028, + "learning_rate": 7.439394974190368e-05, + "loss": 2.8174, + "step": 19524 + }, + { + "epoch": 1.2120553727729841, + "grad_norm": 0.1849208308949102, + "learning_rate": 7.439079713811106e-05, + "loss": 2.9582, + "step": 19525 + }, + { + "epoch": 1.2121174498727418, + "grad_norm": 0.19503046653625647, + "learning_rate": 7.438764440706574e-05, + "loss": 2.9793, + "step": 19526 + }, + { + "epoch": 1.2121795269724998, + "grad_norm": 0.17910070595905653, + "learning_rate": 7.438449154878418e-05, + "loss": 2.9445, + "step": 19527 + }, + { + "epoch": 1.2122416040722577, + "grad_norm": 0.16364480957906508, + "learning_rate": 7.438133856328277e-05, + "loss": 2.8283, + "step": 19528 + }, + { + "epoch": 1.2123036811720156, + "grad_norm": 0.15898349583638116, + "learning_rate": 7.437818545057802e-05, + "loss": 2.8404, + "step": 19529 + }, + { + "epoch": 1.2123657582717735, + "grad_norm": 0.152848057013308, + "learning_rate": 7.437503221068635e-05, + "loss": 2.8261, + "step": 19530 + }, + { + "epoch": 1.2124278353715314, + "grad_norm": 0.1435322646710032, + "learning_rate": 7.437187884362423e-05, + "loss": 2.9209, + "step": 19531 + }, + { + "epoch": 1.2124899124712893, + "grad_norm": 0.20680742940857214, + "learning_rate": 7.436872534940809e-05, + "loss": 2.9805, + "step": 19532 + }, + { + "epoch": 1.2125519895710473, + "grad_norm": 0.16115483220238458, + "learning_rate": 7.43655717280544e-05, + "loss": 2.8658, + "step": 19533 + }, + { + "epoch": 1.2126140666708052, + "grad_norm": 0.15088386897841755, + "learning_rate": 7.436241797957961e-05, + "loss": 2.8878, + "step": 19534 + }, + { + "epoch": 1.212676143770563, + "grad_norm": 0.1517981025304553, + "learning_rate": 7.435926410400015e-05, + "loss": 2.9437, + "step": 19535 + }, + { + "epoch": 1.212738220870321, + "grad_norm": 0.15372563501493017, + "learning_rate": 7.435611010133253e-05, + "loss": 2.9152, + "step": 19536 + }, + { + "epoch": 1.212800297970079, + "grad_norm": 0.14866929402318596, + "learning_rate": 7.435295597159317e-05, + "loss": 2.9252, + "step": 19537 + }, + { + "epoch": 1.2128623750698369, + "grad_norm": 0.30715835502882, + "learning_rate": 7.43498017147985e-05, + "loss": 2.9168, + "step": 19538 + }, + { + "epoch": 1.2129244521695945, + "grad_norm": 0.15305425556996777, + "learning_rate": 7.434664733096502e-05, + "loss": 2.8076, + "step": 19539 + }, + { + "epoch": 1.2129865292693525, + "grad_norm": 0.17360565329290975, + "learning_rate": 7.434349282010916e-05, + "loss": 2.8249, + "step": 19540 + }, + { + "epoch": 1.2130486063691104, + "grad_norm": 0.1664920310431012, + "learning_rate": 7.434033818224739e-05, + "loss": 2.8381, + "step": 19541 + }, + { + "epoch": 1.2131106834688683, + "grad_norm": 0.2150917615769497, + "learning_rate": 7.433718341739617e-05, + "loss": 2.9519, + "step": 19542 + }, + { + "epoch": 1.2131727605686262, + "grad_norm": 0.16937223967101167, + "learning_rate": 7.433402852557197e-05, + "loss": 2.8675, + "step": 19543 + }, + { + "epoch": 1.2132348376683841, + "grad_norm": 0.16720928335438795, + "learning_rate": 7.43308735067912e-05, + "loss": 2.8678, + "step": 19544 + }, + { + "epoch": 1.213296914768142, + "grad_norm": 0.15109262483763675, + "learning_rate": 7.43277183610704e-05, + "loss": 2.8015, + "step": 19545 + }, + { + "epoch": 1.2133589918679, + "grad_norm": 0.15716370220420828, + "learning_rate": 7.432456308842596e-05, + "loss": 2.8802, + "step": 19546 + }, + { + "epoch": 1.213421068967658, + "grad_norm": 0.16181667744551076, + "learning_rate": 7.432140768887437e-05, + "loss": 2.9009, + "step": 19547 + }, + { + "epoch": 1.2134831460674158, + "grad_norm": 0.17721004487305472, + "learning_rate": 7.43182521624321e-05, + "loss": 2.8359, + "step": 19548 + }, + { + "epoch": 1.2135452231671737, + "grad_norm": 0.18090581441890421, + "learning_rate": 7.43150965091156e-05, + "loss": 2.8618, + "step": 19549 + }, + { + "epoch": 1.2136073002669314, + "grad_norm": 0.1997493708341618, + "learning_rate": 7.431194072894135e-05, + "loss": 3.0013, + "step": 19550 + }, + { + "epoch": 1.2136693773666893, + "grad_norm": 0.16888002004933994, + "learning_rate": 7.43087848219258e-05, + "loss": 2.9167, + "step": 19551 + }, + { + "epoch": 1.2137314544664473, + "grad_norm": 0.18294947782506077, + "learning_rate": 7.43056287880854e-05, + "loss": 2.9172, + "step": 19552 + }, + { + "epoch": 1.2137935315662052, + "grad_norm": 0.1689693896826715, + "learning_rate": 7.430247262743665e-05, + "loss": 2.966, + "step": 19553 + }, + { + "epoch": 1.213855608665963, + "grad_norm": 0.16846988463086046, + "learning_rate": 7.4299316339996e-05, + "loss": 2.9759, + "step": 19554 + }, + { + "epoch": 1.213917685765721, + "grad_norm": 0.1977475765769412, + "learning_rate": 7.429615992577992e-05, + "loss": 2.8061, + "step": 19555 + }, + { + "epoch": 1.213979762865479, + "grad_norm": 0.23960056277532987, + "learning_rate": 7.429300338480488e-05, + "loss": 3.0182, + "step": 19556 + }, + { + "epoch": 1.2140418399652368, + "grad_norm": 0.16387291012798247, + "learning_rate": 7.428984671708733e-05, + "loss": 2.8741, + "step": 19557 + }, + { + "epoch": 1.2141039170649948, + "grad_norm": 0.2694903909424564, + "learning_rate": 7.428668992264375e-05, + "loss": 2.8233, + "step": 19558 + }, + { + "epoch": 1.2141659941647527, + "grad_norm": 0.18518588436566483, + "learning_rate": 7.428353300149063e-05, + "loss": 2.9305, + "step": 19559 + }, + { + "epoch": 1.2142280712645106, + "grad_norm": 0.15407141981746836, + "learning_rate": 7.428037595364441e-05, + "loss": 2.8395, + "step": 19560 + }, + { + "epoch": 1.2142901483642685, + "grad_norm": 0.18767909596199933, + "learning_rate": 7.427721877912158e-05, + "loss": 2.8934, + "step": 19561 + }, + { + "epoch": 1.2143522254640264, + "grad_norm": 0.16047449011597614, + "learning_rate": 7.427406147793861e-05, + "loss": 2.9294, + "step": 19562 + }, + { + "epoch": 1.2144143025637841, + "grad_norm": 0.21731261176336014, + "learning_rate": 7.427090405011196e-05, + "loss": 2.8819, + "step": 19563 + }, + { + "epoch": 1.214476379663542, + "grad_norm": 0.15427102705380755, + "learning_rate": 7.426774649565812e-05, + "loss": 2.8681, + "step": 19564 + }, + { + "epoch": 1.2145384567633, + "grad_norm": 0.16177840514700112, + "learning_rate": 7.426458881459355e-05, + "loss": 2.9849, + "step": 19565 + }, + { + "epoch": 1.2146005338630579, + "grad_norm": 0.1725473636671057, + "learning_rate": 7.426143100693472e-05, + "loss": 2.8106, + "step": 19566 + }, + { + "epoch": 1.2146626109628158, + "grad_norm": 0.15923696201338988, + "learning_rate": 7.425827307269813e-05, + "loss": 2.9052, + "step": 19567 + }, + { + "epoch": 1.2147246880625737, + "grad_norm": 0.16771292055759213, + "learning_rate": 7.425511501190023e-05, + "loss": 2.9349, + "step": 19568 + }, + { + "epoch": 1.2147867651623316, + "grad_norm": 0.1726099038384034, + "learning_rate": 7.425195682455752e-05, + "loss": 2.8805, + "step": 19569 + }, + { + "epoch": 1.2148488422620896, + "grad_norm": 0.17124004010487792, + "learning_rate": 7.424879851068646e-05, + "loss": 2.9098, + "step": 19570 + }, + { + "epoch": 1.2149109193618475, + "grad_norm": 0.19188961964308077, + "learning_rate": 7.424564007030353e-05, + "loss": 2.8206, + "step": 19571 + }, + { + "epoch": 1.2149729964616054, + "grad_norm": 0.14952939388115694, + "learning_rate": 7.424248150342521e-05, + "loss": 2.9043, + "step": 19572 + }, + { + "epoch": 1.2150350735613633, + "grad_norm": 0.24435717324994932, + "learning_rate": 7.423932281006797e-05, + "loss": 2.8793, + "step": 19573 + }, + { + "epoch": 1.215097150661121, + "grad_norm": 0.19614536092549012, + "learning_rate": 7.423616399024831e-05, + "loss": 2.9315, + "step": 19574 + }, + { + "epoch": 1.215159227760879, + "grad_norm": 0.239998112003033, + "learning_rate": 7.42330050439827e-05, + "loss": 2.9381, + "step": 19575 + }, + { + "epoch": 1.2152213048606368, + "grad_norm": 0.17108848359002285, + "learning_rate": 7.422984597128762e-05, + "loss": 2.9046, + "step": 19576 + }, + { + "epoch": 1.2152833819603948, + "grad_norm": 0.16048944518108377, + "learning_rate": 7.422668677217955e-05, + "loss": 2.8709, + "step": 19577 + }, + { + "epoch": 1.2153454590601527, + "grad_norm": 0.16820758650657006, + "learning_rate": 7.422352744667496e-05, + "loss": 3.038, + "step": 19578 + }, + { + "epoch": 1.2154075361599106, + "grad_norm": 0.22195041610530958, + "learning_rate": 7.422036799479035e-05, + "loss": 2.8597, + "step": 19579 + }, + { + "epoch": 1.2154696132596685, + "grad_norm": 0.1747503835625457, + "learning_rate": 7.42172084165422e-05, + "loss": 2.7836, + "step": 19580 + }, + { + "epoch": 1.2155316903594264, + "grad_norm": 0.177551280021995, + "learning_rate": 7.421404871194701e-05, + "loss": 2.8679, + "step": 19581 + }, + { + "epoch": 1.2155937674591843, + "grad_norm": 0.16687301376496572, + "learning_rate": 7.421088888102125e-05, + "loss": 2.8441, + "step": 19582 + }, + { + "epoch": 1.2156558445589423, + "grad_norm": 0.16049738350856763, + "learning_rate": 7.420772892378138e-05, + "loss": 2.7868, + "step": 19583 + }, + { + "epoch": 1.2157179216587002, + "grad_norm": 0.19505618442931671, + "learning_rate": 7.420456884024393e-05, + "loss": 2.7828, + "step": 19584 + }, + { + "epoch": 1.215779998758458, + "grad_norm": 0.23352144902184702, + "learning_rate": 7.420140863042537e-05, + "loss": 2.999, + "step": 19585 + }, + { + "epoch": 1.215842075858216, + "grad_norm": 0.16696142169792397, + "learning_rate": 7.419824829434218e-05, + "loss": 2.7041, + "step": 19586 + }, + { + "epoch": 1.2159041529579737, + "grad_norm": 0.1765302067315757, + "learning_rate": 7.419508783201086e-05, + "loss": 2.8903, + "step": 19587 + }, + { + "epoch": 1.2159662300577316, + "grad_norm": 0.16011158390109195, + "learning_rate": 7.41919272434479e-05, + "loss": 2.8578, + "step": 19588 + }, + { + "epoch": 1.2160283071574896, + "grad_norm": 0.1910314295964619, + "learning_rate": 7.418876652866978e-05, + "loss": 2.7841, + "step": 19589 + }, + { + "epoch": 1.2160903842572475, + "grad_norm": 0.15534984460100984, + "learning_rate": 7.418560568769298e-05, + "loss": 2.8675, + "step": 19590 + }, + { + "epoch": 1.2161524613570054, + "grad_norm": 0.20097524792655555, + "learning_rate": 7.418244472053401e-05, + "loss": 2.8707, + "step": 19591 + }, + { + "epoch": 1.2162145384567633, + "grad_norm": 0.15899780624416238, + "learning_rate": 7.417928362720935e-05, + "loss": 2.9017, + "step": 19592 + }, + { + "epoch": 1.2162766155565212, + "grad_norm": 0.15576791900444023, + "learning_rate": 7.417612240773552e-05, + "loss": 2.8041, + "step": 19593 + }, + { + "epoch": 1.2163386926562791, + "grad_norm": 0.1834831716424437, + "learning_rate": 7.417296106212896e-05, + "loss": 2.9844, + "step": 19594 + }, + { + "epoch": 1.216400769756037, + "grad_norm": 0.15148181881648876, + "learning_rate": 7.416979959040622e-05, + "loss": 2.8536, + "step": 19595 + }, + { + "epoch": 1.216462846855795, + "grad_norm": 0.1720384719867601, + "learning_rate": 7.416663799258374e-05, + "loss": 2.9053, + "step": 19596 + }, + { + "epoch": 1.2165249239555527, + "grad_norm": 0.1822301056773975, + "learning_rate": 7.416347626867806e-05, + "loss": 2.9093, + "step": 19597 + }, + { + "epoch": 1.2165870010553106, + "grad_norm": 0.18036572335287124, + "learning_rate": 7.416031441870565e-05, + "loss": 2.8754, + "step": 19598 + }, + { + "epoch": 1.2166490781550685, + "grad_norm": 0.17193885722364793, + "learning_rate": 7.415715244268303e-05, + "loss": 2.8265, + "step": 19599 + }, + { + "epoch": 1.2167111552548264, + "grad_norm": 0.1649990371834956, + "learning_rate": 7.415399034062668e-05, + "loss": 2.8213, + "step": 19600 + }, + { + "epoch": 1.2167732323545843, + "grad_norm": 0.15527674833402708, + "learning_rate": 7.415082811255309e-05, + "loss": 2.8968, + "step": 19601 + }, + { + "epoch": 1.2168353094543423, + "grad_norm": 0.1674813028654008, + "learning_rate": 7.414766575847876e-05, + "loss": 2.855, + "step": 19602 + }, + { + "epoch": 1.2168973865541002, + "grad_norm": 0.16396561100446846, + "learning_rate": 7.414450327842021e-05, + "loss": 2.8449, + "step": 19603 + }, + { + "epoch": 1.216959463653858, + "grad_norm": 0.1700061939667846, + "learning_rate": 7.414134067239392e-05, + "loss": 2.8988, + "step": 19604 + }, + { + "epoch": 1.217021540753616, + "grad_norm": 0.19041114129940445, + "learning_rate": 7.41381779404164e-05, + "loss": 2.9293, + "step": 19605 + }, + { + "epoch": 1.217083617853374, + "grad_norm": 0.17598052914771184, + "learning_rate": 7.413501508250413e-05, + "loss": 2.8607, + "step": 19606 + }, + { + "epoch": 1.2171456949531319, + "grad_norm": 0.19399460572370134, + "learning_rate": 7.413185209867364e-05, + "loss": 2.7663, + "step": 19607 + }, + { + "epoch": 1.2172077720528898, + "grad_norm": 0.16172793910507308, + "learning_rate": 7.412868898894141e-05, + "loss": 2.8575, + "step": 19608 + }, + { + "epoch": 1.2172698491526477, + "grad_norm": 0.16638595564759365, + "learning_rate": 7.412552575332397e-05, + "loss": 2.9303, + "step": 19609 + }, + { + "epoch": 1.2173319262524056, + "grad_norm": 0.19596237782146453, + "learning_rate": 7.412236239183779e-05, + "loss": 2.8107, + "step": 19610 + }, + { + "epoch": 1.2173940033521633, + "grad_norm": 0.1625607193699355, + "learning_rate": 7.411919890449941e-05, + "loss": 2.8312, + "step": 19611 + }, + { + "epoch": 1.2174560804519212, + "grad_norm": 0.172696372587397, + "learning_rate": 7.411603529132529e-05, + "loss": 2.8725, + "step": 19612 + }, + { + "epoch": 1.2175181575516791, + "grad_norm": 0.15137624405870237, + "learning_rate": 7.411287155233196e-05, + "loss": 2.7898, + "step": 19613 + }, + { + "epoch": 1.217580234651437, + "grad_norm": 0.16029298574345982, + "learning_rate": 7.410970768753594e-05, + "loss": 2.8635, + "step": 19614 + }, + { + "epoch": 1.217642311751195, + "grad_norm": 0.16669453199219691, + "learning_rate": 7.410654369695371e-05, + "loss": 2.8323, + "step": 19615 + }, + { + "epoch": 1.217704388850953, + "grad_norm": 0.15703270493967333, + "learning_rate": 7.41033795806018e-05, + "loss": 2.8065, + "step": 19616 + }, + { + "epoch": 1.2177664659507108, + "grad_norm": 0.16833025266661858, + "learning_rate": 7.410021533849671e-05, + "loss": 2.8507, + "step": 19617 + }, + { + "epoch": 1.2178285430504687, + "grad_norm": 0.1713313423529568, + "learning_rate": 7.409705097065493e-05, + "loss": 2.8444, + "step": 19618 + }, + { + "epoch": 1.2178906201502266, + "grad_norm": 0.15875680016235752, + "learning_rate": 7.4093886477093e-05, + "loss": 2.8533, + "step": 19619 + }, + { + "epoch": 1.2179526972499846, + "grad_norm": 0.16617753604344307, + "learning_rate": 7.409072185782742e-05, + "loss": 2.8752, + "step": 19620 + }, + { + "epoch": 1.2180147743497423, + "grad_norm": 0.18721881286131023, + "learning_rate": 7.408755711287467e-05, + "loss": 2.9076, + "step": 19621 + }, + { + "epoch": 1.2180768514495002, + "grad_norm": 0.19044109212519147, + "learning_rate": 7.408439224225131e-05, + "loss": 2.9015, + "step": 19622 + }, + { + "epoch": 1.218138928549258, + "grad_norm": 0.16573930633112116, + "learning_rate": 7.408122724597381e-05, + "loss": 2.7811, + "step": 19623 + }, + { + "epoch": 1.218201005649016, + "grad_norm": 0.15415155230891897, + "learning_rate": 7.407806212405872e-05, + "loss": 2.787, + "step": 19624 + }, + { + "epoch": 1.218263082748774, + "grad_norm": 0.17493408890540324, + "learning_rate": 7.407489687652252e-05, + "loss": 2.8975, + "step": 19625 + }, + { + "epoch": 1.2183251598485318, + "grad_norm": 0.1681054080569899, + "learning_rate": 7.407173150338176e-05, + "loss": 2.8499, + "step": 19626 + }, + { + "epoch": 1.2183872369482898, + "grad_norm": 0.16634237446987535, + "learning_rate": 7.406856600465293e-05, + "loss": 2.8413, + "step": 19627 + }, + { + "epoch": 1.2184493140480477, + "grad_norm": 0.16072166469452148, + "learning_rate": 7.406540038035252e-05, + "loss": 2.75, + "step": 19628 + }, + { + "epoch": 1.2185113911478056, + "grad_norm": 0.16139057811751523, + "learning_rate": 7.406223463049709e-05, + "loss": 2.9045, + "step": 19629 + }, + { + "epoch": 1.2185734682475635, + "grad_norm": 0.1614452338801132, + "learning_rate": 7.405906875510314e-05, + "loss": 2.8544, + "step": 19630 + }, + { + "epoch": 1.2186355453473214, + "grad_norm": 0.15636478618538105, + "learning_rate": 7.40559027541872e-05, + "loss": 2.8084, + "step": 19631 + }, + { + "epoch": 1.2186976224470794, + "grad_norm": 0.1639492631862955, + "learning_rate": 7.405273662776574e-05, + "loss": 2.7861, + "step": 19632 + }, + { + "epoch": 1.2187596995468373, + "grad_norm": 0.1527847911719268, + "learning_rate": 7.404957037585536e-05, + "loss": 2.8962, + "step": 19633 + }, + { + "epoch": 1.2188217766465952, + "grad_norm": 0.17199576535078445, + "learning_rate": 7.404640399847251e-05, + "loss": 2.9487, + "step": 19634 + }, + { + "epoch": 1.2188838537463529, + "grad_norm": 0.17374356495480894, + "learning_rate": 7.404323749563374e-05, + "loss": 2.8385, + "step": 19635 + }, + { + "epoch": 1.2189459308461108, + "grad_norm": 0.1586535892909645, + "learning_rate": 7.404007086735557e-05, + "loss": 2.7825, + "step": 19636 + }, + { + "epoch": 1.2190080079458687, + "grad_norm": 0.17440935536490568, + "learning_rate": 7.403690411365449e-05, + "loss": 2.936, + "step": 19637 + }, + { + "epoch": 1.2190700850456266, + "grad_norm": 0.15670981104032428, + "learning_rate": 7.403373723454707e-05, + "loss": 2.8485, + "step": 19638 + }, + { + "epoch": 1.2191321621453846, + "grad_norm": 0.16381897336387127, + "learning_rate": 7.403057023004979e-05, + "loss": 2.9243, + "step": 19639 + }, + { + "epoch": 1.2191942392451425, + "grad_norm": 0.17778838039320483, + "learning_rate": 7.40274031001792e-05, + "loss": 2.8918, + "step": 19640 + }, + { + "epoch": 1.2192563163449004, + "grad_norm": 0.16122536746480232, + "learning_rate": 7.402423584495181e-05, + "loss": 2.9047, + "step": 19641 + }, + { + "epoch": 1.2193183934446583, + "grad_norm": 0.14948963921664865, + "learning_rate": 7.402106846438417e-05, + "loss": 2.7876, + "step": 19642 + }, + { + "epoch": 1.2193804705444162, + "grad_norm": 0.16025107763050067, + "learning_rate": 7.401790095849275e-05, + "loss": 2.9035, + "step": 19643 + }, + { + "epoch": 1.2194425476441741, + "grad_norm": 0.15191001770959606, + "learning_rate": 7.401473332729414e-05, + "loss": 2.7979, + "step": 19644 + }, + { + "epoch": 1.2195046247439318, + "grad_norm": 0.1499654034612589, + "learning_rate": 7.401156557080482e-05, + "loss": 2.9115, + "step": 19645 + }, + { + "epoch": 1.2195667018436898, + "grad_norm": 0.1553710534690934, + "learning_rate": 7.400839768904133e-05, + "loss": 2.8819, + "step": 19646 + }, + { + "epoch": 1.2196287789434477, + "grad_norm": 0.14733234381595156, + "learning_rate": 7.400522968202022e-05, + "loss": 2.9052, + "step": 19647 + }, + { + "epoch": 1.2196908560432056, + "grad_norm": 0.14838355740629489, + "learning_rate": 7.400206154975798e-05, + "loss": 2.8814, + "step": 19648 + }, + { + "epoch": 1.2197529331429635, + "grad_norm": 0.16111429281445522, + "learning_rate": 7.399889329227117e-05, + "loss": 2.8861, + "step": 19649 + }, + { + "epoch": 1.2198150102427214, + "grad_norm": 0.1457298172542437, + "learning_rate": 7.399572490957631e-05, + "loss": 2.8945, + "step": 19650 + }, + { + "epoch": 1.2198770873424793, + "grad_norm": 0.15858922749915952, + "learning_rate": 7.399255640168992e-05, + "loss": 2.8808, + "step": 19651 + }, + { + "epoch": 1.2199391644422373, + "grad_norm": 0.19096149621569714, + "learning_rate": 7.398938776862855e-05, + "loss": 2.8909, + "step": 19652 + }, + { + "epoch": 1.2200012415419952, + "grad_norm": 0.16709324639856069, + "learning_rate": 7.39862190104087e-05, + "loss": 2.9076, + "step": 19653 + }, + { + "epoch": 1.220063318641753, + "grad_norm": 0.15633099959865124, + "learning_rate": 7.398305012704695e-05, + "loss": 2.887, + "step": 19654 + }, + { + "epoch": 1.220125395741511, + "grad_norm": 0.15800847335130255, + "learning_rate": 7.397988111855978e-05, + "loss": 2.8268, + "step": 19655 + }, + { + "epoch": 1.220187472841269, + "grad_norm": 0.1405738234116759, + "learning_rate": 7.397671198496377e-05, + "loss": 2.8884, + "step": 19656 + }, + { + "epoch": 1.2202495499410269, + "grad_norm": 0.17214737789034826, + "learning_rate": 7.397354272627541e-05, + "loss": 2.814, + "step": 19657 + }, + { + "epoch": 1.2203116270407848, + "grad_norm": 0.15023181630423926, + "learning_rate": 7.397037334251128e-05, + "loss": 2.934, + "step": 19658 + }, + { + "epoch": 1.2203737041405425, + "grad_norm": 0.22376455566010095, + "learning_rate": 7.396720383368788e-05, + "loss": 2.9012, + "step": 19659 + }, + { + "epoch": 1.2204357812403004, + "grad_norm": 0.14543538563211467, + "learning_rate": 7.396403419982177e-05, + "loss": 2.8148, + "step": 19660 + }, + { + "epoch": 1.2204978583400583, + "grad_norm": 0.1544773097054201, + "learning_rate": 7.396086444092946e-05, + "loss": 2.8889, + "step": 19661 + }, + { + "epoch": 1.2205599354398162, + "grad_norm": 0.15919188002247053, + "learning_rate": 7.39576945570275e-05, + "loss": 2.8158, + "step": 19662 + }, + { + "epoch": 1.2206220125395741, + "grad_norm": 0.15402998355748063, + "learning_rate": 7.395452454813246e-05, + "loss": 2.9455, + "step": 19663 + }, + { + "epoch": 1.220684089639332, + "grad_norm": 0.1564820771470004, + "learning_rate": 7.395135441426082e-05, + "loss": 2.8894, + "step": 19664 + }, + { + "epoch": 1.22074616673909, + "grad_norm": 0.14233443585646463, + "learning_rate": 7.394818415542916e-05, + "loss": 2.9063, + "step": 19665 + }, + { + "epoch": 1.220808243838848, + "grad_norm": 0.16805207048947832, + "learning_rate": 7.394501377165402e-05, + "loss": 2.9147, + "step": 19666 + }, + { + "epoch": 1.2208703209386058, + "grad_norm": 0.16016712965879332, + "learning_rate": 7.394184326295193e-05, + "loss": 2.8372, + "step": 19667 + }, + { + "epoch": 1.2209323980383637, + "grad_norm": 0.15788152761926105, + "learning_rate": 7.393867262933943e-05, + "loss": 2.9205, + "step": 19668 + }, + { + "epoch": 1.2209944751381214, + "grad_norm": 0.19472816795674452, + "learning_rate": 7.393550187083305e-05, + "loss": 2.8746, + "step": 19669 + }, + { + "epoch": 1.2210565522378793, + "grad_norm": 0.1590230498734362, + "learning_rate": 7.393233098744936e-05, + "loss": 2.8636, + "step": 19670 + }, + { + "epoch": 1.2211186293376373, + "grad_norm": 0.1555368061245944, + "learning_rate": 7.392915997920488e-05, + "loss": 2.9264, + "step": 19671 + }, + { + "epoch": 1.2211807064373952, + "grad_norm": 0.16850807964461711, + "learning_rate": 7.392598884611616e-05, + "loss": 2.8522, + "step": 19672 + }, + { + "epoch": 1.221242783537153, + "grad_norm": 0.15573568335797147, + "learning_rate": 7.392281758819977e-05, + "loss": 2.8786, + "step": 19673 + }, + { + "epoch": 1.221304860636911, + "grad_norm": 0.1674347473105505, + "learning_rate": 7.391964620547221e-05, + "loss": 2.9378, + "step": 19674 + }, + { + "epoch": 1.221366937736669, + "grad_norm": 0.1582021889835501, + "learning_rate": 7.391647469795006e-05, + "loss": 2.8378, + "step": 19675 + }, + { + "epoch": 1.2214290148364269, + "grad_norm": 0.16784216302516705, + "learning_rate": 7.391330306564986e-05, + "loss": 2.7782, + "step": 19676 + }, + { + "epoch": 1.2214910919361848, + "grad_norm": 0.1946185659062907, + "learning_rate": 7.391013130858814e-05, + "loss": 2.8904, + "step": 19677 + }, + { + "epoch": 1.2215531690359427, + "grad_norm": 0.1772309499108824, + "learning_rate": 7.390695942678146e-05, + "loss": 2.8341, + "step": 19678 + }, + { + "epoch": 1.2216152461357006, + "grad_norm": 0.15903535981395164, + "learning_rate": 7.390378742024638e-05, + "loss": 2.8119, + "step": 19679 + }, + { + "epoch": 1.2216773232354585, + "grad_norm": 0.16266473182784924, + "learning_rate": 7.390061528899943e-05, + "loss": 2.929, + "step": 19680 + }, + { + "epoch": 1.2217394003352164, + "grad_norm": 0.2004021576208234, + "learning_rate": 7.389744303305718e-05, + "loss": 2.8413, + "step": 19681 + }, + { + "epoch": 1.2218014774349744, + "grad_norm": 0.1682293777247976, + "learning_rate": 7.389427065243615e-05, + "loss": 2.8831, + "step": 19682 + }, + { + "epoch": 1.221863554534732, + "grad_norm": 0.16068656278764973, + "learning_rate": 7.389109814715292e-05, + "loss": 2.7856, + "step": 19683 + }, + { + "epoch": 1.22192563163449, + "grad_norm": 0.16195191353925845, + "learning_rate": 7.388792551722404e-05, + "loss": 2.8902, + "step": 19684 + }, + { + "epoch": 1.221987708734248, + "grad_norm": 0.16405798133803554, + "learning_rate": 7.388475276266604e-05, + "loss": 2.8202, + "step": 19685 + }, + { + "epoch": 1.2220497858340058, + "grad_norm": 0.15363232033246021, + "learning_rate": 7.388157988349549e-05, + "loss": 2.904, + "step": 19686 + }, + { + "epoch": 1.2221118629337637, + "grad_norm": 0.16159305156364284, + "learning_rate": 7.387840687972895e-05, + "loss": 2.8524, + "step": 19687 + }, + { + "epoch": 1.2221739400335216, + "grad_norm": 0.15403026773670092, + "learning_rate": 7.387523375138295e-05, + "loss": 2.7891, + "step": 19688 + }, + { + "epoch": 1.2222360171332796, + "grad_norm": 0.15813449854649472, + "learning_rate": 7.387206049847407e-05, + "loss": 2.8397, + "step": 19689 + }, + { + "epoch": 1.2222980942330375, + "grad_norm": 0.16413856060213272, + "learning_rate": 7.386888712101885e-05, + "loss": 2.9622, + "step": 19690 + }, + { + "epoch": 1.2223601713327954, + "grad_norm": 0.15199635707064496, + "learning_rate": 7.386571361903385e-05, + "loss": 2.894, + "step": 19691 + }, + { + "epoch": 1.2224222484325533, + "grad_norm": 0.15921758422948884, + "learning_rate": 7.386253999253564e-05, + "loss": 2.9026, + "step": 19692 + }, + { + "epoch": 1.222484325532311, + "grad_norm": 0.15014315965894728, + "learning_rate": 7.385936624154076e-05, + "loss": 2.8438, + "step": 19693 + }, + { + "epoch": 1.222546402632069, + "grad_norm": 0.16013202832980025, + "learning_rate": 7.385619236606576e-05, + "loss": 2.8595, + "step": 19694 + }, + { + "epoch": 1.2226084797318268, + "grad_norm": 0.15816492632071566, + "learning_rate": 7.385301836612722e-05, + "loss": 2.7875, + "step": 19695 + }, + { + "epoch": 1.2226705568315848, + "grad_norm": 0.14606947314667032, + "learning_rate": 7.384984424174169e-05, + "loss": 2.8357, + "step": 19696 + }, + { + "epoch": 1.2227326339313427, + "grad_norm": 0.1526480167183391, + "learning_rate": 7.384666999292576e-05, + "loss": 2.8419, + "step": 19697 + }, + { + "epoch": 1.2227947110311006, + "grad_norm": 0.15021350264307087, + "learning_rate": 7.384349561969594e-05, + "loss": 2.8451, + "step": 19698 + }, + { + "epoch": 1.2228567881308585, + "grad_norm": 0.1432672529445321, + "learning_rate": 7.384032112206881e-05, + "loss": 2.9045, + "step": 19699 + }, + { + "epoch": 1.2229188652306164, + "grad_norm": 0.1657775181162426, + "learning_rate": 7.383714650006095e-05, + "loss": 2.7127, + "step": 19700 + }, + { + "epoch": 1.2229809423303744, + "grad_norm": 0.1593600404020261, + "learning_rate": 7.383397175368892e-05, + "loss": 2.8783, + "step": 19701 + }, + { + "epoch": 1.2230430194301323, + "grad_norm": 0.1518143294563011, + "learning_rate": 7.383079688296924e-05, + "loss": 2.885, + "step": 19702 + }, + { + "epoch": 1.2231050965298902, + "grad_norm": 0.16673274270196092, + "learning_rate": 7.382762188791853e-05, + "loss": 2.8423, + "step": 19703 + }, + { + "epoch": 1.223167173629648, + "grad_norm": 0.15712253612173688, + "learning_rate": 7.382444676855332e-05, + "loss": 2.9015, + "step": 19704 + }, + { + "epoch": 1.223229250729406, + "grad_norm": 0.14466734226689087, + "learning_rate": 7.382127152489022e-05, + "loss": 2.8663, + "step": 19705 + }, + { + "epoch": 1.223291327829164, + "grad_norm": 0.16116464473836103, + "learning_rate": 7.381809615694573e-05, + "loss": 2.9289, + "step": 19706 + }, + { + "epoch": 1.2233534049289216, + "grad_norm": 0.1568569257191406, + "learning_rate": 7.381492066473646e-05, + "loss": 2.8812, + "step": 19707 + }, + { + "epoch": 1.2234154820286796, + "grad_norm": 0.16595856302997128, + "learning_rate": 7.381174504827897e-05, + "loss": 2.8896, + "step": 19708 + }, + { + "epoch": 1.2234775591284375, + "grad_norm": 0.15229719012920465, + "learning_rate": 7.380856930758982e-05, + "loss": 2.9057, + "step": 19709 + }, + { + "epoch": 1.2235396362281954, + "grad_norm": 0.16507598644483537, + "learning_rate": 7.38053934426856e-05, + "loss": 2.839, + "step": 19710 + }, + { + "epoch": 1.2236017133279533, + "grad_norm": 0.15301135088539977, + "learning_rate": 7.380221745358283e-05, + "loss": 2.8708, + "step": 19711 + }, + { + "epoch": 1.2236637904277112, + "grad_norm": 0.15916006868373828, + "learning_rate": 7.379904134029813e-05, + "loss": 2.8761, + "step": 19712 + }, + { + "epoch": 1.2237258675274691, + "grad_norm": 0.1547012884082818, + "learning_rate": 7.379586510284806e-05, + "loss": 2.8565, + "step": 19713 + }, + { + "epoch": 1.223787944627227, + "grad_norm": 0.14847432482849007, + "learning_rate": 7.379268874124919e-05, + "loss": 2.9248, + "step": 19714 + }, + { + "epoch": 1.223850021726985, + "grad_norm": 0.16286222452721139, + "learning_rate": 7.37895122555181e-05, + "loss": 2.8683, + "step": 19715 + }, + { + "epoch": 1.223912098826743, + "grad_norm": 0.16338517381647813, + "learning_rate": 7.378633564567132e-05, + "loss": 2.8512, + "step": 19716 + }, + { + "epoch": 1.2239741759265006, + "grad_norm": 0.14459408863210177, + "learning_rate": 7.378315891172548e-05, + "loss": 2.941, + "step": 19717 + }, + { + "epoch": 1.2240362530262585, + "grad_norm": 0.15409253266812395, + "learning_rate": 7.37799820536971e-05, + "loss": 2.912, + "step": 19718 + }, + { + "epoch": 1.2240983301260164, + "grad_norm": 0.15988186266272358, + "learning_rate": 7.377680507160279e-05, + "loss": 2.872, + "step": 19719 + }, + { + "epoch": 1.2241604072257743, + "grad_norm": 0.15509364677408854, + "learning_rate": 7.37736279654591e-05, + "loss": 2.8087, + "step": 19720 + }, + { + "epoch": 1.2242224843255323, + "grad_norm": 0.1519603918735608, + "learning_rate": 7.377045073528264e-05, + "loss": 2.8213, + "step": 19721 + }, + { + "epoch": 1.2242845614252902, + "grad_norm": 0.1488759879997101, + "learning_rate": 7.376727338108997e-05, + "loss": 2.8219, + "step": 19722 + }, + { + "epoch": 1.224346638525048, + "grad_norm": 0.14660525103105618, + "learning_rate": 7.376409590289766e-05, + "loss": 2.8323, + "step": 19723 + }, + { + "epoch": 1.224408715624806, + "grad_norm": 0.1642759387961142, + "learning_rate": 7.376091830072229e-05, + "loss": 2.8623, + "step": 19724 + }, + { + "epoch": 1.224470792724564, + "grad_norm": 0.15544844307857528, + "learning_rate": 7.375774057458045e-05, + "loss": 2.7944, + "step": 19725 + }, + { + "epoch": 1.2245328698243219, + "grad_norm": 0.15852786513703562, + "learning_rate": 7.375456272448871e-05, + "loss": 2.899, + "step": 19726 + }, + { + "epoch": 1.2245949469240798, + "grad_norm": 0.18571318150220875, + "learning_rate": 7.375138475046363e-05, + "loss": 2.8115, + "step": 19727 + }, + { + "epoch": 1.2246570240238377, + "grad_norm": 0.18720165995183843, + "learning_rate": 7.374820665252184e-05, + "loss": 2.8109, + "step": 19728 + }, + { + "epoch": 1.2247191011235956, + "grad_norm": 0.15183786615236639, + "learning_rate": 7.374502843067987e-05, + "loss": 2.8596, + "step": 19729 + }, + { + "epoch": 1.2247811782233535, + "grad_norm": 0.1741609387638471, + "learning_rate": 7.374185008495432e-05, + "loss": 2.8798, + "step": 19730 + }, + { + "epoch": 1.2248432553231112, + "grad_norm": 0.15661393340098237, + "learning_rate": 7.373867161536179e-05, + "loss": 2.7856, + "step": 19731 + }, + { + "epoch": 1.2249053324228691, + "grad_norm": 0.16827309006022353, + "learning_rate": 7.373549302191883e-05, + "loss": 2.9183, + "step": 19732 + }, + { + "epoch": 1.224967409522627, + "grad_norm": 0.15255738830871568, + "learning_rate": 7.373231430464206e-05, + "loss": 2.8487, + "step": 19733 + }, + { + "epoch": 1.225029486622385, + "grad_norm": 0.16119495118473728, + "learning_rate": 7.372913546354802e-05, + "loss": 2.8303, + "step": 19734 + }, + { + "epoch": 1.225091563722143, + "grad_norm": 0.14789324038208623, + "learning_rate": 7.372595649865334e-05, + "loss": 2.9031, + "step": 19735 + }, + { + "epoch": 1.2251536408219008, + "grad_norm": 0.17073857596196826, + "learning_rate": 7.372277740997458e-05, + "loss": 2.8805, + "step": 19736 + }, + { + "epoch": 1.2252157179216587, + "grad_norm": 0.1684970697560624, + "learning_rate": 7.371959819752834e-05, + "loss": 2.8201, + "step": 19737 + }, + { + "epoch": 1.2252777950214166, + "grad_norm": 0.15674287426835218, + "learning_rate": 7.37164188613312e-05, + "loss": 2.8822, + "step": 19738 + }, + { + "epoch": 1.2253398721211746, + "grad_norm": 0.1613636675053431, + "learning_rate": 7.371323940139974e-05, + "loss": 2.8564, + "step": 19739 + }, + { + "epoch": 1.2254019492209325, + "grad_norm": 0.15535991587732215, + "learning_rate": 7.371005981775054e-05, + "loss": 2.9473, + "step": 19740 + }, + { + "epoch": 1.2254640263206902, + "grad_norm": 0.17200401730718248, + "learning_rate": 7.370688011040021e-05, + "loss": 2.8556, + "step": 19741 + }, + { + "epoch": 1.225526103420448, + "grad_norm": 0.1482941098058386, + "learning_rate": 7.370370027936533e-05, + "loss": 2.7937, + "step": 19742 + }, + { + "epoch": 1.225588180520206, + "grad_norm": 0.15746520868223002, + "learning_rate": 7.370052032466251e-05, + "loss": 2.9449, + "step": 19743 + }, + { + "epoch": 1.225650257619964, + "grad_norm": 0.15691683453634295, + "learning_rate": 7.369734024630828e-05, + "loss": 2.8163, + "step": 19744 + }, + { + "epoch": 1.2257123347197219, + "grad_norm": 0.1567599320238686, + "learning_rate": 7.36941600443193e-05, + "loss": 2.828, + "step": 19745 + }, + { + "epoch": 1.2257744118194798, + "grad_norm": 0.15813443965215118, + "learning_rate": 7.369097971871214e-05, + "loss": 2.8764, + "step": 19746 + }, + { + "epoch": 1.2258364889192377, + "grad_norm": 0.18694410770220452, + "learning_rate": 7.368779926950338e-05, + "loss": 2.8436, + "step": 19747 + }, + { + "epoch": 1.2258985660189956, + "grad_norm": 0.16058273054264496, + "learning_rate": 7.368461869670962e-05, + "loss": 2.9145, + "step": 19748 + }, + { + "epoch": 1.2259606431187535, + "grad_norm": 0.15120306038350054, + "learning_rate": 7.368143800034746e-05, + "loss": 2.8425, + "step": 19749 + }, + { + "epoch": 1.2260227202185114, + "grad_norm": 0.1554761738658012, + "learning_rate": 7.367825718043348e-05, + "loss": 2.8762, + "step": 19750 + }, + { + "epoch": 1.2260847973182694, + "grad_norm": 0.16610078871194953, + "learning_rate": 7.367507623698431e-05, + "loss": 2.8204, + "step": 19751 + }, + { + "epoch": 1.2261468744180273, + "grad_norm": 0.15274148153762668, + "learning_rate": 7.367189517001649e-05, + "loss": 2.8959, + "step": 19752 + }, + { + "epoch": 1.2262089515177852, + "grad_norm": 0.15547265162915663, + "learning_rate": 7.366871397954667e-05, + "loss": 2.8569, + "step": 19753 + }, + { + "epoch": 1.2262710286175431, + "grad_norm": 0.1715175477095881, + "learning_rate": 7.36655326655914e-05, + "loss": 2.8972, + "step": 19754 + }, + { + "epoch": 1.2263331057173008, + "grad_norm": 0.14616465964112, + "learning_rate": 7.366235122816732e-05, + "loss": 2.8203, + "step": 19755 + }, + { + "epoch": 1.2263951828170587, + "grad_norm": 0.14715754010689516, + "learning_rate": 7.365916966729098e-05, + "loss": 2.8767, + "step": 19756 + }, + { + "epoch": 1.2264572599168166, + "grad_norm": 0.1490413405037392, + "learning_rate": 7.365598798297904e-05, + "loss": 2.8106, + "step": 19757 + }, + { + "epoch": 1.2265193370165746, + "grad_norm": 0.15791034686679223, + "learning_rate": 7.365280617524805e-05, + "loss": 2.9162, + "step": 19758 + }, + { + "epoch": 1.2265814141163325, + "grad_norm": 0.16543742638043366, + "learning_rate": 7.364962424411465e-05, + "loss": 2.9237, + "step": 19759 + }, + { + "epoch": 1.2266434912160904, + "grad_norm": 0.16049799279379448, + "learning_rate": 7.364644218959538e-05, + "loss": 2.8895, + "step": 19760 + }, + { + "epoch": 1.2267055683158483, + "grad_norm": 0.15894299444278756, + "learning_rate": 7.364326001170692e-05, + "loss": 2.8776, + "step": 19761 + }, + { + "epoch": 1.2267676454156062, + "grad_norm": 0.1481084438922106, + "learning_rate": 7.364007771046582e-05, + "loss": 2.8446, + "step": 19762 + }, + { + "epoch": 1.2268297225153642, + "grad_norm": 0.15293954230427054, + "learning_rate": 7.363689528588869e-05, + "loss": 2.8285, + "step": 19763 + }, + { + "epoch": 1.226891799615122, + "grad_norm": 0.1627212547236212, + "learning_rate": 7.363371273799214e-05, + "loss": 2.8435, + "step": 19764 + }, + { + "epoch": 1.2269538767148798, + "grad_norm": 0.17117382749230023, + "learning_rate": 7.363053006679277e-05, + "loss": 2.8225, + "step": 19765 + }, + { + "epoch": 1.2270159538146377, + "grad_norm": 0.1591455914542187, + "learning_rate": 7.36273472723072e-05, + "loss": 2.8943, + "step": 19766 + }, + { + "epoch": 1.2270780309143956, + "grad_norm": 0.16442095675235147, + "learning_rate": 7.362416435455202e-05, + "loss": 2.8496, + "step": 19767 + }, + { + "epoch": 1.2271401080141535, + "grad_norm": 0.15785784188033655, + "learning_rate": 7.362098131354384e-05, + "loss": 2.8447, + "step": 19768 + }, + { + "epoch": 1.2272021851139114, + "grad_norm": 0.1587465161219109, + "learning_rate": 7.361779814929927e-05, + "loss": 2.8964, + "step": 19769 + }, + { + "epoch": 1.2272642622136694, + "grad_norm": 0.14781514000097631, + "learning_rate": 7.36146148618349e-05, + "loss": 2.9217, + "step": 19770 + }, + { + "epoch": 1.2273263393134273, + "grad_norm": 0.14973858888268238, + "learning_rate": 7.361143145116737e-05, + "loss": 2.9332, + "step": 19771 + }, + { + "epoch": 1.2273884164131852, + "grad_norm": 0.1517161991011012, + "learning_rate": 7.360824791731324e-05, + "loss": 2.855, + "step": 19772 + }, + { + "epoch": 1.227450493512943, + "grad_norm": 0.16474229325465536, + "learning_rate": 7.360506426028919e-05, + "loss": 2.9308, + "step": 19773 + }, + { + "epoch": 1.227512570612701, + "grad_norm": 0.16682253133276928, + "learning_rate": 7.360188048011177e-05, + "loss": 2.9185, + "step": 19774 + }, + { + "epoch": 1.227574647712459, + "grad_norm": 0.1658868290069039, + "learning_rate": 7.35986965767976e-05, + "loss": 2.9625, + "step": 19775 + }, + { + "epoch": 1.2276367248122169, + "grad_norm": 0.179096306710206, + "learning_rate": 7.359551255036329e-05, + "loss": 2.9178, + "step": 19776 + }, + { + "epoch": 1.2276988019119748, + "grad_norm": 0.17636836337756834, + "learning_rate": 7.359232840082549e-05, + "loss": 2.9213, + "step": 19777 + }, + { + "epoch": 1.2277608790117325, + "grad_norm": 0.15343441938737867, + "learning_rate": 7.358914412820078e-05, + "loss": 2.8484, + "step": 19778 + }, + { + "epoch": 1.2278229561114904, + "grad_norm": 0.16491956165268823, + "learning_rate": 7.358595973250577e-05, + "loss": 2.8579, + "step": 19779 + }, + { + "epoch": 1.2278850332112483, + "grad_norm": 0.3393829444355835, + "learning_rate": 7.35827752137571e-05, + "loss": 2.8881, + "step": 19780 + }, + { + "epoch": 1.2279471103110062, + "grad_norm": 0.1749923593692369, + "learning_rate": 7.357959057197135e-05, + "loss": 2.9453, + "step": 19781 + }, + { + "epoch": 1.2280091874107641, + "grad_norm": 0.17040275076703487, + "learning_rate": 7.357640580716516e-05, + "loss": 2.8744, + "step": 19782 + }, + { + "epoch": 1.228071264510522, + "grad_norm": 0.1923313749842826, + "learning_rate": 7.357322091935514e-05, + "loss": 2.7808, + "step": 19783 + }, + { + "epoch": 1.22813334161028, + "grad_norm": 0.2299084155686018, + "learning_rate": 7.35700359085579e-05, + "loss": 2.8187, + "step": 19784 + }, + { + "epoch": 1.228195418710038, + "grad_norm": 0.15662909058604896, + "learning_rate": 7.356685077479005e-05, + "loss": 2.8866, + "step": 19785 + }, + { + "epoch": 1.2282574958097958, + "grad_norm": 0.17263236937928028, + "learning_rate": 7.356366551806824e-05, + "loss": 2.812, + "step": 19786 + }, + { + "epoch": 1.2283195729095537, + "grad_norm": 0.1613862327873986, + "learning_rate": 7.356048013840906e-05, + "loss": 2.8613, + "step": 19787 + }, + { + "epoch": 1.2283816500093117, + "grad_norm": 0.17580967249496862, + "learning_rate": 7.355729463582912e-05, + "loss": 2.8919, + "step": 19788 + }, + { + "epoch": 1.2284437271090693, + "grad_norm": 0.1756700004759919, + "learning_rate": 7.355410901034507e-05, + "loss": 2.8059, + "step": 19789 + }, + { + "epoch": 1.2285058042088273, + "grad_norm": 0.16524790918099502, + "learning_rate": 7.35509232619735e-05, + "loss": 2.8762, + "step": 19790 + }, + { + "epoch": 1.2285678813085852, + "grad_norm": 0.16873794022488633, + "learning_rate": 7.354773739073107e-05, + "loss": 2.8722, + "step": 19791 + }, + { + "epoch": 1.228629958408343, + "grad_norm": 0.15560088628641236, + "learning_rate": 7.354455139663436e-05, + "loss": 2.8194, + "step": 19792 + }, + { + "epoch": 1.228692035508101, + "grad_norm": 0.16084969240850802, + "learning_rate": 7.354136527970001e-05, + "loss": 2.798, + "step": 19793 + }, + { + "epoch": 1.228754112607859, + "grad_norm": 0.15531846359286083, + "learning_rate": 7.353817903994465e-05, + "loss": 2.8799, + "step": 19794 + }, + { + "epoch": 1.2288161897076169, + "grad_norm": 0.1646080426839783, + "learning_rate": 7.353499267738489e-05, + "loss": 2.8786, + "step": 19795 + }, + { + "epoch": 1.2288782668073748, + "grad_norm": 0.15363204147917453, + "learning_rate": 7.353180619203736e-05, + "loss": 2.855, + "step": 19796 + }, + { + "epoch": 1.2289403439071327, + "grad_norm": 0.1859842265744704, + "learning_rate": 7.35286195839187e-05, + "loss": 2.8585, + "step": 19797 + }, + { + "epoch": 1.2290024210068906, + "grad_norm": 0.1522022286891833, + "learning_rate": 7.35254328530455e-05, + "loss": 2.8618, + "step": 19798 + }, + { + "epoch": 1.2290644981066485, + "grad_norm": 0.1626352490106395, + "learning_rate": 7.352224599943441e-05, + "loss": 2.9212, + "step": 19799 + }, + { + "epoch": 1.2291265752064064, + "grad_norm": 0.16171601265511765, + "learning_rate": 7.351905902310205e-05, + "loss": 2.9126, + "step": 19800 + }, + { + "epoch": 1.2291886523061644, + "grad_norm": 0.15503767096062465, + "learning_rate": 7.351587192406506e-05, + "loss": 2.8315, + "step": 19801 + }, + { + "epoch": 1.229250729405922, + "grad_norm": 0.17232771336292477, + "learning_rate": 7.351268470234006e-05, + "loss": 2.8575, + "step": 19802 + }, + { + "epoch": 1.22931280650568, + "grad_norm": 0.1476341392472412, + "learning_rate": 7.350949735794365e-05, + "loss": 2.8089, + "step": 19803 + }, + { + "epoch": 1.229374883605438, + "grad_norm": 0.16132008045266116, + "learning_rate": 7.350630989089253e-05, + "loss": 2.9701, + "step": 19804 + }, + { + "epoch": 1.2294369607051958, + "grad_norm": 0.15573760895401084, + "learning_rate": 7.350312230120325e-05, + "loss": 2.8854, + "step": 19805 + }, + { + "epoch": 1.2294990378049537, + "grad_norm": 0.155581474652023, + "learning_rate": 7.349993458889248e-05, + "loss": 2.8152, + "step": 19806 + }, + { + "epoch": 1.2295611149047116, + "grad_norm": 0.15210428735877762, + "learning_rate": 7.349674675397687e-05, + "loss": 2.8589, + "step": 19807 + }, + { + "epoch": 1.2296231920044696, + "grad_norm": 0.15668921727691426, + "learning_rate": 7.3493558796473e-05, + "loss": 2.8865, + "step": 19808 + }, + { + "epoch": 1.2296852691042275, + "grad_norm": 0.1473457884723353, + "learning_rate": 7.349037071639752e-05, + "loss": 2.8592, + "step": 19809 + }, + { + "epoch": 1.2297473462039854, + "grad_norm": 0.20335925789183779, + "learning_rate": 7.348718251376709e-05, + "loss": 2.7644, + "step": 19810 + }, + { + "epoch": 1.2298094233037433, + "grad_norm": 0.14149520431309356, + "learning_rate": 7.348399418859834e-05, + "loss": 2.8757, + "step": 19811 + }, + { + "epoch": 1.2298715004035012, + "grad_norm": 0.15376775895847372, + "learning_rate": 7.348080574090788e-05, + "loss": 2.8558, + "step": 19812 + }, + { + "epoch": 1.229933577503259, + "grad_norm": 0.14872578809540254, + "learning_rate": 7.347761717071235e-05, + "loss": 2.8515, + "step": 19813 + }, + { + "epoch": 1.2299956546030169, + "grad_norm": 0.15463590194876117, + "learning_rate": 7.347442847802842e-05, + "loss": 2.9064, + "step": 19814 + }, + { + "epoch": 1.2300577317027748, + "grad_norm": 0.1400818268669218, + "learning_rate": 7.347123966287267e-05, + "loss": 2.7518, + "step": 19815 + }, + { + "epoch": 1.2301198088025327, + "grad_norm": 0.14867672966925474, + "learning_rate": 7.346805072526177e-05, + "loss": 2.9126, + "step": 19816 + }, + { + "epoch": 1.2301818859022906, + "grad_norm": 0.16441968228956064, + "learning_rate": 7.346486166521235e-05, + "loss": 2.8579, + "step": 19817 + }, + { + "epoch": 1.2302439630020485, + "grad_norm": 0.18196228218867458, + "learning_rate": 7.346167248274106e-05, + "loss": 2.9109, + "step": 19818 + }, + { + "epoch": 1.2303060401018064, + "grad_norm": 0.15240484626472417, + "learning_rate": 7.345848317786453e-05, + "loss": 2.783, + "step": 19819 + }, + { + "epoch": 1.2303681172015644, + "grad_norm": 0.14989619289043143, + "learning_rate": 7.345529375059939e-05, + "loss": 2.7955, + "step": 19820 + }, + { + "epoch": 1.2304301943013223, + "grad_norm": 0.1494732596119669, + "learning_rate": 7.345210420096229e-05, + "loss": 2.9412, + "step": 19821 + }, + { + "epoch": 1.2304922714010802, + "grad_norm": 0.15853567845426667, + "learning_rate": 7.344891452896987e-05, + "loss": 2.8328, + "step": 19822 + }, + { + "epoch": 1.2305543485008381, + "grad_norm": 0.15952754480615527, + "learning_rate": 7.344572473463878e-05, + "loss": 2.9112, + "step": 19823 + }, + { + "epoch": 1.230616425600596, + "grad_norm": 0.15872351688347613, + "learning_rate": 7.344253481798563e-05, + "loss": 2.8601, + "step": 19824 + }, + { + "epoch": 1.230678502700354, + "grad_norm": 0.1707781341003238, + "learning_rate": 7.34393447790271e-05, + "loss": 2.8532, + "step": 19825 + }, + { + "epoch": 1.2307405798001116, + "grad_norm": 0.19457854649648135, + "learning_rate": 7.343615461777981e-05, + "loss": 2.8634, + "step": 19826 + }, + { + "epoch": 1.2308026568998696, + "grad_norm": 0.164581205458909, + "learning_rate": 7.343296433426042e-05, + "loss": 2.8215, + "step": 19827 + }, + { + "epoch": 1.2308647339996275, + "grad_norm": 0.1781420381662451, + "learning_rate": 7.342977392848555e-05, + "loss": 2.9293, + "step": 19828 + }, + { + "epoch": 1.2309268110993854, + "grad_norm": 0.17280708430394182, + "learning_rate": 7.342658340047189e-05, + "loss": 2.7883, + "step": 19829 + }, + { + "epoch": 1.2309888881991433, + "grad_norm": 0.17310964315693045, + "learning_rate": 7.342339275023605e-05, + "loss": 2.7509, + "step": 19830 + }, + { + "epoch": 1.2310509652989012, + "grad_norm": 0.1685460394522716, + "learning_rate": 7.342020197779466e-05, + "loss": 2.8508, + "step": 19831 + }, + { + "epoch": 1.2311130423986592, + "grad_norm": 0.17481309454963076, + "learning_rate": 7.341701108316442e-05, + "loss": 2.9006, + "step": 19832 + }, + { + "epoch": 1.231175119498417, + "grad_norm": 0.19567543641014068, + "learning_rate": 7.341382006636194e-05, + "loss": 2.8758, + "step": 19833 + }, + { + "epoch": 1.231237196598175, + "grad_norm": 0.16432702220094397, + "learning_rate": 7.341062892740388e-05, + "loss": 2.886, + "step": 19834 + }, + { + "epoch": 1.231299273697933, + "grad_norm": 0.15611236590132954, + "learning_rate": 7.340743766630687e-05, + "loss": 2.8619, + "step": 19835 + }, + { + "epoch": 1.2313613507976908, + "grad_norm": 0.154534302094099, + "learning_rate": 7.34042462830876e-05, + "loss": 2.7395, + "step": 19836 + }, + { + "epoch": 1.2314234278974485, + "grad_norm": 0.15521841453821664, + "learning_rate": 7.340105477776267e-05, + "loss": 2.8786, + "step": 19837 + }, + { + "epoch": 1.2314855049972064, + "grad_norm": 0.19849008178915276, + "learning_rate": 7.339786315034876e-05, + "loss": 2.7712, + "step": 19838 + }, + { + "epoch": 1.2315475820969644, + "grad_norm": 0.18383959279279208, + "learning_rate": 7.339467140086253e-05, + "loss": 2.815, + "step": 19839 + }, + { + "epoch": 1.2316096591967223, + "grad_norm": 0.1602366466630877, + "learning_rate": 7.339147952932062e-05, + "loss": 2.9399, + "step": 19840 + }, + { + "epoch": 1.2316717362964802, + "grad_norm": 0.1887885264210082, + "learning_rate": 7.338828753573968e-05, + "loss": 2.9227, + "step": 19841 + }, + { + "epoch": 1.231733813396238, + "grad_norm": 0.15945530065223168, + "learning_rate": 7.338509542013635e-05, + "loss": 2.8658, + "step": 19842 + }, + { + "epoch": 1.231795890495996, + "grad_norm": 0.18871004338527414, + "learning_rate": 7.338190318252731e-05, + "loss": 2.9567, + "step": 19843 + }, + { + "epoch": 1.231857967595754, + "grad_norm": 0.1935488514256131, + "learning_rate": 7.33787108229292e-05, + "loss": 2.9135, + "step": 19844 + }, + { + "epoch": 1.2319200446955119, + "grad_norm": 0.16056788856927415, + "learning_rate": 7.337551834135871e-05, + "loss": 2.8329, + "step": 19845 + }, + { + "epoch": 1.2319821217952698, + "grad_norm": 0.1822266099281699, + "learning_rate": 7.337232573783243e-05, + "loss": 2.8983, + "step": 19846 + }, + { + "epoch": 1.2320441988950277, + "grad_norm": 0.16018346229670807, + "learning_rate": 7.336913301236708e-05, + "loss": 2.8103, + "step": 19847 + }, + { + "epoch": 1.2321062759947856, + "grad_norm": 0.1968775983669233, + "learning_rate": 7.336594016497929e-05, + "loss": 2.7868, + "step": 19848 + }, + { + "epoch": 1.2321683530945435, + "grad_norm": 0.14867263290604985, + "learning_rate": 7.33627471956857e-05, + "loss": 2.8814, + "step": 19849 + }, + { + "epoch": 1.2322304301943012, + "grad_norm": 0.26505629582554796, + "learning_rate": 7.335955410450299e-05, + "loss": 2.9479, + "step": 19850 + }, + { + "epoch": 1.2322925072940591, + "grad_norm": 0.14857423908970166, + "learning_rate": 7.335636089144782e-05, + "loss": 2.8989, + "step": 19851 + }, + { + "epoch": 1.232354584393817, + "grad_norm": 0.19687218664066503, + "learning_rate": 7.335316755653684e-05, + "loss": 2.8554, + "step": 19852 + }, + { + "epoch": 1.232416661493575, + "grad_norm": 0.1797354261090175, + "learning_rate": 7.334997409978672e-05, + "loss": 2.9879, + "step": 19853 + }, + { + "epoch": 1.232478738593333, + "grad_norm": 0.1815929452809031, + "learning_rate": 7.334678052121412e-05, + "loss": 2.8191, + "step": 19854 + }, + { + "epoch": 1.2325408156930908, + "grad_norm": 0.22465455803238657, + "learning_rate": 7.334358682083569e-05, + "loss": 2.9533, + "step": 19855 + }, + { + "epoch": 1.2326028927928487, + "grad_norm": 0.19685580863964927, + "learning_rate": 7.334039299866811e-05, + "loss": 2.9149, + "step": 19856 + }, + { + "epoch": 1.2326649698926067, + "grad_norm": 0.1853340333582616, + "learning_rate": 7.333719905472803e-05, + "loss": 2.8017, + "step": 19857 + }, + { + "epoch": 1.2327270469923646, + "grad_norm": 0.15666826413380922, + "learning_rate": 7.33340049890321e-05, + "loss": 2.8802, + "step": 19858 + }, + { + "epoch": 1.2327891240921225, + "grad_norm": 0.16935816798205436, + "learning_rate": 7.333081080159702e-05, + "loss": 2.9071, + "step": 19859 + }, + { + "epoch": 1.2328512011918804, + "grad_norm": 0.21462227069172451, + "learning_rate": 7.332761649243944e-05, + "loss": 2.8938, + "step": 19860 + }, + { + "epoch": 1.232913278291638, + "grad_norm": 0.167401731795066, + "learning_rate": 7.332442206157601e-05, + "loss": 2.8937, + "step": 19861 + }, + { + "epoch": 1.232975355391396, + "grad_norm": 0.24123910592737177, + "learning_rate": 7.332122750902342e-05, + "loss": 2.913, + "step": 19862 + }, + { + "epoch": 1.233037432491154, + "grad_norm": 0.1700000144804219, + "learning_rate": 7.331803283479831e-05, + "loss": 2.8913, + "step": 19863 + }, + { + "epoch": 1.2330995095909119, + "grad_norm": 0.1744696053452034, + "learning_rate": 7.331483803891736e-05, + "loss": 2.923, + "step": 19864 + }, + { + "epoch": 1.2331615866906698, + "grad_norm": 0.19306265094599284, + "learning_rate": 7.331164312139727e-05, + "loss": 2.81, + "step": 19865 + }, + { + "epoch": 1.2332236637904277, + "grad_norm": 0.16420175115039815, + "learning_rate": 7.330844808225466e-05, + "loss": 2.7998, + "step": 19866 + }, + { + "epoch": 1.2332857408901856, + "grad_norm": 0.18711446822845884, + "learning_rate": 7.330525292150621e-05, + "loss": 2.8826, + "step": 19867 + }, + { + "epoch": 1.2333478179899435, + "grad_norm": 0.15940929305689255, + "learning_rate": 7.330205763916861e-05, + "loss": 2.8957, + "step": 19868 + }, + { + "epoch": 1.2334098950897014, + "grad_norm": 0.1749161146696888, + "learning_rate": 7.329886223525851e-05, + "loss": 2.9274, + "step": 19869 + }, + { + "epoch": 1.2334719721894594, + "grad_norm": 0.19159192207807155, + "learning_rate": 7.32956667097926e-05, + "loss": 2.9187, + "step": 19870 + }, + { + "epoch": 1.2335340492892173, + "grad_norm": 0.15789494697774845, + "learning_rate": 7.329247106278752e-05, + "loss": 2.8894, + "step": 19871 + }, + { + "epoch": 1.2335961263889752, + "grad_norm": 0.16753392714859155, + "learning_rate": 7.328927529425999e-05, + "loss": 2.8377, + "step": 19872 + }, + { + "epoch": 1.2336582034887331, + "grad_norm": 0.16510660719246065, + "learning_rate": 7.328607940422663e-05, + "loss": 2.9163, + "step": 19873 + }, + { + "epoch": 1.2337202805884908, + "grad_norm": 0.16911832957636144, + "learning_rate": 7.328288339270417e-05, + "loss": 2.9372, + "step": 19874 + }, + { + "epoch": 1.2337823576882487, + "grad_norm": 0.16879528197518726, + "learning_rate": 7.327968725970924e-05, + "loss": 2.7946, + "step": 19875 + }, + { + "epoch": 1.2338444347880066, + "grad_norm": 0.1932979929432769, + "learning_rate": 7.327649100525853e-05, + "loss": 2.9222, + "step": 19876 + }, + { + "epoch": 1.2339065118877646, + "grad_norm": 0.15332783844572243, + "learning_rate": 7.327329462936871e-05, + "loss": 2.901, + "step": 19877 + }, + { + "epoch": 1.2339685889875225, + "grad_norm": 0.2117579803860197, + "learning_rate": 7.327009813205647e-05, + "loss": 2.8346, + "step": 19878 + }, + { + "epoch": 1.2340306660872804, + "grad_norm": 0.17272177095982408, + "learning_rate": 7.326690151333848e-05, + "loss": 2.822, + "step": 19879 + }, + { + "epoch": 1.2340927431870383, + "grad_norm": 0.20509111272466787, + "learning_rate": 7.326370477323141e-05, + "loss": 2.8362, + "step": 19880 + }, + { + "epoch": 1.2341548202867962, + "grad_norm": 0.23356736426459784, + "learning_rate": 7.326050791175196e-05, + "loss": 2.8647, + "step": 19881 + }, + { + "epoch": 1.2342168973865542, + "grad_norm": 0.15641247527340962, + "learning_rate": 7.325731092891678e-05, + "loss": 2.9221, + "step": 19882 + }, + { + "epoch": 1.234278974486312, + "grad_norm": 0.18010768275435166, + "learning_rate": 7.325411382474256e-05, + "loss": 2.8226, + "step": 19883 + }, + { + "epoch": 1.23434105158607, + "grad_norm": 0.15923741235943978, + "learning_rate": 7.325091659924599e-05, + "loss": 2.8206, + "step": 19884 + }, + { + "epoch": 1.2344031286858277, + "grad_norm": 0.21336395246190257, + "learning_rate": 7.324771925244376e-05, + "loss": 2.8629, + "step": 19885 + }, + { + "epoch": 1.2344652057855856, + "grad_norm": 0.16009476109337503, + "learning_rate": 7.324452178435252e-05, + "loss": 2.798, + "step": 19886 + }, + { + "epoch": 1.2345272828853435, + "grad_norm": 0.17546720001337784, + "learning_rate": 7.324132419498895e-05, + "loss": 2.9512, + "step": 19887 + }, + { + "epoch": 1.2345893599851014, + "grad_norm": 0.2011684676687323, + "learning_rate": 7.323812648436979e-05, + "loss": 2.9607, + "step": 19888 + }, + { + "epoch": 1.2346514370848594, + "grad_norm": 0.17371424148364928, + "learning_rate": 7.323492865251164e-05, + "loss": 2.8483, + "step": 19889 + }, + { + "epoch": 1.2347135141846173, + "grad_norm": 0.1549339121144985, + "learning_rate": 7.323173069943125e-05, + "loss": 2.7935, + "step": 19890 + }, + { + "epoch": 1.2347755912843752, + "grad_norm": 0.15457492835937647, + "learning_rate": 7.322853262514528e-05, + "loss": 2.8421, + "step": 19891 + }, + { + "epoch": 1.2348376683841331, + "grad_norm": 0.16932288738857906, + "learning_rate": 7.32253344296704e-05, + "loss": 2.783, + "step": 19892 + }, + { + "epoch": 1.234899745483891, + "grad_norm": 0.18714065688458556, + "learning_rate": 7.322213611302333e-05, + "loss": 2.9022, + "step": 19893 + }, + { + "epoch": 1.234961822583649, + "grad_norm": 0.1557278073027191, + "learning_rate": 7.321893767522072e-05, + "loss": 2.876, + "step": 19894 + }, + { + "epoch": 1.2350238996834069, + "grad_norm": 0.21971120992643023, + "learning_rate": 7.32157391162793e-05, + "loss": 2.9373, + "step": 19895 + }, + { + "epoch": 1.2350859767831648, + "grad_norm": 0.16719003301109867, + "learning_rate": 7.321254043621572e-05, + "loss": 2.8822, + "step": 19896 + }, + { + "epoch": 1.2351480538829227, + "grad_norm": 0.1573598604657841, + "learning_rate": 7.320934163504669e-05, + "loss": 2.8132, + "step": 19897 + }, + { + "epoch": 1.2352101309826804, + "grad_norm": 0.19786729740638767, + "learning_rate": 7.320614271278887e-05, + "loss": 2.888, + "step": 19898 + }, + { + "epoch": 1.2352722080824383, + "grad_norm": 0.17848150190104586, + "learning_rate": 7.320294366945899e-05, + "loss": 2.8586, + "step": 19899 + }, + { + "epoch": 1.2353342851821962, + "grad_norm": 0.1580283353230116, + "learning_rate": 7.319974450507372e-05, + "loss": 2.9273, + "step": 19900 + }, + { + "epoch": 1.2353963622819542, + "grad_norm": 0.15463674512168113, + "learning_rate": 7.319654521964975e-05, + "loss": 2.8873, + "step": 19901 + }, + { + "epoch": 1.235458439381712, + "grad_norm": 0.17760711347721295, + "learning_rate": 7.319334581320376e-05, + "loss": 2.9439, + "step": 19902 + }, + { + "epoch": 1.23552051648147, + "grad_norm": 0.14920698414097305, + "learning_rate": 7.319014628575247e-05, + "loss": 2.9322, + "step": 19903 + }, + { + "epoch": 1.235582593581228, + "grad_norm": 0.1690070640298051, + "learning_rate": 7.318694663731255e-05, + "loss": 2.897, + "step": 19904 + }, + { + "epoch": 1.2356446706809858, + "grad_norm": 0.1963581039698408, + "learning_rate": 7.318374686790069e-05, + "loss": 2.806, + "step": 19905 + }, + { + "epoch": 1.2357067477807437, + "grad_norm": 0.15644138655100076, + "learning_rate": 7.318054697753362e-05, + "loss": 2.8297, + "step": 19906 + }, + { + "epoch": 1.2357688248805017, + "grad_norm": 0.18703399804374274, + "learning_rate": 7.317734696622797e-05, + "loss": 2.804, + "step": 19907 + }, + { + "epoch": 1.2358309019802596, + "grad_norm": 0.21932449854492655, + "learning_rate": 7.31741468340005e-05, + "loss": 2.7856, + "step": 19908 + }, + { + "epoch": 1.2358929790800173, + "grad_norm": 0.16835228247871156, + "learning_rate": 7.317094658086787e-05, + "loss": 2.7623, + "step": 19909 + }, + { + "epoch": 1.2359550561797752, + "grad_norm": 0.1666920401709722, + "learning_rate": 7.316774620684681e-05, + "loss": 2.8667, + "step": 19910 + }, + { + "epoch": 1.236017133279533, + "grad_norm": 0.18893723199871412, + "learning_rate": 7.316454571195398e-05, + "loss": 2.8989, + "step": 19911 + }, + { + "epoch": 1.236079210379291, + "grad_norm": 0.16779161694479125, + "learning_rate": 7.316134509620609e-05, + "loss": 2.8126, + "step": 19912 + }, + { + "epoch": 1.236141287479049, + "grad_norm": 0.2029203337495331, + "learning_rate": 7.315814435961985e-05, + "loss": 2.9353, + "step": 19913 + }, + { + "epoch": 1.2362033645788069, + "grad_norm": 0.16018079360284554, + "learning_rate": 7.315494350221193e-05, + "loss": 2.7864, + "step": 19914 + }, + { + "epoch": 1.2362654416785648, + "grad_norm": 0.1725472503197901, + "learning_rate": 7.315174252399905e-05, + "loss": 3.0061, + "step": 19915 + }, + { + "epoch": 1.2363275187783227, + "grad_norm": 0.16196568566878558, + "learning_rate": 7.314854142499792e-05, + "loss": 2.9647, + "step": 19916 + }, + { + "epoch": 1.2363895958780806, + "grad_norm": 0.18940907388963316, + "learning_rate": 7.314534020522523e-05, + "loss": 2.8855, + "step": 19917 + }, + { + "epoch": 1.2364516729778385, + "grad_norm": 0.1811002774744894, + "learning_rate": 7.314213886469768e-05, + "loss": 2.8989, + "step": 19918 + }, + { + "epoch": 1.2365137500775965, + "grad_norm": 0.17375783892313554, + "learning_rate": 7.313893740343197e-05, + "loss": 2.8548, + "step": 19919 + }, + { + "epoch": 1.2365758271773544, + "grad_norm": 0.20334207416498698, + "learning_rate": 7.31357358214448e-05, + "loss": 2.794, + "step": 19920 + }, + { + "epoch": 1.2366379042771123, + "grad_norm": 0.1565827878876884, + "learning_rate": 7.313253411875289e-05, + "loss": 2.7993, + "step": 19921 + }, + { + "epoch": 1.23669998137687, + "grad_norm": 0.1613394597022658, + "learning_rate": 7.312933229537294e-05, + "loss": 2.86, + "step": 19922 + }, + { + "epoch": 1.236762058476628, + "grad_norm": 0.18655710765259742, + "learning_rate": 7.312613035132165e-05, + "loss": 2.8583, + "step": 19923 + }, + { + "epoch": 1.2368241355763858, + "grad_norm": 0.15715618809478626, + "learning_rate": 7.31229282866157e-05, + "loss": 2.8457, + "step": 19924 + }, + { + "epoch": 1.2368862126761437, + "grad_norm": 0.1694471810029572, + "learning_rate": 7.311972610127183e-05, + "loss": 2.9155, + "step": 19925 + }, + { + "epoch": 1.2369482897759017, + "grad_norm": 0.1730025058034967, + "learning_rate": 7.311652379530675e-05, + "loss": 2.8577, + "step": 19926 + }, + { + "epoch": 1.2370103668756596, + "grad_norm": 0.15694041898991182, + "learning_rate": 7.311332136873714e-05, + "loss": 2.7873, + "step": 19927 + }, + { + "epoch": 1.2370724439754175, + "grad_norm": 0.17585993171444478, + "learning_rate": 7.311011882157971e-05, + "loss": 2.8747, + "step": 19928 + }, + { + "epoch": 1.2371345210751754, + "grad_norm": 0.15716049630714243, + "learning_rate": 7.310691615385122e-05, + "loss": 2.8074, + "step": 19929 + }, + { + "epoch": 1.2371965981749333, + "grad_norm": 0.1772986720262938, + "learning_rate": 7.31037133655683e-05, + "loss": 2.8592, + "step": 19930 + }, + { + "epoch": 1.2372586752746912, + "grad_norm": 0.15278893615753322, + "learning_rate": 7.310051045674773e-05, + "loss": 2.8486, + "step": 19931 + }, + { + "epoch": 1.2373207523744492, + "grad_norm": 0.17936527343789452, + "learning_rate": 7.309730742740617e-05, + "loss": 2.9425, + "step": 19932 + }, + { + "epoch": 1.2373828294742069, + "grad_norm": 0.16356100342160257, + "learning_rate": 7.309410427756034e-05, + "loss": 2.7763, + "step": 19933 + }, + { + "epoch": 1.2374449065739648, + "grad_norm": 0.20446821070382348, + "learning_rate": 7.309090100722698e-05, + "loss": 2.925, + "step": 19934 + }, + { + "epoch": 1.2375069836737227, + "grad_norm": 0.19914297457925564, + "learning_rate": 7.308769761642277e-05, + "loss": 2.8317, + "step": 19935 + }, + { + "epoch": 1.2375690607734806, + "grad_norm": 0.18112068208045484, + "learning_rate": 7.308449410516446e-05, + "loss": 2.9189, + "step": 19936 + }, + { + "epoch": 1.2376311378732385, + "grad_norm": 0.16669584493234268, + "learning_rate": 7.308129047346872e-05, + "loss": 2.9116, + "step": 19937 + }, + { + "epoch": 1.2376932149729964, + "grad_norm": 0.1768855165751945, + "learning_rate": 7.307808672135229e-05, + "loss": 2.8958, + "step": 19938 + }, + { + "epoch": 1.2377552920727544, + "grad_norm": 0.16205363866616226, + "learning_rate": 7.307488284883188e-05, + "loss": 2.8338, + "step": 19939 + }, + { + "epoch": 1.2378173691725123, + "grad_norm": 0.1694677961496039, + "learning_rate": 7.307167885592421e-05, + "loss": 2.836, + "step": 19940 + }, + { + "epoch": 1.2378794462722702, + "grad_norm": 0.15797533896917412, + "learning_rate": 7.306847474264598e-05, + "loss": 2.861, + "step": 19941 + }, + { + "epoch": 1.2379415233720281, + "grad_norm": 0.2484826517760771, + "learning_rate": 7.306527050901392e-05, + "loss": 2.8487, + "step": 19942 + }, + { + "epoch": 1.238003600471786, + "grad_norm": 0.17423279945106582, + "learning_rate": 7.306206615504474e-05, + "loss": 2.9038, + "step": 19943 + }, + { + "epoch": 1.238065677571544, + "grad_norm": 0.208539694664962, + "learning_rate": 7.305886168075518e-05, + "loss": 2.866, + "step": 19944 + }, + { + "epoch": 1.2381277546713019, + "grad_norm": 0.18113259056132008, + "learning_rate": 7.305565708616191e-05, + "loss": 2.8687, + "step": 19945 + }, + { + "epoch": 1.2381898317710596, + "grad_norm": 0.16022470220927093, + "learning_rate": 7.305245237128171e-05, + "loss": 2.8255, + "step": 19946 + }, + { + "epoch": 1.2382519088708175, + "grad_norm": 0.15625658617443905, + "learning_rate": 7.304924753613127e-05, + "loss": 2.8427, + "step": 19947 + }, + { + "epoch": 1.2383139859705754, + "grad_norm": 0.16024610278408963, + "learning_rate": 7.304604258072729e-05, + "loss": 2.8217, + "step": 19948 + }, + { + "epoch": 1.2383760630703333, + "grad_norm": 0.15556804131692087, + "learning_rate": 7.304283750508653e-05, + "loss": 2.8569, + "step": 19949 + }, + { + "epoch": 1.2384381401700912, + "grad_norm": 0.15947626136680715, + "learning_rate": 7.303963230922567e-05, + "loss": 2.8638, + "step": 19950 + }, + { + "epoch": 1.2385002172698492, + "grad_norm": 0.15258677978111276, + "learning_rate": 7.303642699316147e-05, + "loss": 2.8422, + "step": 19951 + }, + { + "epoch": 1.238562294369607, + "grad_norm": 0.17781976597828794, + "learning_rate": 7.303322155691064e-05, + "loss": 2.8148, + "step": 19952 + }, + { + "epoch": 1.238624371469365, + "grad_norm": 0.15856312775590986, + "learning_rate": 7.30300160004899e-05, + "loss": 2.827, + "step": 19953 + }, + { + "epoch": 1.238686448569123, + "grad_norm": 0.15960883582533383, + "learning_rate": 7.302681032391597e-05, + "loss": 2.7814, + "step": 19954 + }, + { + "epoch": 1.2387485256688808, + "grad_norm": 0.1564499767432069, + "learning_rate": 7.302360452720559e-05, + "loss": 2.8367, + "step": 19955 + }, + { + "epoch": 1.2388106027686385, + "grad_norm": 0.15955615430235295, + "learning_rate": 7.302039861037546e-05, + "loss": 2.9382, + "step": 19956 + }, + { + "epoch": 1.2388726798683964, + "grad_norm": 0.162513235589349, + "learning_rate": 7.301719257344232e-05, + "loss": 2.8588, + "step": 19957 + }, + { + "epoch": 1.2389347569681544, + "grad_norm": 0.1527055637571211, + "learning_rate": 7.301398641642294e-05, + "loss": 2.8356, + "step": 19958 + }, + { + "epoch": 1.2389968340679123, + "grad_norm": 0.17246418449827267, + "learning_rate": 7.301078013933396e-05, + "loss": 2.7783, + "step": 19959 + }, + { + "epoch": 1.2390589111676702, + "grad_norm": 0.14943109053775122, + "learning_rate": 7.300757374219218e-05, + "loss": 2.8734, + "step": 19960 + }, + { + "epoch": 1.2391209882674281, + "grad_norm": 0.15201261797289414, + "learning_rate": 7.30043672250143e-05, + "loss": 2.8014, + "step": 19961 + }, + { + "epoch": 1.239183065367186, + "grad_norm": 0.17688106223187705, + "learning_rate": 7.300116058781703e-05, + "loss": 2.9169, + "step": 19962 + }, + { + "epoch": 1.239245142466944, + "grad_norm": 0.17489730789372554, + "learning_rate": 7.299795383061714e-05, + "loss": 2.8997, + "step": 19963 + }, + { + "epoch": 1.2393072195667019, + "grad_norm": 0.16559344634398812, + "learning_rate": 7.299474695343133e-05, + "loss": 2.8564, + "step": 19964 + }, + { + "epoch": 1.2393692966664598, + "grad_norm": 0.16313477316076813, + "learning_rate": 7.299153995627637e-05, + "loss": 2.9103, + "step": 19965 + }, + { + "epoch": 1.2394313737662177, + "grad_norm": 0.1536301379888788, + "learning_rate": 7.298833283916893e-05, + "loss": 2.9066, + "step": 19966 + }, + { + "epoch": 1.2394934508659756, + "grad_norm": 0.1450896559176071, + "learning_rate": 7.298512560212581e-05, + "loss": 2.829, + "step": 19967 + }, + { + "epoch": 1.2395555279657335, + "grad_norm": 0.14663361258753163, + "learning_rate": 7.298191824516369e-05, + "loss": 2.8663, + "step": 19968 + }, + { + "epoch": 1.2396176050654915, + "grad_norm": 0.16416002394961915, + "learning_rate": 7.297871076829932e-05, + "loss": 2.8326, + "step": 19969 + }, + { + "epoch": 1.2396796821652492, + "grad_norm": 0.1449733686346641, + "learning_rate": 7.297550317154946e-05, + "loss": 2.8428, + "step": 19970 + }, + { + "epoch": 1.239741759265007, + "grad_norm": 0.14359869898630764, + "learning_rate": 7.29722954549308e-05, + "loss": 2.9428, + "step": 19971 + }, + { + "epoch": 1.239803836364765, + "grad_norm": 0.15511256107068125, + "learning_rate": 7.296908761846011e-05, + "loss": 2.9209, + "step": 19972 + }, + { + "epoch": 1.239865913464523, + "grad_norm": 0.15939383348747652, + "learning_rate": 7.296587966215409e-05, + "loss": 2.9644, + "step": 19973 + }, + { + "epoch": 1.2399279905642808, + "grad_norm": 0.17963690667452917, + "learning_rate": 7.296267158602953e-05, + "loss": 2.8693, + "step": 19974 + }, + { + "epoch": 1.2399900676640387, + "grad_norm": 0.14658781214324865, + "learning_rate": 7.295946339010312e-05, + "loss": 2.8808, + "step": 19975 + }, + { + "epoch": 1.2400521447637967, + "grad_norm": 0.16634434524614525, + "learning_rate": 7.295625507439163e-05, + "loss": 2.8352, + "step": 19976 + }, + { + "epoch": 1.2401142218635546, + "grad_norm": 0.15371923121334163, + "learning_rate": 7.295304663891177e-05, + "loss": 2.855, + "step": 19977 + }, + { + "epoch": 1.2401762989633125, + "grad_norm": 0.14385028818575313, + "learning_rate": 7.294983808368032e-05, + "loss": 2.8687, + "step": 19978 + }, + { + "epoch": 1.2402383760630704, + "grad_norm": 0.17424476177033071, + "learning_rate": 7.294662940871397e-05, + "loss": 2.8531, + "step": 19979 + }, + { + "epoch": 1.240300453162828, + "grad_norm": 0.15978842458353779, + "learning_rate": 7.294342061402948e-05, + "loss": 2.8992, + "step": 19980 + }, + { + "epoch": 1.240362530262586, + "grad_norm": 0.16350156394621723, + "learning_rate": 7.29402116996436e-05, + "loss": 2.8208, + "step": 19981 + }, + { + "epoch": 1.240424607362344, + "grad_norm": 0.18705526895627472, + "learning_rate": 7.293700266557307e-05, + "loss": 2.9246, + "step": 19982 + }, + { + "epoch": 1.2404866844621019, + "grad_norm": 0.1765552645863377, + "learning_rate": 7.293379351183465e-05, + "loss": 2.7978, + "step": 19983 + }, + { + "epoch": 1.2405487615618598, + "grad_norm": 0.17103097953022237, + "learning_rate": 7.293058423844504e-05, + "loss": 2.9245, + "step": 19984 + }, + { + "epoch": 1.2406108386616177, + "grad_norm": 0.16312455959187538, + "learning_rate": 7.292737484542101e-05, + "loss": 2.8164, + "step": 19985 + }, + { + "epoch": 1.2406729157613756, + "grad_norm": 0.22692526352486644, + "learning_rate": 7.292416533277928e-05, + "loss": 2.9592, + "step": 19986 + }, + { + "epoch": 1.2407349928611335, + "grad_norm": 0.1514440033949306, + "learning_rate": 7.292095570053663e-05, + "loss": 2.8098, + "step": 19987 + }, + { + "epoch": 1.2407970699608915, + "grad_norm": 0.16482289959808738, + "learning_rate": 7.291774594870979e-05, + "loss": 2.7817, + "step": 19988 + }, + { + "epoch": 1.2408591470606494, + "grad_norm": 0.17046349354759527, + "learning_rate": 7.29145360773155e-05, + "loss": 2.8934, + "step": 19989 + }, + { + "epoch": 1.2409212241604073, + "grad_norm": 0.19012164118331099, + "learning_rate": 7.291132608637052e-05, + "loss": 2.8704, + "step": 19990 + }, + { + "epoch": 1.2409833012601652, + "grad_norm": 0.16033319258853057, + "learning_rate": 7.290811597589158e-05, + "loss": 2.8484, + "step": 19991 + }, + { + "epoch": 1.2410453783599231, + "grad_norm": 0.20504787766963536, + "learning_rate": 7.290490574589546e-05, + "loss": 2.8243, + "step": 19992 + }, + { + "epoch": 1.241107455459681, + "grad_norm": 0.1540656892918378, + "learning_rate": 7.290169539639887e-05, + "loss": 2.8197, + "step": 19993 + }, + { + "epoch": 1.2411695325594387, + "grad_norm": 0.19231367078988207, + "learning_rate": 7.289848492741859e-05, + "loss": 2.8075, + "step": 19994 + }, + { + "epoch": 1.2412316096591967, + "grad_norm": 0.16255230314070207, + "learning_rate": 7.289527433897133e-05, + "loss": 2.938, + "step": 19995 + }, + { + "epoch": 1.2412936867589546, + "grad_norm": 0.18189532358133592, + "learning_rate": 7.289206363107389e-05, + "loss": 2.9273, + "step": 19996 + }, + { + "epoch": 1.2413557638587125, + "grad_norm": 0.1742686518804124, + "learning_rate": 7.288885280374299e-05, + "loss": 2.8229, + "step": 19997 + }, + { + "epoch": 1.2414178409584704, + "grad_norm": 0.26993407678696846, + "learning_rate": 7.288564185699537e-05, + "loss": 2.8235, + "step": 19998 + }, + { + "epoch": 1.2414799180582283, + "grad_norm": 0.15957835654594246, + "learning_rate": 7.288243079084782e-05, + "loss": 2.8422, + "step": 19999 + }, + { + "epoch": 1.2415419951579862, + "grad_norm": 0.17129616369231804, + "learning_rate": 7.287921960531708e-05, + "loss": 2.8301, + "step": 20000 + }, + { + "epoch": 1.2416040722577442, + "grad_norm": 0.1667801922354875, + "learning_rate": 7.287600830041988e-05, + "loss": 2.8813, + "step": 20001 + }, + { + "epoch": 1.241666149357502, + "grad_norm": 0.1688940047484336, + "learning_rate": 7.2872796876173e-05, + "loss": 2.9339, + "step": 20002 + }, + { + "epoch": 1.24172822645726, + "grad_norm": 0.16005338877977138, + "learning_rate": 7.286958533259318e-05, + "loss": 2.8692, + "step": 20003 + }, + { + "epoch": 1.2417903035570177, + "grad_norm": 0.20334454739210406, + "learning_rate": 7.286637366969718e-05, + "loss": 2.8207, + "step": 20004 + }, + { + "epoch": 1.2418523806567756, + "grad_norm": 0.15711918119749746, + "learning_rate": 7.286316188750177e-05, + "loss": 2.8571, + "step": 20005 + }, + { + "epoch": 1.2419144577565335, + "grad_norm": 0.19876554774704258, + "learning_rate": 7.285994998602369e-05, + "loss": 2.7963, + "step": 20006 + }, + { + "epoch": 1.2419765348562914, + "grad_norm": 0.1630091000763652, + "learning_rate": 7.28567379652797e-05, + "loss": 2.9507, + "step": 20007 + }, + { + "epoch": 1.2420386119560494, + "grad_norm": 0.16233236418882066, + "learning_rate": 7.285352582528655e-05, + "loss": 2.866, + "step": 20008 + }, + { + "epoch": 1.2421006890558073, + "grad_norm": 0.15329577012655882, + "learning_rate": 7.285031356606101e-05, + "loss": 2.8949, + "step": 20009 + }, + { + "epoch": 1.2421627661555652, + "grad_norm": 0.16586238857447813, + "learning_rate": 7.284710118761986e-05, + "loss": 2.8464, + "step": 20010 + }, + { + "epoch": 1.2422248432553231, + "grad_norm": 0.15220601017258376, + "learning_rate": 7.284388868997981e-05, + "loss": 2.8358, + "step": 20011 + }, + { + "epoch": 1.242286920355081, + "grad_norm": 0.2193569210677954, + "learning_rate": 7.284067607315765e-05, + "loss": 2.9182, + "step": 20012 + }, + { + "epoch": 1.242348997454839, + "grad_norm": 0.1538689146891583, + "learning_rate": 7.283746333717015e-05, + "loss": 2.8331, + "step": 20013 + }, + { + "epoch": 1.2424110745545969, + "grad_norm": 0.1642570364045849, + "learning_rate": 7.283425048203405e-05, + "loss": 2.843, + "step": 20014 + }, + { + "epoch": 1.2424731516543548, + "grad_norm": 0.17189962037351364, + "learning_rate": 7.283103750776613e-05, + "loss": 2.8956, + "step": 20015 + }, + { + "epoch": 1.2425352287541127, + "grad_norm": 0.15318988306105502, + "learning_rate": 7.282782441438315e-05, + "loss": 2.8338, + "step": 20016 + }, + { + "epoch": 1.2425973058538706, + "grad_norm": 0.15613683536472983, + "learning_rate": 7.282461120190185e-05, + "loss": 2.8185, + "step": 20017 + }, + { + "epoch": 1.2426593829536283, + "grad_norm": 0.18906502446152243, + "learning_rate": 7.282139787033904e-05, + "loss": 2.8467, + "step": 20018 + }, + { + "epoch": 1.2427214600533862, + "grad_norm": 0.1733589752814974, + "learning_rate": 7.281818441971143e-05, + "loss": 2.9099, + "step": 20019 + }, + { + "epoch": 1.2427835371531442, + "grad_norm": 0.15271377863636337, + "learning_rate": 7.281497085003582e-05, + "loss": 2.957, + "step": 20020 + }, + { + "epoch": 1.242845614252902, + "grad_norm": 0.1551220893997939, + "learning_rate": 7.281175716132897e-05, + "loss": 2.8284, + "step": 20021 + }, + { + "epoch": 1.24290769135266, + "grad_norm": 0.14682360470211267, + "learning_rate": 7.280854335360764e-05, + "loss": 2.8641, + "step": 20022 + }, + { + "epoch": 1.242969768452418, + "grad_norm": 0.17095370897507667, + "learning_rate": 7.280532942688862e-05, + "loss": 2.8195, + "step": 20023 + }, + { + "epoch": 1.2430318455521758, + "grad_norm": 0.15214897199740074, + "learning_rate": 7.280211538118864e-05, + "loss": 2.8962, + "step": 20024 + }, + { + "epoch": 1.2430939226519337, + "grad_norm": 0.26582767664163515, + "learning_rate": 7.27989012165245e-05, + "loss": 2.8884, + "step": 20025 + }, + { + "epoch": 1.2431559997516917, + "grad_norm": 0.16163271029580348, + "learning_rate": 7.279568693291296e-05, + "loss": 2.8679, + "step": 20026 + }, + { + "epoch": 1.2432180768514496, + "grad_norm": 0.18905900486954785, + "learning_rate": 7.279247253037078e-05, + "loss": 2.8146, + "step": 20027 + }, + { + "epoch": 1.2432801539512073, + "grad_norm": 0.15969174759211696, + "learning_rate": 7.278925800891474e-05, + "loss": 2.8067, + "step": 20028 + }, + { + "epoch": 1.2433422310509652, + "grad_norm": 0.18912172335719138, + "learning_rate": 7.278604336856161e-05, + "loss": 2.7817, + "step": 20029 + }, + { + "epoch": 1.2434043081507231, + "grad_norm": 0.20330061107096611, + "learning_rate": 7.278282860932817e-05, + "loss": 2.9044, + "step": 20030 + }, + { + "epoch": 1.243466385250481, + "grad_norm": 0.19200990814357977, + "learning_rate": 7.277961373123115e-05, + "loss": 2.9259, + "step": 20031 + }, + { + "epoch": 1.243528462350239, + "grad_norm": 0.18603867054420883, + "learning_rate": 7.277639873428739e-05, + "loss": 2.9043, + "step": 20032 + }, + { + "epoch": 1.2435905394499969, + "grad_norm": 0.20192082809062312, + "learning_rate": 7.277318361851361e-05, + "loss": 2.9551, + "step": 20033 + }, + { + "epoch": 1.2436526165497548, + "grad_norm": 0.21098919517583714, + "learning_rate": 7.276996838392662e-05, + "loss": 2.9124, + "step": 20034 + }, + { + "epoch": 1.2437146936495127, + "grad_norm": 0.17665638832641561, + "learning_rate": 7.276675303054316e-05, + "loss": 2.8374, + "step": 20035 + }, + { + "epoch": 1.2437767707492706, + "grad_norm": 0.17036460879866122, + "learning_rate": 7.276353755838002e-05, + "loss": 2.9303, + "step": 20036 + }, + { + "epoch": 1.2438388478490285, + "grad_norm": 0.18938592328461462, + "learning_rate": 7.276032196745399e-05, + "loss": 2.7948, + "step": 20037 + }, + { + "epoch": 1.2439009249487865, + "grad_norm": 0.17250317416000446, + "learning_rate": 7.275710625778182e-05, + "loss": 2.8485, + "step": 20038 + }, + { + "epoch": 1.2439630020485444, + "grad_norm": 0.17415831847770402, + "learning_rate": 7.275389042938032e-05, + "loss": 2.9019, + "step": 20039 + }, + { + "epoch": 1.2440250791483023, + "grad_norm": 0.18349464257576492, + "learning_rate": 7.275067448226624e-05, + "loss": 2.9586, + "step": 20040 + }, + { + "epoch": 1.2440871562480602, + "grad_norm": 0.18080149397926418, + "learning_rate": 7.274745841645638e-05, + "loss": 2.9079, + "step": 20041 + }, + { + "epoch": 1.244149233347818, + "grad_norm": 0.18398192134767993, + "learning_rate": 7.27442422319675e-05, + "loss": 2.873, + "step": 20042 + }, + { + "epoch": 1.2442113104475758, + "grad_norm": 0.19551966171103297, + "learning_rate": 7.274102592881639e-05, + "loss": 2.8708, + "step": 20043 + }, + { + "epoch": 1.2442733875473337, + "grad_norm": 0.17370836547258184, + "learning_rate": 7.273780950701983e-05, + "loss": 2.8071, + "step": 20044 + }, + { + "epoch": 1.2443354646470917, + "grad_norm": 0.16460347676041048, + "learning_rate": 7.273459296659458e-05, + "loss": 2.8326, + "step": 20045 + }, + { + "epoch": 1.2443975417468496, + "grad_norm": 0.17009324169135548, + "learning_rate": 7.273137630755747e-05, + "loss": 2.8926, + "step": 20046 + }, + { + "epoch": 1.2444596188466075, + "grad_norm": 0.14995537978354528, + "learning_rate": 7.272815952992523e-05, + "loss": 2.7748, + "step": 20047 + }, + { + "epoch": 1.2445216959463654, + "grad_norm": 0.1775172657695619, + "learning_rate": 7.272494263371466e-05, + "loss": 2.8541, + "step": 20048 + }, + { + "epoch": 1.2445837730461233, + "grad_norm": 0.15353948271861848, + "learning_rate": 7.272172561894255e-05, + "loss": 2.8759, + "step": 20049 + }, + { + "epoch": 1.2446458501458812, + "grad_norm": 0.16918742741504658, + "learning_rate": 7.27185084856257e-05, + "loss": 2.8583, + "step": 20050 + }, + { + "epoch": 1.2447079272456392, + "grad_norm": 0.15169597111959918, + "learning_rate": 7.271529123378085e-05, + "loss": 2.914, + "step": 20051 + }, + { + "epoch": 1.2447700043453969, + "grad_norm": 0.15738440226161202, + "learning_rate": 7.271207386342482e-05, + "loss": 2.8369, + "step": 20052 + }, + { + "epoch": 1.2448320814451548, + "grad_norm": 0.14765940819244003, + "learning_rate": 7.270885637457439e-05, + "loss": 2.7739, + "step": 20053 + }, + { + "epoch": 1.2448941585449127, + "grad_norm": 0.17127725849779807, + "learning_rate": 7.270563876724635e-05, + "loss": 2.8719, + "step": 20054 + }, + { + "epoch": 1.2449562356446706, + "grad_norm": 0.15082475005100202, + "learning_rate": 7.270242104145747e-05, + "loss": 2.9173, + "step": 20055 + }, + { + "epoch": 1.2450183127444285, + "grad_norm": 0.16559572973361925, + "learning_rate": 7.269920319722454e-05, + "loss": 2.9058, + "step": 20056 + }, + { + "epoch": 1.2450803898441865, + "grad_norm": 0.17059173310333695, + "learning_rate": 7.269598523456438e-05, + "loss": 2.8879, + "step": 20057 + }, + { + "epoch": 1.2451424669439444, + "grad_norm": 0.1647169479701432, + "learning_rate": 7.269276715349374e-05, + "loss": 2.939, + "step": 20058 + }, + { + "epoch": 1.2452045440437023, + "grad_norm": 0.18100553666387809, + "learning_rate": 7.268954895402943e-05, + "loss": 2.8833, + "step": 20059 + }, + { + "epoch": 1.2452666211434602, + "grad_norm": 0.19046203142979617, + "learning_rate": 7.268633063618824e-05, + "loss": 2.8504, + "step": 20060 + }, + { + "epoch": 1.2453286982432181, + "grad_norm": 0.1600986644680085, + "learning_rate": 7.268311219998694e-05, + "loss": 2.7941, + "step": 20061 + }, + { + "epoch": 1.245390775342976, + "grad_norm": 0.16351952848409157, + "learning_rate": 7.267989364544235e-05, + "loss": 2.7904, + "step": 20062 + }, + { + "epoch": 1.245452852442734, + "grad_norm": 0.1450492815526713, + "learning_rate": 7.267667497257125e-05, + "loss": 2.8438, + "step": 20063 + }, + { + "epoch": 1.2455149295424919, + "grad_norm": 0.21435228836376577, + "learning_rate": 7.267345618139043e-05, + "loss": 2.7994, + "step": 20064 + }, + { + "epoch": 1.2455770066422498, + "grad_norm": 0.16800330050666418, + "learning_rate": 7.267023727191668e-05, + "loss": 2.7821, + "step": 20065 + }, + { + "epoch": 1.2456390837420075, + "grad_norm": 0.15610283835666564, + "learning_rate": 7.266701824416681e-05, + "loss": 2.8508, + "step": 20066 + }, + { + "epoch": 1.2457011608417654, + "grad_norm": 0.17612884380804197, + "learning_rate": 7.266379909815759e-05, + "loss": 2.8753, + "step": 20067 + }, + { + "epoch": 1.2457632379415233, + "grad_norm": 0.17135834526969343, + "learning_rate": 7.266057983390584e-05, + "loss": 2.8948, + "step": 20068 + }, + { + "epoch": 1.2458253150412812, + "grad_norm": 0.18069573202154163, + "learning_rate": 7.265736045142834e-05, + "loss": 2.7904, + "step": 20069 + }, + { + "epoch": 1.2458873921410392, + "grad_norm": 0.17725048637726934, + "learning_rate": 7.265414095074188e-05, + "loss": 2.9492, + "step": 20070 + }, + { + "epoch": 1.245949469240797, + "grad_norm": 0.1691820602673379, + "learning_rate": 7.265092133186328e-05, + "loss": 2.8089, + "step": 20071 + }, + { + "epoch": 1.246011546340555, + "grad_norm": 0.15836111468623737, + "learning_rate": 7.264770159480932e-05, + "loss": 2.988, + "step": 20072 + }, + { + "epoch": 1.246073623440313, + "grad_norm": 0.15228973555960426, + "learning_rate": 7.264448173959679e-05, + "loss": 2.8918, + "step": 20073 + }, + { + "epoch": 1.2461357005400708, + "grad_norm": 0.16479216961858253, + "learning_rate": 7.264126176624252e-05, + "loss": 2.8729, + "step": 20074 + }, + { + "epoch": 1.2461977776398288, + "grad_norm": 0.14641865209570576, + "learning_rate": 7.26380416747633e-05, + "loss": 2.8608, + "step": 20075 + }, + { + "epoch": 1.2462598547395864, + "grad_norm": 0.1833180122223042, + "learning_rate": 7.263482146517588e-05, + "loss": 2.8777, + "step": 20076 + }, + { + "epoch": 1.2463219318393444, + "grad_norm": 0.14986957353387684, + "learning_rate": 7.263160113749714e-05, + "loss": 2.9301, + "step": 20077 + }, + { + "epoch": 1.2463840089391023, + "grad_norm": 0.16942251757385834, + "learning_rate": 7.262838069174382e-05, + "loss": 2.8778, + "step": 20078 + }, + { + "epoch": 1.2464460860388602, + "grad_norm": 0.15627432276242317, + "learning_rate": 7.262516012793277e-05, + "loss": 2.8279, + "step": 20079 + }, + { + "epoch": 1.2465081631386181, + "grad_norm": 0.18607876484301483, + "learning_rate": 7.262193944608075e-05, + "loss": 2.8939, + "step": 20080 + }, + { + "epoch": 1.246570240238376, + "grad_norm": 0.15442535628328172, + "learning_rate": 7.261871864620457e-05, + "loss": 2.8606, + "step": 20081 + }, + { + "epoch": 1.246632317338134, + "grad_norm": 0.15399544885570643, + "learning_rate": 7.261549772832106e-05, + "loss": 2.8375, + "step": 20082 + }, + { + "epoch": 1.2466943944378919, + "grad_norm": 0.15443354601682613, + "learning_rate": 7.2612276692447e-05, + "loss": 2.9068, + "step": 20083 + }, + { + "epoch": 1.2467564715376498, + "grad_norm": 0.15480039022190728, + "learning_rate": 7.260905553859923e-05, + "loss": 2.8986, + "step": 20084 + }, + { + "epoch": 1.2468185486374077, + "grad_norm": 0.14873553772752474, + "learning_rate": 7.260583426679451e-05, + "loss": 2.892, + "step": 20085 + }, + { + "epoch": 1.2468806257371656, + "grad_norm": 0.1454518933983213, + "learning_rate": 7.260261287704965e-05, + "loss": 2.8183, + "step": 20086 + }, + { + "epoch": 1.2469427028369235, + "grad_norm": 0.159477055588804, + "learning_rate": 7.259939136938147e-05, + "loss": 2.7373, + "step": 20087 + }, + { + "epoch": 1.2470047799366815, + "grad_norm": 0.14791424655837374, + "learning_rate": 7.25961697438068e-05, + "loss": 2.8914, + "step": 20088 + }, + { + "epoch": 1.2470668570364394, + "grad_norm": 0.18550956752554984, + "learning_rate": 7.259294800034242e-05, + "loss": 2.9171, + "step": 20089 + }, + { + "epoch": 1.247128934136197, + "grad_norm": 0.16676886164718635, + "learning_rate": 7.258972613900515e-05, + "loss": 2.8677, + "step": 20090 + }, + { + "epoch": 1.247191011235955, + "grad_norm": 0.184998656055363, + "learning_rate": 7.258650415981178e-05, + "loss": 2.7391, + "step": 20091 + }, + { + "epoch": 1.247253088335713, + "grad_norm": 0.15345431503721615, + "learning_rate": 7.258328206277915e-05, + "loss": 2.8004, + "step": 20092 + }, + { + "epoch": 1.2473151654354708, + "grad_norm": 0.2370617883180274, + "learning_rate": 7.258005984792405e-05, + "loss": 2.9102, + "step": 20093 + }, + { + "epoch": 1.2473772425352287, + "grad_norm": 0.16121878786709556, + "learning_rate": 7.25768375152633e-05, + "loss": 2.9566, + "step": 20094 + }, + { + "epoch": 1.2474393196349867, + "grad_norm": 0.17122823772715284, + "learning_rate": 7.257361506481371e-05, + "loss": 2.9034, + "step": 20095 + }, + { + "epoch": 1.2475013967347446, + "grad_norm": 0.16639475446941707, + "learning_rate": 7.257039249659208e-05, + "loss": 2.9045, + "step": 20096 + }, + { + "epoch": 1.2475634738345025, + "grad_norm": 0.1754660057828187, + "learning_rate": 7.256716981061523e-05, + "loss": 2.8809, + "step": 20097 + }, + { + "epoch": 1.2476255509342604, + "grad_norm": 0.20510503501540367, + "learning_rate": 7.256394700689998e-05, + "loss": 2.8308, + "step": 20098 + }, + { + "epoch": 1.2476876280340183, + "grad_norm": 0.17832475637892298, + "learning_rate": 7.256072408546315e-05, + "loss": 2.7864, + "step": 20099 + }, + { + "epoch": 1.247749705133776, + "grad_norm": 0.17446630641647917, + "learning_rate": 7.255750104632156e-05, + "loss": 2.7702, + "step": 20100 + }, + { + "epoch": 1.247811782233534, + "grad_norm": 0.16359690039223662, + "learning_rate": 7.255427788949199e-05, + "loss": 2.8874, + "step": 20101 + }, + { + "epoch": 1.2478738593332919, + "grad_norm": 0.18008666690584205, + "learning_rate": 7.255105461499127e-05, + "loss": 2.8343, + "step": 20102 + }, + { + "epoch": 1.2479359364330498, + "grad_norm": 0.16997853244369868, + "learning_rate": 7.254783122283623e-05, + "loss": 2.963, + "step": 20103 + }, + { + "epoch": 1.2479980135328077, + "grad_norm": 0.22935044801679505, + "learning_rate": 7.254460771304368e-05, + "loss": 2.825, + "step": 20104 + }, + { + "epoch": 1.2480600906325656, + "grad_norm": 0.17075738262808832, + "learning_rate": 7.254138408563044e-05, + "loss": 2.8769, + "step": 20105 + }, + { + "epoch": 1.2481221677323235, + "grad_norm": 0.1542137320358196, + "learning_rate": 7.253816034061332e-05, + "loss": 2.8762, + "step": 20106 + }, + { + "epoch": 1.2481842448320815, + "grad_norm": 0.23842953929885893, + "learning_rate": 7.253493647800917e-05, + "loss": 2.8813, + "step": 20107 + }, + { + "epoch": 1.2482463219318394, + "grad_norm": 0.18200171890075875, + "learning_rate": 7.253171249783477e-05, + "loss": 2.935, + "step": 20108 + }, + { + "epoch": 1.2483083990315973, + "grad_norm": 0.16090435825687205, + "learning_rate": 7.252848840010696e-05, + "loss": 2.8703, + "step": 20109 + }, + { + "epoch": 1.2483704761313552, + "grad_norm": 0.18358165619970512, + "learning_rate": 7.252526418484254e-05, + "loss": 2.887, + "step": 20110 + }, + { + "epoch": 1.2484325532311131, + "grad_norm": 0.16082885857539594, + "learning_rate": 7.252203985205837e-05, + "loss": 2.8627, + "step": 20111 + }, + { + "epoch": 1.248494630330871, + "grad_norm": 0.18685697522836547, + "learning_rate": 7.251881540177125e-05, + "loss": 2.8355, + "step": 20112 + }, + { + "epoch": 1.248556707430629, + "grad_norm": 0.17658448019187103, + "learning_rate": 7.2515590833998e-05, + "loss": 2.8726, + "step": 20113 + }, + { + "epoch": 1.2486187845303867, + "grad_norm": 0.16930003770128202, + "learning_rate": 7.251236614875543e-05, + "loss": 2.8492, + "step": 20114 + }, + { + "epoch": 1.2486808616301446, + "grad_norm": 0.14971583565081245, + "learning_rate": 7.25091413460604e-05, + "loss": 2.8579, + "step": 20115 + }, + { + "epoch": 1.2487429387299025, + "grad_norm": 0.2042153967309892, + "learning_rate": 7.250591642592971e-05, + "loss": 2.8896, + "step": 20116 + }, + { + "epoch": 1.2488050158296604, + "grad_norm": 0.16971459845900247, + "learning_rate": 7.25026913883802e-05, + "loss": 2.8578, + "step": 20117 + }, + { + "epoch": 1.2488670929294183, + "grad_norm": 0.15146890729498955, + "learning_rate": 7.249946623342867e-05, + "loss": 2.8127, + "step": 20118 + }, + { + "epoch": 1.2489291700291762, + "grad_norm": 0.1628329547729146, + "learning_rate": 7.249624096109196e-05, + "loss": 2.9072, + "step": 20119 + }, + { + "epoch": 1.2489912471289342, + "grad_norm": 0.15476481788420943, + "learning_rate": 7.249301557138691e-05, + "loss": 2.7832, + "step": 20120 + }, + { + "epoch": 1.249053324228692, + "grad_norm": 0.16133455882509284, + "learning_rate": 7.248979006433033e-05, + "loss": 2.9232, + "step": 20121 + }, + { + "epoch": 1.24911540132845, + "grad_norm": 0.1667971838829879, + "learning_rate": 7.248656443993907e-05, + "loss": 2.8527, + "step": 20122 + }, + { + "epoch": 1.249177478428208, + "grad_norm": 0.169879891700924, + "learning_rate": 7.248333869822993e-05, + "loss": 2.8706, + "step": 20123 + }, + { + "epoch": 1.2492395555279656, + "grad_norm": 0.19776574558562704, + "learning_rate": 7.248011283921976e-05, + "loss": 2.845, + "step": 20124 + }, + { + "epoch": 1.2493016326277235, + "grad_norm": 0.18341255377412052, + "learning_rate": 7.24768868629254e-05, + "loss": 2.8779, + "step": 20125 + }, + { + "epoch": 1.2493637097274815, + "grad_norm": 0.19124866343324154, + "learning_rate": 7.247366076936363e-05, + "loss": 2.9154, + "step": 20126 + }, + { + "epoch": 1.2494257868272394, + "grad_norm": 0.18140707074948134, + "learning_rate": 7.247043455855135e-05, + "loss": 2.9203, + "step": 20127 + }, + { + "epoch": 1.2494878639269973, + "grad_norm": 0.15159367260432582, + "learning_rate": 7.246720823050534e-05, + "loss": 2.8148, + "step": 20128 + }, + { + "epoch": 1.2495499410267552, + "grad_norm": 0.18741472609872095, + "learning_rate": 7.246398178524245e-05, + "loss": 2.8321, + "step": 20129 + }, + { + "epoch": 1.2496120181265131, + "grad_norm": 0.16514695607852753, + "learning_rate": 7.246075522277952e-05, + "loss": 2.8614, + "step": 20130 + }, + { + "epoch": 1.249674095226271, + "grad_norm": 0.1692510495484349, + "learning_rate": 7.245752854313337e-05, + "loss": 2.9458, + "step": 20131 + }, + { + "epoch": 1.249736172326029, + "grad_norm": 0.18755300090649735, + "learning_rate": 7.245430174632085e-05, + "loss": 2.9209, + "step": 20132 + }, + { + "epoch": 1.2497982494257869, + "grad_norm": 0.19012800920677012, + "learning_rate": 7.245107483235879e-05, + "loss": 2.9521, + "step": 20133 + }, + { + "epoch": 1.2498603265255448, + "grad_norm": 0.18484738659929223, + "learning_rate": 7.244784780126401e-05, + "loss": 2.9544, + "step": 20134 + }, + { + "epoch": 1.2499224036253027, + "grad_norm": 0.1553309052363099, + "learning_rate": 7.244462065305338e-05, + "loss": 2.8936, + "step": 20135 + }, + { + "epoch": 1.2499844807250606, + "grad_norm": 0.16319259993094704, + "learning_rate": 7.244139338774369e-05, + "loss": 2.8424, + "step": 20136 + }, + { + "epoch": 1.2500465578248185, + "grad_norm": 0.17364485326668502, + "learning_rate": 7.24381660053518e-05, + "loss": 2.8107, + "step": 20137 + }, + { + "epoch": 1.2501086349245762, + "grad_norm": 0.17929842538932061, + "learning_rate": 7.243493850589457e-05, + "loss": 2.8949, + "step": 20138 + }, + { + "epoch": 1.2501707120243342, + "grad_norm": 0.1468389098994791, + "learning_rate": 7.243171088938882e-05, + "loss": 2.807, + "step": 20139 + }, + { + "epoch": 1.250232789124092, + "grad_norm": 0.18040804894887, + "learning_rate": 7.242848315585138e-05, + "loss": 2.822, + "step": 20140 + }, + { + "epoch": 1.25029486622385, + "grad_norm": 0.15220123129198107, + "learning_rate": 7.24252553052991e-05, + "loss": 2.7033, + "step": 20141 + }, + { + "epoch": 1.250356943323608, + "grad_norm": 0.15601154848330778, + "learning_rate": 7.242202733774882e-05, + "loss": 2.8606, + "step": 20142 + }, + { + "epoch": 1.2504190204233658, + "grad_norm": 0.19039966640049996, + "learning_rate": 7.24187992532174e-05, + "loss": 2.8407, + "step": 20143 + }, + { + "epoch": 1.2504810975231238, + "grad_norm": 0.1505360086396466, + "learning_rate": 7.241557105172163e-05, + "loss": 2.8927, + "step": 20144 + }, + { + "epoch": 1.2505431746228817, + "grad_norm": 0.15893644341422503, + "learning_rate": 7.241234273327839e-05, + "loss": 2.919, + "step": 20145 + }, + { + "epoch": 1.2506052517226396, + "grad_norm": 0.1689117651911482, + "learning_rate": 7.240911429790452e-05, + "loss": 2.7798, + "step": 20146 + }, + { + "epoch": 1.2506673288223973, + "grad_norm": 0.17479341353391117, + "learning_rate": 7.240588574561688e-05, + "loss": 2.8326, + "step": 20147 + }, + { + "epoch": 1.2507294059221552, + "grad_norm": 0.16913580766473568, + "learning_rate": 7.240265707643226e-05, + "loss": 2.8665, + "step": 20148 + }, + { + "epoch": 1.2507914830219131, + "grad_norm": 0.20335827783559007, + "learning_rate": 7.239942829036758e-05, + "loss": 2.8809, + "step": 20149 + }, + { + "epoch": 1.250853560121671, + "grad_norm": 0.1648215999908291, + "learning_rate": 7.239619938743961e-05, + "loss": 2.7765, + "step": 20150 + }, + { + "epoch": 1.250915637221429, + "grad_norm": 0.17153711051193415, + "learning_rate": 7.239297036766526e-05, + "loss": 2.8511, + "step": 20151 + }, + { + "epoch": 1.2509777143211869, + "grad_norm": 0.17537053126924587, + "learning_rate": 7.238974123106133e-05, + "loss": 2.8276, + "step": 20152 + }, + { + "epoch": 1.2510397914209448, + "grad_norm": 0.2073738664000548, + "learning_rate": 7.238651197764468e-05, + "loss": 2.8104, + "step": 20153 + }, + { + "epoch": 1.2511018685207027, + "grad_norm": 0.2214130024226907, + "learning_rate": 7.238328260743217e-05, + "loss": 2.8921, + "step": 20154 + }, + { + "epoch": 1.2511639456204606, + "grad_norm": 0.15999438084108014, + "learning_rate": 7.238005312044063e-05, + "loss": 2.8902, + "step": 20155 + }, + { + "epoch": 1.2512260227202185, + "grad_norm": 0.20430648591463427, + "learning_rate": 7.237682351668694e-05, + "loss": 2.9051, + "step": 20156 + }, + { + "epoch": 1.2512880998199765, + "grad_norm": 0.18036826378833362, + "learning_rate": 7.237359379618791e-05, + "loss": 2.8385, + "step": 20157 + }, + { + "epoch": 1.2513501769197344, + "grad_norm": 0.17191995227955123, + "learning_rate": 7.237036395896044e-05, + "loss": 2.8644, + "step": 20158 + }, + { + "epoch": 1.2514122540194923, + "grad_norm": 0.19820847248085607, + "learning_rate": 7.23671340050213e-05, + "loss": 2.8916, + "step": 20159 + }, + { + "epoch": 1.2514743311192502, + "grad_norm": 0.17104065563338305, + "learning_rate": 7.236390393438745e-05, + "loss": 2.8849, + "step": 20160 + }, + { + "epoch": 1.2515364082190081, + "grad_norm": 0.15906877105319756, + "learning_rate": 7.236067374707565e-05, + "loss": 2.8957, + "step": 20161 + }, + { + "epoch": 1.2515984853187658, + "grad_norm": 0.19137842598918822, + "learning_rate": 7.235744344310279e-05, + "loss": 2.8771, + "step": 20162 + }, + { + "epoch": 1.2516605624185237, + "grad_norm": 0.15739737289234537, + "learning_rate": 7.235421302248572e-05, + "loss": 2.8227, + "step": 20163 + }, + { + "epoch": 1.2517226395182817, + "grad_norm": 0.17591454432388, + "learning_rate": 7.23509824852413e-05, + "loss": 2.8697, + "step": 20164 + }, + { + "epoch": 1.2517847166180396, + "grad_norm": 0.2121611091230378, + "learning_rate": 7.234775183138638e-05, + "loss": 2.9734, + "step": 20165 + }, + { + "epoch": 1.2518467937177975, + "grad_norm": 0.3163881061202564, + "learning_rate": 7.23445210609378e-05, + "loss": 2.8628, + "step": 20166 + }, + { + "epoch": 1.2519088708175554, + "grad_norm": 0.19139675194665975, + "learning_rate": 7.234129017391245e-05, + "loss": 2.9263, + "step": 20167 + }, + { + "epoch": 1.2519709479173133, + "grad_norm": 0.23969538632167292, + "learning_rate": 7.233805917032715e-05, + "loss": 2.9005, + "step": 20168 + }, + { + "epoch": 1.2520330250170713, + "grad_norm": 0.17502404946028366, + "learning_rate": 7.233482805019878e-05, + "loss": 2.7956, + "step": 20169 + }, + { + "epoch": 1.2520951021168292, + "grad_norm": 0.17136833983556268, + "learning_rate": 7.233159681354419e-05, + "loss": 2.7674, + "step": 20170 + }, + { + "epoch": 1.2521571792165869, + "grad_norm": 0.17634224382595876, + "learning_rate": 7.232836546038022e-05, + "loss": 2.8847, + "step": 20171 + }, + { + "epoch": 1.2522192563163448, + "grad_norm": 0.1977315065860877, + "learning_rate": 7.232513399072377e-05, + "loss": 2.8742, + "step": 20172 + }, + { + "epoch": 1.2522813334161027, + "grad_norm": 0.1799078813080994, + "learning_rate": 7.232190240459167e-05, + "loss": 2.904, + "step": 20173 + }, + { + "epoch": 1.2523434105158606, + "grad_norm": 0.18875324121749118, + "learning_rate": 7.23186707020008e-05, + "loss": 2.845, + "step": 20174 + }, + { + "epoch": 1.2524054876156185, + "grad_norm": 0.164628471802068, + "learning_rate": 7.231543888296797e-05, + "loss": 2.9153, + "step": 20175 + }, + { + "epoch": 1.2524675647153765, + "grad_norm": 0.16663350517954909, + "learning_rate": 7.231220694751011e-05, + "loss": 2.7967, + "step": 20176 + }, + { + "epoch": 1.2525296418151344, + "grad_norm": 0.16520156421860357, + "learning_rate": 7.230897489564403e-05, + "loss": 2.9294, + "step": 20177 + }, + { + "epoch": 1.2525917189148923, + "grad_norm": 0.16190925563399888, + "learning_rate": 7.230574272738664e-05, + "loss": 2.8866, + "step": 20178 + }, + { + "epoch": 1.2526537960146502, + "grad_norm": 0.16602743331202754, + "learning_rate": 7.230251044275476e-05, + "loss": 2.8419, + "step": 20179 + }, + { + "epoch": 1.2527158731144081, + "grad_norm": 0.1771810007501239, + "learning_rate": 7.229927804176526e-05, + "loss": 2.8422, + "step": 20180 + }, + { + "epoch": 1.252777950214166, + "grad_norm": 0.1981392861128436, + "learning_rate": 7.229604552443502e-05, + "loss": 2.8242, + "step": 20181 + }, + { + "epoch": 1.252840027313924, + "grad_norm": 0.19053766172812994, + "learning_rate": 7.22928128907809e-05, + "loss": 2.8712, + "step": 20182 + }, + { + "epoch": 1.2529021044136819, + "grad_norm": 0.15228303897800236, + "learning_rate": 7.228958014081975e-05, + "loss": 2.8228, + "step": 20183 + }, + { + "epoch": 1.2529641815134398, + "grad_norm": 0.21101495415737342, + "learning_rate": 7.228634727456847e-05, + "loss": 2.9076, + "step": 20184 + }, + { + "epoch": 1.2530262586131977, + "grad_norm": 0.18241067929597976, + "learning_rate": 7.22831142920439e-05, + "loss": 2.8923, + "step": 20185 + }, + { + "epoch": 1.2530883357129554, + "grad_norm": 0.15911441471030566, + "learning_rate": 7.22798811932629e-05, + "loss": 2.8787, + "step": 20186 + }, + { + "epoch": 1.2531504128127133, + "grad_norm": 0.18534072667436047, + "learning_rate": 7.227664797824236e-05, + "loss": 2.8825, + "step": 20187 + }, + { + "epoch": 1.2532124899124713, + "grad_norm": 0.16850364150801547, + "learning_rate": 7.227341464699914e-05, + "loss": 2.8272, + "step": 20188 + }, + { + "epoch": 1.2532745670122292, + "grad_norm": 0.17300181135775616, + "learning_rate": 7.22701811995501e-05, + "loss": 2.7094, + "step": 20189 + }, + { + "epoch": 1.253336644111987, + "grad_norm": 0.1651488396128866, + "learning_rate": 7.226694763591214e-05, + "loss": 2.7819, + "step": 20190 + }, + { + "epoch": 1.253398721211745, + "grad_norm": 0.1774326320194075, + "learning_rate": 7.226371395610208e-05, + "loss": 2.8229, + "step": 20191 + }, + { + "epoch": 1.253460798311503, + "grad_norm": 0.18790677768487904, + "learning_rate": 7.226048016013685e-05, + "loss": 2.9529, + "step": 20192 + }, + { + "epoch": 1.2535228754112608, + "grad_norm": 0.1606378988734648, + "learning_rate": 7.225724624803327e-05, + "loss": 2.8653, + "step": 20193 + }, + { + "epoch": 1.2535849525110188, + "grad_norm": 0.17219645069314346, + "learning_rate": 7.225401221980826e-05, + "loss": 2.8819, + "step": 20194 + }, + { + "epoch": 1.2536470296107765, + "grad_norm": 0.16158168000974213, + "learning_rate": 7.225077807547863e-05, + "loss": 2.9333, + "step": 20195 + }, + { + "epoch": 1.2537091067105344, + "grad_norm": 0.17848681378078035, + "learning_rate": 7.224754381506132e-05, + "loss": 2.883, + "step": 20196 + }, + { + "epoch": 1.2537711838102923, + "grad_norm": 0.19535569666969538, + "learning_rate": 7.224430943857315e-05, + "loss": 2.9491, + "step": 20197 + }, + { + "epoch": 1.2538332609100502, + "grad_norm": 0.162036684123543, + "learning_rate": 7.224107494603103e-05, + "loss": 2.9553, + "step": 20198 + }, + { + "epoch": 1.2538953380098081, + "grad_norm": 0.16161295706098436, + "learning_rate": 7.223784033745183e-05, + "loss": 2.7932, + "step": 20199 + }, + { + "epoch": 1.253957415109566, + "grad_norm": 0.17617276354309797, + "learning_rate": 7.223460561285239e-05, + "loss": 2.8412, + "step": 20200 + }, + { + "epoch": 1.254019492209324, + "grad_norm": 0.184903847605778, + "learning_rate": 7.223137077224963e-05, + "loss": 2.8957, + "step": 20201 + }, + { + "epoch": 1.2540815693090819, + "grad_norm": 0.16050896533469433, + "learning_rate": 7.222813581566041e-05, + "loss": 2.7935, + "step": 20202 + }, + { + "epoch": 1.2541436464088398, + "grad_norm": 0.16467080422522123, + "learning_rate": 7.222490074310163e-05, + "loss": 2.8321, + "step": 20203 + }, + { + "epoch": 1.2542057235085977, + "grad_norm": 0.17018862318262998, + "learning_rate": 7.222166555459012e-05, + "loss": 2.8396, + "step": 20204 + }, + { + "epoch": 1.2542678006083556, + "grad_norm": 0.1623331559114234, + "learning_rate": 7.221843025014278e-05, + "loss": 2.9228, + "step": 20205 + }, + { + "epoch": 1.2543298777081135, + "grad_norm": 0.16291116155065644, + "learning_rate": 7.221519482977654e-05, + "loss": 2.9613, + "step": 20206 + }, + { + "epoch": 1.2543919548078715, + "grad_norm": 0.17287859732208105, + "learning_rate": 7.221195929350819e-05, + "loss": 2.8978, + "step": 20207 + }, + { + "epoch": 1.2544540319076294, + "grad_norm": 0.17558835890376512, + "learning_rate": 7.22087236413547e-05, + "loss": 2.8755, + "step": 20208 + }, + { + "epoch": 1.2545161090073873, + "grad_norm": 0.16721911376567092, + "learning_rate": 7.220548787333287e-05, + "loss": 2.9842, + "step": 20209 + }, + { + "epoch": 1.254578186107145, + "grad_norm": 0.16663096214353415, + "learning_rate": 7.220225198945962e-05, + "loss": 2.8277, + "step": 20210 + }, + { + "epoch": 1.254640263206903, + "grad_norm": 0.1699639082035072, + "learning_rate": 7.219901598975184e-05, + "loss": 2.9208, + "step": 20211 + }, + { + "epoch": 1.2547023403066608, + "grad_norm": 0.16105160834463608, + "learning_rate": 7.219577987422642e-05, + "loss": 2.7897, + "step": 20212 + }, + { + "epoch": 1.2547644174064188, + "grad_norm": 0.19220872932353048, + "learning_rate": 7.219254364290022e-05, + "loss": 2.8691, + "step": 20213 + }, + { + "epoch": 1.2548264945061767, + "grad_norm": 0.1584543070822882, + "learning_rate": 7.218930729579011e-05, + "loss": 2.921, + "step": 20214 + }, + { + "epoch": 1.2548885716059346, + "grad_norm": 0.16636663802683818, + "learning_rate": 7.218607083291304e-05, + "loss": 2.8743, + "step": 20215 + }, + { + "epoch": 1.2549506487056925, + "grad_norm": 0.2365940032829649, + "learning_rate": 7.218283425428582e-05, + "loss": 2.7875, + "step": 20216 + }, + { + "epoch": 1.2550127258054504, + "grad_norm": 0.16569094042513893, + "learning_rate": 7.217959755992538e-05, + "loss": 2.8406, + "step": 20217 + }, + { + "epoch": 1.2550748029052083, + "grad_norm": 0.17464306888165596, + "learning_rate": 7.21763607498486e-05, + "loss": 2.9091, + "step": 20218 + }, + { + "epoch": 1.255136880004966, + "grad_norm": 0.17191554263257205, + "learning_rate": 7.217312382407234e-05, + "loss": 2.8962, + "step": 20219 + }, + { + "epoch": 1.255198957104724, + "grad_norm": 0.1482707015044946, + "learning_rate": 7.216988678261352e-05, + "loss": 2.9278, + "step": 20220 + }, + { + "epoch": 1.2552610342044819, + "grad_norm": 0.16165668406853176, + "learning_rate": 7.216664962548902e-05, + "loss": 2.8774, + "step": 20221 + }, + { + "epoch": 1.2553231113042398, + "grad_norm": 0.1532729191027838, + "learning_rate": 7.216341235271574e-05, + "loss": 2.8034, + "step": 20222 + }, + { + "epoch": 1.2553851884039977, + "grad_norm": 0.16116132772885206, + "learning_rate": 7.216017496431055e-05, + "loss": 2.8698, + "step": 20223 + }, + { + "epoch": 1.2554472655037556, + "grad_norm": 0.18488570418394507, + "learning_rate": 7.215693746029034e-05, + "loss": 2.8086, + "step": 20224 + }, + { + "epoch": 1.2555093426035135, + "grad_norm": 0.16971948233565662, + "learning_rate": 7.215369984067201e-05, + "loss": 2.906, + "step": 20225 + }, + { + "epoch": 1.2555714197032715, + "grad_norm": 0.15918325057459262, + "learning_rate": 7.215046210547248e-05, + "loss": 2.9385, + "step": 20226 + }, + { + "epoch": 1.2556334968030294, + "grad_norm": 0.1510596257801634, + "learning_rate": 7.214722425470857e-05, + "loss": 2.9313, + "step": 20227 + }, + { + "epoch": 1.2556955739027873, + "grad_norm": 0.1745890347369959, + "learning_rate": 7.214398628839723e-05, + "loss": 2.863, + "step": 20228 + }, + { + "epoch": 1.2557576510025452, + "grad_norm": 0.19877710193611303, + "learning_rate": 7.214074820655534e-05, + "loss": 2.8618, + "step": 20229 + }, + { + "epoch": 1.2558197281023031, + "grad_norm": 0.14869901558714374, + "learning_rate": 7.21375100091998e-05, + "loss": 2.8499, + "step": 20230 + }, + { + "epoch": 1.255881805202061, + "grad_norm": 0.1745626169362071, + "learning_rate": 7.213427169634748e-05, + "loss": 2.8524, + "step": 20231 + }, + { + "epoch": 1.255943882301819, + "grad_norm": 0.1891633994041583, + "learning_rate": 7.21310332680153e-05, + "loss": 2.9093, + "step": 20232 + }, + { + "epoch": 1.2560059594015769, + "grad_norm": 0.14750348670927893, + "learning_rate": 7.212779472422013e-05, + "loss": 2.8691, + "step": 20233 + }, + { + "epoch": 1.2560680365013346, + "grad_norm": 0.16958346046350042, + "learning_rate": 7.212455606497891e-05, + "loss": 2.9253, + "step": 20234 + }, + { + "epoch": 1.2561301136010925, + "grad_norm": 0.15870939299545234, + "learning_rate": 7.21213172903085e-05, + "loss": 2.8714, + "step": 20235 + }, + { + "epoch": 1.2561921907008504, + "grad_norm": 0.16158861369813537, + "learning_rate": 7.21180784002258e-05, + "loss": 2.85, + "step": 20236 + }, + { + "epoch": 1.2562542678006083, + "grad_norm": 0.16951198028885353, + "learning_rate": 7.211483939474773e-05, + "loss": 2.8963, + "step": 20237 + }, + { + "epoch": 1.2563163449003663, + "grad_norm": 0.1738512206004144, + "learning_rate": 7.211160027389116e-05, + "loss": 2.9724, + "step": 20238 + }, + { + "epoch": 1.2563784220001242, + "grad_norm": 0.16598362333739258, + "learning_rate": 7.2108361037673e-05, + "loss": 2.8434, + "step": 20239 + }, + { + "epoch": 1.256440499099882, + "grad_norm": 0.16651253663905927, + "learning_rate": 7.210512168611018e-05, + "loss": 2.7962, + "step": 20240 + }, + { + "epoch": 1.25650257619964, + "grad_norm": 0.15549982110278457, + "learning_rate": 7.210188221921955e-05, + "loss": 2.834, + "step": 20241 + }, + { + "epoch": 1.256564653299398, + "grad_norm": 0.17017865236667412, + "learning_rate": 7.209864263701805e-05, + "loss": 2.8376, + "step": 20242 + }, + { + "epoch": 1.2566267303991556, + "grad_norm": 0.168565284113015, + "learning_rate": 7.209540293952255e-05, + "loss": 2.8642, + "step": 20243 + }, + { + "epoch": 1.2566888074989135, + "grad_norm": 0.18145919270390415, + "learning_rate": 7.209216312674998e-05, + "loss": 2.8843, + "step": 20244 + }, + { + "epoch": 1.2567508845986715, + "grad_norm": 0.15765136882269093, + "learning_rate": 7.208892319871723e-05, + "loss": 2.8771, + "step": 20245 + }, + { + "epoch": 1.2568129616984294, + "grad_norm": 0.17869728328051862, + "learning_rate": 7.20856831554412e-05, + "loss": 2.8232, + "step": 20246 + }, + { + "epoch": 1.2568750387981873, + "grad_norm": 0.14763233505828297, + "learning_rate": 7.20824429969388e-05, + "loss": 2.787, + "step": 20247 + }, + { + "epoch": 1.2569371158979452, + "grad_norm": 0.1697623117547536, + "learning_rate": 7.207920272322696e-05, + "loss": 2.8314, + "step": 20248 + }, + { + "epoch": 1.2569991929977031, + "grad_norm": 0.16718397225909515, + "learning_rate": 7.207596233432252e-05, + "loss": 2.8434, + "step": 20249 + }, + { + "epoch": 1.257061270097461, + "grad_norm": 0.15450359331314892, + "learning_rate": 7.207272183024245e-05, + "loss": 2.832, + "step": 20250 + }, + { + "epoch": 1.257123347197219, + "grad_norm": 0.14839800510007115, + "learning_rate": 7.206948121100363e-05, + "loss": 2.9046, + "step": 20251 + }, + { + "epoch": 1.2571854242969769, + "grad_norm": 0.20450848074550027, + "learning_rate": 7.206624047662296e-05, + "loss": 2.8641, + "step": 20252 + }, + { + "epoch": 1.2572475013967348, + "grad_norm": 0.16095238612132876, + "learning_rate": 7.206299962711737e-05, + "loss": 2.8354, + "step": 20253 + }, + { + "epoch": 1.2573095784964927, + "grad_norm": 0.15035698873708914, + "learning_rate": 7.205975866250375e-05, + "loss": 2.8534, + "step": 20254 + }, + { + "epoch": 1.2573716555962506, + "grad_norm": 0.16023607125725917, + "learning_rate": 7.2056517582799e-05, + "loss": 2.9175, + "step": 20255 + }, + { + "epoch": 1.2574337326960086, + "grad_norm": 0.1998676223305803, + "learning_rate": 7.205327638802004e-05, + "loss": 2.8282, + "step": 20256 + }, + { + "epoch": 1.2574958097957665, + "grad_norm": 0.1521445341184909, + "learning_rate": 7.205003507818381e-05, + "loss": 2.8176, + "step": 20257 + }, + { + "epoch": 1.2575578868955242, + "grad_norm": 0.14999437768651352, + "learning_rate": 7.204679365330717e-05, + "loss": 2.7677, + "step": 20258 + }, + { + "epoch": 1.257619963995282, + "grad_norm": 0.15566206228390272, + "learning_rate": 7.204355211340708e-05, + "loss": 2.8634, + "step": 20259 + }, + { + "epoch": 1.25768204109504, + "grad_norm": 0.16005870127834318, + "learning_rate": 7.204031045850041e-05, + "loss": 2.8401, + "step": 20260 + }, + { + "epoch": 1.257744118194798, + "grad_norm": 0.1702960378232693, + "learning_rate": 7.20370686886041e-05, + "loss": 2.8134, + "step": 20261 + }, + { + "epoch": 1.2578061952945558, + "grad_norm": 0.16779446986284816, + "learning_rate": 7.203382680373505e-05, + "loss": 2.9459, + "step": 20262 + }, + { + "epoch": 1.2578682723943138, + "grad_norm": 0.1614426148874785, + "learning_rate": 7.203058480391016e-05, + "loss": 2.8126, + "step": 20263 + }, + { + "epoch": 1.2579303494940717, + "grad_norm": 0.15619756295559598, + "learning_rate": 7.202734268914638e-05, + "loss": 2.7993, + "step": 20264 + }, + { + "epoch": 1.2579924265938296, + "grad_norm": 0.16754568959828645, + "learning_rate": 7.20241004594606e-05, + "loss": 2.8361, + "step": 20265 + }, + { + "epoch": 1.2580545036935875, + "grad_norm": 0.14589310525242566, + "learning_rate": 7.202085811486975e-05, + "loss": 2.8918, + "step": 20266 + }, + { + "epoch": 1.2581165807933452, + "grad_norm": 0.15544520779738585, + "learning_rate": 7.201761565539072e-05, + "loss": 2.8585, + "step": 20267 + }, + { + "epoch": 1.2581786578931031, + "grad_norm": 0.15753351777352706, + "learning_rate": 7.201437308104046e-05, + "loss": 2.8757, + "step": 20268 + }, + { + "epoch": 1.258240734992861, + "grad_norm": 0.15486388505733042, + "learning_rate": 7.201113039183586e-05, + "loss": 2.8068, + "step": 20269 + }, + { + "epoch": 1.258302812092619, + "grad_norm": 0.15340562135773922, + "learning_rate": 7.200788758779385e-05, + "loss": 2.8342, + "step": 20270 + }, + { + "epoch": 1.2583648891923769, + "grad_norm": 0.14336122724703348, + "learning_rate": 7.200464466893135e-05, + "loss": 2.9043, + "step": 20271 + }, + { + "epoch": 1.2584269662921348, + "grad_norm": 0.20640961871693775, + "learning_rate": 7.200140163526528e-05, + "loss": 2.8398, + "step": 20272 + }, + { + "epoch": 1.2584890433918927, + "grad_norm": 0.15613794481070417, + "learning_rate": 7.199815848681257e-05, + "loss": 2.8498, + "step": 20273 + }, + { + "epoch": 1.2585511204916506, + "grad_norm": 0.17838546423435736, + "learning_rate": 7.19949152235901e-05, + "loss": 2.8274, + "step": 20274 + }, + { + "epoch": 1.2586131975914086, + "grad_norm": 0.14791571360491468, + "learning_rate": 7.199167184561484e-05, + "loss": 2.7712, + "step": 20275 + }, + { + "epoch": 1.2586752746911665, + "grad_norm": 0.17459880169595726, + "learning_rate": 7.198842835290367e-05, + "loss": 2.9296, + "step": 20276 + }, + { + "epoch": 1.2587373517909244, + "grad_norm": 0.21252907666257748, + "learning_rate": 7.198518474547354e-05, + "loss": 2.8146, + "step": 20277 + }, + { + "epoch": 1.2587994288906823, + "grad_norm": 0.18376051653720585, + "learning_rate": 7.198194102334138e-05, + "loss": 2.8135, + "step": 20278 + }, + { + "epoch": 1.2588615059904402, + "grad_norm": 0.18140570513161278, + "learning_rate": 7.197869718652407e-05, + "loss": 2.7521, + "step": 20279 + }, + { + "epoch": 1.2589235830901981, + "grad_norm": 0.16555946736062174, + "learning_rate": 7.197545323503859e-05, + "loss": 2.8395, + "step": 20280 + }, + { + "epoch": 1.258985660189956, + "grad_norm": 0.15025806157951235, + "learning_rate": 7.197220916890181e-05, + "loss": 2.8711, + "step": 20281 + }, + { + "epoch": 1.2590477372897138, + "grad_norm": 0.16393937872509315, + "learning_rate": 7.196896498813069e-05, + "loss": 2.8485, + "step": 20282 + }, + { + "epoch": 1.2591098143894717, + "grad_norm": 0.20155054397017505, + "learning_rate": 7.196572069274214e-05, + "loss": 2.9222, + "step": 20283 + }, + { + "epoch": 1.2591718914892296, + "grad_norm": 0.17535187945355318, + "learning_rate": 7.19624762827531e-05, + "loss": 2.8646, + "step": 20284 + }, + { + "epoch": 1.2592339685889875, + "grad_norm": 0.1669282511959395, + "learning_rate": 7.195923175818048e-05, + "loss": 2.9541, + "step": 20285 + }, + { + "epoch": 1.2592960456887454, + "grad_norm": 0.15793586455631717, + "learning_rate": 7.195598711904124e-05, + "loss": 2.937, + "step": 20286 + }, + { + "epoch": 1.2593581227885033, + "grad_norm": 0.16214690291106565, + "learning_rate": 7.195274236535226e-05, + "loss": 2.8393, + "step": 20287 + }, + { + "epoch": 1.2594201998882613, + "grad_norm": 0.18409846305868682, + "learning_rate": 7.194949749713051e-05, + "loss": 2.945, + "step": 20288 + }, + { + "epoch": 1.2594822769880192, + "grad_norm": 0.17647452863537608, + "learning_rate": 7.19462525143929e-05, + "loss": 2.9081, + "step": 20289 + }, + { + "epoch": 1.259544354087777, + "grad_norm": 0.15629594842509698, + "learning_rate": 7.194300741715637e-05, + "loss": 2.8953, + "step": 20290 + }, + { + "epoch": 1.2596064311875348, + "grad_norm": 0.17641174834934667, + "learning_rate": 7.193976220543783e-05, + "loss": 2.7523, + "step": 20291 + }, + { + "epoch": 1.2596685082872927, + "grad_norm": 0.15617071762915585, + "learning_rate": 7.193651687925422e-05, + "loss": 2.9417, + "step": 20292 + }, + { + "epoch": 1.2597305853870506, + "grad_norm": 0.19156549602300155, + "learning_rate": 7.193327143862249e-05, + "loss": 2.809, + "step": 20293 + }, + { + "epoch": 1.2597926624868085, + "grad_norm": 0.1520046776742731, + "learning_rate": 7.193002588355955e-05, + "loss": 2.7936, + "step": 20294 + }, + { + "epoch": 1.2598547395865665, + "grad_norm": 0.18248879922500616, + "learning_rate": 7.192678021408236e-05, + "loss": 2.8553, + "step": 20295 + }, + { + "epoch": 1.2599168166863244, + "grad_norm": 0.14945895324725053, + "learning_rate": 7.192353443020782e-05, + "loss": 2.8867, + "step": 20296 + }, + { + "epoch": 1.2599788937860823, + "grad_norm": 0.15409748860088282, + "learning_rate": 7.192028853195288e-05, + "loss": 2.8569, + "step": 20297 + }, + { + "epoch": 1.2600409708858402, + "grad_norm": 0.15018328704116066, + "learning_rate": 7.191704251933447e-05, + "loss": 2.8776, + "step": 20298 + }, + { + "epoch": 1.2601030479855981, + "grad_norm": 0.21052744533367557, + "learning_rate": 7.191379639236953e-05, + "loss": 2.7601, + "step": 20299 + }, + { + "epoch": 1.260165125085356, + "grad_norm": 0.20592462131826195, + "learning_rate": 7.191055015107499e-05, + "loss": 2.8827, + "step": 20300 + }, + { + "epoch": 1.260227202185114, + "grad_norm": 0.18196071600731617, + "learning_rate": 7.19073037954678e-05, + "loss": 2.8006, + "step": 20301 + }, + { + "epoch": 1.2602892792848719, + "grad_norm": 0.21091096322596206, + "learning_rate": 7.190405732556488e-05, + "loss": 2.8828, + "step": 20302 + }, + { + "epoch": 1.2603513563846298, + "grad_norm": 0.232499133821125, + "learning_rate": 7.190081074138318e-05, + "loss": 2.905, + "step": 20303 + }, + { + "epoch": 1.2604134334843877, + "grad_norm": 0.15377322198951246, + "learning_rate": 7.189756404293964e-05, + "loss": 2.8852, + "step": 20304 + }, + { + "epoch": 1.2604755105841456, + "grad_norm": 0.19978544565810655, + "learning_rate": 7.189431723025118e-05, + "loss": 2.8282, + "step": 20305 + }, + { + "epoch": 1.2605375876839033, + "grad_norm": 0.15961123400523758, + "learning_rate": 7.189107030333475e-05, + "loss": 3.0039, + "step": 20306 + }, + { + "epoch": 1.2605996647836613, + "grad_norm": 0.2208396354411833, + "learning_rate": 7.18878232622073e-05, + "loss": 2.8769, + "step": 20307 + }, + { + "epoch": 1.2606617418834192, + "grad_norm": 0.1646335687858456, + "learning_rate": 7.188457610688577e-05, + "loss": 2.7837, + "step": 20308 + }, + { + "epoch": 1.260723818983177, + "grad_norm": 0.2034249566865387, + "learning_rate": 7.18813288373871e-05, + "loss": 2.8158, + "step": 20309 + }, + { + "epoch": 1.260785896082935, + "grad_norm": 0.15704964819400857, + "learning_rate": 7.18780814537282e-05, + "loss": 2.833, + "step": 20310 + }, + { + "epoch": 1.260847973182693, + "grad_norm": 0.17249617168076015, + "learning_rate": 7.187483395592604e-05, + "loss": 2.8474, + "step": 20311 + }, + { + "epoch": 1.2609100502824508, + "grad_norm": 0.1542390823778203, + "learning_rate": 7.187158634399758e-05, + "loss": 2.8804, + "step": 20312 + }, + { + "epoch": 1.2609721273822088, + "grad_norm": 0.16346145272926957, + "learning_rate": 7.186833861795975e-05, + "loss": 2.8773, + "step": 20313 + }, + { + "epoch": 1.2610342044819667, + "grad_norm": 0.16130838943100845, + "learning_rate": 7.186509077782947e-05, + "loss": 2.9617, + "step": 20314 + }, + { + "epoch": 1.2610962815817244, + "grad_norm": 0.15330780488525492, + "learning_rate": 7.18618428236237e-05, + "loss": 2.8602, + "step": 20315 + }, + { + "epoch": 1.2611583586814823, + "grad_norm": 0.18335056431670219, + "learning_rate": 7.18585947553594e-05, + "loss": 2.7578, + "step": 20316 + }, + { + "epoch": 1.2612204357812402, + "grad_norm": 0.1601733512587109, + "learning_rate": 7.185534657305348e-05, + "loss": 2.8996, + "step": 20317 + }, + { + "epoch": 1.2612825128809981, + "grad_norm": 0.1636750892378175, + "learning_rate": 7.185209827672294e-05, + "loss": 2.83, + "step": 20318 + }, + { + "epoch": 1.261344589980756, + "grad_norm": 0.18426361032809643, + "learning_rate": 7.18488498663847e-05, + "loss": 2.853, + "step": 20319 + }, + { + "epoch": 1.261406667080514, + "grad_norm": 0.15972432766323716, + "learning_rate": 7.184560134205569e-05, + "loss": 2.743, + "step": 20320 + }, + { + "epoch": 1.2614687441802719, + "grad_norm": 0.2067372195469295, + "learning_rate": 7.184235270375289e-05, + "loss": 2.8915, + "step": 20321 + }, + { + "epoch": 1.2615308212800298, + "grad_norm": 0.15595579463346648, + "learning_rate": 7.183910395149322e-05, + "loss": 2.8262, + "step": 20322 + }, + { + "epoch": 1.2615928983797877, + "grad_norm": 0.16081114364108356, + "learning_rate": 7.183585508529366e-05, + "loss": 2.8978, + "step": 20323 + }, + { + "epoch": 1.2616549754795456, + "grad_norm": 0.15880740694356257, + "learning_rate": 7.183260610517112e-05, + "loss": 2.8791, + "step": 20324 + }, + { + "epoch": 1.2617170525793036, + "grad_norm": 0.17602391447382143, + "learning_rate": 7.182935701114259e-05, + "loss": 2.8861, + "step": 20325 + }, + { + "epoch": 1.2617791296790615, + "grad_norm": 0.15448922830826847, + "learning_rate": 7.182610780322499e-05, + "loss": 2.9114, + "step": 20326 + }, + { + "epoch": 1.2618412067788194, + "grad_norm": 0.15498332145387442, + "learning_rate": 7.18228584814353e-05, + "loss": 2.9473, + "step": 20327 + }, + { + "epoch": 1.2619032838785773, + "grad_norm": 0.15786348784479945, + "learning_rate": 7.181960904579046e-05, + "loss": 2.8096, + "step": 20328 + }, + { + "epoch": 1.2619653609783352, + "grad_norm": 0.21355560982161167, + "learning_rate": 7.181635949630743e-05, + "loss": 2.9625, + "step": 20329 + }, + { + "epoch": 1.262027438078093, + "grad_norm": 0.22128274644287238, + "learning_rate": 7.181310983300315e-05, + "loss": 2.9693, + "step": 20330 + }, + { + "epoch": 1.2620895151778508, + "grad_norm": 0.1829168034377778, + "learning_rate": 7.180986005589457e-05, + "loss": 2.8021, + "step": 20331 + }, + { + "epoch": 1.2621515922776088, + "grad_norm": 0.16135169681393316, + "learning_rate": 7.180661016499868e-05, + "loss": 2.9262, + "step": 20332 + }, + { + "epoch": 1.2622136693773667, + "grad_norm": 0.1635788703370939, + "learning_rate": 7.180336016033236e-05, + "loss": 2.8027, + "step": 20333 + }, + { + "epoch": 1.2622757464771246, + "grad_norm": 0.15645163162556494, + "learning_rate": 7.180011004191267e-05, + "loss": 2.808, + "step": 20334 + }, + { + "epoch": 1.2623378235768825, + "grad_norm": 0.1610344095021938, + "learning_rate": 7.179685980975649e-05, + "loss": 2.894, + "step": 20335 + }, + { + "epoch": 1.2623999006766404, + "grad_norm": 0.14918551074796518, + "learning_rate": 7.17936094638808e-05, + "loss": 2.9025, + "step": 20336 + }, + { + "epoch": 1.2624619777763983, + "grad_norm": 0.15599865366930946, + "learning_rate": 7.179035900430257e-05, + "loss": 2.8175, + "step": 20337 + }, + { + "epoch": 1.2625240548761563, + "grad_norm": 0.16317766431061145, + "learning_rate": 7.178710843103875e-05, + "loss": 2.8556, + "step": 20338 + }, + { + "epoch": 1.262586131975914, + "grad_norm": 0.1557081124310368, + "learning_rate": 7.178385774410629e-05, + "loss": 2.7928, + "step": 20339 + }, + { + "epoch": 1.2626482090756719, + "grad_norm": 0.18009403100588373, + "learning_rate": 7.178060694352214e-05, + "loss": 2.825, + "step": 20340 + }, + { + "epoch": 1.2627102861754298, + "grad_norm": 0.15961710385073033, + "learning_rate": 7.17773560293033e-05, + "loss": 2.8164, + "step": 20341 + }, + { + "epoch": 1.2627723632751877, + "grad_norm": 0.15540006860121816, + "learning_rate": 7.177410500146669e-05, + "loss": 2.8769, + "step": 20342 + }, + { + "epoch": 1.2628344403749456, + "grad_norm": 0.1664593824443851, + "learning_rate": 7.177085386002931e-05, + "loss": 2.8958, + "step": 20343 + }, + { + "epoch": 1.2628965174747036, + "grad_norm": 0.1783094743357615, + "learning_rate": 7.176760260500808e-05, + "loss": 2.911, + "step": 20344 + }, + { + "epoch": 1.2629585945744615, + "grad_norm": 0.157978162824651, + "learning_rate": 7.176435123642e-05, + "loss": 2.8482, + "step": 20345 + }, + { + "epoch": 1.2630206716742194, + "grad_norm": 0.16857688620601238, + "learning_rate": 7.176109975428199e-05, + "loss": 2.896, + "step": 20346 + }, + { + "epoch": 1.2630827487739773, + "grad_norm": 0.20105429796618784, + "learning_rate": 7.175784815861106e-05, + "loss": 2.8536, + "step": 20347 + }, + { + "epoch": 1.2631448258737352, + "grad_norm": 0.18207172795097235, + "learning_rate": 7.175459644942414e-05, + "loss": 2.8493, + "step": 20348 + }, + { + "epoch": 1.2632069029734931, + "grad_norm": 0.15915677426422437, + "learning_rate": 7.175134462673822e-05, + "loss": 2.8598, + "step": 20349 + }, + { + "epoch": 1.263268980073251, + "grad_norm": 0.19523884338052527, + "learning_rate": 7.174809269057026e-05, + "loss": 2.8612, + "step": 20350 + }, + { + "epoch": 1.263331057173009, + "grad_norm": 0.21743938672967839, + "learning_rate": 7.17448406409372e-05, + "loss": 2.816, + "step": 20351 + }, + { + "epoch": 1.263393134272767, + "grad_norm": 0.2142332767149316, + "learning_rate": 7.174158847785605e-05, + "loss": 2.8481, + "step": 20352 + }, + { + "epoch": 1.2634552113725248, + "grad_norm": 0.18630546296057843, + "learning_rate": 7.173833620134374e-05, + "loss": 2.9438, + "step": 20353 + }, + { + "epoch": 1.2635172884722825, + "grad_norm": 0.16615942168997672, + "learning_rate": 7.173508381141727e-05, + "loss": 2.8503, + "step": 20354 + }, + { + "epoch": 1.2635793655720404, + "grad_norm": 0.17797274900704296, + "learning_rate": 7.173183130809358e-05, + "loss": 2.8596, + "step": 20355 + }, + { + "epoch": 1.2636414426717983, + "grad_norm": 0.18985859259521973, + "learning_rate": 7.172857869138965e-05, + "loss": 2.8502, + "step": 20356 + }, + { + "epoch": 1.2637035197715563, + "grad_norm": 0.23666126635987556, + "learning_rate": 7.172532596132245e-05, + "loss": 2.8851, + "step": 20357 + }, + { + "epoch": 1.2637655968713142, + "grad_norm": 0.1750457688919529, + "learning_rate": 7.172207311790895e-05, + "loss": 2.8892, + "step": 20358 + }, + { + "epoch": 1.263827673971072, + "grad_norm": 0.15409478098907944, + "learning_rate": 7.171882016116613e-05, + "loss": 2.8539, + "step": 20359 + }, + { + "epoch": 1.26388975107083, + "grad_norm": 0.16732917491545338, + "learning_rate": 7.171556709111094e-05, + "loss": 2.9323, + "step": 20360 + }, + { + "epoch": 1.263951828170588, + "grad_norm": 0.18650508211104552, + "learning_rate": 7.171231390776037e-05, + "loss": 2.8321, + "step": 20361 + }, + { + "epoch": 1.2640139052703458, + "grad_norm": 0.2597950653372887, + "learning_rate": 7.170906061113139e-05, + "loss": 2.8993, + "step": 20362 + }, + { + "epoch": 1.2640759823701035, + "grad_norm": 0.17699430251852097, + "learning_rate": 7.170580720124098e-05, + "loss": 2.8771, + "step": 20363 + }, + { + "epoch": 1.2641380594698615, + "grad_norm": 0.2167393290121496, + "learning_rate": 7.170255367810609e-05, + "loss": 2.8127, + "step": 20364 + }, + { + "epoch": 1.2642001365696194, + "grad_norm": 0.19579147735332725, + "learning_rate": 7.169930004174371e-05, + "loss": 2.8482, + "step": 20365 + }, + { + "epoch": 1.2642622136693773, + "grad_norm": 0.15847417194461844, + "learning_rate": 7.169604629217082e-05, + "loss": 2.8539, + "step": 20366 + }, + { + "epoch": 1.2643242907691352, + "grad_norm": 0.20417735868957343, + "learning_rate": 7.169279242940438e-05, + "loss": 3.0197, + "step": 20367 + }, + { + "epoch": 1.2643863678688931, + "grad_norm": 0.18451541508073063, + "learning_rate": 7.168953845346138e-05, + "loss": 2.7625, + "step": 20368 + }, + { + "epoch": 1.264448444968651, + "grad_norm": 0.16637073012389256, + "learning_rate": 7.168628436435879e-05, + "loss": 2.7808, + "step": 20369 + }, + { + "epoch": 1.264510522068409, + "grad_norm": 0.1930741219336738, + "learning_rate": 7.16830301621136e-05, + "loss": 2.8185, + "step": 20370 + }, + { + "epoch": 1.2645725991681669, + "grad_norm": 0.154035382873014, + "learning_rate": 7.167977584674277e-05, + "loss": 2.7731, + "step": 20371 + }, + { + "epoch": 1.2646346762679248, + "grad_norm": 0.20572169910028248, + "learning_rate": 7.167652141826329e-05, + "loss": 2.8983, + "step": 20372 + }, + { + "epoch": 1.2646967533676827, + "grad_norm": 0.1491427732445818, + "learning_rate": 7.167326687669214e-05, + "loss": 2.9077, + "step": 20373 + }, + { + "epoch": 1.2647588304674406, + "grad_norm": 0.2671686506036077, + "learning_rate": 7.167001222204629e-05, + "loss": 2.8716, + "step": 20374 + }, + { + "epoch": 1.2648209075671986, + "grad_norm": 0.17200389277272507, + "learning_rate": 7.166675745434273e-05, + "loss": 2.9175, + "step": 20375 + }, + { + "epoch": 1.2648829846669565, + "grad_norm": 0.20292814509344412, + "learning_rate": 7.166350257359843e-05, + "loss": 2.8812, + "step": 20376 + }, + { + "epoch": 1.2649450617667144, + "grad_norm": 0.18014055646911106, + "learning_rate": 7.166024757983039e-05, + "loss": 2.7774, + "step": 20377 + }, + { + "epoch": 1.265007138866472, + "grad_norm": 0.17386259851546831, + "learning_rate": 7.165699247305555e-05, + "loss": 2.9881, + "step": 20378 + }, + { + "epoch": 1.26506921596623, + "grad_norm": 0.1685829355520696, + "learning_rate": 7.165373725329096e-05, + "loss": 2.8565, + "step": 20379 + }, + { + "epoch": 1.265131293065988, + "grad_norm": 0.16495019450018666, + "learning_rate": 7.165048192055353e-05, + "loss": 2.8506, + "step": 20380 + }, + { + "epoch": 1.2651933701657458, + "grad_norm": 0.16148447330178423, + "learning_rate": 7.164722647486032e-05, + "loss": 2.9819, + "step": 20381 + }, + { + "epoch": 1.2652554472655038, + "grad_norm": 0.15990376923921545, + "learning_rate": 7.164397091622824e-05, + "loss": 2.8547, + "step": 20382 + }, + { + "epoch": 1.2653175243652617, + "grad_norm": 0.1890846779323526, + "learning_rate": 7.164071524467433e-05, + "loss": 2.7921, + "step": 20383 + }, + { + "epoch": 1.2653796014650196, + "grad_norm": 0.19053930404405653, + "learning_rate": 7.163745946021554e-05, + "loss": 2.9221, + "step": 20384 + }, + { + "epoch": 1.2654416785647775, + "grad_norm": 0.15686886537951084, + "learning_rate": 7.163420356286887e-05, + "loss": 2.8242, + "step": 20385 + }, + { + "epoch": 1.2655037556645354, + "grad_norm": 0.16670883968861622, + "learning_rate": 7.163094755265132e-05, + "loss": 2.8314, + "step": 20386 + }, + { + "epoch": 1.2655658327642931, + "grad_norm": 0.15572479312607096, + "learning_rate": 7.162769142957987e-05, + "loss": 2.8849, + "step": 20387 + }, + { + "epoch": 1.265627909864051, + "grad_norm": 0.16943025723121138, + "learning_rate": 7.16244351936715e-05, + "loss": 2.8142, + "step": 20388 + }, + { + "epoch": 1.265689986963809, + "grad_norm": 0.14858530866952283, + "learning_rate": 7.162117884494319e-05, + "loss": 2.9118, + "step": 20389 + }, + { + "epoch": 1.2657520640635669, + "grad_norm": 0.16586402815563941, + "learning_rate": 7.161792238341196e-05, + "loss": 2.7709, + "step": 20390 + }, + { + "epoch": 1.2658141411633248, + "grad_norm": 0.16854049163128978, + "learning_rate": 7.161466580909477e-05, + "loss": 2.8948, + "step": 20391 + }, + { + "epoch": 1.2658762182630827, + "grad_norm": 0.15241549098705215, + "learning_rate": 7.161140912200863e-05, + "loss": 2.8933, + "step": 20392 + }, + { + "epoch": 1.2659382953628406, + "grad_norm": 0.15355428834881235, + "learning_rate": 7.160815232217053e-05, + "loss": 2.8607, + "step": 20393 + }, + { + "epoch": 1.2660003724625986, + "grad_norm": 0.15076713990122387, + "learning_rate": 7.160489540959742e-05, + "loss": 2.9269, + "step": 20394 + }, + { + "epoch": 1.2660624495623565, + "grad_norm": 0.1473053306964113, + "learning_rate": 7.160163838430635e-05, + "loss": 2.9049, + "step": 20395 + }, + { + "epoch": 1.2661245266621144, + "grad_norm": 0.15089425067588652, + "learning_rate": 7.159838124631428e-05, + "loss": 2.8497, + "step": 20396 + }, + { + "epoch": 1.2661866037618723, + "grad_norm": 0.15247812824644472, + "learning_rate": 7.159512399563821e-05, + "loss": 2.8976, + "step": 20397 + }, + { + "epoch": 1.2662486808616302, + "grad_norm": 0.17503244597365744, + "learning_rate": 7.159186663229513e-05, + "loss": 2.9287, + "step": 20398 + }, + { + "epoch": 1.2663107579613881, + "grad_norm": 0.15702898038106466, + "learning_rate": 7.158860915630205e-05, + "loss": 2.8596, + "step": 20399 + }, + { + "epoch": 1.266372835061146, + "grad_norm": 0.18113886328045917, + "learning_rate": 7.158535156767596e-05, + "loss": 2.7799, + "step": 20400 + }, + { + "epoch": 1.266434912160904, + "grad_norm": 0.17035509110653566, + "learning_rate": 7.158209386643385e-05, + "loss": 2.8713, + "step": 20401 + }, + { + "epoch": 1.2664969892606617, + "grad_norm": 0.16353456386603293, + "learning_rate": 7.157883605259273e-05, + "loss": 2.9297, + "step": 20402 + }, + { + "epoch": 1.2665590663604196, + "grad_norm": 0.16391178429045214, + "learning_rate": 7.157557812616955e-05, + "loss": 2.7951, + "step": 20403 + }, + { + "epoch": 1.2666211434601775, + "grad_norm": 0.19973956078666416, + "learning_rate": 7.157232008718137e-05, + "loss": 2.7864, + "step": 20404 + }, + { + "epoch": 1.2666832205599354, + "grad_norm": 0.16793163022472435, + "learning_rate": 7.156906193564515e-05, + "loss": 2.86, + "step": 20405 + }, + { + "epoch": 1.2667452976596933, + "grad_norm": 0.14705565634114823, + "learning_rate": 7.156580367157789e-05, + "loss": 2.8619, + "step": 20406 + }, + { + "epoch": 1.2668073747594513, + "grad_norm": 0.16776704875963294, + "learning_rate": 7.156254529499659e-05, + "loss": 2.9007, + "step": 20407 + }, + { + "epoch": 1.2668694518592092, + "grad_norm": 0.1617260330052952, + "learning_rate": 7.155928680591829e-05, + "loss": 2.8811, + "step": 20408 + }, + { + "epoch": 1.266931528958967, + "grad_norm": 0.1963817794990558, + "learning_rate": 7.155602820435993e-05, + "loss": 2.859, + "step": 20409 + }, + { + "epoch": 1.266993606058725, + "grad_norm": 0.1633864164478899, + "learning_rate": 7.155276949033855e-05, + "loss": 2.954, + "step": 20410 + }, + { + "epoch": 1.2670556831584827, + "grad_norm": 0.19362114035968903, + "learning_rate": 7.154951066387114e-05, + "loss": 2.8441, + "step": 20411 + }, + { + "epoch": 1.2671177602582406, + "grad_norm": 0.21373559221431018, + "learning_rate": 7.15462517249747e-05, + "loss": 2.8938, + "step": 20412 + }, + { + "epoch": 1.2671798373579986, + "grad_norm": 0.17657468593371806, + "learning_rate": 7.154299267366622e-05, + "loss": 2.8368, + "step": 20413 + }, + { + "epoch": 1.2672419144577565, + "grad_norm": 0.17370636565340775, + "learning_rate": 7.153973350996272e-05, + "loss": 2.8617, + "step": 20414 + }, + { + "epoch": 1.2673039915575144, + "grad_norm": 0.1760170785155645, + "learning_rate": 7.15364742338812e-05, + "loss": 2.8967, + "step": 20415 + }, + { + "epoch": 1.2673660686572723, + "grad_norm": 0.1670846794485232, + "learning_rate": 7.153321484543866e-05, + "loss": 2.8594, + "step": 20416 + }, + { + "epoch": 1.2674281457570302, + "grad_norm": 0.17079823962633042, + "learning_rate": 7.152995534465214e-05, + "loss": 2.9042, + "step": 20417 + }, + { + "epoch": 1.2674902228567881, + "grad_norm": 0.18288344082742122, + "learning_rate": 7.15266957315386e-05, + "loss": 2.9389, + "step": 20418 + }, + { + "epoch": 1.267552299956546, + "grad_norm": 0.2016476310922836, + "learning_rate": 7.152343600611507e-05, + "loss": 2.8362, + "step": 20419 + }, + { + "epoch": 1.267614377056304, + "grad_norm": 0.1977823213484063, + "learning_rate": 7.152017616839855e-05, + "loss": 2.9256, + "step": 20420 + }, + { + "epoch": 1.267676454156062, + "grad_norm": 0.21374922159677842, + "learning_rate": 7.151691621840603e-05, + "loss": 2.8169, + "step": 20421 + }, + { + "epoch": 1.2677385312558198, + "grad_norm": 0.2144543760641881, + "learning_rate": 7.151365615615455e-05, + "loss": 2.8553, + "step": 20422 + }, + { + "epoch": 1.2678006083555777, + "grad_norm": 0.16869199095723283, + "learning_rate": 7.151039598166109e-05, + "loss": 2.8171, + "step": 20423 + }, + { + "epoch": 1.2678626854553356, + "grad_norm": 0.2085214639634405, + "learning_rate": 7.15071356949427e-05, + "loss": 2.9202, + "step": 20424 + }, + { + "epoch": 1.2679247625550933, + "grad_norm": 0.16916333171672718, + "learning_rate": 7.150387529601633e-05, + "loss": 2.8177, + "step": 20425 + }, + { + "epoch": 1.2679868396548513, + "grad_norm": 0.16578915239208264, + "learning_rate": 7.150061478489904e-05, + "loss": 2.8118, + "step": 20426 + }, + { + "epoch": 1.2680489167546092, + "grad_norm": 0.17659026577465134, + "learning_rate": 7.14973541616078e-05, + "loss": 2.9721, + "step": 20427 + }, + { + "epoch": 1.268110993854367, + "grad_norm": 0.16925761953765905, + "learning_rate": 7.149409342615968e-05, + "loss": 2.7569, + "step": 20428 + }, + { + "epoch": 1.268173070954125, + "grad_norm": 0.18952544297004403, + "learning_rate": 7.149083257857163e-05, + "loss": 2.8385, + "step": 20429 + }, + { + "epoch": 1.268235148053883, + "grad_norm": 0.16535631419454996, + "learning_rate": 7.14875716188607e-05, + "loss": 2.8338, + "step": 20430 + }, + { + "epoch": 1.2682972251536409, + "grad_norm": 0.18738907274544658, + "learning_rate": 7.14843105470439e-05, + "loss": 2.8744, + "step": 20431 + }, + { + "epoch": 1.2683593022533988, + "grad_norm": 0.16538620729812642, + "learning_rate": 7.148104936313821e-05, + "loss": 2.8596, + "step": 20432 + }, + { + "epoch": 1.2684213793531567, + "grad_norm": 0.18523966805414427, + "learning_rate": 7.14777880671607e-05, + "loss": 2.9052, + "step": 20433 + }, + { + "epoch": 1.2684834564529144, + "grad_norm": 0.1886058127014944, + "learning_rate": 7.147452665912834e-05, + "loss": 2.8067, + "step": 20434 + }, + { + "epoch": 1.2685455335526723, + "grad_norm": 0.17488450620204718, + "learning_rate": 7.147126513905818e-05, + "loss": 2.8896, + "step": 20435 + }, + { + "epoch": 1.2686076106524302, + "grad_norm": 0.15986266208871333, + "learning_rate": 7.14680035069672e-05, + "loss": 2.7475, + "step": 20436 + }, + { + "epoch": 1.2686696877521881, + "grad_norm": 0.208264680319232, + "learning_rate": 7.146474176287243e-05, + "loss": 2.8461, + "step": 20437 + }, + { + "epoch": 1.268731764851946, + "grad_norm": 0.1666709531789142, + "learning_rate": 7.14614799067909e-05, + "loss": 2.777, + "step": 20438 + }, + { + "epoch": 1.268793841951704, + "grad_norm": 0.1747751956974206, + "learning_rate": 7.145821793873961e-05, + "loss": 2.8703, + "step": 20439 + }, + { + "epoch": 1.268855919051462, + "grad_norm": 0.1713790946199309, + "learning_rate": 7.145495585873561e-05, + "loss": 2.8581, + "step": 20440 + }, + { + "epoch": 1.2689179961512198, + "grad_norm": 0.20405758835751103, + "learning_rate": 7.145169366679587e-05, + "loss": 2.8876, + "step": 20441 + }, + { + "epoch": 1.2689800732509777, + "grad_norm": 0.16748192010812293, + "learning_rate": 7.144843136293746e-05, + "loss": 2.9214, + "step": 20442 + }, + { + "epoch": 1.2690421503507356, + "grad_norm": 0.1591184182233006, + "learning_rate": 7.144516894717737e-05, + "loss": 2.8246, + "step": 20443 + }, + { + "epoch": 1.2691042274504936, + "grad_norm": 0.1544977401719359, + "learning_rate": 7.144190641953262e-05, + "loss": 2.7753, + "step": 20444 + }, + { + "epoch": 1.2691663045502515, + "grad_norm": 0.15966066582920208, + "learning_rate": 7.143864378002025e-05, + "loss": 2.8503, + "step": 20445 + }, + { + "epoch": 1.2692283816500094, + "grad_norm": 0.16653690961926249, + "learning_rate": 7.143538102865726e-05, + "loss": 2.8713, + "step": 20446 + }, + { + "epoch": 1.2692904587497673, + "grad_norm": 0.1598122496064022, + "learning_rate": 7.143211816546068e-05, + "loss": 2.7802, + "step": 20447 + }, + { + "epoch": 1.2693525358495252, + "grad_norm": 0.1568323904565105, + "learning_rate": 7.142885519044755e-05, + "loss": 2.8478, + "step": 20448 + }, + { + "epoch": 1.269414612949283, + "grad_norm": 0.16517539290551322, + "learning_rate": 7.142559210363487e-05, + "loss": 2.9282, + "step": 20449 + }, + { + "epoch": 1.2694766900490408, + "grad_norm": 0.15441387907017787, + "learning_rate": 7.142232890503967e-05, + "loss": 2.9017, + "step": 20450 + }, + { + "epoch": 1.2695387671487988, + "grad_norm": 0.16995057598067195, + "learning_rate": 7.1419065594679e-05, + "loss": 2.8994, + "step": 20451 + }, + { + "epoch": 1.2696008442485567, + "grad_norm": 0.14364162070515268, + "learning_rate": 7.141580217256984e-05, + "loss": 2.8155, + "step": 20452 + }, + { + "epoch": 1.2696629213483146, + "grad_norm": 0.1481282689844434, + "learning_rate": 7.141253863872927e-05, + "loss": 2.8042, + "step": 20453 + }, + { + "epoch": 1.2697249984480725, + "grad_norm": 0.14716848842880947, + "learning_rate": 7.140927499317428e-05, + "loss": 2.8517, + "step": 20454 + }, + { + "epoch": 1.2697870755478304, + "grad_norm": 0.14637472499297097, + "learning_rate": 7.140601123592189e-05, + "loss": 2.8244, + "step": 20455 + }, + { + "epoch": 1.2698491526475884, + "grad_norm": 0.15993883914170734, + "learning_rate": 7.140274736698916e-05, + "loss": 2.9407, + "step": 20456 + }, + { + "epoch": 1.2699112297473463, + "grad_norm": 0.1487580840434506, + "learning_rate": 7.139948338639309e-05, + "loss": 2.8339, + "step": 20457 + }, + { + "epoch": 1.269973306847104, + "grad_norm": 0.15946096598192408, + "learning_rate": 7.139621929415073e-05, + "loss": 2.9176, + "step": 20458 + }, + { + "epoch": 1.2700353839468619, + "grad_norm": 0.14517542618967663, + "learning_rate": 7.139295509027908e-05, + "loss": 2.8675, + "step": 20459 + }, + { + "epoch": 1.2700974610466198, + "grad_norm": 0.16036895258064965, + "learning_rate": 7.13896907747952e-05, + "loss": 2.8353, + "step": 20460 + }, + { + "epoch": 1.2701595381463777, + "grad_norm": 0.17501378473418303, + "learning_rate": 7.138642634771611e-05, + "loss": 2.8375, + "step": 20461 + }, + { + "epoch": 1.2702216152461356, + "grad_norm": 0.14997591704515698, + "learning_rate": 7.138316180905885e-05, + "loss": 2.9103, + "step": 20462 + }, + { + "epoch": 1.2702836923458936, + "grad_norm": 0.18501227813327503, + "learning_rate": 7.137989715884043e-05, + "loss": 2.85, + "step": 20463 + }, + { + "epoch": 1.2703457694456515, + "grad_norm": 0.1940105436098507, + "learning_rate": 7.13766323970779e-05, + "loss": 2.8529, + "step": 20464 + }, + { + "epoch": 1.2704078465454094, + "grad_norm": 0.15935601373788053, + "learning_rate": 7.137336752378831e-05, + "loss": 2.7837, + "step": 20465 + }, + { + "epoch": 1.2704699236451673, + "grad_norm": 0.1895484979208002, + "learning_rate": 7.137010253898864e-05, + "loss": 2.8767, + "step": 20466 + }, + { + "epoch": 1.2705320007449252, + "grad_norm": 0.14949138361044614, + "learning_rate": 7.1366837442696e-05, + "loss": 2.9273, + "step": 20467 + }, + { + "epoch": 1.2705940778446831, + "grad_norm": 0.16679813866951812, + "learning_rate": 7.136357223492734e-05, + "loss": 2.7916, + "step": 20468 + }, + { + "epoch": 1.270656154944441, + "grad_norm": 0.1781767171937566, + "learning_rate": 7.136030691569976e-05, + "loss": 2.8517, + "step": 20469 + }, + { + "epoch": 1.270718232044199, + "grad_norm": 0.16506056109470085, + "learning_rate": 7.135704148503026e-05, + "loss": 2.7582, + "step": 20470 + }, + { + "epoch": 1.270780309143957, + "grad_norm": 0.16484478231531677, + "learning_rate": 7.135377594293591e-05, + "loss": 2.8377, + "step": 20471 + }, + { + "epoch": 1.2708423862437148, + "grad_norm": 0.15176736784399378, + "learning_rate": 7.13505102894337e-05, + "loss": 2.83, + "step": 20472 + }, + { + "epoch": 1.2709044633434725, + "grad_norm": 0.1685735216812503, + "learning_rate": 7.134724452454072e-05, + "loss": 2.7804, + "step": 20473 + }, + { + "epoch": 1.2709665404432304, + "grad_norm": 0.17064645992069785, + "learning_rate": 7.134397864827398e-05, + "loss": 2.8748, + "step": 20474 + }, + { + "epoch": 1.2710286175429883, + "grad_norm": 0.17338266515358142, + "learning_rate": 7.134071266065051e-05, + "loss": 2.9054, + "step": 20475 + }, + { + "epoch": 1.2710906946427463, + "grad_norm": 0.1610300892622052, + "learning_rate": 7.133744656168737e-05, + "loss": 2.8327, + "step": 20476 + }, + { + "epoch": 1.2711527717425042, + "grad_norm": 0.18014613995798567, + "learning_rate": 7.133418035140158e-05, + "loss": 2.9465, + "step": 20477 + }, + { + "epoch": 1.271214848842262, + "grad_norm": 0.1668636597037603, + "learning_rate": 7.13309140298102e-05, + "loss": 2.7861, + "step": 20478 + }, + { + "epoch": 1.27127692594202, + "grad_norm": 0.16997667503221708, + "learning_rate": 7.132764759693027e-05, + "loss": 2.8883, + "step": 20479 + }, + { + "epoch": 1.271339003041778, + "grad_norm": 0.16392124998333293, + "learning_rate": 7.132438105277882e-05, + "loss": 2.91, + "step": 20480 + }, + { + "epoch": 1.2714010801415359, + "grad_norm": 0.1578405369507296, + "learning_rate": 7.132111439737289e-05, + "loss": 2.7958, + "step": 20481 + }, + { + "epoch": 1.2714631572412936, + "grad_norm": 0.18136302695923784, + "learning_rate": 7.131784763072953e-05, + "loss": 2.863, + "step": 20482 + }, + { + "epoch": 1.2715252343410515, + "grad_norm": 0.16058194758887617, + "learning_rate": 7.13145807528658e-05, + "loss": 2.8632, + "step": 20483 + }, + { + "epoch": 1.2715873114408094, + "grad_norm": 0.1538412331962795, + "learning_rate": 7.131131376379871e-05, + "loss": 2.8356, + "step": 20484 + }, + { + "epoch": 1.2716493885405673, + "grad_norm": 0.17232899419364905, + "learning_rate": 7.130804666354533e-05, + "loss": 2.844, + "step": 20485 + }, + { + "epoch": 1.2717114656403252, + "grad_norm": 0.16004303675626805, + "learning_rate": 7.13047794521227e-05, + "loss": 2.8736, + "step": 20486 + }, + { + "epoch": 1.2717735427400831, + "grad_norm": 0.16098781188151493, + "learning_rate": 7.130151212954786e-05, + "loss": 2.8541, + "step": 20487 + }, + { + "epoch": 1.271835619839841, + "grad_norm": 0.16669857693209503, + "learning_rate": 7.129824469583786e-05, + "loss": 2.755, + "step": 20488 + }, + { + "epoch": 1.271897696939599, + "grad_norm": 0.1597803694303299, + "learning_rate": 7.129497715100975e-05, + "loss": 2.8478, + "step": 20489 + }, + { + "epoch": 1.271959774039357, + "grad_norm": 0.17930936416644294, + "learning_rate": 7.129170949508057e-05, + "loss": 2.9086, + "step": 20490 + }, + { + "epoch": 1.2720218511391148, + "grad_norm": 0.1792846185155969, + "learning_rate": 7.128844172806739e-05, + "loss": 2.7852, + "step": 20491 + }, + { + "epoch": 1.2720839282388727, + "grad_norm": 0.14492726842915543, + "learning_rate": 7.128517384998723e-05, + "loss": 2.7819, + "step": 20492 + }, + { + "epoch": 1.2721460053386306, + "grad_norm": 0.15793003837061498, + "learning_rate": 7.128190586085715e-05, + "loss": 2.8337, + "step": 20493 + }, + { + "epoch": 1.2722080824383886, + "grad_norm": 0.18090924302598915, + "learning_rate": 7.12786377606942e-05, + "loss": 2.8957, + "step": 20494 + }, + { + "epoch": 1.2722701595381465, + "grad_norm": 0.14950825334017412, + "learning_rate": 7.127536954951543e-05, + "loss": 2.8406, + "step": 20495 + }, + { + "epoch": 1.2723322366379044, + "grad_norm": 0.16032801698019866, + "learning_rate": 7.127210122733789e-05, + "loss": 2.8359, + "step": 20496 + }, + { + "epoch": 1.272394313737662, + "grad_norm": 0.14606293771545156, + "learning_rate": 7.126883279417863e-05, + "loss": 2.8835, + "step": 20497 + }, + { + "epoch": 1.27245639083742, + "grad_norm": 0.1747504901514151, + "learning_rate": 7.126556425005471e-05, + "loss": 2.8887, + "step": 20498 + }, + { + "epoch": 1.272518467937178, + "grad_norm": 0.14745525323888803, + "learning_rate": 7.126229559498318e-05, + "loss": 2.8714, + "step": 20499 + }, + { + "epoch": 1.2725805450369359, + "grad_norm": 0.1561369367660333, + "learning_rate": 7.125902682898108e-05, + "loss": 2.8349, + "step": 20500 + }, + { + "epoch": 1.2726426221366938, + "grad_norm": 0.15940878477289222, + "learning_rate": 7.125575795206551e-05, + "loss": 2.7567, + "step": 20501 + }, + { + "epoch": 1.2727046992364517, + "grad_norm": 0.159912272445945, + "learning_rate": 7.125248896425347e-05, + "loss": 2.7977, + "step": 20502 + }, + { + "epoch": 1.2727667763362096, + "grad_norm": 0.1511688101504632, + "learning_rate": 7.124921986556204e-05, + "loss": 2.9123, + "step": 20503 + }, + { + "epoch": 1.2728288534359675, + "grad_norm": 0.1497617926229142, + "learning_rate": 7.124595065600826e-05, + "loss": 2.7985, + "step": 20504 + }, + { + "epoch": 1.2728909305357254, + "grad_norm": 0.1906463954987976, + "learning_rate": 7.124268133560922e-05, + "loss": 2.8552, + "step": 20505 + }, + { + "epoch": 1.2729530076354831, + "grad_norm": 0.15207717264665596, + "learning_rate": 7.123941190438193e-05, + "loss": 2.9158, + "step": 20506 + }, + { + "epoch": 1.273015084735241, + "grad_norm": 0.15700712943935707, + "learning_rate": 7.123614236234349e-05, + "loss": 2.9222, + "step": 20507 + }, + { + "epoch": 1.273077161834999, + "grad_norm": 0.15026791012313526, + "learning_rate": 7.123287270951094e-05, + "loss": 2.8845, + "step": 20508 + }, + { + "epoch": 1.273139238934757, + "grad_norm": 0.16765899152016325, + "learning_rate": 7.122960294590133e-05, + "loss": 2.8106, + "step": 20509 + }, + { + "epoch": 1.2732013160345148, + "grad_norm": 0.15365431177593772, + "learning_rate": 7.122633307153173e-05, + "loss": 2.7803, + "step": 20510 + }, + { + "epoch": 1.2732633931342727, + "grad_norm": 0.15462698812524941, + "learning_rate": 7.122306308641919e-05, + "loss": 2.8626, + "step": 20511 + }, + { + "epoch": 1.2733254702340306, + "grad_norm": 0.15376612364536274, + "learning_rate": 7.121979299058079e-05, + "loss": 2.9405, + "step": 20512 + }, + { + "epoch": 1.2733875473337886, + "grad_norm": 0.15704121534316925, + "learning_rate": 7.121652278403356e-05, + "loss": 2.855, + "step": 20513 + }, + { + "epoch": 1.2734496244335465, + "grad_norm": 0.14569840973288603, + "learning_rate": 7.12132524667946e-05, + "loss": 2.8796, + "step": 20514 + }, + { + "epoch": 1.2735117015333044, + "grad_norm": 0.16243788742541748, + "learning_rate": 7.120998203888093e-05, + "loss": 2.8391, + "step": 20515 + }, + { + "epoch": 1.2735737786330623, + "grad_norm": 0.14656142433682748, + "learning_rate": 7.120671150030966e-05, + "loss": 2.8386, + "step": 20516 + }, + { + "epoch": 1.2736358557328202, + "grad_norm": 0.16328687338750789, + "learning_rate": 7.120344085109783e-05, + "loss": 2.9312, + "step": 20517 + }, + { + "epoch": 1.2736979328325782, + "grad_norm": 0.16015942500619182, + "learning_rate": 7.120017009126248e-05, + "loss": 2.8665, + "step": 20518 + }, + { + "epoch": 1.273760009932336, + "grad_norm": 0.15050279471822006, + "learning_rate": 7.119689922082072e-05, + "loss": 2.7374, + "step": 20519 + }, + { + "epoch": 1.273822087032094, + "grad_norm": 0.16261967232148394, + "learning_rate": 7.119362823978957e-05, + "loss": 2.7873, + "step": 20520 + }, + { + "epoch": 1.2738841641318517, + "grad_norm": 0.15195586881146128, + "learning_rate": 7.119035714818612e-05, + "loss": 2.8557, + "step": 20521 + }, + { + "epoch": 1.2739462412316096, + "grad_norm": 0.15842638627599015, + "learning_rate": 7.118708594602744e-05, + "loss": 2.8812, + "step": 20522 + }, + { + "epoch": 1.2740083183313675, + "grad_norm": 0.1512633121066592, + "learning_rate": 7.118381463333059e-05, + "loss": 2.8427, + "step": 20523 + }, + { + "epoch": 1.2740703954311254, + "grad_norm": 0.15236164868187613, + "learning_rate": 7.118054321011263e-05, + "loss": 2.8282, + "step": 20524 + }, + { + "epoch": 1.2741324725308834, + "grad_norm": 0.15689011228386635, + "learning_rate": 7.117727167639064e-05, + "loss": 2.8462, + "step": 20525 + }, + { + "epoch": 1.2741945496306413, + "grad_norm": 0.1676480703056101, + "learning_rate": 7.117400003218168e-05, + "loss": 2.7672, + "step": 20526 + }, + { + "epoch": 1.2742566267303992, + "grad_norm": 0.15342136867356893, + "learning_rate": 7.117072827750282e-05, + "loss": 2.9101, + "step": 20527 + }, + { + "epoch": 1.274318703830157, + "grad_norm": 0.14293620463502804, + "learning_rate": 7.116745641237113e-05, + "loss": 2.9496, + "step": 20528 + }, + { + "epoch": 1.274380780929915, + "grad_norm": 0.18334438758454452, + "learning_rate": 7.116418443680368e-05, + "loss": 2.9162, + "step": 20529 + }, + { + "epoch": 1.2744428580296727, + "grad_norm": 0.15916500725950486, + "learning_rate": 7.116091235081754e-05, + "loss": 2.8956, + "step": 20530 + }, + { + "epoch": 1.2745049351294306, + "grad_norm": 0.16102817490193874, + "learning_rate": 7.11576401544298e-05, + "loss": 2.8276, + "step": 20531 + }, + { + "epoch": 1.2745670122291886, + "grad_norm": 0.16885567641088278, + "learning_rate": 7.11543678476575e-05, + "loss": 2.9064, + "step": 20532 + }, + { + "epoch": 1.2746290893289465, + "grad_norm": 0.15907823805153515, + "learning_rate": 7.115109543051772e-05, + "loss": 2.957, + "step": 20533 + }, + { + "epoch": 1.2746911664287044, + "grad_norm": 0.16458818253986712, + "learning_rate": 7.114782290302755e-05, + "loss": 2.8319, + "step": 20534 + }, + { + "epoch": 1.2747532435284623, + "grad_norm": 0.16233229534319446, + "learning_rate": 7.114455026520408e-05, + "loss": 2.8905, + "step": 20535 + }, + { + "epoch": 1.2748153206282202, + "grad_norm": 0.1667207608999947, + "learning_rate": 7.114127751706432e-05, + "loss": 2.9106, + "step": 20536 + }, + { + "epoch": 1.2748773977279781, + "grad_norm": 0.1536435892189065, + "learning_rate": 7.113800465862541e-05, + "loss": 2.8592, + "step": 20537 + }, + { + "epoch": 1.274939474827736, + "grad_norm": 0.16462985213290057, + "learning_rate": 7.113473168990437e-05, + "loss": 2.7379, + "step": 20538 + }, + { + "epoch": 1.275001551927494, + "grad_norm": 0.1520806204268638, + "learning_rate": 7.113145861091833e-05, + "loss": 2.9692, + "step": 20539 + }, + { + "epoch": 1.275063629027252, + "grad_norm": 0.1645963475558524, + "learning_rate": 7.112818542168433e-05, + "loss": 2.9187, + "step": 20540 + }, + { + "epoch": 1.2751257061270098, + "grad_norm": 0.16033531854758468, + "learning_rate": 7.112491212221946e-05, + "loss": 2.9635, + "step": 20541 + }, + { + "epoch": 1.2751877832267677, + "grad_norm": 0.15075985677681, + "learning_rate": 7.11216387125408e-05, + "loss": 2.8171, + "step": 20542 + }, + { + "epoch": 1.2752498603265257, + "grad_norm": 0.166766990825443, + "learning_rate": 7.111836519266542e-05, + "loss": 2.8974, + "step": 20543 + }, + { + "epoch": 1.2753119374262836, + "grad_norm": 0.17879492448594425, + "learning_rate": 7.111509156261038e-05, + "loss": 2.8545, + "step": 20544 + }, + { + "epoch": 1.2753740145260413, + "grad_norm": 0.2098924747005856, + "learning_rate": 7.111181782239282e-05, + "loss": 2.8293, + "step": 20545 + }, + { + "epoch": 1.2754360916257992, + "grad_norm": 0.17391500552090425, + "learning_rate": 7.110854397202974e-05, + "loss": 2.7767, + "step": 20546 + }, + { + "epoch": 1.275498168725557, + "grad_norm": 0.19529614380060897, + "learning_rate": 7.110527001153828e-05, + "loss": 2.9483, + "step": 20547 + }, + { + "epoch": 1.275560245825315, + "grad_norm": 0.18829037062528203, + "learning_rate": 7.110199594093551e-05, + "loss": 2.893, + "step": 20548 + }, + { + "epoch": 1.275622322925073, + "grad_norm": 0.15260470488413225, + "learning_rate": 7.10987217602385e-05, + "loss": 2.8422, + "step": 20549 + }, + { + "epoch": 1.2756844000248309, + "grad_norm": 0.17095357822769205, + "learning_rate": 7.109544746946435e-05, + "loss": 2.8944, + "step": 20550 + }, + { + "epoch": 1.2757464771245888, + "grad_norm": 0.1703092172288805, + "learning_rate": 7.109217306863011e-05, + "loss": 2.828, + "step": 20551 + }, + { + "epoch": 1.2758085542243467, + "grad_norm": 0.16748438905626378, + "learning_rate": 7.10888985577529e-05, + "loss": 2.8564, + "step": 20552 + }, + { + "epoch": 1.2758706313241046, + "grad_norm": 0.16944428445909673, + "learning_rate": 7.108562393684978e-05, + "loss": 2.8688, + "step": 20553 + }, + { + "epoch": 1.2759327084238623, + "grad_norm": 0.1711991090233014, + "learning_rate": 7.108234920593783e-05, + "loss": 2.8477, + "step": 20554 + }, + { + "epoch": 1.2759947855236202, + "grad_norm": 0.168002524417973, + "learning_rate": 7.107907436503417e-05, + "loss": 2.9105, + "step": 20555 + }, + { + "epoch": 1.2760568626233781, + "grad_norm": 0.16690082169893908, + "learning_rate": 7.107579941415584e-05, + "loss": 2.8801, + "step": 20556 + }, + { + "epoch": 1.276118939723136, + "grad_norm": 0.15996508121363398, + "learning_rate": 7.107252435331995e-05, + "loss": 2.9447, + "step": 20557 + }, + { + "epoch": 1.276181016822894, + "grad_norm": 0.1839725563481081, + "learning_rate": 7.106924918254358e-05, + "loss": 2.804, + "step": 20558 + }, + { + "epoch": 1.276243093922652, + "grad_norm": 0.22565885083831727, + "learning_rate": 7.106597390184384e-05, + "loss": 2.8697, + "step": 20559 + }, + { + "epoch": 1.2763051710224098, + "grad_norm": 0.15343577902580424, + "learning_rate": 7.106269851123777e-05, + "loss": 2.7936, + "step": 20560 + }, + { + "epoch": 1.2763672481221677, + "grad_norm": 0.15804751714254464, + "learning_rate": 7.105942301074253e-05, + "loss": 2.833, + "step": 20561 + }, + { + "epoch": 1.2764293252219256, + "grad_norm": 0.1856092247858378, + "learning_rate": 7.105614740037514e-05, + "loss": 2.8542, + "step": 20562 + }, + { + "epoch": 1.2764914023216836, + "grad_norm": 0.1574430042124706, + "learning_rate": 7.105287168015272e-05, + "loss": 2.9094, + "step": 20563 + }, + { + "epoch": 1.2765534794214415, + "grad_norm": 0.1599073511245654, + "learning_rate": 7.104959585009236e-05, + "loss": 2.8777, + "step": 20564 + }, + { + "epoch": 1.2766155565211994, + "grad_norm": 0.1918180473435332, + "learning_rate": 7.104631991021115e-05, + "loss": 2.8892, + "step": 20565 + }, + { + "epoch": 1.2766776336209573, + "grad_norm": 0.2194027697818665, + "learning_rate": 7.104304386052619e-05, + "loss": 2.8429, + "step": 20566 + }, + { + "epoch": 1.2767397107207152, + "grad_norm": 0.19336871885592077, + "learning_rate": 7.103976770105455e-05, + "loss": 2.8085, + "step": 20567 + }, + { + "epoch": 1.2768017878204732, + "grad_norm": 0.15881411576490612, + "learning_rate": 7.103649143181335e-05, + "loss": 2.8832, + "step": 20568 + }, + { + "epoch": 1.2768638649202309, + "grad_norm": 0.21043128625359078, + "learning_rate": 7.103321505281965e-05, + "loss": 2.803, + "step": 20569 + }, + { + "epoch": 1.2769259420199888, + "grad_norm": 0.19960880428762648, + "learning_rate": 7.102993856409057e-05, + "loss": 2.9205, + "step": 20570 + }, + { + "epoch": 1.2769880191197467, + "grad_norm": 0.1839536527831687, + "learning_rate": 7.10266619656432e-05, + "loss": 3.0217, + "step": 20571 + }, + { + "epoch": 1.2770500962195046, + "grad_norm": 0.17017234232236522, + "learning_rate": 7.102338525749461e-05, + "loss": 2.8034, + "step": 20572 + }, + { + "epoch": 1.2771121733192625, + "grad_norm": 0.158286308271767, + "learning_rate": 7.102010843966192e-05, + "loss": 2.8054, + "step": 20573 + }, + { + "epoch": 1.2771742504190204, + "grad_norm": 0.16317382888296247, + "learning_rate": 7.101683151216222e-05, + "loss": 2.8725, + "step": 20574 + }, + { + "epoch": 1.2772363275187784, + "grad_norm": 0.17337566596592927, + "learning_rate": 7.101355447501264e-05, + "loss": 2.8402, + "step": 20575 + }, + { + "epoch": 1.2772984046185363, + "grad_norm": 0.16981683970603934, + "learning_rate": 7.101027732823022e-05, + "loss": 2.8176, + "step": 20576 + }, + { + "epoch": 1.2773604817182942, + "grad_norm": 0.197139765936286, + "learning_rate": 7.100700007183207e-05, + "loss": 2.8319, + "step": 20577 + }, + { + "epoch": 1.277422558818052, + "grad_norm": 0.15461590535779762, + "learning_rate": 7.100372270583532e-05, + "loss": 2.8045, + "step": 20578 + }, + { + "epoch": 1.2774846359178098, + "grad_norm": 0.1610299620244645, + "learning_rate": 7.100044523025705e-05, + "loss": 2.9361, + "step": 20579 + }, + { + "epoch": 1.2775467130175677, + "grad_norm": 0.14635688956251627, + "learning_rate": 7.099716764511435e-05, + "loss": 2.6867, + "step": 20580 + }, + { + "epoch": 1.2776087901173256, + "grad_norm": 0.20113695637873247, + "learning_rate": 7.099388995042432e-05, + "loss": 2.946, + "step": 20581 + }, + { + "epoch": 1.2776708672170836, + "grad_norm": 0.18096406102614881, + "learning_rate": 7.09906121462041e-05, + "loss": 2.8828, + "step": 20582 + }, + { + "epoch": 1.2777329443168415, + "grad_norm": 0.1997615581213352, + "learning_rate": 7.098733423247074e-05, + "loss": 2.7929, + "step": 20583 + }, + { + "epoch": 1.2777950214165994, + "grad_norm": 0.1572110482374587, + "learning_rate": 7.098405620924137e-05, + "loss": 2.8026, + "step": 20584 + }, + { + "epoch": 1.2778570985163573, + "grad_norm": 0.18089738162232358, + "learning_rate": 7.098077807653307e-05, + "loss": 2.9774, + "step": 20585 + }, + { + "epoch": 1.2779191756161152, + "grad_norm": 0.2012541704456555, + "learning_rate": 7.097749983436297e-05, + "loss": 2.7497, + "step": 20586 + }, + { + "epoch": 1.2779812527158732, + "grad_norm": 0.1941861193368116, + "learning_rate": 7.097422148274816e-05, + "loss": 2.8228, + "step": 20587 + }, + { + "epoch": 1.278043329815631, + "grad_norm": 0.1599094885691428, + "learning_rate": 7.097094302170575e-05, + "loss": 2.83, + "step": 20588 + }, + { + "epoch": 1.278105406915389, + "grad_norm": 0.1591420450929064, + "learning_rate": 7.096766445125283e-05, + "loss": 2.9062, + "step": 20589 + }, + { + "epoch": 1.278167484015147, + "grad_norm": 0.15945202401401143, + "learning_rate": 7.096438577140651e-05, + "loss": 2.8064, + "step": 20590 + }, + { + "epoch": 1.2782295611149048, + "grad_norm": 0.16027229454642683, + "learning_rate": 7.096110698218392e-05, + "loss": 2.8138, + "step": 20591 + }, + { + "epoch": 1.2782916382146627, + "grad_norm": 0.1920681815643057, + "learning_rate": 7.095782808360214e-05, + "loss": 2.7808, + "step": 20592 + }, + { + "epoch": 1.2783537153144204, + "grad_norm": 0.16165789389793012, + "learning_rate": 7.095454907567827e-05, + "loss": 2.7546, + "step": 20593 + }, + { + "epoch": 1.2784157924141784, + "grad_norm": 0.17347575487491285, + "learning_rate": 7.095126995842945e-05, + "loss": 2.9598, + "step": 20594 + }, + { + "epoch": 1.2784778695139363, + "grad_norm": 0.15791468758294927, + "learning_rate": 7.094799073187274e-05, + "loss": 2.8701, + "step": 20595 + }, + { + "epoch": 1.2785399466136942, + "grad_norm": 0.17613638395612774, + "learning_rate": 7.09447113960253e-05, + "loss": 2.8478, + "step": 20596 + }, + { + "epoch": 1.278602023713452, + "grad_norm": 0.15476180300549858, + "learning_rate": 7.094143195090421e-05, + "loss": 2.8289, + "step": 20597 + }, + { + "epoch": 1.27866410081321, + "grad_norm": 0.1553384237030511, + "learning_rate": 7.09381523965266e-05, + "loss": 2.8883, + "step": 20598 + }, + { + "epoch": 1.278726177912968, + "grad_norm": 0.15294538809517277, + "learning_rate": 7.093487273290954e-05, + "loss": 2.8008, + "step": 20599 + }, + { + "epoch": 1.2787882550127259, + "grad_norm": 0.21081743532076044, + "learning_rate": 7.093159296007018e-05, + "loss": 2.8977, + "step": 20600 + }, + { + "epoch": 1.2788503321124838, + "grad_norm": 0.15708012603104476, + "learning_rate": 7.09283130780256e-05, + "loss": 2.8952, + "step": 20601 + }, + { + "epoch": 1.2789124092122415, + "grad_norm": 0.16945363896328483, + "learning_rate": 7.092503308679297e-05, + "loss": 2.7532, + "step": 20602 + }, + { + "epoch": 1.2789744863119994, + "grad_norm": 0.20511963742038425, + "learning_rate": 7.092175298638933e-05, + "loss": 2.807, + "step": 20603 + }, + { + "epoch": 1.2790365634117573, + "grad_norm": 0.20888269036230928, + "learning_rate": 7.091847277683183e-05, + "loss": 2.8778, + "step": 20604 + }, + { + "epoch": 1.2790986405115152, + "grad_norm": 0.1758966649826238, + "learning_rate": 7.09151924581376e-05, + "loss": 2.8997, + "step": 20605 + }, + { + "epoch": 1.2791607176112731, + "grad_norm": 0.17004368241858717, + "learning_rate": 7.091191203032371e-05, + "loss": 2.8002, + "step": 20606 + }, + { + "epoch": 1.279222794711031, + "grad_norm": 0.2069697288658221, + "learning_rate": 7.090863149340731e-05, + "loss": 2.8775, + "step": 20607 + }, + { + "epoch": 1.279284871810789, + "grad_norm": 0.16293966277888924, + "learning_rate": 7.09053508474055e-05, + "loss": 2.9513, + "step": 20608 + }, + { + "epoch": 1.279346948910547, + "grad_norm": 0.19662172207703996, + "learning_rate": 7.09020700923354e-05, + "loss": 2.8495, + "step": 20609 + }, + { + "epoch": 1.2794090260103048, + "grad_norm": 0.16024204607772222, + "learning_rate": 7.089878922821412e-05, + "loss": 2.8974, + "step": 20610 + }, + { + "epoch": 1.2794711031100627, + "grad_norm": 0.18242610787620767, + "learning_rate": 7.08955082550588e-05, + "loss": 2.8287, + "step": 20611 + }, + { + "epoch": 1.2795331802098207, + "grad_norm": 0.23939167870963832, + "learning_rate": 7.089222717288651e-05, + "loss": 2.9015, + "step": 20612 + }, + { + "epoch": 1.2795952573095786, + "grad_norm": 0.1877006907094452, + "learning_rate": 7.088894598171443e-05, + "loss": 2.8658, + "step": 20613 + }, + { + "epoch": 1.2796573344093365, + "grad_norm": 0.22722223842704214, + "learning_rate": 7.088566468155965e-05, + "loss": 2.8719, + "step": 20614 + }, + { + "epoch": 1.2797194115090944, + "grad_norm": 0.18040536452683514, + "learning_rate": 7.088238327243926e-05, + "loss": 2.8531, + "step": 20615 + }, + { + "epoch": 1.2797814886088523, + "grad_norm": 0.18906836421793904, + "learning_rate": 7.087910175437044e-05, + "loss": 2.8207, + "step": 20616 + }, + { + "epoch": 1.27984356570861, + "grad_norm": 0.1700440110359147, + "learning_rate": 7.087582012737025e-05, + "loss": 2.7466, + "step": 20617 + }, + { + "epoch": 1.279905642808368, + "grad_norm": 0.16887326992216029, + "learning_rate": 7.087253839145584e-05, + "loss": 2.8337, + "step": 20618 + }, + { + "epoch": 1.2799677199081259, + "grad_norm": 0.20252843514410912, + "learning_rate": 7.086925654664435e-05, + "loss": 2.8516, + "step": 20619 + }, + { + "epoch": 1.2800297970078838, + "grad_norm": 0.17296231487884836, + "learning_rate": 7.086597459295287e-05, + "loss": 2.916, + "step": 20620 + }, + { + "epoch": 1.2800918741076417, + "grad_norm": 0.18143090066659004, + "learning_rate": 7.086269253039853e-05, + "loss": 2.8446, + "step": 20621 + }, + { + "epoch": 1.2801539512073996, + "grad_norm": 0.166605658218109, + "learning_rate": 7.085941035899847e-05, + "loss": 2.9371, + "step": 20622 + }, + { + "epoch": 1.2802160283071575, + "grad_norm": 0.18854177661140847, + "learning_rate": 7.085612807876979e-05, + "loss": 2.7952, + "step": 20623 + }, + { + "epoch": 1.2802781054069154, + "grad_norm": 0.18337450734626834, + "learning_rate": 7.085284568972964e-05, + "loss": 2.753, + "step": 20624 + }, + { + "epoch": 1.2803401825066734, + "grad_norm": 0.18065825788091136, + "learning_rate": 7.084956319189512e-05, + "loss": 2.9217, + "step": 20625 + }, + { + "epoch": 1.280402259606431, + "grad_norm": 0.17024741832278353, + "learning_rate": 7.084628058528338e-05, + "loss": 2.8337, + "step": 20626 + }, + { + "epoch": 1.280464336706189, + "grad_norm": 0.22856526772969807, + "learning_rate": 7.084299786991154e-05, + "loss": 2.8944, + "step": 20627 + }, + { + "epoch": 1.280526413805947, + "grad_norm": 0.16838843013058266, + "learning_rate": 7.083971504579668e-05, + "loss": 2.8635, + "step": 20628 + }, + { + "epoch": 1.2805884909057048, + "grad_norm": 0.14922490956041212, + "learning_rate": 7.0836432112956e-05, + "loss": 2.8905, + "step": 20629 + }, + { + "epoch": 1.2806505680054627, + "grad_norm": 0.15921441383933646, + "learning_rate": 7.083314907140659e-05, + "loss": 2.8649, + "step": 20630 + }, + { + "epoch": 1.2807126451052206, + "grad_norm": 0.1613495322980654, + "learning_rate": 7.082986592116558e-05, + "loss": 2.863, + "step": 20631 + }, + { + "epoch": 1.2807747222049786, + "grad_norm": 0.17177351209434416, + "learning_rate": 7.082658266225012e-05, + "loss": 2.9129, + "step": 20632 + }, + { + "epoch": 1.2808367993047365, + "grad_norm": 0.16739584561577187, + "learning_rate": 7.082329929467731e-05, + "loss": 2.7856, + "step": 20633 + }, + { + "epoch": 1.2808988764044944, + "grad_norm": 0.16282106350473455, + "learning_rate": 7.082001581846429e-05, + "loss": 2.7994, + "step": 20634 + }, + { + "epoch": 1.2809609535042523, + "grad_norm": 0.17400267762968463, + "learning_rate": 7.081673223362819e-05, + "loss": 2.8784, + "step": 20635 + }, + { + "epoch": 1.2810230306040102, + "grad_norm": 0.17119470256933, + "learning_rate": 7.081344854018614e-05, + "loss": 2.8316, + "step": 20636 + }, + { + "epoch": 1.2810851077037682, + "grad_norm": 0.15981880552692126, + "learning_rate": 7.08101647381553e-05, + "loss": 2.871, + "step": 20637 + }, + { + "epoch": 1.281147184803526, + "grad_norm": 0.20782154321713026, + "learning_rate": 7.080688082755275e-05, + "loss": 2.8768, + "step": 20638 + }, + { + "epoch": 1.281209261903284, + "grad_norm": 0.18099013106631454, + "learning_rate": 7.080359680839566e-05, + "loss": 2.8407, + "step": 20639 + }, + { + "epoch": 1.281271339003042, + "grad_norm": 0.16693768532084666, + "learning_rate": 7.080031268070117e-05, + "loss": 2.8949, + "step": 20640 + }, + { + "epoch": 1.2813334161027996, + "grad_norm": 0.16615702273519356, + "learning_rate": 7.07970284444864e-05, + "loss": 2.8358, + "step": 20641 + }, + { + "epoch": 1.2813954932025575, + "grad_norm": 0.18159788921052175, + "learning_rate": 7.079374409976847e-05, + "loss": 2.6897, + "step": 20642 + }, + { + "epoch": 1.2814575703023154, + "grad_norm": 0.17129548407667308, + "learning_rate": 7.079045964656452e-05, + "loss": 2.8149, + "step": 20643 + }, + { + "epoch": 1.2815196474020734, + "grad_norm": 0.16746634376507025, + "learning_rate": 7.07871750848917e-05, + "loss": 2.8246, + "step": 20644 + }, + { + "epoch": 1.2815817245018313, + "grad_norm": 0.1662669859451277, + "learning_rate": 7.078389041476716e-05, + "loss": 2.8776, + "step": 20645 + }, + { + "epoch": 1.2816438016015892, + "grad_norm": 0.15419296797971208, + "learning_rate": 7.078060563620801e-05, + "loss": 2.8404, + "step": 20646 + }, + { + "epoch": 1.2817058787013471, + "grad_norm": 0.18553351303976964, + "learning_rate": 7.07773207492314e-05, + "loss": 2.7388, + "step": 20647 + }, + { + "epoch": 1.281767955801105, + "grad_norm": 0.15814452793328157, + "learning_rate": 7.077403575385445e-05, + "loss": 2.828, + "step": 20648 + }, + { + "epoch": 1.281830032900863, + "grad_norm": 0.17323546692242672, + "learning_rate": 7.077075065009433e-05, + "loss": 2.8817, + "step": 20649 + }, + { + "epoch": 1.2818921100006206, + "grad_norm": 0.19183230179728358, + "learning_rate": 7.076746543796814e-05, + "loss": 2.785, + "step": 20650 + }, + { + "epoch": 1.2819541871003786, + "grad_norm": 0.19944744447178825, + "learning_rate": 7.076418011749307e-05, + "loss": 2.9195, + "step": 20651 + }, + { + "epoch": 1.2820162642001365, + "grad_norm": 0.1617607193542635, + "learning_rate": 7.076089468868621e-05, + "loss": 2.7332, + "step": 20652 + }, + { + "epoch": 1.2820783412998944, + "grad_norm": 0.1910213083463156, + "learning_rate": 7.075760915156472e-05, + "loss": 2.9218, + "step": 20653 + }, + { + "epoch": 1.2821404183996523, + "grad_norm": 0.18058010934398097, + "learning_rate": 7.075432350614576e-05, + "loss": 2.8308, + "step": 20654 + }, + { + "epoch": 1.2822024954994102, + "grad_norm": 0.218185001690083, + "learning_rate": 7.075103775244643e-05, + "loss": 2.8554, + "step": 20655 + }, + { + "epoch": 1.2822645725991682, + "grad_norm": 0.19682225360911776, + "learning_rate": 7.074775189048392e-05, + "loss": 2.822, + "step": 20656 + }, + { + "epoch": 1.282326649698926, + "grad_norm": 0.16705651187689957, + "learning_rate": 7.074446592027536e-05, + "loss": 2.7436, + "step": 20657 + }, + { + "epoch": 1.282388726798684, + "grad_norm": 0.17873890975747272, + "learning_rate": 7.074117984183787e-05, + "loss": 2.901, + "step": 20658 + }, + { + "epoch": 1.282450803898442, + "grad_norm": 0.15760780103261926, + "learning_rate": 7.073789365518861e-05, + "loss": 2.7604, + "step": 20659 + }, + { + "epoch": 1.2825128809981998, + "grad_norm": 0.1837209098845981, + "learning_rate": 7.073460736034473e-05, + "loss": 2.9496, + "step": 20660 + }, + { + "epoch": 1.2825749580979577, + "grad_norm": 0.20444434188267765, + "learning_rate": 7.073132095732335e-05, + "loss": 2.8262, + "step": 20661 + }, + { + "epoch": 1.2826370351977157, + "grad_norm": 0.1810219184660622, + "learning_rate": 7.072803444614165e-05, + "loss": 2.9008, + "step": 20662 + }, + { + "epoch": 1.2826991122974736, + "grad_norm": 0.186872674630921, + "learning_rate": 7.072474782681677e-05, + "loss": 2.8292, + "step": 20663 + }, + { + "epoch": 1.2827611893972315, + "grad_norm": 0.18518413422262028, + "learning_rate": 7.072146109936585e-05, + "loss": 2.9118, + "step": 20664 + }, + { + "epoch": 1.2828232664969892, + "grad_norm": 0.196406153964897, + "learning_rate": 7.071817426380603e-05, + "loss": 2.8531, + "step": 20665 + }, + { + "epoch": 1.282885343596747, + "grad_norm": 0.1714233457112612, + "learning_rate": 7.071488732015446e-05, + "loss": 2.8822, + "step": 20666 + }, + { + "epoch": 1.282947420696505, + "grad_norm": 0.2324156079128556, + "learning_rate": 7.071160026842831e-05, + "loss": 2.8715, + "step": 20667 + }, + { + "epoch": 1.283009497796263, + "grad_norm": 0.1570117620455505, + "learning_rate": 7.07083131086447e-05, + "loss": 2.8993, + "step": 20668 + }, + { + "epoch": 1.2830715748960209, + "grad_norm": 0.21002536051466889, + "learning_rate": 7.070502584082081e-05, + "loss": 2.8824, + "step": 20669 + }, + { + "epoch": 1.2831336519957788, + "grad_norm": 0.1658424933238386, + "learning_rate": 7.070173846497377e-05, + "loss": 2.9236, + "step": 20670 + }, + { + "epoch": 1.2831957290955367, + "grad_norm": 0.1653191655533246, + "learning_rate": 7.069845098112071e-05, + "loss": 2.877, + "step": 20671 + }, + { + "epoch": 1.2832578061952946, + "grad_norm": 0.16891531595289305, + "learning_rate": 7.069516338927883e-05, + "loss": 2.8767, + "step": 20672 + }, + { + "epoch": 1.2833198832950525, + "grad_norm": 0.1696406207215097, + "learning_rate": 7.069187568946525e-05, + "loss": 3.0, + "step": 20673 + }, + { + "epoch": 1.2833819603948102, + "grad_norm": 0.175605808232696, + "learning_rate": 7.068858788169714e-05, + "loss": 2.9337, + "step": 20674 + }, + { + "epoch": 1.2834440374945681, + "grad_norm": 0.20046123428111648, + "learning_rate": 7.068529996599163e-05, + "loss": 2.8016, + "step": 20675 + }, + { + "epoch": 1.283506114594326, + "grad_norm": 0.18814093596742654, + "learning_rate": 7.06820119423659e-05, + "loss": 2.7919, + "step": 20676 + }, + { + "epoch": 1.283568191694084, + "grad_norm": 0.15915950694029546, + "learning_rate": 7.067872381083709e-05, + "loss": 2.843, + "step": 20677 + }, + { + "epoch": 1.283630268793842, + "grad_norm": 0.2099634037316548, + "learning_rate": 7.067543557142234e-05, + "loss": 2.7957, + "step": 20678 + }, + { + "epoch": 1.2836923458935998, + "grad_norm": 0.21682605031595997, + "learning_rate": 7.067214722413885e-05, + "loss": 2.8628, + "step": 20679 + }, + { + "epoch": 1.2837544229933577, + "grad_norm": 0.1580868281779308, + "learning_rate": 7.066885876900373e-05, + "loss": 2.829, + "step": 20680 + }, + { + "epoch": 1.2838165000931157, + "grad_norm": 0.15797091738216734, + "learning_rate": 7.066557020603418e-05, + "loss": 2.8627, + "step": 20681 + }, + { + "epoch": 1.2838785771928736, + "grad_norm": 0.20566680522368672, + "learning_rate": 7.066228153524733e-05, + "loss": 2.7813, + "step": 20682 + }, + { + "epoch": 1.2839406542926315, + "grad_norm": 0.18470039022551096, + "learning_rate": 7.065899275666033e-05, + "loss": 2.9038, + "step": 20683 + }, + { + "epoch": 1.2840027313923894, + "grad_norm": 0.24501119654966405, + "learning_rate": 7.065570387029033e-05, + "loss": 2.8912, + "step": 20684 + }, + { + "epoch": 1.2840648084921473, + "grad_norm": 0.17094057381113592, + "learning_rate": 7.065241487615455e-05, + "loss": 2.846, + "step": 20685 + }, + { + "epoch": 1.2841268855919052, + "grad_norm": 0.15547156738718848, + "learning_rate": 7.064912577427008e-05, + "loss": 2.8319, + "step": 20686 + }, + { + "epoch": 1.2841889626916632, + "grad_norm": 0.17825354739507407, + "learning_rate": 7.064583656465411e-05, + "loss": 2.921, + "step": 20687 + }, + { + "epoch": 1.284251039791421, + "grad_norm": 0.18187599929063125, + "learning_rate": 7.064254724732382e-05, + "loss": 2.8752, + "step": 20688 + }, + { + "epoch": 1.2843131168911788, + "grad_norm": 0.17311840150666463, + "learning_rate": 7.063925782229632e-05, + "loss": 2.8946, + "step": 20689 + }, + { + "epoch": 1.2843751939909367, + "grad_norm": 0.17212856618626232, + "learning_rate": 7.063596828958883e-05, + "loss": 2.8645, + "step": 20690 + }, + { + "epoch": 1.2844372710906946, + "grad_norm": 0.17818744259287786, + "learning_rate": 7.063267864921847e-05, + "loss": 2.8058, + "step": 20691 + }, + { + "epoch": 1.2844993481904525, + "grad_norm": 0.17507751846889308, + "learning_rate": 7.062938890120241e-05, + "loss": 2.843, + "step": 20692 + }, + { + "epoch": 1.2845614252902104, + "grad_norm": 0.18853838800948172, + "learning_rate": 7.062609904555782e-05, + "loss": 2.9027, + "step": 20693 + }, + { + "epoch": 1.2846235023899684, + "grad_norm": 0.1719425458829798, + "learning_rate": 7.062280908230188e-05, + "loss": 2.78, + "step": 20694 + }, + { + "epoch": 1.2846855794897263, + "grad_norm": 0.17347600719910194, + "learning_rate": 7.061951901145173e-05, + "loss": 2.8936, + "step": 20695 + }, + { + "epoch": 1.2847476565894842, + "grad_norm": 0.17772677688287222, + "learning_rate": 7.061622883302454e-05, + "loss": 2.9048, + "step": 20696 + }, + { + "epoch": 1.2848097336892421, + "grad_norm": 0.19124145614949817, + "learning_rate": 7.061293854703747e-05, + "loss": 2.8016, + "step": 20697 + }, + { + "epoch": 1.2848718107889998, + "grad_norm": 0.1567361778046832, + "learning_rate": 7.060964815350771e-05, + "loss": 2.8606, + "step": 20698 + }, + { + "epoch": 1.2849338878887577, + "grad_norm": 0.19362063049634093, + "learning_rate": 7.060635765245242e-05, + "loss": 2.8146, + "step": 20699 + }, + { + "epoch": 1.2849959649885156, + "grad_norm": 0.18498856721811605, + "learning_rate": 7.060306704388875e-05, + "loss": 2.8775, + "step": 20700 + }, + { + "epoch": 1.2850580420882736, + "grad_norm": 0.18498449930772812, + "learning_rate": 7.059977632783388e-05, + "loss": 2.935, + "step": 20701 + }, + { + "epoch": 1.2851201191880315, + "grad_norm": 0.17185980556256183, + "learning_rate": 7.059648550430497e-05, + "loss": 2.8055, + "step": 20702 + }, + { + "epoch": 1.2851821962877894, + "grad_norm": 0.2461742768916534, + "learning_rate": 7.05931945733192e-05, + "loss": 2.9121, + "step": 20703 + }, + { + "epoch": 1.2852442733875473, + "grad_norm": 0.6613177177570182, + "learning_rate": 7.058990353489372e-05, + "loss": 2.951, + "step": 20704 + }, + { + "epoch": 1.2853063504873052, + "grad_norm": 0.1960368362602571, + "learning_rate": 7.058661238904574e-05, + "loss": 2.7974, + "step": 20705 + }, + { + "epoch": 1.2853684275870632, + "grad_norm": 0.22207158272010052, + "learning_rate": 7.058332113579241e-05, + "loss": 2.7911, + "step": 20706 + }, + { + "epoch": 1.285430504686821, + "grad_norm": 0.20524057584412006, + "learning_rate": 7.058002977515087e-05, + "loss": 2.9067, + "step": 20707 + }, + { + "epoch": 1.285492581786579, + "grad_norm": 0.15096585382578723, + "learning_rate": 7.057673830713834e-05, + "loss": 2.8167, + "step": 20708 + }, + { + "epoch": 1.285554658886337, + "grad_norm": 0.1703517288997951, + "learning_rate": 7.057344673177195e-05, + "loss": 2.8914, + "step": 20709 + }, + { + "epoch": 1.2856167359860948, + "grad_norm": 0.17232931304876295, + "learning_rate": 7.05701550490689e-05, + "loss": 2.8387, + "step": 20710 + }, + { + "epoch": 1.2856788130858527, + "grad_norm": 0.1666735961834489, + "learning_rate": 7.056686325904636e-05, + "loss": 2.8823, + "step": 20711 + }, + { + "epoch": 1.2857408901856107, + "grad_norm": 0.15781669752751687, + "learning_rate": 7.056357136172152e-05, + "loss": 2.8448, + "step": 20712 + }, + { + "epoch": 1.2858029672853684, + "grad_norm": 0.16843484925995797, + "learning_rate": 7.056027935711152e-05, + "loss": 2.8023, + "step": 20713 + }, + { + "epoch": 1.2858650443851263, + "grad_norm": 0.20830661284433913, + "learning_rate": 7.055698724523354e-05, + "loss": 2.9028, + "step": 20714 + }, + { + "epoch": 1.2859271214848842, + "grad_norm": 0.15860226988809656, + "learning_rate": 7.055369502610478e-05, + "loss": 2.931, + "step": 20715 + }, + { + "epoch": 1.2859891985846421, + "grad_norm": 0.17380144150199806, + "learning_rate": 7.05504026997424e-05, + "loss": 2.9104, + "step": 20716 + }, + { + "epoch": 1.2860512756844, + "grad_norm": 0.18113385540577864, + "learning_rate": 7.054711026616359e-05, + "loss": 2.8714, + "step": 20717 + }, + { + "epoch": 1.286113352784158, + "grad_norm": 0.20112041630035243, + "learning_rate": 7.054381772538551e-05, + "loss": 2.9296, + "step": 20718 + }, + { + "epoch": 1.2861754298839159, + "grad_norm": 0.21337236791584036, + "learning_rate": 7.054052507742535e-05, + "loss": 2.8745, + "step": 20719 + }, + { + "epoch": 1.2862375069836738, + "grad_norm": 0.18364286780239425, + "learning_rate": 7.053723232230028e-05, + "loss": 2.7512, + "step": 20720 + }, + { + "epoch": 1.2862995840834317, + "grad_norm": 0.22173159995555855, + "learning_rate": 7.053393946002748e-05, + "loss": 2.8973, + "step": 20721 + }, + { + "epoch": 1.2863616611831894, + "grad_norm": 0.20190594821240795, + "learning_rate": 7.053064649062414e-05, + "loss": 2.9018, + "step": 20722 + }, + { + "epoch": 1.2864237382829473, + "grad_norm": 0.16880775099242404, + "learning_rate": 7.052735341410744e-05, + "loss": 2.8384, + "step": 20723 + }, + { + "epoch": 1.2864858153827052, + "grad_norm": 0.199465318569315, + "learning_rate": 7.052406023049453e-05, + "loss": 2.9273, + "step": 20724 + }, + { + "epoch": 1.2865478924824632, + "grad_norm": 0.18738028558995856, + "learning_rate": 7.052076693980264e-05, + "loss": 2.8645, + "step": 20725 + }, + { + "epoch": 1.286609969582221, + "grad_norm": 0.21391485719382358, + "learning_rate": 7.051747354204892e-05, + "loss": 2.9004, + "step": 20726 + }, + { + "epoch": 1.286672046681979, + "grad_norm": 0.23226244451126857, + "learning_rate": 7.051418003725054e-05, + "loss": 2.7982, + "step": 20727 + }, + { + "epoch": 1.286734123781737, + "grad_norm": 0.2141602220026377, + "learning_rate": 7.051088642542472e-05, + "loss": 2.8958, + "step": 20728 + }, + { + "epoch": 1.2867962008814948, + "grad_norm": 0.22951256681819102, + "learning_rate": 7.050759270658864e-05, + "loss": 2.8754, + "step": 20729 + }, + { + "epoch": 1.2868582779812527, + "grad_norm": 0.2087621487720007, + "learning_rate": 7.050429888075944e-05, + "loss": 2.7661, + "step": 20730 + }, + { + "epoch": 1.2869203550810107, + "grad_norm": 0.19419270897107718, + "learning_rate": 7.050100494795437e-05, + "loss": 2.9205, + "step": 20731 + }, + { + "epoch": 1.2869824321807686, + "grad_norm": 0.16956591866931564, + "learning_rate": 7.049771090819055e-05, + "loss": 2.8695, + "step": 20732 + }, + { + "epoch": 1.2870445092805265, + "grad_norm": 0.1716953444041066, + "learning_rate": 7.049441676148522e-05, + "loss": 2.8294, + "step": 20733 + }, + { + "epoch": 1.2871065863802844, + "grad_norm": 0.1650866095767657, + "learning_rate": 7.049112250785552e-05, + "loss": 2.8452, + "step": 20734 + }, + { + "epoch": 1.2871686634800423, + "grad_norm": 0.16286358551713556, + "learning_rate": 7.048782814731868e-05, + "loss": 2.91, + "step": 20735 + }, + { + "epoch": 1.2872307405798002, + "grad_norm": 0.15478616106447254, + "learning_rate": 7.048453367989185e-05, + "loss": 2.8209, + "step": 20736 + }, + { + "epoch": 1.287292817679558, + "grad_norm": 0.17270820944861773, + "learning_rate": 7.048123910559225e-05, + "loss": 2.9097, + "step": 20737 + }, + { + "epoch": 1.2873548947793159, + "grad_norm": 0.17993252053944978, + "learning_rate": 7.047794442443703e-05, + "loss": 2.9009, + "step": 20738 + }, + { + "epoch": 1.2874169718790738, + "grad_norm": 0.16438028802089472, + "learning_rate": 7.047464963644344e-05, + "loss": 2.7966, + "step": 20739 + }, + { + "epoch": 1.2874790489788317, + "grad_norm": 0.17643427349164614, + "learning_rate": 7.04713547416286e-05, + "loss": 2.9306, + "step": 20740 + }, + { + "epoch": 1.2875411260785896, + "grad_norm": 0.15248561635935434, + "learning_rate": 7.046805974000975e-05, + "loss": 2.7856, + "step": 20741 + }, + { + "epoch": 1.2876032031783475, + "grad_norm": 0.24603464537014316, + "learning_rate": 7.046476463160407e-05, + "loss": 2.8974, + "step": 20742 + }, + { + "epoch": 1.2876652802781055, + "grad_norm": 0.16543284394046404, + "learning_rate": 7.04614694164287e-05, + "loss": 2.8806, + "step": 20743 + }, + { + "epoch": 1.2877273573778634, + "grad_norm": 0.16030331581607518, + "learning_rate": 7.045817409450092e-05, + "loss": 2.9158, + "step": 20744 + }, + { + "epoch": 1.2877894344776213, + "grad_norm": 0.15213138499452405, + "learning_rate": 7.045487866583787e-05, + "loss": 2.8283, + "step": 20745 + }, + { + "epoch": 1.287851511577379, + "grad_norm": 0.15158136658387011, + "learning_rate": 7.045158313045674e-05, + "loss": 2.8362, + "step": 20746 + }, + { + "epoch": 1.287913588677137, + "grad_norm": 0.14629856404718072, + "learning_rate": 7.044828748837475e-05, + "loss": 2.7846, + "step": 20747 + }, + { + "epoch": 1.2879756657768948, + "grad_norm": 0.16315418802582976, + "learning_rate": 7.044499173960907e-05, + "loss": 2.8391, + "step": 20748 + }, + { + "epoch": 1.2880377428766527, + "grad_norm": 0.1550174978977657, + "learning_rate": 7.044169588417693e-05, + "loss": 2.8154, + "step": 20749 + }, + { + "epoch": 1.2880998199764107, + "grad_norm": 0.15500398564405748, + "learning_rate": 7.043839992209547e-05, + "loss": 2.7287, + "step": 20750 + }, + { + "epoch": 1.2881618970761686, + "grad_norm": 0.14544070699800973, + "learning_rate": 7.043510385338195e-05, + "loss": 2.8186, + "step": 20751 + }, + { + "epoch": 1.2882239741759265, + "grad_norm": 0.16627850755990461, + "learning_rate": 7.04318076780535e-05, + "loss": 2.7595, + "step": 20752 + }, + { + "epoch": 1.2882860512756844, + "grad_norm": 0.14912982782693204, + "learning_rate": 7.042851139612736e-05, + "loss": 2.8245, + "step": 20753 + }, + { + "epoch": 1.2883481283754423, + "grad_norm": 0.1786573465175852, + "learning_rate": 7.042521500762072e-05, + "loss": 2.9274, + "step": 20754 + }, + { + "epoch": 1.2884102054752002, + "grad_norm": 0.18068940268471068, + "learning_rate": 7.042191851255076e-05, + "loss": 2.8579, + "step": 20755 + }, + { + "epoch": 1.2884722825749582, + "grad_norm": 0.17259205988571538, + "learning_rate": 7.041862191093471e-05, + "loss": 2.9128, + "step": 20756 + }, + { + "epoch": 1.288534359674716, + "grad_norm": 0.15963764661026944, + "learning_rate": 7.041532520278976e-05, + "loss": 2.8486, + "step": 20757 + }, + { + "epoch": 1.288596436774474, + "grad_norm": 0.17198913209355496, + "learning_rate": 7.041202838813307e-05, + "loss": 2.8199, + "step": 20758 + }, + { + "epoch": 1.288658513874232, + "grad_norm": 0.14766357090880647, + "learning_rate": 7.04087314669819e-05, + "loss": 2.8202, + "step": 20759 + }, + { + "epoch": 1.2887205909739898, + "grad_norm": 0.19916116592826166, + "learning_rate": 7.040543443935342e-05, + "loss": 2.8466, + "step": 20760 + }, + { + "epoch": 1.2887826680737475, + "grad_norm": 0.18038474476888072, + "learning_rate": 7.040213730526482e-05, + "loss": 2.8855, + "step": 20761 + }, + { + "epoch": 1.2888447451735054, + "grad_norm": 0.14857949818317479, + "learning_rate": 7.039884006473334e-05, + "loss": 2.8695, + "step": 20762 + }, + { + "epoch": 1.2889068222732634, + "grad_norm": 0.16549136465642697, + "learning_rate": 7.039554271777615e-05, + "loss": 2.8706, + "step": 20763 + }, + { + "epoch": 1.2889688993730213, + "grad_norm": 0.16621926664468276, + "learning_rate": 7.039224526441047e-05, + "loss": 2.8863, + "step": 20764 + }, + { + "epoch": 1.2890309764727792, + "grad_norm": 0.15580097289813036, + "learning_rate": 7.038894770465346e-05, + "loss": 2.9108, + "step": 20765 + }, + { + "epoch": 1.2890930535725371, + "grad_norm": 0.1558058685107245, + "learning_rate": 7.03856500385224e-05, + "loss": 2.7955, + "step": 20766 + }, + { + "epoch": 1.289155130672295, + "grad_norm": 0.15615079830089598, + "learning_rate": 7.038235226603445e-05, + "loss": 2.8095, + "step": 20767 + }, + { + "epoch": 1.289217207772053, + "grad_norm": 0.16272222197923625, + "learning_rate": 7.03790543872068e-05, + "loss": 2.8495, + "step": 20768 + }, + { + "epoch": 1.2892792848718109, + "grad_norm": 0.1781764140298184, + "learning_rate": 7.037575640205669e-05, + "loss": 2.8945, + "step": 20769 + }, + { + "epoch": 1.2893413619715686, + "grad_norm": 0.16661143256910224, + "learning_rate": 7.037245831060131e-05, + "loss": 2.8348, + "step": 20770 + }, + { + "epoch": 1.2894034390713265, + "grad_norm": 0.1596494133109523, + "learning_rate": 7.036916011285788e-05, + "loss": 2.9088, + "step": 20771 + }, + { + "epoch": 1.2894655161710844, + "grad_norm": 0.15345796856442817, + "learning_rate": 7.036586180884358e-05, + "loss": 2.8436, + "step": 20772 + }, + { + "epoch": 1.2895275932708423, + "grad_norm": 0.15979573332196975, + "learning_rate": 7.036256339857564e-05, + "loss": 2.8602, + "step": 20773 + }, + { + "epoch": 1.2895896703706002, + "grad_norm": 0.1654801653129447, + "learning_rate": 7.035926488207127e-05, + "loss": 2.8446, + "step": 20774 + }, + { + "epoch": 1.2896517474703582, + "grad_norm": 0.1780851382275062, + "learning_rate": 7.035596625934767e-05, + "loss": 2.8768, + "step": 20775 + }, + { + "epoch": 1.289713824570116, + "grad_norm": 0.14529091715758385, + "learning_rate": 7.035266753042204e-05, + "loss": 2.9178, + "step": 20776 + }, + { + "epoch": 1.289775901669874, + "grad_norm": 0.1812287346927248, + "learning_rate": 7.034936869531161e-05, + "loss": 2.9495, + "step": 20777 + }, + { + "epoch": 1.289837978769632, + "grad_norm": 0.16285000941944416, + "learning_rate": 7.03460697540336e-05, + "loss": 2.8265, + "step": 20778 + }, + { + "epoch": 1.2899000558693898, + "grad_norm": 0.15201738440355814, + "learning_rate": 7.034277070660518e-05, + "loss": 2.8587, + "step": 20779 + }, + { + "epoch": 1.2899621329691477, + "grad_norm": 0.14410201333467396, + "learning_rate": 7.033947155304361e-05, + "loss": 2.8314, + "step": 20780 + }, + { + "epoch": 1.2900242100689057, + "grad_norm": 0.14808603765212094, + "learning_rate": 7.033617229336606e-05, + "loss": 2.8177, + "step": 20781 + }, + { + "epoch": 1.2900862871686636, + "grad_norm": 0.14713101394502803, + "learning_rate": 7.033287292758977e-05, + "loss": 2.8611, + "step": 20782 + }, + { + "epoch": 1.2901483642684215, + "grad_norm": 0.1671831589429333, + "learning_rate": 7.032957345573194e-05, + "loss": 2.8426, + "step": 20783 + }, + { + "epoch": 1.2902104413681794, + "grad_norm": 0.1448186967225265, + "learning_rate": 7.03262738778098e-05, + "loss": 2.8402, + "step": 20784 + }, + { + "epoch": 1.2902725184679371, + "grad_norm": 0.17510701253969835, + "learning_rate": 7.032297419384056e-05, + "loss": 2.8408, + "step": 20785 + }, + { + "epoch": 1.290334595567695, + "grad_norm": 0.16838576426432308, + "learning_rate": 7.031967440384142e-05, + "loss": 2.9231, + "step": 20786 + }, + { + "epoch": 1.290396672667453, + "grad_norm": 0.16317676824406768, + "learning_rate": 7.03163745078296e-05, + "loss": 2.9253, + "step": 20787 + }, + { + "epoch": 1.2904587497672109, + "grad_norm": 0.16268554809321173, + "learning_rate": 7.031307450582233e-05, + "loss": 2.8958, + "step": 20788 + }, + { + "epoch": 1.2905208268669688, + "grad_norm": 0.1515941087954051, + "learning_rate": 7.030977439783681e-05, + "loss": 2.9192, + "step": 20789 + }, + { + "epoch": 1.2905829039667267, + "grad_norm": 0.1760735465749392, + "learning_rate": 7.030647418389027e-05, + "loss": 2.9133, + "step": 20790 + }, + { + "epoch": 1.2906449810664846, + "grad_norm": 0.1584457667769204, + "learning_rate": 7.030317386399993e-05, + "loss": 2.9348, + "step": 20791 + }, + { + "epoch": 1.2907070581662425, + "grad_norm": 0.15109587935137622, + "learning_rate": 7.029987343818301e-05, + "loss": 2.8036, + "step": 20792 + }, + { + "epoch": 1.2907691352660002, + "grad_norm": 0.16060984158700978, + "learning_rate": 7.029657290645671e-05, + "loss": 2.8588, + "step": 20793 + }, + { + "epoch": 1.2908312123657582, + "grad_norm": 0.17572069714590322, + "learning_rate": 7.029327226883828e-05, + "loss": 2.7465, + "step": 20794 + }, + { + "epoch": 1.290893289465516, + "grad_norm": 0.1617896477463366, + "learning_rate": 7.02899715253449e-05, + "loss": 2.9179, + "step": 20795 + }, + { + "epoch": 1.290955366565274, + "grad_norm": 0.17807212638809788, + "learning_rate": 7.028667067599382e-05, + "loss": 2.8065, + "step": 20796 + }, + { + "epoch": 1.291017443665032, + "grad_norm": 0.17001007246583258, + "learning_rate": 7.028336972080225e-05, + "loss": 2.9076, + "step": 20797 + }, + { + "epoch": 1.2910795207647898, + "grad_norm": 0.18669602762610432, + "learning_rate": 7.028006865978743e-05, + "loss": 2.815, + "step": 20798 + }, + { + "epoch": 1.2911415978645477, + "grad_norm": 0.1543673221709755, + "learning_rate": 7.027676749296655e-05, + "loss": 2.8393, + "step": 20799 + }, + { + "epoch": 1.2912036749643057, + "grad_norm": 0.16482957263744402, + "learning_rate": 7.027346622035687e-05, + "loss": 2.7568, + "step": 20800 + }, + { + "epoch": 1.2912657520640636, + "grad_norm": 0.16506888893167526, + "learning_rate": 7.027016484197559e-05, + "loss": 2.8072, + "step": 20801 + }, + { + "epoch": 1.2913278291638215, + "grad_norm": 0.16296994201538753, + "learning_rate": 7.026686335783994e-05, + "loss": 2.8138, + "step": 20802 + }, + { + "epoch": 1.2913899062635794, + "grad_norm": 0.17999627700571488, + "learning_rate": 7.026356176796713e-05, + "loss": 2.8067, + "step": 20803 + }, + { + "epoch": 1.2914519833633373, + "grad_norm": 0.17355698430518235, + "learning_rate": 7.026026007237442e-05, + "loss": 2.818, + "step": 20804 + }, + { + "epoch": 1.2915140604630952, + "grad_norm": 0.19073755857470626, + "learning_rate": 7.025695827107901e-05, + "loss": 2.8663, + "step": 20805 + }, + { + "epoch": 1.2915761375628532, + "grad_norm": 0.19392834384595498, + "learning_rate": 7.025365636409813e-05, + "loss": 2.8984, + "step": 20806 + }, + { + "epoch": 1.291638214662611, + "grad_norm": 0.20699018394996493, + "learning_rate": 7.0250354351449e-05, + "loss": 2.7878, + "step": 20807 + }, + { + "epoch": 1.2917002917623688, + "grad_norm": 0.17962911942828022, + "learning_rate": 7.024705223314884e-05, + "loss": 2.8283, + "step": 20808 + }, + { + "epoch": 1.2917623688621267, + "grad_norm": 0.2935816647740885, + "learning_rate": 7.024375000921493e-05, + "loss": 2.8109, + "step": 20809 + }, + { + "epoch": 1.2918244459618846, + "grad_norm": 0.1625023525324506, + "learning_rate": 7.024044767966445e-05, + "loss": 2.8514, + "step": 20810 + }, + { + "epoch": 1.2918865230616425, + "grad_norm": 0.18030502108793425, + "learning_rate": 7.023714524451463e-05, + "loss": 2.9731, + "step": 20811 + }, + { + "epoch": 1.2919486001614005, + "grad_norm": 0.1658541679643074, + "learning_rate": 7.023384270378272e-05, + "loss": 2.8652, + "step": 20812 + }, + { + "epoch": 1.2920106772611584, + "grad_norm": 0.22626730992293295, + "learning_rate": 7.023054005748592e-05, + "loss": 2.859, + "step": 20813 + }, + { + "epoch": 1.2920727543609163, + "grad_norm": 0.19894600879151117, + "learning_rate": 7.022723730564152e-05, + "loss": 2.8835, + "step": 20814 + }, + { + "epoch": 1.2921348314606742, + "grad_norm": 0.18460194755590564, + "learning_rate": 7.022393444826668e-05, + "loss": 2.9381, + "step": 20815 + }, + { + "epoch": 1.2921969085604321, + "grad_norm": 0.19031504579228772, + "learning_rate": 7.022063148537868e-05, + "loss": 2.886, + "step": 20816 + }, + { + "epoch": 1.2922589856601898, + "grad_norm": 0.1790637509899812, + "learning_rate": 7.021732841699473e-05, + "loss": 2.8527, + "step": 20817 + }, + { + "epoch": 1.2923210627599477, + "grad_norm": 0.15799695580038145, + "learning_rate": 7.021402524313208e-05, + "loss": 2.8456, + "step": 20818 + }, + { + "epoch": 1.2923831398597057, + "grad_norm": 0.18597874825019842, + "learning_rate": 7.021072196380793e-05, + "loss": 2.895, + "step": 20819 + }, + { + "epoch": 1.2924452169594636, + "grad_norm": 0.17278879178926265, + "learning_rate": 7.020741857903957e-05, + "loss": 2.7794, + "step": 20820 + }, + { + "epoch": 1.2925072940592215, + "grad_norm": 0.15837569302766308, + "learning_rate": 7.020411508884417e-05, + "loss": 2.8393, + "step": 20821 + }, + { + "epoch": 1.2925693711589794, + "grad_norm": 0.15530957095456327, + "learning_rate": 7.020081149323902e-05, + "loss": 2.9616, + "step": 20822 + }, + { + "epoch": 1.2926314482587373, + "grad_norm": 0.16402280418090218, + "learning_rate": 7.019750779224133e-05, + "loss": 2.9106, + "step": 20823 + }, + { + "epoch": 1.2926935253584952, + "grad_norm": 0.2141356851257798, + "learning_rate": 7.019420398586832e-05, + "loss": 2.8959, + "step": 20824 + }, + { + "epoch": 1.2927556024582532, + "grad_norm": 0.15858455875616181, + "learning_rate": 7.019090007413726e-05, + "loss": 2.9663, + "step": 20825 + }, + { + "epoch": 1.292817679558011, + "grad_norm": 0.16919039441131725, + "learning_rate": 7.018759605706537e-05, + "loss": 2.8517, + "step": 20826 + }, + { + "epoch": 1.292879756657769, + "grad_norm": 0.21164505228373554, + "learning_rate": 7.018429193466988e-05, + "loss": 2.9093, + "step": 20827 + }, + { + "epoch": 1.292941833757527, + "grad_norm": 0.19394818637097788, + "learning_rate": 7.018098770696806e-05, + "loss": 2.8772, + "step": 20828 + }, + { + "epoch": 1.2930039108572848, + "grad_norm": 0.20983895595297897, + "learning_rate": 7.017768337397712e-05, + "loss": 2.8578, + "step": 20829 + }, + { + "epoch": 1.2930659879570428, + "grad_norm": 0.15647877991225234, + "learning_rate": 7.01743789357143e-05, + "loss": 2.8215, + "step": 20830 + }, + { + "epoch": 1.2931280650568007, + "grad_norm": 0.1707622312267353, + "learning_rate": 7.017107439219686e-05, + "loss": 2.8159, + "step": 20831 + }, + { + "epoch": 1.2931901421565584, + "grad_norm": 0.16180891320072005, + "learning_rate": 7.016776974344202e-05, + "loss": 2.8564, + "step": 20832 + }, + { + "epoch": 1.2932522192563163, + "grad_norm": 0.15192205802846034, + "learning_rate": 7.016446498946703e-05, + "loss": 2.8758, + "step": 20833 + }, + { + "epoch": 1.2933142963560742, + "grad_norm": 0.1550028981237808, + "learning_rate": 7.016116013028914e-05, + "loss": 2.9376, + "step": 20834 + }, + { + "epoch": 1.2933763734558321, + "grad_norm": 0.1715199802860812, + "learning_rate": 7.015785516592557e-05, + "loss": 2.8151, + "step": 20835 + }, + { + "epoch": 1.29343845055559, + "grad_norm": 0.22002090236949412, + "learning_rate": 7.015455009639357e-05, + "loss": 2.8331, + "step": 20836 + }, + { + "epoch": 1.293500527655348, + "grad_norm": 0.15694664982131912, + "learning_rate": 7.015124492171041e-05, + "loss": 2.8868, + "step": 20837 + }, + { + "epoch": 1.2935626047551059, + "grad_norm": 0.18826319421957174, + "learning_rate": 7.01479396418933e-05, + "loss": 2.9167, + "step": 20838 + }, + { + "epoch": 1.2936246818548638, + "grad_norm": 0.1722543226154376, + "learning_rate": 7.01446342569595e-05, + "loss": 2.9176, + "step": 20839 + }, + { + "epoch": 1.2936867589546217, + "grad_norm": 0.14948643072490533, + "learning_rate": 7.014132876692626e-05, + "loss": 2.8233, + "step": 20840 + }, + { + "epoch": 1.2937488360543794, + "grad_norm": 0.25740747715420265, + "learning_rate": 7.013802317181082e-05, + "loss": 2.7596, + "step": 20841 + }, + { + "epoch": 1.2938109131541373, + "grad_norm": 0.15779690334940424, + "learning_rate": 7.013471747163041e-05, + "loss": 2.8908, + "step": 20842 + }, + { + "epoch": 1.2938729902538952, + "grad_norm": 0.1879443426925794, + "learning_rate": 7.01314116664023e-05, + "loss": 2.8498, + "step": 20843 + }, + { + "epoch": 1.2939350673536532, + "grad_norm": 0.15737922436185572, + "learning_rate": 7.012810575614373e-05, + "loss": 2.871, + "step": 20844 + }, + { + "epoch": 1.293997144453411, + "grad_norm": 0.15735031375623415, + "learning_rate": 7.012479974087194e-05, + "loss": 2.9265, + "step": 20845 + }, + { + "epoch": 1.294059221553169, + "grad_norm": 0.16546825040538318, + "learning_rate": 7.012149362060418e-05, + "loss": 2.8843, + "step": 20846 + }, + { + "epoch": 1.294121298652927, + "grad_norm": 0.1767440284707655, + "learning_rate": 7.011818739535772e-05, + "loss": 2.8256, + "step": 20847 + }, + { + "epoch": 1.2941833757526848, + "grad_norm": 0.16635648292162317, + "learning_rate": 7.011488106514978e-05, + "loss": 2.7727, + "step": 20848 + }, + { + "epoch": 1.2942454528524427, + "grad_norm": 0.18822181365008686, + "learning_rate": 7.011157462999762e-05, + "loss": 2.8941, + "step": 20849 + }, + { + "epoch": 1.2943075299522007, + "grad_norm": 0.1681875422940862, + "learning_rate": 7.010826808991851e-05, + "loss": 2.8798, + "step": 20850 + }, + { + "epoch": 1.2943696070519586, + "grad_norm": 0.15710900336534125, + "learning_rate": 7.010496144492966e-05, + "loss": 2.9214, + "step": 20851 + }, + { + "epoch": 1.2944316841517165, + "grad_norm": 0.15809868673725286, + "learning_rate": 7.010165469504836e-05, + "loss": 2.7861, + "step": 20852 + }, + { + "epoch": 1.2944937612514744, + "grad_norm": 0.16371321395095914, + "learning_rate": 7.009834784029184e-05, + "loss": 2.8735, + "step": 20853 + }, + { + "epoch": 1.2945558383512323, + "grad_norm": 0.16424664880144052, + "learning_rate": 7.009504088067738e-05, + "loss": 2.8604, + "step": 20854 + }, + { + "epoch": 1.2946179154509903, + "grad_norm": 0.18058028469525458, + "learning_rate": 7.00917338162222e-05, + "loss": 2.8327, + "step": 20855 + }, + { + "epoch": 1.294679992550748, + "grad_norm": 0.18056627668641198, + "learning_rate": 7.008842664694356e-05, + "loss": 2.8849, + "step": 20856 + }, + { + "epoch": 1.2947420696505059, + "grad_norm": 0.17613281485193133, + "learning_rate": 7.008511937285873e-05, + "loss": 2.8052, + "step": 20857 + }, + { + "epoch": 1.2948041467502638, + "grad_norm": 0.1706570691932697, + "learning_rate": 7.008181199398494e-05, + "loss": 2.9239, + "step": 20858 + }, + { + "epoch": 1.2948662238500217, + "grad_norm": 0.19484335822071372, + "learning_rate": 7.007850451033949e-05, + "loss": 2.8026, + "step": 20859 + }, + { + "epoch": 1.2949283009497796, + "grad_norm": 0.1769710434858147, + "learning_rate": 7.007519692193958e-05, + "loss": 2.9288, + "step": 20860 + }, + { + "epoch": 1.2949903780495375, + "grad_norm": 0.174935233529071, + "learning_rate": 7.007188922880252e-05, + "loss": 2.9676, + "step": 20861 + }, + { + "epoch": 1.2950524551492955, + "grad_norm": 0.24230122203522675, + "learning_rate": 7.006858143094553e-05, + "loss": 2.8308, + "step": 20862 + }, + { + "epoch": 1.2951145322490534, + "grad_norm": 0.1544695684510469, + "learning_rate": 7.006527352838588e-05, + "loss": 2.8454, + "step": 20863 + }, + { + "epoch": 1.2951766093488113, + "grad_norm": 0.15192547867818665, + "learning_rate": 7.006196552114081e-05, + "loss": 2.8738, + "step": 20864 + }, + { + "epoch": 1.295238686448569, + "grad_norm": 0.15427564523208379, + "learning_rate": 7.005865740922763e-05, + "loss": 2.8322, + "step": 20865 + }, + { + "epoch": 1.295300763548327, + "grad_norm": 0.14909941759932796, + "learning_rate": 7.005534919266355e-05, + "loss": 2.874, + "step": 20866 + }, + { + "epoch": 1.2953628406480848, + "grad_norm": 0.16976863155344707, + "learning_rate": 7.005204087146582e-05, + "loss": 2.8284, + "step": 20867 + }, + { + "epoch": 1.2954249177478427, + "grad_norm": 0.14903398553957714, + "learning_rate": 7.004873244565176e-05, + "loss": 2.8631, + "step": 20868 + }, + { + "epoch": 1.2954869948476007, + "grad_norm": 0.16039066488429937, + "learning_rate": 7.004542391523858e-05, + "loss": 2.8793, + "step": 20869 + }, + { + "epoch": 1.2955490719473586, + "grad_norm": 0.15981258177950977, + "learning_rate": 7.004211528024355e-05, + "loss": 2.7531, + "step": 20870 + }, + { + "epoch": 1.2956111490471165, + "grad_norm": 0.16599234810501495, + "learning_rate": 7.003880654068395e-05, + "loss": 2.9312, + "step": 20871 + }, + { + "epoch": 1.2956732261468744, + "grad_norm": 0.13817578890160978, + "learning_rate": 7.003549769657703e-05, + "loss": 2.7662, + "step": 20872 + }, + { + "epoch": 1.2957353032466323, + "grad_norm": 0.1611100267425398, + "learning_rate": 7.003218874794006e-05, + "loss": 2.7954, + "step": 20873 + }, + { + "epoch": 1.2957973803463902, + "grad_norm": 0.16716242361515543, + "learning_rate": 7.002887969479028e-05, + "loss": 2.9181, + "step": 20874 + }, + { + "epoch": 1.2958594574461482, + "grad_norm": 0.15549162763452526, + "learning_rate": 7.002557053714499e-05, + "loss": 2.7865, + "step": 20875 + }, + { + "epoch": 1.295921534545906, + "grad_norm": 0.1473226530418455, + "learning_rate": 7.002226127502142e-05, + "loss": 2.8735, + "step": 20876 + }, + { + "epoch": 1.295983611645664, + "grad_norm": 0.1519704487690158, + "learning_rate": 7.001895190843689e-05, + "loss": 2.8977, + "step": 20877 + }, + { + "epoch": 1.296045688745422, + "grad_norm": 0.1623218017809443, + "learning_rate": 7.001564243740858e-05, + "loss": 2.8773, + "step": 20878 + }, + { + "epoch": 1.2961077658451798, + "grad_norm": 0.1987925814352561, + "learning_rate": 7.001233286195383e-05, + "loss": 2.7615, + "step": 20879 + }, + { + "epoch": 1.2961698429449375, + "grad_norm": 0.14798573084336894, + "learning_rate": 7.000902318208989e-05, + "loss": 2.8324, + "step": 20880 + }, + { + "epoch": 1.2962319200446955, + "grad_norm": 0.14210706783391033, + "learning_rate": 7.000571339783399e-05, + "loss": 2.8912, + "step": 20881 + }, + { + "epoch": 1.2962939971444534, + "grad_norm": 0.17207244889040477, + "learning_rate": 7.000240350920344e-05, + "loss": 2.8783, + "step": 20882 + }, + { + "epoch": 1.2963560742442113, + "grad_norm": 0.18721083357108087, + "learning_rate": 6.99990935162155e-05, + "loss": 2.9118, + "step": 20883 + }, + { + "epoch": 1.2964181513439692, + "grad_norm": 0.14418392073897612, + "learning_rate": 6.999578341888746e-05, + "loss": 2.8015, + "step": 20884 + }, + { + "epoch": 1.2964802284437271, + "grad_norm": 0.1504005329782092, + "learning_rate": 6.999247321723652e-05, + "loss": 2.8941, + "step": 20885 + }, + { + "epoch": 1.296542305543485, + "grad_norm": 0.14456258897988913, + "learning_rate": 6.998916291128002e-05, + "loss": 2.8402, + "step": 20886 + }, + { + "epoch": 1.296604382643243, + "grad_norm": 0.18044557505179445, + "learning_rate": 6.998585250103519e-05, + "loss": 2.8517, + "step": 20887 + }, + { + "epoch": 1.2966664597430009, + "grad_norm": 0.15497486630789523, + "learning_rate": 6.998254198651933e-05, + "loss": 2.8599, + "step": 20888 + }, + { + "epoch": 1.2967285368427586, + "grad_norm": 0.168306986010549, + "learning_rate": 6.997923136774968e-05, + "loss": 2.9561, + "step": 20889 + }, + { + "epoch": 1.2967906139425165, + "grad_norm": 0.19748138676527052, + "learning_rate": 6.997592064474353e-05, + "loss": 2.8612, + "step": 20890 + }, + { + "epoch": 1.2968526910422744, + "grad_norm": 0.16995859297492277, + "learning_rate": 6.997260981751816e-05, + "loss": 2.8715, + "step": 20891 + }, + { + "epoch": 1.2969147681420323, + "grad_norm": 0.2049336151326048, + "learning_rate": 6.996929888609082e-05, + "loss": 2.8566, + "step": 20892 + }, + { + "epoch": 1.2969768452417902, + "grad_norm": 0.1792349922582576, + "learning_rate": 6.996598785047882e-05, + "loss": 2.8933, + "step": 20893 + }, + { + "epoch": 1.2970389223415482, + "grad_norm": 0.1654933286842813, + "learning_rate": 6.996267671069943e-05, + "loss": 2.8558, + "step": 20894 + }, + { + "epoch": 1.297100999441306, + "grad_norm": 0.16465083852444953, + "learning_rate": 6.995936546676988e-05, + "loss": 2.9128, + "step": 20895 + }, + { + "epoch": 1.297163076541064, + "grad_norm": 0.18386053424611418, + "learning_rate": 6.995605411870749e-05, + "loss": 2.842, + "step": 20896 + }, + { + "epoch": 1.297225153640822, + "grad_norm": 0.1649458638972798, + "learning_rate": 6.995274266652952e-05, + "loss": 2.7749, + "step": 20897 + }, + { + "epoch": 1.2972872307405798, + "grad_norm": 0.18701435814332062, + "learning_rate": 6.994943111025324e-05, + "loss": 2.8902, + "step": 20898 + }, + { + "epoch": 1.2973493078403378, + "grad_norm": 0.17756367964925052, + "learning_rate": 6.994611944989594e-05, + "loss": 2.8189, + "step": 20899 + }, + { + "epoch": 1.2974113849400957, + "grad_norm": 0.15020371873643598, + "learning_rate": 6.99428076854749e-05, + "loss": 2.7917, + "step": 20900 + }, + { + "epoch": 1.2974734620398536, + "grad_norm": 0.1672436789129253, + "learning_rate": 6.993949581700739e-05, + "loss": 2.8845, + "step": 20901 + }, + { + "epoch": 1.2975355391396115, + "grad_norm": 0.16296721037103581, + "learning_rate": 6.993618384451069e-05, + "loss": 2.7893, + "step": 20902 + }, + { + "epoch": 1.2975976162393694, + "grad_norm": 0.2074767753087756, + "learning_rate": 6.993287176800205e-05, + "loss": 2.8602, + "step": 20903 + }, + { + "epoch": 1.2976596933391271, + "grad_norm": 0.18253359181724485, + "learning_rate": 6.992955958749882e-05, + "loss": 2.8358, + "step": 20904 + }, + { + "epoch": 1.297721770438885, + "grad_norm": 0.1892701081020784, + "learning_rate": 6.992624730301821e-05, + "loss": 2.7821, + "step": 20905 + }, + { + "epoch": 1.297783847538643, + "grad_norm": 0.20170301854719372, + "learning_rate": 6.992293491457753e-05, + "loss": 2.9006, + "step": 20906 + }, + { + "epoch": 1.2978459246384009, + "grad_norm": 0.17550225900081903, + "learning_rate": 6.991962242219407e-05, + "loss": 2.8101, + "step": 20907 + }, + { + "epoch": 1.2979080017381588, + "grad_norm": 0.17762813956340368, + "learning_rate": 6.991630982588512e-05, + "loss": 2.8344, + "step": 20908 + }, + { + "epoch": 1.2979700788379167, + "grad_norm": 0.19436129687294648, + "learning_rate": 6.991299712566794e-05, + "loss": 2.886, + "step": 20909 + }, + { + "epoch": 1.2980321559376746, + "grad_norm": 0.1614925184258944, + "learning_rate": 6.990968432155981e-05, + "loss": 2.843, + "step": 20910 + }, + { + "epoch": 1.2980942330374325, + "grad_norm": 0.17159762542327658, + "learning_rate": 6.990637141357803e-05, + "loss": 2.8796, + "step": 20911 + }, + { + "epoch": 1.2981563101371905, + "grad_norm": 0.24148393541466975, + "learning_rate": 6.990305840173988e-05, + "loss": 2.7798, + "step": 20912 + }, + { + "epoch": 1.2982183872369482, + "grad_norm": 0.1752586649957857, + "learning_rate": 6.989974528606263e-05, + "loss": 2.8464, + "step": 20913 + }, + { + "epoch": 1.298280464336706, + "grad_norm": 0.17302452743930383, + "learning_rate": 6.98964320665636e-05, + "loss": 2.7739, + "step": 20914 + }, + { + "epoch": 1.298342541436464, + "grad_norm": 0.18012592483934306, + "learning_rate": 6.989311874326003e-05, + "loss": 2.8335, + "step": 20915 + }, + { + "epoch": 1.298404618536222, + "grad_norm": 0.20475813105673868, + "learning_rate": 6.988980531616924e-05, + "loss": 2.8728, + "step": 20916 + }, + { + "epoch": 1.2984666956359798, + "grad_norm": 0.1629990957522469, + "learning_rate": 6.988649178530852e-05, + "loss": 2.8767, + "step": 20917 + }, + { + "epoch": 1.2985287727357377, + "grad_norm": 0.22675104352917014, + "learning_rate": 6.988317815069512e-05, + "loss": 2.9397, + "step": 20918 + }, + { + "epoch": 1.2985908498354957, + "grad_norm": 0.1730201351784024, + "learning_rate": 6.987986441234638e-05, + "loss": 2.9636, + "step": 20919 + }, + { + "epoch": 1.2986529269352536, + "grad_norm": 0.1890377919154028, + "learning_rate": 6.987655057027956e-05, + "loss": 2.8538, + "step": 20920 + }, + { + "epoch": 1.2987150040350115, + "grad_norm": 0.1696538016366931, + "learning_rate": 6.987323662451192e-05, + "loss": 2.8025, + "step": 20921 + }, + { + "epoch": 1.2987770811347694, + "grad_norm": 0.17198898588826922, + "learning_rate": 6.98699225750608e-05, + "loss": 2.8441, + "step": 20922 + }, + { + "epoch": 1.2988391582345273, + "grad_norm": 0.15537149703802475, + "learning_rate": 6.986660842194345e-05, + "loss": 2.9519, + "step": 20923 + }, + { + "epoch": 1.2989012353342853, + "grad_norm": 0.18344139311069432, + "learning_rate": 6.986329416517721e-05, + "loss": 2.8732, + "step": 20924 + }, + { + "epoch": 1.2989633124340432, + "grad_norm": 0.15720918808181572, + "learning_rate": 6.985997980477933e-05, + "loss": 2.7616, + "step": 20925 + }, + { + "epoch": 1.299025389533801, + "grad_norm": 0.182683922308788, + "learning_rate": 6.985666534076712e-05, + "loss": 2.9249, + "step": 20926 + }, + { + "epoch": 1.299087466633559, + "grad_norm": 0.17088790316130614, + "learning_rate": 6.985335077315787e-05, + "loss": 2.8792, + "step": 20927 + }, + { + "epoch": 1.2991495437333167, + "grad_norm": 0.19062183174252006, + "learning_rate": 6.985003610196886e-05, + "loss": 2.781, + "step": 20928 + }, + { + "epoch": 1.2992116208330746, + "grad_norm": 0.18865508682084486, + "learning_rate": 6.98467213272174e-05, + "loss": 2.9459, + "step": 20929 + }, + { + "epoch": 1.2992736979328325, + "grad_norm": 0.18105998602462658, + "learning_rate": 6.984340644892076e-05, + "loss": 2.9244, + "step": 20930 + }, + { + "epoch": 1.2993357750325905, + "grad_norm": 0.16212251908662184, + "learning_rate": 6.984009146709628e-05, + "loss": 2.7639, + "step": 20931 + }, + { + "epoch": 1.2993978521323484, + "grad_norm": 0.18089059597310717, + "learning_rate": 6.98367763817612e-05, + "loss": 2.8564, + "step": 20932 + }, + { + "epoch": 1.2994599292321063, + "grad_norm": 0.16683347867682444, + "learning_rate": 6.983346119293285e-05, + "loss": 2.7749, + "step": 20933 + }, + { + "epoch": 1.2995220063318642, + "grad_norm": 0.19740460476137628, + "learning_rate": 6.983014590062851e-05, + "loss": 2.9011, + "step": 20934 + }, + { + "epoch": 1.2995840834316221, + "grad_norm": 0.16814917477445782, + "learning_rate": 6.98268305048655e-05, + "loss": 2.8095, + "step": 20935 + }, + { + "epoch": 1.29964616053138, + "grad_norm": 0.1776962553792992, + "learning_rate": 6.98235150056611e-05, + "loss": 2.8839, + "step": 20936 + }, + { + "epoch": 1.2997082376311377, + "grad_norm": 0.1520152830264542, + "learning_rate": 6.98201994030326e-05, + "loss": 2.8643, + "step": 20937 + }, + { + "epoch": 1.2997703147308957, + "grad_norm": 0.17460138336770542, + "learning_rate": 6.981688369699733e-05, + "loss": 2.8332, + "step": 20938 + }, + { + "epoch": 1.2998323918306536, + "grad_norm": 0.2178232709035826, + "learning_rate": 6.981356788757253e-05, + "loss": 2.7143, + "step": 20939 + }, + { + "epoch": 1.2998944689304115, + "grad_norm": 0.16697941558360843, + "learning_rate": 6.981025197477557e-05, + "loss": 2.8902, + "step": 20940 + }, + { + "epoch": 1.2999565460301694, + "grad_norm": 0.1703502090512301, + "learning_rate": 6.98069359586237e-05, + "loss": 2.9037, + "step": 20941 + }, + { + "epoch": 1.3000186231299273, + "grad_norm": 0.16848703752940017, + "learning_rate": 6.980361983913424e-05, + "loss": 2.7396, + "step": 20942 + }, + { + "epoch": 1.3000807002296852, + "grad_norm": 0.2252431221572406, + "learning_rate": 6.980030361632448e-05, + "loss": 2.8301, + "step": 20943 + }, + { + "epoch": 1.3001427773294432, + "grad_norm": 0.15703333952922696, + "learning_rate": 6.979698729021175e-05, + "loss": 2.9051, + "step": 20944 + }, + { + "epoch": 1.300204854429201, + "grad_norm": 0.1571723180988944, + "learning_rate": 6.979367086081333e-05, + "loss": 2.8768, + "step": 20945 + }, + { + "epoch": 1.300266931528959, + "grad_norm": 0.15064332217454984, + "learning_rate": 6.97903543281465e-05, + "loss": 2.9337, + "step": 20946 + }, + { + "epoch": 1.300329008628717, + "grad_norm": 0.14625136598943542, + "learning_rate": 6.978703769222861e-05, + "loss": 2.7725, + "step": 20947 + }, + { + "epoch": 1.3003910857284748, + "grad_norm": 0.15340372720005044, + "learning_rate": 6.978372095307692e-05, + "loss": 2.7867, + "step": 20948 + }, + { + "epoch": 1.3004531628282328, + "grad_norm": 0.15193578920299572, + "learning_rate": 6.978040411070879e-05, + "loss": 2.9008, + "step": 20949 + }, + { + "epoch": 1.3005152399279907, + "grad_norm": 0.1493237789871159, + "learning_rate": 6.977708716514146e-05, + "loss": 2.8617, + "step": 20950 + }, + { + "epoch": 1.3005773170277486, + "grad_norm": 0.14760261323385537, + "learning_rate": 6.977377011639227e-05, + "loss": 2.8063, + "step": 20951 + }, + { + "epoch": 1.3006393941275063, + "grad_norm": 0.16924983346718184, + "learning_rate": 6.977045296447851e-05, + "loss": 2.8787, + "step": 20952 + }, + { + "epoch": 1.3007014712272642, + "grad_norm": 0.17397228435340678, + "learning_rate": 6.976713570941752e-05, + "loss": 2.8934, + "step": 20953 + }, + { + "epoch": 1.3007635483270221, + "grad_norm": 0.16549028414978878, + "learning_rate": 6.976381835122658e-05, + "loss": 2.7844, + "step": 20954 + }, + { + "epoch": 1.30082562542678, + "grad_norm": 0.17046357003619028, + "learning_rate": 6.976050088992299e-05, + "loss": 2.8613, + "step": 20955 + }, + { + "epoch": 1.300887702526538, + "grad_norm": 0.19393656539015644, + "learning_rate": 6.975718332552408e-05, + "loss": 2.812, + "step": 20956 + }, + { + "epoch": 1.3009497796262959, + "grad_norm": 0.14043277609748572, + "learning_rate": 6.975386565804714e-05, + "loss": 2.7339, + "step": 20957 + }, + { + "epoch": 1.3010118567260538, + "grad_norm": 0.15578109596765186, + "learning_rate": 6.97505478875095e-05, + "loss": 2.7679, + "step": 20958 + }, + { + "epoch": 1.3010739338258117, + "grad_norm": 0.1468847842204116, + "learning_rate": 6.974723001392844e-05, + "loss": 2.7864, + "step": 20959 + }, + { + "epoch": 1.3011360109255696, + "grad_norm": 0.15031500562455105, + "learning_rate": 6.97439120373213e-05, + "loss": 2.8354, + "step": 20960 + }, + { + "epoch": 1.3011980880253273, + "grad_norm": 0.14693923899799116, + "learning_rate": 6.974059395770537e-05, + "loss": 2.8355, + "step": 20961 + }, + { + "epoch": 1.3012601651250852, + "grad_norm": 0.1599443650656176, + "learning_rate": 6.973727577509799e-05, + "loss": 2.8055, + "step": 20962 + }, + { + "epoch": 1.3013222422248432, + "grad_norm": 0.17452039990555906, + "learning_rate": 6.973395748951643e-05, + "loss": 2.7841, + "step": 20963 + }, + { + "epoch": 1.301384319324601, + "grad_norm": 0.18799276965839484, + "learning_rate": 6.973063910097803e-05, + "loss": 2.8269, + "step": 20964 + }, + { + "epoch": 1.301446396424359, + "grad_norm": 0.15101897271990075, + "learning_rate": 6.97273206095001e-05, + "loss": 2.8365, + "step": 20965 + }, + { + "epoch": 1.301508473524117, + "grad_norm": 0.21072390126111487, + "learning_rate": 6.972400201509993e-05, + "loss": 2.8706, + "step": 20966 + }, + { + "epoch": 1.3015705506238748, + "grad_norm": 0.15906755909621764, + "learning_rate": 6.972068331779487e-05, + "loss": 2.8741, + "step": 20967 + }, + { + "epoch": 1.3016326277236328, + "grad_norm": 0.1827039767387135, + "learning_rate": 6.971736451760222e-05, + "loss": 2.8389, + "step": 20968 + }, + { + "epoch": 1.3016947048233907, + "grad_norm": 0.14649464889101507, + "learning_rate": 6.971404561453929e-05, + "loss": 2.7471, + "step": 20969 + }, + { + "epoch": 1.3017567819231486, + "grad_norm": 0.18977904060719664, + "learning_rate": 6.971072660862339e-05, + "loss": 2.8809, + "step": 20970 + }, + { + "epoch": 1.3018188590229065, + "grad_norm": 0.14229028858780676, + "learning_rate": 6.970740749987186e-05, + "loss": 2.7635, + "step": 20971 + }, + { + "epoch": 1.3018809361226644, + "grad_norm": 0.18372263341291228, + "learning_rate": 6.970408828830197e-05, + "loss": 2.8934, + "step": 20972 + }, + { + "epoch": 1.3019430132224223, + "grad_norm": 0.14986692625256764, + "learning_rate": 6.97007689739311e-05, + "loss": 2.9684, + "step": 20973 + }, + { + "epoch": 1.3020050903221803, + "grad_norm": 0.17589762861797356, + "learning_rate": 6.969744955677652e-05, + "loss": 2.8352, + "step": 20974 + }, + { + "epoch": 1.3020671674219382, + "grad_norm": 0.164912960022963, + "learning_rate": 6.969413003685557e-05, + "loss": 2.8785, + "step": 20975 + }, + { + "epoch": 1.3021292445216959, + "grad_norm": 0.16994026786090588, + "learning_rate": 6.969081041418555e-05, + "loss": 2.8198, + "step": 20976 + }, + { + "epoch": 1.3021913216214538, + "grad_norm": 0.15533502425000295, + "learning_rate": 6.968749068878381e-05, + "loss": 2.8665, + "step": 20977 + }, + { + "epoch": 1.3022533987212117, + "grad_norm": 0.158998538424414, + "learning_rate": 6.968417086066765e-05, + "loss": 2.7545, + "step": 20978 + }, + { + "epoch": 1.3023154758209696, + "grad_norm": 0.18082353924855193, + "learning_rate": 6.968085092985438e-05, + "loss": 2.8629, + "step": 20979 + }, + { + "epoch": 1.3023775529207275, + "grad_norm": 0.18835906086867413, + "learning_rate": 6.967753089636134e-05, + "loss": 2.7812, + "step": 20980 + }, + { + "epoch": 1.3024396300204855, + "grad_norm": 0.15349851566320824, + "learning_rate": 6.967421076020586e-05, + "loss": 2.8568, + "step": 20981 + }, + { + "epoch": 1.3025017071202434, + "grad_norm": 0.1677053071177463, + "learning_rate": 6.967089052140522e-05, + "loss": 2.8956, + "step": 20982 + }, + { + "epoch": 1.3025637842200013, + "grad_norm": 0.1535885766129238, + "learning_rate": 6.966757017997678e-05, + "loss": 2.8576, + "step": 20983 + }, + { + "epoch": 1.3026258613197592, + "grad_norm": 0.15872155123276152, + "learning_rate": 6.966424973593786e-05, + "loss": 2.8282, + "step": 20984 + }, + { + "epoch": 1.302687938419517, + "grad_norm": 0.1587493086597077, + "learning_rate": 6.966092918930577e-05, + "loss": 2.7513, + "step": 20985 + }, + { + "epoch": 1.3027500155192748, + "grad_norm": 0.16881501128790835, + "learning_rate": 6.965760854009783e-05, + "loss": 2.9584, + "step": 20986 + }, + { + "epoch": 1.3028120926190327, + "grad_norm": 0.15860624522515704, + "learning_rate": 6.965428778833138e-05, + "loss": 2.8424, + "step": 20987 + }, + { + "epoch": 1.3028741697187907, + "grad_norm": 0.1674026720068035, + "learning_rate": 6.965096693402375e-05, + "loss": 2.8525, + "step": 20988 + }, + { + "epoch": 1.3029362468185486, + "grad_norm": 0.17027037060105032, + "learning_rate": 6.964764597719223e-05, + "loss": 2.9233, + "step": 20989 + }, + { + "epoch": 1.3029983239183065, + "grad_norm": 0.16496871261466184, + "learning_rate": 6.96443249178542e-05, + "loss": 2.887, + "step": 20990 + }, + { + "epoch": 1.3030604010180644, + "grad_norm": 0.15850810101261117, + "learning_rate": 6.964100375602694e-05, + "loss": 2.8155, + "step": 20991 + }, + { + "epoch": 1.3031224781178223, + "grad_norm": 0.1534337699129857, + "learning_rate": 6.963768249172779e-05, + "loss": 2.8624, + "step": 20992 + }, + { + "epoch": 1.3031845552175803, + "grad_norm": 0.13831806716453923, + "learning_rate": 6.96343611249741e-05, + "loss": 2.8396, + "step": 20993 + }, + { + "epoch": 1.3032466323173382, + "grad_norm": 0.15943677887590862, + "learning_rate": 6.963103965578317e-05, + "loss": 2.86, + "step": 20994 + }, + { + "epoch": 1.303308709417096, + "grad_norm": 0.15341572920769084, + "learning_rate": 6.962771808417234e-05, + "loss": 2.8625, + "step": 20995 + }, + { + "epoch": 1.303370786516854, + "grad_norm": 0.14439192706741769, + "learning_rate": 6.962439641015895e-05, + "loss": 2.8377, + "step": 20996 + }, + { + "epoch": 1.303432863616612, + "grad_norm": 0.14875666286869116, + "learning_rate": 6.962107463376032e-05, + "loss": 2.8939, + "step": 20997 + }, + { + "epoch": 1.3034949407163698, + "grad_norm": 0.16047234278615521, + "learning_rate": 6.961775275499378e-05, + "loss": 2.8352, + "step": 20998 + }, + { + "epoch": 1.3035570178161278, + "grad_norm": 0.15552650677547858, + "learning_rate": 6.961443077387665e-05, + "loss": 2.9215, + "step": 20999 + }, + { + "epoch": 1.3036190949158855, + "grad_norm": 0.15643667156725577, + "learning_rate": 6.961110869042628e-05, + "loss": 2.8347, + "step": 21000 + }, + { + "epoch": 1.3036811720156434, + "grad_norm": 0.14968808919635362, + "learning_rate": 6.960778650466e-05, + "loss": 2.8814, + "step": 21001 + }, + { + "epoch": 1.3037432491154013, + "grad_norm": 0.19797036839450652, + "learning_rate": 6.960446421659513e-05, + "loss": 2.9037, + "step": 21002 + }, + { + "epoch": 1.3038053262151592, + "grad_norm": 0.20398206726311638, + "learning_rate": 6.960114182624903e-05, + "loss": 2.8545, + "step": 21003 + }, + { + "epoch": 1.3038674033149171, + "grad_norm": 0.1701171728280868, + "learning_rate": 6.959781933363898e-05, + "loss": 2.8048, + "step": 21004 + }, + { + "epoch": 1.303929480414675, + "grad_norm": 0.14465121497058683, + "learning_rate": 6.959449673878238e-05, + "loss": 2.8076, + "step": 21005 + }, + { + "epoch": 1.303991557514433, + "grad_norm": 0.1621423366081694, + "learning_rate": 6.959117404169652e-05, + "loss": 2.8953, + "step": 21006 + }, + { + "epoch": 1.3040536346141909, + "grad_norm": 0.15476868117587073, + "learning_rate": 6.958785124239875e-05, + "loss": 2.888, + "step": 21007 + }, + { + "epoch": 1.3041157117139488, + "grad_norm": 0.19408658578462912, + "learning_rate": 6.95845283409064e-05, + "loss": 2.8133, + "step": 21008 + }, + { + "epoch": 1.3041777888137065, + "grad_norm": 0.15903439355907717, + "learning_rate": 6.958120533723682e-05, + "loss": 2.9341, + "step": 21009 + }, + { + "epoch": 1.3042398659134644, + "grad_norm": 0.15398966711928377, + "learning_rate": 6.957788223140734e-05, + "loss": 2.8727, + "step": 21010 + }, + { + "epoch": 1.3043019430132223, + "grad_norm": 0.1624072853478906, + "learning_rate": 6.957455902343528e-05, + "loss": 2.8267, + "step": 21011 + }, + { + "epoch": 1.3043640201129802, + "grad_norm": 0.17093714139541585, + "learning_rate": 6.957123571333801e-05, + "loss": 3.0303, + "step": 21012 + }, + { + "epoch": 1.3044260972127382, + "grad_norm": 0.1422456300249441, + "learning_rate": 6.956791230113284e-05, + "loss": 2.919, + "step": 21013 + }, + { + "epoch": 1.304488174312496, + "grad_norm": 0.21523802731663888, + "learning_rate": 6.956458878683714e-05, + "loss": 2.8351, + "step": 21014 + }, + { + "epoch": 1.304550251412254, + "grad_norm": 0.21731241462405232, + "learning_rate": 6.956126517046822e-05, + "loss": 2.9324, + "step": 21015 + }, + { + "epoch": 1.304612328512012, + "grad_norm": 0.18868081484321161, + "learning_rate": 6.955794145204342e-05, + "loss": 2.8411, + "step": 21016 + }, + { + "epoch": 1.3046744056117698, + "grad_norm": 0.16479466756165273, + "learning_rate": 6.955461763158011e-05, + "loss": 2.8424, + "step": 21017 + }, + { + "epoch": 1.3047364827115278, + "grad_norm": 0.22506452966244686, + "learning_rate": 6.955129370909562e-05, + "loss": 2.8312, + "step": 21018 + }, + { + "epoch": 1.3047985598112857, + "grad_norm": 0.15342687438914185, + "learning_rate": 6.954796968460726e-05, + "loss": 2.86, + "step": 21019 + }, + { + "epoch": 1.3048606369110436, + "grad_norm": 0.1804838570551111, + "learning_rate": 6.95446455581324e-05, + "loss": 2.9903, + "step": 21020 + }, + { + "epoch": 1.3049227140108015, + "grad_norm": 0.16617054723075544, + "learning_rate": 6.954132132968838e-05, + "loss": 2.8707, + "step": 21021 + }, + { + "epoch": 1.3049847911105594, + "grad_norm": 0.262298610614023, + "learning_rate": 6.953799699929255e-05, + "loss": 2.794, + "step": 21022 + }, + { + "epoch": 1.3050468682103173, + "grad_norm": 0.152758266697436, + "learning_rate": 6.953467256696225e-05, + "loss": 2.9011, + "step": 21023 + }, + { + "epoch": 1.305108945310075, + "grad_norm": 0.1542164254810496, + "learning_rate": 6.95313480327148e-05, + "loss": 2.8929, + "step": 21024 + }, + { + "epoch": 1.305171022409833, + "grad_norm": 0.14948031882485577, + "learning_rate": 6.95280233965676e-05, + "loss": 2.8218, + "step": 21025 + }, + { + "epoch": 1.3052330995095909, + "grad_norm": 0.18781728522604257, + "learning_rate": 6.952469865853794e-05, + "loss": 2.8761, + "step": 21026 + }, + { + "epoch": 1.3052951766093488, + "grad_norm": 0.15325336452488827, + "learning_rate": 6.952137381864318e-05, + "loss": 2.8548, + "step": 21027 + }, + { + "epoch": 1.3053572537091067, + "grad_norm": 0.17329413657603956, + "learning_rate": 6.95180488769007e-05, + "loss": 2.8226, + "step": 21028 + }, + { + "epoch": 1.3054193308088646, + "grad_norm": 0.16011390475137052, + "learning_rate": 6.95147238333278e-05, + "loss": 2.8762, + "step": 21029 + }, + { + "epoch": 1.3054814079086225, + "grad_norm": 0.18642957432736298, + "learning_rate": 6.951139868794185e-05, + "loss": 2.7544, + "step": 21030 + }, + { + "epoch": 1.3055434850083805, + "grad_norm": 0.14802977757630612, + "learning_rate": 6.95080734407602e-05, + "loss": 2.819, + "step": 21031 + }, + { + "epoch": 1.3056055621081384, + "grad_norm": 0.14878583492464884, + "learning_rate": 6.95047480918002e-05, + "loss": 2.7822, + "step": 21032 + }, + { + "epoch": 1.305667639207896, + "grad_norm": 0.1520558657129893, + "learning_rate": 6.950142264107918e-05, + "loss": 2.8258, + "step": 21033 + }, + { + "epoch": 1.305729716307654, + "grad_norm": 0.15249870440448665, + "learning_rate": 6.949809708861452e-05, + "loss": 2.971, + "step": 21034 + }, + { + "epoch": 1.305791793407412, + "grad_norm": 0.15083144963148154, + "learning_rate": 6.949477143442354e-05, + "loss": 2.873, + "step": 21035 + }, + { + "epoch": 1.3058538705071698, + "grad_norm": 0.15180688798503286, + "learning_rate": 6.949144567852363e-05, + "loss": 2.9049, + "step": 21036 + }, + { + "epoch": 1.3059159476069278, + "grad_norm": 0.15916771053821557, + "learning_rate": 6.948811982093209e-05, + "loss": 2.7993, + "step": 21037 + }, + { + "epoch": 1.3059780247066857, + "grad_norm": 0.1819131713806043, + "learning_rate": 6.948479386166629e-05, + "loss": 2.8273, + "step": 21038 + }, + { + "epoch": 1.3060401018064436, + "grad_norm": 0.16111502693739022, + "learning_rate": 6.948146780074363e-05, + "loss": 2.8136, + "step": 21039 + }, + { + "epoch": 1.3061021789062015, + "grad_norm": 0.1527251091997759, + "learning_rate": 6.947814163818138e-05, + "loss": 2.9055, + "step": 21040 + }, + { + "epoch": 1.3061642560059594, + "grad_norm": 0.1494741505989674, + "learning_rate": 6.947481537399696e-05, + "loss": 2.8772, + "step": 21041 + }, + { + "epoch": 1.3062263331057173, + "grad_norm": 0.15462998154085508, + "learning_rate": 6.94714890082077e-05, + "loss": 2.8736, + "step": 21042 + }, + { + "epoch": 1.3062884102054753, + "grad_norm": 0.20990743620791327, + "learning_rate": 6.946816254083095e-05, + "loss": 2.8391, + "step": 21043 + }, + { + "epoch": 1.3063504873052332, + "grad_norm": 0.15848745297836592, + "learning_rate": 6.94648359718841e-05, + "loss": 2.8555, + "step": 21044 + }, + { + "epoch": 1.306412564404991, + "grad_norm": 0.17881580048668133, + "learning_rate": 6.946150930138442e-05, + "loss": 2.7634, + "step": 21045 + }, + { + "epoch": 1.306474641504749, + "grad_norm": 0.15441927702494943, + "learning_rate": 6.945818252934935e-05, + "loss": 2.72, + "step": 21046 + }, + { + "epoch": 1.306536718604507, + "grad_norm": 0.15745340296614863, + "learning_rate": 6.945485565579622e-05, + "loss": 2.8613, + "step": 21047 + }, + { + "epoch": 1.3065987957042646, + "grad_norm": 0.17607095475220824, + "learning_rate": 6.945152868074238e-05, + "loss": 2.8682, + "step": 21048 + }, + { + "epoch": 1.3066608728040225, + "grad_norm": 0.15746167179202375, + "learning_rate": 6.94482016042052e-05, + "loss": 2.8536, + "step": 21049 + }, + { + "epoch": 1.3067229499037805, + "grad_norm": 0.15408772163458728, + "learning_rate": 6.944487442620203e-05, + "loss": 2.8334, + "step": 21050 + }, + { + "epoch": 1.3067850270035384, + "grad_norm": 0.15341875232712537, + "learning_rate": 6.944154714675022e-05, + "loss": 2.8741, + "step": 21051 + }, + { + "epoch": 1.3068471041032963, + "grad_norm": 0.14861941483978292, + "learning_rate": 6.943821976586715e-05, + "loss": 2.855, + "step": 21052 + }, + { + "epoch": 1.3069091812030542, + "grad_norm": 0.1501479606245707, + "learning_rate": 6.943489228357016e-05, + "loss": 2.9024, + "step": 21053 + }, + { + "epoch": 1.3069712583028121, + "grad_norm": 0.1515445531464083, + "learning_rate": 6.943156469987661e-05, + "loss": 2.8529, + "step": 21054 + }, + { + "epoch": 1.30703333540257, + "grad_norm": 0.15713801763563234, + "learning_rate": 6.94282370148039e-05, + "loss": 2.8322, + "step": 21055 + }, + { + "epoch": 1.307095412502328, + "grad_norm": 0.14488034360154567, + "learning_rate": 6.942490922836933e-05, + "loss": 2.8323, + "step": 21056 + }, + { + "epoch": 1.3071574896020857, + "grad_norm": 0.1639840699238913, + "learning_rate": 6.942158134059032e-05, + "loss": 2.8647, + "step": 21057 + }, + { + "epoch": 1.3072195667018436, + "grad_norm": 0.1470254758801678, + "learning_rate": 6.941825335148419e-05, + "loss": 2.7997, + "step": 21058 + }, + { + "epoch": 1.3072816438016015, + "grad_norm": 0.15386271054528292, + "learning_rate": 6.941492526106832e-05, + "loss": 2.8214, + "step": 21059 + }, + { + "epoch": 1.3073437209013594, + "grad_norm": 0.16177863541800938, + "learning_rate": 6.941159706936006e-05, + "loss": 2.8044, + "step": 21060 + }, + { + "epoch": 1.3074057980011173, + "grad_norm": 0.18125635949849678, + "learning_rate": 6.94082687763768e-05, + "loss": 2.8385, + "step": 21061 + }, + { + "epoch": 1.3074678751008753, + "grad_norm": 0.1512988766994114, + "learning_rate": 6.940494038213589e-05, + "loss": 2.7699, + "step": 21062 + }, + { + "epoch": 1.3075299522006332, + "grad_norm": 0.2122821072920941, + "learning_rate": 6.94016118866547e-05, + "loss": 2.8237, + "step": 21063 + }, + { + "epoch": 1.307592029300391, + "grad_norm": 0.16163920390378791, + "learning_rate": 6.93982832899506e-05, + "loss": 2.7881, + "step": 21064 + }, + { + "epoch": 1.307654106400149, + "grad_norm": 0.1561050577239661, + "learning_rate": 6.939495459204092e-05, + "loss": 2.813, + "step": 21065 + }, + { + "epoch": 1.307716183499907, + "grad_norm": 0.15792878229229507, + "learning_rate": 6.939162579294307e-05, + "loss": 2.9043, + "step": 21066 + }, + { + "epoch": 1.3077782605996648, + "grad_norm": 0.15005464924259546, + "learning_rate": 6.938829689267439e-05, + "loss": 2.852, + "step": 21067 + }, + { + "epoch": 1.3078403376994228, + "grad_norm": 0.16071762578668583, + "learning_rate": 6.938496789125227e-05, + "loss": 2.7957, + "step": 21068 + }, + { + "epoch": 1.3079024147991807, + "grad_norm": 0.15161436265252093, + "learning_rate": 6.938163878869405e-05, + "loss": 2.92, + "step": 21069 + }, + { + "epoch": 1.3079644918989386, + "grad_norm": 0.15542759230726613, + "learning_rate": 6.937830958501713e-05, + "loss": 2.8769, + "step": 21070 + }, + { + "epoch": 1.3080265689986965, + "grad_norm": 0.14834904544862004, + "learning_rate": 6.937498028023885e-05, + "loss": 2.9091, + "step": 21071 + }, + { + "epoch": 1.3080886460984542, + "grad_norm": 0.1542480488071037, + "learning_rate": 6.937165087437661e-05, + "loss": 2.8714, + "step": 21072 + }, + { + "epoch": 1.3081507231982121, + "grad_norm": 0.16012440341567338, + "learning_rate": 6.936832136744777e-05, + "loss": 2.8351, + "step": 21073 + }, + { + "epoch": 1.30821280029797, + "grad_norm": 0.14930390055488316, + "learning_rate": 6.936499175946967e-05, + "loss": 2.883, + "step": 21074 + }, + { + "epoch": 1.308274877397728, + "grad_norm": 0.1733311342012378, + "learning_rate": 6.936166205045973e-05, + "loss": 2.8996, + "step": 21075 + }, + { + "epoch": 1.3083369544974859, + "grad_norm": 0.1415592393436446, + "learning_rate": 6.935833224043529e-05, + "loss": 2.9134, + "step": 21076 + }, + { + "epoch": 1.3083990315972438, + "grad_norm": 0.16101464070885035, + "learning_rate": 6.935500232941374e-05, + "loss": 2.8651, + "step": 21077 + }, + { + "epoch": 1.3084611086970017, + "grad_norm": 0.14864731691547284, + "learning_rate": 6.935167231741242e-05, + "loss": 2.7963, + "step": 21078 + }, + { + "epoch": 1.3085231857967596, + "grad_norm": 0.15047631361580177, + "learning_rate": 6.934834220444875e-05, + "loss": 2.8601, + "step": 21079 + }, + { + "epoch": 1.3085852628965176, + "grad_norm": 0.1720740184752894, + "learning_rate": 6.934501199054009e-05, + "loss": 2.9476, + "step": 21080 + }, + { + "epoch": 1.3086473399962752, + "grad_norm": 0.15567396795329844, + "learning_rate": 6.934168167570378e-05, + "loss": 2.8148, + "step": 21081 + }, + { + "epoch": 1.3087094170960332, + "grad_norm": 0.15057669931615383, + "learning_rate": 6.933835125995724e-05, + "loss": 2.8182, + "step": 21082 + }, + { + "epoch": 1.308771494195791, + "grad_norm": 0.14831742022399144, + "learning_rate": 6.933502074331782e-05, + "loss": 2.8488, + "step": 21083 + }, + { + "epoch": 1.308833571295549, + "grad_norm": 0.1584007653419291, + "learning_rate": 6.93316901258029e-05, + "loss": 2.8342, + "step": 21084 + }, + { + "epoch": 1.308895648395307, + "grad_norm": 0.1452068185544831, + "learning_rate": 6.932835940742987e-05, + "loss": 2.8669, + "step": 21085 + }, + { + "epoch": 1.3089577254950648, + "grad_norm": 0.15572185663993776, + "learning_rate": 6.932502858821608e-05, + "loss": 2.7888, + "step": 21086 + }, + { + "epoch": 1.3090198025948228, + "grad_norm": 0.16080913401074715, + "learning_rate": 6.932169766817895e-05, + "loss": 2.8278, + "step": 21087 + }, + { + "epoch": 1.3090818796945807, + "grad_norm": 0.14667856061073334, + "learning_rate": 6.93183666473358e-05, + "loss": 2.9213, + "step": 21088 + }, + { + "epoch": 1.3091439567943386, + "grad_norm": 0.17692457941473452, + "learning_rate": 6.931503552570407e-05, + "loss": 2.8589, + "step": 21089 + }, + { + "epoch": 1.3092060338940965, + "grad_norm": 0.14974138148237107, + "learning_rate": 6.931170430330109e-05, + "loss": 2.8848, + "step": 21090 + }, + { + "epoch": 1.3092681109938544, + "grad_norm": 0.16480454593389254, + "learning_rate": 6.930837298014428e-05, + "loss": 2.9209, + "step": 21091 + }, + { + "epoch": 1.3093301880936123, + "grad_norm": 0.15880978203946922, + "learning_rate": 6.930504155625098e-05, + "loss": 2.7658, + "step": 21092 + }, + { + "epoch": 1.3093922651933703, + "grad_norm": 0.14712115798210224, + "learning_rate": 6.93017100316386e-05, + "loss": 2.8818, + "step": 21093 + }, + { + "epoch": 1.3094543422931282, + "grad_norm": 0.1531115343537417, + "learning_rate": 6.929837840632453e-05, + "loss": 2.8559, + "step": 21094 + }, + { + "epoch": 1.309516419392886, + "grad_norm": 0.15270550276898567, + "learning_rate": 6.929504668032613e-05, + "loss": 2.8067, + "step": 21095 + }, + { + "epoch": 1.3095784964926438, + "grad_norm": 0.18108615540056414, + "learning_rate": 6.929171485366077e-05, + "loss": 2.7974, + "step": 21096 + }, + { + "epoch": 1.3096405735924017, + "grad_norm": 0.1768887598562592, + "learning_rate": 6.928838292634586e-05, + "loss": 2.8408, + "step": 21097 + }, + { + "epoch": 1.3097026506921596, + "grad_norm": 0.2090281859492826, + "learning_rate": 6.928505089839879e-05, + "loss": 2.9397, + "step": 21098 + }, + { + "epoch": 1.3097647277919175, + "grad_norm": 0.16479589956488047, + "learning_rate": 6.928171876983689e-05, + "loss": 2.8573, + "step": 21099 + }, + { + "epoch": 1.3098268048916755, + "grad_norm": 0.15750029529816328, + "learning_rate": 6.927838654067761e-05, + "loss": 2.8268, + "step": 21100 + }, + { + "epoch": 1.3098888819914334, + "grad_norm": 0.17450350916614527, + "learning_rate": 6.92750542109383e-05, + "loss": 2.8506, + "step": 21101 + }, + { + "epoch": 1.3099509590911913, + "grad_norm": 0.1993673462280565, + "learning_rate": 6.927172178063635e-05, + "loss": 2.8079, + "step": 21102 + }, + { + "epoch": 1.3100130361909492, + "grad_norm": 0.15565861605086836, + "learning_rate": 6.926838924978914e-05, + "loss": 2.8252, + "step": 21103 + }, + { + "epoch": 1.3100751132907071, + "grad_norm": 0.20049549021072544, + "learning_rate": 6.92650566184141e-05, + "loss": 2.9495, + "step": 21104 + }, + { + "epoch": 1.3101371903904648, + "grad_norm": 0.16402916342508644, + "learning_rate": 6.926172388652855e-05, + "loss": 2.8722, + "step": 21105 + }, + { + "epoch": 1.3101992674902228, + "grad_norm": 0.1745335324598357, + "learning_rate": 6.925839105414992e-05, + "loss": 2.8684, + "step": 21106 + }, + { + "epoch": 1.3102613445899807, + "grad_norm": 0.13836764153158398, + "learning_rate": 6.92550581212956e-05, + "loss": 2.802, + "step": 21107 + }, + { + "epoch": 1.3103234216897386, + "grad_norm": 0.14880323527309383, + "learning_rate": 6.925172508798295e-05, + "loss": 2.8201, + "step": 21108 + }, + { + "epoch": 1.3103854987894965, + "grad_norm": 0.14917604024922987, + "learning_rate": 6.924839195422938e-05, + "loss": 2.8911, + "step": 21109 + }, + { + "epoch": 1.3104475758892544, + "grad_norm": 0.14334013209487434, + "learning_rate": 6.924505872005229e-05, + "loss": 2.8882, + "step": 21110 + }, + { + "epoch": 1.3105096529890123, + "grad_norm": 0.17937664426654737, + "learning_rate": 6.924172538546904e-05, + "loss": 2.8525, + "step": 21111 + }, + { + "epoch": 1.3105717300887703, + "grad_norm": 0.19293739048590486, + "learning_rate": 6.923839195049705e-05, + "loss": 2.8615, + "step": 21112 + }, + { + "epoch": 1.3106338071885282, + "grad_norm": 0.14920337632757621, + "learning_rate": 6.923505841515369e-05, + "loss": 2.9029, + "step": 21113 + }, + { + "epoch": 1.310695884288286, + "grad_norm": 0.21355035907607547, + "learning_rate": 6.923172477945637e-05, + "loss": 2.8832, + "step": 21114 + }, + { + "epoch": 1.310757961388044, + "grad_norm": 0.16531742479816347, + "learning_rate": 6.922839104342246e-05, + "loss": 2.8799, + "step": 21115 + }, + { + "epoch": 1.310820038487802, + "grad_norm": 0.14781446583632218, + "learning_rate": 6.922505720706939e-05, + "loss": 2.9026, + "step": 21116 + }, + { + "epoch": 1.3108821155875598, + "grad_norm": 0.1533960896218341, + "learning_rate": 6.922172327041451e-05, + "loss": 2.8283, + "step": 21117 + }, + { + "epoch": 1.3109441926873178, + "grad_norm": 0.13914287526765107, + "learning_rate": 6.921838923347524e-05, + "loss": 2.8215, + "step": 21118 + }, + { + "epoch": 1.3110062697870757, + "grad_norm": 0.20255706324544875, + "learning_rate": 6.921505509626896e-05, + "loss": 2.8994, + "step": 21119 + }, + { + "epoch": 1.3110683468868334, + "grad_norm": 0.1562436877406155, + "learning_rate": 6.921172085881308e-05, + "loss": 2.9238, + "step": 21120 + }, + { + "epoch": 1.3111304239865913, + "grad_norm": 0.16715916990738355, + "learning_rate": 6.920838652112498e-05, + "loss": 2.8619, + "step": 21121 + }, + { + "epoch": 1.3111925010863492, + "grad_norm": 0.1424708289270658, + "learning_rate": 6.920505208322208e-05, + "loss": 2.854, + "step": 21122 + }, + { + "epoch": 1.3112545781861071, + "grad_norm": 0.17215768909978083, + "learning_rate": 6.920171754512175e-05, + "loss": 2.8438, + "step": 21123 + }, + { + "epoch": 1.311316655285865, + "grad_norm": 0.19333119620554678, + "learning_rate": 6.919838290684139e-05, + "loss": 2.8717, + "step": 21124 + }, + { + "epoch": 1.311378732385623, + "grad_norm": 0.1727498018975095, + "learning_rate": 6.919504816839843e-05, + "loss": 2.7476, + "step": 21125 + }, + { + "epoch": 1.3114408094853809, + "grad_norm": 0.17895036303461262, + "learning_rate": 6.919171332981023e-05, + "loss": 2.9022, + "step": 21126 + }, + { + "epoch": 1.3115028865851388, + "grad_norm": 0.20287727876113215, + "learning_rate": 6.91883783910942e-05, + "loss": 2.7886, + "step": 21127 + }, + { + "epoch": 1.3115649636848967, + "grad_norm": 0.2033171478523564, + "learning_rate": 6.918504335226775e-05, + "loss": 2.9067, + "step": 21128 + }, + { + "epoch": 1.3116270407846544, + "grad_norm": 0.1661109258091156, + "learning_rate": 6.918170821334826e-05, + "loss": 2.9659, + "step": 21129 + }, + { + "epoch": 1.3116891178844123, + "grad_norm": 0.17303470564476828, + "learning_rate": 6.917837297435315e-05, + "loss": 2.8716, + "step": 21130 + }, + { + "epoch": 1.3117511949841703, + "grad_norm": 0.17376757220732905, + "learning_rate": 6.917503763529981e-05, + "loss": 2.892, + "step": 21131 + }, + { + "epoch": 1.3118132720839282, + "grad_norm": 0.21044954622955686, + "learning_rate": 6.917170219620565e-05, + "loss": 2.8928, + "step": 21132 + }, + { + "epoch": 1.311875349183686, + "grad_norm": 0.19010295715121447, + "learning_rate": 6.916836665708806e-05, + "loss": 2.8754, + "step": 21133 + }, + { + "epoch": 1.311937426283444, + "grad_norm": 0.18112248177156648, + "learning_rate": 6.916503101796445e-05, + "loss": 2.9242, + "step": 21134 + }, + { + "epoch": 1.311999503383202, + "grad_norm": 0.1781926168966826, + "learning_rate": 6.916169527885222e-05, + "loss": 2.9421, + "step": 21135 + }, + { + "epoch": 1.3120615804829598, + "grad_norm": 0.235380724924411, + "learning_rate": 6.915835943976877e-05, + "loss": 2.8785, + "step": 21136 + }, + { + "epoch": 1.3121236575827178, + "grad_norm": 0.16648032840910898, + "learning_rate": 6.91550235007315e-05, + "loss": 2.836, + "step": 21137 + }, + { + "epoch": 1.3121857346824757, + "grad_norm": 0.17040864785300136, + "learning_rate": 6.915168746175782e-05, + "loss": 2.8349, + "step": 21138 + }, + { + "epoch": 1.3122478117822336, + "grad_norm": 0.1607117787517567, + "learning_rate": 6.914835132286515e-05, + "loss": 2.9178, + "step": 21139 + }, + { + "epoch": 1.3123098888819915, + "grad_norm": 0.19295631167830585, + "learning_rate": 6.914501508407089e-05, + "loss": 2.9443, + "step": 21140 + }, + { + "epoch": 1.3123719659817494, + "grad_norm": 0.16877048465232994, + "learning_rate": 6.914167874539244e-05, + "loss": 2.8076, + "step": 21141 + }, + { + "epoch": 1.3124340430815074, + "grad_norm": 0.16962729603162907, + "learning_rate": 6.913834230684719e-05, + "loss": 2.7028, + "step": 21142 + }, + { + "epoch": 1.3124961201812653, + "grad_norm": 0.16047822175863452, + "learning_rate": 6.913500576845258e-05, + "loss": 2.7965, + "step": 21143 + }, + { + "epoch": 1.312558197281023, + "grad_norm": 0.2361241398205495, + "learning_rate": 6.913166913022599e-05, + "loss": 2.8085, + "step": 21144 + }, + { + "epoch": 1.3126202743807809, + "grad_norm": 0.19024752193508798, + "learning_rate": 6.912833239218484e-05, + "loss": 2.9242, + "step": 21145 + }, + { + "epoch": 1.3126823514805388, + "grad_norm": 0.19050271027994922, + "learning_rate": 6.912499555434653e-05, + "loss": 2.9015, + "step": 21146 + }, + { + "epoch": 1.3127444285802967, + "grad_norm": 0.19806223251117666, + "learning_rate": 6.912165861672847e-05, + "loss": 2.9142, + "step": 21147 + }, + { + "epoch": 1.3128065056800546, + "grad_norm": 0.17036103357632365, + "learning_rate": 6.911832157934808e-05, + "loss": 2.9422, + "step": 21148 + }, + { + "epoch": 1.3128685827798126, + "grad_norm": 0.18237916458674694, + "learning_rate": 6.911498444222277e-05, + "loss": 2.796, + "step": 21149 + }, + { + "epoch": 1.3129306598795705, + "grad_norm": 0.15338900418341794, + "learning_rate": 6.911164720536994e-05, + "loss": 2.7855, + "step": 21150 + }, + { + "epoch": 1.3129927369793284, + "grad_norm": 0.18869920536590593, + "learning_rate": 6.910830986880702e-05, + "loss": 2.8987, + "step": 21151 + }, + { + "epoch": 1.3130548140790863, + "grad_norm": 0.15550526520809083, + "learning_rate": 6.91049724325514e-05, + "loss": 2.9156, + "step": 21152 + }, + { + "epoch": 1.313116891178844, + "grad_norm": 0.18346375063109743, + "learning_rate": 6.91016348966205e-05, + "loss": 2.9472, + "step": 21153 + }, + { + "epoch": 1.313178968278602, + "grad_norm": 0.1718529394470676, + "learning_rate": 6.909829726103173e-05, + "loss": 2.9449, + "step": 21154 + }, + { + "epoch": 1.3132410453783598, + "grad_norm": 0.18431552073039714, + "learning_rate": 6.909495952580251e-05, + "loss": 2.8412, + "step": 21155 + }, + { + "epoch": 1.3133031224781178, + "grad_norm": 0.1689577098112521, + "learning_rate": 6.909162169095025e-05, + "loss": 2.7503, + "step": 21156 + }, + { + "epoch": 1.3133651995778757, + "grad_norm": 0.15976404221923562, + "learning_rate": 6.908828375649237e-05, + "loss": 2.8797, + "step": 21157 + }, + { + "epoch": 1.3134272766776336, + "grad_norm": 0.17958059447855956, + "learning_rate": 6.908494572244628e-05, + "loss": 2.8236, + "step": 21158 + }, + { + "epoch": 1.3134893537773915, + "grad_norm": 0.1565971035797119, + "learning_rate": 6.90816075888294e-05, + "loss": 2.8665, + "step": 21159 + }, + { + "epoch": 1.3135514308771494, + "grad_norm": 0.15760435598109496, + "learning_rate": 6.907826935565912e-05, + "loss": 2.9295, + "step": 21160 + }, + { + "epoch": 1.3136135079769073, + "grad_norm": 0.17863354283827304, + "learning_rate": 6.90749310229529e-05, + "loss": 2.8901, + "step": 21161 + }, + { + "epoch": 1.3136755850766653, + "grad_norm": 0.18302846688200847, + "learning_rate": 6.907159259072812e-05, + "loss": 2.8914, + "step": 21162 + }, + { + "epoch": 1.3137376621764232, + "grad_norm": 0.15471584440888148, + "learning_rate": 6.906825405900221e-05, + "loss": 2.8686, + "step": 21163 + }, + { + "epoch": 1.313799739276181, + "grad_norm": 0.15725771080111212, + "learning_rate": 6.90649154277926e-05, + "loss": 2.8033, + "step": 21164 + }, + { + "epoch": 1.313861816375939, + "grad_norm": 0.2025850253064583, + "learning_rate": 6.90615766971167e-05, + "loss": 2.8211, + "step": 21165 + }, + { + "epoch": 1.313923893475697, + "grad_norm": 0.18546809481574963, + "learning_rate": 6.905823786699192e-05, + "loss": 2.8587, + "step": 21166 + }, + { + "epoch": 1.3139859705754546, + "grad_norm": 0.15201664310474727, + "learning_rate": 6.905489893743569e-05, + "loss": 2.8539, + "step": 21167 + }, + { + "epoch": 1.3140480476752125, + "grad_norm": 0.16994466299959562, + "learning_rate": 6.905155990846542e-05, + "loss": 2.8578, + "step": 21168 + }, + { + "epoch": 1.3141101247749705, + "grad_norm": 0.15965864171775154, + "learning_rate": 6.904822078009854e-05, + "loss": 2.862, + "step": 21169 + }, + { + "epoch": 1.3141722018747284, + "grad_norm": 0.19479644309696936, + "learning_rate": 6.904488155235246e-05, + "loss": 2.7753, + "step": 21170 + }, + { + "epoch": 1.3142342789744863, + "grad_norm": 0.19106935513820367, + "learning_rate": 6.904154222524462e-05, + "loss": 2.8276, + "step": 21171 + }, + { + "epoch": 1.3142963560742442, + "grad_norm": 0.14987965887438937, + "learning_rate": 6.903820279879243e-05, + "loss": 2.7465, + "step": 21172 + }, + { + "epoch": 1.3143584331740021, + "grad_norm": 0.15449771003160295, + "learning_rate": 6.903486327301331e-05, + "loss": 2.8564, + "step": 21173 + }, + { + "epoch": 1.31442051027376, + "grad_norm": 0.14746563000823204, + "learning_rate": 6.90315236479247e-05, + "loss": 2.7407, + "step": 21174 + }, + { + "epoch": 1.314482587373518, + "grad_norm": 0.1528872917135512, + "learning_rate": 6.9028183923544e-05, + "loss": 2.7865, + "step": 21175 + }, + { + "epoch": 1.3145446644732757, + "grad_norm": 0.20493845043270276, + "learning_rate": 6.902484409988865e-05, + "loss": 2.8443, + "step": 21176 + }, + { + "epoch": 1.3146067415730336, + "grad_norm": 0.14926789183578992, + "learning_rate": 6.902150417697608e-05, + "loss": 2.8245, + "step": 21177 + }, + { + "epoch": 1.3146688186727915, + "grad_norm": 0.1632504803809653, + "learning_rate": 6.901816415482368e-05, + "loss": 2.874, + "step": 21178 + }, + { + "epoch": 1.3147308957725494, + "grad_norm": 0.1614799401735936, + "learning_rate": 6.901482403344893e-05, + "loss": 2.8249, + "step": 21179 + }, + { + "epoch": 1.3147929728723073, + "grad_norm": 0.1674421060739512, + "learning_rate": 6.901148381286921e-05, + "loss": 2.8754, + "step": 21180 + }, + { + "epoch": 1.3148550499720653, + "grad_norm": 0.1489408204036313, + "learning_rate": 6.900814349310196e-05, + "loss": 2.7776, + "step": 21181 + }, + { + "epoch": 1.3149171270718232, + "grad_norm": 0.16693341185663735, + "learning_rate": 6.900480307416461e-05, + "loss": 2.8451, + "step": 21182 + }, + { + "epoch": 1.314979204171581, + "grad_norm": 0.1415198123013773, + "learning_rate": 6.90014625560746e-05, + "loss": 2.7613, + "step": 21183 + }, + { + "epoch": 1.315041281271339, + "grad_norm": 0.17088814840678582, + "learning_rate": 6.899812193884934e-05, + "loss": 2.8018, + "step": 21184 + }, + { + "epoch": 1.315103358371097, + "grad_norm": 0.20785338530218836, + "learning_rate": 6.899478122250627e-05, + "loss": 2.8451, + "step": 21185 + }, + { + "epoch": 1.3151654354708548, + "grad_norm": 0.20434170152763634, + "learning_rate": 6.899144040706282e-05, + "loss": 2.8782, + "step": 21186 + }, + { + "epoch": 1.3152275125706128, + "grad_norm": 0.15269041965517888, + "learning_rate": 6.89880994925364e-05, + "loss": 2.7878, + "step": 21187 + }, + { + "epoch": 1.3152895896703707, + "grad_norm": 0.20472973795427735, + "learning_rate": 6.898475847894446e-05, + "loss": 2.8767, + "step": 21188 + }, + { + "epoch": 1.3153516667701286, + "grad_norm": 0.18548588001166685, + "learning_rate": 6.898141736630442e-05, + "loss": 2.9335, + "step": 21189 + }, + { + "epoch": 1.3154137438698865, + "grad_norm": 0.17111734811047036, + "learning_rate": 6.897807615463374e-05, + "loss": 2.7945, + "step": 21190 + }, + { + "epoch": 1.3154758209696442, + "grad_norm": 0.15048182805983285, + "learning_rate": 6.89747348439498e-05, + "loss": 2.814, + "step": 21191 + }, + { + "epoch": 1.3155378980694021, + "grad_norm": 0.17958950397403306, + "learning_rate": 6.89713934342701e-05, + "loss": 2.8488, + "step": 21192 + }, + { + "epoch": 1.31559997516916, + "grad_norm": 0.1556798965173971, + "learning_rate": 6.8968051925612e-05, + "loss": 2.8503, + "step": 21193 + }, + { + "epoch": 1.315662052268918, + "grad_norm": 0.1738941088822846, + "learning_rate": 6.896471031799298e-05, + "loss": 2.8231, + "step": 21194 + }, + { + "epoch": 1.3157241293686759, + "grad_norm": 0.1685988506485978, + "learning_rate": 6.896136861143047e-05, + "loss": 2.8518, + "step": 21195 + }, + { + "epoch": 1.3157862064684338, + "grad_norm": 0.17742053818797485, + "learning_rate": 6.89580268059419e-05, + "loss": 2.878, + "step": 21196 + }, + { + "epoch": 1.3158482835681917, + "grad_norm": 0.14810217451124175, + "learning_rate": 6.89546849015447e-05, + "loss": 2.8567, + "step": 21197 + }, + { + "epoch": 1.3159103606679496, + "grad_norm": 0.18315663957839146, + "learning_rate": 6.895134289825628e-05, + "loss": 2.773, + "step": 21198 + }, + { + "epoch": 1.3159724377677076, + "grad_norm": 0.154260075683684, + "learning_rate": 6.894800079609414e-05, + "loss": 2.8759, + "step": 21199 + }, + { + "epoch": 1.3160345148674653, + "grad_norm": 0.15981371232842148, + "learning_rate": 6.894465859507566e-05, + "loss": 2.7634, + "step": 21200 + }, + { + "epoch": 1.3160965919672232, + "grad_norm": 0.15346232597982531, + "learning_rate": 6.894131629521829e-05, + "loss": 2.8631, + "step": 21201 + }, + { + "epoch": 1.316158669066981, + "grad_norm": 0.1467211653253649, + "learning_rate": 6.89379738965395e-05, + "loss": 2.7651, + "step": 21202 + }, + { + "epoch": 1.316220746166739, + "grad_norm": 0.1481811417849039, + "learning_rate": 6.893463139905668e-05, + "loss": 2.9532, + "step": 21203 + }, + { + "epoch": 1.316282823266497, + "grad_norm": 0.16241442462811592, + "learning_rate": 6.89312888027873e-05, + "loss": 2.8915, + "step": 21204 + }, + { + "epoch": 1.3163449003662548, + "grad_norm": 0.14979586718279828, + "learning_rate": 6.89279461077488e-05, + "loss": 2.714, + "step": 21205 + }, + { + "epoch": 1.3164069774660128, + "grad_norm": 0.16994014729440232, + "learning_rate": 6.89246033139586e-05, + "loss": 2.7966, + "step": 21206 + }, + { + "epoch": 1.3164690545657707, + "grad_norm": 0.16025449484206208, + "learning_rate": 6.892126042143416e-05, + "loss": 2.8232, + "step": 21207 + }, + { + "epoch": 1.3165311316655286, + "grad_norm": 0.2100406229373744, + "learning_rate": 6.891791743019291e-05, + "loss": 2.73, + "step": 21208 + }, + { + "epoch": 1.3165932087652865, + "grad_norm": 0.14280411042460436, + "learning_rate": 6.891457434025228e-05, + "loss": 2.8235, + "step": 21209 + }, + { + "epoch": 1.3166552858650444, + "grad_norm": 0.17950014007517734, + "learning_rate": 6.891123115162972e-05, + "loss": 2.795, + "step": 21210 + }, + { + "epoch": 1.3167173629648024, + "grad_norm": 0.17407037699184452, + "learning_rate": 6.890788786434268e-05, + "loss": 2.8222, + "step": 21211 + }, + { + "epoch": 1.3167794400645603, + "grad_norm": 0.15165839332396006, + "learning_rate": 6.890454447840861e-05, + "loss": 2.9615, + "step": 21212 + }, + { + "epoch": 1.3168415171643182, + "grad_norm": 0.16047642849173321, + "learning_rate": 6.890120099384494e-05, + "loss": 2.8957, + "step": 21213 + }, + { + "epoch": 1.316903594264076, + "grad_norm": 0.14666360063378023, + "learning_rate": 6.889785741066912e-05, + "loss": 2.8189, + "step": 21214 + }, + { + "epoch": 1.3169656713638338, + "grad_norm": 0.15587365603689707, + "learning_rate": 6.889451372889858e-05, + "loss": 2.8236, + "step": 21215 + }, + { + "epoch": 1.3170277484635917, + "grad_norm": 0.16786068944550428, + "learning_rate": 6.889116994855079e-05, + "loss": 2.8507, + "step": 21216 + }, + { + "epoch": 1.3170898255633496, + "grad_norm": 0.25101678429486357, + "learning_rate": 6.888782606964314e-05, + "loss": 2.8272, + "step": 21217 + }, + { + "epoch": 1.3171519026631076, + "grad_norm": 0.16534931883358753, + "learning_rate": 6.888448209219315e-05, + "loss": 2.8346, + "step": 21218 + }, + { + "epoch": 1.3172139797628655, + "grad_norm": 0.2073636265953302, + "learning_rate": 6.888113801621823e-05, + "loss": 2.9786, + "step": 21219 + }, + { + "epoch": 1.3172760568626234, + "grad_norm": 0.1822623206540952, + "learning_rate": 6.887779384173582e-05, + "loss": 2.8569, + "step": 21220 + }, + { + "epoch": 1.3173381339623813, + "grad_norm": 0.22715668208214587, + "learning_rate": 6.887444956876339e-05, + "loss": 2.9398, + "step": 21221 + }, + { + "epoch": 1.3174002110621392, + "grad_norm": 0.15714732833231923, + "learning_rate": 6.887110519731837e-05, + "loss": 2.8727, + "step": 21222 + }, + { + "epoch": 1.3174622881618971, + "grad_norm": 0.1661414398959777, + "learning_rate": 6.886776072741821e-05, + "loss": 2.9405, + "step": 21223 + }, + { + "epoch": 1.3175243652616548, + "grad_norm": 0.1482941474872728, + "learning_rate": 6.886441615908036e-05, + "loss": 2.8282, + "step": 21224 + }, + { + "epoch": 1.3175864423614128, + "grad_norm": 0.15951810447304587, + "learning_rate": 6.886107149232228e-05, + "loss": 2.8274, + "step": 21225 + }, + { + "epoch": 1.3176485194611707, + "grad_norm": 0.1537802533879241, + "learning_rate": 6.885772672716141e-05, + "loss": 2.7842, + "step": 21226 + }, + { + "epoch": 1.3177105965609286, + "grad_norm": 0.15741999201441198, + "learning_rate": 6.88543818636152e-05, + "loss": 2.7718, + "step": 21227 + }, + { + "epoch": 1.3177726736606865, + "grad_norm": 0.17015907021664572, + "learning_rate": 6.885103690170109e-05, + "loss": 2.8315, + "step": 21228 + }, + { + "epoch": 1.3178347507604444, + "grad_norm": 0.16705753765449838, + "learning_rate": 6.884769184143656e-05, + "loss": 2.8676, + "step": 21229 + }, + { + "epoch": 1.3178968278602023, + "grad_norm": 0.19954744038805305, + "learning_rate": 6.884434668283905e-05, + "loss": 2.888, + "step": 21230 + }, + { + "epoch": 1.3179589049599603, + "grad_norm": 0.16815127945539954, + "learning_rate": 6.884100142592599e-05, + "loss": 2.8391, + "step": 21231 + }, + { + "epoch": 1.3180209820597182, + "grad_norm": 0.21209144200341878, + "learning_rate": 6.883765607071486e-05, + "loss": 2.7869, + "step": 21232 + }, + { + "epoch": 1.318083059159476, + "grad_norm": 0.1599368301968638, + "learning_rate": 6.883431061722312e-05, + "loss": 2.9087, + "step": 21233 + }, + { + "epoch": 1.318145136259234, + "grad_norm": 0.2760224770527657, + "learning_rate": 6.883096506546818e-05, + "loss": 2.9118, + "step": 21234 + }, + { + "epoch": 1.318207213358992, + "grad_norm": 0.16589822545048127, + "learning_rate": 6.882761941546755e-05, + "loss": 2.9608, + "step": 21235 + }, + { + "epoch": 1.3182692904587499, + "grad_norm": 0.17182014955599542, + "learning_rate": 6.882427366723864e-05, + "loss": 2.7765, + "step": 21236 + }, + { + "epoch": 1.3183313675585078, + "grad_norm": 0.2377404501568414, + "learning_rate": 6.882092782079895e-05, + "loss": 2.9035, + "step": 21237 + }, + { + "epoch": 1.3183934446582657, + "grad_norm": 0.1809900899006366, + "learning_rate": 6.881758187616586e-05, + "loss": 2.8871, + "step": 21238 + }, + { + "epoch": 1.3184555217580234, + "grad_norm": 0.16987355959410017, + "learning_rate": 6.881423583335693e-05, + "loss": 2.835, + "step": 21239 + }, + { + "epoch": 1.3185175988577813, + "grad_norm": 0.18170469446182647, + "learning_rate": 6.881088969238956e-05, + "loss": 2.8716, + "step": 21240 + }, + { + "epoch": 1.3185796759575392, + "grad_norm": 0.15881364662577666, + "learning_rate": 6.880754345328119e-05, + "loss": 2.7794, + "step": 21241 + }, + { + "epoch": 1.3186417530572971, + "grad_norm": 0.1524027807847146, + "learning_rate": 6.880419711604931e-05, + "loss": 2.8151, + "step": 21242 + }, + { + "epoch": 1.318703830157055, + "grad_norm": 0.17061367288132245, + "learning_rate": 6.880085068071136e-05, + "loss": 2.8313, + "step": 21243 + }, + { + "epoch": 1.318765907256813, + "grad_norm": 0.15780105832612873, + "learning_rate": 6.879750414728482e-05, + "loss": 2.8498, + "step": 21244 + }, + { + "epoch": 1.318827984356571, + "grad_norm": 0.17324482099731253, + "learning_rate": 6.879415751578712e-05, + "loss": 2.9087, + "step": 21245 + }, + { + "epoch": 1.3188900614563288, + "grad_norm": 0.1611127436306701, + "learning_rate": 6.879081078623575e-05, + "loss": 2.8151, + "step": 21246 + }, + { + "epoch": 1.3189521385560867, + "grad_norm": 0.17051694524368036, + "learning_rate": 6.878746395864817e-05, + "loss": 2.892, + "step": 21247 + }, + { + "epoch": 1.3190142156558444, + "grad_norm": 0.15759959896798775, + "learning_rate": 6.878411703304181e-05, + "loss": 2.877, + "step": 21248 + }, + { + "epoch": 1.3190762927556023, + "grad_norm": 0.16236892856489848, + "learning_rate": 6.878077000943416e-05, + "loss": 2.8396, + "step": 21249 + }, + { + "epoch": 1.3191383698553603, + "grad_norm": 0.15680463818242435, + "learning_rate": 6.877742288784267e-05, + "loss": 2.8368, + "step": 21250 + }, + { + "epoch": 1.3192004469551182, + "grad_norm": 0.14921655877986031, + "learning_rate": 6.877407566828482e-05, + "loss": 2.9102, + "step": 21251 + }, + { + "epoch": 1.319262524054876, + "grad_norm": 0.18103375113214912, + "learning_rate": 6.877072835077804e-05, + "loss": 2.8133, + "step": 21252 + }, + { + "epoch": 1.319324601154634, + "grad_norm": 0.1502356535826901, + "learning_rate": 6.876738093533983e-05, + "loss": 2.8514, + "step": 21253 + }, + { + "epoch": 1.319386678254392, + "grad_norm": 0.17249203592726817, + "learning_rate": 6.876403342198762e-05, + "loss": 2.8685, + "step": 21254 + }, + { + "epoch": 1.3194487553541498, + "grad_norm": 0.16112961621437766, + "learning_rate": 6.876068581073891e-05, + "loss": 2.9067, + "step": 21255 + }, + { + "epoch": 1.3195108324539078, + "grad_norm": 0.1701307766520126, + "learning_rate": 6.875733810161114e-05, + "loss": 2.8888, + "step": 21256 + }, + { + "epoch": 1.3195729095536657, + "grad_norm": 0.16642986116667502, + "learning_rate": 6.875399029462179e-05, + "loss": 2.9235, + "step": 21257 + }, + { + "epoch": 1.3196349866534236, + "grad_norm": 0.1635734444215581, + "learning_rate": 6.875064238978831e-05, + "loss": 2.9443, + "step": 21258 + }, + { + "epoch": 1.3196970637531815, + "grad_norm": 0.17993631965008913, + "learning_rate": 6.874729438712818e-05, + "loss": 2.8645, + "step": 21259 + }, + { + "epoch": 1.3197591408529394, + "grad_norm": 0.16112789955741155, + "learning_rate": 6.874394628665888e-05, + "loss": 2.8464, + "step": 21260 + }, + { + "epoch": 1.3198212179526974, + "grad_norm": 0.18168869202927795, + "learning_rate": 6.874059808839784e-05, + "loss": 2.8735, + "step": 21261 + }, + { + "epoch": 1.3198832950524553, + "grad_norm": 0.1634614185442312, + "learning_rate": 6.873724979236255e-05, + "loss": 2.8782, + "step": 21262 + }, + { + "epoch": 1.319945372152213, + "grad_norm": 0.17715182569264473, + "learning_rate": 6.873390139857048e-05, + "loss": 2.9143, + "step": 21263 + }, + { + "epoch": 1.3200074492519709, + "grad_norm": 0.15288480024396953, + "learning_rate": 6.873055290703911e-05, + "loss": 2.8227, + "step": 21264 + }, + { + "epoch": 1.3200695263517288, + "grad_norm": 0.15264753472976741, + "learning_rate": 6.87272043177859e-05, + "loss": 2.8659, + "step": 21265 + }, + { + "epoch": 1.3201316034514867, + "grad_norm": 0.15640852158618432, + "learning_rate": 6.872385563082831e-05, + "loss": 2.805, + "step": 21266 + }, + { + "epoch": 1.3201936805512446, + "grad_norm": 0.15861829981094241, + "learning_rate": 6.872050684618382e-05, + "loss": 2.8408, + "step": 21267 + }, + { + "epoch": 1.3202557576510026, + "grad_norm": 0.22843353720748932, + "learning_rate": 6.87171579638699e-05, + "loss": 2.8371, + "step": 21268 + }, + { + "epoch": 1.3203178347507605, + "grad_norm": 0.1540080034270956, + "learning_rate": 6.871380898390403e-05, + "loss": 2.9464, + "step": 21269 + }, + { + "epoch": 1.3203799118505184, + "grad_norm": 0.15862377779777612, + "learning_rate": 6.871045990630367e-05, + "loss": 2.8768, + "step": 21270 + }, + { + "epoch": 1.3204419889502763, + "grad_norm": 0.15721136813461878, + "learning_rate": 6.87071107310863e-05, + "loss": 2.8789, + "step": 21271 + }, + { + "epoch": 1.320504066050034, + "grad_norm": 0.15920518308195247, + "learning_rate": 6.87037614582694e-05, + "loss": 2.851, + "step": 21272 + }, + { + "epoch": 1.320566143149792, + "grad_norm": 0.15679936986248907, + "learning_rate": 6.870041208787042e-05, + "loss": 2.8614, + "step": 21273 + }, + { + "epoch": 1.3206282202495498, + "grad_norm": 0.14897592044479716, + "learning_rate": 6.869706261990686e-05, + "loss": 2.8127, + "step": 21274 + }, + { + "epoch": 1.3206902973493078, + "grad_norm": 0.15551262552708403, + "learning_rate": 6.86937130543962e-05, + "loss": 2.9003, + "step": 21275 + }, + { + "epoch": 1.3207523744490657, + "grad_norm": 0.1802826954457511, + "learning_rate": 6.869036339135589e-05, + "loss": 2.8979, + "step": 21276 + }, + { + "epoch": 1.3208144515488236, + "grad_norm": 0.16690136854743184, + "learning_rate": 6.86870136308034e-05, + "loss": 2.898, + "step": 21277 + }, + { + "epoch": 1.3208765286485815, + "grad_norm": 0.14455697114286198, + "learning_rate": 6.868366377275625e-05, + "loss": 2.7725, + "step": 21278 + }, + { + "epoch": 1.3209386057483394, + "grad_norm": 0.2114762120526362, + "learning_rate": 6.868031381723187e-05, + "loss": 2.849, + "step": 21279 + }, + { + "epoch": 1.3210006828480974, + "grad_norm": 0.1617457551568694, + "learning_rate": 6.867696376424777e-05, + "loss": 2.9665, + "step": 21280 + }, + { + "epoch": 1.3210627599478553, + "grad_norm": 0.14936495578586503, + "learning_rate": 6.86736136138214e-05, + "loss": 2.8652, + "step": 21281 + }, + { + "epoch": 1.3211248370476132, + "grad_norm": 0.15128873206983315, + "learning_rate": 6.867026336597027e-05, + "loss": 2.8769, + "step": 21282 + }, + { + "epoch": 1.321186914147371, + "grad_norm": 0.1566927597158241, + "learning_rate": 6.866691302071183e-05, + "loss": 2.7907, + "step": 21283 + }, + { + "epoch": 1.321248991247129, + "grad_norm": 0.1605822491715206, + "learning_rate": 6.866356257806358e-05, + "loss": 2.827, + "step": 21284 + }, + { + "epoch": 1.321311068346887, + "grad_norm": 0.16066827569580586, + "learning_rate": 6.8660212038043e-05, + "loss": 2.8778, + "step": 21285 + }, + { + "epoch": 1.3213731454466449, + "grad_norm": 0.1586287037144238, + "learning_rate": 6.865686140066757e-05, + "loss": 2.8498, + "step": 21286 + }, + { + "epoch": 1.3214352225464026, + "grad_norm": 0.16189533488860777, + "learning_rate": 6.865351066595475e-05, + "loss": 2.9072, + "step": 21287 + }, + { + "epoch": 1.3214972996461605, + "grad_norm": 0.16471655218416398, + "learning_rate": 6.865015983392203e-05, + "loss": 2.9293, + "step": 21288 + }, + { + "epoch": 1.3215593767459184, + "grad_norm": 0.15890156255126925, + "learning_rate": 6.864680890458691e-05, + "loss": 2.785, + "step": 21289 + }, + { + "epoch": 1.3216214538456763, + "grad_norm": 0.1545526702005666, + "learning_rate": 6.864345787796686e-05, + "loss": 2.801, + "step": 21290 + }, + { + "epoch": 1.3216835309454342, + "grad_norm": 0.1884671045806158, + "learning_rate": 6.864010675407937e-05, + "loss": 2.8319, + "step": 21291 + }, + { + "epoch": 1.3217456080451921, + "grad_norm": 0.17027777637187738, + "learning_rate": 6.863675553294191e-05, + "loss": 2.8284, + "step": 21292 + }, + { + "epoch": 1.32180768514495, + "grad_norm": 0.1527698865812874, + "learning_rate": 6.863340421457199e-05, + "loss": 2.8269, + "step": 21293 + }, + { + "epoch": 1.321869762244708, + "grad_norm": 0.16140264978852725, + "learning_rate": 6.863005279898705e-05, + "loss": 2.8687, + "step": 21294 + }, + { + "epoch": 1.321931839344466, + "grad_norm": 0.17780000920998343, + "learning_rate": 6.862670128620462e-05, + "loss": 2.9399, + "step": 21295 + }, + { + "epoch": 1.3219939164442236, + "grad_norm": 0.16252066247044605, + "learning_rate": 6.862334967624217e-05, + "loss": 2.933, + "step": 21296 + }, + { + "epoch": 1.3220559935439815, + "grad_norm": 0.15858687259062565, + "learning_rate": 6.861999796911716e-05, + "loss": 2.8928, + "step": 21297 + }, + { + "epoch": 1.3221180706437394, + "grad_norm": 0.15856408513562054, + "learning_rate": 6.861664616484713e-05, + "loss": 2.7788, + "step": 21298 + }, + { + "epoch": 1.3221801477434973, + "grad_norm": 0.1821353287369826, + "learning_rate": 6.86132942634495e-05, + "loss": 2.9007, + "step": 21299 + }, + { + "epoch": 1.3222422248432553, + "grad_norm": 0.15190660897349664, + "learning_rate": 6.860994226494183e-05, + "loss": 2.8192, + "step": 21300 + }, + { + "epoch": 1.3223043019430132, + "grad_norm": 0.1672123132991383, + "learning_rate": 6.860659016934157e-05, + "loss": 2.9212, + "step": 21301 + }, + { + "epoch": 1.322366379042771, + "grad_norm": 0.15472371780373045, + "learning_rate": 6.860323797666619e-05, + "loss": 2.8561, + "step": 21302 + }, + { + "epoch": 1.322428456142529, + "grad_norm": 0.15573467270952904, + "learning_rate": 6.859988568693323e-05, + "loss": 2.8756, + "step": 21303 + }, + { + "epoch": 1.322490533242287, + "grad_norm": 0.15721458484210254, + "learning_rate": 6.859653330016013e-05, + "loss": 2.8351, + "step": 21304 + }, + { + "epoch": 1.3225526103420449, + "grad_norm": 0.16280776988057413, + "learning_rate": 6.859318081636442e-05, + "loss": 2.8779, + "step": 21305 + }, + { + "epoch": 1.3226146874418028, + "grad_norm": 0.1486190639157411, + "learning_rate": 6.858982823556355e-05, + "loss": 2.7558, + "step": 21306 + }, + { + "epoch": 1.3226767645415607, + "grad_norm": 0.15663379977760522, + "learning_rate": 6.858647555777504e-05, + "loss": 2.8506, + "step": 21307 + }, + { + "epoch": 1.3227388416413186, + "grad_norm": 0.1585520792932445, + "learning_rate": 6.858312278301637e-05, + "loss": 2.8855, + "step": 21308 + }, + { + "epoch": 1.3228009187410765, + "grad_norm": 0.15685464574032446, + "learning_rate": 6.857976991130506e-05, + "loss": 2.8588, + "step": 21309 + }, + { + "epoch": 1.3228629958408344, + "grad_norm": 0.14708786311302124, + "learning_rate": 6.857641694265856e-05, + "loss": 2.8495, + "step": 21310 + }, + { + "epoch": 1.3229250729405921, + "grad_norm": 0.16049694250067476, + "learning_rate": 6.857306387709439e-05, + "loss": 2.8727, + "step": 21311 + }, + { + "epoch": 1.32298715004035, + "grad_norm": 0.1580902273690873, + "learning_rate": 6.856971071463004e-05, + "loss": 2.8726, + "step": 21312 + }, + { + "epoch": 1.323049227140108, + "grad_norm": 0.1568303476574991, + "learning_rate": 6.856635745528299e-05, + "loss": 2.8927, + "step": 21313 + }, + { + "epoch": 1.323111304239866, + "grad_norm": 0.15535853711020933, + "learning_rate": 6.856300409907077e-05, + "loss": 2.7308, + "step": 21314 + }, + { + "epoch": 1.3231733813396238, + "grad_norm": 0.17051071872397394, + "learning_rate": 6.855965064601082e-05, + "loss": 2.8502, + "step": 21315 + }, + { + "epoch": 1.3232354584393817, + "grad_norm": 0.1626732510175172, + "learning_rate": 6.855629709612068e-05, + "loss": 2.8386, + "step": 21316 + }, + { + "epoch": 1.3232975355391396, + "grad_norm": 0.15889049659181378, + "learning_rate": 6.855294344941782e-05, + "loss": 2.8695, + "step": 21317 + }, + { + "epoch": 1.3233596126388976, + "grad_norm": 0.16366852276630747, + "learning_rate": 6.854958970591978e-05, + "loss": 2.8922, + "step": 21318 + }, + { + "epoch": 1.3234216897386555, + "grad_norm": 0.16195773305377692, + "learning_rate": 6.854623586564401e-05, + "loss": 2.8877, + "step": 21319 + }, + { + "epoch": 1.3234837668384132, + "grad_norm": 0.15368596592711875, + "learning_rate": 6.854288192860803e-05, + "loss": 2.8004, + "step": 21320 + }, + { + "epoch": 1.323545843938171, + "grad_norm": 0.16381046827614382, + "learning_rate": 6.853952789482933e-05, + "loss": 2.8621, + "step": 21321 + }, + { + "epoch": 1.323607921037929, + "grad_norm": 0.14464100330563337, + "learning_rate": 6.853617376432542e-05, + "loss": 2.7883, + "step": 21322 + }, + { + "epoch": 1.323669998137687, + "grad_norm": 0.20322509280223888, + "learning_rate": 6.853281953711379e-05, + "loss": 2.8713, + "step": 21323 + }, + { + "epoch": 1.3237320752374448, + "grad_norm": 0.15060529003150022, + "learning_rate": 6.852946521321193e-05, + "loss": 2.7197, + "step": 21324 + }, + { + "epoch": 1.3237941523372028, + "grad_norm": 0.15689014196459233, + "learning_rate": 6.852611079263737e-05, + "loss": 2.8105, + "step": 21325 + }, + { + "epoch": 1.3238562294369607, + "grad_norm": 0.148752280305967, + "learning_rate": 6.852275627540757e-05, + "loss": 2.8732, + "step": 21326 + }, + { + "epoch": 1.3239183065367186, + "grad_norm": 0.15342068272162426, + "learning_rate": 6.851940166154007e-05, + "loss": 2.7921, + "step": 21327 + }, + { + "epoch": 1.3239803836364765, + "grad_norm": 0.1972434437433334, + "learning_rate": 6.851604695105235e-05, + "loss": 2.8462, + "step": 21328 + }, + { + "epoch": 1.3240424607362344, + "grad_norm": 0.1620928572087203, + "learning_rate": 6.851269214396192e-05, + "loss": 2.9316, + "step": 21329 + }, + { + "epoch": 1.3241045378359924, + "grad_norm": 0.1692667202572076, + "learning_rate": 6.850933724028629e-05, + "loss": 2.8575, + "step": 21330 + }, + { + "epoch": 1.3241666149357503, + "grad_norm": 0.15146436955607087, + "learning_rate": 6.850598224004293e-05, + "loss": 2.7851, + "step": 21331 + }, + { + "epoch": 1.3242286920355082, + "grad_norm": 0.14263198436206842, + "learning_rate": 6.850262714324938e-05, + "loss": 2.89, + "step": 21332 + }, + { + "epoch": 1.324290769135266, + "grad_norm": 0.15345886069158066, + "learning_rate": 6.849927194992312e-05, + "loss": 2.8541, + "step": 21333 + }, + { + "epoch": 1.324352846235024, + "grad_norm": 0.14677336478661748, + "learning_rate": 6.84959166600817e-05, + "loss": 2.863, + "step": 21334 + }, + { + "epoch": 1.3244149233347817, + "grad_norm": 0.23068245177325042, + "learning_rate": 6.849256127374258e-05, + "loss": 2.8122, + "step": 21335 + }, + { + "epoch": 1.3244770004345396, + "grad_norm": 0.15715179679669036, + "learning_rate": 6.848920579092327e-05, + "loss": 2.8705, + "step": 21336 + }, + { + "epoch": 1.3245390775342976, + "grad_norm": 0.1479873859790769, + "learning_rate": 6.848585021164129e-05, + "loss": 2.9364, + "step": 21337 + }, + { + "epoch": 1.3246011546340555, + "grad_norm": 0.15119577977830637, + "learning_rate": 6.848249453591413e-05, + "loss": 2.821, + "step": 21338 + }, + { + "epoch": 1.3246632317338134, + "grad_norm": 0.18829240845028128, + "learning_rate": 6.847913876375933e-05, + "loss": 2.8514, + "step": 21339 + }, + { + "epoch": 1.3247253088335713, + "grad_norm": 0.1650715293776234, + "learning_rate": 6.847578289519436e-05, + "loss": 2.9733, + "step": 21340 + }, + { + "epoch": 1.3247873859333292, + "grad_norm": 0.16785939671284494, + "learning_rate": 6.847242693023676e-05, + "loss": 2.9504, + "step": 21341 + }, + { + "epoch": 1.3248494630330871, + "grad_norm": 0.18383814392477674, + "learning_rate": 6.846907086890402e-05, + "loss": 2.7733, + "step": 21342 + }, + { + "epoch": 1.324911540132845, + "grad_norm": 0.14458843337178487, + "learning_rate": 6.846571471121365e-05, + "loss": 2.8413, + "step": 21343 + }, + { + "epoch": 1.3249736172326028, + "grad_norm": 0.1939178262923709, + "learning_rate": 6.846235845718316e-05, + "loss": 2.7996, + "step": 21344 + }, + { + "epoch": 1.3250356943323607, + "grad_norm": 0.15846468648964546, + "learning_rate": 6.845900210683007e-05, + "loss": 2.7369, + "step": 21345 + }, + { + "epoch": 1.3250977714321186, + "grad_norm": 0.1547178609510816, + "learning_rate": 6.845564566017189e-05, + "loss": 2.8932, + "step": 21346 + }, + { + "epoch": 1.3251598485318765, + "grad_norm": 0.15441439776452953, + "learning_rate": 6.845228911722612e-05, + "loss": 2.7415, + "step": 21347 + }, + { + "epoch": 1.3252219256316344, + "grad_norm": 0.16341552438021434, + "learning_rate": 6.844893247801026e-05, + "loss": 2.8598, + "step": 21348 + }, + { + "epoch": 1.3252840027313924, + "grad_norm": 0.1667135546463775, + "learning_rate": 6.844557574254186e-05, + "loss": 2.8798, + "step": 21349 + }, + { + "epoch": 1.3253460798311503, + "grad_norm": 0.14477860702878706, + "learning_rate": 6.844221891083842e-05, + "loss": 2.8599, + "step": 21350 + }, + { + "epoch": 1.3254081569309082, + "grad_norm": 0.15175781912785224, + "learning_rate": 6.843886198291743e-05, + "loss": 2.8383, + "step": 21351 + }, + { + "epoch": 1.325470234030666, + "grad_norm": 0.14578219919417, + "learning_rate": 6.843550495879643e-05, + "loss": 2.8195, + "step": 21352 + }, + { + "epoch": 1.325532311130424, + "grad_norm": 0.15019991164488994, + "learning_rate": 6.843214783849292e-05, + "loss": 2.9117, + "step": 21353 + }, + { + "epoch": 1.325594388230182, + "grad_norm": 0.20865418740864622, + "learning_rate": 6.842879062202444e-05, + "loss": 2.8541, + "step": 21354 + }, + { + "epoch": 1.3256564653299399, + "grad_norm": 0.16694355418218162, + "learning_rate": 6.842543330940846e-05, + "loss": 2.8058, + "step": 21355 + }, + { + "epoch": 1.3257185424296978, + "grad_norm": 0.19044516084847055, + "learning_rate": 6.842207590066254e-05, + "loss": 2.8102, + "step": 21356 + }, + { + "epoch": 1.3257806195294557, + "grad_norm": 0.15977403343477153, + "learning_rate": 6.841871839580417e-05, + "loss": 2.7517, + "step": 21357 + }, + { + "epoch": 1.3258426966292136, + "grad_norm": 0.15017996934123012, + "learning_rate": 6.841536079485087e-05, + "loss": 2.8017, + "step": 21358 + }, + { + "epoch": 1.3259047737289713, + "grad_norm": 0.17303065811704982, + "learning_rate": 6.841200309782017e-05, + "loss": 2.8968, + "step": 21359 + }, + { + "epoch": 1.3259668508287292, + "grad_norm": 0.20603599282743157, + "learning_rate": 6.840864530472957e-05, + "loss": 2.8216, + "step": 21360 + }, + { + "epoch": 1.3260289279284871, + "grad_norm": 0.14776279173138807, + "learning_rate": 6.840528741559662e-05, + "loss": 2.8781, + "step": 21361 + }, + { + "epoch": 1.326091005028245, + "grad_norm": 0.16486270217061688, + "learning_rate": 6.840192943043879e-05, + "loss": 2.8537, + "step": 21362 + }, + { + "epoch": 1.326153082128003, + "grad_norm": 0.1592669160237246, + "learning_rate": 6.839857134927366e-05, + "loss": 2.8462, + "step": 21363 + }, + { + "epoch": 1.326215159227761, + "grad_norm": 0.18865408961657937, + "learning_rate": 6.83952131721187e-05, + "loss": 2.8568, + "step": 21364 + }, + { + "epoch": 1.3262772363275188, + "grad_norm": 0.1465525659151063, + "learning_rate": 6.839185489899143e-05, + "loss": 2.906, + "step": 21365 + }, + { + "epoch": 1.3263393134272767, + "grad_norm": 0.19107271835110834, + "learning_rate": 6.83884965299094e-05, + "loss": 2.7806, + "step": 21366 + }, + { + "epoch": 1.3264013905270347, + "grad_norm": 0.18533465646996442, + "learning_rate": 6.838513806489012e-05, + "loss": 2.885, + "step": 21367 + }, + { + "epoch": 1.3264634676267923, + "grad_norm": 0.20700147702604657, + "learning_rate": 6.838177950395111e-05, + "loss": 2.7635, + "step": 21368 + }, + { + "epoch": 1.3265255447265503, + "grad_norm": 0.1879836143806117, + "learning_rate": 6.83784208471099e-05, + "loss": 2.8777, + "step": 21369 + }, + { + "epoch": 1.3265876218263082, + "grad_norm": 0.17533380994290795, + "learning_rate": 6.837506209438402e-05, + "loss": 2.7383, + "step": 21370 + }, + { + "epoch": 1.326649698926066, + "grad_norm": 0.1789985281328339, + "learning_rate": 6.837170324579095e-05, + "loss": 2.8169, + "step": 21371 + }, + { + "epoch": 1.326711776025824, + "grad_norm": 0.1625403856105808, + "learning_rate": 6.836834430134826e-05, + "loss": 2.8995, + "step": 21372 + }, + { + "epoch": 1.326773853125582, + "grad_norm": 0.16150755798440838, + "learning_rate": 6.836498526107346e-05, + "loss": 2.87, + "step": 21373 + }, + { + "epoch": 1.3268359302253399, + "grad_norm": 0.16246615066474096, + "learning_rate": 6.836162612498409e-05, + "loss": 2.7545, + "step": 21374 + }, + { + "epoch": 1.3268980073250978, + "grad_norm": 0.1524200920634204, + "learning_rate": 6.835826689309763e-05, + "loss": 2.8924, + "step": 21375 + }, + { + "epoch": 1.3269600844248557, + "grad_norm": 0.17135665499456407, + "learning_rate": 6.835490756543165e-05, + "loss": 2.8092, + "step": 21376 + }, + { + "epoch": 1.3270221615246136, + "grad_norm": 0.1512977810136921, + "learning_rate": 6.835154814200367e-05, + "loss": 2.828, + "step": 21377 + }, + { + "epoch": 1.3270842386243715, + "grad_norm": 0.16300460934777014, + "learning_rate": 6.834818862283117e-05, + "loss": 2.865, + "step": 21378 + }, + { + "epoch": 1.3271463157241294, + "grad_norm": 0.16830563583712438, + "learning_rate": 6.834482900793175e-05, + "loss": 2.8621, + "step": 21379 + }, + { + "epoch": 1.3272083928238874, + "grad_norm": 0.15174846006661596, + "learning_rate": 6.834146929732289e-05, + "loss": 2.8407, + "step": 21380 + }, + { + "epoch": 1.3272704699236453, + "grad_norm": 0.17402083719282382, + "learning_rate": 6.833810949102213e-05, + "loss": 2.7434, + "step": 21381 + }, + { + "epoch": 1.3273325470234032, + "grad_norm": 0.17578155199660744, + "learning_rate": 6.833474958904702e-05, + "loss": 2.8559, + "step": 21382 + }, + { + "epoch": 1.327394624123161, + "grad_norm": 0.18810728195626777, + "learning_rate": 6.833138959141506e-05, + "loss": 2.8622, + "step": 21383 + }, + { + "epoch": 1.3274567012229188, + "grad_norm": 0.16762133643643545, + "learning_rate": 6.832802949814378e-05, + "loss": 2.9571, + "step": 21384 + }, + { + "epoch": 1.3275187783226767, + "grad_norm": 0.1648806709145048, + "learning_rate": 6.832466930925073e-05, + "loss": 2.8781, + "step": 21385 + }, + { + "epoch": 1.3275808554224346, + "grad_norm": 0.17498149135327834, + "learning_rate": 6.832130902475344e-05, + "loss": 2.8828, + "step": 21386 + }, + { + "epoch": 1.3276429325221926, + "grad_norm": 0.1845730170803621, + "learning_rate": 6.831794864466941e-05, + "loss": 2.8126, + "step": 21387 + }, + { + "epoch": 1.3277050096219505, + "grad_norm": 0.17257554165869674, + "learning_rate": 6.831458816901623e-05, + "loss": 2.8501, + "step": 21388 + }, + { + "epoch": 1.3277670867217084, + "grad_norm": 0.14887068933288522, + "learning_rate": 6.831122759781137e-05, + "loss": 2.7301, + "step": 21389 + }, + { + "epoch": 1.3278291638214663, + "grad_norm": 0.1629587636998602, + "learning_rate": 6.83078669310724e-05, + "loss": 2.7803, + "step": 21390 + }, + { + "epoch": 1.3278912409212242, + "grad_norm": 0.15004530808515393, + "learning_rate": 6.830450616881684e-05, + "loss": 2.9157, + "step": 21391 + }, + { + "epoch": 1.327953318020982, + "grad_norm": 0.15768916036092306, + "learning_rate": 6.830114531106224e-05, + "loss": 2.8121, + "step": 21392 + }, + { + "epoch": 1.3280153951207398, + "grad_norm": 0.14741689769629013, + "learning_rate": 6.829778435782611e-05, + "loss": 2.8593, + "step": 21393 + }, + { + "epoch": 1.3280774722204978, + "grad_norm": 0.17538864478711721, + "learning_rate": 6.829442330912599e-05, + "loss": 2.8514, + "step": 21394 + }, + { + "epoch": 1.3281395493202557, + "grad_norm": 0.15428993957964063, + "learning_rate": 6.829106216497944e-05, + "loss": 2.7856, + "step": 21395 + }, + { + "epoch": 1.3282016264200136, + "grad_norm": 0.15901117239579973, + "learning_rate": 6.828770092540396e-05, + "loss": 2.8156, + "step": 21396 + }, + { + "epoch": 1.3282637035197715, + "grad_norm": 0.1951442279731994, + "learning_rate": 6.828433959041712e-05, + "loss": 2.8878, + "step": 21397 + }, + { + "epoch": 1.3283257806195294, + "grad_norm": 0.16887580676630815, + "learning_rate": 6.828097816003644e-05, + "loss": 2.8169, + "step": 21398 + }, + { + "epoch": 1.3283878577192874, + "grad_norm": 0.16425947444768993, + "learning_rate": 6.827761663427944e-05, + "loss": 2.8705, + "step": 21399 + }, + { + "epoch": 1.3284499348190453, + "grad_norm": 0.17243360646294936, + "learning_rate": 6.82742550131637e-05, + "loss": 2.8667, + "step": 21400 + }, + { + "epoch": 1.3285120119188032, + "grad_norm": 0.15684791248443014, + "learning_rate": 6.827089329670673e-05, + "loss": 2.8925, + "step": 21401 + }, + { + "epoch": 1.328574089018561, + "grad_norm": 0.15343649525905836, + "learning_rate": 6.826753148492607e-05, + "loss": 2.8374, + "step": 21402 + }, + { + "epoch": 1.328636166118319, + "grad_norm": 0.1566123753804221, + "learning_rate": 6.826416957783926e-05, + "loss": 2.7983, + "step": 21403 + }, + { + "epoch": 1.328698243218077, + "grad_norm": 0.15961929769382818, + "learning_rate": 6.826080757546385e-05, + "loss": 2.8678, + "step": 21404 + }, + { + "epoch": 1.3287603203178349, + "grad_norm": 0.16326104545614153, + "learning_rate": 6.825744547781738e-05, + "loss": 2.8059, + "step": 21405 + }, + { + "epoch": 1.3288223974175928, + "grad_norm": 0.14002625501263014, + "learning_rate": 6.825408328491737e-05, + "loss": 2.7788, + "step": 21406 + }, + { + "epoch": 1.3288844745173505, + "grad_norm": 0.14739999080047164, + "learning_rate": 6.825072099678138e-05, + "loss": 2.8724, + "step": 21407 + }, + { + "epoch": 1.3289465516171084, + "grad_norm": 0.15388876021680012, + "learning_rate": 6.824735861342696e-05, + "loss": 2.8072, + "step": 21408 + }, + { + "epoch": 1.3290086287168663, + "grad_norm": 0.15302189255519422, + "learning_rate": 6.824399613487163e-05, + "loss": 2.8092, + "step": 21409 + }, + { + "epoch": 1.3290707058166242, + "grad_norm": 0.15088345542395054, + "learning_rate": 6.824063356113294e-05, + "loss": 2.797, + "step": 21410 + }, + { + "epoch": 1.3291327829163821, + "grad_norm": 0.1487515477793173, + "learning_rate": 6.823727089222845e-05, + "loss": 2.7808, + "step": 21411 + }, + { + "epoch": 1.32919486001614, + "grad_norm": 0.14269167806955632, + "learning_rate": 6.823390812817568e-05, + "loss": 2.7727, + "step": 21412 + }, + { + "epoch": 1.329256937115898, + "grad_norm": 0.2018370141468897, + "learning_rate": 6.823054526899219e-05, + "loss": 2.8723, + "step": 21413 + }, + { + "epoch": 1.329319014215656, + "grad_norm": 0.167328379002577, + "learning_rate": 6.822718231469551e-05, + "loss": 2.7702, + "step": 21414 + }, + { + "epoch": 1.3293810913154138, + "grad_norm": 0.1639982336057491, + "learning_rate": 6.82238192653032e-05, + "loss": 2.8936, + "step": 21415 + }, + { + "epoch": 1.3294431684151715, + "grad_norm": 0.1509235217352818, + "learning_rate": 6.82204561208328e-05, + "loss": 2.8442, + "step": 21416 + }, + { + "epoch": 1.3295052455149294, + "grad_norm": 0.16060936029714143, + "learning_rate": 6.821709288130187e-05, + "loss": 2.8251, + "step": 21417 + }, + { + "epoch": 1.3295673226146874, + "grad_norm": 0.1649155069618915, + "learning_rate": 6.821372954672794e-05, + "loss": 2.9134, + "step": 21418 + }, + { + "epoch": 1.3296293997144453, + "grad_norm": 0.1545136655015319, + "learning_rate": 6.821036611712854e-05, + "loss": 2.7258, + "step": 21419 + }, + { + "epoch": 1.3296914768142032, + "grad_norm": 0.17515257629911327, + "learning_rate": 6.820700259252127e-05, + "loss": 2.9065, + "step": 21420 + }, + { + "epoch": 1.329753553913961, + "grad_norm": 0.1663573506643167, + "learning_rate": 6.820363897292363e-05, + "loss": 2.8576, + "step": 21421 + }, + { + "epoch": 1.329815631013719, + "grad_norm": 0.17010332701460826, + "learning_rate": 6.82002752583532e-05, + "loss": 2.8561, + "step": 21422 + }, + { + "epoch": 1.329877708113477, + "grad_norm": 0.1631587887990043, + "learning_rate": 6.81969114488275e-05, + "loss": 2.8712, + "step": 21423 + }, + { + "epoch": 1.3299397852132349, + "grad_norm": 0.22210644719048828, + "learning_rate": 6.819354754436409e-05, + "loss": 2.8546, + "step": 21424 + }, + { + "epoch": 1.3300018623129928, + "grad_norm": 0.17113479618332542, + "learning_rate": 6.819018354498053e-05, + "loss": 2.8856, + "step": 21425 + }, + { + "epoch": 1.3300639394127507, + "grad_norm": 0.1671120118909098, + "learning_rate": 6.818681945069438e-05, + "loss": 2.7776, + "step": 21426 + }, + { + "epoch": 1.3301260165125086, + "grad_norm": 0.16416012039501882, + "learning_rate": 6.818345526152316e-05, + "loss": 2.8243, + "step": 21427 + }, + { + "epoch": 1.3301880936122665, + "grad_norm": 0.1847402607643936, + "learning_rate": 6.818009097748447e-05, + "loss": 2.8162, + "step": 21428 + }, + { + "epoch": 1.3302501707120244, + "grad_norm": 0.1553736908873621, + "learning_rate": 6.817672659859582e-05, + "loss": 2.8921, + "step": 21429 + }, + { + "epoch": 1.3303122478117824, + "grad_norm": 0.17755719670342066, + "learning_rate": 6.817336212487475e-05, + "loss": 2.8995, + "step": 21430 + }, + { + "epoch": 1.33037432491154, + "grad_norm": 0.16935146990931604, + "learning_rate": 6.816999755633886e-05, + "loss": 2.875, + "step": 21431 + }, + { + "epoch": 1.330436402011298, + "grad_norm": 0.1565575077113729, + "learning_rate": 6.816663289300566e-05, + "loss": 2.8848, + "step": 21432 + }, + { + "epoch": 1.330498479111056, + "grad_norm": 0.1849881342517098, + "learning_rate": 6.816326813489275e-05, + "loss": 2.8084, + "step": 21433 + }, + { + "epoch": 1.3305605562108138, + "grad_norm": 0.1764992725202654, + "learning_rate": 6.815990328201766e-05, + "loss": 2.8994, + "step": 21434 + }, + { + "epoch": 1.3306226333105717, + "grad_norm": 0.17435285043016388, + "learning_rate": 6.815653833439795e-05, + "loss": 2.9135, + "step": 21435 + }, + { + "epoch": 1.3306847104103297, + "grad_norm": 0.1509310993176136, + "learning_rate": 6.815317329205116e-05, + "loss": 2.8443, + "step": 21436 + }, + { + "epoch": 1.3307467875100876, + "grad_norm": 0.17489013501469036, + "learning_rate": 6.814980815499486e-05, + "loss": 2.8495, + "step": 21437 + }, + { + "epoch": 1.3308088646098455, + "grad_norm": 0.1565991434741621, + "learning_rate": 6.81464429232466e-05, + "loss": 2.8773, + "step": 21438 + }, + { + "epoch": 1.3308709417096034, + "grad_norm": 0.21344833641134464, + "learning_rate": 6.814307759682395e-05, + "loss": 2.829, + "step": 21439 + }, + { + "epoch": 1.330933018809361, + "grad_norm": 0.178706247236896, + "learning_rate": 6.813971217574446e-05, + "loss": 2.8278, + "step": 21440 + }, + { + "epoch": 1.330995095909119, + "grad_norm": 0.176043955211511, + "learning_rate": 6.813634666002568e-05, + "loss": 2.8774, + "step": 21441 + }, + { + "epoch": 1.331057173008877, + "grad_norm": 0.15832903055150238, + "learning_rate": 6.813298104968518e-05, + "loss": 2.9399, + "step": 21442 + }, + { + "epoch": 1.3311192501086349, + "grad_norm": 0.1675679281551438, + "learning_rate": 6.812961534474052e-05, + "loss": 2.8427, + "step": 21443 + }, + { + "epoch": 1.3311813272083928, + "grad_norm": 0.147966824481766, + "learning_rate": 6.812624954520925e-05, + "loss": 2.8127, + "step": 21444 + }, + { + "epoch": 1.3312434043081507, + "grad_norm": 0.1726421121703618, + "learning_rate": 6.812288365110893e-05, + "loss": 2.7709, + "step": 21445 + }, + { + "epoch": 1.3313054814079086, + "grad_norm": 0.18579623882956967, + "learning_rate": 6.811951766245715e-05, + "loss": 2.8294, + "step": 21446 + }, + { + "epoch": 1.3313675585076665, + "grad_norm": 0.17282145717544062, + "learning_rate": 6.811615157927143e-05, + "loss": 2.758, + "step": 21447 + }, + { + "epoch": 1.3314296356074244, + "grad_norm": 0.14799787017298224, + "learning_rate": 6.811278540156934e-05, + "loss": 2.787, + "step": 21448 + }, + { + "epoch": 1.3314917127071824, + "grad_norm": 0.16278822216306393, + "learning_rate": 6.810941912936848e-05, + "loss": 2.9206, + "step": 21449 + }, + { + "epoch": 1.3315537898069403, + "grad_norm": 0.22603535987622667, + "learning_rate": 6.810605276268635e-05, + "loss": 2.8137, + "step": 21450 + }, + { + "epoch": 1.3316158669066982, + "grad_norm": 0.16951530970507975, + "learning_rate": 6.810268630154057e-05, + "loss": 2.8853, + "step": 21451 + }, + { + "epoch": 1.3316779440064561, + "grad_norm": 0.18917634744037, + "learning_rate": 6.809931974594867e-05, + "loss": 2.8776, + "step": 21452 + }, + { + "epoch": 1.331740021106214, + "grad_norm": 0.2222522701414602, + "learning_rate": 6.809595309592825e-05, + "loss": 2.8384, + "step": 21453 + }, + { + "epoch": 1.331802098205972, + "grad_norm": 0.16595798501585446, + "learning_rate": 6.809258635149684e-05, + "loss": 2.8543, + "step": 21454 + }, + { + "epoch": 1.3318641753057296, + "grad_norm": 0.16653140112321882, + "learning_rate": 6.8089219512672e-05, + "loss": 2.8733, + "step": 21455 + }, + { + "epoch": 1.3319262524054876, + "grad_norm": 0.17524365764901054, + "learning_rate": 6.80858525794713e-05, + "loss": 2.8489, + "step": 21456 + }, + { + "epoch": 1.3319883295052455, + "grad_norm": 0.16643716923368895, + "learning_rate": 6.808248555191235e-05, + "loss": 2.8779, + "step": 21457 + }, + { + "epoch": 1.3320504066050034, + "grad_norm": 0.15603323444057246, + "learning_rate": 6.807911843001267e-05, + "loss": 2.9364, + "step": 21458 + }, + { + "epoch": 1.3321124837047613, + "grad_norm": 0.15133036503982655, + "learning_rate": 6.807575121378982e-05, + "loss": 2.937, + "step": 21459 + }, + { + "epoch": 1.3321745608045192, + "grad_norm": 0.16661511060896217, + "learning_rate": 6.807238390326142e-05, + "loss": 2.8243, + "step": 21460 + }, + { + "epoch": 1.3322366379042772, + "grad_norm": 0.15665709986818585, + "learning_rate": 6.806901649844499e-05, + "loss": 2.8782, + "step": 21461 + }, + { + "epoch": 1.332298715004035, + "grad_norm": 0.18302447753068843, + "learning_rate": 6.806564899935811e-05, + "loss": 2.8227, + "step": 21462 + }, + { + "epoch": 1.332360792103793, + "grad_norm": 0.15018898587239943, + "learning_rate": 6.806228140601836e-05, + "loss": 2.8522, + "step": 21463 + }, + { + "epoch": 1.3324228692035507, + "grad_norm": 0.1572747423264632, + "learning_rate": 6.805891371844329e-05, + "loss": 2.8514, + "step": 21464 + }, + { + "epoch": 1.3324849463033086, + "grad_norm": 0.1626594529125145, + "learning_rate": 6.80555459366505e-05, + "loss": 2.8945, + "step": 21465 + }, + { + "epoch": 1.3325470234030665, + "grad_norm": 0.1496461362470124, + "learning_rate": 6.805217806065753e-05, + "loss": 2.8238, + "step": 21466 + }, + { + "epoch": 1.3326091005028244, + "grad_norm": 0.190356215951059, + "learning_rate": 6.804881009048198e-05, + "loss": 2.8459, + "step": 21467 + }, + { + "epoch": 1.3326711776025824, + "grad_norm": 0.16405547217802657, + "learning_rate": 6.80454420261414e-05, + "loss": 2.8691, + "step": 21468 + }, + { + "epoch": 1.3327332547023403, + "grad_norm": 0.16423600536288818, + "learning_rate": 6.804207386765338e-05, + "loss": 2.8215, + "step": 21469 + }, + { + "epoch": 1.3327953318020982, + "grad_norm": 0.16405098738467672, + "learning_rate": 6.803870561503546e-05, + "loss": 2.9163, + "step": 21470 + }, + { + "epoch": 1.3328574089018561, + "grad_norm": 0.16512701410961023, + "learning_rate": 6.803533726830524e-05, + "loss": 2.801, + "step": 21471 + }, + { + "epoch": 1.332919486001614, + "grad_norm": 0.1689762296666744, + "learning_rate": 6.803196882748029e-05, + "loss": 2.9164, + "step": 21472 + }, + { + "epoch": 1.332981563101372, + "grad_norm": 0.14720629501209276, + "learning_rate": 6.802860029257818e-05, + "loss": 2.8131, + "step": 21473 + }, + { + "epoch": 1.3330436402011299, + "grad_norm": 0.16110948914009693, + "learning_rate": 6.80252316636165e-05, + "loss": 2.7796, + "step": 21474 + }, + { + "epoch": 1.3331057173008878, + "grad_norm": 0.15066528011242433, + "learning_rate": 6.802186294061278e-05, + "loss": 2.8559, + "step": 21475 + }, + { + "epoch": 1.3331677944006457, + "grad_norm": 0.15208122056534282, + "learning_rate": 6.801849412358466e-05, + "loss": 2.9068, + "step": 21476 + }, + { + "epoch": 1.3332298715004036, + "grad_norm": 0.16828654411085242, + "learning_rate": 6.801512521254966e-05, + "loss": 2.9143, + "step": 21477 + }, + { + "epoch": 1.3332919486001615, + "grad_norm": 0.14790929123350882, + "learning_rate": 6.801175620752539e-05, + "loss": 2.8417, + "step": 21478 + }, + { + "epoch": 1.3333540256999192, + "grad_norm": 0.16034604085134418, + "learning_rate": 6.80083871085294e-05, + "loss": 2.8678, + "step": 21479 + }, + { + "epoch": 1.3334161027996771, + "grad_norm": 0.15518144965696898, + "learning_rate": 6.80050179155793e-05, + "loss": 2.9089, + "step": 21480 + }, + { + "epoch": 1.333478179899435, + "grad_norm": 0.17146159435669056, + "learning_rate": 6.800164862869264e-05, + "loss": 2.9148, + "step": 21481 + }, + { + "epoch": 1.333540256999193, + "grad_norm": 0.16533107430574878, + "learning_rate": 6.7998279247887e-05, + "loss": 2.8132, + "step": 21482 + }, + { + "epoch": 1.333602334098951, + "grad_norm": 0.20291114516810838, + "learning_rate": 6.799490977318e-05, + "loss": 2.8271, + "step": 21483 + }, + { + "epoch": 1.3336644111987088, + "grad_norm": 0.16640102866313622, + "learning_rate": 6.799154020458916e-05, + "loss": 2.8089, + "step": 21484 + }, + { + "epoch": 1.3337264882984667, + "grad_norm": 0.17186528416829103, + "learning_rate": 6.798817054213209e-05, + "loss": 2.7905, + "step": 21485 + }, + { + "epoch": 1.3337885653982247, + "grad_norm": 0.14858211825800713, + "learning_rate": 6.798480078582637e-05, + "loss": 2.7741, + "step": 21486 + }, + { + "epoch": 1.3338506424979826, + "grad_norm": 0.16056831198181892, + "learning_rate": 6.798143093568958e-05, + "loss": 2.8381, + "step": 21487 + }, + { + "epoch": 1.3339127195977403, + "grad_norm": 0.17190422004793776, + "learning_rate": 6.797806099173929e-05, + "loss": 2.8514, + "step": 21488 + }, + { + "epoch": 1.3339747966974982, + "grad_norm": 0.15853551984188152, + "learning_rate": 6.797469095399311e-05, + "loss": 2.8283, + "step": 21489 + }, + { + "epoch": 1.334036873797256, + "grad_norm": 0.17014554528646825, + "learning_rate": 6.79713208224686e-05, + "loss": 2.8102, + "step": 21490 + }, + { + "epoch": 1.334098950897014, + "grad_norm": 0.17245435604960124, + "learning_rate": 6.796795059718333e-05, + "loss": 2.9242, + "step": 21491 + }, + { + "epoch": 1.334161027996772, + "grad_norm": 0.16683145227210983, + "learning_rate": 6.796458027815492e-05, + "loss": 2.7174, + "step": 21492 + }, + { + "epoch": 1.3342231050965299, + "grad_norm": 0.1562099942014067, + "learning_rate": 6.796120986540092e-05, + "loss": 2.8807, + "step": 21493 + }, + { + "epoch": 1.3342851821962878, + "grad_norm": 0.18470153987595786, + "learning_rate": 6.795783935893895e-05, + "loss": 2.8453, + "step": 21494 + }, + { + "epoch": 1.3343472592960457, + "grad_norm": 0.16662287447211135, + "learning_rate": 6.795446875878655e-05, + "loss": 2.8398, + "step": 21495 + }, + { + "epoch": 1.3344093363958036, + "grad_norm": 0.20678252448141332, + "learning_rate": 6.795109806496132e-05, + "loss": 2.7971, + "step": 21496 + }, + { + "epoch": 1.3344714134955615, + "grad_norm": 0.16557583619029992, + "learning_rate": 6.794772727748088e-05, + "loss": 2.8182, + "step": 21497 + }, + { + "epoch": 1.3345334905953194, + "grad_norm": 0.1545358991453921, + "learning_rate": 6.794435639636276e-05, + "loss": 2.802, + "step": 21498 + }, + { + "epoch": 1.3345955676950774, + "grad_norm": 0.17045790978745617, + "learning_rate": 6.79409854216246e-05, + "loss": 2.8268, + "step": 21499 + }, + { + "epoch": 1.3346576447948353, + "grad_norm": 0.1709971279197979, + "learning_rate": 6.793761435328395e-05, + "loss": 2.8122, + "step": 21500 + }, + { + "epoch": 1.3347197218945932, + "grad_norm": 0.15160732909010127, + "learning_rate": 6.793424319135843e-05, + "loss": 2.875, + "step": 21501 + }, + { + "epoch": 1.3347817989943511, + "grad_norm": 0.15509836655739814, + "learning_rate": 6.793087193586559e-05, + "loss": 2.8972, + "step": 21502 + }, + { + "epoch": 1.3348438760941088, + "grad_norm": 0.14136837141095157, + "learning_rate": 6.792750058682305e-05, + "loss": 2.8734, + "step": 21503 + }, + { + "epoch": 1.3349059531938667, + "grad_norm": 0.15023402321626023, + "learning_rate": 6.792412914424838e-05, + "loss": 2.7488, + "step": 21504 + }, + { + "epoch": 1.3349680302936247, + "grad_norm": 0.14780421437435218, + "learning_rate": 6.792075760815919e-05, + "loss": 2.7407, + "step": 21505 + }, + { + "epoch": 1.3350301073933826, + "grad_norm": 0.15341919547014882, + "learning_rate": 6.791738597857304e-05, + "loss": 2.7779, + "step": 21506 + }, + { + "epoch": 1.3350921844931405, + "grad_norm": 0.18471705945667113, + "learning_rate": 6.791401425550756e-05, + "loss": 2.8184, + "step": 21507 + }, + { + "epoch": 1.3351542615928984, + "grad_norm": 0.1690747861928235, + "learning_rate": 6.79106424389803e-05, + "loss": 2.8053, + "step": 21508 + }, + { + "epoch": 1.3352163386926563, + "grad_norm": 0.15225411492457297, + "learning_rate": 6.790727052900887e-05, + "loss": 2.8985, + "step": 21509 + }, + { + "epoch": 1.3352784157924142, + "grad_norm": 0.15051265818648796, + "learning_rate": 6.790389852561088e-05, + "loss": 2.6908, + "step": 21510 + }, + { + "epoch": 1.3353404928921722, + "grad_norm": 0.17642818718296666, + "learning_rate": 6.790052642880388e-05, + "loss": 2.7789, + "step": 21511 + }, + { + "epoch": 1.3354025699919299, + "grad_norm": 0.16833844087916616, + "learning_rate": 6.78971542386055e-05, + "loss": 2.7941, + "step": 21512 + }, + { + "epoch": 1.3354646470916878, + "grad_norm": 0.1693961901975504, + "learning_rate": 6.789378195503331e-05, + "loss": 2.7859, + "step": 21513 + }, + { + "epoch": 1.3355267241914457, + "grad_norm": 0.14776134207521682, + "learning_rate": 6.789040957810494e-05, + "loss": 2.7343, + "step": 21514 + }, + { + "epoch": 1.3355888012912036, + "grad_norm": 0.14854707563480413, + "learning_rate": 6.788703710783794e-05, + "loss": 2.7947, + "step": 21515 + }, + { + "epoch": 1.3356508783909615, + "grad_norm": 0.16192267478981673, + "learning_rate": 6.788366454424994e-05, + "loss": 2.8184, + "step": 21516 + }, + { + "epoch": 1.3357129554907194, + "grad_norm": 0.191968161165209, + "learning_rate": 6.788029188735852e-05, + "loss": 2.8401, + "step": 21517 + }, + { + "epoch": 1.3357750325904774, + "grad_norm": 0.18467684098611042, + "learning_rate": 6.787691913718125e-05, + "loss": 2.9558, + "step": 21518 + }, + { + "epoch": 1.3358371096902353, + "grad_norm": 0.1485936510524962, + "learning_rate": 6.787354629373579e-05, + "loss": 2.7526, + "step": 21519 + }, + { + "epoch": 1.3358991867899932, + "grad_norm": 0.18034036867246822, + "learning_rate": 6.787017335703967e-05, + "loss": 2.714, + "step": 21520 + }, + { + "epoch": 1.3359612638897511, + "grad_norm": 0.1530507201918544, + "learning_rate": 6.786680032711055e-05, + "loss": 2.8382, + "step": 21521 + }, + { + "epoch": 1.336023340989509, + "grad_norm": 0.2648528319745559, + "learning_rate": 6.786342720396597e-05, + "loss": 2.8348, + "step": 21522 + }, + { + "epoch": 1.336085418089267, + "grad_norm": 0.18735782875820856, + "learning_rate": 6.786005398762357e-05, + "loss": 2.8448, + "step": 21523 + }, + { + "epoch": 1.3361474951890249, + "grad_norm": 0.1556922016054094, + "learning_rate": 6.785668067810092e-05, + "loss": 2.8651, + "step": 21524 + }, + { + "epoch": 1.3362095722887828, + "grad_norm": 0.167972542475979, + "learning_rate": 6.785330727541563e-05, + "loss": 2.8571, + "step": 21525 + }, + { + "epoch": 1.3362716493885407, + "grad_norm": 0.17781032783844186, + "learning_rate": 6.784993377958532e-05, + "loss": 2.9122, + "step": 21526 + }, + { + "epoch": 1.3363337264882984, + "grad_norm": 0.16715604985414662, + "learning_rate": 6.784656019062756e-05, + "loss": 2.8582, + "step": 21527 + }, + { + "epoch": 1.3363958035880563, + "grad_norm": 0.15780286429042392, + "learning_rate": 6.784318650855995e-05, + "loss": 2.8316, + "step": 21528 + }, + { + "epoch": 1.3364578806878142, + "grad_norm": 0.15738501768065608, + "learning_rate": 6.783981273340011e-05, + "loss": 2.8417, + "step": 21529 + }, + { + "epoch": 1.3365199577875722, + "grad_norm": 0.17122610559797205, + "learning_rate": 6.783643886516564e-05, + "loss": 2.8313, + "step": 21530 + }, + { + "epoch": 1.33658203488733, + "grad_norm": 0.16367320581409509, + "learning_rate": 6.783306490387414e-05, + "loss": 2.8278, + "step": 21531 + }, + { + "epoch": 1.336644111987088, + "grad_norm": 0.176615120000951, + "learning_rate": 6.782969084954321e-05, + "loss": 2.8345, + "step": 21532 + }, + { + "epoch": 1.336706189086846, + "grad_norm": 0.1465203622481545, + "learning_rate": 6.782631670219047e-05, + "loss": 2.8156, + "step": 21533 + }, + { + "epoch": 1.3367682661866038, + "grad_norm": 0.1529394874989236, + "learning_rate": 6.78229424618335e-05, + "loss": 2.8571, + "step": 21534 + }, + { + "epoch": 1.3368303432863617, + "grad_norm": 0.16109100146127273, + "learning_rate": 6.781956812848991e-05, + "loss": 2.8338, + "step": 21535 + }, + { + "epoch": 1.3368924203861194, + "grad_norm": 0.17153257700531377, + "learning_rate": 6.781619370217729e-05, + "loss": 2.8608, + "step": 21536 + }, + { + "epoch": 1.3369544974858774, + "grad_norm": 0.15116065917683197, + "learning_rate": 6.78128191829133e-05, + "loss": 2.8904, + "step": 21537 + }, + { + "epoch": 1.3370165745856353, + "grad_norm": 0.19627246501839452, + "learning_rate": 6.780944457071547e-05, + "loss": 2.8014, + "step": 21538 + }, + { + "epoch": 1.3370786516853932, + "grad_norm": 0.1605864886675975, + "learning_rate": 6.780606986560146e-05, + "loss": 2.805, + "step": 21539 + }, + { + "epoch": 1.3371407287851511, + "grad_norm": 0.1558336313158984, + "learning_rate": 6.780269506758887e-05, + "loss": 2.8435, + "step": 21540 + }, + { + "epoch": 1.337202805884909, + "grad_norm": 0.14388242276819296, + "learning_rate": 6.779932017669528e-05, + "loss": 2.866, + "step": 21541 + }, + { + "epoch": 1.337264882984667, + "grad_norm": 0.1520949251240436, + "learning_rate": 6.779594519293833e-05, + "loss": 2.8446, + "step": 21542 + }, + { + "epoch": 1.3373269600844249, + "grad_norm": 0.1548174995782427, + "learning_rate": 6.77925701163356e-05, + "loss": 2.8406, + "step": 21543 + }, + { + "epoch": 1.3373890371841828, + "grad_norm": 0.14440178872394774, + "learning_rate": 6.778919494690473e-05, + "loss": 2.7653, + "step": 21544 + }, + { + "epoch": 1.3374511142839407, + "grad_norm": 0.14999561949135154, + "learning_rate": 6.778581968466329e-05, + "loss": 2.881, + "step": 21545 + }, + { + "epoch": 1.3375131913836986, + "grad_norm": 0.15085515819920509, + "learning_rate": 6.778244432962893e-05, + "loss": 2.8373, + "step": 21546 + }, + { + "epoch": 1.3375752684834565, + "grad_norm": 0.18838224867826261, + "learning_rate": 6.777906888181922e-05, + "loss": 2.8296, + "step": 21547 + }, + { + "epoch": 1.3376373455832145, + "grad_norm": 0.16881012331176826, + "learning_rate": 6.777569334125182e-05, + "loss": 2.8099, + "step": 21548 + }, + { + "epoch": 1.3376994226829724, + "grad_norm": 0.15547292119090325, + "learning_rate": 6.77723177079443e-05, + "loss": 2.845, + "step": 21549 + }, + { + "epoch": 1.33776149978273, + "grad_norm": 0.16324152349121115, + "learning_rate": 6.776894198191428e-05, + "loss": 2.8293, + "step": 21550 + }, + { + "epoch": 1.337823576882488, + "grad_norm": 0.16465265985869285, + "learning_rate": 6.776556616317938e-05, + "loss": 2.829, + "step": 21551 + }, + { + "epoch": 1.337885653982246, + "grad_norm": 0.17761721260132707, + "learning_rate": 6.776219025175721e-05, + "loss": 2.8302, + "step": 21552 + }, + { + "epoch": 1.3379477310820038, + "grad_norm": 0.18039849841950403, + "learning_rate": 6.775881424766538e-05, + "loss": 2.8386, + "step": 21553 + }, + { + "epoch": 1.3380098081817617, + "grad_norm": 0.16669811880847177, + "learning_rate": 6.77554381509215e-05, + "loss": 2.8986, + "step": 21554 + }, + { + "epoch": 1.3380718852815197, + "grad_norm": 0.15962574485076286, + "learning_rate": 6.775206196154321e-05, + "loss": 2.863, + "step": 21555 + }, + { + "epoch": 1.3381339623812776, + "grad_norm": 0.15599684472820896, + "learning_rate": 6.774868567954807e-05, + "loss": 2.8723, + "step": 21556 + }, + { + "epoch": 1.3381960394810355, + "grad_norm": 0.17541897848294274, + "learning_rate": 6.774530930495376e-05, + "loss": 2.8698, + "step": 21557 + }, + { + "epoch": 1.3382581165807934, + "grad_norm": 0.14912506278101453, + "learning_rate": 6.774193283777784e-05, + "loss": 2.8129, + "step": 21558 + }, + { + "epoch": 1.338320193680551, + "grad_norm": 0.15981082766459406, + "learning_rate": 6.773855627803795e-05, + "loss": 2.9127, + "step": 21559 + }, + { + "epoch": 1.338382270780309, + "grad_norm": 0.1563150270571208, + "learning_rate": 6.77351796257517e-05, + "loss": 2.9408, + "step": 21560 + }, + { + "epoch": 1.338444347880067, + "grad_norm": 0.1460563063448911, + "learning_rate": 6.773180288093673e-05, + "loss": 2.8535, + "step": 21561 + }, + { + "epoch": 1.3385064249798249, + "grad_norm": 0.17140429697341128, + "learning_rate": 6.772842604361065e-05, + "loss": 2.8893, + "step": 21562 + }, + { + "epoch": 1.3385685020795828, + "grad_norm": 0.14601224471864216, + "learning_rate": 6.772504911379104e-05, + "loss": 2.8926, + "step": 21563 + }, + { + "epoch": 1.3386305791793407, + "grad_norm": 0.26264520613981085, + "learning_rate": 6.772167209149557e-05, + "loss": 2.8895, + "step": 21564 + }, + { + "epoch": 1.3386926562790986, + "grad_norm": 0.14421348823586694, + "learning_rate": 6.771829497674181e-05, + "loss": 2.7706, + "step": 21565 + }, + { + "epoch": 1.3387547333788565, + "grad_norm": 0.15437785570692547, + "learning_rate": 6.771491776954742e-05, + "loss": 2.8704, + "step": 21566 + }, + { + "epoch": 1.3388168104786144, + "grad_norm": 0.15279291645699158, + "learning_rate": 6.771154046993e-05, + "loss": 2.8114, + "step": 21567 + }, + { + "epoch": 1.3388788875783724, + "grad_norm": 0.15998136462152995, + "learning_rate": 6.770816307790719e-05, + "loss": 2.8722, + "step": 21568 + }, + { + "epoch": 1.3389409646781303, + "grad_norm": 0.1549650104026882, + "learning_rate": 6.770478559349656e-05, + "loss": 2.8849, + "step": 21569 + }, + { + "epoch": 1.3390030417778882, + "grad_norm": 0.1583336480089457, + "learning_rate": 6.77014080167158e-05, + "loss": 2.9531, + "step": 21570 + }, + { + "epoch": 1.3390651188776461, + "grad_norm": 0.1523308076007003, + "learning_rate": 6.769803034758248e-05, + "loss": 2.8829, + "step": 21571 + }, + { + "epoch": 1.339127195977404, + "grad_norm": 0.15657419308570675, + "learning_rate": 6.769465258611424e-05, + "loss": 2.8435, + "step": 21572 + }, + { + "epoch": 1.339189273077162, + "grad_norm": 0.16282871085093475, + "learning_rate": 6.769127473232871e-05, + "loss": 2.8891, + "step": 21573 + }, + { + "epoch": 1.3392513501769197, + "grad_norm": 0.14527338460299172, + "learning_rate": 6.76878967862435e-05, + "loss": 2.8479, + "step": 21574 + }, + { + "epoch": 1.3393134272766776, + "grad_norm": 0.15069915038504056, + "learning_rate": 6.768451874787624e-05, + "loss": 2.8361, + "step": 21575 + }, + { + "epoch": 1.3393755043764355, + "grad_norm": 0.14779281532518587, + "learning_rate": 6.768114061724454e-05, + "loss": 2.8355, + "step": 21576 + }, + { + "epoch": 1.3394375814761934, + "grad_norm": 0.14674713727491828, + "learning_rate": 6.767776239436605e-05, + "loss": 2.8161, + "step": 21577 + }, + { + "epoch": 1.3394996585759513, + "grad_norm": 0.1668077141502956, + "learning_rate": 6.767438407925836e-05, + "loss": 2.845, + "step": 21578 + }, + { + "epoch": 1.3395617356757092, + "grad_norm": 0.15540483301415356, + "learning_rate": 6.767100567193915e-05, + "loss": 2.8854, + "step": 21579 + }, + { + "epoch": 1.3396238127754672, + "grad_norm": 0.14352987684859203, + "learning_rate": 6.7667627172426e-05, + "loss": 2.7851, + "step": 21580 + }, + { + "epoch": 1.339685889875225, + "grad_norm": 0.1435753035501124, + "learning_rate": 6.766424858073654e-05, + "loss": 2.841, + "step": 21581 + }, + { + "epoch": 1.339747966974983, + "grad_norm": 0.1524522590561788, + "learning_rate": 6.766086989688842e-05, + "loss": 2.7919, + "step": 21582 + }, + { + "epoch": 1.3398100440747407, + "grad_norm": 0.16659144787869515, + "learning_rate": 6.765749112089926e-05, + "loss": 2.8731, + "step": 21583 + }, + { + "epoch": 1.3398721211744986, + "grad_norm": 0.1583855248584355, + "learning_rate": 6.765411225278667e-05, + "loss": 2.8989, + "step": 21584 + }, + { + "epoch": 1.3399341982742565, + "grad_norm": 0.15299204898296748, + "learning_rate": 6.765073329256829e-05, + "loss": 2.8513, + "step": 21585 + }, + { + "epoch": 1.3399962753740144, + "grad_norm": 0.1496691676443639, + "learning_rate": 6.764735424026176e-05, + "loss": 2.8323, + "step": 21586 + }, + { + "epoch": 1.3400583524737724, + "grad_norm": 0.21004315917674074, + "learning_rate": 6.76439750958847e-05, + "loss": 2.8796, + "step": 21587 + }, + { + "epoch": 1.3401204295735303, + "grad_norm": 0.1546203506066765, + "learning_rate": 6.764059585945472e-05, + "loss": 2.8013, + "step": 21588 + }, + { + "epoch": 1.3401825066732882, + "grad_norm": 0.159313724696816, + "learning_rate": 6.763721653098949e-05, + "loss": 2.8679, + "step": 21589 + }, + { + "epoch": 1.3402445837730461, + "grad_norm": 0.16411277591029252, + "learning_rate": 6.76338371105066e-05, + "loss": 2.9015, + "step": 21590 + }, + { + "epoch": 1.340306660872804, + "grad_norm": 0.14557262231823181, + "learning_rate": 6.763045759802372e-05, + "loss": 2.7873, + "step": 21591 + }, + { + "epoch": 1.340368737972562, + "grad_norm": 0.15665879417695736, + "learning_rate": 6.762707799355845e-05, + "loss": 2.8124, + "step": 21592 + }, + { + "epoch": 1.3404308150723199, + "grad_norm": 0.1533290350309815, + "learning_rate": 6.762369829712844e-05, + "loss": 2.8699, + "step": 21593 + }, + { + "epoch": 1.3404928921720778, + "grad_norm": 0.17881812336401806, + "learning_rate": 6.762031850875132e-05, + "loss": 2.8502, + "step": 21594 + }, + { + "epoch": 1.3405549692718357, + "grad_norm": 0.16182135653247626, + "learning_rate": 6.761693862844471e-05, + "loss": 2.8055, + "step": 21595 + }, + { + "epoch": 1.3406170463715936, + "grad_norm": 0.16251561956347865, + "learning_rate": 6.761355865622628e-05, + "loss": 2.9519, + "step": 21596 + }, + { + "epoch": 1.3406791234713515, + "grad_norm": 0.1563641667509037, + "learning_rate": 6.76101785921136e-05, + "loss": 2.7944, + "step": 21597 + }, + { + "epoch": 1.3407412005711092, + "grad_norm": 0.14861847486280805, + "learning_rate": 6.760679843612438e-05, + "loss": 2.8916, + "step": 21598 + }, + { + "epoch": 1.3408032776708672, + "grad_norm": 0.1541985669403119, + "learning_rate": 6.760341818827619e-05, + "loss": 2.8862, + "step": 21599 + }, + { + "epoch": 1.340865354770625, + "grad_norm": 0.1759094618414117, + "learning_rate": 6.760003784858671e-05, + "loss": 2.8777, + "step": 21600 + }, + { + "epoch": 1.340927431870383, + "grad_norm": 0.16581744538649085, + "learning_rate": 6.759665741707355e-05, + "loss": 2.8672, + "step": 21601 + }, + { + "epoch": 1.340989508970141, + "grad_norm": 0.17189142300431198, + "learning_rate": 6.759327689375437e-05, + "loss": 2.7652, + "step": 21602 + }, + { + "epoch": 1.3410515860698988, + "grad_norm": 0.199054491355084, + "learning_rate": 6.758989627864678e-05, + "loss": 2.8153, + "step": 21603 + }, + { + "epoch": 1.3411136631696567, + "grad_norm": 0.17599339928563112, + "learning_rate": 6.758651557176845e-05, + "loss": 2.8202, + "step": 21604 + }, + { + "epoch": 1.3411757402694147, + "grad_norm": 0.15098505082640917, + "learning_rate": 6.758313477313698e-05, + "loss": 2.8975, + "step": 21605 + }, + { + "epoch": 1.3412378173691726, + "grad_norm": 0.18651742575978453, + "learning_rate": 6.757975388277004e-05, + "loss": 2.8622, + "step": 21606 + }, + { + "epoch": 1.3412998944689303, + "grad_norm": 0.18321295063210372, + "learning_rate": 6.757637290068526e-05, + "loss": 2.8862, + "step": 21607 + }, + { + "epoch": 1.3413619715686882, + "grad_norm": 0.15987160439050524, + "learning_rate": 6.757299182690026e-05, + "loss": 2.7517, + "step": 21608 + }, + { + "epoch": 1.3414240486684461, + "grad_norm": 0.1631188901856867, + "learning_rate": 6.75696106614327e-05, + "loss": 2.8233, + "step": 21609 + }, + { + "epoch": 1.341486125768204, + "grad_norm": 0.16104856658488323, + "learning_rate": 6.75662294043002e-05, + "loss": 2.8817, + "step": 21610 + }, + { + "epoch": 1.341548202867962, + "grad_norm": 0.14560674338035504, + "learning_rate": 6.756284805552045e-05, + "loss": 2.8421, + "step": 21611 + }, + { + "epoch": 1.3416102799677199, + "grad_norm": 0.1668988519134969, + "learning_rate": 6.755946661511103e-05, + "loss": 2.8978, + "step": 21612 + }, + { + "epoch": 1.3416723570674778, + "grad_norm": 0.1557382189204616, + "learning_rate": 6.755608508308962e-05, + "loss": 2.7797, + "step": 21613 + }, + { + "epoch": 1.3417344341672357, + "grad_norm": 0.16259837794788104, + "learning_rate": 6.755270345947387e-05, + "loss": 2.7775, + "step": 21614 + }, + { + "epoch": 1.3417965112669936, + "grad_norm": 0.15411216810546366, + "learning_rate": 6.754932174428138e-05, + "loss": 2.877, + "step": 21615 + }, + { + "epoch": 1.3418585883667515, + "grad_norm": 0.1575817396820761, + "learning_rate": 6.754593993752983e-05, + "loss": 2.9023, + "step": 21616 + }, + { + "epoch": 1.3419206654665095, + "grad_norm": 0.15014944926490223, + "learning_rate": 6.754255803923685e-05, + "loss": 2.7921, + "step": 21617 + }, + { + "epoch": 1.3419827425662674, + "grad_norm": 0.15149439112152535, + "learning_rate": 6.75391760494201e-05, + "loss": 2.8362, + "step": 21618 + }, + { + "epoch": 1.3420448196660253, + "grad_norm": 0.1643790585684044, + "learning_rate": 6.753579396809718e-05, + "loss": 2.7931, + "step": 21619 + }, + { + "epoch": 1.3421068967657832, + "grad_norm": 0.1657705798566858, + "learning_rate": 6.75324117952858e-05, + "loss": 2.8349, + "step": 21620 + }, + { + "epoch": 1.3421689738655411, + "grad_norm": 0.16471910781401733, + "learning_rate": 6.752902953100353e-05, + "loss": 2.7912, + "step": 21621 + }, + { + "epoch": 1.3422310509652988, + "grad_norm": 0.15919224274156046, + "learning_rate": 6.75256471752681e-05, + "loss": 2.7646, + "step": 21622 + }, + { + "epoch": 1.3422931280650567, + "grad_norm": 0.17555696165597973, + "learning_rate": 6.752226472809709e-05, + "loss": 2.8467, + "step": 21623 + }, + { + "epoch": 1.3423552051648147, + "grad_norm": 0.16249471848414668, + "learning_rate": 6.751888218950818e-05, + "loss": 2.8819, + "step": 21624 + }, + { + "epoch": 1.3424172822645726, + "grad_norm": 0.14860204937217714, + "learning_rate": 6.751549955951902e-05, + "loss": 2.8054, + "step": 21625 + }, + { + "epoch": 1.3424793593643305, + "grad_norm": 0.14990070975838946, + "learning_rate": 6.751211683814722e-05, + "loss": 2.8882, + "step": 21626 + }, + { + "epoch": 1.3425414364640884, + "grad_norm": 0.1530792017002599, + "learning_rate": 6.750873402541047e-05, + "loss": 2.8633, + "step": 21627 + }, + { + "epoch": 1.3426035135638463, + "grad_norm": 0.15987902000471851, + "learning_rate": 6.750535112132639e-05, + "loss": 2.8994, + "step": 21628 + }, + { + "epoch": 1.3426655906636042, + "grad_norm": 0.15889624067119848, + "learning_rate": 6.750196812591266e-05, + "loss": 2.7949, + "step": 21629 + }, + { + "epoch": 1.3427276677633622, + "grad_norm": 0.16853620907486383, + "learning_rate": 6.74985850391869e-05, + "loss": 2.8646, + "step": 21630 + }, + { + "epoch": 1.3427897448631199, + "grad_norm": 0.16323776373308446, + "learning_rate": 6.74952018611668e-05, + "loss": 2.9115, + "step": 21631 + }, + { + "epoch": 1.3428518219628778, + "grad_norm": 0.15062452680268096, + "learning_rate": 6.749181859186997e-05, + "loss": 2.7618, + "step": 21632 + }, + { + "epoch": 1.3429138990626357, + "grad_norm": 0.15927090985595915, + "learning_rate": 6.748843523131406e-05, + "loss": 2.8396, + "step": 21633 + }, + { + "epoch": 1.3429759761623936, + "grad_norm": 0.1490065433830361, + "learning_rate": 6.748505177951676e-05, + "loss": 2.7845, + "step": 21634 + }, + { + "epoch": 1.3430380532621515, + "grad_norm": 0.14895478263259956, + "learning_rate": 6.748166823649569e-05, + "loss": 2.8484, + "step": 21635 + }, + { + "epoch": 1.3431001303619095, + "grad_norm": 0.15998601588776798, + "learning_rate": 6.747828460226853e-05, + "loss": 2.8988, + "step": 21636 + }, + { + "epoch": 1.3431622074616674, + "grad_norm": 0.15624052257405158, + "learning_rate": 6.747490087685289e-05, + "loss": 2.8169, + "step": 21637 + }, + { + "epoch": 1.3432242845614253, + "grad_norm": 0.15064161584941935, + "learning_rate": 6.747151706026646e-05, + "loss": 2.8661, + "step": 21638 + }, + { + "epoch": 1.3432863616611832, + "grad_norm": 0.15507960667130558, + "learning_rate": 6.746813315252689e-05, + "loss": 2.837, + "step": 21639 + }, + { + "epoch": 1.3433484387609411, + "grad_norm": 0.15637624528062777, + "learning_rate": 6.746474915365182e-05, + "loss": 2.7926, + "step": 21640 + }, + { + "epoch": 1.343410515860699, + "grad_norm": 0.15767997616097568, + "learning_rate": 6.746136506365892e-05, + "loss": 2.8003, + "step": 21641 + }, + { + "epoch": 1.343472592960457, + "grad_norm": 0.1707193689888478, + "learning_rate": 6.745798088256585e-05, + "loss": 2.8897, + "step": 21642 + }, + { + "epoch": 1.3435346700602149, + "grad_norm": 0.1454072130877641, + "learning_rate": 6.745459661039024e-05, + "loss": 2.8663, + "step": 21643 + }, + { + "epoch": 1.3435967471599728, + "grad_norm": 0.15622893429849227, + "learning_rate": 6.745121224714975e-05, + "loss": 2.8368, + "step": 21644 + }, + { + "epoch": 1.3436588242597307, + "grad_norm": 0.15651163607443383, + "learning_rate": 6.744782779286207e-05, + "loss": 2.8669, + "step": 21645 + }, + { + "epoch": 1.3437209013594884, + "grad_norm": 0.1469330718510253, + "learning_rate": 6.744444324754483e-05, + "loss": 2.8215, + "step": 21646 + }, + { + "epoch": 1.3437829784592463, + "grad_norm": 0.1728918168776257, + "learning_rate": 6.74410586112157e-05, + "loss": 2.9717, + "step": 21647 + }, + { + "epoch": 1.3438450555590042, + "grad_norm": 0.2035470850345989, + "learning_rate": 6.743767388389233e-05, + "loss": 2.8138, + "step": 21648 + }, + { + "epoch": 1.3439071326587622, + "grad_norm": 0.19360621908640613, + "learning_rate": 6.743428906559238e-05, + "loss": 2.9408, + "step": 21649 + }, + { + "epoch": 1.34396920975852, + "grad_norm": 0.14952470385414177, + "learning_rate": 6.743090415633351e-05, + "loss": 2.8259, + "step": 21650 + }, + { + "epoch": 1.344031286858278, + "grad_norm": 0.1801729486341001, + "learning_rate": 6.742751915613338e-05, + "loss": 2.7837, + "step": 21651 + }, + { + "epoch": 1.344093363958036, + "grad_norm": 0.16708145209709646, + "learning_rate": 6.742413406500967e-05, + "loss": 2.8395, + "step": 21652 + }, + { + "epoch": 1.3441554410577938, + "grad_norm": 0.18299015737600494, + "learning_rate": 6.742074888298002e-05, + "loss": 2.9369, + "step": 21653 + }, + { + "epoch": 1.3442175181575517, + "grad_norm": 0.16085369326101828, + "learning_rate": 6.741736361006209e-05, + "loss": 2.8058, + "step": 21654 + }, + { + "epoch": 1.3442795952573094, + "grad_norm": 0.17038305772141643, + "learning_rate": 6.741397824627352e-05, + "loss": 2.76, + "step": 21655 + }, + { + "epoch": 1.3443416723570674, + "grad_norm": 0.16725543950163424, + "learning_rate": 6.741059279163202e-05, + "loss": 2.8012, + "step": 21656 + }, + { + "epoch": 1.3444037494568253, + "grad_norm": 0.14724151125526067, + "learning_rate": 6.740720724615524e-05, + "loss": 2.8011, + "step": 21657 + }, + { + "epoch": 1.3444658265565832, + "grad_norm": 0.1777299209707352, + "learning_rate": 6.740382160986081e-05, + "loss": 2.8497, + "step": 21658 + }, + { + "epoch": 1.3445279036563411, + "grad_norm": 0.15496238405876134, + "learning_rate": 6.740043588276643e-05, + "loss": 2.9607, + "step": 21659 + }, + { + "epoch": 1.344589980756099, + "grad_norm": 0.21594646271020848, + "learning_rate": 6.739705006488975e-05, + "loss": 2.891, + "step": 21660 + }, + { + "epoch": 1.344652057855857, + "grad_norm": 0.16005871873423624, + "learning_rate": 6.739366415624845e-05, + "loss": 2.8258, + "step": 21661 + }, + { + "epoch": 1.3447141349556149, + "grad_norm": 0.15799759830614185, + "learning_rate": 6.739027815686017e-05, + "loss": 2.9321, + "step": 21662 + }, + { + "epoch": 1.3447762120553728, + "grad_norm": 0.16727558427049283, + "learning_rate": 6.738689206674259e-05, + "loss": 2.8672, + "step": 21663 + }, + { + "epoch": 1.3448382891551307, + "grad_norm": 0.1537064531052651, + "learning_rate": 6.738350588591336e-05, + "loss": 2.8003, + "step": 21664 + }, + { + "epoch": 1.3449003662548886, + "grad_norm": 0.17883966322873235, + "learning_rate": 6.738011961439018e-05, + "loss": 2.7551, + "step": 21665 + }, + { + "epoch": 1.3449624433546465, + "grad_norm": 0.15597610309245616, + "learning_rate": 6.737673325219068e-05, + "loss": 2.8161, + "step": 21666 + }, + { + "epoch": 1.3450245204544045, + "grad_norm": 0.14731535767456758, + "learning_rate": 6.737334679933256e-05, + "loss": 2.9149, + "step": 21667 + }, + { + "epoch": 1.3450865975541624, + "grad_norm": 0.151550612524205, + "learning_rate": 6.736996025583347e-05, + "loss": 2.8428, + "step": 21668 + }, + { + "epoch": 1.3451486746539203, + "grad_norm": 0.15259305411185403, + "learning_rate": 6.736657362171106e-05, + "loss": 2.8177, + "step": 21669 + }, + { + "epoch": 1.345210751753678, + "grad_norm": 0.20198668121817834, + "learning_rate": 6.736318689698305e-05, + "loss": 2.806, + "step": 21670 + }, + { + "epoch": 1.345272828853436, + "grad_norm": 0.17687569159787037, + "learning_rate": 6.735980008166706e-05, + "loss": 2.8599, + "step": 21671 + }, + { + "epoch": 1.3453349059531938, + "grad_norm": 0.1679494480753004, + "learning_rate": 6.735641317578079e-05, + "loss": 2.8625, + "step": 21672 + }, + { + "epoch": 1.3453969830529517, + "grad_norm": 0.20064200831300988, + "learning_rate": 6.735302617934187e-05, + "loss": 2.9045, + "step": 21673 + }, + { + "epoch": 1.3454590601527097, + "grad_norm": 0.15904765709046598, + "learning_rate": 6.734963909236804e-05, + "loss": 2.8332, + "step": 21674 + }, + { + "epoch": 1.3455211372524676, + "grad_norm": 0.17519627297604884, + "learning_rate": 6.73462519148769e-05, + "loss": 2.8715, + "step": 21675 + }, + { + "epoch": 1.3455832143522255, + "grad_norm": 0.18001318524004215, + "learning_rate": 6.734286464688616e-05, + "loss": 2.774, + "step": 21676 + }, + { + "epoch": 1.3456452914519834, + "grad_norm": 0.15580613151852446, + "learning_rate": 6.73394772884135e-05, + "loss": 2.9408, + "step": 21677 + }, + { + "epoch": 1.3457073685517413, + "grad_norm": 0.1775053246129299, + "learning_rate": 6.733608983947656e-05, + "loss": 2.8677, + "step": 21678 + }, + { + "epoch": 1.345769445651499, + "grad_norm": 0.17100751393175478, + "learning_rate": 6.733270230009304e-05, + "loss": 2.8585, + "step": 21679 + }, + { + "epoch": 1.345831522751257, + "grad_norm": 0.15359792662924873, + "learning_rate": 6.73293146702806e-05, + "loss": 2.8206, + "step": 21680 + }, + { + "epoch": 1.3458935998510149, + "grad_norm": 0.21631522416055776, + "learning_rate": 6.732592695005692e-05, + "loss": 2.8041, + "step": 21681 + }, + { + "epoch": 1.3459556769507728, + "grad_norm": 0.1530212839335054, + "learning_rate": 6.732253913943968e-05, + "loss": 2.8138, + "step": 21682 + }, + { + "epoch": 1.3460177540505307, + "grad_norm": 0.15815993885177124, + "learning_rate": 6.731915123844654e-05, + "loss": 2.8132, + "step": 21683 + }, + { + "epoch": 1.3460798311502886, + "grad_norm": 0.16068092328207198, + "learning_rate": 6.73157632470952e-05, + "loss": 2.8706, + "step": 21684 + }, + { + "epoch": 1.3461419082500465, + "grad_norm": 0.14869952290052146, + "learning_rate": 6.73123751654033e-05, + "loss": 2.7819, + "step": 21685 + }, + { + "epoch": 1.3462039853498045, + "grad_norm": 0.15205492247991725, + "learning_rate": 6.730898699338855e-05, + "loss": 2.8403, + "step": 21686 + }, + { + "epoch": 1.3462660624495624, + "grad_norm": 0.15557523104918541, + "learning_rate": 6.730559873106858e-05, + "loss": 2.8166, + "step": 21687 + }, + { + "epoch": 1.3463281395493203, + "grad_norm": 0.1537100945804658, + "learning_rate": 6.730221037846114e-05, + "loss": 2.7749, + "step": 21688 + }, + { + "epoch": 1.3463902166490782, + "grad_norm": 0.1697250849019691, + "learning_rate": 6.729882193558385e-05, + "loss": 2.856, + "step": 21689 + }, + { + "epoch": 1.3464522937488361, + "grad_norm": 0.16111755876878017, + "learning_rate": 6.729543340245441e-05, + "loss": 2.8465, + "step": 21690 + }, + { + "epoch": 1.346514370848594, + "grad_norm": 0.14855975838291172, + "learning_rate": 6.72920447790905e-05, + "loss": 2.8892, + "step": 21691 + }, + { + "epoch": 1.346576447948352, + "grad_norm": 0.16291885611602772, + "learning_rate": 6.728865606550978e-05, + "loss": 2.8487, + "step": 21692 + }, + { + "epoch": 1.3466385250481099, + "grad_norm": 0.14880497519603034, + "learning_rate": 6.728526726172996e-05, + "loss": 2.8923, + "step": 21693 + }, + { + "epoch": 1.3467006021478676, + "grad_norm": 0.1488951604662563, + "learning_rate": 6.728187836776869e-05, + "loss": 2.8639, + "step": 21694 + }, + { + "epoch": 1.3467626792476255, + "grad_norm": 0.15993667297402875, + "learning_rate": 6.727848938364368e-05, + "loss": 2.8225, + "step": 21695 + }, + { + "epoch": 1.3468247563473834, + "grad_norm": 0.17389116858477008, + "learning_rate": 6.727510030937258e-05, + "loss": 2.758, + "step": 21696 + }, + { + "epoch": 1.3468868334471413, + "grad_norm": 0.1599598121836371, + "learning_rate": 6.727171114497311e-05, + "loss": 2.8378, + "step": 21697 + }, + { + "epoch": 1.3469489105468992, + "grad_norm": 0.18474597745795543, + "learning_rate": 6.726832189046291e-05, + "loss": 2.7711, + "step": 21698 + }, + { + "epoch": 1.3470109876466572, + "grad_norm": 0.163877223620252, + "learning_rate": 6.72649325458597e-05, + "loss": 2.9443, + "step": 21699 + }, + { + "epoch": 1.347073064746415, + "grad_norm": 0.1592434421948958, + "learning_rate": 6.726154311118114e-05, + "loss": 2.8572, + "step": 21700 + }, + { + "epoch": 1.347135141846173, + "grad_norm": 0.17642853558097615, + "learning_rate": 6.725815358644491e-05, + "loss": 2.8249, + "step": 21701 + }, + { + "epoch": 1.347197218945931, + "grad_norm": 0.1739809618059675, + "learning_rate": 6.72547639716687e-05, + "loss": 2.8438, + "step": 21702 + }, + { + "epoch": 1.3472592960456886, + "grad_norm": 0.20929179353088032, + "learning_rate": 6.725137426687022e-05, + "loss": 2.8208, + "step": 21703 + }, + { + "epoch": 1.3473213731454465, + "grad_norm": 0.16466861546909412, + "learning_rate": 6.724798447206713e-05, + "loss": 2.7901, + "step": 21704 + }, + { + "epoch": 1.3473834502452045, + "grad_norm": 0.15597441330978384, + "learning_rate": 6.72445945872771e-05, + "loss": 2.8932, + "step": 21705 + }, + { + "epoch": 1.3474455273449624, + "grad_norm": 0.14365049655330903, + "learning_rate": 6.724120461251784e-05, + "loss": 2.7613, + "step": 21706 + }, + { + "epoch": 1.3475076044447203, + "grad_norm": 0.1379035227743839, + "learning_rate": 6.723781454780702e-05, + "loss": 2.8154, + "step": 21707 + }, + { + "epoch": 1.3475696815444782, + "grad_norm": 0.14602996280216332, + "learning_rate": 6.723442439316236e-05, + "loss": 2.8504, + "step": 21708 + }, + { + "epoch": 1.3476317586442361, + "grad_norm": 0.16623947535092085, + "learning_rate": 6.72310341486015e-05, + "loss": 2.8986, + "step": 21709 + }, + { + "epoch": 1.347693835743994, + "grad_norm": 0.14499063557155453, + "learning_rate": 6.722764381414217e-05, + "loss": 2.8191, + "step": 21710 + }, + { + "epoch": 1.347755912843752, + "grad_norm": 0.1825959810397799, + "learning_rate": 6.722425338980204e-05, + "loss": 2.8698, + "step": 21711 + }, + { + "epoch": 1.3478179899435099, + "grad_norm": 0.14584684238420803, + "learning_rate": 6.722086287559878e-05, + "loss": 2.7992, + "step": 21712 + }, + { + "epoch": 1.3478800670432678, + "grad_norm": 0.15957183743778927, + "learning_rate": 6.721747227155012e-05, + "loss": 2.7505, + "step": 21713 + }, + { + "epoch": 1.3479421441430257, + "grad_norm": 0.15879567755401852, + "learning_rate": 6.72140815776737e-05, + "loss": 2.7208, + "step": 21714 + }, + { + "epoch": 1.3480042212427836, + "grad_norm": 0.1720043963245416, + "learning_rate": 6.721069079398727e-05, + "loss": 2.8211, + "step": 21715 + }, + { + "epoch": 1.3480662983425415, + "grad_norm": 0.17734491205044797, + "learning_rate": 6.720729992050847e-05, + "loss": 2.8829, + "step": 21716 + }, + { + "epoch": 1.3481283754422995, + "grad_norm": 0.17843943958931904, + "learning_rate": 6.720390895725502e-05, + "loss": 2.8757, + "step": 21717 + }, + { + "epoch": 1.3481904525420572, + "grad_norm": 0.1713445399622889, + "learning_rate": 6.720051790424459e-05, + "loss": 2.87, + "step": 21718 + }, + { + "epoch": 1.348252529641815, + "grad_norm": 0.18731967877966, + "learning_rate": 6.719712676149488e-05, + "loss": 2.7647, + "step": 21719 + }, + { + "epoch": 1.348314606741573, + "grad_norm": 0.21807588009171827, + "learning_rate": 6.71937355290236e-05, + "loss": 2.8629, + "step": 21720 + }, + { + "epoch": 1.348376683841331, + "grad_norm": 0.21904191056436834, + "learning_rate": 6.719034420684842e-05, + "loss": 2.8474, + "step": 21721 + }, + { + "epoch": 1.3484387609410888, + "grad_norm": 0.17595450022737338, + "learning_rate": 6.718695279498705e-05, + "loss": 2.8351, + "step": 21722 + }, + { + "epoch": 1.3485008380408467, + "grad_norm": 0.16507531507152337, + "learning_rate": 6.718356129345716e-05, + "loss": 2.8086, + "step": 21723 + }, + { + "epoch": 1.3485629151406047, + "grad_norm": 0.16033200180679852, + "learning_rate": 6.718016970227646e-05, + "loss": 2.8635, + "step": 21724 + }, + { + "epoch": 1.3486249922403626, + "grad_norm": 0.17179105073979717, + "learning_rate": 6.717677802146263e-05, + "loss": 2.7864, + "step": 21725 + }, + { + "epoch": 1.3486870693401205, + "grad_norm": 0.15382293755390017, + "learning_rate": 6.717338625103341e-05, + "loss": 2.85, + "step": 21726 + }, + { + "epoch": 1.3487491464398782, + "grad_norm": 0.15367882113134707, + "learning_rate": 6.716999439100645e-05, + "loss": 2.8618, + "step": 21727 + }, + { + "epoch": 1.3488112235396361, + "grad_norm": 0.1708888950408572, + "learning_rate": 6.716660244139946e-05, + "loss": 2.8472, + "step": 21728 + }, + { + "epoch": 1.348873300639394, + "grad_norm": 0.15006963112071875, + "learning_rate": 6.716321040223015e-05, + "loss": 2.897, + "step": 21729 + }, + { + "epoch": 1.348935377739152, + "grad_norm": 0.19314148696720648, + "learning_rate": 6.715981827351618e-05, + "loss": 2.9038, + "step": 21730 + }, + { + "epoch": 1.3489974548389099, + "grad_norm": 0.14996761429379105, + "learning_rate": 6.715642605527531e-05, + "loss": 2.8492, + "step": 21731 + }, + { + "epoch": 1.3490595319386678, + "grad_norm": 0.15598486817312626, + "learning_rate": 6.715303374752516e-05, + "loss": 2.8429, + "step": 21732 + }, + { + "epoch": 1.3491216090384257, + "grad_norm": 0.16348678757423138, + "learning_rate": 6.71496413502835e-05, + "loss": 2.8722, + "step": 21733 + }, + { + "epoch": 1.3491836861381836, + "grad_norm": 0.15787418335554645, + "learning_rate": 6.714624886356797e-05, + "loss": 2.8306, + "step": 21734 + }, + { + "epoch": 1.3492457632379415, + "grad_norm": 0.14053482237235249, + "learning_rate": 6.714285628739632e-05, + "loss": 2.7603, + "step": 21735 + }, + { + "epoch": 1.3493078403376995, + "grad_norm": 0.1585207446960343, + "learning_rate": 6.713946362178622e-05, + "loss": 2.7265, + "step": 21736 + }, + { + "epoch": 1.3493699174374574, + "grad_norm": 0.1644604715260321, + "learning_rate": 6.713607086675539e-05, + "loss": 2.8291, + "step": 21737 + }, + { + "epoch": 1.3494319945372153, + "grad_norm": 0.15011042340197311, + "learning_rate": 6.71326780223215e-05, + "loss": 2.8497, + "step": 21738 + }, + { + "epoch": 1.3494940716369732, + "grad_norm": 0.14850931563563124, + "learning_rate": 6.712928508850227e-05, + "loss": 2.89, + "step": 21739 + }, + { + "epoch": 1.3495561487367311, + "grad_norm": 0.15311296985040895, + "learning_rate": 6.712589206531541e-05, + "loss": 2.7782, + "step": 21740 + }, + { + "epoch": 1.349618225836489, + "grad_norm": 0.1474602617159889, + "learning_rate": 6.71224989527786e-05, + "loss": 2.8228, + "step": 21741 + }, + { + "epoch": 1.3496803029362467, + "grad_norm": 0.14904550836394256, + "learning_rate": 6.711910575090957e-05, + "loss": 2.8059, + "step": 21742 + }, + { + "epoch": 1.3497423800360047, + "grad_norm": 0.15820350175977513, + "learning_rate": 6.7115712459726e-05, + "loss": 2.8719, + "step": 21743 + }, + { + "epoch": 1.3498044571357626, + "grad_norm": 0.18162271965128568, + "learning_rate": 6.711231907924563e-05, + "loss": 2.8442, + "step": 21744 + }, + { + "epoch": 1.3498665342355205, + "grad_norm": 0.1621695831817572, + "learning_rate": 6.710892560948611e-05, + "loss": 2.8236, + "step": 21745 + }, + { + "epoch": 1.3499286113352784, + "grad_norm": 0.13623811843888153, + "learning_rate": 6.710553205046518e-05, + "loss": 2.8161, + "step": 21746 + }, + { + "epoch": 1.3499906884350363, + "grad_norm": 0.16662773159346453, + "learning_rate": 6.710213840220054e-05, + "loss": 2.9866, + "step": 21747 + }, + { + "epoch": 1.3500527655347943, + "grad_norm": 0.15206952356211753, + "learning_rate": 6.709874466470987e-05, + "loss": 2.9289, + "step": 21748 + }, + { + "epoch": 1.3501148426345522, + "grad_norm": 0.1520548796055477, + "learning_rate": 6.709535083801093e-05, + "loss": 2.8241, + "step": 21749 + }, + { + "epoch": 1.35017691973431, + "grad_norm": 0.1595769383561819, + "learning_rate": 6.709195692212137e-05, + "loss": 2.907, + "step": 21750 + }, + { + "epoch": 1.3502389968340678, + "grad_norm": 0.1524610984443677, + "learning_rate": 6.708856291705893e-05, + "loss": 2.8272, + "step": 21751 + }, + { + "epoch": 1.3503010739338257, + "grad_norm": 0.1630528582072377, + "learning_rate": 6.708516882284131e-05, + "loss": 2.7853, + "step": 21752 + }, + { + "epoch": 1.3503631510335836, + "grad_norm": 0.14821970169755935, + "learning_rate": 6.708177463948623e-05, + "loss": 2.8757, + "step": 21753 + }, + { + "epoch": 1.3504252281333415, + "grad_norm": 0.1508478114113016, + "learning_rate": 6.707838036701138e-05, + "loss": 2.7965, + "step": 21754 + }, + { + "epoch": 1.3504873052330995, + "grad_norm": 0.15394738610939357, + "learning_rate": 6.707498600543446e-05, + "loss": 2.8384, + "step": 21755 + }, + { + "epoch": 1.3505493823328574, + "grad_norm": 0.15791908125700532, + "learning_rate": 6.707159155477319e-05, + "loss": 2.9222, + "step": 21756 + }, + { + "epoch": 1.3506114594326153, + "grad_norm": 0.14832874128470222, + "learning_rate": 6.70681970150453e-05, + "loss": 2.876, + "step": 21757 + }, + { + "epoch": 1.3506735365323732, + "grad_norm": 0.1846813090169423, + "learning_rate": 6.706480238626846e-05, + "loss": 2.792, + "step": 21758 + }, + { + "epoch": 1.3507356136321311, + "grad_norm": 0.14315496964871993, + "learning_rate": 6.706140766846042e-05, + "loss": 2.793, + "step": 21759 + }, + { + "epoch": 1.350797690731889, + "grad_norm": 0.15791039404913101, + "learning_rate": 6.705801286163887e-05, + "loss": 2.9111, + "step": 21760 + }, + { + "epoch": 1.350859767831647, + "grad_norm": 0.15398539115574347, + "learning_rate": 6.705461796582153e-05, + "loss": 2.9194, + "step": 21761 + }, + { + "epoch": 1.3509218449314049, + "grad_norm": 0.1518051824686787, + "learning_rate": 6.70512229810261e-05, + "loss": 2.875, + "step": 21762 + }, + { + "epoch": 1.3509839220311628, + "grad_norm": 0.15745506504684859, + "learning_rate": 6.704782790727031e-05, + "loss": 2.8603, + "step": 21763 + }, + { + "epoch": 1.3510459991309207, + "grad_norm": 0.16602350104105787, + "learning_rate": 6.704443274457187e-05, + "loss": 2.8503, + "step": 21764 + }, + { + "epoch": 1.3511080762306786, + "grad_norm": 0.15368821413596384, + "learning_rate": 6.704103749294847e-05, + "loss": 2.8598, + "step": 21765 + }, + { + "epoch": 1.3511701533304363, + "grad_norm": 0.16629525364647502, + "learning_rate": 6.703764215241786e-05, + "loss": 2.8483, + "step": 21766 + }, + { + "epoch": 1.3512322304301942, + "grad_norm": 0.16538830209072938, + "learning_rate": 6.703424672299771e-05, + "loss": 2.9199, + "step": 21767 + }, + { + "epoch": 1.3512943075299522, + "grad_norm": 0.14983289893144466, + "learning_rate": 6.703085120470577e-05, + "loss": 2.8412, + "step": 21768 + }, + { + "epoch": 1.35135638462971, + "grad_norm": 0.15826606087328499, + "learning_rate": 6.702745559755977e-05, + "loss": 2.8556, + "step": 21769 + }, + { + "epoch": 1.351418461729468, + "grad_norm": 0.16951063971335917, + "learning_rate": 6.702405990157736e-05, + "loss": 2.8246, + "step": 21770 + }, + { + "epoch": 1.351480538829226, + "grad_norm": 0.17075197756000504, + "learning_rate": 6.702066411677632e-05, + "loss": 2.7816, + "step": 21771 + }, + { + "epoch": 1.3515426159289838, + "grad_norm": 0.17114661588816898, + "learning_rate": 6.701726824317432e-05, + "loss": 2.85, + "step": 21772 + }, + { + "epoch": 1.3516046930287418, + "grad_norm": 0.1709568087483652, + "learning_rate": 6.701387228078912e-05, + "loss": 2.8418, + "step": 21773 + }, + { + "epoch": 1.3516667701284997, + "grad_norm": 0.1646982262518627, + "learning_rate": 6.701047622963842e-05, + "loss": 2.9273, + "step": 21774 + }, + { + "epoch": 1.3517288472282574, + "grad_norm": 0.18024977531829142, + "learning_rate": 6.700708008973992e-05, + "loss": 2.9233, + "step": 21775 + }, + { + "epoch": 1.3517909243280153, + "grad_norm": 0.16163888124619635, + "learning_rate": 6.700368386111137e-05, + "loss": 2.8749, + "step": 21776 + }, + { + "epoch": 1.3518530014277732, + "grad_norm": 0.14278215676543862, + "learning_rate": 6.700028754377048e-05, + "loss": 2.8486, + "step": 21777 + }, + { + "epoch": 1.3519150785275311, + "grad_norm": 0.16320183912786873, + "learning_rate": 6.699689113773495e-05, + "loss": 2.8685, + "step": 21778 + }, + { + "epoch": 1.351977155627289, + "grad_norm": 0.14920023034296268, + "learning_rate": 6.699349464302252e-05, + "loss": 2.8195, + "step": 21779 + }, + { + "epoch": 1.352039232727047, + "grad_norm": 0.15740162125159057, + "learning_rate": 6.69900980596509e-05, + "loss": 2.8783, + "step": 21780 + }, + { + "epoch": 1.3521013098268049, + "grad_norm": 0.16377417456016066, + "learning_rate": 6.69867013876378e-05, + "loss": 2.885, + "step": 21781 + }, + { + "epoch": 1.3521633869265628, + "grad_norm": 0.17292676787478514, + "learning_rate": 6.698330462700097e-05, + "loss": 2.8201, + "step": 21782 + }, + { + "epoch": 1.3522254640263207, + "grad_norm": 0.15260428989023522, + "learning_rate": 6.697990777775811e-05, + "loss": 2.8303, + "step": 21783 + }, + { + "epoch": 1.3522875411260786, + "grad_norm": 0.15571471554442484, + "learning_rate": 6.697651083992696e-05, + "loss": 2.7669, + "step": 21784 + }, + { + "epoch": 1.3523496182258365, + "grad_norm": 0.14690727224280356, + "learning_rate": 6.697311381352524e-05, + "loss": 2.8296, + "step": 21785 + }, + { + "epoch": 1.3524116953255945, + "grad_norm": 0.16568231782935686, + "learning_rate": 6.696971669857064e-05, + "loss": 2.8858, + "step": 21786 + }, + { + "epoch": 1.3524737724253524, + "grad_norm": 0.1975123496057505, + "learning_rate": 6.696631949508093e-05, + "loss": 2.7837, + "step": 21787 + }, + { + "epoch": 1.3525358495251103, + "grad_norm": 0.17070680500838764, + "learning_rate": 6.696292220307378e-05, + "loss": 2.8519, + "step": 21788 + }, + { + "epoch": 1.3525979266248682, + "grad_norm": 0.1572706267409038, + "learning_rate": 6.695952482256698e-05, + "loss": 2.87, + "step": 21789 + }, + { + "epoch": 1.352660003724626, + "grad_norm": 0.17159477603413162, + "learning_rate": 6.695612735357822e-05, + "loss": 2.7888, + "step": 21790 + }, + { + "epoch": 1.3527220808243838, + "grad_norm": 0.15844546700560305, + "learning_rate": 6.69527297961252e-05, + "loss": 2.8543, + "step": 21791 + }, + { + "epoch": 1.3527841579241418, + "grad_norm": 0.1677192898146882, + "learning_rate": 6.69493321502257e-05, + "loss": 2.9618, + "step": 21792 + }, + { + "epoch": 1.3528462350238997, + "grad_norm": 0.15485061200139216, + "learning_rate": 6.69459344158974e-05, + "loss": 2.8932, + "step": 21793 + }, + { + "epoch": 1.3529083121236576, + "grad_norm": 0.1665289348238065, + "learning_rate": 6.694253659315806e-05, + "loss": 2.7927, + "step": 21794 + }, + { + "epoch": 1.3529703892234155, + "grad_norm": 0.17467065286349565, + "learning_rate": 6.69391386820254e-05, + "loss": 2.8682, + "step": 21795 + }, + { + "epoch": 1.3530324663231734, + "grad_norm": 0.1965460921142377, + "learning_rate": 6.693574068251712e-05, + "loss": 2.8325, + "step": 21796 + }, + { + "epoch": 1.3530945434229313, + "grad_norm": 0.18834449410248036, + "learning_rate": 6.693234259465097e-05, + "loss": 2.8377, + "step": 21797 + }, + { + "epoch": 1.3531566205226893, + "grad_norm": 0.16216169799787794, + "learning_rate": 6.69289444184447e-05, + "loss": 2.7394, + "step": 21798 + }, + { + "epoch": 1.353218697622447, + "grad_norm": 0.22421944814167924, + "learning_rate": 6.692554615391601e-05, + "loss": 2.8461, + "step": 21799 + }, + { + "epoch": 1.3532807747222049, + "grad_norm": 0.18533166149455085, + "learning_rate": 6.692214780108263e-05, + "loss": 2.8017, + "step": 21800 + }, + { + "epoch": 1.3533428518219628, + "grad_norm": 0.17194641320294285, + "learning_rate": 6.691874935996229e-05, + "loss": 2.8197, + "step": 21801 + }, + { + "epoch": 1.3534049289217207, + "grad_norm": 0.15478973502236987, + "learning_rate": 6.691535083057276e-05, + "loss": 2.8674, + "step": 21802 + }, + { + "epoch": 1.3534670060214786, + "grad_norm": 0.16555542834203785, + "learning_rate": 6.691195221293171e-05, + "loss": 2.8986, + "step": 21803 + }, + { + "epoch": 1.3535290831212365, + "grad_norm": 0.2241331525230468, + "learning_rate": 6.69085535070569e-05, + "loss": 2.8416, + "step": 21804 + }, + { + "epoch": 1.3535911602209945, + "grad_norm": 0.15552741098983328, + "learning_rate": 6.690515471296608e-05, + "loss": 2.8854, + "step": 21805 + }, + { + "epoch": 1.3536532373207524, + "grad_norm": 0.1449276797013808, + "learning_rate": 6.690175583067695e-05, + "loss": 2.8539, + "step": 21806 + }, + { + "epoch": 1.3537153144205103, + "grad_norm": 0.15445550779743872, + "learning_rate": 6.689835686020725e-05, + "loss": 2.7344, + "step": 21807 + }, + { + "epoch": 1.3537773915202682, + "grad_norm": 0.16358458642699072, + "learning_rate": 6.689495780157472e-05, + "loss": 2.7454, + "step": 21808 + }, + { + "epoch": 1.3538394686200261, + "grad_norm": 0.5489384821922413, + "learning_rate": 6.68915586547971e-05, + "loss": 2.8974, + "step": 21809 + }, + { + "epoch": 1.353901545719784, + "grad_norm": 0.15759359488062624, + "learning_rate": 6.688815941989214e-05, + "loss": 2.8131, + "step": 21810 + }, + { + "epoch": 1.353963622819542, + "grad_norm": 0.3109751333532807, + "learning_rate": 6.688476009687752e-05, + "loss": 2.8399, + "step": 21811 + }, + { + "epoch": 1.3540256999192999, + "grad_norm": 0.17482429266587016, + "learning_rate": 6.688136068577102e-05, + "loss": 2.8467, + "step": 21812 + }, + { + "epoch": 1.3540877770190578, + "grad_norm": 0.1645281065761031, + "learning_rate": 6.687796118659037e-05, + "loss": 2.7997, + "step": 21813 + }, + { + "epoch": 1.3541498541188155, + "grad_norm": 0.17287744985576767, + "learning_rate": 6.68745615993533e-05, + "loss": 2.9108, + "step": 21814 + }, + { + "epoch": 1.3542119312185734, + "grad_norm": 0.17061823626271508, + "learning_rate": 6.687116192407753e-05, + "loss": 2.8111, + "step": 21815 + }, + { + "epoch": 1.3542740083183313, + "grad_norm": 0.18279059229921188, + "learning_rate": 6.686776216078083e-05, + "loss": 2.8394, + "step": 21816 + }, + { + "epoch": 1.3543360854180893, + "grad_norm": 0.1646086537273376, + "learning_rate": 6.68643623094809e-05, + "loss": 2.8357, + "step": 21817 + }, + { + "epoch": 1.3543981625178472, + "grad_norm": 0.16642915048760168, + "learning_rate": 6.686096237019552e-05, + "loss": 2.7942, + "step": 21818 + }, + { + "epoch": 1.354460239617605, + "grad_norm": 0.1868708107073661, + "learning_rate": 6.685756234294238e-05, + "loss": 2.737, + "step": 21819 + }, + { + "epoch": 1.354522316717363, + "grad_norm": 0.1650995448415248, + "learning_rate": 6.685416222773927e-05, + "loss": 2.7968, + "step": 21820 + }, + { + "epoch": 1.354584393817121, + "grad_norm": 0.16642916167942162, + "learning_rate": 6.68507620246039e-05, + "loss": 2.9085, + "step": 21821 + }, + { + "epoch": 1.3546464709168788, + "grad_norm": 0.19061958430367268, + "learning_rate": 6.6847361733554e-05, + "loss": 2.86, + "step": 21822 + }, + { + "epoch": 1.3547085480166365, + "grad_norm": 0.17898107654690876, + "learning_rate": 6.684396135460733e-05, + "loss": 2.8153, + "step": 21823 + }, + { + "epoch": 1.3547706251163945, + "grad_norm": 0.15437814527851884, + "learning_rate": 6.684056088778162e-05, + "loss": 2.7835, + "step": 21824 + }, + { + "epoch": 1.3548327022161524, + "grad_norm": 0.16885371288338208, + "learning_rate": 6.683716033309463e-05, + "loss": 3.0197, + "step": 21825 + }, + { + "epoch": 1.3548947793159103, + "grad_norm": 0.15280371089732245, + "learning_rate": 6.683375969056407e-05, + "loss": 2.7507, + "step": 21826 + }, + { + "epoch": 1.3549568564156682, + "grad_norm": 0.1667951626209563, + "learning_rate": 6.683035896020772e-05, + "loss": 2.9038, + "step": 21827 + }, + { + "epoch": 1.3550189335154261, + "grad_norm": 0.18888325864470398, + "learning_rate": 6.68269581420433e-05, + "loss": 2.8346, + "step": 21828 + }, + { + "epoch": 1.355081010615184, + "grad_norm": 0.1675077810899922, + "learning_rate": 6.682355723608852e-05, + "loss": 2.8235, + "step": 21829 + }, + { + "epoch": 1.355143087714942, + "grad_norm": 0.14491070089935304, + "learning_rate": 6.68201562423612e-05, + "loss": 2.8415, + "step": 21830 + }, + { + "epoch": 1.3552051648146999, + "grad_norm": 0.16379017594566506, + "learning_rate": 6.681675516087902e-05, + "loss": 2.8386, + "step": 21831 + }, + { + "epoch": 1.3552672419144578, + "grad_norm": 0.17585276632376945, + "learning_rate": 6.681335399165975e-05, + "loss": 2.906, + "step": 21832 + }, + { + "epoch": 1.3553293190142157, + "grad_norm": 0.16491942047432165, + "learning_rate": 6.680995273472113e-05, + "loss": 2.8146, + "step": 21833 + }, + { + "epoch": 1.3553913961139736, + "grad_norm": 0.5946649729340358, + "learning_rate": 6.680655139008092e-05, + "loss": 2.8494, + "step": 21834 + }, + { + "epoch": 1.3554534732137316, + "grad_norm": 0.19971463251920657, + "learning_rate": 6.680314995775683e-05, + "loss": 2.8771, + "step": 21835 + }, + { + "epoch": 1.3555155503134895, + "grad_norm": 0.17366144139519038, + "learning_rate": 6.679974843776665e-05, + "loss": 2.8688, + "step": 21836 + }, + { + "epoch": 1.3555776274132474, + "grad_norm": 0.21019058975106228, + "learning_rate": 6.679634683012808e-05, + "loss": 2.8064, + "step": 21837 + }, + { + "epoch": 1.355639704513005, + "grad_norm": 0.164122411597092, + "learning_rate": 6.679294513485889e-05, + "loss": 2.9172, + "step": 21838 + }, + { + "epoch": 1.355701781612763, + "grad_norm": 0.1853378925970505, + "learning_rate": 6.678954335197686e-05, + "loss": 2.8698, + "step": 21839 + }, + { + "epoch": 1.355763858712521, + "grad_norm": 0.17064252493535645, + "learning_rate": 6.678614148149966e-05, + "loss": 2.8186, + "step": 21840 + }, + { + "epoch": 1.3558259358122788, + "grad_norm": 0.17280576390146277, + "learning_rate": 6.678273952344512e-05, + "loss": 2.8149, + "step": 21841 + }, + { + "epoch": 1.3558880129120368, + "grad_norm": 0.18091292896353, + "learning_rate": 6.677933747783094e-05, + "loss": 2.8721, + "step": 21842 + }, + { + "epoch": 1.3559500900117947, + "grad_norm": 0.1628443018593347, + "learning_rate": 6.67759353446749e-05, + "loss": 2.886, + "step": 21843 + }, + { + "epoch": 1.3560121671115526, + "grad_norm": 0.17170249344826496, + "learning_rate": 6.67725331239947e-05, + "loss": 2.9158, + "step": 21844 + }, + { + "epoch": 1.3560742442113105, + "grad_norm": 0.15689496203999387, + "learning_rate": 6.676913081580816e-05, + "loss": 2.8652, + "step": 21845 + }, + { + "epoch": 1.3561363213110684, + "grad_norm": 0.17450702620368122, + "learning_rate": 6.676572842013298e-05, + "loss": 2.8461, + "step": 21846 + }, + { + "epoch": 1.3561983984108261, + "grad_norm": 0.16213324957000902, + "learning_rate": 6.676232593698692e-05, + "loss": 2.8422, + "step": 21847 + }, + { + "epoch": 1.356260475510584, + "grad_norm": 0.15767951546032927, + "learning_rate": 6.675892336638773e-05, + "loss": 2.8137, + "step": 21848 + }, + { + "epoch": 1.356322552610342, + "grad_norm": 0.16001567835472133, + "learning_rate": 6.675552070835318e-05, + "loss": 2.9107, + "step": 21849 + }, + { + "epoch": 1.3563846297100999, + "grad_norm": 0.1532423647379476, + "learning_rate": 6.675211796290102e-05, + "loss": 2.8504, + "step": 21850 + }, + { + "epoch": 1.3564467068098578, + "grad_norm": 0.1671639223051368, + "learning_rate": 6.674871513004897e-05, + "loss": 2.8241, + "step": 21851 + }, + { + "epoch": 1.3565087839096157, + "grad_norm": 0.17511840459097305, + "learning_rate": 6.674531220981483e-05, + "loss": 2.8203, + "step": 21852 + }, + { + "epoch": 1.3565708610093736, + "grad_norm": 0.15917387173565678, + "learning_rate": 6.674190920221631e-05, + "loss": 2.94, + "step": 21853 + }, + { + "epoch": 1.3566329381091315, + "grad_norm": 0.18261747309208534, + "learning_rate": 6.67385061072712e-05, + "loss": 2.8395, + "step": 21854 + }, + { + "epoch": 1.3566950152088895, + "grad_norm": 0.16928269760417083, + "learning_rate": 6.673510292499724e-05, + "loss": 2.7906, + "step": 21855 + }, + { + "epoch": 1.3567570923086474, + "grad_norm": 0.17502511367945364, + "learning_rate": 6.673169965541217e-05, + "loss": 2.8597, + "step": 21856 + }, + { + "epoch": 1.3568191694084053, + "grad_norm": 0.20719949615640026, + "learning_rate": 6.672829629853378e-05, + "loss": 2.8929, + "step": 21857 + }, + { + "epoch": 1.3568812465081632, + "grad_norm": 0.18301361832014854, + "learning_rate": 6.672489285437979e-05, + "loss": 2.951, + "step": 21858 + }, + { + "epoch": 1.3569433236079211, + "grad_norm": 0.1830200402928907, + "learning_rate": 6.6721489322968e-05, + "loss": 2.8168, + "step": 21859 + }, + { + "epoch": 1.357005400707679, + "grad_norm": 0.23722999873028935, + "learning_rate": 6.671808570431611e-05, + "loss": 2.7409, + "step": 21860 + }, + { + "epoch": 1.357067477807437, + "grad_norm": 0.1696324459534552, + "learning_rate": 6.671468199844193e-05, + "loss": 2.7803, + "step": 21861 + }, + { + "epoch": 1.3571295549071947, + "grad_norm": 0.19959306146357508, + "learning_rate": 6.671127820536318e-05, + "loss": 2.8746, + "step": 21862 + }, + { + "epoch": 1.3571916320069526, + "grad_norm": 0.1766206145646533, + "learning_rate": 6.670787432509766e-05, + "loss": 2.8245, + "step": 21863 + }, + { + "epoch": 1.3572537091067105, + "grad_norm": 0.1878659828982788, + "learning_rate": 6.670447035766309e-05, + "loss": 2.8175, + "step": 21864 + }, + { + "epoch": 1.3573157862064684, + "grad_norm": 0.1843316386845826, + "learning_rate": 6.670106630307723e-05, + "loss": 2.763, + "step": 21865 + }, + { + "epoch": 1.3573778633062263, + "grad_norm": 0.15977160855306366, + "learning_rate": 6.669766216135787e-05, + "loss": 2.7872, + "step": 21866 + }, + { + "epoch": 1.3574399404059843, + "grad_norm": 0.1667798795238137, + "learning_rate": 6.669425793252275e-05, + "loss": 2.9277, + "step": 21867 + }, + { + "epoch": 1.3575020175057422, + "grad_norm": 0.1564274316920464, + "learning_rate": 6.669085361658963e-05, + "loss": 2.8538, + "step": 21868 + }, + { + "epoch": 1.3575640946055, + "grad_norm": 0.15795041139146968, + "learning_rate": 6.668744921357627e-05, + "loss": 2.8587, + "step": 21869 + }, + { + "epoch": 1.357626171705258, + "grad_norm": 0.19677156009173, + "learning_rate": 6.668404472350044e-05, + "loss": 2.8994, + "step": 21870 + }, + { + "epoch": 1.3576882488050157, + "grad_norm": 0.17762955519548626, + "learning_rate": 6.66806401463799e-05, + "loss": 2.8426, + "step": 21871 + }, + { + "epoch": 1.3577503259047736, + "grad_norm": 0.17156925425236336, + "learning_rate": 6.66772354822324e-05, + "loss": 2.9525, + "step": 21872 + }, + { + "epoch": 1.3578124030045315, + "grad_norm": 0.1766228977639803, + "learning_rate": 6.667383073107574e-05, + "loss": 2.914, + "step": 21873 + }, + { + "epoch": 1.3578744801042895, + "grad_norm": 0.16728975319767656, + "learning_rate": 6.667042589292764e-05, + "loss": 2.9055, + "step": 21874 + }, + { + "epoch": 1.3579365572040474, + "grad_norm": 0.164847528043725, + "learning_rate": 6.666702096780588e-05, + "loss": 2.7677, + "step": 21875 + }, + { + "epoch": 1.3579986343038053, + "grad_norm": 0.16024333633071292, + "learning_rate": 6.666361595572823e-05, + "loss": 2.7436, + "step": 21876 + }, + { + "epoch": 1.3580607114035632, + "grad_norm": 0.16751698243287075, + "learning_rate": 6.666021085671246e-05, + "loss": 2.8106, + "step": 21877 + }, + { + "epoch": 1.3581227885033211, + "grad_norm": 0.19775391566900538, + "learning_rate": 6.66568056707763e-05, + "loss": 2.859, + "step": 21878 + }, + { + "epoch": 1.358184865603079, + "grad_norm": 0.15002203024575805, + "learning_rate": 6.665340039793756e-05, + "loss": 2.7905, + "step": 21879 + }, + { + "epoch": 1.358246942702837, + "grad_norm": 0.17372641105695003, + "learning_rate": 6.664999503821398e-05, + "loss": 2.8882, + "step": 21880 + }, + { + "epoch": 1.3583090198025949, + "grad_norm": 0.1548025801325323, + "learning_rate": 6.664658959162334e-05, + "loss": 2.877, + "step": 21881 + }, + { + "epoch": 1.3583710969023528, + "grad_norm": 0.15294602746499722, + "learning_rate": 6.66431840581834e-05, + "loss": 2.8106, + "step": 21882 + }, + { + "epoch": 1.3584331740021107, + "grad_norm": 0.15578438405602416, + "learning_rate": 6.663977843791193e-05, + "loss": 2.855, + "step": 21883 + }, + { + "epoch": 1.3584952511018686, + "grad_norm": 0.1909528441905042, + "learning_rate": 6.66363727308267e-05, + "loss": 2.8164, + "step": 21884 + }, + { + "epoch": 1.3585573282016266, + "grad_norm": 0.15291371463070885, + "learning_rate": 6.663296693694546e-05, + "loss": 2.8413, + "step": 21885 + }, + { + "epoch": 1.3586194053013843, + "grad_norm": 0.19968574615687368, + "learning_rate": 6.662956105628601e-05, + "loss": 2.7344, + "step": 21886 + }, + { + "epoch": 1.3586814824011422, + "grad_norm": 0.1538202493277943, + "learning_rate": 6.662615508886609e-05, + "loss": 2.8345, + "step": 21887 + }, + { + "epoch": 1.3587435595009, + "grad_norm": 0.17112325322511493, + "learning_rate": 6.662274903470348e-05, + "loss": 2.8435, + "step": 21888 + }, + { + "epoch": 1.358805636600658, + "grad_norm": 0.16591291055047253, + "learning_rate": 6.661934289381597e-05, + "loss": 2.8851, + "step": 21889 + }, + { + "epoch": 1.358867713700416, + "grad_norm": 0.1771231506031701, + "learning_rate": 6.66159366662213e-05, + "loss": 2.8377, + "step": 21890 + }, + { + "epoch": 1.3589297908001738, + "grad_norm": 0.19599391325919463, + "learning_rate": 6.661253035193726e-05, + "loss": 2.8276, + "step": 21891 + }, + { + "epoch": 1.3589918678999318, + "grad_norm": 0.15658848577316845, + "learning_rate": 6.660912395098161e-05, + "loss": 2.7646, + "step": 21892 + }, + { + "epoch": 1.3590539449996897, + "grad_norm": 0.16775289793929543, + "learning_rate": 6.660571746337214e-05, + "loss": 2.8162, + "step": 21893 + }, + { + "epoch": 1.3591160220994476, + "grad_norm": 0.16719116937763998, + "learning_rate": 6.66023108891266e-05, + "loss": 2.8416, + "step": 21894 + }, + { + "epoch": 1.3591780991992053, + "grad_norm": 0.1558333145669001, + "learning_rate": 6.659890422826278e-05, + "loss": 2.9077, + "step": 21895 + }, + { + "epoch": 1.3592401762989632, + "grad_norm": 0.17270054119644618, + "learning_rate": 6.659549748079844e-05, + "loss": 2.8514, + "step": 21896 + }, + { + "epoch": 1.3593022533987211, + "grad_norm": 0.16061863212171437, + "learning_rate": 6.659209064675138e-05, + "loss": 2.8311, + "step": 21897 + }, + { + "epoch": 1.359364330498479, + "grad_norm": 0.18190993576496806, + "learning_rate": 6.658868372613934e-05, + "loss": 2.8554, + "step": 21898 + }, + { + "epoch": 1.359426407598237, + "grad_norm": 0.16999162141404217, + "learning_rate": 6.658527671898011e-05, + "loss": 2.7897, + "step": 21899 + }, + { + "epoch": 1.3594884846979949, + "grad_norm": 0.16584022456336447, + "learning_rate": 6.658186962529147e-05, + "loss": 2.88, + "step": 21900 + }, + { + "epoch": 1.3595505617977528, + "grad_norm": 0.18588820733707984, + "learning_rate": 6.657846244509118e-05, + "loss": 2.8167, + "step": 21901 + }, + { + "epoch": 1.3596126388975107, + "grad_norm": 0.1624660818758073, + "learning_rate": 6.657505517839704e-05, + "loss": 2.8223, + "step": 21902 + }, + { + "epoch": 1.3596747159972686, + "grad_norm": 0.1503812737468176, + "learning_rate": 6.65716478252268e-05, + "loss": 2.7671, + "step": 21903 + }, + { + "epoch": 1.3597367930970266, + "grad_norm": 0.15884489412813155, + "learning_rate": 6.656824038559827e-05, + "loss": 2.8413, + "step": 21904 + }, + { + "epoch": 1.3597988701967845, + "grad_norm": 0.18039231353669982, + "learning_rate": 6.656483285952918e-05, + "loss": 2.7854, + "step": 21905 + }, + { + "epoch": 1.3598609472965424, + "grad_norm": 0.18102540661063254, + "learning_rate": 6.656142524703736e-05, + "loss": 2.8277, + "step": 21906 + }, + { + "epoch": 1.3599230243963003, + "grad_norm": 0.14928762534017379, + "learning_rate": 6.655801754814056e-05, + "loss": 2.7474, + "step": 21907 + }, + { + "epoch": 1.3599851014960582, + "grad_norm": 0.1740475994252222, + "learning_rate": 6.655460976285655e-05, + "loss": 2.8751, + "step": 21908 + }, + { + "epoch": 1.3600471785958161, + "grad_norm": 0.24843188315465872, + "learning_rate": 6.655120189120314e-05, + "loss": 2.8672, + "step": 21909 + }, + { + "epoch": 1.3601092556955738, + "grad_norm": 0.14975138216941083, + "learning_rate": 6.654779393319807e-05, + "loss": 2.8117, + "step": 21910 + }, + { + "epoch": 1.3601713327953318, + "grad_norm": 0.17053683581229534, + "learning_rate": 6.654438588885916e-05, + "loss": 2.8512, + "step": 21911 + }, + { + "epoch": 1.3602334098950897, + "grad_norm": 0.15299272468198705, + "learning_rate": 6.654097775820416e-05, + "loss": 2.7634, + "step": 21912 + }, + { + "epoch": 1.3602954869948476, + "grad_norm": 0.16450102418867674, + "learning_rate": 6.653756954125088e-05, + "loss": 2.831, + "step": 21913 + }, + { + "epoch": 1.3603575640946055, + "grad_norm": 0.16520397704776138, + "learning_rate": 6.653416123801708e-05, + "loss": 2.8812, + "step": 21914 + }, + { + "epoch": 1.3604196411943634, + "grad_norm": 0.18093034863545765, + "learning_rate": 6.653075284852053e-05, + "loss": 2.8583, + "step": 21915 + }, + { + "epoch": 1.3604817182941213, + "grad_norm": 0.21392463539587747, + "learning_rate": 6.652734437277905e-05, + "loss": 2.871, + "step": 21916 + }, + { + "epoch": 1.3605437953938793, + "grad_norm": 0.19159253455930542, + "learning_rate": 6.65239358108104e-05, + "loss": 2.9153, + "step": 21917 + }, + { + "epoch": 1.360605872493637, + "grad_norm": 0.17851465421116344, + "learning_rate": 6.652052716263236e-05, + "loss": 2.7733, + "step": 21918 + }, + { + "epoch": 1.3606679495933949, + "grad_norm": 0.20624899177593833, + "learning_rate": 6.651711842826272e-05, + "loss": 2.8784, + "step": 21919 + }, + { + "epoch": 1.3607300266931528, + "grad_norm": 0.18326543305580742, + "learning_rate": 6.651370960771927e-05, + "loss": 2.8084, + "step": 21920 + }, + { + "epoch": 1.3607921037929107, + "grad_norm": 0.21280911836444122, + "learning_rate": 6.651030070101977e-05, + "loss": 2.99, + "step": 21921 + }, + { + "epoch": 1.3608541808926686, + "grad_norm": 0.24532689094852625, + "learning_rate": 6.650689170818203e-05, + "loss": 2.8213, + "step": 21922 + }, + { + "epoch": 1.3609162579924265, + "grad_norm": 0.21997366679880423, + "learning_rate": 6.650348262922384e-05, + "loss": 2.7483, + "step": 21923 + }, + { + "epoch": 1.3609783350921845, + "grad_norm": 0.19187528207925944, + "learning_rate": 6.650007346416297e-05, + "loss": 2.8749, + "step": 21924 + }, + { + "epoch": 1.3610404121919424, + "grad_norm": 0.2046028442622331, + "learning_rate": 6.64966642130172e-05, + "loss": 2.8359, + "step": 21925 + }, + { + "epoch": 1.3611024892917003, + "grad_norm": 0.18082743294673356, + "learning_rate": 6.649325487580433e-05, + "loss": 2.8823, + "step": 21926 + }, + { + "epoch": 1.3611645663914582, + "grad_norm": 0.21031372240003193, + "learning_rate": 6.648984545254217e-05, + "loss": 2.8629, + "step": 21927 + }, + { + "epoch": 1.3612266434912161, + "grad_norm": 0.16199396230416666, + "learning_rate": 6.648643594324846e-05, + "loss": 2.8669, + "step": 21928 + }, + { + "epoch": 1.361288720590974, + "grad_norm": 0.16994418622570806, + "learning_rate": 6.648302634794101e-05, + "loss": 2.9462, + "step": 21929 + }, + { + "epoch": 1.361350797690732, + "grad_norm": 0.17780856795228972, + "learning_rate": 6.647961666663762e-05, + "loss": 2.879, + "step": 21930 + }, + { + "epoch": 1.36141287479049, + "grad_norm": 0.26494770091784, + "learning_rate": 6.647620689935608e-05, + "loss": 2.8403, + "step": 21931 + }, + { + "epoch": 1.3614749518902478, + "grad_norm": 0.15636271345268987, + "learning_rate": 6.647279704611414e-05, + "loss": 2.8422, + "step": 21932 + }, + { + "epoch": 1.3615370289900055, + "grad_norm": 0.1649873302573293, + "learning_rate": 6.646938710692963e-05, + "loss": 2.8919, + "step": 21933 + }, + { + "epoch": 1.3615991060897634, + "grad_norm": 0.1736741616411293, + "learning_rate": 6.646597708182034e-05, + "loss": 2.8987, + "step": 21934 + }, + { + "epoch": 1.3616611831895213, + "grad_norm": 0.1624782685274058, + "learning_rate": 6.646256697080405e-05, + "loss": 2.8826, + "step": 21935 + }, + { + "epoch": 1.3617232602892793, + "grad_norm": 0.18790087959396828, + "learning_rate": 6.645915677389853e-05, + "loss": 2.8356, + "step": 21936 + }, + { + "epoch": 1.3617853373890372, + "grad_norm": 0.1530757094867543, + "learning_rate": 6.645574649112163e-05, + "loss": 2.7938, + "step": 21937 + }, + { + "epoch": 1.361847414488795, + "grad_norm": 0.18073398165009913, + "learning_rate": 6.645233612249107e-05, + "loss": 2.8494, + "step": 21938 + }, + { + "epoch": 1.361909491588553, + "grad_norm": 0.1643079669779826, + "learning_rate": 6.644892566802469e-05, + "loss": 2.8266, + "step": 21939 + }, + { + "epoch": 1.361971568688311, + "grad_norm": 0.17706163123793245, + "learning_rate": 6.644551512774027e-05, + "loss": 2.9261, + "step": 21940 + }, + { + "epoch": 1.3620336457880688, + "grad_norm": 0.14632503108080155, + "learning_rate": 6.644210450165562e-05, + "loss": 2.8668, + "step": 21941 + }, + { + "epoch": 1.3620957228878265, + "grad_norm": 0.1581168940505735, + "learning_rate": 6.643869378978851e-05, + "loss": 2.9283, + "step": 21942 + }, + { + "epoch": 1.3621577999875845, + "grad_norm": 0.20154564587411775, + "learning_rate": 6.643528299215675e-05, + "loss": 2.855, + "step": 21943 + }, + { + "epoch": 1.3622198770873424, + "grad_norm": 0.18647441918942007, + "learning_rate": 6.643187210877811e-05, + "loss": 2.79, + "step": 21944 + }, + { + "epoch": 1.3622819541871003, + "grad_norm": 0.14689516324776036, + "learning_rate": 6.642846113967043e-05, + "loss": 2.8669, + "step": 21945 + }, + { + "epoch": 1.3623440312868582, + "grad_norm": 0.16658313465949395, + "learning_rate": 6.642505008485146e-05, + "loss": 2.8385, + "step": 21946 + }, + { + "epoch": 1.3624061083866161, + "grad_norm": 0.14884682735567212, + "learning_rate": 6.642163894433903e-05, + "loss": 2.7874, + "step": 21947 + }, + { + "epoch": 1.362468185486374, + "grad_norm": 0.14971962394929053, + "learning_rate": 6.641822771815092e-05, + "loss": 2.7854, + "step": 21948 + }, + { + "epoch": 1.362530262586132, + "grad_norm": 0.18736757131767146, + "learning_rate": 6.641481640630492e-05, + "loss": 2.8555, + "step": 21949 + }, + { + "epoch": 1.3625923396858899, + "grad_norm": 0.1542829736652329, + "learning_rate": 6.641140500881884e-05, + "loss": 2.7596, + "step": 21950 + }, + { + "epoch": 1.3626544167856478, + "grad_norm": 0.15504327545823826, + "learning_rate": 6.640799352571049e-05, + "loss": 2.8333, + "step": 21951 + }, + { + "epoch": 1.3627164938854057, + "grad_norm": 0.15607455655871627, + "learning_rate": 6.640458195699763e-05, + "loss": 2.8695, + "step": 21952 + }, + { + "epoch": 1.3627785709851636, + "grad_norm": 0.15965237089258003, + "learning_rate": 6.640117030269811e-05, + "loss": 2.9067, + "step": 21953 + }, + { + "epoch": 1.3628406480849216, + "grad_norm": 0.16093507829714424, + "learning_rate": 6.639775856282968e-05, + "loss": 2.8822, + "step": 21954 + }, + { + "epoch": 1.3629027251846795, + "grad_norm": 0.17265429797730464, + "learning_rate": 6.639434673741018e-05, + "loss": 2.899, + "step": 21955 + }, + { + "epoch": 1.3629648022844374, + "grad_norm": 0.15307818568120324, + "learning_rate": 6.63909348264574e-05, + "loss": 2.8306, + "step": 21956 + }, + { + "epoch": 1.363026879384195, + "grad_norm": 0.17555215529686308, + "learning_rate": 6.63875228299891e-05, + "loss": 2.8156, + "step": 21957 + }, + { + "epoch": 1.363088956483953, + "grad_norm": 0.1465297819333721, + "learning_rate": 6.638411074802316e-05, + "loss": 2.7622, + "step": 21958 + }, + { + "epoch": 1.363151033583711, + "grad_norm": 0.1913652376105566, + "learning_rate": 6.63806985805773e-05, + "loss": 2.8898, + "step": 21959 + }, + { + "epoch": 1.3632131106834688, + "grad_norm": 0.18335899603292136, + "learning_rate": 6.637728632766937e-05, + "loss": 2.7396, + "step": 21960 + }, + { + "epoch": 1.3632751877832268, + "grad_norm": 0.19227936185259714, + "learning_rate": 6.637387398931715e-05, + "loss": 2.8637, + "step": 21961 + }, + { + "epoch": 1.3633372648829847, + "grad_norm": 0.16042577348183498, + "learning_rate": 6.637046156553849e-05, + "loss": 2.8306, + "step": 21962 + }, + { + "epoch": 1.3633993419827426, + "grad_norm": 0.15929713333842316, + "learning_rate": 6.636704905635114e-05, + "loss": 2.8771, + "step": 21963 + }, + { + "epoch": 1.3634614190825005, + "grad_norm": 0.15708211222177368, + "learning_rate": 6.636363646177292e-05, + "loss": 2.8909, + "step": 21964 + }, + { + "epoch": 1.3635234961822584, + "grad_norm": 0.17301864953550947, + "learning_rate": 6.636022378182163e-05, + "loss": 2.8863, + "step": 21965 + }, + { + "epoch": 1.3635855732820161, + "grad_norm": 0.14778865624920612, + "learning_rate": 6.635681101651508e-05, + "loss": 2.8534, + "step": 21966 + }, + { + "epoch": 1.363647650381774, + "grad_norm": 0.15643242081616235, + "learning_rate": 6.635339816587109e-05, + "loss": 2.838, + "step": 21967 + }, + { + "epoch": 1.363709727481532, + "grad_norm": 0.17316283200835966, + "learning_rate": 6.634998522990743e-05, + "loss": 2.9128, + "step": 21968 + }, + { + "epoch": 1.3637718045812899, + "grad_norm": 0.15420415361853546, + "learning_rate": 6.634657220864195e-05, + "loss": 2.9013, + "step": 21969 + }, + { + "epoch": 1.3638338816810478, + "grad_norm": 0.16953815235202843, + "learning_rate": 6.634315910209243e-05, + "loss": 2.9317, + "step": 21970 + }, + { + "epoch": 1.3638959587808057, + "grad_norm": 0.1591581201276818, + "learning_rate": 6.633974591027668e-05, + "loss": 2.9012, + "step": 21971 + }, + { + "epoch": 1.3639580358805636, + "grad_norm": 0.16628625353465223, + "learning_rate": 6.63363326332125e-05, + "loss": 2.8366, + "step": 21972 + }, + { + "epoch": 1.3640201129803216, + "grad_norm": 0.16566602135930109, + "learning_rate": 6.633291927091772e-05, + "loss": 2.8495, + "step": 21973 + }, + { + "epoch": 1.3640821900800795, + "grad_norm": 0.16071843705322073, + "learning_rate": 6.632950582341013e-05, + "loss": 2.9287, + "step": 21974 + }, + { + "epoch": 1.3641442671798374, + "grad_norm": 0.14917815664883863, + "learning_rate": 6.632609229070753e-05, + "loss": 2.8061, + "step": 21975 + }, + { + "epoch": 1.3642063442795953, + "grad_norm": 0.16200137849798155, + "learning_rate": 6.632267867282777e-05, + "loss": 2.9047, + "step": 21976 + }, + { + "epoch": 1.3642684213793532, + "grad_norm": 0.1749376664949656, + "learning_rate": 6.631926496978861e-05, + "loss": 2.7951, + "step": 21977 + }, + { + "epoch": 1.3643304984791111, + "grad_norm": 0.15485014288299528, + "learning_rate": 6.631585118160791e-05, + "loss": 2.8085, + "step": 21978 + }, + { + "epoch": 1.364392575578869, + "grad_norm": 0.1557766360132399, + "learning_rate": 6.63124373083034e-05, + "loss": 2.8436, + "step": 21979 + }, + { + "epoch": 1.364454652678627, + "grad_norm": 0.17147045861086932, + "learning_rate": 6.6309023349893e-05, + "loss": 2.754, + "step": 21980 + }, + { + "epoch": 1.3645167297783847, + "grad_norm": 0.16618457499953854, + "learning_rate": 6.630560930639445e-05, + "loss": 2.7906, + "step": 21981 + }, + { + "epoch": 1.3645788068781426, + "grad_norm": 0.1590928384893018, + "learning_rate": 6.630219517782556e-05, + "loss": 2.7764, + "step": 21982 + }, + { + "epoch": 1.3646408839779005, + "grad_norm": 0.18653974411871, + "learning_rate": 6.629878096420419e-05, + "loss": 2.8474, + "step": 21983 + }, + { + "epoch": 1.3647029610776584, + "grad_norm": 0.17128045572647593, + "learning_rate": 6.62953666655481e-05, + "loss": 2.7998, + "step": 21984 + }, + { + "epoch": 1.3647650381774163, + "grad_norm": 0.16139608895888183, + "learning_rate": 6.629195228187515e-05, + "loss": 2.8493, + "step": 21985 + }, + { + "epoch": 1.3648271152771743, + "grad_norm": 0.18239167512793816, + "learning_rate": 6.62885378132031e-05, + "loss": 2.8842, + "step": 21986 + }, + { + "epoch": 1.3648891923769322, + "grad_norm": 0.15889707295935512, + "learning_rate": 6.62851232595498e-05, + "loss": 2.8926, + "step": 21987 + }, + { + "epoch": 1.36495126947669, + "grad_norm": 0.17218701803032632, + "learning_rate": 6.628170862093308e-05, + "loss": 2.8676, + "step": 21988 + }, + { + "epoch": 1.365013346576448, + "grad_norm": 0.1804658575775741, + "learning_rate": 6.62782938973707e-05, + "loss": 2.9026, + "step": 21989 + }, + { + "epoch": 1.3650754236762057, + "grad_norm": 0.17980745706656606, + "learning_rate": 6.627487908888054e-05, + "loss": 2.743, + "step": 21990 + }, + { + "epoch": 1.3651375007759636, + "grad_norm": 0.18787572884941728, + "learning_rate": 6.627146419548037e-05, + "loss": 2.8689, + "step": 21991 + }, + { + "epoch": 1.3651995778757215, + "grad_norm": 0.16221473900604227, + "learning_rate": 6.626804921718803e-05, + "loss": 2.7361, + "step": 21992 + }, + { + "epoch": 1.3652616549754795, + "grad_norm": 0.15918653274514857, + "learning_rate": 6.626463415402132e-05, + "loss": 2.764, + "step": 21993 + }, + { + "epoch": 1.3653237320752374, + "grad_norm": 0.1529411499190429, + "learning_rate": 6.626121900599808e-05, + "loss": 2.8692, + "step": 21994 + }, + { + "epoch": 1.3653858091749953, + "grad_norm": 0.1473701715596474, + "learning_rate": 6.625780377313609e-05, + "loss": 2.9198, + "step": 21995 + }, + { + "epoch": 1.3654478862747532, + "grad_norm": 0.17044509704369204, + "learning_rate": 6.625438845545321e-05, + "loss": 2.8574, + "step": 21996 + }, + { + "epoch": 1.3655099633745111, + "grad_norm": 0.18044289118802986, + "learning_rate": 6.625097305296722e-05, + "loss": 2.9107, + "step": 21997 + }, + { + "epoch": 1.365572040474269, + "grad_norm": 0.2118139689803435, + "learning_rate": 6.624755756569598e-05, + "loss": 2.8447, + "step": 21998 + }, + { + "epoch": 1.365634117574027, + "grad_norm": 0.16787285063192067, + "learning_rate": 6.624414199365728e-05, + "loss": 2.8357, + "step": 21999 + }, + { + "epoch": 1.365696194673785, + "grad_norm": 0.19179147763242596, + "learning_rate": 6.624072633686894e-05, + "loss": 2.8374, + "step": 22000 + }, + { + "epoch": 1.3657582717735428, + "grad_norm": 0.17528968531686698, + "learning_rate": 6.623731059534881e-05, + "loss": 2.844, + "step": 22001 + }, + { + "epoch": 1.3658203488733007, + "grad_norm": 0.17029971284721723, + "learning_rate": 6.623389476911465e-05, + "loss": 2.7748, + "step": 22002 + }, + { + "epoch": 1.3658824259730586, + "grad_norm": 0.17587404450893793, + "learning_rate": 6.623047885818434e-05, + "loss": 2.8501, + "step": 22003 + }, + { + "epoch": 1.3659445030728166, + "grad_norm": 0.17264281345747695, + "learning_rate": 6.622706286257569e-05, + "loss": 2.8518, + "step": 22004 + }, + { + "epoch": 1.3660065801725743, + "grad_norm": 0.15855907498022315, + "learning_rate": 6.622364678230651e-05, + "loss": 2.6993, + "step": 22005 + }, + { + "epoch": 1.3660686572723322, + "grad_norm": 0.2091741235244158, + "learning_rate": 6.622023061739463e-05, + "loss": 2.8097, + "step": 22006 + }, + { + "epoch": 1.36613073437209, + "grad_norm": 0.1690645128137353, + "learning_rate": 6.621681436785786e-05, + "loss": 2.8603, + "step": 22007 + }, + { + "epoch": 1.366192811471848, + "grad_norm": 0.15499547754696544, + "learning_rate": 6.621339803371405e-05, + "loss": 2.8648, + "step": 22008 + }, + { + "epoch": 1.366254888571606, + "grad_norm": 0.15938945125061277, + "learning_rate": 6.620998161498098e-05, + "loss": 2.8059, + "step": 22009 + }, + { + "epoch": 1.3663169656713638, + "grad_norm": 0.1610800107504562, + "learning_rate": 6.620656511167653e-05, + "loss": 2.7977, + "step": 22010 + }, + { + "epoch": 1.3663790427711218, + "grad_norm": 0.15801369546470176, + "learning_rate": 6.620314852381846e-05, + "loss": 2.9052, + "step": 22011 + }, + { + "epoch": 1.3664411198708797, + "grad_norm": 0.15468308847089993, + "learning_rate": 6.619973185142466e-05, + "loss": 2.8011, + "step": 22012 + }, + { + "epoch": 1.3665031969706376, + "grad_norm": 0.15660447799964106, + "learning_rate": 6.619631509451291e-05, + "loss": 2.7756, + "step": 22013 + }, + { + "epoch": 1.3665652740703953, + "grad_norm": 0.147161811947897, + "learning_rate": 6.619289825310107e-05, + "loss": 2.8859, + "step": 22014 + }, + { + "epoch": 1.3666273511701532, + "grad_norm": 0.16081096989881857, + "learning_rate": 6.618948132720693e-05, + "loss": 2.8278, + "step": 22015 + }, + { + "epoch": 1.3666894282699111, + "grad_norm": 0.15566412639736266, + "learning_rate": 6.618606431684835e-05, + "loss": 2.8839, + "step": 22016 + }, + { + "epoch": 1.366751505369669, + "grad_norm": 0.17159343002051908, + "learning_rate": 6.618264722204314e-05, + "loss": 2.8368, + "step": 22017 + }, + { + "epoch": 1.366813582469427, + "grad_norm": 0.19502962608743213, + "learning_rate": 6.617923004280912e-05, + "loss": 2.8445, + "step": 22018 + }, + { + "epoch": 1.3668756595691849, + "grad_norm": 0.14823035794831832, + "learning_rate": 6.617581277916414e-05, + "loss": 2.7928, + "step": 22019 + }, + { + "epoch": 1.3669377366689428, + "grad_norm": 0.15526927486574038, + "learning_rate": 6.617239543112601e-05, + "loss": 2.8503, + "step": 22020 + }, + { + "epoch": 1.3669998137687007, + "grad_norm": 0.14360057331016585, + "learning_rate": 6.616897799871258e-05, + "loss": 2.8433, + "step": 22021 + }, + { + "epoch": 1.3670618908684586, + "grad_norm": 0.15696426096707072, + "learning_rate": 6.616556048194165e-05, + "loss": 2.7307, + "step": 22022 + }, + { + "epoch": 1.3671239679682166, + "grad_norm": 0.19588924131553612, + "learning_rate": 6.616214288083108e-05, + "loss": 2.7376, + "step": 22023 + }, + { + "epoch": 1.3671860450679745, + "grad_norm": 0.18894401455209228, + "learning_rate": 6.615872519539869e-05, + "loss": 2.8663, + "step": 22024 + }, + { + "epoch": 1.3672481221677324, + "grad_norm": 0.15526727148324915, + "learning_rate": 6.61553074256623e-05, + "loss": 2.8342, + "step": 22025 + }, + { + "epoch": 1.3673101992674903, + "grad_norm": 0.17767187214241564, + "learning_rate": 6.615188957163975e-05, + "loss": 2.8376, + "step": 22026 + }, + { + "epoch": 1.3673722763672482, + "grad_norm": 0.14978246213126822, + "learning_rate": 6.614847163334886e-05, + "loss": 2.82, + "step": 22027 + }, + { + "epoch": 1.3674343534670061, + "grad_norm": 0.13714442742422098, + "learning_rate": 6.61450536108075e-05, + "loss": 2.7337, + "step": 22028 + }, + { + "epoch": 1.3674964305667638, + "grad_norm": 0.17264316409996627, + "learning_rate": 6.614163550403345e-05, + "loss": 2.8102, + "step": 22029 + }, + { + "epoch": 1.3675585076665218, + "grad_norm": 0.14548176653939468, + "learning_rate": 6.613821731304458e-05, + "loss": 2.8988, + "step": 22030 + }, + { + "epoch": 1.3676205847662797, + "grad_norm": 0.14410522537974021, + "learning_rate": 6.61347990378587e-05, + "loss": 2.8467, + "step": 22031 + }, + { + "epoch": 1.3676826618660376, + "grad_norm": 0.1599572387379468, + "learning_rate": 6.613138067849369e-05, + "loss": 2.8317, + "step": 22032 + }, + { + "epoch": 1.3677447389657955, + "grad_norm": 0.15494321100051142, + "learning_rate": 6.612796223496731e-05, + "loss": 2.8099, + "step": 22033 + }, + { + "epoch": 1.3678068160655534, + "grad_norm": 0.16921213061347407, + "learning_rate": 6.612454370729747e-05, + "loss": 2.8141, + "step": 22034 + }, + { + "epoch": 1.3678688931653114, + "grad_norm": 0.15939799357952922, + "learning_rate": 6.612112509550196e-05, + "loss": 2.8508, + "step": 22035 + }, + { + "epoch": 1.3679309702650693, + "grad_norm": 0.1807090703214617, + "learning_rate": 6.61177063995986e-05, + "loss": 2.8516, + "step": 22036 + }, + { + "epoch": 1.3679930473648272, + "grad_norm": 0.18637950196895625, + "learning_rate": 6.611428761960528e-05, + "loss": 2.9157, + "step": 22037 + }, + { + "epoch": 1.3680551244645849, + "grad_norm": 0.16009146263853427, + "learning_rate": 6.61108687555398e-05, + "loss": 2.9466, + "step": 22038 + }, + { + "epoch": 1.3681172015643428, + "grad_norm": 0.15135898567867132, + "learning_rate": 6.610744980742001e-05, + "loss": 2.7567, + "step": 22039 + }, + { + "epoch": 1.3681792786641007, + "grad_norm": 0.15548725521236434, + "learning_rate": 6.610403077526374e-05, + "loss": 2.849, + "step": 22040 + }, + { + "epoch": 1.3682413557638586, + "grad_norm": 0.1807121006789383, + "learning_rate": 6.610061165908885e-05, + "loss": 2.8655, + "step": 22041 + }, + { + "epoch": 1.3683034328636166, + "grad_norm": 0.15983239436528468, + "learning_rate": 6.609719245891316e-05, + "loss": 2.8045, + "step": 22042 + }, + { + "epoch": 1.3683655099633745, + "grad_norm": 0.16753525018746085, + "learning_rate": 6.609377317475449e-05, + "loss": 2.8498, + "step": 22043 + }, + { + "epoch": 1.3684275870631324, + "grad_norm": 0.17630913730654607, + "learning_rate": 6.609035380663071e-05, + "loss": 2.8966, + "step": 22044 + }, + { + "epoch": 1.3684896641628903, + "grad_norm": 0.14783297589484456, + "learning_rate": 6.608693435455964e-05, + "loss": 2.8861, + "step": 22045 + }, + { + "epoch": 1.3685517412626482, + "grad_norm": 0.15869665283920123, + "learning_rate": 6.608351481855915e-05, + "loss": 2.8709, + "step": 22046 + }, + { + "epoch": 1.3686138183624061, + "grad_norm": 0.17039535589353746, + "learning_rate": 6.608009519864702e-05, + "loss": 2.8409, + "step": 22047 + }, + { + "epoch": 1.368675895462164, + "grad_norm": 0.17349443660976618, + "learning_rate": 6.607667549484117e-05, + "loss": 2.8892, + "step": 22048 + }, + { + "epoch": 1.368737972561922, + "grad_norm": 0.15983519706515417, + "learning_rate": 6.607325570715937e-05, + "loss": 2.9394, + "step": 22049 + }, + { + "epoch": 1.36880004966168, + "grad_norm": 0.1485929992227611, + "learning_rate": 6.606983583561952e-05, + "loss": 2.8954, + "step": 22050 + }, + { + "epoch": 1.3688621267614378, + "grad_norm": 0.19865510716033805, + "learning_rate": 6.606641588023941e-05, + "loss": 2.835, + "step": 22051 + }, + { + "epoch": 1.3689242038611957, + "grad_norm": 0.2019416285623794, + "learning_rate": 6.606299584103692e-05, + "loss": 2.8229, + "step": 22052 + }, + { + "epoch": 1.3689862809609534, + "grad_norm": 0.1821063643918122, + "learning_rate": 6.605957571802988e-05, + "loss": 2.7963, + "step": 22053 + }, + { + "epoch": 1.3690483580607113, + "grad_norm": 0.23555488825843218, + "learning_rate": 6.605615551123612e-05, + "loss": 2.797, + "step": 22054 + }, + { + "epoch": 1.3691104351604693, + "grad_norm": 0.21829834359807931, + "learning_rate": 6.605273522067352e-05, + "loss": 2.8656, + "step": 22055 + }, + { + "epoch": 1.3691725122602272, + "grad_norm": 0.1685643448610236, + "learning_rate": 6.60493148463599e-05, + "loss": 2.7578, + "step": 22056 + }, + { + "epoch": 1.369234589359985, + "grad_norm": 0.20308915152027032, + "learning_rate": 6.604589438831311e-05, + "loss": 2.7688, + "step": 22057 + }, + { + "epoch": 1.369296666459743, + "grad_norm": 0.17185571410608927, + "learning_rate": 6.604247384655099e-05, + "loss": 2.8374, + "step": 22058 + }, + { + "epoch": 1.369358743559501, + "grad_norm": 0.17764267284948648, + "learning_rate": 6.603905322109139e-05, + "loss": 2.8083, + "step": 22059 + }, + { + "epoch": 1.3694208206592589, + "grad_norm": 0.20373659493516638, + "learning_rate": 6.603563251195216e-05, + "loss": 2.8005, + "step": 22060 + }, + { + "epoch": 1.3694828977590168, + "grad_norm": 0.16722074561371, + "learning_rate": 6.603221171915113e-05, + "loss": 2.8921, + "step": 22061 + }, + { + "epoch": 1.3695449748587745, + "grad_norm": 0.15574424071593435, + "learning_rate": 6.602879084270618e-05, + "loss": 2.9508, + "step": 22062 + }, + { + "epoch": 1.3696070519585324, + "grad_norm": 0.16911698588043247, + "learning_rate": 6.602536988263512e-05, + "loss": 2.852, + "step": 22063 + }, + { + "epoch": 1.3696691290582903, + "grad_norm": 0.19521095497185315, + "learning_rate": 6.602194883895582e-05, + "loss": 2.8127, + "step": 22064 + }, + { + "epoch": 1.3697312061580482, + "grad_norm": 0.15248860295107358, + "learning_rate": 6.601852771168612e-05, + "loss": 2.7365, + "step": 22065 + }, + { + "epoch": 1.3697932832578061, + "grad_norm": 0.15936951347331862, + "learning_rate": 6.601510650084388e-05, + "loss": 2.9455, + "step": 22066 + }, + { + "epoch": 1.369855360357564, + "grad_norm": 0.15617079515451174, + "learning_rate": 6.601168520644692e-05, + "loss": 2.8484, + "step": 22067 + }, + { + "epoch": 1.369917437457322, + "grad_norm": 0.14894909285403246, + "learning_rate": 6.600826382851314e-05, + "loss": 2.9396, + "step": 22068 + }, + { + "epoch": 1.36997951455708, + "grad_norm": 0.19250271541364683, + "learning_rate": 6.600484236706034e-05, + "loss": 2.9255, + "step": 22069 + }, + { + "epoch": 1.3700415916568378, + "grad_norm": 0.16908564279600477, + "learning_rate": 6.60014208221064e-05, + "loss": 2.8617, + "step": 22070 + }, + { + "epoch": 1.3701036687565957, + "grad_norm": 0.17494089797502865, + "learning_rate": 6.599799919366917e-05, + "loss": 2.9194, + "step": 22071 + }, + { + "epoch": 1.3701657458563536, + "grad_norm": 0.1399358778911076, + "learning_rate": 6.599457748176649e-05, + "loss": 2.873, + "step": 22072 + }, + { + "epoch": 1.3702278229561116, + "grad_norm": 0.15750948997509862, + "learning_rate": 6.59911556864162e-05, + "loss": 2.8432, + "step": 22073 + }, + { + "epoch": 1.3702899000558695, + "grad_norm": 0.15744412220016737, + "learning_rate": 6.598773380763619e-05, + "loss": 2.8672, + "step": 22074 + }, + { + "epoch": 1.3703519771556274, + "grad_norm": 0.15267884252240518, + "learning_rate": 6.59843118454443e-05, + "loss": 2.7935, + "step": 22075 + }, + { + "epoch": 1.3704140542553853, + "grad_norm": 0.153358928409651, + "learning_rate": 6.598088979985835e-05, + "loss": 2.9049, + "step": 22076 + }, + { + "epoch": 1.370476131355143, + "grad_norm": 0.14599920671391203, + "learning_rate": 6.597746767089623e-05, + "loss": 2.8519, + "step": 22077 + }, + { + "epoch": 1.370538208454901, + "grad_norm": 0.16383440192135404, + "learning_rate": 6.597404545857579e-05, + "loss": 2.7734, + "step": 22078 + }, + { + "epoch": 1.3706002855546588, + "grad_norm": 0.15359636226927895, + "learning_rate": 6.597062316291486e-05, + "loss": 2.7925, + "step": 22079 + }, + { + "epoch": 1.3706623626544168, + "grad_norm": 0.17361532537325744, + "learning_rate": 6.596720078393132e-05, + "loss": 2.9328, + "step": 22080 + }, + { + "epoch": 1.3707244397541747, + "grad_norm": 0.17627701254737113, + "learning_rate": 6.596377832164302e-05, + "loss": 2.8074, + "step": 22081 + }, + { + "epoch": 1.3707865168539326, + "grad_norm": 0.18152428049113833, + "learning_rate": 6.596035577606781e-05, + "loss": 2.8079, + "step": 22082 + }, + { + "epoch": 1.3708485939536905, + "grad_norm": 0.18502795285323517, + "learning_rate": 6.595693314722356e-05, + "loss": 2.8893, + "step": 22083 + }, + { + "epoch": 1.3709106710534484, + "grad_norm": 0.16326907719406766, + "learning_rate": 6.59535104351281e-05, + "loss": 2.7863, + "step": 22084 + }, + { + "epoch": 1.3709727481532064, + "grad_norm": 0.1626170264267794, + "learning_rate": 6.595008763979931e-05, + "loss": 2.7786, + "step": 22085 + }, + { + "epoch": 1.371034825252964, + "grad_norm": 0.16232635729806993, + "learning_rate": 6.594666476125503e-05, + "loss": 2.8297, + "step": 22086 + }, + { + "epoch": 1.371096902352722, + "grad_norm": 0.15280527727731086, + "learning_rate": 6.594324179951314e-05, + "loss": 2.8906, + "step": 22087 + }, + { + "epoch": 1.3711589794524799, + "grad_norm": 0.16518434078126928, + "learning_rate": 6.593981875459149e-05, + "loss": 2.9159, + "step": 22088 + }, + { + "epoch": 1.3712210565522378, + "grad_norm": 0.16183593968219465, + "learning_rate": 6.593639562650793e-05, + "loss": 2.9226, + "step": 22089 + }, + { + "epoch": 1.3712831336519957, + "grad_norm": 0.15174313895278743, + "learning_rate": 6.593297241528033e-05, + "loss": 2.8191, + "step": 22090 + }, + { + "epoch": 1.3713452107517536, + "grad_norm": 0.16145935494089453, + "learning_rate": 6.592954912092654e-05, + "loss": 2.7729, + "step": 22091 + }, + { + "epoch": 1.3714072878515116, + "grad_norm": 0.16930309623972767, + "learning_rate": 6.592612574346442e-05, + "loss": 2.8001, + "step": 22092 + }, + { + "epoch": 1.3714693649512695, + "grad_norm": 0.1541668609723164, + "learning_rate": 6.592270228291185e-05, + "loss": 2.7413, + "step": 22093 + }, + { + "epoch": 1.3715314420510274, + "grad_norm": 0.15540414383138176, + "learning_rate": 6.591927873928667e-05, + "loss": 2.8072, + "step": 22094 + }, + { + "epoch": 1.3715935191507853, + "grad_norm": 0.1630707579532494, + "learning_rate": 6.591585511260675e-05, + "loss": 2.7905, + "step": 22095 + }, + { + "epoch": 1.3716555962505432, + "grad_norm": 0.16679051697559757, + "learning_rate": 6.591243140288995e-05, + "loss": 2.8128, + "step": 22096 + }, + { + "epoch": 1.3717176733503011, + "grad_norm": 0.18738108082460808, + "learning_rate": 6.590900761015413e-05, + "loss": 2.7602, + "step": 22097 + }, + { + "epoch": 1.371779750450059, + "grad_norm": 0.16516316279413515, + "learning_rate": 6.590558373441717e-05, + "loss": 2.7948, + "step": 22098 + }, + { + "epoch": 1.371841827549817, + "grad_norm": 0.15286757205227158, + "learning_rate": 6.59021597756969e-05, + "loss": 2.8726, + "step": 22099 + }, + { + "epoch": 1.371903904649575, + "grad_norm": 0.15429232385417982, + "learning_rate": 6.58987357340112e-05, + "loss": 2.8283, + "step": 22100 + }, + { + "epoch": 1.3719659817493326, + "grad_norm": 0.15742667122248064, + "learning_rate": 6.589531160937794e-05, + "loss": 2.8697, + "step": 22101 + }, + { + "epoch": 1.3720280588490905, + "grad_norm": 0.14169405843012892, + "learning_rate": 6.589188740181498e-05, + "loss": 2.7784, + "step": 22102 + }, + { + "epoch": 1.3720901359488484, + "grad_norm": 0.15701096128472505, + "learning_rate": 6.588846311134019e-05, + "loss": 2.8608, + "step": 22103 + }, + { + "epoch": 1.3721522130486064, + "grad_norm": 0.15792365762510838, + "learning_rate": 6.588503873797143e-05, + "loss": 2.8774, + "step": 22104 + }, + { + "epoch": 1.3722142901483643, + "grad_norm": 0.15700151793048325, + "learning_rate": 6.588161428172656e-05, + "loss": 2.8924, + "step": 22105 + }, + { + "epoch": 1.3722763672481222, + "grad_norm": 0.14407165371036945, + "learning_rate": 6.587818974262346e-05, + "loss": 2.8736, + "step": 22106 + }, + { + "epoch": 1.37233844434788, + "grad_norm": 0.15980558268786568, + "learning_rate": 6.587476512068e-05, + "loss": 2.8832, + "step": 22107 + }, + { + "epoch": 1.372400521447638, + "grad_norm": 0.14385033350548854, + "learning_rate": 6.587134041591402e-05, + "loss": 2.7208, + "step": 22108 + }, + { + "epoch": 1.372462598547396, + "grad_norm": 0.1599321367375243, + "learning_rate": 6.586791562834342e-05, + "loss": 2.8671, + "step": 22109 + }, + { + "epoch": 1.3725246756471536, + "grad_norm": 0.1599524992975009, + "learning_rate": 6.586449075798604e-05, + "loss": 2.8298, + "step": 22110 + }, + { + "epoch": 1.3725867527469116, + "grad_norm": 0.1707053483342477, + "learning_rate": 6.586106580485976e-05, + "loss": 2.8666, + "step": 22111 + }, + { + "epoch": 1.3726488298466695, + "grad_norm": 0.18791950503515534, + "learning_rate": 6.585764076898245e-05, + "loss": 2.9318, + "step": 22112 + }, + { + "epoch": 1.3727109069464274, + "grad_norm": 0.15324097299592887, + "learning_rate": 6.585421565037199e-05, + "loss": 2.8404, + "step": 22113 + }, + { + "epoch": 1.3727729840461853, + "grad_norm": 0.165382158416791, + "learning_rate": 6.585079044904623e-05, + "loss": 2.8687, + "step": 22114 + }, + { + "epoch": 1.3728350611459432, + "grad_norm": 0.15563508270149032, + "learning_rate": 6.584736516502304e-05, + "loss": 2.8277, + "step": 22115 + }, + { + "epoch": 1.3728971382457011, + "grad_norm": 0.15772674799065556, + "learning_rate": 6.584393979832031e-05, + "loss": 2.8244, + "step": 22116 + }, + { + "epoch": 1.372959215345459, + "grad_norm": 0.1478035275576946, + "learning_rate": 6.584051434895589e-05, + "loss": 2.8452, + "step": 22117 + }, + { + "epoch": 1.373021292445217, + "grad_norm": 0.1739510037587436, + "learning_rate": 6.583708881694767e-05, + "loss": 2.8519, + "step": 22118 + }, + { + "epoch": 1.373083369544975, + "grad_norm": 0.157033220914552, + "learning_rate": 6.58336632023135e-05, + "loss": 2.8652, + "step": 22119 + }, + { + "epoch": 1.3731454466447328, + "grad_norm": 0.1450516572040848, + "learning_rate": 6.583023750507129e-05, + "loss": 2.8375, + "step": 22120 + }, + { + "epoch": 1.3732075237444907, + "grad_norm": 0.19346594522766378, + "learning_rate": 6.582681172523887e-05, + "loss": 2.8504, + "step": 22121 + }, + { + "epoch": 1.3732696008442487, + "grad_norm": 0.1624923170089114, + "learning_rate": 6.582338586283413e-05, + "loss": 2.8913, + "step": 22122 + }, + { + "epoch": 1.3733316779440066, + "grad_norm": 0.20032325935926237, + "learning_rate": 6.581995991787495e-05, + "loss": 2.8026, + "step": 22123 + }, + { + "epoch": 1.3733937550437645, + "grad_norm": 0.15087431372902832, + "learning_rate": 6.58165338903792e-05, + "loss": 2.8124, + "step": 22124 + }, + { + "epoch": 1.3734558321435222, + "grad_norm": 0.16313667421842942, + "learning_rate": 6.581310778036474e-05, + "loss": 2.8773, + "step": 22125 + }, + { + "epoch": 1.37351790924328, + "grad_norm": 0.15028621095729422, + "learning_rate": 6.580968158784947e-05, + "loss": 2.8586, + "step": 22126 + }, + { + "epoch": 1.373579986343038, + "grad_norm": 0.16147747169913051, + "learning_rate": 6.580625531285126e-05, + "loss": 2.8753, + "step": 22127 + }, + { + "epoch": 1.373642063442796, + "grad_norm": 0.14460309279288158, + "learning_rate": 6.580282895538797e-05, + "loss": 2.8513, + "step": 22128 + }, + { + "epoch": 1.3737041405425539, + "grad_norm": 0.16859532084188408, + "learning_rate": 6.579940251547749e-05, + "loss": 2.9196, + "step": 22129 + }, + { + "epoch": 1.3737662176423118, + "grad_norm": 0.1622456414503537, + "learning_rate": 6.579597599313768e-05, + "loss": 2.8184, + "step": 22130 + }, + { + "epoch": 1.3738282947420697, + "grad_norm": 0.14965407722100263, + "learning_rate": 6.579254938838644e-05, + "loss": 2.7435, + "step": 22131 + }, + { + "epoch": 1.3738903718418276, + "grad_norm": 0.14993989579718356, + "learning_rate": 6.578912270124163e-05, + "loss": 2.8532, + "step": 22132 + }, + { + "epoch": 1.3739524489415855, + "grad_norm": 0.15992285422997643, + "learning_rate": 6.578569593172114e-05, + "loss": 2.7215, + "step": 22133 + }, + { + "epoch": 1.3740145260413432, + "grad_norm": 0.1481447972978631, + "learning_rate": 6.578226907984284e-05, + "loss": 2.8572, + "step": 22134 + }, + { + "epoch": 1.3740766031411011, + "grad_norm": 0.14666511194012877, + "learning_rate": 6.57788421456246e-05, + "loss": 2.8141, + "step": 22135 + }, + { + "epoch": 1.374138680240859, + "grad_norm": 0.16803749474167398, + "learning_rate": 6.577541512908432e-05, + "loss": 2.91, + "step": 22136 + }, + { + "epoch": 1.374200757340617, + "grad_norm": 0.1526165744524111, + "learning_rate": 6.577198803023988e-05, + "loss": 2.8524, + "step": 22137 + }, + { + "epoch": 1.374262834440375, + "grad_norm": 0.15183795202364928, + "learning_rate": 6.576856084910912e-05, + "loss": 2.775, + "step": 22138 + }, + { + "epoch": 1.3743249115401328, + "grad_norm": 0.1961018252894721, + "learning_rate": 6.576513358570998e-05, + "loss": 2.8741, + "step": 22139 + }, + { + "epoch": 1.3743869886398907, + "grad_norm": 0.13922187373534706, + "learning_rate": 6.576170624006031e-05, + "loss": 2.8321, + "step": 22140 + }, + { + "epoch": 1.3744490657396486, + "grad_norm": 0.1591294215120691, + "learning_rate": 6.575827881217798e-05, + "loss": 2.8412, + "step": 22141 + }, + { + "epoch": 1.3745111428394066, + "grad_norm": 0.15276911845255445, + "learning_rate": 6.575485130208088e-05, + "loss": 2.819, + "step": 22142 + }, + { + "epoch": 1.3745732199391645, + "grad_norm": 0.1506851520074558, + "learning_rate": 6.57514237097869e-05, + "loss": 2.8781, + "step": 22143 + }, + { + "epoch": 1.3746352970389224, + "grad_norm": 0.16845598082284025, + "learning_rate": 6.574799603531393e-05, + "loss": 2.8552, + "step": 22144 + }, + { + "epoch": 1.3746973741386803, + "grad_norm": 0.16028714064931374, + "learning_rate": 6.574456827867984e-05, + "loss": 2.7936, + "step": 22145 + }, + { + "epoch": 1.3747594512384382, + "grad_norm": 0.17529548175678705, + "learning_rate": 6.57411404399025e-05, + "loss": 2.8494, + "step": 22146 + }, + { + "epoch": 1.3748215283381962, + "grad_norm": 0.1678177964761678, + "learning_rate": 6.573771251899981e-05, + "loss": 2.8257, + "step": 22147 + }, + { + "epoch": 1.374883605437954, + "grad_norm": 0.16214745430497784, + "learning_rate": 6.573428451598966e-05, + "loss": 2.7289, + "step": 22148 + }, + { + "epoch": 1.3749456825377118, + "grad_norm": 0.18159276065021893, + "learning_rate": 6.573085643088993e-05, + "loss": 2.8796, + "step": 22149 + }, + { + "epoch": 1.3750077596374697, + "grad_norm": 0.17308889679648534, + "learning_rate": 6.572742826371849e-05, + "loss": 2.8492, + "step": 22150 + }, + { + "epoch": 1.3750698367372276, + "grad_norm": 0.2008517412465831, + "learning_rate": 6.572400001449326e-05, + "loss": 2.768, + "step": 22151 + }, + { + "epoch": 1.3751319138369855, + "grad_norm": 0.15463105361524815, + "learning_rate": 6.572057168323209e-05, + "loss": 2.8261, + "step": 22152 + }, + { + "epoch": 1.3751939909367434, + "grad_norm": 0.15758005529686192, + "learning_rate": 6.571714326995289e-05, + "loss": 2.7861, + "step": 22153 + }, + { + "epoch": 1.3752560680365014, + "grad_norm": 0.16982593239374275, + "learning_rate": 6.571371477467354e-05, + "loss": 2.8459, + "step": 22154 + }, + { + "epoch": 1.3753181451362593, + "grad_norm": 0.16174019288986824, + "learning_rate": 6.571028619741191e-05, + "loss": 2.8517, + "step": 22155 + }, + { + "epoch": 1.3753802222360172, + "grad_norm": 0.15637044436750733, + "learning_rate": 6.570685753818591e-05, + "loss": 2.8447, + "step": 22156 + }, + { + "epoch": 1.375442299335775, + "grad_norm": 0.1557071016036397, + "learning_rate": 6.570342879701342e-05, + "loss": 2.8926, + "step": 22157 + }, + { + "epoch": 1.3755043764355328, + "grad_norm": 0.17001376462706844, + "learning_rate": 6.569999997391235e-05, + "loss": 2.7938, + "step": 22158 + }, + { + "epoch": 1.3755664535352907, + "grad_norm": 0.16533371056449095, + "learning_rate": 6.569657106890054e-05, + "loss": 2.8165, + "step": 22159 + }, + { + "epoch": 1.3756285306350486, + "grad_norm": 0.16641401845860596, + "learning_rate": 6.569314208199593e-05, + "loss": 2.8715, + "step": 22160 + }, + { + "epoch": 1.3756906077348066, + "grad_norm": 0.16521611394595737, + "learning_rate": 6.568971301321638e-05, + "loss": 2.8779, + "step": 22161 + }, + { + "epoch": 1.3757526848345645, + "grad_norm": 0.16040080286724934, + "learning_rate": 6.568628386257978e-05, + "loss": 2.8625, + "step": 22162 + }, + { + "epoch": 1.3758147619343224, + "grad_norm": 0.17508292830353953, + "learning_rate": 6.568285463010404e-05, + "loss": 2.8517, + "step": 22163 + }, + { + "epoch": 1.3758768390340803, + "grad_norm": 0.1770287013396511, + "learning_rate": 6.567942531580705e-05, + "loss": 2.8659, + "step": 22164 + }, + { + "epoch": 1.3759389161338382, + "grad_norm": 0.15695390694453348, + "learning_rate": 6.567599591970667e-05, + "loss": 2.7929, + "step": 22165 + }, + { + "epoch": 1.3760009932335961, + "grad_norm": 0.15803570188852498, + "learning_rate": 6.567256644182082e-05, + "loss": 2.8011, + "step": 22166 + }, + { + "epoch": 1.376063070333354, + "grad_norm": 0.16987874042632978, + "learning_rate": 6.566913688216738e-05, + "loss": 2.9212, + "step": 22167 + }, + { + "epoch": 1.376125147433112, + "grad_norm": 0.15264363605740394, + "learning_rate": 6.566570724076426e-05, + "loss": 2.712, + "step": 22168 + }, + { + "epoch": 1.37618722453287, + "grad_norm": 0.17873617942440131, + "learning_rate": 6.566227751762934e-05, + "loss": 2.8346, + "step": 22169 + }, + { + "epoch": 1.3762493016326278, + "grad_norm": 0.15981187081166498, + "learning_rate": 6.565884771278052e-05, + "loss": 2.851, + "step": 22170 + }, + { + "epoch": 1.3763113787323857, + "grad_norm": 0.17806250287273884, + "learning_rate": 6.56554178262357e-05, + "loss": 2.9021, + "step": 22171 + }, + { + "epoch": 1.3763734558321437, + "grad_norm": 0.16507692297783555, + "learning_rate": 6.565198785801274e-05, + "loss": 2.8689, + "step": 22172 + }, + { + "epoch": 1.3764355329319014, + "grad_norm": 0.15925978769065097, + "learning_rate": 6.564855780812957e-05, + "loss": 2.8815, + "step": 22173 + }, + { + "epoch": 1.3764976100316593, + "grad_norm": 0.15339237997724522, + "learning_rate": 6.564512767660408e-05, + "loss": 2.7863, + "step": 22174 + }, + { + "epoch": 1.3765596871314172, + "grad_norm": 0.18381759511260515, + "learning_rate": 6.564169746345414e-05, + "loss": 2.8847, + "step": 22175 + }, + { + "epoch": 1.376621764231175, + "grad_norm": 0.16257207400029286, + "learning_rate": 6.56382671686977e-05, + "loss": 2.8393, + "step": 22176 + }, + { + "epoch": 1.376683841330933, + "grad_norm": 0.16085096041739796, + "learning_rate": 6.56348367923526e-05, + "loss": 2.8133, + "step": 22177 + }, + { + "epoch": 1.376745918430691, + "grad_norm": 0.17070801071053948, + "learning_rate": 6.563140633443677e-05, + "loss": 2.8223, + "step": 22178 + }, + { + "epoch": 1.3768079955304489, + "grad_norm": 0.1571536457792654, + "learning_rate": 6.562797579496812e-05, + "loss": 2.8244, + "step": 22179 + }, + { + "epoch": 1.3768700726302068, + "grad_norm": 0.17994437308532674, + "learning_rate": 6.562454517396448e-05, + "loss": 2.7872, + "step": 22180 + }, + { + "epoch": 1.3769321497299647, + "grad_norm": 0.18774333700695106, + "learning_rate": 6.562111447144384e-05, + "loss": 2.9058, + "step": 22181 + }, + { + "epoch": 1.3769942268297224, + "grad_norm": 0.15814132425232597, + "learning_rate": 6.561768368742401e-05, + "loss": 2.735, + "step": 22182 + }, + { + "epoch": 1.3770563039294803, + "grad_norm": 0.2092773665349381, + "learning_rate": 6.561425282192297e-05, + "loss": 2.854, + "step": 22183 + }, + { + "epoch": 1.3771183810292382, + "grad_norm": 0.15207738086295372, + "learning_rate": 6.561082187495856e-05, + "loss": 2.8465, + "step": 22184 + }, + { + "epoch": 1.3771804581289961, + "grad_norm": 0.1635625920504995, + "learning_rate": 6.56073908465487e-05, + "loss": 2.7166, + "step": 22185 + }, + { + "epoch": 1.377242535228754, + "grad_norm": 0.15695788842886618, + "learning_rate": 6.560395973671132e-05, + "loss": 2.7982, + "step": 22186 + }, + { + "epoch": 1.377304612328512, + "grad_norm": 0.1595304462681624, + "learning_rate": 6.560052854546427e-05, + "loss": 2.7846, + "step": 22187 + }, + { + "epoch": 1.37736668942827, + "grad_norm": 0.16947307731256936, + "learning_rate": 6.559709727282548e-05, + "loss": 2.8218, + "step": 22188 + }, + { + "epoch": 1.3774287665280278, + "grad_norm": 0.15837833921439423, + "learning_rate": 6.559366591881284e-05, + "loss": 2.8672, + "step": 22189 + }, + { + "epoch": 1.3774908436277857, + "grad_norm": 0.16451778703657086, + "learning_rate": 6.559023448344429e-05, + "loss": 2.8152, + "step": 22190 + }, + { + "epoch": 1.3775529207275437, + "grad_norm": 0.16003592553484922, + "learning_rate": 6.558680296673768e-05, + "loss": 2.8822, + "step": 22191 + }, + { + "epoch": 1.3776149978273016, + "grad_norm": 0.16579167463584293, + "learning_rate": 6.558337136871093e-05, + "loss": 2.7746, + "step": 22192 + }, + { + "epoch": 1.3776770749270595, + "grad_norm": 0.16940483266902176, + "learning_rate": 6.557993968938195e-05, + "loss": 2.8795, + "step": 22193 + }, + { + "epoch": 1.3777391520268174, + "grad_norm": 0.1433536328301314, + "learning_rate": 6.557650792876865e-05, + "loss": 2.7019, + "step": 22194 + }, + { + "epoch": 1.3778012291265753, + "grad_norm": 0.14957281791519395, + "learning_rate": 6.557307608688892e-05, + "loss": 2.8136, + "step": 22195 + }, + { + "epoch": 1.3778633062263332, + "grad_norm": 0.13938708625015703, + "learning_rate": 6.556964416376068e-05, + "loss": 2.7704, + "step": 22196 + }, + { + "epoch": 1.377925383326091, + "grad_norm": 0.14738227312326502, + "learning_rate": 6.556621215940182e-05, + "loss": 2.9038, + "step": 22197 + }, + { + "epoch": 1.3779874604258489, + "grad_norm": 0.13984520261425648, + "learning_rate": 6.556278007383026e-05, + "loss": 2.8456, + "step": 22198 + }, + { + "epoch": 1.3780495375256068, + "grad_norm": 0.14701245805035754, + "learning_rate": 6.55593479070639e-05, + "loss": 2.8201, + "step": 22199 + }, + { + "epoch": 1.3781116146253647, + "grad_norm": 0.14891753252042225, + "learning_rate": 6.555591565912062e-05, + "loss": 2.7807, + "step": 22200 + }, + { + "epoch": 1.3781736917251226, + "grad_norm": 0.15286723088035606, + "learning_rate": 6.555248333001838e-05, + "loss": 2.8175, + "step": 22201 + }, + { + "epoch": 1.3782357688248805, + "grad_norm": 0.15519931511526608, + "learning_rate": 6.554905091977506e-05, + "loss": 2.8343, + "step": 22202 + }, + { + "epoch": 1.3782978459246384, + "grad_norm": 0.1421492606398904, + "learning_rate": 6.554561842840854e-05, + "loss": 2.7966, + "step": 22203 + }, + { + "epoch": 1.3783599230243964, + "grad_norm": 0.16840155973572926, + "learning_rate": 6.554218585593678e-05, + "loss": 2.7943, + "step": 22204 + }, + { + "epoch": 1.3784220001241543, + "grad_norm": 0.15436606123562768, + "learning_rate": 6.553875320237765e-05, + "loss": 2.7978, + "step": 22205 + }, + { + "epoch": 1.378484077223912, + "grad_norm": 0.18503038901032243, + "learning_rate": 6.553532046774909e-05, + "loss": 2.9326, + "step": 22206 + }, + { + "epoch": 1.37854615432367, + "grad_norm": 0.16037703624229493, + "learning_rate": 6.553188765206896e-05, + "loss": 2.7969, + "step": 22207 + }, + { + "epoch": 1.3786082314234278, + "grad_norm": 0.16167510706110674, + "learning_rate": 6.552845475535523e-05, + "loss": 2.8308, + "step": 22208 + }, + { + "epoch": 1.3786703085231857, + "grad_norm": 0.15026618094161448, + "learning_rate": 6.552502177762576e-05, + "loss": 2.8974, + "step": 22209 + }, + { + "epoch": 1.3787323856229436, + "grad_norm": 0.15316050395222205, + "learning_rate": 6.55215887188985e-05, + "loss": 2.8639, + "step": 22210 + }, + { + "epoch": 1.3787944627227016, + "grad_norm": 0.1470688159962743, + "learning_rate": 6.551815557919133e-05, + "loss": 2.7639, + "step": 22211 + }, + { + "epoch": 1.3788565398224595, + "grad_norm": 0.15075821354657454, + "learning_rate": 6.551472235852218e-05, + "loss": 2.7721, + "step": 22212 + }, + { + "epoch": 1.3789186169222174, + "grad_norm": 0.15784109748789704, + "learning_rate": 6.551128905690897e-05, + "loss": 2.8199, + "step": 22213 + }, + { + "epoch": 1.3789806940219753, + "grad_norm": 0.14293750775949768, + "learning_rate": 6.550785567436958e-05, + "loss": 2.7297, + "step": 22214 + }, + { + "epoch": 1.3790427711217332, + "grad_norm": 0.1423050211364035, + "learning_rate": 6.550442221092197e-05, + "loss": 2.7969, + "step": 22215 + }, + { + "epoch": 1.3791048482214912, + "grad_norm": 0.15526938283172742, + "learning_rate": 6.550098866658398e-05, + "loss": 2.6941, + "step": 22216 + }, + { + "epoch": 1.379166925321249, + "grad_norm": 0.15092381176375746, + "learning_rate": 6.549755504137362e-05, + "loss": 2.7894, + "step": 22217 + }, + { + "epoch": 1.379229002421007, + "grad_norm": 0.1552794053472317, + "learning_rate": 6.549412133530871e-05, + "loss": 2.9491, + "step": 22218 + }, + { + "epoch": 1.379291079520765, + "grad_norm": 0.15686210304796414, + "learning_rate": 6.549068754840723e-05, + "loss": 2.8619, + "step": 22219 + }, + { + "epoch": 1.3793531566205228, + "grad_norm": 0.15337531809979807, + "learning_rate": 6.548725368068708e-05, + "loss": 2.8643, + "step": 22220 + }, + { + "epoch": 1.3794152337202805, + "grad_norm": 0.1603726750645362, + "learning_rate": 6.548381973216615e-05, + "loss": 2.9096, + "step": 22221 + }, + { + "epoch": 1.3794773108200384, + "grad_norm": 0.15973838513309171, + "learning_rate": 6.548038570286239e-05, + "loss": 2.8403, + "step": 22222 + }, + { + "epoch": 1.3795393879197964, + "grad_norm": 0.1612766779595435, + "learning_rate": 6.54769515927937e-05, + "loss": 2.8143, + "step": 22223 + }, + { + "epoch": 1.3796014650195543, + "grad_norm": 0.1494740010630585, + "learning_rate": 6.547351740197798e-05, + "loss": 2.7864, + "step": 22224 + }, + { + "epoch": 1.3796635421193122, + "grad_norm": 0.16267597614798318, + "learning_rate": 6.547008313043317e-05, + "loss": 2.8152, + "step": 22225 + }, + { + "epoch": 1.37972561921907, + "grad_norm": 0.1508232433762476, + "learning_rate": 6.546664877817719e-05, + "loss": 2.8569, + "step": 22226 + }, + { + "epoch": 1.379787696318828, + "grad_norm": 0.16115283841621358, + "learning_rate": 6.546321434522793e-05, + "loss": 2.8611, + "step": 22227 + }, + { + "epoch": 1.379849773418586, + "grad_norm": 0.1484717593059702, + "learning_rate": 6.545977983160334e-05, + "loss": 2.8221, + "step": 22228 + }, + { + "epoch": 1.3799118505183439, + "grad_norm": 0.16371531308406242, + "learning_rate": 6.545634523732132e-05, + "loss": 2.7871, + "step": 22229 + }, + { + "epoch": 1.3799739276181016, + "grad_norm": 0.13821428315364023, + "learning_rate": 6.545291056239981e-05, + "loss": 2.8423, + "step": 22230 + }, + { + "epoch": 1.3800360047178595, + "grad_norm": 0.16680881403913023, + "learning_rate": 6.54494758068567e-05, + "loss": 2.7722, + "step": 22231 + }, + { + "epoch": 1.3800980818176174, + "grad_norm": 0.1589030571000696, + "learning_rate": 6.544604097070992e-05, + "loss": 2.8146, + "step": 22232 + }, + { + "epoch": 1.3801601589173753, + "grad_norm": 0.17082294431552383, + "learning_rate": 6.544260605397739e-05, + "loss": 2.8499, + "step": 22233 + }, + { + "epoch": 1.3802222360171332, + "grad_norm": 0.16646996764698552, + "learning_rate": 6.543917105667705e-05, + "loss": 2.9272, + "step": 22234 + }, + { + "epoch": 1.3802843131168911, + "grad_norm": 0.1824128440728998, + "learning_rate": 6.54357359788268e-05, + "loss": 2.8828, + "step": 22235 + }, + { + "epoch": 1.380346390216649, + "grad_norm": 0.16407762185297406, + "learning_rate": 6.543230082044457e-05, + "loss": 2.8746, + "step": 22236 + }, + { + "epoch": 1.380408467316407, + "grad_norm": 0.1983517968592115, + "learning_rate": 6.542886558154829e-05, + "loss": 2.8965, + "step": 22237 + }, + { + "epoch": 1.380470544416165, + "grad_norm": 0.1551376144592715, + "learning_rate": 6.542543026215586e-05, + "loss": 2.8886, + "step": 22238 + }, + { + "epoch": 1.3805326215159228, + "grad_norm": 0.16216092841096147, + "learning_rate": 6.542199486228521e-05, + "loss": 2.8483, + "step": 22239 + }, + { + "epoch": 1.3805946986156807, + "grad_norm": 0.15764210561926068, + "learning_rate": 6.54185593819543e-05, + "loss": 2.8065, + "step": 22240 + }, + { + "epoch": 1.3806567757154387, + "grad_norm": 0.16868291444153372, + "learning_rate": 6.541512382118099e-05, + "loss": 2.6802, + "step": 22241 + }, + { + "epoch": 1.3807188528151966, + "grad_norm": 0.1627907680186517, + "learning_rate": 6.541168817998326e-05, + "loss": 2.859, + "step": 22242 + }, + { + "epoch": 1.3807809299149545, + "grad_norm": 0.1566029853023486, + "learning_rate": 6.540825245837898e-05, + "loss": 2.8206, + "step": 22243 + }, + { + "epoch": 1.3808430070147124, + "grad_norm": 0.15483313938897958, + "learning_rate": 6.540481665638614e-05, + "loss": 2.837, + "step": 22244 + }, + { + "epoch": 1.38090508411447, + "grad_norm": 0.15870159409905624, + "learning_rate": 6.540138077402262e-05, + "loss": 2.8111, + "step": 22245 + }, + { + "epoch": 1.380967161214228, + "grad_norm": 0.15308414785239907, + "learning_rate": 6.539794481130635e-05, + "loss": 2.7775, + "step": 22246 + }, + { + "epoch": 1.381029238313986, + "grad_norm": 0.1616255191992307, + "learning_rate": 6.539450876825527e-05, + "loss": 2.8578, + "step": 22247 + }, + { + "epoch": 1.3810913154137439, + "grad_norm": 0.16228942179586153, + "learning_rate": 6.539107264488731e-05, + "loss": 2.816, + "step": 22248 + }, + { + "epoch": 1.3811533925135018, + "grad_norm": 0.14260929890372823, + "learning_rate": 6.538763644122037e-05, + "loss": 2.8077, + "step": 22249 + }, + { + "epoch": 1.3812154696132597, + "grad_norm": 0.1464438445922512, + "learning_rate": 6.538420015727238e-05, + "loss": 2.8663, + "step": 22250 + }, + { + "epoch": 1.3812775467130176, + "grad_norm": 0.1468120597347188, + "learning_rate": 6.538076379306132e-05, + "loss": 2.8764, + "step": 22251 + }, + { + "epoch": 1.3813396238127755, + "grad_norm": 0.14055317130412565, + "learning_rate": 6.537732734860507e-05, + "loss": 2.8137, + "step": 22252 + }, + { + "epoch": 1.3814017009125334, + "grad_norm": 0.16482753854738205, + "learning_rate": 6.537389082392157e-05, + "loss": 2.8566, + "step": 22253 + }, + { + "epoch": 1.3814637780122911, + "grad_norm": 0.14272197234164272, + "learning_rate": 6.537045421902874e-05, + "loss": 2.8544, + "step": 22254 + }, + { + "epoch": 1.381525855112049, + "grad_norm": 0.1508974662648781, + "learning_rate": 6.536701753394454e-05, + "loss": 2.7633, + "step": 22255 + }, + { + "epoch": 1.381587932211807, + "grad_norm": 0.1474784183877491, + "learning_rate": 6.536358076868688e-05, + "loss": 2.8657, + "step": 22256 + }, + { + "epoch": 1.381650009311565, + "grad_norm": 0.16685779361384495, + "learning_rate": 6.536014392327366e-05, + "loss": 2.7793, + "step": 22257 + }, + { + "epoch": 1.3817120864113228, + "grad_norm": 0.17520565525515042, + "learning_rate": 6.535670699772286e-05, + "loss": 2.7823, + "step": 22258 + }, + { + "epoch": 1.3817741635110807, + "grad_norm": 0.16075485870180764, + "learning_rate": 6.535326999205238e-05, + "loss": 2.7901, + "step": 22259 + }, + { + "epoch": 1.3818362406108387, + "grad_norm": 0.17276669698042638, + "learning_rate": 6.534983290628017e-05, + "loss": 2.8157, + "step": 22260 + }, + { + "epoch": 1.3818983177105966, + "grad_norm": 0.17504148587586274, + "learning_rate": 6.534639574042416e-05, + "loss": 2.8377, + "step": 22261 + }, + { + "epoch": 1.3819603948103545, + "grad_norm": 0.16082754406573468, + "learning_rate": 6.534295849450227e-05, + "loss": 2.8047, + "step": 22262 + }, + { + "epoch": 1.3820224719101124, + "grad_norm": 0.22518729619946815, + "learning_rate": 6.533952116853245e-05, + "loss": 2.8499, + "step": 22263 + }, + { + "epoch": 1.3820845490098703, + "grad_norm": 0.17612651190166356, + "learning_rate": 6.533608376253262e-05, + "loss": 2.8636, + "step": 22264 + }, + { + "epoch": 1.3821466261096282, + "grad_norm": 0.18785187367242753, + "learning_rate": 6.533264627652072e-05, + "loss": 2.864, + "step": 22265 + }, + { + "epoch": 1.3822087032093862, + "grad_norm": 0.16689795350505154, + "learning_rate": 6.532920871051466e-05, + "loss": 2.8736, + "step": 22266 + }, + { + "epoch": 1.382270780309144, + "grad_norm": 0.1514180866925246, + "learning_rate": 6.532577106453243e-05, + "loss": 2.7962, + "step": 22267 + }, + { + "epoch": 1.382332857408902, + "grad_norm": 0.16750725290089255, + "learning_rate": 6.53223333385919e-05, + "loss": 2.8267, + "step": 22268 + }, + { + "epoch": 1.3823949345086597, + "grad_norm": 0.1556550560598912, + "learning_rate": 6.531889553271107e-05, + "loss": 2.7833, + "step": 22269 + }, + { + "epoch": 1.3824570116084176, + "grad_norm": 0.19972835137834277, + "learning_rate": 6.531545764690782e-05, + "loss": 2.8272, + "step": 22270 + }, + { + "epoch": 1.3825190887081755, + "grad_norm": 0.18328035268214313, + "learning_rate": 6.531201968120012e-05, + "loss": 2.8432, + "step": 22271 + }, + { + "epoch": 1.3825811658079334, + "grad_norm": 0.14927293931931135, + "learning_rate": 6.53085816356059e-05, + "loss": 2.8225, + "step": 22272 + }, + { + "epoch": 1.3826432429076914, + "grad_norm": 0.22253864930435774, + "learning_rate": 6.530514351014309e-05, + "loss": 2.8943, + "step": 22273 + }, + { + "epoch": 1.3827053200074493, + "grad_norm": 0.1535177114213873, + "learning_rate": 6.530170530482963e-05, + "loss": 2.8702, + "step": 22274 + }, + { + "epoch": 1.3827673971072072, + "grad_norm": 0.15478412736135574, + "learning_rate": 6.529826701968345e-05, + "loss": 2.8436, + "step": 22275 + }, + { + "epoch": 1.3828294742069651, + "grad_norm": 0.15634898983960266, + "learning_rate": 6.529482865472252e-05, + "loss": 2.819, + "step": 22276 + }, + { + "epoch": 1.382891551306723, + "grad_norm": 0.16164493096886354, + "learning_rate": 6.529139020996473e-05, + "loss": 2.8701, + "step": 22277 + }, + { + "epoch": 1.3829536284064807, + "grad_norm": 0.146457027425637, + "learning_rate": 6.528795168542805e-05, + "loss": 2.985, + "step": 22278 + }, + { + "epoch": 1.3830157055062386, + "grad_norm": 0.15668777293739544, + "learning_rate": 6.528451308113042e-05, + "loss": 2.9093, + "step": 22279 + }, + { + "epoch": 1.3830777826059966, + "grad_norm": 0.15397949412297346, + "learning_rate": 6.528107439708978e-05, + "loss": 2.8162, + "step": 22280 + }, + { + "epoch": 1.3831398597057545, + "grad_norm": 0.1517289115751808, + "learning_rate": 6.527763563332406e-05, + "loss": 2.8107, + "step": 22281 + }, + { + "epoch": 1.3832019368055124, + "grad_norm": 0.16828273657422857, + "learning_rate": 6.527419678985119e-05, + "loss": 2.8488, + "step": 22282 + }, + { + "epoch": 1.3832640139052703, + "grad_norm": 0.1606741822772736, + "learning_rate": 6.527075786668915e-05, + "loss": 2.8164, + "step": 22283 + }, + { + "epoch": 1.3833260910050282, + "grad_norm": 0.15311855356115153, + "learning_rate": 6.526731886385585e-05, + "loss": 2.8673, + "step": 22284 + }, + { + "epoch": 1.3833881681047862, + "grad_norm": 0.16533898858868012, + "learning_rate": 6.526387978136926e-05, + "loss": 2.8456, + "step": 22285 + }, + { + "epoch": 1.383450245204544, + "grad_norm": 0.1549238131445661, + "learning_rate": 6.526044061924727e-05, + "loss": 2.8488, + "step": 22286 + }, + { + "epoch": 1.383512322304302, + "grad_norm": 0.1553764601283621, + "learning_rate": 6.525700137750789e-05, + "loss": 2.8349, + "step": 22287 + }, + { + "epoch": 1.38357439940406, + "grad_norm": 0.1438482034624909, + "learning_rate": 6.5253562056169e-05, + "loss": 2.8216, + "step": 22288 + }, + { + "epoch": 1.3836364765038178, + "grad_norm": 0.15474069119119652, + "learning_rate": 6.52501226552486e-05, + "loss": 2.783, + "step": 22289 + }, + { + "epoch": 1.3836985536035757, + "grad_norm": 0.18666048657207554, + "learning_rate": 6.52466831747646e-05, + "loss": 2.9036, + "step": 22290 + }, + { + "epoch": 1.3837606307033337, + "grad_norm": 0.1569640414332791, + "learning_rate": 6.524324361473495e-05, + "loss": 2.7936, + "step": 22291 + }, + { + "epoch": 1.3838227078030914, + "grad_norm": 0.14361031421183668, + "learning_rate": 6.523980397517759e-05, + "loss": 2.8135, + "step": 22292 + }, + { + "epoch": 1.3838847849028493, + "grad_norm": 0.16869402813726153, + "learning_rate": 6.523636425611049e-05, + "loss": 2.8824, + "step": 22293 + }, + { + "epoch": 1.3839468620026072, + "grad_norm": 0.14841948575583516, + "learning_rate": 6.523292445755157e-05, + "loss": 2.9547, + "step": 22294 + }, + { + "epoch": 1.384008939102365, + "grad_norm": 0.15853469153018077, + "learning_rate": 6.522948457951877e-05, + "loss": 2.9198, + "step": 22295 + }, + { + "epoch": 1.384071016202123, + "grad_norm": 0.15412220544671545, + "learning_rate": 6.522604462203009e-05, + "loss": 2.8158, + "step": 22296 + }, + { + "epoch": 1.384133093301881, + "grad_norm": 0.16171435981356863, + "learning_rate": 6.52226045851034e-05, + "loss": 2.7775, + "step": 22297 + }, + { + "epoch": 1.3841951704016389, + "grad_norm": 0.15601187084000226, + "learning_rate": 6.521916446875669e-05, + "loss": 2.763, + "step": 22298 + }, + { + "epoch": 1.3842572475013968, + "grad_norm": 0.15764927165274234, + "learning_rate": 6.521572427300793e-05, + "loss": 2.7299, + "step": 22299 + }, + { + "epoch": 1.3843193246011547, + "grad_norm": 0.17131351744395934, + "learning_rate": 6.5212283997875e-05, + "loss": 2.9492, + "step": 22300 + }, + { + "epoch": 1.3843814017009124, + "grad_norm": 0.15626159863338274, + "learning_rate": 6.520884364337593e-05, + "loss": 2.819, + "step": 22301 + }, + { + "epoch": 1.3844434788006703, + "grad_norm": 0.15506779951028987, + "learning_rate": 6.520540320952863e-05, + "loss": 2.8232, + "step": 22302 + }, + { + "epoch": 1.3845055559004282, + "grad_norm": 0.15242891498733774, + "learning_rate": 6.520196269635104e-05, + "loss": 2.8148, + "step": 22303 + }, + { + "epoch": 1.3845676330001861, + "grad_norm": 0.16941533278854748, + "learning_rate": 6.519852210386112e-05, + "loss": 2.9057, + "step": 22304 + }, + { + "epoch": 1.384629710099944, + "grad_norm": 0.15072307131340493, + "learning_rate": 6.519508143207682e-05, + "loss": 2.7816, + "step": 22305 + }, + { + "epoch": 1.384691787199702, + "grad_norm": 0.15064498520472017, + "learning_rate": 6.519164068101609e-05, + "loss": 2.8843, + "step": 22306 + }, + { + "epoch": 1.38475386429946, + "grad_norm": 0.1520439401213382, + "learning_rate": 6.518819985069689e-05, + "loss": 2.8179, + "step": 22307 + }, + { + "epoch": 1.3848159413992178, + "grad_norm": 0.17445335564980333, + "learning_rate": 6.518475894113715e-05, + "loss": 2.8085, + "step": 22308 + }, + { + "epoch": 1.3848780184989757, + "grad_norm": 0.15006905396753095, + "learning_rate": 6.518131795235486e-05, + "loss": 2.791, + "step": 22309 + }, + { + "epoch": 1.3849400955987337, + "grad_norm": 0.17465806384636942, + "learning_rate": 6.517787688436792e-05, + "loss": 2.8479, + "step": 22310 + }, + { + "epoch": 1.3850021726984916, + "grad_norm": 0.1544369894348869, + "learning_rate": 6.517443573719434e-05, + "loss": 2.8828, + "step": 22311 + }, + { + "epoch": 1.3850642497982495, + "grad_norm": 0.14552372327004331, + "learning_rate": 6.517099451085204e-05, + "loss": 2.8837, + "step": 22312 + }, + { + "epoch": 1.3851263268980074, + "grad_norm": 0.1568476630990312, + "learning_rate": 6.516755320535897e-05, + "loss": 2.8835, + "step": 22313 + }, + { + "epoch": 1.3851884039977653, + "grad_norm": 0.14493010232814743, + "learning_rate": 6.51641118207331e-05, + "loss": 2.9377, + "step": 22314 + }, + { + "epoch": 1.3852504810975232, + "grad_norm": 0.15458364034461136, + "learning_rate": 6.516067035699235e-05, + "loss": 2.8797, + "step": 22315 + }, + { + "epoch": 1.385312558197281, + "grad_norm": 0.14581220905260212, + "learning_rate": 6.515722881415473e-05, + "loss": 2.7229, + "step": 22316 + }, + { + "epoch": 1.3853746352970389, + "grad_norm": 0.17828887336406335, + "learning_rate": 6.515378719223816e-05, + "loss": 2.8777, + "step": 22317 + }, + { + "epoch": 1.3854367123967968, + "grad_norm": 0.15843246461364396, + "learning_rate": 6.51503454912606e-05, + "loss": 2.869, + "step": 22318 + }, + { + "epoch": 1.3854987894965547, + "grad_norm": 0.1471653685564714, + "learning_rate": 6.514690371124001e-05, + "loss": 2.8488, + "step": 22319 + }, + { + "epoch": 1.3855608665963126, + "grad_norm": 0.15063405461815904, + "learning_rate": 6.514346185219436e-05, + "loss": 2.7694, + "step": 22320 + }, + { + "epoch": 1.3856229436960705, + "grad_norm": 0.154785053963131, + "learning_rate": 6.514001991414159e-05, + "loss": 2.9331, + "step": 22321 + }, + { + "epoch": 1.3856850207958284, + "grad_norm": 0.14570123502512908, + "learning_rate": 6.513657789709963e-05, + "loss": 2.7806, + "step": 22322 + }, + { + "epoch": 1.3857470978955864, + "grad_norm": 0.15815879059284665, + "learning_rate": 6.513313580108651e-05, + "loss": 2.8399, + "step": 22323 + }, + { + "epoch": 1.3858091749953443, + "grad_norm": 0.14059769179844211, + "learning_rate": 6.512969362612013e-05, + "loss": 2.8275, + "step": 22324 + }, + { + "epoch": 1.385871252095102, + "grad_norm": 0.14556004399643185, + "learning_rate": 6.512625137221846e-05, + "loss": 2.7559, + "step": 22325 + }, + { + "epoch": 1.38593332919486, + "grad_norm": 0.14973747556004563, + "learning_rate": 6.512280903939947e-05, + "loss": 2.8991, + "step": 22326 + }, + { + "epoch": 1.3859954062946178, + "grad_norm": 0.20494484884994668, + "learning_rate": 6.511936662768112e-05, + "loss": 2.8131, + "step": 22327 + }, + { + "epoch": 1.3860574833943757, + "grad_norm": 0.15643856472174278, + "learning_rate": 6.511592413708136e-05, + "loss": 2.8632, + "step": 22328 + }, + { + "epoch": 1.3861195604941337, + "grad_norm": 0.19370051836538696, + "learning_rate": 6.511248156761815e-05, + "loss": 2.8687, + "step": 22329 + }, + { + "epoch": 1.3861816375938916, + "grad_norm": 0.17707409670994018, + "learning_rate": 6.510903891930946e-05, + "loss": 2.8006, + "step": 22330 + }, + { + "epoch": 1.3862437146936495, + "grad_norm": 0.18349518057538605, + "learning_rate": 6.510559619217323e-05, + "loss": 2.7915, + "step": 22331 + }, + { + "epoch": 1.3863057917934074, + "grad_norm": 0.1617684283408342, + "learning_rate": 6.510215338622745e-05, + "loss": 2.7909, + "step": 22332 + }, + { + "epoch": 1.3863678688931653, + "grad_norm": 0.19194686214925874, + "learning_rate": 6.509871050149007e-05, + "loss": 2.7583, + "step": 22333 + }, + { + "epoch": 1.3864299459929232, + "grad_norm": 0.17794111824376616, + "learning_rate": 6.509526753797906e-05, + "loss": 2.8012, + "step": 22334 + }, + { + "epoch": 1.3864920230926812, + "grad_norm": 0.21422556457977118, + "learning_rate": 6.509182449571238e-05, + "loss": 2.883, + "step": 22335 + }, + { + "epoch": 1.386554100192439, + "grad_norm": 0.18739867452793413, + "learning_rate": 6.508838137470796e-05, + "loss": 2.8504, + "step": 22336 + }, + { + "epoch": 1.386616177292197, + "grad_norm": 0.1643915906210078, + "learning_rate": 6.508493817498383e-05, + "loss": 2.8544, + "step": 22337 + }, + { + "epoch": 1.386678254391955, + "grad_norm": 0.16681732816165623, + "learning_rate": 6.508149489655787e-05, + "loss": 2.8088, + "step": 22338 + }, + { + "epoch": 1.3867403314917128, + "grad_norm": 0.17348324928375414, + "learning_rate": 6.507805153944813e-05, + "loss": 2.8099, + "step": 22339 + }, + { + "epoch": 1.3868024085914705, + "grad_norm": 0.16250256458422857, + "learning_rate": 6.50746081036725e-05, + "loss": 2.8814, + "step": 22340 + }, + { + "epoch": 1.3868644856912284, + "grad_norm": 0.17373500429937136, + "learning_rate": 6.5071164589249e-05, + "loss": 2.7909, + "step": 22341 + }, + { + "epoch": 1.3869265627909864, + "grad_norm": 0.16745827419369025, + "learning_rate": 6.506772099619557e-05, + "loss": 2.902, + "step": 22342 + }, + { + "epoch": 1.3869886398907443, + "grad_norm": 0.21656316755723107, + "learning_rate": 6.50642773245302e-05, + "loss": 2.8231, + "step": 22343 + }, + { + "epoch": 1.3870507169905022, + "grad_norm": 0.19738878050478112, + "learning_rate": 6.506083357427081e-05, + "loss": 2.9277, + "step": 22344 + }, + { + "epoch": 1.3871127940902601, + "grad_norm": 0.18921411322748757, + "learning_rate": 6.505738974543541e-05, + "loss": 2.872, + "step": 22345 + }, + { + "epoch": 1.387174871190018, + "grad_norm": 0.1818527398300556, + "learning_rate": 6.505394583804195e-05, + "loss": 2.8179, + "step": 22346 + }, + { + "epoch": 1.387236948289776, + "grad_norm": 0.147028820421129, + "learning_rate": 6.50505018521084e-05, + "loss": 2.8454, + "step": 22347 + }, + { + "epoch": 1.3872990253895339, + "grad_norm": 0.16802588311766278, + "learning_rate": 6.504705778765272e-05, + "loss": 2.8401, + "step": 22348 + }, + { + "epoch": 1.3873611024892916, + "grad_norm": 0.17892596331935748, + "learning_rate": 6.504361364469289e-05, + "loss": 2.8249, + "step": 22349 + }, + { + "epoch": 1.3874231795890495, + "grad_norm": 0.16860708105325986, + "learning_rate": 6.504016942324689e-05, + "loss": 2.918, + "step": 22350 + }, + { + "epoch": 1.3874852566888074, + "grad_norm": 0.17983849028080498, + "learning_rate": 6.503672512333267e-05, + "loss": 2.8374, + "step": 22351 + }, + { + "epoch": 1.3875473337885653, + "grad_norm": 0.15034756105826266, + "learning_rate": 6.503328074496819e-05, + "loss": 2.8597, + "step": 22352 + }, + { + "epoch": 1.3876094108883232, + "grad_norm": 0.1594413934585159, + "learning_rate": 6.502983628817145e-05, + "loss": 2.8959, + "step": 22353 + }, + { + "epoch": 1.3876714879880812, + "grad_norm": 0.17681018833160833, + "learning_rate": 6.502639175296042e-05, + "loss": 2.932, + "step": 22354 + }, + { + "epoch": 1.387733565087839, + "grad_norm": 0.15978348196169212, + "learning_rate": 6.502294713935303e-05, + "loss": 2.746, + "step": 22355 + }, + { + "epoch": 1.387795642187597, + "grad_norm": 0.15788943780549924, + "learning_rate": 6.501950244736728e-05, + "loss": 2.8303, + "step": 22356 + }, + { + "epoch": 1.387857719287355, + "grad_norm": 0.18566679843561756, + "learning_rate": 6.501605767702115e-05, + "loss": 2.8916, + "step": 22357 + }, + { + "epoch": 1.3879197963871128, + "grad_norm": 0.15385838262850618, + "learning_rate": 6.50126128283326e-05, + "loss": 2.8356, + "step": 22358 + }, + { + "epoch": 1.3879818734868707, + "grad_norm": 0.16529967255572894, + "learning_rate": 6.50091679013196e-05, + "loss": 2.8175, + "step": 22359 + }, + { + "epoch": 1.3880439505866287, + "grad_norm": 0.2368506102096278, + "learning_rate": 6.500572289600012e-05, + "loss": 2.8156, + "step": 22360 + }, + { + "epoch": 1.3881060276863866, + "grad_norm": 0.15034033813958256, + "learning_rate": 6.500227781239215e-05, + "loss": 2.8155, + "step": 22361 + }, + { + "epoch": 1.3881681047861445, + "grad_norm": 0.182610628958854, + "learning_rate": 6.499883265051366e-05, + "loss": 2.8589, + "step": 22362 + }, + { + "epoch": 1.3882301818859024, + "grad_norm": 0.1604604100731719, + "learning_rate": 6.49953874103826e-05, + "loss": 2.8633, + "step": 22363 + }, + { + "epoch": 1.38829225898566, + "grad_norm": 0.1847963614030511, + "learning_rate": 6.499194209201699e-05, + "loss": 2.8448, + "step": 22364 + }, + { + "epoch": 1.388354336085418, + "grad_norm": 0.17942536971035664, + "learning_rate": 6.498849669543475e-05, + "loss": 2.7802, + "step": 22365 + }, + { + "epoch": 1.388416413185176, + "grad_norm": 0.1600743235410838, + "learning_rate": 6.498505122065391e-05, + "loss": 2.8639, + "step": 22366 + }, + { + "epoch": 1.3884784902849339, + "grad_norm": 0.19134806702357432, + "learning_rate": 6.49816056676924e-05, + "loss": 2.8138, + "step": 22367 + }, + { + "epoch": 1.3885405673846918, + "grad_norm": 0.18915820024746358, + "learning_rate": 6.497816003656824e-05, + "loss": 2.8931, + "step": 22368 + }, + { + "epoch": 1.3886026444844497, + "grad_norm": 0.15852637880076426, + "learning_rate": 6.497471432729936e-05, + "loss": 2.7662, + "step": 22369 + }, + { + "epoch": 1.3886647215842076, + "grad_norm": 0.1958533523266578, + "learning_rate": 6.497126853990377e-05, + "loss": 2.8088, + "step": 22370 + }, + { + "epoch": 1.3887267986839655, + "grad_norm": 0.19175983389812623, + "learning_rate": 6.496782267439943e-05, + "loss": 2.9255, + "step": 22371 + }, + { + "epoch": 1.3887888757837235, + "grad_norm": 0.2157758865546182, + "learning_rate": 6.496437673080435e-05, + "loss": 2.8393, + "step": 22372 + }, + { + "epoch": 1.3888509528834811, + "grad_norm": 0.15950309920700617, + "learning_rate": 6.496093070913647e-05, + "loss": 2.8436, + "step": 22373 + }, + { + "epoch": 1.388913029983239, + "grad_norm": 0.1701840209192059, + "learning_rate": 6.495748460941376e-05, + "loss": 2.8737, + "step": 22374 + }, + { + "epoch": 1.388975107082997, + "grad_norm": 0.1627000137144457, + "learning_rate": 6.495403843165425e-05, + "loss": 2.7848, + "step": 22375 + }, + { + "epoch": 1.389037184182755, + "grad_norm": 0.1509367020410173, + "learning_rate": 6.495059217587587e-05, + "loss": 2.7542, + "step": 22376 + }, + { + "epoch": 1.3890992612825128, + "grad_norm": 0.15201516662279177, + "learning_rate": 6.494714584209665e-05, + "loss": 2.7804, + "step": 22377 + }, + { + "epoch": 1.3891613383822707, + "grad_norm": 0.1618589512222624, + "learning_rate": 6.494369943033451e-05, + "loss": 2.7679, + "step": 22378 + }, + { + "epoch": 1.3892234154820287, + "grad_norm": 0.19644060500552613, + "learning_rate": 6.494025294060749e-05, + "loss": 2.7835, + "step": 22379 + }, + { + "epoch": 1.3892854925817866, + "grad_norm": 0.1538142793603626, + "learning_rate": 6.493680637293353e-05, + "loss": 2.8216, + "step": 22380 + }, + { + "epoch": 1.3893475696815445, + "grad_norm": 0.1445777534728274, + "learning_rate": 6.493335972733062e-05, + "loss": 2.7254, + "step": 22381 + }, + { + "epoch": 1.3894096467813024, + "grad_norm": 0.171276921371568, + "learning_rate": 6.492991300381676e-05, + "loss": 2.8646, + "step": 22382 + }, + { + "epoch": 1.3894717238810603, + "grad_norm": 0.13770568754460208, + "learning_rate": 6.49264662024099e-05, + "loss": 2.7913, + "step": 22383 + }, + { + "epoch": 1.3895338009808182, + "grad_norm": 0.15341759893517265, + "learning_rate": 6.492301932312806e-05, + "loss": 2.8565, + "step": 22384 + }, + { + "epoch": 1.3895958780805762, + "grad_norm": 0.15802436902192257, + "learning_rate": 6.49195723659892e-05, + "loss": 2.8321, + "step": 22385 + }, + { + "epoch": 1.389657955180334, + "grad_norm": 0.15513511710677078, + "learning_rate": 6.491612533101132e-05, + "loss": 2.7846, + "step": 22386 + }, + { + "epoch": 1.389720032280092, + "grad_norm": 0.14991048859080477, + "learning_rate": 6.491267821821236e-05, + "loss": 2.853, + "step": 22387 + }, + { + "epoch": 1.3897821093798497, + "grad_norm": 0.14543698014124215, + "learning_rate": 6.490923102761037e-05, + "loss": 2.8369, + "step": 22388 + }, + { + "epoch": 1.3898441864796076, + "grad_norm": 0.15486951989838052, + "learning_rate": 6.49057837592233e-05, + "loss": 2.8187, + "step": 22389 + }, + { + "epoch": 1.3899062635793655, + "grad_norm": 0.1434341950025418, + "learning_rate": 6.490233641306914e-05, + "loss": 2.8169, + "step": 22390 + }, + { + "epoch": 1.3899683406791234, + "grad_norm": 0.15280558201863345, + "learning_rate": 6.489888898916585e-05, + "loss": 2.7618, + "step": 22391 + }, + { + "epoch": 1.3900304177788814, + "grad_norm": 0.1682373217376625, + "learning_rate": 6.489544148753145e-05, + "loss": 2.9054, + "step": 22392 + }, + { + "epoch": 1.3900924948786393, + "grad_norm": 0.16165644210156557, + "learning_rate": 6.489199390818392e-05, + "loss": 2.8373, + "step": 22393 + }, + { + "epoch": 1.3901545719783972, + "grad_norm": 0.16493677886018884, + "learning_rate": 6.488854625114125e-05, + "loss": 2.9098, + "step": 22394 + }, + { + "epoch": 1.3902166490781551, + "grad_norm": 0.1435608311071616, + "learning_rate": 6.48850985164214e-05, + "loss": 2.8366, + "step": 22395 + }, + { + "epoch": 1.390278726177913, + "grad_norm": 0.170763981924251, + "learning_rate": 6.488165070404237e-05, + "loss": 2.8393, + "step": 22396 + }, + { + "epoch": 1.3903408032776707, + "grad_norm": 0.14958644098539906, + "learning_rate": 6.48782028140222e-05, + "loss": 2.8568, + "step": 22397 + }, + { + "epoch": 1.3904028803774287, + "grad_norm": 0.15986708378449394, + "learning_rate": 6.48747548463788e-05, + "loss": 2.9142, + "step": 22398 + }, + { + "epoch": 1.3904649574771866, + "grad_norm": 0.19558724153242807, + "learning_rate": 6.487130680113019e-05, + "loss": 2.7532, + "step": 22399 + }, + { + "epoch": 1.3905270345769445, + "grad_norm": 0.16594596972285308, + "learning_rate": 6.486785867829439e-05, + "loss": 2.7892, + "step": 22400 + }, + { + "epoch": 1.3905891116767024, + "grad_norm": 0.15492045268657723, + "learning_rate": 6.486441047788933e-05, + "loss": 2.7522, + "step": 22401 + }, + { + "epoch": 1.3906511887764603, + "grad_norm": 0.1799797604526734, + "learning_rate": 6.486096219993306e-05, + "loss": 2.6881, + "step": 22402 + }, + { + "epoch": 1.3907132658762182, + "grad_norm": 0.1767404663679314, + "learning_rate": 6.485751384444352e-05, + "loss": 2.83, + "step": 22403 + }, + { + "epoch": 1.3907753429759762, + "grad_norm": 0.1800187519855815, + "learning_rate": 6.485406541143874e-05, + "loss": 2.8571, + "step": 22404 + }, + { + "epoch": 1.390837420075734, + "grad_norm": 0.17506177739937698, + "learning_rate": 6.485061690093668e-05, + "loss": 2.8899, + "step": 22405 + }, + { + "epoch": 1.390899497175492, + "grad_norm": 0.182927831314122, + "learning_rate": 6.484716831295537e-05, + "loss": 2.8921, + "step": 22406 + }, + { + "epoch": 1.39096157427525, + "grad_norm": 0.16007516715806427, + "learning_rate": 6.484371964751275e-05, + "loss": 2.8417, + "step": 22407 + }, + { + "epoch": 1.3910236513750078, + "grad_norm": 0.17718516371799758, + "learning_rate": 6.484027090462687e-05, + "loss": 2.7132, + "step": 22408 + }, + { + "epoch": 1.3910857284747657, + "grad_norm": 0.18468380017455524, + "learning_rate": 6.483682208431567e-05, + "loss": 2.8054, + "step": 22409 + }, + { + "epoch": 1.3911478055745237, + "grad_norm": 0.17180128575864997, + "learning_rate": 6.483337318659718e-05, + "loss": 2.884, + "step": 22410 + }, + { + "epoch": 1.3912098826742816, + "grad_norm": 0.14268465504084638, + "learning_rate": 6.482992421148937e-05, + "loss": 2.7698, + "step": 22411 + }, + { + "epoch": 1.3912719597740393, + "grad_norm": 0.1872952256980169, + "learning_rate": 6.482647515901026e-05, + "loss": 2.8796, + "step": 22412 + }, + { + "epoch": 1.3913340368737972, + "grad_norm": 0.17485902250811886, + "learning_rate": 6.482302602917781e-05, + "loss": 2.7938, + "step": 22413 + }, + { + "epoch": 1.3913961139735551, + "grad_norm": 0.18488660081906913, + "learning_rate": 6.481957682201006e-05, + "loss": 2.8548, + "step": 22414 + }, + { + "epoch": 1.391458191073313, + "grad_norm": 0.1629937533859369, + "learning_rate": 6.481612753752496e-05, + "loss": 2.9426, + "step": 22415 + }, + { + "epoch": 1.391520268173071, + "grad_norm": 0.1730775864002057, + "learning_rate": 6.481267817574054e-05, + "loss": 2.8217, + "step": 22416 + }, + { + "epoch": 1.3915823452728289, + "grad_norm": 0.15186315314668733, + "learning_rate": 6.480922873667477e-05, + "loss": 2.86, + "step": 22417 + }, + { + "epoch": 1.3916444223725868, + "grad_norm": 0.16449870295723437, + "learning_rate": 6.480577922034566e-05, + "loss": 2.8704, + "step": 22418 + }, + { + "epoch": 1.3917064994723447, + "grad_norm": 0.16225046888748035, + "learning_rate": 6.480232962677121e-05, + "loss": 2.7177, + "step": 22419 + }, + { + "epoch": 1.3917685765721026, + "grad_norm": 0.17007144286662232, + "learning_rate": 6.479887995596942e-05, + "loss": 2.8448, + "step": 22420 + }, + { + "epoch": 1.3918306536718603, + "grad_norm": 0.16588631249928984, + "learning_rate": 6.479543020795827e-05, + "loss": 2.7603, + "step": 22421 + }, + { + "epoch": 1.3918927307716182, + "grad_norm": 0.1614055463936196, + "learning_rate": 6.479198038275578e-05, + "loss": 2.7901, + "step": 22422 + }, + { + "epoch": 1.3919548078713762, + "grad_norm": 0.18209669835741513, + "learning_rate": 6.478853048037991e-05, + "loss": 2.8439, + "step": 22423 + }, + { + "epoch": 1.392016884971134, + "grad_norm": 0.15258930663405476, + "learning_rate": 6.478508050084871e-05, + "loss": 2.8427, + "step": 22424 + }, + { + "epoch": 1.392078962070892, + "grad_norm": 0.16752696156576588, + "learning_rate": 6.478163044418015e-05, + "loss": 2.9107, + "step": 22425 + }, + { + "epoch": 1.39214103917065, + "grad_norm": 0.1458540451861804, + "learning_rate": 6.477818031039225e-05, + "loss": 2.8427, + "step": 22426 + }, + { + "epoch": 1.3922031162704078, + "grad_norm": 0.167208353194893, + "learning_rate": 6.477473009950299e-05, + "loss": 2.7904, + "step": 22427 + }, + { + "epoch": 1.3922651933701657, + "grad_norm": 0.21139341110975124, + "learning_rate": 6.477127981153036e-05, + "loss": 2.9145, + "step": 22428 + }, + { + "epoch": 1.3923272704699237, + "grad_norm": 0.14509256367072101, + "learning_rate": 6.476782944649238e-05, + "loss": 2.7907, + "step": 22429 + }, + { + "epoch": 1.3923893475696816, + "grad_norm": 0.1406612548187739, + "learning_rate": 6.476437900440705e-05, + "loss": 2.7345, + "step": 22430 + }, + { + "epoch": 1.3924514246694395, + "grad_norm": 0.14287541370548884, + "learning_rate": 6.476092848529238e-05, + "loss": 2.7535, + "step": 22431 + }, + { + "epoch": 1.3925135017691974, + "grad_norm": 0.15072632762772198, + "learning_rate": 6.475747788916637e-05, + "loss": 2.866, + "step": 22432 + }, + { + "epoch": 1.3925755788689553, + "grad_norm": 0.15431876570502334, + "learning_rate": 6.4754027216047e-05, + "loss": 2.8914, + "step": 22433 + }, + { + "epoch": 1.3926376559687133, + "grad_norm": 0.16925326708587313, + "learning_rate": 6.47505764659523e-05, + "loss": 2.9, + "step": 22434 + }, + { + "epoch": 1.3926997330684712, + "grad_norm": 0.14753478140221432, + "learning_rate": 6.474712563890024e-05, + "loss": 2.8119, + "step": 22435 + }, + { + "epoch": 1.3927618101682289, + "grad_norm": 0.1683414283710611, + "learning_rate": 6.474367473490888e-05, + "loss": 2.8671, + "step": 22436 + }, + { + "epoch": 1.3928238872679868, + "grad_norm": 0.16799755182178328, + "learning_rate": 6.474022375399616e-05, + "loss": 2.8301, + "step": 22437 + }, + { + "epoch": 1.3928859643677447, + "grad_norm": 0.15199606908872673, + "learning_rate": 6.473677269618012e-05, + "loss": 2.841, + "step": 22438 + }, + { + "epoch": 1.3929480414675026, + "grad_norm": 0.14972478682900447, + "learning_rate": 6.473332156147877e-05, + "loss": 2.9368, + "step": 22439 + }, + { + "epoch": 1.3930101185672605, + "grad_norm": 0.14877667701750677, + "learning_rate": 6.47298703499101e-05, + "loss": 2.8837, + "step": 22440 + }, + { + "epoch": 1.3930721956670185, + "grad_norm": 0.14949874721994147, + "learning_rate": 6.472641906149211e-05, + "loss": 2.8723, + "step": 22441 + }, + { + "epoch": 1.3931342727667764, + "grad_norm": 0.1646539268619362, + "learning_rate": 6.472296769624283e-05, + "loss": 2.7949, + "step": 22442 + }, + { + "epoch": 1.3931963498665343, + "grad_norm": 0.15273357303340546, + "learning_rate": 6.471951625418025e-05, + "loss": 2.8665, + "step": 22443 + }, + { + "epoch": 1.3932584269662922, + "grad_norm": 0.14659283755101202, + "learning_rate": 6.471606473532238e-05, + "loss": 2.8108, + "step": 22444 + }, + { + "epoch": 1.39332050406605, + "grad_norm": 0.24525728819261292, + "learning_rate": 6.471261313968723e-05, + "loss": 2.9108, + "step": 22445 + }, + { + "epoch": 1.3933825811658078, + "grad_norm": 0.1607289599680813, + "learning_rate": 6.470916146729279e-05, + "loss": 2.8254, + "step": 22446 + }, + { + "epoch": 1.3934446582655657, + "grad_norm": 0.16181640118522383, + "learning_rate": 6.47057097181571e-05, + "loss": 2.869, + "step": 22447 + }, + { + "epoch": 1.3935067353653237, + "grad_norm": 0.16158089613265053, + "learning_rate": 6.470225789229815e-05, + "loss": 2.849, + "step": 22448 + }, + { + "epoch": 1.3935688124650816, + "grad_norm": 0.1665815245193179, + "learning_rate": 6.469880598973395e-05, + "loss": 2.7806, + "step": 22449 + }, + { + "epoch": 1.3936308895648395, + "grad_norm": 0.14807362880886638, + "learning_rate": 6.46953540104825e-05, + "loss": 2.8176, + "step": 22450 + }, + { + "epoch": 1.3936929666645974, + "grad_norm": 0.16468708041895885, + "learning_rate": 6.469190195456184e-05, + "loss": 2.9077, + "step": 22451 + }, + { + "epoch": 1.3937550437643553, + "grad_norm": 0.14604775525822902, + "learning_rate": 6.468844982198996e-05, + "loss": 2.8693, + "step": 22452 + }, + { + "epoch": 1.3938171208641132, + "grad_norm": 0.17027658403171164, + "learning_rate": 6.468499761278485e-05, + "loss": 2.9044, + "step": 22453 + }, + { + "epoch": 1.3938791979638712, + "grad_norm": 0.19861131512376523, + "learning_rate": 6.468154532696457e-05, + "loss": 2.8317, + "step": 22454 + }, + { + "epoch": 1.393941275063629, + "grad_norm": 0.1894055139552185, + "learning_rate": 6.467809296454709e-05, + "loss": 2.8477, + "step": 22455 + }, + { + "epoch": 1.394003352163387, + "grad_norm": 0.1455068908252584, + "learning_rate": 6.467464052555044e-05, + "loss": 2.7791, + "step": 22456 + }, + { + "epoch": 1.394065429263145, + "grad_norm": 0.152940382650464, + "learning_rate": 6.467118800999262e-05, + "loss": 2.8291, + "step": 22457 + }, + { + "epoch": 1.3941275063629028, + "grad_norm": 0.17805690635017346, + "learning_rate": 6.466773541789166e-05, + "loss": 2.8973, + "step": 22458 + }, + { + "epoch": 1.3941895834626608, + "grad_norm": 0.17239013873276926, + "learning_rate": 6.466428274926556e-05, + "loss": 2.8018, + "step": 22459 + }, + { + "epoch": 1.3942516605624184, + "grad_norm": 0.18980249655260417, + "learning_rate": 6.466083000413234e-05, + "loss": 2.8952, + "step": 22460 + }, + { + "epoch": 1.3943137376621764, + "grad_norm": 0.16419631178610306, + "learning_rate": 6.465737718251e-05, + "loss": 2.7657, + "step": 22461 + }, + { + "epoch": 1.3943758147619343, + "grad_norm": 0.16895848702456268, + "learning_rate": 6.465392428441656e-05, + "loss": 2.8603, + "step": 22462 + }, + { + "epoch": 1.3944378918616922, + "grad_norm": 0.16026293295032756, + "learning_rate": 6.465047130987006e-05, + "loss": 2.7519, + "step": 22463 + }, + { + "epoch": 1.3944999689614501, + "grad_norm": 0.16085885774548717, + "learning_rate": 6.464701825888848e-05, + "loss": 2.7676, + "step": 22464 + }, + { + "epoch": 1.394562046061208, + "grad_norm": 0.17229740840054045, + "learning_rate": 6.464356513148987e-05, + "loss": 2.9567, + "step": 22465 + }, + { + "epoch": 1.394624123160966, + "grad_norm": 0.16555019659766895, + "learning_rate": 6.46401119276922e-05, + "loss": 2.795, + "step": 22466 + }, + { + "epoch": 1.3946862002607239, + "grad_norm": 0.213569660600356, + "learning_rate": 6.463665864751353e-05, + "loss": 2.8605, + "step": 22467 + }, + { + "epoch": 1.3947482773604818, + "grad_norm": 0.19659199279620768, + "learning_rate": 6.463320529097186e-05, + "loss": 2.8419, + "step": 22468 + }, + { + "epoch": 1.3948103544602395, + "grad_norm": 0.16118759611140948, + "learning_rate": 6.46297518580852e-05, + "loss": 2.8398, + "step": 22469 + }, + { + "epoch": 1.3948724315599974, + "grad_norm": 0.18482614376932457, + "learning_rate": 6.462629834887157e-05, + "loss": 2.8427, + "step": 22470 + }, + { + "epoch": 1.3949345086597553, + "grad_norm": 0.1460333428978817, + "learning_rate": 6.462284476334899e-05, + "loss": 2.7998, + "step": 22471 + }, + { + "epoch": 1.3949965857595132, + "grad_norm": 0.1530157636242362, + "learning_rate": 6.461939110153549e-05, + "loss": 2.7938, + "step": 22472 + }, + { + "epoch": 1.3950586628592712, + "grad_norm": 0.1551437135935104, + "learning_rate": 6.461593736344906e-05, + "loss": 2.8509, + "step": 22473 + }, + { + "epoch": 1.395120739959029, + "grad_norm": 0.152516987921424, + "learning_rate": 6.461248354910775e-05, + "loss": 2.8245, + "step": 22474 + }, + { + "epoch": 1.395182817058787, + "grad_norm": 0.14842646331915513, + "learning_rate": 6.460902965852956e-05, + "loss": 2.8736, + "step": 22475 + }, + { + "epoch": 1.395244894158545, + "grad_norm": 0.15481249450511098, + "learning_rate": 6.460557569173252e-05, + "loss": 2.8055, + "step": 22476 + }, + { + "epoch": 1.3953069712583028, + "grad_norm": 0.16352895993073144, + "learning_rate": 6.460212164873464e-05, + "loss": 2.8871, + "step": 22477 + }, + { + "epoch": 1.3953690483580607, + "grad_norm": 0.14327511845014512, + "learning_rate": 6.459866752955395e-05, + "loss": 2.7669, + "step": 22478 + }, + { + "epoch": 1.3954311254578187, + "grad_norm": 0.15759923849281907, + "learning_rate": 6.459521333420847e-05, + "loss": 2.932, + "step": 22479 + }, + { + "epoch": 1.3954932025575766, + "grad_norm": 0.15681446161179727, + "learning_rate": 6.459175906271621e-05, + "loss": 2.8087, + "step": 22480 + }, + { + "epoch": 1.3955552796573345, + "grad_norm": 0.17105437657444952, + "learning_rate": 6.458830471509521e-05, + "loss": 2.8183, + "step": 22481 + }, + { + "epoch": 1.3956173567570924, + "grad_norm": 0.16474423229384708, + "learning_rate": 6.458485029136347e-05, + "loss": 2.8959, + "step": 22482 + }, + { + "epoch": 1.3956794338568503, + "grad_norm": 0.15587370383573393, + "learning_rate": 6.458139579153904e-05, + "loss": 2.8626, + "step": 22483 + }, + { + "epoch": 1.395741510956608, + "grad_norm": 0.19414279675564217, + "learning_rate": 6.457794121563992e-05, + "loss": 2.7759, + "step": 22484 + }, + { + "epoch": 1.395803588056366, + "grad_norm": 0.14469236337920813, + "learning_rate": 6.457448656368415e-05, + "loss": 2.8754, + "step": 22485 + }, + { + "epoch": 1.3958656651561239, + "grad_norm": 0.16008216028074956, + "learning_rate": 6.457103183568974e-05, + "loss": 2.8492, + "step": 22486 + }, + { + "epoch": 1.3959277422558818, + "grad_norm": 0.1410026182373765, + "learning_rate": 6.456757703167473e-05, + "loss": 2.9046, + "step": 22487 + }, + { + "epoch": 1.3959898193556397, + "grad_norm": 0.1568103517633787, + "learning_rate": 6.456412215165713e-05, + "loss": 2.8599, + "step": 22488 + }, + { + "epoch": 1.3960518964553976, + "grad_norm": 0.15964854410987916, + "learning_rate": 6.456066719565497e-05, + "loss": 2.7607, + "step": 22489 + }, + { + "epoch": 1.3961139735551555, + "grad_norm": 0.16127987133065871, + "learning_rate": 6.455721216368626e-05, + "loss": 2.8554, + "step": 22490 + }, + { + "epoch": 1.3961760506549135, + "grad_norm": 0.1589543614187088, + "learning_rate": 6.455375705576904e-05, + "loss": 2.8173, + "step": 22491 + }, + { + "epoch": 1.3962381277546714, + "grad_norm": 0.15281153654212143, + "learning_rate": 6.455030187192136e-05, + "loss": 2.8659, + "step": 22492 + }, + { + "epoch": 1.396300204854429, + "grad_norm": 0.1606955867440256, + "learning_rate": 6.454684661216121e-05, + "loss": 2.7802, + "step": 22493 + }, + { + "epoch": 1.396362281954187, + "grad_norm": 0.15931469510333204, + "learning_rate": 6.454339127650663e-05, + "loss": 2.8921, + "step": 22494 + }, + { + "epoch": 1.396424359053945, + "grad_norm": 0.15692752928490572, + "learning_rate": 6.453993586497564e-05, + "loss": 2.8385, + "step": 22495 + }, + { + "epoch": 1.3964864361537028, + "grad_norm": 0.16996242324400646, + "learning_rate": 6.453648037758627e-05, + "loss": 2.8783, + "step": 22496 + }, + { + "epoch": 1.3965485132534607, + "grad_norm": 0.14958643475941635, + "learning_rate": 6.453302481435657e-05, + "loss": 2.8028, + "step": 22497 + }, + { + "epoch": 1.3966105903532187, + "grad_norm": 0.19504337843114125, + "learning_rate": 6.452956917530454e-05, + "loss": 2.7681, + "step": 22498 + }, + { + "epoch": 1.3966726674529766, + "grad_norm": 0.16646760673517064, + "learning_rate": 6.452611346044823e-05, + "loss": 2.8191, + "step": 22499 + }, + { + "epoch": 1.3967347445527345, + "grad_norm": 0.15691736869757109, + "learning_rate": 6.452265766980565e-05, + "loss": 2.745, + "step": 22500 + }, + { + "epoch": 1.3967968216524924, + "grad_norm": 0.16493833729862814, + "learning_rate": 6.451920180339486e-05, + "loss": 2.8641, + "step": 22501 + }, + { + "epoch": 1.3968588987522503, + "grad_norm": 0.19561284813599777, + "learning_rate": 6.451574586123383e-05, + "loss": 2.7475, + "step": 22502 + }, + { + "epoch": 1.3969209758520083, + "grad_norm": 0.16685846339667157, + "learning_rate": 6.451228984334066e-05, + "loss": 2.8408, + "step": 22503 + }, + { + "epoch": 1.3969830529517662, + "grad_norm": 0.1510801793742996, + "learning_rate": 6.450883374973336e-05, + "loss": 2.7089, + "step": 22504 + }, + { + "epoch": 1.397045130051524, + "grad_norm": 0.16084776432104037, + "learning_rate": 6.450537758042994e-05, + "loss": 2.8571, + "step": 22505 + }, + { + "epoch": 1.397107207151282, + "grad_norm": 0.16851760217588177, + "learning_rate": 6.450192133544843e-05, + "loss": 2.8383, + "step": 22506 + }, + { + "epoch": 1.39716928425104, + "grad_norm": 0.15839641442753197, + "learning_rate": 6.44984650148069e-05, + "loss": 2.8329, + "step": 22507 + }, + { + "epoch": 1.3972313613507976, + "grad_norm": 0.17619114873977448, + "learning_rate": 6.449500861852335e-05, + "loss": 2.9251, + "step": 22508 + }, + { + "epoch": 1.3972934384505555, + "grad_norm": 0.15340810436877214, + "learning_rate": 6.44915521466158e-05, + "loss": 2.8441, + "step": 22509 + }, + { + "epoch": 1.3973555155503135, + "grad_norm": 0.17740540923993992, + "learning_rate": 6.448809559910232e-05, + "loss": 2.84, + "step": 22510 + }, + { + "epoch": 1.3974175926500714, + "grad_norm": 0.17506599076116186, + "learning_rate": 6.448463897600092e-05, + "loss": 2.8632, + "step": 22511 + }, + { + "epoch": 1.3974796697498293, + "grad_norm": 0.15593683927812668, + "learning_rate": 6.448118227732967e-05, + "loss": 2.8117, + "step": 22512 + }, + { + "epoch": 1.3975417468495872, + "grad_norm": 0.15374629257547123, + "learning_rate": 6.447772550310655e-05, + "loss": 2.7743, + "step": 22513 + }, + { + "epoch": 1.3976038239493451, + "grad_norm": 0.16168579235657088, + "learning_rate": 6.447426865334963e-05, + "loss": 2.8873, + "step": 22514 + }, + { + "epoch": 1.397665901049103, + "grad_norm": 0.16673421174010014, + "learning_rate": 6.447081172807694e-05, + "loss": 2.8288, + "step": 22515 + }, + { + "epoch": 1.397727978148861, + "grad_norm": 0.17050797679845142, + "learning_rate": 6.44673547273065e-05, + "loss": 2.8897, + "step": 22516 + }, + { + "epoch": 1.3977900552486187, + "grad_norm": 0.14761128876391294, + "learning_rate": 6.446389765105639e-05, + "loss": 2.7593, + "step": 22517 + }, + { + "epoch": 1.3978521323483766, + "grad_norm": 0.15823893082995852, + "learning_rate": 6.446044049934456e-05, + "loss": 2.7645, + "step": 22518 + }, + { + "epoch": 1.3979142094481345, + "grad_norm": 0.1632267121672315, + "learning_rate": 6.445698327218916e-05, + "loss": 2.8746, + "step": 22519 + }, + { + "epoch": 1.3979762865478924, + "grad_norm": 0.14798474278775367, + "learning_rate": 6.445352596960813e-05, + "loss": 2.8389, + "step": 22520 + }, + { + "epoch": 1.3980383636476503, + "grad_norm": 0.14914253595325874, + "learning_rate": 6.445006859161956e-05, + "loss": 2.8199, + "step": 22521 + }, + { + "epoch": 1.3981004407474082, + "grad_norm": 0.14863663416597395, + "learning_rate": 6.444661113824147e-05, + "loss": 2.8474, + "step": 22522 + }, + { + "epoch": 1.3981625178471662, + "grad_norm": 0.155064874606649, + "learning_rate": 6.444315360949191e-05, + "loss": 2.757, + "step": 22523 + }, + { + "epoch": 1.398224594946924, + "grad_norm": 0.14729512599644373, + "learning_rate": 6.44396960053889e-05, + "loss": 2.8234, + "step": 22524 + }, + { + "epoch": 1.398286672046682, + "grad_norm": 0.1578333973031819, + "learning_rate": 6.44362383259505e-05, + "loss": 2.8208, + "step": 22525 + }, + { + "epoch": 1.39834874914644, + "grad_norm": 0.17047385743948879, + "learning_rate": 6.443278057119475e-05, + "loss": 2.871, + "step": 22526 + }, + { + "epoch": 1.3984108262461978, + "grad_norm": 0.15873471769417366, + "learning_rate": 6.442932274113967e-05, + "loss": 2.8192, + "step": 22527 + }, + { + "epoch": 1.3984729033459558, + "grad_norm": 0.16569456020776013, + "learning_rate": 6.442586483580328e-05, + "loss": 2.8063, + "step": 22528 + }, + { + "epoch": 1.3985349804457137, + "grad_norm": 0.16150744842213688, + "learning_rate": 6.442240685520369e-05, + "loss": 2.8851, + "step": 22529 + }, + { + "epoch": 1.3985970575454716, + "grad_norm": 0.15797766762191232, + "learning_rate": 6.44189487993589e-05, + "loss": 2.877, + "step": 22530 + }, + { + "epoch": 1.3986591346452295, + "grad_norm": 0.1635745318964907, + "learning_rate": 6.441549066828694e-05, + "loss": 2.8193, + "step": 22531 + }, + { + "epoch": 1.3987212117449872, + "grad_norm": 0.14414393878632137, + "learning_rate": 6.441203246200587e-05, + "loss": 2.7815, + "step": 22532 + }, + { + "epoch": 1.3987832888447451, + "grad_norm": 0.15312872293384003, + "learning_rate": 6.440857418053374e-05, + "loss": 2.8067, + "step": 22533 + }, + { + "epoch": 1.398845365944503, + "grad_norm": 0.1716870287908525, + "learning_rate": 6.440511582388856e-05, + "loss": 2.8676, + "step": 22534 + }, + { + "epoch": 1.398907443044261, + "grad_norm": 0.15240620897733298, + "learning_rate": 6.440165739208839e-05, + "loss": 2.8372, + "step": 22535 + }, + { + "epoch": 1.3989695201440189, + "grad_norm": 0.22418055026201936, + "learning_rate": 6.43981988851513e-05, + "loss": 2.8365, + "step": 22536 + }, + { + "epoch": 1.3990315972437768, + "grad_norm": 0.15206785773768508, + "learning_rate": 6.439474030309531e-05, + "loss": 2.8831, + "step": 22537 + }, + { + "epoch": 1.3990936743435347, + "grad_norm": 0.16986413500842013, + "learning_rate": 6.439128164593845e-05, + "loss": 2.8588, + "step": 22538 + }, + { + "epoch": 1.3991557514432926, + "grad_norm": 0.16468463174070314, + "learning_rate": 6.438782291369879e-05, + "loss": 2.825, + "step": 22539 + }, + { + "epoch": 1.3992178285430505, + "grad_norm": 0.15096040641731917, + "learning_rate": 6.438436410639435e-05, + "loss": 2.7152, + "step": 22540 + }, + { + "epoch": 1.3992799056428082, + "grad_norm": 0.16722275059291078, + "learning_rate": 6.43809052240432e-05, + "loss": 2.8343, + "step": 22541 + }, + { + "epoch": 1.3993419827425662, + "grad_norm": 0.14814907838988256, + "learning_rate": 6.437744626666339e-05, + "loss": 2.9034, + "step": 22542 + }, + { + "epoch": 1.399404059842324, + "grad_norm": 0.1548997292534258, + "learning_rate": 6.437398723427294e-05, + "loss": 2.8115, + "step": 22543 + }, + { + "epoch": 1.399466136942082, + "grad_norm": 0.1497798817076842, + "learning_rate": 6.437052812688991e-05, + "loss": 2.8022, + "step": 22544 + }, + { + "epoch": 1.39952821404184, + "grad_norm": 0.15737692236086037, + "learning_rate": 6.436706894453234e-05, + "loss": 2.8144, + "step": 22545 + }, + { + "epoch": 1.3995902911415978, + "grad_norm": 0.16287408420000807, + "learning_rate": 6.43636096872183e-05, + "loss": 2.8239, + "step": 22546 + }, + { + "epoch": 1.3996523682413557, + "grad_norm": 0.1504303396721479, + "learning_rate": 6.436015035496581e-05, + "loss": 2.7848, + "step": 22547 + }, + { + "epoch": 1.3997144453411137, + "grad_norm": 0.149176433564703, + "learning_rate": 6.435669094779294e-05, + "loss": 2.8887, + "step": 22548 + }, + { + "epoch": 1.3997765224408716, + "grad_norm": 0.1482034519535816, + "learning_rate": 6.43532314657177e-05, + "loss": 2.8235, + "step": 22549 + }, + { + "epoch": 1.3998385995406295, + "grad_norm": 0.1482768569834186, + "learning_rate": 6.434977190875819e-05, + "loss": 2.9064, + "step": 22550 + }, + { + "epoch": 1.3999006766403874, + "grad_norm": 0.1406994198867833, + "learning_rate": 6.434631227693245e-05, + "loss": 2.7493, + "step": 22551 + }, + { + "epoch": 1.3999627537401453, + "grad_norm": 0.16406964104869534, + "learning_rate": 6.434285257025849e-05, + "loss": 2.8697, + "step": 22552 + }, + { + "epoch": 1.4000248308399033, + "grad_norm": 0.14374764767048961, + "learning_rate": 6.43393927887544e-05, + "loss": 2.8144, + "step": 22553 + }, + { + "epoch": 1.4000869079396612, + "grad_norm": 0.1515860912800161, + "learning_rate": 6.43359329324382e-05, + "loss": 2.7915, + "step": 22554 + }, + { + "epoch": 1.400148985039419, + "grad_norm": 0.14342244862974632, + "learning_rate": 6.433247300132799e-05, + "loss": 2.867, + "step": 22555 + }, + { + "epoch": 1.4002110621391768, + "grad_norm": 0.15561858389929945, + "learning_rate": 6.432901299544176e-05, + "loss": 2.8396, + "step": 22556 + }, + { + "epoch": 1.4002731392389347, + "grad_norm": 0.16656567946256873, + "learning_rate": 6.432555291479761e-05, + "loss": 2.9262, + "step": 22557 + }, + { + "epoch": 1.4003352163386926, + "grad_norm": 0.15057093475059627, + "learning_rate": 6.432209275941354e-05, + "loss": 2.8464, + "step": 22558 + }, + { + "epoch": 1.4003972934384505, + "grad_norm": 0.14923742237400572, + "learning_rate": 6.431863252930768e-05, + "loss": 2.9093, + "step": 22559 + }, + { + "epoch": 1.4004593705382085, + "grad_norm": 0.1655056525677609, + "learning_rate": 6.4315172224498e-05, + "loss": 2.8525, + "step": 22560 + }, + { + "epoch": 1.4005214476379664, + "grad_norm": 0.1507723842741996, + "learning_rate": 6.431171184500261e-05, + "loss": 2.8357, + "step": 22561 + }, + { + "epoch": 1.4005835247377243, + "grad_norm": 0.1600516605795745, + "learning_rate": 6.430825139083955e-05, + "loss": 2.9143, + "step": 22562 + }, + { + "epoch": 1.4006456018374822, + "grad_norm": 0.1468333284126462, + "learning_rate": 6.430479086202685e-05, + "loss": 2.7207, + "step": 22563 + }, + { + "epoch": 1.4007076789372401, + "grad_norm": 0.1537173712134057, + "learning_rate": 6.43013302585826e-05, + "loss": 2.7671, + "step": 22564 + }, + { + "epoch": 1.4007697560369978, + "grad_norm": 0.19981240661268107, + "learning_rate": 6.429786958052483e-05, + "loss": 2.8648, + "step": 22565 + }, + { + "epoch": 1.4008318331367557, + "grad_norm": 0.15805783487881658, + "learning_rate": 6.429440882787161e-05, + "loss": 2.8924, + "step": 22566 + }, + { + "epoch": 1.4008939102365137, + "grad_norm": 0.1660457695913166, + "learning_rate": 6.429094800064098e-05, + "loss": 2.7722, + "step": 22567 + }, + { + "epoch": 1.4009559873362716, + "grad_norm": 0.1548333739741145, + "learning_rate": 6.428748709885102e-05, + "loss": 2.8561, + "step": 22568 + }, + { + "epoch": 1.4010180644360295, + "grad_norm": 0.16828804385816992, + "learning_rate": 6.428402612251975e-05, + "loss": 2.8317, + "step": 22569 + }, + { + "epoch": 1.4010801415357874, + "grad_norm": 0.15266482435527992, + "learning_rate": 6.428056507166527e-05, + "loss": 2.8888, + "step": 22570 + }, + { + "epoch": 1.4011422186355453, + "grad_norm": 0.15779258300190155, + "learning_rate": 6.427710394630562e-05, + "loss": 2.8064, + "step": 22571 + }, + { + "epoch": 1.4012042957353033, + "grad_norm": 0.15579836662404034, + "learning_rate": 6.427364274645881e-05, + "loss": 2.7577, + "step": 22572 + }, + { + "epoch": 1.4012663728350612, + "grad_norm": 0.15785839055658782, + "learning_rate": 6.427018147214298e-05, + "loss": 2.8418, + "step": 22573 + }, + { + "epoch": 1.401328449934819, + "grad_norm": 0.17026112660944315, + "learning_rate": 6.426672012337614e-05, + "loss": 2.9062, + "step": 22574 + }, + { + "epoch": 1.401390527034577, + "grad_norm": 0.17135750828666507, + "learning_rate": 6.426325870017637e-05, + "loss": 2.7712, + "step": 22575 + }, + { + "epoch": 1.401452604134335, + "grad_norm": 0.14147433122597144, + "learning_rate": 6.425979720256169e-05, + "loss": 2.8335, + "step": 22576 + }, + { + "epoch": 1.4015146812340928, + "grad_norm": 0.14651096738401484, + "learning_rate": 6.42563356305502e-05, + "loss": 2.8491, + "step": 22577 + }, + { + "epoch": 1.4015767583338508, + "grad_norm": 0.15733923932100066, + "learning_rate": 6.425287398415994e-05, + "loss": 2.9647, + "step": 22578 + }, + { + "epoch": 1.4016388354336087, + "grad_norm": 0.18031851224610912, + "learning_rate": 6.424941226340899e-05, + "loss": 2.752, + "step": 22579 + }, + { + "epoch": 1.4017009125333664, + "grad_norm": 0.17873800312478041, + "learning_rate": 6.424595046831539e-05, + "loss": 2.8929, + "step": 22580 + }, + { + "epoch": 1.4017629896331243, + "grad_norm": 0.17768142246743973, + "learning_rate": 6.42424885988972e-05, + "loss": 2.7567, + "step": 22581 + }, + { + "epoch": 1.4018250667328822, + "grad_norm": 0.20210578026098072, + "learning_rate": 6.42390266551725e-05, + "loss": 2.8368, + "step": 22582 + }, + { + "epoch": 1.4018871438326401, + "grad_norm": 0.1546075144516545, + "learning_rate": 6.423556463715933e-05, + "loss": 2.8415, + "step": 22583 + }, + { + "epoch": 1.401949220932398, + "grad_norm": 0.16269400894906755, + "learning_rate": 6.423210254487577e-05, + "loss": 2.8138, + "step": 22584 + }, + { + "epoch": 1.402011298032156, + "grad_norm": 0.15444728304823332, + "learning_rate": 6.422864037833988e-05, + "loss": 2.8607, + "step": 22585 + }, + { + "epoch": 1.4020733751319139, + "grad_norm": 0.17895670183597878, + "learning_rate": 6.422517813756972e-05, + "loss": 2.8203, + "step": 22586 + }, + { + "epoch": 1.4021354522316718, + "grad_norm": 0.15315464812933338, + "learning_rate": 6.422171582258334e-05, + "loss": 2.8147, + "step": 22587 + }, + { + "epoch": 1.4021975293314297, + "grad_norm": 0.16481639015798544, + "learning_rate": 6.421825343339883e-05, + "loss": 2.8494, + "step": 22588 + }, + { + "epoch": 1.4022596064311874, + "grad_norm": 0.1518436868886901, + "learning_rate": 6.421479097003424e-05, + "loss": 2.8626, + "step": 22589 + }, + { + "epoch": 1.4023216835309453, + "grad_norm": 0.17813465610900428, + "learning_rate": 6.421132843250763e-05, + "loss": 2.8099, + "step": 22590 + }, + { + "epoch": 1.4023837606307032, + "grad_norm": 0.15067084327098457, + "learning_rate": 6.420786582083708e-05, + "loss": 2.9155, + "step": 22591 + }, + { + "epoch": 1.4024458377304612, + "grad_norm": 0.170850261982437, + "learning_rate": 6.420440313504062e-05, + "loss": 2.8622, + "step": 22592 + }, + { + "epoch": 1.402507914830219, + "grad_norm": 0.15360539653159302, + "learning_rate": 6.420094037513635e-05, + "loss": 2.8726, + "step": 22593 + }, + { + "epoch": 1.402569991929977, + "grad_norm": 0.1780104222628506, + "learning_rate": 6.419747754114234e-05, + "loss": 2.7812, + "step": 22594 + }, + { + "epoch": 1.402632069029735, + "grad_norm": 0.1578660777233211, + "learning_rate": 6.419401463307663e-05, + "loss": 2.8218, + "step": 22595 + }, + { + "epoch": 1.4026941461294928, + "grad_norm": 0.16670434247787191, + "learning_rate": 6.41905516509573e-05, + "loss": 2.7958, + "step": 22596 + }, + { + "epoch": 1.4027562232292508, + "grad_norm": 0.15695347971517717, + "learning_rate": 6.418708859480242e-05, + "loss": 2.8553, + "step": 22597 + }, + { + "epoch": 1.4028183003290087, + "grad_norm": 0.17893630029434057, + "learning_rate": 6.418362546463006e-05, + "loss": 2.7282, + "step": 22598 + }, + { + "epoch": 1.4028803774287666, + "grad_norm": 0.1623799180669181, + "learning_rate": 6.418016226045828e-05, + "loss": 2.7446, + "step": 22599 + }, + { + "epoch": 1.4029424545285245, + "grad_norm": 0.19978523122376032, + "learning_rate": 6.417669898230516e-05, + "loss": 2.8561, + "step": 22600 + }, + { + "epoch": 1.4030045316282824, + "grad_norm": 0.163715990034527, + "learning_rate": 6.417323563018874e-05, + "loss": 2.7137, + "step": 22601 + }, + { + "epoch": 1.4030666087280403, + "grad_norm": 0.19192251336304372, + "learning_rate": 6.416977220412713e-05, + "loss": 2.8596, + "step": 22602 + }, + { + "epoch": 1.4031286858277983, + "grad_norm": 0.17573420636838927, + "learning_rate": 6.416630870413835e-05, + "loss": 2.8305, + "step": 22603 + }, + { + "epoch": 1.403190762927556, + "grad_norm": 0.17137311680389977, + "learning_rate": 6.416284513024053e-05, + "loss": 2.8287, + "step": 22604 + }, + { + "epoch": 1.4032528400273139, + "grad_norm": 0.16496810855043445, + "learning_rate": 6.41593814824517e-05, + "loss": 2.8316, + "step": 22605 + }, + { + "epoch": 1.4033149171270718, + "grad_norm": 0.16149768554871477, + "learning_rate": 6.415591776078993e-05, + "loss": 2.799, + "step": 22606 + }, + { + "epoch": 1.4033769942268297, + "grad_norm": 0.16456693349285348, + "learning_rate": 6.415245396527331e-05, + "loss": 2.728, + "step": 22607 + }, + { + "epoch": 1.4034390713265876, + "grad_norm": 0.14975152520921906, + "learning_rate": 6.414899009591989e-05, + "loss": 2.793, + "step": 22608 + }, + { + "epoch": 1.4035011484263455, + "grad_norm": 0.20868272491091983, + "learning_rate": 6.414552615274776e-05, + "loss": 2.786, + "step": 22609 + }, + { + "epoch": 1.4035632255261035, + "grad_norm": 0.17828639732639806, + "learning_rate": 6.414206213577498e-05, + "loss": 2.8565, + "step": 22610 + }, + { + "epoch": 1.4036253026258614, + "grad_norm": 0.16511523729106234, + "learning_rate": 6.413859804501964e-05, + "loss": 2.8953, + "step": 22611 + }, + { + "epoch": 1.4036873797256193, + "grad_norm": 0.1604707003496279, + "learning_rate": 6.413513388049981e-05, + "loss": 2.7826, + "step": 22612 + }, + { + "epoch": 1.403749456825377, + "grad_norm": 0.15974620920296734, + "learning_rate": 6.413166964223354e-05, + "loss": 2.8455, + "step": 22613 + }, + { + "epoch": 1.403811533925135, + "grad_norm": 0.148901302639731, + "learning_rate": 6.412820533023892e-05, + "loss": 2.7884, + "step": 22614 + }, + { + "epoch": 1.4038736110248928, + "grad_norm": 0.17127631780520833, + "learning_rate": 6.412474094453402e-05, + "loss": 2.7847, + "step": 22615 + }, + { + "epoch": 1.4039356881246507, + "grad_norm": 0.14479426989399247, + "learning_rate": 6.412127648513693e-05, + "loss": 2.8469, + "step": 22616 + }, + { + "epoch": 1.4039977652244087, + "grad_norm": 0.16475209562701054, + "learning_rate": 6.41178119520657e-05, + "loss": 2.8593, + "step": 22617 + }, + { + "epoch": 1.4040598423241666, + "grad_norm": 0.17224674770180476, + "learning_rate": 6.411434734533842e-05, + "loss": 2.8768, + "step": 22618 + }, + { + "epoch": 1.4041219194239245, + "grad_norm": 0.16833546993186962, + "learning_rate": 6.411088266497316e-05, + "loss": 2.805, + "step": 22619 + }, + { + "epoch": 1.4041839965236824, + "grad_norm": 0.16977704112907563, + "learning_rate": 6.410741791098802e-05, + "loss": 2.858, + "step": 22620 + }, + { + "epoch": 1.4042460736234403, + "grad_norm": 0.14617561224436854, + "learning_rate": 6.410395308340104e-05, + "loss": 2.9374, + "step": 22621 + }, + { + "epoch": 1.4043081507231983, + "grad_norm": 0.1577368741548364, + "learning_rate": 6.410048818223032e-05, + "loss": 2.8617, + "step": 22622 + }, + { + "epoch": 1.4043702278229562, + "grad_norm": 0.15527063643130248, + "learning_rate": 6.409702320749392e-05, + "loss": 2.8467, + "step": 22623 + }, + { + "epoch": 1.404432304922714, + "grad_norm": 0.15723861624437266, + "learning_rate": 6.409355815920993e-05, + "loss": 2.8794, + "step": 22624 + }, + { + "epoch": 1.404494382022472, + "grad_norm": 0.15591805483989973, + "learning_rate": 6.409009303739644e-05, + "loss": 2.9193, + "step": 22625 + }, + { + "epoch": 1.40455645912223, + "grad_norm": 0.14128922182706413, + "learning_rate": 6.408662784207149e-05, + "loss": 2.8871, + "step": 22626 + }, + { + "epoch": 1.4046185362219878, + "grad_norm": 0.15975874907007193, + "learning_rate": 6.40831625732532e-05, + "loss": 2.8793, + "step": 22627 + }, + { + "epoch": 1.4046806133217455, + "grad_norm": 0.16286944107514503, + "learning_rate": 6.407969723095961e-05, + "loss": 2.8971, + "step": 22628 + }, + { + "epoch": 1.4047426904215035, + "grad_norm": 0.16351738127317958, + "learning_rate": 6.407623181520885e-05, + "loss": 2.8538, + "step": 22629 + }, + { + "epoch": 1.4048047675212614, + "grad_norm": 0.14771134544506284, + "learning_rate": 6.407276632601896e-05, + "loss": 2.8278, + "step": 22630 + }, + { + "epoch": 1.4048668446210193, + "grad_norm": 0.15737569737488183, + "learning_rate": 6.406930076340803e-05, + "loss": 2.7835, + "step": 22631 + }, + { + "epoch": 1.4049289217207772, + "grad_norm": 0.17138557759744624, + "learning_rate": 6.406583512739415e-05, + "loss": 2.822, + "step": 22632 + }, + { + "epoch": 1.4049909988205351, + "grad_norm": 0.1488843266058377, + "learning_rate": 6.406236941799539e-05, + "loss": 2.8244, + "step": 22633 + }, + { + "epoch": 1.405053075920293, + "grad_norm": 0.15575096786850406, + "learning_rate": 6.405890363522984e-05, + "loss": 2.8147, + "step": 22634 + }, + { + "epoch": 1.405115153020051, + "grad_norm": 0.1456384584343284, + "learning_rate": 6.405543777911555e-05, + "loss": 2.8153, + "step": 22635 + }, + { + "epoch": 1.4051772301198089, + "grad_norm": 0.16651281070111804, + "learning_rate": 6.405197184967067e-05, + "loss": 2.8132, + "step": 22636 + }, + { + "epoch": 1.4052393072195666, + "grad_norm": 0.1604545246438042, + "learning_rate": 6.40485058469132e-05, + "loss": 2.9042, + "step": 22637 + }, + { + "epoch": 1.4053013843193245, + "grad_norm": 0.15051523223496233, + "learning_rate": 6.40450397708613e-05, + "loss": 2.7425, + "step": 22638 + }, + { + "epoch": 1.4053634614190824, + "grad_norm": 0.15958482286461106, + "learning_rate": 6.4041573621533e-05, + "loss": 2.8082, + "step": 22639 + }, + { + "epoch": 1.4054255385188403, + "grad_norm": 0.192244862702984, + "learning_rate": 6.403810739894641e-05, + "loss": 2.7845, + "step": 22640 + }, + { + "epoch": 1.4054876156185983, + "grad_norm": 0.20094884156690268, + "learning_rate": 6.403464110311961e-05, + "loss": 2.7672, + "step": 22641 + }, + { + "epoch": 1.4055496927183562, + "grad_norm": 0.15091661025352027, + "learning_rate": 6.403117473407065e-05, + "loss": 2.7939, + "step": 22642 + }, + { + "epoch": 1.405611769818114, + "grad_norm": 0.1515135273024533, + "learning_rate": 6.402770829181768e-05, + "loss": 2.8451, + "step": 22643 + }, + { + "epoch": 1.405673846917872, + "grad_norm": 0.18021731427769938, + "learning_rate": 6.402424177637873e-05, + "loss": 2.82, + "step": 22644 + }, + { + "epoch": 1.40573592401763, + "grad_norm": 0.18760410041603162, + "learning_rate": 6.402077518777192e-05, + "loss": 2.8959, + "step": 22645 + }, + { + "epoch": 1.4057980011173878, + "grad_norm": 0.1597135228025843, + "learning_rate": 6.40173085260153e-05, + "loss": 2.747, + "step": 22646 + }, + { + "epoch": 1.4058600782171458, + "grad_norm": 0.16040177830947902, + "learning_rate": 6.401384179112701e-05, + "loss": 2.776, + "step": 22647 + }, + { + "epoch": 1.4059221553169037, + "grad_norm": 0.1564253716970115, + "learning_rate": 6.401037498312508e-05, + "loss": 2.8472, + "step": 22648 + }, + { + "epoch": 1.4059842324166616, + "grad_norm": 0.1775104453478929, + "learning_rate": 6.400690810202764e-05, + "loss": 2.8425, + "step": 22649 + }, + { + "epoch": 1.4060463095164195, + "grad_norm": 0.21679794895490348, + "learning_rate": 6.400344114785275e-05, + "loss": 2.8616, + "step": 22650 + }, + { + "epoch": 1.4061083866161774, + "grad_norm": 0.1420174902786009, + "learning_rate": 6.399997412061851e-05, + "loss": 2.8029, + "step": 22651 + }, + { + "epoch": 1.4061704637159351, + "grad_norm": 0.14986866625493914, + "learning_rate": 6.399650702034302e-05, + "loss": 2.7684, + "step": 22652 + }, + { + "epoch": 1.406232540815693, + "grad_norm": 0.17044425557606985, + "learning_rate": 6.399303984704432e-05, + "loss": 2.8216, + "step": 22653 + }, + { + "epoch": 1.406294617915451, + "grad_norm": 0.15665735550191115, + "learning_rate": 6.398957260074057e-05, + "loss": 2.8283, + "step": 22654 + }, + { + "epoch": 1.4063566950152089, + "grad_norm": 0.1446015663759088, + "learning_rate": 6.39861052814498e-05, + "loss": 2.8122, + "step": 22655 + }, + { + "epoch": 1.4064187721149668, + "grad_norm": 0.16219681943641084, + "learning_rate": 6.398263788919014e-05, + "loss": 2.7894, + "step": 22656 + }, + { + "epoch": 1.4064808492147247, + "grad_norm": 0.15396939303955057, + "learning_rate": 6.397917042397965e-05, + "loss": 2.8819, + "step": 22657 + }, + { + "epoch": 1.4065429263144826, + "grad_norm": 0.1857872760812421, + "learning_rate": 6.397570288583646e-05, + "loss": 2.8397, + "step": 22658 + }, + { + "epoch": 1.4066050034142406, + "grad_norm": 0.17238333696172767, + "learning_rate": 6.39722352747786e-05, + "loss": 2.8507, + "step": 22659 + }, + { + "epoch": 1.4066670805139985, + "grad_norm": 0.17362368809655468, + "learning_rate": 6.39687675908242e-05, + "loss": 2.926, + "step": 22660 + }, + { + "epoch": 1.4067291576137562, + "grad_norm": 0.15955789954325636, + "learning_rate": 6.396529983399138e-05, + "loss": 2.8776, + "step": 22661 + }, + { + "epoch": 1.406791234713514, + "grad_norm": 0.17495505828756808, + "learning_rate": 6.396183200429817e-05, + "loss": 2.8868, + "step": 22662 + }, + { + "epoch": 1.406853311813272, + "grad_norm": 0.16322224454335993, + "learning_rate": 6.395836410176271e-05, + "loss": 2.7622, + "step": 22663 + }, + { + "epoch": 1.40691538891303, + "grad_norm": 0.15994092375924418, + "learning_rate": 6.395489612640307e-05, + "loss": 2.907, + "step": 22664 + }, + { + "epoch": 1.4069774660127878, + "grad_norm": 0.14818266275379152, + "learning_rate": 6.395142807823735e-05, + "loss": 2.8207, + "step": 22665 + }, + { + "epoch": 1.4070395431125458, + "grad_norm": 0.1856181058181389, + "learning_rate": 6.394795995728366e-05, + "loss": 2.7676, + "step": 22666 + }, + { + "epoch": 1.4071016202123037, + "grad_norm": 0.15339764388160584, + "learning_rate": 6.394449176356006e-05, + "loss": 2.7656, + "step": 22667 + }, + { + "epoch": 1.4071636973120616, + "grad_norm": 0.1646545320787946, + "learning_rate": 6.394102349708467e-05, + "loss": 2.8333, + "step": 22668 + }, + { + "epoch": 1.4072257744118195, + "grad_norm": 0.15800029798069148, + "learning_rate": 6.393755515787556e-05, + "loss": 2.8557, + "step": 22669 + }, + { + "epoch": 1.4072878515115774, + "grad_norm": 0.15449765577898886, + "learning_rate": 6.393408674595084e-05, + "loss": 2.8845, + "step": 22670 + }, + { + "epoch": 1.4073499286113353, + "grad_norm": 0.17042232131165097, + "learning_rate": 6.393061826132861e-05, + "loss": 2.8165, + "step": 22671 + }, + { + "epoch": 1.4074120057110933, + "grad_norm": 0.24035550761972294, + "learning_rate": 6.392714970402698e-05, + "loss": 2.8232, + "step": 22672 + }, + { + "epoch": 1.4074740828108512, + "grad_norm": 0.16098383750296338, + "learning_rate": 6.392368107406401e-05, + "loss": 2.8404, + "step": 22673 + }, + { + "epoch": 1.407536159910609, + "grad_norm": 0.19800675792793304, + "learning_rate": 6.392021237145782e-05, + "loss": 2.7957, + "step": 22674 + }, + { + "epoch": 1.4075982370103668, + "grad_norm": 0.1651850004353451, + "learning_rate": 6.391674359622651e-05, + "loss": 2.9131, + "step": 22675 + }, + { + "epoch": 1.4076603141101247, + "grad_norm": 0.1648834781773371, + "learning_rate": 6.391327474838817e-05, + "loss": 2.8811, + "step": 22676 + }, + { + "epoch": 1.4077223912098826, + "grad_norm": 0.15787342826328185, + "learning_rate": 6.390980582796089e-05, + "loss": 2.8391, + "step": 22677 + }, + { + "epoch": 1.4077844683096405, + "grad_norm": 0.1614073120277968, + "learning_rate": 6.390633683496278e-05, + "loss": 2.8211, + "step": 22678 + }, + { + "epoch": 1.4078465454093985, + "grad_norm": 0.15387062759408213, + "learning_rate": 6.390286776941194e-05, + "loss": 2.7908, + "step": 22679 + }, + { + "epoch": 1.4079086225091564, + "grad_norm": 0.16963799649370354, + "learning_rate": 6.389939863132645e-05, + "loss": 2.8319, + "step": 22680 + }, + { + "epoch": 1.4079706996089143, + "grad_norm": 0.1762226655321789, + "learning_rate": 6.389592942072445e-05, + "loss": 2.7908, + "step": 22681 + }, + { + "epoch": 1.4080327767086722, + "grad_norm": 0.15709838025372314, + "learning_rate": 6.389246013762397e-05, + "loss": 2.7389, + "step": 22682 + }, + { + "epoch": 1.4080948538084301, + "grad_norm": 0.14820626719536667, + "learning_rate": 6.38889907820432e-05, + "loss": 2.8581, + "step": 22683 + }, + { + "epoch": 1.4081569309081878, + "grad_norm": 0.16982173709471035, + "learning_rate": 6.388552135400016e-05, + "loss": 2.8311, + "step": 22684 + }, + { + "epoch": 1.4082190080079457, + "grad_norm": 0.14380012343232929, + "learning_rate": 6.3882051853513e-05, + "loss": 2.822, + "step": 22685 + }, + { + "epoch": 1.4082810851077037, + "grad_norm": 0.14868579351328623, + "learning_rate": 6.387858228059981e-05, + "loss": 2.8992, + "step": 22686 + }, + { + "epoch": 1.4083431622074616, + "grad_norm": 0.14863105126238463, + "learning_rate": 6.387511263527868e-05, + "loss": 2.78, + "step": 22687 + }, + { + "epoch": 1.4084052393072195, + "grad_norm": 0.1454384273512602, + "learning_rate": 6.387164291756771e-05, + "loss": 2.9018, + "step": 22688 + }, + { + "epoch": 1.4084673164069774, + "grad_norm": 0.14766024074510145, + "learning_rate": 6.386817312748502e-05, + "loss": 2.7644, + "step": 22689 + }, + { + "epoch": 1.4085293935067353, + "grad_norm": 0.14407958518970937, + "learning_rate": 6.38647032650487e-05, + "loss": 2.8936, + "step": 22690 + }, + { + "epoch": 1.4085914706064933, + "grad_norm": 0.14474771373389408, + "learning_rate": 6.386123333027686e-05, + "loss": 2.9086, + "step": 22691 + }, + { + "epoch": 1.4086535477062512, + "grad_norm": 0.15648095228591427, + "learning_rate": 6.38577633231876e-05, + "loss": 2.8209, + "step": 22692 + }, + { + "epoch": 1.408715624806009, + "grad_norm": 0.14533042335723018, + "learning_rate": 6.385429324379904e-05, + "loss": 2.8549, + "step": 22693 + }, + { + "epoch": 1.408777701905767, + "grad_norm": 0.14140228850785316, + "learning_rate": 6.385082309212924e-05, + "loss": 2.8458, + "step": 22694 + }, + { + "epoch": 1.408839779005525, + "grad_norm": 0.14153461846983878, + "learning_rate": 6.384735286819635e-05, + "loss": 2.7724, + "step": 22695 + }, + { + "epoch": 1.4089018561052828, + "grad_norm": 0.14720101216529619, + "learning_rate": 6.384388257201846e-05, + "loss": 2.9014, + "step": 22696 + }, + { + "epoch": 1.4089639332050408, + "grad_norm": 0.1514110009557198, + "learning_rate": 6.384041220361368e-05, + "loss": 2.9007, + "step": 22697 + }, + { + "epoch": 1.4090260103047987, + "grad_norm": 0.17682136531218584, + "learning_rate": 6.383694176300008e-05, + "loss": 2.8035, + "step": 22698 + }, + { + "epoch": 1.4090880874045564, + "grad_norm": 0.17047040469995028, + "learning_rate": 6.383347125019583e-05, + "loss": 2.8538, + "step": 22699 + }, + { + "epoch": 1.4091501645043143, + "grad_norm": 0.15977580544043848, + "learning_rate": 6.383000066521898e-05, + "loss": 2.8631, + "step": 22700 + }, + { + "epoch": 1.4092122416040722, + "grad_norm": 0.19458559188784513, + "learning_rate": 6.382653000808767e-05, + "loss": 2.8583, + "step": 22701 + }, + { + "epoch": 1.4092743187038301, + "grad_norm": 0.16759929945579974, + "learning_rate": 6.382305927882e-05, + "loss": 2.7693, + "step": 22702 + }, + { + "epoch": 1.409336395803588, + "grad_norm": 0.16290882909947882, + "learning_rate": 6.381958847743407e-05, + "loss": 2.8356, + "step": 22703 + }, + { + "epoch": 1.409398472903346, + "grad_norm": 0.16990954255700252, + "learning_rate": 6.381611760394798e-05, + "loss": 2.798, + "step": 22704 + }, + { + "epoch": 1.4094605500031039, + "grad_norm": 0.15822634114531686, + "learning_rate": 6.381264665837987e-05, + "loss": 2.7766, + "step": 22705 + }, + { + "epoch": 1.4095226271028618, + "grad_norm": 0.15993920016680147, + "learning_rate": 6.380917564074782e-05, + "loss": 2.7894, + "step": 22706 + }, + { + "epoch": 1.4095847042026197, + "grad_norm": 0.1445991575669808, + "learning_rate": 6.380570455106994e-05, + "loss": 2.8678, + "step": 22707 + }, + { + "epoch": 1.4096467813023774, + "grad_norm": 0.175557099584873, + "learning_rate": 6.380223338936436e-05, + "loss": 2.8371, + "step": 22708 + }, + { + "epoch": 1.4097088584021353, + "grad_norm": 0.14765660775358266, + "learning_rate": 6.379876215564918e-05, + "loss": 2.8376, + "step": 22709 + }, + { + "epoch": 1.4097709355018933, + "grad_norm": 0.17036621598425772, + "learning_rate": 6.379529084994249e-05, + "loss": 2.832, + "step": 22710 + }, + { + "epoch": 1.4098330126016512, + "grad_norm": 0.15578743294868475, + "learning_rate": 6.379181947226245e-05, + "loss": 2.8852, + "step": 22711 + }, + { + "epoch": 1.409895089701409, + "grad_norm": 0.1542409904293252, + "learning_rate": 6.378834802262711e-05, + "loss": 2.784, + "step": 22712 + }, + { + "epoch": 1.409957166801167, + "grad_norm": 0.1504831772420812, + "learning_rate": 6.378487650105462e-05, + "loss": 2.8343, + "step": 22713 + }, + { + "epoch": 1.410019243900925, + "grad_norm": 0.17990017826780252, + "learning_rate": 6.378140490756308e-05, + "loss": 2.9209, + "step": 22714 + }, + { + "epoch": 1.4100813210006828, + "grad_norm": 0.163620313156546, + "learning_rate": 6.377793324217062e-05, + "loss": 2.8629, + "step": 22715 + }, + { + "epoch": 1.4101433981004408, + "grad_norm": 0.16507992436104893, + "learning_rate": 6.377446150489532e-05, + "loss": 2.7995, + "step": 22716 + }, + { + "epoch": 1.4102054752001987, + "grad_norm": 0.15420423817211049, + "learning_rate": 6.377098969575533e-05, + "loss": 2.8177, + "step": 22717 + }, + { + "epoch": 1.4102675522999566, + "grad_norm": 0.18498861756298124, + "learning_rate": 6.376751781476873e-05, + "loss": 2.8576, + "step": 22718 + }, + { + "epoch": 1.4103296293997145, + "grad_norm": 0.14944344898533535, + "learning_rate": 6.376404586195365e-05, + "loss": 2.7651, + "step": 22719 + }, + { + "epoch": 1.4103917064994724, + "grad_norm": 0.19336647444513383, + "learning_rate": 6.376057383732819e-05, + "loss": 2.9095, + "step": 22720 + }, + { + "epoch": 1.4104537835992303, + "grad_norm": 0.15701892719028762, + "learning_rate": 6.37571017409105e-05, + "loss": 2.7679, + "step": 22721 + }, + { + "epoch": 1.4105158606989883, + "grad_norm": 0.16572476319933305, + "learning_rate": 6.375362957271865e-05, + "loss": 2.7708, + "step": 22722 + }, + { + "epoch": 1.410577937798746, + "grad_norm": 0.16267899892837426, + "learning_rate": 6.375015733277079e-05, + "loss": 2.8326, + "step": 22723 + }, + { + "epoch": 1.4106400148985039, + "grad_norm": 0.18286335514916133, + "learning_rate": 6.3746685021085e-05, + "loss": 2.908, + "step": 22724 + }, + { + "epoch": 1.4107020919982618, + "grad_norm": 0.17011629689067795, + "learning_rate": 6.374321263767943e-05, + "loss": 2.8292, + "step": 22725 + }, + { + "epoch": 1.4107641690980197, + "grad_norm": 0.15718214817492038, + "learning_rate": 6.373974018257218e-05, + "loss": 2.8001, + "step": 22726 + }, + { + "epoch": 1.4108262461977776, + "grad_norm": 0.17350131291427262, + "learning_rate": 6.373626765578138e-05, + "loss": 2.8437, + "step": 22727 + }, + { + "epoch": 1.4108883232975356, + "grad_norm": 0.15303469130861222, + "learning_rate": 6.373279505732513e-05, + "loss": 2.8656, + "step": 22728 + }, + { + "epoch": 1.4109504003972935, + "grad_norm": 0.18070294760741926, + "learning_rate": 6.372932238722156e-05, + "loss": 2.9231, + "step": 22729 + }, + { + "epoch": 1.4110124774970514, + "grad_norm": 0.1746783945784838, + "learning_rate": 6.372584964548878e-05, + "loss": 2.7573, + "step": 22730 + }, + { + "epoch": 1.4110745545968093, + "grad_norm": 0.17880157090234114, + "learning_rate": 6.37223768321449e-05, + "loss": 2.8249, + "step": 22731 + }, + { + "epoch": 1.411136631696567, + "grad_norm": 0.15935835728047779, + "learning_rate": 6.371890394720806e-05, + "loss": 2.8481, + "step": 22732 + }, + { + "epoch": 1.411198708796325, + "grad_norm": 0.16835609950620994, + "learning_rate": 6.371543099069636e-05, + "loss": 2.8348, + "step": 22733 + }, + { + "epoch": 1.4112607858960828, + "grad_norm": 0.16367004775891436, + "learning_rate": 6.371195796262794e-05, + "loss": 2.9061, + "step": 22734 + }, + { + "epoch": 1.4113228629958408, + "grad_norm": 0.157090252376918, + "learning_rate": 6.37084848630209e-05, + "loss": 2.9179, + "step": 22735 + }, + { + "epoch": 1.4113849400955987, + "grad_norm": 0.1586288622336348, + "learning_rate": 6.370501169189336e-05, + "loss": 2.8687, + "step": 22736 + }, + { + "epoch": 1.4114470171953566, + "grad_norm": 0.16739573990744655, + "learning_rate": 6.370153844926345e-05, + "loss": 2.7697, + "step": 22737 + }, + { + "epoch": 1.4115090942951145, + "grad_norm": 0.1547782607603403, + "learning_rate": 6.36980651351493e-05, + "loss": 2.8133, + "step": 22738 + }, + { + "epoch": 1.4115711713948724, + "grad_norm": 0.18144569396932556, + "learning_rate": 6.369459174956901e-05, + "loss": 2.8328, + "step": 22739 + }, + { + "epoch": 1.4116332484946303, + "grad_norm": 0.15327409181202903, + "learning_rate": 6.36911182925407e-05, + "loss": 2.834, + "step": 22740 + }, + { + "epoch": 1.4116953255943883, + "grad_norm": 0.15480483618937765, + "learning_rate": 6.368764476408251e-05, + "loss": 2.8673, + "step": 22741 + }, + { + "epoch": 1.4117574026941462, + "grad_norm": 0.15156652799150022, + "learning_rate": 6.368417116421256e-05, + "loss": 2.9147, + "step": 22742 + }, + { + "epoch": 1.411819479793904, + "grad_norm": 0.16122269868756567, + "learning_rate": 6.368069749294896e-05, + "loss": 2.7514, + "step": 22743 + }, + { + "epoch": 1.411881556893662, + "grad_norm": 0.17523258731285618, + "learning_rate": 6.367722375030985e-05, + "loss": 2.9537, + "step": 22744 + }, + { + "epoch": 1.41194363399342, + "grad_norm": 0.15812797880947932, + "learning_rate": 6.367374993631332e-05, + "loss": 2.9095, + "step": 22745 + }, + { + "epoch": 1.4120057110931779, + "grad_norm": 0.15052921547919496, + "learning_rate": 6.367027605097755e-05, + "loss": 2.8176, + "step": 22746 + }, + { + "epoch": 1.4120677881929355, + "grad_norm": 0.15603515039592697, + "learning_rate": 6.36668020943206e-05, + "loss": 2.8347, + "step": 22747 + }, + { + "epoch": 1.4121298652926935, + "grad_norm": 0.15873309248150708, + "learning_rate": 6.366332806636065e-05, + "loss": 2.8416, + "step": 22748 + }, + { + "epoch": 1.4121919423924514, + "grad_norm": 0.15485397998917969, + "learning_rate": 6.365985396711581e-05, + "loss": 2.8355, + "step": 22749 + }, + { + "epoch": 1.4122540194922093, + "grad_norm": 0.1475326603658628, + "learning_rate": 6.365637979660416e-05, + "loss": 2.7485, + "step": 22750 + }, + { + "epoch": 1.4123160965919672, + "grad_norm": 0.14697297308497956, + "learning_rate": 6.365290555484387e-05, + "loss": 2.8248, + "step": 22751 + }, + { + "epoch": 1.4123781736917251, + "grad_norm": 0.17624719642039396, + "learning_rate": 6.364943124185307e-05, + "loss": 2.8136, + "step": 22752 + }, + { + "epoch": 1.412440250791483, + "grad_norm": 0.16915336653399293, + "learning_rate": 6.364595685764988e-05, + "loss": 2.806, + "step": 22753 + }, + { + "epoch": 1.412502327891241, + "grad_norm": 0.14678758393871366, + "learning_rate": 6.36424824022524e-05, + "loss": 2.7229, + "step": 22754 + }, + { + "epoch": 1.412564404990999, + "grad_norm": 0.1465658025125487, + "learning_rate": 6.363900787567879e-05, + "loss": 2.8987, + "step": 22755 + }, + { + "epoch": 1.4126264820907566, + "grad_norm": 0.1438071049379133, + "learning_rate": 6.363553327794716e-05, + "loss": 2.8786, + "step": 22756 + }, + { + "epoch": 1.4126885591905145, + "grad_norm": 0.15853376334596989, + "learning_rate": 6.363205860907564e-05, + "loss": 2.8708, + "step": 22757 + }, + { + "epoch": 1.4127506362902724, + "grad_norm": 0.16397487480192527, + "learning_rate": 6.362858386908238e-05, + "loss": 2.8416, + "step": 22758 + }, + { + "epoch": 1.4128127133900303, + "grad_norm": 0.1513866719656911, + "learning_rate": 6.362510905798546e-05, + "loss": 2.8526, + "step": 22759 + }, + { + "epoch": 1.4128747904897883, + "grad_norm": 0.21804208820628362, + "learning_rate": 6.362163417580306e-05, + "loss": 2.8105, + "step": 22760 + }, + { + "epoch": 1.4129368675895462, + "grad_norm": 0.16896329905291443, + "learning_rate": 6.361815922255327e-05, + "loss": 2.7445, + "step": 22761 + }, + { + "epoch": 1.412998944689304, + "grad_norm": 0.15285941417950555, + "learning_rate": 6.361468419825426e-05, + "loss": 2.807, + "step": 22762 + }, + { + "epoch": 1.413061021789062, + "grad_norm": 0.1830472115982875, + "learning_rate": 6.361120910292413e-05, + "loss": 2.8694, + "step": 22763 + }, + { + "epoch": 1.41312309888882, + "grad_norm": 0.19004042527904952, + "learning_rate": 6.360773393658101e-05, + "loss": 2.8362, + "step": 22764 + }, + { + "epoch": 1.4131851759885778, + "grad_norm": 0.19687799572557646, + "learning_rate": 6.360425869924304e-05, + "loss": 2.839, + "step": 22765 + }, + { + "epoch": 1.4132472530883358, + "grad_norm": 0.16936609754181303, + "learning_rate": 6.360078339092837e-05, + "loss": 2.7193, + "step": 22766 + }, + { + "epoch": 1.4133093301880937, + "grad_norm": 0.1713440236004724, + "learning_rate": 6.359730801165509e-05, + "loss": 2.8731, + "step": 22767 + }, + { + "epoch": 1.4133714072878516, + "grad_norm": 0.19911338787499958, + "learning_rate": 6.359383256144135e-05, + "loss": 2.8356, + "step": 22768 + }, + { + "epoch": 1.4134334843876095, + "grad_norm": 0.15231711815089088, + "learning_rate": 6.359035704030529e-05, + "loss": 2.7648, + "step": 22769 + }, + { + "epoch": 1.4134955614873674, + "grad_norm": 0.1767801618409379, + "learning_rate": 6.358688144826504e-05, + "loss": 2.8093, + "step": 22770 + }, + { + "epoch": 1.4135576385871251, + "grad_norm": 0.1529782056272637, + "learning_rate": 6.358340578533874e-05, + "loss": 2.7631, + "step": 22771 + }, + { + "epoch": 1.413619715686883, + "grad_norm": 0.14887345442221855, + "learning_rate": 6.35799300515445e-05, + "loss": 2.8294, + "step": 22772 + }, + { + "epoch": 1.413681792786641, + "grad_norm": 0.14593515432012427, + "learning_rate": 6.357645424690047e-05, + "loss": 2.7936, + "step": 22773 + }, + { + "epoch": 1.4137438698863989, + "grad_norm": 0.150143500817902, + "learning_rate": 6.357297837142479e-05, + "loss": 2.7441, + "step": 22774 + }, + { + "epoch": 1.4138059469861568, + "grad_norm": 0.14471837765796713, + "learning_rate": 6.356950242513558e-05, + "loss": 2.8055, + "step": 22775 + }, + { + "epoch": 1.4138680240859147, + "grad_norm": 0.1531289053924935, + "learning_rate": 6.356602640805099e-05, + "loss": 2.7708, + "step": 22776 + }, + { + "epoch": 1.4139301011856726, + "grad_norm": 0.18063131499553445, + "learning_rate": 6.356255032018914e-05, + "loss": 2.8308, + "step": 22777 + }, + { + "epoch": 1.4139921782854306, + "grad_norm": 0.1442931314363835, + "learning_rate": 6.355907416156818e-05, + "loss": 2.801, + "step": 22778 + }, + { + "epoch": 1.4140542553851885, + "grad_norm": 0.15067739517209533, + "learning_rate": 6.355559793220624e-05, + "loss": 2.8402, + "step": 22779 + }, + { + "epoch": 1.4141163324849462, + "grad_norm": 0.14663267893492699, + "learning_rate": 6.355212163212145e-05, + "loss": 2.8851, + "step": 22780 + }, + { + "epoch": 1.414178409584704, + "grad_norm": 0.1563468335026973, + "learning_rate": 6.354864526133194e-05, + "loss": 2.7797, + "step": 22781 + }, + { + "epoch": 1.414240486684462, + "grad_norm": 0.15582375202987403, + "learning_rate": 6.354516881985588e-05, + "loss": 2.881, + "step": 22782 + }, + { + "epoch": 1.41430256378422, + "grad_norm": 0.14449619821848544, + "learning_rate": 6.354169230771137e-05, + "loss": 2.8544, + "step": 22783 + }, + { + "epoch": 1.4143646408839778, + "grad_norm": 0.15656293288378537, + "learning_rate": 6.353821572491657e-05, + "loss": 2.8294, + "step": 22784 + }, + { + "epoch": 1.4144267179837358, + "grad_norm": 0.16043597890565997, + "learning_rate": 6.353473907148961e-05, + "loss": 2.8579, + "step": 22785 + }, + { + "epoch": 1.4144887950834937, + "grad_norm": 0.15847994286681527, + "learning_rate": 6.353126234744864e-05, + "loss": 2.7892, + "step": 22786 + }, + { + "epoch": 1.4145508721832516, + "grad_norm": 0.15683537147011284, + "learning_rate": 6.352778555281178e-05, + "loss": 2.9021, + "step": 22787 + }, + { + "epoch": 1.4146129492830095, + "grad_norm": 0.16204404663605823, + "learning_rate": 6.352430868759718e-05, + "loss": 2.8866, + "step": 22788 + }, + { + "epoch": 1.4146750263827674, + "grad_norm": 0.15065321353367803, + "learning_rate": 6.352083175182298e-05, + "loss": 2.8128, + "step": 22789 + }, + { + "epoch": 1.4147371034825253, + "grad_norm": 0.14626450886446718, + "learning_rate": 6.351735474550731e-05, + "loss": 2.7836, + "step": 22790 + }, + { + "epoch": 1.4147991805822833, + "grad_norm": 0.197044182170738, + "learning_rate": 6.351387766866832e-05, + "loss": 2.7722, + "step": 22791 + }, + { + "epoch": 1.4148612576820412, + "grad_norm": 0.17868951762922627, + "learning_rate": 6.351040052132415e-05, + "loss": 2.8245, + "step": 22792 + }, + { + "epoch": 1.414923334781799, + "grad_norm": 0.18011374297164856, + "learning_rate": 6.350692330349294e-05, + "loss": 2.8504, + "step": 22793 + }, + { + "epoch": 1.414985411881557, + "grad_norm": 0.15307497331311262, + "learning_rate": 6.350344601519286e-05, + "loss": 2.7558, + "step": 22794 + }, + { + "epoch": 1.4150474889813147, + "grad_norm": 0.16541273937760062, + "learning_rate": 6.349996865644199e-05, + "loss": 2.7477, + "step": 22795 + }, + { + "epoch": 1.4151095660810726, + "grad_norm": 0.18674811966203483, + "learning_rate": 6.349649122725853e-05, + "loss": 2.841, + "step": 22796 + }, + { + "epoch": 1.4151716431808306, + "grad_norm": 0.19183804988766118, + "learning_rate": 6.349301372766058e-05, + "loss": 2.8523, + "step": 22797 + }, + { + "epoch": 1.4152337202805885, + "grad_norm": 0.14746905298195845, + "learning_rate": 6.348953615766631e-05, + "loss": 2.791, + "step": 22798 + }, + { + "epoch": 1.4152957973803464, + "grad_norm": 0.21691387043921598, + "learning_rate": 6.348605851729386e-05, + "loss": 2.8386, + "step": 22799 + }, + { + "epoch": 1.4153578744801043, + "grad_norm": 0.15254758378770406, + "learning_rate": 6.348258080656136e-05, + "loss": 2.7618, + "step": 22800 + }, + { + "epoch": 1.4154199515798622, + "grad_norm": 0.16576417505164293, + "learning_rate": 6.347910302548698e-05, + "loss": 2.753, + "step": 22801 + }, + { + "epoch": 1.4154820286796201, + "grad_norm": 0.2182935055792221, + "learning_rate": 6.347562517408883e-05, + "loss": 2.8855, + "step": 22802 + }, + { + "epoch": 1.415544105779378, + "grad_norm": 0.1583112006079521, + "learning_rate": 6.347214725238507e-05, + "loss": 2.7872, + "step": 22803 + }, + { + "epoch": 1.4156061828791358, + "grad_norm": 0.2437785248082689, + "learning_rate": 6.346866926039386e-05, + "loss": 2.7454, + "step": 22804 + }, + { + "epoch": 1.4156682599788937, + "grad_norm": 0.15608004028668132, + "learning_rate": 6.346519119813332e-05, + "loss": 2.8061, + "step": 22805 + }, + { + "epoch": 1.4157303370786516, + "grad_norm": 0.1930736878044124, + "learning_rate": 6.34617130656216e-05, + "loss": 2.9014, + "step": 22806 + }, + { + "epoch": 1.4157924141784095, + "grad_norm": 0.1583647608676253, + "learning_rate": 6.345823486287687e-05, + "loss": 2.8447, + "step": 22807 + }, + { + "epoch": 1.4158544912781674, + "grad_norm": 0.21302321443571592, + "learning_rate": 6.345475658991727e-05, + "loss": 2.8818, + "step": 22808 + }, + { + "epoch": 1.4159165683779253, + "grad_norm": 0.1742093909106132, + "learning_rate": 6.345127824676093e-05, + "loss": 2.8364, + "step": 22809 + }, + { + "epoch": 1.4159786454776833, + "grad_norm": 0.15676811861199183, + "learning_rate": 6.3447799833426e-05, + "loss": 2.8411, + "step": 22810 + }, + { + "epoch": 1.4160407225774412, + "grad_norm": 0.1668218055782978, + "learning_rate": 6.344432134993063e-05, + "loss": 2.8806, + "step": 22811 + }, + { + "epoch": 1.416102799677199, + "grad_norm": 0.15698750014567472, + "learning_rate": 6.3440842796293e-05, + "loss": 2.84, + "step": 22812 + }, + { + "epoch": 1.416164876776957, + "grad_norm": 0.15979868236738246, + "learning_rate": 6.34373641725312e-05, + "loss": 2.8505, + "step": 22813 + }, + { + "epoch": 1.416226953876715, + "grad_norm": 0.17574514972015812, + "learning_rate": 6.343388547866343e-05, + "loss": 2.8101, + "step": 22814 + }, + { + "epoch": 1.4162890309764729, + "grad_norm": 0.1647178356580953, + "learning_rate": 6.34304067147078e-05, + "loss": 2.8205, + "step": 22815 + }, + { + "epoch": 1.4163511080762308, + "grad_norm": 0.15063009764921156, + "learning_rate": 6.34269278806825e-05, + "loss": 2.742, + "step": 22816 + }, + { + "epoch": 1.4164131851759887, + "grad_norm": 0.1615587557980446, + "learning_rate": 6.342344897660564e-05, + "loss": 2.8329, + "step": 22817 + }, + { + "epoch": 1.4164752622757466, + "grad_norm": 0.16892467769668554, + "learning_rate": 6.341997000249538e-05, + "loss": 2.9147, + "step": 22818 + }, + { + "epoch": 1.4165373393755043, + "grad_norm": 0.15417937138333906, + "learning_rate": 6.34164909583699e-05, + "loss": 2.7656, + "step": 22819 + }, + { + "epoch": 1.4165994164752622, + "grad_norm": 0.18026586415427975, + "learning_rate": 6.341301184424733e-05, + "loss": 2.8415, + "step": 22820 + }, + { + "epoch": 1.4166614935750201, + "grad_norm": 0.15850480479372045, + "learning_rate": 6.340953266014581e-05, + "loss": 2.9456, + "step": 22821 + }, + { + "epoch": 1.416723570674778, + "grad_norm": 0.1647705794828968, + "learning_rate": 6.340605340608349e-05, + "loss": 2.767, + "step": 22822 + }, + { + "epoch": 1.416785647774536, + "grad_norm": 0.1437871633568502, + "learning_rate": 6.340257408207855e-05, + "loss": 2.9165, + "step": 22823 + }, + { + "epoch": 1.416847724874294, + "grad_norm": 0.14456544935431362, + "learning_rate": 6.339909468814913e-05, + "loss": 2.8166, + "step": 22824 + }, + { + "epoch": 1.4169098019740518, + "grad_norm": 0.15586076176535535, + "learning_rate": 6.339561522431337e-05, + "loss": 2.8363, + "step": 22825 + }, + { + "epoch": 1.4169718790738097, + "grad_norm": 0.18969106970920765, + "learning_rate": 6.339213569058947e-05, + "loss": 2.8104, + "step": 22826 + }, + { + "epoch": 1.4170339561735676, + "grad_norm": 0.16437367606445732, + "learning_rate": 6.33886560869955e-05, + "loss": 2.7908, + "step": 22827 + }, + { + "epoch": 1.4170960332733253, + "grad_norm": 0.1626454761282626, + "learning_rate": 6.338517641354968e-05, + "loss": 2.7784, + "step": 22828 + }, + { + "epoch": 1.4171581103730833, + "grad_norm": 0.15461698957662542, + "learning_rate": 6.338169667027015e-05, + "loss": 2.8378, + "step": 22829 + }, + { + "epoch": 1.4172201874728412, + "grad_norm": 0.16092017622177984, + "learning_rate": 6.337821685717504e-05, + "loss": 2.8279, + "step": 22830 + }, + { + "epoch": 1.417282264572599, + "grad_norm": 0.15443247257133, + "learning_rate": 6.337473697428255e-05, + "loss": 2.7937, + "step": 22831 + }, + { + "epoch": 1.417344341672357, + "grad_norm": 0.14631710038454698, + "learning_rate": 6.33712570216108e-05, + "loss": 2.8107, + "step": 22832 + }, + { + "epoch": 1.417406418772115, + "grad_norm": 0.15383859977493336, + "learning_rate": 6.336777699917795e-05, + "loss": 2.8497, + "step": 22833 + }, + { + "epoch": 1.4174684958718728, + "grad_norm": 0.15114435591086253, + "learning_rate": 6.336429690700217e-05, + "loss": 2.8888, + "step": 22834 + }, + { + "epoch": 1.4175305729716308, + "grad_norm": 0.14415833330830116, + "learning_rate": 6.336081674510161e-05, + "loss": 2.8368, + "step": 22835 + }, + { + "epoch": 1.4175926500713887, + "grad_norm": 0.16086966672955003, + "learning_rate": 6.335733651349442e-05, + "loss": 2.808, + "step": 22836 + }, + { + "epoch": 1.4176547271711466, + "grad_norm": 0.1794797484162818, + "learning_rate": 6.335385621219877e-05, + "loss": 2.7992, + "step": 22837 + }, + { + "epoch": 1.4177168042709045, + "grad_norm": 0.16315216158954143, + "learning_rate": 6.33503758412328e-05, + "loss": 2.7606, + "step": 22838 + }, + { + "epoch": 1.4177788813706624, + "grad_norm": 0.16305148166167824, + "learning_rate": 6.334689540061468e-05, + "loss": 3.01, + "step": 22839 + }, + { + "epoch": 1.4178409584704204, + "grad_norm": 0.15390414948167674, + "learning_rate": 6.334341489036256e-05, + "loss": 2.8169, + "step": 22840 + }, + { + "epoch": 1.4179030355701783, + "grad_norm": 0.14947681106700636, + "learning_rate": 6.333993431049462e-05, + "loss": 2.7454, + "step": 22841 + }, + { + "epoch": 1.4179651126699362, + "grad_norm": 0.16603855086956623, + "learning_rate": 6.333645366102901e-05, + "loss": 2.8971, + "step": 22842 + }, + { + "epoch": 1.4180271897696939, + "grad_norm": 0.1487635432316952, + "learning_rate": 6.333297294198387e-05, + "loss": 2.825, + "step": 22843 + }, + { + "epoch": 1.4180892668694518, + "grad_norm": 0.16490960545801067, + "learning_rate": 6.332949215337738e-05, + "loss": 2.7054, + "step": 22844 + }, + { + "epoch": 1.4181513439692097, + "grad_norm": 0.1796143839147223, + "learning_rate": 6.332601129522769e-05, + "loss": 2.8759, + "step": 22845 + }, + { + "epoch": 1.4182134210689676, + "grad_norm": 0.15006239482450157, + "learning_rate": 6.332253036755296e-05, + "loss": 2.9133, + "step": 22846 + }, + { + "epoch": 1.4182754981687256, + "grad_norm": 0.18642212078323514, + "learning_rate": 6.331904937037137e-05, + "loss": 2.8611, + "step": 22847 + }, + { + "epoch": 1.4183375752684835, + "grad_norm": 0.15850811276371454, + "learning_rate": 6.331556830370105e-05, + "loss": 2.883, + "step": 22848 + }, + { + "epoch": 1.4183996523682414, + "grad_norm": 0.1576877547109711, + "learning_rate": 6.331208716756019e-05, + "loss": 2.7941, + "step": 22849 + }, + { + "epoch": 1.4184617294679993, + "grad_norm": 0.16027066168169912, + "learning_rate": 6.330860596196692e-05, + "loss": 2.7857, + "step": 22850 + }, + { + "epoch": 1.4185238065677572, + "grad_norm": 0.16384166091316135, + "learning_rate": 6.330512468693945e-05, + "loss": 2.8226, + "step": 22851 + }, + { + "epoch": 1.418585883667515, + "grad_norm": 0.1786830338336573, + "learning_rate": 6.33016433424959e-05, + "loss": 2.7972, + "step": 22852 + }, + { + "epoch": 1.4186479607672728, + "grad_norm": 0.1600807989147564, + "learning_rate": 6.329816192865443e-05, + "loss": 2.7826, + "step": 22853 + }, + { + "epoch": 1.4187100378670308, + "grad_norm": 0.1523537571383515, + "learning_rate": 6.329468044543326e-05, + "loss": 2.7684, + "step": 22854 + }, + { + "epoch": 1.4187721149667887, + "grad_norm": 0.17030704625173254, + "learning_rate": 6.329119889285047e-05, + "loss": 2.8047, + "step": 22855 + }, + { + "epoch": 1.4188341920665466, + "grad_norm": 0.15505447783931448, + "learning_rate": 6.32877172709243e-05, + "loss": 2.8459, + "step": 22856 + }, + { + "epoch": 1.4188962691663045, + "grad_norm": 0.1714981564521407, + "learning_rate": 6.328423557967287e-05, + "loss": 2.8336, + "step": 22857 + }, + { + "epoch": 1.4189583462660624, + "grad_norm": 0.15908105403817788, + "learning_rate": 6.328075381911435e-05, + "loss": 2.8649, + "step": 22858 + }, + { + "epoch": 1.4190204233658203, + "grad_norm": 0.16674544973291716, + "learning_rate": 6.327727198926694e-05, + "loss": 2.8689, + "step": 22859 + }, + { + "epoch": 1.4190825004655783, + "grad_norm": 0.19395136521221076, + "learning_rate": 6.327379009014877e-05, + "loss": 2.8654, + "step": 22860 + }, + { + "epoch": 1.4191445775653362, + "grad_norm": 0.1666724507003716, + "learning_rate": 6.3270308121778e-05, + "loss": 2.8053, + "step": 22861 + }, + { + "epoch": 1.419206654665094, + "grad_norm": 0.17566967177983722, + "learning_rate": 6.326682608417284e-05, + "loss": 2.8739, + "step": 22862 + }, + { + "epoch": 1.419268731764852, + "grad_norm": 0.1649110399099607, + "learning_rate": 6.32633439773514e-05, + "loss": 2.8791, + "step": 22863 + }, + { + "epoch": 1.41933080886461, + "grad_norm": 0.17992228220977619, + "learning_rate": 6.325986180133189e-05, + "loss": 2.8831, + "step": 22864 + }, + { + "epoch": 1.4193928859643679, + "grad_norm": 0.15464810341733656, + "learning_rate": 6.325637955613245e-05, + "loss": 2.8543, + "step": 22865 + }, + { + "epoch": 1.4194549630641258, + "grad_norm": 0.17729081359490695, + "learning_rate": 6.325289724177127e-05, + "loss": 2.846, + "step": 22866 + }, + { + "epoch": 1.4195170401638835, + "grad_norm": 0.16294300074340504, + "learning_rate": 6.32494148582665e-05, + "loss": 2.8641, + "step": 22867 + }, + { + "epoch": 1.4195791172636414, + "grad_norm": 0.17999243776674984, + "learning_rate": 6.324593240563632e-05, + "loss": 2.7481, + "step": 22868 + }, + { + "epoch": 1.4196411943633993, + "grad_norm": 0.18406289605776496, + "learning_rate": 6.32424498838989e-05, + "loss": 2.7812, + "step": 22869 + }, + { + "epoch": 1.4197032714631572, + "grad_norm": 0.160400077087932, + "learning_rate": 6.323896729307239e-05, + "loss": 2.831, + "step": 22870 + }, + { + "epoch": 1.4197653485629151, + "grad_norm": 0.1565272256094351, + "learning_rate": 6.3235484633175e-05, + "loss": 2.8153, + "step": 22871 + }, + { + "epoch": 1.419827425662673, + "grad_norm": 0.14947215054380494, + "learning_rate": 6.323200190422483e-05, + "loss": 2.8129, + "step": 22872 + }, + { + "epoch": 1.419889502762431, + "grad_norm": 0.15711785343384313, + "learning_rate": 6.322851910624013e-05, + "loss": 2.8209, + "step": 22873 + }, + { + "epoch": 1.419951579862189, + "grad_norm": 0.17238727543056748, + "learning_rate": 6.322503623923902e-05, + "loss": 2.8185, + "step": 22874 + }, + { + "epoch": 1.4200136569619468, + "grad_norm": 0.15164695235213912, + "learning_rate": 6.322155330323969e-05, + "loss": 2.7771, + "step": 22875 + }, + { + "epoch": 1.4200757340617045, + "grad_norm": 0.1557374833723862, + "learning_rate": 6.32180702982603e-05, + "loss": 2.8485, + "step": 22876 + }, + { + "epoch": 1.4201378111614624, + "grad_norm": 0.15281227398487063, + "learning_rate": 6.321458722431904e-05, + "loss": 2.7451, + "step": 22877 + }, + { + "epoch": 1.4201998882612203, + "grad_norm": 0.16264746306922842, + "learning_rate": 6.321110408143407e-05, + "loss": 2.7364, + "step": 22878 + }, + { + "epoch": 1.4202619653609783, + "grad_norm": 0.1526814166499459, + "learning_rate": 6.320762086962357e-05, + "loss": 2.7127, + "step": 22879 + }, + { + "epoch": 1.4203240424607362, + "grad_norm": 0.16866344024431745, + "learning_rate": 6.320413758890568e-05, + "loss": 2.7882, + "step": 22880 + }, + { + "epoch": 1.420386119560494, + "grad_norm": 0.16090646509351372, + "learning_rate": 6.320065423929862e-05, + "loss": 2.8412, + "step": 22881 + }, + { + "epoch": 1.420448196660252, + "grad_norm": 0.17465177699620327, + "learning_rate": 6.319717082082054e-05, + "loss": 2.8173, + "step": 22882 + }, + { + "epoch": 1.42051027376001, + "grad_norm": 0.1732207520479511, + "learning_rate": 6.31936873334896e-05, + "loss": 2.7139, + "step": 22883 + }, + { + "epoch": 1.4205723508597679, + "grad_norm": 0.18873461910130643, + "learning_rate": 6.3190203777324e-05, + "loss": 2.8011, + "step": 22884 + }, + { + "epoch": 1.4206344279595258, + "grad_norm": 0.16478428559063665, + "learning_rate": 6.318672015234192e-05, + "loss": 2.8029, + "step": 22885 + }, + { + "epoch": 1.4206965050592837, + "grad_norm": 0.2034352482467471, + "learning_rate": 6.31832364585615e-05, + "loss": 2.7958, + "step": 22886 + }, + { + "epoch": 1.4207585821590416, + "grad_norm": 0.15242639158548102, + "learning_rate": 6.317975269600093e-05, + "loss": 2.8226, + "step": 22887 + }, + { + "epoch": 1.4208206592587995, + "grad_norm": 0.15447946802338391, + "learning_rate": 6.317626886467841e-05, + "loss": 2.7245, + "step": 22888 + }, + { + "epoch": 1.4208827363585574, + "grad_norm": 0.16343492298939175, + "learning_rate": 6.31727849646121e-05, + "loss": 2.9063, + "step": 22889 + }, + { + "epoch": 1.4209448134583154, + "grad_norm": 0.15498567102518968, + "learning_rate": 6.316930099582015e-05, + "loss": 2.7772, + "step": 22890 + }, + { + "epoch": 1.421006890558073, + "grad_norm": 0.1661939953014943, + "learning_rate": 6.316581695832077e-05, + "loss": 2.8756, + "step": 22891 + }, + { + "epoch": 1.421068967657831, + "grad_norm": 0.16874133458671026, + "learning_rate": 6.316233285213213e-05, + "loss": 2.7667, + "step": 22892 + }, + { + "epoch": 1.421131044757589, + "grad_norm": 0.15957997898626075, + "learning_rate": 6.315884867727241e-05, + "loss": 2.7992, + "step": 22893 + }, + { + "epoch": 1.4211931218573468, + "grad_norm": 0.18002141113226786, + "learning_rate": 6.315536443375977e-05, + "loss": 2.7702, + "step": 22894 + }, + { + "epoch": 1.4212551989571047, + "grad_norm": 0.17675360785272834, + "learning_rate": 6.315188012161241e-05, + "loss": 2.8159, + "step": 22895 + }, + { + "epoch": 1.4213172760568626, + "grad_norm": 0.17140205292966224, + "learning_rate": 6.314839574084848e-05, + "loss": 2.7948, + "step": 22896 + }, + { + "epoch": 1.4213793531566206, + "grad_norm": 0.1502533137186938, + "learning_rate": 6.31449112914862e-05, + "loss": 2.79, + "step": 22897 + }, + { + "epoch": 1.4214414302563785, + "grad_norm": 0.1677551796940649, + "learning_rate": 6.314142677354371e-05, + "loss": 2.7993, + "step": 22898 + }, + { + "epoch": 1.4215035073561364, + "grad_norm": 0.16286268200408513, + "learning_rate": 6.313794218703922e-05, + "loss": 2.7338, + "step": 22899 + }, + { + "epoch": 1.421565584455894, + "grad_norm": 0.1689586082915653, + "learning_rate": 6.31344575319909e-05, + "loss": 2.8175, + "step": 22900 + }, + { + "epoch": 1.421627661555652, + "grad_norm": 0.14304772863753165, + "learning_rate": 6.313097280841691e-05, + "loss": 2.7879, + "step": 22901 + }, + { + "epoch": 1.42168973865541, + "grad_norm": 0.24982493947056153, + "learning_rate": 6.312748801633545e-05, + "loss": 2.9377, + "step": 22902 + }, + { + "epoch": 1.4217518157551678, + "grad_norm": 0.16258815359579734, + "learning_rate": 6.312400315576471e-05, + "loss": 2.7785, + "step": 22903 + }, + { + "epoch": 1.4218138928549258, + "grad_norm": 0.16291965642142706, + "learning_rate": 6.312051822672285e-05, + "loss": 2.8273, + "step": 22904 + }, + { + "epoch": 1.4218759699546837, + "grad_norm": 0.23738726211402555, + "learning_rate": 6.311703322922806e-05, + "loss": 2.8691, + "step": 22905 + }, + { + "epoch": 1.4219380470544416, + "grad_norm": 0.1444997430946779, + "learning_rate": 6.311354816329853e-05, + "loss": 2.8442, + "step": 22906 + }, + { + "epoch": 1.4220001241541995, + "grad_norm": 0.1967043398093042, + "learning_rate": 6.311006302895244e-05, + "loss": 2.8424, + "step": 22907 + }, + { + "epoch": 1.4220622012539574, + "grad_norm": 0.15951857154038684, + "learning_rate": 6.310657782620794e-05, + "loss": 2.8485, + "step": 22908 + }, + { + "epoch": 1.4221242783537154, + "grad_norm": 0.1606355276164506, + "learning_rate": 6.310309255508328e-05, + "loss": 2.8747, + "step": 22909 + }, + { + "epoch": 1.4221863554534733, + "grad_norm": 0.1601582096724292, + "learning_rate": 6.309960721559658e-05, + "loss": 2.8169, + "step": 22910 + }, + { + "epoch": 1.4222484325532312, + "grad_norm": 0.16891212357887248, + "learning_rate": 6.309612180776606e-05, + "loss": 2.8698, + "step": 22911 + }, + { + "epoch": 1.422310509652989, + "grad_norm": 0.18667138308891887, + "learning_rate": 6.30926363316099e-05, + "loss": 2.8462, + "step": 22912 + }, + { + "epoch": 1.422372586752747, + "grad_norm": 0.16642868602641006, + "learning_rate": 6.308915078714626e-05, + "loss": 2.8394, + "step": 22913 + }, + { + "epoch": 1.422434663852505, + "grad_norm": 0.1588352490453767, + "learning_rate": 6.308566517439335e-05, + "loss": 2.7481, + "step": 22914 + }, + { + "epoch": 1.4224967409522626, + "grad_norm": 0.15636389276948676, + "learning_rate": 6.308217949336935e-05, + "loss": 2.8418, + "step": 22915 + }, + { + "epoch": 1.4225588180520206, + "grad_norm": 0.16988932637637802, + "learning_rate": 6.307869374409243e-05, + "loss": 2.8741, + "step": 22916 + }, + { + "epoch": 1.4226208951517785, + "grad_norm": 0.15791932895042618, + "learning_rate": 6.307520792658082e-05, + "loss": 2.9301, + "step": 22917 + }, + { + "epoch": 1.4226829722515364, + "grad_norm": 0.1565037574134194, + "learning_rate": 6.307172204085265e-05, + "loss": 2.8055, + "step": 22918 + }, + { + "epoch": 1.4227450493512943, + "grad_norm": 0.16811848220989645, + "learning_rate": 6.306823608692615e-05, + "loss": 2.9505, + "step": 22919 + }, + { + "epoch": 1.4228071264510522, + "grad_norm": 0.15969355550247058, + "learning_rate": 6.306475006481949e-05, + "loss": 2.7856, + "step": 22920 + }, + { + "epoch": 1.4228692035508101, + "grad_norm": 0.14897928371241695, + "learning_rate": 6.306126397455083e-05, + "loss": 2.9049, + "step": 22921 + }, + { + "epoch": 1.422931280650568, + "grad_norm": 0.17765027456375085, + "learning_rate": 6.30577778161384e-05, + "loss": 2.8033, + "step": 22922 + }, + { + "epoch": 1.422993357750326, + "grad_norm": 0.1767339690339265, + "learning_rate": 6.305429158960038e-05, + "loss": 2.8713, + "step": 22923 + }, + { + "epoch": 1.4230554348500837, + "grad_norm": 0.1568802639089494, + "learning_rate": 6.305080529495495e-05, + "loss": 2.781, + "step": 22924 + }, + { + "epoch": 1.4231175119498416, + "grad_norm": 0.14869340997225483, + "learning_rate": 6.304731893222031e-05, + "loss": 2.7688, + "step": 22925 + }, + { + "epoch": 1.4231795890495995, + "grad_norm": 0.17442408209339538, + "learning_rate": 6.304383250141462e-05, + "loss": 2.8437, + "step": 22926 + }, + { + "epoch": 1.4232416661493574, + "grad_norm": 0.15047147980112238, + "learning_rate": 6.304034600255611e-05, + "loss": 2.8089, + "step": 22927 + }, + { + "epoch": 1.4233037432491153, + "grad_norm": 0.14596485837217174, + "learning_rate": 6.303685943566293e-05, + "loss": 2.8674, + "step": 22928 + }, + { + "epoch": 1.4233658203488733, + "grad_norm": 0.19663272967608306, + "learning_rate": 6.30333728007533e-05, + "loss": 2.8029, + "step": 22929 + }, + { + "epoch": 1.4234278974486312, + "grad_norm": 0.15997429723226086, + "learning_rate": 6.302988609784539e-05, + "loss": 2.8596, + "step": 22930 + }, + { + "epoch": 1.423489974548389, + "grad_norm": 0.1578500540117752, + "learning_rate": 6.302639932695742e-05, + "loss": 2.8019, + "step": 22931 + }, + { + "epoch": 1.423552051648147, + "grad_norm": 0.14038227909862933, + "learning_rate": 6.302291248810754e-05, + "loss": 2.8696, + "step": 22932 + }, + { + "epoch": 1.423614128747905, + "grad_norm": 0.14867388575081672, + "learning_rate": 6.301942558131398e-05, + "loss": 2.8518, + "step": 22933 + }, + { + "epoch": 1.4236762058476629, + "grad_norm": 0.1518745594351858, + "learning_rate": 6.301593860659492e-05, + "loss": 2.7481, + "step": 22934 + }, + { + "epoch": 1.4237382829474208, + "grad_norm": 0.15009962141429548, + "learning_rate": 6.301245156396854e-05, + "loss": 2.8328, + "step": 22935 + }, + { + "epoch": 1.4238003600471787, + "grad_norm": 0.1429389607292733, + "learning_rate": 6.300896445345304e-05, + "loss": 2.8905, + "step": 22936 + }, + { + "epoch": 1.4238624371469366, + "grad_norm": 0.15376294382008182, + "learning_rate": 6.30054772750666e-05, + "loss": 2.8187, + "step": 22937 + }, + { + "epoch": 1.4239245142466945, + "grad_norm": 0.152328936761424, + "learning_rate": 6.300199002882745e-05, + "loss": 2.8294, + "step": 22938 + }, + { + "epoch": 1.4239865913464522, + "grad_norm": 0.14769652791142382, + "learning_rate": 6.299850271475376e-05, + "loss": 2.8722, + "step": 22939 + }, + { + "epoch": 1.4240486684462101, + "grad_norm": 0.16933415707858884, + "learning_rate": 6.299501533286371e-05, + "loss": 2.8673, + "step": 22940 + }, + { + "epoch": 1.424110745545968, + "grad_norm": 0.15020864797057462, + "learning_rate": 6.299152788317553e-05, + "loss": 2.751, + "step": 22941 + }, + { + "epoch": 1.424172822645726, + "grad_norm": 0.14066885555738112, + "learning_rate": 6.298804036570739e-05, + "loss": 2.8356, + "step": 22942 + }, + { + "epoch": 1.424234899745484, + "grad_norm": 0.1724703026947199, + "learning_rate": 6.298455278047748e-05, + "loss": 2.8992, + "step": 22943 + }, + { + "epoch": 1.4242969768452418, + "grad_norm": 0.14463594871544938, + "learning_rate": 6.298106512750403e-05, + "loss": 2.7976, + "step": 22944 + }, + { + "epoch": 1.4243590539449997, + "grad_norm": 0.1487925887547524, + "learning_rate": 6.29775774068052e-05, + "loss": 2.7785, + "step": 22945 + }, + { + "epoch": 1.4244211310447576, + "grad_norm": 0.14712569041442602, + "learning_rate": 6.297408961839918e-05, + "loss": 2.7809, + "step": 22946 + }, + { + "epoch": 1.4244832081445156, + "grad_norm": 0.14632910446556094, + "learning_rate": 6.29706017623042e-05, + "loss": 2.82, + "step": 22947 + }, + { + "epoch": 1.4245452852442733, + "grad_norm": 0.17598017986275843, + "learning_rate": 6.296711383853844e-05, + "loss": 2.796, + "step": 22948 + }, + { + "epoch": 1.4246073623440312, + "grad_norm": 0.14678939851051093, + "learning_rate": 6.29636258471201e-05, + "loss": 2.9111, + "step": 22949 + }, + { + "epoch": 1.424669439443789, + "grad_norm": 0.17531199869978387, + "learning_rate": 6.296013778806738e-05, + "loss": 2.8362, + "step": 22950 + }, + { + "epoch": 1.424731516543547, + "grad_norm": 0.1464767263322952, + "learning_rate": 6.295664966139847e-05, + "loss": 2.8357, + "step": 22951 + }, + { + "epoch": 1.424793593643305, + "grad_norm": 0.14101243292875668, + "learning_rate": 6.295316146713157e-05, + "loss": 2.7984, + "step": 22952 + }, + { + "epoch": 1.4248556707430629, + "grad_norm": 0.14553997145255335, + "learning_rate": 6.294967320528489e-05, + "loss": 2.8281, + "step": 22953 + }, + { + "epoch": 1.4249177478428208, + "grad_norm": 0.1461535405394582, + "learning_rate": 6.294618487587663e-05, + "loss": 2.7653, + "step": 22954 + }, + { + "epoch": 1.4249798249425787, + "grad_norm": 0.15772205962216976, + "learning_rate": 6.294269647892494e-05, + "loss": 2.7705, + "step": 22955 + }, + { + "epoch": 1.4250419020423366, + "grad_norm": 0.1753621881196973, + "learning_rate": 6.29392080144481e-05, + "loss": 2.7575, + "step": 22956 + }, + { + "epoch": 1.4251039791420945, + "grad_norm": 0.1756847116499444, + "learning_rate": 6.293571948246427e-05, + "loss": 2.9413, + "step": 22957 + }, + { + "epoch": 1.4251660562418524, + "grad_norm": 0.19518899827614802, + "learning_rate": 6.293223088299164e-05, + "loss": 2.8463, + "step": 22958 + }, + { + "epoch": 1.4252281333416104, + "grad_norm": 0.15823381032171527, + "learning_rate": 6.292874221604842e-05, + "loss": 2.8666, + "step": 22959 + }, + { + "epoch": 1.4252902104413683, + "grad_norm": 0.15491203018087377, + "learning_rate": 6.292525348165281e-05, + "loss": 2.8005, + "step": 22960 + }, + { + "epoch": 1.4253522875411262, + "grad_norm": 0.15168041307667288, + "learning_rate": 6.292176467982303e-05, + "loss": 2.8774, + "step": 22961 + }, + { + "epoch": 1.4254143646408841, + "grad_norm": 0.21883840137064434, + "learning_rate": 6.291827581057725e-05, + "loss": 2.7487, + "step": 22962 + }, + { + "epoch": 1.4254764417406418, + "grad_norm": 0.13401800601040945, + "learning_rate": 6.29147868739337e-05, + "loss": 2.7965, + "step": 22963 + }, + { + "epoch": 1.4255385188403997, + "grad_norm": 0.15974595268216232, + "learning_rate": 6.291129786991058e-05, + "loss": 2.8205, + "step": 22964 + }, + { + "epoch": 1.4256005959401576, + "grad_norm": 0.150530044533415, + "learning_rate": 6.290780879852606e-05, + "loss": 2.8205, + "step": 22965 + }, + { + "epoch": 1.4256626730399156, + "grad_norm": 0.17230759712690827, + "learning_rate": 6.29043196597984e-05, + "loss": 2.8076, + "step": 22966 + }, + { + "epoch": 1.4257247501396735, + "grad_norm": 0.1571711389224986, + "learning_rate": 6.290083045374575e-05, + "loss": 2.8562, + "step": 22967 + }, + { + "epoch": 1.4257868272394314, + "grad_norm": 0.1490638343183509, + "learning_rate": 6.289734118038634e-05, + "loss": 2.8318, + "step": 22968 + }, + { + "epoch": 1.4258489043391893, + "grad_norm": 0.18022249231410212, + "learning_rate": 6.289385183973838e-05, + "loss": 2.865, + "step": 22969 + }, + { + "epoch": 1.4259109814389472, + "grad_norm": 0.14816016087724007, + "learning_rate": 6.289036243182003e-05, + "loss": 2.8689, + "step": 22970 + }, + { + "epoch": 1.4259730585387052, + "grad_norm": 0.1941566598821268, + "learning_rate": 6.288687295664956e-05, + "loss": 2.783, + "step": 22971 + }, + { + "epoch": 1.4260351356384628, + "grad_norm": 0.15064130673048004, + "learning_rate": 6.288338341424515e-05, + "loss": 2.8311, + "step": 22972 + }, + { + "epoch": 1.4260972127382208, + "grad_norm": 0.1604456090259872, + "learning_rate": 6.287989380462498e-05, + "loss": 2.8064, + "step": 22973 + }, + { + "epoch": 1.4261592898379787, + "grad_norm": 0.1530052458796438, + "learning_rate": 6.287640412780728e-05, + "loss": 2.7312, + "step": 22974 + }, + { + "epoch": 1.4262213669377366, + "grad_norm": 0.17097042747299368, + "learning_rate": 6.287291438381026e-05, + "loss": 2.8666, + "step": 22975 + }, + { + "epoch": 1.4262834440374945, + "grad_norm": 0.16441530939102006, + "learning_rate": 6.286942457265214e-05, + "loss": 2.8251, + "step": 22976 + }, + { + "epoch": 1.4263455211372524, + "grad_norm": 0.153827617619241, + "learning_rate": 6.286593469435108e-05, + "loss": 2.8726, + "step": 22977 + }, + { + "epoch": 1.4264075982370104, + "grad_norm": 0.15110441595300028, + "learning_rate": 6.286244474892533e-05, + "loss": 2.8522, + "step": 22978 + }, + { + "epoch": 1.4264696753367683, + "grad_norm": 0.19152232951513448, + "learning_rate": 6.285895473639308e-05, + "loss": 2.7107, + "step": 22979 + }, + { + "epoch": 1.4265317524365262, + "grad_norm": 0.14317270307500213, + "learning_rate": 6.285546465677254e-05, + "loss": 2.8107, + "step": 22980 + }, + { + "epoch": 1.426593829536284, + "grad_norm": 0.2177826832403785, + "learning_rate": 6.285197451008194e-05, + "loss": 2.8556, + "step": 22981 + }, + { + "epoch": 1.426655906636042, + "grad_norm": 0.14490669047552163, + "learning_rate": 6.284848429633943e-05, + "loss": 2.9297, + "step": 22982 + }, + { + "epoch": 1.4267179837358, + "grad_norm": 0.16682335198979106, + "learning_rate": 6.284499401556328e-05, + "loss": 2.7994, + "step": 22983 + }, + { + "epoch": 1.4267800608355579, + "grad_norm": 0.16141353198387495, + "learning_rate": 6.284150366777168e-05, + "loss": 2.8263, + "step": 22984 + }, + { + "epoch": 1.4268421379353158, + "grad_norm": 0.17708554102886542, + "learning_rate": 6.283801325298284e-05, + "loss": 2.9111, + "step": 22985 + }, + { + "epoch": 1.4269042150350737, + "grad_norm": 0.1855085935754934, + "learning_rate": 6.283452277121495e-05, + "loss": 2.9191, + "step": 22986 + }, + { + "epoch": 1.4269662921348314, + "grad_norm": 0.15421711997163054, + "learning_rate": 6.283103222248626e-05, + "loss": 2.8039, + "step": 22987 + }, + { + "epoch": 1.4270283692345893, + "grad_norm": 0.19136798242550646, + "learning_rate": 6.282754160681494e-05, + "loss": 2.8123, + "step": 22988 + }, + { + "epoch": 1.4270904463343472, + "grad_norm": 0.2263024745333458, + "learning_rate": 6.282405092421923e-05, + "loss": 2.9, + "step": 22989 + }, + { + "epoch": 1.4271525234341051, + "grad_norm": 0.18909174086121663, + "learning_rate": 6.282056017471735e-05, + "loss": 2.7488, + "step": 22990 + }, + { + "epoch": 1.427214600533863, + "grad_norm": 0.19618793748416108, + "learning_rate": 6.281706935832748e-05, + "loss": 2.8987, + "step": 22991 + }, + { + "epoch": 1.427276677633621, + "grad_norm": 0.15468680328636095, + "learning_rate": 6.281357847506785e-05, + "loss": 2.6888, + "step": 22992 + }, + { + "epoch": 1.427338754733379, + "grad_norm": 0.15752697316308076, + "learning_rate": 6.281008752495666e-05, + "loss": 2.7322, + "step": 22993 + }, + { + "epoch": 1.4274008318331368, + "grad_norm": 0.21817824028208024, + "learning_rate": 6.280659650801216e-05, + "loss": 2.8085, + "step": 22994 + }, + { + "epoch": 1.4274629089328947, + "grad_norm": 0.2583932981318792, + "learning_rate": 6.28031054242525e-05, + "loss": 2.7376, + "step": 22995 + }, + { + "epoch": 1.4275249860326524, + "grad_norm": 0.1489185831800588, + "learning_rate": 6.279961427369596e-05, + "loss": 2.8028, + "step": 22996 + }, + { + "epoch": 1.4275870631324103, + "grad_norm": 0.15474976095803553, + "learning_rate": 6.279612305636073e-05, + "loss": 2.8454, + "step": 22997 + }, + { + "epoch": 1.4276491402321683, + "grad_norm": 0.16829568627436775, + "learning_rate": 6.2792631772265e-05, + "loss": 2.8438, + "step": 22998 + }, + { + "epoch": 1.4277112173319262, + "grad_norm": 0.18055446496556699, + "learning_rate": 6.278914042142701e-05, + "loss": 2.823, + "step": 22999 + }, + { + "epoch": 1.427773294431684, + "grad_norm": 0.17535367461776752, + "learning_rate": 6.278564900386496e-05, + "loss": 2.7858, + "step": 23000 + }, + { + "epoch": 1.427835371531442, + "grad_norm": 0.23780925311154102, + "learning_rate": 6.27821575195971e-05, + "loss": 2.8246, + "step": 23001 + }, + { + "epoch": 1.4278974486312, + "grad_norm": 0.22291173832932823, + "learning_rate": 6.277866596864159e-05, + "loss": 2.802, + "step": 23002 + }, + { + "epoch": 1.4279595257309579, + "grad_norm": 0.16441746186856662, + "learning_rate": 6.277517435101669e-05, + "loss": 2.8678, + "step": 23003 + }, + { + "epoch": 1.4280216028307158, + "grad_norm": 0.15655634769945448, + "learning_rate": 6.277168266674063e-05, + "loss": 2.9025, + "step": 23004 + }, + { + "epoch": 1.4280836799304737, + "grad_norm": 0.18409708705215866, + "learning_rate": 6.276819091583157e-05, + "loss": 2.7227, + "step": 23005 + }, + { + "epoch": 1.4281457570302316, + "grad_norm": 0.15847219732420684, + "learning_rate": 6.276469909830776e-05, + "loss": 2.7076, + "step": 23006 + }, + { + "epoch": 1.4282078341299895, + "grad_norm": 0.2039061619403985, + "learning_rate": 6.276120721418744e-05, + "loss": 2.8124, + "step": 23007 + }, + { + "epoch": 1.4282699112297474, + "grad_norm": 0.16425195041678584, + "learning_rate": 6.275771526348879e-05, + "loss": 2.8922, + "step": 23008 + }, + { + "epoch": 1.4283319883295054, + "grad_norm": 0.16717036261918655, + "learning_rate": 6.275422324623003e-05, + "loss": 2.9583, + "step": 23009 + }, + { + "epoch": 1.4283940654292633, + "grad_norm": 0.14847636341502132, + "learning_rate": 6.27507311624294e-05, + "loss": 2.861, + "step": 23010 + }, + { + "epoch": 1.428456142529021, + "grad_norm": 0.16232045919860888, + "learning_rate": 6.274723901210511e-05, + "loss": 2.7701, + "step": 23011 + }, + { + "epoch": 1.428518219628779, + "grad_norm": 0.15132729404558085, + "learning_rate": 6.274374679527539e-05, + "loss": 2.8239, + "step": 23012 + }, + { + "epoch": 1.4285802967285368, + "grad_norm": 0.16793631085158295, + "learning_rate": 6.274025451195842e-05, + "loss": 2.8695, + "step": 23013 + }, + { + "epoch": 1.4286423738282947, + "grad_norm": 0.15835907396420593, + "learning_rate": 6.273676216217249e-05, + "loss": 2.8238, + "step": 23014 + }, + { + "epoch": 1.4287044509280526, + "grad_norm": 0.16083043365652316, + "learning_rate": 6.273326974593576e-05, + "loss": 2.8707, + "step": 23015 + }, + { + "epoch": 1.4287665280278106, + "grad_norm": 0.1670982347585356, + "learning_rate": 6.272977726326646e-05, + "loss": 2.9816, + "step": 23016 + }, + { + "epoch": 1.4288286051275685, + "grad_norm": 0.17014982017825106, + "learning_rate": 6.272628471418285e-05, + "loss": 2.7153, + "step": 23017 + }, + { + "epoch": 1.4288906822273264, + "grad_norm": 0.15670086660687152, + "learning_rate": 6.27227920987031e-05, + "loss": 2.8609, + "step": 23018 + }, + { + "epoch": 1.4289527593270843, + "grad_norm": 0.1432365148277735, + "learning_rate": 6.271929941684545e-05, + "loss": 2.8136, + "step": 23019 + }, + { + "epoch": 1.429014836426842, + "grad_norm": 0.15664599426085557, + "learning_rate": 6.271580666862813e-05, + "loss": 2.8125, + "step": 23020 + }, + { + "epoch": 1.4290769135266, + "grad_norm": 0.15155995922318566, + "learning_rate": 6.271231385406938e-05, + "loss": 2.8043, + "step": 23021 + }, + { + "epoch": 1.4291389906263579, + "grad_norm": 0.15315949455323968, + "learning_rate": 6.27088209731874e-05, + "loss": 2.7841, + "step": 23022 + }, + { + "epoch": 1.4292010677261158, + "grad_norm": 0.1443984155843221, + "learning_rate": 6.27053280260004e-05, + "loss": 2.8354, + "step": 23023 + }, + { + "epoch": 1.4292631448258737, + "grad_norm": 0.15621024460482666, + "learning_rate": 6.270183501252662e-05, + "loss": 2.8024, + "step": 23024 + }, + { + "epoch": 1.4293252219256316, + "grad_norm": 0.15888493987660288, + "learning_rate": 6.26983419327843e-05, + "loss": 2.8563, + "step": 23025 + }, + { + "epoch": 1.4293872990253895, + "grad_norm": 0.15632346929378477, + "learning_rate": 6.269484878679163e-05, + "loss": 2.8261, + "step": 23026 + }, + { + "epoch": 1.4294493761251474, + "grad_norm": 0.17524050084094422, + "learning_rate": 6.269135557456686e-05, + "loss": 2.7953, + "step": 23027 + }, + { + "epoch": 1.4295114532249054, + "grad_norm": 0.21545011608554884, + "learning_rate": 6.268786229612821e-05, + "loss": 2.8651, + "step": 23028 + }, + { + "epoch": 1.4295735303246633, + "grad_norm": 0.1688266513836056, + "learning_rate": 6.26843689514939e-05, + "loss": 2.9318, + "step": 23029 + }, + { + "epoch": 1.4296356074244212, + "grad_norm": 0.14852136827588266, + "learning_rate": 6.268087554068215e-05, + "loss": 2.8269, + "step": 23030 + }, + { + "epoch": 1.4296976845241791, + "grad_norm": 0.1997477669429928, + "learning_rate": 6.267738206371119e-05, + "loss": 2.7934, + "step": 23031 + }, + { + "epoch": 1.429759761623937, + "grad_norm": 0.17887958011321586, + "learning_rate": 6.267388852059926e-05, + "loss": 2.8378, + "step": 23032 + }, + { + "epoch": 1.429821838723695, + "grad_norm": 0.17462077135619888, + "learning_rate": 6.267039491136459e-05, + "loss": 2.8505, + "step": 23033 + }, + { + "epoch": 1.4298839158234529, + "grad_norm": 0.149782567834419, + "learning_rate": 6.266690123602538e-05, + "loss": 2.834, + "step": 23034 + }, + { + "epoch": 1.4299459929232106, + "grad_norm": 0.15345234865101554, + "learning_rate": 6.266340749459985e-05, + "loss": 2.8115, + "step": 23035 + }, + { + "epoch": 1.4300080700229685, + "grad_norm": 0.154164625784751, + "learning_rate": 6.265991368710628e-05, + "loss": 2.6949, + "step": 23036 + }, + { + "epoch": 1.4300701471227264, + "grad_norm": 0.15996234483297742, + "learning_rate": 6.265641981356286e-05, + "loss": 2.9318, + "step": 23037 + }, + { + "epoch": 1.4301322242224843, + "grad_norm": 0.19586030431100154, + "learning_rate": 6.265292587398781e-05, + "loss": 2.7243, + "step": 23038 + }, + { + "epoch": 1.4301943013222422, + "grad_norm": 0.20264141582882092, + "learning_rate": 6.26494318683994e-05, + "loss": 2.9804, + "step": 23039 + }, + { + "epoch": 1.4302563784220002, + "grad_norm": 0.1860292088316365, + "learning_rate": 6.264593779681582e-05, + "loss": 2.8511, + "step": 23040 + }, + { + "epoch": 1.430318455521758, + "grad_norm": 0.16374826423984493, + "learning_rate": 6.264244365925532e-05, + "loss": 2.8045, + "step": 23041 + }, + { + "epoch": 1.430380532621516, + "grad_norm": 0.17568246926949196, + "learning_rate": 6.263894945573612e-05, + "loss": 2.9271, + "step": 23042 + }, + { + "epoch": 1.4304426097212737, + "grad_norm": 0.1544989337243936, + "learning_rate": 6.263545518627643e-05, + "loss": 2.8815, + "step": 23043 + }, + { + "epoch": 1.4305046868210316, + "grad_norm": 0.18904640349917262, + "learning_rate": 6.263196085089454e-05, + "loss": 2.7789, + "step": 23044 + }, + { + "epoch": 1.4305667639207895, + "grad_norm": 0.17006674981265998, + "learning_rate": 6.262846644960862e-05, + "loss": 2.8492, + "step": 23045 + }, + { + "epoch": 1.4306288410205474, + "grad_norm": 0.153882932106797, + "learning_rate": 6.262497198243694e-05, + "loss": 2.7881, + "step": 23046 + }, + { + "epoch": 1.4306909181203054, + "grad_norm": 0.16771175440760216, + "learning_rate": 6.26214774493977e-05, + "loss": 2.8435, + "step": 23047 + }, + { + "epoch": 1.4307529952200633, + "grad_norm": 0.18185920279676862, + "learning_rate": 6.261798285050915e-05, + "loss": 2.8651, + "step": 23048 + }, + { + "epoch": 1.4308150723198212, + "grad_norm": 0.17665696823929639, + "learning_rate": 6.261448818578952e-05, + "loss": 2.8881, + "step": 23049 + }, + { + "epoch": 1.430877149419579, + "grad_norm": 0.175204326350976, + "learning_rate": 6.261099345525705e-05, + "loss": 2.7698, + "step": 23050 + }, + { + "epoch": 1.430939226519337, + "grad_norm": 0.16247204920747713, + "learning_rate": 6.260749865892995e-05, + "loss": 2.8556, + "step": 23051 + }, + { + "epoch": 1.431001303619095, + "grad_norm": 0.17125954759741277, + "learning_rate": 6.260400379682649e-05, + "loss": 2.8568, + "step": 23052 + }, + { + "epoch": 1.4310633807188529, + "grad_norm": 0.2286288722546178, + "learning_rate": 6.260050886896487e-05, + "loss": 2.8495, + "step": 23053 + }, + { + "epoch": 1.4311254578186108, + "grad_norm": 0.17474544842670053, + "learning_rate": 6.259701387536334e-05, + "loss": 2.7998, + "step": 23054 + }, + { + "epoch": 1.4311875349183687, + "grad_norm": 0.20261478535169755, + "learning_rate": 6.259351881604013e-05, + "loss": 2.9084, + "step": 23055 + }, + { + "epoch": 1.4312496120181266, + "grad_norm": 0.16568430207791474, + "learning_rate": 6.259002369101347e-05, + "loss": 2.8037, + "step": 23056 + }, + { + "epoch": 1.4313116891178845, + "grad_norm": 0.15867889354966017, + "learning_rate": 6.258652850030163e-05, + "loss": 2.8124, + "step": 23057 + }, + { + "epoch": 1.4313737662176422, + "grad_norm": 0.16761613028056654, + "learning_rate": 6.258303324392279e-05, + "loss": 2.8811, + "step": 23058 + }, + { + "epoch": 1.4314358433174001, + "grad_norm": 0.17092086644811874, + "learning_rate": 6.25795379218952e-05, + "loss": 2.8477, + "step": 23059 + }, + { + "epoch": 1.431497920417158, + "grad_norm": 0.1557288060348911, + "learning_rate": 6.257604253423712e-05, + "loss": 2.8505, + "step": 23060 + }, + { + "epoch": 1.431559997516916, + "grad_norm": 0.15209280644812492, + "learning_rate": 6.257254708096677e-05, + "loss": 2.7858, + "step": 23061 + }, + { + "epoch": 1.431622074616674, + "grad_norm": 0.15424275354454345, + "learning_rate": 6.25690515621024e-05, + "loss": 2.8091, + "step": 23062 + }, + { + "epoch": 1.4316841517164318, + "grad_norm": 0.14503628540873512, + "learning_rate": 6.256555597766223e-05, + "loss": 2.7707, + "step": 23063 + }, + { + "epoch": 1.4317462288161897, + "grad_norm": 0.15869162339415732, + "learning_rate": 6.256206032766451e-05, + "loss": 2.9133, + "step": 23064 + }, + { + "epoch": 1.4318083059159477, + "grad_norm": 0.1572804151441252, + "learning_rate": 6.255856461212746e-05, + "loss": 2.837, + "step": 23065 + }, + { + "epoch": 1.4318703830157056, + "grad_norm": 0.14446215653437666, + "learning_rate": 6.255506883106934e-05, + "loss": 2.8528, + "step": 23066 + }, + { + "epoch": 1.4319324601154633, + "grad_norm": 0.15959476694448216, + "learning_rate": 6.255157298450836e-05, + "loss": 2.7392, + "step": 23067 + }, + { + "epoch": 1.4319945372152212, + "grad_norm": 0.17801355087744758, + "learning_rate": 6.254807707246279e-05, + "loss": 2.8133, + "step": 23068 + }, + { + "epoch": 1.432056614314979, + "grad_norm": 0.1515672837810046, + "learning_rate": 6.254458109495086e-05, + "loss": 2.853, + "step": 23069 + }, + { + "epoch": 1.432118691414737, + "grad_norm": 0.14597736991309918, + "learning_rate": 6.254108505199078e-05, + "loss": 2.8056, + "step": 23070 + }, + { + "epoch": 1.432180768514495, + "grad_norm": 0.14726062457552608, + "learning_rate": 6.253758894360085e-05, + "loss": 2.8156, + "step": 23071 + }, + { + "epoch": 1.4322428456142529, + "grad_norm": 0.15690897622101946, + "learning_rate": 6.253409276979926e-05, + "loss": 2.8431, + "step": 23072 + }, + { + "epoch": 1.4323049227140108, + "grad_norm": 0.15117349845546788, + "learning_rate": 6.253059653060426e-05, + "loss": 2.8682, + "step": 23073 + }, + { + "epoch": 1.4323669998137687, + "grad_norm": 0.13895947894358576, + "learning_rate": 6.25271002260341e-05, + "loss": 2.733, + "step": 23074 + }, + { + "epoch": 1.4324290769135266, + "grad_norm": 0.14706338255666482, + "learning_rate": 6.252360385610703e-05, + "loss": 2.8799, + "step": 23075 + }, + { + "epoch": 1.4324911540132845, + "grad_norm": 0.156528915374155, + "learning_rate": 6.252010742084127e-05, + "loss": 2.8776, + "step": 23076 + }, + { + "epoch": 1.4325532311130424, + "grad_norm": 0.16073420378463674, + "learning_rate": 6.251661092025505e-05, + "loss": 2.8585, + "step": 23077 + }, + { + "epoch": 1.4326153082128004, + "grad_norm": 0.14617611557290328, + "learning_rate": 6.251311435436666e-05, + "loss": 2.8777, + "step": 23078 + }, + { + "epoch": 1.4326773853125583, + "grad_norm": 0.1561755002752199, + "learning_rate": 6.25096177231943e-05, + "loss": 2.8419, + "step": 23079 + }, + { + "epoch": 1.4327394624123162, + "grad_norm": 0.14845957717348565, + "learning_rate": 6.250612102675623e-05, + "loss": 2.769, + "step": 23080 + }, + { + "epoch": 1.4328015395120741, + "grad_norm": 0.1522663421085513, + "learning_rate": 6.25026242650707e-05, + "loss": 2.8394, + "step": 23081 + }, + { + "epoch": 1.4328636166118318, + "grad_norm": 0.17640735595464566, + "learning_rate": 6.249912743815595e-05, + "loss": 2.8437, + "step": 23082 + }, + { + "epoch": 1.4329256937115897, + "grad_norm": 0.15165666771467068, + "learning_rate": 6.249563054603019e-05, + "loss": 2.7085, + "step": 23083 + }, + { + "epoch": 1.4329877708113476, + "grad_norm": 0.14226785631854763, + "learning_rate": 6.24921335887117e-05, + "loss": 2.8532, + "step": 23084 + }, + { + "epoch": 1.4330498479111056, + "grad_norm": 0.16242157506249913, + "learning_rate": 6.248863656621872e-05, + "loss": 2.7928, + "step": 23085 + }, + { + "epoch": 1.4331119250108635, + "grad_norm": 0.1575663845150097, + "learning_rate": 6.24851394785695e-05, + "loss": 2.7659, + "step": 23086 + }, + { + "epoch": 1.4331740021106214, + "grad_norm": 0.17738739140490906, + "learning_rate": 6.248164232578227e-05, + "loss": 2.8506, + "step": 23087 + }, + { + "epoch": 1.4332360792103793, + "grad_norm": 0.1638483341269863, + "learning_rate": 6.247814510787528e-05, + "loss": 2.8242, + "step": 23088 + }, + { + "epoch": 1.4332981563101372, + "grad_norm": 0.15616278002171932, + "learning_rate": 6.247464782486679e-05, + "loss": 2.8984, + "step": 23089 + }, + { + "epoch": 1.4333602334098952, + "grad_norm": 0.16120634424142283, + "learning_rate": 6.247115047677502e-05, + "loss": 2.8915, + "step": 23090 + }, + { + "epoch": 1.4334223105096529, + "grad_norm": 0.15756766121410867, + "learning_rate": 6.246765306361824e-05, + "loss": 2.8653, + "step": 23091 + }, + { + "epoch": 1.4334843876094108, + "grad_norm": 0.15396710054572063, + "learning_rate": 6.246415558541467e-05, + "loss": 2.9159, + "step": 23092 + }, + { + "epoch": 1.4335464647091687, + "grad_norm": 0.15964568563002546, + "learning_rate": 6.24606580421826e-05, + "loss": 2.8813, + "step": 23093 + }, + { + "epoch": 1.4336085418089266, + "grad_norm": 0.17152995457492787, + "learning_rate": 6.245716043394024e-05, + "loss": 2.8301, + "step": 23094 + }, + { + "epoch": 1.4336706189086845, + "grad_norm": 0.1564853445547752, + "learning_rate": 6.245366276070585e-05, + "loss": 2.8584, + "step": 23095 + }, + { + "epoch": 1.4337326960084424, + "grad_norm": 0.16394857015729347, + "learning_rate": 6.245016502249767e-05, + "loss": 2.8262, + "step": 23096 + }, + { + "epoch": 1.4337947731082004, + "grad_norm": 0.15264245850454383, + "learning_rate": 6.244666721933396e-05, + "loss": 2.8203, + "step": 23097 + }, + { + "epoch": 1.4338568502079583, + "grad_norm": 0.1543209986600239, + "learning_rate": 6.244316935123297e-05, + "loss": 2.807, + "step": 23098 + }, + { + "epoch": 1.4339189273077162, + "grad_norm": 0.1471810052232482, + "learning_rate": 6.243967141821295e-05, + "loss": 2.8369, + "step": 23099 + }, + { + "epoch": 1.4339810044074741, + "grad_norm": 0.1691491655655874, + "learning_rate": 6.243617342029215e-05, + "loss": 2.8042, + "step": 23100 + }, + { + "epoch": 1.434043081507232, + "grad_norm": 0.14904505221700645, + "learning_rate": 6.243267535748879e-05, + "loss": 2.8067, + "step": 23101 + }, + { + "epoch": 1.43410515860699, + "grad_norm": 0.15087241248319488, + "learning_rate": 6.242917722982116e-05, + "loss": 2.8556, + "step": 23102 + }, + { + "epoch": 1.4341672357067479, + "grad_norm": 0.15727827749886344, + "learning_rate": 6.242567903730749e-05, + "loss": 2.8368, + "step": 23103 + }, + { + "epoch": 1.4342293128065058, + "grad_norm": 0.15430562682621787, + "learning_rate": 6.242218077996604e-05, + "loss": 2.7764, + "step": 23104 + }, + { + "epoch": 1.4342913899062637, + "grad_norm": 0.1479347083400862, + "learning_rate": 6.241868245781504e-05, + "loss": 2.8279, + "step": 23105 + }, + { + "epoch": 1.4343534670060214, + "grad_norm": 0.17437458937265046, + "learning_rate": 6.241518407087278e-05, + "loss": 2.7671, + "step": 23106 + }, + { + "epoch": 1.4344155441057793, + "grad_norm": 0.187426493064273, + "learning_rate": 6.241168561915749e-05, + "loss": 2.7817, + "step": 23107 + }, + { + "epoch": 1.4344776212055372, + "grad_norm": 0.1767970615879883, + "learning_rate": 6.24081871026874e-05, + "loss": 2.893, + "step": 23108 + }, + { + "epoch": 1.4345396983052952, + "grad_norm": 0.18530893629548997, + "learning_rate": 6.240468852148083e-05, + "loss": 2.8787, + "step": 23109 + }, + { + "epoch": 1.434601775405053, + "grad_norm": 0.16119677112359243, + "learning_rate": 6.240118987555595e-05, + "loss": 2.835, + "step": 23110 + }, + { + "epoch": 1.434663852504811, + "grad_norm": 0.16481713604437184, + "learning_rate": 6.239769116493107e-05, + "loss": 2.8331, + "step": 23111 + }, + { + "epoch": 1.434725929604569, + "grad_norm": 0.16054488340938491, + "learning_rate": 6.239419238962441e-05, + "loss": 2.7848, + "step": 23112 + }, + { + "epoch": 1.4347880067043268, + "grad_norm": 0.17102479895273365, + "learning_rate": 6.239069354965426e-05, + "loss": 2.9236, + "step": 23113 + }, + { + "epoch": 1.4348500838040847, + "grad_norm": 0.1804176196208246, + "learning_rate": 6.238719464503885e-05, + "loss": 2.9494, + "step": 23114 + }, + { + "epoch": 1.4349121609038424, + "grad_norm": 0.150799999158452, + "learning_rate": 6.238369567579643e-05, + "loss": 2.8224, + "step": 23115 + }, + { + "epoch": 1.4349742380036004, + "grad_norm": 0.16398719921469754, + "learning_rate": 6.238019664194526e-05, + "loss": 2.7774, + "step": 23116 + }, + { + "epoch": 1.4350363151033583, + "grad_norm": 0.16746743932817232, + "learning_rate": 6.237669754350359e-05, + "loss": 2.7131, + "step": 23117 + }, + { + "epoch": 1.4350983922031162, + "grad_norm": 0.1722876623334897, + "learning_rate": 6.237319838048971e-05, + "loss": 2.8265, + "step": 23118 + }, + { + "epoch": 1.435160469302874, + "grad_norm": 0.15290547394804072, + "learning_rate": 6.236969915292183e-05, + "loss": 2.808, + "step": 23119 + }, + { + "epoch": 1.435222546402632, + "grad_norm": 0.17306146431038189, + "learning_rate": 6.236619986081824e-05, + "loss": 2.7619, + "step": 23120 + }, + { + "epoch": 1.43528462350239, + "grad_norm": 0.14296466217033654, + "learning_rate": 6.23627005041972e-05, + "loss": 2.8129, + "step": 23121 + }, + { + "epoch": 1.4353467006021479, + "grad_norm": 0.17459697743381233, + "learning_rate": 6.235920108307691e-05, + "loss": 2.8005, + "step": 23122 + }, + { + "epoch": 1.4354087777019058, + "grad_norm": 0.14093790078581941, + "learning_rate": 6.23557015974757e-05, + "loss": 2.8444, + "step": 23123 + }, + { + "epoch": 1.4354708548016637, + "grad_norm": 0.1602532337367553, + "learning_rate": 6.235220204741176e-05, + "loss": 2.9006, + "step": 23124 + }, + { + "epoch": 1.4355329319014216, + "grad_norm": 0.15986279025552952, + "learning_rate": 6.234870243290342e-05, + "loss": 2.7044, + "step": 23125 + }, + { + "epoch": 1.4355950090011795, + "grad_norm": 0.17000799627466376, + "learning_rate": 6.234520275396887e-05, + "loss": 2.7866, + "step": 23126 + }, + { + "epoch": 1.4356570861009375, + "grad_norm": 0.144434471205083, + "learning_rate": 6.234170301062644e-05, + "loss": 2.7451, + "step": 23127 + }, + { + "epoch": 1.4357191632006954, + "grad_norm": 0.1916046087895835, + "learning_rate": 6.233820320289432e-05, + "loss": 2.8392, + "step": 23128 + }, + { + "epoch": 1.4357812403004533, + "grad_norm": 0.14788198674562045, + "learning_rate": 6.233470333079081e-05, + "loss": 2.8498, + "step": 23129 + }, + { + "epoch": 1.435843317400211, + "grad_norm": 0.16755579486202993, + "learning_rate": 6.233120339433415e-05, + "loss": 2.7895, + "step": 23130 + }, + { + "epoch": 1.435905394499969, + "grad_norm": 0.1537592430263347, + "learning_rate": 6.232770339354262e-05, + "loss": 2.8094, + "step": 23131 + }, + { + "epoch": 1.4359674715997268, + "grad_norm": 0.21118278723609424, + "learning_rate": 6.232420332843446e-05, + "loss": 2.772, + "step": 23132 + }, + { + "epoch": 1.4360295486994847, + "grad_norm": 0.16807307840210856, + "learning_rate": 6.232070319902792e-05, + "loss": 2.8498, + "step": 23133 + }, + { + "epoch": 1.4360916257992427, + "grad_norm": 0.15393069431386314, + "learning_rate": 6.23172030053413e-05, + "loss": 2.8645, + "step": 23134 + }, + { + "epoch": 1.4361537028990006, + "grad_norm": 0.19600998316551993, + "learning_rate": 6.231370274739284e-05, + "loss": 2.8379, + "step": 23135 + }, + { + "epoch": 1.4362157799987585, + "grad_norm": 0.15206233956390836, + "learning_rate": 6.231020242520082e-05, + "loss": 2.7073, + "step": 23136 + }, + { + "epoch": 1.4362778570985164, + "grad_norm": 0.18596679953525203, + "learning_rate": 6.230670203878347e-05, + "loss": 2.7896, + "step": 23137 + }, + { + "epoch": 1.4363399341982743, + "grad_norm": 0.15771299544157802, + "learning_rate": 6.230320158815906e-05, + "loss": 2.765, + "step": 23138 + }, + { + "epoch": 1.436402011298032, + "grad_norm": 0.22711983727816049, + "learning_rate": 6.229970107334588e-05, + "loss": 2.8411, + "step": 23139 + }, + { + "epoch": 1.43646408839779, + "grad_norm": 0.1705667874397396, + "learning_rate": 6.229620049436216e-05, + "loss": 2.747, + "step": 23140 + }, + { + "epoch": 1.4365261654975479, + "grad_norm": 0.18477979004901054, + "learning_rate": 6.229269985122619e-05, + "loss": 2.8023, + "step": 23141 + }, + { + "epoch": 1.4365882425973058, + "grad_norm": 0.16841284130828868, + "learning_rate": 6.228919914395621e-05, + "loss": 2.736, + "step": 23142 + }, + { + "epoch": 1.4366503196970637, + "grad_norm": 0.15192199672579143, + "learning_rate": 6.22856983725705e-05, + "loss": 2.7751, + "step": 23143 + }, + { + "epoch": 1.4367123967968216, + "grad_norm": 0.1551922159872592, + "learning_rate": 6.228219753708731e-05, + "loss": 2.961, + "step": 23144 + }, + { + "epoch": 1.4367744738965795, + "grad_norm": 0.20144584666046875, + "learning_rate": 6.227869663752493e-05, + "loss": 2.7474, + "step": 23145 + }, + { + "epoch": 1.4368365509963374, + "grad_norm": 0.1489819780225922, + "learning_rate": 6.227519567390158e-05, + "loss": 2.849, + "step": 23146 + }, + { + "epoch": 1.4368986280960954, + "grad_norm": 0.17769290104821187, + "learning_rate": 6.227169464623558e-05, + "loss": 2.8704, + "step": 23147 + }, + { + "epoch": 1.4369607051958533, + "grad_norm": 0.16079957778797244, + "learning_rate": 6.226819355454516e-05, + "loss": 2.8183, + "step": 23148 + }, + { + "epoch": 1.4370227822956112, + "grad_norm": 0.14239840742610982, + "learning_rate": 6.22646923988486e-05, + "loss": 2.7988, + "step": 23149 + }, + { + "epoch": 1.4370848593953691, + "grad_norm": 0.16698963877199674, + "learning_rate": 6.226119117916415e-05, + "loss": 2.8716, + "step": 23150 + }, + { + "epoch": 1.437146936495127, + "grad_norm": 0.1989499691175354, + "learning_rate": 6.22576898955101e-05, + "loss": 2.802, + "step": 23151 + }, + { + "epoch": 1.437209013594885, + "grad_norm": 0.1675784933535211, + "learning_rate": 6.225418854790471e-05, + "loss": 2.8095, + "step": 23152 + }, + { + "epoch": 1.4372710906946429, + "grad_norm": 0.20104856322784778, + "learning_rate": 6.225068713636623e-05, + "loss": 2.7575, + "step": 23153 + }, + { + "epoch": 1.4373331677944006, + "grad_norm": 0.16397929920230273, + "learning_rate": 6.224718566091297e-05, + "loss": 2.8547, + "step": 23154 + }, + { + "epoch": 1.4373952448941585, + "grad_norm": 0.1511087118076464, + "learning_rate": 6.224368412156314e-05, + "loss": 2.8779, + "step": 23155 + }, + { + "epoch": 1.4374573219939164, + "grad_norm": 0.16250027784910656, + "learning_rate": 6.224018251833505e-05, + "loss": 2.7957, + "step": 23156 + }, + { + "epoch": 1.4375193990936743, + "grad_norm": 0.16814773471442415, + "learning_rate": 6.223668085124697e-05, + "loss": 2.843, + "step": 23157 + }, + { + "epoch": 1.4375814761934322, + "grad_norm": 0.1535690258418762, + "learning_rate": 6.223317912031713e-05, + "loss": 2.8606, + "step": 23158 + }, + { + "epoch": 1.4376435532931902, + "grad_norm": 0.15988538096452765, + "learning_rate": 6.222967732556384e-05, + "loss": 2.8303, + "step": 23159 + }, + { + "epoch": 1.437705630392948, + "grad_norm": 0.14414068883672196, + "learning_rate": 6.222617546700534e-05, + "loss": 2.8794, + "step": 23160 + }, + { + "epoch": 1.437767707492706, + "grad_norm": 0.1499907453483453, + "learning_rate": 6.222267354465992e-05, + "loss": 2.82, + "step": 23161 + }, + { + "epoch": 1.437829784592464, + "grad_norm": 0.15240517625006136, + "learning_rate": 6.221917155854585e-05, + "loss": 2.8689, + "step": 23162 + }, + { + "epoch": 1.4378918616922216, + "grad_norm": 0.15760154906526083, + "learning_rate": 6.22156695086814e-05, + "loss": 2.8042, + "step": 23163 + }, + { + "epoch": 1.4379539387919795, + "grad_norm": 0.1394199222285581, + "learning_rate": 6.221216739508483e-05, + "loss": 2.778, + "step": 23164 + }, + { + "epoch": 1.4380160158917374, + "grad_norm": 0.19438952756605204, + "learning_rate": 6.220866521777442e-05, + "loss": 2.8655, + "step": 23165 + }, + { + "epoch": 1.4380780929914954, + "grad_norm": 0.1821677649721691, + "learning_rate": 6.220516297676844e-05, + "loss": 2.9241, + "step": 23166 + }, + { + "epoch": 1.4381401700912533, + "grad_norm": 0.1609799324503051, + "learning_rate": 6.220166067208515e-05, + "loss": 2.8242, + "step": 23167 + }, + { + "epoch": 1.4382022471910112, + "grad_norm": 0.15179037189149577, + "learning_rate": 6.219815830374285e-05, + "loss": 2.7796, + "step": 23168 + }, + { + "epoch": 1.4382643242907691, + "grad_norm": 0.1559405600604932, + "learning_rate": 6.219465587175978e-05, + "loss": 2.9127, + "step": 23169 + }, + { + "epoch": 1.438326401390527, + "grad_norm": 0.16191825746225824, + "learning_rate": 6.219115337615425e-05, + "loss": 2.8804, + "step": 23170 + }, + { + "epoch": 1.438388478490285, + "grad_norm": 0.15525913172623854, + "learning_rate": 6.21876508169445e-05, + "loss": 2.8589, + "step": 23171 + }, + { + "epoch": 1.4384505555900429, + "grad_norm": 0.1848745210659434, + "learning_rate": 6.218414819414883e-05, + "loss": 2.919, + "step": 23172 + }, + { + "epoch": 1.4385126326898008, + "grad_norm": 0.15571198820544646, + "learning_rate": 6.21806455077855e-05, + "loss": 2.7807, + "step": 23173 + }, + { + "epoch": 1.4385747097895587, + "grad_norm": 0.15976743487239994, + "learning_rate": 6.217714275787278e-05, + "loss": 2.8124, + "step": 23174 + }, + { + "epoch": 1.4386367868893166, + "grad_norm": 0.18830318085773956, + "learning_rate": 6.217363994442895e-05, + "loss": 2.8184, + "step": 23175 + }, + { + "epoch": 1.4386988639890745, + "grad_norm": 0.15876041965757018, + "learning_rate": 6.217013706747228e-05, + "loss": 2.7405, + "step": 23176 + }, + { + "epoch": 1.4387609410888325, + "grad_norm": 0.15336833493355995, + "learning_rate": 6.216663412702106e-05, + "loss": 2.7823, + "step": 23177 + }, + { + "epoch": 1.4388230181885902, + "grad_norm": 0.1589729862679786, + "learning_rate": 6.216313112309354e-05, + "loss": 2.8168, + "step": 23178 + }, + { + "epoch": 1.438885095288348, + "grad_norm": 0.1624854677377987, + "learning_rate": 6.215962805570803e-05, + "loss": 2.863, + "step": 23179 + }, + { + "epoch": 1.438947172388106, + "grad_norm": 0.1562619681543134, + "learning_rate": 6.215612492488278e-05, + "loss": 2.843, + "step": 23180 + }, + { + "epoch": 1.439009249487864, + "grad_norm": 0.1572034179048546, + "learning_rate": 6.215262173063607e-05, + "loss": 2.834, + "step": 23181 + }, + { + "epoch": 1.4390713265876218, + "grad_norm": 0.15267725044549843, + "learning_rate": 6.214911847298619e-05, + "loss": 2.7942, + "step": 23182 + }, + { + "epoch": 1.4391334036873797, + "grad_norm": 0.15220599181609515, + "learning_rate": 6.214561515195141e-05, + "loss": 2.9321, + "step": 23183 + }, + { + "epoch": 1.4391954807871377, + "grad_norm": 0.1484962773651188, + "learning_rate": 6.214211176755001e-05, + "loss": 2.7849, + "step": 23184 + }, + { + "epoch": 1.4392575578868956, + "grad_norm": 0.15958358564701336, + "learning_rate": 6.213860831980025e-05, + "loss": 2.8446, + "step": 23185 + }, + { + "epoch": 1.4393196349866535, + "grad_norm": 0.15319016879872457, + "learning_rate": 6.213510480872045e-05, + "loss": 2.7657, + "step": 23186 + }, + { + "epoch": 1.4393817120864112, + "grad_norm": 0.15494748457412952, + "learning_rate": 6.213160123432884e-05, + "loss": 2.9582, + "step": 23187 + }, + { + "epoch": 1.439443789186169, + "grad_norm": 0.17455233574608622, + "learning_rate": 6.212809759664372e-05, + "loss": 2.8852, + "step": 23188 + }, + { + "epoch": 1.439505866285927, + "grad_norm": 0.1553374584338227, + "learning_rate": 6.212459389568338e-05, + "loss": 2.8761, + "step": 23189 + }, + { + "epoch": 1.439567943385685, + "grad_norm": 0.1605718152427781, + "learning_rate": 6.21210901314661e-05, + "loss": 2.8392, + "step": 23190 + }, + { + "epoch": 1.4396300204854429, + "grad_norm": 0.15244652875740486, + "learning_rate": 6.211758630401013e-05, + "loss": 2.8606, + "step": 23191 + }, + { + "epoch": 1.4396920975852008, + "grad_norm": 0.18081274358335425, + "learning_rate": 6.211408241333379e-05, + "loss": 2.8527, + "step": 23192 + }, + { + "epoch": 1.4397541746849587, + "grad_norm": 0.15771810923877588, + "learning_rate": 6.211057845945534e-05, + "loss": 2.8896, + "step": 23193 + }, + { + "epoch": 1.4398162517847166, + "grad_norm": 0.16490544321774755, + "learning_rate": 6.210707444239304e-05, + "loss": 2.9124, + "step": 23194 + }, + { + "epoch": 1.4398783288844745, + "grad_norm": 0.18318804087082333, + "learning_rate": 6.210357036216522e-05, + "loss": 2.7168, + "step": 23195 + }, + { + "epoch": 1.4399404059842325, + "grad_norm": 0.15496368822137796, + "learning_rate": 6.210006621879011e-05, + "loss": 2.8635, + "step": 23196 + }, + { + "epoch": 1.4400024830839904, + "grad_norm": 0.14540242852863208, + "learning_rate": 6.209656201228604e-05, + "loss": 2.781, + "step": 23197 + }, + { + "epoch": 1.4400645601837483, + "grad_norm": 0.1588656715750328, + "learning_rate": 6.209305774267126e-05, + "loss": 2.9275, + "step": 23198 + }, + { + "epoch": 1.4401266372835062, + "grad_norm": 0.154496848015592, + "learning_rate": 6.208955340996405e-05, + "loss": 2.8113, + "step": 23199 + }, + { + "epoch": 1.4401887143832641, + "grad_norm": 0.1563008702438788, + "learning_rate": 6.208604901418273e-05, + "loss": 2.8164, + "step": 23200 + }, + { + "epoch": 1.440250791483022, + "grad_norm": 0.1483137216979844, + "learning_rate": 6.208254455534552e-05, + "loss": 2.7874, + "step": 23201 + }, + { + "epoch": 1.4403128685827797, + "grad_norm": 0.16062988052115373, + "learning_rate": 6.207904003347079e-05, + "loss": 2.8107, + "step": 23202 + }, + { + "epoch": 1.4403749456825377, + "grad_norm": 0.18669330391611702, + "learning_rate": 6.207553544857674e-05, + "loss": 2.8799, + "step": 23203 + }, + { + "epoch": 1.4404370227822956, + "grad_norm": 0.161329380704782, + "learning_rate": 6.20720308006817e-05, + "loss": 2.8905, + "step": 23204 + }, + { + "epoch": 1.4404990998820535, + "grad_norm": 0.15112172189356618, + "learning_rate": 6.206852608980395e-05, + "loss": 2.7785, + "step": 23205 + }, + { + "epoch": 1.4405611769818114, + "grad_norm": 0.16189661771810657, + "learning_rate": 6.206502131596177e-05, + "loss": 2.8615, + "step": 23206 + }, + { + "epoch": 1.4406232540815693, + "grad_norm": 0.15274695078792247, + "learning_rate": 6.206151647917344e-05, + "loss": 2.773, + "step": 23207 + }, + { + "epoch": 1.4406853311813272, + "grad_norm": 0.15948180915836752, + "learning_rate": 6.205801157945725e-05, + "loss": 2.8333, + "step": 23208 + }, + { + "epoch": 1.4407474082810852, + "grad_norm": 0.1596485616106186, + "learning_rate": 6.205450661683149e-05, + "loss": 2.8966, + "step": 23209 + }, + { + "epoch": 1.440809485380843, + "grad_norm": 0.16836060238100936, + "learning_rate": 6.205100159131443e-05, + "loss": 2.8678, + "step": 23210 + }, + { + "epoch": 1.4408715624806008, + "grad_norm": 0.1583606677291943, + "learning_rate": 6.20474965029244e-05, + "loss": 2.8841, + "step": 23211 + }, + { + "epoch": 1.4409336395803587, + "grad_norm": 0.1507668557485973, + "learning_rate": 6.204399135167964e-05, + "loss": 2.8496, + "step": 23212 + }, + { + "epoch": 1.4409957166801166, + "grad_norm": 0.2001573759281077, + "learning_rate": 6.204048613759845e-05, + "loss": 2.7518, + "step": 23213 + }, + { + "epoch": 1.4410577937798745, + "grad_norm": 0.15133389752928064, + "learning_rate": 6.203698086069911e-05, + "loss": 2.7809, + "step": 23214 + }, + { + "epoch": 1.4411198708796324, + "grad_norm": 0.15312838234377188, + "learning_rate": 6.203347552099993e-05, + "loss": 2.7985, + "step": 23215 + }, + { + "epoch": 1.4411819479793904, + "grad_norm": 0.16101819788262117, + "learning_rate": 6.202997011851918e-05, + "loss": 2.8527, + "step": 23216 + }, + { + "epoch": 1.4412440250791483, + "grad_norm": 0.14035994000678398, + "learning_rate": 6.202646465327516e-05, + "loss": 2.6842, + "step": 23217 + }, + { + "epoch": 1.4413061021789062, + "grad_norm": 0.16802249093094396, + "learning_rate": 6.202295912528616e-05, + "loss": 2.8138, + "step": 23218 + }, + { + "epoch": 1.4413681792786641, + "grad_norm": 0.14463574910380378, + "learning_rate": 6.201945353457046e-05, + "loss": 2.7598, + "step": 23219 + }, + { + "epoch": 1.441430256378422, + "grad_norm": 0.13990447441501255, + "learning_rate": 6.201594788114636e-05, + "loss": 2.7445, + "step": 23220 + }, + { + "epoch": 1.44149233347818, + "grad_norm": 0.1717496002677086, + "learning_rate": 6.201244216503213e-05, + "loss": 2.8577, + "step": 23221 + }, + { + "epoch": 1.4415544105779379, + "grad_norm": 0.15445772671634, + "learning_rate": 6.200893638624608e-05, + "loss": 2.7278, + "step": 23222 + }, + { + "epoch": 1.4416164876776958, + "grad_norm": 0.15526246685882736, + "learning_rate": 6.20054305448065e-05, + "loss": 2.8568, + "step": 23223 + }, + { + "epoch": 1.4416785647774537, + "grad_norm": 0.15280045009370116, + "learning_rate": 6.200192464073166e-05, + "loss": 2.8095, + "step": 23224 + }, + { + "epoch": 1.4417406418772116, + "grad_norm": 0.15154054620367208, + "learning_rate": 6.199841867403987e-05, + "loss": 2.7976, + "step": 23225 + }, + { + "epoch": 1.4418027189769693, + "grad_norm": 0.1622208304050431, + "learning_rate": 6.199491264474942e-05, + "loss": 2.7501, + "step": 23226 + }, + { + "epoch": 1.4418647960767272, + "grad_norm": 0.1526284247919376, + "learning_rate": 6.199140655287861e-05, + "loss": 2.8479, + "step": 23227 + }, + { + "epoch": 1.4419268731764852, + "grad_norm": 0.5558199270033808, + "learning_rate": 6.19879003984457e-05, + "loss": 2.6691, + "step": 23228 + }, + { + "epoch": 1.441988950276243, + "grad_norm": 0.175920627227039, + "learning_rate": 6.198439418146902e-05, + "loss": 2.8213, + "step": 23229 + }, + { + "epoch": 1.442051027376001, + "grad_norm": 0.20061871484614563, + "learning_rate": 6.198088790196683e-05, + "loss": 2.8792, + "step": 23230 + }, + { + "epoch": 1.442113104475759, + "grad_norm": 0.20823480640583372, + "learning_rate": 6.197738155995746e-05, + "loss": 2.8077, + "step": 23231 + }, + { + "epoch": 1.4421751815755168, + "grad_norm": 0.16661131518351438, + "learning_rate": 6.197387515545917e-05, + "loss": 2.8548, + "step": 23232 + }, + { + "epoch": 1.4422372586752747, + "grad_norm": 0.17722422357150472, + "learning_rate": 6.197036868849026e-05, + "loss": 2.8858, + "step": 23233 + }, + { + "epoch": 1.4422993357750327, + "grad_norm": 0.20712500374735873, + "learning_rate": 6.196686215906904e-05, + "loss": 2.7987, + "step": 23234 + }, + { + "epoch": 1.4423614128747904, + "grad_norm": 0.1666004538638938, + "learning_rate": 6.19633555672138e-05, + "loss": 2.8113, + "step": 23235 + }, + { + "epoch": 1.4424234899745483, + "grad_norm": 0.173624739443142, + "learning_rate": 6.195984891294282e-05, + "loss": 2.8095, + "step": 23236 + }, + { + "epoch": 1.4424855670743062, + "grad_norm": 0.1708829872776788, + "learning_rate": 6.195634219627441e-05, + "loss": 2.8275, + "step": 23237 + }, + { + "epoch": 1.4425476441740641, + "grad_norm": 0.155182817994559, + "learning_rate": 6.195283541722686e-05, + "loss": 2.8025, + "step": 23238 + }, + { + "epoch": 1.442609721273822, + "grad_norm": 0.1714023572085025, + "learning_rate": 6.194932857581845e-05, + "loss": 2.8174, + "step": 23239 + }, + { + "epoch": 1.44267179837358, + "grad_norm": 0.16768470855924553, + "learning_rate": 6.194582167206752e-05, + "loss": 2.7844, + "step": 23240 + }, + { + "epoch": 1.4427338754733379, + "grad_norm": 0.16193357380002008, + "learning_rate": 6.194231470599232e-05, + "loss": 2.8178, + "step": 23241 + }, + { + "epoch": 1.4427959525730958, + "grad_norm": 0.16217441862562823, + "learning_rate": 6.193880767761118e-05, + "loss": 2.9091, + "step": 23242 + }, + { + "epoch": 1.4428580296728537, + "grad_norm": 0.14782146565786058, + "learning_rate": 6.193530058694236e-05, + "loss": 2.8381, + "step": 23243 + }, + { + "epoch": 1.4429201067726116, + "grad_norm": 0.16431668436490962, + "learning_rate": 6.193179343400421e-05, + "loss": 2.9404, + "step": 23244 + }, + { + "epoch": 1.4429821838723695, + "grad_norm": 0.21402074796487472, + "learning_rate": 6.192828621881497e-05, + "loss": 2.8312, + "step": 23245 + }, + { + "epoch": 1.4430442609721275, + "grad_norm": 0.16197394838442167, + "learning_rate": 6.192477894139297e-05, + "loss": 2.7937, + "step": 23246 + }, + { + "epoch": 1.4431063380718854, + "grad_norm": 0.1513308142982884, + "learning_rate": 6.19212716017565e-05, + "loss": 2.8956, + "step": 23247 + }, + { + "epoch": 1.4431684151716433, + "grad_norm": 0.15323625070282176, + "learning_rate": 6.191776419992387e-05, + "loss": 2.8483, + "step": 23248 + }, + { + "epoch": 1.4432304922714012, + "grad_norm": 0.1552206165511767, + "learning_rate": 6.191425673591338e-05, + "loss": 2.7392, + "step": 23249 + }, + { + "epoch": 1.443292569371159, + "grad_norm": 0.195646591548881, + "learning_rate": 6.19107492097433e-05, + "loss": 2.8212, + "step": 23250 + }, + { + "epoch": 1.4433546464709168, + "grad_norm": 0.16320615323709936, + "learning_rate": 6.190724162143197e-05, + "loss": 2.7905, + "step": 23251 + }, + { + "epoch": 1.4434167235706747, + "grad_norm": 0.16097308825543868, + "learning_rate": 6.190373397099765e-05, + "loss": 2.735, + "step": 23252 + }, + { + "epoch": 1.4434788006704327, + "grad_norm": 0.16302750163003826, + "learning_rate": 6.190022625845868e-05, + "loss": 2.8287, + "step": 23253 + }, + { + "epoch": 1.4435408777701906, + "grad_norm": 0.15716530216534233, + "learning_rate": 6.189671848383333e-05, + "loss": 2.8198, + "step": 23254 + }, + { + "epoch": 1.4436029548699485, + "grad_norm": 0.16096364012709377, + "learning_rate": 6.189321064713991e-05, + "loss": 2.8112, + "step": 23255 + }, + { + "epoch": 1.4436650319697064, + "grad_norm": 0.15539493845365612, + "learning_rate": 6.188970274839674e-05, + "loss": 2.7312, + "step": 23256 + }, + { + "epoch": 1.4437271090694643, + "grad_norm": 0.1532956121678663, + "learning_rate": 6.188619478762208e-05, + "loss": 2.8783, + "step": 23257 + }, + { + "epoch": 1.4437891861692222, + "grad_norm": 0.15648809412375644, + "learning_rate": 6.188268676483428e-05, + "loss": 2.8316, + "step": 23258 + }, + { + "epoch": 1.44385126326898, + "grad_norm": 0.16587274236982505, + "learning_rate": 6.187917868005158e-05, + "loss": 2.727, + "step": 23259 + }, + { + "epoch": 1.4439133403687379, + "grad_norm": 0.16923921852270507, + "learning_rate": 6.187567053329236e-05, + "loss": 2.8646, + "step": 23260 + }, + { + "epoch": 1.4439754174684958, + "grad_norm": 0.1781476738368896, + "learning_rate": 6.187216232457486e-05, + "loss": 2.8503, + "step": 23261 + }, + { + "epoch": 1.4440374945682537, + "grad_norm": 0.15898853947951758, + "learning_rate": 6.186865405391743e-05, + "loss": 2.8548, + "step": 23262 + }, + { + "epoch": 1.4440995716680116, + "grad_norm": 0.1565653063376022, + "learning_rate": 6.186514572133833e-05, + "loss": 2.8195, + "step": 23263 + }, + { + "epoch": 1.4441616487677695, + "grad_norm": 0.15195176852093956, + "learning_rate": 6.186163732685589e-05, + "loss": 2.779, + "step": 23264 + }, + { + "epoch": 1.4442237258675275, + "grad_norm": 0.1690027437001472, + "learning_rate": 6.185812887048842e-05, + "loss": 2.8155, + "step": 23265 + }, + { + "epoch": 1.4442858029672854, + "grad_norm": 0.1513791725582184, + "learning_rate": 6.185462035225419e-05, + "loss": 2.8251, + "step": 23266 + }, + { + "epoch": 1.4443478800670433, + "grad_norm": 0.18548647248742986, + "learning_rate": 6.185111177217156e-05, + "loss": 2.8576, + "step": 23267 + }, + { + "epoch": 1.4444099571668012, + "grad_norm": 0.16882641417639624, + "learning_rate": 6.184760313025877e-05, + "loss": 2.7707, + "step": 23268 + }, + { + "epoch": 1.4444720342665591, + "grad_norm": 0.17604767424209955, + "learning_rate": 6.184409442653418e-05, + "loss": 2.803, + "step": 23269 + }, + { + "epoch": 1.444534111366317, + "grad_norm": 0.14904547712104627, + "learning_rate": 6.184058566101605e-05, + "loss": 2.8555, + "step": 23270 + }, + { + "epoch": 1.444596188466075, + "grad_norm": 0.16751167295832614, + "learning_rate": 6.183707683372274e-05, + "loss": 2.8149, + "step": 23271 + }, + { + "epoch": 1.4446582655658329, + "grad_norm": 0.14504639861531932, + "learning_rate": 6.183356794467251e-05, + "loss": 2.7991, + "step": 23272 + }, + { + "epoch": 1.4447203426655908, + "grad_norm": 0.16935084298254605, + "learning_rate": 6.18300589938837e-05, + "loss": 2.7647, + "step": 23273 + }, + { + "epoch": 1.4447824197653485, + "grad_norm": 0.16972832234373458, + "learning_rate": 6.182654998137458e-05, + "loss": 2.7963, + "step": 23274 + }, + { + "epoch": 1.4448444968651064, + "grad_norm": 0.14705825923450278, + "learning_rate": 6.182304090716349e-05, + "loss": 2.8252, + "step": 23275 + }, + { + "epoch": 1.4449065739648643, + "grad_norm": 0.14932564381877836, + "learning_rate": 6.181953177126873e-05, + "loss": 2.8129, + "step": 23276 + }, + { + "epoch": 1.4449686510646222, + "grad_norm": 0.15562476591873947, + "learning_rate": 6.181602257370857e-05, + "loss": 2.794, + "step": 23277 + }, + { + "epoch": 1.4450307281643802, + "grad_norm": 0.1663116620554094, + "learning_rate": 6.18125133145014e-05, + "loss": 2.9408, + "step": 23278 + }, + { + "epoch": 1.445092805264138, + "grad_norm": 0.16950014549786446, + "learning_rate": 6.180900399366544e-05, + "loss": 2.7822, + "step": 23279 + }, + { + "epoch": 1.445154882363896, + "grad_norm": 0.16760332256003424, + "learning_rate": 6.180549461121906e-05, + "loss": 2.7565, + "step": 23280 + }, + { + "epoch": 1.445216959463654, + "grad_norm": 0.17320429377834706, + "learning_rate": 6.180198516718054e-05, + "loss": 2.8992, + "step": 23281 + }, + { + "epoch": 1.4452790365634118, + "grad_norm": 0.15762182864667562, + "learning_rate": 6.17984756615682e-05, + "loss": 2.8628, + "step": 23282 + }, + { + "epoch": 1.4453411136631695, + "grad_norm": 0.15261151550508775, + "learning_rate": 6.179496609440036e-05, + "loss": 2.8346, + "step": 23283 + }, + { + "epoch": 1.4454031907629274, + "grad_norm": 0.1523527118296595, + "learning_rate": 6.179145646569531e-05, + "loss": 2.8243, + "step": 23284 + }, + { + "epoch": 1.4454652678626854, + "grad_norm": 0.1502826414470227, + "learning_rate": 6.178794677547137e-05, + "loss": 2.8145, + "step": 23285 + }, + { + "epoch": 1.4455273449624433, + "grad_norm": 0.1700412177683875, + "learning_rate": 6.178443702374684e-05, + "loss": 2.7676, + "step": 23286 + }, + { + "epoch": 1.4455894220622012, + "grad_norm": 0.14933272871940093, + "learning_rate": 6.178092721054006e-05, + "loss": 2.8037, + "step": 23287 + }, + { + "epoch": 1.4456514991619591, + "grad_norm": 0.2043736847709776, + "learning_rate": 6.17774173358693e-05, + "loss": 2.8107, + "step": 23288 + }, + { + "epoch": 1.445713576261717, + "grad_norm": 0.19384844417075583, + "learning_rate": 6.177390739975292e-05, + "loss": 2.9341, + "step": 23289 + }, + { + "epoch": 1.445775653361475, + "grad_norm": 0.16292837377918068, + "learning_rate": 6.177039740220919e-05, + "loss": 2.8258, + "step": 23290 + }, + { + "epoch": 1.4458377304612329, + "grad_norm": 0.1552835617123664, + "learning_rate": 6.176688734325644e-05, + "loss": 2.8176, + "step": 23291 + }, + { + "epoch": 1.4458998075609908, + "grad_norm": 0.15151041084539568, + "learning_rate": 6.176337722291299e-05, + "loss": 2.8382, + "step": 23292 + }, + { + "epoch": 1.4459618846607487, + "grad_norm": 0.15664507866709063, + "learning_rate": 6.175986704119714e-05, + "loss": 2.8087, + "step": 23293 + }, + { + "epoch": 1.4460239617605066, + "grad_norm": 0.14077881507165, + "learning_rate": 6.175635679812723e-05, + "loss": 2.734, + "step": 23294 + }, + { + "epoch": 1.4460860388602645, + "grad_norm": 0.1783704066864703, + "learning_rate": 6.175284649372151e-05, + "loss": 2.8445, + "step": 23295 + }, + { + "epoch": 1.4461481159600225, + "grad_norm": 0.1437852202163517, + "learning_rate": 6.174933612799837e-05, + "loss": 2.8081, + "step": 23296 + }, + { + "epoch": 1.4462101930597804, + "grad_norm": 0.15141306151233444, + "learning_rate": 6.174582570097608e-05, + "loss": 2.8315, + "step": 23297 + }, + { + "epoch": 1.446272270159538, + "grad_norm": 0.14967491716207695, + "learning_rate": 6.174231521267296e-05, + "loss": 2.8525, + "step": 23298 + }, + { + "epoch": 1.446334347259296, + "grad_norm": 0.1587669720832374, + "learning_rate": 6.173880466310734e-05, + "loss": 2.811, + "step": 23299 + }, + { + "epoch": 1.446396424359054, + "grad_norm": 0.1539578213433703, + "learning_rate": 6.173529405229752e-05, + "loss": 2.8767, + "step": 23300 + }, + { + "epoch": 1.4464585014588118, + "grad_norm": 0.15337073353520408, + "learning_rate": 6.173178338026183e-05, + "loss": 2.7781, + "step": 23301 + }, + { + "epoch": 1.4465205785585697, + "grad_norm": 0.1407216455236666, + "learning_rate": 6.172827264701857e-05, + "loss": 2.7606, + "step": 23302 + }, + { + "epoch": 1.4465826556583277, + "grad_norm": 0.17187983332687146, + "learning_rate": 6.172476185258607e-05, + "loss": 2.8556, + "step": 23303 + }, + { + "epoch": 1.4466447327580856, + "grad_norm": 0.157862012951938, + "learning_rate": 6.172125099698265e-05, + "loss": 2.7756, + "step": 23304 + }, + { + "epoch": 1.4467068098578435, + "grad_norm": 0.1617975568181384, + "learning_rate": 6.17177400802266e-05, + "loss": 2.8097, + "step": 23305 + }, + { + "epoch": 1.4467688869576014, + "grad_norm": 0.15283335968512735, + "learning_rate": 6.171422910233626e-05, + "loss": 2.8045, + "step": 23306 + }, + { + "epoch": 1.4468309640573591, + "grad_norm": 0.16165294506235028, + "learning_rate": 6.171071806332995e-05, + "loss": 2.9872, + "step": 23307 + }, + { + "epoch": 1.446893041157117, + "grad_norm": 0.14143116004470999, + "learning_rate": 6.170720696322599e-05, + "loss": 2.7748, + "step": 23308 + }, + { + "epoch": 1.446955118256875, + "grad_norm": 0.1662230037877149, + "learning_rate": 6.170369580204267e-05, + "loss": 2.8546, + "step": 23309 + }, + { + "epoch": 1.4470171953566329, + "grad_norm": 0.204490555590045, + "learning_rate": 6.170018457979834e-05, + "loss": 2.8135, + "step": 23310 + }, + { + "epoch": 1.4470792724563908, + "grad_norm": 0.1615038097592129, + "learning_rate": 6.16966732965113e-05, + "loss": 2.8297, + "step": 23311 + }, + { + "epoch": 1.4471413495561487, + "grad_norm": 0.1496347468104795, + "learning_rate": 6.169316195219988e-05, + "loss": 2.8078, + "step": 23312 + }, + { + "epoch": 1.4472034266559066, + "grad_norm": 0.15456112434007938, + "learning_rate": 6.16896505468824e-05, + "loss": 2.749, + "step": 23313 + }, + { + "epoch": 1.4472655037556645, + "grad_norm": 0.15568545994940008, + "learning_rate": 6.168613908057715e-05, + "loss": 2.8722, + "step": 23314 + }, + { + "epoch": 1.4473275808554225, + "grad_norm": 0.15771396388703088, + "learning_rate": 6.168262755330249e-05, + "loss": 2.7607, + "step": 23315 + }, + { + "epoch": 1.4473896579551804, + "grad_norm": 0.16101542155860513, + "learning_rate": 6.167911596507674e-05, + "loss": 2.7849, + "step": 23316 + }, + { + "epoch": 1.4474517350549383, + "grad_norm": 0.1553232904692402, + "learning_rate": 6.16756043159182e-05, + "loss": 2.7992, + "step": 23317 + }, + { + "epoch": 1.4475138121546962, + "grad_norm": 0.15045496567505318, + "learning_rate": 6.16720926058452e-05, + "loss": 2.8567, + "step": 23318 + }, + { + "epoch": 1.4475758892544541, + "grad_norm": 0.15289860941876407, + "learning_rate": 6.166858083487604e-05, + "loss": 2.7837, + "step": 23319 + }, + { + "epoch": 1.447637966354212, + "grad_norm": 0.14669254134909238, + "learning_rate": 6.166506900302908e-05, + "loss": 2.848, + "step": 23320 + }, + { + "epoch": 1.44770004345397, + "grad_norm": 0.14823952447562458, + "learning_rate": 6.166155711032261e-05, + "loss": 2.8669, + "step": 23321 + }, + { + "epoch": 1.4477621205537277, + "grad_norm": 0.14566883675732295, + "learning_rate": 6.165804515677498e-05, + "loss": 2.8021, + "step": 23322 + }, + { + "epoch": 1.4478241976534856, + "grad_norm": 0.21049176027329444, + "learning_rate": 6.16545331424045e-05, + "loss": 2.7987, + "step": 23323 + }, + { + "epoch": 1.4478862747532435, + "grad_norm": 0.14998154303127104, + "learning_rate": 6.165102106722947e-05, + "loss": 2.7811, + "step": 23324 + }, + { + "epoch": 1.4479483518530014, + "grad_norm": 0.15004932392310308, + "learning_rate": 6.164750893126826e-05, + "loss": 2.8381, + "step": 23325 + }, + { + "epoch": 1.4480104289527593, + "grad_norm": 0.17876868018663863, + "learning_rate": 6.164399673453915e-05, + "loss": 2.7363, + "step": 23326 + }, + { + "epoch": 1.4480725060525172, + "grad_norm": 0.15638476163178577, + "learning_rate": 6.164048447706049e-05, + "loss": 2.8502, + "step": 23327 + }, + { + "epoch": 1.4481345831522752, + "grad_norm": 0.15259893148532175, + "learning_rate": 6.16369721588506e-05, + "loss": 2.7134, + "step": 23328 + }, + { + "epoch": 1.448196660252033, + "grad_norm": 0.1634663582081546, + "learning_rate": 6.163345977992779e-05, + "loss": 2.9569, + "step": 23329 + }, + { + "epoch": 1.448258737351791, + "grad_norm": 0.15777407263306603, + "learning_rate": 6.162994734031041e-05, + "loss": 2.7971, + "step": 23330 + }, + { + "epoch": 1.4483208144515487, + "grad_norm": 0.18528328292138038, + "learning_rate": 6.162643484001676e-05, + "loss": 2.802, + "step": 23331 + }, + { + "epoch": 1.4483828915513066, + "grad_norm": 0.14400252081441156, + "learning_rate": 6.162292227906517e-05, + "loss": 2.7593, + "step": 23332 + }, + { + "epoch": 1.4484449686510645, + "grad_norm": 0.16712180901486176, + "learning_rate": 6.1619409657474e-05, + "loss": 2.8913, + "step": 23333 + }, + { + "epoch": 1.4485070457508225, + "grad_norm": 0.15404205768906348, + "learning_rate": 6.161589697526153e-05, + "loss": 2.8373, + "step": 23334 + }, + { + "epoch": 1.4485691228505804, + "grad_norm": 0.1554349143622696, + "learning_rate": 6.16123842324461e-05, + "loss": 2.8588, + "step": 23335 + }, + { + "epoch": 1.4486311999503383, + "grad_norm": 0.21557262067796754, + "learning_rate": 6.160887142904605e-05, + "loss": 2.9124, + "step": 23336 + }, + { + "epoch": 1.4486932770500962, + "grad_norm": 0.16490331405287312, + "learning_rate": 6.160535856507972e-05, + "loss": 2.8496, + "step": 23337 + }, + { + "epoch": 1.4487553541498541, + "grad_norm": 0.1707960257896032, + "learning_rate": 6.16018456405654e-05, + "loss": 2.8742, + "step": 23338 + }, + { + "epoch": 1.448817431249612, + "grad_norm": 0.1761043757606612, + "learning_rate": 6.159833265552143e-05, + "loss": 2.8515, + "step": 23339 + }, + { + "epoch": 1.44887950834937, + "grad_norm": 0.1620722234047392, + "learning_rate": 6.159481960996614e-05, + "loss": 2.7644, + "step": 23340 + }, + { + "epoch": 1.4489415854491279, + "grad_norm": 0.21282086410554601, + "learning_rate": 6.159130650391787e-05, + "loss": 2.8499, + "step": 23341 + }, + { + "epoch": 1.4490036625488858, + "grad_norm": 0.14716359026292858, + "learning_rate": 6.158779333739493e-05, + "loss": 2.8638, + "step": 23342 + }, + { + "epoch": 1.4490657396486437, + "grad_norm": 0.18257235420388382, + "learning_rate": 6.158428011041568e-05, + "loss": 2.8682, + "step": 23343 + }, + { + "epoch": 1.4491278167484016, + "grad_norm": 0.16862953862380492, + "learning_rate": 6.15807668229984e-05, + "loss": 2.8055, + "step": 23344 + }, + { + "epoch": 1.4491898938481595, + "grad_norm": 0.17084092943721615, + "learning_rate": 6.157725347516148e-05, + "loss": 2.881, + "step": 23345 + }, + { + "epoch": 1.4492519709479172, + "grad_norm": 0.19277245618885416, + "learning_rate": 6.157374006692319e-05, + "loss": 2.8429, + "step": 23346 + }, + { + "epoch": 1.4493140480476752, + "grad_norm": 0.16061424851151218, + "learning_rate": 6.15702265983019e-05, + "loss": 2.8088, + "step": 23347 + }, + { + "epoch": 1.449376125147433, + "grad_norm": 0.15269781196990187, + "learning_rate": 6.156671306931593e-05, + "loss": 2.7789, + "step": 23348 + }, + { + "epoch": 1.449438202247191, + "grad_norm": 0.16935781603737157, + "learning_rate": 6.15631994799836e-05, + "loss": 2.8694, + "step": 23349 + }, + { + "epoch": 1.449500279346949, + "grad_norm": 0.1526510367386714, + "learning_rate": 6.155968583032326e-05, + "loss": 2.763, + "step": 23350 + }, + { + "epoch": 1.4495623564467068, + "grad_norm": 0.15881472564370258, + "learning_rate": 6.155617212035324e-05, + "loss": 2.8641, + "step": 23351 + }, + { + "epoch": 1.4496244335464648, + "grad_norm": 0.15017710427782882, + "learning_rate": 6.155265835009186e-05, + "loss": 2.8061, + "step": 23352 + }, + { + "epoch": 1.4496865106462227, + "grad_norm": 0.14686933161571955, + "learning_rate": 6.154914451955746e-05, + "loss": 2.8584, + "step": 23353 + }, + { + "epoch": 1.4497485877459806, + "grad_norm": 0.16053885024497278, + "learning_rate": 6.154563062876836e-05, + "loss": 2.8732, + "step": 23354 + }, + { + "epoch": 1.4498106648457383, + "grad_norm": 0.14601744938951103, + "learning_rate": 6.154211667774291e-05, + "loss": 2.7515, + "step": 23355 + }, + { + "epoch": 1.4498727419454962, + "grad_norm": 0.15212250808110453, + "learning_rate": 6.153860266649942e-05, + "loss": 2.7971, + "step": 23356 + }, + { + "epoch": 1.4499348190452541, + "grad_norm": 0.15961629865607158, + "learning_rate": 6.153508859505627e-05, + "loss": 2.8592, + "step": 23357 + }, + { + "epoch": 1.449996896145012, + "grad_norm": 0.17167607087718884, + "learning_rate": 6.153157446343173e-05, + "loss": 2.8974, + "step": 23358 + }, + { + "epoch": 1.45005897324477, + "grad_norm": 0.16795715025836294, + "learning_rate": 6.152806027164419e-05, + "loss": 2.8381, + "step": 23359 + }, + { + "epoch": 1.4501210503445279, + "grad_norm": 0.1898233296750032, + "learning_rate": 6.152454601971194e-05, + "loss": 2.8378, + "step": 23360 + }, + { + "epoch": 1.4501831274442858, + "grad_norm": 0.15095868516811328, + "learning_rate": 6.152103170765336e-05, + "loss": 2.8464, + "step": 23361 + }, + { + "epoch": 1.4502452045440437, + "grad_norm": 0.1545300713350495, + "learning_rate": 6.151751733548673e-05, + "loss": 2.8376, + "step": 23362 + }, + { + "epoch": 1.4503072816438016, + "grad_norm": 0.152721541809501, + "learning_rate": 6.151400290323043e-05, + "loss": 2.7017, + "step": 23363 + }, + { + "epoch": 1.4503693587435595, + "grad_norm": 0.15280438133430244, + "learning_rate": 6.151048841090278e-05, + "loss": 2.7871, + "step": 23364 + }, + { + "epoch": 1.4504314358433175, + "grad_norm": 0.1604284148796214, + "learning_rate": 6.150697385852213e-05, + "loss": 2.7005, + "step": 23365 + }, + { + "epoch": 1.4504935129430754, + "grad_norm": 0.1501066015354012, + "learning_rate": 6.15034592461068e-05, + "loss": 2.8492, + "step": 23366 + }, + { + "epoch": 1.4505555900428333, + "grad_norm": 0.15872892669796115, + "learning_rate": 6.149994457367511e-05, + "loss": 2.8792, + "step": 23367 + }, + { + "epoch": 1.4506176671425912, + "grad_norm": 0.15561123457700618, + "learning_rate": 6.149642984124545e-05, + "loss": 2.8474, + "step": 23368 + }, + { + "epoch": 1.4506797442423491, + "grad_norm": 0.1506055621212539, + "learning_rate": 6.149291504883611e-05, + "loss": 2.8165, + "step": 23369 + }, + { + "epoch": 1.4507418213421068, + "grad_norm": 0.16034513476890297, + "learning_rate": 6.148940019646544e-05, + "loss": 2.8435, + "step": 23370 + }, + { + "epoch": 1.4508038984418647, + "grad_norm": 0.16042862967269417, + "learning_rate": 6.148588528415178e-05, + "loss": 2.8087, + "step": 23371 + }, + { + "epoch": 1.4508659755416227, + "grad_norm": 0.16033725860583778, + "learning_rate": 6.148237031191348e-05, + "loss": 2.8301, + "step": 23372 + }, + { + "epoch": 1.4509280526413806, + "grad_norm": 0.15333164076048025, + "learning_rate": 6.147885527976885e-05, + "loss": 2.7963, + "step": 23373 + }, + { + "epoch": 1.4509901297411385, + "grad_norm": 0.17226608172505356, + "learning_rate": 6.147534018773625e-05, + "loss": 2.8034, + "step": 23374 + }, + { + "epoch": 1.4510522068408964, + "grad_norm": 0.14992509352433184, + "learning_rate": 6.147182503583403e-05, + "loss": 2.7838, + "step": 23375 + }, + { + "epoch": 1.4511142839406543, + "grad_norm": 0.15765028774983514, + "learning_rate": 6.14683098240805e-05, + "loss": 2.7653, + "step": 23376 + }, + { + "epoch": 1.4511763610404123, + "grad_norm": 0.17503127006036187, + "learning_rate": 6.146479455249401e-05, + "loss": 2.782, + "step": 23377 + }, + { + "epoch": 1.4512384381401702, + "grad_norm": 0.16955172573319138, + "learning_rate": 6.14612792210929e-05, + "loss": 2.79, + "step": 23378 + }, + { + "epoch": 1.4513005152399279, + "grad_norm": 0.17171032559501762, + "learning_rate": 6.145776382989552e-05, + "loss": 2.8945, + "step": 23379 + }, + { + "epoch": 1.4513625923396858, + "grad_norm": 0.1612327843539649, + "learning_rate": 6.145424837892021e-05, + "loss": 2.8617, + "step": 23380 + }, + { + "epoch": 1.4514246694394437, + "grad_norm": 0.15750396732033867, + "learning_rate": 6.145073286818529e-05, + "loss": 2.7804, + "step": 23381 + }, + { + "epoch": 1.4514867465392016, + "grad_norm": 0.15507362512935013, + "learning_rate": 6.144721729770913e-05, + "loss": 2.7824, + "step": 23382 + }, + { + "epoch": 1.4515488236389595, + "grad_norm": 0.15026901331815293, + "learning_rate": 6.144370166751005e-05, + "loss": 2.8789, + "step": 23383 + }, + { + "epoch": 1.4516109007387175, + "grad_norm": 0.1489508810935735, + "learning_rate": 6.144018597760642e-05, + "loss": 2.7076, + "step": 23384 + }, + { + "epoch": 1.4516729778384754, + "grad_norm": 0.1611947894114002, + "learning_rate": 6.143667022801655e-05, + "loss": 2.7936, + "step": 23385 + }, + { + "epoch": 1.4517350549382333, + "grad_norm": 0.16210512362501128, + "learning_rate": 6.143315441875879e-05, + "loss": 2.7541, + "step": 23386 + }, + { + "epoch": 1.4517971320379912, + "grad_norm": 0.1497961718291187, + "learning_rate": 6.14296385498515e-05, + "loss": 2.7902, + "step": 23387 + }, + { + "epoch": 1.4518592091377491, + "grad_norm": 0.1517791310854203, + "learning_rate": 6.1426122621313e-05, + "loss": 2.8221, + "step": 23388 + }, + { + "epoch": 1.451921286237507, + "grad_norm": 0.19855728851662038, + "learning_rate": 6.142260663316165e-05, + "loss": 2.7294, + "step": 23389 + }, + { + "epoch": 1.451983363337265, + "grad_norm": 0.15668594817666315, + "learning_rate": 6.141909058541579e-05, + "loss": 2.8165, + "step": 23390 + }, + { + "epoch": 1.4520454404370229, + "grad_norm": 0.14906507762743582, + "learning_rate": 6.141557447809377e-05, + "loss": 2.8501, + "step": 23391 + }, + { + "epoch": 1.4521075175367808, + "grad_norm": 0.1696371235721092, + "learning_rate": 6.14120583112139e-05, + "loss": 2.815, + "step": 23392 + }, + { + "epoch": 1.4521695946365387, + "grad_norm": 0.1765472042581351, + "learning_rate": 6.140854208479458e-05, + "loss": 2.8953, + "step": 23393 + }, + { + "epoch": 1.4522316717362964, + "grad_norm": 0.18710495141201797, + "learning_rate": 6.14050257988541e-05, + "loss": 2.86, + "step": 23394 + }, + { + "epoch": 1.4522937488360543, + "grad_norm": 0.16647233971423506, + "learning_rate": 6.140150945341084e-05, + "loss": 2.9155, + "step": 23395 + }, + { + "epoch": 1.4523558259358123, + "grad_norm": 0.15321216899899562, + "learning_rate": 6.139799304848315e-05, + "loss": 2.8454, + "step": 23396 + }, + { + "epoch": 1.4524179030355702, + "grad_norm": 0.15348911711737748, + "learning_rate": 6.139447658408936e-05, + "loss": 2.7313, + "step": 23397 + }, + { + "epoch": 1.452479980135328, + "grad_norm": 0.15844695997869934, + "learning_rate": 6.139096006024782e-05, + "loss": 2.8412, + "step": 23398 + }, + { + "epoch": 1.452542057235086, + "grad_norm": 0.164454213906713, + "learning_rate": 6.138744347697686e-05, + "loss": 2.7431, + "step": 23399 + }, + { + "epoch": 1.452604134334844, + "grad_norm": 0.1645646867621271, + "learning_rate": 6.138392683429486e-05, + "loss": 2.836, + "step": 23400 + }, + { + "epoch": 1.4526662114346018, + "grad_norm": 0.14314867850338897, + "learning_rate": 6.138041013222014e-05, + "loss": 2.8659, + "step": 23401 + }, + { + "epoch": 1.4527282885343598, + "grad_norm": 0.1413896026612308, + "learning_rate": 6.137689337077105e-05, + "loss": 2.807, + "step": 23402 + }, + { + "epoch": 1.4527903656341175, + "grad_norm": 0.1474554490337479, + "learning_rate": 6.137337654996597e-05, + "loss": 2.8937, + "step": 23403 + }, + { + "epoch": 1.4528524427338754, + "grad_norm": 0.16439857008419773, + "learning_rate": 6.136985966982322e-05, + "loss": 2.8036, + "step": 23404 + }, + { + "epoch": 1.4529145198336333, + "grad_norm": 0.1397349549765304, + "learning_rate": 6.136634273036114e-05, + "loss": 2.7859, + "step": 23405 + }, + { + "epoch": 1.4529765969333912, + "grad_norm": 0.15796927841581113, + "learning_rate": 6.13628257315981e-05, + "loss": 2.8717, + "step": 23406 + }, + { + "epoch": 1.4530386740331491, + "grad_norm": 0.14425251476642784, + "learning_rate": 6.135930867355242e-05, + "loss": 2.7663, + "step": 23407 + }, + { + "epoch": 1.453100751132907, + "grad_norm": 0.1572604409437415, + "learning_rate": 6.135579155624249e-05, + "loss": 2.7814, + "step": 23408 + }, + { + "epoch": 1.453162828232665, + "grad_norm": 0.17778714413191882, + "learning_rate": 6.135227437968663e-05, + "loss": 2.8613, + "step": 23409 + }, + { + "epoch": 1.4532249053324229, + "grad_norm": 0.14728311844597072, + "learning_rate": 6.134875714390321e-05, + "loss": 2.891, + "step": 23410 + }, + { + "epoch": 1.4532869824321808, + "grad_norm": 0.16136640898037238, + "learning_rate": 6.134523984891055e-05, + "loss": 2.797, + "step": 23411 + }, + { + "epoch": 1.4533490595319387, + "grad_norm": 0.1525699818592082, + "learning_rate": 6.134172249472702e-05, + "loss": 2.8009, + "step": 23412 + }, + { + "epoch": 1.4534111366316966, + "grad_norm": 0.16572628050995805, + "learning_rate": 6.133820508137098e-05, + "loss": 2.8562, + "step": 23413 + }, + { + "epoch": 1.4534732137314545, + "grad_norm": 0.17150670386299136, + "learning_rate": 6.133468760886078e-05, + "loss": 2.8231, + "step": 23414 + }, + { + "epoch": 1.4535352908312125, + "grad_norm": 0.18920947659336176, + "learning_rate": 6.133117007721475e-05, + "loss": 2.7901, + "step": 23415 + }, + { + "epoch": 1.4535973679309704, + "grad_norm": 0.15472511426618188, + "learning_rate": 6.132765248645125e-05, + "loss": 2.8427, + "step": 23416 + }, + { + "epoch": 1.453659445030728, + "grad_norm": 0.1799805469895688, + "learning_rate": 6.132413483658866e-05, + "loss": 2.9499, + "step": 23417 + }, + { + "epoch": 1.453721522130486, + "grad_norm": 0.1572438165426104, + "learning_rate": 6.13206171276453e-05, + "loss": 2.8429, + "step": 23418 + }, + { + "epoch": 1.453783599230244, + "grad_norm": 0.16499346605462373, + "learning_rate": 6.131709935963952e-05, + "loss": 2.8877, + "step": 23419 + }, + { + "epoch": 1.4538456763300018, + "grad_norm": 0.15376359796094052, + "learning_rate": 6.131358153258969e-05, + "loss": 2.8776, + "step": 23420 + }, + { + "epoch": 1.4539077534297598, + "grad_norm": 0.15157857101810504, + "learning_rate": 6.131006364651416e-05, + "loss": 2.8437, + "step": 23421 + }, + { + "epoch": 1.4539698305295177, + "grad_norm": 0.17588164323692518, + "learning_rate": 6.13065457014313e-05, + "loss": 2.8817, + "step": 23422 + }, + { + "epoch": 1.4540319076292756, + "grad_norm": 0.15458032069029656, + "learning_rate": 6.130302769735941e-05, + "loss": 2.9342, + "step": 23423 + }, + { + "epoch": 1.4540939847290335, + "grad_norm": 0.1462375341303034, + "learning_rate": 6.129950963431691e-05, + "loss": 2.8395, + "step": 23424 + }, + { + "epoch": 1.4541560618287914, + "grad_norm": 0.14984128994757076, + "learning_rate": 6.129599151232213e-05, + "loss": 2.8437, + "step": 23425 + }, + { + "epoch": 1.4542181389285491, + "grad_norm": 0.15112031678428398, + "learning_rate": 6.129247333139339e-05, + "loss": 2.8307, + "step": 23426 + }, + { + "epoch": 1.454280216028307, + "grad_norm": 0.15033631149052787, + "learning_rate": 6.12889550915491e-05, + "loss": 2.8183, + "step": 23427 + }, + { + "epoch": 1.454342293128065, + "grad_norm": 0.1516612058407928, + "learning_rate": 6.128543679280756e-05, + "loss": 2.7971, + "step": 23428 + }, + { + "epoch": 1.4544043702278229, + "grad_norm": 0.1566962961311619, + "learning_rate": 6.12819184351872e-05, + "loss": 2.8358, + "step": 23429 + }, + { + "epoch": 1.4544664473275808, + "grad_norm": 0.1709403013997078, + "learning_rate": 6.12784000187063e-05, + "loss": 2.7919, + "step": 23430 + }, + { + "epoch": 1.4545285244273387, + "grad_norm": 0.14574411904704504, + "learning_rate": 6.127488154338326e-05, + "loss": 2.8155, + "step": 23431 + }, + { + "epoch": 1.4545906015270966, + "grad_norm": 0.15784695645725008, + "learning_rate": 6.127136300923644e-05, + "loss": 2.8242, + "step": 23432 + }, + { + "epoch": 1.4546526786268545, + "grad_norm": 0.170058261463125, + "learning_rate": 6.126784441628416e-05, + "loss": 2.8446, + "step": 23433 + }, + { + "epoch": 1.4547147557266125, + "grad_norm": 0.16790357700409106, + "learning_rate": 6.126432576454482e-05, + "loss": 2.8032, + "step": 23434 + }, + { + "epoch": 1.4547768328263704, + "grad_norm": 0.1513207333561023, + "learning_rate": 6.126080705403674e-05, + "loss": 2.7836, + "step": 23435 + }, + { + "epoch": 1.4548389099261283, + "grad_norm": 0.15331922518593663, + "learning_rate": 6.125728828477832e-05, + "loss": 2.7802, + "step": 23436 + }, + { + "epoch": 1.4549009870258862, + "grad_norm": 0.1451511748856693, + "learning_rate": 6.125376945678787e-05, + "loss": 2.874, + "step": 23437 + }, + { + "epoch": 1.4549630641256441, + "grad_norm": 0.14799023050366447, + "learning_rate": 6.125025057008378e-05, + "loss": 2.7677, + "step": 23438 + }, + { + "epoch": 1.455025141225402, + "grad_norm": 0.1538721407451135, + "learning_rate": 6.12467316246844e-05, + "loss": 2.8261, + "step": 23439 + }, + { + "epoch": 1.45508721832516, + "grad_norm": 0.18842858608159133, + "learning_rate": 6.12432126206081e-05, + "loss": 2.8354, + "step": 23440 + }, + { + "epoch": 1.4551492954249177, + "grad_norm": 0.15128298847978092, + "learning_rate": 6.123969355787322e-05, + "loss": 2.7576, + "step": 23441 + }, + { + "epoch": 1.4552113725246756, + "grad_norm": 0.14533473608665337, + "learning_rate": 6.123617443649815e-05, + "loss": 2.9136, + "step": 23442 + }, + { + "epoch": 1.4552734496244335, + "grad_norm": 0.19977751143400563, + "learning_rate": 6.12326552565012e-05, + "loss": 2.765, + "step": 23443 + }, + { + "epoch": 1.4553355267241914, + "grad_norm": 0.132501954365658, + "learning_rate": 6.122913601790078e-05, + "loss": 2.7965, + "step": 23444 + }, + { + "epoch": 1.4553976038239493, + "grad_norm": 0.1640813453386626, + "learning_rate": 6.122561672071521e-05, + "loss": 2.7913, + "step": 23445 + }, + { + "epoch": 1.4554596809237073, + "grad_norm": 0.14659470535776958, + "learning_rate": 6.12220973649629e-05, + "loss": 2.7861, + "step": 23446 + }, + { + "epoch": 1.4555217580234652, + "grad_norm": 0.14691658471236466, + "learning_rate": 6.121857795066216e-05, + "loss": 2.9552, + "step": 23447 + }, + { + "epoch": 1.455583835123223, + "grad_norm": 0.1866052757598051, + "learning_rate": 6.121505847783139e-05, + "loss": 2.82, + "step": 23448 + }, + { + "epoch": 1.455645912222981, + "grad_norm": 0.14859714205346994, + "learning_rate": 6.121153894648894e-05, + "loss": 2.7668, + "step": 23449 + }, + { + "epoch": 1.4557079893227387, + "grad_norm": 0.15733267479020568, + "learning_rate": 6.120801935665318e-05, + "loss": 2.8262, + "step": 23450 + }, + { + "epoch": 1.4557700664224966, + "grad_norm": 0.16034094116681463, + "learning_rate": 6.120449970834243e-05, + "loss": 2.8546, + "step": 23451 + }, + { + "epoch": 1.4558321435222545, + "grad_norm": 0.16875851508132456, + "learning_rate": 6.120098000157511e-05, + "loss": 2.8113, + "step": 23452 + }, + { + "epoch": 1.4558942206220125, + "grad_norm": 0.156990810425265, + "learning_rate": 6.119746023636955e-05, + "loss": 2.7299, + "step": 23453 + }, + { + "epoch": 1.4559562977217704, + "grad_norm": 0.17688057782844172, + "learning_rate": 6.119394041274413e-05, + "loss": 2.9157, + "step": 23454 + }, + { + "epoch": 1.4560183748215283, + "grad_norm": 0.15106883640478588, + "learning_rate": 6.11904205307172e-05, + "loss": 2.872, + "step": 23455 + }, + { + "epoch": 1.4560804519212862, + "grad_norm": 0.157867965535106, + "learning_rate": 6.118690059030712e-05, + "loss": 2.8626, + "step": 23456 + }, + { + "epoch": 1.4561425290210441, + "grad_norm": 0.1489533008101016, + "learning_rate": 6.118338059153228e-05, + "loss": 2.7994, + "step": 23457 + }, + { + "epoch": 1.456204606120802, + "grad_norm": 0.14740954381397686, + "learning_rate": 6.117986053441101e-05, + "loss": 2.8313, + "step": 23458 + }, + { + "epoch": 1.45626668322056, + "grad_norm": 0.14947992630897344, + "learning_rate": 6.11763404189617e-05, + "loss": 2.8534, + "step": 23459 + }, + { + "epoch": 1.4563287603203179, + "grad_norm": 0.1595816889504687, + "learning_rate": 6.117282024520272e-05, + "loss": 2.8627, + "step": 23460 + }, + { + "epoch": 1.4563908374200758, + "grad_norm": 0.16626184941132902, + "learning_rate": 6.11693000131524e-05, + "loss": 2.8688, + "step": 23461 + }, + { + "epoch": 1.4564529145198337, + "grad_norm": 0.17656596716286946, + "learning_rate": 6.116577972282915e-05, + "loss": 2.8115, + "step": 23462 + }, + { + "epoch": 1.4565149916195916, + "grad_norm": 0.1470139404307396, + "learning_rate": 6.116225937425132e-05, + "loss": 2.8571, + "step": 23463 + }, + { + "epoch": 1.4565770687193496, + "grad_norm": 0.18931184987321453, + "learning_rate": 6.115873896743726e-05, + "loss": 2.8236, + "step": 23464 + }, + { + "epoch": 1.4566391458191073, + "grad_norm": 0.21647136755156893, + "learning_rate": 6.115521850240537e-05, + "loss": 2.7787, + "step": 23465 + }, + { + "epoch": 1.4567012229188652, + "grad_norm": 0.1523971280705471, + "learning_rate": 6.115169797917397e-05, + "loss": 2.73, + "step": 23466 + }, + { + "epoch": 1.456763300018623, + "grad_norm": 0.15636177833153336, + "learning_rate": 6.114817739776147e-05, + "loss": 2.8564, + "step": 23467 + }, + { + "epoch": 1.456825377118381, + "grad_norm": 0.15939282266219698, + "learning_rate": 6.114465675818623e-05, + "loss": 2.8112, + "step": 23468 + }, + { + "epoch": 1.456887454218139, + "grad_norm": 0.14737935999171284, + "learning_rate": 6.11411360604666e-05, + "loss": 2.8371, + "step": 23469 + }, + { + "epoch": 1.4569495313178968, + "grad_norm": 0.15535634904163018, + "learning_rate": 6.113761530462097e-05, + "loss": 2.7715, + "step": 23470 + }, + { + "epoch": 1.4570116084176548, + "grad_norm": 0.157138497704138, + "learning_rate": 6.113409449066767e-05, + "loss": 2.8616, + "step": 23471 + }, + { + "epoch": 1.4570736855174127, + "grad_norm": 0.20878876239123423, + "learning_rate": 6.113057361862513e-05, + "loss": 2.8269, + "step": 23472 + }, + { + "epoch": 1.4571357626171706, + "grad_norm": 0.14971717307419402, + "learning_rate": 6.112705268851166e-05, + "loss": 2.8069, + "step": 23473 + }, + { + "epoch": 1.4571978397169283, + "grad_norm": 0.15565925025920394, + "learning_rate": 6.112353170034566e-05, + "loss": 2.8495, + "step": 23474 + }, + { + "epoch": 1.4572599168166862, + "grad_norm": 0.21078719859569667, + "learning_rate": 6.11200106541455e-05, + "loss": 2.8155, + "step": 23475 + }, + { + "epoch": 1.4573219939164441, + "grad_norm": 0.15772931059160353, + "learning_rate": 6.111648954992954e-05, + "loss": 2.8336, + "step": 23476 + }, + { + "epoch": 1.457384071016202, + "grad_norm": 0.19426501719655004, + "learning_rate": 6.111296838771616e-05, + "loss": 2.7435, + "step": 23477 + }, + { + "epoch": 1.45744614811596, + "grad_norm": 0.1634041200689775, + "learning_rate": 6.110944716752371e-05, + "loss": 2.9156, + "step": 23478 + }, + { + "epoch": 1.4575082252157179, + "grad_norm": 0.16635240726957834, + "learning_rate": 6.11059258893706e-05, + "loss": 2.888, + "step": 23479 + }, + { + "epoch": 1.4575703023154758, + "grad_norm": 0.15638750701396914, + "learning_rate": 6.110240455327517e-05, + "loss": 2.8753, + "step": 23480 + }, + { + "epoch": 1.4576323794152337, + "grad_norm": 0.16159679195563253, + "learning_rate": 6.10988831592558e-05, + "loss": 2.884, + "step": 23481 + }, + { + "epoch": 1.4576944565149916, + "grad_norm": 0.16921207007082786, + "learning_rate": 6.109536170733087e-05, + "loss": 2.7657, + "step": 23482 + }, + { + "epoch": 1.4577565336147496, + "grad_norm": 0.23135077078684485, + "learning_rate": 6.109184019751874e-05, + "loss": 2.856, + "step": 23483 + }, + { + "epoch": 1.4578186107145075, + "grad_norm": 0.14595850329816185, + "learning_rate": 6.108831862983777e-05, + "loss": 2.7723, + "step": 23484 + }, + { + "epoch": 1.4578806878142654, + "grad_norm": 0.1756338667243393, + "learning_rate": 6.108479700430637e-05, + "loss": 2.8583, + "step": 23485 + }, + { + "epoch": 1.4579427649140233, + "grad_norm": 0.15194380665427712, + "learning_rate": 6.108127532094289e-05, + "loss": 2.8925, + "step": 23486 + }, + { + "epoch": 1.4580048420137812, + "grad_norm": 0.14771979392651027, + "learning_rate": 6.107775357976571e-05, + "loss": 2.8126, + "step": 23487 + }, + { + "epoch": 1.4580669191135391, + "grad_norm": 0.16487097221614241, + "learning_rate": 6.107423178079319e-05, + "loss": 2.8283, + "step": 23488 + }, + { + "epoch": 1.4581289962132968, + "grad_norm": 0.15454129886450582, + "learning_rate": 6.107070992404371e-05, + "loss": 2.8813, + "step": 23489 + }, + { + "epoch": 1.4581910733130548, + "grad_norm": 0.17837617089101773, + "learning_rate": 6.106718800953566e-05, + "loss": 2.8745, + "step": 23490 + }, + { + "epoch": 1.4582531504128127, + "grad_norm": 0.1601519177013698, + "learning_rate": 6.106366603728741e-05, + "loss": 2.8311, + "step": 23491 + }, + { + "epoch": 1.4583152275125706, + "grad_norm": 0.13617585567580553, + "learning_rate": 6.106014400731732e-05, + "loss": 2.7468, + "step": 23492 + }, + { + "epoch": 1.4583773046123285, + "grad_norm": 0.15233179192194451, + "learning_rate": 6.105662191964376e-05, + "loss": 2.8631, + "step": 23493 + }, + { + "epoch": 1.4584393817120864, + "grad_norm": 0.14257474790780264, + "learning_rate": 6.105309977428513e-05, + "loss": 2.877, + "step": 23494 + }, + { + "epoch": 1.4585014588118443, + "grad_norm": 0.14616478707577774, + "learning_rate": 6.104957757125981e-05, + "loss": 2.7837, + "step": 23495 + }, + { + "epoch": 1.4585635359116023, + "grad_norm": 0.14694856211723642, + "learning_rate": 6.104605531058613e-05, + "loss": 2.8554, + "step": 23496 + }, + { + "epoch": 1.4586256130113602, + "grad_norm": 0.15261726401849238, + "learning_rate": 6.104253299228253e-05, + "loss": 2.7887, + "step": 23497 + }, + { + "epoch": 1.4586876901111179, + "grad_norm": 0.14613262541554256, + "learning_rate": 6.103901061636735e-05, + "loss": 2.8086, + "step": 23498 + }, + { + "epoch": 1.4587497672108758, + "grad_norm": 0.16921454129396435, + "learning_rate": 6.103548818285895e-05, + "loss": 2.7814, + "step": 23499 + }, + { + "epoch": 1.4588118443106337, + "grad_norm": 0.15979035962497387, + "learning_rate": 6.103196569177575e-05, + "loss": 2.7876, + "step": 23500 + }, + { + "epoch": 1.4588739214103916, + "grad_norm": 0.1642641746658717, + "learning_rate": 6.1028443143136104e-05, + "loss": 2.7704, + "step": 23501 + }, + { + "epoch": 1.4589359985101495, + "grad_norm": 0.1479285071451557, + "learning_rate": 6.1024920536958384e-05, + "loss": 2.8448, + "step": 23502 + }, + { + "epoch": 1.4589980756099075, + "grad_norm": 0.15635666186899866, + "learning_rate": 6.102139787326099e-05, + "loss": 2.9213, + "step": 23503 + }, + { + "epoch": 1.4590601527096654, + "grad_norm": 0.15075204818474486, + "learning_rate": 6.1017875152062264e-05, + "loss": 2.8232, + "step": 23504 + }, + { + "epoch": 1.4591222298094233, + "grad_norm": 0.14752778060395494, + "learning_rate": 6.101435237338063e-05, + "loss": 2.8462, + "step": 23505 + }, + { + "epoch": 1.4591843069091812, + "grad_norm": 0.14877226374409083, + "learning_rate": 6.101082953723445e-05, + "loss": 2.8526, + "step": 23506 + }, + { + "epoch": 1.4592463840089391, + "grad_norm": 0.1415574893106739, + "learning_rate": 6.100730664364208e-05, + "loss": 2.835, + "step": 23507 + }, + { + "epoch": 1.459308461108697, + "grad_norm": 0.1462575937198877, + "learning_rate": 6.100378369262194e-05, + "loss": 2.8846, + "step": 23508 + }, + { + "epoch": 1.459370538208455, + "grad_norm": 0.16382163395572213, + "learning_rate": 6.1000260684192364e-05, + "loss": 2.8272, + "step": 23509 + }, + { + "epoch": 1.4594326153082129, + "grad_norm": 0.1457417610732736, + "learning_rate": 6.099673761837177e-05, + "loss": 2.8119, + "step": 23510 + }, + { + "epoch": 1.4594946924079708, + "grad_norm": 0.17920359468444266, + "learning_rate": 6.099321449517852e-05, + "loss": 2.8129, + "step": 23511 + }, + { + "epoch": 1.4595567695077287, + "grad_norm": 0.14859718592549176, + "learning_rate": 6.098969131463102e-05, + "loss": 2.8044, + "step": 23512 + }, + { + "epoch": 1.4596188466074864, + "grad_norm": 0.14535100539525375, + "learning_rate": 6.098616807674762e-05, + "loss": 2.8795, + "step": 23513 + }, + { + "epoch": 1.4596809237072443, + "grad_norm": 0.17076380740069352, + "learning_rate": 6.098264478154672e-05, + "loss": 2.903, + "step": 23514 + }, + { + "epoch": 1.4597430008070023, + "grad_norm": 0.15136041318439525, + "learning_rate": 6.0979121429046695e-05, + "loss": 2.8758, + "step": 23515 + }, + { + "epoch": 1.4598050779067602, + "grad_norm": 0.14382872084577594, + "learning_rate": 6.0975598019265914e-05, + "loss": 2.719, + "step": 23516 + }, + { + "epoch": 1.459867155006518, + "grad_norm": 0.17799439117416352, + "learning_rate": 6.097207455222279e-05, + "loss": 2.7916, + "step": 23517 + }, + { + "epoch": 1.459929232106276, + "grad_norm": 0.15779152060324764, + "learning_rate": 6.0968551027935686e-05, + "loss": 2.9071, + "step": 23518 + }, + { + "epoch": 1.459991309206034, + "grad_norm": 0.22731947228492447, + "learning_rate": 6.0965027446422986e-05, + "loss": 2.8895, + "step": 23519 + }, + { + "epoch": 1.4600533863057918, + "grad_norm": 0.16518874406328068, + "learning_rate": 6.096150380770307e-05, + "loss": 2.8477, + "step": 23520 + }, + { + "epoch": 1.4601154634055498, + "grad_norm": 0.15997664336068215, + "learning_rate": 6.095798011179434e-05, + "loss": 2.8191, + "step": 23521 + }, + { + "epoch": 1.4601775405053075, + "grad_norm": 0.15509493782323402, + "learning_rate": 6.095445635871516e-05, + "loss": 2.8122, + "step": 23522 + }, + { + "epoch": 1.4602396176050654, + "grad_norm": 0.16704890197139274, + "learning_rate": 6.095093254848393e-05, + "loss": 2.829, + "step": 23523 + }, + { + "epoch": 1.4603016947048233, + "grad_norm": 0.16003570439523843, + "learning_rate": 6.094740868111902e-05, + "loss": 2.8059, + "step": 23524 + }, + { + "epoch": 1.4603637718045812, + "grad_norm": 0.15467535149428138, + "learning_rate": 6.094388475663881e-05, + "loss": 2.8431, + "step": 23525 + }, + { + "epoch": 1.4604258489043391, + "grad_norm": 0.2019542184943989, + "learning_rate": 6.094036077506171e-05, + "loss": 2.8351, + "step": 23526 + }, + { + "epoch": 1.460487926004097, + "grad_norm": 0.16913273508818213, + "learning_rate": 6.093683673640609e-05, + "loss": 2.7353, + "step": 23527 + }, + { + "epoch": 1.460550003103855, + "grad_norm": 0.15603734088303772, + "learning_rate": 6.093331264069034e-05, + "loss": 2.8257, + "step": 23528 + }, + { + "epoch": 1.4606120802036129, + "grad_norm": 0.154690686588025, + "learning_rate": 6.0929788487932836e-05, + "loss": 2.8557, + "step": 23529 + }, + { + "epoch": 1.4606741573033708, + "grad_norm": 0.15636415483867816, + "learning_rate": 6.092626427815198e-05, + "loss": 2.89, + "step": 23530 + }, + { + "epoch": 1.4607362344031287, + "grad_norm": 0.15946284075288905, + "learning_rate": 6.0922740011366154e-05, + "loss": 2.776, + "step": 23531 + }, + { + "epoch": 1.4607983115028866, + "grad_norm": 0.14826403696828033, + "learning_rate": 6.091921568759373e-05, + "loss": 2.8097, + "step": 23532 + }, + { + "epoch": 1.4608603886026446, + "grad_norm": 0.16404886983837072, + "learning_rate": 6.091569130685312e-05, + "loss": 2.8037, + "step": 23533 + }, + { + "epoch": 1.4609224657024025, + "grad_norm": 0.15793327582219918, + "learning_rate": 6.091216686916269e-05, + "loss": 2.7548, + "step": 23534 + }, + { + "epoch": 1.4609845428021604, + "grad_norm": 0.14171094287241603, + "learning_rate": 6.0908642374540835e-05, + "loss": 2.8649, + "step": 23535 + }, + { + "epoch": 1.4610466199019183, + "grad_norm": 0.15057833215727554, + "learning_rate": 6.090511782300595e-05, + "loss": 2.8822, + "step": 23536 + }, + { + "epoch": 1.461108697001676, + "grad_norm": 0.15236562180625407, + "learning_rate": 6.0901593214576425e-05, + "loss": 2.8411, + "step": 23537 + }, + { + "epoch": 1.461170774101434, + "grad_norm": 0.20254348080560355, + "learning_rate": 6.089806854927063e-05, + "loss": 2.8798, + "step": 23538 + }, + { + "epoch": 1.4612328512011918, + "grad_norm": 0.17301567821145306, + "learning_rate": 6.089454382710697e-05, + "loss": 2.8796, + "step": 23539 + }, + { + "epoch": 1.4612949283009498, + "grad_norm": 0.18368938674100552, + "learning_rate": 6.0891019048103817e-05, + "loss": 2.8612, + "step": 23540 + }, + { + "epoch": 1.4613570054007077, + "grad_norm": 0.14808917586182604, + "learning_rate": 6.0887494212279595e-05, + "loss": 2.8319, + "step": 23541 + }, + { + "epoch": 1.4614190825004656, + "grad_norm": 0.2523595356211182, + "learning_rate": 6.088396931965267e-05, + "loss": 2.6802, + "step": 23542 + }, + { + "epoch": 1.4614811596002235, + "grad_norm": 0.14930471769980008, + "learning_rate": 6.088044437024141e-05, + "loss": 2.7824, + "step": 23543 + }, + { + "epoch": 1.4615432366999814, + "grad_norm": 0.16726265581712563, + "learning_rate": 6.087691936406425e-05, + "loss": 2.826, + "step": 23544 + }, + { + "epoch": 1.4616053137997393, + "grad_norm": 0.15776045408423722, + "learning_rate": 6.0873394301139555e-05, + "loss": 2.9275, + "step": 23545 + }, + { + "epoch": 1.461667390899497, + "grad_norm": 0.15314857924173175, + "learning_rate": 6.0869869181485726e-05, + "loss": 2.8074, + "step": 23546 + }, + { + "epoch": 1.461729467999255, + "grad_norm": 0.1591545447871075, + "learning_rate": 6.086634400512116e-05, + "loss": 2.852, + "step": 23547 + }, + { + "epoch": 1.4617915450990129, + "grad_norm": 0.1462450361020758, + "learning_rate": 6.0862818772064235e-05, + "loss": 2.8672, + "step": 23548 + }, + { + "epoch": 1.4618536221987708, + "grad_norm": 0.15115432538819704, + "learning_rate": 6.0859293482333347e-05, + "loss": 2.892, + "step": 23549 + }, + { + "epoch": 1.4619156992985287, + "grad_norm": 0.14242758053657686, + "learning_rate": 6.085576813594688e-05, + "loss": 2.8714, + "step": 23550 + }, + { + "epoch": 1.4619777763982866, + "grad_norm": 0.16966206320427676, + "learning_rate": 6.085224273292324e-05, + "loss": 2.8274, + "step": 23551 + }, + { + "epoch": 1.4620398534980446, + "grad_norm": 0.1411308554823289, + "learning_rate": 6.084871727328081e-05, + "loss": 2.7304, + "step": 23552 + }, + { + "epoch": 1.4621019305978025, + "grad_norm": 0.21233395758106158, + "learning_rate": 6.0845191757038e-05, + "loss": 2.8358, + "step": 23553 + }, + { + "epoch": 1.4621640076975604, + "grad_norm": 0.16292773356943163, + "learning_rate": 6.0841666184213184e-05, + "loss": 2.7528, + "step": 23554 + }, + { + "epoch": 1.4622260847973183, + "grad_norm": 0.1521014830349897, + "learning_rate": 6.083814055482476e-05, + "loss": 2.8236, + "step": 23555 + }, + { + "epoch": 1.4622881618970762, + "grad_norm": 0.15035869829227153, + "learning_rate": 6.083461486889113e-05, + "loss": 2.8474, + "step": 23556 + }, + { + "epoch": 1.4623502389968341, + "grad_norm": 0.15830127591262333, + "learning_rate": 6.08310891264307e-05, + "loss": 2.782, + "step": 23557 + }, + { + "epoch": 1.462412316096592, + "grad_norm": 0.22323324497631367, + "learning_rate": 6.0827563327461825e-05, + "loss": 2.919, + "step": 23558 + }, + { + "epoch": 1.46247439319635, + "grad_norm": 0.17416717916414004, + "learning_rate": 6.0824037472002923e-05, + "loss": 2.8489, + "step": 23559 + }, + { + "epoch": 1.462536470296108, + "grad_norm": 0.15618204784990197, + "learning_rate": 6.08205115600724e-05, + "loss": 2.8888, + "step": 23560 + }, + { + "epoch": 1.4625985473958656, + "grad_norm": 0.1837332682387663, + "learning_rate": 6.081698559168865e-05, + "loss": 2.8524, + "step": 23561 + }, + { + "epoch": 1.4626606244956235, + "grad_norm": 0.17022572677934483, + "learning_rate": 6.081345956687005e-05, + "loss": 2.85, + "step": 23562 + }, + { + "epoch": 1.4627227015953814, + "grad_norm": 0.1510801238944561, + "learning_rate": 6.0809933485634996e-05, + "loss": 2.8855, + "step": 23563 + }, + { + "epoch": 1.4627847786951393, + "grad_norm": 0.157246896364805, + "learning_rate": 6.0806407348001915e-05, + "loss": 2.8805, + "step": 23564 + }, + { + "epoch": 1.4628468557948973, + "grad_norm": 0.1463842940177406, + "learning_rate": 6.080288115398917e-05, + "loss": 2.763, + "step": 23565 + }, + { + "epoch": 1.4629089328946552, + "grad_norm": 0.18675309667356152, + "learning_rate": 6.079935490361518e-05, + "loss": 2.7607, + "step": 23566 + }, + { + "epoch": 1.462971009994413, + "grad_norm": 0.15055285410744335, + "learning_rate": 6.079582859689834e-05, + "loss": 2.7683, + "step": 23567 + }, + { + "epoch": 1.463033087094171, + "grad_norm": 0.16493596010945644, + "learning_rate": 6.0792302233857024e-05, + "loss": 2.8089, + "step": 23568 + }, + { + "epoch": 1.463095164193929, + "grad_norm": 0.15010215911365454, + "learning_rate": 6.0788775814509655e-05, + "loss": 2.8122, + "step": 23569 + }, + { + "epoch": 1.4631572412936866, + "grad_norm": 0.1836858782020094, + "learning_rate": 6.0785249338874615e-05, + "loss": 2.849, + "step": 23570 + }, + { + "epoch": 1.4632193183934445, + "grad_norm": 0.1656316580423812, + "learning_rate": 6.078172280697033e-05, + "loss": 2.8242, + "step": 23571 + }, + { + "epoch": 1.4632813954932025, + "grad_norm": 0.18955005054048063, + "learning_rate": 6.077819621881516e-05, + "loss": 2.8179, + "step": 23572 + }, + { + "epoch": 1.4633434725929604, + "grad_norm": 0.16788992030021138, + "learning_rate": 6.0774669574427544e-05, + "loss": 2.8881, + "step": 23573 + }, + { + "epoch": 1.4634055496927183, + "grad_norm": 0.17342384344754352, + "learning_rate": 6.077114287382584e-05, + "loss": 2.8789, + "step": 23574 + }, + { + "epoch": 1.4634676267924762, + "grad_norm": 0.15237640373249953, + "learning_rate": 6.076761611702848e-05, + "loss": 2.7255, + "step": 23575 + }, + { + "epoch": 1.4635297038922341, + "grad_norm": 0.18235792023254413, + "learning_rate": 6.0764089304053854e-05, + "loss": 2.8571, + "step": 23576 + }, + { + "epoch": 1.463591780991992, + "grad_norm": 0.15504967262752628, + "learning_rate": 6.0760562434920354e-05, + "loss": 2.8303, + "step": 23577 + }, + { + "epoch": 1.46365385809175, + "grad_norm": 0.16994384645499494, + "learning_rate": 6.07570355096464e-05, + "loss": 2.8302, + "step": 23578 + }, + { + "epoch": 1.4637159351915079, + "grad_norm": 0.1576605074319945, + "learning_rate": 6.075350852825037e-05, + "loss": 2.8232, + "step": 23579 + }, + { + "epoch": 1.4637780122912658, + "grad_norm": 0.16107641446867466, + "learning_rate": 6.074998149075068e-05, + "loss": 2.7466, + "step": 23580 + }, + { + "epoch": 1.4638400893910237, + "grad_norm": 0.17329670006486345, + "learning_rate": 6.074645439716572e-05, + "loss": 2.8004, + "step": 23581 + }, + { + "epoch": 1.4639021664907816, + "grad_norm": 0.15441882468961446, + "learning_rate": 6.07429272475139e-05, + "loss": 2.7631, + "step": 23582 + }, + { + "epoch": 1.4639642435905396, + "grad_norm": 0.23894840397502312, + "learning_rate": 6.073940004181362e-05, + "loss": 2.779, + "step": 23583 + }, + { + "epoch": 1.4640263206902975, + "grad_norm": 0.17807070381423812, + "learning_rate": 6.07358727800833e-05, + "loss": 2.8896, + "step": 23584 + }, + { + "epoch": 1.4640883977900552, + "grad_norm": 0.15102756916772567, + "learning_rate": 6.073234546234131e-05, + "loss": 2.7995, + "step": 23585 + }, + { + "epoch": 1.464150474889813, + "grad_norm": 0.14378978655486963, + "learning_rate": 6.0728818088606066e-05, + "loss": 2.7542, + "step": 23586 + }, + { + "epoch": 1.464212551989571, + "grad_norm": 0.17322613384778907, + "learning_rate": 6.0725290658895985e-05, + "loss": 2.882, + "step": 23587 + }, + { + "epoch": 1.464274629089329, + "grad_norm": 0.15506175144534573, + "learning_rate": 6.072176317322945e-05, + "loss": 2.8519, + "step": 23588 + }, + { + "epoch": 1.4643367061890868, + "grad_norm": 0.20557092765855528, + "learning_rate": 6.0718235631624886e-05, + "loss": 2.7219, + "step": 23589 + }, + { + "epoch": 1.4643987832888448, + "grad_norm": 0.20583546986481346, + "learning_rate": 6.0714708034100664e-05, + "loss": 2.7681, + "step": 23590 + }, + { + "epoch": 1.4644608603886027, + "grad_norm": 0.18096298026832552, + "learning_rate": 6.071118038067523e-05, + "loss": 2.8978, + "step": 23591 + }, + { + "epoch": 1.4645229374883606, + "grad_norm": 0.16179416068474375, + "learning_rate": 6.070765267136696e-05, + "loss": 2.8074, + "step": 23592 + }, + { + "epoch": 1.4645850145881185, + "grad_norm": 0.19325066410039418, + "learning_rate": 6.070412490619425e-05, + "loss": 2.8127, + "step": 23593 + }, + { + "epoch": 1.4646470916878762, + "grad_norm": 0.2160868573830695, + "learning_rate": 6.070059708517554e-05, + "loss": 2.7611, + "step": 23594 + }, + { + "epoch": 1.4647091687876341, + "grad_norm": 0.18363133512453234, + "learning_rate": 6.0697069208329204e-05, + "loss": 2.833, + "step": 23595 + }, + { + "epoch": 1.464771245887392, + "grad_norm": 0.18800292515381303, + "learning_rate": 6.069354127567368e-05, + "loss": 2.8669, + "step": 23596 + }, + { + "epoch": 1.46483332298715, + "grad_norm": 0.1715823684280895, + "learning_rate": 6.069001328722734e-05, + "loss": 2.8439, + "step": 23597 + }, + { + "epoch": 1.4648954000869079, + "grad_norm": 0.1608568602907765, + "learning_rate": 6.068648524300862e-05, + "loss": 2.8431, + "step": 23598 + }, + { + "epoch": 1.4649574771866658, + "grad_norm": 0.2121694142635547, + "learning_rate": 6.06829571430359e-05, + "loss": 2.7868, + "step": 23599 + }, + { + "epoch": 1.4650195542864237, + "grad_norm": 0.15110470563426984, + "learning_rate": 6.06794289873276e-05, + "loss": 2.8351, + "step": 23600 + }, + { + "epoch": 1.4650816313861816, + "grad_norm": 0.17200540342375217, + "learning_rate": 6.067590077590213e-05, + "loss": 2.8972, + "step": 23601 + }, + { + "epoch": 1.4651437084859396, + "grad_norm": 0.16311666918634363, + "learning_rate": 6.06723725087779e-05, + "loss": 2.8298, + "step": 23602 + }, + { + "epoch": 1.4652057855856975, + "grad_norm": 0.160284032087972, + "learning_rate": 6.0668844185973305e-05, + "loss": 2.7651, + "step": 23603 + }, + { + "epoch": 1.4652678626854554, + "grad_norm": 0.14522450664414793, + "learning_rate": 6.066531580750676e-05, + "loss": 2.8561, + "step": 23604 + }, + { + "epoch": 1.4653299397852133, + "grad_norm": 0.18204702014309956, + "learning_rate": 6.0661787373396674e-05, + "loss": 2.8436, + "step": 23605 + }, + { + "epoch": 1.4653920168849712, + "grad_norm": 0.15005663532735267, + "learning_rate": 6.065825888366146e-05, + "loss": 2.7404, + "step": 23606 + }, + { + "epoch": 1.4654540939847291, + "grad_norm": 0.2073040190966814, + "learning_rate": 6.0654730338319524e-05, + "loss": 2.7968, + "step": 23607 + }, + { + "epoch": 1.465516171084487, + "grad_norm": 0.15414262256014336, + "learning_rate": 6.065120173738924e-05, + "loss": 2.8245, + "step": 23608 + }, + { + "epoch": 1.4655782481842448, + "grad_norm": 0.14382315852444957, + "learning_rate": 6.0647673080889076e-05, + "loss": 2.9097, + "step": 23609 + }, + { + "epoch": 1.4656403252840027, + "grad_norm": 0.18136634422236014, + "learning_rate": 6.064414436883743e-05, + "loss": 2.8153, + "step": 23610 + }, + { + "epoch": 1.4657024023837606, + "grad_norm": 0.16480516189973468, + "learning_rate": 6.064061560125268e-05, + "loss": 2.8229, + "step": 23611 + }, + { + "epoch": 1.4657644794835185, + "grad_norm": 0.15902419774832582, + "learning_rate": 6.0637086778153276e-05, + "loss": 2.8486, + "step": 23612 + }, + { + "epoch": 1.4658265565832764, + "grad_norm": 0.15509473966290058, + "learning_rate": 6.063355789955758e-05, + "loss": 2.7806, + "step": 23613 + }, + { + "epoch": 1.4658886336830343, + "grad_norm": 0.1595736408703623, + "learning_rate": 6.063002896548405e-05, + "loss": 2.7481, + "step": 23614 + }, + { + "epoch": 1.4659507107827923, + "grad_norm": 0.17401068990751217, + "learning_rate": 6.062649997595107e-05, + "loss": 2.8822, + "step": 23615 + }, + { + "epoch": 1.4660127878825502, + "grad_norm": 0.1655428493742993, + "learning_rate": 6.062297093097706e-05, + "loss": 2.7242, + "step": 23616 + }, + { + "epoch": 1.466074864982308, + "grad_norm": 0.15725639012942505, + "learning_rate": 6.061944183058044e-05, + "loss": 2.8262, + "step": 23617 + }, + { + "epoch": 1.4661369420820658, + "grad_norm": 0.14755467094695576, + "learning_rate": 6.0615912674779605e-05, + "loss": 2.816, + "step": 23618 + }, + { + "epoch": 1.4661990191818237, + "grad_norm": 0.1885822671746197, + "learning_rate": 6.061238346359298e-05, + "loss": 2.793, + "step": 23619 + }, + { + "epoch": 1.4662610962815816, + "grad_norm": 0.15562298255401735, + "learning_rate": 6.0608854197038975e-05, + "loss": 2.7405, + "step": 23620 + }, + { + "epoch": 1.4663231733813396, + "grad_norm": 0.14754817605245357, + "learning_rate": 6.0605324875136016e-05, + "loss": 2.8681, + "step": 23621 + }, + { + "epoch": 1.4663852504810975, + "grad_norm": 0.1548843246617045, + "learning_rate": 6.060179549790248e-05, + "loss": 2.7084, + "step": 23622 + }, + { + "epoch": 1.4664473275808554, + "grad_norm": 0.161616570216374, + "learning_rate": 6.059826606535681e-05, + "loss": 2.7894, + "step": 23623 + }, + { + "epoch": 1.4665094046806133, + "grad_norm": 0.1611136685183039, + "learning_rate": 6.059473657751742e-05, + "loss": 2.822, + "step": 23624 + }, + { + "epoch": 1.4665714817803712, + "grad_norm": 0.15743881612449448, + "learning_rate": 6.059120703440271e-05, + "loss": 2.8917, + "step": 23625 + }, + { + "epoch": 1.4666335588801291, + "grad_norm": 0.17408479535777976, + "learning_rate": 6.058767743603109e-05, + "loss": 2.8502, + "step": 23626 + }, + { + "epoch": 1.466695635979887, + "grad_norm": 0.22621543379175077, + "learning_rate": 6.0584147782421006e-05, + "loss": 2.8947, + "step": 23627 + }, + { + "epoch": 1.466757713079645, + "grad_norm": 0.16192779367752644, + "learning_rate": 6.0580618073590856e-05, + "loss": 2.7978, + "step": 23628 + }, + { + "epoch": 1.466819790179403, + "grad_norm": 0.16777670771616876, + "learning_rate": 6.057708830955904e-05, + "loss": 3.0026, + "step": 23629 + }, + { + "epoch": 1.4668818672791608, + "grad_norm": 0.15980586242430356, + "learning_rate": 6.0573558490343996e-05, + "loss": 2.843, + "step": 23630 + }, + { + "epoch": 1.4669439443789187, + "grad_norm": 0.16161629937658556, + "learning_rate": 6.057002861596412e-05, + "loss": 2.7688, + "step": 23631 + }, + { + "epoch": 1.4670060214786766, + "grad_norm": 0.1759795500896512, + "learning_rate": 6.056649868643786e-05, + "loss": 2.7343, + "step": 23632 + }, + { + "epoch": 1.4670680985784343, + "grad_norm": 0.16985583392219072, + "learning_rate": 6.0562968701783583e-05, + "loss": 2.7259, + "step": 23633 + }, + { + "epoch": 1.4671301756781923, + "grad_norm": 0.16034702243487764, + "learning_rate": 6.0559438662019754e-05, + "loss": 2.8431, + "step": 23634 + }, + { + "epoch": 1.4671922527779502, + "grad_norm": 0.15910080552477154, + "learning_rate": 6.055590856716475e-05, + "loss": 2.8242, + "step": 23635 + }, + { + "epoch": 1.467254329877708, + "grad_norm": 0.1626574317640016, + "learning_rate": 6.055237841723703e-05, + "loss": 2.8132, + "step": 23636 + }, + { + "epoch": 1.467316406977466, + "grad_norm": 0.16347412918813872, + "learning_rate": 6.054884821225498e-05, + "loss": 2.8169, + "step": 23637 + }, + { + "epoch": 1.467378484077224, + "grad_norm": 0.16266591125840363, + "learning_rate": 6.054531795223703e-05, + "loss": 2.9278, + "step": 23638 + }, + { + "epoch": 1.4674405611769819, + "grad_norm": 0.1569463590546893, + "learning_rate": 6.05417876372016e-05, + "loss": 2.7835, + "step": 23639 + }, + { + "epoch": 1.4675026382767398, + "grad_norm": 0.17358744493085176, + "learning_rate": 6.05382572671671e-05, + "loss": 2.768, + "step": 23640 + }, + { + "epoch": 1.4675647153764977, + "grad_norm": 0.15778964368147763, + "learning_rate": 6.0534726842151957e-05, + "loss": 2.7768, + "step": 23641 + }, + { + "epoch": 1.4676267924762554, + "grad_norm": 0.16092218446539763, + "learning_rate": 6.053119636217457e-05, + "loss": 2.8071, + "step": 23642 + }, + { + "epoch": 1.4676888695760133, + "grad_norm": 0.1564382253844162, + "learning_rate": 6.05276658272534e-05, + "loss": 2.8454, + "step": 23643 + }, + { + "epoch": 1.4677509466757712, + "grad_norm": 0.15603757365778967, + "learning_rate": 6.052413523740682e-05, + "loss": 2.8449, + "step": 23644 + }, + { + "epoch": 1.4678130237755291, + "grad_norm": 0.1535172442964759, + "learning_rate": 6.0520604592653295e-05, + "loss": 2.8733, + "step": 23645 + }, + { + "epoch": 1.467875100875287, + "grad_norm": 0.14885096312181825, + "learning_rate": 6.0517073893011213e-05, + "loss": 2.9018, + "step": 23646 + }, + { + "epoch": 1.467937177975045, + "grad_norm": 0.18397943081054052, + "learning_rate": 6.0513543138498994e-05, + "loss": 2.819, + "step": 23647 + }, + { + "epoch": 1.467999255074803, + "grad_norm": 0.14342562394645594, + "learning_rate": 6.0510012329135077e-05, + "loss": 2.7774, + "step": 23648 + }, + { + "epoch": 1.4680613321745608, + "grad_norm": 0.15263937119847512, + "learning_rate": 6.050648146493787e-05, + "loss": 2.8425, + "step": 23649 + }, + { + "epoch": 1.4681234092743187, + "grad_norm": 0.16446763493368108, + "learning_rate": 6.050295054592581e-05, + "loss": 2.8045, + "step": 23650 + }, + { + "epoch": 1.4681854863740766, + "grad_norm": 0.1578234071550428, + "learning_rate": 6.049941957211729e-05, + "loss": 2.8238, + "step": 23651 + }, + { + "epoch": 1.4682475634738346, + "grad_norm": 0.14756394256046634, + "learning_rate": 6.0495888543530754e-05, + "loss": 2.8659, + "step": 23652 + }, + { + "epoch": 1.4683096405735925, + "grad_norm": 0.16568026610260062, + "learning_rate": 6.0492357460184624e-05, + "loss": 2.7294, + "step": 23653 + }, + { + "epoch": 1.4683717176733504, + "grad_norm": 0.14945137580250759, + "learning_rate": 6.0488826322097316e-05, + "loss": 2.8518, + "step": 23654 + }, + { + "epoch": 1.4684337947731083, + "grad_norm": 0.22284594201933117, + "learning_rate": 6.0485295129287255e-05, + "loss": 2.8139, + "step": 23655 + }, + { + "epoch": 1.4684958718728662, + "grad_norm": 0.1521173347443828, + "learning_rate": 6.048176388177287e-05, + "loss": 2.8183, + "step": 23656 + }, + { + "epoch": 1.468557948972624, + "grad_norm": 0.19414670155646235, + "learning_rate": 6.047823257957257e-05, + "loss": 2.8102, + "step": 23657 + }, + { + "epoch": 1.4686200260723818, + "grad_norm": 0.17514582863294895, + "learning_rate": 6.047470122270478e-05, + "loss": 2.8289, + "step": 23658 + }, + { + "epoch": 1.4686821031721398, + "grad_norm": 0.15161029612609211, + "learning_rate": 6.047116981118794e-05, + "loss": 2.7504, + "step": 23659 + }, + { + "epoch": 1.4687441802718977, + "grad_norm": 0.18664748376033655, + "learning_rate": 6.0467638345040464e-05, + "loss": 2.8241, + "step": 23660 + }, + { + "epoch": 1.4688062573716556, + "grad_norm": 0.15692054395178567, + "learning_rate": 6.046410682428079e-05, + "loss": 2.8657, + "step": 23661 + }, + { + "epoch": 1.4688683344714135, + "grad_norm": 0.17883160170695614, + "learning_rate": 6.046057524892731e-05, + "loss": 2.7983, + "step": 23662 + }, + { + "epoch": 1.4689304115711714, + "grad_norm": 0.18045236711328885, + "learning_rate": 6.0457043618998476e-05, + "loss": 2.7602, + "step": 23663 + }, + { + "epoch": 1.4689924886709294, + "grad_norm": 0.17555007037831197, + "learning_rate": 6.045351193451271e-05, + "loss": 2.7946, + "step": 23664 + }, + { + "epoch": 1.4690545657706873, + "grad_norm": 0.19298789435182737, + "learning_rate": 6.0449980195488446e-05, + "loss": 2.8163, + "step": 23665 + }, + { + "epoch": 1.469116642870445, + "grad_norm": 0.16400576927632418, + "learning_rate": 6.044644840194409e-05, + "loss": 2.8099, + "step": 23666 + }, + { + "epoch": 1.4691787199702029, + "grad_norm": 0.19536016836205894, + "learning_rate": 6.044291655389807e-05, + "loss": 2.8851, + "step": 23667 + }, + { + "epoch": 1.4692407970699608, + "grad_norm": 0.1618368201539031, + "learning_rate": 6.043938465136882e-05, + "loss": 2.865, + "step": 23668 + }, + { + "epoch": 1.4693028741697187, + "grad_norm": 0.14677303483009993, + "learning_rate": 6.0435852694374765e-05, + "loss": 2.9029, + "step": 23669 + }, + { + "epoch": 1.4693649512694766, + "grad_norm": 0.16657196958258036, + "learning_rate": 6.043232068293434e-05, + "loss": 2.848, + "step": 23670 + }, + { + "epoch": 1.4694270283692346, + "grad_norm": 0.14355672458578705, + "learning_rate": 6.0428788617065955e-05, + "loss": 2.7297, + "step": 23671 + }, + { + "epoch": 1.4694891054689925, + "grad_norm": 0.1487364707031304, + "learning_rate": 6.042525649678806e-05, + "loss": 2.886, + "step": 23672 + }, + { + "epoch": 1.4695511825687504, + "grad_norm": 0.15367910595992734, + "learning_rate": 6.0421724322119064e-05, + "loss": 2.8146, + "step": 23673 + }, + { + "epoch": 1.4696132596685083, + "grad_norm": 0.15561501102293832, + "learning_rate": 6.0418192093077396e-05, + "loss": 2.8371, + "step": 23674 + }, + { + "epoch": 1.4696753367682662, + "grad_norm": 0.15957644811481467, + "learning_rate": 6.041465980968151e-05, + "loss": 2.8403, + "step": 23675 + }, + { + "epoch": 1.4697374138680241, + "grad_norm": 0.15714389096557074, + "learning_rate": 6.04111274719498e-05, + "loss": 2.7427, + "step": 23676 + }, + { + "epoch": 1.469799490967782, + "grad_norm": 0.16236564762993355, + "learning_rate": 6.040759507990073e-05, + "loss": 2.8094, + "step": 23677 + }, + { + "epoch": 1.46986156806754, + "grad_norm": 0.1664183668193574, + "learning_rate": 6.040406263355268e-05, + "loss": 2.8605, + "step": 23678 + }, + { + "epoch": 1.469923645167298, + "grad_norm": 0.14844115152385087, + "learning_rate": 6.0400530132924136e-05, + "loss": 2.775, + "step": 23679 + }, + { + "epoch": 1.4699857222670558, + "grad_norm": 0.16378407467781106, + "learning_rate": 6.0396997578033474e-05, + "loss": 2.8169, + "step": 23680 + }, + { + "epoch": 1.4700477993668135, + "grad_norm": 0.15297760900770002, + "learning_rate": 6.039346496889918e-05, + "loss": 2.755, + "step": 23681 + }, + { + "epoch": 1.4701098764665714, + "grad_norm": 0.14103358569113986, + "learning_rate": 6.038993230553964e-05, + "loss": 2.7084, + "step": 23682 + }, + { + "epoch": 1.4701719535663293, + "grad_norm": 0.13595604002790854, + "learning_rate": 6.0386399587973306e-05, + "loss": 2.7534, + "step": 23683 + }, + { + "epoch": 1.4702340306660873, + "grad_norm": 0.15025346247901178, + "learning_rate": 6.038286681621862e-05, + "loss": 2.8058, + "step": 23684 + }, + { + "epoch": 1.4702961077658452, + "grad_norm": 0.1504292190869373, + "learning_rate": 6.037933399029397e-05, + "loss": 2.6723, + "step": 23685 + }, + { + "epoch": 1.470358184865603, + "grad_norm": 0.14888453928735793, + "learning_rate": 6.037580111021783e-05, + "loss": 2.8352, + "step": 23686 + }, + { + "epoch": 1.470420261965361, + "grad_norm": 0.14374654625807134, + "learning_rate": 6.0372268176008605e-05, + "loss": 2.8336, + "step": 23687 + }, + { + "epoch": 1.470482339065119, + "grad_norm": 0.16730512331723904, + "learning_rate": 6.036873518768475e-05, + "loss": 2.8028, + "step": 23688 + }, + { + "epoch": 1.4705444161648769, + "grad_norm": 0.14793271894952523, + "learning_rate": 6.036520214526469e-05, + "loss": 2.7913, + "step": 23689 + }, + { + "epoch": 1.4706064932646346, + "grad_norm": 0.14790388237303218, + "learning_rate": 6.0361669048766835e-05, + "loss": 2.7832, + "step": 23690 + }, + { + "epoch": 1.4706685703643925, + "grad_norm": 0.15333402778850042, + "learning_rate": 6.035813589820966e-05, + "loss": 2.7626, + "step": 23691 + }, + { + "epoch": 1.4707306474641504, + "grad_norm": 0.16500446137443972, + "learning_rate": 6.035460269361155e-05, + "loss": 2.7998, + "step": 23692 + }, + { + "epoch": 1.4707927245639083, + "grad_norm": 0.1512772262024719, + "learning_rate": 6.035106943499098e-05, + "loss": 2.77, + "step": 23693 + }, + { + "epoch": 1.4708548016636662, + "grad_norm": 0.22594002186751463, + "learning_rate": 6.034753612236637e-05, + "loss": 2.8194, + "step": 23694 + }, + { + "epoch": 1.4709168787634241, + "grad_norm": 0.18001353704690068, + "learning_rate": 6.034400275575616e-05, + "loss": 2.8597, + "step": 23695 + }, + { + "epoch": 1.470978955863182, + "grad_norm": 0.21573272066828242, + "learning_rate": 6.034046933517875e-05, + "loss": 2.8562, + "step": 23696 + }, + { + "epoch": 1.47104103296294, + "grad_norm": 0.16156803078287896, + "learning_rate": 6.0336935860652624e-05, + "loss": 2.8032, + "step": 23697 + }, + { + "epoch": 1.471103110062698, + "grad_norm": 0.15889804591040568, + "learning_rate": 6.033340233219618e-05, + "loss": 2.8988, + "step": 23698 + }, + { + "epoch": 1.4711651871624558, + "grad_norm": 0.1676737945741101, + "learning_rate": 6.032986874982788e-05, + "loss": 2.8898, + "step": 23699 + }, + { + "epoch": 1.4712272642622137, + "grad_norm": 0.17221615809859417, + "learning_rate": 6.032633511356613e-05, + "loss": 2.7733, + "step": 23700 + }, + { + "epoch": 1.4712893413619716, + "grad_norm": 0.16270708291110167, + "learning_rate": 6.0322801423429406e-05, + "loss": 2.7891, + "step": 23701 + }, + { + "epoch": 1.4713514184617296, + "grad_norm": 0.21738737466554187, + "learning_rate": 6.031926767943612e-05, + "loss": 2.8055, + "step": 23702 + }, + { + "epoch": 1.4714134955614875, + "grad_norm": 0.16149928294050686, + "learning_rate": 6.0315733881604685e-05, + "loss": 2.7582, + "step": 23703 + }, + { + "epoch": 1.4714755726612454, + "grad_norm": 0.22384200660573053, + "learning_rate": 6.031220002995358e-05, + "loss": 2.8652, + "step": 23704 + }, + { + "epoch": 1.471537649761003, + "grad_norm": 0.161763269866215, + "learning_rate": 6.030866612450122e-05, + "loss": 2.7938, + "step": 23705 + }, + { + "epoch": 1.471599726860761, + "grad_norm": 0.16606407580491236, + "learning_rate": 6.0305132165266056e-05, + "loss": 2.8246, + "step": 23706 + }, + { + "epoch": 1.471661803960519, + "grad_norm": 0.16507286086853526, + "learning_rate": 6.030159815226648e-05, + "loss": 2.8538, + "step": 23707 + }, + { + "epoch": 1.4717238810602769, + "grad_norm": 0.16471853110455012, + "learning_rate": 6.0298064085521e-05, + "loss": 2.794, + "step": 23708 + }, + { + "epoch": 1.4717859581600348, + "grad_norm": 0.14831908421664627, + "learning_rate": 6.029452996504802e-05, + "loss": 2.7832, + "step": 23709 + }, + { + "epoch": 1.4718480352597927, + "grad_norm": 0.15153950757699147, + "learning_rate": 6.0290995790865964e-05, + "loss": 2.8254, + "step": 23710 + }, + { + "epoch": 1.4719101123595506, + "grad_norm": 0.19074075186222555, + "learning_rate": 6.02874615629933e-05, + "loss": 2.8304, + "step": 23711 + }, + { + "epoch": 1.4719721894593085, + "grad_norm": 0.15635529784780947, + "learning_rate": 6.028392728144843e-05, + "loss": 2.7979, + "step": 23712 + }, + { + "epoch": 1.4720342665590664, + "grad_norm": 0.16226953614709377, + "learning_rate": 6.028039294624983e-05, + "loss": 2.826, + "step": 23713 + }, + { + "epoch": 1.4720963436588241, + "grad_norm": 0.218389674150589, + "learning_rate": 6.0276858557415914e-05, + "loss": 2.8503, + "step": 23714 + }, + { + "epoch": 1.472158420758582, + "grad_norm": 0.1567324224309921, + "learning_rate": 6.027332411496515e-05, + "loss": 2.8371, + "step": 23715 + }, + { + "epoch": 1.47222049785834, + "grad_norm": 0.16307581795916115, + "learning_rate": 6.0269789618915936e-05, + "loss": 2.8904, + "step": 23716 + }, + { + "epoch": 1.472282574958098, + "grad_norm": 0.22447042728473474, + "learning_rate": 6.026625506928676e-05, + "loss": 2.8723, + "step": 23717 + }, + { + "epoch": 1.4723446520578558, + "grad_norm": 0.1524128573735328, + "learning_rate": 6.026272046609602e-05, + "loss": 2.759, + "step": 23718 + }, + { + "epoch": 1.4724067291576137, + "grad_norm": 0.15596669859235487, + "learning_rate": 6.025918580936219e-05, + "loss": 2.8316, + "step": 23719 + }, + { + "epoch": 1.4724688062573716, + "grad_norm": 0.22297791619229249, + "learning_rate": 6.025565109910371e-05, + "loss": 2.8309, + "step": 23720 + }, + { + "epoch": 1.4725308833571296, + "grad_norm": 0.15196082701200253, + "learning_rate": 6.0252116335338984e-05, + "loss": 2.8888, + "step": 23721 + }, + { + "epoch": 1.4725929604568875, + "grad_norm": 0.1951627252419233, + "learning_rate": 6.0248581518086475e-05, + "loss": 2.7802, + "step": 23722 + }, + { + "epoch": 1.4726550375566454, + "grad_norm": 0.15206020205988247, + "learning_rate": 6.024504664736463e-05, + "loss": 2.8705, + "step": 23723 + }, + { + "epoch": 1.4727171146564033, + "grad_norm": 0.1755986483834854, + "learning_rate": 6.02415117231919e-05, + "loss": 2.8483, + "step": 23724 + }, + { + "epoch": 1.4727791917561612, + "grad_norm": 0.15092644667778016, + "learning_rate": 6.023797674558671e-05, + "loss": 2.7619, + "step": 23725 + }, + { + "epoch": 1.4728412688559192, + "grad_norm": 0.15542964155872832, + "learning_rate": 6.0234441714567524e-05, + "loss": 2.7441, + "step": 23726 + }, + { + "epoch": 1.472903345955677, + "grad_norm": 0.1566283532755619, + "learning_rate": 6.023090663015276e-05, + "loss": 2.8211, + "step": 23727 + }, + { + "epoch": 1.472965423055435, + "grad_norm": 0.15350648789720345, + "learning_rate": 6.022737149236087e-05, + "loss": 2.8241, + "step": 23728 + }, + { + "epoch": 1.4730275001551927, + "grad_norm": 0.14600445650064736, + "learning_rate": 6.022383630121031e-05, + "loss": 2.8403, + "step": 23729 + }, + { + "epoch": 1.4730895772549506, + "grad_norm": 0.15723724210494938, + "learning_rate": 6.022030105671951e-05, + "loss": 2.7747, + "step": 23730 + }, + { + "epoch": 1.4731516543547085, + "grad_norm": 0.19081799945971695, + "learning_rate": 6.021676575890692e-05, + "loss": 2.8488, + "step": 23731 + }, + { + "epoch": 1.4732137314544664, + "grad_norm": 0.15511169646743064, + "learning_rate": 6.021323040779098e-05, + "loss": 2.8761, + "step": 23732 + }, + { + "epoch": 1.4732758085542244, + "grad_norm": 0.1474100618829902, + "learning_rate": 6.020969500339013e-05, + "loss": 2.743, + "step": 23733 + }, + { + "epoch": 1.4733378856539823, + "grad_norm": 0.18051025441806137, + "learning_rate": 6.020615954572283e-05, + "loss": 2.8256, + "step": 23734 + }, + { + "epoch": 1.4733999627537402, + "grad_norm": 0.15219572407318419, + "learning_rate": 6.020262403480752e-05, + "loss": 2.9228, + "step": 23735 + }, + { + "epoch": 1.473462039853498, + "grad_norm": 0.1630549429947741, + "learning_rate": 6.0199088470662644e-05, + "loss": 2.886, + "step": 23736 + }, + { + "epoch": 1.473524116953256, + "grad_norm": 0.15460936976659764, + "learning_rate": 6.019555285330665e-05, + "loss": 2.7553, + "step": 23737 + }, + { + "epoch": 1.4735861940530137, + "grad_norm": 0.19922478049631528, + "learning_rate": 6.019201718275798e-05, + "loss": 2.913, + "step": 23738 + }, + { + "epoch": 1.4736482711527716, + "grad_norm": 0.1523345614376567, + "learning_rate": 6.018848145903506e-05, + "loss": 2.7327, + "step": 23739 + }, + { + "epoch": 1.4737103482525296, + "grad_norm": 0.16759203095081712, + "learning_rate": 6.018494568215639e-05, + "loss": 2.8728, + "step": 23740 + }, + { + "epoch": 1.4737724253522875, + "grad_norm": 0.15864288175065985, + "learning_rate": 6.018140985214037e-05, + "loss": 2.8543, + "step": 23741 + }, + { + "epoch": 1.4738345024520454, + "grad_norm": 0.2162477997712016, + "learning_rate": 6.017787396900547e-05, + "loss": 2.8631, + "step": 23742 + }, + { + "epoch": 1.4738965795518033, + "grad_norm": 0.18158013351743713, + "learning_rate": 6.0174338032770114e-05, + "loss": 2.8893, + "step": 23743 + }, + { + "epoch": 1.4739586566515612, + "grad_norm": 0.15895955245794988, + "learning_rate": 6.017080204345279e-05, + "loss": 2.8187, + "step": 23744 + }, + { + "epoch": 1.4740207337513191, + "grad_norm": 0.16104055711377144, + "learning_rate": 6.016726600107192e-05, + "loss": 2.769, + "step": 23745 + }, + { + "epoch": 1.474082810851077, + "grad_norm": 0.16774251584080468, + "learning_rate": 6.016372990564594e-05, + "loss": 2.7477, + "step": 23746 + }, + { + "epoch": 1.474144887950835, + "grad_norm": 0.16856448851166717, + "learning_rate": 6.016019375719333e-05, + "loss": 2.826, + "step": 23747 + }, + { + "epoch": 1.474206965050593, + "grad_norm": 0.14529971789470822, + "learning_rate": 6.015665755573251e-05, + "loss": 2.8485, + "step": 23748 + }, + { + "epoch": 1.4742690421503508, + "grad_norm": 0.1734318717266702, + "learning_rate": 6.015312130128196e-05, + "loss": 2.7124, + "step": 23749 + }, + { + "epoch": 1.4743311192501087, + "grad_norm": 0.15607937198629213, + "learning_rate": 6.0149584993860085e-05, + "loss": 2.8029, + "step": 23750 + }, + { + "epoch": 1.4743931963498667, + "grad_norm": 0.1633867071538744, + "learning_rate": 6.014604863348537e-05, + "loss": 2.8119, + "step": 23751 + }, + { + "epoch": 1.4744552734496246, + "grad_norm": 0.1401403074436373, + "learning_rate": 6.014251222017626e-05, + "loss": 2.7456, + "step": 23752 + }, + { + "epoch": 1.4745173505493823, + "grad_norm": 0.16675275513912205, + "learning_rate": 6.013897575395121e-05, + "loss": 2.8341, + "step": 23753 + }, + { + "epoch": 1.4745794276491402, + "grad_norm": 0.1534445010625499, + "learning_rate": 6.013543923482865e-05, + "loss": 2.8846, + "step": 23754 + }, + { + "epoch": 1.474641504748898, + "grad_norm": 0.1628834672539887, + "learning_rate": 6.013190266282706e-05, + "loss": 2.734, + "step": 23755 + }, + { + "epoch": 1.474703581848656, + "grad_norm": 0.15402659755196238, + "learning_rate": 6.012836603796484e-05, + "loss": 2.8069, + "step": 23756 + }, + { + "epoch": 1.474765658948414, + "grad_norm": 0.157316105382977, + "learning_rate": 6.0124829360260504e-05, + "loss": 2.7257, + "step": 23757 + }, + { + "epoch": 1.4748277360481719, + "grad_norm": 0.16818955789750847, + "learning_rate": 6.0121292629732474e-05, + "loss": 2.8525, + "step": 23758 + }, + { + "epoch": 1.4748898131479298, + "grad_norm": 0.1578087954983401, + "learning_rate": 6.011775584639918e-05, + "loss": 2.7498, + "step": 23759 + }, + { + "epoch": 1.4749518902476877, + "grad_norm": 0.14589092843010995, + "learning_rate": 6.0114219010279124e-05, + "loss": 2.7749, + "step": 23760 + }, + { + "epoch": 1.4750139673474456, + "grad_norm": 0.16019645046824316, + "learning_rate": 6.011068212139073e-05, + "loss": 2.7827, + "step": 23761 + }, + { + "epoch": 1.4750760444472033, + "grad_norm": 0.14941038488711506, + "learning_rate": 6.010714517975244e-05, + "loss": 2.7629, + "step": 23762 + }, + { + "epoch": 1.4751381215469612, + "grad_norm": 0.18418125746574646, + "learning_rate": 6.010360818538273e-05, + "loss": 2.7992, + "step": 23763 + }, + { + "epoch": 1.4752001986467191, + "grad_norm": 0.1501878448832406, + "learning_rate": 6.010007113830004e-05, + "loss": 2.6503, + "step": 23764 + }, + { + "epoch": 1.475262275746477, + "grad_norm": 0.1615569111135901, + "learning_rate": 6.009653403852283e-05, + "loss": 2.8925, + "step": 23765 + }, + { + "epoch": 1.475324352846235, + "grad_norm": 0.15886860857389654, + "learning_rate": 6.009299688606954e-05, + "loss": 2.7252, + "step": 23766 + }, + { + "epoch": 1.475386429945993, + "grad_norm": 0.2107432141778383, + "learning_rate": 6.008945968095865e-05, + "loss": 2.8263, + "step": 23767 + }, + { + "epoch": 1.4754485070457508, + "grad_norm": 0.160094923978273, + "learning_rate": 6.008592242320859e-05, + "loss": 2.7804, + "step": 23768 + }, + { + "epoch": 1.4755105841455087, + "grad_norm": 0.16222468262319667, + "learning_rate": 6.008238511283784e-05, + "loss": 2.8283, + "step": 23769 + }, + { + "epoch": 1.4755726612452666, + "grad_norm": 0.16962615402351985, + "learning_rate": 6.007884774986481e-05, + "loss": 2.7741, + "step": 23770 + }, + { + "epoch": 1.4756347383450246, + "grad_norm": 0.14674396401831694, + "learning_rate": 6.007531033430801e-05, + "loss": 2.7572, + "step": 23771 + }, + { + "epoch": 1.4756968154447825, + "grad_norm": 0.16048582405919523, + "learning_rate": 6.0071772866185846e-05, + "loss": 2.8253, + "step": 23772 + }, + { + "epoch": 1.4757588925445404, + "grad_norm": 0.16400382150618534, + "learning_rate": 6.006823534551681e-05, + "loss": 2.7803, + "step": 23773 + }, + { + "epoch": 1.4758209696442983, + "grad_norm": 0.1661576392922999, + "learning_rate": 6.0064697772319355e-05, + "loss": 2.7645, + "step": 23774 + }, + { + "epoch": 1.4758830467440562, + "grad_norm": 0.17319432450039834, + "learning_rate": 6.0061160146611914e-05, + "loss": 2.8327, + "step": 23775 + }, + { + "epoch": 1.4759451238438142, + "grad_norm": 0.1392014559312706, + "learning_rate": 6.005762246841298e-05, + "loss": 2.8459, + "step": 23776 + }, + { + "epoch": 1.4760072009435719, + "grad_norm": 0.14115655624883408, + "learning_rate": 6.0054084737740965e-05, + "loss": 2.8066, + "step": 23777 + }, + { + "epoch": 1.4760692780433298, + "grad_norm": 0.1438514600200368, + "learning_rate": 6.005054695461436e-05, + "loss": 2.8556, + "step": 23778 + }, + { + "epoch": 1.4761313551430877, + "grad_norm": 0.1678604675178586, + "learning_rate": 6.00470091190516e-05, + "loss": 2.7976, + "step": 23779 + }, + { + "epoch": 1.4761934322428456, + "grad_norm": 0.17203125028584243, + "learning_rate": 6.004347123107117e-05, + "loss": 2.8683, + "step": 23780 + }, + { + "epoch": 1.4762555093426035, + "grad_norm": 0.1621046065582862, + "learning_rate": 6.003993329069152e-05, + "loss": 2.8686, + "step": 23781 + }, + { + "epoch": 1.4763175864423614, + "grad_norm": 0.1647997254978424, + "learning_rate": 6.003639529793108e-05, + "loss": 2.7669, + "step": 23782 + }, + { + "epoch": 1.4763796635421194, + "grad_norm": 0.174252949980102, + "learning_rate": 6.003285725280834e-05, + "loss": 2.887, + "step": 23783 + }, + { + "epoch": 1.4764417406418773, + "grad_norm": 0.15146042198155993, + "learning_rate": 6.002931915534173e-05, + "loss": 2.7522, + "step": 23784 + }, + { + "epoch": 1.4765038177416352, + "grad_norm": 0.16943389606723694, + "learning_rate": 6.0025781005549765e-05, + "loss": 2.7833, + "step": 23785 + }, + { + "epoch": 1.476565894841393, + "grad_norm": 0.15773252264090531, + "learning_rate": 6.0022242803450836e-05, + "loss": 2.8037, + "step": 23786 + }, + { + "epoch": 1.4766279719411508, + "grad_norm": 0.1907401952379731, + "learning_rate": 6.0018704549063446e-05, + "loss": 2.696, + "step": 23787 + }, + { + "epoch": 1.4766900490409087, + "grad_norm": 0.14949765080022392, + "learning_rate": 6.001516624240603e-05, + "loss": 2.7646, + "step": 23788 + }, + { + "epoch": 1.4767521261406666, + "grad_norm": 0.18693373525763812, + "learning_rate": 6.001162788349706e-05, + "loss": 2.8615, + "step": 23789 + }, + { + "epoch": 1.4768142032404246, + "grad_norm": 0.15307316632643628, + "learning_rate": 6.000808947235501e-05, + "loss": 2.7352, + "step": 23790 + }, + { + "epoch": 1.4768762803401825, + "grad_norm": 0.1551681377432152, + "learning_rate": 6.000455100899831e-05, + "loss": 2.8171, + "step": 23791 + }, + { + "epoch": 1.4769383574399404, + "grad_norm": 0.16269309304544982, + "learning_rate": 6.0001012493445464e-05, + "loss": 2.7584, + "step": 23792 + }, + { + "epoch": 1.4770004345396983, + "grad_norm": 0.15149819027296638, + "learning_rate": 5.9997473925714884e-05, + "loss": 2.8789, + "step": 23793 + }, + { + "epoch": 1.4770625116394562, + "grad_norm": 0.15885082746862964, + "learning_rate": 5.999393530582508e-05, + "loss": 2.8635, + "step": 23794 + }, + { + "epoch": 1.4771245887392142, + "grad_norm": 0.17583322286508202, + "learning_rate": 5.9990396633794466e-05, + "loss": 2.8678, + "step": 23795 + }, + { + "epoch": 1.477186665838972, + "grad_norm": 0.21118797336851583, + "learning_rate": 5.9986857909641546e-05, + "loss": 2.7487, + "step": 23796 + }, + { + "epoch": 1.47724874293873, + "grad_norm": 0.16838480188729688, + "learning_rate": 5.998331913338475e-05, + "loss": 2.8746, + "step": 23797 + }, + { + "epoch": 1.477310820038488, + "grad_norm": 0.19192358093137102, + "learning_rate": 5.997978030504257e-05, + "loss": 2.8451, + "step": 23798 + }, + { + "epoch": 1.4773728971382458, + "grad_norm": 0.15121172027464116, + "learning_rate": 5.997624142463344e-05, + "loss": 2.8794, + "step": 23799 + }, + { + "epoch": 1.4774349742380035, + "grad_norm": 0.1616670767839955, + "learning_rate": 5.9972702492175834e-05, + "loss": 2.8748, + "step": 23800 + }, + { + "epoch": 1.4774970513377614, + "grad_norm": 0.16567637618376763, + "learning_rate": 5.996916350768823e-05, + "loss": 2.7863, + "step": 23801 + }, + { + "epoch": 1.4775591284375194, + "grad_norm": 0.1570773571799711, + "learning_rate": 5.996562447118907e-05, + "loss": 2.8362, + "step": 23802 + }, + { + "epoch": 1.4776212055372773, + "grad_norm": 0.1819943705843577, + "learning_rate": 5.996208538269684e-05, + "loss": 2.8058, + "step": 23803 + }, + { + "epoch": 1.4776832826370352, + "grad_norm": 0.16445065743414103, + "learning_rate": 5.9958546242229974e-05, + "loss": 2.7986, + "step": 23804 + }, + { + "epoch": 1.477745359736793, + "grad_norm": 0.16979741328603354, + "learning_rate": 5.9955007049806964e-05, + "loss": 2.9293, + "step": 23805 + }, + { + "epoch": 1.477807436836551, + "grad_norm": 0.1438869730767345, + "learning_rate": 5.995146780544627e-05, + "loss": 2.8419, + "step": 23806 + }, + { + "epoch": 1.477869513936309, + "grad_norm": 0.14991228399961737, + "learning_rate": 5.994792850916635e-05, + "loss": 2.899, + "step": 23807 + }, + { + "epoch": 1.4779315910360669, + "grad_norm": 0.14850244857956094, + "learning_rate": 5.994438916098567e-05, + "loss": 2.724, + "step": 23808 + }, + { + "epoch": 1.4779936681358246, + "grad_norm": 0.1592395120079776, + "learning_rate": 5.994084976092269e-05, + "loss": 2.8458, + "step": 23809 + }, + { + "epoch": 1.4780557452355825, + "grad_norm": 0.16257002885045702, + "learning_rate": 5.99373103089959e-05, + "loss": 2.918, + "step": 23810 + }, + { + "epoch": 1.4781178223353404, + "grad_norm": 0.14512167008511856, + "learning_rate": 5.993377080522374e-05, + "loss": 2.8153, + "step": 23811 + }, + { + "epoch": 1.4781798994350983, + "grad_norm": 0.15403089051060548, + "learning_rate": 5.993023124962469e-05, + "loss": 2.7804, + "step": 23812 + }, + { + "epoch": 1.4782419765348562, + "grad_norm": 0.16767385567218734, + "learning_rate": 5.99266916422172e-05, + "loss": 2.8425, + "step": 23813 + }, + { + "epoch": 1.4783040536346141, + "grad_norm": 0.1356930472256096, + "learning_rate": 5.992315198301978e-05, + "loss": 2.8464, + "step": 23814 + }, + { + "epoch": 1.478366130734372, + "grad_norm": 0.17626722233916467, + "learning_rate": 5.991961227205084e-05, + "loss": 2.7914, + "step": 23815 + }, + { + "epoch": 1.47842820783413, + "grad_norm": 0.15214856188160583, + "learning_rate": 5.991607250932889e-05, + "loss": 2.7152, + "step": 23816 + }, + { + "epoch": 1.478490284933888, + "grad_norm": 0.13899086141004632, + "learning_rate": 5.991253269487238e-05, + "loss": 2.8109, + "step": 23817 + }, + { + "epoch": 1.4785523620336458, + "grad_norm": 0.14619196634943404, + "learning_rate": 5.990899282869977e-05, + "loss": 2.7866, + "step": 23818 + }, + { + "epoch": 1.4786144391334037, + "grad_norm": 0.15107786153237157, + "learning_rate": 5.990545291082955e-05, + "loss": 2.7461, + "step": 23819 + }, + { + "epoch": 1.4786765162331617, + "grad_norm": 0.14652705524466314, + "learning_rate": 5.990191294128017e-05, + "loss": 2.8379, + "step": 23820 + }, + { + "epoch": 1.4787385933329196, + "grad_norm": 0.14505692202261047, + "learning_rate": 5.9898372920070104e-05, + "loss": 2.7816, + "step": 23821 + }, + { + "epoch": 1.4788006704326775, + "grad_norm": 0.142732047253583, + "learning_rate": 5.989483284721782e-05, + "loss": 2.8212, + "step": 23822 + }, + { + "epoch": 1.4788627475324354, + "grad_norm": 0.1480332126704752, + "learning_rate": 5.989129272274181e-05, + "loss": 2.8396, + "step": 23823 + }, + { + "epoch": 1.478924824632193, + "grad_norm": 0.14377164988646796, + "learning_rate": 5.988775254666052e-05, + "loss": 2.8935, + "step": 23824 + }, + { + "epoch": 1.478986901731951, + "grad_norm": 0.1748580478228912, + "learning_rate": 5.98842123189924e-05, + "loss": 2.8002, + "step": 23825 + }, + { + "epoch": 1.479048978831709, + "grad_norm": 0.17386586604647836, + "learning_rate": 5.988067203975598e-05, + "loss": 2.7163, + "step": 23826 + }, + { + "epoch": 1.4791110559314669, + "grad_norm": 0.14329880336495107, + "learning_rate": 5.987713170896967e-05, + "loss": 2.7635, + "step": 23827 + }, + { + "epoch": 1.4791731330312248, + "grad_norm": 0.2199113280916296, + "learning_rate": 5.987359132665198e-05, + "loss": 2.7621, + "step": 23828 + }, + { + "epoch": 1.4792352101309827, + "grad_norm": 0.1968032876605819, + "learning_rate": 5.9870050892821364e-05, + "loss": 2.7479, + "step": 23829 + }, + { + "epoch": 1.4792972872307406, + "grad_norm": 0.13987992181164008, + "learning_rate": 5.986651040749629e-05, + "loss": 2.8281, + "step": 23830 + }, + { + "epoch": 1.4793593643304985, + "grad_norm": 0.16624791219303825, + "learning_rate": 5.986296987069524e-05, + "loss": 2.825, + "step": 23831 + }, + { + "epoch": 1.4794214414302564, + "grad_norm": 0.16240321381061154, + "learning_rate": 5.985942928243668e-05, + "loss": 2.7764, + "step": 23832 + }, + { + "epoch": 1.4794835185300141, + "grad_norm": 0.15892249081568927, + "learning_rate": 5.985588864273909e-05, + "loss": 2.8335, + "step": 23833 + }, + { + "epoch": 1.479545595629772, + "grad_norm": 0.1590318871251284, + "learning_rate": 5.9852347951620935e-05, + "loss": 2.8122, + "step": 23834 + }, + { + "epoch": 1.47960767272953, + "grad_norm": 0.1488941409159974, + "learning_rate": 5.984880720910069e-05, + "loss": 2.8898, + "step": 23835 + }, + { + "epoch": 1.479669749829288, + "grad_norm": 0.15858966793951731, + "learning_rate": 5.984526641519682e-05, + "loss": 2.822, + "step": 23836 + }, + { + "epoch": 1.4797318269290458, + "grad_norm": 0.15315430151597575, + "learning_rate": 5.984172556992782e-05, + "loss": 2.8406, + "step": 23837 + }, + { + "epoch": 1.4797939040288037, + "grad_norm": 0.15763815323223404, + "learning_rate": 5.9838184673312116e-05, + "loss": 2.898, + "step": 23838 + }, + { + "epoch": 1.4798559811285616, + "grad_norm": 0.148680832583482, + "learning_rate": 5.983464372536825e-05, + "loss": 2.8319, + "step": 23839 + }, + { + "epoch": 1.4799180582283196, + "grad_norm": 0.16959787039832544, + "learning_rate": 5.983110272611463e-05, + "loss": 2.8798, + "step": 23840 + }, + { + "epoch": 1.4799801353280775, + "grad_norm": 0.14950154429972942, + "learning_rate": 5.9827561675569785e-05, + "loss": 2.9184, + "step": 23841 + }, + { + "epoch": 1.4800422124278354, + "grad_norm": 0.15130170205462723, + "learning_rate": 5.982402057375216e-05, + "loss": 2.7743, + "step": 23842 + }, + { + "epoch": 1.4801042895275933, + "grad_norm": 0.145588218951536, + "learning_rate": 5.982047942068023e-05, + "loss": 2.7864, + "step": 23843 + }, + { + "epoch": 1.4801663666273512, + "grad_norm": 0.1560732855474733, + "learning_rate": 5.981693821637249e-05, + "loss": 2.8056, + "step": 23844 + }, + { + "epoch": 1.4802284437271092, + "grad_norm": 0.1404188685169077, + "learning_rate": 5.981339696084737e-05, + "loss": 2.7246, + "step": 23845 + }, + { + "epoch": 1.480290520826867, + "grad_norm": 0.15776816373219796, + "learning_rate": 5.9809855654123404e-05, + "loss": 2.7735, + "step": 23846 + }, + { + "epoch": 1.480352597926625, + "grad_norm": 0.15076137643995885, + "learning_rate": 5.980631429621901e-05, + "loss": 2.8192, + "step": 23847 + }, + { + "epoch": 1.4804146750263827, + "grad_norm": 0.20510542551613012, + "learning_rate": 5.980277288715272e-05, + "loss": 2.851, + "step": 23848 + }, + { + "epoch": 1.4804767521261406, + "grad_norm": 0.1407519006355181, + "learning_rate": 5.979923142694297e-05, + "loss": 2.8884, + "step": 23849 + }, + { + "epoch": 1.4805388292258985, + "grad_norm": 0.16925618890845956, + "learning_rate": 5.9795689915608264e-05, + "loss": 2.8214, + "step": 23850 + }, + { + "epoch": 1.4806009063256564, + "grad_norm": 0.14620665608934563, + "learning_rate": 5.979214835316706e-05, + "loss": 2.8741, + "step": 23851 + }, + { + "epoch": 1.4806629834254144, + "grad_norm": 0.16798842669718983, + "learning_rate": 5.9788606739637845e-05, + "loss": 2.8015, + "step": 23852 + }, + { + "epoch": 1.4807250605251723, + "grad_norm": 0.1691557890633632, + "learning_rate": 5.978506507503907e-05, + "loss": 2.9412, + "step": 23853 + }, + { + "epoch": 1.4807871376249302, + "grad_norm": 0.1602354319150263, + "learning_rate": 5.978152335938925e-05, + "loss": 2.8217, + "step": 23854 + }, + { + "epoch": 1.4808492147246881, + "grad_norm": 0.16213648351111432, + "learning_rate": 5.977798159270686e-05, + "loss": 2.8699, + "step": 23855 + }, + { + "epoch": 1.480911291824446, + "grad_norm": 0.17924717149803915, + "learning_rate": 5.977443977501035e-05, + "loss": 2.8371, + "step": 23856 + }, + { + "epoch": 1.4809733689242037, + "grad_norm": 0.15432366005878007, + "learning_rate": 5.9770897906318226e-05, + "loss": 2.8886, + "step": 23857 + }, + { + "epoch": 1.4810354460239616, + "grad_norm": 0.16446779348779134, + "learning_rate": 5.9767355986648944e-05, + "loss": 2.8888, + "step": 23858 + }, + { + "epoch": 1.4810975231237196, + "grad_norm": 0.14743262770777726, + "learning_rate": 5.9763814016021e-05, + "loss": 2.8257, + "step": 23859 + }, + { + "epoch": 1.4811596002234775, + "grad_norm": 0.1611681697084854, + "learning_rate": 5.9760271994452864e-05, + "loss": 2.8634, + "step": 23860 + }, + { + "epoch": 1.4812216773232354, + "grad_norm": 0.14920543616251916, + "learning_rate": 5.9756729921963036e-05, + "loss": 2.8521, + "step": 23861 + }, + { + "epoch": 1.4812837544229933, + "grad_norm": 0.1643163329577376, + "learning_rate": 5.975318779856998e-05, + "loss": 2.8363, + "step": 23862 + }, + { + "epoch": 1.4813458315227512, + "grad_norm": 0.1541179935896213, + "learning_rate": 5.974964562429215e-05, + "loss": 2.8018, + "step": 23863 + }, + { + "epoch": 1.4814079086225092, + "grad_norm": 0.16519201403031142, + "learning_rate": 5.9746103399148076e-05, + "loss": 2.8755, + "step": 23864 + }, + { + "epoch": 1.481469985722267, + "grad_norm": 0.15687669602081417, + "learning_rate": 5.974256112315619e-05, + "loss": 2.8484, + "step": 23865 + }, + { + "epoch": 1.481532062822025, + "grad_norm": 0.1660828846834321, + "learning_rate": 5.9739018796335025e-05, + "loss": 2.783, + "step": 23866 + }, + { + "epoch": 1.481594139921783, + "grad_norm": 0.151812838737358, + "learning_rate": 5.973547641870302e-05, + "loss": 2.807, + "step": 23867 + }, + { + "epoch": 1.4816562170215408, + "grad_norm": 0.17347192164954045, + "learning_rate": 5.9731933990278673e-05, + "loss": 2.7469, + "step": 23868 + }, + { + "epoch": 1.4817182941212987, + "grad_norm": 0.15662452990825948, + "learning_rate": 5.972839151108046e-05, + "loss": 2.899, + "step": 23869 + }, + { + "epoch": 1.4817803712210567, + "grad_norm": 0.17424693179028974, + "learning_rate": 5.972484898112687e-05, + "loss": 2.9139, + "step": 23870 + }, + { + "epoch": 1.4818424483208146, + "grad_norm": 0.14595832463746403, + "learning_rate": 5.972130640043638e-05, + "loss": 2.788, + "step": 23871 + }, + { + "epoch": 1.4819045254205723, + "grad_norm": 0.17921433138201953, + "learning_rate": 5.971776376902748e-05, + "loss": 2.8717, + "step": 23872 + }, + { + "epoch": 1.4819666025203302, + "grad_norm": 0.15042465617587697, + "learning_rate": 5.971422108691864e-05, + "loss": 2.842, + "step": 23873 + }, + { + "epoch": 1.482028679620088, + "grad_norm": 0.16631004928695597, + "learning_rate": 5.971067835412836e-05, + "loss": 2.7788, + "step": 23874 + }, + { + "epoch": 1.482090756719846, + "grad_norm": 0.14660563853833555, + "learning_rate": 5.970713557067511e-05, + "loss": 2.7473, + "step": 23875 + }, + { + "epoch": 1.482152833819604, + "grad_norm": 0.17327015510173466, + "learning_rate": 5.970359273657737e-05, + "loss": 2.7989, + "step": 23876 + }, + { + "epoch": 1.4822149109193619, + "grad_norm": 0.145057820874969, + "learning_rate": 5.9700049851853644e-05, + "loss": 2.8084, + "step": 23877 + }, + { + "epoch": 1.4822769880191198, + "grad_norm": 0.13955680134777226, + "learning_rate": 5.969650691652239e-05, + "loss": 2.7474, + "step": 23878 + }, + { + "epoch": 1.4823390651188777, + "grad_norm": 0.16056468683827232, + "learning_rate": 5.969296393060212e-05, + "loss": 2.7245, + "step": 23879 + }, + { + "epoch": 1.4824011422186356, + "grad_norm": 0.1540868694132372, + "learning_rate": 5.968942089411129e-05, + "loss": 2.9139, + "step": 23880 + }, + { + "epoch": 1.4824632193183933, + "grad_norm": 0.15699826126517746, + "learning_rate": 5.96858778070684e-05, + "loss": 2.8401, + "step": 23881 + }, + { + "epoch": 1.4825252964181512, + "grad_norm": 0.15798852635436628, + "learning_rate": 5.9682334669491944e-05, + "loss": 2.829, + "step": 23882 + }, + { + "epoch": 1.4825873735179091, + "grad_norm": 0.14334222420736353, + "learning_rate": 5.96787914814004e-05, + "loss": 2.6981, + "step": 23883 + }, + { + "epoch": 1.482649450617667, + "grad_norm": 0.15241655419632127, + "learning_rate": 5.9675248242812246e-05, + "loss": 2.9473, + "step": 23884 + }, + { + "epoch": 1.482711527717425, + "grad_norm": 0.14109159916815167, + "learning_rate": 5.9671704953745964e-05, + "loss": 2.8697, + "step": 23885 + }, + { + "epoch": 1.482773604817183, + "grad_norm": 0.148458611091365, + "learning_rate": 5.9668161614220066e-05, + "loss": 2.7768, + "step": 23886 + }, + { + "epoch": 1.4828356819169408, + "grad_norm": 0.14471898258555452, + "learning_rate": 5.9664618224252997e-05, + "loss": 2.7538, + "step": 23887 + }, + { + "epoch": 1.4828977590166987, + "grad_norm": 0.14684378718416077, + "learning_rate": 5.966107478386328e-05, + "loss": 2.6853, + "step": 23888 + }, + { + "epoch": 1.4829598361164567, + "grad_norm": 0.1441584237540746, + "learning_rate": 5.96575312930694e-05, + "loss": 2.8681, + "step": 23889 + }, + { + "epoch": 1.4830219132162146, + "grad_norm": 0.1588681923558028, + "learning_rate": 5.965398775188983e-05, + "loss": 2.7354, + "step": 23890 + }, + { + "epoch": 1.4830839903159725, + "grad_norm": 0.15089078813204124, + "learning_rate": 5.965044416034307e-05, + "loss": 2.8317, + "step": 23891 + }, + { + "epoch": 1.4831460674157304, + "grad_norm": 0.16220642539869476, + "learning_rate": 5.964690051844758e-05, + "loss": 2.8318, + "step": 23892 + }, + { + "epoch": 1.4832081445154883, + "grad_norm": 0.1477800982670987, + "learning_rate": 5.964335682622189e-05, + "loss": 2.767, + "step": 23893 + }, + { + "epoch": 1.4832702216152462, + "grad_norm": 0.15290232495201897, + "learning_rate": 5.963981308368445e-05, + "loss": 2.907, + "step": 23894 + }, + { + "epoch": 1.4833322987150042, + "grad_norm": 0.14604019213497554, + "learning_rate": 5.963626929085377e-05, + "loss": 2.7489, + "step": 23895 + }, + { + "epoch": 1.4833943758147619, + "grad_norm": 0.15443480640117158, + "learning_rate": 5.963272544774833e-05, + "loss": 2.753, + "step": 23896 + }, + { + "epoch": 1.4834564529145198, + "grad_norm": 0.15142435409481514, + "learning_rate": 5.962918155438664e-05, + "loss": 2.7952, + "step": 23897 + }, + { + "epoch": 1.4835185300142777, + "grad_norm": 0.14981642631085232, + "learning_rate": 5.962563761078715e-05, + "loss": 2.857, + "step": 23898 + }, + { + "epoch": 1.4835806071140356, + "grad_norm": 0.14751178925687825, + "learning_rate": 5.962209361696838e-05, + "loss": 2.8784, + "step": 23899 + }, + { + "epoch": 1.4836426842137935, + "grad_norm": 0.1498606745059304, + "learning_rate": 5.9618549572948826e-05, + "loss": 2.8329, + "step": 23900 + }, + { + "epoch": 1.4837047613135514, + "grad_norm": 0.1416011218360095, + "learning_rate": 5.961500547874694e-05, + "loss": 2.8415, + "step": 23901 + }, + { + "epoch": 1.4837668384133094, + "grad_norm": 0.16123056625464927, + "learning_rate": 5.9611461334381255e-05, + "loss": 2.8602, + "step": 23902 + }, + { + "epoch": 1.4838289155130673, + "grad_norm": 0.15643829087062425, + "learning_rate": 5.960791713987022e-05, + "loss": 2.8693, + "step": 23903 + }, + { + "epoch": 1.4838909926128252, + "grad_norm": 0.1589084432201138, + "learning_rate": 5.960437289523237e-05, + "loss": 2.9108, + "step": 23904 + }, + { + "epoch": 1.483953069712583, + "grad_norm": 0.15810394121437052, + "learning_rate": 5.960082860048617e-05, + "loss": 2.832, + "step": 23905 + }, + { + "epoch": 1.4840151468123408, + "grad_norm": 0.15242980091754915, + "learning_rate": 5.959728425565012e-05, + "loss": 2.8167, + "step": 23906 + }, + { + "epoch": 1.4840772239120987, + "grad_norm": 0.15443340731361438, + "learning_rate": 5.95937398607427e-05, + "loss": 2.8074, + "step": 23907 + }, + { + "epoch": 1.4841393010118566, + "grad_norm": 0.15973218157431257, + "learning_rate": 5.959019541578241e-05, + "loss": 2.8481, + "step": 23908 + }, + { + "epoch": 1.4842013781116146, + "grad_norm": 0.15441349306394814, + "learning_rate": 5.958665092078776e-05, + "loss": 2.8461, + "step": 23909 + }, + { + "epoch": 1.4842634552113725, + "grad_norm": 0.14787081412955408, + "learning_rate": 5.95831063757772e-05, + "loss": 2.8674, + "step": 23910 + }, + { + "epoch": 1.4843255323111304, + "grad_norm": 0.1764763440436409, + "learning_rate": 5.957956178076927e-05, + "loss": 2.8648, + "step": 23911 + }, + { + "epoch": 1.4843876094108883, + "grad_norm": 0.14971645770968536, + "learning_rate": 5.9576017135782424e-05, + "loss": 2.7239, + "step": 23912 + }, + { + "epoch": 1.4844496865106462, + "grad_norm": 0.1514149867234736, + "learning_rate": 5.957247244083518e-05, + "loss": 2.6722, + "step": 23913 + }, + { + "epoch": 1.4845117636104042, + "grad_norm": 0.141619694305269, + "learning_rate": 5.956892769594602e-05, + "loss": 2.756, + "step": 23914 + }, + { + "epoch": 1.484573840710162, + "grad_norm": 0.15401389332127036, + "learning_rate": 5.956538290113345e-05, + "loss": 2.7401, + "step": 23915 + }, + { + "epoch": 1.48463591780992, + "grad_norm": 0.17085286759035884, + "learning_rate": 5.956183805641595e-05, + "loss": 2.832, + "step": 23916 + }, + { + "epoch": 1.484697994909678, + "grad_norm": 0.14428884566500058, + "learning_rate": 5.955829316181202e-05, + "loss": 2.8299, + "step": 23917 + }, + { + "epoch": 1.4847600720094358, + "grad_norm": 0.15552838705638222, + "learning_rate": 5.955474821734015e-05, + "loss": 2.77, + "step": 23918 + }, + { + "epoch": 1.4848221491091937, + "grad_norm": 0.14966811603119246, + "learning_rate": 5.955120322301882e-05, + "loss": 2.7682, + "step": 23919 + }, + { + "epoch": 1.4848842262089514, + "grad_norm": 0.15550322294230037, + "learning_rate": 5.954765817886658e-05, + "loss": 2.7558, + "step": 23920 + }, + { + "epoch": 1.4849463033087094, + "grad_norm": 0.1636991507716018, + "learning_rate": 5.9544113084901867e-05, + "loss": 2.7759, + "step": 23921 + }, + { + "epoch": 1.4850083804084673, + "grad_norm": 0.15667156332518942, + "learning_rate": 5.954056794114321e-05, + "loss": 2.8461, + "step": 23922 + }, + { + "epoch": 1.4850704575082252, + "grad_norm": 0.1622893357160532, + "learning_rate": 5.95370227476091e-05, + "loss": 2.8611, + "step": 23923 + }, + { + "epoch": 1.4851325346079831, + "grad_norm": 0.1535000870876034, + "learning_rate": 5.9533477504318015e-05, + "loss": 2.7867, + "step": 23924 + }, + { + "epoch": 1.485194611707741, + "grad_norm": 0.2183928383860175, + "learning_rate": 5.952993221128847e-05, + "loss": 2.8043, + "step": 23925 + }, + { + "epoch": 1.485256688807499, + "grad_norm": 0.14552210411189562, + "learning_rate": 5.952638686853894e-05, + "loss": 2.8442, + "step": 23926 + }, + { + "epoch": 1.4853187659072569, + "grad_norm": 0.17672668098576258, + "learning_rate": 5.952284147608795e-05, + "loss": 2.8252, + "step": 23927 + }, + { + "epoch": 1.4853808430070148, + "grad_norm": 0.1687837776622707, + "learning_rate": 5.951929603395399e-05, + "loss": 2.8252, + "step": 23928 + }, + { + "epoch": 1.4854429201067725, + "grad_norm": 0.15233695186216822, + "learning_rate": 5.951575054215555e-05, + "loss": 2.8056, + "step": 23929 + }, + { + "epoch": 1.4855049972065304, + "grad_norm": 0.1486687991354801, + "learning_rate": 5.951220500071112e-05, + "loss": 2.774, + "step": 23930 + }, + { + "epoch": 1.4855670743062883, + "grad_norm": 0.1673346181901472, + "learning_rate": 5.950865940963921e-05, + "loss": 2.8403, + "step": 23931 + }, + { + "epoch": 1.4856291514060462, + "grad_norm": 0.16778472866836647, + "learning_rate": 5.950511376895831e-05, + "loss": 2.8458, + "step": 23932 + }, + { + "epoch": 1.4856912285058042, + "grad_norm": 0.15074457898680874, + "learning_rate": 5.950156807868694e-05, + "loss": 2.8414, + "step": 23933 + }, + { + "epoch": 1.485753305605562, + "grad_norm": 0.16353120380592678, + "learning_rate": 5.949802233884358e-05, + "loss": 2.8105, + "step": 23934 + }, + { + "epoch": 1.48581538270532, + "grad_norm": 0.17244161063365898, + "learning_rate": 5.949447654944672e-05, + "loss": 2.8638, + "step": 23935 + }, + { + "epoch": 1.485877459805078, + "grad_norm": 0.17101534523206946, + "learning_rate": 5.949093071051487e-05, + "loss": 2.8591, + "step": 23936 + }, + { + "epoch": 1.4859395369048358, + "grad_norm": 0.15447279402084854, + "learning_rate": 5.948738482206653e-05, + "loss": 2.755, + "step": 23937 + }, + { + "epoch": 1.4860016140045937, + "grad_norm": 0.16360488148721788, + "learning_rate": 5.9483838884120216e-05, + "loss": 2.7333, + "step": 23938 + }, + { + "epoch": 1.4860636911043517, + "grad_norm": 0.15291317257431938, + "learning_rate": 5.94802928966944e-05, + "loss": 2.8066, + "step": 23939 + }, + { + "epoch": 1.4861257682041096, + "grad_norm": 0.16945580438594626, + "learning_rate": 5.94767468598076e-05, + "loss": 2.8149, + "step": 23940 + }, + { + "epoch": 1.4861878453038675, + "grad_norm": 0.18370381566364213, + "learning_rate": 5.947320077347831e-05, + "loss": 2.8277, + "step": 23941 + }, + { + "epoch": 1.4862499224036254, + "grad_norm": 0.18867290725583585, + "learning_rate": 5.946965463772504e-05, + "loss": 2.8307, + "step": 23942 + }, + { + "epoch": 1.4863119995033833, + "grad_norm": 0.1602070483615853, + "learning_rate": 5.946610845256627e-05, + "loss": 2.855, + "step": 23943 + }, + { + "epoch": 1.486374076603141, + "grad_norm": 0.18766557813229057, + "learning_rate": 5.946256221802051e-05, + "loss": 2.8392, + "step": 23944 + }, + { + "epoch": 1.486436153702899, + "grad_norm": 0.15481036489367303, + "learning_rate": 5.945901593410629e-05, + "loss": 2.8459, + "step": 23945 + }, + { + "epoch": 1.4864982308026569, + "grad_norm": 0.15724544530224763, + "learning_rate": 5.9455469600842064e-05, + "loss": 2.8786, + "step": 23946 + }, + { + "epoch": 1.4865603079024148, + "grad_norm": 0.15869945212287953, + "learning_rate": 5.945192321824636e-05, + "loss": 2.839, + "step": 23947 + }, + { + "epoch": 1.4866223850021727, + "grad_norm": 0.15494276620548622, + "learning_rate": 5.9448376786337676e-05, + "loss": 2.797, + "step": 23948 + }, + { + "epoch": 1.4866844621019306, + "grad_norm": 0.15563785925940055, + "learning_rate": 5.944483030513454e-05, + "loss": 2.7652, + "step": 23949 + }, + { + "epoch": 1.4867465392016885, + "grad_norm": 0.15330637728785232, + "learning_rate": 5.944128377465541e-05, + "loss": 2.8918, + "step": 23950 + }, + { + "epoch": 1.4868086163014465, + "grad_norm": 0.1514956021882355, + "learning_rate": 5.943773719491882e-05, + "loss": 2.9127, + "step": 23951 + }, + { + "epoch": 1.4868706934012044, + "grad_norm": 0.17620639779956024, + "learning_rate": 5.943419056594326e-05, + "loss": 2.7927, + "step": 23952 + }, + { + "epoch": 1.486932770500962, + "grad_norm": 0.17486226609360492, + "learning_rate": 5.943064388774723e-05, + "loss": 2.8207, + "step": 23953 + }, + { + "epoch": 1.48699484760072, + "grad_norm": 0.18234353803411485, + "learning_rate": 5.942709716034926e-05, + "loss": 2.9036, + "step": 23954 + }, + { + "epoch": 1.487056924700478, + "grad_norm": 0.16658500754602162, + "learning_rate": 5.9423550383767814e-05, + "loss": 2.8536, + "step": 23955 + }, + { + "epoch": 1.4871190018002358, + "grad_norm": 0.17550073599627078, + "learning_rate": 5.942000355802143e-05, + "loss": 2.7367, + "step": 23956 + }, + { + "epoch": 1.4871810788999937, + "grad_norm": 0.17565161375250005, + "learning_rate": 5.94164566831286e-05, + "loss": 2.7843, + "step": 23957 + }, + { + "epoch": 1.4872431559997517, + "grad_norm": 0.15238833997529516, + "learning_rate": 5.9412909759107823e-05, + "loss": 2.7842, + "step": 23958 + }, + { + "epoch": 1.4873052330995096, + "grad_norm": 0.16436592494911775, + "learning_rate": 5.940936278597763e-05, + "loss": 2.7577, + "step": 23959 + }, + { + "epoch": 1.4873673101992675, + "grad_norm": 0.14510932865144135, + "learning_rate": 5.940581576375649e-05, + "loss": 2.8054, + "step": 23960 + }, + { + "epoch": 1.4874293872990254, + "grad_norm": 0.16847279239949675, + "learning_rate": 5.9402268692462924e-05, + "loss": 2.8147, + "step": 23961 + }, + { + "epoch": 1.4874914643987833, + "grad_norm": 0.16623666857745212, + "learning_rate": 5.939872157211544e-05, + "loss": 2.7699, + "step": 23962 + }, + { + "epoch": 1.4875535414985412, + "grad_norm": 0.15265957178622958, + "learning_rate": 5.939517440273257e-05, + "loss": 2.6856, + "step": 23963 + }, + { + "epoch": 1.4876156185982992, + "grad_norm": 0.14856559471635095, + "learning_rate": 5.9391627184332756e-05, + "loss": 2.8133, + "step": 23964 + }, + { + "epoch": 1.487677695698057, + "grad_norm": 0.14874723393809614, + "learning_rate": 5.938807991693457e-05, + "loss": 2.7975, + "step": 23965 + }, + { + "epoch": 1.487739772797815, + "grad_norm": 0.1527786161539406, + "learning_rate": 5.938453260055648e-05, + "loss": 2.8645, + "step": 23966 + }, + { + "epoch": 1.487801849897573, + "grad_norm": 0.16553898435638786, + "learning_rate": 5.938098523521701e-05, + "loss": 2.7547, + "step": 23967 + }, + { + "epoch": 1.4878639269973306, + "grad_norm": 0.1542402839699692, + "learning_rate": 5.9377437820934656e-05, + "loss": 2.8316, + "step": 23968 + }, + { + "epoch": 1.4879260040970885, + "grad_norm": 0.15537487171694123, + "learning_rate": 5.9373890357727936e-05, + "loss": 2.8118, + "step": 23969 + }, + { + "epoch": 1.4879880811968464, + "grad_norm": 0.14598018342657543, + "learning_rate": 5.937034284561536e-05, + "loss": 2.7253, + "step": 23970 + }, + { + "epoch": 1.4880501582966044, + "grad_norm": 0.16195646795971327, + "learning_rate": 5.936679528461542e-05, + "loss": 2.7654, + "step": 23971 + }, + { + "epoch": 1.4881122353963623, + "grad_norm": 0.14067068285141912, + "learning_rate": 5.9363247674746645e-05, + "loss": 2.8865, + "step": 23972 + }, + { + "epoch": 1.4881743124961202, + "grad_norm": 0.15213858409702513, + "learning_rate": 5.935970001602752e-05, + "loss": 2.8626, + "step": 23973 + }, + { + "epoch": 1.4882363895958781, + "grad_norm": 0.166399579071151, + "learning_rate": 5.935615230847659e-05, + "loss": 2.703, + "step": 23974 + }, + { + "epoch": 1.488298466695636, + "grad_norm": 0.15206500374606013, + "learning_rate": 5.935260455211232e-05, + "loss": 2.7824, + "step": 23975 + }, + { + "epoch": 1.488360543795394, + "grad_norm": 0.1631145509300035, + "learning_rate": 5.934905674695326e-05, + "loss": 2.837, + "step": 23976 + }, + { + "epoch": 1.4884226208951516, + "grad_norm": 0.16058539255690482, + "learning_rate": 5.9345508893017896e-05, + "loss": 2.7711, + "step": 23977 + }, + { + "epoch": 1.4884846979949096, + "grad_norm": 0.1571037926765762, + "learning_rate": 5.934196099032474e-05, + "loss": 2.8136, + "step": 23978 + }, + { + "epoch": 1.4885467750946675, + "grad_norm": 0.15729317527535946, + "learning_rate": 5.933841303889231e-05, + "loss": 2.8581, + "step": 23979 + }, + { + "epoch": 1.4886088521944254, + "grad_norm": 0.1553753812091659, + "learning_rate": 5.9334865038739104e-05, + "loss": 2.814, + "step": 23980 + }, + { + "epoch": 1.4886709292941833, + "grad_norm": 0.15091207442159243, + "learning_rate": 5.933131698988365e-05, + "loss": 2.7763, + "step": 23981 + }, + { + "epoch": 1.4887330063939412, + "grad_norm": 0.17537106232834557, + "learning_rate": 5.9327768892344437e-05, + "loss": 2.8615, + "step": 23982 + }, + { + "epoch": 1.4887950834936992, + "grad_norm": 0.15847418369823768, + "learning_rate": 5.9324220746140005e-05, + "loss": 2.857, + "step": 23983 + }, + { + "epoch": 1.488857160593457, + "grad_norm": 0.14950596719214893, + "learning_rate": 5.932067255128884e-05, + "loss": 2.8023, + "step": 23984 + }, + { + "epoch": 1.488919237693215, + "grad_norm": 0.16490584419797666, + "learning_rate": 5.9317124307809456e-05, + "loss": 2.8327, + "step": 23985 + }, + { + "epoch": 1.488981314792973, + "grad_norm": 0.17304064758841015, + "learning_rate": 5.93135760157204e-05, + "loss": 2.8473, + "step": 23986 + }, + { + "epoch": 1.4890433918927308, + "grad_norm": 0.16074908833853305, + "learning_rate": 5.931002767504012e-05, + "loss": 2.8703, + "step": 23987 + }, + { + "epoch": 1.4891054689924887, + "grad_norm": 0.14406758114300594, + "learning_rate": 5.9306479285787197e-05, + "loss": 2.8417, + "step": 23988 + }, + { + "epoch": 1.4891675460922467, + "grad_norm": 0.16787506418376044, + "learning_rate": 5.93029308479801e-05, + "loss": 2.7791, + "step": 23989 + }, + { + "epoch": 1.4892296231920046, + "grad_norm": 0.19553476085013588, + "learning_rate": 5.929938236163736e-05, + "loss": 2.847, + "step": 23990 + }, + { + "epoch": 1.4892917002917625, + "grad_norm": 0.154774590260982, + "learning_rate": 5.929583382677747e-05, + "loss": 2.9093, + "step": 23991 + }, + { + "epoch": 1.4893537773915202, + "grad_norm": 0.14776843895198755, + "learning_rate": 5.9292285243418974e-05, + "loss": 2.7715, + "step": 23992 + }, + { + "epoch": 1.4894158544912781, + "grad_norm": 0.14582562142371666, + "learning_rate": 5.928873661158035e-05, + "loss": 2.7865, + "step": 23993 + }, + { + "epoch": 1.489477931591036, + "grad_norm": 0.18035476602407216, + "learning_rate": 5.928518793128016e-05, + "loss": 2.8637, + "step": 23994 + }, + { + "epoch": 1.489540008690794, + "grad_norm": 0.21887228306740306, + "learning_rate": 5.9281639202536886e-05, + "loss": 2.8487, + "step": 23995 + }, + { + "epoch": 1.4896020857905519, + "grad_norm": 0.15437125574042287, + "learning_rate": 5.9278090425369026e-05, + "loss": 2.8211, + "step": 23996 + }, + { + "epoch": 1.4896641628903098, + "grad_norm": 0.1857038035077453, + "learning_rate": 5.927454159979513e-05, + "loss": 2.8503, + "step": 23997 + }, + { + "epoch": 1.4897262399900677, + "grad_norm": 0.15251453314774235, + "learning_rate": 5.927099272583369e-05, + "loss": 2.8931, + "step": 23998 + }, + { + "epoch": 1.4897883170898256, + "grad_norm": 0.16460633967153424, + "learning_rate": 5.926744380350325e-05, + "loss": 2.792, + "step": 23999 + }, + { + "epoch": 1.4898503941895835, + "grad_norm": 0.14126004437584905, + "learning_rate": 5.9263894832822286e-05, + "loss": 2.8493, + "step": 24000 + }, + { + "epoch": 1.4899124712893412, + "grad_norm": 0.15398761080085985, + "learning_rate": 5.9260345813809336e-05, + "loss": 2.8841, + "step": 24001 + }, + { + "epoch": 1.4899745483890992, + "grad_norm": 0.15106343586568272, + "learning_rate": 5.925679674648292e-05, + "loss": 2.7899, + "step": 24002 + }, + { + "epoch": 1.490036625488857, + "grad_norm": 0.14625459450857073, + "learning_rate": 5.925324763086155e-05, + "loss": 2.7895, + "step": 24003 + }, + { + "epoch": 1.490098702588615, + "grad_norm": 0.15483713931764376, + "learning_rate": 5.9249698466963745e-05, + "loss": 2.7023, + "step": 24004 + }, + { + "epoch": 1.490160779688373, + "grad_norm": 0.14288188635424637, + "learning_rate": 5.9246149254808005e-05, + "loss": 2.76, + "step": 24005 + }, + { + "epoch": 1.4902228567881308, + "grad_norm": 0.150004467053258, + "learning_rate": 5.924259999441286e-05, + "loss": 2.7692, + "step": 24006 + }, + { + "epoch": 1.4902849338878887, + "grad_norm": 0.1552407332233161, + "learning_rate": 5.9239050685796834e-05, + "loss": 2.7571, + "step": 24007 + }, + { + "epoch": 1.4903470109876467, + "grad_norm": 0.15189583661937417, + "learning_rate": 5.923550132897844e-05, + "loss": 2.8169, + "step": 24008 + }, + { + "epoch": 1.4904090880874046, + "grad_norm": 0.1643896417569358, + "learning_rate": 5.9231951923976184e-05, + "loss": 2.8314, + "step": 24009 + }, + { + "epoch": 1.4904711651871625, + "grad_norm": 0.15585734980326288, + "learning_rate": 5.922840247080861e-05, + "loss": 2.8033, + "step": 24010 + }, + { + "epoch": 1.4905332422869204, + "grad_norm": 0.17291136423755674, + "learning_rate": 5.9224852969494204e-05, + "loss": 2.8338, + "step": 24011 + }, + { + "epoch": 1.4905953193866783, + "grad_norm": 0.1609700565833132, + "learning_rate": 5.922130342005151e-05, + "loss": 2.7879, + "step": 24012 + }, + { + "epoch": 1.4906573964864362, + "grad_norm": 0.15949982355072162, + "learning_rate": 5.921775382249902e-05, + "loss": 2.8365, + "step": 24013 + }, + { + "epoch": 1.4907194735861942, + "grad_norm": 0.17299929194980773, + "learning_rate": 5.921420417685529e-05, + "loss": 2.7826, + "step": 24014 + }, + { + "epoch": 1.490781550685952, + "grad_norm": 0.16694060304100697, + "learning_rate": 5.921065448313883e-05, + "loss": 2.7719, + "step": 24015 + }, + { + "epoch": 1.4908436277857098, + "grad_norm": 0.14053526637984384, + "learning_rate": 5.9207104741368123e-05, + "loss": 2.7961, + "step": 24016 + }, + { + "epoch": 1.4909057048854677, + "grad_norm": 0.1476514482474293, + "learning_rate": 5.920355495156174e-05, + "loss": 2.838, + "step": 24017 + }, + { + "epoch": 1.4909677819852256, + "grad_norm": 0.14070188222078933, + "learning_rate": 5.920000511373813e-05, + "loss": 2.8046, + "step": 24018 + }, + { + "epoch": 1.4910298590849835, + "grad_norm": 0.2033460771138197, + "learning_rate": 5.9196455227915904e-05, + "loss": 2.7364, + "step": 24019 + }, + { + "epoch": 1.4910919361847415, + "grad_norm": 0.15097586593368542, + "learning_rate": 5.919290529411353e-05, + "loss": 2.8817, + "step": 24020 + }, + { + "epoch": 1.4911540132844994, + "grad_norm": 0.19369070968664237, + "learning_rate": 5.9189355312349535e-05, + "loss": 2.7964, + "step": 24021 + }, + { + "epoch": 1.4912160903842573, + "grad_norm": 0.14590596763042646, + "learning_rate": 5.9185805282642445e-05, + "loss": 2.8906, + "step": 24022 + }, + { + "epoch": 1.4912781674840152, + "grad_norm": 0.14706842337683076, + "learning_rate": 5.9182255205010773e-05, + "loss": 2.8636, + "step": 24023 + }, + { + "epoch": 1.4913402445837731, + "grad_norm": 0.13717516600357366, + "learning_rate": 5.917870507947305e-05, + "loss": 2.8617, + "step": 24024 + }, + { + "epoch": 1.4914023216835308, + "grad_norm": 0.153687335459872, + "learning_rate": 5.917515490604779e-05, + "loss": 2.8085, + "step": 24025 + }, + { + "epoch": 1.4914643987832887, + "grad_norm": 0.14267101919594108, + "learning_rate": 5.9171604684753536e-05, + "loss": 2.7531, + "step": 24026 + }, + { + "epoch": 1.4915264758830467, + "grad_norm": 0.15355100102779964, + "learning_rate": 5.916805441560878e-05, + "loss": 2.8095, + "step": 24027 + }, + { + "epoch": 1.4915885529828046, + "grad_norm": 0.154226280915529, + "learning_rate": 5.916450409863206e-05, + "loss": 2.8884, + "step": 24028 + }, + { + "epoch": 1.4916506300825625, + "grad_norm": 0.14817523374112665, + "learning_rate": 5.91609537338419e-05, + "loss": 2.9036, + "step": 24029 + }, + { + "epoch": 1.4917127071823204, + "grad_norm": 0.16879148039689432, + "learning_rate": 5.915740332125681e-05, + "loss": 2.8402, + "step": 24030 + }, + { + "epoch": 1.4917747842820783, + "grad_norm": 0.15189261764021406, + "learning_rate": 5.915385286089533e-05, + "loss": 2.8365, + "step": 24031 + }, + { + "epoch": 1.4918368613818362, + "grad_norm": 0.1959283655644443, + "learning_rate": 5.915030235277599e-05, + "loss": 2.8311, + "step": 24032 + }, + { + "epoch": 1.4918989384815942, + "grad_norm": 0.1601718394903036, + "learning_rate": 5.914675179691729e-05, + "loss": 2.889, + "step": 24033 + }, + { + "epoch": 1.491961015581352, + "grad_norm": 0.14584053964066965, + "learning_rate": 5.9143201193337746e-05, + "loss": 2.8264, + "step": 24034 + }, + { + "epoch": 1.49202309268111, + "grad_norm": 0.1628960629111243, + "learning_rate": 5.913965054205594e-05, + "loss": 2.7588, + "step": 24035 + }, + { + "epoch": 1.492085169780868, + "grad_norm": 0.14969834229697218, + "learning_rate": 5.913609984309032e-05, + "loss": 2.8408, + "step": 24036 + }, + { + "epoch": 1.4921472468806258, + "grad_norm": 0.15994173896541594, + "learning_rate": 5.913254909645948e-05, + "loss": 2.7976, + "step": 24037 + }, + { + "epoch": 1.4922093239803838, + "grad_norm": 0.14532654627725958, + "learning_rate": 5.91289983021819e-05, + "loss": 2.7922, + "step": 24038 + }, + { + "epoch": 1.4922714010801417, + "grad_norm": 0.15343119019782978, + "learning_rate": 5.912544746027613e-05, + "loss": 2.8607, + "step": 24039 + }, + { + "epoch": 1.4923334781798994, + "grad_norm": 0.1513927991795396, + "learning_rate": 5.912189657076068e-05, + "loss": 2.7535, + "step": 24040 + }, + { + "epoch": 1.4923955552796573, + "grad_norm": 0.18008704959823388, + "learning_rate": 5.911834563365407e-05, + "loss": 2.7979, + "step": 24041 + }, + { + "epoch": 1.4924576323794152, + "grad_norm": 0.16085833088302978, + "learning_rate": 5.911479464897486e-05, + "loss": 2.8208, + "step": 24042 + }, + { + "epoch": 1.4925197094791731, + "grad_norm": 0.15485579626647603, + "learning_rate": 5.911124361674153e-05, + "loss": 2.8713, + "step": 24043 + }, + { + "epoch": 1.492581786578931, + "grad_norm": 0.18957202169361945, + "learning_rate": 5.910769253697265e-05, + "loss": 2.8431, + "step": 24044 + }, + { + "epoch": 1.492643863678689, + "grad_norm": 0.1610817510444836, + "learning_rate": 5.910414140968671e-05, + "loss": 2.8349, + "step": 24045 + }, + { + "epoch": 1.4927059407784469, + "grad_norm": 0.15385354005982582, + "learning_rate": 5.910059023490227e-05, + "loss": 2.8263, + "step": 24046 + }, + { + "epoch": 1.4927680178782048, + "grad_norm": 0.19071644442316513, + "learning_rate": 5.9097039012637833e-05, + "loss": 2.8518, + "step": 24047 + }, + { + "epoch": 1.4928300949779627, + "grad_norm": 0.15231186582012993, + "learning_rate": 5.9093487742911934e-05, + "loss": 2.7943, + "step": 24048 + }, + { + "epoch": 1.4928921720777204, + "grad_norm": 0.15313905583311296, + "learning_rate": 5.9089936425743096e-05, + "loss": 2.8083, + "step": 24049 + }, + { + "epoch": 1.4929542491774783, + "grad_norm": 0.16100640974368946, + "learning_rate": 5.908638506114986e-05, + "loss": 2.7859, + "step": 24050 + }, + { + "epoch": 1.4930163262772362, + "grad_norm": 0.1617327589610841, + "learning_rate": 5.908283364915075e-05, + "loss": 2.735, + "step": 24051 + }, + { + "epoch": 1.4930784033769942, + "grad_norm": 0.15867758470646098, + "learning_rate": 5.907928218976429e-05, + "loss": 2.8213, + "step": 24052 + }, + { + "epoch": 1.493140480476752, + "grad_norm": 0.1730675344857663, + "learning_rate": 5.907573068300901e-05, + "loss": 2.7311, + "step": 24053 + }, + { + "epoch": 1.49320255757651, + "grad_norm": 0.17271019925588552, + "learning_rate": 5.9072179128903435e-05, + "loss": 2.7438, + "step": 24054 + }, + { + "epoch": 1.493264634676268, + "grad_norm": 0.19200900596927264, + "learning_rate": 5.906862752746611e-05, + "loss": 2.86, + "step": 24055 + }, + { + "epoch": 1.4933267117760258, + "grad_norm": 0.15951691928353018, + "learning_rate": 5.906507587871555e-05, + "loss": 2.8373, + "step": 24056 + }, + { + "epoch": 1.4933887888757837, + "grad_norm": 0.1843234838832878, + "learning_rate": 5.9061524182670295e-05, + "loss": 2.7169, + "step": 24057 + }, + { + "epoch": 1.4934508659755417, + "grad_norm": 0.1674637744468389, + "learning_rate": 5.905797243934886e-05, + "loss": 2.8278, + "step": 24058 + }, + { + "epoch": 1.4935129430752996, + "grad_norm": 0.1586412086322035, + "learning_rate": 5.9054420648769785e-05, + "loss": 2.8111, + "step": 24059 + }, + { + "epoch": 1.4935750201750575, + "grad_norm": 0.17931261554513522, + "learning_rate": 5.90508688109516e-05, + "loss": 2.7446, + "step": 24060 + }, + { + "epoch": 1.4936370972748154, + "grad_norm": 0.16144169765314031, + "learning_rate": 5.904731692591283e-05, + "loss": 2.7885, + "step": 24061 + }, + { + "epoch": 1.4936991743745733, + "grad_norm": 0.15983981179860934, + "learning_rate": 5.904376499367201e-05, + "loss": 2.8297, + "step": 24062 + }, + { + "epoch": 1.4937612514743313, + "grad_norm": 0.14939746889210223, + "learning_rate": 5.9040213014247684e-05, + "loss": 2.8405, + "step": 24063 + }, + { + "epoch": 1.493823328574089, + "grad_norm": 0.16432521426009744, + "learning_rate": 5.903666098765837e-05, + "loss": 2.7699, + "step": 24064 + }, + { + "epoch": 1.4938854056738469, + "grad_norm": 0.15708791056924098, + "learning_rate": 5.903310891392259e-05, + "loss": 2.878, + "step": 24065 + }, + { + "epoch": 1.4939474827736048, + "grad_norm": 0.15348158087600178, + "learning_rate": 5.9029556793058904e-05, + "loss": 2.8601, + "step": 24066 + }, + { + "epoch": 1.4940095598733627, + "grad_norm": 0.15842011958315194, + "learning_rate": 5.902600462508581e-05, + "loss": 2.8032, + "step": 24067 + }, + { + "epoch": 1.4940716369731206, + "grad_norm": 0.1619382782648334, + "learning_rate": 5.9022452410021865e-05, + "loss": 2.7932, + "step": 24068 + }, + { + "epoch": 1.4941337140728785, + "grad_norm": 0.15415482078182025, + "learning_rate": 5.90189001478856e-05, + "loss": 2.7134, + "step": 24069 + }, + { + "epoch": 1.4941957911726365, + "grad_norm": 0.16227751365531334, + "learning_rate": 5.901534783869555e-05, + "loss": 2.8172, + "step": 24070 + }, + { + "epoch": 1.4942578682723944, + "grad_norm": 0.21082790494672826, + "learning_rate": 5.9011795482470234e-05, + "loss": 2.8302, + "step": 24071 + }, + { + "epoch": 1.4943199453721523, + "grad_norm": 0.17730689779191047, + "learning_rate": 5.900824307922819e-05, + "loss": 2.7835, + "step": 24072 + }, + { + "epoch": 1.49438202247191, + "grad_norm": 0.15169805850197624, + "learning_rate": 5.900469062898796e-05, + "loss": 2.771, + "step": 24073 + }, + { + "epoch": 1.494444099571668, + "grad_norm": 0.15880086205103466, + "learning_rate": 5.900113813176806e-05, + "loss": 2.8807, + "step": 24074 + }, + { + "epoch": 1.4945061766714258, + "grad_norm": 0.16246544557679013, + "learning_rate": 5.899758558758706e-05, + "loss": 2.7694, + "step": 24075 + }, + { + "epoch": 1.4945682537711837, + "grad_norm": 0.16190803617851696, + "learning_rate": 5.899403299646347e-05, + "loss": 2.8159, + "step": 24076 + }, + { + "epoch": 1.4946303308709417, + "grad_norm": 0.15186429381408906, + "learning_rate": 5.8990480358415814e-05, + "loss": 2.9177, + "step": 24077 + }, + { + "epoch": 1.4946924079706996, + "grad_norm": 0.15681111791381, + "learning_rate": 5.8986927673462645e-05, + "loss": 2.8589, + "step": 24078 + }, + { + "epoch": 1.4947544850704575, + "grad_norm": 0.15279642732419882, + "learning_rate": 5.898337494162248e-05, + "loss": 2.9163, + "step": 24079 + }, + { + "epoch": 1.4948165621702154, + "grad_norm": 0.17838639354206295, + "learning_rate": 5.8979822162913886e-05, + "loss": 2.7921, + "step": 24080 + }, + { + "epoch": 1.4948786392699733, + "grad_norm": 0.1638520685070884, + "learning_rate": 5.897626933735536e-05, + "loss": 2.812, + "step": 24081 + }, + { + "epoch": 1.4949407163697312, + "grad_norm": 0.18956963407535393, + "learning_rate": 5.8972716464965485e-05, + "loss": 2.7679, + "step": 24082 + }, + { + "epoch": 1.4950027934694892, + "grad_norm": 0.1510807896512337, + "learning_rate": 5.896916354576274e-05, + "loss": 2.7456, + "step": 24083 + }, + { + "epoch": 1.495064870569247, + "grad_norm": 0.15577481253665793, + "learning_rate": 5.8965610579765705e-05, + "loss": 2.8032, + "step": 24084 + }, + { + "epoch": 1.495126947669005, + "grad_norm": 0.1487578461384899, + "learning_rate": 5.896205756699291e-05, + "loss": 2.9051, + "step": 24085 + }, + { + "epoch": 1.495189024768763, + "grad_norm": 0.15810618550619385, + "learning_rate": 5.895850450746287e-05, + "loss": 2.838, + "step": 24086 + }, + { + "epoch": 1.4952511018685208, + "grad_norm": 0.15106275153701415, + "learning_rate": 5.895495140119416e-05, + "loss": 2.7678, + "step": 24087 + }, + { + "epoch": 1.4953131789682785, + "grad_norm": 0.18266751620918556, + "learning_rate": 5.895139824820527e-05, + "loss": 2.8692, + "step": 24088 + }, + { + "epoch": 1.4953752560680365, + "grad_norm": 0.20247505817339487, + "learning_rate": 5.894784504851478e-05, + "loss": 2.8104, + "step": 24089 + }, + { + "epoch": 1.4954373331677944, + "grad_norm": 0.16240692980926616, + "learning_rate": 5.89442918021412e-05, + "loss": 2.8843, + "step": 24090 + }, + { + "epoch": 1.4954994102675523, + "grad_norm": 0.1760332632222271, + "learning_rate": 5.894073850910309e-05, + "loss": 2.796, + "step": 24091 + }, + { + "epoch": 1.4955614873673102, + "grad_norm": 0.1468057984304038, + "learning_rate": 5.893718516941895e-05, + "loss": 2.7756, + "step": 24092 + }, + { + "epoch": 1.4956235644670681, + "grad_norm": 0.16079656601219008, + "learning_rate": 5.893363178310738e-05, + "loss": 2.8172, + "step": 24093 + }, + { + "epoch": 1.495685641566826, + "grad_norm": 0.21508259934532037, + "learning_rate": 5.8930078350186866e-05, + "loss": 2.7865, + "step": 24094 + }, + { + "epoch": 1.495747718666584, + "grad_norm": 0.15803765249757137, + "learning_rate": 5.892652487067597e-05, + "loss": 2.8126, + "step": 24095 + }, + { + "epoch": 1.4958097957663419, + "grad_norm": 0.17117000800802984, + "learning_rate": 5.892297134459322e-05, + "loss": 2.8156, + "step": 24096 + }, + { + "epoch": 1.4958718728660996, + "grad_norm": 0.1654632973335394, + "learning_rate": 5.891941777195716e-05, + "loss": 2.8058, + "step": 24097 + }, + { + "epoch": 1.4959339499658575, + "grad_norm": 0.1651220564421036, + "learning_rate": 5.891586415278635e-05, + "loss": 2.8934, + "step": 24098 + }, + { + "epoch": 1.4959960270656154, + "grad_norm": 0.1636030371032832, + "learning_rate": 5.8912310487099296e-05, + "loss": 2.7334, + "step": 24099 + }, + { + "epoch": 1.4960581041653733, + "grad_norm": 0.19938292937519897, + "learning_rate": 5.890875677491455e-05, + "loss": 2.8365, + "step": 24100 + }, + { + "epoch": 1.4961201812651312, + "grad_norm": 0.19016351927291833, + "learning_rate": 5.8905203016250674e-05, + "loss": 2.7271, + "step": 24101 + }, + { + "epoch": 1.4961822583648892, + "grad_norm": 0.15425352501628284, + "learning_rate": 5.890164921112619e-05, + "loss": 2.7734, + "step": 24102 + }, + { + "epoch": 1.496244335464647, + "grad_norm": 0.21860415842227188, + "learning_rate": 5.8898095359559636e-05, + "loss": 2.8529, + "step": 24103 + }, + { + "epoch": 1.496306412564405, + "grad_norm": 0.16647094109376678, + "learning_rate": 5.889454146156955e-05, + "loss": 2.7967, + "step": 24104 + }, + { + "epoch": 1.496368489664163, + "grad_norm": 0.16387386490285413, + "learning_rate": 5.88909875171745e-05, + "loss": 2.7583, + "step": 24105 + }, + { + "epoch": 1.4964305667639208, + "grad_norm": 0.15735342108733663, + "learning_rate": 5.888743352639299e-05, + "loss": 2.8935, + "step": 24106 + }, + { + "epoch": 1.4964926438636788, + "grad_norm": 0.16334748567382115, + "learning_rate": 5.88838794892436e-05, + "loss": 2.7617, + "step": 24107 + }, + { + "epoch": 1.4965547209634367, + "grad_norm": 0.17273056515202026, + "learning_rate": 5.888032540574484e-05, + "loss": 2.7951, + "step": 24108 + }, + { + "epoch": 1.4966167980631946, + "grad_norm": 0.1765005916745613, + "learning_rate": 5.8876771275915276e-05, + "loss": 2.6883, + "step": 24109 + }, + { + "epoch": 1.4966788751629525, + "grad_norm": 0.16162636048186163, + "learning_rate": 5.887321709977344e-05, + "loss": 2.8564, + "step": 24110 + }, + { + "epoch": 1.4967409522627104, + "grad_norm": 0.1531380462927568, + "learning_rate": 5.8869662877337874e-05, + "loss": 2.7729, + "step": 24111 + }, + { + "epoch": 1.4968030293624681, + "grad_norm": 0.1491299402377104, + "learning_rate": 5.886610860862713e-05, + "loss": 2.8055, + "step": 24112 + }, + { + "epoch": 1.496865106462226, + "grad_norm": 0.1589110922567949, + "learning_rate": 5.886255429365973e-05, + "loss": 2.762, + "step": 24113 + }, + { + "epoch": 1.496927183561984, + "grad_norm": 0.15392977466976715, + "learning_rate": 5.885899993245425e-05, + "loss": 2.8245, + "step": 24114 + }, + { + "epoch": 1.4969892606617419, + "grad_norm": 0.1646376756746222, + "learning_rate": 5.885544552502921e-05, + "loss": 2.8765, + "step": 24115 + }, + { + "epoch": 1.4970513377614998, + "grad_norm": 0.1628952624898099, + "learning_rate": 5.8851891071403156e-05, + "loss": 2.7996, + "step": 24116 + }, + { + "epoch": 1.4971134148612577, + "grad_norm": 0.15117214927352746, + "learning_rate": 5.884833657159463e-05, + "loss": 2.9077, + "step": 24117 + }, + { + "epoch": 1.4971754919610156, + "grad_norm": 0.14314287505074608, + "learning_rate": 5.8844782025622205e-05, + "loss": 2.7792, + "step": 24118 + }, + { + "epoch": 1.4972375690607735, + "grad_norm": 0.16182317518167152, + "learning_rate": 5.88412274335044e-05, + "loss": 2.7465, + "step": 24119 + }, + { + "epoch": 1.4972996461605315, + "grad_norm": 0.1437945988653289, + "learning_rate": 5.883767279525974e-05, + "loss": 2.8919, + "step": 24120 + }, + { + "epoch": 1.4973617232602892, + "grad_norm": 0.1440026954342316, + "learning_rate": 5.883411811090683e-05, + "loss": 2.8064, + "step": 24121 + }, + { + "epoch": 1.497423800360047, + "grad_norm": 0.16568051343539203, + "learning_rate": 5.8830563380464165e-05, + "loss": 2.8987, + "step": 24122 + }, + { + "epoch": 1.497485877459805, + "grad_norm": 0.14661364891020987, + "learning_rate": 5.882700860395032e-05, + "loss": 2.8553, + "step": 24123 + }, + { + "epoch": 1.497547954559563, + "grad_norm": 0.1522104646150366, + "learning_rate": 5.882345378138381e-05, + "loss": 2.8478, + "step": 24124 + }, + { + "epoch": 1.4976100316593208, + "grad_norm": 0.16724240925445802, + "learning_rate": 5.881989891278321e-05, + "loss": 2.7957, + "step": 24125 + }, + { + "epoch": 1.4976721087590787, + "grad_norm": 0.1518387003284875, + "learning_rate": 5.881634399816704e-05, + "loss": 2.7057, + "step": 24126 + }, + { + "epoch": 1.4977341858588367, + "grad_norm": 0.17094094973819382, + "learning_rate": 5.8812789037553894e-05, + "loss": 2.9117, + "step": 24127 + }, + { + "epoch": 1.4977962629585946, + "grad_norm": 0.14678798365400797, + "learning_rate": 5.880923403096226e-05, + "loss": 2.7673, + "step": 24128 + }, + { + "epoch": 1.4978583400583525, + "grad_norm": 0.17396015874871928, + "learning_rate": 5.880567897841073e-05, + "loss": 2.7202, + "step": 24129 + }, + { + "epoch": 1.4979204171581104, + "grad_norm": 0.15151024487842915, + "learning_rate": 5.8802123879917836e-05, + "loss": 2.7762, + "step": 24130 + }, + { + "epoch": 1.4979824942578683, + "grad_norm": 0.14543659592377398, + "learning_rate": 5.8798568735502116e-05, + "loss": 2.7192, + "step": 24131 + }, + { + "epoch": 1.4980445713576263, + "grad_norm": 0.15428933595998862, + "learning_rate": 5.879501354518212e-05, + "loss": 2.8245, + "step": 24132 + }, + { + "epoch": 1.4981066484573842, + "grad_norm": 0.1767184703733625, + "learning_rate": 5.879145830897641e-05, + "loss": 2.7404, + "step": 24133 + }, + { + "epoch": 1.498168725557142, + "grad_norm": 0.15161465750216005, + "learning_rate": 5.8787903026903537e-05, + "loss": 2.8305, + "step": 24134 + }, + { + "epoch": 1.4982308026569, + "grad_norm": 0.1570239509005879, + "learning_rate": 5.8784347698982024e-05, + "loss": 2.8129, + "step": 24135 + }, + { + "epoch": 1.4982928797566577, + "grad_norm": 0.16553074768527432, + "learning_rate": 5.8780792325230446e-05, + "loss": 2.7573, + "step": 24136 + }, + { + "epoch": 1.4983549568564156, + "grad_norm": 0.1656773374294318, + "learning_rate": 5.8777236905667355e-05, + "loss": 2.7763, + "step": 24137 + }, + { + "epoch": 1.4984170339561735, + "grad_norm": 0.1606808711171705, + "learning_rate": 5.877368144031128e-05, + "loss": 2.8495, + "step": 24138 + }, + { + "epoch": 1.4984791110559315, + "grad_norm": 0.1504568350561599, + "learning_rate": 5.8770125929180774e-05, + "loss": 2.8237, + "step": 24139 + }, + { + "epoch": 1.4985411881556894, + "grad_norm": 0.16499938149227913, + "learning_rate": 5.8766570372294384e-05, + "loss": 2.8336, + "step": 24140 + }, + { + "epoch": 1.4986032652554473, + "grad_norm": 0.1485674878708599, + "learning_rate": 5.876301476967069e-05, + "loss": 2.8306, + "step": 24141 + }, + { + "epoch": 1.4986653423552052, + "grad_norm": 0.14687529218928716, + "learning_rate": 5.87594591213282e-05, + "loss": 2.719, + "step": 24142 + }, + { + "epoch": 1.4987274194549631, + "grad_norm": 0.18346462375781106, + "learning_rate": 5.87559034272855e-05, + "loss": 2.8126, + "step": 24143 + }, + { + "epoch": 1.498789496554721, + "grad_norm": 0.16736183747278596, + "learning_rate": 5.875234768756112e-05, + "loss": 2.8923, + "step": 24144 + }, + { + "epoch": 1.4988515736544787, + "grad_norm": 0.1927413504041438, + "learning_rate": 5.874879190217362e-05, + "loss": 2.8264, + "step": 24145 + }, + { + "epoch": 1.4989136507542367, + "grad_norm": 0.14699430084534038, + "learning_rate": 5.874523607114154e-05, + "loss": 2.6853, + "step": 24146 + }, + { + "epoch": 1.4989757278539946, + "grad_norm": 0.17003142453861095, + "learning_rate": 5.874168019448345e-05, + "loss": 2.8781, + "step": 24147 + }, + { + "epoch": 1.4990378049537525, + "grad_norm": 0.16243425240449746, + "learning_rate": 5.8738124272217895e-05, + "loss": 2.8601, + "step": 24148 + }, + { + "epoch": 1.4990998820535104, + "grad_norm": 0.16884216290832435, + "learning_rate": 5.8734568304363405e-05, + "loss": 2.81, + "step": 24149 + }, + { + "epoch": 1.4991619591532683, + "grad_norm": 0.1737173509711283, + "learning_rate": 5.873101229093857e-05, + "loss": 2.8814, + "step": 24150 + }, + { + "epoch": 1.4992240362530262, + "grad_norm": 0.16589016381083121, + "learning_rate": 5.872745623196192e-05, + "loss": 2.7527, + "step": 24151 + }, + { + "epoch": 1.4992861133527842, + "grad_norm": 0.15205359949380157, + "learning_rate": 5.872390012745201e-05, + "loss": 2.7958, + "step": 24152 + }, + { + "epoch": 1.499348190452542, + "grad_norm": 0.1577475369230157, + "learning_rate": 5.872034397742739e-05, + "loss": 2.7669, + "step": 24153 + }, + { + "epoch": 1.4994102675523, + "grad_norm": 0.16589634481052837, + "learning_rate": 5.8716787781906635e-05, + "loss": 2.8147, + "step": 24154 + }, + { + "epoch": 1.499472344652058, + "grad_norm": 0.16351025028025473, + "learning_rate": 5.871323154090828e-05, + "loss": 2.8714, + "step": 24155 + }, + { + "epoch": 1.4995344217518158, + "grad_norm": 0.16115431786570425, + "learning_rate": 5.870967525445087e-05, + "loss": 2.7817, + "step": 24156 + }, + { + "epoch": 1.4995964988515738, + "grad_norm": 0.1593407827932594, + "learning_rate": 5.8706118922552975e-05, + "loss": 2.817, + "step": 24157 + }, + { + "epoch": 1.4996585759513317, + "grad_norm": 0.1554378203195846, + "learning_rate": 5.870256254523314e-05, + "loss": 2.791, + "step": 24158 + }, + { + "epoch": 1.4997206530510896, + "grad_norm": 0.15753500756555594, + "learning_rate": 5.8699006122509924e-05, + "loss": 2.8403, + "step": 24159 + }, + { + "epoch": 1.4997827301508473, + "grad_norm": 0.15333270976490057, + "learning_rate": 5.869544965440188e-05, + "loss": 2.7454, + "step": 24160 + }, + { + "epoch": 1.4998448072506052, + "grad_norm": 0.15144257052745763, + "learning_rate": 5.869189314092758e-05, + "loss": 2.7572, + "step": 24161 + }, + { + "epoch": 1.4999068843503631, + "grad_norm": 0.16169407513479506, + "learning_rate": 5.868833658210554e-05, + "loss": 2.8886, + "step": 24162 + }, + { + "epoch": 1.499968961450121, + "grad_norm": 0.16457914566394277, + "learning_rate": 5.8684779977954365e-05, + "loss": 2.7906, + "step": 24163 + }, + { + "epoch": 1.500031038549879, + "grad_norm": 0.1572477492279785, + "learning_rate": 5.868122332849255e-05, + "loss": 2.7816, + "step": 24164 + }, + { + "epoch": 1.5000931156496369, + "grad_norm": 0.1488627128258892, + "learning_rate": 5.8677666633738704e-05, + "loss": 2.7848, + "step": 24165 + }, + { + "epoch": 1.5001551927493948, + "grad_norm": 0.1531046486487203, + "learning_rate": 5.8674109893711374e-05, + "loss": 2.7865, + "step": 24166 + }, + { + "epoch": 1.5002172698491525, + "grad_norm": 0.17873562710004395, + "learning_rate": 5.867055310842909e-05, + "loss": 2.7664, + "step": 24167 + }, + { + "epoch": 1.5002793469489104, + "grad_norm": 0.16285793561902315, + "learning_rate": 5.866699627791045e-05, + "loss": 2.8685, + "step": 24168 + }, + { + "epoch": 1.5003414240486683, + "grad_norm": 0.17133118471474118, + "learning_rate": 5.866343940217396e-05, + "loss": 2.8297, + "step": 24169 + }, + { + "epoch": 1.5004035011484262, + "grad_norm": 0.14685683899057686, + "learning_rate": 5.865988248123821e-05, + "loss": 2.7514, + "step": 24170 + }, + { + "epoch": 1.5004655782481842, + "grad_norm": 0.17387650384499434, + "learning_rate": 5.8656325515121757e-05, + "loss": 2.7675, + "step": 24171 + }, + { + "epoch": 1.500527655347942, + "grad_norm": 0.16105811381668092, + "learning_rate": 5.865276850384316e-05, + "loss": 2.7855, + "step": 24172 + }, + { + "epoch": 1.5005897324477, + "grad_norm": 0.17789960872865507, + "learning_rate": 5.864921144742096e-05, + "loss": 2.7586, + "step": 24173 + }, + { + "epoch": 1.500651809547458, + "grad_norm": 0.15412729336160816, + "learning_rate": 5.864565434587372e-05, + "loss": 2.867, + "step": 24174 + }, + { + "epoch": 1.5007138866472158, + "grad_norm": 0.16171433101826996, + "learning_rate": 5.864209719922001e-05, + "loss": 2.8811, + "step": 24175 + }, + { + "epoch": 1.5007759637469738, + "grad_norm": 0.15133387906700657, + "learning_rate": 5.863854000747837e-05, + "loss": 2.7217, + "step": 24176 + }, + { + "epoch": 1.5008380408467317, + "grad_norm": 0.15107932251666606, + "learning_rate": 5.863498277066739e-05, + "loss": 2.88, + "step": 24177 + }, + { + "epoch": 1.5009001179464896, + "grad_norm": 0.17363321971523735, + "learning_rate": 5.863142548880558e-05, + "loss": 2.8241, + "step": 24178 + }, + { + "epoch": 1.5009621950462475, + "grad_norm": 0.15373098443843483, + "learning_rate": 5.862786816191156e-05, + "loss": 2.7805, + "step": 24179 + }, + { + "epoch": 1.5010242721460054, + "grad_norm": 0.15292978664312157, + "learning_rate": 5.862431079000383e-05, + "loss": 2.8456, + "step": 24180 + }, + { + "epoch": 1.5010863492457633, + "grad_norm": 0.1555558222003716, + "learning_rate": 5.8620753373100975e-05, + "loss": 2.8167, + "step": 24181 + }, + { + "epoch": 1.5011484263455213, + "grad_norm": 0.1443965516167698, + "learning_rate": 5.8617195911221576e-05, + "loss": 2.7684, + "step": 24182 + }, + { + "epoch": 1.5012105034452792, + "grad_norm": 0.14858602321202485, + "learning_rate": 5.861363840438416e-05, + "loss": 2.7784, + "step": 24183 + }, + { + "epoch": 1.501272580545037, + "grad_norm": 0.1475998937445545, + "learning_rate": 5.861008085260733e-05, + "loss": 2.7523, + "step": 24184 + }, + { + "epoch": 1.5013346576447948, + "grad_norm": 0.16401704090288943, + "learning_rate": 5.8606523255909584e-05, + "loss": 2.7459, + "step": 24185 + }, + { + "epoch": 1.5013967347445527, + "grad_norm": 0.15329586733182715, + "learning_rate": 5.860296561430953e-05, + "loss": 2.7752, + "step": 24186 + }, + { + "epoch": 1.5014588118443106, + "grad_norm": 0.1540593358966514, + "learning_rate": 5.859940792782572e-05, + "loss": 2.7582, + "step": 24187 + }, + { + "epoch": 1.5015208889440685, + "grad_norm": 0.15473122364122086, + "learning_rate": 5.859585019647671e-05, + "loss": 2.8219, + "step": 24188 + }, + { + "epoch": 1.5015829660438265, + "grad_norm": 0.1468274295652799, + "learning_rate": 5.859229242028106e-05, + "loss": 2.7987, + "step": 24189 + }, + { + "epoch": 1.5016450431435844, + "grad_norm": 0.16567277286821944, + "learning_rate": 5.8588734599257335e-05, + "loss": 2.927, + "step": 24190 + }, + { + "epoch": 1.501707120243342, + "grad_norm": 0.16782022163626972, + "learning_rate": 5.8585176733424095e-05, + "loss": 2.9245, + "step": 24191 + }, + { + "epoch": 1.5017691973431, + "grad_norm": 0.16360278662978725, + "learning_rate": 5.858161882279991e-05, + "loss": 2.7672, + "step": 24192 + }, + { + "epoch": 1.501831274442858, + "grad_norm": 0.17210652731272474, + "learning_rate": 5.857806086740334e-05, + "loss": 2.8604, + "step": 24193 + }, + { + "epoch": 1.5018933515426158, + "grad_norm": 0.17909151213390484, + "learning_rate": 5.857450286725292e-05, + "loss": 2.7585, + "step": 24194 + }, + { + "epoch": 1.5019554286423737, + "grad_norm": 0.1438977948541062, + "learning_rate": 5.857094482236726e-05, + "loss": 2.7673, + "step": 24195 + }, + { + "epoch": 1.5020175057421317, + "grad_norm": 0.16960039640406457, + "learning_rate": 5.856738673276488e-05, + "loss": 2.9439, + "step": 24196 + }, + { + "epoch": 1.5020795828418896, + "grad_norm": 0.151376822380323, + "learning_rate": 5.8563828598464386e-05, + "loss": 2.8578, + "step": 24197 + }, + { + "epoch": 1.5021416599416475, + "grad_norm": 0.1574995857187908, + "learning_rate": 5.8560270419484306e-05, + "loss": 2.7843, + "step": 24198 + }, + { + "epoch": 1.5022037370414054, + "grad_norm": 0.15996126191195986, + "learning_rate": 5.855671219584322e-05, + "loss": 2.8682, + "step": 24199 + }, + { + "epoch": 1.5022658141411633, + "grad_norm": 0.15976264316602268, + "learning_rate": 5.855315392755969e-05, + "loss": 2.8396, + "step": 24200 + }, + { + "epoch": 1.5023278912409213, + "grad_norm": 0.15087990621587244, + "learning_rate": 5.8549595614652276e-05, + "loss": 2.8064, + "step": 24201 + }, + { + "epoch": 1.5023899683406792, + "grad_norm": 0.16464699216233158, + "learning_rate": 5.854603725713955e-05, + "loss": 2.8648, + "step": 24202 + }, + { + "epoch": 1.502452045440437, + "grad_norm": 0.1569061568654416, + "learning_rate": 5.8542478855040076e-05, + "loss": 2.8722, + "step": 24203 + }, + { + "epoch": 1.502514122540195, + "grad_norm": 0.17014344337967396, + "learning_rate": 5.853892040837241e-05, + "loss": 2.7828, + "step": 24204 + }, + { + "epoch": 1.502576199639953, + "grad_norm": 0.1609239611902972, + "learning_rate": 5.853536191715513e-05, + "loss": 2.8411, + "step": 24205 + }, + { + "epoch": 1.5026382767397108, + "grad_norm": 0.1626689857531914, + "learning_rate": 5.853180338140679e-05, + "loss": 2.8337, + "step": 24206 + }, + { + "epoch": 1.5027003538394688, + "grad_norm": 0.19025813469369798, + "learning_rate": 5.852824480114596e-05, + "loss": 2.8292, + "step": 24207 + }, + { + "epoch": 1.5027624309392267, + "grad_norm": 0.14527880165103182, + "learning_rate": 5.852468617639122e-05, + "loss": 2.7912, + "step": 24208 + }, + { + "epoch": 1.5028245080389844, + "grad_norm": 0.1733713416911352, + "learning_rate": 5.85211275071611e-05, + "loss": 2.7955, + "step": 24209 + }, + { + "epoch": 1.5028865851387423, + "grad_norm": 0.14460177247534392, + "learning_rate": 5.851756879347421e-05, + "loss": 2.8234, + "step": 24210 + }, + { + "epoch": 1.5029486622385002, + "grad_norm": 0.15587809529133584, + "learning_rate": 5.851401003534909e-05, + "loss": 2.8458, + "step": 24211 + }, + { + "epoch": 1.5030107393382581, + "grad_norm": 0.1509893809178252, + "learning_rate": 5.8510451232804296e-05, + "loss": 2.781, + "step": 24212 + }, + { + "epoch": 1.503072816438016, + "grad_norm": 0.16810848833033404, + "learning_rate": 5.850689238585844e-05, + "loss": 2.7696, + "step": 24213 + }, + { + "epoch": 1.503134893537774, + "grad_norm": 0.15393195881552396, + "learning_rate": 5.8503333494530025e-05, + "loss": 2.8235, + "step": 24214 + }, + { + "epoch": 1.5031969706375317, + "grad_norm": 0.17831688049289443, + "learning_rate": 5.8499774558837686e-05, + "loss": 2.8625, + "step": 24215 + }, + { + "epoch": 1.5032590477372896, + "grad_norm": 0.15284744160627414, + "learning_rate": 5.849621557879995e-05, + "loss": 2.7455, + "step": 24216 + }, + { + "epoch": 1.5033211248370475, + "grad_norm": 0.18010417683588298, + "learning_rate": 5.8492656554435386e-05, + "loss": 2.8745, + "step": 24217 + }, + { + "epoch": 1.5033832019368054, + "grad_norm": 0.1539822037619727, + "learning_rate": 5.848909748576259e-05, + "loss": 2.8308, + "step": 24218 + }, + { + "epoch": 1.5034452790365633, + "grad_norm": 0.14439624847790353, + "learning_rate": 5.8485538372800096e-05, + "loss": 2.7069, + "step": 24219 + }, + { + "epoch": 1.5035073561363212, + "grad_norm": 0.15512362636592744, + "learning_rate": 5.848197921556649e-05, + "loss": 2.8098, + "step": 24220 + }, + { + "epoch": 1.5035694332360792, + "grad_norm": 0.14317639129969137, + "learning_rate": 5.847842001408035e-05, + "loss": 2.8664, + "step": 24221 + }, + { + "epoch": 1.503631510335837, + "grad_norm": 0.16000169181301813, + "learning_rate": 5.847486076836023e-05, + "loss": 2.8911, + "step": 24222 + }, + { + "epoch": 1.503693587435595, + "grad_norm": 0.2080247093107652, + "learning_rate": 5.8471301478424686e-05, + "loss": 2.8238, + "step": 24223 + }, + { + "epoch": 1.503755664535353, + "grad_norm": 0.16139324988923753, + "learning_rate": 5.846774214429234e-05, + "loss": 2.8854, + "step": 24224 + }, + { + "epoch": 1.5038177416351108, + "grad_norm": 0.15782435131895933, + "learning_rate": 5.84641827659817e-05, + "loss": 2.8171, + "step": 24225 + }, + { + "epoch": 1.5038798187348688, + "grad_norm": 0.1580529972348254, + "learning_rate": 5.8460623343511376e-05, + "loss": 2.8506, + "step": 24226 + }, + { + "epoch": 1.5039418958346267, + "grad_norm": 0.15079507072633386, + "learning_rate": 5.845706387689992e-05, + "loss": 2.8483, + "step": 24227 + }, + { + "epoch": 1.5040039729343846, + "grad_norm": 0.18278503863796958, + "learning_rate": 5.8453504366165915e-05, + "loss": 2.889, + "step": 24228 + }, + { + "epoch": 1.5040660500341425, + "grad_norm": 0.1492533723499701, + "learning_rate": 5.8449944811327925e-05, + "loss": 2.7518, + "step": 24229 + }, + { + "epoch": 1.5041281271339004, + "grad_norm": 0.15912207046308294, + "learning_rate": 5.844638521240451e-05, + "loss": 2.7201, + "step": 24230 + }, + { + "epoch": 1.5041902042336583, + "grad_norm": 0.1507320491784928, + "learning_rate": 5.8442825569414264e-05, + "loss": 2.8348, + "step": 24231 + }, + { + "epoch": 1.5042522813334163, + "grad_norm": 0.15826165917800408, + "learning_rate": 5.8439265882375735e-05, + "loss": 2.8024, + "step": 24232 + }, + { + "epoch": 1.504314358433174, + "grad_norm": 0.14867487549071792, + "learning_rate": 5.8435706151307525e-05, + "loss": 2.7929, + "step": 24233 + }, + { + "epoch": 1.5043764355329319, + "grad_norm": 0.1624157434969476, + "learning_rate": 5.843214637622817e-05, + "loss": 2.864, + "step": 24234 + }, + { + "epoch": 1.5044385126326898, + "grad_norm": 0.15320134254753126, + "learning_rate": 5.8428586557156284e-05, + "loss": 2.8166, + "step": 24235 + }, + { + "epoch": 1.5045005897324477, + "grad_norm": 0.15052218070851628, + "learning_rate": 5.842502669411041e-05, + "loss": 2.8165, + "step": 24236 + }, + { + "epoch": 1.5045626668322056, + "grad_norm": 0.16914729354081834, + "learning_rate": 5.842146678710912e-05, + "loss": 2.7577, + "step": 24237 + }, + { + "epoch": 1.5046247439319635, + "grad_norm": 0.17799629572633094, + "learning_rate": 5.841790683617099e-05, + "loss": 2.8624, + "step": 24238 + }, + { + "epoch": 1.5046868210317212, + "grad_norm": 0.15339053422867466, + "learning_rate": 5.8414346841314594e-05, + "loss": 2.7472, + "step": 24239 + }, + { + "epoch": 1.5047488981314792, + "grad_norm": 0.18572174665770086, + "learning_rate": 5.8410786802558515e-05, + "loss": 2.7251, + "step": 24240 + }, + { + "epoch": 1.504810975231237, + "grad_norm": 0.16005274870780017, + "learning_rate": 5.840722671992132e-05, + "loss": 2.7788, + "step": 24241 + }, + { + "epoch": 1.504873052330995, + "grad_norm": 0.14744714963002212, + "learning_rate": 5.840366659342158e-05, + "loss": 2.7105, + "step": 24242 + }, + { + "epoch": 1.504935129430753, + "grad_norm": 0.17133737598253843, + "learning_rate": 5.840010642307786e-05, + "loss": 2.7781, + "step": 24243 + }, + { + "epoch": 1.5049972065305108, + "grad_norm": 0.15219653793028823, + "learning_rate": 5.8396546208908765e-05, + "loss": 2.8013, + "step": 24244 + }, + { + "epoch": 1.5050592836302688, + "grad_norm": 0.1802864561825266, + "learning_rate": 5.8392985950932835e-05, + "loss": 2.7715, + "step": 24245 + }, + { + "epoch": 1.5051213607300267, + "grad_norm": 0.15940630759548693, + "learning_rate": 5.8389425649168675e-05, + "loss": 2.8742, + "step": 24246 + }, + { + "epoch": 1.5051834378297846, + "grad_norm": 0.17585715138687374, + "learning_rate": 5.838586530363482e-05, + "loss": 2.7461, + "step": 24247 + }, + { + "epoch": 1.5052455149295425, + "grad_norm": 0.15836788357881906, + "learning_rate": 5.8382304914349884e-05, + "loss": 2.6818, + "step": 24248 + }, + { + "epoch": 1.5053075920293004, + "grad_norm": 0.1924465093359036, + "learning_rate": 5.837874448133244e-05, + "loss": 2.7606, + "step": 24249 + }, + { + "epoch": 1.5053696691290583, + "grad_norm": 0.16042301593225064, + "learning_rate": 5.8375184004601035e-05, + "loss": 2.8358, + "step": 24250 + }, + { + "epoch": 1.5054317462288163, + "grad_norm": 0.16913226153199265, + "learning_rate": 5.8371623484174277e-05, + "loss": 2.799, + "step": 24251 + }, + { + "epoch": 1.5054938233285742, + "grad_norm": 0.1686298810427042, + "learning_rate": 5.836806292007071e-05, + "loss": 2.8465, + "step": 24252 + }, + { + "epoch": 1.505555900428332, + "grad_norm": 0.1482702052863897, + "learning_rate": 5.836450231230894e-05, + "loss": 2.782, + "step": 24253 + }, + { + "epoch": 1.50561797752809, + "grad_norm": 0.1557242787964509, + "learning_rate": 5.8360941660907545e-05, + "loss": 2.8635, + "step": 24254 + }, + { + "epoch": 1.505680054627848, + "grad_norm": 0.15237595144480198, + "learning_rate": 5.8357380965885056e-05, + "loss": 2.8063, + "step": 24255 + }, + { + "epoch": 1.5057421317276058, + "grad_norm": 0.15913399818791854, + "learning_rate": 5.8353820227260104e-05, + "loss": 2.8334, + "step": 24256 + }, + { + "epoch": 1.5058042088273635, + "grad_norm": 0.16074191565044493, + "learning_rate": 5.835025944505123e-05, + "loss": 2.793, + "step": 24257 + }, + { + "epoch": 1.5058662859271215, + "grad_norm": 0.1488031038392398, + "learning_rate": 5.834669861927704e-05, + "loss": 2.8038, + "step": 24258 + }, + { + "epoch": 1.5059283630268794, + "grad_norm": 0.14456643501076746, + "learning_rate": 5.8343137749956075e-05, + "loss": 2.7936, + "step": 24259 + }, + { + "epoch": 1.5059904401266373, + "grad_norm": 0.16267417848301044, + "learning_rate": 5.8339576837106955e-05, + "loss": 2.8304, + "step": 24260 + }, + { + "epoch": 1.5060525172263952, + "grad_norm": 0.14232768965815887, + "learning_rate": 5.833601588074823e-05, + "loss": 2.8072, + "step": 24261 + }, + { + "epoch": 1.5061145943261531, + "grad_norm": 0.15573600628567694, + "learning_rate": 5.833245488089849e-05, + "loss": 2.7924, + "step": 24262 + }, + { + "epoch": 1.5061766714259108, + "grad_norm": 0.1409217828669312, + "learning_rate": 5.83288938375763e-05, + "loss": 2.7979, + "step": 24263 + }, + { + "epoch": 1.5062387485256687, + "grad_norm": 0.14918154657148944, + "learning_rate": 5.8325332750800256e-05, + "loss": 2.8208, + "step": 24264 + }, + { + "epoch": 1.5063008256254267, + "grad_norm": 0.1628731407186025, + "learning_rate": 5.8321771620588936e-05, + "loss": 2.9253, + "step": 24265 + }, + { + "epoch": 1.5063629027251846, + "grad_norm": 0.16832829407566877, + "learning_rate": 5.831821044696091e-05, + "loss": 2.8529, + "step": 24266 + }, + { + "epoch": 1.5064249798249425, + "grad_norm": 0.1436484348639232, + "learning_rate": 5.831464922993476e-05, + "loss": 2.7553, + "step": 24267 + }, + { + "epoch": 1.5064870569247004, + "grad_norm": 0.1550654872193713, + "learning_rate": 5.8311087969529066e-05, + "loss": 2.763, + "step": 24268 + }, + { + "epoch": 1.5065491340244583, + "grad_norm": 0.15557427922194816, + "learning_rate": 5.8307526665762416e-05, + "loss": 2.8162, + "step": 24269 + }, + { + "epoch": 1.5066112111242163, + "grad_norm": 0.14701942637513005, + "learning_rate": 5.830396531865336e-05, + "loss": 2.6713, + "step": 24270 + }, + { + "epoch": 1.5066732882239742, + "grad_norm": 0.1949357786552061, + "learning_rate": 5.830040392822052e-05, + "loss": 2.7877, + "step": 24271 + }, + { + "epoch": 1.506735365323732, + "grad_norm": 0.14987800599385365, + "learning_rate": 5.829684249448246e-05, + "loss": 2.8557, + "step": 24272 + }, + { + "epoch": 1.50679744242349, + "grad_norm": 0.1624437468620295, + "learning_rate": 5.829328101745774e-05, + "loss": 2.7864, + "step": 24273 + }, + { + "epoch": 1.506859519523248, + "grad_norm": 0.15063820932368654, + "learning_rate": 5.828971949716498e-05, + "loss": 2.8473, + "step": 24274 + }, + { + "epoch": 1.5069215966230058, + "grad_norm": 0.1803978272822109, + "learning_rate": 5.8286157933622734e-05, + "loss": 2.822, + "step": 24275 + }, + { + "epoch": 1.5069836737227638, + "grad_norm": 0.1481532022029543, + "learning_rate": 5.828259632684959e-05, + "loss": 2.8133, + "step": 24276 + }, + { + "epoch": 1.5070457508225217, + "grad_norm": 0.16396439548269842, + "learning_rate": 5.827903467686413e-05, + "loss": 2.8344, + "step": 24277 + }, + { + "epoch": 1.5071078279222796, + "grad_norm": 0.1448593925210257, + "learning_rate": 5.827547298368494e-05, + "loss": 2.847, + "step": 24278 + }, + { + "epoch": 1.5071699050220375, + "grad_norm": 0.16654618137679358, + "learning_rate": 5.827191124733059e-05, + "loss": 2.7988, + "step": 24279 + }, + { + "epoch": 1.5072319821217954, + "grad_norm": 0.1448330306126636, + "learning_rate": 5.826834946781967e-05, + "loss": 2.7719, + "step": 24280 + }, + { + "epoch": 1.5072940592215531, + "grad_norm": 0.16061604023966256, + "learning_rate": 5.826478764517077e-05, + "loss": 2.7895, + "step": 24281 + }, + { + "epoch": 1.507356136321311, + "grad_norm": 0.14799988385869314, + "learning_rate": 5.826122577940246e-05, + "loss": 2.8252, + "step": 24282 + }, + { + "epoch": 1.507418213421069, + "grad_norm": 0.18640520432875207, + "learning_rate": 5.825766387053335e-05, + "loss": 2.7672, + "step": 24283 + }, + { + "epoch": 1.5074802905208269, + "grad_norm": 0.15494732829925215, + "learning_rate": 5.825410191858198e-05, + "loss": 2.8418, + "step": 24284 + }, + { + "epoch": 1.5075423676205848, + "grad_norm": 0.15938669330110664, + "learning_rate": 5.825053992356697e-05, + "loss": 2.8448, + "step": 24285 + }, + { + "epoch": 1.5076044447203427, + "grad_norm": 0.16143947088646063, + "learning_rate": 5.824697788550688e-05, + "loss": 2.847, + "step": 24286 + }, + { + "epoch": 1.5076665218201004, + "grad_norm": 0.18072945725768488, + "learning_rate": 5.824341580442031e-05, + "loss": 2.7669, + "step": 24287 + }, + { + "epoch": 1.5077285989198583, + "grad_norm": 0.16459833923773196, + "learning_rate": 5.823985368032584e-05, + "loss": 2.8552, + "step": 24288 + }, + { + "epoch": 1.5077906760196162, + "grad_norm": 0.1781538216379045, + "learning_rate": 5.8236291513242056e-05, + "loss": 2.7824, + "step": 24289 + }, + { + "epoch": 1.5078527531193742, + "grad_norm": 0.157445257925572, + "learning_rate": 5.823272930318754e-05, + "loss": 2.8581, + "step": 24290 + }, + { + "epoch": 1.507914830219132, + "grad_norm": 0.16226122534732965, + "learning_rate": 5.8229167050180866e-05, + "loss": 2.8709, + "step": 24291 + }, + { + "epoch": 1.50797690731889, + "grad_norm": 0.15602274700331184, + "learning_rate": 5.8225604754240635e-05, + "loss": 2.8648, + "step": 24292 + }, + { + "epoch": 1.508038984418648, + "grad_norm": 0.14776912593173358, + "learning_rate": 5.8222042415385426e-05, + "loss": 2.747, + "step": 24293 + }, + { + "epoch": 1.5081010615184058, + "grad_norm": 0.1503017274263806, + "learning_rate": 5.8218480033633825e-05, + "loss": 2.8617, + "step": 24294 + }, + { + "epoch": 1.5081631386181638, + "grad_norm": 0.1516005716485598, + "learning_rate": 5.8214917609004414e-05, + "loss": 2.862, + "step": 24295 + }, + { + "epoch": 1.5082252157179217, + "grad_norm": 0.1682695368158785, + "learning_rate": 5.821135514151579e-05, + "loss": 2.8322, + "step": 24296 + }, + { + "epoch": 1.5082872928176796, + "grad_norm": 0.15146229125085084, + "learning_rate": 5.820779263118654e-05, + "loss": 2.8246, + "step": 24297 + }, + { + "epoch": 1.5083493699174375, + "grad_norm": 0.1614926914349139, + "learning_rate": 5.8204230078035224e-05, + "loss": 2.8568, + "step": 24298 + }, + { + "epoch": 1.5084114470171954, + "grad_norm": 0.14948720949136612, + "learning_rate": 5.820066748208047e-05, + "loss": 2.8673, + "step": 24299 + }, + { + "epoch": 1.5084735241169533, + "grad_norm": 0.15098093033921095, + "learning_rate": 5.8197104843340835e-05, + "loss": 2.6949, + "step": 24300 + }, + { + "epoch": 1.5085356012167113, + "grad_norm": 0.14606089732580765, + "learning_rate": 5.8193542161834914e-05, + "loss": 2.7768, + "step": 24301 + }, + { + "epoch": 1.5085976783164692, + "grad_norm": 0.1553938536694536, + "learning_rate": 5.8189979437581286e-05, + "loss": 2.8283, + "step": 24302 + }, + { + "epoch": 1.508659755416227, + "grad_norm": 0.1533394698304984, + "learning_rate": 5.8186416670598566e-05, + "loss": 2.7999, + "step": 24303 + }, + { + "epoch": 1.508721832515985, + "grad_norm": 0.15948392310816153, + "learning_rate": 5.818285386090529e-05, + "loss": 2.8921, + "step": 24304 + }, + { + "epoch": 1.5087839096157427, + "grad_norm": 0.18513946980278484, + "learning_rate": 5.817929100852012e-05, + "loss": 2.8463, + "step": 24305 + }, + { + "epoch": 1.5088459867155006, + "grad_norm": 0.17249815853490494, + "learning_rate": 5.817572811346157e-05, + "loss": 2.8138, + "step": 24306 + }, + { + "epoch": 1.5089080638152585, + "grad_norm": 0.1775660923356436, + "learning_rate": 5.817216517574828e-05, + "loss": 2.816, + "step": 24307 + }, + { + "epoch": 1.5089701409150165, + "grad_norm": 0.18948818157729505, + "learning_rate": 5.816860219539882e-05, + "loss": 2.7734, + "step": 24308 + }, + { + "epoch": 1.5090322180147744, + "grad_norm": 0.1722696174107043, + "learning_rate": 5.816503917243178e-05, + "loss": 2.7843, + "step": 24309 + }, + { + "epoch": 1.5090942951145323, + "grad_norm": 0.17160445290989645, + "learning_rate": 5.816147610686574e-05, + "loss": 2.8129, + "step": 24310 + }, + { + "epoch": 1.50915637221429, + "grad_norm": 0.17052693997095994, + "learning_rate": 5.8157912998719296e-05, + "loss": 2.8527, + "step": 24311 + }, + { + "epoch": 1.509218449314048, + "grad_norm": 0.1708485884852624, + "learning_rate": 5.815434984801105e-05, + "loss": 2.8393, + "step": 24312 + }, + { + "epoch": 1.5092805264138058, + "grad_norm": 0.1726260950760218, + "learning_rate": 5.815078665475956e-05, + "loss": 2.8664, + "step": 24313 + }, + { + "epoch": 1.5093426035135638, + "grad_norm": 0.15680593296225606, + "learning_rate": 5.8147223418983466e-05, + "loss": 2.841, + "step": 24314 + }, + { + "epoch": 1.5094046806133217, + "grad_norm": 0.15022277134575157, + "learning_rate": 5.8143660140701315e-05, + "loss": 2.8324, + "step": 24315 + }, + { + "epoch": 1.5094667577130796, + "grad_norm": 0.16248580590942405, + "learning_rate": 5.814009681993171e-05, + "loss": 2.778, + "step": 24316 + }, + { + "epoch": 1.5095288348128375, + "grad_norm": 0.1550047126561197, + "learning_rate": 5.8136533456693255e-05, + "loss": 2.7195, + "step": 24317 + }, + { + "epoch": 1.5095909119125954, + "grad_norm": 0.17613002296890193, + "learning_rate": 5.813297005100452e-05, + "loss": 2.8416, + "step": 24318 + }, + { + "epoch": 1.5096529890123533, + "grad_norm": 0.19414082034003333, + "learning_rate": 5.8129406602884105e-05, + "loss": 2.9094, + "step": 24319 + }, + { + "epoch": 1.5097150661121113, + "grad_norm": 0.14901938076672153, + "learning_rate": 5.812584311235061e-05, + "loss": 2.8973, + "step": 24320 + }, + { + "epoch": 1.5097771432118692, + "grad_norm": 0.16102026852645096, + "learning_rate": 5.812227957942261e-05, + "loss": 2.8042, + "step": 24321 + }, + { + "epoch": 1.509839220311627, + "grad_norm": 0.16616564872171474, + "learning_rate": 5.8118716004118715e-05, + "loss": 2.8508, + "step": 24322 + }, + { + "epoch": 1.509901297411385, + "grad_norm": 0.17197243681270327, + "learning_rate": 5.8115152386457506e-05, + "loss": 2.7858, + "step": 24323 + }, + { + "epoch": 1.509963374511143, + "grad_norm": 0.1770365398421347, + "learning_rate": 5.8111588726457565e-05, + "loss": 2.7827, + "step": 24324 + }, + { + "epoch": 1.5100254516109008, + "grad_norm": 0.1405502160222827, + "learning_rate": 5.810802502413751e-05, + "loss": 2.8878, + "step": 24325 + }, + { + "epoch": 1.5100875287106588, + "grad_norm": 0.15611721595588657, + "learning_rate": 5.810446127951592e-05, + "loss": 2.8484, + "step": 24326 + }, + { + "epoch": 1.5101496058104167, + "grad_norm": 0.14043249756146292, + "learning_rate": 5.810089749261138e-05, + "loss": 2.8029, + "step": 24327 + }, + { + "epoch": 1.5102116829101746, + "grad_norm": 0.1578619657551389, + "learning_rate": 5.80973336634425e-05, + "loss": 2.8173, + "step": 24328 + }, + { + "epoch": 1.5102737600099323, + "grad_norm": 0.14888226858366865, + "learning_rate": 5.809376979202784e-05, + "loss": 2.8252, + "step": 24329 + }, + { + "epoch": 1.5103358371096902, + "grad_norm": 0.14784870573279743, + "learning_rate": 5.809020587838605e-05, + "loss": 2.8233, + "step": 24330 + }, + { + "epoch": 1.5103979142094481, + "grad_norm": 0.15052094324783308, + "learning_rate": 5.808664192253567e-05, + "loss": 2.7921, + "step": 24331 + }, + { + "epoch": 1.510459991309206, + "grad_norm": 0.16435166828840997, + "learning_rate": 5.8083077924495333e-05, + "loss": 2.7782, + "step": 24332 + }, + { + "epoch": 1.510522068408964, + "grad_norm": 0.1463700356879539, + "learning_rate": 5.807951388428361e-05, + "loss": 2.8033, + "step": 24333 + }, + { + "epoch": 1.5105841455087219, + "grad_norm": 0.15303800189130612, + "learning_rate": 5.8075949801919096e-05, + "loss": 2.8152, + "step": 24334 + }, + { + "epoch": 1.5106462226084796, + "grad_norm": 0.14927844206732743, + "learning_rate": 5.80723856774204e-05, + "loss": 2.7762, + "step": 24335 + }, + { + "epoch": 1.5107082997082375, + "grad_norm": 0.14227717134355136, + "learning_rate": 5.8068821510806105e-05, + "loss": 2.7805, + "step": 24336 + }, + { + "epoch": 1.5107703768079954, + "grad_norm": 0.14825606551314258, + "learning_rate": 5.806525730209481e-05, + "loss": 2.8191, + "step": 24337 + }, + { + "epoch": 1.5108324539077533, + "grad_norm": 0.14779952001337243, + "learning_rate": 5.806169305130511e-05, + "loss": 2.8198, + "step": 24338 + }, + { + "epoch": 1.5108945310075113, + "grad_norm": 0.15917582009888945, + "learning_rate": 5.80581287584556e-05, + "loss": 2.8355, + "step": 24339 + }, + { + "epoch": 1.5109566081072692, + "grad_norm": 0.14923094454962244, + "learning_rate": 5.805456442356487e-05, + "loss": 2.7971, + "step": 24340 + }, + { + "epoch": 1.511018685207027, + "grad_norm": 0.1558565012827038, + "learning_rate": 5.805100004665153e-05, + "loss": 2.8443, + "step": 24341 + }, + { + "epoch": 1.511080762306785, + "grad_norm": 0.15533078532161132, + "learning_rate": 5.804743562773416e-05, + "loss": 2.7313, + "step": 24342 + }, + { + "epoch": 1.511142839406543, + "grad_norm": 0.14276170669352284, + "learning_rate": 5.8043871166831375e-05, + "loss": 2.8564, + "step": 24343 + }, + { + "epoch": 1.5112049165063008, + "grad_norm": 0.18480021171173233, + "learning_rate": 5.804030666396176e-05, + "loss": 2.7939, + "step": 24344 + }, + { + "epoch": 1.5112669936060588, + "grad_norm": 0.14530295473968227, + "learning_rate": 5.803674211914389e-05, + "loss": 2.7616, + "step": 24345 + }, + { + "epoch": 1.5113290707058167, + "grad_norm": 0.14411226965445512, + "learning_rate": 5.803317753239641e-05, + "loss": 2.8405, + "step": 24346 + }, + { + "epoch": 1.5113911478055746, + "grad_norm": 0.14954224865921398, + "learning_rate": 5.802961290373788e-05, + "loss": 2.7575, + "step": 24347 + }, + { + "epoch": 1.5114532249053325, + "grad_norm": 0.1443331172290756, + "learning_rate": 5.802604823318694e-05, + "loss": 2.804, + "step": 24348 + }, + { + "epoch": 1.5115153020050904, + "grad_norm": 0.1463646526641479, + "learning_rate": 5.802248352076212e-05, + "loss": 2.8211, + "step": 24349 + }, + { + "epoch": 1.5115773791048484, + "grad_norm": 0.1460401666263086, + "learning_rate": 5.8018918766482076e-05, + "loss": 2.8828, + "step": 24350 + }, + { + "epoch": 1.5116394562046063, + "grad_norm": 0.13288316529701663, + "learning_rate": 5.8015353970365395e-05, + "loss": 2.7649, + "step": 24351 + }, + { + "epoch": 1.511701533304364, + "grad_norm": 0.16224724869864052, + "learning_rate": 5.8011789132430636e-05, + "loss": 2.7873, + "step": 24352 + }, + { + "epoch": 1.5117636104041219, + "grad_norm": 0.14423692220927029, + "learning_rate": 5.8008224252696454e-05, + "loss": 2.7615, + "step": 24353 + }, + { + "epoch": 1.5118256875038798, + "grad_norm": 0.1565858747637004, + "learning_rate": 5.800465933118141e-05, + "loss": 2.7322, + "step": 24354 + }, + { + "epoch": 1.5118877646036377, + "grad_norm": 0.1635093617321903, + "learning_rate": 5.800109436790413e-05, + "loss": 2.7834, + "step": 24355 + }, + { + "epoch": 1.5119498417033956, + "grad_norm": 0.164696495898043, + "learning_rate": 5.799752936288317e-05, + "loss": 2.8816, + "step": 24356 + }, + { + "epoch": 1.5120119188031536, + "grad_norm": 0.1546489706124186, + "learning_rate": 5.799396431613718e-05, + "loss": 2.8288, + "step": 24357 + }, + { + "epoch": 1.5120739959029112, + "grad_norm": 0.14821052137669055, + "learning_rate": 5.799039922768473e-05, + "loss": 2.8441, + "step": 24358 + }, + { + "epoch": 1.5121360730026692, + "grad_norm": 0.1484396708480305, + "learning_rate": 5.798683409754444e-05, + "loss": 2.8215, + "step": 24359 + }, + { + "epoch": 1.512198150102427, + "grad_norm": 0.1551848224673156, + "learning_rate": 5.798326892573488e-05, + "loss": 2.8186, + "step": 24360 + }, + { + "epoch": 1.512260227202185, + "grad_norm": 0.1783928045891622, + "learning_rate": 5.797970371227468e-05, + "loss": 2.7712, + "step": 24361 + }, + { + "epoch": 1.512322304301943, + "grad_norm": 0.1511867370495542, + "learning_rate": 5.7976138457182414e-05, + "loss": 2.769, + "step": 24362 + }, + { + "epoch": 1.5123843814017008, + "grad_norm": 0.15338088011603968, + "learning_rate": 5.797257316047671e-05, + "loss": 2.7816, + "step": 24363 + }, + { + "epoch": 1.5124464585014588, + "grad_norm": 0.16498017812463708, + "learning_rate": 5.7969007822176155e-05, + "loss": 2.839, + "step": 24364 + }, + { + "epoch": 1.5125085356012167, + "grad_norm": 0.15085793012957324, + "learning_rate": 5.796544244229934e-05, + "loss": 2.8008, + "step": 24365 + }, + { + "epoch": 1.5125706127009746, + "grad_norm": 0.16316646027121706, + "learning_rate": 5.796187702086491e-05, + "loss": 2.7485, + "step": 24366 + }, + { + "epoch": 1.5126326898007325, + "grad_norm": 0.15796904259191444, + "learning_rate": 5.79583115578914e-05, + "loss": 2.7607, + "step": 24367 + }, + { + "epoch": 1.5126947669004904, + "grad_norm": 0.1648944751832576, + "learning_rate": 5.795474605339747e-05, + "loss": 2.804, + "step": 24368 + }, + { + "epoch": 1.5127568440002483, + "grad_norm": 0.15837516378501124, + "learning_rate": 5.795118050740169e-05, + "loss": 2.7305, + "step": 24369 + }, + { + "epoch": 1.5128189211000063, + "grad_norm": 0.14486456146435736, + "learning_rate": 5.7947614919922666e-05, + "loss": 2.8672, + "step": 24370 + }, + { + "epoch": 1.5128809981997642, + "grad_norm": 0.14937562385732067, + "learning_rate": 5.7944049290979006e-05, + "loss": 2.8473, + "step": 24371 + }, + { + "epoch": 1.512943075299522, + "grad_norm": 0.14461635326486763, + "learning_rate": 5.794048362058931e-05, + "loss": 2.831, + "step": 24372 + }, + { + "epoch": 1.51300515239928, + "grad_norm": 0.14464345648892005, + "learning_rate": 5.79369179087722e-05, + "loss": 2.7902, + "step": 24373 + }, + { + "epoch": 1.513067229499038, + "grad_norm": 0.17874623559775965, + "learning_rate": 5.793335215554625e-05, + "loss": 2.8249, + "step": 24374 + }, + { + "epoch": 1.5131293065987959, + "grad_norm": 0.14325025937795874, + "learning_rate": 5.792978636093008e-05, + "loss": 2.8362, + "step": 24375 + }, + { + "epoch": 1.5131913836985535, + "grad_norm": 0.1573920297373481, + "learning_rate": 5.792622052494229e-05, + "loss": 2.8771, + "step": 24376 + }, + { + "epoch": 1.5132534607983115, + "grad_norm": 0.16445415727558677, + "learning_rate": 5.792265464760148e-05, + "loss": 2.7921, + "step": 24377 + }, + { + "epoch": 1.5133155378980694, + "grad_norm": 0.18757063806631, + "learning_rate": 5.791908872892625e-05, + "loss": 2.9229, + "step": 24378 + }, + { + "epoch": 1.5133776149978273, + "grad_norm": 0.1790462433424105, + "learning_rate": 5.7915522768935213e-05, + "loss": 2.8291, + "step": 24379 + }, + { + "epoch": 1.5134396920975852, + "grad_norm": 0.1448431837303343, + "learning_rate": 5.791195676764699e-05, + "loss": 2.8034, + "step": 24380 + }, + { + "epoch": 1.5135017691973431, + "grad_norm": 0.16154047519814962, + "learning_rate": 5.7908390725080154e-05, + "loss": 2.826, + "step": 24381 + }, + { + "epoch": 1.5135638462971008, + "grad_norm": 0.14917346806533113, + "learning_rate": 5.790482464125333e-05, + "loss": 2.7584, + "step": 24382 + }, + { + "epoch": 1.5136259233968588, + "grad_norm": 0.16423069189117423, + "learning_rate": 5.790125851618511e-05, + "loss": 2.7398, + "step": 24383 + }, + { + "epoch": 1.5136880004966167, + "grad_norm": 0.15825535653752734, + "learning_rate": 5.789769234989411e-05, + "loss": 2.8822, + "step": 24384 + }, + { + "epoch": 1.5137500775963746, + "grad_norm": 0.1578938793694486, + "learning_rate": 5.7894126142398933e-05, + "loss": 2.8049, + "step": 24385 + }, + { + "epoch": 1.5138121546961325, + "grad_norm": 0.1596031699092423, + "learning_rate": 5.789055989371818e-05, + "loss": 2.8055, + "step": 24386 + }, + { + "epoch": 1.5138742317958904, + "grad_norm": 0.14431904989514294, + "learning_rate": 5.788699360387047e-05, + "loss": 2.7927, + "step": 24387 + }, + { + "epoch": 1.5139363088956483, + "grad_norm": 0.15265298294233823, + "learning_rate": 5.788342727287439e-05, + "loss": 2.8493, + "step": 24388 + }, + { + "epoch": 1.5139983859954063, + "grad_norm": 0.1436116890395559, + "learning_rate": 5.787986090074856e-05, + "loss": 2.8137, + "step": 24389 + }, + { + "epoch": 1.5140604630951642, + "grad_norm": 0.15715734960520647, + "learning_rate": 5.787629448751157e-05, + "loss": 2.8857, + "step": 24390 + }, + { + "epoch": 1.514122540194922, + "grad_norm": 0.15847984884125377, + "learning_rate": 5.7872728033182064e-05, + "loss": 2.85, + "step": 24391 + }, + { + "epoch": 1.51418461729468, + "grad_norm": 0.1497829035968738, + "learning_rate": 5.786916153777861e-05, + "loss": 2.8042, + "step": 24392 + }, + { + "epoch": 1.514246694394438, + "grad_norm": 0.14923481380416906, + "learning_rate": 5.786559500131983e-05, + "loss": 2.8628, + "step": 24393 + }, + { + "epoch": 1.5143087714941958, + "grad_norm": 0.1443707309335313, + "learning_rate": 5.786202842382432e-05, + "loss": 2.7492, + "step": 24394 + }, + { + "epoch": 1.5143708485939538, + "grad_norm": 0.175036564268346, + "learning_rate": 5.7858461805310715e-05, + "loss": 2.7603, + "step": 24395 + }, + { + "epoch": 1.5144329256937117, + "grad_norm": 0.16664170969928832, + "learning_rate": 5.7854895145797606e-05, + "loss": 2.8541, + "step": 24396 + }, + { + "epoch": 1.5144950027934696, + "grad_norm": 0.16657146079059132, + "learning_rate": 5.78513284453036e-05, + "loss": 2.8552, + "step": 24397 + }, + { + "epoch": 1.5145570798932275, + "grad_norm": 0.17970239536079893, + "learning_rate": 5.784776170384731e-05, + "loss": 2.7565, + "step": 24398 + }, + { + "epoch": 1.5146191569929854, + "grad_norm": 0.16306169980789656, + "learning_rate": 5.7844194921447334e-05, + "loss": 2.8045, + "step": 24399 + }, + { + "epoch": 1.5146812340927431, + "grad_norm": 0.16127231223596158, + "learning_rate": 5.7840628098122295e-05, + "loss": 2.8423, + "step": 24400 + }, + { + "epoch": 1.514743311192501, + "grad_norm": 0.1624427607485463, + "learning_rate": 5.783706123389079e-05, + "loss": 2.8671, + "step": 24401 + }, + { + "epoch": 1.514805388292259, + "grad_norm": 0.1507448570032886, + "learning_rate": 5.783349432877145e-05, + "loss": 2.8038, + "step": 24402 + }, + { + "epoch": 1.5148674653920169, + "grad_norm": 0.16908594022737722, + "learning_rate": 5.7829927382782856e-05, + "loss": 2.8112, + "step": 24403 + }, + { + "epoch": 1.5149295424917748, + "grad_norm": 0.1442702165709807, + "learning_rate": 5.782636039594364e-05, + "loss": 2.7915, + "step": 24404 + }, + { + "epoch": 1.5149916195915327, + "grad_norm": 0.151848519954355, + "learning_rate": 5.7822793368272374e-05, + "loss": 2.8019, + "step": 24405 + }, + { + "epoch": 1.5150536966912904, + "grad_norm": 0.16872501894811856, + "learning_rate": 5.781922629978773e-05, + "loss": 2.8133, + "step": 24406 + }, + { + "epoch": 1.5151157737910483, + "grad_norm": 0.15416367733050443, + "learning_rate": 5.781565919050828e-05, + "loss": 2.7794, + "step": 24407 + }, + { + "epoch": 1.5151778508908063, + "grad_norm": 0.1813651631604022, + "learning_rate": 5.781209204045261e-05, + "loss": 2.7854, + "step": 24408 + }, + { + "epoch": 1.5152399279905642, + "grad_norm": 0.14751502807079425, + "learning_rate": 5.780852484963939e-05, + "loss": 2.8035, + "step": 24409 + }, + { + "epoch": 1.515302005090322, + "grad_norm": 0.17913410760508755, + "learning_rate": 5.780495761808719e-05, + "loss": 2.7642, + "step": 24410 + }, + { + "epoch": 1.51536408219008, + "grad_norm": 0.1600010631756924, + "learning_rate": 5.780139034581462e-05, + "loss": 2.7999, + "step": 24411 + }, + { + "epoch": 1.515426159289838, + "grad_norm": 0.14708267099197142, + "learning_rate": 5.7797823032840315e-05, + "loss": 2.7164, + "step": 24412 + }, + { + "epoch": 1.5154882363895958, + "grad_norm": 0.16129325623810112, + "learning_rate": 5.7794255679182866e-05, + "loss": 2.8339, + "step": 24413 + }, + { + "epoch": 1.5155503134893538, + "grad_norm": 0.16508144195697705, + "learning_rate": 5.77906882848609e-05, + "loss": 2.8338, + "step": 24414 + }, + { + "epoch": 1.5156123905891117, + "grad_norm": 0.16119753375852727, + "learning_rate": 5.7787120849893014e-05, + "loss": 2.8533, + "step": 24415 + }, + { + "epoch": 1.5156744676888696, + "grad_norm": 0.14740006662047653, + "learning_rate": 5.778355337429785e-05, + "loss": 2.8434, + "step": 24416 + }, + { + "epoch": 1.5157365447886275, + "grad_norm": 0.16201807231528106, + "learning_rate": 5.7779985858093976e-05, + "loss": 2.7721, + "step": 24417 + }, + { + "epoch": 1.5157986218883854, + "grad_norm": 0.15128687912409272, + "learning_rate": 5.777641830130004e-05, + "loss": 2.8717, + "step": 24418 + }, + { + "epoch": 1.5158606989881434, + "grad_norm": 0.15601929679870755, + "learning_rate": 5.777285070393463e-05, + "loss": 2.8485, + "step": 24419 + }, + { + "epoch": 1.5159227760879013, + "grad_norm": 0.15158629402701057, + "learning_rate": 5.7769283066016375e-05, + "loss": 2.8035, + "step": 24420 + }, + { + "epoch": 1.5159848531876592, + "grad_norm": 0.1637940879158176, + "learning_rate": 5.776571538756389e-05, + "loss": 2.821, + "step": 24421 + }, + { + "epoch": 1.516046930287417, + "grad_norm": 0.18600707956141085, + "learning_rate": 5.7762147668595776e-05, + "loss": 2.7095, + "step": 24422 + }, + { + "epoch": 1.516109007387175, + "grad_norm": 0.15918518712191135, + "learning_rate": 5.775857990913066e-05, + "loss": 2.7693, + "step": 24423 + }, + { + "epoch": 1.5161710844869327, + "grad_norm": 0.14635925671582956, + "learning_rate": 5.7755012109187146e-05, + "loss": 2.8207, + "step": 24424 + }, + { + "epoch": 1.5162331615866906, + "grad_norm": 0.17052311692041908, + "learning_rate": 5.775144426878386e-05, + "loss": 2.7723, + "step": 24425 + }, + { + "epoch": 1.5162952386864486, + "grad_norm": 0.1418491829266377, + "learning_rate": 5.774787638793939e-05, + "loss": 2.7862, + "step": 24426 + }, + { + "epoch": 1.5163573157862065, + "grad_norm": 0.15145583479533192, + "learning_rate": 5.774430846667237e-05, + "loss": 2.8451, + "step": 24427 + }, + { + "epoch": 1.5164193928859644, + "grad_norm": 0.14495834189958903, + "learning_rate": 5.774074050500141e-05, + "loss": 2.8378, + "step": 24428 + }, + { + "epoch": 1.5164814699857223, + "grad_norm": 0.1640547852745911, + "learning_rate": 5.773717250294515e-05, + "loss": 2.7481, + "step": 24429 + }, + { + "epoch": 1.51654354708548, + "grad_norm": 0.14572770829206097, + "learning_rate": 5.773360446052216e-05, + "loss": 2.8498, + "step": 24430 + }, + { + "epoch": 1.516605624185238, + "grad_norm": 0.16427644335924413, + "learning_rate": 5.77300363777511e-05, + "loss": 2.7961, + "step": 24431 + }, + { + "epoch": 1.5166677012849958, + "grad_norm": 0.1467626725489456, + "learning_rate": 5.772646825465056e-05, + "loss": 2.7874, + "step": 24432 + }, + { + "epoch": 1.5167297783847538, + "grad_norm": 0.14999805339901187, + "learning_rate": 5.772290009123914e-05, + "loss": 2.7698, + "step": 24433 + }, + { + "epoch": 1.5167918554845117, + "grad_norm": 0.17102756526524832, + "learning_rate": 5.771933188753549e-05, + "loss": 2.7245, + "step": 24434 + }, + { + "epoch": 1.5168539325842696, + "grad_norm": 0.1694523309026871, + "learning_rate": 5.77157636435582e-05, + "loss": 2.8593, + "step": 24435 + }, + { + "epoch": 1.5169160096840275, + "grad_norm": 0.15461150817016336, + "learning_rate": 5.771219535932592e-05, + "loss": 2.8939, + "step": 24436 + }, + { + "epoch": 1.5169780867837854, + "grad_norm": 0.15414586705508507, + "learning_rate": 5.770862703485722e-05, + "loss": 2.8214, + "step": 24437 + }, + { + "epoch": 1.5170401638835433, + "grad_norm": 0.14452677797151964, + "learning_rate": 5.7705058670170756e-05, + "loss": 2.8174, + "step": 24438 + }, + { + "epoch": 1.5171022409833013, + "grad_norm": 0.1507508249667513, + "learning_rate": 5.770149026528512e-05, + "loss": 2.8317, + "step": 24439 + }, + { + "epoch": 1.5171643180830592, + "grad_norm": 0.1618976934462182, + "learning_rate": 5.7697921820218946e-05, + "loss": 2.8034, + "step": 24440 + }, + { + "epoch": 1.517226395182817, + "grad_norm": 0.18389183559986771, + "learning_rate": 5.769435333499085e-05, + "loss": 2.8173, + "step": 24441 + }, + { + "epoch": 1.517288472282575, + "grad_norm": 0.1551426630707494, + "learning_rate": 5.769078480961944e-05, + "loss": 2.7544, + "step": 24442 + }, + { + "epoch": 1.517350549382333, + "grad_norm": 0.16263372576788315, + "learning_rate": 5.7687216244123335e-05, + "loss": 2.8124, + "step": 24443 + }, + { + "epoch": 1.5174126264820909, + "grad_norm": 0.1705815838414796, + "learning_rate": 5.768364763852116e-05, + "loss": 2.843, + "step": 24444 + }, + { + "epoch": 1.5174747035818488, + "grad_norm": 0.15148595641886067, + "learning_rate": 5.768007899283153e-05, + "loss": 2.816, + "step": 24445 + }, + { + "epoch": 1.5175367806816067, + "grad_norm": 0.16468663366513336, + "learning_rate": 5.767651030707306e-05, + "loss": 2.8528, + "step": 24446 + }, + { + "epoch": 1.5175988577813646, + "grad_norm": 0.18305976805987856, + "learning_rate": 5.767294158126437e-05, + "loss": 2.7193, + "step": 24447 + }, + { + "epoch": 1.5176609348811223, + "grad_norm": 0.14987422169872225, + "learning_rate": 5.7669372815424094e-05, + "loss": 2.8457, + "step": 24448 + }, + { + "epoch": 1.5177230119808802, + "grad_norm": 0.1523104411171128, + "learning_rate": 5.766580400957083e-05, + "loss": 2.8409, + "step": 24449 + }, + { + "epoch": 1.5177850890806381, + "grad_norm": 0.15490592192452915, + "learning_rate": 5.766223516372321e-05, + "loss": 2.8178, + "step": 24450 + }, + { + "epoch": 1.517847166180396, + "grad_norm": 0.14317851182325156, + "learning_rate": 5.765866627789984e-05, + "loss": 2.7994, + "step": 24451 + }, + { + "epoch": 1.517909243280154, + "grad_norm": 0.15564508765196944, + "learning_rate": 5.765509735211937e-05, + "loss": 2.8827, + "step": 24452 + }, + { + "epoch": 1.517971320379912, + "grad_norm": 0.1451603882921386, + "learning_rate": 5.7651528386400376e-05, + "loss": 2.8353, + "step": 24453 + }, + { + "epoch": 1.5180333974796696, + "grad_norm": 0.15839134604952324, + "learning_rate": 5.7647959380761505e-05, + "loss": 2.7854, + "step": 24454 + }, + { + "epoch": 1.5180954745794275, + "grad_norm": 0.1559286389097988, + "learning_rate": 5.764439033522138e-05, + "loss": 2.7979, + "step": 24455 + }, + { + "epoch": 1.5181575516791854, + "grad_norm": 0.17894445078104582, + "learning_rate": 5.764082124979862e-05, + "loss": 2.8609, + "step": 24456 + }, + { + "epoch": 1.5182196287789433, + "grad_norm": 0.147967328011388, + "learning_rate": 5.763725212451182e-05, + "loss": 2.8297, + "step": 24457 + }, + { + "epoch": 1.5182817058787013, + "grad_norm": 0.14592531332026795, + "learning_rate": 5.7633682959379644e-05, + "loss": 2.8945, + "step": 24458 + }, + { + "epoch": 1.5183437829784592, + "grad_norm": 0.20827376686326857, + "learning_rate": 5.763011375442067e-05, + "loss": 2.8465, + "step": 24459 + }, + { + "epoch": 1.518405860078217, + "grad_norm": 0.16516586939750483, + "learning_rate": 5.7626544509653546e-05, + "loss": 2.856, + "step": 24460 + }, + { + "epoch": 1.518467937177975, + "grad_norm": 0.17454702151525178, + "learning_rate": 5.76229752250969e-05, + "loss": 2.7581, + "step": 24461 + }, + { + "epoch": 1.518530014277733, + "grad_norm": 0.1663961033570755, + "learning_rate": 5.761940590076933e-05, + "loss": 2.908, + "step": 24462 + }, + { + "epoch": 1.5185920913774908, + "grad_norm": 0.16548022159294748, + "learning_rate": 5.7615836536689484e-05, + "loss": 2.7828, + "step": 24463 + }, + { + "epoch": 1.5186541684772488, + "grad_norm": 0.16431016620634198, + "learning_rate": 5.761226713287595e-05, + "loss": 2.7507, + "step": 24464 + }, + { + "epoch": 1.5187162455770067, + "grad_norm": 0.16937949226552326, + "learning_rate": 5.760869768934739e-05, + "loss": 2.8164, + "step": 24465 + }, + { + "epoch": 1.5187783226767646, + "grad_norm": 0.16093400770861124, + "learning_rate": 5.760512820612238e-05, + "loss": 2.8303, + "step": 24466 + }, + { + "epoch": 1.5188403997765225, + "grad_norm": 0.14933234205215873, + "learning_rate": 5.76015586832196e-05, + "loss": 2.8409, + "step": 24467 + }, + { + "epoch": 1.5189024768762804, + "grad_norm": 0.1665160602597181, + "learning_rate": 5.759798912065764e-05, + "loss": 2.8742, + "step": 24468 + }, + { + "epoch": 1.5189645539760384, + "grad_norm": 0.1410064953202438, + "learning_rate": 5.759441951845511e-05, + "loss": 2.6815, + "step": 24469 + }, + { + "epoch": 1.5190266310757963, + "grad_norm": 0.15874622278525766, + "learning_rate": 5.759084987663066e-05, + "loss": 2.8138, + "step": 24470 + }, + { + "epoch": 1.5190887081755542, + "grad_norm": 0.17509372087642136, + "learning_rate": 5.75872801952029e-05, + "loss": 2.8443, + "step": 24471 + }, + { + "epoch": 1.5191507852753119, + "grad_norm": 0.1553932902978746, + "learning_rate": 5.758371047419045e-05, + "loss": 2.8625, + "step": 24472 + }, + { + "epoch": 1.5192128623750698, + "grad_norm": 0.18698714570435795, + "learning_rate": 5.7580140713611953e-05, + "loss": 2.7429, + "step": 24473 + }, + { + "epoch": 1.5192749394748277, + "grad_norm": 0.1641505379840408, + "learning_rate": 5.757657091348603e-05, + "loss": 2.8963, + "step": 24474 + }, + { + "epoch": 1.5193370165745856, + "grad_norm": 0.17093656931942833, + "learning_rate": 5.7573001073831276e-05, + "loss": 2.7676, + "step": 24475 + }, + { + "epoch": 1.5193990936743436, + "grad_norm": 0.16355009897527029, + "learning_rate": 5.7569431194666335e-05, + "loss": 2.8036, + "step": 24476 + }, + { + "epoch": 1.5194611707741015, + "grad_norm": 0.1492688214840203, + "learning_rate": 5.756586127600986e-05, + "loss": 2.797, + "step": 24477 + }, + { + "epoch": 1.5195232478738592, + "grad_norm": 0.1508952937439319, + "learning_rate": 5.7562291317880425e-05, + "loss": 2.7939, + "step": 24478 + }, + { + "epoch": 1.519585324973617, + "grad_norm": 0.15479904256920113, + "learning_rate": 5.755872132029669e-05, + "loss": 2.8394, + "step": 24479 + }, + { + "epoch": 1.519647402073375, + "grad_norm": 0.1764661321292155, + "learning_rate": 5.755515128327727e-05, + "loss": 2.8603, + "step": 24480 + }, + { + "epoch": 1.519709479173133, + "grad_norm": 0.1591326872353981, + "learning_rate": 5.75515812068408e-05, + "loss": 2.8575, + "step": 24481 + }, + { + "epoch": 1.5197715562728908, + "grad_norm": 0.15016006160229087, + "learning_rate": 5.754801109100588e-05, + "loss": 2.7679, + "step": 24482 + }, + { + "epoch": 1.5198336333726488, + "grad_norm": 0.1520162081260762, + "learning_rate": 5.754444093579118e-05, + "loss": 2.8578, + "step": 24483 + }, + { + "epoch": 1.5198957104724067, + "grad_norm": 0.16453568590239734, + "learning_rate": 5.754087074121527e-05, + "loss": 2.8311, + "step": 24484 + }, + { + "epoch": 1.5199577875721646, + "grad_norm": 0.14017559137075042, + "learning_rate": 5.753730050729683e-05, + "loss": 2.7501, + "step": 24485 + }, + { + "epoch": 1.5200198646719225, + "grad_norm": 0.14331922878501316, + "learning_rate": 5.7533730234054465e-05, + "loss": 2.8767, + "step": 24486 + }, + { + "epoch": 1.5200819417716804, + "grad_norm": 0.16664890229404136, + "learning_rate": 5.7530159921506776e-05, + "loss": 2.7887, + "step": 24487 + }, + { + "epoch": 1.5201440188714384, + "grad_norm": 0.14098898485416272, + "learning_rate": 5.752658956967244e-05, + "loss": 2.7568, + "step": 24488 + }, + { + "epoch": 1.5202060959711963, + "grad_norm": 0.14486096765389272, + "learning_rate": 5.752301917857004e-05, + "loss": 2.8542, + "step": 24489 + }, + { + "epoch": 1.5202681730709542, + "grad_norm": 0.16622732913172214, + "learning_rate": 5.751944874821824e-05, + "loss": 2.8185, + "step": 24490 + }, + { + "epoch": 1.520330250170712, + "grad_norm": 0.14810967006909503, + "learning_rate": 5.751587827863564e-05, + "loss": 2.8702, + "step": 24491 + }, + { + "epoch": 1.52039232727047, + "grad_norm": 0.15310280551435979, + "learning_rate": 5.751230776984088e-05, + "loss": 2.8865, + "step": 24492 + }, + { + "epoch": 1.520454404370228, + "grad_norm": 0.14435373823687084, + "learning_rate": 5.750873722185258e-05, + "loss": 2.7522, + "step": 24493 + }, + { + "epoch": 1.5205164814699859, + "grad_norm": 0.1668181990900183, + "learning_rate": 5.750516663468938e-05, + "loss": 2.8563, + "step": 24494 + }, + { + "epoch": 1.5205785585697438, + "grad_norm": 0.15665827696885443, + "learning_rate": 5.7501596008369906e-05, + "loss": 2.855, + "step": 24495 + }, + { + "epoch": 1.5206406356695015, + "grad_norm": 0.15713446745387186, + "learning_rate": 5.749802534291278e-05, + "loss": 2.8701, + "step": 24496 + }, + { + "epoch": 1.5207027127692594, + "grad_norm": 0.1538580497067885, + "learning_rate": 5.749445463833665e-05, + "loss": 2.8285, + "step": 24497 + }, + { + "epoch": 1.5207647898690173, + "grad_norm": 0.15210175857127678, + "learning_rate": 5.749088389466011e-05, + "loss": 2.7881, + "step": 24498 + }, + { + "epoch": 1.5208268669687752, + "grad_norm": 0.15385191776523435, + "learning_rate": 5.748731311190183e-05, + "loss": 2.8143, + "step": 24499 + }, + { + "epoch": 1.5208889440685331, + "grad_norm": 0.14839947364005596, + "learning_rate": 5.74837422900804e-05, + "loss": 2.8476, + "step": 24500 + }, + { + "epoch": 1.520951021168291, + "grad_norm": 0.17036602465343412, + "learning_rate": 5.748017142921448e-05, + "loss": 2.853, + "step": 24501 + }, + { + "epoch": 1.5210130982680488, + "grad_norm": 0.14621216595373407, + "learning_rate": 5.747660052932269e-05, + "loss": 2.901, + "step": 24502 + }, + { + "epoch": 1.5210751753678067, + "grad_norm": 0.16908264642114204, + "learning_rate": 5.747302959042367e-05, + "loss": 2.8081, + "step": 24503 + }, + { + "epoch": 1.5211372524675646, + "grad_norm": 0.14154716630756836, + "learning_rate": 5.746945861253603e-05, + "loss": 2.788, + "step": 24504 + }, + { + "epoch": 1.5211993295673225, + "grad_norm": 0.17421274281738236, + "learning_rate": 5.74658875956784e-05, + "loss": 2.9293, + "step": 24505 + }, + { + "epoch": 1.5212614066670804, + "grad_norm": 0.15063166808370315, + "learning_rate": 5.746231653986944e-05, + "loss": 2.8981, + "step": 24506 + }, + { + "epoch": 1.5213234837668383, + "grad_norm": 0.16940186393082773, + "learning_rate": 5.745874544512776e-05, + "loss": 2.8739, + "step": 24507 + }, + { + "epoch": 1.5213855608665963, + "grad_norm": 0.16167008386417756, + "learning_rate": 5.745517431147199e-05, + "loss": 2.8885, + "step": 24508 + }, + { + "epoch": 1.5214476379663542, + "grad_norm": 0.15768997539591253, + "learning_rate": 5.745160313892075e-05, + "loss": 2.8261, + "step": 24509 + }, + { + "epoch": 1.521509715066112, + "grad_norm": 0.15301927546475716, + "learning_rate": 5.744803192749271e-05, + "loss": 2.7806, + "step": 24510 + }, + { + "epoch": 1.52157179216587, + "grad_norm": 0.1717553209736168, + "learning_rate": 5.744446067720648e-05, + "loss": 2.8075, + "step": 24511 + }, + { + "epoch": 1.521633869265628, + "grad_norm": 0.15553227329741467, + "learning_rate": 5.744088938808068e-05, + "loss": 2.8132, + "step": 24512 + }, + { + "epoch": 1.5216959463653859, + "grad_norm": 0.16602689480396068, + "learning_rate": 5.743731806013397e-05, + "loss": 2.8474, + "step": 24513 + }, + { + "epoch": 1.5217580234651438, + "grad_norm": 0.1529514894011253, + "learning_rate": 5.743374669338495e-05, + "loss": 2.868, + "step": 24514 + }, + { + "epoch": 1.5218201005649017, + "grad_norm": 0.14717867659790562, + "learning_rate": 5.743017528785227e-05, + "loss": 2.8503, + "step": 24515 + }, + { + "epoch": 1.5218821776646596, + "grad_norm": 0.16558114024396847, + "learning_rate": 5.742660384355456e-05, + "loss": 2.7453, + "step": 24516 + }, + { + "epoch": 1.5219442547644175, + "grad_norm": 0.15089699102859386, + "learning_rate": 5.742303236051047e-05, + "loss": 2.7383, + "step": 24517 + }, + { + "epoch": 1.5220063318641754, + "grad_norm": 0.1484888326828188, + "learning_rate": 5.74194608387386e-05, + "loss": 2.6689, + "step": 24518 + }, + { + "epoch": 1.5220684089639334, + "grad_norm": 0.17413188346770023, + "learning_rate": 5.741588927825762e-05, + "loss": 2.8858, + "step": 24519 + }, + { + "epoch": 1.522130486063691, + "grad_norm": 0.16384644701060502, + "learning_rate": 5.741231767908613e-05, + "loss": 2.8227, + "step": 24520 + }, + { + "epoch": 1.522192563163449, + "grad_norm": 0.15365306317296878, + "learning_rate": 5.740874604124279e-05, + "loss": 2.763, + "step": 24521 + }, + { + "epoch": 1.522254640263207, + "grad_norm": 0.1916947720401059, + "learning_rate": 5.740517436474623e-05, + "loss": 2.847, + "step": 24522 + }, + { + "epoch": 1.5223167173629648, + "grad_norm": 0.15429147276294003, + "learning_rate": 5.740160264961505e-05, + "loss": 2.8371, + "step": 24523 + }, + { + "epoch": 1.5223787944627227, + "grad_norm": 0.16204702373192886, + "learning_rate": 5.739803089586794e-05, + "loss": 2.8524, + "step": 24524 + }, + { + "epoch": 1.5224408715624806, + "grad_norm": 0.15868528501002194, + "learning_rate": 5.739445910352348e-05, + "loss": 2.842, + "step": 24525 + }, + { + "epoch": 1.5225029486622383, + "grad_norm": 0.14649148923764346, + "learning_rate": 5.7390887272600347e-05, + "loss": 2.7892, + "step": 24526 + }, + { + "epoch": 1.5225650257619963, + "grad_norm": 0.15346894076115478, + "learning_rate": 5.738731540311716e-05, + "loss": 2.8615, + "step": 24527 + }, + { + "epoch": 1.5226271028617542, + "grad_norm": 0.19947714036820402, + "learning_rate": 5.7383743495092555e-05, + "loss": 2.8191, + "step": 24528 + }, + { + "epoch": 1.522689179961512, + "grad_norm": 0.14521714435081295, + "learning_rate": 5.738017154854518e-05, + "loss": 2.7396, + "step": 24529 + }, + { + "epoch": 1.52275125706127, + "grad_norm": 0.15218433573642884, + "learning_rate": 5.737659956349364e-05, + "loss": 2.7856, + "step": 24530 + }, + { + "epoch": 1.522813334161028, + "grad_norm": 0.15104008679276207, + "learning_rate": 5.7373027539956604e-05, + "loss": 2.7537, + "step": 24531 + }, + { + "epoch": 1.5228754112607858, + "grad_norm": 0.17373948569696915, + "learning_rate": 5.736945547795268e-05, + "loss": 2.7962, + "step": 24532 + }, + { + "epoch": 1.5229374883605438, + "grad_norm": 0.1604974299303806, + "learning_rate": 5.736588337750053e-05, + "loss": 2.8552, + "step": 24533 + }, + { + "epoch": 1.5229995654603017, + "grad_norm": 0.15380831521629665, + "learning_rate": 5.736231123861876e-05, + "loss": 2.8584, + "step": 24534 + }, + { + "epoch": 1.5230616425600596, + "grad_norm": 0.16189026674978946, + "learning_rate": 5.735873906132605e-05, + "loss": 2.8477, + "step": 24535 + }, + { + "epoch": 1.5231237196598175, + "grad_norm": 0.1557751294046127, + "learning_rate": 5.7355166845641e-05, + "loss": 2.7154, + "step": 24536 + }, + { + "epoch": 1.5231857967595754, + "grad_norm": 0.16303940640294026, + "learning_rate": 5.7351594591582256e-05, + "loss": 2.698, + "step": 24537 + }, + { + "epoch": 1.5232478738593334, + "grad_norm": 0.15527084036520442, + "learning_rate": 5.7348022299168446e-05, + "loss": 2.8462, + "step": 24538 + }, + { + "epoch": 1.5233099509590913, + "grad_norm": 0.1714955932319326, + "learning_rate": 5.7344449968418244e-05, + "loss": 2.8128, + "step": 24539 + }, + { + "epoch": 1.5233720280588492, + "grad_norm": 0.1609848788343408, + "learning_rate": 5.734087759935026e-05, + "loss": 2.8114, + "step": 24540 + }, + { + "epoch": 1.523434105158607, + "grad_norm": 0.15466519951222993, + "learning_rate": 5.7337305191983126e-05, + "loss": 2.7955, + "step": 24541 + }, + { + "epoch": 1.523496182258365, + "grad_norm": 0.1751889315902895, + "learning_rate": 5.7333732746335486e-05, + "loss": 2.9318, + "step": 24542 + }, + { + "epoch": 1.523558259358123, + "grad_norm": 0.15594923158751686, + "learning_rate": 5.733016026242598e-05, + "loss": 2.751, + "step": 24543 + }, + { + "epoch": 1.5236203364578806, + "grad_norm": 0.16418982288226164, + "learning_rate": 5.7326587740273266e-05, + "loss": 2.7879, + "step": 24544 + }, + { + "epoch": 1.5236824135576386, + "grad_norm": 0.1872244837211961, + "learning_rate": 5.7323015179895955e-05, + "loss": 2.8179, + "step": 24545 + }, + { + "epoch": 1.5237444906573965, + "grad_norm": 0.1641614195657623, + "learning_rate": 5.73194425813127e-05, + "loss": 2.8038, + "step": 24546 + }, + { + "epoch": 1.5238065677571544, + "grad_norm": 0.1753351538007097, + "learning_rate": 5.731586994454213e-05, + "loss": 2.8342, + "step": 24547 + }, + { + "epoch": 1.5238686448569123, + "grad_norm": 0.16328616043628552, + "learning_rate": 5.73122972696029e-05, + "loss": 2.717, + "step": 24548 + }, + { + "epoch": 1.5239307219566702, + "grad_norm": 0.15931330964162185, + "learning_rate": 5.730872455651364e-05, + "loss": 2.8535, + "step": 24549 + }, + { + "epoch": 1.523992799056428, + "grad_norm": 0.152757199769778, + "learning_rate": 5.730515180529298e-05, + "loss": 2.7768, + "step": 24550 + }, + { + "epoch": 1.5240548761561858, + "grad_norm": 0.15579136055187395, + "learning_rate": 5.7301579015959574e-05, + "loss": 2.7024, + "step": 24551 + }, + { + "epoch": 1.5241169532559438, + "grad_norm": 0.15633683771830534, + "learning_rate": 5.729800618853205e-05, + "loss": 2.8375, + "step": 24552 + }, + { + "epoch": 1.5241790303557017, + "grad_norm": 0.17151711331729813, + "learning_rate": 5.7294433323029074e-05, + "loss": 2.7782, + "step": 24553 + }, + { + "epoch": 1.5242411074554596, + "grad_norm": 0.17128011316911512, + "learning_rate": 5.729086041946925e-05, + "loss": 2.9041, + "step": 24554 + }, + { + "epoch": 1.5243031845552175, + "grad_norm": 0.15838279646897524, + "learning_rate": 5.7287287477871256e-05, + "loss": 2.7762, + "step": 24555 + }, + { + "epoch": 1.5243652616549754, + "grad_norm": 0.14564070937145332, + "learning_rate": 5.7283714498253694e-05, + "loss": 2.6979, + "step": 24556 + }, + { + "epoch": 1.5244273387547334, + "grad_norm": 0.17613373489459844, + "learning_rate": 5.728014148063524e-05, + "loss": 2.7964, + "step": 24557 + }, + { + "epoch": 1.5244894158544913, + "grad_norm": 0.1617512942016215, + "learning_rate": 5.7276568425034505e-05, + "loss": 2.8533, + "step": 24558 + }, + { + "epoch": 1.5245514929542492, + "grad_norm": 0.15658514914513783, + "learning_rate": 5.727299533147016e-05, + "loss": 2.8087, + "step": 24559 + }, + { + "epoch": 1.524613570054007, + "grad_norm": 0.15860302151042305, + "learning_rate": 5.7269422199960834e-05, + "loss": 2.7359, + "step": 24560 + }, + { + "epoch": 1.524675647153765, + "grad_norm": 0.1430124434945897, + "learning_rate": 5.7265849030525154e-05, + "loss": 2.814, + "step": 24561 + }, + { + "epoch": 1.524737724253523, + "grad_norm": 0.20986432358814408, + "learning_rate": 5.726227582318179e-05, + "loss": 2.7422, + "step": 24562 + }, + { + "epoch": 1.5247998013532809, + "grad_norm": 0.14447861434699855, + "learning_rate": 5.7258702577949364e-05, + "loss": 2.8013, + "step": 24563 + }, + { + "epoch": 1.5248618784530388, + "grad_norm": 0.14902833628070866, + "learning_rate": 5.7255129294846524e-05, + "loss": 2.7758, + "step": 24564 + }, + { + "epoch": 1.5249239555527967, + "grad_norm": 0.15123540617745657, + "learning_rate": 5.725155597389192e-05, + "loss": 2.8499, + "step": 24565 + }, + { + "epoch": 1.5249860326525546, + "grad_norm": 0.1498993553335917, + "learning_rate": 5.724798261510418e-05, + "loss": 2.8018, + "step": 24566 + }, + { + "epoch": 1.5250481097523125, + "grad_norm": 0.17842331133413905, + "learning_rate": 5.724440921850196e-05, + "loss": 2.7576, + "step": 24567 + }, + { + "epoch": 1.5251101868520702, + "grad_norm": 0.14386005102184996, + "learning_rate": 5.7240835784103886e-05, + "loss": 2.7976, + "step": 24568 + }, + { + "epoch": 1.5251722639518281, + "grad_norm": 0.15639009155966058, + "learning_rate": 5.723726231192863e-05, + "loss": 2.8839, + "step": 24569 + }, + { + "epoch": 1.525234341051586, + "grad_norm": 0.15233440248226704, + "learning_rate": 5.723368880199481e-05, + "loss": 2.8665, + "step": 24570 + }, + { + "epoch": 1.525296418151344, + "grad_norm": 0.14873117332767505, + "learning_rate": 5.7230115254321094e-05, + "loss": 2.7715, + "step": 24571 + }, + { + "epoch": 1.525358495251102, + "grad_norm": 0.14793270006277776, + "learning_rate": 5.722654166892609e-05, + "loss": 2.8051, + "step": 24572 + }, + { + "epoch": 1.5254205723508598, + "grad_norm": 0.16629114289043154, + "learning_rate": 5.7222968045828484e-05, + "loss": 2.8029, + "step": 24573 + }, + { + "epoch": 1.5254826494506175, + "grad_norm": 0.16719302988265553, + "learning_rate": 5.721939438504689e-05, + "loss": 2.8549, + "step": 24574 + }, + { + "epoch": 1.5255447265503754, + "grad_norm": 0.14649918799207842, + "learning_rate": 5.721582068659995e-05, + "loss": 2.8867, + "step": 24575 + }, + { + "epoch": 1.5256068036501333, + "grad_norm": 0.14005949969910336, + "learning_rate": 5.721224695050634e-05, + "loss": 2.8802, + "step": 24576 + }, + { + "epoch": 1.5256688807498913, + "grad_norm": 0.15373319564066104, + "learning_rate": 5.720867317678468e-05, + "loss": 2.817, + "step": 24577 + }, + { + "epoch": 1.5257309578496492, + "grad_norm": 0.1413274018293934, + "learning_rate": 5.720509936545363e-05, + "loss": 2.718, + "step": 24578 + }, + { + "epoch": 1.525793034949407, + "grad_norm": 0.1503487194185934, + "learning_rate": 5.720152551653182e-05, + "loss": 2.8283, + "step": 24579 + }, + { + "epoch": 1.525855112049165, + "grad_norm": 0.17658522383710176, + "learning_rate": 5.71979516300379e-05, + "loss": 2.8292, + "step": 24580 + }, + { + "epoch": 1.525917189148923, + "grad_norm": 0.16060114335483583, + "learning_rate": 5.719437770599052e-05, + "loss": 2.8803, + "step": 24581 + }, + { + "epoch": 1.5259792662486809, + "grad_norm": 0.1747503888919907, + "learning_rate": 5.7190803744408336e-05, + "loss": 2.8584, + "step": 24582 + }, + { + "epoch": 1.5260413433484388, + "grad_norm": 0.16463942927360492, + "learning_rate": 5.718722974530999e-05, + "loss": 2.8019, + "step": 24583 + }, + { + "epoch": 1.5261034204481967, + "grad_norm": 0.1793520117526969, + "learning_rate": 5.71836557087141e-05, + "loss": 2.7962, + "step": 24584 + }, + { + "epoch": 1.5261654975479546, + "grad_norm": 0.14671036136684099, + "learning_rate": 5.718008163463935e-05, + "loss": 2.763, + "step": 24585 + }, + { + "epoch": 1.5262275746477125, + "grad_norm": 0.14533465278105648, + "learning_rate": 5.717650752310435e-05, + "loss": 2.7531, + "step": 24586 + }, + { + "epoch": 1.5262896517474704, + "grad_norm": 0.14369840629008454, + "learning_rate": 5.717293337412779e-05, + "loss": 2.7889, + "step": 24587 + }, + { + "epoch": 1.5263517288472284, + "grad_norm": 0.1443850125416156, + "learning_rate": 5.7169359187728274e-05, + "loss": 2.8365, + "step": 24588 + }, + { + "epoch": 1.5264138059469863, + "grad_norm": 0.16271607494334897, + "learning_rate": 5.7165784963924494e-05, + "loss": 2.8229, + "step": 24589 + }, + { + "epoch": 1.5264758830467442, + "grad_norm": 0.14704500996439843, + "learning_rate": 5.7162210702735064e-05, + "loss": 2.8371, + "step": 24590 + }, + { + "epoch": 1.5265379601465021, + "grad_norm": 0.15643141466909605, + "learning_rate": 5.7158636404178634e-05, + "loss": 2.8796, + "step": 24591 + }, + { + "epoch": 1.5266000372462598, + "grad_norm": 0.14263489651133024, + "learning_rate": 5.715506206827388e-05, + "loss": 2.8369, + "step": 24592 + }, + { + "epoch": 1.5266621143460177, + "grad_norm": 0.14849534288041177, + "learning_rate": 5.7151487695039405e-05, + "loss": 2.8608, + "step": 24593 + }, + { + "epoch": 1.5267241914457756, + "grad_norm": 0.14354550728022278, + "learning_rate": 5.71479132844939e-05, + "loss": 2.7794, + "step": 24594 + }, + { + "epoch": 1.5267862685455336, + "grad_norm": 0.16276175434834414, + "learning_rate": 5.7144338836655986e-05, + "loss": 2.9008, + "step": 24595 + }, + { + "epoch": 1.5268483456452915, + "grad_norm": 0.15682007981412652, + "learning_rate": 5.714076435154434e-05, + "loss": 2.7713, + "step": 24596 + }, + { + "epoch": 1.5269104227450494, + "grad_norm": 0.14590246330334564, + "learning_rate": 5.7137189829177576e-05, + "loss": 2.7812, + "step": 24597 + }, + { + "epoch": 1.526972499844807, + "grad_norm": 0.14821194150457032, + "learning_rate": 5.713361526957437e-05, + "loss": 2.8454, + "step": 24598 + }, + { + "epoch": 1.527034576944565, + "grad_norm": 0.15678282131626503, + "learning_rate": 5.713004067275335e-05, + "loss": 2.776, + "step": 24599 + }, + { + "epoch": 1.527096654044323, + "grad_norm": 0.14232242857936764, + "learning_rate": 5.712646603873319e-05, + "loss": 2.7647, + "step": 24600 + }, + { + "epoch": 1.5271587311440808, + "grad_norm": 0.14941964107895075, + "learning_rate": 5.7122891367532505e-05, + "loss": 2.8417, + "step": 24601 + }, + { + "epoch": 1.5272208082438388, + "grad_norm": 0.15306835975972607, + "learning_rate": 5.711931665916999e-05, + "loss": 2.8056, + "step": 24602 + }, + { + "epoch": 1.5272828853435967, + "grad_norm": 0.1503373832091933, + "learning_rate": 5.7115741913664264e-05, + "loss": 2.7976, + "step": 24603 + }, + { + "epoch": 1.5273449624433546, + "grad_norm": 0.14649288788536952, + "learning_rate": 5.711216713103398e-05, + "loss": 2.7498, + "step": 24604 + }, + { + "epoch": 1.5274070395431125, + "grad_norm": 0.14688640103831915, + "learning_rate": 5.7108592311297804e-05, + "loss": 2.7268, + "step": 24605 + }, + { + "epoch": 1.5274691166428704, + "grad_norm": 0.17331439086585543, + "learning_rate": 5.7105017454474365e-05, + "loss": 2.8002, + "step": 24606 + }, + { + "epoch": 1.5275311937426284, + "grad_norm": 0.140092350917691, + "learning_rate": 5.710144256058233e-05, + "loss": 2.8485, + "step": 24607 + }, + { + "epoch": 1.5275932708423863, + "grad_norm": 0.14717132979657632, + "learning_rate": 5.7097867629640346e-05, + "loss": 2.8413, + "step": 24608 + }, + { + "epoch": 1.5276553479421442, + "grad_norm": 0.1529567989224169, + "learning_rate": 5.709429266166706e-05, + "loss": 2.8552, + "step": 24609 + }, + { + "epoch": 1.527717425041902, + "grad_norm": 0.14729572666441282, + "learning_rate": 5.7090717656681136e-05, + "loss": 2.7627, + "step": 24610 + }, + { + "epoch": 1.52777950214166, + "grad_norm": 0.16050653995131078, + "learning_rate": 5.708714261470121e-05, + "loss": 2.9051, + "step": 24611 + }, + { + "epoch": 1.527841579241418, + "grad_norm": 0.1514082083954342, + "learning_rate": 5.708356753574594e-05, + "loss": 2.7507, + "step": 24612 + }, + { + "epoch": 1.5279036563411759, + "grad_norm": 0.1511164280146046, + "learning_rate": 5.7079992419833975e-05, + "loss": 2.7951, + "step": 24613 + }, + { + "epoch": 1.5279657334409338, + "grad_norm": 0.1757659852499555, + "learning_rate": 5.7076417266983986e-05, + "loss": 2.8564, + "step": 24614 + }, + { + "epoch": 1.5280278105406917, + "grad_norm": 0.16666219687925324, + "learning_rate": 5.707284207721458e-05, + "loss": 2.7397, + "step": 24615 + }, + { + "epoch": 1.5280898876404494, + "grad_norm": 0.14568665413915752, + "learning_rate": 5.706926685054448e-05, + "loss": 2.7445, + "step": 24616 + }, + { + "epoch": 1.5281519647402073, + "grad_norm": 0.15223772076087166, + "learning_rate": 5.706569158699227e-05, + "loss": 2.8366, + "step": 24617 + }, + { + "epoch": 1.5282140418399652, + "grad_norm": 0.14982977238082706, + "learning_rate": 5.706211628657664e-05, + "loss": 2.7739, + "step": 24618 + }, + { + "epoch": 1.5282761189397231, + "grad_norm": 0.16411564170664006, + "learning_rate": 5.705854094931623e-05, + "loss": 2.7628, + "step": 24619 + }, + { + "epoch": 1.528338196039481, + "grad_norm": 0.14945901556017058, + "learning_rate": 5.705496557522971e-05, + "loss": 2.7615, + "step": 24620 + }, + { + "epoch": 1.528400273139239, + "grad_norm": 0.1556445670753119, + "learning_rate": 5.7051390164335715e-05, + "loss": 2.7567, + "step": 24621 + }, + { + "epoch": 1.5284623502389967, + "grad_norm": 0.15294315332440758, + "learning_rate": 5.7047814716652904e-05, + "loss": 2.7119, + "step": 24622 + }, + { + "epoch": 1.5285244273387546, + "grad_norm": 0.16233910518166056, + "learning_rate": 5.704423923219994e-05, + "loss": 2.8108, + "step": 24623 + }, + { + "epoch": 1.5285865044385125, + "grad_norm": 0.15577617566214977, + "learning_rate": 5.704066371099545e-05, + "loss": 2.7608, + "step": 24624 + }, + { + "epoch": 1.5286485815382704, + "grad_norm": 0.1675304972050537, + "learning_rate": 5.7037088153058125e-05, + "loss": 2.8335, + "step": 24625 + }, + { + "epoch": 1.5287106586380284, + "grad_norm": 0.15789495877448492, + "learning_rate": 5.70335125584066e-05, + "loss": 2.8113, + "step": 24626 + }, + { + "epoch": 1.5287727357377863, + "grad_norm": 0.16451626990213, + "learning_rate": 5.7029936927059535e-05, + "loss": 2.7353, + "step": 24627 + }, + { + "epoch": 1.5288348128375442, + "grad_norm": 0.14526850588152335, + "learning_rate": 5.702636125903559e-05, + "loss": 2.7933, + "step": 24628 + }, + { + "epoch": 1.528896889937302, + "grad_norm": 0.15993631194322996, + "learning_rate": 5.7022785554353395e-05, + "loss": 2.7866, + "step": 24629 + }, + { + "epoch": 1.52895896703706, + "grad_norm": 0.1594213920745968, + "learning_rate": 5.701920981303164e-05, + "loss": 2.768, + "step": 24630 + }, + { + "epoch": 1.529021044136818, + "grad_norm": 0.16291190472661438, + "learning_rate": 5.701563403508894e-05, + "loss": 2.8065, + "step": 24631 + }, + { + "epoch": 1.5290831212365759, + "grad_norm": 0.15620317949708304, + "learning_rate": 5.7012058220543996e-05, + "loss": 2.8409, + "step": 24632 + }, + { + "epoch": 1.5291451983363338, + "grad_norm": 0.17970622006633222, + "learning_rate": 5.700848236941543e-05, + "loss": 2.7787, + "step": 24633 + }, + { + "epoch": 1.5292072754360917, + "grad_norm": 0.15390961372262635, + "learning_rate": 5.700490648172192e-05, + "loss": 2.7138, + "step": 24634 + }, + { + "epoch": 1.5292693525358496, + "grad_norm": 0.17520111567689622, + "learning_rate": 5.70013305574821e-05, + "loss": 2.7806, + "step": 24635 + }, + { + "epoch": 1.5293314296356075, + "grad_norm": 0.17957595804703214, + "learning_rate": 5.699775459671466e-05, + "loss": 2.8375, + "step": 24636 + }, + { + "epoch": 1.5293935067353654, + "grad_norm": 0.15880767670933024, + "learning_rate": 5.699417859943821e-05, + "loss": 2.8619, + "step": 24637 + }, + { + "epoch": 1.5294555838351234, + "grad_norm": 0.1776674060623494, + "learning_rate": 5.6990602565671447e-05, + "loss": 2.845, + "step": 24638 + }, + { + "epoch": 1.5295176609348813, + "grad_norm": 0.17507968880611616, + "learning_rate": 5.698702649543301e-05, + "loss": 2.8614, + "step": 24639 + }, + { + "epoch": 1.529579738034639, + "grad_norm": 0.22081078798141635, + "learning_rate": 5.6983450388741554e-05, + "loss": 2.769, + "step": 24640 + }, + { + "epoch": 1.529641815134397, + "grad_norm": 0.21161680864150983, + "learning_rate": 5.697987424561575e-05, + "loss": 2.7852, + "step": 24641 + }, + { + "epoch": 1.5297038922341548, + "grad_norm": 0.1857819222870414, + "learning_rate": 5.697629806607424e-05, + "loss": 2.7343, + "step": 24642 + }, + { + "epoch": 1.5297659693339127, + "grad_norm": 0.19687850661409492, + "learning_rate": 5.6972721850135705e-05, + "loss": 2.884, + "step": 24643 + }, + { + "epoch": 1.5298280464336707, + "grad_norm": 0.17198524407366217, + "learning_rate": 5.6969145597818764e-05, + "loss": 2.851, + "step": 24644 + }, + { + "epoch": 1.5298901235334286, + "grad_norm": 0.16168762981061588, + "learning_rate": 5.696556930914212e-05, + "loss": 2.7539, + "step": 24645 + }, + { + "epoch": 1.5299522006331863, + "grad_norm": 0.15942212230943975, + "learning_rate": 5.6961992984124416e-05, + "loss": 2.8863, + "step": 24646 + }, + { + "epoch": 1.5300142777329442, + "grad_norm": 0.19764800827273488, + "learning_rate": 5.695841662278427e-05, + "loss": 2.8021, + "step": 24647 + }, + { + "epoch": 1.530076354832702, + "grad_norm": 0.15594896284908488, + "learning_rate": 5.6954840225140404e-05, + "loss": 2.8031, + "step": 24648 + }, + { + "epoch": 1.53013843193246, + "grad_norm": 0.16457709150208033, + "learning_rate": 5.695126379121144e-05, + "loss": 2.7696, + "step": 24649 + }, + { + "epoch": 1.530200509032218, + "grad_norm": 0.16053287486827764, + "learning_rate": 5.694768732101605e-05, + "loss": 2.8197, + "step": 24650 + }, + { + "epoch": 1.5302625861319759, + "grad_norm": 0.14741234262858002, + "learning_rate": 5.694411081457288e-05, + "loss": 2.8331, + "step": 24651 + }, + { + "epoch": 1.5303246632317338, + "grad_norm": 0.15860057285271226, + "learning_rate": 5.6940534271900606e-05, + "loss": 2.8176, + "step": 24652 + }, + { + "epoch": 1.5303867403314917, + "grad_norm": 0.14430660755069064, + "learning_rate": 5.6936957693017864e-05, + "loss": 2.7427, + "step": 24653 + }, + { + "epoch": 1.5304488174312496, + "grad_norm": 0.16089868007900013, + "learning_rate": 5.693338107794335e-05, + "loss": 2.8704, + "step": 24654 + }, + { + "epoch": 1.5305108945310075, + "grad_norm": 0.1586131680632273, + "learning_rate": 5.692980442669569e-05, + "loss": 2.7777, + "step": 24655 + }, + { + "epoch": 1.5305729716307654, + "grad_norm": 0.15003016238145128, + "learning_rate": 5.692622773929356e-05, + "loss": 2.7604, + "step": 24656 + }, + { + "epoch": 1.5306350487305234, + "grad_norm": 0.16000167435090354, + "learning_rate": 5.692265101575562e-05, + "loss": 2.8786, + "step": 24657 + }, + { + "epoch": 1.5306971258302813, + "grad_norm": 0.1544677596908426, + "learning_rate": 5.691907425610054e-05, + "loss": 2.8238, + "step": 24658 + }, + { + "epoch": 1.5307592029300392, + "grad_norm": 0.15743969752514647, + "learning_rate": 5.6915497460346956e-05, + "loss": 2.8503, + "step": 24659 + }, + { + "epoch": 1.5308212800297971, + "grad_norm": 0.1505725862098515, + "learning_rate": 5.6911920628513545e-05, + "loss": 2.773, + "step": 24660 + }, + { + "epoch": 1.530883357129555, + "grad_norm": 0.16297649663213692, + "learning_rate": 5.6908343760618974e-05, + "loss": 2.8681, + "step": 24661 + }, + { + "epoch": 1.530945434229313, + "grad_norm": 0.14626623441587344, + "learning_rate": 5.690476685668189e-05, + "loss": 2.8224, + "step": 24662 + }, + { + "epoch": 1.5310075113290709, + "grad_norm": 0.1731212849941414, + "learning_rate": 5.690118991672096e-05, + "loss": 2.8134, + "step": 24663 + }, + { + "epoch": 1.5310695884288286, + "grad_norm": 0.14749027112015586, + "learning_rate": 5.689761294075486e-05, + "loss": 2.8388, + "step": 24664 + }, + { + "epoch": 1.5311316655285865, + "grad_norm": 0.1334760163842769, + "learning_rate": 5.6894035928802225e-05, + "loss": 2.8163, + "step": 24665 + }, + { + "epoch": 1.5311937426283444, + "grad_norm": 0.1572824817043063, + "learning_rate": 5.689045888088174e-05, + "loss": 2.9038, + "step": 24666 + }, + { + "epoch": 1.5312558197281023, + "grad_norm": 0.1516320034801319, + "learning_rate": 5.688688179701205e-05, + "loss": 2.7308, + "step": 24667 + }, + { + "epoch": 1.5313178968278602, + "grad_norm": 0.1387476244070119, + "learning_rate": 5.6883304677211836e-05, + "loss": 2.8107, + "step": 24668 + }, + { + "epoch": 1.5313799739276182, + "grad_norm": 0.15679327573384813, + "learning_rate": 5.6879727521499724e-05, + "loss": 2.7509, + "step": 24669 + }, + { + "epoch": 1.5314420510273759, + "grad_norm": 0.15968693028589137, + "learning_rate": 5.687615032989444e-05, + "loss": 2.7939, + "step": 24670 + }, + { + "epoch": 1.5315041281271338, + "grad_norm": 0.15067494133306816, + "learning_rate": 5.6872573102414585e-05, + "loss": 2.8166, + "step": 24671 + }, + { + "epoch": 1.5315662052268917, + "grad_norm": 0.1473517297725902, + "learning_rate": 5.6868995839078854e-05, + "loss": 2.7998, + "step": 24672 + }, + { + "epoch": 1.5316282823266496, + "grad_norm": 0.1484361510466467, + "learning_rate": 5.6865418539905904e-05, + "loss": 2.8615, + "step": 24673 + }, + { + "epoch": 1.5316903594264075, + "grad_norm": 0.14519549140169966, + "learning_rate": 5.6861841204914404e-05, + "loss": 2.7579, + "step": 24674 + }, + { + "epoch": 1.5317524365261654, + "grad_norm": 0.14494087834963104, + "learning_rate": 5.6858263834123e-05, + "loss": 2.7696, + "step": 24675 + }, + { + "epoch": 1.5318145136259234, + "grad_norm": 0.1715968004621428, + "learning_rate": 5.685468642755039e-05, + "loss": 2.7829, + "step": 24676 + }, + { + "epoch": 1.5318765907256813, + "grad_norm": 0.1635666575036449, + "learning_rate": 5.6851108985215196e-05, + "loss": 2.7406, + "step": 24677 + }, + { + "epoch": 1.5319386678254392, + "grad_norm": 0.14398313669799248, + "learning_rate": 5.684753150713611e-05, + "loss": 2.6735, + "step": 24678 + }, + { + "epoch": 1.532000744925197, + "grad_norm": 0.15132558927838793, + "learning_rate": 5.68439539933318e-05, + "loss": 2.8437, + "step": 24679 + }, + { + "epoch": 1.532062822024955, + "grad_norm": 0.15171073566330817, + "learning_rate": 5.684037644382091e-05, + "loss": 2.8651, + "step": 24680 + }, + { + "epoch": 1.532124899124713, + "grad_norm": 0.15114608736912916, + "learning_rate": 5.683679885862212e-05, + "loss": 2.7828, + "step": 24681 + }, + { + "epoch": 1.5321869762244709, + "grad_norm": 0.1618369697760702, + "learning_rate": 5.68332212377541e-05, + "loss": 2.8374, + "step": 24682 + }, + { + "epoch": 1.5322490533242288, + "grad_norm": 0.14160220705082965, + "learning_rate": 5.682964358123548e-05, + "loss": 2.7742, + "step": 24683 + }, + { + "epoch": 1.5323111304239867, + "grad_norm": 0.15066931033223915, + "learning_rate": 5.682606588908497e-05, + "loss": 2.8386, + "step": 24684 + }, + { + "epoch": 1.5323732075237446, + "grad_norm": 0.15361058644467077, + "learning_rate": 5.682248816132121e-05, + "loss": 2.8592, + "step": 24685 + }, + { + "epoch": 1.5324352846235025, + "grad_norm": 0.14828556843369203, + "learning_rate": 5.6818910397962885e-05, + "loss": 2.7969, + "step": 24686 + }, + { + "epoch": 1.5324973617232605, + "grad_norm": 0.16232523277424066, + "learning_rate": 5.681533259902864e-05, + "loss": 2.7675, + "step": 24687 + }, + { + "epoch": 1.5325594388230181, + "grad_norm": 0.13649769350651567, + "learning_rate": 5.6811754764537154e-05, + "loss": 2.7861, + "step": 24688 + }, + { + "epoch": 1.532621515922776, + "grad_norm": 0.16094229445676808, + "learning_rate": 5.680817689450708e-05, + "loss": 2.7859, + "step": 24689 + }, + { + "epoch": 1.532683593022534, + "grad_norm": 0.17477714065789934, + "learning_rate": 5.6804598988957095e-05, + "loss": 2.879, + "step": 24690 + }, + { + "epoch": 1.532745670122292, + "grad_norm": 0.16979786304759645, + "learning_rate": 5.680102104790588e-05, + "loss": 2.7685, + "step": 24691 + }, + { + "epoch": 1.5328077472220498, + "grad_norm": 0.15728063423631938, + "learning_rate": 5.679744307137207e-05, + "loss": 2.8324, + "step": 24692 + }, + { + "epoch": 1.5328698243218077, + "grad_norm": 0.15883244629144733, + "learning_rate": 5.679386505937436e-05, + "loss": 2.7855, + "step": 24693 + }, + { + "epoch": 1.5329319014215654, + "grad_norm": 0.15655882832986365, + "learning_rate": 5.6790287011931396e-05, + "loss": 2.7836, + "step": 24694 + }, + { + "epoch": 1.5329939785213234, + "grad_norm": 0.15102807482588923, + "learning_rate": 5.678670892906187e-05, + "loss": 2.828, + "step": 24695 + }, + { + "epoch": 1.5330560556210813, + "grad_norm": 0.13859275790636802, + "learning_rate": 5.6783130810784434e-05, + "loss": 2.7474, + "step": 24696 + }, + { + "epoch": 1.5331181327208392, + "grad_norm": 0.14919144745148144, + "learning_rate": 5.677955265711775e-05, + "loss": 2.8549, + "step": 24697 + }, + { + "epoch": 1.533180209820597, + "grad_norm": 0.15118662000796634, + "learning_rate": 5.677597446808049e-05, + "loss": 2.7968, + "step": 24698 + }, + { + "epoch": 1.533242286920355, + "grad_norm": 0.14035856650437162, + "learning_rate": 5.677239624369135e-05, + "loss": 2.7741, + "step": 24699 + }, + { + "epoch": 1.533304364020113, + "grad_norm": 0.1442793894088644, + "learning_rate": 5.676881798396896e-05, + "loss": 2.8549, + "step": 24700 + }, + { + "epoch": 1.5333664411198709, + "grad_norm": 0.13997365522721467, + "learning_rate": 5.6765239688932e-05, + "loss": 2.8093, + "step": 24701 + }, + { + "epoch": 1.5334285182196288, + "grad_norm": 0.13637428485952086, + "learning_rate": 5.676166135859915e-05, + "loss": 2.7616, + "step": 24702 + }, + { + "epoch": 1.5334905953193867, + "grad_norm": 0.15368902614940297, + "learning_rate": 5.675808299298906e-05, + "loss": 2.7916, + "step": 24703 + }, + { + "epoch": 1.5335526724191446, + "grad_norm": 0.14160920484242828, + "learning_rate": 5.6754504592120426e-05, + "loss": 2.7643, + "step": 24704 + }, + { + "epoch": 1.5336147495189025, + "grad_norm": 0.1536636214277351, + "learning_rate": 5.675092615601187e-05, + "loss": 2.7886, + "step": 24705 + }, + { + "epoch": 1.5336768266186604, + "grad_norm": 0.15030465827282133, + "learning_rate": 5.674734768468213e-05, + "loss": 2.8421, + "step": 24706 + }, + { + "epoch": 1.5337389037184184, + "grad_norm": 0.14468664759185035, + "learning_rate": 5.674376917814983e-05, + "loss": 2.7799, + "step": 24707 + }, + { + "epoch": 1.5338009808181763, + "grad_norm": 0.16103072544940045, + "learning_rate": 5.674019063643363e-05, + "loss": 2.7761, + "step": 24708 + }, + { + "epoch": 1.5338630579179342, + "grad_norm": 0.1443790007631422, + "learning_rate": 5.673661205955225e-05, + "loss": 2.7964, + "step": 24709 + }, + { + "epoch": 1.5339251350176921, + "grad_norm": 0.15209985429918582, + "learning_rate": 5.6733033447524305e-05, + "loss": 2.7969, + "step": 24710 + }, + { + "epoch": 1.53398721211745, + "grad_norm": 0.14961485989628343, + "learning_rate": 5.672945480036851e-05, + "loss": 2.9073, + "step": 24711 + }, + { + "epoch": 1.5340492892172077, + "grad_norm": 0.1558315814006043, + "learning_rate": 5.672587611810349e-05, + "loss": 2.7799, + "step": 24712 + }, + { + "epoch": 1.5341113663169657, + "grad_norm": 0.1416488108239481, + "learning_rate": 5.672229740074796e-05, + "loss": 2.7859, + "step": 24713 + }, + { + "epoch": 1.5341734434167236, + "grad_norm": 0.1516221575470568, + "learning_rate": 5.671871864832057e-05, + "loss": 2.8459, + "step": 24714 + }, + { + "epoch": 1.5342355205164815, + "grad_norm": 0.14236442672220515, + "learning_rate": 5.671513986083999e-05, + "loss": 2.8073, + "step": 24715 + }, + { + "epoch": 1.5342975976162394, + "grad_norm": 0.14393491409817988, + "learning_rate": 5.671156103832489e-05, + "loss": 2.7624, + "step": 24716 + }, + { + "epoch": 1.5343596747159973, + "grad_norm": 0.1606638992368099, + "learning_rate": 5.670798218079396e-05, + "loss": 2.8892, + "step": 24717 + }, + { + "epoch": 1.534421751815755, + "grad_norm": 0.14445737290931887, + "learning_rate": 5.6704403288265865e-05, + "loss": 2.7453, + "step": 24718 + }, + { + "epoch": 1.534483828915513, + "grad_norm": 0.1566130711396105, + "learning_rate": 5.670082436075925e-05, + "loss": 2.8054, + "step": 24719 + }, + { + "epoch": 1.5345459060152709, + "grad_norm": 0.15648851072090278, + "learning_rate": 5.669724539829282e-05, + "loss": 2.812, + "step": 24720 + }, + { + "epoch": 1.5346079831150288, + "grad_norm": 0.16151335314599227, + "learning_rate": 5.6693666400885226e-05, + "loss": 2.8325, + "step": 24721 + }, + { + "epoch": 1.5346700602147867, + "grad_norm": 0.21384472489514542, + "learning_rate": 5.669008736855515e-05, + "loss": 2.8288, + "step": 24722 + }, + { + "epoch": 1.5347321373145446, + "grad_norm": 0.1689978611538484, + "learning_rate": 5.668650830132126e-05, + "loss": 2.7224, + "step": 24723 + }, + { + "epoch": 1.5347942144143025, + "grad_norm": 1.1640643433421856, + "learning_rate": 5.668292919920224e-05, + "loss": 2.7616, + "step": 24724 + }, + { + "epoch": 1.5348562915140604, + "grad_norm": 0.19631178805001426, + "learning_rate": 5.667935006221676e-05, + "loss": 2.8656, + "step": 24725 + }, + { + "epoch": 1.5349183686138184, + "grad_norm": 0.2088707053246942, + "learning_rate": 5.6675770890383485e-05, + "loss": 2.6765, + "step": 24726 + }, + { + "epoch": 1.5349804457135763, + "grad_norm": 0.25074603109746274, + "learning_rate": 5.66721916837211e-05, + "loss": 2.8269, + "step": 24727 + }, + { + "epoch": 1.5350425228133342, + "grad_norm": 0.17511025153163923, + "learning_rate": 5.6668612442248256e-05, + "loss": 2.7953, + "step": 24728 + }, + { + "epoch": 1.5351045999130921, + "grad_norm": 0.15615273667048105, + "learning_rate": 5.666503316598366e-05, + "loss": 2.7695, + "step": 24729 + }, + { + "epoch": 1.53516667701285, + "grad_norm": 0.17252464953646615, + "learning_rate": 5.6661453854945954e-05, + "loss": 2.8571, + "step": 24730 + }, + { + "epoch": 1.535228754112608, + "grad_norm": 0.17048235241075893, + "learning_rate": 5.665787450915384e-05, + "loss": 2.8527, + "step": 24731 + }, + { + "epoch": 1.5352908312123659, + "grad_norm": 0.20726334859242818, + "learning_rate": 5.665429512862597e-05, + "loss": 2.8577, + "step": 24732 + }, + { + "epoch": 1.5353529083121238, + "grad_norm": 0.1685634719044803, + "learning_rate": 5.665071571338102e-05, + "loss": 2.8791, + "step": 24733 + }, + { + "epoch": 1.5354149854118817, + "grad_norm": 0.2545887357733775, + "learning_rate": 5.6647136263437676e-05, + "loss": 2.7336, + "step": 24734 + }, + { + "epoch": 1.5354770625116394, + "grad_norm": 0.1479662076556439, + "learning_rate": 5.664355677881461e-05, + "loss": 2.8707, + "step": 24735 + }, + { + "epoch": 1.5355391396113973, + "grad_norm": 0.1662508532226151, + "learning_rate": 5.6639977259530506e-05, + "loss": 2.7886, + "step": 24736 + }, + { + "epoch": 1.5356012167111552, + "grad_norm": 0.19408465678199346, + "learning_rate": 5.6636397705604016e-05, + "loss": 2.8085, + "step": 24737 + }, + { + "epoch": 1.5356632938109132, + "grad_norm": 0.15134733746719412, + "learning_rate": 5.663281811705382e-05, + "loss": 2.783, + "step": 24738 + }, + { + "epoch": 1.535725370910671, + "grad_norm": 0.1768650656865674, + "learning_rate": 5.66292384938986e-05, + "loss": 2.8239, + "step": 24739 + }, + { + "epoch": 1.535787448010429, + "grad_norm": 0.2506526831370849, + "learning_rate": 5.662565883615707e-05, + "loss": 2.8859, + "step": 24740 + }, + { + "epoch": 1.5358495251101867, + "grad_norm": 0.18373796196514924, + "learning_rate": 5.662207914384783e-05, + "loss": 2.771, + "step": 24741 + }, + { + "epoch": 1.5359116022099446, + "grad_norm": 0.30571040816762196, + "learning_rate": 5.661849941698962e-05, + "loss": 2.8167, + "step": 24742 + }, + { + "epoch": 1.5359736793097025, + "grad_norm": 0.16259898508806192, + "learning_rate": 5.6614919655601086e-05, + "loss": 2.7768, + "step": 24743 + }, + { + "epoch": 1.5360357564094604, + "grad_norm": 0.1966682396656778, + "learning_rate": 5.6611339859700906e-05, + "loss": 2.813, + "step": 24744 + }, + { + "epoch": 1.5360978335092184, + "grad_norm": 0.23538414345777453, + "learning_rate": 5.660776002930777e-05, + "loss": 2.7477, + "step": 24745 + }, + { + "epoch": 1.5361599106089763, + "grad_norm": 0.2515443842744604, + "learning_rate": 5.6604180164440335e-05, + "loss": 2.7573, + "step": 24746 + }, + { + "epoch": 1.5362219877087342, + "grad_norm": 0.1737047035877818, + "learning_rate": 5.660060026511729e-05, + "loss": 2.7903, + "step": 24747 + }, + { + "epoch": 1.5362840648084921, + "grad_norm": 0.2422019738825258, + "learning_rate": 5.659702033135731e-05, + "loss": 2.799, + "step": 24748 + }, + { + "epoch": 1.53634614190825, + "grad_norm": 0.1757002378916039, + "learning_rate": 5.6593440363179087e-05, + "loss": 2.7709, + "step": 24749 + }, + { + "epoch": 1.536408219008008, + "grad_norm": 0.39897093873324857, + "learning_rate": 5.658986036060127e-05, + "loss": 2.8179, + "step": 24750 + }, + { + "epoch": 1.5364702961077659, + "grad_norm": 0.20042458408232144, + "learning_rate": 5.658628032364255e-05, + "loss": 2.7404, + "step": 24751 + }, + { + "epoch": 1.5365323732075238, + "grad_norm": 0.24795764159071523, + "learning_rate": 5.658270025232161e-05, + "loss": 2.82, + "step": 24752 + }, + { + "epoch": 1.5365944503072817, + "grad_norm": 0.20400504974293104, + "learning_rate": 5.657912014665714e-05, + "loss": 2.8056, + "step": 24753 + }, + { + "epoch": 1.5366565274070396, + "grad_norm": 0.1646165405093442, + "learning_rate": 5.657554000666778e-05, + "loss": 2.7192, + "step": 24754 + }, + { + "epoch": 1.5367186045067975, + "grad_norm": 0.17082405106304513, + "learning_rate": 5.657195983237224e-05, + "loss": 2.7862, + "step": 24755 + }, + { + "epoch": 1.5367806816065555, + "grad_norm": 0.17060420729679013, + "learning_rate": 5.656837962378919e-05, + "loss": 2.7456, + "step": 24756 + }, + { + "epoch": 1.5368427587063134, + "grad_norm": 0.15623013847002365, + "learning_rate": 5.656479938093731e-05, + "loss": 2.887, + "step": 24757 + }, + { + "epoch": 1.5369048358060713, + "grad_norm": 0.16805368316564506, + "learning_rate": 5.6561219103835284e-05, + "loss": 2.7966, + "step": 24758 + }, + { + "epoch": 1.536966912905829, + "grad_norm": 0.145984585413699, + "learning_rate": 5.655763879250178e-05, + "loss": 2.6922, + "step": 24759 + }, + { + "epoch": 1.537028990005587, + "grad_norm": 0.1748349573789824, + "learning_rate": 5.655405844695548e-05, + "loss": 2.8378, + "step": 24760 + }, + { + "epoch": 1.5370910671053448, + "grad_norm": 0.15254968393739113, + "learning_rate": 5.6550478067215086e-05, + "loss": 2.8556, + "step": 24761 + }, + { + "epoch": 1.5371531442051027, + "grad_norm": 0.16807123318024006, + "learning_rate": 5.654689765329924e-05, + "loss": 2.7776, + "step": 24762 + }, + { + "epoch": 1.5372152213048607, + "grad_norm": 0.18058718514841765, + "learning_rate": 5.6543317205226635e-05, + "loss": 2.9063, + "step": 24763 + }, + { + "epoch": 1.5372772984046186, + "grad_norm": 0.15229972790188442, + "learning_rate": 5.6539736723015955e-05, + "loss": 2.8264, + "step": 24764 + }, + { + "epoch": 1.5373393755043763, + "grad_norm": 0.14368831487568104, + "learning_rate": 5.6536156206685894e-05, + "loss": 2.8598, + "step": 24765 + }, + { + "epoch": 1.5374014526041342, + "grad_norm": 0.1758819821272508, + "learning_rate": 5.6532575656255106e-05, + "loss": 2.8884, + "step": 24766 + }, + { + "epoch": 1.537463529703892, + "grad_norm": 0.14898428471346972, + "learning_rate": 5.6528995071742295e-05, + "loss": 2.838, + "step": 24767 + }, + { + "epoch": 1.53752560680365, + "grad_norm": 0.15415422267458437, + "learning_rate": 5.652541445316613e-05, + "loss": 2.7689, + "step": 24768 + }, + { + "epoch": 1.537587683903408, + "grad_norm": 0.14432403169497435, + "learning_rate": 5.65218338005453e-05, + "loss": 2.8584, + "step": 24769 + }, + { + "epoch": 1.5376497610031659, + "grad_norm": 0.15672131027610092, + "learning_rate": 5.6518253113898456e-05, + "loss": 2.7618, + "step": 24770 + }, + { + "epoch": 1.5377118381029238, + "grad_norm": 0.18955519964383766, + "learning_rate": 5.6514672393244315e-05, + "loss": 2.8647, + "step": 24771 + }, + { + "epoch": 1.5377739152026817, + "grad_norm": 0.1482246529282193, + "learning_rate": 5.6511091638601555e-05, + "loss": 2.8378, + "step": 24772 + }, + { + "epoch": 1.5378359923024396, + "grad_norm": 0.16322162830978093, + "learning_rate": 5.650751084998884e-05, + "loss": 2.6962, + "step": 24773 + }, + { + "epoch": 1.5378980694021975, + "grad_norm": 0.17843753976704543, + "learning_rate": 5.650393002742487e-05, + "loss": 2.856, + "step": 24774 + }, + { + "epoch": 1.5379601465019554, + "grad_norm": 0.14350835870031106, + "learning_rate": 5.6500349170928314e-05, + "loss": 2.7491, + "step": 24775 + }, + { + "epoch": 1.5380222236017134, + "grad_norm": 0.15480982346236546, + "learning_rate": 5.649676828051787e-05, + "loss": 2.7471, + "step": 24776 + }, + { + "epoch": 1.5380843007014713, + "grad_norm": 0.16744736769740964, + "learning_rate": 5.6493187356212185e-05, + "loss": 2.8566, + "step": 24777 + }, + { + "epoch": 1.5381463778012292, + "grad_norm": 0.17102057860629613, + "learning_rate": 5.648960639802998e-05, + "loss": 2.7361, + "step": 24778 + }, + { + "epoch": 1.5382084549009871, + "grad_norm": 0.21758888533059123, + "learning_rate": 5.648602540598993e-05, + "loss": 2.7916, + "step": 24779 + }, + { + "epoch": 1.538270532000745, + "grad_norm": 0.15763867904130502, + "learning_rate": 5.648244438011069e-05, + "loss": 2.7928, + "step": 24780 + }, + { + "epoch": 1.538332609100503, + "grad_norm": 0.1509817939236866, + "learning_rate": 5.647886332041098e-05, + "loss": 2.8461, + "step": 24781 + }, + { + "epoch": 1.5383946862002609, + "grad_norm": 0.15696870497699678, + "learning_rate": 5.647528222690946e-05, + "loss": 2.8124, + "step": 24782 + }, + { + "epoch": 1.5384567633000186, + "grad_norm": 0.16570318216649302, + "learning_rate": 5.647170109962483e-05, + "loss": 2.8896, + "step": 24783 + }, + { + "epoch": 1.5385188403997765, + "grad_norm": 0.19116844273936393, + "learning_rate": 5.646811993857575e-05, + "loss": 2.8544, + "step": 24784 + }, + { + "epoch": 1.5385809174995344, + "grad_norm": 0.14414534729042522, + "learning_rate": 5.646453874378094e-05, + "loss": 2.8253, + "step": 24785 + }, + { + "epoch": 1.5386429945992923, + "grad_norm": 0.141815431815045, + "learning_rate": 5.646095751525904e-05, + "loss": 2.8392, + "step": 24786 + }, + { + "epoch": 1.5387050716990502, + "grad_norm": 0.1441413543393258, + "learning_rate": 5.645737625302875e-05, + "loss": 2.7833, + "step": 24787 + }, + { + "epoch": 1.5387671487988082, + "grad_norm": 0.14586054528190368, + "learning_rate": 5.6453794957108784e-05, + "loss": 2.7248, + "step": 24788 + }, + { + "epoch": 1.5388292258985659, + "grad_norm": 0.1546322581843504, + "learning_rate": 5.645021362751779e-05, + "loss": 2.7843, + "step": 24789 + }, + { + "epoch": 1.5388913029983238, + "grad_norm": 0.22550893683180506, + "learning_rate": 5.6446632264274466e-05, + "loss": 2.8216, + "step": 24790 + }, + { + "epoch": 1.5389533800980817, + "grad_norm": 0.14096571786782677, + "learning_rate": 5.6443050867397495e-05, + "loss": 2.8296, + "step": 24791 + }, + { + "epoch": 1.5390154571978396, + "grad_norm": 0.1554937419260709, + "learning_rate": 5.643946943690558e-05, + "loss": 2.795, + "step": 24792 + }, + { + "epoch": 1.5390775342975975, + "grad_norm": 0.16022412676054681, + "learning_rate": 5.643588797281737e-05, + "loss": 2.8686, + "step": 24793 + }, + { + "epoch": 1.5391396113973554, + "grad_norm": 0.1698461506365961, + "learning_rate": 5.643230647515158e-05, + "loss": 2.8302, + "step": 24794 + }, + { + "epoch": 1.5392016884971134, + "grad_norm": 0.15778071324156473, + "learning_rate": 5.642872494392688e-05, + "loss": 2.8734, + "step": 24795 + }, + { + "epoch": 1.5392637655968713, + "grad_norm": 0.1610668220639548, + "learning_rate": 5.642514337916197e-05, + "loss": 2.8047, + "step": 24796 + }, + { + "epoch": 1.5393258426966292, + "grad_norm": 0.1505458451865844, + "learning_rate": 5.642156178087552e-05, + "loss": 2.8174, + "step": 24797 + }, + { + "epoch": 1.5393879197963871, + "grad_norm": 0.21793585224803189, + "learning_rate": 5.6417980149086235e-05, + "loss": 2.8355, + "step": 24798 + }, + { + "epoch": 1.539449996896145, + "grad_norm": 0.14876352445042768, + "learning_rate": 5.641439848381278e-05, + "loss": 2.758, + "step": 24799 + }, + { + "epoch": 1.539512073995903, + "grad_norm": 0.16306035189458334, + "learning_rate": 5.641081678507385e-05, + "loss": 2.8372, + "step": 24800 + }, + { + "epoch": 1.5395741510956609, + "grad_norm": 0.15987651516171789, + "learning_rate": 5.640723505288813e-05, + "loss": 2.7731, + "step": 24801 + }, + { + "epoch": 1.5396362281954188, + "grad_norm": 0.16144509543222693, + "learning_rate": 5.6403653287274316e-05, + "loss": 2.7209, + "step": 24802 + }, + { + "epoch": 1.5396983052951767, + "grad_norm": 0.17486504093589847, + "learning_rate": 5.6400071488251085e-05, + "loss": 2.7623, + "step": 24803 + }, + { + "epoch": 1.5397603823949346, + "grad_norm": 0.15541051416091006, + "learning_rate": 5.6396489655837124e-05, + "loss": 2.8113, + "step": 24804 + }, + { + "epoch": 1.5398224594946925, + "grad_norm": 0.2750695839541555, + "learning_rate": 5.639290779005112e-05, + "loss": 2.7613, + "step": 24805 + }, + { + "epoch": 1.5398845365944505, + "grad_norm": 0.2029068857844274, + "learning_rate": 5.638932589091178e-05, + "loss": 2.7561, + "step": 24806 + }, + { + "epoch": 1.5399466136942082, + "grad_norm": 0.22067993177229653, + "learning_rate": 5.6385743958437765e-05, + "loss": 2.8172, + "step": 24807 + }, + { + "epoch": 1.540008690793966, + "grad_norm": 0.19469904882182035, + "learning_rate": 5.6382161992647774e-05, + "loss": 2.775, + "step": 24808 + }, + { + "epoch": 1.540070767893724, + "grad_norm": 0.24162456245698502, + "learning_rate": 5.637857999356049e-05, + "loss": 2.8404, + "step": 24809 + }, + { + "epoch": 1.540132844993482, + "grad_norm": 0.1966848889811853, + "learning_rate": 5.637499796119462e-05, + "loss": 2.7904, + "step": 24810 + }, + { + "epoch": 1.5401949220932398, + "grad_norm": 0.23980556768121297, + "learning_rate": 5.6371415895568824e-05, + "loss": 2.8216, + "step": 24811 + }, + { + "epoch": 1.5402569991929977, + "grad_norm": 0.1646215077433857, + "learning_rate": 5.636783379670182e-05, + "loss": 2.7827, + "step": 24812 + }, + { + "epoch": 1.5403190762927554, + "grad_norm": 0.15208267803464656, + "learning_rate": 5.6364251664612263e-05, + "loss": 2.7884, + "step": 24813 + }, + { + "epoch": 1.5403811533925134, + "grad_norm": 0.15874068448751813, + "learning_rate": 5.6360669499318874e-05, + "loss": 2.8893, + "step": 24814 + }, + { + "epoch": 1.5404432304922713, + "grad_norm": 0.1815520244800165, + "learning_rate": 5.635708730084032e-05, + "loss": 2.8099, + "step": 24815 + }, + { + "epoch": 1.5405053075920292, + "grad_norm": 0.17689204519722887, + "learning_rate": 5.63535050691953e-05, + "loss": 2.7782, + "step": 24816 + }, + { + "epoch": 1.5405673846917871, + "grad_norm": 0.16592440623547436, + "learning_rate": 5.634992280440251e-05, + "loss": 2.8806, + "step": 24817 + }, + { + "epoch": 1.540629461791545, + "grad_norm": 0.20778198699372963, + "learning_rate": 5.634634050648062e-05, + "loss": 2.8077, + "step": 24818 + }, + { + "epoch": 1.540691538891303, + "grad_norm": 0.16788250350124878, + "learning_rate": 5.634275817544834e-05, + "loss": 2.7458, + "step": 24819 + }, + { + "epoch": 1.5407536159910609, + "grad_norm": 0.17028435050121446, + "learning_rate": 5.633917581132433e-05, + "loss": 2.733, + "step": 24820 + }, + { + "epoch": 1.5408156930908188, + "grad_norm": 0.17037561279656863, + "learning_rate": 5.633559341412732e-05, + "loss": 2.74, + "step": 24821 + }, + { + "epoch": 1.5408777701905767, + "grad_norm": 0.1707291882169507, + "learning_rate": 5.6332010983875974e-05, + "loss": 2.8512, + "step": 24822 + }, + { + "epoch": 1.5409398472903346, + "grad_norm": 0.16426859694377502, + "learning_rate": 5.6328428520588995e-05, + "loss": 2.8537, + "step": 24823 + }, + { + "epoch": 1.5410019243900925, + "grad_norm": 0.19546313199234272, + "learning_rate": 5.632484602428507e-05, + "loss": 2.7681, + "step": 24824 + }, + { + "epoch": 1.5410640014898505, + "grad_norm": 0.1931173563188375, + "learning_rate": 5.632126349498289e-05, + "loss": 2.8112, + "step": 24825 + }, + { + "epoch": 1.5411260785896084, + "grad_norm": 0.15050853095438818, + "learning_rate": 5.631768093270113e-05, + "loss": 2.8708, + "step": 24826 + }, + { + "epoch": 1.5411881556893663, + "grad_norm": 0.22868815047116076, + "learning_rate": 5.631409833745851e-05, + "loss": 2.804, + "step": 24827 + }, + { + "epoch": 1.5412502327891242, + "grad_norm": 0.15082399054060955, + "learning_rate": 5.63105157092737e-05, + "loss": 2.752, + "step": 24828 + }, + { + "epoch": 1.5413123098888821, + "grad_norm": 0.15222517925727402, + "learning_rate": 5.630693304816539e-05, + "loss": 2.8334, + "step": 24829 + }, + { + "epoch": 1.54137438698864, + "grad_norm": 0.1563427888016068, + "learning_rate": 5.63033503541523e-05, + "loss": 2.8089, + "step": 24830 + }, + { + "epoch": 1.5414364640883977, + "grad_norm": 0.1464608872974603, + "learning_rate": 5.6299767627253074e-05, + "loss": 2.7952, + "step": 24831 + }, + { + "epoch": 1.5414985411881557, + "grad_norm": 0.16911153388987002, + "learning_rate": 5.6296184867486456e-05, + "loss": 2.7672, + "step": 24832 + }, + { + "epoch": 1.5415606182879136, + "grad_norm": 0.1559095189689832, + "learning_rate": 5.6292602074871095e-05, + "loss": 2.8, + "step": 24833 + }, + { + "epoch": 1.5416226953876715, + "grad_norm": 0.15217981933035, + "learning_rate": 5.628901924942571e-05, + "loss": 2.8635, + "step": 24834 + }, + { + "epoch": 1.5416847724874294, + "grad_norm": 0.152353799928613, + "learning_rate": 5.628543639116899e-05, + "loss": 2.801, + "step": 24835 + }, + { + "epoch": 1.5417468495871873, + "grad_norm": 0.15958013656052075, + "learning_rate": 5.62818535001196e-05, + "loss": 2.7834, + "step": 24836 + }, + { + "epoch": 1.541808926686945, + "grad_norm": 0.14791590249396994, + "learning_rate": 5.627827057629628e-05, + "loss": 2.7987, + "step": 24837 + }, + { + "epoch": 1.541871003786703, + "grad_norm": 0.14623657247451516, + "learning_rate": 5.627468761971768e-05, + "loss": 2.8623, + "step": 24838 + }, + { + "epoch": 1.5419330808864609, + "grad_norm": 0.15033307770350848, + "learning_rate": 5.6271104630402524e-05, + "loss": 2.8256, + "step": 24839 + }, + { + "epoch": 1.5419951579862188, + "grad_norm": 0.15592799385160544, + "learning_rate": 5.6267521608369475e-05, + "loss": 2.805, + "step": 24840 + }, + { + "epoch": 1.5420572350859767, + "grad_norm": 0.15293231391843784, + "learning_rate": 5.6263938553637274e-05, + "loss": 2.6998, + "step": 24841 + }, + { + "epoch": 1.5421193121857346, + "grad_norm": 0.1654750662652664, + "learning_rate": 5.626035546622457e-05, + "loss": 2.7675, + "step": 24842 + }, + { + "epoch": 1.5421813892854925, + "grad_norm": 0.1616977728723638, + "learning_rate": 5.6256772346150066e-05, + "loss": 2.8473, + "step": 24843 + }, + { + "epoch": 1.5422434663852504, + "grad_norm": 0.17491326077041652, + "learning_rate": 5.625318919343248e-05, + "loss": 2.8465, + "step": 24844 + }, + { + "epoch": 1.5423055434850084, + "grad_norm": 0.15158733233312577, + "learning_rate": 5.624960600809046e-05, + "loss": 2.7734, + "step": 24845 + }, + { + "epoch": 1.5423676205847663, + "grad_norm": 0.16778895269735705, + "learning_rate": 5.624602279014275e-05, + "loss": 2.8157, + "step": 24846 + }, + { + "epoch": 1.5424296976845242, + "grad_norm": 0.15022022948420355, + "learning_rate": 5.624243953960801e-05, + "loss": 2.7355, + "step": 24847 + }, + { + "epoch": 1.5424917747842821, + "grad_norm": 0.16303404822022038, + "learning_rate": 5.6238856256504955e-05, + "loss": 2.8489, + "step": 24848 + }, + { + "epoch": 1.54255385188404, + "grad_norm": 0.1587418344042144, + "learning_rate": 5.623527294085227e-05, + "loss": 2.8441, + "step": 24849 + }, + { + "epoch": 1.542615928983798, + "grad_norm": 0.13843438806975725, + "learning_rate": 5.623168959266866e-05, + "loss": 2.8037, + "step": 24850 + }, + { + "epoch": 1.5426780060835559, + "grad_norm": 0.13933796816181104, + "learning_rate": 5.6228106211972795e-05, + "loss": 2.8444, + "step": 24851 + }, + { + "epoch": 1.5427400831833138, + "grad_norm": 0.15699113670352444, + "learning_rate": 5.6224522798783406e-05, + "loss": 2.8346, + "step": 24852 + }, + { + "epoch": 1.5428021602830717, + "grad_norm": 0.145229040553729, + "learning_rate": 5.622093935311916e-05, + "loss": 2.7486, + "step": 24853 + }, + { + "epoch": 1.5428642373828296, + "grad_norm": 0.14618011665454536, + "learning_rate": 5.6217355874998755e-05, + "loss": 2.8466, + "step": 24854 + }, + { + "epoch": 1.5429263144825873, + "grad_norm": 0.1902519962053176, + "learning_rate": 5.621377236444092e-05, + "loss": 2.781, + "step": 24855 + }, + { + "epoch": 1.5429883915823452, + "grad_norm": 0.18519772254262382, + "learning_rate": 5.62101888214643e-05, + "loss": 2.8608, + "step": 24856 + }, + { + "epoch": 1.5430504686821032, + "grad_norm": 0.1468996899732015, + "learning_rate": 5.620660524608764e-05, + "loss": 2.7428, + "step": 24857 + }, + { + "epoch": 1.543112545781861, + "grad_norm": 0.16684674180901768, + "learning_rate": 5.62030216383296e-05, + "loss": 2.8952, + "step": 24858 + }, + { + "epoch": 1.543174622881619, + "grad_norm": 0.1811265438657717, + "learning_rate": 5.619943799820889e-05, + "loss": 2.8539, + "step": 24859 + }, + { + "epoch": 1.543236699981377, + "grad_norm": 0.14558109255938476, + "learning_rate": 5.619585432574422e-05, + "loss": 2.7841, + "step": 24860 + }, + { + "epoch": 1.5432987770811346, + "grad_norm": 0.17992706500049466, + "learning_rate": 5.619227062095427e-05, + "loss": 2.8579, + "step": 24861 + }, + { + "epoch": 1.5433608541808925, + "grad_norm": 0.15796469747361624, + "learning_rate": 5.618868688385773e-05, + "loss": 2.7261, + "step": 24862 + }, + { + "epoch": 1.5434229312806504, + "grad_norm": 0.18254385715149335, + "learning_rate": 5.618510311447331e-05, + "loss": 2.8157, + "step": 24863 + }, + { + "epoch": 1.5434850083804084, + "grad_norm": 0.15824053757597684, + "learning_rate": 5.618151931281972e-05, + "loss": 2.8276, + "step": 24864 + }, + { + "epoch": 1.5435470854801663, + "grad_norm": 0.15364319521584216, + "learning_rate": 5.617793547891562e-05, + "loss": 2.748, + "step": 24865 + }, + { + "epoch": 1.5436091625799242, + "grad_norm": 0.14899164213111032, + "learning_rate": 5.617435161277975e-05, + "loss": 2.8348, + "step": 24866 + }, + { + "epoch": 1.5436712396796821, + "grad_norm": 0.15927920711697374, + "learning_rate": 5.6170767714430784e-05, + "loss": 2.7532, + "step": 24867 + }, + { + "epoch": 1.54373331677944, + "grad_norm": 0.14798787685226764, + "learning_rate": 5.616718378388742e-05, + "loss": 2.8175, + "step": 24868 + }, + { + "epoch": 1.543795393879198, + "grad_norm": 0.14940978025395923, + "learning_rate": 5.6163599821168365e-05, + "loss": 2.8006, + "step": 24869 + }, + { + "epoch": 1.5438574709789559, + "grad_norm": 0.14940578462695386, + "learning_rate": 5.61600158262923e-05, + "loss": 2.8298, + "step": 24870 + }, + { + "epoch": 1.5439195480787138, + "grad_norm": 0.16550819601303082, + "learning_rate": 5.615643179927796e-05, + "loss": 2.8335, + "step": 24871 + }, + { + "epoch": 1.5439816251784717, + "grad_norm": 0.14171420254082925, + "learning_rate": 5.615284774014401e-05, + "loss": 2.8889, + "step": 24872 + }, + { + "epoch": 1.5440437022782296, + "grad_norm": 0.14086986786176964, + "learning_rate": 5.6149263648909156e-05, + "loss": 2.7332, + "step": 24873 + }, + { + "epoch": 1.5441057793779875, + "grad_norm": 0.1498385613649953, + "learning_rate": 5.614567952559211e-05, + "loss": 2.8433, + "step": 24874 + }, + { + "epoch": 1.5441678564777455, + "grad_norm": 0.15069165384389094, + "learning_rate": 5.614209537021156e-05, + "loss": 2.7987, + "step": 24875 + }, + { + "epoch": 1.5442299335775034, + "grad_norm": 0.14178024068317172, + "learning_rate": 5.613851118278619e-05, + "loss": 2.8079, + "step": 24876 + }, + { + "epoch": 1.5442920106772613, + "grad_norm": 0.16101912909724725, + "learning_rate": 5.6134926963334746e-05, + "loss": 2.7714, + "step": 24877 + }, + { + "epoch": 1.5443540877770192, + "grad_norm": 0.1413555309847793, + "learning_rate": 5.613134271187589e-05, + "loss": 2.8215, + "step": 24878 + }, + { + "epoch": 1.544416164876777, + "grad_norm": 0.1567110768890027, + "learning_rate": 5.612775842842831e-05, + "loss": 2.8945, + "step": 24879 + }, + { + "epoch": 1.5444782419765348, + "grad_norm": 0.14200506270015414, + "learning_rate": 5.612417411301075e-05, + "loss": 2.95, + "step": 24880 + }, + { + "epoch": 1.5445403190762927, + "grad_norm": 0.14823425332047616, + "learning_rate": 5.612058976564187e-05, + "loss": 2.8007, + "step": 24881 + }, + { + "epoch": 1.5446023961760507, + "grad_norm": 0.1499484298887631, + "learning_rate": 5.6117005386340406e-05, + "loss": 2.7394, + "step": 24882 + }, + { + "epoch": 1.5446644732758086, + "grad_norm": 0.1626576722419115, + "learning_rate": 5.611342097512502e-05, + "loss": 2.7726, + "step": 24883 + }, + { + "epoch": 1.5447265503755665, + "grad_norm": 0.14331837101492342, + "learning_rate": 5.610983653201445e-05, + "loss": 2.8273, + "step": 24884 + }, + { + "epoch": 1.5447886274753242, + "grad_norm": 0.17203849364057985, + "learning_rate": 5.6106252057027364e-05, + "loss": 2.8327, + "step": 24885 + }, + { + "epoch": 1.5448507045750821, + "grad_norm": 0.14023555367274632, + "learning_rate": 5.610266755018249e-05, + "loss": 2.8266, + "step": 24886 + }, + { + "epoch": 1.54491278167484, + "grad_norm": 0.20617685718884057, + "learning_rate": 5.609908301149851e-05, + "loss": 2.8198, + "step": 24887 + }, + { + "epoch": 1.544974858774598, + "grad_norm": 0.163401196196831, + "learning_rate": 5.609549844099413e-05, + "loss": 2.7891, + "step": 24888 + }, + { + "epoch": 1.5450369358743559, + "grad_norm": 0.16353890906173513, + "learning_rate": 5.609191383868807e-05, + "loss": 2.8228, + "step": 24889 + }, + { + "epoch": 1.5450990129741138, + "grad_norm": 0.16270859974323626, + "learning_rate": 5.608832920459901e-05, + "loss": 2.751, + "step": 24890 + }, + { + "epoch": 1.5451610900738717, + "grad_norm": 0.22201616739350047, + "learning_rate": 5.608474453874566e-05, + "loss": 2.722, + "step": 24891 + }, + { + "epoch": 1.5452231671736296, + "grad_norm": 0.16584166219736632, + "learning_rate": 5.6081159841146714e-05, + "loss": 2.7977, + "step": 24892 + }, + { + "epoch": 1.5452852442733875, + "grad_norm": 0.17884341264993198, + "learning_rate": 5.607757511182089e-05, + "loss": 2.8348, + "step": 24893 + }, + { + "epoch": 1.5453473213731455, + "grad_norm": 0.16140078023891216, + "learning_rate": 5.6073990350786875e-05, + "loss": 2.7757, + "step": 24894 + }, + { + "epoch": 1.5454093984729034, + "grad_norm": 0.15607977177347598, + "learning_rate": 5.6070405558063386e-05, + "loss": 2.7811, + "step": 24895 + }, + { + "epoch": 1.5454714755726613, + "grad_norm": 0.1731539414414922, + "learning_rate": 5.606682073366911e-05, + "loss": 2.8398, + "step": 24896 + }, + { + "epoch": 1.5455335526724192, + "grad_norm": 0.15580634670639418, + "learning_rate": 5.6063235877622754e-05, + "loss": 2.8104, + "step": 24897 + }, + { + "epoch": 1.5455956297721771, + "grad_norm": 0.2087235293480139, + "learning_rate": 5.6059650989943036e-05, + "loss": 2.8537, + "step": 24898 + }, + { + "epoch": 1.545657706871935, + "grad_norm": 0.1747992797100325, + "learning_rate": 5.605606607064864e-05, + "loss": 2.8869, + "step": 24899 + }, + { + "epoch": 1.545719783971693, + "grad_norm": 0.15860103087789224, + "learning_rate": 5.605248111975828e-05, + "loss": 2.7966, + "step": 24900 + }, + { + "epoch": 1.5457818610714509, + "grad_norm": 0.16385760455476867, + "learning_rate": 5.604889613729065e-05, + "loss": 2.7791, + "step": 24901 + }, + { + "epoch": 1.5458439381712088, + "grad_norm": 0.1618103003170043, + "learning_rate": 5.6045311123264456e-05, + "loss": 2.8556, + "step": 24902 + }, + { + "epoch": 1.5459060152709665, + "grad_norm": 0.1581608162337412, + "learning_rate": 5.604172607769842e-05, + "loss": 2.7824, + "step": 24903 + }, + { + "epoch": 1.5459680923707244, + "grad_norm": 0.16257061318160684, + "learning_rate": 5.603814100061122e-05, + "loss": 2.7709, + "step": 24904 + }, + { + "epoch": 1.5460301694704823, + "grad_norm": 0.16485591748606812, + "learning_rate": 5.603455589202159e-05, + "loss": 2.9001, + "step": 24905 + }, + { + "epoch": 1.5460922465702402, + "grad_norm": 0.15331690474689652, + "learning_rate": 5.603097075194821e-05, + "loss": 2.8276, + "step": 24906 + }, + { + "epoch": 1.5461543236699982, + "grad_norm": 0.16917035103725694, + "learning_rate": 5.602738558040979e-05, + "loss": 2.8597, + "step": 24907 + }, + { + "epoch": 1.546216400769756, + "grad_norm": 0.16947637452129324, + "learning_rate": 5.602380037742503e-05, + "loss": 2.8052, + "step": 24908 + }, + { + "epoch": 1.5462784778695138, + "grad_norm": 0.1604299300354266, + "learning_rate": 5.602021514301265e-05, + "loss": 2.8529, + "step": 24909 + }, + { + "epoch": 1.5463405549692717, + "grad_norm": 0.15632330247912718, + "learning_rate": 5.601662987719134e-05, + "loss": 2.8078, + "step": 24910 + }, + { + "epoch": 1.5464026320690296, + "grad_norm": 0.1737813618798008, + "learning_rate": 5.601304457997981e-05, + "loss": 2.8182, + "step": 24911 + }, + { + "epoch": 1.5464647091687875, + "grad_norm": 0.17910563548541372, + "learning_rate": 5.600945925139677e-05, + "loss": 2.8525, + "step": 24912 + }, + { + "epoch": 1.5465267862685455, + "grad_norm": 0.14760189393065418, + "learning_rate": 5.600587389146093e-05, + "loss": 2.8666, + "step": 24913 + }, + { + "epoch": 1.5465888633683034, + "grad_norm": 0.15811211122598773, + "learning_rate": 5.600228850019098e-05, + "loss": 2.7305, + "step": 24914 + }, + { + "epoch": 1.5466509404680613, + "grad_norm": 0.17104845274760958, + "learning_rate": 5.599870307760563e-05, + "loss": 2.8519, + "step": 24915 + }, + { + "epoch": 1.5467130175678192, + "grad_norm": 0.17425315842167322, + "learning_rate": 5.59951176237236e-05, + "loss": 2.8584, + "step": 24916 + }, + { + "epoch": 1.5467750946675771, + "grad_norm": 0.16811364599979384, + "learning_rate": 5.5991532138563574e-05, + "loss": 2.8236, + "step": 24917 + }, + { + "epoch": 1.546837171767335, + "grad_norm": 0.17799223544767243, + "learning_rate": 5.598794662214427e-05, + "loss": 2.8355, + "step": 24918 + }, + { + "epoch": 1.546899248867093, + "grad_norm": 0.15394904974481247, + "learning_rate": 5.5984361074484403e-05, + "loss": 2.78, + "step": 24919 + }, + { + "epoch": 1.5469613259668509, + "grad_norm": 0.16729899436603693, + "learning_rate": 5.5980775495602666e-05, + "loss": 2.8924, + "step": 24920 + }, + { + "epoch": 1.5470234030666088, + "grad_norm": 0.1566310349285113, + "learning_rate": 5.597718988551779e-05, + "loss": 2.7864, + "step": 24921 + }, + { + "epoch": 1.5470854801663667, + "grad_norm": 0.23302685515106572, + "learning_rate": 5.5973604244248435e-05, + "loss": 2.7845, + "step": 24922 + }, + { + "epoch": 1.5471475572661246, + "grad_norm": 0.17662783844301064, + "learning_rate": 5.5970018571813366e-05, + "loss": 2.7626, + "step": 24923 + }, + { + "epoch": 1.5472096343658825, + "grad_norm": 0.17804323335800873, + "learning_rate": 5.5966432868231235e-05, + "loss": 2.7829, + "step": 24924 + }, + { + "epoch": 1.5472717114656405, + "grad_norm": 0.16138573650055194, + "learning_rate": 5.596284713352079e-05, + "loss": 2.7545, + "step": 24925 + }, + { + "epoch": 1.5473337885653984, + "grad_norm": 0.16021401247587588, + "learning_rate": 5.5959261367700714e-05, + "loss": 2.7427, + "step": 24926 + }, + { + "epoch": 1.547395865665156, + "grad_norm": 0.16921266448859812, + "learning_rate": 5.5955675570789735e-05, + "loss": 2.8359, + "step": 24927 + }, + { + "epoch": 1.547457942764914, + "grad_norm": 0.15467927723039307, + "learning_rate": 5.595208974280654e-05, + "loss": 2.7861, + "step": 24928 + }, + { + "epoch": 1.547520019864672, + "grad_norm": 0.16060649572278599, + "learning_rate": 5.594850388376985e-05, + "loss": 2.8623, + "step": 24929 + }, + { + "epoch": 1.5475820969644298, + "grad_norm": 0.16177866420186238, + "learning_rate": 5.5944917993698376e-05, + "loss": 2.8, + "step": 24930 + }, + { + "epoch": 1.5476441740641877, + "grad_norm": 0.1509210965817451, + "learning_rate": 5.594133207261083e-05, + "loss": 2.7827, + "step": 24931 + }, + { + "epoch": 1.5477062511639457, + "grad_norm": 0.16471339151629413, + "learning_rate": 5.59377461205259e-05, + "loss": 2.9055, + "step": 24932 + }, + { + "epoch": 1.5477683282637034, + "grad_norm": 0.14754029216662967, + "learning_rate": 5.593416013746231e-05, + "loss": 2.8308, + "step": 24933 + }, + { + "epoch": 1.5478304053634613, + "grad_norm": 0.16837560371900986, + "learning_rate": 5.5930574123438774e-05, + "loss": 2.8046, + "step": 24934 + }, + { + "epoch": 1.5478924824632192, + "grad_norm": 0.14360755803172073, + "learning_rate": 5.5926988078473966e-05, + "loss": 2.7133, + "step": 24935 + }, + { + "epoch": 1.5479545595629771, + "grad_norm": 0.20685191754648044, + "learning_rate": 5.592340200258664e-05, + "loss": 2.7274, + "step": 24936 + }, + { + "epoch": 1.548016636662735, + "grad_norm": 0.16765238110396938, + "learning_rate": 5.591981589579549e-05, + "loss": 2.7813, + "step": 24937 + }, + { + "epoch": 1.548078713762493, + "grad_norm": 0.1957842279985572, + "learning_rate": 5.591622975811923e-05, + "loss": 2.8181, + "step": 24938 + }, + { + "epoch": 1.5481407908622509, + "grad_norm": 0.15586087529696058, + "learning_rate": 5.5912643589576566e-05, + "loss": 2.7901, + "step": 24939 + }, + { + "epoch": 1.5482028679620088, + "grad_norm": 0.21994844042436906, + "learning_rate": 5.590905739018618e-05, + "loss": 2.8679, + "step": 24940 + }, + { + "epoch": 1.5482649450617667, + "grad_norm": 0.16229602111190736, + "learning_rate": 5.590547115996683e-05, + "loss": 2.7716, + "step": 24941 + }, + { + "epoch": 1.5483270221615246, + "grad_norm": 0.20627706740238722, + "learning_rate": 5.5901884898937185e-05, + "loss": 2.8895, + "step": 24942 + }, + { + "epoch": 1.5483890992612825, + "grad_norm": 0.16750939900623837, + "learning_rate": 5.5898298607115996e-05, + "loss": 2.9163, + "step": 24943 + }, + { + "epoch": 1.5484511763610405, + "grad_norm": 0.17712735699409243, + "learning_rate": 5.5894712284521936e-05, + "loss": 2.7621, + "step": 24944 + }, + { + "epoch": 1.5485132534607984, + "grad_norm": 0.19769771389802024, + "learning_rate": 5.589112593117374e-05, + "loss": 2.7051, + "step": 24945 + }, + { + "epoch": 1.5485753305605563, + "grad_norm": 0.19282934968425147, + "learning_rate": 5.588753954709009e-05, + "loss": 2.7179, + "step": 24946 + }, + { + "epoch": 1.5486374076603142, + "grad_norm": 0.16958252135893928, + "learning_rate": 5.5883953132289735e-05, + "loss": 2.7686, + "step": 24947 + }, + { + "epoch": 1.5486994847600721, + "grad_norm": 0.18631931939900684, + "learning_rate": 5.5880366686791364e-05, + "loss": 2.8385, + "step": 24948 + }, + { + "epoch": 1.54876156185983, + "grad_norm": 0.1694485990272491, + "learning_rate": 5.58767802106137e-05, + "loss": 2.8462, + "step": 24949 + }, + { + "epoch": 1.548823638959588, + "grad_norm": 0.1904163355914882, + "learning_rate": 5.587319370377543e-05, + "loss": 2.8694, + "step": 24950 + }, + { + "epoch": 1.5488857160593457, + "grad_norm": 0.1460479146790405, + "learning_rate": 5.586960716629529e-05, + "loss": 2.7439, + "step": 24951 + }, + { + "epoch": 1.5489477931591036, + "grad_norm": 0.1658281277274025, + "learning_rate": 5.586602059819199e-05, + "loss": 2.8186, + "step": 24952 + }, + { + "epoch": 1.5490098702588615, + "grad_norm": 0.15244646766563918, + "learning_rate": 5.5862433999484234e-05, + "loss": 2.8248, + "step": 24953 + }, + { + "epoch": 1.5490719473586194, + "grad_norm": 0.15907839612279198, + "learning_rate": 5.585884737019074e-05, + "loss": 2.7957, + "step": 24954 + }, + { + "epoch": 1.5491340244583773, + "grad_norm": 0.18816390318068907, + "learning_rate": 5.58552607103302e-05, + "loss": 2.9821, + "step": 24955 + }, + { + "epoch": 1.5491961015581353, + "grad_norm": 0.15481718676591247, + "learning_rate": 5.585167401992136e-05, + "loss": 2.7797, + "step": 24956 + }, + { + "epoch": 1.549258178657893, + "grad_norm": 0.19506889410029182, + "learning_rate": 5.584808729898292e-05, + "loss": 2.8656, + "step": 24957 + }, + { + "epoch": 1.5493202557576509, + "grad_norm": 0.15091134003598106, + "learning_rate": 5.584450054753357e-05, + "loss": 2.7698, + "step": 24958 + }, + { + "epoch": 1.5493823328574088, + "grad_norm": 0.15518029736332428, + "learning_rate": 5.584091376559206e-05, + "loss": 2.7793, + "step": 24959 + }, + { + "epoch": 1.5494444099571667, + "grad_norm": 0.1691250203623199, + "learning_rate": 5.5837326953177085e-05, + "loss": 2.8155, + "step": 24960 + }, + { + "epoch": 1.5495064870569246, + "grad_norm": 0.16339889923913234, + "learning_rate": 5.5833740110307356e-05, + "loss": 2.8403, + "step": 24961 + }, + { + "epoch": 1.5495685641566825, + "grad_norm": 0.16820326225285792, + "learning_rate": 5.583015323700157e-05, + "loss": 2.8157, + "step": 24962 + }, + { + "epoch": 1.5496306412564405, + "grad_norm": 0.1492268817243741, + "learning_rate": 5.582656633327849e-05, + "loss": 2.7949, + "step": 24963 + }, + { + "epoch": 1.5496927183561984, + "grad_norm": 0.15803303228485535, + "learning_rate": 5.5822979399156775e-05, + "loss": 2.7165, + "step": 24964 + }, + { + "epoch": 1.5497547954559563, + "grad_norm": 0.16080640620143571, + "learning_rate": 5.581939243465517e-05, + "loss": 2.7262, + "step": 24965 + }, + { + "epoch": 1.5498168725557142, + "grad_norm": 0.15604101152138208, + "learning_rate": 5.5815805439792377e-05, + "loss": 2.7023, + "step": 24966 + }, + { + "epoch": 1.5498789496554721, + "grad_norm": 0.16807987728803062, + "learning_rate": 5.581221841458712e-05, + "loss": 2.7712, + "step": 24967 + }, + { + "epoch": 1.54994102675523, + "grad_norm": 0.1718779260212916, + "learning_rate": 5.5808631359058104e-05, + "loss": 2.8327, + "step": 24968 + }, + { + "epoch": 1.550003103854988, + "grad_norm": 0.16599197219200532, + "learning_rate": 5.580504427322405e-05, + "loss": 2.83, + "step": 24969 + }, + { + "epoch": 1.5500651809547459, + "grad_norm": 0.1654108757393468, + "learning_rate": 5.580145715710367e-05, + "loss": 2.7961, + "step": 24970 + }, + { + "epoch": 1.5501272580545038, + "grad_norm": 0.1656398109795308, + "learning_rate": 5.579787001071569e-05, + "loss": 2.8439, + "step": 24971 + }, + { + "epoch": 1.5501893351542617, + "grad_norm": 0.18183991564077515, + "learning_rate": 5.579428283407882e-05, + "loss": 2.8104, + "step": 24972 + }, + { + "epoch": 1.5502514122540196, + "grad_norm": 0.161629811999696, + "learning_rate": 5.5790695627211743e-05, + "loss": 2.7426, + "step": 24973 + }, + { + "epoch": 1.5503134893537776, + "grad_norm": 0.18692673029369802, + "learning_rate": 5.5787108390133226e-05, + "loss": 2.8846, + "step": 24974 + }, + { + "epoch": 1.5503755664535352, + "grad_norm": 0.1826275399277532, + "learning_rate": 5.5783521122861935e-05, + "loss": 2.7822, + "step": 24975 + }, + { + "epoch": 1.5504376435532932, + "grad_norm": 0.16228440039735637, + "learning_rate": 5.577993382541664e-05, + "loss": 2.7962, + "step": 24976 + }, + { + "epoch": 1.550499720653051, + "grad_norm": 0.16497429022470844, + "learning_rate": 5.577634649781601e-05, + "loss": 2.7658, + "step": 24977 + }, + { + "epoch": 1.550561797752809, + "grad_norm": 0.1604899442395533, + "learning_rate": 5.5772759140078775e-05, + "loss": 2.8282, + "step": 24978 + }, + { + "epoch": 1.550623874852567, + "grad_norm": 0.17740512575670553, + "learning_rate": 5.576917175222367e-05, + "loss": 2.7806, + "step": 24979 + }, + { + "epoch": 1.5506859519523248, + "grad_norm": 0.1537850800963412, + "learning_rate": 5.5765584334269386e-05, + "loss": 2.811, + "step": 24980 + }, + { + "epoch": 1.5507480290520825, + "grad_norm": 0.1673970195300556, + "learning_rate": 5.576199688623464e-05, + "loss": 2.8752, + "step": 24981 + }, + { + "epoch": 1.5508101061518405, + "grad_norm": 0.15916397157562645, + "learning_rate": 5.5758409408138166e-05, + "loss": 2.7042, + "step": 24982 + }, + { + "epoch": 1.5508721832515984, + "grad_norm": 0.15280119977994616, + "learning_rate": 5.575482189999868e-05, + "loss": 2.8769, + "step": 24983 + }, + { + "epoch": 1.5509342603513563, + "grad_norm": 0.14565982175113074, + "learning_rate": 5.575123436183487e-05, + "loss": 2.7594, + "step": 24984 + }, + { + "epoch": 1.5509963374511142, + "grad_norm": 0.16772007276682602, + "learning_rate": 5.574764679366549e-05, + "loss": 2.8125, + "step": 24985 + }, + { + "epoch": 1.5510584145508721, + "grad_norm": 0.17197246389037102, + "learning_rate": 5.574405919550923e-05, + "loss": 2.7746, + "step": 24986 + }, + { + "epoch": 1.55112049165063, + "grad_norm": 0.165485528711204, + "learning_rate": 5.574047156738483e-05, + "loss": 2.8406, + "step": 24987 + }, + { + "epoch": 1.551182568750388, + "grad_norm": 0.152358830757018, + "learning_rate": 5.5736883909311e-05, + "loss": 2.8381, + "step": 24988 + }, + { + "epoch": 1.5512446458501459, + "grad_norm": 0.1496953062578404, + "learning_rate": 5.573329622130644e-05, + "loss": 2.7667, + "step": 24989 + }, + { + "epoch": 1.5513067229499038, + "grad_norm": 0.1553814230601717, + "learning_rate": 5.5729708503389896e-05, + "loss": 2.8401, + "step": 24990 + }, + { + "epoch": 1.5513688000496617, + "grad_norm": 0.14981236071375234, + "learning_rate": 5.5726120755580057e-05, + "loss": 2.8114, + "step": 24991 + }, + { + "epoch": 1.5514308771494196, + "grad_norm": 0.15345819311392775, + "learning_rate": 5.572253297789567e-05, + "loss": 2.7704, + "step": 24992 + }, + { + "epoch": 1.5514929542491775, + "grad_norm": 0.15294892590801257, + "learning_rate": 5.571894517035543e-05, + "loss": 2.7938, + "step": 24993 + }, + { + "epoch": 1.5515550313489355, + "grad_norm": 0.1410304621765968, + "learning_rate": 5.571535733297807e-05, + "loss": 2.7934, + "step": 24994 + }, + { + "epoch": 1.5516171084486934, + "grad_norm": 0.14471149803761094, + "learning_rate": 5.5711769465782306e-05, + "loss": 2.8849, + "step": 24995 + }, + { + "epoch": 1.5516791855484513, + "grad_norm": 0.14494659053699818, + "learning_rate": 5.570818156878684e-05, + "loss": 2.7834, + "step": 24996 + }, + { + "epoch": 1.5517412626482092, + "grad_norm": 0.15200041326863487, + "learning_rate": 5.570459364201043e-05, + "loss": 2.8224, + "step": 24997 + }, + { + "epoch": 1.5518033397479671, + "grad_norm": 0.14701810241883853, + "learning_rate": 5.570100568547174e-05, + "loss": 2.8448, + "step": 24998 + }, + { + "epoch": 1.5518654168477248, + "grad_norm": 0.16430142013169108, + "learning_rate": 5.5697417699189546e-05, + "loss": 2.7292, + "step": 24999 + }, + { + "epoch": 1.5519274939474828, + "grad_norm": 0.15009936081682212, + "learning_rate": 5.569382968318252e-05, + "loss": 2.8096, + "step": 25000 + }, + { + "epoch": 1.5519895710472407, + "grad_norm": 0.20101608799667203, + "learning_rate": 5.569024163746941e-05, + "loss": 2.6904, + "step": 25001 + }, + { + "epoch": 1.5520516481469986, + "grad_norm": 0.16944955536251313, + "learning_rate": 5.5686653562068934e-05, + "loss": 2.8333, + "step": 25002 + }, + { + "epoch": 1.5521137252467565, + "grad_norm": 0.18460138251155278, + "learning_rate": 5.5683065456999794e-05, + "loss": 2.7434, + "step": 25003 + }, + { + "epoch": 1.5521758023465144, + "grad_norm": 0.1908894002862183, + "learning_rate": 5.567947732228074e-05, + "loss": 2.8466, + "step": 25004 + }, + { + "epoch": 1.5522378794462721, + "grad_norm": 0.15412038052420096, + "learning_rate": 5.567588915793045e-05, + "loss": 2.786, + "step": 25005 + }, + { + "epoch": 1.55229995654603, + "grad_norm": 0.15356406498361122, + "learning_rate": 5.567230096396768e-05, + "loss": 2.7728, + "step": 25006 + }, + { + "epoch": 1.552362033645788, + "grad_norm": 0.18109227544399803, + "learning_rate": 5.566871274041114e-05, + "loss": 2.8323, + "step": 25007 + }, + { + "epoch": 1.5524241107455459, + "grad_norm": 0.17420310390676552, + "learning_rate": 5.566512448727955e-05, + "loss": 2.8107, + "step": 25008 + }, + { + "epoch": 1.5524861878453038, + "grad_norm": 0.16120147398154683, + "learning_rate": 5.566153620459163e-05, + "loss": 2.902, + "step": 25009 + }, + { + "epoch": 1.5525482649450617, + "grad_norm": 0.15375345846488767, + "learning_rate": 5.56579478923661e-05, + "loss": 2.8199, + "step": 25010 + }, + { + "epoch": 1.5526103420448196, + "grad_norm": 0.17597159570253593, + "learning_rate": 5.5654359550621684e-05, + "loss": 2.8056, + "step": 25011 + }, + { + "epoch": 1.5526724191445775, + "grad_norm": 0.15190806198726642, + "learning_rate": 5.56507711793771e-05, + "loss": 2.8463, + "step": 25012 + }, + { + "epoch": 1.5527344962443355, + "grad_norm": 0.16507890321987734, + "learning_rate": 5.5647182778651085e-05, + "loss": 2.7845, + "step": 25013 + }, + { + "epoch": 1.5527965733440934, + "grad_norm": 0.16103400466534948, + "learning_rate": 5.564359434846232e-05, + "loss": 2.798, + "step": 25014 + }, + { + "epoch": 1.5528586504438513, + "grad_norm": 0.1577439945550659, + "learning_rate": 5.564000588882958e-05, + "loss": 2.8895, + "step": 25015 + }, + { + "epoch": 1.5529207275436092, + "grad_norm": 0.16704076763046577, + "learning_rate": 5.5636417399771546e-05, + "loss": 2.8339, + "step": 25016 + }, + { + "epoch": 1.5529828046433671, + "grad_norm": 0.15756427439243204, + "learning_rate": 5.563282888130694e-05, + "loss": 2.8728, + "step": 25017 + }, + { + "epoch": 1.553044881743125, + "grad_norm": 0.17521156609879543, + "learning_rate": 5.562924033345451e-05, + "loss": 2.7366, + "step": 25018 + }, + { + "epoch": 1.553106958842883, + "grad_norm": 0.1755504205190353, + "learning_rate": 5.562565175623298e-05, + "loss": 2.758, + "step": 25019 + }, + { + "epoch": 1.5531690359426409, + "grad_norm": 0.14712381668838448, + "learning_rate": 5.5622063149661055e-05, + "loss": 2.7866, + "step": 25020 + }, + { + "epoch": 1.5532311130423988, + "grad_norm": 0.16286512948353193, + "learning_rate": 5.5618474513757445e-05, + "loss": 2.8219, + "step": 25021 + }, + { + "epoch": 1.5532931901421567, + "grad_norm": 0.17336372963110794, + "learning_rate": 5.561488584854091e-05, + "loss": 2.8154, + "step": 25022 + }, + { + "epoch": 1.5533552672419144, + "grad_norm": 0.15273341449328212, + "learning_rate": 5.561129715403013e-05, + "loss": 2.8154, + "step": 25023 + }, + { + "epoch": 1.5534173443416723, + "grad_norm": 0.15516818575938368, + "learning_rate": 5.5607708430243875e-05, + "loss": 2.8291, + "step": 25024 + }, + { + "epoch": 1.5534794214414303, + "grad_norm": 0.15217446433916693, + "learning_rate": 5.5604119677200826e-05, + "loss": 2.8341, + "step": 25025 + }, + { + "epoch": 1.5535414985411882, + "grad_norm": 0.14362788766941076, + "learning_rate": 5.560053089491973e-05, + "loss": 2.7952, + "step": 25026 + }, + { + "epoch": 1.553603575640946, + "grad_norm": 0.1825245402931064, + "learning_rate": 5.55969420834193e-05, + "loss": 2.9398, + "step": 25027 + }, + { + "epoch": 1.553665652740704, + "grad_norm": 0.15393287844657114, + "learning_rate": 5.559335324271828e-05, + "loss": 2.7709, + "step": 25028 + }, + { + "epoch": 1.5537277298404617, + "grad_norm": 0.1969980370089717, + "learning_rate": 5.5589764372835364e-05, + "loss": 2.8468, + "step": 25029 + }, + { + "epoch": 1.5537898069402196, + "grad_norm": 0.16844630552782677, + "learning_rate": 5.558617547378929e-05, + "loss": 2.8296, + "step": 25030 + }, + { + "epoch": 1.5538518840399775, + "grad_norm": 0.16169982905433397, + "learning_rate": 5.5582586545598804e-05, + "loss": 2.7466, + "step": 25031 + }, + { + "epoch": 1.5539139611397355, + "grad_norm": 0.1732336605601263, + "learning_rate": 5.557899758828258e-05, + "loss": 2.8635, + "step": 25032 + }, + { + "epoch": 1.5539760382394934, + "grad_norm": 0.16143056351258256, + "learning_rate": 5.557540860185938e-05, + "loss": 2.7785, + "step": 25033 + }, + { + "epoch": 1.5540381153392513, + "grad_norm": 0.17800266850271437, + "learning_rate": 5.557181958634792e-05, + "loss": 2.8449, + "step": 25034 + }, + { + "epoch": 1.5541001924390092, + "grad_norm": 0.16833166903457547, + "learning_rate": 5.556823054176693e-05, + "loss": 2.8348, + "step": 25035 + }, + { + "epoch": 1.5541622695387671, + "grad_norm": 0.16979340377359775, + "learning_rate": 5.556464146813511e-05, + "loss": 2.7741, + "step": 25036 + }, + { + "epoch": 1.554224346638525, + "grad_norm": 0.15575419081909553, + "learning_rate": 5.556105236547123e-05, + "loss": 2.8238, + "step": 25037 + }, + { + "epoch": 1.554286423738283, + "grad_norm": 0.17783061764871172, + "learning_rate": 5.555746323379399e-05, + "loss": 2.8153, + "step": 25038 + }, + { + "epoch": 1.5543485008380409, + "grad_norm": 0.16194367270297594, + "learning_rate": 5.5553874073122104e-05, + "loss": 2.8365, + "step": 25039 + }, + { + "epoch": 1.5544105779377988, + "grad_norm": 0.1673043384232176, + "learning_rate": 5.5550284883474316e-05, + "loss": 2.8186, + "step": 25040 + }, + { + "epoch": 1.5544726550375567, + "grad_norm": 0.15533465851958123, + "learning_rate": 5.554669566486933e-05, + "loss": 2.838, + "step": 25041 + }, + { + "epoch": 1.5545347321373146, + "grad_norm": 0.15704813600724082, + "learning_rate": 5.5543106417325906e-05, + "loss": 2.7939, + "step": 25042 + }, + { + "epoch": 1.5545968092370726, + "grad_norm": 0.17438358860843792, + "learning_rate": 5.553951714086273e-05, + "loss": 2.7918, + "step": 25043 + }, + { + "epoch": 1.5546588863368305, + "grad_norm": 0.18149846163930575, + "learning_rate": 5.5535927835498566e-05, + "loss": 2.7801, + "step": 25044 + }, + { + "epoch": 1.5547209634365884, + "grad_norm": 0.15573966010621249, + "learning_rate": 5.553233850125211e-05, + "loss": 2.8836, + "step": 25045 + }, + { + "epoch": 1.5547830405363463, + "grad_norm": 0.159062289610688, + "learning_rate": 5.552874913814211e-05, + "loss": 2.7769, + "step": 25046 + }, + { + "epoch": 1.554845117636104, + "grad_norm": 0.1489254435551051, + "learning_rate": 5.5525159746187274e-05, + "loss": 2.8993, + "step": 25047 + }, + { + "epoch": 1.554907194735862, + "grad_norm": 0.15555883366412748, + "learning_rate": 5.5521570325406347e-05, + "loss": 2.772, + "step": 25048 + }, + { + "epoch": 1.5549692718356198, + "grad_norm": 0.1448809221400988, + "learning_rate": 5.5517980875818045e-05, + "loss": 2.8375, + "step": 25049 + }, + { + "epoch": 1.5550313489353778, + "grad_norm": 0.15658702860924406, + "learning_rate": 5.551439139744108e-05, + "loss": 2.7274, + "step": 25050 + }, + { + "epoch": 1.5550934260351357, + "grad_norm": 0.14962559106657453, + "learning_rate": 5.551080189029422e-05, + "loss": 2.7901, + "step": 25051 + }, + { + "epoch": 1.5551555031348936, + "grad_norm": 0.14744641693571542, + "learning_rate": 5.5507212354396155e-05, + "loss": 2.784, + "step": 25052 + }, + { + "epoch": 1.5552175802346513, + "grad_norm": 0.1602191336388903, + "learning_rate": 5.550362278976564e-05, + "loss": 2.8122, + "step": 25053 + }, + { + "epoch": 1.5552796573344092, + "grad_norm": 0.1466749794987966, + "learning_rate": 5.5500033196421375e-05, + "loss": 2.7331, + "step": 25054 + }, + { + "epoch": 1.5553417344341671, + "grad_norm": 0.15835951504468254, + "learning_rate": 5.549644357438212e-05, + "loss": 2.8772, + "step": 25055 + }, + { + "epoch": 1.555403811533925, + "grad_norm": 0.15506231602146991, + "learning_rate": 5.5492853923666576e-05, + "loss": 2.7648, + "step": 25056 + }, + { + "epoch": 1.555465888633683, + "grad_norm": 0.14920490560169225, + "learning_rate": 5.548926424429347e-05, + "loss": 2.799, + "step": 25057 + }, + { + "epoch": 1.5555279657334409, + "grad_norm": 0.15887153965635434, + "learning_rate": 5.548567453628155e-05, + "loss": 2.8449, + "step": 25058 + }, + { + "epoch": 1.5555900428331988, + "grad_norm": 0.1467784853628791, + "learning_rate": 5.548208479964952e-05, + "loss": 2.7354, + "step": 25059 + }, + { + "epoch": 1.5556521199329567, + "grad_norm": 0.15101836219315967, + "learning_rate": 5.547849503441614e-05, + "loss": 2.8238, + "step": 25060 + }, + { + "epoch": 1.5557141970327146, + "grad_norm": 0.1592581502926285, + "learning_rate": 5.547490524060011e-05, + "loss": 2.8372, + "step": 25061 + }, + { + "epoch": 1.5557762741324725, + "grad_norm": 0.16971751233940086, + "learning_rate": 5.547131541822018e-05, + "loss": 2.7551, + "step": 25062 + }, + { + "epoch": 1.5558383512322305, + "grad_norm": 0.155592578427282, + "learning_rate": 5.546772556729506e-05, + "loss": 2.8366, + "step": 25063 + }, + { + "epoch": 1.5559004283319884, + "grad_norm": 0.1525898193237482, + "learning_rate": 5.54641356878435e-05, + "loss": 2.8012, + "step": 25064 + }, + { + "epoch": 1.5559625054317463, + "grad_norm": 0.1565996192474718, + "learning_rate": 5.54605457798842e-05, + "loss": 2.8106, + "step": 25065 + }, + { + "epoch": 1.5560245825315042, + "grad_norm": 0.1459867480804856, + "learning_rate": 5.545695584343591e-05, + "loss": 2.7197, + "step": 25066 + }, + { + "epoch": 1.5560866596312621, + "grad_norm": 0.18702351107158288, + "learning_rate": 5.545336587851737e-05, + "loss": 2.9228, + "step": 25067 + }, + { + "epoch": 1.55614873673102, + "grad_norm": 0.1499756003443, + "learning_rate": 5.544977588514728e-05, + "loss": 2.8135, + "step": 25068 + }, + { + "epoch": 1.556210813830778, + "grad_norm": 0.17670499950510615, + "learning_rate": 5.54461858633444e-05, + "loss": 2.8031, + "step": 25069 + }, + { + "epoch": 1.556272890930536, + "grad_norm": 0.1588220615984008, + "learning_rate": 5.544259581312744e-05, + "loss": 2.837, + "step": 25070 + }, + { + "epoch": 1.5563349680302936, + "grad_norm": 0.15764961429086596, + "learning_rate": 5.5439005734515146e-05, + "loss": 2.7431, + "step": 25071 + }, + { + "epoch": 1.5563970451300515, + "grad_norm": 0.15875886510328563, + "learning_rate": 5.543541562752623e-05, + "loss": 2.8005, + "step": 25072 + }, + { + "epoch": 1.5564591222298094, + "grad_norm": 0.15352677457666236, + "learning_rate": 5.543182549217943e-05, + "loss": 2.8171, + "step": 25073 + }, + { + "epoch": 1.5565211993295673, + "grad_norm": 0.14694789665246188, + "learning_rate": 5.542823532849349e-05, + "loss": 2.7566, + "step": 25074 + }, + { + "epoch": 1.5565832764293253, + "grad_norm": 0.18190873775344021, + "learning_rate": 5.5424645136487116e-05, + "loss": 2.7758, + "step": 25075 + }, + { + "epoch": 1.5566453535290832, + "grad_norm": 0.14167387858667754, + "learning_rate": 5.5421054916179063e-05, + "loss": 2.8667, + "step": 25076 + }, + { + "epoch": 1.5567074306288409, + "grad_norm": 0.1548488437784791, + "learning_rate": 5.541746466758804e-05, + "loss": 2.7524, + "step": 25077 + }, + { + "epoch": 1.5567695077285988, + "grad_norm": 0.14860981426188133, + "learning_rate": 5.541387439073279e-05, + "loss": 2.7783, + "step": 25078 + }, + { + "epoch": 1.5568315848283567, + "grad_norm": 0.18320965663759367, + "learning_rate": 5.541028408563204e-05, + "loss": 2.8191, + "step": 25079 + }, + { + "epoch": 1.5568936619281146, + "grad_norm": 0.15779882740052245, + "learning_rate": 5.540669375230453e-05, + "loss": 2.7676, + "step": 25080 + }, + { + "epoch": 1.5569557390278725, + "grad_norm": 0.157012295883832, + "learning_rate": 5.540310339076899e-05, + "loss": 2.8425, + "step": 25081 + }, + { + "epoch": 1.5570178161276305, + "grad_norm": 0.14967385936695568, + "learning_rate": 5.539951300104412e-05, + "loss": 2.7998, + "step": 25082 + }, + { + "epoch": 1.5570798932273884, + "grad_norm": 0.15528623060390678, + "learning_rate": 5.539592258314872e-05, + "loss": 2.8664, + "step": 25083 + }, + { + "epoch": 1.5571419703271463, + "grad_norm": 0.16193341276457546, + "learning_rate": 5.539233213710145e-05, + "loss": 2.7691, + "step": 25084 + }, + { + "epoch": 1.5572040474269042, + "grad_norm": 0.15663682619087976, + "learning_rate": 5.538874166292109e-05, + "loss": 2.9212, + "step": 25085 + }, + { + "epoch": 1.5572661245266621, + "grad_norm": 0.15320915396079562, + "learning_rate": 5.538515116062636e-05, + "loss": 2.7791, + "step": 25086 + }, + { + "epoch": 1.55732820162642, + "grad_norm": 0.1473792272880366, + "learning_rate": 5.5381560630235985e-05, + "loss": 2.7548, + "step": 25087 + }, + { + "epoch": 1.557390278726178, + "grad_norm": 0.14905164434182744, + "learning_rate": 5.5377970071768704e-05, + "loss": 2.7122, + "step": 25088 + }, + { + "epoch": 1.5574523558259359, + "grad_norm": 0.15815815463047045, + "learning_rate": 5.5374379485243246e-05, + "loss": 2.8463, + "step": 25089 + }, + { + "epoch": 1.5575144329256938, + "grad_norm": 0.14383041734334653, + "learning_rate": 5.5370788870678335e-05, + "loss": 2.7866, + "step": 25090 + }, + { + "epoch": 1.5575765100254517, + "grad_norm": 0.14570553677892042, + "learning_rate": 5.5367198228092734e-05, + "loss": 2.8667, + "step": 25091 + }, + { + "epoch": 1.5576385871252096, + "grad_norm": 0.1459952580865017, + "learning_rate": 5.536360755750515e-05, + "loss": 2.835, + "step": 25092 + }, + { + "epoch": 1.5577006642249676, + "grad_norm": 0.1486123962015025, + "learning_rate": 5.536001685893432e-05, + "loss": 2.8746, + "step": 25093 + }, + { + "epoch": 1.5577627413247255, + "grad_norm": 0.14488798655231694, + "learning_rate": 5.5356426132398977e-05, + "loss": 2.7529, + "step": 25094 + }, + { + "epoch": 1.5578248184244832, + "grad_norm": 0.14372115312223185, + "learning_rate": 5.5352835377917856e-05, + "loss": 2.7073, + "step": 25095 + }, + { + "epoch": 1.557886895524241, + "grad_norm": 0.15090474274783736, + "learning_rate": 5.534924459550971e-05, + "loss": 2.8829, + "step": 25096 + }, + { + "epoch": 1.557948972623999, + "grad_norm": 0.13638372243982042, + "learning_rate": 5.534565378519324e-05, + "loss": 2.7403, + "step": 25097 + }, + { + "epoch": 1.558011049723757, + "grad_norm": 0.1571439620842652, + "learning_rate": 5.5342062946987195e-05, + "loss": 2.8585, + "step": 25098 + }, + { + "epoch": 1.5580731268235148, + "grad_norm": 0.14962803721751294, + "learning_rate": 5.533847208091032e-05, + "loss": 2.8868, + "step": 25099 + }, + { + "epoch": 1.5581352039232728, + "grad_norm": 0.14055487420737225, + "learning_rate": 5.5334881186981336e-05, + "loss": 2.7182, + "step": 25100 + }, + { + "epoch": 1.5581972810230305, + "grad_norm": 0.15836030310540905, + "learning_rate": 5.5331290265218995e-05, + "loss": 2.8375, + "step": 25101 + }, + { + "epoch": 1.5582593581227884, + "grad_norm": 0.14222196617127733, + "learning_rate": 5.5327699315642e-05, + "loss": 2.7895, + "step": 25102 + }, + { + "epoch": 1.5583214352225463, + "grad_norm": 0.162998335832748, + "learning_rate": 5.532410833826912e-05, + "loss": 2.897, + "step": 25103 + }, + { + "epoch": 1.5583835123223042, + "grad_norm": 0.16305440609313507, + "learning_rate": 5.532051733311906e-05, + "loss": 2.8398, + "step": 25104 + }, + { + "epoch": 1.5584455894220621, + "grad_norm": 0.15193895210127703, + "learning_rate": 5.5316926300210584e-05, + "loss": 2.7515, + "step": 25105 + }, + { + "epoch": 1.55850766652182, + "grad_norm": 0.1512448031442998, + "learning_rate": 5.531333523956239e-05, + "loss": 2.8908, + "step": 25106 + }, + { + "epoch": 1.558569743621578, + "grad_norm": 0.14017806290620197, + "learning_rate": 5.5309744151193264e-05, + "loss": 2.8228, + "step": 25107 + }, + { + "epoch": 1.5586318207213359, + "grad_norm": 0.20661027862171208, + "learning_rate": 5.530615303512189e-05, + "loss": 2.7926, + "step": 25108 + }, + { + "epoch": 1.5586938978210938, + "grad_norm": 0.15762708720300833, + "learning_rate": 5.5302561891367046e-05, + "loss": 2.8861, + "step": 25109 + }, + { + "epoch": 1.5587559749208517, + "grad_norm": 0.1373098402290951, + "learning_rate": 5.529897071994745e-05, + "loss": 2.7756, + "step": 25110 + }, + { + "epoch": 1.5588180520206096, + "grad_norm": 0.14740702293998825, + "learning_rate": 5.5295379520881815e-05, + "loss": 2.6893, + "step": 25111 + }, + { + "epoch": 1.5588801291203676, + "grad_norm": 0.14720473864818542, + "learning_rate": 5.529178829418893e-05, + "loss": 2.8119, + "step": 25112 + }, + { + "epoch": 1.5589422062201255, + "grad_norm": 0.17022997230274095, + "learning_rate": 5.528819703988748e-05, + "loss": 2.7664, + "step": 25113 + }, + { + "epoch": 1.5590042833198834, + "grad_norm": 0.13842445786784893, + "learning_rate": 5.528460575799622e-05, + "loss": 2.8538, + "step": 25114 + }, + { + "epoch": 1.5590663604196413, + "grad_norm": 0.16417047376280675, + "learning_rate": 5.528101444853388e-05, + "loss": 2.776, + "step": 25115 + }, + { + "epoch": 1.5591284375193992, + "grad_norm": 0.14389659750849507, + "learning_rate": 5.5277423111519236e-05, + "loss": 2.8607, + "step": 25116 + }, + { + "epoch": 1.5591905146191571, + "grad_norm": 0.1541591524589024, + "learning_rate": 5.5273831746970984e-05, + "loss": 2.8115, + "step": 25117 + }, + { + "epoch": 1.5592525917189148, + "grad_norm": 0.1435130701268546, + "learning_rate": 5.5270240354907856e-05, + "loss": 2.8278, + "step": 25118 + }, + { + "epoch": 1.5593146688186728, + "grad_norm": 0.14913329378696166, + "learning_rate": 5.526664893534862e-05, + "loss": 2.8097, + "step": 25119 + }, + { + "epoch": 1.5593767459184307, + "grad_norm": 0.1649308724707074, + "learning_rate": 5.5263057488312e-05, + "loss": 2.7908, + "step": 25120 + }, + { + "epoch": 1.5594388230181886, + "grad_norm": 0.16577023153154669, + "learning_rate": 5.5259466013816715e-05, + "loss": 2.7711, + "step": 25121 + }, + { + "epoch": 1.5595009001179465, + "grad_norm": 0.1608450371668696, + "learning_rate": 5.5255874511881535e-05, + "loss": 2.7776, + "step": 25122 + }, + { + "epoch": 1.5595629772177044, + "grad_norm": 0.16080226517104615, + "learning_rate": 5.525228298252518e-05, + "loss": 2.7778, + "step": 25123 + }, + { + "epoch": 1.5596250543174621, + "grad_norm": 0.16217757134926963, + "learning_rate": 5.5248691425766377e-05, + "loss": 2.7249, + "step": 25124 + }, + { + "epoch": 1.55968713141722, + "grad_norm": 0.15309130823477807, + "learning_rate": 5.524509984162389e-05, + "loss": 2.7663, + "step": 25125 + }, + { + "epoch": 1.559749208516978, + "grad_norm": 0.16865862518258093, + "learning_rate": 5.5241508230116434e-05, + "loss": 2.7892, + "step": 25126 + }, + { + "epoch": 1.5598112856167359, + "grad_norm": 0.1678071797449263, + "learning_rate": 5.5237916591262776e-05, + "loss": 2.7676, + "step": 25127 + }, + { + "epoch": 1.5598733627164938, + "grad_norm": 0.20764355711826232, + "learning_rate": 5.5234324925081626e-05, + "loss": 2.7585, + "step": 25128 + }, + { + "epoch": 1.5599354398162517, + "grad_norm": 0.15837061810584938, + "learning_rate": 5.523073323159171e-05, + "loss": 2.8045, + "step": 25129 + }, + { + "epoch": 1.5599975169160096, + "grad_norm": 0.17516126971746832, + "learning_rate": 5.522714151081182e-05, + "loss": 2.765, + "step": 25130 + }, + { + "epoch": 1.5600595940157675, + "grad_norm": 0.15393467534138033, + "learning_rate": 5.522354976276063e-05, + "loss": 2.7569, + "step": 25131 + }, + { + "epoch": 1.5601216711155255, + "grad_norm": 0.16614082327875457, + "learning_rate": 5.521995798745694e-05, + "loss": 2.7937, + "step": 25132 + }, + { + "epoch": 1.5601837482152834, + "grad_norm": 0.15658043254251702, + "learning_rate": 5.521636618491945e-05, + "loss": 2.7777, + "step": 25133 + }, + { + "epoch": 1.5602458253150413, + "grad_norm": 0.1492086819053274, + "learning_rate": 5.521277435516693e-05, + "loss": 2.8044, + "step": 25134 + }, + { + "epoch": 1.5603079024147992, + "grad_norm": 0.16337067195118324, + "learning_rate": 5.5209182498218095e-05, + "loss": 2.8056, + "step": 25135 + }, + { + "epoch": 1.5603699795145571, + "grad_norm": 0.14465745364693977, + "learning_rate": 5.5205590614091675e-05, + "loss": 2.7103, + "step": 25136 + }, + { + "epoch": 1.560432056614315, + "grad_norm": 0.15847434824881584, + "learning_rate": 5.520199870280644e-05, + "loss": 2.8219, + "step": 25137 + }, + { + "epoch": 1.560494133714073, + "grad_norm": 0.16320832166163668, + "learning_rate": 5.519840676438109e-05, + "loss": 2.8865, + "step": 25138 + }, + { + "epoch": 1.560556210813831, + "grad_norm": 0.14715794514603012, + "learning_rate": 5.5194814798834415e-05, + "loss": 2.8868, + "step": 25139 + }, + { + "epoch": 1.5606182879135888, + "grad_norm": 0.14596742967389437, + "learning_rate": 5.519122280618513e-05, + "loss": 2.7909, + "step": 25140 + }, + { + "epoch": 1.5606803650133467, + "grad_norm": 0.15162290691690591, + "learning_rate": 5.5187630786451974e-05, + "loss": 2.8606, + "step": 25141 + }, + { + "epoch": 1.5607424421131044, + "grad_norm": 0.1533551814282758, + "learning_rate": 5.5184038739653675e-05, + "loss": 2.8263, + "step": 25142 + }, + { + "epoch": 1.5608045192128623, + "grad_norm": 0.1654464895549968, + "learning_rate": 5.518044666580899e-05, + "loss": 2.8081, + "step": 25143 + }, + { + "epoch": 1.5608665963126203, + "grad_norm": 0.15159869794097705, + "learning_rate": 5.5176854564936665e-05, + "loss": 2.8954, + "step": 25144 + }, + { + "epoch": 1.5609286734123782, + "grad_norm": 0.15776659940071305, + "learning_rate": 5.517326243705544e-05, + "loss": 2.8634, + "step": 25145 + }, + { + "epoch": 1.560990750512136, + "grad_norm": 0.1757460876900314, + "learning_rate": 5.5169670282184025e-05, + "loss": 2.6421, + "step": 25146 + }, + { + "epoch": 1.561052827611894, + "grad_norm": 0.1542938207975942, + "learning_rate": 5.516607810034121e-05, + "loss": 2.7444, + "step": 25147 + }, + { + "epoch": 1.5611149047116517, + "grad_norm": 0.16801981371562125, + "learning_rate": 5.51624858915457e-05, + "loss": 2.7973, + "step": 25148 + }, + { + "epoch": 1.5611769818114096, + "grad_norm": 0.14746187222418206, + "learning_rate": 5.5158893655816234e-05, + "loss": 2.6852, + "step": 25149 + }, + { + "epoch": 1.5612390589111675, + "grad_norm": 0.1465152835026305, + "learning_rate": 5.51553013931716e-05, + "loss": 2.7968, + "step": 25150 + }, + { + "epoch": 1.5613011360109255, + "grad_norm": 0.18748617121245115, + "learning_rate": 5.515170910363048e-05, + "loss": 2.8056, + "step": 25151 + }, + { + "epoch": 1.5613632131106834, + "grad_norm": 0.15874672145724567, + "learning_rate": 5.514811678721166e-05, + "loss": 2.8671, + "step": 25152 + }, + { + "epoch": 1.5614252902104413, + "grad_norm": 0.15587055505603503, + "learning_rate": 5.514452444393386e-05, + "loss": 2.7497, + "step": 25153 + }, + { + "epoch": 1.5614873673101992, + "grad_norm": 0.15593830251773896, + "learning_rate": 5.514093207381582e-05, + "loss": 2.8627, + "step": 25154 + }, + { + "epoch": 1.5615494444099571, + "grad_norm": 0.15423794719673878, + "learning_rate": 5.51373396768763e-05, + "loss": 2.7671, + "step": 25155 + }, + { + "epoch": 1.561611521509715, + "grad_norm": 0.1473260414084624, + "learning_rate": 5.5133747253134027e-05, + "loss": 2.7878, + "step": 25156 + }, + { + "epoch": 1.561673598609473, + "grad_norm": 0.16104141880162728, + "learning_rate": 5.5130154802607747e-05, + "loss": 2.7733, + "step": 25157 + }, + { + "epoch": 1.5617356757092309, + "grad_norm": 0.1553113699110123, + "learning_rate": 5.51265623253162e-05, + "loss": 2.7654, + "step": 25158 + }, + { + "epoch": 1.5617977528089888, + "grad_norm": 0.15208632777044392, + "learning_rate": 5.5122969821278134e-05, + "loss": 2.8459, + "step": 25159 + }, + { + "epoch": 1.5618598299087467, + "grad_norm": 0.1448153461560216, + "learning_rate": 5.51193772905123e-05, + "loss": 2.7647, + "step": 25160 + }, + { + "epoch": 1.5619219070085046, + "grad_norm": 0.161619975843914, + "learning_rate": 5.5115784733037425e-05, + "loss": 2.8278, + "step": 25161 + }, + { + "epoch": 1.5619839841082626, + "grad_norm": 0.13966932340639013, + "learning_rate": 5.5112192148872256e-05, + "loss": 2.8364, + "step": 25162 + }, + { + "epoch": 1.5620460612080205, + "grad_norm": 0.14316069454587474, + "learning_rate": 5.510859953803555e-05, + "loss": 2.787, + "step": 25163 + }, + { + "epoch": 1.5621081383077784, + "grad_norm": 0.16198050880693815, + "learning_rate": 5.510500690054603e-05, + "loss": 2.831, + "step": 25164 + }, + { + "epoch": 1.5621702154075363, + "grad_norm": 0.14143719176563518, + "learning_rate": 5.5101414236422446e-05, + "loss": 2.8281, + "step": 25165 + }, + { + "epoch": 1.562232292507294, + "grad_norm": 0.1437363221565042, + "learning_rate": 5.509782154568356e-05, + "loss": 2.8905, + "step": 25166 + }, + { + "epoch": 1.562294369607052, + "grad_norm": 0.14329857589399375, + "learning_rate": 5.509422882834809e-05, + "loss": 2.6392, + "step": 25167 + }, + { + "epoch": 1.5623564467068098, + "grad_norm": 0.13531529241988682, + "learning_rate": 5.50906360844348e-05, + "loss": 2.7442, + "step": 25168 + }, + { + "epoch": 1.5624185238065678, + "grad_norm": 0.156159213632474, + "learning_rate": 5.508704331396242e-05, + "loss": 2.8135, + "step": 25169 + }, + { + "epoch": 1.5624806009063257, + "grad_norm": 0.13988660629867453, + "learning_rate": 5.508345051694971e-05, + "loss": 2.7704, + "step": 25170 + }, + { + "epoch": 1.5625426780060836, + "grad_norm": 0.19943464025421428, + "learning_rate": 5.507985769341538e-05, + "loss": 2.761, + "step": 25171 + }, + { + "epoch": 1.5626047551058413, + "grad_norm": 0.14224549907919862, + "learning_rate": 5.5076264843378225e-05, + "loss": 2.7496, + "step": 25172 + }, + { + "epoch": 1.5626668322055992, + "grad_norm": 0.1525831298085426, + "learning_rate": 5.507267196685696e-05, + "loss": 2.7569, + "step": 25173 + }, + { + "epoch": 1.5627289093053571, + "grad_norm": 0.1712447825823909, + "learning_rate": 5.506907906387032e-05, + "loss": 2.8438, + "step": 25174 + }, + { + "epoch": 1.562790986405115, + "grad_norm": 0.14707565500553502, + "learning_rate": 5.506548613443708e-05, + "loss": 2.8407, + "step": 25175 + }, + { + "epoch": 1.562853063504873, + "grad_norm": 0.15203103345794783, + "learning_rate": 5.506189317857596e-05, + "loss": 2.82, + "step": 25176 + }, + { + "epoch": 1.5629151406046309, + "grad_norm": 0.14019946776933906, + "learning_rate": 5.505830019630572e-05, + "loss": 2.7831, + "step": 25177 + }, + { + "epoch": 1.5629772177043888, + "grad_norm": 0.15268837633673904, + "learning_rate": 5.505470718764509e-05, + "loss": 2.7379, + "step": 25178 + }, + { + "epoch": 1.5630392948041467, + "grad_norm": 0.1655874227685317, + "learning_rate": 5.505111415261284e-05, + "loss": 2.817, + "step": 25179 + }, + { + "epoch": 1.5631013719039046, + "grad_norm": 0.20897695046899792, + "learning_rate": 5.504752109122768e-05, + "loss": 2.8269, + "step": 25180 + }, + { + "epoch": 1.5631634490036626, + "grad_norm": 0.2125144925504872, + "learning_rate": 5.50439280035084e-05, + "loss": 2.8522, + "step": 25181 + }, + { + "epoch": 1.5632255261034205, + "grad_norm": 0.1484314327505825, + "learning_rate": 5.504033488947371e-05, + "loss": 2.7559, + "step": 25182 + }, + { + "epoch": 1.5632876032031784, + "grad_norm": 0.17397266978923262, + "learning_rate": 5.503674174914236e-05, + "loss": 2.8179, + "step": 25183 + }, + { + "epoch": 1.5633496803029363, + "grad_norm": 0.1530017154560814, + "learning_rate": 5.5033148582533124e-05, + "loss": 2.8481, + "step": 25184 + }, + { + "epoch": 1.5634117574026942, + "grad_norm": 0.17156671923606692, + "learning_rate": 5.502955538966472e-05, + "loss": 2.7842, + "step": 25185 + }, + { + "epoch": 1.5634738345024521, + "grad_norm": 0.16930812400480044, + "learning_rate": 5.502596217055591e-05, + "loss": 2.7872, + "step": 25186 + }, + { + "epoch": 1.56353591160221, + "grad_norm": 0.17014662907201297, + "learning_rate": 5.5022368925225435e-05, + "loss": 2.7852, + "step": 25187 + }, + { + "epoch": 1.563597988701968, + "grad_norm": 0.15802099788240614, + "learning_rate": 5.501877565369205e-05, + "loss": 2.7972, + "step": 25188 + }, + { + "epoch": 1.563660065801726, + "grad_norm": 0.16489972773158462, + "learning_rate": 5.501518235597448e-05, + "loss": 2.8036, + "step": 25189 + }, + { + "epoch": 1.5637221429014836, + "grad_norm": 0.1532991479734892, + "learning_rate": 5.5011589032091493e-05, + "loss": 2.8541, + "step": 25190 + }, + { + "epoch": 1.5637842200012415, + "grad_norm": 0.15306582864620977, + "learning_rate": 5.500799568206183e-05, + "loss": 2.7321, + "step": 25191 + }, + { + "epoch": 1.5638462971009994, + "grad_norm": 0.19672084404252754, + "learning_rate": 5.500440230590423e-05, + "loss": 2.9162, + "step": 25192 + }, + { + "epoch": 1.5639083742007573, + "grad_norm": 0.16389594252025783, + "learning_rate": 5.5000808903637454e-05, + "loss": 2.8334, + "step": 25193 + }, + { + "epoch": 1.5639704513005153, + "grad_norm": 0.14898677890000425, + "learning_rate": 5.499721547528025e-05, + "loss": 2.7565, + "step": 25194 + }, + { + "epoch": 1.5640325284002732, + "grad_norm": 0.17913259988170163, + "learning_rate": 5.499362202085136e-05, + "loss": 2.8178, + "step": 25195 + }, + { + "epoch": 1.5640946055000309, + "grad_norm": 0.19031879424036105, + "learning_rate": 5.499002854036952e-05, + "loss": 2.8232, + "step": 25196 + }, + { + "epoch": 1.5641566825997888, + "grad_norm": 0.17896791128485257, + "learning_rate": 5.498643503385349e-05, + "loss": 2.8351, + "step": 25197 + }, + { + "epoch": 1.5642187596995467, + "grad_norm": 0.19443424173014337, + "learning_rate": 5.4982841501322024e-05, + "loss": 2.9153, + "step": 25198 + }, + { + "epoch": 1.5642808367993046, + "grad_norm": 0.1572639112952411, + "learning_rate": 5.4979247942793856e-05, + "loss": 2.9015, + "step": 25199 + }, + { + "epoch": 1.5643429138990625, + "grad_norm": 0.1537971673989997, + "learning_rate": 5.497565435828775e-05, + "loss": 2.8817, + "step": 25200 + }, + { + "epoch": 1.5644049909988205, + "grad_norm": 0.1895940983688753, + "learning_rate": 5.497206074782245e-05, + "loss": 2.8241, + "step": 25201 + }, + { + "epoch": 1.5644670680985784, + "grad_norm": 0.15807199341715422, + "learning_rate": 5.49684671114167e-05, + "loss": 2.8091, + "step": 25202 + }, + { + "epoch": 1.5645291451983363, + "grad_norm": 0.14169042364123718, + "learning_rate": 5.496487344908925e-05, + "loss": 2.7063, + "step": 25203 + }, + { + "epoch": 1.5645912222980942, + "grad_norm": 0.1402017063884075, + "learning_rate": 5.496127976085885e-05, + "loss": 2.739, + "step": 25204 + }, + { + "epoch": 1.5646532993978521, + "grad_norm": 0.15002298626014657, + "learning_rate": 5.495768604674425e-05, + "loss": 2.7795, + "step": 25205 + }, + { + "epoch": 1.56471537649761, + "grad_norm": 0.14825745379580482, + "learning_rate": 5.4954092306764204e-05, + "loss": 2.8321, + "step": 25206 + }, + { + "epoch": 1.564777453597368, + "grad_norm": 0.1642279245080476, + "learning_rate": 5.4950498540937435e-05, + "loss": 2.7747, + "step": 25207 + }, + { + "epoch": 1.564839530697126, + "grad_norm": 0.14136918830984088, + "learning_rate": 5.4946904749282735e-05, + "loss": 2.733, + "step": 25208 + }, + { + "epoch": 1.5649016077968838, + "grad_norm": 0.15693396240077134, + "learning_rate": 5.494331093181884e-05, + "loss": 2.8004, + "step": 25209 + }, + { + "epoch": 1.5649636848966417, + "grad_norm": 0.15762587597805508, + "learning_rate": 5.493971708856447e-05, + "loss": 2.8563, + "step": 25210 + }, + { + "epoch": 1.5650257619963996, + "grad_norm": 0.15586450826466372, + "learning_rate": 5.4936123219538404e-05, + "loss": 2.7057, + "step": 25211 + }, + { + "epoch": 1.5650878390961576, + "grad_norm": 0.1681562253519213, + "learning_rate": 5.493252932475937e-05, + "loss": 2.849, + "step": 25212 + }, + { + "epoch": 1.5651499161959155, + "grad_norm": 0.1504268354939481, + "learning_rate": 5.492893540424615e-05, + "loss": 2.7861, + "step": 25213 + }, + { + "epoch": 1.5652119932956732, + "grad_norm": 0.14990479159764028, + "learning_rate": 5.492534145801747e-05, + "loss": 2.751, + "step": 25214 + }, + { + "epoch": 1.565274070395431, + "grad_norm": 0.15930513692611628, + "learning_rate": 5.492174748609209e-05, + "loss": 2.7899, + "step": 25215 + }, + { + "epoch": 1.565336147495189, + "grad_norm": 0.14527286532505532, + "learning_rate": 5.4918153488488776e-05, + "loss": 2.8129, + "step": 25216 + }, + { + "epoch": 1.565398224594947, + "grad_norm": 0.16148546526706462, + "learning_rate": 5.491455946522623e-05, + "loss": 2.8165, + "step": 25217 + }, + { + "epoch": 1.5654603016947048, + "grad_norm": 0.16093247415046721, + "learning_rate": 5.491096541632327e-05, + "loss": 2.8339, + "step": 25218 + }, + { + "epoch": 1.5655223787944628, + "grad_norm": 0.15084770645459636, + "learning_rate": 5.490737134179857e-05, + "loss": 2.7295, + "step": 25219 + }, + { + "epoch": 1.5655844558942205, + "grad_norm": 0.15071110823405431, + "learning_rate": 5.490377724167096e-05, + "loss": 2.8307, + "step": 25220 + }, + { + "epoch": 1.5656465329939784, + "grad_norm": 0.1543791889382966, + "learning_rate": 5.4900183115959136e-05, + "loss": 2.7484, + "step": 25221 + }, + { + "epoch": 1.5657086100937363, + "grad_norm": 0.15230927321757526, + "learning_rate": 5.489658896468187e-05, + "loss": 2.7598, + "step": 25222 + }, + { + "epoch": 1.5657706871934942, + "grad_norm": 0.15348706622843367, + "learning_rate": 5.489299478785791e-05, + "loss": 2.7387, + "step": 25223 + }, + { + "epoch": 1.5658327642932521, + "grad_norm": 0.15088344925148722, + "learning_rate": 5.488940058550601e-05, + "loss": 2.7419, + "step": 25224 + }, + { + "epoch": 1.56589484139301, + "grad_norm": 0.1443132354268218, + "learning_rate": 5.488580635764491e-05, + "loss": 2.8355, + "step": 25225 + }, + { + "epoch": 1.565956918492768, + "grad_norm": 0.17344638612682178, + "learning_rate": 5.4882212104293385e-05, + "loss": 2.8161, + "step": 25226 + }, + { + "epoch": 1.5660189955925259, + "grad_norm": 0.14402931924578222, + "learning_rate": 5.487861782547018e-05, + "loss": 2.8022, + "step": 25227 + }, + { + "epoch": 1.5660810726922838, + "grad_norm": 0.14886644776119717, + "learning_rate": 5.4875023521194025e-05, + "loss": 2.7528, + "step": 25228 + }, + { + "epoch": 1.5661431497920417, + "grad_norm": 0.15350336943183876, + "learning_rate": 5.48714291914837e-05, + "loss": 2.7848, + "step": 25229 + }, + { + "epoch": 1.5662052268917996, + "grad_norm": 0.1464478574375487, + "learning_rate": 5.486783483635792e-05, + "loss": 2.9318, + "step": 25230 + }, + { + "epoch": 1.5662673039915576, + "grad_norm": 0.15202708221948613, + "learning_rate": 5.486424045583549e-05, + "loss": 2.807, + "step": 25231 + }, + { + "epoch": 1.5663293810913155, + "grad_norm": 0.16142465576037987, + "learning_rate": 5.486064604993514e-05, + "loss": 2.8745, + "step": 25232 + }, + { + "epoch": 1.5663914581910734, + "grad_norm": 0.1526115948384872, + "learning_rate": 5.4857051618675615e-05, + "loss": 2.7107, + "step": 25233 + }, + { + "epoch": 1.5664535352908313, + "grad_norm": 0.15342051275075674, + "learning_rate": 5.485345716207567e-05, + "loss": 2.7797, + "step": 25234 + }, + { + "epoch": 1.5665156123905892, + "grad_norm": 0.14756888423763478, + "learning_rate": 5.4849862680154054e-05, + "loss": 2.7912, + "step": 25235 + }, + { + "epoch": 1.5665776894903471, + "grad_norm": 0.16791741562570592, + "learning_rate": 5.484626817292954e-05, + "loss": 2.8727, + "step": 25236 + }, + { + "epoch": 1.566639766590105, + "grad_norm": 0.16914955648579189, + "learning_rate": 5.484267364042086e-05, + "loss": 2.9092, + "step": 25237 + }, + { + "epoch": 1.5667018436898628, + "grad_norm": 0.16813025361754913, + "learning_rate": 5.4839079082646784e-05, + "loss": 2.8119, + "step": 25238 + }, + { + "epoch": 1.5667639207896207, + "grad_norm": 0.17092535624353283, + "learning_rate": 5.4835484499626046e-05, + "loss": 2.8248, + "step": 25239 + }, + { + "epoch": 1.5668259978893786, + "grad_norm": 0.15123493816122294, + "learning_rate": 5.483188989137742e-05, + "loss": 2.7647, + "step": 25240 + }, + { + "epoch": 1.5668880749891365, + "grad_norm": 0.1550453598231367, + "learning_rate": 5.482829525791964e-05, + "loss": 2.7639, + "step": 25241 + }, + { + "epoch": 1.5669501520888944, + "grad_norm": 0.15197916298117403, + "learning_rate": 5.4824700599271484e-05, + "loss": 2.824, + "step": 25242 + }, + { + "epoch": 1.5670122291886524, + "grad_norm": 0.17554289764495892, + "learning_rate": 5.4821105915451684e-05, + "loss": 2.8486, + "step": 25243 + }, + { + "epoch": 1.56707430628841, + "grad_norm": 0.15323666398524927, + "learning_rate": 5.4817511206479024e-05, + "loss": 2.829, + "step": 25244 + }, + { + "epoch": 1.567136383388168, + "grad_norm": 0.16627733695099, + "learning_rate": 5.481391647237223e-05, + "loss": 2.7794, + "step": 25245 + }, + { + "epoch": 1.5671984604879259, + "grad_norm": 0.14847505245027967, + "learning_rate": 5.4810321713150035e-05, + "loss": 2.7248, + "step": 25246 + }, + { + "epoch": 1.5672605375876838, + "grad_norm": 0.14687888743957755, + "learning_rate": 5.4806726928831254e-05, + "loss": 2.836, + "step": 25247 + }, + { + "epoch": 1.5673226146874417, + "grad_norm": 0.15876294797561272, + "learning_rate": 5.48031321194346e-05, + "loss": 2.7624, + "step": 25248 + }, + { + "epoch": 1.5673846917871996, + "grad_norm": 0.14914855554276393, + "learning_rate": 5.479953728497885e-05, + "loss": 2.843, + "step": 25249 + }, + { + "epoch": 1.5674467688869576, + "grad_norm": 0.15221231243615077, + "learning_rate": 5.479594242548274e-05, + "loss": 2.7856, + "step": 25250 + }, + { + "epoch": 1.5675088459867155, + "grad_norm": 0.14432521904157783, + "learning_rate": 5.479234754096505e-05, + "loss": 2.8986, + "step": 25251 + }, + { + "epoch": 1.5675709230864734, + "grad_norm": 0.14194939085143404, + "learning_rate": 5.4788752631444504e-05, + "loss": 2.7898, + "step": 25252 + }, + { + "epoch": 1.5676330001862313, + "grad_norm": 0.1585650248892239, + "learning_rate": 5.478515769693988e-05, + "loss": 2.868, + "step": 25253 + }, + { + "epoch": 1.5676950772859892, + "grad_norm": 0.16429554191843035, + "learning_rate": 5.478156273746993e-05, + "loss": 2.7872, + "step": 25254 + }, + { + "epoch": 1.5677571543857471, + "grad_norm": 0.14710268497702286, + "learning_rate": 5.4777967753053394e-05, + "loss": 2.8318, + "step": 25255 + }, + { + "epoch": 1.567819231485505, + "grad_norm": 0.1511245875364785, + "learning_rate": 5.477437274370905e-05, + "loss": 2.7754, + "step": 25256 + }, + { + "epoch": 1.567881308585263, + "grad_norm": 0.1480710186055631, + "learning_rate": 5.477077770945563e-05, + "loss": 2.8167, + "step": 25257 + }, + { + "epoch": 1.567943385685021, + "grad_norm": 0.16683432160927789, + "learning_rate": 5.4767182650311934e-05, + "loss": 2.9025, + "step": 25258 + }, + { + "epoch": 1.5680054627847788, + "grad_norm": 0.1458151726541137, + "learning_rate": 5.476358756629666e-05, + "loss": 2.7896, + "step": 25259 + }, + { + "epoch": 1.5680675398845367, + "grad_norm": 0.1501169811508424, + "learning_rate": 5.475999245742861e-05, + "loss": 2.7542, + "step": 25260 + }, + { + "epoch": 1.5681296169842946, + "grad_norm": 0.14826148665179711, + "learning_rate": 5.475639732372651e-05, + "loss": 2.7966, + "step": 25261 + }, + { + "epoch": 1.5681916940840523, + "grad_norm": 0.13733513034726805, + "learning_rate": 5.475280216520913e-05, + "loss": 2.864, + "step": 25262 + }, + { + "epoch": 1.5682537711838103, + "grad_norm": 0.14193340092051804, + "learning_rate": 5.474920698189524e-05, + "loss": 2.8044, + "step": 25263 + }, + { + "epoch": 1.5683158482835682, + "grad_norm": 0.14270906445276507, + "learning_rate": 5.474561177380357e-05, + "loss": 2.8035, + "step": 25264 + }, + { + "epoch": 1.568377925383326, + "grad_norm": 0.14660232881202928, + "learning_rate": 5.474201654095291e-05, + "loss": 2.8233, + "step": 25265 + }, + { + "epoch": 1.568440002483084, + "grad_norm": 0.1447294590154367, + "learning_rate": 5.4738421283361986e-05, + "loss": 2.7408, + "step": 25266 + }, + { + "epoch": 1.568502079582842, + "grad_norm": 0.14607173019352057, + "learning_rate": 5.4734826001049576e-05, + "loss": 2.7785, + "step": 25267 + }, + { + "epoch": 1.5685641566825996, + "grad_norm": 0.149226057912218, + "learning_rate": 5.473123069403442e-05, + "loss": 2.7759, + "step": 25268 + }, + { + "epoch": 1.5686262337823575, + "grad_norm": 0.1390078330011288, + "learning_rate": 5.4727635362335294e-05, + "loss": 2.8052, + "step": 25269 + }, + { + "epoch": 1.5686883108821155, + "grad_norm": 0.1441729912434047, + "learning_rate": 5.4724040005970955e-05, + "loss": 2.7773, + "step": 25270 + }, + { + "epoch": 1.5687503879818734, + "grad_norm": 0.15347662325940636, + "learning_rate": 5.472044462496013e-05, + "loss": 2.7643, + "step": 25271 + }, + { + "epoch": 1.5688124650816313, + "grad_norm": 0.14749183709986832, + "learning_rate": 5.471684921932162e-05, + "loss": 2.8007, + "step": 25272 + }, + { + "epoch": 1.5688745421813892, + "grad_norm": 0.1624582053503038, + "learning_rate": 5.471325378907414e-05, + "loss": 2.8123, + "step": 25273 + }, + { + "epoch": 1.5689366192811471, + "grad_norm": 0.14562405033374942, + "learning_rate": 5.470965833423649e-05, + "loss": 2.757, + "step": 25274 + }, + { + "epoch": 1.568998696380905, + "grad_norm": 0.17221468174378815, + "learning_rate": 5.47060628548274e-05, + "loss": 2.7789, + "step": 25275 + }, + { + "epoch": 1.569060773480663, + "grad_norm": 0.15370290243239915, + "learning_rate": 5.470246735086564e-05, + "loss": 2.8539, + "step": 25276 + }, + { + "epoch": 1.569122850580421, + "grad_norm": 0.1509733799331227, + "learning_rate": 5.4698871822369956e-05, + "loss": 2.7944, + "step": 25277 + }, + { + "epoch": 1.5691849276801788, + "grad_norm": 0.15152075576080906, + "learning_rate": 5.469527626935914e-05, + "loss": 2.9029, + "step": 25278 + }, + { + "epoch": 1.5692470047799367, + "grad_norm": 0.14994194551415393, + "learning_rate": 5.469168069185191e-05, + "loss": 2.7934, + "step": 25279 + }, + { + "epoch": 1.5693090818796946, + "grad_norm": 0.1567505210779585, + "learning_rate": 5.468808508986704e-05, + "loss": 2.8557, + "step": 25280 + }, + { + "epoch": 1.5693711589794526, + "grad_norm": 0.15449475022226805, + "learning_rate": 5.4684489463423304e-05, + "loss": 2.8219, + "step": 25281 + }, + { + "epoch": 1.5694332360792105, + "grad_norm": 0.16072303222137582, + "learning_rate": 5.4680893812539436e-05, + "loss": 2.8682, + "step": 25282 + }, + { + "epoch": 1.5694953131789684, + "grad_norm": 0.13884046541579298, + "learning_rate": 5.4677298137234225e-05, + "loss": 2.8538, + "step": 25283 + }, + { + "epoch": 1.5695573902787263, + "grad_norm": 0.15551606301807772, + "learning_rate": 5.4673702437526395e-05, + "loss": 2.8365, + "step": 25284 + }, + { + "epoch": 1.5696194673784842, + "grad_norm": 0.15000597574331243, + "learning_rate": 5.4670106713434743e-05, + "loss": 2.7627, + "step": 25285 + }, + { + "epoch": 1.569681544478242, + "grad_norm": 0.15627831560105834, + "learning_rate": 5.4666510964978e-05, + "loss": 2.8541, + "step": 25286 + }, + { + "epoch": 1.5697436215779998, + "grad_norm": 0.15287881811455084, + "learning_rate": 5.4662915192174946e-05, + "loss": 2.9009, + "step": 25287 + }, + { + "epoch": 1.5698056986777578, + "grad_norm": 0.17783563475123668, + "learning_rate": 5.465931939504434e-05, + "loss": 2.8631, + "step": 25288 + }, + { + "epoch": 1.5698677757775157, + "grad_norm": 0.14974697896412897, + "learning_rate": 5.4655723573604914e-05, + "loss": 2.7692, + "step": 25289 + }, + { + "epoch": 1.5699298528772736, + "grad_norm": 0.1634384559782089, + "learning_rate": 5.4652127727875454e-05, + "loss": 2.7697, + "step": 25290 + }, + { + "epoch": 1.5699919299770315, + "grad_norm": 0.15198325640483543, + "learning_rate": 5.4648531857874707e-05, + "loss": 2.8258, + "step": 25291 + }, + { + "epoch": 1.5700540070767892, + "grad_norm": 0.15676905130934476, + "learning_rate": 5.464493596362146e-05, + "loss": 2.8348, + "step": 25292 + }, + { + "epoch": 1.5701160841765471, + "grad_norm": 0.15067207950054795, + "learning_rate": 5.4641340045134426e-05, + "loss": 2.8166, + "step": 25293 + }, + { + "epoch": 1.570178161276305, + "grad_norm": 0.15290139912267028, + "learning_rate": 5.463774410243242e-05, + "loss": 2.745, + "step": 25294 + }, + { + "epoch": 1.570240238376063, + "grad_norm": 0.16010761685605288, + "learning_rate": 5.463414813553416e-05, + "loss": 2.8564, + "step": 25295 + }, + { + "epoch": 1.5703023154758209, + "grad_norm": 0.159468610937666, + "learning_rate": 5.463055214445843e-05, + "loss": 2.7964, + "step": 25296 + }, + { + "epoch": 1.5703643925755788, + "grad_norm": 0.17456711976807926, + "learning_rate": 5.4626956129223994e-05, + "loss": 2.7624, + "step": 25297 + }, + { + "epoch": 1.5704264696753367, + "grad_norm": 0.15798800760453674, + "learning_rate": 5.462336008984958e-05, + "loss": 2.8111, + "step": 25298 + }, + { + "epoch": 1.5704885467750946, + "grad_norm": 0.14355182644854034, + "learning_rate": 5.4619764026354004e-05, + "loss": 2.8324, + "step": 25299 + }, + { + "epoch": 1.5705506238748526, + "grad_norm": 0.14986388741426193, + "learning_rate": 5.461616793875598e-05, + "loss": 2.8704, + "step": 25300 + }, + { + "epoch": 1.5706127009746105, + "grad_norm": 0.14164119691938437, + "learning_rate": 5.4612571827074297e-05, + "loss": 2.7121, + "step": 25301 + }, + { + "epoch": 1.5706747780743684, + "grad_norm": 0.14707490146362295, + "learning_rate": 5.46089756913277e-05, + "loss": 2.6573, + "step": 25302 + }, + { + "epoch": 1.5707368551741263, + "grad_norm": 0.1539262472996834, + "learning_rate": 5.460537953153496e-05, + "loss": 2.8673, + "step": 25303 + }, + { + "epoch": 1.5707989322738842, + "grad_norm": 0.13979738468475053, + "learning_rate": 5.460178334771484e-05, + "loss": 2.7812, + "step": 25304 + }, + { + "epoch": 1.5708610093736421, + "grad_norm": 0.15602719992277236, + "learning_rate": 5.45981871398861e-05, + "loss": 2.9296, + "step": 25305 + }, + { + "epoch": 1.5709230864734, + "grad_norm": 0.14575535883670535, + "learning_rate": 5.45945909080675e-05, + "loss": 2.7817, + "step": 25306 + }, + { + "epoch": 1.570985163573158, + "grad_norm": 0.16606542177038391, + "learning_rate": 5.45909946522778e-05, + "loss": 2.8521, + "step": 25307 + }, + { + "epoch": 1.571047240672916, + "grad_norm": 0.1438146300773583, + "learning_rate": 5.458739837253577e-05, + "loss": 2.7439, + "step": 25308 + }, + { + "epoch": 1.5711093177726738, + "grad_norm": 0.18072284051631585, + "learning_rate": 5.458380206886017e-05, + "loss": 2.8342, + "step": 25309 + }, + { + "epoch": 1.5711713948724315, + "grad_norm": 0.1479823575817268, + "learning_rate": 5.458020574126976e-05, + "loss": 2.8734, + "step": 25310 + }, + { + "epoch": 1.5712334719721894, + "grad_norm": 0.16385280741543914, + "learning_rate": 5.4576609389783285e-05, + "loss": 2.6897, + "step": 25311 + }, + { + "epoch": 1.5712955490719474, + "grad_norm": 0.14783378857064425, + "learning_rate": 5.4573013014419546e-05, + "loss": 2.9025, + "step": 25312 + }, + { + "epoch": 1.5713576261717053, + "grad_norm": 0.16922516328981568, + "learning_rate": 5.45694166151973e-05, + "loss": 2.8751, + "step": 25313 + }, + { + "epoch": 1.5714197032714632, + "grad_norm": 0.16465231482487058, + "learning_rate": 5.456582019213529e-05, + "loss": 2.8155, + "step": 25314 + }, + { + "epoch": 1.571481780371221, + "grad_norm": 0.16361007866714036, + "learning_rate": 5.456222374525228e-05, + "loss": 2.8358, + "step": 25315 + }, + { + "epoch": 1.5715438574709788, + "grad_norm": 0.14821307257013816, + "learning_rate": 5.455862727456704e-05, + "loss": 2.8377, + "step": 25316 + }, + { + "epoch": 1.5716059345707367, + "grad_norm": 0.1481482737300454, + "learning_rate": 5.455503078009835e-05, + "loss": 2.8312, + "step": 25317 + }, + { + "epoch": 1.5716680116704946, + "grad_norm": 0.15895914819574375, + "learning_rate": 5.455143426186494e-05, + "loss": 2.8463, + "step": 25318 + }, + { + "epoch": 1.5717300887702526, + "grad_norm": 0.14947726589638108, + "learning_rate": 5.4547837719885605e-05, + "loss": 2.8995, + "step": 25319 + }, + { + "epoch": 1.5717921658700105, + "grad_norm": 0.14989285019624898, + "learning_rate": 5.4544241154179096e-05, + "loss": 2.7831, + "step": 25320 + }, + { + "epoch": 1.5718542429697684, + "grad_norm": 0.14230148048986177, + "learning_rate": 5.4540644564764176e-05, + "loss": 2.8535, + "step": 25321 + }, + { + "epoch": 1.5719163200695263, + "grad_norm": 0.16617047997531612, + "learning_rate": 5.4537047951659614e-05, + "loss": 2.8471, + "step": 25322 + }, + { + "epoch": 1.5719783971692842, + "grad_norm": 0.14040322167123367, + "learning_rate": 5.453345131488417e-05, + "loss": 2.7159, + "step": 25323 + }, + { + "epoch": 1.5720404742690421, + "grad_norm": 0.17122896656114106, + "learning_rate": 5.4529854654456614e-05, + "loss": 2.8767, + "step": 25324 + }, + { + "epoch": 1.5721025513688, + "grad_norm": 0.16065370249439728, + "learning_rate": 5.452625797039569e-05, + "loss": 2.7897, + "step": 25325 + }, + { + "epoch": 1.572164628468558, + "grad_norm": 0.15430296511642672, + "learning_rate": 5.4522661262720197e-05, + "loss": 2.8368, + "step": 25326 + }, + { + "epoch": 1.572226705568316, + "grad_norm": 0.178495454379046, + "learning_rate": 5.4519064531448874e-05, + "loss": 2.7868, + "step": 25327 + }, + { + "epoch": 1.5722887826680738, + "grad_norm": 0.15619405102395695, + "learning_rate": 5.4515467776600484e-05, + "loss": 2.8109, + "step": 25328 + }, + { + "epoch": 1.5723508597678317, + "grad_norm": 0.20651851897121562, + "learning_rate": 5.451187099819382e-05, + "loss": 2.7457, + "step": 25329 + }, + { + "epoch": 1.5724129368675897, + "grad_norm": 0.14230127760356506, + "learning_rate": 5.4508274196247624e-05, + "loss": 2.7707, + "step": 25330 + }, + { + "epoch": 1.5724750139673476, + "grad_norm": 0.1732926640307205, + "learning_rate": 5.450467737078067e-05, + "loss": 2.8183, + "step": 25331 + }, + { + "epoch": 1.5725370910671055, + "grad_norm": 0.1488208025535194, + "learning_rate": 5.450108052181171e-05, + "loss": 2.8157, + "step": 25332 + }, + { + "epoch": 1.5725991681668634, + "grad_norm": 0.17239970613914257, + "learning_rate": 5.449748364935954e-05, + "loss": 2.744, + "step": 25333 + }, + { + "epoch": 1.572661245266621, + "grad_norm": 0.16666267745389726, + "learning_rate": 5.449388675344288e-05, + "loss": 2.8794, + "step": 25334 + }, + { + "epoch": 1.572723322366379, + "grad_norm": 0.15062485450523916, + "learning_rate": 5.4490289834080545e-05, + "loss": 2.7656, + "step": 25335 + }, + { + "epoch": 1.572785399466137, + "grad_norm": 0.16830719628248464, + "learning_rate": 5.448669289129126e-05, + "loss": 2.8629, + "step": 25336 + }, + { + "epoch": 1.5728474765658949, + "grad_norm": 0.14598060449195455, + "learning_rate": 5.4483095925093816e-05, + "loss": 2.8216, + "step": 25337 + }, + { + "epoch": 1.5729095536656528, + "grad_norm": 0.16626475658571752, + "learning_rate": 5.4479498935506975e-05, + "loss": 2.8626, + "step": 25338 + }, + { + "epoch": 1.5729716307654107, + "grad_norm": 0.16051672283403565, + "learning_rate": 5.44759019225495e-05, + "loss": 2.8093, + "step": 25339 + }, + { + "epoch": 1.5730337078651684, + "grad_norm": 0.15592602282392462, + "learning_rate": 5.447230488624015e-05, + "loss": 2.8417, + "step": 25340 + }, + { + "epoch": 1.5730957849649263, + "grad_norm": 0.1512819234600179, + "learning_rate": 5.446870782659772e-05, + "loss": 2.7403, + "step": 25341 + }, + { + "epoch": 1.5731578620646842, + "grad_norm": 0.16339778779639588, + "learning_rate": 5.4465110743640936e-05, + "loss": 2.8124, + "step": 25342 + }, + { + "epoch": 1.5732199391644421, + "grad_norm": 0.1570119103341441, + "learning_rate": 5.4461513637388595e-05, + "loss": 2.6944, + "step": 25343 + }, + { + "epoch": 1.5732820162642, + "grad_norm": 0.16258800466490975, + "learning_rate": 5.445791650785945e-05, + "loss": 2.7998, + "step": 25344 + }, + { + "epoch": 1.573344093363958, + "grad_norm": 0.15582222794631906, + "learning_rate": 5.445431935507227e-05, + "loss": 2.7596, + "step": 25345 + }, + { + "epoch": 1.573406170463716, + "grad_norm": 0.16718958737505252, + "learning_rate": 5.4450722179045835e-05, + "loss": 2.7857, + "step": 25346 + }, + { + "epoch": 1.5734682475634738, + "grad_norm": 0.15718140160844035, + "learning_rate": 5.444712497979889e-05, + "loss": 2.801, + "step": 25347 + }, + { + "epoch": 1.5735303246632317, + "grad_norm": 0.15768618958178565, + "learning_rate": 5.444352775735022e-05, + "loss": 2.7784, + "step": 25348 + }, + { + "epoch": 1.5735924017629896, + "grad_norm": 0.15737771534186876, + "learning_rate": 5.443993051171859e-05, + "loss": 2.7048, + "step": 25349 + }, + { + "epoch": 1.5736544788627476, + "grad_norm": 0.1498291756559987, + "learning_rate": 5.4436333242922764e-05, + "loss": 2.8478, + "step": 25350 + }, + { + "epoch": 1.5737165559625055, + "grad_norm": 0.1781593001117446, + "learning_rate": 5.443273595098151e-05, + "loss": 2.8603, + "step": 25351 + }, + { + "epoch": 1.5737786330622634, + "grad_norm": 0.18994071979938015, + "learning_rate": 5.442913863591359e-05, + "loss": 2.9356, + "step": 25352 + }, + { + "epoch": 1.5738407101620213, + "grad_norm": 0.1624068552607462, + "learning_rate": 5.4425541297737795e-05, + "loss": 2.7486, + "step": 25353 + }, + { + "epoch": 1.5739027872617792, + "grad_norm": 0.14438754102602133, + "learning_rate": 5.442194393647285e-05, + "loss": 2.8964, + "step": 25354 + }, + { + "epoch": 1.5739648643615372, + "grad_norm": 0.15790723280126165, + "learning_rate": 5.441834655213758e-05, + "loss": 2.7947, + "step": 25355 + }, + { + "epoch": 1.574026941461295, + "grad_norm": 0.17853704445047744, + "learning_rate": 5.44147491447507e-05, + "loss": 2.8248, + "step": 25356 + }, + { + "epoch": 1.574089018561053, + "grad_norm": 0.15683369688378898, + "learning_rate": 5.441115171433101e-05, + "loss": 2.8267, + "step": 25357 + }, + { + "epoch": 1.5741510956608107, + "grad_norm": 0.14594981247794142, + "learning_rate": 5.440755426089727e-05, + "loss": 2.8567, + "step": 25358 + }, + { + "epoch": 1.5742131727605686, + "grad_norm": 0.1491788496227163, + "learning_rate": 5.440395678446826e-05, + "loss": 2.7824, + "step": 25359 + }, + { + "epoch": 1.5742752498603265, + "grad_norm": 0.1488103638210884, + "learning_rate": 5.440035928506272e-05, + "loss": 2.8722, + "step": 25360 + }, + { + "epoch": 1.5743373269600844, + "grad_norm": 0.13757865780018322, + "learning_rate": 5.439676176269945e-05, + "loss": 2.6572, + "step": 25361 + }, + { + "epoch": 1.5743994040598424, + "grad_norm": 0.15571238893587785, + "learning_rate": 5.4393164217397194e-05, + "loss": 2.9476, + "step": 25362 + }, + { + "epoch": 1.5744614811596003, + "grad_norm": 0.13737890400124955, + "learning_rate": 5.438956664917474e-05, + "loss": 2.7499, + "step": 25363 + }, + { + "epoch": 1.574523558259358, + "grad_norm": 0.14556716502114847, + "learning_rate": 5.4385969058050853e-05, + "loss": 2.7452, + "step": 25364 + }, + { + "epoch": 1.5745856353591159, + "grad_norm": 0.14787526060437725, + "learning_rate": 5.43823714440443e-05, + "loss": 2.8192, + "step": 25365 + }, + { + "epoch": 1.5746477124588738, + "grad_norm": 0.13670499596213082, + "learning_rate": 5.437877380717387e-05, + "loss": 2.7752, + "step": 25366 + }, + { + "epoch": 1.5747097895586317, + "grad_norm": 0.14274623836164166, + "learning_rate": 5.437517614745829e-05, + "loss": 2.7984, + "step": 25367 + }, + { + "epoch": 1.5747718666583896, + "grad_norm": 0.14825801287429116, + "learning_rate": 5.437157846491636e-05, + "loss": 2.7999, + "step": 25368 + }, + { + "epoch": 1.5748339437581476, + "grad_norm": 0.14854995959750666, + "learning_rate": 5.436798075956685e-05, + "loss": 2.8811, + "step": 25369 + }, + { + "epoch": 1.5748960208579055, + "grad_norm": 0.16073385613358554, + "learning_rate": 5.436438303142851e-05, + "loss": 2.8446, + "step": 25370 + }, + { + "epoch": 1.5749580979576634, + "grad_norm": 0.14720125258639394, + "learning_rate": 5.436078528052014e-05, + "loss": 2.7969, + "step": 25371 + }, + { + "epoch": 1.5750201750574213, + "grad_norm": 0.15259860191896865, + "learning_rate": 5.4357187506860483e-05, + "loss": 2.8374, + "step": 25372 + }, + { + "epoch": 1.5750822521571792, + "grad_norm": 0.16283017507371145, + "learning_rate": 5.4353589710468337e-05, + "loss": 2.8721, + "step": 25373 + }, + { + "epoch": 1.5751443292569371, + "grad_norm": 0.15217459898125466, + "learning_rate": 5.434999189136244e-05, + "loss": 2.8176, + "step": 25374 + }, + { + "epoch": 1.575206406356695, + "grad_norm": 0.16857511279492576, + "learning_rate": 5.434639404956159e-05, + "loss": 2.868, + "step": 25375 + }, + { + "epoch": 1.575268483456453, + "grad_norm": 0.14438065854250273, + "learning_rate": 5.434279618508453e-05, + "loss": 2.8442, + "step": 25376 + }, + { + "epoch": 1.575330560556211, + "grad_norm": 0.1431869805949734, + "learning_rate": 5.433919829795006e-05, + "loss": 2.7643, + "step": 25377 + }, + { + "epoch": 1.5753926376559688, + "grad_norm": 0.1500336075426518, + "learning_rate": 5.4335600388176946e-05, + "loss": 2.8305, + "step": 25378 + }, + { + "epoch": 1.5754547147557267, + "grad_norm": 0.15158875768851773, + "learning_rate": 5.4332002455783936e-05, + "loss": 2.766, + "step": 25379 + }, + { + "epoch": 1.5755167918554847, + "grad_norm": 0.15220083355507755, + "learning_rate": 5.432840450078983e-05, + "loss": 2.8684, + "step": 25380 + }, + { + "epoch": 1.5755788689552426, + "grad_norm": 0.146712151501916, + "learning_rate": 5.432480652321338e-05, + "loss": 2.8288, + "step": 25381 + }, + { + "epoch": 1.5756409460550003, + "grad_norm": 0.14889241454503888, + "learning_rate": 5.432120852307337e-05, + "loss": 2.8378, + "step": 25382 + }, + { + "epoch": 1.5757030231547582, + "grad_norm": 0.14527412184625627, + "learning_rate": 5.431761050038856e-05, + "loss": 2.8042, + "step": 25383 + }, + { + "epoch": 1.575765100254516, + "grad_norm": 0.1581318365002566, + "learning_rate": 5.431401245517774e-05, + "loss": 2.7761, + "step": 25384 + }, + { + "epoch": 1.575827177354274, + "grad_norm": 0.1576196129102943, + "learning_rate": 5.431041438745965e-05, + "loss": 2.8158, + "step": 25385 + }, + { + "epoch": 1.575889254454032, + "grad_norm": 0.23521549992896246, + "learning_rate": 5.4306816297253104e-05, + "loss": 2.7862, + "step": 25386 + }, + { + "epoch": 1.5759513315537899, + "grad_norm": 0.17863009140641545, + "learning_rate": 5.430321818457684e-05, + "loss": 2.8083, + "step": 25387 + }, + { + "epoch": 1.5760134086535476, + "grad_norm": 0.1554229724044154, + "learning_rate": 5.4299620049449654e-05, + "loss": 2.742, + "step": 25388 + }, + { + "epoch": 1.5760754857533055, + "grad_norm": 0.1579707817848766, + "learning_rate": 5.4296021891890295e-05, + "loss": 2.8001, + "step": 25389 + }, + { + "epoch": 1.5761375628530634, + "grad_norm": 0.14998094070068213, + "learning_rate": 5.4292423711917536e-05, + "loss": 2.8157, + "step": 25390 + }, + { + "epoch": 1.5761996399528213, + "grad_norm": 0.1441341434871992, + "learning_rate": 5.4288825509550175e-05, + "loss": 2.7179, + "step": 25391 + }, + { + "epoch": 1.5762617170525792, + "grad_norm": 0.15832200116984765, + "learning_rate": 5.4285227284806975e-05, + "loss": 2.7841, + "step": 25392 + }, + { + "epoch": 1.5763237941523371, + "grad_norm": 0.15914288194766674, + "learning_rate": 5.428162903770668e-05, + "loss": 2.8762, + "step": 25393 + }, + { + "epoch": 1.576385871252095, + "grad_norm": 0.1458808226975987, + "learning_rate": 5.427803076826812e-05, + "loss": 2.796, + "step": 25394 + }, + { + "epoch": 1.576447948351853, + "grad_norm": 0.16184914625488894, + "learning_rate": 5.427443247651e-05, + "loss": 2.854, + "step": 25395 + }, + { + "epoch": 1.576510025451611, + "grad_norm": 0.16041654856685733, + "learning_rate": 5.4270834162451157e-05, + "loss": 2.7776, + "step": 25396 + }, + { + "epoch": 1.5765721025513688, + "grad_norm": 0.1538677344078848, + "learning_rate": 5.4267235826110315e-05, + "loss": 2.793, + "step": 25397 + }, + { + "epoch": 1.5766341796511267, + "grad_norm": 0.14616520760935992, + "learning_rate": 5.426363746750629e-05, + "loss": 2.8135, + "step": 25398 + }, + { + "epoch": 1.5766962567508847, + "grad_norm": 0.153336062503149, + "learning_rate": 5.426003908665782e-05, + "loss": 2.7564, + "step": 25399 + }, + { + "epoch": 1.5767583338506426, + "grad_norm": 0.17300639787803165, + "learning_rate": 5.42564406835837e-05, + "loss": 2.8472, + "step": 25400 + }, + { + "epoch": 1.5768204109504005, + "grad_norm": 0.15812059887313593, + "learning_rate": 5.425284225830268e-05, + "loss": 2.8035, + "step": 25401 + }, + { + "epoch": 1.5768824880501584, + "grad_norm": 0.1620292867892894, + "learning_rate": 5.4249243810833564e-05, + "loss": 2.9038, + "step": 25402 + }, + { + "epoch": 1.5769445651499163, + "grad_norm": 0.15746154167080204, + "learning_rate": 5.4245645341195094e-05, + "loss": 2.7988, + "step": 25403 + }, + { + "epoch": 1.5770066422496742, + "grad_norm": 0.14325030488752372, + "learning_rate": 5.424204684940608e-05, + "loss": 2.7512, + "step": 25404 + }, + { + "epoch": 1.5770687193494322, + "grad_norm": 0.16652924800670887, + "learning_rate": 5.423844833548528e-05, + "loss": 2.7855, + "step": 25405 + }, + { + "epoch": 1.5771307964491899, + "grad_norm": 0.14789375045947895, + "learning_rate": 5.4234849799451446e-05, + "loss": 2.8217, + "step": 25406 + }, + { + "epoch": 1.5771928735489478, + "grad_norm": 0.14863598879053216, + "learning_rate": 5.42312512413234e-05, + "loss": 2.7507, + "step": 25407 + }, + { + "epoch": 1.5772549506487057, + "grad_norm": 0.16149825645950897, + "learning_rate": 5.4227652661119865e-05, + "loss": 2.885, + "step": 25408 + }, + { + "epoch": 1.5773170277484636, + "grad_norm": 0.14753692134548813, + "learning_rate": 5.422405405885965e-05, + "loss": 2.808, + "step": 25409 + }, + { + "epoch": 1.5773791048482215, + "grad_norm": 0.1512541256393158, + "learning_rate": 5.42204554345615e-05, + "loss": 2.8146, + "step": 25410 + }, + { + "epoch": 1.5774411819479794, + "grad_norm": 0.1479316990617124, + "learning_rate": 5.421685678824423e-05, + "loss": 2.9102, + "step": 25411 + }, + { + "epoch": 1.5775032590477371, + "grad_norm": 0.14677681659494685, + "learning_rate": 5.42132581199266e-05, + "loss": 2.8574, + "step": 25412 + }, + { + "epoch": 1.577565336147495, + "grad_norm": 0.1393193321717622, + "learning_rate": 5.420965942962737e-05, + "loss": 2.7858, + "step": 25413 + }, + { + "epoch": 1.577627413247253, + "grad_norm": 0.13981618343134405, + "learning_rate": 5.420606071736533e-05, + "loss": 2.8854, + "step": 25414 + }, + { + "epoch": 1.577689490347011, + "grad_norm": 0.13636016824359773, + "learning_rate": 5.4202461983159234e-05, + "loss": 2.9135, + "step": 25415 + }, + { + "epoch": 1.5777515674467688, + "grad_norm": 0.15932103767504957, + "learning_rate": 5.4198863227027895e-05, + "loss": 2.8208, + "step": 25416 + }, + { + "epoch": 1.5778136445465267, + "grad_norm": 0.14457061592323767, + "learning_rate": 5.419526444899005e-05, + "loss": 2.7268, + "step": 25417 + }, + { + "epoch": 1.5778757216462846, + "grad_norm": 0.15211397963432827, + "learning_rate": 5.4191665649064504e-05, + "loss": 2.7632, + "step": 25418 + }, + { + "epoch": 1.5779377987460426, + "grad_norm": 0.1402874511436718, + "learning_rate": 5.418806682727001e-05, + "loss": 2.8009, + "step": 25419 + }, + { + "epoch": 1.5779998758458005, + "grad_norm": 0.14561650357816688, + "learning_rate": 5.4184467983625364e-05, + "loss": 2.8264, + "step": 25420 + }, + { + "epoch": 1.5780619529455584, + "grad_norm": 0.14490250640543276, + "learning_rate": 5.418086911814932e-05, + "loss": 2.8723, + "step": 25421 + }, + { + "epoch": 1.5781240300453163, + "grad_norm": 0.1552297482715364, + "learning_rate": 5.417727023086069e-05, + "loss": 2.8061, + "step": 25422 + }, + { + "epoch": 1.5781861071450742, + "grad_norm": 0.1476662955323407, + "learning_rate": 5.417367132177822e-05, + "loss": 2.7662, + "step": 25423 + }, + { + "epoch": 1.5782481842448322, + "grad_norm": 0.14055818056370717, + "learning_rate": 5.4170072390920676e-05, + "loss": 2.7315, + "step": 25424 + }, + { + "epoch": 1.57831026134459, + "grad_norm": 0.1415585945971725, + "learning_rate": 5.416647343830687e-05, + "loss": 2.8262, + "step": 25425 + }, + { + "epoch": 1.578372338444348, + "grad_norm": 0.14196867872820373, + "learning_rate": 5.416287446395554e-05, + "loss": 2.8412, + "step": 25426 + }, + { + "epoch": 1.578434415544106, + "grad_norm": 0.1497885679203902, + "learning_rate": 5.415927546788551e-05, + "loss": 2.8737, + "step": 25427 + }, + { + "epoch": 1.5784964926438638, + "grad_norm": 0.1501923033731577, + "learning_rate": 5.41556764501155e-05, + "loss": 2.9019, + "step": 25428 + }, + { + "epoch": 1.5785585697436217, + "grad_norm": 0.14668467496679835, + "learning_rate": 5.415207741066435e-05, + "loss": 2.7996, + "step": 25429 + }, + { + "epoch": 1.5786206468433794, + "grad_norm": 0.16428537783843739, + "learning_rate": 5.414847834955079e-05, + "loss": 2.7561, + "step": 25430 + }, + { + "epoch": 1.5786827239431374, + "grad_norm": 0.1463095384535961, + "learning_rate": 5.41448792667936e-05, + "loss": 2.7766, + "step": 25431 + }, + { + "epoch": 1.5787448010428953, + "grad_norm": 0.1457020340231788, + "learning_rate": 5.414128016241159e-05, + "loss": 2.8234, + "step": 25432 + }, + { + "epoch": 1.5788068781426532, + "grad_norm": 0.1647171289005598, + "learning_rate": 5.413768103642349e-05, + "loss": 2.7365, + "step": 25433 + }, + { + "epoch": 1.578868955242411, + "grad_norm": 0.15234541280768218, + "learning_rate": 5.4134081888848133e-05, + "loss": 2.7349, + "step": 25434 + }, + { + "epoch": 1.578931032342169, + "grad_norm": 0.15229077518513576, + "learning_rate": 5.413048271970425e-05, + "loss": 2.8143, + "step": 25435 + }, + { + "epoch": 1.5789931094419267, + "grad_norm": 0.14959518200954133, + "learning_rate": 5.4126883529010645e-05, + "loss": 2.8562, + "step": 25436 + }, + { + "epoch": 1.5790551865416846, + "grad_norm": 0.15284561974320285, + "learning_rate": 5.4123284316786074e-05, + "loss": 2.8737, + "step": 25437 + }, + { + "epoch": 1.5791172636414426, + "grad_norm": 0.14160980332171003, + "learning_rate": 5.411968508304933e-05, + "loss": 2.8089, + "step": 25438 + }, + { + "epoch": 1.5791793407412005, + "grad_norm": 0.1452760194291029, + "learning_rate": 5.4116085827819185e-05, + "loss": 2.7433, + "step": 25439 + }, + { + "epoch": 1.5792414178409584, + "grad_norm": 0.15830479992725707, + "learning_rate": 5.411248655111444e-05, + "loss": 2.8398, + "step": 25440 + }, + { + "epoch": 1.5793034949407163, + "grad_norm": 0.14287673693950279, + "learning_rate": 5.4108887252953844e-05, + "loss": 2.7183, + "step": 25441 + }, + { + "epoch": 1.5793655720404742, + "grad_norm": 0.16594196817341497, + "learning_rate": 5.410528793335617e-05, + "loss": 2.7795, + "step": 25442 + }, + { + "epoch": 1.5794276491402321, + "grad_norm": 0.14641557349295564, + "learning_rate": 5.410168859234024e-05, + "loss": 2.7172, + "step": 25443 + }, + { + "epoch": 1.57948972623999, + "grad_norm": 0.17319638938178414, + "learning_rate": 5.409808922992479e-05, + "loss": 2.8117, + "step": 25444 + }, + { + "epoch": 1.579551803339748, + "grad_norm": 0.1626848496808492, + "learning_rate": 5.409448984612861e-05, + "loss": 2.7149, + "step": 25445 + }, + { + "epoch": 1.579613880439506, + "grad_norm": 0.17453435954821803, + "learning_rate": 5.409089044097049e-05, + "loss": 2.8721, + "step": 25446 + }, + { + "epoch": 1.5796759575392638, + "grad_norm": 0.1657547190844519, + "learning_rate": 5.40872910144692e-05, + "loss": 2.7672, + "step": 25447 + }, + { + "epoch": 1.5797380346390217, + "grad_norm": 0.1719647032562965, + "learning_rate": 5.408369156664352e-05, + "loss": 2.8004, + "step": 25448 + }, + { + "epoch": 1.5798001117387797, + "grad_norm": 0.1503307111716103, + "learning_rate": 5.4080092097512224e-05, + "loss": 2.757, + "step": 25449 + }, + { + "epoch": 1.5798621888385376, + "grad_norm": 0.16543469607214814, + "learning_rate": 5.407649260709411e-05, + "loss": 2.8538, + "step": 25450 + }, + { + "epoch": 1.5799242659382955, + "grad_norm": 0.1643858402724473, + "learning_rate": 5.407289309540794e-05, + "loss": 2.8796, + "step": 25451 + }, + { + "epoch": 1.5799863430380534, + "grad_norm": 0.17403897880747057, + "learning_rate": 5.40692935624725e-05, + "loss": 2.8214, + "step": 25452 + }, + { + "epoch": 1.5800484201378113, + "grad_norm": 0.14807029528807206, + "learning_rate": 5.406569400830656e-05, + "loss": 2.7781, + "step": 25453 + }, + { + "epoch": 1.580110497237569, + "grad_norm": 0.14566862577419226, + "learning_rate": 5.406209443292891e-05, + "loss": 2.8592, + "step": 25454 + }, + { + "epoch": 1.580172574337327, + "grad_norm": 0.15102416518090667, + "learning_rate": 5.405849483635832e-05, + "loss": 2.8144, + "step": 25455 + }, + { + "epoch": 1.5802346514370849, + "grad_norm": 0.1446841178976623, + "learning_rate": 5.40548952186136e-05, + "loss": 2.7765, + "step": 25456 + }, + { + "epoch": 1.5802967285368428, + "grad_norm": 0.15214675001352038, + "learning_rate": 5.405129557971348e-05, + "loss": 2.83, + "step": 25457 + }, + { + "epoch": 1.5803588056366007, + "grad_norm": 0.15958198075140442, + "learning_rate": 5.404769591967679e-05, + "loss": 2.8058, + "step": 25458 + }, + { + "epoch": 1.5804208827363586, + "grad_norm": 0.15095833351263532, + "learning_rate": 5.404409623852227e-05, + "loss": 2.7677, + "step": 25459 + }, + { + "epoch": 1.5804829598361163, + "grad_norm": 0.13625545340963482, + "learning_rate": 5.404049653626872e-05, + "loss": 2.7615, + "step": 25460 + }, + { + "epoch": 1.5805450369358742, + "grad_norm": 0.15080996669802688, + "learning_rate": 5.4036896812934924e-05, + "loss": 2.6081, + "step": 25461 + }, + { + "epoch": 1.5806071140356321, + "grad_norm": 0.16475086894913255, + "learning_rate": 5.403329706853966e-05, + "loss": 2.6926, + "step": 25462 + }, + { + "epoch": 1.58066919113539, + "grad_norm": 0.1657272695642198, + "learning_rate": 5.40296973031017e-05, + "loss": 2.8591, + "step": 25463 + }, + { + "epoch": 1.580731268235148, + "grad_norm": 0.16021794200944223, + "learning_rate": 5.402609751663983e-05, + "loss": 2.8107, + "step": 25464 + }, + { + "epoch": 1.580793345334906, + "grad_norm": 0.1464228500860498, + "learning_rate": 5.4022497709172845e-05, + "loss": 2.8075, + "step": 25465 + }, + { + "epoch": 1.5808554224346638, + "grad_norm": 0.1405945585998766, + "learning_rate": 5.401889788071951e-05, + "loss": 2.845, + "step": 25466 + }, + { + "epoch": 1.5809174995344217, + "grad_norm": 0.18535625303099734, + "learning_rate": 5.40152980312986e-05, + "loss": 2.812, + "step": 25467 + }, + { + "epoch": 1.5809795766341797, + "grad_norm": 0.16252906314755347, + "learning_rate": 5.401169816092891e-05, + "loss": 2.8344, + "step": 25468 + }, + { + "epoch": 1.5810416537339376, + "grad_norm": 0.15482818895824288, + "learning_rate": 5.400809826962921e-05, + "loss": 2.8463, + "step": 25469 + }, + { + "epoch": 1.5811037308336955, + "grad_norm": 0.15508648275727552, + "learning_rate": 5.40044983574183e-05, + "loss": 2.8698, + "step": 25470 + }, + { + "epoch": 1.5811658079334534, + "grad_norm": 0.1518209240249899, + "learning_rate": 5.4000898424314935e-05, + "loss": 2.7659, + "step": 25471 + }, + { + "epoch": 1.5812278850332113, + "grad_norm": 0.1586957432080893, + "learning_rate": 5.3997298470337934e-05, + "loss": 2.8021, + "step": 25472 + }, + { + "epoch": 1.5812899621329692, + "grad_norm": 0.15432504203685507, + "learning_rate": 5.399369849550604e-05, + "loss": 2.7957, + "step": 25473 + }, + { + "epoch": 1.5813520392327272, + "grad_norm": 0.14854345179359457, + "learning_rate": 5.399009849983806e-05, + "loss": 2.8592, + "step": 25474 + }, + { + "epoch": 1.581414116332485, + "grad_norm": 0.20854414267853272, + "learning_rate": 5.3986498483352755e-05, + "loss": 2.8121, + "step": 25475 + }, + { + "epoch": 1.581476193432243, + "grad_norm": 0.14427514841555938, + "learning_rate": 5.398289844606892e-05, + "loss": 2.8181, + "step": 25476 + }, + { + "epoch": 1.5815382705320007, + "grad_norm": 0.15613072730593403, + "learning_rate": 5.397929838800535e-05, + "loss": 2.7937, + "step": 25477 + }, + { + "epoch": 1.5816003476317586, + "grad_norm": 0.16061529803754623, + "learning_rate": 5.39756983091808e-05, + "loss": 2.7797, + "step": 25478 + }, + { + "epoch": 1.5816624247315165, + "grad_norm": 0.16619708858579768, + "learning_rate": 5.3972098209614084e-05, + "loss": 2.8302, + "step": 25479 + }, + { + "epoch": 1.5817245018312744, + "grad_norm": 0.13948840194646075, + "learning_rate": 5.396849808932395e-05, + "loss": 2.6887, + "step": 25480 + }, + { + "epoch": 1.5817865789310324, + "grad_norm": 0.1503509927522286, + "learning_rate": 5.396489794832922e-05, + "loss": 2.7773, + "step": 25481 + }, + { + "epoch": 1.5818486560307903, + "grad_norm": 0.18546446927858218, + "learning_rate": 5.396129778664863e-05, + "loss": 2.7478, + "step": 25482 + }, + { + "epoch": 1.581910733130548, + "grad_norm": 0.1574927617672905, + "learning_rate": 5.3957697604301004e-05, + "loss": 2.8583, + "step": 25483 + }, + { + "epoch": 1.581972810230306, + "grad_norm": 0.1584814531447525, + "learning_rate": 5.395409740130511e-05, + "loss": 2.7563, + "step": 25484 + }, + { + "epoch": 1.5820348873300638, + "grad_norm": 0.1409635045955752, + "learning_rate": 5.395049717767971e-05, + "loss": 2.7749, + "step": 25485 + }, + { + "epoch": 1.5820969644298217, + "grad_norm": 0.14930450561681627, + "learning_rate": 5.3946896933443636e-05, + "loss": 2.7161, + "step": 25486 + }, + { + "epoch": 1.5821590415295796, + "grad_norm": 0.16117697023999872, + "learning_rate": 5.394329666861562e-05, + "loss": 2.8448, + "step": 25487 + }, + { + "epoch": 1.5822211186293376, + "grad_norm": 0.16253211731065945, + "learning_rate": 5.393969638321448e-05, + "loss": 2.8558, + "step": 25488 + }, + { + "epoch": 1.5822831957290955, + "grad_norm": 0.14589927805871644, + "learning_rate": 5.393609607725898e-05, + "loss": 2.6915, + "step": 25489 + }, + { + "epoch": 1.5823452728288534, + "grad_norm": 0.15348814021807353, + "learning_rate": 5.393249575076792e-05, + "loss": 2.7711, + "step": 25490 + }, + { + "epoch": 1.5824073499286113, + "grad_norm": 0.14679423937377942, + "learning_rate": 5.392889540376006e-05, + "loss": 2.7871, + "step": 25491 + }, + { + "epoch": 1.5824694270283692, + "grad_norm": 0.1526473211900403, + "learning_rate": 5.3925295036254206e-05, + "loss": 2.8701, + "step": 25492 + }, + { + "epoch": 1.5825315041281272, + "grad_norm": 0.14890274745228255, + "learning_rate": 5.392169464826914e-05, + "loss": 2.7324, + "step": 25493 + }, + { + "epoch": 1.582593581227885, + "grad_norm": 0.14648248671268282, + "learning_rate": 5.3918094239823635e-05, + "loss": 2.7161, + "step": 25494 + }, + { + "epoch": 1.582655658327643, + "grad_norm": 0.15857641895852517, + "learning_rate": 5.39144938109365e-05, + "loss": 2.7473, + "step": 25495 + }, + { + "epoch": 1.582717735427401, + "grad_norm": 0.1483128425780663, + "learning_rate": 5.3910893361626466e-05, + "loss": 2.7298, + "step": 25496 + }, + { + "epoch": 1.5827798125271588, + "grad_norm": 0.16225778726080453, + "learning_rate": 5.390729289191239e-05, + "loss": 2.8198, + "step": 25497 + }, + { + "epoch": 1.5828418896269167, + "grad_norm": 0.13917447066721186, + "learning_rate": 5.390369240181299e-05, + "loss": 2.8556, + "step": 25498 + }, + { + "epoch": 1.5829039667266747, + "grad_norm": 0.1521766185982752, + "learning_rate": 5.390009189134709e-05, + "loss": 2.875, + "step": 25499 + }, + { + "epoch": 1.5829660438264326, + "grad_norm": 0.14942043265931343, + "learning_rate": 5.389649136053345e-05, + "loss": 2.6477, + "step": 25500 + }, + { + "epoch": 1.5830281209261903, + "grad_norm": 0.1507396919911845, + "learning_rate": 5.38928908093909e-05, + "loss": 2.8265, + "step": 25501 + }, + { + "epoch": 1.5830901980259482, + "grad_norm": 0.15016474418947828, + "learning_rate": 5.3889290237938174e-05, + "loss": 2.7868, + "step": 25502 + }, + { + "epoch": 1.583152275125706, + "grad_norm": 0.16839909870404587, + "learning_rate": 5.388568964619407e-05, + "loss": 2.8245, + "step": 25503 + }, + { + "epoch": 1.583214352225464, + "grad_norm": 0.17523654677890096, + "learning_rate": 5.3882089034177393e-05, + "loss": 2.8563, + "step": 25504 + }, + { + "epoch": 1.583276429325222, + "grad_norm": 0.15447504283620755, + "learning_rate": 5.3878488401906904e-05, + "loss": 2.8829, + "step": 25505 + }, + { + "epoch": 1.5833385064249799, + "grad_norm": 0.15854329753639285, + "learning_rate": 5.3874887749401415e-05, + "loss": 2.7973, + "step": 25506 + }, + { + "epoch": 1.5834005835247376, + "grad_norm": 0.1728481680326621, + "learning_rate": 5.3871287076679675e-05, + "loss": 2.7981, + "step": 25507 + }, + { + "epoch": 1.5834626606244955, + "grad_norm": 0.1540254789444966, + "learning_rate": 5.3867686383760495e-05, + "loss": 2.8022, + "step": 25508 + }, + { + "epoch": 1.5835247377242534, + "grad_norm": 0.1936247863245032, + "learning_rate": 5.386408567066267e-05, + "loss": 2.77, + "step": 25509 + }, + { + "epoch": 1.5835868148240113, + "grad_norm": 0.14996297523403837, + "learning_rate": 5.386048493740495e-05, + "loss": 2.7959, + "step": 25510 + }, + { + "epoch": 1.5836488919237692, + "grad_norm": 0.16728483735614763, + "learning_rate": 5.385688418400617e-05, + "loss": 2.8016, + "step": 25511 + }, + { + "epoch": 1.5837109690235271, + "grad_norm": 0.15177062016155568, + "learning_rate": 5.385328341048507e-05, + "loss": 2.8465, + "step": 25512 + }, + { + "epoch": 1.583773046123285, + "grad_norm": 0.18921053978065322, + "learning_rate": 5.384968261686046e-05, + "loss": 2.7832, + "step": 25513 + }, + { + "epoch": 1.583835123223043, + "grad_norm": 0.13880811634037982, + "learning_rate": 5.3846081803151125e-05, + "loss": 2.7691, + "step": 25514 + }, + { + "epoch": 1.583897200322801, + "grad_norm": 0.23548420852160154, + "learning_rate": 5.3842480969375855e-05, + "loss": 2.8057, + "step": 25515 + }, + { + "epoch": 1.5839592774225588, + "grad_norm": 0.1490564117347908, + "learning_rate": 5.3838880115553416e-05, + "loss": 2.9397, + "step": 25516 + }, + { + "epoch": 1.5840213545223167, + "grad_norm": 0.1671203601021166, + "learning_rate": 5.3835279241702605e-05, + "loss": 2.7906, + "step": 25517 + }, + { + "epoch": 1.5840834316220747, + "grad_norm": 0.1758442553880207, + "learning_rate": 5.383167834784222e-05, + "loss": 2.9118, + "step": 25518 + }, + { + "epoch": 1.5841455087218326, + "grad_norm": 0.13787135240773998, + "learning_rate": 5.382807743399103e-05, + "loss": 2.7224, + "step": 25519 + }, + { + "epoch": 1.5842075858215905, + "grad_norm": 0.16665328364483534, + "learning_rate": 5.382447650016785e-05, + "loss": 2.792, + "step": 25520 + }, + { + "epoch": 1.5842696629213484, + "grad_norm": 0.1450860933591907, + "learning_rate": 5.382087554639143e-05, + "loss": 2.8518, + "step": 25521 + }, + { + "epoch": 1.5843317400211063, + "grad_norm": 0.15411538906946065, + "learning_rate": 5.3817274572680574e-05, + "loss": 2.8141, + "step": 25522 + }, + { + "epoch": 1.5843938171208642, + "grad_norm": 0.17372829270782508, + "learning_rate": 5.381367357905408e-05, + "loss": 2.734, + "step": 25523 + }, + { + "epoch": 1.5844558942206222, + "grad_norm": 0.16199381857612666, + "learning_rate": 5.3810072565530714e-05, + "loss": 2.702, + "step": 25524 + }, + { + "epoch": 1.5845179713203799, + "grad_norm": 0.15972131310482954, + "learning_rate": 5.3806471532129274e-05, + "loss": 2.7865, + "step": 25525 + }, + { + "epoch": 1.5845800484201378, + "grad_norm": 0.151596467891605, + "learning_rate": 5.380287047886856e-05, + "loss": 2.7543, + "step": 25526 + }, + { + "epoch": 1.5846421255198957, + "grad_norm": 0.17478189373051273, + "learning_rate": 5.379926940576734e-05, + "loss": 2.7889, + "step": 25527 + }, + { + "epoch": 1.5847042026196536, + "grad_norm": 0.15363278710747963, + "learning_rate": 5.3795668312844405e-05, + "loss": 2.8267, + "step": 25528 + }, + { + "epoch": 1.5847662797194115, + "grad_norm": 0.1869000033967695, + "learning_rate": 5.379206720011857e-05, + "loss": 2.8083, + "step": 25529 + }, + { + "epoch": 1.5848283568191694, + "grad_norm": 0.15655760289145645, + "learning_rate": 5.3788466067608566e-05, + "loss": 2.8249, + "step": 25530 + }, + { + "epoch": 1.5848904339189271, + "grad_norm": 0.17080181114972445, + "learning_rate": 5.378486491533324e-05, + "loss": 2.8279, + "step": 25531 + }, + { + "epoch": 1.584952511018685, + "grad_norm": 0.144096442173247, + "learning_rate": 5.3781263743311335e-05, + "loss": 2.7438, + "step": 25532 + }, + { + "epoch": 1.585014588118443, + "grad_norm": 0.14963364516326164, + "learning_rate": 5.377766255156168e-05, + "loss": 2.9051, + "step": 25533 + }, + { + "epoch": 1.585076665218201, + "grad_norm": 0.2004447964177935, + "learning_rate": 5.377406134010303e-05, + "loss": 2.8359, + "step": 25534 + }, + { + "epoch": 1.5851387423179588, + "grad_norm": 0.1503396319350856, + "learning_rate": 5.3770460108954194e-05, + "loss": 2.8534, + "step": 25535 + }, + { + "epoch": 1.5852008194177167, + "grad_norm": 0.16306600620203798, + "learning_rate": 5.376685885813394e-05, + "loss": 2.7438, + "step": 25536 + }, + { + "epoch": 1.5852628965174747, + "grad_norm": 0.14949764457054346, + "learning_rate": 5.376325758766109e-05, + "loss": 2.7686, + "step": 25537 + }, + { + "epoch": 1.5853249736172326, + "grad_norm": 0.15503945505097802, + "learning_rate": 5.3759656297554386e-05, + "loss": 2.7818, + "step": 25538 + }, + { + "epoch": 1.5853870507169905, + "grad_norm": 0.16176491069350435, + "learning_rate": 5.375605498783266e-05, + "loss": 2.762, + "step": 25539 + }, + { + "epoch": 1.5854491278167484, + "grad_norm": 0.14761154744461696, + "learning_rate": 5.375245365851468e-05, + "loss": 2.7336, + "step": 25540 + }, + { + "epoch": 1.5855112049165063, + "grad_norm": 0.151208301955807, + "learning_rate": 5.3748852309619224e-05, + "loss": 2.8129, + "step": 25541 + }, + { + "epoch": 1.5855732820162642, + "grad_norm": 0.14616330245512268, + "learning_rate": 5.374525094116511e-05, + "loss": 2.7816, + "step": 25542 + }, + { + "epoch": 1.5856353591160222, + "grad_norm": 0.16862449614885752, + "learning_rate": 5.3741649553171103e-05, + "loss": 2.7954, + "step": 25543 + }, + { + "epoch": 1.58569743621578, + "grad_norm": 0.16237946496540823, + "learning_rate": 5.373804814565602e-05, + "loss": 2.7683, + "step": 25544 + }, + { + "epoch": 1.585759513315538, + "grad_norm": 0.1995475617345335, + "learning_rate": 5.373444671863862e-05, + "loss": 2.7701, + "step": 25545 + }, + { + "epoch": 1.585821590415296, + "grad_norm": 0.14684258214839707, + "learning_rate": 5.373084527213771e-05, + "loss": 2.8365, + "step": 25546 + }, + { + "epoch": 1.5858836675150538, + "grad_norm": 0.14985426091232246, + "learning_rate": 5.3727243806172065e-05, + "loss": 2.8321, + "step": 25547 + }, + { + "epoch": 1.5859457446148117, + "grad_norm": 0.1533683045712734, + "learning_rate": 5.372364232076049e-05, + "loss": 2.7911, + "step": 25548 + }, + { + "epoch": 1.5860078217145694, + "grad_norm": 0.15422792946532546, + "learning_rate": 5.372004081592177e-05, + "loss": 2.8602, + "step": 25549 + }, + { + "epoch": 1.5860698988143274, + "grad_norm": 0.2219841836200973, + "learning_rate": 5.371643929167469e-05, + "loss": 2.7788, + "step": 25550 + }, + { + "epoch": 1.5861319759140853, + "grad_norm": 0.15143363478620642, + "learning_rate": 5.371283774803806e-05, + "loss": 2.848, + "step": 25551 + }, + { + "epoch": 1.5861940530138432, + "grad_norm": 0.1628037998873258, + "learning_rate": 5.370923618503063e-05, + "loss": 2.7726, + "step": 25552 + }, + { + "epoch": 1.5862561301136011, + "grad_norm": 0.16615080098545412, + "learning_rate": 5.370563460267123e-05, + "loss": 2.7139, + "step": 25553 + }, + { + "epoch": 1.586318207213359, + "grad_norm": 0.15212241012606698, + "learning_rate": 5.370203300097863e-05, + "loss": 2.8369, + "step": 25554 + }, + { + "epoch": 1.5863802843131167, + "grad_norm": 0.22412426848645886, + "learning_rate": 5.369843137997163e-05, + "loss": 2.7915, + "step": 25555 + }, + { + "epoch": 1.5864423614128746, + "grad_norm": 0.1472063456252769, + "learning_rate": 5.3694829739669006e-05, + "loss": 2.8616, + "step": 25556 + }, + { + "epoch": 1.5865044385126326, + "grad_norm": 0.15207344307703738, + "learning_rate": 5.369122808008955e-05, + "loss": 2.7518, + "step": 25557 + }, + { + "epoch": 1.5865665156123905, + "grad_norm": 0.15175243697102872, + "learning_rate": 5.368762640125208e-05, + "loss": 2.7677, + "step": 25558 + }, + { + "epoch": 1.5866285927121484, + "grad_norm": 0.17176566656260167, + "learning_rate": 5.368402470317536e-05, + "loss": 2.8328, + "step": 25559 + }, + { + "epoch": 1.5866906698119063, + "grad_norm": 0.15545242118388553, + "learning_rate": 5.36804229858782e-05, + "loss": 2.8211, + "step": 25560 + }, + { + "epoch": 1.5867527469116642, + "grad_norm": 0.17503658023053997, + "learning_rate": 5.367682124937936e-05, + "loss": 2.8146, + "step": 25561 + }, + { + "epoch": 1.5868148240114222, + "grad_norm": 0.15352761170859103, + "learning_rate": 5.367321949369767e-05, + "loss": 2.8391, + "step": 25562 + }, + { + "epoch": 1.58687690111118, + "grad_norm": 0.1555773980806706, + "learning_rate": 5.3669617718851883e-05, + "loss": 2.8547, + "step": 25563 + }, + { + "epoch": 1.586938978210938, + "grad_norm": 0.2078656888598014, + "learning_rate": 5.366601592486083e-05, + "loss": 2.8156, + "step": 25564 + }, + { + "epoch": 1.587001055310696, + "grad_norm": 0.19672257676466384, + "learning_rate": 5.366241411174328e-05, + "loss": 2.8758, + "step": 25565 + }, + { + "epoch": 1.5870631324104538, + "grad_norm": 0.16263011803076025, + "learning_rate": 5.3658812279518014e-05, + "loss": 2.8665, + "step": 25566 + }, + { + "epoch": 1.5871252095102117, + "grad_norm": 0.15680054589513773, + "learning_rate": 5.365521042820384e-05, + "loss": 2.8048, + "step": 25567 + }, + { + "epoch": 1.5871872866099697, + "grad_norm": 0.16400064710236445, + "learning_rate": 5.3651608557819546e-05, + "loss": 2.8558, + "step": 25568 + }, + { + "epoch": 1.5872493637097276, + "grad_norm": 0.15956144249868595, + "learning_rate": 5.364800666838393e-05, + "loss": 2.7532, + "step": 25569 + }, + { + "epoch": 1.5873114408094855, + "grad_norm": 0.18049648864400408, + "learning_rate": 5.3644404759915766e-05, + "loss": 2.8049, + "step": 25570 + }, + { + "epoch": 1.5873735179092434, + "grad_norm": 0.20998314141258093, + "learning_rate": 5.364080283243387e-05, + "loss": 2.8131, + "step": 25571 + }, + { + "epoch": 1.5874355950090013, + "grad_norm": 0.21100974611795142, + "learning_rate": 5.363720088595701e-05, + "loss": 2.8306, + "step": 25572 + }, + { + "epoch": 1.587497672108759, + "grad_norm": 0.21906874628725723, + "learning_rate": 5.363359892050399e-05, + "loss": 2.8782, + "step": 25573 + }, + { + "epoch": 1.587559749208517, + "grad_norm": 0.1571220619318451, + "learning_rate": 5.3629996936093605e-05, + "loss": 2.7632, + "step": 25574 + }, + { + "epoch": 1.5876218263082749, + "grad_norm": 0.17882335233173272, + "learning_rate": 5.362639493274465e-05, + "loss": 2.7983, + "step": 25575 + }, + { + "epoch": 1.5876839034080328, + "grad_norm": 0.15980096696594198, + "learning_rate": 5.3622792910475914e-05, + "loss": 2.7543, + "step": 25576 + }, + { + "epoch": 1.5877459805077907, + "grad_norm": 0.18917163111002303, + "learning_rate": 5.361919086930618e-05, + "loss": 2.7792, + "step": 25577 + }, + { + "epoch": 1.5878080576075486, + "grad_norm": 0.16429605208992132, + "learning_rate": 5.361558880925426e-05, + "loss": 2.7309, + "step": 25578 + }, + { + "epoch": 1.5878701347073063, + "grad_norm": 0.17138819680296596, + "learning_rate": 5.361198673033893e-05, + "loss": 2.904, + "step": 25579 + }, + { + "epoch": 1.5879322118070642, + "grad_norm": 0.15007056821417758, + "learning_rate": 5.360838463257898e-05, + "loss": 2.7388, + "step": 25580 + }, + { + "epoch": 1.5879942889068221, + "grad_norm": 0.15975814279509726, + "learning_rate": 5.360478251599321e-05, + "loss": 2.7147, + "step": 25581 + }, + { + "epoch": 1.58805636600658, + "grad_norm": 0.15306045597568327, + "learning_rate": 5.3601180380600434e-05, + "loss": 2.7892, + "step": 25582 + }, + { + "epoch": 1.588118443106338, + "grad_norm": 0.16098182424320015, + "learning_rate": 5.359757822641942e-05, + "loss": 2.7543, + "step": 25583 + }, + { + "epoch": 1.588180520206096, + "grad_norm": 0.2306230319593035, + "learning_rate": 5.359397605346895e-05, + "loss": 2.7415, + "step": 25584 + }, + { + "epoch": 1.5882425973058538, + "grad_norm": 0.16283925181720577, + "learning_rate": 5.359037386176785e-05, + "loss": 2.791, + "step": 25585 + }, + { + "epoch": 1.5883046744056117, + "grad_norm": 0.1609769703323689, + "learning_rate": 5.3586771651334896e-05, + "loss": 2.7871, + "step": 25586 + }, + { + "epoch": 1.5883667515053697, + "grad_norm": 0.1639032669723243, + "learning_rate": 5.358316942218888e-05, + "loss": 2.8172, + "step": 25587 + }, + { + "epoch": 1.5884288286051276, + "grad_norm": 0.15549263986267672, + "learning_rate": 5.35795671743486e-05, + "loss": 2.7352, + "step": 25588 + }, + { + "epoch": 1.5884909057048855, + "grad_norm": 0.15245128773071587, + "learning_rate": 5.3575964907832855e-05, + "loss": 2.7858, + "step": 25589 + }, + { + "epoch": 1.5885529828046434, + "grad_norm": 0.15583869323611754, + "learning_rate": 5.357236262266042e-05, + "loss": 2.8593, + "step": 25590 + }, + { + "epoch": 1.5886150599044013, + "grad_norm": 0.16668976621192216, + "learning_rate": 5.356876031885011e-05, + "loss": 2.7661, + "step": 25591 + }, + { + "epoch": 1.5886771370041592, + "grad_norm": 0.1654789552882554, + "learning_rate": 5.3565157996420715e-05, + "loss": 2.8455, + "step": 25592 + }, + { + "epoch": 1.5887392141039172, + "grad_norm": 0.16564932410646047, + "learning_rate": 5.3561555655391014e-05, + "loss": 2.8609, + "step": 25593 + }, + { + "epoch": 1.588801291203675, + "grad_norm": 0.14741258270501045, + "learning_rate": 5.355795329577983e-05, + "loss": 2.8562, + "step": 25594 + }, + { + "epoch": 1.588863368303433, + "grad_norm": 0.17259280454475778, + "learning_rate": 5.355435091760592e-05, + "loss": 2.7631, + "step": 25595 + }, + { + "epoch": 1.588925445403191, + "grad_norm": 0.15292610831718875, + "learning_rate": 5.355074852088811e-05, + "loss": 2.85, + "step": 25596 + }, + { + "epoch": 1.5889875225029486, + "grad_norm": 0.15932239384194927, + "learning_rate": 5.3547146105645175e-05, + "loss": 2.8584, + "step": 25597 + }, + { + "epoch": 1.5890495996027065, + "grad_norm": 0.17415206706439615, + "learning_rate": 5.3543543671895933e-05, + "loss": 2.8259, + "step": 25598 + }, + { + "epoch": 1.5891116767024644, + "grad_norm": 0.16342039134519862, + "learning_rate": 5.353994121965914e-05, + "loss": 2.8781, + "step": 25599 + }, + { + "epoch": 1.5891737538022224, + "grad_norm": 0.14571661337018835, + "learning_rate": 5.353633874895364e-05, + "loss": 2.8548, + "step": 25600 + }, + { + "epoch": 1.5892358309019803, + "grad_norm": 0.15711881369260905, + "learning_rate": 5.35327362597982e-05, + "loss": 2.7562, + "step": 25601 + }, + { + "epoch": 1.5892979080017382, + "grad_norm": 0.15681861886332982, + "learning_rate": 5.35291337522116e-05, + "loss": 2.8655, + "step": 25602 + }, + { + "epoch": 1.589359985101496, + "grad_norm": 0.1471540339402892, + "learning_rate": 5.3525531226212664e-05, + "loss": 2.7808, + "step": 25603 + }, + { + "epoch": 1.5894220622012538, + "grad_norm": 0.16805731301586302, + "learning_rate": 5.3521928681820164e-05, + "loss": 2.7627, + "step": 25604 + }, + { + "epoch": 1.5894841393010117, + "grad_norm": 0.14437875564239963, + "learning_rate": 5.351832611905292e-05, + "loss": 2.8109, + "step": 25605 + }, + { + "epoch": 1.5895462164007697, + "grad_norm": 0.1463819845298019, + "learning_rate": 5.35147235379297e-05, + "loss": 2.7903, + "step": 25606 + }, + { + "epoch": 1.5896082935005276, + "grad_norm": 0.16291958782397545, + "learning_rate": 5.3511120938469325e-05, + "loss": 2.8543, + "step": 25607 + }, + { + "epoch": 1.5896703706002855, + "grad_norm": 0.13466763043378419, + "learning_rate": 5.350751832069059e-05, + "loss": 2.799, + "step": 25608 + }, + { + "epoch": 1.5897324477000434, + "grad_norm": 0.17134507262863746, + "learning_rate": 5.350391568461226e-05, + "loss": 2.7858, + "step": 25609 + }, + { + "epoch": 1.5897945247998013, + "grad_norm": 0.14934474608765036, + "learning_rate": 5.350031303025316e-05, + "loss": 2.7973, + "step": 25610 + }, + { + "epoch": 1.5898566018995592, + "grad_norm": 0.1449084964653749, + "learning_rate": 5.3496710357632075e-05, + "loss": 2.7916, + "step": 25611 + }, + { + "epoch": 1.5899186789993172, + "grad_norm": 0.1593952124064227, + "learning_rate": 5.3493107666767806e-05, + "loss": 2.8488, + "step": 25612 + }, + { + "epoch": 1.589980756099075, + "grad_norm": 0.14834627686263624, + "learning_rate": 5.348950495767915e-05, + "loss": 2.6344, + "step": 25613 + }, + { + "epoch": 1.590042833198833, + "grad_norm": 0.14568047232351766, + "learning_rate": 5.34859022303849e-05, + "loss": 2.8721, + "step": 25614 + }, + { + "epoch": 1.590104910298591, + "grad_norm": 0.15238461802140746, + "learning_rate": 5.348229948490384e-05, + "loss": 2.6799, + "step": 25615 + }, + { + "epoch": 1.5901669873983488, + "grad_norm": 0.14465716393108505, + "learning_rate": 5.3478696721254785e-05, + "loss": 2.7615, + "step": 25616 + }, + { + "epoch": 1.5902290644981067, + "grad_norm": 0.14803499939124032, + "learning_rate": 5.347509393945652e-05, + "loss": 2.7916, + "step": 25617 + }, + { + "epoch": 1.5902911415978647, + "grad_norm": 0.14654915966494514, + "learning_rate": 5.347149113952786e-05, + "loss": 2.7987, + "step": 25618 + }, + { + "epoch": 1.5903532186976226, + "grad_norm": 0.17264832115954834, + "learning_rate": 5.346788832148758e-05, + "loss": 2.881, + "step": 25619 + }, + { + "epoch": 1.5904152957973805, + "grad_norm": 0.1478138294724725, + "learning_rate": 5.346428548535448e-05, + "loss": 2.7996, + "step": 25620 + }, + { + "epoch": 1.5904773728971382, + "grad_norm": 0.14601848264768605, + "learning_rate": 5.3460682631147375e-05, + "loss": 2.7876, + "step": 25621 + }, + { + "epoch": 1.5905394499968961, + "grad_norm": 0.14412831509756827, + "learning_rate": 5.345707975888503e-05, + "loss": 2.7638, + "step": 25622 + }, + { + "epoch": 1.590601527096654, + "grad_norm": 0.14062671527876078, + "learning_rate": 5.345347686858627e-05, + "loss": 2.8166, + "step": 25623 + }, + { + "epoch": 1.590663604196412, + "grad_norm": 0.16662696586566716, + "learning_rate": 5.344987396026988e-05, + "loss": 2.7938, + "step": 25624 + }, + { + "epoch": 1.5907256812961699, + "grad_norm": 0.14794898583925756, + "learning_rate": 5.344627103395466e-05, + "loss": 2.7861, + "step": 25625 + }, + { + "epoch": 1.5907877583959278, + "grad_norm": 0.1486232060199038, + "learning_rate": 5.344266808965942e-05, + "loss": 2.7656, + "step": 25626 + }, + { + "epoch": 1.5908498354956855, + "grad_norm": 0.1450977564005314, + "learning_rate": 5.343906512740293e-05, + "loss": 2.815, + "step": 25627 + }, + { + "epoch": 1.5909119125954434, + "grad_norm": 0.1504479583841951, + "learning_rate": 5.343546214720401e-05, + "loss": 2.7702, + "step": 25628 + }, + { + "epoch": 1.5909739896952013, + "grad_norm": 0.1621923004755627, + "learning_rate": 5.3431859149081445e-05, + "loss": 2.8134, + "step": 25629 + }, + { + "epoch": 1.5910360667949592, + "grad_norm": 0.15200376475841781, + "learning_rate": 5.342825613305405e-05, + "loss": 2.7664, + "step": 25630 + }, + { + "epoch": 1.5910981438947172, + "grad_norm": 0.17068465348960288, + "learning_rate": 5.342465309914059e-05, + "loss": 2.8212, + "step": 25631 + }, + { + "epoch": 1.591160220994475, + "grad_norm": 0.15629470304928564, + "learning_rate": 5.34210500473599e-05, + "loss": 2.7998, + "step": 25632 + }, + { + "epoch": 1.591222298094233, + "grad_norm": 0.167066440470513, + "learning_rate": 5.341744697773075e-05, + "loss": 2.7941, + "step": 25633 + }, + { + "epoch": 1.591284375193991, + "grad_norm": 0.17301604424688255, + "learning_rate": 5.3413843890271964e-05, + "loss": 2.784, + "step": 25634 + }, + { + "epoch": 1.5913464522937488, + "grad_norm": 0.14888632830223703, + "learning_rate": 5.341024078500231e-05, + "loss": 2.8588, + "step": 25635 + }, + { + "epoch": 1.5914085293935067, + "grad_norm": 0.1474724821914641, + "learning_rate": 5.3406637661940615e-05, + "loss": 2.7853, + "step": 25636 + }, + { + "epoch": 1.5914706064932647, + "grad_norm": 0.15053511156407703, + "learning_rate": 5.3403034521105663e-05, + "loss": 2.8202, + "step": 25637 + }, + { + "epoch": 1.5915326835930226, + "grad_norm": 0.14400962184964897, + "learning_rate": 5.339943136251624e-05, + "loss": 2.7867, + "step": 25638 + }, + { + "epoch": 1.5915947606927805, + "grad_norm": 0.18549443557971423, + "learning_rate": 5.339582818619117e-05, + "loss": 2.8332, + "step": 25639 + }, + { + "epoch": 1.5916568377925384, + "grad_norm": 0.15280348538604077, + "learning_rate": 5.339222499214922e-05, + "loss": 2.8292, + "step": 25640 + }, + { + "epoch": 1.5917189148922963, + "grad_norm": 0.1586818516129037, + "learning_rate": 5.338862178040924e-05, + "loss": 2.8458, + "step": 25641 + }, + { + "epoch": 1.5917809919920543, + "grad_norm": 0.1542043408442464, + "learning_rate": 5.338501855098997e-05, + "loss": 2.7046, + "step": 25642 + }, + { + "epoch": 1.5918430690918122, + "grad_norm": 0.14744540631690412, + "learning_rate": 5.338141530391026e-05, + "loss": 2.7813, + "step": 25643 + }, + { + "epoch": 1.59190514619157, + "grad_norm": 0.15077684400969749, + "learning_rate": 5.337781203918888e-05, + "loss": 2.7917, + "step": 25644 + }, + { + "epoch": 1.5919672232913278, + "grad_norm": 0.14517879410594922, + "learning_rate": 5.337420875684462e-05, + "loss": 2.7911, + "step": 25645 + }, + { + "epoch": 1.5920293003910857, + "grad_norm": 0.18825618925343432, + "learning_rate": 5.3370605456896315e-05, + "loss": 2.7586, + "step": 25646 + }, + { + "epoch": 1.5920913774908436, + "grad_norm": 0.16434024960187885, + "learning_rate": 5.336700213936273e-05, + "loss": 2.8218, + "step": 25647 + }, + { + "epoch": 1.5921534545906015, + "grad_norm": 0.1601761945075621, + "learning_rate": 5.336339880426269e-05, + "loss": 2.7854, + "step": 25648 + }, + { + "epoch": 1.5922155316903595, + "grad_norm": 0.15645125659980194, + "learning_rate": 5.335979545161496e-05, + "loss": 2.8092, + "step": 25649 + }, + { + "epoch": 1.5922776087901174, + "grad_norm": 0.15104666583787715, + "learning_rate": 5.335619208143837e-05, + "loss": 2.7959, + "step": 25650 + }, + { + "epoch": 1.592339685889875, + "grad_norm": 0.14110573085134026, + "learning_rate": 5.335258869375171e-05, + "loss": 2.7847, + "step": 25651 + }, + { + "epoch": 1.592401762989633, + "grad_norm": 0.15241641976797268, + "learning_rate": 5.334898528857379e-05, + "loss": 2.8669, + "step": 25652 + }, + { + "epoch": 1.592463840089391, + "grad_norm": 0.16314097863348892, + "learning_rate": 5.3345381865923395e-05, + "loss": 2.8192, + "step": 25653 + }, + { + "epoch": 1.5925259171891488, + "grad_norm": 0.18501713069331763, + "learning_rate": 5.334177842581933e-05, + "loss": 2.7639, + "step": 25654 + }, + { + "epoch": 1.5925879942889067, + "grad_norm": 0.14902399920436446, + "learning_rate": 5.3338174968280393e-05, + "loss": 2.8278, + "step": 25655 + }, + { + "epoch": 1.5926500713886647, + "grad_norm": 0.13726404980541088, + "learning_rate": 5.333457149332539e-05, + "loss": 2.7828, + "step": 25656 + }, + { + "epoch": 1.5927121484884226, + "grad_norm": 0.15776427353579223, + "learning_rate": 5.3330968000973126e-05, + "loss": 2.754, + "step": 25657 + }, + { + "epoch": 1.5927742255881805, + "grad_norm": 0.14431626207719034, + "learning_rate": 5.3327364491242384e-05, + "loss": 2.7097, + "step": 25658 + }, + { + "epoch": 1.5928363026879384, + "grad_norm": 0.15368205117769967, + "learning_rate": 5.332376096415198e-05, + "loss": 2.8242, + "step": 25659 + }, + { + "epoch": 1.5928983797876963, + "grad_norm": 0.1401548805829089, + "learning_rate": 5.3320157419720696e-05, + "loss": 2.7165, + "step": 25660 + }, + { + "epoch": 1.5929604568874542, + "grad_norm": 0.1470507924169444, + "learning_rate": 5.331655385796736e-05, + "loss": 2.7775, + "step": 25661 + }, + { + "epoch": 1.5930225339872122, + "grad_norm": 0.16933690701111442, + "learning_rate": 5.331295027891076e-05, + "loss": 2.8106, + "step": 25662 + }, + { + "epoch": 1.59308461108697, + "grad_norm": 0.1490899041514767, + "learning_rate": 5.330934668256967e-05, + "loss": 2.7601, + "step": 25663 + }, + { + "epoch": 1.593146688186728, + "grad_norm": 0.16174072263782402, + "learning_rate": 5.330574306896294e-05, + "loss": 2.8876, + "step": 25664 + }, + { + "epoch": 1.593208765286486, + "grad_norm": 0.1524334545615329, + "learning_rate": 5.330213943810933e-05, + "loss": 2.7698, + "step": 25665 + }, + { + "epoch": 1.5932708423862438, + "grad_norm": 0.14490410035114085, + "learning_rate": 5.329853579002767e-05, + "loss": 2.7335, + "step": 25666 + }, + { + "epoch": 1.5933329194860018, + "grad_norm": 0.1433183125304122, + "learning_rate": 5.329493212473673e-05, + "loss": 2.8125, + "step": 25667 + }, + { + "epoch": 1.5933949965857597, + "grad_norm": 0.15285935325275365, + "learning_rate": 5.329132844225535e-05, + "loss": 2.8375, + "step": 25668 + }, + { + "epoch": 1.5934570736855174, + "grad_norm": 0.1533609202846847, + "learning_rate": 5.3287724742602296e-05, + "loss": 2.8075, + "step": 25669 + }, + { + "epoch": 1.5935191507852753, + "grad_norm": 0.15947426995455505, + "learning_rate": 5.3284121025796386e-05, + "loss": 2.7718, + "step": 25670 + }, + { + "epoch": 1.5935812278850332, + "grad_norm": 0.13936868403112562, + "learning_rate": 5.328051729185642e-05, + "loss": 2.8174, + "step": 25671 + }, + { + "epoch": 1.5936433049847911, + "grad_norm": 0.19139415352413855, + "learning_rate": 5.3276913540801196e-05, + "loss": 2.8048, + "step": 25672 + }, + { + "epoch": 1.593705382084549, + "grad_norm": 0.14411725213994486, + "learning_rate": 5.3273309772649524e-05, + "loss": 2.8097, + "step": 25673 + }, + { + "epoch": 1.593767459184307, + "grad_norm": 0.13911896480809527, + "learning_rate": 5.326970598742019e-05, + "loss": 2.7621, + "step": 25674 + }, + { + "epoch": 1.5938295362840647, + "grad_norm": 0.17197503083391408, + "learning_rate": 5.326610218513203e-05, + "loss": 2.7615, + "step": 25675 + }, + { + "epoch": 1.5938916133838226, + "grad_norm": 0.14540033403352334, + "learning_rate": 5.3262498365803804e-05, + "loss": 2.7285, + "step": 25676 + }, + { + "epoch": 1.5939536904835805, + "grad_norm": 0.16076223936658215, + "learning_rate": 5.325889452945434e-05, + "loss": 2.7667, + "step": 25677 + }, + { + "epoch": 1.5940157675833384, + "grad_norm": 0.14723177654925035, + "learning_rate": 5.325529067610242e-05, + "loss": 2.8101, + "step": 25678 + }, + { + "epoch": 1.5940778446830963, + "grad_norm": 0.15669634962254586, + "learning_rate": 5.325168680576686e-05, + "loss": 2.7859, + "step": 25679 + }, + { + "epoch": 1.5941399217828542, + "grad_norm": 0.1447557883155338, + "learning_rate": 5.324808291846648e-05, + "loss": 2.742, + "step": 25680 + }, + { + "epoch": 1.5942019988826122, + "grad_norm": 0.16949568938053025, + "learning_rate": 5.324447901422004e-05, + "loss": 2.7837, + "step": 25681 + }, + { + "epoch": 1.59426407598237, + "grad_norm": 0.1613752506414457, + "learning_rate": 5.3240875093046385e-05, + "loss": 2.8402, + "step": 25682 + }, + { + "epoch": 1.594326153082128, + "grad_norm": 0.13990181165117632, + "learning_rate": 5.3237271154964285e-05, + "loss": 2.8325, + "step": 25683 + }, + { + "epoch": 1.594388230181886, + "grad_norm": 0.14707356534200125, + "learning_rate": 5.3233667199992564e-05, + "loss": 2.7583, + "step": 25684 + }, + { + "epoch": 1.5944503072816438, + "grad_norm": 0.14314057832793545, + "learning_rate": 5.3230063228149994e-05, + "loss": 2.8078, + "step": 25685 + }, + { + "epoch": 1.5945123843814017, + "grad_norm": 0.14318773508489685, + "learning_rate": 5.3226459239455425e-05, + "loss": 2.8213, + "step": 25686 + }, + { + "epoch": 1.5945744614811597, + "grad_norm": 0.15094621632170496, + "learning_rate": 5.322285523392763e-05, + "loss": 2.8206, + "step": 25687 + }, + { + "epoch": 1.5946365385809176, + "grad_norm": 0.14865844996702743, + "learning_rate": 5.3219251211585394e-05, + "loss": 2.7828, + "step": 25688 + }, + { + "epoch": 1.5946986156806755, + "grad_norm": 0.15526122519013746, + "learning_rate": 5.321564717244757e-05, + "loss": 2.8474, + "step": 25689 + }, + { + "epoch": 1.5947606927804334, + "grad_norm": 0.146686370179271, + "learning_rate": 5.3212043116532925e-05, + "loss": 2.8158, + "step": 25690 + }, + { + "epoch": 1.5948227698801913, + "grad_norm": 0.15847953150457186, + "learning_rate": 5.320843904386028e-05, + "loss": 2.8198, + "step": 25691 + }, + { + "epoch": 1.5948848469799493, + "grad_norm": 0.15544569908516193, + "learning_rate": 5.320483495444841e-05, + "loss": 2.8257, + "step": 25692 + }, + { + "epoch": 1.594946924079707, + "grad_norm": 0.14481189261190955, + "learning_rate": 5.3201230848316165e-05, + "loss": 2.7998, + "step": 25693 + }, + { + "epoch": 1.5950090011794649, + "grad_norm": 0.1394737458143103, + "learning_rate": 5.3197626725482294e-05, + "loss": 2.8146, + "step": 25694 + }, + { + "epoch": 1.5950710782792228, + "grad_norm": 0.14073990392513136, + "learning_rate": 5.319402258596564e-05, + "loss": 2.8186, + "step": 25695 + }, + { + "epoch": 1.5951331553789807, + "grad_norm": 0.1660219415697174, + "learning_rate": 5.319041842978499e-05, + "loss": 2.7772, + "step": 25696 + }, + { + "epoch": 1.5951952324787386, + "grad_norm": 0.1446253495999317, + "learning_rate": 5.318681425695917e-05, + "loss": 2.8162, + "step": 25697 + }, + { + "epoch": 1.5952573095784965, + "grad_norm": 0.1466342858299386, + "learning_rate": 5.3183210067506964e-05, + "loss": 2.7788, + "step": 25698 + }, + { + "epoch": 1.5953193866782542, + "grad_norm": 0.15634154975588793, + "learning_rate": 5.317960586144716e-05, + "loss": 2.923, + "step": 25699 + }, + { + "epoch": 1.5953814637780122, + "grad_norm": 0.15187004608151652, + "learning_rate": 5.317600163879859e-05, + "loss": 2.8265, + "step": 25700 + }, + { + "epoch": 1.59544354087777, + "grad_norm": 0.15712083495885795, + "learning_rate": 5.3172397399580045e-05, + "loss": 2.793, + "step": 25701 + }, + { + "epoch": 1.595505617977528, + "grad_norm": 0.1543124227295717, + "learning_rate": 5.316879314381034e-05, + "loss": 2.7573, + "step": 25702 + }, + { + "epoch": 1.595567695077286, + "grad_norm": 0.16350933325300582, + "learning_rate": 5.316518887150827e-05, + "loss": 2.7828, + "step": 25703 + }, + { + "epoch": 1.5956297721770438, + "grad_norm": 0.1518600009352282, + "learning_rate": 5.3161584582692625e-05, + "loss": 2.7124, + "step": 25704 + }, + { + "epoch": 1.5956918492768017, + "grad_norm": 0.15332306415980035, + "learning_rate": 5.3157980277382243e-05, + "loss": 2.7682, + "step": 25705 + }, + { + "epoch": 1.5957539263765597, + "grad_norm": 0.15187086781622058, + "learning_rate": 5.3154375955595905e-05, + "loss": 2.7668, + "step": 25706 + }, + { + "epoch": 1.5958160034763176, + "grad_norm": 0.1387119369664195, + "learning_rate": 5.315077161735242e-05, + "loss": 2.7168, + "step": 25707 + }, + { + "epoch": 1.5958780805760755, + "grad_norm": 0.14310096219238816, + "learning_rate": 5.3147167262670604e-05, + "loss": 2.7932, + "step": 25708 + }, + { + "epoch": 1.5959401576758334, + "grad_norm": 0.14094442276541405, + "learning_rate": 5.3143562891569255e-05, + "loss": 2.8739, + "step": 25709 + }, + { + "epoch": 1.5960022347755913, + "grad_norm": 0.14339792034832158, + "learning_rate": 5.313995850406715e-05, + "loss": 2.8131, + "step": 25710 + }, + { + "epoch": 1.5960643118753493, + "grad_norm": 0.14413327118224087, + "learning_rate": 5.313635410018315e-05, + "loss": 2.8086, + "step": 25711 + }, + { + "epoch": 1.5961263889751072, + "grad_norm": 0.13926435217141422, + "learning_rate": 5.3132749679936014e-05, + "loss": 2.7612, + "step": 25712 + }, + { + "epoch": 1.596188466074865, + "grad_norm": 0.14595113336282167, + "learning_rate": 5.312914524334457e-05, + "loss": 2.8398, + "step": 25713 + }, + { + "epoch": 1.596250543174623, + "grad_norm": 0.13924389377711394, + "learning_rate": 5.31255407904276e-05, + "loss": 2.7873, + "step": 25714 + }, + { + "epoch": 1.596312620274381, + "grad_norm": 0.14842672685378724, + "learning_rate": 5.3121936321203935e-05, + "loss": 2.7894, + "step": 25715 + }, + { + "epoch": 1.5963746973741388, + "grad_norm": 0.14761841178158222, + "learning_rate": 5.311833183569237e-05, + "loss": 2.8774, + "step": 25716 + }, + { + "epoch": 1.5964367744738965, + "grad_norm": 0.13896146945783994, + "learning_rate": 5.311472733391172e-05, + "loss": 2.7754, + "step": 25717 + }, + { + "epoch": 1.5964988515736545, + "grad_norm": 0.14524339165690345, + "learning_rate": 5.311112281588078e-05, + "loss": 2.8426, + "step": 25718 + }, + { + "epoch": 1.5965609286734124, + "grad_norm": 0.13969908642773926, + "learning_rate": 5.310751828161834e-05, + "loss": 2.769, + "step": 25719 + }, + { + "epoch": 1.5966230057731703, + "grad_norm": 0.1415599761931598, + "learning_rate": 5.310391373114324e-05, + "loss": 2.8336, + "step": 25720 + }, + { + "epoch": 1.5966850828729282, + "grad_norm": 0.1442902011185087, + "learning_rate": 5.310030916447424e-05, + "loss": 2.8284, + "step": 25721 + }, + { + "epoch": 1.5967471599726861, + "grad_norm": 0.14705545368085168, + "learning_rate": 5.3096704581630197e-05, + "loss": 2.8334, + "step": 25722 + }, + { + "epoch": 1.5968092370724438, + "grad_norm": 0.1544492729470759, + "learning_rate": 5.3093099982629904e-05, + "loss": 2.8138, + "step": 25723 + }, + { + "epoch": 1.5968713141722017, + "grad_norm": 0.15072843461714608, + "learning_rate": 5.3089495367492136e-05, + "loss": 2.8274, + "step": 25724 + }, + { + "epoch": 1.5969333912719597, + "grad_norm": 0.15072613608176838, + "learning_rate": 5.3085890736235744e-05, + "loss": 2.8311, + "step": 25725 + }, + { + "epoch": 1.5969954683717176, + "grad_norm": 0.16620277066456143, + "learning_rate": 5.3082286088879486e-05, + "loss": 2.9265, + "step": 25726 + }, + { + "epoch": 1.5970575454714755, + "grad_norm": 0.14348265079590997, + "learning_rate": 5.307868142544221e-05, + "loss": 2.7254, + "step": 25727 + }, + { + "epoch": 1.5971196225712334, + "grad_norm": 0.15206049604502628, + "learning_rate": 5.30750767459427e-05, + "loss": 2.7927, + "step": 25728 + }, + { + "epoch": 1.5971816996709913, + "grad_norm": 0.15254356656037119, + "learning_rate": 5.307147205039977e-05, + "loss": 2.8484, + "step": 25729 + }, + { + "epoch": 1.5972437767707492, + "grad_norm": 0.14524727095651724, + "learning_rate": 5.3067867338832224e-05, + "loss": 2.891, + "step": 25730 + }, + { + "epoch": 1.5973058538705072, + "grad_norm": 0.16340884489018803, + "learning_rate": 5.3064262611258876e-05, + "loss": 2.8055, + "step": 25731 + }, + { + "epoch": 1.597367930970265, + "grad_norm": 0.16253933132546364, + "learning_rate": 5.3060657867698505e-05, + "loss": 2.7607, + "step": 25732 + }, + { + "epoch": 1.597430008070023, + "grad_norm": 0.1463359144024553, + "learning_rate": 5.3057053108169965e-05, + "loss": 2.7647, + "step": 25733 + }, + { + "epoch": 1.597492085169781, + "grad_norm": 0.16075649244173693, + "learning_rate": 5.305344833269201e-05, + "loss": 2.8099, + "step": 25734 + }, + { + "epoch": 1.5975541622695388, + "grad_norm": 0.15457275329891582, + "learning_rate": 5.3049843541283495e-05, + "loss": 2.6782, + "step": 25735 + }, + { + "epoch": 1.5976162393692968, + "grad_norm": 0.15045416715750076, + "learning_rate": 5.304623873396321e-05, + "loss": 2.7969, + "step": 25736 + }, + { + "epoch": 1.5976783164690547, + "grad_norm": 0.16381888240270223, + "learning_rate": 5.3042633910749926e-05, + "loss": 2.802, + "step": 25737 + }, + { + "epoch": 1.5977403935688126, + "grad_norm": 0.16150299667118873, + "learning_rate": 5.3039029071662515e-05, + "loss": 2.8218, + "step": 25738 + }, + { + "epoch": 1.5978024706685705, + "grad_norm": 0.15994102857169903, + "learning_rate": 5.303542421671973e-05, + "loss": 2.7961, + "step": 25739 + }, + { + "epoch": 1.5978645477683284, + "grad_norm": 0.18529737663313728, + "learning_rate": 5.3031819345940414e-05, + "loss": 2.805, + "step": 25740 + }, + { + "epoch": 1.5979266248680861, + "grad_norm": 0.18801841989660753, + "learning_rate": 5.302821445934336e-05, + "loss": 2.8556, + "step": 25741 + }, + { + "epoch": 1.597988701967844, + "grad_norm": 0.1668703627567117, + "learning_rate": 5.302460955694737e-05, + "loss": 2.7723, + "step": 25742 + }, + { + "epoch": 1.598050779067602, + "grad_norm": 0.19299427396490126, + "learning_rate": 5.3021004638771266e-05, + "loss": 2.805, + "step": 25743 + }, + { + "epoch": 1.5981128561673599, + "grad_norm": 0.16005728152600082, + "learning_rate": 5.301739970483384e-05, + "loss": 2.7855, + "step": 25744 + }, + { + "epoch": 1.5981749332671178, + "grad_norm": 0.18084455186314424, + "learning_rate": 5.301379475515391e-05, + "loss": 2.8307, + "step": 25745 + }, + { + "epoch": 1.5982370103668757, + "grad_norm": 0.15034347265634623, + "learning_rate": 5.301018978975028e-05, + "loss": 2.8094, + "step": 25746 + }, + { + "epoch": 1.5982990874666334, + "grad_norm": 0.15493403233636607, + "learning_rate": 5.300658480864176e-05, + "loss": 2.7192, + "step": 25747 + }, + { + "epoch": 1.5983611645663913, + "grad_norm": 0.16253333208378395, + "learning_rate": 5.3002979811847154e-05, + "loss": 2.7339, + "step": 25748 + }, + { + "epoch": 1.5984232416661492, + "grad_norm": 0.15050081452120762, + "learning_rate": 5.299937479938528e-05, + "loss": 2.7388, + "step": 25749 + }, + { + "epoch": 1.5984853187659072, + "grad_norm": 0.15386779493534983, + "learning_rate": 5.299576977127493e-05, + "loss": 2.7301, + "step": 25750 + }, + { + "epoch": 1.598547395865665, + "grad_norm": 0.15242220007990315, + "learning_rate": 5.2992164727534934e-05, + "loss": 2.8553, + "step": 25751 + }, + { + "epoch": 1.598609472965423, + "grad_norm": 0.1669895160751377, + "learning_rate": 5.298855966818408e-05, + "loss": 2.743, + "step": 25752 + }, + { + "epoch": 1.598671550065181, + "grad_norm": 0.15900710175431798, + "learning_rate": 5.2984954593241184e-05, + "loss": 2.7151, + "step": 25753 + }, + { + "epoch": 1.5987336271649388, + "grad_norm": 0.1557253911816327, + "learning_rate": 5.298134950272505e-05, + "loss": 2.8864, + "step": 25754 + }, + { + "epoch": 1.5987957042646967, + "grad_norm": 0.1544439604630232, + "learning_rate": 5.29777443966545e-05, + "loss": 2.8317, + "step": 25755 + }, + { + "epoch": 1.5988577813644547, + "grad_norm": 0.16020136872205723, + "learning_rate": 5.297413927504834e-05, + "loss": 2.8632, + "step": 25756 + }, + { + "epoch": 1.5989198584642126, + "grad_norm": 0.13638143480847012, + "learning_rate": 5.297053413792535e-05, + "loss": 2.7145, + "step": 25757 + }, + { + "epoch": 1.5989819355639705, + "grad_norm": 0.16128354391799785, + "learning_rate": 5.29669289853044e-05, + "loss": 2.8189, + "step": 25758 + }, + { + "epoch": 1.5990440126637284, + "grad_norm": 0.1565118919457478, + "learning_rate": 5.2963323817204226e-05, + "loss": 2.8663, + "step": 25759 + }, + { + "epoch": 1.5991060897634863, + "grad_norm": 0.14708752119849194, + "learning_rate": 5.2959718633643694e-05, + "loss": 2.8074, + "step": 25760 + }, + { + "epoch": 1.5991681668632443, + "grad_norm": 0.14139184219767229, + "learning_rate": 5.2956113434641596e-05, + "loss": 2.8346, + "step": 25761 + }, + { + "epoch": 1.5992302439630022, + "grad_norm": 0.15341857628365124, + "learning_rate": 5.2952508220216715e-05, + "loss": 2.8266, + "step": 25762 + }, + { + "epoch": 1.59929232106276, + "grad_norm": 0.15587357836375593, + "learning_rate": 5.2948902990387896e-05, + "loss": 2.7942, + "step": 25763 + }, + { + "epoch": 1.599354398162518, + "grad_norm": 0.1519313389671756, + "learning_rate": 5.294529774517393e-05, + "loss": 2.8077, + "step": 25764 + }, + { + "epoch": 1.5994164752622757, + "grad_norm": 0.14620529292322576, + "learning_rate": 5.294169248459363e-05, + "loss": 2.8212, + "step": 25765 + }, + { + "epoch": 1.5994785523620336, + "grad_norm": 0.15962058714604743, + "learning_rate": 5.29380872086658e-05, + "loss": 2.8277, + "step": 25766 + }, + { + "epoch": 1.5995406294617915, + "grad_norm": 0.14061874799605892, + "learning_rate": 5.293448191740927e-05, + "loss": 2.7652, + "step": 25767 + }, + { + "epoch": 1.5996027065615495, + "grad_norm": 0.1426895046226989, + "learning_rate": 5.293087661084283e-05, + "loss": 2.7974, + "step": 25768 + }, + { + "epoch": 1.5996647836613074, + "grad_norm": 0.1618681917452604, + "learning_rate": 5.292727128898529e-05, + "loss": 2.8227, + "step": 25769 + }, + { + "epoch": 1.5997268607610653, + "grad_norm": 0.15156047539489387, + "learning_rate": 5.292366595185547e-05, + "loss": 2.7689, + "step": 25770 + }, + { + "epoch": 1.599788937860823, + "grad_norm": 0.16779868809638435, + "learning_rate": 5.2920060599472165e-05, + "loss": 2.7257, + "step": 25771 + }, + { + "epoch": 1.599851014960581, + "grad_norm": 0.15375354932355603, + "learning_rate": 5.291645523185421e-05, + "loss": 2.8309, + "step": 25772 + }, + { + "epoch": 1.5999130920603388, + "grad_norm": 0.18425947586871208, + "learning_rate": 5.2912849849020385e-05, + "loss": 2.7759, + "step": 25773 + }, + { + "epoch": 1.5999751691600967, + "grad_norm": 0.19237605436431487, + "learning_rate": 5.2909244450989525e-05, + "loss": 2.8802, + "step": 25774 + }, + { + "epoch": 1.6000372462598547, + "grad_norm": 0.15343783060026583, + "learning_rate": 5.290563903778043e-05, + "loss": 2.787, + "step": 25775 + }, + { + "epoch": 1.6000993233596126, + "grad_norm": 0.17666929884533936, + "learning_rate": 5.2902033609411914e-05, + "loss": 2.7695, + "step": 25776 + }, + { + "epoch": 1.6001614004593705, + "grad_norm": 0.1510381568029529, + "learning_rate": 5.289842816590278e-05, + "loss": 2.7686, + "step": 25777 + }, + { + "epoch": 1.6002234775591284, + "grad_norm": 0.20480677500113306, + "learning_rate": 5.289482270727184e-05, + "loss": 2.9092, + "step": 25778 + }, + { + "epoch": 1.6002855546588863, + "grad_norm": 0.15172728497809645, + "learning_rate": 5.289121723353791e-05, + "loss": 2.7785, + "step": 25779 + }, + { + "epoch": 1.6003476317586443, + "grad_norm": 0.1586960777182011, + "learning_rate": 5.28876117447198e-05, + "loss": 2.6532, + "step": 25780 + }, + { + "epoch": 1.6004097088584022, + "grad_norm": 0.15405650066239424, + "learning_rate": 5.288400624083632e-05, + "loss": 2.7478, + "step": 25781 + }, + { + "epoch": 1.60047178595816, + "grad_norm": 0.15707658639903918, + "learning_rate": 5.2880400721906275e-05, + "loss": 2.7519, + "step": 25782 + }, + { + "epoch": 1.600533863057918, + "grad_norm": 0.15971625178403562, + "learning_rate": 5.287679518794848e-05, + "loss": 2.7288, + "step": 25783 + }, + { + "epoch": 1.600595940157676, + "grad_norm": 0.1656081528462428, + "learning_rate": 5.2873189638981744e-05, + "loss": 2.7881, + "step": 25784 + }, + { + "epoch": 1.6006580172574338, + "grad_norm": 0.1592426058686016, + "learning_rate": 5.286958407502489e-05, + "loss": 2.7976, + "step": 25785 + }, + { + "epoch": 1.6007200943571918, + "grad_norm": 0.1568177814829564, + "learning_rate": 5.286597849609671e-05, + "loss": 2.8614, + "step": 25786 + }, + { + "epoch": 1.6007821714569497, + "grad_norm": 0.1574999345957151, + "learning_rate": 5.286237290221602e-05, + "loss": 2.8184, + "step": 25787 + }, + { + "epoch": 1.6008442485567076, + "grad_norm": 0.14451568107261564, + "learning_rate": 5.285876729340165e-05, + "loss": 2.7639, + "step": 25788 + }, + { + "epoch": 1.6009063256564653, + "grad_norm": 0.16600953820233857, + "learning_rate": 5.285516166967238e-05, + "loss": 2.829, + "step": 25789 + }, + { + "epoch": 1.6009684027562232, + "grad_norm": 0.14623671258369905, + "learning_rate": 5.285155603104706e-05, + "loss": 2.8493, + "step": 25790 + }, + { + "epoch": 1.6010304798559811, + "grad_norm": 0.1530585453769479, + "learning_rate": 5.284795037754446e-05, + "loss": 2.8141, + "step": 25791 + }, + { + "epoch": 1.601092556955739, + "grad_norm": 0.15545308019752635, + "learning_rate": 5.284434470918342e-05, + "loss": 2.8003, + "step": 25792 + }, + { + "epoch": 1.601154634055497, + "grad_norm": 0.1534173257621243, + "learning_rate": 5.2840739025982746e-05, + "loss": 2.7392, + "step": 25793 + }, + { + "epoch": 1.6012167111552549, + "grad_norm": 0.15834142385588176, + "learning_rate": 5.283713332796124e-05, + "loss": 2.7701, + "step": 25794 + }, + { + "epoch": 1.6012787882550126, + "grad_norm": 0.16242262437802607, + "learning_rate": 5.2833527615137724e-05, + "loss": 2.7772, + "step": 25795 + }, + { + "epoch": 1.6013408653547705, + "grad_norm": 0.15531012863329102, + "learning_rate": 5.2829921887531007e-05, + "loss": 2.7534, + "step": 25796 + }, + { + "epoch": 1.6014029424545284, + "grad_norm": 0.16358884489799794, + "learning_rate": 5.2826316145159915e-05, + "loss": 2.8214, + "step": 25797 + }, + { + "epoch": 1.6014650195542863, + "grad_norm": 0.15660887869769866, + "learning_rate": 5.2822710388043214e-05, + "loss": 2.8489, + "step": 25798 + }, + { + "epoch": 1.6015270966540442, + "grad_norm": 0.15589910086975958, + "learning_rate": 5.2819104616199774e-05, + "loss": 2.7597, + "step": 25799 + }, + { + "epoch": 1.6015891737538022, + "grad_norm": 0.16166420214739202, + "learning_rate": 5.2815498829648366e-05, + "loss": 2.8171, + "step": 25800 + }, + { + "epoch": 1.60165125085356, + "grad_norm": 0.1377028740504333, + "learning_rate": 5.2811893028407834e-05, + "loss": 2.7363, + "step": 25801 + }, + { + "epoch": 1.601713327953318, + "grad_norm": 0.14726011230512132, + "learning_rate": 5.2808287212496956e-05, + "loss": 2.7818, + "step": 25802 + }, + { + "epoch": 1.601775405053076, + "grad_norm": 0.14594031706956173, + "learning_rate": 5.280468138193456e-05, + "loss": 2.7566, + "step": 25803 + }, + { + "epoch": 1.6018374821528338, + "grad_norm": 0.14348930375013674, + "learning_rate": 5.280107553673947e-05, + "loss": 2.9405, + "step": 25804 + }, + { + "epoch": 1.6018995592525918, + "grad_norm": 0.1591652588653053, + "learning_rate": 5.279746967693049e-05, + "loss": 2.812, + "step": 25805 + }, + { + "epoch": 1.6019616363523497, + "grad_norm": 0.13987640633570675, + "learning_rate": 5.2793863802526436e-05, + "loss": 2.835, + "step": 25806 + }, + { + "epoch": 1.6020237134521076, + "grad_norm": 0.1470328363035047, + "learning_rate": 5.279025791354609e-05, + "loss": 2.7631, + "step": 25807 + }, + { + "epoch": 1.6020857905518655, + "grad_norm": 0.15092582343529137, + "learning_rate": 5.278665201000832e-05, + "loss": 2.7844, + "step": 25808 + }, + { + "epoch": 1.6021478676516234, + "grad_norm": 0.16343857564262199, + "learning_rate": 5.27830460919319e-05, + "loss": 2.7724, + "step": 25809 + }, + { + "epoch": 1.6022099447513813, + "grad_norm": 0.14561399643383116, + "learning_rate": 5.277944015933566e-05, + "loss": 2.7259, + "step": 25810 + }, + { + "epoch": 1.6022720218511393, + "grad_norm": 0.17332412218605858, + "learning_rate": 5.2775834212238386e-05, + "loss": 2.8081, + "step": 25811 + }, + { + "epoch": 1.6023340989508972, + "grad_norm": 0.14557351158875817, + "learning_rate": 5.277222825065893e-05, + "loss": 2.7557, + "step": 25812 + }, + { + "epoch": 1.6023961760506549, + "grad_norm": 0.14523238798878357, + "learning_rate": 5.276862227461607e-05, + "loss": 2.7277, + "step": 25813 + }, + { + "epoch": 1.6024582531504128, + "grad_norm": 0.1470141938275996, + "learning_rate": 5.2765016284128655e-05, + "loss": 2.83, + "step": 25814 + }, + { + "epoch": 1.6025203302501707, + "grad_norm": 0.1465934855682974, + "learning_rate": 5.276141027921549e-05, + "loss": 2.8058, + "step": 25815 + }, + { + "epoch": 1.6025824073499286, + "grad_norm": 0.14467493209674126, + "learning_rate": 5.275780425989534e-05, + "loss": 2.8017, + "step": 25816 + }, + { + "epoch": 1.6026444844496865, + "grad_norm": 0.1528149921315436, + "learning_rate": 5.275419822618708e-05, + "loss": 2.8679, + "step": 25817 + }, + { + "epoch": 1.6027065615494445, + "grad_norm": 0.14072193010554498, + "learning_rate": 5.275059217810948e-05, + "loss": 2.7501, + "step": 25818 + }, + { + "epoch": 1.6027686386492022, + "grad_norm": 0.1504621335730829, + "learning_rate": 5.274698611568139e-05, + "loss": 2.7619, + "step": 25819 + }, + { + "epoch": 1.60283071574896, + "grad_norm": 0.14522201838467286, + "learning_rate": 5.274338003892158e-05, + "loss": 2.7743, + "step": 25820 + }, + { + "epoch": 1.602892792848718, + "grad_norm": 0.15018067629651902, + "learning_rate": 5.273977394784892e-05, + "loss": 2.712, + "step": 25821 + }, + { + "epoch": 1.602954869948476, + "grad_norm": 0.1552135423985178, + "learning_rate": 5.27361678424822e-05, + "loss": 2.8704, + "step": 25822 + }, + { + "epoch": 1.6030169470482338, + "grad_norm": 0.1467931417870591, + "learning_rate": 5.2732561722840215e-05, + "loss": 2.8787, + "step": 25823 + }, + { + "epoch": 1.6030790241479917, + "grad_norm": 0.16033100851380785, + "learning_rate": 5.2728955588941784e-05, + "loss": 2.8432, + "step": 25824 + }, + { + "epoch": 1.6031411012477497, + "grad_norm": 0.15661306519296386, + "learning_rate": 5.272534944080574e-05, + "loss": 2.8581, + "step": 25825 + }, + { + "epoch": 1.6032031783475076, + "grad_norm": 0.1710085105636021, + "learning_rate": 5.2721743278450885e-05, + "loss": 2.7506, + "step": 25826 + }, + { + "epoch": 1.6032652554472655, + "grad_norm": 0.15007628994266123, + "learning_rate": 5.2718137101896024e-05, + "loss": 2.8065, + "step": 25827 + }, + { + "epoch": 1.6033273325470234, + "grad_norm": 0.14769731611553966, + "learning_rate": 5.2714530911159996e-05, + "loss": 2.6831, + "step": 25828 + }, + { + "epoch": 1.6033894096467813, + "grad_norm": 0.1459638183538949, + "learning_rate": 5.271092470626159e-05, + "loss": 2.7916, + "step": 25829 + }, + { + "epoch": 1.6034514867465393, + "grad_norm": 0.13710153671421785, + "learning_rate": 5.2707318487219645e-05, + "loss": 2.7919, + "step": 25830 + }, + { + "epoch": 1.6035135638462972, + "grad_norm": 0.14924960965715733, + "learning_rate": 5.270371225405295e-05, + "loss": 2.8228, + "step": 25831 + }, + { + "epoch": 1.603575640946055, + "grad_norm": 0.17985604508477074, + "learning_rate": 5.270010600678034e-05, + "loss": 2.7971, + "step": 25832 + }, + { + "epoch": 1.603637718045813, + "grad_norm": 0.17387843743333214, + "learning_rate": 5.269649974542062e-05, + "loss": 2.808, + "step": 25833 + }, + { + "epoch": 1.603699795145571, + "grad_norm": 0.18140114615240202, + "learning_rate": 5.26928934699926e-05, + "loss": 2.7881, + "step": 25834 + }, + { + "epoch": 1.6037618722453288, + "grad_norm": 0.18463652296142863, + "learning_rate": 5.26892871805151e-05, + "loss": 2.8898, + "step": 25835 + }, + { + "epoch": 1.6038239493450868, + "grad_norm": 0.16497773379667743, + "learning_rate": 5.268568087700694e-05, + "loss": 2.7575, + "step": 25836 + }, + { + "epoch": 1.6038860264448445, + "grad_norm": 0.15253353524054378, + "learning_rate": 5.2682074559486936e-05, + "loss": 2.8163, + "step": 25837 + }, + { + "epoch": 1.6039481035446024, + "grad_norm": 0.18351782588361656, + "learning_rate": 5.267846822797389e-05, + "loss": 2.8809, + "step": 25838 + }, + { + "epoch": 1.6040101806443603, + "grad_norm": 0.16619780025719183, + "learning_rate": 5.267486188248663e-05, + "loss": 2.7964, + "step": 25839 + }, + { + "epoch": 1.6040722577441182, + "grad_norm": 0.19414534879651887, + "learning_rate": 5.267125552304397e-05, + "loss": 2.8361, + "step": 25840 + }, + { + "epoch": 1.6041343348438761, + "grad_norm": 0.1687903603219386, + "learning_rate": 5.266764914966471e-05, + "loss": 2.7664, + "step": 25841 + }, + { + "epoch": 1.604196411943634, + "grad_norm": 0.1860684542755558, + "learning_rate": 5.266404276236769e-05, + "loss": 2.7478, + "step": 25842 + }, + { + "epoch": 1.6042584890433917, + "grad_norm": 0.1602347693212931, + "learning_rate": 5.2660436361171704e-05, + "loss": 2.7519, + "step": 25843 + }, + { + "epoch": 1.6043205661431497, + "grad_norm": 0.15450302066751234, + "learning_rate": 5.265682994609558e-05, + "loss": 2.8702, + "step": 25844 + }, + { + "epoch": 1.6043826432429076, + "grad_norm": 0.15167566061753154, + "learning_rate": 5.265322351715812e-05, + "loss": 2.6937, + "step": 25845 + }, + { + "epoch": 1.6044447203426655, + "grad_norm": 0.14884391785979992, + "learning_rate": 5.2649617074378164e-05, + "loss": 2.7663, + "step": 25846 + }, + { + "epoch": 1.6045067974424234, + "grad_norm": 0.16408418898042923, + "learning_rate": 5.26460106177745e-05, + "loss": 2.7887, + "step": 25847 + }, + { + "epoch": 1.6045688745421813, + "grad_norm": 0.18211894481099594, + "learning_rate": 5.2642404147365966e-05, + "loss": 2.8753, + "step": 25848 + }, + { + "epoch": 1.6046309516419393, + "grad_norm": 0.14679050246979045, + "learning_rate": 5.263879766317135e-05, + "loss": 2.8561, + "step": 25849 + }, + { + "epoch": 1.6046930287416972, + "grad_norm": 0.15283658322511484, + "learning_rate": 5.263519116520951e-05, + "loss": 2.8306, + "step": 25850 + }, + { + "epoch": 1.604755105841455, + "grad_norm": 0.20207618490992812, + "learning_rate": 5.263158465349922e-05, + "loss": 2.8351, + "step": 25851 + }, + { + "epoch": 1.604817182941213, + "grad_norm": 0.15595173381868666, + "learning_rate": 5.2627978128059304e-05, + "loss": 2.8, + "step": 25852 + }, + { + "epoch": 1.604879260040971, + "grad_norm": 0.1762989262721275, + "learning_rate": 5.262437158890862e-05, + "loss": 2.8015, + "step": 25853 + }, + { + "epoch": 1.6049413371407288, + "grad_norm": 0.1575215279818588, + "learning_rate": 5.2620765036065925e-05, + "loss": 2.8972, + "step": 25854 + }, + { + "epoch": 1.6050034142404868, + "grad_norm": 0.14988879288550125, + "learning_rate": 5.261715846955008e-05, + "loss": 2.6891, + "step": 25855 + }, + { + "epoch": 1.6050654913402447, + "grad_norm": 0.1507289721727819, + "learning_rate": 5.261355188937988e-05, + "loss": 2.7521, + "step": 25856 + }, + { + "epoch": 1.6051275684400026, + "grad_norm": 0.144268712453941, + "learning_rate": 5.2609945295574135e-05, + "loss": 2.7538, + "step": 25857 + }, + { + "epoch": 1.6051896455397605, + "grad_norm": 0.15312004373275498, + "learning_rate": 5.260633868815169e-05, + "loss": 2.7881, + "step": 25858 + }, + { + "epoch": 1.6052517226395184, + "grad_norm": 0.1545224713473167, + "learning_rate": 5.260273206713133e-05, + "loss": 2.8256, + "step": 25859 + }, + { + "epoch": 1.6053137997392761, + "grad_norm": 0.16515474384635212, + "learning_rate": 5.259912543253189e-05, + "loss": 2.8828, + "step": 25860 + }, + { + "epoch": 1.605375876839034, + "grad_norm": 0.14843117549874613, + "learning_rate": 5.259551878437218e-05, + "loss": 2.8428, + "step": 25861 + }, + { + "epoch": 1.605437953938792, + "grad_norm": 0.1449006810589726, + "learning_rate": 5.2591912122671025e-05, + "loss": 2.8286, + "step": 25862 + }, + { + "epoch": 1.6055000310385499, + "grad_norm": 0.19535212114906791, + "learning_rate": 5.2588305447447214e-05, + "loss": 2.7614, + "step": 25863 + }, + { + "epoch": 1.6055621081383078, + "grad_norm": 0.18336786415463158, + "learning_rate": 5.258469875871961e-05, + "loss": 2.8434, + "step": 25864 + }, + { + "epoch": 1.6056241852380657, + "grad_norm": 0.1459653751883139, + "learning_rate": 5.2581092056507e-05, + "loss": 2.7715, + "step": 25865 + }, + { + "epoch": 1.6056862623378234, + "grad_norm": 0.17129747942054352, + "learning_rate": 5.2577485340828204e-05, + "loss": 2.803, + "step": 25866 + }, + { + "epoch": 1.6057483394375813, + "grad_norm": 0.1465802260560277, + "learning_rate": 5.257387861170203e-05, + "loss": 2.7975, + "step": 25867 + }, + { + "epoch": 1.6058104165373392, + "grad_norm": 0.16828367186373558, + "learning_rate": 5.2570271869147305e-05, + "loss": 2.7186, + "step": 25868 + }, + { + "epoch": 1.6058724936370972, + "grad_norm": 0.1611170847759762, + "learning_rate": 5.2566665113182864e-05, + "loss": 2.8764, + "step": 25869 + }, + { + "epoch": 1.605934570736855, + "grad_norm": 0.15154656272350214, + "learning_rate": 5.2563058343827495e-05, + "loss": 2.7111, + "step": 25870 + }, + { + "epoch": 1.605996647836613, + "grad_norm": 0.15596063761402107, + "learning_rate": 5.2559451561100045e-05, + "loss": 2.7893, + "step": 25871 + }, + { + "epoch": 1.606058724936371, + "grad_norm": 0.15741795092441346, + "learning_rate": 5.2555844765019304e-05, + "loss": 2.8266, + "step": 25872 + }, + { + "epoch": 1.6061208020361288, + "grad_norm": 0.14854705682616795, + "learning_rate": 5.25522379556041e-05, + "loss": 2.814, + "step": 25873 + }, + { + "epoch": 1.6061828791358868, + "grad_norm": 0.14464275466382973, + "learning_rate": 5.254863113287325e-05, + "loss": 2.7619, + "step": 25874 + }, + { + "epoch": 1.6062449562356447, + "grad_norm": 0.14957253772019796, + "learning_rate": 5.254502429684558e-05, + "loss": 2.9019, + "step": 25875 + }, + { + "epoch": 1.6063070333354026, + "grad_norm": 0.15035332799376924, + "learning_rate": 5.25414174475399e-05, + "loss": 2.7921, + "step": 25876 + }, + { + "epoch": 1.6063691104351605, + "grad_norm": 0.15194528995897985, + "learning_rate": 5.2537810584975025e-05, + "loss": 2.8062, + "step": 25877 + }, + { + "epoch": 1.6064311875349184, + "grad_norm": 0.16103365766153002, + "learning_rate": 5.2534203709169774e-05, + "loss": 2.8197, + "step": 25878 + }, + { + "epoch": 1.6064932646346763, + "grad_norm": 0.14841304125803195, + "learning_rate": 5.253059682014296e-05, + "loss": 2.8715, + "step": 25879 + }, + { + "epoch": 1.6065553417344343, + "grad_norm": 0.1803740983928583, + "learning_rate": 5.252698991791343e-05, + "loss": 2.7961, + "step": 25880 + }, + { + "epoch": 1.6066174188341922, + "grad_norm": 0.14182174926830243, + "learning_rate": 5.2523383002499946e-05, + "loss": 2.7244, + "step": 25881 + }, + { + "epoch": 1.60667949593395, + "grad_norm": 0.15175287884286343, + "learning_rate": 5.251977607392139e-05, + "loss": 2.8034, + "step": 25882 + }, + { + "epoch": 1.606741573033708, + "grad_norm": 0.14122488630949978, + "learning_rate": 5.251616913219653e-05, + "loss": 2.7298, + "step": 25883 + }, + { + "epoch": 1.6068036501334657, + "grad_norm": 0.15350117312054506, + "learning_rate": 5.251256217734423e-05, + "loss": 2.7525, + "step": 25884 + }, + { + "epoch": 1.6068657272332236, + "grad_norm": 0.14498225288470168, + "learning_rate": 5.2508955209383257e-05, + "loss": 2.7332, + "step": 25885 + }, + { + "epoch": 1.6069278043329815, + "grad_norm": 0.19689857213123527, + "learning_rate": 5.250534822833246e-05, + "loss": 2.8591, + "step": 25886 + }, + { + "epoch": 1.6069898814327395, + "grad_norm": 0.1460909646834598, + "learning_rate": 5.250174123421068e-05, + "loss": 2.7499, + "step": 25887 + }, + { + "epoch": 1.6070519585324974, + "grad_norm": 0.14186291746121296, + "learning_rate": 5.2498134227036676e-05, + "loss": 2.7849, + "step": 25888 + }, + { + "epoch": 1.6071140356322553, + "grad_norm": 0.16361486584671442, + "learning_rate": 5.249452720682932e-05, + "loss": 2.8298, + "step": 25889 + }, + { + "epoch": 1.607176112732013, + "grad_norm": 0.14647025358162039, + "learning_rate": 5.2490920173607396e-05, + "loss": 2.748, + "step": 25890 + }, + { + "epoch": 1.607238189831771, + "grad_norm": 0.14401625045999794, + "learning_rate": 5.248731312738975e-05, + "loss": 2.8406, + "step": 25891 + }, + { + "epoch": 1.6073002669315288, + "grad_norm": 0.15361894086632452, + "learning_rate": 5.248370606819517e-05, + "loss": 2.827, + "step": 25892 + }, + { + "epoch": 1.6073623440312867, + "grad_norm": 0.15343585186323705, + "learning_rate": 5.2480098996042516e-05, + "loss": 2.8082, + "step": 25893 + }, + { + "epoch": 1.6074244211310447, + "grad_norm": 0.14060595833817272, + "learning_rate": 5.2476491910950575e-05, + "loss": 2.7232, + "step": 25894 + }, + { + "epoch": 1.6074864982308026, + "grad_norm": 0.16287810394135618, + "learning_rate": 5.2472884812938163e-05, + "loss": 2.8499, + "step": 25895 + }, + { + "epoch": 1.6075485753305605, + "grad_norm": 0.150978931739052, + "learning_rate": 5.2469277702024125e-05, + "loss": 2.8026, + "step": 25896 + }, + { + "epoch": 1.6076106524303184, + "grad_norm": 0.22309984389912638, + "learning_rate": 5.246567057822726e-05, + "loss": 2.8086, + "step": 25897 + }, + { + "epoch": 1.6076727295300763, + "grad_norm": 0.14781650722265632, + "learning_rate": 5.246206344156639e-05, + "loss": 2.7445, + "step": 25898 + }, + { + "epoch": 1.6077348066298343, + "grad_norm": 0.16535074383852688, + "learning_rate": 5.2458456292060334e-05, + "loss": 2.8064, + "step": 25899 + }, + { + "epoch": 1.6077968837295922, + "grad_norm": 0.14721415250057526, + "learning_rate": 5.2454849129727926e-05, + "loss": 2.897, + "step": 25900 + }, + { + "epoch": 1.60785896082935, + "grad_norm": 0.1499150236650508, + "learning_rate": 5.245124195458796e-05, + "loss": 2.8327, + "step": 25901 + }, + { + "epoch": 1.607921037929108, + "grad_norm": 0.17750572336398893, + "learning_rate": 5.2447634766659274e-05, + "loss": 2.9031, + "step": 25902 + }, + { + "epoch": 1.607983115028866, + "grad_norm": 0.15943365373461948, + "learning_rate": 5.24440275659607e-05, + "loss": 2.8337, + "step": 25903 + }, + { + "epoch": 1.6080451921286238, + "grad_norm": 0.1842332011824329, + "learning_rate": 5.244042035251101e-05, + "loss": 2.8049, + "step": 25904 + }, + { + "epoch": 1.6081072692283818, + "grad_norm": 0.17624594934861257, + "learning_rate": 5.243681312632909e-05, + "loss": 2.8317, + "step": 25905 + }, + { + "epoch": 1.6081693463281397, + "grad_norm": 0.15260993493107186, + "learning_rate": 5.2433205887433686e-05, + "loss": 2.9001, + "step": 25906 + }, + { + "epoch": 1.6082314234278976, + "grad_norm": 0.15046335913804326, + "learning_rate": 5.242959863584369e-05, + "loss": 2.8373, + "step": 25907 + }, + { + "epoch": 1.6082935005276553, + "grad_norm": 0.15670954954918212, + "learning_rate": 5.242599137157787e-05, + "loss": 2.7838, + "step": 25908 + }, + { + "epoch": 1.6083555776274132, + "grad_norm": 0.14740822336154985, + "learning_rate": 5.2422384094655076e-05, + "loss": 2.8315, + "step": 25909 + }, + { + "epoch": 1.6084176547271711, + "grad_norm": 0.15633363869402708, + "learning_rate": 5.2418776805094104e-05, + "loss": 2.7864, + "step": 25910 + }, + { + "epoch": 1.608479731826929, + "grad_norm": 0.18895784508019245, + "learning_rate": 5.24151695029138e-05, + "loss": 2.7981, + "step": 25911 + }, + { + "epoch": 1.608541808926687, + "grad_norm": 0.18513157193188845, + "learning_rate": 5.241156218813296e-05, + "loss": 2.8459, + "step": 25912 + }, + { + "epoch": 1.6086038860264449, + "grad_norm": 0.17540745728738358, + "learning_rate": 5.240795486077042e-05, + "loss": 2.8128, + "step": 25913 + }, + { + "epoch": 1.6086659631262026, + "grad_norm": 0.18551434683522083, + "learning_rate": 5.240434752084499e-05, + "loss": 2.7434, + "step": 25914 + }, + { + "epoch": 1.6087280402259605, + "grad_norm": 0.16399019782572277, + "learning_rate": 5.24007401683755e-05, + "loss": 2.8001, + "step": 25915 + }, + { + "epoch": 1.6087901173257184, + "grad_norm": 0.14984080514609607, + "learning_rate": 5.239713280338075e-05, + "loss": 2.8047, + "step": 25916 + }, + { + "epoch": 1.6088521944254763, + "grad_norm": 0.14692194751411944, + "learning_rate": 5.239352542587959e-05, + "loss": 2.712, + "step": 25917 + }, + { + "epoch": 1.6089142715252343, + "grad_norm": 0.15746694752554302, + "learning_rate": 5.2389918035890826e-05, + "loss": 2.801, + "step": 25918 + }, + { + "epoch": 1.6089763486249922, + "grad_norm": 0.1614628331008556, + "learning_rate": 5.238631063343328e-05, + "loss": 2.757, + "step": 25919 + }, + { + "epoch": 1.60903842572475, + "grad_norm": 0.1563897223414626, + "learning_rate": 5.238270321852576e-05, + "loss": 2.736, + "step": 25920 + }, + { + "epoch": 1.609100502824508, + "grad_norm": 0.16061145360762993, + "learning_rate": 5.2379095791187124e-05, + "loss": 2.7353, + "step": 25921 + }, + { + "epoch": 1.609162579924266, + "grad_norm": 0.1475993637222439, + "learning_rate": 5.237548835143614e-05, + "loss": 2.7748, + "step": 25922 + }, + { + "epoch": 1.6092246570240238, + "grad_norm": 0.15920291918330284, + "learning_rate": 5.237188089929168e-05, + "loss": 2.7895, + "step": 25923 + }, + { + "epoch": 1.6092867341237818, + "grad_norm": 0.15884561528707888, + "learning_rate": 5.236827343477253e-05, + "loss": 2.9135, + "step": 25924 + }, + { + "epoch": 1.6093488112235397, + "grad_norm": 0.15457328351038102, + "learning_rate": 5.236466595789753e-05, + "loss": 2.7862, + "step": 25925 + }, + { + "epoch": 1.6094108883232976, + "grad_norm": 0.14819396268668378, + "learning_rate": 5.236105846868549e-05, + "loss": 2.6703, + "step": 25926 + }, + { + "epoch": 1.6094729654230555, + "grad_norm": 0.15679848486240677, + "learning_rate": 5.235745096715523e-05, + "loss": 2.8691, + "step": 25927 + }, + { + "epoch": 1.6095350425228134, + "grad_norm": 0.1482701299113424, + "learning_rate": 5.2353843453325566e-05, + "loss": 2.7885, + "step": 25928 + }, + { + "epoch": 1.6095971196225713, + "grad_norm": 0.1486827743812768, + "learning_rate": 5.235023592721535e-05, + "loss": 2.8935, + "step": 25929 + }, + { + "epoch": 1.6096591967223293, + "grad_norm": 0.14786772167234083, + "learning_rate": 5.234662838884338e-05, + "loss": 2.8635, + "step": 25930 + }, + { + "epoch": 1.6097212738220872, + "grad_norm": 0.16229738111151715, + "learning_rate": 5.234302083822847e-05, + "loss": 2.8276, + "step": 25931 + }, + { + "epoch": 1.6097833509218449, + "grad_norm": 0.14526358211738954, + "learning_rate": 5.233941327538947e-05, + "loss": 2.7928, + "step": 25932 + }, + { + "epoch": 1.6098454280216028, + "grad_norm": 0.146507045263164, + "learning_rate": 5.2335805700345165e-05, + "loss": 2.8438, + "step": 25933 + }, + { + "epoch": 1.6099075051213607, + "grad_norm": 0.16533620032581742, + "learning_rate": 5.2332198113114394e-05, + "loss": 2.7699, + "step": 25934 + }, + { + "epoch": 1.6099695822211186, + "grad_norm": 0.14384803512948063, + "learning_rate": 5.232859051371597e-05, + "loss": 2.9019, + "step": 25935 + }, + { + "epoch": 1.6100316593208766, + "grad_norm": 0.1438074935089718, + "learning_rate": 5.232498290216875e-05, + "loss": 2.6675, + "step": 25936 + }, + { + "epoch": 1.6100937364206345, + "grad_norm": 0.14016750542473455, + "learning_rate": 5.232137527849151e-05, + "loss": 2.8127, + "step": 25937 + }, + { + "epoch": 1.6101558135203922, + "grad_norm": 0.14080404433671995, + "learning_rate": 5.231776764270311e-05, + "loss": 2.7322, + "step": 25938 + }, + { + "epoch": 1.61021789062015, + "grad_norm": 0.1394924346165323, + "learning_rate": 5.2314159994822344e-05, + "loss": 2.7479, + "step": 25939 + }, + { + "epoch": 1.610279967719908, + "grad_norm": 0.14262258150112989, + "learning_rate": 5.2310552334868034e-05, + "loss": 2.7925, + "step": 25940 + }, + { + "epoch": 1.610342044819666, + "grad_norm": 0.18144672052434563, + "learning_rate": 5.230694466285902e-05, + "loss": 2.8571, + "step": 25941 + }, + { + "epoch": 1.6104041219194238, + "grad_norm": 0.15647009011600024, + "learning_rate": 5.230333697881412e-05, + "loss": 2.8331, + "step": 25942 + }, + { + "epoch": 1.6104661990191818, + "grad_norm": 0.1641911615232795, + "learning_rate": 5.229972928275215e-05, + "loss": 2.7536, + "step": 25943 + }, + { + "epoch": 1.6105282761189397, + "grad_norm": 0.15742686644750614, + "learning_rate": 5.2296121574691925e-05, + "loss": 2.8078, + "step": 25944 + }, + { + "epoch": 1.6105903532186976, + "grad_norm": 0.15700886742415457, + "learning_rate": 5.229251385465229e-05, + "loss": 2.8161, + "step": 25945 + }, + { + "epoch": 1.6106524303184555, + "grad_norm": 0.15101860270398867, + "learning_rate": 5.228890612265205e-05, + "loss": 2.789, + "step": 25946 + }, + { + "epoch": 1.6107145074182134, + "grad_norm": 0.14624878694365417, + "learning_rate": 5.228529837871004e-05, + "loss": 2.7993, + "step": 25947 + }, + { + "epoch": 1.6107765845179713, + "grad_norm": 0.14563902756660346, + "learning_rate": 5.228169062284505e-05, + "loss": 2.7477, + "step": 25948 + }, + { + "epoch": 1.6108386616177293, + "grad_norm": 0.13984488294984299, + "learning_rate": 5.227808285507594e-05, + "loss": 2.776, + "step": 25949 + }, + { + "epoch": 1.6109007387174872, + "grad_norm": 0.13898964859562274, + "learning_rate": 5.2274475075421515e-05, + "loss": 2.8538, + "step": 25950 + }, + { + "epoch": 1.610962815817245, + "grad_norm": 0.14435962849072545, + "learning_rate": 5.227086728390059e-05, + "loss": 2.8015, + "step": 25951 + }, + { + "epoch": 1.611024892917003, + "grad_norm": 0.14833534640739246, + "learning_rate": 5.226725948053202e-05, + "loss": 2.7668, + "step": 25952 + }, + { + "epoch": 1.611086970016761, + "grad_norm": 0.13935628087449364, + "learning_rate": 5.226365166533459e-05, + "loss": 2.8174, + "step": 25953 + }, + { + "epoch": 1.6111490471165189, + "grad_norm": 0.14356977033203966, + "learning_rate": 5.226004383832716e-05, + "loss": 2.7724, + "step": 25954 + }, + { + "epoch": 1.6112111242162768, + "grad_norm": 0.17703928586628348, + "learning_rate": 5.225643599952851e-05, + "loss": 2.7637, + "step": 25955 + }, + { + "epoch": 1.6112732013160345, + "grad_norm": 0.14743545135143638, + "learning_rate": 5.22528281489575e-05, + "loss": 2.6905, + "step": 25956 + }, + { + "epoch": 1.6113352784157924, + "grad_norm": 0.14170905012731608, + "learning_rate": 5.224922028663295e-05, + "loss": 2.8302, + "step": 25957 + }, + { + "epoch": 1.6113973555155503, + "grad_norm": 0.15854515966192564, + "learning_rate": 5.2245612412573654e-05, + "loss": 2.8262, + "step": 25958 + }, + { + "epoch": 1.6114594326153082, + "grad_norm": 0.15027927016502707, + "learning_rate": 5.2242004526798464e-05, + "loss": 2.8982, + "step": 25959 + }, + { + "epoch": 1.6115215097150661, + "grad_norm": 0.18273695419989874, + "learning_rate": 5.223839662932617e-05, + "loss": 2.7468, + "step": 25960 + }, + { + "epoch": 1.611583586814824, + "grad_norm": 0.15628057419146998, + "learning_rate": 5.2234788720175646e-05, + "loss": 2.7857, + "step": 25961 + }, + { + "epoch": 1.6116456639145817, + "grad_norm": 0.20224358903852607, + "learning_rate": 5.223118079936567e-05, + "loss": 2.7496, + "step": 25962 + }, + { + "epoch": 1.6117077410143397, + "grad_norm": 0.16740292235815105, + "learning_rate": 5.222757286691508e-05, + "loss": 2.8659, + "step": 25963 + }, + { + "epoch": 1.6117698181140976, + "grad_norm": 0.15970914350626436, + "learning_rate": 5.2223964922842696e-05, + "loss": 2.8752, + "step": 25964 + }, + { + "epoch": 1.6118318952138555, + "grad_norm": 0.15364125549371702, + "learning_rate": 5.222035696716736e-05, + "loss": 2.8263, + "step": 25965 + }, + { + "epoch": 1.6118939723136134, + "grad_norm": 0.15242673985408173, + "learning_rate": 5.221674899990788e-05, + "loss": 2.7738, + "step": 25966 + }, + { + "epoch": 1.6119560494133713, + "grad_norm": 0.15730930899421244, + "learning_rate": 5.221314102108308e-05, + "loss": 2.7452, + "step": 25967 + }, + { + "epoch": 1.6120181265131293, + "grad_norm": 0.17768080396577787, + "learning_rate": 5.220953303071179e-05, + "loss": 2.8181, + "step": 25968 + }, + { + "epoch": 1.6120802036128872, + "grad_norm": 0.14751671374112854, + "learning_rate": 5.220592502881282e-05, + "loss": 2.8415, + "step": 25969 + }, + { + "epoch": 1.612142280712645, + "grad_norm": 0.15265810152256581, + "learning_rate": 5.220231701540502e-05, + "loss": 2.7889, + "step": 25970 + }, + { + "epoch": 1.612204357812403, + "grad_norm": 0.13812311823887877, + "learning_rate": 5.219870899050718e-05, + "loss": 2.7381, + "step": 25971 + }, + { + "epoch": 1.612266434912161, + "grad_norm": 0.1421707748700047, + "learning_rate": 5.219510095413817e-05, + "loss": 2.6905, + "step": 25972 + }, + { + "epoch": 1.6123285120119188, + "grad_norm": 0.14898626631369924, + "learning_rate": 5.219149290631675e-05, + "loss": 2.7598, + "step": 25973 + }, + { + "epoch": 1.6123905891116768, + "grad_norm": 0.14886669174868522, + "learning_rate": 5.2187884847061806e-05, + "loss": 2.7131, + "step": 25974 + }, + { + "epoch": 1.6124526662114347, + "grad_norm": 0.15520319159416168, + "learning_rate": 5.218427677639213e-05, + "loss": 2.8595, + "step": 25975 + }, + { + "epoch": 1.6125147433111926, + "grad_norm": 0.1464836947107413, + "learning_rate": 5.218066869432654e-05, + "loss": 2.7689, + "step": 25976 + }, + { + "epoch": 1.6125768204109505, + "grad_norm": 0.18302311380391678, + "learning_rate": 5.217706060088389e-05, + "loss": 2.8106, + "step": 25977 + }, + { + "epoch": 1.6126388975107084, + "grad_norm": 0.15215826969917712, + "learning_rate": 5.2173452496082965e-05, + "loss": 2.7098, + "step": 25978 + }, + { + "epoch": 1.6127009746104664, + "grad_norm": 0.15558109154266955, + "learning_rate": 5.216984437994262e-05, + "loss": 2.758, + "step": 25979 + }, + { + "epoch": 1.612763051710224, + "grad_norm": 0.1408089917588668, + "learning_rate": 5.216623625248167e-05, + "loss": 2.8053, + "step": 25980 + }, + { + "epoch": 1.612825128809982, + "grad_norm": 0.13926934091538243, + "learning_rate": 5.216262811371895e-05, + "loss": 2.824, + "step": 25981 + }, + { + "epoch": 1.6128872059097399, + "grad_norm": 0.14491258396071224, + "learning_rate": 5.215901996367326e-05, + "loss": 2.8237, + "step": 25982 + }, + { + "epoch": 1.6129492830094978, + "grad_norm": 0.1357843141572199, + "learning_rate": 5.215541180236343e-05, + "loss": 2.8062, + "step": 25983 + }, + { + "epoch": 1.6130113601092557, + "grad_norm": 0.1409057225902549, + "learning_rate": 5.215180362980832e-05, + "loss": 2.7199, + "step": 25984 + }, + { + "epoch": 1.6130734372090136, + "grad_norm": 0.13609710783380013, + "learning_rate": 5.21481954460267e-05, + "loss": 2.6961, + "step": 25985 + }, + { + "epoch": 1.6131355143087713, + "grad_norm": 0.14776044706300143, + "learning_rate": 5.2144587251037445e-05, + "loss": 2.7551, + "step": 25986 + }, + { + "epoch": 1.6131975914085293, + "grad_norm": 0.14469627032465762, + "learning_rate": 5.214097904485935e-05, + "loss": 2.7136, + "step": 25987 + }, + { + "epoch": 1.6132596685082872, + "grad_norm": 0.14507996940570203, + "learning_rate": 5.2137370827511255e-05, + "loss": 2.7646, + "step": 25988 + }, + { + "epoch": 1.613321745608045, + "grad_norm": 0.19250914013206147, + "learning_rate": 5.213376259901197e-05, + "loss": 2.6747, + "step": 25989 + }, + { + "epoch": 1.613383822707803, + "grad_norm": 0.14267410026672334, + "learning_rate": 5.213015435938034e-05, + "loss": 2.7158, + "step": 25990 + }, + { + "epoch": 1.613445899807561, + "grad_norm": 0.14840519704182115, + "learning_rate": 5.2126546108635156e-05, + "loss": 2.8167, + "step": 25991 + }, + { + "epoch": 1.6135079769073188, + "grad_norm": 0.14162600076192206, + "learning_rate": 5.212293784679527e-05, + "loss": 2.7791, + "step": 25992 + }, + { + "epoch": 1.6135700540070768, + "grad_norm": 0.14760951584249693, + "learning_rate": 5.211932957387951e-05, + "loss": 2.7613, + "step": 25993 + }, + { + "epoch": 1.6136321311068347, + "grad_norm": 0.14392543458786844, + "learning_rate": 5.211572128990669e-05, + "loss": 2.7707, + "step": 25994 + }, + { + "epoch": 1.6136942082065926, + "grad_norm": 0.14363112328568048, + "learning_rate": 5.2112112994895645e-05, + "loss": 2.8204, + "step": 25995 + }, + { + "epoch": 1.6137562853063505, + "grad_norm": 0.14776964274007967, + "learning_rate": 5.210850468886518e-05, + "loss": 2.8437, + "step": 25996 + }, + { + "epoch": 1.6138183624061084, + "grad_norm": 0.1634760832694081, + "learning_rate": 5.210489637183415e-05, + "loss": 2.8717, + "step": 25997 + }, + { + "epoch": 1.6138804395058663, + "grad_norm": 0.1517922248265492, + "learning_rate": 5.210128804382135e-05, + "loss": 2.7701, + "step": 25998 + }, + { + "epoch": 1.6139425166056243, + "grad_norm": 0.1444365474664567, + "learning_rate": 5.2097679704845616e-05, + "loss": 2.8008, + "step": 25999 + }, + { + "epoch": 1.6140045937053822, + "grad_norm": 0.15206968891869427, + "learning_rate": 5.20940713549258e-05, + "loss": 2.8132, + "step": 26000 + }, + { + "epoch": 1.61406667080514, + "grad_norm": 0.15717212848318646, + "learning_rate": 5.209046299408068e-05, + "loss": 2.859, + "step": 26001 + }, + { + "epoch": 1.614128747904898, + "grad_norm": 0.16373299820739112, + "learning_rate": 5.208685462232913e-05, + "loss": 2.7116, + "step": 26002 + }, + { + "epoch": 1.614190825004656, + "grad_norm": 0.1767095109900633, + "learning_rate": 5.208324623968993e-05, + "loss": 2.7362, + "step": 26003 + }, + { + "epoch": 1.6142529021044136, + "grad_norm": 0.1397834138866918, + "learning_rate": 5.207963784618195e-05, + "loss": 2.7026, + "step": 26004 + }, + { + "epoch": 1.6143149792041716, + "grad_norm": 0.14545386549646105, + "learning_rate": 5.207602944182398e-05, + "loss": 2.7736, + "step": 26005 + }, + { + "epoch": 1.6143770563039295, + "grad_norm": 0.15102044660761932, + "learning_rate": 5.2072421026634874e-05, + "loss": 2.749, + "step": 26006 + }, + { + "epoch": 1.6144391334036874, + "grad_norm": 0.1419966349370926, + "learning_rate": 5.206881260063343e-05, + "loss": 2.8149, + "step": 26007 + }, + { + "epoch": 1.6145012105034453, + "grad_norm": 0.17265781492499316, + "learning_rate": 5.206520416383851e-05, + "loss": 2.7514, + "step": 26008 + }, + { + "epoch": 1.6145632876032032, + "grad_norm": 0.15735917392695517, + "learning_rate": 5.20615957162689e-05, + "loss": 2.7756, + "step": 26009 + }, + { + "epoch": 1.614625364702961, + "grad_norm": 0.16696972723826078, + "learning_rate": 5.2057987257943453e-05, + "loss": 2.8745, + "step": 26010 + }, + { + "epoch": 1.6146874418027188, + "grad_norm": 0.14030015698923406, + "learning_rate": 5.2054378788880985e-05, + "loss": 2.7296, + "step": 26011 + }, + { + "epoch": 1.6147495189024768, + "grad_norm": 0.15000352954685578, + "learning_rate": 5.205077030910032e-05, + "loss": 2.8286, + "step": 26012 + }, + { + "epoch": 1.6148115960022347, + "grad_norm": 0.14403956134451337, + "learning_rate": 5.2047161818620296e-05, + "loss": 2.7917, + "step": 26013 + }, + { + "epoch": 1.6148736731019926, + "grad_norm": 0.16183513401761082, + "learning_rate": 5.204355331745972e-05, + "loss": 2.8112, + "step": 26014 + }, + { + "epoch": 1.6149357502017505, + "grad_norm": 0.14700770673045074, + "learning_rate": 5.203994480563743e-05, + "loss": 2.8722, + "step": 26015 + }, + { + "epoch": 1.6149978273015084, + "grad_norm": 0.16671095138049916, + "learning_rate": 5.203633628317225e-05, + "loss": 2.8512, + "step": 26016 + }, + { + "epoch": 1.6150599044012663, + "grad_norm": 0.19241343414279624, + "learning_rate": 5.203272775008302e-05, + "loss": 2.8019, + "step": 26017 + }, + { + "epoch": 1.6151219815010243, + "grad_norm": 0.15736312740262065, + "learning_rate": 5.202911920638857e-05, + "loss": 2.8438, + "step": 26018 + }, + { + "epoch": 1.6151840586007822, + "grad_norm": 0.15633073747387918, + "learning_rate": 5.202551065210768e-05, + "loss": 2.8178, + "step": 26019 + }, + { + "epoch": 1.61524613570054, + "grad_norm": 0.15421393132822173, + "learning_rate": 5.2021902087259237e-05, + "loss": 2.8172, + "step": 26020 + }, + { + "epoch": 1.615308212800298, + "grad_norm": 0.13966374213377247, + "learning_rate": 5.2018293511862014e-05, + "loss": 2.7619, + "step": 26021 + }, + { + "epoch": 1.615370289900056, + "grad_norm": 0.18347240049381758, + "learning_rate": 5.2014684925934884e-05, + "loss": 2.7495, + "step": 26022 + }, + { + "epoch": 1.6154323669998139, + "grad_norm": 0.1605291560992377, + "learning_rate": 5.2011076329496645e-05, + "loss": 2.8063, + "step": 26023 + }, + { + "epoch": 1.6154944440995718, + "grad_norm": 0.13691528363302286, + "learning_rate": 5.200746772256615e-05, + "loss": 2.6592, + "step": 26024 + }, + { + "epoch": 1.6155565211993297, + "grad_norm": 0.15646740570466083, + "learning_rate": 5.200385910516219e-05, + "loss": 2.7776, + "step": 26025 + }, + { + "epoch": 1.6156185982990876, + "grad_norm": 0.13991967116962717, + "learning_rate": 5.2000250477303615e-05, + "loss": 2.7779, + "step": 26026 + }, + { + "epoch": 1.6156806753988455, + "grad_norm": 0.16292381222982516, + "learning_rate": 5.199664183900924e-05, + "loss": 2.8697, + "step": 26027 + }, + { + "epoch": 1.6157427524986032, + "grad_norm": 0.1504716469138243, + "learning_rate": 5.199303319029791e-05, + "loss": 2.7693, + "step": 26028 + }, + { + "epoch": 1.6158048295983611, + "grad_norm": 0.1446517171650187, + "learning_rate": 5.198942453118845e-05, + "loss": 2.7222, + "step": 26029 + }, + { + "epoch": 1.615866906698119, + "grad_norm": 0.16890095255164178, + "learning_rate": 5.198581586169966e-05, + "loss": 2.8049, + "step": 26030 + }, + { + "epoch": 1.615928983797877, + "grad_norm": 0.16648461350343788, + "learning_rate": 5.198220718185041e-05, + "loss": 2.8832, + "step": 26031 + }, + { + "epoch": 1.615991060897635, + "grad_norm": 0.15818443305653315, + "learning_rate": 5.1978598491659466e-05, + "loss": 2.8086, + "step": 26032 + }, + { + "epoch": 1.6160531379973928, + "grad_norm": 0.15009716433442616, + "learning_rate": 5.197498979114572e-05, + "loss": 2.7647, + "step": 26033 + }, + { + "epoch": 1.6161152150971505, + "grad_norm": 0.1544487724597802, + "learning_rate": 5.1971381080327966e-05, + "loss": 2.7433, + "step": 26034 + }, + { + "epoch": 1.6161772921969084, + "grad_norm": 0.1547644989477225, + "learning_rate": 5.196777235922505e-05, + "loss": 2.7453, + "step": 26035 + }, + { + "epoch": 1.6162393692966663, + "grad_norm": 0.17572556249359178, + "learning_rate": 5.196416362785578e-05, + "loss": 2.7606, + "step": 26036 + }, + { + "epoch": 1.6163014463964243, + "grad_norm": 0.17599521436504395, + "learning_rate": 5.1960554886238985e-05, + "loss": 2.753, + "step": 26037 + }, + { + "epoch": 1.6163635234961822, + "grad_norm": 0.14998986363931724, + "learning_rate": 5.195694613439352e-05, + "loss": 2.8045, + "step": 26038 + }, + { + "epoch": 1.61642560059594, + "grad_norm": 0.15491300411287012, + "learning_rate": 5.1953337372338164e-05, + "loss": 2.802, + "step": 26039 + }, + { + "epoch": 1.616487677695698, + "grad_norm": 0.15834417060312858, + "learning_rate": 5.19497286000918e-05, + "loss": 2.8299, + "step": 26040 + }, + { + "epoch": 1.616549754795456, + "grad_norm": 0.16089703620620877, + "learning_rate": 5.19461198176732e-05, + "loss": 2.689, + "step": 26041 + }, + { + "epoch": 1.6166118318952138, + "grad_norm": 0.15246466582008544, + "learning_rate": 5.194251102510125e-05, + "loss": 2.7908, + "step": 26042 + }, + { + "epoch": 1.6166739089949718, + "grad_norm": 0.1417434179146131, + "learning_rate": 5.1938902222394725e-05, + "loss": 2.7881, + "step": 26043 + }, + { + "epoch": 1.6167359860947297, + "grad_norm": 0.18338289733307506, + "learning_rate": 5.193529340957248e-05, + "loss": 2.7859, + "step": 26044 + }, + { + "epoch": 1.6167980631944876, + "grad_norm": 0.1481571624641667, + "learning_rate": 5.193168458665335e-05, + "loss": 2.8048, + "step": 26045 + }, + { + "epoch": 1.6168601402942455, + "grad_norm": 0.16307458438302774, + "learning_rate": 5.1928075753656144e-05, + "loss": 2.8709, + "step": 26046 + }, + { + "epoch": 1.6169222173940034, + "grad_norm": 0.16818076437352275, + "learning_rate": 5.1924466910599715e-05, + "loss": 2.7851, + "step": 26047 + }, + { + "epoch": 1.6169842944937614, + "grad_norm": 0.17082257903274836, + "learning_rate": 5.192085805750284e-05, + "loss": 2.8555, + "step": 26048 + }, + { + "epoch": 1.6170463715935193, + "grad_norm": 0.15849222447625286, + "learning_rate": 5.191724919438441e-05, + "loss": 2.7972, + "step": 26049 + }, + { + "epoch": 1.6171084486932772, + "grad_norm": 0.16971072418272537, + "learning_rate": 5.191364032126321e-05, + "loss": 2.77, + "step": 26050 + }, + { + "epoch": 1.617170525793035, + "grad_norm": 0.15350125806134501, + "learning_rate": 5.19100314381581e-05, + "loss": 2.7729, + "step": 26051 + }, + { + "epoch": 1.6172326028927928, + "grad_norm": 0.1715762565652795, + "learning_rate": 5.190642254508788e-05, + "loss": 2.8078, + "step": 26052 + }, + { + "epoch": 1.6172946799925507, + "grad_norm": 0.15629633574091004, + "learning_rate": 5.1902813642071414e-05, + "loss": 2.8099, + "step": 26053 + }, + { + "epoch": 1.6173567570923086, + "grad_norm": 0.16623557050547397, + "learning_rate": 5.1899204729127496e-05, + "loss": 2.8014, + "step": 26054 + }, + { + "epoch": 1.6174188341920666, + "grad_norm": 0.15954107672739268, + "learning_rate": 5.1895595806274946e-05, + "loss": 2.8325, + "step": 26055 + }, + { + "epoch": 1.6174809112918245, + "grad_norm": 0.19076964526611737, + "learning_rate": 5.189198687353263e-05, + "loss": 2.8104, + "step": 26056 + }, + { + "epoch": 1.6175429883915824, + "grad_norm": 0.16062001792068534, + "learning_rate": 5.188837793091935e-05, + "loss": 2.8324, + "step": 26057 + }, + { + "epoch": 1.61760506549134, + "grad_norm": 0.20071843348381202, + "learning_rate": 5.188476897845396e-05, + "loss": 2.8339, + "step": 26058 + }, + { + "epoch": 1.617667142591098, + "grad_norm": 0.150768925108735, + "learning_rate": 5.188116001615525e-05, + "loss": 2.8574, + "step": 26059 + }, + { + "epoch": 1.617729219690856, + "grad_norm": 0.1852659407564149, + "learning_rate": 5.1877551044042093e-05, + "loss": 2.8626, + "step": 26060 + }, + { + "epoch": 1.6177912967906138, + "grad_norm": 0.18574971605962048, + "learning_rate": 5.187394206213328e-05, + "loss": 2.8476, + "step": 26061 + }, + { + "epoch": 1.6178533738903718, + "grad_norm": 0.17677815989355553, + "learning_rate": 5.187033307044768e-05, + "loss": 2.8038, + "step": 26062 + }, + { + "epoch": 1.6179154509901297, + "grad_norm": 0.1639304140259642, + "learning_rate": 5.186672406900407e-05, + "loss": 2.7929, + "step": 26063 + }, + { + "epoch": 1.6179775280898876, + "grad_norm": 0.16958845245834478, + "learning_rate": 5.186311505782132e-05, + "loss": 2.8244, + "step": 26064 + }, + { + "epoch": 1.6180396051896455, + "grad_norm": 0.16231427399639373, + "learning_rate": 5.185950603691824e-05, + "loss": 2.743, + "step": 26065 + }, + { + "epoch": 1.6181016822894034, + "grad_norm": 0.18969157049597302, + "learning_rate": 5.185589700631367e-05, + "loss": 2.7546, + "step": 26066 + }, + { + "epoch": 1.6181637593891613, + "grad_norm": 0.17069867584401271, + "learning_rate": 5.1852287966026436e-05, + "loss": 2.7216, + "step": 26067 + }, + { + "epoch": 1.6182258364889193, + "grad_norm": 0.1538622928914886, + "learning_rate": 5.1848678916075365e-05, + "loss": 2.7752, + "step": 26068 + }, + { + "epoch": 1.6182879135886772, + "grad_norm": 0.16462681991553113, + "learning_rate": 5.184506985647929e-05, + "loss": 2.7933, + "step": 26069 + }, + { + "epoch": 1.618349990688435, + "grad_norm": 0.1494100482875158, + "learning_rate": 5.184146078725703e-05, + "loss": 2.8359, + "step": 26070 + }, + { + "epoch": 1.618412067788193, + "grad_norm": 0.16799456930217257, + "learning_rate": 5.183785170842743e-05, + "loss": 2.7037, + "step": 26071 + }, + { + "epoch": 1.618474144887951, + "grad_norm": 0.15222680053418242, + "learning_rate": 5.183424262000932e-05, + "loss": 2.9049, + "step": 26072 + }, + { + "epoch": 1.6185362219877089, + "grad_norm": 0.15813514638282583, + "learning_rate": 5.18306335220215e-05, + "loss": 2.688, + "step": 26073 + }, + { + "epoch": 1.6185982990874668, + "grad_norm": 0.17603940021778933, + "learning_rate": 5.1827024414482836e-05, + "loss": 2.7722, + "step": 26074 + }, + { + "epoch": 1.6186603761872247, + "grad_norm": 0.16531517136620205, + "learning_rate": 5.1823415297412137e-05, + "loss": 2.7575, + "step": 26075 + }, + { + "epoch": 1.6187224532869824, + "grad_norm": 0.16456320401623176, + "learning_rate": 5.181980617082825e-05, + "loss": 2.7265, + "step": 26076 + }, + { + "epoch": 1.6187845303867403, + "grad_norm": 0.17036776302273848, + "learning_rate": 5.1816197034749966e-05, + "loss": 2.8775, + "step": 26077 + }, + { + "epoch": 1.6188466074864982, + "grad_norm": 0.16333141240827964, + "learning_rate": 5.1812587889196164e-05, + "loss": 2.795, + "step": 26078 + }, + { + "epoch": 1.6189086845862561, + "grad_norm": 0.1462597396146562, + "learning_rate": 5.180897873418563e-05, + "loss": 2.8, + "step": 26079 + }, + { + "epoch": 1.618970761686014, + "grad_norm": 0.18618393002955047, + "learning_rate": 5.1805369569737236e-05, + "loss": 2.805, + "step": 26080 + }, + { + "epoch": 1.619032838785772, + "grad_norm": 0.17725514160266484, + "learning_rate": 5.180176039586978e-05, + "loss": 2.7588, + "step": 26081 + }, + { + "epoch": 1.6190949158855297, + "grad_norm": 0.14857422655290362, + "learning_rate": 5.17981512126021e-05, + "loss": 2.8513, + "step": 26082 + }, + { + "epoch": 1.6191569929852876, + "grad_norm": 0.1761790754111296, + "learning_rate": 5.179454201995303e-05, + "loss": 2.8426, + "step": 26083 + }, + { + "epoch": 1.6192190700850455, + "grad_norm": 0.1555892683281285, + "learning_rate": 5.17909328179414e-05, + "loss": 2.8346, + "step": 26084 + }, + { + "epoch": 1.6192811471848034, + "grad_norm": 0.1645138073593538, + "learning_rate": 5.1787323606586044e-05, + "loss": 2.763, + "step": 26085 + }, + { + "epoch": 1.6193432242845613, + "grad_norm": 0.16786884508296768, + "learning_rate": 5.178371438590578e-05, + "loss": 2.7855, + "step": 26086 + }, + { + "epoch": 1.6194053013843193, + "grad_norm": 0.15538712304351662, + "learning_rate": 5.1780105155919456e-05, + "loss": 2.7632, + "step": 26087 + }, + { + "epoch": 1.6194673784840772, + "grad_norm": 0.1645472098848722, + "learning_rate": 5.177649591664587e-05, + "loss": 2.8701, + "step": 26088 + }, + { + "epoch": 1.619529455583835, + "grad_norm": 0.15015553392880612, + "learning_rate": 5.1772886668103893e-05, + "loss": 2.8533, + "step": 26089 + }, + { + "epoch": 1.619591532683593, + "grad_norm": 0.17132361247377398, + "learning_rate": 5.176927741031233e-05, + "loss": 2.6947, + "step": 26090 + }, + { + "epoch": 1.619653609783351, + "grad_norm": 0.14336386473482834, + "learning_rate": 5.1765668143290027e-05, + "loss": 2.7723, + "step": 26091 + }, + { + "epoch": 1.6197156868831089, + "grad_norm": 0.14514052356162435, + "learning_rate": 5.17620588670558e-05, + "loss": 2.7012, + "step": 26092 + }, + { + "epoch": 1.6197777639828668, + "grad_norm": 0.1434726285927411, + "learning_rate": 5.1758449581628465e-05, + "loss": 2.8291, + "step": 26093 + }, + { + "epoch": 1.6198398410826247, + "grad_norm": 0.13994489560299148, + "learning_rate": 5.1754840287026896e-05, + "loss": 2.7902, + "step": 26094 + }, + { + "epoch": 1.6199019181823826, + "grad_norm": 0.15987218110810894, + "learning_rate": 5.1751230983269885e-05, + "loss": 2.8517, + "step": 26095 + }, + { + "epoch": 1.6199639952821405, + "grad_norm": 0.14730230856031334, + "learning_rate": 5.174762167037628e-05, + "loss": 2.7433, + "step": 26096 + }, + { + "epoch": 1.6200260723818984, + "grad_norm": 0.1543233281408054, + "learning_rate": 5.17440123483649e-05, + "loss": 2.8555, + "step": 26097 + }, + { + "epoch": 1.6200881494816564, + "grad_norm": 0.1567084382069602, + "learning_rate": 5.174040301725459e-05, + "loss": 2.7946, + "step": 26098 + }, + { + "epoch": 1.6201502265814143, + "grad_norm": 0.1608887123952997, + "learning_rate": 5.173679367706419e-05, + "loss": 2.8129, + "step": 26099 + }, + { + "epoch": 1.620212303681172, + "grad_norm": 0.1511388841258826, + "learning_rate": 5.173318432781249e-05, + "loss": 2.8188, + "step": 26100 + }, + { + "epoch": 1.62027438078093, + "grad_norm": 0.1490372600119483, + "learning_rate": 5.1729574969518366e-05, + "loss": 2.878, + "step": 26101 + }, + { + "epoch": 1.6203364578806878, + "grad_norm": 0.1464134807694189, + "learning_rate": 5.172596560220062e-05, + "loss": 2.7888, + "step": 26102 + }, + { + "epoch": 1.6203985349804457, + "grad_norm": 0.14832188470553212, + "learning_rate": 5.172235622587811e-05, + "loss": 2.797, + "step": 26103 + }, + { + "epoch": 1.6204606120802036, + "grad_norm": 0.14575648979653755, + "learning_rate": 5.1718746840569634e-05, + "loss": 2.7453, + "step": 26104 + }, + { + "epoch": 1.6205226891799616, + "grad_norm": 0.14154848879934187, + "learning_rate": 5.171513744629404e-05, + "loss": 2.8258, + "step": 26105 + }, + { + "epoch": 1.6205847662797193, + "grad_norm": 0.14392812644403066, + "learning_rate": 5.171152804307016e-05, + "loss": 2.7499, + "step": 26106 + }, + { + "epoch": 1.6206468433794772, + "grad_norm": 0.14409147189500202, + "learning_rate": 5.170791863091682e-05, + "loss": 2.8275, + "step": 26107 + }, + { + "epoch": 1.620708920479235, + "grad_norm": 0.1562212917657555, + "learning_rate": 5.170430920985285e-05, + "loss": 2.7172, + "step": 26108 + }, + { + "epoch": 1.620770997578993, + "grad_norm": 0.15554985298360083, + "learning_rate": 5.170069977989709e-05, + "loss": 2.8597, + "step": 26109 + }, + { + "epoch": 1.620833074678751, + "grad_norm": 0.14709832911215054, + "learning_rate": 5.1697090341068376e-05, + "loss": 2.8123, + "step": 26110 + }, + { + "epoch": 1.6208951517785088, + "grad_norm": 0.15361779504002168, + "learning_rate": 5.169348089338552e-05, + "loss": 2.8373, + "step": 26111 + }, + { + "epoch": 1.6209572288782668, + "grad_norm": 0.1526740174413065, + "learning_rate": 5.168987143686737e-05, + "loss": 2.7929, + "step": 26112 + }, + { + "epoch": 1.6210193059780247, + "grad_norm": 0.1513632989199358, + "learning_rate": 5.168626197153274e-05, + "loss": 2.845, + "step": 26113 + }, + { + "epoch": 1.6210813830777826, + "grad_norm": 0.1488049188679086, + "learning_rate": 5.168265249740046e-05, + "loss": 2.7965, + "step": 26114 + }, + { + "epoch": 1.6211434601775405, + "grad_norm": 0.1618107262341411, + "learning_rate": 5.1679043014489395e-05, + "loss": 2.7559, + "step": 26115 + }, + { + "epoch": 1.6212055372772984, + "grad_norm": 0.15075821972416537, + "learning_rate": 5.167543352281834e-05, + "loss": 2.8325, + "step": 26116 + }, + { + "epoch": 1.6212676143770564, + "grad_norm": 0.14773606534905695, + "learning_rate": 5.167182402240616e-05, + "loss": 2.8042, + "step": 26117 + }, + { + "epoch": 1.6213296914768143, + "grad_norm": 0.17422379242651925, + "learning_rate": 5.166821451327165e-05, + "loss": 2.8088, + "step": 26118 + }, + { + "epoch": 1.6213917685765722, + "grad_norm": 0.13902087017110096, + "learning_rate": 5.166460499543367e-05, + "loss": 2.7839, + "step": 26119 + }, + { + "epoch": 1.62145384567633, + "grad_norm": 0.1561812130195824, + "learning_rate": 5.166099546891104e-05, + "loss": 2.68, + "step": 26120 + }, + { + "epoch": 1.621515922776088, + "grad_norm": 0.14528369284750686, + "learning_rate": 5.16573859337226e-05, + "loss": 2.7839, + "step": 26121 + }, + { + "epoch": 1.621577999875846, + "grad_norm": 0.14532891098792608, + "learning_rate": 5.165377638988716e-05, + "loss": 2.7865, + "step": 26122 + }, + { + "epoch": 1.6216400769756039, + "grad_norm": 0.14787238237743816, + "learning_rate": 5.1650166837423585e-05, + "loss": 2.8588, + "step": 26123 + }, + { + "epoch": 1.6217021540753616, + "grad_norm": 0.14495461549082042, + "learning_rate": 5.164655727635067e-05, + "loss": 2.8303, + "step": 26124 + }, + { + "epoch": 1.6217642311751195, + "grad_norm": 0.15250878076210309, + "learning_rate": 5.164294770668728e-05, + "loss": 2.7277, + "step": 26125 + }, + { + "epoch": 1.6218263082748774, + "grad_norm": 0.15537329527747584, + "learning_rate": 5.163933812845223e-05, + "loss": 2.8512, + "step": 26126 + }, + { + "epoch": 1.6218883853746353, + "grad_norm": 0.1589888558002358, + "learning_rate": 5.1635728541664364e-05, + "loss": 2.7388, + "step": 26127 + }, + { + "epoch": 1.6219504624743932, + "grad_norm": 0.16319881461821237, + "learning_rate": 5.16321189463425e-05, + "loss": 2.6805, + "step": 26128 + }, + { + "epoch": 1.6220125395741511, + "grad_norm": 0.14774185857675193, + "learning_rate": 5.162850934250546e-05, + "loss": 2.7757, + "step": 26129 + }, + { + "epoch": 1.6220746166739088, + "grad_norm": 0.15237185634643513, + "learning_rate": 5.162489973017209e-05, + "loss": 2.7218, + "step": 26130 + }, + { + "epoch": 1.6221366937736668, + "grad_norm": 0.1473095413517009, + "learning_rate": 5.162129010936123e-05, + "loss": 2.8058, + "step": 26131 + }, + { + "epoch": 1.6221987708734247, + "grad_norm": 0.1479135287706448, + "learning_rate": 5.1617680480091726e-05, + "loss": 2.7215, + "step": 26132 + }, + { + "epoch": 1.6222608479731826, + "grad_norm": 0.1475690609484408, + "learning_rate": 5.1614070842382366e-05, + "loss": 2.7724, + "step": 26133 + }, + { + "epoch": 1.6223229250729405, + "grad_norm": 0.1626229595852396, + "learning_rate": 5.161046119625202e-05, + "loss": 2.7561, + "step": 26134 + }, + { + "epoch": 1.6223850021726984, + "grad_norm": 0.16534841200586825, + "learning_rate": 5.1606851541719516e-05, + "loss": 2.7573, + "step": 26135 + }, + { + "epoch": 1.6224470792724563, + "grad_norm": 0.14371987006341352, + "learning_rate": 5.1603241878803655e-05, + "loss": 2.8498, + "step": 26136 + }, + { + "epoch": 1.6225091563722143, + "grad_norm": 0.15121433169498055, + "learning_rate": 5.159963220752331e-05, + "loss": 2.7909, + "step": 26137 + }, + { + "epoch": 1.6225712334719722, + "grad_norm": 0.164383007514475, + "learning_rate": 5.159602252789728e-05, + "loss": 2.6439, + "step": 26138 + }, + { + "epoch": 1.62263331057173, + "grad_norm": 0.14993292033828887, + "learning_rate": 5.1592412839944424e-05, + "loss": 2.8255, + "step": 26139 + }, + { + "epoch": 1.622695387671488, + "grad_norm": 0.1516428559918911, + "learning_rate": 5.158880314368355e-05, + "loss": 2.779, + "step": 26140 + }, + { + "epoch": 1.622757464771246, + "grad_norm": 0.22012725746117162, + "learning_rate": 5.158519343913353e-05, + "loss": 2.8157, + "step": 26141 + }, + { + "epoch": 1.6228195418710039, + "grad_norm": 0.14252996284457348, + "learning_rate": 5.158158372631314e-05, + "loss": 2.7456, + "step": 26142 + }, + { + "epoch": 1.6228816189707618, + "grad_norm": 0.14120416453515902, + "learning_rate": 5.157797400524127e-05, + "loss": 2.8492, + "step": 26143 + }, + { + "epoch": 1.6229436960705197, + "grad_norm": 0.14389193100160005, + "learning_rate": 5.1574364275936704e-05, + "loss": 2.788, + "step": 26144 + }, + { + "epoch": 1.6230057731702776, + "grad_norm": 0.14225123438986748, + "learning_rate": 5.157075453841832e-05, + "loss": 2.8324, + "step": 26145 + }, + { + "epoch": 1.6230678502700355, + "grad_norm": 0.1588677175141107, + "learning_rate": 5.156714479270492e-05, + "loss": 2.7403, + "step": 26146 + }, + { + "epoch": 1.6231299273697934, + "grad_norm": 0.15593872655604019, + "learning_rate": 5.156353503881532e-05, + "loss": 2.7675, + "step": 26147 + }, + { + "epoch": 1.6231920044695511, + "grad_norm": 0.1662740547212812, + "learning_rate": 5.1559925276768404e-05, + "loss": 2.8092, + "step": 26148 + }, + { + "epoch": 1.623254081569309, + "grad_norm": 0.14737174513509047, + "learning_rate": 5.1556315506582976e-05, + "loss": 2.7921, + "step": 26149 + }, + { + "epoch": 1.623316158669067, + "grad_norm": 0.1585559207918105, + "learning_rate": 5.155270572827787e-05, + "loss": 2.7414, + "step": 26150 + }, + { + "epoch": 1.623378235768825, + "grad_norm": 0.16302061200253834, + "learning_rate": 5.154909594187193e-05, + "loss": 2.7726, + "step": 26151 + }, + { + "epoch": 1.6234403128685828, + "grad_norm": 0.16415337475368771, + "learning_rate": 5.154548614738397e-05, + "loss": 2.7857, + "step": 26152 + }, + { + "epoch": 1.6235023899683407, + "grad_norm": 0.16364766080634405, + "learning_rate": 5.154187634483284e-05, + "loss": 2.7781, + "step": 26153 + }, + { + "epoch": 1.6235644670680984, + "grad_norm": 0.17676686427085686, + "learning_rate": 5.153826653423737e-05, + "loss": 2.7551, + "step": 26154 + }, + { + "epoch": 1.6236265441678563, + "grad_norm": 0.13995005307366926, + "learning_rate": 5.153465671561638e-05, + "loss": 2.7431, + "step": 26155 + }, + { + "epoch": 1.6236886212676143, + "grad_norm": 0.16085857405053225, + "learning_rate": 5.153104688898871e-05, + "loss": 2.7586, + "step": 26156 + }, + { + "epoch": 1.6237506983673722, + "grad_norm": 0.15873857824094686, + "learning_rate": 5.152743705437322e-05, + "loss": 2.7915, + "step": 26157 + }, + { + "epoch": 1.62381277546713, + "grad_norm": 0.1574865111467194, + "learning_rate": 5.1523827211788686e-05, + "loss": 2.8131, + "step": 26158 + }, + { + "epoch": 1.623874852566888, + "grad_norm": 0.16639761454602903, + "learning_rate": 5.152021736125401e-05, + "loss": 2.8617, + "step": 26159 + }, + { + "epoch": 1.623936929666646, + "grad_norm": 0.14601765349047077, + "learning_rate": 5.151660750278796e-05, + "loss": 2.8, + "step": 26160 + }, + { + "epoch": 1.6239990067664039, + "grad_norm": 0.16418021951462605, + "learning_rate": 5.151299763640942e-05, + "loss": 2.8165, + "step": 26161 + }, + { + "epoch": 1.6240610838661618, + "grad_norm": 0.1446288977499991, + "learning_rate": 5.1509387762137196e-05, + "loss": 2.7568, + "step": 26162 + }, + { + "epoch": 1.6241231609659197, + "grad_norm": 0.1545356460289293, + "learning_rate": 5.1505777879990126e-05, + "loss": 2.648, + "step": 26163 + }, + { + "epoch": 1.6241852380656776, + "grad_norm": 0.15651402815736998, + "learning_rate": 5.150216798998706e-05, + "loss": 2.7711, + "step": 26164 + }, + { + "epoch": 1.6242473151654355, + "grad_norm": 0.14794556767456332, + "learning_rate": 5.149855809214681e-05, + "loss": 2.8537, + "step": 26165 + }, + { + "epoch": 1.6243093922651934, + "grad_norm": 0.1644672611983879, + "learning_rate": 5.149494818648822e-05, + "loss": 2.7794, + "step": 26166 + }, + { + "epoch": 1.6243714693649514, + "grad_norm": 0.14984301781699225, + "learning_rate": 5.1491338273030124e-05, + "loss": 2.8619, + "step": 26167 + }, + { + "epoch": 1.6244335464647093, + "grad_norm": 0.15880549511216427, + "learning_rate": 5.1487728351791365e-05, + "loss": 2.8944, + "step": 26168 + }, + { + "epoch": 1.6244956235644672, + "grad_norm": 0.15822239746630123, + "learning_rate": 5.148411842279074e-05, + "loss": 2.6966, + "step": 26169 + }, + { + "epoch": 1.6245577006642251, + "grad_norm": 0.14907074423649938, + "learning_rate": 5.148050848604713e-05, + "loss": 2.7853, + "step": 26170 + }, + { + "epoch": 1.624619777763983, + "grad_norm": 0.14657366256663243, + "learning_rate": 5.147689854157936e-05, + "loss": 2.7759, + "step": 26171 + }, + { + "epoch": 1.6246818548637407, + "grad_norm": 0.14282552601279883, + "learning_rate": 5.1473288589406224e-05, + "loss": 2.8032, + "step": 26172 + }, + { + "epoch": 1.6247439319634986, + "grad_norm": 0.1742697581216037, + "learning_rate": 5.146967862954659e-05, + "loss": 2.6554, + "step": 26173 + }, + { + "epoch": 1.6248060090632566, + "grad_norm": 0.15550369607957734, + "learning_rate": 5.1466068662019285e-05, + "loss": 2.7666, + "step": 26174 + }, + { + "epoch": 1.6248680861630145, + "grad_norm": 0.16677187725596593, + "learning_rate": 5.146245868684315e-05, + "loss": 2.7218, + "step": 26175 + }, + { + "epoch": 1.6249301632627724, + "grad_norm": 0.16706678051857457, + "learning_rate": 5.145884870403701e-05, + "loss": 2.7551, + "step": 26176 + }, + { + "epoch": 1.6249922403625303, + "grad_norm": 0.1946280694265028, + "learning_rate": 5.1455238713619714e-05, + "loss": 2.8699, + "step": 26177 + }, + { + "epoch": 1.625054317462288, + "grad_norm": 0.14869508854725852, + "learning_rate": 5.145162871561007e-05, + "loss": 2.7593, + "step": 26178 + }, + { + "epoch": 1.625116394562046, + "grad_norm": 0.15507077240509357, + "learning_rate": 5.1448018710026915e-05, + "loss": 2.7591, + "step": 26179 + }, + { + "epoch": 1.6251784716618038, + "grad_norm": 0.14584992660625887, + "learning_rate": 5.144440869688912e-05, + "loss": 2.7416, + "step": 26180 + }, + { + "epoch": 1.6252405487615618, + "grad_norm": 0.15828605526881842, + "learning_rate": 5.144079867621547e-05, + "loss": 2.7096, + "step": 26181 + }, + { + "epoch": 1.6253026258613197, + "grad_norm": 0.15480136485628587, + "learning_rate": 5.143718864802485e-05, + "loss": 2.8162, + "step": 26182 + }, + { + "epoch": 1.6253647029610776, + "grad_norm": 0.1540629267154688, + "learning_rate": 5.143357861233605e-05, + "loss": 2.7591, + "step": 26183 + }, + { + "epoch": 1.6254267800608355, + "grad_norm": 0.16279314220764768, + "learning_rate": 5.142996856916793e-05, + "loss": 2.7913, + "step": 26184 + }, + { + "epoch": 1.6254888571605934, + "grad_norm": 0.14173741236684922, + "learning_rate": 5.142635851853931e-05, + "loss": 2.6925, + "step": 26185 + }, + { + "epoch": 1.6255509342603514, + "grad_norm": 0.14823202920116915, + "learning_rate": 5.142274846046905e-05, + "loss": 2.8056, + "step": 26186 + }, + { + "epoch": 1.6256130113601093, + "grad_norm": 0.15072117433885968, + "learning_rate": 5.1419138394975943e-05, + "loss": 2.8177, + "step": 26187 + }, + { + "epoch": 1.6256750884598672, + "grad_norm": 0.16704509410099735, + "learning_rate": 5.1415528322078856e-05, + "loss": 2.7387, + "step": 26188 + }, + { + "epoch": 1.625737165559625, + "grad_norm": 0.15613122240163022, + "learning_rate": 5.141191824179662e-05, + "loss": 2.818, + "step": 26189 + }, + { + "epoch": 1.625799242659383, + "grad_norm": 0.1377142088348897, + "learning_rate": 5.140830815414805e-05, + "loss": 2.7005, + "step": 26190 + }, + { + "epoch": 1.625861319759141, + "grad_norm": 0.16035913781731445, + "learning_rate": 5.140469805915201e-05, + "loss": 2.7554, + "step": 26191 + }, + { + "epoch": 1.6259233968588989, + "grad_norm": 0.14981226746481816, + "learning_rate": 5.140108795682731e-05, + "loss": 2.8268, + "step": 26192 + }, + { + "epoch": 1.6259854739586568, + "grad_norm": 0.16627273284937732, + "learning_rate": 5.13974778471928e-05, + "loss": 2.7613, + "step": 26193 + }, + { + "epoch": 1.6260475510584147, + "grad_norm": 0.1471805306422681, + "learning_rate": 5.13938677302673e-05, + "loss": 2.748, + "step": 26194 + }, + { + "epoch": 1.6261096281581726, + "grad_norm": 0.16777908906310626, + "learning_rate": 5.1390257606069655e-05, + "loss": 2.8297, + "step": 26195 + }, + { + "epoch": 1.6261717052579303, + "grad_norm": 0.14861592436199886, + "learning_rate": 5.13866474746187e-05, + "loss": 2.7502, + "step": 26196 + }, + { + "epoch": 1.6262337823576882, + "grad_norm": 0.15740040237473482, + "learning_rate": 5.138303733593326e-05, + "loss": 2.8131, + "step": 26197 + }, + { + "epoch": 1.6262958594574461, + "grad_norm": 0.16029791265527932, + "learning_rate": 5.1379427190032206e-05, + "loss": 2.6729, + "step": 26198 + }, + { + "epoch": 1.626357936557204, + "grad_norm": 0.18251555975792086, + "learning_rate": 5.137581703693431e-05, + "loss": 2.8576, + "step": 26199 + }, + { + "epoch": 1.626420013656962, + "grad_norm": 0.1771229823454554, + "learning_rate": 5.1372206876658466e-05, + "loss": 2.7798, + "step": 26200 + }, + { + "epoch": 1.62648209075672, + "grad_norm": 0.15584915122820497, + "learning_rate": 5.136859670922348e-05, + "loss": 2.7625, + "step": 26201 + }, + { + "epoch": 1.6265441678564776, + "grad_norm": 0.17633882148726407, + "learning_rate": 5.1364986534648194e-05, + "loss": 2.8272, + "step": 26202 + }, + { + "epoch": 1.6266062449562355, + "grad_norm": 0.15489118536581595, + "learning_rate": 5.136137635295144e-05, + "loss": 2.6878, + "step": 26203 + }, + { + "epoch": 1.6266683220559934, + "grad_norm": 0.15438596350623612, + "learning_rate": 5.1357766164152056e-05, + "loss": 2.8624, + "step": 26204 + }, + { + "epoch": 1.6267303991557513, + "grad_norm": 0.15802923700907381, + "learning_rate": 5.135415596826887e-05, + "loss": 2.8168, + "step": 26205 + }, + { + "epoch": 1.6267924762555093, + "grad_norm": 0.15426863037608252, + "learning_rate": 5.1350545765320735e-05, + "loss": 2.7078, + "step": 26206 + }, + { + "epoch": 1.6268545533552672, + "grad_norm": 0.14810235058574353, + "learning_rate": 5.134693555532648e-05, + "loss": 2.765, + "step": 26207 + }, + { + "epoch": 1.626916630455025, + "grad_norm": 0.14315013583570238, + "learning_rate": 5.1343325338304914e-05, + "loss": 2.7473, + "step": 26208 + }, + { + "epoch": 1.626978707554783, + "grad_norm": 0.151935734035581, + "learning_rate": 5.133971511427491e-05, + "loss": 2.8917, + "step": 26209 + }, + { + "epoch": 1.627040784654541, + "grad_norm": 0.1531084321751448, + "learning_rate": 5.133610488325528e-05, + "loss": 2.7783, + "step": 26210 + }, + { + "epoch": 1.6271028617542989, + "grad_norm": 0.14200462328829194, + "learning_rate": 5.1332494645264875e-05, + "loss": 2.8316, + "step": 26211 + }, + { + "epoch": 1.6271649388540568, + "grad_norm": 0.15543916842566433, + "learning_rate": 5.1328884400322494e-05, + "loss": 2.7731, + "step": 26212 + }, + { + "epoch": 1.6272270159538147, + "grad_norm": 0.14916286670339562, + "learning_rate": 5.132527414844702e-05, + "loss": 2.7921, + "step": 26213 + }, + { + "epoch": 1.6272890930535726, + "grad_norm": 0.15512570965118141, + "learning_rate": 5.132166388965728e-05, + "loss": 2.7856, + "step": 26214 + }, + { + "epoch": 1.6273511701533305, + "grad_norm": 0.14503003092045702, + "learning_rate": 5.131805362397208e-05, + "loss": 2.7758, + "step": 26215 + }, + { + "epoch": 1.6274132472530884, + "grad_norm": 0.143965490161447, + "learning_rate": 5.131444335141029e-05, + "loss": 2.7952, + "step": 26216 + }, + { + "epoch": 1.6274753243528464, + "grad_norm": 0.14862679658562677, + "learning_rate": 5.131083307199071e-05, + "loss": 2.86, + "step": 26217 + }, + { + "epoch": 1.6275374014526043, + "grad_norm": 0.14429351224424905, + "learning_rate": 5.130722278573222e-05, + "loss": 2.7814, + "step": 26218 + }, + { + "epoch": 1.6275994785523622, + "grad_norm": 0.15162616233309412, + "learning_rate": 5.130361249265362e-05, + "loss": 2.8348, + "step": 26219 + }, + { + "epoch": 1.62766155565212, + "grad_norm": 0.1442159616070589, + "learning_rate": 5.130000219277377e-05, + "loss": 2.8062, + "step": 26220 + }, + { + "epoch": 1.6277236327518778, + "grad_norm": 0.1429583691648177, + "learning_rate": 5.129639188611147e-05, + "loss": 2.7911, + "step": 26221 + }, + { + "epoch": 1.6277857098516357, + "grad_norm": 0.15088594290611174, + "learning_rate": 5.129278157268559e-05, + "loss": 2.8007, + "step": 26222 + }, + { + "epoch": 1.6278477869513936, + "grad_norm": 0.1376874190969658, + "learning_rate": 5.1289171252514946e-05, + "loss": 2.8051, + "step": 26223 + }, + { + "epoch": 1.6279098640511516, + "grad_norm": 0.14088330123309745, + "learning_rate": 5.12855609256184e-05, + "loss": 2.7754, + "step": 26224 + }, + { + "epoch": 1.6279719411509093, + "grad_norm": 0.14012847105205425, + "learning_rate": 5.128195059201477e-05, + "loss": 2.7773, + "step": 26225 + }, + { + "epoch": 1.6280340182506672, + "grad_norm": 0.15469893451775624, + "learning_rate": 5.127834025172288e-05, + "loss": 2.8027, + "step": 26226 + }, + { + "epoch": 1.628096095350425, + "grad_norm": 0.14048206164432805, + "learning_rate": 5.127472990476159e-05, + "loss": 2.71, + "step": 26227 + }, + { + "epoch": 1.628158172450183, + "grad_norm": 0.14948910966450396, + "learning_rate": 5.127111955114969e-05, + "loss": 2.8498, + "step": 26228 + }, + { + "epoch": 1.628220249549941, + "grad_norm": 0.14702392393664454, + "learning_rate": 5.126750919090608e-05, + "loss": 2.7547, + "step": 26229 + }, + { + "epoch": 1.6282823266496989, + "grad_norm": 0.14253333446381058, + "learning_rate": 5.126389882404956e-05, + "loss": 2.736, + "step": 26230 + }, + { + "epoch": 1.6283444037494568, + "grad_norm": 0.15748089306412386, + "learning_rate": 5.126028845059898e-05, + "loss": 2.6747, + "step": 26231 + }, + { + "epoch": 1.6284064808492147, + "grad_norm": 0.14674320242652192, + "learning_rate": 5.1256678070573186e-05, + "loss": 2.7868, + "step": 26232 + }, + { + "epoch": 1.6284685579489726, + "grad_norm": 0.1475904224637723, + "learning_rate": 5.1253067683990965e-05, + "loss": 2.8135, + "step": 26233 + }, + { + "epoch": 1.6285306350487305, + "grad_norm": 0.15087920253687534, + "learning_rate": 5.12494572908712e-05, + "loss": 2.8249, + "step": 26234 + }, + { + "epoch": 1.6285927121484884, + "grad_norm": 0.15154242677587995, + "learning_rate": 5.124584689123271e-05, + "loss": 2.8865, + "step": 26235 + }, + { + "epoch": 1.6286547892482464, + "grad_norm": 0.15503843986403507, + "learning_rate": 5.124223648509434e-05, + "loss": 2.841, + "step": 26236 + }, + { + "epoch": 1.6287168663480043, + "grad_norm": 0.14573124877482074, + "learning_rate": 5.1238626072474916e-05, + "loss": 2.8162, + "step": 26237 + }, + { + "epoch": 1.6287789434477622, + "grad_norm": 0.14596122146609042, + "learning_rate": 5.123501565339328e-05, + "loss": 2.7261, + "step": 26238 + }, + { + "epoch": 1.6288410205475201, + "grad_norm": 0.15184765516583423, + "learning_rate": 5.123140522786827e-05, + "loss": 2.7944, + "step": 26239 + }, + { + "epoch": 1.628903097647278, + "grad_norm": 0.14390587829434148, + "learning_rate": 5.122779479591872e-05, + "loss": 2.7848, + "step": 26240 + }, + { + "epoch": 1.628965174747036, + "grad_norm": 0.15164183649073446, + "learning_rate": 5.122418435756345e-05, + "loss": 2.877, + "step": 26241 + }, + { + "epoch": 1.6290272518467939, + "grad_norm": 0.1458343196449532, + "learning_rate": 5.1220573912821334e-05, + "loss": 2.7793, + "step": 26242 + }, + { + "epoch": 1.6290893289465516, + "grad_norm": 0.16463621055663435, + "learning_rate": 5.121696346171119e-05, + "loss": 2.8321, + "step": 26243 + }, + { + "epoch": 1.6291514060463095, + "grad_norm": 0.15141601390278903, + "learning_rate": 5.121335300425183e-05, + "loss": 2.8022, + "step": 26244 + }, + { + "epoch": 1.6292134831460674, + "grad_norm": 0.14026218871668614, + "learning_rate": 5.1209742540462125e-05, + "loss": 2.7772, + "step": 26245 + }, + { + "epoch": 1.6292755602458253, + "grad_norm": 0.14289243229786253, + "learning_rate": 5.120613207036089e-05, + "loss": 2.8369, + "step": 26246 + }, + { + "epoch": 1.6293376373455832, + "grad_norm": 0.13905606308516064, + "learning_rate": 5.120252159396699e-05, + "loss": 2.7126, + "step": 26247 + }, + { + "epoch": 1.6293997144453412, + "grad_norm": 0.14475657323092392, + "learning_rate": 5.1198911111299234e-05, + "loss": 2.804, + "step": 26248 + }, + { + "epoch": 1.6294617915450988, + "grad_norm": 0.1515217699301865, + "learning_rate": 5.1195300622376466e-05, + "loss": 2.8557, + "step": 26249 + }, + { + "epoch": 1.6295238686448568, + "grad_norm": 0.14842074701234176, + "learning_rate": 5.119169012721753e-05, + "loss": 2.7712, + "step": 26250 + }, + { + "epoch": 1.6295859457446147, + "grad_norm": 0.18328108440368923, + "learning_rate": 5.118807962584126e-05, + "loss": 2.7915, + "step": 26251 + }, + { + "epoch": 1.6296480228443726, + "grad_norm": 0.1570523167233741, + "learning_rate": 5.118446911826647e-05, + "loss": 2.8629, + "step": 26252 + }, + { + "epoch": 1.6297100999441305, + "grad_norm": 0.15182645710720633, + "learning_rate": 5.118085860451204e-05, + "loss": 2.7722, + "step": 26253 + }, + { + "epoch": 1.6297721770438884, + "grad_norm": 0.15400319580030364, + "learning_rate": 5.117724808459677e-05, + "loss": 2.8305, + "step": 26254 + }, + { + "epoch": 1.6298342541436464, + "grad_norm": 0.1686736828370101, + "learning_rate": 5.117363755853951e-05, + "loss": 2.8076, + "step": 26255 + }, + { + "epoch": 1.6298963312434043, + "grad_norm": 0.1491711892731264, + "learning_rate": 5.117002702635911e-05, + "loss": 2.7623, + "step": 26256 + }, + { + "epoch": 1.6299584083431622, + "grad_norm": 0.17793351848678, + "learning_rate": 5.1166416488074386e-05, + "loss": 2.7695, + "step": 26257 + }, + { + "epoch": 1.63002048544292, + "grad_norm": 0.15679026422026296, + "learning_rate": 5.116280594370419e-05, + "loss": 2.8641, + "step": 26258 + }, + { + "epoch": 1.630082562542678, + "grad_norm": 0.174728813967067, + "learning_rate": 5.1159195393267344e-05, + "loss": 2.7805, + "step": 26259 + }, + { + "epoch": 1.630144639642436, + "grad_norm": 0.16264493214883785, + "learning_rate": 5.115558483678271e-05, + "loss": 2.8225, + "step": 26260 + }, + { + "epoch": 1.6302067167421939, + "grad_norm": 0.1630132421781729, + "learning_rate": 5.11519742742691e-05, + "loss": 2.8483, + "step": 26261 + }, + { + "epoch": 1.6302687938419518, + "grad_norm": 0.17305094324500758, + "learning_rate": 5.114836370574535e-05, + "loss": 2.72, + "step": 26262 + }, + { + "epoch": 1.6303308709417097, + "grad_norm": 0.18570202815494624, + "learning_rate": 5.1144753131230317e-05, + "loss": 2.759, + "step": 26263 + }, + { + "epoch": 1.6303929480414676, + "grad_norm": 0.19329768458677815, + "learning_rate": 5.114114255074283e-05, + "loss": 2.8068, + "step": 26264 + }, + { + "epoch": 1.6304550251412255, + "grad_norm": 0.17175000695925424, + "learning_rate": 5.113753196430173e-05, + "loss": 2.7646, + "step": 26265 + }, + { + "epoch": 1.6305171022409835, + "grad_norm": 0.18223443949561788, + "learning_rate": 5.1133921371925844e-05, + "loss": 2.7469, + "step": 26266 + }, + { + "epoch": 1.6305791793407411, + "grad_norm": 0.1608266117417973, + "learning_rate": 5.113031077363401e-05, + "loss": 2.849, + "step": 26267 + }, + { + "epoch": 1.630641256440499, + "grad_norm": 0.1512302394442992, + "learning_rate": 5.1126700169445085e-05, + "loss": 2.7547, + "step": 26268 + }, + { + "epoch": 1.630703333540257, + "grad_norm": 0.15682075683549046, + "learning_rate": 5.112308955937788e-05, + "loss": 2.79, + "step": 26269 + }, + { + "epoch": 1.630765410640015, + "grad_norm": 0.15382897983660296, + "learning_rate": 5.111947894345125e-05, + "loss": 2.7681, + "step": 26270 + }, + { + "epoch": 1.6308274877397728, + "grad_norm": 0.16515656526259376, + "learning_rate": 5.111586832168401e-05, + "loss": 2.8237, + "step": 26271 + }, + { + "epoch": 1.6308895648395307, + "grad_norm": 0.15750897556040325, + "learning_rate": 5.111225769409505e-05, + "loss": 2.7949, + "step": 26272 + }, + { + "epoch": 1.6309516419392884, + "grad_norm": 0.16423878962147434, + "learning_rate": 5.110864706070313e-05, + "loss": 2.803, + "step": 26273 + }, + { + "epoch": 1.6310137190390464, + "grad_norm": 0.1738990253342751, + "learning_rate": 5.110503642152715e-05, + "loss": 2.9097, + "step": 26274 + }, + { + "epoch": 1.6310757961388043, + "grad_norm": 0.17120922711802014, + "learning_rate": 5.110142577658592e-05, + "loss": 2.7213, + "step": 26275 + }, + { + "epoch": 1.6311378732385622, + "grad_norm": 0.15737481561578762, + "learning_rate": 5.109781512589829e-05, + "loss": 2.8357, + "step": 26276 + }, + { + "epoch": 1.63119995033832, + "grad_norm": 0.15639948846490676, + "learning_rate": 5.1094204469483095e-05, + "loss": 2.8318, + "step": 26277 + }, + { + "epoch": 1.631262027438078, + "grad_norm": 0.16219200188667585, + "learning_rate": 5.109059380735917e-05, + "loss": 2.7226, + "step": 26278 + }, + { + "epoch": 1.631324104537836, + "grad_norm": 0.1524662356832938, + "learning_rate": 5.108698313954534e-05, + "loss": 2.7874, + "step": 26279 + }, + { + "epoch": 1.6313861816375939, + "grad_norm": 0.15284825808635627, + "learning_rate": 5.108337246606046e-05, + "loss": 2.8591, + "step": 26280 + }, + { + "epoch": 1.6314482587373518, + "grad_norm": 0.14675960116277503, + "learning_rate": 5.1079761786923375e-05, + "loss": 2.7013, + "step": 26281 + }, + { + "epoch": 1.6315103358371097, + "grad_norm": 0.15888363859242582, + "learning_rate": 5.107615110215289e-05, + "loss": 2.8446, + "step": 26282 + }, + { + "epoch": 1.6315724129368676, + "grad_norm": 0.161661177674943, + "learning_rate": 5.107254041176789e-05, + "loss": 2.8345, + "step": 26283 + }, + { + "epoch": 1.6316344900366255, + "grad_norm": 0.14695230133945278, + "learning_rate": 5.1068929715787165e-05, + "loss": 2.7553, + "step": 26284 + }, + { + "epoch": 1.6316965671363834, + "grad_norm": 0.1528045093264824, + "learning_rate": 5.1065319014229583e-05, + "loss": 2.8665, + "step": 26285 + }, + { + "epoch": 1.6317586442361414, + "grad_norm": 0.15297896661822835, + "learning_rate": 5.1061708307113975e-05, + "loss": 2.7709, + "step": 26286 + }, + { + "epoch": 1.6318207213358993, + "grad_norm": 0.15245603433788532, + "learning_rate": 5.1058097594459164e-05, + "loss": 2.7685, + "step": 26287 + }, + { + "epoch": 1.6318827984356572, + "grad_norm": 0.14732584544162083, + "learning_rate": 5.105448687628402e-05, + "loss": 2.8759, + "step": 26288 + }, + { + "epoch": 1.6319448755354151, + "grad_norm": 0.1613574744860919, + "learning_rate": 5.105087615260734e-05, + "loss": 2.8096, + "step": 26289 + }, + { + "epoch": 1.632006952635173, + "grad_norm": 0.13878585939498453, + "learning_rate": 5.104726542344801e-05, + "loss": 2.7351, + "step": 26290 + }, + { + "epoch": 1.6320690297349307, + "grad_norm": 0.17338602229988784, + "learning_rate": 5.104365468882481e-05, + "loss": 2.8606, + "step": 26291 + }, + { + "epoch": 1.6321311068346886, + "grad_norm": 0.16046018371449555, + "learning_rate": 5.104004394875664e-05, + "loss": 2.8221, + "step": 26292 + }, + { + "epoch": 1.6321931839344466, + "grad_norm": 0.1400428949680258, + "learning_rate": 5.103643320326229e-05, + "loss": 2.6951, + "step": 26293 + }, + { + "epoch": 1.6322552610342045, + "grad_norm": 0.14193512663375038, + "learning_rate": 5.103282245236062e-05, + "loss": 2.8127, + "step": 26294 + }, + { + "epoch": 1.6323173381339624, + "grad_norm": 0.15226411572095397, + "learning_rate": 5.1029211696070466e-05, + "loss": 2.7763, + "step": 26295 + }, + { + "epoch": 1.6323794152337203, + "grad_norm": 0.1610214310829629, + "learning_rate": 5.1025600934410665e-05, + "loss": 2.8107, + "step": 26296 + }, + { + "epoch": 1.632441492333478, + "grad_norm": 0.14482746184943487, + "learning_rate": 5.1021990167400056e-05, + "loss": 2.7304, + "step": 26297 + }, + { + "epoch": 1.632503569433236, + "grad_norm": 0.14292871795475584, + "learning_rate": 5.101837939505748e-05, + "loss": 2.827, + "step": 26298 + }, + { + "epoch": 1.6325656465329939, + "grad_norm": 0.1539320677194874, + "learning_rate": 5.1014768617401764e-05, + "loss": 2.7663, + "step": 26299 + }, + { + "epoch": 1.6326277236327518, + "grad_norm": 0.1380667849886415, + "learning_rate": 5.101115783445176e-05, + "loss": 2.8334, + "step": 26300 + }, + { + "epoch": 1.6326898007325097, + "grad_norm": 0.16045112910558426, + "learning_rate": 5.10075470462263e-05, + "loss": 2.7587, + "step": 26301 + }, + { + "epoch": 1.6327518778322676, + "grad_norm": 0.16786404051728937, + "learning_rate": 5.100393625274421e-05, + "loss": 2.8532, + "step": 26302 + }, + { + "epoch": 1.6328139549320255, + "grad_norm": 0.140010797103421, + "learning_rate": 5.100032545402435e-05, + "loss": 2.7546, + "step": 26303 + }, + { + "epoch": 1.6328760320317834, + "grad_norm": 0.14329328546758643, + "learning_rate": 5.099671465008555e-05, + "loss": 2.8008, + "step": 26304 + }, + { + "epoch": 1.6329381091315414, + "grad_norm": 0.15367415471560486, + "learning_rate": 5.0993103840946656e-05, + "loss": 2.8103, + "step": 26305 + }, + { + "epoch": 1.6330001862312993, + "grad_norm": 0.1424132268836028, + "learning_rate": 5.09894930266265e-05, + "loss": 2.824, + "step": 26306 + }, + { + "epoch": 1.6330622633310572, + "grad_norm": 0.14684737052156027, + "learning_rate": 5.09858822071439e-05, + "loss": 2.8001, + "step": 26307 + }, + { + "epoch": 1.6331243404308151, + "grad_norm": 0.14520207228997853, + "learning_rate": 5.098227138251773e-05, + "loss": 2.7751, + "step": 26308 + }, + { + "epoch": 1.633186417530573, + "grad_norm": 0.13537472448294544, + "learning_rate": 5.09786605527668e-05, + "loss": 2.7201, + "step": 26309 + }, + { + "epoch": 1.633248494630331, + "grad_norm": 0.14249244493191124, + "learning_rate": 5.097504971790995e-05, + "loss": 2.7576, + "step": 26310 + }, + { + "epoch": 1.6333105717300889, + "grad_norm": 0.14204192910371108, + "learning_rate": 5.097143887796605e-05, + "loss": 2.8431, + "step": 26311 + }, + { + "epoch": 1.6333726488298468, + "grad_norm": 0.14169252039340632, + "learning_rate": 5.09678280329539e-05, + "loss": 2.7755, + "step": 26312 + }, + { + "epoch": 1.6334347259296047, + "grad_norm": 0.1476235913686708, + "learning_rate": 5.096421718289238e-05, + "loss": 2.7342, + "step": 26313 + }, + { + "epoch": 1.6334968030293626, + "grad_norm": 0.14572777220045433, + "learning_rate": 5.0960606327800285e-05, + "loss": 2.8546, + "step": 26314 + }, + { + "epoch": 1.6335588801291203, + "grad_norm": 0.1436628142067243, + "learning_rate": 5.095699546769648e-05, + "loss": 2.7606, + "step": 26315 + }, + { + "epoch": 1.6336209572288782, + "grad_norm": 0.15344078651878565, + "learning_rate": 5.095338460259979e-05, + "loss": 2.8213, + "step": 26316 + }, + { + "epoch": 1.6336830343286362, + "grad_norm": 0.16811351304359926, + "learning_rate": 5.094977373252908e-05, + "loss": 2.7116, + "step": 26317 + }, + { + "epoch": 1.633745111428394, + "grad_norm": 0.1455486675485827, + "learning_rate": 5.0946162857503155e-05, + "loss": 2.7859, + "step": 26318 + }, + { + "epoch": 1.633807188528152, + "grad_norm": 0.1532032209704389, + "learning_rate": 5.094255197754088e-05, + "loss": 2.8413, + "step": 26319 + }, + { + "epoch": 1.63386926562791, + "grad_norm": 0.142199510025201, + "learning_rate": 5.093894109266106e-05, + "loss": 2.7874, + "step": 26320 + }, + { + "epoch": 1.6339313427276676, + "grad_norm": 0.1435800581849767, + "learning_rate": 5.093533020288258e-05, + "loss": 2.8031, + "step": 26321 + }, + { + "epoch": 1.6339934198274255, + "grad_norm": 0.152130980953165, + "learning_rate": 5.0931719308224235e-05, + "loss": 2.7571, + "step": 26322 + }, + { + "epoch": 1.6340554969271834, + "grad_norm": 0.1719355530710067, + "learning_rate": 5.09281084087049e-05, + "loss": 2.6659, + "step": 26323 + }, + { + "epoch": 1.6341175740269414, + "grad_norm": 0.1518770613451968, + "learning_rate": 5.09244975043434e-05, + "loss": 2.7921, + "step": 26324 + }, + { + "epoch": 1.6341796511266993, + "grad_norm": 0.1422527205570772, + "learning_rate": 5.092088659515857e-05, + "loss": 2.7545, + "step": 26325 + }, + { + "epoch": 1.6342417282264572, + "grad_norm": 0.1517961269601717, + "learning_rate": 5.091727568116924e-05, + "loss": 2.8016, + "step": 26326 + }, + { + "epoch": 1.634303805326215, + "grad_norm": 0.14940677575188635, + "learning_rate": 5.091366476239425e-05, + "loss": 2.7464, + "step": 26327 + }, + { + "epoch": 1.634365882425973, + "grad_norm": 0.14849268364343002, + "learning_rate": 5.0910053838852476e-05, + "loss": 2.8568, + "step": 26328 + }, + { + "epoch": 1.634427959525731, + "grad_norm": 0.15227325960516563, + "learning_rate": 5.0906442910562715e-05, + "loss": 2.8198, + "step": 26329 + }, + { + "epoch": 1.6344900366254889, + "grad_norm": 0.1394315716305222, + "learning_rate": 5.090283197754383e-05, + "loss": 2.7895, + "step": 26330 + }, + { + "epoch": 1.6345521137252468, + "grad_norm": 0.1543015829406232, + "learning_rate": 5.089922103981466e-05, + "loss": 2.7707, + "step": 26331 + }, + { + "epoch": 1.6346141908250047, + "grad_norm": 0.1390922849347021, + "learning_rate": 5.089561009739401e-05, + "loss": 2.7428, + "step": 26332 + }, + { + "epoch": 1.6346762679247626, + "grad_norm": 0.15044330318369203, + "learning_rate": 5.089199915030076e-05, + "loss": 2.8371, + "step": 26333 + }, + { + "epoch": 1.6347383450245205, + "grad_norm": 0.1485482919215538, + "learning_rate": 5.0888388198553724e-05, + "loss": 2.871, + "step": 26334 + }, + { + "epoch": 1.6348004221242785, + "grad_norm": 0.16797406720461527, + "learning_rate": 5.0884777242171776e-05, + "loss": 2.7963, + "step": 26335 + }, + { + "epoch": 1.6348624992240364, + "grad_norm": 0.1444706209706447, + "learning_rate": 5.0881166281173695e-05, + "loss": 2.6978, + "step": 26336 + }, + { + "epoch": 1.6349245763237943, + "grad_norm": 0.18105050074932905, + "learning_rate": 5.087755531557839e-05, + "loss": 2.7665, + "step": 26337 + }, + { + "epoch": 1.6349866534235522, + "grad_norm": 0.15111570078507913, + "learning_rate": 5.087394434540465e-05, + "loss": 2.7097, + "step": 26338 + }, + { + "epoch": 1.63504873052331, + "grad_norm": 0.16093155979563864, + "learning_rate": 5.087033337067133e-05, + "loss": 2.7932, + "step": 26339 + }, + { + "epoch": 1.6351108076230678, + "grad_norm": 0.1598496701010276, + "learning_rate": 5.086672239139726e-05, + "loss": 2.8178, + "step": 26340 + }, + { + "epoch": 1.6351728847228257, + "grad_norm": 0.14933983822896124, + "learning_rate": 5.08631114076013e-05, + "loss": 2.8425, + "step": 26341 + }, + { + "epoch": 1.6352349618225837, + "grad_norm": 0.1605132763922037, + "learning_rate": 5.0859500419302285e-05, + "loss": 2.7724, + "step": 26342 + }, + { + "epoch": 1.6352970389223416, + "grad_norm": 0.14373671091891352, + "learning_rate": 5.085588942651902e-05, + "loss": 2.7254, + "step": 26343 + }, + { + "epoch": 1.6353591160220995, + "grad_norm": 0.1672585966779718, + "learning_rate": 5.08522784292704e-05, + "loss": 2.8656, + "step": 26344 + }, + { + "epoch": 1.6354211931218572, + "grad_norm": 0.16149192442685012, + "learning_rate": 5.084866742757523e-05, + "loss": 2.8022, + "step": 26345 + }, + { + "epoch": 1.635483270221615, + "grad_norm": 0.2081462864829975, + "learning_rate": 5.0845056421452354e-05, + "loss": 2.7196, + "step": 26346 + }, + { + "epoch": 1.635545347321373, + "grad_norm": 0.15016158113334233, + "learning_rate": 5.0841445410920606e-05, + "loss": 2.8135, + "step": 26347 + }, + { + "epoch": 1.635607424421131, + "grad_norm": 0.16664460465820424, + "learning_rate": 5.083783439599885e-05, + "loss": 2.7957, + "step": 26348 + }, + { + "epoch": 1.6356695015208889, + "grad_norm": 0.15810200909510347, + "learning_rate": 5.08342233767059e-05, + "loss": 2.8374, + "step": 26349 + }, + { + "epoch": 1.6357315786206468, + "grad_norm": 0.14656108754866243, + "learning_rate": 5.0830612353060605e-05, + "loss": 2.9195, + "step": 26350 + }, + { + "epoch": 1.6357936557204047, + "grad_norm": 0.14033547372650576, + "learning_rate": 5.08270013250818e-05, + "loss": 2.7816, + "step": 26351 + }, + { + "epoch": 1.6358557328201626, + "grad_norm": 0.1448897734919061, + "learning_rate": 5.082339029278832e-05, + "loss": 2.7509, + "step": 26352 + }, + { + "epoch": 1.6359178099199205, + "grad_norm": 0.15493454327826256, + "learning_rate": 5.081977925619903e-05, + "loss": 2.7814, + "step": 26353 + }, + { + "epoch": 1.6359798870196784, + "grad_norm": 0.14995172165745346, + "learning_rate": 5.081616821533274e-05, + "loss": 2.7626, + "step": 26354 + }, + { + "epoch": 1.6360419641194364, + "grad_norm": 0.14923005211054557, + "learning_rate": 5.081255717020832e-05, + "loss": 2.7788, + "step": 26355 + }, + { + "epoch": 1.6361040412191943, + "grad_norm": 0.15160899997465474, + "learning_rate": 5.080894612084457e-05, + "loss": 2.8099, + "step": 26356 + }, + { + "epoch": 1.6361661183189522, + "grad_norm": 0.15238149493248515, + "learning_rate": 5.0805335067260374e-05, + "loss": 2.8145, + "step": 26357 + }, + { + "epoch": 1.6362281954187101, + "grad_norm": 0.1445583176397485, + "learning_rate": 5.080172400947453e-05, + "loss": 2.8084, + "step": 26358 + }, + { + "epoch": 1.636290272518468, + "grad_norm": 0.15478853769549858, + "learning_rate": 5.079811294750589e-05, + "loss": 2.7859, + "step": 26359 + }, + { + "epoch": 1.636352349618226, + "grad_norm": 0.15007430411801337, + "learning_rate": 5.079450188137332e-05, + "loss": 2.7453, + "step": 26360 + }, + { + "epoch": 1.6364144267179839, + "grad_norm": 0.15459327959012617, + "learning_rate": 5.079089081109564e-05, + "loss": 2.8159, + "step": 26361 + }, + { + "epoch": 1.6364765038177418, + "grad_norm": 0.14715062262581274, + "learning_rate": 5.078727973669168e-05, + "loss": 2.8745, + "step": 26362 + }, + { + "epoch": 1.6365385809174995, + "grad_norm": 0.14823556013207537, + "learning_rate": 5.0783668658180296e-05, + "loss": 2.7838, + "step": 26363 + }, + { + "epoch": 1.6366006580172574, + "grad_norm": 0.15573610196783144, + "learning_rate": 5.078005757558033e-05, + "loss": 2.7795, + "step": 26364 + }, + { + "epoch": 1.6366627351170153, + "grad_norm": 0.14564784563530855, + "learning_rate": 5.07764464889106e-05, + "loss": 2.8136, + "step": 26365 + }, + { + "epoch": 1.6367248122167732, + "grad_norm": 0.16448844385427816, + "learning_rate": 5.077283539818998e-05, + "loss": 2.8054, + "step": 26366 + }, + { + "epoch": 1.6367868893165312, + "grad_norm": 0.15411061500871165, + "learning_rate": 5.076922430343728e-05, + "loss": 2.8022, + "step": 26367 + }, + { + "epoch": 1.636848966416289, + "grad_norm": 0.1543819880834404, + "learning_rate": 5.0765613204671345e-05, + "loss": 2.7286, + "step": 26368 + }, + { + "epoch": 1.6369110435160468, + "grad_norm": 0.14972362986179927, + "learning_rate": 5.076200210191103e-05, + "loss": 2.7848, + "step": 26369 + }, + { + "epoch": 1.6369731206158047, + "grad_norm": 0.15697768161196113, + "learning_rate": 5.075839099517515e-05, + "loss": 2.7118, + "step": 26370 + }, + { + "epoch": 1.6370351977155626, + "grad_norm": 0.15918886123153106, + "learning_rate": 5.075477988448257e-05, + "loss": 2.8219, + "step": 26371 + }, + { + "epoch": 1.6370972748153205, + "grad_norm": 0.15656970218836244, + "learning_rate": 5.075116876985212e-05, + "loss": 2.8715, + "step": 26372 + }, + { + "epoch": 1.6371593519150784, + "grad_norm": 0.14899261725860155, + "learning_rate": 5.0747557651302645e-05, + "loss": 2.7859, + "step": 26373 + }, + { + "epoch": 1.6372214290148364, + "grad_norm": 0.14200360017480695, + "learning_rate": 5.074394652885297e-05, + "loss": 2.7175, + "step": 26374 + }, + { + "epoch": 1.6372835061145943, + "grad_norm": 0.14881400619579307, + "learning_rate": 5.074033540252196e-05, + "loss": 2.7382, + "step": 26375 + }, + { + "epoch": 1.6373455832143522, + "grad_norm": 0.15278506547591947, + "learning_rate": 5.0736724272328415e-05, + "loss": 2.8278, + "step": 26376 + }, + { + "epoch": 1.6374076603141101, + "grad_norm": 0.15015514938038454, + "learning_rate": 5.073311313829121e-05, + "loss": 2.804, + "step": 26377 + }, + { + "epoch": 1.637469737413868, + "grad_norm": 0.14964642252769114, + "learning_rate": 5.072950200042918e-05, + "loss": 2.8506, + "step": 26378 + }, + { + "epoch": 1.637531814513626, + "grad_norm": 0.1586438621311703, + "learning_rate": 5.072589085876116e-05, + "loss": 2.7446, + "step": 26379 + }, + { + "epoch": 1.6375938916133839, + "grad_norm": 0.1442398665230634, + "learning_rate": 5.0722279713306e-05, + "loss": 2.8288, + "step": 26380 + }, + { + "epoch": 1.6376559687131418, + "grad_norm": 0.16261133359827515, + "learning_rate": 5.071866856408252e-05, + "loss": 2.8079, + "step": 26381 + }, + { + "epoch": 1.6377180458128997, + "grad_norm": 0.1839341295479856, + "learning_rate": 5.0715057411109576e-05, + "loss": 2.8739, + "step": 26382 + }, + { + "epoch": 1.6377801229126576, + "grad_norm": 0.15539108473842575, + "learning_rate": 5.0711446254406e-05, + "loss": 2.8586, + "step": 26383 + }, + { + "epoch": 1.6378422000124155, + "grad_norm": 0.14694290873958557, + "learning_rate": 5.070783509399065e-05, + "loss": 2.7279, + "step": 26384 + }, + { + "epoch": 1.6379042771121735, + "grad_norm": 0.15030221693747833, + "learning_rate": 5.0704223929882356e-05, + "loss": 2.7512, + "step": 26385 + }, + { + "epoch": 1.6379663542119314, + "grad_norm": 0.15064105325247645, + "learning_rate": 5.070061276209993e-05, + "loss": 2.8051, + "step": 26386 + }, + { + "epoch": 1.638028431311689, + "grad_norm": 0.1706088200607379, + "learning_rate": 5.069700159066225e-05, + "loss": 2.7506, + "step": 26387 + }, + { + "epoch": 1.638090508411447, + "grad_norm": 0.15491141696129024, + "learning_rate": 5.069339041558814e-05, + "loss": 2.8284, + "step": 26388 + }, + { + "epoch": 1.638152585511205, + "grad_norm": 0.14395568918950236, + "learning_rate": 5.068977923689645e-05, + "loss": 2.8189, + "step": 26389 + }, + { + "epoch": 1.6382146626109628, + "grad_norm": 0.15116032031353713, + "learning_rate": 5.068616805460601e-05, + "loss": 2.7648, + "step": 26390 + }, + { + "epoch": 1.6382767397107207, + "grad_norm": 0.1502110226256388, + "learning_rate": 5.068255686873567e-05, + "loss": 2.8045, + "step": 26391 + }, + { + "epoch": 1.6383388168104787, + "grad_norm": 0.14437918782869205, + "learning_rate": 5.067894567930426e-05, + "loss": 2.7525, + "step": 26392 + }, + { + "epoch": 1.6384008939102364, + "grad_norm": 0.14269387106346243, + "learning_rate": 5.067533448633062e-05, + "loss": 2.7566, + "step": 26393 + }, + { + "epoch": 1.6384629710099943, + "grad_norm": 0.15162594121269762, + "learning_rate": 5.067172328983361e-05, + "loss": 2.7799, + "step": 26394 + }, + { + "epoch": 1.6385250481097522, + "grad_norm": 0.1466777605826724, + "learning_rate": 5.0668112089832035e-05, + "loss": 2.8032, + "step": 26395 + }, + { + "epoch": 1.63858712520951, + "grad_norm": 0.1509560076303384, + "learning_rate": 5.066450088634478e-05, + "loss": 2.8133, + "step": 26396 + }, + { + "epoch": 1.638649202309268, + "grad_norm": 0.14833394001808112, + "learning_rate": 5.0660889679390644e-05, + "loss": 2.7994, + "step": 26397 + }, + { + "epoch": 1.638711279409026, + "grad_norm": 0.1516472409970388, + "learning_rate": 5.06572784689885e-05, + "loss": 2.8516, + "step": 26398 + }, + { + "epoch": 1.6387733565087839, + "grad_norm": 0.14697678138798936, + "learning_rate": 5.0653667255157165e-05, + "loss": 2.7638, + "step": 26399 + }, + { + "epoch": 1.6388354336085418, + "grad_norm": 0.20464291487390773, + "learning_rate": 5.0650056037915496e-05, + "loss": 2.8612, + "step": 26400 + }, + { + "epoch": 1.6388975107082997, + "grad_norm": 0.15715422061877402, + "learning_rate": 5.064644481728232e-05, + "loss": 2.795, + "step": 26401 + }, + { + "epoch": 1.6389595878080576, + "grad_norm": 0.15113186540895998, + "learning_rate": 5.064283359327649e-05, + "loss": 2.7435, + "step": 26402 + }, + { + "epoch": 1.6390216649078155, + "grad_norm": 0.1440354232104878, + "learning_rate": 5.063922236591685e-05, + "loss": 2.7911, + "step": 26403 + }, + { + "epoch": 1.6390837420075735, + "grad_norm": 0.16343892893704462, + "learning_rate": 5.0635611135222214e-05, + "loss": 2.7776, + "step": 26404 + }, + { + "epoch": 1.6391458191073314, + "grad_norm": 0.15776154030552625, + "learning_rate": 5.0631999901211454e-05, + "loss": 2.8213, + "step": 26405 + }, + { + "epoch": 1.6392078962070893, + "grad_norm": 0.17913693585196847, + "learning_rate": 5.062838866390339e-05, + "loss": 2.7692, + "step": 26406 + }, + { + "epoch": 1.6392699733068472, + "grad_norm": 0.13975523486253416, + "learning_rate": 5.062477742331688e-05, + "loss": 2.6611, + "step": 26407 + }, + { + "epoch": 1.6393320504066051, + "grad_norm": 0.18244614980787519, + "learning_rate": 5.062116617947073e-05, + "loss": 2.734, + "step": 26408 + }, + { + "epoch": 1.639394127506363, + "grad_norm": 0.14145160495653886, + "learning_rate": 5.0617554932383825e-05, + "loss": 2.7623, + "step": 26409 + }, + { + "epoch": 1.639456204606121, + "grad_norm": 0.1918283110454018, + "learning_rate": 5.061394368207498e-05, + "loss": 2.8184, + "step": 26410 + }, + { + "epoch": 1.6395182817058787, + "grad_norm": 0.14880078808086455, + "learning_rate": 5.061033242856305e-05, + "loss": 2.7962, + "step": 26411 + }, + { + "epoch": 1.6395803588056366, + "grad_norm": 0.1836632840391853, + "learning_rate": 5.060672117186687e-05, + "loss": 2.7778, + "step": 26412 + }, + { + "epoch": 1.6396424359053945, + "grad_norm": 0.14392518869465182, + "learning_rate": 5.0603109912005255e-05, + "loss": 2.7421, + "step": 26413 + }, + { + "epoch": 1.6397045130051524, + "grad_norm": 0.14694091225941183, + "learning_rate": 5.059949864899709e-05, + "loss": 2.8186, + "step": 26414 + }, + { + "epoch": 1.6397665901049103, + "grad_norm": 0.14816576780971635, + "learning_rate": 5.059588738286118e-05, + "loss": 2.6736, + "step": 26415 + }, + { + "epoch": 1.6398286672046682, + "grad_norm": 0.16132054807590435, + "learning_rate": 5.05922761136164e-05, + "loss": 2.7961, + "step": 26416 + }, + { + "epoch": 1.639890744304426, + "grad_norm": 0.14455137885889813, + "learning_rate": 5.058866484128156e-05, + "loss": 2.8087, + "step": 26417 + }, + { + "epoch": 1.6399528214041839, + "grad_norm": 0.15199792564342846, + "learning_rate": 5.0585053565875515e-05, + "loss": 2.7947, + "step": 26418 + }, + { + "epoch": 1.6400148985039418, + "grad_norm": 0.1529391160399435, + "learning_rate": 5.05814422874171e-05, + "loss": 2.7189, + "step": 26419 + }, + { + "epoch": 1.6400769756036997, + "grad_norm": 0.15082026703188434, + "learning_rate": 5.057783100592517e-05, + "loss": 2.8097, + "step": 26420 + }, + { + "epoch": 1.6401390527034576, + "grad_norm": 0.14625283698036515, + "learning_rate": 5.057421972141855e-05, + "loss": 2.8444, + "step": 26421 + }, + { + "epoch": 1.6402011298032155, + "grad_norm": 0.1576207650970998, + "learning_rate": 5.0570608433916075e-05, + "loss": 2.7798, + "step": 26422 + }, + { + "epoch": 1.6402632069029734, + "grad_norm": 0.14394472295466984, + "learning_rate": 5.056699714343661e-05, + "loss": 2.7667, + "step": 26423 + }, + { + "epoch": 1.6403252840027314, + "grad_norm": 0.15116335157274255, + "learning_rate": 5.0563385849998975e-05, + "loss": 2.8138, + "step": 26424 + }, + { + "epoch": 1.6403873611024893, + "grad_norm": 0.1673240988187604, + "learning_rate": 5.055977455362202e-05, + "loss": 2.7108, + "step": 26425 + }, + { + "epoch": 1.6404494382022472, + "grad_norm": 0.16830265877712225, + "learning_rate": 5.0556163254324584e-05, + "loss": 2.7467, + "step": 26426 + }, + { + "epoch": 1.6405115153020051, + "grad_norm": 0.14997289904510508, + "learning_rate": 5.0552551952125525e-05, + "loss": 2.7136, + "step": 26427 + }, + { + "epoch": 1.640573592401763, + "grad_norm": 0.15934945046634136, + "learning_rate": 5.054894064704367e-05, + "loss": 2.7713, + "step": 26428 + }, + { + "epoch": 1.640635669501521, + "grad_norm": 0.18810670763728102, + "learning_rate": 5.0545329339097834e-05, + "loss": 2.8262, + "step": 26429 + }, + { + "epoch": 1.6406977466012789, + "grad_norm": 0.17452121104684937, + "learning_rate": 5.054171802830689e-05, + "loss": 2.7635, + "step": 26430 + }, + { + "epoch": 1.6407598237010368, + "grad_norm": 0.1582141507392121, + "learning_rate": 5.053810671468968e-05, + "loss": 2.7763, + "step": 26431 + }, + { + "epoch": 1.6408219008007947, + "grad_norm": 0.15686423449033465, + "learning_rate": 5.0534495398265034e-05, + "loss": 2.7902, + "step": 26432 + }, + { + "epoch": 1.6408839779005526, + "grad_norm": 0.14965774884551708, + "learning_rate": 5.0530884079051786e-05, + "loss": 2.7161, + "step": 26433 + }, + { + "epoch": 1.6409460550003105, + "grad_norm": 0.16418803044753685, + "learning_rate": 5.052727275706881e-05, + "loss": 2.8177, + "step": 26434 + }, + { + "epoch": 1.6410081321000682, + "grad_norm": 0.153038464393481, + "learning_rate": 5.05236614323349e-05, + "loss": 2.8114, + "step": 26435 + }, + { + "epoch": 1.6410702091998262, + "grad_norm": 0.1509852728854001, + "learning_rate": 5.0520050104868934e-05, + "loss": 2.8203, + "step": 26436 + }, + { + "epoch": 1.641132286299584, + "grad_norm": 0.18463628084512262, + "learning_rate": 5.0516438774689734e-05, + "loss": 2.8028, + "step": 26437 + }, + { + "epoch": 1.641194363399342, + "grad_norm": 0.14413766496094926, + "learning_rate": 5.0512827441816155e-05, + "loss": 2.7569, + "step": 26438 + }, + { + "epoch": 1.6412564404991, + "grad_norm": 0.16399799509356688, + "learning_rate": 5.050921610626703e-05, + "loss": 2.9017, + "step": 26439 + }, + { + "epoch": 1.6413185175988578, + "grad_norm": 0.19167252916893174, + "learning_rate": 5.0505604768061197e-05, + "loss": 2.7972, + "step": 26440 + }, + { + "epoch": 1.6413805946986155, + "grad_norm": 0.1745976548667167, + "learning_rate": 5.05019934272175e-05, + "loss": 2.8713, + "step": 26441 + }, + { + "epoch": 1.6414426717983734, + "grad_norm": 0.15350423095973204, + "learning_rate": 5.049838208375477e-05, + "loss": 2.7787, + "step": 26442 + }, + { + "epoch": 1.6415047488981314, + "grad_norm": 0.15001962155911885, + "learning_rate": 5.049477073769188e-05, + "loss": 2.7369, + "step": 26443 + }, + { + "epoch": 1.6415668259978893, + "grad_norm": 0.1519212856130242, + "learning_rate": 5.0491159389047637e-05, + "loss": 2.8033, + "step": 26444 + }, + { + "epoch": 1.6416289030976472, + "grad_norm": 0.15156925004498548, + "learning_rate": 5.0487548037840904e-05, + "loss": 2.7456, + "step": 26445 + }, + { + "epoch": 1.6416909801974051, + "grad_norm": 0.19382512228260884, + "learning_rate": 5.0483936684090525e-05, + "loss": 2.6912, + "step": 26446 + }, + { + "epoch": 1.641753057297163, + "grad_norm": 0.190939909315751, + "learning_rate": 5.048032532781532e-05, + "loss": 2.7566, + "step": 26447 + }, + { + "epoch": 1.641815134396921, + "grad_norm": 0.15508211692841706, + "learning_rate": 5.047671396903414e-05, + "loss": 2.7545, + "step": 26448 + }, + { + "epoch": 1.6418772114966789, + "grad_norm": 0.15958935146608338, + "learning_rate": 5.047310260776582e-05, + "loss": 2.7236, + "step": 26449 + }, + { + "epoch": 1.6419392885964368, + "grad_norm": 0.16244777725199883, + "learning_rate": 5.046949124402922e-05, + "loss": 2.8709, + "step": 26450 + }, + { + "epoch": 1.6420013656961947, + "grad_norm": 0.14957313546888817, + "learning_rate": 5.046587987784316e-05, + "loss": 2.7654, + "step": 26451 + }, + { + "epoch": 1.6420634427959526, + "grad_norm": 0.14870475877451758, + "learning_rate": 5.046226850922651e-05, + "loss": 2.7899, + "step": 26452 + }, + { + "epoch": 1.6421255198957105, + "grad_norm": 0.15369115917919415, + "learning_rate": 5.0458657138198075e-05, + "loss": 2.713, + "step": 26453 + }, + { + "epoch": 1.6421875969954685, + "grad_norm": 0.159483017967343, + "learning_rate": 5.045504576477672e-05, + "loss": 2.8465, + "step": 26454 + }, + { + "epoch": 1.6422496740952264, + "grad_norm": 0.15048438406916423, + "learning_rate": 5.0451434388981276e-05, + "loss": 2.8281, + "step": 26455 + }, + { + "epoch": 1.6423117511949843, + "grad_norm": 0.14962138334443012, + "learning_rate": 5.0447823010830595e-05, + "loss": 2.8031, + "step": 26456 + }, + { + "epoch": 1.6423738282947422, + "grad_norm": 0.2025277453712188, + "learning_rate": 5.044421163034352e-05, + "loss": 2.7659, + "step": 26457 + }, + { + "epoch": 1.6424359053945001, + "grad_norm": 0.1608688851723084, + "learning_rate": 5.044060024753886e-05, + "loss": 2.7263, + "step": 26458 + }, + { + "epoch": 1.6424979824942578, + "grad_norm": 0.1769021851436999, + "learning_rate": 5.043698886243551e-05, + "loss": 2.9329, + "step": 26459 + }, + { + "epoch": 1.6425600595940157, + "grad_norm": 0.14479298348022424, + "learning_rate": 5.043337747505227e-05, + "loss": 2.8135, + "step": 26460 + }, + { + "epoch": 1.6426221366937737, + "grad_norm": 0.1599741167594187, + "learning_rate": 5.0429766085408e-05, + "loss": 2.7928, + "step": 26461 + }, + { + "epoch": 1.6426842137935316, + "grad_norm": 0.17437022577777875, + "learning_rate": 5.042615469352152e-05, + "loss": 2.7593, + "step": 26462 + }, + { + "epoch": 1.6427462908932895, + "grad_norm": 0.15000146204514056, + "learning_rate": 5.0422543299411705e-05, + "loss": 2.8793, + "step": 26463 + }, + { + "epoch": 1.6428083679930474, + "grad_norm": 0.14215197957891554, + "learning_rate": 5.041893190309739e-05, + "loss": 2.8051, + "step": 26464 + }, + { + "epoch": 1.642870445092805, + "grad_norm": 0.15895460163984124, + "learning_rate": 5.0415320504597374e-05, + "loss": 2.7113, + "step": 26465 + }, + { + "epoch": 1.642932522192563, + "grad_norm": 0.16603042312192154, + "learning_rate": 5.041170910393056e-05, + "loss": 2.9216, + "step": 26466 + }, + { + "epoch": 1.642994599292321, + "grad_norm": 0.15198016183573954, + "learning_rate": 5.040809770111574e-05, + "loss": 2.8273, + "step": 26467 + }, + { + "epoch": 1.6430566763920789, + "grad_norm": 0.1559451108806685, + "learning_rate": 5.040448629617178e-05, + "loss": 2.776, + "step": 26468 + }, + { + "epoch": 1.6431187534918368, + "grad_norm": 0.15444428609159336, + "learning_rate": 5.040087488911751e-05, + "loss": 2.6893, + "step": 26469 + }, + { + "epoch": 1.6431808305915947, + "grad_norm": 0.14505694128379085, + "learning_rate": 5.039726347997179e-05, + "loss": 2.859, + "step": 26470 + }, + { + "epoch": 1.6432429076913526, + "grad_norm": 0.1769340963848136, + "learning_rate": 5.039365206875345e-05, + "loss": 2.6741, + "step": 26471 + }, + { + "epoch": 1.6433049847911105, + "grad_norm": 0.13814927743282013, + "learning_rate": 5.039004065548133e-05, + "loss": 2.6793, + "step": 26472 + }, + { + "epoch": 1.6433670618908685, + "grad_norm": 0.16742742159899124, + "learning_rate": 5.038642924017426e-05, + "loss": 2.8287, + "step": 26473 + }, + { + "epoch": 1.6434291389906264, + "grad_norm": 0.15025006574837171, + "learning_rate": 5.03828178228511e-05, + "loss": 2.8227, + "step": 26474 + }, + { + "epoch": 1.6434912160903843, + "grad_norm": 0.15624190547957148, + "learning_rate": 5.03792064035307e-05, + "loss": 2.7902, + "step": 26475 + }, + { + "epoch": 1.6435532931901422, + "grad_norm": 0.15334050841254848, + "learning_rate": 5.037559498223188e-05, + "loss": 2.7604, + "step": 26476 + }, + { + "epoch": 1.6436153702899001, + "grad_norm": 0.16482114795111982, + "learning_rate": 5.037198355897349e-05, + "loss": 2.7581, + "step": 26477 + }, + { + "epoch": 1.643677447389658, + "grad_norm": 0.1591811033782709, + "learning_rate": 5.036837213377437e-05, + "loss": 2.7636, + "step": 26478 + }, + { + "epoch": 1.643739524489416, + "grad_norm": 0.1466412657742531, + "learning_rate": 5.036476070665337e-05, + "loss": 2.7441, + "step": 26479 + }, + { + "epoch": 1.6438016015891739, + "grad_norm": 0.1563112436909087, + "learning_rate": 5.036114927762931e-05, + "loss": 2.7103, + "step": 26480 + }, + { + "epoch": 1.6438636786889318, + "grad_norm": 0.1563412399929239, + "learning_rate": 5.035753784672106e-05, + "loss": 2.9297, + "step": 26481 + }, + { + "epoch": 1.6439257557886897, + "grad_norm": 0.15866066855070293, + "learning_rate": 5.035392641394745e-05, + "loss": 2.7035, + "step": 26482 + }, + { + "epoch": 1.6439878328884474, + "grad_norm": 0.1593097962437734, + "learning_rate": 5.0350314979327306e-05, + "loss": 2.7781, + "step": 26483 + }, + { + "epoch": 1.6440499099882053, + "grad_norm": 0.18149073374098137, + "learning_rate": 5.03467035428795e-05, + "loss": 2.7825, + "step": 26484 + }, + { + "epoch": 1.6441119870879632, + "grad_norm": 0.17031790086074627, + "learning_rate": 5.034309210462285e-05, + "loss": 2.7155, + "step": 26485 + }, + { + "epoch": 1.6441740641877212, + "grad_norm": 0.17423006264775007, + "learning_rate": 5.03394806645762e-05, + "loss": 2.8286, + "step": 26486 + }, + { + "epoch": 1.644236141287479, + "grad_norm": 0.17057584015085706, + "learning_rate": 5.0335869222758405e-05, + "loss": 2.8114, + "step": 26487 + }, + { + "epoch": 1.644298218387237, + "grad_norm": 0.1592166658757366, + "learning_rate": 5.033225777918829e-05, + "loss": 2.8363, + "step": 26488 + }, + { + "epoch": 1.6443602954869947, + "grad_norm": 0.15500891243591622, + "learning_rate": 5.032864633388471e-05, + "loss": 2.8472, + "step": 26489 + }, + { + "epoch": 1.6444223725867526, + "grad_norm": 0.1552514774460297, + "learning_rate": 5.03250348868665e-05, + "loss": 2.8586, + "step": 26490 + }, + { + "epoch": 1.6444844496865105, + "grad_norm": 0.1755005396497749, + "learning_rate": 5.032142343815252e-05, + "loss": 2.7683, + "step": 26491 + }, + { + "epoch": 1.6445465267862684, + "grad_norm": 0.153388882750601, + "learning_rate": 5.031781198776157e-05, + "loss": 2.7632, + "step": 26492 + }, + { + "epoch": 1.6446086038860264, + "grad_norm": 0.14320770160627094, + "learning_rate": 5.031420053571253e-05, + "loss": 2.8356, + "step": 26493 + }, + { + "epoch": 1.6446706809857843, + "grad_norm": 0.14856344451491713, + "learning_rate": 5.031058908202423e-05, + "loss": 2.7959, + "step": 26494 + }, + { + "epoch": 1.6447327580855422, + "grad_norm": 0.1440120405194118, + "learning_rate": 5.030697762671551e-05, + "loss": 2.7811, + "step": 26495 + }, + { + "epoch": 1.6447948351853001, + "grad_norm": 0.17714720981092577, + "learning_rate": 5.0303366169805214e-05, + "loss": 2.8027, + "step": 26496 + }, + { + "epoch": 1.644856912285058, + "grad_norm": 0.154427075077805, + "learning_rate": 5.0299754711312184e-05, + "loss": 2.8725, + "step": 26497 + }, + { + "epoch": 1.644918989384816, + "grad_norm": 0.14485876889225974, + "learning_rate": 5.029614325125526e-05, + "loss": 2.795, + "step": 26498 + }, + { + "epoch": 1.6449810664845739, + "grad_norm": 0.17163017565833877, + "learning_rate": 5.0292531789653294e-05, + "loss": 2.8224, + "step": 26499 + }, + { + "epoch": 1.6450431435843318, + "grad_norm": 0.15397537513842982, + "learning_rate": 5.02889203265251e-05, + "loss": 2.8196, + "step": 26500 + }, + { + "epoch": 1.6451052206840897, + "grad_norm": 0.1526074023169553, + "learning_rate": 5.028530886188956e-05, + "loss": 2.7572, + "step": 26501 + }, + { + "epoch": 1.6451672977838476, + "grad_norm": 0.15299820929964422, + "learning_rate": 5.028169739576548e-05, + "loss": 2.8235, + "step": 26502 + }, + { + "epoch": 1.6452293748836055, + "grad_norm": 0.16041077183961, + "learning_rate": 5.027808592817171e-05, + "loss": 2.918, + "step": 26503 + }, + { + "epoch": 1.6452914519833635, + "grad_norm": 0.15653548386373656, + "learning_rate": 5.027447445912712e-05, + "loss": 2.7565, + "step": 26504 + }, + { + "epoch": 1.6453535290831214, + "grad_norm": 0.14705056441654996, + "learning_rate": 5.0270862988650516e-05, + "loss": 2.787, + "step": 26505 + }, + { + "epoch": 1.6454156061828793, + "grad_norm": 0.1485679329467288, + "learning_rate": 5.026725151676076e-05, + "loss": 2.7567, + "step": 26506 + }, + { + "epoch": 1.645477683282637, + "grad_norm": 0.1601265844922581, + "learning_rate": 5.026364004347668e-05, + "loss": 2.8347, + "step": 26507 + }, + { + "epoch": 1.645539760382395, + "grad_norm": 0.15172176669428594, + "learning_rate": 5.026002856881712e-05, + "loss": 2.7554, + "step": 26508 + }, + { + "epoch": 1.6456018374821528, + "grad_norm": 0.15550046194796008, + "learning_rate": 5.0256417092800945e-05, + "loss": 2.7298, + "step": 26509 + }, + { + "epoch": 1.6456639145819107, + "grad_norm": 0.14623275763196744, + "learning_rate": 5.0252805615446965e-05, + "loss": 2.6816, + "step": 26510 + }, + { + "epoch": 1.6457259916816687, + "grad_norm": 0.15638193282532534, + "learning_rate": 5.0249194136774046e-05, + "loss": 2.7919, + "step": 26511 + }, + { + "epoch": 1.6457880687814266, + "grad_norm": 0.15373743015560354, + "learning_rate": 5.024558265680102e-05, + "loss": 2.7527, + "step": 26512 + }, + { + "epoch": 1.6458501458811843, + "grad_norm": 0.14620498079461566, + "learning_rate": 5.024197117554673e-05, + "loss": 2.7122, + "step": 26513 + }, + { + "epoch": 1.6459122229809422, + "grad_norm": 0.14741238053541084, + "learning_rate": 5.0238359693030016e-05, + "loss": 2.7779, + "step": 26514 + }, + { + "epoch": 1.6459743000807001, + "grad_norm": 0.18320168573916282, + "learning_rate": 5.023474820926972e-05, + "loss": 2.8423, + "step": 26515 + }, + { + "epoch": 1.646036377180458, + "grad_norm": 0.15178577626952341, + "learning_rate": 5.023113672428468e-05, + "loss": 2.8408, + "step": 26516 + }, + { + "epoch": 1.646098454280216, + "grad_norm": 0.16469360627973034, + "learning_rate": 5.022752523809375e-05, + "loss": 2.8551, + "step": 26517 + }, + { + "epoch": 1.6461605313799739, + "grad_norm": 0.15591999610291005, + "learning_rate": 5.022391375071577e-05, + "loss": 2.7633, + "step": 26518 + }, + { + "epoch": 1.6462226084797318, + "grad_norm": 0.15886655678399947, + "learning_rate": 5.022030226216958e-05, + "loss": 2.8556, + "step": 26519 + }, + { + "epoch": 1.6462846855794897, + "grad_norm": 0.1540020649279215, + "learning_rate": 5.021669077247402e-05, + "loss": 2.8093, + "step": 26520 + }, + { + "epoch": 1.6463467626792476, + "grad_norm": 0.15349695635746277, + "learning_rate": 5.021307928164791e-05, + "loss": 2.8204, + "step": 26521 + }, + { + "epoch": 1.6464088397790055, + "grad_norm": 0.16787819306439375, + "learning_rate": 5.020946778971014e-05, + "loss": 2.7423, + "step": 26522 + }, + { + "epoch": 1.6464709168787635, + "grad_norm": 0.15079905424597756, + "learning_rate": 5.0205856296679496e-05, + "loss": 2.7621, + "step": 26523 + }, + { + "epoch": 1.6465329939785214, + "grad_norm": 0.1500299140784886, + "learning_rate": 5.020224480257487e-05, + "loss": 2.8166, + "step": 26524 + }, + { + "epoch": 1.6465950710782793, + "grad_norm": 0.15597441330978384, + "learning_rate": 5.019863330741509e-05, + "loss": 2.7961, + "step": 26525 + }, + { + "epoch": 1.6466571481780372, + "grad_norm": 0.15097112830919543, + "learning_rate": 5.019502181121898e-05, + "loss": 2.8134, + "step": 26526 + }, + { + "epoch": 1.6467192252777951, + "grad_norm": 0.1920948778432438, + "learning_rate": 5.01914103140054e-05, + "loss": 2.7122, + "step": 26527 + }, + { + "epoch": 1.646781302377553, + "grad_norm": 0.17480655221637112, + "learning_rate": 5.018779881579319e-05, + "loss": 2.8069, + "step": 26528 + }, + { + "epoch": 1.646843379477311, + "grad_norm": 0.16862490485452353, + "learning_rate": 5.018418731660118e-05, + "loss": 2.8104, + "step": 26529 + }, + { + "epoch": 1.6469054565770689, + "grad_norm": 0.17252682499737718, + "learning_rate": 5.018057581644822e-05, + "loss": 2.7642, + "step": 26530 + }, + { + "epoch": 1.6469675336768266, + "grad_norm": 0.16546227292972288, + "learning_rate": 5.017696431535316e-05, + "loss": 2.7949, + "step": 26531 + }, + { + "epoch": 1.6470296107765845, + "grad_norm": 0.19937427844990238, + "learning_rate": 5.017335281333482e-05, + "loss": 2.7918, + "step": 26532 + }, + { + "epoch": 1.6470916878763424, + "grad_norm": 0.15330041768447314, + "learning_rate": 5.016974131041206e-05, + "loss": 2.7866, + "step": 26533 + }, + { + "epoch": 1.6471537649761003, + "grad_norm": 0.15560676376897495, + "learning_rate": 5.0166129806603726e-05, + "loss": 2.8272, + "step": 26534 + }, + { + "epoch": 1.6472158420758582, + "grad_norm": 0.15781809612776385, + "learning_rate": 5.016251830192865e-05, + "loss": 2.8175, + "step": 26535 + }, + { + "epoch": 1.6472779191756162, + "grad_norm": 0.15702677407219853, + "learning_rate": 5.015890679640567e-05, + "loss": 2.7336, + "step": 26536 + }, + { + "epoch": 1.6473399962753739, + "grad_norm": 0.16634711661077156, + "learning_rate": 5.015529529005365e-05, + "loss": 2.8011, + "step": 26537 + }, + { + "epoch": 1.6474020733751318, + "grad_norm": 0.1931211661140351, + "learning_rate": 5.0151683782891414e-05, + "loss": 2.7266, + "step": 26538 + }, + { + "epoch": 1.6474641504748897, + "grad_norm": 0.16649364764250046, + "learning_rate": 5.01480722749378e-05, + "loss": 2.8163, + "step": 26539 + }, + { + "epoch": 1.6475262275746476, + "grad_norm": 0.15061429346781657, + "learning_rate": 5.014446076621164e-05, + "loss": 2.881, + "step": 26540 + }, + { + "epoch": 1.6475883046744055, + "grad_norm": 0.17220621814689416, + "learning_rate": 5.0140849256731804e-05, + "loss": 2.7419, + "step": 26541 + }, + { + "epoch": 1.6476503817741635, + "grad_norm": 0.1433037816389059, + "learning_rate": 5.013723774651714e-05, + "loss": 2.6986, + "step": 26542 + }, + { + "epoch": 1.6477124588739214, + "grad_norm": 0.17539261665997574, + "learning_rate": 5.013362623558645e-05, + "loss": 2.7433, + "step": 26543 + }, + { + "epoch": 1.6477745359736793, + "grad_norm": 0.15962793274077003, + "learning_rate": 5.013001472395862e-05, + "loss": 2.7395, + "step": 26544 + }, + { + "epoch": 1.6478366130734372, + "grad_norm": 0.178392209436511, + "learning_rate": 5.0126403211652476e-05, + "loss": 2.7503, + "step": 26545 + }, + { + "epoch": 1.6478986901731951, + "grad_norm": 0.16534379330304474, + "learning_rate": 5.012279169868683e-05, + "loss": 2.7444, + "step": 26546 + }, + { + "epoch": 1.647960767272953, + "grad_norm": 0.16586976094729297, + "learning_rate": 5.011918018508057e-05, + "loss": 2.7304, + "step": 26547 + }, + { + "epoch": 1.648022844372711, + "grad_norm": 0.154943168925361, + "learning_rate": 5.011556867085251e-05, + "loss": 2.6882, + "step": 26548 + }, + { + "epoch": 1.6480849214724689, + "grad_norm": 0.1602951819563502, + "learning_rate": 5.0111957156021503e-05, + "loss": 2.7896, + "step": 26549 + }, + { + "epoch": 1.6481469985722268, + "grad_norm": 0.18054492218136758, + "learning_rate": 5.010834564060638e-05, + "loss": 2.7793, + "step": 26550 + }, + { + "epoch": 1.6482090756719847, + "grad_norm": 0.17448426293763805, + "learning_rate": 5.0104734124626005e-05, + "loss": 2.8733, + "step": 26551 + }, + { + "epoch": 1.6482711527717426, + "grad_norm": 0.18193894161896826, + "learning_rate": 5.010112260809919e-05, + "loss": 2.8261, + "step": 26552 + }, + { + "epoch": 1.6483332298715005, + "grad_norm": 0.16554596045125797, + "learning_rate": 5.0097511091044816e-05, + "loss": 2.7775, + "step": 26553 + }, + { + "epoch": 1.6483953069712585, + "grad_norm": 0.15290303759315987, + "learning_rate": 5.009389957348169e-05, + "loss": 2.7828, + "step": 26554 + }, + { + "epoch": 1.6484573840710162, + "grad_norm": 0.17083425679092942, + "learning_rate": 5.009028805542867e-05, + "loss": 2.6544, + "step": 26555 + }, + { + "epoch": 1.648519461170774, + "grad_norm": 0.15801921798602958, + "learning_rate": 5.0086676536904584e-05, + "loss": 2.7293, + "step": 26556 + }, + { + "epoch": 1.648581538270532, + "grad_norm": 0.17412955156068044, + "learning_rate": 5.00830650179283e-05, + "loss": 2.7918, + "step": 26557 + }, + { + "epoch": 1.64864361537029, + "grad_norm": 0.15260123232688255, + "learning_rate": 5.007945349851864e-05, + "loss": 2.9261, + "step": 26558 + }, + { + "epoch": 1.6487056924700478, + "grad_norm": 0.1508861712833119, + "learning_rate": 5.0075841978694446e-05, + "loss": 2.7639, + "step": 26559 + }, + { + "epoch": 1.6487677695698058, + "grad_norm": 0.1587197439861474, + "learning_rate": 5.007223045847458e-05, + "loss": 2.8672, + "step": 26560 + }, + { + "epoch": 1.6488298466695634, + "grad_norm": 0.1511397899418082, + "learning_rate": 5.0068618937877854e-05, + "loss": 2.8673, + "step": 26561 + }, + { + "epoch": 1.6488919237693214, + "grad_norm": 0.15987472679630824, + "learning_rate": 5.006500741692314e-05, + "loss": 2.741, + "step": 26562 + }, + { + "epoch": 1.6489540008690793, + "grad_norm": 0.1525950132614959, + "learning_rate": 5.006139589562927e-05, + "loss": 2.7435, + "step": 26563 + }, + { + "epoch": 1.6490160779688372, + "grad_norm": 0.156578231809203, + "learning_rate": 5.005778437401507e-05, + "loss": 2.819, + "step": 26564 + }, + { + "epoch": 1.6490781550685951, + "grad_norm": 0.1851684727235896, + "learning_rate": 5.00541728520994e-05, + "loss": 2.8144, + "step": 26565 + }, + { + "epoch": 1.649140232168353, + "grad_norm": 0.1533526975603408, + "learning_rate": 5.005056132990109e-05, + "loss": 2.7576, + "step": 26566 + }, + { + "epoch": 1.649202309268111, + "grad_norm": 0.1921040504992883, + "learning_rate": 5.0046949807439006e-05, + "loss": 2.8152, + "step": 26567 + }, + { + "epoch": 1.6492643863678689, + "grad_norm": 0.1638137259588464, + "learning_rate": 5.0043338284731955e-05, + "loss": 2.7311, + "step": 26568 + }, + { + "epoch": 1.6493264634676268, + "grad_norm": 0.16003576258990249, + "learning_rate": 5.003972676179881e-05, + "loss": 2.8162, + "step": 26569 + }, + { + "epoch": 1.6493885405673847, + "grad_norm": 0.1610064849406376, + "learning_rate": 5.0036115238658385e-05, + "loss": 2.831, + "step": 26570 + }, + { + "epoch": 1.6494506176671426, + "grad_norm": 0.1702231498700775, + "learning_rate": 5.003250371532956e-05, + "loss": 2.6983, + "step": 26571 + }, + { + "epoch": 1.6495126947669005, + "grad_norm": 0.1760736364946349, + "learning_rate": 5.002889219183113e-05, + "loss": 2.7741, + "step": 26572 + }, + { + "epoch": 1.6495747718666585, + "grad_norm": 0.15239526415666668, + "learning_rate": 5.002528066818198e-05, + "loss": 2.7636, + "step": 26573 + }, + { + "epoch": 1.6496368489664164, + "grad_norm": 0.17659560817273506, + "learning_rate": 5.002166914440094e-05, + "loss": 2.8399, + "step": 26574 + }, + { + "epoch": 1.6496989260661743, + "grad_norm": 0.1457138522906195, + "learning_rate": 5.001805762050683e-05, + "loss": 2.713, + "step": 26575 + }, + { + "epoch": 1.6497610031659322, + "grad_norm": 0.1542598402267262, + "learning_rate": 5.0014446096518516e-05, + "loss": 2.8194, + "step": 26576 + }, + { + "epoch": 1.6498230802656901, + "grad_norm": 0.1894596236382647, + "learning_rate": 5.0010834572454836e-05, + "loss": 2.9082, + "step": 26577 + }, + { + "epoch": 1.649885157365448, + "grad_norm": 0.18100976602739197, + "learning_rate": 5.000722304833463e-05, + "loss": 2.8802, + "step": 26578 + }, + { + "epoch": 1.6499472344652057, + "grad_norm": 0.16194413277467182, + "learning_rate": 5.000361152417673e-05, + "loss": 2.7779, + "step": 26579 + }, + { + "epoch": 1.6500093115649637, + "grad_norm": 0.15947729502260063, + "learning_rate": 5e-05, + "loss": 2.8532, + "step": 26580 + }, + { + "epoch": 1.6500713886647216, + "grad_norm": 0.15812839697592818, + "learning_rate": 4.9996388475823274e-05, + "loss": 2.8619, + "step": 26581 + }, + { + "epoch": 1.6501334657644795, + "grad_norm": 0.14206911047195445, + "learning_rate": 4.999277695166538e-05, + "loss": 2.8426, + "step": 26582 + }, + { + "epoch": 1.6501955428642374, + "grad_norm": 0.15603730507150662, + "learning_rate": 4.9989165427545176e-05, + "loss": 2.8296, + "step": 26583 + }, + { + "epoch": 1.6502576199639953, + "grad_norm": 0.14650376509443633, + "learning_rate": 4.998555390348149e-05, + "loss": 2.7897, + "step": 26584 + }, + { + "epoch": 1.650319697063753, + "grad_norm": 0.15866218884803546, + "learning_rate": 4.998194237949318e-05, + "loss": 2.714, + "step": 26585 + }, + { + "epoch": 1.650381774163511, + "grad_norm": 0.15319312341656935, + "learning_rate": 4.997833085559908e-05, + "loss": 2.8206, + "step": 26586 + }, + { + "epoch": 1.6504438512632689, + "grad_norm": 0.15166046280102227, + "learning_rate": 4.9974719331818035e-05, + "loss": 2.7969, + "step": 26587 + }, + { + "epoch": 1.6505059283630268, + "grad_norm": 0.14572432110710595, + "learning_rate": 4.997110780816887e-05, + "loss": 2.7845, + "step": 26588 + }, + { + "epoch": 1.6505680054627847, + "grad_norm": 0.16612610226445595, + "learning_rate": 4.996749628467046e-05, + "loss": 2.8242, + "step": 26589 + }, + { + "epoch": 1.6506300825625426, + "grad_norm": 0.1425316029251336, + "learning_rate": 4.996388476134162e-05, + "loss": 2.6979, + "step": 26590 + }, + { + "epoch": 1.6506921596623005, + "grad_norm": 0.14251619457151551, + "learning_rate": 4.996027323820121e-05, + "loss": 2.8086, + "step": 26591 + }, + { + "epoch": 1.6507542367620585, + "grad_norm": 0.1477075182500373, + "learning_rate": 4.9956661715268064e-05, + "loss": 2.7326, + "step": 26592 + }, + { + "epoch": 1.6508163138618164, + "grad_norm": 0.16562841362404337, + "learning_rate": 4.995305019256102e-05, + "loss": 2.6849, + "step": 26593 + }, + { + "epoch": 1.6508783909615743, + "grad_norm": 0.14884547585498534, + "learning_rate": 4.994943867009893e-05, + "loss": 2.7747, + "step": 26594 + }, + { + "epoch": 1.6509404680613322, + "grad_norm": 0.1640531957349518, + "learning_rate": 4.994582714790062e-05, + "loss": 2.832, + "step": 26595 + }, + { + "epoch": 1.6510025451610901, + "grad_norm": 0.16728436413614717, + "learning_rate": 4.994221562598495e-05, + "loss": 2.7592, + "step": 26596 + }, + { + "epoch": 1.651064622260848, + "grad_norm": 0.15003755541389419, + "learning_rate": 4.9938604104370737e-05, + "loss": 2.8521, + "step": 26597 + }, + { + "epoch": 1.651126699360606, + "grad_norm": 0.17234947538333362, + "learning_rate": 4.9934992583076854e-05, + "loss": 2.9157, + "step": 26598 + }, + { + "epoch": 1.6511887764603639, + "grad_norm": 0.17733784342268927, + "learning_rate": 4.9931381062122144e-05, + "loss": 2.8547, + "step": 26599 + }, + { + "epoch": 1.6512508535601218, + "grad_norm": 0.15164998616681663, + "learning_rate": 4.9927769541525424e-05, + "loss": 2.7746, + "step": 26600 + }, + { + "epoch": 1.6513129306598797, + "grad_norm": 0.14433887282549768, + "learning_rate": 4.992415802130555e-05, + "loss": 2.8712, + "step": 26601 + }, + { + "epoch": 1.6513750077596374, + "grad_norm": 0.14618449350855928, + "learning_rate": 4.992054650148136e-05, + "loss": 2.7948, + "step": 26602 + }, + { + "epoch": 1.6514370848593953, + "grad_norm": 0.1619664159327291, + "learning_rate": 4.9916934982071706e-05, + "loss": 2.8627, + "step": 26603 + }, + { + "epoch": 1.6514991619591533, + "grad_norm": 0.15845008106992495, + "learning_rate": 4.9913323463095414e-05, + "loss": 2.7867, + "step": 26604 + }, + { + "epoch": 1.6515612390589112, + "grad_norm": 0.14979505271714413, + "learning_rate": 4.9909711944571343e-05, + "loss": 2.8877, + "step": 26605 + }, + { + "epoch": 1.651623316158669, + "grad_norm": 0.16305404054207137, + "learning_rate": 4.990610042651832e-05, + "loss": 2.847, + "step": 26606 + }, + { + "epoch": 1.651685393258427, + "grad_norm": 0.1674586801841727, + "learning_rate": 4.990248890895519e-05, + "loss": 2.7682, + "step": 26607 + }, + { + "epoch": 1.6517474703581847, + "grad_norm": 0.1439879942908431, + "learning_rate": 4.9898877391900814e-05, + "loss": 2.7438, + "step": 26608 + }, + { + "epoch": 1.6518095474579426, + "grad_norm": 0.16179186393717743, + "learning_rate": 4.9895265875374006e-05, + "loss": 2.8276, + "step": 26609 + }, + { + "epoch": 1.6518716245577005, + "grad_norm": 0.15349945002559834, + "learning_rate": 4.989165435939362e-05, + "loss": 2.7894, + "step": 26610 + }, + { + "epoch": 1.6519337016574585, + "grad_norm": 0.15654484229534318, + "learning_rate": 4.988804284397851e-05, + "loss": 2.7735, + "step": 26611 + }, + { + "epoch": 1.6519957787572164, + "grad_norm": 0.1551720210028501, + "learning_rate": 4.98844313291475e-05, + "loss": 2.8004, + "step": 26612 + }, + { + "epoch": 1.6520578558569743, + "grad_norm": 0.15966624221101594, + "learning_rate": 4.988081981491944e-05, + "loss": 2.6685, + "step": 26613 + }, + { + "epoch": 1.6521199329567322, + "grad_norm": 0.16106622071274337, + "learning_rate": 4.987720830131318e-05, + "loss": 2.7238, + "step": 26614 + }, + { + "epoch": 1.6521820100564901, + "grad_norm": 0.1573677258563978, + "learning_rate": 4.9873596788347535e-05, + "loss": 2.811, + "step": 26615 + }, + { + "epoch": 1.652244087156248, + "grad_norm": 0.14030439865326422, + "learning_rate": 4.9869985276041384e-05, + "loss": 2.7173, + "step": 26616 + }, + { + "epoch": 1.652306164256006, + "grad_norm": 0.15357892282115534, + "learning_rate": 4.986637376441356e-05, + "loss": 2.815, + "step": 26617 + }, + { + "epoch": 1.6523682413557639, + "grad_norm": 0.146597583258333, + "learning_rate": 4.986276225348287e-05, + "loss": 2.8387, + "step": 26618 + }, + { + "epoch": 1.6524303184555218, + "grad_norm": 0.15203791878159667, + "learning_rate": 4.98591507432682e-05, + "loss": 2.7884, + "step": 26619 + }, + { + "epoch": 1.6524923955552797, + "grad_norm": 0.14483845770013812, + "learning_rate": 4.985553923378837e-05, + "loss": 2.7535, + "step": 26620 + }, + { + "epoch": 1.6525544726550376, + "grad_norm": 0.14826381711513897, + "learning_rate": 4.985192772506223e-05, + "loss": 2.7773, + "step": 26621 + }, + { + "epoch": 1.6526165497547955, + "grad_norm": 0.164848816147465, + "learning_rate": 4.984831621710861e-05, + "loss": 2.7983, + "step": 26622 + }, + { + "epoch": 1.6526786268545535, + "grad_norm": 0.15321405336774122, + "learning_rate": 4.984470470994637e-05, + "loss": 2.8286, + "step": 26623 + }, + { + "epoch": 1.6527407039543114, + "grad_norm": 0.14119357824139078, + "learning_rate": 4.984109320359434e-05, + "loss": 2.7103, + "step": 26624 + }, + { + "epoch": 1.6528027810540693, + "grad_norm": 0.15712346671523938, + "learning_rate": 4.983748169807137e-05, + "loss": 2.8472, + "step": 26625 + }, + { + "epoch": 1.652864858153827, + "grad_norm": 0.1413812831313864, + "learning_rate": 4.9833870193396286e-05, + "loss": 2.7167, + "step": 26626 + }, + { + "epoch": 1.652926935253585, + "grad_norm": 0.14586717278263528, + "learning_rate": 4.9830258689587955e-05, + "loss": 2.7821, + "step": 26627 + }, + { + "epoch": 1.6529890123533428, + "grad_norm": 0.172128485026916, + "learning_rate": 4.9826647186665194e-05, + "loss": 2.7744, + "step": 26628 + }, + { + "epoch": 1.6530510894531008, + "grad_norm": 0.16111170890633714, + "learning_rate": 4.982303568464687e-05, + "loss": 2.7326, + "step": 26629 + }, + { + "epoch": 1.6531131665528587, + "grad_norm": 0.1486754705843222, + "learning_rate": 4.981942418355178e-05, + "loss": 2.7553, + "step": 26630 + }, + { + "epoch": 1.6531752436526166, + "grad_norm": 0.15919072750632887, + "learning_rate": 4.981581268339883e-05, + "loss": 2.6671, + "step": 26631 + }, + { + "epoch": 1.6532373207523743, + "grad_norm": 0.14819965003549193, + "learning_rate": 4.9812201184206816e-05, + "loss": 2.7967, + "step": 26632 + }, + { + "epoch": 1.6532993978521322, + "grad_norm": 0.17962265917952394, + "learning_rate": 4.98085896859946e-05, + "loss": 2.8073, + "step": 26633 + }, + { + "epoch": 1.6533614749518901, + "grad_norm": 0.15713178253860455, + "learning_rate": 4.980497818878101e-05, + "loss": 2.8586, + "step": 26634 + }, + { + "epoch": 1.653423552051648, + "grad_norm": 0.15977608522901046, + "learning_rate": 4.980136669258491e-05, + "loss": 2.7588, + "step": 26635 + }, + { + "epoch": 1.653485629151406, + "grad_norm": 0.1801141773138628, + "learning_rate": 4.979775519742512e-05, + "loss": 2.7034, + "step": 26636 + }, + { + "epoch": 1.6535477062511639, + "grad_norm": 0.15095325600978013, + "learning_rate": 4.97941437033205e-05, + "loss": 2.8308, + "step": 26637 + }, + { + "epoch": 1.6536097833509218, + "grad_norm": 0.146865178094317, + "learning_rate": 4.979053221028987e-05, + "loss": 2.7622, + "step": 26638 + }, + { + "epoch": 1.6536718604506797, + "grad_norm": 0.15663956120955053, + "learning_rate": 4.978692071835209e-05, + "loss": 2.7205, + "step": 26639 + }, + { + "epoch": 1.6537339375504376, + "grad_norm": 0.14850924038194308, + "learning_rate": 4.978330922752599e-05, + "loss": 2.8025, + "step": 26640 + }, + { + "epoch": 1.6537960146501955, + "grad_norm": 0.14676681628008245, + "learning_rate": 4.9779697737830436e-05, + "loss": 2.8435, + "step": 26641 + }, + { + "epoch": 1.6538580917499535, + "grad_norm": 0.14052952068198796, + "learning_rate": 4.9776086249284244e-05, + "loss": 2.6218, + "step": 26642 + }, + { + "epoch": 1.6539201688497114, + "grad_norm": 0.1510434040978135, + "learning_rate": 4.977247476190625e-05, + "loss": 2.8298, + "step": 26643 + }, + { + "epoch": 1.6539822459494693, + "grad_norm": 0.14694328901844797, + "learning_rate": 4.9768863275715326e-05, + "loss": 2.7985, + "step": 26644 + }, + { + "epoch": 1.6540443230492272, + "grad_norm": 0.18775407147278733, + "learning_rate": 4.976525179073029e-05, + "loss": 2.7611, + "step": 26645 + }, + { + "epoch": 1.6541064001489851, + "grad_norm": 0.14183101479516674, + "learning_rate": 4.976164030697e-05, + "loss": 2.7753, + "step": 26646 + }, + { + "epoch": 1.654168477248743, + "grad_norm": 0.15693797406277293, + "learning_rate": 4.975802882445328e-05, + "loss": 2.7449, + "step": 26647 + }, + { + "epoch": 1.654230554348501, + "grad_norm": 0.14334262703297065, + "learning_rate": 4.975441734319899e-05, + "loss": 2.7043, + "step": 26648 + }, + { + "epoch": 1.6542926314482589, + "grad_norm": 0.15490230856108514, + "learning_rate": 4.9750805863225966e-05, + "loss": 2.851, + "step": 26649 + }, + { + "epoch": 1.6543547085480166, + "grad_norm": 0.14730691128683135, + "learning_rate": 4.974719438455305e-05, + "loss": 2.7386, + "step": 26650 + }, + { + "epoch": 1.6544167856477745, + "grad_norm": 0.15914088636366552, + "learning_rate": 4.9743582907199066e-05, + "loss": 2.7213, + "step": 26651 + }, + { + "epoch": 1.6544788627475324, + "grad_norm": 0.16189277495182744, + "learning_rate": 4.973997143118289e-05, + "loss": 2.8591, + "step": 26652 + }, + { + "epoch": 1.6545409398472903, + "grad_norm": 0.15849109625326677, + "learning_rate": 4.9736359956523335e-05, + "loss": 2.762, + "step": 26653 + }, + { + "epoch": 1.6546030169470483, + "grad_norm": 0.16417435965287527, + "learning_rate": 4.9732748483239266e-05, + "loss": 2.8284, + "step": 26654 + }, + { + "epoch": 1.6546650940468062, + "grad_norm": 0.1539854637332015, + "learning_rate": 4.972913701134951e-05, + "loss": 2.9052, + "step": 26655 + }, + { + "epoch": 1.6547271711465639, + "grad_norm": 0.17108875032200638, + "learning_rate": 4.97255255408729e-05, + "loss": 2.7637, + "step": 26656 + }, + { + "epoch": 1.6547892482463218, + "grad_norm": 0.15569068221596158, + "learning_rate": 4.97219140718283e-05, + "loss": 2.7923, + "step": 26657 + }, + { + "epoch": 1.6548513253460797, + "grad_norm": 0.18340400264856094, + "learning_rate": 4.971830260423454e-05, + "loss": 2.7808, + "step": 26658 + }, + { + "epoch": 1.6549134024458376, + "grad_norm": 0.15114490431082275, + "learning_rate": 4.9714691138110466e-05, + "loss": 2.8814, + "step": 26659 + }, + { + "epoch": 1.6549754795455955, + "grad_norm": 0.15135451233000205, + "learning_rate": 4.971107967347491e-05, + "loss": 2.7338, + "step": 26660 + }, + { + "epoch": 1.6550375566453535, + "grad_norm": 0.15865808577280377, + "learning_rate": 4.970746821034673e-05, + "loss": 2.7119, + "step": 26661 + }, + { + "epoch": 1.6550996337451114, + "grad_norm": 0.15551750626255453, + "learning_rate": 4.970385674874474e-05, + "loss": 2.8053, + "step": 26662 + }, + { + "epoch": 1.6551617108448693, + "grad_norm": 0.15329092801112149, + "learning_rate": 4.970024528868781e-05, + "loss": 2.7655, + "step": 26663 + }, + { + "epoch": 1.6552237879446272, + "grad_norm": 0.18101066128144813, + "learning_rate": 4.9696633830194784e-05, + "loss": 2.8314, + "step": 26664 + }, + { + "epoch": 1.6552858650443851, + "grad_norm": 0.150309757685479, + "learning_rate": 4.9693022373284486e-05, + "loss": 2.8188, + "step": 26665 + }, + { + "epoch": 1.655347942144143, + "grad_norm": 0.15088403563437486, + "learning_rate": 4.968941091797577e-05, + "loss": 2.7037, + "step": 26666 + }, + { + "epoch": 1.655410019243901, + "grad_norm": 0.14813041292835008, + "learning_rate": 4.968579946428748e-05, + "loss": 2.7495, + "step": 26667 + }, + { + "epoch": 1.6554720963436589, + "grad_norm": 0.1462068917756571, + "learning_rate": 4.968218801223844e-05, + "loss": 2.7899, + "step": 26668 + }, + { + "epoch": 1.6555341734434168, + "grad_norm": 0.13859352396717584, + "learning_rate": 4.96785765618475e-05, + "loss": 2.8316, + "step": 26669 + }, + { + "epoch": 1.6555962505431747, + "grad_norm": 0.14365370572791053, + "learning_rate": 4.96749651131335e-05, + "loss": 2.7588, + "step": 26670 + }, + { + "epoch": 1.6556583276429326, + "grad_norm": 0.1575632872945779, + "learning_rate": 4.96713536661153e-05, + "loss": 2.8871, + "step": 26671 + }, + { + "epoch": 1.6557204047426906, + "grad_norm": 0.17321727878965007, + "learning_rate": 4.966774222081171e-05, + "loss": 2.804, + "step": 26672 + }, + { + "epoch": 1.6557824818424485, + "grad_norm": 0.18034109166588363, + "learning_rate": 4.966413077724161e-05, + "loss": 2.8757, + "step": 26673 + }, + { + "epoch": 1.6558445589422062, + "grad_norm": 0.15052210646116218, + "learning_rate": 4.96605193354238e-05, + "loss": 2.78, + "step": 26674 + }, + { + "epoch": 1.655906636041964, + "grad_norm": 0.15768963284548812, + "learning_rate": 4.965690789537717e-05, + "loss": 2.8113, + "step": 26675 + }, + { + "epoch": 1.655968713141722, + "grad_norm": 0.1445704548733785, + "learning_rate": 4.965329645712051e-05, + "loss": 2.7346, + "step": 26676 + }, + { + "epoch": 1.65603079024148, + "grad_norm": 0.1416900884209289, + "learning_rate": 4.9649685020672705e-05, + "loss": 2.7483, + "step": 26677 + }, + { + "epoch": 1.6560928673412378, + "grad_norm": 0.1744943559971726, + "learning_rate": 4.964607358605256e-05, + "loss": 2.672, + "step": 26678 + }, + { + "epoch": 1.6561549444409958, + "grad_norm": 0.1386778991357833, + "learning_rate": 4.9642462153278954e-05, + "loss": 2.8276, + "step": 26679 + }, + { + "epoch": 1.6562170215407535, + "grad_norm": 0.14380520740082844, + "learning_rate": 4.96388507223707e-05, + "loss": 2.765, + "step": 26680 + }, + { + "epoch": 1.6562790986405114, + "grad_norm": 0.14516730437875922, + "learning_rate": 4.963523929334665e-05, + "loss": 2.7763, + "step": 26681 + }, + { + "epoch": 1.6563411757402693, + "grad_norm": 0.14564443101504768, + "learning_rate": 4.9631627866225645e-05, + "loss": 2.7175, + "step": 26682 + }, + { + "epoch": 1.6564032528400272, + "grad_norm": 0.132273886141524, + "learning_rate": 4.962801644102652e-05, + "loss": 2.8113, + "step": 26683 + }, + { + "epoch": 1.6564653299397851, + "grad_norm": 0.14301918995249488, + "learning_rate": 4.962440501776814e-05, + "loss": 2.7477, + "step": 26684 + }, + { + "epoch": 1.656527407039543, + "grad_norm": 0.13745651776774187, + "learning_rate": 4.962079359646931e-05, + "loss": 2.65, + "step": 26685 + }, + { + "epoch": 1.656589484139301, + "grad_norm": 0.16082718503479623, + "learning_rate": 4.9617182177148905e-05, + "loss": 2.811, + "step": 26686 + }, + { + "epoch": 1.6566515612390589, + "grad_norm": 0.17191230304097627, + "learning_rate": 4.961357075982574e-05, + "loss": 2.7859, + "step": 26687 + }, + { + "epoch": 1.6567136383388168, + "grad_norm": 0.14642809740473284, + "learning_rate": 4.9609959344518693e-05, + "loss": 2.7865, + "step": 26688 + }, + { + "epoch": 1.6567757154385747, + "grad_norm": 0.14614727014006182, + "learning_rate": 4.9606347931246564e-05, + "loss": 2.8384, + "step": 26689 + }, + { + "epoch": 1.6568377925383326, + "grad_norm": 0.14209426794341545, + "learning_rate": 4.9602736520028225e-05, + "loss": 2.8436, + "step": 26690 + }, + { + "epoch": 1.6568998696380905, + "grad_norm": 0.1495623320380518, + "learning_rate": 4.959912511088251e-05, + "loss": 2.8234, + "step": 26691 + }, + { + "epoch": 1.6569619467378485, + "grad_norm": 0.1457066169913486, + "learning_rate": 4.9595513703828236e-05, + "loss": 2.8285, + "step": 26692 + }, + { + "epoch": 1.6570240238376064, + "grad_norm": 0.13778016337067667, + "learning_rate": 4.9591902298884284e-05, + "loss": 2.8001, + "step": 26693 + }, + { + "epoch": 1.6570861009373643, + "grad_norm": 0.14371036340713408, + "learning_rate": 4.958829089606947e-05, + "loss": 2.7124, + "step": 26694 + }, + { + "epoch": 1.6571481780371222, + "grad_norm": 0.14168510603344142, + "learning_rate": 4.958467949540262e-05, + "loss": 2.7255, + "step": 26695 + }, + { + "epoch": 1.6572102551368801, + "grad_norm": 0.14798651121384188, + "learning_rate": 4.958106809690263e-05, + "loss": 2.8072, + "step": 26696 + }, + { + "epoch": 1.657272332236638, + "grad_norm": 0.16675761964723826, + "learning_rate": 4.9577456700588287e-05, + "loss": 2.8717, + "step": 26697 + }, + { + "epoch": 1.6573344093363958, + "grad_norm": 0.1485958760266885, + "learning_rate": 4.9573845306478475e-05, + "loss": 2.7454, + "step": 26698 + }, + { + "epoch": 1.6573964864361537, + "grad_norm": 0.1476840926466209, + "learning_rate": 4.9570233914592e-05, + "loss": 2.8438, + "step": 26699 + }, + { + "epoch": 1.6574585635359116, + "grad_norm": 0.15171300700403834, + "learning_rate": 4.956662252494773e-05, + "loss": 2.713, + "step": 26700 + }, + { + "epoch": 1.6575206406356695, + "grad_norm": 0.18660485652629827, + "learning_rate": 4.956301113756449e-05, + "loss": 2.8129, + "step": 26701 + }, + { + "epoch": 1.6575827177354274, + "grad_norm": 0.14406098721841518, + "learning_rate": 4.955939975246113e-05, + "loss": 2.7757, + "step": 26702 + }, + { + "epoch": 1.6576447948351853, + "grad_norm": 0.17219031735761728, + "learning_rate": 4.95557883696565e-05, + "loss": 2.8102, + "step": 26703 + }, + { + "epoch": 1.657706871934943, + "grad_norm": 0.1604339181357527, + "learning_rate": 4.955217698916941e-05, + "loss": 2.722, + "step": 26704 + }, + { + "epoch": 1.657768949034701, + "grad_norm": 0.16299766161548973, + "learning_rate": 4.9548565611018736e-05, + "loss": 2.7012, + "step": 26705 + }, + { + "epoch": 1.6578310261344589, + "grad_norm": 0.14858696966215415, + "learning_rate": 4.954495423522329e-05, + "loss": 2.807, + "step": 26706 + }, + { + "epoch": 1.6578931032342168, + "grad_norm": 0.16187945687480726, + "learning_rate": 4.954134286180194e-05, + "loss": 2.8237, + "step": 26707 + }, + { + "epoch": 1.6579551803339747, + "grad_norm": 0.1400717142577411, + "learning_rate": 4.95377314907735e-05, + "loss": 2.7455, + "step": 26708 + }, + { + "epoch": 1.6580172574337326, + "grad_norm": 0.1758429207172977, + "learning_rate": 4.9534120122156854e-05, + "loss": 2.899, + "step": 26709 + }, + { + "epoch": 1.6580793345334905, + "grad_norm": 0.14423040705568127, + "learning_rate": 4.953050875597079e-05, + "loss": 2.792, + "step": 26710 + }, + { + "epoch": 1.6581414116332485, + "grad_norm": 0.15984456623312732, + "learning_rate": 4.952689739223419e-05, + "loss": 2.8469, + "step": 26711 + }, + { + "epoch": 1.6582034887330064, + "grad_norm": 0.1583856542205618, + "learning_rate": 4.9523286030965874e-05, + "loss": 2.7685, + "step": 26712 + }, + { + "epoch": 1.6582655658327643, + "grad_norm": 0.15459077946855043, + "learning_rate": 4.95196746721847e-05, + "loss": 2.8238, + "step": 26713 + }, + { + "epoch": 1.6583276429325222, + "grad_norm": 0.1481025455251263, + "learning_rate": 4.9516063315909487e-05, + "loss": 2.7457, + "step": 26714 + }, + { + "epoch": 1.6583897200322801, + "grad_norm": 0.14067646912595216, + "learning_rate": 4.95124519621591e-05, + "loss": 2.7026, + "step": 26715 + }, + { + "epoch": 1.658451797132038, + "grad_norm": 0.15474373055516533, + "learning_rate": 4.9508840610952375e-05, + "loss": 2.8393, + "step": 26716 + }, + { + "epoch": 1.658513874231796, + "grad_norm": 0.15319674669728855, + "learning_rate": 4.9505229262308134e-05, + "loss": 2.8604, + "step": 26717 + }, + { + "epoch": 1.6585759513315539, + "grad_norm": 0.1591659142087789, + "learning_rate": 4.950161791624524e-05, + "loss": 2.8545, + "step": 26718 + }, + { + "epoch": 1.6586380284313118, + "grad_norm": 0.1681220663445784, + "learning_rate": 4.949800657278252e-05, + "loss": 2.7217, + "step": 26719 + }, + { + "epoch": 1.6587001055310697, + "grad_norm": 0.1539191076068439, + "learning_rate": 4.949439523193883e-05, + "loss": 2.7688, + "step": 26720 + }, + { + "epoch": 1.6587621826308276, + "grad_norm": 0.18234190362160568, + "learning_rate": 4.949078389373299e-05, + "loss": 2.834, + "step": 26721 + }, + { + "epoch": 1.6588242597305853, + "grad_norm": 0.1621019292753088, + "learning_rate": 4.948717255818386e-05, + "loss": 2.7042, + "step": 26722 + }, + { + "epoch": 1.6588863368303433, + "grad_norm": 0.14685698484969192, + "learning_rate": 4.948356122531028e-05, + "loss": 2.8002, + "step": 26723 + }, + { + "epoch": 1.6589484139301012, + "grad_norm": 0.16709602206304744, + "learning_rate": 4.9479949895131084e-05, + "loss": 2.7283, + "step": 26724 + }, + { + "epoch": 1.659010491029859, + "grad_norm": 0.1483337139899838, + "learning_rate": 4.9476338567665106e-05, + "loss": 2.8159, + "step": 26725 + }, + { + "epoch": 1.659072568129617, + "grad_norm": 0.16406780188939993, + "learning_rate": 4.9472727242931216e-05, + "loss": 2.8877, + "step": 26726 + }, + { + "epoch": 1.659134645229375, + "grad_norm": 0.1539816352255997, + "learning_rate": 4.9469115920948225e-05, + "loss": 2.7603, + "step": 26727 + }, + { + "epoch": 1.6591967223291326, + "grad_norm": 0.14671166905631539, + "learning_rate": 4.946550460173498e-05, + "loss": 2.7944, + "step": 26728 + }, + { + "epoch": 1.6592587994288905, + "grad_norm": 0.15764667821244963, + "learning_rate": 4.9461893285310325e-05, + "loss": 2.7872, + "step": 26729 + }, + { + "epoch": 1.6593208765286485, + "grad_norm": 0.1477875156336759, + "learning_rate": 4.945828197169311e-05, + "loss": 2.9094, + "step": 26730 + }, + { + "epoch": 1.6593829536284064, + "grad_norm": 0.14598982296748322, + "learning_rate": 4.945467066090217e-05, + "loss": 2.9301, + "step": 26731 + }, + { + "epoch": 1.6594450307281643, + "grad_norm": 0.15939823897482652, + "learning_rate": 4.9451059352956345e-05, + "loss": 2.7456, + "step": 26732 + }, + { + "epoch": 1.6595071078279222, + "grad_norm": 0.1456518164694294, + "learning_rate": 4.944744804787448e-05, + "loss": 2.7379, + "step": 26733 + }, + { + "epoch": 1.6595691849276801, + "grad_norm": 0.15223359747320744, + "learning_rate": 4.9443836745675414e-05, + "loss": 2.767, + "step": 26734 + }, + { + "epoch": 1.659631262027438, + "grad_norm": 0.16371538703677038, + "learning_rate": 4.944022544637798e-05, + "loss": 2.8689, + "step": 26735 + }, + { + "epoch": 1.659693339127196, + "grad_norm": 0.14727707952377495, + "learning_rate": 4.943661415000103e-05, + "loss": 2.896, + "step": 26736 + }, + { + "epoch": 1.6597554162269539, + "grad_norm": 0.1504206813155857, + "learning_rate": 4.9433002856563394e-05, + "loss": 2.7105, + "step": 26737 + }, + { + "epoch": 1.6598174933267118, + "grad_norm": 0.1484832504942, + "learning_rate": 4.942939156608393e-05, + "loss": 2.7861, + "step": 26738 + }, + { + "epoch": 1.6598795704264697, + "grad_norm": 0.1492210275665945, + "learning_rate": 4.9425780278581454e-05, + "loss": 2.8294, + "step": 26739 + }, + { + "epoch": 1.6599416475262276, + "grad_norm": 0.15041038457686962, + "learning_rate": 4.942216899407484e-05, + "loss": 2.804, + "step": 26740 + }, + { + "epoch": 1.6600037246259856, + "grad_norm": 0.15207485162803627, + "learning_rate": 4.941855771258291e-05, + "loss": 2.7038, + "step": 26741 + }, + { + "epoch": 1.6600658017257435, + "grad_norm": 0.14976441066968008, + "learning_rate": 4.941494643412449e-05, + "loss": 2.7973, + "step": 26742 + }, + { + "epoch": 1.6601278788255014, + "grad_norm": 0.1601162081183852, + "learning_rate": 4.941133515871845e-05, + "loss": 2.6714, + "step": 26743 + }, + { + "epoch": 1.6601899559252593, + "grad_norm": 0.14691160841507137, + "learning_rate": 4.940772388638361e-05, + "loss": 2.7146, + "step": 26744 + }, + { + "epoch": 1.6602520330250172, + "grad_norm": 0.16034019188287027, + "learning_rate": 4.940411261713882e-05, + "loss": 2.8746, + "step": 26745 + }, + { + "epoch": 1.660314110124775, + "grad_norm": 0.16218870014599246, + "learning_rate": 4.940050135100292e-05, + "loss": 2.8078, + "step": 26746 + }, + { + "epoch": 1.6603761872245328, + "grad_norm": 0.14176717478660664, + "learning_rate": 4.939689008799475e-05, + "loss": 2.8406, + "step": 26747 + }, + { + "epoch": 1.6604382643242908, + "grad_norm": 0.1831549514030654, + "learning_rate": 4.939327882813315e-05, + "loss": 2.786, + "step": 26748 + }, + { + "epoch": 1.6605003414240487, + "grad_norm": 0.14330872724635366, + "learning_rate": 4.938966757143696e-05, + "loss": 2.7223, + "step": 26749 + }, + { + "epoch": 1.6605624185238066, + "grad_norm": 0.1724075876408444, + "learning_rate": 4.9386056317925026e-05, + "loss": 2.7496, + "step": 26750 + }, + { + "epoch": 1.6606244956235645, + "grad_norm": 0.15735004741071296, + "learning_rate": 4.938244506761619e-05, + "loss": 2.7578, + "step": 26751 + }, + { + "epoch": 1.6606865727233222, + "grad_norm": 0.15544539352831302, + "learning_rate": 4.937883382052928e-05, + "loss": 2.695, + "step": 26752 + }, + { + "epoch": 1.6607486498230801, + "grad_norm": 0.15762117870056536, + "learning_rate": 4.9375222576683146e-05, + "loss": 2.7382, + "step": 26753 + }, + { + "epoch": 1.660810726922838, + "grad_norm": 0.1555788946317551, + "learning_rate": 4.937161133609663e-05, + "loss": 2.7715, + "step": 26754 + }, + { + "epoch": 1.660872804022596, + "grad_norm": 0.14219913670846857, + "learning_rate": 4.9368000098788564e-05, + "loss": 2.7201, + "step": 26755 + }, + { + "epoch": 1.6609348811223539, + "grad_norm": 0.1648986602917608, + "learning_rate": 4.9364388864777805e-05, + "loss": 2.7609, + "step": 26756 + }, + { + "epoch": 1.6609969582221118, + "grad_norm": 0.16755802371880463, + "learning_rate": 4.936077763408317e-05, + "loss": 2.8136, + "step": 26757 + }, + { + "epoch": 1.6610590353218697, + "grad_norm": 0.157894834908708, + "learning_rate": 4.9357166406723526e-05, + "loss": 2.799, + "step": 26758 + }, + { + "epoch": 1.6611211124216276, + "grad_norm": 0.1540544029181753, + "learning_rate": 4.935355518271769e-05, + "loss": 2.8742, + "step": 26759 + }, + { + "epoch": 1.6611831895213856, + "grad_norm": 0.17276017418634393, + "learning_rate": 4.934994396208452e-05, + "loss": 2.7408, + "step": 26760 + }, + { + "epoch": 1.6612452666211435, + "grad_norm": 0.16143910744753256, + "learning_rate": 4.934633274484284e-05, + "loss": 2.8721, + "step": 26761 + }, + { + "epoch": 1.6613073437209014, + "grad_norm": 0.14451568751705507, + "learning_rate": 4.9342721531011496e-05, + "loss": 2.7639, + "step": 26762 + }, + { + "epoch": 1.6613694208206593, + "grad_norm": 0.16329973449298546, + "learning_rate": 4.9339110320609354e-05, + "loss": 2.747, + "step": 26763 + }, + { + "epoch": 1.6614314979204172, + "grad_norm": 0.16527745562298504, + "learning_rate": 4.9335499113655226e-05, + "loss": 2.8061, + "step": 26764 + }, + { + "epoch": 1.6614935750201751, + "grad_norm": 0.18925457804999452, + "learning_rate": 4.933188791016797e-05, + "loss": 2.7945, + "step": 26765 + }, + { + "epoch": 1.661555652119933, + "grad_norm": 0.16804327531308344, + "learning_rate": 4.932827671016641e-05, + "loss": 2.8357, + "step": 26766 + }, + { + "epoch": 1.661617729219691, + "grad_norm": 0.17866736542715722, + "learning_rate": 4.9324665513669385e-05, + "loss": 2.837, + "step": 26767 + }, + { + "epoch": 1.661679806319449, + "grad_norm": 0.1472469001724661, + "learning_rate": 4.932105432069576e-05, + "loss": 2.8105, + "step": 26768 + }, + { + "epoch": 1.6617418834192068, + "grad_norm": 0.16874225629593798, + "learning_rate": 4.931744313126434e-05, + "loss": 2.8454, + "step": 26769 + }, + { + "epoch": 1.6618039605189645, + "grad_norm": 0.14298826824428382, + "learning_rate": 4.9313831945394e-05, + "loss": 2.7149, + "step": 26770 + }, + { + "epoch": 1.6618660376187224, + "grad_norm": 0.17329288437588913, + "learning_rate": 4.931022076310356e-05, + "loss": 2.7665, + "step": 26771 + }, + { + "epoch": 1.6619281147184803, + "grad_norm": 0.16983350560579963, + "learning_rate": 4.9306609584411864e-05, + "loss": 2.7592, + "step": 26772 + }, + { + "epoch": 1.6619901918182383, + "grad_norm": 0.17882887278909185, + "learning_rate": 4.930299840933776e-05, + "loss": 2.8808, + "step": 26773 + }, + { + "epoch": 1.6620522689179962, + "grad_norm": 0.17110503107269284, + "learning_rate": 4.929938723790008e-05, + "loss": 2.7244, + "step": 26774 + }, + { + "epoch": 1.662114346017754, + "grad_norm": 0.17092252289122403, + "learning_rate": 4.929577607011766e-05, + "loss": 2.8347, + "step": 26775 + }, + { + "epoch": 1.6621764231175118, + "grad_norm": 0.15840562176198847, + "learning_rate": 4.929216490600936e-05, + "loss": 2.8357, + "step": 26776 + }, + { + "epoch": 1.6622385002172697, + "grad_norm": 0.14978542800900554, + "learning_rate": 4.928855374559401e-05, + "loss": 2.7222, + "step": 26777 + }, + { + "epoch": 1.6623005773170276, + "grad_norm": 0.17546324575220465, + "learning_rate": 4.928494258889043e-05, + "loss": 2.7244, + "step": 26778 + }, + { + "epoch": 1.6623626544167855, + "grad_norm": 0.1603024269214927, + "learning_rate": 4.928133143591749e-05, + "loss": 2.7095, + "step": 26779 + }, + { + "epoch": 1.6624247315165435, + "grad_norm": 0.19613578847311355, + "learning_rate": 4.927772028669401e-05, + "loss": 2.8224, + "step": 26780 + }, + { + "epoch": 1.6624868086163014, + "grad_norm": 0.149178762220783, + "learning_rate": 4.927410914123885e-05, + "loss": 2.8049, + "step": 26781 + }, + { + "epoch": 1.6625488857160593, + "grad_norm": 0.1674176590478121, + "learning_rate": 4.927049799957082e-05, + "loss": 2.8325, + "step": 26782 + }, + { + "epoch": 1.6626109628158172, + "grad_norm": 0.2113259853036128, + "learning_rate": 4.92668868617088e-05, + "loss": 2.7792, + "step": 26783 + }, + { + "epoch": 1.6626730399155751, + "grad_norm": 0.16063683789971464, + "learning_rate": 4.92632757276716e-05, + "loss": 2.787, + "step": 26784 + }, + { + "epoch": 1.662735117015333, + "grad_norm": 0.16934703187777028, + "learning_rate": 4.9259664597478074e-05, + "loss": 2.8093, + "step": 26785 + }, + { + "epoch": 1.662797194115091, + "grad_norm": 0.175138075674294, + "learning_rate": 4.925605347114704e-05, + "loss": 2.7757, + "step": 26786 + }, + { + "epoch": 1.6628592712148489, + "grad_norm": 0.16460137197976046, + "learning_rate": 4.925244234869738e-05, + "loss": 2.7725, + "step": 26787 + }, + { + "epoch": 1.6629213483146068, + "grad_norm": 0.15395004791743447, + "learning_rate": 4.9248831230147905e-05, + "loss": 2.7841, + "step": 26788 + }, + { + "epoch": 1.6629834254143647, + "grad_norm": 0.15865589624860998, + "learning_rate": 4.924522011551745e-05, + "loss": 2.7401, + "step": 26789 + }, + { + "epoch": 1.6630455025141226, + "grad_norm": 0.14982215152886266, + "learning_rate": 4.9241609004824873e-05, + "loss": 2.8327, + "step": 26790 + }, + { + "epoch": 1.6631075796138806, + "grad_norm": 0.14178361042496998, + "learning_rate": 4.9237997898088994e-05, + "loss": 2.7965, + "step": 26791 + }, + { + "epoch": 1.6631696567136385, + "grad_norm": 0.17049628761730237, + "learning_rate": 4.923438679532868e-05, + "loss": 2.7171, + "step": 26792 + }, + { + "epoch": 1.6632317338133964, + "grad_norm": 0.15917098717813102, + "learning_rate": 4.923077569656273e-05, + "loss": 2.7845, + "step": 26793 + }, + { + "epoch": 1.663293810913154, + "grad_norm": 0.14877779127009547, + "learning_rate": 4.922716460181002e-05, + "loss": 2.7877, + "step": 26794 + }, + { + "epoch": 1.663355888012912, + "grad_norm": 0.1390409728864947, + "learning_rate": 4.922355351108939e-05, + "loss": 2.8213, + "step": 26795 + }, + { + "epoch": 1.66341796511267, + "grad_norm": 0.1618150659205214, + "learning_rate": 4.921994242441967e-05, + "loss": 2.7757, + "step": 26796 + }, + { + "epoch": 1.6634800422124278, + "grad_norm": 0.16622923964291533, + "learning_rate": 4.92163313418197e-05, + "loss": 2.8148, + "step": 26797 + }, + { + "epoch": 1.6635421193121858, + "grad_norm": 0.158174176569903, + "learning_rate": 4.921272026330831e-05, + "loss": 2.8651, + "step": 26798 + }, + { + "epoch": 1.6636041964119437, + "grad_norm": 0.16644829859065977, + "learning_rate": 4.920910918890436e-05, + "loss": 2.8053, + "step": 26799 + }, + { + "epoch": 1.6636662735117014, + "grad_norm": 0.15716440737427104, + "learning_rate": 4.9205498118626676e-05, + "loss": 2.7434, + "step": 26800 + }, + { + "epoch": 1.6637283506114593, + "grad_norm": 0.159892836712478, + "learning_rate": 4.9201887052494106e-05, + "loss": 2.7553, + "step": 26801 + }, + { + "epoch": 1.6637904277112172, + "grad_norm": 0.1544057124201286, + "learning_rate": 4.919827599052548e-05, + "loss": 2.8168, + "step": 26802 + }, + { + "epoch": 1.6638525048109751, + "grad_norm": 0.1660808379051922, + "learning_rate": 4.9194664932739644e-05, + "loss": 2.779, + "step": 26803 + }, + { + "epoch": 1.663914581910733, + "grad_norm": 0.144431517962021, + "learning_rate": 4.919105387915544e-05, + "loss": 2.8582, + "step": 26804 + }, + { + "epoch": 1.663976659010491, + "grad_norm": 0.1531567217121593, + "learning_rate": 4.91874428297917e-05, + "loss": 2.8335, + "step": 26805 + }, + { + "epoch": 1.6640387361102489, + "grad_norm": 0.17323665502545696, + "learning_rate": 4.9183831784667266e-05, + "loss": 2.7135, + "step": 26806 + }, + { + "epoch": 1.6641008132100068, + "grad_norm": 0.19536106459512362, + "learning_rate": 4.918022074380098e-05, + "loss": 2.7615, + "step": 26807 + }, + { + "epoch": 1.6641628903097647, + "grad_norm": 0.17749813644680096, + "learning_rate": 4.9176609707211685e-05, + "loss": 2.7863, + "step": 26808 + }, + { + "epoch": 1.6642249674095226, + "grad_norm": 0.16175466244690384, + "learning_rate": 4.917299867491822e-05, + "loss": 2.7809, + "step": 26809 + }, + { + "epoch": 1.6642870445092806, + "grad_norm": 0.1856233338881707, + "learning_rate": 4.916938764693941e-05, + "loss": 2.7552, + "step": 26810 + }, + { + "epoch": 1.6643491216090385, + "grad_norm": 0.17396184514169083, + "learning_rate": 4.916577662329411e-05, + "loss": 2.7873, + "step": 26811 + }, + { + "epoch": 1.6644111987087964, + "grad_norm": 0.16107983729819386, + "learning_rate": 4.916216560400116e-05, + "loss": 2.7525, + "step": 26812 + }, + { + "epoch": 1.6644732758085543, + "grad_norm": 0.15419388605406892, + "learning_rate": 4.9158554589079405e-05, + "loss": 2.8491, + "step": 26813 + }, + { + "epoch": 1.6645353529083122, + "grad_norm": 0.15645666759872434, + "learning_rate": 4.915494357854766e-05, + "loss": 2.7432, + "step": 26814 + }, + { + "epoch": 1.6645974300080701, + "grad_norm": 0.15206122488074403, + "learning_rate": 4.915133257242479e-05, + "loss": 2.7886, + "step": 26815 + }, + { + "epoch": 1.664659507107828, + "grad_norm": 0.18160550998361447, + "learning_rate": 4.914772157072961e-05, + "loss": 2.8586, + "step": 26816 + }, + { + "epoch": 1.664721584207586, + "grad_norm": 0.15697630518836792, + "learning_rate": 4.914411057348099e-05, + "loss": 2.792, + "step": 26817 + }, + { + "epoch": 1.6647836613073437, + "grad_norm": 0.15506722894820257, + "learning_rate": 4.9140499580697734e-05, + "loss": 2.7057, + "step": 26818 + }, + { + "epoch": 1.6648457384071016, + "grad_norm": 0.144701921370476, + "learning_rate": 4.913688859239872e-05, + "loss": 2.7711, + "step": 26819 + }, + { + "epoch": 1.6649078155068595, + "grad_norm": 0.14705466836956455, + "learning_rate": 4.913327760860275e-05, + "loss": 2.7171, + "step": 26820 + }, + { + "epoch": 1.6649698926066174, + "grad_norm": 0.150356053428745, + "learning_rate": 4.9129666629328694e-05, + "loss": 2.7075, + "step": 26821 + }, + { + "epoch": 1.6650319697063753, + "grad_norm": 0.14140196577716246, + "learning_rate": 4.912605565459537e-05, + "loss": 2.7071, + "step": 26822 + }, + { + "epoch": 1.6650940468061333, + "grad_norm": 0.14784061739079593, + "learning_rate": 4.912244468442164e-05, + "loss": 2.8397, + "step": 26823 + }, + { + "epoch": 1.665156123905891, + "grad_norm": 0.1476650719807688, + "learning_rate": 4.911883371882631e-05, + "loss": 2.749, + "step": 26824 + }, + { + "epoch": 1.6652182010056489, + "grad_norm": 0.15688964926381394, + "learning_rate": 4.911522275782825e-05, + "loss": 2.8184, + "step": 26825 + }, + { + "epoch": 1.6652802781054068, + "grad_norm": 0.1664880258364851, + "learning_rate": 4.911161180144627e-05, + "loss": 2.7762, + "step": 26826 + }, + { + "epoch": 1.6653423552051647, + "grad_norm": 0.141714577135017, + "learning_rate": 4.910800084969925e-05, + "loss": 2.6992, + "step": 26827 + }, + { + "epoch": 1.6654044323049226, + "grad_norm": 0.16360876942598016, + "learning_rate": 4.910438990260599e-05, + "loss": 2.6925, + "step": 26828 + }, + { + "epoch": 1.6654665094046806, + "grad_norm": 0.14219162431960458, + "learning_rate": 4.910077896018536e-05, + "loss": 2.7901, + "step": 26829 + }, + { + "epoch": 1.6655285865044385, + "grad_norm": 0.18311463396658428, + "learning_rate": 4.909716802245617e-05, + "loss": 2.8857, + "step": 26830 + }, + { + "epoch": 1.6655906636041964, + "grad_norm": 0.14626948170833637, + "learning_rate": 4.909355708943728e-05, + "loss": 2.8003, + "step": 26831 + }, + { + "epoch": 1.6656527407039543, + "grad_norm": 0.14419032173140553, + "learning_rate": 4.908994616114753e-05, + "loss": 2.7906, + "step": 26832 + }, + { + "epoch": 1.6657148178037122, + "grad_norm": 0.1520247175532413, + "learning_rate": 4.9086335237605746e-05, + "loss": 2.7149, + "step": 26833 + }, + { + "epoch": 1.6657768949034701, + "grad_norm": 0.1400929359333247, + "learning_rate": 4.9082724318830766e-05, + "loss": 2.6825, + "step": 26834 + }, + { + "epoch": 1.665838972003228, + "grad_norm": 0.1612950173274434, + "learning_rate": 4.907911340484145e-05, + "loss": 2.7851, + "step": 26835 + }, + { + "epoch": 1.665901049102986, + "grad_norm": 0.14719529888524946, + "learning_rate": 4.90755024956566e-05, + "loss": 2.8533, + "step": 26836 + }, + { + "epoch": 1.665963126202744, + "grad_norm": 0.1450710847201223, + "learning_rate": 4.90718915912951e-05, + "loss": 2.8035, + "step": 26837 + }, + { + "epoch": 1.6660252033025018, + "grad_norm": 0.13832704222283476, + "learning_rate": 4.906828069177577e-05, + "loss": 2.77, + "step": 26838 + }, + { + "epoch": 1.6660872804022597, + "grad_norm": 0.15638826332553923, + "learning_rate": 4.906466979711744e-05, + "loss": 2.7961, + "step": 26839 + }, + { + "epoch": 1.6661493575020176, + "grad_norm": 0.16470149464836684, + "learning_rate": 4.9061058907338944e-05, + "loss": 2.7741, + "step": 26840 + }, + { + "epoch": 1.6662114346017756, + "grad_norm": 0.14160904700131707, + "learning_rate": 4.905744802245913e-05, + "loss": 2.7694, + "step": 26841 + }, + { + "epoch": 1.6662735117015333, + "grad_norm": 0.1500811364856199, + "learning_rate": 4.905383714249686e-05, + "loss": 2.8696, + "step": 26842 + }, + { + "epoch": 1.6663355888012912, + "grad_norm": 0.14006330979822534, + "learning_rate": 4.905022626747093e-05, + "loss": 2.7716, + "step": 26843 + }, + { + "epoch": 1.666397665901049, + "grad_norm": 0.14885556175863093, + "learning_rate": 4.904661539740021e-05, + "loss": 2.8509, + "step": 26844 + }, + { + "epoch": 1.666459743000807, + "grad_norm": 0.18221136890457298, + "learning_rate": 4.9043004532303526e-05, + "loss": 2.8493, + "step": 26845 + }, + { + "epoch": 1.666521820100565, + "grad_norm": 0.14244728094048592, + "learning_rate": 4.903939367219973e-05, + "loss": 2.8259, + "step": 26846 + }, + { + "epoch": 1.6665838972003229, + "grad_norm": 0.14576840587387743, + "learning_rate": 4.903578281710763e-05, + "loss": 2.853, + "step": 26847 + }, + { + "epoch": 1.6666459743000805, + "grad_norm": 0.14395725480207908, + "learning_rate": 4.9032171967046107e-05, + "loss": 2.7477, + "step": 26848 + }, + { + "epoch": 1.6667080513998385, + "grad_norm": 0.15257747767204166, + "learning_rate": 4.902856112203396e-05, + "loss": 2.7887, + "step": 26849 + }, + { + "epoch": 1.6667701284995964, + "grad_norm": 0.16010671524119935, + "learning_rate": 4.902495028209006e-05, + "loss": 2.8401, + "step": 26850 + }, + { + "epoch": 1.6668322055993543, + "grad_norm": 0.1902102648987592, + "learning_rate": 4.9021339447233224e-05, + "loss": 2.8277, + "step": 26851 + }, + { + "epoch": 1.6668942826991122, + "grad_norm": 0.158950265885497, + "learning_rate": 4.9017728617482295e-05, + "loss": 2.7729, + "step": 26852 + }, + { + "epoch": 1.6669563597988701, + "grad_norm": 0.16181604434832184, + "learning_rate": 4.9014117792856126e-05, + "loss": 2.7248, + "step": 26853 + }, + { + "epoch": 1.667018436898628, + "grad_norm": 0.14830089858969983, + "learning_rate": 4.9010506973373526e-05, + "loss": 2.7728, + "step": 26854 + }, + { + "epoch": 1.667080513998386, + "grad_norm": 0.1504603137763921, + "learning_rate": 4.900689615905336e-05, + "loss": 2.6948, + "step": 26855 + }, + { + "epoch": 1.6671425910981439, + "grad_norm": 0.16378937420768744, + "learning_rate": 4.900328534991446e-05, + "loss": 2.8665, + "step": 26856 + }, + { + "epoch": 1.6672046681979018, + "grad_norm": 0.14752026179327343, + "learning_rate": 4.899967454597566e-05, + "loss": 2.7875, + "step": 26857 + }, + { + "epoch": 1.6672667452976597, + "grad_norm": 0.16951017820130956, + "learning_rate": 4.89960637472558e-05, + "loss": 2.7728, + "step": 26858 + }, + { + "epoch": 1.6673288223974176, + "grad_norm": 0.1458028388213543, + "learning_rate": 4.89924529537737e-05, + "loss": 2.7807, + "step": 26859 + }, + { + "epoch": 1.6673908994971756, + "grad_norm": 0.2212153036584937, + "learning_rate": 4.898884216554824e-05, + "loss": 2.8434, + "step": 26860 + }, + { + "epoch": 1.6674529765969335, + "grad_norm": 0.1471708172153554, + "learning_rate": 4.898523138259823e-05, + "loss": 2.6851, + "step": 26861 + }, + { + "epoch": 1.6675150536966914, + "grad_norm": 0.14957566342130982, + "learning_rate": 4.898162060494252e-05, + "loss": 2.9278, + "step": 26862 + }, + { + "epoch": 1.6675771307964493, + "grad_norm": 0.1480323759241109, + "learning_rate": 4.897800983259995e-05, + "loss": 2.7737, + "step": 26863 + }, + { + "epoch": 1.6676392078962072, + "grad_norm": 0.17479750016423673, + "learning_rate": 4.897439906558933e-05, + "loss": 2.8994, + "step": 26864 + }, + { + "epoch": 1.6677012849959651, + "grad_norm": 0.1571779057340941, + "learning_rate": 4.897078830392954e-05, + "loss": 2.7658, + "step": 26865 + }, + { + "epoch": 1.6677633620957228, + "grad_norm": 0.14997073176068787, + "learning_rate": 4.896717754763938e-05, + "loss": 2.8297, + "step": 26866 + }, + { + "epoch": 1.6678254391954808, + "grad_norm": 0.14193142583894022, + "learning_rate": 4.896356679673772e-05, + "loss": 2.8602, + "step": 26867 + }, + { + "epoch": 1.6678875162952387, + "grad_norm": 0.1514026846333024, + "learning_rate": 4.8959956051243364e-05, + "loss": 2.8304, + "step": 26868 + }, + { + "epoch": 1.6679495933949966, + "grad_norm": 0.1508709248033508, + "learning_rate": 4.895634531117519e-05, + "loss": 2.7369, + "step": 26869 + }, + { + "epoch": 1.6680116704947545, + "grad_norm": 0.14243800973236082, + "learning_rate": 4.8952734576552006e-05, + "loss": 2.7046, + "step": 26870 + }, + { + "epoch": 1.6680737475945124, + "grad_norm": 0.15908087840637286, + "learning_rate": 4.8949123847392663e-05, + "loss": 2.8982, + "step": 26871 + }, + { + "epoch": 1.6681358246942701, + "grad_norm": 0.13868605850110097, + "learning_rate": 4.894551312371599e-05, + "loss": 2.6966, + "step": 26872 + }, + { + "epoch": 1.668197901794028, + "grad_norm": 0.14849214426408203, + "learning_rate": 4.894190240554084e-05, + "loss": 2.7578, + "step": 26873 + }, + { + "epoch": 1.668259978893786, + "grad_norm": 0.14467347724926283, + "learning_rate": 4.8938291692886037e-05, + "loss": 2.7993, + "step": 26874 + }, + { + "epoch": 1.6683220559935439, + "grad_norm": 0.16433639030312555, + "learning_rate": 4.893468098577043e-05, + "loss": 2.806, + "step": 26875 + }, + { + "epoch": 1.6683841330933018, + "grad_norm": 0.14433774366268193, + "learning_rate": 4.893107028421285e-05, + "loss": 2.85, + "step": 26876 + }, + { + "epoch": 1.6684462101930597, + "grad_norm": 0.1559931670803084, + "learning_rate": 4.892745958823213e-05, + "loss": 2.7965, + "step": 26877 + }, + { + "epoch": 1.6685082872928176, + "grad_norm": 0.144854634872862, + "learning_rate": 4.8923848897847117e-05, + "loss": 2.8079, + "step": 26878 + }, + { + "epoch": 1.6685703643925756, + "grad_norm": 0.16164411282913296, + "learning_rate": 4.892023821307664e-05, + "loss": 2.8727, + "step": 26879 + }, + { + "epoch": 1.6686324414923335, + "grad_norm": 0.14090116193108168, + "learning_rate": 4.891662753393954e-05, + "loss": 2.716, + "step": 26880 + }, + { + "epoch": 1.6686945185920914, + "grad_norm": 0.15648654080176258, + "learning_rate": 4.891301686045467e-05, + "loss": 2.73, + "step": 26881 + }, + { + "epoch": 1.6687565956918493, + "grad_norm": 0.14537702353316398, + "learning_rate": 4.890940619264085e-05, + "loss": 2.8146, + "step": 26882 + }, + { + "epoch": 1.6688186727916072, + "grad_norm": 0.15114357335915918, + "learning_rate": 4.890579553051691e-05, + "loss": 2.7316, + "step": 26883 + }, + { + "epoch": 1.6688807498913651, + "grad_norm": 0.15036628575732708, + "learning_rate": 4.890218487410172e-05, + "loss": 2.8148, + "step": 26884 + }, + { + "epoch": 1.668942826991123, + "grad_norm": 0.14865786733503317, + "learning_rate": 4.889857422341408e-05, + "loss": 2.822, + "step": 26885 + }, + { + "epoch": 1.669004904090881, + "grad_norm": 0.17854207299971514, + "learning_rate": 4.889496357847286e-05, + "loss": 2.7838, + "step": 26886 + }, + { + "epoch": 1.669066981190639, + "grad_norm": 0.14633998112017468, + "learning_rate": 4.8891352939296885e-05, + "loss": 2.8217, + "step": 26887 + }, + { + "epoch": 1.6691290582903968, + "grad_norm": 0.17365000203095832, + "learning_rate": 4.8887742305904984e-05, + "loss": 2.7611, + "step": 26888 + }, + { + "epoch": 1.6691911353901547, + "grad_norm": 0.14216181973799635, + "learning_rate": 4.8884131678316005e-05, + "loss": 2.792, + "step": 26889 + }, + { + "epoch": 1.6692532124899124, + "grad_norm": 0.1726036341142314, + "learning_rate": 4.888052105654877e-05, + "loss": 2.7564, + "step": 26890 + }, + { + "epoch": 1.6693152895896703, + "grad_norm": 0.1779842506604501, + "learning_rate": 4.887691044062214e-05, + "loss": 2.7504, + "step": 26891 + }, + { + "epoch": 1.6693773666894283, + "grad_norm": 0.1596478032434837, + "learning_rate": 4.8873299830554927e-05, + "loss": 2.7712, + "step": 26892 + }, + { + "epoch": 1.6694394437891862, + "grad_norm": 0.16288038537204463, + "learning_rate": 4.8869689226365985e-05, + "loss": 2.8127, + "step": 26893 + }, + { + "epoch": 1.669501520888944, + "grad_norm": 0.1648654419515955, + "learning_rate": 4.886607862807416e-05, + "loss": 2.8143, + "step": 26894 + }, + { + "epoch": 1.669563597988702, + "grad_norm": 0.17047864308308497, + "learning_rate": 4.886246803569827e-05, + "loss": 2.8093, + "step": 26895 + }, + { + "epoch": 1.6696256750884597, + "grad_norm": 0.18948321742536783, + "learning_rate": 4.885885744925717e-05, + "loss": 2.8533, + "step": 26896 + }, + { + "epoch": 1.6696877521882176, + "grad_norm": 0.16051856786844343, + "learning_rate": 4.8855246868769675e-05, + "loss": 2.7767, + "step": 26897 + }, + { + "epoch": 1.6697498292879756, + "grad_norm": 0.15248753413662722, + "learning_rate": 4.8851636294254655e-05, + "loss": 2.7419, + "step": 26898 + }, + { + "epoch": 1.6698119063877335, + "grad_norm": 0.22082961517178842, + "learning_rate": 4.8848025725730916e-05, + "loss": 2.8338, + "step": 26899 + }, + { + "epoch": 1.6698739834874914, + "grad_norm": 0.15671389381133907, + "learning_rate": 4.8844415163217296e-05, + "loss": 2.8408, + "step": 26900 + }, + { + "epoch": 1.6699360605872493, + "grad_norm": 0.15590722513534266, + "learning_rate": 4.884080460673267e-05, + "loss": 2.7058, + "step": 26901 + }, + { + "epoch": 1.6699981376870072, + "grad_norm": 0.1516424752151526, + "learning_rate": 4.883719405629581e-05, + "loss": 2.6413, + "step": 26902 + }, + { + "epoch": 1.6700602147867651, + "grad_norm": 0.20702811904555052, + "learning_rate": 4.8833583511925625e-05, + "loss": 2.8258, + "step": 26903 + }, + { + "epoch": 1.670122291886523, + "grad_norm": 0.14478338004030947, + "learning_rate": 4.8829972973640895e-05, + "loss": 2.6728, + "step": 26904 + }, + { + "epoch": 1.670184368986281, + "grad_norm": 0.15534413125937746, + "learning_rate": 4.8826362441460494e-05, + "loss": 2.6799, + "step": 26905 + }, + { + "epoch": 1.670246446086039, + "grad_norm": 0.18236149517686948, + "learning_rate": 4.882275191540323e-05, + "loss": 2.8594, + "step": 26906 + }, + { + "epoch": 1.6703085231857968, + "grad_norm": 0.20098398749127236, + "learning_rate": 4.881914139548798e-05, + "loss": 2.7788, + "step": 26907 + }, + { + "epoch": 1.6703706002855547, + "grad_norm": 0.1494855024299838, + "learning_rate": 4.881553088173353e-05, + "loss": 2.7105, + "step": 26908 + }, + { + "epoch": 1.6704326773853126, + "grad_norm": 0.1611176396942415, + "learning_rate": 4.881192037415876e-05, + "loss": 2.8039, + "step": 26909 + }, + { + "epoch": 1.6704947544850706, + "grad_norm": 0.1702515976620098, + "learning_rate": 4.8808309872782474e-05, + "loss": 2.8803, + "step": 26910 + }, + { + "epoch": 1.6705568315848285, + "grad_norm": 0.16703266634755437, + "learning_rate": 4.880469937762354e-05, + "loss": 2.7613, + "step": 26911 + }, + { + "epoch": 1.6706189086845864, + "grad_norm": 0.1704928681071982, + "learning_rate": 4.880108888870078e-05, + "loss": 2.7746, + "step": 26912 + }, + { + "epoch": 1.6706809857843443, + "grad_norm": 0.16736742436327642, + "learning_rate": 4.879747840603302e-05, + "loss": 2.8821, + "step": 26913 + }, + { + "epoch": 1.670743062884102, + "grad_norm": 0.15139406642374445, + "learning_rate": 4.8793867929639114e-05, + "loss": 2.7867, + "step": 26914 + }, + { + "epoch": 1.67080513998386, + "grad_norm": 0.15651750910827295, + "learning_rate": 4.8790257459537886e-05, + "loss": 2.8014, + "step": 26915 + }, + { + "epoch": 1.6708672170836179, + "grad_norm": 0.1733299467458221, + "learning_rate": 4.8786646995748186e-05, + "loss": 2.7557, + "step": 26916 + }, + { + "epoch": 1.6709292941833758, + "grad_norm": 0.18682636009136241, + "learning_rate": 4.878303653828884e-05, + "loss": 2.7627, + "step": 26917 + }, + { + "epoch": 1.6709913712831337, + "grad_norm": 0.1597082513046734, + "learning_rate": 4.8779426087178684e-05, + "loss": 2.7602, + "step": 26918 + }, + { + "epoch": 1.6710534483828916, + "grad_norm": 0.159303780612307, + "learning_rate": 4.877581564243656e-05, + "loss": 2.8168, + "step": 26919 + }, + { + "epoch": 1.6711155254826493, + "grad_norm": 0.16843767469570464, + "learning_rate": 4.877220520408131e-05, + "loss": 2.8441, + "step": 26920 + }, + { + "epoch": 1.6711776025824072, + "grad_norm": 0.15946990160831817, + "learning_rate": 4.876859477213175e-05, + "loss": 2.8364, + "step": 26921 + }, + { + "epoch": 1.6712396796821651, + "grad_norm": 0.15457247614219885, + "learning_rate": 4.876498434660674e-05, + "loss": 2.8297, + "step": 26922 + }, + { + "epoch": 1.671301756781923, + "grad_norm": 0.15837638104046647, + "learning_rate": 4.876137392752511e-05, + "loss": 2.7053, + "step": 26923 + }, + { + "epoch": 1.671363833881681, + "grad_norm": 0.17602889841846361, + "learning_rate": 4.875776351490567e-05, + "loss": 2.7264, + "step": 26924 + }, + { + "epoch": 1.671425910981439, + "grad_norm": 0.14806889267104206, + "learning_rate": 4.875415310876729e-05, + "loss": 2.7931, + "step": 26925 + }, + { + "epoch": 1.6714879880811968, + "grad_norm": 0.13631304764101962, + "learning_rate": 4.87505427091288e-05, + "loss": 2.7386, + "step": 26926 + }, + { + "epoch": 1.6715500651809547, + "grad_norm": 0.17996708224557648, + "learning_rate": 4.874693231600904e-05, + "loss": 2.8796, + "step": 26927 + }, + { + "epoch": 1.6716121422807126, + "grad_norm": 0.14686897016890496, + "learning_rate": 4.874332192942683e-05, + "loss": 2.7474, + "step": 26928 + }, + { + "epoch": 1.6716742193804706, + "grad_norm": 0.163652247706466, + "learning_rate": 4.873971154940101e-05, + "loss": 2.7661, + "step": 26929 + }, + { + "epoch": 1.6717362964802285, + "grad_norm": 0.15282760711214002, + "learning_rate": 4.873610117595044e-05, + "loss": 2.7802, + "step": 26930 + }, + { + "epoch": 1.6717983735799864, + "grad_norm": 0.16139816629527431, + "learning_rate": 4.8732490809093915e-05, + "loss": 2.783, + "step": 26931 + }, + { + "epoch": 1.6718604506797443, + "grad_norm": 0.1433970695436592, + "learning_rate": 4.8728880448850306e-05, + "loss": 2.7649, + "step": 26932 + }, + { + "epoch": 1.6719225277795022, + "grad_norm": 0.15369082589524072, + "learning_rate": 4.872527009523843e-05, + "loss": 2.7992, + "step": 26933 + }, + { + "epoch": 1.6719846048792602, + "grad_norm": 0.14520942893383446, + "learning_rate": 4.872165974827714e-05, + "loss": 2.7249, + "step": 26934 + }, + { + "epoch": 1.672046681979018, + "grad_norm": 0.1522043764364278, + "learning_rate": 4.871804940798525e-05, + "loss": 2.7607, + "step": 26935 + }, + { + "epoch": 1.672108759078776, + "grad_norm": 0.1446885013854635, + "learning_rate": 4.871443907438161e-05, + "loss": 2.7546, + "step": 26936 + }, + { + "epoch": 1.672170836178534, + "grad_norm": 0.15761447819079416, + "learning_rate": 4.871082874748506e-05, + "loss": 2.7664, + "step": 26937 + }, + { + "epoch": 1.6722329132782916, + "grad_norm": 0.1481010929061384, + "learning_rate": 4.8707218427314416e-05, + "loss": 2.7554, + "step": 26938 + }, + { + "epoch": 1.6722949903780495, + "grad_norm": 0.15297835173782254, + "learning_rate": 4.870360811388854e-05, + "loss": 2.9235, + "step": 26939 + }, + { + "epoch": 1.6723570674778074, + "grad_norm": 0.15423239797322688, + "learning_rate": 4.869999780722625e-05, + "loss": 2.8161, + "step": 26940 + }, + { + "epoch": 1.6724191445775654, + "grad_norm": 0.16631475315123154, + "learning_rate": 4.869638750734639e-05, + "loss": 2.773, + "step": 26941 + }, + { + "epoch": 1.6724812216773233, + "grad_norm": 0.14248161446187182, + "learning_rate": 4.8692777214267785e-05, + "loss": 2.6891, + "step": 26942 + }, + { + "epoch": 1.6725432987770812, + "grad_norm": 0.15907534590525702, + "learning_rate": 4.868916692800929e-05, + "loss": 2.8344, + "step": 26943 + }, + { + "epoch": 1.6726053758768389, + "grad_norm": 0.14843467032094523, + "learning_rate": 4.8685556648589716e-05, + "loss": 2.8296, + "step": 26944 + }, + { + "epoch": 1.6726674529765968, + "grad_norm": 0.15549728764226045, + "learning_rate": 4.868194637602793e-05, + "loss": 2.7136, + "step": 26945 + }, + { + "epoch": 1.6727295300763547, + "grad_norm": 0.1613013801850893, + "learning_rate": 4.867833611034273e-05, + "loss": 2.7602, + "step": 26946 + }, + { + "epoch": 1.6727916071761126, + "grad_norm": 0.1540164693232812, + "learning_rate": 4.8674725851552984e-05, + "loss": 2.7876, + "step": 26947 + }, + { + "epoch": 1.6728536842758706, + "grad_norm": 0.14972160203749632, + "learning_rate": 4.867111559967752e-05, + "loss": 2.7946, + "step": 26948 + }, + { + "epoch": 1.6729157613756285, + "grad_norm": 0.18189710536803722, + "learning_rate": 4.866750535473515e-05, + "loss": 2.8254, + "step": 26949 + }, + { + "epoch": 1.6729778384753864, + "grad_norm": 0.14677760973796486, + "learning_rate": 4.866389511674475e-05, + "loss": 2.7558, + "step": 26950 + }, + { + "epoch": 1.6730399155751443, + "grad_norm": 0.15683313868437426, + "learning_rate": 4.866028488572511e-05, + "loss": 2.7426, + "step": 26951 + }, + { + "epoch": 1.6731019926749022, + "grad_norm": 0.14285538770244596, + "learning_rate": 4.8656674661695105e-05, + "loss": 2.8819, + "step": 26952 + }, + { + "epoch": 1.6731640697746601, + "grad_norm": 0.15512644209641, + "learning_rate": 4.8653064444673546e-05, + "loss": 2.7765, + "step": 26953 + }, + { + "epoch": 1.673226146874418, + "grad_norm": 0.13929752463094605, + "learning_rate": 4.8649454234679284e-05, + "loss": 2.7794, + "step": 26954 + }, + { + "epoch": 1.673288223974176, + "grad_norm": 0.14128814739155585, + "learning_rate": 4.8645844031731136e-05, + "loss": 2.8008, + "step": 26955 + }, + { + "epoch": 1.673350301073934, + "grad_norm": 0.13761834096736472, + "learning_rate": 4.864223383584796e-05, + "loss": 2.7964, + "step": 26956 + }, + { + "epoch": 1.6734123781736918, + "grad_norm": 0.14149105100505446, + "learning_rate": 4.8638623647048565e-05, + "loss": 2.8021, + "step": 26957 + }, + { + "epoch": 1.6734744552734497, + "grad_norm": 0.15073245696971646, + "learning_rate": 4.8635013465351804e-05, + "loss": 2.7957, + "step": 26958 + }, + { + "epoch": 1.6735365323732077, + "grad_norm": 0.15042996201729125, + "learning_rate": 4.863140329077652e-05, + "loss": 2.7395, + "step": 26959 + }, + { + "epoch": 1.6735986094729656, + "grad_norm": 0.1362886202174429, + "learning_rate": 4.862779312334153e-05, + "loss": 2.7795, + "step": 26960 + }, + { + "epoch": 1.6736606865727235, + "grad_norm": 0.15476610565881163, + "learning_rate": 4.8624182963065685e-05, + "loss": 2.7782, + "step": 26961 + }, + { + "epoch": 1.6737227636724812, + "grad_norm": 0.13916015625, + "learning_rate": 4.862057280996781e-05, + "loss": 2.7334, + "step": 26962 + }, + { + "epoch": 1.673784840772239, + "grad_norm": 0.15868971016784425, + "learning_rate": 4.861696266406674e-05, + "loss": 2.8059, + "step": 26963 + }, + { + "epoch": 1.673846917871997, + "grad_norm": 0.1486401742823887, + "learning_rate": 4.861335252538131e-05, + "loss": 2.7432, + "step": 26964 + }, + { + "epoch": 1.673908994971755, + "grad_norm": 0.16223991262872953, + "learning_rate": 4.860974239393035e-05, + "loss": 2.7613, + "step": 26965 + }, + { + "epoch": 1.6739710720715129, + "grad_norm": 0.17270171680150703, + "learning_rate": 4.860613226973271e-05, + "loss": 2.8585, + "step": 26966 + }, + { + "epoch": 1.6740331491712708, + "grad_norm": 0.1414174626051605, + "learning_rate": 4.860252215280721e-05, + "loss": 2.7763, + "step": 26967 + }, + { + "epoch": 1.6740952262710285, + "grad_norm": 0.14478479518868517, + "learning_rate": 4.85989120431727e-05, + "loss": 2.7568, + "step": 26968 + }, + { + "epoch": 1.6741573033707864, + "grad_norm": 0.1461571216834572, + "learning_rate": 4.8595301940848e-05, + "loss": 2.8254, + "step": 26969 + }, + { + "epoch": 1.6742193804705443, + "grad_norm": 0.13805432558058126, + "learning_rate": 4.859169184585195e-05, + "loss": 2.7826, + "step": 26970 + }, + { + "epoch": 1.6742814575703022, + "grad_norm": 0.14248535979296958, + "learning_rate": 4.8588081758203394e-05, + "loss": 2.7978, + "step": 26971 + }, + { + "epoch": 1.6743435346700601, + "grad_norm": 0.17348914902398993, + "learning_rate": 4.858447167792115e-05, + "loss": 2.7925, + "step": 26972 + }, + { + "epoch": 1.674405611769818, + "grad_norm": 0.1447037492241642, + "learning_rate": 4.858086160502407e-05, + "loss": 2.79, + "step": 26973 + }, + { + "epoch": 1.674467688869576, + "grad_norm": 0.15339246497828918, + "learning_rate": 4.8577251539530964e-05, + "loss": 2.7567, + "step": 26974 + }, + { + "epoch": 1.674529765969334, + "grad_norm": 0.1414558186773391, + "learning_rate": 4.8573641481460694e-05, + "loss": 2.8193, + "step": 26975 + }, + { + "epoch": 1.6745918430690918, + "grad_norm": 0.15032919954535015, + "learning_rate": 4.8570031430832076e-05, + "loss": 2.6984, + "step": 26976 + }, + { + "epoch": 1.6746539201688497, + "grad_norm": 0.14011365587746533, + "learning_rate": 4.856642138766396e-05, + "loss": 2.729, + "step": 26977 + }, + { + "epoch": 1.6747159972686076, + "grad_norm": 0.14717887276070069, + "learning_rate": 4.856281135197516e-05, + "loss": 2.8248, + "step": 26978 + }, + { + "epoch": 1.6747780743683656, + "grad_norm": 0.14204397477116107, + "learning_rate": 4.8559201323784534e-05, + "loss": 2.7667, + "step": 26979 + }, + { + "epoch": 1.6748401514681235, + "grad_norm": 0.14614434513613145, + "learning_rate": 4.855559130311089e-05, + "loss": 2.7783, + "step": 26980 + }, + { + "epoch": 1.6749022285678814, + "grad_norm": 0.1490363351682636, + "learning_rate": 4.855198128997309e-05, + "loss": 2.7775, + "step": 26981 + }, + { + "epoch": 1.6749643056676393, + "grad_norm": 0.15905092449262823, + "learning_rate": 4.854837128438995e-05, + "loss": 2.7582, + "step": 26982 + }, + { + "epoch": 1.6750263827673972, + "grad_norm": 0.15300771104090047, + "learning_rate": 4.854476128638031e-05, + "loss": 2.8353, + "step": 26983 + }, + { + "epoch": 1.6750884598671552, + "grad_norm": 0.15320762818296718, + "learning_rate": 4.854115129596301e-05, + "loss": 2.7494, + "step": 26984 + }, + { + "epoch": 1.6751505369669129, + "grad_norm": 0.16288685782853907, + "learning_rate": 4.853754131315686e-05, + "loss": 2.8483, + "step": 26985 + }, + { + "epoch": 1.6752126140666708, + "grad_norm": 0.15256831539475482, + "learning_rate": 4.853393133798074e-05, + "loss": 2.7974, + "step": 26986 + }, + { + "epoch": 1.6752746911664287, + "grad_norm": 0.15754253316471314, + "learning_rate": 4.853032137045343e-05, + "loss": 2.8495, + "step": 26987 + }, + { + "epoch": 1.6753367682661866, + "grad_norm": 0.14439862197371334, + "learning_rate": 4.852671141059381e-05, + "loss": 2.8582, + "step": 26988 + }, + { + "epoch": 1.6753988453659445, + "grad_norm": 0.1586695729421599, + "learning_rate": 4.8523101458420674e-05, + "loss": 2.8004, + "step": 26989 + }, + { + "epoch": 1.6754609224657024, + "grad_norm": 0.15042862474001234, + "learning_rate": 4.8519491513952864e-05, + "loss": 2.8647, + "step": 26990 + }, + { + "epoch": 1.6755229995654601, + "grad_norm": 0.13949801606686846, + "learning_rate": 4.8515881577209256e-05, + "loss": 2.8154, + "step": 26991 + }, + { + "epoch": 1.675585076665218, + "grad_norm": 0.1677265028312229, + "learning_rate": 4.851227164820864e-05, + "loss": 2.7329, + "step": 26992 + }, + { + "epoch": 1.675647153764976, + "grad_norm": 0.14787685399694991, + "learning_rate": 4.850866172696988e-05, + "loss": 2.8215, + "step": 26993 + }, + { + "epoch": 1.675709230864734, + "grad_norm": 0.16198844305585663, + "learning_rate": 4.850505181351178e-05, + "loss": 2.8174, + "step": 26994 + }, + { + "epoch": 1.6757713079644918, + "grad_norm": 0.14699110124497358, + "learning_rate": 4.850144190785319e-05, + "loss": 2.7485, + "step": 26995 + }, + { + "epoch": 1.6758333850642497, + "grad_norm": 0.15623044845327214, + "learning_rate": 4.849783201001294e-05, + "loss": 2.8153, + "step": 26996 + }, + { + "epoch": 1.6758954621640076, + "grad_norm": 0.14545990328003905, + "learning_rate": 4.849422212000987e-05, + "loss": 2.8049, + "step": 26997 + }, + { + "epoch": 1.6759575392637656, + "grad_norm": 0.1593396138194731, + "learning_rate": 4.8490612237862815e-05, + "loss": 2.7916, + "step": 26998 + }, + { + "epoch": 1.6760196163635235, + "grad_norm": 0.15651293922862639, + "learning_rate": 4.848700236359059e-05, + "loss": 2.8431, + "step": 26999 + }, + { + "epoch": 1.6760816934632814, + "grad_norm": 0.1522451289392894, + "learning_rate": 4.848339249721205e-05, + "loss": 2.6334, + "step": 27000 + }, + { + "epoch": 1.6761437705630393, + "grad_norm": 0.15693663289724252, + "learning_rate": 4.847978263874601e-05, + "loss": 2.822, + "step": 27001 + }, + { + "epoch": 1.6762058476627972, + "grad_norm": 0.16270243502043522, + "learning_rate": 4.847617278821132e-05, + "loss": 2.8117, + "step": 27002 + }, + { + "epoch": 1.6762679247625552, + "grad_norm": 0.1529231850811672, + "learning_rate": 4.84725629456268e-05, + "loss": 2.8035, + "step": 27003 + }, + { + "epoch": 1.676330001862313, + "grad_norm": 0.154379936990675, + "learning_rate": 4.8468953111011295e-05, + "loss": 2.733, + "step": 27004 + }, + { + "epoch": 1.676392078962071, + "grad_norm": 0.14124581604010364, + "learning_rate": 4.846534328438363e-05, + "loss": 2.7781, + "step": 27005 + }, + { + "epoch": 1.676454156061829, + "grad_norm": 0.16136232849429355, + "learning_rate": 4.846173346576265e-05, + "loss": 2.8104, + "step": 27006 + }, + { + "epoch": 1.6765162331615868, + "grad_norm": 0.14711048469185695, + "learning_rate": 4.845812365516716e-05, + "loss": 2.8293, + "step": 27007 + }, + { + "epoch": 1.6765783102613447, + "grad_norm": 0.15575472298819545, + "learning_rate": 4.8454513852616044e-05, + "loss": 2.7881, + "step": 27008 + }, + { + "epoch": 1.6766403873611024, + "grad_norm": 0.16506108018624865, + "learning_rate": 4.845090405812809e-05, + "loss": 2.8439, + "step": 27009 + }, + { + "epoch": 1.6767024644608604, + "grad_norm": 0.146243482244786, + "learning_rate": 4.8447294271722135e-05, + "loss": 2.789, + "step": 27010 + }, + { + "epoch": 1.6767645415606183, + "grad_norm": 0.14538297482410295, + "learning_rate": 4.8443684493417035e-05, + "loss": 2.8528, + "step": 27011 + }, + { + "epoch": 1.6768266186603762, + "grad_norm": 0.15375313743049612, + "learning_rate": 4.84400747232316e-05, + "loss": 2.8938, + "step": 27012 + }, + { + "epoch": 1.676888695760134, + "grad_norm": 0.13920276725638295, + "learning_rate": 4.843646496118469e-05, + "loss": 2.7956, + "step": 27013 + }, + { + "epoch": 1.676950772859892, + "grad_norm": 0.1534729094931038, + "learning_rate": 4.8432855207295104e-05, + "loss": 2.8899, + "step": 27014 + }, + { + "epoch": 1.6770128499596497, + "grad_norm": 0.14696349942397272, + "learning_rate": 4.8429245461581704e-05, + "loss": 2.737, + "step": 27015 + }, + { + "epoch": 1.6770749270594076, + "grad_norm": 0.14772562561875885, + "learning_rate": 4.842563572406331e-05, + "loss": 2.7585, + "step": 27016 + }, + { + "epoch": 1.6771370041591656, + "grad_norm": 0.15900187126631488, + "learning_rate": 4.842202599475876e-05, + "loss": 2.9169, + "step": 27017 + }, + { + "epoch": 1.6771990812589235, + "grad_norm": 0.15311612667921262, + "learning_rate": 4.841841627368687e-05, + "loss": 2.7948, + "step": 27018 + }, + { + "epoch": 1.6772611583586814, + "grad_norm": 0.14661844476004357, + "learning_rate": 4.84148065608665e-05, + "loss": 2.7905, + "step": 27019 + }, + { + "epoch": 1.6773232354584393, + "grad_norm": 0.1654594867444029, + "learning_rate": 4.841119685631646e-05, + "loss": 2.8147, + "step": 27020 + }, + { + "epoch": 1.6773853125581972, + "grad_norm": 0.15983445706393448, + "learning_rate": 4.84075871600556e-05, + "loss": 2.8262, + "step": 27021 + }, + { + "epoch": 1.6774473896579551, + "grad_norm": 0.1813449093184883, + "learning_rate": 4.840397747210275e-05, + "loss": 2.8171, + "step": 27022 + }, + { + "epoch": 1.677509466757713, + "grad_norm": 0.15458403195067352, + "learning_rate": 4.84003677924767e-05, + "loss": 2.8701, + "step": 27023 + }, + { + "epoch": 1.677571543857471, + "grad_norm": 0.15180793705212134, + "learning_rate": 4.839675812119634e-05, + "loss": 2.7822, + "step": 27024 + }, + { + "epoch": 1.677633620957229, + "grad_norm": 0.18680842333187356, + "learning_rate": 4.83931484582805e-05, + "loss": 2.7953, + "step": 27025 + }, + { + "epoch": 1.6776956980569868, + "grad_norm": 0.1600091712435409, + "learning_rate": 4.838953880374798e-05, + "loss": 2.8158, + "step": 27026 + }, + { + "epoch": 1.6777577751567447, + "grad_norm": 0.15806614278914907, + "learning_rate": 4.838592915761763e-05, + "loss": 2.815, + "step": 27027 + }, + { + "epoch": 1.6778198522565027, + "grad_norm": 0.1586545696053094, + "learning_rate": 4.838231951990828e-05, + "loss": 2.854, + "step": 27028 + }, + { + "epoch": 1.6778819293562606, + "grad_norm": 0.14862157676426385, + "learning_rate": 4.837870989063876e-05, + "loss": 2.8018, + "step": 27029 + }, + { + "epoch": 1.6779440064560185, + "grad_norm": 0.148547069365259, + "learning_rate": 4.8375100269827906e-05, + "loss": 2.8469, + "step": 27030 + }, + { + "epoch": 1.6780060835557764, + "grad_norm": 0.1427694369693643, + "learning_rate": 4.837149065749455e-05, + "loss": 2.8406, + "step": 27031 + }, + { + "epoch": 1.6780681606555343, + "grad_norm": 0.16124217628010606, + "learning_rate": 4.8367881053657514e-05, + "loss": 2.852, + "step": 27032 + }, + { + "epoch": 1.678130237755292, + "grad_norm": 0.14467903906510404, + "learning_rate": 4.8364271458335655e-05, + "loss": 2.7456, + "step": 27033 + }, + { + "epoch": 1.67819231485505, + "grad_norm": 0.1519303336616962, + "learning_rate": 4.836066187154778e-05, + "loss": 2.8233, + "step": 27034 + }, + { + "epoch": 1.6782543919548079, + "grad_norm": 0.23487057743913176, + "learning_rate": 4.8357052293312724e-05, + "loss": 2.7957, + "step": 27035 + }, + { + "epoch": 1.6783164690545658, + "grad_norm": 0.1492107853586344, + "learning_rate": 4.835344272364934e-05, + "loss": 2.8716, + "step": 27036 + }, + { + "epoch": 1.6783785461543237, + "grad_norm": 0.1505733469883582, + "learning_rate": 4.834983316257643e-05, + "loss": 2.7927, + "step": 27037 + }, + { + "epoch": 1.6784406232540816, + "grad_norm": 0.17145780301002383, + "learning_rate": 4.834622361011285e-05, + "loss": 2.9018, + "step": 27038 + }, + { + "epoch": 1.6785027003538393, + "grad_norm": 0.14765105086967342, + "learning_rate": 4.834261406627741e-05, + "loss": 2.867, + "step": 27039 + }, + { + "epoch": 1.6785647774535972, + "grad_norm": 0.18354000157009792, + "learning_rate": 4.833900453108897e-05, + "loss": 2.8032, + "step": 27040 + }, + { + "epoch": 1.6786268545533551, + "grad_norm": 0.14659065205982974, + "learning_rate": 4.833539500456634e-05, + "loss": 2.7799, + "step": 27041 + }, + { + "epoch": 1.678688931653113, + "grad_norm": 0.15336956763731818, + "learning_rate": 4.8331785486728356e-05, + "loss": 2.7538, + "step": 27042 + }, + { + "epoch": 1.678751008752871, + "grad_norm": 0.1588516189548079, + "learning_rate": 4.8328175977593854e-05, + "loss": 2.7582, + "step": 27043 + }, + { + "epoch": 1.678813085852629, + "grad_norm": 0.1587376219136405, + "learning_rate": 4.832456647718167e-05, + "loss": 2.8313, + "step": 27044 + }, + { + "epoch": 1.6788751629523868, + "grad_norm": 0.1592103425446338, + "learning_rate": 4.8320956985510617e-05, + "loss": 2.8273, + "step": 27045 + }, + { + "epoch": 1.6789372400521447, + "grad_norm": 0.14700128903723952, + "learning_rate": 4.831734750259955e-05, + "loss": 2.8809, + "step": 27046 + }, + { + "epoch": 1.6789993171519026, + "grad_norm": 0.15558820285627636, + "learning_rate": 4.8313738028467295e-05, + "loss": 2.7686, + "step": 27047 + }, + { + "epoch": 1.6790613942516606, + "grad_norm": 0.15828150114658987, + "learning_rate": 4.8310128563132664e-05, + "loss": 2.8804, + "step": 27048 + }, + { + "epoch": 1.6791234713514185, + "grad_norm": 0.15511893737082622, + "learning_rate": 4.8306519106614513e-05, + "loss": 2.7323, + "step": 27049 + }, + { + "epoch": 1.6791855484511764, + "grad_norm": 0.17078461262428024, + "learning_rate": 4.830290965893165e-05, + "loss": 2.7678, + "step": 27050 + }, + { + "epoch": 1.6792476255509343, + "grad_norm": 0.1531638787017241, + "learning_rate": 4.829930022010293e-05, + "loss": 2.7713, + "step": 27051 + }, + { + "epoch": 1.6793097026506922, + "grad_norm": 0.17235718086837323, + "learning_rate": 4.829569079014716e-05, + "loss": 2.881, + "step": 27052 + }, + { + "epoch": 1.6793717797504502, + "grad_norm": 0.18763643506369057, + "learning_rate": 4.82920813690832e-05, + "loss": 2.7854, + "step": 27053 + }, + { + "epoch": 1.679433856850208, + "grad_norm": 0.1472457110856277, + "learning_rate": 4.828847195692986e-05, + "loss": 2.7787, + "step": 27054 + }, + { + "epoch": 1.679495933949966, + "grad_norm": 0.15324136803315774, + "learning_rate": 4.8284862553705965e-05, + "loss": 2.8428, + "step": 27055 + }, + { + "epoch": 1.679558011049724, + "grad_norm": 0.14945010454881558, + "learning_rate": 4.828125315943037e-05, + "loss": 2.8711, + "step": 27056 + }, + { + "epoch": 1.6796200881494816, + "grad_norm": 0.1689127852157503, + "learning_rate": 4.8277643774121894e-05, + "loss": 2.8092, + "step": 27057 + }, + { + "epoch": 1.6796821652492395, + "grad_norm": 0.15042949768625, + "learning_rate": 4.827403439779938e-05, + "loss": 2.7276, + "step": 27058 + }, + { + "epoch": 1.6797442423489974, + "grad_norm": 0.15968459739771845, + "learning_rate": 4.827042503048164e-05, + "loss": 2.8138, + "step": 27059 + }, + { + "epoch": 1.6798063194487554, + "grad_norm": 0.14937644684461954, + "learning_rate": 4.82668156721875e-05, + "loss": 2.7205, + "step": 27060 + }, + { + "epoch": 1.6798683965485133, + "grad_norm": 0.14773311506527906, + "learning_rate": 4.826320632293582e-05, + "loss": 2.7819, + "step": 27061 + }, + { + "epoch": 1.6799304736482712, + "grad_norm": 0.15154418441377293, + "learning_rate": 4.825959698274541e-05, + "loss": 2.7293, + "step": 27062 + }, + { + "epoch": 1.679992550748029, + "grad_norm": 0.1385784976505607, + "learning_rate": 4.82559876516351e-05, + "loss": 2.8018, + "step": 27063 + }, + { + "epoch": 1.6800546278477868, + "grad_norm": 0.16597371413834142, + "learning_rate": 4.825237832962373e-05, + "loss": 2.7781, + "step": 27064 + }, + { + "epoch": 1.6801167049475447, + "grad_norm": 0.1509253544590484, + "learning_rate": 4.8248769016730134e-05, + "loss": 2.7687, + "step": 27065 + }, + { + "epoch": 1.6801787820473026, + "grad_norm": 0.16637748662199528, + "learning_rate": 4.824515971297312e-05, + "loss": 2.8311, + "step": 27066 + }, + { + "epoch": 1.6802408591470606, + "grad_norm": 0.14666157495084517, + "learning_rate": 4.824155041837155e-05, + "loss": 2.8676, + "step": 27067 + }, + { + "epoch": 1.6803029362468185, + "grad_norm": 0.16307280824544546, + "learning_rate": 4.823794113294422e-05, + "loss": 2.8049, + "step": 27068 + }, + { + "epoch": 1.6803650133465764, + "grad_norm": 0.15994003867355314, + "learning_rate": 4.823433185670999e-05, + "loss": 2.7793, + "step": 27069 + }, + { + "epoch": 1.6804270904463343, + "grad_norm": 0.1534552193236464, + "learning_rate": 4.8230722589687674e-05, + "loss": 2.7356, + "step": 27070 + }, + { + "epoch": 1.6804891675460922, + "grad_norm": 0.15614314597919637, + "learning_rate": 4.822711333189611e-05, + "loss": 2.7891, + "step": 27071 + }, + { + "epoch": 1.6805512446458502, + "grad_norm": 0.1652906689443864, + "learning_rate": 4.8223504083354135e-05, + "loss": 2.8178, + "step": 27072 + }, + { + "epoch": 1.680613321745608, + "grad_norm": 0.16542195028431433, + "learning_rate": 4.821989484408056e-05, + "loss": 2.6937, + "step": 27073 + }, + { + "epoch": 1.680675398845366, + "grad_norm": 0.15736707486081056, + "learning_rate": 4.821628561409423e-05, + "loss": 2.7928, + "step": 27074 + }, + { + "epoch": 1.680737475945124, + "grad_norm": 0.17198966276354785, + "learning_rate": 4.821267639341397e-05, + "loss": 2.7291, + "step": 27075 + }, + { + "epoch": 1.6807995530448818, + "grad_norm": 0.16178251543520947, + "learning_rate": 4.820906718205861e-05, + "loss": 2.7801, + "step": 27076 + }, + { + "epoch": 1.6808616301446397, + "grad_norm": 0.19334987655583424, + "learning_rate": 4.820545798004698e-05, + "loss": 2.8744, + "step": 27077 + }, + { + "epoch": 1.6809237072443977, + "grad_norm": 0.16640887527006815, + "learning_rate": 4.820184878739792e-05, + "loss": 2.8007, + "step": 27078 + }, + { + "epoch": 1.6809857843441556, + "grad_norm": 0.17048304073072784, + "learning_rate": 4.819823960413023e-05, + "loss": 2.7533, + "step": 27079 + }, + { + "epoch": 1.6810478614439135, + "grad_norm": 0.1537619807743088, + "learning_rate": 4.819463043026279e-05, + "loss": 2.794, + "step": 27080 + }, + { + "epoch": 1.6811099385436712, + "grad_norm": 0.16245979903038463, + "learning_rate": 4.819102126581438e-05, + "loss": 2.8197, + "step": 27081 + }, + { + "epoch": 1.681172015643429, + "grad_norm": 0.1524299047848919, + "learning_rate": 4.818741211080387e-05, + "loss": 2.6881, + "step": 27082 + }, + { + "epoch": 1.681234092743187, + "grad_norm": 0.1838425918119845, + "learning_rate": 4.818380296525006e-05, + "loss": 2.7358, + "step": 27083 + }, + { + "epoch": 1.681296169842945, + "grad_norm": 0.17642787045690647, + "learning_rate": 4.8180193829171785e-05, + "loss": 2.8203, + "step": 27084 + }, + { + "epoch": 1.6813582469427029, + "grad_norm": 0.15373462533256774, + "learning_rate": 4.817658470258789e-05, + "loss": 2.8224, + "step": 27085 + }, + { + "epoch": 1.6814203240424608, + "grad_norm": 0.161277780921272, + "learning_rate": 4.817297558551718e-05, + "loss": 2.8342, + "step": 27086 + }, + { + "epoch": 1.6814824011422185, + "grad_norm": 0.1444360703158631, + "learning_rate": 4.816936647797852e-05, + "loss": 2.8079, + "step": 27087 + }, + { + "epoch": 1.6815444782419764, + "grad_norm": 0.17621667761682525, + "learning_rate": 4.81657573799907e-05, + "loss": 2.82, + "step": 27088 + }, + { + "epoch": 1.6816065553417343, + "grad_norm": 0.2761189994211814, + "learning_rate": 4.816214829157257e-05, + "loss": 2.8172, + "step": 27089 + }, + { + "epoch": 1.6816686324414922, + "grad_norm": 0.16130242524025087, + "learning_rate": 4.8158539212742973e-05, + "loss": 2.8077, + "step": 27090 + }, + { + "epoch": 1.6817307095412501, + "grad_norm": 0.18092493347974806, + "learning_rate": 4.815493014352071e-05, + "loss": 2.7967, + "step": 27091 + }, + { + "epoch": 1.681792786641008, + "grad_norm": 0.1483642748866747, + "learning_rate": 4.815132108392464e-05, + "loss": 2.7189, + "step": 27092 + }, + { + "epoch": 1.681854863740766, + "grad_norm": 0.203697397617806, + "learning_rate": 4.814771203397356e-05, + "loss": 2.8166, + "step": 27093 + }, + { + "epoch": 1.681916940840524, + "grad_norm": 0.14463415220072148, + "learning_rate": 4.814410299368633e-05, + "loss": 2.826, + "step": 27094 + }, + { + "epoch": 1.6819790179402818, + "grad_norm": 0.1689655809978363, + "learning_rate": 4.8140493963081765e-05, + "loss": 2.7905, + "step": 27095 + }, + { + "epoch": 1.6820410950400397, + "grad_norm": 0.15396185006184654, + "learning_rate": 4.813688494217869e-05, + "loss": 2.7956, + "step": 27096 + }, + { + "epoch": 1.6821031721397977, + "grad_norm": 0.17432241669879386, + "learning_rate": 4.813327593099595e-05, + "loss": 2.7962, + "step": 27097 + }, + { + "epoch": 1.6821652492395556, + "grad_norm": 0.1599460187296512, + "learning_rate": 4.812966692955234e-05, + "loss": 2.8808, + "step": 27098 + }, + { + "epoch": 1.6822273263393135, + "grad_norm": 0.18632306825825687, + "learning_rate": 4.812605793786673e-05, + "loss": 2.7784, + "step": 27099 + }, + { + "epoch": 1.6822894034390714, + "grad_norm": 0.19286793573425753, + "learning_rate": 4.812244895595792e-05, + "loss": 2.7433, + "step": 27100 + }, + { + "epoch": 1.6823514805388293, + "grad_norm": 0.14751606977945433, + "learning_rate": 4.811883998384476e-05, + "loss": 2.8405, + "step": 27101 + }, + { + "epoch": 1.6824135576385872, + "grad_norm": 0.1488809424266518, + "learning_rate": 4.811523102154605e-05, + "loss": 2.7172, + "step": 27102 + }, + { + "epoch": 1.6824756347383452, + "grad_norm": 0.1574893910755688, + "learning_rate": 4.811162206908066e-05, + "loss": 2.7871, + "step": 27103 + }, + { + "epoch": 1.682537711838103, + "grad_norm": 0.14066890852275976, + "learning_rate": 4.8108013126467375e-05, + "loss": 2.6948, + "step": 27104 + }, + { + "epoch": 1.6825997889378608, + "grad_norm": 0.14431062174376444, + "learning_rate": 4.8104404193725065e-05, + "loss": 2.8438, + "step": 27105 + }, + { + "epoch": 1.6826618660376187, + "grad_norm": 0.16718643446609494, + "learning_rate": 4.810079527087252e-05, + "loss": 2.9104, + "step": 27106 + }, + { + "epoch": 1.6827239431373766, + "grad_norm": 0.17151100998930405, + "learning_rate": 4.8097186357928604e-05, + "loss": 2.8332, + "step": 27107 + }, + { + "epoch": 1.6827860202371345, + "grad_norm": 0.15165604133206437, + "learning_rate": 4.809357745491212e-05, + "loss": 2.8165, + "step": 27108 + }, + { + "epoch": 1.6828480973368924, + "grad_norm": 0.15023111578886986, + "learning_rate": 4.808996856184191e-05, + "loss": 2.7895, + "step": 27109 + }, + { + "epoch": 1.6829101744366504, + "grad_norm": 0.16961641369960676, + "learning_rate": 4.808635967873679e-05, + "loss": 2.8159, + "step": 27110 + }, + { + "epoch": 1.682972251536408, + "grad_norm": 0.15456586038435774, + "learning_rate": 4.80827508056156e-05, + "loss": 2.8298, + "step": 27111 + }, + { + "epoch": 1.683034328636166, + "grad_norm": 0.1634499262686476, + "learning_rate": 4.807914194249717e-05, + "loss": 2.8681, + "step": 27112 + }, + { + "epoch": 1.683096405735924, + "grad_norm": 0.14872447306720316, + "learning_rate": 4.807553308940031e-05, + "loss": 2.7606, + "step": 27113 + }, + { + "epoch": 1.6831584828356818, + "grad_norm": 0.14135694091925885, + "learning_rate": 4.8071924246343875e-05, + "loss": 2.7415, + "step": 27114 + }, + { + "epoch": 1.6832205599354397, + "grad_norm": 0.1630646697470613, + "learning_rate": 4.806831541334667e-05, + "loss": 2.741, + "step": 27115 + }, + { + "epoch": 1.6832826370351976, + "grad_norm": 0.15303714382435846, + "learning_rate": 4.806470659042753e-05, + "loss": 2.7878, + "step": 27116 + }, + { + "epoch": 1.6833447141349556, + "grad_norm": 0.14539797048446068, + "learning_rate": 4.8061097777605294e-05, + "loss": 2.8651, + "step": 27117 + }, + { + "epoch": 1.6834067912347135, + "grad_norm": 0.15338059473365243, + "learning_rate": 4.805748897489878e-05, + "loss": 2.8209, + "step": 27118 + }, + { + "epoch": 1.6834688683344714, + "grad_norm": 0.15218129421482432, + "learning_rate": 4.805388018232681e-05, + "loss": 2.6858, + "step": 27119 + }, + { + "epoch": 1.6835309454342293, + "grad_norm": 0.1500433715066487, + "learning_rate": 4.8050271399908226e-05, + "loss": 2.76, + "step": 27120 + }, + { + "epoch": 1.6835930225339872, + "grad_norm": 0.14425132682138847, + "learning_rate": 4.8046662627661834e-05, + "loss": 2.7846, + "step": 27121 + }, + { + "epoch": 1.6836550996337452, + "grad_norm": 0.14490695398434042, + "learning_rate": 4.8043053865606495e-05, + "loss": 2.7379, + "step": 27122 + }, + { + "epoch": 1.683717176733503, + "grad_norm": 0.15630521990628712, + "learning_rate": 4.803944511376101e-05, + "loss": 2.8093, + "step": 27123 + }, + { + "epoch": 1.683779253833261, + "grad_norm": 0.15022459401999338, + "learning_rate": 4.803583637214423e-05, + "loss": 2.7817, + "step": 27124 + }, + { + "epoch": 1.683841330933019, + "grad_norm": 0.1469463375517604, + "learning_rate": 4.8032227640774954e-05, + "loss": 2.7698, + "step": 27125 + }, + { + "epoch": 1.6839034080327768, + "grad_norm": 0.14729509438227226, + "learning_rate": 4.802861891967204e-05, + "loss": 2.863, + "step": 27126 + }, + { + "epoch": 1.6839654851325347, + "grad_norm": 0.16378738406099427, + "learning_rate": 4.802501020885428e-05, + "loss": 2.8273, + "step": 27127 + }, + { + "epoch": 1.6840275622322927, + "grad_norm": 0.15523027023932073, + "learning_rate": 4.802140150834054e-05, + "loss": 2.7199, + "step": 27128 + }, + { + "epoch": 1.6840896393320504, + "grad_norm": 0.159007892462617, + "learning_rate": 4.801779281814961e-05, + "loss": 2.795, + "step": 27129 + }, + { + "epoch": 1.6841517164318083, + "grad_norm": 0.17314700293242502, + "learning_rate": 4.801418413830035e-05, + "loss": 2.757, + "step": 27130 + }, + { + "epoch": 1.6842137935315662, + "grad_norm": 0.14811704578047807, + "learning_rate": 4.801057546881156e-05, + "loss": 2.7039, + "step": 27131 + }, + { + "epoch": 1.6842758706313241, + "grad_norm": 0.1631962066469575, + "learning_rate": 4.8006966809702094e-05, + "loss": 2.7788, + "step": 27132 + }, + { + "epoch": 1.684337947731082, + "grad_norm": 0.1566355300131785, + "learning_rate": 4.800335816099077e-05, + "loss": 2.7296, + "step": 27133 + }, + { + "epoch": 1.68440002483084, + "grad_norm": 0.15576717163873202, + "learning_rate": 4.7999749522696396e-05, + "loss": 2.8517, + "step": 27134 + }, + { + "epoch": 1.6844621019305976, + "grad_norm": 0.1535311967654362, + "learning_rate": 4.799614089483783e-05, + "loss": 2.8651, + "step": 27135 + }, + { + "epoch": 1.6845241790303556, + "grad_norm": 0.20175731064326896, + "learning_rate": 4.7992532277433864e-05, + "loss": 2.8618, + "step": 27136 + }, + { + "epoch": 1.6845862561301135, + "grad_norm": 0.15076500256615782, + "learning_rate": 4.798892367050336e-05, + "loss": 2.8418, + "step": 27137 + }, + { + "epoch": 1.6846483332298714, + "grad_norm": 0.14430143798321798, + "learning_rate": 4.798531507406512e-05, + "loss": 2.7754, + "step": 27138 + }, + { + "epoch": 1.6847104103296293, + "grad_norm": 0.14852079137678548, + "learning_rate": 4.798170648813799e-05, + "loss": 2.7865, + "step": 27139 + }, + { + "epoch": 1.6847724874293872, + "grad_norm": 0.14125176996202404, + "learning_rate": 4.7978097912740775e-05, + "loss": 2.7604, + "step": 27140 + }, + { + "epoch": 1.6848345645291452, + "grad_norm": 0.14923074484343937, + "learning_rate": 4.7974489347892325e-05, + "loss": 2.8026, + "step": 27141 + }, + { + "epoch": 1.684896641628903, + "grad_norm": 0.15472091280227399, + "learning_rate": 4.797088079361145e-05, + "loss": 2.7832, + "step": 27142 + }, + { + "epoch": 1.684958718728661, + "grad_norm": 0.15121740498331887, + "learning_rate": 4.796727224991699e-05, + "loss": 2.8224, + "step": 27143 + }, + { + "epoch": 1.685020795828419, + "grad_norm": 0.14581343537777441, + "learning_rate": 4.796366371682776e-05, + "loss": 2.7725, + "step": 27144 + }, + { + "epoch": 1.6850828729281768, + "grad_norm": 0.14451078321553912, + "learning_rate": 4.7960055194362584e-05, + "loss": 2.8154, + "step": 27145 + }, + { + "epoch": 1.6851449500279347, + "grad_norm": 0.18637383538092717, + "learning_rate": 4.795644668254031e-05, + "loss": 2.7978, + "step": 27146 + }, + { + "epoch": 1.6852070271276927, + "grad_norm": 0.1403698062691721, + "learning_rate": 4.7952838181379736e-05, + "loss": 2.7162, + "step": 27147 + }, + { + "epoch": 1.6852691042274506, + "grad_norm": 0.18189007028102383, + "learning_rate": 4.7949229690899714e-05, + "loss": 2.8023, + "step": 27148 + }, + { + "epoch": 1.6853311813272085, + "grad_norm": 0.1543144928253864, + "learning_rate": 4.794562121111904e-05, + "loss": 2.836, + "step": 27149 + }, + { + "epoch": 1.6853932584269664, + "grad_norm": 0.1557221795963163, + "learning_rate": 4.794201274205657e-05, + "loss": 2.7678, + "step": 27150 + }, + { + "epoch": 1.6854553355267243, + "grad_norm": 0.17387293655911187, + "learning_rate": 4.793840428373112e-05, + "loss": 2.7941, + "step": 27151 + }, + { + "epoch": 1.6855174126264822, + "grad_norm": 0.13823501519802722, + "learning_rate": 4.793479583616152e-05, + "loss": 2.7368, + "step": 27152 + }, + { + "epoch": 1.68557948972624, + "grad_norm": 0.14633859373935007, + "learning_rate": 4.793118739936658e-05, + "loss": 2.6857, + "step": 27153 + }, + { + "epoch": 1.6856415668259979, + "grad_norm": 0.14693246970050353, + "learning_rate": 4.7927578973365124e-05, + "loss": 2.7684, + "step": 27154 + }, + { + "epoch": 1.6857036439257558, + "grad_norm": 0.14800701963686008, + "learning_rate": 4.792397055817602e-05, + "loss": 2.772, + "step": 27155 + }, + { + "epoch": 1.6857657210255137, + "grad_norm": 0.17298612367788288, + "learning_rate": 4.7920362153818045e-05, + "loss": 2.7324, + "step": 27156 + }, + { + "epoch": 1.6858277981252716, + "grad_norm": 0.1553029026408862, + "learning_rate": 4.791675376031006e-05, + "loss": 2.7397, + "step": 27157 + }, + { + "epoch": 1.6858898752250295, + "grad_norm": 0.19299312545708422, + "learning_rate": 4.791314537767088e-05, + "loss": 2.7969, + "step": 27158 + }, + { + "epoch": 1.6859519523247872, + "grad_norm": 0.14581627760219298, + "learning_rate": 4.790953700591931e-05, + "loss": 2.8184, + "step": 27159 + }, + { + "epoch": 1.6860140294245451, + "grad_norm": 0.16117908506958975, + "learning_rate": 4.790592864507422e-05, + "loss": 2.818, + "step": 27160 + }, + { + "epoch": 1.686076106524303, + "grad_norm": 0.14663009389295786, + "learning_rate": 4.790232029515438e-05, + "loss": 2.8869, + "step": 27161 + }, + { + "epoch": 1.686138183624061, + "grad_norm": 0.18099152040245425, + "learning_rate": 4.789871195617866e-05, + "loss": 2.765, + "step": 27162 + }, + { + "epoch": 1.686200260723819, + "grad_norm": 0.1574455181948234, + "learning_rate": 4.789510362816586e-05, + "loss": 2.8086, + "step": 27163 + }, + { + "epoch": 1.6862623378235768, + "grad_norm": 0.16426645385473568, + "learning_rate": 4.789149531113483e-05, + "loss": 2.6955, + "step": 27164 + }, + { + "epoch": 1.6863244149233347, + "grad_norm": 0.1556089842265026, + "learning_rate": 4.788788700510437e-05, + "loss": 2.774, + "step": 27165 + }, + { + "epoch": 1.6863864920230927, + "grad_norm": 0.17492360593374665, + "learning_rate": 4.7884278710093325e-05, + "loss": 2.7233, + "step": 27166 + }, + { + "epoch": 1.6864485691228506, + "grad_norm": 0.1701412046014318, + "learning_rate": 4.788067042612049e-05, + "loss": 2.7533, + "step": 27167 + }, + { + "epoch": 1.6865106462226085, + "grad_norm": 0.15965170587900063, + "learning_rate": 4.787706215320474e-05, + "loss": 2.7547, + "step": 27168 + }, + { + "epoch": 1.6865727233223664, + "grad_norm": 0.1504147931285619, + "learning_rate": 4.787345389136486e-05, + "loss": 2.836, + "step": 27169 + }, + { + "epoch": 1.6866348004221243, + "grad_norm": 0.15208275152012393, + "learning_rate": 4.786984564061968e-05, + "loss": 2.8106, + "step": 27170 + }, + { + "epoch": 1.6866968775218822, + "grad_norm": 0.1500799946765318, + "learning_rate": 4.786623740098805e-05, + "loss": 2.7817, + "step": 27171 + }, + { + "epoch": 1.6867589546216402, + "grad_norm": 0.15440896344999822, + "learning_rate": 4.786262917248876e-05, + "loss": 2.7928, + "step": 27172 + }, + { + "epoch": 1.686821031721398, + "grad_norm": 0.1691287703907935, + "learning_rate": 4.785902095514066e-05, + "loss": 2.7738, + "step": 27173 + }, + { + "epoch": 1.686883108821156, + "grad_norm": 0.1533459988112952, + "learning_rate": 4.785541274896256e-05, + "loss": 2.7119, + "step": 27174 + }, + { + "epoch": 1.686945185920914, + "grad_norm": 0.168245979454117, + "learning_rate": 4.78518045539733e-05, + "loss": 2.7707, + "step": 27175 + }, + { + "epoch": 1.6870072630206718, + "grad_norm": 0.15452544870663223, + "learning_rate": 4.7848196370191693e-05, + "loss": 2.6736, + "step": 27176 + }, + { + "epoch": 1.6870693401204295, + "grad_norm": 0.1490212931547462, + "learning_rate": 4.784458819763658e-05, + "loss": 2.8568, + "step": 27177 + }, + { + "epoch": 1.6871314172201874, + "grad_norm": 0.1902911440293839, + "learning_rate": 4.784098003632675e-05, + "loss": 2.8514, + "step": 27178 + }, + { + "epoch": 1.6871934943199454, + "grad_norm": 0.1504716902393093, + "learning_rate": 4.783737188628108e-05, + "loss": 2.7846, + "step": 27179 + }, + { + "epoch": 1.6872555714197033, + "grad_norm": 0.15162398796852358, + "learning_rate": 4.783376374751835e-05, + "loss": 2.859, + "step": 27180 + }, + { + "epoch": 1.6873176485194612, + "grad_norm": 0.14859772492356, + "learning_rate": 4.7830155620057396e-05, + "loss": 2.8556, + "step": 27181 + }, + { + "epoch": 1.6873797256192191, + "grad_norm": 0.15283432248869064, + "learning_rate": 4.782654750391706e-05, + "loss": 2.7758, + "step": 27182 + }, + { + "epoch": 1.6874418027189768, + "grad_norm": 0.1549948586488, + "learning_rate": 4.782293939911614e-05, + "loss": 2.7919, + "step": 27183 + }, + { + "epoch": 1.6875038798187347, + "grad_norm": 0.14327909654145102, + "learning_rate": 4.781933130567348e-05, + "loss": 2.7353, + "step": 27184 + }, + { + "epoch": 1.6875659569184926, + "grad_norm": 0.14596396510538914, + "learning_rate": 4.7815723223607896e-05, + "loss": 2.7844, + "step": 27185 + }, + { + "epoch": 1.6876280340182506, + "grad_norm": 0.15881813271196782, + "learning_rate": 4.78121151529382e-05, + "loss": 2.822, + "step": 27186 + }, + { + "epoch": 1.6876901111180085, + "grad_norm": 0.1520248890846725, + "learning_rate": 4.780850709368325e-05, + "loss": 2.7307, + "step": 27187 + }, + { + "epoch": 1.6877521882177664, + "grad_norm": 0.16489349242982873, + "learning_rate": 4.780489904586184e-05, + "loss": 2.8866, + "step": 27188 + }, + { + "epoch": 1.6878142653175243, + "grad_norm": 0.14628152153024163, + "learning_rate": 4.7801291009492815e-05, + "loss": 2.8409, + "step": 27189 + }, + { + "epoch": 1.6878763424172822, + "grad_norm": 0.16309583942253914, + "learning_rate": 4.7797682984594976e-05, + "loss": 2.8138, + "step": 27190 + }, + { + "epoch": 1.6879384195170402, + "grad_norm": 0.1424433319241992, + "learning_rate": 4.7794074971187173e-05, + "loss": 2.7972, + "step": 27191 + }, + { + "epoch": 1.688000496616798, + "grad_norm": 0.14034502318362604, + "learning_rate": 4.7790466969288204e-05, + "loss": 2.7309, + "step": 27192 + }, + { + "epoch": 1.688062573716556, + "grad_norm": 0.15836885390026667, + "learning_rate": 4.778685897891692e-05, + "loss": 2.6896, + "step": 27193 + }, + { + "epoch": 1.688124650816314, + "grad_norm": 0.15025965449659426, + "learning_rate": 4.778325100009213e-05, + "loss": 2.8049, + "step": 27194 + }, + { + "epoch": 1.6881867279160718, + "grad_norm": 0.14835679847002645, + "learning_rate": 4.777964303283265e-05, + "loss": 2.7742, + "step": 27195 + }, + { + "epoch": 1.6882488050158297, + "grad_norm": 0.16710293872916965, + "learning_rate": 4.777603507715731e-05, + "loss": 2.8044, + "step": 27196 + }, + { + "epoch": 1.6883108821155877, + "grad_norm": 0.1432482569427862, + "learning_rate": 4.7772427133084926e-05, + "loss": 2.7629, + "step": 27197 + }, + { + "epoch": 1.6883729592153456, + "grad_norm": 0.15999696533111762, + "learning_rate": 4.776881920063435e-05, + "loss": 2.8565, + "step": 27198 + }, + { + "epoch": 1.6884350363151035, + "grad_norm": 0.1557730308897418, + "learning_rate": 4.776521127982437e-05, + "loss": 2.8348, + "step": 27199 + }, + { + "epoch": 1.6884971134148614, + "grad_norm": 0.15247867184940167, + "learning_rate": 4.7761603370673844e-05, + "loss": 2.7261, + "step": 27200 + }, + { + "epoch": 1.6885591905146191, + "grad_norm": 0.15583752189730013, + "learning_rate": 4.7757995473201555e-05, + "loss": 2.7164, + "step": 27201 + }, + { + "epoch": 1.688621267614377, + "grad_norm": 0.14547462212852244, + "learning_rate": 4.7754387587426364e-05, + "loss": 2.8211, + "step": 27202 + }, + { + "epoch": 1.688683344714135, + "grad_norm": 0.15220113950663405, + "learning_rate": 4.775077971336707e-05, + "loss": 2.865, + "step": 27203 + }, + { + "epoch": 1.6887454218138929, + "grad_norm": 0.14897131927167961, + "learning_rate": 4.7747171851042506e-05, + "loss": 2.7581, + "step": 27204 + }, + { + "epoch": 1.6888074989136508, + "grad_norm": 0.14561331207826028, + "learning_rate": 4.7743564000471495e-05, + "loss": 2.7601, + "step": 27205 + }, + { + "epoch": 1.6888695760134087, + "grad_norm": 0.14415227977289302, + "learning_rate": 4.773995616167285e-05, + "loss": 2.6949, + "step": 27206 + }, + { + "epoch": 1.6889316531131664, + "grad_norm": 0.15983731216785974, + "learning_rate": 4.773634833466542e-05, + "loss": 2.7894, + "step": 27207 + }, + { + "epoch": 1.6889937302129243, + "grad_norm": 0.1463284743712695, + "learning_rate": 4.773274051946799e-05, + "loss": 2.7268, + "step": 27208 + }, + { + "epoch": 1.6890558073126822, + "grad_norm": 0.14416964505014068, + "learning_rate": 4.772913271609942e-05, + "loss": 2.7668, + "step": 27209 + }, + { + "epoch": 1.6891178844124402, + "grad_norm": 0.149169609706321, + "learning_rate": 4.77255249245785e-05, + "loss": 2.8021, + "step": 27210 + }, + { + "epoch": 1.689179961512198, + "grad_norm": 0.1737609315510521, + "learning_rate": 4.7721917144924086e-05, + "loss": 2.8453, + "step": 27211 + }, + { + "epoch": 1.689242038611956, + "grad_norm": 0.1610854917457452, + "learning_rate": 4.771830937715497e-05, + "loss": 2.7746, + "step": 27212 + }, + { + "epoch": 1.689304115711714, + "grad_norm": 0.15128176955480782, + "learning_rate": 4.7714701621289995e-05, + "loss": 2.8934, + "step": 27213 + }, + { + "epoch": 1.6893661928114718, + "grad_norm": 0.16153436390869222, + "learning_rate": 4.771109387734797e-05, + "loss": 2.7473, + "step": 27214 + }, + { + "epoch": 1.6894282699112297, + "grad_norm": 0.15119123384194597, + "learning_rate": 4.770748614534773e-05, + "loss": 2.8317, + "step": 27215 + }, + { + "epoch": 1.6894903470109877, + "grad_norm": 0.14107151139275878, + "learning_rate": 4.770387842530808e-05, + "loss": 2.7897, + "step": 27216 + }, + { + "epoch": 1.6895524241107456, + "grad_norm": 0.16651700547225887, + "learning_rate": 4.770027071724787e-05, + "loss": 2.7505, + "step": 27217 + }, + { + "epoch": 1.6896145012105035, + "grad_norm": 0.1489964802009442, + "learning_rate": 4.76966630211859e-05, + "loss": 2.7449, + "step": 27218 + }, + { + "epoch": 1.6896765783102614, + "grad_norm": 0.15753256004264005, + "learning_rate": 4.769305533714098e-05, + "loss": 2.8392, + "step": 27219 + }, + { + "epoch": 1.6897386554100193, + "grad_norm": 0.15472196016928186, + "learning_rate": 4.7689447665131964e-05, + "loss": 2.733, + "step": 27220 + }, + { + "epoch": 1.6898007325097772, + "grad_norm": 0.1449276668491414, + "learning_rate": 4.768584000517767e-05, + "loss": 2.8039, + "step": 27221 + }, + { + "epoch": 1.6898628096095352, + "grad_norm": 0.15224626674483224, + "learning_rate": 4.76822323572969e-05, + "loss": 2.7455, + "step": 27222 + }, + { + "epoch": 1.689924886709293, + "grad_norm": 0.16183352843122906, + "learning_rate": 4.767862472150849e-05, + "loss": 2.7481, + "step": 27223 + }, + { + "epoch": 1.689986963809051, + "grad_norm": 0.1589068783923002, + "learning_rate": 4.7675017097831255e-05, + "loss": 2.7784, + "step": 27224 + }, + { + "epoch": 1.6900490409088087, + "grad_norm": 0.13964022765669176, + "learning_rate": 4.7671409486284024e-05, + "loss": 2.8067, + "step": 27225 + }, + { + "epoch": 1.6901111180085666, + "grad_norm": 0.16234785944902377, + "learning_rate": 4.766780188688561e-05, + "loss": 2.7131, + "step": 27226 + }, + { + "epoch": 1.6901731951083245, + "grad_norm": 0.17093209071553256, + "learning_rate": 4.766419429965485e-05, + "loss": 2.7822, + "step": 27227 + }, + { + "epoch": 1.6902352722080825, + "grad_norm": 0.1445870098602537, + "learning_rate": 4.766058672461055e-05, + "loss": 2.7882, + "step": 27228 + }, + { + "epoch": 1.6902973493078404, + "grad_norm": 0.1509119386530932, + "learning_rate": 4.765697916177154e-05, + "loss": 2.8248, + "step": 27229 + }, + { + "epoch": 1.6903594264075983, + "grad_norm": 0.14203518868326354, + "learning_rate": 4.765337161115664e-05, + "loss": 2.8208, + "step": 27230 + }, + { + "epoch": 1.690421503507356, + "grad_norm": 0.1556281231371476, + "learning_rate": 4.7649764072784654e-05, + "loss": 2.8131, + "step": 27231 + }, + { + "epoch": 1.690483580607114, + "grad_norm": 0.1455575998627216, + "learning_rate": 4.764615654667444e-05, + "loss": 2.6615, + "step": 27232 + }, + { + "epoch": 1.6905456577068718, + "grad_norm": 0.1621451683054894, + "learning_rate": 4.764254903284478e-05, + "loss": 2.8141, + "step": 27233 + }, + { + "epoch": 1.6906077348066297, + "grad_norm": 0.15465864793872836, + "learning_rate": 4.763894153131453e-05, + "loss": 2.7588, + "step": 27234 + }, + { + "epoch": 1.6906698119063877, + "grad_norm": 0.16016040191851122, + "learning_rate": 4.763533404210248e-05, + "loss": 2.7612, + "step": 27235 + }, + { + "epoch": 1.6907318890061456, + "grad_norm": 0.15667918389447957, + "learning_rate": 4.763172656522748e-05, + "loss": 2.8313, + "step": 27236 + }, + { + "epoch": 1.6907939661059035, + "grad_norm": 0.1512923948027843, + "learning_rate": 4.762811910070833e-05, + "loss": 2.7176, + "step": 27237 + }, + { + "epoch": 1.6908560432056614, + "grad_norm": 0.17830717617087072, + "learning_rate": 4.7624511648563866e-05, + "loss": 2.7652, + "step": 27238 + }, + { + "epoch": 1.6909181203054193, + "grad_norm": 0.15528847963419784, + "learning_rate": 4.762090420881289e-05, + "loss": 2.7708, + "step": 27239 + }, + { + "epoch": 1.6909801974051772, + "grad_norm": 0.15740111831810943, + "learning_rate": 4.761729678147424e-05, + "loss": 2.8433, + "step": 27240 + }, + { + "epoch": 1.6910422745049352, + "grad_norm": 0.16709782789229424, + "learning_rate": 4.7613689366566726e-05, + "loss": 2.8249, + "step": 27241 + }, + { + "epoch": 1.691104351604693, + "grad_norm": 0.16012173373749677, + "learning_rate": 4.7610081964109185e-05, + "loss": 2.7319, + "step": 27242 + }, + { + "epoch": 1.691166428704451, + "grad_norm": 0.15655339706629962, + "learning_rate": 4.760647457412043e-05, + "loss": 2.7421, + "step": 27243 + }, + { + "epoch": 1.691228505804209, + "grad_norm": 0.1497603187854793, + "learning_rate": 4.760286719661926e-05, + "loss": 2.699, + "step": 27244 + }, + { + "epoch": 1.6912905829039668, + "grad_norm": 0.14998533703741573, + "learning_rate": 4.7599259831624535e-05, + "loss": 2.8755, + "step": 27245 + }, + { + "epoch": 1.6913526600037248, + "grad_norm": 0.16844818534292588, + "learning_rate": 4.759565247915504e-05, + "loss": 2.7537, + "step": 27246 + }, + { + "epoch": 1.6914147371034827, + "grad_norm": 0.14428130007439946, + "learning_rate": 4.759204513922961e-05, + "loss": 2.7952, + "step": 27247 + }, + { + "epoch": 1.6914768142032406, + "grad_norm": 0.15576018210104517, + "learning_rate": 4.758843781186706e-05, + "loss": 2.7631, + "step": 27248 + }, + { + "epoch": 1.6915388913029983, + "grad_norm": 0.14811239277429086, + "learning_rate": 4.7584830497086226e-05, + "loss": 2.6814, + "step": 27249 + }, + { + "epoch": 1.6916009684027562, + "grad_norm": 0.15877598782584806, + "learning_rate": 4.7581223194905914e-05, + "loss": 2.8294, + "step": 27250 + }, + { + "epoch": 1.6916630455025141, + "grad_norm": 0.1405741612938866, + "learning_rate": 4.757761590534495e-05, + "loss": 2.8043, + "step": 27251 + }, + { + "epoch": 1.691725122602272, + "grad_norm": 0.15111958957347232, + "learning_rate": 4.757400862842213e-05, + "loss": 2.7625, + "step": 27252 + }, + { + "epoch": 1.69178719970203, + "grad_norm": 0.15026238163060487, + "learning_rate": 4.757040136415631e-05, + "loss": 2.8284, + "step": 27253 + }, + { + "epoch": 1.6918492768017879, + "grad_norm": 0.13920996594358928, + "learning_rate": 4.7566794112566305e-05, + "loss": 2.8108, + "step": 27254 + }, + { + "epoch": 1.6919113539015456, + "grad_norm": 0.16989423812087684, + "learning_rate": 4.756318687367093e-05, + "loss": 2.8143, + "step": 27255 + }, + { + "epoch": 1.6919734310013035, + "grad_norm": 0.1838111908712661, + "learning_rate": 4.755957964748898e-05, + "loss": 2.8407, + "step": 27256 + }, + { + "epoch": 1.6920355081010614, + "grad_norm": 0.15448505057187858, + "learning_rate": 4.755597243403932e-05, + "loss": 2.8159, + "step": 27257 + }, + { + "epoch": 1.6920975852008193, + "grad_norm": 0.14790246558410433, + "learning_rate": 4.755236523334073e-05, + "loss": 2.8123, + "step": 27258 + }, + { + "epoch": 1.6921596623005772, + "grad_norm": 0.14533393506932074, + "learning_rate": 4.754875804541205e-05, + "loss": 2.8369, + "step": 27259 + }, + { + "epoch": 1.6922217394003352, + "grad_norm": 0.15177421603909097, + "learning_rate": 4.7545150870272086e-05, + "loss": 2.8863, + "step": 27260 + }, + { + "epoch": 1.692283816500093, + "grad_norm": 0.1440640967439881, + "learning_rate": 4.754154370793968e-05, + "loss": 2.6768, + "step": 27261 + }, + { + "epoch": 1.692345893599851, + "grad_norm": 0.14991283690665125, + "learning_rate": 4.753793655843362e-05, + "loss": 2.7359, + "step": 27262 + }, + { + "epoch": 1.692407970699609, + "grad_norm": 0.15038104460285134, + "learning_rate": 4.753432942177276e-05, + "loss": 2.7743, + "step": 27263 + }, + { + "epoch": 1.6924700477993668, + "grad_norm": 0.16216280642634665, + "learning_rate": 4.7530722297975886e-05, + "loss": 2.7753, + "step": 27264 + }, + { + "epoch": 1.6925321248991247, + "grad_norm": 0.1542093535771497, + "learning_rate": 4.752711518706185e-05, + "loss": 2.7847, + "step": 27265 + }, + { + "epoch": 1.6925942019988827, + "grad_norm": 0.1514796177873053, + "learning_rate": 4.7523508089049436e-05, + "loss": 2.7375, + "step": 27266 + }, + { + "epoch": 1.6926562790986406, + "grad_norm": 0.15236967428638937, + "learning_rate": 4.7519901003957495e-05, + "loss": 2.7823, + "step": 27267 + }, + { + "epoch": 1.6927183561983985, + "grad_norm": 0.15315039750082468, + "learning_rate": 4.751629393180484e-05, + "loss": 2.8104, + "step": 27268 + }, + { + "epoch": 1.6927804332981564, + "grad_norm": 0.1558254972366565, + "learning_rate": 4.7512686872610264e-05, + "loss": 2.8749, + "step": 27269 + }, + { + "epoch": 1.6928425103979143, + "grad_norm": 0.16844427640974982, + "learning_rate": 4.7509079826392616e-05, + "loss": 2.8147, + "step": 27270 + }, + { + "epoch": 1.6929045874976723, + "grad_norm": 0.16114169587587976, + "learning_rate": 4.750547279317069e-05, + "loss": 2.767, + "step": 27271 + }, + { + "epoch": 1.6929666645974302, + "grad_norm": 0.15329227677131668, + "learning_rate": 4.7501865772963336e-05, + "loss": 2.7276, + "step": 27272 + }, + { + "epoch": 1.6930287416971879, + "grad_norm": 0.1572097094015406, + "learning_rate": 4.749825876578934e-05, + "loss": 2.8073, + "step": 27273 + }, + { + "epoch": 1.6930908187969458, + "grad_norm": 0.14594378219944912, + "learning_rate": 4.7494651771667544e-05, + "loss": 2.7976, + "step": 27274 + }, + { + "epoch": 1.6931528958967037, + "grad_norm": 0.15188476304475082, + "learning_rate": 4.749104479061675e-05, + "loss": 2.7314, + "step": 27275 + }, + { + "epoch": 1.6932149729964616, + "grad_norm": 0.15985832183359727, + "learning_rate": 4.7487437822655796e-05, + "loss": 2.6648, + "step": 27276 + }, + { + "epoch": 1.6932770500962195, + "grad_norm": 0.15016399374493342, + "learning_rate": 4.748383086780347e-05, + "loss": 2.8355, + "step": 27277 + }, + { + "epoch": 1.6933391271959775, + "grad_norm": 0.15793417215407668, + "learning_rate": 4.748022392607863e-05, + "loss": 2.7301, + "step": 27278 + }, + { + "epoch": 1.6934012042957352, + "grad_norm": 0.13824950624329482, + "learning_rate": 4.7476616997500066e-05, + "loss": 2.7703, + "step": 27279 + }, + { + "epoch": 1.693463281395493, + "grad_norm": 0.15017622986447565, + "learning_rate": 4.74730100820866e-05, + "loss": 2.8353, + "step": 27280 + }, + { + "epoch": 1.693525358495251, + "grad_norm": 0.14700657272606724, + "learning_rate": 4.746940317985706e-05, + "loss": 2.8554, + "step": 27281 + }, + { + "epoch": 1.693587435595009, + "grad_norm": 0.18538377520111787, + "learning_rate": 4.746579629083025e-05, + "loss": 2.8201, + "step": 27282 + }, + { + "epoch": 1.6936495126947668, + "grad_norm": 0.1567341099834664, + "learning_rate": 4.7462189415025007e-05, + "loss": 2.787, + "step": 27283 + }, + { + "epoch": 1.6937115897945247, + "grad_norm": 0.16272613100696753, + "learning_rate": 4.745858255246012e-05, + "loss": 2.7233, + "step": 27284 + }, + { + "epoch": 1.6937736668942827, + "grad_norm": 0.20149896012018956, + "learning_rate": 4.745497570315442e-05, + "loss": 2.6975, + "step": 27285 + }, + { + "epoch": 1.6938357439940406, + "grad_norm": 0.15357553899925963, + "learning_rate": 4.745136886712675e-05, + "loss": 2.7295, + "step": 27286 + }, + { + "epoch": 1.6938978210937985, + "grad_norm": 0.1547031606588042, + "learning_rate": 4.7447762044395896e-05, + "loss": 2.817, + "step": 27287 + }, + { + "epoch": 1.6939598981935564, + "grad_norm": 0.1801093684666648, + "learning_rate": 4.74441552349807e-05, + "loss": 2.7845, + "step": 27288 + }, + { + "epoch": 1.6940219752933143, + "grad_norm": 0.1489266004696186, + "learning_rate": 4.744054843889996e-05, + "loss": 2.7308, + "step": 27289 + }, + { + "epoch": 1.6940840523930722, + "grad_norm": 0.17353411251821152, + "learning_rate": 4.74369416561725e-05, + "loss": 2.6994, + "step": 27290 + }, + { + "epoch": 1.6941461294928302, + "grad_norm": 0.17720170949712472, + "learning_rate": 4.7433334886817134e-05, + "loss": 2.8233, + "step": 27291 + }, + { + "epoch": 1.694208206592588, + "grad_norm": 0.15274371316173302, + "learning_rate": 4.742972813085269e-05, + "loss": 2.778, + "step": 27292 + }, + { + "epoch": 1.694270283692346, + "grad_norm": 0.14655652494593868, + "learning_rate": 4.742612138829798e-05, + "loss": 2.7654, + "step": 27293 + }, + { + "epoch": 1.694332360792104, + "grad_norm": 0.1547754627646175, + "learning_rate": 4.7422514659171814e-05, + "loss": 2.7204, + "step": 27294 + }, + { + "epoch": 1.6943944378918618, + "grad_norm": 0.1627476318424401, + "learning_rate": 4.741890794349302e-05, + "loss": 2.8223, + "step": 27295 + }, + { + "epoch": 1.6944565149916198, + "grad_norm": 0.1482719703077931, + "learning_rate": 4.7415301241280394e-05, + "loss": 2.7383, + "step": 27296 + }, + { + "epoch": 1.6945185920913775, + "grad_norm": 0.17642188950639, + "learning_rate": 4.741169455255279e-05, + "loss": 2.853, + "step": 27297 + }, + { + "epoch": 1.6945806691911354, + "grad_norm": 0.14663706133439236, + "learning_rate": 4.740808787732899e-05, + "loss": 2.8476, + "step": 27298 + }, + { + "epoch": 1.6946427462908933, + "grad_norm": 0.1442657622732598, + "learning_rate": 4.7404481215627834e-05, + "loss": 2.7586, + "step": 27299 + }, + { + "epoch": 1.6947048233906512, + "grad_norm": 0.15658262134929482, + "learning_rate": 4.7400874567468115e-05, + "loss": 2.7578, + "step": 27300 + }, + { + "epoch": 1.6947669004904091, + "grad_norm": 0.15644628593532497, + "learning_rate": 4.739726793286868e-05, + "loss": 2.7772, + "step": 27301 + }, + { + "epoch": 1.694828977590167, + "grad_norm": 0.14941842565212085, + "learning_rate": 4.739366131184832e-05, + "loss": 2.7681, + "step": 27302 + }, + { + "epoch": 1.6948910546899247, + "grad_norm": 0.15575817905338313, + "learning_rate": 4.739005470442587e-05, + "loss": 2.7846, + "step": 27303 + }, + { + "epoch": 1.6949531317896827, + "grad_norm": 0.16148636495123547, + "learning_rate": 4.7386448110620133e-05, + "loss": 2.8093, + "step": 27304 + }, + { + "epoch": 1.6950152088894406, + "grad_norm": 0.149741847947176, + "learning_rate": 4.7382841530449926e-05, + "loss": 2.8212, + "step": 27305 + }, + { + "epoch": 1.6950772859891985, + "grad_norm": 0.18004551755722337, + "learning_rate": 4.737923496393408e-05, + "loss": 2.7624, + "step": 27306 + }, + { + "epoch": 1.6951393630889564, + "grad_norm": 0.14814107560192952, + "learning_rate": 4.737562841109139e-05, + "loss": 2.752, + "step": 27307 + }, + { + "epoch": 1.6952014401887143, + "grad_norm": 0.15161521648636023, + "learning_rate": 4.73720218719407e-05, + "loss": 2.7982, + "step": 27308 + }, + { + "epoch": 1.6952635172884722, + "grad_norm": 0.14067882593844072, + "learning_rate": 4.7368415346500795e-05, + "loss": 2.8182, + "step": 27309 + }, + { + "epoch": 1.6953255943882302, + "grad_norm": 0.16694225992631223, + "learning_rate": 4.736480883479052e-05, + "loss": 2.8212, + "step": 27310 + }, + { + "epoch": 1.695387671487988, + "grad_norm": 0.1448355384153245, + "learning_rate": 4.736120233682866e-05, + "loss": 2.8049, + "step": 27311 + }, + { + "epoch": 1.695449748587746, + "grad_norm": 0.14704179246627205, + "learning_rate": 4.735759585263406e-05, + "loss": 2.6798, + "step": 27312 + }, + { + "epoch": 1.695511825687504, + "grad_norm": 0.1659540679351019, + "learning_rate": 4.735398938222552e-05, + "loss": 2.8648, + "step": 27313 + }, + { + "epoch": 1.6955739027872618, + "grad_norm": 0.14880579508072175, + "learning_rate": 4.735038292562186e-05, + "loss": 2.7671, + "step": 27314 + }, + { + "epoch": 1.6956359798870198, + "grad_norm": 0.2219821446241534, + "learning_rate": 4.73467764828419e-05, + "loss": 2.8258, + "step": 27315 + }, + { + "epoch": 1.6956980569867777, + "grad_norm": 0.1470749647866213, + "learning_rate": 4.734317005390444e-05, + "loss": 2.7509, + "step": 27316 + }, + { + "epoch": 1.6957601340865356, + "grad_norm": 0.14540442691785738, + "learning_rate": 4.73395636388283e-05, + "loss": 2.7117, + "step": 27317 + }, + { + "epoch": 1.6958222111862935, + "grad_norm": 0.15544031280876125, + "learning_rate": 4.733595723763232e-05, + "loss": 2.7872, + "step": 27318 + }, + { + "epoch": 1.6958842882860514, + "grad_norm": 0.16123545874042794, + "learning_rate": 4.733235085033529e-05, + "loss": 2.768, + "step": 27319 + }, + { + "epoch": 1.6959463653858093, + "grad_norm": 0.15657325922759527, + "learning_rate": 4.732874447695604e-05, + "loss": 2.8185, + "step": 27320 + }, + { + "epoch": 1.696008442485567, + "grad_norm": 0.15280667908236267, + "learning_rate": 4.732513811751337e-05, + "loss": 2.7981, + "step": 27321 + }, + { + "epoch": 1.696070519585325, + "grad_norm": 0.20295535853187233, + "learning_rate": 4.7321531772026115e-05, + "loss": 2.781, + "step": 27322 + }, + { + "epoch": 1.6961325966850829, + "grad_norm": 0.17884998436245997, + "learning_rate": 4.731792544051307e-05, + "loss": 2.8127, + "step": 27323 + }, + { + "epoch": 1.6961946737848408, + "grad_norm": 0.14569801342054287, + "learning_rate": 4.731431912299306e-05, + "loss": 2.8241, + "step": 27324 + }, + { + "epoch": 1.6962567508845987, + "grad_norm": 0.15330699085262026, + "learning_rate": 4.7310712819484895e-05, + "loss": 2.7972, + "step": 27325 + }, + { + "epoch": 1.6963188279843566, + "grad_norm": 0.15406098623286496, + "learning_rate": 4.730710653000741e-05, + "loss": 2.7264, + "step": 27326 + }, + { + "epoch": 1.6963809050841143, + "grad_norm": 0.16185150548557656, + "learning_rate": 4.730350025457938e-05, + "loss": 2.6607, + "step": 27327 + }, + { + "epoch": 1.6964429821838722, + "grad_norm": 0.17081158201987814, + "learning_rate": 4.7299893993219665e-05, + "loss": 2.8313, + "step": 27328 + }, + { + "epoch": 1.6965050592836302, + "grad_norm": 0.16565728502682428, + "learning_rate": 4.729628774594706e-05, + "loss": 2.8026, + "step": 27329 + }, + { + "epoch": 1.696567136383388, + "grad_norm": 0.1500820797126624, + "learning_rate": 4.7292681512780367e-05, + "loss": 2.8355, + "step": 27330 + }, + { + "epoch": 1.696629213483146, + "grad_norm": 0.1706791533566867, + "learning_rate": 4.7289075293738415e-05, + "loss": 2.7837, + "step": 27331 + }, + { + "epoch": 1.696691290582904, + "grad_norm": 0.14085462816082212, + "learning_rate": 4.728546908884001e-05, + "loss": 2.8327, + "step": 27332 + }, + { + "epoch": 1.6967533676826618, + "grad_norm": 0.15224101196969042, + "learning_rate": 4.728186289810399e-05, + "loss": 2.8063, + "step": 27333 + }, + { + "epoch": 1.6968154447824197, + "grad_norm": 0.14986690760957572, + "learning_rate": 4.7278256721549127e-05, + "loss": 2.723, + "step": 27334 + }, + { + "epoch": 1.6968775218821777, + "grad_norm": 0.1615868730994265, + "learning_rate": 4.727465055919427e-05, + "loss": 2.8036, + "step": 27335 + }, + { + "epoch": 1.6969395989819356, + "grad_norm": 0.14268641083043077, + "learning_rate": 4.727104441105823e-05, + "loss": 2.7656, + "step": 27336 + }, + { + "epoch": 1.6970016760816935, + "grad_norm": 0.15556871780047896, + "learning_rate": 4.7267438277159804e-05, + "loss": 2.7931, + "step": 27337 + }, + { + "epoch": 1.6970637531814514, + "grad_norm": 0.1441335942587669, + "learning_rate": 4.726383215751781e-05, + "loss": 2.7668, + "step": 27338 + }, + { + "epoch": 1.6971258302812093, + "grad_norm": 0.14037703135702942, + "learning_rate": 4.726022605215108e-05, + "loss": 2.7429, + "step": 27339 + }, + { + "epoch": 1.6971879073809673, + "grad_norm": 0.15835659800965987, + "learning_rate": 4.725661996107842e-05, + "loss": 2.7258, + "step": 27340 + }, + { + "epoch": 1.6972499844807252, + "grad_norm": 0.15193252203156707, + "learning_rate": 4.7253013884318625e-05, + "loss": 2.7822, + "step": 27341 + }, + { + "epoch": 1.697312061580483, + "grad_norm": 0.15172917551193132, + "learning_rate": 4.724940782189054e-05, + "loss": 2.6493, + "step": 27342 + }, + { + "epoch": 1.697374138680241, + "grad_norm": 0.14211036429079216, + "learning_rate": 4.724580177381294e-05, + "loss": 2.8331, + "step": 27343 + }, + { + "epoch": 1.697436215779999, + "grad_norm": 0.16255221720010302, + "learning_rate": 4.7242195740104686e-05, + "loss": 2.7715, + "step": 27344 + }, + { + "epoch": 1.6974982928797566, + "grad_norm": 0.1418175989565009, + "learning_rate": 4.7238589720784545e-05, + "loss": 2.8324, + "step": 27345 + }, + { + "epoch": 1.6975603699795145, + "grad_norm": 0.14318039815217118, + "learning_rate": 4.7234983715871364e-05, + "loss": 2.8146, + "step": 27346 + }, + { + "epoch": 1.6976224470792725, + "grad_norm": 0.1475245799480141, + "learning_rate": 4.723137772538393e-05, + "loss": 2.8966, + "step": 27347 + }, + { + "epoch": 1.6976845241790304, + "grad_norm": 0.13669574407747137, + "learning_rate": 4.722777174934109e-05, + "loss": 2.7932, + "step": 27348 + }, + { + "epoch": 1.6977466012787883, + "grad_norm": 0.1591552586913157, + "learning_rate": 4.722416578776162e-05, + "loss": 2.7797, + "step": 27349 + }, + { + "epoch": 1.697808678378546, + "grad_norm": 0.21353120409913243, + "learning_rate": 4.722055984066434e-05, + "loss": 2.6687, + "step": 27350 + }, + { + "epoch": 1.697870755478304, + "grad_norm": 0.1450232238813168, + "learning_rate": 4.72169539080681e-05, + "loss": 2.8534, + "step": 27351 + }, + { + "epoch": 1.6979328325780618, + "grad_norm": 0.14902139939780576, + "learning_rate": 4.721334798999168e-05, + "loss": 2.7513, + "step": 27352 + }, + { + "epoch": 1.6979949096778197, + "grad_norm": 0.15483156946392823, + "learning_rate": 4.72097420864539e-05, + "loss": 2.7513, + "step": 27353 + }, + { + "epoch": 1.6980569867775777, + "grad_norm": 0.14272124149134885, + "learning_rate": 4.720613619747358e-05, + "loss": 2.8322, + "step": 27354 + }, + { + "epoch": 1.6981190638773356, + "grad_norm": 0.151705898208094, + "learning_rate": 4.7202530323069516e-05, + "loss": 2.8073, + "step": 27355 + }, + { + "epoch": 1.6981811409770935, + "grad_norm": 0.14068741870125345, + "learning_rate": 4.719892446326053e-05, + "loss": 2.7947, + "step": 27356 + }, + { + "epoch": 1.6982432180768514, + "grad_norm": 0.15481752364070347, + "learning_rate": 4.719531861806544e-05, + "loss": 2.7684, + "step": 27357 + }, + { + "epoch": 1.6983052951766093, + "grad_norm": 0.1409751982079858, + "learning_rate": 4.7191712787503055e-05, + "loss": 2.6979, + "step": 27358 + }, + { + "epoch": 1.6983673722763672, + "grad_norm": 0.1410679661942857, + "learning_rate": 4.7188106971592184e-05, + "loss": 2.777, + "step": 27359 + }, + { + "epoch": 1.6984294493761252, + "grad_norm": 0.13813165423171553, + "learning_rate": 4.718450117035164e-05, + "loss": 2.7505, + "step": 27360 + }, + { + "epoch": 1.698491526475883, + "grad_norm": 0.1736158457083959, + "learning_rate": 4.718089538380023e-05, + "loss": 2.7505, + "step": 27361 + }, + { + "epoch": 1.698553603575641, + "grad_norm": 0.13898441528053837, + "learning_rate": 4.717728961195679e-05, + "loss": 2.7827, + "step": 27362 + }, + { + "epoch": 1.698615680675399, + "grad_norm": 0.14154188937658255, + "learning_rate": 4.71736838548401e-05, + "loss": 2.8061, + "step": 27363 + }, + { + "epoch": 1.6986777577751568, + "grad_norm": 0.15321105660269588, + "learning_rate": 4.7170078112469005e-05, + "loss": 2.8278, + "step": 27364 + }, + { + "epoch": 1.6987398348749148, + "grad_norm": 0.1456591695801056, + "learning_rate": 4.716647238486229e-05, + "loss": 2.7228, + "step": 27365 + }, + { + "epoch": 1.6988019119746727, + "grad_norm": 0.14920852586161573, + "learning_rate": 4.716286667203877e-05, + "loss": 2.81, + "step": 27366 + }, + { + "epoch": 1.6988639890744306, + "grad_norm": 0.15169902237179195, + "learning_rate": 4.715926097401727e-05, + "loss": 2.7672, + "step": 27367 + }, + { + "epoch": 1.6989260661741883, + "grad_norm": 0.1479250633236226, + "learning_rate": 4.715565529081659e-05, + "loss": 2.8374, + "step": 27368 + }, + { + "epoch": 1.6989881432739462, + "grad_norm": 0.16043608919964114, + "learning_rate": 4.715204962245555e-05, + "loss": 2.7869, + "step": 27369 + }, + { + "epoch": 1.6990502203737041, + "grad_norm": 0.15775179356327712, + "learning_rate": 4.7148443968952954e-05, + "loss": 2.8281, + "step": 27370 + }, + { + "epoch": 1.699112297473462, + "grad_norm": 0.14963741687093282, + "learning_rate": 4.714483833032763e-05, + "loss": 2.7482, + "step": 27371 + }, + { + "epoch": 1.69917437457322, + "grad_norm": 0.14038605390143058, + "learning_rate": 4.7141232706598364e-05, + "loss": 2.8085, + "step": 27372 + }, + { + "epoch": 1.6992364516729779, + "grad_norm": 0.1516191538881105, + "learning_rate": 4.713762709778399e-05, + "loss": 2.7896, + "step": 27373 + }, + { + "epoch": 1.6992985287727356, + "grad_norm": 0.14973758129495346, + "learning_rate": 4.7134021503903306e-05, + "loss": 2.8937, + "step": 27374 + }, + { + "epoch": 1.6993606058724935, + "grad_norm": 0.14502491924924465, + "learning_rate": 4.713041592497513e-05, + "loss": 2.7666, + "step": 27375 + }, + { + "epoch": 1.6994226829722514, + "grad_norm": 0.1493035699529972, + "learning_rate": 4.7126810361018275e-05, + "loss": 2.7941, + "step": 27376 + }, + { + "epoch": 1.6994847600720093, + "grad_norm": 0.16978549965578318, + "learning_rate": 4.712320481205154e-05, + "loss": 2.76, + "step": 27377 + }, + { + "epoch": 1.6995468371717672, + "grad_norm": 0.15886037780521747, + "learning_rate": 4.711959927809374e-05, + "loss": 2.8458, + "step": 27378 + }, + { + "epoch": 1.6996089142715252, + "grad_norm": 0.14983336511133527, + "learning_rate": 4.7115993759163696e-05, + "loss": 2.8535, + "step": 27379 + }, + { + "epoch": 1.699670991371283, + "grad_norm": 0.1442523275366091, + "learning_rate": 4.711238825528022e-05, + "loss": 2.801, + "step": 27380 + }, + { + "epoch": 1.699733068471041, + "grad_norm": 0.15964176534585375, + "learning_rate": 4.71087827664621e-05, + "loss": 2.7688, + "step": 27381 + }, + { + "epoch": 1.699795145570799, + "grad_norm": 0.15142935429941676, + "learning_rate": 4.710517729272818e-05, + "loss": 2.7784, + "step": 27382 + }, + { + "epoch": 1.6998572226705568, + "grad_norm": 0.14738167280802422, + "learning_rate": 4.7101571834097227e-05, + "loss": 2.7808, + "step": 27383 + }, + { + "epoch": 1.6999192997703148, + "grad_norm": 0.1571352853659626, + "learning_rate": 4.7097966390588084e-05, + "loss": 2.7669, + "step": 27384 + }, + { + "epoch": 1.6999813768700727, + "grad_norm": 0.1501673614130062, + "learning_rate": 4.709436096221957e-05, + "loss": 2.7279, + "step": 27385 + }, + { + "epoch": 1.7000434539698306, + "grad_norm": 0.1701259921715567, + "learning_rate": 4.7090755549010466e-05, + "loss": 2.8482, + "step": 27386 + }, + { + "epoch": 1.7001055310695885, + "grad_norm": 0.1535729010218704, + "learning_rate": 4.708715015097961e-05, + "loss": 2.7968, + "step": 27387 + }, + { + "epoch": 1.7001676081693464, + "grad_norm": 0.1534691046099177, + "learning_rate": 4.708354476814579e-05, + "loss": 2.8049, + "step": 27388 + }, + { + "epoch": 1.7002296852691043, + "grad_norm": 0.14071403439944027, + "learning_rate": 4.707993940052783e-05, + "loss": 2.8043, + "step": 27389 + }, + { + "epoch": 1.7002917623688623, + "grad_norm": 0.14016175794038438, + "learning_rate": 4.707633404814454e-05, + "loss": 2.7532, + "step": 27390 + }, + { + "epoch": 1.7003538394686202, + "grad_norm": 0.1604027247150203, + "learning_rate": 4.707272871101472e-05, + "loss": 2.8437, + "step": 27391 + }, + { + "epoch": 1.7004159165683779, + "grad_norm": 0.153093419175687, + "learning_rate": 4.706912338915719e-05, + "loss": 2.8437, + "step": 27392 + }, + { + "epoch": 1.7004779936681358, + "grad_norm": 0.16175119631994062, + "learning_rate": 4.7065518082590735e-05, + "loss": 2.8308, + "step": 27393 + }, + { + "epoch": 1.7005400707678937, + "grad_norm": 0.153747861467391, + "learning_rate": 4.7061912791334206e-05, + "loss": 2.7765, + "step": 27394 + }, + { + "epoch": 1.7006021478676516, + "grad_norm": 0.14871475402207285, + "learning_rate": 4.705830751540638e-05, + "loss": 2.7696, + "step": 27395 + }, + { + "epoch": 1.7006642249674095, + "grad_norm": 0.15668763028585633, + "learning_rate": 4.705470225482608e-05, + "loss": 2.7565, + "step": 27396 + }, + { + "epoch": 1.7007263020671675, + "grad_norm": 0.15868083038301642, + "learning_rate": 4.705109700961211e-05, + "loss": 2.684, + "step": 27397 + }, + { + "epoch": 1.7007883791669252, + "grad_norm": 0.15320610846883034, + "learning_rate": 4.70474917797833e-05, + "loss": 2.8076, + "step": 27398 + }, + { + "epoch": 1.700850456266683, + "grad_norm": 0.16544375376704054, + "learning_rate": 4.704388656535842e-05, + "loss": 2.7659, + "step": 27399 + }, + { + "epoch": 1.700912533366441, + "grad_norm": 0.18060760646943924, + "learning_rate": 4.704028136635632e-05, + "loss": 2.7346, + "step": 27400 + }, + { + "epoch": 1.700974610466199, + "grad_norm": 0.15578415090295294, + "learning_rate": 4.703667618279578e-05, + "loss": 2.7879, + "step": 27401 + }, + { + "epoch": 1.7010366875659568, + "grad_norm": 0.15330603101754287, + "learning_rate": 4.703307101469562e-05, + "loss": 2.7691, + "step": 27402 + }, + { + "epoch": 1.7010987646657147, + "grad_norm": 0.16252395173680803, + "learning_rate": 4.702946586207465e-05, + "loss": 2.8089, + "step": 27403 + }, + { + "epoch": 1.7011608417654727, + "grad_norm": 0.1401110236771885, + "learning_rate": 4.702586072495168e-05, + "loss": 2.8374, + "step": 27404 + }, + { + "epoch": 1.7012229188652306, + "grad_norm": 0.15408827769136685, + "learning_rate": 4.702225560334551e-05, + "loss": 2.693, + "step": 27405 + }, + { + "epoch": 1.7012849959649885, + "grad_norm": 0.14684341298990328, + "learning_rate": 4.701865049727496e-05, + "loss": 2.7767, + "step": 27406 + }, + { + "epoch": 1.7013470730647464, + "grad_norm": 0.14956528982724013, + "learning_rate": 4.7015045406758834e-05, + "loss": 2.8034, + "step": 27407 + }, + { + "epoch": 1.7014091501645043, + "grad_norm": 0.15471402648860946, + "learning_rate": 4.701144033181594e-05, + "loss": 2.908, + "step": 27408 + }, + { + "epoch": 1.7014712272642623, + "grad_norm": 0.14590547613607435, + "learning_rate": 4.700783527246509e-05, + "loss": 2.7731, + "step": 27409 + }, + { + "epoch": 1.7015333043640202, + "grad_norm": 0.14861985349367, + "learning_rate": 4.700423022872508e-05, + "loss": 2.7901, + "step": 27410 + }, + { + "epoch": 1.701595381463778, + "grad_norm": 0.14751685894778432, + "learning_rate": 4.700062520061474e-05, + "loss": 2.755, + "step": 27411 + }, + { + "epoch": 1.701657458563536, + "grad_norm": 0.15581204309943544, + "learning_rate": 4.699702018815286e-05, + "loss": 2.7103, + "step": 27412 + }, + { + "epoch": 1.701719535663294, + "grad_norm": 0.15706608561435384, + "learning_rate": 4.699341519135826e-05, + "loss": 2.8248, + "step": 27413 + }, + { + "epoch": 1.7017816127630518, + "grad_norm": 0.14080428245192816, + "learning_rate": 4.698981021024975e-05, + "loss": 2.7492, + "step": 27414 + }, + { + "epoch": 1.7018436898628098, + "grad_norm": 0.14241350808523026, + "learning_rate": 4.698620524484611e-05, + "loss": 2.7283, + "step": 27415 + }, + { + "epoch": 1.7019057669625675, + "grad_norm": 0.1553171863942624, + "learning_rate": 4.698260029516616e-05, + "loss": 2.7347, + "step": 27416 + }, + { + "epoch": 1.7019678440623254, + "grad_norm": 0.14682985890424122, + "learning_rate": 4.6978995361228745e-05, + "loss": 2.8243, + "step": 27417 + }, + { + "epoch": 1.7020299211620833, + "grad_norm": 0.14713758422020165, + "learning_rate": 4.697539044305263e-05, + "loss": 2.7829, + "step": 27418 + }, + { + "epoch": 1.7020919982618412, + "grad_norm": 0.1471218416540209, + "learning_rate": 4.6971785540656646e-05, + "loss": 2.6935, + "step": 27419 + }, + { + "epoch": 1.7021540753615991, + "grad_norm": 0.14712662726849826, + "learning_rate": 4.6968180654059584e-05, + "loss": 2.8624, + "step": 27420 + }, + { + "epoch": 1.702216152461357, + "grad_norm": 0.1545114293033655, + "learning_rate": 4.6964575783280266e-05, + "loss": 2.8103, + "step": 27421 + }, + { + "epoch": 1.7022782295611147, + "grad_norm": 0.14619550832982878, + "learning_rate": 4.696097092833749e-05, + "loss": 2.7538, + "step": 27422 + }, + { + "epoch": 1.7023403066608727, + "grad_norm": 0.15664510244881455, + "learning_rate": 4.695736608925007e-05, + "loss": 2.6336, + "step": 27423 + }, + { + "epoch": 1.7024023837606306, + "grad_norm": 0.1470473913743666, + "learning_rate": 4.69537612660368e-05, + "loss": 2.7537, + "step": 27424 + }, + { + "epoch": 1.7024644608603885, + "grad_norm": 0.1619290647470353, + "learning_rate": 4.695015645871651e-05, + "loss": 2.7585, + "step": 27425 + }, + { + "epoch": 1.7025265379601464, + "grad_norm": 0.14421765354934113, + "learning_rate": 4.6946551667308e-05, + "loss": 2.7729, + "step": 27426 + }, + { + "epoch": 1.7025886150599043, + "grad_norm": 0.14394753090793683, + "learning_rate": 4.694294689183005e-05, + "loss": 2.7085, + "step": 27427 + }, + { + "epoch": 1.7026506921596622, + "grad_norm": 0.15460865294300885, + "learning_rate": 4.6939342132301506e-05, + "loss": 2.8078, + "step": 27428 + }, + { + "epoch": 1.7027127692594202, + "grad_norm": 0.1566614099159201, + "learning_rate": 4.693573738874114e-05, + "loss": 2.7735, + "step": 27429 + }, + { + "epoch": 1.702774846359178, + "grad_norm": 0.17297435429220012, + "learning_rate": 4.6932132661167794e-05, + "loss": 2.7526, + "step": 27430 + }, + { + "epoch": 1.702836923458936, + "grad_norm": 0.16653858170536345, + "learning_rate": 4.692852794960024e-05, + "loss": 2.8278, + "step": 27431 + }, + { + "epoch": 1.702899000558694, + "grad_norm": 0.15830122296355817, + "learning_rate": 4.692492325405731e-05, + "loss": 2.8576, + "step": 27432 + }, + { + "epoch": 1.7029610776584518, + "grad_norm": 0.16160191540291113, + "learning_rate": 4.69213185745578e-05, + "loss": 2.684, + "step": 27433 + }, + { + "epoch": 1.7030231547582098, + "grad_norm": 0.162346603131671, + "learning_rate": 4.691771391112052e-05, + "loss": 2.7691, + "step": 27434 + }, + { + "epoch": 1.7030852318579677, + "grad_norm": 0.17566458751421024, + "learning_rate": 4.6914109263764274e-05, + "loss": 2.6932, + "step": 27435 + }, + { + "epoch": 1.7031473089577256, + "grad_norm": 0.18685069509800054, + "learning_rate": 4.6910504632507876e-05, + "loss": 2.8265, + "step": 27436 + }, + { + "epoch": 1.7032093860574835, + "grad_norm": 0.2048330932161317, + "learning_rate": 4.690690001737011e-05, + "loss": 2.8831, + "step": 27437 + }, + { + "epoch": 1.7032714631572414, + "grad_norm": 0.14956140421423308, + "learning_rate": 4.690329541836981e-05, + "loss": 2.8586, + "step": 27438 + }, + { + "epoch": 1.7033335402569993, + "grad_norm": 0.18719100326354923, + "learning_rate": 4.6899690835525774e-05, + "loss": 2.8146, + "step": 27439 + }, + { + "epoch": 1.703395617356757, + "grad_norm": 0.1433754859935203, + "learning_rate": 4.689608626885679e-05, + "loss": 2.7965, + "step": 27440 + }, + { + "epoch": 1.703457694456515, + "grad_norm": 0.16964309119176135, + "learning_rate": 4.689248171838168e-05, + "loss": 2.7441, + "step": 27441 + }, + { + "epoch": 1.7035197715562729, + "grad_norm": 0.15680612302078104, + "learning_rate": 4.688887718411925e-05, + "loss": 2.7682, + "step": 27442 + }, + { + "epoch": 1.7035818486560308, + "grad_norm": 0.16174489146335833, + "learning_rate": 4.6885272666088305e-05, + "loss": 2.686, + "step": 27443 + }, + { + "epoch": 1.7036439257557887, + "grad_norm": 0.14933226721322201, + "learning_rate": 4.688166816430764e-05, + "loss": 2.8113, + "step": 27444 + }, + { + "epoch": 1.7037060028555466, + "grad_norm": 0.16174340589973077, + "learning_rate": 4.687806367879608e-05, + "loss": 2.864, + "step": 27445 + }, + { + "epoch": 1.7037680799553043, + "grad_norm": 0.1760285121942519, + "learning_rate": 4.6874459209572405e-05, + "loss": 2.7904, + "step": 27446 + }, + { + "epoch": 1.7038301570550622, + "grad_norm": 0.13828428259719286, + "learning_rate": 4.687085475665546e-05, + "loss": 2.7726, + "step": 27447 + }, + { + "epoch": 1.7038922341548202, + "grad_norm": 0.16350138736677233, + "learning_rate": 4.686725032006399e-05, + "loss": 2.7583, + "step": 27448 + }, + { + "epoch": 1.703954311254578, + "grad_norm": 0.14449368452630595, + "learning_rate": 4.686364589981685e-05, + "loss": 2.755, + "step": 27449 + }, + { + "epoch": 1.704016388354336, + "grad_norm": 0.17692925374276924, + "learning_rate": 4.6860041495932845e-05, + "loss": 2.7725, + "step": 27450 + }, + { + "epoch": 1.704078465454094, + "grad_norm": 0.1902794954811384, + "learning_rate": 4.685643710843076e-05, + "loss": 2.9072, + "step": 27451 + }, + { + "epoch": 1.7041405425538518, + "grad_norm": 0.1963677983818015, + "learning_rate": 4.68528327373294e-05, + "loss": 2.8048, + "step": 27452 + }, + { + "epoch": 1.7042026196536098, + "grad_norm": 0.16413847549207575, + "learning_rate": 4.684922838264758e-05, + "loss": 2.815, + "step": 27453 + }, + { + "epoch": 1.7042646967533677, + "grad_norm": 0.1574532846679211, + "learning_rate": 4.684562404440409e-05, + "loss": 2.7898, + "step": 27454 + }, + { + "epoch": 1.7043267738531256, + "grad_norm": 0.15480185217117406, + "learning_rate": 4.684201972261776e-05, + "loss": 2.8098, + "step": 27455 + }, + { + "epoch": 1.7043888509528835, + "grad_norm": 0.16596674480027776, + "learning_rate": 4.683841541730737e-05, + "loss": 2.86, + "step": 27456 + }, + { + "epoch": 1.7044509280526414, + "grad_norm": 0.14931272673080687, + "learning_rate": 4.683481112849175e-05, + "loss": 2.7795, + "step": 27457 + }, + { + "epoch": 1.7045130051523993, + "grad_norm": 0.14546953248203817, + "learning_rate": 4.683120685618967e-05, + "loss": 2.8104, + "step": 27458 + }, + { + "epoch": 1.7045750822521573, + "grad_norm": 0.21165336007308483, + "learning_rate": 4.6827602600419966e-05, + "loss": 2.709, + "step": 27459 + }, + { + "epoch": 1.7046371593519152, + "grad_norm": 0.14793524345639905, + "learning_rate": 4.682399836120142e-05, + "loss": 2.8165, + "step": 27460 + }, + { + "epoch": 1.704699236451673, + "grad_norm": 0.14555024162080446, + "learning_rate": 4.682039413855285e-05, + "loss": 2.8306, + "step": 27461 + }, + { + "epoch": 1.704761313551431, + "grad_norm": 0.1530610583569637, + "learning_rate": 4.681678993249305e-05, + "loss": 2.7065, + "step": 27462 + }, + { + "epoch": 1.704823390651189, + "grad_norm": 0.184240177128206, + "learning_rate": 4.681318574304084e-05, + "loss": 2.795, + "step": 27463 + }, + { + "epoch": 1.7048854677509466, + "grad_norm": 0.15427199295873667, + "learning_rate": 4.680958157021502e-05, + "loss": 2.7603, + "step": 27464 + }, + { + "epoch": 1.7049475448507045, + "grad_norm": 0.1806501434737902, + "learning_rate": 4.680597741403436e-05, + "loss": 2.7759, + "step": 27465 + }, + { + "epoch": 1.7050096219504625, + "grad_norm": 0.2089026014826834, + "learning_rate": 4.680237327451772e-05, + "loss": 2.7556, + "step": 27466 + }, + { + "epoch": 1.7050716990502204, + "grad_norm": 0.1633895628851662, + "learning_rate": 4.679876915168385e-05, + "loss": 2.8248, + "step": 27467 + }, + { + "epoch": 1.7051337761499783, + "grad_norm": 0.15252294760556342, + "learning_rate": 4.6795165045551595e-05, + "loss": 2.7487, + "step": 27468 + }, + { + "epoch": 1.7051958532497362, + "grad_norm": 0.1786092249856572, + "learning_rate": 4.6791560956139734e-05, + "loss": 2.7485, + "step": 27469 + }, + { + "epoch": 1.705257930349494, + "grad_norm": 0.20435851864752366, + "learning_rate": 4.6787956883467086e-05, + "loss": 2.8216, + "step": 27470 + }, + { + "epoch": 1.7053200074492518, + "grad_norm": 0.1481943460397067, + "learning_rate": 4.678435282755244e-05, + "loss": 2.8031, + "step": 27471 + }, + { + "epoch": 1.7053820845490097, + "grad_norm": 0.16384057521127007, + "learning_rate": 4.678074878841461e-05, + "loss": 2.7652, + "step": 27472 + }, + { + "epoch": 1.7054441616487677, + "grad_norm": 0.1716266322259741, + "learning_rate": 4.677714476607239e-05, + "loss": 2.8425, + "step": 27473 + }, + { + "epoch": 1.7055062387485256, + "grad_norm": 0.14892791996655258, + "learning_rate": 4.677354076054459e-05, + "loss": 2.8027, + "step": 27474 + }, + { + "epoch": 1.7055683158482835, + "grad_norm": 0.14429227945582448, + "learning_rate": 4.676993677185002e-05, + "loss": 2.7853, + "step": 27475 + }, + { + "epoch": 1.7056303929480414, + "grad_norm": 0.14248862135364163, + "learning_rate": 4.676633280000746e-05, + "loss": 2.7301, + "step": 27476 + }, + { + "epoch": 1.7056924700477993, + "grad_norm": 0.1447290729196868, + "learning_rate": 4.676272884503574e-05, + "loss": 2.6814, + "step": 27477 + }, + { + "epoch": 1.7057545471475573, + "grad_norm": 0.14046604391510514, + "learning_rate": 4.675912490695364e-05, + "loss": 2.7598, + "step": 27478 + }, + { + "epoch": 1.7058166242473152, + "grad_norm": 0.15764677864259447, + "learning_rate": 4.675552098577998e-05, + "loss": 2.7313, + "step": 27479 + }, + { + "epoch": 1.705878701347073, + "grad_norm": 0.15232630776544065, + "learning_rate": 4.6751917081533544e-05, + "loss": 2.6694, + "step": 27480 + }, + { + "epoch": 1.705940778446831, + "grad_norm": 0.15599197899046063, + "learning_rate": 4.674831319423313e-05, + "loss": 2.7334, + "step": 27481 + }, + { + "epoch": 1.706002855546589, + "grad_norm": 0.16352329315453282, + "learning_rate": 4.6744709323897584e-05, + "loss": 2.822, + "step": 27482 + }, + { + "epoch": 1.7060649326463468, + "grad_norm": 0.16347524011184192, + "learning_rate": 4.6741105470545666e-05, + "loss": 2.8122, + "step": 27483 + }, + { + "epoch": 1.7061270097461048, + "grad_norm": 0.15211146325322264, + "learning_rate": 4.67375016341962e-05, + "loss": 2.8249, + "step": 27484 + }, + { + "epoch": 1.7061890868458627, + "grad_norm": 0.16933386558312336, + "learning_rate": 4.673389781486797e-05, + "loss": 2.8564, + "step": 27485 + }, + { + "epoch": 1.7062511639456206, + "grad_norm": 0.16088064865459917, + "learning_rate": 4.673029401257981e-05, + "loss": 2.8153, + "step": 27486 + }, + { + "epoch": 1.7063132410453785, + "grad_norm": 0.15845235572479993, + "learning_rate": 4.6726690227350474e-05, + "loss": 2.8144, + "step": 27487 + }, + { + "epoch": 1.7063753181451362, + "grad_norm": 0.14873217520984083, + "learning_rate": 4.672308645919881e-05, + "loss": 2.7625, + "step": 27488 + }, + { + "epoch": 1.7064373952448941, + "grad_norm": 0.16527808109613132, + "learning_rate": 4.671948270814359e-05, + "loss": 2.8073, + "step": 27489 + }, + { + "epoch": 1.706499472344652, + "grad_norm": 0.14442760385469258, + "learning_rate": 4.671587897420362e-05, + "loss": 2.8216, + "step": 27490 + }, + { + "epoch": 1.70656154944441, + "grad_norm": 0.15467349697079177, + "learning_rate": 4.671227525739772e-05, + "loss": 2.7882, + "step": 27491 + }, + { + "epoch": 1.7066236265441679, + "grad_norm": 0.14920993024914603, + "learning_rate": 4.670867155774466e-05, + "loss": 2.7949, + "step": 27492 + }, + { + "epoch": 1.7066857036439258, + "grad_norm": 0.19102539396366452, + "learning_rate": 4.670506787526328e-05, + "loss": 2.7697, + "step": 27493 + }, + { + "epoch": 1.7067477807436835, + "grad_norm": 0.14939756239993976, + "learning_rate": 4.670146420997235e-05, + "loss": 2.7678, + "step": 27494 + }, + { + "epoch": 1.7068098578434414, + "grad_norm": 0.14778191955811604, + "learning_rate": 4.6697860561890686e-05, + "loss": 2.8888, + "step": 27495 + }, + { + "epoch": 1.7068719349431993, + "grad_norm": 0.14583873880677126, + "learning_rate": 4.669425693103707e-05, + "loss": 2.7062, + "step": 27496 + }, + { + "epoch": 1.7069340120429572, + "grad_norm": 0.17032147152045582, + "learning_rate": 4.6690653317430335e-05, + "loss": 2.8135, + "step": 27497 + }, + { + "epoch": 1.7069960891427152, + "grad_norm": 0.14443289142100002, + "learning_rate": 4.6687049721089254e-05, + "loss": 2.6786, + "step": 27498 + }, + { + "epoch": 1.707058166242473, + "grad_norm": 0.17196789853534922, + "learning_rate": 4.6683446142032646e-05, + "loss": 2.7585, + "step": 27499 + }, + { + "epoch": 1.707120243342231, + "grad_norm": 0.15724979252718171, + "learning_rate": 4.6679842580279316e-05, + "loss": 2.784, + "step": 27500 + }, + { + "epoch": 1.707182320441989, + "grad_norm": 0.1614420149360481, + "learning_rate": 4.667623903584803e-05, + "loss": 2.8008, + "step": 27501 + }, + { + "epoch": 1.7072443975417468, + "grad_norm": 0.16833649898100275, + "learning_rate": 4.667263550875763e-05, + "loss": 2.8244, + "step": 27502 + }, + { + "epoch": 1.7073064746415048, + "grad_norm": 0.1684235691615646, + "learning_rate": 4.6669031999026885e-05, + "loss": 2.8358, + "step": 27503 + }, + { + "epoch": 1.7073685517412627, + "grad_norm": 0.1500158780853422, + "learning_rate": 4.6665428506674616e-05, + "loss": 2.7726, + "step": 27504 + }, + { + "epoch": 1.7074306288410206, + "grad_norm": 0.15185254330285694, + "learning_rate": 4.666182503171961e-05, + "loss": 2.7107, + "step": 27505 + }, + { + "epoch": 1.7074927059407785, + "grad_norm": 0.15172702103805727, + "learning_rate": 4.665822157418068e-05, + "loss": 2.7795, + "step": 27506 + }, + { + "epoch": 1.7075547830405364, + "grad_norm": 0.15625686630421048, + "learning_rate": 4.6654618134076616e-05, + "loss": 2.6823, + "step": 27507 + }, + { + "epoch": 1.7076168601402943, + "grad_norm": 0.16258997512407722, + "learning_rate": 4.665101471142623e-05, + "loss": 2.7879, + "step": 27508 + }, + { + "epoch": 1.7076789372400523, + "grad_norm": 0.15625243781095396, + "learning_rate": 4.664741130624829e-05, + "loss": 2.8066, + "step": 27509 + }, + { + "epoch": 1.7077410143398102, + "grad_norm": 0.14102786029560152, + "learning_rate": 4.664380791856164e-05, + "loss": 2.7489, + "step": 27510 + }, + { + "epoch": 1.707803091439568, + "grad_norm": 0.15794248656917476, + "learning_rate": 4.6640204548385067e-05, + "loss": 2.8419, + "step": 27511 + }, + { + "epoch": 1.7078651685393258, + "grad_norm": 0.1418041490102504, + "learning_rate": 4.6636601195737336e-05, + "loss": 2.7713, + "step": 27512 + }, + { + "epoch": 1.7079272456390837, + "grad_norm": 0.18800241986896074, + "learning_rate": 4.6632997860637297e-05, + "loss": 2.8309, + "step": 27513 + }, + { + "epoch": 1.7079893227388416, + "grad_norm": 0.15059378760591263, + "learning_rate": 4.66293945431037e-05, + "loss": 2.7641, + "step": 27514 + }, + { + "epoch": 1.7080513998385995, + "grad_norm": 0.14561361907928802, + "learning_rate": 4.6625791243155375e-05, + "loss": 2.726, + "step": 27515 + }, + { + "epoch": 1.7081134769383575, + "grad_norm": 0.159604961317433, + "learning_rate": 4.662218796081113e-05, + "loss": 2.8583, + "step": 27516 + }, + { + "epoch": 1.7081755540381154, + "grad_norm": 0.1464318880779691, + "learning_rate": 4.661858469608974e-05, + "loss": 2.815, + "step": 27517 + }, + { + "epoch": 1.708237631137873, + "grad_norm": 0.16167690431384096, + "learning_rate": 4.6614981449010025e-05, + "loss": 2.797, + "step": 27518 + }, + { + "epoch": 1.708299708237631, + "grad_norm": 0.14559040029986964, + "learning_rate": 4.661137821959076e-05, + "loss": 2.7653, + "step": 27519 + }, + { + "epoch": 1.708361785337389, + "grad_norm": 0.15649655677965527, + "learning_rate": 4.660777500785077e-05, + "loss": 2.8211, + "step": 27520 + }, + { + "epoch": 1.7084238624371468, + "grad_norm": 0.1648910129281929, + "learning_rate": 4.6604171813808836e-05, + "loss": 2.7248, + "step": 27521 + }, + { + "epoch": 1.7084859395369048, + "grad_norm": 0.18328497670670277, + "learning_rate": 4.660056863748377e-05, + "loss": 2.8421, + "step": 27522 + }, + { + "epoch": 1.7085480166366627, + "grad_norm": 0.14254343577581258, + "learning_rate": 4.659696547889435e-05, + "loss": 2.7685, + "step": 27523 + }, + { + "epoch": 1.7086100937364206, + "grad_norm": 0.15186093924625693, + "learning_rate": 4.65933623380594e-05, + "loss": 2.6849, + "step": 27524 + }, + { + "epoch": 1.7086721708361785, + "grad_norm": 0.14673994023027434, + "learning_rate": 4.6589759214997704e-05, + "loss": 2.7175, + "step": 27525 + }, + { + "epoch": 1.7087342479359364, + "grad_norm": 0.18904093509040928, + "learning_rate": 4.6586156109728054e-05, + "loss": 2.777, + "step": 27526 + }, + { + "epoch": 1.7087963250356943, + "grad_norm": 0.1509028665720622, + "learning_rate": 4.658255302226926e-05, + "loss": 2.8185, + "step": 27527 + }, + { + "epoch": 1.7088584021354523, + "grad_norm": 0.140836761561498, + "learning_rate": 4.657894995264011e-05, + "loss": 2.7183, + "step": 27528 + }, + { + "epoch": 1.7089204792352102, + "grad_norm": 0.1546496812269858, + "learning_rate": 4.657534690085942e-05, + "loss": 2.8148, + "step": 27529 + }, + { + "epoch": 1.708982556334968, + "grad_norm": 0.15311854139642572, + "learning_rate": 4.6571743866945964e-05, + "loss": 2.7991, + "step": 27530 + }, + { + "epoch": 1.709044633434726, + "grad_norm": 0.1435116294591221, + "learning_rate": 4.656814085091857e-05, + "loss": 2.7601, + "step": 27531 + }, + { + "epoch": 1.709106710534484, + "grad_norm": 0.1864388558712186, + "learning_rate": 4.6564537852795996e-05, + "loss": 2.7489, + "step": 27532 + }, + { + "epoch": 1.7091687876342418, + "grad_norm": 0.14935442413687103, + "learning_rate": 4.656093487259708e-05, + "loss": 2.8066, + "step": 27533 + }, + { + "epoch": 1.7092308647339998, + "grad_norm": 0.15142031321843485, + "learning_rate": 4.655733191034059e-05, + "loss": 2.8051, + "step": 27534 + }, + { + "epoch": 1.7092929418337577, + "grad_norm": 0.1404116323162394, + "learning_rate": 4.655372896604534e-05, + "loss": 2.7965, + "step": 27535 + }, + { + "epoch": 1.7093550189335154, + "grad_norm": 0.16936020264791535, + "learning_rate": 4.655012603973013e-05, + "loss": 2.7472, + "step": 27536 + }, + { + "epoch": 1.7094170960332733, + "grad_norm": 0.15176457570444832, + "learning_rate": 4.654652313141374e-05, + "loss": 2.6854, + "step": 27537 + }, + { + "epoch": 1.7094791731330312, + "grad_norm": 0.13679210874575337, + "learning_rate": 4.654292024111499e-05, + "loss": 2.6656, + "step": 27538 + }, + { + "epoch": 1.7095412502327891, + "grad_norm": 0.14943083500687027, + "learning_rate": 4.653931736885265e-05, + "loss": 2.7693, + "step": 27539 + }, + { + "epoch": 1.709603327332547, + "grad_norm": 0.1397722602301271, + "learning_rate": 4.653571451464554e-05, + "loss": 2.8213, + "step": 27540 + }, + { + "epoch": 1.709665404432305, + "grad_norm": 0.1550966491973968, + "learning_rate": 4.6532111678512434e-05, + "loss": 2.8454, + "step": 27541 + }, + { + "epoch": 1.7097274815320627, + "grad_norm": 0.1578716997962994, + "learning_rate": 4.6528508860472164e-05, + "loss": 2.8385, + "step": 27542 + }, + { + "epoch": 1.7097895586318206, + "grad_norm": 0.15645861408392162, + "learning_rate": 4.652490606054349e-05, + "loss": 2.827, + "step": 27543 + }, + { + "epoch": 1.7098516357315785, + "grad_norm": 0.15414699083070585, + "learning_rate": 4.6521303278745234e-05, + "loss": 2.7905, + "step": 27544 + }, + { + "epoch": 1.7099137128313364, + "grad_norm": 0.14900233693058368, + "learning_rate": 4.6517700515096175e-05, + "loss": 2.8166, + "step": 27545 + }, + { + "epoch": 1.7099757899310943, + "grad_norm": 0.14582218541951306, + "learning_rate": 4.6514097769615126e-05, + "loss": 2.7563, + "step": 27546 + }, + { + "epoch": 1.7100378670308523, + "grad_norm": 0.16498725122993282, + "learning_rate": 4.651049504232086e-05, + "loss": 2.7109, + "step": 27547 + }, + { + "epoch": 1.7100999441306102, + "grad_norm": 0.15375202894810436, + "learning_rate": 4.6506892333232186e-05, + "loss": 2.6884, + "step": 27548 + }, + { + "epoch": 1.710162021230368, + "grad_norm": 0.14662589548444027, + "learning_rate": 4.650328964236792e-05, + "loss": 2.7994, + "step": 27549 + }, + { + "epoch": 1.710224098330126, + "grad_norm": 0.13729364234050775, + "learning_rate": 4.6499686969746845e-05, + "loss": 2.7496, + "step": 27550 + }, + { + "epoch": 1.710286175429884, + "grad_norm": 0.15573805745898656, + "learning_rate": 4.6496084315387744e-05, + "loss": 2.8208, + "step": 27551 + }, + { + "epoch": 1.7103482525296418, + "grad_norm": 0.14067100725983608, + "learning_rate": 4.649248167930943e-05, + "loss": 2.8206, + "step": 27552 + }, + { + "epoch": 1.7104103296293998, + "grad_norm": 0.14805420527767904, + "learning_rate": 4.648887906153068e-05, + "loss": 2.8297, + "step": 27553 + }, + { + "epoch": 1.7104724067291577, + "grad_norm": 0.14119661899642477, + "learning_rate": 4.6485276462070304e-05, + "loss": 2.7532, + "step": 27554 + }, + { + "epoch": 1.7105344838289156, + "grad_norm": 0.15082356447212944, + "learning_rate": 4.648167388094709e-05, + "loss": 2.7527, + "step": 27555 + }, + { + "epoch": 1.7105965609286735, + "grad_norm": 0.14610343992767913, + "learning_rate": 4.647807131817984e-05, + "loss": 2.7142, + "step": 27556 + }, + { + "epoch": 1.7106586380284314, + "grad_norm": 0.1398859338697079, + "learning_rate": 4.647446877378735e-05, + "loss": 2.7766, + "step": 27557 + }, + { + "epoch": 1.7107207151281894, + "grad_norm": 0.16150416151943, + "learning_rate": 4.647086624778841e-05, + "loss": 2.695, + "step": 27558 + }, + { + "epoch": 1.7107827922279473, + "grad_norm": 0.1677442647419715, + "learning_rate": 4.646726374020182e-05, + "loss": 2.7366, + "step": 27559 + }, + { + "epoch": 1.710844869327705, + "grad_norm": 0.1443550866294472, + "learning_rate": 4.646366125104637e-05, + "loss": 2.7173, + "step": 27560 + }, + { + "epoch": 1.7109069464274629, + "grad_norm": 0.1428055386651732, + "learning_rate": 4.6460058780340864e-05, + "loss": 2.866, + "step": 27561 + }, + { + "epoch": 1.7109690235272208, + "grad_norm": 0.15732171159069003, + "learning_rate": 4.645645632810408e-05, + "loss": 2.7859, + "step": 27562 + }, + { + "epoch": 1.7110311006269787, + "grad_norm": 0.17266698993748641, + "learning_rate": 4.645285389435484e-05, + "loss": 2.7647, + "step": 27563 + }, + { + "epoch": 1.7110931777267366, + "grad_norm": 0.1569025183379989, + "learning_rate": 4.6449251479111896e-05, + "loss": 2.8247, + "step": 27564 + }, + { + "epoch": 1.7111552548264946, + "grad_norm": 0.14667026169481845, + "learning_rate": 4.6445649082394095e-05, + "loss": 2.7939, + "step": 27565 + }, + { + "epoch": 1.7112173319262522, + "grad_norm": 0.15988100637557004, + "learning_rate": 4.644204670422019e-05, + "loss": 2.7759, + "step": 27566 + }, + { + "epoch": 1.7112794090260102, + "grad_norm": 0.14424663950800617, + "learning_rate": 4.6438444344609e-05, + "loss": 2.7919, + "step": 27567 + }, + { + "epoch": 1.711341486125768, + "grad_norm": 0.16848157620946588, + "learning_rate": 4.6434842003579297e-05, + "loss": 2.7801, + "step": 27568 + }, + { + "epoch": 1.711403563225526, + "grad_norm": 0.14920351989275174, + "learning_rate": 4.643123968114991e-05, + "loss": 2.8042, + "step": 27569 + }, + { + "epoch": 1.711465640325284, + "grad_norm": 0.16115873301703032, + "learning_rate": 4.642763737733959e-05, + "loss": 2.8296, + "step": 27570 + }, + { + "epoch": 1.7115277174250418, + "grad_norm": 0.14720771217467077, + "learning_rate": 4.642403509216717e-05, + "loss": 2.8238, + "step": 27571 + }, + { + "epoch": 1.7115897945247998, + "grad_norm": 0.1512129951953939, + "learning_rate": 4.6420432825651424e-05, + "loss": 2.7745, + "step": 27572 + }, + { + "epoch": 1.7116518716245577, + "grad_norm": 0.16148172806377703, + "learning_rate": 4.6416830577811136e-05, + "loss": 2.7381, + "step": 27573 + }, + { + "epoch": 1.7117139487243156, + "grad_norm": 0.14010460249633314, + "learning_rate": 4.641322834866513e-05, + "loss": 2.7251, + "step": 27574 + }, + { + "epoch": 1.7117760258240735, + "grad_norm": 0.19048592152850513, + "learning_rate": 4.640962613823217e-05, + "loss": 2.8052, + "step": 27575 + }, + { + "epoch": 1.7118381029238314, + "grad_norm": 0.15719709060904716, + "learning_rate": 4.640602394653107e-05, + "loss": 2.681, + "step": 27576 + }, + { + "epoch": 1.7119001800235893, + "grad_norm": 0.14249130767319737, + "learning_rate": 4.64024217735806e-05, + "loss": 2.7845, + "step": 27577 + }, + { + "epoch": 1.7119622571233473, + "grad_norm": 0.15146702580998436, + "learning_rate": 4.639881961939959e-05, + "loss": 2.8599, + "step": 27578 + }, + { + "epoch": 1.7120243342231052, + "grad_norm": 0.14844546798602348, + "learning_rate": 4.639521748400679e-05, + "loss": 2.7485, + "step": 27579 + }, + { + "epoch": 1.712086411322863, + "grad_norm": 0.1565480608001888, + "learning_rate": 4.639161536742102e-05, + "loss": 2.859, + "step": 27580 + }, + { + "epoch": 1.712148488422621, + "grad_norm": 0.1606838444895264, + "learning_rate": 4.6388013269661076e-05, + "loss": 2.8088, + "step": 27581 + }, + { + "epoch": 1.712210565522379, + "grad_norm": 0.15411827760655958, + "learning_rate": 4.638441119074574e-05, + "loss": 2.7779, + "step": 27582 + }, + { + "epoch": 1.7122726426221369, + "grad_norm": 0.15938238098780208, + "learning_rate": 4.638080913069382e-05, + "loss": 2.7785, + "step": 27583 + }, + { + "epoch": 1.7123347197218945, + "grad_norm": 0.14365506069121722, + "learning_rate": 4.6377207089524084e-05, + "loss": 2.7676, + "step": 27584 + }, + { + "epoch": 1.7123967968216525, + "grad_norm": 0.14888206215434324, + "learning_rate": 4.637360506725535e-05, + "loss": 2.8448, + "step": 27585 + }, + { + "epoch": 1.7124588739214104, + "grad_norm": 0.14199292262751506, + "learning_rate": 4.63700030639064e-05, + "loss": 2.7266, + "step": 27586 + }, + { + "epoch": 1.7125209510211683, + "grad_norm": 0.14589687152517491, + "learning_rate": 4.6366401079496014e-05, + "loss": 2.769, + "step": 27587 + }, + { + "epoch": 1.7125830281209262, + "grad_norm": 0.16785900833628148, + "learning_rate": 4.6362799114043005e-05, + "loss": 2.8388, + "step": 27588 + }, + { + "epoch": 1.7126451052206841, + "grad_norm": 0.17166286077917725, + "learning_rate": 4.635919716756614e-05, + "loss": 2.6495, + "step": 27589 + }, + { + "epoch": 1.7127071823204418, + "grad_norm": 0.14002490484159177, + "learning_rate": 4.6355595240084245e-05, + "loss": 2.7538, + "step": 27590 + }, + { + "epoch": 1.7127692594201998, + "grad_norm": 0.15327081063405948, + "learning_rate": 4.635199333161608e-05, + "loss": 2.8024, + "step": 27591 + }, + { + "epoch": 1.7128313365199577, + "grad_norm": 0.15157902568505893, + "learning_rate": 4.6348391442180465e-05, + "loss": 2.7729, + "step": 27592 + }, + { + "epoch": 1.7128934136197156, + "grad_norm": 0.17706772207186827, + "learning_rate": 4.6344789571796164e-05, + "loss": 2.7908, + "step": 27593 + }, + { + "epoch": 1.7129554907194735, + "grad_norm": 0.15888182146605798, + "learning_rate": 4.6341187720482e-05, + "loss": 2.7638, + "step": 27594 + }, + { + "epoch": 1.7130175678192314, + "grad_norm": 0.1909040460607634, + "learning_rate": 4.6337585888256734e-05, + "loss": 2.7579, + "step": 27595 + }, + { + "epoch": 1.7130796449189893, + "grad_norm": 0.16995389680475873, + "learning_rate": 4.633398407513918e-05, + "loss": 2.8103, + "step": 27596 + }, + { + "epoch": 1.7131417220187473, + "grad_norm": 0.16258326171759854, + "learning_rate": 4.633038228114812e-05, + "loss": 2.8031, + "step": 27597 + }, + { + "epoch": 1.7132037991185052, + "grad_norm": 0.16083401227615857, + "learning_rate": 4.632678050630234e-05, + "loss": 2.7794, + "step": 27598 + }, + { + "epoch": 1.713265876218263, + "grad_norm": 0.18200096156709922, + "learning_rate": 4.632317875062065e-05, + "loss": 2.8595, + "step": 27599 + }, + { + "epoch": 1.713327953318021, + "grad_norm": 0.14722399591206195, + "learning_rate": 4.6319577014121815e-05, + "loss": 2.7149, + "step": 27600 + }, + { + "epoch": 1.713390030417779, + "grad_norm": 0.17516802210556356, + "learning_rate": 4.631597529682465e-05, + "loss": 2.8191, + "step": 27601 + }, + { + "epoch": 1.7134521075175368, + "grad_norm": 0.14510392453154522, + "learning_rate": 4.6312373598747926e-05, + "loss": 2.7635, + "step": 27602 + }, + { + "epoch": 1.7135141846172948, + "grad_norm": 0.1512257376532711, + "learning_rate": 4.630877191991046e-05, + "loss": 2.8062, + "step": 27603 + }, + { + "epoch": 1.7135762617170527, + "grad_norm": 0.15314558119548347, + "learning_rate": 4.630517026033101e-05, + "loss": 2.7794, + "step": 27604 + }, + { + "epoch": 1.7136383388168106, + "grad_norm": 0.15097220785910453, + "learning_rate": 4.63015686200284e-05, + "loss": 2.7393, + "step": 27605 + }, + { + "epoch": 1.7137004159165685, + "grad_norm": 0.17755130100348315, + "learning_rate": 4.6297966999021385e-05, + "loss": 2.7005, + "step": 27606 + }, + { + "epoch": 1.7137624930163264, + "grad_norm": 0.14748302824769674, + "learning_rate": 4.6294365397328796e-05, + "loss": 2.8248, + "step": 27607 + }, + { + "epoch": 1.7138245701160841, + "grad_norm": 0.14433641446541942, + "learning_rate": 4.629076381496938e-05, + "loss": 2.8323, + "step": 27608 + }, + { + "epoch": 1.713886647215842, + "grad_norm": 0.16243664900471314, + "learning_rate": 4.628716225196197e-05, + "loss": 2.7004, + "step": 27609 + }, + { + "epoch": 1.7139487243156, + "grad_norm": 0.18666553576976927, + "learning_rate": 4.628356070832533e-05, + "loss": 2.7748, + "step": 27610 + }, + { + "epoch": 1.7140108014153579, + "grad_norm": 0.1591283738912833, + "learning_rate": 4.627995918407825e-05, + "loss": 2.7391, + "step": 27611 + }, + { + "epoch": 1.7140728785151158, + "grad_norm": 0.1679282416800324, + "learning_rate": 4.6276357679239514e-05, + "loss": 2.7755, + "step": 27612 + }, + { + "epoch": 1.7141349556148737, + "grad_norm": 0.1666525403880814, + "learning_rate": 4.627275619382794e-05, + "loss": 2.7952, + "step": 27613 + }, + { + "epoch": 1.7141970327146314, + "grad_norm": 0.18826643936977747, + "learning_rate": 4.62691547278623e-05, + "loss": 2.7054, + "step": 27614 + }, + { + "epoch": 1.7142591098143893, + "grad_norm": 0.15868841315161114, + "learning_rate": 4.6265553281361385e-05, + "loss": 2.7968, + "step": 27615 + }, + { + "epoch": 1.7143211869141473, + "grad_norm": 0.16322086372043948, + "learning_rate": 4.6261951854343985e-05, + "loss": 2.7403, + "step": 27616 + }, + { + "epoch": 1.7143832640139052, + "grad_norm": 0.14291825288632828, + "learning_rate": 4.6258350446828895e-05, + "loss": 2.7227, + "step": 27617 + }, + { + "epoch": 1.714445341113663, + "grad_norm": 0.1594082122211577, + "learning_rate": 4.625474905883489e-05, + "loss": 2.8781, + "step": 27618 + }, + { + "epoch": 1.714507418213421, + "grad_norm": 0.15600250431566284, + "learning_rate": 4.6251147690380774e-05, + "loss": 2.7219, + "step": 27619 + }, + { + "epoch": 1.714569495313179, + "grad_norm": 0.15712735499534392, + "learning_rate": 4.6247546341485324e-05, + "loss": 2.8515, + "step": 27620 + }, + { + "epoch": 1.7146315724129368, + "grad_norm": 0.16562321230085522, + "learning_rate": 4.624394501216735e-05, + "loss": 2.7856, + "step": 27621 + }, + { + "epoch": 1.7146936495126948, + "grad_norm": 0.16732605246630805, + "learning_rate": 4.624034370244562e-05, + "loss": 2.7686, + "step": 27622 + }, + { + "epoch": 1.7147557266124527, + "grad_norm": 0.17736143242492322, + "learning_rate": 4.6236742412338924e-05, + "loss": 2.8434, + "step": 27623 + }, + { + "epoch": 1.7148178037122106, + "grad_norm": 0.14731016724964457, + "learning_rate": 4.623314114186607e-05, + "loss": 2.685, + "step": 27624 + }, + { + "epoch": 1.7148798808119685, + "grad_norm": 0.1805697736328123, + "learning_rate": 4.622953989104581e-05, + "loss": 2.7966, + "step": 27625 + }, + { + "epoch": 1.7149419579117264, + "grad_norm": 0.15130113575672016, + "learning_rate": 4.622593865989698e-05, + "loss": 2.792, + "step": 27626 + }, + { + "epoch": 1.7150040350114844, + "grad_norm": 0.20968768058689488, + "learning_rate": 4.6222337448438325e-05, + "loss": 2.8243, + "step": 27627 + }, + { + "epoch": 1.7150661121112423, + "grad_norm": 0.16549417844269054, + "learning_rate": 4.621873625668867e-05, + "loss": 2.7191, + "step": 27628 + }, + { + "epoch": 1.7151281892110002, + "grad_norm": 0.18498721797062478, + "learning_rate": 4.621513508466677e-05, + "loss": 2.8575, + "step": 27629 + }, + { + "epoch": 1.715190266310758, + "grad_norm": 0.15183284873166236, + "learning_rate": 4.6211533932391446e-05, + "loss": 2.838, + "step": 27630 + }, + { + "epoch": 1.715252343410516, + "grad_norm": 0.15107322574183096, + "learning_rate": 4.620793279988145e-05, + "loss": 2.7203, + "step": 27631 + }, + { + "epoch": 1.7153144205102737, + "grad_norm": 0.1695794458923944, + "learning_rate": 4.62043316871556e-05, + "loss": 2.827, + "step": 27632 + }, + { + "epoch": 1.7153764976100316, + "grad_norm": 0.13589645785616547, + "learning_rate": 4.6200730594232666e-05, + "loss": 2.7666, + "step": 27633 + }, + { + "epoch": 1.7154385747097896, + "grad_norm": 0.14928979630309647, + "learning_rate": 4.6197129521131455e-05, + "loss": 2.7811, + "step": 27634 + }, + { + "epoch": 1.7155006518095475, + "grad_norm": 0.14061764856832287, + "learning_rate": 4.619352846787074e-05, + "loss": 2.8392, + "step": 27635 + }, + { + "epoch": 1.7155627289093054, + "grad_norm": 0.1728994066220637, + "learning_rate": 4.61899274344693e-05, + "loss": 2.7331, + "step": 27636 + }, + { + "epoch": 1.7156248060090633, + "grad_norm": 0.14161802393498482, + "learning_rate": 4.6186326420945945e-05, + "loss": 2.6255, + "step": 27637 + }, + { + "epoch": 1.715686883108821, + "grad_norm": 0.18112702719749846, + "learning_rate": 4.618272542731944e-05, + "loss": 2.7663, + "step": 27638 + }, + { + "epoch": 1.715748960208579, + "grad_norm": 0.1539844355493559, + "learning_rate": 4.61791244536086e-05, + "loss": 2.7865, + "step": 27639 + }, + { + "epoch": 1.7158110373083368, + "grad_norm": 0.19151015283647516, + "learning_rate": 4.617552349983217e-05, + "loss": 2.8623, + "step": 27640 + }, + { + "epoch": 1.7158731144080948, + "grad_norm": 0.17099179034261697, + "learning_rate": 4.617192256600898e-05, + "loss": 2.6957, + "step": 27641 + }, + { + "epoch": 1.7159351915078527, + "grad_norm": 0.13774592909924652, + "learning_rate": 4.61683216521578e-05, + "loss": 2.728, + "step": 27642 + }, + { + "epoch": 1.7159972686076106, + "grad_norm": 0.16188725225372066, + "learning_rate": 4.616472075829741e-05, + "loss": 2.8201, + "step": 27643 + }, + { + "epoch": 1.7160593457073685, + "grad_norm": 0.15697466177025285, + "learning_rate": 4.61611198844466e-05, + "loss": 2.8737, + "step": 27644 + }, + { + "epoch": 1.7161214228071264, + "grad_norm": 0.14830565871717624, + "learning_rate": 4.615751903062415e-05, + "loss": 2.752, + "step": 27645 + }, + { + "epoch": 1.7161834999068843, + "grad_norm": 0.15715010780321065, + "learning_rate": 4.615391819684888e-05, + "loss": 2.8196, + "step": 27646 + }, + { + "epoch": 1.7162455770066423, + "grad_norm": 0.1674325613117241, + "learning_rate": 4.615031738313954e-05, + "loss": 2.7582, + "step": 27647 + }, + { + "epoch": 1.7163076541064002, + "grad_norm": 0.14392142906447056, + "learning_rate": 4.614671658951493e-05, + "loss": 2.835, + "step": 27648 + }, + { + "epoch": 1.716369731206158, + "grad_norm": 0.1606142601085224, + "learning_rate": 4.614311581599384e-05, + "loss": 2.8637, + "step": 27649 + }, + { + "epoch": 1.716431808305916, + "grad_norm": 0.15165010285078448, + "learning_rate": 4.613951506259504e-05, + "loss": 2.7894, + "step": 27650 + }, + { + "epoch": 1.716493885405674, + "grad_norm": 0.177916150881476, + "learning_rate": 4.6135914329337345e-05, + "loss": 2.7215, + "step": 27651 + }, + { + "epoch": 1.7165559625054319, + "grad_norm": 0.16858685231880943, + "learning_rate": 4.61323136162395e-05, + "loss": 2.7644, + "step": 27652 + }, + { + "epoch": 1.7166180396051898, + "grad_norm": 0.16421765983139838, + "learning_rate": 4.6128712923320337e-05, + "loss": 2.8483, + "step": 27653 + }, + { + "epoch": 1.7166801167049477, + "grad_norm": 0.16837438684612402, + "learning_rate": 4.61251122505986e-05, + "loss": 2.7773, + "step": 27654 + }, + { + "epoch": 1.7167421938047056, + "grad_norm": 0.1510209707901604, + "learning_rate": 4.612151159809311e-05, + "loss": 2.7316, + "step": 27655 + }, + { + "epoch": 1.7168042709044633, + "grad_norm": 0.16085876511045485, + "learning_rate": 4.611791096582262e-05, + "loss": 2.6982, + "step": 27656 + }, + { + "epoch": 1.7168663480042212, + "grad_norm": 0.15128033515072012, + "learning_rate": 4.611431035380594e-05, + "loss": 2.8444, + "step": 27657 + }, + { + "epoch": 1.7169284251039791, + "grad_norm": 0.14966171907710094, + "learning_rate": 4.611070976206184e-05, + "loss": 2.7266, + "step": 27658 + }, + { + "epoch": 1.716990502203737, + "grad_norm": 0.14148986620386655, + "learning_rate": 4.610710919060912e-05, + "loss": 2.8089, + "step": 27659 + }, + { + "epoch": 1.717052579303495, + "grad_norm": 0.15595497053548943, + "learning_rate": 4.610350863946655e-05, + "loss": 2.7854, + "step": 27660 + }, + { + "epoch": 1.717114656403253, + "grad_norm": 0.15373477678220276, + "learning_rate": 4.6099908108652925e-05, + "loss": 2.815, + "step": 27661 + }, + { + "epoch": 1.7171767335030106, + "grad_norm": 0.19900815716036246, + "learning_rate": 4.609630759818703e-05, + "loss": 2.8296, + "step": 27662 + }, + { + "epoch": 1.7172388106027685, + "grad_norm": 0.146631408649728, + "learning_rate": 4.609270710808763e-05, + "loss": 2.7766, + "step": 27663 + }, + { + "epoch": 1.7173008877025264, + "grad_norm": 0.16382302108476396, + "learning_rate": 4.608910663837354e-05, + "loss": 2.8172, + "step": 27664 + }, + { + "epoch": 1.7173629648022843, + "grad_norm": 0.15355910396508146, + "learning_rate": 4.608550618906352e-05, + "loss": 2.775, + "step": 27665 + }, + { + "epoch": 1.7174250419020423, + "grad_norm": 0.14572315155137366, + "learning_rate": 4.6081905760176377e-05, + "loss": 2.7407, + "step": 27666 + }, + { + "epoch": 1.7174871190018002, + "grad_norm": 0.1548392986346812, + "learning_rate": 4.607830535173087e-05, + "loss": 2.7227, + "step": 27667 + }, + { + "epoch": 1.717549196101558, + "grad_norm": 0.17596997620242236, + "learning_rate": 4.60747049637458e-05, + "loss": 2.8321, + "step": 27668 + }, + { + "epoch": 1.717611273201316, + "grad_norm": 0.15406970914489762, + "learning_rate": 4.607110459623995e-05, + "loss": 2.7707, + "step": 27669 + }, + { + "epoch": 1.717673350301074, + "grad_norm": 0.15140116525581748, + "learning_rate": 4.60675042492321e-05, + "loss": 2.7609, + "step": 27670 + }, + { + "epoch": 1.7177354274008318, + "grad_norm": 0.1824684353208499, + "learning_rate": 4.606390392274104e-05, + "loss": 2.7247, + "step": 27671 + }, + { + "epoch": 1.7177975045005898, + "grad_norm": 0.14690375376335138, + "learning_rate": 4.606030361678554e-05, + "loss": 2.754, + "step": 27672 + }, + { + "epoch": 1.7178595816003477, + "grad_norm": 0.16101853335186483, + "learning_rate": 4.60567033313844e-05, + "loss": 2.8106, + "step": 27673 + }, + { + "epoch": 1.7179216587001056, + "grad_norm": 0.16184489955298653, + "learning_rate": 4.605310306655638e-05, + "loss": 2.8214, + "step": 27674 + }, + { + "epoch": 1.7179837357998635, + "grad_norm": 0.17230117046197477, + "learning_rate": 4.60495028223203e-05, + "loss": 2.817, + "step": 27675 + }, + { + "epoch": 1.7180458128996214, + "grad_norm": 0.16709374245443304, + "learning_rate": 4.604590259869491e-05, + "loss": 2.7457, + "step": 27676 + }, + { + "epoch": 1.7181078899993794, + "grad_norm": 0.164995763391955, + "learning_rate": 4.6042302395699014e-05, + "loss": 2.8087, + "step": 27677 + }, + { + "epoch": 1.7181699670991373, + "grad_norm": 0.17111508397560343, + "learning_rate": 4.603870221335137e-05, + "loss": 2.7341, + "step": 27678 + }, + { + "epoch": 1.7182320441988952, + "grad_norm": 0.17548633843961436, + "learning_rate": 4.6035102051670786e-05, + "loss": 2.6667, + "step": 27679 + }, + { + "epoch": 1.7182941212986529, + "grad_norm": 0.15014048618648074, + "learning_rate": 4.6031501910676046e-05, + "loss": 2.7667, + "step": 27680 + }, + { + "epoch": 1.7183561983984108, + "grad_norm": 0.14750453481330394, + "learning_rate": 4.602790179038592e-05, + "loss": 2.7296, + "step": 27681 + }, + { + "epoch": 1.7184182754981687, + "grad_norm": 0.14884653328017836, + "learning_rate": 4.6024301690819196e-05, + "loss": 2.7676, + "step": 27682 + }, + { + "epoch": 1.7184803525979266, + "grad_norm": 0.14571370528723113, + "learning_rate": 4.602070161199465e-05, + "loss": 2.8531, + "step": 27683 + }, + { + "epoch": 1.7185424296976846, + "grad_norm": 0.17340085742256114, + "learning_rate": 4.601710155393108e-05, + "loss": 2.7918, + "step": 27684 + }, + { + "epoch": 1.7186045067974425, + "grad_norm": 0.14914351633865117, + "learning_rate": 4.601350151664726e-05, + "loss": 2.8502, + "step": 27685 + }, + { + "epoch": 1.7186665838972002, + "grad_norm": 0.15052596108789146, + "learning_rate": 4.6009901500161955e-05, + "loss": 2.8578, + "step": 27686 + }, + { + "epoch": 1.718728660996958, + "grad_norm": 0.15029674547592184, + "learning_rate": 4.600630150449398e-05, + "loss": 2.8105, + "step": 27687 + }, + { + "epoch": 1.718790738096716, + "grad_norm": 0.13915152941282052, + "learning_rate": 4.600270152966208e-05, + "loss": 2.8195, + "step": 27688 + }, + { + "epoch": 1.718852815196474, + "grad_norm": 0.14840300058698117, + "learning_rate": 4.599910157568507e-05, + "loss": 2.7084, + "step": 27689 + }, + { + "epoch": 1.7189148922962318, + "grad_norm": 0.14656375006038916, + "learning_rate": 4.5995501642581704e-05, + "loss": 2.8333, + "step": 27690 + }, + { + "epoch": 1.7189769693959898, + "grad_norm": 0.16043300093950688, + "learning_rate": 4.599190173037079e-05, + "loss": 2.8675, + "step": 27691 + }, + { + "epoch": 1.7190390464957477, + "grad_norm": 0.15578632100650938, + "learning_rate": 4.59883018390711e-05, + "loss": 2.8696, + "step": 27692 + }, + { + "epoch": 1.7191011235955056, + "grad_norm": 0.15732989854463658, + "learning_rate": 4.5984701968701413e-05, + "loss": 2.7101, + "step": 27693 + }, + { + "epoch": 1.7191632006952635, + "grad_norm": 0.14193116336755865, + "learning_rate": 4.59811021192805e-05, + "loss": 2.7951, + "step": 27694 + }, + { + "epoch": 1.7192252777950214, + "grad_norm": 0.16210626116600105, + "learning_rate": 4.597750229082717e-05, + "loss": 2.7432, + "step": 27695 + }, + { + "epoch": 1.7192873548947794, + "grad_norm": 0.16687012834974474, + "learning_rate": 4.5973902483360174e-05, + "loss": 2.7074, + "step": 27696 + }, + { + "epoch": 1.7193494319945373, + "grad_norm": 0.13903602953588262, + "learning_rate": 4.5970302696898304e-05, + "loss": 2.7321, + "step": 27697 + }, + { + "epoch": 1.7194115090942952, + "grad_norm": 0.16163528011329611, + "learning_rate": 4.5966702931460354e-05, + "loss": 2.7822, + "step": 27698 + }, + { + "epoch": 1.719473586194053, + "grad_norm": 0.1446738248690171, + "learning_rate": 4.596310318706508e-05, + "loss": 2.7649, + "step": 27699 + }, + { + "epoch": 1.719535663293811, + "grad_norm": 0.14220992975119448, + "learning_rate": 4.595950346373129e-05, + "loss": 2.7946, + "step": 27700 + }, + { + "epoch": 1.719597740393569, + "grad_norm": 0.199435237989446, + "learning_rate": 4.595590376147774e-05, + "loss": 2.7775, + "step": 27701 + }, + { + "epoch": 1.7196598174933269, + "grad_norm": 0.2112442893887239, + "learning_rate": 4.595230408032324e-05, + "loss": 2.7822, + "step": 27702 + }, + { + "epoch": 1.7197218945930848, + "grad_norm": 0.142537372460961, + "learning_rate": 4.5948704420286534e-05, + "loss": 2.69, + "step": 27703 + }, + { + "epoch": 1.7197839716928425, + "grad_norm": 0.1524048279321845, + "learning_rate": 4.594510478138643e-05, + "loss": 2.8333, + "step": 27704 + }, + { + "epoch": 1.7198460487926004, + "grad_norm": 0.14811612776650776, + "learning_rate": 4.5941505163641684e-05, + "loss": 2.8002, + "step": 27705 + }, + { + "epoch": 1.7199081258923583, + "grad_norm": 0.1667806557177827, + "learning_rate": 4.593790556707111e-05, + "loss": 2.8667, + "step": 27706 + }, + { + "epoch": 1.7199702029921162, + "grad_norm": 0.15527908746768504, + "learning_rate": 4.593430599169347e-05, + "loss": 2.7424, + "step": 27707 + }, + { + "epoch": 1.7200322800918741, + "grad_norm": 0.14076443885111237, + "learning_rate": 4.5930706437527524e-05, + "loss": 2.852, + "step": 27708 + }, + { + "epoch": 1.720094357191632, + "grad_norm": 0.14490081603256152, + "learning_rate": 4.592710690459209e-05, + "loss": 2.7262, + "step": 27709 + }, + { + "epoch": 1.7201564342913898, + "grad_norm": 0.1619563702020334, + "learning_rate": 4.5923507392905896e-05, + "loss": 2.7185, + "step": 27710 + }, + { + "epoch": 1.7202185113911477, + "grad_norm": 0.14416918639597698, + "learning_rate": 4.591990790248777e-05, + "loss": 2.8655, + "step": 27711 + }, + { + "epoch": 1.7202805884909056, + "grad_norm": 0.1454100824667165, + "learning_rate": 4.5916308433356484e-05, + "loss": 2.8022, + "step": 27712 + }, + { + "epoch": 1.7203426655906635, + "grad_norm": 0.18042041742362594, + "learning_rate": 4.59127089855308e-05, + "loss": 2.821, + "step": 27713 + }, + { + "epoch": 1.7204047426904214, + "grad_norm": 0.147079935556921, + "learning_rate": 4.5909109559029516e-05, + "loss": 2.7815, + "step": 27714 + }, + { + "epoch": 1.7204668197901793, + "grad_norm": 0.1436925861517086, + "learning_rate": 4.590551015387139e-05, + "loss": 2.7125, + "step": 27715 + }, + { + "epoch": 1.7205288968899373, + "grad_norm": 0.1532480470469125, + "learning_rate": 4.5901910770075216e-05, + "loss": 2.7623, + "step": 27716 + }, + { + "epoch": 1.7205909739896952, + "grad_norm": 0.16735635056414824, + "learning_rate": 4.5898311407659763e-05, + "loss": 2.8298, + "step": 27717 + }, + { + "epoch": 1.720653051089453, + "grad_norm": 0.14475480394841408, + "learning_rate": 4.589471206664383e-05, + "loss": 2.8004, + "step": 27718 + }, + { + "epoch": 1.720715128189211, + "grad_norm": 0.15075765756236445, + "learning_rate": 4.589111274704616e-05, + "loss": 2.7841, + "step": 27719 + }, + { + "epoch": 1.720777205288969, + "grad_norm": 0.16467114922030282, + "learning_rate": 4.588751344888557e-05, + "loss": 2.8005, + "step": 27720 + }, + { + "epoch": 1.7208392823887269, + "grad_norm": 0.2117814382952196, + "learning_rate": 4.588391417218082e-05, + "loss": 2.8408, + "step": 27721 + }, + { + "epoch": 1.7209013594884848, + "grad_norm": 0.18416651196243153, + "learning_rate": 4.588031491695068e-05, + "loss": 2.7249, + "step": 27722 + }, + { + "epoch": 1.7209634365882427, + "grad_norm": 0.2035848108233588, + "learning_rate": 4.5876715683213945e-05, + "loss": 2.7087, + "step": 27723 + }, + { + "epoch": 1.7210255136880006, + "grad_norm": 0.15781312130343064, + "learning_rate": 4.5873116470989374e-05, + "loss": 2.796, + "step": 27724 + }, + { + "epoch": 1.7210875907877585, + "grad_norm": 0.16856635043393228, + "learning_rate": 4.586951728029577e-05, + "loss": 2.8921, + "step": 27725 + }, + { + "epoch": 1.7211496678875164, + "grad_norm": 0.14933224226690145, + "learning_rate": 4.5865918111151885e-05, + "loss": 2.8407, + "step": 27726 + }, + { + "epoch": 1.7212117449872741, + "grad_norm": 0.1598794161158319, + "learning_rate": 4.5862318963576514e-05, + "loss": 2.7792, + "step": 27727 + }, + { + "epoch": 1.721273822087032, + "grad_norm": 0.1457703545230716, + "learning_rate": 4.585871983758843e-05, + "loss": 2.8415, + "step": 27728 + }, + { + "epoch": 1.72133589918679, + "grad_norm": 0.14439063706907254, + "learning_rate": 4.5855120733206406e-05, + "loss": 2.7207, + "step": 27729 + }, + { + "epoch": 1.721397976286548, + "grad_norm": 0.15651010678832425, + "learning_rate": 4.585152165044923e-05, + "loss": 2.8334, + "step": 27730 + }, + { + "epoch": 1.7214600533863058, + "grad_norm": 0.1515904840708515, + "learning_rate": 4.584792258933567e-05, + "loss": 2.7201, + "step": 27731 + }, + { + "epoch": 1.7215221304860637, + "grad_norm": 0.1398478464773294, + "learning_rate": 4.5844323549884506e-05, + "loss": 2.7413, + "step": 27732 + }, + { + "epoch": 1.7215842075858214, + "grad_norm": 0.1706479607580541, + "learning_rate": 4.584072453211451e-05, + "loss": 2.7705, + "step": 27733 + }, + { + "epoch": 1.7216462846855793, + "grad_norm": 0.1444668626416511, + "learning_rate": 4.583712553604447e-05, + "loss": 2.7185, + "step": 27734 + }, + { + "epoch": 1.7217083617853373, + "grad_norm": 0.14861411955856815, + "learning_rate": 4.5833526561693153e-05, + "loss": 2.801, + "step": 27735 + }, + { + "epoch": 1.7217704388850952, + "grad_norm": 0.16058427324038121, + "learning_rate": 4.582992760907934e-05, + "loss": 2.7542, + "step": 27736 + }, + { + "epoch": 1.721832515984853, + "grad_norm": 0.14832725948674744, + "learning_rate": 4.5826328678221804e-05, + "loss": 2.8138, + "step": 27737 + }, + { + "epoch": 1.721894593084611, + "grad_norm": 0.15570454759038815, + "learning_rate": 4.5822729769139333e-05, + "loss": 2.9012, + "step": 27738 + }, + { + "epoch": 1.721956670184369, + "grad_norm": 0.1393461222946083, + "learning_rate": 4.581913088185069e-05, + "loss": 2.7738, + "step": 27739 + }, + { + "epoch": 1.7220187472841268, + "grad_norm": 0.14692208696970974, + "learning_rate": 4.5815532016374654e-05, + "loss": 2.7902, + "step": 27740 + }, + { + "epoch": 1.7220808243838848, + "grad_norm": 0.17032173945361687, + "learning_rate": 4.581193317273e-05, + "loss": 2.8293, + "step": 27741 + }, + { + "epoch": 1.7221429014836427, + "grad_norm": 0.13734704472622739, + "learning_rate": 4.5808334350935514e-05, + "loss": 2.8222, + "step": 27742 + }, + { + "epoch": 1.7222049785834006, + "grad_norm": 0.15338206414367697, + "learning_rate": 4.5804735551009956e-05, + "loss": 2.8082, + "step": 27743 + }, + { + "epoch": 1.7222670556831585, + "grad_norm": 0.14683604941398087, + "learning_rate": 4.580113677297211e-05, + "loss": 2.8434, + "step": 27744 + }, + { + "epoch": 1.7223291327829164, + "grad_norm": 0.19136448813296525, + "learning_rate": 4.5797538016840764e-05, + "loss": 2.8048, + "step": 27745 + }, + { + "epoch": 1.7223912098826744, + "grad_norm": 0.14703502155619147, + "learning_rate": 4.579393928263468e-05, + "loss": 2.6953, + "step": 27746 + }, + { + "epoch": 1.7224532869824323, + "grad_norm": 0.14620457948542356, + "learning_rate": 4.579034057037264e-05, + "loss": 2.777, + "step": 27747 + }, + { + "epoch": 1.7225153640821902, + "grad_norm": 0.1452117185891297, + "learning_rate": 4.578674188007341e-05, + "loss": 2.7791, + "step": 27748 + }, + { + "epoch": 1.722577441181948, + "grad_norm": 0.15021221303911617, + "learning_rate": 4.578314321175577e-05, + "loss": 2.8549, + "step": 27749 + }, + { + "epoch": 1.722639518281706, + "grad_norm": 0.15532542504002714, + "learning_rate": 4.5779544565438495e-05, + "loss": 2.8322, + "step": 27750 + }, + { + "epoch": 1.7227015953814637, + "grad_norm": 0.14715456556659165, + "learning_rate": 4.577594594114036e-05, + "loss": 2.7162, + "step": 27751 + }, + { + "epoch": 1.7227636724812216, + "grad_norm": 0.16555397697146126, + "learning_rate": 4.577234733888015e-05, + "loss": 2.8075, + "step": 27752 + }, + { + "epoch": 1.7228257495809796, + "grad_norm": 0.1516347612095285, + "learning_rate": 4.5768748758676615e-05, + "loss": 2.6785, + "step": 27753 + }, + { + "epoch": 1.7228878266807375, + "grad_norm": 0.1469213072844834, + "learning_rate": 4.576515020054856e-05, + "loss": 2.7198, + "step": 27754 + }, + { + "epoch": 1.7229499037804954, + "grad_norm": 0.19876123700974638, + "learning_rate": 4.5761551664514727e-05, + "loss": 2.8004, + "step": 27755 + }, + { + "epoch": 1.7230119808802533, + "grad_norm": 0.13874786605139058, + "learning_rate": 4.575795315059393e-05, + "loss": 2.7596, + "step": 27756 + }, + { + "epoch": 1.723074057980011, + "grad_norm": 0.18943287062769965, + "learning_rate": 4.575435465880491e-05, + "loss": 2.8086, + "step": 27757 + }, + { + "epoch": 1.723136135079769, + "grad_norm": 0.16549693027938334, + "learning_rate": 4.575075618916645e-05, + "loss": 2.7756, + "step": 27758 + }, + { + "epoch": 1.7231982121795268, + "grad_norm": 0.19099340875222443, + "learning_rate": 4.574715774169733e-05, + "loss": 2.8276, + "step": 27759 + }, + { + "epoch": 1.7232602892792848, + "grad_norm": 0.1422770273351569, + "learning_rate": 4.5743559316416315e-05, + "loss": 2.7761, + "step": 27760 + }, + { + "epoch": 1.7233223663790427, + "grad_norm": 0.17924461517332685, + "learning_rate": 4.573996091334219e-05, + "loss": 2.7656, + "step": 27761 + }, + { + "epoch": 1.7233844434788006, + "grad_norm": 0.1448594953873683, + "learning_rate": 4.573636253249372e-05, + "loss": 2.8065, + "step": 27762 + }, + { + "epoch": 1.7234465205785585, + "grad_norm": 0.15265500843203067, + "learning_rate": 4.573276417388969e-05, + "loss": 2.7877, + "step": 27763 + }, + { + "epoch": 1.7235085976783164, + "grad_norm": 0.1441980916764184, + "learning_rate": 4.572916583754885e-05, + "loss": 2.7508, + "step": 27764 + }, + { + "epoch": 1.7235706747780744, + "grad_norm": 0.15306040121363113, + "learning_rate": 4.572556752349e-05, + "loss": 2.737, + "step": 27765 + }, + { + "epoch": 1.7236327518778323, + "grad_norm": 0.157906259643189, + "learning_rate": 4.57219692317319e-05, + "loss": 2.7158, + "step": 27766 + }, + { + "epoch": 1.7236948289775902, + "grad_norm": 0.16609555171696294, + "learning_rate": 4.5718370962293325e-05, + "loss": 2.7742, + "step": 27767 + }, + { + "epoch": 1.723756906077348, + "grad_norm": 0.14650343453072362, + "learning_rate": 4.571477271519305e-05, + "loss": 2.7197, + "step": 27768 + }, + { + "epoch": 1.723818983177106, + "grad_norm": 0.15974893762603884, + "learning_rate": 4.5711174490449836e-05, + "loss": 2.8345, + "step": 27769 + }, + { + "epoch": 1.723881060276864, + "grad_norm": 0.14454647576834978, + "learning_rate": 4.570757628808248e-05, + "loss": 2.8399, + "step": 27770 + }, + { + "epoch": 1.7239431373766219, + "grad_norm": 0.14397918452750944, + "learning_rate": 4.570397810810973e-05, + "loss": 2.7298, + "step": 27771 + }, + { + "epoch": 1.7240052144763798, + "grad_norm": 0.14679874383448385, + "learning_rate": 4.570037995055038e-05, + "loss": 2.7103, + "step": 27772 + }, + { + "epoch": 1.7240672915761377, + "grad_norm": 0.14426633036525827, + "learning_rate": 4.5696781815423176e-05, + "loss": 2.7624, + "step": 27773 + }, + { + "epoch": 1.7241293686758956, + "grad_norm": 0.14306876275006386, + "learning_rate": 4.5693183702746914e-05, + "loss": 2.766, + "step": 27774 + }, + { + "epoch": 1.7241914457756533, + "grad_norm": 0.143316733439587, + "learning_rate": 4.5689585612540356e-05, + "loss": 2.8264, + "step": 27775 + }, + { + "epoch": 1.7242535228754112, + "grad_norm": 0.14233982735053816, + "learning_rate": 4.5685987544822265e-05, + "loss": 2.6913, + "step": 27776 + }, + { + "epoch": 1.7243155999751691, + "grad_norm": 0.1438799825007765, + "learning_rate": 4.568238949961144e-05, + "loss": 2.7325, + "step": 27777 + }, + { + "epoch": 1.724377677074927, + "grad_norm": 0.14471009504401186, + "learning_rate": 4.5678791476926627e-05, + "loss": 2.7795, + "step": 27778 + }, + { + "epoch": 1.724439754174685, + "grad_norm": 0.1465028560424316, + "learning_rate": 4.567519347678662e-05, + "loss": 2.8579, + "step": 27779 + }, + { + "epoch": 1.724501831274443, + "grad_norm": 0.15763715478096868, + "learning_rate": 4.567159549921017e-05, + "loss": 2.7913, + "step": 27780 + }, + { + "epoch": 1.7245639083742006, + "grad_norm": 0.148842159620747, + "learning_rate": 4.566799754421607e-05, + "loss": 2.792, + "step": 27781 + }, + { + "epoch": 1.7246259854739585, + "grad_norm": 0.14522017781479415, + "learning_rate": 4.566439961182307e-05, + "loss": 2.814, + "step": 27782 + }, + { + "epoch": 1.7246880625737164, + "grad_norm": 0.1387907713944038, + "learning_rate": 4.566080170204994e-05, + "loss": 2.7104, + "step": 27783 + }, + { + "epoch": 1.7247501396734743, + "grad_norm": 0.14674406556359096, + "learning_rate": 4.565720381491548e-05, + "loss": 2.7475, + "step": 27784 + }, + { + "epoch": 1.7248122167732323, + "grad_norm": 0.14173283904383768, + "learning_rate": 4.5653605950438424e-05, + "loss": 2.7257, + "step": 27785 + }, + { + "epoch": 1.7248742938729902, + "grad_norm": 0.14948778889098085, + "learning_rate": 4.565000810863758e-05, + "loss": 2.818, + "step": 27786 + }, + { + "epoch": 1.724936370972748, + "grad_norm": 0.1657153555633732, + "learning_rate": 4.5646410289531675e-05, + "loss": 2.7657, + "step": 27787 + }, + { + "epoch": 1.724998448072506, + "grad_norm": 0.14192576947318605, + "learning_rate": 4.564281249313953e-05, + "loss": 2.7406, + "step": 27788 + }, + { + "epoch": 1.725060525172264, + "grad_norm": 0.14274131241273827, + "learning_rate": 4.563921471947987e-05, + "loss": 2.7861, + "step": 27789 + }, + { + "epoch": 1.7251226022720219, + "grad_norm": 0.14598836846469335, + "learning_rate": 4.56356169685715e-05, + "loss": 2.7925, + "step": 27790 + }, + { + "epoch": 1.7251846793717798, + "grad_norm": 0.14470334375157126, + "learning_rate": 4.563201924043316e-05, + "loss": 2.8226, + "step": 27791 + }, + { + "epoch": 1.7252467564715377, + "grad_norm": 0.15012189462167258, + "learning_rate": 4.562842153508365e-05, + "loss": 2.7407, + "step": 27792 + }, + { + "epoch": 1.7253088335712956, + "grad_norm": 0.15119143711844235, + "learning_rate": 4.5624823852541725e-05, + "loss": 2.7785, + "step": 27793 + }, + { + "epoch": 1.7253709106710535, + "grad_norm": 0.16779470852889156, + "learning_rate": 4.562122619282615e-05, + "loss": 2.8395, + "step": 27794 + }, + { + "epoch": 1.7254329877708114, + "grad_norm": 0.1408964953646951, + "learning_rate": 4.5617628555955704e-05, + "loss": 2.7256, + "step": 27795 + }, + { + "epoch": 1.7254950648705694, + "grad_norm": 0.1385721129888503, + "learning_rate": 4.561403094194915e-05, + "loss": 2.6171, + "step": 27796 + }, + { + "epoch": 1.7255571419703273, + "grad_norm": 0.14406308179187594, + "learning_rate": 4.561043335082527e-05, + "loss": 2.8161, + "step": 27797 + }, + { + "epoch": 1.7256192190700852, + "grad_norm": 0.14592662804477843, + "learning_rate": 4.560683578260281e-05, + "loss": 2.7592, + "step": 27798 + }, + { + "epoch": 1.725681296169843, + "grad_norm": 0.17278535296832298, + "learning_rate": 4.560323823730057e-05, + "loss": 2.809, + "step": 27799 + }, + { + "epoch": 1.7257433732696008, + "grad_norm": 0.14490816868952888, + "learning_rate": 4.559964071493729e-05, + "loss": 2.8095, + "step": 27800 + }, + { + "epoch": 1.7258054503693587, + "grad_norm": 0.15594935699862664, + "learning_rate": 4.559604321553176e-05, + "loss": 2.8445, + "step": 27801 + }, + { + "epoch": 1.7258675274691166, + "grad_norm": 0.14811049380273747, + "learning_rate": 4.559244573910274e-05, + "loss": 2.8093, + "step": 27802 + }, + { + "epoch": 1.7259296045688746, + "grad_norm": 0.1544812404742452, + "learning_rate": 4.558884828566901e-05, + "loss": 2.7388, + "step": 27803 + }, + { + "epoch": 1.7259916816686325, + "grad_norm": 0.1836865474660655, + "learning_rate": 4.558525085524931e-05, + "loss": 2.7546, + "step": 27804 + }, + { + "epoch": 1.7260537587683902, + "grad_norm": 0.15279613475501272, + "learning_rate": 4.5581653447862446e-05, + "loss": 2.7919, + "step": 27805 + }, + { + "epoch": 1.726115835868148, + "grad_norm": 0.14799778207360684, + "learning_rate": 4.557805606352717e-05, + "loss": 2.7763, + "step": 27806 + }, + { + "epoch": 1.726177912967906, + "grad_norm": 0.1457413904397804, + "learning_rate": 4.557445870226224e-05, + "loss": 2.8159, + "step": 27807 + }, + { + "epoch": 1.726239990067664, + "grad_norm": 0.14034763108779633, + "learning_rate": 4.5570861364086434e-05, + "loss": 2.7043, + "step": 27808 + }, + { + "epoch": 1.7263020671674218, + "grad_norm": 0.16181501412123767, + "learning_rate": 4.5567264049018496e-05, + "loss": 2.7864, + "step": 27809 + }, + { + "epoch": 1.7263641442671798, + "grad_norm": 0.14594595185215306, + "learning_rate": 4.556366675707724e-05, + "loss": 2.8298, + "step": 27810 + }, + { + "epoch": 1.7264262213669377, + "grad_norm": 0.1590128006232147, + "learning_rate": 4.556006948828141e-05, + "loss": 2.8392, + "step": 27811 + }, + { + "epoch": 1.7264882984666956, + "grad_norm": 0.15416942233659958, + "learning_rate": 4.555647224264978e-05, + "loss": 2.7852, + "step": 27812 + }, + { + "epoch": 1.7265503755664535, + "grad_norm": 0.16510423806687344, + "learning_rate": 4.555287502020111e-05, + "loss": 2.8282, + "step": 27813 + }, + { + "epoch": 1.7266124526662114, + "grad_norm": 0.1427455076355887, + "learning_rate": 4.554927782095417e-05, + "loss": 2.6862, + "step": 27814 + }, + { + "epoch": 1.7266745297659694, + "grad_norm": 0.14902866749316687, + "learning_rate": 4.5545680644927734e-05, + "loss": 2.7439, + "step": 27815 + }, + { + "epoch": 1.7267366068657273, + "grad_norm": 0.14848193331796972, + "learning_rate": 4.5542083492140555e-05, + "loss": 2.762, + "step": 27816 + }, + { + "epoch": 1.7267986839654852, + "grad_norm": 0.1413194938204484, + "learning_rate": 4.5538486362611416e-05, + "loss": 2.7263, + "step": 27817 + }, + { + "epoch": 1.726860761065243, + "grad_norm": 0.15233722085899795, + "learning_rate": 4.5534889256359075e-05, + "loss": 2.902, + "step": 27818 + }, + { + "epoch": 1.726922838165001, + "grad_norm": 0.14730439497889058, + "learning_rate": 4.5531292173402295e-05, + "loss": 2.7344, + "step": 27819 + }, + { + "epoch": 1.726984915264759, + "grad_norm": 0.16221693216568184, + "learning_rate": 4.552769511375985e-05, + "loss": 2.8356, + "step": 27820 + }, + { + "epoch": 1.7270469923645169, + "grad_norm": 0.1460476914898557, + "learning_rate": 4.5524098077450505e-05, + "loss": 2.7506, + "step": 27821 + }, + { + "epoch": 1.7271090694642748, + "grad_norm": 0.14832305260679784, + "learning_rate": 4.5520501064493036e-05, + "loss": 2.7846, + "step": 27822 + }, + { + "epoch": 1.7271711465640325, + "grad_norm": 0.15932484894231616, + "learning_rate": 4.551690407490619e-05, + "loss": 2.8329, + "step": 27823 + }, + { + "epoch": 1.7272332236637904, + "grad_norm": 0.15297428494182042, + "learning_rate": 4.551330710870875e-05, + "loss": 2.8899, + "step": 27824 + }, + { + "epoch": 1.7272953007635483, + "grad_norm": 0.15935669751820922, + "learning_rate": 4.550971016591947e-05, + "loss": 2.8008, + "step": 27825 + }, + { + "epoch": 1.7273573778633062, + "grad_norm": 0.14014933858265258, + "learning_rate": 4.550611324655713e-05, + "loss": 2.7453, + "step": 27826 + }, + { + "epoch": 1.7274194549630641, + "grad_norm": 0.14390714027966064, + "learning_rate": 4.550251635064048e-05, + "loss": 2.763, + "step": 27827 + }, + { + "epoch": 1.727481532062822, + "grad_norm": 0.1498904021535947, + "learning_rate": 4.54989194781883e-05, + "loss": 2.8192, + "step": 27828 + }, + { + "epoch": 1.7275436091625798, + "grad_norm": 0.14526349877098732, + "learning_rate": 4.5495322629219345e-05, + "loss": 2.7612, + "step": 27829 + }, + { + "epoch": 1.7276056862623377, + "grad_norm": 0.13818304108957366, + "learning_rate": 4.549172580375239e-05, + "loss": 2.8514, + "step": 27830 + }, + { + "epoch": 1.7276677633620956, + "grad_norm": 0.1540772770648553, + "learning_rate": 4.54881290018062e-05, + "loss": 2.8101, + "step": 27831 + }, + { + "epoch": 1.7277298404618535, + "grad_norm": 0.14606287395370662, + "learning_rate": 4.548453222339952e-05, + "loss": 2.8523, + "step": 27832 + }, + { + "epoch": 1.7277919175616114, + "grad_norm": 0.15079649739430126, + "learning_rate": 4.548093546855115e-05, + "loss": 2.7375, + "step": 27833 + }, + { + "epoch": 1.7278539946613694, + "grad_norm": 0.16451668881378428, + "learning_rate": 4.547733873727982e-05, + "loss": 2.8036, + "step": 27834 + }, + { + "epoch": 1.7279160717611273, + "grad_norm": 0.15688853919735388, + "learning_rate": 4.547374202960433e-05, + "loss": 2.8053, + "step": 27835 + }, + { + "epoch": 1.7279781488608852, + "grad_norm": 0.1503056930400493, + "learning_rate": 4.5470145345543405e-05, + "loss": 2.7452, + "step": 27836 + }, + { + "epoch": 1.728040225960643, + "grad_norm": 0.16756843947939945, + "learning_rate": 4.546654868511585e-05, + "loss": 2.8599, + "step": 27837 + }, + { + "epoch": 1.728102303060401, + "grad_norm": 0.1616033042959537, + "learning_rate": 4.5462952048340404e-05, + "loss": 2.804, + "step": 27838 + }, + { + "epoch": 1.728164380160159, + "grad_norm": 0.16050510095054524, + "learning_rate": 4.545935543523584e-05, + "loss": 2.8496, + "step": 27839 + }, + { + "epoch": 1.7282264572599169, + "grad_norm": 0.14311555272160983, + "learning_rate": 4.5455758845820915e-05, + "loss": 2.696, + "step": 27840 + }, + { + "epoch": 1.7282885343596748, + "grad_norm": 0.15576316569741452, + "learning_rate": 4.5452162280114386e-05, + "loss": 2.8125, + "step": 27841 + }, + { + "epoch": 1.7283506114594327, + "grad_norm": 0.14518640210159, + "learning_rate": 4.544856573813505e-05, + "loss": 2.7596, + "step": 27842 + }, + { + "epoch": 1.7284126885591906, + "grad_norm": 0.1590600705195456, + "learning_rate": 4.544496921990166e-05, + "loss": 2.8387, + "step": 27843 + }, + { + "epoch": 1.7284747656589485, + "grad_norm": 0.14620042618909235, + "learning_rate": 4.544137272543295e-05, + "loss": 2.8345, + "step": 27844 + }, + { + "epoch": 1.7285368427587064, + "grad_norm": 0.15287086797746055, + "learning_rate": 4.543777625474773e-05, + "loss": 2.7168, + "step": 27845 + }, + { + "epoch": 1.7285989198584644, + "grad_norm": 0.14592778320556754, + "learning_rate": 4.5434179807864716e-05, + "loss": 2.7357, + "step": 27846 + }, + { + "epoch": 1.728660996958222, + "grad_norm": 0.1469760526853972, + "learning_rate": 4.54305833848027e-05, + "loss": 2.81, + "step": 27847 + }, + { + "epoch": 1.72872307405798, + "grad_norm": 0.14895408236388916, + "learning_rate": 4.5426986985580445e-05, + "loss": 2.7533, + "step": 27848 + }, + { + "epoch": 1.728785151157738, + "grad_norm": 0.15859687337007775, + "learning_rate": 4.5423390610216707e-05, + "loss": 2.786, + "step": 27849 + }, + { + "epoch": 1.7288472282574958, + "grad_norm": 0.18265686012211851, + "learning_rate": 4.541979425873025e-05, + "loss": 2.7596, + "step": 27850 + }, + { + "epoch": 1.7289093053572537, + "grad_norm": 0.1428427070269984, + "learning_rate": 4.541619793113985e-05, + "loss": 2.7248, + "step": 27851 + }, + { + "epoch": 1.7289713824570117, + "grad_norm": 0.15585856282111016, + "learning_rate": 4.5412601627464235e-05, + "loss": 2.77, + "step": 27852 + }, + { + "epoch": 1.7290334595567693, + "grad_norm": 0.15726732827300324, + "learning_rate": 4.540900534772221e-05, + "loss": 2.8147, + "step": 27853 + }, + { + "epoch": 1.7290955366565273, + "grad_norm": 0.14533372360000832, + "learning_rate": 4.5405409091932506e-05, + "loss": 2.8359, + "step": 27854 + }, + { + "epoch": 1.7291576137562852, + "grad_norm": 0.14682009056089554, + "learning_rate": 4.540181286011391e-05, + "loss": 2.6987, + "step": 27855 + }, + { + "epoch": 1.729219690856043, + "grad_norm": 0.15205664356962698, + "learning_rate": 4.539821665228518e-05, + "loss": 2.7767, + "step": 27856 + }, + { + "epoch": 1.729281767955801, + "grad_norm": 0.14661899738431555, + "learning_rate": 4.5394620468465045e-05, + "loss": 2.7945, + "step": 27857 + }, + { + "epoch": 1.729343845055559, + "grad_norm": 0.18071726453837847, + "learning_rate": 4.5391024308672316e-05, + "loss": 2.771, + "step": 27858 + }, + { + "epoch": 1.7294059221553169, + "grad_norm": 0.15405900340919404, + "learning_rate": 4.538742817292571e-05, + "loss": 2.7648, + "step": 27859 + }, + { + "epoch": 1.7294679992550748, + "grad_norm": 0.15479332093086065, + "learning_rate": 4.538383206124403e-05, + "loss": 2.807, + "step": 27860 + }, + { + "epoch": 1.7295300763548327, + "grad_norm": 0.15480774795814412, + "learning_rate": 4.538023597364601e-05, + "loss": 2.7714, + "step": 27861 + }, + { + "epoch": 1.7295921534545906, + "grad_norm": 0.14965112740994999, + "learning_rate": 4.537663991015042e-05, + "loss": 2.8559, + "step": 27862 + }, + { + "epoch": 1.7296542305543485, + "grad_norm": 0.15714269379604973, + "learning_rate": 4.5373043870776025e-05, + "loss": 2.8731, + "step": 27863 + }, + { + "epoch": 1.7297163076541064, + "grad_norm": 0.14912827280247107, + "learning_rate": 4.536944785554158e-05, + "loss": 2.7769, + "step": 27864 + }, + { + "epoch": 1.7297783847538644, + "grad_norm": 0.15846922359346888, + "learning_rate": 4.5365851864465846e-05, + "loss": 2.7398, + "step": 27865 + }, + { + "epoch": 1.7298404618536223, + "grad_norm": 0.14800717065490263, + "learning_rate": 4.53622558975676e-05, + "loss": 2.7791, + "step": 27866 + }, + { + "epoch": 1.7299025389533802, + "grad_norm": 0.1444008148427775, + "learning_rate": 4.535865995486559e-05, + "loss": 2.7971, + "step": 27867 + }, + { + "epoch": 1.7299646160531381, + "grad_norm": 0.14885019979692568, + "learning_rate": 4.535506403637857e-05, + "loss": 2.7656, + "step": 27868 + }, + { + "epoch": 1.730026693152896, + "grad_norm": 0.16269660207322056, + "learning_rate": 4.535146814212531e-05, + "loss": 2.7095, + "step": 27869 + }, + { + "epoch": 1.730088770252654, + "grad_norm": 0.14595515975456752, + "learning_rate": 4.534787227212457e-05, + "loss": 2.7076, + "step": 27870 + }, + { + "epoch": 1.7301508473524116, + "grad_norm": 0.15283973966908854, + "learning_rate": 4.534427642639512e-05, + "loss": 2.9206, + "step": 27871 + }, + { + "epoch": 1.7302129244521696, + "grad_norm": 0.14870150829549447, + "learning_rate": 4.534068060495569e-05, + "loss": 2.7823, + "step": 27872 + }, + { + "epoch": 1.7302750015519275, + "grad_norm": 0.16064701252328378, + "learning_rate": 4.533708480782507e-05, + "loss": 2.8939, + "step": 27873 + }, + { + "epoch": 1.7303370786516854, + "grad_norm": 0.1573542260240537, + "learning_rate": 4.5333489035022e-05, + "loss": 2.7402, + "step": 27874 + }, + { + "epoch": 1.7303991557514433, + "grad_norm": 0.14320872912390611, + "learning_rate": 4.532989328656525e-05, + "loss": 2.8244, + "step": 27875 + }, + { + "epoch": 1.7304612328512012, + "grad_norm": 0.1671490350932337, + "learning_rate": 4.5326297562473596e-05, + "loss": 2.8386, + "step": 27876 + }, + { + "epoch": 1.730523309950959, + "grad_norm": 0.19581228208179688, + "learning_rate": 4.532270186276578e-05, + "loss": 2.8135, + "step": 27877 + }, + { + "epoch": 1.7305853870507169, + "grad_norm": 0.14565744323094398, + "learning_rate": 4.531910618746056e-05, + "loss": 2.8228, + "step": 27878 + }, + { + "epoch": 1.7306474641504748, + "grad_norm": 0.15353922189381944, + "learning_rate": 4.53155105365767e-05, + "loss": 2.8408, + "step": 27879 + }, + { + "epoch": 1.7307095412502327, + "grad_norm": 0.1459729230476556, + "learning_rate": 4.5311914910132965e-05, + "loss": 2.8199, + "step": 27880 + }, + { + "epoch": 1.7307716183499906, + "grad_norm": 0.16527005119343827, + "learning_rate": 4.5308319308148106e-05, + "loss": 2.6936, + "step": 27881 + }, + { + "epoch": 1.7308336954497485, + "grad_norm": 0.17093591006758255, + "learning_rate": 4.530472373064087e-05, + "loss": 2.8108, + "step": 27882 + }, + { + "epoch": 1.7308957725495064, + "grad_norm": 0.1668050286044457, + "learning_rate": 4.530112817763005e-05, + "loss": 2.787, + "step": 27883 + }, + { + "epoch": 1.7309578496492644, + "grad_norm": 0.15634096001433218, + "learning_rate": 4.529753264913437e-05, + "loss": 2.7678, + "step": 27884 + }, + { + "epoch": 1.7310199267490223, + "grad_norm": 0.17169061979653927, + "learning_rate": 4.5293937145172616e-05, + "loss": 2.7219, + "step": 27885 + }, + { + "epoch": 1.7310820038487802, + "grad_norm": 0.1716961960129962, + "learning_rate": 4.529034166576352e-05, + "loss": 2.8182, + "step": 27886 + }, + { + "epoch": 1.731144080948538, + "grad_norm": 0.15109881328473984, + "learning_rate": 4.528674621092587e-05, + "loss": 2.7772, + "step": 27887 + }, + { + "epoch": 1.731206158048296, + "grad_norm": 0.14724042965304243, + "learning_rate": 4.5283150780678396e-05, + "loss": 2.633, + "step": 27888 + }, + { + "epoch": 1.731268235148054, + "grad_norm": 0.14839586502219276, + "learning_rate": 4.5279555375039886e-05, + "loss": 2.7927, + "step": 27889 + }, + { + "epoch": 1.7313303122478119, + "grad_norm": 0.14370868493419905, + "learning_rate": 4.527595999402906e-05, + "loss": 2.7436, + "step": 27890 + }, + { + "epoch": 1.7313923893475698, + "grad_norm": 0.16611867959805993, + "learning_rate": 4.527236463766472e-05, + "loss": 2.8257, + "step": 27891 + }, + { + "epoch": 1.7314544664473277, + "grad_norm": 0.1484309307953661, + "learning_rate": 4.5268769305965585e-05, + "loss": 2.774, + "step": 27892 + }, + { + "epoch": 1.7315165435470856, + "grad_norm": 0.14904048442207268, + "learning_rate": 4.5265173998950435e-05, + "loss": 2.7841, + "step": 27893 + }, + { + "epoch": 1.7315786206468435, + "grad_norm": 0.14905979193020905, + "learning_rate": 4.526157871663802e-05, + "loss": 2.7504, + "step": 27894 + }, + { + "epoch": 1.7316406977466012, + "grad_norm": 0.15614413012575995, + "learning_rate": 4.52579834590471e-05, + "loss": 2.8444, + "step": 27895 + }, + { + "epoch": 1.7317027748463591, + "grad_norm": 0.15794113034694168, + "learning_rate": 4.5254388226196434e-05, + "loss": 2.7973, + "step": 27896 + }, + { + "epoch": 1.731764851946117, + "grad_norm": 0.17324476186394272, + "learning_rate": 4.5250793018104765e-05, + "loss": 2.7965, + "step": 27897 + }, + { + "epoch": 1.731826929045875, + "grad_norm": 0.15442322133291894, + "learning_rate": 4.5247197834790876e-05, + "loss": 2.8548, + "step": 27898 + }, + { + "epoch": 1.731889006145633, + "grad_norm": 0.1443172107157328, + "learning_rate": 4.52436026762735e-05, + "loss": 2.8151, + "step": 27899 + }, + { + "epoch": 1.7319510832453908, + "grad_norm": 0.15904533824820946, + "learning_rate": 4.524000754257142e-05, + "loss": 2.7274, + "step": 27900 + }, + { + "epoch": 1.7320131603451485, + "grad_norm": 0.14762725040713467, + "learning_rate": 4.523641243370335e-05, + "loss": 2.6978, + "step": 27901 + }, + { + "epoch": 1.7320752374449064, + "grad_norm": 0.1637692953618716, + "learning_rate": 4.52328173496881e-05, + "loss": 2.7196, + "step": 27902 + }, + { + "epoch": 1.7321373145446644, + "grad_norm": 0.14657818651335291, + "learning_rate": 4.522922229054439e-05, + "loss": 2.7944, + "step": 27903 + }, + { + "epoch": 1.7321993916444223, + "grad_norm": 0.16880136211546823, + "learning_rate": 4.522562725629097e-05, + "loss": 2.7012, + "step": 27904 + }, + { + "epoch": 1.7322614687441802, + "grad_norm": 0.15009546421016473, + "learning_rate": 4.5222032246946624e-05, + "loss": 2.7665, + "step": 27905 + }, + { + "epoch": 1.732323545843938, + "grad_norm": 0.14168045214122477, + "learning_rate": 4.521843726253009e-05, + "loss": 2.7738, + "step": 27906 + }, + { + "epoch": 1.732385622943696, + "grad_norm": 0.17209239779014363, + "learning_rate": 4.521484230306012e-05, + "loss": 2.8078, + "step": 27907 + }, + { + "epoch": 1.732447700043454, + "grad_norm": 0.14629072740979973, + "learning_rate": 4.52112473685555e-05, + "loss": 2.8549, + "step": 27908 + }, + { + "epoch": 1.7325097771432119, + "grad_norm": 0.15220348307515844, + "learning_rate": 4.520765245903496e-05, + "loss": 2.8075, + "step": 27909 + }, + { + "epoch": 1.7325718542429698, + "grad_norm": 0.15217465406207437, + "learning_rate": 4.520405757451725e-05, + "loss": 2.8218, + "step": 27910 + }, + { + "epoch": 1.7326339313427277, + "grad_norm": 0.1414708817028355, + "learning_rate": 4.5200462715021146e-05, + "loss": 2.7079, + "step": 27911 + }, + { + "epoch": 1.7326960084424856, + "grad_norm": 0.15075508147543124, + "learning_rate": 4.51968678805654e-05, + "loss": 2.8775, + "step": 27912 + }, + { + "epoch": 1.7327580855422435, + "grad_norm": 0.15291553568248548, + "learning_rate": 4.5193273071168744e-05, + "loss": 2.7595, + "step": 27913 + }, + { + "epoch": 1.7328201626420014, + "grad_norm": 0.15392930879533243, + "learning_rate": 4.518967828684996e-05, + "loss": 2.8442, + "step": 27914 + }, + { + "epoch": 1.7328822397417594, + "grad_norm": 0.16304475298543894, + "learning_rate": 4.5186083527627785e-05, + "loss": 2.838, + "step": 27915 + }, + { + "epoch": 1.7329443168415173, + "grad_norm": 0.1694969366656226, + "learning_rate": 4.5182488793520994e-05, + "loss": 2.8425, + "step": 27916 + }, + { + "epoch": 1.7330063939412752, + "grad_norm": 0.16420157532392207, + "learning_rate": 4.517889408454832e-05, + "loss": 2.7851, + "step": 27917 + }, + { + "epoch": 1.7330684710410331, + "grad_norm": 0.16538678731056244, + "learning_rate": 4.517529940072852e-05, + "loss": 2.8011, + "step": 27918 + }, + { + "epoch": 1.7331305481407908, + "grad_norm": 0.14950989162093567, + "learning_rate": 4.5171704742080365e-05, + "loss": 2.7396, + "step": 27919 + }, + { + "epoch": 1.7331926252405487, + "grad_norm": 0.16110718841260194, + "learning_rate": 4.516811010862259e-05, + "loss": 2.823, + "step": 27920 + }, + { + "epoch": 1.7332547023403067, + "grad_norm": 0.14266516367252982, + "learning_rate": 4.516451550037397e-05, + "loss": 2.7269, + "step": 27921 + }, + { + "epoch": 1.7333167794400646, + "grad_norm": 0.14954815251836331, + "learning_rate": 4.5160920917353234e-05, + "loss": 2.7837, + "step": 27922 + }, + { + "epoch": 1.7333788565398225, + "grad_norm": 0.15452546678758644, + "learning_rate": 4.5157326359579154e-05, + "loss": 2.8262, + "step": 27923 + }, + { + "epoch": 1.7334409336395804, + "grad_norm": 0.1465170315262776, + "learning_rate": 4.5153731827070474e-05, + "loss": 2.7429, + "step": 27924 + }, + { + "epoch": 1.733503010739338, + "grad_norm": 0.14705433271104154, + "learning_rate": 4.515013731984595e-05, + "loss": 2.8653, + "step": 27925 + }, + { + "epoch": 1.733565087839096, + "grad_norm": 0.1559737385857934, + "learning_rate": 4.514654283792434e-05, + "loss": 2.7137, + "step": 27926 + }, + { + "epoch": 1.733627164938854, + "grad_norm": 0.1655086405492312, + "learning_rate": 4.51429483813244e-05, + "loss": 2.9269, + "step": 27927 + }, + { + "epoch": 1.7336892420386119, + "grad_norm": 0.1718125826033015, + "learning_rate": 4.513935395006488e-05, + "loss": 2.7047, + "step": 27928 + }, + { + "epoch": 1.7337513191383698, + "grad_norm": 0.14914668849690668, + "learning_rate": 4.5135759544164516e-05, + "loss": 2.7752, + "step": 27929 + }, + { + "epoch": 1.7338133962381277, + "grad_norm": 0.1526267223538248, + "learning_rate": 4.513216516364209e-05, + "loss": 2.7967, + "step": 27930 + }, + { + "epoch": 1.7338754733378856, + "grad_norm": 0.150204692195493, + "learning_rate": 4.5128570808516324e-05, + "loss": 2.8424, + "step": 27931 + }, + { + "epoch": 1.7339375504376435, + "grad_norm": 0.1543708756606331, + "learning_rate": 4.5124976478805994e-05, + "loss": 2.7347, + "step": 27932 + }, + { + "epoch": 1.7339996275374014, + "grad_norm": 0.1486179672727696, + "learning_rate": 4.512138217452984e-05, + "loss": 2.7564, + "step": 27933 + }, + { + "epoch": 1.7340617046371594, + "grad_norm": 0.15095084984307316, + "learning_rate": 4.511778789570663e-05, + "loss": 2.7747, + "step": 27934 + }, + { + "epoch": 1.7341237817369173, + "grad_norm": 0.16886034245486123, + "learning_rate": 4.5114193642355096e-05, + "loss": 2.7788, + "step": 27935 + }, + { + "epoch": 1.7341858588366752, + "grad_norm": 0.14758987978703672, + "learning_rate": 4.511059941449401e-05, + "loss": 2.8519, + "step": 27936 + }, + { + "epoch": 1.7342479359364331, + "grad_norm": 0.15528605068007634, + "learning_rate": 4.5107005212142106e-05, + "loss": 2.7226, + "step": 27937 + }, + { + "epoch": 1.734310013036191, + "grad_norm": 0.13805550613579398, + "learning_rate": 4.510341103531815e-05, + "loss": 2.7999, + "step": 27938 + }, + { + "epoch": 1.734372090135949, + "grad_norm": 0.15132293669629873, + "learning_rate": 4.509981688404088e-05, + "loss": 2.8116, + "step": 27939 + }, + { + "epoch": 1.7344341672357069, + "grad_norm": 0.1525015258517302, + "learning_rate": 4.509622275832904e-05, + "loss": 2.8144, + "step": 27940 + }, + { + "epoch": 1.7344962443354648, + "grad_norm": 0.1475699634324715, + "learning_rate": 4.509262865820142e-05, + "loss": 2.7293, + "step": 27941 + }, + { + "epoch": 1.7345583214352227, + "grad_norm": 0.14540768705158433, + "learning_rate": 4.508903458367675e-05, + "loss": 2.7403, + "step": 27942 + }, + { + "epoch": 1.7346203985349804, + "grad_norm": 0.146420757466514, + "learning_rate": 4.508544053477376e-05, + "loss": 2.7866, + "step": 27943 + }, + { + "epoch": 1.7346824756347383, + "grad_norm": 0.142004065823525, + "learning_rate": 4.508184651151124e-05, + "loss": 2.7671, + "step": 27944 + }, + { + "epoch": 1.7347445527344962, + "grad_norm": 0.14026088065637146, + "learning_rate": 4.507825251390791e-05, + "loss": 2.7387, + "step": 27945 + }, + { + "epoch": 1.7348066298342542, + "grad_norm": 0.14341707835336004, + "learning_rate": 4.507465854198253e-05, + "loss": 2.7604, + "step": 27946 + }, + { + "epoch": 1.734868706934012, + "grad_norm": 0.13971401888815324, + "learning_rate": 4.507106459575385e-05, + "loss": 2.7296, + "step": 27947 + }, + { + "epoch": 1.73493078403377, + "grad_norm": 0.14821660395264394, + "learning_rate": 4.506747067524064e-05, + "loss": 2.7711, + "step": 27948 + }, + { + "epoch": 1.7349928611335277, + "grad_norm": 0.1455500880535322, + "learning_rate": 4.5063876780461614e-05, + "loss": 2.7339, + "step": 27949 + }, + { + "epoch": 1.7350549382332856, + "grad_norm": 0.14052353615867888, + "learning_rate": 4.506028291143555e-05, + "loss": 2.7945, + "step": 27950 + }, + { + "epoch": 1.7351170153330435, + "grad_norm": 0.15252987788801844, + "learning_rate": 4.505668906818118e-05, + "loss": 2.7907, + "step": 27951 + }, + { + "epoch": 1.7351790924328014, + "grad_norm": 0.1555930812209927, + "learning_rate": 4.505309525071728e-05, + "loss": 2.7611, + "step": 27952 + }, + { + "epoch": 1.7352411695325594, + "grad_norm": 0.182542418406826, + "learning_rate": 4.504950145906257e-05, + "loss": 2.8583, + "step": 27953 + }, + { + "epoch": 1.7353032466323173, + "grad_norm": 0.137138308762497, + "learning_rate": 4.5045907693235814e-05, + "loss": 2.7291, + "step": 27954 + }, + { + "epoch": 1.7353653237320752, + "grad_norm": 0.15363878837425754, + "learning_rate": 4.504231395325577e-05, + "loss": 2.7581, + "step": 27955 + }, + { + "epoch": 1.7354274008318331, + "grad_norm": 0.13865048955506964, + "learning_rate": 4.503872023914116e-05, + "loss": 2.6993, + "step": 27956 + }, + { + "epoch": 1.735489477931591, + "grad_norm": 0.16351194762146898, + "learning_rate": 4.503512655091077e-05, + "loss": 2.7792, + "step": 27957 + }, + { + "epoch": 1.735551555031349, + "grad_norm": 0.13821609573135749, + "learning_rate": 4.503153288858331e-05, + "loss": 2.7671, + "step": 27958 + }, + { + "epoch": 1.7356136321311069, + "grad_norm": 0.13833586185535335, + "learning_rate": 4.502793925217756e-05, + "loss": 2.6457, + "step": 27959 + }, + { + "epoch": 1.7356757092308648, + "grad_norm": 0.14541957407133488, + "learning_rate": 4.5024345641712254e-05, + "loss": 2.6611, + "step": 27960 + }, + { + "epoch": 1.7357377863306227, + "grad_norm": 0.14295012139031352, + "learning_rate": 4.5020752057206155e-05, + "loss": 2.7705, + "step": 27961 + }, + { + "epoch": 1.7357998634303806, + "grad_norm": 0.14578062123799856, + "learning_rate": 4.501715849867799e-05, + "loss": 2.8648, + "step": 27962 + }, + { + "epoch": 1.7358619405301385, + "grad_norm": 0.13663139958806872, + "learning_rate": 4.501356496614653e-05, + "loss": 2.7921, + "step": 27963 + }, + { + "epoch": 1.7359240176298965, + "grad_norm": 0.13801519289427375, + "learning_rate": 4.5009971459630494e-05, + "loss": 2.7903, + "step": 27964 + }, + { + "epoch": 1.7359860947296544, + "grad_norm": 0.14816503238335393, + "learning_rate": 4.500637797914867e-05, + "loss": 2.7235, + "step": 27965 + }, + { + "epoch": 1.7360481718294123, + "grad_norm": 0.13994371102159475, + "learning_rate": 4.500278452471978e-05, + "loss": 2.8065, + "step": 27966 + }, + { + "epoch": 1.73611024892917, + "grad_norm": 0.14349593692520188, + "learning_rate": 4.499919109636256e-05, + "loss": 2.7853, + "step": 27967 + }, + { + "epoch": 1.736172326028928, + "grad_norm": 0.14383397862811595, + "learning_rate": 4.499559769409579e-05, + "loss": 2.644, + "step": 27968 + }, + { + "epoch": 1.7362344031286858, + "grad_norm": 0.14529125047999158, + "learning_rate": 4.4992004317938184e-05, + "loss": 2.8224, + "step": 27969 + }, + { + "epoch": 1.7362964802284437, + "grad_norm": 0.14514527183877834, + "learning_rate": 4.498841096790853e-05, + "loss": 2.8242, + "step": 27970 + }, + { + "epoch": 1.7363585573282017, + "grad_norm": 0.15122691392076834, + "learning_rate": 4.498481764402553e-05, + "loss": 2.7147, + "step": 27971 + }, + { + "epoch": 1.7364206344279596, + "grad_norm": 0.14434933167951314, + "learning_rate": 4.498122434630795e-05, + "loss": 2.7133, + "step": 27972 + }, + { + "epoch": 1.7364827115277173, + "grad_norm": 0.1783671276898944, + "learning_rate": 4.4977631074774564e-05, + "loss": 2.7992, + "step": 27973 + }, + { + "epoch": 1.7365447886274752, + "grad_norm": 0.16016397806456303, + "learning_rate": 4.497403782944408e-05, + "loss": 2.8476, + "step": 27974 + }, + { + "epoch": 1.736606865727233, + "grad_norm": 0.1745227532721939, + "learning_rate": 4.4970444610335275e-05, + "loss": 2.9306, + "step": 27975 + }, + { + "epoch": 1.736668942826991, + "grad_norm": 0.14990138696619915, + "learning_rate": 4.4966851417466874e-05, + "loss": 2.8278, + "step": 27976 + }, + { + "epoch": 1.736731019926749, + "grad_norm": 0.1485748095128929, + "learning_rate": 4.496325825085764e-05, + "loss": 2.8878, + "step": 27977 + }, + { + "epoch": 1.7367930970265069, + "grad_norm": 0.15205465298367915, + "learning_rate": 4.495966511052631e-05, + "loss": 2.8174, + "step": 27978 + }, + { + "epoch": 1.7368551741262648, + "grad_norm": 0.14130411806644066, + "learning_rate": 4.495607199649161e-05, + "loss": 2.7207, + "step": 27979 + }, + { + "epoch": 1.7369172512260227, + "grad_norm": 0.14639741859751212, + "learning_rate": 4.4952478908772325e-05, + "loss": 2.7667, + "step": 27980 + }, + { + "epoch": 1.7369793283257806, + "grad_norm": 0.14749027112015586, + "learning_rate": 4.4948885847387175e-05, + "loss": 2.7402, + "step": 27981 + }, + { + "epoch": 1.7370414054255385, + "grad_norm": 0.15058979247189475, + "learning_rate": 4.494529281235492e-05, + "loss": 2.8172, + "step": 27982 + }, + { + "epoch": 1.7371034825252964, + "grad_norm": 0.14968988105098652, + "learning_rate": 4.494169980369429e-05, + "loss": 2.8148, + "step": 27983 + }, + { + "epoch": 1.7371655596250544, + "grad_norm": 0.15624060602559378, + "learning_rate": 4.493810682142405e-05, + "loss": 2.7707, + "step": 27984 + }, + { + "epoch": 1.7372276367248123, + "grad_norm": 0.17337976452682807, + "learning_rate": 4.493451386556293e-05, + "loss": 2.7825, + "step": 27985 + }, + { + "epoch": 1.7372897138245702, + "grad_norm": 0.14157096269082578, + "learning_rate": 4.493092093612969e-05, + "loss": 2.7262, + "step": 27986 + }, + { + "epoch": 1.7373517909243281, + "grad_norm": 0.19319887917832246, + "learning_rate": 4.492732803314305e-05, + "loss": 2.7776, + "step": 27987 + }, + { + "epoch": 1.737413868024086, + "grad_norm": 0.14536028301976842, + "learning_rate": 4.492373515662178e-05, + "loss": 2.7868, + "step": 27988 + }, + { + "epoch": 1.737475945123844, + "grad_norm": 0.14967901138096412, + "learning_rate": 4.492014230658463e-05, + "loss": 2.8077, + "step": 27989 + }, + { + "epoch": 1.7375380222236019, + "grad_norm": 0.1539006880902517, + "learning_rate": 4.491654948305031e-05, + "loss": 2.8349, + "step": 27990 + }, + { + "epoch": 1.7376000993233596, + "grad_norm": 0.1540931370587015, + "learning_rate": 4.491295668603759e-05, + "loss": 2.7553, + "step": 27991 + }, + { + "epoch": 1.7376621764231175, + "grad_norm": 0.1648506522428535, + "learning_rate": 4.490936391556521e-05, + "loss": 2.7292, + "step": 27992 + }, + { + "epoch": 1.7377242535228754, + "grad_norm": 0.1500863427767346, + "learning_rate": 4.4905771171651925e-05, + "loss": 2.8287, + "step": 27993 + }, + { + "epoch": 1.7377863306226333, + "grad_norm": 0.16865231348864665, + "learning_rate": 4.490217845431645e-05, + "loss": 2.8149, + "step": 27994 + }, + { + "epoch": 1.7378484077223912, + "grad_norm": 0.1495170612240312, + "learning_rate": 4.489858576357756e-05, + "loss": 2.8301, + "step": 27995 + }, + { + "epoch": 1.7379104848221492, + "grad_norm": 0.14553516566075353, + "learning_rate": 4.489499309945398e-05, + "loss": 2.7545, + "step": 27996 + }, + { + "epoch": 1.7379725619219069, + "grad_norm": 0.1529149205470199, + "learning_rate": 4.489140046196447e-05, + "loss": 2.6926, + "step": 27997 + }, + { + "epoch": 1.7380346390216648, + "grad_norm": 0.155600132133068, + "learning_rate": 4.488780785112775e-05, + "loss": 2.7629, + "step": 27998 + }, + { + "epoch": 1.7380967161214227, + "grad_norm": 0.15110213546403672, + "learning_rate": 4.4884215266962593e-05, + "loss": 2.712, + "step": 27999 + }, + { + "epoch": 1.7381587932211806, + "grad_norm": 0.1536857780699043, + "learning_rate": 4.488062270948771e-05, + "loss": 2.7507, + "step": 28000 + }, + { + "epoch": 1.7382208703209385, + "grad_norm": 0.1467139416151486, + "learning_rate": 4.487703017872188e-05, + "loss": 2.7362, + "step": 28001 + }, + { + "epoch": 1.7382829474206964, + "grad_norm": 0.1514084605891175, + "learning_rate": 4.4873437674683824e-05, + "loss": 2.8117, + "step": 28002 + }, + { + "epoch": 1.7383450245204544, + "grad_norm": 0.15592722933473552, + "learning_rate": 4.486984519739227e-05, + "loss": 2.7941, + "step": 28003 + }, + { + "epoch": 1.7384071016202123, + "grad_norm": 0.14781657652848765, + "learning_rate": 4.4866252746866e-05, + "loss": 2.7909, + "step": 28004 + }, + { + "epoch": 1.7384691787199702, + "grad_norm": 0.19433599500799048, + "learning_rate": 4.4862660323123714e-05, + "loss": 2.7261, + "step": 28005 + }, + { + "epoch": 1.7385312558197281, + "grad_norm": 0.15613886337828736, + "learning_rate": 4.485906792618418e-05, + "loss": 2.8059, + "step": 28006 + }, + { + "epoch": 1.738593332919486, + "grad_norm": 0.1522636875657608, + "learning_rate": 4.485547555606615e-05, + "loss": 2.7797, + "step": 28007 + }, + { + "epoch": 1.738655410019244, + "grad_norm": 0.15537513545430143, + "learning_rate": 4.485188321278834e-05, + "loss": 2.8559, + "step": 28008 + }, + { + "epoch": 1.7387174871190019, + "grad_norm": 0.1504071276143964, + "learning_rate": 4.484829089636952e-05, + "loss": 2.7654, + "step": 28009 + }, + { + "epoch": 1.7387795642187598, + "grad_norm": 0.1549798661372314, + "learning_rate": 4.4844698606828406e-05, + "loss": 2.7768, + "step": 28010 + }, + { + "epoch": 1.7388416413185177, + "grad_norm": 0.15132288130545787, + "learning_rate": 4.484110634418376e-05, + "loss": 2.7881, + "step": 28011 + }, + { + "epoch": 1.7389037184182756, + "grad_norm": 0.17135093184918462, + "learning_rate": 4.4837514108454306e-05, + "loss": 2.8047, + "step": 28012 + }, + { + "epoch": 1.7389657955180335, + "grad_norm": 0.15128850430031768, + "learning_rate": 4.48339218996588e-05, + "loss": 2.7054, + "step": 28013 + }, + { + "epoch": 1.7390278726177915, + "grad_norm": 0.14327446192445828, + "learning_rate": 4.483032971781598e-05, + "loss": 2.7544, + "step": 28014 + }, + { + "epoch": 1.7390899497175492, + "grad_norm": 0.15426203177374614, + "learning_rate": 4.482673756294458e-05, + "loss": 2.8267, + "step": 28015 + }, + { + "epoch": 1.739152026817307, + "grad_norm": 0.14013206660294475, + "learning_rate": 4.4823145435063346e-05, + "loss": 2.7787, + "step": 28016 + }, + { + "epoch": 1.739214103917065, + "grad_norm": 0.1518530032825569, + "learning_rate": 4.481955333419101e-05, + "loss": 2.8381, + "step": 28017 + }, + { + "epoch": 1.739276181016823, + "grad_norm": 0.15737817692598904, + "learning_rate": 4.481596126034634e-05, + "loss": 2.7473, + "step": 28018 + }, + { + "epoch": 1.7393382581165808, + "grad_norm": 0.14323784772979023, + "learning_rate": 4.4812369213548044e-05, + "loss": 2.8103, + "step": 28019 + }, + { + "epoch": 1.7394003352163387, + "grad_norm": 0.1529786439585216, + "learning_rate": 4.480877719381489e-05, + "loss": 2.8954, + "step": 28020 + }, + { + "epoch": 1.7394624123160964, + "grad_norm": 0.14693532197060905, + "learning_rate": 4.480518520116559e-05, + "loss": 2.7831, + "step": 28021 + }, + { + "epoch": 1.7395244894158544, + "grad_norm": 0.15142976636303473, + "learning_rate": 4.480159323561891e-05, + "loss": 2.7302, + "step": 28022 + }, + { + "epoch": 1.7395865665156123, + "grad_norm": 0.1493766525907358, + "learning_rate": 4.4798001297193574e-05, + "loss": 2.804, + "step": 28023 + }, + { + "epoch": 1.7396486436153702, + "grad_norm": 0.15184867328513899, + "learning_rate": 4.4794409385908343e-05, + "loss": 2.6847, + "step": 28024 + }, + { + "epoch": 1.7397107207151281, + "grad_norm": 0.16467983607134318, + "learning_rate": 4.479081750178192e-05, + "loss": 2.8089, + "step": 28025 + }, + { + "epoch": 1.739772797814886, + "grad_norm": 0.15150542561018918, + "learning_rate": 4.478722564483308e-05, + "loss": 2.8385, + "step": 28026 + }, + { + "epoch": 1.739834874914644, + "grad_norm": 0.15106428664835322, + "learning_rate": 4.478363381508055e-05, + "loss": 2.7493, + "step": 28027 + }, + { + "epoch": 1.7398969520144019, + "grad_norm": 0.17420927329867822, + "learning_rate": 4.4780042012543064e-05, + "loss": 2.7479, + "step": 28028 + }, + { + "epoch": 1.7399590291141598, + "grad_norm": 0.16744973704256771, + "learning_rate": 4.477645023723938e-05, + "loss": 2.7496, + "step": 28029 + }, + { + "epoch": 1.7400211062139177, + "grad_norm": 0.14462628333207317, + "learning_rate": 4.47728584891882e-05, + "loss": 2.8343, + "step": 28030 + }, + { + "epoch": 1.7400831833136756, + "grad_norm": 0.15689917651659957, + "learning_rate": 4.4769266768408305e-05, + "loss": 2.7615, + "step": 28031 + }, + { + "epoch": 1.7401452604134335, + "grad_norm": 0.15685130883658197, + "learning_rate": 4.47656750749184e-05, + "loss": 2.7902, + "step": 28032 + }, + { + "epoch": 1.7402073375131915, + "grad_norm": 0.16424008249943414, + "learning_rate": 4.4762083408737256e-05, + "loss": 2.7528, + "step": 28033 + }, + { + "epoch": 1.7402694146129494, + "grad_norm": 0.15609310737111, + "learning_rate": 4.475849176988358e-05, + "loss": 2.8129, + "step": 28034 + }, + { + "epoch": 1.7403314917127073, + "grad_norm": 0.15810090165024399, + "learning_rate": 4.475490015837613e-05, + "loss": 2.7526, + "step": 28035 + }, + { + "epoch": 1.7403935688124652, + "grad_norm": 0.168314776963656, + "learning_rate": 4.475130857423363e-05, + "loss": 2.8017, + "step": 28036 + }, + { + "epoch": 1.7404556459122231, + "grad_norm": 0.14845287720186082, + "learning_rate": 4.4747717017474844e-05, + "loss": 2.7922, + "step": 28037 + }, + { + "epoch": 1.740517723011981, + "grad_norm": 0.15619050323694675, + "learning_rate": 4.474412548811847e-05, + "loss": 2.7601, + "step": 28038 + }, + { + "epoch": 1.7405798001117387, + "grad_norm": 0.13720502885871003, + "learning_rate": 4.474053398618329e-05, + "loss": 2.6917, + "step": 28039 + }, + { + "epoch": 1.7406418772114967, + "grad_norm": 0.14302333144275003, + "learning_rate": 4.4736942511688006e-05, + "loss": 2.7189, + "step": 28040 + }, + { + "epoch": 1.7407039543112546, + "grad_norm": 0.14851038172545014, + "learning_rate": 4.473335106465138e-05, + "loss": 2.785, + "step": 28041 + }, + { + "epoch": 1.7407660314110125, + "grad_norm": 0.14581963710388357, + "learning_rate": 4.472975964509214e-05, + "loss": 2.7442, + "step": 28042 + }, + { + "epoch": 1.7408281085107704, + "grad_norm": 0.1397421729006301, + "learning_rate": 4.4726168253029034e-05, + "loss": 2.8037, + "step": 28043 + }, + { + "epoch": 1.7408901856105283, + "grad_norm": 0.13691375313368823, + "learning_rate": 4.472257688848077e-05, + "loss": 2.7686, + "step": 28044 + }, + { + "epoch": 1.740952262710286, + "grad_norm": 0.14621296215974125, + "learning_rate": 4.4718985551466115e-05, + "loss": 2.7314, + "step": 28045 + }, + { + "epoch": 1.741014339810044, + "grad_norm": 0.18604555876876508, + "learning_rate": 4.4715394242003783e-05, + "loss": 2.6949, + "step": 28046 + }, + { + "epoch": 1.7410764169098019, + "grad_norm": 0.14859796308487433, + "learning_rate": 4.471180296011254e-05, + "loss": 2.7864, + "step": 28047 + }, + { + "epoch": 1.7411384940095598, + "grad_norm": 0.1678386894404962, + "learning_rate": 4.4708211705811084e-05, + "loss": 2.7803, + "step": 28048 + }, + { + "epoch": 1.7412005711093177, + "grad_norm": 0.16122343231687988, + "learning_rate": 4.470462047911819e-05, + "loss": 2.7643, + "step": 28049 + }, + { + "epoch": 1.7412626482090756, + "grad_norm": 0.14980522389421583, + "learning_rate": 4.470102928005256e-05, + "loss": 2.8215, + "step": 28050 + }, + { + "epoch": 1.7413247253088335, + "grad_norm": 0.1570360972951629, + "learning_rate": 4.4697438108632965e-05, + "loss": 2.6885, + "step": 28051 + }, + { + "epoch": 1.7413868024085914, + "grad_norm": 0.14270878383351893, + "learning_rate": 4.4693846964878115e-05, + "loss": 2.6876, + "step": 28052 + }, + { + "epoch": 1.7414488795083494, + "grad_norm": 0.15483469125327076, + "learning_rate": 4.469025584880675e-05, + "loss": 2.7622, + "step": 28053 + }, + { + "epoch": 1.7415109566081073, + "grad_norm": 0.14676024844371865, + "learning_rate": 4.468666476043761e-05, + "loss": 2.8049, + "step": 28054 + }, + { + "epoch": 1.7415730337078652, + "grad_norm": 0.15857191428774592, + "learning_rate": 4.468307369978943e-05, + "loss": 2.7116, + "step": 28055 + }, + { + "epoch": 1.7416351108076231, + "grad_norm": 0.15149561448327933, + "learning_rate": 4.467948266688095e-05, + "loss": 2.8396, + "step": 28056 + }, + { + "epoch": 1.741697187907381, + "grad_norm": 0.1705513017126228, + "learning_rate": 4.4675891661730896e-05, + "loss": 2.7942, + "step": 28057 + }, + { + "epoch": 1.741759265007139, + "grad_norm": 0.15492733583622317, + "learning_rate": 4.467230068435801e-05, + "loss": 2.8165, + "step": 28058 + }, + { + "epoch": 1.7418213421068969, + "grad_norm": 0.15887804060426686, + "learning_rate": 4.4668709734781017e-05, + "loss": 2.769, + "step": 28059 + }, + { + "epoch": 1.7418834192066548, + "grad_norm": 0.1584922303523932, + "learning_rate": 4.466511881301867e-05, + "loss": 2.7467, + "step": 28060 + }, + { + "epoch": 1.7419454963064127, + "grad_norm": 0.16076177591251695, + "learning_rate": 4.466152791908969e-05, + "loss": 2.6964, + "step": 28061 + }, + { + "epoch": 1.7420075734061706, + "grad_norm": 0.1887360007734534, + "learning_rate": 4.465793705301282e-05, + "loss": 2.8686, + "step": 28062 + }, + { + "epoch": 1.7420696505059283, + "grad_norm": 0.18543159517633256, + "learning_rate": 4.465434621480678e-05, + "loss": 2.7187, + "step": 28063 + }, + { + "epoch": 1.7421317276056862, + "grad_norm": 0.15422234968720816, + "learning_rate": 4.465075540449031e-05, + "loss": 2.822, + "step": 28064 + }, + { + "epoch": 1.7421938047054442, + "grad_norm": 0.1676080456945302, + "learning_rate": 4.464716462208216e-05, + "loss": 2.7588, + "step": 28065 + }, + { + "epoch": 1.742255881805202, + "grad_norm": 0.1477790962323651, + "learning_rate": 4.464357386760104e-05, + "loss": 2.7924, + "step": 28066 + }, + { + "epoch": 1.74231795890496, + "grad_norm": 0.17626771899530708, + "learning_rate": 4.4639983141065714e-05, + "loss": 2.805, + "step": 28067 + }, + { + "epoch": 1.742380036004718, + "grad_norm": 0.14686245129120407, + "learning_rate": 4.4636392442494873e-05, + "loss": 2.7263, + "step": 28068 + }, + { + "epoch": 1.7424421131044756, + "grad_norm": 0.14855269304090696, + "learning_rate": 4.463280177190729e-05, + "loss": 2.8127, + "step": 28069 + }, + { + "epoch": 1.7425041902042335, + "grad_norm": 0.15134920197968985, + "learning_rate": 4.4629211129321676e-05, + "loss": 2.8233, + "step": 28070 + }, + { + "epoch": 1.7425662673039914, + "grad_norm": 0.14183893368140066, + "learning_rate": 4.462562051475676e-05, + "loss": 2.7036, + "step": 28071 + }, + { + "epoch": 1.7426283444037494, + "grad_norm": 0.15719498145384875, + "learning_rate": 4.46220299282313e-05, + "loss": 2.6973, + "step": 28072 + }, + { + "epoch": 1.7426904215035073, + "grad_norm": 0.1586177772492596, + "learning_rate": 4.461843936976401e-05, + "loss": 2.7774, + "step": 28073 + }, + { + "epoch": 1.7427524986032652, + "grad_norm": 0.16406884067635052, + "learning_rate": 4.461484883937364e-05, + "loss": 2.6644, + "step": 28074 + }, + { + "epoch": 1.7428145757030231, + "grad_norm": 0.16158173188382727, + "learning_rate": 4.4611258337078905e-05, + "loss": 2.7859, + "step": 28075 + }, + { + "epoch": 1.742876652802781, + "grad_norm": 0.15600163867476446, + "learning_rate": 4.4607667862898545e-05, + "loss": 2.7648, + "step": 28076 + }, + { + "epoch": 1.742938729902539, + "grad_norm": 0.15081817985080065, + "learning_rate": 4.46040774168513e-05, + "loss": 2.728, + "step": 28077 + }, + { + "epoch": 1.7430008070022969, + "grad_norm": 0.19849300021398078, + "learning_rate": 4.460048699895587e-05, + "loss": 2.7605, + "step": 28078 + }, + { + "epoch": 1.7430628841020548, + "grad_norm": 0.14550984144396373, + "learning_rate": 4.459689660923103e-05, + "loss": 2.776, + "step": 28079 + }, + { + "epoch": 1.7431249612018127, + "grad_norm": 0.14788577793760865, + "learning_rate": 4.4593306247695475e-05, + "loss": 2.7376, + "step": 28080 + }, + { + "epoch": 1.7431870383015706, + "grad_norm": 0.18038562247310513, + "learning_rate": 4.458971591436797e-05, + "loss": 2.7605, + "step": 28081 + }, + { + "epoch": 1.7432491154013285, + "grad_norm": 0.15450043469394098, + "learning_rate": 4.458612560926722e-05, + "loss": 2.7298, + "step": 28082 + }, + { + "epoch": 1.7433111925010865, + "grad_norm": 0.15192946320889555, + "learning_rate": 4.458253533241198e-05, + "loss": 2.783, + "step": 28083 + }, + { + "epoch": 1.7433732696008444, + "grad_norm": 0.1600897290318487, + "learning_rate": 4.457894508382095e-05, + "loss": 2.7697, + "step": 28084 + }, + { + "epoch": 1.7434353467006023, + "grad_norm": 0.15537348109436605, + "learning_rate": 4.4575354863512896e-05, + "loss": 2.8631, + "step": 28085 + }, + { + "epoch": 1.7434974238003602, + "grad_norm": 0.16420481390397135, + "learning_rate": 4.457176467150652e-05, + "loss": 2.697, + "step": 28086 + }, + { + "epoch": 1.743559500900118, + "grad_norm": 0.15222095162004176, + "learning_rate": 4.456817450782058e-05, + "loss": 2.8168, + "step": 28087 + }, + { + "epoch": 1.7436215779998758, + "grad_norm": 0.1456264997474654, + "learning_rate": 4.456458437247378e-05, + "loss": 2.7637, + "step": 28088 + }, + { + "epoch": 1.7436836550996337, + "grad_norm": 0.15036576548623565, + "learning_rate": 4.4560994265484865e-05, + "loss": 2.7612, + "step": 28089 + }, + { + "epoch": 1.7437457321993917, + "grad_norm": 0.15004550050024287, + "learning_rate": 4.4557404186872564e-05, + "loss": 2.808, + "step": 28090 + }, + { + "epoch": 1.7438078092991496, + "grad_norm": 0.14783022284362385, + "learning_rate": 4.4553814136655605e-05, + "loss": 2.702, + "step": 28091 + }, + { + "epoch": 1.7438698863989075, + "grad_norm": 0.1477434534080469, + "learning_rate": 4.4550224114852726e-05, + "loss": 2.6643, + "step": 28092 + }, + { + "epoch": 1.7439319634986652, + "grad_norm": 0.14996640951830623, + "learning_rate": 4.454663412148264e-05, + "loss": 2.8151, + "step": 28093 + }, + { + "epoch": 1.7439940405984231, + "grad_norm": 0.15420585072070325, + "learning_rate": 4.4543044156564104e-05, + "loss": 2.8274, + "step": 28094 + }, + { + "epoch": 1.744056117698181, + "grad_norm": 0.16630178920334746, + "learning_rate": 4.453945422011581e-05, + "loss": 2.7474, + "step": 28095 + }, + { + "epoch": 1.744118194797939, + "grad_norm": 0.14345795103314496, + "learning_rate": 4.453586431215653e-05, + "loss": 2.7108, + "step": 28096 + }, + { + "epoch": 1.7441802718976969, + "grad_norm": 0.14842473778321613, + "learning_rate": 4.4532274432704954e-05, + "loss": 2.7128, + "step": 28097 + }, + { + "epoch": 1.7442423489974548, + "grad_norm": 0.1458843658483871, + "learning_rate": 4.4528684581779846e-05, + "loss": 2.7544, + "step": 28098 + }, + { + "epoch": 1.7443044260972127, + "grad_norm": 0.13895326594297638, + "learning_rate": 4.452509475939991e-05, + "loss": 2.8212, + "step": 28099 + }, + { + "epoch": 1.7443665031969706, + "grad_norm": 0.15091453674726898, + "learning_rate": 4.452150496558388e-05, + "loss": 2.7706, + "step": 28100 + }, + { + "epoch": 1.7444285802967285, + "grad_norm": 0.1626840596698088, + "learning_rate": 4.45179152003505e-05, + "loss": 2.7717, + "step": 28101 + }, + { + "epoch": 1.7444906573964865, + "grad_norm": 0.1581300401826717, + "learning_rate": 4.451432546371848e-05, + "loss": 2.714, + "step": 28102 + }, + { + "epoch": 1.7445527344962444, + "grad_norm": 0.1489970240051738, + "learning_rate": 4.4510735755706536e-05, + "loss": 2.7134, + "step": 28103 + }, + { + "epoch": 1.7446148115960023, + "grad_norm": 0.153427851678535, + "learning_rate": 4.450714607633344e-05, + "loss": 2.7652, + "step": 28104 + }, + { + "epoch": 1.7446768886957602, + "grad_norm": 0.14724394008742892, + "learning_rate": 4.4503556425617886e-05, + "loss": 2.7578, + "step": 28105 + }, + { + "epoch": 1.7447389657955181, + "grad_norm": 0.14431514564425132, + "learning_rate": 4.4499966803578623e-05, + "loss": 2.843, + "step": 28106 + }, + { + "epoch": 1.744801042895276, + "grad_norm": 0.15674373581566536, + "learning_rate": 4.449637721023436e-05, + "loss": 2.6978, + "step": 28107 + }, + { + "epoch": 1.744863119995034, + "grad_norm": 0.14496597431368313, + "learning_rate": 4.449278764560384e-05, + "loss": 2.7265, + "step": 28108 + }, + { + "epoch": 1.7449251970947919, + "grad_norm": 0.14254231852489524, + "learning_rate": 4.448919810970578e-05, + "loss": 2.7276, + "step": 28109 + }, + { + "epoch": 1.7449872741945496, + "grad_norm": 0.14287339297965287, + "learning_rate": 4.448560860255891e-05, + "loss": 2.8459, + "step": 28110 + }, + { + "epoch": 1.7450493512943075, + "grad_norm": 0.14549982445827525, + "learning_rate": 4.448201912418196e-05, + "loss": 2.672, + "step": 28111 + }, + { + "epoch": 1.7451114283940654, + "grad_norm": 0.13999488511491537, + "learning_rate": 4.4478429674593665e-05, + "loss": 2.7167, + "step": 28112 + }, + { + "epoch": 1.7451735054938233, + "grad_norm": 0.1495057367066207, + "learning_rate": 4.447484025381274e-05, + "loss": 2.6978, + "step": 28113 + }, + { + "epoch": 1.7452355825935812, + "grad_norm": 0.14722880983446615, + "learning_rate": 4.4471250861857906e-05, + "loss": 2.852, + "step": 28114 + }, + { + "epoch": 1.7452976596933392, + "grad_norm": 0.15318559693205333, + "learning_rate": 4.44676614987479e-05, + "loss": 2.7008, + "step": 28115 + }, + { + "epoch": 1.7453597367930969, + "grad_norm": 0.14540712341875947, + "learning_rate": 4.4464072164501445e-05, + "loss": 2.7917, + "step": 28116 + }, + { + "epoch": 1.7454218138928548, + "grad_norm": 0.15114367194857323, + "learning_rate": 4.446048285913728e-05, + "loss": 2.6687, + "step": 28117 + }, + { + "epoch": 1.7454838909926127, + "grad_norm": 0.16254310150922135, + "learning_rate": 4.4456893582674105e-05, + "loss": 2.7888, + "step": 28118 + }, + { + "epoch": 1.7455459680923706, + "grad_norm": 0.14968588045713443, + "learning_rate": 4.445330433513068e-05, + "loss": 2.7935, + "step": 28119 + }, + { + "epoch": 1.7456080451921285, + "grad_norm": 0.15580247327926605, + "learning_rate": 4.4449715116525696e-05, + "loss": 2.8206, + "step": 28120 + }, + { + "epoch": 1.7456701222918865, + "grad_norm": 0.14735968694733756, + "learning_rate": 4.444612592687791e-05, + "loss": 2.8099, + "step": 28121 + }, + { + "epoch": 1.7457321993916444, + "grad_norm": 0.15279563494802387, + "learning_rate": 4.444253676620602e-05, + "loss": 2.7811, + "step": 28122 + }, + { + "epoch": 1.7457942764914023, + "grad_norm": 0.15710335399864508, + "learning_rate": 4.443894763452878e-05, + "loss": 2.8008, + "step": 28123 + }, + { + "epoch": 1.7458563535911602, + "grad_norm": 0.15854964747081435, + "learning_rate": 4.443535853186489e-05, + "loss": 2.6884, + "step": 28124 + }, + { + "epoch": 1.7459184306909181, + "grad_norm": 0.20516676122637145, + "learning_rate": 4.443176945823309e-05, + "loss": 2.7068, + "step": 28125 + }, + { + "epoch": 1.745980507790676, + "grad_norm": 0.16573454679902988, + "learning_rate": 4.44281804136521e-05, + "loss": 2.841, + "step": 28126 + }, + { + "epoch": 1.746042584890434, + "grad_norm": 0.15542182190141454, + "learning_rate": 4.442459139814064e-05, + "loss": 2.7938, + "step": 28127 + }, + { + "epoch": 1.7461046619901919, + "grad_norm": 0.16359391165090478, + "learning_rate": 4.442100241171744e-05, + "loss": 2.7297, + "step": 28128 + }, + { + "epoch": 1.7461667390899498, + "grad_norm": 0.16138885269499303, + "learning_rate": 4.441741345440123e-05, + "loss": 2.757, + "step": 28129 + }, + { + "epoch": 1.7462288161897077, + "grad_norm": 0.16467000111911687, + "learning_rate": 4.4413824526210725e-05, + "loss": 2.7566, + "step": 28130 + }, + { + "epoch": 1.7462908932894656, + "grad_norm": 0.1596941095354172, + "learning_rate": 4.441023562716465e-05, + "loss": 2.8313, + "step": 28131 + }, + { + "epoch": 1.7463529703892235, + "grad_norm": 0.1467994163195761, + "learning_rate": 4.4406646757281745e-05, + "loss": 2.7877, + "step": 28132 + }, + { + "epoch": 1.7464150474889815, + "grad_norm": 0.1631869271815111, + "learning_rate": 4.440305791658071e-05, + "loss": 2.7814, + "step": 28133 + }, + { + "epoch": 1.7464771245887392, + "grad_norm": 0.1507280515303506, + "learning_rate": 4.439946910508029e-05, + "loss": 2.764, + "step": 28134 + }, + { + "epoch": 1.746539201688497, + "grad_norm": 0.17749878706651964, + "learning_rate": 4.4395880322799186e-05, + "loss": 2.8037, + "step": 28135 + }, + { + "epoch": 1.746601278788255, + "grad_norm": 0.18237681557311797, + "learning_rate": 4.439229156975613e-05, + "loss": 2.7702, + "step": 28136 + }, + { + "epoch": 1.746663355888013, + "grad_norm": 0.20484497804659157, + "learning_rate": 4.438870284596986e-05, + "loss": 2.7725, + "step": 28137 + }, + { + "epoch": 1.7467254329877708, + "grad_norm": 0.14567317782724426, + "learning_rate": 4.438511415145911e-05, + "loss": 2.7457, + "step": 28138 + }, + { + "epoch": 1.7467875100875287, + "grad_norm": 0.15011373023072933, + "learning_rate": 4.4381525486242554e-05, + "loss": 2.7608, + "step": 28139 + }, + { + "epoch": 1.7468495871872864, + "grad_norm": 0.16170159147521243, + "learning_rate": 4.437793685033896e-05, + "loss": 2.8173, + "step": 28140 + }, + { + "epoch": 1.7469116642870444, + "grad_norm": 0.19356857888946208, + "learning_rate": 4.437434824376702e-05, + "loss": 2.7996, + "step": 28141 + }, + { + "epoch": 1.7469737413868023, + "grad_norm": 0.16622521689877448, + "learning_rate": 4.437075966654549e-05, + "loss": 2.8023, + "step": 28142 + }, + { + "epoch": 1.7470358184865602, + "grad_norm": 0.14171924962141802, + "learning_rate": 4.436717111869306e-05, + "loss": 2.8729, + "step": 28143 + }, + { + "epoch": 1.7470978955863181, + "grad_norm": 0.16135240099683557, + "learning_rate": 4.436358260022847e-05, + "loss": 2.8314, + "step": 28144 + }, + { + "epoch": 1.747159972686076, + "grad_norm": 0.15012957469964175, + "learning_rate": 4.435999411117043e-05, + "loss": 2.7797, + "step": 28145 + }, + { + "epoch": 1.747222049785834, + "grad_norm": 0.16703347482082764, + "learning_rate": 4.4356405651537683e-05, + "loss": 2.8018, + "step": 28146 + }, + { + "epoch": 1.7472841268855919, + "grad_norm": 0.17734346263710973, + "learning_rate": 4.435281722134893e-05, + "loss": 2.7828, + "step": 28147 + }, + { + "epoch": 1.7473462039853498, + "grad_norm": 0.18084524194191967, + "learning_rate": 4.434922882062291e-05, + "loss": 2.785, + "step": 28148 + }, + { + "epoch": 1.7474082810851077, + "grad_norm": 0.16539050947788045, + "learning_rate": 4.434564044937833e-05, + "loss": 2.729, + "step": 28149 + }, + { + "epoch": 1.7474703581848656, + "grad_norm": 0.15315250154627655, + "learning_rate": 4.4342052107633905e-05, + "loss": 2.7325, + "step": 28150 + }, + { + "epoch": 1.7475324352846235, + "grad_norm": 0.1526948355693888, + "learning_rate": 4.433846379540838e-05, + "loss": 2.844, + "step": 28151 + }, + { + "epoch": 1.7475945123843815, + "grad_norm": 0.1572731967738669, + "learning_rate": 4.4334875512720454e-05, + "loss": 2.6562, + "step": 28152 + }, + { + "epoch": 1.7476565894841394, + "grad_norm": 0.16494619702461696, + "learning_rate": 4.433128725958887e-05, + "loss": 2.6942, + "step": 28153 + }, + { + "epoch": 1.7477186665838973, + "grad_norm": 0.15157841741409622, + "learning_rate": 4.4327699036032324e-05, + "loss": 2.797, + "step": 28154 + }, + { + "epoch": 1.7477807436836552, + "grad_norm": 0.14066392968999666, + "learning_rate": 4.432411084206956e-05, + "loss": 2.7633, + "step": 28155 + }, + { + "epoch": 1.7478428207834131, + "grad_norm": 0.15309984915350291, + "learning_rate": 4.432052267771928e-05, + "loss": 2.876, + "step": 28156 + }, + { + "epoch": 1.747904897883171, + "grad_norm": 0.15388214532529657, + "learning_rate": 4.4316934543000224e-05, + "loss": 2.6926, + "step": 28157 + }, + { + "epoch": 1.7479669749829287, + "grad_norm": 0.1673193788012128, + "learning_rate": 4.431334643793108e-05, + "loss": 2.7565, + "step": 28158 + }, + { + "epoch": 1.7480290520826867, + "grad_norm": 0.15353226438040102, + "learning_rate": 4.430975836253061e-05, + "loss": 2.7572, + "step": 28159 + }, + { + "epoch": 1.7480911291824446, + "grad_norm": 0.14445300174953388, + "learning_rate": 4.430617031681749e-05, + "loss": 2.732, + "step": 28160 + }, + { + "epoch": 1.7481532062822025, + "grad_norm": 0.15481668145235153, + "learning_rate": 4.430258230081048e-05, + "loss": 2.7883, + "step": 28161 + }, + { + "epoch": 1.7482152833819604, + "grad_norm": 0.1390151556731057, + "learning_rate": 4.429899431452828e-05, + "loss": 2.7418, + "step": 28162 + }, + { + "epoch": 1.7482773604817183, + "grad_norm": 0.14912151543957955, + "learning_rate": 4.429540635798959e-05, + "loss": 2.7776, + "step": 28163 + }, + { + "epoch": 1.748339437581476, + "grad_norm": 0.1854756971663399, + "learning_rate": 4.4291818431213175e-05, + "loss": 2.8558, + "step": 28164 + }, + { + "epoch": 1.748401514681234, + "grad_norm": 0.17622105362440574, + "learning_rate": 4.428823053421771e-05, + "loss": 2.7191, + "step": 28165 + }, + { + "epoch": 1.7484635917809919, + "grad_norm": 0.15771620782315426, + "learning_rate": 4.428464266702195e-05, + "loss": 2.7955, + "step": 28166 + }, + { + "epoch": 1.7485256688807498, + "grad_norm": 0.16907588235030355, + "learning_rate": 4.428105482964458e-05, + "loss": 2.7755, + "step": 28167 + }, + { + "epoch": 1.7485877459805077, + "grad_norm": 0.155915779051564, + "learning_rate": 4.427746702210435e-05, + "loss": 2.8203, + "step": 28168 + }, + { + "epoch": 1.7486498230802656, + "grad_norm": 0.14843595027114603, + "learning_rate": 4.427387924441994e-05, + "loss": 2.8267, + "step": 28169 + }, + { + "epoch": 1.7487119001800235, + "grad_norm": 0.15237613480484116, + "learning_rate": 4.427029149661011e-05, + "loss": 2.7034, + "step": 28170 + }, + { + "epoch": 1.7487739772797815, + "grad_norm": 0.20008802562975123, + "learning_rate": 4.426670377869356e-05, + "loss": 2.8595, + "step": 28171 + }, + { + "epoch": 1.7488360543795394, + "grad_norm": 0.1515720457826648, + "learning_rate": 4.4263116090689e-05, + "loss": 2.7878, + "step": 28172 + }, + { + "epoch": 1.7488981314792973, + "grad_norm": 0.20149922819439384, + "learning_rate": 4.4259528432615175e-05, + "loss": 2.727, + "step": 28173 + }, + { + "epoch": 1.7489602085790552, + "grad_norm": 0.17078147155244772, + "learning_rate": 4.425594080449077e-05, + "loss": 2.7606, + "step": 28174 + }, + { + "epoch": 1.7490222856788131, + "grad_norm": 0.18750277159549741, + "learning_rate": 4.425235320633451e-05, + "loss": 2.7409, + "step": 28175 + }, + { + "epoch": 1.749084362778571, + "grad_norm": 0.17199231067404408, + "learning_rate": 4.424876563816513e-05, + "loss": 2.6905, + "step": 28176 + }, + { + "epoch": 1.749146439878329, + "grad_norm": 0.15625928612771306, + "learning_rate": 4.4245178100001334e-05, + "loss": 2.7804, + "step": 28177 + }, + { + "epoch": 1.7492085169780869, + "grad_norm": 0.1533360199869997, + "learning_rate": 4.4241590591861845e-05, + "loss": 2.7887, + "step": 28178 + }, + { + "epoch": 1.7492705940778448, + "grad_norm": 0.15761838981046952, + "learning_rate": 4.423800311376536e-05, + "loss": 2.8616, + "step": 28179 + }, + { + "epoch": 1.7493326711776027, + "grad_norm": 0.18356248914103596, + "learning_rate": 4.423441566573063e-05, + "loss": 2.7399, + "step": 28180 + }, + { + "epoch": 1.7493947482773606, + "grad_norm": 0.1647476015321558, + "learning_rate": 4.423082824777634e-05, + "loss": 2.8379, + "step": 28181 + }, + { + "epoch": 1.7494568253771183, + "grad_norm": 0.15498419278243059, + "learning_rate": 4.422724085992123e-05, + "loss": 2.805, + "step": 28182 + }, + { + "epoch": 1.7495189024768762, + "grad_norm": 0.1616870883686472, + "learning_rate": 4.4223653502183994e-05, + "loss": 2.7396, + "step": 28183 + }, + { + "epoch": 1.7495809795766342, + "grad_norm": 0.15457528986230407, + "learning_rate": 4.422006617458338e-05, + "loss": 2.7843, + "step": 28184 + }, + { + "epoch": 1.749643056676392, + "grad_norm": 0.15450525698737927, + "learning_rate": 4.421647887713807e-05, + "loss": 2.8911, + "step": 28185 + }, + { + "epoch": 1.74970513377615, + "grad_norm": 0.15437602777370613, + "learning_rate": 4.4212891609866785e-05, + "loss": 2.7553, + "step": 28186 + }, + { + "epoch": 1.749767210875908, + "grad_norm": 0.1501575248814982, + "learning_rate": 4.420930437278826e-05, + "loss": 2.7442, + "step": 28187 + }, + { + "epoch": 1.7498292879756656, + "grad_norm": 0.16764868137913044, + "learning_rate": 4.42057171659212e-05, + "loss": 2.7592, + "step": 28188 + }, + { + "epoch": 1.7498913650754235, + "grad_norm": 0.1573868284188867, + "learning_rate": 4.4202129989284325e-05, + "loss": 2.7615, + "step": 28189 + }, + { + "epoch": 1.7499534421751815, + "grad_norm": 0.16378836776509637, + "learning_rate": 4.419854284289633e-05, + "loss": 2.7188, + "step": 28190 + }, + { + "epoch": 1.7500155192749394, + "grad_norm": 0.19283085656849086, + "learning_rate": 4.419495572677596e-05, + "loss": 2.7713, + "step": 28191 + }, + { + "epoch": 1.7500775963746973, + "grad_norm": 0.14619087698209332, + "learning_rate": 4.419136864094191e-05, + "loss": 2.7231, + "step": 28192 + }, + { + "epoch": 1.7501396734744552, + "grad_norm": 0.18001888649589015, + "learning_rate": 4.4187781585412895e-05, + "loss": 2.8135, + "step": 28193 + }, + { + "epoch": 1.7502017505742131, + "grad_norm": 0.1519066334970511, + "learning_rate": 4.4184194560207635e-05, + "loss": 2.6961, + "step": 28194 + }, + { + "epoch": 1.750263827673971, + "grad_norm": 0.16518506809202443, + "learning_rate": 4.418060756534485e-05, + "loss": 2.8756, + "step": 28195 + }, + { + "epoch": 1.750325904773729, + "grad_norm": 0.18465429746635212, + "learning_rate": 4.417702060084324e-05, + "loss": 2.7717, + "step": 28196 + }, + { + "epoch": 1.7503879818734869, + "grad_norm": 0.1466988074873735, + "learning_rate": 4.4173433666721544e-05, + "loss": 2.8311, + "step": 28197 + }, + { + "epoch": 1.7504500589732448, + "grad_norm": 0.14355399332722057, + "learning_rate": 4.416984676299845e-05, + "loss": 2.7365, + "step": 28198 + }, + { + "epoch": 1.7505121360730027, + "grad_norm": 0.1504643680552413, + "learning_rate": 4.4166259889692676e-05, + "loss": 2.8102, + "step": 28199 + }, + { + "epoch": 1.7505742131727606, + "grad_norm": 0.14781940543877942, + "learning_rate": 4.416267304682295e-05, + "loss": 2.7136, + "step": 28200 + }, + { + "epoch": 1.7506362902725185, + "grad_norm": 0.1526303346730258, + "learning_rate": 4.4159086234407956e-05, + "loss": 2.7683, + "step": 28201 + }, + { + "epoch": 1.7506983673722765, + "grad_norm": 0.13825050998329827, + "learning_rate": 4.415549945246642e-05, + "loss": 2.7407, + "step": 28202 + }, + { + "epoch": 1.7507604444720344, + "grad_norm": 0.14824425516206916, + "learning_rate": 4.415191270101709e-05, + "loss": 2.7095, + "step": 28203 + }, + { + "epoch": 1.7508225215717923, + "grad_norm": 0.15685014505986142, + "learning_rate": 4.4148325980078635e-05, + "loss": 2.8442, + "step": 28204 + }, + { + "epoch": 1.7508845986715502, + "grad_norm": 0.14805167021823826, + "learning_rate": 4.41447392896698e-05, + "loss": 2.8201, + "step": 28205 + }, + { + "epoch": 1.750946675771308, + "grad_norm": 0.14540621391573044, + "learning_rate": 4.4141152629809266e-05, + "loss": 2.7319, + "step": 28206 + }, + { + "epoch": 1.7510087528710658, + "grad_norm": 0.1477922481853533, + "learning_rate": 4.413756600051577e-05, + "loss": 2.7124, + "step": 28207 + }, + { + "epoch": 1.7510708299708238, + "grad_norm": 0.1495135044960504, + "learning_rate": 4.413397940180801e-05, + "loss": 2.7936, + "step": 28208 + }, + { + "epoch": 1.7511329070705817, + "grad_norm": 0.15074266992646476, + "learning_rate": 4.413039283370471e-05, + "loss": 2.7981, + "step": 28209 + }, + { + "epoch": 1.7511949841703396, + "grad_norm": 0.13993625059034323, + "learning_rate": 4.412680629622458e-05, + "loss": 2.7688, + "step": 28210 + }, + { + "epoch": 1.7512570612700975, + "grad_norm": 0.16091044717094297, + "learning_rate": 4.4123219789386314e-05, + "loss": 2.8505, + "step": 28211 + }, + { + "epoch": 1.7513191383698552, + "grad_norm": 0.16976072079613963, + "learning_rate": 4.411963331320865e-05, + "loss": 2.7462, + "step": 28212 + }, + { + "epoch": 1.7513812154696131, + "grad_norm": 0.1470666882396413, + "learning_rate": 4.411604686771027e-05, + "loss": 2.7363, + "step": 28213 + }, + { + "epoch": 1.751443292569371, + "grad_norm": 0.16546912280608947, + "learning_rate": 4.411246045290992e-05, + "loss": 2.7409, + "step": 28214 + }, + { + "epoch": 1.751505369669129, + "grad_norm": 0.15603549060983868, + "learning_rate": 4.4108874068826275e-05, + "loss": 2.8311, + "step": 28215 + }, + { + "epoch": 1.7515674467688869, + "grad_norm": 0.16474783895910217, + "learning_rate": 4.410528771547808e-05, + "loss": 2.7561, + "step": 28216 + }, + { + "epoch": 1.7516295238686448, + "grad_norm": 0.15677690475742673, + "learning_rate": 4.410170139288402e-05, + "loss": 2.7853, + "step": 28217 + }, + { + "epoch": 1.7516916009684027, + "grad_norm": 0.1604829398700055, + "learning_rate": 4.409811510106282e-05, + "loss": 2.7499, + "step": 28218 + }, + { + "epoch": 1.7517536780681606, + "grad_norm": 0.1752060060841707, + "learning_rate": 4.409452884003318e-05, + "loss": 2.7578, + "step": 28219 + }, + { + "epoch": 1.7518157551679185, + "grad_norm": 0.19483672367204863, + "learning_rate": 4.409094260981383e-05, + "loss": 2.7983, + "step": 28220 + }, + { + "epoch": 1.7518778322676765, + "grad_norm": 0.158088678006793, + "learning_rate": 4.408735641042345e-05, + "loss": 2.832, + "step": 28221 + }, + { + "epoch": 1.7519399093674344, + "grad_norm": 0.16264703361718538, + "learning_rate": 4.4083770241880786e-05, + "loss": 2.7643, + "step": 28222 + }, + { + "epoch": 1.7520019864671923, + "grad_norm": 0.1519119243613588, + "learning_rate": 4.408018410420452e-05, + "loss": 2.7094, + "step": 28223 + }, + { + "epoch": 1.7520640635669502, + "grad_norm": 0.14491845792664534, + "learning_rate": 4.407659799741336e-05, + "loss": 2.8225, + "step": 28224 + }, + { + "epoch": 1.7521261406667081, + "grad_norm": 0.14772483756569185, + "learning_rate": 4.4073011921526046e-05, + "loss": 2.6757, + "step": 28225 + }, + { + "epoch": 1.752188217766466, + "grad_norm": 0.15020340871630347, + "learning_rate": 4.406942587656125e-05, + "loss": 2.8187, + "step": 28226 + }, + { + "epoch": 1.752250294866224, + "grad_norm": 0.14596202542155234, + "learning_rate": 4.406583986253771e-05, + "loss": 2.7424, + "step": 28227 + }, + { + "epoch": 1.7523123719659819, + "grad_norm": 0.15831688923103107, + "learning_rate": 4.406225387947411e-05, + "loss": 2.7347, + "step": 28228 + }, + { + "epoch": 1.7523744490657398, + "grad_norm": 0.14786102637536108, + "learning_rate": 4.4058667927389196e-05, + "loss": 2.7193, + "step": 28229 + }, + { + "epoch": 1.7524365261654975, + "grad_norm": 0.1519795980658715, + "learning_rate": 4.4055082006301636e-05, + "loss": 2.7737, + "step": 28230 + }, + { + "epoch": 1.7524986032652554, + "grad_norm": 0.15740756757857655, + "learning_rate": 4.4051496116230165e-05, + "loss": 2.7958, + "step": 28231 + }, + { + "epoch": 1.7525606803650133, + "grad_norm": 0.14494274174257854, + "learning_rate": 4.4047910257193467e-05, + "loss": 2.7756, + "step": 28232 + }, + { + "epoch": 1.7526227574647713, + "grad_norm": 0.15057056981895406, + "learning_rate": 4.404432442921029e-05, + "loss": 2.9065, + "step": 28233 + }, + { + "epoch": 1.7526848345645292, + "grad_norm": 0.16362455931408187, + "learning_rate": 4.404073863229929e-05, + "loss": 2.7657, + "step": 28234 + }, + { + "epoch": 1.752746911664287, + "grad_norm": 0.15273434134090277, + "learning_rate": 4.403715286647922e-05, + "loss": 2.7515, + "step": 28235 + }, + { + "epoch": 1.7528089887640448, + "grad_norm": 0.15686685274062684, + "learning_rate": 4.403356713176876e-05, + "loss": 2.8068, + "step": 28236 + }, + { + "epoch": 1.7528710658638027, + "grad_norm": 0.15107038378394494, + "learning_rate": 4.402998142818665e-05, + "loss": 2.7399, + "step": 28237 + }, + { + "epoch": 1.7529331429635606, + "grad_norm": 0.17770510212381924, + "learning_rate": 4.4026395755751556e-05, + "loss": 2.7623, + "step": 28238 + }, + { + "epoch": 1.7529952200633185, + "grad_norm": 0.1463179278448509, + "learning_rate": 4.4022810114482224e-05, + "loss": 2.7692, + "step": 28239 + }, + { + "epoch": 1.7530572971630765, + "grad_norm": 0.15603684548946103, + "learning_rate": 4.4019224504397325e-05, + "loss": 2.8247, + "step": 28240 + }, + { + "epoch": 1.7531193742628344, + "grad_norm": 0.1442357341358085, + "learning_rate": 4.40156389255156e-05, + "loss": 2.7636, + "step": 28241 + }, + { + "epoch": 1.7531814513625923, + "grad_norm": 0.16283441895752954, + "learning_rate": 4.401205337785573e-05, + "loss": 2.7606, + "step": 28242 + }, + { + "epoch": 1.7532435284623502, + "grad_norm": 0.14130661599884897, + "learning_rate": 4.400846786143644e-05, + "loss": 2.7509, + "step": 28243 + }, + { + "epoch": 1.7533056055621081, + "grad_norm": 0.13810397439050268, + "learning_rate": 4.400488237627641e-05, + "loss": 2.7394, + "step": 28244 + }, + { + "epoch": 1.753367682661866, + "grad_norm": 0.14613756450971094, + "learning_rate": 4.400129692239438e-05, + "loss": 2.7875, + "step": 28245 + }, + { + "epoch": 1.753429759761624, + "grad_norm": 0.15514798764705726, + "learning_rate": 4.399771149980903e-05, + "loss": 2.8007, + "step": 28246 + }, + { + "epoch": 1.7534918368613819, + "grad_norm": 0.1482571522694539, + "learning_rate": 4.3994126108539084e-05, + "loss": 2.7253, + "step": 28247 + }, + { + "epoch": 1.7535539139611398, + "grad_norm": 0.152811286664019, + "learning_rate": 4.399054074860324e-05, + "loss": 2.7973, + "step": 28248 + }, + { + "epoch": 1.7536159910608977, + "grad_norm": 0.1528678096536915, + "learning_rate": 4.39869554200202e-05, + "loss": 2.7377, + "step": 28249 + }, + { + "epoch": 1.7536780681606556, + "grad_norm": 0.14708541270799808, + "learning_rate": 4.398337012280868e-05, + "loss": 2.845, + "step": 28250 + }, + { + "epoch": 1.7537401452604136, + "grad_norm": 0.1413599847485482, + "learning_rate": 4.397978485698736e-05, + "loss": 2.7705, + "step": 28251 + }, + { + "epoch": 1.7538022223601715, + "grad_norm": 0.155881160274795, + "learning_rate": 4.397619962257498e-05, + "loss": 2.8122, + "step": 28252 + }, + { + "epoch": 1.7538642994599294, + "grad_norm": 0.14062877491016929, + "learning_rate": 4.397261441959022e-05, + "loss": 2.7382, + "step": 28253 + }, + { + "epoch": 1.753926376559687, + "grad_norm": 0.1510850491922958, + "learning_rate": 4.396902924805181e-05, + "loss": 2.8116, + "step": 28254 + }, + { + "epoch": 1.753988453659445, + "grad_norm": 0.15239827696348404, + "learning_rate": 4.396544410797842e-05, + "loss": 2.6905, + "step": 28255 + }, + { + "epoch": 1.754050530759203, + "grad_norm": 0.17755268577621933, + "learning_rate": 4.3961858999388786e-05, + "loss": 2.7828, + "step": 28256 + }, + { + "epoch": 1.7541126078589608, + "grad_norm": 0.14047553145786207, + "learning_rate": 4.3958273922301584e-05, + "loss": 2.7703, + "step": 28257 + }, + { + "epoch": 1.7541746849587188, + "grad_norm": 0.15100945685588327, + "learning_rate": 4.395468887673555e-05, + "loss": 2.7727, + "step": 28258 + }, + { + "epoch": 1.7542367620584767, + "grad_norm": 0.14264257489786625, + "learning_rate": 4.3951103862709374e-05, + "loss": 2.7251, + "step": 28259 + }, + { + "epoch": 1.7542988391582344, + "grad_norm": 0.14806991790363058, + "learning_rate": 4.394751888024174e-05, + "loss": 2.7833, + "step": 28260 + }, + { + "epoch": 1.7543609162579923, + "grad_norm": 0.15367583948984023, + "learning_rate": 4.3943933929351385e-05, + "loss": 2.7832, + "step": 28261 + }, + { + "epoch": 1.7544229933577502, + "grad_norm": 0.1423581987725331, + "learning_rate": 4.394034901005699e-05, + "loss": 2.6864, + "step": 28262 + }, + { + "epoch": 1.7544850704575081, + "grad_norm": 0.150038498914912, + "learning_rate": 4.3936764122377264e-05, + "loss": 2.7213, + "step": 28263 + }, + { + "epoch": 1.754547147557266, + "grad_norm": 0.14652553615831573, + "learning_rate": 4.393317926633091e-05, + "loss": 2.789, + "step": 28264 + }, + { + "epoch": 1.754609224657024, + "grad_norm": 0.14470960592377966, + "learning_rate": 4.392959444193664e-05, + "loss": 2.7915, + "step": 28265 + }, + { + "epoch": 1.7546713017567819, + "grad_norm": 0.1498566473978239, + "learning_rate": 4.392600964921314e-05, + "loss": 2.7771, + "step": 28266 + }, + { + "epoch": 1.7547333788565398, + "grad_norm": 0.19842051485915552, + "learning_rate": 4.392242488817911e-05, + "loss": 2.824, + "step": 28267 + }, + { + "epoch": 1.7547954559562977, + "grad_norm": 0.16223719739511716, + "learning_rate": 4.3918840158853284e-05, + "loss": 2.8529, + "step": 28268 + }, + { + "epoch": 1.7548575330560556, + "grad_norm": 0.14578715016035984, + "learning_rate": 4.391525546125433e-05, + "loss": 2.6643, + "step": 28269 + }, + { + "epoch": 1.7549196101558135, + "grad_norm": 0.17700899307457338, + "learning_rate": 4.391167079540099e-05, + "loss": 2.8577, + "step": 28270 + }, + { + "epoch": 1.7549816872555715, + "grad_norm": 0.14640830289934217, + "learning_rate": 4.3908086161311925e-05, + "loss": 2.8096, + "step": 28271 + }, + { + "epoch": 1.7550437643553294, + "grad_norm": 0.1600338945306235, + "learning_rate": 4.390450155900586e-05, + "loss": 2.7897, + "step": 28272 + }, + { + "epoch": 1.7551058414550873, + "grad_norm": 0.16885470016551316, + "learning_rate": 4.390091698850149e-05, + "loss": 2.6733, + "step": 28273 + }, + { + "epoch": 1.7551679185548452, + "grad_norm": 0.1554722323099551, + "learning_rate": 4.3897332449817516e-05, + "loss": 2.7386, + "step": 28274 + }, + { + "epoch": 1.7552299956546031, + "grad_norm": 0.17121494412112995, + "learning_rate": 4.389374794297264e-05, + "loss": 2.8769, + "step": 28275 + }, + { + "epoch": 1.755292072754361, + "grad_norm": 0.1431644416273231, + "learning_rate": 4.3890163467985555e-05, + "loss": 2.7947, + "step": 28276 + }, + { + "epoch": 1.755354149854119, + "grad_norm": 0.1575516603532478, + "learning_rate": 4.388657902487498e-05, + "loss": 2.8624, + "step": 28277 + }, + { + "epoch": 1.7554162269538767, + "grad_norm": 0.14246661907860783, + "learning_rate": 4.3882994613659605e-05, + "loss": 2.7775, + "step": 28278 + }, + { + "epoch": 1.7554783040536346, + "grad_norm": 0.15620808039058226, + "learning_rate": 4.387941023435814e-05, + "loss": 2.835, + "step": 28279 + }, + { + "epoch": 1.7555403811533925, + "grad_norm": 0.15363611511164646, + "learning_rate": 4.387582588698926e-05, + "loss": 2.8118, + "step": 28280 + }, + { + "epoch": 1.7556024582531504, + "grad_norm": 0.15456516143529714, + "learning_rate": 4.3872241571571696e-05, + "loss": 2.6864, + "step": 28281 + }, + { + "epoch": 1.7556645353529083, + "grad_norm": 0.1574057511622219, + "learning_rate": 4.386865728812412e-05, + "loss": 2.8253, + "step": 28282 + }, + { + "epoch": 1.7557266124526663, + "grad_norm": 0.13730008645305009, + "learning_rate": 4.3865073036665265e-05, + "loss": 2.7164, + "step": 28283 + }, + { + "epoch": 1.755788689552424, + "grad_norm": 0.148600131588755, + "learning_rate": 4.386148881721382e-05, + "loss": 2.7885, + "step": 28284 + }, + { + "epoch": 1.7558507666521819, + "grad_norm": 0.14209647015723256, + "learning_rate": 4.3857904629788454e-05, + "loss": 2.7077, + "step": 28285 + }, + { + "epoch": 1.7559128437519398, + "grad_norm": 0.141314768573903, + "learning_rate": 4.3854320474407906e-05, + "loss": 2.8331, + "step": 28286 + }, + { + "epoch": 1.7559749208516977, + "grad_norm": 0.20330280078299356, + "learning_rate": 4.385073635109085e-05, + "loss": 2.8059, + "step": 28287 + }, + { + "epoch": 1.7560369979514556, + "grad_norm": 0.18315237843122248, + "learning_rate": 4.3847152259856005e-05, + "loss": 2.8268, + "step": 28288 + }, + { + "epoch": 1.7560990750512135, + "grad_norm": 0.15801568170857852, + "learning_rate": 4.384356820072205e-05, + "loss": 2.8159, + "step": 28289 + }, + { + "epoch": 1.7561611521509715, + "grad_norm": 0.14892387388915296, + "learning_rate": 4.3839984173707704e-05, + "loss": 2.659, + "step": 28290 + }, + { + "epoch": 1.7562232292507294, + "grad_norm": 0.15720658145725977, + "learning_rate": 4.3836400178831646e-05, + "loss": 2.8141, + "step": 28291 + }, + { + "epoch": 1.7562853063504873, + "grad_norm": 0.1588426602735407, + "learning_rate": 4.38328162161126e-05, + "loss": 2.8105, + "step": 28292 + }, + { + "epoch": 1.7563473834502452, + "grad_norm": 0.14572913983335467, + "learning_rate": 4.3829232285569235e-05, + "loss": 2.7408, + "step": 28293 + }, + { + "epoch": 1.7564094605500031, + "grad_norm": 0.15438127020407225, + "learning_rate": 4.382564838722027e-05, + "loss": 2.7912, + "step": 28294 + }, + { + "epoch": 1.756471537649761, + "grad_norm": 0.15365869998937617, + "learning_rate": 4.38220645210844e-05, + "loss": 2.7636, + "step": 28295 + }, + { + "epoch": 1.756533614749519, + "grad_norm": 0.14836595090823101, + "learning_rate": 4.38184806871803e-05, + "loss": 2.6955, + "step": 28296 + }, + { + "epoch": 1.7565956918492769, + "grad_norm": 0.14956893249775716, + "learning_rate": 4.381489688552671e-05, + "loss": 2.7167, + "step": 28297 + }, + { + "epoch": 1.7566577689490348, + "grad_norm": 0.1487239971493703, + "learning_rate": 4.3811313116142285e-05, + "loss": 2.7523, + "step": 28298 + }, + { + "epoch": 1.7567198460487927, + "grad_norm": 0.17120182356009442, + "learning_rate": 4.3807729379045757e-05, + "loss": 2.7579, + "step": 28299 + }, + { + "epoch": 1.7567819231485506, + "grad_norm": 0.14925027109925013, + "learning_rate": 4.380414567425579e-05, + "loss": 2.7396, + "step": 28300 + }, + { + "epoch": 1.7568440002483086, + "grad_norm": 0.16530233185149462, + "learning_rate": 4.38005620017911e-05, + "loss": 2.8215, + "step": 28301 + }, + { + "epoch": 1.7569060773480663, + "grad_norm": 0.15204418513520024, + "learning_rate": 4.3796978361670404e-05, + "loss": 2.822, + "step": 28302 + }, + { + "epoch": 1.7569681544478242, + "grad_norm": 0.1562327375412971, + "learning_rate": 4.379339475391236e-05, + "loss": 2.8167, + "step": 28303 + }, + { + "epoch": 1.757030231547582, + "grad_norm": 0.16458737903060924, + "learning_rate": 4.3789811178535696e-05, + "loss": 2.7812, + "step": 28304 + }, + { + "epoch": 1.75709230864734, + "grad_norm": 0.15140785163292084, + "learning_rate": 4.378622763555909e-05, + "loss": 2.7001, + "step": 28305 + }, + { + "epoch": 1.757154385747098, + "grad_norm": 0.1571755296801984, + "learning_rate": 4.3782644125001236e-05, + "loss": 2.7436, + "step": 28306 + }, + { + "epoch": 1.7572164628468558, + "grad_norm": 0.14384080309835803, + "learning_rate": 4.3779060646880844e-05, + "loss": 2.7484, + "step": 28307 + }, + { + "epoch": 1.7572785399466135, + "grad_norm": 0.15318669127426301, + "learning_rate": 4.3775477201216605e-05, + "loss": 2.6593, + "step": 28308 + }, + { + "epoch": 1.7573406170463715, + "grad_norm": 0.15641188579927423, + "learning_rate": 4.377189378802721e-05, + "loss": 2.7075, + "step": 28309 + }, + { + "epoch": 1.7574026941461294, + "grad_norm": 0.1549226108423899, + "learning_rate": 4.3768310407331354e-05, + "loss": 2.7786, + "step": 28310 + }, + { + "epoch": 1.7574647712458873, + "grad_norm": 0.1516480086669707, + "learning_rate": 4.376472705914774e-05, + "loss": 2.7318, + "step": 28311 + }, + { + "epoch": 1.7575268483456452, + "grad_norm": 0.16512082123215582, + "learning_rate": 4.3761143743495056e-05, + "loss": 2.8037, + "step": 28312 + }, + { + "epoch": 1.7575889254454031, + "grad_norm": 0.14956006539790673, + "learning_rate": 4.3757560460392e-05, + "loss": 2.7703, + "step": 28313 + }, + { + "epoch": 1.757651002545161, + "grad_norm": 0.14790885047230434, + "learning_rate": 4.375397720985726e-05, + "loss": 2.7945, + "step": 28314 + }, + { + "epoch": 1.757713079644919, + "grad_norm": 0.15092952582815028, + "learning_rate": 4.375039399190955e-05, + "loss": 2.8538, + "step": 28315 + }, + { + "epoch": 1.7577751567446769, + "grad_norm": 0.15901171709243828, + "learning_rate": 4.374681080656754e-05, + "loss": 2.8339, + "step": 28316 + }, + { + "epoch": 1.7578372338444348, + "grad_norm": 0.18825737655417568, + "learning_rate": 4.3743227653849945e-05, + "loss": 2.886, + "step": 28317 + }, + { + "epoch": 1.7578993109441927, + "grad_norm": 0.15167653866706265, + "learning_rate": 4.3739644533775434e-05, + "loss": 2.751, + "step": 28318 + }, + { + "epoch": 1.7579613880439506, + "grad_norm": 0.13720140411675735, + "learning_rate": 4.3736061446362744e-05, + "loss": 2.7358, + "step": 28319 + }, + { + "epoch": 1.7580234651437086, + "grad_norm": 0.1510318425425598, + "learning_rate": 4.373247839163053e-05, + "loss": 2.7814, + "step": 28320 + }, + { + "epoch": 1.7580855422434665, + "grad_norm": 0.14646308108371833, + "learning_rate": 4.372889536959749e-05, + "loss": 2.724, + "step": 28321 + }, + { + "epoch": 1.7581476193432244, + "grad_norm": 0.15036246420033517, + "learning_rate": 4.372531238028233e-05, + "loss": 2.8956, + "step": 28322 + }, + { + "epoch": 1.7582096964429823, + "grad_norm": 0.1487509029024434, + "learning_rate": 4.372172942370374e-05, + "loss": 2.6696, + "step": 28323 + }, + { + "epoch": 1.7582717735427402, + "grad_norm": 0.16916640373755287, + "learning_rate": 4.3718146499880414e-05, + "loss": 2.7726, + "step": 28324 + }, + { + "epoch": 1.7583338506424981, + "grad_norm": 0.14760756623528903, + "learning_rate": 4.3714563608831035e-05, + "loss": 2.828, + "step": 28325 + }, + { + "epoch": 1.7583959277422558, + "grad_norm": 0.14413643730097017, + "learning_rate": 4.3710980750574306e-05, + "loss": 2.7185, + "step": 28326 + }, + { + "epoch": 1.7584580048420138, + "grad_norm": 0.15425599437887344, + "learning_rate": 4.370739792512892e-05, + "loss": 2.6839, + "step": 28327 + }, + { + "epoch": 1.7585200819417717, + "grad_norm": 0.1389659329316058, + "learning_rate": 4.370381513251357e-05, + "loss": 2.7626, + "step": 28328 + }, + { + "epoch": 1.7585821590415296, + "grad_norm": 0.18318228572594433, + "learning_rate": 4.370023237274694e-05, + "loss": 2.8113, + "step": 28329 + }, + { + "epoch": 1.7586442361412875, + "grad_norm": 0.14325751472030707, + "learning_rate": 4.3696649645847726e-05, + "loss": 2.7166, + "step": 28330 + }, + { + "epoch": 1.7587063132410454, + "grad_norm": 0.1530409168487627, + "learning_rate": 4.3693066951834617e-05, + "loss": 2.7862, + "step": 28331 + }, + { + "epoch": 1.7587683903408031, + "grad_norm": 0.15718909818318402, + "learning_rate": 4.368948429072632e-05, + "loss": 2.8558, + "step": 28332 + }, + { + "epoch": 1.758830467440561, + "grad_norm": 0.16496027245140205, + "learning_rate": 4.36859016625415e-05, + "loss": 2.7234, + "step": 28333 + }, + { + "epoch": 1.758892544540319, + "grad_norm": 0.17198298053413533, + "learning_rate": 4.3682319067298874e-05, + "loss": 2.8381, + "step": 28334 + }, + { + "epoch": 1.7589546216400769, + "grad_norm": 0.14575873891091634, + "learning_rate": 4.3678736505017114e-05, + "loss": 2.7688, + "step": 28335 + }, + { + "epoch": 1.7590166987398348, + "grad_norm": 0.15768454765598106, + "learning_rate": 4.3675153975714934e-05, + "loss": 2.8384, + "step": 28336 + }, + { + "epoch": 1.7590787758395927, + "grad_norm": 0.16376392125147074, + "learning_rate": 4.3671571479410996e-05, + "loss": 2.7797, + "step": 28337 + }, + { + "epoch": 1.7591408529393506, + "grad_norm": 0.16626667786960225, + "learning_rate": 4.3667989016124024e-05, + "loss": 2.8568, + "step": 28338 + }, + { + "epoch": 1.7592029300391085, + "grad_norm": 0.16175044781101092, + "learning_rate": 4.366440658587268e-05, + "loss": 2.7951, + "step": 28339 + }, + { + "epoch": 1.7592650071388665, + "grad_norm": 0.14841481713102478, + "learning_rate": 4.366082418867567e-05, + "loss": 2.75, + "step": 28340 + }, + { + "epoch": 1.7593270842386244, + "grad_norm": 0.14904021572389337, + "learning_rate": 4.365724182455167e-05, + "loss": 2.7957, + "step": 28341 + }, + { + "epoch": 1.7593891613383823, + "grad_norm": 0.14953594597006822, + "learning_rate": 4.3653659493519396e-05, + "loss": 2.7279, + "step": 28342 + }, + { + "epoch": 1.7594512384381402, + "grad_norm": 0.19034785933467002, + "learning_rate": 4.36500771955975e-05, + "loss": 2.7067, + "step": 28343 + }, + { + "epoch": 1.7595133155378981, + "grad_norm": 0.15415266395997457, + "learning_rate": 4.36464949308047e-05, + "loss": 2.6775, + "step": 28344 + }, + { + "epoch": 1.759575392637656, + "grad_norm": 0.219943596358388, + "learning_rate": 4.364291269915969e-05, + "loss": 2.7541, + "step": 28345 + }, + { + "epoch": 1.759637469737414, + "grad_norm": 0.16480211595444497, + "learning_rate": 4.363933050068114e-05, + "loss": 2.7903, + "step": 28346 + }, + { + "epoch": 1.759699546837172, + "grad_norm": 0.15896102888509916, + "learning_rate": 4.363574833538775e-05, + "loss": 2.7757, + "step": 28347 + }, + { + "epoch": 1.7597616239369298, + "grad_norm": 0.17411961386367822, + "learning_rate": 4.36321662032982e-05, + "loss": 2.7486, + "step": 28348 + }, + { + "epoch": 1.7598237010366877, + "grad_norm": 0.1615138029110319, + "learning_rate": 4.362858410443119e-05, + "loss": 2.7896, + "step": 28349 + }, + { + "epoch": 1.7598857781364454, + "grad_norm": 0.1493135313526971, + "learning_rate": 4.3625002038805394e-05, + "loss": 2.8447, + "step": 28350 + }, + { + "epoch": 1.7599478552362033, + "grad_norm": 0.17684130514503807, + "learning_rate": 4.3621420006439514e-05, + "loss": 2.7871, + "step": 28351 + }, + { + "epoch": 1.7600099323359613, + "grad_norm": 0.15707429775029144, + "learning_rate": 4.361783800735224e-05, + "loss": 2.8364, + "step": 28352 + }, + { + "epoch": 1.7600720094357192, + "grad_norm": 0.18355896802913055, + "learning_rate": 4.361425604156225e-05, + "loss": 2.6948, + "step": 28353 + }, + { + "epoch": 1.760134086535477, + "grad_norm": 0.18159545829099416, + "learning_rate": 4.3610674109088236e-05, + "loss": 2.8386, + "step": 28354 + }, + { + "epoch": 1.760196163635235, + "grad_norm": 0.15575490835003925, + "learning_rate": 4.360709220994889e-05, + "loss": 2.7667, + "step": 28355 + }, + { + "epoch": 1.7602582407349927, + "grad_norm": 0.14695173729338165, + "learning_rate": 4.360351034416289e-05, + "loss": 2.7943, + "step": 28356 + }, + { + "epoch": 1.7603203178347506, + "grad_norm": 0.2046429785873963, + "learning_rate": 4.3599928511748934e-05, + "loss": 2.75, + "step": 28357 + }, + { + "epoch": 1.7603823949345085, + "grad_norm": 0.13901519586969158, + "learning_rate": 4.359634671272571e-05, + "loss": 2.75, + "step": 28358 + }, + { + "epoch": 1.7604444720342665, + "grad_norm": 0.16138023684763372, + "learning_rate": 4.3592764947111884e-05, + "loss": 2.7456, + "step": 28359 + }, + { + "epoch": 1.7605065491340244, + "grad_norm": 0.14955149046216706, + "learning_rate": 4.358918321492618e-05, + "loss": 2.7896, + "step": 28360 + }, + { + "epoch": 1.7605686262337823, + "grad_norm": 0.14475979648638498, + "learning_rate": 4.358560151618724e-05, + "loss": 2.771, + "step": 28361 + }, + { + "epoch": 1.7606307033335402, + "grad_norm": 0.1630973754800727, + "learning_rate": 4.358201985091379e-05, + "loss": 2.7889, + "step": 28362 + }, + { + "epoch": 1.7606927804332981, + "grad_norm": 0.1431574352859029, + "learning_rate": 4.357843821912449e-05, + "loss": 2.7988, + "step": 28363 + }, + { + "epoch": 1.760754857533056, + "grad_norm": 0.1474985365269684, + "learning_rate": 4.357485662083805e-05, + "loss": 2.7449, + "step": 28364 + }, + { + "epoch": 1.760816934632814, + "grad_norm": 0.1493107058008911, + "learning_rate": 4.357127505607312e-05, + "loss": 2.8036, + "step": 28365 + }, + { + "epoch": 1.7608790117325719, + "grad_norm": 0.15220922252399824, + "learning_rate": 4.356769352484842e-05, + "loss": 2.8448, + "step": 28366 + }, + { + "epoch": 1.7609410888323298, + "grad_norm": 0.14897338231524432, + "learning_rate": 4.356411202718263e-05, + "loss": 2.7767, + "step": 28367 + }, + { + "epoch": 1.7610031659320877, + "grad_norm": 0.1585065792373632, + "learning_rate": 4.356053056309442e-05, + "loss": 2.6702, + "step": 28368 + }, + { + "epoch": 1.7610652430318456, + "grad_norm": 0.15219946288455485, + "learning_rate": 4.35569491326025e-05, + "loss": 2.8114, + "step": 28369 + }, + { + "epoch": 1.7611273201316036, + "grad_norm": 0.1386783893828104, + "learning_rate": 4.355336773572554e-05, + "loss": 2.7738, + "step": 28370 + }, + { + "epoch": 1.7611893972313615, + "grad_norm": 0.15311899757298283, + "learning_rate": 4.354978637248221e-05, + "loss": 2.7593, + "step": 28371 + }, + { + "epoch": 1.7612514743311194, + "grad_norm": 0.14937197023774673, + "learning_rate": 4.354620504289123e-05, + "loss": 2.7658, + "step": 28372 + }, + { + "epoch": 1.7613135514308773, + "grad_norm": 0.16151162903507102, + "learning_rate": 4.3542623746971246e-05, + "loss": 2.8531, + "step": 28373 + }, + { + "epoch": 1.761375628530635, + "grad_norm": 0.14903714128233692, + "learning_rate": 4.3539042484740977e-05, + "loss": 2.7556, + "step": 28374 + }, + { + "epoch": 1.761437705630393, + "grad_norm": 0.15678490631542077, + "learning_rate": 4.353546125621908e-05, + "loss": 2.7515, + "step": 28375 + }, + { + "epoch": 1.7614997827301508, + "grad_norm": 0.1640912042212186, + "learning_rate": 4.3531880061424255e-05, + "loss": 2.8468, + "step": 28376 + }, + { + "epoch": 1.7615618598299088, + "grad_norm": 0.13726706905180763, + "learning_rate": 4.352829890037518e-05, + "loss": 2.7665, + "step": 28377 + }, + { + "epoch": 1.7616239369296667, + "grad_norm": 0.1523146051258254, + "learning_rate": 4.352471777309055e-05, + "loss": 2.6806, + "step": 28378 + }, + { + "epoch": 1.7616860140294246, + "grad_norm": 0.1477513075434247, + "learning_rate": 4.3521136679589026e-05, + "loss": 2.9003, + "step": 28379 + }, + { + "epoch": 1.7617480911291823, + "grad_norm": 0.1509309388841448, + "learning_rate": 4.3517555619889314e-05, + "loss": 2.8913, + "step": 28380 + }, + { + "epoch": 1.7618101682289402, + "grad_norm": 0.15323384391830333, + "learning_rate": 4.351397459401009e-05, + "loss": 2.8057, + "step": 28381 + }, + { + "epoch": 1.7618722453286981, + "grad_norm": 0.15006565926891163, + "learning_rate": 4.351039360197003e-05, + "loss": 2.7903, + "step": 28382 + }, + { + "epoch": 1.761934322428456, + "grad_norm": 0.14857774308328367, + "learning_rate": 4.350681264378783e-05, + "loss": 2.7602, + "step": 28383 + }, + { + "epoch": 1.761996399528214, + "grad_norm": 0.16497604024567505, + "learning_rate": 4.350323171948215e-05, + "loss": 2.7257, + "step": 28384 + }, + { + "epoch": 1.7620584766279719, + "grad_norm": 0.1659183329308225, + "learning_rate": 4.3499650829071705e-05, + "loss": 2.8078, + "step": 28385 + }, + { + "epoch": 1.7621205537277298, + "grad_norm": 0.1902510367426652, + "learning_rate": 4.3496069972575136e-05, + "loss": 2.8185, + "step": 28386 + }, + { + "epoch": 1.7621826308274877, + "grad_norm": 0.15607931231647537, + "learning_rate": 4.3492489150011175e-05, + "loss": 2.7898, + "step": 28387 + }, + { + "epoch": 1.7622447079272456, + "grad_norm": 0.15297396227223886, + "learning_rate": 4.348890836139845e-05, + "loss": 2.7624, + "step": 28388 + }, + { + "epoch": 1.7623067850270036, + "grad_norm": 0.1549222261037229, + "learning_rate": 4.3485327606755696e-05, + "loss": 2.8066, + "step": 28389 + }, + { + "epoch": 1.7623688621267615, + "grad_norm": 0.1677504107357261, + "learning_rate": 4.3481746886101556e-05, + "loss": 2.8482, + "step": 28390 + }, + { + "epoch": 1.7624309392265194, + "grad_norm": 0.14243837588460656, + "learning_rate": 4.347816619945473e-05, + "loss": 2.7804, + "step": 28391 + }, + { + "epoch": 1.7624930163262773, + "grad_norm": 0.15557985241689437, + "learning_rate": 4.3474585546833884e-05, + "loss": 2.8262, + "step": 28392 + }, + { + "epoch": 1.7625550934260352, + "grad_norm": 0.15464305070483775, + "learning_rate": 4.3471004928257724e-05, + "loss": 2.701, + "step": 28393 + }, + { + "epoch": 1.7626171705257931, + "grad_norm": 0.16749120627812938, + "learning_rate": 4.346742434374492e-05, + "loss": 2.7997, + "step": 28394 + }, + { + "epoch": 1.762679247625551, + "grad_norm": 0.14698529109951183, + "learning_rate": 4.346384379331413e-05, + "loss": 2.8357, + "step": 28395 + }, + { + "epoch": 1.762741324725309, + "grad_norm": 0.15211104691364907, + "learning_rate": 4.346026327698406e-05, + "loss": 2.8537, + "step": 28396 + }, + { + "epoch": 1.762803401825067, + "grad_norm": 0.1724390992435766, + "learning_rate": 4.3456682794773384e-05, + "loss": 2.7061, + "step": 28397 + }, + { + "epoch": 1.7628654789248246, + "grad_norm": 0.15148640521589007, + "learning_rate": 4.345310234670077e-05, + "loss": 2.8322, + "step": 28398 + }, + { + "epoch": 1.7629275560245825, + "grad_norm": 0.15640834890780786, + "learning_rate": 4.344952193278493e-05, + "loss": 2.695, + "step": 28399 + }, + { + "epoch": 1.7629896331243404, + "grad_norm": 0.16008256752473718, + "learning_rate": 4.344594155304451e-05, + "loss": 2.8407, + "step": 28400 + }, + { + "epoch": 1.7630517102240983, + "grad_norm": 0.15589732661720068, + "learning_rate": 4.344236120749822e-05, + "loss": 2.7974, + "step": 28401 + }, + { + "epoch": 1.7631137873238563, + "grad_norm": 0.17662063565669903, + "learning_rate": 4.343878089616471e-05, + "loss": 2.7745, + "step": 28402 + }, + { + "epoch": 1.7631758644236142, + "grad_norm": 0.14824973955380816, + "learning_rate": 4.343520061906269e-05, + "loss": 2.793, + "step": 28403 + }, + { + "epoch": 1.7632379415233719, + "grad_norm": 0.1547157962523285, + "learning_rate": 4.34316203762108e-05, + "loss": 2.7513, + "step": 28404 + }, + { + "epoch": 1.7633000186231298, + "grad_norm": 0.15196269012588953, + "learning_rate": 4.342804016762776e-05, + "loss": 2.8112, + "step": 28405 + }, + { + "epoch": 1.7633620957228877, + "grad_norm": 0.15975368310558452, + "learning_rate": 4.3424459993332226e-05, + "loss": 2.749, + "step": 28406 + }, + { + "epoch": 1.7634241728226456, + "grad_norm": 0.14228602103426757, + "learning_rate": 4.3420879853342874e-05, + "loss": 2.6915, + "step": 28407 + }, + { + "epoch": 1.7634862499224035, + "grad_norm": 0.15537945708682055, + "learning_rate": 4.3417299747678394e-05, + "loss": 2.7972, + "step": 28408 + }, + { + "epoch": 1.7635483270221615, + "grad_norm": 0.1449945537493887, + "learning_rate": 4.3413719676357454e-05, + "loss": 2.7972, + "step": 28409 + }, + { + "epoch": 1.7636104041219194, + "grad_norm": 0.15486891853771032, + "learning_rate": 4.341013963939875e-05, + "loss": 2.7916, + "step": 28410 + }, + { + "epoch": 1.7636724812216773, + "grad_norm": 0.15185853519594097, + "learning_rate": 4.340655963682093e-05, + "loss": 2.7753, + "step": 28411 + }, + { + "epoch": 1.7637345583214352, + "grad_norm": 0.14399373781103253, + "learning_rate": 4.34029796686427e-05, + "loss": 2.7695, + "step": 28412 + }, + { + "epoch": 1.7637966354211931, + "grad_norm": 0.14044151654248554, + "learning_rate": 4.3399399734882716e-05, + "loss": 2.7259, + "step": 28413 + }, + { + "epoch": 1.763858712520951, + "grad_norm": 0.15672803114415007, + "learning_rate": 4.3395819835559683e-05, + "loss": 2.8364, + "step": 28414 + }, + { + "epoch": 1.763920789620709, + "grad_norm": 0.14782218389262508, + "learning_rate": 4.3392239970692244e-05, + "loss": 2.7234, + "step": 28415 + }, + { + "epoch": 1.763982866720467, + "grad_norm": 0.14764598578648883, + "learning_rate": 4.3388660140299106e-05, + "loss": 2.7714, + "step": 28416 + }, + { + "epoch": 1.7640449438202248, + "grad_norm": 0.16415163865648424, + "learning_rate": 4.338508034439892e-05, + "loss": 2.7757, + "step": 28417 + }, + { + "epoch": 1.7641070209199827, + "grad_norm": 0.1426552864200936, + "learning_rate": 4.338150058301039e-05, + "loss": 2.6387, + "step": 28418 + }, + { + "epoch": 1.7641690980197406, + "grad_norm": 0.15295949623476726, + "learning_rate": 4.337792085615218e-05, + "loss": 2.7878, + "step": 28419 + }, + { + "epoch": 1.7642311751194986, + "grad_norm": 0.13795143683037142, + "learning_rate": 4.3374341163842946e-05, + "loss": 2.7999, + "step": 28420 + }, + { + "epoch": 1.7642932522192565, + "grad_norm": 0.15351920378495668, + "learning_rate": 4.33707615061014e-05, + "loss": 2.7843, + "step": 28421 + }, + { + "epoch": 1.7643553293190142, + "grad_norm": 0.15617619798067076, + "learning_rate": 4.3367181882946184e-05, + "loss": 2.7095, + "step": 28422 + }, + { + "epoch": 1.764417406418772, + "grad_norm": 0.1536502992390457, + "learning_rate": 4.3363602294396e-05, + "loss": 2.8386, + "step": 28423 + }, + { + "epoch": 1.76447948351853, + "grad_norm": 0.16202943048586738, + "learning_rate": 4.336002274046951e-05, + "loss": 2.8369, + "step": 28424 + }, + { + "epoch": 1.764541560618288, + "grad_norm": 0.15938589278564017, + "learning_rate": 4.33564432211854e-05, + "loss": 2.8123, + "step": 28425 + }, + { + "epoch": 1.7646036377180458, + "grad_norm": 0.19269694904689544, + "learning_rate": 4.335286373656233e-05, + "loss": 2.7347, + "step": 28426 + }, + { + "epoch": 1.7646657148178038, + "grad_norm": 0.15725456604446172, + "learning_rate": 4.334928428661899e-05, + "loss": 2.8282, + "step": 28427 + }, + { + "epoch": 1.7647277919175615, + "grad_norm": 0.1541109715575992, + "learning_rate": 4.334570487137405e-05, + "loss": 2.8007, + "step": 28428 + }, + { + "epoch": 1.7647898690173194, + "grad_norm": 0.15966880868113367, + "learning_rate": 4.334212549084619e-05, + "loss": 2.8224, + "step": 28429 + }, + { + "epoch": 1.7648519461170773, + "grad_norm": 0.14387134090272993, + "learning_rate": 4.3338546145054065e-05, + "loss": 2.8711, + "step": 28430 + }, + { + "epoch": 1.7649140232168352, + "grad_norm": 0.16441444272762967, + "learning_rate": 4.333496683401635e-05, + "loss": 2.8147, + "step": 28431 + }, + { + "epoch": 1.7649761003165931, + "grad_norm": 0.14113900501255244, + "learning_rate": 4.333138755775174e-05, + "loss": 2.916, + "step": 28432 + }, + { + "epoch": 1.765038177416351, + "grad_norm": 0.14071314751122888, + "learning_rate": 4.332780831627891e-05, + "loss": 2.8857, + "step": 28433 + }, + { + "epoch": 1.765100254516109, + "grad_norm": 0.1539499269271583, + "learning_rate": 4.332422910961651e-05, + "loss": 2.7651, + "step": 28434 + }, + { + "epoch": 1.7651623316158669, + "grad_norm": 0.16019485171255737, + "learning_rate": 4.3320649937783245e-05, + "loss": 2.8453, + "step": 28435 + }, + { + "epoch": 1.7652244087156248, + "grad_norm": 0.1539600171894843, + "learning_rate": 4.331707080079775e-05, + "loss": 2.8351, + "step": 28436 + }, + { + "epoch": 1.7652864858153827, + "grad_norm": 0.1485464862964055, + "learning_rate": 4.3313491698678745e-05, + "loss": 2.8799, + "step": 28437 + }, + { + "epoch": 1.7653485629151406, + "grad_norm": 0.1461016742066791, + "learning_rate": 4.330991263144486e-05, + "loss": 2.7188, + "step": 28438 + }, + { + "epoch": 1.7654106400148986, + "grad_norm": 0.15980276198475435, + "learning_rate": 4.330633359911479e-05, + "loss": 2.733, + "step": 28439 + }, + { + "epoch": 1.7654727171146565, + "grad_norm": 0.15285748278965383, + "learning_rate": 4.33027546017072e-05, + "loss": 2.7864, + "step": 28440 + }, + { + "epoch": 1.7655347942144144, + "grad_norm": 0.16835829564078245, + "learning_rate": 4.329917563924076e-05, + "loss": 2.812, + "step": 28441 + }, + { + "epoch": 1.7655968713141723, + "grad_norm": 0.16016469909897227, + "learning_rate": 4.329559671173415e-05, + "loss": 2.8422, + "step": 28442 + }, + { + "epoch": 1.7656589484139302, + "grad_norm": 0.14603404441737794, + "learning_rate": 4.329201781920605e-05, + "loss": 2.736, + "step": 28443 + }, + { + "epoch": 1.7657210255136881, + "grad_norm": 0.1640357268150028, + "learning_rate": 4.328843896167512e-05, + "loss": 2.7919, + "step": 28444 + }, + { + "epoch": 1.765783102613446, + "grad_norm": 0.15385347347344416, + "learning_rate": 4.328486013916002e-05, + "loss": 2.7387, + "step": 28445 + }, + { + "epoch": 1.7658451797132038, + "grad_norm": 0.15458159795230014, + "learning_rate": 4.328128135167945e-05, + "loss": 2.691, + "step": 28446 + }, + { + "epoch": 1.7659072568129617, + "grad_norm": 0.15274982252321376, + "learning_rate": 4.3277702599252045e-05, + "loss": 2.7996, + "step": 28447 + }, + { + "epoch": 1.7659693339127196, + "grad_norm": 0.17928831707512166, + "learning_rate": 4.327412388189652e-05, + "loss": 2.7262, + "step": 28448 + }, + { + "epoch": 1.7660314110124775, + "grad_norm": 0.15105807212547565, + "learning_rate": 4.3270545199631505e-05, + "loss": 2.7768, + "step": 28449 + }, + { + "epoch": 1.7660934881122354, + "grad_norm": 0.1730811539361077, + "learning_rate": 4.3266966552475706e-05, + "loss": 2.8148, + "step": 28450 + }, + { + "epoch": 1.7661555652119934, + "grad_norm": 0.16143086350985586, + "learning_rate": 4.326338794044777e-05, + "loss": 2.7528, + "step": 28451 + }, + { + "epoch": 1.766217642311751, + "grad_norm": 0.17987005662785815, + "learning_rate": 4.3259809363566375e-05, + "loss": 2.6734, + "step": 28452 + }, + { + "epoch": 1.766279719411509, + "grad_norm": 0.14835025708110702, + "learning_rate": 4.3256230821850186e-05, + "loss": 2.7809, + "step": 28453 + }, + { + "epoch": 1.7663417965112669, + "grad_norm": 0.16445649612487206, + "learning_rate": 4.325265231531788e-05, + "loss": 2.8138, + "step": 28454 + }, + { + "epoch": 1.7664038736110248, + "grad_norm": 0.1482309862410741, + "learning_rate": 4.324907384398813e-05, + "loss": 2.8581, + "step": 28455 + }, + { + "epoch": 1.7664659507107827, + "grad_norm": 0.15188927596110904, + "learning_rate": 4.32454954078796e-05, + "loss": 2.7943, + "step": 28456 + }, + { + "epoch": 1.7665280278105406, + "grad_norm": 0.1668872057097755, + "learning_rate": 4.3241917007010956e-05, + "loss": 2.7573, + "step": 28457 + }, + { + "epoch": 1.7665901049102986, + "grad_norm": 0.16252156215444485, + "learning_rate": 4.323833864140087e-05, + "loss": 2.8405, + "step": 28458 + }, + { + "epoch": 1.7666521820100565, + "grad_norm": 0.1650556917108092, + "learning_rate": 4.3234760311068024e-05, + "loss": 2.7819, + "step": 28459 + }, + { + "epoch": 1.7667142591098144, + "grad_norm": 0.15104205375380628, + "learning_rate": 4.323118201603106e-05, + "loss": 2.7297, + "step": 28460 + }, + { + "epoch": 1.7667763362095723, + "grad_norm": 0.1556972800869403, + "learning_rate": 4.322760375630868e-05, + "loss": 2.8194, + "step": 28461 + }, + { + "epoch": 1.7668384133093302, + "grad_norm": 0.1661975705053426, + "learning_rate": 4.322402553191951e-05, + "loss": 2.777, + "step": 28462 + }, + { + "epoch": 1.7669004904090881, + "grad_norm": 0.1653972553407149, + "learning_rate": 4.322044734288226e-05, + "loss": 2.7177, + "step": 28463 + }, + { + "epoch": 1.766962567508846, + "grad_norm": 0.16620352713968484, + "learning_rate": 4.321686918921557e-05, + "loss": 2.8144, + "step": 28464 + }, + { + "epoch": 1.767024644608604, + "grad_norm": 0.15900994828579298, + "learning_rate": 4.3213291070938126e-05, + "loss": 2.8035, + "step": 28465 + }, + { + "epoch": 1.767086721708362, + "grad_norm": 0.17346888829491872, + "learning_rate": 4.32097129880686e-05, + "loss": 2.9275, + "step": 28466 + }, + { + "epoch": 1.7671487988081198, + "grad_norm": 0.17369511154994305, + "learning_rate": 4.320613494062564e-05, + "loss": 2.8108, + "step": 28467 + }, + { + "epoch": 1.7672108759078777, + "grad_norm": 0.14208273853483275, + "learning_rate": 4.3202556928627934e-05, + "loss": 2.7646, + "step": 28468 + }, + { + "epoch": 1.7672729530076356, + "grad_norm": 0.1556832286182927, + "learning_rate": 4.319897895209414e-05, + "loss": 2.748, + "step": 28469 + }, + { + "epoch": 1.7673350301073933, + "grad_norm": 0.1491869840213497, + "learning_rate": 4.319540101104291e-05, + "loss": 2.7669, + "step": 28470 + }, + { + "epoch": 1.7673971072071513, + "grad_norm": 0.176877776686909, + "learning_rate": 4.319182310549294e-05, + "loss": 2.7568, + "step": 28471 + }, + { + "epoch": 1.7674591843069092, + "grad_norm": 0.15362192967460694, + "learning_rate": 4.318824523546286e-05, + "loss": 2.7434, + "step": 28472 + }, + { + "epoch": 1.767521261406667, + "grad_norm": 0.16172621728178338, + "learning_rate": 4.318466740097138e-05, + "loss": 2.8193, + "step": 28473 + }, + { + "epoch": 1.767583338506425, + "grad_norm": 0.16752758975870052, + "learning_rate": 4.3181089602037126e-05, + "loss": 2.8875, + "step": 28474 + }, + { + "epoch": 1.7676454156061827, + "grad_norm": 0.15289247554581126, + "learning_rate": 4.31775118386788e-05, + "loss": 2.7815, + "step": 28475 + }, + { + "epoch": 1.7677074927059406, + "grad_norm": 0.19728594382611986, + "learning_rate": 4.317393411091504e-05, + "loss": 2.7886, + "step": 28476 + }, + { + "epoch": 1.7677695698056985, + "grad_norm": 0.14364270995490172, + "learning_rate": 4.3170356418764526e-05, + "loss": 2.7748, + "step": 28477 + }, + { + "epoch": 1.7678316469054565, + "grad_norm": 0.1704524023380079, + "learning_rate": 4.316677876224592e-05, + "loss": 2.8261, + "step": 28478 + }, + { + "epoch": 1.7678937240052144, + "grad_norm": 0.17015985836234954, + "learning_rate": 4.316320114137789e-05, + "loss": 2.8099, + "step": 28479 + }, + { + "epoch": 1.7679558011049723, + "grad_norm": 0.14921079784194832, + "learning_rate": 4.315962355617911e-05, + "loss": 2.7834, + "step": 28480 + }, + { + "epoch": 1.7680178782047302, + "grad_norm": 0.16742106905203402, + "learning_rate": 4.315604600666821e-05, + "loss": 2.794, + "step": 28481 + }, + { + "epoch": 1.7680799553044881, + "grad_norm": 0.16401871028700957, + "learning_rate": 4.3152468492863904e-05, + "loss": 2.7917, + "step": 28482 + }, + { + "epoch": 1.768142032404246, + "grad_norm": 0.16660598243673005, + "learning_rate": 4.314889101478481e-05, + "loss": 2.8162, + "step": 28483 + }, + { + "epoch": 1.768204109504004, + "grad_norm": 0.15533299773289824, + "learning_rate": 4.314531357244963e-05, + "loss": 2.7156, + "step": 28484 + }, + { + "epoch": 1.768266186603762, + "grad_norm": 0.1539850585086232, + "learning_rate": 4.314173616587701e-05, + "loss": 2.826, + "step": 28485 + }, + { + "epoch": 1.7683282637035198, + "grad_norm": 0.15347921434705247, + "learning_rate": 4.3138158795085614e-05, + "loss": 2.782, + "step": 28486 + }, + { + "epoch": 1.7683903408032777, + "grad_norm": 0.16009415609087457, + "learning_rate": 4.31345814600941e-05, + "loss": 2.8659, + "step": 28487 + }, + { + "epoch": 1.7684524179030356, + "grad_norm": 0.19155116337062192, + "learning_rate": 4.3131004160921164e-05, + "loss": 2.7485, + "step": 28488 + }, + { + "epoch": 1.7685144950027936, + "grad_norm": 0.19636441203201585, + "learning_rate": 4.3127426897585426e-05, + "loss": 2.7356, + "step": 28489 + }, + { + "epoch": 1.7685765721025515, + "grad_norm": 0.18107114756142628, + "learning_rate": 4.312384967010559e-05, + "loss": 2.8058, + "step": 28490 + }, + { + "epoch": 1.7686386492023094, + "grad_norm": 0.16472380622473753, + "learning_rate": 4.312027247850029e-05, + "loss": 2.7786, + "step": 28491 + }, + { + "epoch": 1.7687007263020673, + "grad_norm": 0.14640097469161062, + "learning_rate": 4.311669532278819e-05, + "loss": 2.7457, + "step": 28492 + }, + { + "epoch": 1.768762803401825, + "grad_norm": 0.16500150941750782, + "learning_rate": 4.311311820298798e-05, + "loss": 2.7664, + "step": 28493 + }, + { + "epoch": 1.768824880501583, + "grad_norm": 0.1759539868081575, + "learning_rate": 4.310954111911829e-05, + "loss": 2.771, + "step": 28494 + }, + { + "epoch": 1.7688869576013408, + "grad_norm": 0.159696984643945, + "learning_rate": 4.310596407119779e-05, + "loss": 2.8249, + "step": 28495 + }, + { + "epoch": 1.7689490347010988, + "grad_norm": 0.1497011418686531, + "learning_rate": 4.310238705924515e-05, + "loss": 2.7902, + "step": 28496 + }, + { + "epoch": 1.7690111118008567, + "grad_norm": 0.15430906706336822, + "learning_rate": 4.3098810083279035e-05, + "loss": 2.7387, + "step": 28497 + }, + { + "epoch": 1.7690731889006146, + "grad_norm": 0.17145960635442298, + "learning_rate": 4.3095233143318114e-05, + "loss": 2.6669, + "step": 28498 + }, + { + "epoch": 1.7691352660003723, + "grad_norm": 0.14033893124919233, + "learning_rate": 4.3091656239381024e-05, + "loss": 2.7122, + "step": 28499 + }, + { + "epoch": 1.7691973431001302, + "grad_norm": 0.21429119145682307, + "learning_rate": 4.308807937148646e-05, + "loss": 2.6079, + "step": 28500 + }, + { + "epoch": 1.7692594201998881, + "grad_norm": 0.16606484412820532, + "learning_rate": 4.3084502539653035e-05, + "loss": 2.7276, + "step": 28501 + }, + { + "epoch": 1.769321497299646, + "grad_norm": 0.14248440549456703, + "learning_rate": 4.308092574389947e-05, + "loss": 2.6845, + "step": 28502 + }, + { + "epoch": 1.769383574399404, + "grad_norm": 0.1445392078064696, + "learning_rate": 4.307734898424437e-05, + "loss": 2.6955, + "step": 28503 + }, + { + "epoch": 1.7694456514991619, + "grad_norm": 0.18413355786660954, + "learning_rate": 4.307377226070644e-05, + "loss": 2.8449, + "step": 28504 + }, + { + "epoch": 1.7695077285989198, + "grad_norm": 0.15399584800523144, + "learning_rate": 4.3070195573304316e-05, + "loss": 2.7751, + "step": 28505 + }, + { + "epoch": 1.7695698056986777, + "grad_norm": 0.16172716169548973, + "learning_rate": 4.3066618922056656e-05, + "loss": 2.6575, + "step": 28506 + }, + { + "epoch": 1.7696318827984356, + "grad_norm": 0.16543425696681563, + "learning_rate": 4.306304230698214e-05, + "loss": 2.7747, + "step": 28507 + }, + { + "epoch": 1.7696939598981936, + "grad_norm": 0.15436263433537534, + "learning_rate": 4.30594657280994e-05, + "loss": 2.862, + "step": 28508 + }, + { + "epoch": 1.7697560369979515, + "grad_norm": 0.15876994023720295, + "learning_rate": 4.305588918542713e-05, + "loss": 2.8225, + "step": 28509 + }, + { + "epoch": 1.7698181140977094, + "grad_norm": 0.15722652103589382, + "learning_rate": 4.305231267898396e-05, + "loss": 2.7128, + "step": 28510 + }, + { + "epoch": 1.7698801911974673, + "grad_norm": 0.16592614623557972, + "learning_rate": 4.304873620878857e-05, + "loss": 2.7711, + "step": 28511 + }, + { + "epoch": 1.7699422682972252, + "grad_norm": 0.15369332247746703, + "learning_rate": 4.30451597748596e-05, + "loss": 2.8764, + "step": 28512 + }, + { + "epoch": 1.7700043453969831, + "grad_norm": 0.14997659391337392, + "learning_rate": 4.304158337721573e-05, + "loss": 2.8097, + "step": 28513 + }, + { + "epoch": 1.770066422496741, + "grad_norm": 0.1472147218675023, + "learning_rate": 4.30380070158756e-05, + "loss": 2.8042, + "step": 28514 + }, + { + "epoch": 1.770128499596499, + "grad_norm": 0.15835235174570433, + "learning_rate": 4.303443069085789e-05, + "loss": 2.809, + "step": 28515 + }, + { + "epoch": 1.770190576696257, + "grad_norm": 0.1528790617903835, + "learning_rate": 4.303085440218124e-05, + "loss": 2.692, + "step": 28516 + }, + { + "epoch": 1.7702526537960146, + "grad_norm": 0.15333415533857012, + "learning_rate": 4.3027278149864306e-05, + "loss": 2.7637, + "step": 28517 + }, + { + "epoch": 1.7703147308957725, + "grad_norm": 0.16069741813381386, + "learning_rate": 4.302370193392577e-05, + "loss": 2.805, + "step": 28518 + }, + { + "epoch": 1.7703768079955304, + "grad_norm": 0.150989232882562, + "learning_rate": 4.302012575438426e-05, + "loss": 2.7073, + "step": 28519 + }, + { + "epoch": 1.7704388850952884, + "grad_norm": 0.16236905475338084, + "learning_rate": 4.301654961125846e-05, + "loss": 2.7108, + "step": 28520 + }, + { + "epoch": 1.7705009621950463, + "grad_norm": 0.14743416903227172, + "learning_rate": 4.3012973504567e-05, + "loss": 2.7721, + "step": 28521 + }, + { + "epoch": 1.7705630392948042, + "grad_norm": 0.15061494891675062, + "learning_rate": 4.300939743432857e-05, + "loss": 2.7893, + "step": 28522 + }, + { + "epoch": 1.7706251163945619, + "grad_norm": 0.16205096054113208, + "learning_rate": 4.3005821400561804e-05, + "loss": 2.8377, + "step": 28523 + }, + { + "epoch": 1.7706871934943198, + "grad_norm": 0.1458292170068541, + "learning_rate": 4.3002245403285366e-05, + "loss": 2.7508, + "step": 28524 + }, + { + "epoch": 1.7707492705940777, + "grad_norm": 0.16139281709984396, + "learning_rate": 4.299866944251791e-05, + "loss": 2.7543, + "step": 28525 + }, + { + "epoch": 1.7708113476938356, + "grad_norm": 0.14958350851886376, + "learning_rate": 4.29950935182781e-05, + "loss": 2.697, + "step": 28526 + }, + { + "epoch": 1.7708734247935936, + "grad_norm": 0.15398516132690074, + "learning_rate": 4.299151763058458e-05, + "loss": 2.8093, + "step": 28527 + }, + { + "epoch": 1.7709355018933515, + "grad_norm": 0.14439208186644464, + "learning_rate": 4.298794177945602e-05, + "loss": 2.6334, + "step": 28528 + }, + { + "epoch": 1.7709975789931094, + "grad_norm": 0.16139172069485266, + "learning_rate": 4.298436596491105e-05, + "loss": 2.7535, + "step": 28529 + }, + { + "epoch": 1.7710596560928673, + "grad_norm": 0.1531292398994602, + "learning_rate": 4.298079018696838e-05, + "loss": 2.8941, + "step": 28530 + }, + { + "epoch": 1.7711217331926252, + "grad_norm": 0.16942666229229347, + "learning_rate": 4.297721444564661e-05, + "loss": 2.7883, + "step": 28531 + }, + { + "epoch": 1.7711838102923831, + "grad_norm": 0.15543741888129278, + "learning_rate": 4.297363874096443e-05, + "loss": 2.8033, + "step": 28532 + }, + { + "epoch": 1.771245887392141, + "grad_norm": 0.16136602806241762, + "learning_rate": 4.297006307294046e-05, + "loss": 2.7658, + "step": 28533 + }, + { + "epoch": 1.771307964491899, + "grad_norm": 0.16167454829576872, + "learning_rate": 4.2966487441593404e-05, + "loss": 2.7278, + "step": 28534 + }, + { + "epoch": 1.771370041591657, + "grad_norm": 0.16059121519680036, + "learning_rate": 4.296291184694187e-05, + "loss": 2.8453, + "step": 28535 + }, + { + "epoch": 1.7714321186914148, + "grad_norm": 0.15764247781203686, + "learning_rate": 4.295933628900455e-05, + "loss": 2.8502, + "step": 28536 + }, + { + "epoch": 1.7714941957911727, + "grad_norm": 0.14996225483169923, + "learning_rate": 4.2955760767800066e-05, + "loss": 2.7107, + "step": 28537 + }, + { + "epoch": 1.7715562728909307, + "grad_norm": 0.18005312127485543, + "learning_rate": 4.295218528334711e-05, + "loss": 2.7247, + "step": 28538 + }, + { + "epoch": 1.7716183499906886, + "grad_norm": 0.1343594109563672, + "learning_rate": 4.294860983566429e-05, + "loss": 2.7012, + "step": 28539 + }, + { + "epoch": 1.7716804270904465, + "grad_norm": 0.17708646663997857, + "learning_rate": 4.29450344247703e-05, + "loss": 2.7348, + "step": 28540 + }, + { + "epoch": 1.7717425041902042, + "grad_norm": 0.15332618629235178, + "learning_rate": 4.294145905068378e-05, + "loss": 2.7691, + "step": 28541 + }, + { + "epoch": 1.771804581289962, + "grad_norm": 0.14760154058744687, + "learning_rate": 4.2937883713423374e-05, + "loss": 2.7956, + "step": 28542 + }, + { + "epoch": 1.77186665838972, + "grad_norm": 0.1412004116034452, + "learning_rate": 4.2934308413007744e-05, + "loss": 2.6809, + "step": 28543 + }, + { + "epoch": 1.771928735489478, + "grad_norm": 0.13764601020961365, + "learning_rate": 4.293073314945554e-05, + "loss": 2.7621, + "step": 28544 + }, + { + "epoch": 1.7719908125892359, + "grad_norm": 0.13565887666429588, + "learning_rate": 4.292715792278542e-05, + "loss": 2.7952, + "step": 28545 + }, + { + "epoch": 1.7720528896889938, + "grad_norm": 0.15300802146579867, + "learning_rate": 4.292358273301603e-05, + "loss": 2.786, + "step": 28546 + }, + { + "epoch": 1.7721149667887515, + "grad_norm": 0.14401480189217852, + "learning_rate": 4.292000758016604e-05, + "loss": 2.7813, + "step": 28547 + }, + { + "epoch": 1.7721770438885094, + "grad_norm": 0.13734334236058487, + "learning_rate": 4.291643246425407e-05, + "loss": 2.7391, + "step": 28548 + }, + { + "epoch": 1.7722391209882673, + "grad_norm": 0.15520135537938576, + "learning_rate": 4.291285738529881e-05, + "loss": 2.7268, + "step": 28549 + }, + { + "epoch": 1.7723011980880252, + "grad_norm": 0.16261774803869553, + "learning_rate": 4.290928234331888e-05, + "loss": 2.8142, + "step": 28550 + }, + { + "epoch": 1.7723632751877831, + "grad_norm": 0.15373034833327356, + "learning_rate": 4.290570733833295e-05, + "loss": 2.7575, + "step": 28551 + }, + { + "epoch": 1.772425352287541, + "grad_norm": 0.1465765980648206, + "learning_rate": 4.290213237035966e-05, + "loss": 2.718, + "step": 28552 + }, + { + "epoch": 1.772487429387299, + "grad_norm": 0.16830477260581606, + "learning_rate": 4.2898557439417686e-05, + "loss": 2.8433, + "step": 28553 + }, + { + "epoch": 1.772549506487057, + "grad_norm": 0.13974083997872544, + "learning_rate": 4.289498254552565e-05, + "loss": 2.8083, + "step": 28554 + }, + { + "epoch": 1.7726115835868148, + "grad_norm": 0.15081802547235434, + "learning_rate": 4.2891407688702215e-05, + "loss": 2.7536, + "step": 28555 + }, + { + "epoch": 1.7726736606865727, + "grad_norm": 0.135040534929806, + "learning_rate": 4.288783286896604e-05, + "loss": 2.6999, + "step": 28556 + }, + { + "epoch": 1.7727357377863306, + "grad_norm": 0.1441578875390173, + "learning_rate": 4.288425808633575e-05, + "loss": 2.7487, + "step": 28557 + }, + { + "epoch": 1.7727978148860886, + "grad_norm": 0.14541438002437182, + "learning_rate": 4.288068334083003e-05, + "loss": 2.7105, + "step": 28558 + }, + { + "epoch": 1.7728598919858465, + "grad_norm": 0.14290700501568462, + "learning_rate": 4.28771086324675e-05, + "loss": 2.7594, + "step": 28559 + }, + { + "epoch": 1.7729219690856044, + "grad_norm": 0.16949460143273487, + "learning_rate": 4.287353396126684e-05, + "loss": 2.7852, + "step": 28560 + }, + { + "epoch": 1.7729840461853623, + "grad_norm": 0.14537159092572235, + "learning_rate": 4.286995932724667e-05, + "loss": 2.7419, + "step": 28561 + }, + { + "epoch": 1.7730461232851202, + "grad_norm": 0.14908777400545165, + "learning_rate": 4.286638473042564e-05, + "loss": 2.7556, + "step": 28562 + }, + { + "epoch": 1.7731082003848782, + "grad_norm": 0.13900530045938284, + "learning_rate": 4.286281017082243e-05, + "loss": 2.7214, + "step": 28563 + }, + { + "epoch": 1.773170277484636, + "grad_norm": 0.16608440993738935, + "learning_rate": 4.285923564845566e-05, + "loss": 2.8598, + "step": 28564 + }, + { + "epoch": 1.7732323545843938, + "grad_norm": 0.17902071218894547, + "learning_rate": 4.285566116334401e-05, + "loss": 2.8137, + "step": 28565 + }, + { + "epoch": 1.7732944316841517, + "grad_norm": 0.14386217441571714, + "learning_rate": 4.285208671550611e-05, + "loss": 2.8437, + "step": 28566 + }, + { + "epoch": 1.7733565087839096, + "grad_norm": 0.15116839737551557, + "learning_rate": 4.284851230496059e-05, + "loss": 2.7606, + "step": 28567 + }, + { + "epoch": 1.7734185858836675, + "grad_norm": 0.17436460688768393, + "learning_rate": 4.2844937931726133e-05, + "loss": 2.8479, + "step": 28568 + }, + { + "epoch": 1.7734806629834254, + "grad_norm": 0.1484352036348702, + "learning_rate": 4.2841363595821364e-05, + "loss": 2.7, + "step": 28569 + }, + { + "epoch": 1.7735427400831834, + "grad_norm": 0.1458622373025274, + "learning_rate": 4.2837789297264954e-05, + "loss": 2.716, + "step": 28570 + }, + { + "epoch": 1.773604817182941, + "grad_norm": 0.15730639024584486, + "learning_rate": 4.283421503607552e-05, + "loss": 2.8413, + "step": 28571 + }, + { + "epoch": 1.773666894282699, + "grad_norm": 0.14694843536227362, + "learning_rate": 4.283064081227173e-05, + "loss": 2.7801, + "step": 28572 + }, + { + "epoch": 1.7737289713824569, + "grad_norm": 0.15528721418382838, + "learning_rate": 4.282706662587222e-05, + "loss": 2.7334, + "step": 28573 + }, + { + "epoch": 1.7737910484822148, + "grad_norm": 0.14479113745649855, + "learning_rate": 4.2823492476895655e-05, + "loss": 2.7229, + "step": 28574 + }, + { + "epoch": 1.7738531255819727, + "grad_norm": 0.15446370799112524, + "learning_rate": 4.2819918365360664e-05, + "loss": 2.8011, + "step": 28575 + }, + { + "epoch": 1.7739152026817306, + "grad_norm": 0.15006444286826856, + "learning_rate": 4.2816344291285915e-05, + "loss": 2.7508, + "step": 28576 + }, + { + "epoch": 1.7739772797814886, + "grad_norm": 0.15303374802285966, + "learning_rate": 4.281277025469003e-05, + "loss": 2.7753, + "step": 28577 + }, + { + "epoch": 1.7740393568812465, + "grad_norm": 0.14075467965035277, + "learning_rate": 4.280919625559167e-05, + "loss": 2.7766, + "step": 28578 + }, + { + "epoch": 1.7741014339810044, + "grad_norm": 0.14532253451207439, + "learning_rate": 4.280562229400948e-05, + "loss": 2.8033, + "step": 28579 + }, + { + "epoch": 1.7741635110807623, + "grad_norm": 0.18693178226421062, + "learning_rate": 4.28020483699621e-05, + "loss": 2.711, + "step": 28580 + }, + { + "epoch": 1.7742255881805202, + "grad_norm": 0.14010690245622145, + "learning_rate": 4.2798474483468194e-05, + "loss": 2.7374, + "step": 28581 + }, + { + "epoch": 1.7742876652802781, + "grad_norm": 0.15339239212025438, + "learning_rate": 4.279490063454638e-05, + "loss": 2.7824, + "step": 28582 + }, + { + "epoch": 1.774349742380036, + "grad_norm": 0.14565478333440884, + "learning_rate": 4.2791326823215325e-05, + "loss": 2.8304, + "step": 28583 + }, + { + "epoch": 1.774411819479794, + "grad_norm": 0.1507880174768526, + "learning_rate": 4.278775304949367e-05, + "loss": 2.6822, + "step": 28584 + }, + { + "epoch": 1.774473896579552, + "grad_norm": 0.144785573514396, + "learning_rate": 4.2784179313400055e-05, + "loss": 2.8057, + "step": 28585 + }, + { + "epoch": 1.7745359736793098, + "grad_norm": 0.1478889707763982, + "learning_rate": 4.278060561495313e-05, + "loss": 2.7464, + "step": 28586 + }, + { + "epoch": 1.7745980507790677, + "grad_norm": 0.14176107170035307, + "learning_rate": 4.2777031954171535e-05, + "loss": 2.7075, + "step": 28587 + }, + { + "epoch": 1.7746601278788257, + "grad_norm": 0.1411516539991097, + "learning_rate": 4.277345833107392e-05, + "loss": 2.6909, + "step": 28588 + }, + { + "epoch": 1.7747222049785834, + "grad_norm": 0.15048475539862727, + "learning_rate": 4.276988474567893e-05, + "loss": 2.706, + "step": 28589 + }, + { + "epoch": 1.7747842820783413, + "grad_norm": 0.14688779592574314, + "learning_rate": 4.276631119800521e-05, + "loss": 2.71, + "step": 28590 + }, + { + "epoch": 1.7748463591780992, + "grad_norm": 0.13477081482334258, + "learning_rate": 4.276273768807138e-05, + "loss": 2.7493, + "step": 28591 + }, + { + "epoch": 1.774908436277857, + "grad_norm": 0.14336735316016438, + "learning_rate": 4.275916421589613e-05, + "loss": 2.8401, + "step": 28592 + }, + { + "epoch": 1.774970513377615, + "grad_norm": 0.15660038048176078, + "learning_rate": 4.275559078149806e-05, + "loss": 2.7936, + "step": 28593 + }, + { + "epoch": 1.775032590477373, + "grad_norm": 0.16798548836809693, + "learning_rate": 4.2752017384895844e-05, + "loss": 2.8241, + "step": 28594 + }, + { + "epoch": 1.7750946675771306, + "grad_norm": 0.14630972941625853, + "learning_rate": 4.274844402610809e-05, + "loss": 2.7982, + "step": 28595 + }, + { + "epoch": 1.7751567446768886, + "grad_norm": 0.14962742101531945, + "learning_rate": 4.2744870705153474e-05, + "loss": 2.733, + "step": 28596 + }, + { + "epoch": 1.7752188217766465, + "grad_norm": 0.14929718856938287, + "learning_rate": 4.274129742205064e-05, + "loss": 2.7319, + "step": 28597 + }, + { + "epoch": 1.7752808988764044, + "grad_norm": 0.14972660934523147, + "learning_rate": 4.273772417681821e-05, + "loss": 2.6673, + "step": 28598 + }, + { + "epoch": 1.7753429759761623, + "grad_norm": 0.1449217804037027, + "learning_rate": 4.273415096947485e-05, + "loss": 2.8039, + "step": 28599 + }, + { + "epoch": 1.7754050530759202, + "grad_norm": 0.14319859019381673, + "learning_rate": 4.273057780003917e-05, + "loss": 2.8126, + "step": 28600 + }, + { + "epoch": 1.7754671301756781, + "grad_norm": 0.14158638839974036, + "learning_rate": 4.2727004668529844e-05, + "loss": 2.8097, + "step": 28601 + }, + { + "epoch": 1.775529207275436, + "grad_norm": 0.1384191089882977, + "learning_rate": 4.2723431574965487e-05, + "loss": 2.7784, + "step": 28602 + }, + { + "epoch": 1.775591284375194, + "grad_norm": 0.14581504491391142, + "learning_rate": 4.271985851936477e-05, + "loss": 2.7554, + "step": 28603 + }, + { + "epoch": 1.775653361474952, + "grad_norm": 0.15421190819641814, + "learning_rate": 4.271628550174632e-05, + "loss": 2.9557, + "step": 28604 + }, + { + "epoch": 1.7757154385747098, + "grad_norm": 0.13655987091065194, + "learning_rate": 4.271271252212876e-05, + "loss": 2.7669, + "step": 28605 + }, + { + "epoch": 1.7757775156744677, + "grad_norm": 0.15381698585503933, + "learning_rate": 4.270913958053076e-05, + "loss": 2.7801, + "step": 28606 + }, + { + "epoch": 1.7758395927742257, + "grad_norm": 0.13901207390025216, + "learning_rate": 4.2705566676970944e-05, + "loss": 2.7007, + "step": 28607 + }, + { + "epoch": 1.7759016698739836, + "grad_norm": 0.15378942218764616, + "learning_rate": 4.270199381146796e-05, + "loss": 2.8505, + "step": 28608 + }, + { + "epoch": 1.7759637469737415, + "grad_norm": 0.19919098866330867, + "learning_rate": 4.269842098404044e-05, + "loss": 2.7657, + "step": 28609 + }, + { + "epoch": 1.7760258240734994, + "grad_norm": 0.15880629268879723, + "learning_rate": 4.269484819470704e-05, + "loss": 2.7407, + "step": 28610 + }, + { + "epoch": 1.7760879011732573, + "grad_norm": 0.2227298464359877, + "learning_rate": 4.2691275443486374e-05, + "loss": 2.7389, + "step": 28611 + }, + { + "epoch": 1.7761499782730152, + "grad_norm": 0.16488131483817153, + "learning_rate": 4.2687702730397114e-05, + "loss": 2.8445, + "step": 28612 + }, + { + "epoch": 1.776212055372773, + "grad_norm": 0.16022443482931273, + "learning_rate": 4.268413005545787e-05, + "loss": 2.7232, + "step": 28613 + }, + { + "epoch": 1.7762741324725309, + "grad_norm": 0.1595943817979935, + "learning_rate": 4.268055741868731e-05, + "loss": 2.8236, + "step": 28614 + }, + { + "epoch": 1.7763362095722888, + "grad_norm": 0.14264362607338055, + "learning_rate": 4.267698482010406e-05, + "loss": 2.8663, + "step": 28615 + }, + { + "epoch": 1.7763982866720467, + "grad_norm": 0.1555666883409796, + "learning_rate": 4.267341225972674e-05, + "loss": 2.719, + "step": 28616 + }, + { + "epoch": 1.7764603637718046, + "grad_norm": 0.153990973472019, + "learning_rate": 4.2669839737574026e-05, + "loss": 2.7493, + "step": 28617 + }, + { + "epoch": 1.7765224408715625, + "grad_norm": 0.2383831541065017, + "learning_rate": 4.2666267253664526e-05, + "loss": 2.8511, + "step": 28618 + }, + { + "epoch": 1.7765845179713202, + "grad_norm": 0.1753286468835333, + "learning_rate": 4.26626948080169e-05, + "loss": 2.7055, + "step": 28619 + }, + { + "epoch": 1.7766465950710781, + "grad_norm": 0.15169873996470956, + "learning_rate": 4.265912240064976e-05, + "loss": 2.8753, + "step": 28620 + }, + { + "epoch": 1.776708672170836, + "grad_norm": 0.15029776790459548, + "learning_rate": 4.2655550031581774e-05, + "loss": 2.8439, + "step": 28621 + }, + { + "epoch": 1.776770749270594, + "grad_norm": 0.19278373188332568, + "learning_rate": 4.265197770083156e-05, + "loss": 2.8372, + "step": 28622 + }, + { + "epoch": 1.776832826370352, + "grad_norm": 0.18467968520785735, + "learning_rate": 4.264840540841777e-05, + "loss": 2.7172, + "step": 28623 + }, + { + "epoch": 1.7768949034701098, + "grad_norm": 0.18325174211204814, + "learning_rate": 4.264483315435902e-05, + "loss": 2.8063, + "step": 28624 + }, + { + "epoch": 1.7769569805698677, + "grad_norm": 0.16886002256470994, + "learning_rate": 4.2641260938673976e-05, + "loss": 2.8123, + "step": 28625 + }, + { + "epoch": 1.7770190576696256, + "grad_norm": 0.168779401955166, + "learning_rate": 4.2637688761381254e-05, + "loss": 2.7512, + "step": 28626 + }, + { + "epoch": 1.7770811347693836, + "grad_norm": 0.17981031615807636, + "learning_rate": 4.2634116622499484e-05, + "loss": 2.7271, + "step": 28627 + }, + { + "epoch": 1.7771432118691415, + "grad_norm": 0.17226678994924524, + "learning_rate": 4.263054452204732e-05, + "loss": 2.81, + "step": 28628 + }, + { + "epoch": 1.7772052889688994, + "grad_norm": 0.15164204530478254, + "learning_rate": 4.2626972460043415e-05, + "loss": 2.7627, + "step": 28629 + }, + { + "epoch": 1.7772673660686573, + "grad_norm": 0.14489465853801953, + "learning_rate": 4.2623400436506363e-05, + "loss": 2.7939, + "step": 28630 + }, + { + "epoch": 1.7773294431684152, + "grad_norm": 0.14413674098625415, + "learning_rate": 4.261982845145484e-05, + "loss": 2.7378, + "step": 28631 + }, + { + "epoch": 1.7773915202681732, + "grad_norm": 0.16405753854788135, + "learning_rate": 4.261625650490744e-05, + "loss": 2.7094, + "step": 28632 + }, + { + "epoch": 1.777453597367931, + "grad_norm": 0.15164241994103042, + "learning_rate": 4.261268459688284e-05, + "loss": 2.6069, + "step": 28633 + }, + { + "epoch": 1.777515674467689, + "grad_norm": 0.14450904314740393, + "learning_rate": 4.260911272739965e-05, + "loss": 2.7531, + "step": 28634 + }, + { + "epoch": 1.777577751567447, + "grad_norm": 0.1449885993619189, + "learning_rate": 4.2605540896476524e-05, + "loss": 2.7155, + "step": 28635 + }, + { + "epoch": 1.7776398286672048, + "grad_norm": 0.15178016807236516, + "learning_rate": 4.2601969104132074e-05, + "loss": 2.8149, + "step": 28636 + }, + { + "epoch": 1.7777019057669625, + "grad_norm": 0.14233892442037918, + "learning_rate": 4.259839735038496e-05, + "loss": 2.7336, + "step": 28637 + }, + { + "epoch": 1.7777639828667204, + "grad_norm": 0.1483807894642578, + "learning_rate": 4.259482563525379e-05, + "loss": 2.7932, + "step": 28638 + }, + { + "epoch": 1.7778260599664784, + "grad_norm": 0.14994698894086098, + "learning_rate": 4.259125395875722e-05, + "loss": 2.7953, + "step": 28639 + }, + { + "epoch": 1.7778881370662363, + "grad_norm": 0.17140844267187724, + "learning_rate": 4.258768232091388e-05, + "loss": 2.7326, + "step": 28640 + }, + { + "epoch": 1.7779502141659942, + "grad_norm": 0.1457813239736257, + "learning_rate": 4.2584110721742394e-05, + "loss": 2.7749, + "step": 28641 + }, + { + "epoch": 1.778012291265752, + "grad_norm": 0.1399535134635799, + "learning_rate": 4.258053916126141e-05, + "loss": 2.6504, + "step": 28642 + }, + { + "epoch": 1.7780743683655098, + "grad_norm": 0.20824615720887574, + "learning_rate": 4.257696763948954e-05, + "loss": 2.7449, + "step": 28643 + }, + { + "epoch": 1.7781364454652677, + "grad_norm": 0.16826681925282244, + "learning_rate": 4.2573396156445444e-05, + "loss": 2.7851, + "step": 28644 + }, + { + "epoch": 1.7781985225650256, + "grad_norm": 0.14518109707419272, + "learning_rate": 4.256982471214774e-05, + "loss": 2.7496, + "step": 28645 + }, + { + "epoch": 1.7782605996647836, + "grad_norm": 0.1530821827950986, + "learning_rate": 4.256625330661507e-05, + "loss": 2.8176, + "step": 28646 + }, + { + "epoch": 1.7783226767645415, + "grad_norm": 0.15733025963691707, + "learning_rate": 4.2562681939866043e-05, + "loss": 2.7996, + "step": 28647 + }, + { + "epoch": 1.7783847538642994, + "grad_norm": 0.18082764926084088, + "learning_rate": 4.2559110611919327e-05, + "loss": 2.679, + "step": 28648 + }, + { + "epoch": 1.7784468309640573, + "grad_norm": 0.14437335642997925, + "learning_rate": 4.255553932279353e-05, + "loss": 2.7396, + "step": 28649 + }, + { + "epoch": 1.7785089080638152, + "grad_norm": 0.1479577606555427, + "learning_rate": 4.2551968072507295e-05, + "loss": 2.8076, + "step": 28650 + }, + { + "epoch": 1.7785709851635731, + "grad_norm": 0.1510818006096194, + "learning_rate": 4.254839686107926e-05, + "loss": 2.7335, + "step": 28651 + }, + { + "epoch": 1.778633062263331, + "grad_norm": 0.18698402777456521, + "learning_rate": 4.2544825688528026e-05, + "loss": 2.8352, + "step": 28652 + }, + { + "epoch": 1.778695139363089, + "grad_norm": 0.1558938975331443, + "learning_rate": 4.254125455487227e-05, + "loss": 2.8004, + "step": 28653 + }, + { + "epoch": 1.778757216462847, + "grad_norm": 0.16222183509072652, + "learning_rate": 4.2537683460130575e-05, + "loss": 2.7305, + "step": 28654 + }, + { + "epoch": 1.7788192935626048, + "grad_norm": 0.14856368273118611, + "learning_rate": 4.253411240432161e-05, + "loss": 2.6619, + "step": 28655 + }, + { + "epoch": 1.7788813706623627, + "grad_norm": 0.15850301270299222, + "learning_rate": 4.253054138746399e-05, + "loss": 2.7959, + "step": 28656 + }, + { + "epoch": 1.7789434477621207, + "grad_norm": 0.15161663543699733, + "learning_rate": 4.252697040957636e-05, + "loss": 2.8035, + "step": 28657 + }, + { + "epoch": 1.7790055248618786, + "grad_norm": 0.15367019724160313, + "learning_rate": 4.252339947067732e-05, + "loss": 2.7574, + "step": 28658 + }, + { + "epoch": 1.7790676019616365, + "grad_norm": 0.14100089432131765, + "learning_rate": 4.2519828570785534e-05, + "loss": 2.8221, + "step": 28659 + }, + { + "epoch": 1.7791296790613944, + "grad_norm": 0.3253610663938633, + "learning_rate": 4.25162577099196e-05, + "loss": 2.8133, + "step": 28660 + }, + { + "epoch": 1.779191756161152, + "grad_norm": 0.15843931862955465, + "learning_rate": 4.2512686888098174e-05, + "loss": 2.8033, + "step": 28661 + }, + { + "epoch": 1.77925383326091, + "grad_norm": 0.14717728446733233, + "learning_rate": 4.250911610533989e-05, + "loss": 2.7821, + "step": 28662 + }, + { + "epoch": 1.779315910360668, + "grad_norm": 0.14944665216805372, + "learning_rate": 4.2505545361663354e-05, + "loss": 2.838, + "step": 28663 + }, + { + "epoch": 1.7793779874604259, + "grad_norm": 0.15796291104702673, + "learning_rate": 4.250197465708722e-05, + "loss": 2.7891, + "step": 28664 + }, + { + "epoch": 1.7794400645601838, + "grad_norm": 0.18816548702309752, + "learning_rate": 4.2498403991630105e-05, + "loss": 2.7752, + "step": 28665 + }, + { + "epoch": 1.7795021416599417, + "grad_norm": 0.15570805261894743, + "learning_rate": 4.249483336531063e-05, + "loss": 2.8002, + "step": 28666 + }, + { + "epoch": 1.7795642187596994, + "grad_norm": 0.1779981270072741, + "learning_rate": 4.249126277814743e-05, + "loss": 2.7606, + "step": 28667 + }, + { + "epoch": 1.7796262958594573, + "grad_norm": 0.14865724084647003, + "learning_rate": 4.2487692230159134e-05, + "loss": 2.7573, + "step": 28668 + }, + { + "epoch": 1.7796883729592152, + "grad_norm": 0.1688531061708289, + "learning_rate": 4.248412172136438e-05, + "loss": 2.8756, + "step": 28669 + }, + { + "epoch": 1.7797504500589731, + "grad_norm": 0.14305218831001545, + "learning_rate": 4.248055125178178e-05, + "loss": 2.8187, + "step": 28670 + }, + { + "epoch": 1.779812527158731, + "grad_norm": 0.16356087245867532, + "learning_rate": 4.2476980821429966e-05, + "loss": 2.8215, + "step": 28671 + }, + { + "epoch": 1.779874604258489, + "grad_norm": 0.1387964548707749, + "learning_rate": 4.2473410430327574e-05, + "loss": 2.7694, + "step": 28672 + }, + { + "epoch": 1.779936681358247, + "grad_norm": 0.1844218832958271, + "learning_rate": 4.246984007849323e-05, + "loss": 2.6789, + "step": 28673 + }, + { + "epoch": 1.7799987584580048, + "grad_norm": 0.14042311985962025, + "learning_rate": 4.246626976594555e-05, + "loss": 2.8027, + "step": 28674 + }, + { + "epoch": 1.7800608355577627, + "grad_norm": 0.18576741413285167, + "learning_rate": 4.246269949270318e-05, + "loss": 2.813, + "step": 28675 + }, + { + "epoch": 1.7801229126575207, + "grad_norm": 0.16841322287767704, + "learning_rate": 4.2459129258784734e-05, + "loss": 2.7156, + "step": 28676 + }, + { + "epoch": 1.7801849897572786, + "grad_norm": 0.13876635059896764, + "learning_rate": 4.245555906420884e-05, + "loss": 2.688, + "step": 28677 + }, + { + "epoch": 1.7802470668570365, + "grad_norm": 0.16883799280916698, + "learning_rate": 4.2451988908994126e-05, + "loss": 2.6371, + "step": 28678 + }, + { + "epoch": 1.7803091439567944, + "grad_norm": 0.14865617580985335, + "learning_rate": 4.244841879315921e-05, + "loss": 2.7688, + "step": 28679 + }, + { + "epoch": 1.7803712210565523, + "grad_norm": 0.19699411308441256, + "learning_rate": 4.244484871672274e-05, + "loss": 2.7433, + "step": 28680 + }, + { + "epoch": 1.7804332981563102, + "grad_norm": 0.14976923621315688, + "learning_rate": 4.244127867970332e-05, + "loss": 2.812, + "step": 28681 + }, + { + "epoch": 1.7804953752560682, + "grad_norm": 0.15756202238170136, + "learning_rate": 4.243770868211959e-05, + "loss": 2.7768, + "step": 28682 + }, + { + "epoch": 1.780557452355826, + "grad_norm": 0.17552903412772938, + "learning_rate": 4.243413872399016e-05, + "loss": 2.8653, + "step": 28683 + }, + { + "epoch": 1.780619529455584, + "grad_norm": 0.16177359816486628, + "learning_rate": 4.243056880533367e-05, + "loss": 2.8, + "step": 28684 + }, + { + "epoch": 1.7806816065553417, + "grad_norm": 0.16022884654156674, + "learning_rate": 4.2426998926168735e-05, + "loss": 2.7969, + "step": 28685 + }, + { + "epoch": 1.7807436836550996, + "grad_norm": 0.22252662968134695, + "learning_rate": 4.2423429086514e-05, + "loss": 2.7609, + "step": 28686 + }, + { + "epoch": 1.7808057607548575, + "grad_norm": 0.15084663836161966, + "learning_rate": 4.241985928638807e-05, + "loss": 2.7393, + "step": 28687 + }, + { + "epoch": 1.7808678378546154, + "grad_norm": 0.14082286742454758, + "learning_rate": 4.241628952580956e-05, + "loss": 2.7664, + "step": 28688 + }, + { + "epoch": 1.7809299149543734, + "grad_norm": 0.14949230563894467, + "learning_rate": 4.2412719804797127e-05, + "loss": 2.797, + "step": 28689 + }, + { + "epoch": 1.7809919920541313, + "grad_norm": 0.142652831689465, + "learning_rate": 4.2409150123369356e-05, + "loss": 2.779, + "step": 28690 + }, + { + "epoch": 1.781054069153889, + "grad_norm": 0.1542160087851998, + "learning_rate": 4.240558048154492e-05, + "loss": 2.7547, + "step": 28691 + }, + { + "epoch": 1.781116146253647, + "grad_norm": 0.17306443484834183, + "learning_rate": 4.240201087934239e-05, + "loss": 2.8396, + "step": 28692 + }, + { + "epoch": 1.7811782233534048, + "grad_norm": 0.14238221936305118, + "learning_rate": 4.23984413167804e-05, + "loss": 2.7287, + "step": 28693 + }, + { + "epoch": 1.7812403004531627, + "grad_norm": 0.14962179416275143, + "learning_rate": 4.239487179387761e-05, + "loss": 2.7296, + "step": 28694 + }, + { + "epoch": 1.7813023775529206, + "grad_norm": 0.15329211273354845, + "learning_rate": 4.2391302310652614e-05, + "loss": 2.7758, + "step": 28695 + }, + { + "epoch": 1.7813644546526786, + "grad_norm": 0.17778122462612594, + "learning_rate": 4.2387732867124055e-05, + "loss": 2.8526, + "step": 28696 + }, + { + "epoch": 1.7814265317524365, + "grad_norm": 0.148361054612345, + "learning_rate": 4.238416346331052e-05, + "loss": 2.7774, + "step": 28697 + }, + { + "epoch": 1.7814886088521944, + "grad_norm": 0.19405189919811544, + "learning_rate": 4.238059409923067e-05, + "loss": 2.7734, + "step": 28698 + }, + { + "epoch": 1.7815506859519523, + "grad_norm": 0.14926326222617659, + "learning_rate": 4.23770247749031e-05, + "loss": 2.7822, + "step": 28699 + }, + { + "epoch": 1.7816127630517102, + "grad_norm": 0.1522736938495542, + "learning_rate": 4.237345549034645e-05, + "loss": 2.6388, + "step": 28700 + }, + { + "epoch": 1.7816748401514682, + "grad_norm": 0.16135229132920853, + "learning_rate": 4.236988624557934e-05, + "loss": 2.8547, + "step": 28701 + }, + { + "epoch": 1.781736917251226, + "grad_norm": 0.14299346574696464, + "learning_rate": 4.236631704062037e-05, + "loss": 2.7329, + "step": 28702 + }, + { + "epoch": 1.781798994350984, + "grad_norm": 0.16475148511565993, + "learning_rate": 4.236274787548819e-05, + "loss": 2.7827, + "step": 28703 + }, + { + "epoch": 1.781861071450742, + "grad_norm": 0.1553093131024315, + "learning_rate": 4.2359178750201393e-05, + "loss": 2.7798, + "step": 28704 + }, + { + "epoch": 1.7819231485504998, + "grad_norm": 0.16114664885329577, + "learning_rate": 4.235560966477863e-05, + "loss": 2.7749, + "step": 28705 + }, + { + "epoch": 1.7819852256502577, + "grad_norm": 0.1547004395669784, + "learning_rate": 4.23520406192385e-05, + "loss": 2.7025, + "step": 28706 + }, + { + "epoch": 1.7820473027500157, + "grad_norm": 0.14377773888025014, + "learning_rate": 4.2348471613599636e-05, + "loss": 2.807, + "step": 28707 + }, + { + "epoch": 1.7821093798497736, + "grad_norm": 0.16304634664438034, + "learning_rate": 4.234490264788065e-05, + "loss": 2.8446, + "step": 28708 + }, + { + "epoch": 1.7821714569495313, + "grad_norm": 0.14201626396434175, + "learning_rate": 4.2341333722100166e-05, + "loss": 2.7696, + "step": 28709 + }, + { + "epoch": 1.7822335340492892, + "grad_norm": 0.16686438527619285, + "learning_rate": 4.2337764836276796e-05, + "loss": 2.7861, + "step": 28710 + }, + { + "epoch": 1.782295611149047, + "grad_norm": 0.14482639437151895, + "learning_rate": 4.233419599042918e-05, + "loss": 2.7723, + "step": 28711 + }, + { + "epoch": 1.782357688248805, + "grad_norm": 0.14438237435634746, + "learning_rate": 4.2330627184575924e-05, + "loss": 2.7237, + "step": 28712 + }, + { + "epoch": 1.782419765348563, + "grad_norm": 0.14765824765665533, + "learning_rate": 4.232705841873563e-05, + "loss": 2.7475, + "step": 28713 + }, + { + "epoch": 1.7824818424483209, + "grad_norm": 0.1398070641566368, + "learning_rate": 4.2323489692926956e-05, + "loss": 2.8471, + "step": 28714 + }, + { + "epoch": 1.7825439195480786, + "grad_norm": 0.1477674998644155, + "learning_rate": 4.2319921007168486e-05, + "loss": 2.6925, + "step": 28715 + }, + { + "epoch": 1.7826059966478365, + "grad_norm": 0.1385195795775942, + "learning_rate": 4.2316352361478855e-05, + "loss": 2.8105, + "step": 28716 + }, + { + "epoch": 1.7826680737475944, + "grad_norm": 0.1589646085781219, + "learning_rate": 4.231278375587668e-05, + "loss": 2.8288, + "step": 28717 + }, + { + "epoch": 1.7827301508473523, + "grad_norm": 0.14481517251610526, + "learning_rate": 4.230921519038058e-05, + "loss": 2.7417, + "step": 28718 + }, + { + "epoch": 1.7827922279471102, + "grad_norm": 0.14694140029040373, + "learning_rate": 4.230564666500917e-05, + "loss": 2.8213, + "step": 28719 + }, + { + "epoch": 1.7828543050468681, + "grad_norm": 0.13718584514784216, + "learning_rate": 4.2302078179781065e-05, + "loss": 2.7396, + "step": 28720 + }, + { + "epoch": 1.782916382146626, + "grad_norm": 0.15075629848121871, + "learning_rate": 4.2298509734714894e-05, + "loss": 2.7009, + "step": 28721 + }, + { + "epoch": 1.782978459246384, + "grad_norm": 0.15085309619569415, + "learning_rate": 4.229494132982927e-05, + "loss": 2.764, + "step": 28722 + }, + { + "epoch": 1.783040536346142, + "grad_norm": 0.16273384004414315, + "learning_rate": 4.22913729651428e-05, + "loss": 2.8089, + "step": 28723 + }, + { + "epoch": 1.7831026134458998, + "grad_norm": 0.14185122480823398, + "learning_rate": 4.228780464067411e-05, + "loss": 2.8021, + "step": 28724 + }, + { + "epoch": 1.7831646905456577, + "grad_norm": 0.15432506014128922, + "learning_rate": 4.228423635644182e-05, + "loss": 2.648, + "step": 28725 + }, + { + "epoch": 1.7832267676454157, + "grad_norm": 0.15834966983611687, + "learning_rate": 4.2280668112464524e-05, + "loss": 2.6875, + "step": 28726 + }, + { + "epoch": 1.7832888447451736, + "grad_norm": 0.14290275587935877, + "learning_rate": 4.2277099908760866e-05, + "loss": 2.7625, + "step": 28727 + }, + { + "epoch": 1.7833509218449315, + "grad_norm": 0.14529194276254886, + "learning_rate": 4.227353174534946e-05, + "loss": 2.824, + "step": 28728 + }, + { + "epoch": 1.7834129989446894, + "grad_norm": 0.14417936688317487, + "learning_rate": 4.22699636222489e-05, + "loss": 2.7778, + "step": 28729 + }, + { + "epoch": 1.7834750760444473, + "grad_norm": 0.15084057540153017, + "learning_rate": 4.226639553947784e-05, + "loss": 2.8286, + "step": 28730 + }, + { + "epoch": 1.7835371531442052, + "grad_norm": 0.15455300765191685, + "learning_rate": 4.226282749705485e-05, + "loss": 2.7421, + "step": 28731 + }, + { + "epoch": 1.7835992302439632, + "grad_norm": 0.16275763445701488, + "learning_rate": 4.2259259494998585e-05, + "loss": 2.8019, + "step": 28732 + }, + { + "epoch": 1.7836613073437209, + "grad_norm": 0.1601748978993153, + "learning_rate": 4.225569153332763e-05, + "loss": 2.8067, + "step": 28733 + }, + { + "epoch": 1.7837233844434788, + "grad_norm": 0.14850493831603506, + "learning_rate": 4.225212361206062e-05, + "loss": 2.7658, + "step": 28734 + }, + { + "epoch": 1.7837854615432367, + "grad_norm": 0.150527959515223, + "learning_rate": 4.224855573121616e-05, + "loss": 2.7863, + "step": 28735 + }, + { + "epoch": 1.7838475386429946, + "grad_norm": 0.20460280784738616, + "learning_rate": 4.224498789081286e-05, + "loss": 2.8111, + "step": 28736 + }, + { + "epoch": 1.7839096157427525, + "grad_norm": 0.16565699830540176, + "learning_rate": 4.2241420090869356e-05, + "loss": 2.7345, + "step": 28737 + }, + { + "epoch": 1.7839716928425104, + "grad_norm": 0.1804258373978686, + "learning_rate": 4.223785233140423e-05, + "loss": 2.7693, + "step": 28738 + }, + { + "epoch": 1.7840337699422681, + "grad_norm": 0.1560637794872994, + "learning_rate": 4.2234284612436127e-05, + "loss": 2.674, + "step": 28739 + }, + { + "epoch": 1.784095847042026, + "grad_norm": 0.15184321461387093, + "learning_rate": 4.223071693398363e-05, + "loss": 2.7933, + "step": 28740 + }, + { + "epoch": 1.784157924141784, + "grad_norm": 0.1832054678923694, + "learning_rate": 4.222714929606538e-05, + "loss": 2.7159, + "step": 28741 + }, + { + "epoch": 1.784220001241542, + "grad_norm": 0.14103790434783015, + "learning_rate": 4.222358169869997e-05, + "loss": 2.7323, + "step": 28742 + }, + { + "epoch": 1.7842820783412998, + "grad_norm": 0.16098586809291024, + "learning_rate": 4.2220014141906036e-05, + "loss": 2.7618, + "step": 28743 + }, + { + "epoch": 1.7843441554410577, + "grad_norm": 0.14715898938240787, + "learning_rate": 4.2216446625702164e-05, + "loss": 2.6699, + "step": 28744 + }, + { + "epoch": 1.7844062325408157, + "grad_norm": 0.25118377681079057, + "learning_rate": 4.221287915010699e-05, + "loss": 2.7492, + "step": 28745 + }, + { + "epoch": 1.7844683096405736, + "grad_norm": 0.15491483773365491, + "learning_rate": 4.22093117151391e-05, + "loss": 2.7495, + "step": 28746 + }, + { + "epoch": 1.7845303867403315, + "grad_norm": 0.2197758004863369, + "learning_rate": 4.220574432081714e-05, + "loss": 2.8042, + "step": 28747 + }, + { + "epoch": 1.7845924638400894, + "grad_norm": 0.14354173121730016, + "learning_rate": 4.22021769671597e-05, + "loss": 2.798, + "step": 28748 + }, + { + "epoch": 1.7846545409398473, + "grad_norm": 0.15949128083647082, + "learning_rate": 4.2198609654185396e-05, + "loss": 2.6808, + "step": 28749 + }, + { + "epoch": 1.7847166180396052, + "grad_norm": 0.16250741304398278, + "learning_rate": 4.2195042381912844e-05, + "loss": 2.7563, + "step": 28750 + }, + { + "epoch": 1.7847786951393632, + "grad_norm": 0.16518783635405793, + "learning_rate": 4.219147515036063e-05, + "loss": 2.853, + "step": 28751 + }, + { + "epoch": 1.784840772239121, + "grad_norm": 0.16605089598324663, + "learning_rate": 4.21879079595474e-05, + "loss": 2.8691, + "step": 28752 + }, + { + "epoch": 1.784902849338879, + "grad_norm": 0.19293403081490487, + "learning_rate": 4.2184340809491754e-05, + "loss": 2.8419, + "step": 28753 + }, + { + "epoch": 1.784964926438637, + "grad_norm": 0.1536818754358359, + "learning_rate": 4.2180773700212295e-05, + "loss": 2.6866, + "step": 28754 + }, + { + "epoch": 1.7850270035383948, + "grad_norm": 0.14647142353022874, + "learning_rate": 4.217720663172763e-05, + "loss": 2.8008, + "step": 28755 + }, + { + "epoch": 1.7850890806381527, + "grad_norm": 0.13622618911201073, + "learning_rate": 4.2173639604056395e-05, + "loss": 2.684, + "step": 28756 + }, + { + "epoch": 1.7851511577379104, + "grad_norm": 0.151558281652993, + "learning_rate": 4.217007261721716e-05, + "loss": 2.7604, + "step": 28757 + }, + { + "epoch": 1.7852132348376684, + "grad_norm": 0.15077043232630113, + "learning_rate": 4.216650567122855e-05, + "loss": 2.7556, + "step": 28758 + }, + { + "epoch": 1.7852753119374263, + "grad_norm": 0.2433127136562692, + "learning_rate": 4.216293876610921e-05, + "loss": 2.8066, + "step": 28759 + }, + { + "epoch": 1.7853373890371842, + "grad_norm": 0.17187183550609114, + "learning_rate": 4.2159371901877696e-05, + "loss": 2.8022, + "step": 28760 + }, + { + "epoch": 1.7853994661369421, + "grad_norm": 0.1560646626861523, + "learning_rate": 4.215580507855267e-05, + "loss": 2.7898, + "step": 28761 + }, + { + "epoch": 1.7854615432367, + "grad_norm": 0.19856876102022555, + "learning_rate": 4.21522382961527e-05, + "loss": 2.7466, + "step": 28762 + }, + { + "epoch": 1.7855236203364577, + "grad_norm": 0.16233118807202856, + "learning_rate": 4.2148671554696404e-05, + "loss": 2.7823, + "step": 28763 + }, + { + "epoch": 1.7855856974362156, + "grad_norm": 0.1446630353924627, + "learning_rate": 4.2145104854202405e-05, + "loss": 2.7101, + "step": 28764 + }, + { + "epoch": 1.7856477745359736, + "grad_norm": 0.15097046823773375, + "learning_rate": 4.214153819468929e-05, + "loss": 2.8223, + "step": 28765 + }, + { + "epoch": 1.7857098516357315, + "grad_norm": 0.1483839904809899, + "learning_rate": 4.2137971576175686e-05, + "loss": 2.6522, + "step": 28766 + }, + { + "epoch": 1.7857719287354894, + "grad_norm": 0.18675149087872514, + "learning_rate": 4.213440499868018e-05, + "loss": 2.7618, + "step": 28767 + }, + { + "epoch": 1.7858340058352473, + "grad_norm": 0.1752585852859363, + "learning_rate": 4.213083846222141e-05, + "loss": 2.7747, + "step": 28768 + }, + { + "epoch": 1.7858960829350052, + "grad_norm": 0.17243386031212463, + "learning_rate": 4.212727196681795e-05, + "loss": 2.7894, + "step": 28769 + }, + { + "epoch": 1.7859581600347632, + "grad_norm": 0.17394708998735892, + "learning_rate": 4.212370551248844e-05, + "loss": 2.6887, + "step": 28770 + }, + { + "epoch": 1.786020237134521, + "grad_norm": 0.1671653374127116, + "learning_rate": 4.2120139099251455e-05, + "loss": 2.7654, + "step": 28771 + }, + { + "epoch": 1.786082314234279, + "grad_norm": 0.1764199362835722, + "learning_rate": 4.211657272712563e-05, + "loss": 2.8187, + "step": 28772 + }, + { + "epoch": 1.786144391334037, + "grad_norm": 0.15641676824531175, + "learning_rate": 4.211300639612954e-05, + "loss": 2.8371, + "step": 28773 + }, + { + "epoch": 1.7862064684337948, + "grad_norm": 0.15419805959950764, + "learning_rate": 4.210944010628183e-05, + "loss": 2.7712, + "step": 28774 + }, + { + "epoch": 1.7862685455335527, + "grad_norm": 0.19167318026386734, + "learning_rate": 4.2105873857601085e-05, + "loss": 2.7241, + "step": 28775 + }, + { + "epoch": 1.7863306226333107, + "grad_norm": 0.17072816813454092, + "learning_rate": 4.2102307650105896e-05, + "loss": 2.8182, + "step": 28776 + }, + { + "epoch": 1.7863926997330686, + "grad_norm": 0.15119807733371077, + "learning_rate": 4.209874148381491e-05, + "loss": 2.7289, + "step": 28777 + }, + { + "epoch": 1.7864547768328265, + "grad_norm": 0.16414129544847, + "learning_rate": 4.2095175358746684e-05, + "loss": 2.8412, + "step": 28778 + }, + { + "epoch": 1.7865168539325844, + "grad_norm": 0.16055957669692578, + "learning_rate": 4.2091609274919864e-05, + "loss": 2.9718, + "step": 28779 + }, + { + "epoch": 1.7865789310323423, + "grad_norm": 0.15535082178320536, + "learning_rate": 4.208804323235303e-05, + "loss": 2.7703, + "step": 28780 + }, + { + "epoch": 1.7866410081321, + "grad_norm": 0.15102658867938926, + "learning_rate": 4.20844772310648e-05, + "loss": 2.7108, + "step": 28781 + }, + { + "epoch": 1.786703085231858, + "grad_norm": 0.15585283825009083, + "learning_rate": 4.208091127107376e-05, + "loss": 2.7962, + "step": 28782 + }, + { + "epoch": 1.7867651623316159, + "grad_norm": 0.15984869128967646, + "learning_rate": 4.207734535239854e-05, + "loss": 2.767, + "step": 28783 + }, + { + "epoch": 1.7868272394313738, + "grad_norm": 0.15738290512638958, + "learning_rate": 4.207377947505773e-05, + "loss": 2.6895, + "step": 28784 + }, + { + "epoch": 1.7868893165311317, + "grad_norm": 0.15574574164360114, + "learning_rate": 4.207021363906994e-05, + "loss": 2.6747, + "step": 28785 + }, + { + "epoch": 1.7869513936308896, + "grad_norm": 0.16048400766440019, + "learning_rate": 4.206664784445377e-05, + "loss": 2.7944, + "step": 28786 + }, + { + "epoch": 1.7870134707306473, + "grad_norm": 0.15364708670950838, + "learning_rate": 4.206308209122782e-05, + "loss": 2.689, + "step": 28787 + }, + { + "epoch": 1.7870755478304052, + "grad_norm": 0.15830367036850485, + "learning_rate": 4.20595163794107e-05, + "loss": 2.7478, + "step": 28788 + }, + { + "epoch": 1.7871376249301631, + "grad_norm": 0.16987831280806473, + "learning_rate": 4.2055950709021006e-05, + "loss": 2.8643, + "step": 28789 + }, + { + "epoch": 1.787199702029921, + "grad_norm": 0.15869637701612452, + "learning_rate": 4.205238508007735e-05, + "loss": 2.7328, + "step": 28790 + }, + { + "epoch": 1.787261779129679, + "grad_norm": 0.16169009507665777, + "learning_rate": 4.204881949259832e-05, + "loss": 2.8046, + "step": 28791 + }, + { + "epoch": 1.787323856229437, + "grad_norm": 0.19281856932276584, + "learning_rate": 4.204525394660254e-05, + "loss": 2.7655, + "step": 28792 + }, + { + "epoch": 1.7873859333291948, + "grad_norm": 0.17312193054445943, + "learning_rate": 4.2041688442108604e-05, + "loss": 2.8539, + "step": 28793 + }, + { + "epoch": 1.7874480104289527, + "grad_norm": 0.16397785659963487, + "learning_rate": 4.2038122979135094e-05, + "loss": 2.7445, + "step": 28794 + }, + { + "epoch": 1.7875100875287107, + "grad_norm": 0.18342936034721544, + "learning_rate": 4.203455755770065e-05, + "loss": 2.8329, + "step": 28795 + }, + { + "epoch": 1.7875721646284686, + "grad_norm": 0.17843690300890036, + "learning_rate": 4.203099217782384e-05, + "loss": 2.8299, + "step": 28796 + }, + { + "epoch": 1.7876342417282265, + "grad_norm": 0.1810749947888538, + "learning_rate": 4.202742683952329e-05, + "loss": 2.7711, + "step": 28797 + }, + { + "epoch": 1.7876963188279844, + "grad_norm": 0.1766192277571204, + "learning_rate": 4.202386154281758e-05, + "loss": 2.7617, + "step": 28798 + }, + { + "epoch": 1.7877583959277423, + "grad_norm": 0.1610566681789352, + "learning_rate": 4.202029628772533e-05, + "loss": 2.709, + "step": 28799 + }, + { + "epoch": 1.7878204730275002, + "grad_norm": 0.15956366045380813, + "learning_rate": 4.201673107426513e-05, + "loss": 2.7003, + "step": 28800 + }, + { + "epoch": 1.7878825501272582, + "grad_norm": 0.1657976795562041, + "learning_rate": 4.201316590245557e-05, + "loss": 2.7483, + "step": 28801 + }, + { + "epoch": 1.787944627227016, + "grad_norm": 0.157386497043513, + "learning_rate": 4.2009600772315275e-05, + "loss": 2.7253, + "step": 28802 + }, + { + "epoch": 1.788006704326774, + "grad_norm": 0.1512879995126018, + "learning_rate": 4.2006035683862824e-05, + "loss": 2.8029, + "step": 28803 + }, + { + "epoch": 1.788068781426532, + "grad_norm": 0.17062303423185815, + "learning_rate": 4.200247063711683e-05, + "loss": 2.8144, + "step": 28804 + }, + { + "epoch": 1.7881308585262896, + "grad_norm": 0.16229240586765808, + "learning_rate": 4.1998905632095886e-05, + "loss": 2.767, + "step": 28805 + }, + { + "epoch": 1.7881929356260475, + "grad_norm": 0.19162300059923396, + "learning_rate": 4.19953406688186e-05, + "loss": 2.6716, + "step": 28806 + }, + { + "epoch": 1.7882550127258054, + "grad_norm": 0.19017739823284602, + "learning_rate": 4.199177574730355e-05, + "loss": 2.7156, + "step": 28807 + }, + { + "epoch": 1.7883170898255634, + "grad_norm": 0.1528118534601026, + "learning_rate": 4.198821086756937e-05, + "loss": 2.7395, + "step": 28808 + }, + { + "epoch": 1.7883791669253213, + "grad_norm": 0.15537200654131558, + "learning_rate": 4.1984646029634624e-05, + "loss": 2.8041, + "step": 28809 + }, + { + "epoch": 1.7884412440250792, + "grad_norm": 0.21019122779221985, + "learning_rate": 4.1981081233517936e-05, + "loss": 2.8355, + "step": 28810 + }, + { + "epoch": 1.788503321124837, + "grad_norm": 0.2060152802592848, + "learning_rate": 4.1977516479237885e-05, + "loss": 2.7962, + "step": 28811 + }, + { + "epoch": 1.7885653982245948, + "grad_norm": 0.1580792401183862, + "learning_rate": 4.197395176681308e-05, + "loss": 2.7001, + "step": 28812 + }, + { + "epoch": 1.7886274753243527, + "grad_norm": 0.16278496112133947, + "learning_rate": 4.197038709626213e-05, + "loss": 2.7606, + "step": 28813 + }, + { + "epoch": 1.7886895524241107, + "grad_norm": 0.15286972263628307, + "learning_rate": 4.1966822467603596e-05, + "loss": 2.843, + "step": 28814 + }, + { + "epoch": 1.7887516295238686, + "grad_norm": 0.13902662462749105, + "learning_rate": 4.196325788085612e-05, + "loss": 2.7866, + "step": 28815 + }, + { + "epoch": 1.7888137066236265, + "grad_norm": 0.15839719642501032, + "learning_rate": 4.195969333603826e-05, + "loss": 2.8636, + "step": 28816 + }, + { + "epoch": 1.7888757837233844, + "grad_norm": 0.14193430643045782, + "learning_rate": 4.1956128833168644e-05, + "loss": 2.6944, + "step": 28817 + }, + { + "epoch": 1.7889378608231423, + "grad_norm": 0.14829423541058195, + "learning_rate": 4.1952564372265854e-05, + "loss": 2.7849, + "step": 28818 + }, + { + "epoch": 1.7889999379229002, + "grad_norm": 0.1569266390728246, + "learning_rate": 4.194899995334849e-05, + "loss": 2.708, + "step": 28819 + }, + { + "epoch": 1.7890620150226582, + "grad_norm": 0.19187288429137572, + "learning_rate": 4.194543557643514e-05, + "loss": 2.7237, + "step": 28820 + }, + { + "epoch": 1.789124092122416, + "grad_norm": 0.1683386898388491, + "learning_rate": 4.194187124154442e-05, + "loss": 2.8451, + "step": 28821 + }, + { + "epoch": 1.789186169222174, + "grad_norm": 0.15696400583317585, + "learning_rate": 4.193830694869491e-05, + "loss": 2.7832, + "step": 28822 + }, + { + "epoch": 1.789248246321932, + "grad_norm": 0.15386306161571128, + "learning_rate": 4.193474269790521e-05, + "loss": 2.8765, + "step": 28823 + }, + { + "epoch": 1.7893103234216898, + "grad_norm": 0.1993797250265513, + "learning_rate": 4.193117848919389e-05, + "loss": 2.7912, + "step": 28824 + }, + { + "epoch": 1.7893724005214477, + "grad_norm": 0.15516506467752655, + "learning_rate": 4.19276143225796e-05, + "loss": 2.7751, + "step": 28825 + }, + { + "epoch": 1.7894344776212057, + "grad_norm": 0.1581256347010242, + "learning_rate": 4.19240501980809e-05, + "loss": 2.76, + "step": 28826 + }, + { + "epoch": 1.7894965547209636, + "grad_norm": 0.15297689062595415, + "learning_rate": 4.19204861157164e-05, + "loss": 2.7915, + "step": 28827 + }, + { + "epoch": 1.7895586318207215, + "grad_norm": 0.15696188167906833, + "learning_rate": 4.1916922075504665e-05, + "loss": 2.749, + "step": 28828 + }, + { + "epoch": 1.7896207089204792, + "grad_norm": 0.15350787723199563, + "learning_rate": 4.1913358077464323e-05, + "loss": 2.8598, + "step": 28829 + }, + { + "epoch": 1.7896827860202371, + "grad_norm": 0.16810446623552783, + "learning_rate": 4.1909794121613955e-05, + "loss": 2.7527, + "step": 28830 + }, + { + "epoch": 1.789744863119995, + "grad_norm": 0.1761273790994782, + "learning_rate": 4.190623020797215e-05, + "loss": 2.7739, + "step": 28831 + }, + { + "epoch": 1.789806940219753, + "grad_norm": 0.16210918541403105, + "learning_rate": 4.190266633655751e-05, + "loss": 2.8541, + "step": 28832 + }, + { + "epoch": 1.7898690173195109, + "grad_norm": 0.1965749660687185, + "learning_rate": 4.189910250738863e-05, + "loss": 2.7949, + "step": 28833 + }, + { + "epoch": 1.7899310944192688, + "grad_norm": 0.18085851772096934, + "learning_rate": 4.1895538720484087e-05, + "loss": 2.7124, + "step": 28834 + }, + { + "epoch": 1.7899931715190265, + "grad_norm": 0.16206296003521328, + "learning_rate": 4.1891974975862505e-05, + "loss": 2.7841, + "step": 28835 + }, + { + "epoch": 1.7900552486187844, + "grad_norm": 0.15452946262656653, + "learning_rate": 4.188841127354245e-05, + "loss": 2.7976, + "step": 28836 + }, + { + "epoch": 1.7901173257185423, + "grad_norm": 0.18689083445326543, + "learning_rate": 4.188484761354251e-05, + "loss": 2.7803, + "step": 28837 + }, + { + "epoch": 1.7901794028183002, + "grad_norm": 0.1596766361602183, + "learning_rate": 4.1881283995881304e-05, + "loss": 2.7847, + "step": 28838 + }, + { + "epoch": 1.7902414799180582, + "grad_norm": 0.14709848739438242, + "learning_rate": 4.187772042057739e-05, + "loss": 2.7598, + "step": 28839 + }, + { + "epoch": 1.790303557017816, + "grad_norm": 0.16792521910702107, + "learning_rate": 4.187415688764941e-05, + "loss": 2.78, + "step": 28840 + }, + { + "epoch": 1.790365634117574, + "grad_norm": 0.15672598698489776, + "learning_rate": 4.18705933971159e-05, + "loss": 2.7982, + "step": 28841 + }, + { + "epoch": 1.790427711217332, + "grad_norm": 0.16098496561165151, + "learning_rate": 4.18670299489955e-05, + "loss": 2.8333, + "step": 28842 + }, + { + "epoch": 1.7904897883170898, + "grad_norm": 0.18314584922580723, + "learning_rate": 4.1863466543306757e-05, + "loss": 2.8455, + "step": 28843 + }, + { + "epoch": 1.7905518654168477, + "grad_norm": 0.17360074999144326, + "learning_rate": 4.18599031800683e-05, + "loss": 2.7034, + "step": 28844 + }, + { + "epoch": 1.7906139425166057, + "grad_norm": 0.16293789088638003, + "learning_rate": 4.185633985929869e-05, + "loss": 2.8078, + "step": 28845 + }, + { + "epoch": 1.7906760196163636, + "grad_norm": 0.15178511360494382, + "learning_rate": 4.185277658101655e-05, + "loss": 2.7454, + "step": 28846 + }, + { + "epoch": 1.7907380967161215, + "grad_norm": 0.14808333961906842, + "learning_rate": 4.1849213345240446e-05, + "loss": 2.7868, + "step": 28847 + }, + { + "epoch": 1.7908001738158794, + "grad_norm": 0.16558295696984024, + "learning_rate": 4.184565015198897e-05, + "loss": 2.8612, + "step": 28848 + }, + { + "epoch": 1.7908622509156373, + "grad_norm": 0.2636431501777774, + "learning_rate": 4.184208700128072e-05, + "loss": 2.7983, + "step": 28849 + }, + { + "epoch": 1.7909243280153953, + "grad_norm": 0.15996441167801154, + "learning_rate": 4.183852389313428e-05, + "loss": 2.6905, + "step": 28850 + }, + { + "epoch": 1.7909864051151532, + "grad_norm": 0.18771625009037124, + "learning_rate": 4.183496082756825e-05, + "loss": 2.7635, + "step": 28851 + }, + { + "epoch": 1.7910484822149109, + "grad_norm": 0.1504201178923099, + "learning_rate": 4.18313978046012e-05, + "loss": 2.7828, + "step": 28852 + }, + { + "epoch": 1.7911105593146688, + "grad_norm": 0.15747651082056152, + "learning_rate": 4.182783482425174e-05, + "loss": 2.8344, + "step": 28853 + }, + { + "epoch": 1.7911726364144267, + "grad_norm": 0.14602582367273043, + "learning_rate": 4.1824271886538435e-05, + "loss": 2.7037, + "step": 28854 + }, + { + "epoch": 1.7912347135141846, + "grad_norm": 0.1534551707714902, + "learning_rate": 4.182070899147991e-05, + "loss": 2.7978, + "step": 28855 + }, + { + "epoch": 1.7912967906139425, + "grad_norm": 0.1528602793367617, + "learning_rate": 4.181714613909471e-05, + "loss": 2.7632, + "step": 28856 + }, + { + "epoch": 1.7913588677137005, + "grad_norm": 0.15067811833617462, + "learning_rate": 4.181358332940144e-05, + "loss": 2.7784, + "step": 28857 + }, + { + "epoch": 1.7914209448134581, + "grad_norm": 0.16763028709058544, + "learning_rate": 4.181002056241871e-05, + "loss": 2.7144, + "step": 28858 + }, + { + "epoch": 1.791483021913216, + "grad_norm": 0.15184777169790784, + "learning_rate": 4.1806457838165084e-05, + "loss": 2.7744, + "step": 28859 + }, + { + "epoch": 1.791545099012974, + "grad_norm": 0.14323756814649274, + "learning_rate": 4.180289515665917e-05, + "loss": 2.7247, + "step": 28860 + }, + { + "epoch": 1.791607176112732, + "grad_norm": 0.14488026003467483, + "learning_rate": 4.1799332517919535e-05, + "loss": 2.793, + "step": 28861 + }, + { + "epoch": 1.7916692532124898, + "grad_norm": 0.17217017965904075, + "learning_rate": 4.179576992196477e-05, + "loss": 2.808, + "step": 28862 + }, + { + "epoch": 1.7917313303122477, + "grad_norm": 0.1786274219753169, + "learning_rate": 4.179220736881347e-05, + "loss": 2.8579, + "step": 28863 + }, + { + "epoch": 1.7917934074120057, + "grad_norm": 0.17087910134054407, + "learning_rate": 4.1788644858484205e-05, + "loss": 2.7182, + "step": 28864 + }, + { + "epoch": 1.7918554845117636, + "grad_norm": 0.1439193065448499, + "learning_rate": 4.178508239099559e-05, + "loss": 2.7557, + "step": 28865 + }, + { + "epoch": 1.7919175616115215, + "grad_norm": 0.16681047781657962, + "learning_rate": 4.178151996636618e-05, + "loss": 2.807, + "step": 28866 + }, + { + "epoch": 1.7919796387112794, + "grad_norm": 0.16554321505545508, + "learning_rate": 4.1777957584614586e-05, + "loss": 2.7431, + "step": 28867 + }, + { + "epoch": 1.7920417158110373, + "grad_norm": 0.15622356310353172, + "learning_rate": 4.177439524575938e-05, + "loss": 2.9467, + "step": 28868 + }, + { + "epoch": 1.7921037929107952, + "grad_norm": 0.1493473839171041, + "learning_rate": 4.177083294981915e-05, + "loss": 2.7401, + "step": 28869 + }, + { + "epoch": 1.7921658700105532, + "grad_norm": 0.15876117639735493, + "learning_rate": 4.1767270696812474e-05, + "loss": 2.7685, + "step": 28870 + }, + { + "epoch": 1.792227947110311, + "grad_norm": 0.1446164756239303, + "learning_rate": 4.1763708486757956e-05, + "loss": 2.7601, + "step": 28871 + }, + { + "epoch": 1.792290024210069, + "grad_norm": 0.14292396120046214, + "learning_rate": 4.176014631967418e-05, + "loss": 2.7603, + "step": 28872 + }, + { + "epoch": 1.792352101309827, + "grad_norm": 0.14993505090022768, + "learning_rate": 4.175658419557969e-05, + "loss": 2.8359, + "step": 28873 + }, + { + "epoch": 1.7924141784095848, + "grad_norm": 0.1443048521128394, + "learning_rate": 4.175302211449314e-05, + "loss": 2.707, + "step": 28874 + }, + { + "epoch": 1.7924762555093428, + "grad_norm": 0.1674933359080147, + "learning_rate": 4.174946007643304e-05, + "loss": 2.7942, + "step": 28875 + }, + { + "epoch": 1.7925383326091004, + "grad_norm": 0.15759149103211703, + "learning_rate": 4.174589808141803e-05, + "loss": 2.7367, + "step": 28876 + }, + { + "epoch": 1.7926004097088584, + "grad_norm": 0.15263855970252416, + "learning_rate": 4.174233612946666e-05, + "loss": 2.8045, + "step": 28877 + }, + { + "epoch": 1.7926624868086163, + "grad_norm": 0.13769678020371443, + "learning_rate": 4.1738774220597544e-05, + "loss": 2.7509, + "step": 28878 + }, + { + "epoch": 1.7927245639083742, + "grad_norm": 0.17173882530227744, + "learning_rate": 4.1735212354829236e-05, + "loss": 2.8523, + "step": 28879 + }, + { + "epoch": 1.7927866410081321, + "grad_norm": 0.1482285233182298, + "learning_rate": 4.173165053218034e-05, + "loss": 2.7389, + "step": 28880 + }, + { + "epoch": 1.79284871810789, + "grad_norm": 0.1540855638735173, + "learning_rate": 4.172808875266942e-05, + "loss": 2.7877, + "step": 28881 + }, + { + "epoch": 1.7929107952076477, + "grad_norm": 0.15448945738722747, + "learning_rate": 4.172452701631508e-05, + "loss": 2.7599, + "step": 28882 + }, + { + "epoch": 1.7929728723074057, + "grad_norm": 0.15697503554555955, + "learning_rate": 4.1720965323135893e-05, + "loss": 2.7513, + "step": 28883 + }, + { + "epoch": 1.7930349494071636, + "grad_norm": 0.14727181186378818, + "learning_rate": 4.171740367315042e-05, + "loss": 2.8213, + "step": 28884 + }, + { + "epoch": 1.7930970265069215, + "grad_norm": 0.14703193685673197, + "learning_rate": 4.1713842066377285e-05, + "loss": 2.7597, + "step": 28885 + }, + { + "epoch": 1.7931591036066794, + "grad_norm": 0.16359162309205827, + "learning_rate": 4.171028050283503e-05, + "loss": 2.7385, + "step": 28886 + }, + { + "epoch": 1.7932211807064373, + "grad_norm": 0.14483717810897173, + "learning_rate": 4.170671898254227e-05, + "loss": 2.8032, + "step": 28887 + }, + { + "epoch": 1.7932832578061952, + "grad_norm": 0.14970499897054582, + "learning_rate": 4.170315750551756e-05, + "loss": 2.8415, + "step": 28888 + }, + { + "epoch": 1.7933453349059532, + "grad_norm": 0.14417750008369898, + "learning_rate": 4.169959607177948e-05, + "loss": 2.7889, + "step": 28889 + }, + { + "epoch": 1.793407412005711, + "grad_norm": 0.16026443804829205, + "learning_rate": 4.1696034681346637e-05, + "loss": 2.7435, + "step": 28890 + }, + { + "epoch": 1.793469489105469, + "grad_norm": 0.1487795690490817, + "learning_rate": 4.169247333423759e-05, + "loss": 2.8055, + "step": 28891 + }, + { + "epoch": 1.793531566205227, + "grad_norm": 0.14166282117991172, + "learning_rate": 4.168891203047094e-05, + "loss": 2.7803, + "step": 28892 + }, + { + "epoch": 1.7935936433049848, + "grad_norm": 0.14016733931577732, + "learning_rate": 4.1685350770065234e-05, + "loss": 2.8559, + "step": 28893 + }, + { + "epoch": 1.7936557204047427, + "grad_norm": 0.15394757364737624, + "learning_rate": 4.168178955303909e-05, + "loss": 2.7065, + "step": 28894 + }, + { + "epoch": 1.7937177975045007, + "grad_norm": 0.15288646325873395, + "learning_rate": 4.167822837941106e-05, + "loss": 2.7885, + "step": 28895 + }, + { + "epoch": 1.7937798746042586, + "grad_norm": 0.14025238128712775, + "learning_rate": 4.167466724919974e-05, + "loss": 2.7787, + "step": 28896 + }, + { + "epoch": 1.7938419517040165, + "grad_norm": 0.15079247674999358, + "learning_rate": 4.16711061624237e-05, + "loss": 2.7563, + "step": 28897 + }, + { + "epoch": 1.7939040288037744, + "grad_norm": 0.15169763488846394, + "learning_rate": 4.1667545119101516e-05, + "loss": 2.7689, + "step": 28898 + }, + { + "epoch": 1.7939661059035323, + "grad_norm": 0.14332326413450336, + "learning_rate": 4.166398411925178e-05, + "loss": 2.7428, + "step": 28899 + }, + { + "epoch": 1.79402818300329, + "grad_norm": 0.15339605319368183, + "learning_rate": 4.166042316289305e-05, + "loss": 2.7688, + "step": 28900 + }, + { + "epoch": 1.794090260103048, + "grad_norm": 0.14446602457897248, + "learning_rate": 4.165686225004393e-05, + "loss": 2.7155, + "step": 28901 + }, + { + "epoch": 1.7941523372028059, + "grad_norm": 0.15079008654657838, + "learning_rate": 4.165330138072297e-05, + "loss": 2.7453, + "step": 28902 + }, + { + "epoch": 1.7942144143025638, + "grad_norm": 0.14424018290408347, + "learning_rate": 4.164974055494878e-05, + "loss": 2.7785, + "step": 28903 + }, + { + "epoch": 1.7942764914023217, + "grad_norm": 0.16130308344831396, + "learning_rate": 4.164617977273991e-05, + "loss": 2.8055, + "step": 28904 + }, + { + "epoch": 1.7943385685020796, + "grad_norm": 0.13893957223260636, + "learning_rate": 4.164261903411495e-05, + "loss": 2.6868, + "step": 28905 + }, + { + "epoch": 1.7944006456018373, + "grad_norm": 0.176709426664298, + "learning_rate": 4.1639058339092474e-05, + "loss": 2.8569, + "step": 28906 + }, + { + "epoch": 1.7944627227015952, + "grad_norm": 0.1501598693340007, + "learning_rate": 4.1635497687691064e-05, + "loss": 2.7599, + "step": 28907 + }, + { + "epoch": 1.7945247998013532, + "grad_norm": 0.15615110247740713, + "learning_rate": 4.163193707992929e-05, + "loss": 2.7943, + "step": 28908 + }, + { + "epoch": 1.794586876901111, + "grad_norm": 0.14990380376545898, + "learning_rate": 4.1628376515825735e-05, + "loss": 2.7916, + "step": 28909 + }, + { + "epoch": 1.794648954000869, + "grad_norm": 0.15196746425096516, + "learning_rate": 4.162481599539898e-05, + "loss": 2.7655, + "step": 28910 + }, + { + "epoch": 1.794711031100627, + "grad_norm": 0.1488439428879639, + "learning_rate": 4.1621255518667565e-05, + "loss": 2.7272, + "step": 28911 + }, + { + "epoch": 1.7947731082003848, + "grad_norm": 0.15602234707000934, + "learning_rate": 4.161769508565012e-05, + "loss": 2.7219, + "step": 28912 + }, + { + "epoch": 1.7948351853001427, + "grad_norm": 0.1470721342219667, + "learning_rate": 4.1614134696365185e-05, + "loss": 2.8065, + "step": 28913 + }, + { + "epoch": 1.7948972623999007, + "grad_norm": 0.19060475054266596, + "learning_rate": 4.161057435083135e-05, + "loss": 2.7354, + "step": 28914 + }, + { + "epoch": 1.7949593394996586, + "grad_norm": 0.1506773086394089, + "learning_rate": 4.160701404906718e-05, + "loss": 2.7295, + "step": 28915 + }, + { + "epoch": 1.7950214165994165, + "grad_norm": 0.1383842117272427, + "learning_rate": 4.160345379109126e-05, + "loss": 2.7083, + "step": 28916 + }, + { + "epoch": 1.7950834936991744, + "grad_norm": 0.1445972575423252, + "learning_rate": 4.159989357692215e-05, + "loss": 2.7628, + "step": 28917 + }, + { + "epoch": 1.7951455707989323, + "grad_norm": 0.15200249647024397, + "learning_rate": 4.159633340657845e-05, + "loss": 2.8797, + "step": 28918 + }, + { + "epoch": 1.7952076478986903, + "grad_norm": 0.15116819406803755, + "learning_rate": 4.15927732800787e-05, + "loss": 2.7621, + "step": 28919 + }, + { + "epoch": 1.7952697249984482, + "grad_norm": 0.15601855660646488, + "learning_rate": 4.1589213197441503e-05, + "loss": 2.8503, + "step": 28920 + }, + { + "epoch": 1.795331802098206, + "grad_norm": 0.15155349463815312, + "learning_rate": 4.158565315868543e-05, + "loss": 2.8974, + "step": 28921 + }, + { + "epoch": 1.795393879197964, + "grad_norm": 0.14126255627330514, + "learning_rate": 4.158209316382903e-05, + "loss": 2.6805, + "step": 28922 + }, + { + "epoch": 1.795455956297722, + "grad_norm": 0.15880404069740361, + "learning_rate": 4.157853321289089e-05, + "loss": 2.8276, + "step": 28923 + }, + { + "epoch": 1.7955180333974796, + "grad_norm": 0.14634311222488916, + "learning_rate": 4.157497330588961e-05, + "loss": 2.7431, + "step": 28924 + }, + { + "epoch": 1.7955801104972375, + "grad_norm": 0.18135326995131268, + "learning_rate": 4.157141344284372e-05, + "loss": 2.8272, + "step": 28925 + }, + { + "epoch": 1.7956421875969955, + "grad_norm": 0.15147994978816884, + "learning_rate": 4.1567853623771824e-05, + "loss": 2.8167, + "step": 28926 + }, + { + "epoch": 1.7957042646967534, + "grad_norm": 0.14852570121813424, + "learning_rate": 4.156429384869247e-05, + "loss": 2.7646, + "step": 28927 + }, + { + "epoch": 1.7957663417965113, + "grad_norm": 0.15694859616366344, + "learning_rate": 4.156073411762427e-05, + "loss": 2.6797, + "step": 28928 + }, + { + "epoch": 1.7958284188962692, + "grad_norm": 0.13935700932314823, + "learning_rate": 4.155717443058574e-05, + "loss": 2.7322, + "step": 28929 + }, + { + "epoch": 1.795890495996027, + "grad_norm": 0.16965117212314193, + "learning_rate": 4.155361478759549e-05, + "loss": 2.765, + "step": 28930 + }, + { + "epoch": 1.7959525730957848, + "grad_norm": 0.15014309762996314, + "learning_rate": 4.155005518867209e-05, + "loss": 2.7196, + "step": 28931 + }, + { + "epoch": 1.7960146501955427, + "grad_norm": 0.19655931192732837, + "learning_rate": 4.1546495633834096e-05, + "loss": 2.8038, + "step": 28932 + }, + { + "epoch": 1.7960767272953007, + "grad_norm": 0.1552100262026878, + "learning_rate": 4.1542936123100095e-05, + "loss": 2.7761, + "step": 28933 + }, + { + "epoch": 1.7961388043950586, + "grad_norm": 0.14524590519838812, + "learning_rate": 4.1539376656488635e-05, + "loss": 2.8192, + "step": 28934 + }, + { + "epoch": 1.7962008814948165, + "grad_norm": 0.14804648673577842, + "learning_rate": 4.153581723401831e-05, + "loss": 2.7813, + "step": 28935 + }, + { + "epoch": 1.7962629585945744, + "grad_norm": 0.15415732797376855, + "learning_rate": 4.153225785570768e-05, + "loss": 2.8652, + "step": 28936 + }, + { + "epoch": 1.7963250356943323, + "grad_norm": 0.1476122414637784, + "learning_rate": 4.152869852157532e-05, + "loss": 2.7806, + "step": 28937 + }, + { + "epoch": 1.7963871127940902, + "grad_norm": 0.1493527592078617, + "learning_rate": 4.152513923163979e-05, + "loss": 2.7769, + "step": 28938 + }, + { + "epoch": 1.7964491898938482, + "grad_norm": 0.20269774471697605, + "learning_rate": 4.152157998591967e-05, + "loss": 2.7921, + "step": 28939 + }, + { + "epoch": 1.796511266993606, + "grad_norm": 0.16130753494286557, + "learning_rate": 4.151802078443351e-05, + "loss": 2.6853, + "step": 28940 + }, + { + "epoch": 1.796573344093364, + "grad_norm": 0.15660748119212065, + "learning_rate": 4.151446162719992e-05, + "loss": 2.7394, + "step": 28941 + }, + { + "epoch": 1.796635421193122, + "grad_norm": 0.1495885827080717, + "learning_rate": 4.1510902514237424e-05, + "loss": 2.804, + "step": 28942 + }, + { + "epoch": 1.7966974982928798, + "grad_norm": 0.15551876385055607, + "learning_rate": 4.1507343445564625e-05, + "loss": 2.7814, + "step": 28943 + }, + { + "epoch": 1.7967595753926378, + "grad_norm": 0.14551389924702066, + "learning_rate": 4.1503784421200066e-05, + "loss": 2.7182, + "step": 28944 + }, + { + "epoch": 1.7968216524923957, + "grad_norm": 0.1817992860792377, + "learning_rate": 4.150022544116233e-05, + "loss": 2.7972, + "step": 28945 + }, + { + "epoch": 1.7968837295921536, + "grad_norm": 0.166666510825283, + "learning_rate": 4.149666650546999e-05, + "loss": 2.814, + "step": 28946 + }, + { + "epoch": 1.7969458066919115, + "grad_norm": 0.14173720867313083, + "learning_rate": 4.1493107614141594e-05, + "loss": 2.7178, + "step": 28947 + }, + { + "epoch": 1.7970078837916692, + "grad_norm": 0.22425772464433527, + "learning_rate": 4.148954876719572e-05, + "loss": 2.8147, + "step": 28948 + }, + { + "epoch": 1.7970699608914271, + "grad_norm": 0.14304680413328255, + "learning_rate": 4.148598996465093e-05, + "loss": 2.7096, + "step": 28949 + }, + { + "epoch": 1.797132037991185, + "grad_norm": 0.23036808304640588, + "learning_rate": 4.1482431206525815e-05, + "loss": 2.8171, + "step": 28950 + }, + { + "epoch": 1.797194115090943, + "grad_norm": 0.19898771465181384, + "learning_rate": 4.147887249283891e-05, + "loss": 2.7757, + "step": 28951 + }, + { + "epoch": 1.7972561921907009, + "grad_norm": 0.19631945436444875, + "learning_rate": 4.147531382360881e-05, + "loss": 2.7879, + "step": 28952 + }, + { + "epoch": 1.7973182692904588, + "grad_norm": 0.17223811265090505, + "learning_rate": 4.147175519885405e-05, + "loss": 2.7582, + "step": 28953 + }, + { + "epoch": 1.7973803463902165, + "grad_norm": 0.16664053605296805, + "learning_rate": 4.146819661859323e-05, + "loss": 2.6379, + "step": 28954 + }, + { + "epoch": 1.7974424234899744, + "grad_norm": 0.15568292352763993, + "learning_rate": 4.146463808284488e-05, + "loss": 2.8537, + "step": 28955 + }, + { + "epoch": 1.7975045005897323, + "grad_norm": 0.1654016811079287, + "learning_rate": 4.146107959162758e-05, + "loss": 2.7872, + "step": 28956 + }, + { + "epoch": 1.7975665776894902, + "grad_norm": 0.15949978267758014, + "learning_rate": 4.145752114495993e-05, + "loss": 2.7359, + "step": 28957 + }, + { + "epoch": 1.7976286547892482, + "grad_norm": 0.15595649332181855, + "learning_rate": 4.1453962742860454e-05, + "loss": 2.7845, + "step": 28958 + }, + { + "epoch": 1.797690731889006, + "grad_norm": 0.2039231611232757, + "learning_rate": 4.145040438534772e-05, + "loss": 2.7566, + "step": 28959 + }, + { + "epoch": 1.797752808988764, + "grad_norm": 0.1533556794116267, + "learning_rate": 4.1446846072440314e-05, + "loss": 2.7819, + "step": 28960 + }, + { + "epoch": 1.797814886088522, + "grad_norm": 0.16590136915921544, + "learning_rate": 4.1443287804156784e-05, + "loss": 2.7862, + "step": 28961 + }, + { + "epoch": 1.7978769631882798, + "grad_norm": 0.15738495258837099, + "learning_rate": 4.14397295805157e-05, + "loss": 2.8465, + "step": 28962 + }, + { + "epoch": 1.7979390402880377, + "grad_norm": 0.1585584876128336, + "learning_rate": 4.143617140153562e-05, + "loss": 2.7574, + "step": 28963 + }, + { + "epoch": 1.7980011173877957, + "grad_norm": 0.15175692927472959, + "learning_rate": 4.143261326723512e-05, + "loss": 2.8021, + "step": 28964 + }, + { + "epoch": 1.7980631944875536, + "grad_norm": 0.16048109442598635, + "learning_rate": 4.1429055177632746e-05, + "loss": 2.704, + "step": 28965 + }, + { + "epoch": 1.7981252715873115, + "grad_norm": 0.17476863062989254, + "learning_rate": 4.142549713274709e-05, + "loss": 2.7574, + "step": 28966 + }, + { + "epoch": 1.7981873486870694, + "grad_norm": 0.14547266951790594, + "learning_rate": 4.1421939132596676e-05, + "loss": 2.7959, + "step": 28967 + }, + { + "epoch": 1.7982494257868273, + "grad_norm": 0.14727718702507822, + "learning_rate": 4.1418381177200103e-05, + "loss": 2.7616, + "step": 28968 + }, + { + "epoch": 1.7983115028865853, + "grad_norm": 0.145294128558974, + "learning_rate": 4.141482326657591e-05, + "loss": 2.7226, + "step": 28969 + }, + { + "epoch": 1.7983735799863432, + "grad_norm": 0.1466910430230829, + "learning_rate": 4.141126540074268e-05, + "loss": 2.8473, + "step": 28970 + }, + { + "epoch": 1.798435657086101, + "grad_norm": 0.17613233896590735, + "learning_rate": 4.140770757971896e-05, + "loss": 2.7404, + "step": 28971 + }, + { + "epoch": 1.7984977341858588, + "grad_norm": 0.1708395938385195, + "learning_rate": 4.1404149803523304e-05, + "loss": 2.7684, + "step": 28972 + }, + { + "epoch": 1.7985598112856167, + "grad_norm": 0.16111188810488175, + "learning_rate": 4.140059207217429e-05, + "loss": 2.8437, + "step": 28973 + }, + { + "epoch": 1.7986218883853746, + "grad_norm": 0.1470729131076064, + "learning_rate": 4.139703438569047e-05, + "loss": 2.8223, + "step": 28974 + }, + { + "epoch": 1.7986839654851325, + "grad_norm": 0.19434075852744848, + "learning_rate": 4.139347674409043e-05, + "loss": 2.6612, + "step": 28975 + }, + { + "epoch": 1.7987460425848905, + "grad_norm": 0.16680364394021174, + "learning_rate": 4.1389919147392684e-05, + "loss": 2.9001, + "step": 28976 + }, + { + "epoch": 1.7988081196846484, + "grad_norm": 0.1642305898163954, + "learning_rate": 4.138636159561584e-05, + "loss": 2.7435, + "step": 28977 + }, + { + "epoch": 1.798870196784406, + "grad_norm": 0.15349613726055025, + "learning_rate": 4.138280408877843e-05, + "loss": 2.7584, + "step": 28978 + }, + { + "epoch": 1.798932273884164, + "grad_norm": 0.18779836201572997, + "learning_rate": 4.137924662689903e-05, + "loss": 2.8088, + "step": 28979 + }, + { + "epoch": 1.798994350983922, + "grad_norm": 0.1518763438899232, + "learning_rate": 4.1375689209996186e-05, + "loss": 2.8007, + "step": 28980 + }, + { + "epoch": 1.7990564280836798, + "grad_norm": 0.15407429708962367, + "learning_rate": 4.137213183808847e-05, + "loss": 2.7789, + "step": 28981 + }, + { + "epoch": 1.7991185051834377, + "grad_norm": 0.1707047154682012, + "learning_rate": 4.136857451119444e-05, + "loss": 2.8422, + "step": 28982 + }, + { + "epoch": 1.7991805822831957, + "grad_norm": 0.14559203788887112, + "learning_rate": 4.136501722933263e-05, + "loss": 2.7618, + "step": 28983 + }, + { + "epoch": 1.7992426593829536, + "grad_norm": 0.1634645179586491, + "learning_rate": 4.1361459992521645e-05, + "loss": 2.8328, + "step": 28984 + }, + { + "epoch": 1.7993047364827115, + "grad_norm": 0.143078845788503, + "learning_rate": 4.135790280078001e-05, + "loss": 2.6983, + "step": 28985 + }, + { + "epoch": 1.7993668135824694, + "grad_norm": 0.15667348336314615, + "learning_rate": 4.1354345654126304e-05, + "loss": 2.8292, + "step": 28986 + }, + { + "epoch": 1.7994288906822273, + "grad_norm": 0.17130722745577345, + "learning_rate": 4.1350788552579064e-05, + "loss": 2.7617, + "step": 28987 + }, + { + "epoch": 1.7994909677819853, + "grad_norm": 0.14901299971277598, + "learning_rate": 4.134723149615685e-05, + "loss": 2.7719, + "step": 28988 + }, + { + "epoch": 1.7995530448817432, + "grad_norm": 0.14960326886975647, + "learning_rate": 4.134367448487824e-05, + "loss": 2.7481, + "step": 28989 + }, + { + "epoch": 1.799615121981501, + "grad_norm": 0.18331690469919662, + "learning_rate": 4.134011751876178e-05, + "loss": 2.7919, + "step": 28990 + }, + { + "epoch": 1.799677199081259, + "grad_norm": 0.15767043702041877, + "learning_rate": 4.133656059782604e-05, + "loss": 2.625, + "step": 28991 + }, + { + "epoch": 1.799739276181017, + "grad_norm": 0.14578412849389238, + "learning_rate": 4.133300372208956e-05, + "loss": 2.6766, + "step": 28992 + }, + { + "epoch": 1.7998013532807748, + "grad_norm": 0.14552195051493025, + "learning_rate": 4.1329446891570905e-05, + "loss": 2.7671, + "step": 28993 + }, + { + "epoch": 1.7998634303805328, + "grad_norm": 0.21574882255378774, + "learning_rate": 4.1325890106288624e-05, + "loss": 2.7665, + "step": 28994 + }, + { + "epoch": 1.7999255074802907, + "grad_norm": 0.20027847423585218, + "learning_rate": 4.132233336626129e-05, + "loss": 2.6888, + "step": 28995 + }, + { + "epoch": 1.7999875845800484, + "grad_norm": 0.1595899350394003, + "learning_rate": 4.1318776671507455e-05, + "loss": 2.7819, + "step": 28996 + }, + { + "epoch": 1.8000496616798063, + "grad_norm": 0.15143520918769304, + "learning_rate": 4.1315220022045654e-05, + "loss": 2.8212, + "step": 28997 + }, + { + "epoch": 1.8001117387795642, + "grad_norm": 0.15181338472253805, + "learning_rate": 4.131166341789447e-05, + "loss": 2.7417, + "step": 28998 + }, + { + "epoch": 1.8001738158793221, + "grad_norm": 0.1544231549921806, + "learning_rate": 4.130810685907243e-05, + "loss": 2.779, + "step": 28999 + }, + { + "epoch": 1.80023589297908, + "grad_norm": 0.16959042944538463, + "learning_rate": 4.1304550345598124e-05, + "loss": 2.8942, + "step": 29000 + }, + { + "epoch": 1.800297970078838, + "grad_norm": 0.14816456724000943, + "learning_rate": 4.130099387749008e-05, + "loss": 2.8734, + "step": 29001 + }, + { + "epoch": 1.8003600471785957, + "grad_norm": 0.14446018379175307, + "learning_rate": 4.1297437454766877e-05, + "loss": 2.7348, + "step": 29002 + }, + { + "epoch": 1.8004221242783536, + "grad_norm": 0.15520419370221278, + "learning_rate": 4.129388107744703e-05, + "loss": 2.7384, + "step": 29003 + }, + { + "epoch": 1.8004842013781115, + "grad_norm": 0.1665092534523876, + "learning_rate": 4.129032474554915e-05, + "loss": 2.7162, + "step": 29004 + }, + { + "epoch": 1.8005462784778694, + "grad_norm": 0.14887605681232446, + "learning_rate": 4.128676845909173e-05, + "loss": 2.8272, + "step": 29005 + }, + { + "epoch": 1.8006083555776273, + "grad_norm": 0.15423277235584884, + "learning_rate": 4.128321221809338e-05, + "loss": 2.7649, + "step": 29006 + }, + { + "epoch": 1.8006704326773852, + "grad_norm": 0.16425680960803773, + "learning_rate": 4.127965602257262e-05, + "loss": 2.8099, + "step": 29007 + }, + { + "epoch": 1.8007325097771432, + "grad_norm": 0.14530192921244464, + "learning_rate": 4.1276099872548e-05, + "loss": 2.8541, + "step": 29008 + }, + { + "epoch": 1.800794586876901, + "grad_norm": 0.1516069297951105, + "learning_rate": 4.12725437680381e-05, + "loss": 2.8054, + "step": 29009 + }, + { + "epoch": 1.800856663976659, + "grad_norm": 0.14004215013619575, + "learning_rate": 4.126898770906144e-05, + "loss": 2.7211, + "step": 29010 + }, + { + "epoch": 1.800918741076417, + "grad_norm": 0.15287817846364227, + "learning_rate": 4.12654316956366e-05, + "loss": 2.8634, + "step": 29011 + }, + { + "epoch": 1.8009808181761748, + "grad_norm": 0.14769282644863763, + "learning_rate": 4.126187572778212e-05, + "loss": 2.8166, + "step": 29012 + }, + { + "epoch": 1.8010428952759328, + "grad_norm": 0.144522531349529, + "learning_rate": 4.125831980551657e-05, + "loss": 2.7916, + "step": 29013 + }, + { + "epoch": 1.8011049723756907, + "grad_norm": 0.1512652576943686, + "learning_rate": 4.125476392885847e-05, + "loss": 2.7782, + "step": 29014 + }, + { + "epoch": 1.8011670494754486, + "grad_norm": 0.15173560806101793, + "learning_rate": 4.12512080978264e-05, + "loss": 2.7551, + "step": 29015 + }, + { + "epoch": 1.8012291265752065, + "grad_norm": 0.1403628793894921, + "learning_rate": 4.1247652312438894e-05, + "loss": 2.7159, + "step": 29016 + }, + { + "epoch": 1.8012912036749644, + "grad_norm": 0.15546672115248125, + "learning_rate": 4.124409657271452e-05, + "loss": 2.8034, + "step": 29017 + }, + { + "epoch": 1.8013532807747223, + "grad_norm": 0.176740872114037, + "learning_rate": 4.124054087867182e-05, + "loss": 2.8219, + "step": 29018 + }, + { + "epoch": 1.8014153578744803, + "grad_norm": 0.1791717965663207, + "learning_rate": 4.123698523032934e-05, + "loss": 2.7563, + "step": 29019 + }, + { + "epoch": 1.801477434974238, + "grad_norm": 0.1470987406455992, + "learning_rate": 4.1233429627705614e-05, + "loss": 2.724, + "step": 29020 + }, + { + "epoch": 1.8015395120739959, + "grad_norm": 0.1384710953767157, + "learning_rate": 4.122987407081924e-05, + "loss": 2.743, + "step": 29021 + }, + { + "epoch": 1.8016015891737538, + "grad_norm": 0.14895605811357734, + "learning_rate": 4.122631855968873e-05, + "loss": 2.7751, + "step": 29022 + }, + { + "epoch": 1.8016636662735117, + "grad_norm": 0.14406575168241537, + "learning_rate": 4.1222763094332657e-05, + "loss": 2.7408, + "step": 29023 + }, + { + "epoch": 1.8017257433732696, + "grad_norm": 0.14804850605088832, + "learning_rate": 4.1219207674769545e-05, + "loss": 2.8066, + "step": 29024 + }, + { + "epoch": 1.8017878204730275, + "grad_norm": 0.14167220885524706, + "learning_rate": 4.1215652301017974e-05, + "loss": 2.8026, + "step": 29025 + }, + { + "epoch": 1.8018498975727852, + "grad_norm": 0.14450957805946857, + "learning_rate": 4.121209697309646e-05, + "loss": 2.7726, + "step": 29026 + }, + { + "epoch": 1.8019119746725432, + "grad_norm": 0.15576731513322614, + "learning_rate": 4.1208541691023596e-05, + "loss": 2.7684, + "step": 29027 + }, + { + "epoch": 1.801974051772301, + "grad_norm": 0.1433292747101291, + "learning_rate": 4.1204986454817876e-05, + "loss": 2.7561, + "step": 29028 + }, + { + "epoch": 1.802036128872059, + "grad_norm": 0.15642443695043234, + "learning_rate": 4.1201431264497896e-05, + "loss": 2.7558, + "step": 29029 + }, + { + "epoch": 1.802098205971817, + "grad_norm": 0.13641582034862568, + "learning_rate": 4.1197876120082175e-05, + "loss": 2.8127, + "step": 29030 + }, + { + "epoch": 1.8021602830715748, + "grad_norm": 0.14741150867583636, + "learning_rate": 4.1194321021589274e-05, + "loss": 2.7985, + "step": 29031 + }, + { + "epoch": 1.8022223601713327, + "grad_norm": 0.1522942915130123, + "learning_rate": 4.1190765969037746e-05, + "loss": 2.7654, + "step": 29032 + }, + { + "epoch": 1.8022844372710907, + "grad_norm": 0.14176360756788128, + "learning_rate": 4.118721096244612e-05, + "loss": 2.7042, + "step": 29033 + }, + { + "epoch": 1.8023465143708486, + "grad_norm": 0.14275674865915808, + "learning_rate": 4.118365600183296e-05, + "loss": 2.7579, + "step": 29034 + }, + { + "epoch": 1.8024085914706065, + "grad_norm": 0.17395570445120637, + "learning_rate": 4.1180101087216806e-05, + "loss": 2.7728, + "step": 29035 + }, + { + "epoch": 1.8024706685703644, + "grad_norm": 0.1401020964276918, + "learning_rate": 4.117654621861621e-05, + "loss": 2.7547, + "step": 29036 + }, + { + "epoch": 1.8025327456701223, + "grad_norm": 0.1455618418801099, + "learning_rate": 4.11729913960497e-05, + "loss": 2.7969, + "step": 29037 + }, + { + "epoch": 1.8025948227698803, + "grad_norm": 0.16263817520011706, + "learning_rate": 4.1169436619535853e-05, + "loss": 2.7933, + "step": 29038 + }, + { + "epoch": 1.8026568998696382, + "grad_norm": 0.15370513221606663, + "learning_rate": 4.1165881889093174e-05, + "loss": 2.7958, + "step": 29039 + }, + { + "epoch": 1.802718976969396, + "grad_norm": 0.14681455910685234, + "learning_rate": 4.116232720474026e-05, + "loss": 2.8052, + "step": 29040 + }, + { + "epoch": 1.802781054069154, + "grad_norm": 0.14354287312996736, + "learning_rate": 4.115877256649562e-05, + "loss": 2.7366, + "step": 29041 + }, + { + "epoch": 1.802843131168912, + "grad_norm": 0.17546145701687102, + "learning_rate": 4.1155217974377814e-05, + "loss": 2.7947, + "step": 29042 + }, + { + "epoch": 1.8029052082686698, + "grad_norm": 0.14123931457182048, + "learning_rate": 4.1151663428405385e-05, + "loss": 2.6416, + "step": 29043 + }, + { + "epoch": 1.8029672853684275, + "grad_norm": 0.16219718117751636, + "learning_rate": 4.114810892859686e-05, + "loss": 2.6667, + "step": 29044 + }, + { + "epoch": 1.8030293624681855, + "grad_norm": 0.13526110165182278, + "learning_rate": 4.114455447497081e-05, + "loss": 2.7883, + "step": 29045 + }, + { + "epoch": 1.8030914395679434, + "grad_norm": 0.13458965395054756, + "learning_rate": 4.114100006754577e-05, + "loss": 2.7021, + "step": 29046 + }, + { + "epoch": 1.8031535166677013, + "grad_norm": 0.14822767510828688, + "learning_rate": 4.113744570634029e-05, + "loss": 2.7462, + "step": 29047 + }, + { + "epoch": 1.8032155937674592, + "grad_norm": 0.1738399705172564, + "learning_rate": 4.113389139137289e-05, + "loss": 2.8335, + "step": 29048 + }, + { + "epoch": 1.8032776708672171, + "grad_norm": 0.145229111094344, + "learning_rate": 4.1130337122662144e-05, + "loss": 2.7944, + "step": 29049 + }, + { + "epoch": 1.8033397479669748, + "grad_norm": 0.17127562179811703, + "learning_rate": 4.1126782900226574e-05, + "loss": 2.734, + "step": 29050 + }, + { + "epoch": 1.8034018250667327, + "grad_norm": 0.16935000157718078, + "learning_rate": 4.112322872408474e-05, + "loss": 2.7925, + "step": 29051 + }, + { + "epoch": 1.8034639021664907, + "grad_norm": 0.16884906320333207, + "learning_rate": 4.111967459425517e-05, + "loss": 2.7446, + "step": 29052 + }, + { + "epoch": 1.8035259792662486, + "grad_norm": 0.15572255039725827, + "learning_rate": 4.11161205107564e-05, + "loss": 2.7627, + "step": 29053 + }, + { + "epoch": 1.8035880563660065, + "grad_norm": 0.14557555241208914, + "learning_rate": 4.1112566473607004e-05, + "loss": 2.7458, + "step": 29054 + }, + { + "epoch": 1.8036501334657644, + "grad_norm": 0.13633998447633144, + "learning_rate": 4.1109012482825506e-05, + "loss": 2.6382, + "step": 29055 + }, + { + "epoch": 1.8037122105655223, + "grad_norm": 0.14727056606310923, + "learning_rate": 4.1105458538430445e-05, + "loss": 2.6729, + "step": 29056 + }, + { + "epoch": 1.8037742876652803, + "grad_norm": 0.14214198804136605, + "learning_rate": 4.1101904640440376e-05, + "loss": 2.7075, + "step": 29057 + }, + { + "epoch": 1.8038363647650382, + "grad_norm": 0.16082289397396443, + "learning_rate": 4.109835078887382e-05, + "loss": 2.7885, + "step": 29058 + }, + { + "epoch": 1.803898441864796, + "grad_norm": 0.14554802767695935, + "learning_rate": 4.109479698374933e-05, + "loss": 2.856, + "step": 29059 + }, + { + "epoch": 1.803960518964554, + "grad_norm": 0.13876688080202132, + "learning_rate": 4.109124322508545e-05, + "loss": 2.7825, + "step": 29060 + }, + { + "epoch": 1.804022596064312, + "grad_norm": 0.16165411459129808, + "learning_rate": 4.1087689512900716e-05, + "loss": 2.7903, + "step": 29061 + }, + { + "epoch": 1.8040846731640698, + "grad_norm": 0.16325751433267915, + "learning_rate": 4.108413584721366e-05, + "loss": 2.7747, + "step": 29062 + }, + { + "epoch": 1.8041467502638278, + "grad_norm": 0.1557884014079839, + "learning_rate": 4.1080582228042843e-05, + "loss": 2.8113, + "step": 29063 + }, + { + "epoch": 1.8042088273635857, + "grad_norm": 0.1434995584314976, + "learning_rate": 4.107702865540678e-05, + "loss": 2.787, + "step": 29064 + }, + { + "epoch": 1.8042709044633436, + "grad_norm": 0.15504549197212802, + "learning_rate": 4.1073475129324046e-05, + "loss": 2.7142, + "step": 29065 + }, + { + "epoch": 1.8043329815631015, + "grad_norm": 0.14461767344928692, + "learning_rate": 4.1069921649813145e-05, + "loss": 2.7888, + "step": 29066 + }, + { + "epoch": 1.8043950586628594, + "grad_norm": 0.14428274596663432, + "learning_rate": 4.1066368216892635e-05, + "loss": 2.7494, + "step": 29067 + }, + { + "epoch": 1.8044571357626171, + "grad_norm": 0.13895935830544184, + "learning_rate": 4.1062814830581054e-05, + "loss": 2.8377, + "step": 29068 + }, + { + "epoch": 1.804519212862375, + "grad_norm": 0.15891607373272354, + "learning_rate": 4.105926149089693e-05, + "loss": 2.721, + "step": 29069 + }, + { + "epoch": 1.804581289962133, + "grad_norm": 0.14149336792092926, + "learning_rate": 4.105570819785881e-05, + "loss": 2.8087, + "step": 29070 + }, + { + "epoch": 1.8046433670618909, + "grad_norm": 0.14053595558272333, + "learning_rate": 4.1052154951485236e-05, + "loss": 2.6855, + "step": 29071 + }, + { + "epoch": 1.8047054441616488, + "grad_norm": 0.15727059121019984, + "learning_rate": 4.104860175179474e-05, + "loss": 2.7004, + "step": 29072 + }, + { + "epoch": 1.8047675212614067, + "grad_norm": 0.13923639584892988, + "learning_rate": 4.104504859880586e-05, + "loss": 2.7175, + "step": 29073 + }, + { + "epoch": 1.8048295983611644, + "grad_norm": 0.15814988688685894, + "learning_rate": 4.104149549253714e-05, + "loss": 2.7177, + "step": 29074 + }, + { + "epoch": 1.8048916754609223, + "grad_norm": 0.15367249416465123, + "learning_rate": 4.10379424330071e-05, + "loss": 2.6458, + "step": 29075 + }, + { + "epoch": 1.8049537525606802, + "grad_norm": 0.1431023419136072, + "learning_rate": 4.103438942023431e-05, + "loss": 2.7713, + "step": 29076 + }, + { + "epoch": 1.8050158296604382, + "grad_norm": 0.13597313013870643, + "learning_rate": 4.1030836454237266e-05, + "loss": 2.6941, + "step": 29077 + }, + { + "epoch": 1.805077906760196, + "grad_norm": 0.15064016298488914, + "learning_rate": 4.102728353503454e-05, + "loss": 2.7494, + "step": 29078 + }, + { + "epoch": 1.805139983859954, + "grad_norm": 0.1542905009437977, + "learning_rate": 4.1023730662644656e-05, + "loss": 2.8564, + "step": 29079 + }, + { + "epoch": 1.805202060959712, + "grad_norm": 0.14500612139112656, + "learning_rate": 4.102017783708614e-05, + "loss": 2.794, + "step": 29080 + }, + { + "epoch": 1.8052641380594698, + "grad_norm": 0.14874556847390447, + "learning_rate": 4.1016625058377536e-05, + "loss": 2.7197, + "step": 29081 + }, + { + "epoch": 1.8053262151592278, + "grad_norm": 0.1509405275696735, + "learning_rate": 4.101307232653738e-05, + "loss": 2.7772, + "step": 29082 + }, + { + "epoch": 1.8053882922589857, + "grad_norm": 0.15314656636117113, + "learning_rate": 4.100951964158422e-05, + "loss": 2.746, + "step": 29083 + }, + { + "epoch": 1.8054503693587436, + "grad_norm": 0.15159402892952673, + "learning_rate": 4.1005967003536555e-05, + "loss": 2.8329, + "step": 29084 + }, + { + "epoch": 1.8055124464585015, + "grad_norm": 0.1401533788155167, + "learning_rate": 4.100241441241296e-05, + "loss": 2.7037, + "step": 29085 + }, + { + "epoch": 1.8055745235582594, + "grad_norm": 0.1437279764698792, + "learning_rate": 4.099886186823193e-05, + "loss": 2.8023, + "step": 29086 + }, + { + "epoch": 1.8056366006580173, + "grad_norm": 0.1483577463824979, + "learning_rate": 4.099530937101204e-05, + "loss": 2.7925, + "step": 29087 + }, + { + "epoch": 1.8056986777577753, + "grad_norm": 0.1365636286154229, + "learning_rate": 4.099175692077181e-05, + "loss": 2.7536, + "step": 29088 + }, + { + "epoch": 1.8057607548575332, + "grad_norm": 0.1469977601368875, + "learning_rate": 4.098820451752977e-05, + "loss": 2.719, + "step": 29089 + }, + { + "epoch": 1.805822831957291, + "grad_norm": 0.16707659701750383, + "learning_rate": 4.098465216130446e-05, + "loss": 2.8036, + "step": 29090 + }, + { + "epoch": 1.805884909057049, + "grad_norm": 0.14803450238305166, + "learning_rate": 4.0981099852114396e-05, + "loss": 2.7362, + "step": 29091 + }, + { + "epoch": 1.8059469861568067, + "grad_norm": 0.14156876545950148, + "learning_rate": 4.097754758997814e-05, + "loss": 2.7889, + "step": 29092 + }, + { + "epoch": 1.8060090632565646, + "grad_norm": 0.15254645433478284, + "learning_rate": 4.0973995374914195e-05, + "loss": 2.7235, + "step": 29093 + }, + { + "epoch": 1.8060711403563225, + "grad_norm": 0.15009168539785595, + "learning_rate": 4.0970443206941114e-05, + "loss": 2.8279, + "step": 29094 + }, + { + "epoch": 1.8061332174560805, + "grad_norm": 0.16251501211185232, + "learning_rate": 4.096689108607742e-05, + "loss": 2.7436, + "step": 29095 + }, + { + "epoch": 1.8061952945558384, + "grad_norm": 0.13794720383195663, + "learning_rate": 4.0963339012341644e-05, + "loss": 2.7583, + "step": 29096 + }, + { + "epoch": 1.8062573716555963, + "grad_norm": 0.16349059287402912, + "learning_rate": 4.095978698575233e-05, + "loss": 2.7273, + "step": 29097 + }, + { + "epoch": 1.806319448755354, + "grad_norm": 0.16690124578568127, + "learning_rate": 4.0956235006327995e-05, + "loss": 2.7845, + "step": 29098 + }, + { + "epoch": 1.806381525855112, + "grad_norm": 0.15467878952933303, + "learning_rate": 4.0952683074087185e-05, + "loss": 2.8405, + "step": 29099 + }, + { + "epoch": 1.8064436029548698, + "grad_norm": 0.16522969291131684, + "learning_rate": 4.094913118904841e-05, + "loss": 2.7447, + "step": 29100 + }, + { + "epoch": 1.8065056800546277, + "grad_norm": 0.14605859547702507, + "learning_rate": 4.0945579351230234e-05, + "loss": 2.7892, + "step": 29101 + }, + { + "epoch": 1.8065677571543857, + "grad_norm": 0.17335141640299873, + "learning_rate": 4.094202756065115e-05, + "loss": 2.7602, + "step": 29102 + }, + { + "epoch": 1.8066298342541436, + "grad_norm": 0.15418233724694233, + "learning_rate": 4.0938475817329724e-05, + "loss": 2.8298, + "step": 29103 + }, + { + "epoch": 1.8066919113539015, + "grad_norm": 0.18858336352908434, + "learning_rate": 4.0934924121284465e-05, + "loss": 2.6912, + "step": 29104 + }, + { + "epoch": 1.8067539884536594, + "grad_norm": 0.14276777352762854, + "learning_rate": 4.0931372472533893e-05, + "loss": 2.6506, + "step": 29105 + }, + { + "epoch": 1.8068160655534173, + "grad_norm": 0.15337922853401742, + "learning_rate": 4.092782087109657e-05, + "loss": 2.7908, + "step": 29106 + }, + { + "epoch": 1.8068781426531753, + "grad_norm": 0.15504258466830004, + "learning_rate": 4.0924269316991e-05, + "loss": 2.6771, + "step": 29107 + }, + { + "epoch": 1.8069402197529332, + "grad_norm": 0.20138409910772562, + "learning_rate": 4.092071781023572e-05, + "loss": 2.769, + "step": 29108 + }, + { + "epoch": 1.807002296852691, + "grad_norm": 0.1578583787571186, + "learning_rate": 4.0917166350849255e-05, + "loss": 2.8519, + "step": 29109 + }, + { + "epoch": 1.807064373952449, + "grad_norm": 0.16790589553806953, + "learning_rate": 4.091361493885015e-05, + "loss": 2.8501, + "step": 29110 + }, + { + "epoch": 1.807126451052207, + "grad_norm": 0.16199537658127924, + "learning_rate": 4.091006357425691e-05, + "loss": 2.6841, + "step": 29111 + }, + { + "epoch": 1.8071885281519648, + "grad_norm": 0.15328750137779223, + "learning_rate": 4.0906512257088084e-05, + "loss": 2.7766, + "step": 29112 + }, + { + "epoch": 1.8072506052517228, + "grad_norm": 0.18384877206886646, + "learning_rate": 4.0902960987362185e-05, + "loss": 2.7467, + "step": 29113 + }, + { + "epoch": 1.8073126823514807, + "grad_norm": 0.15238569366930912, + "learning_rate": 4.089940976509775e-05, + "loss": 2.7682, + "step": 29114 + }, + { + "epoch": 1.8073747594512386, + "grad_norm": 0.16158023329243662, + "learning_rate": 4.0895858590313295e-05, + "loss": 2.8034, + "step": 29115 + }, + { + "epoch": 1.8074368365509963, + "grad_norm": 0.15375285273905984, + "learning_rate": 4.0892307463027376e-05, + "loss": 2.6728, + "step": 29116 + }, + { + "epoch": 1.8074989136507542, + "grad_norm": 0.14193856487437428, + "learning_rate": 4.088875638325849e-05, + "loss": 2.7615, + "step": 29117 + }, + { + "epoch": 1.8075609907505121, + "grad_norm": 0.22920055120834174, + "learning_rate": 4.0885205351025164e-05, + "loss": 2.7962, + "step": 29118 + }, + { + "epoch": 1.80762306785027, + "grad_norm": 0.15362634912430767, + "learning_rate": 4.088165436634592e-05, + "loss": 2.8113, + "step": 29119 + }, + { + "epoch": 1.807685144950028, + "grad_norm": 0.18579207832343292, + "learning_rate": 4.0878103429239325e-05, + "loss": 2.771, + "step": 29120 + }, + { + "epoch": 1.8077472220497859, + "grad_norm": 0.18131342509854576, + "learning_rate": 4.087455253972388e-05, + "loss": 2.7463, + "step": 29121 + }, + { + "epoch": 1.8078092991495436, + "grad_norm": 0.1765916633573809, + "learning_rate": 4.0871001697818104e-05, + "loss": 2.7816, + "step": 29122 + }, + { + "epoch": 1.8078713762493015, + "grad_norm": 0.14373058778882555, + "learning_rate": 4.086745090354052e-05, + "loss": 2.7704, + "step": 29123 + }, + { + "epoch": 1.8079334533490594, + "grad_norm": 0.15886709025192192, + "learning_rate": 4.0863900156909676e-05, + "loss": 2.7395, + "step": 29124 + }, + { + "epoch": 1.8079955304488173, + "grad_norm": 0.13764120621590437, + "learning_rate": 4.0860349457944067e-05, + "loss": 2.6169, + "step": 29125 + }, + { + "epoch": 1.8080576075485753, + "grad_norm": 0.15528675238183617, + "learning_rate": 4.085679880666225e-05, + "loss": 2.7495, + "step": 29126 + }, + { + "epoch": 1.8081196846483332, + "grad_norm": 0.15015005091951494, + "learning_rate": 4.0853248203082724e-05, + "loss": 2.6844, + "step": 29127 + }, + { + "epoch": 1.808181761748091, + "grad_norm": 0.14408818198064405, + "learning_rate": 4.084969764722403e-05, + "loss": 2.8294, + "step": 29128 + }, + { + "epoch": 1.808243838847849, + "grad_norm": 0.14882266117188916, + "learning_rate": 4.084614713910468e-05, + "loss": 2.8321, + "step": 29129 + }, + { + "epoch": 1.808305915947607, + "grad_norm": 0.14747423149966926, + "learning_rate": 4.08425966787432e-05, + "loss": 2.7423, + "step": 29130 + }, + { + "epoch": 1.8083679930473648, + "grad_norm": 0.1578516057162378, + "learning_rate": 4.083904626615812e-05, + "loss": 2.8353, + "step": 29131 + }, + { + "epoch": 1.8084300701471228, + "grad_norm": 0.14470508791926026, + "learning_rate": 4.0835495901367955e-05, + "loss": 2.6829, + "step": 29132 + }, + { + "epoch": 1.8084921472468807, + "grad_norm": 0.154905669412685, + "learning_rate": 4.083194558439124e-05, + "loss": 2.7935, + "step": 29133 + }, + { + "epoch": 1.8085542243466386, + "grad_norm": 0.1612764816200156, + "learning_rate": 4.0828395315246476e-05, + "loss": 2.8311, + "step": 29134 + }, + { + "epoch": 1.8086163014463965, + "grad_norm": 0.14927534133744014, + "learning_rate": 4.0824845093952216e-05, + "loss": 2.7815, + "step": 29135 + }, + { + "epoch": 1.8086783785461544, + "grad_norm": 0.14303802755590747, + "learning_rate": 4.082129492052695e-05, + "loss": 2.7246, + "step": 29136 + }, + { + "epoch": 1.8087404556459123, + "grad_norm": 0.14813319812292114, + "learning_rate": 4.081774479498924e-05, + "loss": 2.7687, + "step": 29137 + }, + { + "epoch": 1.8088025327456703, + "grad_norm": 0.14675532396007382, + "learning_rate": 4.081419471735756e-05, + "loss": 2.8134, + "step": 29138 + }, + { + "epoch": 1.8088646098454282, + "grad_norm": 0.15783645972048574, + "learning_rate": 4.081064468765048e-05, + "loss": 2.8351, + "step": 29139 + }, + { + "epoch": 1.8089266869451859, + "grad_norm": 0.1401797637224907, + "learning_rate": 4.0807094705886475e-05, + "loss": 2.6753, + "step": 29140 + }, + { + "epoch": 1.8089887640449438, + "grad_norm": 0.14548352698102618, + "learning_rate": 4.08035447720841e-05, + "loss": 2.6838, + "step": 29141 + }, + { + "epoch": 1.8090508411447017, + "grad_norm": 0.1435776322381571, + "learning_rate": 4.0799994886261875e-05, + "loss": 2.7719, + "step": 29142 + }, + { + "epoch": 1.8091129182444596, + "grad_norm": 0.1477020011139982, + "learning_rate": 4.0796445048438295e-05, + "loss": 2.8062, + "step": 29143 + }, + { + "epoch": 1.8091749953442176, + "grad_norm": 0.15835970911970304, + "learning_rate": 4.0792895258631895e-05, + "loss": 2.6982, + "step": 29144 + }, + { + "epoch": 1.8092370724439755, + "grad_norm": 0.1697115802627162, + "learning_rate": 4.07893455168612e-05, + "loss": 2.7643, + "step": 29145 + }, + { + "epoch": 1.8092991495437332, + "grad_norm": 0.15094993672361687, + "learning_rate": 4.0785795823144726e-05, + "loss": 2.7674, + "step": 29146 + }, + { + "epoch": 1.809361226643491, + "grad_norm": 0.1657261793563486, + "learning_rate": 4.078224617750098e-05, + "loss": 2.8267, + "step": 29147 + }, + { + "epoch": 1.809423303743249, + "grad_norm": 0.14344125924555162, + "learning_rate": 4.0778696579948516e-05, + "loss": 2.798, + "step": 29148 + }, + { + "epoch": 1.809485380843007, + "grad_norm": 0.21348028153568008, + "learning_rate": 4.0775147030505814e-05, + "loss": 2.7192, + "step": 29149 + }, + { + "epoch": 1.8095474579427648, + "grad_norm": 0.1804972832487184, + "learning_rate": 4.0771597529191416e-05, + "loss": 2.751, + "step": 29150 + }, + { + "epoch": 1.8096095350425228, + "grad_norm": 0.1728474083094142, + "learning_rate": 4.0768048076023814e-05, + "loss": 2.8536, + "step": 29151 + }, + { + "epoch": 1.8096716121422807, + "grad_norm": 0.18419758941282485, + "learning_rate": 4.0764498671021554e-05, + "loss": 2.7643, + "step": 29152 + }, + { + "epoch": 1.8097336892420386, + "grad_norm": 0.16055052189269908, + "learning_rate": 4.076094931420317e-05, + "loss": 2.7777, + "step": 29153 + }, + { + "epoch": 1.8097957663417965, + "grad_norm": 0.14612175882071715, + "learning_rate": 4.075740000558714e-05, + "loss": 2.7877, + "step": 29154 + }, + { + "epoch": 1.8098578434415544, + "grad_norm": 0.15708153118947268, + "learning_rate": 4.0753850745192e-05, + "loss": 2.7503, + "step": 29155 + }, + { + "epoch": 1.8099199205413123, + "grad_norm": 0.15317522461157, + "learning_rate": 4.075030153303627e-05, + "loss": 2.8086, + "step": 29156 + }, + { + "epoch": 1.8099819976410703, + "grad_norm": 0.1397141522065042, + "learning_rate": 4.074675236913845e-05, + "loss": 2.7447, + "step": 29157 + }, + { + "epoch": 1.8100440747408282, + "grad_norm": 0.22224957989802097, + "learning_rate": 4.0743203253517084e-05, + "loss": 2.877, + "step": 29158 + }, + { + "epoch": 1.810106151840586, + "grad_norm": 0.15199391839799908, + "learning_rate": 4.073965418619066e-05, + "loss": 2.6919, + "step": 29159 + }, + { + "epoch": 1.810168228940344, + "grad_norm": 0.15566046482078477, + "learning_rate": 4.0736105167177725e-05, + "loss": 2.742, + "step": 29160 + }, + { + "epoch": 1.810230306040102, + "grad_norm": 0.15564640404494437, + "learning_rate": 4.073255619649676e-05, + "loss": 2.9385, + "step": 29161 + }, + { + "epoch": 1.8102923831398599, + "grad_norm": 0.1532632150451803, + "learning_rate": 4.072900727416631e-05, + "loss": 2.7891, + "step": 29162 + }, + { + "epoch": 1.8103544602396178, + "grad_norm": 0.14521266779138342, + "learning_rate": 4.0725458400204873e-05, + "loss": 2.7591, + "step": 29163 + }, + { + "epoch": 1.8104165373393755, + "grad_norm": 0.1666529651066324, + "learning_rate": 4.072190957463098e-05, + "loss": 2.7111, + "step": 29164 + }, + { + "epoch": 1.8104786144391334, + "grad_norm": 0.16401005088213771, + "learning_rate": 4.0718360797463126e-05, + "loss": 2.8048, + "step": 29165 + }, + { + "epoch": 1.8105406915388913, + "grad_norm": 0.1415637130137972, + "learning_rate": 4.0714812068719856e-05, + "loss": 2.7093, + "step": 29166 + }, + { + "epoch": 1.8106027686386492, + "grad_norm": 0.14431941772821238, + "learning_rate": 4.071126338841965e-05, + "loss": 2.7508, + "step": 29167 + }, + { + "epoch": 1.8106648457384071, + "grad_norm": 0.1445949388343444, + "learning_rate": 4.070771475658104e-05, + "loss": 2.726, + "step": 29168 + }, + { + "epoch": 1.810726922838165, + "grad_norm": 0.14929803694108695, + "learning_rate": 4.070416617322254e-05, + "loss": 2.9043, + "step": 29169 + }, + { + "epoch": 1.8107889999379227, + "grad_norm": 0.14999652600239147, + "learning_rate": 4.0700617638362655e-05, + "loss": 2.7965, + "step": 29170 + }, + { + "epoch": 1.8108510770376807, + "grad_norm": 0.1686911628282369, + "learning_rate": 4.069706915201992e-05, + "loss": 2.7423, + "step": 29171 + }, + { + "epoch": 1.8109131541374386, + "grad_norm": 0.13743646790367162, + "learning_rate": 4.0693520714212815e-05, + "loss": 2.7366, + "step": 29172 + }, + { + "epoch": 1.8109752312371965, + "grad_norm": 0.15406297507609382, + "learning_rate": 4.068997232495988e-05, + "loss": 2.7083, + "step": 29173 + }, + { + "epoch": 1.8110373083369544, + "grad_norm": 0.1476701931702766, + "learning_rate": 4.068642398427962e-05, + "loss": 2.7669, + "step": 29174 + }, + { + "epoch": 1.8110993854367123, + "grad_norm": 0.14726594955246675, + "learning_rate": 4.0682875692190555e-05, + "loss": 2.7811, + "step": 29175 + }, + { + "epoch": 1.8111614625364703, + "grad_norm": 0.14234653371775152, + "learning_rate": 4.067932744871117e-05, + "loss": 2.6767, + "step": 29176 + }, + { + "epoch": 1.8112235396362282, + "grad_norm": 0.16003327183935145, + "learning_rate": 4.067577925386001e-05, + "loss": 2.7845, + "step": 29177 + }, + { + "epoch": 1.811285616735986, + "grad_norm": 0.1720766217896871, + "learning_rate": 4.067223110765558e-05, + "loss": 2.8285, + "step": 29178 + }, + { + "epoch": 1.811347693835744, + "grad_norm": 0.14139653852213943, + "learning_rate": 4.066868301011637e-05, + "loss": 2.8226, + "step": 29179 + }, + { + "epoch": 1.811409770935502, + "grad_norm": 0.1480035839347629, + "learning_rate": 4.066513496126092e-05, + "loss": 2.7126, + "step": 29180 + }, + { + "epoch": 1.8114718480352598, + "grad_norm": 0.1698457284196244, + "learning_rate": 4.066158696110771e-05, + "loss": 2.7645, + "step": 29181 + }, + { + "epoch": 1.8115339251350178, + "grad_norm": 0.1467795576790281, + "learning_rate": 4.065803900967529e-05, + "loss": 2.7754, + "step": 29182 + }, + { + "epoch": 1.8115960022347757, + "grad_norm": 0.1449617212840226, + "learning_rate": 4.065449110698212e-05, + "loss": 2.7601, + "step": 29183 + }, + { + "epoch": 1.8116580793345336, + "grad_norm": 0.14161714928441782, + "learning_rate": 4.065094325304674e-05, + "loss": 2.7459, + "step": 29184 + }, + { + "epoch": 1.8117201564342915, + "grad_norm": 0.1463722435612698, + "learning_rate": 4.064739544788768e-05, + "loss": 2.795, + "step": 29185 + }, + { + "epoch": 1.8117822335340494, + "grad_norm": 0.14209045331371947, + "learning_rate": 4.064384769152341e-05, + "loss": 2.7267, + "step": 29186 + }, + { + "epoch": 1.8118443106338074, + "grad_norm": 0.1458547156416721, + "learning_rate": 4.0640299983972474e-05, + "loss": 2.7518, + "step": 29187 + }, + { + "epoch": 1.811906387733565, + "grad_norm": 0.14504650134903804, + "learning_rate": 4.063675232525336e-05, + "loss": 2.7378, + "step": 29188 + }, + { + "epoch": 1.811968464833323, + "grad_norm": 0.14257547297689402, + "learning_rate": 4.063320471538458e-05, + "loss": 2.7476, + "step": 29189 + }, + { + "epoch": 1.8120305419330809, + "grad_norm": 0.14195537430991925, + "learning_rate": 4.062965715438464e-05, + "loss": 2.6551, + "step": 29190 + }, + { + "epoch": 1.8120926190328388, + "grad_norm": 0.15919702236591873, + "learning_rate": 4.062610964227207e-05, + "loss": 2.7938, + "step": 29191 + }, + { + "epoch": 1.8121546961325967, + "grad_norm": 0.1460410084099657, + "learning_rate": 4.0622562179065356e-05, + "loss": 2.7408, + "step": 29192 + }, + { + "epoch": 1.8122167732323546, + "grad_norm": 0.13660139087669493, + "learning_rate": 4.0619014764783e-05, + "loss": 2.7318, + "step": 29193 + }, + { + "epoch": 1.8122788503321123, + "grad_norm": 0.14345359486514475, + "learning_rate": 4.0615467399443534e-05, + "loss": 2.6896, + "step": 29194 + }, + { + "epoch": 1.8123409274318703, + "grad_norm": 0.1473034592554259, + "learning_rate": 4.061192008306544e-05, + "loss": 2.8638, + "step": 29195 + }, + { + "epoch": 1.8124030045316282, + "grad_norm": 0.14606499082876065, + "learning_rate": 4.060837281566725e-05, + "loss": 2.7232, + "step": 29196 + }, + { + "epoch": 1.812465081631386, + "grad_norm": 0.14263606527200476, + "learning_rate": 4.060482559726745e-05, + "loss": 2.7071, + "step": 29197 + }, + { + "epoch": 1.812527158731144, + "grad_norm": 0.1485317771572403, + "learning_rate": 4.060127842788456e-05, + "loss": 2.7912, + "step": 29198 + }, + { + "epoch": 1.812589235830902, + "grad_norm": 0.1506590429493023, + "learning_rate": 4.059773130753708e-05, + "loss": 2.6956, + "step": 29199 + }, + { + "epoch": 1.8126513129306598, + "grad_norm": 0.1534914410205474, + "learning_rate": 4.059418423624353e-05, + "loss": 2.8147, + "step": 29200 + }, + { + "epoch": 1.8127133900304178, + "grad_norm": 0.14987836018407438, + "learning_rate": 4.059063721402239e-05, + "loss": 2.699, + "step": 29201 + }, + { + "epoch": 1.8127754671301757, + "grad_norm": 0.15888440646861204, + "learning_rate": 4.058709024089218e-05, + "loss": 2.7671, + "step": 29202 + }, + { + "epoch": 1.8128375442299336, + "grad_norm": 0.15349192035912174, + "learning_rate": 4.058354331687142e-05, + "loss": 2.6912, + "step": 29203 + }, + { + "epoch": 1.8128996213296915, + "grad_norm": 0.1407973306449754, + "learning_rate": 4.057999644197858e-05, + "loss": 2.7755, + "step": 29204 + }, + { + "epoch": 1.8129616984294494, + "grad_norm": 0.16781608163929154, + "learning_rate": 4.0576449616232204e-05, + "loss": 2.7326, + "step": 29205 + }, + { + "epoch": 1.8130237755292073, + "grad_norm": 0.14385048888733, + "learning_rate": 4.057290283965076e-05, + "loss": 2.7302, + "step": 29206 + }, + { + "epoch": 1.8130858526289653, + "grad_norm": 0.1607322047808237, + "learning_rate": 4.056935611225279e-05, + "loss": 2.8118, + "step": 29207 + }, + { + "epoch": 1.8131479297287232, + "grad_norm": 0.1415959587303757, + "learning_rate": 4.056580943405676e-05, + "loss": 2.749, + "step": 29208 + }, + { + "epoch": 1.813210006828481, + "grad_norm": 0.1466714109988669, + "learning_rate": 4.05622628050812e-05, + "loss": 2.8128, + "step": 29209 + }, + { + "epoch": 1.813272083928239, + "grad_norm": 0.14552179051791891, + "learning_rate": 4.0558716225344604e-05, + "loss": 2.8323, + "step": 29210 + }, + { + "epoch": 1.813334161027997, + "grad_norm": 0.15611009891148583, + "learning_rate": 4.055516969486549e-05, + "loss": 2.7953, + "step": 29211 + }, + { + "epoch": 1.8133962381277546, + "grad_norm": 0.1444177697562022, + "learning_rate": 4.055162321366233e-05, + "loss": 2.789, + "step": 29212 + }, + { + "epoch": 1.8134583152275126, + "grad_norm": 0.13958513385645765, + "learning_rate": 4.054807678175365e-05, + "loss": 2.6677, + "step": 29213 + }, + { + "epoch": 1.8135203923272705, + "grad_norm": 0.144879713634716, + "learning_rate": 4.054453039915797e-05, + "loss": 2.6418, + "step": 29214 + }, + { + "epoch": 1.8135824694270284, + "grad_norm": 0.14275359108436397, + "learning_rate": 4.054098406589374e-05, + "loss": 2.8135, + "step": 29215 + }, + { + "epoch": 1.8136445465267863, + "grad_norm": 0.15871557785224555, + "learning_rate": 4.0537437781979506e-05, + "loss": 2.7219, + "step": 29216 + }, + { + "epoch": 1.8137066236265442, + "grad_norm": 0.14001897856743875, + "learning_rate": 4.053389154743374e-05, + "loss": 2.6647, + "step": 29217 + }, + { + "epoch": 1.813768700726302, + "grad_norm": 0.14521572700237004, + "learning_rate": 4.0530345362274965e-05, + "loss": 2.7675, + "step": 29218 + }, + { + "epoch": 1.8138307778260598, + "grad_norm": 0.1631749361436531, + "learning_rate": 4.052679922652169e-05, + "loss": 2.7775, + "step": 29219 + }, + { + "epoch": 1.8138928549258178, + "grad_norm": 0.1389684460857303, + "learning_rate": 4.05232531401924e-05, + "loss": 2.7812, + "step": 29220 + }, + { + "epoch": 1.8139549320255757, + "grad_norm": 0.1407272311650962, + "learning_rate": 4.0519707103305605e-05, + "loss": 2.7758, + "step": 29221 + }, + { + "epoch": 1.8140170091253336, + "grad_norm": 0.14562085899924132, + "learning_rate": 4.051616111587979e-05, + "loss": 2.7334, + "step": 29222 + }, + { + "epoch": 1.8140790862250915, + "grad_norm": 0.14138155321090642, + "learning_rate": 4.0512615177933464e-05, + "loss": 2.754, + "step": 29223 + }, + { + "epoch": 1.8141411633248494, + "grad_norm": 0.16516664753776475, + "learning_rate": 4.050906928948513e-05, + "loss": 2.7098, + "step": 29224 + }, + { + "epoch": 1.8142032404246073, + "grad_norm": 0.17614440492343728, + "learning_rate": 4.050552345055329e-05, + "loss": 2.7365, + "step": 29225 + }, + { + "epoch": 1.8142653175243653, + "grad_norm": 0.14347014889817192, + "learning_rate": 4.050197766115643e-05, + "loss": 2.7362, + "step": 29226 + }, + { + "epoch": 1.8143273946241232, + "grad_norm": 0.14891937116992165, + "learning_rate": 4.049843192131307e-05, + "loss": 2.8524, + "step": 29227 + }, + { + "epoch": 1.814389471723881, + "grad_norm": 0.15526297671990624, + "learning_rate": 4.049488623104169e-05, + "loss": 2.7966, + "step": 29228 + }, + { + "epoch": 1.814451548823639, + "grad_norm": 0.1631676703216979, + "learning_rate": 4.04913405903608e-05, + "loss": 2.7175, + "step": 29229 + }, + { + "epoch": 1.814513625923397, + "grad_norm": 0.1564089026688233, + "learning_rate": 4.0487794999288894e-05, + "loss": 2.8216, + "step": 29230 + }, + { + "epoch": 1.8145757030231549, + "grad_norm": 0.15159782558397203, + "learning_rate": 4.0484249457844464e-05, + "loss": 2.8815, + "step": 29231 + }, + { + "epoch": 1.8146377801229128, + "grad_norm": 0.1540387868234979, + "learning_rate": 4.0480703966046025e-05, + "loss": 2.7547, + "step": 29232 + }, + { + "epoch": 1.8146998572226707, + "grad_norm": 0.14833022934656473, + "learning_rate": 4.0477158523912054e-05, + "loss": 2.7581, + "step": 29233 + }, + { + "epoch": 1.8147619343224286, + "grad_norm": 0.13933494031724156, + "learning_rate": 4.0473613131461066e-05, + "loss": 2.832, + "step": 29234 + }, + { + "epoch": 1.8148240114221863, + "grad_norm": 0.14470758506739978, + "learning_rate": 4.047006778871154e-05, + "loss": 2.7464, + "step": 29235 + }, + { + "epoch": 1.8148860885219442, + "grad_norm": 0.1356097886129419, + "learning_rate": 4.0466522495682e-05, + "loss": 2.7649, + "step": 29236 + }, + { + "epoch": 1.8149481656217021, + "grad_norm": 0.14902151813995915, + "learning_rate": 4.046297725239091e-05, + "loss": 2.793, + "step": 29237 + }, + { + "epoch": 1.81501024272146, + "grad_norm": 0.13281883196602715, + "learning_rate": 4.04594320588568e-05, + "loss": 2.7032, + "step": 29238 + }, + { + "epoch": 1.815072319821218, + "grad_norm": 0.13671414503789145, + "learning_rate": 4.0455886915098145e-05, + "loss": 2.761, + "step": 29239 + }, + { + "epoch": 1.815134396920976, + "grad_norm": 0.1491408436805077, + "learning_rate": 4.0452341821133435e-05, + "loss": 2.7614, + "step": 29240 + }, + { + "epoch": 1.8151964740207336, + "grad_norm": 0.13564574293964468, + "learning_rate": 4.044879677698118e-05, + "loss": 2.6812, + "step": 29241 + }, + { + "epoch": 1.8152585511204915, + "grad_norm": 0.14216470876321372, + "learning_rate": 4.044525178265987e-05, + "loss": 2.7689, + "step": 29242 + }, + { + "epoch": 1.8153206282202494, + "grad_norm": 0.1381255658168337, + "learning_rate": 4.044170683818801e-05, + "loss": 2.7256, + "step": 29243 + }, + { + "epoch": 1.8153827053200073, + "grad_norm": 0.1512595994125715, + "learning_rate": 4.0438161943584066e-05, + "loss": 2.7355, + "step": 29244 + }, + { + "epoch": 1.8154447824197653, + "grad_norm": 0.14593787930457675, + "learning_rate": 4.0434617098866574e-05, + "loss": 2.7994, + "step": 29245 + }, + { + "epoch": 1.8155068595195232, + "grad_norm": 0.15205329324537484, + "learning_rate": 4.043107230405399e-05, + "loss": 2.7639, + "step": 29246 + }, + { + "epoch": 1.815568936619281, + "grad_norm": 0.1511491866915151, + "learning_rate": 4.0427527559164835e-05, + "loss": 2.7063, + "step": 29247 + }, + { + "epoch": 1.815631013719039, + "grad_norm": 0.145328462400784, + "learning_rate": 4.042398286421759e-05, + "loss": 2.7184, + "step": 29248 + }, + { + "epoch": 1.815693090818797, + "grad_norm": 0.15065311462330144, + "learning_rate": 4.042043821923075e-05, + "loss": 2.7572, + "step": 29249 + }, + { + "epoch": 1.8157551679185548, + "grad_norm": 0.14120213308362695, + "learning_rate": 4.04168936242228e-05, + "loss": 2.7082, + "step": 29250 + }, + { + "epoch": 1.8158172450183128, + "grad_norm": 0.15631954910226895, + "learning_rate": 4.041334907921224e-05, + "loss": 2.8066, + "step": 29251 + }, + { + "epoch": 1.8158793221180707, + "grad_norm": 0.13704349233688604, + "learning_rate": 4.040980458421759e-05, + "loss": 2.6764, + "step": 29252 + }, + { + "epoch": 1.8159413992178286, + "grad_norm": 0.14916758059398377, + "learning_rate": 4.04062601392573e-05, + "loss": 2.7245, + "step": 29253 + }, + { + "epoch": 1.8160034763175865, + "grad_norm": 0.13638063583564392, + "learning_rate": 4.040271574434989e-05, + "loss": 2.7441, + "step": 29254 + }, + { + "epoch": 1.8160655534173444, + "grad_norm": 0.14219262643103334, + "learning_rate": 4.0399171399513835e-05, + "loss": 2.739, + "step": 29255 + }, + { + "epoch": 1.8161276305171024, + "grad_norm": 0.13890612655865878, + "learning_rate": 4.039562710476763e-05, + "loss": 2.7176, + "step": 29256 + }, + { + "epoch": 1.8161897076168603, + "grad_norm": 0.14596355675303926, + "learning_rate": 4.039208286012978e-05, + "loss": 2.7358, + "step": 29257 + }, + { + "epoch": 1.8162517847166182, + "grad_norm": 0.13931227283878556, + "learning_rate": 4.0388538665618756e-05, + "loss": 2.828, + "step": 29258 + }, + { + "epoch": 1.8163138618163759, + "grad_norm": 0.14364463557264273, + "learning_rate": 4.038499452125307e-05, + "loss": 2.721, + "step": 29259 + }, + { + "epoch": 1.8163759389161338, + "grad_norm": 0.14415068397468866, + "learning_rate": 4.0381450427051186e-05, + "loss": 2.7698, + "step": 29260 + }, + { + "epoch": 1.8164380160158917, + "grad_norm": 0.13928992261026762, + "learning_rate": 4.037790638303163e-05, + "loss": 2.7821, + "step": 29261 + }, + { + "epoch": 1.8165000931156496, + "grad_norm": 0.15353072968155462, + "learning_rate": 4.037436238921285e-05, + "loss": 2.7406, + "step": 29262 + }, + { + "epoch": 1.8165621702154076, + "grad_norm": 0.16008147378139667, + "learning_rate": 4.037081844561338e-05, + "loss": 2.7239, + "step": 29263 + }, + { + "epoch": 1.8166242473151655, + "grad_norm": 0.158757181474494, + "learning_rate": 4.036727455225168e-05, + "loss": 2.8038, + "step": 29264 + }, + { + "epoch": 1.8166863244149232, + "grad_norm": 0.14164954720751968, + "learning_rate": 4.036373070914624e-05, + "loss": 2.7824, + "step": 29265 + }, + { + "epoch": 1.816748401514681, + "grad_norm": 0.1550456841886416, + "learning_rate": 4.036018691631557e-05, + "loss": 2.7255, + "step": 29266 + }, + { + "epoch": 1.816810478614439, + "grad_norm": 0.14084385689612225, + "learning_rate": 4.035664317377813e-05, + "loss": 2.7857, + "step": 29267 + }, + { + "epoch": 1.816872555714197, + "grad_norm": 0.15249948610938366, + "learning_rate": 4.0353099481552436e-05, + "loss": 2.7687, + "step": 29268 + }, + { + "epoch": 1.8169346328139548, + "grad_norm": 0.16054898467151232, + "learning_rate": 4.034955583965695e-05, + "loss": 2.841, + "step": 29269 + }, + { + "epoch": 1.8169967099137128, + "grad_norm": 0.14771778272736175, + "learning_rate": 4.034601224811019e-05, + "loss": 2.8879, + "step": 29270 + }, + { + "epoch": 1.8170587870134707, + "grad_norm": 0.14090789050401265, + "learning_rate": 4.034246870693061e-05, + "loss": 2.8229, + "step": 29271 + }, + { + "epoch": 1.8171208641132286, + "grad_norm": 0.14208611421180292, + "learning_rate": 4.033892521613673e-05, + "loss": 2.7866, + "step": 29272 + }, + { + "epoch": 1.8171829412129865, + "grad_norm": 0.1532863287711362, + "learning_rate": 4.033538177574701e-05, + "loss": 2.797, + "step": 29273 + }, + { + "epoch": 1.8172450183127444, + "grad_norm": 0.14023797102331462, + "learning_rate": 4.0331838385779966e-05, + "loss": 2.7572, + "step": 29274 + }, + { + "epoch": 1.8173070954125023, + "grad_norm": 0.14966554608152088, + "learning_rate": 4.032829504625406e-05, + "loss": 2.8348, + "step": 29275 + }, + { + "epoch": 1.8173691725122603, + "grad_norm": 0.1526669838957045, + "learning_rate": 4.032475175718778e-05, + "loss": 2.765, + "step": 29276 + }, + { + "epoch": 1.8174312496120182, + "grad_norm": 0.13979642536128967, + "learning_rate": 4.0321208518599627e-05, + "loss": 2.7665, + "step": 29277 + }, + { + "epoch": 1.817493326711776, + "grad_norm": 0.16729806470532568, + "learning_rate": 4.031766533050807e-05, + "loss": 2.8125, + "step": 29278 + }, + { + "epoch": 1.817555403811534, + "grad_norm": 0.1500915985274888, + "learning_rate": 4.0314122192931616e-05, + "loss": 2.8143, + "step": 29279 + }, + { + "epoch": 1.817617480911292, + "grad_norm": 0.1565035848401215, + "learning_rate": 4.031057910588872e-05, + "loss": 2.7509, + "step": 29280 + }, + { + "epoch": 1.8176795580110499, + "grad_norm": 0.17443890367993545, + "learning_rate": 4.030703606939791e-05, + "loss": 2.7447, + "step": 29281 + }, + { + "epoch": 1.8177416351108078, + "grad_norm": 0.18509837706818869, + "learning_rate": 4.030349308347761e-05, + "loss": 2.7454, + "step": 29282 + }, + { + "epoch": 1.8178037122105655, + "grad_norm": 0.16209193216263898, + "learning_rate": 4.029995014814636e-05, + "loss": 2.7986, + "step": 29283 + }, + { + "epoch": 1.8178657893103234, + "grad_norm": 0.1546891031760167, + "learning_rate": 4.0296407263422634e-05, + "loss": 2.7228, + "step": 29284 + }, + { + "epoch": 1.8179278664100813, + "grad_norm": 0.19868653395226823, + "learning_rate": 4.0292864429324894e-05, + "loss": 2.7302, + "step": 29285 + }, + { + "epoch": 1.8179899435098392, + "grad_norm": 0.16898117897478612, + "learning_rate": 4.0289321645871646e-05, + "loss": 2.7466, + "step": 29286 + }, + { + "epoch": 1.8180520206095971, + "grad_norm": 0.16504549544398062, + "learning_rate": 4.028577891308136e-05, + "loss": 2.7762, + "step": 29287 + }, + { + "epoch": 1.818114097709355, + "grad_norm": 0.1439441535934111, + "learning_rate": 4.0282236230972526e-05, + "loss": 2.7675, + "step": 29288 + }, + { + "epoch": 1.8181761748091128, + "grad_norm": 0.14359485297874258, + "learning_rate": 4.027869359956363e-05, + "loss": 2.7614, + "step": 29289 + }, + { + "epoch": 1.8182382519088707, + "grad_norm": 0.1517093851202665, + "learning_rate": 4.027515101887314e-05, + "loss": 2.8581, + "step": 29290 + }, + { + "epoch": 1.8183003290086286, + "grad_norm": 0.15699781042822725, + "learning_rate": 4.027160848891955e-05, + "loss": 2.8225, + "step": 29291 + }, + { + "epoch": 1.8183624061083865, + "grad_norm": 0.1534252840042749, + "learning_rate": 4.026806600972134e-05, + "loss": 2.7986, + "step": 29292 + }, + { + "epoch": 1.8184244832081444, + "grad_norm": 0.14886481491156095, + "learning_rate": 4.0264523581297e-05, + "loss": 2.6789, + "step": 29293 + }, + { + "epoch": 1.8184865603079023, + "grad_norm": 0.1699895066465793, + "learning_rate": 4.026098120366499e-05, + "loss": 2.8698, + "step": 29294 + }, + { + "epoch": 1.8185486374076603, + "grad_norm": 0.1667816329352686, + "learning_rate": 4.0257438876843814e-05, + "loss": 2.8039, + "step": 29295 + }, + { + "epoch": 1.8186107145074182, + "grad_norm": 0.14536159004082755, + "learning_rate": 4.0253896600851935e-05, + "loss": 2.8348, + "step": 29296 + }, + { + "epoch": 1.818672791607176, + "grad_norm": 0.1490957259789012, + "learning_rate": 4.025035437570786e-05, + "loss": 2.8022, + "step": 29297 + }, + { + "epoch": 1.818734868706934, + "grad_norm": 0.21826985149891528, + "learning_rate": 4.024681220143004e-05, + "loss": 2.7685, + "step": 29298 + }, + { + "epoch": 1.818796945806692, + "grad_norm": 0.14243402776591074, + "learning_rate": 4.0243270078036975e-05, + "loss": 2.781, + "step": 29299 + }, + { + "epoch": 1.8188590229064499, + "grad_norm": 0.14048993061295506, + "learning_rate": 4.023972800554714e-05, + "loss": 2.6959, + "step": 29300 + }, + { + "epoch": 1.8189211000062078, + "grad_norm": 0.14903624768275303, + "learning_rate": 4.0236185983979e-05, + "loss": 2.8022, + "step": 29301 + }, + { + "epoch": 1.8189831771059657, + "grad_norm": 0.1736981999329674, + "learning_rate": 4.023264401335107e-05, + "loss": 2.8126, + "step": 29302 + }, + { + "epoch": 1.8190452542057236, + "grad_norm": 0.14333552544141043, + "learning_rate": 4.0229102093681786e-05, + "loss": 2.8073, + "step": 29303 + }, + { + "epoch": 1.8191073313054815, + "grad_norm": 0.16319333041151296, + "learning_rate": 4.0225560224989664e-05, + "loss": 2.7205, + "step": 29304 + }, + { + "epoch": 1.8191694084052394, + "grad_norm": 0.1419213728382937, + "learning_rate": 4.0222018407293155e-05, + "loss": 2.9054, + "step": 29305 + }, + { + "epoch": 1.8192314855049974, + "grad_norm": 0.1616695711667224, + "learning_rate": 4.0218476640610756e-05, + "loss": 2.7594, + "step": 29306 + }, + { + "epoch": 1.819293562604755, + "grad_norm": 0.16659148142144273, + "learning_rate": 4.0214934924960934e-05, + "loss": 2.8144, + "step": 29307 + }, + { + "epoch": 1.819355639704513, + "grad_norm": 0.1717967050125648, + "learning_rate": 4.021139326036219e-05, + "loss": 2.8103, + "step": 29308 + }, + { + "epoch": 1.819417716804271, + "grad_norm": 0.1546301381364547, + "learning_rate": 4.0207851646832954e-05, + "loss": 2.801, + "step": 29309 + }, + { + "epoch": 1.8194797939040288, + "grad_norm": 0.16367466817215937, + "learning_rate": 4.0204310084391754e-05, + "loss": 2.7812, + "step": 29310 + }, + { + "epoch": 1.8195418710037867, + "grad_norm": 0.15141370735266554, + "learning_rate": 4.020076857305703e-05, + "loss": 2.7638, + "step": 29311 + }, + { + "epoch": 1.8196039481035446, + "grad_norm": 0.166459404823959, + "learning_rate": 4.0197227112847295e-05, + "loss": 2.6071, + "step": 29312 + }, + { + "epoch": 1.8196660252033023, + "grad_norm": 0.14525474713261505, + "learning_rate": 4.0193685703781e-05, + "loss": 2.7826, + "step": 29313 + }, + { + "epoch": 1.8197281023030603, + "grad_norm": 0.14151290222594676, + "learning_rate": 4.019014434587662e-05, + "loss": 2.7188, + "step": 29314 + }, + { + "epoch": 1.8197901794028182, + "grad_norm": 0.15988754782711398, + "learning_rate": 4.018660303915263e-05, + "loss": 2.6792, + "step": 29315 + }, + { + "epoch": 1.819852256502576, + "grad_norm": 0.1872105251378236, + "learning_rate": 4.018306178362753e-05, + "loss": 2.8252, + "step": 29316 + }, + { + "epoch": 1.819914333602334, + "grad_norm": 0.1495231217932561, + "learning_rate": 4.0179520579319773e-05, + "loss": 2.8134, + "step": 29317 + }, + { + "epoch": 1.819976410702092, + "grad_norm": 0.1518384120475521, + "learning_rate": 4.017597942624784e-05, + "loss": 2.8147, + "step": 29318 + }, + { + "epoch": 1.8200384878018498, + "grad_norm": 0.17793430360067636, + "learning_rate": 4.017243832443021e-05, + "loss": 2.8357, + "step": 29319 + }, + { + "epoch": 1.8201005649016078, + "grad_norm": 0.1639893573114963, + "learning_rate": 4.0168897273885365e-05, + "loss": 2.7691, + "step": 29320 + }, + { + "epoch": 1.8201626420013657, + "grad_norm": 0.1688652730988122, + "learning_rate": 4.016535627463176e-05, + "loss": 2.7929, + "step": 29321 + }, + { + "epoch": 1.8202247191011236, + "grad_norm": 0.17176831250416266, + "learning_rate": 4.016181532668788e-05, + "loss": 2.8115, + "step": 29322 + }, + { + "epoch": 1.8202867962008815, + "grad_norm": 0.1424485754617338, + "learning_rate": 4.01582744300722e-05, + "loss": 2.7386, + "step": 29323 + }, + { + "epoch": 1.8203488733006394, + "grad_norm": 0.15434130499757434, + "learning_rate": 4.015473358480319e-05, + "loss": 2.776, + "step": 29324 + }, + { + "epoch": 1.8204109504003974, + "grad_norm": 0.16993410241868334, + "learning_rate": 4.015119279089933e-05, + "loss": 2.7731, + "step": 29325 + }, + { + "epoch": 1.8204730275001553, + "grad_norm": 0.16148331984612163, + "learning_rate": 4.014765204837908e-05, + "loss": 2.7884, + "step": 29326 + }, + { + "epoch": 1.8205351045999132, + "grad_norm": 0.14818253705447523, + "learning_rate": 4.0144111357260924e-05, + "loss": 2.7881, + "step": 29327 + }, + { + "epoch": 1.820597181699671, + "grad_norm": 0.1445961303972565, + "learning_rate": 4.014057071756332e-05, + "loss": 2.6885, + "step": 29328 + }, + { + "epoch": 1.820659258799429, + "grad_norm": 0.14618828412775697, + "learning_rate": 4.013703012930477e-05, + "loss": 2.718, + "step": 29329 + }, + { + "epoch": 1.820721335899187, + "grad_norm": 0.1388195688555261, + "learning_rate": 4.0133489592503715e-05, + "loss": 2.7574, + "step": 29330 + }, + { + "epoch": 1.8207834129989446, + "grad_norm": 0.16513180243213937, + "learning_rate": 4.0129949107178654e-05, + "loss": 2.7978, + "step": 29331 + }, + { + "epoch": 1.8208454900987026, + "grad_norm": 0.15446366578536055, + "learning_rate": 4.012640867334803e-05, + "loss": 2.676, + "step": 29332 + }, + { + "epoch": 1.8209075671984605, + "grad_norm": 0.13420013288626012, + "learning_rate": 4.0122868291030344e-05, + "loss": 2.6813, + "step": 29333 + }, + { + "epoch": 1.8209696442982184, + "grad_norm": 0.18050513623186795, + "learning_rate": 4.011932796024404e-05, + "loss": 2.7709, + "step": 29334 + }, + { + "epoch": 1.8210317213979763, + "grad_norm": 0.15223837533781062, + "learning_rate": 4.01157876810076e-05, + "loss": 2.7634, + "step": 29335 + }, + { + "epoch": 1.8210937984977342, + "grad_norm": 0.16053752175811928, + "learning_rate": 4.0112247453339495e-05, + "loss": 2.7312, + "step": 29336 + }, + { + "epoch": 1.821155875597492, + "grad_norm": 0.1505135925216726, + "learning_rate": 4.010870727725821e-05, + "loss": 2.7976, + "step": 29337 + }, + { + "epoch": 1.8212179526972498, + "grad_norm": 0.17779894064774332, + "learning_rate": 4.010516715278219e-05, + "loss": 2.8225, + "step": 29338 + }, + { + "epoch": 1.8212800297970078, + "grad_norm": 0.16407640148485847, + "learning_rate": 4.010162707992991e-05, + "loss": 2.8111, + "step": 29339 + }, + { + "epoch": 1.8213421068967657, + "grad_norm": 0.16689623479715887, + "learning_rate": 4.009808705871986e-05, + "loss": 2.8047, + "step": 29340 + }, + { + "epoch": 1.8214041839965236, + "grad_norm": 0.14984605707852786, + "learning_rate": 4.0094547089170474e-05, + "loss": 2.8594, + "step": 29341 + }, + { + "epoch": 1.8214662610962815, + "grad_norm": 0.146037571096154, + "learning_rate": 4.0091007171300256e-05, + "loss": 2.7846, + "step": 29342 + }, + { + "epoch": 1.8215283381960394, + "grad_norm": 0.14141297772246839, + "learning_rate": 4.008746730512765e-05, + "loss": 2.6718, + "step": 29343 + }, + { + "epoch": 1.8215904152957973, + "grad_norm": 0.15409939236194678, + "learning_rate": 4.0083927490671134e-05, + "loss": 2.6571, + "step": 29344 + }, + { + "epoch": 1.8216524923955553, + "grad_norm": 0.15403455454686318, + "learning_rate": 4.008038772794917e-05, + "loss": 2.7555, + "step": 29345 + }, + { + "epoch": 1.8217145694953132, + "grad_norm": 0.14033370179606872, + "learning_rate": 4.0076848016980246e-05, + "loss": 2.7664, + "step": 29346 + }, + { + "epoch": 1.821776646595071, + "grad_norm": 0.15995212665740796, + "learning_rate": 4.0073308357782804e-05, + "loss": 2.7333, + "step": 29347 + }, + { + "epoch": 1.821838723694829, + "grad_norm": 0.15053514249780636, + "learning_rate": 4.006976875037531e-05, + "loss": 2.7805, + "step": 29348 + }, + { + "epoch": 1.821900800794587, + "grad_norm": 0.14633320319452162, + "learning_rate": 4.006622919477626e-05, + "loss": 2.7669, + "step": 29349 + }, + { + "epoch": 1.8219628778943449, + "grad_norm": 0.15324181168913917, + "learning_rate": 4.0062689691004107e-05, + "loss": 2.6909, + "step": 29350 + }, + { + "epoch": 1.8220249549941028, + "grad_norm": 0.14565069109104178, + "learning_rate": 4.00591502390773e-05, + "loss": 2.8605, + "step": 29351 + }, + { + "epoch": 1.8220870320938607, + "grad_norm": 0.14595494918532703, + "learning_rate": 4.005561083901434e-05, + "loss": 2.7953, + "step": 29352 + }, + { + "epoch": 1.8221491091936186, + "grad_norm": 0.1550798949325605, + "learning_rate": 4.0052071490833654e-05, + "loss": 2.7783, + "step": 29353 + }, + { + "epoch": 1.8222111862933765, + "grad_norm": 0.14075194695288162, + "learning_rate": 4.004853219455374e-05, + "loss": 2.8061, + "step": 29354 + }, + { + "epoch": 1.8222732633931342, + "grad_norm": 0.1525229964544712, + "learning_rate": 4.004499295019304e-05, + "loss": 2.665, + "step": 29355 + }, + { + "epoch": 1.8223353404928921, + "grad_norm": 0.15051532504838797, + "learning_rate": 4.004145375777003e-05, + "loss": 2.8003, + "step": 29356 + }, + { + "epoch": 1.82239741759265, + "grad_norm": 0.14735014966292753, + "learning_rate": 4.0037914617303175e-05, + "loss": 2.7431, + "step": 29357 + }, + { + "epoch": 1.822459494692408, + "grad_norm": 0.15331655850036366, + "learning_rate": 4.003437552881094e-05, + "loss": 2.7596, + "step": 29358 + }, + { + "epoch": 1.822521571792166, + "grad_norm": 0.15019567040647566, + "learning_rate": 4.003083649231178e-05, + "loss": 2.8099, + "step": 29359 + }, + { + "epoch": 1.8225836488919238, + "grad_norm": 0.1502712568695532, + "learning_rate": 4.002729750782418e-05, + "loss": 2.7221, + "step": 29360 + }, + { + "epoch": 1.8226457259916815, + "grad_norm": 0.15027739857509556, + "learning_rate": 4.0023758575366563e-05, + "loss": 2.8002, + "step": 29361 + }, + { + "epoch": 1.8227078030914394, + "grad_norm": 0.16032316070193386, + "learning_rate": 4.0020219694957443e-05, + "loss": 2.723, + "step": 29362 + }, + { + "epoch": 1.8227698801911973, + "grad_norm": 0.1631246851958237, + "learning_rate": 4.001668086661526e-05, + "loss": 2.8288, + "step": 29363 + }, + { + "epoch": 1.8228319572909553, + "grad_norm": 0.1410496116005899, + "learning_rate": 4.0013142090358466e-05, + "loss": 2.7448, + "step": 29364 + }, + { + "epoch": 1.8228940343907132, + "grad_norm": 0.14378740300902595, + "learning_rate": 4.0009603366205546e-05, + "loss": 2.7639, + "step": 29365 + }, + { + "epoch": 1.822956111490471, + "grad_norm": 0.14287989179283456, + "learning_rate": 4.000606469417493e-05, + "loss": 2.7737, + "step": 29366 + }, + { + "epoch": 1.823018188590229, + "grad_norm": 0.15414957669187931, + "learning_rate": 4.000252607428512e-05, + "loss": 2.7597, + "step": 29367 + }, + { + "epoch": 1.823080265689987, + "grad_norm": 0.1423809634848682, + "learning_rate": 3.999898750655455e-05, + "loss": 2.8911, + "step": 29368 + }, + { + "epoch": 1.8231423427897449, + "grad_norm": 0.14226881861571725, + "learning_rate": 3.999544899100169e-05, + "loss": 2.7707, + "step": 29369 + }, + { + "epoch": 1.8232044198895028, + "grad_norm": 0.14416961275059945, + "learning_rate": 3.9991910527645e-05, + "loss": 2.7786, + "step": 29370 + }, + { + "epoch": 1.8232664969892607, + "grad_norm": 0.1557803785573294, + "learning_rate": 3.9988372116502946e-05, + "loss": 2.7511, + "step": 29371 + }, + { + "epoch": 1.8233285740890186, + "grad_norm": 0.14521348871805878, + "learning_rate": 3.998483375759398e-05, + "loss": 2.843, + "step": 29372 + }, + { + "epoch": 1.8233906511887765, + "grad_norm": 0.1562061427077601, + "learning_rate": 3.998129545093658e-05, + "loss": 2.7479, + "step": 29373 + }, + { + "epoch": 1.8234527282885344, + "grad_norm": 0.14710283692346932, + "learning_rate": 3.9977757196549195e-05, + "loss": 2.8407, + "step": 29374 + }, + { + "epoch": 1.8235148053882924, + "grad_norm": 0.13676809374144497, + "learning_rate": 3.997421899445026e-05, + "loss": 2.7595, + "step": 29375 + }, + { + "epoch": 1.8235768824880503, + "grad_norm": 0.1432369569625002, + "learning_rate": 3.997068084465828e-05, + "loss": 2.7749, + "step": 29376 + }, + { + "epoch": 1.8236389595878082, + "grad_norm": 0.14881998275150773, + "learning_rate": 3.9967142747191683e-05, + "loss": 2.8452, + "step": 29377 + }, + { + "epoch": 1.8237010366875661, + "grad_norm": 0.15680123487986472, + "learning_rate": 3.9963604702068946e-05, + "loss": 2.7969, + "step": 29378 + }, + { + "epoch": 1.8237631137873238, + "grad_norm": 0.16619319395214946, + "learning_rate": 3.996006670930851e-05, + "loss": 2.7117, + "step": 29379 + }, + { + "epoch": 1.8238251908870817, + "grad_norm": 0.16284283776358088, + "learning_rate": 3.9956528768928845e-05, + "loss": 2.7357, + "step": 29380 + }, + { + "epoch": 1.8238872679868396, + "grad_norm": 0.16176079420055942, + "learning_rate": 3.995299088094839e-05, + "loss": 2.7337, + "step": 29381 + }, + { + "epoch": 1.8239493450865976, + "grad_norm": 0.140396535246961, + "learning_rate": 3.994945304538564e-05, + "loss": 2.7389, + "step": 29382 + }, + { + "epoch": 1.8240114221863555, + "grad_norm": 0.1447236546006781, + "learning_rate": 3.994591526225904e-05, + "loss": 2.7533, + "step": 29383 + }, + { + "epoch": 1.8240734992861134, + "grad_norm": 0.20662305284864613, + "learning_rate": 3.9942377531587025e-05, + "loss": 2.8367, + "step": 29384 + }, + { + "epoch": 1.824135576385871, + "grad_norm": 0.14613231950486555, + "learning_rate": 3.9938839853388084e-05, + "loss": 2.7674, + "step": 29385 + }, + { + "epoch": 1.824197653485629, + "grad_norm": 0.16484848847290673, + "learning_rate": 3.993530222768064e-05, + "loss": 2.7391, + "step": 29386 + }, + { + "epoch": 1.824259730585387, + "grad_norm": 0.1567196820118233, + "learning_rate": 3.9931764654483186e-05, + "loss": 2.8587, + "step": 29387 + }, + { + "epoch": 1.8243218076851448, + "grad_norm": 0.19899867561692763, + "learning_rate": 3.992822713381416e-05, + "loss": 2.7607, + "step": 29388 + }, + { + "epoch": 1.8243838847849028, + "grad_norm": 0.17626561083885306, + "learning_rate": 3.9924689665692006e-05, + "loss": 2.8977, + "step": 29389 + }, + { + "epoch": 1.8244459618846607, + "grad_norm": 0.16136539896867508, + "learning_rate": 3.99211522501352e-05, + "loss": 2.8201, + "step": 29390 + }, + { + "epoch": 1.8245080389844186, + "grad_norm": 0.15678375392615243, + "learning_rate": 3.991761488716218e-05, + "loss": 2.7558, + "step": 29391 + }, + { + "epoch": 1.8245701160841765, + "grad_norm": 0.14846608865392474, + "learning_rate": 3.991407757679142e-05, + "loss": 2.8668, + "step": 29392 + }, + { + "epoch": 1.8246321931839344, + "grad_norm": 0.15942064431065134, + "learning_rate": 3.9910540319041356e-05, + "loss": 2.7145, + "step": 29393 + }, + { + "epoch": 1.8246942702836924, + "grad_norm": 0.14698769882124468, + "learning_rate": 3.990700311393047e-05, + "loss": 2.7825, + "step": 29394 + }, + { + "epoch": 1.8247563473834503, + "grad_norm": 0.15186088405165102, + "learning_rate": 3.990346596147718e-05, + "loss": 2.7381, + "step": 29395 + }, + { + "epoch": 1.8248184244832082, + "grad_norm": 0.16375914980353834, + "learning_rate": 3.989992886169997e-05, + "loss": 2.7599, + "step": 29396 + }, + { + "epoch": 1.824880501582966, + "grad_norm": 0.1553132527964441, + "learning_rate": 3.9896391814617276e-05, + "loss": 2.7997, + "step": 29397 + }, + { + "epoch": 1.824942578682724, + "grad_norm": 0.14370181530397835, + "learning_rate": 3.989285482024756e-05, + "loss": 2.7096, + "step": 29398 + }, + { + "epoch": 1.825004655782482, + "grad_norm": 0.15233624268643114, + "learning_rate": 3.988931787860929e-05, + "loss": 2.7781, + "step": 29399 + }, + { + "epoch": 1.8250667328822399, + "grad_norm": 0.14703540159682174, + "learning_rate": 3.988578098972088e-05, + "loss": 2.7629, + "step": 29400 + }, + { + "epoch": 1.8251288099819978, + "grad_norm": 0.15068350178189788, + "learning_rate": 3.988224415360082e-05, + "loss": 2.7346, + "step": 29401 + }, + { + "epoch": 1.8251908870817557, + "grad_norm": 0.14315733119653737, + "learning_rate": 3.9878707370267545e-05, + "loss": 2.6857, + "step": 29402 + }, + { + "epoch": 1.8252529641815134, + "grad_norm": 0.15279522047269548, + "learning_rate": 3.9875170639739515e-05, + "loss": 2.8323, + "step": 29403 + }, + { + "epoch": 1.8253150412812713, + "grad_norm": 0.14319719188863161, + "learning_rate": 3.987163396203516e-05, + "loss": 2.7119, + "step": 29404 + }, + { + "epoch": 1.8253771183810292, + "grad_norm": 0.1725237858222935, + "learning_rate": 3.9868097337172975e-05, + "loss": 2.788, + "step": 29405 + }, + { + "epoch": 1.8254391954807871, + "grad_norm": 0.14849145435740918, + "learning_rate": 3.986456076517136e-05, + "loss": 2.7212, + "step": 29406 + }, + { + "epoch": 1.825501272580545, + "grad_norm": 0.13608650065580977, + "learning_rate": 3.986102424604881e-05, + "loss": 2.7066, + "step": 29407 + }, + { + "epoch": 1.825563349680303, + "grad_norm": 0.15677976801960508, + "learning_rate": 3.985748777982375e-05, + "loss": 2.7937, + "step": 29408 + }, + { + "epoch": 1.8256254267800607, + "grad_norm": 0.14968962596187946, + "learning_rate": 3.985395136651464e-05, + "loss": 2.6523, + "step": 29409 + }, + { + "epoch": 1.8256875038798186, + "grad_norm": 0.14538729880626258, + "learning_rate": 3.985041500613994e-05, + "loss": 2.789, + "step": 29410 + }, + { + "epoch": 1.8257495809795765, + "grad_norm": 0.16500534187144442, + "learning_rate": 3.9846878698718074e-05, + "loss": 2.756, + "step": 29411 + }, + { + "epoch": 1.8258116580793344, + "grad_norm": 0.15196865316251024, + "learning_rate": 3.9843342444267514e-05, + "loss": 2.7916, + "step": 29412 + }, + { + "epoch": 1.8258737351790923, + "grad_norm": 0.15071427829538378, + "learning_rate": 3.9839806242806684e-05, + "loss": 2.727, + "step": 29413 + }, + { + "epoch": 1.8259358122788503, + "grad_norm": 0.14641054836180079, + "learning_rate": 3.983627009435406e-05, + "loss": 2.7204, + "step": 29414 + }, + { + "epoch": 1.8259978893786082, + "grad_norm": 0.14827129822026955, + "learning_rate": 3.983273399892809e-05, + "loss": 2.6555, + "step": 29415 + }, + { + "epoch": 1.826059966478366, + "grad_norm": 0.15625587094721272, + "learning_rate": 3.982919795654721e-05, + "loss": 2.7285, + "step": 29416 + }, + { + "epoch": 1.826122043578124, + "grad_norm": 0.15020001085371176, + "learning_rate": 3.982566196722988e-05, + "loss": 2.8723, + "step": 29417 + }, + { + "epoch": 1.826184120677882, + "grad_norm": 0.14811568762080538, + "learning_rate": 3.982212603099453e-05, + "loss": 2.7945, + "step": 29418 + }, + { + "epoch": 1.8262461977776399, + "grad_norm": 0.13981986028254223, + "learning_rate": 3.9818590147859634e-05, + "loss": 2.7162, + "step": 29419 + }, + { + "epoch": 1.8263082748773978, + "grad_norm": 0.14377664417591327, + "learning_rate": 3.9815054317843616e-05, + "loss": 2.7832, + "step": 29420 + }, + { + "epoch": 1.8263703519771557, + "grad_norm": 0.16475380278563131, + "learning_rate": 3.9811518540964936e-05, + "loss": 2.759, + "step": 29421 + }, + { + "epoch": 1.8264324290769136, + "grad_norm": 0.17687787146309944, + "learning_rate": 3.9807982817242026e-05, + "loss": 2.8618, + "step": 29422 + }, + { + "epoch": 1.8264945061766715, + "grad_norm": 0.1436318170856907, + "learning_rate": 3.9804447146693364e-05, + "loss": 2.776, + "step": 29423 + }, + { + "epoch": 1.8265565832764294, + "grad_norm": 0.156953141491112, + "learning_rate": 3.980091152933737e-05, + "loss": 2.7344, + "step": 29424 + }, + { + "epoch": 1.8266186603761874, + "grad_norm": 0.1463196464012994, + "learning_rate": 3.979737596519248e-05, + "loss": 2.7616, + "step": 29425 + }, + { + "epoch": 1.8266807374759453, + "grad_norm": 0.16028949381920926, + "learning_rate": 3.9793840454277174e-05, + "loss": 2.7879, + "step": 29426 + }, + { + "epoch": 1.826742814575703, + "grad_norm": 0.16008797795368904, + "learning_rate": 3.9790304996609874e-05, + "loss": 2.8707, + "step": 29427 + }, + { + "epoch": 1.826804891675461, + "grad_norm": 0.14286594212097142, + "learning_rate": 3.978676959220904e-05, + "loss": 2.7763, + "step": 29428 + }, + { + "epoch": 1.8268669687752188, + "grad_norm": 0.15621131775779448, + "learning_rate": 3.978323424109309e-05, + "loss": 2.8894, + "step": 29429 + }, + { + "epoch": 1.8269290458749767, + "grad_norm": 0.14301088056318442, + "learning_rate": 3.9779698943280504e-05, + "loss": 2.7935, + "step": 29430 + }, + { + "epoch": 1.8269911229747346, + "grad_norm": 0.14884563853627322, + "learning_rate": 3.977616369878969e-05, + "loss": 2.7507, + "step": 29431 + }, + { + "epoch": 1.8270532000744926, + "grad_norm": 0.1457736703613325, + "learning_rate": 3.977262850763914e-05, + "loss": 2.7616, + "step": 29432 + }, + { + "epoch": 1.8271152771742503, + "grad_norm": 0.14198345776657625, + "learning_rate": 3.9769093369847246e-05, + "loss": 2.7373, + "step": 29433 + }, + { + "epoch": 1.8271773542740082, + "grad_norm": 0.13579737961446778, + "learning_rate": 3.976555828543249e-05, + "loss": 2.7298, + "step": 29434 + }, + { + "epoch": 1.827239431373766, + "grad_norm": 0.14871064578108636, + "learning_rate": 3.97620232544133e-05, + "loss": 2.8527, + "step": 29435 + }, + { + "epoch": 1.827301508473524, + "grad_norm": 0.1393462559649412, + "learning_rate": 3.975848827680811e-05, + "loss": 2.7497, + "step": 29436 + }, + { + "epoch": 1.827363585573282, + "grad_norm": 0.1555410334507104, + "learning_rate": 3.975495335263538e-05, + "loss": 2.7733, + "step": 29437 + }, + { + "epoch": 1.8274256626730399, + "grad_norm": 0.15461656793687362, + "learning_rate": 3.975141848191354e-05, + "loss": 2.8181, + "step": 29438 + }, + { + "epoch": 1.8274877397727978, + "grad_norm": 0.14981285182384801, + "learning_rate": 3.974788366466105e-05, + "loss": 2.8187, + "step": 29439 + }, + { + "epoch": 1.8275498168725557, + "grad_norm": 0.1622034914203485, + "learning_rate": 3.9744348900896324e-05, + "loss": 2.8201, + "step": 29440 + }, + { + "epoch": 1.8276118939723136, + "grad_norm": 0.14532567472107666, + "learning_rate": 3.9740814190637825e-05, + "loss": 2.7857, + "step": 29441 + }, + { + "epoch": 1.8276739710720715, + "grad_norm": 0.19318926678605225, + "learning_rate": 3.973727953390399e-05, + "loss": 2.8047, + "step": 29442 + }, + { + "epoch": 1.8277360481718294, + "grad_norm": 0.15528225423864692, + "learning_rate": 3.9733744930713266e-05, + "loss": 2.6652, + "step": 29443 + }, + { + "epoch": 1.8277981252715874, + "grad_norm": 0.17249130703093077, + "learning_rate": 3.973021038108407e-05, + "loss": 2.7025, + "step": 29444 + }, + { + "epoch": 1.8278602023713453, + "grad_norm": 0.15808485461053384, + "learning_rate": 3.9726675885034876e-05, + "loss": 2.8276, + "step": 29445 + }, + { + "epoch": 1.8279222794711032, + "grad_norm": 0.16681387790076543, + "learning_rate": 3.9723141442584085e-05, + "loss": 2.8333, + "step": 29446 + }, + { + "epoch": 1.8279843565708611, + "grad_norm": 0.14913553568937482, + "learning_rate": 3.971960705375017e-05, + "loss": 2.7248, + "step": 29447 + }, + { + "epoch": 1.828046433670619, + "grad_norm": 0.15978231039801433, + "learning_rate": 3.971607271855157e-05, + "loss": 2.7945, + "step": 29448 + }, + { + "epoch": 1.828108510770377, + "grad_norm": 0.16657067803065043, + "learning_rate": 3.971253843700672e-05, + "loss": 2.7194, + "step": 29449 + }, + { + "epoch": 1.8281705878701349, + "grad_norm": 0.1489877790633266, + "learning_rate": 3.970900420913404e-05, + "loss": 2.7639, + "step": 29450 + }, + { + "epoch": 1.8282326649698926, + "grad_norm": 0.1705185291466284, + "learning_rate": 3.9705470034951994e-05, + "loss": 2.8378, + "step": 29451 + }, + { + "epoch": 1.8282947420696505, + "grad_norm": 0.16511010441095972, + "learning_rate": 3.9701935914479e-05, + "loss": 2.6857, + "step": 29452 + }, + { + "epoch": 1.8283568191694084, + "grad_norm": 0.14899400492902815, + "learning_rate": 3.969840184773351e-05, + "loss": 2.7701, + "step": 29453 + }, + { + "epoch": 1.8284188962691663, + "grad_norm": 0.1721624712139342, + "learning_rate": 3.969486783473396e-05, + "loss": 2.7842, + "step": 29454 + }, + { + "epoch": 1.8284809733689242, + "grad_norm": 0.1450733508802033, + "learning_rate": 3.969133387549879e-05, + "loss": 2.6985, + "step": 29455 + }, + { + "epoch": 1.8285430504686822, + "grad_norm": 0.17217605948302914, + "learning_rate": 3.9687799970046424e-05, + "loss": 2.7764, + "step": 29456 + }, + { + "epoch": 1.8286051275684398, + "grad_norm": 0.15272584096184041, + "learning_rate": 3.968426611839533e-05, + "loss": 2.7615, + "step": 29457 + }, + { + "epoch": 1.8286672046681978, + "grad_norm": 0.14611502813736255, + "learning_rate": 3.96807323205639e-05, + "loss": 2.7991, + "step": 29458 + }, + { + "epoch": 1.8287292817679557, + "grad_norm": 0.1574496410389858, + "learning_rate": 3.9677198576570606e-05, + "loss": 2.8136, + "step": 29459 + }, + { + "epoch": 1.8287913588677136, + "grad_norm": 0.16595500512472752, + "learning_rate": 3.967366488643388e-05, + "loss": 2.83, + "step": 29460 + }, + { + "epoch": 1.8288534359674715, + "grad_norm": 0.14576591412056272, + "learning_rate": 3.967013125017213e-05, + "loss": 2.7701, + "step": 29461 + }, + { + "epoch": 1.8289155130672294, + "grad_norm": 0.16805096765391114, + "learning_rate": 3.966659766780383e-05, + "loss": 2.7126, + "step": 29462 + }, + { + "epoch": 1.8289775901669874, + "grad_norm": 0.16424117123087495, + "learning_rate": 3.966306413934739e-05, + "loss": 2.7156, + "step": 29463 + }, + { + "epoch": 1.8290396672667453, + "grad_norm": 0.15474267731698482, + "learning_rate": 3.965953066482126e-05, + "loss": 2.7855, + "step": 29464 + }, + { + "epoch": 1.8291017443665032, + "grad_norm": 0.16289354157066557, + "learning_rate": 3.965599724424386e-05, + "loss": 2.8034, + "step": 29465 + }, + { + "epoch": 1.829163821466261, + "grad_norm": 0.19233514224205317, + "learning_rate": 3.9652463877633645e-05, + "loss": 2.6584, + "step": 29466 + }, + { + "epoch": 1.829225898566019, + "grad_norm": 0.162910612741619, + "learning_rate": 3.9648930565009026e-05, + "loss": 2.7945, + "step": 29467 + }, + { + "epoch": 1.829287975665777, + "grad_norm": 0.14807897486477542, + "learning_rate": 3.964539730638846e-05, + "loss": 2.7487, + "step": 29468 + }, + { + "epoch": 1.8293500527655349, + "grad_norm": 0.16008263152012678, + "learning_rate": 3.964186410179036e-05, + "loss": 2.8817, + "step": 29469 + }, + { + "epoch": 1.8294121298652928, + "grad_norm": 0.1568261847783157, + "learning_rate": 3.963833095123318e-05, + "loss": 2.7388, + "step": 29470 + }, + { + "epoch": 1.8294742069650507, + "grad_norm": 0.18201073502292042, + "learning_rate": 3.963479785473534e-05, + "loss": 2.8076, + "step": 29471 + }, + { + "epoch": 1.8295362840648086, + "grad_norm": 0.14803554043623224, + "learning_rate": 3.9631264812315265e-05, + "loss": 2.8233, + "step": 29472 + }, + { + "epoch": 1.8295983611645665, + "grad_norm": 0.16868702212227477, + "learning_rate": 3.962773182399141e-05, + "loss": 2.7487, + "step": 29473 + }, + { + "epoch": 1.8296604382643245, + "grad_norm": 0.1575378038435072, + "learning_rate": 3.962419888978219e-05, + "loss": 2.7747, + "step": 29474 + }, + { + "epoch": 1.8297225153640821, + "grad_norm": 0.15802691500969457, + "learning_rate": 3.9620666009706054e-05, + "loss": 2.739, + "step": 29475 + }, + { + "epoch": 1.82978459246384, + "grad_norm": 0.15741473245627252, + "learning_rate": 3.961713318378141e-05, + "loss": 2.7847, + "step": 29476 + }, + { + "epoch": 1.829846669563598, + "grad_norm": 0.1858187139769933, + "learning_rate": 3.9613600412026705e-05, + "loss": 2.7926, + "step": 29477 + }, + { + "epoch": 1.829908746663356, + "grad_norm": 0.15107924237865328, + "learning_rate": 3.961006769446037e-05, + "loss": 2.8163, + "step": 29478 + }, + { + "epoch": 1.8299708237631138, + "grad_norm": 0.16386400431109016, + "learning_rate": 3.960653503110082e-05, + "loss": 2.6954, + "step": 29479 + }, + { + "epoch": 1.8300329008628717, + "grad_norm": 0.15930616584613078, + "learning_rate": 3.960300242196652e-05, + "loss": 2.6667, + "step": 29480 + }, + { + "epoch": 1.8300949779626294, + "grad_norm": 0.14702927648900183, + "learning_rate": 3.959946986707587e-05, + "loss": 2.7344, + "step": 29481 + }, + { + "epoch": 1.8301570550623874, + "grad_norm": 0.14444947506846192, + "learning_rate": 3.9595937366447316e-05, + "loss": 2.7968, + "step": 29482 + }, + { + "epoch": 1.8302191321621453, + "grad_norm": 0.14745294157669417, + "learning_rate": 3.959240492009928e-05, + "loss": 2.7573, + "step": 29483 + }, + { + "epoch": 1.8302812092619032, + "grad_norm": 0.1510359554745661, + "learning_rate": 3.95888725280502e-05, + "loss": 2.7388, + "step": 29484 + }, + { + "epoch": 1.830343286361661, + "grad_norm": 0.14986190498960023, + "learning_rate": 3.9585340190318496e-05, + "loss": 2.7859, + "step": 29485 + }, + { + "epoch": 1.830405363461419, + "grad_norm": 0.1556291643987307, + "learning_rate": 3.9581807906922595e-05, + "loss": 2.6825, + "step": 29486 + }, + { + "epoch": 1.830467440561177, + "grad_norm": 0.14511579792697432, + "learning_rate": 3.957827567788094e-05, + "loss": 2.7213, + "step": 29487 + }, + { + "epoch": 1.8305295176609349, + "grad_norm": 0.16441495819464116, + "learning_rate": 3.9574743503211944e-05, + "loss": 2.7599, + "step": 29488 + }, + { + "epoch": 1.8305915947606928, + "grad_norm": 0.14569050245861032, + "learning_rate": 3.957121138293405e-05, + "loss": 2.7852, + "step": 29489 + }, + { + "epoch": 1.8306536718604507, + "grad_norm": 0.14175297764353592, + "learning_rate": 3.956767931706566e-05, + "loss": 2.7253, + "step": 29490 + }, + { + "epoch": 1.8307157489602086, + "grad_norm": 0.15441409619825802, + "learning_rate": 3.956414730562524e-05, + "loss": 2.7136, + "step": 29491 + }, + { + "epoch": 1.8307778260599665, + "grad_norm": 0.14976860193667818, + "learning_rate": 3.956061534863119e-05, + "loss": 2.7489, + "step": 29492 + }, + { + "epoch": 1.8308399031597244, + "grad_norm": 0.155578667156918, + "learning_rate": 3.9557083446101944e-05, + "loss": 2.8282, + "step": 29493 + }, + { + "epoch": 1.8309019802594824, + "grad_norm": 0.14385047593884964, + "learning_rate": 3.9553551598055926e-05, + "loss": 2.77, + "step": 29494 + }, + { + "epoch": 1.8309640573592403, + "grad_norm": 0.14799892106863086, + "learning_rate": 3.955001980451157e-05, + "loss": 2.8331, + "step": 29495 + }, + { + "epoch": 1.8310261344589982, + "grad_norm": 0.20235774128089232, + "learning_rate": 3.95464880654873e-05, + "loss": 2.7357, + "step": 29496 + }, + { + "epoch": 1.8310882115587561, + "grad_norm": 0.17587977403323937, + "learning_rate": 3.954295638100153e-05, + "loss": 2.784, + "step": 29497 + }, + { + "epoch": 1.831150288658514, + "grad_norm": 0.15578101824835983, + "learning_rate": 3.953942475107271e-05, + "loss": 2.6906, + "step": 29498 + }, + { + "epoch": 1.8312123657582717, + "grad_norm": 0.17641790913215805, + "learning_rate": 3.953589317571923e-05, + "loss": 2.834, + "step": 29499 + }, + { + "epoch": 1.8312744428580296, + "grad_norm": 0.16009302752240762, + "learning_rate": 3.953236165495955e-05, + "loss": 2.8043, + "step": 29500 + }, + { + "epoch": 1.8313365199577876, + "grad_norm": 0.16643036479567708, + "learning_rate": 3.952883018881207e-05, + "loss": 2.7922, + "step": 29501 + }, + { + "epoch": 1.8313985970575455, + "grad_norm": 0.177672616478179, + "learning_rate": 3.9525298777295234e-05, + "loss": 2.8688, + "step": 29502 + }, + { + "epoch": 1.8314606741573034, + "grad_norm": 0.1603912339289534, + "learning_rate": 3.952176742042745e-05, + "loss": 2.7665, + "step": 29503 + }, + { + "epoch": 1.8315227512570613, + "grad_norm": 0.14424101582222618, + "learning_rate": 3.9518236118227154e-05, + "loss": 2.7716, + "step": 29504 + }, + { + "epoch": 1.831584828356819, + "grad_norm": 0.15691330309136392, + "learning_rate": 3.951470487071276e-05, + "loss": 2.8426, + "step": 29505 + }, + { + "epoch": 1.831646905456577, + "grad_norm": 0.1499255283580488, + "learning_rate": 3.95111736779027e-05, + "loss": 2.8229, + "step": 29506 + }, + { + "epoch": 1.8317089825563349, + "grad_norm": 0.15030614536121073, + "learning_rate": 3.9507642539815394e-05, + "loss": 2.7298, + "step": 29507 + }, + { + "epoch": 1.8317710596560928, + "grad_norm": 0.31065472343583184, + "learning_rate": 3.9504111456469264e-05, + "loss": 2.7229, + "step": 29508 + }, + { + "epoch": 1.8318331367558507, + "grad_norm": 0.1494365438365301, + "learning_rate": 3.9500580427882737e-05, + "loss": 2.6437, + "step": 29509 + }, + { + "epoch": 1.8318952138556086, + "grad_norm": 0.1703214988605936, + "learning_rate": 3.949704945407422e-05, + "loss": 2.7167, + "step": 29510 + }, + { + "epoch": 1.8319572909553665, + "grad_norm": 0.1723087321731846, + "learning_rate": 3.9493518535062154e-05, + "loss": 2.7306, + "step": 29511 + }, + { + "epoch": 1.8320193680551244, + "grad_norm": 0.24607462278845066, + "learning_rate": 3.948998767086493e-05, + "loss": 2.787, + "step": 29512 + }, + { + "epoch": 1.8320814451548824, + "grad_norm": 0.15024864009037053, + "learning_rate": 3.948645686150101e-05, + "loss": 2.7417, + "step": 29513 + }, + { + "epoch": 1.8321435222546403, + "grad_norm": 0.16437253155142134, + "learning_rate": 3.94829261069888e-05, + "loss": 2.7433, + "step": 29514 + }, + { + "epoch": 1.8322055993543982, + "grad_norm": 0.16219780130324882, + "learning_rate": 3.947939540734671e-05, + "loss": 2.7018, + "step": 29515 + }, + { + "epoch": 1.8322676764541561, + "grad_norm": 0.16084871968258427, + "learning_rate": 3.9475864762593175e-05, + "loss": 2.8635, + "step": 29516 + }, + { + "epoch": 1.832329753553914, + "grad_norm": 0.17093010745198967, + "learning_rate": 3.94723341727466e-05, + "loss": 2.7919, + "step": 29517 + }, + { + "epoch": 1.832391830653672, + "grad_norm": 0.16175225574204344, + "learning_rate": 3.9468803637825424e-05, + "loss": 2.7877, + "step": 29518 + }, + { + "epoch": 1.8324539077534299, + "grad_norm": 0.1598791481584214, + "learning_rate": 3.946527315784805e-05, + "loss": 2.7601, + "step": 29519 + }, + { + "epoch": 1.8325159848531878, + "grad_norm": 0.1593213299533745, + "learning_rate": 3.9461742732832906e-05, + "loss": 2.7471, + "step": 29520 + }, + { + "epoch": 1.8325780619529457, + "grad_norm": 0.16003451139996192, + "learning_rate": 3.9458212362798415e-05, + "loss": 2.7659, + "step": 29521 + }, + { + "epoch": 1.8326401390527036, + "grad_norm": 0.15309963016153885, + "learning_rate": 3.945468204776297e-05, + "loss": 2.7146, + "step": 29522 + }, + { + "epoch": 1.8327022161524613, + "grad_norm": 0.1408325094795207, + "learning_rate": 3.945115178774503e-05, + "loss": 2.6808, + "step": 29523 + }, + { + "epoch": 1.8327642932522192, + "grad_norm": 0.14569052163602747, + "learning_rate": 3.944762158276298e-05, + "loss": 2.8236, + "step": 29524 + }, + { + "epoch": 1.8328263703519772, + "grad_norm": 0.14236548649323785, + "learning_rate": 3.944409143283526e-05, + "loss": 2.7339, + "step": 29525 + }, + { + "epoch": 1.832888447451735, + "grad_norm": 0.13931209233978648, + "learning_rate": 3.944056133798026e-05, + "loss": 2.7264, + "step": 29526 + }, + { + "epoch": 1.832950524551493, + "grad_norm": 0.1457783980157035, + "learning_rate": 3.943703129821643e-05, + "loss": 2.7269, + "step": 29527 + }, + { + "epoch": 1.833012601651251, + "grad_norm": 0.17527104083863254, + "learning_rate": 3.943350131356216e-05, + "loss": 2.8035, + "step": 29528 + }, + { + "epoch": 1.8330746787510086, + "grad_norm": 0.15823792439831824, + "learning_rate": 3.942997138403589e-05, + "loss": 2.7328, + "step": 29529 + }, + { + "epoch": 1.8331367558507665, + "grad_norm": 0.18374544329410428, + "learning_rate": 3.9426441509656016e-05, + "loss": 2.8597, + "step": 29530 + }, + { + "epoch": 1.8331988329505244, + "grad_norm": 0.1395450289203393, + "learning_rate": 3.942291169044097e-05, + "loss": 2.7889, + "step": 29531 + }, + { + "epoch": 1.8332609100502824, + "grad_norm": 0.15933545220318224, + "learning_rate": 3.9419381926409156e-05, + "loss": 2.7884, + "step": 29532 + }, + { + "epoch": 1.8333229871500403, + "grad_norm": 0.15928960878077483, + "learning_rate": 3.9415852217579005e-05, + "loss": 2.7378, + "step": 29533 + }, + { + "epoch": 1.8333850642497982, + "grad_norm": 0.14323009721858285, + "learning_rate": 3.941232256396892e-05, + "loss": 2.7416, + "step": 29534 + }, + { + "epoch": 1.833447141349556, + "grad_norm": 0.14476013746529987, + "learning_rate": 3.9408792965597305e-05, + "loss": 2.7036, + "step": 29535 + }, + { + "epoch": 1.833509218449314, + "grad_norm": 0.14599938532458206, + "learning_rate": 3.9405263422482605e-05, + "loss": 2.7837, + "step": 29536 + }, + { + "epoch": 1.833571295549072, + "grad_norm": 0.14345756800728365, + "learning_rate": 3.94017339346432e-05, + "loss": 2.6488, + "step": 29537 + }, + { + "epoch": 1.8336333726488299, + "grad_norm": 0.16357980974805356, + "learning_rate": 3.939820450209754e-05, + "loss": 2.7019, + "step": 29538 + }, + { + "epoch": 1.8336954497485878, + "grad_norm": 0.15957826316758042, + "learning_rate": 3.939467512486401e-05, + "loss": 2.7913, + "step": 29539 + }, + { + "epoch": 1.8337575268483457, + "grad_norm": 0.1382965933374828, + "learning_rate": 3.9391145802961036e-05, + "loss": 2.7027, + "step": 29540 + }, + { + "epoch": 1.8338196039481036, + "grad_norm": 0.15393833561777734, + "learning_rate": 3.938761653640703e-05, + "loss": 2.7917, + "step": 29541 + }, + { + "epoch": 1.8338816810478615, + "grad_norm": 0.15169437486663592, + "learning_rate": 3.9384087325220407e-05, + "loss": 2.7963, + "step": 29542 + }, + { + "epoch": 1.8339437581476195, + "grad_norm": 0.14397741216050428, + "learning_rate": 3.938055816941957e-05, + "loss": 2.706, + "step": 29543 + }, + { + "epoch": 1.8340058352473774, + "grad_norm": 0.17738478727738854, + "learning_rate": 3.937702906902295e-05, + "loss": 2.816, + "step": 29544 + }, + { + "epoch": 1.8340679123471353, + "grad_norm": 0.14471644058559513, + "learning_rate": 3.937350002404893e-05, + "loss": 2.7874, + "step": 29545 + }, + { + "epoch": 1.8341299894468932, + "grad_norm": 0.18090460975774525, + "learning_rate": 3.936997103451596e-05, + "loss": 2.7288, + "step": 29546 + }, + { + "epoch": 1.834192066546651, + "grad_norm": 0.16155014901201548, + "learning_rate": 3.936644210044242e-05, + "loss": 2.7801, + "step": 29547 + }, + { + "epoch": 1.8342541436464088, + "grad_norm": 0.14887718283240495, + "learning_rate": 3.936291322184674e-05, + "loss": 2.7109, + "step": 29548 + }, + { + "epoch": 1.8343162207461667, + "grad_norm": 0.16425352669294427, + "learning_rate": 3.935938439874732e-05, + "loss": 2.7857, + "step": 29549 + }, + { + "epoch": 1.8343782978459247, + "grad_norm": 0.1641305600638698, + "learning_rate": 3.935585563116258e-05, + "loss": 2.7487, + "step": 29550 + }, + { + "epoch": 1.8344403749456826, + "grad_norm": 0.15213849839542753, + "learning_rate": 3.9352326919110915e-05, + "loss": 2.823, + "step": 29551 + }, + { + "epoch": 1.8345024520454405, + "grad_norm": 0.16042913472554302, + "learning_rate": 3.934879826261076e-05, + "loss": 2.8043, + "step": 29552 + }, + { + "epoch": 1.8345645291451982, + "grad_norm": 0.17592114603759104, + "learning_rate": 3.9345269661680494e-05, + "loss": 2.7031, + "step": 29553 + }, + { + "epoch": 1.834626606244956, + "grad_norm": 0.1555938533653135, + "learning_rate": 3.934174111633856e-05, + "loss": 2.8516, + "step": 29554 + }, + { + "epoch": 1.834688683344714, + "grad_norm": 0.16285929664478907, + "learning_rate": 3.933821262660333e-05, + "loss": 2.7296, + "step": 29555 + }, + { + "epoch": 1.834750760444472, + "grad_norm": 0.1568510416437108, + "learning_rate": 3.9334684192493256e-05, + "loss": 2.7809, + "step": 29556 + }, + { + "epoch": 1.8348128375442299, + "grad_norm": 0.14670307998949067, + "learning_rate": 3.93311558140267e-05, + "loss": 2.6787, + "step": 29557 + }, + { + "epoch": 1.8348749146439878, + "grad_norm": 0.15135116492347048, + "learning_rate": 3.9327627491222114e-05, + "loss": 2.7262, + "step": 29558 + }, + { + "epoch": 1.8349369917437457, + "grad_norm": 0.16476015642586175, + "learning_rate": 3.932409922409788e-05, + "loss": 2.7777, + "step": 29559 + }, + { + "epoch": 1.8349990688435036, + "grad_norm": 0.15136305280391207, + "learning_rate": 3.9320571012672404e-05, + "loss": 2.8691, + "step": 29560 + }, + { + "epoch": 1.8350611459432615, + "grad_norm": 0.18335648687596834, + "learning_rate": 3.9317042856964115e-05, + "loss": 2.713, + "step": 29561 + }, + { + "epoch": 1.8351232230430194, + "grad_norm": 0.15475597768425067, + "learning_rate": 3.9313514756991394e-05, + "loss": 2.7971, + "step": 29562 + }, + { + "epoch": 1.8351853001427774, + "grad_norm": 0.14699304635686306, + "learning_rate": 3.930998671277267e-05, + "loss": 2.7735, + "step": 29563 + }, + { + "epoch": 1.8352473772425353, + "grad_norm": 0.15880260972794563, + "learning_rate": 3.9306458724326326e-05, + "loss": 2.7346, + "step": 29564 + }, + { + "epoch": 1.8353094543422932, + "grad_norm": 0.14566559525550615, + "learning_rate": 3.93029307916708e-05, + "loss": 2.6859, + "step": 29565 + }, + { + "epoch": 1.8353715314420511, + "grad_norm": 0.14262278393035968, + "learning_rate": 3.929940291482447e-05, + "loss": 2.8757, + "step": 29566 + }, + { + "epoch": 1.835433608541809, + "grad_norm": 0.1512054502180871, + "learning_rate": 3.9295875093805764e-05, + "loss": 2.8241, + "step": 29567 + }, + { + "epoch": 1.835495685641567, + "grad_norm": 0.14165096736502572, + "learning_rate": 3.929234732863306e-05, + "loss": 2.7596, + "step": 29568 + }, + { + "epoch": 1.8355577627413249, + "grad_norm": 0.15916784511582613, + "learning_rate": 3.9288819619324796e-05, + "loss": 2.7962, + "step": 29569 + }, + { + "epoch": 1.8356198398410828, + "grad_norm": 0.1519135060671687, + "learning_rate": 3.9285291965899354e-05, + "loss": 2.8346, + "step": 29570 + }, + { + "epoch": 1.8356819169408405, + "grad_norm": 0.15587379943336366, + "learning_rate": 3.928176436837514e-05, + "loss": 2.7415, + "step": 29571 + }, + { + "epoch": 1.8357439940405984, + "grad_norm": 0.14622873251490426, + "learning_rate": 3.927823682677057e-05, + "loss": 2.7332, + "step": 29572 + }, + { + "epoch": 1.8358060711403563, + "grad_norm": 0.14580911125818705, + "learning_rate": 3.9274709341104034e-05, + "loss": 2.7122, + "step": 29573 + }, + { + "epoch": 1.8358681482401142, + "grad_norm": 0.17900218057503012, + "learning_rate": 3.927118191139395e-05, + "loss": 2.8436, + "step": 29574 + }, + { + "epoch": 1.8359302253398722, + "grad_norm": 0.1416150842984893, + "learning_rate": 3.926765453765871e-05, + "loss": 2.78, + "step": 29575 + }, + { + "epoch": 1.83599230243963, + "grad_norm": 0.149111153971439, + "learning_rate": 3.9264127219916726e-05, + "loss": 2.797, + "step": 29576 + }, + { + "epoch": 1.8360543795393878, + "grad_norm": 0.14615730647320455, + "learning_rate": 3.9260599958186376e-05, + "loss": 2.8762, + "step": 29577 + }, + { + "epoch": 1.8361164566391457, + "grad_norm": 0.1595426470097602, + "learning_rate": 3.92570727524861e-05, + "loss": 2.7204, + "step": 29578 + }, + { + "epoch": 1.8361785337389036, + "grad_norm": 0.15038755958702754, + "learning_rate": 3.9253545602834286e-05, + "loss": 2.8288, + "step": 29579 + }, + { + "epoch": 1.8362406108386615, + "grad_norm": 0.14596275280601181, + "learning_rate": 3.925001850924932e-05, + "loss": 2.7098, + "step": 29580 + }, + { + "epoch": 1.8363026879384194, + "grad_norm": 0.14303968785479643, + "learning_rate": 3.924649147174963e-05, + "loss": 2.7097, + "step": 29581 + }, + { + "epoch": 1.8363647650381774, + "grad_norm": 0.14520136675092882, + "learning_rate": 3.9242964490353606e-05, + "loss": 2.8181, + "step": 29582 + }, + { + "epoch": 1.8364268421379353, + "grad_norm": 0.14679654872019565, + "learning_rate": 3.923943756507965e-05, + "loss": 2.6886, + "step": 29583 + }, + { + "epoch": 1.8364889192376932, + "grad_norm": 0.14172495365759455, + "learning_rate": 3.923591069594616e-05, + "loss": 2.7125, + "step": 29584 + }, + { + "epoch": 1.8365509963374511, + "grad_norm": 0.1426432604479933, + "learning_rate": 3.9232383882971526e-05, + "loss": 2.6847, + "step": 29585 + }, + { + "epoch": 1.836613073437209, + "grad_norm": 0.15320125744044635, + "learning_rate": 3.9228857126174166e-05, + "loss": 2.7997, + "step": 29586 + }, + { + "epoch": 1.836675150536967, + "grad_norm": 0.15449171801663328, + "learning_rate": 3.9225330425572474e-05, + "loss": 2.7206, + "step": 29587 + }, + { + "epoch": 1.8367372276367249, + "grad_norm": 0.14809625702857207, + "learning_rate": 3.922180378118485e-05, + "loss": 2.8631, + "step": 29588 + }, + { + "epoch": 1.8367993047364828, + "grad_norm": 0.1488013200825989, + "learning_rate": 3.921827719302968e-05, + "loss": 2.7954, + "step": 29589 + }, + { + "epoch": 1.8368613818362407, + "grad_norm": 0.14248934032490948, + "learning_rate": 3.921475066112539e-05, + "loss": 2.7647, + "step": 29590 + }, + { + "epoch": 1.8369234589359986, + "grad_norm": 0.14642437023771857, + "learning_rate": 3.921122418549035e-05, + "loss": 2.7358, + "step": 29591 + }, + { + "epoch": 1.8369855360357565, + "grad_norm": 0.16472744725952923, + "learning_rate": 3.9207697766142994e-05, + "loss": 2.7868, + "step": 29592 + }, + { + "epoch": 1.8370476131355145, + "grad_norm": 0.15996756720399058, + "learning_rate": 3.920417140310167e-05, + "loss": 2.6775, + "step": 29593 + }, + { + "epoch": 1.8371096902352724, + "grad_norm": 0.14670969482160476, + "learning_rate": 3.920064509638483e-05, + "loss": 2.8201, + "step": 29594 + }, + { + "epoch": 1.83717176733503, + "grad_norm": 0.14898692892493062, + "learning_rate": 3.919711884601084e-05, + "loss": 2.7305, + "step": 29595 + }, + { + "epoch": 1.837233844434788, + "grad_norm": 0.17388892985496388, + "learning_rate": 3.91935926519981e-05, + "loss": 2.7872, + "step": 29596 + }, + { + "epoch": 1.837295921534546, + "grad_norm": 0.160159855313543, + "learning_rate": 3.919006651436501e-05, + "loss": 2.7897, + "step": 29597 + }, + { + "epoch": 1.8373579986343038, + "grad_norm": 0.1609042077674849, + "learning_rate": 3.918654043312996e-05, + "loss": 2.7649, + "step": 29598 + }, + { + "epoch": 1.8374200757340617, + "grad_norm": 0.14098072095547548, + "learning_rate": 3.918301440831137e-05, + "loss": 2.7346, + "step": 29599 + }, + { + "epoch": 1.8374821528338194, + "grad_norm": 0.186334184432119, + "learning_rate": 3.91794884399276e-05, + "loss": 2.8039, + "step": 29600 + }, + { + "epoch": 1.8375442299335774, + "grad_norm": 0.1614545788379698, + "learning_rate": 3.917596252799709e-05, + "loss": 2.7575, + "step": 29601 + }, + { + "epoch": 1.8376063070333353, + "grad_norm": 0.16006936647425216, + "learning_rate": 3.9172436672538187e-05, + "loss": 2.8752, + "step": 29602 + }, + { + "epoch": 1.8376683841330932, + "grad_norm": 0.16026870919527467, + "learning_rate": 3.916891087356933e-05, + "loss": 2.7045, + "step": 29603 + }, + { + "epoch": 1.837730461232851, + "grad_norm": 0.14500266596793263, + "learning_rate": 3.916538513110888e-05, + "loss": 2.8543, + "step": 29604 + }, + { + "epoch": 1.837792538332609, + "grad_norm": 0.1554169022065731, + "learning_rate": 3.916185944517525e-05, + "loss": 2.7661, + "step": 29605 + }, + { + "epoch": 1.837854615432367, + "grad_norm": 0.16036310444784166, + "learning_rate": 3.915833381578684e-05, + "loss": 2.7867, + "step": 29606 + }, + { + "epoch": 1.8379166925321249, + "grad_norm": 0.15175835303722132, + "learning_rate": 3.9154808242962024e-05, + "loss": 2.7285, + "step": 29607 + }, + { + "epoch": 1.8379787696318828, + "grad_norm": 0.1417046007623611, + "learning_rate": 3.9151282726719205e-05, + "loss": 2.7933, + "step": 29608 + }, + { + "epoch": 1.8380408467316407, + "grad_norm": 0.15098445249916922, + "learning_rate": 3.914775726707678e-05, + "loss": 2.7278, + "step": 29609 + }, + { + "epoch": 1.8381029238313986, + "grad_norm": 0.1515335276529483, + "learning_rate": 3.914423186405313e-05, + "loss": 2.7354, + "step": 29610 + }, + { + "epoch": 1.8381650009311565, + "grad_norm": 0.1637985911146148, + "learning_rate": 3.914070651766667e-05, + "loss": 2.7383, + "step": 29611 + }, + { + "epoch": 1.8382270780309145, + "grad_norm": 0.14454474257303512, + "learning_rate": 3.913718122793577e-05, + "loss": 2.7917, + "step": 29612 + }, + { + "epoch": 1.8382891551306724, + "grad_norm": 0.15679178483395767, + "learning_rate": 3.9133655994878846e-05, + "loss": 2.7285, + "step": 29613 + }, + { + "epoch": 1.8383512322304303, + "grad_norm": 0.1480796729825345, + "learning_rate": 3.9130130818514265e-05, + "loss": 2.8229, + "step": 29614 + }, + { + "epoch": 1.8384133093301882, + "grad_norm": 0.15088240610155945, + "learning_rate": 3.912660569886044e-05, + "loss": 2.7598, + "step": 29615 + }, + { + "epoch": 1.8384753864299461, + "grad_norm": 0.15837026526630504, + "learning_rate": 3.912308063593575e-05, + "loss": 2.6719, + "step": 29616 + }, + { + "epoch": 1.838537463529704, + "grad_norm": 0.1448781772755533, + "learning_rate": 3.911955562975859e-05, + "loss": 2.716, + "step": 29617 + }, + { + "epoch": 1.8385995406294617, + "grad_norm": 0.16227183187310318, + "learning_rate": 3.9116030680347344e-05, + "loss": 2.841, + "step": 29618 + }, + { + "epoch": 1.8386616177292197, + "grad_norm": 0.1653599919910774, + "learning_rate": 3.9112505787720416e-05, + "loss": 2.7723, + "step": 29619 + }, + { + "epoch": 1.8387236948289776, + "grad_norm": 0.14241651625394644, + "learning_rate": 3.910898095189619e-05, + "loss": 2.7696, + "step": 29620 + }, + { + "epoch": 1.8387857719287355, + "grad_norm": 0.16915081733494305, + "learning_rate": 3.9105456172893045e-05, + "loss": 2.6913, + "step": 29621 + }, + { + "epoch": 1.8388478490284934, + "grad_norm": 0.14176728646603848, + "learning_rate": 3.910193145072939e-05, + "loss": 2.7771, + "step": 29622 + }, + { + "epoch": 1.8389099261282513, + "grad_norm": 0.15033943370164807, + "learning_rate": 3.9098406785423594e-05, + "loss": 2.8005, + "step": 29623 + }, + { + "epoch": 1.838972003228009, + "grad_norm": 0.14993541116718398, + "learning_rate": 3.909488217699406e-05, + "loss": 2.8527, + "step": 29624 + }, + { + "epoch": 1.839034080327767, + "grad_norm": 0.16772208843529998, + "learning_rate": 3.909135762545917e-05, + "loss": 2.7959, + "step": 29625 + }, + { + "epoch": 1.8390961574275249, + "grad_norm": 0.14776661119215664, + "learning_rate": 3.908783313083732e-05, + "loss": 2.7126, + "step": 29626 + }, + { + "epoch": 1.8391582345272828, + "grad_norm": 0.14901576216144294, + "learning_rate": 3.9084308693146884e-05, + "loss": 2.7661, + "step": 29627 + }, + { + "epoch": 1.8392203116270407, + "grad_norm": 0.14187330939214096, + "learning_rate": 3.908078431240628e-05, + "loss": 2.751, + "step": 29628 + }, + { + "epoch": 1.8392823887267986, + "grad_norm": 0.15229705559549672, + "learning_rate": 3.907725998863385e-05, + "loss": 2.8287, + "step": 29629 + }, + { + "epoch": 1.8393444658265565, + "grad_norm": 0.1453654854064346, + "learning_rate": 3.9073735721848024e-05, + "loss": 2.7796, + "step": 29630 + }, + { + "epoch": 1.8394065429263144, + "grad_norm": 0.15176042728798025, + "learning_rate": 3.9070211512067176e-05, + "loss": 2.783, + "step": 29631 + }, + { + "epoch": 1.8394686200260724, + "grad_norm": 0.1479040460899891, + "learning_rate": 3.906668735930967e-05, + "loss": 2.7556, + "step": 29632 + }, + { + "epoch": 1.8395306971258303, + "grad_norm": 0.15489604359475467, + "learning_rate": 3.906316326359393e-05, + "loss": 2.7942, + "step": 29633 + }, + { + "epoch": 1.8395927742255882, + "grad_norm": 0.14821077901151983, + "learning_rate": 3.90596392249383e-05, + "loss": 2.7822, + "step": 29634 + }, + { + "epoch": 1.8396548513253461, + "grad_norm": 0.16705136059415618, + "learning_rate": 3.90561152433612e-05, + "loss": 2.9053, + "step": 29635 + }, + { + "epoch": 1.839716928425104, + "grad_norm": 0.16487222059544773, + "learning_rate": 3.9052591318881e-05, + "loss": 2.731, + "step": 29636 + }, + { + "epoch": 1.839779005524862, + "grad_norm": 0.14786238057559573, + "learning_rate": 3.90490674515161e-05, + "loss": 2.8573, + "step": 29637 + }, + { + "epoch": 1.8398410826246199, + "grad_norm": 0.16300420940410928, + "learning_rate": 3.904554364128485e-05, + "loss": 2.752, + "step": 29638 + }, + { + "epoch": 1.8399031597243778, + "grad_norm": 0.14839509935633538, + "learning_rate": 3.904201988820568e-05, + "loss": 2.7157, + "step": 29639 + }, + { + "epoch": 1.8399652368241357, + "grad_norm": 0.14618717562071523, + "learning_rate": 3.9038496192296935e-05, + "loss": 2.813, + "step": 29640 + }, + { + "epoch": 1.8400273139238936, + "grad_norm": 0.15845087455789233, + "learning_rate": 3.903497255357703e-05, + "loss": 2.8186, + "step": 29641 + }, + { + "epoch": 1.8400893910236513, + "grad_norm": 0.1484950355653641, + "learning_rate": 3.903144897206433e-05, + "loss": 2.6991, + "step": 29642 + }, + { + "epoch": 1.8401514681234092, + "grad_norm": 0.14699077811314376, + "learning_rate": 3.9027925447777206e-05, + "loss": 2.7453, + "step": 29643 + }, + { + "epoch": 1.8402135452231672, + "grad_norm": 0.14828646655699737, + "learning_rate": 3.9024401980734084e-05, + "loss": 2.8565, + "step": 29644 + }, + { + "epoch": 1.840275622322925, + "grad_norm": 0.1633507411869913, + "learning_rate": 3.9020878570953316e-05, + "loss": 2.8148, + "step": 29645 + }, + { + "epoch": 1.840337699422683, + "grad_norm": 0.1484445457292136, + "learning_rate": 3.901735521845329e-05, + "loss": 2.7511, + "step": 29646 + }, + { + "epoch": 1.840399776522441, + "grad_norm": 0.1487198077543172, + "learning_rate": 3.901383192325239e-05, + "loss": 2.6888, + "step": 29647 + }, + { + "epoch": 1.8404618536221986, + "grad_norm": 0.14814834916709083, + "learning_rate": 3.901030868536898e-05, + "loss": 2.7437, + "step": 29648 + }, + { + "epoch": 1.8405239307219565, + "grad_norm": 0.14323964875330053, + "learning_rate": 3.900678550482147e-05, + "loss": 2.7996, + "step": 29649 + }, + { + "epoch": 1.8405860078217144, + "grad_norm": 0.16623181123474914, + "learning_rate": 3.900326238162823e-05, + "loss": 2.8416, + "step": 29650 + }, + { + "epoch": 1.8406480849214724, + "grad_norm": 0.15477046234951453, + "learning_rate": 3.899973931580764e-05, + "loss": 2.6559, + "step": 29651 + }, + { + "epoch": 1.8407101620212303, + "grad_norm": 0.15125000016748413, + "learning_rate": 3.8996216307378073e-05, + "loss": 2.7111, + "step": 29652 + }, + { + "epoch": 1.8407722391209882, + "grad_norm": 0.1558679320202072, + "learning_rate": 3.8992693356357926e-05, + "loss": 2.8591, + "step": 29653 + }, + { + "epoch": 1.8408343162207461, + "grad_norm": 0.16543359267704602, + "learning_rate": 3.898917046276556e-05, + "loss": 2.7032, + "step": 29654 + }, + { + "epoch": 1.840896393320504, + "grad_norm": 0.17428630814779864, + "learning_rate": 3.898564762661938e-05, + "loss": 2.8158, + "step": 29655 + }, + { + "epoch": 1.840958470420262, + "grad_norm": 0.15083002329411965, + "learning_rate": 3.898212484793774e-05, + "loss": 2.8394, + "step": 29656 + }, + { + "epoch": 1.8410205475200199, + "grad_norm": 0.17200593404354503, + "learning_rate": 3.8978602126739024e-05, + "loss": 2.74, + "step": 29657 + }, + { + "epoch": 1.8410826246197778, + "grad_norm": 0.14758548150777887, + "learning_rate": 3.897507946304163e-05, + "loss": 2.6681, + "step": 29658 + }, + { + "epoch": 1.8411447017195357, + "grad_norm": 0.1489578900352853, + "learning_rate": 3.8971556856863914e-05, + "loss": 2.7012, + "step": 29659 + }, + { + "epoch": 1.8412067788192936, + "grad_norm": 0.1514823598466985, + "learning_rate": 3.896803430822427e-05, + "loss": 2.7869, + "step": 29660 + }, + { + "epoch": 1.8412688559190515, + "grad_norm": 0.148621495301013, + "learning_rate": 3.896451181714105e-05, + "loss": 2.776, + "step": 29661 + }, + { + "epoch": 1.8413309330188095, + "grad_norm": 0.16616458941415616, + "learning_rate": 3.8960989383632665e-05, + "loss": 2.7027, + "step": 29662 + }, + { + "epoch": 1.8413930101185674, + "grad_norm": 0.14685851953246962, + "learning_rate": 3.895746700771748e-05, + "loss": 2.7389, + "step": 29663 + }, + { + "epoch": 1.8414550872183253, + "grad_norm": 0.17341342492589373, + "learning_rate": 3.8953944689413865e-05, + "loss": 2.7899, + "step": 29664 + }, + { + "epoch": 1.8415171643180832, + "grad_norm": 0.15612543624545158, + "learning_rate": 3.89504224287402e-05, + "loss": 2.8279, + "step": 29665 + }, + { + "epoch": 1.841579241417841, + "grad_norm": 0.15409001233386158, + "learning_rate": 3.894690022571488e-05, + "loss": 2.8998, + "step": 29666 + }, + { + "epoch": 1.8416413185175988, + "grad_norm": 0.1536970672747691, + "learning_rate": 3.8943378080356244e-05, + "loss": 2.715, + "step": 29667 + }, + { + "epoch": 1.8417033956173567, + "grad_norm": 0.15083446280199686, + "learning_rate": 3.89398559926827e-05, + "loss": 2.7066, + "step": 29668 + }, + { + "epoch": 1.8417654727171147, + "grad_norm": 0.15410827626928603, + "learning_rate": 3.8936333962712615e-05, + "loss": 2.7722, + "step": 29669 + }, + { + "epoch": 1.8418275498168726, + "grad_norm": 0.15364327401653585, + "learning_rate": 3.8932811990464354e-05, + "loss": 2.7487, + "step": 29670 + }, + { + "epoch": 1.8418896269166305, + "grad_norm": 0.15146028055564192, + "learning_rate": 3.89292900759563e-05, + "loss": 2.7875, + "step": 29671 + }, + { + "epoch": 1.8419517040163882, + "grad_norm": 0.22862536900700678, + "learning_rate": 3.8925768219206826e-05, + "loss": 2.7843, + "step": 29672 + }, + { + "epoch": 1.842013781116146, + "grad_norm": 0.16345648442870994, + "learning_rate": 3.892224642023431e-05, + "loss": 2.7963, + "step": 29673 + }, + { + "epoch": 1.842075858215904, + "grad_norm": 0.14663365704703119, + "learning_rate": 3.891872467905713e-05, + "loss": 2.718, + "step": 29674 + }, + { + "epoch": 1.842137935315662, + "grad_norm": 0.15704638659653963, + "learning_rate": 3.891520299569364e-05, + "loss": 2.7847, + "step": 29675 + }, + { + "epoch": 1.8422000124154199, + "grad_norm": 0.14181799954568894, + "learning_rate": 3.8911681370162224e-05, + "loss": 2.7512, + "step": 29676 + }, + { + "epoch": 1.8422620895151778, + "grad_norm": 0.15420370669172592, + "learning_rate": 3.8908159802481266e-05, + "loss": 2.7713, + "step": 29677 + }, + { + "epoch": 1.8423241666149357, + "grad_norm": 0.14248411789653484, + "learning_rate": 3.890463829266914e-05, + "loss": 2.8247, + "step": 29678 + }, + { + "epoch": 1.8423862437146936, + "grad_norm": 0.14007878185826683, + "learning_rate": 3.89011168407442e-05, + "loss": 2.6748, + "step": 29679 + }, + { + "epoch": 1.8424483208144515, + "grad_norm": 0.1601686647302028, + "learning_rate": 3.8897595446724836e-05, + "loss": 2.7945, + "step": 29680 + }, + { + "epoch": 1.8425103979142095, + "grad_norm": 0.18010385623226324, + "learning_rate": 3.889407411062941e-05, + "loss": 2.8468, + "step": 29681 + }, + { + "epoch": 1.8425724750139674, + "grad_norm": 0.13734333557960385, + "learning_rate": 3.889055283247628e-05, + "loss": 2.7897, + "step": 29682 + }, + { + "epoch": 1.8426345521137253, + "grad_norm": 0.17937050722390538, + "learning_rate": 3.888703161228385e-05, + "loss": 2.7516, + "step": 29683 + }, + { + "epoch": 1.8426966292134832, + "grad_norm": 0.15249333618214542, + "learning_rate": 3.8883510450070464e-05, + "loss": 2.8101, + "step": 29684 + }, + { + "epoch": 1.8427587063132411, + "grad_norm": 0.15020317930099344, + "learning_rate": 3.887998934585451e-05, + "loss": 2.6561, + "step": 29685 + }, + { + "epoch": 1.842820783412999, + "grad_norm": 0.15267815323444128, + "learning_rate": 3.887646829965434e-05, + "loss": 2.8102, + "step": 29686 + }, + { + "epoch": 1.842882860512757, + "grad_norm": 0.1467258307136741, + "learning_rate": 3.887294731148835e-05, + "loss": 2.7518, + "step": 29687 + }, + { + "epoch": 1.8429449376125149, + "grad_norm": 0.16484607609016158, + "learning_rate": 3.8869426381374886e-05, + "loss": 2.7733, + "step": 29688 + }, + { + "epoch": 1.8430070147122728, + "grad_norm": 0.14232598833554744, + "learning_rate": 3.886590550933233e-05, + "loss": 2.7709, + "step": 29689 + }, + { + "epoch": 1.8430690918120305, + "grad_norm": 0.14470002269518964, + "learning_rate": 3.886238469537904e-05, + "loss": 2.7961, + "step": 29690 + }, + { + "epoch": 1.8431311689117884, + "grad_norm": 0.16169785350744345, + "learning_rate": 3.8858863939533404e-05, + "loss": 2.7865, + "step": 29691 + }, + { + "epoch": 1.8431932460115463, + "grad_norm": 0.14593334827194657, + "learning_rate": 3.885534324181378e-05, + "loss": 2.8066, + "step": 29692 + }, + { + "epoch": 1.8432553231113042, + "grad_norm": 0.17576753244917317, + "learning_rate": 3.885182260223853e-05, + "loss": 2.7612, + "step": 29693 + }, + { + "epoch": 1.8433174002110622, + "grad_norm": 0.23055870919020882, + "learning_rate": 3.884830202082603e-05, + "loss": 2.7942, + "step": 29694 + }, + { + "epoch": 1.84337947731082, + "grad_norm": 0.18230370527206788, + "learning_rate": 3.884478149759465e-05, + "loss": 2.6896, + "step": 29695 + }, + { + "epoch": 1.8434415544105778, + "grad_norm": 0.14697350536620762, + "learning_rate": 3.884126103256275e-05, + "loss": 2.7483, + "step": 29696 + }, + { + "epoch": 1.8435036315103357, + "grad_norm": 0.17109039969626014, + "learning_rate": 3.8837740625748685e-05, + "loss": 2.7316, + "step": 29697 + }, + { + "epoch": 1.8435657086100936, + "grad_norm": 0.17255265852301388, + "learning_rate": 3.883422027717086e-05, + "loss": 2.7502, + "step": 29698 + }, + { + "epoch": 1.8436277857098515, + "grad_norm": 0.19257418800647333, + "learning_rate": 3.88306999868476e-05, + "loss": 2.7934, + "step": 29699 + }, + { + "epoch": 1.8436898628096094, + "grad_norm": 0.1694129689440573, + "learning_rate": 3.88271797547973e-05, + "loss": 2.77, + "step": 29700 + }, + { + "epoch": 1.8437519399093674, + "grad_norm": 0.1545617630517176, + "learning_rate": 3.882365958103831e-05, + "loss": 2.7894, + "step": 29701 + }, + { + "epoch": 1.8438140170091253, + "grad_norm": 0.15687841170427197, + "learning_rate": 3.8820139465589005e-05, + "loss": 2.6987, + "step": 29702 + }, + { + "epoch": 1.8438760941088832, + "grad_norm": 0.1630306892696139, + "learning_rate": 3.881661940846774e-05, + "loss": 2.8013, + "step": 29703 + }, + { + "epoch": 1.8439381712086411, + "grad_norm": 0.2057070941592821, + "learning_rate": 3.8813099409692896e-05, + "loss": 2.769, + "step": 29704 + }, + { + "epoch": 1.844000248308399, + "grad_norm": 0.1640055762039322, + "learning_rate": 3.8809579469282834e-05, + "loss": 2.7754, + "step": 29705 + }, + { + "epoch": 1.844062325408157, + "grad_norm": 0.14737439932065793, + "learning_rate": 3.880605958725589e-05, + "loss": 2.7734, + "step": 29706 + }, + { + "epoch": 1.8441244025079149, + "grad_norm": 0.18583725743541255, + "learning_rate": 3.880253976363047e-05, + "loss": 2.7741, + "step": 29707 + }, + { + "epoch": 1.8441864796076728, + "grad_norm": 0.17576119520438138, + "learning_rate": 3.87990199984249e-05, + "loss": 2.7549, + "step": 29708 + }, + { + "epoch": 1.8442485567074307, + "grad_norm": 0.17577043076174037, + "learning_rate": 3.879550029165756e-05, + "loss": 2.659, + "step": 29709 + }, + { + "epoch": 1.8443106338071886, + "grad_norm": 0.1585621821175149, + "learning_rate": 3.8791980643346835e-05, + "loss": 2.7326, + "step": 29710 + }, + { + "epoch": 1.8443727109069465, + "grad_norm": 0.15958203911152755, + "learning_rate": 3.878846105351105e-05, + "loss": 2.7744, + "step": 29711 + }, + { + "epoch": 1.8444347880067045, + "grad_norm": 0.160637249535171, + "learning_rate": 3.8784941522168606e-05, + "loss": 2.7482, + "step": 29712 + }, + { + "epoch": 1.8444968651064624, + "grad_norm": 0.1535086416641169, + "learning_rate": 3.878142204933783e-05, + "loss": 2.7492, + "step": 29713 + }, + { + "epoch": 1.84455894220622, + "grad_norm": 0.1560729394487221, + "learning_rate": 3.877790263503711e-05, + "loss": 2.7396, + "step": 29714 + }, + { + "epoch": 1.844621019305978, + "grad_norm": 0.15442702683120216, + "learning_rate": 3.877438327928478e-05, + "loss": 2.7423, + "step": 29715 + }, + { + "epoch": 1.844683096405736, + "grad_norm": 0.17865438556682495, + "learning_rate": 3.877086398209924e-05, + "loss": 2.7093, + "step": 29716 + }, + { + "epoch": 1.8447451735054938, + "grad_norm": 0.1742300893745528, + "learning_rate": 3.8767344743498813e-05, + "loss": 2.7115, + "step": 29717 + }, + { + "epoch": 1.8448072506052517, + "grad_norm": 0.1500251714130154, + "learning_rate": 3.876382556350187e-05, + "loss": 2.8156, + "step": 29718 + }, + { + "epoch": 1.8448693277050097, + "grad_norm": 0.1480303249318147, + "learning_rate": 3.8760306442126796e-05, + "loss": 2.7968, + "step": 29719 + }, + { + "epoch": 1.8449314048047674, + "grad_norm": 0.1622763141845236, + "learning_rate": 3.875678737939191e-05, + "loss": 2.7309, + "step": 29720 + }, + { + "epoch": 1.8449934819045253, + "grad_norm": 0.20508332018856712, + "learning_rate": 3.875326837531561e-05, + "loss": 2.6981, + "step": 29721 + }, + { + "epoch": 1.8450555590042832, + "grad_norm": 0.15624137496984933, + "learning_rate": 3.874974942991623e-05, + "loss": 2.7812, + "step": 29722 + }, + { + "epoch": 1.8451176361040411, + "grad_norm": 0.16029876671539459, + "learning_rate": 3.874623054321215e-05, + "loss": 2.8419, + "step": 29723 + }, + { + "epoch": 1.845179713203799, + "grad_norm": 0.1734976681310732, + "learning_rate": 3.87427117152217e-05, + "loss": 2.8512, + "step": 29724 + }, + { + "epoch": 1.845241790303557, + "grad_norm": 0.1468739035240801, + "learning_rate": 3.873919294596327e-05, + "loss": 2.6553, + "step": 29725 + }, + { + "epoch": 1.8453038674033149, + "grad_norm": 0.17674265843970993, + "learning_rate": 3.8735674235455194e-05, + "loss": 2.7556, + "step": 29726 + }, + { + "epoch": 1.8453659445030728, + "grad_norm": 0.184332841158837, + "learning_rate": 3.873215558371585e-05, + "loss": 2.7483, + "step": 29727 + }, + { + "epoch": 1.8454280216028307, + "grad_norm": 0.15816996662541583, + "learning_rate": 3.8728636990763575e-05, + "loss": 2.7576, + "step": 29728 + }, + { + "epoch": 1.8454900987025886, + "grad_norm": 0.1505269139017551, + "learning_rate": 3.872511845661674e-05, + "loss": 2.7713, + "step": 29729 + }, + { + "epoch": 1.8455521758023465, + "grad_norm": 0.16945954159854665, + "learning_rate": 3.872159998129371e-05, + "loss": 2.781, + "step": 29730 + }, + { + "epoch": 1.8456142529021045, + "grad_norm": 0.14597040288443008, + "learning_rate": 3.871808156481282e-05, + "loss": 2.767, + "step": 29731 + }, + { + "epoch": 1.8456763300018624, + "grad_norm": 0.17489677539611104, + "learning_rate": 3.871456320719245e-05, + "loss": 2.7546, + "step": 29732 + }, + { + "epoch": 1.8457384071016203, + "grad_norm": 0.1525547083060885, + "learning_rate": 3.871104490845092e-05, + "loss": 2.782, + "step": 29733 + }, + { + "epoch": 1.8458004842013782, + "grad_norm": 0.14451572618368563, + "learning_rate": 3.870752666860662e-05, + "loss": 2.7741, + "step": 29734 + }, + { + "epoch": 1.8458625613011361, + "grad_norm": 0.15320763426179346, + "learning_rate": 3.8704008487677894e-05, + "loss": 2.7633, + "step": 29735 + }, + { + "epoch": 1.845924638400894, + "grad_norm": 0.15042179577299247, + "learning_rate": 3.870049036568311e-05, + "loss": 2.7538, + "step": 29736 + }, + { + "epoch": 1.845986715500652, + "grad_norm": 0.15400032324726065, + "learning_rate": 3.869697230264059e-05, + "loss": 2.7776, + "step": 29737 + }, + { + "epoch": 1.8460487926004097, + "grad_norm": 0.15647069127497745, + "learning_rate": 3.869345429856873e-05, + "loss": 2.8354, + "step": 29738 + }, + { + "epoch": 1.8461108697001676, + "grad_norm": 0.1847453826074356, + "learning_rate": 3.868993635348585e-05, + "loss": 2.76, + "step": 29739 + }, + { + "epoch": 1.8461729467999255, + "grad_norm": 0.16214548421186045, + "learning_rate": 3.868641846741032e-05, + "loss": 2.6905, + "step": 29740 + }, + { + "epoch": 1.8462350238996834, + "grad_norm": 0.14548330292600162, + "learning_rate": 3.868290064036049e-05, + "loss": 2.7304, + "step": 29741 + }, + { + "epoch": 1.8462971009994413, + "grad_norm": 0.14436519594675185, + "learning_rate": 3.867938287235472e-05, + "loss": 2.737, + "step": 29742 + }, + { + "epoch": 1.8463591780991992, + "grad_norm": 0.1578587386405327, + "learning_rate": 3.867586516341135e-05, + "loss": 2.806, + "step": 29743 + }, + { + "epoch": 1.846421255198957, + "grad_norm": 0.15647138766417254, + "learning_rate": 3.8672347513548745e-05, + "loss": 2.7605, + "step": 29744 + }, + { + "epoch": 1.8464833322987149, + "grad_norm": 0.18293065181838866, + "learning_rate": 3.8668829922785256e-05, + "loss": 2.8414, + "step": 29745 + }, + { + "epoch": 1.8465454093984728, + "grad_norm": 0.15482728066032753, + "learning_rate": 3.866531239113923e-05, + "loss": 2.6887, + "step": 29746 + }, + { + "epoch": 1.8466074864982307, + "grad_norm": 0.1803966708551712, + "learning_rate": 3.866179491862902e-05, + "loss": 2.7404, + "step": 29747 + }, + { + "epoch": 1.8466695635979886, + "grad_norm": 0.16573692939180182, + "learning_rate": 3.865827750527298e-05, + "loss": 2.8263, + "step": 29748 + }, + { + "epoch": 1.8467316406977465, + "grad_norm": 0.16037150780509707, + "learning_rate": 3.865476015108945e-05, + "loss": 2.7574, + "step": 29749 + }, + { + "epoch": 1.8467937177975045, + "grad_norm": 0.15681315502426768, + "learning_rate": 3.865124285609681e-05, + "loss": 2.7042, + "step": 29750 + }, + { + "epoch": 1.8468557948972624, + "grad_norm": 0.15480445717412547, + "learning_rate": 3.864772562031337e-05, + "loss": 2.6737, + "step": 29751 + }, + { + "epoch": 1.8469178719970203, + "grad_norm": 0.16625131257872747, + "learning_rate": 3.8644208443757526e-05, + "loss": 2.6713, + "step": 29752 + }, + { + "epoch": 1.8469799490967782, + "grad_norm": 0.1552047037547241, + "learning_rate": 3.864069132644758e-05, + "loss": 2.7487, + "step": 29753 + }, + { + "epoch": 1.8470420261965361, + "grad_norm": 0.17564922248872938, + "learning_rate": 3.8637174268401924e-05, + "loss": 2.7467, + "step": 29754 + }, + { + "epoch": 1.847104103296294, + "grad_norm": 0.1396559799931789, + "learning_rate": 3.8633657269638876e-05, + "loss": 2.6839, + "step": 29755 + }, + { + "epoch": 1.847166180396052, + "grad_norm": 0.29798770982245343, + "learning_rate": 3.8630140330176794e-05, + "loss": 2.8681, + "step": 29756 + }, + { + "epoch": 1.8472282574958099, + "grad_norm": 0.14784398758707284, + "learning_rate": 3.862662345003404e-05, + "loss": 2.8236, + "step": 29757 + }, + { + "epoch": 1.8472903345955678, + "grad_norm": 0.16218062638911848, + "learning_rate": 3.8623106629228944e-05, + "loss": 2.8189, + "step": 29758 + }, + { + "epoch": 1.8473524116953257, + "grad_norm": 0.20063342096059633, + "learning_rate": 3.861958986777987e-05, + "loss": 2.7457, + "step": 29759 + }, + { + "epoch": 1.8474144887950836, + "grad_norm": 0.18784344530882086, + "learning_rate": 3.8616073165705154e-05, + "loss": 2.7916, + "step": 29760 + }, + { + "epoch": 1.8474765658948415, + "grad_norm": 0.1549789226721677, + "learning_rate": 3.861255652302315e-05, + "loss": 2.7437, + "step": 29761 + }, + { + "epoch": 1.8475386429945992, + "grad_norm": 0.14982430231427452, + "learning_rate": 3.86090399397522e-05, + "loss": 2.8178, + "step": 29762 + }, + { + "epoch": 1.8476007200943572, + "grad_norm": 0.14974351476953907, + "learning_rate": 3.860552341591066e-05, + "loss": 2.6997, + "step": 29763 + }, + { + "epoch": 1.847662797194115, + "grad_norm": 0.16276717299107657, + "learning_rate": 3.8602006951516864e-05, + "loss": 2.8294, + "step": 29764 + }, + { + "epoch": 1.847724874293873, + "grad_norm": 0.15186371121621778, + "learning_rate": 3.859849054658917e-05, + "loss": 2.7519, + "step": 29765 + }, + { + "epoch": 1.847786951393631, + "grad_norm": 0.15198672469848282, + "learning_rate": 3.859497420114592e-05, + "loss": 2.8832, + "step": 29766 + }, + { + "epoch": 1.8478490284933888, + "grad_norm": 0.17352516583487113, + "learning_rate": 3.8591457915205446e-05, + "loss": 2.8193, + "step": 29767 + }, + { + "epoch": 1.8479111055931465, + "grad_norm": 0.1496321389432056, + "learning_rate": 3.858794168878612e-05, + "loss": 2.7481, + "step": 29768 + }, + { + "epoch": 1.8479731826929044, + "grad_norm": 0.17957791843201618, + "learning_rate": 3.858442552190626e-05, + "loss": 2.8181, + "step": 29769 + }, + { + "epoch": 1.8480352597926624, + "grad_norm": 0.14767734487911827, + "learning_rate": 3.858090941458423e-05, + "loss": 2.8001, + "step": 29770 + }, + { + "epoch": 1.8480973368924203, + "grad_norm": 0.15644057096422972, + "learning_rate": 3.8577393366838364e-05, + "loss": 2.7994, + "step": 29771 + }, + { + "epoch": 1.8481594139921782, + "grad_norm": 0.14692162423019647, + "learning_rate": 3.8573877378687015e-05, + "loss": 2.7899, + "step": 29772 + }, + { + "epoch": 1.8482214910919361, + "grad_norm": 0.17292789885701254, + "learning_rate": 3.857036145014852e-05, + "loss": 2.7211, + "step": 29773 + }, + { + "epoch": 1.848283568191694, + "grad_norm": 0.292792737857793, + "learning_rate": 3.8566845581241206e-05, + "loss": 2.795, + "step": 29774 + }, + { + "epoch": 1.848345645291452, + "grad_norm": 0.1527524442301578, + "learning_rate": 3.856332977198345e-05, + "loss": 2.7424, + "step": 29775 + }, + { + "epoch": 1.8484077223912099, + "grad_norm": 0.21605342685991719, + "learning_rate": 3.855981402239358e-05, + "loss": 2.7168, + "step": 29776 + }, + { + "epoch": 1.8484697994909678, + "grad_norm": 0.17179008033295126, + "learning_rate": 3.855629833248994e-05, + "loss": 2.7804, + "step": 29777 + }, + { + "epoch": 1.8485318765907257, + "grad_norm": 0.15226677025628024, + "learning_rate": 3.8552782702290865e-05, + "loss": 2.7889, + "step": 29778 + }, + { + "epoch": 1.8485939536904836, + "grad_norm": 0.17926642589445047, + "learning_rate": 3.85492671318147e-05, + "loss": 2.7699, + "step": 29779 + }, + { + "epoch": 1.8486560307902415, + "grad_norm": 0.16541951248421857, + "learning_rate": 3.8545751621079806e-05, + "loss": 2.7489, + "step": 29780 + }, + { + "epoch": 1.8487181078899995, + "grad_norm": 0.1567273002414331, + "learning_rate": 3.854223617010449e-05, + "loss": 2.8688, + "step": 29781 + }, + { + "epoch": 1.8487801849897574, + "grad_norm": 0.14807394957766393, + "learning_rate": 3.853872077890711e-05, + "loss": 2.7478, + "step": 29782 + }, + { + "epoch": 1.8488422620895153, + "grad_norm": 0.18489932450951996, + "learning_rate": 3.8535205447506e-05, + "loss": 2.7643, + "step": 29783 + }, + { + "epoch": 1.8489043391892732, + "grad_norm": 0.15822382191095727, + "learning_rate": 3.853169017591952e-05, + "loss": 2.7608, + "step": 29784 + }, + { + "epoch": 1.8489664162890311, + "grad_norm": 0.15223221486213365, + "learning_rate": 3.8528174964165986e-05, + "loss": 2.7809, + "step": 29785 + }, + { + "epoch": 1.8490284933887888, + "grad_norm": 0.1481894566432114, + "learning_rate": 3.852465981226376e-05, + "loss": 2.7208, + "step": 29786 + }, + { + "epoch": 1.8490905704885467, + "grad_norm": 0.1448270181396904, + "learning_rate": 3.852114472023115e-05, + "loss": 2.6982, + "step": 29787 + }, + { + "epoch": 1.8491526475883047, + "grad_norm": 0.14203615255676005, + "learning_rate": 3.8517629688086535e-05, + "loss": 2.7492, + "step": 29788 + }, + { + "epoch": 1.8492147246880626, + "grad_norm": 0.1440602502316214, + "learning_rate": 3.851411471584822e-05, + "loss": 2.7834, + "step": 29789 + }, + { + "epoch": 1.8492768017878205, + "grad_norm": 0.14629686434101105, + "learning_rate": 3.851059980353457e-05, + "loss": 2.8337, + "step": 29790 + }, + { + "epoch": 1.8493388788875784, + "grad_norm": 0.1390323386540901, + "learning_rate": 3.8507084951163904e-05, + "loss": 2.744, + "step": 29791 + }, + { + "epoch": 1.8494009559873361, + "grad_norm": 0.14647422119553086, + "learning_rate": 3.850357015875456e-05, + "loss": 2.745, + "step": 29792 + }, + { + "epoch": 1.849463033087094, + "grad_norm": 0.15569058650589362, + "learning_rate": 3.850005542632489e-05, + "loss": 2.84, + "step": 29793 + }, + { + "epoch": 1.849525110186852, + "grad_norm": 0.14668176067934208, + "learning_rate": 3.849654075389321e-05, + "loss": 2.7795, + "step": 29794 + }, + { + "epoch": 1.8495871872866099, + "grad_norm": 0.1882446027839454, + "learning_rate": 3.8493026141477884e-05, + "loss": 2.785, + "step": 29795 + }, + { + "epoch": 1.8496492643863678, + "grad_norm": 0.14323371894422351, + "learning_rate": 3.848951158909721e-05, + "loss": 2.6943, + "step": 29796 + }, + { + "epoch": 1.8497113414861257, + "grad_norm": 0.15521372240639988, + "learning_rate": 3.848599709676958e-05, + "loss": 2.8774, + "step": 29797 + }, + { + "epoch": 1.8497734185858836, + "grad_norm": 0.17676764402919903, + "learning_rate": 3.848248266451328e-05, + "loss": 2.6715, + "step": 29798 + }, + { + "epoch": 1.8498354956856415, + "grad_norm": 0.15028290172667896, + "learning_rate": 3.847896829234667e-05, + "loss": 2.6775, + "step": 29799 + }, + { + "epoch": 1.8498975727853995, + "grad_norm": 0.13705917618639976, + "learning_rate": 3.847545398028807e-05, + "loss": 2.7616, + "step": 29800 + }, + { + "epoch": 1.8499596498851574, + "grad_norm": 0.20477199412093303, + "learning_rate": 3.847193972835584e-05, + "loss": 2.7473, + "step": 29801 + }, + { + "epoch": 1.8500217269849153, + "grad_norm": 0.1493892462073988, + "learning_rate": 3.8468425536568295e-05, + "loss": 2.7084, + "step": 29802 + }, + { + "epoch": 1.8500838040846732, + "grad_norm": 0.14502554858532532, + "learning_rate": 3.846491140494376e-05, + "loss": 2.7308, + "step": 29803 + }, + { + "epoch": 1.8501458811844311, + "grad_norm": 0.14414481102672377, + "learning_rate": 3.846139733350059e-05, + "loss": 2.7326, + "step": 29804 + }, + { + "epoch": 1.850207958284189, + "grad_norm": 0.14090570937116625, + "learning_rate": 3.845788332225711e-05, + "loss": 2.7259, + "step": 29805 + }, + { + "epoch": 1.850270035383947, + "grad_norm": 0.1559672957203317, + "learning_rate": 3.845436937123166e-05, + "loss": 2.6063, + "step": 29806 + }, + { + "epoch": 1.8503321124837049, + "grad_norm": 0.16416138552648535, + "learning_rate": 3.845085548044255e-05, + "loss": 2.7686, + "step": 29807 + }, + { + "epoch": 1.8503941895834628, + "grad_norm": 0.16246061306237938, + "learning_rate": 3.844734164990814e-05, + "loss": 2.7611, + "step": 29808 + }, + { + "epoch": 1.8504562666832207, + "grad_norm": 0.15222713702345245, + "learning_rate": 3.8443827879646765e-05, + "loss": 2.7958, + "step": 29809 + }, + { + "epoch": 1.8505183437829784, + "grad_norm": 0.16354943842148587, + "learning_rate": 3.844031416967673e-05, + "loss": 2.765, + "step": 29810 + }, + { + "epoch": 1.8505804208827363, + "grad_norm": 0.1835522604792218, + "learning_rate": 3.8436800520016394e-05, + "loss": 2.7452, + "step": 29811 + }, + { + "epoch": 1.8506424979824943, + "grad_norm": 0.1457825441701671, + "learning_rate": 3.843328693068407e-05, + "loss": 2.7477, + "step": 29812 + }, + { + "epoch": 1.8507045750822522, + "grad_norm": 0.15330123175201144, + "learning_rate": 3.8429773401698106e-05, + "loss": 2.8125, + "step": 29813 + }, + { + "epoch": 1.85076665218201, + "grad_norm": 0.15945957595059268, + "learning_rate": 3.842625993307681e-05, + "loss": 2.708, + "step": 29814 + }, + { + "epoch": 1.850828729281768, + "grad_norm": 0.15317329112364028, + "learning_rate": 3.842274652483854e-05, + "loss": 2.763, + "step": 29815 + }, + { + "epoch": 1.8508908063815257, + "grad_norm": 0.22046636017788984, + "learning_rate": 3.8419233177001605e-05, + "loss": 2.749, + "step": 29816 + }, + { + "epoch": 1.8509528834812836, + "grad_norm": 0.14540910893377557, + "learning_rate": 3.8415719889584336e-05, + "loss": 2.7274, + "step": 29817 + }, + { + "epoch": 1.8510149605810415, + "grad_norm": 0.17498586631648308, + "learning_rate": 3.8412206662605077e-05, + "loss": 2.7509, + "step": 29818 + }, + { + "epoch": 1.8510770376807995, + "grad_norm": 0.21296554961846986, + "learning_rate": 3.8408693496082136e-05, + "loss": 2.6994, + "step": 29819 + }, + { + "epoch": 1.8511391147805574, + "grad_norm": 0.18812876814810245, + "learning_rate": 3.840518039003387e-05, + "loss": 2.7907, + "step": 29820 + }, + { + "epoch": 1.8512011918803153, + "grad_norm": 0.1593641137163709, + "learning_rate": 3.840166734447858e-05, + "loss": 2.8307, + "step": 29821 + }, + { + "epoch": 1.8512632689800732, + "grad_norm": 0.1492534971474431, + "learning_rate": 3.8398154359434625e-05, + "loss": 2.7076, + "step": 29822 + }, + { + "epoch": 1.8513253460798311, + "grad_norm": 0.192576519028968, + "learning_rate": 3.83946414349203e-05, + "loss": 2.7236, + "step": 29823 + }, + { + "epoch": 1.851387423179589, + "grad_norm": 0.14135833107453313, + "learning_rate": 3.839112857095395e-05, + "loss": 2.7212, + "step": 29824 + }, + { + "epoch": 1.851449500279347, + "grad_norm": 0.17689907899132157, + "learning_rate": 3.83876157675539e-05, + "loss": 2.7362, + "step": 29825 + }, + { + "epoch": 1.8515115773791049, + "grad_norm": 0.15887192069310194, + "learning_rate": 3.8384103024738484e-05, + "loss": 2.8357, + "step": 29826 + }, + { + "epoch": 1.8515736544788628, + "grad_norm": 0.15616214785832733, + "learning_rate": 3.8380590342526025e-05, + "loss": 2.7452, + "step": 29827 + }, + { + "epoch": 1.8516357315786207, + "grad_norm": 0.1560531628072303, + "learning_rate": 3.8377077720934826e-05, + "loss": 2.8383, + "step": 29828 + }, + { + "epoch": 1.8516978086783786, + "grad_norm": 0.17925280359701273, + "learning_rate": 3.837356515998326e-05, + "loss": 2.8656, + "step": 29829 + }, + { + "epoch": 1.8517598857781365, + "grad_norm": 0.1431596146396129, + "learning_rate": 3.8370052659689604e-05, + "loss": 2.6505, + "step": 29830 + }, + { + "epoch": 1.8518219628778945, + "grad_norm": 0.13751044612998156, + "learning_rate": 3.836654022007222e-05, + "loss": 2.8095, + "step": 29831 + }, + { + "epoch": 1.8518840399776524, + "grad_norm": 0.15420394827394598, + "learning_rate": 3.836302784114942e-05, + "loss": 2.843, + "step": 29832 + }, + { + "epoch": 1.8519461170774103, + "grad_norm": 0.14864167175758472, + "learning_rate": 3.8359515522939526e-05, + "loss": 2.7418, + "step": 29833 + }, + { + "epoch": 1.852008194177168, + "grad_norm": 0.20061559522873293, + "learning_rate": 3.835600326546086e-05, + "loss": 2.7804, + "step": 29834 + }, + { + "epoch": 1.852070271276926, + "grad_norm": 0.15534860363071185, + "learning_rate": 3.835249106873177e-05, + "loss": 2.729, + "step": 29835 + }, + { + "epoch": 1.8521323483766838, + "grad_norm": 0.23549092388987844, + "learning_rate": 3.834897893277054e-05, + "loss": 2.6903, + "step": 29836 + }, + { + "epoch": 1.8521944254764418, + "grad_norm": 0.14954840784878579, + "learning_rate": 3.8345466857595524e-05, + "loss": 2.84, + "step": 29837 + }, + { + "epoch": 1.8522565025761997, + "grad_norm": 0.16428736195252625, + "learning_rate": 3.8341954843225033e-05, + "loss": 2.7316, + "step": 29838 + }, + { + "epoch": 1.8523185796759576, + "grad_norm": 0.15351222109721524, + "learning_rate": 3.833844288967738e-05, + "loss": 2.7996, + "step": 29839 + }, + { + "epoch": 1.8523806567757153, + "grad_norm": 0.15132206274955692, + "learning_rate": 3.8334930996970925e-05, + "loss": 2.738, + "step": 29840 + }, + { + "epoch": 1.8524427338754732, + "grad_norm": 0.14171632522813446, + "learning_rate": 3.8331419165123966e-05, + "loss": 2.7954, + "step": 29841 + }, + { + "epoch": 1.8525048109752311, + "grad_norm": 0.19968370334018753, + "learning_rate": 3.832790739415482e-05, + "loss": 2.7845, + "step": 29842 + }, + { + "epoch": 1.852566888074989, + "grad_norm": 0.15305597150718783, + "learning_rate": 3.8324395684081814e-05, + "loss": 2.7735, + "step": 29843 + }, + { + "epoch": 1.852628965174747, + "grad_norm": 0.15673748503390664, + "learning_rate": 3.832088403492327e-05, + "loss": 2.7827, + "step": 29844 + }, + { + "epoch": 1.8526910422745049, + "grad_norm": 0.16911233242472679, + "learning_rate": 3.8317372446697506e-05, + "loss": 2.7189, + "step": 29845 + }, + { + "epoch": 1.8527531193742628, + "grad_norm": 0.14518787105314188, + "learning_rate": 3.831386091942285e-05, + "loss": 2.7197, + "step": 29846 + }, + { + "epoch": 1.8528151964740207, + "grad_norm": 0.14367905892836214, + "learning_rate": 3.8310349453117624e-05, + "loss": 2.8229, + "step": 29847 + }, + { + "epoch": 1.8528772735737786, + "grad_norm": 0.15466461542181656, + "learning_rate": 3.830683804780013e-05, + "loss": 2.7905, + "step": 29848 + }, + { + "epoch": 1.8529393506735365, + "grad_norm": 0.16595571783425325, + "learning_rate": 3.8303326703488714e-05, + "loss": 2.7731, + "step": 29849 + }, + { + "epoch": 1.8530014277732945, + "grad_norm": 0.13975357552792675, + "learning_rate": 3.829981542020167e-05, + "loss": 2.7008, + "step": 29850 + }, + { + "epoch": 1.8530635048730524, + "grad_norm": 0.16761040721186782, + "learning_rate": 3.829630419795734e-05, + "loss": 2.7041, + "step": 29851 + }, + { + "epoch": 1.8531255819728103, + "grad_norm": 0.21515019842542488, + "learning_rate": 3.829279303677403e-05, + "loss": 2.7568, + "step": 29852 + }, + { + "epoch": 1.8531876590725682, + "grad_norm": 0.14020176617299646, + "learning_rate": 3.8289281936670054e-05, + "loss": 2.7675, + "step": 29853 + }, + { + "epoch": 1.8532497361723261, + "grad_norm": 0.142797803821709, + "learning_rate": 3.828577089766374e-05, + "loss": 2.7675, + "step": 29854 + }, + { + "epoch": 1.853311813272084, + "grad_norm": 0.1489645172771769, + "learning_rate": 3.82822599197734e-05, + "loss": 2.7455, + "step": 29855 + }, + { + "epoch": 1.853373890371842, + "grad_norm": 0.14048065618820677, + "learning_rate": 3.8278749003017366e-05, + "loss": 2.8506, + "step": 29856 + }, + { + "epoch": 1.8534359674715999, + "grad_norm": 0.18335754336729151, + "learning_rate": 3.8275238147413934e-05, + "loss": 2.7412, + "step": 29857 + }, + { + "epoch": 1.8534980445713576, + "grad_norm": 0.15504435068167063, + "learning_rate": 3.827172735298143e-05, + "loss": 2.8361, + "step": 29858 + }, + { + "epoch": 1.8535601216711155, + "grad_norm": 0.13864799078913823, + "learning_rate": 3.826821661973817e-05, + "loss": 2.7488, + "step": 29859 + }, + { + "epoch": 1.8536221987708734, + "grad_norm": 0.14872546247045534, + "learning_rate": 3.826470594770248e-05, + "loss": 2.8191, + "step": 29860 + }, + { + "epoch": 1.8536842758706313, + "grad_norm": 0.16236704719771236, + "learning_rate": 3.826119533689266e-05, + "loss": 2.7604, + "step": 29861 + }, + { + "epoch": 1.8537463529703893, + "grad_norm": 0.1492034699570542, + "learning_rate": 3.825768478732705e-05, + "loss": 2.8267, + "step": 29862 + }, + { + "epoch": 1.8538084300701472, + "grad_norm": 0.19175410289055322, + "learning_rate": 3.8254174299023934e-05, + "loss": 2.8062, + "step": 29863 + }, + { + "epoch": 1.8538705071699049, + "grad_norm": 0.14445232478818112, + "learning_rate": 3.8250663872001644e-05, + "loss": 2.7969, + "step": 29864 + }, + { + "epoch": 1.8539325842696628, + "grad_norm": 0.13922261626615298, + "learning_rate": 3.8247153506278504e-05, + "loss": 2.7577, + "step": 29865 + }, + { + "epoch": 1.8539946613694207, + "grad_norm": 0.17548238460970006, + "learning_rate": 3.8243643201872805e-05, + "loss": 2.7769, + "step": 29866 + }, + { + "epoch": 1.8540567384691786, + "grad_norm": 0.16485009859162836, + "learning_rate": 3.8240132958802876e-05, + "loss": 2.7814, + "step": 29867 + }, + { + "epoch": 1.8541188155689365, + "grad_norm": 0.14604331691297515, + "learning_rate": 3.823662277708702e-05, + "loss": 2.6952, + "step": 29868 + }, + { + "epoch": 1.8541808926686945, + "grad_norm": 0.1513095684519915, + "learning_rate": 3.823311265674358e-05, + "loss": 2.7735, + "step": 29869 + }, + { + "epoch": 1.8542429697684524, + "grad_norm": 0.16048010785915798, + "learning_rate": 3.822960259779083e-05, + "loss": 2.8238, + "step": 29870 + }, + { + "epoch": 1.8543050468682103, + "grad_norm": 0.16985091558394372, + "learning_rate": 3.822609260024711e-05, + "loss": 2.6828, + "step": 29871 + }, + { + "epoch": 1.8543671239679682, + "grad_norm": 0.16679711688021914, + "learning_rate": 3.82225826641307e-05, + "loss": 2.7204, + "step": 29872 + }, + { + "epoch": 1.8544292010677261, + "grad_norm": 0.13991974438696364, + "learning_rate": 3.821907278945994e-05, + "loss": 2.763, + "step": 29873 + }, + { + "epoch": 1.854491278167484, + "grad_norm": 0.16353037232276998, + "learning_rate": 3.821556297625316e-05, + "loss": 2.864, + "step": 29874 + }, + { + "epoch": 1.854553355267242, + "grad_norm": 0.14760723183405458, + "learning_rate": 3.821205322452863e-05, + "loss": 2.6154, + "step": 29875 + }, + { + "epoch": 1.8546154323669999, + "grad_norm": 0.15769198934111836, + "learning_rate": 3.82085435343047e-05, + "loss": 2.7904, + "step": 29876 + }, + { + "epoch": 1.8546775094667578, + "grad_norm": 0.20080333580823972, + "learning_rate": 3.820503390559965e-05, + "loss": 2.7571, + "step": 29877 + }, + { + "epoch": 1.8547395865665157, + "grad_norm": 0.15086047359325333, + "learning_rate": 3.82015243384318e-05, + "loss": 2.7929, + "step": 29878 + }, + { + "epoch": 1.8548016636662736, + "grad_norm": 0.14369809519733742, + "learning_rate": 3.819801483281947e-05, + "loss": 2.8219, + "step": 29879 + }, + { + "epoch": 1.8548637407660316, + "grad_norm": 0.1466430694547053, + "learning_rate": 3.819450538878094e-05, + "loss": 2.7661, + "step": 29880 + }, + { + "epoch": 1.8549258178657895, + "grad_norm": 0.15957789548974868, + "learning_rate": 3.819099600633457e-05, + "loss": 2.7556, + "step": 29881 + }, + { + "epoch": 1.8549878949655472, + "grad_norm": 0.14506214812908697, + "learning_rate": 3.8187486685498616e-05, + "loss": 2.7939, + "step": 29882 + }, + { + "epoch": 1.855049972065305, + "grad_norm": 0.15695705772512275, + "learning_rate": 3.818397742629143e-05, + "loss": 2.6999, + "step": 29883 + }, + { + "epoch": 1.855112049165063, + "grad_norm": 0.14706561801707982, + "learning_rate": 3.8180468228731283e-05, + "loss": 2.8863, + "step": 29884 + }, + { + "epoch": 1.855174126264821, + "grad_norm": 0.1412868290546156, + "learning_rate": 3.8176959092836524e-05, + "loss": 2.7084, + "step": 29885 + }, + { + "epoch": 1.8552362033645788, + "grad_norm": 0.14932369790695013, + "learning_rate": 3.817345001862542e-05, + "loss": 2.7459, + "step": 29886 + }, + { + "epoch": 1.8552982804643368, + "grad_norm": 0.14839667461540068, + "learning_rate": 3.816994100611632e-05, + "loss": 2.7796, + "step": 29887 + }, + { + "epoch": 1.8553603575640945, + "grad_norm": 0.16096112323267578, + "learning_rate": 3.81664320553275e-05, + "loss": 2.8573, + "step": 29888 + }, + { + "epoch": 1.8554224346638524, + "grad_norm": 0.15675041413214458, + "learning_rate": 3.816292316627727e-05, + "loss": 2.8659, + "step": 29889 + }, + { + "epoch": 1.8554845117636103, + "grad_norm": 0.1398107212637317, + "learning_rate": 3.815941433898395e-05, + "loss": 2.7917, + "step": 29890 + }, + { + "epoch": 1.8555465888633682, + "grad_norm": 0.15824534006022392, + "learning_rate": 3.8155905573465834e-05, + "loss": 2.7387, + "step": 29891 + }, + { + "epoch": 1.8556086659631261, + "grad_norm": 0.14069855938355572, + "learning_rate": 3.815239686974124e-05, + "loss": 2.7297, + "step": 29892 + }, + { + "epoch": 1.855670743062884, + "grad_norm": 0.1410090645719664, + "learning_rate": 3.814888822782846e-05, + "loss": 2.7408, + "step": 29893 + }, + { + "epoch": 1.855732820162642, + "grad_norm": 0.14766502152481623, + "learning_rate": 3.814537964774582e-05, + "loss": 2.7796, + "step": 29894 + }, + { + "epoch": 1.8557948972623999, + "grad_norm": 0.13808006614062218, + "learning_rate": 3.81418711295116e-05, + "loss": 2.808, + "step": 29895 + }, + { + "epoch": 1.8558569743621578, + "grad_norm": 0.16132904588191746, + "learning_rate": 3.813836267314412e-05, + "loss": 2.7512, + "step": 29896 + }, + { + "epoch": 1.8559190514619157, + "grad_norm": 0.15761665263788358, + "learning_rate": 3.8134854278661684e-05, + "loss": 2.7248, + "step": 29897 + }, + { + "epoch": 1.8559811285616736, + "grad_norm": 0.14197080417064728, + "learning_rate": 3.813134594608259e-05, + "loss": 2.6702, + "step": 29898 + }, + { + "epoch": 1.8560432056614315, + "grad_norm": 0.1533606591561995, + "learning_rate": 3.812783767542514e-05, + "loss": 2.7744, + "step": 29899 + }, + { + "epoch": 1.8561052827611895, + "grad_norm": 0.1545555505651817, + "learning_rate": 3.812432946670766e-05, + "loss": 2.8619, + "step": 29900 + }, + { + "epoch": 1.8561673598609474, + "grad_norm": 0.15215511135992282, + "learning_rate": 3.812082131994843e-05, + "loss": 2.8489, + "step": 29901 + }, + { + "epoch": 1.8562294369607053, + "grad_norm": 0.15106836787251202, + "learning_rate": 3.811731323516575e-05, + "loss": 2.7378, + "step": 29902 + }, + { + "epoch": 1.8562915140604632, + "grad_norm": 0.16057600861524665, + "learning_rate": 3.811380521237795e-05, + "loss": 2.7085, + "step": 29903 + }, + { + "epoch": 1.8563535911602211, + "grad_norm": 0.15001576633842048, + "learning_rate": 3.811029725160329e-05, + "loss": 2.7457, + "step": 29904 + }, + { + "epoch": 1.856415668259979, + "grad_norm": 0.14611116550292044, + "learning_rate": 3.810678935286009e-05, + "loss": 2.642, + "step": 29905 + }, + { + "epoch": 1.8564777453597368, + "grad_norm": 0.14848121200222553, + "learning_rate": 3.810328151616668e-05, + "loss": 2.7535, + "step": 29906 + }, + { + "epoch": 1.8565398224594947, + "grad_norm": 0.1729512169850138, + "learning_rate": 3.809977374154132e-05, + "loss": 2.7146, + "step": 29907 + }, + { + "epoch": 1.8566018995592526, + "grad_norm": 0.14536640797773687, + "learning_rate": 3.809626602900235e-05, + "loss": 2.7621, + "step": 29908 + }, + { + "epoch": 1.8566639766590105, + "grad_norm": 0.14837540406852598, + "learning_rate": 3.8092758378568034e-05, + "loss": 2.7901, + "step": 29909 + }, + { + "epoch": 1.8567260537587684, + "grad_norm": 0.146916121955521, + "learning_rate": 3.8089250790256706e-05, + "loss": 2.7533, + "step": 29910 + }, + { + "epoch": 1.8567881308585263, + "grad_norm": 0.1557047031052734, + "learning_rate": 3.808574326408663e-05, + "loss": 2.7677, + "step": 29911 + }, + { + "epoch": 1.856850207958284, + "grad_norm": 0.14509171638443916, + "learning_rate": 3.808223580007613e-05, + "loss": 2.7102, + "step": 29912 + }, + { + "epoch": 1.856912285058042, + "grad_norm": 0.14147992664760864, + "learning_rate": 3.8078728398243506e-05, + "loss": 2.7521, + "step": 29913 + }, + { + "epoch": 1.8569743621577999, + "grad_norm": 0.14387774932854375, + "learning_rate": 3.807522105860704e-05, + "loss": 2.7419, + "step": 29914 + }, + { + "epoch": 1.8570364392575578, + "grad_norm": 0.14006182699709926, + "learning_rate": 3.8071713781185045e-05, + "loss": 2.8943, + "step": 29915 + }, + { + "epoch": 1.8570985163573157, + "grad_norm": 0.1445752605239807, + "learning_rate": 3.806820656599581e-05, + "loss": 2.8318, + "step": 29916 + }, + { + "epoch": 1.8571605934570736, + "grad_norm": 0.14557397861354962, + "learning_rate": 3.806469941305765e-05, + "loss": 2.7751, + "step": 29917 + }, + { + "epoch": 1.8572226705568315, + "grad_norm": 0.17121640189918033, + "learning_rate": 3.8061192322388836e-05, + "loss": 2.6374, + "step": 29918 + }, + { + "epoch": 1.8572847476565895, + "grad_norm": 0.16820011173424881, + "learning_rate": 3.805768529400769e-05, + "loss": 2.7708, + "step": 29919 + }, + { + "epoch": 1.8573468247563474, + "grad_norm": 0.1600992112818923, + "learning_rate": 3.805417832793249e-05, + "loss": 2.6678, + "step": 29920 + }, + { + "epoch": 1.8574089018561053, + "grad_norm": 0.16425039113591855, + "learning_rate": 3.805067142418155e-05, + "loss": 2.6886, + "step": 29921 + }, + { + "epoch": 1.8574709789558632, + "grad_norm": 0.1458174017026297, + "learning_rate": 3.8047164582773154e-05, + "loss": 2.7818, + "step": 29922 + }, + { + "epoch": 1.8575330560556211, + "grad_norm": 0.19323135723472926, + "learning_rate": 3.804365780372561e-05, + "loss": 2.7075, + "step": 29923 + }, + { + "epoch": 1.857595133155379, + "grad_norm": 0.15097610651221263, + "learning_rate": 3.8040151087057194e-05, + "loss": 2.8243, + "step": 29924 + }, + { + "epoch": 1.857657210255137, + "grad_norm": 0.1521603568602657, + "learning_rate": 3.8036644432786214e-05, + "loss": 2.7058, + "step": 29925 + }, + { + "epoch": 1.8577192873548949, + "grad_norm": 0.1469595576941398, + "learning_rate": 3.803313784093098e-05, + "loss": 2.7216, + "step": 29926 + }, + { + "epoch": 1.8577813644546528, + "grad_norm": 0.15223263087032612, + "learning_rate": 3.8029631311509754e-05, + "loss": 2.7699, + "step": 29927 + }, + { + "epoch": 1.8578434415544107, + "grad_norm": 0.18889169977724704, + "learning_rate": 3.802612484454086e-05, + "loss": 2.7558, + "step": 29928 + }, + { + "epoch": 1.8579055186541686, + "grad_norm": 0.15129664838640566, + "learning_rate": 3.8022618440042554e-05, + "loss": 2.7863, + "step": 29929 + }, + { + "epoch": 1.8579675957539263, + "grad_norm": 0.15358062683227114, + "learning_rate": 3.8019112098033184e-05, + "loss": 2.8038, + "step": 29930 + }, + { + "epoch": 1.8580296728536843, + "grad_norm": 0.20341196330596348, + "learning_rate": 3.8015605818530996e-05, + "loss": 2.7818, + "step": 29931 + }, + { + "epoch": 1.8580917499534422, + "grad_norm": 0.17447751611431825, + "learning_rate": 3.801209960155432e-05, + "loss": 2.7901, + "step": 29932 + }, + { + "epoch": 1.8581538270532, + "grad_norm": 0.16309693579221937, + "learning_rate": 3.800859344712141e-05, + "loss": 2.719, + "step": 29933 + }, + { + "epoch": 1.858215904152958, + "grad_norm": 0.1646212135605792, + "learning_rate": 3.8005087355250596e-05, + "loss": 2.768, + "step": 29934 + }, + { + "epoch": 1.858277981252716, + "grad_norm": 0.13730118531363525, + "learning_rate": 3.800158132596014e-05, + "loss": 2.7371, + "step": 29935 + }, + { + "epoch": 1.8583400583524736, + "grad_norm": 0.16904056487861713, + "learning_rate": 3.799807535926836e-05, + "loss": 2.7217, + "step": 29936 + }, + { + "epoch": 1.8584021354522315, + "grad_norm": 0.14924051765131147, + "learning_rate": 3.7994569455193526e-05, + "loss": 2.7268, + "step": 29937 + }, + { + "epoch": 1.8584642125519895, + "grad_norm": 0.15039296581874112, + "learning_rate": 3.799106361375393e-05, + "loss": 2.6595, + "step": 29938 + }, + { + "epoch": 1.8585262896517474, + "grad_norm": 0.14017417619776767, + "learning_rate": 3.798755783496787e-05, + "loss": 2.6868, + "step": 29939 + }, + { + "epoch": 1.8585883667515053, + "grad_norm": 0.16681485492363263, + "learning_rate": 3.7984052118853654e-05, + "loss": 2.7935, + "step": 29940 + }, + { + "epoch": 1.8586504438512632, + "grad_norm": 0.14046576544505063, + "learning_rate": 3.798054646542954e-05, + "loss": 2.7043, + "step": 29941 + }, + { + "epoch": 1.8587125209510211, + "grad_norm": 0.15990810826078972, + "learning_rate": 3.7977040874713844e-05, + "loss": 2.7793, + "step": 29942 + }, + { + "epoch": 1.858774598050779, + "grad_norm": 0.1537617687822909, + "learning_rate": 3.797353534672484e-05, + "loss": 2.7321, + "step": 29943 + }, + { + "epoch": 1.858836675150537, + "grad_norm": 0.14523788351465833, + "learning_rate": 3.7970029881480825e-05, + "loss": 2.7883, + "step": 29944 + }, + { + "epoch": 1.8588987522502949, + "grad_norm": 0.15959432344238386, + "learning_rate": 3.7966524479000075e-05, + "loss": 2.76, + "step": 29945 + }, + { + "epoch": 1.8589608293500528, + "grad_norm": 0.1617471197832169, + "learning_rate": 3.79630191393009e-05, + "loss": 2.7673, + "step": 29946 + }, + { + "epoch": 1.8590229064498107, + "grad_norm": 0.13916400435631773, + "learning_rate": 3.7959513862401564e-05, + "loss": 2.8121, + "step": 29947 + }, + { + "epoch": 1.8590849835495686, + "grad_norm": 0.1473500927786637, + "learning_rate": 3.795600864832038e-05, + "loss": 2.8064, + "step": 29948 + }, + { + "epoch": 1.8591470606493266, + "grad_norm": 0.14525259280229663, + "learning_rate": 3.795250349707561e-05, + "loss": 2.783, + "step": 29949 + }, + { + "epoch": 1.8592091377490845, + "grad_norm": 0.14539200699614185, + "learning_rate": 3.7948998408685564e-05, + "loss": 2.6968, + "step": 29950 + }, + { + "epoch": 1.8592712148488424, + "grad_norm": 0.158169342484248, + "learning_rate": 3.7945493383168525e-05, + "loss": 2.7548, + "step": 29951 + }, + { + "epoch": 1.8593332919486003, + "grad_norm": 0.14188440947061626, + "learning_rate": 3.7941988420542755e-05, + "loss": 2.7631, + "step": 29952 + }, + { + "epoch": 1.8593953690483582, + "grad_norm": 0.14976688564567928, + "learning_rate": 3.7938483520826576e-05, + "loss": 2.698, + "step": 29953 + }, + { + "epoch": 1.859457446148116, + "grad_norm": 0.14032720452637415, + "learning_rate": 3.793497868403824e-05, + "loss": 2.7718, + "step": 29954 + }, + { + "epoch": 1.8595195232478738, + "grad_norm": 0.15363670917402425, + "learning_rate": 3.7931473910196066e-05, + "loss": 2.7769, + "step": 29955 + }, + { + "epoch": 1.8595816003476318, + "grad_norm": 0.14197662273454614, + "learning_rate": 3.79279691993183e-05, + "loss": 2.6924, + "step": 29956 + }, + { + "epoch": 1.8596436774473897, + "grad_norm": 0.1625096767552061, + "learning_rate": 3.7924464551423274e-05, + "loss": 2.7508, + "step": 29957 + }, + { + "epoch": 1.8597057545471476, + "grad_norm": 0.14236653317293418, + "learning_rate": 3.792095996652923e-05, + "loss": 2.7849, + "step": 29958 + }, + { + "epoch": 1.8597678316469055, + "grad_norm": 0.1402489614737236, + "learning_rate": 3.791745544465448e-05, + "loss": 2.7425, + "step": 29959 + }, + { + "epoch": 1.8598299087466632, + "grad_norm": 0.1770667648044184, + "learning_rate": 3.791395098581729e-05, + "loss": 2.7132, + "step": 29960 + }, + { + "epoch": 1.8598919858464211, + "grad_norm": 0.14321679293040102, + "learning_rate": 3.791044659003596e-05, + "loss": 2.774, + "step": 29961 + }, + { + "epoch": 1.859954062946179, + "grad_norm": 0.16783110390265665, + "learning_rate": 3.790694225732877e-05, + "loss": 2.7369, + "step": 29962 + }, + { + "epoch": 1.860016140045937, + "grad_norm": 0.1555927520110086, + "learning_rate": 3.790343798771399e-05, + "loss": 2.7751, + "step": 29963 + }, + { + "epoch": 1.8600782171456949, + "grad_norm": 0.13933892395780229, + "learning_rate": 3.7899933781209905e-05, + "loss": 2.6754, + "step": 29964 + }, + { + "epoch": 1.8601402942454528, + "grad_norm": 0.1583304422725282, + "learning_rate": 3.789642963783481e-05, + "loss": 2.8036, + "step": 29965 + }, + { + "epoch": 1.8602023713452107, + "grad_norm": 0.1375393694630702, + "learning_rate": 3.789292555760698e-05, + "loss": 2.8301, + "step": 29966 + }, + { + "epoch": 1.8602644484449686, + "grad_norm": 0.1525701100480276, + "learning_rate": 3.788942154054469e-05, + "loss": 2.808, + "step": 29967 + }, + { + "epoch": 1.8603265255447266, + "grad_norm": 0.14753902969582183, + "learning_rate": 3.788591758666623e-05, + "loss": 2.8259, + "step": 29968 + }, + { + "epoch": 1.8603886026444845, + "grad_norm": 0.14143350428121865, + "learning_rate": 3.7882413695989873e-05, + "loss": 2.7462, + "step": 29969 + }, + { + "epoch": 1.8604506797442424, + "grad_norm": 0.14749908585386354, + "learning_rate": 3.78789098685339e-05, + "loss": 2.7242, + "step": 29970 + }, + { + "epoch": 1.8605127568440003, + "grad_norm": 0.14388451346836537, + "learning_rate": 3.787540610431662e-05, + "loss": 2.8344, + "step": 29971 + }, + { + "epoch": 1.8605748339437582, + "grad_norm": 0.15696696061426935, + "learning_rate": 3.787190240335627e-05, + "loss": 2.7122, + "step": 29972 + }, + { + "epoch": 1.8606369110435161, + "grad_norm": 0.14199882554774101, + "learning_rate": 3.786839876567116e-05, + "loss": 2.7695, + "step": 29973 + }, + { + "epoch": 1.860698988143274, + "grad_norm": 0.15655725191461164, + "learning_rate": 3.786489519127956e-05, + "loss": 2.7141, + "step": 29974 + }, + { + "epoch": 1.860761065243032, + "grad_norm": 0.15370313874252675, + "learning_rate": 3.786139168019975e-05, + "loss": 2.8182, + "step": 29975 + }, + { + "epoch": 1.86082314234279, + "grad_norm": 0.17110990792014616, + "learning_rate": 3.785788823245e-05, + "loss": 2.728, + "step": 29976 + }, + { + "epoch": 1.8608852194425476, + "grad_norm": 0.14750815260913447, + "learning_rate": 3.78543848480486e-05, + "loss": 2.7913, + "step": 29977 + }, + { + "epoch": 1.8609472965423055, + "grad_norm": 0.16433845314075318, + "learning_rate": 3.785088152701381e-05, + "loss": 2.8139, + "step": 29978 + }, + { + "epoch": 1.8610093736420634, + "grad_norm": 0.15928411276424284, + "learning_rate": 3.784737826936393e-05, + "loss": 2.8029, + "step": 29979 + }, + { + "epoch": 1.8610714507418213, + "grad_norm": 0.1456130178683347, + "learning_rate": 3.784387507511723e-05, + "loss": 2.8719, + "step": 29980 + }, + { + "epoch": 1.8611335278415793, + "grad_norm": 0.14931889538782825, + "learning_rate": 3.784037194429198e-05, + "loss": 2.7454, + "step": 29981 + }, + { + "epoch": 1.8611956049413372, + "grad_norm": 0.16847255469405845, + "learning_rate": 3.783686887690647e-05, + "loss": 2.7284, + "step": 29982 + }, + { + "epoch": 1.8612576820410949, + "grad_norm": 0.14232538632421363, + "learning_rate": 3.7833365872978956e-05, + "loss": 2.859, + "step": 29983 + }, + { + "epoch": 1.8613197591408528, + "grad_norm": 0.15935396237929003, + "learning_rate": 3.7829862932527734e-05, + "loss": 2.794, + "step": 29984 + }, + { + "epoch": 1.8613818362406107, + "grad_norm": 0.15849358185888915, + "learning_rate": 3.7826360055571065e-05, + "loss": 2.7978, + "step": 29985 + }, + { + "epoch": 1.8614439133403686, + "grad_norm": 0.14714510992635593, + "learning_rate": 3.782285724212724e-05, + "loss": 2.7539, + "step": 29986 + }, + { + "epoch": 1.8615059904401265, + "grad_norm": 0.14764474945088568, + "learning_rate": 3.781935449221452e-05, + "loss": 2.8202, + "step": 29987 + }, + { + "epoch": 1.8615680675398845, + "grad_norm": 0.14300383413533335, + "learning_rate": 3.781585180585118e-05, + "loss": 2.7173, + "step": 29988 + }, + { + "epoch": 1.8616301446396424, + "grad_norm": 0.15114070191426263, + "learning_rate": 3.781234918305551e-05, + "loss": 2.6107, + "step": 29989 + }, + { + "epoch": 1.8616922217394003, + "grad_norm": 0.15154340392673535, + "learning_rate": 3.780884662384576e-05, + "loss": 2.783, + "step": 29990 + }, + { + "epoch": 1.8617542988391582, + "grad_norm": 0.15289002680088884, + "learning_rate": 3.780534412824022e-05, + "loss": 2.749, + "step": 29991 + }, + { + "epoch": 1.8618163759389161, + "grad_norm": 0.1965041713714707, + "learning_rate": 3.780184169625716e-05, + "loss": 2.7011, + "step": 29992 + }, + { + "epoch": 1.861878453038674, + "grad_norm": 0.14290368131551687, + "learning_rate": 3.7798339327914853e-05, + "loss": 2.7443, + "step": 29993 + }, + { + "epoch": 1.861940530138432, + "grad_norm": 0.16010754123693766, + "learning_rate": 3.779483702323158e-05, + "loss": 2.8805, + "step": 29994 + }, + { + "epoch": 1.8620026072381899, + "grad_norm": 0.14629257997529047, + "learning_rate": 3.7791334782225594e-05, + "loss": 2.8335, + "step": 29995 + }, + { + "epoch": 1.8620646843379478, + "grad_norm": 0.14016866818192325, + "learning_rate": 3.7787832604915185e-05, + "loss": 2.7441, + "step": 29996 + }, + { + "epoch": 1.8621267614377057, + "grad_norm": 0.15575144025318857, + "learning_rate": 3.778433049131862e-05, + "loss": 2.7401, + "step": 29997 + }, + { + "epoch": 1.8621888385374636, + "grad_norm": 0.15283498669773998, + "learning_rate": 3.778082844145417e-05, + "loss": 2.8223, + "step": 29998 + }, + { + "epoch": 1.8622509156372216, + "grad_norm": 0.15896362431704442, + "learning_rate": 3.777732645534009e-05, + "loss": 2.7064, + "step": 29999 + }, + { + "epoch": 1.8623129927369795, + "grad_norm": 0.15177413626798777, + "learning_rate": 3.7773824532994675e-05, + "loss": 2.7238, + "step": 30000 + }, + { + "epoch": 1.8623750698367372, + "grad_norm": 0.1430268607313102, + "learning_rate": 3.777032267443619e-05, + "loss": 2.7802, + "step": 30001 + }, + { + "epoch": 1.862437146936495, + "grad_norm": 0.1600007022901858, + "learning_rate": 3.7766820879682896e-05, + "loss": 2.7577, + "step": 30002 + }, + { + "epoch": 1.862499224036253, + "grad_norm": 0.16123713959561137, + "learning_rate": 3.776331914875304e-05, + "loss": 2.7538, + "step": 30003 + }, + { + "epoch": 1.862561301136011, + "grad_norm": 0.20081130370594946, + "learning_rate": 3.7759817481664944e-05, + "loss": 2.7751, + "step": 30004 + }, + { + "epoch": 1.8626233782357688, + "grad_norm": 0.14420700431472358, + "learning_rate": 3.775631587843686e-05, + "loss": 2.7118, + "step": 30005 + }, + { + "epoch": 1.8626854553355268, + "grad_norm": 0.15626208139442369, + "learning_rate": 3.7752814339087035e-05, + "loss": 2.7649, + "step": 30006 + }, + { + "epoch": 1.8627475324352845, + "grad_norm": 0.14457874548702734, + "learning_rate": 3.7749312863633764e-05, + "loss": 2.776, + "step": 30007 + }, + { + "epoch": 1.8628096095350424, + "grad_norm": 0.17274846488357506, + "learning_rate": 3.774581145209528e-05, + "loss": 2.7506, + "step": 30008 + }, + { + "epoch": 1.8628716866348003, + "grad_norm": 0.14489585406369304, + "learning_rate": 3.7742310104489896e-05, + "loss": 2.7821, + "step": 30009 + }, + { + "epoch": 1.8629337637345582, + "grad_norm": 0.1565131296429423, + "learning_rate": 3.773880882083584e-05, + "loss": 2.763, + "step": 30010 + }, + { + "epoch": 1.8629958408343161, + "grad_norm": 0.1465454673528659, + "learning_rate": 3.77353076011514e-05, + "loss": 2.7842, + "step": 30011 + }, + { + "epoch": 1.863057917934074, + "grad_norm": 0.15495104880065705, + "learning_rate": 3.773180644545485e-05, + "loss": 2.7413, + "step": 30012 + }, + { + "epoch": 1.863119995033832, + "grad_norm": 0.15109934952270526, + "learning_rate": 3.7728305353764425e-05, + "loss": 2.7953, + "step": 30013 + }, + { + "epoch": 1.8631820721335899, + "grad_norm": 0.17462396603409933, + "learning_rate": 3.772480432609843e-05, + "loss": 2.8779, + "step": 30014 + }, + { + "epoch": 1.8632441492333478, + "grad_norm": 0.1873522712958489, + "learning_rate": 3.7721303362475094e-05, + "loss": 2.7879, + "step": 30015 + }, + { + "epoch": 1.8633062263331057, + "grad_norm": 0.15156599955107145, + "learning_rate": 3.7717802462912707e-05, + "loss": 2.8218, + "step": 30016 + }, + { + "epoch": 1.8633683034328636, + "grad_norm": 0.15218736495809027, + "learning_rate": 3.7714301627429513e-05, + "loss": 2.866, + "step": 30017 + }, + { + "epoch": 1.8634303805326216, + "grad_norm": 0.1653464011962283, + "learning_rate": 3.7710800856043804e-05, + "loss": 2.7573, + "step": 30018 + }, + { + "epoch": 1.8634924576323795, + "grad_norm": 0.14947517242367464, + "learning_rate": 3.770730014877382e-05, + "loss": 2.7665, + "step": 30019 + }, + { + "epoch": 1.8635545347321374, + "grad_norm": 0.15447166055726677, + "learning_rate": 3.770379950563785e-05, + "loss": 2.8263, + "step": 30020 + }, + { + "epoch": 1.8636166118318953, + "grad_norm": 0.15893506051500386, + "learning_rate": 3.770029892665413e-05, + "loss": 2.7734, + "step": 30021 + }, + { + "epoch": 1.8636786889316532, + "grad_norm": 0.15580653798425112, + "learning_rate": 3.769679841184094e-05, + "loss": 2.8111, + "step": 30022 + }, + { + "epoch": 1.8637407660314111, + "grad_norm": 0.1414269785151426, + "learning_rate": 3.7693297961216545e-05, + "loss": 2.7474, + "step": 30023 + }, + { + "epoch": 1.863802843131169, + "grad_norm": 0.1533563838731174, + "learning_rate": 3.768979757479919e-05, + "loss": 2.8374, + "step": 30024 + }, + { + "epoch": 1.8638649202309268, + "grad_norm": 0.16730922585939947, + "learning_rate": 3.7686297252607163e-05, + "loss": 2.7704, + "step": 30025 + }, + { + "epoch": 1.8639269973306847, + "grad_norm": 0.1373451596514301, + "learning_rate": 3.7682796994658695e-05, + "loss": 2.7418, + "step": 30026 + }, + { + "epoch": 1.8639890744304426, + "grad_norm": 0.15392290741680043, + "learning_rate": 3.767929680097209e-05, + "loss": 2.7449, + "step": 30027 + }, + { + "epoch": 1.8640511515302005, + "grad_norm": 0.1538108220014686, + "learning_rate": 3.767579667156556e-05, + "loss": 2.8494, + "step": 30028 + }, + { + "epoch": 1.8641132286299584, + "grad_norm": 0.154078340896008, + "learning_rate": 3.767229660645741e-05, + "loss": 2.6233, + "step": 30029 + }, + { + "epoch": 1.8641753057297163, + "grad_norm": 0.14599455000147327, + "learning_rate": 3.766879660566587e-05, + "loss": 2.8272, + "step": 30030 + }, + { + "epoch": 1.864237382829474, + "grad_norm": 0.166890297306433, + "learning_rate": 3.766529666920922e-05, + "loss": 2.7875, + "step": 30031 + }, + { + "epoch": 1.864299459929232, + "grad_norm": 0.1622724689313139, + "learning_rate": 3.7661796797105696e-05, + "loss": 2.7318, + "step": 30032 + }, + { + "epoch": 1.8643615370289899, + "grad_norm": 0.16753274863458173, + "learning_rate": 3.765829698937359e-05, + "loss": 2.7071, + "step": 30033 + }, + { + "epoch": 1.8644236141287478, + "grad_norm": 0.18976180501888124, + "learning_rate": 3.7654797246031124e-05, + "loss": 2.776, + "step": 30034 + }, + { + "epoch": 1.8644856912285057, + "grad_norm": 0.14013628012558615, + "learning_rate": 3.7651297567096596e-05, + "loss": 2.7161, + "step": 30035 + }, + { + "epoch": 1.8645477683282636, + "grad_norm": 0.15325008290313483, + "learning_rate": 3.764779795258823e-05, + "loss": 2.8329, + "step": 30036 + }, + { + "epoch": 1.8646098454280216, + "grad_norm": 0.1719159489315575, + "learning_rate": 3.7644298402524316e-05, + "loss": 2.7654, + "step": 30037 + }, + { + "epoch": 1.8646719225277795, + "grad_norm": 0.16379150648088206, + "learning_rate": 3.764079891692309e-05, + "loss": 2.8254, + "step": 30038 + }, + { + "epoch": 1.8647339996275374, + "grad_norm": 0.1481502979441166, + "learning_rate": 3.7637299495802825e-05, + "loss": 2.8221, + "step": 30039 + }, + { + "epoch": 1.8647960767272953, + "grad_norm": 0.17237112119313414, + "learning_rate": 3.7633800139181756e-05, + "loss": 2.8407, + "step": 30040 + }, + { + "epoch": 1.8648581538270532, + "grad_norm": 0.1435624983342876, + "learning_rate": 3.763030084707817e-05, + "loss": 2.6567, + "step": 30041 + }, + { + "epoch": 1.8649202309268111, + "grad_norm": 0.14986688275224952, + "learning_rate": 3.762680161951029e-05, + "loss": 2.7423, + "step": 30042 + }, + { + "epoch": 1.864982308026569, + "grad_norm": 0.14182751484800918, + "learning_rate": 3.76233024564964e-05, + "loss": 2.8101, + "step": 30043 + }, + { + "epoch": 1.865044385126327, + "grad_norm": 0.136939476854218, + "learning_rate": 3.761980335805474e-05, + "loss": 2.7186, + "step": 30044 + }, + { + "epoch": 1.865106462226085, + "grad_norm": 0.1466595111385173, + "learning_rate": 3.761630432420358e-05, + "loss": 2.8012, + "step": 30045 + }, + { + "epoch": 1.8651685393258428, + "grad_norm": 0.1415644169460804, + "learning_rate": 3.7612805354961165e-05, + "loss": 2.7568, + "step": 30046 + }, + { + "epoch": 1.8652306164256007, + "grad_norm": 0.13914061291976781, + "learning_rate": 3.760930645034575e-05, + "loss": 2.6599, + "step": 30047 + }, + { + "epoch": 1.8652926935253586, + "grad_norm": 0.142606536438253, + "learning_rate": 3.760580761037559e-05, + "loss": 2.689, + "step": 30048 + }, + { + "epoch": 1.8653547706251163, + "grad_norm": 0.142235160532126, + "learning_rate": 3.760230883506894e-05, + "loss": 2.7213, + "step": 30049 + }, + { + "epoch": 1.8654168477248743, + "grad_norm": 0.14947129069878032, + "learning_rate": 3.759881012444406e-05, + "loss": 2.739, + "step": 30050 + }, + { + "epoch": 1.8654789248246322, + "grad_norm": 0.1547493216252441, + "learning_rate": 3.759531147851919e-05, + "loss": 2.7551, + "step": 30051 + }, + { + "epoch": 1.86554100192439, + "grad_norm": 0.1517607095802705, + "learning_rate": 3.7591812897312594e-05, + "loss": 2.8017, + "step": 30052 + }, + { + "epoch": 1.865603079024148, + "grad_norm": 0.14314254973814056, + "learning_rate": 3.758831438084251e-05, + "loss": 2.7801, + "step": 30053 + }, + { + "epoch": 1.865665156123906, + "grad_norm": 0.1423199681076169, + "learning_rate": 3.7584815929127225e-05, + "loss": 2.822, + "step": 30054 + }, + { + "epoch": 1.8657272332236636, + "grad_norm": 0.145934503379541, + "learning_rate": 3.7581317542184955e-05, + "loss": 2.7767, + "step": 30055 + }, + { + "epoch": 1.8657893103234215, + "grad_norm": 0.14704166579167702, + "learning_rate": 3.7577819220033976e-05, + "loss": 2.697, + "step": 30056 + }, + { + "epoch": 1.8658513874231795, + "grad_norm": 0.15437187110571754, + "learning_rate": 3.757432096269252e-05, + "loss": 2.7295, + "step": 30057 + }, + { + "epoch": 1.8659134645229374, + "grad_norm": 0.15775663454020933, + "learning_rate": 3.757082277017886e-05, + "loss": 2.7782, + "step": 30058 + }, + { + "epoch": 1.8659755416226953, + "grad_norm": 0.13944463597166204, + "learning_rate": 3.756732464251122e-05, + "loss": 2.7729, + "step": 30059 + }, + { + "epoch": 1.8660376187224532, + "grad_norm": 0.14989343424112733, + "learning_rate": 3.756382657970788e-05, + "loss": 2.6941, + "step": 30060 + }, + { + "epoch": 1.8660996958222111, + "grad_norm": 0.14958434904062703, + "learning_rate": 3.756032858178708e-05, + "loss": 2.8408, + "step": 30061 + }, + { + "epoch": 1.866161772921969, + "grad_norm": 0.1371884044886775, + "learning_rate": 3.7556830648767045e-05, + "loss": 2.7581, + "step": 30062 + }, + { + "epoch": 1.866223850021727, + "grad_norm": 0.1949923657335058, + "learning_rate": 3.755333278066606e-05, + "loss": 2.712, + "step": 30063 + }, + { + "epoch": 1.8662859271214849, + "grad_norm": 0.15494491803983565, + "learning_rate": 3.7549834977502344e-05, + "loss": 2.7377, + "step": 30064 + }, + { + "epoch": 1.8663480042212428, + "grad_norm": 0.14950750582967992, + "learning_rate": 3.754633723929418e-05, + "loss": 2.8323, + "step": 30065 + }, + { + "epoch": 1.8664100813210007, + "grad_norm": 0.15066706652684062, + "learning_rate": 3.754283956605978e-05, + "loss": 2.7349, + "step": 30066 + }, + { + "epoch": 1.8664721584207586, + "grad_norm": 0.14771187506555872, + "learning_rate": 3.7539341957817424e-05, + "loss": 2.7785, + "step": 30067 + }, + { + "epoch": 1.8665342355205166, + "grad_norm": 0.17777885676862026, + "learning_rate": 3.753584441458534e-05, + "loss": 2.8309, + "step": 30068 + }, + { + "epoch": 1.8665963126202745, + "grad_norm": 0.14682302113213894, + "learning_rate": 3.753234693638176e-05, + "loss": 2.7201, + "step": 30069 + }, + { + "epoch": 1.8666583897200324, + "grad_norm": 0.15287619857481985, + "learning_rate": 3.7528849523224984e-05, + "loss": 2.774, + "step": 30070 + }, + { + "epoch": 1.8667204668197903, + "grad_norm": 0.1538812072344039, + "learning_rate": 3.752535217513321e-05, + "loss": 2.6808, + "step": 30071 + }, + { + "epoch": 1.8667825439195482, + "grad_norm": 0.15977430156846986, + "learning_rate": 3.752185489212472e-05, + "loss": 2.6429, + "step": 30072 + }, + { + "epoch": 1.866844621019306, + "grad_norm": 0.15099761515077761, + "learning_rate": 3.7518357674217733e-05, + "loss": 2.8052, + "step": 30073 + }, + { + "epoch": 1.8669066981190638, + "grad_norm": 0.15262367134195437, + "learning_rate": 3.75148605214305e-05, + "loss": 2.8235, + "step": 30074 + }, + { + "epoch": 1.8669687752188218, + "grad_norm": 0.15969312977058456, + "learning_rate": 3.751136343378128e-05, + "loss": 2.6397, + "step": 30075 + }, + { + "epoch": 1.8670308523185797, + "grad_norm": 0.1514590015674299, + "learning_rate": 3.7507866411288293e-05, + "loss": 2.7121, + "step": 30076 + }, + { + "epoch": 1.8670929294183376, + "grad_norm": 0.16049252074928969, + "learning_rate": 3.7504369453969824e-05, + "loss": 2.7013, + "step": 30077 + }, + { + "epoch": 1.8671550065180955, + "grad_norm": 0.14935870173454485, + "learning_rate": 3.750087256184407e-05, + "loss": 2.6863, + "step": 30078 + }, + { + "epoch": 1.8672170836178532, + "grad_norm": 0.141779715180156, + "learning_rate": 3.7497375734929305e-05, + "loss": 2.7321, + "step": 30079 + }, + { + "epoch": 1.8672791607176111, + "grad_norm": 0.14687117688296308, + "learning_rate": 3.749387897324377e-05, + "loss": 2.7363, + "step": 30080 + }, + { + "epoch": 1.867341237817369, + "grad_norm": 0.15685380855228426, + "learning_rate": 3.7490382276805705e-05, + "loss": 2.742, + "step": 30081 + }, + { + "epoch": 1.867403314917127, + "grad_norm": 0.14671567457630072, + "learning_rate": 3.748688564563334e-05, + "loss": 2.83, + "step": 30082 + }, + { + "epoch": 1.8674653920168849, + "grad_norm": 0.14350674925194334, + "learning_rate": 3.7483389079744944e-05, + "loss": 2.7507, + "step": 30083 + }, + { + "epoch": 1.8675274691166428, + "grad_norm": 0.15307931729481516, + "learning_rate": 3.7479892579158735e-05, + "loss": 2.7369, + "step": 30084 + }, + { + "epoch": 1.8675895462164007, + "grad_norm": 0.1438755549603256, + "learning_rate": 3.747639614389298e-05, + "loss": 2.744, + "step": 30085 + }, + { + "epoch": 1.8676516233161586, + "grad_norm": 0.1607607505156495, + "learning_rate": 3.747289977396591e-05, + "loss": 2.7382, + "step": 30086 + }, + { + "epoch": 1.8677137004159166, + "grad_norm": 0.1498199944962978, + "learning_rate": 3.746940346939574e-05, + "loss": 2.8425, + "step": 30087 + }, + { + "epoch": 1.8677757775156745, + "grad_norm": 0.14287531592909025, + "learning_rate": 3.7465907230200755e-05, + "loss": 2.7149, + "step": 30088 + }, + { + "epoch": 1.8678378546154324, + "grad_norm": 0.16318155101362786, + "learning_rate": 3.746241105639916e-05, + "loss": 2.7632, + "step": 30089 + }, + { + "epoch": 1.8678999317151903, + "grad_norm": 0.14748314822842898, + "learning_rate": 3.7458914948009215e-05, + "loss": 2.716, + "step": 30090 + }, + { + "epoch": 1.8679620088149482, + "grad_norm": 0.1660226203342235, + "learning_rate": 3.745541890504916e-05, + "loss": 2.72, + "step": 30091 + }, + { + "epoch": 1.8680240859147061, + "grad_norm": 0.16362775808921104, + "learning_rate": 3.745192292753723e-05, + "loss": 2.6813, + "step": 30092 + }, + { + "epoch": 1.868086163014464, + "grad_norm": 0.16623538002286195, + "learning_rate": 3.744842701549165e-05, + "loss": 2.7525, + "step": 30093 + }, + { + "epoch": 1.868148240114222, + "grad_norm": 0.14725870197122023, + "learning_rate": 3.744493116893069e-05, + "loss": 2.7586, + "step": 30094 + }, + { + "epoch": 1.86821031721398, + "grad_norm": 0.1612856631245273, + "learning_rate": 3.744143538787256e-05, + "loss": 2.8376, + "step": 30095 + }, + { + "epoch": 1.8682723943137378, + "grad_norm": 0.147095258403149, + "learning_rate": 3.743793967233552e-05, + "loss": 2.8194, + "step": 30096 + }, + { + "epoch": 1.8683344714134955, + "grad_norm": 0.15229234684084725, + "learning_rate": 3.743444402233779e-05, + "loss": 2.8201, + "step": 30097 + }, + { + "epoch": 1.8683965485132534, + "grad_norm": 0.1707461250582494, + "learning_rate": 3.7430948437897615e-05, + "loss": 2.7783, + "step": 30098 + }, + { + "epoch": 1.8684586256130113, + "grad_norm": 0.14762436103601767, + "learning_rate": 3.7427452919033244e-05, + "loss": 2.7582, + "step": 30099 + }, + { + "epoch": 1.8685207027127693, + "grad_norm": 0.1713939676083166, + "learning_rate": 3.7423957465762894e-05, + "loss": 2.7311, + "step": 30100 + }, + { + "epoch": 1.8685827798125272, + "grad_norm": 0.14716695065010135, + "learning_rate": 3.74204620781048e-05, + "loss": 2.8514, + "step": 30101 + }, + { + "epoch": 1.868644856912285, + "grad_norm": 0.1832967140913096, + "learning_rate": 3.7416966756077226e-05, + "loss": 2.8142, + "step": 30102 + }, + { + "epoch": 1.8687069340120428, + "grad_norm": 0.15969049953883435, + "learning_rate": 3.7413471499698384e-05, + "loss": 2.7338, + "step": 30103 + }, + { + "epoch": 1.8687690111118007, + "grad_norm": 0.14029320675546797, + "learning_rate": 3.740997630898653e-05, + "loss": 2.6973, + "step": 30104 + }, + { + "epoch": 1.8688310882115586, + "grad_norm": 0.17426739599422003, + "learning_rate": 3.7406481183959865e-05, + "loss": 2.7649, + "step": 30105 + }, + { + "epoch": 1.8688931653113166, + "grad_norm": 0.1475693638807588, + "learning_rate": 3.7402986124636666e-05, + "loss": 2.7282, + "step": 30106 + }, + { + "epoch": 1.8689552424110745, + "grad_norm": 0.1473183858648073, + "learning_rate": 3.7399491131035125e-05, + "loss": 2.7885, + "step": 30107 + }, + { + "epoch": 1.8690173195108324, + "grad_norm": 0.14324659255978234, + "learning_rate": 3.739599620317352e-05, + "loss": 2.7593, + "step": 30108 + }, + { + "epoch": 1.8690793966105903, + "grad_norm": 0.1494634397110921, + "learning_rate": 3.7392501341070055e-05, + "loss": 2.675, + "step": 30109 + }, + { + "epoch": 1.8691414737103482, + "grad_norm": 0.15127985496090537, + "learning_rate": 3.738900654474297e-05, + "loss": 2.7153, + "step": 30110 + }, + { + "epoch": 1.8692035508101061, + "grad_norm": 0.17627444484973853, + "learning_rate": 3.7385511814210495e-05, + "loss": 2.789, + "step": 30111 + }, + { + "epoch": 1.869265627909864, + "grad_norm": 0.14224430746737504, + "learning_rate": 3.738201714949087e-05, + "loss": 2.7664, + "step": 30112 + }, + { + "epoch": 1.869327705009622, + "grad_norm": 0.14995400100965686, + "learning_rate": 3.7378522550602324e-05, + "loss": 2.7216, + "step": 30113 + }, + { + "epoch": 1.86938978210938, + "grad_norm": 0.13964149484516075, + "learning_rate": 3.737502801756308e-05, + "loss": 2.7657, + "step": 30114 + }, + { + "epoch": 1.8694518592091378, + "grad_norm": 0.15263381877152937, + "learning_rate": 3.73715335503914e-05, + "loss": 2.7152, + "step": 30115 + }, + { + "epoch": 1.8695139363088957, + "grad_norm": 0.14261665215622943, + "learning_rate": 3.736803914910548e-05, + "loss": 2.693, + "step": 30116 + }, + { + "epoch": 1.8695760134086536, + "grad_norm": 0.13929075169864238, + "learning_rate": 3.736454481372357e-05, + "loss": 2.7754, + "step": 30117 + }, + { + "epoch": 1.8696380905084116, + "grad_norm": 0.14916668777581107, + "learning_rate": 3.73610505442639e-05, + "loss": 2.7358, + "step": 30118 + }, + { + "epoch": 1.8697001676081695, + "grad_norm": 0.20937240976183252, + "learning_rate": 3.73575563407447e-05, + "loss": 2.7241, + "step": 30119 + }, + { + "epoch": 1.8697622447079274, + "grad_norm": 0.1417949211364308, + "learning_rate": 3.735406220318418e-05, + "loss": 2.7552, + "step": 30120 + }, + { + "epoch": 1.869824321807685, + "grad_norm": 0.15424687746479568, + "learning_rate": 3.735056813160061e-05, + "loss": 2.8008, + "step": 30121 + }, + { + "epoch": 1.869886398907443, + "grad_norm": 0.16084398913991052, + "learning_rate": 3.734707412601219e-05, + "loss": 2.7652, + "step": 30122 + }, + { + "epoch": 1.869948476007201, + "grad_norm": 0.1470275028837719, + "learning_rate": 3.7343580186437154e-05, + "loss": 2.7639, + "step": 30123 + }, + { + "epoch": 1.8700105531069589, + "grad_norm": 0.15454583062600147, + "learning_rate": 3.7340086312893734e-05, + "loss": 2.7538, + "step": 30124 + }, + { + "epoch": 1.8700726302067168, + "grad_norm": 0.1440220251859941, + "learning_rate": 3.733659250540015e-05, + "loss": 2.8644, + "step": 30125 + }, + { + "epoch": 1.8701347073064747, + "grad_norm": 0.1507064549823247, + "learning_rate": 3.733309876397465e-05, + "loss": 2.7457, + "step": 30126 + }, + { + "epoch": 1.8701967844062324, + "grad_norm": 0.13684718366489113, + "learning_rate": 3.7329605088635435e-05, + "loss": 2.6621, + "step": 30127 + }, + { + "epoch": 1.8702588615059903, + "grad_norm": 0.15235093295452065, + "learning_rate": 3.732611147940076e-05, + "loss": 2.7779, + "step": 30128 + }, + { + "epoch": 1.8703209386057482, + "grad_norm": 0.1454338231242839, + "learning_rate": 3.7322617936288814e-05, + "loss": 2.7921, + "step": 30129 + }, + { + "epoch": 1.8703830157055061, + "grad_norm": 0.15137748067990436, + "learning_rate": 3.731912445931787e-05, + "loss": 2.8261, + "step": 30130 + }, + { + "epoch": 1.870445092805264, + "grad_norm": 0.14618349964927893, + "learning_rate": 3.7315631048506125e-05, + "loss": 2.7462, + "step": 30131 + }, + { + "epoch": 1.870507169905022, + "grad_norm": 0.14732819070510136, + "learning_rate": 3.731213770387182e-05, + "loss": 2.7045, + "step": 30132 + }, + { + "epoch": 1.87056924700478, + "grad_norm": 0.14729644113996535, + "learning_rate": 3.730864442543317e-05, + "loss": 2.8196, + "step": 30133 + }, + { + "epoch": 1.8706313241045378, + "grad_norm": 0.14163472017582351, + "learning_rate": 3.730515121320838e-05, + "loss": 2.772, + "step": 30134 + }, + { + "epoch": 1.8706934012042957, + "grad_norm": 0.1834297157566054, + "learning_rate": 3.730165806721571e-05, + "loss": 2.8019, + "step": 30135 + }, + { + "epoch": 1.8707554783040536, + "grad_norm": 0.15743862682975104, + "learning_rate": 3.7298164987473385e-05, + "loss": 2.6712, + "step": 30136 + }, + { + "epoch": 1.8708175554038116, + "grad_norm": 0.15658770663822633, + "learning_rate": 3.729467197399961e-05, + "loss": 2.6763, + "step": 30137 + }, + { + "epoch": 1.8708796325035695, + "grad_norm": 0.15540131515355446, + "learning_rate": 3.729117902681262e-05, + "loss": 2.7653, + "step": 30138 + }, + { + "epoch": 1.8709417096033274, + "grad_norm": 0.17062613455426634, + "learning_rate": 3.7287686145930625e-05, + "loss": 2.7913, + "step": 30139 + }, + { + "epoch": 1.8710037867030853, + "grad_norm": 0.18504663594848994, + "learning_rate": 3.7284193331371864e-05, + "loss": 2.8522, + "step": 30140 + }, + { + "epoch": 1.8710658638028432, + "grad_norm": 0.15062139813481365, + "learning_rate": 3.728070058315455e-05, + "loss": 2.7189, + "step": 30141 + }, + { + "epoch": 1.8711279409026011, + "grad_norm": 0.16248624151929408, + "learning_rate": 3.727720790129691e-05, + "loss": 2.7644, + "step": 30142 + }, + { + "epoch": 1.871190018002359, + "grad_norm": 0.14589426067864136, + "learning_rate": 3.727371528581717e-05, + "loss": 2.6996, + "step": 30143 + }, + { + "epoch": 1.871252095102117, + "grad_norm": 0.14678463362549404, + "learning_rate": 3.727022273673354e-05, + "loss": 2.7936, + "step": 30144 + }, + { + "epoch": 1.8713141722018747, + "grad_norm": 0.1665125142666264, + "learning_rate": 3.726673025406425e-05, + "loss": 2.7064, + "step": 30145 + }, + { + "epoch": 1.8713762493016326, + "grad_norm": 0.1539726956223703, + "learning_rate": 3.726323783782752e-05, + "loss": 2.707, + "step": 30146 + }, + { + "epoch": 1.8714383264013905, + "grad_norm": 0.15553685402695033, + "learning_rate": 3.7259745488041575e-05, + "loss": 2.7398, + "step": 30147 + }, + { + "epoch": 1.8715004035011484, + "grad_norm": 0.17225991842977492, + "learning_rate": 3.725625320472462e-05, + "loss": 2.7106, + "step": 30148 + }, + { + "epoch": 1.8715624806009064, + "grad_norm": 0.14988699719530232, + "learning_rate": 3.72527609878949e-05, + "loss": 2.7619, + "step": 30149 + }, + { + "epoch": 1.8716245577006643, + "grad_norm": 0.17433557485123477, + "learning_rate": 3.724926883757061e-05, + "loss": 2.7672, + "step": 30150 + }, + { + "epoch": 1.871686634800422, + "grad_norm": 0.1536744274321117, + "learning_rate": 3.724577675376999e-05, + "loss": 2.7898, + "step": 30151 + }, + { + "epoch": 1.8717487119001799, + "grad_norm": 0.15637895507903216, + "learning_rate": 3.724228473651123e-05, + "loss": 2.7337, + "step": 30152 + }, + { + "epoch": 1.8718107889999378, + "grad_norm": 0.16052580855327384, + "learning_rate": 3.723879278581258e-05, + "loss": 2.8082, + "step": 30153 + }, + { + "epoch": 1.8718728660996957, + "grad_norm": 0.16054865402201288, + "learning_rate": 3.723530090169224e-05, + "loss": 2.7668, + "step": 30154 + }, + { + "epoch": 1.8719349431994536, + "grad_norm": 0.15106404621025815, + "learning_rate": 3.723180908416845e-05, + "loss": 2.6922, + "step": 30155 + }, + { + "epoch": 1.8719970202992116, + "grad_norm": 0.13477127782050854, + "learning_rate": 3.722831733325939e-05, + "loss": 2.7334, + "step": 30156 + }, + { + "epoch": 1.8720590973989695, + "grad_norm": 0.1526626037790334, + "learning_rate": 3.7224825648983313e-05, + "loss": 2.7283, + "step": 30157 + }, + { + "epoch": 1.8721211744987274, + "grad_norm": 0.1422900202331499, + "learning_rate": 3.722133403135842e-05, + "loss": 2.7835, + "step": 30158 + }, + { + "epoch": 1.8721832515984853, + "grad_norm": 0.1404233387240141, + "learning_rate": 3.7217842480402925e-05, + "loss": 2.8222, + "step": 30159 + }, + { + "epoch": 1.8722453286982432, + "grad_norm": 0.15094733923874284, + "learning_rate": 3.7214350996135056e-05, + "loss": 2.77, + "step": 30160 + }, + { + "epoch": 1.8723074057980011, + "grad_norm": 0.15160258048309702, + "learning_rate": 3.721085957857301e-05, + "loss": 2.7152, + "step": 30161 + }, + { + "epoch": 1.872369482897759, + "grad_norm": 0.14690397565193258, + "learning_rate": 3.720736822773503e-05, + "loss": 2.7539, + "step": 30162 + }, + { + "epoch": 1.872431559997517, + "grad_norm": 0.14410451447050263, + "learning_rate": 3.72038769436393e-05, + "loss": 2.6805, + "step": 30163 + }, + { + "epoch": 1.872493637097275, + "grad_norm": 0.1456559278631505, + "learning_rate": 3.720038572630406e-05, + "loss": 2.6415, + "step": 30164 + }, + { + "epoch": 1.8725557141970328, + "grad_norm": 0.14514245497418166, + "learning_rate": 3.719689457574751e-05, + "loss": 2.8363, + "step": 30165 + }, + { + "epoch": 1.8726177912967907, + "grad_norm": 0.14987579383950064, + "learning_rate": 3.719340349198787e-05, + "loss": 2.7386, + "step": 30166 + }, + { + "epoch": 1.8726798683965487, + "grad_norm": 0.14495577197275886, + "learning_rate": 3.7189912475043345e-05, + "loss": 2.7386, + "step": 30167 + }, + { + "epoch": 1.8727419454963066, + "grad_norm": 0.14665623437989903, + "learning_rate": 3.718642152493215e-05, + "loss": 2.7324, + "step": 30168 + }, + { + "epoch": 1.8728040225960643, + "grad_norm": 0.1548300175683473, + "learning_rate": 3.718293064167253e-05, + "loss": 2.8489, + "step": 30169 + }, + { + "epoch": 1.8728660996958222, + "grad_norm": 0.14597907333929971, + "learning_rate": 3.717943982528265e-05, + "loss": 2.6971, + "step": 30170 + }, + { + "epoch": 1.87292817679558, + "grad_norm": 0.1397887039000887, + "learning_rate": 3.717594907578076e-05, + "loss": 2.6778, + "step": 30171 + }, + { + "epoch": 1.872990253895338, + "grad_norm": 0.1403020952929332, + "learning_rate": 3.717245839318506e-05, + "loss": 2.6905, + "step": 30172 + }, + { + "epoch": 1.873052330995096, + "grad_norm": 0.13713881809547634, + "learning_rate": 3.7168967777513754e-05, + "loss": 2.7538, + "step": 30173 + }, + { + "epoch": 1.8731144080948539, + "grad_norm": 0.14178502267110846, + "learning_rate": 3.7165477228785055e-05, + "loss": 2.7734, + "step": 30174 + }, + { + "epoch": 1.8731764851946116, + "grad_norm": 0.13401072301457725, + "learning_rate": 3.7161986747017177e-05, + "loss": 2.7415, + "step": 30175 + }, + { + "epoch": 1.8732385622943695, + "grad_norm": 0.1425415475513195, + "learning_rate": 3.7158496332228334e-05, + "loss": 2.7679, + "step": 30176 + }, + { + "epoch": 1.8733006393941274, + "grad_norm": 0.13658877052406748, + "learning_rate": 3.715500598443673e-05, + "loss": 2.7806, + "step": 30177 + }, + { + "epoch": 1.8733627164938853, + "grad_norm": 0.1395233968316772, + "learning_rate": 3.715151570366058e-05, + "loss": 2.7841, + "step": 30178 + }, + { + "epoch": 1.8734247935936432, + "grad_norm": 0.14295170453085418, + "learning_rate": 3.714802548991808e-05, + "loss": 2.7638, + "step": 30179 + }, + { + "epoch": 1.8734868706934011, + "grad_norm": 0.13793328195456553, + "learning_rate": 3.714453534322747e-05, + "loss": 2.8097, + "step": 30180 + }, + { + "epoch": 1.873548947793159, + "grad_norm": 0.14218631235611795, + "learning_rate": 3.7141045263606925e-05, + "loss": 2.726, + "step": 30181 + }, + { + "epoch": 1.873611024892917, + "grad_norm": 0.14176074978646358, + "learning_rate": 3.713755525107468e-05, + "loss": 2.8673, + "step": 30182 + }, + { + "epoch": 1.873673101992675, + "grad_norm": 0.13728002052616253, + "learning_rate": 3.713406530564893e-05, + "loss": 2.7416, + "step": 30183 + }, + { + "epoch": 1.8737351790924328, + "grad_norm": 0.1410885561497196, + "learning_rate": 3.713057542734787e-05, + "loss": 2.8087, + "step": 30184 + }, + { + "epoch": 1.8737972561921907, + "grad_norm": 0.14815181294338414, + "learning_rate": 3.712708561618974e-05, + "loss": 2.769, + "step": 30185 + }, + { + "epoch": 1.8738593332919486, + "grad_norm": 0.13389669390913733, + "learning_rate": 3.712359587219272e-05, + "loss": 2.7373, + "step": 30186 + }, + { + "epoch": 1.8739214103917066, + "grad_norm": 0.14261591423592626, + "learning_rate": 3.7120106195375034e-05, + "loss": 2.7561, + "step": 30187 + }, + { + "epoch": 1.8739834874914645, + "grad_norm": 0.13782937037432155, + "learning_rate": 3.7116616585754874e-05, + "loss": 2.7209, + "step": 30188 + }, + { + "epoch": 1.8740455645912224, + "grad_norm": 0.15228818833905483, + "learning_rate": 3.711312704335045e-05, + "loss": 2.7687, + "step": 30189 + }, + { + "epoch": 1.8741076416909803, + "grad_norm": 0.14306237667515057, + "learning_rate": 3.710963756817997e-05, + "loss": 2.8123, + "step": 30190 + }, + { + "epoch": 1.8741697187907382, + "grad_norm": 0.1515160474673968, + "learning_rate": 3.7106148160261654e-05, + "loss": 2.76, + "step": 30191 + }, + { + "epoch": 1.8742317958904962, + "grad_norm": 0.155963898031639, + "learning_rate": 3.7102658819613676e-05, + "loss": 2.7934, + "step": 30192 + }, + { + "epoch": 1.8742938729902539, + "grad_norm": 0.15266223774688092, + "learning_rate": 3.709916954625427e-05, + "loss": 2.8168, + "step": 30193 + }, + { + "epoch": 1.8743559500900118, + "grad_norm": 0.1519150509732242, + "learning_rate": 3.709568034020163e-05, + "loss": 2.7992, + "step": 30194 + }, + { + "epoch": 1.8744180271897697, + "grad_norm": 0.15215773719308373, + "learning_rate": 3.709219120147395e-05, + "loss": 2.7283, + "step": 30195 + }, + { + "epoch": 1.8744801042895276, + "grad_norm": 0.16519058765610056, + "learning_rate": 3.708870213008945e-05, + "loss": 2.7858, + "step": 30196 + }, + { + "epoch": 1.8745421813892855, + "grad_norm": 0.1545520736329841, + "learning_rate": 3.708521312606631e-05, + "loss": 2.7598, + "step": 30197 + }, + { + "epoch": 1.8746042584890434, + "grad_norm": 0.15143769990661438, + "learning_rate": 3.7081724189422764e-05, + "loss": 2.8565, + "step": 30198 + }, + { + "epoch": 1.8746663355888011, + "grad_norm": 0.18285602108597196, + "learning_rate": 3.7078235320176986e-05, + "loss": 2.8397, + "step": 30199 + }, + { + "epoch": 1.874728412688559, + "grad_norm": 0.14459708364051638, + "learning_rate": 3.7074746518347184e-05, + "loss": 2.7745, + "step": 30200 + }, + { + "epoch": 1.874790489788317, + "grad_norm": 0.18763028029458764, + "learning_rate": 3.7071257783951586e-05, + "loss": 2.7465, + "step": 30201 + }, + { + "epoch": 1.874852566888075, + "grad_norm": 0.14058765604294224, + "learning_rate": 3.706776911700836e-05, + "loss": 2.7604, + "step": 30202 + }, + { + "epoch": 1.8749146439878328, + "grad_norm": 0.15185194226061588, + "learning_rate": 3.706428051753574e-05, + "loss": 2.8587, + "step": 30203 + }, + { + "epoch": 1.8749767210875907, + "grad_norm": 0.1770490176012556, + "learning_rate": 3.706079198555189e-05, + "loss": 2.7533, + "step": 30204 + }, + { + "epoch": 1.8750387981873486, + "grad_norm": 0.14750376452070973, + "learning_rate": 3.705730352107505e-05, + "loss": 2.7632, + "step": 30205 + }, + { + "epoch": 1.8751008752871066, + "grad_norm": 0.1425152796889694, + "learning_rate": 3.705381512412338e-05, + "loss": 2.7861, + "step": 30206 + }, + { + "epoch": 1.8751629523868645, + "grad_norm": 0.1431523673470552, + "learning_rate": 3.7050326794715115e-05, + "loss": 2.751, + "step": 30207 + }, + { + "epoch": 1.8752250294866224, + "grad_norm": 0.13495928905490595, + "learning_rate": 3.704683853286844e-05, + "loss": 2.7402, + "step": 30208 + }, + { + "epoch": 1.8752871065863803, + "grad_norm": 0.14000858869135768, + "learning_rate": 3.704335033860154e-05, + "loss": 2.7519, + "step": 30209 + }, + { + "epoch": 1.8753491836861382, + "grad_norm": 0.14931562709168453, + "learning_rate": 3.703986221193264e-05, + "loss": 2.8027, + "step": 30210 + }, + { + "epoch": 1.8754112607858962, + "grad_norm": 0.1482525099426069, + "learning_rate": 3.7036374152879905e-05, + "loss": 2.8553, + "step": 30211 + }, + { + "epoch": 1.875473337885654, + "grad_norm": 0.14761851272533252, + "learning_rate": 3.7032886161461574e-05, + "loss": 2.8324, + "step": 30212 + }, + { + "epoch": 1.875535414985412, + "grad_norm": 0.147277851001977, + "learning_rate": 3.7029398237695805e-05, + "loss": 2.7313, + "step": 30213 + }, + { + "epoch": 1.87559749208517, + "grad_norm": 0.14488995347021144, + "learning_rate": 3.702591038160083e-05, + "loss": 2.7353, + "step": 30214 + }, + { + "epoch": 1.8756595691849278, + "grad_norm": 0.1434436420505256, + "learning_rate": 3.702242259319482e-05, + "loss": 2.7671, + "step": 30215 + }, + { + "epoch": 1.8757216462846857, + "grad_norm": 0.15411095947120973, + "learning_rate": 3.701893487249599e-05, + "loss": 2.7647, + "step": 30216 + }, + { + "epoch": 1.8757837233844434, + "grad_norm": 0.15018626980787703, + "learning_rate": 3.7015447219522513e-05, + "loss": 2.7736, + "step": 30217 + }, + { + "epoch": 1.8758458004842014, + "grad_norm": 0.15018442186375378, + "learning_rate": 3.701195963429262e-05, + "loss": 2.7669, + "step": 30218 + }, + { + "epoch": 1.8759078775839593, + "grad_norm": 0.14459922841487496, + "learning_rate": 3.7008472116824485e-05, + "loss": 2.7557, + "step": 30219 + }, + { + "epoch": 1.8759699546837172, + "grad_norm": 0.14807368541517527, + "learning_rate": 3.700498466713629e-05, + "loss": 2.7562, + "step": 30220 + }, + { + "epoch": 1.876032031783475, + "grad_norm": 0.14277279641024454, + "learning_rate": 3.700149728524625e-05, + "loss": 2.7997, + "step": 30221 + }, + { + "epoch": 1.876094108883233, + "grad_norm": 0.140775586669511, + "learning_rate": 3.6998009971172556e-05, + "loss": 2.6898, + "step": 30222 + }, + { + "epoch": 1.8761561859829907, + "grad_norm": 0.15504278890216744, + "learning_rate": 3.6994522724933405e-05, + "loss": 2.7983, + "step": 30223 + }, + { + "epoch": 1.8762182630827486, + "grad_norm": 0.14353316009416608, + "learning_rate": 3.6991035546546974e-05, + "loss": 2.7373, + "step": 30224 + }, + { + "epoch": 1.8762803401825066, + "grad_norm": 0.13967549788780775, + "learning_rate": 3.698754843603148e-05, + "loss": 2.825, + "step": 30225 + }, + { + "epoch": 1.8763424172822645, + "grad_norm": 0.15389901182901272, + "learning_rate": 3.69840613934051e-05, + "loss": 2.8208, + "step": 30226 + }, + { + "epoch": 1.8764044943820224, + "grad_norm": 0.17138308877171024, + "learning_rate": 3.698057441868603e-05, + "loss": 2.6361, + "step": 30227 + }, + { + "epoch": 1.8764665714817803, + "grad_norm": 0.1506676970209893, + "learning_rate": 3.697708751189246e-05, + "loss": 2.7862, + "step": 30228 + }, + { + "epoch": 1.8765286485815382, + "grad_norm": 0.1433829948134389, + "learning_rate": 3.69736006730426e-05, + "loss": 2.7049, + "step": 30229 + }, + { + "epoch": 1.8765907256812961, + "grad_norm": 0.16250969967870277, + "learning_rate": 3.6970113902154615e-05, + "loss": 2.6519, + "step": 30230 + }, + { + "epoch": 1.876652802781054, + "grad_norm": 0.15022138262672372, + "learning_rate": 3.696662719924672e-05, + "loss": 2.8289, + "step": 30231 + }, + { + "epoch": 1.876714879880812, + "grad_norm": 0.15197959193792668, + "learning_rate": 3.696314056433707e-05, + "loss": 2.7862, + "step": 30232 + }, + { + "epoch": 1.87677695698057, + "grad_norm": 0.1588362282385437, + "learning_rate": 3.69596539974439e-05, + "loss": 2.8137, + "step": 30233 + }, + { + "epoch": 1.8768390340803278, + "grad_norm": 0.14808218240567939, + "learning_rate": 3.695616749858538e-05, + "loss": 2.7163, + "step": 30234 + }, + { + "epoch": 1.8769011111800857, + "grad_norm": 0.15284630218274983, + "learning_rate": 3.69526810677797e-05, + "loss": 2.7154, + "step": 30235 + }, + { + "epoch": 1.8769631882798437, + "grad_norm": 0.16541890443610255, + "learning_rate": 3.6949194705045043e-05, + "loss": 2.7715, + "step": 30236 + }, + { + "epoch": 1.8770252653796016, + "grad_norm": 0.16108575769646868, + "learning_rate": 3.694570841039962e-05, + "loss": 2.741, + "step": 30237 + }, + { + "epoch": 1.8770873424793595, + "grad_norm": 0.14751168192654482, + "learning_rate": 3.69422221838616e-05, + "loss": 2.7161, + "step": 30238 + }, + { + "epoch": 1.8771494195791174, + "grad_norm": 0.15616874966492586, + "learning_rate": 3.693873602544917e-05, + "loss": 2.7302, + "step": 30239 + }, + { + "epoch": 1.8772114966788753, + "grad_norm": 0.14896020335132926, + "learning_rate": 3.693524993518053e-05, + "loss": 2.7846, + "step": 30240 + }, + { + "epoch": 1.877273573778633, + "grad_norm": 0.15933774928853559, + "learning_rate": 3.6931763913073866e-05, + "loss": 2.7353, + "step": 30241 + }, + { + "epoch": 1.877335650878391, + "grad_norm": 0.1536636396100969, + "learning_rate": 3.6928277959147354e-05, + "loss": 2.7481, + "step": 30242 + }, + { + "epoch": 1.8773977279781489, + "grad_norm": 0.14595396014392095, + "learning_rate": 3.69247920734192e-05, + "loss": 2.7109, + "step": 30243 + }, + { + "epoch": 1.8774598050779068, + "grad_norm": 0.14623170678139577, + "learning_rate": 3.692130625590757e-05, + "loss": 2.7204, + "step": 30244 + }, + { + "epoch": 1.8775218821776647, + "grad_norm": 0.14630900375681524, + "learning_rate": 3.691782050663066e-05, + "loss": 2.6896, + "step": 30245 + }, + { + "epoch": 1.8775839592774226, + "grad_norm": 0.15335521786613682, + "learning_rate": 3.6914334825606666e-05, + "loss": 2.6843, + "step": 30246 + }, + { + "epoch": 1.8776460363771803, + "grad_norm": 0.16205861549341435, + "learning_rate": 3.691084921285375e-05, + "loss": 2.7386, + "step": 30247 + }, + { + "epoch": 1.8777081134769382, + "grad_norm": 0.1512819234600179, + "learning_rate": 3.6907363668390125e-05, + "loss": 2.8654, + "step": 30248 + }, + { + "epoch": 1.8777701905766961, + "grad_norm": 0.15531739026704358, + "learning_rate": 3.6903878192233955e-05, + "loss": 2.7632, + "step": 30249 + }, + { + "epoch": 1.877832267676454, + "grad_norm": 0.1482631826799604, + "learning_rate": 3.6900392784403434e-05, + "loss": 2.7773, + "step": 30250 + }, + { + "epoch": 1.877894344776212, + "grad_norm": 0.14955282935525735, + "learning_rate": 3.6896907444916736e-05, + "loss": 2.8074, + "step": 30251 + }, + { + "epoch": 1.87795642187597, + "grad_norm": 0.14125789505280117, + "learning_rate": 3.689342217379207e-05, + "loss": 2.6991, + "step": 30252 + }, + { + "epoch": 1.8780184989757278, + "grad_norm": 0.1606647165713177, + "learning_rate": 3.6889936971047575e-05, + "loss": 2.6907, + "step": 30253 + }, + { + "epoch": 1.8780805760754857, + "grad_norm": 0.14195051933000757, + "learning_rate": 3.688645183670148e-05, + "loss": 2.795, + "step": 30254 + }, + { + "epoch": 1.8781426531752436, + "grad_norm": 0.15179116951531793, + "learning_rate": 3.688296677077195e-05, + "loss": 2.7699, + "step": 30255 + }, + { + "epoch": 1.8782047302750016, + "grad_norm": 0.17028122754796035, + "learning_rate": 3.687948177327717e-05, + "loss": 2.8601, + "step": 30256 + }, + { + "epoch": 1.8782668073747595, + "grad_norm": 0.15791492938563975, + "learning_rate": 3.687599684423532e-05, + "loss": 2.7091, + "step": 30257 + }, + { + "epoch": 1.8783288844745174, + "grad_norm": 0.14738749892212397, + "learning_rate": 3.687251198366456e-05, + "loss": 2.8745, + "step": 30258 + }, + { + "epoch": 1.8783909615742753, + "grad_norm": 0.14859923536121902, + "learning_rate": 3.6869027191583114e-05, + "loss": 2.8415, + "step": 30259 + }, + { + "epoch": 1.8784530386740332, + "grad_norm": 0.1528356326230378, + "learning_rate": 3.6865542468009125e-05, + "loss": 2.8336, + "step": 30260 + }, + { + "epoch": 1.8785151157737912, + "grad_norm": 0.1361166091974096, + "learning_rate": 3.68620578129608e-05, + "loss": 2.8343, + "step": 30261 + }, + { + "epoch": 1.878577192873549, + "grad_norm": 0.15298695983861488, + "learning_rate": 3.68585732264563e-05, + "loss": 2.6923, + "step": 30262 + }, + { + "epoch": 1.878639269973307, + "grad_norm": 0.15387427123654085, + "learning_rate": 3.685508870851382e-05, + "loss": 2.7012, + "step": 30263 + }, + { + "epoch": 1.878701347073065, + "grad_norm": 0.20675115711910066, + "learning_rate": 3.685160425915153e-05, + "loss": 2.7378, + "step": 30264 + }, + { + "epoch": 1.8787634241728226, + "grad_norm": 0.1434804438941778, + "learning_rate": 3.6848119878387595e-05, + "loss": 2.7563, + "step": 30265 + }, + { + "epoch": 1.8788255012725805, + "grad_norm": 0.1393276010302173, + "learning_rate": 3.684463556624023e-05, + "loss": 2.675, + "step": 30266 + }, + { + "epoch": 1.8788875783723384, + "grad_norm": 0.14276152403215395, + "learning_rate": 3.68411513227276e-05, + "loss": 2.7316, + "step": 30267 + }, + { + "epoch": 1.8789496554720964, + "grad_norm": 0.14251394656388802, + "learning_rate": 3.6837667147867874e-05, + "loss": 2.742, + "step": 30268 + }, + { + "epoch": 1.8790117325718543, + "grad_norm": 0.1385942497050879, + "learning_rate": 3.683418304167924e-05, + "loss": 2.6648, + "step": 30269 + }, + { + "epoch": 1.8790738096716122, + "grad_norm": 0.14409934411599493, + "learning_rate": 3.683069900417985e-05, + "loss": 2.768, + "step": 30270 + }, + { + "epoch": 1.87913588677137, + "grad_norm": 0.16462353872187951, + "learning_rate": 3.682721503538792e-05, + "loss": 2.7689, + "step": 30271 + }, + { + "epoch": 1.8791979638711278, + "grad_norm": 0.1417075385418922, + "learning_rate": 3.682373113532159e-05, + "loss": 2.7767, + "step": 30272 + }, + { + "epoch": 1.8792600409708857, + "grad_norm": 0.14938897190230693, + "learning_rate": 3.6820247303999066e-05, + "loss": 2.8473, + "step": 30273 + }, + { + "epoch": 1.8793221180706436, + "grad_norm": 0.14946082262355104, + "learning_rate": 3.6816763541438505e-05, + "loss": 2.7976, + "step": 30274 + }, + { + "epoch": 1.8793841951704016, + "grad_norm": 0.1565424744805846, + "learning_rate": 3.68132798476581e-05, + "loss": 2.7652, + "step": 30275 + }, + { + "epoch": 1.8794462722701595, + "grad_norm": 0.14895583302949253, + "learning_rate": 3.6809796222675994e-05, + "loss": 2.7377, + "step": 30276 + }, + { + "epoch": 1.8795083493699174, + "grad_norm": 0.15033672655010824, + "learning_rate": 3.6806312666510405e-05, + "loss": 2.7908, + "step": 30277 + }, + { + "epoch": 1.8795704264696753, + "grad_norm": 0.159535706128122, + "learning_rate": 3.680282917917947e-05, + "loss": 2.7678, + "step": 30278 + }, + { + "epoch": 1.8796325035694332, + "grad_norm": 0.1919077900195949, + "learning_rate": 3.679934576070139e-05, + "loss": 2.7827, + "step": 30279 + }, + { + "epoch": 1.8796945806691912, + "grad_norm": 0.18641551625450017, + "learning_rate": 3.6795862411094316e-05, + "loss": 2.8457, + "step": 30280 + }, + { + "epoch": 1.879756657768949, + "grad_norm": 0.15271618142025747, + "learning_rate": 3.6792379130376456e-05, + "loss": 2.8373, + "step": 30281 + }, + { + "epoch": 1.879818734868707, + "grad_norm": 0.14373091825012427, + "learning_rate": 3.6788895918565946e-05, + "loss": 2.7803, + "step": 30282 + }, + { + "epoch": 1.879880811968465, + "grad_norm": 0.1488826188873235, + "learning_rate": 3.678541277568096e-05, + "loss": 2.7805, + "step": 30283 + }, + { + "epoch": 1.8799428890682228, + "grad_norm": 0.18940720542411213, + "learning_rate": 3.6781929701739706e-05, + "loss": 2.8014, + "step": 30284 + }, + { + "epoch": 1.8800049661679807, + "grad_norm": 0.15284614375955513, + "learning_rate": 3.6778446696760316e-05, + "loss": 2.7402, + "step": 30285 + }, + { + "epoch": 1.8800670432677387, + "grad_norm": 0.14874096643642032, + "learning_rate": 3.677496376076099e-05, + "loss": 2.8103, + "step": 30286 + }, + { + "epoch": 1.8801291203674966, + "grad_norm": 0.14586697485585814, + "learning_rate": 3.677148089375988e-05, + "loss": 2.7558, + "step": 30287 + }, + { + "epoch": 1.8801911974672545, + "grad_norm": 0.14909624443534109, + "learning_rate": 3.676799809577517e-05, + "loss": 2.6944, + "step": 30288 + }, + { + "epoch": 1.8802532745670122, + "grad_norm": 0.14841406411326935, + "learning_rate": 3.6764515366825024e-05, + "loss": 2.8038, + "step": 30289 + }, + { + "epoch": 1.88031535166677, + "grad_norm": 0.14400810207647632, + "learning_rate": 3.676103270692762e-05, + "loss": 2.8108, + "step": 30290 + }, + { + "epoch": 1.880377428766528, + "grad_norm": 0.15388810055602914, + "learning_rate": 3.675755011610112e-05, + "loss": 2.7443, + "step": 30291 + }, + { + "epoch": 1.880439505866286, + "grad_norm": 0.164096147629049, + "learning_rate": 3.6754067594363694e-05, + "loss": 2.7544, + "step": 30292 + }, + { + "epoch": 1.8805015829660439, + "grad_norm": 0.1408633358187467, + "learning_rate": 3.675058514173352e-05, + "loss": 2.7567, + "step": 30293 + }, + { + "epoch": 1.8805636600658018, + "grad_norm": 0.1550115920642779, + "learning_rate": 3.674710275822875e-05, + "loss": 2.7924, + "step": 30294 + }, + { + "epoch": 1.8806257371655595, + "grad_norm": 0.1489430964675472, + "learning_rate": 3.674362044386757e-05, + "loss": 2.7499, + "step": 30295 + }, + { + "epoch": 1.8806878142653174, + "grad_norm": 0.1438460798623905, + "learning_rate": 3.674013819866813e-05, + "loss": 2.6481, + "step": 30296 + }, + { + "epoch": 1.8807498913650753, + "grad_norm": 0.1509736575282679, + "learning_rate": 3.673665602264862e-05, + "loss": 2.8234, + "step": 30297 + }, + { + "epoch": 1.8808119684648332, + "grad_norm": 0.14198972838947121, + "learning_rate": 3.6733173915827176e-05, + "loss": 2.7498, + "step": 30298 + }, + { + "epoch": 1.8808740455645911, + "grad_norm": 0.15238249726154074, + "learning_rate": 3.672969187822199e-05, + "loss": 2.7166, + "step": 30299 + }, + { + "epoch": 1.880936122664349, + "grad_norm": 0.14678977284246134, + "learning_rate": 3.6726209909851234e-05, + "loss": 2.8098, + "step": 30300 + }, + { + "epoch": 1.880998199764107, + "grad_norm": 0.15263311707682548, + "learning_rate": 3.6722728010733056e-05, + "loss": 2.7124, + "step": 30301 + }, + { + "epoch": 1.881060276863865, + "grad_norm": 0.1562381739909422, + "learning_rate": 3.671924618088564e-05, + "loss": 2.8534, + "step": 30302 + }, + { + "epoch": 1.8811223539636228, + "grad_norm": 0.15994226302433226, + "learning_rate": 3.6715764420327123e-05, + "loss": 2.7268, + "step": 30303 + }, + { + "epoch": 1.8811844310633807, + "grad_norm": 0.15236863519919386, + "learning_rate": 3.671228272907571e-05, + "loss": 2.8377, + "step": 30304 + }, + { + "epoch": 1.8812465081631387, + "grad_norm": 0.15799367839089234, + "learning_rate": 3.670880110714952e-05, + "loss": 2.7655, + "step": 30305 + }, + { + "epoch": 1.8813085852628966, + "grad_norm": 0.15065480227270334, + "learning_rate": 3.670531955456676e-05, + "loss": 2.7942, + "step": 30306 + }, + { + "epoch": 1.8813706623626545, + "grad_norm": 0.15001274914006824, + "learning_rate": 3.670183807134557e-05, + "loss": 2.711, + "step": 30307 + }, + { + "epoch": 1.8814327394624124, + "grad_norm": 0.14828791736007682, + "learning_rate": 3.6698356657504115e-05, + "loss": 2.8127, + "step": 30308 + }, + { + "epoch": 1.8814948165621703, + "grad_norm": 0.15606595763759593, + "learning_rate": 3.669487531306057e-05, + "loss": 2.7967, + "step": 30309 + }, + { + "epoch": 1.8815568936619282, + "grad_norm": 0.16704821065144482, + "learning_rate": 3.6691394038033076e-05, + "loss": 2.7405, + "step": 30310 + }, + { + "epoch": 1.8816189707616862, + "grad_norm": 0.14290924684037853, + "learning_rate": 3.668791283243983e-05, + "loss": 2.7898, + "step": 30311 + }, + { + "epoch": 1.881681047861444, + "grad_norm": 0.171711790014947, + "learning_rate": 3.668443169629896e-05, + "loss": 2.766, + "step": 30312 + }, + { + "epoch": 1.8817431249612018, + "grad_norm": 0.14686599612537854, + "learning_rate": 3.668095062962865e-05, + "loss": 2.7472, + "step": 30313 + }, + { + "epoch": 1.8818052020609597, + "grad_norm": 0.15157929602692555, + "learning_rate": 3.667746963244704e-05, + "loss": 2.8474, + "step": 30314 + }, + { + "epoch": 1.8818672791607176, + "grad_norm": 0.15484064593701946, + "learning_rate": 3.667398870477232e-05, + "loss": 2.6974, + "step": 30315 + }, + { + "epoch": 1.8819293562604755, + "grad_norm": 0.14124402915521547, + "learning_rate": 3.6670507846622624e-05, + "loss": 2.6501, + "step": 30316 + }, + { + "epoch": 1.8819914333602334, + "grad_norm": 0.1513864874072098, + "learning_rate": 3.666702705801614e-05, + "loss": 2.7926, + "step": 30317 + }, + { + "epoch": 1.8820535104599914, + "grad_norm": 0.15538767445037172, + "learning_rate": 3.666354633897101e-05, + "loss": 2.7937, + "step": 30318 + }, + { + "epoch": 1.882115587559749, + "grad_norm": 0.15211470821378711, + "learning_rate": 3.666006568950539e-05, + "loss": 2.8391, + "step": 30319 + }, + { + "epoch": 1.882177664659507, + "grad_norm": 0.1442128295241145, + "learning_rate": 3.665658510963744e-05, + "loss": 2.8054, + "step": 30320 + }, + { + "epoch": 1.882239741759265, + "grad_norm": 0.1367544195783413, + "learning_rate": 3.665310459938533e-05, + "loss": 2.7058, + "step": 30321 + }, + { + "epoch": 1.8823018188590228, + "grad_norm": 0.15331785236498788, + "learning_rate": 3.664962415876722e-05, + "loss": 2.7343, + "step": 30322 + }, + { + "epoch": 1.8823638959587807, + "grad_norm": 0.13725069647666138, + "learning_rate": 3.664614378780125e-05, + "loss": 2.8161, + "step": 30323 + }, + { + "epoch": 1.8824259730585386, + "grad_norm": 0.14995410038117182, + "learning_rate": 3.6642663486505604e-05, + "loss": 2.7804, + "step": 30324 + }, + { + "epoch": 1.8824880501582966, + "grad_norm": 0.14968391434072126, + "learning_rate": 3.6639183254898404e-05, + "loss": 2.8255, + "step": 30325 + }, + { + "epoch": 1.8825501272580545, + "grad_norm": 0.15069359445173477, + "learning_rate": 3.6635703092997855e-05, + "loss": 2.6783, + "step": 30326 + }, + { + "epoch": 1.8826122043578124, + "grad_norm": 0.1502744795919548, + "learning_rate": 3.6632223000822063e-05, + "loss": 2.7434, + "step": 30327 + }, + { + "epoch": 1.8826742814575703, + "grad_norm": 0.1516685193685758, + "learning_rate": 3.6628742978389216e-05, + "loss": 2.8619, + "step": 30328 + }, + { + "epoch": 1.8827363585573282, + "grad_norm": 0.15566893059023992, + "learning_rate": 3.6625263025717474e-05, + "loss": 2.8466, + "step": 30329 + }, + { + "epoch": 1.8827984356570862, + "grad_norm": 0.16554156104502207, + "learning_rate": 3.662178314282497e-05, + "loss": 2.7451, + "step": 30330 + }, + { + "epoch": 1.882860512756844, + "grad_norm": 0.15901886240244484, + "learning_rate": 3.661830332972986e-05, + "loss": 2.7175, + "step": 30331 + }, + { + "epoch": 1.882922589856602, + "grad_norm": 0.20317198136674583, + "learning_rate": 3.661482358645033e-05, + "loss": 2.7689, + "step": 30332 + }, + { + "epoch": 1.88298466695636, + "grad_norm": 0.16451382434253367, + "learning_rate": 3.6611343913004505e-05, + "loss": 2.812, + "step": 30333 + }, + { + "epoch": 1.8830467440561178, + "grad_norm": 0.14378479272191794, + "learning_rate": 3.660786430941056e-05, + "loss": 2.6972, + "step": 30334 + }, + { + "epoch": 1.8831088211558757, + "grad_norm": 0.15713578915027662, + "learning_rate": 3.660438477568662e-05, + "loss": 2.8348, + "step": 30335 + }, + { + "epoch": 1.8831708982556337, + "grad_norm": 0.14397673296298624, + "learning_rate": 3.660090531185088e-05, + "loss": 2.6944, + "step": 30336 + }, + { + "epoch": 1.8832329753553914, + "grad_norm": 0.14553989466351916, + "learning_rate": 3.6597425917921444e-05, + "loss": 2.6994, + "step": 30337 + }, + { + "epoch": 1.8832950524551493, + "grad_norm": 0.17081559489391884, + "learning_rate": 3.659394659391651e-05, + "loss": 2.7786, + "step": 30338 + }, + { + "epoch": 1.8833571295549072, + "grad_norm": 0.13805989096677299, + "learning_rate": 3.65904673398542e-05, + "loss": 2.7716, + "step": 30339 + }, + { + "epoch": 1.8834192066546651, + "grad_norm": 0.14380431367161658, + "learning_rate": 3.658698815575269e-05, + "loss": 2.7521, + "step": 30340 + }, + { + "epoch": 1.883481283754423, + "grad_norm": 0.15963394199102995, + "learning_rate": 3.65835090416301e-05, + "loss": 2.8687, + "step": 30341 + }, + { + "epoch": 1.883543360854181, + "grad_norm": 0.139261496603494, + "learning_rate": 3.6580029997504615e-05, + "loss": 2.7049, + "step": 30342 + }, + { + "epoch": 1.8836054379539386, + "grad_norm": 0.1352947393813369, + "learning_rate": 3.6576551023394377e-05, + "loss": 2.7489, + "step": 30343 + }, + { + "epoch": 1.8836675150536966, + "grad_norm": 0.14835427485895392, + "learning_rate": 3.657307211931752e-05, + "loss": 2.8058, + "step": 30344 + }, + { + "epoch": 1.8837295921534545, + "grad_norm": 0.13825511092642823, + "learning_rate": 3.656959328529221e-05, + "loss": 2.699, + "step": 30345 + }, + { + "epoch": 1.8837916692532124, + "grad_norm": 0.14829034161406662, + "learning_rate": 3.656611452133658e-05, + "loss": 2.7579, + "step": 30346 + }, + { + "epoch": 1.8838537463529703, + "grad_norm": 0.14545403197105422, + "learning_rate": 3.656263582746881e-05, + "loss": 2.7322, + "step": 30347 + }, + { + "epoch": 1.8839158234527282, + "grad_norm": 0.14553833967685967, + "learning_rate": 3.6559157203707015e-05, + "loss": 2.7177, + "step": 30348 + }, + { + "epoch": 1.8839779005524862, + "grad_norm": 0.152517641299801, + "learning_rate": 3.655567865006937e-05, + "loss": 2.8144, + "step": 30349 + }, + { + "epoch": 1.884039977652244, + "grad_norm": 0.13630614007790337, + "learning_rate": 3.6552200166574e-05, + "loss": 2.6956, + "step": 30350 + }, + { + "epoch": 1.884102054752002, + "grad_norm": 0.15712615177260447, + "learning_rate": 3.654872175323909e-05, + "loss": 2.8172, + "step": 30351 + }, + { + "epoch": 1.88416413185176, + "grad_norm": 0.14869243288255718, + "learning_rate": 3.654524341008274e-05, + "loss": 2.7697, + "step": 30352 + }, + { + "epoch": 1.8842262089515178, + "grad_norm": 0.1488334557186836, + "learning_rate": 3.654176513712314e-05, + "loss": 2.8811, + "step": 30353 + }, + { + "epoch": 1.8842882860512757, + "grad_norm": 0.1548612209148854, + "learning_rate": 3.653828693437841e-05, + "loss": 2.8094, + "step": 30354 + }, + { + "epoch": 1.8843503631510337, + "grad_norm": 0.1510680103074306, + "learning_rate": 3.653480880186671e-05, + "loss": 2.7629, + "step": 30355 + }, + { + "epoch": 1.8844124402507916, + "grad_norm": 0.1625800595771041, + "learning_rate": 3.653133073960617e-05, + "loss": 2.8191, + "step": 30356 + }, + { + "epoch": 1.8844745173505495, + "grad_norm": 0.15396647146492812, + "learning_rate": 3.652785274761496e-05, + "loss": 2.7952, + "step": 30357 + }, + { + "epoch": 1.8845365944503074, + "grad_norm": 0.15651164797543443, + "learning_rate": 3.65243748259112e-05, + "loss": 2.7852, + "step": 30358 + }, + { + "epoch": 1.8845986715500653, + "grad_norm": 0.16669454316619925, + "learning_rate": 3.652089697451305e-05, + "loss": 2.8144, + "step": 30359 + }, + { + "epoch": 1.884660748649823, + "grad_norm": 0.14797734789455874, + "learning_rate": 3.6517419193438665e-05, + "loss": 2.7317, + "step": 30360 + }, + { + "epoch": 1.884722825749581, + "grad_norm": 0.14899673022612384, + "learning_rate": 3.651394148270616e-05, + "loss": 2.776, + "step": 30361 + }, + { + "epoch": 1.8847849028493389, + "grad_norm": 0.15773816128249127, + "learning_rate": 3.651046384233371e-05, + "loss": 2.837, + "step": 30362 + }, + { + "epoch": 1.8848469799490968, + "grad_norm": 0.1473692615324458, + "learning_rate": 3.650698627233942e-05, + "loss": 2.7797, + "step": 30363 + }, + { + "epoch": 1.8849090570488547, + "grad_norm": 0.14648810064028434, + "learning_rate": 3.650350877274147e-05, + "loss": 2.7458, + "step": 30364 + }, + { + "epoch": 1.8849711341486126, + "grad_norm": 0.15742348251279278, + "learning_rate": 3.6500031343558004e-05, + "loss": 2.6597, + "step": 30365 + }, + { + "epoch": 1.8850332112483703, + "grad_norm": 0.1600838939693999, + "learning_rate": 3.649655398480715e-05, + "loss": 2.8257, + "step": 30366 + }, + { + "epoch": 1.8850952883481282, + "grad_norm": 0.14807355333375422, + "learning_rate": 3.6493076696507046e-05, + "loss": 2.7316, + "step": 30367 + }, + { + "epoch": 1.8851573654478861, + "grad_norm": 0.16773193876418802, + "learning_rate": 3.648959947867585e-05, + "loss": 2.7685, + "step": 30368 + }, + { + "epoch": 1.885219442547644, + "grad_norm": 0.15954851937859238, + "learning_rate": 3.648612233133168e-05, + "loss": 2.7223, + "step": 30369 + }, + { + "epoch": 1.885281519647402, + "grad_norm": 0.14379170372611605, + "learning_rate": 3.64826452544927e-05, + "loss": 2.807, + "step": 30370 + }, + { + "epoch": 1.88534359674716, + "grad_norm": 0.16456117795253075, + "learning_rate": 3.647916824817703e-05, + "loss": 2.7441, + "step": 30371 + }, + { + "epoch": 1.8854056738469178, + "grad_norm": 0.15073111619745289, + "learning_rate": 3.6475691312402836e-05, + "loss": 2.8079, + "step": 30372 + }, + { + "epoch": 1.8854677509466757, + "grad_norm": 0.15365017801273548, + "learning_rate": 3.647221444718823e-05, + "loss": 2.7715, + "step": 30373 + }, + { + "epoch": 1.8855298280464337, + "grad_norm": 0.16043532875012492, + "learning_rate": 3.646873765255138e-05, + "loss": 2.7808, + "step": 30374 + }, + { + "epoch": 1.8855919051461916, + "grad_norm": 0.15856451977235436, + "learning_rate": 3.646526092851039e-05, + "loss": 2.7917, + "step": 30375 + }, + { + "epoch": 1.8856539822459495, + "grad_norm": 0.1479381015742794, + "learning_rate": 3.646178427508344e-05, + "loss": 2.7272, + "step": 30376 + }, + { + "epoch": 1.8857160593457074, + "grad_norm": 0.1655843068461253, + "learning_rate": 3.6458307692288636e-05, + "loss": 2.7108, + "step": 30377 + }, + { + "epoch": 1.8857781364454653, + "grad_norm": 0.15208932220302937, + "learning_rate": 3.6454831180144135e-05, + "loss": 2.7942, + "step": 30378 + }, + { + "epoch": 1.8858402135452232, + "grad_norm": 0.14302078535209273, + "learning_rate": 3.645135473866807e-05, + "loss": 2.7526, + "step": 30379 + }, + { + "epoch": 1.8859022906449812, + "grad_norm": 0.16271153606111097, + "learning_rate": 3.6447878367878566e-05, + "loss": 2.7265, + "step": 30380 + }, + { + "epoch": 1.885964367744739, + "grad_norm": 0.15106844801629346, + "learning_rate": 3.6444402067793785e-05, + "loss": 2.8452, + "step": 30381 + }, + { + "epoch": 1.886026444844497, + "grad_norm": 0.16592762802773497, + "learning_rate": 3.644092583843183e-05, + "loss": 2.783, + "step": 30382 + }, + { + "epoch": 1.886088521944255, + "grad_norm": 0.16293679344651787, + "learning_rate": 3.643744967981087e-05, + "loss": 2.7431, + "step": 30383 + }, + { + "epoch": 1.8861505990440126, + "grad_norm": 0.15686544565916719, + "learning_rate": 3.643397359194902e-05, + "loss": 2.7082, + "step": 30384 + }, + { + "epoch": 1.8862126761437705, + "grad_norm": 0.14430067640831382, + "learning_rate": 3.643049757486443e-05, + "loss": 2.8159, + "step": 30385 + }, + { + "epoch": 1.8862747532435284, + "grad_norm": 0.1688356374298673, + "learning_rate": 3.642702162857522e-05, + "loss": 2.7861, + "step": 30386 + }, + { + "epoch": 1.8863368303432864, + "grad_norm": 0.14710279893687242, + "learning_rate": 3.642354575309954e-05, + "loss": 2.8113, + "step": 30387 + }, + { + "epoch": 1.8863989074430443, + "grad_norm": 0.17004256511492316, + "learning_rate": 3.64200699484555e-05, + "loss": 2.8112, + "step": 30388 + }, + { + "epoch": 1.8864609845428022, + "grad_norm": 0.14633270040630825, + "learning_rate": 3.641659421466128e-05, + "loss": 2.7698, + "step": 30389 + }, + { + "epoch": 1.88652306164256, + "grad_norm": 0.1411245400976251, + "learning_rate": 3.641311855173498e-05, + "loss": 2.6999, + "step": 30390 + }, + { + "epoch": 1.8865851387423178, + "grad_norm": 0.22872936816272801, + "learning_rate": 3.6409642959694725e-05, + "loss": 2.7755, + "step": 30391 + }, + { + "epoch": 1.8866472158420757, + "grad_norm": 0.1565025136912575, + "learning_rate": 3.640616743855867e-05, + "loss": 2.8299, + "step": 30392 + }, + { + "epoch": 1.8867092929418336, + "grad_norm": 0.15542527338486806, + "learning_rate": 3.6402691988344936e-05, + "loss": 2.7921, + "step": 30393 + }, + { + "epoch": 1.8867713700415916, + "grad_norm": 0.16288112868842472, + "learning_rate": 3.6399216609071664e-05, + "loss": 2.8311, + "step": 30394 + }, + { + "epoch": 1.8868334471413495, + "grad_norm": 0.1552163984989586, + "learning_rate": 3.639574130075697e-05, + "loss": 2.8364, + "step": 30395 + }, + { + "epoch": 1.8868955242411074, + "grad_norm": 0.15338630835225922, + "learning_rate": 3.639226606341899e-05, + "loss": 2.7161, + "step": 30396 + }, + { + "epoch": 1.8869576013408653, + "grad_norm": 0.1967203706185242, + "learning_rate": 3.6388790897075876e-05, + "loss": 2.7129, + "step": 30397 + }, + { + "epoch": 1.8870196784406232, + "grad_norm": 0.1489608410699758, + "learning_rate": 3.638531580174574e-05, + "loss": 2.7029, + "step": 30398 + }, + { + "epoch": 1.8870817555403812, + "grad_norm": 0.16299673599064343, + "learning_rate": 3.6381840777446725e-05, + "loss": 2.7382, + "step": 30399 + }, + { + "epoch": 1.887143832640139, + "grad_norm": 0.15514313130463014, + "learning_rate": 3.6378365824196934e-05, + "loss": 2.7866, + "step": 30400 + }, + { + "epoch": 1.887205909739897, + "grad_norm": 0.17295813104034438, + "learning_rate": 3.637489094201454e-05, + "loss": 2.8697, + "step": 30401 + }, + { + "epoch": 1.887267986839655, + "grad_norm": 0.1713833224402927, + "learning_rate": 3.637141613091763e-05, + "loss": 2.7862, + "step": 30402 + }, + { + "epoch": 1.8873300639394128, + "grad_norm": 0.1529190680956696, + "learning_rate": 3.636794139092436e-05, + "loss": 2.8245, + "step": 30403 + }, + { + "epoch": 1.8873921410391707, + "grad_norm": 0.16343132154153955, + "learning_rate": 3.636446672205285e-05, + "loss": 2.8032, + "step": 30404 + }, + { + "epoch": 1.8874542181389287, + "grad_norm": 0.1663454649949875, + "learning_rate": 3.636099212432121e-05, + "loss": 2.7813, + "step": 30405 + }, + { + "epoch": 1.8875162952386866, + "grad_norm": 0.1560693053654965, + "learning_rate": 3.635751759774761e-05, + "loss": 2.7738, + "step": 30406 + }, + { + "epoch": 1.8875783723384445, + "grad_norm": 0.19442913561873063, + "learning_rate": 3.635404314235014e-05, + "loss": 2.7834, + "step": 30407 + }, + { + "epoch": 1.8876404494382022, + "grad_norm": 0.15694258497364105, + "learning_rate": 3.635056875814694e-05, + "loss": 2.7108, + "step": 30408 + }, + { + "epoch": 1.8877025265379601, + "grad_norm": 0.15852467507801468, + "learning_rate": 3.6347094445156126e-05, + "loss": 2.9112, + "step": 30409 + }, + { + "epoch": 1.887764603637718, + "grad_norm": 0.15070657239681784, + "learning_rate": 3.634362020339586e-05, + "loss": 2.8114, + "step": 30410 + }, + { + "epoch": 1.887826680737476, + "grad_norm": 0.19298376341574527, + "learning_rate": 3.634014603288421e-05, + "loss": 2.807, + "step": 30411 + }, + { + "epoch": 1.8878887578372339, + "grad_norm": 0.16099174566759478, + "learning_rate": 3.633667193363936e-05, + "loss": 2.8147, + "step": 30412 + }, + { + "epoch": 1.8879508349369918, + "grad_norm": 0.15772702550935835, + "learning_rate": 3.633319790567939e-05, + "loss": 2.7697, + "step": 30413 + }, + { + "epoch": 1.8880129120367495, + "grad_norm": 0.1567143989502858, + "learning_rate": 3.6329723949022465e-05, + "loss": 2.6606, + "step": 30414 + }, + { + "epoch": 1.8880749891365074, + "grad_norm": 0.17242018433067358, + "learning_rate": 3.632625006368669e-05, + "loss": 2.6496, + "step": 30415 + }, + { + "epoch": 1.8881370662362653, + "grad_norm": 0.2083755609313701, + "learning_rate": 3.6322776249690166e-05, + "loss": 2.8203, + "step": 30416 + }, + { + "epoch": 1.8881991433360232, + "grad_norm": 0.16360456270629573, + "learning_rate": 3.6319302507051054e-05, + "loss": 2.7313, + "step": 30417 + }, + { + "epoch": 1.8882612204357812, + "grad_norm": 0.1544138247881101, + "learning_rate": 3.631582883578745e-05, + "loss": 2.7333, + "step": 30418 + }, + { + "epoch": 1.888323297535539, + "grad_norm": 0.16036034582060332, + "learning_rate": 3.6312355235917505e-05, + "loss": 2.7194, + "step": 30419 + }, + { + "epoch": 1.888385374635297, + "grad_norm": 0.18234254717327972, + "learning_rate": 3.630888170745931e-05, + "loss": 2.7054, + "step": 30420 + }, + { + "epoch": 1.888447451735055, + "grad_norm": 0.15107340451830542, + "learning_rate": 3.630540825043102e-05, + "loss": 2.7578, + "step": 30421 + }, + { + "epoch": 1.8885095288348128, + "grad_norm": 0.14998001546650905, + "learning_rate": 3.630193486485072e-05, + "loss": 2.8053, + "step": 30422 + }, + { + "epoch": 1.8885716059345707, + "grad_norm": 0.16188832804406467, + "learning_rate": 3.6298461550736565e-05, + "loss": 2.6826, + "step": 30423 + }, + { + "epoch": 1.8886336830343287, + "grad_norm": 0.1513465375042962, + "learning_rate": 3.629498830810665e-05, + "loss": 2.7659, + "step": 30424 + }, + { + "epoch": 1.8886957601340866, + "grad_norm": 0.16482296740340222, + "learning_rate": 3.6291515136979116e-05, + "loss": 2.8047, + "step": 30425 + }, + { + "epoch": 1.8887578372338445, + "grad_norm": 0.14404605926818967, + "learning_rate": 3.628804203737207e-05, + "loss": 2.7984, + "step": 30426 + }, + { + "epoch": 1.8888199143336024, + "grad_norm": 0.1444075351340906, + "learning_rate": 3.628456900930365e-05, + "loss": 2.8129, + "step": 30427 + }, + { + "epoch": 1.8888819914333603, + "grad_norm": 0.15905013399826037, + "learning_rate": 3.628109605279196e-05, + "loss": 2.7306, + "step": 30428 + }, + { + "epoch": 1.8889440685331182, + "grad_norm": 0.14489319304147882, + "learning_rate": 3.62776231678551e-05, + "loss": 2.7724, + "step": 30429 + }, + { + "epoch": 1.8890061456328762, + "grad_norm": 0.18053994941716422, + "learning_rate": 3.627415035451123e-05, + "loss": 2.7526, + "step": 30430 + }, + { + "epoch": 1.889068222732634, + "grad_norm": 0.15413316057038187, + "learning_rate": 3.6270677612778446e-05, + "loss": 2.8125, + "step": 30431 + }, + { + "epoch": 1.8891302998323918, + "grad_norm": 0.17529498765893217, + "learning_rate": 3.626720494267487e-05, + "loss": 2.8186, + "step": 30432 + }, + { + "epoch": 1.8891923769321497, + "grad_norm": 0.18835207924623018, + "learning_rate": 3.626373234421863e-05, + "loss": 2.7818, + "step": 30433 + }, + { + "epoch": 1.8892544540319076, + "grad_norm": 0.18330942620289517, + "learning_rate": 3.626025981742781e-05, + "loss": 2.7444, + "step": 30434 + }, + { + "epoch": 1.8893165311316655, + "grad_norm": 0.15233064252757075, + "learning_rate": 3.625678736232058e-05, + "loss": 2.6712, + "step": 30435 + }, + { + "epoch": 1.8893786082314235, + "grad_norm": 0.15688615282468257, + "learning_rate": 3.6253314978915e-05, + "loss": 2.7293, + "step": 30436 + }, + { + "epoch": 1.8894406853311814, + "grad_norm": 0.1461727069217367, + "learning_rate": 3.6249842667229226e-05, + "loss": 2.7228, + "step": 30437 + }, + { + "epoch": 1.889502762430939, + "grad_norm": 0.15415074877354484, + "learning_rate": 3.624637042728135e-05, + "loss": 2.7087, + "step": 30438 + }, + { + "epoch": 1.889564839530697, + "grad_norm": 0.14408228062973583, + "learning_rate": 3.624289825908952e-05, + "loss": 2.7536, + "step": 30439 + }, + { + "epoch": 1.889626916630455, + "grad_norm": 0.158231161717948, + "learning_rate": 3.623942616267182e-05, + "loss": 2.7504, + "step": 30440 + }, + { + "epoch": 1.8896889937302128, + "grad_norm": 0.15832977170661042, + "learning_rate": 3.6235954138046364e-05, + "loss": 2.7577, + "step": 30441 + }, + { + "epoch": 1.8897510708299707, + "grad_norm": 0.1553559054294486, + "learning_rate": 3.6232482185231286e-05, + "loss": 2.7857, + "step": 30442 + }, + { + "epoch": 1.8898131479297287, + "grad_norm": 0.16073739634496495, + "learning_rate": 3.622901030424468e-05, + "loss": 2.6852, + "step": 30443 + }, + { + "epoch": 1.8898752250294866, + "grad_norm": 0.14484357595176076, + "learning_rate": 3.622553849510469e-05, + "loss": 2.7538, + "step": 30444 + }, + { + "epoch": 1.8899373021292445, + "grad_norm": 0.1653617886184465, + "learning_rate": 3.622206675782939e-05, + "loss": 2.6513, + "step": 30445 + }, + { + "epoch": 1.8899993792290024, + "grad_norm": 0.147471989602738, + "learning_rate": 3.621859509243692e-05, + "loss": 2.7938, + "step": 30446 + }, + { + "epoch": 1.8900614563287603, + "grad_norm": 0.14181002695130504, + "learning_rate": 3.621512349894539e-05, + "loss": 2.722, + "step": 30447 + }, + { + "epoch": 1.8901235334285182, + "grad_norm": 0.19310698750823596, + "learning_rate": 3.6211651977372904e-05, + "loss": 2.7966, + "step": 30448 + }, + { + "epoch": 1.8901856105282762, + "grad_norm": 0.2358222781129905, + "learning_rate": 3.6208180527737564e-05, + "loss": 2.7866, + "step": 30449 + }, + { + "epoch": 1.890247687628034, + "grad_norm": 0.17951098355078493, + "learning_rate": 3.620470915005751e-05, + "loss": 2.7608, + "step": 30450 + }, + { + "epoch": 1.890309764727792, + "grad_norm": 0.14589957169139162, + "learning_rate": 3.620123784435083e-05, + "loss": 2.9435, + "step": 30451 + }, + { + "epoch": 1.89037184182755, + "grad_norm": 0.16074906516396575, + "learning_rate": 3.6197766610635655e-05, + "loss": 2.8408, + "step": 30452 + }, + { + "epoch": 1.8904339189273078, + "grad_norm": 0.15681085065245678, + "learning_rate": 3.619429544893007e-05, + "loss": 2.833, + "step": 30453 + }, + { + "epoch": 1.8904959960270658, + "grad_norm": 0.14099807392277622, + "learning_rate": 3.61908243592522e-05, + "loss": 2.7341, + "step": 30454 + }, + { + "epoch": 1.8905580731268237, + "grad_norm": 0.16065149379123883, + "learning_rate": 3.618735334162015e-05, + "loss": 2.6758, + "step": 30455 + }, + { + "epoch": 1.8906201502265814, + "grad_norm": 0.1452452896412301, + "learning_rate": 3.618388239605203e-05, + "loss": 2.7445, + "step": 30456 + }, + { + "epoch": 1.8906822273263393, + "grad_norm": 0.15056156995366507, + "learning_rate": 3.618041152256595e-05, + "loss": 2.819, + "step": 30457 + }, + { + "epoch": 1.8907443044260972, + "grad_norm": 0.15053746250938943, + "learning_rate": 3.617694072118002e-05, + "loss": 2.7828, + "step": 30458 + }, + { + "epoch": 1.8908063815258551, + "grad_norm": 0.15380232056434628, + "learning_rate": 3.6173469991912346e-05, + "loss": 2.7291, + "step": 30459 + }, + { + "epoch": 1.890868458625613, + "grad_norm": 0.15584677284365728, + "learning_rate": 3.616999933478103e-05, + "loss": 2.7744, + "step": 30460 + }, + { + "epoch": 1.890930535725371, + "grad_norm": 0.1512269016038786, + "learning_rate": 3.6166528749804193e-05, + "loss": 2.79, + "step": 30461 + }, + { + "epoch": 1.8909926128251286, + "grad_norm": 0.14382901870557627, + "learning_rate": 3.616305823699991e-05, + "loss": 2.8945, + "step": 30462 + }, + { + "epoch": 1.8910546899248866, + "grad_norm": 0.14002469200556045, + "learning_rate": 3.615958779638633e-05, + "loss": 2.8193, + "step": 30463 + }, + { + "epoch": 1.8911167670246445, + "grad_norm": 0.1423442437742276, + "learning_rate": 3.615611742798155e-05, + "loss": 2.7539, + "step": 30464 + }, + { + "epoch": 1.8911788441244024, + "grad_norm": 0.20434264040603556, + "learning_rate": 3.615264713180366e-05, + "loss": 2.7386, + "step": 30465 + }, + { + "epoch": 1.8912409212241603, + "grad_norm": 0.24741356267477774, + "learning_rate": 3.614917690787076e-05, + "loss": 2.7573, + "step": 30466 + }, + { + "epoch": 1.8913029983239182, + "grad_norm": 0.15806954242513252, + "learning_rate": 3.614570675620098e-05, + "loss": 2.787, + "step": 30467 + }, + { + "epoch": 1.8913650754236762, + "grad_norm": 0.15498492589645554, + "learning_rate": 3.61422366768124e-05, + "loss": 2.8393, + "step": 30468 + }, + { + "epoch": 1.891427152523434, + "grad_norm": 0.17509726860994917, + "learning_rate": 3.613876666972315e-05, + "loss": 2.6739, + "step": 30469 + }, + { + "epoch": 1.891489229623192, + "grad_norm": 0.1746667552173444, + "learning_rate": 3.6135296734951307e-05, + "loss": 2.7723, + "step": 30470 + }, + { + "epoch": 1.89155130672295, + "grad_norm": 0.16260438051724918, + "learning_rate": 3.6131826872514995e-05, + "loss": 2.7331, + "step": 30471 + }, + { + "epoch": 1.8916133838227078, + "grad_norm": 0.16439760702654252, + "learning_rate": 3.61283570824323e-05, + "loss": 2.8722, + "step": 30472 + }, + { + "epoch": 1.8916754609224657, + "grad_norm": 0.14746135433719154, + "learning_rate": 3.6124887364721336e-05, + "loss": 2.7432, + "step": 30473 + }, + { + "epoch": 1.8917375380222237, + "grad_norm": 0.15444707199685231, + "learning_rate": 3.61214177194002e-05, + "loss": 2.81, + "step": 30474 + }, + { + "epoch": 1.8917996151219816, + "grad_norm": 0.1518007222687868, + "learning_rate": 3.6117948146487016e-05, + "loss": 2.8549, + "step": 30475 + }, + { + "epoch": 1.8918616922217395, + "grad_norm": 0.14936181320457448, + "learning_rate": 3.6114478645999836e-05, + "loss": 2.818, + "step": 30476 + }, + { + "epoch": 1.8919237693214974, + "grad_norm": 0.14591169309849678, + "learning_rate": 3.6111009217956815e-05, + "loss": 2.7033, + "step": 30477 + }, + { + "epoch": 1.8919858464212553, + "grad_norm": 0.15719768898781258, + "learning_rate": 3.6107539862376034e-05, + "loss": 2.6863, + "step": 30478 + }, + { + "epoch": 1.8920479235210133, + "grad_norm": 0.18343241684546505, + "learning_rate": 3.610407057927557e-05, + "loss": 2.8068, + "step": 30479 + }, + { + "epoch": 1.892110000620771, + "grad_norm": 0.15242812681061452, + "learning_rate": 3.610060136867356e-05, + "loss": 2.7509, + "step": 30480 + }, + { + "epoch": 1.8921720777205289, + "grad_norm": 0.16271599481285628, + "learning_rate": 3.609713223058807e-05, + "loss": 2.7884, + "step": 30481 + }, + { + "epoch": 1.8922341548202868, + "grad_norm": 0.19560734428462653, + "learning_rate": 3.6093663165037236e-05, + "loss": 2.757, + "step": 30482 + }, + { + "epoch": 1.8922962319200447, + "grad_norm": 0.13809056739653128, + "learning_rate": 3.609019417203912e-05, + "loss": 2.7592, + "step": 30483 + }, + { + "epoch": 1.8923583090198026, + "grad_norm": 0.1431373706618829, + "learning_rate": 3.608672525161185e-05, + "loss": 2.7464, + "step": 30484 + }, + { + "epoch": 1.8924203861195605, + "grad_norm": 0.1764243442016982, + "learning_rate": 3.6083256403773505e-05, + "loss": 2.6995, + "step": 30485 + }, + { + "epoch": 1.8924824632193182, + "grad_norm": 0.14288378963525342, + "learning_rate": 3.607978762854219e-05, + "loss": 2.6987, + "step": 30486 + }, + { + "epoch": 1.8925445403190762, + "grad_norm": 0.150018932468971, + "learning_rate": 3.6076318925936006e-05, + "loss": 2.7885, + "step": 30487 + }, + { + "epoch": 1.892606617418834, + "grad_norm": 0.14011511154274706, + "learning_rate": 3.6072850295973046e-05, + "loss": 2.6558, + "step": 30488 + }, + { + "epoch": 1.892668694518592, + "grad_norm": 0.136468870148061, + "learning_rate": 3.606938173867141e-05, + "loss": 2.671, + "step": 30489 + }, + { + "epoch": 1.89273077161835, + "grad_norm": 0.14414335082870172, + "learning_rate": 3.6065913254049174e-05, + "loss": 2.7774, + "step": 30490 + }, + { + "epoch": 1.8927928487181078, + "grad_norm": 0.15308032113859807, + "learning_rate": 3.6062444842124475e-05, + "loss": 2.7701, + "step": 30491 + }, + { + "epoch": 1.8928549258178657, + "grad_norm": 0.13947771213600216, + "learning_rate": 3.6058976502915366e-05, + "loss": 2.8456, + "step": 30492 + }, + { + "epoch": 1.8929170029176237, + "grad_norm": 0.14981489084601432, + "learning_rate": 3.605550823643997e-05, + "loss": 2.7712, + "step": 30493 + }, + { + "epoch": 1.8929790800173816, + "grad_norm": 0.1456537219198288, + "learning_rate": 3.6052040042716355e-05, + "loss": 2.8161, + "step": 30494 + }, + { + "epoch": 1.8930411571171395, + "grad_norm": 0.1690292811237637, + "learning_rate": 3.604857192176264e-05, + "loss": 2.7407, + "step": 30495 + }, + { + "epoch": 1.8931032342168974, + "grad_norm": 0.14799427064151352, + "learning_rate": 3.604510387359693e-05, + "loss": 2.783, + "step": 30496 + }, + { + "epoch": 1.8931653113166553, + "grad_norm": 0.14090364717815154, + "learning_rate": 3.604163589823729e-05, + "loss": 2.7657, + "step": 30497 + }, + { + "epoch": 1.8932273884164132, + "grad_norm": 0.14952015071241972, + "learning_rate": 3.6038167995701825e-05, + "loss": 2.6986, + "step": 30498 + }, + { + "epoch": 1.8932894655161712, + "grad_norm": 0.1494192234718728, + "learning_rate": 3.603470016600863e-05, + "loss": 2.7293, + "step": 30499 + }, + { + "epoch": 1.893351542615929, + "grad_norm": 0.15724051751230658, + "learning_rate": 3.603123240917579e-05, + "loss": 2.7805, + "step": 30500 + }, + { + "epoch": 1.893413619715687, + "grad_norm": 0.1438710301839198, + "learning_rate": 3.60277647252214e-05, + "loss": 2.7459, + "step": 30501 + }, + { + "epoch": 1.893475696815445, + "grad_norm": 0.1576777021767612, + "learning_rate": 3.602429711416357e-05, + "loss": 2.7696, + "step": 30502 + }, + { + "epoch": 1.8935377739152028, + "grad_norm": 0.14840344615616285, + "learning_rate": 3.6020829576020356e-05, + "loss": 2.7855, + "step": 30503 + }, + { + "epoch": 1.8935998510149605, + "grad_norm": 0.15378598243781944, + "learning_rate": 3.6017362110809875e-05, + "loss": 2.7233, + "step": 30504 + }, + { + "epoch": 1.8936619281147185, + "grad_norm": 0.19331825666862607, + "learning_rate": 3.601389471855021e-05, + "loss": 2.7935, + "step": 30505 + }, + { + "epoch": 1.8937240052144764, + "grad_norm": 0.14113073670027135, + "learning_rate": 3.601042739925944e-05, + "loss": 2.6644, + "step": 30506 + }, + { + "epoch": 1.8937860823142343, + "grad_norm": 0.15161049884071773, + "learning_rate": 3.6006960152955685e-05, + "loss": 2.77, + "step": 30507 + }, + { + "epoch": 1.8938481594139922, + "grad_norm": 0.17015281965407728, + "learning_rate": 3.6003492979657e-05, + "loss": 2.7264, + "step": 30508 + }, + { + "epoch": 1.8939102365137501, + "grad_norm": 0.16776656579720847, + "learning_rate": 3.6000025879381504e-05, + "loss": 2.734, + "step": 30509 + }, + { + "epoch": 1.8939723136135078, + "grad_norm": 0.15567299878209584, + "learning_rate": 3.5996558852147256e-05, + "loss": 2.7288, + "step": 30510 + }, + { + "epoch": 1.8940343907132657, + "grad_norm": 0.19817524987240262, + "learning_rate": 3.599309189797238e-05, + "loss": 2.7177, + "step": 30511 + }, + { + "epoch": 1.8940964678130237, + "grad_norm": 0.19580090491845084, + "learning_rate": 3.5989625016874916e-05, + "loss": 2.7749, + "step": 30512 + }, + { + "epoch": 1.8941585449127816, + "grad_norm": 0.15092903835161572, + "learning_rate": 3.598615820887301e-05, + "loss": 2.7587, + "step": 30513 + }, + { + "epoch": 1.8942206220125395, + "grad_norm": 0.15092384878862913, + "learning_rate": 3.598269147398471e-05, + "loss": 2.7875, + "step": 30514 + }, + { + "epoch": 1.8942826991122974, + "grad_norm": 0.16129972309479296, + "learning_rate": 3.5979224812228094e-05, + "loss": 2.9187, + "step": 30515 + }, + { + "epoch": 1.8943447762120553, + "grad_norm": 0.15823871895014516, + "learning_rate": 3.597575822362128e-05, + "loss": 2.8578, + "step": 30516 + }, + { + "epoch": 1.8944068533118132, + "grad_norm": 0.1735671956759926, + "learning_rate": 3.5972291708182336e-05, + "loss": 2.7838, + "step": 30517 + }, + { + "epoch": 1.8944689304115712, + "grad_norm": 0.18093762692593618, + "learning_rate": 3.596882526592936e-05, + "loss": 2.7905, + "step": 30518 + }, + { + "epoch": 1.894531007511329, + "grad_norm": 0.14064011889882522, + "learning_rate": 3.5965358896880414e-05, + "loss": 2.6462, + "step": 30519 + }, + { + "epoch": 1.894593084611087, + "grad_norm": 0.1532631056659511, + "learning_rate": 3.596189260105361e-05, + "loss": 2.8361, + "step": 30520 + }, + { + "epoch": 1.894655161710845, + "grad_norm": 0.15457450057982, + "learning_rate": 3.5958426378467015e-05, + "loss": 2.8143, + "step": 30521 + }, + { + "epoch": 1.8947172388106028, + "grad_norm": 0.1502612226046981, + "learning_rate": 3.595496022913872e-05, + "loss": 2.7246, + "step": 30522 + }, + { + "epoch": 1.8947793159103608, + "grad_norm": 0.16468050340162332, + "learning_rate": 3.59514941530868e-05, + "loss": 2.7181, + "step": 30523 + }, + { + "epoch": 1.8948413930101187, + "grad_norm": 0.1559852443201984, + "learning_rate": 3.594802815032935e-05, + "loss": 2.8384, + "step": 30524 + }, + { + "epoch": 1.8949034701098766, + "grad_norm": 0.17168022628716995, + "learning_rate": 3.594456222088446e-05, + "loss": 2.7628, + "step": 30525 + }, + { + "epoch": 1.8949655472096345, + "grad_norm": 0.14671197375898457, + "learning_rate": 3.594109636477019e-05, + "loss": 2.845, + "step": 30526 + }, + { + "epoch": 1.8950276243093924, + "grad_norm": 0.1539344091360655, + "learning_rate": 3.5937630582004614e-05, + "loss": 2.7219, + "step": 30527 + }, + { + "epoch": 1.8950897014091501, + "grad_norm": 0.15105609304219575, + "learning_rate": 3.5934164872605855e-05, + "loss": 2.8074, + "step": 30528 + }, + { + "epoch": 1.895151778508908, + "grad_norm": 0.15062259148812918, + "learning_rate": 3.593069923659197e-05, + "loss": 2.7671, + "step": 30529 + }, + { + "epoch": 1.895213855608666, + "grad_norm": 0.19019925774450505, + "learning_rate": 3.592723367398104e-05, + "loss": 2.7093, + "step": 30530 + }, + { + "epoch": 1.8952759327084239, + "grad_norm": 0.16301446479224346, + "learning_rate": 3.592376818479115e-05, + "loss": 2.7596, + "step": 30531 + }, + { + "epoch": 1.8953380098081818, + "grad_norm": 0.14879077983495062, + "learning_rate": 3.5920302769040384e-05, + "loss": 2.7853, + "step": 30532 + }, + { + "epoch": 1.8954000869079397, + "grad_norm": 0.16927141348421496, + "learning_rate": 3.591683742674681e-05, + "loss": 2.7473, + "step": 30533 + }, + { + "epoch": 1.8954621640076974, + "grad_norm": 0.14499948664894982, + "learning_rate": 3.591337215792852e-05, + "loss": 2.751, + "step": 30534 + }, + { + "epoch": 1.8955242411074553, + "grad_norm": 0.14955758699931188, + "learning_rate": 3.5909906962603576e-05, + "loss": 2.7026, + "step": 30535 + }, + { + "epoch": 1.8955863182072132, + "grad_norm": 0.18890762448501025, + "learning_rate": 3.5906441840790075e-05, + "loss": 2.693, + "step": 30536 + }, + { + "epoch": 1.8956483953069712, + "grad_norm": 0.1438130629121058, + "learning_rate": 3.590297679250608e-05, + "loss": 2.7642, + "step": 30537 + }, + { + "epoch": 1.895710472406729, + "grad_norm": 0.1562765635321884, + "learning_rate": 3.5899511817769695e-05, + "loss": 2.8289, + "step": 30538 + }, + { + "epoch": 1.895772549506487, + "grad_norm": 0.1368459858799891, + "learning_rate": 3.589604691659898e-05, + "loss": 2.598, + "step": 30539 + }, + { + "epoch": 1.895834626606245, + "grad_norm": 0.14661413167835366, + "learning_rate": 3.5892582089011995e-05, + "loss": 2.7454, + "step": 30540 + }, + { + "epoch": 1.8958967037060028, + "grad_norm": 0.14359467137718765, + "learning_rate": 3.588911733502685e-05, + "loss": 2.781, + "step": 30541 + }, + { + "epoch": 1.8959587808057607, + "grad_norm": 0.1828343125722505, + "learning_rate": 3.588565265466158e-05, + "loss": 2.7798, + "step": 30542 + }, + { + "epoch": 1.8960208579055187, + "grad_norm": 0.1463807247937501, + "learning_rate": 3.588218804793432e-05, + "loss": 2.6983, + "step": 30543 + }, + { + "epoch": 1.8960829350052766, + "grad_norm": 0.1548628506778962, + "learning_rate": 3.5878723514863084e-05, + "loss": 2.7655, + "step": 30544 + }, + { + "epoch": 1.8961450121050345, + "grad_norm": 0.1571153757020179, + "learning_rate": 3.5875259055465994e-05, + "loss": 2.8004, + "step": 30545 + }, + { + "epoch": 1.8962070892047924, + "grad_norm": 0.15136702752873982, + "learning_rate": 3.5871794669761096e-05, + "loss": 2.7535, + "step": 30546 + }, + { + "epoch": 1.8962691663045503, + "grad_norm": 0.149638717652356, + "learning_rate": 3.5868330357766475e-05, + "loss": 2.701, + "step": 30547 + }, + { + "epoch": 1.8963312434043083, + "grad_norm": 0.14371169191428418, + "learning_rate": 3.5864866119500204e-05, + "loss": 2.816, + "step": 30548 + }, + { + "epoch": 1.8963933205040662, + "grad_norm": 0.1537780186372359, + "learning_rate": 3.586140195498037e-05, + "loss": 2.7459, + "step": 30549 + }, + { + "epoch": 1.896455397603824, + "grad_norm": 0.15017618025221788, + "learning_rate": 3.5857937864225035e-05, + "loss": 2.773, + "step": 30550 + }, + { + "epoch": 1.896517474703582, + "grad_norm": 0.1711166351289241, + "learning_rate": 3.5854473847252254e-05, + "loss": 2.7803, + "step": 30551 + }, + { + "epoch": 1.8965795518033397, + "grad_norm": 0.16046781008481345, + "learning_rate": 3.585100990408013e-05, + "loss": 2.8447, + "step": 30552 + }, + { + "epoch": 1.8966416289030976, + "grad_norm": 0.1396801585736725, + "learning_rate": 3.584754603472671e-05, + "loss": 2.7066, + "step": 30553 + }, + { + "epoch": 1.8967037060028555, + "grad_norm": 0.1419390635435795, + "learning_rate": 3.584408223921009e-05, + "loss": 2.7745, + "step": 30554 + }, + { + "epoch": 1.8967657831026135, + "grad_norm": 0.14665749809974454, + "learning_rate": 3.584061851754832e-05, + "loss": 2.6977, + "step": 30555 + }, + { + "epoch": 1.8968278602023714, + "grad_norm": 0.16718357674282572, + "learning_rate": 3.5837154869759496e-05, + "loss": 2.7431, + "step": 30556 + }, + { + "epoch": 1.8968899373021293, + "grad_norm": 0.15997857612549385, + "learning_rate": 3.583369129586165e-05, + "loss": 2.7919, + "step": 30557 + }, + { + "epoch": 1.896952014401887, + "grad_norm": 0.15048197659425092, + "learning_rate": 3.58302277958729e-05, + "loss": 2.798, + "step": 30558 + }, + { + "epoch": 1.897014091501645, + "grad_norm": 0.16102681575017033, + "learning_rate": 3.582676436981127e-05, + "loss": 2.7492, + "step": 30559 + }, + { + "epoch": 1.8970761686014028, + "grad_norm": 0.15750255410454664, + "learning_rate": 3.5823301017694846e-05, + "loss": 2.7728, + "step": 30560 + }, + { + "epoch": 1.8971382457011607, + "grad_norm": 0.14508111846908664, + "learning_rate": 3.5819837739541715e-05, + "loss": 2.7088, + "step": 30561 + }, + { + "epoch": 1.8972003228009187, + "grad_norm": 0.16286451761711834, + "learning_rate": 3.581637453536993e-05, + "loss": 2.7031, + "step": 30562 + }, + { + "epoch": 1.8972623999006766, + "grad_norm": 0.14518844195169958, + "learning_rate": 3.581291140519758e-05, + "loss": 2.7958, + "step": 30563 + }, + { + "epoch": 1.8973244770004345, + "grad_norm": 0.15560079650644398, + "learning_rate": 3.580944834904271e-05, + "loss": 2.782, + "step": 30564 + }, + { + "epoch": 1.8973865541001924, + "grad_norm": 0.158302693764337, + "learning_rate": 3.580598536692337e-05, + "loss": 2.857, + "step": 30565 + }, + { + "epoch": 1.8974486311999503, + "grad_norm": 0.15458684546044513, + "learning_rate": 3.580252245885767e-05, + "loss": 2.7566, + "step": 30566 + }, + { + "epoch": 1.8975107082997082, + "grad_norm": 0.1591007586954568, + "learning_rate": 3.5799059624863654e-05, + "loss": 2.6762, + "step": 30567 + }, + { + "epoch": 1.8975727853994662, + "grad_norm": 0.16194563374949164, + "learning_rate": 3.579559686495939e-05, + "loss": 2.7028, + "step": 30568 + }, + { + "epoch": 1.897634862499224, + "grad_norm": 0.1399912195134435, + "learning_rate": 3.579213417916294e-05, + "loss": 2.6365, + "step": 30569 + }, + { + "epoch": 1.897696939598982, + "grad_norm": 0.17268444323034704, + "learning_rate": 3.578867156749238e-05, + "loss": 2.7024, + "step": 30570 + }, + { + "epoch": 1.89775901669874, + "grad_norm": 0.16491434361994395, + "learning_rate": 3.5785209029965765e-05, + "loss": 2.7524, + "step": 30571 + }, + { + "epoch": 1.8978210937984978, + "grad_norm": 0.14674103821492918, + "learning_rate": 3.578174656660118e-05, + "loss": 2.7515, + "step": 30572 + }, + { + "epoch": 1.8978831708982558, + "grad_norm": 0.1462394383174774, + "learning_rate": 3.5778284177416654e-05, + "loss": 2.7586, + "step": 30573 + }, + { + "epoch": 1.8979452479980137, + "grad_norm": 0.14505207454423405, + "learning_rate": 3.577482186243029e-05, + "loss": 2.6983, + "step": 30574 + }, + { + "epoch": 1.8980073250977716, + "grad_norm": 0.14642825005118892, + "learning_rate": 3.577135962166013e-05, + "loss": 2.7526, + "step": 30575 + }, + { + "epoch": 1.8980694021975293, + "grad_norm": 0.16999792720954512, + "learning_rate": 3.576789745512423e-05, + "loss": 2.6428, + "step": 30576 + }, + { + "epoch": 1.8981314792972872, + "grad_norm": 0.15583360143394492, + "learning_rate": 3.576443536284068e-05, + "loss": 2.7587, + "step": 30577 + }, + { + "epoch": 1.8981935563970451, + "grad_norm": 0.16019265412415712, + "learning_rate": 3.576097334482751e-05, + "loss": 2.8401, + "step": 30578 + }, + { + "epoch": 1.898255633496803, + "grad_norm": 0.14261183927982962, + "learning_rate": 3.575751140110282e-05, + "loss": 2.8129, + "step": 30579 + }, + { + "epoch": 1.898317710596561, + "grad_norm": 0.14728858804138198, + "learning_rate": 3.5754049531684625e-05, + "loss": 2.8159, + "step": 30580 + }, + { + "epoch": 1.8983797876963189, + "grad_norm": 0.19072418915444603, + "learning_rate": 3.575058773659103e-05, + "loss": 2.807, + "step": 30581 + }, + { + "epoch": 1.8984418647960766, + "grad_norm": 0.14322653392337228, + "learning_rate": 3.5747126015840065e-05, + "loss": 2.7773, + "step": 30582 + }, + { + "epoch": 1.8985039418958345, + "grad_norm": 0.15331409830511344, + "learning_rate": 3.5743664369449815e-05, + "loss": 2.8338, + "step": 30583 + }, + { + "epoch": 1.8985660189955924, + "grad_norm": 0.15251873432839808, + "learning_rate": 3.574020279743832e-05, + "loss": 2.7753, + "step": 30584 + }, + { + "epoch": 1.8986280960953503, + "grad_norm": 0.15958922891541572, + "learning_rate": 3.573674129982366e-05, + "loss": 2.6528, + "step": 30585 + }, + { + "epoch": 1.8986901731951082, + "grad_norm": 0.15546575667983717, + "learning_rate": 3.573327987662388e-05, + "loss": 2.7361, + "step": 30586 + }, + { + "epoch": 1.8987522502948662, + "grad_norm": 0.1387724109004573, + "learning_rate": 3.5729818527857035e-05, + "loss": 2.7616, + "step": 30587 + }, + { + "epoch": 1.898814327394624, + "grad_norm": 0.1468081583454539, + "learning_rate": 3.57263572535412e-05, + "loss": 2.6901, + "step": 30588 + }, + { + "epoch": 1.898876404494382, + "grad_norm": 0.15149942589877666, + "learning_rate": 3.5722896053694413e-05, + "loss": 2.8159, + "step": 30589 + }, + { + "epoch": 1.89893848159414, + "grad_norm": 0.14365835404157015, + "learning_rate": 3.571943492833475e-05, + "loss": 2.7036, + "step": 30590 + }, + { + "epoch": 1.8990005586938978, + "grad_norm": 0.1395067149395388, + "learning_rate": 3.571597387748026e-05, + "loss": 2.6563, + "step": 30591 + }, + { + "epoch": 1.8990626357936558, + "grad_norm": 0.1471757088127518, + "learning_rate": 3.5712512901149e-05, + "loss": 2.743, + "step": 30592 + }, + { + "epoch": 1.8991247128934137, + "grad_norm": 0.1675720298209924, + "learning_rate": 3.570905199935902e-05, + "loss": 2.8656, + "step": 30593 + }, + { + "epoch": 1.8991867899931716, + "grad_norm": 0.1472476022351867, + "learning_rate": 3.5705591172128385e-05, + "loss": 2.7088, + "step": 30594 + }, + { + "epoch": 1.8992488670929295, + "grad_norm": 0.1620925009804196, + "learning_rate": 3.570213041947517e-05, + "loss": 2.8265, + "step": 30595 + }, + { + "epoch": 1.8993109441926874, + "grad_norm": 0.14552907340363608, + "learning_rate": 3.56986697414174e-05, + "loss": 2.7499, + "step": 30596 + }, + { + "epoch": 1.8993730212924453, + "grad_norm": 0.15180121921638978, + "learning_rate": 3.569520913797315e-05, + "loss": 2.7991, + "step": 30597 + }, + { + "epoch": 1.8994350983922033, + "grad_norm": 0.14773434435734664, + "learning_rate": 3.5691748609160455e-05, + "loss": 2.7689, + "step": 30598 + }, + { + "epoch": 1.8994971754919612, + "grad_norm": 0.14824776695517236, + "learning_rate": 3.5688288154997396e-05, + "loss": 2.7046, + "step": 30599 + }, + { + "epoch": 1.8995592525917189, + "grad_norm": 0.15139831099939693, + "learning_rate": 3.5684827775502006e-05, + "loss": 2.8465, + "step": 30600 + }, + { + "epoch": 1.8996213296914768, + "grad_norm": 0.17919127735138876, + "learning_rate": 3.568136747069234e-05, + "loss": 2.7844, + "step": 30601 + }, + { + "epoch": 1.8996834067912347, + "grad_norm": 0.14834625802854104, + "learning_rate": 3.567790724058646e-05, + "loss": 2.7021, + "step": 30602 + }, + { + "epoch": 1.8997454838909926, + "grad_norm": 0.1460799929827561, + "learning_rate": 3.5674447085202414e-05, + "loss": 2.7234, + "step": 30603 + }, + { + "epoch": 1.8998075609907505, + "grad_norm": 0.1454489864256842, + "learning_rate": 3.567098700455825e-05, + "loss": 2.6667, + "step": 30604 + }, + { + "epoch": 1.8998696380905085, + "grad_norm": 0.14722884778855092, + "learning_rate": 3.566752699867203e-05, + "loss": 2.7938, + "step": 30605 + }, + { + "epoch": 1.8999317151902662, + "grad_norm": 0.15057433042804888, + "learning_rate": 3.56640670675618e-05, + "loss": 2.812, + "step": 30606 + }, + { + "epoch": 1.899993792290024, + "grad_norm": 0.14964528362522475, + "learning_rate": 3.5660607211245614e-05, + "loss": 2.7502, + "step": 30607 + }, + { + "epoch": 1.900055869389782, + "grad_norm": 0.14162602048970874, + "learning_rate": 3.5657147429741525e-05, + "loss": 2.8076, + "step": 30608 + }, + { + "epoch": 1.90011794648954, + "grad_norm": 0.14333058725635572, + "learning_rate": 3.5653687723067566e-05, + "loss": 2.7108, + "step": 30609 + }, + { + "epoch": 1.9001800235892978, + "grad_norm": 0.14617210163892, + "learning_rate": 3.565022809124181e-05, + "loss": 2.7594, + "step": 30610 + }, + { + "epoch": 1.9002421006890557, + "grad_norm": 0.14752000926569236, + "learning_rate": 3.5646768534282306e-05, + "loss": 2.8441, + "step": 30611 + }, + { + "epoch": 1.9003041777888137, + "grad_norm": 0.15551800331045623, + "learning_rate": 3.564330905220708e-05, + "loss": 2.7682, + "step": 30612 + }, + { + "epoch": 1.9003662548885716, + "grad_norm": 0.1504984011208715, + "learning_rate": 3.563984964503421e-05, + "loss": 2.6752, + "step": 30613 + }, + { + "epoch": 1.9004283319883295, + "grad_norm": 0.14093154371282504, + "learning_rate": 3.563639031278171e-05, + "loss": 2.6934, + "step": 30614 + }, + { + "epoch": 1.9004904090880874, + "grad_norm": 0.1744365545210984, + "learning_rate": 3.563293105546767e-05, + "loss": 2.7755, + "step": 30615 + }, + { + "epoch": 1.9005524861878453, + "grad_norm": 0.145280160689378, + "learning_rate": 3.56294718731101e-05, + "loss": 2.7823, + "step": 30616 + }, + { + "epoch": 1.9006145632876033, + "grad_norm": 0.18139008707135482, + "learning_rate": 3.562601276572708e-05, + "loss": 2.7107, + "step": 30617 + }, + { + "epoch": 1.9006766403873612, + "grad_norm": 0.15508175060156215, + "learning_rate": 3.562255373333662e-05, + "loss": 2.741, + "step": 30618 + }, + { + "epoch": 1.900738717487119, + "grad_norm": 0.14733511881259767, + "learning_rate": 3.561909477595681e-05, + "loss": 2.7947, + "step": 30619 + }, + { + "epoch": 1.900800794586877, + "grad_norm": 0.18071416211756738, + "learning_rate": 3.561563589360566e-05, + "loss": 2.7276, + "step": 30620 + }, + { + "epoch": 1.900862871686635, + "grad_norm": 0.1571734617137825, + "learning_rate": 3.5612177086301234e-05, + "loss": 2.7998, + "step": 30621 + }, + { + "epoch": 1.9009249487863928, + "grad_norm": 0.1597590347180607, + "learning_rate": 3.560871835406157e-05, + "loss": 2.7269, + "step": 30622 + }, + { + "epoch": 1.9009870258861508, + "grad_norm": 0.148723114192515, + "learning_rate": 3.5605259696904716e-05, + "loss": 2.7095, + "step": 30623 + }, + { + "epoch": 1.9010491029859085, + "grad_norm": 0.15300968314130986, + "learning_rate": 3.560180111484872e-05, + "loss": 2.7573, + "step": 30624 + }, + { + "epoch": 1.9011111800856664, + "grad_norm": 0.14873029667524376, + "learning_rate": 3.5598342607911605e-05, + "loss": 2.7382, + "step": 30625 + }, + { + "epoch": 1.9011732571854243, + "grad_norm": 0.15281507135930888, + "learning_rate": 3.559488417611144e-05, + "loss": 2.7748, + "step": 30626 + }, + { + "epoch": 1.9012353342851822, + "grad_norm": 0.15566035114300922, + "learning_rate": 3.559142581946628e-05, + "loss": 2.712, + "step": 30627 + }, + { + "epoch": 1.9012974113849401, + "grad_norm": 0.1392113641584317, + "learning_rate": 3.558796753799414e-05, + "loss": 2.7651, + "step": 30628 + }, + { + "epoch": 1.901359488484698, + "grad_norm": 0.14046241050052669, + "learning_rate": 3.558450933171307e-05, + "loss": 2.6072, + "step": 30629 + }, + { + "epoch": 1.9014215655844557, + "grad_norm": 0.15155050190893143, + "learning_rate": 3.5581051200641116e-05, + "loss": 2.773, + "step": 30630 + }, + { + "epoch": 1.9014836426842137, + "grad_norm": 0.1347905149833183, + "learning_rate": 3.5577593144796315e-05, + "loss": 2.7077, + "step": 30631 + }, + { + "epoch": 1.9015457197839716, + "grad_norm": 0.1649411266408101, + "learning_rate": 3.557413516419671e-05, + "loss": 2.7939, + "step": 30632 + }, + { + "epoch": 1.9016077968837295, + "grad_norm": 0.15744129468154125, + "learning_rate": 3.5570677258860355e-05, + "loss": 2.822, + "step": 30633 + }, + { + "epoch": 1.9016698739834874, + "grad_norm": 0.15335161654901716, + "learning_rate": 3.5567219428805265e-05, + "loss": 2.7359, + "step": 30634 + }, + { + "epoch": 1.9017319510832453, + "grad_norm": 0.1578310960256434, + "learning_rate": 3.5563761674049514e-05, + "loss": 2.6818, + "step": 30635 + }, + { + "epoch": 1.9017940281830032, + "grad_norm": 0.14779481920183113, + "learning_rate": 3.556030399461111e-05, + "loss": 2.6859, + "step": 30636 + }, + { + "epoch": 1.9018561052827612, + "grad_norm": 0.15532601264144302, + "learning_rate": 3.5556846390508105e-05, + "loss": 2.7091, + "step": 30637 + }, + { + "epoch": 1.901918182382519, + "grad_norm": 0.16675951291341243, + "learning_rate": 3.555338886175854e-05, + "loss": 2.8548, + "step": 30638 + }, + { + "epoch": 1.901980259482277, + "grad_norm": 0.23066471141752978, + "learning_rate": 3.554993140838045e-05, + "loss": 2.8052, + "step": 30639 + }, + { + "epoch": 1.902042336582035, + "grad_norm": 0.16029367714705958, + "learning_rate": 3.5546474030391886e-05, + "loss": 2.8208, + "step": 30640 + }, + { + "epoch": 1.9021044136817928, + "grad_norm": 0.17250436190777804, + "learning_rate": 3.554301672781086e-05, + "loss": 2.7782, + "step": 30641 + }, + { + "epoch": 1.9021664907815508, + "grad_norm": 0.16403417115636407, + "learning_rate": 3.553955950065544e-05, + "loss": 2.6802, + "step": 30642 + }, + { + "epoch": 1.9022285678813087, + "grad_norm": 0.18243142742166782, + "learning_rate": 3.553610234894363e-05, + "loss": 2.7225, + "step": 30643 + }, + { + "epoch": 1.9022906449810666, + "grad_norm": 0.1674536970000105, + "learning_rate": 3.553264527269351e-05, + "loss": 2.7269, + "step": 30644 + }, + { + "epoch": 1.9023527220808245, + "grad_norm": 0.15669530950902777, + "learning_rate": 3.552918827192307e-05, + "loss": 2.7329, + "step": 30645 + }, + { + "epoch": 1.9024147991805824, + "grad_norm": 0.1697734974187003, + "learning_rate": 3.5525731346650384e-05, + "loss": 2.7405, + "step": 30646 + }, + { + "epoch": 1.9024768762803403, + "grad_norm": 0.14666548023931586, + "learning_rate": 3.552227449689346e-05, + "loss": 2.7671, + "step": 30647 + }, + { + "epoch": 1.902538953380098, + "grad_norm": 0.15410900750687367, + "learning_rate": 3.551881772267035e-05, + "loss": 2.6842, + "step": 30648 + }, + { + "epoch": 1.902601030479856, + "grad_norm": 0.14240821747784155, + "learning_rate": 3.551536102399908e-05, + "loss": 2.7605, + "step": 30649 + }, + { + "epoch": 1.9026631075796139, + "grad_norm": 0.1594602359249625, + "learning_rate": 3.551190440089769e-05, + "loss": 2.8239, + "step": 30650 + }, + { + "epoch": 1.9027251846793718, + "grad_norm": 0.14885644392986025, + "learning_rate": 3.550844785338422e-05, + "loss": 2.7441, + "step": 30651 + }, + { + "epoch": 1.9027872617791297, + "grad_norm": 0.1455802609109997, + "learning_rate": 3.5504991381476685e-05, + "loss": 2.7704, + "step": 30652 + }, + { + "epoch": 1.9028493388788876, + "grad_norm": 0.1431903952805796, + "learning_rate": 3.550153498519313e-05, + "loss": 2.7689, + "step": 30653 + }, + { + "epoch": 1.9029114159786453, + "grad_norm": 0.15994513367215413, + "learning_rate": 3.549807866455158e-05, + "loss": 2.7249, + "step": 30654 + }, + { + "epoch": 1.9029734930784032, + "grad_norm": 0.15335297084851363, + "learning_rate": 3.549462241957009e-05, + "loss": 2.7503, + "step": 30655 + }, + { + "epoch": 1.9030355701781612, + "grad_norm": 0.14871774745546332, + "learning_rate": 3.549116625026666e-05, + "loss": 2.7784, + "step": 30656 + }, + { + "epoch": 1.903097647277919, + "grad_norm": 0.15388708382563324, + "learning_rate": 3.548771015665935e-05, + "loss": 2.8056, + "step": 30657 + }, + { + "epoch": 1.903159724377677, + "grad_norm": 0.1455296941599905, + "learning_rate": 3.548425413876616e-05, + "loss": 2.809, + "step": 30658 + }, + { + "epoch": 1.903221801477435, + "grad_norm": 0.14415778417202993, + "learning_rate": 3.5480798196605145e-05, + "loss": 2.6556, + "step": 30659 + }, + { + "epoch": 1.9032838785771928, + "grad_norm": 0.1491452772694784, + "learning_rate": 3.547734233019435e-05, + "loss": 2.7705, + "step": 30660 + }, + { + "epoch": 1.9033459556769508, + "grad_norm": 0.15247552014687643, + "learning_rate": 3.547388653955178e-05, + "loss": 2.8138, + "step": 30661 + }, + { + "epoch": 1.9034080327767087, + "grad_norm": 0.14117721244589193, + "learning_rate": 3.5470430824695456e-05, + "loss": 2.7611, + "step": 30662 + }, + { + "epoch": 1.9034701098764666, + "grad_norm": 0.15042913241149033, + "learning_rate": 3.5466975185643435e-05, + "loss": 2.7342, + "step": 30663 + }, + { + "epoch": 1.9035321869762245, + "grad_norm": 0.14627705207689887, + "learning_rate": 3.5463519622413724e-05, + "loss": 2.7537, + "step": 30664 + }, + { + "epoch": 1.9035942640759824, + "grad_norm": 0.14627737678537442, + "learning_rate": 3.5460064135024375e-05, + "loss": 2.8941, + "step": 30665 + }, + { + "epoch": 1.9036563411757403, + "grad_norm": 0.14173081516988617, + "learning_rate": 3.5456608723493386e-05, + "loss": 2.7785, + "step": 30666 + }, + { + "epoch": 1.9037184182754983, + "grad_norm": 0.13985599085975617, + "learning_rate": 3.545315338783881e-05, + "loss": 2.705, + "step": 30667 + }, + { + "epoch": 1.9037804953752562, + "grad_norm": 0.14112010530121463, + "learning_rate": 3.5449698128078654e-05, + "loss": 2.7516, + "step": 30668 + }, + { + "epoch": 1.903842572475014, + "grad_norm": 0.14124284887567812, + "learning_rate": 3.544624294423096e-05, + "loss": 2.7655, + "step": 30669 + }, + { + "epoch": 1.903904649574772, + "grad_norm": 0.14700434270310508, + "learning_rate": 3.544278783631375e-05, + "loss": 2.7625, + "step": 30670 + }, + { + "epoch": 1.90396672667453, + "grad_norm": 0.14346243690397267, + "learning_rate": 3.543933280434505e-05, + "loss": 2.7344, + "step": 30671 + }, + { + "epoch": 1.9040288037742876, + "grad_norm": 0.14544470269712143, + "learning_rate": 3.543587784834288e-05, + "loss": 2.7934, + "step": 30672 + }, + { + "epoch": 1.9040908808740455, + "grad_norm": 0.14661262619875975, + "learning_rate": 3.543242296832528e-05, + "loss": 2.6504, + "step": 30673 + }, + { + "epoch": 1.9041529579738035, + "grad_norm": 0.15721449598366374, + "learning_rate": 3.5428968164310265e-05, + "loss": 2.8027, + "step": 30674 + }, + { + "epoch": 1.9042150350735614, + "grad_norm": 0.150785806320211, + "learning_rate": 3.542551343631585e-05, + "loss": 2.7863, + "step": 30675 + }, + { + "epoch": 1.9042771121733193, + "grad_norm": 0.15037995461596104, + "learning_rate": 3.5422058784360084e-05, + "loss": 2.7039, + "step": 30676 + }, + { + "epoch": 1.9043391892730772, + "grad_norm": 0.16342666005606926, + "learning_rate": 3.541860420846096e-05, + "loss": 2.7812, + "step": 30677 + }, + { + "epoch": 1.904401266372835, + "grad_norm": 0.15495599530634951, + "learning_rate": 3.541514970863653e-05, + "loss": 2.7266, + "step": 30678 + }, + { + "epoch": 1.9044633434725928, + "grad_norm": 0.1443064526599745, + "learning_rate": 3.54116952849048e-05, + "loss": 2.6666, + "step": 30679 + }, + { + "epoch": 1.9045254205723507, + "grad_norm": 0.1558956419546681, + "learning_rate": 3.54082409372838e-05, + "loss": 2.7737, + "step": 30680 + }, + { + "epoch": 1.9045874976721087, + "grad_norm": 0.14161785952739578, + "learning_rate": 3.5404786665791544e-05, + "loss": 2.7617, + "step": 30681 + }, + { + "epoch": 1.9046495747718666, + "grad_norm": 0.14103095084939882, + "learning_rate": 3.540133247044607e-05, + "loss": 2.7205, + "step": 30682 + }, + { + "epoch": 1.9047116518716245, + "grad_norm": 0.1471205376103896, + "learning_rate": 3.5397878351265365e-05, + "loss": 2.7493, + "step": 30683 + }, + { + "epoch": 1.9047737289713824, + "grad_norm": 0.16812815975212178, + "learning_rate": 3.53944243082675e-05, + "loss": 2.8097, + "step": 30684 + }, + { + "epoch": 1.9048358060711403, + "grad_norm": 0.1582441983063422, + "learning_rate": 3.539097034147046e-05, + "loss": 2.8295, + "step": 30685 + }, + { + "epoch": 1.9048978831708983, + "grad_norm": 0.1519234434621813, + "learning_rate": 3.5387516450892263e-05, + "loss": 2.747, + "step": 30686 + }, + { + "epoch": 1.9049599602706562, + "grad_norm": 0.14796456487148935, + "learning_rate": 3.538406263655096e-05, + "loss": 2.8021, + "step": 30687 + }, + { + "epoch": 1.905022037370414, + "grad_norm": 0.1449913164428448, + "learning_rate": 3.538060889846454e-05, + "loss": 2.7851, + "step": 30688 + }, + { + "epoch": 1.905084114470172, + "grad_norm": 0.17099889799789128, + "learning_rate": 3.537715523665103e-05, + "loss": 2.6719, + "step": 30689 + }, + { + "epoch": 1.90514619156993, + "grad_norm": 0.15102157513953796, + "learning_rate": 3.537370165112845e-05, + "loss": 2.8239, + "step": 30690 + }, + { + "epoch": 1.9052082686696878, + "grad_norm": 0.1731111549184706, + "learning_rate": 3.537024814191481e-05, + "loss": 2.7679, + "step": 30691 + }, + { + "epoch": 1.9052703457694458, + "grad_norm": 0.1506856526341083, + "learning_rate": 3.536679470902815e-05, + "loss": 2.7753, + "step": 30692 + }, + { + "epoch": 1.9053324228692037, + "grad_norm": 0.14137829246026243, + "learning_rate": 3.5363341352486465e-05, + "loss": 2.792, + "step": 30693 + }, + { + "epoch": 1.9053944999689616, + "grad_norm": 0.17340271038093916, + "learning_rate": 3.5359888072307795e-05, + "loss": 2.732, + "step": 30694 + }, + { + "epoch": 1.9054565770687195, + "grad_norm": 0.15243571513352153, + "learning_rate": 3.535643486851014e-05, + "loss": 2.8255, + "step": 30695 + }, + { + "epoch": 1.9055186541684772, + "grad_norm": 0.15718872491689176, + "learning_rate": 3.535298174111151e-05, + "loss": 2.8994, + "step": 30696 + }, + { + "epoch": 1.9055807312682351, + "grad_norm": 0.14097354002027623, + "learning_rate": 3.5349528690129934e-05, + "loss": 2.711, + "step": 30697 + }, + { + "epoch": 1.905642808367993, + "grad_norm": 0.1452744167425145, + "learning_rate": 3.534607571558343e-05, + "loss": 2.7913, + "step": 30698 + }, + { + "epoch": 1.905704885467751, + "grad_norm": 0.1525613929629451, + "learning_rate": 3.534262281749001e-05, + "loss": 2.7345, + "step": 30699 + }, + { + "epoch": 1.9057669625675089, + "grad_norm": 0.1415414814260946, + "learning_rate": 3.533916999586767e-05, + "loss": 2.7318, + "step": 30700 + }, + { + "epoch": 1.9058290396672668, + "grad_norm": 0.1422764840294474, + "learning_rate": 3.533571725073446e-05, + "loss": 2.7204, + "step": 30701 + }, + { + "epoch": 1.9058911167670245, + "grad_norm": 0.1436732574529136, + "learning_rate": 3.533226458210835e-05, + "loss": 2.8259, + "step": 30702 + }, + { + "epoch": 1.9059531938667824, + "grad_norm": 0.15159263434163714, + "learning_rate": 3.532881199000739e-05, + "loss": 2.7324, + "step": 30703 + }, + { + "epoch": 1.9060152709665403, + "grad_norm": 0.14262562443924998, + "learning_rate": 3.532535947444957e-05, + "loss": 2.7942, + "step": 30704 + }, + { + "epoch": 1.9060773480662982, + "grad_norm": 0.14202479548208738, + "learning_rate": 3.5321907035452914e-05, + "loss": 2.7629, + "step": 30705 + }, + { + "epoch": 1.9061394251660562, + "grad_norm": 0.15244728018412013, + "learning_rate": 3.531845467303544e-05, + "loss": 2.7145, + "step": 30706 + }, + { + "epoch": 1.906201502265814, + "grad_norm": 0.14723530616615588, + "learning_rate": 3.5315002387215144e-05, + "loss": 2.795, + "step": 30707 + }, + { + "epoch": 1.906263579365572, + "grad_norm": 0.14687938201738776, + "learning_rate": 3.5311550178010044e-05, + "loss": 2.7471, + "step": 30708 + }, + { + "epoch": 1.90632565646533, + "grad_norm": 0.14327249882970552, + "learning_rate": 3.5308098045438164e-05, + "loss": 2.6774, + "step": 30709 + }, + { + "epoch": 1.9063877335650878, + "grad_norm": 0.13838728056276842, + "learning_rate": 3.5304645989517506e-05, + "loss": 2.7578, + "step": 30710 + }, + { + "epoch": 1.9064498106648458, + "grad_norm": 0.14740684603493165, + "learning_rate": 3.5301194010266057e-05, + "loss": 2.7574, + "step": 30711 + }, + { + "epoch": 1.9065118877646037, + "grad_norm": 0.16072163572136902, + "learning_rate": 3.529774210770187e-05, + "loss": 2.7662, + "step": 30712 + }, + { + "epoch": 1.9065739648643616, + "grad_norm": 0.14278642905809724, + "learning_rate": 3.529429028184291e-05, + "loss": 2.7318, + "step": 30713 + }, + { + "epoch": 1.9066360419641195, + "grad_norm": 0.15340053986268806, + "learning_rate": 3.529083853270723e-05, + "loss": 2.7357, + "step": 30714 + }, + { + "epoch": 1.9066981190638774, + "grad_norm": 0.1500856726087449, + "learning_rate": 3.528738686031279e-05, + "loss": 2.7324, + "step": 30715 + }, + { + "epoch": 1.9067601961636353, + "grad_norm": 0.15296489071677855, + "learning_rate": 3.528393526467764e-05, + "loss": 2.6912, + "step": 30716 + }, + { + "epoch": 1.9068222732633933, + "grad_norm": 0.15586732256137178, + "learning_rate": 3.528048374581977e-05, + "loss": 2.7971, + "step": 30717 + }, + { + "epoch": 1.9068843503631512, + "grad_norm": 0.14086331598414667, + "learning_rate": 3.527703230375719e-05, + "loss": 2.7473, + "step": 30718 + }, + { + "epoch": 1.906946427462909, + "grad_norm": 0.15182032285757113, + "learning_rate": 3.52735809385079e-05, + "loss": 2.7273, + "step": 30719 + }, + { + "epoch": 1.9070085045626668, + "grad_norm": 0.14935298369375447, + "learning_rate": 3.5270129650089926e-05, + "loss": 2.8291, + "step": 30720 + }, + { + "epoch": 1.9070705816624247, + "grad_norm": 0.14571173670582643, + "learning_rate": 3.526667843852126e-05, + "loss": 2.7899, + "step": 30721 + }, + { + "epoch": 1.9071326587621826, + "grad_norm": 0.14948107270167546, + "learning_rate": 3.5263227303819895e-05, + "loss": 2.8097, + "step": 30722 + }, + { + "epoch": 1.9071947358619405, + "grad_norm": 0.15354075044214524, + "learning_rate": 3.5259776246003865e-05, + "loss": 2.7331, + "step": 30723 + }, + { + "epoch": 1.9072568129616985, + "grad_norm": 0.14535838653694186, + "learning_rate": 3.5256325265091136e-05, + "loss": 2.7336, + "step": 30724 + }, + { + "epoch": 1.9073188900614562, + "grad_norm": 0.14838394654593354, + "learning_rate": 3.525287436109975e-05, + "loss": 2.8907, + "step": 30725 + }, + { + "epoch": 1.907380967161214, + "grad_norm": 0.13939360729593867, + "learning_rate": 3.524942353404772e-05, + "loss": 2.695, + "step": 30726 + }, + { + "epoch": 1.907443044260972, + "grad_norm": 0.13921244124258617, + "learning_rate": 3.5245972783953e-05, + "loss": 2.7145, + "step": 30727 + }, + { + "epoch": 1.90750512136073, + "grad_norm": 0.1406179333606698, + "learning_rate": 3.5242522110833645e-05, + "loss": 2.676, + "step": 30728 + }, + { + "epoch": 1.9075671984604878, + "grad_norm": 0.1353577514708716, + "learning_rate": 3.5239071514707614e-05, + "loss": 2.7794, + "step": 30729 + }, + { + "epoch": 1.9076292755602458, + "grad_norm": 0.13902362349718284, + "learning_rate": 3.5235620995592945e-05, + "loss": 2.7822, + "step": 30730 + }, + { + "epoch": 1.9076913526600037, + "grad_norm": 0.14182819777130987, + "learning_rate": 3.5232170553507614e-05, + "loss": 2.7294, + "step": 30731 + }, + { + "epoch": 1.9077534297597616, + "grad_norm": 0.15408888209863392, + "learning_rate": 3.522872018846965e-05, + "loss": 2.6916, + "step": 30732 + }, + { + "epoch": 1.9078155068595195, + "grad_norm": 0.1505755488947314, + "learning_rate": 3.522526990049703e-05, + "loss": 2.6402, + "step": 30733 + }, + { + "epoch": 1.9078775839592774, + "grad_norm": 0.1454568171907951, + "learning_rate": 3.522181968960776e-05, + "loss": 2.7453, + "step": 30734 + }, + { + "epoch": 1.9079396610590353, + "grad_norm": 0.15320735463553506, + "learning_rate": 3.521836955581985e-05, + "loss": 2.7596, + "step": 30735 + }, + { + "epoch": 1.9080017381587933, + "grad_norm": 0.16889222915287092, + "learning_rate": 3.521491949915129e-05, + "loss": 2.7789, + "step": 30736 + }, + { + "epoch": 1.9080638152585512, + "grad_norm": 0.13706624964043818, + "learning_rate": 3.521146951962009e-05, + "loss": 2.81, + "step": 30737 + }, + { + "epoch": 1.908125892358309, + "grad_norm": 0.139065338759433, + "learning_rate": 3.5208019617244236e-05, + "loss": 2.7322, + "step": 30738 + }, + { + "epoch": 1.908187969458067, + "grad_norm": 0.1418544352957429, + "learning_rate": 3.520456979204174e-05, + "loss": 2.7544, + "step": 30739 + }, + { + "epoch": 1.908250046557825, + "grad_norm": 0.1445714920313289, + "learning_rate": 3.520112004403059e-05, + "loss": 2.6683, + "step": 30740 + }, + { + "epoch": 1.9083121236575828, + "grad_norm": 0.17402905199303756, + "learning_rate": 3.51976703732288e-05, + "loss": 2.7347, + "step": 30741 + }, + { + "epoch": 1.9083742007573408, + "grad_norm": 0.16178188220433515, + "learning_rate": 3.519422077965434e-05, + "loss": 2.7379, + "step": 30742 + }, + { + "epoch": 1.9084362778570985, + "grad_norm": 0.1639296356987918, + "learning_rate": 3.519077126332524e-05, + "loss": 2.7252, + "step": 30743 + }, + { + "epoch": 1.9084983549568564, + "grad_norm": 0.15104465577576942, + "learning_rate": 3.5187321824259476e-05, + "loss": 2.737, + "step": 30744 + }, + { + "epoch": 1.9085604320566143, + "grad_norm": 0.16764627595867312, + "learning_rate": 3.518387246247505e-05, + "loss": 2.7389, + "step": 30745 + }, + { + "epoch": 1.9086225091563722, + "grad_norm": 0.15124958761411203, + "learning_rate": 3.518042317798996e-05, + "loss": 2.7268, + "step": 30746 + }, + { + "epoch": 1.9086845862561301, + "grad_norm": 0.16699068169165868, + "learning_rate": 3.517697397082219e-05, + "loss": 2.7985, + "step": 30747 + }, + { + "epoch": 1.908746663355888, + "grad_norm": 0.14339638759734627, + "learning_rate": 3.5173524840989764e-05, + "loss": 2.7504, + "step": 30748 + }, + { + "epoch": 1.9088087404556457, + "grad_norm": 0.1699312306163448, + "learning_rate": 3.517007578851064e-05, + "loss": 2.6921, + "step": 30749 + }, + { + "epoch": 1.9088708175554037, + "grad_norm": 0.15484629365273342, + "learning_rate": 3.516662681340284e-05, + "loss": 2.7799, + "step": 30750 + }, + { + "epoch": 1.9089328946551616, + "grad_norm": 0.1490372162694709, + "learning_rate": 3.5163177915684344e-05, + "loss": 2.7826, + "step": 30751 + }, + { + "epoch": 1.9089949717549195, + "grad_norm": 0.14108868816923475, + "learning_rate": 3.515972909537316e-05, + "loss": 2.8469, + "step": 30752 + }, + { + "epoch": 1.9090570488546774, + "grad_norm": 0.15091469719817444, + "learning_rate": 3.515628035248726e-05, + "loss": 2.7892, + "step": 30753 + }, + { + "epoch": 1.9091191259544353, + "grad_norm": 0.14151190846353828, + "learning_rate": 3.515283168704465e-05, + "loss": 2.7756, + "step": 30754 + }, + { + "epoch": 1.9091812030541933, + "grad_norm": 0.14610903017437635, + "learning_rate": 3.514938309906333e-05, + "loss": 2.794, + "step": 30755 + }, + { + "epoch": 1.9092432801539512, + "grad_norm": 0.1515783682607805, + "learning_rate": 3.5145934588561255e-05, + "loss": 2.7469, + "step": 30756 + }, + { + "epoch": 1.909305357253709, + "grad_norm": 0.14172419138033568, + "learning_rate": 3.5142486155556475e-05, + "loss": 2.6874, + "step": 30757 + }, + { + "epoch": 1.909367434353467, + "grad_norm": 0.16770941653010435, + "learning_rate": 3.513903780006694e-05, + "loss": 2.7417, + "step": 30758 + }, + { + "epoch": 1.909429511453225, + "grad_norm": 0.13617078779599623, + "learning_rate": 3.513558952211066e-05, + "loss": 2.8013, + "step": 30759 + }, + { + "epoch": 1.9094915885529828, + "grad_norm": 0.14628110769770594, + "learning_rate": 3.513214132170563e-05, + "loss": 2.7237, + "step": 30760 + }, + { + "epoch": 1.9095536656527408, + "grad_norm": 0.15830462931733727, + "learning_rate": 3.51286931988698e-05, + "loss": 2.7326, + "step": 30761 + }, + { + "epoch": 1.9096157427524987, + "grad_norm": 0.14344480422169717, + "learning_rate": 3.512524515362121e-05, + "loss": 2.7744, + "step": 30762 + }, + { + "epoch": 1.9096778198522566, + "grad_norm": 0.1691986951189108, + "learning_rate": 3.512179718597781e-05, + "loss": 2.7332, + "step": 30763 + }, + { + "epoch": 1.9097398969520145, + "grad_norm": 0.15009865968327865, + "learning_rate": 3.511834929595762e-05, + "loss": 2.7587, + "step": 30764 + }, + { + "epoch": 1.9098019740517724, + "grad_norm": 0.14067335754337007, + "learning_rate": 3.51149014835786e-05, + "loss": 2.8148, + "step": 30765 + }, + { + "epoch": 1.9098640511515304, + "grad_norm": 0.15386320083307314, + "learning_rate": 3.511145374885877e-05, + "loss": 2.7199, + "step": 30766 + }, + { + "epoch": 1.909926128251288, + "grad_norm": 0.13579011660994636, + "learning_rate": 3.510800609181608e-05, + "loss": 2.7855, + "step": 30767 + }, + { + "epoch": 1.909988205351046, + "grad_norm": 0.14423616675064763, + "learning_rate": 3.5104558512468554e-05, + "loss": 2.7913, + "step": 30768 + }, + { + "epoch": 1.9100502824508039, + "grad_norm": 0.14378742891734536, + "learning_rate": 3.510111101083415e-05, + "loss": 2.7154, + "step": 30769 + }, + { + "epoch": 1.9101123595505618, + "grad_norm": 0.1556741175162182, + "learning_rate": 3.509766358693089e-05, + "loss": 2.8006, + "step": 30770 + }, + { + "epoch": 1.9101744366503197, + "grad_norm": 0.1469229110227685, + "learning_rate": 3.509421624077672e-05, + "loss": 2.7496, + "step": 30771 + }, + { + "epoch": 1.9102365137500776, + "grad_norm": 0.13879057679664109, + "learning_rate": 3.5090768972389634e-05, + "loss": 2.7548, + "step": 30772 + }, + { + "epoch": 1.9102985908498353, + "grad_norm": 0.14526503105489394, + "learning_rate": 3.508732178178764e-05, + "loss": 2.7545, + "step": 30773 + }, + { + "epoch": 1.9103606679495932, + "grad_norm": 0.17000823731135103, + "learning_rate": 3.5083874668988695e-05, + "loss": 2.7326, + "step": 30774 + }, + { + "epoch": 1.9104227450493512, + "grad_norm": 0.14674710554269194, + "learning_rate": 3.508042763401081e-05, + "loss": 2.8211, + "step": 30775 + }, + { + "epoch": 1.910484822149109, + "grad_norm": 0.16684788609551096, + "learning_rate": 3.507698067687195e-05, + "loss": 2.7778, + "step": 30776 + }, + { + "epoch": 1.910546899248867, + "grad_norm": 0.15003610290730846, + "learning_rate": 3.507353379759011e-05, + "loss": 2.7509, + "step": 30777 + }, + { + "epoch": 1.910608976348625, + "grad_norm": 0.1504597195522976, + "learning_rate": 3.5070086996183255e-05, + "loss": 2.8166, + "step": 30778 + }, + { + "epoch": 1.9106710534483828, + "grad_norm": 0.13958019643314296, + "learning_rate": 3.5066640272669396e-05, + "loss": 2.7397, + "step": 30779 + }, + { + "epoch": 1.9107331305481408, + "grad_norm": 0.15093354899208983, + "learning_rate": 3.506319362706648e-05, + "loss": 2.8107, + "step": 30780 + }, + { + "epoch": 1.9107952076478987, + "grad_norm": 0.15374671660189315, + "learning_rate": 3.505974705939253e-05, + "loss": 2.7592, + "step": 30781 + }, + { + "epoch": 1.9108572847476566, + "grad_norm": 0.15223367088583373, + "learning_rate": 3.50563005696655e-05, + "loss": 2.7292, + "step": 30782 + }, + { + "epoch": 1.9109193618474145, + "grad_norm": 0.1479823827556059, + "learning_rate": 3.505285415790337e-05, + "loss": 2.7348, + "step": 30783 + }, + { + "epoch": 1.9109814389471724, + "grad_norm": 0.15618571508737872, + "learning_rate": 3.504940782412414e-05, + "loss": 2.7647, + "step": 30784 + }, + { + "epoch": 1.9110435160469303, + "grad_norm": 0.1394822659218198, + "learning_rate": 3.504596156834576e-05, + "loss": 2.6962, + "step": 30785 + }, + { + "epoch": 1.9111055931466883, + "grad_norm": 0.13825397249355534, + "learning_rate": 3.504251539058625e-05, + "loss": 2.8542, + "step": 30786 + }, + { + "epoch": 1.9111676702464462, + "grad_norm": 0.7379695092883254, + "learning_rate": 3.5039069290863555e-05, + "loss": 2.7791, + "step": 30787 + }, + { + "epoch": 1.911229747346204, + "grad_norm": 0.18415453669322146, + "learning_rate": 3.503562326919568e-05, + "loss": 2.7837, + "step": 30788 + }, + { + "epoch": 1.911291824445962, + "grad_norm": 0.1777307384244691, + "learning_rate": 3.503217732560056e-05, + "loss": 2.7685, + "step": 30789 + }, + { + "epoch": 1.91135390154572, + "grad_norm": 0.1602043277405498, + "learning_rate": 3.5028731460096225e-05, + "loss": 2.8149, + "step": 30790 + }, + { + "epoch": 1.9114159786454776, + "grad_norm": 0.15598891618444805, + "learning_rate": 3.5025285672700635e-05, + "loss": 2.6842, + "step": 30791 + }, + { + "epoch": 1.9114780557452355, + "grad_norm": 0.1606900692663507, + "learning_rate": 3.5021839963431765e-05, + "loss": 2.7702, + "step": 30792 + }, + { + "epoch": 1.9115401328449935, + "grad_norm": 0.1992476853693318, + "learning_rate": 3.5018394332307593e-05, + "loss": 2.7759, + "step": 30793 + }, + { + "epoch": 1.9116022099447514, + "grad_norm": 0.15297071727426728, + "learning_rate": 3.501494877934609e-05, + "loss": 2.735, + "step": 30794 + }, + { + "epoch": 1.9116642870445093, + "grad_norm": 0.1495321966159239, + "learning_rate": 3.501150330456524e-05, + "loss": 2.6828, + "step": 30795 + }, + { + "epoch": 1.9117263641442672, + "grad_norm": 0.1500125380580769, + "learning_rate": 3.500805790798303e-05, + "loss": 2.7767, + "step": 30796 + }, + { + "epoch": 1.911788441244025, + "grad_norm": 0.1499750538785036, + "learning_rate": 3.50046125896174e-05, + "loss": 2.7706, + "step": 30797 + }, + { + "epoch": 1.9118505183437828, + "grad_norm": 0.16937823861710344, + "learning_rate": 3.500116734948635e-05, + "loss": 2.8002, + "step": 30798 + }, + { + "epoch": 1.9119125954435408, + "grad_norm": 0.17942317927025256, + "learning_rate": 3.499772218760785e-05, + "loss": 2.8186, + "step": 30799 + }, + { + "epoch": 1.9119746725432987, + "grad_norm": 0.16506231020100784, + "learning_rate": 3.499427710399989e-05, + "loss": 2.6959, + "step": 30800 + }, + { + "epoch": 1.9120367496430566, + "grad_norm": 0.1500086391961218, + "learning_rate": 3.4990832098680414e-05, + "loss": 2.7526, + "step": 30801 + }, + { + "epoch": 1.9120988267428145, + "grad_norm": 0.14663818549401708, + "learning_rate": 3.4987387171667416e-05, + "loss": 2.6905, + "step": 30802 + }, + { + "epoch": 1.9121609038425724, + "grad_norm": 0.1473752903579524, + "learning_rate": 3.498394232297886e-05, + "loss": 2.7145, + "step": 30803 + }, + { + "epoch": 1.9122229809423303, + "grad_norm": 0.14606096108841546, + "learning_rate": 3.498049755263273e-05, + "loss": 2.7038, + "step": 30804 + }, + { + "epoch": 1.9122850580420883, + "grad_norm": 0.14287674997623015, + "learning_rate": 3.497705286064698e-05, + "loss": 2.7132, + "step": 30805 + }, + { + "epoch": 1.9123471351418462, + "grad_norm": 0.14775740902335868, + "learning_rate": 3.49736082470396e-05, + "loss": 2.6706, + "step": 30806 + }, + { + "epoch": 1.912409212241604, + "grad_norm": 0.14575817024607024, + "learning_rate": 3.4970163711828564e-05, + "loss": 2.74, + "step": 30807 + }, + { + "epoch": 1.912471289341362, + "grad_norm": 0.1429802241176729, + "learning_rate": 3.496671925503181e-05, + "loss": 2.7034, + "step": 30808 + }, + { + "epoch": 1.91253336644112, + "grad_norm": 0.14034124727720862, + "learning_rate": 3.4963274876667344e-05, + "loss": 2.7555, + "step": 30809 + }, + { + "epoch": 1.9125954435408778, + "grad_norm": 0.14102830935463223, + "learning_rate": 3.4959830576753115e-05, + "loss": 2.8071, + "step": 30810 + }, + { + "epoch": 1.9126575206406358, + "grad_norm": 0.13913966914838707, + "learning_rate": 3.495638635530711e-05, + "loss": 2.6807, + "step": 30811 + }, + { + "epoch": 1.9127195977403937, + "grad_norm": 0.17371802647311813, + "learning_rate": 3.495294221234728e-05, + "loss": 2.7426, + "step": 30812 + }, + { + "epoch": 1.9127816748401516, + "grad_norm": 0.1448509508028154, + "learning_rate": 3.494949814789161e-05, + "loss": 2.7712, + "step": 30813 + }, + { + "epoch": 1.9128437519399095, + "grad_norm": 0.15201420475753807, + "learning_rate": 3.494605416195806e-05, + "loss": 2.7634, + "step": 30814 + }, + { + "epoch": 1.9129058290396672, + "grad_norm": 0.14578322134289848, + "learning_rate": 3.4942610254564606e-05, + "loss": 2.7529, + "step": 30815 + }, + { + "epoch": 1.9129679061394251, + "grad_norm": 0.15611800936990508, + "learning_rate": 3.4939166425729194e-05, + "loss": 2.7507, + "step": 30816 + }, + { + "epoch": 1.913029983239183, + "grad_norm": 0.13868015560088123, + "learning_rate": 3.493572267546983e-05, + "loss": 2.7234, + "step": 30817 + }, + { + "epoch": 1.913092060338941, + "grad_norm": 0.1542238412710414, + "learning_rate": 3.493227900380444e-05, + "loss": 2.7036, + "step": 30818 + }, + { + "epoch": 1.9131541374386989, + "grad_norm": 0.14694989303774475, + "learning_rate": 3.4928835410751015e-05, + "loss": 2.73, + "step": 30819 + }, + { + "epoch": 1.9132162145384568, + "grad_norm": 0.1729621264198312, + "learning_rate": 3.492539189632752e-05, + "loss": 2.7719, + "step": 30820 + }, + { + "epoch": 1.9132782916382145, + "grad_norm": 0.1532195422194663, + "learning_rate": 3.49219484605519e-05, + "loss": 2.7777, + "step": 30821 + }, + { + "epoch": 1.9133403687379724, + "grad_norm": 0.14639176938312712, + "learning_rate": 3.4918505103442125e-05, + "loss": 2.8148, + "step": 30822 + }, + { + "epoch": 1.9134024458377303, + "grad_norm": 0.15180288183172896, + "learning_rate": 3.4915061825016197e-05, + "loss": 2.8296, + "step": 30823 + }, + { + "epoch": 1.9134645229374883, + "grad_norm": 0.14277303776508946, + "learning_rate": 3.4911618625292036e-05, + "loss": 2.7419, + "step": 30824 + }, + { + "epoch": 1.9135266000372462, + "grad_norm": 0.14839317889836104, + "learning_rate": 3.4908175504287635e-05, + "loss": 2.7023, + "step": 30825 + }, + { + "epoch": 1.913588677137004, + "grad_norm": 0.1447105198049775, + "learning_rate": 3.490473246202094e-05, + "loss": 2.7369, + "step": 30826 + }, + { + "epoch": 1.913650754236762, + "grad_norm": 0.15629743213739222, + "learning_rate": 3.4901289498509925e-05, + "loss": 2.8499, + "step": 30827 + }, + { + "epoch": 1.91371283133652, + "grad_norm": 0.1523376549210625, + "learning_rate": 3.4897846613772544e-05, + "loss": 2.6279, + "step": 30828 + }, + { + "epoch": 1.9137749084362778, + "grad_norm": 0.17225258165834614, + "learning_rate": 3.489440380782677e-05, + "loss": 2.8597, + "step": 30829 + }, + { + "epoch": 1.9138369855360358, + "grad_norm": 0.14871763473312194, + "learning_rate": 3.489096108069054e-05, + "loss": 2.7649, + "step": 30830 + }, + { + "epoch": 1.9138990626357937, + "grad_norm": 0.14891161616764437, + "learning_rate": 3.488751843238186e-05, + "loss": 2.7469, + "step": 30831 + }, + { + "epoch": 1.9139611397355516, + "grad_norm": 0.15256491526247198, + "learning_rate": 3.4884075862918656e-05, + "loss": 2.71, + "step": 30832 + }, + { + "epoch": 1.9140232168353095, + "grad_norm": 0.1521679830177799, + "learning_rate": 3.488063337231889e-05, + "loss": 2.782, + "step": 30833 + }, + { + "epoch": 1.9140852939350674, + "grad_norm": 0.20024817164321007, + "learning_rate": 3.4877190960600534e-05, + "loss": 2.7381, + "step": 30834 + }, + { + "epoch": 1.9141473710348254, + "grad_norm": 0.1443402665228502, + "learning_rate": 3.487374862778154e-05, + "loss": 2.6605, + "step": 30835 + }, + { + "epoch": 1.9142094481345833, + "grad_norm": 0.1625346499877547, + "learning_rate": 3.487030637387988e-05, + "loss": 2.7294, + "step": 30836 + }, + { + "epoch": 1.9142715252343412, + "grad_norm": 0.15204635961592447, + "learning_rate": 3.4866864198913495e-05, + "loss": 2.6899, + "step": 30837 + }, + { + "epoch": 1.914333602334099, + "grad_norm": 0.14395237029331281, + "learning_rate": 3.486342210290037e-05, + "loss": 2.7008, + "step": 30838 + }, + { + "epoch": 1.9143956794338568, + "grad_norm": 0.1578201025368532, + "learning_rate": 3.4859980085858424e-05, + "loss": 2.7573, + "step": 30839 + }, + { + "epoch": 1.9144577565336147, + "grad_norm": 0.15520550183352558, + "learning_rate": 3.4856538147805655e-05, + "loss": 2.8741, + "step": 30840 + }, + { + "epoch": 1.9145198336333726, + "grad_norm": 0.14623694185298317, + "learning_rate": 3.485309628875999e-05, + "loss": 2.7575, + "step": 30841 + }, + { + "epoch": 1.9145819107331306, + "grad_norm": 0.14803197956891911, + "learning_rate": 3.484965450873941e-05, + "loss": 2.703, + "step": 30842 + }, + { + "epoch": 1.9146439878328885, + "grad_norm": 0.14873803608529484, + "learning_rate": 3.484621280776185e-05, + "loss": 2.7291, + "step": 30843 + }, + { + "epoch": 1.9147060649326464, + "grad_norm": 0.18526257266946675, + "learning_rate": 3.484277118584529e-05, + "loss": 2.802, + "step": 30844 + }, + { + "epoch": 1.914768142032404, + "grad_norm": 0.1518717324725638, + "learning_rate": 3.483932964300767e-05, + "loss": 2.8179, + "step": 30845 + }, + { + "epoch": 1.914830219132162, + "grad_norm": 0.15442663482699523, + "learning_rate": 3.483588817926693e-05, + "loss": 2.7616, + "step": 30846 + }, + { + "epoch": 1.91489229623192, + "grad_norm": 0.17291449354905078, + "learning_rate": 3.483244679464106e-05, + "loss": 2.7294, + "step": 30847 + }, + { + "epoch": 1.9149543733316778, + "grad_norm": 0.15762783756696894, + "learning_rate": 3.482900548914798e-05, + "loss": 2.7279, + "step": 30848 + }, + { + "epoch": 1.9150164504314358, + "grad_norm": 0.14950478361404212, + "learning_rate": 3.482556426280568e-05, + "loss": 2.8266, + "step": 30849 + }, + { + "epoch": 1.9150785275311937, + "grad_norm": 0.15712226346272395, + "learning_rate": 3.482212311563208e-05, + "loss": 2.841, + "step": 30850 + }, + { + "epoch": 1.9151406046309516, + "grad_norm": 0.1685845873449079, + "learning_rate": 3.481868204764516e-05, + "loss": 2.7313, + "step": 30851 + }, + { + "epoch": 1.9152026817307095, + "grad_norm": 0.16267513456844676, + "learning_rate": 3.481524105886286e-05, + "loss": 2.7073, + "step": 30852 + }, + { + "epoch": 1.9152647588304674, + "grad_norm": 0.16127928232302066, + "learning_rate": 3.4811800149303134e-05, + "loss": 2.7445, + "step": 30853 + }, + { + "epoch": 1.9153268359302253, + "grad_norm": 0.16483543180658153, + "learning_rate": 3.480835931898392e-05, + "loss": 2.7772, + "step": 30854 + }, + { + "epoch": 1.9153889130299833, + "grad_norm": 0.16330466193862464, + "learning_rate": 3.4804918567923176e-05, + "loss": 2.704, + "step": 30855 + }, + { + "epoch": 1.9154509901297412, + "grad_norm": 0.14294564550306474, + "learning_rate": 3.480147789613888e-05, + "loss": 2.8525, + "step": 30856 + }, + { + "epoch": 1.915513067229499, + "grad_norm": 0.15769749359973503, + "learning_rate": 3.479803730364897e-05, + "loss": 2.7474, + "step": 30857 + }, + { + "epoch": 1.915575144329257, + "grad_norm": 0.19491679160115594, + "learning_rate": 3.4794596790471377e-05, + "loss": 2.7689, + "step": 30858 + }, + { + "epoch": 1.915637221429015, + "grad_norm": 0.15105232593040838, + "learning_rate": 3.479115635662407e-05, + "loss": 2.7726, + "step": 30859 + }, + { + "epoch": 1.9156992985287729, + "grad_norm": 0.16951465591398387, + "learning_rate": 3.4787716002124985e-05, + "loss": 2.8523, + "step": 30860 + }, + { + "epoch": 1.9157613756285308, + "grad_norm": 0.15415166105774555, + "learning_rate": 3.4784275726992085e-05, + "loss": 2.7159, + "step": 30861 + }, + { + "epoch": 1.9158234527282887, + "grad_norm": 0.17295283244324766, + "learning_rate": 3.47808355312433e-05, + "loss": 2.7583, + "step": 30862 + }, + { + "epoch": 1.9158855298280464, + "grad_norm": 0.19085911968212563, + "learning_rate": 3.4777395414896616e-05, + "loss": 2.7197, + "step": 30863 + }, + { + "epoch": 1.9159476069278043, + "grad_norm": 0.15531014062635995, + "learning_rate": 3.477395537796993e-05, + "loss": 2.7447, + "step": 30864 + }, + { + "epoch": 1.9160096840275622, + "grad_norm": 0.15406220734937934, + "learning_rate": 3.477051542048124e-05, + "loss": 2.787, + "step": 30865 + }, + { + "epoch": 1.9160717611273201, + "grad_norm": 0.15132157653756473, + "learning_rate": 3.4767075542448444e-05, + "loss": 2.7098, + "step": 30866 + }, + { + "epoch": 1.916133838227078, + "grad_norm": 0.1713235581133324, + "learning_rate": 3.4763635743889524e-05, + "loss": 2.8193, + "step": 30867 + }, + { + "epoch": 1.916195915326836, + "grad_norm": 0.1603223764805243, + "learning_rate": 3.476019602482241e-05, + "loss": 2.711, + "step": 30868 + }, + { + "epoch": 1.9162579924265937, + "grad_norm": 0.15061374931936475, + "learning_rate": 3.475675638526507e-05, + "loss": 2.7115, + "step": 30869 + }, + { + "epoch": 1.9163200695263516, + "grad_norm": 0.1518063358415481, + "learning_rate": 3.475331682523542e-05, + "loss": 2.7265, + "step": 30870 + }, + { + "epoch": 1.9163821466261095, + "grad_norm": 0.15380683776821907, + "learning_rate": 3.4749877344751416e-05, + "loss": 2.7597, + "step": 30871 + }, + { + "epoch": 1.9164442237258674, + "grad_norm": 0.18202185872619306, + "learning_rate": 3.4746437943831e-05, + "loss": 2.7976, + "step": 30872 + }, + { + "epoch": 1.9165063008256253, + "grad_norm": 0.14730775847631516, + "learning_rate": 3.4742998622492127e-05, + "loss": 2.6654, + "step": 30873 + }, + { + "epoch": 1.9165683779253833, + "grad_norm": 0.16130534096609167, + "learning_rate": 3.473955938075273e-05, + "loss": 2.7539, + "step": 30874 + }, + { + "epoch": 1.9166304550251412, + "grad_norm": 0.1585205155676213, + "learning_rate": 3.473612021863075e-05, + "loss": 2.7918, + "step": 30875 + }, + { + "epoch": 1.916692532124899, + "grad_norm": 0.15913695364393785, + "learning_rate": 3.473268113614415e-05, + "loss": 2.7168, + "step": 30876 + }, + { + "epoch": 1.916754609224657, + "grad_norm": 0.14495477611386362, + "learning_rate": 3.472924213331086e-05, + "loss": 2.8208, + "step": 30877 + }, + { + "epoch": 1.916816686324415, + "grad_norm": 0.14772064505270008, + "learning_rate": 3.4725803210148814e-05, + "loss": 2.7209, + "step": 30878 + }, + { + "epoch": 1.9168787634241728, + "grad_norm": 0.14414655549169464, + "learning_rate": 3.4722364366675955e-05, + "loss": 2.6538, + "step": 30879 + }, + { + "epoch": 1.9169408405239308, + "grad_norm": 0.16222186953698248, + "learning_rate": 3.471892560291024e-05, + "loss": 2.7763, + "step": 30880 + }, + { + "epoch": 1.9170029176236887, + "grad_norm": 0.1894712636070871, + "learning_rate": 3.47154869188696e-05, + "loss": 2.7247, + "step": 30881 + }, + { + "epoch": 1.9170649947234466, + "grad_norm": 0.18783954830226962, + "learning_rate": 3.4712048314571964e-05, + "loss": 2.7411, + "step": 30882 + }, + { + "epoch": 1.9171270718232045, + "grad_norm": 0.14658715774751208, + "learning_rate": 3.4708609790035294e-05, + "loss": 2.6598, + "step": 30883 + }, + { + "epoch": 1.9171891489229624, + "grad_norm": 0.1571455503933884, + "learning_rate": 3.470517134527751e-05, + "loss": 2.7729, + "step": 30884 + }, + { + "epoch": 1.9172512260227204, + "grad_norm": 0.18264295018889745, + "learning_rate": 3.470173298031657e-05, + "loss": 2.6707, + "step": 30885 + }, + { + "epoch": 1.9173133031224783, + "grad_norm": 0.1445490980726821, + "learning_rate": 3.469829469517039e-05, + "loss": 2.7129, + "step": 30886 + }, + { + "epoch": 1.917375380222236, + "grad_norm": 0.1470704751184788, + "learning_rate": 3.469485648985691e-05, + "loss": 2.6938, + "step": 30887 + }, + { + "epoch": 1.9174374573219939, + "grad_norm": 0.18271354948041107, + "learning_rate": 3.469141836439411e-05, + "loss": 2.7563, + "step": 30888 + }, + { + "epoch": 1.9174995344217518, + "grad_norm": 0.17382352265683845, + "learning_rate": 3.468798031879987e-05, + "loss": 2.7239, + "step": 30889 + }, + { + "epoch": 1.9175616115215097, + "grad_norm": 0.15095576701977953, + "learning_rate": 3.4684542353092174e-05, + "loss": 2.7089, + "step": 30890 + }, + { + "epoch": 1.9176236886212676, + "grad_norm": 0.14195740154292905, + "learning_rate": 3.4681104467288934e-05, + "loss": 2.7815, + "step": 30891 + }, + { + "epoch": 1.9176857657210256, + "grad_norm": 0.14258438253388267, + "learning_rate": 3.467766666140809e-05, + "loss": 2.7638, + "step": 30892 + }, + { + "epoch": 1.9177478428207833, + "grad_norm": 0.14514520767390623, + "learning_rate": 3.4674228935467575e-05, + "loss": 2.8131, + "step": 30893 + }, + { + "epoch": 1.9178099199205412, + "grad_norm": 0.13419973731686796, + "learning_rate": 3.4670791289485337e-05, + "loss": 2.7083, + "step": 30894 + }, + { + "epoch": 1.917871997020299, + "grad_norm": 0.16186212735193214, + "learning_rate": 3.46673537234793e-05, + "loss": 2.7556, + "step": 30895 + }, + { + "epoch": 1.917934074120057, + "grad_norm": 0.15859812415719568, + "learning_rate": 3.46639162374674e-05, + "loss": 2.7673, + "step": 30896 + }, + { + "epoch": 1.917996151219815, + "grad_norm": 0.15071821451655554, + "learning_rate": 3.466047883146757e-05, + "loss": 2.7072, + "step": 30897 + }, + { + "epoch": 1.9180582283195728, + "grad_norm": 0.13755651336747665, + "learning_rate": 3.465704150549773e-05, + "loss": 2.7136, + "step": 30898 + }, + { + "epoch": 1.9181203054193308, + "grad_norm": 0.1414590842246275, + "learning_rate": 3.4653604259575856e-05, + "loss": 2.7571, + "step": 30899 + }, + { + "epoch": 1.9181823825190887, + "grad_norm": 0.13793841336288848, + "learning_rate": 3.465016709371984e-05, + "loss": 2.6362, + "step": 30900 + }, + { + "epoch": 1.9182444596188466, + "grad_norm": 0.14629686434101105, + "learning_rate": 3.464673000794763e-05, + "loss": 2.7951, + "step": 30901 + }, + { + "epoch": 1.9183065367186045, + "grad_norm": 0.14093784792152986, + "learning_rate": 3.464329300227716e-05, + "loss": 2.721, + "step": 30902 + }, + { + "epoch": 1.9183686138183624, + "grad_norm": 0.1498545157211805, + "learning_rate": 3.463985607672635e-05, + "loss": 2.6281, + "step": 30903 + }, + { + "epoch": 1.9184306909181204, + "grad_norm": 0.16388752666733866, + "learning_rate": 3.463641923131314e-05, + "loss": 2.721, + "step": 30904 + }, + { + "epoch": 1.9184927680178783, + "grad_norm": 0.13699757209988045, + "learning_rate": 3.4632982466055474e-05, + "loss": 2.7256, + "step": 30905 + }, + { + "epoch": 1.9185548451176362, + "grad_norm": 0.1354894411945553, + "learning_rate": 3.462954578097127e-05, + "loss": 2.7559, + "step": 30906 + }, + { + "epoch": 1.918616922217394, + "grad_norm": 0.14910490176684268, + "learning_rate": 3.462610917607844e-05, + "loss": 2.7719, + "step": 30907 + }, + { + "epoch": 1.918678999317152, + "grad_norm": 0.14997461918838007, + "learning_rate": 3.462267265139494e-05, + "loss": 2.7459, + "step": 30908 + }, + { + "epoch": 1.91874107641691, + "grad_norm": 0.1380900278524268, + "learning_rate": 3.461923620693869e-05, + "loss": 2.816, + "step": 30909 + }, + { + "epoch": 1.9188031535166679, + "grad_norm": 0.1397667163851044, + "learning_rate": 3.461579984272761e-05, + "loss": 2.6582, + "step": 30910 + }, + { + "epoch": 1.9188652306164256, + "grad_norm": 0.13681411822936068, + "learning_rate": 3.4612363558779646e-05, + "loss": 2.7711, + "step": 30911 + }, + { + "epoch": 1.9189273077161835, + "grad_norm": 0.18289992939820968, + "learning_rate": 3.460892735511272e-05, + "loss": 2.7871, + "step": 30912 + }, + { + "epoch": 1.9189893848159414, + "grad_norm": 0.13627923755651294, + "learning_rate": 3.4605491231744746e-05, + "loss": 2.7703, + "step": 30913 + }, + { + "epoch": 1.9190514619156993, + "grad_norm": 0.14018784231806547, + "learning_rate": 3.4602055188693674e-05, + "loss": 2.7017, + "step": 30914 + }, + { + "epoch": 1.9191135390154572, + "grad_norm": 0.13763157742510165, + "learning_rate": 3.45986192259774e-05, + "loss": 2.7384, + "step": 30915 + }, + { + "epoch": 1.9191756161152151, + "grad_norm": 0.13763923720665805, + "learning_rate": 3.459518334361388e-05, + "loss": 2.7231, + "step": 30916 + }, + { + "epoch": 1.9192376932149728, + "grad_norm": 0.14727241895065535, + "learning_rate": 3.4591747541621033e-05, + "loss": 2.7736, + "step": 30917 + }, + { + "epoch": 1.9192997703147308, + "grad_norm": 0.13784126905934124, + "learning_rate": 3.458831182001677e-05, + "loss": 2.6546, + "step": 30918 + }, + { + "epoch": 1.9193618474144887, + "grad_norm": 0.13731279064376903, + "learning_rate": 3.458487617881903e-05, + "loss": 2.7468, + "step": 30919 + }, + { + "epoch": 1.9194239245142466, + "grad_norm": 0.14360892640069603, + "learning_rate": 3.458144061804571e-05, + "loss": 2.813, + "step": 30920 + }, + { + "epoch": 1.9194860016140045, + "grad_norm": 0.15974735188154907, + "learning_rate": 3.457800513771478e-05, + "loss": 2.6944, + "step": 30921 + }, + { + "epoch": 1.9195480787137624, + "grad_norm": 0.14276196763793014, + "learning_rate": 3.457456973784414e-05, + "loss": 2.6644, + "step": 30922 + }, + { + "epoch": 1.9196101558135203, + "grad_norm": 0.1731038058043002, + "learning_rate": 3.457113441845171e-05, + "loss": 2.7785, + "step": 30923 + }, + { + "epoch": 1.9196722329132783, + "grad_norm": 0.15735646917156135, + "learning_rate": 3.456769917955543e-05, + "loss": 2.7793, + "step": 30924 + }, + { + "epoch": 1.9197343100130362, + "grad_norm": 0.14916542658299262, + "learning_rate": 3.45642640211732e-05, + "loss": 2.8132, + "step": 30925 + }, + { + "epoch": 1.919796387112794, + "grad_norm": 0.14547596653371214, + "learning_rate": 3.4560828943322954e-05, + "loss": 2.7458, + "step": 30926 + }, + { + "epoch": 1.919858464212552, + "grad_norm": 0.15529821905876134, + "learning_rate": 3.455739394602261e-05, + "loss": 2.7365, + "step": 30927 + }, + { + "epoch": 1.91992054131231, + "grad_norm": 0.1576491416863627, + "learning_rate": 3.45539590292901e-05, + "loss": 2.6663, + "step": 30928 + }, + { + "epoch": 1.9199826184120679, + "grad_norm": 0.1542523718285249, + "learning_rate": 3.455052419314331e-05, + "loss": 2.8231, + "step": 30929 + }, + { + "epoch": 1.9200446955118258, + "grad_norm": 0.13890200981774345, + "learning_rate": 3.454708943760021e-05, + "loss": 2.7519, + "step": 30930 + }, + { + "epoch": 1.9201067726115837, + "grad_norm": 0.14744505259868365, + "learning_rate": 3.454365476267869e-05, + "loss": 2.8785, + "step": 30931 + }, + { + "epoch": 1.9201688497113416, + "grad_norm": 0.14852814667242936, + "learning_rate": 3.454022016839667e-05, + "loss": 2.7974, + "step": 30932 + }, + { + "epoch": 1.9202309268110995, + "grad_norm": 0.1415116254703175, + "learning_rate": 3.4536785654772084e-05, + "loss": 2.7307, + "step": 30933 + }, + { + "epoch": 1.9202930039108574, + "grad_norm": 0.15177776887878563, + "learning_rate": 3.453335122182283e-05, + "loss": 2.795, + "step": 30934 + }, + { + "epoch": 1.9203550810106151, + "grad_norm": 0.19837279316071865, + "learning_rate": 3.452991686956685e-05, + "loss": 2.757, + "step": 30935 + }, + { + "epoch": 1.920417158110373, + "grad_norm": 0.15608757040222773, + "learning_rate": 3.452648259802203e-05, + "loss": 2.6317, + "step": 30936 + }, + { + "epoch": 1.920479235210131, + "grad_norm": 0.17117617246107322, + "learning_rate": 3.4523048407206325e-05, + "loss": 2.7899, + "step": 30937 + }, + { + "epoch": 1.920541312309889, + "grad_norm": 0.15126987528654098, + "learning_rate": 3.451961429713761e-05, + "loss": 2.6807, + "step": 30938 + }, + { + "epoch": 1.9206033894096468, + "grad_norm": 0.15530912121219517, + "learning_rate": 3.4516180267833856e-05, + "loss": 2.7111, + "step": 30939 + }, + { + "epoch": 1.9206654665094047, + "grad_norm": 0.15802771062246132, + "learning_rate": 3.451274631931293e-05, + "loss": 2.7491, + "step": 30940 + }, + { + "epoch": 1.9207275436091624, + "grad_norm": 0.16547697420547638, + "learning_rate": 3.4509312451592774e-05, + "loss": 2.6489, + "step": 30941 + }, + { + "epoch": 1.9207896207089203, + "grad_norm": 0.17295355939453008, + "learning_rate": 3.4505878664691295e-05, + "loss": 2.8106, + "step": 30942 + }, + { + "epoch": 1.9208516978086783, + "grad_norm": 0.1531222759218611, + "learning_rate": 3.45024449586264e-05, + "loss": 2.8266, + "step": 30943 + }, + { + "epoch": 1.9209137749084362, + "grad_norm": 0.14764922796435734, + "learning_rate": 3.449901133341602e-05, + "loss": 2.7191, + "step": 30944 + }, + { + "epoch": 1.920975852008194, + "grad_norm": 0.1405498979613403, + "learning_rate": 3.449557778907805e-05, + "loss": 2.748, + "step": 30945 + }, + { + "epoch": 1.921037929107952, + "grad_norm": 0.15024672472854303, + "learning_rate": 3.449214432563043e-05, + "loss": 2.7271, + "step": 30946 + }, + { + "epoch": 1.92110000620771, + "grad_norm": 0.15561346694043132, + "learning_rate": 3.4488710943091045e-05, + "loss": 2.8325, + "step": 30947 + }, + { + "epoch": 1.9211620833074678, + "grad_norm": 0.21403851896851156, + "learning_rate": 3.448527764147783e-05, + "loss": 2.7339, + "step": 30948 + }, + { + "epoch": 1.9212241604072258, + "grad_norm": 0.20193614039764757, + "learning_rate": 3.448184442080868e-05, + "loss": 2.7968, + "step": 30949 + }, + { + "epoch": 1.9212862375069837, + "grad_norm": 0.1522674124756164, + "learning_rate": 3.447841128110152e-05, + "loss": 2.8554, + "step": 30950 + }, + { + "epoch": 1.9213483146067416, + "grad_norm": 0.14468259233551312, + "learning_rate": 3.447497822237425e-05, + "loss": 2.6589, + "step": 30951 + }, + { + "epoch": 1.9214103917064995, + "grad_norm": 0.14716975407824284, + "learning_rate": 3.44715452446448e-05, + "loss": 2.5894, + "step": 30952 + }, + { + "epoch": 1.9214724688062574, + "grad_norm": 0.1402549510726185, + "learning_rate": 3.446811234793104e-05, + "loss": 2.7312, + "step": 30953 + }, + { + "epoch": 1.9215345459060154, + "grad_norm": 0.13899914310854997, + "learning_rate": 3.446467953225092e-05, + "loss": 2.8087, + "step": 30954 + }, + { + "epoch": 1.9215966230057733, + "grad_norm": 0.1498544784321064, + "learning_rate": 3.446124679762235e-05, + "loss": 2.729, + "step": 30955 + }, + { + "epoch": 1.9216587001055312, + "grad_norm": 0.1499136631571821, + "learning_rate": 3.4457814144063234e-05, + "loss": 2.8453, + "step": 30956 + }, + { + "epoch": 1.921720777205289, + "grad_norm": 0.15072161305496476, + "learning_rate": 3.4454381571591454e-05, + "loss": 2.7903, + "step": 30957 + }, + { + "epoch": 1.921782854305047, + "grad_norm": 0.1411771068965153, + "learning_rate": 3.445094908022496e-05, + "loss": 2.7842, + "step": 30958 + }, + { + "epoch": 1.9218449314048047, + "grad_norm": 0.15116486101857843, + "learning_rate": 3.4447516669981623e-05, + "loss": 2.7743, + "step": 30959 + }, + { + "epoch": 1.9219070085045626, + "grad_norm": 0.14414342190062931, + "learning_rate": 3.444408434087938e-05, + "loss": 2.789, + "step": 30960 + }, + { + "epoch": 1.9219690856043206, + "grad_norm": 0.14072701277332827, + "learning_rate": 3.444065209293611e-05, + "loss": 2.695, + "step": 30961 + }, + { + "epoch": 1.9220311627040785, + "grad_norm": 0.1523107285046313, + "learning_rate": 3.443721992616975e-05, + "loss": 2.8191, + "step": 30962 + }, + { + "epoch": 1.9220932398038364, + "grad_norm": 0.14629360492412435, + "learning_rate": 3.4433787840598173e-05, + "loss": 2.7426, + "step": 30963 + }, + { + "epoch": 1.9221553169035943, + "grad_norm": 0.14835591332774706, + "learning_rate": 3.443035583623933e-05, + "loss": 2.7842, + "step": 30964 + }, + { + "epoch": 1.922217394003352, + "grad_norm": 0.17544505500290442, + "learning_rate": 3.4426923913111075e-05, + "loss": 2.7432, + "step": 30965 + }, + { + "epoch": 1.92227947110311, + "grad_norm": 0.14496889097186744, + "learning_rate": 3.4423492071231356e-05, + "loss": 2.7032, + "step": 30966 + }, + { + "epoch": 1.9223415482028678, + "grad_norm": 0.14222060410303747, + "learning_rate": 3.4420060310618064e-05, + "loss": 2.7029, + "step": 30967 + }, + { + "epoch": 1.9224036253026258, + "grad_norm": 0.14521639399159583, + "learning_rate": 3.441662863128908e-05, + "loss": 2.7315, + "step": 30968 + }, + { + "epoch": 1.9224657024023837, + "grad_norm": 0.13917012764525571, + "learning_rate": 3.441319703326234e-05, + "loss": 2.7603, + "step": 30969 + }, + { + "epoch": 1.9225277795021416, + "grad_norm": 0.15174303461535696, + "learning_rate": 3.4409765516555726e-05, + "loss": 2.7831, + "step": 30970 + }, + { + "epoch": 1.9225898566018995, + "grad_norm": 0.1549466971875669, + "learning_rate": 3.440633408118716e-05, + "loss": 2.7294, + "step": 30971 + }, + { + "epoch": 1.9226519337016574, + "grad_norm": 0.16520117523116506, + "learning_rate": 3.440290272717453e-05, + "loss": 2.6439, + "step": 30972 + }, + { + "epoch": 1.9227140108014154, + "grad_norm": 0.15543183458738236, + "learning_rate": 3.439947145453575e-05, + "loss": 2.7724, + "step": 30973 + }, + { + "epoch": 1.9227760879011733, + "grad_norm": 0.14138179694076208, + "learning_rate": 3.4396040263288696e-05, + "loss": 2.7453, + "step": 30974 + }, + { + "epoch": 1.9228381650009312, + "grad_norm": 0.14242905832486155, + "learning_rate": 3.4392609153451306e-05, + "loss": 2.7458, + "step": 30975 + }, + { + "epoch": 1.922900242100689, + "grad_norm": 0.17264484177955997, + "learning_rate": 3.4389178125041456e-05, + "loss": 2.7835, + "step": 30976 + }, + { + "epoch": 1.922962319200447, + "grad_norm": 0.14885146366060645, + "learning_rate": 3.4385747178077056e-05, + "loss": 2.8286, + "step": 30977 + }, + { + "epoch": 1.923024396300205, + "grad_norm": 0.14547022391616393, + "learning_rate": 3.4382316312576e-05, + "loss": 2.8154, + "step": 30978 + }, + { + "epoch": 1.9230864733999629, + "grad_norm": 0.1430062958658546, + "learning_rate": 3.437888552855619e-05, + "loss": 2.7523, + "step": 30979 + }, + { + "epoch": 1.9231485504997208, + "grad_norm": 0.14742540727233716, + "learning_rate": 3.437545482603553e-05, + "loss": 2.7063, + "step": 30980 + }, + { + "epoch": 1.9232106275994787, + "grad_norm": 0.1410979094267135, + "learning_rate": 3.4372024205031916e-05, + "loss": 2.7783, + "step": 30981 + }, + { + "epoch": 1.9232727046992366, + "grad_norm": 0.14727153993865075, + "learning_rate": 3.436859366556325e-05, + "loss": 2.7749, + "step": 30982 + }, + { + "epoch": 1.9233347817989943, + "grad_norm": 0.14313723402542441, + "learning_rate": 3.436516320764741e-05, + "loss": 2.7988, + "step": 30983 + }, + { + "epoch": 1.9233968588987522, + "grad_norm": 0.1617151027305013, + "learning_rate": 3.4361732831302327e-05, + "loss": 2.6946, + "step": 30984 + }, + { + "epoch": 1.9234589359985101, + "grad_norm": 0.16507477345744984, + "learning_rate": 3.435830253654586e-05, + "loss": 2.7905, + "step": 30985 + }, + { + "epoch": 1.923521013098268, + "grad_norm": 0.15045342434667042, + "learning_rate": 3.435487232339593e-05, + "loss": 2.6813, + "step": 30986 + }, + { + "epoch": 1.923583090198026, + "grad_norm": 0.13921355845771785, + "learning_rate": 3.435144219187043e-05, + "loss": 2.7465, + "step": 30987 + }, + { + "epoch": 1.923645167297784, + "grad_norm": 0.18056543080719914, + "learning_rate": 3.434801214198726e-05, + "loss": 2.7805, + "step": 30988 + }, + { + "epoch": 1.9237072443975416, + "grad_norm": 0.13528592801349615, + "learning_rate": 3.434458217376432e-05, + "loss": 2.8345, + "step": 30989 + }, + { + "epoch": 1.9237693214972995, + "grad_norm": 0.13898116529938784, + "learning_rate": 3.434115228721948e-05, + "loss": 2.8046, + "step": 30990 + }, + { + "epoch": 1.9238313985970574, + "grad_norm": 0.1426263035410422, + "learning_rate": 3.433772248237066e-05, + "loss": 2.7152, + "step": 30991 + }, + { + "epoch": 1.9238934756968153, + "grad_norm": 0.14993811314176406, + "learning_rate": 3.4334292759235745e-05, + "loss": 2.6964, + "step": 30992 + }, + { + "epoch": 1.9239555527965733, + "grad_norm": 0.1581086477109692, + "learning_rate": 3.4330863117832616e-05, + "loss": 2.7503, + "step": 30993 + }, + { + "epoch": 1.9240176298963312, + "grad_norm": 0.1960033690998695, + "learning_rate": 3.432743355817919e-05, + "loss": 2.7043, + "step": 30994 + }, + { + "epoch": 1.924079706996089, + "grad_norm": 0.17383364872246249, + "learning_rate": 3.432400408029334e-05, + "loss": 2.8109, + "step": 30995 + }, + { + "epoch": 1.924141784095847, + "grad_norm": 0.16590049341776836, + "learning_rate": 3.432057468419298e-05, + "loss": 2.7205, + "step": 30996 + }, + { + "epoch": 1.924203861195605, + "grad_norm": 0.1591414306163294, + "learning_rate": 3.431714536989596e-05, + "loss": 2.7581, + "step": 30997 + }, + { + "epoch": 1.9242659382953629, + "grad_norm": 0.17348270170897143, + "learning_rate": 3.431371613742023e-05, + "loss": 2.7351, + "step": 30998 + }, + { + "epoch": 1.9243280153951208, + "grad_norm": 0.14632484013810867, + "learning_rate": 3.431028698678362e-05, + "loss": 2.6967, + "step": 30999 + }, + { + "epoch": 1.9243900924948787, + "grad_norm": 0.17713991247691477, + "learning_rate": 3.430685791800408e-05, + "loss": 2.7921, + "step": 31000 + }, + { + "epoch": 1.9244521695946366, + "grad_norm": 0.14714366051413905, + "learning_rate": 3.430342893109946e-05, + "loss": 2.7393, + "step": 31001 + }, + { + "epoch": 1.9245142466943945, + "grad_norm": 0.14517870429586258, + "learning_rate": 3.4300000026087666e-05, + "loss": 2.7396, + "step": 31002 + }, + { + "epoch": 1.9245763237941524, + "grad_norm": 0.17223613361144996, + "learning_rate": 3.429657120298658e-05, + "loss": 2.8059, + "step": 31003 + }, + { + "epoch": 1.9246384008939104, + "grad_norm": 0.1549339121144985, + "learning_rate": 3.429314246181409e-05, + "loss": 2.7832, + "step": 31004 + }, + { + "epoch": 1.9247004779936683, + "grad_norm": 0.1515060218800033, + "learning_rate": 3.4289713802588106e-05, + "loss": 2.6964, + "step": 31005 + }, + { + "epoch": 1.9247625550934262, + "grad_norm": 0.14824717642690893, + "learning_rate": 3.4286285225326474e-05, + "loss": 2.675, + "step": 31006 + }, + { + "epoch": 1.924824632193184, + "grad_norm": 0.15316882819982636, + "learning_rate": 3.428285673004713e-05, + "loss": 2.6879, + "step": 31007 + }, + { + "epoch": 1.9248867092929418, + "grad_norm": 0.15736218047138753, + "learning_rate": 3.427942831676791e-05, + "loss": 2.7368, + "step": 31008 + }, + { + "epoch": 1.9249487863926997, + "grad_norm": 0.13805795491241865, + "learning_rate": 3.427599998550676e-05, + "loss": 2.826, + "step": 31009 + }, + { + "epoch": 1.9250108634924576, + "grad_norm": 0.15353658935825282, + "learning_rate": 3.4272571736281514e-05, + "loss": 2.7499, + "step": 31010 + }, + { + "epoch": 1.9250729405922156, + "grad_norm": 0.1500883532627493, + "learning_rate": 3.426914356911009e-05, + "loss": 2.7464, + "step": 31011 + }, + { + "epoch": 1.9251350176919735, + "grad_norm": 0.14835617071008916, + "learning_rate": 3.4265715484010354e-05, + "loss": 2.7224, + "step": 31012 + }, + { + "epoch": 1.9251970947917312, + "grad_norm": 0.14854609131297863, + "learning_rate": 3.426228748100021e-05, + "loss": 2.7569, + "step": 31013 + }, + { + "epoch": 1.925259171891489, + "grad_norm": 0.1441673517647306, + "learning_rate": 3.425885956009752e-05, + "loss": 2.6813, + "step": 31014 + }, + { + "epoch": 1.925321248991247, + "grad_norm": 0.13488886906039801, + "learning_rate": 3.425543172132019e-05, + "loss": 2.7108, + "step": 31015 + }, + { + "epoch": 1.925383326091005, + "grad_norm": 0.14571683067515578, + "learning_rate": 3.42520039646861e-05, + "loss": 2.7935, + "step": 31016 + }, + { + "epoch": 1.9254454031907628, + "grad_norm": 0.14271506826723218, + "learning_rate": 3.424857629021312e-05, + "loss": 2.6762, + "step": 31017 + }, + { + "epoch": 1.9255074802905208, + "grad_norm": 0.14452587581643656, + "learning_rate": 3.424514869791912e-05, + "loss": 2.6868, + "step": 31018 + }, + { + "epoch": 1.9255695573902787, + "grad_norm": 0.14849217562340009, + "learning_rate": 3.424172118782203e-05, + "loss": 2.79, + "step": 31019 + }, + { + "epoch": 1.9256316344900366, + "grad_norm": 0.18242478051822608, + "learning_rate": 3.42382937599397e-05, + "loss": 2.7028, + "step": 31020 + }, + { + "epoch": 1.9256937115897945, + "grad_norm": 0.14406195046740003, + "learning_rate": 3.423486641429002e-05, + "loss": 2.776, + "step": 31021 + }, + { + "epoch": 1.9257557886895524, + "grad_norm": 0.13912755350038508, + "learning_rate": 3.423143915089087e-05, + "loss": 2.7641, + "step": 31022 + }, + { + "epoch": 1.9258178657893104, + "grad_norm": 0.1410092230846783, + "learning_rate": 3.422801196976013e-05, + "loss": 2.6882, + "step": 31023 + }, + { + "epoch": 1.9258799428890683, + "grad_norm": 0.13844140471383118, + "learning_rate": 3.422458487091568e-05, + "loss": 2.6981, + "step": 31024 + }, + { + "epoch": 1.9259420199888262, + "grad_norm": 0.15084982409634512, + "learning_rate": 3.42211578543754e-05, + "loss": 2.8309, + "step": 31025 + }, + { + "epoch": 1.926004097088584, + "grad_norm": 0.14058537057273118, + "learning_rate": 3.4217730920157164e-05, + "loss": 2.7825, + "step": 31026 + }, + { + "epoch": 1.926066174188342, + "grad_norm": 0.17950535955237776, + "learning_rate": 3.421430406827887e-05, + "loss": 2.8052, + "step": 31027 + }, + { + "epoch": 1.9261282512881, + "grad_norm": 0.14342826025006644, + "learning_rate": 3.421087729875838e-05, + "loss": 2.7764, + "step": 31028 + }, + { + "epoch": 1.9261903283878579, + "grad_norm": 0.16929082324517936, + "learning_rate": 3.420745061161357e-05, + "loss": 2.7181, + "step": 31029 + }, + { + "epoch": 1.9262524054876158, + "grad_norm": 0.14630608835189815, + "learning_rate": 3.4204024006862334e-05, + "loss": 2.7435, + "step": 31030 + }, + { + "epoch": 1.9263144825873735, + "grad_norm": 0.15251702455953792, + "learning_rate": 3.420059748452252e-05, + "loss": 2.7695, + "step": 31031 + }, + { + "epoch": 1.9263765596871314, + "grad_norm": 0.16644944561632227, + "learning_rate": 3.419717104461204e-05, + "loss": 2.8015, + "step": 31032 + }, + { + "epoch": 1.9264386367868893, + "grad_norm": 0.15809252487457437, + "learning_rate": 3.4193744687148754e-05, + "loss": 2.8431, + "step": 31033 + }, + { + "epoch": 1.9265007138866472, + "grad_norm": 0.17590721173717314, + "learning_rate": 3.419031841215054e-05, + "loss": 2.7212, + "step": 31034 + }, + { + "epoch": 1.9265627909864051, + "grad_norm": 0.14166350489715818, + "learning_rate": 3.418689221963526e-05, + "loss": 2.8163, + "step": 31035 + }, + { + "epoch": 1.926624868086163, + "grad_norm": 0.14408279773476737, + "learning_rate": 3.418346610962082e-05, + "loss": 2.7893, + "step": 31036 + }, + { + "epoch": 1.9266869451859208, + "grad_norm": 0.158908208789962, + "learning_rate": 3.418004008212506e-05, + "loss": 2.8244, + "step": 31037 + }, + { + "epoch": 1.9267490222856787, + "grad_norm": 0.13982289760843064, + "learning_rate": 3.4176614137165885e-05, + "loss": 2.7101, + "step": 31038 + }, + { + "epoch": 1.9268110993854366, + "grad_norm": 0.15665030461414745, + "learning_rate": 3.4173188274761144e-05, + "loss": 2.7648, + "step": 31039 + }, + { + "epoch": 1.9268731764851945, + "grad_norm": 0.14078908195683867, + "learning_rate": 3.416976249492873e-05, + "loss": 2.7325, + "step": 31040 + }, + { + "epoch": 1.9269352535849524, + "grad_norm": 0.13871262180054544, + "learning_rate": 3.416633679768651e-05, + "loss": 2.7434, + "step": 31041 + }, + { + "epoch": 1.9269973306847104, + "grad_norm": 0.1492063661998871, + "learning_rate": 3.4162911183052346e-05, + "loss": 2.7601, + "step": 31042 + }, + { + "epoch": 1.9270594077844683, + "grad_norm": 0.14397143511237856, + "learning_rate": 3.415948565104413e-05, + "loss": 2.8291, + "step": 31043 + }, + { + "epoch": 1.9271214848842262, + "grad_norm": 0.15346286002779727, + "learning_rate": 3.4156060201679704e-05, + "loss": 2.7388, + "step": 31044 + }, + { + "epoch": 1.927183561983984, + "grad_norm": 0.15514715925823727, + "learning_rate": 3.415263483497698e-05, + "loss": 2.7374, + "step": 31045 + }, + { + "epoch": 1.927245639083742, + "grad_norm": 0.1574987933513801, + "learning_rate": 3.414920955095379e-05, + "loss": 2.7553, + "step": 31046 + }, + { + "epoch": 1.9273077161835, + "grad_norm": 0.14407325684816605, + "learning_rate": 3.414578434962803e-05, + "loss": 2.8362, + "step": 31047 + }, + { + "epoch": 1.9273697932832579, + "grad_norm": 0.14272092174316842, + "learning_rate": 3.4142359231017565e-05, + "loss": 2.7277, + "step": 31048 + }, + { + "epoch": 1.9274318703830158, + "grad_norm": 0.16682632195288943, + "learning_rate": 3.413893419514026e-05, + "loss": 2.6125, + "step": 31049 + }, + { + "epoch": 1.9274939474827737, + "grad_norm": 0.15921358905959457, + "learning_rate": 3.413550924201397e-05, + "loss": 2.7525, + "step": 31050 + }, + { + "epoch": 1.9275560245825316, + "grad_norm": 0.15789088884810082, + "learning_rate": 3.413208437165658e-05, + "loss": 2.6962, + "step": 31051 + }, + { + "epoch": 1.9276181016822895, + "grad_norm": 0.14793634516036003, + "learning_rate": 3.4128659584085974e-05, + "loss": 2.7179, + "step": 31052 + }, + { + "epoch": 1.9276801787820474, + "grad_norm": 0.14740419243360536, + "learning_rate": 3.412523487932001e-05, + "loss": 2.7448, + "step": 31053 + }, + { + "epoch": 1.9277422558818054, + "grad_norm": 0.1803957312527428, + "learning_rate": 3.412181025737653e-05, + "loss": 2.7758, + "step": 31054 + }, + { + "epoch": 1.927804332981563, + "grad_norm": 0.15402303611941068, + "learning_rate": 3.411838571827344e-05, + "loss": 2.8233, + "step": 31055 + }, + { + "epoch": 1.927866410081321, + "grad_norm": 0.18208522119314505, + "learning_rate": 3.4114961262028566e-05, + "loss": 2.7355, + "step": 31056 + }, + { + "epoch": 1.927928487181079, + "grad_norm": 0.14659596960802151, + "learning_rate": 3.411153688865981e-05, + "loss": 2.7472, + "step": 31057 + }, + { + "epoch": 1.9279905642808368, + "grad_norm": 0.1799398391905487, + "learning_rate": 3.4108112598185014e-05, + "loss": 2.7724, + "step": 31058 + }, + { + "epoch": 1.9280526413805947, + "grad_norm": 0.15226292911646325, + "learning_rate": 3.4104688390622064e-05, + "loss": 2.7409, + "step": 31059 + }, + { + "epoch": 1.9281147184803527, + "grad_norm": 0.16342194714769376, + "learning_rate": 3.410126426598881e-05, + "loss": 2.824, + "step": 31060 + }, + { + "epoch": 1.9281767955801103, + "grad_norm": 0.1504876146048027, + "learning_rate": 3.409784022430312e-05, + "loss": 2.859, + "step": 31061 + }, + { + "epoch": 1.9282388726798683, + "grad_norm": 0.1656986464201399, + "learning_rate": 3.409441626558285e-05, + "loss": 2.7229, + "step": 31062 + }, + { + "epoch": 1.9283009497796262, + "grad_norm": 0.16661094066703833, + "learning_rate": 3.409099238984588e-05, + "loss": 2.7322, + "step": 31063 + }, + { + "epoch": 1.928363026879384, + "grad_norm": 0.14591065270126985, + "learning_rate": 3.4087568597110057e-05, + "loss": 2.7218, + "step": 31064 + }, + { + "epoch": 1.928425103979142, + "grad_norm": 0.1495324519735915, + "learning_rate": 3.408414488739326e-05, + "loss": 2.7228, + "step": 31065 + }, + { + "epoch": 1.9284871810789, + "grad_norm": 0.1467022420048283, + "learning_rate": 3.4080721260713344e-05, + "loss": 2.7619, + "step": 31066 + }, + { + "epoch": 1.9285492581786579, + "grad_norm": 0.15227240334499642, + "learning_rate": 3.407729771708816e-05, + "loss": 2.7932, + "step": 31067 + }, + { + "epoch": 1.9286113352784158, + "grad_norm": 0.1387846179468453, + "learning_rate": 3.4073874256535585e-05, + "loss": 2.7868, + "step": 31068 + }, + { + "epoch": 1.9286734123781737, + "grad_norm": 0.1397308693196192, + "learning_rate": 3.407045087907347e-05, + "loss": 2.7077, + "step": 31069 + }, + { + "epoch": 1.9287354894779316, + "grad_norm": 0.13987115293143546, + "learning_rate": 3.406702758471969e-05, + "loss": 2.6976, + "step": 31070 + }, + { + "epoch": 1.9287975665776895, + "grad_norm": 0.17040760399073956, + "learning_rate": 3.406360437349208e-05, + "loss": 2.7095, + "step": 31071 + }, + { + "epoch": 1.9288596436774474, + "grad_norm": 0.1404838913680212, + "learning_rate": 3.406018124540853e-05, + "loss": 2.7628, + "step": 31072 + }, + { + "epoch": 1.9289217207772054, + "grad_norm": 0.14719660859362152, + "learning_rate": 3.405675820048687e-05, + "loss": 2.7759, + "step": 31073 + }, + { + "epoch": 1.9289837978769633, + "grad_norm": 0.1543880929394704, + "learning_rate": 3.405333523874498e-05, + "loss": 2.7472, + "step": 31074 + }, + { + "epoch": 1.9290458749767212, + "grad_norm": 0.14791453619103243, + "learning_rate": 3.404991236020071e-05, + "loss": 2.7541, + "step": 31075 + }, + { + "epoch": 1.9291079520764791, + "grad_norm": 0.13536222369109202, + "learning_rate": 3.404648956487192e-05, + "loss": 2.7909, + "step": 31076 + }, + { + "epoch": 1.929170029176237, + "grad_norm": 0.15450680611720463, + "learning_rate": 3.4043066852776474e-05, + "loss": 2.8477, + "step": 31077 + }, + { + "epoch": 1.929232106275995, + "grad_norm": 0.14362736244225816, + "learning_rate": 3.403964422393221e-05, + "loss": 2.8209, + "step": 31078 + }, + { + "epoch": 1.9292941833757526, + "grad_norm": 0.13908254232528333, + "learning_rate": 3.4036221678357004e-05, + "loss": 2.7194, + "step": 31079 + }, + { + "epoch": 1.9293562604755106, + "grad_norm": 0.14702822499705478, + "learning_rate": 3.4032799216068694e-05, + "loss": 2.679, + "step": 31080 + }, + { + "epoch": 1.9294183375752685, + "grad_norm": 0.1403019094293557, + "learning_rate": 3.402937683708516e-05, + "loss": 2.63, + "step": 31081 + }, + { + "epoch": 1.9294804146750264, + "grad_norm": 0.14702923214913183, + "learning_rate": 3.402595454142423e-05, + "loss": 2.7134, + "step": 31082 + }, + { + "epoch": 1.9295424917747843, + "grad_norm": 0.13880437243102664, + "learning_rate": 3.402253232910379e-05, + "loss": 2.7167, + "step": 31083 + }, + { + "epoch": 1.9296045688745422, + "grad_norm": 0.14007605592100572, + "learning_rate": 3.4019110200141656e-05, + "loss": 2.7683, + "step": 31084 + }, + { + "epoch": 1.9296666459743, + "grad_norm": 0.16083263411168147, + "learning_rate": 3.401568815455571e-05, + "loss": 2.732, + "step": 31085 + }, + { + "epoch": 1.9297287230740579, + "grad_norm": 0.1564411365175521, + "learning_rate": 3.4012266192363805e-05, + "loss": 2.661, + "step": 31086 + }, + { + "epoch": 1.9297908001738158, + "grad_norm": 0.15246111066155027, + "learning_rate": 3.400884431358379e-05, + "loss": 2.7496, + "step": 31087 + }, + { + "epoch": 1.9298528772735737, + "grad_norm": 0.1454311463300524, + "learning_rate": 3.400542251823352e-05, + "loss": 2.7439, + "step": 31088 + }, + { + "epoch": 1.9299149543733316, + "grad_norm": 0.1396728841017772, + "learning_rate": 3.4002000806330836e-05, + "loss": 2.7691, + "step": 31089 + }, + { + "epoch": 1.9299770314730895, + "grad_norm": 0.15830729434094995, + "learning_rate": 3.39985791778936e-05, + "loss": 2.7532, + "step": 31090 + }, + { + "epoch": 1.9300391085728474, + "grad_norm": 0.16012878298739014, + "learning_rate": 3.399515763293967e-05, + "loss": 2.7416, + "step": 31091 + }, + { + "epoch": 1.9301011856726054, + "grad_norm": 0.13809884913401416, + "learning_rate": 3.399173617148687e-05, + "loss": 2.7733, + "step": 31092 + }, + { + "epoch": 1.9301632627723633, + "grad_norm": 0.13389164409857274, + "learning_rate": 3.398831479355308e-05, + "loss": 2.7049, + "step": 31093 + }, + { + "epoch": 1.9302253398721212, + "grad_norm": 0.15498565900701272, + "learning_rate": 3.398489349915613e-05, + "loss": 2.7228, + "step": 31094 + }, + { + "epoch": 1.930287416971879, + "grad_norm": 0.13923281061301576, + "learning_rate": 3.39814722883139e-05, + "loss": 2.7524, + "step": 31095 + }, + { + "epoch": 1.930349494071637, + "grad_norm": 0.15161364395412333, + "learning_rate": 3.397805116104419e-05, + "loss": 2.7531, + "step": 31096 + }, + { + "epoch": 1.930411571171395, + "grad_norm": 0.14943622599257708, + "learning_rate": 3.39746301173649e-05, + "loss": 2.7946, + "step": 31097 + }, + { + "epoch": 1.9304736482711529, + "grad_norm": 0.1656032152064189, + "learning_rate": 3.397120915729384e-05, + "loss": 2.8103, + "step": 31098 + }, + { + "epoch": 1.9305357253709108, + "grad_norm": 0.15710275526057307, + "learning_rate": 3.3967788280848875e-05, + "loss": 2.66, + "step": 31099 + }, + { + "epoch": 1.9305978024706687, + "grad_norm": 0.16408399032442192, + "learning_rate": 3.396436748804785e-05, + "loss": 2.7909, + "step": 31100 + }, + { + "epoch": 1.9306598795704266, + "grad_norm": 0.14760928239574175, + "learning_rate": 3.396094677890862e-05, + "loss": 2.8206, + "step": 31101 + }, + { + "epoch": 1.9307219566701843, + "grad_norm": 0.16195072315825437, + "learning_rate": 3.395752615344902e-05, + "loss": 2.7109, + "step": 31102 + }, + { + "epoch": 1.9307840337699422, + "grad_norm": 0.16924244874994723, + "learning_rate": 3.39541056116869e-05, + "loss": 2.7977, + "step": 31103 + }, + { + "epoch": 1.9308461108697001, + "grad_norm": 0.1519831338488418, + "learning_rate": 3.3950685153640114e-05, + "loss": 2.6195, + "step": 31104 + }, + { + "epoch": 1.930908187969458, + "grad_norm": 0.1472722355600947, + "learning_rate": 3.394726477932648e-05, + "loss": 2.7613, + "step": 31105 + }, + { + "epoch": 1.930970265069216, + "grad_norm": 0.15741282145967864, + "learning_rate": 3.394384448876388e-05, + "loss": 2.7793, + "step": 31106 + }, + { + "epoch": 1.931032342168974, + "grad_norm": 0.15931711524804681, + "learning_rate": 3.394042428197013e-05, + "loss": 2.8199, + "step": 31107 + }, + { + "epoch": 1.9310944192687316, + "grad_norm": 0.1525118340723691, + "learning_rate": 3.39370041589631e-05, + "loss": 2.7501, + "step": 31108 + }, + { + "epoch": 1.9311564963684895, + "grad_norm": 0.1419048087696443, + "learning_rate": 3.39335841197606e-05, + "loss": 2.7116, + "step": 31109 + }, + { + "epoch": 1.9312185734682474, + "grad_norm": 0.15669948775177894, + "learning_rate": 3.393016416438051e-05, + "loss": 2.7061, + "step": 31110 + }, + { + "epoch": 1.9312806505680054, + "grad_norm": 0.13467437999974577, + "learning_rate": 3.392674429284064e-05, + "loss": 2.7745, + "step": 31111 + }, + { + "epoch": 1.9313427276677633, + "grad_norm": 0.15412874961037018, + "learning_rate": 3.392332450515886e-05, + "loss": 2.7008, + "step": 31112 + }, + { + "epoch": 1.9314048047675212, + "grad_norm": 0.17539062542479925, + "learning_rate": 3.3919904801352994e-05, + "loss": 2.7973, + "step": 31113 + }, + { + "epoch": 1.931466881867279, + "grad_norm": 0.14283530020940682, + "learning_rate": 3.391648518144088e-05, + "loss": 2.7176, + "step": 31114 + }, + { + "epoch": 1.931528958967037, + "grad_norm": 0.1810953302454359, + "learning_rate": 3.3913065645440376e-05, + "loss": 2.7698, + "step": 31115 + }, + { + "epoch": 1.931591036066795, + "grad_norm": 0.14356137603955849, + "learning_rate": 3.3909646193369305e-05, + "loss": 2.7629, + "step": 31116 + }, + { + "epoch": 1.9316531131665529, + "grad_norm": 0.1437888020515933, + "learning_rate": 3.390622682524551e-05, + "loss": 2.7156, + "step": 31117 + }, + { + "epoch": 1.9317151902663108, + "grad_norm": 0.157255027990057, + "learning_rate": 3.390280754108685e-05, + "loss": 2.718, + "step": 31118 + }, + { + "epoch": 1.9317772673660687, + "grad_norm": 0.15131010394304767, + "learning_rate": 3.389938834091114e-05, + "loss": 2.7832, + "step": 31119 + }, + { + "epoch": 1.9318393444658266, + "grad_norm": 0.1594203522144075, + "learning_rate": 3.389596922473625e-05, + "loss": 2.6685, + "step": 31120 + }, + { + "epoch": 1.9319014215655845, + "grad_norm": 0.1649327076526128, + "learning_rate": 3.389255019257998e-05, + "loss": 2.791, + "step": 31121 + }, + { + "epoch": 1.9319634986653424, + "grad_norm": 0.1519156088521054, + "learning_rate": 3.38891312444602e-05, + "loss": 2.7765, + "step": 31122 + }, + { + "epoch": 1.9320255757651004, + "grad_norm": 0.14984974885931598, + "learning_rate": 3.388571238039472e-05, + "loss": 2.7922, + "step": 31123 + }, + { + "epoch": 1.9320876528648583, + "grad_norm": 0.14039595812993755, + "learning_rate": 3.38822936004014e-05, + "loss": 2.7743, + "step": 31124 + }, + { + "epoch": 1.9321497299646162, + "grad_norm": 0.15396751186638757, + "learning_rate": 3.387887490449806e-05, + "loss": 2.6596, + "step": 31125 + }, + { + "epoch": 1.932211807064374, + "grad_norm": 0.15848151778667766, + "learning_rate": 3.3875456292702546e-05, + "loss": 2.7412, + "step": 31126 + }, + { + "epoch": 1.9322738841641318, + "grad_norm": 0.17182964029846812, + "learning_rate": 3.38720377650327e-05, + "loss": 2.7697, + "step": 31127 + }, + { + "epoch": 1.9323359612638897, + "grad_norm": 0.14230909833373345, + "learning_rate": 3.386861932150633e-05, + "loss": 2.7103, + "step": 31128 + }, + { + "epoch": 1.9323980383636477, + "grad_norm": 0.14467052244762743, + "learning_rate": 3.3865200962141305e-05, + "loss": 2.7106, + "step": 31129 + }, + { + "epoch": 1.9324601154634056, + "grad_norm": 0.14719247063466329, + "learning_rate": 3.3861782686955435e-05, + "loss": 2.7478, + "step": 31130 + }, + { + "epoch": 1.9325221925631635, + "grad_norm": 0.2458137099602176, + "learning_rate": 3.3858364495966565e-05, + "loss": 2.8348, + "step": 31131 + }, + { + "epoch": 1.9325842696629212, + "grad_norm": 0.1508140609797174, + "learning_rate": 3.385494638919252e-05, + "loss": 2.7266, + "step": 31132 + }, + { + "epoch": 1.932646346762679, + "grad_norm": 0.14405417317924912, + "learning_rate": 3.385152836665115e-05, + "loss": 2.6764, + "step": 31133 + }, + { + "epoch": 1.932708423862437, + "grad_norm": 0.14722315456649468, + "learning_rate": 3.384811042836027e-05, + "loss": 2.6983, + "step": 31134 + }, + { + "epoch": 1.932770500962195, + "grad_norm": 0.1559327361615732, + "learning_rate": 3.384469257433772e-05, + "loss": 2.7806, + "step": 31135 + }, + { + "epoch": 1.9328325780619529, + "grad_norm": 0.14435382856021212, + "learning_rate": 3.384127480460133e-05, + "loss": 2.6516, + "step": 31136 + }, + { + "epoch": 1.9328946551617108, + "grad_norm": 0.16073896652832442, + "learning_rate": 3.3837857119168936e-05, + "loss": 2.7968, + "step": 31137 + }, + { + "epoch": 1.9329567322614687, + "grad_norm": 0.1469506725713019, + "learning_rate": 3.3834439518058364e-05, + "loss": 2.7975, + "step": 31138 + }, + { + "epoch": 1.9330188093612266, + "grad_norm": 0.1434491411795857, + "learning_rate": 3.383102200128744e-05, + "loss": 2.6872, + "step": 31139 + }, + { + "epoch": 1.9330808864609845, + "grad_norm": 0.14538558204059124, + "learning_rate": 3.382760456887401e-05, + "loss": 2.7988, + "step": 31140 + }, + { + "epoch": 1.9331429635607424, + "grad_norm": 0.1751268178524175, + "learning_rate": 3.382418722083588e-05, + "loss": 2.7446, + "step": 31141 + }, + { + "epoch": 1.9332050406605004, + "grad_norm": 0.14789024915209134, + "learning_rate": 3.38207699571909e-05, + "loss": 2.7512, + "step": 31142 + }, + { + "epoch": 1.9332671177602583, + "grad_norm": 0.16344928810175227, + "learning_rate": 3.3817352777956876e-05, + "loss": 2.7632, + "step": 31143 + }, + { + "epoch": 1.9333291948600162, + "grad_norm": 0.17464122375081204, + "learning_rate": 3.3813935683151676e-05, + "loss": 2.7967, + "step": 31144 + }, + { + "epoch": 1.9333912719597741, + "grad_norm": 0.14999691095747725, + "learning_rate": 3.381051867279308e-05, + "loss": 2.7749, + "step": 31145 + }, + { + "epoch": 1.933453349059532, + "grad_norm": 0.22910911567120215, + "learning_rate": 3.380710174689895e-05, + "loss": 2.6889, + "step": 31146 + }, + { + "epoch": 1.93351542615929, + "grad_norm": 0.1825224074560092, + "learning_rate": 3.38036849054871e-05, + "loss": 2.7728, + "step": 31147 + }, + { + "epoch": 1.9335775032590479, + "grad_norm": 0.18537455135839065, + "learning_rate": 3.380026814857536e-05, + "loss": 2.7626, + "step": 31148 + }, + { + "epoch": 1.9336395803588058, + "grad_norm": 0.13981871460739423, + "learning_rate": 3.3796851476181534e-05, + "loss": 2.7866, + "step": 31149 + }, + { + "epoch": 1.9337016574585635, + "grad_norm": 0.20300811375589572, + "learning_rate": 3.3793434888323486e-05, + "loss": 2.8216, + "step": 31150 + }, + { + "epoch": 1.9337637345583214, + "grad_norm": 0.16743623243851, + "learning_rate": 3.379001838501902e-05, + "loss": 2.7847, + "step": 31151 + }, + { + "epoch": 1.9338258116580793, + "grad_norm": 0.1433871712619424, + "learning_rate": 3.3786601966285966e-05, + "loss": 2.7414, + "step": 31152 + }, + { + "epoch": 1.9338878887578372, + "grad_norm": 0.15228658606247084, + "learning_rate": 3.3783185632142136e-05, + "loss": 2.7786, + "step": 31153 + }, + { + "epoch": 1.9339499658575952, + "grad_norm": 0.1773068452658822, + "learning_rate": 3.377976938260538e-05, + "loss": 2.7393, + "step": 31154 + }, + { + "epoch": 1.934012042957353, + "grad_norm": 0.1459705113480472, + "learning_rate": 3.3776353217693494e-05, + "loss": 2.7355, + "step": 31155 + }, + { + "epoch": 1.9340741200571108, + "grad_norm": 0.14995749760523208, + "learning_rate": 3.377293713742431e-05, + "loss": 2.7927, + "step": 31156 + }, + { + "epoch": 1.9341361971568687, + "grad_norm": 0.15880341904832151, + "learning_rate": 3.376952114181565e-05, + "loss": 2.7455, + "step": 31157 + }, + { + "epoch": 1.9341982742566266, + "grad_norm": 0.1496761553951039, + "learning_rate": 3.376610523088535e-05, + "loss": 2.6746, + "step": 31158 + }, + { + "epoch": 1.9342603513563845, + "grad_norm": 0.14109134173529414, + "learning_rate": 3.3762689404651204e-05, + "loss": 2.6856, + "step": 31159 + }, + { + "epoch": 1.9343224284561424, + "grad_norm": 0.14398123501247972, + "learning_rate": 3.375927366313106e-05, + "loss": 2.7972, + "step": 31160 + }, + { + "epoch": 1.9343845055559004, + "grad_norm": 0.14029726275989374, + "learning_rate": 3.375585800634273e-05, + "loss": 2.7327, + "step": 31161 + }, + { + "epoch": 1.9344465826556583, + "grad_norm": 0.20983868077987194, + "learning_rate": 3.3752442434304035e-05, + "loss": 2.6574, + "step": 31162 + }, + { + "epoch": 1.9345086597554162, + "grad_norm": 0.16498788909283865, + "learning_rate": 3.3749026947032786e-05, + "loss": 2.7051, + "step": 31163 + }, + { + "epoch": 1.9345707368551741, + "grad_norm": 0.1499677819679624, + "learning_rate": 3.374561154454681e-05, + "loss": 2.7491, + "step": 31164 + }, + { + "epoch": 1.934632813954932, + "grad_norm": 0.16830789903152127, + "learning_rate": 3.3742196226863924e-05, + "loss": 2.7916, + "step": 31165 + }, + { + "epoch": 1.93469489105469, + "grad_norm": 0.1433639491856063, + "learning_rate": 3.373878099400194e-05, + "loss": 2.7974, + "step": 31166 + }, + { + "epoch": 1.9347569681544479, + "grad_norm": 0.14844687331871612, + "learning_rate": 3.3735365845978696e-05, + "loss": 2.8985, + "step": 31167 + }, + { + "epoch": 1.9348190452542058, + "grad_norm": 0.14298796211981363, + "learning_rate": 3.373195078281198e-05, + "loss": 2.7688, + "step": 31168 + }, + { + "epoch": 1.9348811223539637, + "grad_norm": 0.15607588723075771, + "learning_rate": 3.372853580451964e-05, + "loss": 2.7688, + "step": 31169 + }, + { + "epoch": 1.9349431994537216, + "grad_norm": 0.159555693182003, + "learning_rate": 3.372512091111947e-05, + "loss": 2.7558, + "step": 31170 + }, + { + "epoch": 1.9350052765534795, + "grad_norm": 0.1927881569542556, + "learning_rate": 3.37217061026293e-05, + "loss": 2.7462, + "step": 31171 + }, + { + "epoch": 1.9350673536532375, + "grad_norm": 0.16994022401855086, + "learning_rate": 3.3718291379066936e-05, + "loss": 2.7522, + "step": 31172 + }, + { + "epoch": 1.9351294307529954, + "grad_norm": 0.16987899261095585, + "learning_rate": 3.371487674045021e-05, + "loss": 2.6839, + "step": 31173 + }, + { + "epoch": 1.935191507852753, + "grad_norm": 0.1371766324833068, + "learning_rate": 3.371146218679692e-05, + "loss": 2.7627, + "step": 31174 + }, + { + "epoch": 1.935253584952511, + "grad_norm": 0.16620687800290093, + "learning_rate": 3.370804771812488e-05, + "loss": 2.8878, + "step": 31175 + }, + { + "epoch": 1.935315662052269, + "grad_norm": 0.15200757568626388, + "learning_rate": 3.370463333445192e-05, + "loss": 2.8539, + "step": 31176 + }, + { + "epoch": 1.9353777391520268, + "grad_norm": 0.1627233323129132, + "learning_rate": 3.370121903579583e-05, + "loss": 2.8489, + "step": 31177 + }, + { + "epoch": 1.9354398162517847, + "grad_norm": 0.1387867854409598, + "learning_rate": 3.3697804822174453e-05, + "loss": 2.7942, + "step": 31178 + }, + { + "epoch": 1.9355018933515427, + "grad_norm": 0.15279801815934368, + "learning_rate": 3.369439069360557e-05, + "loss": 2.7646, + "step": 31179 + }, + { + "epoch": 1.9355639704513004, + "grad_norm": 0.15519202395223583, + "learning_rate": 3.369097665010702e-05, + "loss": 2.7905, + "step": 31180 + }, + { + "epoch": 1.9356260475510583, + "grad_norm": 0.1415016676973831, + "learning_rate": 3.3687562691696593e-05, + "loss": 2.6982, + "step": 31181 + }, + { + "epoch": 1.9356881246508162, + "grad_norm": 0.14642664089512794, + "learning_rate": 3.36841488183921e-05, + "loss": 2.763, + "step": 31182 + }, + { + "epoch": 1.935750201750574, + "grad_norm": 0.14633968837118733, + "learning_rate": 3.368073503021139e-05, + "loss": 2.8227, + "step": 31183 + }, + { + "epoch": 1.935812278850332, + "grad_norm": 0.13751906078611933, + "learning_rate": 3.367732132717224e-05, + "loss": 2.7525, + "step": 31184 + }, + { + "epoch": 1.93587435595009, + "grad_norm": 0.17569668643123643, + "learning_rate": 3.367390770929246e-05, + "loss": 2.7091, + "step": 31185 + }, + { + "epoch": 1.9359364330498479, + "grad_norm": 0.14921526680121447, + "learning_rate": 3.3670494176589875e-05, + "loss": 2.7439, + "step": 31186 + }, + { + "epoch": 1.9359985101496058, + "grad_norm": 0.13866869828789782, + "learning_rate": 3.366708072908229e-05, + "loss": 2.725, + "step": 31187 + }, + { + "epoch": 1.9360605872493637, + "grad_norm": 0.14579990051984665, + "learning_rate": 3.366366736678751e-05, + "loss": 2.7997, + "step": 31188 + }, + { + "epoch": 1.9361226643491216, + "grad_norm": 0.15214563594878266, + "learning_rate": 3.366025408972333e-05, + "loss": 2.8014, + "step": 31189 + }, + { + "epoch": 1.9361847414488795, + "grad_norm": 0.14751836782587843, + "learning_rate": 3.3656840897907585e-05, + "loss": 2.7465, + "step": 31190 + }, + { + "epoch": 1.9362468185486374, + "grad_norm": 0.15574996927844276, + "learning_rate": 3.365342779135805e-05, + "loss": 2.796, + "step": 31191 + }, + { + "epoch": 1.9363088956483954, + "grad_norm": 0.14269868769962873, + "learning_rate": 3.365001477009257e-05, + "loss": 2.7698, + "step": 31192 + }, + { + "epoch": 1.9363709727481533, + "grad_norm": 0.18471108976578177, + "learning_rate": 3.364660183412892e-05, + "loss": 2.7788, + "step": 31193 + }, + { + "epoch": 1.9364330498479112, + "grad_norm": 0.16208429027567442, + "learning_rate": 3.3643188983484924e-05, + "loss": 2.8445, + "step": 31194 + }, + { + "epoch": 1.9364951269476691, + "grad_norm": 0.18590934884917149, + "learning_rate": 3.363977621817838e-05, + "loss": 2.7754, + "step": 31195 + }, + { + "epoch": 1.936557204047427, + "grad_norm": 0.14218108534753296, + "learning_rate": 3.36363635382271e-05, + "loss": 2.7549, + "step": 31196 + }, + { + "epoch": 1.936619281147185, + "grad_norm": 0.14803038784611128, + "learning_rate": 3.3632950943648864e-05, + "loss": 2.7331, + "step": 31197 + }, + { + "epoch": 1.9366813582469427, + "grad_norm": 0.15217655739815322, + "learning_rate": 3.362953843446152e-05, + "loss": 2.7526, + "step": 31198 + }, + { + "epoch": 1.9367434353467006, + "grad_norm": 0.14386709435423944, + "learning_rate": 3.3626126010682845e-05, + "loss": 2.7674, + "step": 31199 + }, + { + "epoch": 1.9368055124464585, + "grad_norm": 0.16004099420373766, + "learning_rate": 3.3622713672330635e-05, + "loss": 2.7558, + "step": 31200 + }, + { + "epoch": 1.9368675895462164, + "grad_norm": 0.1600107427525886, + "learning_rate": 3.361930141942271e-05, + "loss": 2.8254, + "step": 31201 + }, + { + "epoch": 1.9369296666459743, + "grad_norm": 0.14810103631030505, + "learning_rate": 3.361588925197686e-05, + "loss": 2.729, + "step": 31202 + }, + { + "epoch": 1.9369917437457322, + "grad_norm": 0.17051172918341648, + "learning_rate": 3.3612477170010904e-05, + "loss": 2.8052, + "step": 31203 + }, + { + "epoch": 1.93705382084549, + "grad_norm": 0.14167103871676873, + "learning_rate": 3.360906517354262e-05, + "loss": 2.7362, + "step": 31204 + }, + { + "epoch": 1.9371158979452479, + "grad_norm": 0.15953101254332502, + "learning_rate": 3.360565326258983e-05, + "loss": 2.7584, + "step": 31205 + }, + { + "epoch": 1.9371779750450058, + "grad_norm": 0.14764813673345134, + "learning_rate": 3.360224143717032e-05, + "loss": 2.7183, + "step": 31206 + }, + { + "epoch": 1.9372400521447637, + "grad_norm": 0.15334057522167932, + "learning_rate": 3.359882969730191e-05, + "loss": 2.7711, + "step": 31207 + }, + { + "epoch": 1.9373021292445216, + "grad_norm": 0.14131924998314008, + "learning_rate": 3.3595418043002375e-05, + "loss": 2.8012, + "step": 31208 + }, + { + "epoch": 1.9373642063442795, + "grad_norm": 0.15527247782476625, + "learning_rate": 3.3592006474289535e-05, + "loss": 2.7452, + "step": 31209 + }, + { + "epoch": 1.9374262834440374, + "grad_norm": 0.1529908558494836, + "learning_rate": 3.3588594991181166e-05, + "loss": 2.7991, + "step": 31210 + }, + { + "epoch": 1.9374883605437954, + "grad_norm": 0.15211265716185685, + "learning_rate": 3.35851835936951e-05, + "loss": 2.6455, + "step": 31211 + }, + { + "epoch": 1.9375504376435533, + "grad_norm": 0.15714416951360682, + "learning_rate": 3.358177228184911e-05, + "loss": 2.7971, + "step": 31212 + }, + { + "epoch": 1.9376125147433112, + "grad_norm": 0.1522447251997947, + "learning_rate": 3.357836105566099e-05, + "loss": 2.7701, + "step": 31213 + }, + { + "epoch": 1.9376745918430691, + "grad_norm": 0.18340693769977626, + "learning_rate": 3.3574949915148555e-05, + "loss": 2.7821, + "step": 31214 + }, + { + "epoch": 1.937736668942827, + "grad_norm": 0.1584792611802305, + "learning_rate": 3.357153886032958e-05, + "loss": 2.767, + "step": 31215 + }, + { + "epoch": 1.937798746042585, + "grad_norm": 0.1372315734417488, + "learning_rate": 3.3568127891221886e-05, + "loss": 2.7509, + "step": 31216 + }, + { + "epoch": 1.9378608231423429, + "grad_norm": 0.14392649580640554, + "learning_rate": 3.356471700784326e-05, + "loss": 2.7662, + "step": 31217 + }, + { + "epoch": 1.9379229002421008, + "grad_norm": 0.15470602618164175, + "learning_rate": 3.356130621021149e-05, + "loss": 2.7329, + "step": 31218 + }, + { + "epoch": 1.9379849773418587, + "grad_norm": 0.14229231759444214, + "learning_rate": 3.355789549834438e-05, + "loss": 2.7401, + "step": 31219 + }, + { + "epoch": 1.9380470544416166, + "grad_norm": 0.14733671172555696, + "learning_rate": 3.355448487225972e-05, + "loss": 2.7424, + "step": 31220 + }, + { + "epoch": 1.9381091315413745, + "grad_norm": 0.1503035243580379, + "learning_rate": 3.3551074331975306e-05, + "loss": 2.7986, + "step": 31221 + }, + { + "epoch": 1.9381712086411322, + "grad_norm": 0.15702296040608305, + "learning_rate": 3.354766387750893e-05, + "loss": 2.7515, + "step": 31222 + }, + { + "epoch": 1.9382332857408902, + "grad_norm": 0.15579146217803955, + "learning_rate": 3.354425350887839e-05, + "loss": 2.7406, + "step": 31223 + }, + { + "epoch": 1.938295362840648, + "grad_norm": 0.1515552582927908, + "learning_rate": 3.354084322610147e-05, + "loss": 2.6668, + "step": 31224 + }, + { + "epoch": 1.938357439940406, + "grad_norm": 0.15360206786186506, + "learning_rate": 3.3537433029195964e-05, + "loss": 2.6199, + "step": 31225 + }, + { + "epoch": 1.938419517040164, + "grad_norm": 0.1489503621311626, + "learning_rate": 3.353402291817968e-05, + "loss": 2.727, + "step": 31226 + }, + { + "epoch": 1.9384815941399218, + "grad_norm": 0.14688349709382462, + "learning_rate": 3.353061289307037e-05, + "loss": 2.7802, + "step": 31227 + }, + { + "epoch": 1.9385436712396795, + "grad_norm": 0.14785224581803685, + "learning_rate": 3.3527202953885874e-05, + "loss": 2.788, + "step": 31228 + }, + { + "epoch": 1.9386057483394374, + "grad_norm": 0.15423306219983526, + "learning_rate": 3.352379310064394e-05, + "loss": 2.7179, + "step": 31229 + }, + { + "epoch": 1.9386678254391954, + "grad_norm": 0.1468015479500757, + "learning_rate": 3.352038333336239e-05, + "loss": 2.8065, + "step": 31230 + }, + { + "epoch": 1.9387299025389533, + "grad_norm": 0.14294879883559736, + "learning_rate": 3.351697365205899e-05, + "loss": 2.8406, + "step": 31231 + }, + { + "epoch": 1.9387919796387112, + "grad_norm": 0.13992765163380155, + "learning_rate": 3.351356405675156e-05, + "loss": 2.8126, + "step": 31232 + }, + { + "epoch": 1.9388540567384691, + "grad_norm": 0.1575385782795754, + "learning_rate": 3.351015454745785e-05, + "loss": 2.7527, + "step": 31233 + }, + { + "epoch": 1.938916133838227, + "grad_norm": 0.15459635799753557, + "learning_rate": 3.350674512419567e-05, + "loss": 2.7663, + "step": 31234 + }, + { + "epoch": 1.938978210937985, + "grad_norm": 0.14381781616232683, + "learning_rate": 3.35033357869828e-05, + "loss": 2.7869, + "step": 31235 + }, + { + "epoch": 1.9390402880377429, + "grad_norm": 0.14921330072528927, + "learning_rate": 3.349992653583705e-05, + "loss": 2.699, + "step": 31236 + }, + { + "epoch": 1.9391023651375008, + "grad_norm": 0.1549965290673084, + "learning_rate": 3.3496517370776184e-05, + "loss": 2.7245, + "step": 31237 + }, + { + "epoch": 1.9391644422372587, + "grad_norm": 0.1659095425231459, + "learning_rate": 3.3493108291817976e-05, + "loss": 2.7489, + "step": 31238 + }, + { + "epoch": 1.9392265193370166, + "grad_norm": 0.1455292269930837, + "learning_rate": 3.348969929898025e-05, + "loss": 2.6846, + "step": 31239 + }, + { + "epoch": 1.9392885964367745, + "grad_norm": 0.14897213198922174, + "learning_rate": 3.3486290392280756e-05, + "loss": 2.765, + "step": 31240 + }, + { + "epoch": 1.9393506735365325, + "grad_norm": 0.1557978585150682, + "learning_rate": 3.3482881571737304e-05, + "loss": 2.8003, + "step": 31241 + }, + { + "epoch": 1.9394127506362904, + "grad_norm": 0.16273417197666698, + "learning_rate": 3.347947283736765e-05, + "loss": 2.7414, + "step": 31242 + }, + { + "epoch": 1.9394748277360483, + "grad_norm": 0.15335134933156025, + "learning_rate": 3.347606418918962e-05, + "loss": 2.8014, + "step": 31243 + }, + { + "epoch": 1.9395369048358062, + "grad_norm": 0.16202596448887302, + "learning_rate": 3.3472655627220964e-05, + "loss": 2.7663, + "step": 31244 + }, + { + "epoch": 1.9395989819355641, + "grad_norm": 0.1390913743142114, + "learning_rate": 3.346924715147948e-05, + "loss": 2.6422, + "step": 31245 + }, + { + "epoch": 1.9396610590353218, + "grad_norm": 0.16999468394515593, + "learning_rate": 3.3465838761982936e-05, + "loss": 2.812, + "step": 31246 + }, + { + "epoch": 1.9397231361350797, + "grad_norm": 0.1667483428907769, + "learning_rate": 3.3462430458749136e-05, + "loss": 2.7396, + "step": 31247 + }, + { + "epoch": 1.9397852132348377, + "grad_norm": 0.15342179359801397, + "learning_rate": 3.3459022241795834e-05, + "loss": 2.7097, + "step": 31248 + }, + { + "epoch": 1.9398472903345956, + "grad_norm": 0.1723087429831132, + "learning_rate": 3.345561411114084e-05, + "loss": 2.8383, + "step": 31249 + }, + { + "epoch": 1.9399093674343535, + "grad_norm": 0.1631612204207595, + "learning_rate": 3.3452206066801925e-05, + "loss": 2.8157, + "step": 31250 + }, + { + "epoch": 1.9399714445341114, + "grad_norm": 0.1676023334632496, + "learning_rate": 3.3448798108796874e-05, + "loss": 2.7531, + "step": 31251 + }, + { + "epoch": 1.940033521633869, + "grad_norm": 0.16994851550274287, + "learning_rate": 3.344539023714345e-05, + "loss": 2.7702, + "step": 31252 + }, + { + "epoch": 1.940095598733627, + "grad_norm": 0.15826179452607367, + "learning_rate": 3.3441982451859457e-05, + "loss": 2.7451, + "step": 31253 + }, + { + "epoch": 1.940157675833385, + "grad_norm": 0.1504295843614865, + "learning_rate": 3.3438574752962645e-05, + "loss": 2.7692, + "step": 31254 + }, + { + "epoch": 1.9402197529331429, + "grad_norm": 0.1427158056748537, + "learning_rate": 3.343516714047082e-05, + "loss": 2.7885, + "step": 31255 + }, + { + "epoch": 1.9402818300329008, + "grad_norm": 0.16811075971876768, + "learning_rate": 3.3431759614401733e-05, + "loss": 2.8349, + "step": 31256 + }, + { + "epoch": 1.9403439071326587, + "grad_norm": 0.144953362624975, + "learning_rate": 3.34283521747732e-05, + "loss": 2.7419, + "step": 31257 + }, + { + "epoch": 1.9404059842324166, + "grad_norm": 0.14883914991779262, + "learning_rate": 3.3424944821602964e-05, + "loss": 2.7457, + "step": 31258 + }, + { + "epoch": 1.9404680613321745, + "grad_norm": 0.13785043054960713, + "learning_rate": 3.342153755490882e-05, + "loss": 2.7511, + "step": 31259 + }, + { + "epoch": 1.9405301384319324, + "grad_norm": 0.15994392253290593, + "learning_rate": 3.341813037470854e-05, + "loss": 2.7885, + "step": 31260 + }, + { + "epoch": 1.9405922155316904, + "grad_norm": 0.14566948249167247, + "learning_rate": 3.34147232810199e-05, + "loss": 2.7112, + "step": 31261 + }, + { + "epoch": 1.9406542926314483, + "grad_norm": 0.14437059546436026, + "learning_rate": 3.341131627386067e-05, + "loss": 2.8256, + "step": 31262 + }, + { + "epoch": 1.9407163697312062, + "grad_norm": 0.15049364846552205, + "learning_rate": 3.340790935324863e-05, + "loss": 2.7698, + "step": 31263 + }, + { + "epoch": 1.9407784468309641, + "grad_norm": 0.16427514509816685, + "learning_rate": 3.3404502519201565e-05, + "loss": 2.7294, + "step": 31264 + }, + { + "epoch": 1.940840523930722, + "grad_norm": 0.17480438914353466, + "learning_rate": 3.340109577173722e-05, + "loss": 2.7262, + "step": 31265 + }, + { + "epoch": 1.94090260103048, + "grad_norm": 0.1604383356973886, + "learning_rate": 3.339768911087341e-05, + "loss": 2.7843, + "step": 31266 + }, + { + "epoch": 1.9409646781302379, + "grad_norm": 0.1457525218141027, + "learning_rate": 3.339428253662787e-05, + "loss": 2.8087, + "step": 31267 + }, + { + "epoch": 1.9410267552299958, + "grad_norm": 0.14842984531140607, + "learning_rate": 3.33908760490184e-05, + "loss": 2.7621, + "step": 31268 + }, + { + "epoch": 1.9410888323297537, + "grad_norm": 0.14420571266214408, + "learning_rate": 3.338746964806275e-05, + "loss": 2.8311, + "step": 31269 + }, + { + "epoch": 1.9411509094295114, + "grad_norm": 0.17625446202331763, + "learning_rate": 3.3384063333778716e-05, + "loss": 2.6828, + "step": 31270 + }, + { + "epoch": 1.9412129865292693, + "grad_norm": 0.15041650202676987, + "learning_rate": 3.338065710618404e-05, + "loss": 2.7583, + "step": 31271 + }, + { + "epoch": 1.9412750636290272, + "grad_norm": 0.14987315909620783, + "learning_rate": 3.337725096529653e-05, + "loss": 2.683, + "step": 31272 + }, + { + "epoch": 1.9413371407287852, + "grad_norm": 0.14655041159818005, + "learning_rate": 3.337384491113394e-05, + "loss": 2.7651, + "step": 31273 + }, + { + "epoch": 1.941399217828543, + "grad_norm": 0.1500264067552421, + "learning_rate": 3.337043894371401e-05, + "loss": 2.811, + "step": 31274 + }, + { + "epoch": 1.941461294928301, + "grad_norm": 0.14513119338617542, + "learning_rate": 3.336703306305456e-05, + "loss": 2.8579, + "step": 31275 + }, + { + "epoch": 1.9415233720280587, + "grad_norm": 0.14053102505695372, + "learning_rate": 3.336362726917332e-05, + "loss": 2.7014, + "step": 31276 + }, + { + "epoch": 1.9415854491278166, + "grad_norm": 0.15580860018373302, + "learning_rate": 3.336022156208809e-05, + "loss": 2.6738, + "step": 31277 + }, + { + "epoch": 1.9416475262275745, + "grad_norm": 0.15534008443172342, + "learning_rate": 3.335681594181661e-05, + "loss": 2.8202, + "step": 31278 + }, + { + "epoch": 1.9417096033273324, + "grad_norm": 0.144664116951461, + "learning_rate": 3.335341040837667e-05, + "loss": 2.7783, + "step": 31279 + }, + { + "epoch": 1.9417716804270904, + "grad_norm": 0.13818244798812065, + "learning_rate": 3.335000496178602e-05, + "loss": 2.6255, + "step": 31280 + }, + { + "epoch": 1.9418337575268483, + "grad_norm": 0.14503557906319972, + "learning_rate": 3.3346599602062434e-05, + "loss": 2.7638, + "step": 31281 + }, + { + "epoch": 1.9418958346266062, + "grad_norm": 0.15740953780453978, + "learning_rate": 3.3343194329223695e-05, + "loss": 2.7764, + "step": 31282 + }, + { + "epoch": 1.9419579117263641, + "grad_norm": 0.13750696489727296, + "learning_rate": 3.333978914328755e-05, + "loss": 2.7282, + "step": 31283 + }, + { + "epoch": 1.942019988826122, + "grad_norm": 0.1451633202911729, + "learning_rate": 3.3336384044271774e-05, + "loss": 2.7973, + "step": 31284 + }, + { + "epoch": 1.94208206592588, + "grad_norm": 0.1368710214872754, + "learning_rate": 3.333297903219412e-05, + "loss": 2.7146, + "step": 31285 + }, + { + "epoch": 1.9421441430256379, + "grad_norm": 0.15170576315002438, + "learning_rate": 3.332957410707237e-05, + "loss": 2.731, + "step": 31286 + }, + { + "epoch": 1.9422062201253958, + "grad_norm": 0.16135050200476453, + "learning_rate": 3.3326169268924275e-05, + "loss": 2.6642, + "step": 31287 + }, + { + "epoch": 1.9422682972251537, + "grad_norm": 0.16079288230204, + "learning_rate": 3.332276451776759e-05, + "loss": 2.6862, + "step": 31288 + }, + { + "epoch": 1.9423303743249116, + "grad_norm": 0.13758966437604447, + "learning_rate": 3.331935985362011e-05, + "loss": 2.6786, + "step": 31289 + }, + { + "epoch": 1.9423924514246695, + "grad_norm": 0.1668610029654661, + "learning_rate": 3.331595527649957e-05, + "loss": 2.6751, + "step": 31290 + }, + { + "epoch": 1.9424545285244275, + "grad_norm": 0.1411593998715267, + "learning_rate": 3.331255078642374e-05, + "loss": 2.706, + "step": 31291 + }, + { + "epoch": 1.9425166056241854, + "grad_norm": 0.15129515256672438, + "learning_rate": 3.330914638341038e-05, + "loss": 2.6957, + "step": 31292 + }, + { + "epoch": 1.9425786827239433, + "grad_norm": 0.1366639027253034, + "learning_rate": 3.3305742067477264e-05, + "loss": 2.6878, + "step": 31293 + }, + { + "epoch": 1.942640759823701, + "grad_norm": 0.1472490569490891, + "learning_rate": 3.330233783864214e-05, + "loss": 2.7643, + "step": 31294 + }, + { + "epoch": 1.942702836923459, + "grad_norm": 0.1391017992231352, + "learning_rate": 3.3298933696922776e-05, + "loss": 2.7706, + "step": 31295 + }, + { + "epoch": 1.9427649140232168, + "grad_norm": 0.16095726970427665, + "learning_rate": 3.329552964233692e-05, + "loss": 2.709, + "step": 31296 + }, + { + "epoch": 1.9428269911229747, + "grad_norm": 0.17837293377009036, + "learning_rate": 3.3292125674902355e-05, + "loss": 2.7781, + "step": 31297 + }, + { + "epoch": 1.9428890682227327, + "grad_norm": 0.14576103910563482, + "learning_rate": 3.328872179463683e-05, + "loss": 2.798, + "step": 31298 + }, + { + "epoch": 1.9429511453224906, + "grad_norm": 0.15324658242914174, + "learning_rate": 3.328531800155808e-05, + "loss": 2.7802, + "step": 31299 + }, + { + "epoch": 1.9430132224222483, + "grad_norm": 0.1543308172436241, + "learning_rate": 3.3281914295683897e-05, + "loss": 2.6667, + "step": 31300 + }, + { + "epoch": 1.9430752995220062, + "grad_norm": 0.14608365243297325, + "learning_rate": 3.3278510677032016e-05, + "loss": 2.7904, + "step": 31301 + }, + { + "epoch": 1.9431373766217641, + "grad_norm": 0.16888526996173328, + "learning_rate": 3.327510714562022e-05, + "loss": 2.7344, + "step": 31302 + }, + { + "epoch": 1.943199453721522, + "grad_norm": 0.15557087893620686, + "learning_rate": 3.3271703701466226e-05, + "loss": 2.781, + "step": 31303 + }, + { + "epoch": 1.94326153082128, + "grad_norm": 0.16066330797095704, + "learning_rate": 3.326830034458784e-05, + "loss": 2.78, + "step": 31304 + }, + { + "epoch": 1.9433236079210379, + "grad_norm": 0.14539706092417723, + "learning_rate": 3.3264897075002774e-05, + "loss": 2.758, + "step": 31305 + }, + { + "epoch": 1.9433856850207958, + "grad_norm": 0.1594673377771955, + "learning_rate": 3.326149389272881e-05, + "loss": 2.8329, + "step": 31306 + }, + { + "epoch": 1.9434477621205537, + "grad_norm": 0.15509950745038295, + "learning_rate": 3.32580907977837e-05, + "loss": 2.7813, + "step": 31307 + }, + { + "epoch": 1.9435098392203116, + "grad_norm": 0.13962203223003739, + "learning_rate": 3.325468779018519e-05, + "loss": 2.8263, + "step": 31308 + }, + { + "epoch": 1.9435719163200695, + "grad_norm": 0.14616047335083932, + "learning_rate": 3.325128486995105e-05, + "loss": 2.8192, + "step": 31309 + }, + { + "epoch": 1.9436339934198275, + "grad_norm": 0.14819251724833446, + "learning_rate": 3.3247882037099e-05, + "loss": 2.8024, + "step": 31310 + }, + { + "epoch": 1.9436960705195854, + "grad_norm": 0.24555677534165313, + "learning_rate": 3.324447929164683e-05, + "loss": 2.7224, + "step": 31311 + }, + { + "epoch": 1.9437581476193433, + "grad_norm": 0.15426060093252553, + "learning_rate": 3.324107663361228e-05, + "loss": 2.7865, + "step": 31312 + }, + { + "epoch": 1.9438202247191012, + "grad_norm": 0.16404458356848917, + "learning_rate": 3.323767406301308e-05, + "loss": 2.7848, + "step": 31313 + }, + { + "epoch": 1.9438823018188591, + "grad_norm": 0.16802477456744475, + "learning_rate": 3.323427157986703e-05, + "loss": 2.7665, + "step": 31314 + }, + { + "epoch": 1.943944378918617, + "grad_norm": 0.173249734372256, + "learning_rate": 3.323086918419184e-05, + "loss": 2.7628, + "step": 31315 + }, + { + "epoch": 1.944006456018375, + "grad_norm": 0.15168708100475195, + "learning_rate": 3.322746687600529e-05, + "loss": 2.8135, + "step": 31316 + }, + { + "epoch": 1.9440685331181329, + "grad_norm": 0.1549054229126306, + "learning_rate": 3.3224064655325106e-05, + "loss": 2.7894, + "step": 31317 + }, + { + "epoch": 1.9441306102178906, + "grad_norm": 0.1564130885623238, + "learning_rate": 3.3220662522169054e-05, + "loss": 2.7583, + "step": 31318 + }, + { + "epoch": 1.9441926873176485, + "grad_norm": 0.15683259829794186, + "learning_rate": 3.321726047655488e-05, + "loss": 2.7599, + "step": 31319 + }, + { + "epoch": 1.9442547644174064, + "grad_norm": 0.14852032107695043, + "learning_rate": 3.321385851850033e-05, + "loss": 2.7979, + "step": 31320 + }, + { + "epoch": 1.9443168415171643, + "grad_norm": 0.14329642465079961, + "learning_rate": 3.3210456648023156e-05, + "loss": 2.6458, + "step": 31321 + }, + { + "epoch": 1.9443789186169222, + "grad_norm": 0.18276489118800682, + "learning_rate": 3.3207054865141106e-05, + "loss": 2.729, + "step": 31322 + }, + { + "epoch": 1.9444409957166802, + "grad_norm": 0.14321772934302113, + "learning_rate": 3.320365316987194e-05, + "loss": 2.7512, + "step": 31323 + }, + { + "epoch": 1.9445030728164379, + "grad_norm": 0.1506619173925319, + "learning_rate": 3.320025156223337e-05, + "loss": 2.6871, + "step": 31324 + }, + { + "epoch": 1.9445651499161958, + "grad_norm": 0.16203474142199387, + "learning_rate": 3.3196850042243184e-05, + "loss": 2.7607, + "step": 31325 + }, + { + "epoch": 1.9446272270159537, + "grad_norm": 0.14332529801155247, + "learning_rate": 3.3193448609919095e-05, + "loss": 2.6619, + "step": 31326 + }, + { + "epoch": 1.9446893041157116, + "grad_norm": 0.17392934573856647, + "learning_rate": 3.319004726527888e-05, + "loss": 2.8552, + "step": 31327 + }, + { + "epoch": 1.9447513812154695, + "grad_norm": 0.1590177027755971, + "learning_rate": 3.318664600834025e-05, + "loss": 2.7532, + "step": 31328 + }, + { + "epoch": 1.9448134583152275, + "grad_norm": 0.16732191694040502, + "learning_rate": 3.318324483912099e-05, + "loss": 2.7537, + "step": 31329 + }, + { + "epoch": 1.9448755354149854, + "grad_norm": 0.1464474631533416, + "learning_rate": 3.3179843757638806e-05, + "loss": 2.7844, + "step": 31330 + }, + { + "epoch": 1.9449376125147433, + "grad_norm": 0.16208495105456658, + "learning_rate": 3.3176442763911474e-05, + "loss": 2.7446, + "step": 31331 + }, + { + "epoch": 1.9449996896145012, + "grad_norm": 0.1372158549990943, + "learning_rate": 3.317304185795672e-05, + "loss": 2.7801, + "step": 31332 + }, + { + "epoch": 1.9450617667142591, + "grad_norm": 0.15392766309772982, + "learning_rate": 3.3169641039792294e-05, + "loss": 2.8331, + "step": 31333 + }, + { + "epoch": 1.945123843814017, + "grad_norm": 0.14161433458277173, + "learning_rate": 3.316624030943594e-05, + "loss": 2.7724, + "step": 31334 + }, + { + "epoch": 1.945185920913775, + "grad_norm": 0.15938134671645074, + "learning_rate": 3.316283966690538e-05, + "loss": 2.7476, + "step": 31335 + }, + { + "epoch": 1.9452479980135329, + "grad_norm": 0.15438912446738945, + "learning_rate": 3.315943911221839e-05, + "loss": 2.7066, + "step": 31336 + }, + { + "epoch": 1.9453100751132908, + "grad_norm": 0.16072788800661963, + "learning_rate": 3.3156038645392685e-05, + "loss": 2.761, + "step": 31337 + }, + { + "epoch": 1.9453721522130487, + "grad_norm": 0.1597418949346344, + "learning_rate": 3.315263826644602e-05, + "loss": 2.8088, + "step": 31338 + }, + { + "epoch": 1.9454342293128066, + "grad_norm": 0.1471487682046154, + "learning_rate": 3.314923797539612e-05, + "loss": 2.762, + "step": 31339 + }, + { + "epoch": 1.9454963064125645, + "grad_norm": 0.15285093294204616, + "learning_rate": 3.314583777226075e-05, + "loss": 2.7439, + "step": 31340 + }, + { + "epoch": 1.9455583835123225, + "grad_norm": 0.14694126719120304, + "learning_rate": 3.3142437657057624e-05, + "loss": 2.7902, + "step": 31341 + }, + { + "epoch": 1.9456204606120802, + "grad_norm": 0.14838741737530844, + "learning_rate": 3.31390376298045e-05, + "loss": 2.705, + "step": 31342 + }, + { + "epoch": 1.945682537711838, + "grad_norm": 0.1567199078305072, + "learning_rate": 3.3135637690519103e-05, + "loss": 2.768, + "step": 31343 + }, + { + "epoch": 1.945744614811596, + "grad_norm": 0.14312878833779533, + "learning_rate": 3.313223783921919e-05, + "loss": 2.7241, + "step": 31344 + }, + { + "epoch": 1.945806691911354, + "grad_norm": 0.1612494480045984, + "learning_rate": 3.312883807592249e-05, + "loss": 2.7831, + "step": 31345 + }, + { + "epoch": 1.9458687690111118, + "grad_norm": 0.14176609740404017, + "learning_rate": 3.31254384006467e-05, + "loss": 2.7699, + "step": 31346 + }, + { + "epoch": 1.9459308461108697, + "grad_norm": 0.1388981477479183, + "learning_rate": 3.312203881340963e-05, + "loss": 2.7138, + "step": 31347 + }, + { + "epoch": 1.9459929232106274, + "grad_norm": 0.1552665816898821, + "learning_rate": 3.311863931422898e-05, + "loss": 2.8124, + "step": 31348 + }, + { + "epoch": 1.9460550003103854, + "grad_norm": 0.1530194458813108, + "learning_rate": 3.311523990312248e-05, + "loss": 2.672, + "step": 31349 + }, + { + "epoch": 1.9461170774101433, + "grad_norm": 0.15363715775021597, + "learning_rate": 3.311184058010788e-05, + "loss": 2.7033, + "step": 31350 + }, + { + "epoch": 1.9461791545099012, + "grad_norm": 0.14558763682666126, + "learning_rate": 3.310844134520289e-05, + "loss": 2.6278, + "step": 31351 + }, + { + "epoch": 1.9462412316096591, + "grad_norm": 0.14565026267766593, + "learning_rate": 3.310504219842528e-05, + "loss": 2.7366, + "step": 31352 + }, + { + "epoch": 1.946303308709417, + "grad_norm": 0.15128207120887224, + "learning_rate": 3.310164313979275e-05, + "loss": 2.7259, + "step": 31353 + }, + { + "epoch": 1.946365385809175, + "grad_norm": 0.14535407450923957, + "learning_rate": 3.3098244169323067e-05, + "loss": 2.7142, + "step": 31354 + }, + { + "epoch": 1.9464274629089329, + "grad_norm": 0.15408440338447993, + "learning_rate": 3.309484528703393e-05, + "loss": 2.7129, + "step": 31355 + }, + { + "epoch": 1.9464895400086908, + "grad_norm": 0.14831777186173078, + "learning_rate": 3.309144649294311e-05, + "loss": 2.785, + "step": 31356 + }, + { + "epoch": 1.9465516171084487, + "grad_norm": 0.14332924871899336, + "learning_rate": 3.30880477870683e-05, + "loss": 2.7421, + "step": 31357 + }, + { + "epoch": 1.9466136942082066, + "grad_norm": 0.15551923694055284, + "learning_rate": 3.308464916942726e-05, + "loss": 2.7385, + "step": 31358 + }, + { + "epoch": 1.9466757713079645, + "grad_norm": 0.14168745921453654, + "learning_rate": 3.3081250640037707e-05, + "loss": 2.8341, + "step": 31359 + }, + { + "epoch": 1.9467378484077225, + "grad_norm": 0.14950811629686328, + "learning_rate": 3.307785219891737e-05, + "loss": 2.729, + "step": 31360 + }, + { + "epoch": 1.9467999255074804, + "grad_norm": 0.15625432127699124, + "learning_rate": 3.307445384608401e-05, + "loss": 2.7056, + "step": 31361 + }, + { + "epoch": 1.9468620026072383, + "grad_norm": 0.14705557401043642, + "learning_rate": 3.307105558155531e-05, + "loss": 2.7804, + "step": 31362 + }, + { + "epoch": 1.9469240797069962, + "grad_norm": 0.15492687296151494, + "learning_rate": 3.306765740534903e-05, + "loss": 2.8229, + "step": 31363 + }, + { + "epoch": 1.9469861568067541, + "grad_norm": 0.14142754483977102, + "learning_rate": 3.3064259317482886e-05, + "loss": 2.7884, + "step": 31364 + }, + { + "epoch": 1.947048233906512, + "grad_norm": 0.13887017135526245, + "learning_rate": 3.3060861317974624e-05, + "loss": 2.7579, + "step": 31365 + }, + { + "epoch": 1.9471103110062697, + "grad_norm": 0.13678217506618795, + "learning_rate": 3.305746340684195e-05, + "loss": 2.7397, + "step": 31366 + }, + { + "epoch": 1.9471723881060277, + "grad_norm": 0.13985037042088325, + "learning_rate": 3.3054065584102614e-05, + "loss": 2.8095, + "step": 31367 + }, + { + "epoch": 1.9472344652057856, + "grad_norm": 0.15777241391300656, + "learning_rate": 3.305066784977432e-05, + "loss": 2.7266, + "step": 31368 + }, + { + "epoch": 1.9472965423055435, + "grad_norm": 0.15540322691391242, + "learning_rate": 3.3047270203874816e-05, + "loss": 2.711, + "step": 31369 + }, + { + "epoch": 1.9473586194053014, + "grad_norm": 0.16298369097774115, + "learning_rate": 3.30438726464218e-05, + "loss": 2.8232, + "step": 31370 + }, + { + "epoch": 1.9474206965050593, + "grad_norm": 0.1356550183784027, + "learning_rate": 3.304047517743304e-05, + "loss": 2.6868, + "step": 31371 + }, + { + "epoch": 1.947482773604817, + "grad_norm": 0.14647740665391734, + "learning_rate": 3.303707779692623e-05, + "loss": 2.787, + "step": 31372 + }, + { + "epoch": 1.947544850704575, + "grad_norm": 0.14845115197294606, + "learning_rate": 3.30336805049191e-05, + "loss": 2.7371, + "step": 31373 + }, + { + "epoch": 1.9476069278043329, + "grad_norm": 0.15505320447269325, + "learning_rate": 3.303028330142938e-05, + "loss": 2.7659, + "step": 31374 + }, + { + "epoch": 1.9476690049040908, + "grad_norm": 0.13987628648837894, + "learning_rate": 3.302688618647479e-05, + "loss": 2.7271, + "step": 31375 + }, + { + "epoch": 1.9477310820038487, + "grad_norm": 0.13863871406253692, + "learning_rate": 3.302348916007306e-05, + "loss": 2.8029, + "step": 31376 + }, + { + "epoch": 1.9477931591036066, + "grad_norm": 0.14685608432588504, + "learning_rate": 3.302009222224189e-05, + "loss": 2.7209, + "step": 31377 + }, + { + "epoch": 1.9478552362033645, + "grad_norm": 0.15129799645814002, + "learning_rate": 3.301669537299905e-05, + "loss": 2.6968, + "step": 31378 + }, + { + "epoch": 1.9479173133031225, + "grad_norm": 0.1415642853702187, + "learning_rate": 3.30132986123622e-05, + "loss": 2.7416, + "step": 31379 + }, + { + "epoch": 1.9479793904028804, + "grad_norm": 0.14802838715838484, + "learning_rate": 3.300990194034911e-05, + "loss": 2.6285, + "step": 31380 + }, + { + "epoch": 1.9480414675026383, + "grad_norm": 0.15778760737317485, + "learning_rate": 3.300650535697749e-05, + "loss": 2.8389, + "step": 31381 + }, + { + "epoch": 1.9481035446023962, + "grad_norm": 0.13847164688747562, + "learning_rate": 3.300310886226505e-05, + "loss": 2.6844, + "step": 31382 + }, + { + "epoch": 1.9481656217021541, + "grad_norm": 0.15140775321552785, + "learning_rate": 3.2999712456229524e-05, + "loss": 2.8136, + "step": 31383 + }, + { + "epoch": 1.948227698801912, + "grad_norm": 0.1583658429440124, + "learning_rate": 3.2996316138888634e-05, + "loss": 2.7273, + "step": 31384 + }, + { + "epoch": 1.94828977590167, + "grad_norm": 0.13817610568728678, + "learning_rate": 3.299291991026007e-05, + "loss": 2.7171, + "step": 31385 + }, + { + "epoch": 1.9483518530014279, + "grad_norm": 0.19560753473189724, + "learning_rate": 3.298952377036159e-05, + "loss": 2.6509, + "step": 31386 + }, + { + "epoch": 1.9484139301011858, + "grad_norm": 0.1626225186146924, + "learning_rate": 3.298612771921088e-05, + "loss": 2.7996, + "step": 31387 + }, + { + "epoch": 1.9484760072009437, + "grad_norm": 0.18297132537488736, + "learning_rate": 3.298273175682568e-05, + "loss": 2.7333, + "step": 31388 + }, + { + "epoch": 1.9485380843007016, + "grad_norm": 0.15810217403303586, + "learning_rate": 3.29793358832237e-05, + "loss": 2.7607, + "step": 31389 + }, + { + "epoch": 1.9486001614004593, + "grad_norm": 0.15572045117382385, + "learning_rate": 3.2975940098422653e-05, + "loss": 2.7575, + "step": 31390 + }, + { + "epoch": 1.9486622385002172, + "grad_norm": 0.1960048230730952, + "learning_rate": 3.297254440244025e-05, + "loss": 2.7856, + "step": 31391 + }, + { + "epoch": 1.9487243155999752, + "grad_norm": 0.17514939658022646, + "learning_rate": 3.296914879529424e-05, + "loss": 2.7675, + "step": 31392 + }, + { + "epoch": 1.948786392699733, + "grad_norm": 0.15337268275093668, + "learning_rate": 3.296575327700229e-05, + "loss": 2.7601, + "step": 31393 + }, + { + "epoch": 1.948848469799491, + "grad_norm": 0.15856293392986606, + "learning_rate": 3.296235784758216e-05, + "loss": 2.7863, + "step": 31394 + }, + { + "epoch": 1.948910546899249, + "grad_norm": 0.17108932188914217, + "learning_rate": 3.295896250705154e-05, + "loss": 2.7606, + "step": 31395 + }, + { + "epoch": 1.9489726239990066, + "grad_norm": 0.2001411178550635, + "learning_rate": 3.295556725542814e-05, + "loss": 2.7577, + "step": 31396 + }, + { + "epoch": 1.9490347010987645, + "grad_norm": 0.1650051894780683, + "learning_rate": 3.2952172092729694e-05, + "loss": 2.7377, + "step": 31397 + }, + { + "epoch": 1.9490967781985225, + "grad_norm": 0.15462954789067465, + "learning_rate": 3.29487770189739e-05, + "loss": 2.7414, + "step": 31398 + }, + { + "epoch": 1.9491588552982804, + "grad_norm": 0.15653820280457845, + "learning_rate": 3.294538203417848e-05, + "loss": 2.8066, + "step": 31399 + }, + { + "epoch": 1.9492209323980383, + "grad_norm": 0.16634726217612214, + "learning_rate": 3.294198713836113e-05, + "loss": 2.8337, + "step": 31400 + }, + { + "epoch": 1.9492830094977962, + "grad_norm": 0.14049997334887296, + "learning_rate": 3.2938592331539587e-05, + "loss": 2.7035, + "step": 31401 + }, + { + "epoch": 1.9493450865975541, + "grad_norm": 0.15114963648848903, + "learning_rate": 3.293519761373154e-05, + "loss": 2.7891, + "step": 31402 + }, + { + "epoch": 1.949407163697312, + "grad_norm": 0.14343505207998106, + "learning_rate": 3.293180298495472e-05, + "loss": 2.6704, + "step": 31403 + }, + { + "epoch": 1.94946924079707, + "grad_norm": 0.14438359347484625, + "learning_rate": 3.292840844522682e-05, + "loss": 2.7102, + "step": 31404 + }, + { + "epoch": 1.9495313178968279, + "grad_norm": 0.14506228937259488, + "learning_rate": 3.2925013994565564e-05, + "loss": 2.836, + "step": 31405 + }, + { + "epoch": 1.9495933949965858, + "grad_norm": 0.14702348685571617, + "learning_rate": 3.292161963298864e-05, + "loss": 2.7767, + "step": 31406 + }, + { + "epoch": 1.9496554720963437, + "grad_norm": 0.15762867655236446, + "learning_rate": 3.2918225360513785e-05, + "loss": 2.7081, + "step": 31407 + }, + { + "epoch": 1.9497175491961016, + "grad_norm": 0.1600166561446236, + "learning_rate": 3.29148311771587e-05, + "loss": 2.7221, + "step": 31408 + }, + { + "epoch": 1.9497796262958595, + "grad_norm": 0.15587789215190107, + "learning_rate": 3.291143708294108e-05, + "loss": 2.7281, + "step": 31409 + }, + { + "epoch": 1.9498417033956175, + "grad_norm": 0.14118733822229446, + "learning_rate": 3.290804307787865e-05, + "loss": 2.7356, + "step": 31410 + }, + { + "epoch": 1.9499037804953754, + "grad_norm": 0.15909269799428097, + "learning_rate": 3.290464916198908e-05, + "loss": 2.7331, + "step": 31411 + }, + { + "epoch": 1.9499658575951333, + "grad_norm": 0.17803949847137238, + "learning_rate": 3.290125533529012e-05, + "loss": 2.6968, + "step": 31412 + }, + { + "epoch": 1.9500279346948912, + "grad_norm": 0.14263001895418584, + "learning_rate": 3.2897861597799475e-05, + "loss": 2.7512, + "step": 31413 + }, + { + "epoch": 1.950090011794649, + "grad_norm": 0.18220792390905452, + "learning_rate": 3.289446794953482e-05, + "loss": 2.819, + "step": 31414 + }, + { + "epoch": 1.9501520888944068, + "grad_norm": 0.1379584105301417, + "learning_rate": 3.28910743905139e-05, + "loss": 2.7688, + "step": 31415 + }, + { + "epoch": 1.9502141659941648, + "grad_norm": 0.14607472040809683, + "learning_rate": 3.2887680920754384e-05, + "loss": 2.6945, + "step": 31416 + }, + { + "epoch": 1.9502762430939227, + "grad_norm": 0.1530449088507285, + "learning_rate": 3.288428754027399e-05, + "loss": 2.7674, + "step": 31417 + }, + { + "epoch": 1.9503383201936806, + "grad_norm": 0.15013594551626727, + "learning_rate": 3.2880894249090424e-05, + "loss": 2.7611, + "step": 31418 + }, + { + "epoch": 1.9504003972934385, + "grad_norm": 0.14943836363595112, + "learning_rate": 3.287750104722139e-05, + "loss": 2.7097, + "step": 31419 + }, + { + "epoch": 1.9504624743931962, + "grad_norm": 0.14443060231544563, + "learning_rate": 3.2874107934684604e-05, + "loss": 2.6673, + "step": 31420 + }, + { + "epoch": 1.9505245514929541, + "grad_norm": 0.1421659075931768, + "learning_rate": 3.287071491149773e-05, + "loss": 2.7474, + "step": 31421 + }, + { + "epoch": 1.950586628592712, + "grad_norm": 0.18558701532749852, + "learning_rate": 3.286732197767851e-05, + "loss": 2.773, + "step": 31422 + }, + { + "epoch": 1.95064870569247, + "grad_norm": 0.1534403859253609, + "learning_rate": 3.286392913324463e-05, + "loss": 2.7665, + "step": 31423 + }, + { + "epoch": 1.9507107827922279, + "grad_norm": 0.14968862426630272, + "learning_rate": 3.286053637821379e-05, + "loss": 2.7524, + "step": 31424 + }, + { + "epoch": 1.9507728598919858, + "grad_norm": 0.14300740949193108, + "learning_rate": 3.285714371260368e-05, + "loss": 2.7463, + "step": 31425 + }, + { + "epoch": 1.9508349369917437, + "grad_norm": 0.1596110647917787, + "learning_rate": 3.285375113643203e-05, + "loss": 2.65, + "step": 31426 + }, + { + "epoch": 1.9508970140915016, + "grad_norm": 0.14884946775216307, + "learning_rate": 3.285035864971651e-05, + "loss": 2.7248, + "step": 31427 + }, + { + "epoch": 1.9509590911912595, + "grad_norm": 0.15774225876573086, + "learning_rate": 3.284696625247484e-05, + "loss": 2.7227, + "step": 31428 + }, + { + "epoch": 1.9510211682910175, + "grad_norm": 0.13890251268351733, + "learning_rate": 3.2843573944724704e-05, + "loss": 2.7685, + "step": 31429 + }, + { + "epoch": 1.9510832453907754, + "grad_norm": 0.14022697306998003, + "learning_rate": 3.284018172648382e-05, + "loss": 2.782, + "step": 31430 + }, + { + "epoch": 1.9511453224905333, + "grad_norm": 0.14528449414184696, + "learning_rate": 3.283678959776986e-05, + "loss": 2.7933, + "step": 31431 + }, + { + "epoch": 1.9512073995902912, + "grad_norm": 0.14410402329479896, + "learning_rate": 3.283339755860055e-05, + "loss": 2.8339, + "step": 31432 + }, + { + "epoch": 1.9512694766900491, + "grad_norm": 0.14068806744015794, + "learning_rate": 3.2830005608993565e-05, + "loss": 2.7145, + "step": 31433 + }, + { + "epoch": 1.951331553789807, + "grad_norm": 0.14331880639886813, + "learning_rate": 3.282661374896661e-05, + "loss": 2.7068, + "step": 31434 + }, + { + "epoch": 1.951393630889565, + "grad_norm": 0.1379204931141309, + "learning_rate": 3.282322197853738e-05, + "loss": 2.6641, + "step": 31435 + }, + { + "epoch": 1.9514557079893229, + "grad_norm": 0.17577535301489627, + "learning_rate": 3.2819830297723566e-05, + "loss": 2.7289, + "step": 31436 + }, + { + "epoch": 1.9515177850890808, + "grad_norm": 0.15925978184283085, + "learning_rate": 3.281643870654287e-05, + "loss": 2.7638, + "step": 31437 + }, + { + "epoch": 1.9515798621888385, + "grad_norm": 0.20085218638520888, + "learning_rate": 3.281304720501298e-05, + "loss": 2.7927, + "step": 31438 + }, + { + "epoch": 1.9516419392885964, + "grad_norm": 0.15317688447324765, + "learning_rate": 3.2809655793151605e-05, + "loss": 2.8173, + "step": 31439 + }, + { + "epoch": 1.9517040163883543, + "grad_norm": 0.14977761835979883, + "learning_rate": 3.2806264470976414e-05, + "loss": 2.819, + "step": 31440 + }, + { + "epoch": 1.9517660934881123, + "grad_norm": 0.14961749918818917, + "learning_rate": 3.2802873238505135e-05, + "loss": 2.7127, + "step": 31441 + }, + { + "epoch": 1.9518281705878702, + "grad_norm": 0.1583212599783631, + "learning_rate": 3.279948209575542e-05, + "loss": 2.8456, + "step": 31442 + }, + { + "epoch": 1.951890247687628, + "grad_norm": 0.1519951071026188, + "learning_rate": 3.2796091042745e-05, + "loss": 2.7574, + "step": 31443 + }, + { + "epoch": 1.9519523247873858, + "grad_norm": 0.15194973978620957, + "learning_rate": 3.279270007949153e-05, + "loss": 2.6877, + "step": 31444 + }, + { + "epoch": 1.9520144018871437, + "grad_norm": 0.1735107975803092, + "learning_rate": 3.2789309206012744e-05, + "loss": 2.818, + "step": 31445 + }, + { + "epoch": 1.9520764789869016, + "grad_norm": 0.17018580492669583, + "learning_rate": 3.278591842232629e-05, + "loss": 2.82, + "step": 31446 + }, + { + "epoch": 1.9521385560866595, + "grad_norm": 0.18265199584510103, + "learning_rate": 3.2782527728449894e-05, + "loss": 2.8693, + "step": 31447 + }, + { + "epoch": 1.9522006331864175, + "grad_norm": 0.14857307943140716, + "learning_rate": 3.2779137124401215e-05, + "loss": 2.6844, + "step": 31448 + }, + { + "epoch": 1.9522627102861754, + "grad_norm": 0.14727178024461196, + "learning_rate": 3.277574661019798e-05, + "loss": 2.7783, + "step": 31449 + }, + { + "epoch": 1.9523247873859333, + "grad_norm": 0.20642206246957517, + "learning_rate": 3.277235618585783e-05, + "loss": 2.8023, + "step": 31450 + }, + { + "epoch": 1.9523868644856912, + "grad_norm": 0.14498671086433113, + "learning_rate": 3.27689658513985e-05, + "loss": 2.7729, + "step": 31451 + }, + { + "epoch": 1.9524489415854491, + "grad_norm": 0.16342489344370326, + "learning_rate": 3.276557560683765e-05, + "loss": 2.779, + "step": 31452 + }, + { + "epoch": 1.952511018685207, + "grad_norm": 0.1501178559093102, + "learning_rate": 3.2762185452192986e-05, + "loss": 2.839, + "step": 31453 + }, + { + "epoch": 1.952573095784965, + "grad_norm": 0.16116062848991192, + "learning_rate": 3.275879538748217e-05, + "loss": 2.8252, + "step": 31454 + }, + { + "epoch": 1.9526351728847229, + "grad_norm": 0.15612977886521587, + "learning_rate": 3.275540541272292e-05, + "loss": 2.8275, + "step": 31455 + }, + { + "epoch": 1.9526972499844808, + "grad_norm": 0.17726587022316365, + "learning_rate": 3.275201552793289e-05, + "loss": 2.7798, + "step": 31456 + }, + { + "epoch": 1.9527593270842387, + "grad_norm": 0.20747286103478216, + "learning_rate": 3.274862573312979e-05, + "loss": 2.7803, + "step": 31457 + }, + { + "epoch": 1.9528214041839966, + "grad_norm": 0.144386908909071, + "learning_rate": 3.2745236028331305e-05, + "loss": 2.7012, + "step": 31458 + }, + { + "epoch": 1.9528834812837546, + "grad_norm": 0.16601067703951755, + "learning_rate": 3.27418464135551e-05, + "loss": 2.7094, + "step": 31459 + }, + { + "epoch": 1.9529455583835125, + "grad_norm": 0.1395199658251211, + "learning_rate": 3.273845688881888e-05, + "loss": 2.75, + "step": 31460 + }, + { + "epoch": 1.9530076354832704, + "grad_norm": 0.14530696704544338, + "learning_rate": 3.2735067454140314e-05, + "loss": 2.7025, + "step": 31461 + }, + { + "epoch": 1.953069712583028, + "grad_norm": 0.1457555824735264, + "learning_rate": 3.27316781095371e-05, + "loss": 2.7286, + "step": 31462 + }, + { + "epoch": 1.953131789682786, + "grad_norm": 0.13810849929438723, + "learning_rate": 3.27282888550269e-05, + "loss": 2.6978, + "step": 31463 + }, + { + "epoch": 1.953193866782544, + "grad_norm": 0.15356977177614153, + "learning_rate": 3.2724899690627425e-05, + "loss": 2.7144, + "step": 31464 + }, + { + "epoch": 1.9532559438823018, + "grad_norm": 0.14255377809810738, + "learning_rate": 3.2721510616356334e-05, + "loss": 2.623, + "step": 31465 + }, + { + "epoch": 1.9533180209820598, + "grad_norm": 0.15676154204703188, + "learning_rate": 3.271812163223132e-05, + "loss": 2.7723, + "step": 31466 + }, + { + "epoch": 1.9533800980818177, + "grad_norm": 0.14978627983305556, + "learning_rate": 3.271473273827005e-05, + "loss": 2.8226, + "step": 31467 + }, + { + "epoch": 1.9534421751815754, + "grad_norm": 0.15352135737347763, + "learning_rate": 3.2711343934490235e-05, + "loss": 2.7991, + "step": 31468 + }, + { + "epoch": 1.9535042522813333, + "grad_norm": 0.14597377797922056, + "learning_rate": 3.270795522090952e-05, + "loss": 2.7379, + "step": 31469 + }, + { + "epoch": 1.9535663293810912, + "grad_norm": 0.14317746457474362, + "learning_rate": 3.2704566597545605e-05, + "loss": 2.7421, + "step": 31470 + }, + { + "epoch": 1.9536284064808491, + "grad_norm": 0.14354987363464283, + "learning_rate": 3.270117806441617e-05, + "loss": 2.7562, + "step": 31471 + }, + { + "epoch": 1.953690483580607, + "grad_norm": 0.16119493962950432, + "learning_rate": 3.269778962153888e-05, + "loss": 2.8162, + "step": 31472 + }, + { + "epoch": 1.953752560680365, + "grad_norm": 0.1422399468686759, + "learning_rate": 3.269440126893143e-05, + "loss": 2.7127, + "step": 31473 + }, + { + "epoch": 1.9538146377801229, + "grad_norm": 0.14099263112067134, + "learning_rate": 3.269101300661148e-05, + "loss": 2.7423, + "step": 31474 + }, + { + "epoch": 1.9538767148798808, + "grad_norm": 0.14110953910116064, + "learning_rate": 3.268762483459672e-05, + "loss": 2.7558, + "step": 31475 + }, + { + "epoch": 1.9539387919796387, + "grad_norm": 0.15810550220811947, + "learning_rate": 3.268423675290483e-05, + "loss": 2.6586, + "step": 31476 + }, + { + "epoch": 1.9540008690793966, + "grad_norm": 0.15361121698291175, + "learning_rate": 3.268084876155346e-05, + "loss": 2.7507, + "step": 31477 + }, + { + "epoch": 1.9540629461791545, + "grad_norm": 0.14182213014526293, + "learning_rate": 3.2677460860560325e-05, + "loss": 2.8337, + "step": 31478 + }, + { + "epoch": 1.9541250232789125, + "grad_norm": 0.1923404298367189, + "learning_rate": 3.2674073049943074e-05, + "loss": 2.7822, + "step": 31479 + }, + { + "epoch": 1.9541871003786704, + "grad_norm": 0.1665964792153613, + "learning_rate": 3.26706853297194e-05, + "loss": 2.7011, + "step": 31480 + }, + { + "epoch": 1.9542491774784283, + "grad_norm": 0.1498015310161331, + "learning_rate": 3.266729769990696e-05, + "loss": 2.7677, + "step": 31481 + }, + { + "epoch": 1.9543112545781862, + "grad_norm": 0.14556655722041756, + "learning_rate": 3.266391016052344e-05, + "loss": 2.6943, + "step": 31482 + }, + { + "epoch": 1.9543733316779441, + "grad_norm": 0.14939654627830112, + "learning_rate": 3.2660522711586514e-05, + "loss": 2.7413, + "step": 31483 + }, + { + "epoch": 1.954435408777702, + "grad_norm": 0.16048690924371956, + "learning_rate": 3.2657135353113845e-05, + "loss": 2.7784, + "step": 31484 + }, + { + "epoch": 1.9544974858774598, + "grad_norm": 0.1439805558329952, + "learning_rate": 3.265374808512311e-05, + "loss": 2.7233, + "step": 31485 + }, + { + "epoch": 1.9545595629772177, + "grad_norm": 0.15981962135599237, + "learning_rate": 3.265036090763198e-05, + "loss": 2.7404, + "step": 31486 + }, + { + "epoch": 1.9546216400769756, + "grad_norm": 0.1720386506306916, + "learning_rate": 3.264697382065813e-05, + "loss": 2.8624, + "step": 31487 + }, + { + "epoch": 1.9546837171767335, + "grad_norm": 0.15681765676634798, + "learning_rate": 3.264358682421923e-05, + "loss": 2.7344, + "step": 31488 + }, + { + "epoch": 1.9547457942764914, + "grad_norm": 0.16808567856492318, + "learning_rate": 3.264019991833296e-05, + "loss": 2.7254, + "step": 31489 + }, + { + "epoch": 1.9548078713762493, + "grad_norm": 0.14886478363074174, + "learning_rate": 3.263681310301696e-05, + "loss": 2.789, + "step": 31490 + }, + { + "epoch": 1.954869948476007, + "grad_norm": 0.16043335504753967, + "learning_rate": 3.263342637828894e-05, + "loss": 2.6862, + "step": 31491 + }, + { + "epoch": 1.954932025575765, + "grad_norm": 0.1491883074579174, + "learning_rate": 3.263003974416654e-05, + "loss": 2.7903, + "step": 31492 + }, + { + "epoch": 1.9549941026755229, + "grad_norm": 0.17462948591577465, + "learning_rate": 3.2626653200667454e-05, + "loss": 2.6953, + "step": 31493 + }, + { + "epoch": 1.9550561797752808, + "grad_norm": 0.14768355662028873, + "learning_rate": 3.2623266747809336e-05, + "loss": 2.6709, + "step": 31494 + }, + { + "epoch": 1.9551182568750387, + "grad_norm": 0.14416556233077732, + "learning_rate": 3.261988038560983e-05, + "loss": 2.8082, + "step": 31495 + }, + { + "epoch": 1.9551803339747966, + "grad_norm": 0.1427277798582723, + "learning_rate": 3.261649411408665e-05, + "loss": 2.6586, + "step": 31496 + }, + { + "epoch": 1.9552424110745545, + "grad_norm": 0.15883650968216947, + "learning_rate": 3.2613107933257425e-05, + "loss": 2.8112, + "step": 31497 + }, + { + "epoch": 1.9553044881743125, + "grad_norm": 0.15322783286964012, + "learning_rate": 3.260972184313985e-05, + "loss": 2.7592, + "step": 31498 + }, + { + "epoch": 1.9553665652740704, + "grad_norm": 0.15932927387291337, + "learning_rate": 3.2606335843751564e-05, + "loss": 2.6997, + "step": 31499 + }, + { + "epoch": 1.9554286423738283, + "grad_norm": 0.14461436974562047, + "learning_rate": 3.260294993511026e-05, + "loss": 2.7669, + "step": 31500 + }, + { + "epoch": 1.9554907194735862, + "grad_norm": 0.13781943712485745, + "learning_rate": 3.2599564117233575e-05, + "loss": 2.8134, + "step": 31501 + }, + { + "epoch": 1.9555527965733441, + "grad_norm": 0.1440329467390436, + "learning_rate": 3.25961783901392e-05, + "loss": 2.7929, + "step": 31502 + }, + { + "epoch": 1.955614873673102, + "grad_norm": 0.16823693976932197, + "learning_rate": 3.2592792753844783e-05, + "loss": 2.7368, + "step": 31503 + }, + { + "epoch": 1.95567695077286, + "grad_norm": 0.1458473915465301, + "learning_rate": 3.2589407208367993e-05, + "loss": 2.7752, + "step": 31504 + }, + { + "epoch": 1.9557390278726179, + "grad_norm": 0.1388525189192014, + "learning_rate": 3.2586021753726495e-05, + "loss": 2.7696, + "step": 31505 + }, + { + "epoch": 1.9558011049723758, + "grad_norm": 0.15301765649803142, + "learning_rate": 3.258263638993794e-05, + "loss": 2.7621, + "step": 31506 + }, + { + "epoch": 1.9558631820721337, + "grad_norm": 0.13667393358563606, + "learning_rate": 3.257925111702001e-05, + "loss": 2.7623, + "step": 31507 + }, + { + "epoch": 1.9559252591718916, + "grad_norm": 0.14090530618836705, + "learning_rate": 3.2575865934990345e-05, + "loss": 2.8226, + "step": 31508 + }, + { + "epoch": 1.9559873362716493, + "grad_norm": 0.13846253322669108, + "learning_rate": 3.257248084386663e-05, + "loss": 2.7621, + "step": 31509 + }, + { + "epoch": 1.9560494133714073, + "grad_norm": 0.14417810728064454, + "learning_rate": 3.2569095843666485e-05, + "loss": 2.7114, + "step": 31510 + }, + { + "epoch": 1.9561114904711652, + "grad_norm": 0.14379629576546796, + "learning_rate": 3.2565710934407614e-05, + "loss": 2.7554, + "step": 31511 + }, + { + "epoch": 1.956173567570923, + "grad_norm": 0.17351915997404302, + "learning_rate": 3.2562326116107674e-05, + "loss": 2.7366, + "step": 31512 + }, + { + "epoch": 1.956235644670681, + "grad_norm": 0.144929061310467, + "learning_rate": 3.2558941388784304e-05, + "loss": 2.7267, + "step": 31513 + }, + { + "epoch": 1.956297721770439, + "grad_norm": 0.13846540527161016, + "learning_rate": 3.2555556752455176e-05, + "loss": 2.8297, + "step": 31514 + }, + { + "epoch": 1.9563597988701966, + "grad_norm": 0.1455558531137461, + "learning_rate": 3.255217220713793e-05, + "loss": 2.7823, + "step": 31515 + }, + { + "epoch": 1.9564218759699545, + "grad_norm": 0.13929780543642922, + "learning_rate": 3.254878775285025e-05, + "loss": 2.7073, + "step": 31516 + }, + { + "epoch": 1.9564839530697125, + "grad_norm": 0.13813813341282205, + "learning_rate": 3.254540338960977e-05, + "loss": 2.7239, + "step": 31517 + }, + { + "epoch": 1.9565460301694704, + "grad_norm": 0.1816708836937317, + "learning_rate": 3.254201911743417e-05, + "loss": 2.7997, + "step": 31518 + }, + { + "epoch": 1.9566081072692283, + "grad_norm": 0.14567988417434122, + "learning_rate": 3.253863493634109e-05, + "loss": 2.6538, + "step": 31519 + }, + { + "epoch": 1.9566701843689862, + "grad_norm": 0.14991561383585741, + "learning_rate": 3.253525084634819e-05, + "loss": 2.7382, + "step": 31520 + }, + { + "epoch": 1.9567322614687441, + "grad_norm": 0.15651177293588636, + "learning_rate": 3.2531866847473126e-05, + "loss": 2.6633, + "step": 31521 + }, + { + "epoch": 1.956794338568502, + "grad_norm": 0.13871303135659338, + "learning_rate": 3.2528482939733544e-05, + "loss": 2.7435, + "step": 31522 + }, + { + "epoch": 1.95685641566826, + "grad_norm": 0.16104499273291095, + "learning_rate": 3.252509912314712e-05, + "loss": 2.7554, + "step": 31523 + }, + { + "epoch": 1.9569184927680179, + "grad_norm": 0.13629519384559363, + "learning_rate": 3.252171539773149e-05, + "loss": 2.7542, + "step": 31524 + }, + { + "epoch": 1.9569805698677758, + "grad_norm": 0.1491447152725371, + "learning_rate": 3.251833176350432e-05, + "loss": 2.7371, + "step": 31525 + }, + { + "epoch": 1.9570426469675337, + "grad_norm": 0.14920337632757621, + "learning_rate": 3.2514948220483246e-05, + "loss": 2.7499, + "step": 31526 + }, + { + "epoch": 1.9571047240672916, + "grad_norm": 0.14073270408449073, + "learning_rate": 3.251156476868594e-05, + "loss": 2.697, + "step": 31527 + }, + { + "epoch": 1.9571668011670496, + "grad_norm": 0.14466842379369044, + "learning_rate": 3.250818140813004e-05, + "loss": 2.7743, + "step": 31528 + }, + { + "epoch": 1.9572288782668075, + "grad_norm": 0.14797313736047643, + "learning_rate": 3.2504798138833215e-05, + "loss": 2.7632, + "step": 31529 + }, + { + "epoch": 1.9572909553665654, + "grad_norm": 0.15276685671774573, + "learning_rate": 3.2501414960813106e-05, + "loss": 2.8383, + "step": 31530 + }, + { + "epoch": 1.9573530324663233, + "grad_norm": 0.14523247135303974, + "learning_rate": 3.2498031874087353e-05, + "loss": 2.6957, + "step": 31531 + }, + { + "epoch": 1.9574151095660812, + "grad_norm": 0.1429582519012163, + "learning_rate": 3.2494648878673616e-05, + "loss": 2.7337, + "step": 31532 + }, + { + "epoch": 1.957477186665839, + "grad_norm": 0.13836050671761338, + "learning_rate": 3.2491265974589546e-05, + "loss": 2.6762, + "step": 31533 + }, + { + "epoch": 1.9575392637655968, + "grad_norm": 0.14758206754500086, + "learning_rate": 3.24878831618528e-05, + "loss": 2.7858, + "step": 31534 + }, + { + "epoch": 1.9576013408653548, + "grad_norm": 0.1547821778693789, + "learning_rate": 3.248450044048101e-05, + "loss": 2.7631, + "step": 31535 + }, + { + "epoch": 1.9576634179651127, + "grad_norm": 0.1437935237197631, + "learning_rate": 3.248111781049184e-05, + "loss": 2.7675, + "step": 31536 + }, + { + "epoch": 1.9577254950648706, + "grad_norm": 0.14427611023471254, + "learning_rate": 3.247773527190292e-05, + "loss": 2.8202, + "step": 31537 + }, + { + "epoch": 1.9577875721646285, + "grad_norm": 0.20334514279413968, + "learning_rate": 3.247435282473192e-05, + "loss": 2.7297, + "step": 31538 + }, + { + "epoch": 1.9578496492643862, + "grad_norm": 0.1432801300470975, + "learning_rate": 3.247097046899647e-05, + "loss": 2.7655, + "step": 31539 + }, + { + "epoch": 1.9579117263641441, + "grad_norm": 0.1480638481717225, + "learning_rate": 3.246758820471423e-05, + "loss": 2.6917, + "step": 31540 + }, + { + "epoch": 1.957973803463902, + "grad_norm": 0.16504474494676946, + "learning_rate": 3.246420603190283e-05, + "loss": 2.7829, + "step": 31541 + }, + { + "epoch": 1.95803588056366, + "grad_norm": 0.14353992100758245, + "learning_rate": 3.246082395057991e-05, + "loss": 2.8254, + "step": 31542 + }, + { + "epoch": 1.9580979576634179, + "grad_norm": 0.1686944863613585, + "learning_rate": 3.2457441960763144e-05, + "loss": 2.8073, + "step": 31543 + }, + { + "epoch": 1.9581600347631758, + "grad_norm": 0.14139339667858142, + "learning_rate": 3.245406006247018e-05, + "loss": 2.7479, + "step": 31544 + }, + { + "epoch": 1.9582221118629337, + "grad_norm": 0.14205427478368854, + "learning_rate": 3.245067825571862e-05, + "loss": 2.762, + "step": 31545 + }, + { + "epoch": 1.9582841889626916, + "grad_norm": 0.14145013670870896, + "learning_rate": 3.244729654052614e-05, + "loss": 2.7043, + "step": 31546 + }, + { + "epoch": 1.9583462660624495, + "grad_norm": 0.14264861415469982, + "learning_rate": 3.2443914916910375e-05, + "loss": 2.761, + "step": 31547 + }, + { + "epoch": 1.9584083431622075, + "grad_norm": 0.15505325853092275, + "learning_rate": 3.244053338488897e-05, + "loss": 2.7128, + "step": 31548 + }, + { + "epoch": 1.9584704202619654, + "grad_norm": 0.15436613966731336, + "learning_rate": 3.243715194447956e-05, + "loss": 2.7793, + "step": 31549 + }, + { + "epoch": 1.9585324973617233, + "grad_norm": 0.1389269698773391, + "learning_rate": 3.24337705956998e-05, + "loss": 2.8027, + "step": 31550 + }, + { + "epoch": 1.9585945744614812, + "grad_norm": 0.14968501561796957, + "learning_rate": 3.243038933856731e-05, + "loss": 2.7467, + "step": 31551 + }, + { + "epoch": 1.9586566515612391, + "grad_norm": 0.16876017619285533, + "learning_rate": 3.242700817309976e-05, + "loss": 2.7907, + "step": 31552 + }, + { + "epoch": 1.958718728660997, + "grad_norm": 0.16225217366746292, + "learning_rate": 3.2423627099314755e-05, + "loss": 2.7357, + "step": 31553 + }, + { + "epoch": 1.958780805760755, + "grad_norm": 0.15091789383835166, + "learning_rate": 3.2420246117229974e-05, + "loss": 2.7322, + "step": 31554 + }, + { + "epoch": 1.958842882860513, + "grad_norm": 0.1382391652760199, + "learning_rate": 3.241686522686304e-05, + "loss": 2.6943, + "step": 31555 + }, + { + "epoch": 1.9589049599602708, + "grad_norm": 0.15793219667904496, + "learning_rate": 3.241348442823157e-05, + "loss": 2.7696, + "step": 31556 + }, + { + "epoch": 1.9589670370600285, + "grad_norm": 0.15256093511907942, + "learning_rate": 3.2410103721353226e-05, + "loss": 2.8469, + "step": 31557 + }, + { + "epoch": 1.9590291141597864, + "grad_norm": 0.14421359156292765, + "learning_rate": 3.240672310624564e-05, + "loss": 2.7441, + "step": 31558 + }, + { + "epoch": 1.9590911912595443, + "grad_norm": 0.1521744520989712, + "learning_rate": 3.240334258292645e-05, + "loss": 2.7253, + "step": 31559 + }, + { + "epoch": 1.9591532683593023, + "grad_norm": 0.14693200699368908, + "learning_rate": 3.2399962151413296e-05, + "loss": 2.7715, + "step": 31560 + }, + { + "epoch": 1.9592153454590602, + "grad_norm": 0.14339440019245067, + "learning_rate": 3.239658181172382e-05, + "loss": 2.7192, + "step": 31561 + }, + { + "epoch": 1.959277422558818, + "grad_norm": 0.14623561081240297, + "learning_rate": 3.2393201563875636e-05, + "loss": 2.7761, + "step": 31562 + }, + { + "epoch": 1.9593394996585758, + "grad_norm": 0.1489938486604817, + "learning_rate": 3.238982140788641e-05, + "loss": 2.6932, + "step": 31563 + }, + { + "epoch": 1.9594015767583337, + "grad_norm": 0.14254140381002214, + "learning_rate": 3.238644134377374e-05, + "loss": 2.7731, + "step": 31564 + }, + { + "epoch": 1.9594636538580916, + "grad_norm": 0.1711474321359945, + "learning_rate": 3.2383061371555304e-05, + "loss": 2.7738, + "step": 31565 + }, + { + "epoch": 1.9595257309578495, + "grad_norm": 0.15327043997805845, + "learning_rate": 3.237968149124869e-05, + "loss": 2.6677, + "step": 31566 + }, + { + "epoch": 1.9595878080576075, + "grad_norm": 0.14766511612971317, + "learning_rate": 3.237630170287158e-05, + "loss": 2.7438, + "step": 31567 + }, + { + "epoch": 1.9596498851573654, + "grad_norm": 0.13734211499756663, + "learning_rate": 3.237292200644157e-05, + "loss": 2.6213, + "step": 31568 + }, + { + "epoch": 1.9597119622571233, + "grad_norm": 0.1537693518578573, + "learning_rate": 3.2369542401976294e-05, + "loss": 2.7548, + "step": 31569 + }, + { + "epoch": 1.9597740393568812, + "grad_norm": 0.13586645133140565, + "learning_rate": 3.236616288949341e-05, + "loss": 2.7971, + "step": 31570 + }, + { + "epoch": 1.9598361164566391, + "grad_norm": 0.16294657297557247, + "learning_rate": 3.2362783469010527e-05, + "loss": 2.7699, + "step": 31571 + }, + { + "epoch": 1.959898193556397, + "grad_norm": 0.14910779992424628, + "learning_rate": 3.2359404140545295e-05, + "loss": 2.7373, + "step": 31572 + }, + { + "epoch": 1.959960270656155, + "grad_norm": 0.14528034659457836, + "learning_rate": 3.235602490411532e-05, + "loss": 2.7624, + "step": 31573 + }, + { + "epoch": 1.9600223477559129, + "grad_norm": 0.1360143636023825, + "learning_rate": 3.235264575973826e-05, + "loss": 2.7477, + "step": 31574 + }, + { + "epoch": 1.9600844248556708, + "grad_norm": 0.15217178983301133, + "learning_rate": 3.2349266707431705e-05, + "loss": 2.7226, + "step": 31575 + }, + { + "epoch": 1.9601465019554287, + "grad_norm": 0.14466069198689255, + "learning_rate": 3.234588774721333e-05, + "loss": 2.8262, + "step": 31576 + }, + { + "epoch": 1.9602085790551866, + "grad_norm": 0.14760073925239595, + "learning_rate": 3.2342508879100747e-05, + "loss": 2.8359, + "step": 31577 + }, + { + "epoch": 1.9602706561549446, + "grad_norm": 0.15332392062661424, + "learning_rate": 3.233913010311157e-05, + "loss": 2.7722, + "step": 31578 + }, + { + "epoch": 1.9603327332547025, + "grad_norm": 0.14335573124217715, + "learning_rate": 3.2335751419263456e-05, + "loss": 2.7963, + "step": 31579 + }, + { + "epoch": 1.9603948103544604, + "grad_norm": 0.158779982275539, + "learning_rate": 3.233237282757401e-05, + "loss": 2.7569, + "step": 31580 + }, + { + "epoch": 1.960456887454218, + "grad_norm": 0.1409198134335269, + "learning_rate": 3.232899432806086e-05, + "loss": 2.628, + "step": 31581 + }, + { + "epoch": 1.960518964553976, + "grad_norm": 0.17687255338722288, + "learning_rate": 3.2325615920741634e-05, + "loss": 2.8704, + "step": 31582 + }, + { + "epoch": 1.960581041653734, + "grad_norm": 0.16345264984578153, + "learning_rate": 3.232223760563396e-05, + "loss": 2.7359, + "step": 31583 + }, + { + "epoch": 1.9606431187534918, + "grad_norm": 0.15245103726214, + "learning_rate": 3.2318859382755474e-05, + "loss": 2.7558, + "step": 31584 + }, + { + "epoch": 1.9607051958532498, + "grad_norm": 0.20409802155411894, + "learning_rate": 3.231548125212378e-05, + "loss": 2.7797, + "step": 31585 + }, + { + "epoch": 1.9607672729530077, + "grad_norm": 0.15088495532505067, + "learning_rate": 3.2312103213756516e-05, + "loss": 2.7621, + "step": 31586 + }, + { + "epoch": 1.9608293500527654, + "grad_norm": 0.16011242732175116, + "learning_rate": 3.23087252676713e-05, + "loss": 2.7639, + "step": 31587 + }, + { + "epoch": 1.9608914271525233, + "grad_norm": 0.14675372473654158, + "learning_rate": 3.230534741388577e-05, + "loss": 2.7006, + "step": 31588 + }, + { + "epoch": 1.9609535042522812, + "grad_norm": 0.15366811240573586, + "learning_rate": 3.230196965241753e-05, + "loss": 2.719, + "step": 31589 + }, + { + "epoch": 1.9610155813520391, + "grad_norm": 0.15488488387157465, + "learning_rate": 3.229859198328421e-05, + "loss": 2.7733, + "step": 31590 + }, + { + "epoch": 1.961077658451797, + "grad_norm": 0.14203622468308516, + "learning_rate": 3.2295214406503445e-05, + "loss": 2.7945, + "step": 31591 + }, + { + "epoch": 1.961139735551555, + "grad_norm": 0.1674537303700405, + "learning_rate": 3.2291836922092826e-05, + "loss": 2.7333, + "step": 31592 + }, + { + "epoch": 1.9612018126513129, + "grad_norm": 0.1359091562011702, + "learning_rate": 3.228845953007e-05, + "loss": 2.7228, + "step": 31593 + }, + { + "epoch": 1.9612638897510708, + "grad_norm": 0.16600312578607712, + "learning_rate": 3.228508223045258e-05, + "loss": 2.7432, + "step": 31594 + }, + { + "epoch": 1.9613259668508287, + "grad_norm": 0.14692036912116546, + "learning_rate": 3.2281705023258197e-05, + "loss": 2.784, + "step": 31595 + }, + { + "epoch": 1.9613880439505866, + "grad_norm": 0.139370394724726, + "learning_rate": 3.2278327908504445e-05, + "loss": 2.6277, + "step": 31596 + }, + { + "epoch": 1.9614501210503446, + "grad_norm": 0.1472327506813663, + "learning_rate": 3.227495088620897e-05, + "loss": 2.7073, + "step": 31597 + }, + { + "epoch": 1.9615121981501025, + "grad_norm": 0.14544382544566337, + "learning_rate": 3.227157395638937e-05, + "loss": 2.7821, + "step": 31598 + }, + { + "epoch": 1.9615742752498604, + "grad_norm": 0.146514374522092, + "learning_rate": 3.226819711906328e-05, + "loss": 2.7182, + "step": 31599 + }, + { + "epoch": 1.9616363523496183, + "grad_norm": 0.15058724443632948, + "learning_rate": 3.226482037424829e-05, + "loss": 2.7727, + "step": 31600 + }, + { + "epoch": 1.9616984294493762, + "grad_norm": 0.19982213851178499, + "learning_rate": 3.226144372196206e-05, + "loss": 2.685, + "step": 31601 + }, + { + "epoch": 1.9617605065491341, + "grad_norm": 0.15687511092634193, + "learning_rate": 3.2258067162222176e-05, + "loss": 2.7154, + "step": 31602 + }, + { + "epoch": 1.961822583648892, + "grad_norm": 0.14663004943237792, + "learning_rate": 3.2254690695046275e-05, + "loss": 2.8142, + "step": 31603 + }, + { + "epoch": 1.96188466074865, + "grad_norm": 0.14929945920048132, + "learning_rate": 3.225131432045195e-05, + "loss": 2.829, + "step": 31604 + }, + { + "epoch": 1.9619467378484077, + "grad_norm": 0.13959336028333943, + "learning_rate": 3.224793803845682e-05, + "loss": 2.7482, + "step": 31605 + }, + { + "epoch": 1.9620088149481656, + "grad_norm": 0.1613052370401358, + "learning_rate": 3.224456184907852e-05, + "loss": 2.7447, + "step": 31606 + }, + { + "epoch": 1.9620708920479235, + "grad_norm": 0.14825698894242445, + "learning_rate": 3.2241185752334634e-05, + "loss": 2.8595, + "step": 31607 + }, + { + "epoch": 1.9621329691476814, + "grad_norm": 0.1392789768438333, + "learning_rate": 3.223780974824279e-05, + "loss": 2.8508, + "step": 31608 + }, + { + "epoch": 1.9621950462474393, + "grad_norm": 0.14845371785159428, + "learning_rate": 3.223443383682062e-05, + "loss": 2.737, + "step": 31609 + }, + { + "epoch": 1.9622571233471973, + "grad_norm": 0.16607790508034984, + "learning_rate": 3.223105801808572e-05, + "loss": 2.7881, + "step": 31610 + }, + { + "epoch": 1.962319200446955, + "grad_norm": 0.17816978610998005, + "learning_rate": 3.222768229205571e-05, + "loss": 2.7758, + "step": 31611 + }, + { + "epoch": 1.9623812775467129, + "grad_norm": 0.18982210310608627, + "learning_rate": 3.222430665874818e-05, + "loss": 2.7705, + "step": 31612 + }, + { + "epoch": 1.9624433546464708, + "grad_norm": 0.1548055280402228, + "learning_rate": 3.2220931118180774e-05, + "loss": 2.7325, + "step": 31613 + }, + { + "epoch": 1.9625054317462287, + "grad_norm": 0.14802393269990843, + "learning_rate": 3.221755567037107e-05, + "loss": 2.8649, + "step": 31614 + }, + { + "epoch": 1.9625675088459866, + "grad_norm": 0.153241890696234, + "learning_rate": 3.221418031533671e-05, + "loss": 2.6594, + "step": 31615 + }, + { + "epoch": 1.9626295859457445, + "grad_norm": 0.1458397477873684, + "learning_rate": 3.221080505309529e-05, + "loss": 2.7889, + "step": 31616 + }, + { + "epoch": 1.9626916630455025, + "grad_norm": 0.16648709164734124, + "learning_rate": 3.2207429883664404e-05, + "loss": 2.724, + "step": 31617 + }, + { + "epoch": 1.9627537401452604, + "grad_norm": 0.14355489510242217, + "learning_rate": 3.220405480706169e-05, + "loss": 2.7742, + "step": 31618 + }, + { + "epoch": 1.9628158172450183, + "grad_norm": 0.1669954500381596, + "learning_rate": 3.220067982330472e-05, + "loss": 2.7305, + "step": 31619 + }, + { + "epoch": 1.9628778943447762, + "grad_norm": 0.15347912332597954, + "learning_rate": 3.219730493241115e-05, + "loss": 2.8237, + "step": 31620 + }, + { + "epoch": 1.9629399714445341, + "grad_norm": 0.18394103582550225, + "learning_rate": 3.219393013439855e-05, + "loss": 2.8142, + "step": 31621 + }, + { + "epoch": 1.963002048544292, + "grad_norm": 0.15441593574335324, + "learning_rate": 3.2190555429284544e-05, + "loss": 2.7778, + "step": 31622 + }, + { + "epoch": 1.96306412564405, + "grad_norm": 0.1660508062447051, + "learning_rate": 3.218718081708673e-05, + "loss": 2.8133, + "step": 31623 + }, + { + "epoch": 1.963126202743808, + "grad_norm": 0.15793023887045507, + "learning_rate": 3.218380629782272e-05, + "loss": 2.7796, + "step": 31624 + }, + { + "epoch": 1.9631882798435658, + "grad_norm": 0.1567309368944104, + "learning_rate": 3.218043187151011e-05, + "loss": 2.7837, + "step": 31625 + }, + { + "epoch": 1.9632503569433237, + "grad_norm": 0.1486562008696261, + "learning_rate": 3.217705753816652e-05, + "loss": 2.6869, + "step": 31626 + }, + { + "epoch": 1.9633124340430816, + "grad_norm": 0.15354717381817928, + "learning_rate": 3.217368329780954e-05, + "loss": 2.7453, + "step": 31627 + }, + { + "epoch": 1.9633745111428396, + "grad_norm": 0.14433387863280642, + "learning_rate": 3.217030915045679e-05, + "loss": 2.7742, + "step": 31628 + }, + { + "epoch": 1.9634365882425973, + "grad_norm": 0.14678844047015438, + "learning_rate": 3.2166935096125874e-05, + "loss": 2.636, + "step": 31629 + }, + { + "epoch": 1.9634986653423552, + "grad_norm": 0.14680927485118278, + "learning_rate": 3.216356113483436e-05, + "loss": 2.8402, + "step": 31630 + }, + { + "epoch": 1.963560742442113, + "grad_norm": 0.13897075814738513, + "learning_rate": 3.216018726659991e-05, + "loss": 2.6834, + "step": 31631 + }, + { + "epoch": 1.963622819541871, + "grad_norm": 0.1364693751551218, + "learning_rate": 3.2156813491440065e-05, + "loss": 2.6386, + "step": 31632 + }, + { + "epoch": 1.963684896641629, + "grad_norm": 0.1410670551233049, + "learning_rate": 3.2153439809372476e-05, + "loss": 2.7431, + "step": 31633 + }, + { + "epoch": 1.9637469737413868, + "grad_norm": 0.16150472087426304, + "learning_rate": 3.215006622041471e-05, + "loss": 2.7629, + "step": 31634 + }, + { + "epoch": 1.9638090508411445, + "grad_norm": 0.14546040908510158, + "learning_rate": 3.2146692724584394e-05, + "loss": 2.8629, + "step": 31635 + }, + { + "epoch": 1.9638711279409025, + "grad_norm": 0.1423133455627478, + "learning_rate": 3.21433193218991e-05, + "loss": 2.666, + "step": 31636 + }, + { + "epoch": 1.9639332050406604, + "grad_norm": 0.13778658472921856, + "learning_rate": 3.213994601237645e-05, + "loss": 2.7562, + "step": 31637 + }, + { + "epoch": 1.9639952821404183, + "grad_norm": 0.1541465376963606, + "learning_rate": 3.2136572796034046e-05, + "loss": 2.7337, + "step": 31638 + }, + { + "epoch": 1.9640573592401762, + "grad_norm": 0.15286866257882337, + "learning_rate": 3.2133199672889475e-05, + "loss": 2.7149, + "step": 31639 + }, + { + "epoch": 1.9641194363399341, + "grad_norm": 0.1469005268315346, + "learning_rate": 3.212982664296034e-05, + "loss": 2.7315, + "step": 31640 + }, + { + "epoch": 1.964181513439692, + "grad_norm": 0.14907655431761427, + "learning_rate": 3.212645370626421e-05, + "loss": 2.7433, + "step": 31641 + }, + { + "epoch": 1.96424359053945, + "grad_norm": 0.15495634991004797, + "learning_rate": 3.212308086281874e-05, + "loss": 2.6642, + "step": 31642 + }, + { + "epoch": 1.9643056676392079, + "grad_norm": 0.15488351891490032, + "learning_rate": 3.211970811264149e-05, + "loss": 2.8006, + "step": 31643 + }, + { + "epoch": 1.9643677447389658, + "grad_norm": 0.18386558942098288, + "learning_rate": 3.211633545575006e-05, + "loss": 2.8085, + "step": 31644 + }, + { + "epoch": 1.9644298218387237, + "grad_norm": 0.141537974319364, + "learning_rate": 3.2112962892162055e-05, + "loss": 2.7715, + "step": 31645 + }, + { + "epoch": 1.9644918989384816, + "grad_norm": 0.1531350662854181, + "learning_rate": 3.2109590421895064e-05, + "loss": 2.7368, + "step": 31646 + }, + { + "epoch": 1.9645539760382396, + "grad_norm": 0.16729463549299903, + "learning_rate": 3.2106218044966686e-05, + "loss": 2.6839, + "step": 31647 + }, + { + "epoch": 1.9646160531379975, + "grad_norm": 0.15091973280144136, + "learning_rate": 3.2102845761394505e-05, + "loss": 2.7165, + "step": 31648 + }, + { + "epoch": 1.9646781302377554, + "grad_norm": 0.1668874400927559, + "learning_rate": 3.2099473571196125e-05, + "loss": 2.8035, + "step": 31649 + }, + { + "epoch": 1.9647402073375133, + "grad_norm": 0.1518351243794473, + "learning_rate": 3.209610147438914e-05, + "loss": 2.7464, + "step": 31650 + }, + { + "epoch": 1.9648022844372712, + "grad_norm": 0.16457740839924684, + "learning_rate": 3.209272947099114e-05, + "loss": 2.7369, + "step": 31651 + }, + { + "epoch": 1.9648643615370291, + "grad_norm": 0.15285951775492804, + "learning_rate": 3.20893575610197e-05, + "loss": 2.7159, + "step": 31652 + }, + { + "epoch": 1.9649264386367868, + "grad_norm": 0.14501912023826435, + "learning_rate": 3.2085985744492456e-05, + "loss": 2.7861, + "step": 31653 + }, + { + "epoch": 1.9649885157365448, + "grad_norm": 0.13839123764496844, + "learning_rate": 3.2082614021426974e-05, + "loss": 2.7288, + "step": 31654 + }, + { + "epoch": 1.9650505928363027, + "grad_norm": 0.13914291542734036, + "learning_rate": 3.207924239184083e-05, + "loss": 2.7506, + "step": 31655 + }, + { + "epoch": 1.9651126699360606, + "grad_norm": 0.13825781214088345, + "learning_rate": 3.207587085575163e-05, + "loss": 2.6319, + "step": 31656 + }, + { + "epoch": 1.9651747470358185, + "grad_norm": 0.1531618660222317, + "learning_rate": 3.207249941317696e-05, + "loss": 2.7878, + "step": 31657 + }, + { + "epoch": 1.9652368241355764, + "grad_norm": 0.1500001582005779, + "learning_rate": 3.206912806413442e-05, + "loss": 2.8254, + "step": 31658 + }, + { + "epoch": 1.9652989012353341, + "grad_norm": 0.15852461045368788, + "learning_rate": 3.206575680864158e-05, + "loss": 2.7224, + "step": 31659 + }, + { + "epoch": 1.965360978335092, + "grad_norm": 0.1513844018807344, + "learning_rate": 3.206238564671605e-05, + "loss": 2.7229, + "step": 31660 + }, + { + "epoch": 1.96542305543485, + "grad_norm": 0.14383327932824586, + "learning_rate": 3.2059014578375405e-05, + "loss": 2.7078, + "step": 31661 + }, + { + "epoch": 1.9654851325346079, + "grad_norm": 0.13917263711206931, + "learning_rate": 3.205564360363724e-05, + "loss": 2.7498, + "step": 31662 + }, + { + "epoch": 1.9655472096343658, + "grad_norm": 0.13949589968196194, + "learning_rate": 3.205227272251914e-05, + "loss": 2.7336, + "step": 31663 + }, + { + "epoch": 1.9656092867341237, + "grad_norm": 0.14105414104963257, + "learning_rate": 3.204890193503869e-05, + "loss": 2.8337, + "step": 31664 + }, + { + "epoch": 1.9656713638338816, + "grad_norm": 0.14623383394987616, + "learning_rate": 3.204553124121348e-05, + "loss": 2.8059, + "step": 31665 + }, + { + "epoch": 1.9657334409336396, + "grad_norm": 0.16501784329313543, + "learning_rate": 3.2042160641061084e-05, + "loss": 2.805, + "step": 31666 + }, + { + "epoch": 1.9657955180333975, + "grad_norm": 0.13542289368054966, + "learning_rate": 3.2038790134599093e-05, + "loss": 2.742, + "step": 31667 + }, + { + "epoch": 1.9658575951331554, + "grad_norm": 0.14715143907005668, + "learning_rate": 3.2035419721845096e-05, + "loss": 2.8051, + "step": 31668 + }, + { + "epoch": 1.9659196722329133, + "grad_norm": 0.15661682342861466, + "learning_rate": 3.2032049402816684e-05, + "loss": 2.8261, + "step": 31669 + }, + { + "epoch": 1.9659817493326712, + "grad_norm": 0.14104448773362774, + "learning_rate": 3.202867917753142e-05, + "loss": 2.7208, + "step": 31670 + }, + { + "epoch": 1.9660438264324291, + "grad_norm": 0.14897896489290766, + "learning_rate": 3.202530904600691e-05, + "loss": 2.7959, + "step": 31671 + }, + { + "epoch": 1.966105903532187, + "grad_norm": 0.14367308243269566, + "learning_rate": 3.202193900826072e-05, + "loss": 2.772, + "step": 31672 + }, + { + "epoch": 1.966167980631945, + "grad_norm": 0.16530532914204865, + "learning_rate": 3.201856906431042e-05, + "loss": 2.7136, + "step": 31673 + }, + { + "epoch": 1.966230057731703, + "grad_norm": 0.14262192850165614, + "learning_rate": 3.201519921417364e-05, + "loss": 2.7452, + "step": 31674 + }, + { + "epoch": 1.9662921348314608, + "grad_norm": 0.13879416003583875, + "learning_rate": 3.201182945786791e-05, + "loss": 2.7746, + "step": 31675 + }, + { + "epoch": 1.9663542119312187, + "grad_norm": 0.13792688768257388, + "learning_rate": 3.200845979541085e-05, + "loss": 2.7836, + "step": 31676 + }, + { + "epoch": 1.9664162890309764, + "grad_norm": 0.15030597186832342, + "learning_rate": 3.200509022682001e-05, + "loss": 2.6807, + "step": 31677 + }, + { + "epoch": 1.9664783661307343, + "grad_norm": 0.1431905448744681, + "learning_rate": 3.2001720752112996e-05, + "loss": 2.7344, + "step": 31678 + }, + { + "epoch": 1.9665404432304923, + "grad_norm": 0.13598202710566734, + "learning_rate": 3.1998351371307376e-05, + "loss": 2.6897, + "step": 31679 + }, + { + "epoch": 1.9666025203302502, + "grad_norm": 0.14606363909280876, + "learning_rate": 3.199498208442071e-05, + "loss": 2.7402, + "step": 31680 + }, + { + "epoch": 1.966664597430008, + "grad_norm": 0.21474676544098284, + "learning_rate": 3.199161289147061e-05, + "loss": 2.7582, + "step": 31681 + }, + { + "epoch": 1.966726674529766, + "grad_norm": 0.20122446835611515, + "learning_rate": 3.1988243792474624e-05, + "loss": 2.8015, + "step": 31682 + }, + { + "epoch": 1.9667887516295237, + "grad_norm": 0.14433991809977367, + "learning_rate": 3.1984874787450354e-05, + "loss": 2.7745, + "step": 31683 + }, + { + "epoch": 1.9668508287292816, + "grad_norm": 0.14124576988468984, + "learning_rate": 3.1981505876415354e-05, + "loss": 2.8444, + "step": 31684 + }, + { + "epoch": 1.9669129058290395, + "grad_norm": 0.1565861780950537, + "learning_rate": 3.197813705938722e-05, + "loss": 2.8051, + "step": 31685 + }, + { + "epoch": 1.9669749829287975, + "grad_norm": 0.14971774536333995, + "learning_rate": 3.197476833638351e-05, + "loss": 2.6956, + "step": 31686 + }, + { + "epoch": 1.9670370600285554, + "grad_norm": 0.15711368631700648, + "learning_rate": 3.197139970742182e-05, + "loss": 2.7964, + "step": 31687 + }, + { + "epoch": 1.9670991371283133, + "grad_norm": 0.16604195554278176, + "learning_rate": 3.196803117251971e-05, + "loss": 2.7444, + "step": 31688 + }, + { + "epoch": 1.9671612142280712, + "grad_norm": 0.1497254399517046, + "learning_rate": 3.1964662731694766e-05, + "loss": 2.7579, + "step": 31689 + }, + { + "epoch": 1.9672232913278291, + "grad_norm": 0.15131199968925746, + "learning_rate": 3.196129438496456e-05, + "loss": 2.7823, + "step": 31690 + }, + { + "epoch": 1.967285368427587, + "grad_norm": 0.15226324717630604, + "learning_rate": 3.195792613234664e-05, + "loss": 2.6912, + "step": 31691 + }, + { + "epoch": 1.967347445527345, + "grad_norm": 0.15238853554342896, + "learning_rate": 3.195455797385861e-05, + "loss": 2.7769, + "step": 31692 + }, + { + "epoch": 1.967409522627103, + "grad_norm": 0.1451319955224867, + "learning_rate": 3.195118990951803e-05, + "loss": 2.7861, + "step": 31693 + }, + { + "epoch": 1.9674715997268608, + "grad_norm": 0.15008307257682937, + "learning_rate": 3.194782193934247e-05, + "loss": 2.7699, + "step": 31694 + }, + { + "epoch": 1.9675336768266187, + "grad_norm": 0.15575455556440498, + "learning_rate": 3.1944454063349506e-05, + "loss": 2.7759, + "step": 31695 + }, + { + "epoch": 1.9675957539263766, + "grad_norm": 0.16192334197985805, + "learning_rate": 3.194108628155672e-05, + "loss": 2.8347, + "step": 31696 + }, + { + "epoch": 1.9676578310261346, + "grad_norm": 0.15165698090500368, + "learning_rate": 3.193771859398166e-05, + "loss": 2.7571, + "step": 31697 + }, + { + "epoch": 1.9677199081258925, + "grad_norm": 0.14966579498898813, + "learning_rate": 3.193435100064191e-05, + "loss": 2.7071, + "step": 31698 + }, + { + "epoch": 1.9677819852256504, + "grad_norm": 0.20565850919668874, + "learning_rate": 3.193098350155503e-05, + "loss": 2.8088, + "step": 31699 + }, + { + "epoch": 1.9678440623254083, + "grad_norm": 0.19039358139090728, + "learning_rate": 3.19276160967386e-05, + "loss": 2.8217, + "step": 31700 + }, + { + "epoch": 1.967906139425166, + "grad_norm": 0.15442596540212586, + "learning_rate": 3.19242487862102e-05, + "loss": 2.6501, + "step": 31701 + }, + { + "epoch": 1.967968216524924, + "grad_norm": 0.1900743641855618, + "learning_rate": 3.192088156998736e-05, + "loss": 2.8104, + "step": 31702 + }, + { + "epoch": 1.9680302936246818, + "grad_norm": 0.16624536886074875, + "learning_rate": 3.1917514448087685e-05, + "loss": 2.789, + "step": 31703 + }, + { + "epoch": 1.9680923707244398, + "grad_norm": 0.15122973446210183, + "learning_rate": 3.1914147420528706e-05, + "loss": 2.7821, + "step": 31704 + }, + { + "epoch": 1.9681544478241977, + "grad_norm": 0.1522390299119351, + "learning_rate": 3.191078048732803e-05, + "loss": 2.7983, + "step": 31705 + }, + { + "epoch": 1.9682165249239556, + "grad_norm": 0.16655362413314653, + "learning_rate": 3.1907413648503184e-05, + "loss": 2.8045, + "step": 31706 + }, + { + "epoch": 1.9682786020237133, + "grad_norm": 0.14787023469239305, + "learning_rate": 3.190404690407176e-05, + "loss": 2.7717, + "step": 31707 + }, + { + "epoch": 1.9683406791234712, + "grad_norm": 0.1649590078011121, + "learning_rate": 3.190068025405132e-05, + "loss": 2.7923, + "step": 31708 + }, + { + "epoch": 1.9684027562232291, + "grad_norm": 0.14789710055369584, + "learning_rate": 3.1897313698459424e-05, + "loss": 2.7579, + "step": 31709 + }, + { + "epoch": 1.968464833322987, + "grad_norm": 0.164066893654983, + "learning_rate": 3.1893947237313646e-05, + "loss": 2.6745, + "step": 31710 + }, + { + "epoch": 1.968526910422745, + "grad_norm": 0.15526989866818475, + "learning_rate": 3.1890580870631524e-05, + "loss": 2.6521, + "step": 31711 + }, + { + "epoch": 1.9685889875225029, + "grad_norm": 0.14603995617963617, + "learning_rate": 3.188721459843066e-05, + "loss": 2.7463, + "step": 31712 + }, + { + "epoch": 1.9686510646222608, + "grad_norm": 0.14558647256992888, + "learning_rate": 3.1883848420728576e-05, + "loss": 2.7961, + "step": 31713 + }, + { + "epoch": 1.9687131417220187, + "grad_norm": 0.1549418044821477, + "learning_rate": 3.188048233754286e-05, + "loss": 2.7217, + "step": 31714 + }, + { + "epoch": 1.9687752188217766, + "grad_norm": 0.1479995503438904, + "learning_rate": 3.187711634889107e-05, + "loss": 2.7142, + "step": 31715 + }, + { + "epoch": 1.9688372959215346, + "grad_norm": 0.15780189639056139, + "learning_rate": 3.1873750454790756e-05, + "loss": 2.8103, + "step": 31716 + }, + { + "epoch": 1.9688993730212925, + "grad_norm": 0.1444140938884442, + "learning_rate": 3.187038465525949e-05, + "loss": 2.7701, + "step": 31717 + }, + { + "epoch": 1.9689614501210504, + "grad_norm": 0.1535660178100247, + "learning_rate": 3.186701895031483e-05, + "loss": 2.7701, + "step": 31718 + }, + { + "epoch": 1.9690235272208083, + "grad_norm": 0.13945258351496012, + "learning_rate": 3.186365333997433e-05, + "loss": 2.6993, + "step": 31719 + }, + { + "epoch": 1.9690856043205662, + "grad_norm": 0.16539189471089438, + "learning_rate": 3.1860287824255555e-05, + "loss": 2.7561, + "step": 31720 + }, + { + "epoch": 1.9691476814203241, + "grad_norm": 0.14161009927206855, + "learning_rate": 3.1856922403176065e-05, + "loss": 2.691, + "step": 31721 + }, + { + "epoch": 1.969209758520082, + "grad_norm": 0.15394019899778424, + "learning_rate": 3.18535570767534e-05, + "loss": 2.7928, + "step": 31722 + }, + { + "epoch": 1.96927183561984, + "grad_norm": 0.1593535883395496, + "learning_rate": 3.185019184500516e-05, + "loss": 2.8461, + "step": 31723 + }, + { + "epoch": 1.969333912719598, + "grad_norm": 0.18089302606793975, + "learning_rate": 3.184682670794885e-05, + "loss": 2.7301, + "step": 31724 + }, + { + "epoch": 1.9693959898193556, + "grad_norm": 0.15882745046514585, + "learning_rate": 3.184346166560207e-05, + "loss": 2.7516, + "step": 31725 + }, + { + "epoch": 1.9694580669191135, + "grad_norm": 0.1862577073371462, + "learning_rate": 3.184009671798236e-05, + "loss": 2.7455, + "step": 31726 + }, + { + "epoch": 1.9695201440188714, + "grad_norm": 0.16748545669824166, + "learning_rate": 3.183673186510725e-05, + "loss": 2.7526, + "step": 31727 + }, + { + "epoch": 1.9695822211186294, + "grad_norm": 0.1460357344277806, + "learning_rate": 3.1833367106994345e-05, + "loss": 2.7503, + "step": 31728 + }, + { + "epoch": 1.9696442982183873, + "grad_norm": 0.1485279021288117, + "learning_rate": 3.183000244366116e-05, + "loss": 2.6945, + "step": 31729 + }, + { + "epoch": 1.9697063753181452, + "grad_norm": 0.16739921155649035, + "learning_rate": 3.182663787512526e-05, + "loss": 2.7792, + "step": 31730 + }, + { + "epoch": 1.9697684524179029, + "grad_norm": 0.14681495874870462, + "learning_rate": 3.182327340140421e-05, + "loss": 2.6612, + "step": 31731 + }, + { + "epoch": 1.9698305295176608, + "grad_norm": 0.16425132670621523, + "learning_rate": 3.181990902251555e-05, + "loss": 2.7179, + "step": 31732 + }, + { + "epoch": 1.9698926066174187, + "grad_norm": 0.1474498845688519, + "learning_rate": 3.181654473847684e-05, + "loss": 2.7139, + "step": 31733 + }, + { + "epoch": 1.9699546837171766, + "grad_norm": 0.16908287225271534, + "learning_rate": 3.181318054930563e-05, + "loss": 2.7979, + "step": 31734 + }, + { + "epoch": 1.9700167608169346, + "grad_norm": 0.15038219031919095, + "learning_rate": 3.180981645501947e-05, + "loss": 2.7117, + "step": 31735 + }, + { + "epoch": 1.9700788379166925, + "grad_norm": 0.180878764239899, + "learning_rate": 3.1806452455635926e-05, + "loss": 2.7876, + "step": 31736 + }, + { + "epoch": 1.9701409150164504, + "grad_norm": 0.17595477016883443, + "learning_rate": 3.180308855117252e-05, + "loss": 2.6959, + "step": 31737 + }, + { + "epoch": 1.9702029921162083, + "grad_norm": 0.13851439573901117, + "learning_rate": 3.179972474164683e-05, + "loss": 2.7001, + "step": 31738 + }, + { + "epoch": 1.9702650692159662, + "grad_norm": 0.15701927713517425, + "learning_rate": 3.179636102707637e-05, + "loss": 2.7002, + "step": 31739 + }, + { + "epoch": 1.9703271463157241, + "grad_norm": 0.15250599608984292, + "learning_rate": 3.179299740747874e-05, + "loss": 2.7369, + "step": 31740 + }, + { + "epoch": 1.970389223415482, + "grad_norm": 0.1662535645256959, + "learning_rate": 3.178963388287145e-05, + "loss": 2.7758, + "step": 31741 + }, + { + "epoch": 1.97045130051524, + "grad_norm": 0.19530448897145014, + "learning_rate": 3.178627045327207e-05, + "loss": 2.7914, + "step": 31742 + }, + { + "epoch": 1.970513377614998, + "grad_norm": 0.14474777164086391, + "learning_rate": 3.178290711869814e-05, + "loss": 2.6985, + "step": 31743 + }, + { + "epoch": 1.9705754547147558, + "grad_norm": 0.1712455657311798, + "learning_rate": 3.1779543879167196e-05, + "loss": 2.6752, + "step": 31744 + }, + { + "epoch": 1.9706375318145137, + "grad_norm": 0.16017431064279944, + "learning_rate": 3.1776180734696796e-05, + "loss": 2.7485, + "step": 31745 + }, + { + "epoch": 1.9706996089142716, + "grad_norm": 0.16978062316361747, + "learning_rate": 3.177281768530449e-05, + "loss": 2.7721, + "step": 31746 + }, + { + "epoch": 1.9707616860140296, + "grad_norm": 0.18568377213024634, + "learning_rate": 3.176945473100782e-05, + "loss": 2.7198, + "step": 31747 + }, + { + "epoch": 1.9708237631137875, + "grad_norm": 0.18923968650539424, + "learning_rate": 3.176609187182433e-05, + "loss": 2.7512, + "step": 31748 + }, + { + "epoch": 1.9708858402135452, + "grad_norm": 0.17501613678783712, + "learning_rate": 3.176272910777155e-05, + "loss": 2.8579, + "step": 31749 + }, + { + "epoch": 1.970947917313303, + "grad_norm": 0.15514426586548055, + "learning_rate": 3.175936643886707e-05, + "loss": 2.7538, + "step": 31750 + }, + { + "epoch": 1.971009994413061, + "grad_norm": 0.17349198340472963, + "learning_rate": 3.1756003865128384e-05, + "loss": 2.7246, + "step": 31751 + }, + { + "epoch": 1.971072071512819, + "grad_norm": 0.16406923234841134, + "learning_rate": 3.1752641386573054e-05, + "loss": 2.7578, + "step": 31752 + }, + { + "epoch": 1.9711341486125769, + "grad_norm": 0.1613410702053748, + "learning_rate": 3.174927900321863e-05, + "loss": 2.7549, + "step": 31753 + }, + { + "epoch": 1.9711962257123348, + "grad_norm": 0.17167767121017397, + "learning_rate": 3.174591671508264e-05, + "loss": 2.7956, + "step": 31754 + }, + { + "epoch": 1.9712583028120925, + "grad_norm": 0.17567923022497509, + "learning_rate": 3.174255452218264e-05, + "loss": 2.8407, + "step": 31755 + }, + { + "epoch": 1.9713203799118504, + "grad_norm": 0.15394767649068908, + "learning_rate": 3.173919242453616e-05, + "loss": 2.7244, + "step": 31756 + }, + { + "epoch": 1.9713824570116083, + "grad_norm": 0.17409395401120495, + "learning_rate": 3.173583042216075e-05, + "loss": 2.8341, + "step": 31757 + }, + { + "epoch": 1.9714445341113662, + "grad_norm": 0.16879627511651388, + "learning_rate": 3.1732468515073946e-05, + "loss": 2.7612, + "step": 31758 + }, + { + "epoch": 1.9715066112111241, + "grad_norm": 0.16227365121289708, + "learning_rate": 3.172910670329329e-05, + "loss": 2.8171, + "step": 31759 + }, + { + "epoch": 1.971568688310882, + "grad_norm": 0.15597050225846723, + "learning_rate": 3.172574498683631e-05, + "loss": 2.787, + "step": 31760 + }, + { + "epoch": 1.97163076541064, + "grad_norm": 0.17815389482473398, + "learning_rate": 3.172238336572056e-05, + "loss": 2.7985, + "step": 31761 + }, + { + "epoch": 1.971692842510398, + "grad_norm": 0.16348438358170142, + "learning_rate": 3.171902183996358e-05, + "loss": 2.751, + "step": 31762 + }, + { + "epoch": 1.9717549196101558, + "grad_norm": 0.144835499834074, + "learning_rate": 3.17156604095829e-05, + "loss": 2.7539, + "step": 31763 + }, + { + "epoch": 1.9718169967099137, + "grad_norm": 0.17054407164628302, + "learning_rate": 3.171229907459606e-05, + "loss": 2.7846, + "step": 31764 + }, + { + "epoch": 1.9718790738096716, + "grad_norm": 0.1408935737246294, + "learning_rate": 3.170893783502058e-05, + "loss": 2.7943, + "step": 31765 + }, + { + "epoch": 1.9719411509094296, + "grad_norm": 0.1679064058331154, + "learning_rate": 3.1705576690874025e-05, + "loss": 2.7829, + "step": 31766 + }, + { + "epoch": 1.9720032280091875, + "grad_norm": 0.1428714960787487, + "learning_rate": 3.170221564217391e-05, + "loss": 2.7861, + "step": 31767 + }, + { + "epoch": 1.9720653051089454, + "grad_norm": 0.165574806857972, + "learning_rate": 3.169885468893779e-05, + "loss": 2.8045, + "step": 31768 + }, + { + "epoch": 1.9721273822087033, + "grad_norm": 0.16928745640536758, + "learning_rate": 3.1695493831183174e-05, + "loss": 2.7557, + "step": 31769 + }, + { + "epoch": 1.9721894593084612, + "grad_norm": 0.1461729171772865, + "learning_rate": 3.169213306892762e-05, + "loss": 2.7115, + "step": 31770 + }, + { + "epoch": 1.9722515364082192, + "grad_norm": 0.16738922480760743, + "learning_rate": 3.168877240218864e-05, + "loss": 2.7854, + "step": 31771 + }, + { + "epoch": 1.972313613507977, + "grad_norm": 0.14567635522941633, + "learning_rate": 3.168541183098378e-05, + "loss": 2.8072, + "step": 31772 + }, + { + "epoch": 1.9723756906077348, + "grad_norm": 0.16366910886589345, + "learning_rate": 3.168205135533058e-05, + "loss": 2.8169, + "step": 31773 + }, + { + "epoch": 1.9724377677074927, + "grad_norm": 0.16538027190832513, + "learning_rate": 3.167869097524656e-05, + "loss": 2.7279, + "step": 31774 + }, + { + "epoch": 1.9724998448072506, + "grad_norm": 0.1754797469045004, + "learning_rate": 3.167533069074927e-05, + "loss": 2.7025, + "step": 31775 + }, + { + "epoch": 1.9725619219070085, + "grad_norm": 0.15876501283658073, + "learning_rate": 3.167197050185622e-05, + "loss": 2.7276, + "step": 31776 + }, + { + "epoch": 1.9726239990067664, + "grad_norm": 0.15220197169175628, + "learning_rate": 3.166861040858495e-05, + "loss": 2.728, + "step": 31777 + }, + { + "epoch": 1.9726860761065244, + "grad_norm": 0.15695652963259998, + "learning_rate": 3.166525041095299e-05, + "loss": 2.7706, + "step": 31778 + }, + { + "epoch": 1.972748153206282, + "grad_norm": 0.1517108523005523, + "learning_rate": 3.1661890508977865e-05, + "loss": 2.7551, + "step": 31779 + }, + { + "epoch": 1.97281023030604, + "grad_norm": 0.1446120899528812, + "learning_rate": 3.165853070267711e-05, + "loss": 2.7555, + "step": 31780 + }, + { + "epoch": 1.9728723074057979, + "grad_norm": 0.18645352156697703, + "learning_rate": 3.165517099206826e-05, + "loss": 2.8118, + "step": 31781 + }, + { + "epoch": 1.9729343845055558, + "grad_norm": 0.14435308016510698, + "learning_rate": 3.1651811377168834e-05, + "loss": 2.7852, + "step": 31782 + }, + { + "epoch": 1.9729964616053137, + "grad_norm": 0.17171039610441152, + "learning_rate": 3.164845185799635e-05, + "loss": 2.7725, + "step": 31783 + }, + { + "epoch": 1.9730585387050716, + "grad_norm": 0.15647708960612966, + "learning_rate": 3.164509243456837e-05, + "loss": 2.7194, + "step": 31784 + }, + { + "epoch": 1.9731206158048296, + "grad_norm": 0.1488627315946426, + "learning_rate": 3.164173310690238e-05, + "loss": 2.7667, + "step": 31785 + }, + { + "epoch": 1.9731826929045875, + "grad_norm": 0.18382901478044214, + "learning_rate": 3.163837387501594e-05, + "loss": 2.7629, + "step": 31786 + }, + { + "epoch": 1.9732447700043454, + "grad_norm": 0.15069853856259952, + "learning_rate": 3.163501473892654e-05, + "loss": 2.7915, + "step": 31787 + }, + { + "epoch": 1.9733068471041033, + "grad_norm": 0.163451515977133, + "learning_rate": 3.163165569865174e-05, + "loss": 2.7617, + "step": 31788 + }, + { + "epoch": 1.9733689242038612, + "grad_norm": 0.16400877890481344, + "learning_rate": 3.162829675420906e-05, + "loss": 2.7217, + "step": 31789 + }, + { + "epoch": 1.9734310013036191, + "grad_norm": 0.14763864962317796, + "learning_rate": 3.1624937905616e-05, + "loss": 2.7484, + "step": 31790 + }, + { + "epoch": 1.973493078403377, + "grad_norm": 0.1546352033162875, + "learning_rate": 3.162157915289011e-05, + "loss": 2.7281, + "step": 31791 + }, + { + "epoch": 1.973555155503135, + "grad_norm": 0.15683888091749276, + "learning_rate": 3.161822049604889e-05, + "loss": 2.7316, + "step": 31792 + }, + { + "epoch": 1.973617232602893, + "grad_norm": 0.16664513558150554, + "learning_rate": 3.1614861935109894e-05, + "loss": 2.7102, + "step": 31793 + }, + { + "epoch": 1.9736793097026508, + "grad_norm": 0.16483357294134013, + "learning_rate": 3.161150347009061e-05, + "loss": 2.7463, + "step": 31794 + }, + { + "epoch": 1.9737413868024087, + "grad_norm": 0.15135067880496858, + "learning_rate": 3.160814510100859e-05, + "loss": 2.6699, + "step": 31795 + }, + { + "epoch": 1.9738034639021667, + "grad_norm": 0.1491303836349544, + "learning_rate": 3.160478682788133e-05, + "loss": 2.6544, + "step": 31796 + }, + { + "epoch": 1.9738655410019244, + "grad_norm": 0.1575799489140334, + "learning_rate": 3.160142865072637e-05, + "loss": 2.692, + "step": 31797 + }, + { + "epoch": 1.9739276181016823, + "grad_norm": 0.15226845836988046, + "learning_rate": 3.159807056956121e-05, + "loss": 2.7345, + "step": 31798 + }, + { + "epoch": 1.9739896952014402, + "grad_norm": 0.1540248984747182, + "learning_rate": 3.15947125844034e-05, + "loss": 2.8345, + "step": 31799 + }, + { + "epoch": 1.974051772301198, + "grad_norm": 0.16314715532777138, + "learning_rate": 3.1591354695270445e-05, + "loss": 2.7713, + "step": 31800 + }, + { + "epoch": 1.974113849400956, + "grad_norm": 0.14065526927910357, + "learning_rate": 3.1587996902179846e-05, + "loss": 2.7058, + "step": 31801 + }, + { + "epoch": 1.974175926500714, + "grad_norm": 0.13844825283079853, + "learning_rate": 3.1584639205149146e-05, + "loss": 2.745, + "step": 31802 + }, + { + "epoch": 1.9742380036004716, + "grad_norm": 0.20070515352072643, + "learning_rate": 3.158128160419584e-05, + "loss": 2.7315, + "step": 31803 + }, + { + "epoch": 1.9743000807002296, + "grad_norm": 0.1703999961731719, + "learning_rate": 3.157792409933746e-05, + "loss": 2.8359, + "step": 31804 + }, + { + "epoch": 1.9743621577999875, + "grad_norm": 0.16145716879537497, + "learning_rate": 3.157456669059154e-05, + "loss": 2.8521, + "step": 31805 + }, + { + "epoch": 1.9744242348997454, + "grad_norm": 0.15861835265416283, + "learning_rate": 3.157120937797556e-05, + "loss": 2.8476, + "step": 31806 + }, + { + "epoch": 1.9744863119995033, + "grad_norm": 0.14953078279850757, + "learning_rate": 3.1567852161507074e-05, + "loss": 2.7284, + "step": 31807 + }, + { + "epoch": 1.9745483890992612, + "grad_norm": 0.16422305879103255, + "learning_rate": 3.156449504120357e-05, + "loss": 2.8158, + "step": 31808 + }, + { + "epoch": 1.9746104661990191, + "grad_norm": 0.15294341516572285, + "learning_rate": 3.156113801708257e-05, + "loss": 2.7382, + "step": 31809 + }, + { + "epoch": 1.974672543298777, + "grad_norm": 0.1879265027280838, + "learning_rate": 3.155778108916159e-05, + "loss": 2.8408, + "step": 31810 + }, + { + "epoch": 1.974734620398535, + "grad_norm": 0.1533312520297523, + "learning_rate": 3.1554424257458134e-05, + "loss": 2.7164, + "step": 31811 + }, + { + "epoch": 1.974796697498293, + "grad_norm": 0.15177062629793805, + "learning_rate": 3.155106752198974e-05, + "loss": 2.7893, + "step": 31812 + }, + { + "epoch": 1.9748587745980508, + "grad_norm": 0.1931045953687068, + "learning_rate": 3.154771088277389e-05, + "loss": 2.7395, + "step": 31813 + }, + { + "epoch": 1.9749208516978087, + "grad_norm": 0.14614555592917308, + "learning_rate": 3.154435433982813e-05, + "loss": 2.7, + "step": 31814 + }, + { + "epoch": 1.9749829287975667, + "grad_norm": 0.14946271690101834, + "learning_rate": 3.154099789316994e-05, + "loss": 2.8057, + "step": 31815 + }, + { + "epoch": 1.9750450058973246, + "grad_norm": 0.16118887879564767, + "learning_rate": 3.1537641542816856e-05, + "loss": 2.7583, + "step": 31816 + }, + { + "epoch": 1.9751070829970825, + "grad_norm": 0.1460201090577789, + "learning_rate": 3.1534285288786356e-05, + "loss": 2.8547, + "step": 31817 + }, + { + "epoch": 1.9751691600968404, + "grad_norm": 0.14953970768606342, + "learning_rate": 3.1530929131096e-05, + "loss": 2.6894, + "step": 31818 + }, + { + "epoch": 1.9752312371965983, + "grad_norm": 0.1673747248793343, + "learning_rate": 3.152757306976325e-05, + "loss": 2.798, + "step": 31819 + }, + { + "epoch": 1.9752933142963562, + "grad_norm": 0.17965484405328414, + "learning_rate": 3.152421710480564e-05, + "loss": 2.7682, + "step": 31820 + }, + { + "epoch": 1.975355391396114, + "grad_norm": 0.1587292963423297, + "learning_rate": 3.152086123624068e-05, + "loss": 2.8635, + "step": 31821 + }, + { + "epoch": 1.9754174684958719, + "grad_norm": 0.14463273557839054, + "learning_rate": 3.1517505464085874e-05, + "loss": 2.7357, + "step": 31822 + }, + { + "epoch": 1.9754795455956298, + "grad_norm": 0.14650101885018177, + "learning_rate": 3.1514149788358715e-05, + "loss": 2.7263, + "step": 31823 + }, + { + "epoch": 1.9755416226953877, + "grad_norm": 0.14861252780401954, + "learning_rate": 3.151079420907674e-05, + "loss": 2.6955, + "step": 31824 + }, + { + "epoch": 1.9756036997951456, + "grad_norm": 0.17200850590199965, + "learning_rate": 3.1507438726257447e-05, + "loss": 2.8022, + "step": 31825 + }, + { + "epoch": 1.9756657768949035, + "grad_norm": 0.14617954325784802, + "learning_rate": 3.150408333991831e-05, + "loss": 2.7333, + "step": 31826 + }, + { + "epoch": 1.9757278539946612, + "grad_norm": 0.154159605556168, + "learning_rate": 3.150072805007688e-05, + "loss": 2.7309, + "step": 31827 + }, + { + "epoch": 1.9757899310944191, + "grad_norm": 0.14874969453339507, + "learning_rate": 3.149737285675063e-05, + "loss": 2.731, + "step": 31828 + }, + { + "epoch": 1.975852008194177, + "grad_norm": 0.1610006079040329, + "learning_rate": 3.1494017759957086e-05, + "loss": 2.6735, + "step": 31829 + }, + { + "epoch": 1.975914085293935, + "grad_norm": 0.14755116159339923, + "learning_rate": 3.149066275971373e-05, + "loss": 2.7106, + "step": 31830 + }, + { + "epoch": 1.975976162393693, + "grad_norm": 0.14695359420093057, + "learning_rate": 3.14873078560381e-05, + "loss": 2.7386, + "step": 31831 + }, + { + "epoch": 1.9760382394934508, + "grad_norm": 0.13920196440573965, + "learning_rate": 3.1483953048947666e-05, + "loss": 2.7114, + "step": 31832 + }, + { + "epoch": 1.9761003165932087, + "grad_norm": 0.15268715643209746, + "learning_rate": 3.148059833845995e-05, + "loss": 2.8401, + "step": 31833 + }, + { + "epoch": 1.9761623936929666, + "grad_norm": 0.1461958204786626, + "learning_rate": 3.147724372459244e-05, + "loss": 2.7577, + "step": 31834 + }, + { + "epoch": 1.9762244707927246, + "grad_norm": 0.1447233392765799, + "learning_rate": 3.1473889207362665e-05, + "loss": 2.6381, + "step": 31835 + }, + { + "epoch": 1.9762865478924825, + "grad_norm": 0.16904971584736497, + "learning_rate": 3.147053478678809e-05, + "loss": 2.8063, + "step": 31836 + }, + { + "epoch": 1.9763486249922404, + "grad_norm": 0.13699507037790457, + "learning_rate": 3.146718046288623e-05, + "loss": 2.7818, + "step": 31837 + }, + { + "epoch": 1.9764107020919983, + "grad_norm": 0.1472956381450765, + "learning_rate": 3.1463826235674584e-05, + "loss": 2.7892, + "step": 31838 + }, + { + "epoch": 1.9764727791917562, + "grad_norm": 0.15384912713638202, + "learning_rate": 3.146047210517067e-05, + "loss": 2.7476, + "step": 31839 + }, + { + "epoch": 1.9765348562915142, + "grad_norm": 0.14164230812785436, + "learning_rate": 3.1457118071391965e-05, + "loss": 2.7872, + "step": 31840 + }, + { + "epoch": 1.976596933391272, + "grad_norm": 0.16657562053884875, + "learning_rate": 3.145376413435599e-05, + "loss": 2.7648, + "step": 31841 + }, + { + "epoch": 1.97665901049103, + "grad_norm": 0.15152027633292223, + "learning_rate": 3.145041029408022e-05, + "loss": 2.7022, + "step": 31842 + }, + { + "epoch": 1.976721087590788, + "grad_norm": 0.17078859341356611, + "learning_rate": 3.144705655058217e-05, + "loss": 2.852, + "step": 31843 + }, + { + "epoch": 1.9767831646905456, + "grad_norm": 0.17132150871210283, + "learning_rate": 3.144370290387932e-05, + "loss": 2.775, + "step": 31844 + }, + { + "epoch": 1.9768452417903035, + "grad_norm": 0.1484858158181763, + "learning_rate": 3.144034935398918e-05, + "loss": 2.7849, + "step": 31845 + }, + { + "epoch": 1.9769073188900614, + "grad_norm": 0.14778849870614857, + "learning_rate": 3.1436995900929244e-05, + "loss": 2.7206, + "step": 31846 + }, + { + "epoch": 1.9769693959898194, + "grad_norm": 0.14948505385587532, + "learning_rate": 3.1433642544717014e-05, + "loss": 2.7973, + "step": 31847 + }, + { + "epoch": 1.9770314730895773, + "grad_norm": 0.14714065404589688, + "learning_rate": 3.143028928536996e-05, + "loss": 2.7068, + "step": 31848 + }, + { + "epoch": 1.9770935501893352, + "grad_norm": 0.14494772138204337, + "learning_rate": 3.142693612290561e-05, + "loss": 2.7651, + "step": 31849 + }, + { + "epoch": 1.9771556272890929, + "grad_norm": 0.14553308587475725, + "learning_rate": 3.1423583057341456e-05, + "loss": 2.693, + "step": 31850 + }, + { + "epoch": 1.9772177043888508, + "grad_norm": 0.16396604268292875, + "learning_rate": 3.142023008869495e-05, + "loss": 2.7061, + "step": 31851 + }, + { + "epoch": 1.9772797814886087, + "grad_norm": 0.16623804675956813, + "learning_rate": 3.141687721698363e-05, + "loss": 2.75, + "step": 31852 + }, + { + "epoch": 1.9773418585883666, + "grad_norm": 0.2209585197490985, + "learning_rate": 3.1413524442224964e-05, + "loss": 2.6945, + "step": 31853 + }, + { + "epoch": 1.9774039356881246, + "grad_norm": 0.158048713350942, + "learning_rate": 3.141017176443646e-05, + "loss": 2.8544, + "step": 31854 + }, + { + "epoch": 1.9774660127878825, + "grad_norm": 0.14383316277793698, + "learning_rate": 3.14068191836356e-05, + "loss": 2.7385, + "step": 31855 + }, + { + "epoch": 1.9775280898876404, + "grad_norm": 0.1439966030175486, + "learning_rate": 3.1403466699839884e-05, + "loss": 2.7176, + "step": 31856 + }, + { + "epoch": 1.9775901669873983, + "grad_norm": 0.21677733668358168, + "learning_rate": 3.140011431306679e-05, + "loss": 2.8046, + "step": 31857 + }, + { + "epoch": 1.9776522440871562, + "grad_norm": 0.15342942989670963, + "learning_rate": 3.139676202333381e-05, + "loss": 2.7647, + "step": 31858 + }, + { + "epoch": 1.9777143211869141, + "grad_norm": 0.18326903095149022, + "learning_rate": 3.1393409830658446e-05, + "loss": 2.7893, + "step": 31859 + }, + { + "epoch": 1.977776398286672, + "grad_norm": 0.1520477255414342, + "learning_rate": 3.1390057735058185e-05, + "loss": 2.7605, + "step": 31860 + }, + { + "epoch": 1.97783847538643, + "grad_norm": 0.14193334842701102, + "learning_rate": 3.1386705736550506e-05, + "loss": 2.7895, + "step": 31861 + }, + { + "epoch": 1.977900552486188, + "grad_norm": 0.1727825986282088, + "learning_rate": 3.13833538351529e-05, + "loss": 2.7742, + "step": 31862 + }, + { + "epoch": 1.9779626295859458, + "grad_norm": 0.151366504544594, + "learning_rate": 3.138000203088286e-05, + "loss": 2.7781, + "step": 31863 + }, + { + "epoch": 1.9780247066857037, + "grad_norm": 0.14729688373370198, + "learning_rate": 3.1376650323757857e-05, + "loss": 2.8274, + "step": 31864 + }, + { + "epoch": 1.9780867837854617, + "grad_norm": 0.19316305925901375, + "learning_rate": 3.1373298713795404e-05, + "loss": 2.7731, + "step": 31865 + }, + { + "epoch": 1.9781488608852196, + "grad_norm": 0.16191053260112118, + "learning_rate": 3.136994720101296e-05, + "loss": 2.6666, + "step": 31866 + }, + { + "epoch": 1.9782109379849775, + "grad_norm": 0.15755331548624085, + "learning_rate": 3.136659578542804e-05, + "loss": 2.7267, + "step": 31867 + }, + { + "epoch": 1.9782730150847352, + "grad_norm": 0.18598294464102275, + "learning_rate": 3.13632444670581e-05, + "loss": 2.8186, + "step": 31868 + }, + { + "epoch": 1.978335092184493, + "grad_norm": 0.15320023615173983, + "learning_rate": 3.1359893245920646e-05, + "loss": 2.7977, + "step": 31869 + }, + { + "epoch": 1.978397169284251, + "grad_norm": 0.17377919676462428, + "learning_rate": 3.135654212203314e-05, + "loss": 2.8153, + "step": 31870 + }, + { + "epoch": 1.978459246384009, + "grad_norm": 0.15868571931479195, + "learning_rate": 3.135319109541308e-05, + "loss": 2.8717, + "step": 31871 + }, + { + "epoch": 1.9785213234837669, + "grad_norm": 0.16667625032130187, + "learning_rate": 3.134984016607797e-05, + "loss": 2.7983, + "step": 31872 + }, + { + "epoch": 1.9785834005835248, + "grad_norm": 0.15207303275755346, + "learning_rate": 3.1346489334045256e-05, + "loss": 2.7816, + "step": 31873 + }, + { + "epoch": 1.9786454776832825, + "grad_norm": 0.23282977782532494, + "learning_rate": 3.1343138599332445e-05, + "loss": 2.7968, + "step": 31874 + }, + { + "epoch": 1.9787075547830404, + "grad_norm": 0.16470353594961126, + "learning_rate": 3.1339787961957014e-05, + "loss": 2.7155, + "step": 31875 + }, + { + "epoch": 1.9787696318827983, + "grad_norm": 0.1462877861751602, + "learning_rate": 3.133643742193641e-05, + "loss": 2.6987, + "step": 31876 + }, + { + "epoch": 1.9788317089825562, + "grad_norm": 0.1428722913461113, + "learning_rate": 3.133308697928817e-05, + "loss": 2.7337, + "step": 31877 + }, + { + "epoch": 1.9788937860823141, + "grad_norm": 0.15440997071252494, + "learning_rate": 3.132973663402974e-05, + "loss": 2.7716, + "step": 31878 + }, + { + "epoch": 1.978955863182072, + "grad_norm": 0.14022755088247418, + "learning_rate": 3.1326386386178605e-05, + "loss": 2.5999, + "step": 31879 + }, + { + "epoch": 1.97901794028183, + "grad_norm": 0.15060443047200356, + "learning_rate": 3.1323036235752244e-05, + "loss": 2.8147, + "step": 31880 + }, + { + "epoch": 1.979080017381588, + "grad_norm": 0.1418301086257647, + "learning_rate": 3.131968618276815e-05, + "loss": 2.7083, + "step": 31881 + }, + { + "epoch": 1.9791420944813458, + "grad_norm": 0.14845716195639633, + "learning_rate": 3.131633622724377e-05, + "loss": 2.7799, + "step": 31882 + }, + { + "epoch": 1.9792041715811037, + "grad_norm": 0.14288627298776185, + "learning_rate": 3.131298636919661e-05, + "loss": 2.6811, + "step": 31883 + }, + { + "epoch": 1.9792662486808617, + "grad_norm": 0.17174516997875303, + "learning_rate": 3.1309636608644124e-05, + "loss": 2.681, + "step": 31884 + }, + { + "epoch": 1.9793283257806196, + "grad_norm": 0.14488864219458195, + "learning_rate": 3.130628694560382e-05, + "loss": 2.7427, + "step": 31885 + }, + { + "epoch": 1.9793904028803775, + "grad_norm": 0.14451087988537673, + "learning_rate": 3.130293738009314e-05, + "loss": 2.7413, + "step": 31886 + }, + { + "epoch": 1.9794524799801354, + "grad_norm": 0.21310866852662863, + "learning_rate": 3.129958791212958e-05, + "loss": 2.779, + "step": 31887 + }, + { + "epoch": 1.9795145570798933, + "grad_norm": 0.1416412626716075, + "learning_rate": 3.129623854173062e-05, + "loss": 2.7516, + "step": 31888 + }, + { + "epoch": 1.9795766341796512, + "grad_norm": 0.15520545382890163, + "learning_rate": 3.1292889268913706e-05, + "loss": 2.7024, + "step": 31889 + }, + { + "epoch": 1.9796387112794092, + "grad_norm": 0.15806465211114376, + "learning_rate": 3.128954009369634e-05, + "loss": 2.5904, + "step": 31890 + }, + { + "epoch": 1.979700788379167, + "grad_norm": 0.15268937055181336, + "learning_rate": 3.128619101609598e-05, + "loss": 2.7148, + "step": 31891 + }, + { + "epoch": 1.9797628654789248, + "grad_norm": 0.14279179044145254, + "learning_rate": 3.128284203613011e-05, + "loss": 2.7311, + "step": 31892 + }, + { + "epoch": 1.9798249425786827, + "grad_norm": 0.1613559622682121, + "learning_rate": 3.127949315381619e-05, + "loss": 2.8087, + "step": 31893 + }, + { + "epoch": 1.9798870196784406, + "grad_norm": 0.15436391339859337, + "learning_rate": 3.1276144369171704e-05, + "loss": 2.7595, + "step": 31894 + }, + { + "epoch": 1.9799490967781985, + "grad_norm": 0.1666170781592608, + "learning_rate": 3.1272795682214115e-05, + "loss": 2.6508, + "step": 31895 + }, + { + "epoch": 1.9800111738779564, + "grad_norm": 0.1435302077720039, + "learning_rate": 3.12694470929609e-05, + "loss": 2.7537, + "step": 31896 + }, + { + "epoch": 1.9800732509777144, + "grad_norm": 0.14504429255804988, + "learning_rate": 3.1266098601429534e-05, + "loss": 2.7814, + "step": 31897 + }, + { + "epoch": 1.980135328077472, + "grad_norm": 0.14728700093114624, + "learning_rate": 3.126275020763747e-05, + "loss": 2.7611, + "step": 31898 + }, + { + "epoch": 1.98019740517723, + "grad_norm": 0.1482670143758629, + "learning_rate": 3.1259401911602184e-05, + "loss": 2.7496, + "step": 31899 + }, + { + "epoch": 1.980259482276988, + "grad_norm": 0.14980963782097628, + "learning_rate": 3.125605371334115e-05, + "loss": 2.7324, + "step": 31900 + }, + { + "epoch": 1.9803215593767458, + "grad_norm": 0.15372730105013735, + "learning_rate": 3.125270561287184e-05, + "loss": 2.7072, + "step": 31901 + }, + { + "epoch": 1.9803836364765037, + "grad_norm": 0.15007871013092403, + "learning_rate": 3.1249357610211705e-05, + "loss": 2.8012, + "step": 31902 + }, + { + "epoch": 1.9804457135762616, + "grad_norm": 0.14160196370035094, + "learning_rate": 3.1246009705378217e-05, + "loss": 2.6929, + "step": 31903 + }, + { + "epoch": 1.9805077906760196, + "grad_norm": 0.14586541058639635, + "learning_rate": 3.1242661898388865e-05, + "loss": 2.8418, + "step": 31904 + }, + { + "epoch": 1.9805698677757775, + "grad_norm": 0.144072377710293, + "learning_rate": 3.123931418926109e-05, + "loss": 2.6585, + "step": 31905 + }, + { + "epoch": 1.9806319448755354, + "grad_norm": 0.16671337588263851, + "learning_rate": 3.1235966578012374e-05, + "loss": 2.7435, + "step": 31906 + }, + { + "epoch": 1.9806940219752933, + "grad_norm": 0.15174439099635742, + "learning_rate": 3.123261906466017e-05, + "loss": 2.8261, + "step": 31907 + }, + { + "epoch": 1.9807560990750512, + "grad_norm": 0.1618862512445199, + "learning_rate": 3.122927164922196e-05, + "loss": 2.8104, + "step": 31908 + }, + { + "epoch": 1.9808181761748092, + "grad_norm": 0.1664966514550675, + "learning_rate": 3.1225924331715186e-05, + "loss": 2.801, + "step": 31909 + }, + { + "epoch": 1.980880253274567, + "grad_norm": 0.1776708761953195, + "learning_rate": 3.122257711215733e-05, + "loss": 2.8117, + "step": 31910 + }, + { + "epoch": 1.980942330374325, + "grad_norm": 0.1498782234791761, + "learning_rate": 3.1219229990565845e-05, + "loss": 2.7209, + "step": 31911 + }, + { + "epoch": 1.981004407474083, + "grad_norm": 0.1533706303050197, + "learning_rate": 3.1215882966958194e-05, + "loss": 2.8721, + "step": 31912 + }, + { + "epoch": 1.9810664845738408, + "grad_norm": 0.16235905688014843, + "learning_rate": 3.121253604135185e-05, + "loss": 2.7729, + "step": 31913 + }, + { + "epoch": 1.9811285616735987, + "grad_norm": 0.14675451800436085, + "learning_rate": 3.1209189213764244e-05, + "loss": 2.6821, + "step": 31914 + }, + { + "epoch": 1.9811906387733567, + "grad_norm": 0.1456625390989722, + "learning_rate": 3.120584248421289e-05, + "loss": 2.7184, + "step": 31915 + }, + { + "epoch": 1.9812527158731144, + "grad_norm": 0.20264910005178566, + "learning_rate": 3.1202495852715194e-05, + "loss": 2.7694, + "step": 31916 + }, + { + "epoch": 1.9813147929728723, + "grad_norm": 0.14440699339474897, + "learning_rate": 3.119914931928865e-05, + "loss": 2.8057, + "step": 31917 + }, + { + "epoch": 1.9813768700726302, + "grad_norm": 0.1505795011122032, + "learning_rate": 3.1195802883950705e-05, + "loss": 2.7601, + "step": 31918 + }, + { + "epoch": 1.981438947172388, + "grad_norm": 0.16814320398356958, + "learning_rate": 3.119245654671883e-05, + "loss": 2.768, + "step": 31919 + }, + { + "epoch": 1.981501024272146, + "grad_norm": 0.14317618314844674, + "learning_rate": 3.118911030761046e-05, + "loss": 2.6594, + "step": 31920 + }, + { + "epoch": 1.981563101371904, + "grad_norm": 0.14788528042792126, + "learning_rate": 3.118576416664308e-05, + "loss": 2.65, + "step": 31921 + }, + { + "epoch": 1.9816251784716616, + "grad_norm": 0.15872613380157563, + "learning_rate": 3.1182418123834134e-05, + "loss": 2.735, + "step": 31922 + }, + { + "epoch": 1.9816872555714196, + "grad_norm": 0.14766430883264436, + "learning_rate": 3.117907217920107e-05, + "loss": 2.8304, + "step": 31923 + }, + { + "epoch": 1.9817493326711775, + "grad_norm": 0.14068398298310064, + "learning_rate": 3.117572633276137e-05, + "loss": 2.7508, + "step": 31924 + }, + { + "epoch": 1.9818114097709354, + "grad_norm": 0.1450307629792236, + "learning_rate": 3.117238058453247e-05, + "loss": 2.7546, + "step": 31925 + }, + { + "epoch": 1.9818734868706933, + "grad_norm": 0.14729200880648502, + "learning_rate": 3.116903493453183e-05, + "loss": 2.6834, + "step": 31926 + }, + { + "epoch": 1.9819355639704512, + "grad_norm": 0.14818341066251872, + "learning_rate": 3.11656893827769e-05, + "loss": 2.7442, + "step": 31927 + }, + { + "epoch": 1.9819976410702091, + "grad_norm": 0.14297687606439463, + "learning_rate": 3.1162343929285155e-05, + "loss": 2.729, + "step": 31928 + }, + { + "epoch": 1.982059718169967, + "grad_norm": 0.1680875624131108, + "learning_rate": 3.1158998574074024e-05, + "loss": 2.6646, + "step": 31929 + }, + { + "epoch": 1.982121795269725, + "grad_norm": 0.16091324266933377, + "learning_rate": 3.115565331716098e-05, + "loss": 2.7459, + "step": 31930 + }, + { + "epoch": 1.982183872369483, + "grad_norm": 0.15525985754356644, + "learning_rate": 3.115230815856345e-05, + "loss": 2.8402, + "step": 31931 + }, + { + "epoch": 1.9822459494692408, + "grad_norm": 0.17191364655794333, + "learning_rate": 3.114896309829892e-05, + "loss": 2.8048, + "step": 31932 + }, + { + "epoch": 1.9823080265689987, + "grad_norm": 0.1523117190702202, + "learning_rate": 3.114561813638481e-05, + "loss": 2.6765, + "step": 31933 + }, + { + "epoch": 1.9823701036687567, + "grad_norm": 0.16626074030250165, + "learning_rate": 3.1142273272838604e-05, + "loss": 2.7143, + "step": 31934 + }, + { + "epoch": 1.9824321807685146, + "grad_norm": 0.15524013929986993, + "learning_rate": 3.1138928507677726e-05, + "loss": 2.7663, + "step": 31935 + }, + { + "epoch": 1.9824942578682725, + "grad_norm": 0.1557819448989322, + "learning_rate": 3.113558384091964e-05, + "loss": 2.7262, + "step": 31936 + }, + { + "epoch": 1.9825563349680304, + "grad_norm": 0.24491024780661125, + "learning_rate": 3.113223927258179e-05, + "loss": 2.6941, + "step": 31937 + }, + { + "epoch": 1.9826184120677883, + "grad_norm": 0.226670214928233, + "learning_rate": 3.1128894802681634e-05, + "loss": 2.8236, + "step": 31938 + }, + { + "epoch": 1.9826804891675462, + "grad_norm": 0.14681937376725085, + "learning_rate": 3.1125550431236605e-05, + "loss": 2.691, + "step": 31939 + }, + { + "epoch": 1.982742566267304, + "grad_norm": 0.14123508120340306, + "learning_rate": 3.112220615826418e-05, + "loss": 2.7543, + "step": 31940 + }, + { + "epoch": 1.9828046433670619, + "grad_norm": 0.17165664327240635, + "learning_rate": 3.1118861983781766e-05, + "loss": 2.7632, + "step": 31941 + }, + { + "epoch": 1.9828667204668198, + "grad_norm": 0.194220954578682, + "learning_rate": 3.1115517907806853e-05, + "loss": 2.7601, + "step": 31942 + }, + { + "epoch": 1.9829287975665777, + "grad_norm": 0.1577077988034323, + "learning_rate": 3.111217393035685e-05, + "loss": 2.7443, + "step": 31943 + }, + { + "epoch": 1.9829908746663356, + "grad_norm": 0.17372271738670378, + "learning_rate": 3.110883005144923e-05, + "loss": 2.6533, + "step": 31944 + }, + { + "epoch": 1.9830529517660935, + "grad_norm": 0.14391626507497454, + "learning_rate": 3.110548627110142e-05, + "loss": 2.6972, + "step": 31945 + }, + { + "epoch": 1.9831150288658512, + "grad_norm": 0.15304130631695304, + "learning_rate": 3.1102142589330894e-05, + "loss": 2.7113, + "step": 31946 + }, + { + "epoch": 1.9831771059656091, + "grad_norm": 0.162197445305433, + "learning_rate": 3.1098799006155075e-05, + "loss": 2.6673, + "step": 31947 + }, + { + "epoch": 1.983239183065367, + "grad_norm": 0.1498110303567982, + "learning_rate": 3.10954555215914e-05, + "loss": 2.7107, + "step": 31948 + }, + { + "epoch": 1.983301260165125, + "grad_norm": 0.14732035826422232, + "learning_rate": 3.109211213565733e-05, + "loss": 2.7071, + "step": 31949 + }, + { + "epoch": 1.983363337264883, + "grad_norm": 0.15367372442857646, + "learning_rate": 3.108876884837029e-05, + "loss": 2.7034, + "step": 31950 + }, + { + "epoch": 1.9834254143646408, + "grad_norm": 0.1393330286640486, + "learning_rate": 3.108542565974774e-05, + "loss": 2.7878, + "step": 31951 + }, + { + "epoch": 1.9834874914643987, + "grad_norm": 0.13529726565985598, + "learning_rate": 3.1082082569807116e-05, + "loss": 2.7333, + "step": 31952 + }, + { + "epoch": 1.9835495685641567, + "grad_norm": 0.14292136120894128, + "learning_rate": 3.1078739578565866e-05, + "loss": 2.8315, + "step": 31953 + }, + { + "epoch": 1.9836116456639146, + "grad_norm": 0.15027804309876294, + "learning_rate": 3.1075396686041406e-05, + "loss": 2.7837, + "step": 31954 + }, + { + "epoch": 1.9836737227636725, + "grad_norm": 0.16030060844859198, + "learning_rate": 3.107205389225121e-05, + "loss": 2.7891, + "step": 31955 + }, + { + "epoch": 1.9837357998634304, + "grad_norm": 0.14596080672543021, + "learning_rate": 3.1068711197212695e-05, + "loss": 2.6499, + "step": 31956 + }, + { + "epoch": 1.9837978769631883, + "grad_norm": 0.14217443668176072, + "learning_rate": 3.106536860094332e-05, + "loss": 2.759, + "step": 31957 + }, + { + "epoch": 1.9838599540629462, + "grad_norm": 0.14332062589954744, + "learning_rate": 3.106202610346051e-05, + "loss": 2.7266, + "step": 31958 + }, + { + "epoch": 1.9839220311627042, + "grad_norm": 0.13768687120900966, + "learning_rate": 3.105868370478171e-05, + "loss": 2.7611, + "step": 31959 + }, + { + "epoch": 1.983984108262462, + "grad_norm": 0.1453721739157262, + "learning_rate": 3.105534140492436e-05, + "loss": 2.757, + "step": 31960 + }, + { + "epoch": 1.98404618536222, + "grad_norm": 0.13786429323997668, + "learning_rate": 3.105199920390588e-05, + "loss": 2.7375, + "step": 31961 + }, + { + "epoch": 1.984108262461978, + "grad_norm": 0.1428469970678736, + "learning_rate": 3.104865710174373e-05, + "loss": 2.6678, + "step": 31962 + }, + { + "epoch": 1.9841703395617358, + "grad_norm": 0.16053310692589257, + "learning_rate": 3.104531509845532e-05, + "loss": 2.7787, + "step": 31963 + }, + { + "epoch": 1.9842324166614935, + "grad_norm": 0.16041149757053907, + "learning_rate": 3.1041973194058126e-05, + "loss": 2.777, + "step": 31964 + }, + { + "epoch": 1.9842944937612514, + "grad_norm": 0.14915318246872414, + "learning_rate": 3.103863138856954e-05, + "loss": 2.7386, + "step": 31965 + }, + { + "epoch": 1.9843565708610094, + "grad_norm": 0.15539131848126342, + "learning_rate": 3.103528968200703e-05, + "loss": 2.8135, + "step": 31966 + }, + { + "epoch": 1.9844186479607673, + "grad_norm": 0.1453565156580173, + "learning_rate": 3.103194807438801e-05, + "loss": 2.7678, + "step": 31967 + }, + { + "epoch": 1.9844807250605252, + "grad_norm": 0.18004979016189296, + "learning_rate": 3.10286065657299e-05, + "loss": 2.7575, + "step": 31968 + }, + { + "epoch": 1.9845428021602831, + "grad_norm": 0.15458861065563748, + "learning_rate": 3.102526515605019e-05, + "loss": 2.7452, + "step": 31969 + }, + { + "epoch": 1.9846048792600408, + "grad_norm": 0.1437353696841575, + "learning_rate": 3.102192384536626e-05, + "loss": 2.6905, + "step": 31970 + }, + { + "epoch": 1.9846669563597987, + "grad_norm": 0.14011463961639004, + "learning_rate": 3.1018582633695576e-05, + "loss": 2.7349, + "step": 31971 + }, + { + "epoch": 1.9847290334595566, + "grad_norm": 0.1431769246858745, + "learning_rate": 3.101524152105555e-05, + "loss": 2.6982, + "step": 31972 + }, + { + "epoch": 1.9847911105593146, + "grad_norm": 0.15566750071520474, + "learning_rate": 3.101190050746361e-05, + "loss": 2.7282, + "step": 31973 + }, + { + "epoch": 1.9848531876590725, + "grad_norm": 0.16740069699971222, + "learning_rate": 3.1008559592937206e-05, + "loss": 2.7674, + "step": 31974 + }, + { + "epoch": 1.9849152647588304, + "grad_norm": 0.14839889626709468, + "learning_rate": 3.100521877749374e-05, + "loss": 2.7104, + "step": 31975 + }, + { + "epoch": 1.9849773418585883, + "grad_norm": 0.1451165680596132, + "learning_rate": 3.100187806115067e-05, + "loss": 2.7836, + "step": 31976 + }, + { + "epoch": 1.9850394189583462, + "grad_norm": 0.1397104925715864, + "learning_rate": 3.0998537443925407e-05, + "loss": 2.7292, + "step": 31977 + }, + { + "epoch": 1.9851014960581042, + "grad_norm": 0.15586384501536546, + "learning_rate": 3.0995196925835404e-05, + "loss": 2.8661, + "step": 31978 + }, + { + "epoch": 1.985163573157862, + "grad_norm": 0.1458041227127711, + "learning_rate": 3.099185650689805e-05, + "loss": 2.6875, + "step": 31979 + }, + { + "epoch": 1.98522565025762, + "grad_norm": 0.13481075111841137, + "learning_rate": 3.098851618713081e-05, + "loss": 2.7904, + "step": 31980 + }, + { + "epoch": 1.985287727357378, + "grad_norm": 0.1438783707364717, + "learning_rate": 3.0985175966551086e-05, + "loss": 2.7622, + "step": 31981 + }, + { + "epoch": 1.9853498044571358, + "grad_norm": 0.1431772369110109, + "learning_rate": 3.098183584517632e-05, + "loss": 2.8111, + "step": 31982 + }, + { + "epoch": 1.9854118815568937, + "grad_norm": 0.14128231365737517, + "learning_rate": 3.0978495823023935e-05, + "loss": 2.8386, + "step": 31983 + }, + { + "epoch": 1.9854739586566517, + "grad_norm": 0.15348035513992184, + "learning_rate": 3.097515590011135e-05, + "loss": 2.6103, + "step": 31984 + }, + { + "epoch": 1.9855360357564096, + "grad_norm": 0.1717670871346544, + "learning_rate": 3.0971816076456004e-05, + "loss": 2.7245, + "step": 31985 + }, + { + "epoch": 1.9855981128561675, + "grad_norm": 0.14409098067483897, + "learning_rate": 3.0968476352075304e-05, + "loss": 2.7647, + "step": 31986 + }, + { + "epoch": 1.9856601899559254, + "grad_norm": 0.18003387859195463, + "learning_rate": 3.09651367269867e-05, + "loss": 2.7319, + "step": 31987 + }, + { + "epoch": 1.985722267055683, + "grad_norm": 0.14767028777186034, + "learning_rate": 3.096179720120758e-05, + "loss": 2.7692, + "step": 31988 + }, + { + "epoch": 1.985784344155441, + "grad_norm": 0.1402723075122196, + "learning_rate": 3.095845777475539e-05, + "loss": 2.7224, + "step": 31989 + }, + { + "epoch": 1.985846421255199, + "grad_norm": 0.13604608999814238, + "learning_rate": 3.095511844764755e-05, + "loss": 2.7295, + "step": 31990 + }, + { + "epoch": 1.9859084983549569, + "grad_norm": 0.13660529060937923, + "learning_rate": 3.0951779219901476e-05, + "loss": 2.7004, + "step": 31991 + }, + { + "epoch": 1.9859705754547148, + "grad_norm": 0.14107155100341162, + "learning_rate": 3.0948440091534595e-05, + "loss": 2.7765, + "step": 31992 + }, + { + "epoch": 1.9860326525544727, + "grad_norm": 0.15541778908146128, + "learning_rate": 3.094510106256433e-05, + "loss": 2.6869, + "step": 31993 + }, + { + "epoch": 1.9860947296542304, + "grad_norm": 0.15013392946429557, + "learning_rate": 3.09417621330081e-05, + "loss": 2.7388, + "step": 31994 + }, + { + "epoch": 1.9861568067539883, + "grad_norm": 0.1395987709129641, + "learning_rate": 3.093842330288333e-05, + "loss": 2.5302, + "step": 31995 + }, + { + "epoch": 1.9862188838537462, + "grad_norm": 0.15588221179519682, + "learning_rate": 3.093508457220742e-05, + "loss": 2.6625, + "step": 31996 + }, + { + "epoch": 1.9862809609535041, + "grad_norm": 0.13836013650496132, + "learning_rate": 3.0931745940997806e-05, + "loss": 2.6979, + "step": 31997 + }, + { + "epoch": 1.986343038053262, + "grad_norm": 0.13600203801742275, + "learning_rate": 3.0928407409271906e-05, + "loss": 2.7074, + "step": 31998 + }, + { + "epoch": 1.98640511515302, + "grad_norm": 0.1463316375445257, + "learning_rate": 3.092506897704712e-05, + "loss": 2.7847, + "step": 31999 + }, + { + "epoch": 1.986467192252778, + "grad_norm": 0.15107114822038958, + "learning_rate": 3.09217306443409e-05, + "loss": 2.7228, + "step": 32000 + }, + { + "epoch": 1.9865292693525358, + "grad_norm": 0.14270189217373555, + "learning_rate": 3.091839241117061e-05, + "loss": 2.8301, + "step": 32001 + }, + { + "epoch": 1.9865913464522937, + "grad_norm": 0.13618792618189843, + "learning_rate": 3.091505427755372e-05, + "loss": 2.7643, + "step": 32002 + }, + { + "epoch": 1.9866534235520517, + "grad_norm": 0.16467379038820942, + "learning_rate": 3.0911716243507635e-05, + "loss": 2.729, + "step": 32003 + }, + { + "epoch": 1.9867155006518096, + "grad_norm": 0.1636960501103716, + "learning_rate": 3.0908378309049746e-05, + "loss": 2.7244, + "step": 32004 + }, + { + "epoch": 1.9867775777515675, + "grad_norm": 0.1651857728474532, + "learning_rate": 3.0905040474197497e-05, + "loss": 2.7168, + "step": 32005 + }, + { + "epoch": 1.9868396548513254, + "grad_norm": 0.1629864052035436, + "learning_rate": 3.090170273896827e-05, + "loss": 2.759, + "step": 32006 + }, + { + "epoch": 1.9869017319510833, + "grad_norm": 0.14724359853532615, + "learning_rate": 3.089836510337951e-05, + "loss": 2.7759, + "step": 32007 + }, + { + "epoch": 1.9869638090508412, + "grad_norm": 0.16124479275250225, + "learning_rate": 3.0895027567448603e-05, + "loss": 2.8047, + "step": 32008 + }, + { + "epoch": 1.9870258861505992, + "grad_norm": 0.14415847543734822, + "learning_rate": 3.089169013119299e-05, + "loss": 2.7196, + "step": 32009 + }, + { + "epoch": 1.987087963250357, + "grad_norm": 0.14907756012586745, + "learning_rate": 3.0888352794630064e-05, + "loss": 2.755, + "step": 32010 + }, + { + "epoch": 1.987150040350115, + "grad_norm": 0.1601580352218499, + "learning_rate": 3.0885015557777234e-05, + "loss": 2.758, + "step": 32011 + }, + { + "epoch": 1.9872121174498727, + "grad_norm": 0.14977113280817134, + "learning_rate": 3.088167842065193e-05, + "loss": 2.7601, + "step": 32012 + }, + { + "epoch": 1.9872741945496306, + "grad_norm": 0.1664379750039789, + "learning_rate": 3.087834138327154e-05, + "loss": 2.692, + "step": 32013 + }, + { + "epoch": 1.9873362716493885, + "grad_norm": 0.1500325150316297, + "learning_rate": 3.0875004445653486e-05, + "loss": 2.7658, + "step": 32014 + }, + { + "epoch": 1.9873983487491464, + "grad_norm": 0.16563868083771885, + "learning_rate": 3.087166760781518e-05, + "loss": 2.6958, + "step": 32015 + }, + { + "epoch": 1.9874604258489044, + "grad_norm": 0.14401480189217852, + "learning_rate": 3.086833086977403e-05, + "loss": 2.7273, + "step": 32016 + }, + { + "epoch": 1.9875225029486623, + "grad_norm": 0.15864312244344211, + "learning_rate": 3.086499423154743e-05, + "loss": 2.7291, + "step": 32017 + }, + { + "epoch": 1.98758458004842, + "grad_norm": 0.14104774959830157, + "learning_rate": 3.086165769315281e-05, + "loss": 2.6914, + "step": 32018 + }, + { + "epoch": 1.987646657148178, + "grad_norm": 0.1504222972761317, + "learning_rate": 3.0858321254607567e-05, + "loss": 2.7557, + "step": 32019 + }, + { + "epoch": 1.9877087342479358, + "grad_norm": 0.14774202247325183, + "learning_rate": 3.0854984915929114e-05, + "loss": 2.6394, + "step": 32020 + }, + { + "epoch": 1.9877708113476937, + "grad_norm": 0.13864598905706033, + "learning_rate": 3.085164867713485e-05, + "loss": 2.6446, + "step": 32021 + }, + { + "epoch": 1.9878328884474517, + "grad_norm": 0.14973005527091823, + "learning_rate": 3.0848312538242174e-05, + "loss": 2.7776, + "step": 32022 + }, + { + "epoch": 1.9878949655472096, + "grad_norm": 0.15612482182658272, + "learning_rate": 3.084497649926852e-05, + "loss": 2.7921, + "step": 32023 + }, + { + "epoch": 1.9879570426469675, + "grad_norm": 0.17586826184229182, + "learning_rate": 3.084164056023125e-05, + "loss": 2.7666, + "step": 32024 + }, + { + "epoch": 1.9880191197467254, + "grad_norm": 0.18092294650934596, + "learning_rate": 3.08383047211478e-05, + "loss": 2.7255, + "step": 32025 + }, + { + "epoch": 1.9880811968464833, + "grad_norm": 0.15502760277971406, + "learning_rate": 3.083496898203557e-05, + "loss": 2.7397, + "step": 32026 + }, + { + "epoch": 1.9881432739462412, + "grad_norm": 0.14821885343713306, + "learning_rate": 3.0831633342911966e-05, + "loss": 2.7589, + "step": 32027 + }, + { + "epoch": 1.9882053510459992, + "grad_norm": 0.1585986585280274, + "learning_rate": 3.082829780379437e-05, + "loss": 2.8291, + "step": 32028 + }, + { + "epoch": 1.988267428145757, + "grad_norm": 0.14634122847886147, + "learning_rate": 3.0824962364700206e-05, + "loss": 2.8129, + "step": 32029 + }, + { + "epoch": 1.988329505245515, + "grad_norm": 0.15828607292020005, + "learning_rate": 3.082162702564686e-05, + "loss": 2.7687, + "step": 32030 + }, + { + "epoch": 1.988391582345273, + "grad_norm": 0.16961027492177266, + "learning_rate": 3.0818291786651754e-05, + "loss": 2.7859, + "step": 32031 + }, + { + "epoch": 1.9884536594450308, + "grad_norm": 0.14815702417111445, + "learning_rate": 3.081495664773228e-05, + "loss": 2.7813, + "step": 32032 + }, + { + "epoch": 1.9885157365447887, + "grad_norm": 0.15104522920112226, + "learning_rate": 3.0811621608905815e-05, + "loss": 2.8001, + "step": 32033 + }, + { + "epoch": 1.9885778136445467, + "grad_norm": 0.14176261556292868, + "learning_rate": 3.080828667018978e-05, + "loss": 2.7718, + "step": 32034 + }, + { + "epoch": 1.9886398907443046, + "grad_norm": 0.1589303140320847, + "learning_rate": 3.080495183160158e-05, + "loss": 2.7623, + "step": 32035 + }, + { + "epoch": 1.9887019678440623, + "grad_norm": 0.1397068995034242, + "learning_rate": 3.08016170931586e-05, + "loss": 2.8166, + "step": 32036 + }, + { + "epoch": 1.9887640449438202, + "grad_norm": 0.14448939181240347, + "learning_rate": 3.079828245487826e-05, + "loss": 2.6969, + "step": 32037 + }, + { + "epoch": 1.9888261220435781, + "grad_norm": 0.1427349834606646, + "learning_rate": 3.0794947916777926e-05, + "loss": 2.6868, + "step": 32038 + }, + { + "epoch": 1.988888199143336, + "grad_norm": 0.15686166369412596, + "learning_rate": 3.079161347887502e-05, + "loss": 2.7805, + "step": 32039 + }, + { + "epoch": 1.988950276243094, + "grad_norm": 0.1607649215807427, + "learning_rate": 3.078827914118693e-05, + "loss": 2.7797, + "step": 32040 + }, + { + "epoch": 1.9890123533428519, + "grad_norm": 0.14147990031673094, + "learning_rate": 3.078494490373105e-05, + "loss": 2.7212, + "step": 32041 + }, + { + "epoch": 1.9890744304426096, + "grad_norm": 0.1454229299290406, + "learning_rate": 3.078161076652477e-05, + "loss": 2.7552, + "step": 32042 + }, + { + "epoch": 1.9891365075423675, + "grad_norm": 0.14564943142423756, + "learning_rate": 3.07782767295855e-05, + "loss": 2.7265, + "step": 32043 + }, + { + "epoch": 1.9891985846421254, + "grad_norm": 0.21404396659088692, + "learning_rate": 3.077494279293062e-05, + "loss": 2.7435, + "step": 32044 + }, + { + "epoch": 1.9892606617418833, + "grad_norm": 0.15563031336894428, + "learning_rate": 3.0771608956577533e-05, + "loss": 2.8568, + "step": 32045 + }, + { + "epoch": 1.9893227388416412, + "grad_norm": 0.14729701651156366, + "learning_rate": 3.076827522054364e-05, + "loss": 2.8041, + "step": 32046 + }, + { + "epoch": 1.9893848159413992, + "grad_norm": 0.15700740823002282, + "learning_rate": 3.076494158484631e-05, + "loss": 2.8177, + "step": 32047 + }, + { + "epoch": 1.989446893041157, + "grad_norm": 0.15002906984534153, + "learning_rate": 3.076160804950296e-05, + "loss": 2.7673, + "step": 32048 + }, + { + "epoch": 1.989508970140915, + "grad_norm": 0.1485460223475107, + "learning_rate": 3.0758274614530964e-05, + "loss": 2.6848, + "step": 32049 + }, + { + "epoch": 1.989571047240673, + "grad_norm": 0.18136229776942628, + "learning_rate": 3.075494127994772e-05, + "loss": 2.7049, + "step": 32050 + }, + { + "epoch": 1.9896331243404308, + "grad_norm": 0.15146998329715117, + "learning_rate": 3.075160804577062e-05, + "loss": 2.6536, + "step": 32051 + }, + { + "epoch": 1.9896952014401887, + "grad_norm": 0.14256227744878233, + "learning_rate": 3.074827491201706e-05, + "loss": 2.7207, + "step": 32052 + }, + { + "epoch": 1.9897572785399467, + "grad_norm": 0.19315560518116098, + "learning_rate": 3.0744941878704414e-05, + "loss": 2.6884, + "step": 32053 + }, + { + "epoch": 1.9898193556397046, + "grad_norm": 0.16694048588687807, + "learning_rate": 3.074160894585009e-05, + "loss": 2.6706, + "step": 32054 + }, + { + "epoch": 1.9898814327394625, + "grad_norm": 0.16365965704278745, + "learning_rate": 3.073827611347145e-05, + "loss": 2.7549, + "step": 32055 + }, + { + "epoch": 1.9899435098392204, + "grad_norm": 0.15732848968486324, + "learning_rate": 3.0734943381585924e-05, + "loss": 2.8196, + "step": 32056 + }, + { + "epoch": 1.9900055869389783, + "grad_norm": 0.14147857718381654, + "learning_rate": 3.073161075021087e-05, + "loss": 2.699, + "step": 32057 + }, + { + "epoch": 1.9900676640387363, + "grad_norm": 0.15649878246546736, + "learning_rate": 3.0728278219363666e-05, + "loss": 2.8314, + "step": 32058 + }, + { + "epoch": 1.9901297411384942, + "grad_norm": 0.2176394627814147, + "learning_rate": 3.072494578906172e-05, + "loss": 2.7596, + "step": 32059 + }, + { + "epoch": 1.9901918182382519, + "grad_norm": 0.16986830184726104, + "learning_rate": 3.072161345932241e-05, + "loss": 2.6479, + "step": 32060 + }, + { + "epoch": 1.9902538953380098, + "grad_norm": 0.15607274253012457, + "learning_rate": 3.0718281230163126e-05, + "loss": 2.8307, + "step": 32061 + }, + { + "epoch": 1.9903159724377677, + "grad_norm": 0.15194142230356175, + "learning_rate": 3.071494910160124e-05, + "loss": 2.8315, + "step": 32062 + }, + { + "epoch": 1.9903780495375256, + "grad_norm": 0.17406761623342198, + "learning_rate": 3.071161707365416e-05, + "loss": 2.7334, + "step": 32063 + }, + { + "epoch": 1.9904401266372835, + "grad_norm": 0.1599477247806128, + "learning_rate": 3.0708285146339244e-05, + "loss": 2.7655, + "step": 32064 + }, + { + "epoch": 1.9905022037370415, + "grad_norm": 0.161192547678445, + "learning_rate": 3.07049533196739e-05, + "loss": 2.8378, + "step": 32065 + }, + { + "epoch": 1.9905642808367991, + "grad_norm": 0.14908616856736667, + "learning_rate": 3.070162159367548e-05, + "loss": 2.7888, + "step": 32066 + }, + { + "epoch": 1.990626357936557, + "grad_norm": 0.14681630356726558, + "learning_rate": 3.0698289968361385e-05, + "loss": 2.7277, + "step": 32067 + }, + { + "epoch": 1.990688435036315, + "grad_norm": 0.14735866941360562, + "learning_rate": 3.069495844374901e-05, + "loss": 2.7408, + "step": 32068 + }, + { + "epoch": 1.990750512136073, + "grad_norm": 0.1505282936150366, + "learning_rate": 3.0691627019855716e-05, + "loss": 2.7485, + "step": 32069 + }, + { + "epoch": 1.9908125892358308, + "grad_norm": 0.16409622141012747, + "learning_rate": 3.068829569669891e-05, + "loss": 2.7699, + "step": 32070 + }, + { + "epoch": 1.9908746663355887, + "grad_norm": 0.17933177982231183, + "learning_rate": 3.068496447429594e-05, + "loss": 2.8225, + "step": 32071 + }, + { + "epoch": 1.9909367434353467, + "grad_norm": 0.14883301143684408, + "learning_rate": 3.06816333526642e-05, + "loss": 2.7638, + "step": 32072 + }, + { + "epoch": 1.9909988205351046, + "grad_norm": 0.14597024975918638, + "learning_rate": 3.067830233182107e-05, + "loss": 2.7124, + "step": 32073 + }, + { + "epoch": 1.9910608976348625, + "grad_norm": 0.1729953781813587, + "learning_rate": 3.0674971411783916e-05, + "loss": 2.7527, + "step": 32074 + }, + { + "epoch": 1.9911229747346204, + "grad_norm": 0.14225823298883458, + "learning_rate": 3.0671640592570144e-05, + "loss": 2.7337, + "step": 32075 + }, + { + "epoch": 1.9911850518343783, + "grad_norm": 0.17928476396600826, + "learning_rate": 3.06683098741971e-05, + "loss": 2.7583, + "step": 32076 + }, + { + "epoch": 1.9912471289341362, + "grad_norm": 0.15384137243339477, + "learning_rate": 3.066497925668219e-05, + "loss": 2.7249, + "step": 32077 + }, + { + "epoch": 1.9913092060338942, + "grad_norm": 0.1472045235396844, + "learning_rate": 3.066164874004277e-05, + "loss": 2.7356, + "step": 32078 + }, + { + "epoch": 1.991371283133652, + "grad_norm": 0.17626597012544465, + "learning_rate": 3.065831832429623e-05, + "loss": 2.7866, + "step": 32079 + }, + { + "epoch": 1.99143336023341, + "grad_norm": 0.13963369144932916, + "learning_rate": 3.0654988009459926e-05, + "loss": 2.8171, + "step": 32080 + }, + { + "epoch": 1.991495437333168, + "grad_norm": 0.14384009735695863, + "learning_rate": 3.065165779555126e-05, + "loss": 2.7265, + "step": 32081 + }, + { + "epoch": 1.9915575144329258, + "grad_norm": 0.16304757472218506, + "learning_rate": 3.064832768258758e-05, + "loss": 2.6888, + "step": 32082 + }, + { + "epoch": 1.9916195915326838, + "grad_norm": 0.18366894298178177, + "learning_rate": 3.064499767058628e-05, + "loss": 2.7813, + "step": 32083 + }, + { + "epoch": 1.9916816686324414, + "grad_norm": 0.16285680904707514, + "learning_rate": 3.0641667759564725e-05, + "loss": 2.7509, + "step": 32084 + }, + { + "epoch": 1.9917437457321994, + "grad_norm": 0.15985778584760968, + "learning_rate": 3.063833794954028e-05, + "loss": 2.7371, + "step": 32085 + }, + { + "epoch": 1.9918058228319573, + "grad_norm": 0.15750844340550216, + "learning_rate": 3.0635008240530336e-05, + "loss": 2.8002, + "step": 32086 + }, + { + "epoch": 1.9918678999317152, + "grad_norm": 0.14669708067557743, + "learning_rate": 3.063167863255225e-05, + "loss": 2.7657, + "step": 32087 + }, + { + "epoch": 1.9919299770314731, + "grad_norm": 0.16388946445434963, + "learning_rate": 3.06283491256234e-05, + "loss": 2.769, + "step": 32088 + }, + { + "epoch": 1.991992054131231, + "grad_norm": 0.1522253138545048, + "learning_rate": 3.0625019719761156e-05, + "loss": 2.6561, + "step": 32089 + }, + { + "epoch": 1.9920541312309887, + "grad_norm": 0.14996369563291684, + "learning_rate": 3.062169041498289e-05, + "loss": 2.7574, + "step": 32090 + }, + { + "epoch": 1.9921162083307467, + "grad_norm": 0.14175071753214322, + "learning_rate": 3.061836121130596e-05, + "loss": 2.728, + "step": 32091 + }, + { + "epoch": 1.9921782854305046, + "grad_norm": 0.15007141221387377, + "learning_rate": 3.061503210874775e-05, + "loss": 2.7336, + "step": 32092 + }, + { + "epoch": 1.9922403625302625, + "grad_norm": 0.14476819205439898, + "learning_rate": 3.061170310732563e-05, + "loss": 2.7247, + "step": 32093 + }, + { + "epoch": 1.9923024396300204, + "grad_norm": 0.1434841306992347, + "learning_rate": 3.060837420705696e-05, + "loss": 2.7505, + "step": 32094 + }, + { + "epoch": 1.9923645167297783, + "grad_norm": 0.1448651979259696, + "learning_rate": 3.0605045407959107e-05, + "loss": 2.8627, + "step": 32095 + }, + { + "epoch": 1.9924265938295362, + "grad_norm": 0.1521174082131181, + "learning_rate": 3.060171671004943e-05, + "loss": 2.8519, + "step": 32096 + }, + { + "epoch": 1.9924886709292942, + "grad_norm": 0.1587992433800947, + "learning_rate": 3.059838811334532e-05, + "loss": 2.7203, + "step": 32097 + }, + { + "epoch": 1.992550748029052, + "grad_norm": 0.14841583369893424, + "learning_rate": 3.059505961786412e-05, + "loss": 2.7455, + "step": 32098 + }, + { + "epoch": 1.99261282512881, + "grad_norm": 0.14673148610794381, + "learning_rate": 3.0591731223623195e-05, + "loss": 2.8079, + "step": 32099 + }, + { + "epoch": 1.992674902228568, + "grad_norm": 0.14648353576419748, + "learning_rate": 3.0588402930639936e-05, + "loss": 2.8602, + "step": 32100 + }, + { + "epoch": 1.9927369793283258, + "grad_norm": 0.1423006820324425, + "learning_rate": 3.0585074738931684e-05, + "loss": 2.7315, + "step": 32101 + }, + { + "epoch": 1.9927990564280837, + "grad_norm": 0.16006324556490614, + "learning_rate": 3.058174664851582e-05, + "loss": 2.6913, + "step": 32102 + }, + { + "epoch": 1.9928611335278417, + "grad_norm": 0.14396909338734412, + "learning_rate": 3.057841865940969e-05, + "loss": 2.7835, + "step": 32103 + }, + { + "epoch": 1.9929232106275996, + "grad_norm": 0.1537097007478742, + "learning_rate": 3.057509077163067e-05, + "loss": 2.7521, + "step": 32104 + }, + { + "epoch": 1.9929852877273575, + "grad_norm": 0.14694591925363581, + "learning_rate": 3.05717629851961e-05, + "loss": 2.7435, + "step": 32105 + }, + { + "epoch": 1.9930473648271154, + "grad_norm": 0.13806939545305655, + "learning_rate": 3.056843530012338e-05, + "loss": 2.6191, + "step": 32106 + }, + { + "epoch": 1.9931094419268733, + "grad_norm": 0.1600030538520842, + "learning_rate": 3.0565107716429854e-05, + "loss": 2.7568, + "step": 32107 + }, + { + "epoch": 1.993171519026631, + "grad_norm": 0.14564761543870638, + "learning_rate": 3.056178023413286e-05, + "loss": 2.7733, + "step": 32108 + }, + { + "epoch": 1.993233596126389, + "grad_norm": 0.14073537759772106, + "learning_rate": 3.055845285324979e-05, + "loss": 2.713, + "step": 32109 + }, + { + "epoch": 1.9932956732261469, + "grad_norm": 0.15349334623084168, + "learning_rate": 3.0555125573797984e-05, + "loss": 2.7871, + "step": 32110 + }, + { + "epoch": 1.9933577503259048, + "grad_norm": 0.15793248563066703, + "learning_rate": 3.055179839579481e-05, + "loss": 2.7115, + "step": 32111 + }, + { + "epoch": 1.9934198274256627, + "grad_norm": 0.16953257107549566, + "learning_rate": 3.054847131925762e-05, + "loss": 2.682, + "step": 32112 + }, + { + "epoch": 1.9934819045254206, + "grad_norm": 0.21295144148618841, + "learning_rate": 3.054514434420379e-05, + "loss": 2.7379, + "step": 32113 + }, + { + "epoch": 1.9935439816251783, + "grad_norm": 0.16270914923365246, + "learning_rate": 3.0541817470650645e-05, + "loss": 2.7807, + "step": 32114 + }, + { + "epoch": 1.9936060587249362, + "grad_norm": 0.13710852647176464, + "learning_rate": 3.053849069861558e-05, + "loss": 2.788, + "step": 32115 + }, + { + "epoch": 1.9936681358246942, + "grad_norm": 0.15671681170914603, + "learning_rate": 3.0535164028115924e-05, + "loss": 2.6599, + "step": 32116 + }, + { + "epoch": 1.993730212924452, + "grad_norm": 0.1690315015705992, + "learning_rate": 3.053183745916905e-05, + "loss": 2.77, + "step": 32117 + }, + { + "epoch": 1.99379229002421, + "grad_norm": 0.15188817227112542, + "learning_rate": 3.052851099179231e-05, + "loss": 2.7244, + "step": 32118 + }, + { + "epoch": 1.993854367123968, + "grad_norm": 0.14510168452261085, + "learning_rate": 3.052518462600305e-05, + "loss": 2.8341, + "step": 32119 + }, + { + "epoch": 1.9939164442237258, + "grad_norm": 0.14383877003577217, + "learning_rate": 3.052185836181862e-05, + "loss": 2.7411, + "step": 32120 + }, + { + "epoch": 1.9939785213234837, + "grad_norm": 0.14912305179851798, + "learning_rate": 3.0518532199256386e-05, + "loss": 2.6757, + "step": 32121 + }, + { + "epoch": 1.9940405984232417, + "grad_norm": 0.14938678991204685, + "learning_rate": 3.0515206138333713e-05, + "loss": 2.7288, + "step": 32122 + }, + { + "epoch": 1.9941026755229996, + "grad_norm": 0.15447425907049614, + "learning_rate": 3.051188017906792e-05, + "loss": 2.7538, + "step": 32123 + }, + { + "epoch": 1.9941647526227575, + "grad_norm": 0.1466262702338084, + "learning_rate": 3.0508554321476395e-05, + "loss": 2.8029, + "step": 32124 + }, + { + "epoch": 1.9942268297225154, + "grad_norm": 0.1472001263999688, + "learning_rate": 3.050522856557646e-05, + "loss": 2.6848, + "step": 32125 + }, + { + "epoch": 1.9942889068222733, + "grad_norm": 0.14219087764375235, + "learning_rate": 3.0501902911385495e-05, + "loss": 2.7207, + "step": 32126 + }, + { + "epoch": 1.9943509839220313, + "grad_norm": 0.14693264083827126, + "learning_rate": 3.049857735892082e-05, + "loss": 2.651, + "step": 32127 + }, + { + "epoch": 1.9944130610217892, + "grad_norm": 0.15846857124586336, + "learning_rate": 3.0495251908199817e-05, + "loss": 2.8206, + "step": 32128 + }, + { + "epoch": 1.994475138121547, + "grad_norm": 0.17036151465036464, + "learning_rate": 3.0491926559239808e-05, + "loss": 2.754, + "step": 32129 + }, + { + "epoch": 1.994537215221305, + "grad_norm": 0.17986325293284008, + "learning_rate": 3.0488601312058164e-05, + "loss": 2.7931, + "step": 32130 + }, + { + "epoch": 1.994599292321063, + "grad_norm": 0.14452751902326802, + "learning_rate": 3.0485276166672223e-05, + "loss": 2.7875, + "step": 32131 + }, + { + "epoch": 1.9946613694208206, + "grad_norm": 0.14288065442254444, + "learning_rate": 3.048195112309931e-05, + "loss": 2.7836, + "step": 32132 + }, + { + "epoch": 1.9947234465205785, + "grad_norm": 0.15755072637767042, + "learning_rate": 3.0478626181356812e-05, + "loss": 2.7946, + "step": 32133 + }, + { + "epoch": 1.9947855236203365, + "grad_norm": 0.1454443569198062, + "learning_rate": 3.047530134146207e-05, + "loss": 2.7811, + "step": 32134 + }, + { + "epoch": 1.9948476007200944, + "grad_norm": 0.14865274884613938, + "learning_rate": 3.0471976603432408e-05, + "loss": 2.7456, + "step": 32135 + }, + { + "epoch": 1.9949096778198523, + "grad_norm": 0.1543173293547305, + "learning_rate": 3.046865196728519e-05, + "loss": 2.8683, + "step": 32136 + }, + { + "epoch": 1.9949717549196102, + "grad_norm": 0.15682952815506854, + "learning_rate": 3.046532743303775e-05, + "loss": 2.7787, + "step": 32137 + }, + { + "epoch": 1.995033832019368, + "grad_norm": 0.14429923714984988, + "learning_rate": 3.046200300070745e-05, + "loss": 2.675, + "step": 32138 + }, + { + "epoch": 1.9950959091191258, + "grad_norm": 0.14557871915027204, + "learning_rate": 3.0458678670311612e-05, + "loss": 2.721, + "step": 32139 + }, + { + "epoch": 1.9951579862188837, + "grad_norm": 0.1425185928428843, + "learning_rate": 3.0455354441867606e-05, + "loss": 2.694, + "step": 32140 + }, + { + "epoch": 1.9952200633186417, + "grad_norm": 0.1479008976552036, + "learning_rate": 3.0452030315392743e-05, + "loss": 2.706, + "step": 32141 + }, + { + "epoch": 1.9952821404183996, + "grad_norm": 0.1462742060717681, + "learning_rate": 3.0448706290904404e-05, + "loss": 2.7892, + "step": 32142 + }, + { + "epoch": 1.9953442175181575, + "grad_norm": 0.14528291077996927, + "learning_rate": 3.04453823684199e-05, + "loss": 2.6723, + "step": 32143 + }, + { + "epoch": 1.9954062946179154, + "grad_norm": 0.15744424050527928, + "learning_rate": 3.0442058547956577e-05, + "loss": 2.7334, + "step": 32144 + }, + { + "epoch": 1.9954683717176733, + "grad_norm": 0.1553692312266831, + "learning_rate": 3.043873482953179e-05, + "loss": 2.7375, + "step": 32145 + }, + { + "epoch": 1.9955304488174312, + "grad_norm": 0.17001123380709077, + "learning_rate": 3.0435411213162868e-05, + "loss": 2.6864, + "step": 32146 + }, + { + "epoch": 1.9955925259171892, + "grad_norm": 0.1395947079566928, + "learning_rate": 3.0432087698867163e-05, + "loss": 2.6828, + "step": 32147 + }, + { + "epoch": 1.995654603016947, + "grad_norm": 0.14004763652777902, + "learning_rate": 3.042876428666199e-05, + "loss": 2.8003, + "step": 32148 + }, + { + "epoch": 1.995716680116705, + "grad_norm": 0.15555227183124046, + "learning_rate": 3.0425440976564723e-05, + "loss": 2.7301, + "step": 32149 + }, + { + "epoch": 1.995778757216463, + "grad_norm": 0.15520860410086376, + "learning_rate": 3.0422117768592673e-05, + "loss": 2.7704, + "step": 32150 + }, + { + "epoch": 1.9958408343162208, + "grad_norm": 0.15769806645681472, + "learning_rate": 3.0418794662763194e-05, + "loss": 2.8243, + "step": 32151 + }, + { + "epoch": 1.9959029114159788, + "grad_norm": 0.15020940439218916, + "learning_rate": 3.0415471659093603e-05, + "loss": 2.7166, + "step": 32152 + }, + { + "epoch": 1.9959649885157367, + "grad_norm": 0.15799702064053087, + "learning_rate": 3.0412148757601265e-05, + "loss": 2.6712, + "step": 32153 + }, + { + "epoch": 1.9960270656154946, + "grad_norm": 0.19071966736341778, + "learning_rate": 3.040882595830349e-05, + "loss": 2.7822, + "step": 32154 + }, + { + "epoch": 1.9960891427152525, + "grad_norm": 0.1469982099655214, + "learning_rate": 3.040550326121764e-05, + "loss": 2.8006, + "step": 32155 + }, + { + "epoch": 1.9961512198150102, + "grad_norm": 0.14831873258201828, + "learning_rate": 3.0402180666361037e-05, + "loss": 2.7362, + "step": 32156 + }, + { + "epoch": 1.9962132969147681, + "grad_norm": 0.16059344212676416, + "learning_rate": 3.0398858173751e-05, + "loss": 2.739, + "step": 32157 + }, + { + "epoch": 1.996275374014526, + "grad_norm": 0.14350989025538402, + "learning_rate": 3.0395535783404882e-05, + "loss": 2.7549, + "step": 32158 + }, + { + "epoch": 1.996337451114284, + "grad_norm": 0.1607430918078287, + "learning_rate": 3.0392213495340015e-05, + "loss": 2.694, + "step": 32159 + }, + { + "epoch": 1.9963995282140419, + "grad_norm": 0.16304584398661104, + "learning_rate": 3.0388891309573735e-05, + "loss": 2.7743, + "step": 32160 + }, + { + "epoch": 1.9964616053137998, + "grad_norm": 0.15166651754799873, + "learning_rate": 3.038556922612336e-05, + "loss": 2.645, + "step": 32161 + }, + { + "epoch": 1.9965236824135575, + "grad_norm": 0.165398477224668, + "learning_rate": 3.0382247245006246e-05, + "loss": 2.7439, + "step": 32162 + }, + { + "epoch": 1.9965857595133154, + "grad_norm": 0.14179262885259428, + "learning_rate": 3.0378925366239696e-05, + "loss": 2.7991, + "step": 32163 + }, + { + "epoch": 1.9966478366130733, + "grad_norm": 0.143949588313579, + "learning_rate": 3.0375603589841062e-05, + "loss": 2.6863, + "step": 32164 + }, + { + "epoch": 1.9967099137128312, + "grad_norm": 0.1591669791361686, + "learning_rate": 3.0372281915827654e-05, + "loss": 2.6218, + "step": 32165 + }, + { + "epoch": 1.9967719908125892, + "grad_norm": 0.14488035645798306, + "learning_rate": 3.0368960344216823e-05, + "loss": 2.7288, + "step": 32166 + }, + { + "epoch": 1.996834067912347, + "grad_norm": 0.13692708490888503, + "learning_rate": 3.0365638875025902e-05, + "loss": 2.763, + "step": 32167 + }, + { + "epoch": 1.996896145012105, + "grad_norm": 0.14087540134949872, + "learning_rate": 3.0362317508272215e-05, + "loss": 2.6749, + "step": 32168 + }, + { + "epoch": 1.996958222111863, + "grad_norm": 0.15650102597233967, + "learning_rate": 3.035899624397307e-05, + "loss": 2.8566, + "step": 32169 + }, + { + "epoch": 1.9970202992116208, + "grad_norm": 0.13944076221493767, + "learning_rate": 3.0355675082145817e-05, + "loss": 2.7347, + "step": 32170 + }, + { + "epoch": 1.9970823763113787, + "grad_norm": 0.15184010492351288, + "learning_rate": 3.0352354022807773e-05, + "loss": 2.6714, + "step": 32171 + }, + { + "epoch": 1.9971444534111367, + "grad_norm": 0.14270443090152282, + "learning_rate": 3.0349033065976274e-05, + "loss": 2.7757, + "step": 32172 + }, + { + "epoch": 1.9972065305108946, + "grad_norm": 0.14821407167087666, + "learning_rate": 3.0345712211668627e-05, + "loss": 2.7401, + "step": 32173 + }, + { + "epoch": 1.9972686076106525, + "grad_norm": 0.15115223665846084, + "learning_rate": 3.0342391459902185e-05, + "loss": 2.7294, + "step": 32174 + }, + { + "epoch": 1.9973306847104104, + "grad_norm": 0.1919930571558709, + "learning_rate": 3.0339070810694248e-05, + "loss": 2.7671, + "step": 32175 + }, + { + "epoch": 1.9973927618101683, + "grad_norm": 0.15701866028264644, + "learning_rate": 3.0335750264062157e-05, + "loss": 2.7598, + "step": 32176 + }, + { + "epoch": 1.9974548389099263, + "grad_norm": 0.17950319084171187, + "learning_rate": 3.0332429820023222e-05, + "loss": 2.7679, + "step": 32177 + }, + { + "epoch": 1.9975169160096842, + "grad_norm": 0.16186688567595137, + "learning_rate": 3.0329109478594786e-05, + "loss": 2.8182, + "step": 32178 + }, + { + "epoch": 1.997578993109442, + "grad_norm": 0.14978999173970375, + "learning_rate": 3.0325789239794157e-05, + "loss": 2.729, + "step": 32179 + }, + { + "epoch": 1.9976410702091998, + "grad_norm": 0.14488238133263218, + "learning_rate": 3.0322469103638664e-05, + "loss": 2.6385, + "step": 32180 + }, + { + "epoch": 1.9977031473089577, + "grad_norm": 0.15228734439394773, + "learning_rate": 3.031914907014563e-05, + "loss": 2.7794, + "step": 32181 + }, + { + "epoch": 1.9977652244087156, + "grad_norm": 0.14234097890532885, + "learning_rate": 3.0315829139332364e-05, + "loss": 2.728, + "step": 32182 + }, + { + "epoch": 1.9978273015084735, + "grad_norm": 0.15567710874443916, + "learning_rate": 3.0312509311216198e-05, + "loss": 2.7714, + "step": 32183 + }, + { + "epoch": 1.9978893786082315, + "grad_norm": 0.1515225812608011, + "learning_rate": 3.0309189585814445e-05, + "loss": 2.6739, + "step": 32184 + }, + { + "epoch": 1.9979514557079894, + "grad_norm": 0.14717798686213107, + "learning_rate": 3.0305869963144446e-05, + "loss": 2.7961, + "step": 32185 + }, + { + "epoch": 1.998013532807747, + "grad_norm": 0.14928073793850077, + "learning_rate": 3.0302550443223488e-05, + "loss": 2.801, + "step": 32186 + }, + { + "epoch": 1.998075609907505, + "grad_norm": 0.1491740299539494, + "learning_rate": 3.029923102606892e-05, + "loss": 2.7477, + "step": 32187 + }, + { + "epoch": 1.998137687007263, + "grad_norm": 0.15600346546992933, + "learning_rate": 3.0295911711698034e-05, + "loss": 2.767, + "step": 32188 + }, + { + "epoch": 1.9981997641070208, + "grad_norm": 0.16306536653264658, + "learning_rate": 3.029259250012817e-05, + "loss": 2.7329, + "step": 32189 + }, + { + "epoch": 1.9982618412067787, + "grad_norm": 0.1473461550701369, + "learning_rate": 3.0289273391376626e-05, + "loss": 2.8557, + "step": 32190 + }, + { + "epoch": 1.9983239183065367, + "grad_norm": 0.13769569126374656, + "learning_rate": 3.0285954385460735e-05, + "loss": 2.8535, + "step": 32191 + }, + { + "epoch": 1.9983859954062946, + "grad_norm": 0.1461545537189922, + "learning_rate": 3.028263548239781e-05, + "loss": 2.7715, + "step": 32192 + }, + { + "epoch": 1.9984480725060525, + "grad_norm": 0.14224022841286904, + "learning_rate": 3.027931668220515e-05, + "loss": 2.7434, + "step": 32193 + }, + { + "epoch": 1.9985101496058104, + "grad_norm": 0.17278159605906798, + "learning_rate": 3.027599798490009e-05, + "loss": 2.7376, + "step": 32194 + }, + { + "epoch": 1.9985722267055683, + "grad_norm": 0.17017905186724133, + "learning_rate": 3.0272679390499924e-05, + "loss": 2.7802, + "step": 32195 + }, + { + "epoch": 1.9986343038053263, + "grad_norm": 0.15837856266813824, + "learning_rate": 3.0269360899022e-05, + "loss": 2.7598, + "step": 32196 + }, + { + "epoch": 1.9986963809050842, + "grad_norm": 0.13090973923951144, + "learning_rate": 3.0266042510483582e-05, + "loss": 2.6898, + "step": 32197 + }, + { + "epoch": 1.998758458004842, + "grad_norm": 0.14438707661383773, + "learning_rate": 3.0262724224902016e-05, + "loss": 2.7767, + "step": 32198 + }, + { + "epoch": 1.9988205351046, + "grad_norm": 0.15266744752186903, + "learning_rate": 3.0259406042294625e-05, + "loss": 2.6739, + "step": 32199 + }, + { + "epoch": 1.998882612204358, + "grad_norm": 0.1418962044036368, + "learning_rate": 3.02560879626787e-05, + "loss": 2.7686, + "step": 32200 + }, + { + "epoch": 1.9989446893041158, + "grad_norm": 0.15844520837891274, + "learning_rate": 3.025276998607156e-05, + "loss": 2.8136, + "step": 32201 + }, + { + "epoch": 1.9990067664038738, + "grad_norm": 0.14449726169047444, + "learning_rate": 3.0249452112490506e-05, + "loss": 2.7003, + "step": 32202 + }, + { + "epoch": 1.9990688435036317, + "grad_norm": 0.14995555989271164, + "learning_rate": 3.024613434195286e-05, + "loss": 2.7171, + "step": 32203 + }, + { + "epoch": 1.9991309206033894, + "grad_norm": 0.1541878761917801, + "learning_rate": 3.0242816674475925e-05, + "loss": 2.7092, + "step": 32204 + }, + { + "epoch": 1.9991929977031473, + "grad_norm": 0.14971322919603183, + "learning_rate": 3.023949911007702e-05, + "loss": 2.7392, + "step": 32205 + }, + { + "epoch": 1.9992550748029052, + "grad_norm": 0.14368200819198854, + "learning_rate": 3.0236181648773436e-05, + "loss": 2.7367, + "step": 32206 + }, + { + "epoch": 1.9993171519026631, + "grad_norm": 0.14065223006356364, + "learning_rate": 3.0232864290582484e-05, + "loss": 2.7543, + "step": 32207 + }, + { + "epoch": 1.999379229002421, + "grad_norm": 0.1476212571070951, + "learning_rate": 3.0229547035521493e-05, + "loss": 2.7437, + "step": 32208 + }, + { + "epoch": 1.999441306102179, + "grad_norm": 0.14441462270325195, + "learning_rate": 3.0226229883607743e-05, + "loss": 2.6779, + "step": 32209 + }, + { + "epoch": 1.9995033832019367, + "grad_norm": 0.16386901708952364, + "learning_rate": 3.0222912834858557e-05, + "loss": 2.7647, + "step": 32210 + }, + { + "epoch": 1.9995654603016946, + "grad_norm": 0.16550960276906937, + "learning_rate": 3.021959588929123e-05, + "loss": 2.7591, + "step": 32211 + }, + { + "epoch": 1.9996275374014525, + "grad_norm": 0.13740908179879857, + "learning_rate": 3.0216279046923084e-05, + "loss": 2.7305, + "step": 32212 + }, + { + "epoch": 1.9996896145012104, + "grad_norm": 0.15025711945718195, + "learning_rate": 3.0212962307771396e-05, + "loss": 2.721, + "step": 32213 + }, + { + "epoch": 1.9997516916009683, + "grad_norm": 0.15985739550885733, + "learning_rate": 3.0209645671853502e-05, + "loss": 2.6658, + "step": 32214 + }, + { + "epoch": 1.9998137687007262, + "grad_norm": 0.15739600605600684, + "learning_rate": 3.0206329139186685e-05, + "loss": 2.7984, + "step": 32215 + }, + { + "epoch": 1.9998758458004842, + "grad_norm": 0.1697531278779226, + "learning_rate": 3.0203012709788258e-05, + "loss": 2.7358, + "step": 32216 + }, + { + "epoch": 1.999937922900242, + "grad_norm": 0.16007344501108872, + "learning_rate": 3.0199696383675524e-05, + "loss": 2.7202, + "step": 32217 + }, + { + "epoch": 2.0, + "grad_norm": 0.1867329983083151, + "learning_rate": 3.0196380160865766e-05, + "loss": 2.7821, + "step": 32218 + }, + { + "epoch": 2.000062077099758, + "grad_norm": 0.1729250821589746, + "learning_rate": 3.0193064041376318e-05, + "loss": 2.7168, + "step": 32219 + }, + { + "epoch": 2.000124154199516, + "grad_norm": 0.14799072136742014, + "learning_rate": 3.0189748025224446e-05, + "loss": 2.7527, + "step": 32220 + }, + { + "epoch": 2.0001862312992738, + "grad_norm": 0.20413456882642622, + "learning_rate": 3.0186432112427476e-05, + "loss": 2.8429, + "step": 32221 + }, + { + "epoch": 2.0002483083990317, + "grad_norm": 0.16721952033680842, + "learning_rate": 3.0183116303002696e-05, + "loss": 2.7435, + "step": 32222 + }, + { + "epoch": 2.0003103854987896, + "grad_norm": 0.1557282677967918, + "learning_rate": 3.0179800596967417e-05, + "loss": 2.7084, + "step": 32223 + }, + { + "epoch": 2.0003724625985475, + "grad_norm": 0.16434063496006698, + "learning_rate": 3.0176484994338917e-05, + "loss": 2.6502, + "step": 32224 + }, + { + "epoch": 2.0004345396983054, + "grad_norm": 0.14851851511174446, + "learning_rate": 3.017316949513452e-05, + "loss": 2.7193, + "step": 32225 + }, + { + "epoch": 2.0004966167980633, + "grad_norm": 0.14346138523627938, + "learning_rate": 3.0169854099371498e-05, + "loss": 2.6817, + "step": 32226 + }, + { + "epoch": 2.0005586938978213, + "grad_norm": 0.15225329525831313, + "learning_rate": 3.0166538807067175e-05, + "loss": 2.7202, + "step": 32227 + }, + { + "epoch": 2.000620770997579, + "grad_norm": 0.17736352230543348, + "learning_rate": 3.0163223618238834e-05, + "loss": 2.8183, + "step": 32228 + }, + { + "epoch": 2.000682848097337, + "grad_norm": 0.18010046400416035, + "learning_rate": 3.015990853290376e-05, + "loss": 2.8123, + "step": 32229 + }, + { + "epoch": 2.0007449251970946, + "grad_norm": 0.1464584899920046, + "learning_rate": 3.0156593551079242e-05, + "loss": 2.8065, + "step": 32230 + }, + { + "epoch": 2.0008070022968525, + "grad_norm": 0.15753815263579538, + "learning_rate": 3.015327867278262e-05, + "loss": 2.7917, + "step": 32231 + }, + { + "epoch": 2.0008690793966104, + "grad_norm": 0.15679448745080604, + "learning_rate": 3.0149963898031154e-05, + "loss": 2.7069, + "step": 32232 + }, + { + "epoch": 2.0009311564963683, + "grad_norm": 0.15253363293156233, + "learning_rate": 3.014664922684215e-05, + "loss": 2.8202, + "step": 32233 + }, + { + "epoch": 2.0009932335961262, + "grad_norm": 0.2093519739043822, + "learning_rate": 3.014333465923288e-05, + "loss": 2.8082, + "step": 32234 + }, + { + "epoch": 2.001055310695884, + "grad_norm": 0.14406401916828288, + "learning_rate": 3.0140020195220675e-05, + "loss": 2.7439, + "step": 32235 + }, + { + "epoch": 2.001117387795642, + "grad_norm": 0.1503765731290849, + "learning_rate": 3.0136705834822787e-05, + "loss": 2.7588, + "step": 32236 + }, + { + "epoch": 2.0011794648954, + "grad_norm": 0.14419689682433584, + "learning_rate": 3.0133391578056548e-05, + "loss": 2.7507, + "step": 32237 + }, + { + "epoch": 2.001241541995158, + "grad_norm": 0.1466687504415341, + "learning_rate": 3.0130077424939206e-05, + "loss": 2.7105, + "step": 32238 + }, + { + "epoch": 2.001303619094916, + "grad_norm": 0.14130279987850472, + "learning_rate": 3.0126763375488086e-05, + "loss": 2.6709, + "step": 32239 + }, + { + "epoch": 2.0013656961946737, + "grad_norm": 0.14895014954362887, + "learning_rate": 3.0123449429720462e-05, + "loss": 2.7399, + "step": 32240 + }, + { + "epoch": 2.0014277732944317, + "grad_norm": 0.1489377063917074, + "learning_rate": 3.0120135587653636e-05, + "loss": 2.7741, + "step": 32241 + }, + { + "epoch": 2.0014898503941896, + "grad_norm": 0.16008654682575335, + "learning_rate": 3.011682184930489e-05, + "loss": 2.7665, + "step": 32242 + }, + { + "epoch": 2.0015519274939475, + "grad_norm": 0.15287553454482664, + "learning_rate": 3.01135082146915e-05, + "loss": 2.7823, + "step": 32243 + }, + { + "epoch": 2.0016140045937054, + "grad_norm": 0.16531236016676817, + "learning_rate": 3.011019468383077e-05, + "loss": 2.7166, + "step": 32244 + }, + { + "epoch": 2.0016760816934633, + "grad_norm": 0.144959678243535, + "learning_rate": 3.010688125673997e-05, + "loss": 2.7676, + "step": 32245 + }, + { + "epoch": 2.0017381587932213, + "grad_norm": 0.15389315384851318, + "learning_rate": 3.010356793343642e-05, + "loss": 2.7016, + "step": 32246 + }, + { + "epoch": 2.001800235892979, + "grad_norm": 0.16002176616369326, + "learning_rate": 3.010025471393737e-05, + "loss": 2.7031, + "step": 32247 + }, + { + "epoch": 2.001862312992737, + "grad_norm": 0.14327028219038337, + "learning_rate": 3.0096941598260135e-05, + "loss": 2.6787, + "step": 32248 + }, + { + "epoch": 2.001924390092495, + "grad_norm": 0.14101279617811394, + "learning_rate": 3.009362858642198e-05, + "loss": 2.6964, + "step": 32249 + }, + { + "epoch": 2.001986467192253, + "grad_norm": 0.14484217423751602, + "learning_rate": 3.00903156784402e-05, + "loss": 2.7096, + "step": 32250 + }, + { + "epoch": 2.002048544292011, + "grad_norm": 0.141346761403632, + "learning_rate": 3.008700287433207e-05, + "loss": 2.6954, + "step": 32251 + }, + { + "epoch": 2.0021106213917688, + "grad_norm": 0.17537693042433594, + "learning_rate": 3.0083690174114894e-05, + "loss": 2.7634, + "step": 32252 + }, + { + "epoch": 2.0021726984915267, + "grad_norm": 0.1443957711941434, + "learning_rate": 3.0080377577805935e-05, + "loss": 2.7671, + "step": 32253 + }, + { + "epoch": 2.002234775591284, + "grad_norm": 0.1555326086233689, + "learning_rate": 3.0077065085422474e-05, + "loss": 2.7831, + "step": 32254 + }, + { + "epoch": 2.002296852691042, + "grad_norm": 0.14366528409326187, + "learning_rate": 3.007375269698181e-05, + "loss": 2.7013, + "step": 32255 + }, + { + "epoch": 2.0023589297908, + "grad_norm": 0.18815553829183324, + "learning_rate": 3.0070440412501205e-05, + "loss": 2.7287, + "step": 32256 + }, + { + "epoch": 2.002421006890558, + "grad_norm": 0.15856841970271524, + "learning_rate": 3.0067128231997966e-05, + "loss": 2.714, + "step": 32257 + }, + { + "epoch": 2.002483083990316, + "grad_norm": 0.16262490671361876, + "learning_rate": 3.0063816155489342e-05, + "loss": 2.7051, + "step": 32258 + }, + { + "epoch": 2.0025451610900737, + "grad_norm": 0.15850415259255468, + "learning_rate": 3.0060504182992638e-05, + "loss": 2.7081, + "step": 32259 + }, + { + "epoch": 2.0026072381898317, + "grad_norm": 0.15447137719007178, + "learning_rate": 3.0057192314525118e-05, + "loss": 2.7242, + "step": 32260 + }, + { + "epoch": 2.0026693152895896, + "grad_norm": 0.15541065199205895, + "learning_rate": 3.0053880550104073e-05, + "loss": 2.7591, + "step": 32261 + }, + { + "epoch": 2.0027313923893475, + "grad_norm": 0.21139961415524489, + "learning_rate": 3.0050568889746768e-05, + "loss": 2.79, + "step": 32262 + }, + { + "epoch": 2.0027934694891054, + "grad_norm": 0.1546309873636753, + "learning_rate": 3.004725733347048e-05, + "loss": 2.6494, + "step": 32263 + }, + { + "epoch": 2.0028555465888633, + "grad_norm": 0.19559702176516103, + "learning_rate": 3.004394588129251e-05, + "loss": 2.7035, + "step": 32264 + }, + { + "epoch": 2.0029176236886213, + "grad_norm": 0.14527379489535716, + "learning_rate": 3.004063453323011e-05, + "loss": 2.8145, + "step": 32265 + }, + { + "epoch": 2.002979700788379, + "grad_norm": 0.17848392305247632, + "learning_rate": 3.0037323289300577e-05, + "loss": 2.7708, + "step": 32266 + }, + { + "epoch": 2.003041777888137, + "grad_norm": 0.16355804249502343, + "learning_rate": 3.0034012149521173e-05, + "loss": 2.747, + "step": 32267 + }, + { + "epoch": 2.003103854987895, + "grad_norm": 0.1462819162617232, + "learning_rate": 3.0030701113909167e-05, + "loss": 2.7582, + "step": 32268 + }, + { + "epoch": 2.003165932087653, + "grad_norm": 0.14452657176513997, + "learning_rate": 3.002739018248185e-05, + "loss": 2.8079, + "step": 32269 + }, + { + "epoch": 2.003228009187411, + "grad_norm": 0.17167617937481178, + "learning_rate": 3.0024079355256472e-05, + "loss": 2.6952, + "step": 32270 + }, + { + "epoch": 2.0032900862871688, + "grad_norm": 0.15015766753441592, + "learning_rate": 3.002076863225034e-05, + "loss": 2.6289, + "step": 32271 + }, + { + "epoch": 2.0033521633869267, + "grad_norm": 0.14509073429642122, + "learning_rate": 3.0017458013480692e-05, + "loss": 2.6199, + "step": 32272 + }, + { + "epoch": 2.0034142404866846, + "grad_norm": 0.14780241226146065, + "learning_rate": 3.001414749896483e-05, + "loss": 2.7035, + "step": 32273 + }, + { + "epoch": 2.0034763175864425, + "grad_norm": 0.15320766465592123, + "learning_rate": 3.0010837088719995e-05, + "loss": 2.7572, + "step": 32274 + }, + { + "epoch": 2.0035383946862004, + "grad_norm": 0.17695262353080785, + "learning_rate": 3.000752678276349e-05, + "loss": 2.7613, + "step": 32275 + }, + { + "epoch": 2.0036004717859583, + "grad_norm": 0.14575114162097752, + "learning_rate": 3.0004216581112564e-05, + "loss": 2.6099, + "step": 32276 + }, + { + "epoch": 2.0036625488857163, + "grad_norm": 0.165917822134437, + "learning_rate": 3.0000906483784497e-05, + "loss": 2.7613, + "step": 32277 + }, + { + "epoch": 2.0037246259854737, + "grad_norm": 0.14306730458885647, + "learning_rate": 2.9997596490796566e-05, + "loss": 2.7788, + "step": 32278 + }, + { + "epoch": 2.0037867030852317, + "grad_norm": 0.1399815128564783, + "learning_rate": 2.9994286602166012e-05, + "loss": 2.7108, + "step": 32279 + }, + { + "epoch": 2.0038487801849896, + "grad_norm": 0.1534058095580271, + "learning_rate": 2.9990976817910134e-05, + "loss": 2.7353, + "step": 32280 + }, + { + "epoch": 2.0039108572847475, + "grad_norm": 0.17444429594761338, + "learning_rate": 2.9987667138046178e-05, + "loss": 2.6765, + "step": 32281 + }, + { + "epoch": 2.0039729343845054, + "grad_norm": 0.14345517244946016, + "learning_rate": 2.9984357562591426e-05, + "loss": 2.7511, + "step": 32282 + }, + { + "epoch": 2.0040350114842633, + "grad_norm": 0.14744162906776942, + "learning_rate": 2.9981048091563134e-05, + "loss": 2.7515, + "step": 32283 + }, + { + "epoch": 2.0040970885840212, + "grad_norm": 0.15350336943183876, + "learning_rate": 2.997773872497859e-05, + "loss": 2.6793, + "step": 32284 + }, + { + "epoch": 2.004159165683779, + "grad_norm": 0.1464845403034549, + "learning_rate": 2.9974429462855026e-05, + "loss": 2.7879, + "step": 32285 + }, + { + "epoch": 2.004221242783537, + "grad_norm": 0.13914388594964075, + "learning_rate": 2.997112030520973e-05, + "loss": 2.7238, + "step": 32286 + }, + { + "epoch": 2.004283319883295, + "grad_norm": 0.16171677284137803, + "learning_rate": 2.9967811252059957e-05, + "loss": 2.6301, + "step": 32287 + }, + { + "epoch": 2.004345396983053, + "grad_norm": 0.14698848449032817, + "learning_rate": 2.9964502303422987e-05, + "loss": 2.7115, + "step": 32288 + }, + { + "epoch": 2.004407474082811, + "grad_norm": 0.17503617585450992, + "learning_rate": 2.996119345931607e-05, + "loss": 2.7468, + "step": 32289 + }, + { + "epoch": 2.0044695511825688, + "grad_norm": 0.1458002390662995, + "learning_rate": 2.9957884719756468e-05, + "loss": 2.7304, + "step": 32290 + }, + { + "epoch": 2.0045316282823267, + "grad_norm": 0.1455598456520326, + "learning_rate": 2.995457608476145e-05, + "loss": 2.7467, + "step": 32291 + }, + { + "epoch": 2.0045937053820846, + "grad_norm": 0.15523494387264677, + "learning_rate": 2.995126755434826e-05, + "loss": 2.7689, + "step": 32292 + }, + { + "epoch": 2.0046557824818425, + "grad_norm": 0.1430634508055509, + "learning_rate": 2.9947959128534197e-05, + "loss": 2.645, + "step": 32293 + }, + { + "epoch": 2.0047178595816004, + "grad_norm": 0.1584029113570466, + "learning_rate": 2.9944650807336482e-05, + "loss": 2.7999, + "step": 32294 + }, + { + "epoch": 2.0047799366813583, + "grad_norm": 0.15235953982761619, + "learning_rate": 2.9941342590772405e-05, + "loss": 2.6999, + "step": 32295 + }, + { + "epoch": 2.0048420137811163, + "grad_norm": 0.13923937232726707, + "learning_rate": 2.993803447885919e-05, + "loss": 2.6899, + "step": 32296 + }, + { + "epoch": 2.004904090880874, + "grad_norm": 0.14926804781744085, + "learning_rate": 2.9934726471614126e-05, + "loss": 2.7076, + "step": 32297 + }, + { + "epoch": 2.004966167980632, + "grad_norm": 0.15318926902723845, + "learning_rate": 2.993141856905448e-05, + "loss": 2.7201, + "step": 32298 + }, + { + "epoch": 2.00502824508039, + "grad_norm": 0.14870983163448992, + "learning_rate": 2.992811077119748e-05, + "loss": 2.7798, + "step": 32299 + }, + { + "epoch": 2.005090322180148, + "grad_norm": 0.14799734786877608, + "learning_rate": 2.9924803078060415e-05, + "loss": 2.6823, + "step": 32300 + }, + { + "epoch": 2.005152399279906, + "grad_norm": 0.15733962998777826, + "learning_rate": 2.9921495489660512e-05, + "loss": 2.6973, + "step": 32301 + }, + { + "epoch": 2.0052144763796633, + "grad_norm": 0.14999915236988584, + "learning_rate": 2.9918188006015057e-05, + "loss": 2.7483, + "step": 32302 + }, + { + "epoch": 2.0052765534794212, + "grad_norm": 0.13931460593442924, + "learning_rate": 2.9914880627141282e-05, + "loss": 2.699, + "step": 32303 + }, + { + "epoch": 2.005338630579179, + "grad_norm": 0.1510637071302429, + "learning_rate": 2.9911573353056444e-05, + "loss": 2.7298, + "step": 32304 + }, + { + "epoch": 2.005400707678937, + "grad_norm": 0.14690649881851037, + "learning_rate": 2.990826618377782e-05, + "loss": 2.8344, + "step": 32305 + }, + { + "epoch": 2.005462784778695, + "grad_norm": 0.16248516395590804, + "learning_rate": 2.990495911932263e-05, + "loss": 2.7798, + "step": 32306 + }, + { + "epoch": 2.005524861878453, + "grad_norm": 0.13845970148005052, + "learning_rate": 2.990165215970816e-05, + "loss": 2.7545, + "step": 32307 + }, + { + "epoch": 2.005586938978211, + "grad_norm": 0.1673269597079831, + "learning_rate": 2.9898345304951645e-05, + "loss": 2.7764, + "step": 32308 + }, + { + "epoch": 2.0056490160779687, + "grad_norm": 0.14925930635433002, + "learning_rate": 2.989503855507034e-05, + "loss": 2.8347, + "step": 32309 + }, + { + "epoch": 2.0057110931777267, + "grad_norm": 0.14970554020108645, + "learning_rate": 2.98917319100815e-05, + "loss": 2.7049, + "step": 32310 + }, + { + "epoch": 2.0057731702774846, + "grad_norm": 0.14511728684638872, + "learning_rate": 2.9888425370002387e-05, + "loss": 2.7071, + "step": 32311 + }, + { + "epoch": 2.0058352473772425, + "grad_norm": 0.15975748405141094, + "learning_rate": 2.9885118934850225e-05, + "loss": 2.6966, + "step": 32312 + }, + { + "epoch": 2.0058973244770004, + "grad_norm": 0.1547975444950571, + "learning_rate": 2.9881812604642294e-05, + "loss": 2.7686, + "step": 32313 + }, + { + "epoch": 2.0059594015767583, + "grad_norm": 0.14751857616348216, + "learning_rate": 2.9878506379395834e-05, + "loss": 2.7283, + "step": 32314 + }, + { + "epoch": 2.0060214786765163, + "grad_norm": 0.13865652805349496, + "learning_rate": 2.987520025912807e-05, + "loss": 2.863, + "step": 32315 + }, + { + "epoch": 2.006083555776274, + "grad_norm": 0.13874036145454494, + "learning_rate": 2.987189424385629e-05, + "loss": 2.6767, + "step": 32316 + }, + { + "epoch": 2.006145632876032, + "grad_norm": 0.15270456354862388, + "learning_rate": 2.9868588333597714e-05, + "loss": 2.7055, + "step": 32317 + }, + { + "epoch": 2.00620770997579, + "grad_norm": 0.1399734023918526, + "learning_rate": 2.986528252836961e-05, + "loss": 2.7884, + "step": 32318 + }, + { + "epoch": 2.006269787075548, + "grad_norm": 0.14006299728090796, + "learning_rate": 2.98619768281892e-05, + "loss": 2.6789, + "step": 32319 + }, + { + "epoch": 2.006331864175306, + "grad_norm": 0.1757298341366073, + "learning_rate": 2.9858671233073753e-05, + "loss": 2.7094, + "step": 32320 + }, + { + "epoch": 2.0063939412750638, + "grad_norm": 0.148466546579662, + "learning_rate": 2.985536574304051e-05, + "loss": 2.7229, + "step": 32321 + }, + { + "epoch": 2.0064560183748217, + "grad_norm": 0.17173458996470287, + "learning_rate": 2.9852060358106715e-05, + "loss": 2.7275, + "step": 32322 + }, + { + "epoch": 2.0065180954745796, + "grad_norm": 0.14211065919871518, + "learning_rate": 2.9848755078289603e-05, + "loss": 2.7498, + "step": 32323 + }, + { + "epoch": 2.0065801725743375, + "grad_norm": 0.18026898462286678, + "learning_rate": 2.984544990360644e-05, + "loss": 2.7581, + "step": 32324 + }, + { + "epoch": 2.0066422496740954, + "grad_norm": 0.16957633741774547, + "learning_rate": 2.9842144834074448e-05, + "loss": 2.7216, + "step": 32325 + }, + { + "epoch": 2.006704326773853, + "grad_norm": 0.1639886928478764, + "learning_rate": 2.983883986971089e-05, + "loss": 2.7512, + "step": 32326 + }, + { + "epoch": 2.006766403873611, + "grad_norm": 0.14255247146582412, + "learning_rate": 2.983553501053299e-05, + "loss": 2.721, + "step": 32327 + }, + { + "epoch": 2.0068284809733687, + "grad_norm": 0.15218454380610788, + "learning_rate": 2.983223025655799e-05, + "loss": 2.7614, + "step": 32328 + }, + { + "epoch": 2.0068905580731267, + "grad_norm": 0.14906542125374575, + "learning_rate": 2.982892560780315e-05, + "loss": 2.6827, + "step": 32329 + }, + { + "epoch": 2.0069526351728846, + "grad_norm": 0.19714656862960744, + "learning_rate": 2.9825621064285703e-05, + "loss": 2.7565, + "step": 32330 + }, + { + "epoch": 2.0070147122726425, + "grad_norm": 0.14847700948429646, + "learning_rate": 2.9822316626022882e-05, + "loss": 2.7573, + "step": 32331 + }, + { + "epoch": 2.0070767893724004, + "grad_norm": 0.15618549445926572, + "learning_rate": 2.9819012293031946e-05, + "loss": 2.7176, + "step": 32332 + }, + { + "epoch": 2.0071388664721583, + "grad_norm": 0.1528598650367148, + "learning_rate": 2.9815708065330118e-05, + "loss": 2.694, + "step": 32333 + }, + { + "epoch": 2.0072009435719163, + "grad_norm": 0.14804754986772742, + "learning_rate": 2.9812403942934642e-05, + "loss": 2.7037, + "step": 32334 + }, + { + "epoch": 2.007263020671674, + "grad_norm": 0.1503573603811736, + "learning_rate": 2.9809099925862742e-05, + "loss": 2.6649, + "step": 32335 + }, + { + "epoch": 2.007325097771432, + "grad_norm": 0.14231218724013295, + "learning_rate": 2.980579601413169e-05, + "loss": 2.7063, + "step": 32336 + }, + { + "epoch": 2.00738717487119, + "grad_norm": 0.14480523610737897, + "learning_rate": 2.9802492207758682e-05, + "loss": 2.6741, + "step": 32337 + }, + { + "epoch": 2.007449251970948, + "grad_norm": 0.14466747102050528, + "learning_rate": 2.9799188506760993e-05, + "loss": 2.6919, + "step": 32338 + }, + { + "epoch": 2.007511329070706, + "grad_norm": 0.15387755770382042, + "learning_rate": 2.9795884911155835e-05, + "loss": 2.7457, + "step": 32339 + }, + { + "epoch": 2.0075734061704638, + "grad_norm": 0.1458168779750953, + "learning_rate": 2.9792581420960448e-05, + "loss": 2.7623, + "step": 32340 + }, + { + "epoch": 2.0076354832702217, + "grad_norm": 0.16114686846837292, + "learning_rate": 2.9789278036192074e-05, + "loss": 2.7246, + "step": 32341 + }, + { + "epoch": 2.0076975603699796, + "grad_norm": 0.1427102652358649, + "learning_rate": 2.9785974756867935e-05, + "loss": 2.7244, + "step": 32342 + }, + { + "epoch": 2.0077596374697375, + "grad_norm": 0.14701591691466476, + "learning_rate": 2.9782671583005282e-05, + "loss": 2.717, + "step": 32343 + }, + { + "epoch": 2.0078217145694954, + "grad_norm": 0.15741599855269411, + "learning_rate": 2.9779368514621332e-05, + "loss": 2.7075, + "step": 32344 + }, + { + "epoch": 2.0078837916692533, + "grad_norm": 0.14654232786115473, + "learning_rate": 2.9776065551733333e-05, + "loss": 2.7381, + "step": 32345 + }, + { + "epoch": 2.0079458687690113, + "grad_norm": 0.13749293070740273, + "learning_rate": 2.9772762694358498e-05, + "loss": 2.6856, + "step": 32346 + }, + { + "epoch": 2.008007945868769, + "grad_norm": 0.14920903768437985, + "learning_rate": 2.9769459942514077e-05, + "loss": 2.7371, + "step": 32347 + }, + { + "epoch": 2.008070022968527, + "grad_norm": 0.14567512775027328, + "learning_rate": 2.976615729621729e-05, + "loss": 2.8166, + "step": 32348 + }, + { + "epoch": 2.008132100068285, + "grad_norm": 0.14562485615064205, + "learning_rate": 2.9762854755485382e-05, + "loss": 2.707, + "step": 32349 + }, + { + "epoch": 2.0081941771680425, + "grad_norm": 0.16896220216504015, + "learning_rate": 2.9759552320335566e-05, + "loss": 2.7061, + "step": 32350 + }, + { + "epoch": 2.0082562542678004, + "grad_norm": 0.14573284004681597, + "learning_rate": 2.9756249990785086e-05, + "loss": 2.7288, + "step": 32351 + }, + { + "epoch": 2.0083183313675583, + "grad_norm": 0.1442474013682059, + "learning_rate": 2.9752947766851158e-05, + "loss": 2.818, + "step": 32352 + }, + { + "epoch": 2.0083804084673162, + "grad_norm": 0.1457802890345229, + "learning_rate": 2.9749645648551017e-05, + "loss": 2.7556, + "step": 32353 + }, + { + "epoch": 2.008442485567074, + "grad_norm": 0.18429354953716803, + "learning_rate": 2.9746343635901895e-05, + "loss": 2.7415, + "step": 32354 + }, + { + "epoch": 2.008504562666832, + "grad_norm": 0.1388326840787172, + "learning_rate": 2.9743041728921005e-05, + "loss": 2.7525, + "step": 32355 + }, + { + "epoch": 2.00856663976659, + "grad_norm": 0.15590405313790975, + "learning_rate": 2.9739739927625597e-05, + "loss": 2.7698, + "step": 32356 + }, + { + "epoch": 2.008628716866348, + "grad_norm": 0.15949251293080968, + "learning_rate": 2.973643823203287e-05, + "loss": 2.7001, + "step": 32357 + }, + { + "epoch": 2.008690793966106, + "grad_norm": 0.13852035276646396, + "learning_rate": 2.9733136642160075e-05, + "loss": 2.7402, + "step": 32358 + }, + { + "epoch": 2.0087528710658638, + "grad_norm": 0.19596943999545993, + "learning_rate": 2.9729835158024422e-05, + "loss": 2.6404, + "step": 32359 + }, + { + "epoch": 2.0088149481656217, + "grad_norm": 0.16202398717047486, + "learning_rate": 2.972653377964314e-05, + "loss": 2.7435, + "step": 32360 + }, + { + "epoch": 2.0088770252653796, + "grad_norm": 0.1580148801413541, + "learning_rate": 2.972323250703344e-05, + "loss": 2.75, + "step": 32361 + }, + { + "epoch": 2.0089391023651375, + "grad_norm": 0.16416511278538565, + "learning_rate": 2.971993134021257e-05, + "loss": 2.725, + "step": 32362 + }, + { + "epoch": 2.0090011794648954, + "grad_norm": 0.14311438136918672, + "learning_rate": 2.9716630279197745e-05, + "loss": 2.7575, + "step": 32363 + }, + { + "epoch": 2.0090632565646533, + "grad_norm": 0.13967381093456363, + "learning_rate": 2.971332932400619e-05, + "loss": 2.754, + "step": 32364 + }, + { + "epoch": 2.0091253336644113, + "grad_norm": 0.13881139721018007, + "learning_rate": 2.9710028474655104e-05, + "loss": 2.7021, + "step": 32365 + }, + { + "epoch": 2.009187410764169, + "grad_norm": 0.16198522915692679, + "learning_rate": 2.970672773116174e-05, + "loss": 2.733, + "step": 32366 + }, + { + "epoch": 2.009249487863927, + "grad_norm": 0.14814538194767943, + "learning_rate": 2.970342709354329e-05, + "loss": 2.8085, + "step": 32367 + }, + { + "epoch": 2.009311564963685, + "grad_norm": 0.14221252965179937, + "learning_rate": 2.9700126561817e-05, + "loss": 2.7384, + "step": 32368 + }, + { + "epoch": 2.009373642063443, + "grad_norm": 0.16401947115712812, + "learning_rate": 2.9696826136000065e-05, + "loss": 2.771, + "step": 32369 + }, + { + "epoch": 2.009435719163201, + "grad_norm": 0.15168149371843745, + "learning_rate": 2.969352581610973e-05, + "loss": 2.698, + "step": 32370 + }, + { + "epoch": 2.0094977962629588, + "grad_norm": 0.15931823177584836, + "learning_rate": 2.969022560216319e-05, + "loss": 2.7619, + "step": 32371 + }, + { + "epoch": 2.0095598733627167, + "grad_norm": 0.1472966814053017, + "learning_rate": 2.9686925494177687e-05, + "loss": 2.7233, + "step": 32372 + }, + { + "epoch": 2.0096219504624746, + "grad_norm": 0.16554279311558825, + "learning_rate": 2.9683625492170408e-05, + "loss": 2.793, + "step": 32373 + }, + { + "epoch": 2.009684027562232, + "grad_norm": 0.1913367846029675, + "learning_rate": 2.96803255961586e-05, + "loss": 2.7041, + "step": 32374 + }, + { + "epoch": 2.00974610466199, + "grad_norm": 0.15582232357552947, + "learning_rate": 2.9677025806159453e-05, + "loss": 2.7532, + "step": 32375 + }, + { + "epoch": 2.009808181761748, + "grad_norm": 0.15053089833074967, + "learning_rate": 2.9673726122190216e-05, + "loss": 2.8058, + "step": 32376 + }, + { + "epoch": 2.009870258861506, + "grad_norm": 0.15518425833688235, + "learning_rate": 2.9670426544268072e-05, + "loss": 2.7266, + "step": 32377 + }, + { + "epoch": 2.0099323359612637, + "grad_norm": 0.16150267374147598, + "learning_rate": 2.9667127072410238e-05, + "loss": 2.771, + "step": 32378 + }, + { + "epoch": 2.0099944130610217, + "grad_norm": 0.1449952731410272, + "learning_rate": 2.9663827706633963e-05, + "loss": 2.7374, + "step": 32379 + }, + { + "epoch": 2.0100564901607796, + "grad_norm": 0.13944969840875404, + "learning_rate": 2.9660528446956408e-05, + "loss": 2.7252, + "step": 32380 + }, + { + "epoch": 2.0101185672605375, + "grad_norm": 0.14154400806827105, + "learning_rate": 2.9657229293394832e-05, + "loss": 2.7673, + "step": 32381 + }, + { + "epoch": 2.0101806443602954, + "grad_norm": 0.14585601823210267, + "learning_rate": 2.965393024596642e-05, + "loss": 2.7202, + "step": 32382 + }, + { + "epoch": 2.0102427214600533, + "grad_norm": 0.15331043527459245, + "learning_rate": 2.9650631304688402e-05, + "loss": 2.7587, + "step": 32383 + }, + { + "epoch": 2.0103047985598113, + "grad_norm": 0.17978383674531062, + "learning_rate": 2.964733246957797e-05, + "loss": 2.7209, + "step": 32384 + }, + { + "epoch": 2.010366875659569, + "grad_norm": 0.16204219598132755, + "learning_rate": 2.9644033740652355e-05, + "loss": 2.7405, + "step": 32385 + }, + { + "epoch": 2.010428952759327, + "grad_norm": 0.13873297728684997, + "learning_rate": 2.9640735117928746e-05, + "loss": 2.6155, + "step": 32386 + }, + { + "epoch": 2.010491029859085, + "grad_norm": 0.16809530813172638, + "learning_rate": 2.9637436601424377e-05, + "loss": 2.802, + "step": 32387 + }, + { + "epoch": 2.010553106958843, + "grad_norm": 0.1514263837180668, + "learning_rate": 2.9634138191156446e-05, + "loss": 2.7143, + "step": 32388 + }, + { + "epoch": 2.010615184058601, + "grad_norm": 0.1626260978865454, + "learning_rate": 2.9630839887142147e-05, + "loss": 2.7462, + "step": 32389 + }, + { + "epoch": 2.0106772611583588, + "grad_norm": 0.14610416660810174, + "learning_rate": 2.962754168939871e-05, + "loss": 2.8293, + "step": 32390 + }, + { + "epoch": 2.0107393382581167, + "grad_norm": 0.13939833085476624, + "learning_rate": 2.9624243597943323e-05, + "loss": 2.7363, + "step": 32391 + }, + { + "epoch": 2.0108014153578746, + "grad_norm": 0.16096827458145002, + "learning_rate": 2.9620945612793215e-05, + "loss": 2.8314, + "step": 32392 + }, + { + "epoch": 2.0108634924576325, + "grad_norm": 0.15860685590326215, + "learning_rate": 2.9617647733965576e-05, + "loss": 2.7074, + "step": 32393 + }, + { + "epoch": 2.0109255695573904, + "grad_norm": 0.16577817540998455, + "learning_rate": 2.9614349961477604e-05, + "loss": 2.679, + "step": 32394 + }, + { + "epoch": 2.0109876466571484, + "grad_norm": 0.1622199175709414, + "learning_rate": 2.9611052295346532e-05, + "loss": 2.6456, + "step": 32395 + }, + { + "epoch": 2.0110497237569063, + "grad_norm": 0.1839454305943701, + "learning_rate": 2.9607754735589543e-05, + "loss": 2.7213, + "step": 32396 + }, + { + "epoch": 2.011111800856664, + "grad_norm": 0.16542148299516232, + "learning_rate": 2.9604457282223863e-05, + "loss": 2.7194, + "step": 32397 + }, + { + "epoch": 2.0111738779564217, + "grad_norm": 0.14524232723591934, + "learning_rate": 2.9601159935266663e-05, + "loss": 2.7615, + "step": 32398 + }, + { + "epoch": 2.0112359550561796, + "grad_norm": 0.15453591119854665, + "learning_rate": 2.959786269473518e-05, + "loss": 2.7542, + "step": 32399 + }, + { + "epoch": 2.0112980321559375, + "grad_norm": 0.21734805107660163, + "learning_rate": 2.9594565560646582e-05, + "loss": 2.7907, + "step": 32400 + }, + { + "epoch": 2.0113601092556954, + "grad_norm": 0.14945778175951815, + "learning_rate": 2.9591268533018103e-05, + "loss": 2.6479, + "step": 32401 + }, + { + "epoch": 2.0114221863554533, + "grad_norm": 0.15289635569066568, + "learning_rate": 2.9587971611866934e-05, + "loss": 2.7937, + "step": 32402 + }, + { + "epoch": 2.0114842634552113, + "grad_norm": 0.15335914095852485, + "learning_rate": 2.9584674797210253e-05, + "loss": 2.7138, + "step": 32403 + }, + { + "epoch": 2.011546340554969, + "grad_norm": 0.13930195059437542, + "learning_rate": 2.9581378089065305e-05, + "loss": 2.7031, + "step": 32404 + }, + { + "epoch": 2.011608417654727, + "grad_norm": 0.14750120105900968, + "learning_rate": 2.957808148744924e-05, + "loss": 2.6244, + "step": 32405 + }, + { + "epoch": 2.011670494754485, + "grad_norm": 0.17866271572213444, + "learning_rate": 2.95747849923793e-05, + "loss": 2.7234, + "step": 32406 + }, + { + "epoch": 2.011732571854243, + "grad_norm": 0.139392057238925, + "learning_rate": 2.9571488603872648e-05, + "loss": 2.7125, + "step": 32407 + }, + { + "epoch": 2.011794648954001, + "grad_norm": 0.17164655155164812, + "learning_rate": 2.9568192321946514e-05, + "loss": 2.7667, + "step": 32408 + }, + { + "epoch": 2.0118567260537588, + "grad_norm": 0.14481652947435186, + "learning_rate": 2.956489614661807e-05, + "loss": 2.6532, + "step": 32409 + }, + { + "epoch": 2.0119188031535167, + "grad_norm": 0.14988932102558333, + "learning_rate": 2.9561600077904532e-05, + "loss": 2.7947, + "step": 32410 + }, + { + "epoch": 2.0119808802532746, + "grad_norm": 0.15337492947977088, + "learning_rate": 2.9558304115823077e-05, + "loss": 2.6545, + "step": 32411 + }, + { + "epoch": 2.0120429573530325, + "grad_norm": 0.14934147835724193, + "learning_rate": 2.9555008260390928e-05, + "loss": 2.7791, + "step": 32412 + }, + { + "epoch": 2.0121050344527904, + "grad_norm": 0.15034145319256056, + "learning_rate": 2.9551712511625264e-05, + "loss": 2.704, + "step": 32413 + }, + { + "epoch": 2.0121671115525483, + "grad_norm": 0.15649415252733112, + "learning_rate": 2.954841686954326e-05, + "loss": 2.7063, + "step": 32414 + }, + { + "epoch": 2.0122291886523063, + "grad_norm": 0.1863354939377145, + "learning_rate": 2.9545121334162145e-05, + "loss": 2.7618, + "step": 32415 + }, + { + "epoch": 2.012291265752064, + "grad_norm": 0.14927569695761855, + "learning_rate": 2.954182590549909e-05, + "loss": 2.7524, + "step": 32416 + }, + { + "epoch": 2.012353342851822, + "grad_norm": 0.1432510395399292, + "learning_rate": 2.95385305835713e-05, + "loss": 2.6566, + "step": 32417 + }, + { + "epoch": 2.01241541995158, + "grad_norm": 0.16209572998898275, + "learning_rate": 2.9535235368395964e-05, + "loss": 2.7061, + "step": 32418 + }, + { + "epoch": 2.012477497051338, + "grad_norm": 0.1408893167469467, + "learning_rate": 2.9531940259990275e-05, + "loss": 2.6952, + "step": 32419 + }, + { + "epoch": 2.012539574151096, + "grad_norm": 0.18060395555428504, + "learning_rate": 2.9528645258371412e-05, + "loss": 2.7851, + "step": 32420 + }, + { + "epoch": 2.0126016512508538, + "grad_norm": 0.14644142792872078, + "learning_rate": 2.9525350363556594e-05, + "loss": 2.7081, + "step": 32421 + }, + { + "epoch": 2.0126637283506112, + "grad_norm": 0.14230737061108542, + "learning_rate": 2.9522055575562967e-05, + "loss": 2.7025, + "step": 32422 + }, + { + "epoch": 2.012725805450369, + "grad_norm": 0.1491311018107921, + "learning_rate": 2.951876089440777e-05, + "loss": 2.7312, + "step": 32423 + }, + { + "epoch": 2.012787882550127, + "grad_norm": 0.18255658087493584, + "learning_rate": 2.951546632010817e-05, + "loss": 2.6115, + "step": 32424 + }, + { + "epoch": 2.012849959649885, + "grad_norm": 0.13964324221250435, + "learning_rate": 2.951217185268134e-05, + "loss": 2.6376, + "step": 32425 + }, + { + "epoch": 2.012912036749643, + "grad_norm": 0.173732897573715, + "learning_rate": 2.9508877492144498e-05, + "loss": 2.6574, + "step": 32426 + }, + { + "epoch": 2.012974113849401, + "grad_norm": 0.14005169964410674, + "learning_rate": 2.9505583238514796e-05, + "loss": 2.7225, + "step": 32427 + }, + { + "epoch": 2.0130361909491588, + "grad_norm": 0.2023268939328492, + "learning_rate": 2.9502289091809444e-05, + "loss": 2.756, + "step": 32428 + }, + { + "epoch": 2.0130982680489167, + "grad_norm": 0.1485939330933222, + "learning_rate": 2.949899505204564e-05, + "loss": 2.6994, + "step": 32429 + }, + { + "epoch": 2.0131603451486746, + "grad_norm": 0.14285987293666064, + "learning_rate": 2.9495701119240547e-05, + "loss": 2.664, + "step": 32430 + }, + { + "epoch": 2.0132224222484325, + "grad_norm": 0.14051545697930393, + "learning_rate": 2.9492407293411374e-05, + "loss": 2.733, + "step": 32431 + }, + { + "epoch": 2.0132844993481904, + "grad_norm": 0.15122854589978715, + "learning_rate": 2.9489113574575273e-05, + "loss": 2.7907, + "step": 32432 + }, + { + "epoch": 2.0133465764479483, + "grad_norm": 0.14281287529226194, + "learning_rate": 2.9485819962749462e-05, + "loss": 2.7383, + "step": 32433 + }, + { + "epoch": 2.0134086535477063, + "grad_norm": 0.1463611338666421, + "learning_rate": 2.9482526457951086e-05, + "loss": 2.7445, + "step": 32434 + }, + { + "epoch": 2.013470730647464, + "grad_norm": 0.21197380594946832, + "learning_rate": 2.9479233060197375e-05, + "loss": 2.7951, + "step": 32435 + }, + { + "epoch": 2.013532807747222, + "grad_norm": 0.1677890526073993, + "learning_rate": 2.947593976950547e-05, + "loss": 2.7127, + "step": 32436 + }, + { + "epoch": 2.01359488484698, + "grad_norm": 0.13998926359970346, + "learning_rate": 2.9472646585892577e-05, + "loss": 2.7878, + "step": 32437 + }, + { + "epoch": 2.013656961946738, + "grad_norm": 0.14900713715204986, + "learning_rate": 2.9469353509375874e-05, + "loss": 2.7289, + "step": 32438 + }, + { + "epoch": 2.013719039046496, + "grad_norm": 0.1444644967137321, + "learning_rate": 2.9466060539972523e-05, + "loss": 2.7408, + "step": 32439 + }, + { + "epoch": 2.0137811161462538, + "grad_norm": 0.14407604290407527, + "learning_rate": 2.9462767677699733e-05, + "loss": 2.729, + "step": 32440 + }, + { + "epoch": 2.0138431932460117, + "grad_norm": 0.14249202663091085, + "learning_rate": 2.945947492257466e-05, + "loss": 2.6594, + "step": 32441 + }, + { + "epoch": 2.0139052703457696, + "grad_norm": 0.14637683735809237, + "learning_rate": 2.9456182274614497e-05, + "loss": 2.7118, + "step": 32442 + }, + { + "epoch": 2.0139673474455275, + "grad_norm": 0.14803642749409146, + "learning_rate": 2.945288973383642e-05, + "loss": 2.6898, + "step": 32443 + }, + { + "epoch": 2.0140294245452854, + "grad_norm": 0.1720057932670248, + "learning_rate": 2.9449597300257603e-05, + "loss": 2.7111, + "step": 32444 + }, + { + "epoch": 2.0140915016450434, + "grad_norm": 0.18261855425807683, + "learning_rate": 2.9446304973895224e-05, + "loss": 2.7258, + "step": 32445 + }, + { + "epoch": 2.014153578744801, + "grad_norm": 0.13801695410357215, + "learning_rate": 2.9443012754766463e-05, + "loss": 2.6498, + "step": 32446 + }, + { + "epoch": 2.0142156558445587, + "grad_norm": 0.16145085245665258, + "learning_rate": 2.943972064288849e-05, + "loss": 2.6233, + "step": 32447 + }, + { + "epoch": 2.0142777329443167, + "grad_norm": 0.17356127175894842, + "learning_rate": 2.94364286382785e-05, + "loss": 2.7188, + "step": 32448 + }, + { + "epoch": 2.0143398100440746, + "grad_norm": 0.1733721689520476, + "learning_rate": 2.9433136740953647e-05, + "loss": 2.7938, + "step": 32449 + }, + { + "epoch": 2.0144018871438325, + "grad_norm": 0.14156576559161219, + "learning_rate": 2.9429844950931107e-05, + "loss": 2.685, + "step": 32450 + }, + { + "epoch": 2.0144639642435904, + "grad_norm": 0.16422343308207507, + "learning_rate": 2.9426553268228062e-05, + "loss": 2.7359, + "step": 32451 + }, + { + "epoch": 2.0145260413433483, + "grad_norm": 0.16535717592144153, + "learning_rate": 2.9423261692861682e-05, + "loss": 2.6804, + "step": 32452 + }, + { + "epoch": 2.0145881184431063, + "grad_norm": 0.15747463014422663, + "learning_rate": 2.941997022484915e-05, + "loss": 2.8124, + "step": 32453 + }, + { + "epoch": 2.014650195542864, + "grad_norm": 0.1558325137269384, + "learning_rate": 2.9416678864207613e-05, + "loss": 2.7064, + "step": 32454 + }, + { + "epoch": 2.014712272642622, + "grad_norm": 0.19820589757535595, + "learning_rate": 2.9413387610954273e-05, + "loss": 2.7641, + "step": 32455 + }, + { + "epoch": 2.01477434974238, + "grad_norm": 0.16193753637175776, + "learning_rate": 2.941009646510628e-05, + "loss": 2.7459, + "step": 32456 + }, + { + "epoch": 2.014836426842138, + "grad_norm": 0.181991208056033, + "learning_rate": 2.9406805426680817e-05, + "loss": 2.6637, + "step": 32457 + }, + { + "epoch": 2.014898503941896, + "grad_norm": 0.14658515006857561, + "learning_rate": 2.940351449569504e-05, + "loss": 2.7066, + "step": 32458 + }, + { + "epoch": 2.0149605810416538, + "grad_norm": 0.17575210223663493, + "learning_rate": 2.9400223672166122e-05, + "loss": 2.6887, + "step": 32459 + }, + { + "epoch": 2.0150226581414117, + "grad_norm": 0.15866351542762877, + "learning_rate": 2.9396932956111255e-05, + "loss": 2.7178, + "step": 32460 + }, + { + "epoch": 2.0150847352411696, + "grad_norm": 0.16266098736543594, + "learning_rate": 2.939364234754758e-05, + "loss": 2.7084, + "step": 32461 + }, + { + "epoch": 2.0151468123409275, + "grad_norm": 0.17772340216667623, + "learning_rate": 2.9390351846492288e-05, + "loss": 2.7495, + "step": 32462 + }, + { + "epoch": 2.0152088894406854, + "grad_norm": 0.16962438609195563, + "learning_rate": 2.938706145296253e-05, + "loss": 2.7957, + "step": 32463 + }, + { + "epoch": 2.0152709665404434, + "grad_norm": 0.14002751205672168, + "learning_rate": 2.9383771166975465e-05, + "loss": 2.7101, + "step": 32464 + }, + { + "epoch": 2.0153330436402013, + "grad_norm": 0.16152173125577823, + "learning_rate": 2.9380480988548286e-05, + "loss": 2.6906, + "step": 32465 + }, + { + "epoch": 2.015395120739959, + "grad_norm": 0.14852106728532893, + "learning_rate": 2.9377190917698128e-05, + "loss": 2.7488, + "step": 32466 + }, + { + "epoch": 2.015457197839717, + "grad_norm": 0.14020638943732713, + "learning_rate": 2.9373900954442187e-05, + "loss": 2.7813, + "step": 32467 + }, + { + "epoch": 2.015519274939475, + "grad_norm": 0.1562804669240621, + "learning_rate": 2.9370611098797597e-05, + "loss": 2.6975, + "step": 32468 + }, + { + "epoch": 2.015581352039233, + "grad_norm": 0.13925387924011307, + "learning_rate": 2.936732135078155e-05, + "loss": 2.7474, + "step": 32469 + }, + { + "epoch": 2.0156434291389904, + "grad_norm": 0.14472917587865416, + "learning_rate": 2.936403171041118e-05, + "loss": 2.7453, + "step": 32470 + }, + { + "epoch": 2.0157055062387483, + "grad_norm": 0.15677185530873328, + "learning_rate": 2.936074217770368e-05, + "loss": 2.8154, + "step": 32471 + }, + { + "epoch": 2.0157675833385063, + "grad_norm": 0.14198879043734217, + "learning_rate": 2.9357452752676194e-05, + "loss": 2.7366, + "step": 32472 + }, + { + "epoch": 2.015829660438264, + "grad_norm": 0.14527733361891207, + "learning_rate": 2.9354163435345895e-05, + "loss": 2.7756, + "step": 32473 + }, + { + "epoch": 2.015891737538022, + "grad_norm": 0.1388556713045143, + "learning_rate": 2.9350874225729936e-05, + "loss": 2.7845, + "step": 32474 + }, + { + "epoch": 2.01595381463778, + "grad_norm": 0.1388099480011795, + "learning_rate": 2.9347585123845465e-05, + "loss": 2.7977, + "step": 32475 + }, + { + "epoch": 2.016015891737538, + "grad_norm": 0.15208848940152106, + "learning_rate": 2.934429612970967e-05, + "loss": 2.7278, + "step": 32476 + }, + { + "epoch": 2.016077968837296, + "grad_norm": 0.14402624130106437, + "learning_rate": 2.9341007243339692e-05, + "loss": 2.7021, + "step": 32477 + }, + { + "epoch": 2.0161400459370538, + "grad_norm": 0.17065627242924492, + "learning_rate": 2.9337718464752695e-05, + "loss": 2.7304, + "step": 32478 + }, + { + "epoch": 2.0162021230368117, + "grad_norm": 0.16795820380525825, + "learning_rate": 2.933442979396583e-05, + "loss": 2.7491, + "step": 32479 + }, + { + "epoch": 2.0162642001365696, + "grad_norm": 0.14528053891005255, + "learning_rate": 2.9331141230996273e-05, + "loss": 2.8136, + "step": 32480 + }, + { + "epoch": 2.0163262772363275, + "grad_norm": 0.14364969911092362, + "learning_rate": 2.9327852775861152e-05, + "loss": 2.7608, + "step": 32481 + }, + { + "epoch": 2.0163883543360854, + "grad_norm": 0.14017385728357357, + "learning_rate": 2.932456442857766e-05, + "loss": 2.723, + "step": 32482 + }, + { + "epoch": 2.0164504314358433, + "grad_norm": 0.16880014831185605, + "learning_rate": 2.9321276189162926e-05, + "loss": 2.7701, + "step": 32483 + }, + { + "epoch": 2.0165125085356013, + "grad_norm": 0.15881080830448802, + "learning_rate": 2.9317988057634117e-05, + "loss": 2.7653, + "step": 32484 + }, + { + "epoch": 2.016574585635359, + "grad_norm": 0.15215812280113059, + "learning_rate": 2.9314700034008392e-05, + "loss": 2.7021, + "step": 32485 + }, + { + "epoch": 2.016636662735117, + "grad_norm": 0.15348374106031493, + "learning_rate": 2.9311412118302884e-05, + "loss": 2.6749, + "step": 32486 + }, + { + "epoch": 2.016698739834875, + "grad_norm": 0.15353854859984822, + "learning_rate": 2.9308124310534774e-05, + "loss": 2.6641, + "step": 32487 + }, + { + "epoch": 2.016760816934633, + "grad_norm": 0.15924985778269263, + "learning_rate": 2.9304836610721187e-05, + "loss": 2.662, + "step": 32488 + }, + { + "epoch": 2.016822894034391, + "grad_norm": 0.18796946443785045, + "learning_rate": 2.9301549018879305e-05, + "loss": 2.6808, + "step": 32489 + }, + { + "epoch": 2.0168849711341488, + "grad_norm": 0.14338796366923484, + "learning_rate": 2.929826153502626e-05, + "loss": 2.6025, + "step": 32490 + }, + { + "epoch": 2.0169470482339067, + "grad_norm": 0.14434361521286584, + "learning_rate": 2.9294974159179218e-05, + "loss": 2.7124, + "step": 32491 + }, + { + "epoch": 2.0170091253336646, + "grad_norm": 0.1405531514257558, + "learning_rate": 2.9291686891355296e-05, + "loss": 2.7696, + "step": 32492 + }, + { + "epoch": 2.0170712024334225, + "grad_norm": 0.13881718718590352, + "learning_rate": 2.928839973157169e-05, + "loss": 2.8177, + "step": 32493 + }, + { + "epoch": 2.01713327953318, + "grad_norm": 0.1483908881235014, + "learning_rate": 2.9285112679845535e-05, + "loss": 2.6431, + "step": 32494 + }, + { + "epoch": 2.017195356632938, + "grad_norm": 0.1444430081882454, + "learning_rate": 2.928182573619397e-05, + "loss": 2.7995, + "step": 32495 + }, + { + "epoch": 2.017257433732696, + "grad_norm": 0.14238482919947112, + "learning_rate": 2.9278538900634156e-05, + "loss": 2.7428, + "step": 32496 + }, + { + "epoch": 2.0173195108324538, + "grad_norm": 0.14732091457723154, + "learning_rate": 2.927525217318322e-05, + "loss": 2.7899, + "step": 32497 + }, + { + "epoch": 2.0173815879322117, + "grad_norm": 0.16401315128553834, + "learning_rate": 2.927196555385834e-05, + "loss": 2.7041, + "step": 32498 + }, + { + "epoch": 2.0174436650319696, + "grad_norm": 0.176818700177385, + "learning_rate": 2.926867904267665e-05, + "loss": 2.772, + "step": 32499 + }, + { + "epoch": 2.0175057421317275, + "grad_norm": 0.1465511551270697, + "learning_rate": 2.926539263965528e-05, + "loss": 2.7347, + "step": 32500 + }, + { + "epoch": 2.0175678192314854, + "grad_norm": 0.1446248279952145, + "learning_rate": 2.92621063448114e-05, + "loss": 2.6877, + "step": 32501 + }, + { + "epoch": 2.0176298963312433, + "grad_norm": 0.14974595277480343, + "learning_rate": 2.9258820158162135e-05, + "loss": 2.7669, + "step": 32502 + }, + { + "epoch": 2.0176919734310013, + "grad_norm": 0.1614925818625564, + "learning_rate": 2.9255534079724656e-05, + "loss": 2.7645, + "step": 32503 + }, + { + "epoch": 2.017754050530759, + "grad_norm": 0.13473426758103732, + "learning_rate": 2.9252248109516077e-05, + "loss": 2.6838, + "step": 32504 + }, + { + "epoch": 2.017816127630517, + "grad_norm": 0.14665898406794414, + "learning_rate": 2.9248962247553568e-05, + "loss": 2.7097, + "step": 32505 + }, + { + "epoch": 2.017878204730275, + "grad_norm": 0.14311702340607707, + "learning_rate": 2.924567649385425e-05, + "loss": 2.6674, + "step": 32506 + }, + { + "epoch": 2.017940281830033, + "grad_norm": 0.14317304783371637, + "learning_rate": 2.9242390848435287e-05, + "loss": 2.6812, + "step": 32507 + }, + { + "epoch": 2.018002358929791, + "grad_norm": 0.14400753943297845, + "learning_rate": 2.9239105311313797e-05, + "loss": 2.74, + "step": 32508 + }, + { + "epoch": 2.0180644360295488, + "grad_norm": 0.1551141641764564, + "learning_rate": 2.9235819882506956e-05, + "loss": 2.7678, + "step": 32509 + }, + { + "epoch": 2.0181265131293067, + "grad_norm": 0.1481006024081975, + "learning_rate": 2.9232534562031864e-05, + "loss": 2.754, + "step": 32510 + }, + { + "epoch": 2.0181885902290646, + "grad_norm": 0.14499236986005032, + "learning_rate": 2.9229249349905684e-05, + "loss": 2.7848, + "step": 32511 + }, + { + "epoch": 2.0182506673288225, + "grad_norm": 0.1451414475628651, + "learning_rate": 2.922596424614556e-05, + "loss": 2.7322, + "step": 32512 + }, + { + "epoch": 2.0183127444285804, + "grad_norm": 0.14608151670226024, + "learning_rate": 2.9222679250768613e-05, + "loss": 2.7213, + "step": 32513 + }, + { + "epoch": 2.0183748215283384, + "grad_norm": 0.15037165559306226, + "learning_rate": 2.9219394363792e-05, + "loss": 2.6623, + "step": 32514 + }, + { + "epoch": 2.0184368986280963, + "grad_norm": 0.14723481278393374, + "learning_rate": 2.9216109585232842e-05, + "loss": 2.7346, + "step": 32515 + }, + { + "epoch": 2.018498975727854, + "grad_norm": 0.15610280852624508, + "learning_rate": 2.9212824915108296e-05, + "loss": 2.7244, + "step": 32516 + }, + { + "epoch": 2.018561052827612, + "grad_norm": 0.17698003172779975, + "learning_rate": 2.9209540353435483e-05, + "loss": 2.7853, + "step": 32517 + }, + { + "epoch": 2.0186231299273696, + "grad_norm": 0.17257545531294474, + "learning_rate": 2.9206255900231548e-05, + "loss": 2.784, + "step": 32518 + }, + { + "epoch": 2.0186852070271275, + "grad_norm": 0.15135522610564212, + "learning_rate": 2.920297155551362e-05, + "loss": 2.7155, + "step": 32519 + }, + { + "epoch": 2.0187472841268854, + "grad_norm": 0.17967856426411227, + "learning_rate": 2.9199687319298846e-05, + "loss": 2.7389, + "step": 32520 + }, + { + "epoch": 2.0188093612266433, + "grad_norm": 0.15134862355252832, + "learning_rate": 2.9196403191604337e-05, + "loss": 2.7015, + "step": 32521 + }, + { + "epoch": 2.0188714383264013, + "grad_norm": 0.14847621287420387, + "learning_rate": 2.919311917244726e-05, + "loss": 2.7419, + "step": 32522 + }, + { + "epoch": 2.018933515426159, + "grad_norm": 0.14304434961061707, + "learning_rate": 2.9189835261844734e-05, + "loss": 2.7068, + "step": 32523 + }, + { + "epoch": 2.018995592525917, + "grad_norm": 0.16963791414280163, + "learning_rate": 2.9186551459813884e-05, + "loss": 2.7686, + "step": 32524 + }, + { + "epoch": 2.019057669625675, + "grad_norm": 0.17347106802059406, + "learning_rate": 2.9183267766371825e-05, + "loss": 2.7017, + "step": 32525 + }, + { + "epoch": 2.019119746725433, + "grad_norm": 0.16641594921196023, + "learning_rate": 2.9179984181535713e-05, + "loss": 2.8031, + "step": 32526 + }, + { + "epoch": 2.019181823825191, + "grad_norm": 0.14092231156919915, + "learning_rate": 2.91767007053227e-05, + "loss": 2.7388, + "step": 32527 + }, + { + "epoch": 2.0192439009249488, + "grad_norm": 0.14339228286150565, + "learning_rate": 2.91734173377499e-05, + "loss": 2.6539, + "step": 32528 + }, + { + "epoch": 2.0193059780247067, + "grad_norm": 0.14942856637208352, + "learning_rate": 2.9170134078834427e-05, + "loss": 2.7267, + "step": 32529 + }, + { + "epoch": 2.0193680551244646, + "grad_norm": 0.16171411793290055, + "learning_rate": 2.9166850928593404e-05, + "loss": 2.6335, + "step": 32530 + }, + { + "epoch": 2.0194301322242225, + "grad_norm": 0.21019065178292648, + "learning_rate": 2.9163567887043998e-05, + "loss": 2.7373, + "step": 32531 + }, + { + "epoch": 2.0194922093239804, + "grad_norm": 0.19839942033602745, + "learning_rate": 2.9160284954203322e-05, + "loss": 2.6584, + "step": 32532 + }, + { + "epoch": 2.0195542864237384, + "grad_norm": 0.17081358846868266, + "learning_rate": 2.9157002130088497e-05, + "loss": 2.6565, + "step": 32533 + }, + { + "epoch": 2.0196163635234963, + "grad_norm": 0.1581711442435535, + "learning_rate": 2.9153719414716625e-05, + "loss": 2.8276, + "step": 32534 + }, + { + "epoch": 2.019678440623254, + "grad_norm": 0.22244249088211493, + "learning_rate": 2.915043680810488e-05, + "loss": 2.7408, + "step": 32535 + }, + { + "epoch": 2.019740517723012, + "grad_norm": 0.15363921875912898, + "learning_rate": 2.9147154310270376e-05, + "loss": 2.7318, + "step": 32536 + }, + { + "epoch": 2.01980259482277, + "grad_norm": 0.1547078983748284, + "learning_rate": 2.914387192123022e-05, + "loss": 2.7134, + "step": 32537 + }, + { + "epoch": 2.019864671922528, + "grad_norm": 0.1507585471361164, + "learning_rate": 2.9140589641001548e-05, + "loss": 2.7309, + "step": 32538 + }, + { + "epoch": 2.019926749022286, + "grad_norm": 0.14415175645640024, + "learning_rate": 2.9137307469601472e-05, + "loss": 2.7153, + "step": 32539 + }, + { + "epoch": 2.019988826122044, + "grad_norm": 0.17578837062400648, + "learning_rate": 2.913402540704714e-05, + "loss": 2.7833, + "step": 32540 + }, + { + "epoch": 2.0200509032218017, + "grad_norm": 0.1482833136805844, + "learning_rate": 2.913074345335567e-05, + "loss": 2.7545, + "step": 32541 + }, + { + "epoch": 2.020112980321559, + "grad_norm": 0.13760012183812967, + "learning_rate": 2.9127461608544172e-05, + "loss": 2.7465, + "step": 32542 + }, + { + "epoch": 2.020175057421317, + "grad_norm": 0.1399073767698686, + "learning_rate": 2.912417987262975e-05, + "loss": 2.6407, + "step": 32543 + }, + { + "epoch": 2.020237134521075, + "grad_norm": 0.14658919716536717, + "learning_rate": 2.912089824562958e-05, + "loss": 2.7911, + "step": 32544 + }, + { + "epoch": 2.020299211620833, + "grad_norm": 0.16393376022258788, + "learning_rate": 2.9117616727560747e-05, + "loss": 2.8035, + "step": 32545 + }, + { + "epoch": 2.020361288720591, + "grad_norm": 0.149427151574161, + "learning_rate": 2.911433531844038e-05, + "loss": 2.7197, + "step": 32546 + }, + { + "epoch": 2.0204233658203488, + "grad_norm": 0.1594587232363004, + "learning_rate": 2.911105401828559e-05, + "loss": 2.7846, + "step": 32547 + }, + { + "epoch": 2.0204854429201067, + "grad_norm": 0.14450279161865176, + "learning_rate": 2.9107772827113483e-05, + "loss": 2.684, + "step": 32548 + }, + { + "epoch": 2.0205475200198646, + "grad_norm": 0.15253258885556253, + "learning_rate": 2.910449174494122e-05, + "loss": 2.7948, + "step": 32549 + }, + { + "epoch": 2.0206095971196225, + "grad_norm": 0.14311742686554582, + "learning_rate": 2.910121077178589e-05, + "loss": 2.7746, + "step": 32550 + }, + { + "epoch": 2.0206716742193804, + "grad_norm": 0.15009388816536448, + "learning_rate": 2.9097929907664623e-05, + "loss": 2.7227, + "step": 32551 + }, + { + "epoch": 2.0207337513191383, + "grad_norm": 0.1407369062421176, + "learning_rate": 2.9094649152594512e-05, + "loss": 2.7558, + "step": 32552 + }, + { + "epoch": 2.0207958284188963, + "grad_norm": 0.14981793689450643, + "learning_rate": 2.909136850659271e-05, + "loss": 2.712, + "step": 32553 + }, + { + "epoch": 2.020857905518654, + "grad_norm": 0.15197372125326714, + "learning_rate": 2.908808796967631e-05, + "loss": 2.7601, + "step": 32554 + }, + { + "epoch": 2.020919982618412, + "grad_norm": 0.1417600271199675, + "learning_rate": 2.9084807541862435e-05, + "loss": 2.7342, + "step": 32555 + }, + { + "epoch": 2.02098205971817, + "grad_norm": 0.15233284348792375, + "learning_rate": 2.908152722316817e-05, + "loss": 2.6684, + "step": 32556 + }, + { + "epoch": 2.021044136817928, + "grad_norm": 0.14782297142673298, + "learning_rate": 2.9078247013610682e-05, + "loss": 2.7878, + "step": 32557 + }, + { + "epoch": 2.021106213917686, + "grad_norm": 0.1785379208053483, + "learning_rate": 2.9074966913207036e-05, + "loss": 2.7008, + "step": 32558 + }, + { + "epoch": 2.0211682910174438, + "grad_norm": 0.15502981351030426, + "learning_rate": 2.907168692197439e-05, + "loss": 2.7894, + "step": 32559 + }, + { + "epoch": 2.0212303681172017, + "grad_norm": 0.17230679718504097, + "learning_rate": 2.9068407039929835e-05, + "loss": 2.8646, + "step": 32560 + }, + { + "epoch": 2.0212924452169596, + "grad_norm": 0.14834381585393921, + "learning_rate": 2.9065127267090476e-05, + "loss": 2.7002, + "step": 32561 + }, + { + "epoch": 2.0213545223167175, + "grad_norm": 0.1629733250763054, + "learning_rate": 2.9061847603473413e-05, + "loss": 2.6199, + "step": 32562 + }, + { + "epoch": 2.0214165994164754, + "grad_norm": 0.14137168509250647, + "learning_rate": 2.905856804909579e-05, + "loss": 2.7641, + "step": 32563 + }, + { + "epoch": 2.0214786765162334, + "grad_norm": 0.14820991813241152, + "learning_rate": 2.9055288603974705e-05, + "loss": 2.6489, + "step": 32564 + }, + { + "epoch": 2.021540753615991, + "grad_norm": 0.14945330135735865, + "learning_rate": 2.9052009268127267e-05, + "loss": 2.6892, + "step": 32565 + }, + { + "epoch": 2.0216028307157488, + "grad_norm": 0.13969453304302226, + "learning_rate": 2.9048730041570558e-05, + "loss": 2.7734, + "step": 32566 + }, + { + "epoch": 2.0216649078155067, + "grad_norm": 0.14920782678378414, + "learning_rate": 2.904545092432173e-05, + "loss": 2.7028, + "step": 32567 + }, + { + "epoch": 2.0217269849152646, + "grad_norm": 0.17268907053810031, + "learning_rate": 2.9042171916397877e-05, + "loss": 2.7109, + "step": 32568 + }, + { + "epoch": 2.0217890620150225, + "grad_norm": 0.14420087532043654, + "learning_rate": 2.9038893017816093e-05, + "loss": 2.7231, + "step": 32569 + }, + { + "epoch": 2.0218511391147804, + "grad_norm": 0.14633991111503533, + "learning_rate": 2.9035614228593476e-05, + "loss": 2.7165, + "step": 32570 + }, + { + "epoch": 2.0219132162145383, + "grad_norm": 0.1649105598782169, + "learning_rate": 2.903233554874717e-05, + "loss": 2.7555, + "step": 32571 + }, + { + "epoch": 2.0219752933142963, + "grad_norm": 0.18649089988796777, + "learning_rate": 2.902905697829426e-05, + "loss": 2.7319, + "step": 32572 + }, + { + "epoch": 2.022037370414054, + "grad_norm": 0.18490441172583602, + "learning_rate": 2.902577851725185e-05, + "loss": 2.6799, + "step": 32573 + }, + { + "epoch": 2.022099447513812, + "grad_norm": 0.1447314795663951, + "learning_rate": 2.9022500165637044e-05, + "loss": 2.7468, + "step": 32574 + }, + { + "epoch": 2.02216152461357, + "grad_norm": 0.1728094178885493, + "learning_rate": 2.9019221923466926e-05, + "loss": 2.7677, + "step": 32575 + }, + { + "epoch": 2.022223601713328, + "grad_norm": 0.15575688154309064, + "learning_rate": 2.9015943790758642e-05, + "loss": 2.7299, + "step": 32576 + }, + { + "epoch": 2.022285678813086, + "grad_norm": 0.14087616160860753, + "learning_rate": 2.9012665767529278e-05, + "loss": 2.8179, + "step": 32577 + }, + { + "epoch": 2.0223477559128438, + "grad_norm": 0.14500491392918405, + "learning_rate": 2.900938785379592e-05, + "loss": 2.6563, + "step": 32578 + }, + { + "epoch": 2.0224098330126017, + "grad_norm": 0.14612386846920056, + "learning_rate": 2.9006110049575674e-05, + "loss": 2.7278, + "step": 32579 + }, + { + "epoch": 2.0224719101123596, + "grad_norm": 0.19196907323430956, + "learning_rate": 2.9002832354885663e-05, + "loss": 2.843, + "step": 32580 + }, + { + "epoch": 2.0225339872121175, + "grad_norm": 0.14540154461694474, + "learning_rate": 2.899955476974297e-05, + "loss": 2.6417, + "step": 32581 + }, + { + "epoch": 2.0225960643118754, + "grad_norm": 0.2108841846339317, + "learning_rate": 2.89962772941647e-05, + "loss": 2.7489, + "step": 32582 + }, + { + "epoch": 2.0226581414116334, + "grad_norm": 0.14768275573026146, + "learning_rate": 2.8992999928167952e-05, + "loss": 2.7658, + "step": 32583 + }, + { + "epoch": 2.0227202185113913, + "grad_norm": 0.1424515763521091, + "learning_rate": 2.8989722671769804e-05, + "loss": 2.7328, + "step": 32584 + }, + { + "epoch": 2.022782295611149, + "grad_norm": 0.15246545380791868, + "learning_rate": 2.8986445524987382e-05, + "loss": 2.7223, + "step": 32585 + }, + { + "epoch": 2.022844372710907, + "grad_norm": 0.15146807108231702, + "learning_rate": 2.8983168487837787e-05, + "loss": 2.7805, + "step": 32586 + }, + { + "epoch": 2.022906449810665, + "grad_norm": 0.15474497035778045, + "learning_rate": 2.89798915603381e-05, + "loss": 2.6161, + "step": 32587 + }, + { + "epoch": 2.022968526910423, + "grad_norm": 0.14601102644168057, + "learning_rate": 2.8976614742505403e-05, + "loss": 2.771, + "step": 32588 + }, + { + "epoch": 2.023030604010181, + "grad_norm": 0.14922115863234173, + "learning_rate": 2.897333803435683e-05, + "loss": 2.7881, + "step": 32589 + }, + { + "epoch": 2.0230926811099383, + "grad_norm": 0.14294140403079753, + "learning_rate": 2.8970061435909435e-05, + "loss": 2.8429, + "step": 32590 + }, + { + "epoch": 2.0231547582096963, + "grad_norm": 0.1452069275884664, + "learning_rate": 2.8966784947180358e-05, + "loss": 2.6982, + "step": 32591 + }, + { + "epoch": 2.023216835309454, + "grad_norm": 0.15763065583120223, + "learning_rate": 2.896350856818667e-05, + "loss": 2.7554, + "step": 32592 + }, + { + "epoch": 2.023278912409212, + "grad_norm": 0.1914230066866479, + "learning_rate": 2.8960232298945437e-05, + "loss": 2.7279, + "step": 32593 + }, + { + "epoch": 2.02334098950897, + "grad_norm": 0.15178911409156784, + "learning_rate": 2.8956956139473808e-05, + "loss": 2.7597, + "step": 32594 + }, + { + "epoch": 2.023403066608728, + "grad_norm": 0.1410315980080711, + "learning_rate": 2.8953680089788842e-05, + "loss": 2.7569, + "step": 32595 + }, + { + "epoch": 2.023465143708486, + "grad_norm": 0.14568998466739316, + "learning_rate": 2.895040414990764e-05, + "loss": 2.7689, + "step": 32596 + }, + { + "epoch": 2.0235272208082438, + "grad_norm": 0.1485855280469723, + "learning_rate": 2.8947128319847287e-05, + "loss": 2.8086, + "step": 32597 + }, + { + "epoch": 2.0235892979080017, + "grad_norm": 0.14192004725896135, + "learning_rate": 2.894385259962486e-05, + "loss": 2.7735, + "step": 32598 + }, + { + "epoch": 2.0236513750077596, + "grad_norm": 0.1358181583035127, + "learning_rate": 2.8940576989257483e-05, + "loss": 2.7528, + "step": 32599 + }, + { + "epoch": 2.0237134521075175, + "grad_norm": 0.17429240512972644, + "learning_rate": 2.8937301488762227e-05, + "loss": 2.7372, + "step": 32600 + }, + { + "epoch": 2.0237755292072754, + "grad_norm": 0.14463213029007285, + "learning_rate": 2.8934026098156186e-05, + "loss": 2.7996, + "step": 32601 + }, + { + "epoch": 2.0238376063070334, + "grad_norm": 0.13364728669139214, + "learning_rate": 2.893075081745642e-05, + "loss": 2.6369, + "step": 32602 + }, + { + "epoch": 2.0238996834067913, + "grad_norm": 0.1493587266764461, + "learning_rate": 2.8927475646680062e-05, + "loss": 2.754, + "step": 32603 + }, + { + "epoch": 2.023961760506549, + "grad_norm": 0.15720992859227234, + "learning_rate": 2.8924200585844185e-05, + "loss": 2.7642, + "step": 32604 + }, + { + "epoch": 2.024023837606307, + "grad_norm": 0.1412336304872144, + "learning_rate": 2.8920925634965867e-05, + "loss": 2.7138, + "step": 32605 + }, + { + "epoch": 2.024085914706065, + "grad_norm": 0.15259168086112898, + "learning_rate": 2.8917650794062172e-05, + "loss": 2.7167, + "step": 32606 + }, + { + "epoch": 2.024147991805823, + "grad_norm": 0.14245457717926885, + "learning_rate": 2.8914376063150233e-05, + "loss": 2.7285, + "step": 32607 + }, + { + "epoch": 2.024210068905581, + "grad_norm": 0.14070501964846033, + "learning_rate": 2.8911101442247114e-05, + "loss": 2.6558, + "step": 32608 + }, + { + "epoch": 2.024272146005339, + "grad_norm": 0.17967128680516317, + "learning_rate": 2.8907826931369903e-05, + "loss": 2.7516, + "step": 32609 + }, + { + "epoch": 2.0243342231050967, + "grad_norm": 0.14677745110970414, + "learning_rate": 2.890455253053567e-05, + "loss": 2.7417, + "step": 32610 + }, + { + "epoch": 2.0243963002048546, + "grad_norm": 0.1517755905499764, + "learning_rate": 2.89012782397615e-05, + "loss": 2.764, + "step": 32611 + }, + { + "epoch": 2.0244583773046125, + "grad_norm": 0.13778347547719022, + "learning_rate": 2.8898004059064493e-05, + "loss": 2.7633, + "step": 32612 + }, + { + "epoch": 2.02452045440437, + "grad_norm": 0.1525286628216124, + "learning_rate": 2.8894729988461722e-05, + "loss": 2.6821, + "step": 32613 + }, + { + "epoch": 2.024582531504128, + "grad_norm": 0.14838985255613638, + "learning_rate": 2.889145602797027e-05, + "loss": 2.706, + "step": 32614 + }, + { + "epoch": 2.024644608603886, + "grad_norm": 0.15220471909415353, + "learning_rate": 2.88881821776072e-05, + "loss": 2.7757, + "step": 32615 + }, + { + "epoch": 2.0247066857036438, + "grad_norm": 0.13958231153991926, + "learning_rate": 2.8884908437389624e-05, + "loss": 2.6949, + "step": 32616 + }, + { + "epoch": 2.0247687628034017, + "grad_norm": 0.18038318554260865, + "learning_rate": 2.8881634807334608e-05, + "loss": 2.6569, + "step": 32617 + }, + { + "epoch": 2.0248308399031596, + "grad_norm": 0.15593717373410573, + "learning_rate": 2.8878361287459233e-05, + "loss": 2.7008, + "step": 32618 + }, + { + "epoch": 2.0248929170029175, + "grad_norm": 0.13878120222951973, + "learning_rate": 2.887508787778055e-05, + "loss": 2.7276, + "step": 32619 + }, + { + "epoch": 2.0249549941026754, + "grad_norm": 0.16044944001621955, + "learning_rate": 2.887181457831568e-05, + "loss": 2.7496, + "step": 32620 + }, + { + "epoch": 2.0250170712024333, + "grad_norm": 0.15398011105390216, + "learning_rate": 2.8868541389081683e-05, + "loss": 2.7534, + "step": 32621 + }, + { + "epoch": 2.0250791483021913, + "grad_norm": 0.18704767105689007, + "learning_rate": 2.886526831009564e-05, + "loss": 2.7392, + "step": 32622 + }, + { + "epoch": 2.025141225401949, + "grad_norm": 0.15539411138130357, + "learning_rate": 2.8861995341374602e-05, + "loss": 2.8199, + "step": 32623 + }, + { + "epoch": 2.025203302501707, + "grad_norm": 0.15197136801328215, + "learning_rate": 2.8858722482935685e-05, + "loss": 2.8343, + "step": 32624 + }, + { + "epoch": 2.025265379601465, + "grad_norm": 0.14502567702092461, + "learning_rate": 2.8855449734795924e-05, + "loss": 2.7972, + "step": 32625 + }, + { + "epoch": 2.025327456701223, + "grad_norm": 0.1671723960578115, + "learning_rate": 2.8852177096972433e-05, + "loss": 2.664, + "step": 32626 + }, + { + "epoch": 2.025389533800981, + "grad_norm": 0.14336254599699627, + "learning_rate": 2.8848904569482272e-05, + "loss": 2.7929, + "step": 32627 + }, + { + "epoch": 2.0254516109007388, + "grad_norm": 0.14439203671674558, + "learning_rate": 2.8845632152342506e-05, + "loss": 2.7016, + "step": 32628 + }, + { + "epoch": 2.0255136880004967, + "grad_norm": 0.1427123535302346, + "learning_rate": 2.88423598455702e-05, + "loss": 2.7241, + "step": 32629 + }, + { + "epoch": 2.0255757651002546, + "grad_norm": 0.15348542792467723, + "learning_rate": 2.883908764918245e-05, + "loss": 2.6733, + "step": 32630 + }, + { + "epoch": 2.0256378422000125, + "grad_norm": 0.13659930461035272, + "learning_rate": 2.8835815563196324e-05, + "loss": 2.6372, + "step": 32631 + }, + { + "epoch": 2.0256999192997704, + "grad_norm": 0.1347621212379427, + "learning_rate": 2.883254358762888e-05, + "loss": 2.7545, + "step": 32632 + }, + { + "epoch": 2.0257619963995284, + "grad_norm": 0.1601294576518012, + "learning_rate": 2.88292717224972e-05, + "loss": 2.7357, + "step": 32633 + }, + { + "epoch": 2.0258240734992863, + "grad_norm": 0.1477922859947432, + "learning_rate": 2.882599996781833e-05, + "loss": 2.7353, + "step": 32634 + }, + { + "epoch": 2.025886150599044, + "grad_norm": 0.1366598206630859, + "learning_rate": 2.8822728323609372e-05, + "loss": 2.745, + "step": 32635 + }, + { + "epoch": 2.025948227698802, + "grad_norm": 0.1665883004208308, + "learning_rate": 2.881945678988738e-05, + "loss": 2.7066, + "step": 32636 + }, + { + "epoch": 2.0260103047985596, + "grad_norm": 0.15976985398830246, + "learning_rate": 2.881618536666943e-05, + "loss": 2.6623, + "step": 32637 + }, + { + "epoch": 2.0260723818983175, + "grad_norm": 0.14970611253470192, + "learning_rate": 2.881291405397256e-05, + "loss": 2.7277, + "step": 32638 + }, + { + "epoch": 2.0261344589980754, + "grad_norm": 0.1365841134489639, + "learning_rate": 2.880964285181388e-05, + "loss": 2.7341, + "step": 32639 + }, + { + "epoch": 2.0261965360978333, + "grad_norm": 0.16330204995470857, + "learning_rate": 2.8806371760210437e-05, + "loss": 2.7572, + "step": 32640 + }, + { + "epoch": 2.0262586131975913, + "grad_norm": 0.14932278107431954, + "learning_rate": 2.8803100779179305e-05, + "loss": 2.7398, + "step": 32641 + }, + { + "epoch": 2.026320690297349, + "grad_norm": 0.18765915632578056, + "learning_rate": 2.879982990873752e-05, + "loss": 2.8323, + "step": 32642 + }, + { + "epoch": 2.026382767397107, + "grad_norm": 0.1567647858073197, + "learning_rate": 2.879655914890218e-05, + "loss": 2.8331, + "step": 32643 + }, + { + "epoch": 2.026444844496865, + "grad_norm": 0.1511903960935009, + "learning_rate": 2.879328849969034e-05, + "loss": 2.7026, + "step": 32644 + }, + { + "epoch": 2.026506921596623, + "grad_norm": 0.14726186413604603, + "learning_rate": 2.879001796111907e-05, + "loss": 2.713, + "step": 32645 + }, + { + "epoch": 2.026568998696381, + "grad_norm": 0.15096130713488795, + "learning_rate": 2.8786747533205423e-05, + "loss": 2.7706, + "step": 32646 + }, + { + "epoch": 2.0266310757961388, + "grad_norm": 0.153249675734054, + "learning_rate": 2.8783477215966436e-05, + "loss": 2.713, + "step": 32647 + }, + { + "epoch": 2.0266931528958967, + "grad_norm": 0.15021596401988738, + "learning_rate": 2.878020700941923e-05, + "loss": 2.7482, + "step": 32648 + }, + { + "epoch": 2.0267552299956546, + "grad_norm": 0.14989900740597994, + "learning_rate": 2.8776936913580833e-05, + "loss": 2.7944, + "step": 32649 + }, + { + "epoch": 2.0268173070954125, + "grad_norm": 0.15301054744891032, + "learning_rate": 2.87736669284683e-05, + "loss": 2.765, + "step": 32650 + }, + { + "epoch": 2.0268793841951704, + "grad_norm": 0.1482036153396112, + "learning_rate": 2.8770397054098687e-05, + "loss": 2.7038, + "step": 32651 + }, + { + "epoch": 2.0269414612949284, + "grad_norm": 0.1452172918568697, + "learning_rate": 2.8767127290489084e-05, + "loss": 2.6324, + "step": 32652 + }, + { + "epoch": 2.0270035383946863, + "grad_norm": 0.1592244804874719, + "learning_rate": 2.8763857637656533e-05, + "loss": 2.7571, + "step": 32653 + }, + { + "epoch": 2.027065615494444, + "grad_norm": 0.13712148612891897, + "learning_rate": 2.876058809561809e-05, + "loss": 2.7795, + "step": 32654 + }, + { + "epoch": 2.027127692594202, + "grad_norm": 0.1485585798044026, + "learning_rate": 2.8757318664390796e-05, + "loss": 2.7478, + "step": 32655 + }, + { + "epoch": 2.02718976969396, + "grad_norm": 0.1476178692077417, + "learning_rate": 2.875404934399173e-05, + "loss": 2.7028, + "step": 32656 + }, + { + "epoch": 2.027251846793718, + "grad_norm": 0.16320819612205445, + "learning_rate": 2.875078013443796e-05, + "loss": 2.7058, + "step": 32657 + }, + { + "epoch": 2.027313923893476, + "grad_norm": 0.17134793704070875, + "learning_rate": 2.8747511035746534e-05, + "loss": 2.6777, + "step": 32658 + }, + { + "epoch": 2.027376000993234, + "grad_norm": 0.15025765250080503, + "learning_rate": 2.87442420479345e-05, + "loss": 2.7438, + "step": 32659 + }, + { + "epoch": 2.0274380780929917, + "grad_norm": 0.15222878886908106, + "learning_rate": 2.8740973171018914e-05, + "loss": 2.6673, + "step": 32660 + }, + { + "epoch": 2.027500155192749, + "grad_norm": 0.1536159883758967, + "learning_rate": 2.8737704405016812e-05, + "loss": 2.7009, + "step": 32661 + }, + { + "epoch": 2.027562232292507, + "grad_norm": 0.1552460244409963, + "learning_rate": 2.873443574994529e-05, + "loss": 2.7981, + "step": 32662 + }, + { + "epoch": 2.027624309392265, + "grad_norm": 0.1534691531576657, + "learning_rate": 2.8731167205821373e-05, + "loss": 2.7531, + "step": 32663 + }, + { + "epoch": 2.027686386492023, + "grad_norm": 0.17942773660135775, + "learning_rate": 2.872789877266213e-05, + "loss": 2.5798, + "step": 32664 + }, + { + "epoch": 2.027748463591781, + "grad_norm": 0.14894878022253732, + "learning_rate": 2.872463045048458e-05, + "loss": 2.7826, + "step": 32665 + }, + { + "epoch": 2.0278105406915388, + "grad_norm": 0.14872149229356935, + "learning_rate": 2.8721362239305817e-05, + "loss": 2.7465, + "step": 32666 + }, + { + "epoch": 2.0278726177912967, + "grad_norm": 0.1640804655592716, + "learning_rate": 2.871809413914287e-05, + "loss": 2.6483, + "step": 32667 + }, + { + "epoch": 2.0279346948910546, + "grad_norm": 0.1580913172155964, + "learning_rate": 2.871482615001279e-05, + "loss": 2.7081, + "step": 32668 + }, + { + "epoch": 2.0279967719908125, + "grad_norm": 0.15056576995774543, + "learning_rate": 2.8711558271932616e-05, + "loss": 2.6317, + "step": 32669 + }, + { + "epoch": 2.0280588490905704, + "grad_norm": 0.14743397320915164, + "learning_rate": 2.8708290504919427e-05, + "loss": 2.7636, + "step": 32670 + }, + { + "epoch": 2.0281209261903284, + "grad_norm": 0.17138242580330024, + "learning_rate": 2.8705022848990258e-05, + "loss": 2.7246, + "step": 32671 + }, + { + "epoch": 2.0281830032900863, + "grad_norm": 0.14680746686889806, + "learning_rate": 2.870175530416215e-05, + "loss": 2.7113, + "step": 32672 + }, + { + "epoch": 2.028245080389844, + "grad_norm": 0.14520539469142135, + "learning_rate": 2.8698487870452156e-05, + "loss": 2.6755, + "step": 32673 + }, + { + "epoch": 2.028307157489602, + "grad_norm": 0.14894198345162626, + "learning_rate": 2.86952205478773e-05, + "loss": 2.7374, + "step": 32674 + }, + { + "epoch": 2.02836923458936, + "grad_norm": 0.14762107414989822, + "learning_rate": 2.869195333645468e-05, + "loss": 2.6699, + "step": 32675 + }, + { + "epoch": 2.028431311689118, + "grad_norm": 0.1418393342103342, + "learning_rate": 2.86886862362013e-05, + "loss": 2.7495, + "step": 32676 + }, + { + "epoch": 2.028493388788876, + "grad_norm": 0.13816414146863273, + "learning_rate": 2.8685419247134226e-05, + "loss": 2.7257, + "step": 32677 + }, + { + "epoch": 2.028555465888634, + "grad_norm": 0.150536936643231, + "learning_rate": 2.8682152369270464e-05, + "loss": 2.7911, + "step": 32678 + }, + { + "epoch": 2.0286175429883917, + "grad_norm": 0.15142402197205668, + "learning_rate": 2.867888560262712e-05, + "loss": 2.794, + "step": 32679 + }, + { + "epoch": 2.0286796200881496, + "grad_norm": 0.14443721805381576, + "learning_rate": 2.86756189472212e-05, + "loss": 2.7794, + "step": 32680 + }, + { + "epoch": 2.0287416971879075, + "grad_norm": 0.15067027460197935, + "learning_rate": 2.867235240306975e-05, + "loss": 2.7477, + "step": 32681 + }, + { + "epoch": 2.0288037742876655, + "grad_norm": 0.1569638871661067, + "learning_rate": 2.8669085970189825e-05, + "loss": 2.7663, + "step": 32682 + }, + { + "epoch": 2.0288658513874234, + "grad_norm": 0.1389239196674071, + "learning_rate": 2.8665819648598425e-05, + "loss": 2.706, + "step": 32683 + }, + { + "epoch": 2.0289279284871813, + "grad_norm": 0.15773105242792732, + "learning_rate": 2.866255343831265e-05, + "loss": 2.6937, + "step": 32684 + }, + { + "epoch": 2.0289900055869388, + "grad_norm": 0.1471618689049393, + "learning_rate": 2.865928733934951e-05, + "loss": 2.7877, + "step": 32685 + }, + { + "epoch": 2.0290520826866967, + "grad_norm": 0.14972886102406435, + "learning_rate": 2.865602135172605e-05, + "loss": 2.7475, + "step": 32686 + }, + { + "epoch": 2.0291141597864546, + "grad_norm": 0.15250733957842164, + "learning_rate": 2.8652755475459293e-05, + "loss": 2.7443, + "step": 32687 + }, + { + "epoch": 2.0291762368862125, + "grad_norm": 0.1685479513084283, + "learning_rate": 2.86494897105663e-05, + "loss": 2.6874, + "step": 32688 + }, + { + "epoch": 2.0292383139859704, + "grad_norm": 0.14885069408152413, + "learning_rate": 2.8646224057064097e-05, + "loss": 2.6132, + "step": 32689 + }, + { + "epoch": 2.0293003910857284, + "grad_norm": 0.15378779921668065, + "learning_rate": 2.8642958514969735e-05, + "loss": 2.74, + "step": 32690 + }, + { + "epoch": 2.0293624681854863, + "grad_norm": 0.16937300398484514, + "learning_rate": 2.863969308430025e-05, + "loss": 2.7345, + "step": 32691 + }, + { + "epoch": 2.029424545285244, + "grad_norm": 0.1429600955344842, + "learning_rate": 2.863642776507265e-05, + "loss": 2.7749, + "step": 32692 + }, + { + "epoch": 2.029486622385002, + "grad_norm": 0.14878864540648543, + "learning_rate": 2.863316255730401e-05, + "loss": 2.7739, + "step": 32693 + }, + { + "epoch": 2.02954869948476, + "grad_norm": 0.1771266629464716, + "learning_rate": 2.8629897461011353e-05, + "loss": 2.7205, + "step": 32694 + }, + { + "epoch": 2.029610776584518, + "grad_norm": 0.15430342986271461, + "learning_rate": 2.86266324762117e-05, + "loss": 2.7209, + "step": 32695 + }, + { + "epoch": 2.029672853684276, + "grad_norm": 0.1558456314452189, + "learning_rate": 2.86233676029221e-05, + "loss": 2.6991, + "step": 32696 + }, + { + "epoch": 2.0297349307840338, + "grad_norm": 0.1506945400254825, + "learning_rate": 2.862010284115957e-05, + "loss": 2.7097, + "step": 32697 + }, + { + "epoch": 2.0297970078837917, + "grad_norm": 0.13833027390918282, + "learning_rate": 2.861683819094116e-05, + "loss": 2.7266, + "step": 32698 + }, + { + "epoch": 2.0298590849835496, + "grad_norm": 0.1439968811271095, + "learning_rate": 2.8613573652283897e-05, + "loss": 2.7419, + "step": 32699 + }, + { + "epoch": 2.0299211620833075, + "grad_norm": 0.1397872181860645, + "learning_rate": 2.8610309225204818e-05, + "loss": 2.7547, + "step": 32700 + }, + { + "epoch": 2.0299832391830654, + "grad_norm": 0.148712781329056, + "learning_rate": 2.8607044909720927e-05, + "loss": 2.7591, + "step": 32701 + }, + { + "epoch": 2.0300453162828234, + "grad_norm": 0.17798892855434953, + "learning_rate": 2.8603780705849293e-05, + "loss": 2.6836, + "step": 32702 + }, + { + "epoch": 2.0301073933825813, + "grad_norm": 0.1388580120719881, + "learning_rate": 2.8600516613606932e-05, + "loss": 2.7101, + "step": 32703 + }, + { + "epoch": 2.030169470482339, + "grad_norm": 0.1544919109221441, + "learning_rate": 2.8597252633010863e-05, + "loss": 2.6883, + "step": 32704 + }, + { + "epoch": 2.030231547582097, + "grad_norm": 0.17618565135471767, + "learning_rate": 2.859398876407811e-05, + "loss": 2.7472, + "step": 32705 + }, + { + "epoch": 2.030293624681855, + "grad_norm": 0.14361174740459237, + "learning_rate": 2.8590725006825736e-05, + "loss": 2.6907, + "step": 32706 + }, + { + "epoch": 2.030355701781613, + "grad_norm": 0.14500044366341164, + "learning_rate": 2.858746136127074e-05, + "loss": 2.7047, + "step": 32707 + }, + { + "epoch": 2.030417778881371, + "grad_norm": 0.1625484758753124, + "learning_rate": 2.8584197827430164e-05, + "loss": 2.7014, + "step": 32708 + }, + { + "epoch": 2.0304798559811283, + "grad_norm": 0.14629624047386716, + "learning_rate": 2.8580934405321026e-05, + "loss": 2.7935, + "step": 32709 + }, + { + "epoch": 2.0305419330808863, + "grad_norm": 0.16358090287401536, + "learning_rate": 2.8577671094960325e-05, + "loss": 2.6874, + "step": 32710 + }, + { + "epoch": 2.030604010180644, + "grad_norm": 0.1536419283376032, + "learning_rate": 2.857440789636514e-05, + "loss": 2.7042, + "step": 32711 + }, + { + "epoch": 2.030666087280402, + "grad_norm": 0.15717367502962837, + "learning_rate": 2.857114480955247e-05, + "loss": 2.7578, + "step": 32712 + }, + { + "epoch": 2.03072816438016, + "grad_norm": 0.14445512287456244, + "learning_rate": 2.856788183453934e-05, + "loss": 2.694, + "step": 32713 + }, + { + "epoch": 2.030790241479918, + "grad_norm": 0.15041175297542345, + "learning_rate": 2.856461897134275e-05, + "loss": 2.7805, + "step": 32714 + }, + { + "epoch": 2.030852318579676, + "grad_norm": 0.14523892231881794, + "learning_rate": 2.8561356219979773e-05, + "loss": 2.766, + "step": 32715 + }, + { + "epoch": 2.0309143956794338, + "grad_norm": 0.17153238697547374, + "learning_rate": 2.8558093580467397e-05, + "loss": 2.7194, + "step": 32716 + }, + { + "epoch": 2.0309764727791917, + "grad_norm": 0.14627208585745738, + "learning_rate": 2.8554831052822656e-05, + "loss": 2.7422, + "step": 32717 + }, + { + "epoch": 2.0310385498789496, + "grad_norm": 0.14580553434152901, + "learning_rate": 2.8551568637062565e-05, + "loss": 2.6995, + "step": 32718 + }, + { + "epoch": 2.0311006269787075, + "grad_norm": 0.14025920738376954, + "learning_rate": 2.8548306333204132e-05, + "loss": 2.6588, + "step": 32719 + }, + { + "epoch": 2.0311627040784654, + "grad_norm": 0.16915728660962145, + "learning_rate": 2.854504414126441e-05, + "loss": 2.735, + "step": 32720 + }, + { + "epoch": 2.0312247811782234, + "grad_norm": 0.20285101899489813, + "learning_rate": 2.854178206126038e-05, + "loss": 2.7031, + "step": 32721 + }, + { + "epoch": 2.0312868582779813, + "grad_norm": 0.16483004160464748, + "learning_rate": 2.8538520093209104e-05, + "loss": 2.7359, + "step": 32722 + }, + { + "epoch": 2.031348935377739, + "grad_norm": 0.162510897426906, + "learning_rate": 2.8535258237127578e-05, + "loss": 2.7621, + "step": 32723 + }, + { + "epoch": 2.031411012477497, + "grad_norm": 0.15569818330953214, + "learning_rate": 2.85319964930328e-05, + "loss": 2.6649, + "step": 32724 + }, + { + "epoch": 2.031473089577255, + "grad_norm": 0.1396031339560034, + "learning_rate": 2.852873486094183e-05, + "loss": 2.6659, + "step": 32725 + }, + { + "epoch": 2.031535166677013, + "grad_norm": 0.15344454961808213, + "learning_rate": 2.8525473340871655e-05, + "loss": 2.7534, + "step": 32726 + }, + { + "epoch": 2.031597243776771, + "grad_norm": 0.1711022877849153, + "learning_rate": 2.8522211932839305e-05, + "loss": 2.6931, + "step": 32727 + }, + { + "epoch": 2.031659320876529, + "grad_norm": 0.16605625777308622, + "learning_rate": 2.8518950636861775e-05, + "loss": 2.7704, + "step": 32728 + }, + { + "epoch": 2.0317213979762867, + "grad_norm": 0.14732001056752475, + "learning_rate": 2.851568945295611e-05, + "loss": 2.7376, + "step": 32729 + }, + { + "epoch": 2.0317834750760446, + "grad_norm": 0.19143803001801205, + "learning_rate": 2.85124283811393e-05, + "loss": 2.7098, + "step": 32730 + }, + { + "epoch": 2.0318455521758025, + "grad_norm": 0.1603279705093334, + "learning_rate": 2.8509167421428373e-05, + "loss": 2.8221, + "step": 32731 + }, + { + "epoch": 2.0319076292755605, + "grad_norm": 0.1518213411601673, + "learning_rate": 2.8505906573840336e-05, + "loss": 2.7305, + "step": 32732 + }, + { + "epoch": 2.031969706375318, + "grad_norm": 0.1521865877695962, + "learning_rate": 2.8502645838392185e-05, + "loss": 2.8276, + "step": 32733 + }, + { + "epoch": 2.032031783475076, + "grad_norm": 0.1467340122405678, + "learning_rate": 2.849938521510097e-05, + "loss": 2.8116, + "step": 32734 + }, + { + "epoch": 2.0320938605748338, + "grad_norm": 0.15994212909832808, + "learning_rate": 2.8496124703983677e-05, + "loss": 2.6701, + "step": 32735 + }, + { + "epoch": 2.0321559376745917, + "grad_norm": 0.16592511907643362, + "learning_rate": 2.8492864305057325e-05, + "loss": 2.6993, + "step": 32736 + }, + { + "epoch": 2.0322180147743496, + "grad_norm": 0.14023944532106208, + "learning_rate": 2.84896040183389e-05, + "loss": 2.6525, + "step": 32737 + }, + { + "epoch": 2.0322800918741075, + "grad_norm": 0.13588940575333308, + "learning_rate": 2.8486343843845453e-05, + "loss": 2.7076, + "step": 32738 + }, + { + "epoch": 2.0323421689738654, + "grad_norm": 0.1554489044002226, + "learning_rate": 2.848308378159398e-05, + "loss": 2.7433, + "step": 32739 + }, + { + "epoch": 2.0324042460736234, + "grad_norm": 0.15076174090973737, + "learning_rate": 2.847982383160147e-05, + "loss": 2.7421, + "step": 32740 + }, + { + "epoch": 2.0324663231733813, + "grad_norm": 0.162829820458466, + "learning_rate": 2.847656399388493e-05, + "loss": 2.7049, + "step": 32741 + }, + { + "epoch": 2.032528400273139, + "grad_norm": 0.17731967165995408, + "learning_rate": 2.84733042684614e-05, + "loss": 2.7419, + "step": 32742 + }, + { + "epoch": 2.032590477372897, + "grad_norm": 0.152489909948272, + "learning_rate": 2.8470044655347867e-05, + "loss": 2.6945, + "step": 32743 + }, + { + "epoch": 2.032652554472655, + "grad_norm": 0.1412051604633832, + "learning_rate": 2.8466785154561336e-05, + "loss": 2.7313, + "step": 32744 + }, + { + "epoch": 2.032714631572413, + "grad_norm": 0.16857578127917724, + "learning_rate": 2.8463525766118815e-05, + "loss": 2.6324, + "step": 32745 + }, + { + "epoch": 2.032776708672171, + "grad_norm": 0.1854181445434336, + "learning_rate": 2.846026649003729e-05, + "loss": 2.7515, + "step": 32746 + }, + { + "epoch": 2.032838785771929, + "grad_norm": 0.16386626633213552, + "learning_rate": 2.84570073263338e-05, + "loss": 2.7142, + "step": 32747 + }, + { + "epoch": 2.0329008628716867, + "grad_norm": 0.1438348074235594, + "learning_rate": 2.845374827502534e-05, + "loss": 2.7348, + "step": 32748 + }, + { + "epoch": 2.0329629399714446, + "grad_norm": 0.15277786635687698, + "learning_rate": 2.8450489336128892e-05, + "loss": 2.6948, + "step": 32749 + }, + { + "epoch": 2.0330250170712025, + "grad_norm": 0.14437173082110025, + "learning_rate": 2.8447230509661466e-05, + "loss": 2.6887, + "step": 32750 + }, + { + "epoch": 2.0330870941709605, + "grad_norm": 0.16456985929016105, + "learning_rate": 2.844397179564009e-05, + "loss": 2.7923, + "step": 32751 + }, + { + "epoch": 2.0331491712707184, + "grad_norm": 0.15989617421199817, + "learning_rate": 2.8440713194081736e-05, + "loss": 2.7158, + "step": 32752 + }, + { + "epoch": 2.0332112483704763, + "grad_norm": 0.20185389229759143, + "learning_rate": 2.843745470500342e-05, + "loss": 2.641, + "step": 32753 + }, + { + "epoch": 2.033273325470234, + "grad_norm": 0.13906977880909208, + "learning_rate": 2.843419632842212e-05, + "loss": 2.648, + "step": 32754 + }, + { + "epoch": 2.033335402569992, + "grad_norm": 0.14013175423896657, + "learning_rate": 2.843093806435485e-05, + "loss": 2.7557, + "step": 32755 + }, + { + "epoch": 2.03339747966975, + "grad_norm": 0.18358095611472378, + "learning_rate": 2.842767991281864e-05, + "loss": 2.7493, + "step": 32756 + }, + { + "epoch": 2.0334595567695075, + "grad_norm": 0.1466614098469277, + "learning_rate": 2.8424421873830447e-05, + "loss": 2.7691, + "step": 32757 + }, + { + "epoch": 2.0335216338692654, + "grad_norm": 0.1775460555542709, + "learning_rate": 2.8421163947407294e-05, + "loss": 2.6486, + "step": 32758 + }, + { + "epoch": 2.0335837109690234, + "grad_norm": 0.15047608461656162, + "learning_rate": 2.8417906133566163e-05, + "loss": 2.7457, + "step": 32759 + }, + { + "epoch": 2.0336457880687813, + "grad_norm": 0.17841763217102943, + "learning_rate": 2.8414648432324033e-05, + "loss": 2.6992, + "step": 32760 + }, + { + "epoch": 2.033707865168539, + "grad_norm": 0.16310641451444835, + "learning_rate": 2.8411390843697944e-05, + "loss": 2.6894, + "step": 32761 + }, + { + "epoch": 2.033769942268297, + "grad_norm": 0.1738018273166977, + "learning_rate": 2.840813336770487e-05, + "loss": 2.7843, + "step": 32762 + }, + { + "epoch": 2.033832019368055, + "grad_norm": 0.1561154501478081, + "learning_rate": 2.8404876004361804e-05, + "loss": 2.6736, + "step": 32763 + }, + { + "epoch": 2.033894096467813, + "grad_norm": 0.14828821882387064, + "learning_rate": 2.840161875368572e-05, + "loss": 2.7863, + "step": 32764 + }, + { + "epoch": 2.033956173567571, + "grad_norm": 0.1722889542731316, + "learning_rate": 2.839836161569366e-05, + "loss": 2.7715, + "step": 32765 + }, + { + "epoch": 2.0340182506673288, + "grad_norm": 0.2304260569970463, + "learning_rate": 2.8395104590402587e-05, + "loss": 2.7171, + "step": 32766 + }, + { + "epoch": 2.0340803277670867, + "grad_norm": 0.18762814593235033, + "learning_rate": 2.8391847677829507e-05, + "loss": 2.6737, + "step": 32767 + }, + { + "epoch": 2.0341424048668446, + "grad_norm": 0.1468647468823111, + "learning_rate": 2.8388590877991396e-05, + "loss": 2.6502, + "step": 32768 + }, + { + "epoch": 2.0342044819666025, + "grad_norm": 0.142851319577535, + "learning_rate": 2.838533419090523e-05, + "loss": 2.6766, + "step": 32769 + }, + { + "epoch": 2.0342665590663604, + "grad_norm": 0.18075630307650364, + "learning_rate": 2.838207761658805e-05, + "loss": 2.6726, + "step": 32770 + }, + { + "epoch": 2.0343286361661184, + "grad_norm": 0.1536144605762297, + "learning_rate": 2.8378821155056812e-05, + "loss": 2.7334, + "step": 32771 + }, + { + "epoch": 2.0343907132658763, + "grad_norm": 0.1818957741497367, + "learning_rate": 2.8375564806328513e-05, + "loss": 2.6619, + "step": 32772 + }, + { + "epoch": 2.034452790365634, + "grad_norm": 0.17333038196798223, + "learning_rate": 2.837230857042013e-05, + "loss": 2.6717, + "step": 32773 + }, + { + "epoch": 2.034514867465392, + "grad_norm": 0.16199642290843227, + "learning_rate": 2.8369052447348675e-05, + "loss": 2.6943, + "step": 32774 + }, + { + "epoch": 2.03457694456515, + "grad_norm": 0.14659748796451677, + "learning_rate": 2.836579643713113e-05, + "loss": 2.6293, + "step": 32775 + }, + { + "epoch": 2.034639021664908, + "grad_norm": 0.1711328207110464, + "learning_rate": 2.8362540539784476e-05, + "loss": 2.7187, + "step": 32776 + }, + { + "epoch": 2.034701098764666, + "grad_norm": 0.15894143581467451, + "learning_rate": 2.8359284755325677e-05, + "loss": 2.7453, + "step": 32777 + }, + { + "epoch": 2.034763175864424, + "grad_norm": 0.1718695758917372, + "learning_rate": 2.835602908377177e-05, + "loss": 2.7418, + "step": 32778 + }, + { + "epoch": 2.0348252529641817, + "grad_norm": 0.1685845928692715, + "learning_rate": 2.835277352513971e-05, + "loss": 2.7018, + "step": 32779 + }, + { + "epoch": 2.0348873300639396, + "grad_norm": 0.13989581358997757, + "learning_rate": 2.834951807944648e-05, + "loss": 2.7312, + "step": 32780 + }, + { + "epoch": 2.034949407163697, + "grad_norm": 0.15042604301830262, + "learning_rate": 2.8346262746709073e-05, + "loss": 2.7435, + "step": 32781 + }, + { + "epoch": 2.035011484263455, + "grad_norm": 0.1622840962390849, + "learning_rate": 2.8343007526944454e-05, + "loss": 2.7402, + "step": 32782 + }, + { + "epoch": 2.035073561363213, + "grad_norm": 0.13862917471009473, + "learning_rate": 2.833975242016964e-05, + "loss": 2.744, + "step": 32783 + }, + { + "epoch": 2.035135638462971, + "grad_norm": 0.1512698198762705, + "learning_rate": 2.8336497426401594e-05, + "loss": 2.7153, + "step": 32784 + }, + { + "epoch": 2.0351977155627288, + "grad_norm": 0.20716423585869148, + "learning_rate": 2.83332425456573e-05, + "loss": 2.7961, + "step": 32785 + }, + { + "epoch": 2.0352597926624867, + "grad_norm": 0.15491935255658573, + "learning_rate": 2.832998777795372e-05, + "loss": 2.7688, + "step": 32786 + }, + { + "epoch": 2.0353218697622446, + "grad_norm": 0.17297669100430363, + "learning_rate": 2.832673312330786e-05, + "loss": 2.7467, + "step": 32787 + }, + { + "epoch": 2.0353839468620025, + "grad_norm": 0.14495872095994072, + "learning_rate": 2.8323478581736706e-05, + "loss": 2.7113, + "step": 32788 + }, + { + "epoch": 2.0354460239617604, + "grad_norm": 0.16202130280717258, + "learning_rate": 2.832022415325723e-05, + "loss": 2.6823, + "step": 32789 + }, + { + "epoch": 2.0355081010615184, + "grad_norm": 0.15068997278759613, + "learning_rate": 2.831696983788641e-05, + "loss": 2.7349, + "step": 32790 + }, + { + "epoch": 2.0355701781612763, + "grad_norm": 0.17681777316455974, + "learning_rate": 2.83137156356412e-05, + "loss": 2.7035, + "step": 32791 + }, + { + "epoch": 2.035632255261034, + "grad_norm": 0.21093487737932615, + "learning_rate": 2.8310461546538613e-05, + "loss": 2.7252, + "step": 32792 + }, + { + "epoch": 2.035694332360792, + "grad_norm": 0.1457143891712998, + "learning_rate": 2.830720757059563e-05, + "loss": 2.6425, + "step": 32793 + }, + { + "epoch": 2.03575640946055, + "grad_norm": 0.17846801799142453, + "learning_rate": 2.8303953707829195e-05, + "loss": 2.7361, + "step": 32794 + }, + { + "epoch": 2.035818486560308, + "grad_norm": 0.15073744305501158, + "learning_rate": 2.8300699958256304e-05, + "loss": 2.7318, + "step": 32795 + }, + { + "epoch": 2.035880563660066, + "grad_norm": 0.15533773422564287, + "learning_rate": 2.8297446321893916e-05, + "loss": 2.7311, + "step": 32796 + }, + { + "epoch": 2.035942640759824, + "grad_norm": 0.1916764842936307, + "learning_rate": 2.8294192798759034e-05, + "loss": 2.7189, + "step": 32797 + }, + { + "epoch": 2.0360047178595817, + "grad_norm": 0.15146151649072037, + "learning_rate": 2.8290939388868622e-05, + "loss": 2.6616, + "step": 32798 + }, + { + "epoch": 2.0360667949593396, + "grad_norm": 0.16420064514201269, + "learning_rate": 2.828768609223964e-05, + "loss": 2.8367, + "step": 32799 + }, + { + "epoch": 2.0361288720590975, + "grad_norm": 0.16289179204618778, + "learning_rate": 2.8284432908889057e-05, + "loss": 2.7904, + "step": 32800 + }, + { + "epoch": 2.0361909491588555, + "grad_norm": 0.13764216702805976, + "learning_rate": 2.8281179838833884e-05, + "loss": 2.6848, + "step": 32801 + }, + { + "epoch": 2.0362530262586134, + "grad_norm": 0.15632014488239063, + "learning_rate": 2.8277926882091055e-05, + "loss": 2.668, + "step": 32802 + }, + { + "epoch": 2.0363151033583713, + "grad_norm": 0.15453445878665423, + "learning_rate": 2.8274674038677562e-05, + "loss": 2.6175, + "step": 32803 + }, + { + "epoch": 2.036377180458129, + "grad_norm": 0.2893453708911868, + "learning_rate": 2.827142130861037e-05, + "loss": 2.8027, + "step": 32804 + }, + { + "epoch": 2.0364392575578867, + "grad_norm": 0.1406129063598271, + "learning_rate": 2.8268168691906426e-05, + "loss": 2.5792, + "step": 32805 + }, + { + "epoch": 2.0365013346576446, + "grad_norm": 0.16888310825286534, + "learning_rate": 2.8264916188582746e-05, + "loss": 2.7798, + "step": 32806 + }, + { + "epoch": 2.0365634117574025, + "grad_norm": 0.2610322146494361, + "learning_rate": 2.826166379865627e-05, + "loss": 2.7869, + "step": 32807 + }, + { + "epoch": 2.0366254888571604, + "grad_norm": 0.19053860019610705, + "learning_rate": 2.8258411522143967e-05, + "loss": 2.6728, + "step": 32808 + }, + { + "epoch": 2.0366875659569184, + "grad_norm": 0.16508260976293254, + "learning_rate": 2.82551593590628e-05, + "loss": 2.6638, + "step": 32809 + }, + { + "epoch": 2.0367496430566763, + "grad_norm": 0.1808166890204666, + "learning_rate": 2.825190730942976e-05, + "loss": 2.6755, + "step": 32810 + }, + { + "epoch": 2.036811720156434, + "grad_norm": 0.16567335749884515, + "learning_rate": 2.8248655373261788e-05, + "loss": 2.7691, + "step": 32811 + }, + { + "epoch": 2.036873797256192, + "grad_norm": 0.16670828103549573, + "learning_rate": 2.824540355057588e-05, + "loss": 2.6111, + "step": 32812 + }, + { + "epoch": 2.03693587435595, + "grad_norm": 0.20034727516280132, + "learning_rate": 2.8242151841388955e-05, + "loss": 2.793, + "step": 32813 + }, + { + "epoch": 2.036997951455708, + "grad_norm": 0.18445201897683386, + "learning_rate": 2.8238900245718025e-05, + "loss": 2.6899, + "step": 32814 + }, + { + "epoch": 2.037060028555466, + "grad_norm": 0.15861285686494864, + "learning_rate": 2.823564876358003e-05, + "loss": 2.7897, + "step": 32815 + }, + { + "epoch": 2.037122105655224, + "grad_norm": 0.14594438842918703, + "learning_rate": 2.8232397394991944e-05, + "loss": 2.7276, + "step": 32816 + }, + { + "epoch": 2.0371841827549817, + "grad_norm": 0.14355614719816506, + "learning_rate": 2.822914613997073e-05, + "loss": 2.7535, + "step": 32817 + }, + { + "epoch": 2.0372462598547396, + "grad_norm": 0.1758150915842431, + "learning_rate": 2.8225894998533318e-05, + "loss": 2.7131, + "step": 32818 + }, + { + "epoch": 2.0373083369544975, + "grad_norm": 0.22305802853517706, + "learning_rate": 2.8222643970696716e-05, + "loss": 2.7394, + "step": 32819 + }, + { + "epoch": 2.0373704140542555, + "grad_norm": 0.19224684892138114, + "learning_rate": 2.8219393056477857e-05, + "loss": 2.8157, + "step": 32820 + }, + { + "epoch": 2.0374324911540134, + "grad_norm": 0.1441376068088704, + "learning_rate": 2.8216142255893723e-05, + "loss": 2.7063, + "step": 32821 + }, + { + "epoch": 2.0374945682537713, + "grad_norm": 0.1706937108619223, + "learning_rate": 2.8212891568961264e-05, + "loss": 2.7923, + "step": 32822 + }, + { + "epoch": 2.037556645353529, + "grad_norm": 0.2645778632286893, + "learning_rate": 2.8209640995697422e-05, + "loss": 2.7216, + "step": 32823 + }, + { + "epoch": 2.037618722453287, + "grad_norm": 0.1771850796186045, + "learning_rate": 2.820639053611919e-05, + "loss": 2.7642, + "step": 32824 + }, + { + "epoch": 2.037680799553045, + "grad_norm": 0.19188957110878826, + "learning_rate": 2.8203140190243516e-05, + "loss": 2.668, + "step": 32825 + }, + { + "epoch": 2.037742876652803, + "grad_norm": 0.1552906866509244, + "learning_rate": 2.8199889958087343e-05, + "loss": 2.8192, + "step": 32826 + }, + { + "epoch": 2.037804953752561, + "grad_norm": 0.1532829506417132, + "learning_rate": 2.819663983966762e-05, + "loss": 2.6738, + "step": 32827 + }, + { + "epoch": 2.037867030852319, + "grad_norm": 0.21570084143358653, + "learning_rate": 2.819338983500134e-05, + "loss": 2.729, + "step": 32828 + }, + { + "epoch": 2.0379291079520763, + "grad_norm": 0.16843125519296517, + "learning_rate": 2.8190139944105437e-05, + "loss": 2.7293, + "step": 32829 + }, + { + "epoch": 2.037991185051834, + "grad_norm": 0.15145653579525015, + "learning_rate": 2.818689016699687e-05, + "loss": 2.8157, + "step": 32830 + }, + { + "epoch": 2.038053262151592, + "grad_norm": 0.1984729081950944, + "learning_rate": 2.818364050369259e-05, + "loss": 2.6271, + "step": 32831 + }, + { + "epoch": 2.03811533925135, + "grad_norm": 0.15049701494523174, + "learning_rate": 2.8180390954209535e-05, + "loss": 2.7317, + "step": 32832 + }, + { + "epoch": 2.038177416351108, + "grad_norm": 0.1499846105330301, + "learning_rate": 2.8177141518564697e-05, + "loss": 2.7364, + "step": 32833 + }, + { + "epoch": 2.038239493450866, + "grad_norm": 0.15542180991696347, + "learning_rate": 2.817389219677501e-05, + "loss": 2.7926, + "step": 32834 + }, + { + "epoch": 2.0383015705506238, + "grad_norm": 0.13593772142764945, + "learning_rate": 2.8170642988857433e-05, + "loss": 2.7365, + "step": 32835 + }, + { + "epoch": 2.0383636476503817, + "grad_norm": 0.1508305913608324, + "learning_rate": 2.8167393894828877e-05, + "loss": 2.7169, + "step": 32836 + }, + { + "epoch": 2.0384257247501396, + "grad_norm": 0.19603856552669727, + "learning_rate": 2.8164144914706357e-05, + "loss": 2.7131, + "step": 32837 + }, + { + "epoch": 2.0384878018498975, + "grad_norm": 0.1659247935273991, + "learning_rate": 2.816089604850679e-05, + "loss": 2.6588, + "step": 32838 + }, + { + "epoch": 2.0385498789496554, + "grad_norm": 0.15687785366396056, + "learning_rate": 2.8157647296247132e-05, + "loss": 2.6272, + "step": 32839 + }, + { + "epoch": 2.0386119560494134, + "grad_norm": 0.14906105401679326, + "learning_rate": 2.8154398657944304e-05, + "loss": 2.7455, + "step": 32840 + }, + { + "epoch": 2.0386740331491713, + "grad_norm": 0.1401640370288509, + "learning_rate": 2.8151150133615313e-05, + "loss": 2.6183, + "step": 32841 + }, + { + "epoch": 2.038736110248929, + "grad_norm": 0.19419734172418437, + "learning_rate": 2.8147901723277064e-05, + "loss": 2.7964, + "step": 32842 + }, + { + "epoch": 2.038798187348687, + "grad_norm": 0.1800754236610351, + "learning_rate": 2.814465342694652e-05, + "loss": 2.7911, + "step": 32843 + }, + { + "epoch": 2.038860264448445, + "grad_norm": 0.1789547762796685, + "learning_rate": 2.814140524464063e-05, + "loss": 2.7893, + "step": 32844 + }, + { + "epoch": 2.038922341548203, + "grad_norm": 0.14759826581283153, + "learning_rate": 2.8138157176376312e-05, + "loss": 2.711, + "step": 32845 + }, + { + "epoch": 2.038984418647961, + "grad_norm": 0.1457389110052721, + "learning_rate": 2.8134909222170554e-05, + "loss": 2.772, + "step": 32846 + }, + { + "epoch": 2.039046495747719, + "grad_norm": 0.1486020619066279, + "learning_rate": 2.813166138204028e-05, + "loss": 2.7929, + "step": 32847 + }, + { + "epoch": 2.0391085728474767, + "grad_norm": 0.14820984272670398, + "learning_rate": 2.812841365600244e-05, + "loss": 2.6211, + "step": 32848 + }, + { + "epoch": 2.0391706499472346, + "grad_norm": 0.14514398853594665, + "learning_rate": 2.8125166044073957e-05, + "loss": 2.7074, + "step": 32849 + }, + { + "epoch": 2.0392327270469925, + "grad_norm": 0.15326261953509915, + "learning_rate": 2.8121918546271814e-05, + "loss": 2.7175, + "step": 32850 + }, + { + "epoch": 2.0392948041467505, + "grad_norm": 0.14359582584030456, + "learning_rate": 2.811867116261293e-05, + "loss": 2.7007, + "step": 32851 + }, + { + "epoch": 2.0393568812465084, + "grad_norm": 0.1485062929158277, + "learning_rate": 2.8115423893114235e-05, + "loss": 2.7709, + "step": 32852 + }, + { + "epoch": 2.039418958346266, + "grad_norm": 0.14709889259611997, + "learning_rate": 2.81121767377927e-05, + "loss": 2.7339, + "step": 32853 + }, + { + "epoch": 2.0394810354460238, + "grad_norm": 0.14076817694201293, + "learning_rate": 2.8108929696665253e-05, + "loss": 2.7872, + "step": 32854 + }, + { + "epoch": 2.0395431125457817, + "grad_norm": 0.14425660793690906, + "learning_rate": 2.810568276974882e-05, + "loss": 2.7071, + "step": 32855 + }, + { + "epoch": 2.0396051896455396, + "grad_norm": 0.13577654973093717, + "learning_rate": 2.810243595706037e-05, + "loss": 2.6724, + "step": 32856 + }, + { + "epoch": 2.0396672667452975, + "grad_norm": 0.15111469621650384, + "learning_rate": 2.8099189258616825e-05, + "loss": 2.7515, + "step": 32857 + }, + { + "epoch": 2.0397293438450554, + "grad_norm": 0.17337530068974455, + "learning_rate": 2.8095942674435132e-05, + "loss": 2.7326, + "step": 32858 + }, + { + "epoch": 2.0397914209448134, + "grad_norm": 0.1418541595512019, + "learning_rate": 2.8092696204532197e-05, + "loss": 2.6475, + "step": 32859 + }, + { + "epoch": 2.0398534980445713, + "grad_norm": 0.14367339357960238, + "learning_rate": 2.808944984892501e-05, + "loss": 2.7512, + "step": 32860 + }, + { + "epoch": 2.039915575144329, + "grad_norm": 0.14896194769291252, + "learning_rate": 2.808620360763048e-05, + "loss": 2.6935, + "step": 32861 + }, + { + "epoch": 2.039977652244087, + "grad_norm": 0.13719945594809016, + "learning_rate": 2.808295748066555e-05, + "loss": 2.72, + "step": 32862 + }, + { + "epoch": 2.040039729343845, + "grad_norm": 0.14731421971642422, + "learning_rate": 2.8079711468047125e-05, + "loss": 2.7404, + "step": 32863 + }, + { + "epoch": 2.040101806443603, + "grad_norm": 0.15475136180395582, + "learning_rate": 2.8076465569792188e-05, + "loss": 2.7376, + "step": 32864 + }, + { + "epoch": 2.040163883543361, + "grad_norm": 0.1422633196870736, + "learning_rate": 2.8073219785917653e-05, + "loss": 2.6968, + "step": 32865 + }, + { + "epoch": 2.040225960643119, + "grad_norm": 0.14975356506639942, + "learning_rate": 2.8069974116440457e-05, + "loss": 2.7286, + "step": 32866 + }, + { + "epoch": 2.0402880377428767, + "grad_norm": 0.13892217666023443, + "learning_rate": 2.8066728561377527e-05, + "loss": 2.5647, + "step": 32867 + }, + { + "epoch": 2.0403501148426346, + "grad_norm": 0.1423109438324836, + "learning_rate": 2.8063483120745777e-05, + "loss": 2.7378, + "step": 32868 + }, + { + "epoch": 2.0404121919423925, + "grad_norm": 0.14625738358606655, + "learning_rate": 2.8060237794562184e-05, + "loss": 2.6712, + "step": 32869 + }, + { + "epoch": 2.0404742690421505, + "grad_norm": 0.1756767863237087, + "learning_rate": 2.805699258284365e-05, + "loss": 2.6999, + "step": 32870 + }, + { + "epoch": 2.0405363461419084, + "grad_norm": 0.1461356653680937, + "learning_rate": 2.8053747485607117e-05, + "loss": 2.7114, + "step": 32871 + }, + { + "epoch": 2.0405984232416663, + "grad_norm": 0.14463501504570553, + "learning_rate": 2.8050502502869492e-05, + "loss": 2.7091, + "step": 32872 + }, + { + "epoch": 2.040660500341424, + "grad_norm": 0.1502020322191873, + "learning_rate": 2.8047257634647737e-05, + "loss": 2.7507, + "step": 32873 + }, + { + "epoch": 2.040722577441182, + "grad_norm": 0.1453154719981964, + "learning_rate": 2.8044012880958782e-05, + "loss": 2.7231, + "step": 32874 + }, + { + "epoch": 2.04078465454094, + "grad_norm": 0.16082104084886795, + "learning_rate": 2.8040768241819528e-05, + "loss": 2.7368, + "step": 32875 + }, + { + "epoch": 2.040846731640698, + "grad_norm": 0.1725183821123483, + "learning_rate": 2.8037523717246907e-05, + "loss": 2.7739, + "step": 32876 + }, + { + "epoch": 2.0409088087404554, + "grad_norm": 0.14877595087087905, + "learning_rate": 2.803427930725787e-05, + "loss": 2.7854, + "step": 32877 + }, + { + "epoch": 2.0409708858402134, + "grad_norm": 0.15584526093712234, + "learning_rate": 2.8031035011869333e-05, + "loss": 2.8243, + "step": 32878 + }, + { + "epoch": 2.0410329629399713, + "grad_norm": 0.18270233535563335, + "learning_rate": 2.8027790831098217e-05, + "loss": 2.6972, + "step": 32879 + }, + { + "epoch": 2.041095040039729, + "grad_norm": 0.14641617140847224, + "learning_rate": 2.8024546764961447e-05, + "loss": 2.7056, + "step": 32880 + }, + { + "epoch": 2.041157117139487, + "grad_norm": 0.14928191705426533, + "learning_rate": 2.8021302813475936e-05, + "loss": 2.6977, + "step": 32881 + }, + { + "epoch": 2.041219194239245, + "grad_norm": 0.14355030831644677, + "learning_rate": 2.8018058976658645e-05, + "loss": 2.6839, + "step": 32882 + }, + { + "epoch": 2.041281271339003, + "grad_norm": 0.15315273262465123, + "learning_rate": 2.8014815254526478e-05, + "loss": 2.7798, + "step": 32883 + }, + { + "epoch": 2.041343348438761, + "grad_norm": 0.1626973576777803, + "learning_rate": 2.801157164709635e-05, + "loss": 2.7364, + "step": 32884 + }, + { + "epoch": 2.041405425538519, + "grad_norm": 0.1538418325203578, + "learning_rate": 2.8008328154385176e-05, + "loss": 2.6426, + "step": 32885 + }, + { + "epoch": 2.0414675026382767, + "grad_norm": 0.14374526990488923, + "learning_rate": 2.8005084776409895e-05, + "loss": 2.7158, + "step": 32886 + }, + { + "epoch": 2.0415295797380346, + "grad_norm": 0.14499433536193507, + "learning_rate": 2.8001841513187443e-05, + "loss": 2.6348, + "step": 32887 + }, + { + "epoch": 2.0415916568377925, + "grad_norm": 0.14576098799059098, + "learning_rate": 2.799859836473472e-05, + "loss": 2.6827, + "step": 32888 + }, + { + "epoch": 2.0416537339375505, + "grad_norm": 0.14373100896485766, + "learning_rate": 2.799535533106865e-05, + "loss": 2.7348, + "step": 32889 + }, + { + "epoch": 2.0417158110373084, + "grad_norm": 0.17716825893758079, + "learning_rate": 2.799211241220614e-05, + "loss": 2.8279, + "step": 32890 + }, + { + "epoch": 2.0417778881370663, + "grad_norm": 0.145428616778264, + "learning_rate": 2.7988869608164138e-05, + "loss": 2.6394, + "step": 32891 + }, + { + "epoch": 2.041839965236824, + "grad_norm": 0.15143908362164757, + "learning_rate": 2.7985626918959553e-05, + "loss": 2.7589, + "step": 32892 + }, + { + "epoch": 2.041902042336582, + "grad_norm": 0.14176040159306627, + "learning_rate": 2.798238434460929e-05, + "loss": 2.7478, + "step": 32893 + }, + { + "epoch": 2.04196411943634, + "grad_norm": 0.15519425634470774, + "learning_rate": 2.7979141885130273e-05, + "loss": 2.7288, + "step": 32894 + }, + { + "epoch": 2.042026196536098, + "grad_norm": 0.1439377416537464, + "learning_rate": 2.7975899540539396e-05, + "loss": 2.751, + "step": 32895 + }, + { + "epoch": 2.042088273635856, + "grad_norm": 0.15106543334784958, + "learning_rate": 2.7972657310853624e-05, + "loss": 2.7423, + "step": 32896 + }, + { + "epoch": 2.042150350735614, + "grad_norm": 0.15484969180627342, + "learning_rate": 2.7969415196089843e-05, + "loss": 2.7365, + "step": 32897 + }, + { + "epoch": 2.0422124278353717, + "grad_norm": 0.14049290704980424, + "learning_rate": 2.7966173196264974e-05, + "loss": 2.7344, + "step": 32898 + }, + { + "epoch": 2.0422745049351296, + "grad_norm": 0.17606034374947926, + "learning_rate": 2.7962931311395903e-05, + "loss": 2.799, + "step": 32899 + }, + { + "epoch": 2.0423365820348875, + "grad_norm": 0.1405852513297873, + "learning_rate": 2.795968954149959e-05, + "loss": 2.6664, + "step": 32900 + }, + { + "epoch": 2.042398659134645, + "grad_norm": 0.14509540717866756, + "learning_rate": 2.795644788659293e-05, + "loss": 2.6988, + "step": 32901 + }, + { + "epoch": 2.042460736234403, + "grad_norm": 0.1528674928517162, + "learning_rate": 2.7953206346692833e-05, + "loss": 2.8025, + "step": 32902 + }, + { + "epoch": 2.042522813334161, + "grad_norm": 0.16656098827620766, + "learning_rate": 2.794996492181621e-05, + "loss": 2.6884, + "step": 32903 + }, + { + "epoch": 2.0425848904339188, + "grad_norm": 0.14900758716488413, + "learning_rate": 2.7946723611979953e-05, + "loss": 2.7462, + "step": 32904 + }, + { + "epoch": 2.0426469675336767, + "grad_norm": 0.15916020326781039, + "learning_rate": 2.794348241720101e-05, + "loss": 2.7624, + "step": 32905 + }, + { + "epoch": 2.0427090446334346, + "grad_norm": 0.15140347199701307, + "learning_rate": 2.7940241337496276e-05, + "loss": 2.6548, + "step": 32906 + }, + { + "epoch": 2.0427711217331925, + "grad_norm": 0.20287104466978395, + "learning_rate": 2.793700037288266e-05, + "loss": 2.7152, + "step": 32907 + }, + { + "epoch": 2.0428331988329504, + "grad_norm": 0.15292975010389506, + "learning_rate": 2.7933759523377045e-05, + "loss": 2.7179, + "step": 32908 + }, + { + "epoch": 2.0428952759327084, + "grad_norm": 0.1414457121208422, + "learning_rate": 2.7930518788996386e-05, + "loss": 2.7048, + "step": 32909 + }, + { + "epoch": 2.0429573530324663, + "grad_norm": 0.1468897931278575, + "learning_rate": 2.7927278169757565e-05, + "loss": 2.743, + "step": 32910 + }, + { + "epoch": 2.043019430132224, + "grad_norm": 0.14569422922258912, + "learning_rate": 2.7924037665677495e-05, + "loss": 2.7312, + "step": 32911 + }, + { + "epoch": 2.043081507231982, + "grad_norm": 0.14940222524987878, + "learning_rate": 2.7920797276773058e-05, + "loss": 2.6695, + "step": 32912 + }, + { + "epoch": 2.04314358433174, + "grad_norm": 0.16249379572578473, + "learning_rate": 2.79175570030612e-05, + "loss": 2.6725, + "step": 32913 + }, + { + "epoch": 2.043205661431498, + "grad_norm": 0.1482207635871024, + "learning_rate": 2.7914316844558807e-05, + "loss": 2.7785, + "step": 32914 + }, + { + "epoch": 2.043267738531256, + "grad_norm": 0.14077050575553957, + "learning_rate": 2.7911076801282786e-05, + "loss": 2.7715, + "step": 32915 + }, + { + "epoch": 2.043329815631014, + "grad_norm": 0.15005076388827526, + "learning_rate": 2.790783687325005e-05, + "loss": 2.7185, + "step": 32916 + }, + { + "epoch": 2.0433918927307717, + "grad_norm": 0.14097448472721852, + "learning_rate": 2.7904597060477457e-05, + "loss": 2.7369, + "step": 32917 + }, + { + "epoch": 2.0434539698305296, + "grad_norm": 0.18469927082214724, + "learning_rate": 2.790135736298195e-05, + "loss": 2.6871, + "step": 32918 + }, + { + "epoch": 2.0435160469302875, + "grad_norm": 0.14594118495511987, + "learning_rate": 2.7898117780780452e-05, + "loss": 2.7172, + "step": 32919 + }, + { + "epoch": 2.0435781240300455, + "grad_norm": 0.15225555238832883, + "learning_rate": 2.7894878313889833e-05, + "loss": 2.811, + "step": 32920 + }, + { + "epoch": 2.0436402011298034, + "grad_norm": 0.14056049297913262, + "learning_rate": 2.7891638962327e-05, + "loss": 2.7555, + "step": 32921 + }, + { + "epoch": 2.0437022782295613, + "grad_norm": 0.1591559199276027, + "learning_rate": 2.788839972610884e-05, + "loss": 2.7645, + "step": 32922 + }, + { + "epoch": 2.043764355329319, + "grad_norm": 0.16375634032816558, + "learning_rate": 2.7885160605252274e-05, + "loss": 2.6873, + "step": 32923 + }, + { + "epoch": 2.0438264324290767, + "grad_norm": 0.15737933679805988, + "learning_rate": 2.7881921599774203e-05, + "loss": 2.6931, + "step": 32924 + }, + { + "epoch": 2.0438885095288346, + "grad_norm": 0.15087275816603757, + "learning_rate": 2.7878682709691517e-05, + "loss": 2.7779, + "step": 32925 + }, + { + "epoch": 2.0439505866285925, + "grad_norm": 0.15096058532740098, + "learning_rate": 2.7875443935021085e-05, + "loss": 2.7626, + "step": 32926 + }, + { + "epoch": 2.0440126637283504, + "grad_norm": 0.14541126734998874, + "learning_rate": 2.787220527577986e-05, + "loss": 2.713, + "step": 32927 + }, + { + "epoch": 2.0440747408281084, + "grad_norm": 0.14890002043503275, + "learning_rate": 2.7868966731984715e-05, + "loss": 2.7614, + "step": 32928 + }, + { + "epoch": 2.0441368179278663, + "grad_norm": 0.15013345801439043, + "learning_rate": 2.786572830365254e-05, + "loss": 2.6135, + "step": 32929 + }, + { + "epoch": 2.044198895027624, + "grad_norm": 0.1647075902042067, + "learning_rate": 2.786248999080023e-05, + "loss": 2.7953, + "step": 32930 + }, + { + "epoch": 2.044260972127382, + "grad_norm": 0.14129409293832004, + "learning_rate": 2.7859251793444665e-05, + "loss": 2.7029, + "step": 32931 + }, + { + "epoch": 2.04432304922714, + "grad_norm": 0.1407187334899757, + "learning_rate": 2.785601371160278e-05, + "loss": 2.6623, + "step": 32932 + }, + { + "epoch": 2.044385126326898, + "grad_norm": 0.13960255355905868, + "learning_rate": 2.7852775745291443e-05, + "loss": 2.6276, + "step": 32933 + }, + { + "epoch": 2.044447203426656, + "grad_norm": 0.15483372885864308, + "learning_rate": 2.7849537894527555e-05, + "loss": 2.7901, + "step": 32934 + }, + { + "epoch": 2.044509280526414, + "grad_norm": 0.1439404203393627, + "learning_rate": 2.7846300159327986e-05, + "loss": 2.7414, + "step": 32935 + }, + { + "epoch": 2.0445713576261717, + "grad_norm": 0.14883592740329635, + "learning_rate": 2.7843062539709662e-05, + "loss": 2.7755, + "step": 32936 + }, + { + "epoch": 2.0446334347259296, + "grad_norm": 0.1499214967799423, + "learning_rate": 2.7839825035689465e-05, + "loss": 2.7453, + "step": 32937 + }, + { + "epoch": 2.0446955118256875, + "grad_norm": 0.15405002597658804, + "learning_rate": 2.783658764728428e-05, + "loss": 2.7235, + "step": 32938 + }, + { + "epoch": 2.0447575889254455, + "grad_norm": 0.1589350019173135, + "learning_rate": 2.7833350374510996e-05, + "loss": 2.717, + "step": 32939 + }, + { + "epoch": 2.0448196660252034, + "grad_norm": 0.1419652674577635, + "learning_rate": 2.7830113217386485e-05, + "loss": 2.8173, + "step": 32940 + }, + { + "epoch": 2.0448817431249613, + "grad_norm": 0.15551012820810042, + "learning_rate": 2.7826876175927673e-05, + "loss": 2.7662, + "step": 32941 + }, + { + "epoch": 2.044943820224719, + "grad_norm": 0.145243103109817, + "learning_rate": 2.7823639250151434e-05, + "loss": 2.6998, + "step": 32942 + }, + { + "epoch": 2.045005897324477, + "grad_norm": 0.14647039346727198, + "learning_rate": 2.782040244007465e-05, + "loss": 2.7735, + "step": 32943 + }, + { + "epoch": 2.045067974424235, + "grad_norm": 0.13947841991892257, + "learning_rate": 2.781716574571419e-05, + "loss": 2.771, + "step": 32944 + }, + { + "epoch": 2.045130051523993, + "grad_norm": 0.13804081256380382, + "learning_rate": 2.7813929167086984e-05, + "loss": 2.6582, + "step": 32945 + }, + { + "epoch": 2.045192128623751, + "grad_norm": 0.16776903610762173, + "learning_rate": 2.7810692704209897e-05, + "loss": 2.7105, + "step": 32946 + }, + { + "epoch": 2.045254205723509, + "grad_norm": 0.1780996969722449, + "learning_rate": 2.7807456357099808e-05, + "loss": 2.591, + "step": 32947 + }, + { + "epoch": 2.0453162828232667, + "grad_norm": 0.14657762102764865, + "learning_rate": 2.780422012577359e-05, + "loss": 2.7517, + "step": 32948 + }, + { + "epoch": 2.045378359923024, + "grad_norm": 0.16836899931232255, + "learning_rate": 2.7800984010248167e-05, + "loss": 2.7344, + "step": 32949 + }, + { + "epoch": 2.045440437022782, + "grad_norm": 0.17103726334214295, + "learning_rate": 2.779774801054039e-05, + "loss": 2.728, + "step": 32950 + }, + { + "epoch": 2.04550251412254, + "grad_norm": 0.16966586174353188, + "learning_rate": 2.779451212666714e-05, + "loss": 2.8608, + "step": 32951 + }, + { + "epoch": 2.045564591222298, + "grad_norm": 0.1501957448151993, + "learning_rate": 2.7791276358645324e-05, + "loss": 2.6807, + "step": 32952 + }, + { + "epoch": 2.045626668322056, + "grad_norm": 0.13812442631623936, + "learning_rate": 2.7788040706491813e-05, + "loss": 2.7729, + "step": 32953 + }, + { + "epoch": 2.045688745421814, + "grad_norm": 0.13916415158609816, + "learning_rate": 2.778480517022347e-05, + "loss": 2.8304, + "step": 32954 + }, + { + "epoch": 2.0457508225215717, + "grad_norm": 0.16106520303634766, + "learning_rate": 2.7781569749857205e-05, + "loss": 2.6868, + "step": 32955 + }, + { + "epoch": 2.0458128996213296, + "grad_norm": 0.14307203704897797, + "learning_rate": 2.777833444540989e-05, + "loss": 2.7222, + "step": 32956 + }, + { + "epoch": 2.0458749767210875, + "grad_norm": 0.13958528731411357, + "learning_rate": 2.7775099256898396e-05, + "loss": 2.7487, + "step": 32957 + }, + { + "epoch": 2.0459370538208455, + "grad_norm": 0.1620521157034938, + "learning_rate": 2.777186418433958e-05, + "loss": 2.6697, + "step": 32958 + }, + { + "epoch": 2.0459991309206034, + "grad_norm": 0.13922523850541965, + "learning_rate": 2.7768629227750374e-05, + "loss": 2.7007, + "step": 32959 + }, + { + "epoch": 2.0460612080203613, + "grad_norm": 0.15996227496732657, + "learning_rate": 2.776539438714762e-05, + "loss": 2.7455, + "step": 32960 + }, + { + "epoch": 2.046123285120119, + "grad_norm": 0.14527913500123688, + "learning_rate": 2.7762159662548203e-05, + "loss": 2.7439, + "step": 32961 + }, + { + "epoch": 2.046185362219877, + "grad_norm": 0.1505971456628935, + "learning_rate": 2.7758925053968976e-05, + "loss": 2.6867, + "step": 32962 + }, + { + "epoch": 2.046247439319635, + "grad_norm": 0.14796225487211057, + "learning_rate": 2.775569056142686e-05, + "loss": 2.7829, + "step": 32963 + }, + { + "epoch": 2.046309516419393, + "grad_norm": 0.1543638047992979, + "learning_rate": 2.7752456184938702e-05, + "loss": 2.7636, + "step": 32964 + }, + { + "epoch": 2.046371593519151, + "grad_norm": 0.17569852577862743, + "learning_rate": 2.7749221924521384e-05, + "loss": 2.6389, + "step": 32965 + }, + { + "epoch": 2.046433670618909, + "grad_norm": 0.14998638642642675, + "learning_rate": 2.7745987780191774e-05, + "loss": 2.717, + "step": 32966 + }, + { + "epoch": 2.0464957477186667, + "grad_norm": 0.15672265328427037, + "learning_rate": 2.7742753751966726e-05, + "loss": 2.7836, + "step": 32967 + }, + { + "epoch": 2.0465578248184246, + "grad_norm": 0.18051503194881424, + "learning_rate": 2.773951983986316e-05, + "loss": 2.7941, + "step": 32968 + }, + { + "epoch": 2.0466199019181825, + "grad_norm": 0.14416824324339314, + "learning_rate": 2.773628604389792e-05, + "loss": 2.7254, + "step": 32969 + }, + { + "epoch": 2.0466819790179405, + "grad_norm": 0.14233719051669477, + "learning_rate": 2.7733052364087886e-05, + "loss": 2.6825, + "step": 32970 + }, + { + "epoch": 2.0467440561176984, + "grad_norm": 0.16175096025211366, + "learning_rate": 2.7729818800449892e-05, + "loss": 2.6877, + "step": 32971 + }, + { + "epoch": 2.046806133217456, + "grad_norm": 0.15877172931506262, + "learning_rate": 2.7726585353000868e-05, + "loss": 2.7546, + "step": 32972 + }, + { + "epoch": 2.0468682103172138, + "grad_norm": 0.1736261501686114, + "learning_rate": 2.7723352021757652e-05, + "loss": 2.6324, + "step": 32973 + }, + { + "epoch": 2.0469302874169717, + "grad_norm": 0.1541605117467041, + "learning_rate": 2.7720118806737115e-05, + "loss": 2.7107, + "step": 32974 + }, + { + "epoch": 2.0469923645167296, + "grad_norm": 0.1502143644395422, + "learning_rate": 2.7716885707956115e-05, + "loss": 2.6597, + "step": 32975 + }, + { + "epoch": 2.0470544416164875, + "grad_norm": 0.1453298466080809, + "learning_rate": 2.771365272543155e-05, + "loss": 2.7875, + "step": 32976 + }, + { + "epoch": 2.0471165187162454, + "grad_norm": 0.14867577126229015, + "learning_rate": 2.771041985918026e-05, + "loss": 2.7471, + "step": 32977 + }, + { + "epoch": 2.0471785958160034, + "grad_norm": 0.14182923528310856, + "learning_rate": 2.7707187109219124e-05, + "loss": 2.673, + "step": 32978 + }, + { + "epoch": 2.0472406729157613, + "grad_norm": 0.1641931581201123, + "learning_rate": 2.7703954475565007e-05, + "loss": 2.7256, + "step": 32979 + }, + { + "epoch": 2.047302750015519, + "grad_norm": 0.14853405948913673, + "learning_rate": 2.7700721958234754e-05, + "loss": 2.6571, + "step": 32980 + }, + { + "epoch": 2.047364827115277, + "grad_norm": 0.14971215300934237, + "learning_rate": 2.7697489557245267e-05, + "loss": 2.7014, + "step": 32981 + }, + { + "epoch": 2.047426904215035, + "grad_norm": 0.14944219012819768, + "learning_rate": 2.769425727261339e-05, + "loss": 2.7057, + "step": 32982 + }, + { + "epoch": 2.047488981314793, + "grad_norm": 0.17578808453306785, + "learning_rate": 2.7691025104355965e-05, + "loss": 2.7025, + "step": 32983 + }, + { + "epoch": 2.047551058414551, + "grad_norm": 0.1478645283747194, + "learning_rate": 2.7687793052489897e-05, + "loss": 2.8091, + "step": 32984 + }, + { + "epoch": 2.047613135514309, + "grad_norm": 0.13834322682754174, + "learning_rate": 2.7684561117032014e-05, + "loss": 2.6992, + "step": 32985 + }, + { + "epoch": 2.0476752126140667, + "grad_norm": 0.1434336560800142, + "learning_rate": 2.7681329297999213e-05, + "loss": 2.7497, + "step": 32986 + }, + { + "epoch": 2.0477372897138246, + "grad_norm": 0.14128410664990432, + "learning_rate": 2.7678097595408336e-05, + "loss": 2.6606, + "step": 32987 + }, + { + "epoch": 2.0477993668135825, + "grad_norm": 0.14344461593762137, + "learning_rate": 2.7674866009276236e-05, + "loss": 2.825, + "step": 32988 + }, + { + "epoch": 2.0478614439133405, + "grad_norm": 0.15010348068581772, + "learning_rate": 2.7671634539619785e-05, + "loss": 2.6715, + "step": 32989 + }, + { + "epoch": 2.0479235210130984, + "grad_norm": 0.14373189667029876, + "learning_rate": 2.7668403186455817e-05, + "loss": 2.6266, + "step": 32990 + }, + { + "epoch": 2.0479855981128563, + "grad_norm": 0.14565069748526174, + "learning_rate": 2.7665171949801227e-05, + "loss": 2.7992, + "step": 32991 + }, + { + "epoch": 2.048047675212614, + "grad_norm": 0.14274681902521297, + "learning_rate": 2.766194082967286e-05, + "loss": 2.6608, + "step": 32992 + }, + { + "epoch": 2.048109752312372, + "grad_norm": 0.1374503791203514, + "learning_rate": 2.765870982608757e-05, + "loss": 2.7578, + "step": 32993 + }, + { + "epoch": 2.04817182941213, + "grad_norm": 0.14426032009677472, + "learning_rate": 2.7655478939062196e-05, + "loss": 2.7409, + "step": 32994 + }, + { + "epoch": 2.048233906511888, + "grad_norm": 0.18808974463233302, + "learning_rate": 2.7652248168613625e-05, + "loss": 2.6684, + "step": 32995 + }, + { + "epoch": 2.048295983611646, + "grad_norm": 0.14674891426864287, + "learning_rate": 2.7649017514758706e-05, + "loss": 2.673, + "step": 32996 + }, + { + "epoch": 2.0483580607114034, + "grad_norm": 0.1756458025538154, + "learning_rate": 2.7645786977514287e-05, + "loss": 2.7997, + "step": 32997 + }, + { + "epoch": 2.0484201378111613, + "grad_norm": 0.1505459750989417, + "learning_rate": 2.764255655689721e-05, + "loss": 2.7887, + "step": 32998 + }, + { + "epoch": 2.048482214910919, + "grad_norm": 0.14013113615500036, + "learning_rate": 2.763932625292436e-05, + "loss": 2.7285, + "step": 32999 + }, + { + "epoch": 2.048544292010677, + "grad_norm": 0.14281397738412507, + "learning_rate": 2.763609606561257e-05, + "loss": 2.6423, + "step": 33000 + }, + { + "epoch": 2.048606369110435, + "grad_norm": 0.14685606530067463, + "learning_rate": 2.7632865994978695e-05, + "loss": 2.6328, + "step": 33001 + }, + { + "epoch": 2.048668446210193, + "grad_norm": 0.1687732658418908, + "learning_rate": 2.7629636041039592e-05, + "loss": 2.6993, + "step": 33002 + }, + { + "epoch": 2.048730523309951, + "grad_norm": 0.14783334128451658, + "learning_rate": 2.7626406203812082e-05, + "loss": 2.6759, + "step": 33003 + }, + { + "epoch": 2.048792600409709, + "grad_norm": 0.15823594682978245, + "learning_rate": 2.7623176483313074e-05, + "loss": 2.7756, + "step": 33004 + }, + { + "epoch": 2.0488546775094667, + "grad_norm": 0.14153014388002774, + "learning_rate": 2.7619946879559377e-05, + "loss": 2.7304, + "step": 33005 + }, + { + "epoch": 2.0489167546092246, + "grad_norm": 0.16353156829036133, + "learning_rate": 2.7616717392567854e-05, + "loss": 2.7625, + "step": 33006 + }, + { + "epoch": 2.0489788317089825, + "grad_norm": 0.15991490485390533, + "learning_rate": 2.7613488022355327e-05, + "loss": 2.6846, + "step": 33007 + }, + { + "epoch": 2.0490409088087405, + "grad_norm": 0.14385262537063098, + "learning_rate": 2.761025876893869e-05, + "loss": 2.7164, + "step": 33008 + }, + { + "epoch": 2.0491029859084984, + "grad_norm": 0.19869943324885583, + "learning_rate": 2.760702963233477e-05, + "loss": 2.6455, + "step": 33009 + }, + { + "epoch": 2.0491650630082563, + "grad_norm": 0.18858652415493077, + "learning_rate": 2.7603800612560405e-05, + "loss": 2.7427, + "step": 33010 + }, + { + "epoch": 2.049227140108014, + "grad_norm": 0.1465332776139047, + "learning_rate": 2.7600571709632438e-05, + "loss": 2.7179, + "step": 33011 + }, + { + "epoch": 2.049289217207772, + "grad_norm": 0.15082719527799465, + "learning_rate": 2.7597342923567737e-05, + "loss": 2.7235, + "step": 33012 + }, + { + "epoch": 2.04935129430753, + "grad_norm": 0.14413769080631011, + "learning_rate": 2.7594114254383143e-05, + "loss": 2.7316, + "step": 33013 + }, + { + "epoch": 2.049413371407288, + "grad_norm": 0.16915201761055587, + "learning_rate": 2.7590885702095492e-05, + "loss": 2.8106, + "step": 33014 + }, + { + "epoch": 2.049475448507046, + "grad_norm": 0.1470417861325449, + "learning_rate": 2.7587657266721635e-05, + "loss": 2.7207, + "step": 33015 + }, + { + "epoch": 2.049537525606804, + "grad_norm": 0.18023220719057723, + "learning_rate": 2.7584428948278385e-05, + "loss": 2.7943, + "step": 33016 + }, + { + "epoch": 2.0495996027065617, + "grad_norm": 0.14191955508526524, + "learning_rate": 2.758120074678261e-05, + "loss": 2.723, + "step": 33017 + }, + { + "epoch": 2.0496616798063196, + "grad_norm": 0.14419426165609806, + "learning_rate": 2.7577972662251178e-05, + "loss": 2.8068, + "step": 33018 + }, + { + "epoch": 2.0497237569060776, + "grad_norm": 0.14630385401754134, + "learning_rate": 2.7574744694700905e-05, + "loss": 2.7513, + "step": 33019 + }, + { + "epoch": 2.049785834005835, + "grad_norm": 0.14483544196217896, + "learning_rate": 2.7571516844148626e-05, + "loss": 2.7585, + "step": 33020 + }, + { + "epoch": 2.049847911105593, + "grad_norm": 0.14923571245538547, + "learning_rate": 2.7568289110611178e-05, + "loss": 2.7182, + "step": 33021 + }, + { + "epoch": 2.049909988205351, + "grad_norm": 0.1454062587505201, + "learning_rate": 2.7565061494105426e-05, + "loss": 2.7469, + "step": 33022 + }, + { + "epoch": 2.049972065305109, + "grad_norm": 0.14829419144893388, + "learning_rate": 2.756183399464819e-05, + "loss": 2.7834, + "step": 33023 + }, + { + "epoch": 2.0500341424048667, + "grad_norm": 0.15513371833167577, + "learning_rate": 2.7558606612256323e-05, + "loss": 2.7238, + "step": 33024 + }, + { + "epoch": 2.0500962195046246, + "grad_norm": 0.15087914698313054, + "learning_rate": 2.7555379346946646e-05, + "loss": 2.8495, + "step": 33025 + }, + { + "epoch": 2.0501582966043825, + "grad_norm": 0.16238361739270463, + "learning_rate": 2.7552152198735985e-05, + "loss": 2.7678, + "step": 33026 + }, + { + "epoch": 2.0502203737041405, + "grad_norm": 0.16407620849558108, + "learning_rate": 2.7548925167641215e-05, + "loss": 2.7082, + "step": 33027 + }, + { + "epoch": 2.0502824508038984, + "grad_norm": 0.1589100607787352, + "learning_rate": 2.7545698253679152e-05, + "loss": 2.7931, + "step": 33028 + }, + { + "epoch": 2.0503445279036563, + "grad_norm": 0.14170627668432054, + "learning_rate": 2.7542471456866637e-05, + "loss": 2.7109, + "step": 33029 + }, + { + "epoch": 2.050406605003414, + "grad_norm": 0.1472184226988537, + "learning_rate": 2.753924477722048e-05, + "loss": 2.6913, + "step": 33030 + }, + { + "epoch": 2.050468682103172, + "grad_norm": 0.15744983623553058, + "learning_rate": 2.7536018214757555e-05, + "loss": 2.5973, + "step": 33031 + }, + { + "epoch": 2.05053075920293, + "grad_norm": 0.14191269072487328, + "learning_rate": 2.753279176949467e-05, + "loss": 2.8317, + "step": 33032 + }, + { + "epoch": 2.050592836302688, + "grad_norm": 0.14801782333052704, + "learning_rate": 2.752956544144867e-05, + "loss": 2.6994, + "step": 33033 + }, + { + "epoch": 2.050654913402446, + "grad_norm": 0.14630018097779285, + "learning_rate": 2.752633923063636e-05, + "loss": 2.6362, + "step": 33034 + }, + { + "epoch": 2.050716990502204, + "grad_norm": 0.14490001904383495, + "learning_rate": 2.7523113137074613e-05, + "loss": 2.7029, + "step": 33035 + }, + { + "epoch": 2.0507790676019617, + "grad_norm": 0.13825091417184554, + "learning_rate": 2.7519887160780245e-05, + "loss": 2.6413, + "step": 33036 + }, + { + "epoch": 2.0508411447017196, + "grad_norm": 0.17339685603909663, + "learning_rate": 2.751666130177008e-05, + "loss": 2.8096, + "step": 33037 + }, + { + "epoch": 2.0509032218014775, + "grad_norm": 0.1424459014124704, + "learning_rate": 2.7513435560060952e-05, + "loss": 2.7932, + "step": 33038 + }, + { + "epoch": 2.0509652989012355, + "grad_norm": 0.14088917793029151, + "learning_rate": 2.7510209935669667e-05, + "loss": 2.8416, + "step": 33039 + }, + { + "epoch": 2.0510273760009934, + "grad_norm": 0.14366593235040995, + "learning_rate": 2.7506984428613104e-05, + "loss": 2.7145, + "step": 33040 + }, + { + "epoch": 2.0510894531007513, + "grad_norm": 0.1413633249792903, + "learning_rate": 2.750375903890805e-05, + "loss": 2.7986, + "step": 33041 + }, + { + "epoch": 2.051151530200509, + "grad_norm": 0.14312207958207063, + "learning_rate": 2.7500533766571355e-05, + "loss": 2.6052, + "step": 33042 + }, + { + "epoch": 2.051213607300267, + "grad_norm": 0.15736184312575915, + "learning_rate": 2.749730861161982e-05, + "loss": 2.7491, + "step": 33043 + }, + { + "epoch": 2.0512756844000246, + "grad_norm": 0.13766956088543877, + "learning_rate": 2.7494083574070307e-05, + "loss": 2.7247, + "step": 33044 + }, + { + "epoch": 2.0513377614997825, + "grad_norm": 0.1367678417896479, + "learning_rate": 2.7490858653939623e-05, + "loss": 2.7166, + "step": 33045 + }, + { + "epoch": 2.0513998385995404, + "grad_norm": 0.1546862734640026, + "learning_rate": 2.7487633851244586e-05, + "loss": 2.6591, + "step": 33046 + }, + { + "epoch": 2.0514619156992984, + "grad_norm": 0.14045873057791974, + "learning_rate": 2.7484409166002017e-05, + "loss": 2.7188, + "step": 33047 + }, + { + "epoch": 2.0515239927990563, + "grad_norm": 0.1352029422082947, + "learning_rate": 2.7481184598228772e-05, + "loss": 2.7294, + "step": 33048 + }, + { + "epoch": 2.051586069898814, + "grad_norm": 0.147192242853728, + "learning_rate": 2.7477960147941627e-05, + "loss": 2.6668, + "step": 33049 + }, + { + "epoch": 2.051648146998572, + "grad_norm": 0.1467189627027341, + "learning_rate": 2.7474735815157453e-05, + "loss": 2.7982, + "step": 33050 + }, + { + "epoch": 2.05171022409833, + "grad_norm": 0.15982617114703246, + "learning_rate": 2.7471511599893053e-05, + "loss": 2.756, + "step": 33051 + }, + { + "epoch": 2.051772301198088, + "grad_norm": 0.15328346101136425, + "learning_rate": 2.7468287502165245e-05, + "loss": 2.8477, + "step": 33052 + }, + { + "epoch": 2.051834378297846, + "grad_norm": 0.15713465118811806, + "learning_rate": 2.746506352199083e-05, + "loss": 2.7595, + "step": 33053 + }, + { + "epoch": 2.051896455397604, + "grad_norm": 0.14910169748845334, + "learning_rate": 2.7461839659386674e-05, + "loss": 2.6403, + "step": 33054 + }, + { + "epoch": 2.0519585324973617, + "grad_norm": 0.15304814619049517, + "learning_rate": 2.745861591436957e-05, + "loss": 2.6912, + "step": 33055 + }, + { + "epoch": 2.0520206095971196, + "grad_norm": 0.14149087986771813, + "learning_rate": 2.7455392286956334e-05, + "loss": 2.6626, + "step": 33056 + }, + { + "epoch": 2.0520826866968775, + "grad_norm": 0.1461143333806132, + "learning_rate": 2.745216877716377e-05, + "loss": 2.7586, + "step": 33057 + }, + { + "epoch": 2.0521447637966355, + "grad_norm": 0.1480592060741201, + "learning_rate": 2.744894538500874e-05, + "loss": 2.7331, + "step": 33058 + }, + { + "epoch": 2.0522068408963934, + "grad_norm": 0.16085694713714926, + "learning_rate": 2.7445722110508032e-05, + "loss": 2.7393, + "step": 33059 + }, + { + "epoch": 2.0522689179961513, + "grad_norm": 0.17098899621792019, + "learning_rate": 2.744249895367847e-05, + "loss": 2.8274, + "step": 33060 + }, + { + "epoch": 2.052330995095909, + "grad_norm": 0.13807508838772642, + "learning_rate": 2.7439275914536843e-05, + "loss": 2.7052, + "step": 33061 + }, + { + "epoch": 2.052393072195667, + "grad_norm": 0.1426952351408842, + "learning_rate": 2.7436052993100013e-05, + "loss": 2.6453, + "step": 33062 + }, + { + "epoch": 2.052455149295425, + "grad_norm": 0.1484262186080063, + "learning_rate": 2.7432830189384774e-05, + "loss": 2.7303, + "step": 33063 + }, + { + "epoch": 2.052517226395183, + "grad_norm": 0.168885848986196, + "learning_rate": 2.742960750340794e-05, + "loss": 2.7726, + "step": 33064 + }, + { + "epoch": 2.052579303494941, + "grad_norm": 0.16973611389957818, + "learning_rate": 2.7426384935186317e-05, + "loss": 2.7992, + "step": 33065 + }, + { + "epoch": 2.052641380594699, + "grad_norm": 0.14199422129101955, + "learning_rate": 2.742316248473671e-05, + "loss": 2.7526, + "step": 33066 + }, + { + "epoch": 2.0527034576944567, + "grad_norm": 0.1451901161468851, + "learning_rate": 2.7419940152075957e-05, + "loss": 2.7298, + "step": 33067 + }, + { + "epoch": 2.052765534794214, + "grad_norm": 0.1461277690079321, + "learning_rate": 2.741671793722086e-05, + "loss": 2.7247, + "step": 33068 + }, + { + "epoch": 2.052827611893972, + "grad_norm": 0.14142182222263563, + "learning_rate": 2.741349584018823e-05, + "loss": 2.7745, + "step": 33069 + }, + { + "epoch": 2.05288968899373, + "grad_norm": 0.16588489209505877, + "learning_rate": 2.7410273860994856e-05, + "loss": 2.7484, + "step": 33070 + }, + { + "epoch": 2.052951766093488, + "grad_norm": 0.16267376055168345, + "learning_rate": 2.7407051999657584e-05, + "loss": 2.7958, + "step": 33071 + }, + { + "epoch": 2.053013843193246, + "grad_norm": 0.15635757319530197, + "learning_rate": 2.7403830256193208e-05, + "loss": 2.6692, + "step": 33072 + }, + { + "epoch": 2.053075920293004, + "grad_norm": 0.14587540885235484, + "learning_rate": 2.740060863061854e-05, + "loss": 2.7203, + "step": 33073 + }, + { + "epoch": 2.0531379973927617, + "grad_norm": 0.15507236993714263, + "learning_rate": 2.739738712295038e-05, + "loss": 2.7577, + "step": 33074 + }, + { + "epoch": 2.0532000744925196, + "grad_norm": 0.1486462518077521, + "learning_rate": 2.7394165733205513e-05, + "loss": 2.6955, + "step": 33075 + }, + { + "epoch": 2.0532621515922775, + "grad_norm": 0.14774802978488094, + "learning_rate": 2.739094446140079e-05, + "loss": 2.8136, + "step": 33076 + }, + { + "epoch": 2.0533242286920355, + "grad_norm": 0.1415164626021867, + "learning_rate": 2.7387723307553003e-05, + "loss": 2.6835, + "step": 33077 + }, + { + "epoch": 2.0533863057917934, + "grad_norm": 0.16152485635819253, + "learning_rate": 2.7384502271678957e-05, + "loss": 2.6915, + "step": 33078 + }, + { + "epoch": 2.0534483828915513, + "grad_norm": 0.18008407078192895, + "learning_rate": 2.738128135379543e-05, + "loss": 2.7883, + "step": 33079 + }, + { + "epoch": 2.053510459991309, + "grad_norm": 0.16838565917744405, + "learning_rate": 2.7378060553919265e-05, + "loss": 2.7184, + "step": 33080 + }, + { + "epoch": 2.053572537091067, + "grad_norm": 0.15358931638851764, + "learning_rate": 2.7374839872067254e-05, + "loss": 2.7362, + "step": 33081 + }, + { + "epoch": 2.053634614190825, + "grad_norm": 0.15020318550141185, + "learning_rate": 2.7371619308256173e-05, + "loss": 2.761, + "step": 33082 + }, + { + "epoch": 2.053696691290583, + "grad_norm": 0.1724393530846659, + "learning_rate": 2.736839886250287e-05, + "loss": 2.7061, + "step": 33083 + }, + { + "epoch": 2.053758768390341, + "grad_norm": 0.17898523927410828, + "learning_rate": 2.7365178534824104e-05, + "loss": 2.6727, + "step": 33084 + }, + { + "epoch": 2.053820845490099, + "grad_norm": 0.14414879742123063, + "learning_rate": 2.7361958325236713e-05, + "loss": 2.7231, + "step": 33085 + }, + { + "epoch": 2.0538829225898567, + "grad_norm": 0.14764655979592833, + "learning_rate": 2.7358738233757485e-05, + "loss": 2.7338, + "step": 33086 + }, + { + "epoch": 2.0539449996896146, + "grad_norm": 0.18780837925660457, + "learning_rate": 2.7355518260403212e-05, + "loss": 2.7546, + "step": 33087 + }, + { + "epoch": 2.0540070767893726, + "grad_norm": 0.148522271243884, + "learning_rate": 2.7352298405190703e-05, + "loss": 2.7528, + "step": 33088 + }, + { + "epoch": 2.0540691538891305, + "grad_norm": 0.1409162445914574, + "learning_rate": 2.734907866813673e-05, + "loss": 2.7077, + "step": 33089 + }, + { + "epoch": 2.0541312309888884, + "grad_norm": 0.14345648384379317, + "learning_rate": 2.7345859049258127e-05, + "loss": 2.6879, + "step": 33090 + }, + { + "epoch": 2.0541933080886463, + "grad_norm": 0.14995074655614746, + "learning_rate": 2.734263954857168e-05, + "loss": 2.7441, + "step": 33091 + }, + { + "epoch": 2.054255385188404, + "grad_norm": 0.1559694573044909, + "learning_rate": 2.733942016609418e-05, + "loss": 2.6605, + "step": 33092 + }, + { + "epoch": 2.0543174622881617, + "grad_norm": 0.14160392364638058, + "learning_rate": 2.7336200901842414e-05, + "loss": 2.7228, + "step": 33093 + }, + { + "epoch": 2.0543795393879196, + "grad_norm": 0.1468739922975912, + "learning_rate": 2.7332981755833197e-05, + "loss": 2.7293, + "step": 33094 + }, + { + "epoch": 2.0544416164876775, + "grad_norm": 0.15126924114556567, + "learning_rate": 2.732976272808333e-05, + "loss": 2.7288, + "step": 33095 + }, + { + "epoch": 2.0545036935874355, + "grad_norm": 0.14571719497981536, + "learning_rate": 2.7326543818609585e-05, + "loss": 2.6819, + "step": 33096 + }, + { + "epoch": 2.0545657706871934, + "grad_norm": 0.15599610442943257, + "learning_rate": 2.7323325027428748e-05, + "loss": 2.7679, + "step": 33097 + }, + { + "epoch": 2.0546278477869513, + "grad_norm": 0.1536183527978081, + "learning_rate": 2.732010635455765e-05, + "loss": 2.7673, + "step": 33098 + }, + { + "epoch": 2.054689924886709, + "grad_norm": 0.14027323702375874, + "learning_rate": 2.7316887800013065e-05, + "loss": 2.6676, + "step": 33099 + }, + { + "epoch": 2.054752001986467, + "grad_norm": 0.1589041882590317, + "learning_rate": 2.731366936381178e-05, + "loss": 2.6604, + "step": 33100 + }, + { + "epoch": 2.054814079086225, + "grad_norm": 0.1524338272527362, + "learning_rate": 2.7310451045970587e-05, + "loss": 2.7006, + "step": 33101 + }, + { + "epoch": 2.054876156185983, + "grad_norm": 0.18473839549100488, + "learning_rate": 2.7307232846506258e-05, + "loss": 2.7893, + "step": 33102 + }, + { + "epoch": 2.054938233285741, + "grad_norm": 0.15964387717756248, + "learning_rate": 2.730401476543563e-05, + "loss": 2.8176, + "step": 33103 + }, + { + "epoch": 2.055000310385499, + "grad_norm": 0.1468157643726238, + "learning_rate": 2.7300796802775463e-05, + "loss": 2.7712, + "step": 33104 + }, + { + "epoch": 2.0550623874852567, + "grad_norm": 0.1574455891772718, + "learning_rate": 2.7297578958542552e-05, + "loss": 2.7423, + "step": 33105 + }, + { + "epoch": 2.0551244645850146, + "grad_norm": 0.16982865791156465, + "learning_rate": 2.7294361232753664e-05, + "loss": 2.7097, + "step": 33106 + }, + { + "epoch": 2.0551865416847725, + "grad_norm": 0.1565485724233295, + "learning_rate": 2.729114362542562e-05, + "loss": 2.8481, + "step": 33107 + }, + { + "epoch": 2.0552486187845305, + "grad_norm": 0.14362894460218037, + "learning_rate": 2.7287926136575197e-05, + "loss": 2.6957, + "step": 33108 + }, + { + "epoch": 2.0553106958842884, + "grad_norm": 0.1846035215989118, + "learning_rate": 2.7284708766219175e-05, + "loss": 2.7033, + "step": 33109 + }, + { + "epoch": 2.0553727729840463, + "grad_norm": 0.14000436467366298, + "learning_rate": 2.7281491514374345e-05, + "loss": 2.7018, + "step": 33110 + }, + { + "epoch": 2.055434850083804, + "grad_norm": 0.14559179481041784, + "learning_rate": 2.7278274381057468e-05, + "loss": 2.7834, + "step": 33111 + }, + { + "epoch": 2.055496927183562, + "grad_norm": 0.15156665088435914, + "learning_rate": 2.7275057366285363e-05, + "loss": 2.7053, + "step": 33112 + }, + { + "epoch": 2.05555900428332, + "grad_norm": 0.1648594369701088, + "learning_rate": 2.7271840470074807e-05, + "loss": 2.6849, + "step": 33113 + }, + { + "epoch": 2.055621081383078, + "grad_norm": 0.14729252096595655, + "learning_rate": 2.726862369244255e-05, + "loss": 2.6553, + "step": 33114 + }, + { + "epoch": 2.055683158482836, + "grad_norm": 0.1695779795342446, + "learning_rate": 2.7265407033405432e-05, + "loss": 2.7733, + "step": 33115 + }, + { + "epoch": 2.0557452355825934, + "grad_norm": 0.14715978046563977, + "learning_rate": 2.7262190492980176e-05, + "loss": 2.8168, + "step": 33116 + }, + { + "epoch": 2.0558073126823513, + "grad_norm": 0.14774318865463631, + "learning_rate": 2.7258974071183612e-05, + "loss": 2.8075, + "step": 33117 + }, + { + "epoch": 2.055869389782109, + "grad_norm": 0.14224608833410401, + "learning_rate": 2.725575776803251e-05, + "loss": 2.769, + "step": 33118 + }, + { + "epoch": 2.055931466881867, + "grad_norm": 0.16516244666457336, + "learning_rate": 2.725254158354363e-05, + "loss": 2.7852, + "step": 33119 + }, + { + "epoch": 2.055993543981625, + "grad_norm": 0.14349170522857488, + "learning_rate": 2.724932551773375e-05, + "loss": 2.7175, + "step": 33120 + }, + { + "epoch": 2.056055621081383, + "grad_norm": 0.15646406649457317, + "learning_rate": 2.7246109570619677e-05, + "loss": 2.8471, + "step": 33121 + }, + { + "epoch": 2.056117698181141, + "grad_norm": 0.14552600798034152, + "learning_rate": 2.7242893742218174e-05, + "loss": 2.7653, + "step": 33122 + }, + { + "epoch": 2.056179775280899, + "grad_norm": 0.1709611287252996, + "learning_rate": 2.7239678032546024e-05, + "loss": 2.7216, + "step": 33123 + }, + { + "epoch": 2.0562418523806567, + "grad_norm": 0.1543574757417152, + "learning_rate": 2.7236462441619992e-05, + "loss": 2.6972, + "step": 33124 + }, + { + "epoch": 2.0563039294804146, + "grad_norm": 0.14848003907262058, + "learning_rate": 2.723324696945685e-05, + "loss": 2.7128, + "step": 33125 + }, + { + "epoch": 2.0563660065801725, + "grad_norm": 0.15556522160598077, + "learning_rate": 2.723003161607339e-05, + "loss": 2.7585, + "step": 33126 + }, + { + "epoch": 2.0564280836799305, + "grad_norm": 0.15629248041612148, + "learning_rate": 2.7226816381486396e-05, + "loss": 2.7517, + "step": 33127 + }, + { + "epoch": 2.0564901607796884, + "grad_norm": 0.1552225005374681, + "learning_rate": 2.7223601265712627e-05, + "loss": 2.7368, + "step": 33128 + }, + { + "epoch": 2.0565522378794463, + "grad_norm": 0.13708485223726768, + "learning_rate": 2.722038626876884e-05, + "loss": 2.7369, + "step": 33129 + }, + { + "epoch": 2.056614314979204, + "grad_norm": 0.138612143318501, + "learning_rate": 2.7217171390671848e-05, + "loss": 2.648, + "step": 33130 + }, + { + "epoch": 2.056676392078962, + "grad_norm": 0.1402376588952269, + "learning_rate": 2.7213956631438404e-05, + "loss": 2.7327, + "step": 33131 + }, + { + "epoch": 2.05673846917872, + "grad_norm": 0.13656967755273686, + "learning_rate": 2.7210741991085275e-05, + "loss": 2.6396, + "step": 33132 + }, + { + "epoch": 2.056800546278478, + "grad_norm": 0.18078386613144776, + "learning_rate": 2.7207527469629222e-05, + "loss": 2.6478, + "step": 33133 + }, + { + "epoch": 2.056862623378236, + "grad_norm": 0.17473578560270103, + "learning_rate": 2.720431306708705e-05, + "loss": 2.7883, + "step": 33134 + }, + { + "epoch": 2.056924700477994, + "grad_norm": 0.3067850635295813, + "learning_rate": 2.720109878347551e-05, + "loss": 2.7733, + "step": 33135 + }, + { + "epoch": 2.0569867775777517, + "grad_norm": 0.19340708172980353, + "learning_rate": 2.7197884618811377e-05, + "loss": 2.7313, + "step": 33136 + }, + { + "epoch": 2.0570488546775096, + "grad_norm": 0.151744802204492, + "learning_rate": 2.7194670573111403e-05, + "loss": 2.6957, + "step": 33137 + }, + { + "epoch": 2.0571109317772676, + "grad_norm": 0.15386629989999437, + "learning_rate": 2.7191456646392362e-05, + "loss": 2.7675, + "step": 33138 + }, + { + "epoch": 2.0571730088770255, + "grad_norm": 0.17457793355931447, + "learning_rate": 2.718824283867104e-05, + "loss": 2.7535, + "step": 33139 + }, + { + "epoch": 2.057235085976783, + "grad_norm": 0.15172082752255045, + "learning_rate": 2.7185029149964196e-05, + "loss": 2.7037, + "step": 33140 + }, + { + "epoch": 2.057297163076541, + "grad_norm": 0.1574004852113249, + "learning_rate": 2.7181815580288593e-05, + "loss": 2.7709, + "step": 33141 + }, + { + "epoch": 2.057359240176299, + "grad_norm": 0.14383212029597384, + "learning_rate": 2.717860212966098e-05, + "loss": 2.6994, + "step": 33142 + }, + { + "epoch": 2.0574213172760567, + "grad_norm": 0.17821334396489433, + "learning_rate": 2.7175388798098154e-05, + "loss": 2.7426, + "step": 33143 + }, + { + "epoch": 2.0574833943758146, + "grad_norm": 0.18651598770682837, + "learning_rate": 2.717217558561687e-05, + "loss": 2.7453, + "step": 33144 + }, + { + "epoch": 2.0575454714755725, + "grad_norm": 0.14467423686171832, + "learning_rate": 2.7168962492233885e-05, + "loss": 2.6626, + "step": 33145 + }, + { + "epoch": 2.0576075485753305, + "grad_norm": 0.1471783412221988, + "learning_rate": 2.716574951796595e-05, + "loss": 2.7327, + "step": 33146 + }, + { + "epoch": 2.0576696256750884, + "grad_norm": 0.14253198190520824, + "learning_rate": 2.7162536662829842e-05, + "loss": 2.6908, + "step": 33147 + }, + { + "epoch": 2.0577317027748463, + "grad_norm": 0.15334921764835596, + "learning_rate": 2.715932392684234e-05, + "loss": 2.7352, + "step": 33148 + }, + { + "epoch": 2.057793779874604, + "grad_norm": 0.1459376814736947, + "learning_rate": 2.7156111310020194e-05, + "loss": 2.7402, + "step": 33149 + }, + { + "epoch": 2.057855856974362, + "grad_norm": 0.15375501516931828, + "learning_rate": 2.7152898812380155e-05, + "loss": 2.733, + "step": 33150 + }, + { + "epoch": 2.05791793407412, + "grad_norm": 0.14668191941108663, + "learning_rate": 2.714968643393899e-05, + "loss": 2.8413, + "step": 33151 + }, + { + "epoch": 2.057980011173878, + "grad_norm": 0.14012975376378925, + "learning_rate": 2.7146474174713445e-05, + "loss": 2.7196, + "step": 33152 + }, + { + "epoch": 2.058042088273636, + "grad_norm": 0.1408384411977005, + "learning_rate": 2.7143262034720308e-05, + "loss": 2.6861, + "step": 33153 + }, + { + "epoch": 2.058104165373394, + "grad_norm": 0.16047359056238628, + "learning_rate": 2.7140050013976326e-05, + "loss": 2.7858, + "step": 33154 + }, + { + "epoch": 2.0581662424731517, + "grad_norm": 0.1526841066278517, + "learning_rate": 2.7136838112498242e-05, + "loss": 2.7411, + "step": 33155 + }, + { + "epoch": 2.0582283195729096, + "grad_norm": 0.1422750242941934, + "learning_rate": 2.7133626330302814e-05, + "loss": 2.7741, + "step": 33156 + }, + { + "epoch": 2.0582903966726676, + "grad_norm": 0.1462512067903956, + "learning_rate": 2.7130414667406823e-05, + "loss": 2.7282, + "step": 33157 + }, + { + "epoch": 2.0583524737724255, + "grad_norm": 0.14720893320112918, + "learning_rate": 2.7127203123827016e-05, + "loss": 2.8179, + "step": 33158 + }, + { + "epoch": 2.0584145508721834, + "grad_norm": 0.1491005419325686, + "learning_rate": 2.7123991699580136e-05, + "loss": 2.7099, + "step": 33159 + }, + { + "epoch": 2.0584766279719413, + "grad_norm": 0.14217239289148026, + "learning_rate": 2.7120780394682942e-05, + "loss": 2.7344, + "step": 33160 + }, + { + "epoch": 2.0585387050716992, + "grad_norm": 0.13796150233844753, + "learning_rate": 2.7117569209152182e-05, + "loss": 2.6347, + "step": 33161 + }, + { + "epoch": 2.058600782171457, + "grad_norm": 0.1480460904183647, + "learning_rate": 2.711435814300463e-05, + "loss": 2.7595, + "step": 33162 + }, + { + "epoch": 2.058662859271215, + "grad_norm": 0.20324044431591004, + "learning_rate": 2.711114719625703e-05, + "loss": 2.762, + "step": 33163 + }, + { + "epoch": 2.0587249363709725, + "grad_norm": 0.1549854727251506, + "learning_rate": 2.7107936368926136e-05, + "loss": 2.7079, + "step": 33164 + }, + { + "epoch": 2.0587870134707305, + "grad_norm": 0.14007635511183333, + "learning_rate": 2.710472566102867e-05, + "loss": 2.7258, + "step": 33165 + }, + { + "epoch": 2.0588490905704884, + "grad_norm": 0.15097657533080527, + "learning_rate": 2.710151507258143e-05, + "loss": 2.7986, + "step": 33166 + }, + { + "epoch": 2.0589111676702463, + "grad_norm": 0.14861389395659821, + "learning_rate": 2.7098304603601142e-05, + "loss": 2.7287, + "step": 33167 + }, + { + "epoch": 2.058973244770004, + "grad_norm": 0.15242376427039267, + "learning_rate": 2.709509425410456e-05, + "loss": 2.7095, + "step": 33168 + }, + { + "epoch": 2.059035321869762, + "grad_norm": 0.14189193156119384, + "learning_rate": 2.709188402410841e-05, + "loss": 2.6941, + "step": 33169 + }, + { + "epoch": 2.05909739896952, + "grad_norm": 0.14109642429711133, + "learning_rate": 2.708867391362948e-05, + "loss": 2.7805, + "step": 33170 + }, + { + "epoch": 2.059159476069278, + "grad_norm": 0.1522808923402027, + "learning_rate": 2.7085463922684508e-05, + "loss": 2.6773, + "step": 33171 + }, + { + "epoch": 2.059221553169036, + "grad_norm": 0.2039904132067665, + "learning_rate": 2.7082254051290228e-05, + "loss": 2.6775, + "step": 33172 + }, + { + "epoch": 2.059283630268794, + "grad_norm": 0.14049973471792976, + "learning_rate": 2.7079044299463396e-05, + "loss": 2.7202, + "step": 33173 + }, + { + "epoch": 2.0593457073685517, + "grad_norm": 0.14811039319429392, + "learning_rate": 2.7075834667220733e-05, + "loss": 2.7344, + "step": 33174 + }, + { + "epoch": 2.0594077844683096, + "grad_norm": 0.13756389975453076, + "learning_rate": 2.7072625154579024e-05, + "loss": 2.7009, + "step": 33175 + }, + { + "epoch": 2.0594698615680676, + "grad_norm": 0.14459672939544407, + "learning_rate": 2.7069415761554993e-05, + "loss": 2.6833, + "step": 33176 + }, + { + "epoch": 2.0595319386678255, + "grad_norm": 0.16525692075760093, + "learning_rate": 2.706620648816539e-05, + "loss": 2.6907, + "step": 33177 + }, + { + "epoch": 2.0595940157675834, + "grad_norm": 0.1697418421110905, + "learning_rate": 2.706299733442693e-05, + "loss": 2.71, + "step": 33178 + }, + { + "epoch": 2.0596560928673413, + "grad_norm": 0.14134234016882605, + "learning_rate": 2.7059788300356408e-05, + "loss": 2.739, + "step": 33179 + }, + { + "epoch": 2.059718169967099, + "grad_norm": 0.14741132545831356, + "learning_rate": 2.7056579385970515e-05, + "loss": 2.7875, + "step": 33180 + }, + { + "epoch": 2.059780247066857, + "grad_norm": 0.16291615791456815, + "learning_rate": 2.7053370591286042e-05, + "loss": 2.7332, + "step": 33181 + }, + { + "epoch": 2.059842324166615, + "grad_norm": 0.16016492587525258, + "learning_rate": 2.70501619163197e-05, + "loss": 2.7086, + "step": 33182 + }, + { + "epoch": 2.059904401266373, + "grad_norm": 0.14750997725783782, + "learning_rate": 2.7046953361088216e-05, + "loss": 2.7929, + "step": 33183 + }, + { + "epoch": 2.059966478366131, + "grad_norm": 0.1513196378286442, + "learning_rate": 2.7043744925608372e-05, + "loss": 2.7288, + "step": 33184 + }, + { + "epoch": 2.060028555465889, + "grad_norm": 0.14374025508772392, + "learning_rate": 2.7040536609896876e-05, + "loss": 2.7927, + "step": 33185 + }, + { + "epoch": 2.0600906325656467, + "grad_norm": 0.16277467410902005, + "learning_rate": 2.703732841397048e-05, + "loss": 2.7186, + "step": 33186 + }, + { + "epoch": 2.0601527096654046, + "grad_norm": 0.14125806647240388, + "learning_rate": 2.7034120337845915e-05, + "loss": 2.7538, + "step": 33187 + }, + { + "epoch": 2.060214786765162, + "grad_norm": 0.15263565536587442, + "learning_rate": 2.7030912381539898e-05, + "loss": 2.7439, + "step": 33188 + }, + { + "epoch": 2.06027686386492, + "grad_norm": 0.1473695016790587, + "learning_rate": 2.702770454506921e-05, + "loss": 2.7468, + "step": 33189 + }, + { + "epoch": 2.060338940964678, + "grad_norm": 0.14389519304175677, + "learning_rate": 2.7024496828450563e-05, + "loss": 2.7833, + "step": 33190 + }, + { + "epoch": 2.060401018064436, + "grad_norm": 0.14557400420390587, + "learning_rate": 2.7021289231700687e-05, + "loss": 2.7332, + "step": 33191 + }, + { + "epoch": 2.060463095164194, + "grad_norm": 0.15635172393673077, + "learning_rate": 2.701808175483631e-05, + "loss": 2.7469, + "step": 33192 + }, + { + "epoch": 2.0605251722639517, + "grad_norm": 0.15625835038760033, + "learning_rate": 2.70148743978742e-05, + "loss": 2.6541, + "step": 33193 + }, + { + "epoch": 2.0605872493637096, + "grad_norm": 0.1386993072133031, + "learning_rate": 2.7011667160831066e-05, + "loss": 2.7133, + "step": 33194 + }, + { + "epoch": 2.0606493264634675, + "grad_norm": 0.14909976739280026, + "learning_rate": 2.700846004372365e-05, + "loss": 2.7245, + "step": 33195 + }, + { + "epoch": 2.0607114035632255, + "grad_norm": 0.14284577787483319, + "learning_rate": 2.700525304656868e-05, + "loss": 2.7672, + "step": 33196 + }, + { + "epoch": 2.0607734806629834, + "grad_norm": 0.14000632702844196, + "learning_rate": 2.700204616938286e-05, + "loss": 2.7233, + "step": 33197 + }, + { + "epoch": 2.0608355577627413, + "grad_norm": 0.14294606899184933, + "learning_rate": 2.6998839412182976e-05, + "loss": 2.7532, + "step": 33198 + }, + { + "epoch": 2.060897634862499, + "grad_norm": 0.1398832241479462, + "learning_rate": 2.6995632774985724e-05, + "loss": 2.7277, + "step": 33199 + }, + { + "epoch": 2.060959711962257, + "grad_norm": 0.16601955745751834, + "learning_rate": 2.6992426257807844e-05, + "loss": 2.6629, + "step": 33200 + }, + { + "epoch": 2.061021789062015, + "grad_norm": 0.1519610537737952, + "learning_rate": 2.698921986066604e-05, + "loss": 2.8042, + "step": 33201 + }, + { + "epoch": 2.061083866161773, + "grad_norm": 0.17149430617682945, + "learning_rate": 2.698601358357708e-05, + "loss": 2.7759, + "step": 33202 + }, + { + "epoch": 2.061145943261531, + "grad_norm": 0.13802687988591955, + "learning_rate": 2.6982807426557675e-05, + "loss": 2.7482, + "step": 33203 + }, + { + "epoch": 2.061208020361289, + "grad_norm": 0.14169480772402282, + "learning_rate": 2.6979601389624552e-05, + "loss": 2.7166, + "step": 33204 + }, + { + "epoch": 2.0612700974610467, + "grad_norm": 0.1399225931772175, + "learning_rate": 2.6976395472794426e-05, + "loss": 2.6503, + "step": 33205 + }, + { + "epoch": 2.0613321745608046, + "grad_norm": 0.14835077186497136, + "learning_rate": 2.6973189676084044e-05, + "loss": 2.787, + "step": 33206 + }, + { + "epoch": 2.0613942516605626, + "grad_norm": 0.13945511461222382, + "learning_rate": 2.6969983999510117e-05, + "loss": 2.6933, + "step": 33207 + }, + { + "epoch": 2.0614563287603205, + "grad_norm": 0.15249856394146327, + "learning_rate": 2.696677844308938e-05, + "loss": 2.7453, + "step": 33208 + }, + { + "epoch": 2.0615184058600784, + "grad_norm": 0.1473545486461514, + "learning_rate": 2.6963573006838554e-05, + "loss": 2.6977, + "step": 33209 + }, + { + "epoch": 2.0615804829598363, + "grad_norm": 0.14554624242025724, + "learning_rate": 2.6960367690774335e-05, + "loss": 2.6761, + "step": 33210 + }, + { + "epoch": 2.0616425600595942, + "grad_norm": 0.14098440046499386, + "learning_rate": 2.6957162494913497e-05, + "loss": 2.7234, + "step": 33211 + }, + { + "epoch": 2.0617046371593517, + "grad_norm": 0.14360696787804364, + "learning_rate": 2.695395741927273e-05, + "loss": 2.5905, + "step": 33212 + }, + { + "epoch": 2.0617667142591096, + "grad_norm": 0.1516766430502353, + "learning_rate": 2.6950752463868745e-05, + "loss": 2.7035, + "step": 33213 + }, + { + "epoch": 2.0618287913588675, + "grad_norm": 0.14968431876551092, + "learning_rate": 2.69475476287183e-05, + "loss": 2.7341, + "step": 33214 + }, + { + "epoch": 2.0618908684586255, + "grad_norm": 0.1711806174761325, + "learning_rate": 2.6944342913838074e-05, + "loss": 2.6527, + "step": 33215 + }, + { + "epoch": 2.0619529455583834, + "grad_norm": 0.1510464438691423, + "learning_rate": 2.694113831924483e-05, + "loss": 2.7883, + "step": 33216 + }, + { + "epoch": 2.0620150226581413, + "grad_norm": 0.14549787858681495, + "learning_rate": 2.693793384495526e-05, + "loss": 2.7213, + "step": 33217 + }, + { + "epoch": 2.062077099757899, + "grad_norm": 0.1648187464693762, + "learning_rate": 2.6934729490986088e-05, + "loss": 2.7374, + "step": 33218 + }, + { + "epoch": 2.062139176857657, + "grad_norm": 0.14163690980856128, + "learning_rate": 2.6931525257354017e-05, + "loss": 2.8108, + "step": 33219 + }, + { + "epoch": 2.062201253957415, + "grad_norm": 0.14561076010716476, + "learning_rate": 2.6928321144075797e-05, + "loss": 2.6452, + "step": 33220 + }, + { + "epoch": 2.062263331057173, + "grad_norm": 0.15866183665689967, + "learning_rate": 2.692511715116812e-05, + "loss": 2.6001, + "step": 33221 + }, + { + "epoch": 2.062325408156931, + "grad_norm": 0.14765942711414354, + "learning_rate": 2.692191327864772e-05, + "loss": 2.8459, + "step": 33222 + }, + { + "epoch": 2.062387485256689, + "grad_norm": 0.14496670669592537, + "learning_rate": 2.6918709526531294e-05, + "loss": 2.7005, + "step": 33223 + }, + { + "epoch": 2.0624495623564467, + "grad_norm": 0.15984560333363315, + "learning_rate": 2.6915505894835547e-05, + "loss": 2.8299, + "step": 33224 + }, + { + "epoch": 2.0625116394562046, + "grad_norm": 0.16579645499702098, + "learning_rate": 2.6912302383577226e-05, + "loss": 2.6484, + "step": 33225 + }, + { + "epoch": 2.0625737165559626, + "grad_norm": 0.14963454141924043, + "learning_rate": 2.690909899277303e-05, + "loss": 2.8239, + "step": 33226 + }, + { + "epoch": 2.0626357936557205, + "grad_norm": 0.1517299059391497, + "learning_rate": 2.6905895722439668e-05, + "loss": 2.6552, + "step": 33227 + }, + { + "epoch": 2.0626978707554784, + "grad_norm": 0.1562922301444555, + "learning_rate": 2.6902692572593836e-05, + "loss": 2.7362, + "step": 33228 + }, + { + "epoch": 2.0627599478552363, + "grad_norm": 0.13807546610968519, + "learning_rate": 2.6899489543252287e-05, + "loss": 2.6634, + "step": 33229 + }, + { + "epoch": 2.0628220249549942, + "grad_norm": 0.16009827471654703, + "learning_rate": 2.68962866344317e-05, + "loss": 2.6792, + "step": 33230 + }, + { + "epoch": 2.062884102054752, + "grad_norm": 0.14047495466538087, + "learning_rate": 2.6893083846148803e-05, + "loss": 2.8099, + "step": 33231 + }, + { + "epoch": 2.06294617915451, + "grad_norm": 0.17067687249561084, + "learning_rate": 2.6889881178420273e-05, + "loss": 2.8105, + "step": 33232 + }, + { + "epoch": 2.063008256254268, + "grad_norm": 0.15030504863494773, + "learning_rate": 2.6886678631262862e-05, + "loss": 2.7924, + "step": 33233 + }, + { + "epoch": 2.063070333354026, + "grad_norm": 0.14089478337028624, + "learning_rate": 2.6883476204693266e-05, + "loss": 2.6913, + "step": 33234 + }, + { + "epoch": 2.0631324104537834, + "grad_norm": 0.13864222733470102, + "learning_rate": 2.688027389872818e-05, + "loss": 2.7823, + "step": 33235 + }, + { + "epoch": 2.0631944875535413, + "grad_norm": 0.14270955390361797, + "learning_rate": 2.6877071713384316e-05, + "loss": 2.8132, + "step": 33236 + }, + { + "epoch": 2.063256564653299, + "grad_norm": 0.16267329109329598, + "learning_rate": 2.687386964867837e-05, + "loss": 2.7198, + "step": 33237 + }, + { + "epoch": 2.063318641753057, + "grad_norm": 0.15259711886157157, + "learning_rate": 2.6870667704627074e-05, + "loss": 2.7341, + "step": 33238 + }, + { + "epoch": 2.063380718852815, + "grad_norm": 0.1595766407098171, + "learning_rate": 2.6867465881247123e-05, + "loss": 2.8291, + "step": 33239 + }, + { + "epoch": 2.063442795952573, + "grad_norm": 0.15510528384289476, + "learning_rate": 2.6864264178555214e-05, + "loss": 2.6869, + "step": 33240 + }, + { + "epoch": 2.063504873052331, + "grad_norm": 0.14388633228807782, + "learning_rate": 2.6861062596568038e-05, + "loss": 2.7631, + "step": 33241 + }, + { + "epoch": 2.063566950152089, + "grad_norm": 0.1681148647600111, + "learning_rate": 2.685786113530233e-05, + "loss": 2.7575, + "step": 33242 + }, + { + "epoch": 2.0636290272518467, + "grad_norm": 0.16196434013447197, + "learning_rate": 2.6854659794774782e-05, + "loss": 2.6763, + "step": 33243 + }, + { + "epoch": 2.0636911043516046, + "grad_norm": 0.1476160080381356, + "learning_rate": 2.6851458575002096e-05, + "loss": 2.7684, + "step": 33244 + }, + { + "epoch": 2.0637531814513626, + "grad_norm": 0.1549461862857485, + "learning_rate": 2.684825747600095e-05, + "loss": 2.766, + "step": 33245 + }, + { + "epoch": 2.0638152585511205, + "grad_norm": 0.15478388668466245, + "learning_rate": 2.6845056497788084e-05, + "loss": 2.7789, + "step": 33246 + }, + { + "epoch": 2.0638773356508784, + "grad_norm": 0.14237544272230876, + "learning_rate": 2.684185564038016e-05, + "loss": 2.6799, + "step": 33247 + }, + { + "epoch": 2.0639394127506363, + "grad_norm": 0.15328336379822838, + "learning_rate": 2.6838654903793914e-05, + "loss": 2.7894, + "step": 33248 + }, + { + "epoch": 2.064001489850394, + "grad_norm": 0.14112120741393133, + "learning_rate": 2.683545428804603e-05, + "loss": 2.6868, + "step": 33249 + }, + { + "epoch": 2.064063566950152, + "grad_norm": 0.14995506303931416, + "learning_rate": 2.6832253793153207e-05, + "loss": 2.7047, + "step": 33250 + }, + { + "epoch": 2.06412564404991, + "grad_norm": 0.19683352442233343, + "learning_rate": 2.682905341913211e-05, + "loss": 2.7097, + "step": 33251 + }, + { + "epoch": 2.064187721149668, + "grad_norm": 0.15489802772859113, + "learning_rate": 2.6825853165999492e-05, + "loss": 2.6906, + "step": 33252 + }, + { + "epoch": 2.064249798249426, + "grad_norm": 0.13776151269670214, + "learning_rate": 2.6822653033772028e-05, + "loss": 2.6556, + "step": 33253 + }, + { + "epoch": 2.064311875349184, + "grad_norm": 0.1410208732811842, + "learning_rate": 2.6819453022466405e-05, + "loss": 2.6799, + "step": 33254 + }, + { + "epoch": 2.0643739524489417, + "grad_norm": 0.1410183967043196, + "learning_rate": 2.6816253132099305e-05, + "loss": 2.7587, + "step": 33255 + }, + { + "epoch": 2.0644360295486996, + "grad_norm": 0.13939753581151323, + "learning_rate": 2.6813053362687464e-05, + "loss": 2.7306, + "step": 33256 + }, + { + "epoch": 2.0644981066484576, + "grad_norm": 0.15911404596062137, + "learning_rate": 2.6809853714247546e-05, + "loss": 2.7858, + "step": 33257 + }, + { + "epoch": 2.0645601837482155, + "grad_norm": 0.17620750244621552, + "learning_rate": 2.6806654186796253e-05, + "loss": 2.7755, + "step": 33258 + }, + { + "epoch": 2.0646222608479734, + "grad_norm": 0.14426252797960024, + "learning_rate": 2.6803454780350278e-05, + "loss": 2.7883, + "step": 33259 + }, + { + "epoch": 2.064684337947731, + "grad_norm": 0.15603201681185885, + "learning_rate": 2.6800255494926286e-05, + "loss": 2.7426, + "step": 33260 + }, + { + "epoch": 2.064746415047489, + "grad_norm": 0.13896216646619217, + "learning_rate": 2.6797056330541015e-05, + "loss": 2.686, + "step": 33261 + }, + { + "epoch": 2.0648084921472467, + "grad_norm": 0.17198993351291342, + "learning_rate": 2.6793857287211132e-05, + "loss": 2.7915, + "step": 33262 + }, + { + "epoch": 2.0648705692470046, + "grad_norm": 0.1353069779698712, + "learning_rate": 2.6790658364953337e-05, + "loss": 2.7428, + "step": 33263 + }, + { + "epoch": 2.0649326463467625, + "grad_norm": 0.18602457291744132, + "learning_rate": 2.6787459563784278e-05, + "loss": 2.7741, + "step": 33264 + }, + { + "epoch": 2.0649947234465205, + "grad_norm": 0.14172224624047502, + "learning_rate": 2.6784260883720706e-05, + "loss": 2.7967, + "step": 33265 + }, + { + "epoch": 2.0650568005462784, + "grad_norm": 0.13800348466727091, + "learning_rate": 2.6781062324779282e-05, + "loss": 2.7023, + "step": 33266 + }, + { + "epoch": 2.0651188776460363, + "grad_norm": 0.13531701994432163, + "learning_rate": 2.677786388697669e-05, + "loss": 2.7598, + "step": 33267 + }, + { + "epoch": 2.065180954745794, + "grad_norm": 0.14638036214045277, + "learning_rate": 2.6774665570329598e-05, + "loss": 2.8236, + "step": 33268 + }, + { + "epoch": 2.065243031845552, + "grad_norm": 0.14655874271904296, + "learning_rate": 2.6771467374854735e-05, + "loss": 2.6552, + "step": 33269 + }, + { + "epoch": 2.06530510894531, + "grad_norm": 0.1388849783552183, + "learning_rate": 2.676826930056877e-05, + "loss": 2.6769, + "step": 33270 + }, + { + "epoch": 2.065367186045068, + "grad_norm": 0.14102546969257604, + "learning_rate": 2.6765071347488378e-05, + "loss": 2.7014, + "step": 33271 + }, + { + "epoch": 2.065429263144826, + "grad_norm": 0.1527673749079231, + "learning_rate": 2.6761873515630254e-05, + "loss": 2.7333, + "step": 33272 + }, + { + "epoch": 2.065491340244584, + "grad_norm": 0.15958017741308128, + "learning_rate": 2.6758675805011053e-05, + "loss": 2.6944, + "step": 33273 + }, + { + "epoch": 2.0655534173443417, + "grad_norm": 0.13820970106679617, + "learning_rate": 2.6755478215647502e-05, + "loss": 2.6944, + "step": 33274 + }, + { + "epoch": 2.0656154944440996, + "grad_norm": 0.1723121372671533, + "learning_rate": 2.675228074755627e-05, + "loss": 2.5898, + "step": 33275 + }, + { + "epoch": 2.0656775715438576, + "grad_norm": 0.14430301275247484, + "learning_rate": 2.6749083400754026e-05, + "loss": 2.6441, + "step": 33276 + }, + { + "epoch": 2.0657396486436155, + "grad_norm": 0.1808767252729711, + "learning_rate": 2.6745886175257438e-05, + "loss": 2.757, + "step": 33277 + }, + { + "epoch": 2.0658017257433734, + "grad_norm": 0.15592565847971576, + "learning_rate": 2.6742689071083216e-05, + "loss": 2.6819, + "step": 33278 + }, + { + "epoch": 2.0658638028431313, + "grad_norm": 0.13869030252438386, + "learning_rate": 2.673949208824804e-05, + "loss": 2.6199, + "step": 33279 + }, + { + "epoch": 2.0659258799428892, + "grad_norm": 0.15148840941653424, + "learning_rate": 2.6736295226768582e-05, + "loss": 2.7823, + "step": 33280 + }, + { + "epoch": 2.065987957042647, + "grad_norm": 0.14259262533787265, + "learning_rate": 2.6733098486661524e-05, + "loss": 2.8289, + "step": 33281 + }, + { + "epoch": 2.066050034142405, + "grad_norm": 0.1434221888870277, + "learning_rate": 2.672990186794352e-05, + "loss": 2.8581, + "step": 33282 + }, + { + "epoch": 2.0661121112421625, + "grad_norm": 0.1651776707976322, + "learning_rate": 2.672670537063129e-05, + "loss": 2.7579, + "step": 33283 + }, + { + "epoch": 2.0661741883419205, + "grad_norm": 0.14991583747892448, + "learning_rate": 2.6723508994741474e-05, + "loss": 2.768, + "step": 33284 + }, + { + "epoch": 2.0662362654416784, + "grad_norm": 0.14481231064304556, + "learning_rate": 2.6720312740290775e-05, + "loss": 2.7351, + "step": 33285 + }, + { + "epoch": 2.0662983425414363, + "grad_norm": 0.14556025513694698, + "learning_rate": 2.671711660729585e-05, + "loss": 2.695, + "step": 33286 + }, + { + "epoch": 2.066360419641194, + "grad_norm": 0.17105528037558024, + "learning_rate": 2.6713920595773363e-05, + "loss": 2.7027, + "step": 33287 + }, + { + "epoch": 2.066422496740952, + "grad_norm": 0.15379153565774403, + "learning_rate": 2.671072470574002e-05, + "loss": 2.7856, + "step": 33288 + }, + { + "epoch": 2.06648457384071, + "grad_norm": 0.15189416275872444, + "learning_rate": 2.670752893721249e-05, + "loss": 2.6774, + "step": 33289 + }, + { + "epoch": 2.066546650940468, + "grad_norm": 0.1847339994208451, + "learning_rate": 2.6704333290207428e-05, + "loss": 2.7446, + "step": 33290 + }, + { + "epoch": 2.066608728040226, + "grad_norm": 0.16017651429722016, + "learning_rate": 2.670113776474149e-05, + "loss": 2.697, + "step": 33291 + }, + { + "epoch": 2.066670805139984, + "grad_norm": 0.14856024109587868, + "learning_rate": 2.6697942360831403e-05, + "loss": 2.6903, + "step": 33292 + }, + { + "epoch": 2.0667328822397417, + "grad_norm": 0.18901492103394482, + "learning_rate": 2.6694747078493797e-05, + "loss": 2.6964, + "step": 33293 + }, + { + "epoch": 2.0667949593394996, + "grad_norm": 0.16045087951617917, + "learning_rate": 2.669155191774536e-05, + "loss": 2.7019, + "step": 33294 + }, + { + "epoch": 2.0668570364392576, + "grad_norm": 0.18380543497030627, + "learning_rate": 2.668835687860276e-05, + "loss": 2.7257, + "step": 33295 + }, + { + "epoch": 2.0669191135390155, + "grad_norm": 0.15542116275523293, + "learning_rate": 2.6685161961082633e-05, + "loss": 2.7929, + "step": 33296 + }, + { + "epoch": 2.0669811906387734, + "grad_norm": 0.14682446736652083, + "learning_rate": 2.6681967165201694e-05, + "loss": 2.7187, + "step": 33297 + }, + { + "epoch": 2.0670432677385313, + "grad_norm": 0.1403112819572034, + "learning_rate": 2.66787724909766e-05, + "loss": 2.7227, + "step": 33298 + }, + { + "epoch": 2.0671053448382892, + "grad_norm": 0.15427687068624732, + "learning_rate": 2.6675577938424007e-05, + "loss": 2.7712, + "step": 33299 + }, + { + "epoch": 2.067167421938047, + "grad_norm": 0.1419382696088875, + "learning_rate": 2.6672383507560573e-05, + "loss": 2.7646, + "step": 33300 + }, + { + "epoch": 2.067229499037805, + "grad_norm": 0.2284719148222824, + "learning_rate": 2.666918919840299e-05, + "loss": 2.7593, + "step": 33301 + }, + { + "epoch": 2.067291576137563, + "grad_norm": 0.14452108785627849, + "learning_rate": 2.666599501096791e-05, + "loss": 2.7352, + "step": 33302 + }, + { + "epoch": 2.067353653237321, + "grad_norm": 0.14430949881354274, + "learning_rate": 2.6662800945272e-05, + "loss": 2.693, + "step": 33303 + }, + { + "epoch": 2.067415730337079, + "grad_norm": 0.16569212079772616, + "learning_rate": 2.6659607001331906e-05, + "loss": 2.7623, + "step": 33304 + }, + { + "epoch": 2.0674778074368367, + "grad_norm": 0.1513222720048702, + "learning_rate": 2.6656413179164318e-05, + "loss": 2.7052, + "step": 33305 + }, + { + "epoch": 2.0675398845365947, + "grad_norm": 0.14605986436768298, + "learning_rate": 2.66532194787859e-05, + "loss": 2.7525, + "step": 33306 + }, + { + "epoch": 2.0676019616363526, + "grad_norm": 0.13850761813211926, + "learning_rate": 2.6650025900213303e-05, + "loss": 2.7023, + "step": 33307 + }, + { + "epoch": 2.06766403873611, + "grad_norm": 0.1505379079475219, + "learning_rate": 2.664683244346319e-05, + "loss": 2.6629, + "step": 33308 + }, + { + "epoch": 2.067726115835868, + "grad_norm": 0.17339509969800013, + "learning_rate": 2.66436391085522e-05, + "loss": 2.5892, + "step": 33309 + }, + { + "epoch": 2.067788192935626, + "grad_norm": 0.15872352861863118, + "learning_rate": 2.6640445895497028e-05, + "loss": 2.7615, + "step": 33310 + }, + { + "epoch": 2.067850270035384, + "grad_norm": 0.14157158764560196, + "learning_rate": 2.6637252804314306e-05, + "loss": 2.6522, + "step": 33311 + }, + { + "epoch": 2.0679123471351417, + "grad_norm": 0.1802362790149773, + "learning_rate": 2.663405983502073e-05, + "loss": 2.8228, + "step": 33312 + }, + { + "epoch": 2.0679744242348996, + "grad_norm": 0.1477674935617942, + "learning_rate": 2.663086698763293e-05, + "loss": 2.7517, + "step": 33313 + }, + { + "epoch": 2.0680365013346576, + "grad_norm": 0.170245498199293, + "learning_rate": 2.6627674262167558e-05, + "loss": 2.7427, + "step": 33314 + }, + { + "epoch": 2.0680985784344155, + "grad_norm": 0.17418427896105004, + "learning_rate": 2.6624481658641297e-05, + "loss": 2.8304, + "step": 33315 + }, + { + "epoch": 2.0681606555341734, + "grad_norm": 0.15397865944778266, + "learning_rate": 2.6621289177070786e-05, + "loss": 2.6646, + "step": 33316 + }, + { + "epoch": 2.0682227326339313, + "grad_norm": 0.19001392068813, + "learning_rate": 2.6618096817472694e-05, + "loss": 2.7541, + "step": 33317 + }, + { + "epoch": 2.0682848097336892, + "grad_norm": 0.16195249435000225, + "learning_rate": 2.661490457986364e-05, + "loss": 2.6704, + "step": 33318 + }, + { + "epoch": 2.068346886833447, + "grad_norm": 0.14936387708981913, + "learning_rate": 2.661171246426033e-05, + "loss": 2.7594, + "step": 33319 + }, + { + "epoch": 2.068408963933205, + "grad_norm": 0.15117954190351532, + "learning_rate": 2.6608520470679393e-05, + "loss": 2.7013, + "step": 33320 + }, + { + "epoch": 2.068471041032963, + "grad_norm": 0.14283793436812106, + "learning_rate": 2.660532859913748e-05, + "loss": 2.6809, + "step": 33321 + }, + { + "epoch": 2.068533118132721, + "grad_norm": 0.1479401790261897, + "learning_rate": 2.660213684965125e-05, + "loss": 2.7427, + "step": 33322 + }, + { + "epoch": 2.068595195232479, + "grad_norm": 0.150422105343029, + "learning_rate": 2.6598945222237336e-05, + "loss": 2.7028, + "step": 33323 + }, + { + "epoch": 2.0686572723322367, + "grad_norm": 0.16346819273967267, + "learning_rate": 2.659575371691242e-05, + "loss": 2.7156, + "step": 33324 + }, + { + "epoch": 2.0687193494319946, + "grad_norm": 0.14501778444207847, + "learning_rate": 2.659256233369314e-05, + "loss": 2.7913, + "step": 33325 + }, + { + "epoch": 2.0687814265317526, + "grad_norm": 0.1613650065052815, + "learning_rate": 2.658937107259615e-05, + "loss": 2.7669, + "step": 33326 + }, + { + "epoch": 2.0688435036315105, + "grad_norm": 0.15005222866625526, + "learning_rate": 2.6586179933638067e-05, + "loss": 2.7717, + "step": 33327 + }, + { + "epoch": 2.0689055807312684, + "grad_norm": 0.1425339813351268, + "learning_rate": 2.6582988916835584e-05, + "loss": 2.6492, + "step": 33328 + }, + { + "epoch": 2.0689676578310263, + "grad_norm": 0.15508359423777207, + "learning_rate": 2.657979802220534e-05, + "loss": 2.7226, + "step": 33329 + }, + { + "epoch": 2.0690297349307842, + "grad_norm": 0.14659833290086038, + "learning_rate": 2.6576607249763973e-05, + "loss": 2.7588, + "step": 33330 + }, + { + "epoch": 2.0690918120305417, + "grad_norm": 0.1847637011083077, + "learning_rate": 2.6573416599528134e-05, + "loss": 2.6692, + "step": 33331 + }, + { + "epoch": 2.0691538891302996, + "grad_norm": 0.14151596244383444, + "learning_rate": 2.6570226071514442e-05, + "loss": 2.7147, + "step": 33332 + }, + { + "epoch": 2.0692159662300575, + "grad_norm": 0.15579317785682836, + "learning_rate": 2.6567035665739593e-05, + "loss": 2.7156, + "step": 33333 + }, + { + "epoch": 2.0692780433298155, + "grad_norm": 0.20990634474610925, + "learning_rate": 2.6563845382220203e-05, + "loss": 2.7404, + "step": 33334 + }, + { + "epoch": 2.0693401204295734, + "grad_norm": 0.14913368721517153, + "learning_rate": 2.6560655220972923e-05, + "loss": 2.7227, + "step": 33335 + }, + { + "epoch": 2.0694021975293313, + "grad_norm": 0.18411183811570955, + "learning_rate": 2.655746518201438e-05, + "loss": 2.6941, + "step": 33336 + }, + { + "epoch": 2.069464274629089, + "grad_norm": 0.1445702680553171, + "learning_rate": 2.655427526536125e-05, + "loss": 2.676, + "step": 33337 + }, + { + "epoch": 2.069526351728847, + "grad_norm": 0.1460662724159172, + "learning_rate": 2.6551085471030153e-05, + "loss": 2.6829, + "step": 33338 + }, + { + "epoch": 2.069588428828605, + "grad_norm": 0.15384062176223798, + "learning_rate": 2.6547895799037736e-05, + "loss": 2.7259, + "step": 33339 + }, + { + "epoch": 2.069650505928363, + "grad_norm": 0.14686675073913927, + "learning_rate": 2.6544706249400618e-05, + "loss": 2.7136, + "step": 33340 + }, + { + "epoch": 2.069712583028121, + "grad_norm": 0.16786581589245, + "learning_rate": 2.654151682213549e-05, + "loss": 2.6861, + "step": 33341 + }, + { + "epoch": 2.069774660127879, + "grad_norm": 0.15001934219862958, + "learning_rate": 2.653832751725896e-05, + "loss": 2.7878, + "step": 33342 + }, + { + "epoch": 2.0698367372276367, + "grad_norm": 0.1489586840694362, + "learning_rate": 2.653513833478767e-05, + "loss": 2.7105, + "step": 33343 + }, + { + "epoch": 2.0698988143273946, + "grad_norm": 0.2568509202655512, + "learning_rate": 2.6531949274738236e-05, + "loss": 2.7754, + "step": 33344 + }, + { + "epoch": 2.0699608914271526, + "grad_norm": 0.21487561769823874, + "learning_rate": 2.6528760337127346e-05, + "loss": 2.6959, + "step": 33345 + }, + { + "epoch": 2.0700229685269105, + "grad_norm": 0.14531937500882147, + "learning_rate": 2.652557152197159e-05, + "loss": 2.6431, + "step": 33346 + }, + { + "epoch": 2.0700850456266684, + "grad_norm": 0.16024004093421335, + "learning_rate": 2.652238282928764e-05, + "loss": 2.7274, + "step": 33347 + }, + { + "epoch": 2.0701471227264263, + "grad_norm": 0.17845295697154123, + "learning_rate": 2.6519194259092127e-05, + "loss": 2.7868, + "step": 33348 + }, + { + "epoch": 2.0702091998261842, + "grad_norm": 0.1872764586019233, + "learning_rate": 2.6516005811401668e-05, + "loss": 2.8274, + "step": 33349 + }, + { + "epoch": 2.070271276925942, + "grad_norm": 0.14817982192321752, + "learning_rate": 2.6512817486232898e-05, + "loss": 2.7759, + "step": 33350 + }, + { + "epoch": 2.0703333540257, + "grad_norm": 0.14533815798298422, + "learning_rate": 2.6509629283602477e-05, + "loss": 2.7071, + "step": 33351 + }, + { + "epoch": 2.070395431125458, + "grad_norm": 0.17210931952844002, + "learning_rate": 2.650644120352702e-05, + "loss": 2.7549, + "step": 33352 + }, + { + "epoch": 2.070457508225216, + "grad_norm": 0.1472346040448036, + "learning_rate": 2.6503253246023165e-05, + "loss": 2.7203, + "step": 33353 + }, + { + "epoch": 2.070519585324974, + "grad_norm": 0.16018471232320514, + "learning_rate": 2.6500065411107517e-05, + "loss": 2.8325, + "step": 33354 + }, + { + "epoch": 2.0705816624247317, + "grad_norm": 0.18660759150887227, + "learning_rate": 2.6496877698796758e-05, + "loss": 2.6703, + "step": 33355 + }, + { + "epoch": 2.070643739524489, + "grad_norm": 0.1453230664298013, + "learning_rate": 2.649369010910749e-05, + "loss": 2.711, + "step": 33356 + }, + { + "epoch": 2.070705816624247, + "grad_norm": 0.1553214077054902, + "learning_rate": 2.6490502642056346e-05, + "loss": 2.7247, + "step": 33357 + }, + { + "epoch": 2.070767893724005, + "grad_norm": 0.1475582685788887, + "learning_rate": 2.6487315297659965e-05, + "loss": 2.7537, + "step": 33358 + }, + { + "epoch": 2.070829970823763, + "grad_norm": 0.14778360848612904, + "learning_rate": 2.6484128075934938e-05, + "loss": 2.7224, + "step": 33359 + }, + { + "epoch": 2.070892047923521, + "grad_norm": 0.1456611708407483, + "learning_rate": 2.648094097689795e-05, + "loss": 2.7962, + "step": 33360 + }, + { + "epoch": 2.070954125023279, + "grad_norm": 0.14838749269077828, + "learning_rate": 2.64777540005656e-05, + "loss": 2.7374, + "step": 33361 + }, + { + "epoch": 2.0710162021230367, + "grad_norm": 0.15540993287167634, + "learning_rate": 2.647456714695452e-05, + "loss": 2.7011, + "step": 33362 + }, + { + "epoch": 2.0710782792227946, + "grad_norm": 0.1469931223867725, + "learning_rate": 2.647138041608131e-05, + "loss": 2.7693, + "step": 33363 + }, + { + "epoch": 2.0711403563225526, + "grad_norm": 0.14981416973159073, + "learning_rate": 2.6468193807962644e-05, + "loss": 2.7599, + "step": 33364 + }, + { + "epoch": 2.0712024334223105, + "grad_norm": 0.21837772469712263, + "learning_rate": 2.646500732261512e-05, + "loss": 2.7671, + "step": 33365 + }, + { + "epoch": 2.0712645105220684, + "grad_norm": 0.14479299634441847, + "learning_rate": 2.646182096005537e-05, + "loss": 2.7667, + "step": 33366 + }, + { + "epoch": 2.0713265876218263, + "grad_norm": 0.14613976952237046, + "learning_rate": 2.645863472029999e-05, + "loss": 2.7238, + "step": 33367 + }, + { + "epoch": 2.0713886647215842, + "grad_norm": 0.15263010890558043, + "learning_rate": 2.645544860336565e-05, + "loss": 2.668, + "step": 33368 + }, + { + "epoch": 2.071450741821342, + "grad_norm": 0.15194362277252005, + "learning_rate": 2.6452262609268953e-05, + "loss": 2.8017, + "step": 33369 + }, + { + "epoch": 2.0715128189211, + "grad_norm": 0.16226199445855116, + "learning_rate": 2.6449076738026514e-05, + "loss": 2.7822, + "step": 33370 + }, + { + "epoch": 2.071574896020858, + "grad_norm": 0.14321770333164213, + "learning_rate": 2.644589098965496e-05, + "loss": 2.7287, + "step": 33371 + }, + { + "epoch": 2.071636973120616, + "grad_norm": 0.14722140860095315, + "learning_rate": 2.644270536417089e-05, + "loss": 2.7147, + "step": 33372 + }, + { + "epoch": 2.071699050220374, + "grad_norm": 0.14395265495797435, + "learning_rate": 2.6439519861590965e-05, + "loss": 2.7323, + "step": 33373 + }, + { + "epoch": 2.0717611273201317, + "grad_norm": 0.16294229200323282, + "learning_rate": 2.6436334481931785e-05, + "loss": 2.7553, + "step": 33374 + }, + { + "epoch": 2.0718232044198897, + "grad_norm": 0.16063764377659098, + "learning_rate": 2.6433149225209967e-05, + "loss": 2.6971, + "step": 33375 + }, + { + "epoch": 2.0718852815196476, + "grad_norm": 0.1702971479951137, + "learning_rate": 2.6429964091442117e-05, + "loss": 2.7464, + "step": 33376 + }, + { + "epoch": 2.0719473586194055, + "grad_norm": 0.1436356945231113, + "learning_rate": 2.642677908064486e-05, + "loss": 2.712, + "step": 33377 + }, + { + "epoch": 2.0720094357191634, + "grad_norm": 0.14310983903411692, + "learning_rate": 2.642359419283484e-05, + "loss": 2.687, + "step": 33378 + }, + { + "epoch": 2.072071512818921, + "grad_norm": 0.14624164179409677, + "learning_rate": 2.642040942802865e-05, + "loss": 2.697, + "step": 33379 + }, + { + "epoch": 2.072133589918679, + "grad_norm": 0.1702129731592551, + "learning_rate": 2.641722478624291e-05, + "loss": 2.7473, + "step": 33380 + }, + { + "epoch": 2.0721956670184367, + "grad_norm": 0.1443704728969055, + "learning_rate": 2.6414040267494232e-05, + "loss": 2.7461, + "step": 33381 + }, + { + "epoch": 2.0722577441181946, + "grad_norm": 0.14521804862471774, + "learning_rate": 2.6410855871799217e-05, + "loss": 2.7287, + "step": 33382 + }, + { + "epoch": 2.0723198212179526, + "grad_norm": 0.1491776259909902, + "learning_rate": 2.640767159917451e-05, + "loss": 2.7583, + "step": 33383 + }, + { + "epoch": 2.0723818983177105, + "grad_norm": 0.1505928104776624, + "learning_rate": 2.6404487449636705e-05, + "loss": 2.7129, + "step": 33384 + }, + { + "epoch": 2.0724439754174684, + "grad_norm": 0.1790531196933384, + "learning_rate": 2.6401303423202423e-05, + "loss": 2.6745, + "step": 33385 + }, + { + "epoch": 2.0725060525172263, + "grad_norm": 0.14456240215166263, + "learning_rate": 2.6398119519888243e-05, + "loss": 2.8136, + "step": 33386 + }, + { + "epoch": 2.0725681296169842, + "grad_norm": 0.15664956145824838, + "learning_rate": 2.6394935739710823e-05, + "loss": 2.711, + "step": 33387 + }, + { + "epoch": 2.072630206716742, + "grad_norm": 0.14854133262057237, + "learning_rate": 2.639175208268676e-05, + "loss": 2.8011, + "step": 33388 + }, + { + "epoch": 2.0726922838165, + "grad_norm": 0.14951884889949, + "learning_rate": 2.638856854883266e-05, + "loss": 2.7312, + "step": 33389 + }, + { + "epoch": 2.072754360916258, + "grad_norm": 0.1405235427861989, + "learning_rate": 2.6385385138165096e-05, + "loss": 2.6815, + "step": 33390 + }, + { + "epoch": 2.072816438016016, + "grad_norm": 0.13449761789944076, + "learning_rate": 2.6382201850700745e-05, + "loss": 2.648, + "step": 33391 + }, + { + "epoch": 2.072878515115774, + "grad_norm": 0.1972086229210049, + "learning_rate": 2.637901868645617e-05, + "loss": 2.7514, + "step": 33392 + }, + { + "epoch": 2.0729405922155317, + "grad_norm": 0.1448171725780846, + "learning_rate": 2.6375835645447994e-05, + "loss": 2.8322, + "step": 33393 + }, + { + "epoch": 2.0730026693152896, + "grad_norm": 0.1416248433936281, + "learning_rate": 2.6372652727692816e-05, + "loss": 2.7592, + "step": 33394 + }, + { + "epoch": 2.0730647464150476, + "grad_norm": 0.145422372759554, + "learning_rate": 2.6369469933207224e-05, + "loss": 2.6811, + "step": 33395 + }, + { + "epoch": 2.0731268235148055, + "grad_norm": 0.14486893949224314, + "learning_rate": 2.6366287262007873e-05, + "loss": 2.7833, + "step": 33396 + }, + { + "epoch": 2.0731889006145634, + "grad_norm": 0.13718959249538612, + "learning_rate": 2.6363104714111326e-05, + "loss": 2.6273, + "step": 33397 + }, + { + "epoch": 2.0732509777143213, + "grad_norm": 0.13936029063639435, + "learning_rate": 2.6359922289534206e-05, + "loss": 2.6988, + "step": 33398 + }, + { + "epoch": 2.0733130548140792, + "grad_norm": 0.16296159264052296, + "learning_rate": 2.6356739988293087e-05, + "loss": 2.6895, + "step": 33399 + }, + { + "epoch": 2.073375131913837, + "grad_norm": 0.14508873800551608, + "learning_rate": 2.6353557810404617e-05, + "loss": 2.7077, + "step": 33400 + }, + { + "epoch": 2.073437209013595, + "grad_norm": 0.1517667357817376, + "learning_rate": 2.635037575588538e-05, + "loss": 2.806, + "step": 33401 + }, + { + "epoch": 2.073499286113353, + "grad_norm": 0.155465487105668, + "learning_rate": 2.634719382475197e-05, + "loss": 2.6427, + "step": 33402 + }, + { + "epoch": 2.073561363213111, + "grad_norm": 0.14769744222780784, + "learning_rate": 2.6344012017020968e-05, + "loss": 2.7588, + "step": 33403 + }, + { + "epoch": 2.0736234403128684, + "grad_norm": 0.14600019544880666, + "learning_rate": 2.634083033270902e-05, + "loss": 2.6657, + "step": 33404 + }, + { + "epoch": 2.0736855174126263, + "grad_norm": 0.16953336213373896, + "learning_rate": 2.6337648771832713e-05, + "loss": 2.7699, + "step": 33405 + }, + { + "epoch": 2.073747594512384, + "grad_norm": 0.1451426538889653, + "learning_rate": 2.6334467334408624e-05, + "loss": 2.7642, + "step": 33406 + }, + { + "epoch": 2.073809671612142, + "grad_norm": 0.1490724997795058, + "learning_rate": 2.633128602045337e-05, + "loss": 2.7374, + "step": 33407 + }, + { + "epoch": 2.0738717487119, + "grad_norm": 0.1486876976646049, + "learning_rate": 2.6328104829983523e-05, + "loss": 2.7192, + "step": 33408 + }, + { + "epoch": 2.073933825811658, + "grad_norm": 0.14270385659173465, + "learning_rate": 2.6324923763015697e-05, + "loss": 2.8034, + "step": 33409 + }, + { + "epoch": 2.073995902911416, + "grad_norm": 0.16116405530960337, + "learning_rate": 2.6321742819566515e-05, + "loss": 2.7898, + "step": 33410 + }, + { + "epoch": 2.074057980011174, + "grad_norm": 0.16426231500575858, + "learning_rate": 2.6318561999652546e-05, + "loss": 2.709, + "step": 33411 + }, + { + "epoch": 2.0741200571109317, + "grad_norm": 0.1603674252338685, + "learning_rate": 2.6315381303290386e-05, + "loss": 2.7089, + "step": 33412 + }, + { + "epoch": 2.0741821342106896, + "grad_norm": 0.15717634737861014, + "learning_rate": 2.6312200730496618e-05, + "loss": 2.7779, + "step": 33413 + }, + { + "epoch": 2.0742442113104476, + "grad_norm": 0.13648743134184277, + "learning_rate": 2.630902028128786e-05, + "loss": 2.7651, + "step": 33414 + }, + { + "epoch": 2.0743062884102055, + "grad_norm": 0.1451284532550965, + "learning_rate": 2.63058399556807e-05, + "loss": 2.7066, + "step": 33415 + }, + { + "epoch": 2.0743683655099634, + "grad_norm": 0.14180826687830955, + "learning_rate": 2.630265975369172e-05, + "loss": 2.724, + "step": 33416 + }, + { + "epoch": 2.0744304426097213, + "grad_norm": 0.1656727897325933, + "learning_rate": 2.6299479675337523e-05, + "loss": 2.6984, + "step": 33417 + }, + { + "epoch": 2.0744925197094792, + "grad_norm": 0.14181642345054826, + "learning_rate": 2.6296299720634678e-05, + "loss": 2.705, + "step": 33418 + }, + { + "epoch": 2.074554596809237, + "grad_norm": 0.14429037539017883, + "learning_rate": 2.6293119889599798e-05, + "loss": 2.6988, + "step": 33419 + }, + { + "epoch": 2.074616673908995, + "grad_norm": 0.17485038863184693, + "learning_rate": 2.6289940182249472e-05, + "loss": 2.7285, + "step": 33420 + }, + { + "epoch": 2.074678751008753, + "grad_norm": 0.1536267613573033, + "learning_rate": 2.6286760598600292e-05, + "loss": 2.7493, + "step": 33421 + }, + { + "epoch": 2.074740828108511, + "grad_norm": 0.14576162053799685, + "learning_rate": 2.6283581138668812e-05, + "loss": 2.7023, + "step": 33422 + }, + { + "epoch": 2.074802905208269, + "grad_norm": 0.18147853058605837, + "learning_rate": 2.6280401802471665e-05, + "loss": 2.7652, + "step": 33423 + }, + { + "epoch": 2.0748649823080267, + "grad_norm": 0.14336155206342122, + "learning_rate": 2.6277222590025418e-05, + "loss": 2.6667, + "step": 33424 + }, + { + "epoch": 2.0749270594077847, + "grad_norm": 0.163547274519714, + "learning_rate": 2.627404350134667e-05, + "loss": 2.7314, + "step": 33425 + }, + { + "epoch": 2.0749891365075426, + "grad_norm": 0.16015190607088017, + "learning_rate": 2.6270864536451967e-05, + "loss": 2.6764, + "step": 33426 + }, + { + "epoch": 2.0750512136073, + "grad_norm": 0.14742900174153115, + "learning_rate": 2.6267685695357947e-05, + "loss": 2.7821, + "step": 33427 + }, + { + "epoch": 2.075113290707058, + "grad_norm": 0.21021941487227164, + "learning_rate": 2.626450697808117e-05, + "loss": 2.7117, + "step": 33428 + }, + { + "epoch": 2.075175367806816, + "grad_norm": 0.15247559955103457, + "learning_rate": 2.626132838463823e-05, + "loss": 2.6662, + "step": 33429 + }, + { + "epoch": 2.075237444906574, + "grad_norm": 0.1503799670022201, + "learning_rate": 2.6258149915045694e-05, + "loss": 2.7251, + "step": 33430 + }, + { + "epoch": 2.0752995220063317, + "grad_norm": 0.14655898419374466, + "learning_rate": 2.625497156932014e-05, + "loss": 2.6993, + "step": 33431 + }, + { + "epoch": 2.0753615991060896, + "grad_norm": 0.15585855684567593, + "learning_rate": 2.6251793347478183e-05, + "loss": 2.7154, + "step": 33432 + }, + { + "epoch": 2.0754236762058476, + "grad_norm": 0.1487379484243291, + "learning_rate": 2.624861524953638e-05, + "loss": 2.7878, + "step": 33433 + }, + { + "epoch": 2.0754857533056055, + "grad_norm": 0.1461886026627085, + "learning_rate": 2.624543727551132e-05, + "loss": 2.8265, + "step": 33434 + }, + { + "epoch": 2.0755478304053634, + "grad_norm": 0.15577664797039295, + "learning_rate": 2.624225942541956e-05, + "loss": 2.6894, + "step": 33435 + }, + { + "epoch": 2.0756099075051213, + "grad_norm": 0.14654868303780622, + "learning_rate": 2.623908169927771e-05, + "loss": 2.6923, + "step": 33436 + }, + { + "epoch": 2.0756719846048792, + "grad_norm": 0.21116142396683768, + "learning_rate": 2.6235904097102354e-05, + "loss": 2.7085, + "step": 33437 + }, + { + "epoch": 2.075734061704637, + "grad_norm": 0.1372538992227145, + "learning_rate": 2.6232726618910052e-05, + "loss": 2.7859, + "step": 33438 + }, + { + "epoch": 2.075796138804395, + "grad_norm": 0.18061106136355495, + "learning_rate": 2.6229549264717363e-05, + "loss": 2.7143, + "step": 33439 + }, + { + "epoch": 2.075858215904153, + "grad_norm": 0.14685899515404208, + "learning_rate": 2.6226372034540903e-05, + "loss": 2.7759, + "step": 33440 + }, + { + "epoch": 2.075920293003911, + "grad_norm": 0.2010887955897795, + "learning_rate": 2.622319492839723e-05, + "loss": 2.7668, + "step": 33441 + }, + { + "epoch": 2.075982370103669, + "grad_norm": 0.18139422532082977, + "learning_rate": 2.622001794630291e-05, + "loss": 2.7299, + "step": 33442 + }, + { + "epoch": 2.0760444472034267, + "grad_norm": 0.1634541711546772, + "learning_rate": 2.6216841088274547e-05, + "loss": 2.771, + "step": 33443 + }, + { + "epoch": 2.0761065243031847, + "grad_norm": 0.1651843069527841, + "learning_rate": 2.6213664354328692e-05, + "loss": 2.7839, + "step": 33444 + }, + { + "epoch": 2.0761686014029426, + "grad_norm": 0.16468221695766755, + "learning_rate": 2.621048774448191e-05, + "loss": 2.7091, + "step": 33445 + }, + { + "epoch": 2.0762306785027005, + "grad_norm": 0.15209401274988174, + "learning_rate": 2.6207311258750804e-05, + "loss": 2.7334, + "step": 33446 + }, + { + "epoch": 2.0762927556024584, + "grad_norm": 0.1476666171859679, + "learning_rate": 2.6204134897151933e-05, + "loss": 2.6703, + "step": 33447 + }, + { + "epoch": 2.0763548327022163, + "grad_norm": 0.21027309367381897, + "learning_rate": 2.6200958659701867e-05, + "loss": 2.688, + "step": 33448 + }, + { + "epoch": 2.0764169098019742, + "grad_norm": 0.15183947316403937, + "learning_rate": 2.619778254641716e-05, + "loss": 2.755, + "step": 33449 + }, + { + "epoch": 2.076478986901732, + "grad_norm": 0.14290622297554087, + "learning_rate": 2.6194606557314416e-05, + "loss": 2.7199, + "step": 33450 + }, + { + "epoch": 2.07654106400149, + "grad_norm": 0.15736742994964653, + "learning_rate": 2.619143069241019e-05, + "loss": 2.7119, + "step": 33451 + }, + { + "epoch": 2.0766031411012476, + "grad_norm": 0.15233129059142236, + "learning_rate": 2.6188254951721048e-05, + "loss": 2.7414, + "step": 33452 + }, + { + "epoch": 2.0766652182010055, + "grad_norm": 0.17695608662870005, + "learning_rate": 2.6185079335263542e-05, + "loss": 2.779, + "step": 33453 + }, + { + "epoch": 2.0767272953007634, + "grad_norm": 0.17214611734861346, + "learning_rate": 2.618190384305428e-05, + "loss": 2.7602, + "step": 33454 + }, + { + "epoch": 2.0767893724005213, + "grad_norm": 0.14519559402992718, + "learning_rate": 2.6178728475109803e-05, + "loss": 2.6766, + "step": 33455 + }, + { + "epoch": 2.0768514495002792, + "grad_norm": 0.14938997560712458, + "learning_rate": 2.617555323144668e-05, + "loss": 2.7645, + "step": 33456 + }, + { + "epoch": 2.076913526600037, + "grad_norm": 0.1684831294976236, + "learning_rate": 2.6172378112081487e-05, + "loss": 2.7847, + "step": 33457 + }, + { + "epoch": 2.076975603699795, + "grad_norm": 0.1509709494116077, + "learning_rate": 2.616920311703076e-05, + "loss": 2.7556, + "step": 33458 + }, + { + "epoch": 2.077037680799553, + "grad_norm": 0.1521980860825405, + "learning_rate": 2.6166028246311103e-05, + "loss": 2.6775, + "step": 33459 + }, + { + "epoch": 2.077099757899311, + "grad_norm": 0.1496825828421545, + "learning_rate": 2.6162853499939067e-05, + "loss": 2.7165, + "step": 33460 + }, + { + "epoch": 2.077161834999069, + "grad_norm": 0.15761503953190986, + "learning_rate": 2.6159678877931204e-05, + "loss": 2.737, + "step": 33461 + }, + { + "epoch": 2.0772239120988267, + "grad_norm": 0.15443086238909548, + "learning_rate": 2.615650438030407e-05, + "loss": 2.6785, + "step": 33462 + }, + { + "epoch": 2.0772859891985846, + "grad_norm": 0.15505172687378915, + "learning_rate": 2.615333000707426e-05, + "loss": 2.8336, + "step": 33463 + }, + { + "epoch": 2.0773480662983426, + "grad_norm": 0.16887315962372415, + "learning_rate": 2.615015575825831e-05, + "loss": 2.6899, + "step": 33464 + }, + { + "epoch": 2.0774101433981005, + "grad_norm": 0.1792104234232822, + "learning_rate": 2.614698163387279e-05, + "loss": 2.7198, + "step": 33465 + }, + { + "epoch": 2.0774722204978584, + "grad_norm": 0.1646829464860176, + "learning_rate": 2.6143807633934265e-05, + "loss": 2.7438, + "step": 33466 + }, + { + "epoch": 2.0775342975976163, + "grad_norm": 0.15149467390954233, + "learning_rate": 2.614063375845926e-05, + "loss": 2.6798, + "step": 33467 + }, + { + "epoch": 2.0775963746973742, + "grad_norm": 0.1510587194835375, + "learning_rate": 2.6137460007464382e-05, + "loss": 2.7591, + "step": 33468 + }, + { + "epoch": 2.077658451797132, + "grad_norm": 0.17475052745750405, + "learning_rate": 2.6134286380966166e-05, + "loss": 2.7247, + "step": 33469 + }, + { + "epoch": 2.07772052889689, + "grad_norm": 0.16631542512098205, + "learning_rate": 2.6131112878981177e-05, + "loss": 2.7152, + "step": 33470 + }, + { + "epoch": 2.077782605996648, + "grad_norm": 0.14520175800482527, + "learning_rate": 2.6127939501525946e-05, + "loss": 2.7053, + "step": 33471 + }, + { + "epoch": 2.077844683096406, + "grad_norm": 0.15704777426628722, + "learning_rate": 2.612476624861706e-05, + "loss": 2.7507, + "step": 33472 + }, + { + "epoch": 2.077906760196164, + "grad_norm": 0.18219579946023406, + "learning_rate": 2.612159312027107e-05, + "loss": 2.796, + "step": 33473 + }, + { + "epoch": 2.0779688372959217, + "grad_norm": 0.15817680257643157, + "learning_rate": 2.611842011650453e-05, + "loss": 2.8381, + "step": 33474 + }, + { + "epoch": 2.078030914395679, + "grad_norm": 0.14293838736179867, + "learning_rate": 2.611524723733397e-05, + "loss": 2.7739, + "step": 33475 + }, + { + "epoch": 2.078092991495437, + "grad_norm": 0.15583248982120465, + "learning_rate": 2.6112074482775955e-05, + "loss": 2.7577, + "step": 33476 + }, + { + "epoch": 2.078155068595195, + "grad_norm": 0.14626727865111772, + "learning_rate": 2.6108901852847072e-05, + "loss": 2.7464, + "step": 33477 + }, + { + "epoch": 2.078217145694953, + "grad_norm": 0.16156194359371806, + "learning_rate": 2.6105729347563845e-05, + "loss": 2.741, + "step": 33478 + }, + { + "epoch": 2.078279222794711, + "grad_norm": 0.1610897353418525, + "learning_rate": 2.610255696694283e-05, + "loss": 2.7238, + "step": 33479 + }, + { + "epoch": 2.078341299894469, + "grad_norm": 0.1684354464205824, + "learning_rate": 2.609938471100058e-05, + "loss": 2.6993, + "step": 33480 + }, + { + "epoch": 2.0784033769942267, + "grad_norm": 0.15773671473921239, + "learning_rate": 2.609621257975362e-05, + "loss": 2.6284, + "step": 33481 + }, + { + "epoch": 2.0784654540939846, + "grad_norm": 0.14594496274977375, + "learning_rate": 2.609304057321854e-05, + "loss": 2.6828, + "step": 33482 + }, + { + "epoch": 2.0785275311937426, + "grad_norm": 0.14099924304096134, + "learning_rate": 2.6089868691411868e-05, + "loss": 2.5835, + "step": 33483 + }, + { + "epoch": 2.0785896082935005, + "grad_norm": 0.180174974889424, + "learning_rate": 2.6086696934350163e-05, + "loss": 2.6431, + "step": 33484 + }, + { + "epoch": 2.0786516853932584, + "grad_norm": 0.1532543368437649, + "learning_rate": 2.6083525302049937e-05, + "loss": 2.6902, + "step": 33485 + }, + { + "epoch": 2.0787137624930163, + "grad_norm": 0.14396598827883683, + "learning_rate": 2.608035379452779e-05, + "loss": 2.667, + "step": 33486 + }, + { + "epoch": 2.0787758395927742, + "grad_norm": 0.1483454293259849, + "learning_rate": 2.607718241180025e-05, + "loss": 2.7293, + "step": 33487 + }, + { + "epoch": 2.078837916692532, + "grad_norm": 0.15458688763257894, + "learning_rate": 2.6074011153883848e-05, + "loss": 2.7201, + "step": 33488 + }, + { + "epoch": 2.07889999379229, + "grad_norm": 0.14668972245187772, + "learning_rate": 2.6070840020795117e-05, + "loss": 2.6694, + "step": 33489 + }, + { + "epoch": 2.078962070892048, + "grad_norm": 0.1532686960599254, + "learning_rate": 2.606766901255065e-05, + "loss": 2.7047, + "step": 33490 + }, + { + "epoch": 2.079024147991806, + "grad_norm": 0.14592240939648532, + "learning_rate": 2.606449812916696e-05, + "loss": 2.6747, + "step": 33491 + }, + { + "epoch": 2.079086225091564, + "grad_norm": 0.14833746224476274, + "learning_rate": 2.6061327370660592e-05, + "loss": 2.7001, + "step": 33492 + }, + { + "epoch": 2.0791483021913217, + "grad_norm": 0.15564557232522336, + "learning_rate": 2.605815673704809e-05, + "loss": 2.7752, + "step": 33493 + }, + { + "epoch": 2.0792103792910797, + "grad_norm": 0.1471264943368647, + "learning_rate": 2.605498622834598e-05, + "loss": 2.7667, + "step": 33494 + }, + { + "epoch": 2.0792724563908376, + "grad_norm": 0.1473807059793909, + "learning_rate": 2.6051815844570837e-05, + "loss": 2.7711, + "step": 33495 + }, + { + "epoch": 2.0793345334905955, + "grad_norm": 0.16504906166980576, + "learning_rate": 2.604864558573919e-05, + "loss": 2.7584, + "step": 33496 + }, + { + "epoch": 2.0793966105903534, + "grad_norm": 0.13867796629301274, + "learning_rate": 2.604547545186757e-05, + "loss": 2.651, + "step": 33497 + }, + { + "epoch": 2.0794586876901113, + "grad_norm": 0.14568820754279932, + "learning_rate": 2.60423054429725e-05, + "loss": 2.7963, + "step": 33498 + }, + { + "epoch": 2.079520764789869, + "grad_norm": 0.15130055714579982, + "learning_rate": 2.6039135559070553e-05, + "loss": 2.7178, + "step": 33499 + }, + { + "epoch": 2.0795828418896267, + "grad_norm": 0.14959487695391005, + "learning_rate": 2.603596580017826e-05, + "loss": 2.7161, + "step": 33500 + }, + { + "epoch": 2.0796449189893846, + "grad_norm": 0.14459222074598213, + "learning_rate": 2.6032796166312146e-05, + "loss": 2.6477, + "step": 33501 + }, + { + "epoch": 2.0797069960891426, + "grad_norm": 0.15328188736614895, + "learning_rate": 2.6029626657488753e-05, + "loss": 2.7956, + "step": 33502 + }, + { + "epoch": 2.0797690731889005, + "grad_norm": 0.14521715717743253, + "learning_rate": 2.6026457273724597e-05, + "loss": 2.7196, + "step": 33503 + }, + { + "epoch": 2.0798311502886584, + "grad_norm": 0.15844668959878802, + "learning_rate": 2.602328801503625e-05, + "loss": 2.7207, + "step": 33504 + }, + { + "epoch": 2.0798932273884163, + "grad_norm": 0.14486410501036817, + "learning_rate": 2.6020118881440236e-05, + "loss": 2.8122, + "step": 33505 + }, + { + "epoch": 2.0799553044881742, + "grad_norm": 0.1693242019493849, + "learning_rate": 2.6016949872953083e-05, + "loss": 2.7392, + "step": 33506 + }, + { + "epoch": 2.080017381587932, + "grad_norm": 0.14106005684566894, + "learning_rate": 2.6013780989591306e-05, + "loss": 2.7521, + "step": 33507 + }, + { + "epoch": 2.08007945868769, + "grad_norm": 0.14422704282250526, + "learning_rate": 2.601061223137145e-05, + "loss": 2.7174, + "step": 33508 + }, + { + "epoch": 2.080141535787448, + "grad_norm": 0.13716894686900272, + "learning_rate": 2.6007443598310076e-05, + "loss": 2.7037, + "step": 33509 + }, + { + "epoch": 2.080203612887206, + "grad_norm": 0.14101923544322173, + "learning_rate": 2.6004275090423697e-05, + "loss": 2.6898, + "step": 33510 + }, + { + "epoch": 2.080265689986964, + "grad_norm": 0.1633170369277569, + "learning_rate": 2.6001106707728835e-05, + "loss": 2.7901, + "step": 33511 + }, + { + "epoch": 2.0803277670867217, + "grad_norm": 0.14952183869221297, + "learning_rate": 2.599793845024201e-05, + "loss": 2.6712, + "step": 33512 + }, + { + "epoch": 2.0803898441864797, + "grad_norm": 0.14793912771326298, + "learning_rate": 2.5994770317979782e-05, + "loss": 2.7107, + "step": 33513 + }, + { + "epoch": 2.0804519212862376, + "grad_norm": 0.13615195086461765, + "learning_rate": 2.5991602310958662e-05, + "loss": 2.7123, + "step": 33514 + }, + { + "epoch": 2.0805139983859955, + "grad_norm": 0.1464623434664984, + "learning_rate": 2.5988434429195192e-05, + "loss": 2.7268, + "step": 33515 + }, + { + "epoch": 2.0805760754857534, + "grad_norm": 0.1518211264583809, + "learning_rate": 2.5985266672705876e-05, + "loss": 2.6859, + "step": 33516 + }, + { + "epoch": 2.0806381525855113, + "grad_norm": 0.155044717096891, + "learning_rate": 2.5982099041507247e-05, + "loss": 2.741, + "step": 33517 + }, + { + "epoch": 2.0807002296852692, + "grad_norm": 0.1489303025356685, + "learning_rate": 2.5978931535615847e-05, + "loss": 2.6528, + "step": 33518 + }, + { + "epoch": 2.080762306785027, + "grad_norm": 0.15089409020323957, + "learning_rate": 2.59757641550482e-05, + "loss": 2.7321, + "step": 33519 + }, + { + "epoch": 2.080824383884785, + "grad_norm": 0.15609691392328706, + "learning_rate": 2.5972596899820818e-05, + "loss": 2.7538, + "step": 33520 + }, + { + "epoch": 2.080886460984543, + "grad_norm": 0.14562315497641676, + "learning_rate": 2.5969429769950217e-05, + "loss": 2.6846, + "step": 33521 + }, + { + "epoch": 2.080948538084301, + "grad_norm": 0.147978959068732, + "learning_rate": 2.5966262765452943e-05, + "loss": 2.7116, + "step": 33522 + }, + { + "epoch": 2.0810106151840584, + "grad_norm": 0.1572485783960736, + "learning_rate": 2.5963095886345517e-05, + "loss": 2.7345, + "step": 33523 + }, + { + "epoch": 2.0810726922838163, + "grad_norm": 0.13998990892120308, + "learning_rate": 2.5959929132644456e-05, + "loss": 2.6268, + "step": 33524 + }, + { + "epoch": 2.0811347693835742, + "grad_norm": 0.15892128360343083, + "learning_rate": 2.595676250436626e-05, + "loss": 2.703, + "step": 33525 + }, + { + "epoch": 2.081196846483332, + "grad_norm": 0.1497133909344032, + "learning_rate": 2.5953596001527493e-05, + "loss": 2.7071, + "step": 33526 + }, + { + "epoch": 2.08125892358309, + "grad_norm": 0.1382625206154695, + "learning_rate": 2.595042962414465e-05, + "loss": 2.7168, + "step": 33527 + }, + { + "epoch": 2.081321000682848, + "grad_norm": 0.15390779838170413, + "learning_rate": 2.5947263372234253e-05, + "loss": 2.83, + "step": 33528 + }, + { + "epoch": 2.081383077782606, + "grad_norm": 0.14423638628603985, + "learning_rate": 2.5944097245812827e-05, + "loss": 2.6547, + "step": 33529 + }, + { + "epoch": 2.081445154882364, + "grad_norm": 0.137594740913214, + "learning_rate": 2.594093124489686e-05, + "loss": 2.7095, + "step": 33530 + }, + { + "epoch": 2.0815072319821217, + "grad_norm": 0.1830088245929754, + "learning_rate": 2.593776536950292e-05, + "loss": 2.8242, + "step": 33531 + }, + { + "epoch": 2.0815693090818796, + "grad_norm": 0.15004080177735127, + "learning_rate": 2.5934599619647493e-05, + "loss": 2.6735, + "step": 33532 + }, + { + "epoch": 2.0816313861816376, + "grad_norm": 0.1471268235006902, + "learning_rate": 2.593143399534711e-05, + "loss": 2.7329, + "step": 33533 + }, + { + "epoch": 2.0816934632813955, + "grad_norm": 0.14254912643260284, + "learning_rate": 2.5928268496618252e-05, + "loss": 2.7838, + "step": 33534 + }, + { + "epoch": 2.0817555403811534, + "grad_norm": 0.15188894485495605, + "learning_rate": 2.5925103123477483e-05, + "loss": 2.766, + "step": 33535 + }, + { + "epoch": 2.0818176174809113, + "grad_norm": 0.14136416825978054, + "learning_rate": 2.5921937875941292e-05, + "loss": 2.6523, + "step": 33536 + }, + { + "epoch": 2.0818796945806692, + "grad_norm": 0.15566660928018902, + "learning_rate": 2.5918772754026205e-05, + "loss": 2.8017, + "step": 33537 + }, + { + "epoch": 2.081941771680427, + "grad_norm": 0.14379652892575437, + "learning_rate": 2.59156077577487e-05, + "loss": 2.6258, + "step": 33538 + }, + { + "epoch": 2.082003848780185, + "grad_norm": 0.1398420325765162, + "learning_rate": 2.591244288712534e-05, + "loss": 2.7777, + "step": 33539 + }, + { + "epoch": 2.082065925879943, + "grad_norm": 0.1406600895124334, + "learning_rate": 2.590927814217259e-05, + "loss": 2.7608, + "step": 33540 + }, + { + "epoch": 2.082128002979701, + "grad_norm": 0.14537834321611612, + "learning_rate": 2.5906113522907006e-05, + "loss": 2.6944, + "step": 33541 + }, + { + "epoch": 2.082190080079459, + "grad_norm": 0.15153212636221017, + "learning_rate": 2.590294902934507e-05, + "loss": 2.8923, + "step": 33542 + }, + { + "epoch": 2.0822521571792167, + "grad_norm": 0.1355396310041373, + "learning_rate": 2.5899784661503308e-05, + "loss": 2.6489, + "step": 33543 + }, + { + "epoch": 2.0823142342789747, + "grad_norm": 0.13935957557064146, + "learning_rate": 2.58966204193982e-05, + "loss": 2.6953, + "step": 33544 + }, + { + "epoch": 2.0823763113787326, + "grad_norm": 0.1483545510771834, + "learning_rate": 2.5893456303046283e-05, + "loss": 2.8144, + "step": 33545 + }, + { + "epoch": 2.0824383884784905, + "grad_norm": 0.1389288468963191, + "learning_rate": 2.5890292312464064e-05, + "loss": 2.8398, + "step": 33546 + }, + { + "epoch": 2.082500465578248, + "grad_norm": 0.13615980332268615, + "learning_rate": 2.588712844766804e-05, + "loss": 2.7402, + "step": 33547 + }, + { + "epoch": 2.082562542678006, + "grad_norm": 0.14632024470880753, + "learning_rate": 2.5883964708674713e-05, + "loss": 2.7233, + "step": 33548 + }, + { + "epoch": 2.082624619777764, + "grad_norm": 0.15113027552262154, + "learning_rate": 2.5880801095500607e-05, + "loss": 2.6381, + "step": 33549 + }, + { + "epoch": 2.0826866968775217, + "grad_norm": 0.14074384118219524, + "learning_rate": 2.5877637608162208e-05, + "loss": 2.704, + "step": 33550 + }, + { + "epoch": 2.0827487739772796, + "grad_norm": 0.14282439792704135, + "learning_rate": 2.587447424667604e-05, + "loss": 2.6991, + "step": 33551 + }, + { + "epoch": 2.0828108510770376, + "grad_norm": 0.14638444670987388, + "learning_rate": 2.5871311011058595e-05, + "loss": 2.7165, + "step": 33552 + }, + { + "epoch": 2.0828729281767955, + "grad_norm": 0.14921595336130797, + "learning_rate": 2.5868147901326356e-05, + "loss": 2.7621, + "step": 33553 + }, + { + "epoch": 2.0829350052765534, + "grad_norm": 0.1415533509887445, + "learning_rate": 2.586498491749587e-05, + "loss": 2.6869, + "step": 33554 + }, + { + "epoch": 2.0829970823763113, + "grad_norm": 0.145841420891708, + "learning_rate": 2.586182205958362e-05, + "loss": 2.813, + "step": 33555 + }, + { + "epoch": 2.0830591594760692, + "grad_norm": 0.1397491038895949, + "learning_rate": 2.58586593276061e-05, + "loss": 2.6994, + "step": 33556 + }, + { + "epoch": 2.083121236575827, + "grad_norm": 0.139165001500422, + "learning_rate": 2.5855496721579798e-05, + "loss": 2.7024, + "step": 33557 + }, + { + "epoch": 2.083183313675585, + "grad_norm": 0.14131563850563553, + "learning_rate": 2.5852334241521243e-05, + "loss": 2.6492, + "step": 33558 + }, + { + "epoch": 2.083245390775343, + "grad_norm": 0.13828203314219983, + "learning_rate": 2.5849171887446926e-05, + "loss": 2.7611, + "step": 33559 + }, + { + "epoch": 2.083307467875101, + "grad_norm": 0.14157238363663804, + "learning_rate": 2.5846009659373344e-05, + "loss": 2.7043, + "step": 33560 + }, + { + "epoch": 2.083369544974859, + "grad_norm": 0.14845994102096632, + "learning_rate": 2.5842847557316974e-05, + "loss": 2.7309, + "step": 33561 + }, + { + "epoch": 2.0834316220746167, + "grad_norm": 0.1370659846473008, + "learning_rate": 2.583968558129435e-05, + "loss": 2.6328, + "step": 33562 + }, + { + "epoch": 2.0834936991743747, + "grad_norm": 0.15000551630844178, + "learning_rate": 2.583652373132195e-05, + "loss": 2.7801, + "step": 33563 + }, + { + "epoch": 2.0835557762741326, + "grad_norm": 0.16499544165376356, + "learning_rate": 2.5833362007416273e-05, + "loss": 2.7421, + "step": 33564 + }, + { + "epoch": 2.0836178533738905, + "grad_norm": 0.1409309952201166, + "learning_rate": 2.583020040959382e-05, + "loss": 2.7404, + "step": 33565 + }, + { + "epoch": 2.0836799304736484, + "grad_norm": 0.14606504821350066, + "learning_rate": 2.582703893787105e-05, + "loss": 2.7245, + "step": 33566 + }, + { + "epoch": 2.0837420075734063, + "grad_norm": 0.14923897626116156, + "learning_rate": 2.582387759226451e-05, + "loss": 2.6822, + "step": 33567 + }, + { + "epoch": 2.0838040846731642, + "grad_norm": 0.14412394688164434, + "learning_rate": 2.5820716372790664e-05, + "loss": 2.7263, + "step": 33568 + }, + { + "epoch": 2.083866161772922, + "grad_norm": 0.1513039364485153, + "learning_rate": 2.5817555279466017e-05, + "loss": 2.6985, + "step": 33569 + }, + { + "epoch": 2.08392823887268, + "grad_norm": 0.14221743462452374, + "learning_rate": 2.5814394312307033e-05, + "loss": 2.6529, + "step": 33570 + }, + { + "epoch": 2.0839903159724376, + "grad_norm": 0.14112899455194144, + "learning_rate": 2.581123347133024e-05, + "loss": 2.779, + "step": 33571 + }, + { + "epoch": 2.0840523930721955, + "grad_norm": 0.14702268870458104, + "learning_rate": 2.5808072756552127e-05, + "loss": 2.7609, + "step": 33572 + }, + { + "epoch": 2.0841144701719534, + "grad_norm": 0.14740249915826859, + "learning_rate": 2.5804912167989138e-05, + "loss": 2.7644, + "step": 33573 + }, + { + "epoch": 2.0841765472717113, + "grad_norm": 0.15554344644796242, + "learning_rate": 2.580175170565782e-05, + "loss": 2.7502, + "step": 33574 + }, + { + "epoch": 2.0842386243714692, + "grad_norm": 0.13961151275108105, + "learning_rate": 2.579859136957462e-05, + "loss": 2.6313, + "step": 33575 + }, + { + "epoch": 2.084300701471227, + "grad_norm": 0.13989692534770617, + "learning_rate": 2.579543115975606e-05, + "loss": 2.6544, + "step": 33576 + }, + { + "epoch": 2.084362778570985, + "grad_norm": 0.15479936745083747, + "learning_rate": 2.5792271076218618e-05, + "loss": 2.7949, + "step": 33577 + }, + { + "epoch": 2.084424855670743, + "grad_norm": 0.15655059510309496, + "learning_rate": 2.578911111897877e-05, + "loss": 2.7437, + "step": 33578 + }, + { + "epoch": 2.084486932770501, + "grad_norm": 0.1386964870156687, + "learning_rate": 2.5785951288053e-05, + "loss": 2.6778, + "step": 33579 + }, + { + "epoch": 2.084549009870259, + "grad_norm": 0.1397166918967955, + "learning_rate": 2.5782791583457787e-05, + "loss": 2.7181, + "step": 33580 + }, + { + "epoch": 2.0846110869700167, + "grad_norm": 0.13873921357791122, + "learning_rate": 2.5779632005209653e-05, + "loss": 2.7389, + "step": 33581 + }, + { + "epoch": 2.0846731640697747, + "grad_norm": 0.14583849613951036, + "learning_rate": 2.577647255332506e-05, + "loss": 2.7399, + "step": 33582 + }, + { + "epoch": 2.0847352411695326, + "grad_norm": 0.14694778257248367, + "learning_rate": 2.5773313227820484e-05, + "loss": 2.7777, + "step": 33583 + }, + { + "epoch": 2.0847973182692905, + "grad_norm": 0.13996522491770264, + "learning_rate": 2.577015402871239e-05, + "loss": 2.7624, + "step": 33584 + }, + { + "epoch": 2.0848593953690484, + "grad_norm": 0.1475993889614447, + "learning_rate": 2.5766994956017308e-05, + "loss": 2.7614, + "step": 33585 + }, + { + "epoch": 2.0849214724688063, + "grad_norm": 0.14164203197035044, + "learning_rate": 2.57638360097517e-05, + "loss": 2.8297, + "step": 33586 + }, + { + "epoch": 2.0849835495685642, + "grad_norm": 0.1384326927198577, + "learning_rate": 2.5760677189932044e-05, + "loss": 2.7757, + "step": 33587 + }, + { + "epoch": 2.085045626668322, + "grad_norm": 0.15255948221878063, + "learning_rate": 2.5757518496574794e-05, + "loss": 2.8018, + "step": 33588 + }, + { + "epoch": 2.08510770376808, + "grad_norm": 0.1523512936216055, + "learning_rate": 2.575435992969648e-05, + "loss": 2.813, + "step": 33589 + }, + { + "epoch": 2.085169780867838, + "grad_norm": 0.1389506854889067, + "learning_rate": 2.5751201489313547e-05, + "loss": 2.6587, + "step": 33590 + }, + { + "epoch": 2.085231857967596, + "grad_norm": 0.14727069886469865, + "learning_rate": 2.5748043175442494e-05, + "loss": 2.6335, + "step": 33591 + }, + { + "epoch": 2.085293935067354, + "grad_norm": 0.15987410348509615, + "learning_rate": 2.5744884988099784e-05, + "loss": 2.7173, + "step": 33592 + }, + { + "epoch": 2.0853560121671117, + "grad_norm": 0.14426457443730684, + "learning_rate": 2.5741726927301868e-05, + "loss": 2.7791, + "step": 33593 + }, + { + "epoch": 2.0854180892668692, + "grad_norm": 0.1462718821115207, + "learning_rate": 2.5738568993065283e-05, + "loss": 2.7788, + "step": 33594 + }, + { + "epoch": 2.085480166366627, + "grad_norm": 0.1419726934261066, + "learning_rate": 2.5735411185406467e-05, + "loss": 2.7212, + "step": 33595 + }, + { + "epoch": 2.085542243466385, + "grad_norm": 0.14050306225728162, + "learning_rate": 2.57322535043419e-05, + "loss": 2.734, + "step": 33596 + }, + { + "epoch": 2.085604320566143, + "grad_norm": 0.15259959671894022, + "learning_rate": 2.5729095949888043e-05, + "loss": 2.737, + "step": 33597 + }, + { + "epoch": 2.085666397665901, + "grad_norm": 0.14095939508448496, + "learning_rate": 2.57259385220614e-05, + "loss": 2.7594, + "step": 33598 + }, + { + "epoch": 2.085728474765659, + "grad_norm": 0.15666684932414865, + "learning_rate": 2.5722781220878432e-05, + "loss": 2.7451, + "step": 33599 + }, + { + "epoch": 2.0857905518654167, + "grad_norm": 0.141746532288862, + "learning_rate": 2.5719624046355605e-05, + "loss": 2.6629, + "step": 33600 + }, + { + "epoch": 2.0858526289651746, + "grad_norm": 0.14566558886194017, + "learning_rate": 2.5716466998509398e-05, + "loss": 2.7428, + "step": 33601 + }, + { + "epoch": 2.0859147060649326, + "grad_norm": 0.13399589172113185, + "learning_rate": 2.5713310077356257e-05, + "loss": 2.6647, + "step": 33602 + }, + { + "epoch": 2.0859767831646905, + "grad_norm": 0.1424497261374115, + "learning_rate": 2.5710153282912685e-05, + "loss": 2.7707, + "step": 33603 + }, + { + "epoch": 2.0860388602644484, + "grad_norm": 0.1388664022925131, + "learning_rate": 2.570699661519515e-05, + "loss": 2.7343, + "step": 33604 + }, + { + "epoch": 2.0861009373642063, + "grad_norm": 0.15958760073332864, + "learning_rate": 2.570384007422011e-05, + "loss": 2.7437, + "step": 33605 + }, + { + "epoch": 2.0861630144639642, + "grad_norm": 0.14939305525559413, + "learning_rate": 2.570068366000401e-05, + "loss": 2.7962, + "step": 33606 + }, + { + "epoch": 2.086225091563722, + "grad_norm": 0.1424485754617338, + "learning_rate": 2.5697527372563342e-05, + "loss": 2.6796, + "step": 33607 + }, + { + "epoch": 2.08628716866348, + "grad_norm": 0.14114440258000094, + "learning_rate": 2.5694371211914592e-05, + "loss": 2.6604, + "step": 33608 + }, + { + "epoch": 2.086349245763238, + "grad_norm": 0.15806125236997776, + "learning_rate": 2.5691215178074213e-05, + "loss": 2.6943, + "step": 33609 + }, + { + "epoch": 2.086411322862996, + "grad_norm": 0.1430843849733164, + "learning_rate": 2.568805927105866e-05, + "loss": 2.7471, + "step": 33610 + }, + { + "epoch": 2.086473399962754, + "grad_norm": 0.1448144136429165, + "learning_rate": 2.5684903490884392e-05, + "loss": 2.6338, + "step": 33611 + }, + { + "epoch": 2.0865354770625117, + "grad_norm": 0.14744941085328744, + "learning_rate": 2.56817478375679e-05, + "loss": 2.7272, + "step": 33612 + }, + { + "epoch": 2.0865975541622697, + "grad_norm": 0.14352760578498397, + "learning_rate": 2.5678592311125627e-05, + "loss": 2.7739, + "step": 33613 + }, + { + "epoch": 2.0866596312620276, + "grad_norm": 0.1509805787355452, + "learning_rate": 2.5675436911574052e-05, + "loss": 2.7036, + "step": 33614 + }, + { + "epoch": 2.0867217083617855, + "grad_norm": 0.1420292938315935, + "learning_rate": 2.5672281638929623e-05, + "loss": 2.6969, + "step": 33615 + }, + { + "epoch": 2.0867837854615434, + "grad_norm": 0.15159874094436154, + "learning_rate": 2.5669126493208785e-05, + "loss": 2.6426, + "step": 33616 + }, + { + "epoch": 2.0868458625613013, + "grad_norm": 0.1407144579858673, + "learning_rate": 2.566597147442804e-05, + "loss": 2.7275, + "step": 33617 + }, + { + "epoch": 2.0869079396610593, + "grad_norm": 0.14089710348125425, + "learning_rate": 2.566281658260384e-05, + "loss": 2.8006, + "step": 33618 + }, + { + "epoch": 2.0869700167608167, + "grad_norm": 0.15028474845986575, + "learning_rate": 2.5659661817752622e-05, + "loss": 2.78, + "step": 33619 + }, + { + "epoch": 2.0870320938605746, + "grad_norm": 0.14414570264479099, + "learning_rate": 2.565650717989084e-05, + "loss": 2.7007, + "step": 33620 + }, + { + "epoch": 2.0870941709603326, + "grad_norm": 0.14239146804141403, + "learning_rate": 2.565335266903499e-05, + "loss": 2.6695, + "step": 33621 + }, + { + "epoch": 2.0871562480600905, + "grad_norm": 0.13785415307559393, + "learning_rate": 2.565019828520151e-05, + "loss": 2.7426, + "step": 33622 + }, + { + "epoch": 2.0872183251598484, + "grad_norm": 0.13606092369163542, + "learning_rate": 2.5647044028406856e-05, + "loss": 2.7614, + "step": 33623 + }, + { + "epoch": 2.0872804022596063, + "grad_norm": 0.1377107868562739, + "learning_rate": 2.5643889898667473e-05, + "loss": 2.7278, + "step": 33624 + }, + { + "epoch": 2.0873424793593642, + "grad_norm": 0.14068318858545764, + "learning_rate": 2.5640735895999833e-05, + "loss": 2.7612, + "step": 33625 + }, + { + "epoch": 2.087404556459122, + "grad_norm": 0.17103013005439732, + "learning_rate": 2.5637582020420403e-05, + "loss": 2.7633, + "step": 33626 + }, + { + "epoch": 2.08746663355888, + "grad_norm": 0.15766009983885806, + "learning_rate": 2.563442827194561e-05, + "loss": 2.6705, + "step": 33627 + }, + { + "epoch": 2.087528710658638, + "grad_norm": 0.14313852230685026, + "learning_rate": 2.563127465059193e-05, + "loss": 2.7479, + "step": 33628 + }, + { + "epoch": 2.087590787758396, + "grad_norm": 0.14295498802571335, + "learning_rate": 2.5628121156375777e-05, + "loss": 2.7509, + "step": 33629 + }, + { + "epoch": 2.087652864858154, + "grad_norm": 0.14276921517825267, + "learning_rate": 2.5624967789313658e-05, + "loss": 2.7172, + "step": 33630 + }, + { + "epoch": 2.0877149419579117, + "grad_norm": 0.1449378261887374, + "learning_rate": 2.5621814549421996e-05, + "loss": 2.8097, + "step": 33631 + }, + { + "epoch": 2.0877770190576697, + "grad_norm": 0.13993913897576368, + "learning_rate": 2.561866143671725e-05, + "loss": 2.7488, + "step": 33632 + }, + { + "epoch": 2.0878390961574276, + "grad_norm": 0.1418414024974827, + "learning_rate": 2.561550845121584e-05, + "loss": 2.7718, + "step": 33633 + }, + { + "epoch": 2.0879011732571855, + "grad_norm": 0.1606086354602807, + "learning_rate": 2.5612355592934267e-05, + "loss": 2.811, + "step": 33634 + }, + { + "epoch": 2.0879632503569434, + "grad_norm": 0.14513261156016394, + "learning_rate": 2.5609202861888947e-05, + "loss": 2.6538, + "step": 33635 + }, + { + "epoch": 2.0880253274567013, + "grad_norm": 0.1513006125448135, + "learning_rate": 2.560605025809635e-05, + "loss": 2.6973, + "step": 33636 + }, + { + "epoch": 2.0880874045564592, + "grad_norm": 0.14595009962784536, + "learning_rate": 2.56028977815729e-05, + "loss": 2.694, + "step": 33637 + }, + { + "epoch": 2.088149481656217, + "grad_norm": 0.1474151919315244, + "learning_rate": 2.5599745432335037e-05, + "loss": 2.7646, + "step": 33638 + }, + { + "epoch": 2.088211558755975, + "grad_norm": 0.140991270387315, + "learning_rate": 2.559659321039923e-05, + "loss": 2.729, + "step": 33639 + }, + { + "epoch": 2.088273635855733, + "grad_norm": 0.15379668295624585, + "learning_rate": 2.5593441115781937e-05, + "loss": 2.6258, + "step": 33640 + }, + { + "epoch": 2.088335712955491, + "grad_norm": 0.13687158624973111, + "learning_rate": 2.5590289148499585e-05, + "loss": 2.7193, + "step": 33641 + }, + { + "epoch": 2.0883977900552484, + "grad_norm": 0.1407002539079662, + "learning_rate": 2.558713730856862e-05, + "loss": 2.6845, + "step": 33642 + }, + { + "epoch": 2.0884598671550063, + "grad_norm": 0.1461397121669713, + "learning_rate": 2.5583985596005467e-05, + "loss": 2.7589, + "step": 33643 + }, + { + "epoch": 2.0885219442547642, + "grad_norm": 0.14351032505829456, + "learning_rate": 2.5580834010826616e-05, + "loss": 2.7354, + "step": 33644 + }, + { + "epoch": 2.088584021354522, + "grad_norm": 0.13503034824528423, + "learning_rate": 2.557768255304847e-05, + "loss": 2.7118, + "step": 33645 + }, + { + "epoch": 2.08864609845428, + "grad_norm": 0.14844566247482327, + "learning_rate": 2.557453122268749e-05, + "loss": 2.7245, + "step": 33646 + }, + { + "epoch": 2.088708175554038, + "grad_norm": 0.1400680772631853, + "learning_rate": 2.557138001976009e-05, + "loss": 2.7508, + "step": 33647 + }, + { + "epoch": 2.088770252653796, + "grad_norm": 0.14893457980820915, + "learning_rate": 2.5568228944282745e-05, + "loss": 2.7096, + "step": 33648 + }, + { + "epoch": 2.088832329753554, + "grad_norm": 0.14537297472235958, + "learning_rate": 2.5565077996271886e-05, + "loss": 2.743, + "step": 33649 + }, + { + "epoch": 2.0888944068533117, + "grad_norm": 0.1374268315447706, + "learning_rate": 2.5561927175743945e-05, + "loss": 2.7127, + "step": 33650 + }, + { + "epoch": 2.0889564839530697, + "grad_norm": 0.1475360186593376, + "learning_rate": 2.5558776482715363e-05, + "loss": 2.7582, + "step": 33651 + }, + { + "epoch": 2.0890185610528276, + "grad_norm": 0.15516914607963297, + "learning_rate": 2.5555625917202554e-05, + "loss": 2.8058, + "step": 33652 + }, + { + "epoch": 2.0890806381525855, + "grad_norm": 0.1470728244541644, + "learning_rate": 2.5552475479222003e-05, + "loss": 2.7333, + "step": 33653 + }, + { + "epoch": 2.0891427152523434, + "grad_norm": 0.14381074451235512, + "learning_rate": 2.5549325168790123e-05, + "loss": 2.723, + "step": 33654 + }, + { + "epoch": 2.0892047923521013, + "grad_norm": 0.1483855407525085, + "learning_rate": 2.5546174985923344e-05, + "loss": 2.7642, + "step": 33655 + }, + { + "epoch": 2.0892668694518592, + "grad_norm": 0.1585785567236126, + "learning_rate": 2.5543024930638092e-05, + "loss": 2.6731, + "step": 33656 + }, + { + "epoch": 2.089328946551617, + "grad_norm": 0.13990741671011178, + "learning_rate": 2.553987500295083e-05, + "loss": 2.6648, + "step": 33657 + }, + { + "epoch": 2.089391023651375, + "grad_norm": 0.14872611998163432, + "learning_rate": 2.5536725202877988e-05, + "loss": 2.7215, + "step": 33658 + }, + { + "epoch": 2.089453100751133, + "grad_norm": 0.15680615865672884, + "learning_rate": 2.5533575530435983e-05, + "loss": 2.7294, + "step": 33659 + }, + { + "epoch": 2.089515177850891, + "grad_norm": 0.14099935532863836, + "learning_rate": 2.5530425985641233e-05, + "loss": 2.7537, + "step": 33660 + }, + { + "epoch": 2.089577254950649, + "grad_norm": 0.14032496790939078, + "learning_rate": 2.552727656851021e-05, + "loss": 2.7261, + "step": 33661 + }, + { + "epoch": 2.0896393320504068, + "grad_norm": 0.15111587334833407, + "learning_rate": 2.552412727905933e-05, + "loss": 2.7831, + "step": 33662 + }, + { + "epoch": 2.0897014091501647, + "grad_norm": 0.14077235819333347, + "learning_rate": 2.5520978117305015e-05, + "loss": 2.6814, + "step": 33663 + }, + { + "epoch": 2.0897634862499226, + "grad_norm": 0.14031603435958662, + "learning_rate": 2.55178290832637e-05, + "loss": 2.7749, + "step": 33664 + }, + { + "epoch": 2.0898255633496805, + "grad_norm": 0.1457046483141756, + "learning_rate": 2.55146801769518e-05, + "loss": 2.8174, + "step": 33665 + }, + { + "epoch": 2.0898876404494384, + "grad_norm": 0.1634766187861447, + "learning_rate": 2.5511531398385775e-05, + "loss": 2.7575, + "step": 33666 + }, + { + "epoch": 2.089949717549196, + "grad_norm": 0.1419834184103279, + "learning_rate": 2.5508382747582032e-05, + "loss": 2.7789, + "step": 33667 + }, + { + "epoch": 2.090011794648954, + "grad_norm": 0.16304176554695965, + "learning_rate": 2.5505234224557007e-05, + "loss": 2.6876, + "step": 33668 + }, + { + "epoch": 2.0900738717487117, + "grad_norm": 0.14652459546096624, + "learning_rate": 2.5502085829327092e-05, + "loss": 2.6752, + "step": 33669 + }, + { + "epoch": 2.0901359488484696, + "grad_norm": 0.16760268909422626, + "learning_rate": 2.5498937561908774e-05, + "loss": 2.6741, + "step": 33670 + }, + { + "epoch": 2.0901980259482276, + "grad_norm": 0.15311024482831115, + "learning_rate": 2.5495789422318417e-05, + "loss": 2.837, + "step": 33671 + }, + { + "epoch": 2.0902601030479855, + "grad_norm": 0.1496624035891429, + "learning_rate": 2.5492641410572505e-05, + "loss": 2.6928, + "step": 33672 + }, + { + "epoch": 2.0903221801477434, + "grad_norm": 0.15481614004313457, + "learning_rate": 2.5489493526687426e-05, + "loss": 2.6992, + "step": 33673 + }, + { + "epoch": 2.0903842572475013, + "grad_norm": 0.14222152742956656, + "learning_rate": 2.5486345770679588e-05, + "loss": 2.6136, + "step": 33674 + }, + { + "epoch": 2.0904463343472592, + "grad_norm": 0.14384753012926796, + "learning_rate": 2.5483198142565455e-05, + "loss": 2.7364, + "step": 33675 + }, + { + "epoch": 2.090508411447017, + "grad_norm": 0.148675451791929, + "learning_rate": 2.5480050642361432e-05, + "loss": 2.6952, + "step": 33676 + }, + { + "epoch": 2.090570488546775, + "grad_norm": 0.16583210394354095, + "learning_rate": 2.547690327008394e-05, + "loss": 2.7338, + "step": 33677 + }, + { + "epoch": 2.090632565646533, + "grad_norm": 0.16383541376565794, + "learning_rate": 2.547375602574939e-05, + "loss": 2.7159, + "step": 33678 + }, + { + "epoch": 2.090694642746291, + "grad_norm": 0.1514928296303617, + "learning_rate": 2.5470608909374194e-05, + "loss": 2.7476, + "step": 33679 + }, + { + "epoch": 2.090756719846049, + "grad_norm": 0.14982174125993075, + "learning_rate": 2.54674619209748e-05, + "loss": 2.7719, + "step": 33680 + }, + { + "epoch": 2.0908187969458067, + "grad_norm": 0.13513850029395297, + "learning_rate": 2.5464315060567617e-05, + "loss": 2.6413, + "step": 33681 + }, + { + "epoch": 2.0908808740455647, + "grad_norm": 0.14791159576356258, + "learning_rate": 2.546116832816906e-05, + "loss": 2.7444, + "step": 33682 + }, + { + "epoch": 2.0909429511453226, + "grad_norm": 0.159046070209644, + "learning_rate": 2.545802172379552e-05, + "loss": 2.7616, + "step": 33683 + }, + { + "epoch": 2.0910050282450805, + "grad_norm": 0.1430150418302486, + "learning_rate": 2.545487524746346e-05, + "loss": 2.8007, + "step": 33684 + }, + { + "epoch": 2.0910671053448384, + "grad_norm": 0.14368044606209188, + "learning_rate": 2.5451728899189274e-05, + "loss": 2.7109, + "step": 33685 + }, + { + "epoch": 2.0911291824445963, + "grad_norm": 0.17168134920706957, + "learning_rate": 2.5448582678989386e-05, + "loss": 2.7965, + "step": 33686 + }, + { + "epoch": 2.0911912595443543, + "grad_norm": 0.14861285994318726, + "learning_rate": 2.5445436586880195e-05, + "loss": 2.7469, + "step": 33687 + }, + { + "epoch": 2.091253336644112, + "grad_norm": 0.1638303090042265, + "learning_rate": 2.54422906228781e-05, + "loss": 2.7364, + "step": 33688 + }, + { + "epoch": 2.09131541374387, + "grad_norm": 0.14995079624299154, + "learning_rate": 2.5439144786999557e-05, + "loss": 2.7548, + "step": 33689 + }, + { + "epoch": 2.0913774908436276, + "grad_norm": 0.14802355519726604, + "learning_rate": 2.5435999079260965e-05, + "loss": 2.774, + "step": 33690 + }, + { + "epoch": 2.0914395679433855, + "grad_norm": 0.1422066487004736, + "learning_rate": 2.543285349967872e-05, + "loss": 2.6542, + "step": 33691 + }, + { + "epoch": 2.0915016450431434, + "grad_norm": 0.1415324272545629, + "learning_rate": 2.542970804826923e-05, + "loss": 2.7574, + "step": 33692 + }, + { + "epoch": 2.0915637221429013, + "grad_norm": 0.15256829097755792, + "learning_rate": 2.542656272504893e-05, + "loss": 2.6945, + "step": 33693 + }, + { + "epoch": 2.0916257992426592, + "grad_norm": 0.14660898630447844, + "learning_rate": 2.5423417530034222e-05, + "loss": 2.6563, + "step": 33694 + }, + { + "epoch": 2.091687876342417, + "grad_norm": 0.16846444486807474, + "learning_rate": 2.5420272463241514e-05, + "loss": 2.7863, + "step": 33695 + }, + { + "epoch": 2.091749953442175, + "grad_norm": 0.15432021411194874, + "learning_rate": 2.541712752468719e-05, + "loss": 2.5832, + "step": 33696 + }, + { + "epoch": 2.091812030541933, + "grad_norm": 0.14970275937509436, + "learning_rate": 2.5413982714387698e-05, + "loss": 2.8617, + "step": 33697 + }, + { + "epoch": 2.091874107641691, + "grad_norm": 0.14546878982690908, + "learning_rate": 2.5410838032359418e-05, + "loss": 2.6654, + "step": 33698 + }, + { + "epoch": 2.091936184741449, + "grad_norm": 0.1443645314760301, + "learning_rate": 2.5407693478618777e-05, + "loss": 2.6487, + "step": 33699 + }, + { + "epoch": 2.0919982618412067, + "grad_norm": 0.13965415275989576, + "learning_rate": 2.540454905318217e-05, + "loss": 2.6816, + "step": 33700 + }, + { + "epoch": 2.0920603389409647, + "grad_norm": 0.13908123656288124, + "learning_rate": 2.540140475606597e-05, + "loss": 2.7115, + "step": 33701 + }, + { + "epoch": 2.0921224160407226, + "grad_norm": 0.13782798516820274, + "learning_rate": 2.5398260587286643e-05, + "loss": 2.7696, + "step": 33702 + }, + { + "epoch": 2.0921844931404805, + "grad_norm": 0.1464542548684357, + "learning_rate": 2.5395116546860558e-05, + "loss": 2.6764, + "step": 33703 + }, + { + "epoch": 2.0922465702402384, + "grad_norm": 0.1887197654868616, + "learning_rate": 2.5391972634804106e-05, + "loss": 2.6711, + "step": 33704 + }, + { + "epoch": 2.0923086473399963, + "grad_norm": 0.14643580585188445, + "learning_rate": 2.538882885113373e-05, + "loss": 2.7575, + "step": 33705 + }, + { + "epoch": 2.0923707244397542, + "grad_norm": 0.14838324985972945, + "learning_rate": 2.5385685195865777e-05, + "loss": 2.6457, + "step": 33706 + }, + { + "epoch": 2.092432801539512, + "grad_norm": 0.15397348194117222, + "learning_rate": 2.5382541669016707e-05, + "loss": 2.8128, + "step": 33707 + }, + { + "epoch": 2.09249487863927, + "grad_norm": 0.14167614649745783, + "learning_rate": 2.5379398270602894e-05, + "loss": 2.7245, + "step": 33708 + }, + { + "epoch": 2.092556955739028, + "grad_norm": 0.14834798447733127, + "learning_rate": 2.5376255000640736e-05, + "loss": 2.7134, + "step": 33709 + }, + { + "epoch": 2.092619032838786, + "grad_norm": 0.14712340522507852, + "learning_rate": 2.5373111859146616e-05, + "loss": 2.8159, + "step": 33710 + }, + { + "epoch": 2.092681109938544, + "grad_norm": 0.15005771524990666, + "learning_rate": 2.536996884613696e-05, + "loss": 2.7807, + "step": 33711 + }, + { + "epoch": 2.0927431870383018, + "grad_norm": 0.17226643313436757, + "learning_rate": 2.536682596162816e-05, + "loss": 2.6727, + "step": 33712 + }, + { + "epoch": 2.0928052641380597, + "grad_norm": 0.14120132840967925, + "learning_rate": 2.5363683205636614e-05, + "loss": 2.719, + "step": 33713 + }, + { + "epoch": 2.0928673412378176, + "grad_norm": 0.1459172141007496, + "learning_rate": 2.536054057817871e-05, + "loss": 2.8663, + "step": 33714 + }, + { + "epoch": 2.092929418337575, + "grad_norm": 0.1474848026896258, + "learning_rate": 2.535739807927083e-05, + "loss": 2.6912, + "step": 33715 + }, + { + "epoch": 2.092991495437333, + "grad_norm": 0.14943044859348859, + "learning_rate": 2.5354255708929405e-05, + "loss": 2.7063, + "step": 33716 + }, + { + "epoch": 2.093053572537091, + "grad_norm": 0.1594702169742198, + "learning_rate": 2.5351113467170808e-05, + "loss": 2.6991, + "step": 33717 + }, + { + "epoch": 2.093115649636849, + "grad_norm": 0.15391156216472532, + "learning_rate": 2.5347971354011436e-05, + "loss": 2.8063, + "step": 33718 + }, + { + "epoch": 2.0931777267366067, + "grad_norm": 0.14304211641414658, + "learning_rate": 2.5344829369467665e-05, + "loss": 2.688, + "step": 33719 + }, + { + "epoch": 2.0932398038363647, + "grad_norm": 0.16726425940051032, + "learning_rate": 2.5341687513555923e-05, + "loss": 2.7437, + "step": 33720 + }, + { + "epoch": 2.0933018809361226, + "grad_norm": 0.14155101531643888, + "learning_rate": 2.5338545786292585e-05, + "loss": 2.7315, + "step": 33721 + }, + { + "epoch": 2.0933639580358805, + "grad_norm": 0.14413596561830294, + "learning_rate": 2.5335404187694033e-05, + "loss": 2.7636, + "step": 33722 + }, + { + "epoch": 2.0934260351356384, + "grad_norm": 0.14334359510923592, + "learning_rate": 2.533226271777668e-05, + "loss": 2.6948, + "step": 33723 + }, + { + "epoch": 2.0934881122353963, + "grad_norm": 0.14194106476421406, + "learning_rate": 2.5329121376556864e-05, + "loss": 2.725, + "step": 33724 + }, + { + "epoch": 2.0935501893351542, + "grad_norm": 0.14038518484334742, + "learning_rate": 2.5325980164051044e-05, + "loss": 2.7916, + "step": 33725 + }, + { + "epoch": 2.093612266434912, + "grad_norm": 0.1398919057247386, + "learning_rate": 2.5322839080275566e-05, + "loss": 2.7331, + "step": 33726 + }, + { + "epoch": 2.09367434353467, + "grad_norm": 0.15587134375065914, + "learning_rate": 2.5319698125246834e-05, + "loss": 2.6779, + "step": 33727 + }, + { + "epoch": 2.093736420634428, + "grad_norm": 0.15478327295740069, + "learning_rate": 2.5316557298981204e-05, + "loss": 2.6383, + "step": 33728 + }, + { + "epoch": 2.093798497734186, + "grad_norm": 0.1570283161192913, + "learning_rate": 2.531341660149511e-05, + "loss": 2.7386, + "step": 33729 + }, + { + "epoch": 2.093860574833944, + "grad_norm": 0.14894440956552352, + "learning_rate": 2.5310276032804907e-05, + "loss": 2.796, + "step": 33730 + }, + { + "epoch": 2.0939226519337018, + "grad_norm": 0.1589891721203247, + "learning_rate": 2.5307135592926996e-05, + "loss": 2.6354, + "step": 33731 + }, + { + "epoch": 2.0939847290334597, + "grad_norm": 0.15605463689199303, + "learning_rate": 2.5303995281877728e-05, + "loss": 2.6397, + "step": 33732 + }, + { + "epoch": 2.0940468061332176, + "grad_norm": 0.1718965592734444, + "learning_rate": 2.530085509967354e-05, + "loss": 2.7305, + "step": 33733 + }, + { + "epoch": 2.0941088832329755, + "grad_norm": 0.15938595121756208, + "learning_rate": 2.5297715046330772e-05, + "loss": 2.6949, + "step": 33734 + }, + { + "epoch": 2.0941709603327334, + "grad_norm": 0.15292772216313527, + "learning_rate": 2.5294575121865834e-05, + "loss": 2.7838, + "step": 33735 + }, + { + "epoch": 2.0942330374324913, + "grad_norm": 0.15828680250891858, + "learning_rate": 2.5291435326295088e-05, + "loss": 2.7075, + "step": 33736 + }, + { + "epoch": 2.0942951145322493, + "grad_norm": 0.15137077450347594, + "learning_rate": 2.528829565963491e-05, + "loss": 2.7983, + "step": 33737 + }, + { + "epoch": 2.0943571916320067, + "grad_norm": 0.1522256931733331, + "learning_rate": 2.528515612190168e-05, + "loss": 2.7291, + "step": 33738 + }, + { + "epoch": 2.0944192687317646, + "grad_norm": 0.16009139864667488, + "learning_rate": 2.5282016713111824e-05, + "loss": 2.701, + "step": 33739 + }, + { + "epoch": 2.0944813458315226, + "grad_norm": 0.15087035071550492, + "learning_rate": 2.527887743328168e-05, + "loss": 2.6772, + "step": 33740 + }, + { + "epoch": 2.0945434229312805, + "grad_norm": 0.1340617443054526, + "learning_rate": 2.527573828242763e-05, + "loss": 2.7201, + "step": 33741 + }, + { + "epoch": 2.0946055000310384, + "grad_norm": 0.1435775089937754, + "learning_rate": 2.5272599260566044e-05, + "loss": 2.6894, + "step": 33742 + }, + { + "epoch": 2.0946675771307963, + "grad_norm": 0.14536100060141263, + "learning_rate": 2.5269460367713326e-05, + "loss": 2.6612, + "step": 33743 + }, + { + "epoch": 2.0947296542305542, + "grad_norm": 0.14244903965588138, + "learning_rate": 2.5266321603885834e-05, + "loss": 2.741, + "step": 33744 + }, + { + "epoch": 2.094791731330312, + "grad_norm": 0.14119363100993237, + "learning_rate": 2.5263182969099953e-05, + "loss": 2.7146, + "step": 33745 + }, + { + "epoch": 2.09485380843007, + "grad_norm": 0.1390905909079892, + "learning_rate": 2.5260044463372022e-05, + "loss": 2.7614, + "step": 33746 + }, + { + "epoch": 2.094915885529828, + "grad_norm": 0.14193666204157288, + "learning_rate": 2.525690608671848e-05, + "loss": 2.8224, + "step": 33747 + }, + { + "epoch": 2.094977962629586, + "grad_norm": 0.16195532936660678, + "learning_rate": 2.525376783915565e-05, + "loss": 2.7414, + "step": 33748 + }, + { + "epoch": 2.095040039729344, + "grad_norm": 0.15167641586323816, + "learning_rate": 2.5250629720699926e-05, + "loss": 2.6187, + "step": 33749 + }, + { + "epoch": 2.0951021168291017, + "grad_norm": 0.14022219772385044, + "learning_rate": 2.5247491731367678e-05, + "loss": 2.7637, + "step": 33750 + }, + { + "epoch": 2.0951641939288597, + "grad_norm": 0.15161885290469065, + "learning_rate": 2.5244353871175253e-05, + "loss": 2.7169, + "step": 33751 + }, + { + "epoch": 2.0952262710286176, + "grad_norm": 0.14323966825884638, + "learning_rate": 2.5241216140139062e-05, + "loss": 2.7068, + "step": 33752 + }, + { + "epoch": 2.0952883481283755, + "grad_norm": 0.14007198021341827, + "learning_rate": 2.5238078538275456e-05, + "loss": 2.712, + "step": 33753 + }, + { + "epoch": 2.0953504252281334, + "grad_norm": 0.14505758974203734, + "learning_rate": 2.5234941065600815e-05, + "loss": 2.7082, + "step": 33754 + }, + { + "epoch": 2.0954125023278913, + "grad_norm": 0.15745269317545024, + "learning_rate": 2.5231803722131464e-05, + "loss": 2.7347, + "step": 33755 + }, + { + "epoch": 2.0954745794276493, + "grad_norm": 0.14957513417372312, + "learning_rate": 2.5228666507883837e-05, + "loss": 2.7873, + "step": 33756 + }, + { + "epoch": 2.095536656527407, + "grad_norm": 0.1431419901987296, + "learning_rate": 2.522552942287426e-05, + "loss": 2.7729, + "step": 33757 + }, + { + "epoch": 2.095598733627165, + "grad_norm": 0.1539385231667859, + "learning_rate": 2.5222392467119117e-05, + "loss": 2.793, + "step": 33758 + }, + { + "epoch": 2.095660810726923, + "grad_norm": 0.1489273508958489, + "learning_rate": 2.521925564063475e-05, + "loss": 2.6779, + "step": 33759 + }, + { + "epoch": 2.095722887826681, + "grad_norm": 0.14057285613654313, + "learning_rate": 2.521611894343756e-05, + "loss": 2.6634, + "step": 33760 + }, + { + "epoch": 2.095784964926439, + "grad_norm": 0.18446186452129146, + "learning_rate": 2.5212982375543893e-05, + "loss": 2.6622, + "step": 33761 + }, + { + "epoch": 2.0958470420261968, + "grad_norm": 0.14733793800396022, + "learning_rate": 2.520984593697011e-05, + "loss": 2.7289, + "step": 33762 + }, + { + "epoch": 2.0959091191259542, + "grad_norm": 0.14451921902663634, + "learning_rate": 2.5206709627732582e-05, + "loss": 2.6414, + "step": 33763 + }, + { + "epoch": 2.095971196225712, + "grad_norm": 0.14810191668748648, + "learning_rate": 2.520357344784765e-05, + "loss": 2.729, + "step": 33764 + }, + { + "epoch": 2.09603327332547, + "grad_norm": 0.14004447107232, + "learning_rate": 2.5200437397331712e-05, + "loss": 2.7175, + "step": 33765 + }, + { + "epoch": 2.096095350425228, + "grad_norm": 0.1571242076296921, + "learning_rate": 2.519730147620112e-05, + "loss": 2.7154, + "step": 33766 + }, + { + "epoch": 2.096157427524986, + "grad_norm": 0.14877653304043986, + "learning_rate": 2.5194165684472215e-05, + "loss": 2.7555, + "step": 33767 + }, + { + "epoch": 2.096219504624744, + "grad_norm": 0.16591914683390188, + "learning_rate": 2.5191030022161355e-05, + "loss": 2.6965, + "step": 33768 + }, + { + "epoch": 2.0962815817245017, + "grad_norm": 0.15696354896446746, + "learning_rate": 2.518789448928492e-05, + "loss": 2.6892, + "step": 33769 + }, + { + "epoch": 2.0963436588242597, + "grad_norm": 0.14067364884338288, + "learning_rate": 2.518475908585928e-05, + "loss": 2.7801, + "step": 33770 + }, + { + "epoch": 2.0964057359240176, + "grad_norm": 0.14564563317513174, + "learning_rate": 2.5181623811900768e-05, + "loss": 2.7508, + "step": 33771 + }, + { + "epoch": 2.0964678130237755, + "grad_norm": 0.15195914160969626, + "learning_rate": 2.517848866742576e-05, + "loss": 2.7585, + "step": 33772 + }, + { + "epoch": 2.0965298901235334, + "grad_norm": 0.14518297662844254, + "learning_rate": 2.5175353652450595e-05, + "loss": 2.6896, + "step": 33773 + }, + { + "epoch": 2.0965919672232913, + "grad_norm": 0.14252603572239286, + "learning_rate": 2.517221876699162e-05, + "loss": 2.6138, + "step": 33774 + }, + { + "epoch": 2.0966540443230492, + "grad_norm": 0.15771663889143808, + "learning_rate": 2.5169084011065224e-05, + "loss": 2.7214, + "step": 33775 + }, + { + "epoch": 2.096716121422807, + "grad_norm": 0.1699747025376253, + "learning_rate": 2.5165949384687748e-05, + "loss": 2.7292, + "step": 33776 + }, + { + "epoch": 2.096778198522565, + "grad_norm": 0.14300597024255624, + "learning_rate": 2.5162814887875542e-05, + "loss": 2.6559, + "step": 33777 + }, + { + "epoch": 2.096840275622323, + "grad_norm": 0.1499043442782137, + "learning_rate": 2.5159680520644936e-05, + "loss": 2.7462, + "step": 33778 + }, + { + "epoch": 2.096902352722081, + "grad_norm": 0.14752724400967768, + "learning_rate": 2.5156546283012327e-05, + "loss": 2.7423, + "step": 33779 + }, + { + "epoch": 2.096964429821839, + "grad_norm": 0.16135619314208755, + "learning_rate": 2.515341217499405e-05, + "loss": 2.7458, + "step": 33780 + }, + { + "epoch": 2.0970265069215968, + "grad_norm": 0.1823650904787658, + "learning_rate": 2.515027819660646e-05, + "loss": 2.7153, + "step": 33781 + }, + { + "epoch": 2.0970885840213547, + "grad_norm": 0.16731754753213915, + "learning_rate": 2.514714434786587e-05, + "loss": 2.7073, + "step": 33782 + }, + { + "epoch": 2.0971506611211126, + "grad_norm": 0.1590157700453917, + "learning_rate": 2.5144010628788684e-05, + "loss": 2.763, + "step": 33783 + }, + { + "epoch": 2.0972127382208705, + "grad_norm": 0.14864271183608138, + "learning_rate": 2.5140877039391232e-05, + "loss": 2.8208, + "step": 33784 + }, + { + "epoch": 2.0972748153206284, + "grad_norm": 0.1635100281436913, + "learning_rate": 2.513774357968986e-05, + "loss": 2.7762, + "step": 33785 + }, + { + "epoch": 2.097336892420386, + "grad_norm": 0.14830020779333392, + "learning_rate": 2.5134610249700907e-05, + "loss": 2.7548, + "step": 33786 + }, + { + "epoch": 2.097398969520144, + "grad_norm": 0.14791896246306, + "learning_rate": 2.5131477049440722e-05, + "loss": 2.7814, + "step": 33787 + }, + { + "epoch": 2.0974610466199017, + "grad_norm": 0.14702027522121663, + "learning_rate": 2.5128343978925666e-05, + "loss": 2.6991, + "step": 33788 + }, + { + "epoch": 2.0975231237196597, + "grad_norm": 0.1432899447291697, + "learning_rate": 2.512521103817208e-05, + "loss": 2.6959, + "step": 33789 + }, + { + "epoch": 2.0975852008194176, + "grad_norm": 0.1682130844048656, + "learning_rate": 2.512207822719631e-05, + "loss": 2.6845, + "step": 33790 + }, + { + "epoch": 2.0976472779191755, + "grad_norm": 0.17239473612417933, + "learning_rate": 2.5118945546014677e-05, + "loss": 2.7535, + "step": 33791 + }, + { + "epoch": 2.0977093550189334, + "grad_norm": 0.14791451730196326, + "learning_rate": 2.511581299464356e-05, + "loss": 2.8408, + "step": 33792 + }, + { + "epoch": 2.0977714321186913, + "grad_norm": 0.1504848049091531, + "learning_rate": 2.5112680573099297e-05, + "loss": 2.6865, + "step": 33793 + }, + { + "epoch": 2.0978335092184492, + "grad_norm": 0.16707445650132177, + "learning_rate": 2.5109548281398215e-05, + "loss": 2.7321, + "step": 33794 + }, + { + "epoch": 2.097895586318207, + "grad_norm": 0.15221927521618345, + "learning_rate": 2.510641611955664e-05, + "loss": 2.717, + "step": 33795 + }, + { + "epoch": 2.097957663417965, + "grad_norm": 0.156080499741539, + "learning_rate": 2.5103284087590962e-05, + "loss": 2.726, + "step": 33796 + }, + { + "epoch": 2.098019740517723, + "grad_norm": 0.1447682756859606, + "learning_rate": 2.510015218551749e-05, + "loss": 2.7785, + "step": 33797 + }, + { + "epoch": 2.098081817617481, + "grad_norm": 0.17630108685589854, + "learning_rate": 2.5097020413352568e-05, + "loss": 2.7143, + "step": 33798 + }, + { + "epoch": 2.098143894717239, + "grad_norm": 0.15756156724701384, + "learning_rate": 2.5093888771112538e-05, + "loss": 2.7583, + "step": 33799 + }, + { + "epoch": 2.0982059718169968, + "grad_norm": 0.15370844047680013, + "learning_rate": 2.509075725881371e-05, + "loss": 2.8864, + "step": 33800 + }, + { + "epoch": 2.0982680489167547, + "grad_norm": 0.14202554958851912, + "learning_rate": 2.5087625876472477e-05, + "loss": 2.6988, + "step": 33801 + }, + { + "epoch": 2.0983301260165126, + "grad_norm": 0.16908957545743225, + "learning_rate": 2.5084494624105125e-05, + "loss": 2.7511, + "step": 33802 + }, + { + "epoch": 2.0983922031162705, + "grad_norm": 0.13826459525724252, + "learning_rate": 2.5081363501728032e-05, + "loss": 2.7727, + "step": 33803 + }, + { + "epoch": 2.0984542802160284, + "grad_norm": 0.14870713239350364, + "learning_rate": 2.5078232509357514e-05, + "loss": 2.8274, + "step": 33804 + }, + { + "epoch": 2.0985163573157863, + "grad_norm": 0.14713900837410696, + "learning_rate": 2.507510164700989e-05, + "loss": 2.6271, + "step": 33805 + }, + { + "epoch": 2.0985784344155443, + "grad_norm": 0.1453243801941575, + "learning_rate": 2.5071970914701533e-05, + "loss": 2.6758, + "step": 33806 + }, + { + "epoch": 2.098640511515302, + "grad_norm": 0.15039465019487105, + "learning_rate": 2.506884031244875e-05, + "loss": 2.8262, + "step": 33807 + }, + { + "epoch": 2.09870258861506, + "grad_norm": 0.13937251301543732, + "learning_rate": 2.5065709840267882e-05, + "loss": 2.7547, + "step": 33808 + }, + { + "epoch": 2.098764665714818, + "grad_norm": 0.16520238165298962, + "learning_rate": 2.5062579498175237e-05, + "loss": 2.7955, + "step": 33809 + }, + { + "epoch": 2.098826742814576, + "grad_norm": 0.15448969249422692, + "learning_rate": 2.5059449286187188e-05, + "loss": 2.8089, + "step": 33810 + }, + { + "epoch": 2.0988888199143334, + "grad_norm": 0.15232624051145982, + "learning_rate": 2.5056319204320045e-05, + "loss": 2.7492, + "step": 33811 + }, + { + "epoch": 2.0989508970140913, + "grad_norm": 0.16182163278430348, + "learning_rate": 2.5053189252590147e-05, + "loss": 2.7478, + "step": 33812 + }, + { + "epoch": 2.0990129741138492, + "grad_norm": 0.14806737682334895, + "learning_rate": 2.505005943101381e-05, + "loss": 2.701, + "step": 33813 + }, + { + "epoch": 2.099075051213607, + "grad_norm": 0.14715714139540786, + "learning_rate": 2.5046929739607355e-05, + "loss": 2.8277, + "step": 33814 + }, + { + "epoch": 2.099137128313365, + "grad_norm": 0.15827358113357728, + "learning_rate": 2.5043800178387143e-05, + "loss": 2.7584, + "step": 33815 + }, + { + "epoch": 2.099199205413123, + "grad_norm": 0.1608986685024899, + "learning_rate": 2.5040670747369482e-05, + "loss": 2.6662, + "step": 33816 + }, + { + "epoch": 2.099261282512881, + "grad_norm": 0.15007917554726943, + "learning_rate": 2.50375414465707e-05, + "loss": 2.7671, + "step": 33817 + }, + { + "epoch": 2.099323359612639, + "grad_norm": 0.17226450848379768, + "learning_rate": 2.5034412276007114e-05, + "loss": 2.6208, + "step": 33818 + }, + { + "epoch": 2.0993854367123967, + "grad_norm": 0.1480766729396571, + "learning_rate": 2.5031283235695068e-05, + "loss": 2.7373, + "step": 33819 + }, + { + "epoch": 2.0994475138121547, + "grad_norm": 0.15079179119164873, + "learning_rate": 2.5028154325650883e-05, + "loss": 2.7508, + "step": 33820 + }, + { + "epoch": 2.0995095909119126, + "grad_norm": 0.1421546263997226, + "learning_rate": 2.5025025545890878e-05, + "loss": 2.7076, + "step": 33821 + }, + { + "epoch": 2.0995716680116705, + "grad_norm": 0.1485355329540123, + "learning_rate": 2.502189689643138e-05, + "loss": 2.7505, + "step": 33822 + }, + { + "epoch": 2.0996337451114284, + "grad_norm": 0.14998721848113394, + "learning_rate": 2.5018768377288686e-05, + "loss": 2.7325, + "step": 33823 + }, + { + "epoch": 2.0996958222111863, + "grad_norm": 0.15296248575112154, + "learning_rate": 2.5015639988479167e-05, + "loss": 2.8093, + "step": 33824 + }, + { + "epoch": 2.0997578993109443, + "grad_norm": 0.14454267430904993, + "learning_rate": 2.5012511730019118e-05, + "loss": 2.7165, + "step": 33825 + }, + { + "epoch": 2.099819976410702, + "grad_norm": 0.17662978409404892, + "learning_rate": 2.5009383601924853e-05, + "loss": 2.7625, + "step": 33826 + }, + { + "epoch": 2.09988205351046, + "grad_norm": 0.14192856487469155, + "learning_rate": 2.5006255604212693e-05, + "loss": 2.7298, + "step": 33827 + }, + { + "epoch": 2.099944130610218, + "grad_norm": 0.1379852623956876, + "learning_rate": 2.5003127736898974e-05, + "loss": 2.754, + "step": 33828 + }, + { + "epoch": 2.100006207709976, + "grad_norm": 0.14172520993954513, + "learning_rate": 2.500000000000001e-05, + "loss": 2.7752, + "step": 33829 + }, + { + "epoch": 2.100068284809734, + "grad_norm": 0.14219058290220501, + "learning_rate": 2.499687239353212e-05, + "loss": 2.7347, + "step": 33830 + }, + { + "epoch": 2.1001303619094918, + "grad_norm": 0.14416721610445088, + "learning_rate": 2.4993744917511592e-05, + "loss": 2.7291, + "step": 33831 + }, + { + "epoch": 2.1001924390092497, + "grad_norm": 0.1416869793791055, + "learning_rate": 2.4990617571954784e-05, + "loss": 2.7705, + "step": 33832 + }, + { + "epoch": 2.1002545161090076, + "grad_norm": 0.15680946088607722, + "learning_rate": 2.4987490356877996e-05, + "loss": 2.6929, + "step": 33833 + }, + { + "epoch": 2.100316593208765, + "grad_norm": 0.14284020987701165, + "learning_rate": 2.4984363272297545e-05, + "loss": 2.7661, + "step": 33834 + }, + { + "epoch": 2.100378670308523, + "grad_norm": 0.14595565108161387, + "learning_rate": 2.498123631822973e-05, + "loss": 2.6971, + "step": 33835 + }, + { + "epoch": 2.100440747408281, + "grad_norm": 0.14209717145030107, + "learning_rate": 2.497810949469089e-05, + "loss": 2.751, + "step": 33836 + }, + { + "epoch": 2.100502824508039, + "grad_norm": 0.13859325517439214, + "learning_rate": 2.497498280169731e-05, + "loss": 2.6487, + "step": 33837 + }, + { + "epoch": 2.1005649016077967, + "grad_norm": 0.1439537224454416, + "learning_rate": 2.497185623926534e-05, + "loss": 2.7786, + "step": 33838 + }, + { + "epoch": 2.1006269787075547, + "grad_norm": 0.15969517677241268, + "learning_rate": 2.4968729807411266e-05, + "loss": 2.7766, + "step": 33839 + }, + { + "epoch": 2.1006890558073126, + "grad_norm": 0.1516267212748808, + "learning_rate": 2.4965603506151414e-05, + "loss": 2.741, + "step": 33840 + }, + { + "epoch": 2.1007511329070705, + "grad_norm": 0.15152754749291183, + "learning_rate": 2.496247733550206e-05, + "loss": 2.7582, + "step": 33841 + }, + { + "epoch": 2.1008132100068284, + "grad_norm": 0.14033280586766872, + "learning_rate": 2.4959351295479566e-05, + "loss": 2.7671, + "step": 33842 + }, + { + "epoch": 2.1008752871065863, + "grad_norm": 0.1449152124935009, + "learning_rate": 2.4956225386100208e-05, + "loss": 2.7339, + "step": 33843 + }, + { + "epoch": 2.1009373642063442, + "grad_norm": 0.14186121056465054, + "learning_rate": 2.4953099607380303e-05, + "loss": 2.7632, + "step": 33844 + }, + { + "epoch": 2.100999441306102, + "grad_norm": 0.14894182712849932, + "learning_rate": 2.4949973959336143e-05, + "loss": 2.6417, + "step": 33845 + }, + { + "epoch": 2.10106151840586, + "grad_norm": 0.16323435192011226, + "learning_rate": 2.4946848441984066e-05, + "loss": 2.7004, + "step": 33846 + }, + { + "epoch": 2.101123595505618, + "grad_norm": 0.14234600376266937, + "learning_rate": 2.494372305534036e-05, + "loss": 2.761, + "step": 33847 + }, + { + "epoch": 2.101185672605376, + "grad_norm": 0.1634258280410114, + "learning_rate": 2.494059779942134e-05, + "loss": 2.7167, + "step": 33848 + }, + { + "epoch": 2.101247749705134, + "grad_norm": 0.1545010495447262, + "learning_rate": 2.49374726742433e-05, + "loss": 2.753, + "step": 33849 + }, + { + "epoch": 2.1013098268048918, + "grad_norm": 0.1549998337990885, + "learning_rate": 2.4934347679822533e-05, + "loss": 2.7899, + "step": 33850 + }, + { + "epoch": 2.1013719039046497, + "grad_norm": 0.14081924983032354, + "learning_rate": 2.4931222816175377e-05, + "loss": 2.7014, + "step": 33851 + }, + { + "epoch": 2.1014339810044076, + "grad_norm": 0.15012057941544502, + "learning_rate": 2.4928098083318112e-05, + "loss": 2.7571, + "step": 33852 + }, + { + "epoch": 2.1014960581041655, + "grad_norm": 0.14393543820306212, + "learning_rate": 2.492497348126705e-05, + "loss": 2.7192, + "step": 33853 + }, + { + "epoch": 2.1015581352039234, + "grad_norm": 0.1465393916764609, + "learning_rate": 2.4921849010038472e-05, + "loss": 2.6987, + "step": 33854 + }, + { + "epoch": 2.1016202123036813, + "grad_norm": 0.1512995168716649, + "learning_rate": 2.491872466964871e-05, + "loss": 2.6717, + "step": 33855 + }, + { + "epoch": 2.1016822894034393, + "grad_norm": 0.1393059084365274, + "learning_rate": 2.4915600460114052e-05, + "loss": 2.7903, + "step": 33856 + }, + { + "epoch": 2.101744366503197, + "grad_norm": 0.14301534138470817, + "learning_rate": 2.49124763814508e-05, + "loss": 2.7899, + "step": 33857 + }, + { + "epoch": 2.1018064436029547, + "grad_norm": 0.14240929000251595, + "learning_rate": 2.4909352433675242e-05, + "loss": 2.6915, + "step": 33858 + }, + { + "epoch": 2.1018685207027126, + "grad_norm": 0.14667873839415427, + "learning_rate": 2.490622861680367e-05, + "loss": 2.6547, + "step": 33859 + }, + { + "epoch": 2.1019305978024705, + "grad_norm": 0.14041823179927274, + "learning_rate": 2.490310493085241e-05, + "loss": 2.7474, + "step": 33860 + }, + { + "epoch": 2.1019926749022284, + "grad_norm": 0.1461403685663054, + "learning_rate": 2.4899981375837745e-05, + "loss": 2.71, + "step": 33861 + }, + { + "epoch": 2.1020547520019863, + "grad_norm": 0.13741048478313003, + "learning_rate": 2.4896857951775972e-05, + "loss": 2.763, + "step": 33862 + }, + { + "epoch": 2.1021168291017442, + "grad_norm": 0.15433523449729208, + "learning_rate": 2.4893734658683365e-05, + "loss": 2.6955, + "step": 33863 + }, + { + "epoch": 2.102178906201502, + "grad_norm": 0.14034556069455711, + "learning_rate": 2.489061149657626e-05, + "loss": 2.7158, + "step": 33864 + }, + { + "epoch": 2.10224098330126, + "grad_norm": 0.14465869619946922, + "learning_rate": 2.4887488465470926e-05, + "loss": 2.7276, + "step": 33865 + }, + { + "epoch": 2.102303060401018, + "grad_norm": 0.13460360338099792, + "learning_rate": 2.4884365565383667e-05, + "loss": 2.7936, + "step": 33866 + }, + { + "epoch": 2.102365137500776, + "grad_norm": 0.13730239947652378, + "learning_rate": 2.4881242796330744e-05, + "loss": 2.7181, + "step": 33867 + }, + { + "epoch": 2.102427214600534, + "grad_norm": 0.14409171750445557, + "learning_rate": 2.487812015832848e-05, + "loss": 2.8004, + "step": 33868 + }, + { + "epoch": 2.1024892917002918, + "grad_norm": 0.14519551705876335, + "learning_rate": 2.4874997651393178e-05, + "loss": 2.6976, + "step": 33869 + }, + { + "epoch": 2.1025513688000497, + "grad_norm": 0.1558936525956826, + "learning_rate": 2.4871875275541112e-05, + "loss": 2.783, + "step": 33870 + }, + { + "epoch": 2.1026134458998076, + "grad_norm": 0.1443494349093355, + "learning_rate": 2.4868753030788577e-05, + "loss": 2.7007, + "step": 33871 + }, + { + "epoch": 2.1026755229995655, + "grad_norm": 0.1530279177793096, + "learning_rate": 2.4865630917151856e-05, + "loss": 2.7463, + "step": 33872 + }, + { + "epoch": 2.1027376000993234, + "grad_norm": 0.1464494154857238, + "learning_rate": 2.4862508934647216e-05, + "loss": 2.7058, + "step": 33873 + }, + { + "epoch": 2.1027996771990813, + "grad_norm": 0.14974764442859945, + "learning_rate": 2.4859387083291e-05, + "loss": 2.651, + "step": 33874 + }, + { + "epoch": 2.1028617542988393, + "grad_norm": 0.1465439421021356, + "learning_rate": 2.4856265363099452e-05, + "loss": 2.7001, + "step": 33875 + }, + { + "epoch": 2.102923831398597, + "grad_norm": 0.15694619883611205, + "learning_rate": 2.4853143774088877e-05, + "loss": 2.7299, + "step": 33876 + }, + { + "epoch": 2.102985908498355, + "grad_norm": 0.15380606876298902, + "learning_rate": 2.485002231627554e-05, + "loss": 2.7115, + "step": 33877 + }, + { + "epoch": 2.103047985598113, + "grad_norm": 0.13804775476310283, + "learning_rate": 2.4846900989675753e-05, + "loss": 2.7171, + "step": 33878 + }, + { + "epoch": 2.103110062697871, + "grad_norm": 0.16017470020924343, + "learning_rate": 2.48437797943058e-05, + "loss": 2.755, + "step": 33879 + }, + { + "epoch": 2.103172139797629, + "grad_norm": 0.14764272461358163, + "learning_rate": 2.484065873018195e-05, + "loss": 2.7629, + "step": 33880 + }, + { + "epoch": 2.1032342168973868, + "grad_norm": 0.16436668987807157, + "learning_rate": 2.4837537797320463e-05, + "loss": 2.7417, + "step": 33881 + }, + { + "epoch": 2.1032962939971442, + "grad_norm": 0.14540878228639426, + "learning_rate": 2.483441699573768e-05, + "loss": 2.721, + "step": 33882 + }, + { + "epoch": 2.103358371096902, + "grad_norm": 0.1475791899106007, + "learning_rate": 2.4831296325449848e-05, + "loss": 2.7172, + "step": 33883 + }, + { + "epoch": 2.10342044819666, + "grad_norm": 0.14981257207778834, + "learning_rate": 2.4828175786473258e-05, + "loss": 2.7996, + "step": 33884 + }, + { + "epoch": 2.103482525296418, + "grad_norm": 0.1416278354343262, + "learning_rate": 2.482505537882418e-05, + "loss": 2.7113, + "step": 33885 + }, + { + "epoch": 2.103544602396176, + "grad_norm": 0.14008691946273416, + "learning_rate": 2.4821935102518884e-05, + "loss": 2.7152, + "step": 33886 + }, + { + "epoch": 2.103606679495934, + "grad_norm": 0.14590943995894246, + "learning_rate": 2.4818814957573683e-05, + "loss": 2.7618, + "step": 33887 + }, + { + "epoch": 2.1036687565956917, + "grad_norm": 0.1632038649473312, + "learning_rate": 2.4815694944004837e-05, + "loss": 2.7607, + "step": 33888 + }, + { + "epoch": 2.1037308336954497, + "grad_norm": 0.14833450508681761, + "learning_rate": 2.481257506182862e-05, + "loss": 2.7151, + "step": 33889 + }, + { + "epoch": 2.1037929107952076, + "grad_norm": 0.13828207355189373, + "learning_rate": 2.4809455311061297e-05, + "loss": 2.7965, + "step": 33890 + }, + { + "epoch": 2.1038549878949655, + "grad_norm": 0.13749887780251763, + "learning_rate": 2.4806335691719186e-05, + "loss": 2.6743, + "step": 33891 + }, + { + "epoch": 2.1039170649947234, + "grad_norm": 0.1488636074671694, + "learning_rate": 2.480321620381853e-05, + "loss": 2.7923, + "step": 33892 + }, + { + "epoch": 2.1039791420944813, + "grad_norm": 0.13889807399206766, + "learning_rate": 2.4800096847375615e-05, + "loss": 2.6671, + "step": 33893 + }, + { + "epoch": 2.1040412191942393, + "grad_norm": 0.1470314111212112, + "learning_rate": 2.4796977622406708e-05, + "loss": 2.7804, + "step": 33894 + }, + { + "epoch": 2.104103296293997, + "grad_norm": 0.13837944009121994, + "learning_rate": 2.4793858528928076e-05, + "loss": 2.7662, + "step": 33895 + }, + { + "epoch": 2.104165373393755, + "grad_norm": 0.14717715158145278, + "learning_rate": 2.4790739566956018e-05, + "loss": 2.6788, + "step": 33896 + }, + { + "epoch": 2.104227450493513, + "grad_norm": 0.14760665767167738, + "learning_rate": 2.478762073650679e-05, + "loss": 2.7805, + "step": 33897 + }, + { + "epoch": 2.104289527593271, + "grad_norm": 0.14743327835082018, + "learning_rate": 2.4784502037596664e-05, + "loss": 2.7089, + "step": 33898 + }, + { + "epoch": 2.104351604693029, + "grad_norm": 0.1400812218595543, + "learning_rate": 2.4781383470241897e-05, + "loss": 2.6659, + "step": 33899 + }, + { + "epoch": 2.1044136817927868, + "grad_norm": 0.14226908046416067, + "learning_rate": 2.477826503445877e-05, + "loss": 2.6985, + "step": 33900 + }, + { + "epoch": 2.1044757588925447, + "grad_norm": 0.16688064843344355, + "learning_rate": 2.4775146730263582e-05, + "loss": 2.6965, + "step": 33901 + }, + { + "epoch": 2.1045378359923026, + "grad_norm": 0.14455293157694749, + "learning_rate": 2.4772028557672573e-05, + "loss": 2.6161, + "step": 33902 + }, + { + "epoch": 2.1045999130920605, + "grad_norm": 0.13959150554329128, + "learning_rate": 2.4768910516702014e-05, + "loss": 2.7292, + "step": 33903 + }, + { + "epoch": 2.1046619901918184, + "grad_norm": 0.1405028567740668, + "learning_rate": 2.476579260736816e-05, + "loss": 2.7269, + "step": 33904 + }, + { + "epoch": 2.1047240672915764, + "grad_norm": 0.15795028756917912, + "learning_rate": 2.476267482968731e-05, + "loss": 2.6802, + "step": 33905 + }, + { + "epoch": 2.104786144391334, + "grad_norm": 0.14457007479499956, + "learning_rate": 2.4759557183675708e-05, + "loss": 2.776, + "step": 33906 + }, + { + "epoch": 2.1048482214910917, + "grad_norm": 0.1367702182894918, + "learning_rate": 2.4756439669349625e-05, + "loss": 2.6867, + "step": 33907 + }, + { + "epoch": 2.1049102985908497, + "grad_norm": 0.1487111593174163, + "learning_rate": 2.4753322286725327e-05, + "loss": 2.8538, + "step": 33908 + }, + { + "epoch": 2.1049723756906076, + "grad_norm": 0.14425579447694395, + "learning_rate": 2.475020503581905e-05, + "loss": 2.6532, + "step": 33909 + }, + { + "epoch": 2.1050344527903655, + "grad_norm": 0.13832451068245932, + "learning_rate": 2.474708791664711e-05, + "loss": 2.6785, + "step": 33910 + }, + { + "epoch": 2.1050965298901234, + "grad_norm": 0.15857268367343347, + "learning_rate": 2.474397092922574e-05, + "loss": 2.7526, + "step": 33911 + }, + { + "epoch": 2.1051586069898813, + "grad_norm": 0.15194898589902944, + "learning_rate": 2.47408540735712e-05, + "loss": 2.6837, + "step": 33912 + }, + { + "epoch": 2.1052206840896392, + "grad_norm": 0.146292166174037, + "learning_rate": 2.473773734969974e-05, + "loss": 2.7174, + "step": 33913 + }, + { + "epoch": 2.105282761189397, + "grad_norm": 0.14943835740379913, + "learning_rate": 2.4734620757627657e-05, + "loss": 2.6928, + "step": 33914 + }, + { + "epoch": 2.105344838289155, + "grad_norm": 0.14494130885978485, + "learning_rate": 2.4731504297371194e-05, + "loss": 2.6825, + "step": 33915 + }, + { + "epoch": 2.105406915388913, + "grad_norm": 0.14949029959669488, + "learning_rate": 2.4728387968946602e-05, + "loss": 2.7029, + "step": 33916 + }, + { + "epoch": 2.105468992488671, + "grad_norm": 0.15077335406170195, + "learning_rate": 2.472527177237013e-05, + "loss": 2.7679, + "step": 33917 + }, + { + "epoch": 2.105531069588429, + "grad_norm": 0.1552450645963038, + "learning_rate": 2.472215570765807e-05, + "loss": 2.7681, + "step": 33918 + }, + { + "epoch": 2.1055931466881868, + "grad_norm": 0.1488387869973163, + "learning_rate": 2.4719039774826653e-05, + "loss": 2.6732, + "step": 33919 + }, + { + "epoch": 2.1056552237879447, + "grad_norm": 0.13452720283774072, + "learning_rate": 2.4715923973892148e-05, + "loss": 2.5905, + "step": 33920 + }, + { + "epoch": 2.1057173008877026, + "grad_norm": 0.16563697717653347, + "learning_rate": 2.4712808304870804e-05, + "loss": 2.7679, + "step": 33921 + }, + { + "epoch": 2.1057793779874605, + "grad_norm": 0.15388424541772455, + "learning_rate": 2.470969276777886e-05, + "loss": 2.6745, + "step": 33922 + }, + { + "epoch": 2.1058414550872184, + "grad_norm": 0.1886659273746275, + "learning_rate": 2.4706577362632605e-05, + "loss": 2.7209, + "step": 33923 + }, + { + "epoch": 2.1059035321869763, + "grad_norm": 0.1563202521225714, + "learning_rate": 2.4703462089448283e-05, + "loss": 2.7655, + "step": 33924 + }, + { + "epoch": 2.1059656092867343, + "grad_norm": 0.142333840424087, + "learning_rate": 2.470034694824213e-05, + "loss": 2.8054, + "step": 33925 + }, + { + "epoch": 2.106027686386492, + "grad_norm": 0.14981747066660855, + "learning_rate": 2.4697231939030398e-05, + "loss": 2.6941, + "step": 33926 + }, + { + "epoch": 2.10608976348625, + "grad_norm": 0.15229994192897206, + "learning_rate": 2.4694117061829364e-05, + "loss": 2.7152, + "step": 33927 + }, + { + "epoch": 2.106151840586008, + "grad_norm": 0.13775051985190828, + "learning_rate": 2.4691002316655265e-05, + "loss": 2.734, + "step": 33928 + }, + { + "epoch": 2.106213917685766, + "grad_norm": 0.14863601385370842, + "learning_rate": 2.4687887703524355e-05, + "loss": 2.6492, + "step": 33929 + }, + { + "epoch": 2.1062759947855234, + "grad_norm": 0.13829756306477198, + "learning_rate": 2.4684773222452855e-05, + "loss": 2.6631, + "step": 33930 + }, + { + "epoch": 2.1063380718852813, + "grad_norm": 0.14184689151968627, + "learning_rate": 2.4681658873457058e-05, + "loss": 2.7198, + "step": 33931 + }, + { + "epoch": 2.1064001489850392, + "grad_norm": 0.18737220382639944, + "learning_rate": 2.46785446565532e-05, + "loss": 2.7172, + "step": 33932 + }, + { + "epoch": 2.106462226084797, + "grad_norm": 0.1439647785621861, + "learning_rate": 2.4675430571757495e-05, + "loss": 2.7176, + "step": 33933 + }, + { + "epoch": 2.106524303184555, + "grad_norm": 0.13730463106634463, + "learning_rate": 2.467231661908624e-05, + "loss": 2.7018, + "step": 33934 + }, + { + "epoch": 2.106586380284313, + "grad_norm": 0.13798878555583657, + "learning_rate": 2.4669202798555652e-05, + "loss": 2.6608, + "step": 33935 + }, + { + "epoch": 2.106648457384071, + "grad_norm": 0.14255841001307235, + "learning_rate": 2.4666089110181965e-05, + "loss": 2.7972, + "step": 33936 + }, + { + "epoch": 2.106710534483829, + "grad_norm": 0.15979481829088715, + "learning_rate": 2.4662975553981464e-05, + "loss": 2.726, + "step": 33937 + }, + { + "epoch": 2.1067726115835868, + "grad_norm": 0.14764699503236745, + "learning_rate": 2.465986212997037e-05, + "loss": 2.7416, + "step": 33938 + }, + { + "epoch": 2.1068346886833447, + "grad_norm": 0.1514193475744777, + "learning_rate": 2.4656748838164924e-05, + "loss": 2.6733, + "step": 33939 + }, + { + "epoch": 2.1068967657831026, + "grad_norm": 0.14579518633849695, + "learning_rate": 2.4653635678581357e-05, + "loss": 2.7827, + "step": 33940 + }, + { + "epoch": 2.1069588428828605, + "grad_norm": 0.14450397105147597, + "learning_rate": 2.4650522651235936e-05, + "loss": 2.778, + "step": 33941 + }, + { + "epoch": 2.1070209199826184, + "grad_norm": 0.13952466508122505, + "learning_rate": 2.46474097561449e-05, + "loss": 2.6317, + "step": 33942 + }, + { + "epoch": 2.1070829970823763, + "grad_norm": 0.1467558316464397, + "learning_rate": 2.4644296993324478e-05, + "loss": 2.6688, + "step": 33943 + }, + { + "epoch": 2.1071450741821343, + "grad_norm": 0.14508492507774418, + "learning_rate": 2.4641184362790913e-05, + "loss": 2.7522, + "step": 33944 + }, + { + "epoch": 2.107207151281892, + "grad_norm": 0.14411897108978447, + "learning_rate": 2.4638071864560432e-05, + "loss": 2.7129, + "step": 33945 + }, + { + "epoch": 2.10726922838165, + "grad_norm": 0.14298391730729573, + "learning_rate": 2.46349594986493e-05, + "loss": 2.7242, + "step": 33946 + }, + { + "epoch": 2.107331305481408, + "grad_norm": 0.14046045451763087, + "learning_rate": 2.4631847265073742e-05, + "loss": 2.7313, + "step": 33947 + }, + { + "epoch": 2.107393382581166, + "grad_norm": 0.14119328801405956, + "learning_rate": 2.4628735163850002e-05, + "loss": 2.733, + "step": 33948 + }, + { + "epoch": 2.107455459680924, + "grad_norm": 0.13620310682931394, + "learning_rate": 2.4625623194994286e-05, + "loss": 2.6322, + "step": 33949 + }, + { + "epoch": 2.1075175367806818, + "grad_norm": 0.14952395642588814, + "learning_rate": 2.462251135852287e-05, + "loss": 2.8008, + "step": 33950 + }, + { + "epoch": 2.1075796138804397, + "grad_norm": 0.13985522505495476, + "learning_rate": 2.4619399654451974e-05, + "loss": 2.7703, + "step": 33951 + }, + { + "epoch": 2.1076416909801976, + "grad_norm": 0.14497365771861423, + "learning_rate": 2.4616288082797834e-05, + "loss": 2.7708, + "step": 33952 + }, + { + "epoch": 2.107703768079955, + "grad_norm": 0.1414551010368837, + "learning_rate": 2.4613176643576664e-05, + "loss": 2.7285, + "step": 33953 + }, + { + "epoch": 2.107765845179713, + "grad_norm": 0.13831917140417832, + "learning_rate": 2.4610065336804726e-05, + "loss": 2.7214, + "step": 33954 + }, + { + "epoch": 2.107827922279471, + "grad_norm": 0.1488787842685167, + "learning_rate": 2.4606954162498247e-05, + "loss": 2.7442, + "step": 33955 + }, + { + "epoch": 2.107889999379229, + "grad_norm": 0.15065146404680824, + "learning_rate": 2.4603843120673454e-05, + "loss": 2.733, + "step": 33956 + }, + { + "epoch": 2.1079520764789867, + "grad_norm": 1.7654008301069712, + "learning_rate": 2.4600732211346573e-05, + "loss": 2.7824, + "step": 33957 + }, + { + "epoch": 2.1080141535787447, + "grad_norm": 0.15587802956978297, + "learning_rate": 2.4597621434533818e-05, + "loss": 2.7683, + "step": 33958 + }, + { + "epoch": 2.1080762306785026, + "grad_norm": 0.14241560727053354, + "learning_rate": 2.4594510790251457e-05, + "loss": 2.744, + "step": 33959 + }, + { + "epoch": 2.1081383077782605, + "grad_norm": 0.15165318573151843, + "learning_rate": 2.4591400278515702e-05, + "loss": 2.7751, + "step": 33960 + }, + { + "epoch": 2.1082003848780184, + "grad_norm": 0.1630354534749416, + "learning_rate": 2.458828989934278e-05, + "loss": 2.6442, + "step": 33961 + }, + { + "epoch": 2.1082624619777763, + "grad_norm": 0.14482463880337834, + "learning_rate": 2.45851796527489e-05, + "loss": 2.7426, + "step": 33962 + }, + { + "epoch": 2.1083245390775343, + "grad_norm": 0.14858523345432273, + "learning_rate": 2.458206953875033e-05, + "loss": 2.749, + "step": 33963 + }, + { + "epoch": 2.108386616177292, + "grad_norm": 0.1452926799116457, + "learning_rate": 2.457895955736327e-05, + "loss": 2.6546, + "step": 33964 + }, + { + "epoch": 2.10844869327705, + "grad_norm": 0.15214637049787752, + "learning_rate": 2.457584970860395e-05, + "loss": 2.8188, + "step": 33965 + }, + { + "epoch": 2.108510770376808, + "grad_norm": 0.1453185610891483, + "learning_rate": 2.4572739992488568e-05, + "loss": 2.6991, + "step": 33966 + }, + { + "epoch": 2.108572847476566, + "grad_norm": 0.144913207357887, + "learning_rate": 2.4569630409033384e-05, + "loss": 2.7206, + "step": 33967 + }, + { + "epoch": 2.108634924576324, + "grad_norm": 0.14138528816014762, + "learning_rate": 2.4566520958254625e-05, + "loss": 2.7004, + "step": 33968 + }, + { + "epoch": 2.1086970016760818, + "grad_norm": 0.15585043601882861, + "learning_rate": 2.4563411640168504e-05, + "loss": 2.7919, + "step": 33969 + }, + { + "epoch": 2.1087590787758397, + "grad_norm": 0.1571631273993795, + "learning_rate": 2.4560302454791235e-05, + "loss": 2.7454, + "step": 33970 + }, + { + "epoch": 2.1088211558755976, + "grad_norm": 0.16117802765826283, + "learning_rate": 2.455719340213905e-05, + "loss": 2.7105, + "step": 33971 + }, + { + "epoch": 2.1088832329753555, + "grad_norm": 0.1645408310414561, + "learning_rate": 2.455408448222814e-05, + "loss": 2.6737, + "step": 33972 + }, + { + "epoch": 2.1089453100751134, + "grad_norm": 0.14823794126397496, + "learning_rate": 2.455097569507477e-05, + "loss": 2.7111, + "step": 33973 + }, + { + "epoch": 2.1090073871748714, + "grad_norm": 0.15118323192272726, + "learning_rate": 2.4547867040695134e-05, + "loss": 2.8125, + "step": 33974 + }, + { + "epoch": 2.1090694642746293, + "grad_norm": 0.1454105884450788, + "learning_rate": 2.4544758519105454e-05, + "loss": 2.7, + "step": 33975 + }, + { + "epoch": 2.109131541374387, + "grad_norm": 0.14234332778650302, + "learning_rate": 2.4541650130321937e-05, + "loss": 2.7175, + "step": 33976 + }, + { + "epoch": 2.109193618474145, + "grad_norm": 0.17703362543274956, + "learning_rate": 2.453854187436082e-05, + "loss": 2.7796, + "step": 33977 + }, + { + "epoch": 2.1092556955739026, + "grad_norm": 0.14517808203873625, + "learning_rate": 2.4535433751238318e-05, + "loss": 2.7673, + "step": 33978 + }, + { + "epoch": 2.1093177726736605, + "grad_norm": 0.17289300734218385, + "learning_rate": 2.4532325760970636e-05, + "loss": 2.7807, + "step": 33979 + }, + { + "epoch": 2.1093798497734184, + "grad_norm": 0.15187140132816293, + "learning_rate": 2.4529217903573974e-05, + "loss": 2.7144, + "step": 33980 + }, + { + "epoch": 2.1094419268731763, + "grad_norm": 0.1537420522469306, + "learning_rate": 2.4526110179064594e-05, + "loss": 2.7292, + "step": 33981 + }, + { + "epoch": 2.1095040039729342, + "grad_norm": 0.1558210385531104, + "learning_rate": 2.452300258745867e-05, + "loss": 2.7939, + "step": 33982 + }, + { + "epoch": 2.109566081072692, + "grad_norm": 0.15321551221795027, + "learning_rate": 2.451989512877243e-05, + "loss": 2.6727, + "step": 33983 + }, + { + "epoch": 2.10962815817245, + "grad_norm": 0.14713181150852742, + "learning_rate": 2.4516787803022084e-05, + "loss": 2.6361, + "step": 33984 + }, + { + "epoch": 2.109690235272208, + "grad_norm": 0.1479856616167573, + "learning_rate": 2.451368061022382e-05, + "loss": 2.7067, + "step": 33985 + }, + { + "epoch": 2.109752312371966, + "grad_norm": 0.1462834888104384, + "learning_rate": 2.4510573550393896e-05, + "loss": 2.7167, + "step": 33986 + }, + { + "epoch": 2.109814389471724, + "grad_norm": 0.16081181544274203, + "learning_rate": 2.450746662354849e-05, + "loss": 2.7299, + "step": 33987 + }, + { + "epoch": 2.1098764665714818, + "grad_norm": 0.15326616825486214, + "learning_rate": 2.4504359829703822e-05, + "loss": 2.7179, + "step": 33988 + }, + { + "epoch": 2.1099385436712397, + "grad_norm": 0.16119061213650987, + "learning_rate": 2.4501253168876083e-05, + "loss": 2.7753, + "step": 33989 + }, + { + "epoch": 2.1100006207709976, + "grad_norm": 0.14777881893853415, + "learning_rate": 2.449814664108151e-05, + "loss": 2.7383, + "step": 33990 + }, + { + "epoch": 2.1100626978707555, + "grad_norm": 0.1497142493889878, + "learning_rate": 2.4495040246336298e-05, + "loss": 2.8506, + "step": 33991 + }, + { + "epoch": 2.1101247749705134, + "grad_norm": 0.1441393319772343, + "learning_rate": 2.4491933984656657e-05, + "loss": 2.7238, + "step": 33992 + }, + { + "epoch": 2.1101868520702713, + "grad_norm": 0.14961999526812128, + "learning_rate": 2.4488827856058784e-05, + "loss": 2.7597, + "step": 33993 + }, + { + "epoch": 2.1102489291700293, + "grad_norm": 0.16110291637164217, + "learning_rate": 2.448572186055887e-05, + "loss": 2.7492, + "step": 33994 + }, + { + "epoch": 2.110311006269787, + "grad_norm": 0.17027063861462238, + "learning_rate": 2.448261599817316e-05, + "loss": 2.7379, + "step": 33995 + }, + { + "epoch": 2.110373083369545, + "grad_norm": 0.15134896814769924, + "learning_rate": 2.447951026891784e-05, + "loss": 2.739, + "step": 33996 + }, + { + "epoch": 2.110435160469303, + "grad_norm": 0.1577866157700523, + "learning_rate": 2.44764046728091e-05, + "loss": 2.7071, + "step": 33997 + }, + { + "epoch": 2.110497237569061, + "grad_norm": 0.15053330502325904, + "learning_rate": 2.4473299209863142e-05, + "loss": 2.7049, + "step": 33998 + }, + { + "epoch": 2.110559314668819, + "grad_norm": 0.1486350426525376, + "learning_rate": 2.4470193880096176e-05, + "loss": 2.7388, + "step": 33999 + }, + { + "epoch": 2.1106213917685768, + "grad_norm": 0.16289148902206516, + "learning_rate": 2.446708868352443e-05, + "loss": 2.7086, + "step": 34000 + }, + { + "epoch": 2.1106834688683342, + "grad_norm": 0.1698776494493953, + "learning_rate": 2.4463983620164078e-05, + "loss": 2.7511, + "step": 34001 + }, + { + "epoch": 2.110745545968092, + "grad_norm": 0.15700820900896928, + "learning_rate": 2.446087869003132e-05, + "loss": 2.6574, + "step": 34002 + }, + { + "epoch": 2.11080762306785, + "grad_norm": 0.14510469472720128, + "learning_rate": 2.445777389314234e-05, + "loss": 2.7339, + "step": 34003 + }, + { + "epoch": 2.110869700167608, + "grad_norm": 0.13805900726588866, + "learning_rate": 2.4454669229513372e-05, + "loss": 2.7492, + "step": 34004 + }, + { + "epoch": 2.110931777267366, + "grad_norm": 0.1656798332693272, + "learning_rate": 2.4451564699160595e-05, + "loss": 2.6971, + "step": 34005 + }, + { + "epoch": 2.110993854367124, + "grad_norm": 0.1455002853187569, + "learning_rate": 2.444846030210021e-05, + "loss": 2.7014, + "step": 34006 + }, + { + "epoch": 2.1110559314668818, + "grad_norm": 0.15114735052012, + "learning_rate": 2.444535603834841e-05, + "loss": 2.6884, + "step": 34007 + }, + { + "epoch": 2.1111180085666397, + "grad_norm": 0.1410191033586862, + "learning_rate": 2.444225190792137e-05, + "loss": 2.8009, + "step": 34008 + }, + { + "epoch": 2.1111800856663976, + "grad_norm": 0.15777872403971835, + "learning_rate": 2.443914791083533e-05, + "loss": 2.7775, + "step": 34009 + }, + { + "epoch": 2.1112421627661555, + "grad_norm": 0.14536499208465517, + "learning_rate": 2.443604404710646e-05, + "loss": 2.7071, + "step": 34010 + }, + { + "epoch": 2.1113042398659134, + "grad_norm": 0.14836868146459864, + "learning_rate": 2.4432940316750953e-05, + "loss": 2.7357, + "step": 34011 + }, + { + "epoch": 2.1113663169656713, + "grad_norm": 0.14409825831797846, + "learning_rate": 2.4429836719784983e-05, + "loss": 2.7269, + "step": 34012 + }, + { + "epoch": 2.1114283940654293, + "grad_norm": 0.1502927176589306, + "learning_rate": 2.4426733256224778e-05, + "loss": 2.7497, + "step": 34013 + }, + { + "epoch": 2.111490471165187, + "grad_norm": 0.16011878482555117, + "learning_rate": 2.4423629926086516e-05, + "loss": 2.7885, + "step": 34014 + }, + { + "epoch": 2.111552548264945, + "grad_norm": 0.14675668836319966, + "learning_rate": 2.4420526729386385e-05, + "loss": 2.7598, + "step": 34015 + }, + { + "epoch": 2.111614625364703, + "grad_norm": 0.1614898771318484, + "learning_rate": 2.441742366614056e-05, + "loss": 2.676, + "step": 34016 + }, + { + "epoch": 2.111676702464461, + "grad_norm": 0.16773221638627084, + "learning_rate": 2.441432073636526e-05, + "loss": 2.8188, + "step": 34017 + }, + { + "epoch": 2.111738779564219, + "grad_norm": 0.14327491694258102, + "learning_rate": 2.4411217940076665e-05, + "loss": 2.7359, + "step": 34018 + }, + { + "epoch": 2.1118008566639768, + "grad_norm": 0.14076030367916334, + "learning_rate": 2.4408115277290955e-05, + "loss": 2.6553, + "step": 34019 + }, + { + "epoch": 2.1118629337637347, + "grad_norm": 0.16252836979296373, + "learning_rate": 2.440501274802432e-05, + "loss": 2.7714, + "step": 34020 + }, + { + "epoch": 2.1119250108634926, + "grad_norm": 0.14741395997802947, + "learning_rate": 2.4401910352292927e-05, + "loss": 2.7618, + "step": 34021 + }, + { + "epoch": 2.1119870879632505, + "grad_norm": 0.15204561233309832, + "learning_rate": 2.4398808090113006e-05, + "loss": 2.6703, + "step": 34022 + }, + { + "epoch": 2.1120491650630084, + "grad_norm": 0.1503330591183246, + "learning_rate": 2.4395705961500713e-05, + "loss": 2.7438, + "step": 34023 + }, + { + "epoch": 2.1121112421627664, + "grad_norm": 0.14151891072019887, + "learning_rate": 2.4392603966472242e-05, + "loss": 2.6262, + "step": 34024 + }, + { + "epoch": 2.1121733192625243, + "grad_norm": 0.15041297275911475, + "learning_rate": 2.438950210504375e-05, + "loss": 2.8412, + "step": 34025 + }, + { + "epoch": 2.1122353963622817, + "grad_norm": 0.15634319387211576, + "learning_rate": 2.438640037723146e-05, + "loss": 2.7591, + "step": 34026 + }, + { + "epoch": 2.1122974734620397, + "grad_norm": 0.1468461147778751, + "learning_rate": 2.4383298783051543e-05, + "loss": 2.6937, + "step": 34027 + }, + { + "epoch": 2.1123595505617976, + "grad_norm": 0.14611807482978817, + "learning_rate": 2.438019732252017e-05, + "loss": 2.7583, + "step": 34028 + }, + { + "epoch": 2.1124216276615555, + "grad_norm": 0.14779529811172376, + "learning_rate": 2.437709599565353e-05, + "loss": 2.7428, + "step": 34029 + }, + { + "epoch": 2.1124837047613134, + "grad_norm": 0.15466884852199012, + "learning_rate": 2.437399480246778e-05, + "loss": 2.7265, + "step": 34030 + }, + { + "epoch": 2.1125457818610713, + "grad_norm": 0.16877603594942756, + "learning_rate": 2.4370893742979122e-05, + "loss": 2.7449, + "step": 34031 + }, + { + "epoch": 2.1126078589608293, + "grad_norm": 0.14023414574607027, + "learning_rate": 2.436779281720375e-05, + "loss": 2.6055, + "step": 34032 + }, + { + "epoch": 2.112669936060587, + "grad_norm": 0.15738491708347685, + "learning_rate": 2.4364692025157826e-05, + "loss": 2.7635, + "step": 34033 + }, + { + "epoch": 2.112732013160345, + "grad_norm": 0.14522762332138708, + "learning_rate": 2.436159136685752e-05, + "loss": 2.7038, + "step": 34034 + }, + { + "epoch": 2.112794090260103, + "grad_norm": 0.15355546497428596, + "learning_rate": 2.435849084231901e-05, + "loss": 2.7154, + "step": 34035 + }, + { + "epoch": 2.112856167359861, + "grad_norm": 0.14222964720614337, + "learning_rate": 2.4355390451558485e-05, + "loss": 2.6422, + "step": 34036 + }, + { + "epoch": 2.112918244459619, + "grad_norm": 0.15477564929923934, + "learning_rate": 2.435229019459212e-05, + "loss": 2.6419, + "step": 34037 + }, + { + "epoch": 2.1129803215593768, + "grad_norm": 0.15269617739647418, + "learning_rate": 2.4349190071436085e-05, + "loss": 2.728, + "step": 34038 + }, + { + "epoch": 2.1130423986591347, + "grad_norm": 0.18071214190794135, + "learning_rate": 2.4346090082106525e-05, + "loss": 2.717, + "step": 34039 + }, + { + "epoch": 2.1131044757588926, + "grad_norm": 0.16908990592901643, + "learning_rate": 2.4342990226619667e-05, + "loss": 2.8201, + "step": 34040 + }, + { + "epoch": 2.1131665528586505, + "grad_norm": 0.15505135446859167, + "learning_rate": 2.4339890504991658e-05, + "loss": 2.7154, + "step": 34041 + }, + { + "epoch": 2.1132286299584084, + "grad_norm": 0.15326216378602506, + "learning_rate": 2.4336790917238666e-05, + "loss": 2.7337, + "step": 34042 + }, + { + "epoch": 2.1132907070581664, + "grad_norm": 0.16716559369075498, + "learning_rate": 2.4333691463376868e-05, + "loss": 2.7751, + "step": 34043 + }, + { + "epoch": 2.1133527841579243, + "grad_norm": 0.1559190224878676, + "learning_rate": 2.433059214342241e-05, + "loss": 2.6921, + "step": 34044 + }, + { + "epoch": 2.113414861257682, + "grad_norm": 0.14156606821274706, + "learning_rate": 2.4327492957391506e-05, + "loss": 2.786, + "step": 34045 + }, + { + "epoch": 2.11347693835744, + "grad_norm": 0.14324525973913654, + "learning_rate": 2.43243939053003e-05, + "loss": 2.7176, + "step": 34046 + }, + { + "epoch": 2.113539015457198, + "grad_norm": 0.14950051642571116, + "learning_rate": 2.4321294987164967e-05, + "loss": 2.783, + "step": 34047 + }, + { + "epoch": 2.113601092556956, + "grad_norm": 0.16321883811196028, + "learning_rate": 2.4318196203001654e-05, + "loss": 2.6994, + "step": 34048 + }, + { + "epoch": 2.1136631696567134, + "grad_norm": 0.15967458308807386, + "learning_rate": 2.431509755282656e-05, + "loss": 2.7801, + "step": 34049 + }, + { + "epoch": 2.1137252467564713, + "grad_norm": 0.14039443904045487, + "learning_rate": 2.4311999036655843e-05, + "loss": 2.7245, + "step": 34050 + }, + { + "epoch": 2.1137873238562292, + "grad_norm": 0.14493965106739784, + "learning_rate": 2.4308900654505662e-05, + "loss": 2.7062, + "step": 34051 + }, + { + "epoch": 2.113849400955987, + "grad_norm": 0.15046045614223294, + "learning_rate": 2.430580240639216e-05, + "loss": 2.7566, + "step": 34052 + }, + { + "epoch": 2.113911478055745, + "grad_norm": 0.14301560837837154, + "learning_rate": 2.4302704292331546e-05, + "loss": 2.7755, + "step": 34053 + }, + { + "epoch": 2.113973555155503, + "grad_norm": 0.17063373774732676, + "learning_rate": 2.429960631233996e-05, + "loss": 2.6056, + "step": 34054 + }, + { + "epoch": 2.114035632255261, + "grad_norm": 0.1575839145809131, + "learning_rate": 2.4296508466433572e-05, + "loss": 2.6785, + "step": 34055 + }, + { + "epoch": 2.114097709355019, + "grad_norm": 0.14788397681865004, + "learning_rate": 2.4293410754628538e-05, + "loss": 2.6681, + "step": 34056 + }, + { + "epoch": 2.1141597864547768, + "grad_norm": 0.15838520145265741, + "learning_rate": 2.4290313176940997e-05, + "loss": 2.7892, + "step": 34057 + }, + { + "epoch": 2.1142218635545347, + "grad_norm": 0.14151346162555722, + "learning_rate": 2.4287215733387155e-05, + "loss": 2.7504, + "step": 34058 + }, + { + "epoch": 2.1142839406542926, + "grad_norm": 0.13931232631992554, + "learning_rate": 2.4284118423983147e-05, + "loss": 2.7374, + "step": 34059 + }, + { + "epoch": 2.1143460177540505, + "grad_norm": 0.14809563445288693, + "learning_rate": 2.428102124874514e-05, + "loss": 2.7312, + "step": 34060 + }, + { + "epoch": 2.1144080948538084, + "grad_norm": 0.15068469464338116, + "learning_rate": 2.427792420768927e-05, + "loss": 2.7341, + "step": 34061 + }, + { + "epoch": 2.1144701719535663, + "grad_norm": 0.14335356786332396, + "learning_rate": 2.427482730083173e-05, + "loss": 2.7824, + "step": 34062 + }, + { + "epoch": 2.1145322490533243, + "grad_norm": 0.17101120089598032, + "learning_rate": 2.4271730528188664e-05, + "loss": 2.7495, + "step": 34063 + }, + { + "epoch": 2.114594326153082, + "grad_norm": 0.1600591027634017, + "learning_rate": 2.4268633889776205e-05, + "loss": 2.7073, + "step": 34064 + }, + { + "epoch": 2.11465640325284, + "grad_norm": 0.14612050321486522, + "learning_rate": 2.4265537385610548e-05, + "loss": 2.7102, + "step": 34065 + }, + { + "epoch": 2.114718480352598, + "grad_norm": 0.16809822792014698, + "learning_rate": 2.4262441015707808e-05, + "loss": 2.7599, + "step": 34066 + }, + { + "epoch": 2.114780557452356, + "grad_norm": 0.15434439446468623, + "learning_rate": 2.4259344780084185e-05, + "loss": 2.6801, + "step": 34067 + }, + { + "epoch": 2.114842634552114, + "grad_norm": 0.17064798804588388, + "learning_rate": 2.425624867875581e-05, + "loss": 2.7622, + "step": 34068 + }, + { + "epoch": 2.1149047116518718, + "grad_norm": 0.15148810202551893, + "learning_rate": 2.425315271173883e-05, + "loss": 2.7593, + "step": 34069 + }, + { + "epoch": 2.1149667887516297, + "grad_norm": 0.1641930787104746, + "learning_rate": 2.425005687904941e-05, + "loss": 2.8455, + "step": 34070 + }, + { + "epoch": 2.1150288658513876, + "grad_norm": 0.1503243981726022, + "learning_rate": 2.424696118070367e-05, + "loss": 2.569, + "step": 34071 + }, + { + "epoch": 2.1150909429511455, + "grad_norm": 0.14224636986614153, + "learning_rate": 2.4243865616717802e-05, + "loss": 2.7179, + "step": 34072 + }, + { + "epoch": 2.1151530200509034, + "grad_norm": 0.15006901052580335, + "learning_rate": 2.4240770187107943e-05, + "loss": 2.7731, + "step": 34073 + }, + { + "epoch": 2.115215097150661, + "grad_norm": 0.15105937300410024, + "learning_rate": 2.423767489189024e-05, + "loss": 2.7904, + "step": 34074 + }, + { + "epoch": 2.115277174250419, + "grad_norm": 0.14780215391480483, + "learning_rate": 2.423457973108082e-05, + "loss": 2.7651, + "step": 34075 + }, + { + "epoch": 2.1153392513501768, + "grad_norm": 0.14624143800574493, + "learning_rate": 2.4231484704695874e-05, + "loss": 2.7583, + "step": 34076 + }, + { + "epoch": 2.1154013284499347, + "grad_norm": 0.14540562465720996, + "learning_rate": 2.422838981275152e-05, + "loss": 2.7539, + "step": 34077 + }, + { + "epoch": 2.1154634055496926, + "grad_norm": 0.14113039355154353, + "learning_rate": 2.4225295055263926e-05, + "loss": 2.6633, + "step": 34078 + }, + { + "epoch": 2.1155254826494505, + "grad_norm": 0.15205180486923756, + "learning_rate": 2.4222200432249214e-05, + "loss": 2.7954, + "step": 34079 + }, + { + "epoch": 2.1155875597492084, + "grad_norm": 0.13744583932618093, + "learning_rate": 2.4219105943723524e-05, + "loss": 2.5966, + "step": 34080 + }, + { + "epoch": 2.1156496368489663, + "grad_norm": 0.14116909151019497, + "learning_rate": 2.4216011589703037e-05, + "loss": 2.791, + "step": 34081 + }, + { + "epoch": 2.1157117139487243, + "grad_norm": 0.13735872076419203, + "learning_rate": 2.4212917370203874e-05, + "loss": 2.6657, + "step": 34082 + }, + { + "epoch": 2.115773791048482, + "grad_norm": 0.1363742438845211, + "learning_rate": 2.4209823285242183e-05, + "loss": 2.6881, + "step": 34083 + }, + { + "epoch": 2.11583586814824, + "grad_norm": 0.14339794632701047, + "learning_rate": 2.420672933483409e-05, + "loss": 2.681, + "step": 34084 + }, + { + "epoch": 2.115897945247998, + "grad_norm": 0.15449957872112638, + "learning_rate": 2.4203635518995764e-05, + "loss": 2.768, + "step": 34085 + }, + { + "epoch": 2.115960022347756, + "grad_norm": 0.14253590886356074, + "learning_rate": 2.4200541837743336e-05, + "loss": 2.7724, + "step": 34086 + }, + { + "epoch": 2.116022099447514, + "grad_norm": 0.15341797530613138, + "learning_rate": 2.4197448291092943e-05, + "loss": 2.713, + "step": 34087 + }, + { + "epoch": 2.1160841765472718, + "grad_norm": 0.14561062579165762, + "learning_rate": 2.419435487906071e-05, + "loss": 2.7301, + "step": 34088 + }, + { + "epoch": 2.1161462536470297, + "grad_norm": 0.1436598451005619, + "learning_rate": 2.4191261601662813e-05, + "loss": 2.7336, + "step": 34089 + }, + { + "epoch": 2.1162083307467876, + "grad_norm": 0.1684833395497272, + "learning_rate": 2.4188168458915362e-05, + "loss": 2.7531, + "step": 34090 + }, + { + "epoch": 2.1162704078465455, + "grad_norm": 0.15653813736016756, + "learning_rate": 2.4185075450834504e-05, + "loss": 2.7475, + "step": 34091 + }, + { + "epoch": 2.1163324849463034, + "grad_norm": 0.1506391366976696, + "learning_rate": 2.418198257743638e-05, + "loss": 2.7082, + "step": 34092 + }, + { + "epoch": 2.1163945620460614, + "grad_norm": 0.14303005134158603, + "learning_rate": 2.4178889838737096e-05, + "loss": 2.7982, + "step": 34093 + }, + { + "epoch": 2.1164566391458193, + "grad_norm": 0.15142838256286273, + "learning_rate": 2.4175797234752835e-05, + "loss": 2.7306, + "step": 34094 + }, + { + "epoch": 2.116518716245577, + "grad_norm": 0.15114987062884985, + "learning_rate": 2.4172704765499705e-05, + "loss": 2.7719, + "step": 34095 + }, + { + "epoch": 2.116580793345335, + "grad_norm": 0.14772875257189344, + "learning_rate": 2.416961243099385e-05, + "loss": 2.6631, + "step": 34096 + }, + { + "epoch": 2.1166428704450926, + "grad_norm": 0.14418601352659594, + "learning_rate": 2.4166520231251373e-05, + "loss": 2.7916, + "step": 34097 + }, + { + "epoch": 2.1167049475448505, + "grad_norm": 0.15306093666396683, + "learning_rate": 2.416342816628843e-05, + "loss": 2.8156, + "step": 34098 + }, + { + "epoch": 2.1167670246446084, + "grad_norm": 0.15098201598746766, + "learning_rate": 2.416033623612118e-05, + "loss": 2.6972, + "step": 34099 + }, + { + "epoch": 2.1168291017443663, + "grad_norm": 0.1565643605294912, + "learning_rate": 2.415724444076572e-05, + "loss": 2.6067, + "step": 34100 + }, + { + "epoch": 2.1168911788441243, + "grad_norm": 0.15073453298065964, + "learning_rate": 2.4154152780238194e-05, + "loss": 2.6942, + "step": 34101 + }, + { + "epoch": 2.116953255943882, + "grad_norm": 0.1426014815756459, + "learning_rate": 2.415106125455471e-05, + "loss": 2.7062, + "step": 34102 + }, + { + "epoch": 2.11701533304364, + "grad_norm": 0.1625272810483828, + "learning_rate": 2.4147969863731428e-05, + "loss": 2.7375, + "step": 34103 + }, + { + "epoch": 2.117077410143398, + "grad_norm": 0.1409658037553033, + "learning_rate": 2.4144878607784473e-05, + "loss": 2.6883, + "step": 34104 + }, + { + "epoch": 2.117139487243156, + "grad_norm": 0.1507064858808844, + "learning_rate": 2.4141787486729954e-05, + "loss": 2.7594, + "step": 34105 + }, + { + "epoch": 2.117201564342914, + "grad_norm": 0.17281093227648014, + "learning_rate": 2.4138696500584006e-05, + "loss": 2.6635, + "step": 34106 + }, + { + "epoch": 2.1172636414426718, + "grad_norm": 0.15024258399233853, + "learning_rate": 2.4135605649362742e-05, + "loss": 2.6733, + "step": 34107 + }, + { + "epoch": 2.1173257185424297, + "grad_norm": 0.16647405720409558, + "learning_rate": 2.4132514933082324e-05, + "loss": 2.7565, + "step": 34108 + }, + { + "epoch": 2.1173877956421876, + "grad_norm": 0.14460463851516844, + "learning_rate": 2.412942435175885e-05, + "loss": 2.7058, + "step": 34109 + }, + { + "epoch": 2.1174498727419455, + "grad_norm": 0.1528110428801107, + "learning_rate": 2.412633390540845e-05, + "loss": 2.7664, + "step": 34110 + }, + { + "epoch": 2.1175119498417034, + "grad_norm": 0.16113407828093917, + "learning_rate": 2.412324359404723e-05, + "loss": 2.7482, + "step": 34111 + }, + { + "epoch": 2.1175740269414614, + "grad_norm": 0.13810842511683027, + "learning_rate": 2.4120153417691345e-05, + "loss": 2.7069, + "step": 34112 + }, + { + "epoch": 2.1176361040412193, + "grad_norm": 0.15946493159295144, + "learning_rate": 2.4117063376356907e-05, + "loss": 2.7315, + "step": 34113 + }, + { + "epoch": 2.117698181140977, + "grad_norm": 0.14783732901308727, + "learning_rate": 2.4113973470060032e-05, + "loss": 2.7664, + "step": 34114 + }, + { + "epoch": 2.117760258240735, + "grad_norm": 0.18469154574550584, + "learning_rate": 2.411088369881684e-05, + "loss": 2.7012, + "step": 34115 + }, + { + "epoch": 2.117822335340493, + "grad_norm": 0.19091633944779618, + "learning_rate": 2.410779406264343e-05, + "loss": 2.7709, + "step": 34116 + }, + { + "epoch": 2.117884412440251, + "grad_norm": 0.14624310014620207, + "learning_rate": 2.410470456155597e-05, + "loss": 2.708, + "step": 34117 + }, + { + "epoch": 2.117946489540009, + "grad_norm": 0.1452374282339503, + "learning_rate": 2.4101615195570544e-05, + "loss": 2.7175, + "step": 34118 + }, + { + "epoch": 2.1180085666397668, + "grad_norm": 0.1595061062119313, + "learning_rate": 2.4098525964703284e-05, + "loss": 2.6752, + "step": 34119 + }, + { + "epoch": 2.1180706437395247, + "grad_norm": 0.14193248884307202, + "learning_rate": 2.4095436868970282e-05, + "loss": 2.7757, + "step": 34120 + }, + { + "epoch": 2.1181327208392826, + "grad_norm": 0.13731667017301613, + "learning_rate": 2.4092347908387696e-05, + "loss": 2.7291, + "step": 34121 + }, + { + "epoch": 2.11819479793904, + "grad_norm": 0.15768721725326632, + "learning_rate": 2.4089259082971615e-05, + "loss": 2.7796, + "step": 34122 + }, + { + "epoch": 2.118256875038798, + "grad_norm": 0.14350086942020573, + "learning_rate": 2.4086170392738167e-05, + "loss": 2.7278, + "step": 34123 + }, + { + "epoch": 2.118318952138556, + "grad_norm": 0.14324189187366032, + "learning_rate": 2.408308183770343e-05, + "loss": 2.7417, + "step": 34124 + }, + { + "epoch": 2.118381029238314, + "grad_norm": 0.1481875523744488, + "learning_rate": 2.4079993417883573e-05, + "loss": 2.6579, + "step": 34125 + }, + { + "epoch": 2.1184431063380718, + "grad_norm": 0.1507738420415893, + "learning_rate": 2.407690513329468e-05, + "loss": 2.6893, + "step": 34126 + }, + { + "epoch": 2.1185051834378297, + "grad_norm": 0.1767837020713405, + "learning_rate": 2.407381698395287e-05, + "loss": 2.8007, + "step": 34127 + }, + { + "epoch": 2.1185672605375876, + "grad_norm": 0.14660208740872238, + "learning_rate": 2.4070728969874246e-05, + "loss": 2.754, + "step": 34128 + }, + { + "epoch": 2.1186293376373455, + "grad_norm": 0.15419064860961965, + "learning_rate": 2.4067641091074906e-05, + "loss": 2.7376, + "step": 34129 + }, + { + "epoch": 2.1186914147371034, + "grad_norm": 0.1472619210543908, + "learning_rate": 2.406455334757098e-05, + "loss": 2.6915, + "step": 34130 + }, + { + "epoch": 2.1187534918368613, + "grad_norm": 0.14424962236141775, + "learning_rate": 2.4061465739378584e-05, + "loss": 2.6791, + "step": 34131 + }, + { + "epoch": 2.1188155689366193, + "grad_norm": 0.15224777768851167, + "learning_rate": 2.405837826651383e-05, + "loss": 2.6577, + "step": 34132 + }, + { + "epoch": 2.118877646036377, + "grad_norm": 0.14794960273718025, + "learning_rate": 2.4055290928992806e-05, + "loss": 2.7915, + "step": 34133 + }, + { + "epoch": 2.118939723136135, + "grad_norm": 0.14932604297704474, + "learning_rate": 2.4052203726831618e-05, + "loss": 2.7468, + "step": 34134 + }, + { + "epoch": 2.119001800235893, + "grad_norm": 0.16114514621579085, + "learning_rate": 2.4049116660046396e-05, + "loss": 2.7577, + "step": 34135 + }, + { + "epoch": 2.119063877335651, + "grad_norm": 0.17551303640097124, + "learning_rate": 2.404602972865323e-05, + "loss": 2.8729, + "step": 34136 + }, + { + "epoch": 2.119125954435409, + "grad_norm": 0.16878584684113118, + "learning_rate": 2.4042942932668227e-05, + "loss": 2.7957, + "step": 34137 + }, + { + "epoch": 2.1191880315351668, + "grad_norm": 0.16099469594407645, + "learning_rate": 2.4039856272107474e-05, + "loss": 2.7798, + "step": 34138 + }, + { + "epoch": 2.1192501086349247, + "grad_norm": 0.1406269007130493, + "learning_rate": 2.4036769746987113e-05, + "loss": 2.8277, + "step": 34139 + }, + { + "epoch": 2.1193121857346826, + "grad_norm": 0.14019362194862722, + "learning_rate": 2.4033683357323227e-05, + "loss": 2.7957, + "step": 34140 + }, + { + "epoch": 2.1193742628344405, + "grad_norm": 0.16594841662559826, + "learning_rate": 2.4030597103131918e-05, + "loss": 2.8123, + "step": 34141 + }, + { + "epoch": 2.1194363399341984, + "grad_norm": 0.15001269326545238, + "learning_rate": 2.402751098442929e-05, + "loss": 2.7039, + "step": 34142 + }, + { + "epoch": 2.1194984170339564, + "grad_norm": 0.14049038802049135, + "learning_rate": 2.4024425001231425e-05, + "loss": 2.6795, + "step": 34143 + }, + { + "epoch": 2.1195604941337143, + "grad_norm": 0.16750712502328113, + "learning_rate": 2.4021339153554456e-05, + "loss": 2.7552, + "step": 34144 + }, + { + "epoch": 2.1196225712334718, + "grad_norm": 0.14852801499514678, + "learning_rate": 2.4018253441414468e-05, + "loss": 2.6927, + "step": 34145 + }, + { + "epoch": 2.1196846483332297, + "grad_norm": 0.16622144618808726, + "learning_rate": 2.4015167864827557e-05, + "loss": 2.717, + "step": 34146 + }, + { + "epoch": 2.1197467254329876, + "grad_norm": 0.15640830722679186, + "learning_rate": 2.4012082423809805e-05, + "loss": 2.6779, + "step": 34147 + }, + { + "epoch": 2.1198088025327455, + "grad_norm": 0.14018774931056813, + "learning_rate": 2.4008997118377352e-05, + "loss": 2.7681, + "step": 34148 + }, + { + "epoch": 2.1198708796325034, + "grad_norm": 0.14028014841282407, + "learning_rate": 2.4005911948546268e-05, + "loss": 2.6505, + "step": 34149 + }, + { + "epoch": 2.1199329567322613, + "grad_norm": 0.14840477030456808, + "learning_rate": 2.400282691433265e-05, + "loss": 2.77, + "step": 34150 + }, + { + "epoch": 2.1199950338320193, + "grad_norm": 0.1805959315322523, + "learning_rate": 2.3999742015752575e-05, + "loss": 2.8732, + "step": 34151 + }, + { + "epoch": 2.120057110931777, + "grad_norm": 0.1450244440344761, + "learning_rate": 2.3996657252822173e-05, + "loss": 2.7218, + "step": 34152 + }, + { + "epoch": 2.120119188031535, + "grad_norm": 0.15174724488059016, + "learning_rate": 2.3993572625557532e-05, + "loss": 2.7781, + "step": 34153 + }, + { + "epoch": 2.120181265131293, + "grad_norm": 0.1560273372091545, + "learning_rate": 2.399048813397473e-05, + "loss": 2.8137, + "step": 34154 + }, + { + "epoch": 2.120243342231051, + "grad_norm": 0.1472339145710282, + "learning_rate": 2.3987403778089868e-05, + "loss": 2.6693, + "step": 34155 + }, + { + "epoch": 2.120305419330809, + "grad_norm": 0.16704844480852177, + "learning_rate": 2.3984319557919012e-05, + "loss": 2.772, + "step": 34156 + }, + { + "epoch": 2.1203674964305668, + "grad_norm": 0.1504386168855667, + "learning_rate": 2.3981235473478298e-05, + "loss": 2.7236, + "step": 34157 + }, + { + "epoch": 2.1204295735303247, + "grad_norm": 0.14335058585517232, + "learning_rate": 2.39781515247838e-05, + "loss": 2.7205, + "step": 34158 + }, + { + "epoch": 2.1204916506300826, + "grad_norm": 0.18691745304552235, + "learning_rate": 2.3975067711851595e-05, + "loss": 2.7066, + "step": 34159 + }, + { + "epoch": 2.1205537277298405, + "grad_norm": 0.15358027511622274, + "learning_rate": 2.3971984034697763e-05, + "loss": 2.7587, + "step": 34160 + }, + { + "epoch": 2.1206158048295984, + "grad_norm": 0.13949705468464488, + "learning_rate": 2.3968900493338426e-05, + "loss": 2.6846, + "step": 34161 + }, + { + "epoch": 2.1206778819293564, + "grad_norm": 0.16980847049322637, + "learning_rate": 2.396581708778964e-05, + "loss": 2.8104, + "step": 34162 + }, + { + "epoch": 2.1207399590291143, + "grad_norm": 0.1665615586062769, + "learning_rate": 2.3962733818067524e-05, + "loss": 2.7269, + "step": 34163 + }, + { + "epoch": 2.120802036128872, + "grad_norm": 0.14829803490378551, + "learning_rate": 2.3959650684188146e-05, + "loss": 2.7854, + "step": 34164 + }, + { + "epoch": 2.12086411322863, + "grad_norm": 0.14003576570085777, + "learning_rate": 2.3956567686167587e-05, + "loss": 2.7411, + "step": 34165 + }, + { + "epoch": 2.120926190328388, + "grad_norm": 0.14284812497516342, + "learning_rate": 2.395348482402192e-05, + "loss": 2.7405, + "step": 34166 + }, + { + "epoch": 2.120988267428146, + "grad_norm": 0.17398299593950622, + "learning_rate": 2.3950402097767273e-05, + "loss": 2.7431, + "step": 34167 + }, + { + "epoch": 2.121050344527904, + "grad_norm": 0.1387022012136417, + "learning_rate": 2.394731950741969e-05, + "loss": 2.7055, + "step": 34168 + }, + { + "epoch": 2.121112421627662, + "grad_norm": 0.1498914335673027, + "learning_rate": 2.3944237052995272e-05, + "loss": 2.7936, + "step": 34169 + }, + { + "epoch": 2.1211744987274193, + "grad_norm": 0.14991103529195887, + "learning_rate": 2.3941154734510073e-05, + "loss": 2.6893, + "step": 34170 + }, + { + "epoch": 2.121236575827177, + "grad_norm": 0.14567416877499612, + "learning_rate": 2.393807255198022e-05, + "loss": 2.7859, + "step": 34171 + }, + { + "epoch": 2.121298652926935, + "grad_norm": 0.1471022164744916, + "learning_rate": 2.3934990505421767e-05, + "loss": 2.795, + "step": 34172 + }, + { + "epoch": 2.121360730026693, + "grad_norm": 0.16518534435651083, + "learning_rate": 2.3931908594850794e-05, + "loss": 2.611, + "step": 34173 + }, + { + "epoch": 2.121422807126451, + "grad_norm": 0.14720903442566596, + "learning_rate": 2.3928826820283364e-05, + "loss": 2.7893, + "step": 34174 + }, + { + "epoch": 2.121484884226209, + "grad_norm": 0.14432836804395818, + "learning_rate": 2.392574518173559e-05, + "loss": 2.6469, + "step": 34175 + }, + { + "epoch": 2.1215469613259668, + "grad_norm": 0.1474227729571902, + "learning_rate": 2.3922663679223533e-05, + "loss": 2.7517, + "step": 34176 + }, + { + "epoch": 2.1216090384257247, + "grad_norm": 0.14771543734455292, + "learning_rate": 2.3919582312763278e-05, + "loss": 2.7804, + "step": 34177 + }, + { + "epoch": 2.1216711155254826, + "grad_norm": 0.15274638984818686, + "learning_rate": 2.3916501082370884e-05, + "loss": 2.8311, + "step": 34178 + }, + { + "epoch": 2.1217331926252405, + "grad_norm": 0.17373713244159072, + "learning_rate": 2.3913419988062424e-05, + "loss": 2.7321, + "step": 34179 + }, + { + "epoch": 2.1217952697249984, + "grad_norm": 0.15504639298497594, + "learning_rate": 2.3910339029854002e-05, + "loss": 2.7058, + "step": 34180 + }, + { + "epoch": 2.1218573468247564, + "grad_norm": 0.14928162383587118, + "learning_rate": 2.3907258207761667e-05, + "loss": 2.6499, + "step": 34181 + }, + { + "epoch": 2.1219194239245143, + "grad_norm": 0.13908641936294727, + "learning_rate": 2.390417752180151e-05, + "loss": 2.7547, + "step": 34182 + }, + { + "epoch": 2.121981501024272, + "grad_norm": 0.14742878064314366, + "learning_rate": 2.3901096971989568e-05, + "loss": 2.6862, + "step": 34183 + }, + { + "epoch": 2.12204357812403, + "grad_norm": 0.1432198687627696, + "learning_rate": 2.389801655834196e-05, + "loss": 2.7308, + "step": 34184 + }, + { + "epoch": 2.122105655223788, + "grad_norm": 0.16478138055620786, + "learning_rate": 2.3894936280874742e-05, + "loss": 2.7603, + "step": 34185 + }, + { + "epoch": 2.122167732323546, + "grad_norm": 0.14711171918673638, + "learning_rate": 2.389185613960397e-05, + "loss": 2.6876, + "step": 34186 + }, + { + "epoch": 2.122229809423304, + "grad_norm": 0.15218686927139896, + "learning_rate": 2.388877613454571e-05, + "loss": 2.7246, + "step": 34187 + }, + { + "epoch": 2.1222918865230618, + "grad_norm": 0.1451435971263249, + "learning_rate": 2.3885696265716058e-05, + "loss": 2.7715, + "step": 34188 + }, + { + "epoch": 2.1223539636228197, + "grad_norm": 0.15156871546949044, + "learning_rate": 2.388261653313107e-05, + "loss": 2.7353, + "step": 34189 + }, + { + "epoch": 2.1224160407225776, + "grad_norm": 0.14735447912301264, + "learning_rate": 2.387953693680681e-05, + "loss": 2.7948, + "step": 34190 + }, + { + "epoch": 2.1224781178223355, + "grad_norm": 0.14930433719776157, + "learning_rate": 2.3876457476759352e-05, + "loss": 2.707, + "step": 34191 + }, + { + "epoch": 2.1225401949220934, + "grad_norm": 0.1440789775775588, + "learning_rate": 2.3873378153004732e-05, + "loss": 2.7981, + "step": 34192 + }, + { + "epoch": 2.122602272021851, + "grad_norm": 0.16485790038278628, + "learning_rate": 2.3870298965559062e-05, + "loss": 2.7528, + "step": 34193 + }, + { + "epoch": 2.122664349121609, + "grad_norm": 0.1479193906018352, + "learning_rate": 2.386721991443839e-05, + "loss": 2.7383, + "step": 34194 + }, + { + "epoch": 2.1227264262213668, + "grad_norm": 0.15350674877762113, + "learning_rate": 2.3864140999658747e-05, + "loss": 2.7749, + "step": 34195 + }, + { + "epoch": 2.1227885033211247, + "grad_norm": 0.15433239829703288, + "learning_rate": 2.386106222123625e-05, + "loss": 2.74, + "step": 34196 + }, + { + "epoch": 2.1228505804208826, + "grad_norm": 0.16915120274547182, + "learning_rate": 2.3857983579186914e-05, + "loss": 2.7389, + "step": 34197 + }, + { + "epoch": 2.1229126575206405, + "grad_norm": 0.14194906280365405, + "learning_rate": 2.3854905073526852e-05, + "loss": 2.7496, + "step": 34198 + }, + { + "epoch": 2.1229747346203984, + "grad_norm": 0.16112311940949037, + "learning_rate": 2.385182670427209e-05, + "loss": 2.685, + "step": 34199 + }, + { + "epoch": 2.1230368117201563, + "grad_norm": 0.14316874154265255, + "learning_rate": 2.3848748471438696e-05, + "loss": 2.6656, + "step": 34200 + }, + { + "epoch": 2.1230988888199143, + "grad_norm": 0.14370047374352254, + "learning_rate": 2.384567037504271e-05, + "loss": 2.6747, + "step": 34201 + }, + { + "epoch": 2.123160965919672, + "grad_norm": 0.14306449237884347, + "learning_rate": 2.3842592415100235e-05, + "loss": 2.7002, + "step": 34202 + }, + { + "epoch": 2.12322304301943, + "grad_norm": 0.1456970801646378, + "learning_rate": 2.38395145916273e-05, + "loss": 2.6855, + "step": 34203 + }, + { + "epoch": 2.123285120119188, + "grad_norm": 0.1566878264316891, + "learning_rate": 2.383643690463997e-05, + "loss": 2.6774, + "step": 34204 + }, + { + "epoch": 2.123347197218946, + "grad_norm": 0.16136610309178287, + "learning_rate": 2.38333593541543e-05, + "loss": 2.8425, + "step": 34205 + }, + { + "epoch": 2.123409274318704, + "grad_norm": 0.15144381275669247, + "learning_rate": 2.3830281940186327e-05, + "loss": 2.7193, + "step": 34206 + }, + { + "epoch": 2.1234713514184618, + "grad_norm": 0.14319141641836308, + "learning_rate": 2.3827204662752146e-05, + "loss": 2.6573, + "step": 34207 + }, + { + "epoch": 2.1235334285182197, + "grad_norm": 0.15955589747599333, + "learning_rate": 2.3824127521867788e-05, + "loss": 2.9069, + "step": 34208 + }, + { + "epoch": 2.1235955056179776, + "grad_norm": 0.15127478824984272, + "learning_rate": 2.3821050517549316e-05, + "loss": 2.7574, + "step": 34209 + }, + { + "epoch": 2.1236575827177355, + "grad_norm": 0.16968412867511184, + "learning_rate": 2.3817973649812758e-05, + "loss": 2.6885, + "step": 34210 + }, + { + "epoch": 2.1237196598174934, + "grad_norm": 0.1619256311109994, + "learning_rate": 2.3814896918674208e-05, + "loss": 2.7146, + "step": 34211 + }, + { + "epoch": 2.1237817369172514, + "grad_norm": 0.1535624881491309, + "learning_rate": 2.3811820324149696e-05, + "loss": 2.8705, + "step": 34212 + }, + { + "epoch": 2.1238438140170093, + "grad_norm": 0.1515077860872565, + "learning_rate": 2.3808743866255275e-05, + "loss": 2.6964, + "step": 34213 + }, + { + "epoch": 2.123905891116767, + "grad_norm": 0.14736946376119847, + "learning_rate": 2.3805667545006993e-05, + "loss": 2.7272, + "step": 34214 + }, + { + "epoch": 2.123967968216525, + "grad_norm": 0.1687263050486282, + "learning_rate": 2.3802591360420884e-05, + "loss": 2.7081, + "step": 34215 + }, + { + "epoch": 2.124030045316283, + "grad_norm": 0.14683468574828615, + "learning_rate": 2.379951531251304e-05, + "loss": 2.7181, + "step": 34216 + }, + { + "epoch": 2.124092122416041, + "grad_norm": 0.1433671322940381, + "learning_rate": 2.3796439401299482e-05, + "loss": 2.7348, + "step": 34217 + }, + { + "epoch": 2.1241541995157984, + "grad_norm": 0.14591253562333023, + "learning_rate": 2.3793363626796257e-05, + "loss": 2.6659, + "step": 34218 + }, + { + "epoch": 2.1242162766155563, + "grad_norm": 0.14679856619639794, + "learning_rate": 2.37902879890194e-05, + "loss": 2.7221, + "step": 34219 + }, + { + "epoch": 2.1242783537153143, + "grad_norm": 0.15429050697996033, + "learning_rate": 2.378721248798499e-05, + "loss": 2.7405, + "step": 34220 + }, + { + "epoch": 2.124340430815072, + "grad_norm": 0.13877807499612937, + "learning_rate": 2.3784137123709056e-05, + "loss": 2.7672, + "step": 34221 + }, + { + "epoch": 2.12440250791483, + "grad_norm": 0.16796766325133314, + "learning_rate": 2.3781061896207647e-05, + "loss": 2.6614, + "step": 34222 + }, + { + "epoch": 2.124464585014588, + "grad_norm": 0.15126011045215781, + "learning_rate": 2.3777986805496783e-05, + "loss": 2.7595, + "step": 34223 + }, + { + "epoch": 2.124526662114346, + "grad_norm": 0.1430034238427931, + "learning_rate": 2.377491185159254e-05, + "loss": 2.7172, + "step": 34224 + }, + { + "epoch": 2.124588739214104, + "grad_norm": 0.1545795675827542, + "learning_rate": 2.377183703451095e-05, + "loss": 2.7106, + "step": 34225 + }, + { + "epoch": 2.1246508163138618, + "grad_norm": 0.14760160368451022, + "learning_rate": 2.376876235426806e-05, + "loss": 2.6843, + "step": 34226 + }, + { + "epoch": 2.1247128934136197, + "grad_norm": 0.14198088646875903, + "learning_rate": 2.37656878108799e-05, + "loss": 2.7031, + "step": 34227 + }, + { + "epoch": 2.1247749705133776, + "grad_norm": 0.1510039246748574, + "learning_rate": 2.3762613404362498e-05, + "loss": 2.6517, + "step": 34228 + }, + { + "epoch": 2.1248370476131355, + "grad_norm": 0.14609846780676566, + "learning_rate": 2.3759539134731912e-05, + "loss": 2.7467, + "step": 34229 + }, + { + "epoch": 2.1248991247128934, + "grad_norm": 0.15732268836462804, + "learning_rate": 2.3756465002004198e-05, + "loss": 2.7393, + "step": 34230 + }, + { + "epoch": 2.1249612018126514, + "grad_norm": 0.15806511168926096, + "learning_rate": 2.3753391006195376e-05, + "loss": 2.8018, + "step": 34231 + }, + { + "epoch": 2.1250232789124093, + "grad_norm": 0.140424499362826, + "learning_rate": 2.375031714732148e-05, + "loss": 2.687, + "step": 34232 + }, + { + "epoch": 2.125085356012167, + "grad_norm": 0.14268794468097346, + "learning_rate": 2.374724342539854e-05, + "loss": 2.7157, + "step": 34233 + }, + { + "epoch": 2.125147433111925, + "grad_norm": 0.14383583046134923, + "learning_rate": 2.3744169840442625e-05, + "loss": 2.7877, + "step": 34234 + }, + { + "epoch": 2.125209510211683, + "grad_norm": 0.15632156282994716, + "learning_rate": 2.3741096392469746e-05, + "loss": 2.76, + "step": 34235 + }, + { + "epoch": 2.125271587311441, + "grad_norm": 0.14399530947764808, + "learning_rate": 2.3738023081495942e-05, + "loss": 2.7475, + "step": 34236 + }, + { + "epoch": 2.125333664411199, + "grad_norm": 0.14528437875573438, + "learning_rate": 2.373494990753723e-05, + "loss": 2.703, + "step": 34237 + }, + { + "epoch": 2.125395741510957, + "grad_norm": 0.1424725938886337, + "learning_rate": 2.3731876870609676e-05, + "loss": 2.6813, + "step": 34238 + }, + { + "epoch": 2.1254578186107147, + "grad_norm": 0.1502273961835167, + "learning_rate": 2.3728803970729296e-05, + "loss": 2.6824, + "step": 34239 + }, + { + "epoch": 2.1255198957104726, + "grad_norm": 0.15616668625761593, + "learning_rate": 2.3725731207912128e-05, + "loss": 2.7685, + "step": 34240 + }, + { + "epoch": 2.12558197281023, + "grad_norm": 0.1370588092537525, + "learning_rate": 2.3722658582174195e-05, + "loss": 2.7384, + "step": 34241 + }, + { + "epoch": 2.125644049909988, + "grad_norm": 0.14246929927602064, + "learning_rate": 2.371958609353151e-05, + "loss": 2.6832, + "step": 34242 + }, + { + "epoch": 2.125706127009746, + "grad_norm": 0.15563677616834964, + "learning_rate": 2.3716513742000145e-05, + "loss": 2.7159, + "step": 34243 + }, + { + "epoch": 2.125768204109504, + "grad_norm": 0.14307340403032795, + "learning_rate": 2.3713441527596116e-05, + "loss": 2.7738, + "step": 34244 + }, + { + "epoch": 2.1258302812092618, + "grad_norm": 0.14326985315945653, + "learning_rate": 2.3710369450335435e-05, + "loss": 2.7581, + "step": 34245 + }, + { + "epoch": 2.1258923583090197, + "grad_norm": 0.13680690239538096, + "learning_rate": 2.3707297510234122e-05, + "loss": 2.6788, + "step": 34246 + }, + { + "epoch": 2.1259544354087776, + "grad_norm": 0.14914259215386938, + "learning_rate": 2.3704225707308243e-05, + "loss": 2.7036, + "step": 34247 + }, + { + "epoch": 2.1260165125085355, + "grad_norm": 0.14380786912666363, + "learning_rate": 2.3701154041573797e-05, + "loss": 2.7099, + "step": 34248 + }, + { + "epoch": 2.1260785896082934, + "grad_norm": 0.13928918712451419, + "learning_rate": 2.3698082513046816e-05, + "loss": 2.6469, + "step": 34249 + }, + { + "epoch": 2.1261406667080514, + "grad_norm": 0.15907947919940058, + "learning_rate": 2.369501112174332e-05, + "loss": 2.8138, + "step": 34250 + }, + { + "epoch": 2.1262027438078093, + "grad_norm": 0.14311973697447475, + "learning_rate": 2.3691939867679324e-05, + "loss": 2.823, + "step": 34251 + }, + { + "epoch": 2.126264820907567, + "grad_norm": 0.1459226391595932, + "learning_rate": 2.368886875087088e-05, + "loss": 2.828, + "step": 34252 + }, + { + "epoch": 2.126326898007325, + "grad_norm": 0.1603223939077084, + "learning_rate": 2.3685797771334e-05, + "loss": 2.754, + "step": 34253 + }, + { + "epoch": 2.126388975107083, + "grad_norm": 0.14145280983014455, + "learning_rate": 2.3682726929084697e-05, + "loss": 2.7082, + "step": 34254 + }, + { + "epoch": 2.126451052206841, + "grad_norm": 0.14549265532947625, + "learning_rate": 2.3679656224138974e-05, + "loss": 2.6634, + "step": 34255 + }, + { + "epoch": 2.126513129306599, + "grad_norm": 0.15232205847832095, + "learning_rate": 2.3676585656512895e-05, + "loss": 2.7163, + "step": 34256 + }, + { + "epoch": 2.1265752064063568, + "grad_norm": 0.15374409973445607, + "learning_rate": 2.3673515226222463e-05, + "loss": 2.7575, + "step": 34257 + }, + { + "epoch": 2.1266372835061147, + "grad_norm": 0.15536185212526213, + "learning_rate": 2.3670444933283693e-05, + "loss": 2.6869, + "step": 34258 + }, + { + "epoch": 2.1266993606058726, + "grad_norm": 0.13997205171099764, + "learning_rate": 2.3667374777712583e-05, + "loss": 2.6789, + "step": 34259 + }, + { + "epoch": 2.1267614377056305, + "grad_norm": 0.15006340023129813, + "learning_rate": 2.3664304759525185e-05, + "loss": 2.6713, + "step": 34260 + }, + { + "epoch": 2.1268235148053884, + "grad_norm": 0.1486675086610554, + "learning_rate": 2.3661234878737488e-05, + "loss": 2.8634, + "step": 34261 + }, + { + "epoch": 2.1268855919051464, + "grad_norm": 0.1530008267504196, + "learning_rate": 2.365816513536554e-05, + "loss": 2.7223, + "step": 34262 + }, + { + "epoch": 2.1269476690049043, + "grad_norm": 0.14104018248500733, + "learning_rate": 2.3655095529425343e-05, + "loss": 2.7857, + "step": 34263 + }, + { + "epoch": 2.1270097461046618, + "grad_norm": 0.15129653758545497, + "learning_rate": 2.3652026060932902e-05, + "loss": 2.8119, + "step": 34264 + }, + { + "epoch": 2.12707182320442, + "grad_norm": 0.1469739362591232, + "learning_rate": 2.364895672990422e-05, + "loss": 2.7575, + "step": 34265 + }, + { + "epoch": 2.1271339003041776, + "grad_norm": 0.14820232081459017, + "learning_rate": 2.3645887536355348e-05, + "loss": 2.8073, + "step": 34266 + }, + { + "epoch": 2.1271959774039355, + "grad_norm": 0.1349835085730085, + "learning_rate": 2.364281848030228e-05, + "loss": 2.7157, + "step": 34267 + }, + { + "epoch": 2.1272580545036934, + "grad_norm": 0.14208574715178557, + "learning_rate": 2.3639749561761027e-05, + "loss": 2.8047, + "step": 34268 + }, + { + "epoch": 2.1273201316034513, + "grad_norm": 0.1481225914698965, + "learning_rate": 2.3636680780747576e-05, + "loss": 2.7434, + "step": 34269 + }, + { + "epoch": 2.1273822087032093, + "grad_norm": 0.14235122471559902, + "learning_rate": 2.3633612137277982e-05, + "loss": 2.6791, + "step": 34270 + }, + { + "epoch": 2.127444285802967, + "grad_norm": 0.14451446951289654, + "learning_rate": 2.3630543631368236e-05, + "loss": 2.7318, + "step": 34271 + }, + { + "epoch": 2.127506362902725, + "grad_norm": 0.1475429306957362, + "learning_rate": 2.3627475263034344e-05, + "loss": 2.6508, + "step": 34272 + }, + { + "epoch": 2.127568440002483, + "grad_norm": 0.14638458031535986, + "learning_rate": 2.36244070322923e-05, + "loss": 2.6579, + "step": 34273 + }, + { + "epoch": 2.127630517102241, + "grad_norm": 0.1449656466677979, + "learning_rate": 2.3621338939158146e-05, + "loss": 2.737, + "step": 34274 + }, + { + "epoch": 2.127692594201999, + "grad_norm": 0.1402341922344409, + "learning_rate": 2.361827098364786e-05, + "loss": 2.6573, + "step": 34275 + }, + { + "epoch": 2.1277546713017568, + "grad_norm": 0.1618216673157487, + "learning_rate": 2.3615203165777472e-05, + "loss": 2.6576, + "step": 34276 + }, + { + "epoch": 2.1278167484015147, + "grad_norm": 0.14338944454981278, + "learning_rate": 2.3612135485562965e-05, + "loss": 2.7466, + "step": 34277 + }, + { + "epoch": 2.1278788255012726, + "grad_norm": 0.1527141933327297, + "learning_rate": 2.360906794302034e-05, + "loss": 2.6921, + "step": 34278 + }, + { + "epoch": 2.1279409026010305, + "grad_norm": 0.15116500272088298, + "learning_rate": 2.360600053816563e-05, + "loss": 2.7469, + "step": 34279 + }, + { + "epoch": 2.1280029797007884, + "grad_norm": 0.14366486272454704, + "learning_rate": 2.360293327101482e-05, + "loss": 2.6594, + "step": 34280 + }, + { + "epoch": 2.1280650568005464, + "grad_norm": 0.17821065783937567, + "learning_rate": 2.359986614158392e-05, + "loss": 2.8178, + "step": 34281 + }, + { + "epoch": 2.1281271339003043, + "grad_norm": 0.1394075971169321, + "learning_rate": 2.359679914988891e-05, + "loss": 2.7187, + "step": 34282 + }, + { + "epoch": 2.128189211000062, + "grad_norm": 0.13361760450734247, + "learning_rate": 2.3593732295945826e-05, + "loss": 2.6836, + "step": 34283 + }, + { + "epoch": 2.12825128809982, + "grad_norm": 0.15523090019810235, + "learning_rate": 2.359066557977065e-05, + "loss": 2.7863, + "step": 34284 + }, + { + "epoch": 2.128313365199578, + "grad_norm": 0.13282825572859233, + "learning_rate": 2.3587599001379378e-05, + "loss": 2.6458, + "step": 34285 + }, + { + "epoch": 2.128375442299336, + "grad_norm": 0.13592762938650996, + "learning_rate": 2.358453256078802e-05, + "loss": 2.6848, + "step": 34286 + }, + { + "epoch": 2.128437519399094, + "grad_norm": 0.15888804064143927, + "learning_rate": 2.3581466258012546e-05, + "loss": 2.7589, + "step": 34287 + }, + { + "epoch": 2.128499596498852, + "grad_norm": 0.14632541933017532, + "learning_rate": 2.3578400093068996e-05, + "loss": 2.6719, + "step": 34288 + }, + { + "epoch": 2.1285616735986093, + "grad_norm": 0.14644563798519747, + "learning_rate": 2.3575334065973347e-05, + "loss": 2.7479, + "step": 34289 + }, + { + "epoch": 2.128623750698367, + "grad_norm": 0.14609348914179232, + "learning_rate": 2.3572268176741597e-05, + "loss": 2.5753, + "step": 34290 + }, + { + "epoch": 2.128685827798125, + "grad_norm": 0.14823618840278194, + "learning_rate": 2.3569202425389714e-05, + "loss": 2.7648, + "step": 34291 + }, + { + "epoch": 2.128747904897883, + "grad_norm": 0.15281825262486992, + "learning_rate": 2.356613681193374e-05, + "loss": 2.6991, + "step": 34292 + }, + { + "epoch": 2.128809981997641, + "grad_norm": 0.13922357954345943, + "learning_rate": 2.356307133638963e-05, + "loss": 2.7009, + "step": 34293 + }, + { + "epoch": 2.128872059097399, + "grad_norm": 0.1425394044842202, + "learning_rate": 2.3560005998773416e-05, + "loss": 2.7958, + "step": 34294 + }, + { + "epoch": 2.1289341361971568, + "grad_norm": 0.1562628144731524, + "learning_rate": 2.3556940799101068e-05, + "loss": 2.8081, + "step": 34295 + }, + { + "epoch": 2.1289962132969147, + "grad_norm": 0.1515384074853479, + "learning_rate": 2.3553875737388557e-05, + "loss": 2.766, + "step": 34296 + }, + { + "epoch": 2.1290582903966726, + "grad_norm": 0.13608806783286656, + "learning_rate": 2.355081081365192e-05, + "loss": 2.7126, + "step": 34297 + }, + { + "epoch": 2.1291203674964305, + "grad_norm": 0.13984233892783787, + "learning_rate": 2.354774602790712e-05, + "loss": 2.6487, + "step": 34298 + }, + { + "epoch": 2.1291824445961884, + "grad_norm": 0.14483887565460246, + "learning_rate": 2.354468138017015e-05, + "loss": 2.6297, + "step": 34299 + }, + { + "epoch": 2.1292445216959464, + "grad_norm": 0.14856107486912729, + "learning_rate": 2.3541616870457002e-05, + "loss": 2.7474, + "step": 34300 + }, + { + "epoch": 2.1293065987957043, + "grad_norm": 0.14757968845215774, + "learning_rate": 2.353855249878364e-05, + "loss": 2.7217, + "step": 34301 + }, + { + "epoch": 2.129368675895462, + "grad_norm": 0.14880656489200705, + "learning_rate": 2.3535488265166096e-05, + "loss": 2.7199, + "step": 34302 + }, + { + "epoch": 2.12943075299522, + "grad_norm": 0.1444201042060954, + "learning_rate": 2.353242416962033e-05, + "loss": 2.7193, + "step": 34303 + }, + { + "epoch": 2.129492830094978, + "grad_norm": 0.16320760836727133, + "learning_rate": 2.3529360212162332e-05, + "loss": 2.6835, + "step": 34304 + }, + { + "epoch": 2.129554907194736, + "grad_norm": 0.15663310411049275, + "learning_rate": 2.352629639280807e-05, + "loss": 2.8197, + "step": 34305 + }, + { + "epoch": 2.129616984294494, + "grad_norm": 0.14526102400217697, + "learning_rate": 2.3523232711573568e-05, + "loss": 2.6953, + "step": 34306 + }, + { + "epoch": 2.129679061394252, + "grad_norm": 0.1416492381898799, + "learning_rate": 2.3520169168474786e-05, + "loss": 2.7151, + "step": 34307 + }, + { + "epoch": 2.1297411384940097, + "grad_norm": 0.1433699190799285, + "learning_rate": 2.3517105763527712e-05, + "loss": 2.6757, + "step": 34308 + }, + { + "epoch": 2.1298032155937676, + "grad_norm": 0.14121037742957432, + "learning_rate": 2.3514042496748307e-05, + "loss": 2.6972, + "step": 34309 + }, + { + "epoch": 2.1298652926935255, + "grad_norm": 0.14139453618127362, + "learning_rate": 2.3510979368152593e-05, + "loss": 2.8076, + "step": 34310 + }, + { + "epoch": 2.1299273697932835, + "grad_norm": 0.14900312447752223, + "learning_rate": 2.350791637775652e-05, + "loss": 2.6789, + "step": 34311 + }, + { + "epoch": 2.129989446893041, + "grad_norm": 0.14102641405165212, + "learning_rate": 2.3504853525576087e-05, + "loss": 2.5974, + "step": 34312 + }, + { + "epoch": 2.1300515239927993, + "grad_norm": 0.15119709795109182, + "learning_rate": 2.3501790811627267e-05, + "loss": 2.7765, + "step": 34313 + }, + { + "epoch": 2.1301136010925568, + "grad_norm": 0.13988767818371065, + "learning_rate": 2.3498728235926014e-05, + "loss": 2.6183, + "step": 34314 + }, + { + "epoch": 2.1301756781923147, + "grad_norm": 0.16621601688678977, + "learning_rate": 2.3495665798488348e-05, + "loss": 2.6956, + "step": 34315 + }, + { + "epoch": 2.1302377552920726, + "grad_norm": 0.1480195724857311, + "learning_rate": 2.349260349933023e-05, + "loss": 2.7672, + "step": 34316 + }, + { + "epoch": 2.1302998323918305, + "grad_norm": 0.1523344452779652, + "learning_rate": 2.348954133846763e-05, + "loss": 2.7762, + "step": 34317 + }, + { + "epoch": 2.1303619094915884, + "grad_norm": 0.14817502004151187, + "learning_rate": 2.3486479315916515e-05, + "loss": 2.7738, + "step": 34318 + }, + { + "epoch": 2.1304239865913464, + "grad_norm": 0.1525091044158877, + "learning_rate": 2.348341743169289e-05, + "loss": 2.6575, + "step": 34319 + }, + { + "epoch": 2.1304860636911043, + "grad_norm": 0.145474826991068, + "learning_rate": 2.348035568581271e-05, + "loss": 2.7657, + "step": 34320 + }, + { + "epoch": 2.130548140790862, + "grad_norm": 0.14341917583711397, + "learning_rate": 2.3477294078291957e-05, + "loss": 2.6941, + "step": 34321 + }, + { + "epoch": 2.13061021789062, + "grad_norm": 0.15042105899374295, + "learning_rate": 2.347423260914658e-05, + "loss": 2.7981, + "step": 34322 + }, + { + "epoch": 2.130672294990378, + "grad_norm": 0.14631666119217696, + "learning_rate": 2.347117127839259e-05, + "loss": 2.73, + "step": 34323 + }, + { + "epoch": 2.130734372090136, + "grad_norm": 0.16020895509409042, + "learning_rate": 2.3468110086045937e-05, + "loss": 2.7066, + "step": 34324 + }, + { + "epoch": 2.130796449189894, + "grad_norm": 0.17057612952392737, + "learning_rate": 2.3465049032122593e-05, + "loss": 2.6237, + "step": 34325 + }, + { + "epoch": 2.1308585262896518, + "grad_norm": 0.15436661628977713, + "learning_rate": 2.3461988116638518e-05, + "loss": 2.7155, + "step": 34326 + }, + { + "epoch": 2.1309206033894097, + "grad_norm": 0.14084314275000842, + "learning_rate": 2.3458927339609705e-05, + "loss": 2.7946, + "step": 34327 + }, + { + "epoch": 2.1309826804891676, + "grad_norm": 0.1490959508517953, + "learning_rate": 2.3455866701052098e-05, + "loss": 2.7611, + "step": 34328 + }, + { + "epoch": 2.1310447575889255, + "grad_norm": 0.14809772227212897, + "learning_rate": 2.345280620098169e-05, + "loss": 2.7392, + "step": 34329 + }, + { + "epoch": 2.1311068346886834, + "grad_norm": 0.16139505028065895, + "learning_rate": 2.3449745839414445e-05, + "loss": 2.7315, + "step": 34330 + }, + { + "epoch": 2.1311689117884414, + "grad_norm": 0.14906936352800945, + "learning_rate": 2.3446685616366315e-05, + "loss": 2.6757, + "step": 34331 + }, + { + "epoch": 2.1312309888881993, + "grad_norm": 0.14314063037871888, + "learning_rate": 2.3443625531853257e-05, + "loss": 2.6807, + "step": 34332 + }, + { + "epoch": 2.131293065987957, + "grad_norm": 0.14164817306008276, + "learning_rate": 2.3440565585891268e-05, + "loss": 2.6456, + "step": 34333 + }, + { + "epoch": 2.131355143087715, + "grad_norm": 0.1491978583265739, + "learning_rate": 2.34375057784963e-05, + "loss": 2.8137, + "step": 34334 + }, + { + "epoch": 2.131417220187473, + "grad_norm": 0.18546543341726726, + "learning_rate": 2.3434446109684305e-05, + "loss": 2.7085, + "step": 34335 + }, + { + "epoch": 2.131479297287231, + "grad_norm": 0.14371999966583443, + "learning_rate": 2.343138657947126e-05, + "loss": 2.7258, + "step": 34336 + }, + { + "epoch": 2.1315413743869884, + "grad_norm": 0.146156509964006, + "learning_rate": 2.34283271878731e-05, + "loss": 2.7036, + "step": 34337 + }, + { + "epoch": 2.1316034514867463, + "grad_norm": 0.14780236815352701, + "learning_rate": 2.3425267934905826e-05, + "loss": 2.7242, + "step": 34338 + }, + { + "epoch": 2.1316655285865043, + "grad_norm": 0.14045776251018974, + "learning_rate": 2.342220882058538e-05, + "loss": 2.7698, + "step": 34339 + }, + { + "epoch": 2.131727605686262, + "grad_norm": 0.15090174332408637, + "learning_rate": 2.3419149844927723e-05, + "loss": 2.6963, + "step": 34340 + }, + { + "epoch": 2.13178968278602, + "grad_norm": 0.13518873772427112, + "learning_rate": 2.3416091007948797e-05, + "loss": 2.7484, + "step": 34341 + }, + { + "epoch": 2.131851759885778, + "grad_norm": 0.1490546122649296, + "learning_rate": 2.3413032309664595e-05, + "loss": 2.7967, + "step": 34342 + }, + { + "epoch": 2.131913836985536, + "grad_norm": 0.14971294304310062, + "learning_rate": 2.3409973750091057e-05, + "loss": 2.7915, + "step": 34343 + }, + { + "epoch": 2.131975914085294, + "grad_norm": 0.13696660327469318, + "learning_rate": 2.3406915329244145e-05, + "loss": 2.7126, + "step": 34344 + }, + { + "epoch": 2.1320379911850518, + "grad_norm": 0.1469060106715736, + "learning_rate": 2.340385704713979e-05, + "loss": 2.7534, + "step": 34345 + }, + { + "epoch": 2.1321000682848097, + "grad_norm": 0.14396380172474518, + "learning_rate": 2.3400798903793986e-05, + "loss": 2.6581, + "step": 34346 + }, + { + "epoch": 2.1321621453845676, + "grad_norm": 0.13890645508810034, + "learning_rate": 2.339774089922267e-05, + "loss": 2.7924, + "step": 34347 + }, + { + "epoch": 2.1322242224843255, + "grad_norm": 0.1380344637552151, + "learning_rate": 2.33946830334418e-05, + "loss": 2.6703, + "step": 34348 + }, + { + "epoch": 2.1322862995840834, + "grad_norm": 0.13690193030006245, + "learning_rate": 2.3391625306467328e-05, + "loss": 2.8066, + "step": 34349 + }, + { + "epoch": 2.1323483766838414, + "grad_norm": 0.14220744113682293, + "learning_rate": 2.3388567718315186e-05, + "loss": 2.7207, + "step": 34350 + }, + { + "epoch": 2.1324104537835993, + "grad_norm": 0.16000958449371208, + "learning_rate": 2.3385510269001366e-05, + "loss": 2.7845, + "step": 34351 + }, + { + "epoch": 2.132472530883357, + "grad_norm": 0.1428528060212226, + "learning_rate": 2.33824529585418e-05, + "loss": 2.7401, + "step": 34352 + }, + { + "epoch": 2.132534607983115, + "grad_norm": 0.14202507089533528, + "learning_rate": 2.337939578695244e-05, + "loss": 2.7055, + "step": 34353 + }, + { + "epoch": 2.132596685082873, + "grad_norm": 0.15345903061997407, + "learning_rate": 2.3376338754249215e-05, + "loss": 2.7513, + "step": 34354 + }, + { + "epoch": 2.132658762182631, + "grad_norm": 0.1434685909529602, + "learning_rate": 2.337328186044811e-05, + "loss": 2.6791, + "step": 34355 + }, + { + "epoch": 2.132720839282389, + "grad_norm": 0.1465094544876007, + "learning_rate": 2.337022510556506e-05, + "loss": 2.7438, + "step": 34356 + }, + { + "epoch": 2.132782916382147, + "grad_norm": 0.13877569261438882, + "learning_rate": 2.336716848961601e-05, + "loss": 2.7085, + "step": 34357 + }, + { + "epoch": 2.1328449934819047, + "grad_norm": 0.14133339185163385, + "learning_rate": 2.336411201261689e-05, + "loss": 2.7564, + "step": 34358 + }, + { + "epoch": 2.1329070705816626, + "grad_norm": 0.15273236568524384, + "learning_rate": 2.336105567458366e-05, + "loss": 2.7323, + "step": 34359 + }, + { + "epoch": 2.13296914768142, + "grad_norm": 0.13953705327053814, + "learning_rate": 2.335799947553229e-05, + "loss": 2.7581, + "step": 34360 + }, + { + "epoch": 2.1330312247811785, + "grad_norm": 0.1395048056401967, + "learning_rate": 2.3354943415478703e-05, + "loss": 2.6712, + "step": 34361 + }, + { + "epoch": 2.133093301880936, + "grad_norm": 0.14251336495148953, + "learning_rate": 2.3351887494438844e-05, + "loss": 2.6368, + "step": 34362 + }, + { + "epoch": 2.133155378980694, + "grad_norm": 0.1513156433944957, + "learning_rate": 2.334883171242866e-05, + "loss": 2.8112, + "step": 34363 + }, + { + "epoch": 2.1332174560804518, + "grad_norm": 0.13988500178460664, + "learning_rate": 2.334577606946407e-05, + "loss": 2.6727, + "step": 34364 + }, + { + "epoch": 2.1332795331802097, + "grad_norm": 0.14068383734386855, + "learning_rate": 2.334272056556106e-05, + "loss": 2.7484, + "step": 34365 + }, + { + "epoch": 2.1333416102799676, + "grad_norm": 0.13035587693465694, + "learning_rate": 2.3339665200735543e-05, + "loss": 2.6238, + "step": 34366 + }, + { + "epoch": 2.1334036873797255, + "grad_norm": 0.16129290401792523, + "learning_rate": 2.3336609975003465e-05, + "loss": 2.7086, + "step": 34367 + }, + { + "epoch": 2.1334657644794834, + "grad_norm": 0.1397938271605239, + "learning_rate": 2.333355488838075e-05, + "loss": 2.7693, + "step": 34368 + }, + { + "epoch": 2.1335278415792414, + "grad_norm": 0.1406297219331369, + "learning_rate": 2.3330499940883365e-05, + "loss": 2.7418, + "step": 34369 + }, + { + "epoch": 2.1335899186789993, + "grad_norm": 0.14709115558262664, + "learning_rate": 2.3327445132527242e-05, + "loss": 2.7708, + "step": 34370 + }, + { + "epoch": 2.133651995778757, + "grad_norm": 0.1454914327017171, + "learning_rate": 2.3324390463328312e-05, + "loss": 2.6313, + "step": 34371 + }, + { + "epoch": 2.133714072878515, + "grad_norm": 0.15141574942015837, + "learning_rate": 2.332133593330249e-05, + "loss": 2.7085, + "step": 34372 + }, + { + "epoch": 2.133776149978273, + "grad_norm": 0.14128996667633267, + "learning_rate": 2.331828154246576e-05, + "loss": 2.6944, + "step": 34373 + }, + { + "epoch": 2.133838227078031, + "grad_norm": 0.13734166744810172, + "learning_rate": 2.3315227290834028e-05, + "loss": 2.736, + "step": 34374 + }, + { + "epoch": 2.133900304177789, + "grad_norm": 0.1431426082944641, + "learning_rate": 2.3312173178423234e-05, + "loss": 2.8098, + "step": 34375 + }, + { + "epoch": 2.133962381277547, + "grad_norm": 0.1388147450934945, + "learning_rate": 2.3309119205249313e-05, + "loss": 2.7676, + "step": 34376 + }, + { + "epoch": 2.1340244583773047, + "grad_norm": 0.148005213700829, + "learning_rate": 2.3306065371328173e-05, + "loss": 2.6863, + "step": 34377 + }, + { + "epoch": 2.1340865354770626, + "grad_norm": 0.151446481670189, + "learning_rate": 2.3303011676675794e-05, + "loss": 2.722, + "step": 34378 + }, + { + "epoch": 2.1341486125768205, + "grad_norm": 0.1449082393863427, + "learning_rate": 2.329995812130808e-05, + "loss": 2.7592, + "step": 34379 + }, + { + "epoch": 2.1342106896765785, + "grad_norm": 0.14260683032010246, + "learning_rate": 2.3296904705240968e-05, + "loss": 2.8347, + "step": 34380 + }, + { + "epoch": 2.1342727667763364, + "grad_norm": 0.1428571153166012, + "learning_rate": 2.329385142849037e-05, + "loss": 2.6716, + "step": 34381 + }, + { + "epoch": 2.1343348438760943, + "grad_norm": 0.13998511218202503, + "learning_rate": 2.3290798291072246e-05, + "loss": 2.6918, + "step": 34382 + }, + { + "epoch": 2.134396920975852, + "grad_norm": 0.14074220673759216, + "learning_rate": 2.3287745293002517e-05, + "loss": 2.7051, + "step": 34383 + }, + { + "epoch": 2.13445899807561, + "grad_norm": 0.14430321282438774, + "learning_rate": 2.32846924342971e-05, + "loss": 2.6435, + "step": 34384 + }, + { + "epoch": 2.1345210751753676, + "grad_norm": 0.13912066518365973, + "learning_rate": 2.328163971497193e-05, + "loss": 2.6741, + "step": 34385 + }, + { + "epoch": 2.1345831522751255, + "grad_norm": 0.14660923404884008, + "learning_rate": 2.327858713504292e-05, + "loss": 2.6907, + "step": 34386 + }, + { + "epoch": 2.1346452293748834, + "grad_norm": 0.14771584715875016, + "learning_rate": 2.3275534694526018e-05, + "loss": 2.7197, + "step": 34387 + }, + { + "epoch": 2.1347073064746414, + "grad_norm": 0.16324836956585728, + "learning_rate": 2.327248239343714e-05, + "loss": 2.8171, + "step": 34388 + }, + { + "epoch": 2.1347693835743993, + "grad_norm": 0.15832731882287449, + "learning_rate": 2.326943023179221e-05, + "loss": 2.6303, + "step": 34389 + }, + { + "epoch": 2.134831460674157, + "grad_norm": 0.13620768120617258, + "learning_rate": 2.3266378209607137e-05, + "loss": 2.676, + "step": 34390 + }, + { + "epoch": 2.134893537773915, + "grad_norm": 0.14685811366750973, + "learning_rate": 2.326332632689787e-05, + "loss": 2.8044, + "step": 34391 + }, + { + "epoch": 2.134955614873673, + "grad_norm": 0.14663417785730787, + "learning_rate": 2.3260274583680308e-05, + "loss": 2.668, + "step": 34392 + }, + { + "epoch": 2.135017691973431, + "grad_norm": 0.15703679117703065, + "learning_rate": 2.32572229799704e-05, + "loss": 2.7278, + "step": 34393 + }, + { + "epoch": 2.135079769073189, + "grad_norm": 0.14299812249680083, + "learning_rate": 2.3254171515784053e-05, + "loss": 2.7122, + "step": 34394 + }, + { + "epoch": 2.1351418461729468, + "grad_norm": 0.14638428129338898, + "learning_rate": 2.3251120191137165e-05, + "loss": 2.823, + "step": 34395 + }, + { + "epoch": 2.1352039232727047, + "grad_norm": 0.14087046948207782, + "learning_rate": 2.3248069006045698e-05, + "loss": 2.688, + "step": 34396 + }, + { + "epoch": 2.1352660003724626, + "grad_norm": 0.13994710502015298, + "learning_rate": 2.3245017960525552e-05, + "loss": 2.6359, + "step": 34397 + }, + { + "epoch": 2.1353280774722205, + "grad_norm": 0.14106015588032148, + "learning_rate": 2.3241967054592634e-05, + "loss": 2.6903, + "step": 34398 + }, + { + "epoch": 2.1353901545719784, + "grad_norm": 0.139017547349736, + "learning_rate": 2.3238916288262875e-05, + "loss": 2.6433, + "step": 34399 + }, + { + "epoch": 2.1354522316717364, + "grad_norm": 0.1433231276752656, + "learning_rate": 2.323586566155217e-05, + "loss": 2.6932, + "step": 34400 + }, + { + "epoch": 2.1355143087714943, + "grad_norm": 0.14707098804941843, + "learning_rate": 2.3232815174476464e-05, + "loss": 2.8535, + "step": 34401 + }, + { + "epoch": 2.135576385871252, + "grad_norm": 0.1465169234672387, + "learning_rate": 2.3229764827051664e-05, + "loss": 2.8117, + "step": 34402 + }, + { + "epoch": 2.13563846297101, + "grad_norm": 0.14605830216404814, + "learning_rate": 2.3226714619293676e-05, + "loss": 2.6952, + "step": 34403 + }, + { + "epoch": 2.135700540070768, + "grad_norm": 0.15449629945744445, + "learning_rate": 2.32236645512184e-05, + "loss": 2.8053, + "step": 34404 + }, + { + "epoch": 2.135762617170526, + "grad_norm": 0.153745626245874, + "learning_rate": 2.3220614622841786e-05, + "loss": 2.7065, + "step": 34405 + }, + { + "epoch": 2.135824694270284, + "grad_norm": 0.14075467303371594, + "learning_rate": 2.321756483417972e-05, + "loss": 2.6828, + "step": 34406 + }, + { + "epoch": 2.135886771370042, + "grad_norm": 0.1567744691613021, + "learning_rate": 2.3214515185248132e-05, + "loss": 2.7001, + "step": 34407 + }, + { + "epoch": 2.1359488484697993, + "grad_norm": 0.13941910058834775, + "learning_rate": 2.3211465676062895e-05, + "loss": 2.7299, + "step": 34408 + }, + { + "epoch": 2.136010925569557, + "grad_norm": 0.14017679392435686, + "learning_rate": 2.3208416306639964e-05, + "loss": 2.7083, + "step": 34409 + }, + { + "epoch": 2.136073002669315, + "grad_norm": 0.14916324756631263, + "learning_rate": 2.3205367076995227e-05, + "loss": 2.6758, + "step": 34410 + }, + { + "epoch": 2.136135079769073, + "grad_norm": 0.14128386934337608, + "learning_rate": 2.3202317987144593e-05, + "loss": 2.7435, + "step": 34411 + }, + { + "epoch": 2.136197156868831, + "grad_norm": 0.14583569266477073, + "learning_rate": 2.3199269037103975e-05, + "loss": 2.7068, + "step": 34412 + }, + { + "epoch": 2.136259233968589, + "grad_norm": 0.14520833219540055, + "learning_rate": 2.319622022688926e-05, + "loss": 2.7554, + "step": 34413 + }, + { + "epoch": 2.1363213110683468, + "grad_norm": 0.14104763734915113, + "learning_rate": 2.3193171556516385e-05, + "loss": 2.8124, + "step": 34414 + }, + { + "epoch": 2.1363833881681047, + "grad_norm": 0.18223219082921752, + "learning_rate": 2.3190123026001248e-05, + "loss": 2.6917, + "step": 34415 + }, + { + "epoch": 2.1364454652678626, + "grad_norm": 0.150440950766949, + "learning_rate": 2.318707463535974e-05, + "loss": 2.7724, + "step": 34416 + }, + { + "epoch": 2.1365075423676205, + "grad_norm": 0.1361247989370194, + "learning_rate": 2.318402638460776e-05, + "loss": 2.6583, + "step": 34417 + }, + { + "epoch": 2.1365696194673784, + "grad_norm": 0.13934346891196475, + "learning_rate": 2.318097827376124e-05, + "loss": 2.794, + "step": 34418 + }, + { + "epoch": 2.1366316965671364, + "grad_norm": 0.14159943809231096, + "learning_rate": 2.3177930302836064e-05, + "loss": 2.7039, + "step": 34419 + }, + { + "epoch": 2.1366937736668943, + "grad_norm": 0.14184192120603725, + "learning_rate": 2.317488247184814e-05, + "loss": 2.7741, + "step": 34420 + }, + { + "epoch": 2.136755850766652, + "grad_norm": 0.14619080053495756, + "learning_rate": 2.317183478081337e-05, + "loss": 2.8119, + "step": 34421 + }, + { + "epoch": 2.13681792786641, + "grad_norm": 0.14214991581067649, + "learning_rate": 2.3168787229747623e-05, + "loss": 2.7307, + "step": 34422 + }, + { + "epoch": 2.136880004966168, + "grad_norm": 0.1435371310339204, + "learning_rate": 2.3165739818666855e-05, + "loss": 2.7593, + "step": 34423 + }, + { + "epoch": 2.136942082065926, + "grad_norm": 0.13754960054147528, + "learning_rate": 2.316269254758691e-05, + "loss": 2.6933, + "step": 34424 + }, + { + "epoch": 2.137004159165684, + "grad_norm": 0.14649860948701784, + "learning_rate": 2.3159645416523733e-05, + "loss": 2.7736, + "step": 34425 + }, + { + "epoch": 2.137066236265442, + "grad_norm": 0.1402219121277598, + "learning_rate": 2.3156598425493204e-05, + "loss": 2.6416, + "step": 34426 + }, + { + "epoch": 2.1371283133651997, + "grad_norm": 0.13990287676279245, + "learning_rate": 2.31535515745112e-05, + "loss": 2.6979, + "step": 34427 + }, + { + "epoch": 2.1371903904649576, + "grad_norm": 0.14079175439950212, + "learning_rate": 2.3150504863593653e-05, + "loss": 2.6922, + "step": 34428 + }, + { + "epoch": 2.1372524675647155, + "grad_norm": 0.15413789163786745, + "learning_rate": 2.3147458292756437e-05, + "loss": 2.7424, + "step": 34429 + }, + { + "epoch": 2.1373145446644735, + "grad_norm": 0.1411335940409332, + "learning_rate": 2.3144411862015454e-05, + "loss": 2.7576, + "step": 34430 + }, + { + "epoch": 2.1373766217642314, + "grad_norm": 0.15413866502972598, + "learning_rate": 2.314136557138657e-05, + "loss": 2.7182, + "step": 34431 + }, + { + "epoch": 2.1374386988639893, + "grad_norm": 0.14222719167744627, + "learning_rate": 2.3138319420885722e-05, + "loss": 2.7143, + "step": 34432 + }, + { + "epoch": 2.1375007759637468, + "grad_norm": 0.16004578339549783, + "learning_rate": 2.3135273410528783e-05, + "loss": 2.7016, + "step": 34433 + }, + { + "epoch": 2.1375628530635047, + "grad_norm": 0.14176240533582216, + "learning_rate": 2.3132227540331647e-05, + "loss": 2.7118, + "step": 34434 + }, + { + "epoch": 2.1376249301632626, + "grad_norm": 0.14202801516073163, + "learning_rate": 2.3129181810310196e-05, + "loss": 2.6446, + "step": 34435 + }, + { + "epoch": 2.1376870072630205, + "grad_norm": 0.1443809810654583, + "learning_rate": 2.3126136220480315e-05, + "loss": 2.7842, + "step": 34436 + }, + { + "epoch": 2.1377490843627784, + "grad_norm": 0.14290917515466808, + "learning_rate": 2.312309077085792e-05, + "loss": 2.7372, + "step": 34437 + }, + { + "epoch": 2.1378111614625364, + "grad_norm": 0.17142065098128798, + "learning_rate": 2.3120045461458883e-05, + "loss": 2.7316, + "step": 34438 + }, + { + "epoch": 2.1378732385622943, + "grad_norm": 0.14092710284306434, + "learning_rate": 2.3117000292299102e-05, + "loss": 2.6782, + "step": 34439 + }, + { + "epoch": 2.137935315662052, + "grad_norm": 0.16214077427123727, + "learning_rate": 2.3113955263394432e-05, + "loss": 2.7769, + "step": 34440 + }, + { + "epoch": 2.13799739276181, + "grad_norm": 0.1477051096514203, + "learning_rate": 2.3110910374760803e-05, + "loss": 2.7012, + "step": 34441 + }, + { + "epoch": 2.138059469861568, + "grad_norm": 0.16048644499455378, + "learning_rate": 2.3107865626414082e-05, + "loss": 2.6938, + "step": 34442 + }, + { + "epoch": 2.138121546961326, + "grad_norm": 0.15249653637384364, + "learning_rate": 2.3104821018370153e-05, + "loss": 2.6645, + "step": 34443 + }, + { + "epoch": 2.138183624061084, + "grad_norm": 0.14288168429210285, + "learning_rate": 2.3101776550644888e-05, + "loss": 2.7068, + "step": 34444 + }, + { + "epoch": 2.138245701160842, + "grad_norm": 0.1488330239518436, + "learning_rate": 2.30987322232542e-05, + "loss": 2.6726, + "step": 34445 + }, + { + "epoch": 2.1383077782605997, + "grad_norm": 0.1352684550258637, + "learning_rate": 2.309568803621396e-05, + "loss": 2.6168, + "step": 34446 + }, + { + "epoch": 2.1383698553603576, + "grad_norm": 0.1616645132327775, + "learning_rate": 2.309264398954004e-05, + "loss": 2.689, + "step": 34447 + }, + { + "epoch": 2.1384319324601155, + "grad_norm": 0.14099407771114852, + "learning_rate": 2.3089600083248336e-05, + "loss": 2.7509, + "step": 34448 + }, + { + "epoch": 2.1384940095598735, + "grad_norm": 0.14397130573629888, + "learning_rate": 2.3086556317354697e-05, + "loss": 2.8014, + "step": 34449 + }, + { + "epoch": 2.1385560866596314, + "grad_norm": 0.1439654060635358, + "learning_rate": 2.3083512691875047e-05, + "loss": 2.7367, + "step": 34450 + }, + { + "epoch": 2.1386181637593893, + "grad_norm": 0.15477413294676706, + "learning_rate": 2.3080469206825245e-05, + "loss": 2.6993, + "step": 34451 + }, + { + "epoch": 2.138680240859147, + "grad_norm": 0.13825893033391584, + "learning_rate": 2.3077425862221175e-05, + "loss": 2.6331, + "step": 34452 + }, + { + "epoch": 2.138742317958905, + "grad_norm": 0.1429230554443735, + "learning_rate": 2.3074382658078685e-05, + "loss": 2.6717, + "step": 34453 + }, + { + "epoch": 2.138804395058663, + "grad_norm": 0.17202449387736599, + "learning_rate": 2.30713395944137e-05, + "loss": 2.7561, + "step": 34454 + }, + { + "epoch": 2.138866472158421, + "grad_norm": 0.14234964798182678, + "learning_rate": 2.3068296671242067e-05, + "loss": 2.7624, + "step": 34455 + }, + { + "epoch": 2.1389285492581784, + "grad_norm": 0.14574116039093124, + "learning_rate": 2.3065253888579675e-05, + "loss": 2.7139, + "step": 34456 + }, + { + "epoch": 2.1389906263579364, + "grad_norm": 0.1446853666580819, + "learning_rate": 2.3062211246442374e-05, + "loss": 2.7391, + "step": 34457 + }, + { + "epoch": 2.1390527034576943, + "grad_norm": 0.17122155297817493, + "learning_rate": 2.305916874484605e-05, + "loss": 2.7608, + "step": 34458 + }, + { + "epoch": 2.139114780557452, + "grad_norm": 0.1436509438970992, + "learning_rate": 2.3056126383806602e-05, + "loss": 2.6908, + "step": 34459 + }, + { + "epoch": 2.13917685765721, + "grad_norm": 0.14883478229881947, + "learning_rate": 2.305308416333989e-05, + "loss": 2.6936, + "step": 34460 + }, + { + "epoch": 2.139238934756968, + "grad_norm": 0.14461459514690503, + "learning_rate": 2.3050042083461774e-05, + "loss": 2.7522, + "step": 34461 + }, + { + "epoch": 2.139301011856726, + "grad_norm": 0.1488137120554404, + "learning_rate": 2.3047000144188136e-05, + "loss": 2.6847, + "step": 34462 + }, + { + "epoch": 2.139363088956484, + "grad_norm": 0.1444767643064825, + "learning_rate": 2.3043958345534816e-05, + "loss": 2.6913, + "step": 34463 + }, + { + "epoch": 2.139425166056242, + "grad_norm": 0.14214725579852686, + "learning_rate": 2.3040916687517733e-05, + "loss": 2.6664, + "step": 34464 + }, + { + "epoch": 2.1394872431559997, + "grad_norm": 0.1397970782349161, + "learning_rate": 2.3037875170152727e-05, + "loss": 2.7777, + "step": 34465 + }, + { + "epoch": 2.1395493202557576, + "grad_norm": 0.15220132307727274, + "learning_rate": 2.303483379345568e-05, + "loss": 2.7818, + "step": 34466 + }, + { + "epoch": 2.1396113973555155, + "grad_norm": 0.14849378121163492, + "learning_rate": 2.303179255744243e-05, + "loss": 2.8739, + "step": 34467 + }, + { + "epoch": 2.1396734744552734, + "grad_norm": 0.13385577522478972, + "learning_rate": 2.302875146212888e-05, + "loss": 2.7211, + "step": 34468 + }, + { + "epoch": 2.1397355515550314, + "grad_norm": 0.1392008069545791, + "learning_rate": 2.302571050753088e-05, + "loss": 2.7483, + "step": 34469 + }, + { + "epoch": 2.1397976286547893, + "grad_norm": 0.14801927047479813, + "learning_rate": 2.3022669693664294e-05, + "loss": 2.6555, + "step": 34470 + }, + { + "epoch": 2.139859705754547, + "grad_norm": 0.14342773429188546, + "learning_rate": 2.3019629020544996e-05, + "loss": 2.7386, + "step": 34471 + }, + { + "epoch": 2.139921782854305, + "grad_norm": 0.1424629843888797, + "learning_rate": 2.301658848818882e-05, + "loss": 2.666, + "step": 34472 + }, + { + "epoch": 2.139983859954063, + "grad_norm": 0.14245753218040708, + "learning_rate": 2.3013548096611677e-05, + "loss": 2.716, + "step": 34473 + }, + { + "epoch": 2.140045937053821, + "grad_norm": 0.14432766468668085, + "learning_rate": 2.30105078458294e-05, + "loss": 2.6961, + "step": 34474 + }, + { + "epoch": 2.140108014153579, + "grad_norm": 0.14284596694823243, + "learning_rate": 2.3007467735857856e-05, + "loss": 2.7649, + "step": 34475 + }, + { + "epoch": 2.140170091253337, + "grad_norm": 0.15976457269037553, + "learning_rate": 2.300442776671289e-05, + "loss": 2.7636, + "step": 34476 + }, + { + "epoch": 2.1402321683530947, + "grad_norm": 0.15200578664521844, + "learning_rate": 2.300138793841039e-05, + "loss": 2.7211, + "step": 34477 + }, + { + "epoch": 2.1402942454528526, + "grad_norm": 0.14338591769064282, + "learning_rate": 2.2998348250966207e-05, + "loss": 2.8227, + "step": 34478 + }, + { + "epoch": 2.1403563225526105, + "grad_norm": 0.1569409233997518, + "learning_rate": 2.2995308704396196e-05, + "loss": 2.7643, + "step": 34479 + }, + { + "epoch": 2.1404183996523685, + "grad_norm": 0.1465001352097177, + "learning_rate": 2.2992269298716196e-05, + "loss": 2.7083, + "step": 34480 + }, + { + "epoch": 2.140480476752126, + "grad_norm": 0.1561752378894575, + "learning_rate": 2.2989230033942107e-05, + "loss": 2.7312, + "step": 34481 + }, + { + "epoch": 2.140542553851884, + "grad_norm": 0.1648500194984444, + "learning_rate": 2.2986190910089756e-05, + "loss": 2.7504, + "step": 34482 + }, + { + "epoch": 2.1406046309516418, + "grad_norm": 0.1609394126611773, + "learning_rate": 2.2983151927175007e-05, + "loss": 2.8026, + "step": 34483 + }, + { + "epoch": 2.1406667080513997, + "grad_norm": 0.1434998569745338, + "learning_rate": 2.2980113085213713e-05, + "loss": 2.6979, + "step": 34484 + }, + { + "epoch": 2.1407287851511576, + "grad_norm": 0.14665641219038483, + "learning_rate": 2.297707438422171e-05, + "loss": 2.7487, + "step": 34485 + }, + { + "epoch": 2.1407908622509155, + "grad_norm": 0.14163033423306354, + "learning_rate": 2.2974035824214886e-05, + "loss": 2.6405, + "step": 34486 + }, + { + "epoch": 2.1408529393506734, + "grad_norm": 0.16851833720670756, + "learning_rate": 2.2970997405209075e-05, + "loss": 2.7736, + "step": 34487 + }, + { + "epoch": 2.1409150164504314, + "grad_norm": 0.14680932560124144, + "learning_rate": 2.296795912722014e-05, + "loss": 2.6988, + "step": 34488 + }, + { + "epoch": 2.1409770935501893, + "grad_norm": 0.16114225070939295, + "learning_rate": 2.29649209902639e-05, + "loss": 2.7603, + "step": 34489 + }, + { + "epoch": 2.141039170649947, + "grad_norm": 0.141935572822351, + "learning_rate": 2.296188299435623e-05, + "loss": 2.6807, + "step": 34490 + }, + { + "epoch": 2.141101247749705, + "grad_norm": 0.13475022020613914, + "learning_rate": 2.295884513951301e-05, + "loss": 2.5595, + "step": 34491 + }, + { + "epoch": 2.141163324849463, + "grad_norm": 0.144977820463582, + "learning_rate": 2.295580742575005e-05, + "loss": 2.7438, + "step": 34492 + }, + { + "epoch": 2.141225401949221, + "grad_norm": 0.15229290333761944, + "learning_rate": 2.2952769853083205e-05, + "loss": 2.7384, + "step": 34493 + }, + { + "epoch": 2.141287479048979, + "grad_norm": 0.15923458744042762, + "learning_rate": 2.2949732421528307e-05, + "loss": 2.6993, + "step": 34494 + }, + { + "epoch": 2.141349556148737, + "grad_norm": 0.14859160153974788, + "learning_rate": 2.2946695131101247e-05, + "loss": 2.7094, + "step": 34495 + }, + { + "epoch": 2.1414116332484947, + "grad_norm": 0.16107676716223965, + "learning_rate": 2.2943657981817845e-05, + "loss": 2.7312, + "step": 34496 + }, + { + "epoch": 2.1414737103482526, + "grad_norm": 0.16796842286721245, + "learning_rate": 2.2940620973693944e-05, + "loss": 2.6517, + "step": 34497 + }, + { + "epoch": 2.1415357874480105, + "grad_norm": 0.15358532034520364, + "learning_rate": 2.293758410674539e-05, + "loss": 2.6916, + "step": 34498 + }, + { + "epoch": 2.1415978645477685, + "grad_norm": 0.13955698820369314, + "learning_rate": 2.293454738098802e-05, + "loss": 2.7607, + "step": 34499 + }, + { + "epoch": 2.1416599416475264, + "grad_norm": 0.17530288774484604, + "learning_rate": 2.29315107964377e-05, + "loss": 2.7398, + "step": 34500 + }, + { + "epoch": 2.1417220187472843, + "grad_norm": 0.1693961901975504, + "learning_rate": 2.2928474353110256e-05, + "loss": 2.6835, + "step": 34501 + }, + { + "epoch": 2.141784095847042, + "grad_norm": 0.17683550144257237, + "learning_rate": 2.2925438051021537e-05, + "loss": 2.7324, + "step": 34502 + }, + { + "epoch": 2.1418461729468, + "grad_norm": 0.14780115833078603, + "learning_rate": 2.292240189018736e-05, + "loss": 2.6932, + "step": 34503 + }, + { + "epoch": 2.1419082500465576, + "grad_norm": 0.16987260016338476, + "learning_rate": 2.2919365870623605e-05, + "loss": 2.7685, + "step": 34504 + }, + { + "epoch": 2.1419703271463155, + "grad_norm": 0.14435331242607208, + "learning_rate": 2.2916329992346093e-05, + "loss": 2.7375, + "step": 34505 + }, + { + "epoch": 2.1420324042460734, + "grad_norm": 0.15114171863382267, + "learning_rate": 2.291329425537066e-05, + "loss": 2.6791, + "step": 34506 + }, + { + "epoch": 2.1420944813458314, + "grad_norm": 0.14794855778612606, + "learning_rate": 2.2910258659713147e-05, + "loss": 2.7747, + "step": 34507 + }, + { + "epoch": 2.1421565584455893, + "grad_norm": 0.14288859986944893, + "learning_rate": 2.2907223205389372e-05, + "loss": 2.6825, + "step": 34508 + }, + { + "epoch": 2.142218635545347, + "grad_norm": 0.16104798829706207, + "learning_rate": 2.2904187892415212e-05, + "loss": 2.6999, + "step": 34509 + }, + { + "epoch": 2.142280712645105, + "grad_norm": 0.17634033725349121, + "learning_rate": 2.290115272080648e-05, + "loss": 2.7525, + "step": 34510 + }, + { + "epoch": 2.142342789744863, + "grad_norm": 0.18419178491144572, + "learning_rate": 2.2898117690579013e-05, + "loss": 2.7195, + "step": 34511 + }, + { + "epoch": 2.142404866844621, + "grad_norm": 0.1415662458378945, + "learning_rate": 2.2895082801748626e-05, + "loss": 2.7345, + "step": 34512 + }, + { + "epoch": 2.142466943944379, + "grad_norm": 0.1439999726344288, + "learning_rate": 2.289204805433119e-05, + "loss": 2.7441, + "step": 34513 + }, + { + "epoch": 2.142529021044137, + "grad_norm": 0.18994215153643548, + "learning_rate": 2.288901344834252e-05, + "loss": 2.7836, + "step": 34514 + }, + { + "epoch": 2.1425910981438947, + "grad_norm": 0.15308210978938783, + "learning_rate": 2.2885978983798452e-05, + "loss": 2.6712, + "step": 34515 + }, + { + "epoch": 2.1426531752436526, + "grad_norm": 0.14755508751935406, + "learning_rate": 2.288294466071479e-05, + "loss": 2.6631, + "step": 34516 + }, + { + "epoch": 2.1427152523434105, + "grad_norm": 0.1420080073803823, + "learning_rate": 2.2879910479107407e-05, + "loss": 2.7569, + "step": 34517 + }, + { + "epoch": 2.1427773294431685, + "grad_norm": 0.1599392467506252, + "learning_rate": 2.2876876438992117e-05, + "loss": 2.678, + "step": 34518 + }, + { + "epoch": 2.1428394065429264, + "grad_norm": 0.14831675462170227, + "learning_rate": 2.287384254038475e-05, + "loss": 2.6493, + "step": 34519 + }, + { + "epoch": 2.1429014836426843, + "grad_norm": 0.14275739451812577, + "learning_rate": 2.2870808783301128e-05, + "loss": 2.6705, + "step": 34520 + }, + { + "epoch": 2.142963560742442, + "grad_norm": 0.17182437736439213, + "learning_rate": 2.2867775167757067e-05, + "loss": 2.7782, + "step": 34521 + }, + { + "epoch": 2.1430256378422, + "grad_norm": 0.1783337286156733, + "learning_rate": 2.286474169376842e-05, + "loss": 2.7884, + "step": 34522 + }, + { + "epoch": 2.143087714941958, + "grad_norm": 0.14961911137425407, + "learning_rate": 2.286170836135099e-05, + "loss": 2.7983, + "step": 34523 + }, + { + "epoch": 2.143149792041716, + "grad_norm": 0.1561975569931304, + "learning_rate": 2.2858675170520638e-05, + "loss": 2.761, + "step": 34524 + }, + { + "epoch": 2.143211869141474, + "grad_norm": 0.1737889985187093, + "learning_rate": 2.2855642121293163e-05, + "loss": 2.7529, + "step": 34525 + }, + { + "epoch": 2.143273946241232, + "grad_norm": 0.15415192688788756, + "learning_rate": 2.2852609213684373e-05, + "loss": 2.6512, + "step": 34526 + }, + { + "epoch": 2.1433360233409897, + "grad_norm": 0.14528680825508983, + "learning_rate": 2.284957644771013e-05, + "loss": 2.7312, + "step": 34527 + }, + { + "epoch": 2.1433981004407476, + "grad_norm": 0.15693536886791032, + "learning_rate": 2.2846543823386233e-05, + "loss": 2.684, + "step": 34528 + }, + { + "epoch": 2.143460177540505, + "grad_norm": 0.14619929229321077, + "learning_rate": 2.284351134072852e-05, + "loss": 2.7973, + "step": 34529 + }, + { + "epoch": 2.143522254640263, + "grad_norm": 0.15870064928573097, + "learning_rate": 2.2840478999752773e-05, + "loss": 2.8302, + "step": 34530 + }, + { + "epoch": 2.143584331740021, + "grad_norm": 0.16161723867007954, + "learning_rate": 2.283744680047486e-05, + "loss": 2.7497, + "step": 34531 + }, + { + "epoch": 2.143646408839779, + "grad_norm": 0.15275051758478697, + "learning_rate": 2.2834414742910583e-05, + "loss": 2.6809, + "step": 34532 + }, + { + "epoch": 2.143708485939537, + "grad_norm": 0.14036199028248036, + "learning_rate": 2.283138282707576e-05, + "loss": 2.7594, + "step": 34533 + }, + { + "epoch": 2.1437705630392947, + "grad_norm": 0.1376410776560227, + "learning_rate": 2.2828351052986203e-05, + "loss": 2.6277, + "step": 34534 + }, + { + "epoch": 2.1438326401390526, + "grad_norm": 0.13587541011750165, + "learning_rate": 2.282531942065772e-05, + "loss": 2.7275, + "step": 34535 + }, + { + "epoch": 2.1438947172388105, + "grad_norm": 0.1431160472897544, + "learning_rate": 2.2822287930106162e-05, + "loss": 2.7003, + "step": 34536 + }, + { + "epoch": 2.1439567943385685, + "grad_norm": 0.1586126396129582, + "learning_rate": 2.281925658134732e-05, + "loss": 2.7569, + "step": 34537 + }, + { + "epoch": 2.1440188714383264, + "grad_norm": 0.14919075453611924, + "learning_rate": 2.281622537439702e-05, + "loss": 2.7121, + "step": 34538 + }, + { + "epoch": 2.1440809485380843, + "grad_norm": 0.14243213809099442, + "learning_rate": 2.2813194309271052e-05, + "loss": 2.7604, + "step": 34539 + }, + { + "epoch": 2.144143025637842, + "grad_norm": 0.14375943873349958, + "learning_rate": 2.2810163385985266e-05, + "loss": 2.6765, + "step": 34540 + }, + { + "epoch": 2.1442051027376, + "grad_norm": 0.14735390397578813, + "learning_rate": 2.280713260455546e-05, + "loss": 2.7804, + "step": 34541 + }, + { + "epoch": 2.144267179837358, + "grad_norm": 0.14396686806638004, + "learning_rate": 2.2804101964997444e-05, + "loss": 2.8141, + "step": 34542 + }, + { + "epoch": 2.144329256937116, + "grad_norm": 0.1443994023809324, + "learning_rate": 2.280107146732701e-05, + "loss": 2.669, + "step": 34543 + }, + { + "epoch": 2.144391334036874, + "grad_norm": 0.14572247409949776, + "learning_rate": 2.279804111156001e-05, + "loss": 2.6978, + "step": 34544 + }, + { + "epoch": 2.144453411136632, + "grad_norm": 0.1415957482558339, + "learning_rate": 2.279501089771223e-05, + "loss": 2.6532, + "step": 34545 + }, + { + "epoch": 2.1445154882363897, + "grad_norm": 0.1499756997015036, + "learning_rate": 2.2791980825799485e-05, + "loss": 2.7382, + "step": 34546 + }, + { + "epoch": 2.1445775653361476, + "grad_norm": 0.14876875807190754, + "learning_rate": 2.2788950895837584e-05, + "loss": 2.7297, + "step": 34547 + }, + { + "epoch": 2.1446396424359055, + "grad_norm": 0.14890367938021368, + "learning_rate": 2.2785921107842313e-05, + "loss": 2.6451, + "step": 34548 + }, + { + "epoch": 2.1447017195356635, + "grad_norm": 0.14305110107421123, + "learning_rate": 2.278289146182951e-05, + "loss": 2.7917, + "step": 34549 + }, + { + "epoch": 2.1447637966354214, + "grad_norm": 0.14370036356650348, + "learning_rate": 2.2779861957814975e-05, + "loss": 2.7606, + "step": 34550 + }, + { + "epoch": 2.1448258737351793, + "grad_norm": 0.14703372941070103, + "learning_rate": 2.27768325958145e-05, + "loss": 2.6984, + "step": 34551 + }, + { + "epoch": 2.1448879508349368, + "grad_norm": 0.1478037417944257, + "learning_rate": 2.2773803375843888e-05, + "loss": 2.6778, + "step": 34552 + }, + { + "epoch": 2.1449500279346947, + "grad_norm": 0.14823841245968566, + "learning_rate": 2.2770774297918968e-05, + "loss": 2.6922, + "step": 34553 + }, + { + "epoch": 2.1450121050344526, + "grad_norm": 0.1532867236914781, + "learning_rate": 2.276774536205553e-05, + "loss": 2.6943, + "step": 34554 + }, + { + "epoch": 2.1450741821342105, + "grad_norm": 0.14963765337748716, + "learning_rate": 2.2764716568269356e-05, + "loss": 2.7696, + "step": 34555 + }, + { + "epoch": 2.1451362592339684, + "grad_norm": 0.14517170535323917, + "learning_rate": 2.2761687916576284e-05, + "loss": 2.822, + "step": 34556 + }, + { + "epoch": 2.1451983363337264, + "grad_norm": 0.1389075144226403, + "learning_rate": 2.2758659406992105e-05, + "loss": 2.5629, + "step": 34557 + }, + { + "epoch": 2.1452604134334843, + "grad_norm": 0.14567767859378108, + "learning_rate": 2.2755631039532587e-05, + "loss": 2.7857, + "step": 34558 + }, + { + "epoch": 2.145322490533242, + "grad_norm": 0.14751620867338655, + "learning_rate": 2.275260281421358e-05, + "loss": 2.7315, + "step": 34559 + }, + { + "epoch": 2.145384567633, + "grad_norm": 0.14574265570196004, + "learning_rate": 2.2749574731050856e-05, + "loss": 2.7907, + "step": 34560 + }, + { + "epoch": 2.145446644732758, + "grad_norm": 0.1471869025552372, + "learning_rate": 2.2746546790060213e-05, + "loss": 2.6273, + "step": 34561 + }, + { + "epoch": 2.145508721832516, + "grad_norm": 0.15514127036675138, + "learning_rate": 2.2743518991257433e-05, + "loss": 2.7833, + "step": 34562 + }, + { + "epoch": 2.145570798932274, + "grad_norm": 0.1565087738575037, + "learning_rate": 2.2740491334658352e-05, + "loss": 2.7365, + "step": 34563 + }, + { + "epoch": 2.145632876032032, + "grad_norm": 0.16442615079408895, + "learning_rate": 2.2737463820278747e-05, + "loss": 2.6866, + "step": 34564 + }, + { + "epoch": 2.1456949531317897, + "grad_norm": 0.150284692686377, + "learning_rate": 2.273443644813441e-05, + "loss": 2.7758, + "step": 34565 + }, + { + "epoch": 2.1457570302315476, + "grad_norm": 0.1431708231496718, + "learning_rate": 2.2731409218241124e-05, + "loss": 2.8084, + "step": 34566 + }, + { + "epoch": 2.1458191073313055, + "grad_norm": 0.1659421253188317, + "learning_rate": 2.2728382130614705e-05, + "loss": 2.7752, + "step": 34567 + }, + { + "epoch": 2.1458811844310635, + "grad_norm": 0.14154347510844043, + "learning_rate": 2.2725355185270942e-05, + "loss": 2.6864, + "step": 34568 + }, + { + "epoch": 2.1459432615308214, + "grad_norm": 0.17858473694119642, + "learning_rate": 2.272232838222562e-05, + "loss": 2.7602, + "step": 34569 + }, + { + "epoch": 2.1460053386305793, + "grad_norm": 0.14279764729441347, + "learning_rate": 2.2719301721494534e-05, + "loss": 2.802, + "step": 34570 + }, + { + "epoch": 2.146067415730337, + "grad_norm": 0.1616053098133924, + "learning_rate": 2.2716275203093455e-05, + "loss": 2.7476, + "step": 34571 + }, + { + "epoch": 2.146129492830095, + "grad_norm": 0.15779906347879838, + "learning_rate": 2.271324882703821e-05, + "loss": 2.6268, + "step": 34572 + }, + { + "epoch": 2.146191569929853, + "grad_norm": 0.15508623053942103, + "learning_rate": 2.2710222593344565e-05, + "loss": 2.7315, + "step": 34573 + }, + { + "epoch": 2.146253647029611, + "grad_norm": 0.14318774809330895, + "learning_rate": 2.270719650202832e-05, + "loss": 2.616, + "step": 34574 + }, + { + "epoch": 2.1463157241293684, + "grad_norm": 0.142391291445685, + "learning_rate": 2.2704170553105237e-05, + "loss": 2.7316, + "step": 34575 + }, + { + "epoch": 2.146377801229127, + "grad_norm": 0.143949588313579, + "learning_rate": 2.2701144746591142e-05, + "loss": 2.746, + "step": 34576 + }, + { + "epoch": 2.1464398783288843, + "grad_norm": 0.14914570188798515, + "learning_rate": 2.26981190825018e-05, + "loss": 2.8139, + "step": 34577 + }, + { + "epoch": 2.146501955428642, + "grad_norm": 0.1691875870506787, + "learning_rate": 2.2695093560853e-05, + "loss": 2.7391, + "step": 34578 + }, + { + "epoch": 2.1465640325284, + "grad_norm": 0.139926919500088, + "learning_rate": 2.26920681816605e-05, + "loss": 2.717, + "step": 34579 + }, + { + "epoch": 2.146626109628158, + "grad_norm": 0.1411254639960028, + "learning_rate": 2.2689042944940138e-05, + "loss": 2.7344, + "step": 34580 + }, + { + "epoch": 2.146688186727916, + "grad_norm": 0.13688703806296448, + "learning_rate": 2.2686017850707663e-05, + "loss": 2.7214, + "step": 34581 + }, + { + "epoch": 2.146750263827674, + "grad_norm": 0.1573684597027399, + "learning_rate": 2.268299289897886e-05, + "loss": 2.7663, + "step": 34582 + }, + { + "epoch": 2.146812340927432, + "grad_norm": 0.14842151254588992, + "learning_rate": 2.267996808976952e-05, + "loss": 2.658, + "step": 34583 + }, + { + "epoch": 2.1468744180271897, + "grad_norm": 0.1443207019099996, + "learning_rate": 2.2676943423095404e-05, + "loss": 2.7628, + "step": 34584 + }, + { + "epoch": 2.1469364951269476, + "grad_norm": 0.1439407244382021, + "learning_rate": 2.267391889897232e-05, + "loss": 2.7462, + "step": 34585 + }, + { + "epoch": 2.1469985722267055, + "grad_norm": 0.13415614120022254, + "learning_rate": 2.2670894517416036e-05, + "loss": 2.7495, + "step": 34586 + }, + { + "epoch": 2.1470606493264635, + "grad_norm": 0.14460134095431526, + "learning_rate": 2.2667870278442327e-05, + "loss": 2.7703, + "step": 34587 + }, + { + "epoch": 2.1471227264262214, + "grad_norm": 0.17356332154297624, + "learning_rate": 2.2664846182066957e-05, + "loss": 2.8131, + "step": 34588 + }, + { + "epoch": 2.1471848035259793, + "grad_norm": 0.14371604025912094, + "learning_rate": 2.2661822228305724e-05, + "loss": 2.7375, + "step": 34589 + }, + { + "epoch": 2.147246880625737, + "grad_norm": 0.14878405722053176, + "learning_rate": 2.2658798417174415e-05, + "loss": 2.8277, + "step": 34590 + }, + { + "epoch": 2.147308957725495, + "grad_norm": 0.14136202710808177, + "learning_rate": 2.2655774748688796e-05, + "loss": 2.656, + "step": 34591 + }, + { + "epoch": 2.147371034825253, + "grad_norm": 0.16238568209603335, + "learning_rate": 2.265275122286463e-05, + "loss": 2.7074, + "step": 34592 + }, + { + "epoch": 2.147433111925011, + "grad_norm": 0.14180845733502356, + "learning_rate": 2.2649727839717694e-05, + "loss": 2.6779, + "step": 34593 + }, + { + "epoch": 2.147495189024769, + "grad_norm": 0.15832619530708317, + "learning_rate": 2.2646704599263774e-05, + "loss": 2.7376, + "step": 34594 + }, + { + "epoch": 2.147557266124527, + "grad_norm": 0.16446616830091515, + "learning_rate": 2.2643681501518643e-05, + "loss": 2.75, + "step": 34595 + }, + { + "epoch": 2.1476193432242847, + "grad_norm": 0.1462048788749252, + "learning_rate": 2.2640658546498068e-05, + "loss": 2.7444, + "step": 34596 + }, + { + "epoch": 2.1476814203240426, + "grad_norm": 0.15968398500892556, + "learning_rate": 2.263763573421781e-05, + "loss": 2.7349, + "step": 34597 + }, + { + "epoch": 2.1477434974238006, + "grad_norm": 0.14545494117508576, + "learning_rate": 2.263461306469364e-05, + "loss": 2.7055, + "step": 34598 + }, + { + "epoch": 2.1478055745235585, + "grad_norm": 0.14381786149229475, + "learning_rate": 2.263159053794135e-05, + "loss": 2.8164, + "step": 34599 + }, + { + "epoch": 2.147867651623316, + "grad_norm": 0.13928403192899705, + "learning_rate": 2.2628568153976696e-05, + "loss": 2.7274, + "step": 34600 + }, + { + "epoch": 2.147929728723074, + "grad_norm": 0.15965110503101615, + "learning_rate": 2.262554591281545e-05, + "loss": 2.8197, + "step": 34601 + }, + { + "epoch": 2.147991805822832, + "grad_norm": 0.15357279792758113, + "learning_rate": 2.2622523814473355e-05, + "loss": 2.7086, + "step": 34602 + }, + { + "epoch": 2.1480538829225897, + "grad_norm": 0.1534141022834196, + "learning_rate": 2.261950185896622e-05, + "loss": 2.6545, + "step": 34603 + }, + { + "epoch": 2.1481159600223476, + "grad_norm": 0.19310420953697524, + "learning_rate": 2.261648004630979e-05, + "loss": 2.7322, + "step": 34604 + }, + { + "epoch": 2.1481780371221055, + "grad_norm": 0.1436981794516896, + "learning_rate": 2.261345837651983e-05, + "loss": 2.6804, + "step": 34605 + }, + { + "epoch": 2.1482401142218635, + "grad_norm": 0.14524952154400825, + "learning_rate": 2.2610436849612103e-05, + "loss": 2.6334, + "step": 34606 + }, + { + "epoch": 2.1483021913216214, + "grad_norm": 0.13915700406971662, + "learning_rate": 2.2607415465602367e-05, + "loss": 2.6037, + "step": 34607 + }, + { + "epoch": 2.1483642684213793, + "grad_norm": 0.18344602322963666, + "learning_rate": 2.2604394224506405e-05, + "loss": 2.7805, + "step": 34608 + }, + { + "epoch": 2.148426345521137, + "grad_norm": 0.14521835004810135, + "learning_rate": 2.260137312633997e-05, + "loss": 2.6733, + "step": 34609 + }, + { + "epoch": 2.148488422620895, + "grad_norm": 0.14785710857274464, + "learning_rate": 2.2598352171118825e-05, + "loss": 2.7327, + "step": 34610 + }, + { + "epoch": 2.148550499720653, + "grad_norm": 0.13875578638417468, + "learning_rate": 2.2595331358858707e-05, + "loss": 2.6947, + "step": 34611 + }, + { + "epoch": 2.148612576820411, + "grad_norm": 0.15568953967318433, + "learning_rate": 2.259231068957542e-05, + "loss": 2.7072, + "step": 34612 + }, + { + "epoch": 2.148674653920169, + "grad_norm": 0.16543506762190377, + "learning_rate": 2.25892901632847e-05, + "loss": 2.713, + "step": 34613 + }, + { + "epoch": 2.148736731019927, + "grad_norm": 0.18410199408720385, + "learning_rate": 2.2586269780002307e-05, + "loss": 2.7017, + "step": 34614 + }, + { + "epoch": 2.1487988081196847, + "grad_norm": 0.14247903254453806, + "learning_rate": 2.2583249539743978e-05, + "loss": 2.6233, + "step": 34615 + }, + { + "epoch": 2.1488608852194426, + "grad_norm": 0.14275577660328423, + "learning_rate": 2.2580229442525513e-05, + "loss": 2.7132, + "step": 34616 + }, + { + "epoch": 2.1489229623192005, + "grad_norm": 0.15331160769686947, + "learning_rate": 2.257720948836265e-05, + "loss": 2.779, + "step": 34617 + }, + { + "epoch": 2.1489850394189585, + "grad_norm": 0.14952739456492, + "learning_rate": 2.257418967727114e-05, + "loss": 2.7638, + "step": 34618 + }, + { + "epoch": 2.1490471165187164, + "grad_norm": 0.14006761847654278, + "learning_rate": 2.257117000926674e-05, + "loss": 2.7206, + "step": 34619 + }, + { + "epoch": 2.1491091936184743, + "grad_norm": 0.1452456102442421, + "learning_rate": 2.2568150484365184e-05, + "loss": 2.659, + "step": 34620 + }, + { + "epoch": 2.149171270718232, + "grad_norm": 0.13766252519634503, + "learning_rate": 2.256513110258225e-05, + "loss": 2.6691, + "step": 34621 + }, + { + "epoch": 2.14923334781799, + "grad_norm": 0.1378916632313316, + "learning_rate": 2.2562111863933706e-05, + "loss": 2.6699, + "step": 34622 + }, + { + "epoch": 2.1492954249177476, + "grad_norm": 0.15022541235633713, + "learning_rate": 2.255909276843528e-05, + "loss": 2.7162, + "step": 34623 + }, + { + "epoch": 2.149357502017506, + "grad_norm": 0.17529583771889468, + "learning_rate": 2.2556073816102735e-05, + "loss": 2.7314, + "step": 34624 + }, + { + "epoch": 2.1494195791172634, + "grad_norm": 0.14198442199125255, + "learning_rate": 2.2553055006951795e-05, + "loss": 2.7826, + "step": 34625 + }, + { + "epoch": 2.1494816562170214, + "grad_norm": 0.15324516641607971, + "learning_rate": 2.2550036340998244e-05, + "loss": 2.7379, + "step": 34626 + }, + { + "epoch": 2.1495437333167793, + "grad_norm": 0.15490058902741768, + "learning_rate": 2.254701781825782e-05, + "loss": 2.7554, + "step": 34627 + }, + { + "epoch": 2.149605810416537, + "grad_norm": 0.14933456225688702, + "learning_rate": 2.2543999438746272e-05, + "loss": 2.6962, + "step": 34628 + }, + { + "epoch": 2.149667887516295, + "grad_norm": 0.1591927751180828, + "learning_rate": 2.254098120247932e-05, + "loss": 2.7155, + "step": 34629 + }, + { + "epoch": 2.149729964616053, + "grad_norm": 0.16429926046543425, + "learning_rate": 2.253796310947276e-05, + "loss": 2.6867, + "step": 34630 + }, + { + "epoch": 2.149792041715811, + "grad_norm": 0.14925340354934813, + "learning_rate": 2.2534945159742306e-05, + "loss": 2.7667, + "step": 34631 + }, + { + "epoch": 2.149854118815569, + "grad_norm": 0.15067788346275166, + "learning_rate": 2.253192735330371e-05, + "loss": 2.6017, + "step": 34632 + }, + { + "epoch": 2.149916195915327, + "grad_norm": 0.14357045794166862, + "learning_rate": 2.2528909690172723e-05, + "loss": 2.7476, + "step": 34633 + }, + { + "epoch": 2.1499782730150847, + "grad_norm": 0.141713795086378, + "learning_rate": 2.2525892170365064e-05, + "loss": 2.7107, + "step": 34634 + }, + { + "epoch": 2.1500403501148426, + "grad_norm": 0.15871679249694443, + "learning_rate": 2.252287479389651e-05, + "loss": 2.6989, + "step": 34635 + }, + { + "epoch": 2.1501024272146005, + "grad_norm": 0.1426969059547909, + "learning_rate": 2.251985756078279e-05, + "loss": 2.7444, + "step": 34636 + }, + { + "epoch": 2.1501645043143585, + "grad_norm": 0.1378449175133791, + "learning_rate": 2.2516840471039647e-05, + "loss": 2.618, + "step": 34637 + }, + { + "epoch": 2.1502265814141164, + "grad_norm": 0.14700094692134463, + "learning_rate": 2.2513823524682796e-05, + "loss": 2.7071, + "step": 34638 + }, + { + "epoch": 2.1502886585138743, + "grad_norm": 0.13585930170032, + "learning_rate": 2.2510806721728017e-05, + "loss": 2.6973, + "step": 34639 + }, + { + "epoch": 2.150350735613632, + "grad_norm": 0.14584980528198613, + "learning_rate": 2.2507790062191037e-05, + "loss": 2.6684, + "step": 34640 + }, + { + "epoch": 2.15041281271339, + "grad_norm": 0.165134842296296, + "learning_rate": 2.2504773546087594e-05, + "loss": 2.774, + "step": 34641 + }, + { + "epoch": 2.150474889813148, + "grad_norm": 0.14421722733674344, + "learning_rate": 2.2501757173433412e-05, + "loss": 2.634, + "step": 34642 + }, + { + "epoch": 2.150536966912906, + "grad_norm": 0.1357043096812382, + "learning_rate": 2.249874094424423e-05, + "loss": 2.7041, + "step": 34643 + }, + { + "epoch": 2.150599044012664, + "grad_norm": 0.14467319400291956, + "learning_rate": 2.2495724858535805e-05, + "loss": 2.7377, + "step": 34644 + }, + { + "epoch": 2.150661121112422, + "grad_norm": 0.13956571676576215, + "learning_rate": 2.2492708916323862e-05, + "loss": 2.8224, + "step": 34645 + }, + { + "epoch": 2.1507231982121797, + "grad_norm": 0.15490434672487155, + "learning_rate": 2.2489693117624135e-05, + "loss": 2.7236, + "step": 34646 + }, + { + "epoch": 2.1507852753119376, + "grad_norm": 0.14253370690522998, + "learning_rate": 2.248667746245233e-05, + "loss": 2.7596, + "step": 34647 + }, + { + "epoch": 2.150847352411695, + "grad_norm": 0.14421936484481662, + "learning_rate": 2.2483661950824237e-05, + "loss": 2.77, + "step": 34648 + }, + { + "epoch": 2.150909429511453, + "grad_norm": 0.14208026735735116, + "learning_rate": 2.248064658275556e-05, + "loss": 2.654, + "step": 34649 + }, + { + "epoch": 2.150971506611211, + "grad_norm": 0.16795958449618414, + "learning_rate": 2.2477631358262026e-05, + "loss": 2.6864, + "step": 34650 + }, + { + "epoch": 2.151033583710969, + "grad_norm": 0.1473512178189151, + "learning_rate": 2.247461627735935e-05, + "loss": 2.6495, + "step": 34651 + }, + { + "epoch": 2.151095660810727, + "grad_norm": 0.15819397063908147, + "learning_rate": 2.2471601340063304e-05, + "loss": 2.8019, + "step": 34652 + }, + { + "epoch": 2.1511577379104847, + "grad_norm": 0.14681911369081543, + "learning_rate": 2.2468586546389596e-05, + "loss": 2.6191, + "step": 34653 + }, + { + "epoch": 2.1512198150102426, + "grad_norm": 0.15738265658872924, + "learning_rate": 2.246557189635394e-05, + "loss": 2.8366, + "step": 34654 + }, + { + "epoch": 2.1512818921100005, + "grad_norm": 0.15034179390154317, + "learning_rate": 2.2462557389972095e-05, + "loss": 2.712, + "step": 34655 + }, + { + "epoch": 2.1513439692097585, + "grad_norm": 0.15123427308303106, + "learning_rate": 2.245954302725977e-05, + "loss": 2.7612, + "step": 34656 + }, + { + "epoch": 2.1514060463095164, + "grad_norm": 0.16071785757754223, + "learning_rate": 2.2456528808232686e-05, + "loss": 2.5958, + "step": 34657 + }, + { + "epoch": 2.1514681234092743, + "grad_norm": 0.14186042932437276, + "learning_rate": 2.245351473290659e-05, + "loss": 2.7517, + "step": 34658 + }, + { + "epoch": 2.151530200509032, + "grad_norm": 0.19857939805605693, + "learning_rate": 2.245050080129719e-05, + "loss": 2.7663, + "step": 34659 + }, + { + "epoch": 2.15159227760879, + "grad_norm": 0.167357658312251, + "learning_rate": 2.244748701342022e-05, + "loss": 2.6931, + "step": 34660 + }, + { + "epoch": 2.151654354708548, + "grad_norm": 0.1750675867050551, + "learning_rate": 2.2444473369291387e-05, + "loss": 2.6932, + "step": 34661 + }, + { + "epoch": 2.151716431808306, + "grad_norm": 0.15862197530852223, + "learning_rate": 2.2441459868926435e-05, + "loss": 2.6736, + "step": 34662 + }, + { + "epoch": 2.151778508908064, + "grad_norm": 0.14949423689642324, + "learning_rate": 2.2438446512341076e-05, + "loss": 2.7419, + "step": 34663 + }, + { + "epoch": 2.151840586007822, + "grad_norm": 0.15481775223389418, + "learning_rate": 2.243543329955104e-05, + "loss": 2.6982, + "step": 34664 + }, + { + "epoch": 2.1519026631075797, + "grad_norm": 0.1401536313262036, + "learning_rate": 2.2432420230572015e-05, + "loss": 2.7401, + "step": 34665 + }, + { + "epoch": 2.1519647402073376, + "grad_norm": 0.14082842259855452, + "learning_rate": 2.242940730541977e-05, + "loss": 2.7568, + "step": 34666 + }, + { + "epoch": 2.1520268173070956, + "grad_norm": 0.14622265004710422, + "learning_rate": 2.2426394524109994e-05, + "loss": 2.7462, + "step": 34667 + }, + { + "epoch": 2.1520888944068535, + "grad_norm": 0.15392267144407315, + "learning_rate": 2.2423381886658417e-05, + "loss": 2.7569, + "step": 34668 + }, + { + "epoch": 2.1521509715066114, + "grad_norm": 0.15182031672319743, + "learning_rate": 2.242036939308075e-05, + "loss": 2.6557, + "step": 34669 + }, + { + "epoch": 2.1522130486063693, + "grad_norm": 0.14928041976280995, + "learning_rate": 2.241735704339269e-05, + "loss": 2.6866, + "step": 34670 + }, + { + "epoch": 2.152275125706127, + "grad_norm": 0.1653179430831698, + "learning_rate": 2.241434483760999e-05, + "loss": 2.7284, + "step": 34671 + }, + { + "epoch": 2.152337202805885, + "grad_norm": 0.16057038262708664, + "learning_rate": 2.2411332775748356e-05, + "loss": 2.7056, + "step": 34672 + }, + { + "epoch": 2.1523992799056426, + "grad_norm": 0.16377848496638714, + "learning_rate": 2.2408320857823495e-05, + "loss": 2.7197, + "step": 34673 + }, + { + "epoch": 2.1524613570054005, + "grad_norm": 0.15780943286861226, + "learning_rate": 2.24053090838511e-05, + "loss": 2.7315, + "step": 34674 + }, + { + "epoch": 2.1525234341051585, + "grad_norm": 0.1540032199899965, + "learning_rate": 2.2402297453846927e-05, + "loss": 2.765, + "step": 34675 + }, + { + "epoch": 2.1525855112049164, + "grad_norm": 0.18012810673374005, + "learning_rate": 2.2399285967826672e-05, + "loss": 2.6328, + "step": 34676 + }, + { + "epoch": 2.1526475883046743, + "grad_norm": 0.1535899288229936, + "learning_rate": 2.2396274625806034e-05, + "loss": 2.793, + "step": 34677 + }, + { + "epoch": 2.152709665404432, + "grad_norm": 0.17191159335640488, + "learning_rate": 2.239326342780072e-05, + "loss": 2.6713, + "step": 34678 + }, + { + "epoch": 2.15277174250419, + "grad_norm": 0.14766112375037044, + "learning_rate": 2.2390252373826472e-05, + "loss": 2.7853, + "step": 34679 + }, + { + "epoch": 2.152833819603948, + "grad_norm": 0.17066788516977124, + "learning_rate": 2.238724146389898e-05, + "loss": 2.6659, + "step": 34680 + }, + { + "epoch": 2.152895896703706, + "grad_norm": 0.17596813439986558, + "learning_rate": 2.2384230698033947e-05, + "loss": 2.675, + "step": 34681 + }, + { + "epoch": 2.152957973803464, + "grad_norm": 0.14054715465583567, + "learning_rate": 2.238122007624709e-05, + "loss": 2.7002, + "step": 34682 + }, + { + "epoch": 2.153020050903222, + "grad_norm": 0.15739664509785037, + "learning_rate": 2.237820959855409e-05, + "loss": 2.6267, + "step": 34683 + }, + { + "epoch": 2.1530821280029797, + "grad_norm": 0.14066889528141696, + "learning_rate": 2.23751992649707e-05, + "loss": 2.6689, + "step": 34684 + }, + { + "epoch": 2.1531442051027376, + "grad_norm": 0.1554217679713774, + "learning_rate": 2.2372189075512597e-05, + "loss": 2.7117, + "step": 34685 + }, + { + "epoch": 2.1532062822024955, + "grad_norm": 0.14077322486121688, + "learning_rate": 2.2369179030195476e-05, + "loss": 2.7993, + "step": 34686 + }, + { + "epoch": 2.1532683593022535, + "grad_norm": 0.14107984918995606, + "learning_rate": 2.236616912903507e-05, + "loss": 2.654, + "step": 34687 + }, + { + "epoch": 2.1533304364020114, + "grad_norm": 0.1422004334886968, + "learning_rate": 2.2363159372047056e-05, + "loss": 2.6897, + "step": 34688 + }, + { + "epoch": 2.1533925135017693, + "grad_norm": 0.14977195362217385, + "learning_rate": 2.236014975924716e-05, + "loss": 2.7861, + "step": 34689 + }, + { + "epoch": 2.153454590601527, + "grad_norm": 0.1608110741442001, + "learning_rate": 2.2357140290651074e-05, + "loss": 2.8083, + "step": 34690 + }, + { + "epoch": 2.153516667701285, + "grad_norm": 0.14379343952127693, + "learning_rate": 2.23541309662745e-05, + "loss": 2.7315, + "step": 34691 + }, + { + "epoch": 2.153578744801043, + "grad_norm": 0.14891007762724837, + "learning_rate": 2.2351121786133133e-05, + "loss": 2.7073, + "step": 34692 + }, + { + "epoch": 2.153640821900801, + "grad_norm": 0.16519247633077253, + "learning_rate": 2.234811275024266e-05, + "loss": 2.713, + "step": 34693 + }, + { + "epoch": 2.153702899000559, + "grad_norm": 0.14373470230181384, + "learning_rate": 2.2345103858618815e-05, + "loss": 2.683, + "step": 34694 + }, + { + "epoch": 2.153764976100317, + "grad_norm": 0.1628272923725636, + "learning_rate": 2.2342095111277273e-05, + "loss": 2.6738, + "step": 34695 + }, + { + "epoch": 2.1538270532000743, + "grad_norm": 0.1439076191840132, + "learning_rate": 2.2339086508233737e-05, + "loss": 2.7177, + "step": 34696 + }, + { + "epoch": 2.153889130299832, + "grad_norm": 0.14118456772372076, + "learning_rate": 2.2336078049503883e-05, + "loss": 2.7196, + "step": 34697 + }, + { + "epoch": 2.15395120739959, + "grad_norm": 0.17838211242140112, + "learning_rate": 2.2333069735103445e-05, + "loss": 2.7512, + "step": 34698 + }, + { + "epoch": 2.154013284499348, + "grad_norm": 0.1399970471760453, + "learning_rate": 2.2330061565048098e-05, + "loss": 2.6856, + "step": 34699 + }, + { + "epoch": 2.154075361599106, + "grad_norm": 0.14796772453975932, + "learning_rate": 2.2327053539353538e-05, + "loss": 2.6863, + "step": 34700 + }, + { + "epoch": 2.154137438698864, + "grad_norm": 0.14221723816691917, + "learning_rate": 2.2324045658035436e-05, + "loss": 2.6825, + "step": 34701 + }, + { + "epoch": 2.154199515798622, + "grad_norm": 0.14220782752980501, + "learning_rate": 2.232103792110953e-05, + "loss": 2.6659, + "step": 34702 + }, + { + "epoch": 2.1542615928983797, + "grad_norm": 0.1571911718686673, + "learning_rate": 2.2318030328591484e-05, + "loss": 2.7276, + "step": 34703 + }, + { + "epoch": 2.1543236699981376, + "grad_norm": 0.1410221412717041, + "learning_rate": 2.2315022880496996e-05, + "loss": 2.7104, + "step": 34704 + }, + { + "epoch": 2.1543857470978955, + "grad_norm": 0.14864774295642422, + "learning_rate": 2.231201557684175e-05, + "loss": 2.7378, + "step": 34705 + }, + { + "epoch": 2.1544478241976535, + "grad_norm": 0.147312165047512, + "learning_rate": 2.2309008417641424e-05, + "loss": 2.7054, + "step": 34706 + }, + { + "epoch": 2.1545099012974114, + "grad_norm": 0.17410114899385717, + "learning_rate": 2.2306001402911742e-05, + "loss": 2.7102, + "step": 34707 + }, + { + "epoch": 2.1545719783971693, + "grad_norm": 0.17031004295851596, + "learning_rate": 2.2302994532668375e-05, + "loss": 2.6972, + "step": 34708 + }, + { + "epoch": 2.154634055496927, + "grad_norm": 0.15934084708656884, + "learning_rate": 2.229998780692701e-05, + "loss": 2.6921, + "step": 34709 + }, + { + "epoch": 2.154696132596685, + "grad_norm": 0.14484743381083084, + "learning_rate": 2.229698122570331e-05, + "loss": 2.7626, + "step": 34710 + }, + { + "epoch": 2.154758209696443, + "grad_norm": 0.1403806404443775, + "learning_rate": 2.2293974789013006e-05, + "loss": 2.7865, + "step": 34711 + }, + { + "epoch": 2.154820286796201, + "grad_norm": 0.14296603017836018, + "learning_rate": 2.2290968496871763e-05, + "loss": 2.7089, + "step": 34712 + }, + { + "epoch": 2.154882363895959, + "grad_norm": 0.14112982603450883, + "learning_rate": 2.228796234929526e-05, + "loss": 2.7094, + "step": 34713 + }, + { + "epoch": 2.154944440995717, + "grad_norm": 0.17040424282011862, + "learning_rate": 2.228495634629917e-05, + "loss": 2.6573, + "step": 34714 + }, + { + "epoch": 2.1550065180954747, + "grad_norm": 0.17056719149137198, + "learning_rate": 2.2281950487899205e-05, + "loss": 2.7466, + "step": 34715 + }, + { + "epoch": 2.1550685951952326, + "grad_norm": 0.1538648835373104, + "learning_rate": 2.227894477411104e-05, + "loss": 2.6849, + "step": 34716 + }, + { + "epoch": 2.1551306722949906, + "grad_norm": 0.17703494060537986, + "learning_rate": 2.227593920495034e-05, + "loss": 2.718, + "step": 34717 + }, + { + "epoch": 2.1551927493947485, + "grad_norm": 0.13740285969564797, + "learning_rate": 2.22729337804328e-05, + "loss": 2.6691, + "step": 34718 + }, + { + "epoch": 2.155254826494506, + "grad_norm": 0.15847946686183653, + "learning_rate": 2.2269928500574085e-05, + "loss": 2.6749, + "step": 34719 + }, + { + "epoch": 2.1553169035942643, + "grad_norm": 0.14800004117673546, + "learning_rate": 2.2266923365389876e-05, + "loss": 2.7817, + "step": 34720 + }, + { + "epoch": 2.155378980694022, + "grad_norm": 0.15471237709875466, + "learning_rate": 2.2263918374895876e-05, + "loss": 2.7761, + "step": 34721 + }, + { + "epoch": 2.1554410577937797, + "grad_norm": 0.16247223263783153, + "learning_rate": 2.226091352910775e-05, + "loss": 2.8004, + "step": 34722 + }, + { + "epoch": 2.1555031348935376, + "grad_norm": 0.14377334057975097, + "learning_rate": 2.2257908828041175e-05, + "loss": 2.6676, + "step": 34723 + }, + { + "epoch": 2.1555652119932955, + "grad_norm": 0.14362058619132162, + "learning_rate": 2.22549042717118e-05, + "loss": 2.7545, + "step": 34724 + }, + { + "epoch": 2.1556272890930535, + "grad_norm": 0.16582521289822988, + "learning_rate": 2.225189986013535e-05, + "loss": 2.6984, + "step": 34725 + }, + { + "epoch": 2.1556893661928114, + "grad_norm": 0.16598543565804988, + "learning_rate": 2.2248895593327473e-05, + "loss": 2.7751, + "step": 34726 + }, + { + "epoch": 2.1557514432925693, + "grad_norm": 0.15332444300827944, + "learning_rate": 2.2245891471303842e-05, + "loss": 2.7336, + "step": 34727 + }, + { + "epoch": 2.155813520392327, + "grad_norm": 0.15753873198399113, + "learning_rate": 2.2242887494080133e-05, + "loss": 2.7507, + "step": 34728 + }, + { + "epoch": 2.155875597492085, + "grad_norm": 0.1463892373446671, + "learning_rate": 2.2239883661672e-05, + "loss": 2.6552, + "step": 34729 + }, + { + "epoch": 2.155937674591843, + "grad_norm": 0.14963547501351, + "learning_rate": 2.2236879974095153e-05, + "loss": 2.7401, + "step": 34730 + }, + { + "epoch": 2.155999751691601, + "grad_norm": 0.14616282456972934, + "learning_rate": 2.2233876431365242e-05, + "loss": 2.8037, + "step": 34731 + }, + { + "epoch": 2.156061828791359, + "grad_norm": 0.16481736206988312, + "learning_rate": 2.223087303349794e-05, + "loss": 2.7389, + "step": 34732 + }, + { + "epoch": 2.156123905891117, + "grad_norm": 0.15272995096028794, + "learning_rate": 2.2227869780508893e-05, + "loss": 2.7403, + "step": 34733 + }, + { + "epoch": 2.1561859829908747, + "grad_norm": 0.1417600074108296, + "learning_rate": 2.2224866672413814e-05, + "loss": 2.7575, + "step": 34734 + }, + { + "epoch": 2.1562480600906326, + "grad_norm": 0.14946775157442668, + "learning_rate": 2.222186370922834e-05, + "loss": 2.7828, + "step": 34735 + }, + { + "epoch": 2.1563101371903906, + "grad_norm": 0.13533286946780748, + "learning_rate": 2.2218860890968152e-05, + "loss": 2.718, + "step": 34736 + }, + { + "epoch": 2.1563722142901485, + "grad_norm": 0.15441474758066667, + "learning_rate": 2.2215858217648894e-05, + "loss": 2.6274, + "step": 34737 + }, + { + "epoch": 2.1564342913899064, + "grad_norm": 0.14352635992911236, + "learning_rate": 2.2212855689286267e-05, + "loss": 2.691, + "step": 34738 + }, + { + "epoch": 2.1564963684896643, + "grad_norm": 0.14507354988973767, + "learning_rate": 2.220985330589591e-05, + "loss": 2.7695, + "step": 34739 + }, + { + "epoch": 2.1565584455894222, + "grad_norm": 0.149247731395234, + "learning_rate": 2.22068510674935e-05, + "loss": 2.7128, + "step": 34740 + }, + { + "epoch": 2.15662052268918, + "grad_norm": 0.1539210377768088, + "learning_rate": 2.22038489740947e-05, + "loss": 2.7339, + "step": 34741 + }, + { + "epoch": 2.156682599788938, + "grad_norm": 0.1416589554857154, + "learning_rate": 2.2200847025715142e-05, + "loss": 2.6986, + "step": 34742 + }, + { + "epoch": 2.156744676888696, + "grad_norm": 0.14085767623291695, + "learning_rate": 2.2197845222370532e-05, + "loss": 2.7145, + "step": 34743 + }, + { + "epoch": 2.1568067539884535, + "grad_norm": 0.15041338141539817, + "learning_rate": 2.2194843564076516e-05, + "loss": 2.7415, + "step": 34744 + }, + { + "epoch": 2.1568688310882114, + "grad_norm": 0.13913090046994278, + "learning_rate": 2.2191842050848748e-05, + "loss": 2.707, + "step": 34745 + }, + { + "epoch": 2.1569309081879693, + "grad_norm": 0.1489811278510934, + "learning_rate": 2.2188840682702876e-05, + "loss": 2.7703, + "step": 34746 + }, + { + "epoch": 2.156992985287727, + "grad_norm": 0.18972124220685108, + "learning_rate": 2.218583945965459e-05, + "loss": 2.7237, + "step": 34747 + }, + { + "epoch": 2.157055062387485, + "grad_norm": 0.1624082258011448, + "learning_rate": 2.2182838381719536e-05, + "loss": 2.7466, + "step": 34748 + }, + { + "epoch": 2.157117139487243, + "grad_norm": 0.1428123014185754, + "learning_rate": 2.217983744891336e-05, + "loss": 2.712, + "step": 34749 + }, + { + "epoch": 2.157179216587001, + "grad_norm": 0.14045615125937916, + "learning_rate": 2.217683666125171e-05, + "loss": 2.7113, + "step": 34750 + }, + { + "epoch": 2.157241293686759, + "grad_norm": 0.14036832670576588, + "learning_rate": 2.217383601875027e-05, + "loss": 2.7382, + "step": 34751 + }, + { + "epoch": 2.157303370786517, + "grad_norm": 0.14710845883165702, + "learning_rate": 2.217083552142467e-05, + "loss": 2.7442, + "step": 34752 + }, + { + "epoch": 2.1573654478862747, + "grad_norm": 0.15924967648886512, + "learning_rate": 2.21678351692906e-05, + "loss": 2.7749, + "step": 34753 + }, + { + "epoch": 2.1574275249860326, + "grad_norm": 0.14179316744410425, + "learning_rate": 2.2164834962363684e-05, + "loss": 2.5753, + "step": 34754 + }, + { + "epoch": 2.1574896020857905, + "grad_norm": 0.14206427904585098, + "learning_rate": 2.2161834900659584e-05, + "loss": 2.668, + "step": 34755 + }, + { + "epoch": 2.1575516791855485, + "grad_norm": 0.14481575774610272, + "learning_rate": 2.2158834984193926e-05, + "loss": 2.7821, + "step": 34756 + }, + { + "epoch": 2.1576137562853064, + "grad_norm": 0.15747112303685232, + "learning_rate": 2.215583521298241e-05, + "loss": 2.7089, + "step": 34757 + }, + { + "epoch": 2.1576758333850643, + "grad_norm": 0.15006551652853048, + "learning_rate": 2.2152835587040655e-05, + "loss": 2.6558, + "step": 34758 + }, + { + "epoch": 2.157737910484822, + "grad_norm": 0.1436403823218925, + "learning_rate": 2.214983610638432e-05, + "loss": 2.6625, + "step": 34759 + }, + { + "epoch": 2.15779998758458, + "grad_norm": 0.1536057118116891, + "learning_rate": 2.2146836771029035e-05, + "loss": 2.8618, + "step": 34760 + }, + { + "epoch": 2.157862064684338, + "grad_norm": 0.14078457705623443, + "learning_rate": 2.2143837580990477e-05, + "loss": 2.687, + "step": 34761 + }, + { + "epoch": 2.157924141784096, + "grad_norm": 0.14348465645068867, + "learning_rate": 2.2140838536284288e-05, + "loss": 2.8127, + "step": 34762 + }, + { + "epoch": 2.157986218883854, + "grad_norm": 0.1502602433138089, + "learning_rate": 2.21378396369261e-05, + "loss": 2.7635, + "step": 34763 + }, + { + "epoch": 2.158048295983612, + "grad_norm": 0.14594935303692005, + "learning_rate": 2.213484088293155e-05, + "loss": 2.7042, + "step": 34764 + }, + { + "epoch": 2.1581103730833697, + "grad_norm": 0.1560289906052255, + "learning_rate": 2.2131842274316318e-05, + "loss": 2.7294, + "step": 34765 + }, + { + "epoch": 2.1581724501831276, + "grad_norm": 0.1578699654110908, + "learning_rate": 2.2128843811096034e-05, + "loss": 2.7269, + "step": 34766 + }, + { + "epoch": 2.158234527282885, + "grad_norm": 0.1461268257499405, + "learning_rate": 2.2125845493286334e-05, + "loss": 2.7157, + "step": 34767 + }, + { + "epoch": 2.158296604382643, + "grad_norm": 0.14907284963613968, + "learning_rate": 2.2122847320902863e-05, + "loss": 2.7487, + "step": 34768 + }, + { + "epoch": 2.158358681482401, + "grad_norm": 0.1576485391135663, + "learning_rate": 2.2119849293961248e-05, + "loss": 2.7718, + "step": 34769 + }, + { + "epoch": 2.158420758582159, + "grad_norm": 0.1430087054552189, + "learning_rate": 2.211685141247717e-05, + "loss": 2.7322, + "step": 34770 + }, + { + "epoch": 2.158482835681917, + "grad_norm": 0.14696215595236886, + "learning_rate": 2.211385367646624e-05, + "loss": 2.6908, + "step": 34771 + }, + { + "epoch": 2.1585449127816747, + "grad_norm": 0.1682320073176282, + "learning_rate": 2.2110856085944108e-05, + "loss": 2.7635, + "step": 34772 + }, + { + "epoch": 2.1586069898814326, + "grad_norm": 0.14137445192416306, + "learning_rate": 2.2107858640926393e-05, + "loss": 2.6622, + "step": 34773 + }, + { + "epoch": 2.1586690669811905, + "grad_norm": 0.15483369878371459, + "learning_rate": 2.210486134142877e-05, + "loss": 2.7501, + "step": 34774 + }, + { + "epoch": 2.1587311440809485, + "grad_norm": 0.14988679214972425, + "learning_rate": 2.2101864187466857e-05, + "loss": 2.7311, + "step": 34775 + }, + { + "epoch": 2.1587932211807064, + "grad_norm": 0.14819291945868215, + "learning_rate": 2.2098867179056286e-05, + "loss": 2.812, + "step": 34776 + }, + { + "epoch": 2.1588552982804643, + "grad_norm": 0.16349232459790555, + "learning_rate": 2.20958703162127e-05, + "loss": 2.6904, + "step": 34777 + }, + { + "epoch": 2.158917375380222, + "grad_norm": 0.16891255364313787, + "learning_rate": 2.2092873598951718e-05, + "loss": 2.6527, + "step": 34778 + }, + { + "epoch": 2.15897945247998, + "grad_norm": 0.14633114747990583, + "learning_rate": 2.2089877027289007e-05, + "loss": 2.6836, + "step": 34779 + }, + { + "epoch": 2.159041529579738, + "grad_norm": 0.14844536760460778, + "learning_rate": 2.2086880601240183e-05, + "loss": 2.7573, + "step": 34780 + }, + { + "epoch": 2.159103606679496, + "grad_norm": 0.15567434485158108, + "learning_rate": 2.2083884320820874e-05, + "loss": 2.7525, + "step": 34781 + }, + { + "epoch": 2.159165683779254, + "grad_norm": 0.14053003760559848, + "learning_rate": 2.2080888186046705e-05, + "loss": 2.7876, + "step": 34782 + }, + { + "epoch": 2.159227760879012, + "grad_norm": 0.1683719475426278, + "learning_rate": 2.2077892196933336e-05, + "loss": 2.6102, + "step": 34783 + }, + { + "epoch": 2.1592898379787697, + "grad_norm": 0.14246832525889078, + "learning_rate": 2.2074896353496383e-05, + "loss": 2.6622, + "step": 34784 + }, + { + "epoch": 2.1593519150785276, + "grad_norm": 0.14051166576863736, + "learning_rate": 2.2071900655751455e-05, + "loss": 2.6859, + "step": 34785 + }, + { + "epoch": 2.1594139921782856, + "grad_norm": 0.1357022645271881, + "learning_rate": 2.2068905103714216e-05, + "loss": 2.5823, + "step": 34786 + }, + { + "epoch": 2.1594760692780435, + "grad_norm": 0.14807893083922083, + "learning_rate": 2.2065909697400267e-05, + "loss": 2.6572, + "step": 34787 + }, + { + "epoch": 2.1595381463778014, + "grad_norm": 0.1434335716702783, + "learning_rate": 2.2062914436825262e-05, + "loss": 2.6309, + "step": 34788 + }, + { + "epoch": 2.1596002234775593, + "grad_norm": 0.14186932470506994, + "learning_rate": 2.2059919322004817e-05, + "loss": 2.6702, + "step": 34789 + }, + { + "epoch": 2.1596623005773172, + "grad_norm": 0.15994361392384288, + "learning_rate": 2.2056924352954554e-05, + "loss": 2.7235, + "step": 34790 + }, + { + "epoch": 2.159724377677075, + "grad_norm": 0.1423789553649645, + "learning_rate": 2.2053929529690104e-05, + "loss": 2.6511, + "step": 34791 + }, + { + "epoch": 2.1597864547768326, + "grad_norm": 0.13869100089551822, + "learning_rate": 2.2050934852227067e-05, + "loss": 2.6888, + "step": 34792 + }, + { + "epoch": 2.1598485318765905, + "grad_norm": 0.14508541293267901, + "learning_rate": 2.2047940320581105e-05, + "loss": 2.7448, + "step": 34793 + }, + { + "epoch": 2.1599106089763485, + "grad_norm": 0.14730376273395762, + "learning_rate": 2.2044945934767823e-05, + "loss": 2.7304, + "step": 34794 + }, + { + "epoch": 2.1599726860761064, + "grad_norm": 0.14248201318417558, + "learning_rate": 2.2041951694802847e-05, + "loss": 2.7507, + "step": 34795 + }, + { + "epoch": 2.1600347631758643, + "grad_norm": 0.15040079924611707, + "learning_rate": 2.2038957600701776e-05, + "loss": 2.68, + "step": 34796 + }, + { + "epoch": 2.160096840275622, + "grad_norm": 0.14084385689612225, + "learning_rate": 2.203596365248027e-05, + "loss": 2.7109, + "step": 34797 + }, + { + "epoch": 2.16015891737538, + "grad_norm": 0.15300813711412914, + "learning_rate": 2.2032969850153927e-05, + "loss": 2.7148, + "step": 34798 + }, + { + "epoch": 2.160220994475138, + "grad_norm": 0.1639037556362997, + "learning_rate": 2.202997619373838e-05, + "loss": 2.7152, + "step": 34799 + }, + { + "epoch": 2.160283071574896, + "grad_norm": 0.14085928289079325, + "learning_rate": 2.2026982683249208e-05, + "loss": 2.6418, + "step": 34800 + }, + { + "epoch": 2.160345148674654, + "grad_norm": 0.14739846807378346, + "learning_rate": 2.202398931870208e-05, + "loss": 2.7832, + "step": 34801 + }, + { + "epoch": 2.160407225774412, + "grad_norm": 0.14787262800511833, + "learning_rate": 2.202099610011259e-05, + "loss": 2.6886, + "step": 34802 + }, + { + "epoch": 2.1604693028741697, + "grad_norm": 0.14695859442575832, + "learning_rate": 2.201800302749636e-05, + "loss": 2.7501, + "step": 34803 + }, + { + "epoch": 2.1605313799739276, + "grad_norm": 0.13916700245931893, + "learning_rate": 2.2015010100868992e-05, + "loss": 2.6648, + "step": 34804 + }, + { + "epoch": 2.1605934570736856, + "grad_norm": 0.15837096506396806, + "learning_rate": 2.2012017320246097e-05, + "loss": 2.7434, + "step": 34805 + }, + { + "epoch": 2.1606555341734435, + "grad_norm": 0.1391041827149225, + "learning_rate": 2.2009024685643326e-05, + "loss": 2.8028, + "step": 34806 + }, + { + "epoch": 2.1607176112732014, + "grad_norm": 0.14618004657284744, + "learning_rate": 2.200603219707626e-05, + "loss": 2.8357, + "step": 34807 + }, + { + "epoch": 2.1607796883729593, + "grad_norm": 0.14673502775901526, + "learning_rate": 2.2003039854560525e-05, + "loss": 2.6486, + "step": 34808 + }, + { + "epoch": 2.1608417654727172, + "grad_norm": 0.14387861670970173, + "learning_rate": 2.2000047658111706e-05, + "loss": 2.6919, + "step": 34809 + }, + { + "epoch": 2.160903842572475, + "grad_norm": 0.1514309410457017, + "learning_rate": 2.1997055607745455e-05, + "loss": 2.7225, + "step": 34810 + }, + { + "epoch": 2.160965919672233, + "grad_norm": 0.14108953309120656, + "learning_rate": 2.1994063703477362e-05, + "loss": 2.7274, + "step": 34811 + }, + { + "epoch": 2.161027996771991, + "grad_norm": 0.14572763799279592, + "learning_rate": 2.1991071945323038e-05, + "loss": 2.77, + "step": 34812 + }, + { + "epoch": 2.161090073871749, + "grad_norm": 0.16872954509529545, + "learning_rate": 2.198808033329809e-05, + "loss": 2.7632, + "step": 34813 + }, + { + "epoch": 2.161152150971507, + "grad_norm": 0.14919923784595682, + "learning_rate": 2.1985088867418107e-05, + "loss": 2.6873, + "step": 34814 + }, + { + "epoch": 2.1612142280712643, + "grad_norm": 0.15973122536525125, + "learning_rate": 2.1982097547698737e-05, + "loss": 2.7478, + "step": 34815 + }, + { + "epoch": 2.161276305171022, + "grad_norm": 0.16583099757658992, + "learning_rate": 2.197910637415557e-05, + "loss": 2.7109, + "step": 34816 + }, + { + "epoch": 2.16133838227078, + "grad_norm": 0.13981749565201648, + "learning_rate": 2.19761153468042e-05, + "loss": 2.7068, + "step": 34817 + }, + { + "epoch": 2.161400459370538, + "grad_norm": 0.1448852803896122, + "learning_rate": 2.197312446566023e-05, + "loss": 2.6973, + "step": 34818 + }, + { + "epoch": 2.161462536470296, + "grad_norm": 0.1488032290143415, + "learning_rate": 2.1970133730739263e-05, + "loss": 2.7398, + "step": 34819 + }, + { + "epoch": 2.161524613570054, + "grad_norm": 0.14450887558418124, + "learning_rate": 2.1967143142056935e-05, + "loss": 2.7735, + "step": 34820 + }, + { + "epoch": 2.161586690669812, + "grad_norm": 0.13996943680904966, + "learning_rate": 2.1964152699628826e-05, + "loss": 2.7607, + "step": 34821 + }, + { + "epoch": 2.1616487677695697, + "grad_norm": 0.14671759160105527, + "learning_rate": 2.1961162403470543e-05, + "loss": 2.7116, + "step": 34822 + }, + { + "epoch": 2.1617108448693276, + "grad_norm": 0.14925426464960778, + "learning_rate": 2.195817225359766e-05, + "loss": 2.734, + "step": 34823 + }, + { + "epoch": 2.1617729219690855, + "grad_norm": 0.15688238323170609, + "learning_rate": 2.195518225002582e-05, + "loss": 2.6292, + "step": 34824 + }, + { + "epoch": 2.1618349990688435, + "grad_norm": 0.14043765038845973, + "learning_rate": 2.19521923927706e-05, + "loss": 2.7651, + "step": 34825 + }, + { + "epoch": 2.1618970761686014, + "grad_norm": 0.14108110344776548, + "learning_rate": 2.1949202681847603e-05, + "loss": 2.6696, + "step": 34826 + }, + { + "epoch": 2.1619591532683593, + "grad_norm": 0.15661897604828698, + "learning_rate": 2.1946213117272425e-05, + "loss": 2.6819, + "step": 34827 + }, + { + "epoch": 2.162021230368117, + "grad_norm": 0.1519869881875216, + "learning_rate": 2.1943223699060645e-05, + "loss": 2.7424, + "step": 34828 + }, + { + "epoch": 2.162083307467875, + "grad_norm": 0.14168032067312275, + "learning_rate": 2.194023442722789e-05, + "loss": 2.7443, + "step": 34829 + }, + { + "epoch": 2.162145384567633, + "grad_norm": 0.13837154533067655, + "learning_rate": 2.193724530178975e-05, + "loss": 2.6801, + "step": 34830 + }, + { + "epoch": 2.162207461667391, + "grad_norm": 0.14358629151276842, + "learning_rate": 2.1934256322761814e-05, + "loss": 2.7224, + "step": 34831 + }, + { + "epoch": 2.162269538767149, + "grad_norm": 0.16395430744027953, + "learning_rate": 2.1931267490159658e-05, + "loss": 2.7463, + "step": 34832 + }, + { + "epoch": 2.162331615866907, + "grad_norm": 0.14261327597635778, + "learning_rate": 2.1928278803998908e-05, + "loss": 2.7172, + "step": 34833 + }, + { + "epoch": 2.1623936929666647, + "grad_norm": 0.16587443238026853, + "learning_rate": 2.1925290264295145e-05, + "loss": 2.6631, + "step": 34834 + }, + { + "epoch": 2.1624557700664226, + "grad_norm": 0.14520423378372768, + "learning_rate": 2.192230187106396e-05, + "loss": 2.7155, + "step": 34835 + }, + { + "epoch": 2.1625178471661806, + "grad_norm": 0.1397432925452047, + "learning_rate": 2.191931362432092e-05, + "loss": 2.696, + "step": 34836 + }, + { + "epoch": 2.1625799242659385, + "grad_norm": 0.1394854374590091, + "learning_rate": 2.191632552408166e-05, + "loss": 2.6966, + "step": 34837 + }, + { + "epoch": 2.1626420013656964, + "grad_norm": 0.13730389172905388, + "learning_rate": 2.1913337570361748e-05, + "loss": 2.6882, + "step": 34838 + }, + { + "epoch": 2.1627040784654543, + "grad_norm": 0.14534126581526755, + "learning_rate": 2.1910349763176763e-05, + "loss": 2.7658, + "step": 34839 + }, + { + "epoch": 2.162766155565212, + "grad_norm": 0.14650951805499135, + "learning_rate": 2.1907362102542312e-05, + "loss": 2.7499, + "step": 34840 + }, + { + "epoch": 2.1628282326649697, + "grad_norm": 0.13843286763787138, + "learning_rate": 2.1904374588473947e-05, + "loss": 2.6088, + "step": 34841 + }, + { + "epoch": 2.1628903097647276, + "grad_norm": 0.1521427405665611, + "learning_rate": 2.1901387220987306e-05, + "loss": 2.7274, + "step": 34842 + }, + { + "epoch": 2.1629523868644855, + "grad_norm": 0.14676194278325996, + "learning_rate": 2.1898400000097948e-05, + "loss": 2.7757, + "step": 34843 + }, + { + "epoch": 2.1630144639642435, + "grad_norm": 0.14027198218172576, + "learning_rate": 2.189541292582145e-05, + "loss": 2.7088, + "step": 34844 + }, + { + "epoch": 2.1630765410640014, + "grad_norm": 0.15011698735482112, + "learning_rate": 2.1892425998173392e-05, + "loss": 2.6971, + "step": 34845 + }, + { + "epoch": 2.1631386181637593, + "grad_norm": 0.13683491267877645, + "learning_rate": 2.1889439217169392e-05, + "loss": 2.7671, + "step": 34846 + }, + { + "epoch": 2.163200695263517, + "grad_norm": 0.14180252021922704, + "learning_rate": 2.188645258282501e-05, + "loss": 2.7283, + "step": 34847 + }, + { + "epoch": 2.163262772363275, + "grad_norm": 0.15220645072764627, + "learning_rate": 2.1883466095155836e-05, + "loss": 2.6958, + "step": 34848 + }, + { + "epoch": 2.163324849463033, + "grad_norm": 0.1379451379195357, + "learning_rate": 2.188047975417742e-05, + "loss": 2.686, + "step": 34849 + }, + { + "epoch": 2.163386926562791, + "grad_norm": 0.13326160148560656, + "learning_rate": 2.1877493559905366e-05, + "loss": 2.6941, + "step": 34850 + }, + { + "epoch": 2.163449003662549, + "grad_norm": 0.1402501036342707, + "learning_rate": 2.1874507512355276e-05, + "loss": 2.7474, + "step": 34851 + }, + { + "epoch": 2.163511080762307, + "grad_norm": 0.1548249286853331, + "learning_rate": 2.1871521611542707e-05, + "loss": 2.7141, + "step": 34852 + }, + { + "epoch": 2.1635731578620647, + "grad_norm": 0.13608899854956383, + "learning_rate": 2.186853585748324e-05, + "loss": 2.665, + "step": 34853 + }, + { + "epoch": 2.1636352349618226, + "grad_norm": 0.14414837746590242, + "learning_rate": 2.1865550250192444e-05, + "loss": 2.6824, + "step": 34854 + }, + { + "epoch": 2.1636973120615806, + "grad_norm": 0.1486421730011422, + "learning_rate": 2.186256478968589e-05, + "loss": 2.732, + "step": 34855 + }, + { + "epoch": 2.1637593891613385, + "grad_norm": 0.13628782753399907, + "learning_rate": 2.1859579475979186e-05, + "loss": 2.7204, + "step": 34856 + }, + { + "epoch": 2.1638214662610964, + "grad_norm": 0.14504178194372755, + "learning_rate": 2.1856594309087886e-05, + "loss": 2.7047, + "step": 34857 + }, + { + "epoch": 2.1638835433608543, + "grad_norm": 0.14801915722053938, + "learning_rate": 2.1853609289027565e-05, + "loss": 2.6998, + "step": 34858 + }, + { + "epoch": 2.1639456204606122, + "grad_norm": 0.1546938654042667, + "learning_rate": 2.1850624415813776e-05, + "loss": 2.6757, + "step": 34859 + }, + { + "epoch": 2.16400769756037, + "grad_norm": 0.1452271936599812, + "learning_rate": 2.1847639689462136e-05, + "loss": 2.7, + "step": 34860 + }, + { + "epoch": 2.164069774660128, + "grad_norm": 0.13745109056704133, + "learning_rate": 2.184465510998819e-05, + "loss": 2.7982, + "step": 34861 + }, + { + "epoch": 2.164131851759886, + "grad_norm": 0.14344819329280145, + "learning_rate": 2.184167067740751e-05, + "loss": 2.6527, + "step": 34862 + }, + { + "epoch": 2.1641939288596435, + "grad_norm": 0.14911554472366084, + "learning_rate": 2.183868639173568e-05, + "loss": 2.7266, + "step": 34863 + }, + { + "epoch": 2.1642560059594014, + "grad_norm": 0.1412674744523665, + "learning_rate": 2.1835702252988233e-05, + "loss": 2.7008, + "step": 34864 + }, + { + "epoch": 2.1643180830591593, + "grad_norm": 0.14440697404687775, + "learning_rate": 2.1832718261180784e-05, + "loss": 2.6618, + "step": 34865 + }, + { + "epoch": 2.164380160158917, + "grad_norm": 0.1436177199679165, + "learning_rate": 2.1829734416328886e-05, + "loss": 2.6907, + "step": 34866 + }, + { + "epoch": 2.164442237258675, + "grad_norm": 0.14255545056995417, + "learning_rate": 2.1826750718448097e-05, + "loss": 2.7172, + "step": 34867 + }, + { + "epoch": 2.164504314358433, + "grad_norm": 0.1688538121634105, + "learning_rate": 2.182376716755397e-05, + "loss": 2.7233, + "step": 34868 + }, + { + "epoch": 2.164566391458191, + "grad_norm": 0.14539784237772277, + "learning_rate": 2.1820783763662113e-05, + "loss": 2.7899, + "step": 34869 + }, + { + "epoch": 2.164628468557949, + "grad_norm": 0.15360306222613362, + "learning_rate": 2.1817800506788066e-05, + "loss": 2.7094, + "step": 34870 + }, + { + "epoch": 2.164690545657707, + "grad_norm": 0.13966170828125501, + "learning_rate": 2.1814817396947394e-05, + "loss": 2.853, + "step": 34871 + }, + { + "epoch": 2.1647526227574647, + "grad_norm": 0.14978080196023105, + "learning_rate": 2.181183443415564e-05, + "loss": 2.8353, + "step": 34872 + }, + { + "epoch": 2.1648146998572226, + "grad_norm": 0.13865368011815032, + "learning_rate": 2.1808851618428418e-05, + "loss": 2.7028, + "step": 34873 + }, + { + "epoch": 2.1648767769569806, + "grad_norm": 0.15220938772880943, + "learning_rate": 2.180586894978125e-05, + "loss": 2.7304, + "step": 34874 + }, + { + "epoch": 2.1649388540567385, + "grad_norm": 0.14562539975475733, + "learning_rate": 2.1802886428229713e-05, + "loss": 2.6945, + "step": 34875 + }, + { + "epoch": 2.1650009311564964, + "grad_norm": 0.14763958953032463, + "learning_rate": 2.179990405378936e-05, + "loss": 2.7209, + "step": 34876 + }, + { + "epoch": 2.1650630082562543, + "grad_norm": 0.14533458229166796, + "learning_rate": 2.1796921826475734e-05, + "loss": 2.7199, + "step": 34877 + }, + { + "epoch": 2.1651250853560122, + "grad_norm": 0.1452840582382741, + "learning_rate": 2.1793939746304438e-05, + "loss": 2.7862, + "step": 34878 + }, + { + "epoch": 2.16518716245577, + "grad_norm": 0.14972122881530683, + "learning_rate": 2.1790957813291e-05, + "loss": 2.716, + "step": 34879 + }, + { + "epoch": 2.165249239555528, + "grad_norm": 0.16165691451292474, + "learning_rate": 2.1787976027450986e-05, + "loss": 2.7045, + "step": 34880 + }, + { + "epoch": 2.165311316655286, + "grad_norm": 0.13844935603210723, + "learning_rate": 2.178499438879993e-05, + "loss": 2.6463, + "step": 34881 + }, + { + "epoch": 2.165373393755044, + "grad_norm": 0.14090788389457018, + "learning_rate": 2.1782012897353425e-05, + "loss": 2.7577, + "step": 34882 + }, + { + "epoch": 2.165435470854802, + "grad_norm": 0.14014002832631695, + "learning_rate": 2.1779031553126993e-05, + "loss": 2.7534, + "step": 34883 + }, + { + "epoch": 2.1654975479545597, + "grad_norm": 0.13874026747669796, + "learning_rate": 2.1776050356136225e-05, + "loss": 2.7046, + "step": 34884 + }, + { + "epoch": 2.1655596250543176, + "grad_norm": 0.14959251119524258, + "learning_rate": 2.1773069306396647e-05, + "loss": 2.7318, + "step": 34885 + }, + { + "epoch": 2.1656217021540756, + "grad_norm": 0.16952092451302764, + "learning_rate": 2.1770088403923806e-05, + "loss": 2.6762, + "step": 34886 + }, + { + "epoch": 2.1656837792538335, + "grad_norm": 0.14404332435693326, + "learning_rate": 2.176710764873328e-05, + "loss": 2.736, + "step": 34887 + }, + { + "epoch": 2.165745856353591, + "grad_norm": 0.14947468643474596, + "learning_rate": 2.176412704084061e-05, + "loss": 2.6795, + "step": 34888 + }, + { + "epoch": 2.165807933453349, + "grad_norm": 0.1450955804832193, + "learning_rate": 2.1761146580261343e-05, + "loss": 2.7309, + "step": 34889 + }, + { + "epoch": 2.165870010553107, + "grad_norm": 0.15741767286082917, + "learning_rate": 2.175816626701103e-05, + "loss": 2.775, + "step": 34890 + }, + { + "epoch": 2.1659320876528647, + "grad_norm": 0.15007733869570028, + "learning_rate": 2.1755186101105207e-05, + "loss": 2.7088, + "step": 34891 + }, + { + "epoch": 2.1659941647526226, + "grad_norm": 0.1430263723663137, + "learning_rate": 2.1752206082559445e-05, + "loss": 2.7429, + "step": 34892 + }, + { + "epoch": 2.1660562418523805, + "grad_norm": 0.14360634529571153, + "learning_rate": 2.1749226211389286e-05, + "loss": 2.6635, + "step": 34893 + }, + { + "epoch": 2.1661183189521385, + "grad_norm": 0.1399244435286008, + "learning_rate": 2.1746246487610265e-05, + "loss": 2.783, + "step": 34894 + }, + { + "epoch": 2.1661803960518964, + "grad_norm": 0.1376034247419044, + "learning_rate": 2.1743266911237925e-05, + "loss": 2.6898, + "step": 34895 + }, + { + "epoch": 2.1662424731516543, + "grad_norm": 0.14712678552028638, + "learning_rate": 2.1740287482287835e-05, + "loss": 2.6835, + "step": 34896 + }, + { + "epoch": 2.166304550251412, + "grad_norm": 0.1434907835848457, + "learning_rate": 2.1737308200775524e-05, + "loss": 2.7895, + "step": 34897 + }, + { + "epoch": 2.16636662735117, + "grad_norm": 0.18245740009554706, + "learning_rate": 2.1734329066716542e-05, + "loss": 2.6656, + "step": 34898 + }, + { + "epoch": 2.166428704450928, + "grad_norm": 0.14657277936411411, + "learning_rate": 2.1731350080126427e-05, + "loss": 2.75, + "step": 34899 + }, + { + "epoch": 2.166490781550686, + "grad_norm": 0.13855405960607528, + "learning_rate": 2.1728371241020702e-05, + "loss": 2.6106, + "step": 34900 + }, + { + "epoch": 2.166552858650444, + "grad_norm": 0.1434713433113773, + "learning_rate": 2.172539254941494e-05, + "loss": 2.7881, + "step": 34901 + }, + { + "epoch": 2.166614935750202, + "grad_norm": 0.15393730711915188, + "learning_rate": 2.1722414005324675e-05, + "loss": 2.711, + "step": 34902 + }, + { + "epoch": 2.1666770128499597, + "grad_norm": 0.1470004274105002, + "learning_rate": 2.171943560876544e-05, + "loss": 2.7984, + "step": 34903 + }, + { + "epoch": 2.1667390899497176, + "grad_norm": 0.14792636027496417, + "learning_rate": 2.1716457359752752e-05, + "loss": 2.7393, + "step": 34904 + }, + { + "epoch": 2.1668011670494756, + "grad_norm": 0.1480283619923293, + "learning_rate": 2.1713479258302193e-05, + "loss": 2.788, + "step": 34905 + }, + { + "epoch": 2.1668632441492335, + "grad_norm": 0.14005408026961053, + "learning_rate": 2.171050130442928e-05, + "loss": 2.6939, + "step": 34906 + }, + { + "epoch": 2.1669253212489914, + "grad_norm": 0.14272123496588293, + "learning_rate": 2.170752349814955e-05, + "loss": 2.6824, + "step": 34907 + }, + { + "epoch": 2.1669873983487493, + "grad_norm": 0.15461937483128493, + "learning_rate": 2.170454583947852e-05, + "loss": 2.7322, + "step": 34908 + }, + { + "epoch": 2.1670494754485072, + "grad_norm": 0.139105448091438, + "learning_rate": 2.1701568328431758e-05, + "loss": 2.7561, + "step": 34909 + }, + { + "epoch": 2.167111552548265, + "grad_norm": 0.13493759130003943, + "learning_rate": 2.169859096502479e-05, + "loss": 2.7778, + "step": 34910 + }, + { + "epoch": 2.1671736296480226, + "grad_norm": 0.15341212931106207, + "learning_rate": 2.1695613749273136e-05, + "loss": 2.6148, + "step": 34911 + }, + { + "epoch": 2.1672357067477805, + "grad_norm": 0.14974455341438994, + "learning_rate": 2.169263668119234e-05, + "loss": 2.6538, + "step": 34912 + }, + { + "epoch": 2.1672977838475385, + "grad_norm": 0.14117818877388458, + "learning_rate": 2.1689659760797913e-05, + "loss": 2.6835, + "step": 34913 + }, + { + "epoch": 2.1673598609472964, + "grad_norm": 0.14107746606939706, + "learning_rate": 2.168668298810542e-05, + "loss": 2.6443, + "step": 34914 + }, + { + "epoch": 2.1674219380470543, + "grad_norm": 0.14789319000397666, + "learning_rate": 2.168370636313038e-05, + "loss": 2.7306, + "step": 34915 + }, + { + "epoch": 2.167484015146812, + "grad_norm": 0.14063851635931596, + "learning_rate": 2.1680729885888297e-05, + "loss": 2.6995, + "step": 34916 + }, + { + "epoch": 2.16754609224657, + "grad_norm": 0.18329225295748391, + "learning_rate": 2.1677753556394736e-05, + "loss": 2.6871, + "step": 34917 + }, + { + "epoch": 2.167608169346328, + "grad_norm": 0.1466645912396999, + "learning_rate": 2.1674777374665195e-05, + "loss": 2.7218, + "step": 34918 + }, + { + "epoch": 2.167670246446086, + "grad_norm": 0.1408447231240486, + "learning_rate": 2.167180134071523e-05, + "loss": 2.6495, + "step": 34919 + }, + { + "epoch": 2.167732323545844, + "grad_norm": 0.14067994474852596, + "learning_rate": 2.166882545456036e-05, + "loss": 2.7674, + "step": 34920 + }, + { + "epoch": 2.167794400645602, + "grad_norm": 0.14351316746874737, + "learning_rate": 2.16658497162161e-05, + "loss": 2.7671, + "step": 34921 + }, + { + "epoch": 2.1678564777453597, + "grad_norm": 0.19398522480145938, + "learning_rate": 2.166287412569797e-05, + "loss": 2.7382, + "step": 34922 + }, + { + "epoch": 2.1679185548451176, + "grad_norm": 0.14050151118300697, + "learning_rate": 2.1659898683021518e-05, + "loss": 2.7686, + "step": 34923 + }, + { + "epoch": 2.1679806319448756, + "grad_norm": 0.16280761543019712, + "learning_rate": 2.1656923388202256e-05, + "loss": 2.7117, + "step": 34924 + }, + { + "epoch": 2.1680427090446335, + "grad_norm": 0.13769940444601764, + "learning_rate": 2.16539482412557e-05, + "loss": 2.6935, + "step": 34925 + }, + { + "epoch": 2.1681047861443914, + "grad_norm": 0.15829193307009526, + "learning_rate": 2.165097324219738e-05, + "loss": 2.7804, + "step": 34926 + }, + { + "epoch": 2.1681668632441493, + "grad_norm": 0.1690909414024645, + "learning_rate": 2.16479983910428e-05, + "loss": 2.6365, + "step": 34927 + }, + { + "epoch": 2.1682289403439072, + "grad_norm": 0.14034913741240113, + "learning_rate": 2.1645023687807507e-05, + "loss": 2.6974, + "step": 34928 + }, + { + "epoch": 2.168291017443665, + "grad_norm": 0.13928695389851875, + "learning_rate": 2.1642049132507013e-05, + "loss": 2.7777, + "step": 34929 + }, + { + "epoch": 2.168353094543423, + "grad_norm": 0.15178535903660692, + "learning_rate": 2.1639074725156832e-05, + "loss": 2.6638, + "step": 34930 + }, + { + "epoch": 2.168415171643181, + "grad_norm": 0.14934061775985782, + "learning_rate": 2.163610046577247e-05, + "loss": 2.6898, + "step": 34931 + }, + { + "epoch": 2.168477248742939, + "grad_norm": 0.14949864754575395, + "learning_rate": 2.163312635436947e-05, + "loss": 2.6865, + "step": 34932 + }, + { + "epoch": 2.168539325842697, + "grad_norm": 0.13978445991277647, + "learning_rate": 2.1630152390963338e-05, + "loss": 2.684, + "step": 34933 + }, + { + "epoch": 2.1686014029424547, + "grad_norm": 0.14124956117198936, + "learning_rate": 2.1627178575569586e-05, + "loss": 2.5938, + "step": 34934 + }, + { + "epoch": 2.1686634800422127, + "grad_norm": 0.14555700481872144, + "learning_rate": 2.1624204908203715e-05, + "loss": 2.6512, + "step": 34935 + }, + { + "epoch": 2.16872555714197, + "grad_norm": 0.1401753986954568, + "learning_rate": 2.162123138888127e-05, + "loss": 2.7266, + "step": 34936 + }, + { + "epoch": 2.168787634241728, + "grad_norm": 0.15259766814376957, + "learning_rate": 2.1618258017617755e-05, + "loss": 2.6966, + "step": 34937 + }, + { + "epoch": 2.168849711341486, + "grad_norm": 0.17966913046442098, + "learning_rate": 2.1615284794428677e-05, + "loss": 2.7041, + "step": 34938 + }, + { + "epoch": 2.168911788441244, + "grad_norm": 0.1415318679299134, + "learning_rate": 2.1612311719329544e-05, + "loss": 2.6759, + "step": 34939 + }, + { + "epoch": 2.168973865541002, + "grad_norm": 0.15465131924134665, + "learning_rate": 2.160933879233586e-05, + "loss": 2.739, + "step": 34940 + }, + { + "epoch": 2.1690359426407597, + "grad_norm": 0.14491580374614166, + "learning_rate": 2.160636601346316e-05, + "loss": 2.7929, + "step": 34941 + }, + { + "epoch": 2.1690980197405176, + "grad_norm": 0.15699172993498614, + "learning_rate": 2.1603393382726943e-05, + "loss": 2.7395, + "step": 34942 + }, + { + "epoch": 2.1691600968402756, + "grad_norm": 0.15480594916531037, + "learning_rate": 2.160042090014272e-05, + "loss": 2.7014, + "step": 34943 + }, + { + "epoch": 2.1692221739400335, + "grad_norm": 0.14870227864665816, + "learning_rate": 2.1597448565725976e-05, + "loss": 2.7084, + "step": 34944 + }, + { + "epoch": 2.1692842510397914, + "grad_norm": 0.13841303321777357, + "learning_rate": 2.159447637949225e-05, + "loss": 2.6711, + "step": 34945 + }, + { + "epoch": 2.1693463281395493, + "grad_norm": 0.15680912829058719, + "learning_rate": 2.1591504341457042e-05, + "loss": 2.7586, + "step": 34946 + }, + { + "epoch": 2.1694084052393072, + "grad_norm": 0.1558376772794865, + "learning_rate": 2.1588532451635853e-05, + "loss": 2.754, + "step": 34947 + }, + { + "epoch": 2.169470482339065, + "grad_norm": 0.15338368533592445, + "learning_rate": 2.1585560710044185e-05, + "loss": 2.8276, + "step": 34948 + }, + { + "epoch": 2.169532559438823, + "grad_norm": 0.1506671221587834, + "learning_rate": 2.1582589116697527e-05, + "loss": 2.73, + "step": 34949 + }, + { + "epoch": 2.169594636538581, + "grad_norm": 0.2266544862339594, + "learning_rate": 2.1579617671611403e-05, + "loss": 2.7352, + "step": 34950 + }, + { + "epoch": 2.169656713638339, + "grad_norm": 0.1837701051929912, + "learning_rate": 2.157664637480133e-05, + "loss": 2.6591, + "step": 34951 + }, + { + "epoch": 2.169718790738097, + "grad_norm": 0.14117141371009334, + "learning_rate": 2.1573675226282787e-05, + "loss": 2.6639, + "step": 34952 + }, + { + "epoch": 2.1697808678378547, + "grad_norm": 0.15410164058282033, + "learning_rate": 2.1570704226071285e-05, + "loss": 2.7035, + "step": 34953 + }, + { + "epoch": 2.1698429449376126, + "grad_norm": 0.14628203086098607, + "learning_rate": 2.1567733374182304e-05, + "loss": 2.7258, + "step": 34954 + }, + { + "epoch": 2.1699050220373706, + "grad_norm": 0.167262839561835, + "learning_rate": 2.1564762670631377e-05, + "loss": 2.7182, + "step": 34955 + }, + { + "epoch": 2.1699670991371285, + "grad_norm": 0.14761996378411518, + "learning_rate": 2.1561792115433988e-05, + "loss": 2.7397, + "step": 34956 + }, + { + "epoch": 2.1700291762368864, + "grad_norm": 0.1594840982961137, + "learning_rate": 2.155882170860563e-05, + "loss": 2.6653, + "step": 34957 + }, + { + "epoch": 2.1700912533366443, + "grad_norm": 0.15282319503444033, + "learning_rate": 2.1555851450161786e-05, + "loss": 2.6864, + "step": 34958 + }, + { + "epoch": 2.170153330436402, + "grad_norm": 0.147286830205122, + "learning_rate": 2.1552881340117985e-05, + "loss": 2.7211, + "step": 34959 + }, + { + "epoch": 2.1702154075361597, + "grad_norm": 0.14736901506577832, + "learning_rate": 2.154991137848972e-05, + "loss": 2.6123, + "step": 34960 + }, + { + "epoch": 2.1702774846359176, + "grad_norm": 0.14213881025762973, + "learning_rate": 2.154694156529246e-05, + "loss": 2.753, + "step": 34961 + }, + { + "epoch": 2.1703395617356755, + "grad_norm": 0.14709182039927696, + "learning_rate": 2.154397190054172e-05, + "loss": 2.7185, + "step": 34962 + }, + { + "epoch": 2.1704016388354335, + "grad_norm": 0.1427925861527268, + "learning_rate": 2.1541002384252968e-05, + "loss": 2.7206, + "step": 34963 + }, + { + "epoch": 2.1704637159351914, + "grad_norm": 0.1574066504980206, + "learning_rate": 2.153803301644173e-05, + "loss": 2.8092, + "step": 34964 + }, + { + "epoch": 2.1705257930349493, + "grad_norm": 0.13540169666759544, + "learning_rate": 2.1535063797123484e-05, + "loss": 2.6744, + "step": 34965 + }, + { + "epoch": 2.170587870134707, + "grad_norm": 0.13904641840669352, + "learning_rate": 2.1532094726313724e-05, + "loss": 2.7985, + "step": 34966 + }, + { + "epoch": 2.170649947234465, + "grad_norm": 0.1503970157206739, + "learning_rate": 2.1529125804027918e-05, + "loss": 2.7377, + "step": 34967 + }, + { + "epoch": 2.170712024334223, + "grad_norm": 0.15177374968435522, + "learning_rate": 2.1526157030281587e-05, + "loss": 2.7393, + "step": 34968 + }, + { + "epoch": 2.170774101433981, + "grad_norm": 0.15252460235360354, + "learning_rate": 2.1523188405090212e-05, + "loss": 2.8092, + "step": 34969 + }, + { + "epoch": 2.170836178533739, + "grad_norm": 0.15849468655998677, + "learning_rate": 2.1520219928469276e-05, + "loss": 2.7287, + "step": 34970 + }, + { + "epoch": 2.170898255633497, + "grad_norm": 0.1431228213998158, + "learning_rate": 2.1517251600434247e-05, + "loss": 2.6341, + "step": 34971 + }, + { + "epoch": 2.1709603327332547, + "grad_norm": 0.14056902009633507, + "learning_rate": 2.151428342100065e-05, + "loss": 2.729, + "step": 34972 + }, + { + "epoch": 2.1710224098330126, + "grad_norm": 0.15615022573173812, + "learning_rate": 2.1511315390183945e-05, + "loss": 2.8011, + "step": 34973 + }, + { + "epoch": 2.1710844869327706, + "grad_norm": 0.14113189152159208, + "learning_rate": 2.1508347507999632e-05, + "loss": 2.659, + "step": 34974 + }, + { + "epoch": 2.1711465640325285, + "grad_norm": 0.15200892970675514, + "learning_rate": 2.150537977446318e-05, + "loss": 2.8678, + "step": 34975 + }, + { + "epoch": 2.1712086411322864, + "grad_norm": 0.14398070460591822, + "learning_rate": 2.1502412189590065e-05, + "loss": 2.7174, + "step": 34976 + }, + { + "epoch": 2.1712707182320443, + "grad_norm": 0.1448528153525948, + "learning_rate": 2.1499444753395803e-05, + "loss": 2.7746, + "step": 34977 + }, + { + "epoch": 2.1713327953318022, + "grad_norm": 0.14758237676114824, + "learning_rate": 2.1496477465895853e-05, + "loss": 2.6658, + "step": 34978 + }, + { + "epoch": 2.17139487243156, + "grad_norm": 0.14952833505657315, + "learning_rate": 2.1493510327105704e-05, + "loss": 2.7982, + "step": 34979 + }, + { + "epoch": 2.171456949531318, + "grad_norm": 0.1521795867746469, + "learning_rate": 2.1490543337040808e-05, + "loss": 2.7074, + "step": 34980 + }, + { + "epoch": 2.171519026631076, + "grad_norm": 0.14564748755154797, + "learning_rate": 2.1487576495716673e-05, + "loss": 2.7386, + "step": 34981 + }, + { + "epoch": 2.1715811037308335, + "grad_norm": 0.16132569184211834, + "learning_rate": 2.1484609803148786e-05, + "loss": 2.7176, + "step": 34982 + }, + { + "epoch": 2.171643180830592, + "grad_norm": 0.14729970365685152, + "learning_rate": 2.1481643259352617e-05, + "loss": 2.7255, + "step": 34983 + }, + { + "epoch": 2.1717052579303493, + "grad_norm": 0.14332191903063093, + "learning_rate": 2.147867686434364e-05, + "loss": 2.7351, + "step": 34984 + }, + { + "epoch": 2.171767335030107, + "grad_norm": 0.1464047597058149, + "learning_rate": 2.147571061813731e-05, + "loss": 2.7305, + "step": 34985 + }, + { + "epoch": 2.171829412129865, + "grad_norm": 0.15376952749974504, + "learning_rate": 2.1472744520749133e-05, + "loss": 2.738, + "step": 34986 + }, + { + "epoch": 2.171891489229623, + "grad_norm": 0.1653843996735059, + "learning_rate": 2.1469778572194577e-05, + "loss": 2.7041, + "step": 34987 + }, + { + "epoch": 2.171953566329381, + "grad_norm": 0.14779739647493115, + "learning_rate": 2.1466812772489113e-05, + "loss": 2.7096, + "step": 34988 + }, + { + "epoch": 2.172015643429139, + "grad_norm": 0.15621766113364205, + "learning_rate": 2.1463847121648217e-05, + "loss": 2.8536, + "step": 34989 + }, + { + "epoch": 2.172077720528897, + "grad_norm": 0.1567020374355067, + "learning_rate": 2.1460881619687333e-05, + "loss": 2.7062, + "step": 34990 + }, + { + "epoch": 2.1721397976286547, + "grad_norm": 0.14983079178024428, + "learning_rate": 2.145791626662198e-05, + "loss": 2.6598, + "step": 34991 + }, + { + "epoch": 2.1722018747284126, + "grad_norm": 0.14041399357394524, + "learning_rate": 2.145495106246761e-05, + "loss": 2.7075, + "step": 34992 + }, + { + "epoch": 2.1722639518281706, + "grad_norm": 0.13850149245008886, + "learning_rate": 2.1451986007239684e-05, + "loss": 2.6694, + "step": 34993 + }, + { + "epoch": 2.1723260289279285, + "grad_norm": 0.1444470379332432, + "learning_rate": 2.1449021100953664e-05, + "loss": 2.7592, + "step": 34994 + }, + { + "epoch": 2.1723881060276864, + "grad_norm": 0.13971858496923903, + "learning_rate": 2.1446056343625042e-05, + "loss": 2.6774, + "step": 34995 + }, + { + "epoch": 2.1724501831274443, + "grad_norm": 0.14240329945638197, + "learning_rate": 2.1443091735269284e-05, + "loss": 2.7564, + "step": 34996 + }, + { + "epoch": 2.1725122602272022, + "grad_norm": 0.14863053744927177, + "learning_rate": 2.1440127275901845e-05, + "loss": 2.6547, + "step": 34997 + }, + { + "epoch": 2.17257433732696, + "grad_norm": 0.1567510973958109, + "learning_rate": 2.1437162965538198e-05, + "loss": 2.7118, + "step": 34998 + }, + { + "epoch": 2.172636414426718, + "grad_norm": 0.14896996890042122, + "learning_rate": 2.1434198804193785e-05, + "loss": 2.7594, + "step": 34999 + }, + { + "epoch": 2.172698491526476, + "grad_norm": 0.1495325827663743, + "learning_rate": 2.1431234791884108e-05, + "loss": 2.6947, + "step": 35000 + }, + { + "epoch": 2.172760568626234, + "grad_norm": 0.1357290892833783, + "learning_rate": 2.1428270928624616e-05, + "loss": 2.6587, + "step": 35001 + }, + { + "epoch": 2.172822645725992, + "grad_norm": 0.13531247740254201, + "learning_rate": 2.1425307214430774e-05, + "loss": 2.6445, + "step": 35002 + }, + { + "epoch": 2.1728847228257497, + "grad_norm": 0.14844058686108533, + "learning_rate": 2.1422343649318023e-05, + "loss": 2.6438, + "step": 35003 + }, + { + "epoch": 2.1729467999255077, + "grad_norm": 0.14423295762162497, + "learning_rate": 2.1419380233301855e-05, + "loss": 2.7968, + "step": 35004 + }, + { + "epoch": 2.1730088770252656, + "grad_norm": 0.1528440781497962, + "learning_rate": 2.1416416966397722e-05, + "loss": 2.7092, + "step": 35005 + }, + { + "epoch": 2.1730709541250235, + "grad_norm": 0.14730535598599154, + "learning_rate": 2.141345384862108e-05, + "loss": 2.6803, + "step": 35006 + }, + { + "epoch": 2.173133031224781, + "grad_norm": 0.1441215817864154, + "learning_rate": 2.141049087998737e-05, + "loss": 2.7418, + "step": 35007 + }, + { + "epoch": 2.173195108324539, + "grad_norm": 0.1630786106039889, + "learning_rate": 2.140752806051209e-05, + "loss": 2.7397, + "step": 35008 + }, + { + "epoch": 2.173257185424297, + "grad_norm": 0.1466232213975882, + "learning_rate": 2.1404565390210673e-05, + "loss": 2.7119, + "step": 35009 + }, + { + "epoch": 2.1733192625240547, + "grad_norm": 0.13973991359051288, + "learning_rate": 2.1401602869098587e-05, + "loss": 2.7696, + "step": 35010 + }, + { + "epoch": 2.1733813396238126, + "grad_norm": 0.14679387774206712, + "learning_rate": 2.1398640497191276e-05, + "loss": 2.7374, + "step": 35011 + }, + { + "epoch": 2.1734434167235706, + "grad_norm": 0.17105559071583093, + "learning_rate": 2.1395678274504183e-05, + "loss": 2.6912, + "step": 35012 + }, + { + "epoch": 2.1735054938233285, + "grad_norm": 0.13856478034600597, + "learning_rate": 2.1392716201052797e-05, + "loss": 2.7799, + "step": 35013 + }, + { + "epoch": 2.1735675709230864, + "grad_norm": 0.14725132121430867, + "learning_rate": 2.1389754276852536e-05, + "loss": 2.7013, + "step": 35014 + }, + { + "epoch": 2.1736296480228443, + "grad_norm": 0.13895112115334574, + "learning_rate": 2.1386792501918895e-05, + "loss": 2.7162, + "step": 35015 + }, + { + "epoch": 2.1736917251226022, + "grad_norm": 0.1479095116136185, + "learning_rate": 2.1383830876267302e-05, + "loss": 2.7092, + "step": 35016 + }, + { + "epoch": 2.17375380222236, + "grad_norm": 0.14023924609254246, + "learning_rate": 2.1380869399913195e-05, + "loss": 2.6967, + "step": 35017 + }, + { + "epoch": 2.173815879322118, + "grad_norm": 0.1364651030093993, + "learning_rate": 2.1377908072872056e-05, + "loss": 2.7039, + "step": 35018 + }, + { + "epoch": 2.173877956421876, + "grad_norm": 0.15993426802769564, + "learning_rate": 2.1374946895159313e-05, + "loss": 2.7988, + "step": 35019 + }, + { + "epoch": 2.173940033521634, + "grad_norm": 0.14855082477539952, + "learning_rate": 2.1371985866790428e-05, + "loss": 2.6197, + "step": 35020 + }, + { + "epoch": 2.174002110621392, + "grad_norm": 0.13683361949912026, + "learning_rate": 2.136902498778082e-05, + "loss": 2.7686, + "step": 35021 + }, + { + "epoch": 2.1740641877211497, + "grad_norm": 0.14857573722944975, + "learning_rate": 2.1366064258145986e-05, + "loss": 2.7534, + "step": 35022 + }, + { + "epoch": 2.1741262648209077, + "grad_norm": 0.13554797927890333, + "learning_rate": 2.1363103677901336e-05, + "loss": 2.6646, + "step": 35023 + }, + { + "epoch": 2.1741883419206656, + "grad_norm": 0.16345904266655029, + "learning_rate": 2.136014324706233e-05, + "loss": 2.6834, + "step": 35024 + }, + { + "epoch": 2.1742504190204235, + "grad_norm": 0.15307738867949894, + "learning_rate": 2.1357182965644408e-05, + "loss": 2.6528, + "step": 35025 + }, + { + "epoch": 2.1743124961201814, + "grad_norm": 0.14281793571494783, + "learning_rate": 2.1354222833663003e-05, + "loss": 2.6645, + "step": 35026 + }, + { + "epoch": 2.1743745732199393, + "grad_norm": 0.1454025630364613, + "learning_rate": 2.135126285113358e-05, + "loss": 2.681, + "step": 35027 + }, + { + "epoch": 2.1744366503196972, + "grad_norm": 0.159423243943644, + "learning_rate": 2.1348303018071582e-05, + "loss": 2.8121, + "step": 35028 + }, + { + "epoch": 2.174498727419455, + "grad_norm": 0.13883214071044145, + "learning_rate": 2.1345343334492435e-05, + "loss": 2.7702, + "step": 35029 + }, + { + "epoch": 2.1745608045192126, + "grad_norm": 0.14260562866604892, + "learning_rate": 2.1342383800411574e-05, + "loss": 2.703, + "step": 35030 + }, + { + "epoch": 2.174622881618971, + "grad_norm": 0.14348579881757784, + "learning_rate": 2.1339424415844473e-05, + "loss": 2.7101, + "step": 35031 + }, + { + "epoch": 2.1746849587187285, + "grad_norm": 0.1546238079341187, + "learning_rate": 2.1336465180806554e-05, + "loss": 2.794, + "step": 35032 + }, + { + "epoch": 2.1747470358184864, + "grad_norm": 0.15858730129228893, + "learning_rate": 2.1333506095313248e-05, + "loss": 2.707, + "step": 35033 + }, + { + "epoch": 2.1748091129182443, + "grad_norm": 0.152098831738096, + "learning_rate": 2.1330547159380003e-05, + "loss": 2.7375, + "step": 35034 + }, + { + "epoch": 2.174871190018002, + "grad_norm": 0.16259715793292692, + "learning_rate": 2.1327588373022234e-05, + "loss": 2.64, + "step": 35035 + }, + { + "epoch": 2.17493326711776, + "grad_norm": 0.18779524763592248, + "learning_rate": 2.132462973625542e-05, + "loss": 2.7466, + "step": 35036 + }, + { + "epoch": 2.174995344217518, + "grad_norm": 0.1548719794392159, + "learning_rate": 2.1321671249094965e-05, + "loss": 2.6713, + "step": 35037 + }, + { + "epoch": 2.175057421317276, + "grad_norm": 0.1378183221235947, + "learning_rate": 2.131871291155632e-05, + "loss": 2.5807, + "step": 35038 + }, + { + "epoch": 2.175119498417034, + "grad_norm": 0.15748024253634993, + "learning_rate": 2.131575472365489e-05, + "loss": 2.717, + "step": 35039 + }, + { + "epoch": 2.175181575516792, + "grad_norm": 0.14209703381361669, + "learning_rate": 2.1312796685406155e-05, + "loss": 2.7793, + "step": 35040 + }, + { + "epoch": 2.1752436526165497, + "grad_norm": 0.16389992015314747, + "learning_rate": 2.130983879682552e-05, + "loss": 2.6771, + "step": 35041 + }, + { + "epoch": 2.1753057297163076, + "grad_norm": 0.14331025443624004, + "learning_rate": 2.130688105792842e-05, + "loss": 2.6416, + "step": 35042 + }, + { + "epoch": 2.1753678068160656, + "grad_norm": 0.1491765084818184, + "learning_rate": 2.130392346873027e-05, + "loss": 2.7715, + "step": 35043 + }, + { + "epoch": 2.1754298839158235, + "grad_norm": 0.15582463658917917, + "learning_rate": 2.130096602924654e-05, + "loss": 2.7412, + "step": 35044 + }, + { + "epoch": 2.1754919610155814, + "grad_norm": 0.1716018967046681, + "learning_rate": 2.129800873949263e-05, + "loss": 2.6675, + "step": 35045 + }, + { + "epoch": 2.1755540381153393, + "grad_norm": 0.14327272634206484, + "learning_rate": 2.129505159948398e-05, + "loss": 2.71, + "step": 35046 + }, + { + "epoch": 2.1756161152150972, + "grad_norm": 0.15936153065105907, + "learning_rate": 2.1292094609235992e-05, + "loss": 2.717, + "step": 35047 + }, + { + "epoch": 2.175678192314855, + "grad_norm": 0.15892569046983662, + "learning_rate": 2.128913776876414e-05, + "loss": 2.7776, + "step": 35048 + }, + { + "epoch": 2.175740269414613, + "grad_norm": 0.1487116290137263, + "learning_rate": 2.12861810780838e-05, + "loss": 2.7392, + "step": 35049 + }, + { + "epoch": 2.175802346514371, + "grad_norm": 0.14445477472851798, + "learning_rate": 2.128322453721045e-05, + "loss": 2.6705, + "step": 35050 + }, + { + "epoch": 2.175864423614129, + "grad_norm": 0.13530228365902564, + "learning_rate": 2.128026814615948e-05, + "loss": 2.7444, + "step": 35051 + }, + { + "epoch": 2.175926500713887, + "grad_norm": 0.13439450787728519, + "learning_rate": 2.127731190494633e-05, + "loss": 2.7577, + "step": 35052 + }, + { + "epoch": 2.1759885778136447, + "grad_norm": 0.15357897739831688, + "learning_rate": 2.12743558135864e-05, + "loss": 2.6937, + "step": 35053 + }, + { + "epoch": 2.1760506549134027, + "grad_norm": 0.14564546691954117, + "learning_rate": 2.1271399872095143e-05, + "loss": 2.8072, + "step": 35054 + }, + { + "epoch": 2.17611273201316, + "grad_norm": 0.15285619721379579, + "learning_rate": 2.1268444080487966e-05, + "loss": 2.6898, + "step": 35055 + }, + { + "epoch": 2.176174809112918, + "grad_norm": 0.15682807916927705, + "learning_rate": 2.126548843878029e-05, + "loss": 2.6735, + "step": 35056 + }, + { + "epoch": 2.176236886212676, + "grad_norm": 0.13426863902227273, + "learning_rate": 2.1262532946987524e-05, + "loss": 2.5347, + "step": 35057 + }, + { + "epoch": 2.176298963312434, + "grad_norm": 0.1501920925435143, + "learning_rate": 2.1259577605125114e-05, + "loss": 2.6912, + "step": 35058 + }, + { + "epoch": 2.176361040412192, + "grad_norm": 0.15592043213712364, + "learning_rate": 2.1256622413208465e-05, + "loss": 2.7483, + "step": 35059 + }, + { + "epoch": 2.1764231175119497, + "grad_norm": 0.14676363075750942, + "learning_rate": 2.1253667371252995e-05, + "loss": 2.6575, + "step": 35060 + }, + { + "epoch": 2.1764851946117076, + "grad_norm": 0.14154104715492827, + "learning_rate": 2.125071247927412e-05, + "loss": 2.6932, + "step": 35061 + }, + { + "epoch": 2.1765472717114656, + "grad_norm": 0.14876129572318153, + "learning_rate": 2.1247757737287245e-05, + "loss": 2.6541, + "step": 35062 + }, + { + "epoch": 2.1766093488112235, + "grad_norm": 0.1540817015870782, + "learning_rate": 2.1244803145307806e-05, + "loss": 2.8124, + "step": 35063 + }, + { + "epoch": 2.1766714259109814, + "grad_norm": 0.1618193421819795, + "learning_rate": 2.1241848703351214e-05, + "loss": 2.6863, + "step": 35064 + }, + { + "epoch": 2.1767335030107393, + "grad_norm": 0.14628158519668166, + "learning_rate": 2.123889441143288e-05, + "loss": 2.7016, + "step": 35065 + }, + { + "epoch": 2.1767955801104972, + "grad_norm": 0.14456175791454706, + "learning_rate": 2.12359402695682e-05, + "loss": 2.6742, + "step": 35066 + }, + { + "epoch": 2.176857657210255, + "grad_norm": 0.20034461617955662, + "learning_rate": 2.1232986277772617e-05, + "loss": 2.7106, + "step": 35067 + }, + { + "epoch": 2.176919734310013, + "grad_norm": 0.14397928155427034, + "learning_rate": 2.1230032436061525e-05, + "loss": 2.7513, + "step": 35068 + }, + { + "epoch": 2.176981811409771, + "grad_norm": 0.1636109211298254, + "learning_rate": 2.122707874445034e-05, + "loss": 2.6932, + "step": 35069 + }, + { + "epoch": 2.177043888509529, + "grad_norm": 0.1482586724585586, + "learning_rate": 2.1224125202954455e-05, + "loss": 2.7619, + "step": 35070 + }, + { + "epoch": 2.177105965609287, + "grad_norm": 0.13905219859640144, + "learning_rate": 2.1221171811589314e-05, + "loss": 2.6532, + "step": 35071 + }, + { + "epoch": 2.1771680427090447, + "grad_norm": 0.13762564283129025, + "learning_rate": 2.12182185703703e-05, + "loss": 2.7908, + "step": 35072 + }, + { + "epoch": 2.1772301198088027, + "grad_norm": 0.16324427907078007, + "learning_rate": 2.121526547931283e-05, + "loss": 2.7221, + "step": 35073 + }, + { + "epoch": 2.1772921969085606, + "grad_norm": 0.14958944810475555, + "learning_rate": 2.1212312538432305e-05, + "loss": 2.7934, + "step": 35074 + }, + { + "epoch": 2.1773542740083185, + "grad_norm": 0.17066683197917637, + "learning_rate": 2.1209359747744116e-05, + "loss": 2.7525, + "step": 35075 + }, + { + "epoch": 2.1774163511080764, + "grad_norm": 0.162742452901701, + "learning_rate": 2.120640710726371e-05, + "loss": 2.728, + "step": 35076 + }, + { + "epoch": 2.1774784282078343, + "grad_norm": 0.14350928023116952, + "learning_rate": 2.120345461700646e-05, + "loss": 2.6744, + "step": 35077 + }, + { + "epoch": 2.177540505307592, + "grad_norm": 0.14392089196642732, + "learning_rate": 2.1200502276987783e-05, + "loss": 2.6965, + "step": 35078 + }, + { + "epoch": 2.17760258240735, + "grad_norm": 0.14596438621755345, + "learning_rate": 2.119755008722306e-05, + "loss": 2.7336, + "step": 35079 + }, + { + "epoch": 2.1776646595071076, + "grad_norm": 0.1401140214569262, + "learning_rate": 2.119459804772771e-05, + "loss": 2.7479, + "step": 35080 + }, + { + "epoch": 2.1777267366068656, + "grad_norm": 0.14042676092289605, + "learning_rate": 2.119164615851715e-05, + "loss": 2.7236, + "step": 35081 + }, + { + "epoch": 2.1777888137066235, + "grad_norm": 0.14693098015966313, + "learning_rate": 2.1188694419606762e-05, + "loss": 2.6683, + "step": 35082 + }, + { + "epoch": 2.1778508908063814, + "grad_norm": 0.15574451578742593, + "learning_rate": 2.1185742831011957e-05, + "loss": 2.7233, + "step": 35083 + }, + { + "epoch": 2.1779129679061393, + "grad_norm": 0.15062938043746213, + "learning_rate": 2.1182791392748126e-05, + "loss": 2.7445, + "step": 35084 + }, + { + "epoch": 2.1779750450058972, + "grad_norm": 0.14015254153998394, + "learning_rate": 2.1179840104830644e-05, + "loss": 2.7267, + "step": 35085 + }, + { + "epoch": 2.178037122105655, + "grad_norm": 0.1339240472177015, + "learning_rate": 2.1176888967274954e-05, + "loss": 2.641, + "step": 35086 + }, + { + "epoch": 2.178099199205413, + "grad_norm": 0.1423518135328609, + "learning_rate": 2.117393798009643e-05, + "loss": 2.7813, + "step": 35087 + }, + { + "epoch": 2.178161276305171, + "grad_norm": 0.14464201420247394, + "learning_rate": 2.1170987143310472e-05, + "loss": 2.6423, + "step": 35088 + }, + { + "epoch": 2.178223353404929, + "grad_norm": 0.14323987631783663, + "learning_rate": 2.1168036456932455e-05, + "loss": 2.7402, + "step": 35089 + }, + { + "epoch": 2.178285430504687, + "grad_norm": 0.14553715582814025, + "learning_rate": 2.1165085920977806e-05, + "loss": 2.805, + "step": 35090 + }, + { + "epoch": 2.1783475076044447, + "grad_norm": 0.14870856030435184, + "learning_rate": 2.1162135535461907e-05, + "loss": 2.7362, + "step": 35091 + }, + { + "epoch": 2.1784095847042027, + "grad_norm": 0.15641806623580956, + "learning_rate": 2.115918530040014e-05, + "loss": 2.767, + "step": 35092 + }, + { + "epoch": 2.1784716618039606, + "grad_norm": 0.16918100883691897, + "learning_rate": 2.1156235215807896e-05, + "loss": 2.7773, + "step": 35093 + }, + { + "epoch": 2.1785337389037185, + "grad_norm": 0.13804491450801784, + "learning_rate": 2.1153285281700592e-05, + "loss": 2.7016, + "step": 35094 + }, + { + "epoch": 2.1785958160034764, + "grad_norm": 0.1457634159353777, + "learning_rate": 2.1150335498093593e-05, + "loss": 2.7539, + "step": 35095 + }, + { + "epoch": 2.1786578931032343, + "grad_norm": 0.14983421044717057, + "learning_rate": 2.1147385865002307e-05, + "loss": 2.8039, + "step": 35096 + }, + { + "epoch": 2.1787199702029922, + "grad_norm": 0.1406575602347729, + "learning_rate": 2.1144436382442107e-05, + "loss": 2.7092, + "step": 35097 + }, + { + "epoch": 2.17878204730275, + "grad_norm": 0.13698993082746705, + "learning_rate": 2.1141487050428373e-05, + "loss": 2.6702, + "step": 35098 + }, + { + "epoch": 2.178844124402508, + "grad_norm": 0.16172531893192027, + "learning_rate": 2.113853786897652e-05, + "loss": 2.7507, + "step": 35099 + }, + { + "epoch": 2.178906201502266, + "grad_norm": 0.15307778413887851, + "learning_rate": 2.113558883810193e-05, + "loss": 2.7256, + "step": 35100 + }, + { + "epoch": 2.178968278602024, + "grad_norm": 0.14507973188867188, + "learning_rate": 2.1132639957819973e-05, + "loss": 2.726, + "step": 35101 + }, + { + "epoch": 2.179030355701782, + "grad_norm": 0.14883615266871375, + "learning_rate": 2.112969122814602e-05, + "loss": 2.7711, + "step": 35102 + }, + { + "epoch": 2.1790924328015393, + "grad_norm": 0.14998529978086617, + "learning_rate": 2.11267426490955e-05, + "loss": 2.7792, + "step": 35103 + }, + { + "epoch": 2.179154509901297, + "grad_norm": 0.14670920601984017, + "learning_rate": 2.1123794220683768e-05, + "loss": 2.6277, + "step": 35104 + }, + { + "epoch": 2.179216587001055, + "grad_norm": 0.15845528275202078, + "learning_rate": 2.1120845942926214e-05, + "loss": 2.7457, + "step": 35105 + }, + { + "epoch": 2.179278664100813, + "grad_norm": 0.16392541448778683, + "learning_rate": 2.1117897815838206e-05, + "loss": 2.6697, + "step": 35106 + }, + { + "epoch": 2.179340741200571, + "grad_norm": 0.1389402693508083, + "learning_rate": 2.111494983943515e-05, + "loss": 2.6043, + "step": 35107 + }, + { + "epoch": 2.179402818300329, + "grad_norm": 0.13880427849651847, + "learning_rate": 2.1112002013732406e-05, + "loss": 2.6306, + "step": 35108 + }, + { + "epoch": 2.179464895400087, + "grad_norm": 0.15032834460245714, + "learning_rate": 2.110905433874537e-05, + "loss": 2.7123, + "step": 35109 + }, + { + "epoch": 2.1795269724998447, + "grad_norm": 0.13985102970206556, + "learning_rate": 2.1106106814489407e-05, + "loss": 2.7838, + "step": 35110 + }, + { + "epoch": 2.1795890495996026, + "grad_norm": 0.15093836183874523, + "learning_rate": 2.110315944097988e-05, + "loss": 2.7199, + "step": 35111 + }, + { + "epoch": 2.1796511266993606, + "grad_norm": 0.15769110344463635, + "learning_rate": 2.1100212218232185e-05, + "loss": 2.7348, + "step": 35112 + }, + { + "epoch": 2.1797132037991185, + "grad_norm": 0.14106266473499393, + "learning_rate": 2.109726514626172e-05, + "loss": 2.7251, + "step": 35113 + }, + { + "epoch": 2.1797752808988764, + "grad_norm": 0.13570281356485897, + "learning_rate": 2.1094318225083835e-05, + "loss": 2.7448, + "step": 35114 + }, + { + "epoch": 2.1798373579986343, + "grad_norm": 0.14614965343331476, + "learning_rate": 2.109137145471391e-05, + "loss": 2.7422, + "step": 35115 + }, + { + "epoch": 2.1798994350983922, + "grad_norm": 0.1598942462757471, + "learning_rate": 2.10884248351673e-05, + "loss": 2.6602, + "step": 35116 + }, + { + "epoch": 2.17996151219815, + "grad_norm": 0.14389717353005968, + "learning_rate": 2.1085478366459416e-05, + "loss": 2.6685, + "step": 35117 + }, + { + "epoch": 2.180023589297908, + "grad_norm": 0.14545515246816657, + "learning_rate": 2.1082532048605608e-05, + "loss": 2.6694, + "step": 35118 + }, + { + "epoch": 2.180085666397666, + "grad_norm": 0.15683359593297816, + "learning_rate": 2.1079585881621256e-05, + "loss": 2.7077, + "step": 35119 + }, + { + "epoch": 2.180147743497424, + "grad_norm": 0.13555341396994416, + "learning_rate": 2.107663986552172e-05, + "loss": 2.6619, + "step": 35120 + }, + { + "epoch": 2.180209820597182, + "grad_norm": 0.1564224424088014, + "learning_rate": 2.107369400032236e-05, + "loss": 2.7922, + "step": 35121 + }, + { + "epoch": 2.1802718976969397, + "grad_norm": 0.15721784887394907, + "learning_rate": 2.107074828603857e-05, + "loss": 2.8576, + "step": 35122 + }, + { + "epoch": 2.1803339747966977, + "grad_norm": 0.14050605167264454, + "learning_rate": 2.106780272268572e-05, + "loss": 2.6954, + "step": 35123 + }, + { + "epoch": 2.1803960518964556, + "grad_norm": 0.15954558905853886, + "learning_rate": 2.106485731027916e-05, + "loss": 2.676, + "step": 35124 + }, + { + "epoch": 2.1804581289962135, + "grad_norm": 0.15072980630704202, + "learning_rate": 2.1061912048834247e-05, + "loss": 2.6294, + "step": 35125 + }, + { + "epoch": 2.180520206095971, + "grad_norm": 0.167981069689665, + "learning_rate": 2.1058966938366385e-05, + "loss": 2.6856, + "step": 35126 + }, + { + "epoch": 2.180582283195729, + "grad_norm": 0.16181087012481238, + "learning_rate": 2.1056021978890916e-05, + "loss": 2.6199, + "step": 35127 + }, + { + "epoch": 2.180644360295487, + "grad_norm": 0.15764032735275646, + "learning_rate": 2.105307717042321e-05, + "loss": 2.7334, + "step": 35128 + }, + { + "epoch": 2.1807064373952447, + "grad_norm": 0.14982031774234916, + "learning_rate": 2.1050132512978604e-05, + "loss": 2.7262, + "step": 35129 + }, + { + "epoch": 2.1807685144950026, + "grad_norm": 0.16534983701862277, + "learning_rate": 2.1047188006572506e-05, + "loss": 2.6955, + "step": 35130 + }, + { + "epoch": 2.1808305915947606, + "grad_norm": 0.14708821769021213, + "learning_rate": 2.1044243651220253e-05, + "loss": 2.7743, + "step": 35131 + }, + { + "epoch": 2.1808926686945185, + "grad_norm": 0.13965143854077752, + "learning_rate": 2.1041299446937214e-05, + "loss": 2.7472, + "step": 35132 + }, + { + "epoch": 2.1809547457942764, + "grad_norm": 0.14556793916366795, + "learning_rate": 2.103835539373874e-05, + "loss": 2.6848, + "step": 35133 + }, + { + "epoch": 2.1810168228940343, + "grad_norm": 0.16233680467003597, + "learning_rate": 2.1035411491640185e-05, + "loss": 2.7375, + "step": 35134 + }, + { + "epoch": 2.1810788999937922, + "grad_norm": 0.13771119939180038, + "learning_rate": 2.1032467740656934e-05, + "loss": 2.6437, + "step": 35135 + }, + { + "epoch": 2.18114097709355, + "grad_norm": 0.13914321662464077, + "learning_rate": 2.1029524140804335e-05, + "loss": 2.7633, + "step": 35136 + }, + { + "epoch": 2.181203054193308, + "grad_norm": 0.17065279063595604, + "learning_rate": 2.1026580692097734e-05, + "loss": 2.7576, + "step": 35137 + }, + { + "epoch": 2.181265131293066, + "grad_norm": 0.1381514279296921, + "learning_rate": 2.1023637394552488e-05, + "loss": 2.7979, + "step": 35138 + }, + { + "epoch": 2.181327208392824, + "grad_norm": 0.14190621324667657, + "learning_rate": 2.102069424818397e-05, + "loss": 2.7489, + "step": 35139 + }, + { + "epoch": 2.181389285492582, + "grad_norm": 0.14822330203778347, + "learning_rate": 2.1017751253007528e-05, + "loss": 2.7099, + "step": 35140 + }, + { + "epoch": 2.1814513625923397, + "grad_norm": 0.14215636253017047, + "learning_rate": 2.101480840903851e-05, + "loss": 2.6673, + "step": 35141 + }, + { + "epoch": 2.1815134396920977, + "grad_norm": 0.14268198541602684, + "learning_rate": 2.1011865716292255e-05, + "loss": 2.6688, + "step": 35142 + }, + { + "epoch": 2.1815755167918556, + "grad_norm": 0.14586083260754407, + "learning_rate": 2.1008923174784155e-05, + "loss": 2.7869, + "step": 35143 + }, + { + "epoch": 2.1816375938916135, + "grad_norm": 0.13859898034798568, + "learning_rate": 2.100598078452954e-05, + "loss": 2.7556, + "step": 35144 + }, + { + "epoch": 2.1816996709913714, + "grad_norm": 0.14349295787570213, + "learning_rate": 2.1003038545543746e-05, + "loss": 2.7051, + "step": 35145 + }, + { + "epoch": 2.1817617480911293, + "grad_norm": 0.15662094430170662, + "learning_rate": 2.100009645784215e-05, + "loss": 2.667, + "step": 35146 + }, + { + "epoch": 2.1818238251908872, + "grad_norm": 0.13684689783081008, + "learning_rate": 2.09971545214401e-05, + "loss": 2.7027, + "step": 35147 + }, + { + "epoch": 2.181885902290645, + "grad_norm": 0.142045961400775, + "learning_rate": 2.099421273635291e-05, + "loss": 2.6686, + "step": 35148 + }, + { + "epoch": 2.181947979390403, + "grad_norm": 0.1422850392170296, + "learning_rate": 2.099127110259598e-05, + "loss": 2.8078, + "step": 35149 + }, + { + "epoch": 2.182010056490161, + "grad_norm": 0.14497602176128588, + "learning_rate": 2.098832962018462e-05, + "loss": 2.7316, + "step": 35150 + }, + { + "epoch": 2.1820721335899185, + "grad_norm": 0.15581017819620696, + "learning_rate": 2.0985388289134196e-05, + "loss": 2.7913, + "step": 35151 + }, + { + "epoch": 2.1821342106896764, + "grad_norm": 0.14067564820446526, + "learning_rate": 2.0982447109460017e-05, + "loss": 2.7057, + "step": 35152 + }, + { + "epoch": 2.1821962877894343, + "grad_norm": 0.14191571607185488, + "learning_rate": 2.097950608117748e-05, + "loss": 2.6312, + "step": 35153 + }, + { + "epoch": 2.1822583648891922, + "grad_norm": 0.1502159888194486, + "learning_rate": 2.0976565204301907e-05, + "loss": 2.6876, + "step": 35154 + }, + { + "epoch": 2.18232044198895, + "grad_norm": 0.14124185321320346, + "learning_rate": 2.0973624478848634e-05, + "loss": 2.6607, + "step": 35155 + }, + { + "epoch": 2.182382519088708, + "grad_norm": 0.14785037500063322, + "learning_rate": 2.0970683904832994e-05, + "loss": 2.7296, + "step": 35156 + }, + { + "epoch": 2.182444596188466, + "grad_norm": 0.1462831195495058, + "learning_rate": 2.0967743482270362e-05, + "loss": 2.7753, + "step": 35157 + }, + { + "epoch": 2.182506673288224, + "grad_norm": 0.14118775379239154, + "learning_rate": 2.096480321117606e-05, + "loss": 2.7186, + "step": 35158 + }, + { + "epoch": 2.182568750387982, + "grad_norm": 0.14833184924503906, + "learning_rate": 2.0961863091565426e-05, + "loss": 2.6793, + "step": 35159 + }, + { + "epoch": 2.1826308274877397, + "grad_norm": 0.1345915153420649, + "learning_rate": 2.0958923123453798e-05, + "loss": 2.7413, + "step": 35160 + }, + { + "epoch": 2.1826929045874977, + "grad_norm": 0.14885222697901784, + "learning_rate": 2.0955983306856507e-05, + "loss": 2.7202, + "step": 35161 + }, + { + "epoch": 2.1827549816872556, + "grad_norm": 0.15142050388625888, + "learning_rate": 2.0953043641788917e-05, + "loss": 2.7201, + "step": 35162 + }, + { + "epoch": 2.1828170587870135, + "grad_norm": 0.1341985575415809, + "learning_rate": 2.095010412826635e-05, + "loss": 2.7137, + "step": 35163 + }, + { + "epoch": 2.1828791358867714, + "grad_norm": 0.1568455373692977, + "learning_rate": 2.0947164766304138e-05, + "loss": 2.7562, + "step": 35164 + }, + { + "epoch": 2.1829412129865293, + "grad_norm": 0.14299140109746478, + "learning_rate": 2.0944225555917607e-05, + "loss": 2.6508, + "step": 35165 + }, + { + "epoch": 2.1830032900862872, + "grad_norm": 0.15148555680394832, + "learning_rate": 2.094128649712212e-05, + "loss": 2.7756, + "step": 35166 + }, + { + "epoch": 2.183065367186045, + "grad_norm": 0.17084006811353797, + "learning_rate": 2.0938347589933e-05, + "loss": 2.7386, + "step": 35167 + }, + { + "epoch": 2.183127444285803, + "grad_norm": 0.16380228682921302, + "learning_rate": 2.093540883436557e-05, + "loss": 2.6261, + "step": 35168 + }, + { + "epoch": 2.183189521385561, + "grad_norm": 0.1414431310473353, + "learning_rate": 2.0932470230435176e-05, + "loss": 2.7676, + "step": 35169 + }, + { + "epoch": 2.183251598485319, + "grad_norm": 0.15391849043140327, + "learning_rate": 2.0929531778157112e-05, + "loss": 2.7726, + "step": 35170 + }, + { + "epoch": 2.183313675585077, + "grad_norm": 0.15492430608576135, + "learning_rate": 2.0926593477546762e-05, + "loss": 2.7148, + "step": 35171 + }, + { + "epoch": 2.1833757526848347, + "grad_norm": 0.14422720425583205, + "learning_rate": 2.0923655328619433e-05, + "loss": 2.6714, + "step": 35172 + }, + { + "epoch": 2.1834378297845927, + "grad_norm": 0.15728896543465423, + "learning_rate": 2.092071733139045e-05, + "loss": 2.6539, + "step": 35173 + }, + { + "epoch": 2.18349990688435, + "grad_norm": 0.1497551633497145, + "learning_rate": 2.0917779485875128e-05, + "loss": 2.6777, + "step": 35174 + }, + { + "epoch": 2.183561983984108, + "grad_norm": 0.1402164923335462, + "learning_rate": 2.0914841792088824e-05, + "loss": 2.7969, + "step": 35175 + }, + { + "epoch": 2.183624061083866, + "grad_norm": 0.153386338710982, + "learning_rate": 2.0911904250046853e-05, + "loss": 2.6829, + "step": 35176 + }, + { + "epoch": 2.183686138183624, + "grad_norm": 0.13826716157298866, + "learning_rate": 2.0908966859764547e-05, + "loss": 2.7969, + "step": 35177 + }, + { + "epoch": 2.183748215283382, + "grad_norm": 0.15601711202761886, + "learning_rate": 2.0906029621257195e-05, + "loss": 2.7271, + "step": 35178 + }, + { + "epoch": 2.1838102923831397, + "grad_norm": 0.1451091489452262, + "learning_rate": 2.0903092534540153e-05, + "loss": 2.6688, + "step": 35179 + }, + { + "epoch": 2.1838723694828976, + "grad_norm": 0.1507137963020531, + "learning_rate": 2.0900155599628756e-05, + "loss": 2.6147, + "step": 35180 + }, + { + "epoch": 2.1839344465826556, + "grad_norm": 0.15616198683516758, + "learning_rate": 2.0897218816538317e-05, + "loss": 2.6487, + "step": 35181 + }, + { + "epoch": 2.1839965236824135, + "grad_norm": 0.14300410115048115, + "learning_rate": 2.089428218528415e-05, + "loss": 2.7031, + "step": 35182 + }, + { + "epoch": 2.1840586007821714, + "grad_norm": 0.15799296513269717, + "learning_rate": 2.0891345705881572e-05, + "loss": 2.7057, + "step": 35183 + }, + { + "epoch": 2.1841206778819293, + "grad_norm": 0.1424638603837344, + "learning_rate": 2.0888409378345897e-05, + "loss": 2.8254, + "step": 35184 + }, + { + "epoch": 2.1841827549816872, + "grad_norm": 0.13937377595577816, + "learning_rate": 2.088547320269248e-05, + "loss": 2.7871, + "step": 35185 + }, + { + "epoch": 2.184244832081445, + "grad_norm": 0.1418730730711532, + "learning_rate": 2.088253717893661e-05, + "loss": 2.7714, + "step": 35186 + }, + { + "epoch": 2.184306909181203, + "grad_norm": 0.174151050987996, + "learning_rate": 2.0879601307093616e-05, + "loss": 2.6762, + "step": 35187 + }, + { + "epoch": 2.184368986280961, + "grad_norm": 0.15042230965890413, + "learning_rate": 2.087666558717879e-05, + "loss": 2.6955, + "step": 35188 + }, + { + "epoch": 2.184431063380719, + "grad_norm": 0.15593640328979125, + "learning_rate": 2.0873730019207487e-05, + "loss": 2.7376, + "step": 35189 + }, + { + "epoch": 2.184493140480477, + "grad_norm": 0.13843414587818562, + "learning_rate": 2.087079460319501e-05, + "loss": 2.738, + "step": 35190 + }, + { + "epoch": 2.1845552175802347, + "grad_norm": 0.1441885261234567, + "learning_rate": 2.086785933915667e-05, + "loss": 2.8091, + "step": 35191 + }, + { + "epoch": 2.1846172946799927, + "grad_norm": 0.1493518238463432, + "learning_rate": 2.0864924227107758e-05, + "loss": 2.7576, + "step": 35192 + }, + { + "epoch": 2.1846793717797506, + "grad_norm": 0.14708048644723754, + "learning_rate": 2.0861989267063624e-05, + "loss": 2.821, + "step": 35193 + }, + { + "epoch": 2.1847414488795085, + "grad_norm": 0.16100304898268578, + "learning_rate": 2.0859054459039574e-05, + "loss": 2.6936, + "step": 35194 + }, + { + "epoch": 2.1848035259792664, + "grad_norm": 0.16722186506339448, + "learning_rate": 2.0856119803050904e-05, + "loss": 2.6753, + "step": 35195 + }, + { + "epoch": 2.1848656030790243, + "grad_norm": 0.1468262497673678, + "learning_rate": 2.0853185299112932e-05, + "loss": 2.7297, + "step": 35196 + }, + { + "epoch": 2.1849276801787822, + "grad_norm": 0.14658368241246145, + "learning_rate": 2.0850250947240952e-05, + "loss": 2.8338, + "step": 35197 + }, + { + "epoch": 2.18498975727854, + "grad_norm": 0.15706268797695086, + "learning_rate": 2.0847316747450306e-05, + "loss": 2.6749, + "step": 35198 + }, + { + "epoch": 2.1850518343782976, + "grad_norm": 0.1457917304529009, + "learning_rate": 2.0844382699756287e-05, + "loss": 2.6847, + "step": 35199 + }, + { + "epoch": 2.1851139114780556, + "grad_norm": 0.1530181616661791, + "learning_rate": 2.0841448804174203e-05, + "loss": 2.6313, + "step": 35200 + }, + { + "epoch": 2.1851759885778135, + "grad_norm": 0.1585457999460937, + "learning_rate": 2.0838515060719338e-05, + "loss": 2.7854, + "step": 35201 + }, + { + "epoch": 2.1852380656775714, + "grad_norm": 0.1759503028490187, + "learning_rate": 2.083558146940704e-05, + "loss": 2.6701, + "step": 35202 + }, + { + "epoch": 2.1853001427773293, + "grad_norm": 0.15065009782561967, + "learning_rate": 2.0832648030252587e-05, + "loss": 2.7566, + "step": 35203 + }, + { + "epoch": 2.1853622198770872, + "grad_norm": 0.1520853786026151, + "learning_rate": 2.08297147432713e-05, + "loss": 2.7654, + "step": 35204 + }, + { + "epoch": 2.185424296976845, + "grad_norm": 0.14666334663118463, + "learning_rate": 2.0826781608478464e-05, + "loss": 2.7071, + "step": 35205 + }, + { + "epoch": 2.185486374076603, + "grad_norm": 0.14781689155458422, + "learning_rate": 2.0823848625889375e-05, + "loss": 2.6856, + "step": 35206 + }, + { + "epoch": 2.185548451176361, + "grad_norm": 0.14350402353208588, + "learning_rate": 2.0820915795519368e-05, + "loss": 2.7414, + "step": 35207 + }, + { + "epoch": 2.185610528276119, + "grad_norm": 0.19035694982841733, + "learning_rate": 2.0817983117383726e-05, + "loss": 2.7678, + "step": 35208 + }, + { + "epoch": 2.185672605375877, + "grad_norm": 0.1905922318263774, + "learning_rate": 2.0815050591497754e-05, + "loss": 2.7212, + "step": 35209 + }, + { + "epoch": 2.1857346824756347, + "grad_norm": 0.137856355466507, + "learning_rate": 2.081211821787673e-05, + "loss": 2.6796, + "step": 35210 + }, + { + "epoch": 2.1857967595753927, + "grad_norm": 0.15438675978499117, + "learning_rate": 2.080918599653597e-05, + "loss": 2.7171, + "step": 35211 + }, + { + "epoch": 2.1858588366751506, + "grad_norm": 0.15974106704819704, + "learning_rate": 2.0806253927490787e-05, + "loss": 2.7325, + "step": 35212 + }, + { + "epoch": 2.1859209137749085, + "grad_norm": 0.13829591317747975, + "learning_rate": 2.0803322010756466e-05, + "loss": 2.721, + "step": 35213 + }, + { + "epoch": 2.1859829908746664, + "grad_norm": 0.1700265934423428, + "learning_rate": 2.0800390246348305e-05, + "loss": 2.6976, + "step": 35214 + }, + { + "epoch": 2.1860450679744243, + "grad_norm": 0.17516427908163854, + "learning_rate": 2.0797458634281575e-05, + "loss": 2.7491, + "step": 35215 + }, + { + "epoch": 2.1861071450741822, + "grad_norm": 0.15526938283172742, + "learning_rate": 2.079452717457161e-05, + "loss": 2.767, + "step": 35216 + }, + { + "epoch": 2.18616922217394, + "grad_norm": 0.16450900671022248, + "learning_rate": 2.0791595867233686e-05, + "loss": 2.6983, + "step": 35217 + }, + { + "epoch": 2.186231299273698, + "grad_norm": 0.15970257142081765, + "learning_rate": 2.0788664712283102e-05, + "loss": 2.6888, + "step": 35218 + }, + { + "epoch": 2.186293376373456, + "grad_norm": 0.1804928664405337, + "learning_rate": 2.0785733709735144e-05, + "loss": 2.7443, + "step": 35219 + }, + { + "epoch": 2.186355453473214, + "grad_norm": 0.23250498207455958, + "learning_rate": 2.0782802859605084e-05, + "loss": 2.6265, + "step": 35220 + }, + { + "epoch": 2.186417530572972, + "grad_norm": 0.14999963045074716, + "learning_rate": 2.0779872161908255e-05, + "loss": 2.645, + "step": 35221 + }, + { + "epoch": 2.1864796076727293, + "grad_norm": 0.14391159920582408, + "learning_rate": 2.0776941616659928e-05, + "loss": 2.8289, + "step": 35222 + }, + { + "epoch": 2.1865416847724872, + "grad_norm": 0.17467348939944832, + "learning_rate": 2.0774011223875383e-05, + "loss": 2.7538, + "step": 35223 + }, + { + "epoch": 2.186603761872245, + "grad_norm": 0.17341308658196525, + "learning_rate": 2.077108098356991e-05, + "loss": 2.7399, + "step": 35224 + }, + { + "epoch": 2.186665838972003, + "grad_norm": 0.16903462006454756, + "learning_rate": 2.0768150895758808e-05, + "loss": 2.7258, + "step": 35225 + }, + { + "epoch": 2.186727916071761, + "grad_norm": 0.16468350635442006, + "learning_rate": 2.076522096045737e-05, + "loss": 2.733, + "step": 35226 + }, + { + "epoch": 2.186789993171519, + "grad_norm": 0.14801376495916135, + "learning_rate": 2.0762291177680866e-05, + "loss": 2.6735, + "step": 35227 + }, + { + "epoch": 2.186852070271277, + "grad_norm": 0.1429052389022791, + "learning_rate": 2.0759361547444573e-05, + "loss": 2.5906, + "step": 35228 + }, + { + "epoch": 2.1869141473710347, + "grad_norm": 0.158912709788452, + "learning_rate": 2.07564320697638e-05, + "loss": 2.7008, + "step": 35229 + }, + { + "epoch": 2.1869762244707927, + "grad_norm": 0.1700105216648072, + "learning_rate": 2.0753502744653826e-05, + "loss": 2.8051, + "step": 35230 + }, + { + "epoch": 2.1870383015705506, + "grad_norm": 0.14201434905672744, + "learning_rate": 2.0750573572129922e-05, + "loss": 2.7073, + "step": 35231 + }, + { + "epoch": 2.1871003786703085, + "grad_norm": 0.15512502523202254, + "learning_rate": 2.0747644552207383e-05, + "loss": 2.6959, + "step": 35232 + }, + { + "epoch": 2.1871624557700664, + "grad_norm": 0.15318884953737444, + "learning_rate": 2.0744715684901462e-05, + "loss": 2.72, + "step": 35233 + }, + { + "epoch": 2.1872245328698243, + "grad_norm": 0.15597821678776602, + "learning_rate": 2.074178697022748e-05, + "loss": 2.7174, + "step": 35234 + }, + { + "epoch": 2.1872866099695822, + "grad_norm": 0.15523443391949515, + "learning_rate": 2.0738858408200696e-05, + "loss": 2.743, + "step": 35235 + }, + { + "epoch": 2.18734868706934, + "grad_norm": 0.17017909564806838, + "learning_rate": 2.0735929998836395e-05, + "loss": 2.7512, + "step": 35236 + }, + { + "epoch": 2.187410764169098, + "grad_norm": 0.14216559969792156, + "learning_rate": 2.0733001742149833e-05, + "loss": 2.7114, + "step": 35237 + }, + { + "epoch": 2.187472841268856, + "grad_norm": 0.14460461919374185, + "learning_rate": 2.0730073638156323e-05, + "loss": 2.6823, + "step": 35238 + }, + { + "epoch": 2.187534918368614, + "grad_norm": 0.19537404044508133, + "learning_rate": 2.072714568687112e-05, + "loss": 2.7575, + "step": 35239 + }, + { + "epoch": 2.187596995468372, + "grad_norm": 0.1717571537281943, + "learning_rate": 2.072421788830951e-05, + "loss": 2.7662, + "step": 35240 + }, + { + "epoch": 2.1876590725681297, + "grad_norm": 0.1447094579002256, + "learning_rate": 2.072129024248674e-05, + "loss": 2.7355, + "step": 35241 + }, + { + "epoch": 2.1877211496678877, + "grad_norm": 0.15402830870334003, + "learning_rate": 2.0718362749418123e-05, + "loss": 2.7574, + "step": 35242 + }, + { + "epoch": 2.1877832267676456, + "grad_norm": 0.13379880076141445, + "learning_rate": 2.0715435409118906e-05, + "loss": 2.6738, + "step": 35243 + }, + { + "epoch": 2.1878453038674035, + "grad_norm": 0.17199978308504588, + "learning_rate": 2.0712508221604382e-05, + "loss": 2.7253, + "step": 35244 + }, + { + "epoch": 2.1879073809671614, + "grad_norm": 0.1433695553067557, + "learning_rate": 2.0709581186889814e-05, + "loss": 2.7275, + "step": 35245 + }, + { + "epoch": 2.1879694580669193, + "grad_norm": 0.14694280099372897, + "learning_rate": 2.0706654304990476e-05, + "loss": 2.7032, + "step": 35246 + }, + { + "epoch": 2.188031535166677, + "grad_norm": 0.1407030008475409, + "learning_rate": 2.070372757592161e-05, + "loss": 2.7536, + "step": 35247 + }, + { + "epoch": 2.1880936122664347, + "grad_norm": 0.14637468681741728, + "learning_rate": 2.0700800999698534e-05, + "loss": 2.7736, + "step": 35248 + }, + { + "epoch": 2.1881556893661926, + "grad_norm": 0.1370380351867149, + "learning_rate": 2.0697874576336484e-05, + "loss": 2.7931, + "step": 35249 + }, + { + "epoch": 2.1882177664659506, + "grad_norm": 0.1405599960449295, + "learning_rate": 2.0694948305850738e-05, + "loss": 2.6943, + "step": 35250 + }, + { + "epoch": 2.1882798435657085, + "grad_norm": 0.15528333980246492, + "learning_rate": 2.0692022188256543e-05, + "loss": 2.6956, + "step": 35251 + }, + { + "epoch": 2.1883419206654664, + "grad_norm": 0.14248262760652478, + "learning_rate": 2.0689096223569204e-05, + "loss": 2.7243, + "step": 35252 + }, + { + "epoch": 2.1884039977652243, + "grad_norm": 0.16494572274142552, + "learning_rate": 2.0686170411803963e-05, + "loss": 2.7459, + "step": 35253 + }, + { + "epoch": 2.1884660748649822, + "grad_norm": 0.14320139977039725, + "learning_rate": 2.0683244752976087e-05, + "loss": 2.6703, + "step": 35254 + }, + { + "epoch": 2.18852815196474, + "grad_norm": 0.15287282965890384, + "learning_rate": 2.0680319247100837e-05, + "loss": 2.7501, + "step": 35255 + }, + { + "epoch": 2.188590229064498, + "grad_norm": 0.1601246884113678, + "learning_rate": 2.067739389419347e-05, + "loss": 2.7316, + "step": 35256 + }, + { + "epoch": 2.188652306164256, + "grad_norm": 0.14453622448582815, + "learning_rate": 2.0674468694269267e-05, + "loss": 2.6403, + "step": 35257 + }, + { + "epoch": 2.188714383264014, + "grad_norm": 0.14979259685878352, + "learning_rate": 2.0671543647343484e-05, + "loss": 2.7006, + "step": 35258 + }, + { + "epoch": 2.188776460363772, + "grad_norm": 0.16344192051243053, + "learning_rate": 2.0668618753431375e-05, + "loss": 2.8605, + "step": 35259 + }, + { + "epoch": 2.1888385374635297, + "grad_norm": 0.14589441388364033, + "learning_rate": 2.0665694012548188e-05, + "loss": 2.7077, + "step": 35260 + }, + { + "epoch": 2.1889006145632877, + "grad_norm": 0.14613530847966585, + "learning_rate": 2.066276942470921e-05, + "loss": 2.7017, + "step": 35261 + }, + { + "epoch": 2.1889626916630456, + "grad_norm": 0.203458997422699, + "learning_rate": 2.0659844989929688e-05, + "loss": 2.6845, + "step": 35262 + }, + { + "epoch": 2.1890247687628035, + "grad_norm": 0.14408616533588725, + "learning_rate": 2.0656920708224874e-05, + "loss": 2.6437, + "step": 35263 + }, + { + "epoch": 2.1890868458625614, + "grad_norm": 0.15653528752701165, + "learning_rate": 2.0653996579610014e-05, + "loss": 2.7442, + "step": 35264 + }, + { + "epoch": 2.1891489229623193, + "grad_norm": 0.14624103679659733, + "learning_rate": 2.0651072604100398e-05, + "loss": 2.7188, + "step": 35265 + }, + { + "epoch": 2.1892110000620773, + "grad_norm": 0.13938677889825796, + "learning_rate": 2.0648148781711258e-05, + "loss": 2.7648, + "step": 35266 + }, + { + "epoch": 2.189273077161835, + "grad_norm": 0.15810693359638892, + "learning_rate": 2.064522511245785e-05, + "loss": 2.7791, + "step": 35267 + }, + { + "epoch": 2.189335154261593, + "grad_norm": 0.1360372999122338, + "learning_rate": 2.064230159635543e-05, + "loss": 2.705, + "step": 35268 + }, + { + "epoch": 2.189397231361351, + "grad_norm": 0.15093962055793841, + "learning_rate": 2.0639378233419225e-05, + "loss": 2.7487, + "step": 35269 + }, + { + "epoch": 2.1894593084611085, + "grad_norm": 0.15358094216321594, + "learning_rate": 2.063645502366454e-05, + "loss": 2.7166, + "step": 35270 + }, + { + "epoch": 2.1895213855608664, + "grad_norm": 0.15442510901653067, + "learning_rate": 2.063353196710659e-05, + "loss": 2.7348, + "step": 35271 + }, + { + "epoch": 2.1895834626606243, + "grad_norm": 0.14603704178090177, + "learning_rate": 2.0630609063760635e-05, + "loss": 2.7458, + "step": 35272 + }, + { + "epoch": 2.1896455397603822, + "grad_norm": 0.1653164727269071, + "learning_rate": 2.062768631364191e-05, + "loss": 2.7035, + "step": 35273 + }, + { + "epoch": 2.18970761686014, + "grad_norm": 0.1593203888152518, + "learning_rate": 2.062476371676569e-05, + "loss": 2.7882, + "step": 35274 + }, + { + "epoch": 2.189769693959898, + "grad_norm": 0.15640542525532256, + "learning_rate": 2.0621841273147202e-05, + "loss": 2.7432, + "step": 35275 + }, + { + "epoch": 2.189831771059656, + "grad_norm": 0.15179388754040493, + "learning_rate": 2.061891898280169e-05, + "loss": 2.6961, + "step": 35276 + }, + { + "epoch": 2.189893848159414, + "grad_norm": 0.14603227147978362, + "learning_rate": 2.0615996845744423e-05, + "loss": 2.6879, + "step": 35277 + }, + { + "epoch": 2.189955925259172, + "grad_norm": 0.14977687219365893, + "learning_rate": 2.0613074861990617e-05, + "loss": 2.7938, + "step": 35278 + }, + { + "epoch": 2.1900180023589297, + "grad_norm": 0.14140907226823124, + "learning_rate": 2.061015303155555e-05, + "loss": 2.6765, + "step": 35279 + }, + { + "epoch": 2.1900800794586877, + "grad_norm": 0.15157567094814484, + "learning_rate": 2.0607231354454455e-05, + "loss": 2.6983, + "step": 35280 + }, + { + "epoch": 2.1901421565584456, + "grad_norm": 0.17374471206131092, + "learning_rate": 2.060430983070257e-05, + "loss": 2.7161, + "step": 35281 + }, + { + "epoch": 2.1902042336582035, + "grad_norm": 0.1708556639412643, + "learning_rate": 2.060138846031513e-05, + "loss": 2.7249, + "step": 35282 + }, + { + "epoch": 2.1902663107579614, + "grad_norm": 0.146873897183113, + "learning_rate": 2.0598467243307367e-05, + "loss": 2.7785, + "step": 35283 + }, + { + "epoch": 2.1903283878577193, + "grad_norm": 0.15703869488846128, + "learning_rate": 2.059554617969456e-05, + "loss": 2.7549, + "step": 35284 + }, + { + "epoch": 2.1903904649574772, + "grad_norm": 0.15196312525775035, + "learning_rate": 2.0592625269491923e-05, + "loss": 2.7247, + "step": 35285 + }, + { + "epoch": 2.190452542057235, + "grad_norm": 0.1623621543937868, + "learning_rate": 2.0589704512714696e-05, + "loss": 2.7769, + "step": 35286 + }, + { + "epoch": 2.190514619156993, + "grad_norm": 0.15207866689847468, + "learning_rate": 2.0586783909378104e-05, + "loss": 2.7808, + "step": 35287 + }, + { + "epoch": 2.190576696256751, + "grad_norm": 0.16785410361777842, + "learning_rate": 2.058386345949742e-05, + "loss": 2.6267, + "step": 35288 + }, + { + "epoch": 2.190638773356509, + "grad_norm": 0.15013991550869837, + "learning_rate": 2.058094316308786e-05, + "loss": 2.7605, + "step": 35289 + }, + { + "epoch": 2.190700850456267, + "grad_norm": 0.13904560125736115, + "learning_rate": 2.0578023020164668e-05, + "loss": 2.6906, + "step": 35290 + }, + { + "epoch": 2.1907629275560248, + "grad_norm": 0.14408244868907466, + "learning_rate": 2.0575103030743048e-05, + "loss": 2.7075, + "step": 35291 + }, + { + "epoch": 2.1908250046557827, + "grad_norm": 0.19127633103638106, + "learning_rate": 2.057218319483828e-05, + "loss": 2.6171, + "step": 35292 + }, + { + "epoch": 2.1908870817555406, + "grad_norm": 0.16273712500172918, + "learning_rate": 2.056926351246557e-05, + "loss": 2.6367, + "step": 35293 + }, + { + "epoch": 2.1909491588552985, + "grad_norm": 0.1403131139079132, + "learning_rate": 2.0566343983640162e-05, + "loss": 2.7082, + "step": 35294 + }, + { + "epoch": 2.191011235955056, + "grad_norm": 0.17344900105979924, + "learning_rate": 2.056342460837728e-05, + "loss": 2.7349, + "step": 35295 + }, + { + "epoch": 2.191073313054814, + "grad_norm": 0.1446331670057516, + "learning_rate": 2.056050538669214e-05, + "loss": 2.7651, + "step": 35296 + }, + { + "epoch": 2.191135390154572, + "grad_norm": 0.15460669521459366, + "learning_rate": 2.0557586318600008e-05, + "loss": 2.7752, + "step": 35297 + }, + { + "epoch": 2.1911974672543297, + "grad_norm": 0.15842977230329908, + "learning_rate": 2.05546674041161e-05, + "loss": 2.7237, + "step": 35298 + }, + { + "epoch": 2.1912595443540877, + "grad_norm": 0.15401487897971083, + "learning_rate": 2.0551748643255638e-05, + "loss": 2.7369, + "step": 35299 + }, + { + "epoch": 2.1913216214538456, + "grad_norm": 0.17027577454830875, + "learning_rate": 2.0548830036033833e-05, + "loss": 2.7602, + "step": 35300 + }, + { + "epoch": 2.1913836985536035, + "grad_norm": 0.15712083495885795, + "learning_rate": 2.054591158246595e-05, + "loss": 2.6671, + "step": 35301 + }, + { + "epoch": 2.1914457756533614, + "grad_norm": 0.1435017845091864, + "learning_rate": 2.054299328256719e-05, + "loss": 2.7265, + "step": 35302 + }, + { + "epoch": 2.1915078527531193, + "grad_norm": 0.14777576236978338, + "learning_rate": 2.0540075136352794e-05, + "loss": 2.7624, + "step": 35303 + }, + { + "epoch": 2.1915699298528772, + "grad_norm": 0.15719935377219585, + "learning_rate": 2.0537157143837974e-05, + "loss": 2.7912, + "step": 35304 + }, + { + "epoch": 2.191632006952635, + "grad_norm": 0.15065215642566643, + "learning_rate": 2.0534239305037937e-05, + "loss": 2.6789, + "step": 35305 + }, + { + "epoch": 2.191694084052393, + "grad_norm": 0.16177194015466542, + "learning_rate": 2.0531321619967953e-05, + "loss": 2.7709, + "step": 35306 + }, + { + "epoch": 2.191756161152151, + "grad_norm": 0.14438490933722992, + "learning_rate": 2.052840408864321e-05, + "loss": 2.6936, + "step": 35307 + }, + { + "epoch": 2.191818238251909, + "grad_norm": 0.14828872754263295, + "learning_rate": 2.052548671107894e-05, + "loss": 2.7913, + "step": 35308 + }, + { + "epoch": 2.191880315351667, + "grad_norm": 0.17714719929625153, + "learning_rate": 2.0522569487290344e-05, + "loss": 2.648, + "step": 35309 + }, + { + "epoch": 2.1919423924514247, + "grad_norm": 0.1446145822668411, + "learning_rate": 2.0519652417292657e-05, + "loss": 2.6111, + "step": 35310 + }, + { + "epoch": 2.1920044695511827, + "grad_norm": 0.15511460247590025, + "learning_rate": 2.051673550110112e-05, + "loss": 2.7571, + "step": 35311 + }, + { + "epoch": 2.1920665466509406, + "grad_norm": 0.14125547537698643, + "learning_rate": 2.0513818738730928e-05, + "loss": 2.7415, + "step": 35312 + }, + { + "epoch": 2.1921286237506985, + "grad_norm": 0.14272269013739564, + "learning_rate": 2.0510902130197308e-05, + "loss": 2.6744, + "step": 35313 + }, + { + "epoch": 2.1921907008504564, + "grad_norm": 0.14058999447110848, + "learning_rate": 2.0507985675515446e-05, + "loss": 2.6823, + "step": 35314 + }, + { + "epoch": 2.1922527779502143, + "grad_norm": 0.13475306079180288, + "learning_rate": 2.0505069374700603e-05, + "loss": 2.8051, + "step": 35315 + }, + { + "epoch": 2.1923148550499723, + "grad_norm": 0.14397778086638657, + "learning_rate": 2.0502153227767972e-05, + "loss": 2.6462, + "step": 35316 + }, + { + "epoch": 2.19237693214973, + "grad_norm": 0.14185140207631072, + "learning_rate": 2.0499237234732765e-05, + "loss": 2.8632, + "step": 35317 + }, + { + "epoch": 2.1924390092494876, + "grad_norm": 0.1532253530154376, + "learning_rate": 2.0496321395610207e-05, + "loss": 2.7394, + "step": 35318 + }, + { + "epoch": 2.1925010863492456, + "grad_norm": 0.13972223772667056, + "learning_rate": 2.049340571041548e-05, + "loss": 2.7357, + "step": 35319 + }, + { + "epoch": 2.1925631634490035, + "grad_norm": 0.15895328920776441, + "learning_rate": 2.0490490179163835e-05, + "loss": 2.8196, + "step": 35320 + }, + { + "epoch": 2.1926252405487614, + "grad_norm": 0.14728984633574926, + "learning_rate": 2.0487574801870468e-05, + "loss": 2.7315, + "step": 35321 + }, + { + "epoch": 2.1926873176485193, + "grad_norm": 0.15564834869601277, + "learning_rate": 2.0484659578550586e-05, + "loss": 2.7659, + "step": 35322 + }, + { + "epoch": 2.1927493947482772, + "grad_norm": 0.1618038308431892, + "learning_rate": 2.0481744509219387e-05, + "loss": 2.78, + "step": 35323 + }, + { + "epoch": 2.192811471848035, + "grad_norm": 0.15480292907147378, + "learning_rate": 2.0478829593892107e-05, + "loss": 2.8161, + "step": 35324 + }, + { + "epoch": 2.192873548947793, + "grad_norm": 0.14340223274622563, + "learning_rate": 2.0475914832583937e-05, + "loss": 2.6993, + "step": 35325 + }, + { + "epoch": 2.192935626047551, + "grad_norm": 0.15662700945881108, + "learning_rate": 2.0473000225310092e-05, + "loss": 2.6913, + "step": 35326 + }, + { + "epoch": 2.192997703147309, + "grad_norm": 0.14716943766690493, + "learning_rate": 2.0470085772085752e-05, + "loss": 2.7232, + "step": 35327 + }, + { + "epoch": 2.193059780247067, + "grad_norm": 0.1459506101151685, + "learning_rate": 2.0467171472926162e-05, + "loss": 2.8176, + "step": 35328 + }, + { + "epoch": 2.1931218573468247, + "grad_norm": 0.14149805429387963, + "learning_rate": 2.046425732784651e-05, + "loss": 2.6987, + "step": 35329 + }, + { + "epoch": 2.1931839344465827, + "grad_norm": 0.1444329623504518, + "learning_rate": 2.0461343336861992e-05, + "loss": 2.8308, + "step": 35330 + }, + { + "epoch": 2.1932460115463406, + "grad_norm": 0.15987974814939143, + "learning_rate": 2.0458429499987817e-05, + "loss": 2.682, + "step": 35331 + }, + { + "epoch": 2.1933080886460985, + "grad_norm": 0.13933549509280344, + "learning_rate": 2.0455515817239175e-05, + "loss": 2.7296, + "step": 35332 + }, + { + "epoch": 2.1933701657458564, + "grad_norm": 0.15472716680233928, + "learning_rate": 2.0452602288631288e-05, + "loss": 2.7027, + "step": 35333 + }, + { + "epoch": 2.1934322428456143, + "grad_norm": 0.14439704179741983, + "learning_rate": 2.0449688914179355e-05, + "loss": 2.712, + "step": 35334 + }, + { + "epoch": 2.1934943199453723, + "grad_norm": 0.14358948266430635, + "learning_rate": 2.044677569389857e-05, + "loss": 2.7243, + "step": 35335 + }, + { + "epoch": 2.19355639704513, + "grad_norm": 0.1689477600625972, + "learning_rate": 2.0443862627804105e-05, + "loss": 2.7717, + "step": 35336 + }, + { + "epoch": 2.193618474144888, + "grad_norm": 0.14496209391094383, + "learning_rate": 2.0440949715911207e-05, + "loss": 2.8443, + "step": 35337 + }, + { + "epoch": 2.193680551244646, + "grad_norm": 0.1457832468965249, + "learning_rate": 2.0438036958235046e-05, + "loss": 2.6413, + "step": 35338 + }, + { + "epoch": 2.193742628344404, + "grad_norm": 0.14330104555935172, + "learning_rate": 2.043512435479083e-05, + "loss": 2.6505, + "step": 35339 + }, + { + "epoch": 2.193804705444162, + "grad_norm": 0.17043897718440595, + "learning_rate": 2.043221190559374e-05, + "loss": 2.7465, + "step": 35340 + }, + { + "epoch": 2.1938667825439193, + "grad_norm": 0.14205767736082459, + "learning_rate": 2.0429299610658966e-05, + "loss": 2.6938, + "step": 35341 + }, + { + "epoch": 2.1939288596436777, + "grad_norm": 0.1453200222960159, + "learning_rate": 2.0426387470001705e-05, + "loss": 2.7231, + "step": 35342 + }, + { + "epoch": 2.193990936743435, + "grad_norm": 0.1375963924395945, + "learning_rate": 2.042347548363719e-05, + "loss": 2.6557, + "step": 35343 + }, + { + "epoch": 2.194053013843193, + "grad_norm": 0.14266876709999318, + "learning_rate": 2.0420563651580577e-05, + "loss": 2.7111, + "step": 35344 + }, + { + "epoch": 2.194115090942951, + "grad_norm": 0.14144643639321064, + "learning_rate": 2.0417651973847063e-05, + "loss": 2.7578, + "step": 35345 + }, + { + "epoch": 2.194177168042709, + "grad_norm": 0.14331874791453456, + "learning_rate": 2.0414740450451826e-05, + "loss": 2.7198, + "step": 35346 + }, + { + "epoch": 2.194239245142467, + "grad_norm": 0.14124094985773308, + "learning_rate": 2.0411829081410084e-05, + "loss": 2.7555, + "step": 35347 + }, + { + "epoch": 2.1943013222422247, + "grad_norm": 0.140708018010795, + "learning_rate": 2.0408917866737014e-05, + "loss": 2.6516, + "step": 35348 + }, + { + "epoch": 2.1943633993419827, + "grad_norm": 0.14207534453883727, + "learning_rate": 2.04060068064478e-05, + "loss": 2.7545, + "step": 35349 + }, + { + "epoch": 2.1944254764417406, + "grad_norm": 0.1389472537458753, + "learning_rate": 2.0403095900557624e-05, + "loss": 2.684, + "step": 35350 + }, + { + "epoch": 2.1944875535414985, + "grad_norm": 0.141095183376834, + "learning_rate": 2.0400185149081692e-05, + "loss": 2.6927, + "step": 35351 + }, + { + "epoch": 2.1945496306412564, + "grad_norm": 0.14071614570037605, + "learning_rate": 2.0397274552035184e-05, + "loss": 2.7231, + "step": 35352 + }, + { + "epoch": 2.1946117077410143, + "grad_norm": 0.13785319374094346, + "learning_rate": 2.0394364109433273e-05, + "loss": 2.7163, + "step": 35353 + }, + { + "epoch": 2.1946737848407722, + "grad_norm": 0.15814668331446316, + "learning_rate": 2.0391453821291156e-05, + "loss": 2.7465, + "step": 35354 + }, + { + "epoch": 2.19473586194053, + "grad_norm": 0.1416045024172011, + "learning_rate": 2.0388543687623997e-05, + "loss": 2.7806, + "step": 35355 + }, + { + "epoch": 2.194797939040288, + "grad_norm": 0.13695191528081985, + "learning_rate": 2.038563370844701e-05, + "loss": 2.7966, + "step": 35356 + }, + { + "epoch": 2.194860016140046, + "grad_norm": 0.17253913229935944, + "learning_rate": 2.0382723883775363e-05, + "loss": 2.6948, + "step": 35357 + }, + { + "epoch": 2.194922093239804, + "grad_norm": 0.17238266163224447, + "learning_rate": 2.0379814213624233e-05, + "loss": 2.76, + "step": 35358 + }, + { + "epoch": 2.194984170339562, + "grad_norm": 0.14665697737228303, + "learning_rate": 2.0376904698008785e-05, + "loss": 2.7808, + "step": 35359 + }, + { + "epoch": 2.1950462474393198, + "grad_norm": 0.1594280984256967, + "learning_rate": 2.0373995336944236e-05, + "loss": 2.7275, + "step": 35360 + }, + { + "epoch": 2.1951083245390777, + "grad_norm": 0.14429561635094776, + "learning_rate": 2.0371086130445744e-05, + "loss": 2.7002, + "step": 35361 + }, + { + "epoch": 2.1951704016388356, + "grad_norm": 0.14524067287938794, + "learning_rate": 2.0368177078528482e-05, + "loss": 2.6552, + "step": 35362 + }, + { + "epoch": 2.1952324787385935, + "grad_norm": 0.14648832315846302, + "learning_rate": 2.0365268181207624e-05, + "loss": 2.6559, + "step": 35363 + }, + { + "epoch": 2.1952945558383514, + "grad_norm": 0.14170998993099437, + "learning_rate": 2.0362359438498374e-05, + "loss": 2.7649, + "step": 35364 + }, + { + "epoch": 2.1953566329381093, + "grad_norm": 0.15897719252199685, + "learning_rate": 2.0359450850415885e-05, + "loss": 2.7823, + "step": 35365 + }, + { + "epoch": 2.195418710037867, + "grad_norm": 0.15485034135633663, + "learning_rate": 2.035654241697534e-05, + "loss": 2.6468, + "step": 35366 + }, + { + "epoch": 2.1954807871376247, + "grad_norm": 0.18296670360381803, + "learning_rate": 2.0353634138191908e-05, + "loss": 2.7841, + "step": 35367 + }, + { + "epoch": 2.1955428642373827, + "grad_norm": 0.15470136064987977, + "learning_rate": 2.0350726014080746e-05, + "loss": 2.7702, + "step": 35368 + }, + { + "epoch": 2.1956049413371406, + "grad_norm": 0.15399485617723038, + "learning_rate": 2.0347818044657062e-05, + "loss": 2.8046, + "step": 35369 + }, + { + "epoch": 2.1956670184368985, + "grad_norm": 0.1385218184517809, + "learning_rate": 2.034491022993601e-05, + "loss": 2.6828, + "step": 35370 + }, + { + "epoch": 2.1957290955366564, + "grad_norm": 0.15566363579896195, + "learning_rate": 2.034200256993276e-05, + "loss": 2.7938, + "step": 35371 + }, + { + "epoch": 2.1957911726364143, + "grad_norm": 0.14413037638425333, + "learning_rate": 2.033909506466246e-05, + "loss": 2.6622, + "step": 35372 + }, + { + "epoch": 2.1958532497361722, + "grad_norm": 0.15664220699738604, + "learning_rate": 2.0336187714140327e-05, + "loss": 2.6804, + "step": 35373 + }, + { + "epoch": 2.19591532683593, + "grad_norm": 0.15296858028679103, + "learning_rate": 2.033328051838148e-05, + "loss": 2.6548, + "step": 35374 + }, + { + "epoch": 2.195977403935688, + "grad_norm": 0.14679463907095128, + "learning_rate": 2.0330373477401132e-05, + "loss": 2.7208, + "step": 35375 + }, + { + "epoch": 2.196039481035446, + "grad_norm": 0.13959165232217866, + "learning_rate": 2.0327466591214427e-05, + "loss": 2.6831, + "step": 35376 + }, + { + "epoch": 2.196101558135204, + "grad_norm": 0.151286866812085, + "learning_rate": 2.032455985983651e-05, + "loss": 2.6782, + "step": 35377 + }, + { + "epoch": 2.196163635234962, + "grad_norm": 0.1393778921263461, + "learning_rate": 2.0321653283282594e-05, + "loss": 2.713, + "step": 35378 + }, + { + "epoch": 2.1962257123347197, + "grad_norm": 0.17774779929302348, + "learning_rate": 2.031874686156781e-05, + "loss": 2.6932, + "step": 35379 + }, + { + "epoch": 2.1962877894344777, + "grad_norm": 0.14245038646414268, + "learning_rate": 2.0315840594707337e-05, + "loss": 2.7324, + "step": 35380 + }, + { + "epoch": 2.1963498665342356, + "grad_norm": 0.15767887165845318, + "learning_rate": 2.0312934482716324e-05, + "loss": 2.6108, + "step": 35381 + }, + { + "epoch": 2.1964119436339935, + "grad_norm": 0.15536121670290176, + "learning_rate": 2.031002852560992e-05, + "loss": 2.7726, + "step": 35382 + }, + { + "epoch": 2.1964740207337514, + "grad_norm": 0.14337049072162086, + "learning_rate": 2.0307122723403326e-05, + "loss": 2.7049, + "step": 35383 + }, + { + "epoch": 2.1965360978335093, + "grad_norm": 0.162424877803214, + "learning_rate": 2.030421707611168e-05, + "loss": 2.6723, + "step": 35384 + }, + { + "epoch": 2.1965981749332673, + "grad_norm": 0.16300930575562803, + "learning_rate": 2.0301311583750142e-05, + "loss": 2.7921, + "step": 35385 + }, + { + "epoch": 2.196660252033025, + "grad_norm": 0.14375610882798726, + "learning_rate": 2.0298406246333852e-05, + "loss": 2.7304, + "step": 35386 + }, + { + "epoch": 2.196722329132783, + "grad_norm": 0.14411095130262633, + "learning_rate": 2.029550106387801e-05, + "loss": 2.7181, + "step": 35387 + }, + { + "epoch": 2.196784406232541, + "grad_norm": 0.14544527258756923, + "learning_rate": 2.0292596036397743e-05, + "loss": 2.7208, + "step": 35388 + }, + { + "epoch": 2.1968464833322985, + "grad_norm": 0.14280973852271556, + "learning_rate": 2.0289691163908216e-05, + "loss": 2.7993, + "step": 35389 + }, + { + "epoch": 2.196908560432057, + "grad_norm": 0.14728724753505407, + "learning_rate": 2.0286786446424583e-05, + "loss": 2.715, + "step": 35390 + }, + { + "epoch": 2.1969706375318143, + "grad_norm": 0.15186274839167888, + "learning_rate": 2.028388188396198e-05, + "loss": 2.7727, + "step": 35391 + }, + { + "epoch": 2.1970327146315722, + "grad_norm": 0.14243507394193367, + "learning_rate": 2.02809774765356e-05, + "loss": 2.6812, + "step": 35392 + }, + { + "epoch": 2.19709479173133, + "grad_norm": 0.16189423613418408, + "learning_rate": 2.0278073224160572e-05, + "loss": 2.7703, + "step": 35393 + }, + { + "epoch": 2.197156868831088, + "grad_norm": 0.13895560506642618, + "learning_rate": 2.027516912685205e-05, + "loss": 2.694, + "step": 35394 + }, + { + "epoch": 2.197218945930846, + "grad_norm": 0.17112144633609566, + "learning_rate": 2.0272265184625173e-05, + "loss": 2.8002, + "step": 35395 + }, + { + "epoch": 2.197281023030604, + "grad_norm": 0.1446217112068349, + "learning_rate": 2.0269361397495123e-05, + "loss": 2.7915, + "step": 35396 + }, + { + "epoch": 2.197343100130362, + "grad_norm": 0.1472081360554698, + "learning_rate": 2.0266457765477033e-05, + "loss": 2.6958, + "step": 35397 + }, + { + "epoch": 2.1974051772301197, + "grad_norm": 0.14180192912108375, + "learning_rate": 2.0263554288586044e-05, + "loss": 2.6623, + "step": 35398 + }, + { + "epoch": 2.1974672543298777, + "grad_norm": 0.14391842646655842, + "learning_rate": 2.0260650966837298e-05, + "loss": 2.7155, + "step": 35399 + }, + { + "epoch": 2.1975293314296356, + "grad_norm": 0.14340344071395367, + "learning_rate": 2.0257747800245967e-05, + "loss": 2.6914, + "step": 35400 + }, + { + "epoch": 2.1975914085293935, + "grad_norm": 0.141663879625403, + "learning_rate": 2.0254844788827193e-05, + "loss": 2.6742, + "step": 35401 + }, + { + "epoch": 2.1976534856291514, + "grad_norm": 0.14336958778654083, + "learning_rate": 2.0251941932596113e-05, + "loss": 2.703, + "step": 35402 + }, + { + "epoch": 2.1977155627289093, + "grad_norm": 0.14053713517211652, + "learning_rate": 2.0249039231567873e-05, + "loss": 2.6716, + "step": 35403 + }, + { + "epoch": 2.1977776398286673, + "grad_norm": 0.15816905985347507, + "learning_rate": 2.0246136685757604e-05, + "loss": 2.7499, + "step": 35404 + }, + { + "epoch": 2.197839716928425, + "grad_norm": 0.17100354368493514, + "learning_rate": 2.0243234295180475e-05, + "loss": 2.7245, + "step": 35405 + }, + { + "epoch": 2.197901794028183, + "grad_norm": 0.16247512737069023, + "learning_rate": 2.024033205985162e-05, + "loss": 2.7483, + "step": 35406 + }, + { + "epoch": 2.197963871127941, + "grad_norm": 0.15063473471172886, + "learning_rate": 2.0237429979786154e-05, + "loss": 2.7911, + "step": 35407 + }, + { + "epoch": 2.198025948227699, + "grad_norm": 0.156407128244882, + "learning_rate": 2.0234528054999263e-05, + "loss": 2.678, + "step": 35408 + }, + { + "epoch": 2.198088025327457, + "grad_norm": 0.14890760717343887, + "learning_rate": 2.023162628550605e-05, + "loss": 2.6828, + "step": 35409 + }, + { + "epoch": 2.1981501024272148, + "grad_norm": 0.1408582382346696, + "learning_rate": 2.0228724671321682e-05, + "loss": 2.7691, + "step": 35410 + }, + { + "epoch": 2.1982121795269727, + "grad_norm": 0.14284117483831762, + "learning_rate": 2.0225823212461286e-05, + "loss": 2.6498, + "step": 35411 + }, + { + "epoch": 2.1982742566267306, + "grad_norm": 0.14753184603130529, + "learning_rate": 2.0222921908939997e-05, + "loss": 2.6055, + "step": 35412 + }, + { + "epoch": 2.1983363337264885, + "grad_norm": 0.13759995262985883, + "learning_rate": 2.022002076077294e-05, + "loss": 2.7526, + "step": 35413 + }, + { + "epoch": 2.198398410826246, + "grad_norm": 0.1413185646003444, + "learning_rate": 2.0217119767975275e-05, + "loss": 2.7387, + "step": 35414 + }, + { + "epoch": 2.198460487926004, + "grad_norm": 0.1430117662264112, + "learning_rate": 2.021421893056213e-05, + "loss": 2.7308, + "step": 35415 + }, + { + "epoch": 2.198522565025762, + "grad_norm": 0.14435779627979187, + "learning_rate": 2.0211318248548632e-05, + "loss": 2.7041, + "step": 35416 + }, + { + "epoch": 2.1985846421255197, + "grad_norm": 0.13926491391558832, + "learning_rate": 2.0208417721949923e-05, + "loss": 2.6834, + "step": 35417 + }, + { + "epoch": 2.1986467192252777, + "grad_norm": 0.16490877527736297, + "learning_rate": 2.0205517350781112e-05, + "loss": 2.7311, + "step": 35418 + }, + { + "epoch": 2.1987087963250356, + "grad_norm": 0.14151584398501843, + "learning_rate": 2.0202617135057366e-05, + "loss": 2.6808, + "step": 35419 + }, + { + "epoch": 2.1987708734247935, + "grad_norm": 0.1400466190677557, + "learning_rate": 2.01997170747938e-05, + "loss": 2.7126, + "step": 35420 + }, + { + "epoch": 2.1988329505245514, + "grad_norm": 0.16436016236990714, + "learning_rate": 2.019681717000555e-05, + "loss": 2.7596, + "step": 35421 + }, + { + "epoch": 2.1988950276243093, + "grad_norm": 0.13774706496981967, + "learning_rate": 2.0193917420707715e-05, + "loss": 2.7451, + "step": 35422 + }, + { + "epoch": 2.1989571047240672, + "grad_norm": 0.14764955596282206, + "learning_rate": 2.019101782691547e-05, + "loss": 2.6995, + "step": 35423 + }, + { + "epoch": 2.199019181823825, + "grad_norm": 0.15725471410409514, + "learning_rate": 2.0188118388643917e-05, + "loss": 2.7581, + "step": 35424 + }, + { + "epoch": 2.199081258923583, + "grad_norm": 0.14124696332697126, + "learning_rate": 2.0185219105908192e-05, + "loss": 2.6258, + "step": 35425 + }, + { + "epoch": 2.199143336023341, + "grad_norm": 0.14259859486829915, + "learning_rate": 2.0182319978723413e-05, + "loss": 2.6724, + "step": 35426 + }, + { + "epoch": 2.199205413123099, + "grad_norm": 0.1532135670812514, + "learning_rate": 2.0179421007104698e-05, + "loss": 2.7198, + "step": 35427 + }, + { + "epoch": 2.199267490222857, + "grad_norm": 0.14610869234341914, + "learning_rate": 2.0176522191067193e-05, + "loss": 2.7092, + "step": 35428 + }, + { + "epoch": 2.1993295673226148, + "grad_norm": 0.16588917572776757, + "learning_rate": 2.0173623530626018e-05, + "loss": 2.7656, + "step": 35429 + }, + { + "epoch": 2.1993916444223727, + "grad_norm": 0.16742282131103856, + "learning_rate": 2.017072502579628e-05, + "loss": 2.6611, + "step": 35430 + }, + { + "epoch": 2.1994537215221306, + "grad_norm": 0.15754900028165206, + "learning_rate": 2.0167826676593104e-05, + "loss": 2.68, + "step": 35431 + }, + { + "epoch": 2.1995157986218885, + "grad_norm": 0.14821257615909067, + "learning_rate": 2.016492848303163e-05, + "loss": 2.7707, + "step": 35432 + }, + { + "epoch": 2.1995778757216464, + "grad_norm": 0.1586710520675558, + "learning_rate": 2.0162030445126967e-05, + "loss": 2.7611, + "step": 35433 + }, + { + "epoch": 2.1996399528214043, + "grad_norm": 0.14220178920083662, + "learning_rate": 2.015913256289424e-05, + "loss": 2.6565, + "step": 35434 + }, + { + "epoch": 2.1997020299211623, + "grad_norm": 0.15547357412741908, + "learning_rate": 2.0156234836348536e-05, + "loss": 2.6945, + "step": 35435 + }, + { + "epoch": 2.19976410702092, + "grad_norm": 0.1362626232634126, + "learning_rate": 2.0153337265505022e-05, + "loss": 2.6535, + "step": 35436 + }, + { + "epoch": 2.1998261841206777, + "grad_norm": 0.14219634007651186, + "learning_rate": 2.0150439850378795e-05, + "loss": 2.7962, + "step": 35437 + }, + { + "epoch": 2.199888261220436, + "grad_norm": 0.14103936368051886, + "learning_rate": 2.014754259098497e-05, + "loss": 2.6495, + "step": 35438 + }, + { + "epoch": 2.1999503383201935, + "grad_norm": 0.14051490686242235, + "learning_rate": 2.0144645487338658e-05, + "loss": 2.727, + "step": 35439 + }, + { + "epoch": 2.2000124154199514, + "grad_norm": 0.14387235073422833, + "learning_rate": 2.0141748539454964e-05, + "loss": 2.7732, + "step": 35440 + }, + { + "epoch": 2.2000744925197093, + "grad_norm": 0.136876641790568, + "learning_rate": 2.0138851747349015e-05, + "loss": 2.6978, + "step": 35441 + }, + { + "epoch": 2.2001365696194672, + "grad_norm": 0.13685894317811428, + "learning_rate": 2.013595511103594e-05, + "loss": 2.7676, + "step": 35442 + }, + { + "epoch": 2.200198646719225, + "grad_norm": 0.15408307364671273, + "learning_rate": 2.0133058630530842e-05, + "loss": 2.6733, + "step": 35443 + }, + { + "epoch": 2.200260723818983, + "grad_norm": 0.1548786963725244, + "learning_rate": 2.0130162305848827e-05, + "loss": 2.6278, + "step": 35444 + }, + { + "epoch": 2.200322800918741, + "grad_norm": 0.14235098264557378, + "learning_rate": 2.0127266137004984e-05, + "loss": 2.7135, + "step": 35445 + }, + { + "epoch": 2.200384878018499, + "grad_norm": 0.1404116986442451, + "learning_rate": 2.0124370124014464e-05, + "loss": 2.7489, + "step": 35446 + }, + { + "epoch": 2.200446955118257, + "grad_norm": 0.14061407205718696, + "learning_rate": 2.012147426689236e-05, + "loss": 2.708, + "step": 35447 + }, + { + "epoch": 2.2005090322180147, + "grad_norm": 0.15301384029440268, + "learning_rate": 2.011857856565378e-05, + "loss": 2.6663, + "step": 35448 + }, + { + "epoch": 2.2005711093177727, + "grad_norm": 0.13942406375590158, + "learning_rate": 2.0115683020313806e-05, + "loss": 2.7599, + "step": 35449 + }, + { + "epoch": 2.2006331864175306, + "grad_norm": 0.14131017498194084, + "learning_rate": 2.011278763088759e-05, + "loss": 2.7039, + "step": 35450 + }, + { + "epoch": 2.2006952635172885, + "grad_norm": 0.13907257134689438, + "learning_rate": 2.0109892397390217e-05, + "loss": 2.738, + "step": 35451 + }, + { + "epoch": 2.2007573406170464, + "grad_norm": 0.13965768716763638, + "learning_rate": 2.010699731983679e-05, + "loss": 2.7732, + "step": 35452 + }, + { + "epoch": 2.2008194177168043, + "grad_norm": 0.14346034005272376, + "learning_rate": 2.0104102398242418e-05, + "loss": 2.7365, + "step": 35453 + }, + { + "epoch": 2.2008814948165623, + "grad_norm": 0.13381932605471364, + "learning_rate": 2.0101207632622182e-05, + "loss": 2.7236, + "step": 35454 + }, + { + "epoch": 2.20094357191632, + "grad_norm": 0.13758307813602064, + "learning_rate": 2.0098313022991218e-05, + "loss": 2.7749, + "step": 35455 + }, + { + "epoch": 2.201005649016078, + "grad_norm": 0.15583698403468949, + "learning_rate": 2.0095418569364617e-05, + "loss": 2.7007, + "step": 35456 + }, + { + "epoch": 2.201067726115836, + "grad_norm": 0.13594588083981787, + "learning_rate": 2.0092524271757474e-05, + "loss": 2.6991, + "step": 35457 + }, + { + "epoch": 2.201129803215594, + "grad_norm": 0.1425406916350925, + "learning_rate": 2.0089630130184878e-05, + "loss": 2.7821, + "step": 35458 + }, + { + "epoch": 2.201191880315352, + "grad_norm": 0.14412699689250963, + "learning_rate": 2.0086736144661962e-05, + "loss": 2.6419, + "step": 35459 + }, + { + "epoch": 2.2012539574151098, + "grad_norm": 0.1442183574431461, + "learning_rate": 2.00838423152038e-05, + "loss": 2.7266, + "step": 35460 + }, + { + "epoch": 2.2013160345148677, + "grad_norm": 0.15006922152836252, + "learning_rate": 2.0080948641825498e-05, + "loss": 2.6531, + "step": 35461 + }, + { + "epoch": 2.201378111614625, + "grad_norm": 0.15095213930641133, + "learning_rate": 2.0078055124542128e-05, + "loss": 2.7383, + "step": 35462 + }, + { + "epoch": 2.201440188714383, + "grad_norm": 0.1457806404035606, + "learning_rate": 2.0075161763368834e-05, + "loss": 2.7095, + "step": 35463 + }, + { + "epoch": 2.201502265814141, + "grad_norm": 0.13501919511697746, + "learning_rate": 2.007226855832068e-05, + "loss": 2.7107, + "step": 35464 + }, + { + "epoch": 2.201564342913899, + "grad_norm": 0.1426454476573264, + "learning_rate": 2.0069375509412763e-05, + "loss": 2.6755, + "step": 35465 + }, + { + "epoch": 2.201626420013657, + "grad_norm": 0.1764901595330748, + "learning_rate": 2.0066482616660188e-05, + "loss": 2.7835, + "step": 35466 + }, + { + "epoch": 2.2016884971134147, + "grad_norm": 0.16019448544991755, + "learning_rate": 2.0063589880078015e-05, + "loss": 2.6279, + "step": 35467 + }, + { + "epoch": 2.2017505742131727, + "grad_norm": 0.14246264444912715, + "learning_rate": 2.006069729968138e-05, + "loss": 2.7042, + "step": 35468 + }, + { + "epoch": 2.2018126513129306, + "grad_norm": 0.1479091590052854, + "learning_rate": 2.0057804875485357e-05, + "loss": 2.6973, + "step": 35469 + }, + { + "epoch": 2.2018747284126885, + "grad_norm": 0.145203900261569, + "learning_rate": 2.005491260750504e-05, + "loss": 2.7378, + "step": 35470 + }, + { + "epoch": 2.2019368055124464, + "grad_norm": 0.1503047388238193, + "learning_rate": 2.0052020495755487e-05, + "loss": 2.7699, + "step": 35471 + }, + { + "epoch": 2.2019988826122043, + "grad_norm": 0.13921005291430008, + "learning_rate": 2.004912854025183e-05, + "loss": 2.7096, + "step": 35472 + }, + { + "epoch": 2.2020609597119623, + "grad_norm": 0.1435796365661461, + "learning_rate": 2.0046236741009135e-05, + "loss": 2.6653, + "step": 35473 + }, + { + "epoch": 2.20212303681172, + "grad_norm": 0.15190315724371167, + "learning_rate": 2.00433450980425e-05, + "loss": 2.8059, + "step": 35474 + }, + { + "epoch": 2.202185113911478, + "grad_norm": 0.13617872807454362, + "learning_rate": 2.004045361136701e-05, + "loss": 2.6373, + "step": 35475 + }, + { + "epoch": 2.202247191011236, + "grad_norm": 0.14034830794049993, + "learning_rate": 2.0037562280997744e-05, + "loss": 2.6789, + "step": 35476 + }, + { + "epoch": 2.202309268110994, + "grad_norm": 0.15140078387122077, + "learning_rate": 2.0034671106949775e-05, + "loss": 2.7615, + "step": 35477 + }, + { + "epoch": 2.202371345210752, + "grad_norm": 0.13878830870604297, + "learning_rate": 2.0031780089238216e-05, + "loss": 2.7715, + "step": 35478 + }, + { + "epoch": 2.2024334223105098, + "grad_norm": 0.14123130931031722, + "learning_rate": 2.002888922787814e-05, + "loss": 2.7212, + "step": 35479 + }, + { + "epoch": 2.2024954994102677, + "grad_norm": 0.1396518453435638, + "learning_rate": 2.002599852288462e-05, + "loss": 2.6583, + "step": 35480 + }, + { + "epoch": 2.2025575765100256, + "grad_norm": 0.13876039070881535, + "learning_rate": 2.0023107974272726e-05, + "loss": 2.6615, + "step": 35481 + }, + { + "epoch": 2.2026196536097835, + "grad_norm": 0.13745015552205972, + "learning_rate": 2.002021758205757e-05, + "loss": 2.7148, + "step": 35482 + }, + { + "epoch": 2.2026817307095414, + "grad_norm": 0.14241220670559546, + "learning_rate": 2.001732734625422e-05, + "loss": 2.7742, + "step": 35483 + }, + { + "epoch": 2.2027438078092993, + "grad_norm": 0.1382229213218848, + "learning_rate": 2.0014437266877754e-05, + "loss": 2.724, + "step": 35484 + }, + { + "epoch": 2.202805884909057, + "grad_norm": 0.14180814866366878, + "learning_rate": 2.0011547343943225e-05, + "loss": 2.7385, + "step": 35485 + }, + { + "epoch": 2.202867962008815, + "grad_norm": 0.15027297359758868, + "learning_rate": 2.0008657577465755e-05, + "loss": 2.7246, + "step": 35486 + }, + { + "epoch": 2.2029300391085727, + "grad_norm": 0.14774743095575263, + "learning_rate": 2.00057679674604e-05, + "loss": 2.7304, + "step": 35487 + }, + { + "epoch": 2.2029921162083306, + "grad_norm": 0.15236170369507487, + "learning_rate": 2.000287851394223e-05, + "loss": 2.6975, + "step": 35488 + }, + { + "epoch": 2.2030541933080885, + "grad_norm": 0.1653955548236003, + "learning_rate": 1.999998921692633e-05, + "loss": 2.7188, + "step": 35489 + }, + { + "epoch": 2.2031162704078464, + "grad_norm": 0.14140492301655497, + "learning_rate": 1.9997100076427744e-05, + "loss": 2.6662, + "step": 35490 + }, + { + "epoch": 2.2031783475076043, + "grad_norm": 0.1547751859708934, + "learning_rate": 1.9994211092461594e-05, + "loss": 2.8315, + "step": 35491 + }, + { + "epoch": 2.2032404246073622, + "grad_norm": 0.14217563542969763, + "learning_rate": 1.9991322265042928e-05, + "loss": 2.7678, + "step": 35492 + }, + { + "epoch": 2.20330250170712, + "grad_norm": 0.15004703981207157, + "learning_rate": 1.9988433594186816e-05, + "loss": 2.7842, + "step": 35493 + }, + { + "epoch": 2.203364578806878, + "grad_norm": 0.15607046303009275, + "learning_rate": 1.998554507990832e-05, + "loss": 2.7623, + "step": 35494 + }, + { + "epoch": 2.203426655906636, + "grad_norm": 0.15153905893722966, + "learning_rate": 1.998265672222253e-05, + "loss": 2.8084, + "step": 35495 + }, + { + "epoch": 2.203488733006394, + "grad_norm": 0.14672395188125753, + "learning_rate": 1.9979768521144516e-05, + "loss": 2.7155, + "step": 35496 + }, + { + "epoch": 2.203550810106152, + "grad_norm": 0.13851366958270683, + "learning_rate": 1.9976880476689337e-05, + "loss": 2.7291, + "step": 35497 + }, + { + "epoch": 2.2036128872059098, + "grad_norm": 0.15973302116779312, + "learning_rate": 1.9973992588872037e-05, + "loss": 2.7556, + "step": 35498 + }, + { + "epoch": 2.2036749643056677, + "grad_norm": 0.16022750967189317, + "learning_rate": 1.9971104857707724e-05, + "loss": 2.7267, + "step": 35499 + }, + { + "epoch": 2.2037370414054256, + "grad_norm": 0.1451088922216756, + "learning_rate": 1.996821728321145e-05, + "loss": 2.7234, + "step": 35500 + }, + { + "epoch": 2.2037991185051835, + "grad_norm": 0.13690159015740355, + "learning_rate": 1.996532986539828e-05, + "loss": 2.7077, + "step": 35501 + }, + { + "epoch": 2.2038611956049414, + "grad_norm": 0.1421727793797154, + "learning_rate": 1.996244260428327e-05, + "loss": 2.6623, + "step": 35502 + }, + { + "epoch": 2.2039232727046993, + "grad_norm": 0.13716387494243817, + "learning_rate": 1.9959555499881472e-05, + "loss": 2.7102, + "step": 35503 + }, + { + "epoch": 2.2039853498044573, + "grad_norm": 0.1543913564047945, + "learning_rate": 1.9956668552207986e-05, + "loss": 2.7101, + "step": 35504 + }, + { + "epoch": 2.204047426904215, + "grad_norm": 0.13573691817560374, + "learning_rate": 1.9953781761277834e-05, + "loss": 2.6885, + "step": 35505 + }, + { + "epoch": 2.204109504003973, + "grad_norm": 0.15304716039127617, + "learning_rate": 1.9950895127106112e-05, + "loss": 2.7191, + "step": 35506 + }, + { + "epoch": 2.204171581103731, + "grad_norm": 0.15078348395318392, + "learning_rate": 1.9948008649707865e-05, + "loss": 2.7262, + "step": 35507 + }, + { + "epoch": 2.204233658203489, + "grad_norm": 0.17665291935324806, + "learning_rate": 1.994512232909814e-05, + "loss": 2.667, + "step": 35508 + }, + { + "epoch": 2.204295735303247, + "grad_norm": 0.1713539646508863, + "learning_rate": 1.994223616529202e-05, + "loss": 2.6558, + "step": 35509 + }, + { + "epoch": 2.2043578124030043, + "grad_norm": 0.1398697746291997, + "learning_rate": 1.9939350158304553e-05, + "loss": 2.6926, + "step": 35510 + }, + { + "epoch": 2.2044198895027622, + "grad_norm": 0.13582889614239577, + "learning_rate": 1.9936464308150792e-05, + "loss": 2.7141, + "step": 35511 + }, + { + "epoch": 2.20448196660252, + "grad_norm": 0.14745213311797453, + "learning_rate": 1.993357861484578e-05, + "loss": 2.7887, + "step": 35512 + }, + { + "epoch": 2.204544043702278, + "grad_norm": 0.1447398832113759, + "learning_rate": 1.9930693078404605e-05, + "loss": 2.7609, + "step": 35513 + }, + { + "epoch": 2.204606120802036, + "grad_norm": 0.1497334078192561, + "learning_rate": 1.9927807698842305e-05, + "loss": 2.6824, + "step": 35514 + }, + { + "epoch": 2.204668197901794, + "grad_norm": 0.14203488050464316, + "learning_rate": 1.992492247617393e-05, + "loss": 2.7204, + "step": 35515 + }, + { + "epoch": 2.204730275001552, + "grad_norm": 0.13727694729054477, + "learning_rate": 1.9922037410414535e-05, + "loss": 2.7384, + "step": 35516 + }, + { + "epoch": 2.2047923521013097, + "grad_norm": 0.17467353205377276, + "learning_rate": 1.991915250157915e-05, + "loss": 2.7019, + "step": 35517 + }, + { + "epoch": 2.2048544292010677, + "grad_norm": 0.15420526489039324, + "learning_rate": 1.9916267749682875e-05, + "loss": 2.7141, + "step": 35518 + }, + { + "epoch": 2.2049165063008256, + "grad_norm": 0.18247783668381226, + "learning_rate": 1.9913383154740734e-05, + "loss": 2.705, + "step": 35519 + }, + { + "epoch": 2.2049785834005835, + "grad_norm": 0.14699890688040482, + "learning_rate": 1.9910498716767774e-05, + "loss": 2.7659, + "step": 35520 + }, + { + "epoch": 2.2050406605003414, + "grad_norm": 0.14036797505835766, + "learning_rate": 1.9907614435779028e-05, + "loss": 2.7929, + "step": 35521 + }, + { + "epoch": 2.2051027376000993, + "grad_norm": 0.14036719214209603, + "learning_rate": 1.9904730311789577e-05, + "loss": 2.7452, + "step": 35522 + }, + { + "epoch": 2.2051648146998573, + "grad_norm": 0.16408102180743495, + "learning_rate": 1.990184634481446e-05, + "loss": 2.758, + "step": 35523 + }, + { + "epoch": 2.205226891799615, + "grad_norm": 0.14062615897442807, + "learning_rate": 1.9898962534868716e-05, + "loss": 2.7398, + "step": 35524 + }, + { + "epoch": 2.205288968899373, + "grad_norm": 0.14748620454680153, + "learning_rate": 1.989607888196739e-05, + "loss": 2.7586, + "step": 35525 + }, + { + "epoch": 2.205351045999131, + "grad_norm": 0.15646227483910713, + "learning_rate": 1.9893195386125513e-05, + "loss": 2.721, + "step": 35526 + }, + { + "epoch": 2.205413123098889, + "grad_norm": 0.1477340606754697, + "learning_rate": 1.9890312047358162e-05, + "loss": 2.7402, + "step": 35527 + }, + { + "epoch": 2.205475200198647, + "grad_norm": 0.15162693625194243, + "learning_rate": 1.988742886568036e-05, + "loss": 2.6977, + "step": 35528 + }, + { + "epoch": 2.2055372772984048, + "grad_norm": 0.1662134058283809, + "learning_rate": 1.988454584110715e-05, + "loss": 2.7387, + "step": 35529 + }, + { + "epoch": 2.2055993543981627, + "grad_norm": 0.15045521327653424, + "learning_rate": 1.9881662973653557e-05, + "loss": 2.7821, + "step": 35530 + }, + { + "epoch": 2.2056614314979206, + "grad_norm": 0.19289799756112763, + "learning_rate": 1.9878780263334668e-05, + "loss": 2.6602, + "step": 35531 + }, + { + "epoch": 2.2057235085976785, + "grad_norm": 0.15175705201340156, + "learning_rate": 1.9875897710165487e-05, + "loss": 2.7406, + "step": 35532 + }, + { + "epoch": 2.205785585697436, + "grad_norm": 0.13889697435474047, + "learning_rate": 1.987301531416106e-05, + "loss": 2.6934, + "step": 35533 + }, + { + "epoch": 2.205847662797194, + "grad_norm": 0.14175301049372652, + "learning_rate": 1.9870133075336406e-05, + "loss": 2.7136, + "step": 35534 + }, + { + "epoch": 2.205909739896952, + "grad_norm": 0.15439529539398125, + "learning_rate": 1.9867250993706604e-05, + "loss": 2.6953, + "step": 35535 + }, + { + "epoch": 2.2059718169967097, + "grad_norm": 0.1472655384862833, + "learning_rate": 1.9864369069286666e-05, + "loss": 2.7592, + "step": 35536 + }, + { + "epoch": 2.2060338940964677, + "grad_norm": 0.20532315466072576, + "learning_rate": 1.986148730209163e-05, + "loss": 2.6675, + "step": 35537 + }, + { + "epoch": 2.2060959711962256, + "grad_norm": 0.21383719037943857, + "learning_rate": 1.9858605692136517e-05, + "loss": 2.6548, + "step": 35538 + }, + { + "epoch": 2.2061580482959835, + "grad_norm": 0.16129323314171973, + "learning_rate": 1.985572423943639e-05, + "loss": 2.7037, + "step": 35539 + }, + { + "epoch": 2.2062201253957414, + "grad_norm": 0.16088680214150014, + "learning_rate": 1.9852842944006252e-05, + "loss": 2.7485, + "step": 35540 + }, + { + "epoch": 2.2062822024954993, + "grad_norm": 0.15598535776120084, + "learning_rate": 1.9849961805861167e-05, + "loss": 2.7097, + "step": 35541 + }, + { + "epoch": 2.2063442795952573, + "grad_norm": 0.1731335177632918, + "learning_rate": 1.9847080825016146e-05, + "loss": 2.7282, + "step": 35542 + }, + { + "epoch": 2.206406356695015, + "grad_norm": 0.15874726119457327, + "learning_rate": 1.984420000148623e-05, + "loss": 2.6951, + "step": 35543 + }, + { + "epoch": 2.206468433794773, + "grad_norm": 0.14888378238997613, + "learning_rate": 1.9841319335286423e-05, + "loss": 2.6743, + "step": 35544 + }, + { + "epoch": 2.206530510894531, + "grad_norm": 0.153865034858732, + "learning_rate": 1.983843882643179e-05, + "loss": 2.7439, + "step": 35545 + }, + { + "epoch": 2.206592587994289, + "grad_norm": 0.15473535861827778, + "learning_rate": 1.983555847493735e-05, + "loss": 2.7683, + "step": 35546 + }, + { + "epoch": 2.206654665094047, + "grad_norm": 0.14602484148920133, + "learning_rate": 1.9832678280818124e-05, + "loss": 2.8175, + "step": 35547 + }, + { + "epoch": 2.2067167421938048, + "grad_norm": 0.14588040134421204, + "learning_rate": 1.9829798244089114e-05, + "loss": 2.7269, + "step": 35548 + }, + { + "epoch": 2.2067788192935627, + "grad_norm": 0.13465751928296504, + "learning_rate": 1.9826918364765395e-05, + "loss": 2.7039, + "step": 35549 + }, + { + "epoch": 2.2068408963933206, + "grad_norm": 0.13805224777888686, + "learning_rate": 1.9824038642861963e-05, + "loss": 2.6251, + "step": 35550 + }, + { + "epoch": 2.2069029734930785, + "grad_norm": 0.15696546543078824, + "learning_rate": 1.982115907839385e-05, + "loss": 2.6986, + "step": 35551 + }, + { + "epoch": 2.2069650505928364, + "grad_norm": 0.14461246348070714, + "learning_rate": 1.9818279671376073e-05, + "loss": 2.8065, + "step": 35552 + }, + { + "epoch": 2.2070271276925943, + "grad_norm": 0.14221038817540763, + "learning_rate": 1.9815400421823644e-05, + "loss": 2.6908, + "step": 35553 + }, + { + "epoch": 2.2070892047923523, + "grad_norm": 0.14356181717379332, + "learning_rate": 1.9812521329751615e-05, + "loss": 2.7133, + "step": 35554 + }, + { + "epoch": 2.20715128189211, + "grad_norm": 0.14507446789792663, + "learning_rate": 1.9809642395174987e-05, + "loss": 2.7457, + "step": 35555 + }, + { + "epoch": 2.207213358991868, + "grad_norm": 0.1485529688901992, + "learning_rate": 1.980676361810879e-05, + "loss": 2.8023, + "step": 35556 + }, + { + "epoch": 2.207275436091626, + "grad_norm": 0.14086133912166934, + "learning_rate": 1.980388499856801e-05, + "loss": 2.6462, + "step": 35557 + }, + { + "epoch": 2.2073375131913835, + "grad_norm": 0.15488031392529442, + "learning_rate": 1.9801006536567717e-05, + "loss": 2.7701, + "step": 35558 + }, + { + "epoch": 2.2073995902911414, + "grad_norm": 0.14387880442583123, + "learning_rate": 1.9798128232122904e-05, + "loss": 2.7718, + "step": 35559 + }, + { + "epoch": 2.2074616673908993, + "grad_norm": 0.1381870781556915, + "learning_rate": 1.9795250085248584e-05, + "loss": 2.6499, + "step": 35560 + }, + { + "epoch": 2.2075237444906572, + "grad_norm": 0.14049695728908548, + "learning_rate": 1.9792372095959784e-05, + "loss": 2.7589, + "step": 35561 + }, + { + "epoch": 2.207585821590415, + "grad_norm": 0.15106069236718672, + "learning_rate": 1.9789494264271485e-05, + "loss": 2.7476, + "step": 35562 + }, + { + "epoch": 2.207647898690173, + "grad_norm": 0.1383114213410147, + "learning_rate": 1.9786616590198753e-05, + "loss": 2.7449, + "step": 35563 + }, + { + "epoch": 2.207709975789931, + "grad_norm": 0.1518721678650291, + "learning_rate": 1.9783739073756575e-05, + "loss": 2.6516, + "step": 35564 + }, + { + "epoch": 2.207772052889689, + "grad_norm": 0.1374875862304954, + "learning_rate": 1.9780861714959964e-05, + "loss": 2.7212, + "step": 35565 + }, + { + "epoch": 2.207834129989447, + "grad_norm": 0.13945549527459286, + "learning_rate": 1.9777984513823917e-05, + "loss": 2.7554, + "step": 35566 + }, + { + "epoch": 2.2078962070892048, + "grad_norm": 0.1454789817973133, + "learning_rate": 1.9775107470363473e-05, + "loss": 2.6843, + "step": 35567 + }, + { + "epoch": 2.2079582841889627, + "grad_norm": 0.1491010728647629, + "learning_rate": 1.9772230584593638e-05, + "loss": 2.7047, + "step": 35568 + }, + { + "epoch": 2.2080203612887206, + "grad_norm": 0.14263461574616618, + "learning_rate": 1.9769353856529415e-05, + "loss": 2.6202, + "step": 35569 + }, + { + "epoch": 2.2080824383884785, + "grad_norm": 0.1380210162707382, + "learning_rate": 1.976647728618579e-05, + "loss": 2.6187, + "step": 35570 + }, + { + "epoch": 2.2081445154882364, + "grad_norm": 0.1516704044944724, + "learning_rate": 1.9763600873577793e-05, + "loss": 2.6875, + "step": 35571 + }, + { + "epoch": 2.2082065925879943, + "grad_norm": 0.1490375099687158, + "learning_rate": 1.976072461872045e-05, + "loss": 2.6648, + "step": 35572 + }, + { + "epoch": 2.2082686696877523, + "grad_norm": 0.14686734681785632, + "learning_rate": 1.9757848521628748e-05, + "loss": 2.767, + "step": 35573 + }, + { + "epoch": 2.20833074678751, + "grad_norm": 0.13857903529285517, + "learning_rate": 1.975497258231769e-05, + "loss": 2.673, + "step": 35574 + }, + { + "epoch": 2.208392823887268, + "grad_norm": 0.13947500116064848, + "learning_rate": 1.9752096800802278e-05, + "loss": 2.7388, + "step": 35575 + }, + { + "epoch": 2.208454900987026, + "grad_norm": 0.14280222564692216, + "learning_rate": 1.9749221177097504e-05, + "loss": 2.6385, + "step": 35576 + }, + { + "epoch": 2.208516978086784, + "grad_norm": 0.14984685883663254, + "learning_rate": 1.974634571121841e-05, + "loss": 2.7802, + "step": 35577 + }, + { + "epoch": 2.208579055186542, + "grad_norm": 0.14334759079108145, + "learning_rate": 1.974347040317997e-05, + "loss": 2.7697, + "step": 35578 + }, + { + "epoch": 2.2086411322862998, + "grad_norm": 0.1516562009984646, + "learning_rate": 1.974059525299719e-05, + "loss": 2.7743, + "step": 35579 + }, + { + "epoch": 2.2087032093860577, + "grad_norm": 0.13649376340695105, + "learning_rate": 1.9737720260685056e-05, + "loss": 2.6572, + "step": 35580 + }, + { + "epoch": 2.208765286485815, + "grad_norm": 0.14756496499000968, + "learning_rate": 1.9734845426258596e-05, + "loss": 2.7035, + "step": 35581 + }, + { + "epoch": 2.208827363585573, + "grad_norm": 0.14448558885120757, + "learning_rate": 1.9731970749732797e-05, + "loss": 2.725, + "step": 35582 + }, + { + "epoch": 2.208889440685331, + "grad_norm": 0.1372971018490535, + "learning_rate": 1.9729096231122656e-05, + "loss": 2.7757, + "step": 35583 + }, + { + "epoch": 2.208951517785089, + "grad_norm": 0.16647099144430755, + "learning_rate": 1.9726221870443144e-05, + "loss": 2.6863, + "step": 35584 + }, + { + "epoch": 2.209013594884847, + "grad_norm": 0.15229621169952728, + "learning_rate": 1.9723347667709306e-05, + "loss": 2.7675, + "step": 35585 + }, + { + "epoch": 2.2090756719846047, + "grad_norm": 0.14314729272274007, + "learning_rate": 1.9720473622936115e-05, + "loss": 2.7135, + "step": 35586 + }, + { + "epoch": 2.2091377490843627, + "grad_norm": 0.13723124090266686, + "learning_rate": 1.971759973613856e-05, + "loss": 2.7044, + "step": 35587 + }, + { + "epoch": 2.2091998261841206, + "grad_norm": 0.13667758594433416, + "learning_rate": 1.9714726007331637e-05, + "loss": 2.7872, + "step": 35588 + }, + { + "epoch": 2.2092619032838785, + "grad_norm": 0.14250371898850986, + "learning_rate": 1.971185243653032e-05, + "loss": 2.676, + "step": 35589 + }, + { + "epoch": 2.2093239803836364, + "grad_norm": 0.1610963895842326, + "learning_rate": 1.9708979023749645e-05, + "loss": 2.7337, + "step": 35590 + }, + { + "epoch": 2.2093860574833943, + "grad_norm": 0.16114535427412727, + "learning_rate": 1.970610576900458e-05, + "loss": 2.7249, + "step": 35591 + }, + { + "epoch": 2.2094481345831523, + "grad_norm": 0.16028799476682531, + "learning_rate": 1.9703232672310114e-05, + "loss": 2.6722, + "step": 35592 + }, + { + "epoch": 2.20951021168291, + "grad_norm": 0.1619710274447362, + "learning_rate": 1.9700359733681216e-05, + "loss": 2.6297, + "step": 35593 + }, + { + "epoch": 2.209572288782668, + "grad_norm": 0.14761511845386593, + "learning_rate": 1.9697486953132922e-05, + "loss": 2.68, + "step": 35594 + }, + { + "epoch": 2.209634365882426, + "grad_norm": 0.14612370913125458, + "learning_rate": 1.969461433068019e-05, + "loss": 2.7987, + "step": 35595 + }, + { + "epoch": 2.209696442982184, + "grad_norm": 0.14834588762418283, + "learning_rate": 1.969174186633801e-05, + "loss": 2.7582, + "step": 35596 + }, + { + "epoch": 2.209758520081942, + "grad_norm": 0.1477728317857389, + "learning_rate": 1.9688869560121376e-05, + "loss": 2.7997, + "step": 35597 + }, + { + "epoch": 2.2098205971816998, + "grad_norm": 0.14879953630457496, + "learning_rate": 1.968599741204524e-05, + "loss": 2.7105, + "step": 35598 + }, + { + "epoch": 2.2098826742814577, + "grad_norm": 0.16156687792374858, + "learning_rate": 1.9683125422124642e-05, + "loss": 2.7161, + "step": 35599 + }, + { + "epoch": 2.2099447513812156, + "grad_norm": 0.15455627366083963, + "learning_rate": 1.9680253590374537e-05, + "loss": 2.6659, + "step": 35600 + }, + { + "epoch": 2.2100068284809735, + "grad_norm": 0.15529271372017436, + "learning_rate": 1.9677381916809912e-05, + "loss": 2.7128, + "step": 35601 + }, + { + "epoch": 2.2100689055807314, + "grad_norm": 0.15200116077367304, + "learning_rate": 1.9674510401445722e-05, + "loss": 2.6859, + "step": 35602 + }, + { + "epoch": 2.2101309826804894, + "grad_norm": 0.14508915523900465, + "learning_rate": 1.967163904429699e-05, + "loss": 2.7973, + "step": 35603 + }, + { + "epoch": 2.2101930597802473, + "grad_norm": 0.15447896160511684, + "learning_rate": 1.9668767845378672e-05, + "loss": 2.7645, + "step": 35604 + }, + { + "epoch": 2.210255136880005, + "grad_norm": 0.17356710982356627, + "learning_rate": 1.9665896804705762e-05, + "loss": 2.7931, + "step": 35605 + }, + { + "epoch": 2.2103172139797627, + "grad_norm": 0.14360666307244746, + "learning_rate": 1.9663025922293237e-05, + "loss": 2.7283, + "step": 35606 + }, + { + "epoch": 2.2103792910795206, + "grad_norm": 0.16133484168306503, + "learning_rate": 1.9660155198156055e-05, + "loss": 2.7037, + "step": 35607 + }, + { + "epoch": 2.2104413681792785, + "grad_norm": 0.13773181783084185, + "learning_rate": 1.965728463230922e-05, + "loss": 2.7033, + "step": 35608 + }, + { + "epoch": 2.2105034452790364, + "grad_norm": 0.13172430327667956, + "learning_rate": 1.9654414224767702e-05, + "loss": 2.6263, + "step": 35609 + }, + { + "epoch": 2.2105655223787943, + "grad_norm": 0.1690933648261733, + "learning_rate": 1.965154397554647e-05, + "loss": 2.7394, + "step": 35610 + }, + { + "epoch": 2.2106275994785523, + "grad_norm": 0.14848380245028908, + "learning_rate": 1.9648673884660502e-05, + "loss": 2.7391, + "step": 35611 + }, + { + "epoch": 2.21068967657831, + "grad_norm": 0.13997042821528333, + "learning_rate": 1.9645803952124754e-05, + "loss": 2.8038, + "step": 35612 + }, + { + "epoch": 2.210751753678068, + "grad_norm": 0.15716006371343655, + "learning_rate": 1.9642934177954227e-05, + "loss": 2.7034, + "step": 35613 + }, + { + "epoch": 2.210813830777826, + "grad_norm": 0.1481906570112763, + "learning_rate": 1.9640064562163885e-05, + "loss": 2.7294, + "step": 35614 + }, + { + "epoch": 2.210875907877584, + "grad_norm": 0.1523646315911021, + "learning_rate": 1.9637195104768697e-05, + "loss": 2.7802, + "step": 35615 + }, + { + "epoch": 2.210937984977342, + "grad_norm": 0.18295950603181674, + "learning_rate": 1.9634325805783616e-05, + "loss": 2.6538, + "step": 35616 + }, + { + "epoch": 2.2110000620770998, + "grad_norm": 0.15761198463030454, + "learning_rate": 1.9631456665223647e-05, + "loss": 2.7566, + "step": 35617 + }, + { + "epoch": 2.2110621391768577, + "grad_norm": 0.1844038439672737, + "learning_rate": 1.962858768310374e-05, + "loss": 2.693, + "step": 35618 + }, + { + "epoch": 2.2111242162766156, + "grad_norm": 0.14290849739708303, + "learning_rate": 1.9625718859438863e-05, + "loss": 2.7355, + "step": 35619 + }, + { + "epoch": 2.2111862933763735, + "grad_norm": 0.16480090095128497, + "learning_rate": 1.9622850194243965e-05, + "loss": 2.754, + "step": 35620 + }, + { + "epoch": 2.2112483704761314, + "grad_norm": 0.17054704234507762, + "learning_rate": 1.961998168753405e-05, + "loss": 2.75, + "step": 35621 + }, + { + "epoch": 2.2113104475758893, + "grad_norm": 0.15194050900764436, + "learning_rate": 1.9617113339324066e-05, + "loss": 2.7951, + "step": 35622 + }, + { + "epoch": 2.2113725246756473, + "grad_norm": 0.16621424069915694, + "learning_rate": 1.961424514962898e-05, + "loss": 2.7294, + "step": 35623 + }, + { + "epoch": 2.211434601775405, + "grad_norm": 0.1453973619764503, + "learning_rate": 1.961137711846375e-05, + "loss": 2.7142, + "step": 35624 + }, + { + "epoch": 2.211496678875163, + "grad_norm": 0.1402235061450148, + "learning_rate": 1.9608509245843332e-05, + "loss": 2.7716, + "step": 35625 + }, + { + "epoch": 2.211558755974921, + "grad_norm": 0.14141023140249828, + "learning_rate": 1.960564153178271e-05, + "loss": 2.7891, + "step": 35626 + }, + { + "epoch": 2.211620833074679, + "grad_norm": 0.14123458004858622, + "learning_rate": 1.9602773976296834e-05, + "loss": 2.6844, + "step": 35627 + }, + { + "epoch": 2.211682910174437, + "grad_norm": 0.14371755016030938, + "learning_rate": 1.9599906579400668e-05, + "loss": 2.7064, + "step": 35628 + }, + { + "epoch": 2.2117449872741943, + "grad_norm": 0.14094966920473512, + "learning_rate": 1.9597039341109152e-05, + "loss": 2.6781, + "step": 35629 + }, + { + "epoch": 2.2118070643739522, + "grad_norm": 0.14603554310682992, + "learning_rate": 1.9594172261437278e-05, + "loss": 2.789, + "step": 35630 + }, + { + "epoch": 2.21186914147371, + "grad_norm": 0.1413385316023337, + "learning_rate": 1.959130534039999e-05, + "loss": 2.6751, + "step": 35631 + }, + { + "epoch": 2.211931218573468, + "grad_norm": 0.14495651083138908, + "learning_rate": 1.958843857801225e-05, + "loss": 2.7476, + "step": 35632 + }, + { + "epoch": 2.211993295673226, + "grad_norm": 0.14119497660107946, + "learning_rate": 1.958557197428898e-05, + "loss": 2.7562, + "step": 35633 + }, + { + "epoch": 2.212055372772984, + "grad_norm": 0.13994719818758009, + "learning_rate": 1.9582705529245188e-05, + "loss": 2.7798, + "step": 35634 + }, + { + "epoch": 2.212117449872742, + "grad_norm": 0.13804461766088308, + "learning_rate": 1.9579839242895803e-05, + "loss": 2.7264, + "step": 35635 + }, + { + "epoch": 2.2121795269724998, + "grad_norm": 0.136835314242604, + "learning_rate": 1.9576973115255765e-05, + "loss": 2.6836, + "step": 35636 + }, + { + "epoch": 2.2122416040722577, + "grad_norm": 0.13702658329958775, + "learning_rate": 1.9574107146340058e-05, + "loss": 2.7236, + "step": 35637 + }, + { + "epoch": 2.2123036811720156, + "grad_norm": 0.14602372536353556, + "learning_rate": 1.9571241336163625e-05, + "loss": 2.7197, + "step": 35638 + }, + { + "epoch": 2.2123657582717735, + "grad_norm": 0.15247236837920342, + "learning_rate": 1.9568375684741393e-05, + "loss": 2.7942, + "step": 35639 + }, + { + "epoch": 2.2124278353715314, + "grad_norm": 0.1352739766787899, + "learning_rate": 1.9565510192088348e-05, + "loss": 2.7456, + "step": 35640 + }, + { + "epoch": 2.2124899124712893, + "grad_norm": 0.14210548839109147, + "learning_rate": 1.9562644858219432e-05, + "loss": 2.7652, + "step": 35641 + }, + { + "epoch": 2.2125519895710473, + "grad_norm": 0.18101129927588955, + "learning_rate": 1.9559779683149583e-05, + "loss": 2.6494, + "step": 35642 + }, + { + "epoch": 2.212614066670805, + "grad_norm": 0.1664306445889086, + "learning_rate": 1.9556914666893733e-05, + "loss": 2.7551, + "step": 35643 + }, + { + "epoch": 2.212676143770563, + "grad_norm": 0.14079455908675234, + "learning_rate": 1.9554049809466867e-05, + "loss": 2.7785, + "step": 35644 + }, + { + "epoch": 2.212738220870321, + "grad_norm": 0.14245341346730764, + "learning_rate": 1.9551185110883913e-05, + "loss": 2.6661, + "step": 35645 + }, + { + "epoch": 2.212800297970079, + "grad_norm": 0.16138597310261105, + "learning_rate": 1.9548320571159824e-05, + "loss": 2.7897, + "step": 35646 + }, + { + "epoch": 2.212862375069837, + "grad_norm": 0.14336573563271365, + "learning_rate": 1.9545456190309537e-05, + "loss": 2.7359, + "step": 35647 + }, + { + "epoch": 2.2129244521695948, + "grad_norm": 0.13910676701661262, + "learning_rate": 1.9542591968347978e-05, + "loss": 2.7617, + "step": 35648 + }, + { + "epoch": 2.2129865292693527, + "grad_norm": 0.14876879563311873, + "learning_rate": 1.9539727905290128e-05, + "loss": 2.7576, + "step": 35649 + }, + { + "epoch": 2.2130486063691106, + "grad_norm": 0.16542867797579563, + "learning_rate": 1.9536864001150912e-05, + "loss": 2.7713, + "step": 35650 + }, + { + "epoch": 2.2131106834688685, + "grad_norm": 0.13969151959407053, + "learning_rate": 1.9534000255945272e-05, + "loss": 2.6933, + "step": 35651 + }, + { + "epoch": 2.2131727605686264, + "grad_norm": 0.14145500227873786, + "learning_rate": 1.9531136669688133e-05, + "loss": 2.7752, + "step": 35652 + }, + { + "epoch": 2.2132348376683844, + "grad_norm": 0.15027270710318677, + "learning_rate": 1.9528273242394467e-05, + "loss": 2.7056, + "step": 35653 + }, + { + "epoch": 2.213296914768142, + "grad_norm": 0.15613511153249487, + "learning_rate": 1.9525409974079194e-05, + "loss": 2.7467, + "step": 35654 + }, + { + "epoch": 2.2133589918678997, + "grad_norm": 0.15618574490196468, + "learning_rate": 1.952254686475726e-05, + "loss": 2.7169, + "step": 35655 + }, + { + "epoch": 2.2134210689676577, + "grad_norm": 0.15070634992717435, + "learning_rate": 1.9519683914443582e-05, + "loss": 2.7568, + "step": 35656 + }, + { + "epoch": 2.2134831460674156, + "grad_norm": 0.1628790016509791, + "learning_rate": 1.9516821123153122e-05, + "loss": 2.7029, + "step": 35657 + }, + { + "epoch": 2.2135452231671735, + "grad_norm": 0.1368743351806973, + "learning_rate": 1.9513958490900813e-05, + "loss": 2.6335, + "step": 35658 + }, + { + "epoch": 2.2136073002669314, + "grad_norm": 0.15584281078076256, + "learning_rate": 1.951109601770158e-05, + "loss": 2.7085, + "step": 35659 + }, + { + "epoch": 2.2136693773666893, + "grad_norm": 0.1471976968448538, + "learning_rate": 1.950823370357036e-05, + "loss": 2.6437, + "step": 35660 + }, + { + "epoch": 2.2137314544664473, + "grad_norm": 0.17043660567985885, + "learning_rate": 1.9505371548522072e-05, + "loss": 2.6866, + "step": 35661 + }, + { + "epoch": 2.213793531566205, + "grad_norm": 0.1397590066100806, + "learning_rate": 1.950250955257168e-05, + "loss": 2.7171, + "step": 35662 + }, + { + "epoch": 2.213855608665963, + "grad_norm": 0.14876575940461323, + "learning_rate": 1.94996477157341e-05, + "loss": 2.7226, + "step": 35663 + }, + { + "epoch": 2.213917685765721, + "grad_norm": 0.14424554836223705, + "learning_rate": 1.9496786038024255e-05, + "loss": 2.669, + "step": 35664 + }, + { + "epoch": 2.213979762865479, + "grad_norm": 0.1473059945464534, + "learning_rate": 1.9493924519457075e-05, + "loss": 2.8077, + "step": 35665 + }, + { + "epoch": 2.214041839965237, + "grad_norm": 0.16547794786560147, + "learning_rate": 1.9491063160047508e-05, + "loss": 2.6397, + "step": 35666 + }, + { + "epoch": 2.2141039170649948, + "grad_norm": 0.1672388452493622, + "learning_rate": 1.948820195981047e-05, + "loss": 2.6906, + "step": 35667 + }, + { + "epoch": 2.2141659941647527, + "grad_norm": 0.1418478107131673, + "learning_rate": 1.948534091876089e-05, + "loss": 2.7472, + "step": 35668 + }, + { + "epoch": 2.2142280712645106, + "grad_norm": 0.153476423009594, + "learning_rate": 1.948248003691368e-05, + "loss": 2.7719, + "step": 35669 + }, + { + "epoch": 2.2142901483642685, + "grad_norm": 0.13870431628010557, + "learning_rate": 1.9479619314283777e-05, + "loss": 2.6661, + "step": 35670 + }, + { + "epoch": 2.2143522254640264, + "grad_norm": 0.1613541556687307, + "learning_rate": 1.9476758750886127e-05, + "loss": 2.748, + "step": 35671 + }, + { + "epoch": 2.2144143025637844, + "grad_norm": 0.13847046315436182, + "learning_rate": 1.9473898346735635e-05, + "loss": 2.6666, + "step": 35672 + }, + { + "epoch": 2.2144763796635423, + "grad_norm": 0.14069237684408695, + "learning_rate": 1.9471038101847232e-05, + "loss": 2.7034, + "step": 35673 + }, + { + "epoch": 2.2145384567633, + "grad_norm": 0.1517167822654243, + "learning_rate": 1.9468178016235827e-05, + "loss": 2.7543, + "step": 35674 + }, + { + "epoch": 2.214600533863058, + "grad_norm": 0.13925547765029087, + "learning_rate": 1.9465318089916336e-05, + "loss": 2.6719, + "step": 35675 + }, + { + "epoch": 2.214662610962816, + "grad_norm": 0.16871869869800177, + "learning_rate": 1.946245832290371e-05, + "loss": 2.7251, + "step": 35676 + }, + { + "epoch": 2.2147246880625735, + "grad_norm": 0.16486894428571874, + "learning_rate": 1.9459598715212852e-05, + "loss": 2.7485, + "step": 35677 + }, + { + "epoch": 2.2147867651623314, + "grad_norm": 0.14602181198540337, + "learning_rate": 1.945673926685868e-05, + "loss": 2.7355, + "step": 35678 + }, + { + "epoch": 2.2148488422620893, + "grad_norm": 0.15359441589218883, + "learning_rate": 1.9453879977856098e-05, + "loss": 2.7912, + "step": 35679 + }, + { + "epoch": 2.2149109193618473, + "grad_norm": 0.1611656097788298, + "learning_rate": 1.945102084822006e-05, + "loss": 2.7504, + "step": 35680 + }, + { + "epoch": 2.214972996461605, + "grad_norm": 0.1463803748651451, + "learning_rate": 1.9448161877965453e-05, + "loss": 2.6686, + "step": 35681 + }, + { + "epoch": 2.215035073561363, + "grad_norm": 0.14741394734255567, + "learning_rate": 1.944530306710721e-05, + "loss": 2.6397, + "step": 35682 + }, + { + "epoch": 2.215097150661121, + "grad_norm": 0.14336252650816886, + "learning_rate": 1.9442444415660215e-05, + "loss": 2.7708, + "step": 35683 + }, + { + "epoch": 2.215159227760879, + "grad_norm": 0.14076213640236004, + "learning_rate": 1.9439585923639432e-05, + "loss": 2.6884, + "step": 35684 + }, + { + "epoch": 2.215221304860637, + "grad_norm": 0.1392338340199958, + "learning_rate": 1.943672759105974e-05, + "loss": 2.7238, + "step": 35685 + }, + { + "epoch": 2.2152833819603948, + "grad_norm": 0.15132169347476881, + "learning_rate": 1.9433869417936064e-05, + "loss": 2.6351, + "step": 35686 + }, + { + "epoch": 2.2153454590601527, + "grad_norm": 0.1421444974574957, + "learning_rate": 1.943101140428331e-05, + "loss": 2.7416, + "step": 35687 + }, + { + "epoch": 2.2154075361599106, + "grad_norm": 0.20831212988876943, + "learning_rate": 1.9428153550116372e-05, + "loss": 2.7189, + "step": 35688 + }, + { + "epoch": 2.2154696132596685, + "grad_norm": 0.1542711477922544, + "learning_rate": 1.94252958554502e-05, + "loss": 2.7282, + "step": 35689 + }, + { + "epoch": 2.2155316903594264, + "grad_norm": 0.1387702230480542, + "learning_rate": 1.9422438320299674e-05, + "loss": 2.7337, + "step": 35690 + }, + { + "epoch": 2.2155937674591843, + "grad_norm": 0.14800585553427703, + "learning_rate": 1.9419580944679717e-05, + "loss": 2.7329, + "step": 35691 + }, + { + "epoch": 2.2156558445589423, + "grad_norm": 0.15633259021000176, + "learning_rate": 1.9416723728605208e-05, + "loss": 2.7482, + "step": 35692 + }, + { + "epoch": 2.2157179216587, + "grad_norm": 0.1450856889551392, + "learning_rate": 1.9413866672091097e-05, + "loss": 2.7642, + "step": 35693 + }, + { + "epoch": 2.215779998758458, + "grad_norm": 0.14331549874739746, + "learning_rate": 1.9411009775152267e-05, + "loss": 2.6744, + "step": 35694 + }, + { + "epoch": 2.215842075858216, + "grad_norm": 0.17085499892628847, + "learning_rate": 1.9408153037803622e-05, + "loss": 2.8173, + "step": 35695 + }, + { + "epoch": 2.215904152957974, + "grad_norm": 0.16821730875045915, + "learning_rate": 1.9405296460060075e-05, + "loss": 2.7469, + "step": 35696 + }, + { + "epoch": 2.215966230057732, + "grad_norm": 0.14431577807553164, + "learning_rate": 1.9402440041936498e-05, + "loss": 2.6917, + "step": 35697 + }, + { + "epoch": 2.2160283071574898, + "grad_norm": 0.1700535242322443, + "learning_rate": 1.9399583783447844e-05, + "loss": 2.7474, + "step": 35698 + }, + { + "epoch": 2.2160903842572477, + "grad_norm": 0.14836768340499493, + "learning_rate": 1.939672768460899e-05, + "loss": 2.6853, + "step": 35699 + }, + { + "epoch": 2.216152461357005, + "grad_norm": 0.16502240339541288, + "learning_rate": 1.9393871745434833e-05, + "loss": 2.6498, + "step": 35700 + }, + { + "epoch": 2.2162145384567635, + "grad_norm": 0.1458542622863913, + "learning_rate": 1.9391015965940258e-05, + "loss": 2.6842, + "step": 35701 + }, + { + "epoch": 2.216276615556521, + "grad_norm": 0.14670562565596365, + "learning_rate": 1.938816034614019e-05, + "loss": 2.6825, + "step": 35702 + }, + { + "epoch": 2.216338692656279, + "grad_norm": 0.16181476663554203, + "learning_rate": 1.938530488604953e-05, + "loss": 2.7058, + "step": 35703 + }, + { + "epoch": 2.216400769756037, + "grad_norm": 0.14620929320641685, + "learning_rate": 1.938244958568317e-05, + "loss": 2.7513, + "step": 35704 + }, + { + "epoch": 2.2164628468557948, + "grad_norm": 0.187540487050814, + "learning_rate": 1.9379594445056003e-05, + "loss": 2.7242, + "step": 35705 + }, + { + "epoch": 2.2165249239555527, + "grad_norm": 0.13939917266032706, + "learning_rate": 1.937673946418291e-05, + "loss": 2.727, + "step": 35706 + }, + { + "epoch": 2.2165870010553106, + "grad_norm": 0.1363512139866442, + "learning_rate": 1.9373884643078815e-05, + "loss": 2.7168, + "step": 35707 + }, + { + "epoch": 2.2166490781550685, + "grad_norm": 0.14129804770891974, + "learning_rate": 1.9371029981758603e-05, + "loss": 2.5817, + "step": 35708 + }, + { + "epoch": 2.2167111552548264, + "grad_norm": 0.14641229127492708, + "learning_rate": 1.936817548023716e-05, + "loss": 2.6181, + "step": 35709 + }, + { + "epoch": 2.2167732323545843, + "grad_norm": 0.143465468520167, + "learning_rate": 1.936532113852938e-05, + "loss": 2.69, + "step": 35710 + }, + { + "epoch": 2.2168353094543423, + "grad_norm": 0.1426080907398325, + "learning_rate": 1.9362466956650144e-05, + "loss": 2.7344, + "step": 35711 + }, + { + "epoch": 2.2168973865541, + "grad_norm": 0.14040697601193486, + "learning_rate": 1.9359612934614368e-05, + "loss": 2.7691, + "step": 35712 + }, + { + "epoch": 2.216959463653858, + "grad_norm": 0.15489475690043933, + "learning_rate": 1.9356759072436935e-05, + "loss": 2.7346, + "step": 35713 + }, + { + "epoch": 2.217021540753616, + "grad_norm": 0.1369748985752107, + "learning_rate": 1.9353905370132725e-05, + "loss": 2.7337, + "step": 35714 + }, + { + "epoch": 2.217083617853374, + "grad_norm": 0.13681298142039577, + "learning_rate": 1.9351051827716616e-05, + "loss": 2.7281, + "step": 35715 + }, + { + "epoch": 2.217145694953132, + "grad_norm": 0.15209995226874973, + "learning_rate": 1.934819844520352e-05, + "loss": 2.6409, + "step": 35716 + }, + { + "epoch": 2.2172077720528898, + "grad_norm": 0.14867267675579499, + "learning_rate": 1.9345345222608318e-05, + "loss": 2.7208, + "step": 35717 + }, + { + "epoch": 2.2172698491526477, + "grad_norm": 0.18688896074423914, + "learning_rate": 1.9342492159945897e-05, + "loss": 2.6912, + "step": 35718 + }, + { + "epoch": 2.2173319262524056, + "grad_norm": 0.13758550147579426, + "learning_rate": 1.9339639257231113e-05, + "loss": 2.5668, + "step": 35719 + }, + { + "epoch": 2.2173940033521635, + "grad_norm": 0.14025295235463922, + "learning_rate": 1.9336786514478887e-05, + "loss": 2.7398, + "step": 35720 + }, + { + "epoch": 2.2174560804519214, + "grad_norm": 0.13791295024838437, + "learning_rate": 1.93339339317041e-05, + "loss": 2.7247, + "step": 35721 + }, + { + "epoch": 2.2175181575516794, + "grad_norm": 0.16738386676861372, + "learning_rate": 1.9331081508921616e-05, + "loss": 2.7514, + "step": 35722 + }, + { + "epoch": 2.2175802346514373, + "grad_norm": 0.15290568712724578, + "learning_rate": 1.932822924614633e-05, + "loss": 2.74, + "step": 35723 + }, + { + "epoch": 2.217642311751195, + "grad_norm": 0.16003413895083446, + "learning_rate": 1.9325377143393096e-05, + "loss": 2.6799, + "step": 35724 + }, + { + "epoch": 2.2177043888509527, + "grad_norm": 0.1490859811610159, + "learning_rate": 1.932252520067684e-05, + "loss": 2.8302, + "step": 35725 + }, + { + "epoch": 2.2177664659507106, + "grad_norm": 0.14299844162515327, + "learning_rate": 1.9319673418012413e-05, + "loss": 2.6783, + "step": 35726 + }, + { + "epoch": 2.2178285430504685, + "grad_norm": 0.14373045171630527, + "learning_rate": 1.9316821795414697e-05, + "loss": 2.697, + "step": 35727 + }, + { + "epoch": 2.2178906201502264, + "grad_norm": 0.13763283604038504, + "learning_rate": 1.9313970332898558e-05, + "loss": 2.6416, + "step": 35728 + }, + { + "epoch": 2.2179526972499843, + "grad_norm": 0.1414907087301748, + "learning_rate": 1.9311119030478896e-05, + "loss": 2.7052, + "step": 35729 + }, + { + "epoch": 2.2180147743497423, + "grad_norm": 0.1680784310932711, + "learning_rate": 1.930826788817058e-05, + "loss": 2.7485, + "step": 35730 + }, + { + "epoch": 2.2180768514495, + "grad_norm": 0.1375336340385781, + "learning_rate": 1.9305416905988477e-05, + "loss": 2.7328, + "step": 35731 + }, + { + "epoch": 2.218138928549258, + "grad_norm": 0.1605786939400986, + "learning_rate": 1.930256608394747e-05, + "loss": 2.751, + "step": 35732 + }, + { + "epoch": 2.218201005649016, + "grad_norm": 0.15168444088411973, + "learning_rate": 1.9299715422062408e-05, + "loss": 2.8359, + "step": 35733 + }, + { + "epoch": 2.218263082748774, + "grad_norm": 0.15787091519582708, + "learning_rate": 1.9296864920348202e-05, + "loss": 2.7593, + "step": 35734 + }, + { + "epoch": 2.218325159848532, + "grad_norm": 0.15694548675156772, + "learning_rate": 1.9294014578819687e-05, + "loss": 2.7382, + "step": 35735 + }, + { + "epoch": 2.2183872369482898, + "grad_norm": 0.1417190327583697, + "learning_rate": 1.9291164397491767e-05, + "loss": 2.7568, + "step": 35736 + }, + { + "epoch": 2.2184493140480477, + "grad_norm": 0.1421523464679102, + "learning_rate": 1.92883143763793e-05, + "loss": 2.6229, + "step": 35737 + }, + { + "epoch": 2.2185113911478056, + "grad_norm": 0.13817570127991133, + "learning_rate": 1.928546451549714e-05, + "loss": 2.6916, + "step": 35738 + }, + { + "epoch": 2.2185734682475635, + "grad_norm": 0.1378982685013083, + "learning_rate": 1.928261481486018e-05, + "loss": 2.7422, + "step": 35739 + }, + { + "epoch": 2.2186355453473214, + "grad_norm": 0.14568345139048133, + "learning_rate": 1.927976527448327e-05, + "loss": 2.7306, + "step": 35740 + }, + { + "epoch": 2.2186976224470794, + "grad_norm": 0.15129436463929946, + "learning_rate": 1.9276915894381285e-05, + "loss": 2.6839, + "step": 35741 + }, + { + "epoch": 2.2187596995468373, + "grad_norm": 0.14493679807796495, + "learning_rate": 1.9274066674569073e-05, + "loss": 2.6581, + "step": 35742 + }, + { + "epoch": 2.218821776646595, + "grad_norm": 0.157153865048192, + "learning_rate": 1.927121761506154e-05, + "loss": 2.6404, + "step": 35743 + }, + { + "epoch": 2.218883853746353, + "grad_norm": 0.15240916657174763, + "learning_rate": 1.9268368715873513e-05, + "loss": 2.7063, + "step": 35744 + }, + { + "epoch": 2.218945930846111, + "grad_norm": 0.1396930929958808, + "learning_rate": 1.926551997701987e-05, + "loss": 2.7467, + "step": 35745 + }, + { + "epoch": 2.219008007945869, + "grad_norm": 0.13560836013113553, + "learning_rate": 1.9262671398515468e-05, + "loss": 2.5354, + "step": 35746 + }, + { + "epoch": 2.219070085045627, + "grad_norm": 0.16527177554227876, + "learning_rate": 1.925982298037516e-05, + "loss": 2.6643, + "step": 35747 + }, + { + "epoch": 2.2191321621453843, + "grad_norm": 0.13787943116483223, + "learning_rate": 1.925697472261383e-05, + "loss": 2.7289, + "step": 35748 + }, + { + "epoch": 2.2191942392451427, + "grad_norm": 0.15601122015636018, + "learning_rate": 1.925412662524633e-05, + "loss": 2.6599, + "step": 35749 + }, + { + "epoch": 2.2192563163449, + "grad_norm": 0.15914350227073304, + "learning_rate": 1.9251278688287504e-05, + "loss": 2.705, + "step": 35750 + }, + { + "epoch": 2.219318393444658, + "grad_norm": 0.14573698749738412, + "learning_rate": 1.9248430911752213e-05, + "loss": 2.6909, + "step": 35751 + }, + { + "epoch": 2.219380470544416, + "grad_norm": 0.13893885500157496, + "learning_rate": 1.9245583295655344e-05, + "loss": 2.7109, + "step": 35752 + }, + { + "epoch": 2.219442547644174, + "grad_norm": 0.14719822831346177, + "learning_rate": 1.924273584001173e-05, + "loss": 2.713, + "step": 35753 + }, + { + "epoch": 2.219504624743932, + "grad_norm": 0.1383475890758974, + "learning_rate": 1.9239888544836226e-05, + "loss": 2.7627, + "step": 35754 + }, + { + "epoch": 2.2195667018436898, + "grad_norm": 0.1578358932662726, + "learning_rate": 1.9237041410143674e-05, + "loss": 2.7009, + "step": 35755 + }, + { + "epoch": 2.2196287789434477, + "grad_norm": 0.1439729293976729, + "learning_rate": 1.9234194435948966e-05, + "loss": 2.6902, + "step": 35756 + }, + { + "epoch": 2.2196908560432056, + "grad_norm": 0.14161168424012927, + "learning_rate": 1.9231347622266932e-05, + "loss": 2.6635, + "step": 35757 + }, + { + "epoch": 2.2197529331429635, + "grad_norm": 0.14094680152784414, + "learning_rate": 1.922850096911243e-05, + "loss": 2.7016, + "step": 35758 + }, + { + "epoch": 2.2198150102427214, + "grad_norm": 0.14195074896117693, + "learning_rate": 1.9225654476500304e-05, + "loss": 2.7605, + "step": 35759 + }, + { + "epoch": 2.2198770873424793, + "grad_norm": 0.1473993147370372, + "learning_rate": 1.9222808144445393e-05, + "loss": 2.7871, + "step": 35760 + }, + { + "epoch": 2.2199391644422373, + "grad_norm": 0.14788995947186812, + "learning_rate": 1.921996197296258e-05, + "loss": 2.7137, + "step": 35761 + }, + { + "epoch": 2.220001241541995, + "grad_norm": 0.1487650081632551, + "learning_rate": 1.92171159620667e-05, + "loss": 2.7497, + "step": 35762 + }, + { + "epoch": 2.220063318641753, + "grad_norm": 0.1352054908666238, + "learning_rate": 1.92142701117726e-05, + "loss": 2.7136, + "step": 35763 + }, + { + "epoch": 2.220125395741511, + "grad_norm": 0.1378369515973106, + "learning_rate": 1.9211424422095108e-05, + "loss": 2.7331, + "step": 35764 + }, + { + "epoch": 2.220187472841269, + "grad_norm": 0.14299746469937843, + "learning_rate": 1.9208578893049107e-05, + "loss": 2.6762, + "step": 35765 + }, + { + "epoch": 2.220249549941027, + "grad_norm": 0.16594263042809188, + "learning_rate": 1.920573352464942e-05, + "loss": 2.7651, + "step": 35766 + }, + { + "epoch": 2.2203116270407848, + "grad_norm": 0.15467992147511506, + "learning_rate": 1.920288831691088e-05, + "loss": 2.7293, + "step": 35767 + }, + { + "epoch": 2.2203737041405427, + "grad_norm": 0.1391189514192288, + "learning_rate": 1.9200043269848373e-05, + "loss": 2.7466, + "step": 35768 + }, + { + "epoch": 2.2204357812403006, + "grad_norm": 0.14153451976717762, + "learning_rate": 1.919719838347669e-05, + "loss": 2.744, + "step": 35769 + }, + { + "epoch": 2.2204978583400585, + "grad_norm": 0.1428024212997418, + "learning_rate": 1.9194353657810725e-05, + "loss": 2.7192, + "step": 35770 + }, + { + "epoch": 2.2205599354398164, + "grad_norm": 0.13688301028149236, + "learning_rate": 1.919150909286529e-05, + "loss": 2.6574, + "step": 35771 + }, + { + "epoch": 2.2206220125395744, + "grad_norm": 0.15966475480604275, + "learning_rate": 1.9188664688655235e-05, + "loss": 2.7662, + "step": 35772 + }, + { + "epoch": 2.220684089639332, + "grad_norm": 0.13544355104190728, + "learning_rate": 1.9185820445195397e-05, + "loss": 2.7712, + "step": 35773 + }, + { + "epoch": 2.2207461667390898, + "grad_norm": 0.13462484995869925, + "learning_rate": 1.9182976362500594e-05, + "loss": 2.635, + "step": 35774 + }, + { + "epoch": 2.2208082438388477, + "grad_norm": 0.14856407766784313, + "learning_rate": 1.9180132440585702e-05, + "loss": 2.7222, + "step": 35775 + }, + { + "epoch": 2.2208703209386056, + "grad_norm": 0.1448175713009647, + "learning_rate": 1.9177288679465545e-05, + "loss": 2.6917, + "step": 35776 + }, + { + "epoch": 2.2209323980383635, + "grad_norm": 0.16440396310281252, + "learning_rate": 1.9174445079154945e-05, + "loss": 2.6839, + "step": 35777 + }, + { + "epoch": 2.2209944751381214, + "grad_norm": 0.14600973161261493, + "learning_rate": 1.9171601639668742e-05, + "loss": 2.6554, + "step": 35778 + }, + { + "epoch": 2.2210565522378793, + "grad_norm": 0.14891964008614278, + "learning_rate": 1.9168758361021787e-05, + "loss": 2.6923, + "step": 35779 + }, + { + "epoch": 2.2211186293376373, + "grad_norm": 0.13393079253466653, + "learning_rate": 1.916591524322891e-05, + "loss": 2.7013, + "step": 35780 + }, + { + "epoch": 2.221180706437395, + "grad_norm": 0.1778991061581655, + "learning_rate": 1.9163072286304935e-05, + "loss": 2.6989, + "step": 35781 + }, + { + "epoch": 2.221242783537153, + "grad_norm": 0.14249314427615362, + "learning_rate": 1.91602294902647e-05, + "loss": 2.7122, + "step": 35782 + }, + { + "epoch": 2.221304860636911, + "grad_norm": 0.14807824529675342, + "learning_rate": 1.9157386855123012e-05, + "loss": 2.6218, + "step": 35783 + }, + { + "epoch": 2.221366937736669, + "grad_norm": 0.15032675241324064, + "learning_rate": 1.9154544380894746e-05, + "loss": 2.7039, + "step": 35784 + }, + { + "epoch": 2.221429014836427, + "grad_norm": 0.14193307283578077, + "learning_rate": 1.9151702067594707e-05, + "loss": 2.8351, + "step": 35785 + }, + { + "epoch": 2.2214910919361848, + "grad_norm": 0.14145934757193046, + "learning_rate": 1.914885991523773e-05, + "loss": 2.7569, + "step": 35786 + }, + { + "epoch": 2.2215531690359427, + "grad_norm": 0.1447155396128667, + "learning_rate": 1.914601792383862e-05, + "loss": 2.7045, + "step": 35787 + }, + { + "epoch": 2.2216152461357006, + "grad_norm": 0.14420248993851456, + "learning_rate": 1.9143176093412245e-05, + "loss": 2.7545, + "step": 35788 + }, + { + "epoch": 2.2216773232354585, + "grad_norm": 0.1406875709565711, + "learning_rate": 1.914033442397341e-05, + "loss": 2.8244, + "step": 35789 + }, + { + "epoch": 2.2217394003352164, + "grad_norm": 0.13989210544792596, + "learning_rate": 1.9137492915536943e-05, + "loss": 2.6819, + "step": 35790 + }, + { + "epoch": 2.2218014774349744, + "grad_norm": 0.14519118736509426, + "learning_rate": 1.913465156811765e-05, + "loss": 2.6697, + "step": 35791 + }, + { + "epoch": 2.2218635545347323, + "grad_norm": 0.13975440852969034, + "learning_rate": 1.9131810381730386e-05, + "loss": 2.7702, + "step": 35792 + }, + { + "epoch": 2.22192563163449, + "grad_norm": 0.15240736391256066, + "learning_rate": 1.9128969356389964e-05, + "loss": 2.6995, + "step": 35793 + }, + { + "epoch": 2.221987708734248, + "grad_norm": 0.14378807014676293, + "learning_rate": 1.91261284921112e-05, + "loss": 2.6655, + "step": 35794 + }, + { + "epoch": 2.222049785834006, + "grad_norm": 0.1589421272380545, + "learning_rate": 1.912328778890892e-05, + "loss": 2.7785, + "step": 35795 + }, + { + "epoch": 2.2221118629337635, + "grad_norm": 0.1607819001747235, + "learning_rate": 1.912044724679793e-05, + "loss": 2.6763, + "step": 35796 + }, + { + "epoch": 2.222173940033522, + "grad_norm": 0.13881747567200461, + "learning_rate": 1.9117606865793075e-05, + "loss": 2.8347, + "step": 35797 + }, + { + "epoch": 2.2222360171332793, + "grad_norm": 0.17099680113414628, + "learning_rate": 1.9114766645909164e-05, + "loss": 2.6986, + "step": 35798 + }, + { + "epoch": 2.2222980942330373, + "grad_norm": 0.1745276520152439, + "learning_rate": 1.9111926587161017e-05, + "loss": 2.7461, + "step": 35799 + }, + { + "epoch": 2.222360171332795, + "grad_norm": 0.15881416267874285, + "learning_rate": 1.9109086689563425e-05, + "loss": 2.6416, + "step": 35800 + }, + { + "epoch": 2.222422248432553, + "grad_norm": 0.16023942485609635, + "learning_rate": 1.910624695313123e-05, + "loss": 2.7525, + "step": 35801 + }, + { + "epoch": 2.222484325532311, + "grad_norm": 0.17196491447149018, + "learning_rate": 1.9103407377879257e-05, + "loss": 2.6909, + "step": 35802 + }, + { + "epoch": 2.222546402632069, + "grad_norm": 0.13955982437891706, + "learning_rate": 1.9100567963822312e-05, + "loss": 2.7824, + "step": 35803 + }, + { + "epoch": 2.222608479731827, + "grad_norm": 0.15480082940976336, + "learning_rate": 1.9097728710975205e-05, + "loss": 2.6896, + "step": 35804 + }, + { + "epoch": 2.2226705568315848, + "grad_norm": 0.15085042295756992, + "learning_rate": 1.909488961935273e-05, + "loss": 2.5876, + "step": 35805 + }, + { + "epoch": 2.2227326339313427, + "grad_norm": 0.15214053685606016, + "learning_rate": 1.9092050688969738e-05, + "loss": 2.6911, + "step": 35806 + }, + { + "epoch": 2.2227947110311006, + "grad_norm": 0.15865132926176065, + "learning_rate": 1.9089211919841016e-05, + "loss": 2.6852, + "step": 35807 + }, + { + "epoch": 2.2228567881308585, + "grad_norm": 0.15640163812116348, + "learning_rate": 1.908637331198138e-05, + "loss": 2.7549, + "step": 35808 + }, + { + "epoch": 2.2229188652306164, + "grad_norm": 0.1583383653232229, + "learning_rate": 1.9083534865405645e-05, + "loss": 2.6506, + "step": 35809 + }, + { + "epoch": 2.2229809423303744, + "grad_norm": 0.15205783178389573, + "learning_rate": 1.9080696580128594e-05, + "loss": 2.7636, + "step": 35810 + }, + { + "epoch": 2.2230430194301323, + "grad_norm": 0.14954833934554448, + "learning_rate": 1.907785845616507e-05, + "loss": 2.617, + "step": 35811 + }, + { + "epoch": 2.22310509652989, + "grad_norm": 0.18783557188400443, + "learning_rate": 1.9075020493529867e-05, + "loss": 2.7676, + "step": 35812 + }, + { + "epoch": 2.223167173629648, + "grad_norm": 0.1599793096385568, + "learning_rate": 1.9072182692237782e-05, + "loss": 2.6991, + "step": 35813 + }, + { + "epoch": 2.223229250729406, + "grad_norm": 0.14556748491525068, + "learning_rate": 1.906934505230362e-05, + "loss": 2.7032, + "step": 35814 + }, + { + "epoch": 2.223291327829164, + "grad_norm": 0.1542023840223625, + "learning_rate": 1.906650757374221e-05, + "loss": 2.6491, + "step": 35815 + }, + { + "epoch": 2.223353404928922, + "grad_norm": 0.1579934543927977, + "learning_rate": 1.9063670256568333e-05, + "loss": 2.7331, + "step": 35816 + }, + { + "epoch": 2.22341548202868, + "grad_norm": 0.1362141630197235, + "learning_rate": 1.9060833100796805e-05, + "loss": 2.7247, + "step": 35817 + }, + { + "epoch": 2.2234775591284377, + "grad_norm": 0.16296328426642584, + "learning_rate": 1.9057996106442418e-05, + "loss": 2.7287, + "step": 35818 + }, + { + "epoch": 2.2235396362281956, + "grad_norm": 0.15186772802980736, + "learning_rate": 1.9055159273519964e-05, + "loss": 2.7079, + "step": 35819 + }, + { + "epoch": 2.2236017133279535, + "grad_norm": 0.1397881642479419, + "learning_rate": 1.9052322602044264e-05, + "loss": 2.7226, + "step": 35820 + }, + { + "epoch": 2.223663790427711, + "grad_norm": 0.1450271540221412, + "learning_rate": 1.9049486092030116e-05, + "loss": 2.6824, + "step": 35821 + }, + { + "epoch": 2.223725867527469, + "grad_norm": 0.15935522475693234, + "learning_rate": 1.9046649743492316e-05, + "loss": 2.8223, + "step": 35822 + }, + { + "epoch": 2.223787944627227, + "grad_norm": 0.14372224824923327, + "learning_rate": 1.9043813556445638e-05, + "loss": 2.6843, + "step": 35823 + }, + { + "epoch": 2.2238500217269848, + "grad_norm": 0.13881272563847286, + "learning_rate": 1.9040977530904913e-05, + "loss": 2.6241, + "step": 35824 + }, + { + "epoch": 2.2239120988267427, + "grad_norm": 0.1853138514464634, + "learning_rate": 1.903814166688493e-05, + "loss": 2.7033, + "step": 35825 + }, + { + "epoch": 2.2239741759265006, + "grad_norm": 0.14423206008584125, + "learning_rate": 1.9035305964400473e-05, + "loss": 2.71, + "step": 35826 + }, + { + "epoch": 2.2240362530262585, + "grad_norm": 0.14469518900599745, + "learning_rate": 1.9032470423466325e-05, + "loss": 2.7125, + "step": 35827 + }, + { + "epoch": 2.2240983301260164, + "grad_norm": 0.13851832901950448, + "learning_rate": 1.9029635044097315e-05, + "loss": 2.6906, + "step": 35828 + }, + { + "epoch": 2.2241604072257743, + "grad_norm": 0.1461706425784589, + "learning_rate": 1.9026799826308222e-05, + "loss": 2.7688, + "step": 35829 + }, + { + "epoch": 2.2242224843255323, + "grad_norm": 0.14424428288106678, + "learning_rate": 1.902396477011383e-05, + "loss": 2.6825, + "step": 35830 + }, + { + "epoch": 2.22428456142529, + "grad_norm": 0.163737349512934, + "learning_rate": 1.902112987552893e-05, + "loss": 2.7672, + "step": 35831 + }, + { + "epoch": 2.224346638525048, + "grad_norm": 0.15406510897355957, + "learning_rate": 1.90182951425683e-05, + "loss": 2.7784, + "step": 35832 + }, + { + "epoch": 2.224408715624806, + "grad_norm": 0.14860361617032836, + "learning_rate": 1.9015460571246746e-05, + "loss": 2.817, + "step": 35833 + }, + { + "epoch": 2.224470792724564, + "grad_norm": 0.16799680342499856, + "learning_rate": 1.9012626161579073e-05, + "loss": 2.7449, + "step": 35834 + }, + { + "epoch": 2.224532869824322, + "grad_norm": 0.14424313360793523, + "learning_rate": 1.9009791913580054e-05, + "loss": 2.8184, + "step": 35835 + }, + { + "epoch": 2.2245949469240798, + "grad_norm": 0.16089660208206966, + "learning_rate": 1.900695782726447e-05, + "loss": 2.6817, + "step": 35836 + }, + { + "epoch": 2.2246570240238377, + "grad_norm": 0.14422098571350422, + "learning_rate": 1.9004123902647097e-05, + "loss": 2.6839, + "step": 35837 + }, + { + "epoch": 2.2247191011235956, + "grad_norm": 0.16886207977872064, + "learning_rate": 1.900129013974275e-05, + "loss": 2.8101, + "step": 35838 + }, + { + "epoch": 2.2247811782233535, + "grad_norm": 0.1467581225593222, + "learning_rate": 1.8998456538566195e-05, + "loss": 2.74, + "step": 35839 + }, + { + "epoch": 2.2248432553231114, + "grad_norm": 0.14763857392569774, + "learning_rate": 1.8995623099132224e-05, + "loss": 2.7693, + "step": 35840 + }, + { + "epoch": 2.2249053324228694, + "grad_norm": 0.1436769263461801, + "learning_rate": 1.8992789821455594e-05, + "loss": 2.8487, + "step": 35841 + }, + { + "epoch": 2.2249674095226273, + "grad_norm": 0.15220635282663147, + "learning_rate": 1.8989956705551122e-05, + "loss": 2.6387, + "step": 35842 + }, + { + "epoch": 2.225029486622385, + "grad_norm": 0.14462897502271627, + "learning_rate": 1.8987123751433573e-05, + "loss": 2.755, + "step": 35843 + }, + { + "epoch": 2.2250915637221427, + "grad_norm": 0.1563418952600084, + "learning_rate": 1.8984290959117734e-05, + "loss": 2.7457, + "step": 35844 + }, + { + "epoch": 2.225153640821901, + "grad_norm": 0.14994353557832266, + "learning_rate": 1.8981458328618375e-05, + "loss": 2.7855, + "step": 35845 + }, + { + "epoch": 2.2252157179216585, + "grad_norm": 0.151135328591848, + "learning_rate": 1.8978625859950254e-05, + "loss": 2.7388, + "step": 35846 + }, + { + "epoch": 2.2252777950214164, + "grad_norm": 0.14361320003785819, + "learning_rate": 1.89757935531282e-05, + "loss": 2.7454, + "step": 35847 + }, + { + "epoch": 2.2253398721211743, + "grad_norm": 0.1420538748607921, + "learning_rate": 1.8972961408166955e-05, + "loss": 2.6768, + "step": 35848 + }, + { + "epoch": 2.2254019492209323, + "grad_norm": 0.14479772385842074, + "learning_rate": 1.8970129425081305e-05, + "loss": 2.7523, + "step": 35849 + }, + { + "epoch": 2.22546402632069, + "grad_norm": 0.1544829164464895, + "learning_rate": 1.896729760388601e-05, + "loss": 2.7012, + "step": 35850 + }, + { + "epoch": 2.225526103420448, + "grad_norm": 0.1362106623317257, + "learning_rate": 1.896446594459587e-05, + "loss": 2.7324, + "step": 35851 + }, + { + "epoch": 2.225588180520206, + "grad_norm": 0.15483430028117473, + "learning_rate": 1.8961634447225645e-05, + "loss": 2.6756, + "step": 35852 + }, + { + "epoch": 2.225650257619964, + "grad_norm": 0.1635670389905609, + "learning_rate": 1.8958803111790108e-05, + "loss": 2.7012, + "step": 35853 + }, + { + "epoch": 2.225712334719722, + "grad_norm": 0.1475686885932813, + "learning_rate": 1.8955971938304014e-05, + "loss": 2.8027, + "step": 35854 + }, + { + "epoch": 2.2257744118194798, + "grad_norm": 0.1550827174623728, + "learning_rate": 1.895314092678217e-05, + "loss": 2.8243, + "step": 35855 + }, + { + "epoch": 2.2258364889192377, + "grad_norm": 0.15152483083660148, + "learning_rate": 1.8950310077239326e-05, + "loss": 2.7401, + "step": 35856 + }, + { + "epoch": 2.2258985660189956, + "grad_norm": 0.14106432187505724, + "learning_rate": 1.8947479389690247e-05, + "loss": 2.7112, + "step": 35857 + }, + { + "epoch": 2.2259606431187535, + "grad_norm": 0.14812078065536544, + "learning_rate": 1.894464886414971e-05, + "loss": 2.6255, + "step": 35858 + }, + { + "epoch": 2.2260227202185114, + "grad_norm": 0.1715715558193006, + "learning_rate": 1.8941818500632462e-05, + "loss": 2.7923, + "step": 35859 + }, + { + "epoch": 2.2260847973182694, + "grad_norm": 0.14337448565391667, + "learning_rate": 1.8938988299153304e-05, + "loss": 2.6824, + "step": 35860 + }, + { + "epoch": 2.2261468744180273, + "grad_norm": 0.1466924651633934, + "learning_rate": 1.893615825972698e-05, + "loss": 2.7522, + "step": 35861 + }, + { + "epoch": 2.226208951517785, + "grad_norm": 0.14656666034007382, + "learning_rate": 1.8933328382368264e-05, + "loss": 2.78, + "step": 35862 + }, + { + "epoch": 2.226271028617543, + "grad_norm": 0.15778331038130283, + "learning_rate": 1.8930498667091896e-05, + "loss": 2.7362, + "step": 35863 + }, + { + "epoch": 2.226333105717301, + "grad_norm": 0.13903680655114362, + "learning_rate": 1.8927669113912677e-05, + "loss": 2.7541, + "step": 35864 + }, + { + "epoch": 2.226395182817059, + "grad_norm": 0.13832572259586667, + "learning_rate": 1.892483972284535e-05, + "loss": 2.687, + "step": 35865 + }, + { + "epoch": 2.226457259916817, + "grad_norm": 0.14258888935580344, + "learning_rate": 1.8922010493904664e-05, + "loss": 2.7494, + "step": 35866 + }, + { + "epoch": 2.226519337016575, + "grad_norm": 0.15112647944728802, + "learning_rate": 1.8919181427105408e-05, + "loss": 2.6904, + "step": 35867 + }, + { + "epoch": 2.2265814141163327, + "grad_norm": 0.1420169066330281, + "learning_rate": 1.891635252246233e-05, + "loss": 2.8135, + "step": 35868 + }, + { + "epoch": 2.22664349121609, + "grad_norm": 0.15690079104666, + "learning_rate": 1.8913523779990168e-05, + "loss": 2.7552, + "step": 35869 + }, + { + "epoch": 2.226705568315848, + "grad_norm": 0.16066953354342217, + "learning_rate": 1.8910695199703716e-05, + "loss": 2.7437, + "step": 35870 + }, + { + "epoch": 2.226767645415606, + "grad_norm": 0.15259093014550792, + "learning_rate": 1.8907866781617717e-05, + "loss": 2.7221, + "step": 35871 + }, + { + "epoch": 2.226829722515364, + "grad_norm": 0.15152492303161857, + "learning_rate": 1.890503852574692e-05, + "loss": 2.6386, + "step": 35872 + }, + { + "epoch": 2.226891799615122, + "grad_norm": 0.14911971050987405, + "learning_rate": 1.890221043210607e-05, + "loss": 2.6795, + "step": 35873 + }, + { + "epoch": 2.2269538767148798, + "grad_norm": 0.14320941196382656, + "learning_rate": 1.8899382500709955e-05, + "loss": 2.6526, + "step": 35874 + }, + { + "epoch": 2.2270159538146377, + "grad_norm": 0.14000750442810717, + "learning_rate": 1.8896554731573312e-05, + "loss": 2.6947, + "step": 35875 + }, + { + "epoch": 2.2270780309143956, + "grad_norm": 0.14111209987687054, + "learning_rate": 1.8893727124710896e-05, + "loss": 2.8087, + "step": 35876 + }, + { + "epoch": 2.2271401080141535, + "grad_norm": 0.15479699098608296, + "learning_rate": 1.8890899680137437e-05, + "loss": 2.6737, + "step": 35877 + }, + { + "epoch": 2.2272021851139114, + "grad_norm": 0.14505740355135235, + "learning_rate": 1.8888072397867723e-05, + "loss": 2.8013, + "step": 35878 + }, + { + "epoch": 2.2272642622136694, + "grad_norm": 0.16813902208818934, + "learning_rate": 1.8885245277916487e-05, + "loss": 2.7725, + "step": 35879 + }, + { + "epoch": 2.2273263393134273, + "grad_norm": 0.1390360898217048, + "learning_rate": 1.8882418320298485e-05, + "loss": 2.6987, + "step": 35880 + }, + { + "epoch": 2.227388416413185, + "grad_norm": 0.14308379917072647, + "learning_rate": 1.8879591525028456e-05, + "loss": 2.729, + "step": 35881 + }, + { + "epoch": 2.227450493512943, + "grad_norm": 0.14264905158283187, + "learning_rate": 1.8876764892121135e-05, + "loss": 2.7127, + "step": 35882 + }, + { + "epoch": 2.227512570612701, + "grad_norm": 0.1421831879643137, + "learning_rate": 1.887393842159131e-05, + "loss": 2.6644, + "step": 35883 + }, + { + "epoch": 2.227574647712459, + "grad_norm": 0.1431425041943168, + "learning_rate": 1.88711121134537e-05, + "loss": 2.6548, + "step": 35884 + }, + { + "epoch": 2.227636724812217, + "grad_norm": 0.1464125266302191, + "learning_rate": 1.8868285967723053e-05, + "loss": 2.7007, + "step": 35885 + }, + { + "epoch": 2.227698801911975, + "grad_norm": 0.1480946093983295, + "learning_rate": 1.8865459984414097e-05, + "loss": 2.754, + "step": 35886 + }, + { + "epoch": 2.2277608790117327, + "grad_norm": 0.14720061357155845, + "learning_rate": 1.8862634163541616e-05, + "loss": 2.7634, + "step": 35887 + }, + { + "epoch": 2.2278229561114906, + "grad_norm": 0.1366623148925534, + "learning_rate": 1.885980850512033e-05, + "loss": 2.7229, + "step": 35888 + }, + { + "epoch": 2.2278850332112485, + "grad_norm": 0.14636537804769154, + "learning_rate": 1.8856983009164987e-05, + "loss": 2.6721, + "step": 35889 + }, + { + "epoch": 2.2279471103110065, + "grad_norm": 0.16207229810718005, + "learning_rate": 1.8854157675690297e-05, + "loss": 2.7105, + "step": 35890 + }, + { + "epoch": 2.2280091874107644, + "grad_norm": 0.14380228656955696, + "learning_rate": 1.8851332504711054e-05, + "loss": 2.7098, + "step": 35891 + }, + { + "epoch": 2.228071264510522, + "grad_norm": 0.13704788916032853, + "learning_rate": 1.8848507496241967e-05, + "loss": 2.7366, + "step": 35892 + }, + { + "epoch": 2.2281333416102798, + "grad_norm": 0.14830882995740988, + "learning_rate": 1.884568265029778e-05, + "loss": 2.7485, + "step": 35893 + }, + { + "epoch": 2.2281954187100377, + "grad_norm": 0.14152108240234337, + "learning_rate": 1.884285796689323e-05, + "loss": 2.7186, + "step": 35894 + }, + { + "epoch": 2.2282574958097956, + "grad_norm": 0.13718306172562464, + "learning_rate": 1.8840033446043033e-05, + "loss": 2.7652, + "step": 35895 + }, + { + "epoch": 2.2283195729095535, + "grad_norm": 0.13793794749290636, + "learning_rate": 1.8837209087761966e-05, + "loss": 2.6598, + "step": 35896 + }, + { + "epoch": 2.2283816500093114, + "grad_norm": 0.14770789025522268, + "learning_rate": 1.8834384892064742e-05, + "loss": 2.7123, + "step": 35897 + }, + { + "epoch": 2.2284437271090693, + "grad_norm": 0.17097615246095005, + "learning_rate": 1.883156085896608e-05, + "loss": 2.7761, + "step": 35898 + }, + { + "epoch": 2.2285058042088273, + "grad_norm": 0.133534997714279, + "learning_rate": 1.8828736988480754e-05, + "loss": 2.7679, + "step": 35899 + }, + { + "epoch": 2.228567881308585, + "grad_norm": 0.14031241033546965, + "learning_rate": 1.8825913280623453e-05, + "loss": 2.7639, + "step": 35900 + }, + { + "epoch": 2.228629958408343, + "grad_norm": 0.14348366985378327, + "learning_rate": 1.882308973540895e-05, + "loss": 2.7555, + "step": 35901 + }, + { + "epoch": 2.228692035508101, + "grad_norm": 0.14982968535860428, + "learning_rate": 1.882026635285195e-05, + "loss": 2.7741, + "step": 35902 + }, + { + "epoch": 2.228754112607859, + "grad_norm": 0.15744972976471797, + "learning_rate": 1.8817443132967193e-05, + "loss": 2.7581, + "step": 35903 + }, + { + "epoch": 2.228816189707617, + "grad_norm": 0.14232392709052255, + "learning_rate": 1.881462007576939e-05, + "loss": 2.6594, + "step": 35904 + }, + { + "epoch": 2.2288782668073748, + "grad_norm": 0.1450619940451033, + "learning_rate": 1.88117971812733e-05, + "loss": 2.6701, + "step": 35905 + }, + { + "epoch": 2.2289403439071327, + "grad_norm": 0.14889075070481045, + "learning_rate": 1.8808974449493637e-05, + "loss": 2.7046, + "step": 35906 + }, + { + "epoch": 2.2290024210068906, + "grad_norm": 0.14336873031774724, + "learning_rate": 1.8806151880445123e-05, + "loss": 2.8301, + "step": 35907 + }, + { + "epoch": 2.2290644981066485, + "grad_norm": 0.1423094386402672, + "learning_rate": 1.880332947414249e-05, + "loss": 2.7803, + "step": 35908 + }, + { + "epoch": 2.2291265752064064, + "grad_norm": 0.14205690375699637, + "learning_rate": 1.8800507230600444e-05, + "loss": 2.669, + "step": 35909 + }, + { + "epoch": 2.2291886523061644, + "grad_norm": 0.1497019257393418, + "learning_rate": 1.879768514983374e-05, + "loss": 2.8174, + "step": 35910 + }, + { + "epoch": 2.2292507294059223, + "grad_norm": 0.14377570492590372, + "learning_rate": 1.879486323185709e-05, + "loss": 2.7453, + "step": 35911 + }, + { + "epoch": 2.22931280650568, + "grad_norm": 0.13818124155799136, + "learning_rate": 1.8792041476685214e-05, + "loss": 2.6482, + "step": 35912 + }, + { + "epoch": 2.229374883605438, + "grad_norm": 0.14138269280905347, + "learning_rate": 1.878921988433281e-05, + "loss": 2.6755, + "step": 35913 + }, + { + "epoch": 2.229436960705196, + "grad_norm": 0.1446038592155807, + "learning_rate": 1.8786398454814647e-05, + "loss": 2.7361, + "step": 35914 + }, + { + "epoch": 2.229499037804954, + "grad_norm": 0.14380051202225286, + "learning_rate": 1.878357718814542e-05, + "loss": 2.6892, + "step": 35915 + }, + { + "epoch": 2.229561114904712, + "grad_norm": 0.14201303746127034, + "learning_rate": 1.8780756084339844e-05, + "loss": 2.7187, + "step": 35916 + }, + { + "epoch": 2.2296231920044693, + "grad_norm": 0.13765150414901478, + "learning_rate": 1.8777935143412643e-05, + "loss": 2.773, + "step": 35917 + }, + { + "epoch": 2.2296852691042273, + "grad_norm": 0.14242234276523666, + "learning_rate": 1.877511436537852e-05, + "loss": 2.6372, + "step": 35918 + }, + { + "epoch": 2.229747346203985, + "grad_norm": 0.13728727254675915, + "learning_rate": 1.8772293750252224e-05, + "loss": 2.7077, + "step": 35919 + }, + { + "epoch": 2.229809423303743, + "grad_norm": 0.1592681557012568, + "learning_rate": 1.8769473298048444e-05, + "loss": 2.6379, + "step": 35920 + }, + { + "epoch": 2.229871500403501, + "grad_norm": 0.13905662567075583, + "learning_rate": 1.8766653008781908e-05, + "loss": 2.7505, + "step": 35921 + }, + { + "epoch": 2.229933577503259, + "grad_norm": 0.14374762823387302, + "learning_rate": 1.8763832882467313e-05, + "loss": 2.7751, + "step": 35922 + }, + { + "epoch": 2.229995654603017, + "grad_norm": 0.14664618773723984, + "learning_rate": 1.8761012919119396e-05, + "loss": 2.6579, + "step": 35923 + }, + { + "epoch": 2.2300577317027748, + "grad_norm": 0.144717103440521, + "learning_rate": 1.875819311875286e-05, + "loss": 2.7197, + "step": 35924 + }, + { + "epoch": 2.2301198088025327, + "grad_norm": 0.15507408156036118, + "learning_rate": 1.8755373481382417e-05, + "loss": 2.7458, + "step": 35925 + }, + { + "epoch": 2.2301818859022906, + "grad_norm": 0.14669383649992124, + "learning_rate": 1.8752554007022755e-05, + "loss": 2.7207, + "step": 35926 + }, + { + "epoch": 2.2302439630020485, + "grad_norm": 0.14605489076333442, + "learning_rate": 1.8749734695688625e-05, + "loss": 2.704, + "step": 35927 + }, + { + "epoch": 2.2303060401018064, + "grad_norm": 0.15080310559368193, + "learning_rate": 1.874691554739471e-05, + "loss": 2.7371, + "step": 35928 + }, + { + "epoch": 2.2303681172015644, + "grad_norm": 0.14270317133271318, + "learning_rate": 1.8744096562155728e-05, + "loss": 2.7067, + "step": 35929 + }, + { + "epoch": 2.2304301943013223, + "grad_norm": 0.1424031686556202, + "learning_rate": 1.874127773998638e-05, + "loss": 2.6673, + "step": 35930 + }, + { + "epoch": 2.23049227140108, + "grad_norm": 0.17127447446775346, + "learning_rate": 1.8738459080901355e-05, + "loss": 2.6919, + "step": 35931 + }, + { + "epoch": 2.230554348500838, + "grad_norm": 0.14496971970324507, + "learning_rate": 1.8735640584915383e-05, + "loss": 2.7466, + "step": 35932 + }, + { + "epoch": 2.230616425600596, + "grad_norm": 0.14714293896736325, + "learning_rate": 1.8732822252043176e-05, + "loss": 2.7681, + "step": 35933 + }, + { + "epoch": 2.230678502700354, + "grad_norm": 0.15170859934423703, + "learning_rate": 1.8730004082299428e-05, + "loss": 2.6648, + "step": 35934 + }, + { + "epoch": 2.230740579800112, + "grad_norm": 0.18032338782218763, + "learning_rate": 1.8727186075698844e-05, + "loss": 2.7241, + "step": 35935 + }, + { + "epoch": 2.23080265689987, + "grad_norm": 0.14971880284839734, + "learning_rate": 1.87243682322561e-05, + "loss": 2.6957, + "step": 35936 + }, + { + "epoch": 2.2308647339996277, + "grad_norm": 0.1386473795250252, + "learning_rate": 1.8721550551985934e-05, + "loss": 2.7532, + "step": 35937 + }, + { + "epoch": 2.2309268110993856, + "grad_norm": 0.14018388944501073, + "learning_rate": 1.8718733034903034e-05, + "loss": 2.6545, + "step": 35938 + }, + { + "epoch": 2.2309888881991435, + "grad_norm": 0.13326705254384902, + "learning_rate": 1.8715915681022102e-05, + "loss": 2.6144, + "step": 35939 + }, + { + "epoch": 2.231050965298901, + "grad_norm": 0.16175324030745086, + "learning_rate": 1.8713098490357812e-05, + "loss": 2.7059, + "step": 35940 + }, + { + "epoch": 2.231113042398659, + "grad_norm": 0.15343736323216514, + "learning_rate": 1.87102814629249e-05, + "loss": 2.7689, + "step": 35941 + }, + { + "epoch": 2.231175119498417, + "grad_norm": 0.14927721925196483, + "learning_rate": 1.8707464598738044e-05, + "loss": 2.6779, + "step": 35942 + }, + { + "epoch": 2.2312371965981748, + "grad_norm": 0.1636527428108264, + "learning_rate": 1.8704647897811945e-05, + "loss": 2.6988, + "step": 35943 + }, + { + "epoch": 2.2312992736979327, + "grad_norm": 0.1391248624784676, + "learning_rate": 1.8701831360161288e-05, + "loss": 2.6889, + "step": 35944 + }, + { + "epoch": 2.2313613507976906, + "grad_norm": 0.14719811442749303, + "learning_rate": 1.8699014985800763e-05, + "loss": 2.6215, + "step": 35945 + }, + { + "epoch": 2.2314234278974485, + "grad_norm": 0.15387342993805267, + "learning_rate": 1.8696198774745092e-05, + "loss": 2.7457, + "step": 35946 + }, + { + "epoch": 2.2314855049972064, + "grad_norm": 0.1405582600743247, + "learning_rate": 1.869338272700895e-05, + "loss": 2.7536, + "step": 35947 + }, + { + "epoch": 2.2315475820969644, + "grad_norm": 0.14618995961382591, + "learning_rate": 1.869056684260703e-05, + "loss": 2.6095, + "step": 35948 + }, + { + "epoch": 2.2316096591967223, + "grad_norm": 0.14006278450276097, + "learning_rate": 1.868775112155401e-05, + "loss": 2.677, + "step": 35949 + }, + { + "epoch": 2.23167173629648, + "grad_norm": 0.14769981311840832, + "learning_rate": 1.868493556386461e-05, + "loss": 2.7879, + "step": 35950 + }, + { + "epoch": 2.231733813396238, + "grad_norm": 0.14792826161026013, + "learning_rate": 1.8682120169553502e-05, + "loss": 2.817, + "step": 35951 + }, + { + "epoch": 2.231795890495996, + "grad_norm": 0.1493039691702772, + "learning_rate": 1.8679304938635373e-05, + "loss": 2.7904, + "step": 35952 + }, + { + "epoch": 2.231857967595754, + "grad_norm": 0.13596337636667122, + "learning_rate": 1.8676489871124912e-05, + "loss": 2.5874, + "step": 35953 + }, + { + "epoch": 2.231920044695512, + "grad_norm": 0.15795881928755137, + "learning_rate": 1.8673674967036797e-05, + "loss": 2.6578, + "step": 35954 + }, + { + "epoch": 2.23198212179527, + "grad_norm": 0.18199707250677225, + "learning_rate": 1.867086022638574e-05, + "loss": 2.7552, + "step": 35955 + }, + { + "epoch": 2.2320441988950277, + "grad_norm": 0.14263336861980427, + "learning_rate": 1.866804564918641e-05, + "loss": 2.6596, + "step": 35956 + }, + { + "epoch": 2.2321062759947856, + "grad_norm": 0.1366715622258666, + "learning_rate": 1.866523123545349e-05, + "loss": 2.5833, + "step": 35957 + }, + { + "epoch": 2.2321683530945435, + "grad_norm": 0.15491765124728193, + "learning_rate": 1.8662416985201646e-05, + "loss": 2.6894, + "step": 35958 + }, + { + "epoch": 2.2322304301943015, + "grad_norm": 0.14383343472851082, + "learning_rate": 1.8659602898445606e-05, + "loss": 2.7354, + "step": 35959 + }, + { + "epoch": 2.2322925072940594, + "grad_norm": 0.1496308505528619, + "learning_rate": 1.8656788975200018e-05, + "loss": 2.6593, + "step": 35960 + }, + { + "epoch": 2.2323545843938173, + "grad_norm": 0.15481725895342938, + "learning_rate": 1.8653975215479574e-05, + "loss": 2.6831, + "step": 35961 + }, + { + "epoch": 2.232416661493575, + "grad_norm": 0.1528829849178253, + "learning_rate": 1.8651161619298936e-05, + "loss": 2.7866, + "step": 35962 + }, + { + "epoch": 2.232478738593333, + "grad_norm": 0.15008413369314674, + "learning_rate": 1.864834818667281e-05, + "loss": 2.6551, + "step": 35963 + }, + { + "epoch": 2.232540815693091, + "grad_norm": 0.16311646364245913, + "learning_rate": 1.8645534917615846e-05, + "loss": 2.7481, + "step": 35964 + }, + { + "epoch": 2.2326028927928485, + "grad_norm": 0.14830639972591594, + "learning_rate": 1.8642721812142757e-05, + "loss": 2.6861, + "step": 35965 + }, + { + "epoch": 2.2326649698926064, + "grad_norm": 0.14327518345252468, + "learning_rate": 1.8639908870268195e-05, + "loss": 2.715, + "step": 35966 + }, + { + "epoch": 2.2327270469923644, + "grad_norm": 0.144500568071572, + "learning_rate": 1.863709609200685e-05, + "loss": 2.7408, + "step": 35967 + }, + { + "epoch": 2.2327891240921223, + "grad_norm": 0.14724399068767308, + "learning_rate": 1.8634283477373365e-05, + "loss": 2.7252, + "step": 35968 + }, + { + "epoch": 2.23285120119188, + "grad_norm": 0.1404699490679524, + "learning_rate": 1.8631471026382457e-05, + "loss": 2.7474, + "step": 35969 + }, + { + "epoch": 2.232913278291638, + "grad_norm": 0.1592768039378474, + "learning_rate": 1.862865873904878e-05, + "loss": 2.7663, + "step": 35970 + }, + { + "epoch": 2.232975355391396, + "grad_norm": 0.1383066000574262, + "learning_rate": 1.8625846615387003e-05, + "loss": 2.6794, + "step": 35971 + }, + { + "epoch": 2.233037432491154, + "grad_norm": 0.143798718023339, + "learning_rate": 1.862303465541179e-05, + "loss": 2.7187, + "step": 35972 + }, + { + "epoch": 2.233099509590912, + "grad_norm": 0.15790536315286932, + "learning_rate": 1.8620222859137836e-05, + "loss": 2.7586, + "step": 35973 + }, + { + "epoch": 2.2331615866906698, + "grad_norm": 0.1484646772300956, + "learning_rate": 1.8617411226579795e-05, + "loss": 2.7626, + "step": 35974 + }, + { + "epoch": 2.2332236637904277, + "grad_norm": 0.14317881754031567, + "learning_rate": 1.8614599757752338e-05, + "loss": 2.7582, + "step": 35975 + }, + { + "epoch": 2.2332857408901856, + "grad_norm": 0.14709211165133912, + "learning_rate": 1.8611788452670122e-05, + "loss": 2.7713, + "step": 35976 + }, + { + "epoch": 2.2333478179899435, + "grad_norm": 0.1476324549176737, + "learning_rate": 1.8608977311347835e-05, + "loss": 2.6965, + "step": 35977 + }, + { + "epoch": 2.2334098950897014, + "grad_norm": 0.14208879503210498, + "learning_rate": 1.860616633380014e-05, + "loss": 2.668, + "step": 35978 + }, + { + "epoch": 2.2334719721894594, + "grad_norm": 0.13849361137174207, + "learning_rate": 1.8603355520041694e-05, + "loss": 2.6373, + "step": 35979 + }, + { + "epoch": 2.2335340492892173, + "grad_norm": 0.14770896212933146, + "learning_rate": 1.860054487008716e-05, + "loss": 2.6957, + "step": 35980 + }, + { + "epoch": 2.233596126388975, + "grad_norm": 0.1382255423129857, + "learning_rate": 1.859773438395119e-05, + "loss": 2.7542, + "step": 35981 + }, + { + "epoch": 2.233658203488733, + "grad_norm": 0.15126086161564462, + "learning_rate": 1.8594924061648484e-05, + "loss": 2.7636, + "step": 35982 + }, + { + "epoch": 2.233720280588491, + "grad_norm": 0.14539428737071924, + "learning_rate": 1.859211390319368e-05, + "loss": 2.7204, + "step": 35983 + }, + { + "epoch": 2.233782357688249, + "grad_norm": 0.19641024137096685, + "learning_rate": 1.8589303908601448e-05, + "loss": 2.7146, + "step": 35984 + }, + { + "epoch": 2.233844434788007, + "grad_norm": 0.14977654263576365, + "learning_rate": 1.8586494077886417e-05, + "loss": 2.7269, + "step": 35985 + }, + { + "epoch": 2.233906511887765, + "grad_norm": 0.15716270666461557, + "learning_rate": 1.858368441106329e-05, + "loss": 2.6687, + "step": 35986 + }, + { + "epoch": 2.2339685889875227, + "grad_norm": 0.13561950601050335, + "learning_rate": 1.8580874908146712e-05, + "loss": 2.6723, + "step": 35987 + }, + { + "epoch": 2.23403066608728, + "grad_norm": 0.14112369538524383, + "learning_rate": 1.8578065569151336e-05, + "loss": 2.7319, + "step": 35988 + }, + { + "epoch": 2.234092743187038, + "grad_norm": 0.14570790173061496, + "learning_rate": 1.857525639409182e-05, + "loss": 2.7186, + "step": 35989 + }, + { + "epoch": 2.234154820286796, + "grad_norm": 0.17097859818708902, + "learning_rate": 1.8572447382982803e-05, + "loss": 2.7381, + "step": 35990 + }, + { + "epoch": 2.234216897386554, + "grad_norm": 0.1451002467909199, + "learning_rate": 1.8569638535838978e-05, + "loss": 2.7219, + "step": 35991 + }, + { + "epoch": 2.234278974486312, + "grad_norm": 0.158967783950291, + "learning_rate": 1.8566829852674972e-05, + "loss": 2.7366, + "step": 35992 + }, + { + "epoch": 2.2343410515860698, + "grad_norm": 0.14857794993542, + "learning_rate": 1.8564021333505448e-05, + "loss": 2.8297, + "step": 35993 + }, + { + "epoch": 2.2344031286858277, + "grad_norm": 0.15908433831721583, + "learning_rate": 1.8561212978345037e-05, + "loss": 2.6992, + "step": 35994 + }, + { + "epoch": 2.2344652057855856, + "grad_norm": 0.1500152200189382, + "learning_rate": 1.8558404787208433e-05, + "loss": 2.7714, + "step": 35995 + }, + { + "epoch": 2.2345272828853435, + "grad_norm": 0.13803926755650384, + "learning_rate": 1.855559676011026e-05, + "loss": 2.7014, + "step": 35996 + }, + { + "epoch": 2.2345893599851014, + "grad_norm": 0.1460212124550966, + "learning_rate": 1.8552788897065154e-05, + "loss": 2.6888, + "step": 35997 + }, + { + "epoch": 2.2346514370848594, + "grad_norm": 0.13883221450131975, + "learning_rate": 1.85499811980878e-05, + "loss": 2.727, + "step": 35998 + }, + { + "epoch": 2.2347135141846173, + "grad_norm": 0.1434426941274024, + "learning_rate": 1.8547173663192812e-05, + "loss": 2.7616, + "step": 35999 + }, + { + "epoch": 2.234775591284375, + "grad_norm": 0.14706335722547392, + "learning_rate": 1.8544366292394877e-05, + "loss": 2.7174, + "step": 36000 + }, + { + "epoch": 2.234837668384133, + "grad_norm": 0.15270409393627668, + "learning_rate": 1.8541559085708614e-05, + "loss": 2.7698, + "step": 36001 + }, + { + "epoch": 2.234899745483891, + "grad_norm": 0.14960395364923704, + "learning_rate": 1.8538752043148673e-05, + "loss": 2.6202, + "step": 36002 + }, + { + "epoch": 2.234961822583649, + "grad_norm": 0.1407427625832308, + "learning_rate": 1.85359451647297e-05, + "loss": 2.6693, + "step": 36003 + }, + { + "epoch": 2.235023899683407, + "grad_norm": 0.18598302476212208, + "learning_rate": 1.8533138450466326e-05, + "loss": 2.703, + "step": 36004 + }, + { + "epoch": 2.235085976783165, + "grad_norm": 0.14393977978864164, + "learning_rate": 1.853033190037322e-05, + "loss": 2.6618, + "step": 36005 + }, + { + "epoch": 2.2351480538829227, + "grad_norm": 0.15452041006495315, + "learning_rate": 1.8527525514465017e-05, + "loss": 2.7108, + "step": 36006 + }, + { + "epoch": 2.2352101309826806, + "grad_norm": 0.16176848591216741, + "learning_rate": 1.852471929275635e-05, + "loss": 2.6956, + "step": 36007 + }, + { + "epoch": 2.2352722080824385, + "grad_norm": 0.14013675862487865, + "learning_rate": 1.8521913235261844e-05, + "loss": 2.7044, + "step": 36008 + }, + { + "epoch": 2.2353342851821965, + "grad_norm": 0.16202528622539733, + "learning_rate": 1.851910734199618e-05, + "loss": 2.8363, + "step": 36009 + }, + { + "epoch": 2.2353963622819544, + "grad_norm": 0.13778272519064905, + "learning_rate": 1.8516301612973973e-05, + "loss": 2.7797, + "step": 36010 + }, + { + "epoch": 2.2354584393817123, + "grad_norm": 0.14851816394929312, + "learning_rate": 1.8513496048209867e-05, + "loss": 2.6635, + "step": 36011 + }, + { + "epoch": 2.23552051648147, + "grad_norm": 0.14997979191914132, + "learning_rate": 1.851069064771847e-05, + "loss": 2.7189, + "step": 36012 + }, + { + "epoch": 2.2355825935812277, + "grad_norm": 0.13912227851291437, + "learning_rate": 1.850788541151447e-05, + "loss": 2.7545, + "step": 36013 + }, + { + "epoch": 2.2356446706809856, + "grad_norm": 0.13942432426721862, + "learning_rate": 1.8505080339612475e-05, + "loss": 2.621, + "step": 36014 + }, + { + "epoch": 2.2357067477807435, + "grad_norm": 0.14991840935021647, + "learning_rate": 1.850227543202712e-05, + "loss": 2.7159, + "step": 36015 + }, + { + "epoch": 2.2357688248805014, + "grad_norm": 0.16746587661923293, + "learning_rate": 1.849947068877304e-05, + "loss": 2.813, + "step": 36016 + }, + { + "epoch": 2.2358309019802594, + "grad_norm": 0.17078939501333606, + "learning_rate": 1.849666610986485e-05, + "loss": 2.6828, + "step": 36017 + }, + { + "epoch": 2.2358929790800173, + "grad_norm": 0.16403347848609176, + "learning_rate": 1.8493861695317223e-05, + "loss": 2.7008, + "step": 36018 + }, + { + "epoch": 2.235955056179775, + "grad_norm": 0.14121168328981276, + "learning_rate": 1.849105744514476e-05, + "loss": 2.6908, + "step": 36019 + }, + { + "epoch": 2.236017133279533, + "grad_norm": 0.14609116230677174, + "learning_rate": 1.8488253359362105e-05, + "loss": 2.731, + "step": 36020 + }, + { + "epoch": 2.236079210379291, + "grad_norm": 0.1549374225546246, + "learning_rate": 1.8485449437983866e-05, + "loss": 2.8315, + "step": 36021 + }, + { + "epoch": 2.236141287479049, + "grad_norm": 0.14335114458037637, + "learning_rate": 1.8482645681024703e-05, + "loss": 2.7056, + "step": 36022 + }, + { + "epoch": 2.236203364578807, + "grad_norm": 0.17385218485938225, + "learning_rate": 1.8479842088499226e-05, + "loss": 2.7492, + "step": 36023 + }, + { + "epoch": 2.236265441678565, + "grad_norm": 0.15603395068831832, + "learning_rate": 1.847703866042207e-05, + "loss": 2.7255, + "step": 36024 + }, + { + "epoch": 2.2363275187783227, + "grad_norm": 0.15056535552956302, + "learning_rate": 1.8474235396807833e-05, + "loss": 2.7158, + "step": 36025 + }, + { + "epoch": 2.2363895958780806, + "grad_norm": 0.15731004311306737, + "learning_rate": 1.847143229767118e-05, + "loss": 2.7528, + "step": 36026 + }, + { + "epoch": 2.2364516729778385, + "grad_norm": 0.1483321568981767, + "learning_rate": 1.8468629363026724e-05, + "loss": 2.6728, + "step": 36027 + }, + { + "epoch": 2.2365137500775965, + "grad_norm": 0.15304820095693228, + "learning_rate": 1.846582659288908e-05, + "loss": 2.6055, + "step": 36028 + }, + { + "epoch": 2.2365758271773544, + "grad_norm": 0.13783368131696663, + "learning_rate": 1.846302398727286e-05, + "loss": 2.7271, + "step": 36029 + }, + { + "epoch": 2.2366379042771123, + "grad_norm": 0.13690436569681252, + "learning_rate": 1.8460221546192713e-05, + "loss": 2.6724, + "step": 36030 + }, + { + "epoch": 2.23669998137687, + "grad_norm": 0.15460999020786106, + "learning_rate": 1.8457419269663228e-05, + "loss": 2.6464, + "step": 36031 + }, + { + "epoch": 2.236762058476628, + "grad_norm": 0.16317753874744317, + "learning_rate": 1.8454617157699067e-05, + "loss": 2.6666, + "step": 36032 + }, + { + "epoch": 2.236824135576386, + "grad_norm": 0.14984864879273574, + "learning_rate": 1.845181521031482e-05, + "loss": 2.6823, + "step": 36033 + }, + { + "epoch": 2.236886212676144, + "grad_norm": 0.1478192416280789, + "learning_rate": 1.8449013427525113e-05, + "loss": 2.7316, + "step": 36034 + }, + { + "epoch": 2.236948289775902, + "grad_norm": 0.15622600728839722, + "learning_rate": 1.8446211809344548e-05, + "loss": 2.7244, + "step": 36035 + }, + { + "epoch": 2.2370103668756594, + "grad_norm": 0.13318454299590715, + "learning_rate": 1.844341035578777e-05, + "loss": 2.6969, + "step": 36036 + }, + { + "epoch": 2.2370724439754173, + "grad_norm": 0.1421164851431073, + "learning_rate": 1.8440609066869384e-05, + "loss": 2.7182, + "step": 36037 + }, + { + "epoch": 2.237134521075175, + "grad_norm": 0.19643788365949547, + "learning_rate": 1.8437807942604e-05, + "loss": 2.7003, + "step": 36038 + }, + { + "epoch": 2.237196598174933, + "grad_norm": 0.1610662958817676, + "learning_rate": 1.8435006983006237e-05, + "loss": 2.7785, + "step": 36039 + }, + { + "epoch": 2.237258675274691, + "grad_norm": 0.14315965367251127, + "learning_rate": 1.8432206188090685e-05, + "loss": 2.7168, + "step": 36040 + }, + { + "epoch": 2.237320752374449, + "grad_norm": 0.13914788844594758, + "learning_rate": 1.8429405557871993e-05, + "loss": 2.6859, + "step": 36041 + }, + { + "epoch": 2.237382829474207, + "grad_norm": 0.14545186778648125, + "learning_rate": 1.8426605092364756e-05, + "loss": 2.7282, + "step": 36042 + }, + { + "epoch": 2.2374449065739648, + "grad_norm": 0.13980646462175675, + "learning_rate": 1.8423804791583588e-05, + "loss": 2.6276, + "step": 36043 + }, + { + "epoch": 2.2375069836737227, + "grad_norm": 0.14333086665820874, + "learning_rate": 1.8421004655543077e-05, + "loss": 2.6997, + "step": 36044 + }, + { + "epoch": 2.2375690607734806, + "grad_norm": 0.15845006343681386, + "learning_rate": 1.8418204684257867e-05, + "loss": 2.7726, + "step": 36045 + }, + { + "epoch": 2.2376311378732385, + "grad_norm": 0.1396052287019175, + "learning_rate": 1.8415404877742548e-05, + "loss": 2.7927, + "step": 36046 + }, + { + "epoch": 2.2376932149729964, + "grad_norm": 0.1370636880186494, + "learning_rate": 1.8412605236011732e-05, + "loss": 2.6727, + "step": 36047 + }, + { + "epoch": 2.2377552920727544, + "grad_norm": 0.1377798794721279, + "learning_rate": 1.8409805759080005e-05, + "loss": 2.6704, + "step": 36048 + }, + { + "epoch": 2.2378173691725123, + "grad_norm": 0.1381762000488374, + "learning_rate": 1.8407006446962012e-05, + "loss": 2.7021, + "step": 36049 + }, + { + "epoch": 2.23787944627227, + "grad_norm": 0.13816157998013648, + "learning_rate": 1.8404207299672327e-05, + "loss": 2.6758, + "step": 36050 + }, + { + "epoch": 2.237941523372028, + "grad_norm": 0.13821829909307803, + "learning_rate": 1.840140831722557e-05, + "loss": 2.6236, + "step": 36051 + }, + { + "epoch": 2.238003600471786, + "grad_norm": 0.15723378893433965, + "learning_rate": 1.839860949963633e-05, + "loss": 2.7, + "step": 36052 + }, + { + "epoch": 2.238065677571544, + "grad_norm": 0.1511975168073279, + "learning_rate": 1.8395810846919203e-05, + "loss": 2.6419, + "step": 36053 + }, + { + "epoch": 2.238127754671302, + "grad_norm": 0.151843631687813, + "learning_rate": 1.8393012359088824e-05, + "loss": 2.7039, + "step": 36054 + }, + { + "epoch": 2.23818983177106, + "grad_norm": 0.14317472608126106, + "learning_rate": 1.8390214036159762e-05, + "loss": 2.7812, + "step": 36055 + }, + { + "epoch": 2.2382519088708177, + "grad_norm": 0.16384706089422268, + "learning_rate": 1.8387415878146636e-05, + "loss": 2.7346, + "step": 36056 + }, + { + "epoch": 2.2383139859705756, + "grad_norm": 0.14576383762692915, + "learning_rate": 1.8384617885064015e-05, + "loss": 2.6847, + "step": 36057 + }, + { + "epoch": 2.2383760630703335, + "grad_norm": 0.14661652643893486, + "learning_rate": 1.8381820056926535e-05, + "loss": 2.7076, + "step": 36058 + }, + { + "epoch": 2.238438140170091, + "grad_norm": 0.14718017628747174, + "learning_rate": 1.8379022393748773e-05, + "loss": 2.6887, + "step": 36059 + }, + { + "epoch": 2.2385002172698494, + "grad_norm": 0.14670415285680607, + "learning_rate": 1.837622489554533e-05, + "loss": 2.7061, + "step": 36060 + }, + { + "epoch": 2.238562294369607, + "grad_norm": 0.14191010501928586, + "learning_rate": 1.8373427562330787e-05, + "loss": 2.691, + "step": 36061 + }, + { + "epoch": 2.2386243714693648, + "grad_norm": 0.1422499380727509, + "learning_rate": 1.8370630394119743e-05, + "loss": 2.7488, + "step": 36062 + }, + { + "epoch": 2.2386864485691227, + "grad_norm": 0.1476309472056191, + "learning_rate": 1.8367833390926814e-05, + "loss": 2.7869, + "step": 36063 + }, + { + "epoch": 2.2387485256688806, + "grad_norm": 0.15150613252775194, + "learning_rate": 1.836503655276658e-05, + "loss": 2.7151, + "step": 36064 + }, + { + "epoch": 2.2388106027686385, + "grad_norm": 0.1406817785185937, + "learning_rate": 1.836223987965363e-05, + "loss": 2.7619, + "step": 36065 + }, + { + "epoch": 2.2388726798683964, + "grad_norm": 0.1332630062027103, + "learning_rate": 1.8359443371602553e-05, + "loss": 2.7293, + "step": 36066 + }, + { + "epoch": 2.2389347569681544, + "grad_norm": 0.1393676549318914, + "learning_rate": 1.8356647028627926e-05, + "loss": 2.6721, + "step": 36067 + }, + { + "epoch": 2.2389968340679123, + "grad_norm": 0.14120901219914025, + "learning_rate": 1.8353850850744368e-05, + "loss": 2.7218, + "step": 36068 + }, + { + "epoch": 2.23905891116767, + "grad_norm": 0.1386342400536191, + "learning_rate": 1.8351054837966458e-05, + "loss": 2.6822, + "step": 36069 + }, + { + "epoch": 2.239120988267428, + "grad_norm": 0.13724566150020154, + "learning_rate": 1.8348258990308774e-05, + "loss": 2.6777, + "step": 36070 + }, + { + "epoch": 2.239183065367186, + "grad_norm": 0.16821167811162974, + "learning_rate": 1.834546330778589e-05, + "loss": 2.691, + "step": 36071 + }, + { + "epoch": 2.239245142466944, + "grad_norm": 0.1518542605532944, + "learning_rate": 1.8342667790412426e-05, + "loss": 2.6311, + "step": 36072 + }, + { + "epoch": 2.239307219566702, + "grad_norm": 0.14238023742524034, + "learning_rate": 1.8339872438202948e-05, + "loss": 2.7702, + "step": 36073 + }, + { + "epoch": 2.23936929666646, + "grad_norm": 0.15142254586209442, + "learning_rate": 1.833707725117204e-05, + "loss": 2.7128, + "step": 36074 + }, + { + "epoch": 2.2394313737662177, + "grad_norm": 0.1510098885671707, + "learning_rate": 1.833428222933427e-05, + "loss": 2.6848, + "step": 36075 + }, + { + "epoch": 2.2394934508659756, + "grad_norm": 0.14365421140949108, + "learning_rate": 1.8331487372704258e-05, + "loss": 2.7662, + "step": 36076 + }, + { + "epoch": 2.2395555279657335, + "grad_norm": 0.13377898240924402, + "learning_rate": 1.832869268129656e-05, + "loss": 2.7054, + "step": 36077 + }, + { + "epoch": 2.2396176050654915, + "grad_norm": 0.1392122539242014, + "learning_rate": 1.8325898155125758e-05, + "loss": 2.725, + "step": 36078 + }, + { + "epoch": 2.2396796821652494, + "grad_norm": 0.1492785169332487, + "learning_rate": 1.8323103794206442e-05, + "loss": 2.7602, + "step": 36079 + }, + { + "epoch": 2.2397417592650073, + "grad_norm": 0.14374238025124286, + "learning_rate": 1.8320309598553158e-05, + "loss": 2.721, + "step": 36080 + }, + { + "epoch": 2.239803836364765, + "grad_norm": 0.14000065275448803, + "learning_rate": 1.8317515568180532e-05, + "loss": 2.674, + "step": 36081 + }, + { + "epoch": 2.239865913464523, + "grad_norm": 0.1469724914955407, + "learning_rate": 1.831472170310312e-05, + "loss": 2.7544, + "step": 36082 + }, + { + "epoch": 2.239927990564281, + "grad_norm": 0.1437659622742327, + "learning_rate": 1.8311928003335494e-05, + "loss": 2.7022, + "step": 36083 + }, + { + "epoch": 2.2399900676640385, + "grad_norm": 0.13718536993341265, + "learning_rate": 1.8309134468892215e-05, + "loss": 2.6811, + "step": 36084 + }, + { + "epoch": 2.2400521447637964, + "grad_norm": 0.14657715084686274, + "learning_rate": 1.8306341099787894e-05, + "loss": 2.7843, + "step": 36085 + }, + { + "epoch": 2.2401142218635544, + "grad_norm": 0.1386856757267934, + "learning_rate": 1.8303547896037083e-05, + "loss": 2.7628, + "step": 36086 + }, + { + "epoch": 2.2401762989633123, + "grad_norm": 0.14139653852213943, + "learning_rate": 1.8300754857654358e-05, + "loss": 2.6788, + "step": 36087 + }, + { + "epoch": 2.24023837606307, + "grad_norm": 0.14763290281243052, + "learning_rate": 1.829796198465429e-05, + "loss": 2.7852, + "step": 36088 + }, + { + "epoch": 2.240300453162828, + "grad_norm": 0.15565810150258427, + "learning_rate": 1.8295169277051432e-05, + "loss": 2.7278, + "step": 36089 + }, + { + "epoch": 2.240362530262586, + "grad_norm": 0.15941341185022187, + "learning_rate": 1.829237673486039e-05, + "loss": 2.8249, + "step": 36090 + }, + { + "epoch": 2.240424607362344, + "grad_norm": 0.13736689746815006, + "learning_rate": 1.828958435809572e-05, + "loss": 2.7087, + "step": 36091 + }, + { + "epoch": 2.240486684462102, + "grad_norm": 0.1568806557186751, + "learning_rate": 1.828679214677198e-05, + "loss": 2.7476, + "step": 36092 + }, + { + "epoch": 2.24054876156186, + "grad_norm": 0.15048769505787032, + "learning_rate": 1.828400010090373e-05, + "loss": 2.8203, + "step": 36093 + }, + { + "epoch": 2.2406108386616177, + "grad_norm": 0.14258207028299794, + "learning_rate": 1.8281208220505562e-05, + "loss": 2.8035, + "step": 36094 + }, + { + "epoch": 2.2406729157613756, + "grad_norm": 0.14291023740287792, + "learning_rate": 1.8278416505592023e-05, + "loss": 2.8297, + "step": 36095 + }, + { + "epoch": 2.2407349928611335, + "grad_norm": 0.1406340529918661, + "learning_rate": 1.82756249561777e-05, + "loss": 2.7705, + "step": 36096 + }, + { + "epoch": 2.2407970699608915, + "grad_norm": 0.1387907579738772, + "learning_rate": 1.8272833572277142e-05, + "loss": 2.7601, + "step": 36097 + }, + { + "epoch": 2.2408591470606494, + "grad_norm": 0.15020924938807026, + "learning_rate": 1.8270042353904898e-05, + "loss": 2.6862, + "step": 36098 + }, + { + "epoch": 2.2409212241604073, + "grad_norm": 0.1471415085220668, + "learning_rate": 1.826725130107556e-05, + "loss": 2.7484, + "step": 36099 + }, + { + "epoch": 2.240983301260165, + "grad_norm": 0.13757965289582924, + "learning_rate": 1.8264460413803674e-05, + "loss": 2.7249, + "step": 36100 + }, + { + "epoch": 2.241045378359923, + "grad_norm": 0.14777914034724476, + "learning_rate": 1.8261669692103804e-05, + "loss": 2.6914, + "step": 36101 + }, + { + "epoch": 2.241107455459681, + "grad_norm": 0.144259635775355, + "learning_rate": 1.8258879135990508e-05, + "loss": 2.741, + "step": 36102 + }, + { + "epoch": 2.241169532559439, + "grad_norm": 0.15132190888517139, + "learning_rate": 1.825608874547833e-05, + "loss": 2.7337, + "step": 36103 + }, + { + "epoch": 2.241231609659197, + "grad_norm": 0.13965527311014564, + "learning_rate": 1.8253298520581857e-05, + "loss": 2.7624, + "step": 36104 + }, + { + "epoch": 2.241293686758955, + "grad_norm": 0.15925934325570945, + "learning_rate": 1.8250508461315637e-05, + "loss": 2.6748, + "step": 36105 + }, + { + "epoch": 2.2413557638587127, + "grad_norm": 0.1520473457786227, + "learning_rate": 1.8247718567694218e-05, + "loss": 2.6555, + "step": 36106 + }, + { + "epoch": 2.24141784095847, + "grad_norm": 0.16444411626863956, + "learning_rate": 1.8244928839732145e-05, + "loss": 2.687, + "step": 36107 + }, + { + "epoch": 2.2414799180582285, + "grad_norm": 0.1421176254012394, + "learning_rate": 1.8242139277443998e-05, + "loss": 2.706, + "step": 36108 + }, + { + "epoch": 2.241541995157986, + "grad_norm": 0.13777627661353586, + "learning_rate": 1.8239349880844325e-05, + "loss": 2.7161, + "step": 36109 + }, + { + "epoch": 2.241604072257744, + "grad_norm": 0.1571552695473038, + "learning_rate": 1.8236560649947677e-05, + "loss": 2.7517, + "step": 36110 + }, + { + "epoch": 2.241666149357502, + "grad_norm": 0.14972415236422482, + "learning_rate": 1.8233771584768577e-05, + "loss": 2.7065, + "step": 36111 + }, + { + "epoch": 2.2417282264572598, + "grad_norm": 0.14658703068004006, + "learning_rate": 1.8230982685321625e-05, + "loss": 2.7842, + "step": 36112 + }, + { + "epoch": 2.2417903035570177, + "grad_norm": 0.1542957643879383, + "learning_rate": 1.8228193951621347e-05, + "loss": 2.6935, + "step": 36113 + }, + { + "epoch": 2.2418523806567756, + "grad_norm": 0.15941073610723375, + "learning_rate": 1.8225405383682297e-05, + "loss": 2.6982, + "step": 36114 + }, + { + "epoch": 2.2419144577565335, + "grad_norm": 0.14734982099798893, + "learning_rate": 1.822261698151902e-05, + "loss": 2.8001, + "step": 36115 + }, + { + "epoch": 2.2419765348562914, + "grad_norm": 0.15764526627599376, + "learning_rate": 1.8219828745146045e-05, + "loss": 2.7658, + "step": 36116 + }, + { + "epoch": 2.2420386119560494, + "grad_norm": 0.18030174628243567, + "learning_rate": 1.8217040674577952e-05, + "loss": 2.6882, + "step": 36117 + }, + { + "epoch": 2.2421006890558073, + "grad_norm": 0.15342255238873867, + "learning_rate": 1.8214252769829277e-05, + "loss": 2.7021, + "step": 36118 + }, + { + "epoch": 2.242162766155565, + "grad_norm": 0.15309369292669872, + "learning_rate": 1.8211465030914566e-05, + "loss": 2.7122, + "step": 36119 + }, + { + "epoch": 2.242224843255323, + "grad_norm": 0.143705567718165, + "learning_rate": 1.8208677457848333e-05, + "loss": 2.6709, + "step": 36120 + }, + { + "epoch": 2.242286920355081, + "grad_norm": 0.1443622993481056, + "learning_rate": 1.8205890050645175e-05, + "loss": 2.6649, + "step": 36121 + }, + { + "epoch": 2.242348997454839, + "grad_norm": 0.1556609973103123, + "learning_rate": 1.82031028093196e-05, + "loss": 2.6915, + "step": 36122 + }, + { + "epoch": 2.242411074554597, + "grad_norm": 0.14803908863577728, + "learning_rate": 1.8200315733886154e-05, + "loss": 2.6559, + "step": 36123 + }, + { + "epoch": 2.242473151654355, + "grad_norm": 0.1426431363963097, + "learning_rate": 1.8197528824359383e-05, + "loss": 2.726, + "step": 36124 + }, + { + "epoch": 2.2425352287541127, + "grad_norm": 0.14176208342496108, + "learning_rate": 1.8194742080753813e-05, + "loss": 2.722, + "step": 36125 + }, + { + "epoch": 2.2425973058538706, + "grad_norm": 0.1412592927758023, + "learning_rate": 1.8191955503084006e-05, + "loss": 2.7159, + "step": 36126 + }, + { + "epoch": 2.2426593829536285, + "grad_norm": 0.1392070824992607, + "learning_rate": 1.8189169091364488e-05, + "loss": 2.6434, + "step": 36127 + }, + { + "epoch": 2.2427214600533865, + "grad_norm": 0.13716333854298687, + "learning_rate": 1.8186382845609783e-05, + "loss": 2.6117, + "step": 36128 + }, + { + "epoch": 2.2427835371531444, + "grad_norm": 0.16781349547675187, + "learning_rate": 1.818359676583446e-05, + "loss": 2.5971, + "step": 36129 + }, + { + "epoch": 2.2428456142529023, + "grad_norm": 0.14166341943268285, + "learning_rate": 1.8180810852053015e-05, + "loss": 2.7972, + "step": 36130 + }, + { + "epoch": 2.24290769135266, + "grad_norm": 0.14856239134367683, + "learning_rate": 1.817802510428002e-05, + "loss": 2.6951, + "step": 36131 + }, + { + "epoch": 2.2429697684524177, + "grad_norm": 0.1442165169796148, + "learning_rate": 1.8175239522530002e-05, + "loss": 2.7146, + "step": 36132 + }, + { + "epoch": 2.2430318455521756, + "grad_norm": 0.14052638596346842, + "learning_rate": 1.817245410681748e-05, + "loss": 2.7043, + "step": 36133 + }, + { + "epoch": 2.2430939226519335, + "grad_norm": 0.14154590302030562, + "learning_rate": 1.8169668857156975e-05, + "loss": 2.5901, + "step": 36134 + }, + { + "epoch": 2.2431559997516914, + "grad_norm": 0.14978181547507455, + "learning_rate": 1.8166883773563048e-05, + "loss": 2.6704, + "step": 36135 + }, + { + "epoch": 2.2432180768514494, + "grad_norm": 0.1436142570787218, + "learning_rate": 1.8164098856050215e-05, + "loss": 2.8365, + "step": 36136 + }, + { + "epoch": 2.2432801539512073, + "grad_norm": 0.13812768297049408, + "learning_rate": 1.8161314104633014e-05, + "loss": 2.6942, + "step": 36137 + }, + { + "epoch": 2.243342231050965, + "grad_norm": 0.1385569364753989, + "learning_rate": 1.815852951932596e-05, + "loss": 2.627, + "step": 36138 + }, + { + "epoch": 2.243404308150723, + "grad_norm": 0.14642120906778947, + "learning_rate": 1.8155745100143573e-05, + "loss": 2.6571, + "step": 36139 + }, + { + "epoch": 2.243466385250481, + "grad_norm": 0.14895341335409976, + "learning_rate": 1.8152960847100413e-05, + "loss": 2.6838, + "step": 36140 + }, + { + "epoch": 2.243528462350239, + "grad_norm": 0.13969139958812032, + "learning_rate": 1.8150176760210985e-05, + "loss": 2.7543, + "step": 36141 + }, + { + "epoch": 2.243590539449997, + "grad_norm": 0.14009807465885807, + "learning_rate": 1.8147392839489817e-05, + "loss": 2.7727, + "step": 36142 + }, + { + "epoch": 2.243652616549755, + "grad_norm": 0.1417335158523797, + "learning_rate": 1.8144609084951418e-05, + "loss": 2.6797, + "step": 36143 + }, + { + "epoch": 2.2437146936495127, + "grad_norm": 0.14011825546227744, + "learning_rate": 1.8141825496610343e-05, + "loss": 2.6246, + "step": 36144 + }, + { + "epoch": 2.2437767707492706, + "grad_norm": 0.18183124958227537, + "learning_rate": 1.8139042074481093e-05, + "loss": 2.6702, + "step": 36145 + }, + { + "epoch": 2.2438388478490285, + "grad_norm": 0.1918866687357813, + "learning_rate": 1.81362588185782e-05, + "loss": 2.8079, + "step": 36146 + }, + { + "epoch": 2.2439009249487865, + "grad_norm": 0.1572731079485435, + "learning_rate": 1.8133475728916165e-05, + "loss": 2.7388, + "step": 36147 + }, + { + "epoch": 2.2439630020485444, + "grad_norm": 0.18304103480557868, + "learning_rate": 1.8130692805509536e-05, + "loss": 2.7311, + "step": 36148 + }, + { + "epoch": 2.2440250791483023, + "grad_norm": 0.1677068731489252, + "learning_rate": 1.812791004837282e-05, + "loss": 2.745, + "step": 36149 + }, + { + "epoch": 2.24408715624806, + "grad_norm": 0.16512356237049905, + "learning_rate": 1.8125127457520534e-05, + "loss": 2.6877, + "step": 36150 + }, + { + "epoch": 2.244149233347818, + "grad_norm": 0.14184813242947844, + "learning_rate": 1.8122345032967197e-05, + "loss": 2.7871, + "step": 36151 + }, + { + "epoch": 2.244211310447576, + "grad_norm": 0.14455109537330435, + "learning_rate": 1.8119562774727312e-05, + "loss": 2.691, + "step": 36152 + }, + { + "epoch": 2.244273387547334, + "grad_norm": 0.13551918758396808, + "learning_rate": 1.8116780682815415e-05, + "loss": 2.6737, + "step": 36153 + }, + { + "epoch": 2.244335464647092, + "grad_norm": 0.14934440310403851, + "learning_rate": 1.811399875724602e-05, + "loss": 2.7286, + "step": 36154 + }, + { + "epoch": 2.2443975417468494, + "grad_norm": 0.15018992842212317, + "learning_rate": 1.811121699803363e-05, + "loss": 2.738, + "step": 36155 + }, + { + "epoch": 2.2444596188466077, + "grad_norm": 0.14266416488032968, + "learning_rate": 1.810843540519275e-05, + "loss": 2.6996, + "step": 36156 + }, + { + "epoch": 2.244521695946365, + "grad_norm": 0.14354742122607259, + "learning_rate": 1.8105653978737917e-05, + "loss": 2.7116, + "step": 36157 + }, + { + "epoch": 2.244583773046123, + "grad_norm": 0.1414420907033691, + "learning_rate": 1.8102872718683633e-05, + "loss": 2.7243, + "step": 36158 + }, + { + "epoch": 2.244645850145881, + "grad_norm": 0.1431285475828456, + "learning_rate": 1.81000916250444e-05, + "loss": 2.751, + "step": 36159 + }, + { + "epoch": 2.244707927245639, + "grad_norm": 0.13866295584350366, + "learning_rate": 1.8097310697834725e-05, + "loss": 2.7241, + "step": 36160 + }, + { + "epoch": 2.244770004345397, + "grad_norm": 0.141244187404236, + "learning_rate": 1.8094529937069114e-05, + "loss": 2.6905, + "step": 36161 + }, + { + "epoch": 2.244832081445155, + "grad_norm": 0.14073856060046197, + "learning_rate": 1.809174934276211e-05, + "loss": 2.7264, + "step": 36162 + }, + { + "epoch": 2.2448941585449127, + "grad_norm": 0.14456327830953242, + "learning_rate": 1.8088968914928196e-05, + "loss": 2.8284, + "step": 36163 + }, + { + "epoch": 2.2449562356446706, + "grad_norm": 0.1357944305659908, + "learning_rate": 1.808618865358187e-05, + "loss": 2.7094, + "step": 36164 + }, + { + "epoch": 2.2450183127444285, + "grad_norm": 0.13845715891234478, + "learning_rate": 1.8083408558737652e-05, + "loss": 2.6717, + "step": 36165 + }, + { + "epoch": 2.2450803898441865, + "grad_norm": 0.14227251063426138, + "learning_rate": 1.8080628630410025e-05, + "loss": 2.7554, + "step": 36166 + }, + { + "epoch": 2.2451424669439444, + "grad_norm": 0.1572239502399542, + "learning_rate": 1.8077848868613517e-05, + "loss": 2.8049, + "step": 36167 + }, + { + "epoch": 2.2452045440437023, + "grad_norm": 0.14873318334695798, + "learning_rate": 1.8075069273362626e-05, + "loss": 2.7647, + "step": 36168 + }, + { + "epoch": 2.24526662114346, + "grad_norm": 0.14075974790279266, + "learning_rate": 1.8072289844671846e-05, + "loss": 2.7136, + "step": 36169 + }, + { + "epoch": 2.245328698243218, + "grad_norm": 0.14210966306282866, + "learning_rate": 1.8069510582555665e-05, + "loss": 2.7635, + "step": 36170 + }, + { + "epoch": 2.245390775342976, + "grad_norm": 0.14597260404205756, + "learning_rate": 1.8066731487028616e-05, + "loss": 2.6466, + "step": 36171 + }, + { + "epoch": 2.245452852442734, + "grad_norm": 0.1567132638710771, + "learning_rate": 1.8063952558105178e-05, + "loss": 2.7252, + "step": 36172 + }, + { + "epoch": 2.245514929542492, + "grad_norm": 0.14399131883383307, + "learning_rate": 1.8061173795799853e-05, + "loss": 2.7022, + "step": 36173 + }, + { + "epoch": 2.24557700664225, + "grad_norm": 0.13729889261923697, + "learning_rate": 1.805839520012714e-05, + "loss": 2.713, + "step": 36174 + }, + { + "epoch": 2.2456390837420077, + "grad_norm": 0.14517926881548424, + "learning_rate": 1.8055616771101513e-05, + "loss": 2.7183, + "step": 36175 + }, + { + "epoch": 2.2457011608417656, + "grad_norm": 0.15272360297849055, + "learning_rate": 1.8052838508737507e-05, + "loss": 2.7171, + "step": 36176 + }, + { + "epoch": 2.2457632379415235, + "grad_norm": 0.1536268826020915, + "learning_rate": 1.8050060413049602e-05, + "loss": 2.622, + "step": 36177 + }, + { + "epoch": 2.2458253150412815, + "grad_norm": 0.1489044862124366, + "learning_rate": 1.8047282484052282e-05, + "loss": 2.6144, + "step": 36178 + }, + { + "epoch": 2.2458873921410394, + "grad_norm": 0.17184191081839958, + "learning_rate": 1.804450472176003e-05, + "loss": 2.7049, + "step": 36179 + }, + { + "epoch": 2.245949469240797, + "grad_norm": 0.1472495186596275, + "learning_rate": 1.8041727126187375e-05, + "loss": 2.7508, + "step": 36180 + }, + { + "epoch": 2.2460115463405548, + "grad_norm": 0.1725991070375489, + "learning_rate": 1.8038949697348783e-05, + "loss": 2.6883, + "step": 36181 + }, + { + "epoch": 2.2460736234403127, + "grad_norm": 0.13958079693990216, + "learning_rate": 1.803617243525875e-05, + "loss": 2.8335, + "step": 36182 + }, + { + "epoch": 2.2461357005400706, + "grad_norm": 0.16670952124288807, + "learning_rate": 1.8033395339931752e-05, + "loss": 2.7704, + "step": 36183 + }, + { + "epoch": 2.2461977776398285, + "grad_norm": 0.15242913494290522, + "learning_rate": 1.8030618411382306e-05, + "loss": 2.7554, + "step": 36184 + }, + { + "epoch": 2.2462598547395864, + "grad_norm": 0.1484649093318249, + "learning_rate": 1.802784164962488e-05, + "loss": 2.5229, + "step": 36185 + }, + { + "epoch": 2.2463219318393444, + "grad_norm": 0.1420203691244781, + "learning_rate": 1.8025065054673973e-05, + "loss": 2.7751, + "step": 36186 + }, + { + "epoch": 2.2463840089391023, + "grad_norm": 0.14370494556300345, + "learning_rate": 1.802228862654406e-05, + "loss": 2.6937, + "step": 36187 + }, + { + "epoch": 2.24644608603886, + "grad_norm": 0.15059195084693408, + "learning_rate": 1.8019512365249615e-05, + "loss": 2.6631, + "step": 36188 + }, + { + "epoch": 2.246508163138618, + "grad_norm": 0.145107229925694, + "learning_rate": 1.8016736270805153e-05, + "loss": 2.7384, + "step": 36189 + }, + { + "epoch": 2.246570240238376, + "grad_norm": 0.14478074911406782, + "learning_rate": 1.8013960343225138e-05, + "loss": 2.7194, + "step": 36190 + }, + { + "epoch": 2.246632317338134, + "grad_norm": 0.15466519349069052, + "learning_rate": 1.8011184582524064e-05, + "loss": 2.7453, + "step": 36191 + }, + { + "epoch": 2.246694394437892, + "grad_norm": 0.14326701892287413, + "learning_rate": 1.8008408988716386e-05, + "loss": 2.7589, + "step": 36192 + }, + { + "epoch": 2.24675647153765, + "grad_norm": 0.1427059580322154, + "learning_rate": 1.8005633561816604e-05, + "loss": 2.638, + "step": 36193 + }, + { + "epoch": 2.2468185486374077, + "grad_norm": 0.14147608887048976, + "learning_rate": 1.8002858301839213e-05, + "loss": 2.75, + "step": 36194 + }, + { + "epoch": 2.2468806257371656, + "grad_norm": 0.14291896969628243, + "learning_rate": 1.800008320879868e-05, + "loss": 2.6311, + "step": 36195 + }, + { + "epoch": 2.2469427028369235, + "grad_norm": 0.1681947739898298, + "learning_rate": 1.799730828270948e-05, + "loss": 2.6924, + "step": 36196 + }, + { + "epoch": 2.2470047799366815, + "grad_norm": 0.14162612570452454, + "learning_rate": 1.7994533523586076e-05, + "loss": 2.6607, + "step": 36197 + }, + { + "epoch": 2.2470668570364394, + "grad_norm": 0.1442060162015402, + "learning_rate": 1.799175893144298e-05, + "loss": 2.8472, + "step": 36198 + }, + { + "epoch": 2.2471289341361973, + "grad_norm": 0.14078574794734808, + "learning_rate": 1.798898450629464e-05, + "loss": 2.6843, + "step": 36199 + }, + { + "epoch": 2.247191011235955, + "grad_norm": 0.13304305400950378, + "learning_rate": 1.798621024815555e-05, + "loss": 2.6858, + "step": 36200 + }, + { + "epoch": 2.247253088335713, + "grad_norm": 0.14684489707677353, + "learning_rate": 1.7983436157040162e-05, + "loss": 2.6669, + "step": 36201 + }, + { + "epoch": 2.247315165435471, + "grad_norm": 0.14622731860255214, + "learning_rate": 1.7980662232962953e-05, + "loss": 2.8115, + "step": 36202 + }, + { + "epoch": 2.2473772425352285, + "grad_norm": 0.14773061232109863, + "learning_rate": 1.7977888475938408e-05, + "loss": 2.7337, + "step": 36203 + }, + { + "epoch": 2.247439319634987, + "grad_norm": 0.1428568806230744, + "learning_rate": 1.7975114885980998e-05, + "loss": 2.6549, + "step": 36204 + }, + { + "epoch": 2.2475013967347444, + "grad_norm": 0.14381395658694568, + "learning_rate": 1.797234146310519e-05, + "loss": 2.6904, + "step": 36205 + }, + { + "epoch": 2.2475634738345023, + "grad_norm": 0.1596776393563244, + "learning_rate": 1.7969568207325433e-05, + "loss": 2.7036, + "step": 36206 + }, + { + "epoch": 2.24762555093426, + "grad_norm": 0.14005692632744152, + "learning_rate": 1.7966795118656233e-05, + "loss": 2.6692, + "step": 36207 + }, + { + "epoch": 2.247687628034018, + "grad_norm": 0.14583389815720668, + "learning_rate": 1.7964022197112034e-05, + "loss": 2.7122, + "step": 36208 + }, + { + "epoch": 2.247749705133776, + "grad_norm": 0.14482237518227792, + "learning_rate": 1.796124944270731e-05, + "loss": 2.6807, + "step": 36209 + }, + { + "epoch": 2.247811782233534, + "grad_norm": 0.14634173123777483, + "learning_rate": 1.795847685545653e-05, + "loss": 2.6798, + "step": 36210 + }, + { + "epoch": 2.247873859333292, + "grad_norm": 0.14553727741304706, + "learning_rate": 1.795570443537413e-05, + "loss": 2.6597, + "step": 36211 + }, + { + "epoch": 2.24793593643305, + "grad_norm": 0.1538817398286447, + "learning_rate": 1.7952932182474624e-05, + "loss": 2.7285, + "step": 36212 + }, + { + "epoch": 2.2479980135328077, + "grad_norm": 0.1564414639422253, + "learning_rate": 1.7950160096772444e-05, + "loss": 2.6949, + "step": 36213 + }, + { + "epoch": 2.2480600906325656, + "grad_norm": 0.14015246179919613, + "learning_rate": 1.7947388178282066e-05, + "loss": 2.8084, + "step": 36214 + }, + { + "epoch": 2.2481221677323235, + "grad_norm": 0.1457881083924601, + "learning_rate": 1.7944616427017923e-05, + "loss": 2.7083, + "step": 36215 + }, + { + "epoch": 2.2481842448320815, + "grad_norm": 0.14565486645691084, + "learning_rate": 1.7941844842994516e-05, + "loss": 2.725, + "step": 36216 + }, + { + "epoch": 2.2482463219318394, + "grad_norm": 0.14099517420363067, + "learning_rate": 1.7939073426226283e-05, + "loss": 2.6534, + "step": 36217 + }, + { + "epoch": 2.2483083990315973, + "grad_norm": 0.13575955149512633, + "learning_rate": 1.7936302176727694e-05, + "loss": 2.7474, + "step": 36218 + }, + { + "epoch": 2.248370476131355, + "grad_norm": 0.150500010058729, + "learning_rate": 1.7933531094513178e-05, + "loss": 2.7555, + "step": 36219 + }, + { + "epoch": 2.248432553231113, + "grad_norm": 0.17208570872607806, + "learning_rate": 1.7930760179597234e-05, + "loss": 2.7608, + "step": 36220 + }, + { + "epoch": 2.248494630330871, + "grad_norm": 0.1506810789460864, + "learning_rate": 1.7927989431994296e-05, + "loss": 2.6553, + "step": 36221 + }, + { + "epoch": 2.248556707430629, + "grad_norm": 0.15215384433330834, + "learning_rate": 1.7925218851718827e-05, + "loss": 2.6574, + "step": 36222 + }, + { + "epoch": 2.248618784530387, + "grad_norm": 0.16096352440867331, + "learning_rate": 1.7922448438785277e-05, + "loss": 2.763, + "step": 36223 + }, + { + "epoch": 2.248680861630145, + "grad_norm": 0.15826104716789666, + "learning_rate": 1.7919678193208084e-05, + "loss": 2.6836, + "step": 36224 + }, + { + "epoch": 2.2487429387299027, + "grad_norm": 0.1414474042786771, + "learning_rate": 1.791690811500173e-05, + "loss": 2.7456, + "step": 36225 + }, + { + "epoch": 2.2488050158296606, + "grad_norm": 0.14023826986870505, + "learning_rate": 1.7914138204180642e-05, + "loss": 2.7697, + "step": 36226 + }, + { + "epoch": 2.2488670929294186, + "grad_norm": 0.14853365193244158, + "learning_rate": 1.7911368460759305e-05, + "loss": 2.7293, + "step": 36227 + }, + { + "epoch": 2.248929170029176, + "grad_norm": 0.14766633968091925, + "learning_rate": 1.7908598884752142e-05, + "loss": 2.7623, + "step": 36228 + }, + { + "epoch": 2.248991247128934, + "grad_norm": 0.13923095106997016, + "learning_rate": 1.790582947617359e-05, + "loss": 2.7377, + "step": 36229 + }, + { + "epoch": 2.249053324228692, + "grad_norm": 0.1426978523055518, + "learning_rate": 1.790306023503814e-05, + "loss": 2.7466, + "step": 36230 + }, + { + "epoch": 2.24911540132845, + "grad_norm": 0.15121543414427416, + "learning_rate": 1.790029116136021e-05, + "loss": 2.6963, + "step": 36231 + }, + { + "epoch": 2.2491774784282077, + "grad_norm": 0.1370120036964417, + "learning_rate": 1.789752225515426e-05, + "loss": 2.6621, + "step": 36232 + }, + { + "epoch": 2.2492395555279656, + "grad_norm": 0.1528607301914192, + "learning_rate": 1.789475351643471e-05, + "loss": 2.7137, + "step": 36233 + }, + { + "epoch": 2.2493016326277235, + "grad_norm": 0.1583948563007458, + "learning_rate": 1.7891984945216043e-05, + "loss": 2.7209, + "step": 36234 + }, + { + "epoch": 2.2493637097274815, + "grad_norm": 0.14493504384712153, + "learning_rate": 1.788921654151268e-05, + "loss": 2.6853, + "step": 36235 + }, + { + "epoch": 2.2494257868272394, + "grad_norm": 0.15735361048458169, + "learning_rate": 1.7886448305339077e-05, + "loss": 2.7189, + "step": 36236 + }, + { + "epoch": 2.2494878639269973, + "grad_norm": 0.16403462536479083, + "learning_rate": 1.7883680236709664e-05, + "loss": 2.7494, + "step": 36237 + }, + { + "epoch": 2.249549941026755, + "grad_norm": 0.1618434666975289, + "learning_rate": 1.7880912335638872e-05, + "loss": 2.7749, + "step": 36238 + }, + { + "epoch": 2.249612018126513, + "grad_norm": 0.13806114567739544, + "learning_rate": 1.7878144602141178e-05, + "loss": 2.7339, + "step": 36239 + }, + { + "epoch": 2.249674095226271, + "grad_norm": 0.1512663043609485, + "learning_rate": 1.7875377036230996e-05, + "loss": 2.7273, + "step": 36240 + }, + { + "epoch": 2.249736172326029, + "grad_norm": 0.14593133798002142, + "learning_rate": 1.7872609637922773e-05, + "loss": 2.792, + "step": 36241 + }, + { + "epoch": 2.249798249425787, + "grad_norm": 0.14248654285266382, + "learning_rate": 1.786984240723093e-05, + "loss": 2.6574, + "step": 36242 + }, + { + "epoch": 2.249860326525545, + "grad_norm": 0.13957416453305344, + "learning_rate": 1.7867075344169933e-05, + "loss": 2.7014, + "step": 36243 + }, + { + "epoch": 2.2499224036253027, + "grad_norm": 0.14067224530140893, + "learning_rate": 1.7864308448754203e-05, + "loss": 2.6834, + "step": 36244 + }, + { + "epoch": 2.2499844807250606, + "grad_norm": 0.14932859382264418, + "learning_rate": 1.7861541720998182e-05, + "loss": 2.7144, + "step": 36245 + }, + { + "epoch": 2.2500465578248185, + "grad_norm": 0.14759759065758596, + "learning_rate": 1.785877516091628e-05, + "loss": 2.7205, + "step": 36246 + }, + { + "epoch": 2.2501086349245765, + "grad_norm": 0.14645075730168364, + "learning_rate": 1.7856008768522963e-05, + "loss": 2.7661, + "step": 36247 + }, + { + "epoch": 2.2501707120243344, + "grad_norm": 0.14968263261801498, + "learning_rate": 1.785324254383266e-05, + "loss": 2.7328, + "step": 36248 + }, + { + "epoch": 2.2502327891240923, + "grad_norm": 0.1599943342733092, + "learning_rate": 1.7850476486859785e-05, + "loss": 2.7068, + "step": 36249 + }, + { + "epoch": 2.25029486622385, + "grad_norm": 0.1397458983499266, + "learning_rate": 1.784771059761879e-05, + "loss": 2.6941, + "step": 36250 + }, + { + "epoch": 2.2503569433236077, + "grad_norm": 0.15377877567023912, + "learning_rate": 1.784494487612407e-05, + "loss": 2.8272, + "step": 36251 + }, + { + "epoch": 2.250419020423366, + "grad_norm": 0.14270360206733937, + "learning_rate": 1.7842179322390097e-05, + "loss": 2.6885, + "step": 36252 + }, + { + "epoch": 2.2504810975231235, + "grad_norm": 0.15825857556453782, + "learning_rate": 1.7839413936431277e-05, + "loss": 2.6698, + "step": 36253 + }, + { + "epoch": 2.2505431746228814, + "grad_norm": 0.14395380008058498, + "learning_rate": 1.7836648718262044e-05, + "loss": 2.8051, + "step": 36254 + }, + { + "epoch": 2.2506052517226394, + "grad_norm": 0.14432341865893694, + "learning_rate": 1.783388366789681e-05, + "loss": 2.6694, + "step": 36255 + }, + { + "epoch": 2.2506673288223973, + "grad_norm": 0.1411618739745989, + "learning_rate": 1.7831118785350025e-05, + "loss": 2.7073, + "step": 36256 + }, + { + "epoch": 2.250729405922155, + "grad_norm": 0.18075778695173222, + "learning_rate": 1.7828354070636105e-05, + "loss": 2.6785, + "step": 36257 + }, + { + "epoch": 2.250791483021913, + "grad_norm": 0.14037320986928548, + "learning_rate": 1.7825589523769475e-05, + "loss": 2.8098, + "step": 36258 + }, + { + "epoch": 2.250853560121671, + "grad_norm": 0.1543440927616383, + "learning_rate": 1.782282514476453e-05, + "loss": 2.7837, + "step": 36259 + }, + { + "epoch": 2.250915637221429, + "grad_norm": 0.13502116784619037, + "learning_rate": 1.782006093363574e-05, + "loss": 2.7177, + "step": 36260 + }, + { + "epoch": 2.250977714321187, + "grad_norm": 0.1403796851067565, + "learning_rate": 1.781729689039749e-05, + "loss": 2.6414, + "step": 36261 + }, + { + "epoch": 2.251039791420945, + "grad_norm": 0.19568294680003287, + "learning_rate": 1.7814533015064227e-05, + "loss": 2.7639, + "step": 36262 + }, + { + "epoch": 2.2511018685207027, + "grad_norm": 0.15914142476416013, + "learning_rate": 1.7811769307650362e-05, + "loss": 2.6964, + "step": 36263 + }, + { + "epoch": 2.2511639456204606, + "grad_norm": 0.15336126035832442, + "learning_rate": 1.7809005768170306e-05, + "loss": 2.7638, + "step": 36264 + }, + { + "epoch": 2.2512260227202185, + "grad_norm": 0.1434644363496832, + "learning_rate": 1.780624239663847e-05, + "loss": 2.8205, + "step": 36265 + }, + { + "epoch": 2.2512880998199765, + "grad_norm": 0.15374030762065932, + "learning_rate": 1.7803479193069294e-05, + "loss": 2.7229, + "step": 36266 + }, + { + "epoch": 2.2513501769197344, + "grad_norm": 0.13520606258608406, + "learning_rate": 1.780071615747718e-05, + "loss": 2.6649, + "step": 36267 + }, + { + "epoch": 2.2514122540194923, + "grad_norm": 0.1431905708907806, + "learning_rate": 1.7797953289876558e-05, + "loss": 2.7364, + "step": 36268 + }, + { + "epoch": 2.25147433111925, + "grad_norm": 0.13980968209548866, + "learning_rate": 1.7795190590281806e-05, + "loss": 2.6661, + "step": 36269 + }, + { + "epoch": 2.251536408219008, + "grad_norm": 0.1445926136481699, + "learning_rate": 1.7792428058707382e-05, + "loss": 2.7186, + "step": 36270 + }, + { + "epoch": 2.251598485318766, + "grad_norm": 0.1435338932988281, + "learning_rate": 1.778966569516768e-05, + "loss": 2.6317, + "step": 36271 + }, + { + "epoch": 2.251660562418524, + "grad_norm": 0.13833256297466726, + "learning_rate": 1.778690349967711e-05, + "loss": 2.6609, + "step": 36272 + }, + { + "epoch": 2.251722639518282, + "grad_norm": 0.13651076573188142, + "learning_rate": 1.778414147225008e-05, + "loss": 2.6022, + "step": 36273 + }, + { + "epoch": 2.2517847166180394, + "grad_norm": 0.1478582171576496, + "learning_rate": 1.7781379612900994e-05, + "loss": 2.7471, + "step": 36274 + }, + { + "epoch": 2.2518467937177977, + "grad_norm": 0.1405049314778867, + "learning_rate": 1.777861792164429e-05, + "loss": 2.678, + "step": 36275 + }, + { + "epoch": 2.251908870817555, + "grad_norm": 0.1719023346839067, + "learning_rate": 1.7775856398494355e-05, + "loss": 2.7481, + "step": 36276 + }, + { + "epoch": 2.251970947917313, + "grad_norm": 0.15309161849016567, + "learning_rate": 1.7773095043465598e-05, + "loss": 2.6896, + "step": 36277 + }, + { + "epoch": 2.252033025017071, + "grad_norm": 0.1433526128478574, + "learning_rate": 1.7770333856572417e-05, + "loss": 2.6872, + "step": 36278 + }, + { + "epoch": 2.252095102116829, + "grad_norm": 0.16696570002315284, + "learning_rate": 1.776757283782924e-05, + "loss": 2.6915, + "step": 36279 + }, + { + "epoch": 2.252157179216587, + "grad_norm": 0.1395454960990815, + "learning_rate": 1.7764811987250463e-05, + "loss": 2.7156, + "step": 36280 + }, + { + "epoch": 2.252219256316345, + "grad_norm": 0.14290973560562778, + "learning_rate": 1.7762051304850485e-05, + "loss": 2.6571, + "step": 36281 + }, + { + "epoch": 2.2522813334161027, + "grad_norm": 0.15339381891713874, + "learning_rate": 1.7759290790643694e-05, + "loss": 2.6324, + "step": 36282 + }, + { + "epoch": 2.2523434105158606, + "grad_norm": 0.14640237420406282, + "learning_rate": 1.7756530444644532e-05, + "loss": 2.7823, + "step": 36283 + }, + { + "epoch": 2.2524054876156185, + "grad_norm": 0.15460578561541843, + "learning_rate": 1.775377026686737e-05, + "loss": 2.6476, + "step": 36284 + }, + { + "epoch": 2.2524675647153765, + "grad_norm": 0.1395400834037401, + "learning_rate": 1.775101025732662e-05, + "loss": 2.7342, + "step": 36285 + }, + { + "epoch": 2.2525296418151344, + "grad_norm": 0.14006390158442655, + "learning_rate": 1.7748250416036682e-05, + "loss": 2.7262, + "step": 36286 + }, + { + "epoch": 2.2525917189148923, + "grad_norm": 0.144219636067141, + "learning_rate": 1.774549074301193e-05, + "loss": 2.7626, + "step": 36287 + }, + { + "epoch": 2.25265379601465, + "grad_norm": 0.16432747559991748, + "learning_rate": 1.7742731238266797e-05, + "loss": 2.76, + "step": 36288 + }, + { + "epoch": 2.252715873114408, + "grad_norm": 0.16237795652975187, + "learning_rate": 1.773997190181567e-05, + "loss": 2.8274, + "step": 36289 + }, + { + "epoch": 2.252777950214166, + "grad_norm": 0.14063823823076466, + "learning_rate": 1.7737212733672942e-05, + "loss": 2.6656, + "step": 36290 + }, + { + "epoch": 2.252840027313924, + "grad_norm": 0.1734218349771224, + "learning_rate": 1.773445373385299e-05, + "loss": 2.6892, + "step": 36291 + }, + { + "epoch": 2.252902104413682, + "grad_norm": 0.1397731797408536, + "learning_rate": 1.773169490237022e-05, + "loss": 2.7076, + "step": 36292 + }, + { + "epoch": 2.25296418151344, + "grad_norm": 0.1454629188766113, + "learning_rate": 1.7728936239239053e-05, + "loss": 2.7267, + "step": 36293 + }, + { + "epoch": 2.2530262586131977, + "grad_norm": 0.16475488246953474, + "learning_rate": 1.772617774447386e-05, + "loss": 2.7636, + "step": 36294 + }, + { + "epoch": 2.2530883357129556, + "grad_norm": 0.13550445266565517, + "learning_rate": 1.772341941808903e-05, + "loss": 2.7044, + "step": 36295 + }, + { + "epoch": 2.2531504128127136, + "grad_norm": 0.1527396584106745, + "learning_rate": 1.772066126009894e-05, + "loss": 2.7877, + "step": 36296 + }, + { + "epoch": 2.2532124899124715, + "grad_norm": 0.15279262997641999, + "learning_rate": 1.7717903270518017e-05, + "loss": 2.7157, + "step": 36297 + }, + { + "epoch": 2.2532745670122294, + "grad_norm": 0.1381254039948368, + "learning_rate": 1.7715145449360627e-05, + "loss": 2.6875, + "step": 36298 + }, + { + "epoch": 2.253336644111987, + "grad_norm": 0.1545214768724163, + "learning_rate": 1.7712387796641163e-05, + "loss": 2.672, + "step": 36299 + }, + { + "epoch": 2.2533987212117452, + "grad_norm": 0.14599978081885961, + "learning_rate": 1.7709630312374004e-05, + "loss": 2.6133, + "step": 36300 + }, + { + "epoch": 2.2534607983115027, + "grad_norm": 0.159777122774021, + "learning_rate": 1.7706872996573526e-05, + "loss": 2.7108, + "step": 36301 + }, + { + "epoch": 2.2535228754112606, + "grad_norm": 0.1480906474643141, + "learning_rate": 1.770411584925415e-05, + "loss": 2.6336, + "step": 36302 + }, + { + "epoch": 2.2535849525110185, + "grad_norm": 0.15236044449910563, + "learning_rate": 1.7701358870430245e-05, + "loss": 2.7502, + "step": 36303 + }, + { + "epoch": 2.2536470296107765, + "grad_norm": 0.15983305862707783, + "learning_rate": 1.7698602060116187e-05, + "loss": 2.6602, + "step": 36304 + }, + { + "epoch": 2.2537091067105344, + "grad_norm": 0.14395928619136586, + "learning_rate": 1.769584541832635e-05, + "loss": 2.7061, + "step": 36305 + }, + { + "epoch": 2.2537711838102923, + "grad_norm": 0.14531181883908664, + "learning_rate": 1.7693088945075143e-05, + "loss": 2.7765, + "step": 36306 + }, + { + "epoch": 2.25383326091005, + "grad_norm": 0.1402049546474703, + "learning_rate": 1.7690332640376932e-05, + "loss": 2.7543, + "step": 36307 + }, + { + "epoch": 2.253895338009808, + "grad_norm": 0.14491875354664674, + "learning_rate": 1.7687576504246107e-05, + "loss": 2.6766, + "step": 36308 + }, + { + "epoch": 2.253957415109566, + "grad_norm": 0.14511428332086138, + "learning_rate": 1.7684820536697035e-05, + "loss": 2.7566, + "step": 36309 + }, + { + "epoch": 2.254019492209324, + "grad_norm": 0.16160102212514474, + "learning_rate": 1.768206473774408e-05, + "loss": 2.7154, + "step": 36310 + }, + { + "epoch": 2.254081569309082, + "grad_norm": 0.14156385116900735, + "learning_rate": 1.767930910740166e-05, + "loss": 2.7687, + "step": 36311 + }, + { + "epoch": 2.25414364640884, + "grad_norm": 0.14105012001538234, + "learning_rate": 1.767655364568413e-05, + "loss": 2.5854, + "step": 36312 + }, + { + "epoch": 2.2542057235085977, + "grad_norm": 0.1428576694525652, + "learning_rate": 1.7673798352605865e-05, + "loss": 2.7502, + "step": 36313 + }, + { + "epoch": 2.2542678006083556, + "grad_norm": 0.1473121840138176, + "learning_rate": 1.767104322818122e-05, + "loss": 2.6305, + "step": 36314 + }, + { + "epoch": 2.2543298777081135, + "grad_norm": 0.1388843346054375, + "learning_rate": 1.766828827242461e-05, + "loss": 2.8247, + "step": 36315 + }, + { + "epoch": 2.2543919548078715, + "grad_norm": 0.1389426019901279, + "learning_rate": 1.7665533485350393e-05, + "loss": 2.7376, + "step": 36316 + }, + { + "epoch": 2.2544540319076294, + "grad_norm": 0.14547021751401118, + "learning_rate": 1.7662778866972936e-05, + "loss": 2.6113, + "step": 36317 + }, + { + "epoch": 2.2545161090073873, + "grad_norm": 0.13481266472337303, + "learning_rate": 1.7660024417306593e-05, + "loss": 2.681, + "step": 36318 + }, + { + "epoch": 2.254578186107145, + "grad_norm": 0.1495699972616138, + "learning_rate": 1.765727013636577e-05, + "loss": 2.803, + "step": 36319 + }, + { + "epoch": 2.254640263206903, + "grad_norm": 0.1407930112238107, + "learning_rate": 1.7654516024164826e-05, + "loss": 2.7667, + "step": 36320 + }, + { + "epoch": 2.254702340306661, + "grad_norm": 0.15606105228091732, + "learning_rate": 1.7651762080718115e-05, + "loss": 2.6985, + "step": 36321 + }, + { + "epoch": 2.2547644174064185, + "grad_norm": 0.1402808322301173, + "learning_rate": 1.7649008306040022e-05, + "loss": 2.7123, + "step": 36322 + }, + { + "epoch": 2.254826494506177, + "grad_norm": 0.1458791628135747, + "learning_rate": 1.7646254700144883e-05, + "loss": 2.6417, + "step": 36323 + }, + { + "epoch": 2.2548885716059344, + "grad_norm": 0.13678566112774282, + "learning_rate": 1.7643501263047092e-05, + "loss": 2.6601, + "step": 36324 + }, + { + "epoch": 2.2549506487056923, + "grad_norm": 0.14205481893926397, + "learning_rate": 1.764074799476102e-05, + "loss": 2.7649, + "step": 36325 + }, + { + "epoch": 2.25501272580545, + "grad_norm": 0.13248970972444976, + "learning_rate": 1.763799489530103e-05, + "loss": 2.6545, + "step": 36326 + }, + { + "epoch": 2.255074802905208, + "grad_norm": 0.1421811967018494, + "learning_rate": 1.763524196468147e-05, + "loss": 2.7685, + "step": 36327 + }, + { + "epoch": 2.255136880004966, + "grad_norm": 0.1398172825002577, + "learning_rate": 1.7632489202916698e-05, + "loss": 2.7897, + "step": 36328 + }, + { + "epoch": 2.255198957104724, + "grad_norm": 0.13759348871811442, + "learning_rate": 1.7629736610021097e-05, + "loss": 2.7878, + "step": 36329 + }, + { + "epoch": 2.255261034204482, + "grad_norm": 0.13985680327417901, + "learning_rate": 1.7626984186009027e-05, + "loss": 2.6854, + "step": 36330 + }, + { + "epoch": 2.25532311130424, + "grad_norm": 0.15076537320446803, + "learning_rate": 1.762423193089483e-05, + "loss": 2.725, + "step": 36331 + }, + { + "epoch": 2.2553851884039977, + "grad_norm": 0.14724838651751085, + "learning_rate": 1.7621479844692863e-05, + "loss": 2.8073, + "step": 36332 + }, + { + "epoch": 2.2554472655037556, + "grad_norm": 0.15563775154902582, + "learning_rate": 1.7618727927417518e-05, + "loss": 2.6988, + "step": 36333 + }, + { + "epoch": 2.2555093426035135, + "grad_norm": 0.1521404327911639, + "learning_rate": 1.761597617908312e-05, + "loss": 2.7055, + "step": 36334 + }, + { + "epoch": 2.2555714197032715, + "grad_norm": 0.1506663680351443, + "learning_rate": 1.761322459970404e-05, + "loss": 2.7944, + "step": 36335 + }, + { + "epoch": 2.2556334968030294, + "grad_norm": 0.15620366840090594, + "learning_rate": 1.7610473189294628e-05, + "loss": 2.7353, + "step": 36336 + }, + { + "epoch": 2.2556955739027873, + "grad_norm": 0.160896110073296, + "learning_rate": 1.7607721947869226e-05, + "loss": 2.7184, + "step": 36337 + }, + { + "epoch": 2.255757651002545, + "grad_norm": 0.14714708464840862, + "learning_rate": 1.760497087544222e-05, + "loss": 2.6714, + "step": 36338 + }, + { + "epoch": 2.255819728102303, + "grad_norm": 0.1478407181828271, + "learning_rate": 1.7602219972027943e-05, + "loss": 2.6817, + "step": 36339 + }, + { + "epoch": 2.255881805202061, + "grad_norm": 0.14791196725548394, + "learning_rate": 1.7599469237640746e-05, + "loss": 2.7242, + "step": 36340 + }, + { + "epoch": 2.255943882301819, + "grad_norm": 0.1579790471113949, + "learning_rate": 1.759671867229497e-05, + "loss": 2.6927, + "step": 36341 + }, + { + "epoch": 2.256005959401577, + "grad_norm": 0.1437192155694025, + "learning_rate": 1.7593968276004995e-05, + "loss": 2.6631, + "step": 36342 + }, + { + "epoch": 2.256068036501335, + "grad_norm": 0.1443414730925604, + "learning_rate": 1.759121804878516e-05, + "loss": 2.6777, + "step": 36343 + }, + { + "epoch": 2.2561301136010927, + "grad_norm": 0.13331548029862975, + "learning_rate": 1.7588467990649805e-05, + "loss": 2.6648, + "step": 36344 + }, + { + "epoch": 2.2561921907008506, + "grad_norm": 0.1512983842573732, + "learning_rate": 1.7585718101613273e-05, + "loss": 2.6907, + "step": 36345 + }, + { + "epoch": 2.2562542678006086, + "grad_norm": 0.15782190828322445, + "learning_rate": 1.758296838168991e-05, + "loss": 2.7716, + "step": 36346 + }, + { + "epoch": 2.256316344900366, + "grad_norm": 0.14191830823759588, + "learning_rate": 1.7580218830894084e-05, + "loss": 2.7458, + "step": 36347 + }, + { + "epoch": 2.2563784220001244, + "grad_norm": 0.13816625803118943, + "learning_rate": 1.7577469449240127e-05, + "loss": 2.646, + "step": 36348 + }, + { + "epoch": 2.256440499099882, + "grad_norm": 0.15769455250690861, + "learning_rate": 1.7574720236742383e-05, + "loss": 2.6532, + "step": 36349 + }, + { + "epoch": 2.25650257619964, + "grad_norm": 0.1373951053949565, + "learning_rate": 1.7571971193415177e-05, + "loss": 2.6909, + "step": 36350 + }, + { + "epoch": 2.2565646532993977, + "grad_norm": 0.14963372607548017, + "learning_rate": 1.756922231927289e-05, + "loss": 2.7102, + "step": 36351 + }, + { + "epoch": 2.2566267303991556, + "grad_norm": 0.1503372964807993, + "learning_rate": 1.7566473614329837e-05, + "loss": 2.7148, + "step": 36352 + }, + { + "epoch": 2.2566888074989135, + "grad_norm": 0.14354462491015188, + "learning_rate": 1.756372507860037e-05, + "loss": 2.7514, + "step": 36353 + }, + { + "epoch": 2.2567508845986715, + "grad_norm": 0.14087292221520567, + "learning_rate": 1.7560976712098804e-05, + "loss": 2.69, + "step": 36354 + }, + { + "epoch": 2.2568129616984294, + "grad_norm": 0.14974692920965724, + "learning_rate": 1.7558228514839514e-05, + "loss": 2.6934, + "step": 36355 + }, + { + "epoch": 2.2568750387981873, + "grad_norm": 0.16785365974477875, + "learning_rate": 1.755548048683682e-05, + "loss": 2.6966, + "step": 36356 + }, + { + "epoch": 2.256937115897945, + "grad_norm": 0.13951833706874, + "learning_rate": 1.755273262810505e-05, + "loss": 2.7068, + "step": 36357 + }, + { + "epoch": 2.256999192997703, + "grad_norm": 0.13454563041858286, + "learning_rate": 1.7549984938658564e-05, + "loss": 2.7232, + "step": 36358 + }, + { + "epoch": 2.257061270097461, + "grad_norm": 0.1496077759062165, + "learning_rate": 1.754723741851168e-05, + "loss": 2.7387, + "step": 36359 + }, + { + "epoch": 2.257123347197219, + "grad_norm": 0.1479757178215859, + "learning_rate": 1.7544490067678726e-05, + "loss": 2.6331, + "step": 36360 + }, + { + "epoch": 2.257185424296977, + "grad_norm": 0.15888006879867814, + "learning_rate": 1.7541742886174055e-05, + "loss": 2.7415, + "step": 36361 + }, + { + "epoch": 2.257247501396735, + "grad_norm": 0.14939192688973854, + "learning_rate": 1.7538995874012e-05, + "loss": 2.6301, + "step": 36362 + }, + { + "epoch": 2.2573095784964927, + "grad_norm": 0.14927701336880123, + "learning_rate": 1.753624903120688e-05, + "loss": 2.7301, + "step": 36363 + }, + { + "epoch": 2.2573716555962506, + "grad_norm": 0.14305254637987863, + "learning_rate": 1.753350235777301e-05, + "loss": 2.6859, + "step": 36364 + }, + { + "epoch": 2.2574337326960086, + "grad_norm": 0.14715357825907274, + "learning_rate": 1.753075585372476e-05, + "loss": 2.7303, + "step": 36365 + }, + { + "epoch": 2.2574958097957665, + "grad_norm": 0.14365965709832426, + "learning_rate": 1.7528009519076437e-05, + "loss": 2.7827, + "step": 36366 + }, + { + "epoch": 2.2575578868955244, + "grad_norm": 0.13712403308732674, + "learning_rate": 1.7525263353842374e-05, + "loss": 2.7132, + "step": 36367 + }, + { + "epoch": 2.2576199639952823, + "grad_norm": 0.15758836475591045, + "learning_rate": 1.7522517358036876e-05, + "loss": 2.7148, + "step": 36368 + }, + { + "epoch": 2.2576820410950402, + "grad_norm": 0.14894852386422164, + "learning_rate": 1.7519771531674306e-05, + "loss": 2.6272, + "step": 36369 + }, + { + "epoch": 2.2577441181947977, + "grad_norm": 0.1473039713750855, + "learning_rate": 1.7517025874768977e-05, + "loss": 2.704, + "step": 36370 + }, + { + "epoch": 2.257806195294556, + "grad_norm": 0.1400037061145391, + "learning_rate": 1.751428038733521e-05, + "loss": 2.7097, + "step": 36371 + }, + { + "epoch": 2.2578682723943135, + "grad_norm": 0.14500945470461774, + "learning_rate": 1.751153506938732e-05, + "loss": 2.7477, + "step": 36372 + }, + { + "epoch": 2.2579303494940715, + "grad_norm": 0.13779729083424866, + "learning_rate": 1.750878992093963e-05, + "loss": 2.6783, + "step": 36373 + }, + { + "epoch": 2.2579924265938294, + "grad_norm": 0.14256059199138832, + "learning_rate": 1.7506044942006478e-05, + "loss": 2.6993, + "step": 36374 + }, + { + "epoch": 2.2580545036935873, + "grad_norm": 0.15624030798416697, + "learning_rate": 1.7503300132602184e-05, + "loss": 2.7558, + "step": 36375 + }, + { + "epoch": 2.258116580793345, + "grad_norm": 0.15333728939266456, + "learning_rate": 1.750055549274106e-05, + "loss": 2.7616, + "step": 36376 + }, + { + "epoch": 2.258178657893103, + "grad_norm": 0.15985970839770564, + "learning_rate": 1.7497811022437415e-05, + "loss": 2.7689, + "step": 36377 + }, + { + "epoch": 2.258240734992861, + "grad_norm": 0.15001854136234385, + "learning_rate": 1.7495066721705595e-05, + "loss": 2.7472, + "step": 36378 + }, + { + "epoch": 2.258302812092619, + "grad_norm": 0.1571744275582169, + "learning_rate": 1.74923225905599e-05, + "loss": 2.7416, + "step": 36379 + }, + { + "epoch": 2.258364889192377, + "grad_norm": 0.15503992960354124, + "learning_rate": 1.748957862901465e-05, + "loss": 2.6848, + "step": 36380 + }, + { + "epoch": 2.258426966292135, + "grad_norm": 0.14239662192400013, + "learning_rate": 1.748683483708415e-05, + "loss": 2.7545, + "step": 36381 + }, + { + "epoch": 2.2584890433918927, + "grad_norm": 0.151884131471032, + "learning_rate": 1.7484091214782734e-05, + "loss": 2.6276, + "step": 36382 + }, + { + "epoch": 2.2585511204916506, + "grad_norm": 0.1666695841643167, + "learning_rate": 1.748134776212471e-05, + "loss": 2.687, + "step": 36383 + }, + { + "epoch": 2.2586131975914086, + "grad_norm": 0.14379492917952097, + "learning_rate": 1.747860447912439e-05, + "loss": 2.7334, + "step": 36384 + }, + { + "epoch": 2.2586752746911665, + "grad_norm": 0.14893379815207886, + "learning_rate": 1.7475861365796087e-05, + "loss": 2.8543, + "step": 36385 + }, + { + "epoch": 2.2587373517909244, + "grad_norm": 0.14511022076114335, + "learning_rate": 1.747311842215409e-05, + "loss": 2.6964, + "step": 36386 + }, + { + "epoch": 2.2587994288906823, + "grad_norm": 0.16491717854566232, + "learning_rate": 1.747037564821275e-05, + "loss": 2.6424, + "step": 36387 + }, + { + "epoch": 2.25886150599044, + "grad_norm": 0.1747651508309519, + "learning_rate": 1.7467633043986352e-05, + "loss": 2.7004, + "step": 36388 + }, + { + "epoch": 2.258923583090198, + "grad_norm": 0.14238719697959712, + "learning_rate": 1.7464890609489214e-05, + "loss": 2.7071, + "step": 36389 + }, + { + "epoch": 2.258985660189956, + "grad_norm": 0.1631872467782705, + "learning_rate": 1.7462148344735628e-05, + "loss": 2.6909, + "step": 36390 + }, + { + "epoch": 2.259047737289714, + "grad_norm": 0.14837805913071003, + "learning_rate": 1.7459406249739906e-05, + "loss": 2.6941, + "step": 36391 + }, + { + "epoch": 2.259109814389472, + "grad_norm": 0.15889705537584758, + "learning_rate": 1.745666432451638e-05, + "loss": 2.7136, + "step": 36392 + }, + { + "epoch": 2.25917189148923, + "grad_norm": 0.1555518467394538, + "learning_rate": 1.745392256907934e-05, + "loss": 2.6733, + "step": 36393 + }, + { + "epoch": 2.2592339685889877, + "grad_norm": 0.16186566014758882, + "learning_rate": 1.7451180983443082e-05, + "loss": 2.7526, + "step": 36394 + }, + { + "epoch": 2.259296045688745, + "grad_norm": 0.16182402694705147, + "learning_rate": 1.7448439567621915e-05, + "loss": 2.7114, + "step": 36395 + }, + { + "epoch": 2.2593581227885036, + "grad_norm": 0.1654978250385116, + "learning_rate": 1.7445698321630122e-05, + "loss": 2.6961, + "step": 36396 + }, + { + "epoch": 2.259420199888261, + "grad_norm": 0.1463545732786655, + "learning_rate": 1.7442957245482045e-05, + "loss": 2.6766, + "step": 36397 + }, + { + "epoch": 2.259482276988019, + "grad_norm": 0.14233202139546103, + "learning_rate": 1.7440216339191962e-05, + "loss": 2.669, + "step": 36398 + }, + { + "epoch": 2.259544354087777, + "grad_norm": 0.137847572716439, + "learning_rate": 1.7437475602774173e-05, + "loss": 2.6232, + "step": 36399 + }, + { + "epoch": 2.259606431187535, + "grad_norm": 0.14086802331755707, + "learning_rate": 1.7434735036242968e-05, + "loss": 2.6788, + "step": 36400 + }, + { + "epoch": 2.2596685082872927, + "grad_norm": 0.15075025042154064, + "learning_rate": 1.7431994639612663e-05, + "loss": 2.6443, + "step": 36401 + }, + { + "epoch": 2.2597305853870506, + "grad_norm": 0.14559139181103534, + "learning_rate": 1.7429254412897557e-05, + "loss": 2.787, + "step": 36402 + }, + { + "epoch": 2.2597926624868085, + "grad_norm": 0.19613572199611637, + "learning_rate": 1.7426514356111928e-05, + "loss": 2.7549, + "step": 36403 + }, + { + "epoch": 2.2598547395865665, + "grad_norm": 0.16333181155073243, + "learning_rate": 1.7423774469270073e-05, + "loss": 2.762, + "step": 36404 + }, + { + "epoch": 2.2599168166863244, + "grad_norm": 0.13909798287314723, + "learning_rate": 1.7421034752386307e-05, + "loss": 2.6855, + "step": 36405 + }, + { + "epoch": 2.2599788937860823, + "grad_norm": 0.16242348446766477, + "learning_rate": 1.741829520547491e-05, + "loss": 2.7404, + "step": 36406 + }, + { + "epoch": 2.26004097088584, + "grad_norm": 0.15154253125142947, + "learning_rate": 1.741555582855018e-05, + "loss": 2.7047, + "step": 36407 + }, + { + "epoch": 2.260103047985598, + "grad_norm": 0.14837502118329843, + "learning_rate": 1.7412816621626405e-05, + "loss": 2.6892, + "step": 36408 + }, + { + "epoch": 2.260165125085356, + "grad_norm": 0.14707736470816116, + "learning_rate": 1.741007758471786e-05, + "loss": 2.6779, + "step": 36409 + }, + { + "epoch": 2.260227202185114, + "grad_norm": 0.13718628641690986, + "learning_rate": 1.7407338717838867e-05, + "loss": 2.7626, + "step": 36410 + }, + { + "epoch": 2.260289279284872, + "grad_norm": 0.16081922824022488, + "learning_rate": 1.7404600021003703e-05, + "loss": 2.8144, + "step": 36411 + }, + { + "epoch": 2.26035135638463, + "grad_norm": 0.13968781936318844, + "learning_rate": 1.7401861494226652e-05, + "loss": 2.6055, + "step": 36412 + }, + { + "epoch": 2.2604134334843877, + "grad_norm": 0.14517062757590676, + "learning_rate": 1.7399123137521983e-05, + "loss": 2.7696, + "step": 36413 + }, + { + "epoch": 2.2604755105841456, + "grad_norm": 0.14465749871379838, + "learning_rate": 1.739638495090402e-05, + "loss": 2.7493, + "step": 36414 + }, + { + "epoch": 2.2605375876839036, + "grad_norm": 0.14858527732986074, + "learning_rate": 1.7393646934387036e-05, + "loss": 2.7062, + "step": 36415 + }, + { + "epoch": 2.2605996647836615, + "grad_norm": 0.1484061822981448, + "learning_rate": 1.7390909087985308e-05, + "loss": 2.8409, + "step": 36416 + }, + { + "epoch": 2.2606617418834194, + "grad_norm": 0.1439050693235724, + "learning_rate": 1.7388171411713105e-05, + "loss": 2.7377, + "step": 36417 + }, + { + "epoch": 2.260723818983177, + "grad_norm": 0.1386077692388003, + "learning_rate": 1.7385433905584752e-05, + "loss": 2.7049, + "step": 36418 + }, + { + "epoch": 2.2607858960829352, + "grad_norm": 0.14549703366084082, + "learning_rate": 1.7382696569614503e-05, + "loss": 2.7141, + "step": 36419 + }, + { + "epoch": 2.2608479731826927, + "grad_norm": 0.14917385514438386, + "learning_rate": 1.7379959403816643e-05, + "loss": 2.717, + "step": 36420 + }, + { + "epoch": 2.2609100502824506, + "grad_norm": 0.13649214630187467, + "learning_rate": 1.7377222408205456e-05, + "loss": 2.7072, + "step": 36421 + }, + { + "epoch": 2.2609721273822085, + "grad_norm": 0.14205234727608626, + "learning_rate": 1.7374485582795202e-05, + "loss": 2.7416, + "step": 36422 + }, + { + "epoch": 2.2610342044819665, + "grad_norm": 0.14519696026129547, + "learning_rate": 1.737174892760018e-05, + "loss": 2.8154, + "step": 36423 + }, + { + "epoch": 2.2610962815817244, + "grad_norm": 0.13953373606689468, + "learning_rate": 1.7369012442634673e-05, + "loss": 2.6844, + "step": 36424 + }, + { + "epoch": 2.2611583586814823, + "grad_norm": 0.1413979678073212, + "learning_rate": 1.7366276127912957e-05, + "loss": 2.7141, + "step": 36425 + }, + { + "epoch": 2.26122043578124, + "grad_norm": 0.14675572376323393, + "learning_rate": 1.7363539983449295e-05, + "loss": 2.6937, + "step": 36426 + }, + { + "epoch": 2.261282512880998, + "grad_norm": 0.1397891835891369, + "learning_rate": 1.7360804009257957e-05, + "loss": 2.7274, + "step": 36427 + }, + { + "epoch": 2.261344589980756, + "grad_norm": 0.13990647811138276, + "learning_rate": 1.735806820535324e-05, + "loss": 2.6184, + "step": 36428 + }, + { + "epoch": 2.261406667080514, + "grad_norm": 0.1522223955151354, + "learning_rate": 1.7355332571749412e-05, + "loss": 2.7117, + "step": 36429 + }, + { + "epoch": 2.261468744180272, + "grad_norm": 0.13911726441174777, + "learning_rate": 1.7352597108460728e-05, + "loss": 2.6953, + "step": 36430 + }, + { + "epoch": 2.26153082128003, + "grad_norm": 0.13914113500331346, + "learning_rate": 1.734986181550148e-05, + "loss": 2.6384, + "step": 36431 + }, + { + "epoch": 2.2615928983797877, + "grad_norm": 0.1652006002045782, + "learning_rate": 1.7347126692885907e-05, + "loss": 2.664, + "step": 36432 + }, + { + "epoch": 2.2616549754795456, + "grad_norm": 0.15385281366137743, + "learning_rate": 1.7344391740628318e-05, + "loss": 2.7093, + "step": 36433 + }, + { + "epoch": 2.2617170525793036, + "grad_norm": 0.18373157522379924, + "learning_rate": 1.7341656958742963e-05, + "loss": 2.7706, + "step": 36434 + }, + { + "epoch": 2.2617791296790615, + "grad_norm": 0.1658901245121854, + "learning_rate": 1.733892234724412e-05, + "loss": 2.7522, + "step": 36435 + }, + { + "epoch": 2.2618412067788194, + "grad_norm": 0.1559144828528086, + "learning_rate": 1.7336187906146024e-05, + "loss": 2.7212, + "step": 36436 + }, + { + "epoch": 2.2619032838785773, + "grad_norm": 0.16610461823010408, + "learning_rate": 1.7333453635462983e-05, + "loss": 2.662, + "step": 36437 + }, + { + "epoch": 2.2619653609783352, + "grad_norm": 0.15426121673923462, + "learning_rate": 1.7330719535209245e-05, + "loss": 2.6619, + "step": 36438 + }, + { + "epoch": 2.262027438078093, + "grad_norm": 0.1452640052599247, + "learning_rate": 1.7327985605399072e-05, + "loss": 2.8205, + "step": 36439 + }, + { + "epoch": 2.262089515177851, + "grad_norm": 0.14253684974932898, + "learning_rate": 1.7325251846046714e-05, + "loss": 2.7696, + "step": 36440 + }, + { + "epoch": 2.262151592277609, + "grad_norm": 0.158802386870886, + "learning_rate": 1.7322518257166465e-05, + "loss": 2.727, + "step": 36441 + }, + { + "epoch": 2.262213669377367, + "grad_norm": 0.16024680601357416, + "learning_rate": 1.7319784838772573e-05, + "loss": 2.6677, + "step": 36442 + }, + { + "epoch": 2.2622757464771244, + "grad_norm": 0.14936785513258133, + "learning_rate": 1.7317051590879298e-05, + "loss": 2.708, + "step": 36443 + }, + { + "epoch": 2.2623378235768827, + "grad_norm": 0.14753882769949006, + "learning_rate": 1.7314318513500897e-05, + "loss": 2.6934, + "step": 36444 + }, + { + "epoch": 2.26239990067664, + "grad_norm": 0.15122733269442276, + "learning_rate": 1.7311585606651615e-05, + "loss": 2.6775, + "step": 36445 + }, + { + "epoch": 2.262461977776398, + "grad_norm": 0.16388825405048632, + "learning_rate": 1.7308852870345742e-05, + "loss": 2.7697, + "step": 36446 + }, + { + "epoch": 2.262524054876156, + "grad_norm": 0.13874145562050696, + "learning_rate": 1.730612030459752e-05, + "loss": 2.6077, + "step": 36447 + }, + { + "epoch": 2.262586131975914, + "grad_norm": 0.15573000211289595, + "learning_rate": 1.7303387909421205e-05, + "loss": 2.7575, + "step": 36448 + }, + { + "epoch": 2.262648209075672, + "grad_norm": 0.14840640194038265, + "learning_rate": 1.7300655684831036e-05, + "loss": 2.6829, + "step": 36449 + }, + { + "epoch": 2.26271028617543, + "grad_norm": 0.15418658962800533, + "learning_rate": 1.72979236308413e-05, + "loss": 2.6269, + "step": 36450 + }, + { + "epoch": 2.2627723632751877, + "grad_norm": 0.15064375493509732, + "learning_rate": 1.729519174746624e-05, + "loss": 2.7424, + "step": 36451 + }, + { + "epoch": 2.2628344403749456, + "grad_norm": 0.140355700021259, + "learning_rate": 1.72924600347201e-05, + "loss": 2.7041, + "step": 36452 + }, + { + "epoch": 2.2628965174747036, + "grad_norm": 0.1429731631509851, + "learning_rate": 1.7289728492617118e-05, + "loss": 2.7304, + "step": 36453 + }, + { + "epoch": 2.2629585945744615, + "grad_norm": 0.1388510298952948, + "learning_rate": 1.7286997121171584e-05, + "loss": 2.7839, + "step": 36454 + }, + { + "epoch": 2.2630206716742194, + "grad_norm": 0.1422306555976488, + "learning_rate": 1.7284265920397707e-05, + "loss": 2.6437, + "step": 36455 + }, + { + "epoch": 2.2630827487739773, + "grad_norm": 0.14539292939915494, + "learning_rate": 1.7281534890309775e-05, + "loss": 2.6633, + "step": 36456 + }, + { + "epoch": 2.263144825873735, + "grad_norm": 0.15870421724693806, + "learning_rate": 1.7278804030922014e-05, + "loss": 2.7525, + "step": 36457 + }, + { + "epoch": 2.263206902973493, + "grad_norm": 0.16277596145148784, + "learning_rate": 1.7276073342248682e-05, + "loss": 2.6527, + "step": 36458 + }, + { + "epoch": 2.263268980073251, + "grad_norm": 0.1401330701505882, + "learning_rate": 1.7273342824304e-05, + "loss": 2.6767, + "step": 36459 + }, + { + "epoch": 2.263331057173009, + "grad_norm": 0.1496940432971978, + "learning_rate": 1.7270612477102248e-05, + "loss": 2.8094, + "step": 36460 + }, + { + "epoch": 2.263393134272767, + "grad_norm": 0.15761928202432168, + "learning_rate": 1.726788230065766e-05, + "loss": 2.7437, + "step": 36461 + }, + { + "epoch": 2.263455211372525, + "grad_norm": 0.14131855141987346, + "learning_rate": 1.7265152294984473e-05, + "loss": 2.7475, + "step": 36462 + }, + { + "epoch": 2.2635172884722827, + "grad_norm": 0.14705597299730247, + "learning_rate": 1.7262422460096916e-05, + "loss": 2.7713, + "step": 36463 + }, + { + "epoch": 2.2635793655720406, + "grad_norm": 0.1402800089933552, + "learning_rate": 1.725969279600927e-05, + "loss": 2.7233, + "step": 36464 + }, + { + "epoch": 2.2636414426717986, + "grad_norm": 0.16067789189358334, + "learning_rate": 1.725696330273575e-05, + "loss": 2.7296, + "step": 36465 + }, + { + "epoch": 2.263703519771556, + "grad_norm": 0.16212667235046477, + "learning_rate": 1.72542339802906e-05, + "loss": 2.635, + "step": 36466 + }, + { + "epoch": 2.2637655968713144, + "grad_norm": 0.1567341040414121, + "learning_rate": 1.7251504828688047e-05, + "loss": 2.694, + "step": 36467 + }, + { + "epoch": 2.263827673971072, + "grad_norm": 0.14345720445636762, + "learning_rate": 1.724877584794236e-05, + "loss": 2.7775, + "step": 36468 + }, + { + "epoch": 2.26388975107083, + "grad_norm": 0.14613503443974576, + "learning_rate": 1.7246047038067757e-05, + "loss": 2.71, + "step": 36469 + }, + { + "epoch": 2.2639518281705877, + "grad_norm": 0.14028044052935718, + "learning_rate": 1.7243318399078483e-05, + "loss": 2.7242, + "step": 36470 + }, + { + "epoch": 2.2640139052703456, + "grad_norm": 0.15361302370315372, + "learning_rate": 1.7240589930988765e-05, + "loss": 2.7187, + "step": 36471 + }, + { + "epoch": 2.2640759823701035, + "grad_norm": 0.15899395784932568, + "learning_rate": 1.723786163381283e-05, + "loss": 2.7019, + "step": 36472 + }, + { + "epoch": 2.2641380594698615, + "grad_norm": 0.1371368486641387, + "learning_rate": 1.723513350756493e-05, + "loss": 2.6305, + "step": 36473 + }, + { + "epoch": 2.2642001365696194, + "grad_norm": 0.1547193718356259, + "learning_rate": 1.7232405552259307e-05, + "loss": 2.7414, + "step": 36474 + }, + { + "epoch": 2.2642622136693773, + "grad_norm": 0.1365431748122498, + "learning_rate": 1.722967776791017e-05, + "loss": 2.714, + "step": 36475 + }, + { + "epoch": 2.264324290769135, + "grad_norm": 0.13866000729134892, + "learning_rate": 1.722695015453174e-05, + "loss": 2.686, + "step": 36476 + }, + { + "epoch": 2.264386367868893, + "grad_norm": 0.14002925460711568, + "learning_rate": 1.722422271213829e-05, + "loss": 2.7675, + "step": 36477 + }, + { + "epoch": 2.264448444968651, + "grad_norm": 0.14263680961738667, + "learning_rate": 1.7221495440744023e-05, + "loss": 2.6776, + "step": 36478 + }, + { + "epoch": 2.264510522068409, + "grad_norm": 0.15194763134437295, + "learning_rate": 1.7218768340363172e-05, + "loss": 2.768, + "step": 36479 + }, + { + "epoch": 2.264572599168167, + "grad_norm": 0.1474657184255272, + "learning_rate": 1.721604141100997e-05, + "loss": 2.7011, + "step": 36480 + }, + { + "epoch": 2.264634676267925, + "grad_norm": 0.16997423680639523, + "learning_rate": 1.7213314652698614e-05, + "loss": 2.7551, + "step": 36481 + }, + { + "epoch": 2.2646967533676827, + "grad_norm": 0.1474165754977908, + "learning_rate": 1.721058806544338e-05, + "loss": 2.6857, + "step": 36482 + }, + { + "epoch": 2.2647588304674406, + "grad_norm": 0.1451533692179799, + "learning_rate": 1.720786164925846e-05, + "loss": 2.7123, + "step": 36483 + }, + { + "epoch": 2.2648209075671986, + "grad_norm": 0.16321538025461285, + "learning_rate": 1.720513540415809e-05, + "loss": 2.7438, + "step": 36484 + }, + { + "epoch": 2.2648829846669565, + "grad_norm": 0.1458970183440081, + "learning_rate": 1.7202409330156473e-05, + "loss": 2.6926, + "step": 36485 + }, + { + "epoch": 2.2649450617667144, + "grad_norm": 0.16445686988462935, + "learning_rate": 1.719968342726786e-05, + "loss": 2.688, + "step": 36486 + }, + { + "epoch": 2.2650071388664723, + "grad_norm": 0.14239389457536233, + "learning_rate": 1.7196957695506465e-05, + "loss": 2.6358, + "step": 36487 + }, + { + "epoch": 2.2650692159662302, + "grad_norm": 0.13942303506543216, + "learning_rate": 1.719423213488649e-05, + "loss": 2.7556, + "step": 36488 + }, + { + "epoch": 2.265131293065988, + "grad_norm": 0.1411011964579796, + "learning_rate": 1.7191506745422186e-05, + "loss": 2.6699, + "step": 36489 + }, + { + "epoch": 2.265193370165746, + "grad_norm": 0.16368706070548614, + "learning_rate": 1.718878152712774e-05, + "loss": 2.686, + "step": 36490 + }, + { + "epoch": 2.2652554472655035, + "grad_norm": 0.17052733865558117, + "learning_rate": 1.7186056480017397e-05, + "loss": 2.7217, + "step": 36491 + }, + { + "epoch": 2.265317524365262, + "grad_norm": 0.13268717996079835, + "learning_rate": 1.718333160410537e-05, + "loss": 2.6548, + "step": 36492 + }, + { + "epoch": 2.2653796014650194, + "grad_norm": 0.1425308972354129, + "learning_rate": 1.7180606899405864e-05, + "loss": 2.6184, + "step": 36493 + }, + { + "epoch": 2.2654416785647773, + "grad_norm": 0.15903169387040322, + "learning_rate": 1.71778823659331e-05, + "loss": 2.6904, + "step": 36494 + }, + { + "epoch": 2.265503755664535, + "grad_norm": 0.19098611381367764, + "learning_rate": 1.717515800370128e-05, + "loss": 2.7406, + "step": 36495 + }, + { + "epoch": 2.265565832764293, + "grad_norm": 0.13971163913418583, + "learning_rate": 1.7172433812724642e-05, + "loss": 2.7599, + "step": 36496 + }, + { + "epoch": 2.265627909864051, + "grad_norm": 0.14338107867271616, + "learning_rate": 1.7169709793017386e-05, + "loss": 2.8371, + "step": 36497 + }, + { + "epoch": 2.265689986963809, + "grad_norm": 0.13992508914904458, + "learning_rate": 1.7166985944593723e-05, + "loss": 2.6909, + "step": 36498 + }, + { + "epoch": 2.265752064063567, + "grad_norm": 0.1406192910836965, + "learning_rate": 1.7164262267467845e-05, + "loss": 2.8171, + "step": 36499 + }, + { + "epoch": 2.265814141163325, + "grad_norm": 0.1560549173842779, + "learning_rate": 1.7161538761654e-05, + "loss": 2.7293, + "step": 36500 + }, + { + "epoch": 2.2658762182630827, + "grad_norm": 0.15818995550466153, + "learning_rate": 1.7158815427166386e-05, + "loss": 2.7099, + "step": 36501 + }, + { + "epoch": 2.2659382953628406, + "grad_norm": 0.13598356809160814, + "learning_rate": 1.7156092264019195e-05, + "loss": 2.691, + "step": 36502 + }, + { + "epoch": 2.2660003724625986, + "grad_norm": 0.14374542540029936, + "learning_rate": 1.7153369272226634e-05, + "loss": 2.7572, + "step": 36503 + }, + { + "epoch": 2.2660624495623565, + "grad_norm": 0.16315034633791733, + "learning_rate": 1.7150646451802933e-05, + "loss": 2.5845, + "step": 36504 + }, + { + "epoch": 2.2661245266621144, + "grad_norm": 0.1433630332168201, + "learning_rate": 1.7147923802762284e-05, + "loss": 2.7083, + "step": 36505 + }, + { + "epoch": 2.2661866037618723, + "grad_norm": 0.1673735841946908, + "learning_rate": 1.714520132511889e-05, + "loss": 2.7049, + "step": 36506 + }, + { + "epoch": 2.2662486808616302, + "grad_norm": 0.15209530476805547, + "learning_rate": 1.7142479018886964e-05, + "loss": 2.7843, + "step": 36507 + }, + { + "epoch": 2.266310757961388, + "grad_norm": 0.14191318292237592, + "learning_rate": 1.7139756884080675e-05, + "loss": 2.7202, + "step": 36508 + }, + { + "epoch": 2.266372835061146, + "grad_norm": 0.14481496672037927, + "learning_rate": 1.7137034920714275e-05, + "loss": 2.7357, + "step": 36509 + }, + { + "epoch": 2.266434912160904, + "grad_norm": 0.16184979073107617, + "learning_rate": 1.713431312880194e-05, + "loss": 2.7033, + "step": 36510 + }, + { + "epoch": 2.266496989260662, + "grad_norm": 0.14419108388917065, + "learning_rate": 1.7131591508357868e-05, + "loss": 2.771, + "step": 36511 + }, + { + "epoch": 2.26655906636042, + "grad_norm": 0.15041563519677192, + "learning_rate": 1.712887005939625e-05, + "loss": 2.7743, + "step": 36512 + }, + { + "epoch": 2.2666211434601777, + "grad_norm": 0.1474052096530933, + "learning_rate": 1.7126148781931313e-05, + "loss": 2.7501, + "step": 36513 + }, + { + "epoch": 2.266683220559935, + "grad_norm": 0.14718505491870199, + "learning_rate": 1.712342767597724e-05, + "loss": 2.6245, + "step": 36514 + }, + { + "epoch": 2.2667452976596936, + "grad_norm": 0.1705555555132736, + "learning_rate": 1.712070674154822e-05, + "loss": 2.7919, + "step": 36515 + }, + { + "epoch": 2.266807374759451, + "grad_norm": 0.14108268115622755, + "learning_rate": 1.711798597865846e-05, + "loss": 2.7218, + "step": 36516 + }, + { + "epoch": 2.266869451859209, + "grad_norm": 0.14327525495510818, + "learning_rate": 1.711526538732213e-05, + "loss": 2.7993, + "step": 36517 + }, + { + "epoch": 2.266931528958967, + "grad_norm": 0.1373321736310033, + "learning_rate": 1.7112544967553457e-05, + "loss": 2.6917, + "step": 36518 + }, + { + "epoch": 2.266993606058725, + "grad_norm": 0.1453776833761375, + "learning_rate": 1.7109824719366623e-05, + "loss": 2.748, + "step": 36519 + }, + { + "epoch": 2.2670556831584827, + "grad_norm": 0.14545111863816507, + "learning_rate": 1.7107104642775823e-05, + "loss": 2.7109, + "step": 36520 + }, + { + "epoch": 2.2671177602582406, + "grad_norm": 0.14260168403482792, + "learning_rate": 1.710438473779522e-05, + "loss": 2.7232, + "step": 36521 + }, + { + "epoch": 2.2671798373579986, + "grad_norm": 0.1358574919545447, + "learning_rate": 1.7101665004439025e-05, + "loss": 2.6547, + "step": 36522 + }, + { + "epoch": 2.2672419144577565, + "grad_norm": 0.16086659837190379, + "learning_rate": 1.7098945442721458e-05, + "loss": 2.7409, + "step": 36523 + }, + { + "epoch": 2.2673039915575144, + "grad_norm": 0.17811317739235402, + "learning_rate": 1.7096226052656672e-05, + "loss": 2.6939, + "step": 36524 + }, + { + "epoch": 2.2673660686572723, + "grad_norm": 0.16525323503144015, + "learning_rate": 1.7093506834258865e-05, + "loss": 2.7619, + "step": 36525 + }, + { + "epoch": 2.2674281457570302, + "grad_norm": 0.1487337782072687, + "learning_rate": 1.70907877875422e-05, + "loss": 2.7622, + "step": 36526 + }, + { + "epoch": 2.267490222856788, + "grad_norm": 0.14799967619862126, + "learning_rate": 1.7088068912520906e-05, + "loss": 2.7733, + "step": 36527 + }, + { + "epoch": 2.267552299956546, + "grad_norm": 0.14430345161953356, + "learning_rate": 1.7085350209209146e-05, + "loss": 2.705, + "step": 36528 + }, + { + "epoch": 2.267614377056304, + "grad_norm": 0.1434795870878872, + "learning_rate": 1.7082631677621096e-05, + "loss": 2.6544, + "step": 36529 + }, + { + "epoch": 2.267676454156062, + "grad_norm": 0.16166391410483472, + "learning_rate": 1.7079913317770957e-05, + "loss": 2.5566, + "step": 36530 + }, + { + "epoch": 2.26773853125582, + "grad_norm": 0.14442192272931262, + "learning_rate": 1.707719512967288e-05, + "loss": 2.8102, + "step": 36531 + }, + { + "epoch": 2.2678006083555777, + "grad_norm": 0.13963971410861672, + "learning_rate": 1.7074477113341087e-05, + "loss": 2.6733, + "step": 36532 + }, + { + "epoch": 2.2678626854553356, + "grad_norm": 0.13858103127162724, + "learning_rate": 1.7071759268789744e-05, + "loss": 2.6967, + "step": 36533 + }, + { + "epoch": 2.2679247625550936, + "grad_norm": 0.1428060277853721, + "learning_rate": 1.706904159603302e-05, + "loss": 2.6362, + "step": 36534 + }, + { + "epoch": 2.2679868396548515, + "grad_norm": 0.1479698771038049, + "learning_rate": 1.7066324095085085e-05, + "loss": 2.7282, + "step": 36535 + }, + { + "epoch": 2.2680489167546094, + "grad_norm": 0.14146324505469457, + "learning_rate": 1.706360676596015e-05, + "loss": 2.7973, + "step": 36536 + }, + { + "epoch": 2.268110993854367, + "grad_norm": 0.14047097009171014, + "learning_rate": 1.7060889608672375e-05, + "loss": 2.8048, + "step": 36537 + }, + { + "epoch": 2.2681730709541252, + "grad_norm": 0.13723404370684564, + "learning_rate": 1.7058172623235935e-05, + "loss": 2.6487, + "step": 36538 + }, + { + "epoch": 2.2682351480538827, + "grad_norm": 0.14722590631797755, + "learning_rate": 1.7055455809664984e-05, + "loss": 2.6677, + "step": 36539 + }, + { + "epoch": 2.2682972251536406, + "grad_norm": 0.14278298514362361, + "learning_rate": 1.7052739167973742e-05, + "loss": 2.7358, + "step": 36540 + }, + { + "epoch": 2.2683593022533985, + "grad_norm": 0.14589250519321176, + "learning_rate": 1.7050022698176347e-05, + "loss": 2.8005, + "step": 36541 + }, + { + "epoch": 2.2684213793531565, + "grad_norm": 0.13903527261166843, + "learning_rate": 1.7047306400286993e-05, + "loss": 2.7068, + "step": 36542 + }, + { + "epoch": 2.2684834564529144, + "grad_norm": 0.1491497232150829, + "learning_rate": 1.7044590274319838e-05, + "loss": 2.7751, + "step": 36543 + }, + { + "epoch": 2.2685455335526723, + "grad_norm": 0.1434120066017914, + "learning_rate": 1.7041874320289035e-05, + "loss": 2.7224, + "step": 36544 + }, + { + "epoch": 2.26860761065243, + "grad_norm": 0.15276313179309356, + "learning_rate": 1.70391585382088e-05, + "loss": 2.7398, + "step": 36545 + }, + { + "epoch": 2.268669687752188, + "grad_norm": 0.14442145842768933, + "learning_rate": 1.7036442928093272e-05, + "loss": 2.76, + "step": 36546 + }, + { + "epoch": 2.268731764851946, + "grad_norm": 0.14999735179192647, + "learning_rate": 1.703372748995662e-05, + "loss": 2.6662, + "step": 36547 + }, + { + "epoch": 2.268793841951704, + "grad_norm": 0.1705129253411789, + "learning_rate": 1.7031012223813e-05, + "loss": 2.7064, + "step": 36548 + }, + { + "epoch": 2.268855919051462, + "grad_norm": 0.14756541940086618, + "learning_rate": 1.7028297129676613e-05, + "loss": 2.8434, + "step": 36549 + }, + { + "epoch": 2.26891799615122, + "grad_norm": 0.14418106571428646, + "learning_rate": 1.7025582207561598e-05, + "loss": 2.7088, + "step": 36550 + }, + { + "epoch": 2.2689800732509777, + "grad_norm": 0.14888077978394032, + "learning_rate": 1.7022867457482128e-05, + "loss": 2.7311, + "step": 36551 + }, + { + "epoch": 2.2690421503507356, + "grad_norm": 0.14405822673094953, + "learning_rate": 1.7020152879452346e-05, + "loss": 2.6454, + "step": 36552 + }, + { + "epoch": 2.2691042274504936, + "grad_norm": 0.14397160330110836, + "learning_rate": 1.7017438473486435e-05, + "loss": 2.7752, + "step": 36553 + }, + { + "epoch": 2.2691663045502515, + "grad_norm": 0.13915806818951554, + "learning_rate": 1.7014724239598574e-05, + "loss": 2.7131, + "step": 36554 + }, + { + "epoch": 2.2692283816500094, + "grad_norm": 0.1581685593602423, + "learning_rate": 1.7012010177802895e-05, + "loss": 2.8045, + "step": 36555 + }, + { + "epoch": 2.2692904587497673, + "grad_norm": 0.14514624714134206, + "learning_rate": 1.7009296288113574e-05, + "loss": 2.7575, + "step": 36556 + }, + { + "epoch": 2.2693525358495252, + "grad_norm": 0.1469546842517701, + "learning_rate": 1.700658257054476e-05, + "loss": 2.7018, + "step": 36557 + }, + { + "epoch": 2.269414612949283, + "grad_norm": 0.14550864456246082, + "learning_rate": 1.7003869025110597e-05, + "loss": 2.6799, + "step": 36558 + }, + { + "epoch": 2.269476690049041, + "grad_norm": 0.16380021723931093, + "learning_rate": 1.700115565182528e-05, + "loss": 2.7943, + "step": 36559 + }, + { + "epoch": 2.269538767148799, + "grad_norm": 0.1397167385573401, + "learning_rate": 1.6998442450702946e-05, + "loss": 2.7011, + "step": 36560 + }, + { + "epoch": 2.269600844248557, + "grad_norm": 0.14382969860063557, + "learning_rate": 1.6995729421757745e-05, + "loss": 2.7354, + "step": 36561 + }, + { + "epoch": 2.2696629213483144, + "grad_norm": 0.14836480845364394, + "learning_rate": 1.6993016565003815e-05, + "loss": 2.7725, + "step": 36562 + }, + { + "epoch": 2.2697249984480727, + "grad_norm": 0.1528539184887879, + "learning_rate": 1.6990303880455353e-05, + "loss": 2.7383, + "step": 36563 + }, + { + "epoch": 2.26978707554783, + "grad_norm": 0.1493722071645926, + "learning_rate": 1.6987591368126487e-05, + "loss": 2.6746, + "step": 36564 + }, + { + "epoch": 2.269849152647588, + "grad_norm": 0.1560103665030627, + "learning_rate": 1.698487902803137e-05, + "loss": 2.7669, + "step": 36565 + }, + { + "epoch": 2.269911229747346, + "grad_norm": 0.17456612744781036, + "learning_rate": 1.6982166860184155e-05, + "loss": 2.6826, + "step": 36566 + }, + { + "epoch": 2.269973306847104, + "grad_norm": 0.1584508686802183, + "learning_rate": 1.6979454864598977e-05, + "loss": 2.7789, + "step": 36567 + }, + { + "epoch": 2.270035383946862, + "grad_norm": 0.14395833519421736, + "learning_rate": 1.697674304129001e-05, + "loss": 2.7216, + "step": 36568 + }, + { + "epoch": 2.27009746104662, + "grad_norm": 0.13719889932338997, + "learning_rate": 1.69740313902714e-05, + "loss": 2.7205, + "step": 36569 + }, + { + "epoch": 2.2701595381463777, + "grad_norm": 0.13482532699153793, + "learning_rate": 1.6971319911557278e-05, + "loss": 2.6662, + "step": 36570 + }, + { + "epoch": 2.2702216152461356, + "grad_norm": 0.14016908677214995, + "learning_rate": 1.6968608605161785e-05, + "loss": 2.6775, + "step": 36571 + }, + { + "epoch": 2.2702836923458936, + "grad_norm": 0.13736539912018336, + "learning_rate": 1.696589747109909e-05, + "loss": 2.7081, + "step": 36572 + }, + { + "epoch": 2.2703457694456515, + "grad_norm": 0.1366008727220154, + "learning_rate": 1.6963186509383333e-05, + "loss": 2.6936, + "step": 36573 + }, + { + "epoch": 2.2704078465454094, + "grad_norm": 0.14459062336215933, + "learning_rate": 1.696047572002865e-05, + "loss": 2.6849, + "step": 36574 + }, + { + "epoch": 2.2704699236451673, + "grad_norm": 0.14455230662604518, + "learning_rate": 1.695776510304917e-05, + "loss": 2.7534, + "step": 36575 + }, + { + "epoch": 2.2705320007449252, + "grad_norm": 0.14081442843139794, + "learning_rate": 1.695505465845907e-05, + "loss": 2.6342, + "step": 36576 + }, + { + "epoch": 2.270594077844683, + "grad_norm": 0.14100187847521262, + "learning_rate": 1.6952344386272466e-05, + "loss": 2.7471, + "step": 36577 + }, + { + "epoch": 2.270656154944441, + "grad_norm": 0.1515868776910867, + "learning_rate": 1.6949634286503508e-05, + "loss": 2.7663, + "step": 36578 + }, + { + "epoch": 2.270718232044199, + "grad_norm": 0.14988799135170383, + "learning_rate": 1.694692435916633e-05, + "loss": 2.7579, + "step": 36579 + }, + { + "epoch": 2.270780309143957, + "grad_norm": 0.1604447673567864, + "learning_rate": 1.6944214604275054e-05, + "loss": 2.7143, + "step": 36580 + }, + { + "epoch": 2.270842386243715, + "grad_norm": 0.14129726994611236, + "learning_rate": 1.6941505021843856e-05, + "loss": 2.6732, + "step": 36581 + }, + { + "epoch": 2.2709044633434727, + "grad_norm": 0.1424619188057877, + "learning_rate": 1.6938795611886848e-05, + "loss": 2.69, + "step": 36582 + }, + { + "epoch": 2.2709665404432307, + "grad_norm": 0.15090818028569256, + "learning_rate": 1.693608637441817e-05, + "loss": 2.7445, + "step": 36583 + }, + { + "epoch": 2.2710286175429886, + "grad_norm": 0.16282583384361113, + "learning_rate": 1.6933377309451937e-05, + "loss": 2.6916, + "step": 36584 + }, + { + "epoch": 2.271090694642746, + "grad_norm": 0.15012378055766493, + "learning_rate": 1.6930668417002322e-05, + "loss": 2.7632, + "step": 36585 + }, + { + "epoch": 2.2711527717425044, + "grad_norm": 0.13886452442947514, + "learning_rate": 1.692795969708342e-05, + "loss": 2.736, + "step": 36586 + }, + { + "epoch": 2.271214848842262, + "grad_norm": 0.13597393150600617, + "learning_rate": 1.6925251149709392e-05, + "loss": 2.6864, + "step": 36587 + }, + { + "epoch": 2.27127692594202, + "grad_norm": 0.14078739511666655, + "learning_rate": 1.692254277489436e-05, + "loss": 2.7381, + "step": 36588 + }, + { + "epoch": 2.2713390030417777, + "grad_norm": 0.17794972253551647, + "learning_rate": 1.6919834572652443e-05, + "loss": 2.7331, + "step": 36589 + }, + { + "epoch": 2.2714010801415356, + "grad_norm": 0.13971567869255547, + "learning_rate": 1.691712654299779e-05, + "loss": 2.6982, + "step": 36590 + }, + { + "epoch": 2.2714631572412936, + "grad_norm": 0.14410518660296312, + "learning_rate": 1.6914418685944517e-05, + "loss": 2.7838, + "step": 36591 + }, + { + "epoch": 2.2715252343410515, + "grad_norm": 0.13960864427115927, + "learning_rate": 1.691171100150676e-05, + "loss": 2.6946, + "step": 36592 + }, + { + "epoch": 2.2715873114408094, + "grad_norm": 0.1417417949962706, + "learning_rate": 1.6909003489698632e-05, + "loss": 2.6603, + "step": 36593 + }, + { + "epoch": 2.2716493885405673, + "grad_norm": 0.1582921154608376, + "learning_rate": 1.6906296150534255e-05, + "loss": 2.5921, + "step": 36594 + }, + { + "epoch": 2.2717114656403252, + "grad_norm": 0.14737862064113785, + "learning_rate": 1.690358898402778e-05, + "loss": 2.6985, + "step": 36595 + }, + { + "epoch": 2.271773542740083, + "grad_norm": 0.1510683740374198, + "learning_rate": 1.6900881990193314e-05, + "loss": 2.6517, + "step": 36596 + }, + { + "epoch": 2.271835619839841, + "grad_norm": 0.15382328266060563, + "learning_rate": 1.6898175169044983e-05, + "loss": 2.7505, + "step": 36597 + }, + { + "epoch": 2.271897696939599, + "grad_norm": 0.14557692147502932, + "learning_rate": 1.6895468520596892e-05, + "loss": 2.7383, + "step": 36598 + }, + { + "epoch": 2.271959774039357, + "grad_norm": 0.15034808139234954, + "learning_rate": 1.6892762044863193e-05, + "loss": 2.6881, + "step": 36599 + }, + { + "epoch": 2.272021851139115, + "grad_norm": 0.1523148374753812, + "learning_rate": 1.689005574185799e-05, + "loss": 2.7867, + "step": 36600 + }, + { + "epoch": 2.2720839282388727, + "grad_norm": 0.14129484435325174, + "learning_rate": 1.688734961159541e-05, + "loss": 2.6607, + "step": 36601 + }, + { + "epoch": 2.2721460053386306, + "grad_norm": 0.14215520292841147, + "learning_rate": 1.688464365408956e-05, + "loss": 2.6485, + "step": 36602 + }, + { + "epoch": 2.2722080824383886, + "grad_norm": 0.14721519001199265, + "learning_rate": 1.688193786935455e-05, + "loss": 2.6867, + "step": 36603 + }, + { + "epoch": 2.2722701595381465, + "grad_norm": 0.15255428707132568, + "learning_rate": 1.6879232257404527e-05, + "loss": 2.6541, + "step": 36604 + }, + { + "epoch": 2.2723322366379044, + "grad_norm": 0.15216809930454275, + "learning_rate": 1.6876526818253585e-05, + "loss": 2.7266, + "step": 36605 + }, + { + "epoch": 2.2723943137376623, + "grad_norm": 0.14342631224714666, + "learning_rate": 1.687382155191585e-05, + "loss": 2.812, + "step": 36606 + }, + { + "epoch": 2.2724563908374202, + "grad_norm": 0.15377461497080785, + "learning_rate": 1.6871116458405407e-05, + "loss": 2.69, + "step": 36607 + }, + { + "epoch": 2.272518467937178, + "grad_norm": 0.16988296721050755, + "learning_rate": 1.6868411537736406e-05, + "loss": 2.7256, + "step": 36608 + }, + { + "epoch": 2.272580545036936, + "grad_norm": 0.14353106427611517, + "learning_rate": 1.6865706789922953e-05, + "loss": 2.6452, + "step": 36609 + }, + { + "epoch": 2.2726426221366935, + "grad_norm": 0.15682944501690155, + "learning_rate": 1.6863002214979147e-05, + "loss": 2.8182, + "step": 36610 + }, + { + "epoch": 2.272704699236452, + "grad_norm": 0.13886975555620706, + "learning_rate": 1.6860297812919085e-05, + "loss": 2.7032, + "step": 36611 + }, + { + "epoch": 2.2727667763362094, + "grad_norm": 0.15902471311803007, + "learning_rate": 1.6857593583756913e-05, + "loss": 2.6527, + "step": 36612 + }, + { + "epoch": 2.2728288534359673, + "grad_norm": 0.15606527734186382, + "learning_rate": 1.6854889527506722e-05, + "loss": 2.7544, + "step": 36613 + }, + { + "epoch": 2.272890930535725, + "grad_norm": 0.13734033157208617, + "learning_rate": 1.6852185644182617e-05, + "loss": 2.6906, + "step": 36614 + }, + { + "epoch": 2.272953007635483, + "grad_norm": 0.14652017156001404, + "learning_rate": 1.6849481933798706e-05, + "loss": 2.7035, + "step": 36615 + }, + { + "epoch": 2.273015084735241, + "grad_norm": 0.17342575523200474, + "learning_rate": 1.6846778396369077e-05, + "loss": 2.779, + "step": 36616 + }, + { + "epoch": 2.273077161834999, + "grad_norm": 0.14644806093865964, + "learning_rate": 1.6844075031907875e-05, + "loss": 2.6788, + "step": 36617 + }, + { + "epoch": 2.273139238934757, + "grad_norm": 0.1384818292646572, + "learning_rate": 1.6841371840429175e-05, + "loss": 2.7531, + "step": 36618 + }, + { + "epoch": 2.273201316034515, + "grad_norm": 0.14563970540663593, + "learning_rate": 1.6838668821947078e-05, + "loss": 2.7007, + "step": 36619 + }, + { + "epoch": 2.2732633931342727, + "grad_norm": 0.1420483151602969, + "learning_rate": 1.683596597647571e-05, + "loss": 2.6743, + "step": 36620 + }, + { + "epoch": 2.2733254702340306, + "grad_norm": 0.1931466271194832, + "learning_rate": 1.683326330402914e-05, + "loss": 2.8179, + "step": 36621 + }, + { + "epoch": 2.2733875473337886, + "grad_norm": 0.15134869124092737, + "learning_rate": 1.6830560804621508e-05, + "loss": 2.7648, + "step": 36622 + }, + { + "epoch": 2.2734496244335465, + "grad_norm": 0.14105169146770016, + "learning_rate": 1.6827858478266888e-05, + "loss": 2.6622, + "step": 36623 + }, + { + "epoch": 2.2735117015333044, + "grad_norm": 0.15046879359370438, + "learning_rate": 1.6825156324979385e-05, + "loss": 2.6997, + "step": 36624 + }, + { + "epoch": 2.2735737786330623, + "grad_norm": 0.16338372596795822, + "learning_rate": 1.682245434477308e-05, + "loss": 2.7042, + "step": 36625 + }, + { + "epoch": 2.2736358557328202, + "grad_norm": 0.1413415297039295, + "learning_rate": 1.6819752537662098e-05, + "loss": 2.7428, + "step": 36626 + }, + { + "epoch": 2.273697932832578, + "grad_norm": 0.15335880088018533, + "learning_rate": 1.6817050903660524e-05, + "loss": 2.6999, + "step": 36627 + }, + { + "epoch": 2.273760009932336, + "grad_norm": 0.1637316671967174, + "learning_rate": 1.681434944278245e-05, + "loss": 2.6955, + "step": 36628 + }, + { + "epoch": 2.273822087032094, + "grad_norm": 0.14256896028719312, + "learning_rate": 1.681164815504197e-05, + "loss": 2.6716, + "step": 36629 + }, + { + "epoch": 2.273884164131852, + "grad_norm": 0.14386664120904188, + "learning_rate": 1.6808947040453165e-05, + "loss": 2.7547, + "step": 36630 + }, + { + "epoch": 2.27394624123161, + "grad_norm": 0.15531961486147036, + "learning_rate": 1.6806246099030153e-05, + "loss": 2.7171, + "step": 36631 + }, + { + "epoch": 2.2740083183313677, + "grad_norm": 0.14435360275175294, + "learning_rate": 1.680354533078702e-05, + "loss": 2.7656, + "step": 36632 + }, + { + "epoch": 2.274070395431125, + "grad_norm": 0.1446834420176821, + "learning_rate": 1.6800844735737843e-05, + "loss": 2.7436, + "step": 36633 + }, + { + "epoch": 2.2741324725308836, + "grad_norm": 0.1646665963581871, + "learning_rate": 1.6798144313896707e-05, + "loss": 2.7473, + "step": 36634 + }, + { + "epoch": 2.274194549630641, + "grad_norm": 0.1518082560652371, + "learning_rate": 1.679544406527772e-05, + "loss": 2.6555, + "step": 36635 + }, + { + "epoch": 2.274256626730399, + "grad_norm": 0.14689567679085472, + "learning_rate": 1.679274398989497e-05, + "loss": 2.6924, + "step": 36636 + }, + { + "epoch": 2.274318703830157, + "grad_norm": 0.16687789151011845, + "learning_rate": 1.6790044087762534e-05, + "loss": 2.7329, + "step": 36637 + }, + { + "epoch": 2.274380780929915, + "grad_norm": 0.1672672660779203, + "learning_rate": 1.6787344358894485e-05, + "loss": 2.7893, + "step": 36638 + }, + { + "epoch": 2.2744428580296727, + "grad_norm": 0.17456270229902882, + "learning_rate": 1.6784644803304938e-05, + "loss": 2.7286, + "step": 36639 + }, + { + "epoch": 2.2745049351294306, + "grad_norm": 0.14568155272179598, + "learning_rate": 1.678194542100796e-05, + "loss": 2.7648, + "step": 36640 + }, + { + "epoch": 2.2745670122291886, + "grad_norm": 0.136600531829707, + "learning_rate": 1.6779246212017642e-05, + "loss": 2.6841, + "step": 36641 + }, + { + "epoch": 2.2746290893289465, + "grad_norm": 0.13713585035538278, + "learning_rate": 1.6776547176348058e-05, + "loss": 2.7095, + "step": 36642 + }, + { + "epoch": 2.2746911664287044, + "grad_norm": 0.15172884405640347, + "learning_rate": 1.6773848314013274e-05, + "loss": 2.747, + "step": 36643 + }, + { + "epoch": 2.2747532435284623, + "grad_norm": 0.13952180149087454, + "learning_rate": 1.677114962502741e-05, + "loss": 2.7767, + "step": 36644 + }, + { + "epoch": 2.2748153206282202, + "grad_norm": 0.16705075848580478, + "learning_rate": 1.676845110940452e-05, + "loss": 2.609, + "step": 36645 + }, + { + "epoch": 2.274877397727978, + "grad_norm": 0.14078312169777304, + "learning_rate": 1.6765752767158692e-05, + "loss": 2.7554, + "step": 36646 + }, + { + "epoch": 2.274939474827736, + "grad_norm": 0.1397658701298005, + "learning_rate": 1.6763054598303985e-05, + "loss": 2.7987, + "step": 36647 + }, + { + "epoch": 2.275001551927494, + "grad_norm": 0.14812460974750535, + "learning_rate": 1.6760356602854503e-05, + "loss": 2.693, + "step": 36648 + }, + { + "epoch": 2.275063629027252, + "grad_norm": 0.15191730086783095, + "learning_rate": 1.6757658780824308e-05, + "loss": 2.6658, + "step": 36649 + }, + { + "epoch": 2.27512570612701, + "grad_norm": 0.1450207514342278, + "learning_rate": 1.6754961132227482e-05, + "loss": 2.7087, + "step": 36650 + }, + { + "epoch": 2.2751877832267677, + "grad_norm": 0.1502259269140281, + "learning_rate": 1.675226365707809e-05, + "loss": 2.7042, + "step": 36651 + }, + { + "epoch": 2.2752498603265257, + "grad_norm": 0.14268141754318012, + "learning_rate": 1.67495663553902e-05, + "loss": 2.748, + "step": 36652 + }, + { + "epoch": 2.2753119374262836, + "grad_norm": 0.1416215093309594, + "learning_rate": 1.6746869227177885e-05, + "loss": 2.7736, + "step": 36653 + }, + { + "epoch": 2.2753740145260415, + "grad_norm": 0.1371088796862054, + "learning_rate": 1.674417227245525e-05, + "loss": 2.6889, + "step": 36654 + }, + { + "epoch": 2.2754360916257994, + "grad_norm": 0.17508253999256027, + "learning_rate": 1.6741475491236335e-05, + "loss": 2.7847, + "step": 36655 + }, + { + "epoch": 2.2754981687255573, + "grad_norm": 0.14321807399334682, + "learning_rate": 1.6738778883535222e-05, + "loss": 2.7747, + "step": 36656 + }, + { + "epoch": 2.2755602458253152, + "grad_norm": 0.1635562602191435, + "learning_rate": 1.6736082449365952e-05, + "loss": 2.6763, + "step": 36657 + }, + { + "epoch": 2.2756223229250727, + "grad_norm": 0.1591312533687128, + "learning_rate": 1.673338618874264e-05, + "loss": 2.7182, + "step": 36658 + }, + { + "epoch": 2.275684400024831, + "grad_norm": 0.14149658652803404, + "learning_rate": 1.673069010167932e-05, + "loss": 2.725, + "step": 36659 + }, + { + "epoch": 2.2757464771245886, + "grad_norm": 0.1518746207570023, + "learning_rate": 1.6727994188190067e-05, + "loss": 2.6873, + "step": 36660 + }, + { + "epoch": 2.2758085542243465, + "grad_norm": 0.15046616923415557, + "learning_rate": 1.6725298448288933e-05, + "loss": 2.7094, + "step": 36661 + }, + { + "epoch": 2.2758706313241044, + "grad_norm": 0.14444049357040195, + "learning_rate": 1.6722602881990007e-05, + "loss": 2.7369, + "step": 36662 + }, + { + "epoch": 2.2759327084238623, + "grad_norm": 0.15399256406247125, + "learning_rate": 1.671990748930734e-05, + "loss": 2.6583, + "step": 36663 + }, + { + "epoch": 2.2759947855236202, + "grad_norm": 0.14801742064438755, + "learning_rate": 1.6717212270255e-05, + "loss": 2.7753, + "step": 36664 + }, + { + "epoch": 2.276056862623378, + "grad_norm": 0.1386741852903667, + "learning_rate": 1.6714517224847038e-05, + "loss": 2.723, + "step": 36665 + }, + { + "epoch": 2.276118939723136, + "grad_norm": 0.14010355886046252, + "learning_rate": 1.6711822353097507e-05, + "loss": 2.7286, + "step": 36666 + }, + { + "epoch": 2.276181016822894, + "grad_norm": 0.14134694589328575, + "learning_rate": 1.6709127655020495e-05, + "loss": 2.7939, + "step": 36667 + }, + { + "epoch": 2.276243093922652, + "grad_norm": 0.15404388353945064, + "learning_rate": 1.6706433130630045e-05, + "loss": 2.7712, + "step": 36668 + }, + { + "epoch": 2.27630517102241, + "grad_norm": 0.1491123719028649, + "learning_rate": 1.6703738779940214e-05, + "loss": 2.7822, + "step": 36669 + }, + { + "epoch": 2.2763672481221677, + "grad_norm": 0.14271474850522084, + "learning_rate": 1.6701044602965044e-05, + "loss": 2.7105, + "step": 36670 + }, + { + "epoch": 2.2764293252219256, + "grad_norm": 0.13887799753427144, + "learning_rate": 1.6698350599718625e-05, + "loss": 2.6756, + "step": 36671 + }, + { + "epoch": 2.2764914023216836, + "grad_norm": 0.14970847649819147, + "learning_rate": 1.6695656770215e-05, + "loss": 2.7469, + "step": 36672 + }, + { + "epoch": 2.2765534794214415, + "grad_norm": 0.1376797687604091, + "learning_rate": 1.6692963114468212e-05, + "loss": 2.6665, + "step": 36673 + }, + { + "epoch": 2.2766155565211994, + "grad_norm": 0.1409834954598882, + "learning_rate": 1.6690269632492313e-05, + "loss": 2.8012, + "step": 36674 + }, + { + "epoch": 2.2766776336209573, + "grad_norm": 0.14147371900758457, + "learning_rate": 1.6687576324301374e-05, + "loss": 2.6639, + "step": 36675 + }, + { + "epoch": 2.2767397107207152, + "grad_norm": 0.1422841163026485, + "learning_rate": 1.6684883189909432e-05, + "loss": 2.7985, + "step": 36676 + }, + { + "epoch": 2.276801787820473, + "grad_norm": 0.13870056956890367, + "learning_rate": 1.6682190229330552e-05, + "loss": 2.7957, + "step": 36677 + }, + { + "epoch": 2.276863864920231, + "grad_norm": 0.16296285564665872, + "learning_rate": 1.6679497442578767e-05, + "loss": 2.7108, + "step": 36678 + }, + { + "epoch": 2.276925942019989, + "grad_norm": 0.18430267588439436, + "learning_rate": 1.6676804829668118e-05, + "loss": 2.713, + "step": 36679 + }, + { + "epoch": 2.276988019119747, + "grad_norm": 0.1518986018212871, + "learning_rate": 1.6674112390612686e-05, + "loss": 2.8083, + "step": 36680 + }, + { + "epoch": 2.2770500962195044, + "grad_norm": 0.13812385993374823, + "learning_rate": 1.6671420125426497e-05, + "loss": 2.7409, + "step": 36681 + }, + { + "epoch": 2.2771121733192627, + "grad_norm": 0.14009571472422344, + "learning_rate": 1.6668728034123598e-05, + "loss": 2.7202, + "step": 36682 + }, + { + "epoch": 2.27717425041902, + "grad_norm": 0.1379114240678038, + "learning_rate": 1.666603611671802e-05, + "loss": 2.7051, + "step": 36683 + }, + { + "epoch": 2.277236327518778, + "grad_norm": 0.13916449958314175, + "learning_rate": 1.666334437322383e-05, + "loss": 2.7716, + "step": 36684 + }, + { + "epoch": 2.277298404618536, + "grad_norm": 0.1449715570287845, + "learning_rate": 1.6660652803655075e-05, + "loss": 2.7427, + "step": 36685 + }, + { + "epoch": 2.277360481718294, + "grad_norm": 0.14187682790250894, + "learning_rate": 1.665796140802579e-05, + "loss": 2.6482, + "step": 36686 + }, + { + "epoch": 2.277422558818052, + "grad_norm": 0.16681714392622773, + "learning_rate": 1.665527018635002e-05, + "loss": 2.7939, + "step": 36687 + }, + { + "epoch": 2.27748463591781, + "grad_norm": 0.1404183843464691, + "learning_rate": 1.6652579138641776e-05, + "loss": 2.8084, + "step": 36688 + }, + { + "epoch": 2.2775467130175677, + "grad_norm": 0.15133270363092, + "learning_rate": 1.6649888264915143e-05, + "loss": 2.6508, + "step": 36689 + }, + { + "epoch": 2.2776087901173256, + "grad_norm": 0.14017719255863476, + "learning_rate": 1.6647197565184137e-05, + "loss": 2.8001, + "step": 36690 + }, + { + "epoch": 2.2776708672170836, + "grad_norm": 0.14758674358123355, + "learning_rate": 1.6644507039462796e-05, + "loss": 2.7738, + "step": 36691 + }, + { + "epoch": 2.2777329443168415, + "grad_norm": 0.16602005111179605, + "learning_rate": 1.664181668776516e-05, + "loss": 2.6844, + "step": 36692 + }, + { + "epoch": 2.2777950214165994, + "grad_norm": 0.1343709168811826, + "learning_rate": 1.6639126510105253e-05, + "loss": 2.7132, + "step": 36693 + }, + { + "epoch": 2.2778570985163573, + "grad_norm": 0.1452801478683209, + "learning_rate": 1.6636436506497134e-05, + "loss": 2.7001, + "step": 36694 + }, + { + "epoch": 2.2779191756161152, + "grad_norm": 0.1446899367718417, + "learning_rate": 1.663374667695482e-05, + "loss": 2.8238, + "step": 36695 + }, + { + "epoch": 2.277981252715873, + "grad_norm": 0.16535689431184022, + "learning_rate": 1.6631057021492356e-05, + "loss": 2.7146, + "step": 36696 + }, + { + "epoch": 2.278043329815631, + "grad_norm": 0.13726785608040593, + "learning_rate": 1.662836754012375e-05, + "loss": 2.7539, + "step": 36697 + }, + { + "epoch": 2.278105406915389, + "grad_norm": 0.15881323612790157, + "learning_rate": 1.6625678232863068e-05, + "loss": 2.6473, + "step": 36698 + }, + { + "epoch": 2.278167484015147, + "grad_norm": 0.1823035724475829, + "learning_rate": 1.6622989099724323e-05, + "loss": 2.7494, + "step": 36699 + }, + { + "epoch": 2.278229561114905, + "grad_norm": 0.1538726007400777, + "learning_rate": 1.6620300140721552e-05, + "loss": 2.7275, + "step": 36700 + }, + { + "epoch": 2.2782916382146627, + "grad_norm": 0.1377958444772431, + "learning_rate": 1.6617611355868772e-05, + "loss": 2.7683, + "step": 36701 + }, + { + "epoch": 2.2783537153144207, + "grad_norm": 0.13894709958355456, + "learning_rate": 1.6614922745180006e-05, + "loss": 2.7917, + "step": 36702 + }, + { + "epoch": 2.2784157924141786, + "grad_norm": 0.14946249881108548, + "learning_rate": 1.6612234308669305e-05, + "loss": 2.6933, + "step": 36703 + }, + { + "epoch": 2.2784778695139365, + "grad_norm": 0.14418773165668916, + "learning_rate": 1.6609546046350686e-05, + "loss": 2.6333, + "step": 36704 + }, + { + "epoch": 2.2785399466136944, + "grad_norm": 0.13778571955309094, + "learning_rate": 1.660685795823817e-05, + "loss": 2.7431, + "step": 36705 + }, + { + "epoch": 2.278602023713452, + "grad_norm": 0.14659539783790124, + "learning_rate": 1.6604170044345767e-05, + "loss": 2.7392, + "step": 36706 + }, + { + "epoch": 2.2786641008132102, + "grad_norm": 0.13842101980673535, + "learning_rate": 1.6601482304687534e-05, + "loss": 2.6809, + "step": 36707 + }, + { + "epoch": 2.2787261779129677, + "grad_norm": 0.1393762282993094, + "learning_rate": 1.659879473927747e-05, + "loss": 2.8639, + "step": 36708 + }, + { + "epoch": 2.2787882550127256, + "grad_norm": 0.1491803541765402, + "learning_rate": 1.6596107348129604e-05, + "loss": 2.7623, + "step": 36709 + }, + { + "epoch": 2.2788503321124836, + "grad_norm": 0.13705743664513928, + "learning_rate": 1.659342013125794e-05, + "loss": 2.7372, + "step": 36710 + }, + { + "epoch": 2.2789124092122415, + "grad_norm": 0.14432390908797518, + "learning_rate": 1.659073308867653e-05, + "loss": 2.765, + "step": 36711 + }, + { + "epoch": 2.2789744863119994, + "grad_norm": 0.1421072841047971, + "learning_rate": 1.6588046220399372e-05, + "loss": 2.6879, + "step": 36712 + }, + { + "epoch": 2.2790365634117573, + "grad_norm": 0.1363334198390765, + "learning_rate": 1.658535952644049e-05, + "loss": 2.6765, + "step": 36713 + }, + { + "epoch": 2.2790986405115152, + "grad_norm": 0.1539985331660552, + "learning_rate": 1.6582673006813905e-05, + "loss": 2.7376, + "step": 36714 + }, + { + "epoch": 2.279160717611273, + "grad_norm": 0.1515241916162735, + "learning_rate": 1.65799866615336e-05, + "loss": 2.7393, + "step": 36715 + }, + { + "epoch": 2.279222794711031, + "grad_norm": 0.1514970837338365, + "learning_rate": 1.657730049061364e-05, + "loss": 2.7354, + "step": 36716 + }, + { + "epoch": 2.279284871810789, + "grad_norm": 0.13747386169165238, + "learning_rate": 1.6574614494068003e-05, + "loss": 2.7311, + "step": 36717 + }, + { + "epoch": 2.279346948910547, + "grad_norm": 0.13093171326543793, + "learning_rate": 1.657192867191073e-05, + "loss": 2.6493, + "step": 36718 + }, + { + "epoch": 2.279409026010305, + "grad_norm": 0.14897897739564636, + "learning_rate": 1.6569243024155816e-05, + "loss": 2.7022, + "step": 36719 + }, + { + "epoch": 2.2794711031100627, + "grad_norm": 0.1344218222416135, + "learning_rate": 1.656655755081727e-05, + "loss": 2.7797, + "step": 36720 + }, + { + "epoch": 2.2795331802098207, + "grad_norm": 0.16166554441894107, + "learning_rate": 1.656387225190912e-05, + "loss": 2.8021, + "step": 36721 + }, + { + "epoch": 2.2795952573095786, + "grad_norm": 0.16020772269640637, + "learning_rate": 1.6561187127445364e-05, + "loss": 2.6545, + "step": 36722 + }, + { + "epoch": 2.2796573344093365, + "grad_norm": 0.15328463363995826, + "learning_rate": 1.6558502177440016e-05, + "loss": 2.7765, + "step": 36723 + }, + { + "epoch": 2.2797194115090944, + "grad_norm": 0.14003747489593235, + "learning_rate": 1.655581740190706e-05, + "loss": 2.7523, + "step": 36724 + }, + { + "epoch": 2.2797814886088523, + "grad_norm": 0.1420998323999956, + "learning_rate": 1.6553132800860544e-05, + "loss": 2.7297, + "step": 36725 + }, + { + "epoch": 2.2798435657086102, + "grad_norm": 0.1569185794562897, + "learning_rate": 1.655044837431445e-05, + "loss": 2.7701, + "step": 36726 + }, + { + "epoch": 2.279905642808368, + "grad_norm": 0.15856793809000044, + "learning_rate": 1.6547764122282788e-05, + "loss": 2.6432, + "step": 36727 + }, + { + "epoch": 2.279967719908126, + "grad_norm": 0.1452486174660443, + "learning_rate": 1.654508004477956e-05, + "loss": 2.7145, + "step": 36728 + }, + { + "epoch": 2.2800297970078836, + "grad_norm": 0.14133140838503508, + "learning_rate": 1.6542396141818762e-05, + "loss": 2.6806, + "step": 36729 + }, + { + "epoch": 2.280091874107642, + "grad_norm": 0.15098891830714573, + "learning_rate": 1.6539712413414416e-05, + "loss": 2.7703, + "step": 36730 + }, + { + "epoch": 2.2801539512073994, + "grad_norm": 0.14300475891737702, + "learning_rate": 1.653702885958051e-05, + "loss": 2.6473, + "step": 36731 + }, + { + "epoch": 2.2802160283071573, + "grad_norm": 0.1420658065022586, + "learning_rate": 1.6534345480331054e-05, + "loss": 2.7734, + "step": 36732 + }, + { + "epoch": 2.2802781054069152, + "grad_norm": 0.1388832952114724, + "learning_rate": 1.653166227568002e-05, + "loss": 2.8216, + "step": 36733 + }, + { + "epoch": 2.280340182506673, + "grad_norm": 0.14080917695311196, + "learning_rate": 1.6528979245641445e-05, + "loss": 2.6664, + "step": 36734 + }, + { + "epoch": 2.280402259606431, + "grad_norm": 0.13946167254498645, + "learning_rate": 1.652629639022932e-05, + "loss": 2.7994, + "step": 36735 + }, + { + "epoch": 2.280464336706189, + "grad_norm": 0.14441066299411892, + "learning_rate": 1.652361370945762e-05, + "loss": 2.6358, + "step": 36736 + }, + { + "epoch": 2.280526413805947, + "grad_norm": 0.1484113532177143, + "learning_rate": 1.652093120334036e-05, + "loss": 2.7442, + "step": 36737 + }, + { + "epoch": 2.280588490905705, + "grad_norm": 0.13774277836160817, + "learning_rate": 1.6518248871891513e-05, + "loss": 2.729, + "step": 36738 + }, + { + "epoch": 2.2806505680054627, + "grad_norm": 0.13808330360717125, + "learning_rate": 1.6515566715125108e-05, + "loss": 2.7003, + "step": 36739 + }, + { + "epoch": 2.2807126451052206, + "grad_norm": 0.14197757388635107, + "learning_rate": 1.651288473305512e-05, + "loss": 2.7335, + "step": 36740 + }, + { + "epoch": 2.2807747222049786, + "grad_norm": 0.1437592249481086, + "learning_rate": 1.651020292569554e-05, + "loss": 2.6784, + "step": 36741 + }, + { + "epoch": 2.2808367993047365, + "grad_norm": 0.14130291192495747, + "learning_rate": 1.650752129306034e-05, + "loss": 2.6538, + "step": 36742 + }, + { + "epoch": 2.2808988764044944, + "grad_norm": 0.15143812424723535, + "learning_rate": 1.6504839835163554e-05, + "loss": 2.6943, + "step": 36743 + }, + { + "epoch": 2.2809609535042523, + "grad_norm": 0.15146486760722797, + "learning_rate": 1.650215855201915e-05, + "loss": 2.6377, + "step": 36744 + }, + { + "epoch": 2.2810230306040102, + "grad_norm": 0.14231858080159968, + "learning_rate": 1.6499477443641116e-05, + "loss": 2.7407, + "step": 36745 + }, + { + "epoch": 2.281085107703768, + "grad_norm": 0.14552899020928428, + "learning_rate": 1.649679651004342e-05, + "loss": 2.7329, + "step": 36746 + }, + { + "epoch": 2.281147184803526, + "grad_norm": 0.15035118477611117, + "learning_rate": 1.649411575124009e-05, + "loss": 2.7365, + "step": 36747 + }, + { + "epoch": 2.281209261903284, + "grad_norm": 0.1407096396150163, + "learning_rate": 1.6491435167245095e-05, + "loss": 2.7162, + "step": 36748 + }, + { + "epoch": 2.281271339003042, + "grad_norm": 0.14154671231500618, + "learning_rate": 1.6488754758072412e-05, + "loss": 2.6849, + "step": 36749 + }, + { + "epoch": 2.2813334161028, + "grad_norm": 0.1429237917788964, + "learning_rate": 1.6486074523736016e-05, + "loss": 2.7358, + "step": 36750 + }, + { + "epoch": 2.2813954932025577, + "grad_norm": 0.13693275732161195, + "learning_rate": 1.6483394464249925e-05, + "loss": 2.7228, + "step": 36751 + }, + { + "epoch": 2.2814575703023157, + "grad_norm": 0.14042723180032354, + "learning_rate": 1.6480714579628076e-05, + "loss": 2.7299, + "step": 36752 + }, + { + "epoch": 2.2815196474020736, + "grad_norm": 0.14363075368991432, + "learning_rate": 1.64780348698845e-05, + "loss": 2.6935, + "step": 36753 + }, + { + "epoch": 2.281581724501831, + "grad_norm": 0.133370887570051, + "learning_rate": 1.6475355335033154e-05, + "loss": 2.7154, + "step": 36754 + }, + { + "epoch": 2.2816438016015894, + "grad_norm": 0.1459401766758247, + "learning_rate": 1.6472675975088016e-05, + "loss": 2.6532, + "step": 36755 + }, + { + "epoch": 2.281705878701347, + "grad_norm": 0.14330948759399095, + "learning_rate": 1.646999679006305e-05, + "loss": 2.8275, + "step": 36756 + }, + { + "epoch": 2.281767955801105, + "grad_norm": 0.1401656981486997, + "learning_rate": 1.6467317779972262e-05, + "loss": 2.7435, + "step": 36757 + }, + { + "epoch": 2.2818300329008627, + "grad_norm": 0.1431986812457739, + "learning_rate": 1.6464638944829625e-05, + "loss": 2.6359, + "step": 36758 + }, + { + "epoch": 2.2818921100006206, + "grad_norm": 0.1473911069532328, + "learning_rate": 1.64619602846491e-05, + "loss": 2.7425, + "step": 36759 + }, + { + "epoch": 2.2819541871003786, + "grad_norm": 0.14400250787960167, + "learning_rate": 1.645928179944466e-05, + "loss": 2.6543, + "step": 36760 + }, + { + "epoch": 2.2820162642001365, + "grad_norm": 0.1391485577484754, + "learning_rate": 1.6456603489230305e-05, + "loss": 2.7753, + "step": 36761 + }, + { + "epoch": 2.2820783412998944, + "grad_norm": 0.13741325682117006, + "learning_rate": 1.645392535401999e-05, + "loss": 2.7451, + "step": 36762 + }, + { + "epoch": 2.2821404183996523, + "grad_norm": 0.13484320964240834, + "learning_rate": 1.645124739382769e-05, + "loss": 2.6968, + "step": 36763 + }, + { + "epoch": 2.2822024954994102, + "grad_norm": 0.14874247541950242, + "learning_rate": 1.6448569608667382e-05, + "loss": 2.6786, + "step": 36764 + }, + { + "epoch": 2.282264572599168, + "grad_norm": 0.13933959902654477, + "learning_rate": 1.644589199855301e-05, + "loss": 2.6631, + "step": 36765 + }, + { + "epoch": 2.282326649698926, + "grad_norm": 0.14344784919536444, + "learning_rate": 1.6443214563498582e-05, + "loss": 2.6863, + "step": 36766 + }, + { + "epoch": 2.282388726798684, + "grad_norm": 0.143227840908735, + "learning_rate": 1.644053730351805e-05, + "loss": 2.7303, + "step": 36767 + }, + { + "epoch": 2.282450803898442, + "grad_norm": 0.14918110332631232, + "learning_rate": 1.6437860218625378e-05, + "loss": 2.6989, + "step": 36768 + }, + { + "epoch": 2.2825128809982, + "grad_norm": 0.14722221200071256, + "learning_rate": 1.643518330883453e-05, + "loss": 2.7278, + "step": 36769 + }, + { + "epoch": 2.2825749580979577, + "grad_norm": 0.1445564235209233, + "learning_rate": 1.6432506574159483e-05, + "loss": 2.6492, + "step": 36770 + }, + { + "epoch": 2.2826370351977157, + "grad_norm": 0.1366996547442941, + "learning_rate": 1.6429830014614207e-05, + "loss": 2.6929, + "step": 36771 + }, + { + "epoch": 2.2826991122974736, + "grad_norm": 0.1671635713120338, + "learning_rate": 1.6427153630212654e-05, + "loss": 2.7263, + "step": 36772 + }, + { + "epoch": 2.2827611893972315, + "grad_norm": 0.14499337188394823, + "learning_rate": 1.6424477420968777e-05, + "loss": 2.7901, + "step": 36773 + }, + { + "epoch": 2.2828232664969894, + "grad_norm": 0.13858810097192584, + "learning_rate": 1.642180138689656e-05, + "loss": 2.6799, + "step": 36774 + }, + { + "epoch": 2.2828853435967473, + "grad_norm": 0.17667811267158515, + "learning_rate": 1.6419125528009967e-05, + "loss": 2.7283, + "step": 36775 + }, + { + "epoch": 2.2829474206965052, + "grad_norm": 0.14537298753522987, + "learning_rate": 1.6416449844322945e-05, + "loss": 2.6869, + "step": 36776 + }, + { + "epoch": 2.2830094977962627, + "grad_norm": 0.15131505252933652, + "learning_rate": 1.6413774335849452e-05, + "loss": 2.7118, + "step": 36777 + }, + { + "epoch": 2.283071574896021, + "grad_norm": 0.1542347891756297, + "learning_rate": 1.641109900260344e-05, + "loss": 2.7431, + "step": 36778 + }, + { + "epoch": 2.2831336519957786, + "grad_norm": 0.14538358979829902, + "learning_rate": 1.6408423844598893e-05, + "loss": 2.6903, + "step": 36779 + }, + { + "epoch": 2.2831957290955365, + "grad_norm": 0.13852542208918958, + "learning_rate": 1.6405748861849758e-05, + "loss": 2.6693, + "step": 36780 + }, + { + "epoch": 2.2832578061952944, + "grad_norm": 0.14388067510077482, + "learning_rate": 1.6403074054369982e-05, + "loss": 2.6692, + "step": 36781 + }, + { + "epoch": 2.2833198832950523, + "grad_norm": 0.14737043066358674, + "learning_rate": 1.6400399422173507e-05, + "loss": 2.6636, + "step": 36782 + }, + { + "epoch": 2.2833819603948102, + "grad_norm": 0.14386692604333268, + "learning_rate": 1.6397724965274308e-05, + "loss": 2.6974, + "step": 36783 + }, + { + "epoch": 2.283444037494568, + "grad_norm": 0.13952425123263973, + "learning_rate": 1.6395050683686348e-05, + "loss": 2.7393, + "step": 36784 + }, + { + "epoch": 2.283506114594326, + "grad_norm": 0.1500873790491239, + "learning_rate": 1.639237657742357e-05, + "loss": 2.7522, + "step": 36785 + }, + { + "epoch": 2.283568191694084, + "grad_norm": 0.14422165730316064, + "learning_rate": 1.6389702646499922e-05, + "loss": 2.681, + "step": 36786 + }, + { + "epoch": 2.283630268793842, + "grad_norm": 0.13871777137631147, + "learning_rate": 1.638702889092935e-05, + "loss": 2.7237, + "step": 36787 + }, + { + "epoch": 2.2836923458936, + "grad_norm": 0.15123973525895956, + "learning_rate": 1.6384355310725796e-05, + "loss": 2.8371, + "step": 36788 + }, + { + "epoch": 2.2837544229933577, + "grad_norm": 0.16838357954992594, + "learning_rate": 1.6381681905903235e-05, + "loss": 2.8029, + "step": 36789 + }, + { + "epoch": 2.2838165000931157, + "grad_norm": 0.14203383793797003, + "learning_rate": 1.6379008676475598e-05, + "loss": 2.7408, + "step": 36790 + }, + { + "epoch": 2.2838785771928736, + "grad_norm": 0.15844453829870583, + "learning_rate": 1.637633562245684e-05, + "loss": 2.661, + "step": 36791 + }, + { + "epoch": 2.2839406542926315, + "grad_norm": 0.17105959242125696, + "learning_rate": 1.6373662743860883e-05, + "loss": 2.7787, + "step": 36792 + }, + { + "epoch": 2.2840027313923894, + "grad_norm": 0.1432620718755966, + "learning_rate": 1.6370990040701707e-05, + "loss": 2.6884, + "step": 36793 + }, + { + "epoch": 2.2840648084921473, + "grad_norm": 0.17451929525313345, + "learning_rate": 1.6368317512993243e-05, + "loss": 2.7222, + "step": 36794 + }, + { + "epoch": 2.2841268855919052, + "grad_norm": 0.16373938008261477, + "learning_rate": 1.6365645160749426e-05, + "loss": 2.7394, + "step": 36795 + }, + { + "epoch": 2.284188962691663, + "grad_norm": 0.14428141626270374, + "learning_rate": 1.6362972983984188e-05, + "loss": 2.8326, + "step": 36796 + }, + { + "epoch": 2.284251039791421, + "grad_norm": 0.16873759802527138, + "learning_rate": 1.63603009827115e-05, + "loss": 2.7438, + "step": 36797 + }, + { + "epoch": 2.284313116891179, + "grad_norm": 0.1591695185496529, + "learning_rate": 1.635762915694529e-05, + "loss": 2.7115, + "step": 36798 + }, + { + "epoch": 2.284375193990937, + "grad_norm": 0.15605386702759896, + "learning_rate": 1.6354957506699498e-05, + "loss": 2.7899, + "step": 36799 + }, + { + "epoch": 2.284437271090695, + "grad_norm": 0.152451599288614, + "learning_rate": 1.635228603198806e-05, + "loss": 2.713, + "step": 36800 + }, + { + "epoch": 2.2844993481904527, + "grad_norm": 0.1560303992609798, + "learning_rate": 1.6349614732824898e-05, + "loss": 2.7559, + "step": 36801 + }, + { + "epoch": 2.2845614252902102, + "grad_norm": 0.14533833740603702, + "learning_rate": 1.634694360922398e-05, + "loss": 2.7455, + "step": 36802 + }, + { + "epoch": 2.2846235023899686, + "grad_norm": 0.14647366802503894, + "learning_rate": 1.6344272661199223e-05, + "loss": 2.7986, + "step": 36803 + }, + { + "epoch": 2.284685579489726, + "grad_norm": 0.16990625923574879, + "learning_rate": 1.634160188876457e-05, + "loss": 2.7748, + "step": 36804 + }, + { + "epoch": 2.284747656589484, + "grad_norm": 0.1526668740893014, + "learning_rate": 1.6338931291933936e-05, + "loss": 2.7247, + "step": 36805 + }, + { + "epoch": 2.284809733689242, + "grad_norm": 0.17941828961316797, + "learning_rate": 1.6336260870721283e-05, + "loss": 2.6543, + "step": 36806 + }, + { + "epoch": 2.284871810789, + "grad_norm": 0.13788705691511147, + "learning_rate": 1.6333590625140525e-05, + "loss": 2.6431, + "step": 36807 + }, + { + "epoch": 2.2849338878887577, + "grad_norm": 0.13989642605640604, + "learning_rate": 1.6330920555205602e-05, + "loss": 2.6863, + "step": 36808 + }, + { + "epoch": 2.2849959649885156, + "grad_norm": 0.15139171647705874, + "learning_rate": 1.632825066093042e-05, + "loss": 2.7644, + "step": 36809 + }, + { + "epoch": 2.2850580420882736, + "grad_norm": 0.13761781310676396, + "learning_rate": 1.632558094232895e-05, + "loss": 2.7264, + "step": 36810 + }, + { + "epoch": 2.2851201191880315, + "grad_norm": 0.14141148272996246, + "learning_rate": 1.6322911399415096e-05, + "loss": 2.7031, + "step": 36811 + }, + { + "epoch": 2.2851821962877894, + "grad_norm": 0.16198970790020376, + "learning_rate": 1.632024203220279e-05, + "loss": 2.7093, + "step": 36812 + }, + { + "epoch": 2.2852442733875473, + "grad_norm": 0.1406115684452647, + "learning_rate": 1.6317572840705958e-05, + "loss": 2.6566, + "step": 36813 + }, + { + "epoch": 2.2853063504873052, + "grad_norm": 0.14498381383552736, + "learning_rate": 1.631490382493851e-05, + "loss": 2.7636, + "step": 36814 + }, + { + "epoch": 2.285368427587063, + "grad_norm": 0.15823585854489652, + "learning_rate": 1.6312234984914382e-05, + "loss": 2.8109, + "step": 36815 + }, + { + "epoch": 2.285430504686821, + "grad_norm": 0.15067037968228328, + "learning_rate": 1.6309566320647518e-05, + "loss": 2.6979, + "step": 36816 + }, + { + "epoch": 2.285492581786579, + "grad_norm": 0.1442983529346166, + "learning_rate": 1.630689783215183e-05, + "loss": 2.6933, + "step": 36817 + }, + { + "epoch": 2.285554658886337, + "grad_norm": 0.18104253767248124, + "learning_rate": 1.6304229519441232e-05, + "loss": 2.6877, + "step": 36818 + }, + { + "epoch": 2.285616735986095, + "grad_norm": 0.1784125792342353, + "learning_rate": 1.6301561382529633e-05, + "loss": 2.7787, + "step": 36819 + }, + { + "epoch": 2.2856788130858527, + "grad_norm": 0.1551331480120304, + "learning_rate": 1.6298893421430982e-05, + "loss": 2.691, + "step": 36820 + }, + { + "epoch": 2.2857408901856107, + "grad_norm": 0.17407539546769055, + "learning_rate": 1.629622563615919e-05, + "loss": 2.661, + "step": 36821 + }, + { + "epoch": 2.2858029672853686, + "grad_norm": 0.16724474252290608, + "learning_rate": 1.6293558026728166e-05, + "loss": 2.6721, + "step": 36822 + }, + { + "epoch": 2.2858650443851265, + "grad_norm": 0.1393193923349946, + "learning_rate": 1.6290890593151836e-05, + "loss": 2.7136, + "step": 36823 + }, + { + "epoch": 2.2859271214848844, + "grad_norm": 0.14984264489722043, + "learning_rate": 1.6288223335444096e-05, + "loss": 2.7349, + "step": 36824 + }, + { + "epoch": 2.285989198584642, + "grad_norm": 0.1424819739656378, + "learning_rate": 1.6285556253618895e-05, + "loss": 2.8175, + "step": 36825 + }, + { + "epoch": 2.2860512756844003, + "grad_norm": 0.14899345486301782, + "learning_rate": 1.6282889347690134e-05, + "loss": 2.6737, + "step": 36826 + }, + { + "epoch": 2.2861133527841577, + "grad_norm": 0.1594160466536878, + "learning_rate": 1.628022261767172e-05, + "loss": 2.6551, + "step": 36827 + }, + { + "epoch": 2.2861754298839156, + "grad_norm": 0.15572730494399586, + "learning_rate": 1.6277556063577552e-05, + "loss": 2.6726, + "step": 36828 + }, + { + "epoch": 2.2862375069836736, + "grad_norm": 0.1418923844492079, + "learning_rate": 1.6274889685421584e-05, + "loss": 2.7241, + "step": 36829 + }, + { + "epoch": 2.2862995840834315, + "grad_norm": 0.15356719434562766, + "learning_rate": 1.6272223483217703e-05, + "loss": 2.6989, + "step": 36830 + }, + { + "epoch": 2.2863616611831894, + "grad_norm": 0.14197620291379148, + "learning_rate": 1.6269557456979818e-05, + "loss": 2.742, + "step": 36831 + }, + { + "epoch": 2.2864237382829473, + "grad_norm": 0.15979608301399326, + "learning_rate": 1.6266891606721816e-05, + "loss": 2.6675, + "step": 36832 + }, + { + "epoch": 2.2864858153827052, + "grad_norm": 0.14225573212583928, + "learning_rate": 1.6264225932457654e-05, + "loss": 2.7559, + "step": 36833 + }, + { + "epoch": 2.286547892482463, + "grad_norm": 0.16612110713799288, + "learning_rate": 1.6261560434201216e-05, + "loss": 2.6759, + "step": 36834 + }, + { + "epoch": 2.286609969582221, + "grad_norm": 0.1591855732584388, + "learning_rate": 1.6258895111966406e-05, + "loss": 2.7367, + "step": 36835 + }, + { + "epoch": 2.286672046681979, + "grad_norm": 0.14694776355914535, + "learning_rate": 1.6256229965767127e-05, + "loss": 2.6994, + "step": 36836 + }, + { + "epoch": 2.286734123781737, + "grad_norm": 0.14460830309906983, + "learning_rate": 1.6253564995617276e-05, + "loss": 2.7043, + "step": 36837 + }, + { + "epoch": 2.286796200881495, + "grad_norm": 0.16104068434314095, + "learning_rate": 1.6250900201530782e-05, + "loss": 2.6949, + "step": 36838 + }, + { + "epoch": 2.2868582779812527, + "grad_norm": 0.15102072411625006, + "learning_rate": 1.6248235583521537e-05, + "loss": 2.6789, + "step": 36839 + }, + { + "epoch": 2.2869203550810107, + "grad_norm": 0.13885670419817797, + "learning_rate": 1.6245571141603437e-05, + "loss": 2.6623, + "step": 36840 + }, + { + "epoch": 2.2869824321807686, + "grad_norm": 0.15119378401819886, + "learning_rate": 1.6242906875790376e-05, + "loss": 2.799, + "step": 36841 + }, + { + "epoch": 2.2870445092805265, + "grad_norm": 0.15291918990156542, + "learning_rate": 1.6240242786096278e-05, + "loss": 2.7385, + "step": 36842 + }, + { + "epoch": 2.2871065863802844, + "grad_norm": 0.13969886642866877, + "learning_rate": 1.6237578872535024e-05, + "loss": 2.6702, + "step": 36843 + }, + { + "epoch": 2.2871686634800423, + "grad_norm": 0.14296975630525902, + "learning_rate": 1.6234915135120526e-05, + "loss": 2.776, + "step": 36844 + }, + { + "epoch": 2.2872307405798002, + "grad_norm": 0.14330023317412902, + "learning_rate": 1.623225157386665e-05, + "loss": 2.6522, + "step": 36845 + }, + { + "epoch": 2.287292817679558, + "grad_norm": 0.1424940658366894, + "learning_rate": 1.6229588188787332e-05, + "loss": 2.7552, + "step": 36846 + }, + { + "epoch": 2.287354894779316, + "grad_norm": 0.14660610862775855, + "learning_rate": 1.6226924979896453e-05, + "loss": 2.7004, + "step": 36847 + }, + { + "epoch": 2.287416971879074, + "grad_norm": 0.14472616429859728, + "learning_rate": 1.6224261947207885e-05, + "loss": 2.7734, + "step": 36848 + }, + { + "epoch": 2.287479048978832, + "grad_norm": 0.1495268090903074, + "learning_rate": 1.6221599090735563e-05, + "loss": 2.7251, + "step": 36849 + }, + { + "epoch": 2.2875411260785894, + "grad_norm": 0.16018421231377689, + "learning_rate": 1.6218936410493356e-05, + "loss": 2.7235, + "step": 36850 + }, + { + "epoch": 2.2876032031783478, + "grad_norm": 0.1457661377411897, + "learning_rate": 1.6216273906495143e-05, + "loss": 2.7808, + "step": 36851 + }, + { + "epoch": 2.2876652802781052, + "grad_norm": 0.1883032402081397, + "learning_rate": 1.6213611578754845e-05, + "loss": 2.6959, + "step": 36852 + }, + { + "epoch": 2.287727357377863, + "grad_norm": 0.16084900918496214, + "learning_rate": 1.6210949427286338e-05, + "loss": 2.7395, + "step": 36853 + }, + { + "epoch": 2.287789434477621, + "grad_norm": 0.14663168811341598, + "learning_rate": 1.620828745210351e-05, + "loss": 2.7799, + "step": 36854 + }, + { + "epoch": 2.287851511577379, + "grad_norm": 0.1476614138795206, + "learning_rate": 1.6205625653220236e-05, + "loss": 2.7198, + "step": 36855 + }, + { + "epoch": 2.287913588677137, + "grad_norm": 0.15797471995444332, + "learning_rate": 1.620296403065043e-05, + "loss": 2.647, + "step": 36856 + }, + { + "epoch": 2.287975665776895, + "grad_norm": 0.1432219431240463, + "learning_rate": 1.6200302584407966e-05, + "loss": 2.6924, + "step": 36857 + }, + { + "epoch": 2.2880377428766527, + "grad_norm": 0.1471751646068489, + "learning_rate": 1.619764131450673e-05, + "loss": 2.793, + "step": 36858 + }, + { + "epoch": 2.2880998199764107, + "grad_norm": 0.16843361622715486, + "learning_rate": 1.619498022096058e-05, + "loss": 2.7938, + "step": 36859 + }, + { + "epoch": 2.2881618970761686, + "grad_norm": 0.18590462979117103, + "learning_rate": 1.6192319303783453e-05, + "loss": 2.7986, + "step": 36860 + }, + { + "epoch": 2.2882239741759265, + "grad_norm": 0.14614540936002277, + "learning_rate": 1.6189658562989197e-05, + "loss": 2.752, + "step": 36861 + }, + { + "epoch": 2.2882860512756844, + "grad_norm": 0.15723641287707366, + "learning_rate": 1.6186997998591703e-05, + "loss": 2.7523, + "step": 36862 + }, + { + "epoch": 2.2883481283754423, + "grad_norm": 0.16889994348367804, + "learning_rate": 1.618433761060485e-05, + "loss": 2.7958, + "step": 36863 + }, + { + "epoch": 2.2884102054752002, + "grad_norm": 0.17186177267848346, + "learning_rate": 1.6181677399042495e-05, + "loss": 2.7155, + "step": 36864 + }, + { + "epoch": 2.288472282574958, + "grad_norm": 0.14133636370463826, + "learning_rate": 1.617901736391856e-05, + "loss": 2.6583, + "step": 36865 + }, + { + "epoch": 2.288534359674716, + "grad_norm": 0.1451573214989279, + "learning_rate": 1.61763575052469e-05, + "loss": 2.6781, + "step": 36866 + }, + { + "epoch": 2.288596436774474, + "grad_norm": 0.1548879384446929, + "learning_rate": 1.617369782304139e-05, + "loss": 2.7306, + "step": 36867 + }, + { + "epoch": 2.288658513874232, + "grad_norm": 0.14515186142010417, + "learning_rate": 1.61710383173159e-05, + "loss": 2.7558, + "step": 36868 + }, + { + "epoch": 2.28872059097399, + "grad_norm": 0.14255180507874804, + "learning_rate": 1.6168378988084325e-05, + "loss": 2.7475, + "step": 36869 + }, + { + "epoch": 2.2887826680737478, + "grad_norm": 0.147743012152099, + "learning_rate": 1.6165719835360537e-05, + "loss": 2.779, + "step": 36870 + }, + { + "epoch": 2.2888447451735057, + "grad_norm": 0.13764432544781097, + "learning_rate": 1.6163060859158395e-05, + "loss": 2.7091, + "step": 36871 + }, + { + "epoch": 2.2889068222732636, + "grad_norm": 0.13701960295589313, + "learning_rate": 1.6160402059491787e-05, + "loss": 2.6574, + "step": 36872 + }, + { + "epoch": 2.288968899373021, + "grad_norm": 0.1495873997836424, + "learning_rate": 1.6157743436374556e-05, + "loss": 2.7266, + "step": 36873 + }, + { + "epoch": 2.2890309764727794, + "grad_norm": 0.1413366141018153, + "learning_rate": 1.6155084989820608e-05, + "loss": 2.6715, + "step": 36874 + }, + { + "epoch": 2.289093053572537, + "grad_norm": 0.14313606285043004, + "learning_rate": 1.61524267198438e-05, + "loss": 2.7206, + "step": 36875 + }, + { + "epoch": 2.289155130672295, + "grad_norm": 0.20411034159054392, + "learning_rate": 1.6149768626457994e-05, + "loss": 2.77, + "step": 36876 + }, + { + "epoch": 2.2892172077720527, + "grad_norm": 0.1717731596766244, + "learning_rate": 1.614711070967705e-05, + "loss": 2.6738, + "step": 36877 + }, + { + "epoch": 2.2892792848718106, + "grad_norm": 0.14201059786149997, + "learning_rate": 1.6144452969514863e-05, + "loss": 2.6838, + "step": 36878 + }, + { + "epoch": 2.2893413619715686, + "grad_norm": 0.1412817137908242, + "learning_rate": 1.6141795405985282e-05, + "loss": 2.7449, + "step": 36879 + }, + { + "epoch": 2.2894034390713265, + "grad_norm": 0.14709272581140606, + "learning_rate": 1.613913801910218e-05, + "loss": 2.7832, + "step": 36880 + }, + { + "epoch": 2.2894655161710844, + "grad_norm": 0.14168537553381666, + "learning_rate": 1.613648080887939e-05, + "loss": 2.663, + "step": 36881 + }, + { + "epoch": 2.2895275932708423, + "grad_norm": 0.14669765839796303, + "learning_rate": 1.6133823775330803e-05, + "loss": 2.7001, + "step": 36882 + }, + { + "epoch": 2.2895896703706002, + "grad_norm": 0.13875294048612966, + "learning_rate": 1.6131166918470296e-05, + "loss": 2.6714, + "step": 36883 + }, + { + "epoch": 2.289651747470358, + "grad_norm": 0.14455266097998126, + "learning_rate": 1.6128510238311717e-05, + "loss": 2.6916, + "step": 36884 + }, + { + "epoch": 2.289713824570116, + "grad_norm": 0.14086903484479346, + "learning_rate": 1.6125853734868918e-05, + "loss": 2.745, + "step": 36885 + }, + { + "epoch": 2.289775901669874, + "grad_norm": 0.16778782037938714, + "learning_rate": 1.6123197408155765e-05, + "loss": 2.7627, + "step": 36886 + }, + { + "epoch": 2.289837978769632, + "grad_norm": 0.15583152163590547, + "learning_rate": 1.61205412581861e-05, + "loss": 2.6952, + "step": 36887 + }, + { + "epoch": 2.28990005586939, + "grad_norm": 0.13676735150373942, + "learning_rate": 1.611788528497381e-05, + "loss": 2.661, + "step": 36888 + }, + { + "epoch": 2.2899621329691477, + "grad_norm": 0.14925708503016738, + "learning_rate": 1.6115229488532734e-05, + "loss": 2.6524, + "step": 36889 + }, + { + "epoch": 2.2900242100689057, + "grad_norm": 0.13812506686506767, + "learning_rate": 1.611257386887674e-05, + "loss": 2.7411, + "step": 36890 + }, + { + "epoch": 2.2900862871686636, + "grad_norm": 0.14815994715588485, + "learning_rate": 1.6109918426019648e-05, + "loss": 2.669, + "step": 36891 + }, + { + "epoch": 2.2901483642684215, + "grad_norm": 0.14020171303114087, + "learning_rate": 1.6107263159975365e-05, + "loss": 2.8519, + "step": 36892 + }, + { + "epoch": 2.2902104413681794, + "grad_norm": 0.16846920467011792, + "learning_rate": 1.6104608070757714e-05, + "loss": 2.68, + "step": 36893 + }, + { + "epoch": 2.2902725184679373, + "grad_norm": 0.13968602588268314, + "learning_rate": 1.610195315838055e-05, + "loss": 2.7489, + "step": 36894 + }, + { + "epoch": 2.2903345955676953, + "grad_norm": 0.14008146120344064, + "learning_rate": 1.609929842285771e-05, + "loss": 2.705, + "step": 36895 + }, + { + "epoch": 2.290396672667453, + "grad_norm": 0.14399499902634286, + "learning_rate": 1.6096643864203076e-05, + "loss": 2.6907, + "step": 36896 + }, + { + "epoch": 2.290458749767211, + "grad_norm": 0.1504259872972017, + "learning_rate": 1.6093989482430483e-05, + "loss": 2.6161, + "step": 36897 + }, + { + "epoch": 2.2905208268669686, + "grad_norm": 0.16780017555344712, + "learning_rate": 1.609133527755377e-05, + "loss": 2.6979, + "step": 36898 + }, + { + "epoch": 2.290582903966727, + "grad_norm": 0.1411516671951652, + "learning_rate": 1.60886812495868e-05, + "loss": 2.6813, + "step": 36899 + }, + { + "epoch": 2.2906449810664844, + "grad_norm": 0.1395821514067737, + "learning_rate": 1.6086027398543397e-05, + "loss": 2.7901, + "step": 36900 + }, + { + "epoch": 2.2907070581662423, + "grad_norm": 0.15721316902500246, + "learning_rate": 1.6083373724437432e-05, + "loss": 2.7347, + "step": 36901 + }, + { + "epoch": 2.2907691352660002, + "grad_norm": 0.1408694513538973, + "learning_rate": 1.6080720227282746e-05, + "loss": 2.6805, + "step": 36902 + }, + { + "epoch": 2.290831212365758, + "grad_norm": 0.13843469080862594, + "learning_rate": 1.6078066907093175e-05, + "loss": 2.6181, + "step": 36903 + }, + { + "epoch": 2.290893289465516, + "grad_norm": 0.14991614188145186, + "learning_rate": 1.6075413763882546e-05, + "loss": 2.7728, + "step": 36904 + }, + { + "epoch": 2.290955366565274, + "grad_norm": 0.14920738985847615, + "learning_rate": 1.6072760797664737e-05, + "loss": 2.7261, + "step": 36905 + }, + { + "epoch": 2.291017443665032, + "grad_norm": 0.16557283254691074, + "learning_rate": 1.607010800845357e-05, + "loss": 2.7065, + "step": 36906 + }, + { + "epoch": 2.29107952076479, + "grad_norm": 0.17339866607231583, + "learning_rate": 1.6067455396262885e-05, + "loss": 2.7246, + "step": 36907 + }, + { + "epoch": 2.2911415978645477, + "grad_norm": 0.14401073418410365, + "learning_rate": 1.6064802961106525e-05, + "loss": 2.7078, + "step": 36908 + }, + { + "epoch": 2.2912036749643057, + "grad_norm": 0.13923992079498446, + "learning_rate": 1.6062150702998308e-05, + "loss": 2.7155, + "step": 36909 + }, + { + "epoch": 2.2912657520640636, + "grad_norm": 0.15146450482938664, + "learning_rate": 1.6059498621952108e-05, + "loss": 2.7707, + "step": 36910 + }, + { + "epoch": 2.2913278291638215, + "grad_norm": 0.1421148402718681, + "learning_rate": 1.605684671798174e-05, + "loss": 2.7724, + "step": 36911 + }, + { + "epoch": 2.2913899062635794, + "grad_norm": 0.13809398670828, + "learning_rate": 1.6054194991101047e-05, + "loss": 2.6038, + "step": 36912 + }, + { + "epoch": 2.2914519833633373, + "grad_norm": 0.14462615454179137, + "learning_rate": 1.6051543441323835e-05, + "loss": 2.6458, + "step": 36913 + }, + { + "epoch": 2.2915140604630952, + "grad_norm": 0.13727814131520455, + "learning_rate": 1.6048892068663966e-05, + "loss": 2.7567, + "step": 36914 + }, + { + "epoch": 2.291576137562853, + "grad_norm": 0.14371610506207286, + "learning_rate": 1.604624087313529e-05, + "loss": 2.718, + "step": 36915 + }, + { + "epoch": 2.291638214662611, + "grad_norm": 0.14018406882149634, + "learning_rate": 1.6043589854751613e-05, + "loss": 2.7079, + "step": 36916 + }, + { + "epoch": 2.291700291762369, + "grad_norm": 0.1471244750272805, + "learning_rate": 1.6040939013526775e-05, + "loss": 2.7486, + "step": 36917 + }, + { + "epoch": 2.291762368862127, + "grad_norm": 0.1430106982200745, + "learning_rate": 1.603828834947458e-05, + "loss": 2.6773, + "step": 36918 + }, + { + "epoch": 2.291824445961885, + "grad_norm": 0.14368759542215637, + "learning_rate": 1.6035637862608898e-05, + "loss": 2.7087, + "step": 36919 + }, + { + "epoch": 2.2918865230616428, + "grad_norm": 0.1378356880891028, + "learning_rate": 1.603298755294354e-05, + "loss": 2.7237, + "step": 36920 + }, + { + "epoch": 2.2919486001614002, + "grad_norm": 0.15137228803347352, + "learning_rate": 1.603033742049233e-05, + "loss": 2.7235, + "step": 36921 + }, + { + "epoch": 2.2920106772611586, + "grad_norm": 0.14336528090283468, + "learning_rate": 1.60276874652691e-05, + "loss": 2.6914, + "step": 36922 + }, + { + "epoch": 2.292072754360916, + "grad_norm": 0.1355301621327225, + "learning_rate": 1.6025037687287654e-05, + "loss": 2.7355, + "step": 36923 + }, + { + "epoch": 2.292134831460674, + "grad_norm": 0.1611872725498093, + "learning_rate": 1.6022388086561846e-05, + "loss": 2.6651, + "step": 36924 + }, + { + "epoch": 2.292196908560432, + "grad_norm": 0.14112240850900679, + "learning_rate": 1.6019738663105488e-05, + "loss": 2.7392, + "step": 36925 + }, + { + "epoch": 2.29225898566019, + "grad_norm": 0.15156945895903437, + "learning_rate": 1.6017089416932408e-05, + "loss": 2.7667, + "step": 36926 + }, + { + "epoch": 2.2923210627599477, + "grad_norm": 0.14111978192519145, + "learning_rate": 1.60144403480564e-05, + "loss": 2.7028, + "step": 36927 + }, + { + "epoch": 2.2923831398597057, + "grad_norm": 0.13763923720665805, + "learning_rate": 1.6011791456491327e-05, + "loss": 2.6933, + "step": 36928 + }, + { + "epoch": 2.2924452169594636, + "grad_norm": 0.1393097525167817, + "learning_rate": 1.6009142742250988e-05, + "loss": 2.7604, + "step": 36929 + }, + { + "epoch": 2.2925072940592215, + "grad_norm": 0.15127693069515485, + "learning_rate": 1.60064942053492e-05, + "loss": 2.637, + "step": 36930 + }, + { + "epoch": 2.2925693711589794, + "grad_norm": 0.15624514214593482, + "learning_rate": 1.600384584579977e-05, + "loss": 2.8677, + "step": 36931 + }, + { + "epoch": 2.2926314482587373, + "grad_norm": 0.28789427249000504, + "learning_rate": 1.6001197663616545e-05, + "loss": 2.6851, + "step": 36932 + }, + { + "epoch": 2.2926935253584952, + "grad_norm": 0.13993324234675578, + "learning_rate": 1.599854965881333e-05, + "loss": 2.7071, + "step": 36933 + }, + { + "epoch": 2.292755602458253, + "grad_norm": 0.16992047188716675, + "learning_rate": 1.5995901831403925e-05, + "loss": 2.7183, + "step": 36934 + }, + { + "epoch": 2.292817679558011, + "grad_norm": 0.15110749147685162, + "learning_rate": 1.599325418140216e-05, + "loss": 2.6931, + "step": 36935 + }, + { + "epoch": 2.292879756657769, + "grad_norm": 0.14368879450936284, + "learning_rate": 1.599060670882183e-05, + "loss": 2.6994, + "step": 36936 + }, + { + "epoch": 2.292941833757527, + "grad_norm": 0.1519934956090632, + "learning_rate": 1.5987959413676778e-05, + "loss": 2.7446, + "step": 36937 + }, + { + "epoch": 2.293003910857285, + "grad_norm": 0.16289795532016157, + "learning_rate": 1.5985312295980797e-05, + "loss": 2.7506, + "step": 36938 + }, + { + "epoch": 2.2930659879570428, + "grad_norm": 0.1507882460023663, + "learning_rate": 1.5982665355747695e-05, + "loss": 2.7715, + "step": 36939 + }, + { + "epoch": 2.2931280650568007, + "grad_norm": 0.1418559978380175, + "learning_rate": 1.5980018592991275e-05, + "loss": 2.763, + "step": 36940 + }, + { + "epoch": 2.2931901421565586, + "grad_norm": 0.13977880328352693, + "learning_rate": 1.597737200772537e-05, + "loss": 2.6297, + "step": 36941 + }, + { + "epoch": 2.2932522192563165, + "grad_norm": 0.14403079353213838, + "learning_rate": 1.5974725599963773e-05, + "loss": 2.7186, + "step": 36942 + }, + { + "epoch": 2.2933142963560744, + "grad_norm": 0.14855734479454946, + "learning_rate": 1.5972079369720295e-05, + "loss": 2.7394, + "step": 36943 + }, + { + "epoch": 2.293376373455832, + "grad_norm": 0.14410693801032018, + "learning_rate": 1.5969433317008725e-05, + "loss": 2.736, + "step": 36944 + }, + { + "epoch": 2.2934384505555903, + "grad_norm": 0.13910037982098317, + "learning_rate": 1.5966787441842894e-05, + "loss": 2.7784, + "step": 36945 + }, + { + "epoch": 2.2935005276553477, + "grad_norm": 0.1814200076719307, + "learning_rate": 1.5964141744236582e-05, + "loss": 2.8111, + "step": 36946 + }, + { + "epoch": 2.2935626047551056, + "grad_norm": 0.13561076381783999, + "learning_rate": 1.596149622420362e-05, + "loss": 2.7362, + "step": 36947 + }, + { + "epoch": 2.2936246818548636, + "grad_norm": 0.2014783820495722, + "learning_rate": 1.59588508817578e-05, + "loss": 2.6726, + "step": 36948 + }, + { + "epoch": 2.2936867589546215, + "grad_norm": 0.13909827747231424, + "learning_rate": 1.595620571691291e-05, + "loss": 2.6264, + "step": 36949 + }, + { + "epoch": 2.2937488360543794, + "grad_norm": 0.13634695179965894, + "learning_rate": 1.595356072968275e-05, + "loss": 2.6573, + "step": 36950 + }, + { + "epoch": 2.2938109131541373, + "grad_norm": 0.1374689297359477, + "learning_rate": 1.5950915920081137e-05, + "loss": 2.6261, + "step": 36951 + }, + { + "epoch": 2.2938729902538952, + "grad_norm": 0.15635137845432964, + "learning_rate": 1.5948271288121867e-05, + "loss": 2.6404, + "step": 36952 + }, + { + "epoch": 2.293935067353653, + "grad_norm": 0.14811221042377457, + "learning_rate": 1.594562683381873e-05, + "loss": 2.6754, + "step": 36953 + }, + { + "epoch": 2.293997144453411, + "grad_norm": 0.141694143876124, + "learning_rate": 1.5942982557185514e-05, + "loss": 2.6969, + "step": 36954 + }, + { + "epoch": 2.294059221553169, + "grad_norm": 0.14675258876955766, + "learning_rate": 1.5940338458236044e-05, + "loss": 2.7498, + "step": 36955 + }, + { + "epoch": 2.294121298652927, + "grad_norm": 0.13965241886679047, + "learning_rate": 1.5937694536984088e-05, + "loss": 2.7902, + "step": 36956 + }, + { + "epoch": 2.294183375752685, + "grad_norm": 0.17976696910794987, + "learning_rate": 1.5935050793443452e-05, + "loss": 2.692, + "step": 36957 + }, + { + "epoch": 2.2942454528524427, + "grad_norm": 0.13707906382539373, + "learning_rate": 1.593240722762792e-05, + "loss": 2.7102, + "step": 36958 + }, + { + "epoch": 2.2943075299522007, + "grad_norm": 0.1455153776928801, + "learning_rate": 1.592976383955128e-05, + "loss": 2.6512, + "step": 36959 + }, + { + "epoch": 2.2943696070519586, + "grad_norm": 0.16073518299772252, + "learning_rate": 1.592712062922735e-05, + "loss": 2.641, + "step": 36960 + }, + { + "epoch": 2.2944316841517165, + "grad_norm": 0.13494218787142093, + "learning_rate": 1.59244775966699e-05, + "loss": 2.6889, + "step": 36961 + }, + { + "epoch": 2.2944937612514744, + "grad_norm": 0.15697111976007305, + "learning_rate": 1.5921834741892723e-05, + "loss": 2.6798, + "step": 36962 + }, + { + "epoch": 2.2945558383512323, + "grad_norm": 0.15643976727967557, + "learning_rate": 1.5919192064909588e-05, + "loss": 2.7489, + "step": 36963 + }, + { + "epoch": 2.2946179154509903, + "grad_norm": 0.16095062128298476, + "learning_rate": 1.5916549565734322e-05, + "loss": 2.6885, + "step": 36964 + }, + { + "epoch": 2.294679992550748, + "grad_norm": 0.1418896999179848, + "learning_rate": 1.5913907244380687e-05, + "loss": 2.6739, + "step": 36965 + }, + { + "epoch": 2.294742069650506, + "grad_norm": 0.14146308705072497, + "learning_rate": 1.5911265100862477e-05, + "loss": 2.7911, + "step": 36966 + }, + { + "epoch": 2.294804146750264, + "grad_norm": 0.13786487419948967, + "learning_rate": 1.590862313519345e-05, + "loss": 2.6426, + "step": 36967 + }, + { + "epoch": 2.294866223850022, + "grad_norm": 0.14505299910890335, + "learning_rate": 1.5905981347387434e-05, + "loss": 2.6379, + "step": 36968 + }, + { + "epoch": 2.2949283009497794, + "grad_norm": 0.13546646712424493, + "learning_rate": 1.590333973745819e-05, + "loss": 2.7338, + "step": 36969 + }, + { + "epoch": 2.2949903780495378, + "grad_norm": 0.17970004245272492, + "learning_rate": 1.5900698305419497e-05, + "loss": 2.6387, + "step": 36970 + }, + { + "epoch": 2.2950524551492952, + "grad_norm": 0.13730271827729024, + "learning_rate": 1.5898057051285143e-05, + "loss": 2.6761, + "step": 36971 + }, + { + "epoch": 2.295114532249053, + "grad_norm": 0.16407927359257163, + "learning_rate": 1.5895415975068884e-05, + "loss": 2.6834, + "step": 36972 + }, + { + "epoch": 2.295176609348811, + "grad_norm": 0.13818649181117237, + "learning_rate": 1.5892775076784532e-05, + "loss": 2.7063, + "step": 36973 + }, + { + "epoch": 2.295238686448569, + "grad_norm": 0.14977979465647487, + "learning_rate": 1.5890134356445856e-05, + "loss": 2.7356, + "step": 36974 + }, + { + "epoch": 2.295300763548327, + "grad_norm": 0.1399001074557236, + "learning_rate": 1.5887493814066632e-05, + "loss": 2.6708, + "step": 36975 + }, + { + "epoch": 2.295362840648085, + "grad_norm": 0.1511038550791764, + "learning_rate": 1.5884853449660612e-05, + "loss": 2.7161, + "step": 36976 + }, + { + "epoch": 2.2954249177478427, + "grad_norm": 0.14717964475367756, + "learning_rate": 1.588221326324161e-05, + "loss": 2.6654, + "step": 36977 + }, + { + "epoch": 2.2954869948476007, + "grad_norm": 0.14726257245165858, + "learning_rate": 1.5879573254823393e-05, + "loss": 2.7172, + "step": 36978 + }, + { + "epoch": 2.2955490719473586, + "grad_norm": 0.1379185618556558, + "learning_rate": 1.58769334244197e-05, + "loss": 2.723, + "step": 36979 + }, + { + "epoch": 2.2956111490471165, + "grad_norm": 0.14030182313546816, + "learning_rate": 1.587429377204435e-05, + "loss": 2.7864, + "step": 36980 + }, + { + "epoch": 2.2956732261468744, + "grad_norm": 0.14649730625296206, + "learning_rate": 1.5871654297711075e-05, + "loss": 2.8079, + "step": 36981 + }, + { + "epoch": 2.2957353032466323, + "grad_norm": 0.15816749359878748, + "learning_rate": 1.586901500143368e-05, + "loss": 2.7852, + "step": 36982 + }, + { + "epoch": 2.2957973803463902, + "grad_norm": 0.1417956436250883, + "learning_rate": 1.586637588322592e-05, + "loss": 2.7213, + "step": 36983 + }, + { + "epoch": 2.295859457446148, + "grad_norm": 0.15906192074016326, + "learning_rate": 1.5863736943101564e-05, + "loss": 2.7703, + "step": 36984 + }, + { + "epoch": 2.295921534545906, + "grad_norm": 0.13867687162611742, + "learning_rate": 1.586109818107438e-05, + "loss": 2.699, + "step": 36985 + }, + { + "epoch": 2.295983611645664, + "grad_norm": 0.1566394363510428, + "learning_rate": 1.5858459597158115e-05, + "loss": 2.7021, + "step": 36986 + }, + { + "epoch": 2.296045688745422, + "grad_norm": 0.13878928841836344, + "learning_rate": 1.585582119136657e-05, + "loss": 2.6939, + "step": 36987 + }, + { + "epoch": 2.29610776584518, + "grad_norm": 0.13703030102008637, + "learning_rate": 1.5853182963713504e-05, + "loss": 2.7356, + "step": 36988 + }, + { + "epoch": 2.2961698429449378, + "grad_norm": 0.1475393326898009, + "learning_rate": 1.5850544914212662e-05, + "loss": 2.6954, + "step": 36989 + }, + { + "epoch": 2.2962319200446957, + "grad_norm": 0.17868955932489947, + "learning_rate": 1.58479070428778e-05, + "loss": 2.732, + "step": 36990 + }, + { + "epoch": 2.2962939971444536, + "grad_norm": 0.1406932175265479, + "learning_rate": 1.5845269349722723e-05, + "loss": 2.6377, + "step": 36991 + }, + { + "epoch": 2.296356074244211, + "grad_norm": 0.14270574266862035, + "learning_rate": 1.5842631834761162e-05, + "loss": 2.7752, + "step": 36992 + }, + { + "epoch": 2.2964181513439694, + "grad_norm": 0.14304238335783342, + "learning_rate": 1.5839994498006883e-05, + "loss": 2.6657, + "step": 36993 + }, + { + "epoch": 2.296480228443727, + "grad_norm": 0.1481744543645733, + "learning_rate": 1.583735733947363e-05, + "loss": 2.7364, + "step": 36994 + }, + { + "epoch": 2.296542305543485, + "grad_norm": 0.14486605939480907, + "learning_rate": 1.583472035917519e-05, + "loss": 2.7125, + "step": 36995 + }, + { + "epoch": 2.2966043826432427, + "grad_norm": 0.14058476110773324, + "learning_rate": 1.583208355712531e-05, + "loss": 2.8004, + "step": 36996 + }, + { + "epoch": 2.2966664597430007, + "grad_norm": 0.15367083359575387, + "learning_rate": 1.5829446933337745e-05, + "loss": 2.7337, + "step": 36997 + }, + { + "epoch": 2.2967285368427586, + "grad_norm": 0.14252306253795527, + "learning_rate": 1.582681048782625e-05, + "loss": 2.6222, + "step": 36998 + }, + { + "epoch": 2.2967906139425165, + "grad_norm": 0.1453456615223056, + "learning_rate": 1.5824174220604567e-05, + "loss": 2.6095, + "step": 36999 + }, + { + "epoch": 2.2968526910422744, + "grad_norm": 0.15449537715199285, + "learning_rate": 1.5821538131686476e-05, + "loss": 2.7567, + "step": 37000 + }, + { + "epoch": 2.2969147681420323, + "grad_norm": 0.14029955292336163, + "learning_rate": 1.5818902221085717e-05, + "loss": 2.764, + "step": 37001 + }, + { + "epoch": 2.2969768452417902, + "grad_norm": 0.15635217068008317, + "learning_rate": 1.5816266488816046e-05, + "loss": 2.7067, + "step": 37002 + }, + { + "epoch": 2.297038922341548, + "grad_norm": 0.15548847710710065, + "learning_rate": 1.5813630934891195e-05, + "loss": 2.7168, + "step": 37003 + }, + { + "epoch": 2.297100999441306, + "grad_norm": 0.14477354437694584, + "learning_rate": 1.5810995559324943e-05, + "loss": 2.6831, + "step": 37004 + }, + { + "epoch": 2.297163076541064, + "grad_norm": 0.1441213879244297, + "learning_rate": 1.580836036213103e-05, + "loss": 2.6863, + "step": 37005 + }, + { + "epoch": 2.297225153640822, + "grad_norm": 0.1385727313057963, + "learning_rate": 1.58057253433232e-05, + "loss": 2.761, + "step": 37006 + }, + { + "epoch": 2.29728723074058, + "grad_norm": 0.1452219221882186, + "learning_rate": 1.58030905029152e-05, + "loss": 2.7508, + "step": 37007 + }, + { + "epoch": 2.2973493078403378, + "grad_norm": 0.14031880873237582, + "learning_rate": 1.580045584092077e-05, + "loss": 2.6526, + "step": 37008 + }, + { + "epoch": 2.2974113849400957, + "grad_norm": 0.1376261232925329, + "learning_rate": 1.579782135735367e-05, + "loss": 2.5929, + "step": 37009 + }, + { + "epoch": 2.2974734620398536, + "grad_norm": 0.14478413264366763, + "learning_rate": 1.5795187052227644e-05, + "loss": 2.6773, + "step": 37010 + }, + { + "epoch": 2.2975355391396115, + "grad_norm": 0.14282081798895752, + "learning_rate": 1.5792552925556432e-05, + "loss": 2.8159, + "step": 37011 + }, + { + "epoch": 2.2975976162393694, + "grad_norm": 0.1477803503502388, + "learning_rate": 1.5789918977353756e-05, + "loss": 2.7244, + "step": 37012 + }, + { + "epoch": 2.2976596933391273, + "grad_norm": 0.1461305031470315, + "learning_rate": 1.578728520763338e-05, + "loss": 2.6758, + "step": 37013 + }, + { + "epoch": 2.2977217704388853, + "grad_norm": 0.15604548779275235, + "learning_rate": 1.5784651616409058e-05, + "loss": 2.748, + "step": 37014 + }, + { + "epoch": 2.297783847538643, + "grad_norm": 0.15602486006672872, + "learning_rate": 1.5782018203694515e-05, + "loss": 2.7325, + "step": 37015 + }, + { + "epoch": 2.297845924638401, + "grad_norm": 0.1544285164381135, + "learning_rate": 1.5779384969503493e-05, + "loss": 2.6784, + "step": 37016 + }, + { + "epoch": 2.2979080017381586, + "grad_norm": 0.1441320822543889, + "learning_rate": 1.5776751913849702e-05, + "loss": 2.7055, + "step": 37017 + }, + { + "epoch": 2.297970078837917, + "grad_norm": 0.14954741143491215, + "learning_rate": 1.5774119036746926e-05, + "loss": 2.7004, + "step": 37018 + }, + { + "epoch": 2.2980321559376744, + "grad_norm": 0.16413441285402616, + "learning_rate": 1.577148633820888e-05, + "loss": 2.6342, + "step": 37019 + }, + { + "epoch": 2.2980942330374323, + "grad_norm": 0.14995510030337608, + "learning_rate": 1.5768853818249296e-05, + "loss": 2.7483, + "step": 37020 + }, + { + "epoch": 2.2981563101371902, + "grad_norm": 0.14334974126642758, + "learning_rate": 1.5766221476881917e-05, + "loss": 2.7989, + "step": 37021 + }, + { + "epoch": 2.298218387236948, + "grad_norm": 0.1507080802379677, + "learning_rate": 1.5763589314120447e-05, + "loss": 2.6837, + "step": 37022 + }, + { + "epoch": 2.298280464336706, + "grad_norm": 0.15112231351461863, + "learning_rate": 1.5760957329978665e-05, + "loss": 2.7695, + "step": 37023 + }, + { + "epoch": 2.298342541436464, + "grad_norm": 0.16260562338798662, + "learning_rate": 1.575832552447028e-05, + "loss": 2.5906, + "step": 37024 + }, + { + "epoch": 2.298404618536222, + "grad_norm": 0.16625564278799712, + "learning_rate": 1.575569389760902e-05, + "loss": 2.7004, + "step": 37025 + }, + { + "epoch": 2.29846669563598, + "grad_norm": 0.1396138475168115, + "learning_rate": 1.5753062449408607e-05, + "loss": 2.7268, + "step": 37026 + }, + { + "epoch": 2.2985287727357377, + "grad_norm": 0.1494092691188674, + "learning_rate": 1.575043117988279e-05, + "loss": 2.6891, + "step": 37027 + }, + { + "epoch": 2.2985908498354957, + "grad_norm": 0.17593047377851395, + "learning_rate": 1.5747800089045296e-05, + "loss": 2.704, + "step": 37028 + }, + { + "epoch": 2.2986529269352536, + "grad_norm": 0.14350227774621668, + "learning_rate": 1.5745169176909836e-05, + "loss": 2.7128, + "step": 37029 + }, + { + "epoch": 2.2987150040350115, + "grad_norm": 0.13764691686004107, + "learning_rate": 1.5742538443490135e-05, + "loss": 2.6467, + "step": 37030 + }, + { + "epoch": 2.2987770811347694, + "grad_norm": 0.1399071704117639, + "learning_rate": 1.5739907888799936e-05, + "loss": 2.6517, + "step": 37031 + }, + { + "epoch": 2.2988391582345273, + "grad_norm": 0.14401669666768555, + "learning_rate": 1.573727751285296e-05, + "loss": 2.6566, + "step": 37032 + }, + { + "epoch": 2.2989012353342853, + "grad_norm": 0.14273226257748794, + "learning_rate": 1.5734647315662927e-05, + "loss": 2.7166, + "step": 37033 + }, + { + "epoch": 2.298963312434043, + "grad_norm": 0.16619909470679708, + "learning_rate": 1.573201729724355e-05, + "loss": 2.8019, + "step": 37034 + }, + { + "epoch": 2.299025389533801, + "grad_norm": 0.14174839825246624, + "learning_rate": 1.5729387457608547e-05, + "loss": 2.6646, + "step": 37035 + }, + { + "epoch": 2.299087466633559, + "grad_norm": 0.16323097997468453, + "learning_rate": 1.5726757796771665e-05, + "loss": 2.7513, + "step": 37036 + }, + { + "epoch": 2.299149543733317, + "grad_norm": 0.14119274713425947, + "learning_rate": 1.5724128314746605e-05, + "loss": 2.7447, + "step": 37037 + }, + { + "epoch": 2.299211620833075, + "grad_norm": 0.14866144453415125, + "learning_rate": 1.5721499011547088e-05, + "loss": 2.712, + "step": 37038 + }, + { + "epoch": 2.2992736979328328, + "grad_norm": 0.13682679947579895, + "learning_rate": 1.5718869887186817e-05, + "loss": 2.7187, + "step": 37039 + }, + { + "epoch": 2.2993357750325902, + "grad_norm": 0.14361833601623458, + "learning_rate": 1.5716240941679538e-05, + "loss": 2.6867, + "step": 37040 + }, + { + "epoch": 2.2993978521323486, + "grad_norm": 0.1397054795827389, + "learning_rate": 1.5713612175038956e-05, + "loss": 2.8074, + "step": 37041 + }, + { + "epoch": 2.299459929232106, + "grad_norm": 0.13588358012510984, + "learning_rate": 1.5710983587278784e-05, + "loss": 2.6773, + "step": 37042 + }, + { + "epoch": 2.299522006331864, + "grad_norm": 0.14242198311094867, + "learning_rate": 1.5708355178412732e-05, + "loss": 2.755, + "step": 37043 + }, + { + "epoch": 2.299584083431622, + "grad_norm": 0.14709889259611997, + "learning_rate": 1.5705726948454496e-05, + "loss": 2.6806, + "step": 37044 + }, + { + "epoch": 2.29964616053138, + "grad_norm": 0.1467057780136533, + "learning_rate": 1.570309889741781e-05, + "loss": 2.6907, + "step": 37045 + }, + { + "epoch": 2.2997082376311377, + "grad_norm": 0.13946326857361294, + "learning_rate": 1.5700471025316406e-05, + "loss": 2.7459, + "step": 37046 + }, + { + "epoch": 2.2997703147308957, + "grad_norm": 0.1406011163774736, + "learning_rate": 1.5697843332163965e-05, + "loss": 2.7189, + "step": 37047 + }, + { + "epoch": 2.2998323918306536, + "grad_norm": 0.14429750743907055, + "learning_rate": 1.56952158179742e-05, + "loss": 2.6807, + "step": 37048 + }, + { + "epoch": 2.2998944689304115, + "grad_norm": 0.13329584860146618, + "learning_rate": 1.569258848276081e-05, + "loss": 2.6895, + "step": 37049 + }, + { + "epoch": 2.2999565460301694, + "grad_norm": 0.1423533444463435, + "learning_rate": 1.5689961326537527e-05, + "loss": 2.7157, + "step": 37050 + }, + { + "epoch": 2.3000186231299273, + "grad_norm": 0.15128660210367648, + "learning_rate": 1.568733434931805e-05, + "loss": 2.6973, + "step": 37051 + }, + { + "epoch": 2.3000807002296852, + "grad_norm": 0.137568164856985, + "learning_rate": 1.5684707551116073e-05, + "loss": 2.6208, + "step": 37052 + }, + { + "epoch": 2.300142777329443, + "grad_norm": 0.13902537193142264, + "learning_rate": 1.56820809319453e-05, + "loss": 2.7289, + "step": 37053 + }, + { + "epoch": 2.300204854429201, + "grad_norm": 0.13818149767253077, + "learning_rate": 1.5679454491819444e-05, + "loss": 2.6298, + "step": 37054 + }, + { + "epoch": 2.300266931528959, + "grad_norm": 0.1352553729036321, + "learning_rate": 1.5676828230752217e-05, + "loss": 2.6722, + "step": 37055 + }, + { + "epoch": 2.300329008628717, + "grad_norm": 0.1399627762401424, + "learning_rate": 1.5674202148757304e-05, + "loss": 2.7702, + "step": 37056 + }, + { + "epoch": 2.300391085728475, + "grad_norm": 0.13922091044623805, + "learning_rate": 1.567157624584841e-05, + "loss": 2.7068, + "step": 37057 + }, + { + "epoch": 2.3004531628282328, + "grad_norm": 0.14763667516787213, + "learning_rate": 1.5668950522039228e-05, + "loss": 2.825, + "step": 37058 + }, + { + "epoch": 2.3005152399279907, + "grad_norm": 0.13692365006033344, + "learning_rate": 1.5666324977343476e-05, + "loss": 2.6533, + "step": 37059 + }, + { + "epoch": 2.3005773170277486, + "grad_norm": 0.17029317212456338, + "learning_rate": 1.5663699611774846e-05, + "loss": 2.7345, + "step": 37060 + }, + { + "epoch": 2.3006393941275065, + "grad_norm": 0.14599655942300382, + "learning_rate": 1.566107442534702e-05, + "loss": 2.7604, + "step": 37061 + }, + { + "epoch": 2.3007014712272644, + "grad_norm": 0.14538428804709724, + "learning_rate": 1.56584494180737e-05, + "loss": 2.737, + "step": 37062 + }, + { + "epoch": 2.3007635483270223, + "grad_norm": 0.1467647856760733, + "learning_rate": 1.5655824589968598e-05, + "loss": 2.7393, + "step": 37063 + }, + { + "epoch": 2.3008256254267803, + "grad_norm": 0.14264172611695353, + "learning_rate": 1.5653199941045393e-05, + "loss": 2.7341, + "step": 37064 + }, + { + "epoch": 2.3008877025265377, + "grad_norm": 0.143128560596637, + "learning_rate": 1.5650575471317786e-05, + "loss": 2.7998, + "step": 37065 + }, + { + "epoch": 2.300949779626296, + "grad_norm": 0.1429764070701265, + "learning_rate": 1.5647951180799446e-05, + "loss": 2.7798, + "step": 37066 + }, + { + "epoch": 2.3010118567260536, + "grad_norm": 0.13356364535615797, + "learning_rate": 1.5645327069504106e-05, + "loss": 2.6883, + "step": 37067 + }, + { + "epoch": 2.3010739338258115, + "grad_norm": 0.1390991411799123, + "learning_rate": 1.564270313744543e-05, + "loss": 2.7432, + "step": 37068 + }, + { + "epoch": 2.3011360109255694, + "grad_norm": 0.14294855777750803, + "learning_rate": 1.5640079384637115e-05, + "loss": 2.7164, + "step": 37069 + }, + { + "epoch": 2.3011980880253273, + "grad_norm": 0.1510750998019448, + "learning_rate": 1.5637455811092842e-05, + "loss": 2.8064, + "step": 37070 + }, + { + "epoch": 2.3012601651250852, + "grad_norm": 0.15742924462383387, + "learning_rate": 1.5634832416826288e-05, + "loss": 2.7114, + "step": 37071 + }, + { + "epoch": 2.301322242224843, + "grad_norm": 0.1466290331868751, + "learning_rate": 1.5632209201851177e-05, + "loss": 2.6789, + "step": 37072 + }, + { + "epoch": 2.301384319324601, + "grad_norm": 0.1569106856161537, + "learning_rate": 1.562958616618117e-05, + "loss": 2.7436, + "step": 37073 + }, + { + "epoch": 2.301446396424359, + "grad_norm": 0.14171013451562245, + "learning_rate": 1.5626963309829954e-05, + "loss": 2.6645, + "step": 37074 + }, + { + "epoch": 2.301508473524117, + "grad_norm": 0.14851621372842663, + "learning_rate": 1.5624340632811208e-05, + "loss": 2.7128, + "step": 37075 + }, + { + "epoch": 2.301570550623875, + "grad_norm": 0.13872262536462288, + "learning_rate": 1.562171813513863e-05, + "loss": 2.6892, + "step": 37076 + }, + { + "epoch": 2.3016326277236328, + "grad_norm": 0.147681809791564, + "learning_rate": 1.561909581682588e-05, + "loss": 2.7257, + "step": 37077 + }, + { + "epoch": 2.3016947048233907, + "grad_norm": 0.1383974758764027, + "learning_rate": 1.5616473677886666e-05, + "loss": 2.7506, + "step": 37078 + }, + { + "epoch": 2.3017567819231486, + "grad_norm": 0.1368520971955132, + "learning_rate": 1.561385171833466e-05, + "loss": 2.7008, + "step": 37079 + }, + { + "epoch": 2.3018188590229065, + "grad_norm": 0.14192942448239573, + "learning_rate": 1.5611229938183518e-05, + "loss": 2.7099, + "step": 37080 + }, + { + "epoch": 2.3018809361226644, + "grad_norm": 0.13593516594428595, + "learning_rate": 1.5608608337446956e-05, + "loss": 2.6413, + "step": 37081 + }, + { + "epoch": 2.3019430132224223, + "grad_norm": 0.1406529186933181, + "learning_rate": 1.5605986916138636e-05, + "loss": 2.6887, + "step": 37082 + }, + { + "epoch": 2.3020050903221803, + "grad_norm": 0.1498749922361841, + "learning_rate": 1.5603365674272224e-05, + "loss": 2.6811, + "step": 37083 + }, + { + "epoch": 2.302067167421938, + "grad_norm": 0.1391198417760407, + "learning_rate": 1.5600744611861414e-05, + "loss": 2.7158, + "step": 37084 + }, + { + "epoch": 2.302129244521696, + "grad_norm": 0.13692877169405274, + "learning_rate": 1.5598123728919846e-05, + "loss": 2.6991, + "step": 37085 + }, + { + "epoch": 2.302191321621454, + "grad_norm": 0.1382208527925332, + "learning_rate": 1.5595503025461233e-05, + "loss": 2.7514, + "step": 37086 + }, + { + "epoch": 2.302253398721212, + "grad_norm": 0.14600458407513242, + "learning_rate": 1.5592882501499235e-05, + "loss": 2.7593, + "step": 37087 + }, + { + "epoch": 2.3023154758209694, + "grad_norm": 0.15171063130359266, + "learning_rate": 1.5590262157047526e-05, + "loss": 2.6971, + "step": 37088 + }, + { + "epoch": 2.3023775529207278, + "grad_norm": 0.1433003696551683, + "learning_rate": 1.558764199211975e-05, + "loss": 2.752, + "step": 37089 + }, + { + "epoch": 2.3024396300204852, + "grad_norm": 0.138639869488829, + "learning_rate": 1.5585022006729617e-05, + "loss": 2.7497, + "step": 37090 + }, + { + "epoch": 2.302501707120243, + "grad_norm": 0.1372890837961379, + "learning_rate": 1.5582402200890778e-05, + "loss": 2.6337, + "step": 37091 + }, + { + "epoch": 2.302563784220001, + "grad_norm": 0.14024042817761662, + "learning_rate": 1.557978257461691e-05, + "loss": 2.7835, + "step": 37092 + }, + { + "epoch": 2.302625861319759, + "grad_norm": 0.13529657730526196, + "learning_rate": 1.5577163127921663e-05, + "loss": 2.6222, + "step": 37093 + }, + { + "epoch": 2.302687938419517, + "grad_norm": 0.1420654197230043, + "learning_rate": 1.5574543860818695e-05, + "loss": 2.623, + "step": 37094 + }, + { + "epoch": 2.302750015519275, + "grad_norm": 0.15493122513141577, + "learning_rate": 1.5571924773321712e-05, + "loss": 2.7011, + "step": 37095 + }, + { + "epoch": 2.3028120926190327, + "grad_norm": 0.14035550759321244, + "learning_rate": 1.556930586544435e-05, + "loss": 2.7608, + "step": 37096 + }, + { + "epoch": 2.3028741697187907, + "grad_norm": 0.1401499632315293, + "learning_rate": 1.556668713720028e-05, + "loss": 2.6903, + "step": 37097 + }, + { + "epoch": 2.3029362468185486, + "grad_norm": 0.14018863952265423, + "learning_rate": 1.556406858860315e-05, + "loss": 2.7231, + "step": 37098 + }, + { + "epoch": 2.3029983239183065, + "grad_norm": 0.1793984181705161, + "learning_rate": 1.556145021966664e-05, + "loss": 2.6415, + "step": 37099 + }, + { + "epoch": 2.3030604010180644, + "grad_norm": 0.15056636376330218, + "learning_rate": 1.5558832030404412e-05, + "loss": 2.778, + "step": 37100 + }, + { + "epoch": 2.3031224781178223, + "grad_norm": 0.1413948984489918, + "learning_rate": 1.555621402083012e-05, + "loss": 2.7066, + "step": 37101 + }, + { + "epoch": 2.3031845552175803, + "grad_norm": 0.14824017157710548, + "learning_rate": 1.5553596190957397e-05, + "loss": 2.6677, + "step": 37102 + }, + { + "epoch": 2.303246632317338, + "grad_norm": 0.13943774328163566, + "learning_rate": 1.5550978540799943e-05, + "loss": 2.6896, + "step": 37103 + }, + { + "epoch": 2.303308709417096, + "grad_norm": 0.13477927981872995, + "learning_rate": 1.5548361070371404e-05, + "loss": 2.6304, + "step": 37104 + }, + { + "epoch": 2.303370786516854, + "grad_norm": 0.14703592731807433, + "learning_rate": 1.5545743779685422e-05, + "loss": 2.6771, + "step": 37105 + }, + { + "epoch": 2.303432863616612, + "grad_norm": 0.15538838168673458, + "learning_rate": 1.5543126668755658e-05, + "loss": 2.7534, + "step": 37106 + }, + { + "epoch": 2.30349494071637, + "grad_norm": 0.13991316798609701, + "learning_rate": 1.5540509737595754e-05, + "loss": 2.69, + "step": 37107 + }, + { + "epoch": 2.3035570178161278, + "grad_norm": 0.1439047586775886, + "learning_rate": 1.5537892986219394e-05, + "loss": 2.7647, + "step": 37108 + }, + { + "epoch": 2.3036190949158857, + "grad_norm": 0.14533362747749184, + "learning_rate": 1.553527641464021e-05, + "loss": 2.7167, + "step": 37109 + }, + { + "epoch": 2.3036811720156436, + "grad_norm": 0.17635314416579564, + "learning_rate": 1.5532660022871836e-05, + "loss": 2.657, + "step": 37110 + }, + { + "epoch": 2.3037432491154015, + "grad_norm": 0.14578170728253997, + "learning_rate": 1.5530043810927964e-05, + "loss": 2.7264, + "step": 37111 + }, + { + "epoch": 2.3038053262151594, + "grad_norm": 0.14867427413055076, + "learning_rate": 1.55274277788222e-05, + "loss": 2.7488, + "step": 37112 + }, + { + "epoch": 2.303867403314917, + "grad_norm": 0.13768246773403747, + "learning_rate": 1.5524811926568234e-05, + "loss": 2.778, + "step": 37113 + }, + { + "epoch": 2.3039294804146753, + "grad_norm": 0.18069836058975988, + "learning_rate": 1.5522196254179687e-05, + "loss": 2.6726, + "step": 37114 + }, + { + "epoch": 2.3039915575144327, + "grad_norm": 0.1847312266163702, + "learning_rate": 1.5519580761670215e-05, + "loss": 2.718, + "step": 37115 + }, + { + "epoch": 2.3040536346141907, + "grad_norm": 0.1436875889405767, + "learning_rate": 1.551696544905345e-05, + "loss": 2.6991, + "step": 37116 + }, + { + "epoch": 2.3041157117139486, + "grad_norm": 0.14390797512540496, + "learning_rate": 1.5514350316343057e-05, + "loss": 2.7527, + "step": 37117 + }, + { + "epoch": 2.3041777888137065, + "grad_norm": 0.15251865494669672, + "learning_rate": 1.5511735363552672e-05, + "loss": 2.7168, + "step": 37118 + }, + { + "epoch": 2.3042398659134644, + "grad_norm": 0.1609726890499688, + "learning_rate": 1.5509120590695937e-05, + "loss": 2.7106, + "step": 37119 + }, + { + "epoch": 2.3043019430132223, + "grad_norm": 0.15376912170611448, + "learning_rate": 1.550650599778649e-05, + "loss": 2.7465, + "step": 37120 + }, + { + "epoch": 2.3043640201129802, + "grad_norm": 0.15822598209801098, + "learning_rate": 1.5503891584837965e-05, + "loss": 2.768, + "step": 37121 + }, + { + "epoch": 2.304426097212738, + "grad_norm": 0.14553438494457166, + "learning_rate": 1.550127735186402e-05, + "loss": 2.7134, + "step": 37122 + }, + { + "epoch": 2.304488174312496, + "grad_norm": 0.14289060734477071, + "learning_rate": 1.549866329887829e-05, + "loss": 2.7106, + "step": 37123 + }, + { + "epoch": 2.304550251412254, + "grad_norm": 0.15189987097692942, + "learning_rate": 1.549604942589441e-05, + "loss": 2.6746, + "step": 37124 + }, + { + "epoch": 2.304612328512012, + "grad_norm": 0.13790788542484225, + "learning_rate": 1.5493435732925997e-05, + "loss": 2.6678, + "step": 37125 + }, + { + "epoch": 2.30467440561177, + "grad_norm": 0.1583638552110524, + "learning_rate": 1.549082221998673e-05, + "loss": 2.7987, + "step": 37126 + }, + { + "epoch": 2.3047364827115278, + "grad_norm": 0.14159209121072586, + "learning_rate": 1.548820888709021e-05, + "loss": 2.7433, + "step": 37127 + }, + { + "epoch": 2.3047985598112857, + "grad_norm": 0.1394490238729995, + "learning_rate": 1.5485595734250086e-05, + "loss": 2.7306, + "step": 37128 + }, + { + "epoch": 2.3048606369110436, + "grad_norm": 0.14282636718077277, + "learning_rate": 1.5482982761479992e-05, + "loss": 2.6702, + "step": 37129 + }, + { + "epoch": 2.3049227140108015, + "grad_norm": 0.13850764502804014, + "learning_rate": 1.5480369968793534e-05, + "loss": 2.6847, + "step": 37130 + }, + { + "epoch": 2.3049847911105594, + "grad_norm": 0.13710732417732893, + "learning_rate": 1.5477757356204387e-05, + "loss": 2.757, + "step": 37131 + }, + { + "epoch": 2.3050468682103173, + "grad_norm": 0.15667171193577853, + "learning_rate": 1.547514492372616e-05, + "loss": 2.7368, + "step": 37132 + }, + { + "epoch": 2.3051089453100753, + "grad_norm": 0.1600847026298322, + "learning_rate": 1.5472532671372482e-05, + "loss": 2.7559, + "step": 37133 + }, + { + "epoch": 2.305171022409833, + "grad_norm": 0.14457381111535492, + "learning_rate": 1.546992059915697e-05, + "loss": 2.6513, + "step": 37134 + }, + { + "epoch": 2.305233099509591, + "grad_norm": 0.1472004680529408, + "learning_rate": 1.5467308707093274e-05, + "loss": 2.6845, + "step": 37135 + }, + { + "epoch": 2.3052951766093486, + "grad_norm": 0.15967462391647036, + "learning_rate": 1.5464696995195016e-05, + "loss": 2.7016, + "step": 37136 + }, + { + "epoch": 2.305357253709107, + "grad_norm": 0.14742699289257358, + "learning_rate": 1.5462085463475817e-05, + "loss": 2.7731, + "step": 37137 + }, + { + "epoch": 2.3054193308088644, + "grad_norm": 0.14185568270482668, + "learning_rate": 1.5459474111949284e-05, + "loss": 2.7161, + "step": 37138 + }, + { + "epoch": 2.3054814079086223, + "grad_norm": 0.1465573574091185, + "learning_rate": 1.545686294062908e-05, + "loss": 2.7597, + "step": 37139 + }, + { + "epoch": 2.3055434850083802, + "grad_norm": 0.14177729127228414, + "learning_rate": 1.5454251949528804e-05, + "loss": 2.6509, + "step": 37140 + }, + { + "epoch": 2.305605562108138, + "grad_norm": 0.15215944487844377, + "learning_rate": 1.545164113866208e-05, + "loss": 2.6749, + "step": 37141 + }, + { + "epoch": 2.305667639207896, + "grad_norm": 0.13785539614718295, + "learning_rate": 1.5449030508042538e-05, + "loss": 2.7503, + "step": 37142 + }, + { + "epoch": 2.305729716307654, + "grad_norm": 0.17271977050166423, + "learning_rate": 1.544642005768377e-05, + "loss": 2.7458, + "step": 37143 + }, + { + "epoch": 2.305791793407412, + "grad_norm": 0.1491222274127629, + "learning_rate": 1.5443809787599418e-05, + "loss": 2.6653, + "step": 37144 + }, + { + "epoch": 2.30585387050717, + "grad_norm": 0.13877038411811898, + "learning_rate": 1.5441199697803117e-05, + "loss": 2.779, + "step": 37145 + }, + { + "epoch": 2.3059159476069278, + "grad_norm": 0.15784159901946476, + "learning_rate": 1.5438589788308466e-05, + "loss": 2.6232, + "step": 37146 + }, + { + "epoch": 2.3059780247066857, + "grad_norm": 0.15774255987341337, + "learning_rate": 1.543598005912908e-05, + "loss": 2.7035, + "step": 37147 + }, + { + "epoch": 2.3060401018064436, + "grad_norm": 0.13885728771263062, + "learning_rate": 1.543337051027856e-05, + "loss": 2.7416, + "step": 37148 + }, + { + "epoch": 2.3061021789062015, + "grad_norm": 0.13809491064978482, + "learning_rate": 1.5430761141770555e-05, + "loss": 2.7285, + "step": 37149 + }, + { + "epoch": 2.3061642560059594, + "grad_norm": 0.15549806625154308, + "learning_rate": 1.5428151953618662e-05, + "loss": 2.7472, + "step": 37150 + }, + { + "epoch": 2.3062263331057173, + "grad_norm": 0.1463453014127814, + "learning_rate": 1.542554294583649e-05, + "loss": 2.8493, + "step": 37151 + }, + { + "epoch": 2.3062884102054753, + "grad_norm": 0.14108455589982963, + "learning_rate": 1.5422934118437643e-05, + "loss": 2.6914, + "step": 37152 + }, + { + "epoch": 2.306350487305233, + "grad_norm": 0.16585152874758965, + "learning_rate": 1.5420325471435754e-05, + "loss": 2.6685, + "step": 37153 + }, + { + "epoch": 2.306412564404991, + "grad_norm": 0.14282475004725168, + "learning_rate": 1.541771700484442e-05, + "loss": 2.6926, + "step": 37154 + }, + { + "epoch": 2.306474641504749, + "grad_norm": 0.13894445868941097, + "learning_rate": 1.5415108718677263e-05, + "loss": 2.6392, + "step": 37155 + }, + { + "epoch": 2.306536718604507, + "grad_norm": 0.1413975067491637, + "learning_rate": 1.5412500612947866e-05, + "loss": 2.6971, + "step": 37156 + }, + { + "epoch": 2.306598795704265, + "grad_norm": 0.13991654941300408, + "learning_rate": 1.5409892687669842e-05, + "loss": 2.7547, + "step": 37157 + }, + { + "epoch": 2.3066608728040228, + "grad_norm": 0.15635517871875632, + "learning_rate": 1.5407284942856816e-05, + "loss": 2.6814, + "step": 37158 + }, + { + "epoch": 2.3067229499037807, + "grad_norm": 0.14407739389554883, + "learning_rate": 1.5404677378522387e-05, + "loss": 2.727, + "step": 37159 + }, + { + "epoch": 2.3067850270035386, + "grad_norm": 0.148740578230828, + "learning_rate": 1.5402069994680153e-05, + "loss": 2.695, + "step": 37160 + }, + { + "epoch": 2.306847104103296, + "grad_norm": 0.13755543685815566, + "learning_rate": 1.53994627913437e-05, + "loss": 2.7643, + "step": 37161 + }, + { + "epoch": 2.3069091812030544, + "grad_norm": 0.15297717676146055, + "learning_rate": 1.5396855768526663e-05, + "loss": 2.7565, + "step": 37162 + }, + { + "epoch": 2.306971258302812, + "grad_norm": 0.13740273091275912, + "learning_rate": 1.5394248926242634e-05, + "loss": 2.6566, + "step": 37163 + }, + { + "epoch": 2.30703333540257, + "grad_norm": 0.13897273509713348, + "learning_rate": 1.5391642264505206e-05, + "loss": 2.7061, + "step": 37164 + }, + { + "epoch": 2.3070954125023277, + "grad_norm": 0.16849385840305175, + "learning_rate": 1.5389035783327972e-05, + "loss": 2.6905, + "step": 37165 + }, + { + "epoch": 2.3071574896020857, + "grad_norm": 0.16123242622541192, + "learning_rate": 1.5386429482724553e-05, + "loss": 2.7154, + "step": 37166 + }, + { + "epoch": 2.3072195667018436, + "grad_norm": 0.16624624278548278, + "learning_rate": 1.5383823362708534e-05, + "loss": 2.6287, + "step": 37167 + }, + { + "epoch": 2.3072816438016015, + "grad_norm": 0.14117078038663666, + "learning_rate": 1.538121742329351e-05, + "loss": 2.7697, + "step": 37168 + }, + { + "epoch": 2.3073437209013594, + "grad_norm": 0.1476346817612875, + "learning_rate": 1.5378611664493077e-05, + "loss": 2.7274, + "step": 37169 + }, + { + "epoch": 2.3074057980011173, + "grad_norm": 0.1475105391693289, + "learning_rate": 1.5376006086320822e-05, + "loss": 2.77, + "step": 37170 + }, + { + "epoch": 2.3074678751008753, + "grad_norm": 0.13750599636744398, + "learning_rate": 1.5373400688790358e-05, + "loss": 2.7507, + "step": 37171 + }, + { + "epoch": 2.307529952200633, + "grad_norm": 0.14634924059572435, + "learning_rate": 1.5370795471915274e-05, + "loss": 2.7497, + "step": 37172 + }, + { + "epoch": 2.307592029300391, + "grad_norm": 0.17217464228531007, + "learning_rate": 1.5368190435709153e-05, + "loss": 2.7505, + "step": 37173 + }, + { + "epoch": 2.307654106400149, + "grad_norm": 0.13627128263148763, + "learning_rate": 1.5365585580185578e-05, + "loss": 2.566, + "step": 37174 + }, + { + "epoch": 2.307716183499907, + "grad_norm": 0.15756991905478385, + "learning_rate": 1.536298090535816e-05, + "loss": 2.6986, + "step": 37175 + }, + { + "epoch": 2.307778260599665, + "grad_norm": 0.14205022305374063, + "learning_rate": 1.5360376411240466e-05, + "loss": 2.7913, + "step": 37176 + }, + { + "epoch": 2.3078403376994228, + "grad_norm": 0.1404782430137665, + "learning_rate": 1.535777209784612e-05, + "loss": 2.7946, + "step": 37177 + }, + { + "epoch": 2.3079024147991807, + "grad_norm": 0.14353994696057124, + "learning_rate": 1.5355167965188678e-05, + "loss": 2.7197, + "step": 37178 + }, + { + "epoch": 2.3079644918989386, + "grad_norm": 0.1544461735177458, + "learning_rate": 1.5352564013281735e-05, + "loss": 2.7272, + "step": 37179 + }, + { + "epoch": 2.3080265689986965, + "grad_norm": 0.15885658470773215, + "learning_rate": 1.5349960242138867e-05, + "loss": 2.6241, + "step": 37180 + }, + { + "epoch": 2.3080886460984544, + "grad_norm": 0.1742096902860894, + "learning_rate": 1.5347356651773677e-05, + "loss": 2.7261, + "step": 37181 + }, + { + "epoch": 2.3081507231982124, + "grad_norm": 0.1375419628527639, + "learning_rate": 1.5344753242199744e-05, + "loss": 2.694, + "step": 37182 + }, + { + "epoch": 2.3082128002979703, + "grad_norm": 0.13752880580397012, + "learning_rate": 1.5342150013430646e-05, + "loss": 2.7203, + "step": 37183 + }, + { + "epoch": 2.3082748773977277, + "grad_norm": 0.14053999132789594, + "learning_rate": 1.5339546965479944e-05, + "loss": 2.7516, + "step": 37184 + }, + { + "epoch": 2.308336954497486, + "grad_norm": 0.14203677546472387, + "learning_rate": 1.533694409836126e-05, + "loss": 2.7414, + "step": 37185 + }, + { + "epoch": 2.3083990315972436, + "grad_norm": 0.1368413783937452, + "learning_rate": 1.5334341412088155e-05, + "loss": 2.7255, + "step": 37186 + }, + { + "epoch": 2.3084611086970015, + "grad_norm": 0.1896777149077286, + "learning_rate": 1.5331738906674202e-05, + "loss": 2.6027, + "step": 37187 + }, + { + "epoch": 2.3085231857967594, + "grad_norm": 0.1401522890273336, + "learning_rate": 1.5329136582132975e-05, + "loss": 2.6667, + "step": 37188 + }, + { + "epoch": 2.3085852628965173, + "grad_norm": 0.14012280837079757, + "learning_rate": 1.532653443847807e-05, + "loss": 2.8225, + "step": 37189 + }, + { + "epoch": 2.3086473399962752, + "grad_norm": 0.1506011406018466, + "learning_rate": 1.5323932475723046e-05, + "loss": 2.7038, + "step": 37190 + }, + { + "epoch": 2.308709417096033, + "grad_norm": 0.13398312326185768, + "learning_rate": 1.5321330693881493e-05, + "loss": 2.734, + "step": 37191 + }, + { + "epoch": 2.308771494195791, + "grad_norm": 0.1364726644797093, + "learning_rate": 1.531872909296697e-05, + "loss": 2.6808, + "step": 37192 + }, + { + "epoch": 2.308833571295549, + "grad_norm": 0.1352905471692978, + "learning_rate": 1.5316127672993048e-05, + "loss": 2.5832, + "step": 37193 + }, + { + "epoch": 2.308895648395307, + "grad_norm": 0.13550982037505743, + "learning_rate": 1.531352643397332e-05, + "loss": 2.6282, + "step": 37194 + }, + { + "epoch": 2.308957725495065, + "grad_norm": 0.13527850674442704, + "learning_rate": 1.531092537592134e-05, + "loss": 2.7277, + "step": 37195 + }, + { + "epoch": 2.3090198025948228, + "grad_norm": 0.13952374393269926, + "learning_rate": 1.530832449885069e-05, + "loss": 2.6195, + "step": 37196 + }, + { + "epoch": 2.3090818796945807, + "grad_norm": 0.1342942868727305, + "learning_rate": 1.5305723802774914e-05, + "loss": 2.6622, + "step": 37197 + }, + { + "epoch": 2.3091439567943386, + "grad_norm": 0.14071151932876763, + "learning_rate": 1.5303123287707614e-05, + "loss": 2.7582, + "step": 37198 + }, + { + "epoch": 2.3092060338940965, + "grad_norm": 0.1432194981057099, + "learning_rate": 1.5300522953662344e-05, + "loss": 2.6428, + "step": 37199 + }, + { + "epoch": 2.3092681109938544, + "grad_norm": 0.15097185623512469, + "learning_rate": 1.529792280065267e-05, + "loss": 2.6831, + "step": 37200 + }, + { + "epoch": 2.3093301880936123, + "grad_norm": 0.13213131628213923, + "learning_rate": 1.5295322828692142e-05, + "loss": 2.6975, + "step": 37201 + }, + { + "epoch": 2.3093922651933703, + "grad_norm": 0.14851033782778514, + "learning_rate": 1.5292723037794356e-05, + "loss": 2.7459, + "step": 37202 + }, + { + "epoch": 2.309454342293128, + "grad_norm": 0.13441758201933482, + "learning_rate": 1.5290123427972857e-05, + "loss": 2.8094, + "step": 37203 + }, + { + "epoch": 2.309516419392886, + "grad_norm": 0.1412214504776395, + "learning_rate": 1.528752399924121e-05, + "loss": 2.6496, + "step": 37204 + }, + { + "epoch": 2.309578496492644, + "grad_norm": 0.14280486693736985, + "learning_rate": 1.5284924751612977e-05, + "loss": 2.6331, + "step": 37205 + }, + { + "epoch": 2.309640573592402, + "grad_norm": 0.14673478022704148, + "learning_rate": 1.5282325685101708e-05, + "loss": 2.7378, + "step": 37206 + }, + { + "epoch": 2.30970265069216, + "grad_norm": 0.18183414855798177, + "learning_rate": 1.5279726799720983e-05, + "loss": 2.6562, + "step": 37207 + }, + { + "epoch": 2.3097647277919178, + "grad_norm": 0.14454303512982022, + "learning_rate": 1.527712809548434e-05, + "loss": 2.7537, + "step": 37208 + }, + { + "epoch": 2.3098268048916752, + "grad_norm": 0.14439892510759691, + "learning_rate": 1.5274529572405367e-05, + "loss": 2.6629, + "step": 37209 + }, + { + "epoch": 2.3098888819914336, + "grad_norm": 0.16583989323188603, + "learning_rate": 1.52719312304976e-05, + "loss": 2.7228, + "step": 37210 + }, + { + "epoch": 2.309950959091191, + "grad_norm": 0.15547922281234172, + "learning_rate": 1.5269333069774576e-05, + "loss": 2.646, + "step": 37211 + }, + { + "epoch": 2.310013036190949, + "grad_norm": 0.14405176168209965, + "learning_rate": 1.5266735090249897e-05, + "loss": 2.6826, + "step": 37212 + }, + { + "epoch": 2.310075113290707, + "grad_norm": 0.15484257664895118, + "learning_rate": 1.526413729193709e-05, + "loss": 2.7715, + "step": 37213 + }, + { + "epoch": 2.310137190390465, + "grad_norm": 0.14804141001818236, + "learning_rate": 1.5261539674849712e-05, + "loss": 2.6978, + "step": 37214 + }, + { + "epoch": 2.3101992674902228, + "grad_norm": 0.13459280930538672, + "learning_rate": 1.5258942239001302e-05, + "loss": 2.6263, + "step": 37215 + }, + { + "epoch": 2.3102613445899807, + "grad_norm": 0.16043662905856035, + "learning_rate": 1.5256344984405435e-05, + "loss": 2.7272, + "step": 37216 + }, + { + "epoch": 2.3103234216897386, + "grad_norm": 0.15337138934887004, + "learning_rate": 1.525374791107565e-05, + "loss": 2.6321, + "step": 37217 + }, + { + "epoch": 2.3103854987894965, + "grad_norm": 0.14245096833367574, + "learning_rate": 1.52511510190255e-05, + "loss": 2.6183, + "step": 37218 + }, + { + "epoch": 2.3104475758892544, + "grad_norm": 0.15847229135430815, + "learning_rate": 1.5248554308268531e-05, + "loss": 2.7279, + "step": 37219 + }, + { + "epoch": 2.3105096529890123, + "grad_norm": 0.14257334348420952, + "learning_rate": 1.524595777881827e-05, + "loss": 2.7467, + "step": 37220 + }, + { + "epoch": 2.3105717300887703, + "grad_norm": 0.14263897080872046, + "learning_rate": 1.5243361430688307e-05, + "loss": 2.6915, + "step": 37221 + }, + { + "epoch": 2.310633807188528, + "grad_norm": 0.13745810997698027, + "learning_rate": 1.524076526389216e-05, + "loss": 2.7661, + "step": 37222 + }, + { + "epoch": 2.310695884288286, + "grad_norm": 0.13508729278094622, + "learning_rate": 1.5238169278443382e-05, + "loss": 2.6525, + "step": 37223 + }, + { + "epoch": 2.310757961388044, + "grad_norm": 0.13844856899304783, + "learning_rate": 1.523557347435549e-05, + "loss": 2.755, + "step": 37224 + }, + { + "epoch": 2.310820038487802, + "grad_norm": 0.1483766343160122, + "learning_rate": 1.5232977851642077e-05, + "loss": 2.7741, + "step": 37225 + }, + { + "epoch": 2.31088211558756, + "grad_norm": 0.1733010906903305, + "learning_rate": 1.523038241031665e-05, + "loss": 2.6472, + "step": 37226 + }, + { + "epoch": 2.3109441926873178, + "grad_norm": 0.13389327174426574, + "learning_rate": 1.5227787150392764e-05, + "loss": 2.7326, + "step": 37227 + }, + { + "epoch": 2.3110062697870757, + "grad_norm": 0.1480280537078015, + "learning_rate": 1.5225192071883953e-05, + "loss": 2.7474, + "step": 37228 + }, + { + "epoch": 2.3110683468868336, + "grad_norm": 0.14477656140424974, + "learning_rate": 1.5222597174803738e-05, + "loss": 2.5783, + "step": 37229 + }, + { + "epoch": 2.3111304239865915, + "grad_norm": 0.14842875981950776, + "learning_rate": 1.5220002459165694e-05, + "loss": 2.775, + "step": 37230 + }, + { + "epoch": 2.3111925010863494, + "grad_norm": 0.14803963595685954, + "learning_rate": 1.521740792498334e-05, + "loss": 2.7065, + "step": 37231 + }, + { + "epoch": 2.311254578186107, + "grad_norm": 0.14961842666414799, + "learning_rate": 1.5214813572270214e-05, + "loss": 2.6882, + "step": 37232 + }, + { + "epoch": 2.3113166552858653, + "grad_norm": 0.1419927061824429, + "learning_rate": 1.521221940103983e-05, + "loss": 2.7629, + "step": 37233 + }, + { + "epoch": 2.3113787323856227, + "grad_norm": 0.14240597430560992, + "learning_rate": 1.5209625411305761e-05, + "loss": 2.7495, + "step": 37234 + }, + { + "epoch": 2.3114408094853807, + "grad_norm": 0.14903586649528633, + "learning_rate": 1.5207031603081522e-05, + "loss": 2.6169, + "step": 37235 + }, + { + "epoch": 2.3115028865851386, + "grad_norm": 0.14132976755607932, + "learning_rate": 1.5204437976380647e-05, + "loss": 2.804, + "step": 37236 + }, + { + "epoch": 2.3115649636848965, + "grad_norm": 0.14187440565379675, + "learning_rate": 1.5201844531216646e-05, + "loss": 2.7101, + "step": 37237 + }, + { + "epoch": 2.3116270407846544, + "grad_norm": 0.13880554660701516, + "learning_rate": 1.5199251267603088e-05, + "loss": 2.6869, + "step": 37238 + }, + { + "epoch": 2.3116891178844123, + "grad_norm": 0.1386786781576454, + "learning_rate": 1.5196658185553487e-05, + "loss": 2.6731, + "step": 37239 + }, + { + "epoch": 2.3117511949841703, + "grad_norm": 0.14495446771746348, + "learning_rate": 1.5194065285081366e-05, + "loss": 2.7419, + "step": 37240 + }, + { + "epoch": 2.311813272083928, + "grad_norm": 0.13830126682169855, + "learning_rate": 1.5191472566200238e-05, + "loss": 2.64, + "step": 37241 + }, + { + "epoch": 2.311875349183686, + "grad_norm": 0.1431477026029207, + "learning_rate": 1.518888002892367e-05, + "loss": 2.8729, + "step": 37242 + }, + { + "epoch": 2.311937426283444, + "grad_norm": 0.15347249684685618, + "learning_rate": 1.5186287673265143e-05, + "loss": 2.7497, + "step": 37243 + }, + { + "epoch": 2.311999503383202, + "grad_norm": 0.14574693706321945, + "learning_rate": 1.518369549923822e-05, + "loss": 2.675, + "step": 37244 + }, + { + "epoch": 2.31206158048296, + "grad_norm": 0.13713985712888863, + "learning_rate": 1.5181103506856414e-05, + "loss": 2.7069, + "step": 37245 + }, + { + "epoch": 2.3121236575827178, + "grad_norm": 0.13723924876274235, + "learning_rate": 1.517851169613324e-05, + "loss": 2.6729, + "step": 37246 + }, + { + "epoch": 2.3121857346824757, + "grad_norm": 0.14703262727763278, + "learning_rate": 1.517592006708221e-05, + "loss": 2.747, + "step": 37247 + }, + { + "epoch": 2.3122478117822336, + "grad_norm": 0.13924737171555443, + "learning_rate": 1.5173328619716876e-05, + "loss": 2.7521, + "step": 37248 + }, + { + "epoch": 2.3123098888819915, + "grad_norm": 0.17128497959433994, + "learning_rate": 1.517073735405074e-05, + "loss": 2.6657, + "step": 37249 + }, + { + "epoch": 2.3123719659817494, + "grad_norm": 0.14632267610350871, + "learning_rate": 1.5168146270097322e-05, + "loss": 2.7735, + "step": 37250 + }, + { + "epoch": 2.3124340430815074, + "grad_norm": 0.15247471388692888, + "learning_rate": 1.5165555367870127e-05, + "loss": 2.6883, + "step": 37251 + }, + { + "epoch": 2.3124961201812653, + "grad_norm": 0.13495019353783444, + "learning_rate": 1.51629646473827e-05, + "loss": 2.682, + "step": 37252 + }, + { + "epoch": 2.312558197281023, + "grad_norm": 0.14111952454374546, + "learning_rate": 1.516037410864854e-05, + "loss": 2.664, + "step": 37253 + }, + { + "epoch": 2.312620274380781, + "grad_norm": 0.1575952259348618, + "learning_rate": 1.515778375168117e-05, + "loss": 2.6715, + "step": 37254 + }, + { + "epoch": 2.312682351480539, + "grad_norm": 0.1517801987523444, + "learning_rate": 1.5155193576494098e-05, + "loss": 2.6682, + "step": 37255 + }, + { + "epoch": 2.312744428580297, + "grad_norm": 0.1460145919461187, + "learning_rate": 1.5152603583100828e-05, + "loss": 2.6374, + "step": 37256 + }, + { + "epoch": 2.3128065056800544, + "grad_norm": 0.16244720967823748, + "learning_rate": 1.5150013771514898e-05, + "loss": 2.6477, + "step": 37257 + }, + { + "epoch": 2.3128685827798128, + "grad_norm": 0.1657936912797522, + "learning_rate": 1.5147424141749806e-05, + "loss": 2.7112, + "step": 37258 + }, + { + "epoch": 2.3129306598795702, + "grad_norm": 0.1521634171620039, + "learning_rate": 1.514483469381906e-05, + "loss": 2.733, + "step": 37259 + }, + { + "epoch": 2.312992736979328, + "grad_norm": 0.1388668382213723, + "learning_rate": 1.5142245427736163e-05, + "loss": 2.7242, + "step": 37260 + }, + { + "epoch": 2.313054814079086, + "grad_norm": 0.16080841007435767, + "learning_rate": 1.5139656343514647e-05, + "loss": 2.8036, + "step": 37261 + }, + { + "epoch": 2.313116891178844, + "grad_norm": 0.15456535424914677, + "learning_rate": 1.5137067441168008e-05, + "loss": 2.6871, + "step": 37262 + }, + { + "epoch": 2.313178968278602, + "grad_norm": 0.153839544179497, + "learning_rate": 1.5134478720709749e-05, + "loss": 2.8332, + "step": 37263 + }, + { + "epoch": 2.31324104537836, + "grad_norm": 0.14771687484165974, + "learning_rate": 1.5131890182153374e-05, + "loss": 2.6553, + "step": 37264 + }, + { + "epoch": 2.3133031224781178, + "grad_norm": 0.1434256629069606, + "learning_rate": 1.5129301825512382e-05, + "loss": 2.7294, + "step": 37265 + }, + { + "epoch": 2.3133651995778757, + "grad_norm": 0.13616822300761391, + "learning_rate": 1.5126713650800305e-05, + "loss": 2.7139, + "step": 37266 + }, + { + "epoch": 2.3134272766776336, + "grad_norm": 0.13646325352464128, + "learning_rate": 1.5124125658030625e-05, + "loss": 2.7568, + "step": 37267 + }, + { + "epoch": 2.3134893537773915, + "grad_norm": 0.18750400340256812, + "learning_rate": 1.5121537847216849e-05, + "loss": 2.7632, + "step": 37268 + }, + { + "epoch": 2.3135514308771494, + "grad_norm": 0.14482723677944453, + "learning_rate": 1.5118950218372463e-05, + "loss": 2.7204, + "step": 37269 + }, + { + "epoch": 2.3136135079769073, + "grad_norm": 0.1397739393321086, + "learning_rate": 1.5116362771510994e-05, + "loss": 2.7815, + "step": 37270 + }, + { + "epoch": 2.3136755850766653, + "grad_norm": 0.14320637492702723, + "learning_rate": 1.5113775506645933e-05, + "loss": 2.7402, + "step": 37271 + }, + { + "epoch": 2.313737662176423, + "grad_norm": 0.14719493823886887, + "learning_rate": 1.5111188423790772e-05, + "loss": 2.6954, + "step": 37272 + }, + { + "epoch": 2.313799739276181, + "grad_norm": 0.149555999059027, + "learning_rate": 1.5108601522958993e-05, + "loss": 2.6124, + "step": 37273 + }, + { + "epoch": 2.313861816375939, + "grad_norm": 0.1487336717587563, + "learning_rate": 1.5106014804164114e-05, + "loss": 2.6544, + "step": 37274 + }, + { + "epoch": 2.313923893475697, + "grad_norm": 0.14143260873477942, + "learning_rate": 1.510342826741964e-05, + "loss": 2.6388, + "step": 37275 + }, + { + "epoch": 2.313985970575455, + "grad_norm": 0.13644460713411438, + "learning_rate": 1.510084191273905e-05, + "loss": 2.6653, + "step": 37276 + }, + { + "epoch": 2.3140480476752128, + "grad_norm": 0.15020193921217043, + "learning_rate": 1.5098255740135842e-05, + "loss": 2.6949, + "step": 37277 + }, + { + "epoch": 2.3141101247749707, + "grad_norm": 0.14832102447129916, + "learning_rate": 1.5095669749623508e-05, + "loss": 2.7379, + "step": 37278 + }, + { + "epoch": 2.3141722018747286, + "grad_norm": 0.14452662331676247, + "learning_rate": 1.509308394121552e-05, + "loss": 2.7281, + "step": 37279 + }, + { + "epoch": 2.314234278974486, + "grad_norm": 0.1570719141998251, + "learning_rate": 1.5090498314925405e-05, + "loss": 2.7335, + "step": 37280 + }, + { + "epoch": 2.3142963560742444, + "grad_norm": 0.14355636128598118, + "learning_rate": 1.5087912870766635e-05, + "loss": 2.671, + "step": 37281 + }, + { + "epoch": 2.314358433174002, + "grad_norm": 0.1331113300446346, + "learning_rate": 1.5085327608752698e-05, + "loss": 2.5965, + "step": 37282 + }, + { + "epoch": 2.31442051027376, + "grad_norm": 0.1581329555074534, + "learning_rate": 1.5082742528897064e-05, + "loss": 2.7116, + "step": 37283 + }, + { + "epoch": 2.3144825873735178, + "grad_norm": 0.14294277877676154, + "learning_rate": 1.5080157631213254e-05, + "loss": 2.7253, + "step": 37284 + }, + { + "epoch": 2.3145446644732757, + "grad_norm": 0.1377172858379169, + "learning_rate": 1.5077572915714744e-05, + "loss": 2.614, + "step": 37285 + }, + { + "epoch": 2.3146067415730336, + "grad_norm": 0.16221402134265325, + "learning_rate": 1.5074988382415012e-05, + "loss": 2.7146, + "step": 37286 + }, + { + "epoch": 2.3146688186727915, + "grad_norm": 0.14182366677640518, + "learning_rate": 1.5072404031327525e-05, + "loss": 2.7259, + "step": 37287 + }, + { + "epoch": 2.3147308957725494, + "grad_norm": 0.1517041117484498, + "learning_rate": 1.5069819862465806e-05, + "loss": 2.6583, + "step": 37288 + }, + { + "epoch": 2.3147929728723073, + "grad_norm": 0.15747829093690452, + "learning_rate": 1.506723587584331e-05, + "loss": 2.7052, + "step": 37289 + }, + { + "epoch": 2.3148550499720653, + "grad_norm": 0.17466949584218308, + "learning_rate": 1.5064652071473528e-05, + "loss": 2.6609, + "step": 37290 + }, + { + "epoch": 2.314917127071823, + "grad_norm": 0.14465355852968584, + "learning_rate": 1.5062068449369937e-05, + "loss": 2.7437, + "step": 37291 + }, + { + "epoch": 2.314979204171581, + "grad_norm": 0.13792370731653086, + "learning_rate": 1.5059485009546003e-05, + "loss": 2.7483, + "step": 37292 + }, + { + "epoch": 2.315041281271339, + "grad_norm": 0.14575204258185984, + "learning_rate": 1.5056901752015235e-05, + "loss": 2.7272, + "step": 37293 + }, + { + "epoch": 2.315103358371097, + "grad_norm": 0.13700859816702743, + "learning_rate": 1.5054318676791085e-05, + "loss": 2.6724, + "step": 37294 + }, + { + "epoch": 2.315165435470855, + "grad_norm": 0.14207153595635408, + "learning_rate": 1.5051735783887044e-05, + "loss": 2.7598, + "step": 37295 + }, + { + "epoch": 2.3152275125706128, + "grad_norm": 0.14968828829237207, + "learning_rate": 1.5049153073316563e-05, + "loss": 2.8055, + "step": 37296 + }, + { + "epoch": 2.3152895896703707, + "grad_norm": 0.14181814402020043, + "learning_rate": 1.5046570545093153e-05, + "loss": 2.7687, + "step": 37297 + }, + { + "epoch": 2.3153516667701286, + "grad_norm": 0.15403604794918113, + "learning_rate": 1.504398819923027e-05, + "loss": 2.7505, + "step": 37298 + }, + { + "epoch": 2.3154137438698865, + "grad_norm": 0.14617600726184593, + "learning_rate": 1.5041406035741385e-05, + "loss": 2.6692, + "step": 37299 + }, + { + "epoch": 2.3154758209696444, + "grad_norm": 0.1714587046845943, + "learning_rate": 1.5038824054639972e-05, + "loss": 2.7054, + "step": 37300 + }, + { + "epoch": 2.3155378980694024, + "grad_norm": 0.143123179292967, + "learning_rate": 1.5036242255939487e-05, + "loss": 2.667, + "step": 37301 + }, + { + "epoch": 2.3155999751691603, + "grad_norm": 0.14230335226592236, + "learning_rate": 1.5033660639653424e-05, + "loss": 2.7675, + "step": 37302 + }, + { + "epoch": 2.3156620522689177, + "grad_norm": 0.16089201187116942, + "learning_rate": 1.5031079205795245e-05, + "loss": 2.8205, + "step": 37303 + }, + { + "epoch": 2.315724129368676, + "grad_norm": 0.1321877764976261, + "learning_rate": 1.5028497954378417e-05, + "loss": 2.632, + "step": 37304 + }, + { + "epoch": 2.3157862064684336, + "grad_norm": 0.15492548432909262, + "learning_rate": 1.5025916885416386e-05, + "loss": 2.7154, + "step": 37305 + }, + { + "epoch": 2.3158482835681915, + "grad_norm": 0.1797041781604986, + "learning_rate": 1.5023335998922655e-05, + "loss": 2.7488, + "step": 37306 + }, + { + "epoch": 2.3159103606679494, + "grad_norm": 0.13623565745983315, + "learning_rate": 1.5020755294910649e-05, + "loss": 2.679, + "step": 37307 + }, + { + "epoch": 2.3159724377677073, + "grad_norm": 0.1508103125240059, + "learning_rate": 1.5018174773393878e-05, + "loss": 2.7051, + "step": 37308 + }, + { + "epoch": 2.3160345148674653, + "grad_norm": 0.15200839667750343, + "learning_rate": 1.5015594434385778e-05, + "loss": 2.7623, + "step": 37309 + }, + { + "epoch": 2.316096591967223, + "grad_norm": 0.16169826244187116, + "learning_rate": 1.50130142778998e-05, + "loss": 2.8073, + "step": 37310 + }, + { + "epoch": 2.316158669066981, + "grad_norm": 0.14920424395848825, + "learning_rate": 1.5010434303949434e-05, + "loss": 2.7752, + "step": 37311 + }, + { + "epoch": 2.316220746166739, + "grad_norm": 0.14200198023004668, + "learning_rate": 1.5007854512548125e-05, + "loss": 2.8003, + "step": 37312 + }, + { + "epoch": 2.316282823266497, + "grad_norm": 0.1504649560713467, + "learning_rate": 1.5005274903709337e-05, + "loss": 2.7152, + "step": 37313 + }, + { + "epoch": 2.316344900366255, + "grad_norm": 0.15935900598567992, + "learning_rate": 1.5002695477446522e-05, + "loss": 2.6459, + "step": 37314 + }, + { + "epoch": 2.3164069774660128, + "grad_norm": 0.15780514827449293, + "learning_rate": 1.5000116233773126e-05, + "loss": 2.6903, + "step": 37315 + }, + { + "epoch": 2.3164690545657707, + "grad_norm": 0.15863263142154357, + "learning_rate": 1.4997537172702636e-05, + "loss": 2.7268, + "step": 37316 + }, + { + "epoch": 2.3165311316655286, + "grad_norm": 0.15666346680724763, + "learning_rate": 1.4994958294248495e-05, + "loss": 2.7304, + "step": 37317 + }, + { + "epoch": 2.3165932087652865, + "grad_norm": 0.15073541033383217, + "learning_rate": 1.499237959842415e-05, + "loss": 2.7518, + "step": 37318 + }, + { + "epoch": 2.3166552858650444, + "grad_norm": 0.1360725868908037, + "learning_rate": 1.4989801085243044e-05, + "loss": 2.8126, + "step": 37319 + }, + { + "epoch": 2.3167173629648024, + "grad_norm": 0.15248738755577615, + "learning_rate": 1.4987222754718661e-05, + "loss": 2.693, + "step": 37320 + }, + { + "epoch": 2.3167794400645603, + "grad_norm": 0.15757118390426933, + "learning_rate": 1.4984644606864433e-05, + "loss": 2.6536, + "step": 37321 + }, + { + "epoch": 2.316841517164318, + "grad_norm": 0.18406513247849943, + "learning_rate": 1.498206664169382e-05, + "loss": 2.7393, + "step": 37322 + }, + { + "epoch": 2.316903594264076, + "grad_norm": 0.15129321351920053, + "learning_rate": 1.4979488859220247e-05, + "loss": 2.6803, + "step": 37323 + }, + { + "epoch": 2.316965671363834, + "grad_norm": 0.17541536823578957, + "learning_rate": 1.4976911259457206e-05, + "loss": 2.6954, + "step": 37324 + }, + { + "epoch": 2.317027748463592, + "grad_norm": 0.1523678283736607, + "learning_rate": 1.4974333842418109e-05, + "loss": 2.6937, + "step": 37325 + }, + { + "epoch": 2.31708982556335, + "grad_norm": 0.14197449737920995, + "learning_rate": 1.4971756608116421e-05, + "loss": 2.6807, + "step": 37326 + }, + { + "epoch": 2.3171519026631078, + "grad_norm": 0.14575698179327343, + "learning_rate": 1.4969179556565583e-05, + "loss": 2.7387, + "step": 37327 + }, + { + "epoch": 2.3172139797628653, + "grad_norm": 0.14940556022039594, + "learning_rate": 1.4966602687779024e-05, + "loss": 2.6513, + "step": 37328 + }, + { + "epoch": 2.3172760568626236, + "grad_norm": 0.16926204340682802, + "learning_rate": 1.4964026001770215e-05, + "loss": 2.6922, + "step": 37329 + }, + { + "epoch": 2.317338133962381, + "grad_norm": 0.14669000180447025, + "learning_rate": 1.4961449498552593e-05, + "loss": 2.7492, + "step": 37330 + }, + { + "epoch": 2.317400211062139, + "grad_norm": 0.15081233806024386, + "learning_rate": 1.4958873178139594e-05, + "loss": 2.7097, + "step": 37331 + }, + { + "epoch": 2.317462288161897, + "grad_norm": 0.14275335622063823, + "learning_rate": 1.4956297040544636e-05, + "loss": 2.6763, + "step": 37332 + }, + { + "epoch": 2.317524365261655, + "grad_norm": 0.15565644416689753, + "learning_rate": 1.4953721085781208e-05, + "loss": 2.6695, + "step": 37333 + }, + { + "epoch": 2.3175864423614128, + "grad_norm": 0.17884874502469025, + "learning_rate": 1.4951145313862724e-05, + "loss": 2.7056, + "step": 37334 + }, + { + "epoch": 2.3176485194611707, + "grad_norm": 0.14901261846583888, + "learning_rate": 1.494856972480262e-05, + "loss": 2.6734, + "step": 37335 + }, + { + "epoch": 2.3177105965609286, + "grad_norm": 0.16364750145762705, + "learning_rate": 1.4945994318614326e-05, + "loss": 2.7084, + "step": 37336 + }, + { + "epoch": 2.3177726736606865, + "grad_norm": 0.16249847250917415, + "learning_rate": 1.49434190953113e-05, + "loss": 2.7446, + "step": 37337 + }, + { + "epoch": 2.3178347507604444, + "grad_norm": 0.15814063521687535, + "learning_rate": 1.4940844054906966e-05, + "loss": 2.6899, + "step": 37338 + }, + { + "epoch": 2.3178968278602023, + "grad_norm": 0.14146976914785525, + "learning_rate": 1.4938269197414751e-05, + "loss": 2.7132, + "step": 37339 + }, + { + "epoch": 2.3179589049599603, + "grad_norm": 0.14919423780779825, + "learning_rate": 1.4935694522848104e-05, + "loss": 2.6727, + "step": 37340 + }, + { + "epoch": 2.318020982059718, + "grad_norm": 0.15327566553727945, + "learning_rate": 1.4933120031220454e-05, + "loss": 2.7078, + "step": 37341 + }, + { + "epoch": 2.318083059159476, + "grad_norm": 0.157094746178435, + "learning_rate": 1.493054572254522e-05, + "loss": 2.6766, + "step": 37342 + }, + { + "epoch": 2.318145136259234, + "grad_norm": 0.1612333562027471, + "learning_rate": 1.492797159683585e-05, + "loss": 2.7148, + "step": 37343 + }, + { + "epoch": 2.318207213358992, + "grad_norm": 0.16812373374440845, + "learning_rate": 1.4925397654105772e-05, + "loss": 2.7591, + "step": 37344 + }, + { + "epoch": 2.31826929045875, + "grad_norm": 0.14200617761967285, + "learning_rate": 1.4922823894368404e-05, + "loss": 2.718, + "step": 37345 + }, + { + "epoch": 2.3183313675585078, + "grad_norm": 0.15240718670084547, + "learning_rate": 1.4920250317637169e-05, + "loss": 2.7298, + "step": 37346 + }, + { + "epoch": 2.3183934446582657, + "grad_norm": 0.16401579169277353, + "learning_rate": 1.4917676923925516e-05, + "loss": 2.7643, + "step": 37347 + }, + { + "epoch": 2.3184555217580236, + "grad_norm": 0.14443590266798306, + "learning_rate": 1.4915103713246858e-05, + "loss": 2.7304, + "step": 37348 + }, + { + "epoch": 2.3185175988577815, + "grad_norm": 0.15090286040039286, + "learning_rate": 1.4912530685614623e-05, + "loss": 2.6197, + "step": 37349 + }, + { + "epoch": 2.3185796759575394, + "grad_norm": 0.14829357598449244, + "learning_rate": 1.4909957841042232e-05, + "loss": 2.6992, + "step": 37350 + }, + { + "epoch": 2.318641753057297, + "grad_norm": 0.1558183130751385, + "learning_rate": 1.4907385179543093e-05, + "loss": 2.7636, + "step": 37351 + }, + { + "epoch": 2.3187038301570553, + "grad_norm": 0.13800816807726723, + "learning_rate": 1.4904812701130661e-05, + "loss": 2.6437, + "step": 37352 + }, + { + "epoch": 2.3187659072568128, + "grad_norm": 0.13522828200226275, + "learning_rate": 1.4902240405818346e-05, + "loss": 2.6847, + "step": 37353 + }, + { + "epoch": 2.3188279843565707, + "grad_norm": 0.15038973944267037, + "learning_rate": 1.4899668293619556e-05, + "loss": 2.691, + "step": 37354 + }, + { + "epoch": 2.3188900614563286, + "grad_norm": 0.1410424737916595, + "learning_rate": 1.4897096364547703e-05, + "loss": 2.6965, + "step": 37355 + }, + { + "epoch": 2.3189521385560865, + "grad_norm": 0.15169002192223674, + "learning_rate": 1.489452461861624e-05, + "loss": 2.6829, + "step": 37356 + }, + { + "epoch": 2.3190142156558444, + "grad_norm": 0.1351091181431792, + "learning_rate": 1.4891953055838559e-05, + "loss": 2.672, + "step": 37357 + }, + { + "epoch": 2.3190762927556023, + "grad_norm": 0.13110778489078864, + "learning_rate": 1.488938167622808e-05, + "loss": 2.7381, + "step": 37358 + }, + { + "epoch": 2.3191383698553603, + "grad_norm": 0.14258410166925053, + "learning_rate": 1.4886810479798213e-05, + "loss": 2.7418, + "step": 37359 + }, + { + "epoch": 2.319200446955118, + "grad_norm": 0.15272331026979621, + "learning_rate": 1.4884239466562389e-05, + "loss": 2.6969, + "step": 37360 + }, + { + "epoch": 2.319262524054876, + "grad_norm": 0.13657345544530677, + "learning_rate": 1.4881668636534012e-05, + "loss": 2.779, + "step": 37361 + }, + { + "epoch": 2.319324601154634, + "grad_norm": 0.1392092634858565, + "learning_rate": 1.4879097989726497e-05, + "loss": 2.6794, + "step": 37362 + }, + { + "epoch": 2.319386678254392, + "grad_norm": 0.14551530089086767, + "learning_rate": 1.4876527526153256e-05, + "loss": 2.7028, + "step": 37363 + }, + { + "epoch": 2.31944875535415, + "grad_norm": 0.1475068330394342, + "learning_rate": 1.487395724582768e-05, + "loss": 2.6426, + "step": 37364 + }, + { + "epoch": 2.3195108324539078, + "grad_norm": 0.1442161295106153, + "learning_rate": 1.4871387148763211e-05, + "loss": 2.7583, + "step": 37365 + }, + { + "epoch": 2.3195729095536657, + "grad_norm": 0.1405587172595026, + "learning_rate": 1.4868817234973242e-05, + "loss": 2.7693, + "step": 37366 + }, + { + "epoch": 2.3196349866534236, + "grad_norm": 0.16433726871242368, + "learning_rate": 1.4866247504471182e-05, + "loss": 2.6668, + "step": 37367 + }, + { + "epoch": 2.3196970637531815, + "grad_norm": 0.1409679971715858, + "learning_rate": 1.4863677957270423e-05, + "loss": 2.71, + "step": 37368 + }, + { + "epoch": 2.3197591408529394, + "grad_norm": 0.14749703376018078, + "learning_rate": 1.4861108593384404e-05, + "loss": 2.7344, + "step": 37369 + }, + { + "epoch": 2.3198212179526974, + "grad_norm": 0.14528119278076057, + "learning_rate": 1.4858539412826511e-05, + "loss": 2.7383, + "step": 37370 + }, + { + "epoch": 2.3198832950524553, + "grad_norm": 0.1603419518233325, + "learning_rate": 1.4855970415610144e-05, + "loss": 2.7845, + "step": 37371 + }, + { + "epoch": 2.319945372152213, + "grad_norm": 0.15772895631745484, + "learning_rate": 1.4853401601748701e-05, + "loss": 2.7631, + "step": 37372 + }, + { + "epoch": 2.320007449251971, + "grad_norm": 0.14405523344940152, + "learning_rate": 1.485083297125559e-05, + "loss": 2.75, + "step": 37373 + }, + { + "epoch": 2.320069526351729, + "grad_norm": 0.15721655156269077, + "learning_rate": 1.4848264524144235e-05, + "loss": 2.7133, + "step": 37374 + }, + { + "epoch": 2.320131603451487, + "grad_norm": 0.1421648856404014, + "learning_rate": 1.484569626042801e-05, + "loss": 2.7946, + "step": 37375 + }, + { + "epoch": 2.3201936805512444, + "grad_norm": 0.14552901580755145, + "learning_rate": 1.484312818012033e-05, + "loss": 2.7368, + "step": 37376 + }, + { + "epoch": 2.320255757651003, + "grad_norm": 0.1358257009355085, + "learning_rate": 1.4840560283234584e-05, + "loss": 2.6873, + "step": 37377 + }, + { + "epoch": 2.3203178347507603, + "grad_norm": 0.146144287782528, + "learning_rate": 1.4837992569784148e-05, + "loss": 2.7261, + "step": 37378 + }, + { + "epoch": 2.320379911850518, + "grad_norm": 0.13861706147367972, + "learning_rate": 1.4835425039782463e-05, + "loss": 2.7277, + "step": 37379 + }, + { + "epoch": 2.320441988950276, + "grad_norm": 0.13771740080153286, + "learning_rate": 1.4832857693242902e-05, + "loss": 2.657, + "step": 37380 + }, + { + "epoch": 2.320504066050034, + "grad_norm": 0.16127854317313728, + "learning_rate": 1.4830290530178854e-05, + "loss": 2.6306, + "step": 37381 + }, + { + "epoch": 2.320566143149792, + "grad_norm": 0.14696355645787282, + "learning_rate": 1.4827723550603706e-05, + "loss": 2.7293, + "step": 37382 + }, + { + "epoch": 2.32062822024955, + "grad_norm": 0.14042993765690664, + "learning_rate": 1.4825156754530877e-05, + "loss": 2.7365, + "step": 37383 + }, + { + "epoch": 2.3206902973493078, + "grad_norm": 0.1415466070411606, + "learning_rate": 1.4822590141973747e-05, + "loss": 2.6515, + "step": 37384 + }, + { + "epoch": 2.3207523744490657, + "grad_norm": 0.14685823415878677, + "learning_rate": 1.4820023712945702e-05, + "loss": 2.7381, + "step": 37385 + }, + { + "epoch": 2.3208144515488236, + "grad_norm": 0.171834702536101, + "learning_rate": 1.4817457467460122e-05, + "loss": 2.7805, + "step": 37386 + }, + { + "epoch": 2.3208765286485815, + "grad_norm": 0.15649045681002371, + "learning_rate": 1.4814891405530418e-05, + "loss": 2.7857, + "step": 37387 + }, + { + "epoch": 2.3209386057483394, + "grad_norm": 0.1378792285262114, + "learning_rate": 1.4812325527169968e-05, + "loss": 2.6308, + "step": 37388 + }, + { + "epoch": 2.3210006828480974, + "grad_norm": 0.14255653505293056, + "learning_rate": 1.4809759832392157e-05, + "loss": 2.761, + "step": 37389 + }, + { + "epoch": 2.3210627599478553, + "grad_norm": 0.13653933469366838, + "learning_rate": 1.4807194321210372e-05, + "loss": 2.6716, + "step": 37390 + }, + { + "epoch": 2.321124837047613, + "grad_norm": 0.16514938662333087, + "learning_rate": 1.4804628993637981e-05, + "loss": 2.6966, + "step": 37391 + }, + { + "epoch": 2.321186914147371, + "grad_norm": 0.1518067468818889, + "learning_rate": 1.4802063849688403e-05, + "loss": 2.7518, + "step": 37392 + }, + { + "epoch": 2.321248991247129, + "grad_norm": 0.1486781014959242, + "learning_rate": 1.4799498889375002e-05, + "loss": 2.7294, + "step": 37393 + }, + { + "epoch": 2.321311068346887, + "grad_norm": 0.14409116165088334, + "learning_rate": 1.4796934112711158e-05, + "loss": 2.7286, + "step": 37394 + }, + { + "epoch": 2.321373145446645, + "grad_norm": 0.14528398772433795, + "learning_rate": 1.4794369519710238e-05, + "loss": 2.7334, + "step": 37395 + }, + { + "epoch": 2.3214352225464028, + "grad_norm": 0.15180195543207056, + "learning_rate": 1.4791805110385659e-05, + "loss": 2.7851, + "step": 37396 + }, + { + "epoch": 2.3214972996461607, + "grad_norm": 0.1529259621567148, + "learning_rate": 1.4789240884750777e-05, + "loss": 2.7162, + "step": 37397 + }, + { + "epoch": 2.3215593767459186, + "grad_norm": 0.14335309360473786, + "learning_rate": 1.4786676842818975e-05, + "loss": 2.6721, + "step": 37398 + }, + { + "epoch": 2.321621453845676, + "grad_norm": 0.14182451388645428, + "learning_rate": 1.4784112984603626e-05, + "loss": 2.6551, + "step": 37399 + }, + { + "epoch": 2.3216835309454344, + "grad_norm": 0.14686645904016205, + "learning_rate": 1.4781549310118093e-05, + "loss": 2.7966, + "step": 37400 + }, + { + "epoch": 2.321745608045192, + "grad_norm": 0.149480312594453, + "learning_rate": 1.4778985819375784e-05, + "loss": 2.7493, + "step": 37401 + }, + { + "epoch": 2.32180768514495, + "grad_norm": 0.14390215699114803, + "learning_rate": 1.4776422512390054e-05, + "loss": 2.6025, + "step": 37402 + }, + { + "epoch": 2.3218697622447078, + "grad_norm": 0.14539023904067624, + "learning_rate": 1.4773859389174282e-05, + "loss": 2.6569, + "step": 37403 + }, + { + "epoch": 2.3219318393444657, + "grad_norm": 0.13560400591586938, + "learning_rate": 1.4771296449741818e-05, + "loss": 2.6881, + "step": 37404 + }, + { + "epoch": 2.3219939164442236, + "grad_norm": 0.15471464049056047, + "learning_rate": 1.4768733694106058e-05, + "loss": 2.7051, + "step": 37405 + }, + { + "epoch": 2.3220559935439815, + "grad_norm": 0.15253865172346734, + "learning_rate": 1.4766171122280375e-05, + "loss": 2.679, + "step": 37406 + }, + { + "epoch": 2.3221180706437394, + "grad_norm": 0.14823424075492408, + "learning_rate": 1.4763608734278134e-05, + "loss": 2.756, + "step": 37407 + }, + { + "epoch": 2.3221801477434973, + "grad_norm": 0.15304004662719348, + "learning_rate": 1.47610465301127e-05, + "loss": 2.695, + "step": 37408 + }, + { + "epoch": 2.3222422248432553, + "grad_norm": 0.20668227095170816, + "learning_rate": 1.4758484509797427e-05, + "loss": 2.8063, + "step": 37409 + }, + { + "epoch": 2.322304301943013, + "grad_norm": 0.15105270202926005, + "learning_rate": 1.4755922673345706e-05, + "loss": 2.7594, + "step": 37410 + }, + { + "epoch": 2.322366379042771, + "grad_norm": 0.14247549622233924, + "learning_rate": 1.4753361020770901e-05, + "loss": 2.7012, + "step": 37411 + }, + { + "epoch": 2.322428456142529, + "grad_norm": 0.13716506316160076, + "learning_rate": 1.4750799552086358e-05, + "loss": 2.6184, + "step": 37412 + }, + { + "epoch": 2.322490533242287, + "grad_norm": 0.14619715188151078, + "learning_rate": 1.474823826730546e-05, + "loss": 2.6615, + "step": 37413 + }, + { + "epoch": 2.322552610342045, + "grad_norm": 0.17487225745009027, + "learning_rate": 1.4745677166441541e-05, + "loss": 2.7274, + "step": 37414 + }, + { + "epoch": 2.3226146874418028, + "grad_norm": 0.14762419700888676, + "learning_rate": 1.4743116249507993e-05, + "loss": 2.6926, + "step": 37415 + }, + { + "epoch": 2.3226767645415607, + "grad_norm": 0.1723731527122314, + "learning_rate": 1.4740555516518173e-05, + "loss": 2.6749, + "step": 37416 + }, + { + "epoch": 2.3227388416413186, + "grad_norm": 0.14713956537277337, + "learning_rate": 1.473799496748543e-05, + "loss": 2.6932, + "step": 37417 + }, + { + "epoch": 2.3228009187410765, + "grad_norm": 0.17102302369711606, + "learning_rate": 1.4735434602423114e-05, + "loss": 2.755, + "step": 37418 + }, + { + "epoch": 2.3228629958408344, + "grad_norm": 0.1670630901239193, + "learning_rate": 1.4732874421344616e-05, + "loss": 2.7831, + "step": 37419 + }, + { + "epoch": 2.3229250729405924, + "grad_norm": 0.14946927814362418, + "learning_rate": 1.4730314424263269e-05, + "loss": 2.6568, + "step": 37420 + }, + { + "epoch": 2.3229871500403503, + "grad_norm": 0.1560381705145413, + "learning_rate": 1.4727754611192434e-05, + "loss": 2.7082, + "step": 37421 + }, + { + "epoch": 2.323049227140108, + "grad_norm": 0.14581620095866582, + "learning_rate": 1.4725194982145452e-05, + "loss": 2.7297, + "step": 37422 + }, + { + "epoch": 2.323111304239866, + "grad_norm": 0.15289909670695806, + "learning_rate": 1.4722635537135704e-05, + "loss": 2.7439, + "step": 37423 + }, + { + "epoch": 2.3231733813396236, + "grad_norm": 0.14458847846015643, + "learning_rate": 1.4720076276176532e-05, + "loss": 2.6931, + "step": 37424 + }, + { + "epoch": 2.323235458439382, + "grad_norm": 0.16176921130921024, + "learning_rate": 1.4717517199281288e-05, + "loss": 2.7197, + "step": 37425 + }, + { + "epoch": 2.3232975355391394, + "grad_norm": 0.14327236882248087, + "learning_rate": 1.4714958306463318e-05, + "loss": 2.8062, + "step": 37426 + }, + { + "epoch": 2.3233596126388973, + "grad_norm": 0.14555974328062402, + "learning_rate": 1.4712399597735966e-05, + "loss": 2.8092, + "step": 37427 + }, + { + "epoch": 2.3234216897386553, + "grad_norm": 0.1500942232313922, + "learning_rate": 1.47098410731126e-05, + "loss": 2.7243, + "step": 37428 + }, + { + "epoch": 2.323483766838413, + "grad_norm": 0.1435086961630023, + "learning_rate": 1.4707282732606565e-05, + "loss": 2.7551, + "step": 37429 + }, + { + "epoch": 2.323545843938171, + "grad_norm": 0.14499213862278276, + "learning_rate": 1.4704724576231205e-05, + "loss": 2.6579, + "step": 37430 + }, + { + "epoch": 2.323607921037929, + "grad_norm": 0.16936818160166475, + "learning_rate": 1.4702166603999845e-05, + "loss": 2.6991, + "step": 37431 + }, + { + "epoch": 2.323669998137687, + "grad_norm": 0.14934980344547086, + "learning_rate": 1.469960881592587e-05, + "loss": 2.7888, + "step": 37432 + }, + { + "epoch": 2.323732075237445, + "grad_norm": 0.15778984435971394, + "learning_rate": 1.4697051212022606e-05, + "loss": 2.7089, + "step": 37433 + }, + { + "epoch": 2.3237941523372028, + "grad_norm": 0.16713767935981683, + "learning_rate": 1.4694493792303393e-05, + "loss": 2.7662, + "step": 37434 + }, + { + "epoch": 2.3238562294369607, + "grad_norm": 0.13646745065874608, + "learning_rate": 1.4691936556781578e-05, + "loss": 2.6645, + "step": 37435 + }, + { + "epoch": 2.3239183065367186, + "grad_norm": 0.14937744439890288, + "learning_rate": 1.468937950547048e-05, + "loss": 2.7086, + "step": 37436 + }, + { + "epoch": 2.3239803836364765, + "grad_norm": 0.13672327306622523, + "learning_rate": 1.4686822638383486e-05, + "loss": 2.7529, + "step": 37437 + }, + { + "epoch": 2.3240424607362344, + "grad_norm": 0.14364824684675223, + "learning_rate": 1.4684265955533889e-05, + "loss": 2.7094, + "step": 37438 + }, + { + "epoch": 2.3241045378359924, + "grad_norm": 0.15713356656026498, + "learning_rate": 1.4681709456935066e-05, + "loss": 2.6695, + "step": 37439 + }, + { + "epoch": 2.3241666149357503, + "grad_norm": 0.14344805695240626, + "learning_rate": 1.4679153142600338e-05, + "loss": 2.6866, + "step": 37440 + }, + { + "epoch": 2.324228692035508, + "grad_norm": 0.14880166431800168, + "learning_rate": 1.4676597012543031e-05, + "loss": 2.829, + "step": 37441 + }, + { + "epoch": 2.324290769135266, + "grad_norm": 0.1481731721555173, + "learning_rate": 1.4674041066776506e-05, + "loss": 2.7332, + "step": 37442 + }, + { + "epoch": 2.324352846235024, + "grad_norm": 0.14120668402210734, + "learning_rate": 1.4671485305314087e-05, + "loss": 2.7686, + "step": 37443 + }, + { + "epoch": 2.324414923334782, + "grad_norm": 0.14049049408579434, + "learning_rate": 1.46689297281691e-05, + "loss": 2.7337, + "step": 37444 + }, + { + "epoch": 2.32447700043454, + "grad_norm": 0.14857248392768443, + "learning_rate": 1.4666374335354872e-05, + "loss": 2.7118, + "step": 37445 + }, + { + "epoch": 2.324539077534298, + "grad_norm": 0.14890059586484836, + "learning_rate": 1.4663819126884758e-05, + "loss": 2.7306, + "step": 37446 + }, + { + "epoch": 2.3246011546340553, + "grad_norm": 0.18743395635632867, + "learning_rate": 1.4661264102772083e-05, + "loss": 2.7964, + "step": 37447 + }, + { + "epoch": 2.3246632317338136, + "grad_norm": 0.1384399987225607, + "learning_rate": 1.4658709263030168e-05, + "loss": 2.7057, + "step": 37448 + }, + { + "epoch": 2.324725308833571, + "grad_norm": 0.13552400494992886, + "learning_rate": 1.4656154607672346e-05, + "loss": 2.7172, + "step": 37449 + }, + { + "epoch": 2.324787385933329, + "grad_norm": 0.14559244088646528, + "learning_rate": 1.465360013671193e-05, + "loss": 2.69, + "step": 37450 + }, + { + "epoch": 2.324849463033087, + "grad_norm": 0.1360500330297226, + "learning_rate": 1.4651045850162282e-05, + "loss": 2.6193, + "step": 37451 + }, + { + "epoch": 2.324911540132845, + "grad_norm": 0.13693583148068633, + "learning_rate": 1.4648491748036708e-05, + "loss": 2.7224, + "step": 37452 + }, + { + "epoch": 2.3249736172326028, + "grad_norm": 0.1342237955306314, + "learning_rate": 1.4645937830348532e-05, + "loss": 2.7744, + "step": 37453 + }, + { + "epoch": 2.3250356943323607, + "grad_norm": 0.14003228739120305, + "learning_rate": 1.4643384097111068e-05, + "loss": 2.6798, + "step": 37454 + }, + { + "epoch": 2.3250977714321186, + "grad_norm": 0.1466053272620773, + "learning_rate": 1.4640830548337669e-05, + "loss": 2.7899, + "step": 37455 + }, + { + "epoch": 2.3251598485318765, + "grad_norm": 0.13531803855397115, + "learning_rate": 1.4638277184041637e-05, + "loss": 2.608, + "step": 37456 + }, + { + "epoch": 2.3252219256316344, + "grad_norm": 0.15589805543719382, + "learning_rate": 1.4635724004236295e-05, + "loss": 2.6918, + "step": 37457 + }, + { + "epoch": 2.3252840027313924, + "grad_norm": 0.13878294700722293, + "learning_rate": 1.4633171008934954e-05, + "loss": 2.6873, + "step": 37458 + }, + { + "epoch": 2.3253460798311503, + "grad_norm": 0.14511529092075603, + "learning_rate": 1.4630618198150964e-05, + "loss": 2.7207, + "step": 37459 + }, + { + "epoch": 2.325408156930908, + "grad_norm": 0.14168671645489248, + "learning_rate": 1.462806557189762e-05, + "loss": 2.6426, + "step": 37460 + }, + { + "epoch": 2.325470234030666, + "grad_norm": 0.15364542586601238, + "learning_rate": 1.4625513130188245e-05, + "loss": 2.7066, + "step": 37461 + }, + { + "epoch": 2.325532311130424, + "grad_norm": 0.13935939513290915, + "learning_rate": 1.4622960873036162e-05, + "loss": 2.6839, + "step": 37462 + }, + { + "epoch": 2.325594388230182, + "grad_norm": 0.1419876163486723, + "learning_rate": 1.4620408800454655e-05, + "loss": 2.697, + "step": 37463 + }, + { + "epoch": 2.32565646532994, + "grad_norm": 0.1393754531763176, + "learning_rate": 1.4617856912457083e-05, + "loss": 2.6674, + "step": 37464 + }, + { + "epoch": 2.3257185424296978, + "grad_norm": 0.13895949234781751, + "learning_rate": 1.4615305209056745e-05, + "loss": 2.6889, + "step": 37465 + }, + { + "epoch": 2.3257806195294557, + "grad_norm": 0.14433732425710052, + "learning_rate": 1.4612753690266945e-05, + "loss": 2.7269, + "step": 37466 + }, + { + "epoch": 2.3258426966292136, + "grad_norm": 0.14339605636510983, + "learning_rate": 1.4610202356100987e-05, + "loss": 2.7607, + "step": 37467 + }, + { + "epoch": 2.3259047737289715, + "grad_norm": 0.1427411492989398, + "learning_rate": 1.460765120657221e-05, + "loss": 2.6595, + "step": 37468 + }, + { + "epoch": 2.3259668508287294, + "grad_norm": 0.14382710203172708, + "learning_rate": 1.460510024169391e-05, + "loss": 2.716, + "step": 37469 + }, + { + "epoch": 2.3260289279284874, + "grad_norm": 0.1464006120884749, + "learning_rate": 1.4602549461479376e-05, + "loss": 2.6392, + "step": 37470 + }, + { + "epoch": 2.3260910050282453, + "grad_norm": 0.1431185916856946, + "learning_rate": 1.459999886594195e-05, + "loss": 2.6781, + "step": 37471 + }, + { + "epoch": 2.3261530821280028, + "grad_norm": 0.18508976293890994, + "learning_rate": 1.4597448455094904e-05, + "loss": 2.6717, + "step": 37472 + }, + { + "epoch": 2.326215159227761, + "grad_norm": 0.17388714099272753, + "learning_rate": 1.4594898228951587e-05, + "loss": 2.7513, + "step": 37473 + }, + { + "epoch": 2.3262772363275186, + "grad_norm": 0.14632832798533865, + "learning_rate": 1.4592348187525274e-05, + "loss": 2.7617, + "step": 37474 + }, + { + "epoch": 2.3263393134272765, + "grad_norm": 0.15633937544519397, + "learning_rate": 1.458979833082928e-05, + "loss": 2.714, + "step": 37475 + }, + { + "epoch": 2.3264013905270344, + "grad_norm": 0.14480481805581977, + "learning_rate": 1.4587248658876907e-05, + "loss": 2.7515, + "step": 37476 + }, + { + "epoch": 2.3264634676267923, + "grad_norm": 0.13926180423189205, + "learning_rate": 1.4584699171681432e-05, + "loss": 2.7197, + "step": 37477 + }, + { + "epoch": 2.3265255447265503, + "grad_norm": 0.15562878739103103, + "learning_rate": 1.4582149869256194e-05, + "loss": 2.7227, + "step": 37478 + }, + { + "epoch": 2.326587621826308, + "grad_norm": 0.15061415743088793, + "learning_rate": 1.4579600751614486e-05, + "loss": 2.7322, + "step": 37479 + }, + { + "epoch": 2.326649698926066, + "grad_norm": 0.14606483780267704, + "learning_rate": 1.4577051818769593e-05, + "loss": 2.724, + "step": 37480 + }, + { + "epoch": 2.326711776025824, + "grad_norm": 0.13863436769251095, + "learning_rate": 1.4574503070734807e-05, + "loss": 2.6602, + "step": 37481 + }, + { + "epoch": 2.326773853125582, + "grad_norm": 0.15258271475596577, + "learning_rate": 1.4571954507523456e-05, + "loss": 2.6898, + "step": 37482 + }, + { + "epoch": 2.32683593022534, + "grad_norm": 0.14935515994227977, + "learning_rate": 1.4569406129148816e-05, + "loss": 2.7187, + "step": 37483 + }, + { + "epoch": 2.3268980073250978, + "grad_norm": 0.19056836482913617, + "learning_rate": 1.4566857935624184e-05, + "loss": 2.7568, + "step": 37484 + }, + { + "epoch": 2.3269600844248557, + "grad_norm": 0.14456816150391635, + "learning_rate": 1.456430992696286e-05, + "loss": 2.7836, + "step": 37485 + }, + { + "epoch": 2.3270221615246136, + "grad_norm": 0.17581480023960605, + "learning_rate": 1.4561762103178111e-05, + "loss": 2.7179, + "step": 37486 + }, + { + "epoch": 2.3270842386243715, + "grad_norm": 0.15500976559616086, + "learning_rate": 1.4559214464283278e-05, + "loss": 2.6618, + "step": 37487 + }, + { + "epoch": 2.3271463157241294, + "grad_norm": 0.140851996583485, + "learning_rate": 1.4556667010291618e-05, + "loss": 2.6987, + "step": 37488 + }, + { + "epoch": 2.3272083928238874, + "grad_norm": 0.14104907016983498, + "learning_rate": 1.4554119741216432e-05, + "loss": 2.7161, + "step": 37489 + }, + { + "epoch": 2.3272704699236453, + "grad_norm": 0.14157108110352154, + "learning_rate": 1.4551572657070994e-05, + "loss": 2.7935, + "step": 37490 + }, + { + "epoch": 2.327332547023403, + "grad_norm": 0.14398748978150228, + "learning_rate": 1.4549025757868622e-05, + "loss": 2.6513, + "step": 37491 + }, + { + "epoch": 2.327394624123161, + "grad_norm": 0.14330755747288043, + "learning_rate": 1.4546479043622591e-05, + "loss": 2.6929, + "step": 37492 + }, + { + "epoch": 2.327456701222919, + "grad_norm": 0.16707403842855378, + "learning_rate": 1.4543932514346182e-05, + "loss": 2.6975, + "step": 37493 + }, + { + "epoch": 2.327518778322677, + "grad_norm": 0.14806386075173505, + "learning_rate": 1.4541386170052668e-05, + "loss": 2.7909, + "step": 37494 + }, + { + "epoch": 2.3275808554224344, + "grad_norm": 0.14212951892774722, + "learning_rate": 1.4538840010755367e-05, + "loss": 2.6775, + "step": 37495 + }, + { + "epoch": 2.327642932522193, + "grad_norm": 0.13754664843548298, + "learning_rate": 1.4536294036467545e-05, + "loss": 2.7525, + "step": 37496 + }, + { + "epoch": 2.3277050096219503, + "grad_norm": 0.14806728247546913, + "learning_rate": 1.4533748247202489e-05, + "loss": 2.6934, + "step": 37497 + }, + { + "epoch": 2.327767086721708, + "grad_norm": 0.14066796838432466, + "learning_rate": 1.4531202642973479e-05, + "loss": 2.7571, + "step": 37498 + }, + { + "epoch": 2.327829163821466, + "grad_norm": 0.1460055281228574, + "learning_rate": 1.452865722379378e-05, + "loss": 2.6724, + "step": 37499 + }, + { + "epoch": 2.327891240921224, + "grad_norm": 0.152158153404902, + "learning_rate": 1.4526111989676699e-05, + "loss": 2.7734, + "step": 37500 + }, + { + "epoch": 2.327953318020982, + "grad_norm": 0.13917282448377596, + "learning_rate": 1.4523566940635502e-05, + "loss": 2.6685, + "step": 37501 + }, + { + "epoch": 2.32801539512074, + "grad_norm": 0.1650504384832096, + "learning_rate": 1.4521022076683472e-05, + "loss": 2.7552, + "step": 37502 + }, + { + "epoch": 2.3280774722204978, + "grad_norm": 0.14055839921779711, + "learning_rate": 1.4518477397833868e-05, + "loss": 2.6742, + "step": 37503 + }, + { + "epoch": 2.3281395493202557, + "grad_norm": 0.1455712852034026, + "learning_rate": 1.4515932904099976e-05, + "loss": 2.6878, + "step": 37504 + }, + { + "epoch": 2.3282016264200136, + "grad_norm": 0.13699110016011942, + "learning_rate": 1.4513388595495098e-05, + "loss": 2.6502, + "step": 37505 + }, + { + "epoch": 2.3282637035197715, + "grad_norm": 0.13969233963197103, + "learning_rate": 1.4510844472032482e-05, + "loss": 2.7463, + "step": 37506 + }, + { + "epoch": 2.3283257806195294, + "grad_norm": 0.15090068795710615, + "learning_rate": 1.4508300533725406e-05, + "loss": 2.8288, + "step": 37507 + }, + { + "epoch": 2.3283878577192874, + "grad_norm": 0.15865132926176065, + "learning_rate": 1.4505756780587131e-05, + "loss": 2.7294, + "step": 37508 + }, + { + "epoch": 2.3284499348190453, + "grad_norm": 0.13880229914745662, + "learning_rate": 1.4503213212630951e-05, + "loss": 2.6783, + "step": 37509 + }, + { + "epoch": 2.328512011918803, + "grad_norm": 0.14964291243982156, + "learning_rate": 1.4500669829870122e-05, + "loss": 2.7588, + "step": 37510 + }, + { + "epoch": 2.328574089018561, + "grad_norm": 0.1460769646227501, + "learning_rate": 1.449812663231792e-05, + "loss": 2.8257, + "step": 37511 + }, + { + "epoch": 2.328636166118319, + "grad_norm": 0.1506070586061124, + "learning_rate": 1.4495583619987613e-05, + "loss": 2.774, + "step": 37512 + }, + { + "epoch": 2.328698243218077, + "grad_norm": 0.16050726524922596, + "learning_rate": 1.4493040792892442e-05, + "loss": 2.755, + "step": 37513 + }, + { + "epoch": 2.328760320317835, + "grad_norm": 0.1581040590257572, + "learning_rate": 1.4490498151045718e-05, + "loss": 2.6615, + "step": 37514 + }, + { + "epoch": 2.328822397417593, + "grad_norm": 0.13953256801866978, + "learning_rate": 1.4487955694460681e-05, + "loss": 2.6776, + "step": 37515 + }, + { + "epoch": 2.3288844745173507, + "grad_norm": 0.14638405861705642, + "learning_rate": 1.4485413423150607e-05, + "loss": 2.7574, + "step": 37516 + }, + { + "epoch": 2.3289465516171086, + "grad_norm": 0.14548981318420723, + "learning_rate": 1.4482871337128729e-05, + "loss": 2.7434, + "step": 37517 + }, + { + "epoch": 2.3290086287168665, + "grad_norm": 0.14155969987504105, + "learning_rate": 1.4480329436408353e-05, + "loss": 2.7517, + "step": 37518 + }, + { + "epoch": 2.3290707058166245, + "grad_norm": 0.13543406169991287, + "learning_rate": 1.4477787721002722e-05, + "loss": 2.6486, + "step": 37519 + }, + { + "epoch": 2.329132782916382, + "grad_norm": 0.14495421071996217, + "learning_rate": 1.4475246190925096e-05, + "loss": 2.763, + "step": 37520 + }, + { + "epoch": 2.3291948600161403, + "grad_norm": 0.1443871024145538, + "learning_rate": 1.4472704846188729e-05, + "loss": 2.7896, + "step": 37521 + }, + { + "epoch": 2.3292569371158978, + "grad_norm": 0.15739917165454648, + "learning_rate": 1.4470163686806877e-05, + "loss": 2.6992, + "step": 37522 + }, + { + "epoch": 2.3293190142156557, + "grad_norm": 0.15198403463308868, + "learning_rate": 1.4467622712792816e-05, + "loss": 2.7489, + "step": 37523 + }, + { + "epoch": 2.3293810913154136, + "grad_norm": 0.15329190616722024, + "learning_rate": 1.4465081924159796e-05, + "loss": 2.7498, + "step": 37524 + }, + { + "epoch": 2.3294431684151715, + "grad_norm": 0.14168123437390936, + "learning_rate": 1.446254132092107e-05, + "loss": 2.7818, + "step": 37525 + }, + { + "epoch": 2.3295052455149294, + "grad_norm": 0.14743658205711183, + "learning_rate": 1.4460000903089876e-05, + "loss": 2.7557, + "step": 37526 + }, + { + "epoch": 2.3295673226146874, + "grad_norm": 0.15786384771656634, + "learning_rate": 1.44574606706795e-05, + "loss": 2.6875, + "step": 37527 + }, + { + "epoch": 2.3296293997144453, + "grad_norm": 0.14033291205207415, + "learning_rate": 1.4454920623703182e-05, + "loss": 2.6441, + "step": 37528 + }, + { + "epoch": 2.329691476814203, + "grad_norm": 0.14380392509196588, + "learning_rate": 1.4452380762174172e-05, + "loss": 2.7709, + "step": 37529 + }, + { + "epoch": 2.329753553913961, + "grad_norm": 0.14287034228134432, + "learning_rate": 1.4449841086105702e-05, + "loss": 2.7219, + "step": 37530 + }, + { + "epoch": 2.329815631013719, + "grad_norm": 0.1398524015297155, + "learning_rate": 1.4447301595511065e-05, + "loss": 2.6611, + "step": 37531 + }, + { + "epoch": 2.329877708113477, + "grad_norm": 0.1479630479549817, + "learning_rate": 1.444476229040348e-05, + "loss": 2.7818, + "step": 37532 + }, + { + "epoch": 2.329939785213235, + "grad_norm": 0.14386069192756945, + "learning_rate": 1.4442223170796205e-05, + "loss": 2.726, + "step": 37533 + }, + { + "epoch": 2.3300018623129928, + "grad_norm": 0.13807248477611628, + "learning_rate": 1.4439684236702477e-05, + "loss": 2.7224, + "step": 37534 + }, + { + "epoch": 2.3300639394127507, + "grad_norm": 0.14284227019190046, + "learning_rate": 1.4437145488135539e-05, + "loss": 2.7747, + "step": 37535 + }, + { + "epoch": 2.3301260165125086, + "grad_norm": 0.14987323987911974, + "learning_rate": 1.4434606925108645e-05, + "loss": 2.7377, + "step": 37536 + }, + { + "epoch": 2.3301880936122665, + "grad_norm": 0.1417568144943161, + "learning_rate": 1.4432068547635052e-05, + "loss": 2.7989, + "step": 37537 + }, + { + "epoch": 2.3302501707120244, + "grad_norm": 0.1411151291881327, + "learning_rate": 1.4429530355728e-05, + "loss": 2.7894, + "step": 37538 + }, + { + "epoch": 2.3303122478117824, + "grad_norm": 0.15223845486565818, + "learning_rate": 1.4426992349400714e-05, + "loss": 2.6577, + "step": 37539 + }, + { + "epoch": 2.3303743249115403, + "grad_norm": 0.14896365450200175, + "learning_rate": 1.4424454528666437e-05, + "loss": 2.7377, + "step": 37540 + }, + { + "epoch": 2.330436402011298, + "grad_norm": 0.1356558490862367, + "learning_rate": 1.442191689353843e-05, + "loss": 2.7194, + "step": 37541 + }, + { + "epoch": 2.330498479111056, + "grad_norm": 0.15075088056778452, + "learning_rate": 1.4419379444029918e-05, + "loss": 2.7341, + "step": 37542 + }, + { + "epoch": 2.3305605562108136, + "grad_norm": 0.14957210808685137, + "learning_rate": 1.4416842180154139e-05, + "loss": 2.6904, + "step": 37543 + }, + { + "epoch": 2.330622633310572, + "grad_norm": 0.15950363054598046, + "learning_rate": 1.4414305101924319e-05, + "loss": 2.7355, + "step": 37544 + }, + { + "epoch": 2.3306847104103294, + "grad_norm": 0.137951193790915, + "learning_rate": 1.4411768209353726e-05, + "loss": 2.6943, + "step": 37545 + }, + { + "epoch": 2.3307467875100873, + "grad_norm": 0.14259527704197467, + "learning_rate": 1.4409231502455573e-05, + "loss": 2.6135, + "step": 37546 + }, + { + "epoch": 2.3308088646098453, + "grad_norm": 0.1356842068128023, + "learning_rate": 1.4406694981243101e-05, + "loss": 2.6415, + "step": 37547 + }, + { + "epoch": 2.330870941709603, + "grad_norm": 0.14075726011499953, + "learning_rate": 1.4404158645729543e-05, + "loss": 2.7339, + "step": 37548 + }, + { + "epoch": 2.330933018809361, + "grad_norm": 0.15514301724778937, + "learning_rate": 1.4401622495928114e-05, + "loss": 2.664, + "step": 37549 + }, + { + "epoch": 2.330995095909119, + "grad_norm": 0.13700560041539608, + "learning_rate": 1.439908653185208e-05, + "loss": 2.6493, + "step": 37550 + }, + { + "epoch": 2.331057173008877, + "grad_norm": 0.18237347583643704, + "learning_rate": 1.4396550753514654e-05, + "loss": 2.8179, + "step": 37551 + }, + { + "epoch": 2.331119250108635, + "grad_norm": 0.1396357790671795, + "learning_rate": 1.4394015160929064e-05, + "loss": 2.6188, + "step": 37552 + }, + { + "epoch": 2.3311813272083928, + "grad_norm": 0.1546424665307675, + "learning_rate": 1.4391479754108522e-05, + "loss": 2.7256, + "step": 37553 + }, + { + "epoch": 2.3312434043081507, + "grad_norm": 0.17172327714251007, + "learning_rate": 1.4388944533066295e-05, + "loss": 2.7237, + "step": 37554 + }, + { + "epoch": 2.3313054814079086, + "grad_norm": 0.14608494023162646, + "learning_rate": 1.438640949781559e-05, + "loss": 2.6719, + "step": 37555 + }, + { + "epoch": 2.3313675585076665, + "grad_norm": 0.1361143923410452, + "learning_rate": 1.4383874648369628e-05, + "loss": 2.7328, + "step": 37556 + }, + { + "epoch": 2.3314296356074244, + "grad_norm": 0.16107917239612465, + "learning_rate": 1.4381339984741621e-05, + "loss": 2.7251, + "step": 37557 + }, + { + "epoch": 2.3314917127071824, + "grad_norm": 0.14608385644143745, + "learning_rate": 1.4378805506944832e-05, + "loss": 2.7078, + "step": 37558 + }, + { + "epoch": 2.3315537898069403, + "grad_norm": 0.14017636871321093, + "learning_rate": 1.437627121499246e-05, + "loss": 2.7751, + "step": 37559 + }, + { + "epoch": 2.331615866906698, + "grad_norm": 0.16964377742711317, + "learning_rate": 1.4373737108897722e-05, + "loss": 2.7849, + "step": 37560 + }, + { + "epoch": 2.331677944006456, + "grad_norm": 0.150253908759082, + "learning_rate": 1.4371203188673854e-05, + "loss": 2.7088, + "step": 37561 + }, + { + "epoch": 2.331740021106214, + "grad_norm": 0.141536230603561, + "learning_rate": 1.436866945433405e-05, + "loss": 2.754, + "step": 37562 + }, + { + "epoch": 2.331802098205972, + "grad_norm": 0.14435600920658465, + "learning_rate": 1.4366135905891565e-05, + "loss": 2.698, + "step": 37563 + }, + { + "epoch": 2.33186417530573, + "grad_norm": 0.14145329045992155, + "learning_rate": 1.4363602543359594e-05, + "loss": 2.7323, + "step": 37564 + }, + { + "epoch": 2.331926252405488, + "grad_norm": 0.13719980214036223, + "learning_rate": 1.4361069366751367e-05, + "loss": 2.687, + "step": 37565 + }, + { + "epoch": 2.3319883295052457, + "grad_norm": 0.14866664416221412, + "learning_rate": 1.4358536376080072e-05, + "loss": 2.6829, + "step": 37566 + }, + { + "epoch": 2.3320504066050036, + "grad_norm": 0.1357604638838668, + "learning_rate": 1.4356003571358962e-05, + "loss": 2.7588, + "step": 37567 + }, + { + "epoch": 2.332112483704761, + "grad_norm": 0.16147783505171198, + "learning_rate": 1.4353470952601234e-05, + "loss": 2.6544, + "step": 37568 + }, + { + "epoch": 2.3321745608045195, + "grad_norm": 0.14516349351464602, + "learning_rate": 1.4350938519820084e-05, + "loss": 2.6725, + "step": 37569 + }, + { + "epoch": 2.332236637904277, + "grad_norm": 0.135935234456531, + "learning_rate": 1.4348406273028758e-05, + "loss": 2.7048, + "step": 37570 + }, + { + "epoch": 2.332298715004035, + "grad_norm": 0.14206502638708945, + "learning_rate": 1.4345874212240456e-05, + "loss": 2.6783, + "step": 37571 + }, + { + "epoch": 2.3323607921037928, + "grad_norm": 0.14340398624443101, + "learning_rate": 1.4343342337468364e-05, + "loss": 2.7012, + "step": 37572 + }, + { + "epoch": 2.3324228692035507, + "grad_norm": 0.1384136186026233, + "learning_rate": 1.4340810648725728e-05, + "loss": 2.7599, + "step": 37573 + }, + { + "epoch": 2.3324849463033086, + "grad_norm": 0.1435674869254947, + "learning_rate": 1.433827914602574e-05, + "loss": 2.7449, + "step": 37574 + }, + { + "epoch": 2.3325470234030665, + "grad_norm": 0.13799603407368244, + "learning_rate": 1.4335747829381601e-05, + "loss": 2.6939, + "step": 37575 + }, + { + "epoch": 2.3326091005028244, + "grad_norm": 0.13809675851424952, + "learning_rate": 1.4333216698806517e-05, + "loss": 2.7279, + "step": 37576 + }, + { + "epoch": 2.3326711776025824, + "grad_norm": 0.1405244308710577, + "learning_rate": 1.4330685754313711e-05, + "loss": 2.7033, + "step": 37577 + }, + { + "epoch": 2.3327332547023403, + "grad_norm": 0.14419524339107012, + "learning_rate": 1.4328154995916376e-05, + "loss": 2.7189, + "step": 37578 + }, + { + "epoch": 2.332795331802098, + "grad_norm": 0.14424478649225023, + "learning_rate": 1.4325624423627716e-05, + "loss": 2.7815, + "step": 37579 + }, + { + "epoch": 2.332857408901856, + "grad_norm": 0.15218290983950308, + "learning_rate": 1.4323094037460915e-05, + "loss": 2.6698, + "step": 37580 + }, + { + "epoch": 2.332919486001614, + "grad_norm": 0.14264246390372626, + "learning_rate": 1.4320563837429213e-05, + "loss": 2.7947, + "step": 37581 + }, + { + "epoch": 2.332981563101372, + "grad_norm": 0.15994088882174395, + "learning_rate": 1.431803382354579e-05, + "loss": 2.6882, + "step": 37582 + }, + { + "epoch": 2.33304364020113, + "grad_norm": 0.15990350714689328, + "learning_rate": 1.4315503995823848e-05, + "loss": 2.669, + "step": 37583 + }, + { + "epoch": 2.333105717300888, + "grad_norm": 0.13875173901636595, + "learning_rate": 1.431297435427658e-05, + "loss": 2.6827, + "step": 37584 + }, + { + "epoch": 2.3331677944006457, + "grad_norm": 0.14446893843750544, + "learning_rate": 1.431044489891718e-05, + "loss": 2.7259, + "step": 37585 + }, + { + "epoch": 2.3332298715004036, + "grad_norm": 0.13826324809080412, + "learning_rate": 1.430791562975886e-05, + "loss": 2.7066, + "step": 37586 + }, + { + "epoch": 2.3332919486001615, + "grad_norm": 0.15185169080346225, + "learning_rate": 1.4305386546814815e-05, + "loss": 2.7028, + "step": 37587 + }, + { + "epoch": 2.3333540256999195, + "grad_norm": 0.1419996912887401, + "learning_rate": 1.4302857650098233e-05, + "loss": 2.7232, + "step": 37588 + }, + { + "epoch": 2.3334161027996774, + "grad_norm": 0.16003142702949147, + "learning_rate": 1.4300328939622292e-05, + "loss": 2.7761, + "step": 37589 + }, + { + "epoch": 2.3334781798994353, + "grad_norm": 0.14493781333739778, + "learning_rate": 1.4297800415400215e-05, + "loss": 2.5344, + "step": 37590 + }, + { + "epoch": 2.3335402569991928, + "grad_norm": 0.13964467610448755, + "learning_rate": 1.4295272077445181e-05, + "loss": 2.6853, + "step": 37591 + }, + { + "epoch": 2.333602334098951, + "grad_norm": 0.14943886220726754, + "learning_rate": 1.429274392577038e-05, + "loss": 2.6884, + "step": 37592 + }, + { + "epoch": 2.3336644111987086, + "grad_norm": 0.13768428731296614, + "learning_rate": 1.4290215960388987e-05, + "loss": 2.7064, + "step": 37593 + }, + { + "epoch": 2.3337264882984665, + "grad_norm": 0.16403680554773242, + "learning_rate": 1.4287688181314224e-05, + "loss": 2.6015, + "step": 37594 + }, + { + "epoch": 2.3337885653982244, + "grad_norm": 0.14142622121703757, + "learning_rate": 1.428516058855926e-05, + "loss": 2.72, + "step": 37595 + }, + { + "epoch": 2.3338506424979824, + "grad_norm": 0.163923516895877, + "learning_rate": 1.4282633182137278e-05, + "loss": 2.7495, + "step": 37596 + }, + { + "epoch": 2.3339127195977403, + "grad_norm": 0.13163793317719724, + "learning_rate": 1.4280105962061474e-05, + "loss": 2.6087, + "step": 37597 + }, + { + "epoch": 2.333974796697498, + "grad_norm": 0.15673888732222557, + "learning_rate": 1.4277578928345015e-05, + "loss": 2.8051, + "step": 37598 + }, + { + "epoch": 2.334036873797256, + "grad_norm": 0.17129229800257895, + "learning_rate": 1.427505208100111e-05, + "loss": 2.7293, + "step": 37599 + }, + { + "epoch": 2.334098950897014, + "grad_norm": 0.14254333777169653, + "learning_rate": 1.4272525420042937e-05, + "loss": 2.7026, + "step": 37600 + }, + { + "epoch": 2.334161027996772, + "grad_norm": 0.145101536899031, + "learning_rate": 1.4269998945483648e-05, + "loss": 2.7853, + "step": 37601 + }, + { + "epoch": 2.33422310509653, + "grad_norm": 0.14659016921436008, + "learning_rate": 1.4267472657336473e-05, + "loss": 2.6903, + "step": 37602 + }, + { + "epoch": 2.3342851821962878, + "grad_norm": 0.15038174441981522, + "learning_rate": 1.4264946555614545e-05, + "loss": 2.6844, + "step": 37603 + }, + { + "epoch": 2.3343472592960457, + "grad_norm": 0.15346578512122247, + "learning_rate": 1.4262420640331081e-05, + "loss": 2.7752, + "step": 37604 + }, + { + "epoch": 2.3344093363958036, + "grad_norm": 0.14451660262453686, + "learning_rate": 1.425989491149925e-05, + "loss": 2.694, + "step": 37605 + }, + { + "epoch": 2.3344714134955615, + "grad_norm": 0.1453716549906723, + "learning_rate": 1.4257369369132217e-05, + "loss": 2.7728, + "step": 37606 + }, + { + "epoch": 2.3345334905953194, + "grad_norm": 0.14261398125844782, + "learning_rate": 1.4254844013243157e-05, + "loss": 2.6687, + "step": 37607 + }, + { + "epoch": 2.3345955676950774, + "grad_norm": 0.15555871991208287, + "learning_rate": 1.4252318843845264e-05, + "loss": 2.6596, + "step": 37608 + }, + { + "epoch": 2.3346576447948353, + "grad_norm": 0.15976346511126982, + "learning_rate": 1.4249793860951699e-05, + "loss": 2.6888, + "step": 37609 + }, + { + "epoch": 2.334719721894593, + "grad_norm": 0.14965714521174187, + "learning_rate": 1.424726906457564e-05, + "loss": 2.7489, + "step": 37610 + }, + { + "epoch": 2.334781798994351, + "grad_norm": 0.15593166706398376, + "learning_rate": 1.4244744454730257e-05, + "loss": 2.739, + "step": 37611 + }, + { + "epoch": 2.334843876094109, + "grad_norm": 0.14942334339513727, + "learning_rate": 1.424222003142871e-05, + "loss": 2.725, + "step": 37612 + }, + { + "epoch": 2.334905953193867, + "grad_norm": 0.17459012294516094, + "learning_rate": 1.4239695794684193e-05, + "loss": 2.6796, + "step": 37613 + }, + { + "epoch": 2.334968030293625, + "grad_norm": 0.14101402461446189, + "learning_rate": 1.4237171744509869e-05, + "loss": 2.7118, + "step": 37614 + }, + { + "epoch": 2.335030107393383, + "grad_norm": 0.14491084871734133, + "learning_rate": 1.42346478809189e-05, + "loss": 2.7521, + "step": 37615 + }, + { + "epoch": 2.3350921844931403, + "grad_norm": 0.1497400753731867, + "learning_rate": 1.4232124203924435e-05, + "loss": 2.7676, + "step": 37616 + }, + { + "epoch": 2.3351542615928986, + "grad_norm": 0.14211710769896851, + "learning_rate": 1.422960071353968e-05, + "loss": 2.7542, + "step": 37617 + }, + { + "epoch": 2.335216338692656, + "grad_norm": 0.14431226094805585, + "learning_rate": 1.4227077409777784e-05, + "loss": 2.6897, + "step": 37618 + }, + { + "epoch": 2.335278415792414, + "grad_norm": 0.15308149532327583, + "learning_rate": 1.422455429265191e-05, + "loss": 2.7581, + "step": 37619 + }, + { + "epoch": 2.335340492892172, + "grad_norm": 0.1391135622958474, + "learning_rate": 1.4222031362175215e-05, + "loss": 2.754, + "step": 37620 + }, + { + "epoch": 2.33540256999193, + "grad_norm": 0.1669294451359514, + "learning_rate": 1.4219508618360855e-05, + "loss": 2.7622, + "step": 37621 + }, + { + "epoch": 2.3354646470916878, + "grad_norm": 0.13989915549375798, + "learning_rate": 1.4216986061222021e-05, + "loss": 2.7296, + "step": 37622 + }, + { + "epoch": 2.3355267241914457, + "grad_norm": 0.14559389294791097, + "learning_rate": 1.4214463690771856e-05, + "loss": 2.732, + "step": 37623 + }, + { + "epoch": 2.3355888012912036, + "grad_norm": 0.15222022354910117, + "learning_rate": 1.4211941507023525e-05, + "loss": 2.6523, + "step": 37624 + }, + { + "epoch": 2.3356508783909615, + "grad_norm": 0.1402332026929389, + "learning_rate": 1.4209419509990162e-05, + "loss": 2.7076, + "step": 37625 + }, + { + "epoch": 2.3357129554907194, + "grad_norm": 0.15089380628967689, + "learning_rate": 1.4206897699684967e-05, + "loss": 2.7445, + "step": 37626 + }, + { + "epoch": 2.3357750325904774, + "grad_norm": 0.1387361055367822, + "learning_rate": 1.4204376076121072e-05, + "loss": 2.692, + "step": 37627 + }, + { + "epoch": 2.3358371096902353, + "grad_norm": 0.1419088777770504, + "learning_rate": 1.420185463931164e-05, + "loss": 2.7175, + "step": 37628 + }, + { + "epoch": 2.335899186789993, + "grad_norm": 0.15491099612291656, + "learning_rate": 1.4199333389269803e-05, + "loss": 2.7165, + "step": 37629 + }, + { + "epoch": 2.335961263889751, + "grad_norm": 0.13919635770300082, + "learning_rate": 1.4196812326008752e-05, + "loss": 2.6671, + "step": 37630 + }, + { + "epoch": 2.336023340989509, + "grad_norm": 0.1396398675062915, + "learning_rate": 1.4194291449541625e-05, + "loss": 2.6642, + "step": 37631 + }, + { + "epoch": 2.336085418089267, + "grad_norm": 0.16218571416610358, + "learning_rate": 1.4191770759881568e-05, + "loss": 2.7268, + "step": 37632 + }, + { + "epoch": 2.336147495189025, + "grad_norm": 0.15325537600274478, + "learning_rate": 1.4189250257041736e-05, + "loss": 2.7323, + "step": 37633 + }, + { + "epoch": 2.336209572288783, + "grad_norm": 0.1442620631649714, + "learning_rate": 1.4186729941035265e-05, + "loss": 2.7853, + "step": 37634 + }, + { + "epoch": 2.3362716493885407, + "grad_norm": 0.14125316115234746, + "learning_rate": 1.4184209811875315e-05, + "loss": 2.6653, + "step": 37635 + }, + { + "epoch": 2.3363337264882986, + "grad_norm": 0.13990480060009813, + "learning_rate": 1.4181689869575054e-05, + "loss": 2.7166, + "step": 37636 + }, + { + "epoch": 2.3363958035880565, + "grad_norm": 0.16317857749438336, + "learning_rate": 1.4179170114147611e-05, + "loss": 2.7985, + "step": 37637 + }, + { + "epoch": 2.3364578806878145, + "grad_norm": 0.1448218092714745, + "learning_rate": 1.4176650545606135e-05, + "loss": 2.6241, + "step": 37638 + }, + { + "epoch": 2.336519957787572, + "grad_norm": 0.15473703785688694, + "learning_rate": 1.4174131163963756e-05, + "loss": 2.7979, + "step": 37639 + }, + { + "epoch": 2.3365820348873303, + "grad_norm": 0.2098354141812272, + "learning_rate": 1.4171611969233645e-05, + "loss": 2.7091, + "step": 37640 + }, + { + "epoch": 2.3366441119870878, + "grad_norm": 0.14197407099236323, + "learning_rate": 1.4169092961428932e-05, + "loss": 2.7292, + "step": 37641 + }, + { + "epoch": 2.3367061890868457, + "grad_norm": 0.14081849587759437, + "learning_rate": 1.4166574140562761e-05, + "loss": 2.6936, + "step": 37642 + }, + { + "epoch": 2.3367682661866036, + "grad_norm": 0.156646844450274, + "learning_rate": 1.4164055506648255e-05, + "loss": 2.683, + "step": 37643 + }, + { + "epoch": 2.3368303432863615, + "grad_norm": 0.14630510804985739, + "learning_rate": 1.4161537059698582e-05, + "loss": 2.7035, + "step": 37644 + }, + { + "epoch": 2.3368924203861194, + "grad_norm": 0.15772491163118893, + "learning_rate": 1.4159018799726876e-05, + "loss": 2.7603, + "step": 37645 + }, + { + "epoch": 2.3369544974858774, + "grad_norm": 0.13703120494785243, + "learning_rate": 1.4156500726746268e-05, + "loss": 2.715, + "step": 37646 + }, + { + "epoch": 2.3370165745856353, + "grad_norm": 0.14132919424990514, + "learning_rate": 1.4153982840769897e-05, + "loss": 2.6821, + "step": 37647 + }, + { + "epoch": 2.337078651685393, + "grad_norm": 0.1599273499659064, + "learning_rate": 1.4151465141810882e-05, + "loss": 2.7525, + "step": 37648 + }, + { + "epoch": 2.337140728785151, + "grad_norm": 0.13561274854417674, + "learning_rate": 1.4148947629882398e-05, + "loss": 2.7313, + "step": 37649 + }, + { + "epoch": 2.337202805884909, + "grad_norm": 0.14746545948901307, + "learning_rate": 1.414643030499755e-05, + "loss": 2.6415, + "step": 37650 + }, + { + "epoch": 2.337264882984667, + "grad_norm": 0.1451783835450996, + "learning_rate": 1.4143913167169481e-05, + "loss": 2.717, + "step": 37651 + }, + { + "epoch": 2.337326960084425, + "grad_norm": 0.1337984179267807, + "learning_rate": 1.4141396216411306e-05, + "loss": 2.6839, + "step": 37652 + }, + { + "epoch": 2.337389037184183, + "grad_norm": 0.13518270277787406, + "learning_rate": 1.4138879452736187e-05, + "loss": 2.7369, + "step": 37653 + }, + { + "epoch": 2.3374511142839407, + "grad_norm": 0.13564526919554867, + "learning_rate": 1.4136362876157238e-05, + "loss": 2.6868, + "step": 37654 + }, + { + "epoch": 2.3375131913836986, + "grad_norm": 0.1427186378067608, + "learning_rate": 1.4133846486687597e-05, + "loss": 2.7324, + "step": 37655 + }, + { + "epoch": 2.3375752684834565, + "grad_norm": 0.13944800204529484, + "learning_rate": 1.4131330284340377e-05, + "loss": 2.6896, + "step": 37656 + }, + { + "epoch": 2.3376373455832145, + "grad_norm": 0.16801883815657637, + "learning_rate": 1.4128814269128704e-05, + "loss": 2.7272, + "step": 37657 + }, + { + "epoch": 2.3376994226829724, + "grad_norm": 0.13433923855358754, + "learning_rate": 1.4126298441065728e-05, + "loss": 2.743, + "step": 37658 + }, + { + "epoch": 2.3377614997827303, + "grad_norm": 0.1385692229926275, + "learning_rate": 1.4123782800164564e-05, + "loss": 2.76, + "step": 37659 + }, + { + "epoch": 2.337823576882488, + "grad_norm": 0.13455338973785155, + "learning_rate": 1.4121267346438333e-05, + "loss": 2.7417, + "step": 37660 + }, + { + "epoch": 2.337885653982246, + "grad_norm": 0.14137698155143502, + "learning_rate": 1.4118752079900143e-05, + "loss": 2.6856, + "step": 37661 + }, + { + "epoch": 2.3379477310820036, + "grad_norm": 0.15714719795081444, + "learning_rate": 1.4116237000563153e-05, + "loss": 2.7219, + "step": 37662 + }, + { + "epoch": 2.338009808181762, + "grad_norm": 0.1536030015943503, + "learning_rate": 1.4113722108440463e-05, + "loss": 2.5763, + "step": 37663 + }, + { + "epoch": 2.3380718852815194, + "grad_norm": 0.14246046750802036, + "learning_rate": 1.41112074035452e-05, + "loss": 2.6811, + "step": 37664 + }, + { + "epoch": 2.3381339623812774, + "grad_norm": 0.14574045746164216, + "learning_rate": 1.4108692885890462e-05, + "loss": 2.6503, + "step": 37665 + }, + { + "epoch": 2.3381960394810353, + "grad_norm": 0.13918461501022608, + "learning_rate": 1.4106178555489403e-05, + "loss": 2.7299, + "step": 37666 + }, + { + "epoch": 2.338258116580793, + "grad_norm": 0.14122156918339734, + "learning_rate": 1.4103664412355105e-05, + "loss": 2.7279, + "step": 37667 + }, + { + "epoch": 2.338320193680551, + "grad_norm": 0.1383890774151386, + "learning_rate": 1.4101150456500722e-05, + "loss": 2.7344, + "step": 37668 + }, + { + "epoch": 2.338382270780309, + "grad_norm": 0.1339597240047539, + "learning_rate": 1.4098636687939354e-05, + "loss": 2.7673, + "step": 37669 + }, + { + "epoch": 2.338444347880067, + "grad_norm": 0.13247727414013305, + "learning_rate": 1.4096123106684111e-05, + "loss": 2.6791, + "step": 37670 + }, + { + "epoch": 2.338506424979825, + "grad_norm": 0.14237773870525125, + "learning_rate": 1.4093609712748096e-05, + "loss": 2.7462, + "step": 37671 + }, + { + "epoch": 2.338568502079583, + "grad_norm": 0.14192441768651093, + "learning_rate": 1.4091096506144447e-05, + "loss": 2.7188, + "step": 37672 + }, + { + "epoch": 2.3386305791793407, + "grad_norm": 0.1417611373969754, + "learning_rate": 1.408858348688627e-05, + "loss": 2.7618, + "step": 37673 + }, + { + "epoch": 2.3386926562790986, + "grad_norm": 0.13512580534203614, + "learning_rate": 1.4086070654986667e-05, + "loss": 2.7286, + "step": 37674 + }, + { + "epoch": 2.3387547333788565, + "grad_norm": 0.13857566827361373, + "learning_rate": 1.4083558010458736e-05, + "loss": 2.6826, + "step": 37675 + }, + { + "epoch": 2.3388168104786144, + "grad_norm": 0.13697687713416334, + "learning_rate": 1.4081045553315619e-05, + "loss": 2.6962, + "step": 37676 + }, + { + "epoch": 2.3388788875783724, + "grad_norm": 0.14059180954151348, + "learning_rate": 1.4078533283570406e-05, + "loss": 2.7057, + "step": 37677 + }, + { + "epoch": 2.3389409646781303, + "grad_norm": 0.157680395515522, + "learning_rate": 1.4076021201236205e-05, + "loss": 2.634, + "step": 37678 + }, + { + "epoch": 2.339003041777888, + "grad_norm": 0.14035012613650294, + "learning_rate": 1.407350930632611e-05, + "loss": 2.739, + "step": 37679 + }, + { + "epoch": 2.339065118877646, + "grad_norm": 0.14023714089388864, + "learning_rate": 1.4070997598853252e-05, + "loss": 2.6496, + "step": 37680 + }, + { + "epoch": 2.339127195977404, + "grad_norm": 0.1701064094343825, + "learning_rate": 1.4068486078830723e-05, + "loss": 2.7463, + "step": 37681 + }, + { + "epoch": 2.339189273077162, + "grad_norm": 0.15020499601263307, + "learning_rate": 1.406597474627162e-05, + "loss": 2.7111, + "step": 37682 + }, + { + "epoch": 2.33925135017692, + "grad_norm": 0.1371416295794823, + "learning_rate": 1.4063463601189053e-05, + "loss": 2.6889, + "step": 37683 + }, + { + "epoch": 2.339313427276678, + "grad_norm": 0.14545780961252824, + "learning_rate": 1.4060952643596103e-05, + "loss": 2.762, + "step": 37684 + }, + { + "epoch": 2.3393755043764357, + "grad_norm": 0.13629730526594813, + "learning_rate": 1.4058441873505906e-05, + "loss": 2.6399, + "step": 37685 + }, + { + "epoch": 2.3394375814761936, + "grad_norm": 0.13900182986427193, + "learning_rate": 1.4055931290931545e-05, + "loss": 2.6184, + "step": 37686 + }, + { + "epoch": 2.339499658575951, + "grad_norm": 0.15355730267541617, + "learning_rate": 1.4053420895886111e-05, + "loss": 2.6298, + "step": 37687 + }, + { + "epoch": 2.3395617356757095, + "grad_norm": 0.14227882088385424, + "learning_rate": 1.4050910688382696e-05, + "loss": 2.7626, + "step": 37688 + }, + { + "epoch": 2.339623812775467, + "grad_norm": 0.14851027511681264, + "learning_rate": 1.404840066843442e-05, + "loss": 2.727, + "step": 37689 + }, + { + "epoch": 2.339685889875225, + "grad_norm": 0.15385319502099046, + "learning_rate": 1.404589083605436e-05, + "loss": 2.7408, + "step": 37690 + }, + { + "epoch": 2.3397479669749828, + "grad_norm": 0.16008255007144467, + "learning_rate": 1.4043381191255623e-05, + "loss": 2.7993, + "step": 37691 + }, + { + "epoch": 2.3398100440747407, + "grad_norm": 0.1414615465027725, + "learning_rate": 1.4040871734051292e-05, + "loss": 2.6849, + "step": 37692 + }, + { + "epoch": 2.3398721211744986, + "grad_norm": 0.1425180243189887, + "learning_rate": 1.4038362464454447e-05, + "loss": 2.8022, + "step": 37693 + }, + { + "epoch": 2.3399341982742565, + "grad_norm": 0.1570599425373667, + "learning_rate": 1.4035853382478209e-05, + "loss": 2.7874, + "step": 37694 + }, + { + "epoch": 2.3399962753740144, + "grad_norm": 0.14236743592799148, + "learning_rate": 1.4033344488135659e-05, + "loss": 2.8014, + "step": 37695 + }, + { + "epoch": 2.3400583524737724, + "grad_norm": 0.15754816678331093, + "learning_rate": 1.4030835781439877e-05, + "loss": 2.7348, + "step": 37696 + }, + { + "epoch": 2.3401204295735303, + "grad_norm": 0.15444696345602976, + "learning_rate": 1.4028327262403946e-05, + "loss": 2.6951, + "step": 37697 + }, + { + "epoch": 2.340182506673288, + "grad_norm": 0.13711174612358826, + "learning_rate": 1.4025818931040973e-05, + "loss": 2.6586, + "step": 37698 + }, + { + "epoch": 2.340244583773046, + "grad_norm": 0.14192868298903577, + "learning_rate": 1.402331078736404e-05, + "loss": 2.7165, + "step": 37699 + }, + { + "epoch": 2.340306660872804, + "grad_norm": 0.14415459267982358, + "learning_rate": 1.4020802831386214e-05, + "loss": 2.6068, + "step": 37700 + }, + { + "epoch": 2.340368737972562, + "grad_norm": 0.14301717125810998, + "learning_rate": 1.4018295063120613e-05, + "loss": 2.7957, + "step": 37701 + }, + { + "epoch": 2.34043081507232, + "grad_norm": 0.13881230295812233, + "learning_rate": 1.4015787482580278e-05, + "loss": 2.7584, + "step": 37702 + }, + { + "epoch": 2.340492892172078, + "grad_norm": 0.15229311737429804, + "learning_rate": 1.4013280089778335e-05, + "loss": 2.6789, + "step": 37703 + }, + { + "epoch": 2.3405549692718357, + "grad_norm": 0.14279937560712322, + "learning_rate": 1.4010772884727841e-05, + "loss": 2.7129, + "step": 37704 + }, + { + "epoch": 2.3406170463715936, + "grad_norm": 0.14349855895681254, + "learning_rate": 1.4008265867441888e-05, + "loss": 2.6955, + "step": 37705 + }, + { + "epoch": 2.3406791234713515, + "grad_norm": 0.13851369647745265, + "learning_rate": 1.4005759037933547e-05, + "loss": 2.7199, + "step": 37706 + }, + { + "epoch": 2.3407412005711095, + "grad_norm": 0.14199297509899803, + "learning_rate": 1.4003252396215882e-05, + "loss": 2.5935, + "step": 37707 + }, + { + "epoch": 2.3408032776708674, + "grad_norm": 0.14968904112179804, + "learning_rate": 1.4000745942302002e-05, + "loss": 2.8171, + "step": 37708 + }, + { + "epoch": 2.3408653547706253, + "grad_norm": 0.13997630997094368, + "learning_rate": 1.3998239676204971e-05, + "loss": 2.7414, + "step": 37709 + }, + { + "epoch": 2.3409274318703828, + "grad_norm": 0.1423675405948742, + "learning_rate": 1.3995733597937865e-05, + "loss": 2.8262, + "step": 37710 + }, + { + "epoch": 2.340989508970141, + "grad_norm": 0.13594929243875553, + "learning_rate": 1.399322770751374e-05, + "loss": 2.6373, + "step": 37711 + }, + { + "epoch": 2.3410515860698986, + "grad_norm": 0.1404195781883694, + "learning_rate": 1.3990722004945705e-05, + "loss": 2.788, + "step": 37712 + }, + { + "epoch": 2.3411136631696565, + "grad_norm": 0.1561941762384968, + "learning_rate": 1.3988216490246813e-05, + "loss": 2.8066, + "step": 37713 + }, + { + "epoch": 2.3411757402694144, + "grad_norm": 0.13798626130846808, + "learning_rate": 1.3985711163430137e-05, + "loss": 2.6433, + "step": 37714 + }, + { + "epoch": 2.3412378173691724, + "grad_norm": 0.14640961328671784, + "learning_rate": 1.3983206024508732e-05, + "loss": 2.6916, + "step": 37715 + }, + { + "epoch": 2.3412998944689303, + "grad_norm": 0.13716324348463746, + "learning_rate": 1.3980701073495706e-05, + "loss": 2.6744, + "step": 37716 + }, + { + "epoch": 2.341361971568688, + "grad_norm": 0.14636877584906335, + "learning_rate": 1.39781963104041e-05, + "loss": 2.6904, + "step": 37717 + }, + { + "epoch": 2.341424048668446, + "grad_norm": 0.142603813104305, + "learning_rate": 1.3975691735246993e-05, + "loss": 2.7387, + "step": 37718 + }, + { + "epoch": 2.341486125768204, + "grad_norm": 0.13939172985398893, + "learning_rate": 1.3973187348037442e-05, + "loss": 2.6334, + "step": 37719 + }, + { + "epoch": 2.341548202867962, + "grad_norm": 0.1418637905965147, + "learning_rate": 1.397068314878851e-05, + "loss": 2.6562, + "step": 37720 + }, + { + "epoch": 2.34161027996772, + "grad_norm": 0.14744250706574902, + "learning_rate": 1.3968179137513282e-05, + "loss": 2.7299, + "step": 37721 + }, + { + "epoch": 2.341672357067478, + "grad_norm": 0.14842559741635508, + "learning_rate": 1.3965675314224808e-05, + "loss": 2.6456, + "step": 37722 + }, + { + "epoch": 2.3417344341672357, + "grad_norm": 0.1352832569542697, + "learning_rate": 1.3963171678936155e-05, + "loss": 2.7185, + "step": 37723 + }, + { + "epoch": 2.3417965112669936, + "grad_norm": 0.13577939628152197, + "learning_rate": 1.3960668231660373e-05, + "loss": 2.6344, + "step": 37724 + }, + { + "epoch": 2.3418585883667515, + "grad_norm": 0.14516987697625208, + "learning_rate": 1.3958164972410542e-05, + "loss": 2.6798, + "step": 37725 + }, + { + "epoch": 2.3419206654665095, + "grad_norm": 0.14023170176431277, + "learning_rate": 1.3955661901199717e-05, + "loss": 2.5904, + "step": 37726 + }, + { + "epoch": 2.3419827425662674, + "grad_norm": 0.1442001648827548, + "learning_rate": 1.3953159018040956e-05, + "loss": 2.6877, + "step": 37727 + }, + { + "epoch": 2.3420448196660253, + "grad_norm": 0.13734609541121806, + "learning_rate": 1.3950656322947298e-05, + "loss": 2.6709, + "step": 37728 + }, + { + "epoch": 2.342106896765783, + "grad_norm": 0.13856976739187235, + "learning_rate": 1.394815381593183e-05, + "loss": 2.7097, + "step": 37729 + }, + { + "epoch": 2.342168973865541, + "grad_norm": 0.13885090245557, + "learning_rate": 1.39456514970076e-05, + "loss": 2.5547, + "step": 37730 + }, + { + "epoch": 2.342231050965299, + "grad_norm": 0.140904618792082, + "learning_rate": 1.3943149366187659e-05, + "loss": 2.6374, + "step": 37731 + }, + { + "epoch": 2.342293128065057, + "grad_norm": 0.14002118681563305, + "learning_rate": 1.3940647423485042e-05, + "loss": 2.7568, + "step": 37732 + }, + { + "epoch": 2.342355205164815, + "grad_norm": 0.13942724997627218, + "learning_rate": 1.3938145668912833e-05, + "loss": 2.6689, + "step": 37733 + }, + { + "epoch": 2.342417282264573, + "grad_norm": 0.14032168924320942, + "learning_rate": 1.3935644102484064e-05, + "loss": 2.78, + "step": 37734 + }, + { + "epoch": 2.3424793593643303, + "grad_norm": 0.13905321663505577, + "learning_rate": 1.3933142724211807e-05, + "loss": 2.6772, + "step": 37735 + }, + { + "epoch": 2.3425414364640886, + "grad_norm": 0.142718853150897, + "learning_rate": 1.39306415341091e-05, + "loss": 2.7096, + "step": 37736 + }, + { + "epoch": 2.342603513563846, + "grad_norm": 0.1379457928036198, + "learning_rate": 1.3928140532188994e-05, + "loss": 2.6867, + "step": 37737 + }, + { + "epoch": 2.342665590663604, + "grad_norm": 0.13741731650212513, + "learning_rate": 1.3925639718464518e-05, + "loss": 2.6743, + "step": 37738 + }, + { + "epoch": 2.342727667763362, + "grad_norm": 0.13744777045098214, + "learning_rate": 1.3923139092948756e-05, + "loss": 2.726, + "step": 37739 + }, + { + "epoch": 2.34278974486312, + "grad_norm": 0.14113872787026613, + "learning_rate": 1.3920638655654732e-05, + "loss": 2.6861, + "step": 37740 + }, + { + "epoch": 2.342851821962878, + "grad_norm": 0.13857138713500192, + "learning_rate": 1.3918138406595493e-05, + "loss": 2.7357, + "step": 37741 + }, + { + "epoch": 2.3429138990626357, + "grad_norm": 0.13781768690672067, + "learning_rate": 1.3915638345784087e-05, + "loss": 2.7132, + "step": 37742 + }, + { + "epoch": 2.3429759761623936, + "grad_norm": 0.13785605145758462, + "learning_rate": 1.391313847323354e-05, + "loss": 2.7481, + "step": 37743 + }, + { + "epoch": 2.3430380532621515, + "grad_norm": 0.13738884876516894, + "learning_rate": 1.3910638788956925e-05, + "loss": 2.7135, + "step": 37744 + }, + { + "epoch": 2.3431001303619095, + "grad_norm": 0.1396567535591906, + "learning_rate": 1.390813929296727e-05, + "loss": 2.7537, + "step": 37745 + }, + { + "epoch": 2.3431622074616674, + "grad_norm": 0.13489604941342206, + "learning_rate": 1.3905639985277613e-05, + "loss": 2.6475, + "step": 37746 + }, + { + "epoch": 2.3432242845614253, + "grad_norm": 0.14174595409797158, + "learning_rate": 1.3903140865900982e-05, + "loss": 2.6982, + "step": 37747 + }, + { + "epoch": 2.343286361661183, + "grad_norm": 0.16632940707556984, + "learning_rate": 1.3900641934850439e-05, + "loss": 2.7598, + "step": 37748 + }, + { + "epoch": 2.343348438760941, + "grad_norm": 0.13913816311696817, + "learning_rate": 1.3898143192139013e-05, + "loss": 2.7441, + "step": 37749 + }, + { + "epoch": 2.343410515860699, + "grad_norm": 0.13627224626825826, + "learning_rate": 1.3895644637779737e-05, + "loss": 2.6351, + "step": 37750 + }, + { + "epoch": 2.343472592960457, + "grad_norm": 0.13385795991126673, + "learning_rate": 1.3893146271785634e-05, + "loss": 2.6491, + "step": 37751 + }, + { + "epoch": 2.343534670060215, + "grad_norm": 0.15195555010824144, + "learning_rate": 1.3890648094169762e-05, + "loss": 2.7305, + "step": 37752 + }, + { + "epoch": 2.343596747159973, + "grad_norm": 0.13708162515985733, + "learning_rate": 1.3888150104945153e-05, + "loss": 2.6857, + "step": 37753 + }, + { + "epoch": 2.3436588242597307, + "grad_norm": 0.14326508172891503, + "learning_rate": 1.3885652304124824e-05, + "loss": 2.5796, + "step": 37754 + }, + { + "epoch": 2.3437209013594886, + "grad_norm": 0.13932938575580622, + "learning_rate": 1.3883154691721816e-05, + "loss": 2.7956, + "step": 37755 + }, + { + "epoch": 2.3437829784592465, + "grad_norm": 0.1693432481852993, + "learning_rate": 1.3880657267749142e-05, + "loss": 2.7133, + "step": 37756 + }, + { + "epoch": 2.3438450555590045, + "grad_norm": 0.15310484330187882, + "learning_rate": 1.3878160032219867e-05, + "loss": 2.7302, + "step": 37757 + }, + { + "epoch": 2.343907132658762, + "grad_norm": 0.13744059466086736, + "learning_rate": 1.3875662985146991e-05, + "loss": 2.7098, + "step": 37758 + }, + { + "epoch": 2.3439692097585203, + "grad_norm": 0.13385192062729903, + "learning_rate": 1.3873166126543558e-05, + "loss": 2.6002, + "step": 37759 + }, + { + "epoch": 2.3440312868582778, + "grad_norm": 0.13902322155516858, + "learning_rate": 1.3870669456422575e-05, + "loss": 2.7797, + "step": 37760 + }, + { + "epoch": 2.3440933639580357, + "grad_norm": 0.1399762168228932, + "learning_rate": 1.3868172974797089e-05, + "loss": 2.6997, + "step": 37761 + }, + { + "epoch": 2.3441554410577936, + "grad_norm": 0.14255261519596152, + "learning_rate": 1.3865676681680118e-05, + "loss": 2.6672, + "step": 37762 + }, + { + "epoch": 2.3442175181575515, + "grad_norm": 0.135641589040135, + "learning_rate": 1.3863180577084684e-05, + "loss": 2.6466, + "step": 37763 + }, + { + "epoch": 2.3442795952573094, + "grad_norm": 0.13911623345266058, + "learning_rate": 1.3860684661023793e-05, + "loss": 2.7575, + "step": 37764 + }, + { + "epoch": 2.3443416723570674, + "grad_norm": 0.14845857345167462, + "learning_rate": 1.3858188933510486e-05, + "loss": 2.7739, + "step": 37765 + }, + { + "epoch": 2.3444037494568253, + "grad_norm": 0.14394758913680272, + "learning_rate": 1.3855693394557789e-05, + "loss": 2.7542, + "step": 37766 + }, + { + "epoch": 2.344465826556583, + "grad_norm": 0.15455198926970576, + "learning_rate": 1.3853198044178722e-05, + "loss": 2.6497, + "step": 37767 + }, + { + "epoch": 2.344527903656341, + "grad_norm": 0.1706704444504004, + "learning_rate": 1.3850702882386291e-05, + "loss": 2.7589, + "step": 37768 + }, + { + "epoch": 2.344589980756099, + "grad_norm": 0.13733727324862782, + "learning_rate": 1.3848207909193522e-05, + "loss": 2.7116, + "step": 37769 + }, + { + "epoch": 2.344652057855857, + "grad_norm": 0.1432123383837515, + "learning_rate": 1.3845713124613413e-05, + "loss": 2.7505, + "step": 37770 + }, + { + "epoch": 2.344714134955615, + "grad_norm": 0.14385559049790678, + "learning_rate": 1.3843218528659008e-05, + "loss": 2.6381, + "step": 37771 + }, + { + "epoch": 2.344776212055373, + "grad_norm": 0.1446462444393622, + "learning_rate": 1.3840724121343313e-05, + "loss": 2.7004, + "step": 37772 + }, + { + "epoch": 2.3448382891551307, + "grad_norm": 0.13943205922761573, + "learning_rate": 1.3838229902679334e-05, + "loss": 2.7076, + "step": 37773 + }, + { + "epoch": 2.3449003662548886, + "grad_norm": 0.13606585877057342, + "learning_rate": 1.3835735872680078e-05, + "loss": 2.6204, + "step": 37774 + }, + { + "epoch": 2.3449624433546465, + "grad_norm": 0.1613770916313466, + "learning_rate": 1.383324203135858e-05, + "loss": 2.7091, + "step": 37775 + }, + { + "epoch": 2.3450245204544045, + "grad_norm": 0.14051871786888762, + "learning_rate": 1.3830748378727832e-05, + "loss": 2.8313, + "step": 37776 + }, + { + "epoch": 2.3450865975541624, + "grad_norm": 0.1606188234670536, + "learning_rate": 1.3828254914800859e-05, + "loss": 2.6701, + "step": 37777 + }, + { + "epoch": 2.3451486746539203, + "grad_norm": 0.14821185981616733, + "learning_rate": 1.382576163959064e-05, + "loss": 2.6864, + "step": 37778 + }, + { + "epoch": 2.345210751753678, + "grad_norm": 0.14550043253776979, + "learning_rate": 1.3823268553110225e-05, + "loss": 2.7359, + "step": 37779 + }, + { + "epoch": 2.345272828853436, + "grad_norm": 0.16332980442451844, + "learning_rate": 1.3820775655372591e-05, + "loss": 2.7111, + "step": 37780 + }, + { + "epoch": 2.345334905953194, + "grad_norm": 0.17326299011328203, + "learning_rate": 1.3818282946390764e-05, + "loss": 2.6939, + "step": 37781 + }, + { + "epoch": 2.345396983052952, + "grad_norm": 0.1454415778648637, + "learning_rate": 1.3815790426177733e-05, + "loss": 2.6853, + "step": 37782 + }, + { + "epoch": 2.3454590601527094, + "grad_norm": 0.15784073756426037, + "learning_rate": 1.3813298094746491e-05, + "loss": 2.7492, + "step": 37783 + }, + { + "epoch": 2.345521137252468, + "grad_norm": 0.1407905438686399, + "learning_rate": 1.3810805952110073e-05, + "loss": 2.8182, + "step": 37784 + }, + { + "epoch": 2.3455832143522253, + "grad_norm": 0.13781967363911715, + "learning_rate": 1.380831399828147e-05, + "loss": 2.6436, + "step": 37785 + }, + { + "epoch": 2.345645291451983, + "grad_norm": 0.16250117764596436, + "learning_rate": 1.3805822233273674e-05, + "loss": 2.7214, + "step": 37786 + }, + { + "epoch": 2.345707368551741, + "grad_norm": 0.1538490000132549, + "learning_rate": 1.3803330657099677e-05, + "loss": 2.6666, + "step": 37787 + }, + { + "epoch": 2.345769445651499, + "grad_norm": 0.14567768498681646, + "learning_rate": 1.3800839269772503e-05, + "loss": 2.7632, + "step": 37788 + }, + { + "epoch": 2.345831522751257, + "grad_norm": 0.16305767888098835, + "learning_rate": 1.3798348071305139e-05, + "loss": 2.7501, + "step": 37789 + }, + { + "epoch": 2.345893599851015, + "grad_norm": 0.14739764667941196, + "learning_rate": 1.3795857061710587e-05, + "loss": 2.7158, + "step": 37790 + }, + { + "epoch": 2.345955676950773, + "grad_norm": 0.14208607488398928, + "learning_rate": 1.3793366241001832e-05, + "loss": 2.7485, + "step": 37791 + }, + { + "epoch": 2.3460177540505307, + "grad_norm": 0.13539529289391708, + "learning_rate": 1.3790875609191861e-05, + "loss": 2.6937, + "step": 37792 + }, + { + "epoch": 2.3460798311502886, + "grad_norm": 0.17579581942049385, + "learning_rate": 1.37883851662937e-05, + "loss": 2.8248, + "step": 37793 + }, + { + "epoch": 2.3461419082500465, + "grad_norm": 0.13989991440102342, + "learning_rate": 1.3785894912320319e-05, + "loss": 2.6618, + "step": 37794 + }, + { + "epoch": 2.3462039853498045, + "grad_norm": 0.15434246355498837, + "learning_rate": 1.378340484728472e-05, + "loss": 2.7744, + "step": 37795 + }, + { + "epoch": 2.3462660624495624, + "grad_norm": 0.13939265187490346, + "learning_rate": 1.3780914971199872e-05, + "loss": 2.7978, + "step": 37796 + }, + { + "epoch": 2.3463281395493203, + "grad_norm": 0.14387862965564957, + "learning_rate": 1.37784252840788e-05, + "loss": 2.699, + "step": 37797 + }, + { + "epoch": 2.346390216649078, + "grad_norm": 0.13943320140043353, + "learning_rate": 1.3775935785934457e-05, + "loss": 2.7827, + "step": 37798 + }, + { + "epoch": 2.346452293748836, + "grad_norm": 0.142192004205814, + "learning_rate": 1.3773446476779872e-05, + "loss": 2.7431, + "step": 37799 + }, + { + "epoch": 2.346514370848594, + "grad_norm": 0.1354496362483075, + "learning_rate": 1.3770957356628006e-05, + "loss": 2.6846, + "step": 37800 + }, + { + "epoch": 2.346576447948352, + "grad_norm": 0.14182632629096142, + "learning_rate": 1.3768468425491837e-05, + "loss": 2.6822, + "step": 37801 + }, + { + "epoch": 2.34663852504811, + "grad_norm": 0.14786023904394166, + "learning_rate": 1.3765979683384379e-05, + "loss": 2.6606, + "step": 37802 + }, + { + "epoch": 2.346700602147868, + "grad_norm": 0.13866669685474436, + "learning_rate": 1.37634911303186e-05, + "loss": 2.786, + "step": 37803 + }, + { + "epoch": 2.3467626792476257, + "grad_norm": 0.13292455854450205, + "learning_rate": 1.3761002766307485e-05, + "loss": 2.767, + "step": 37804 + }, + { + "epoch": 2.3468247563473836, + "grad_norm": 0.1411383385504209, + "learning_rate": 1.375851459136402e-05, + "loss": 2.6148, + "step": 37805 + }, + { + "epoch": 2.346886833447141, + "grad_norm": 0.13630862711424882, + "learning_rate": 1.375602660550116e-05, + "loss": 2.7835, + "step": 37806 + }, + { + "epoch": 2.3469489105468995, + "grad_norm": 0.13586155013876267, + "learning_rate": 1.375353880873193e-05, + "loss": 2.5936, + "step": 37807 + }, + { + "epoch": 2.347010987646657, + "grad_norm": 0.1530096405344709, + "learning_rate": 1.3751051201069281e-05, + "loss": 2.6805, + "step": 37808 + }, + { + "epoch": 2.347073064746415, + "grad_norm": 0.13953586522678832, + "learning_rate": 1.3748563782526202e-05, + "loss": 2.7968, + "step": 37809 + }, + { + "epoch": 2.347135141846173, + "grad_norm": 0.1390877853401084, + "learning_rate": 1.374607655311565e-05, + "loss": 2.7374, + "step": 37810 + }, + { + "epoch": 2.3471972189459307, + "grad_norm": 0.1395766133557826, + "learning_rate": 1.374358951285063e-05, + "loss": 2.6193, + "step": 37811 + }, + { + "epoch": 2.3472592960456886, + "grad_norm": 0.15586494445410526, + "learning_rate": 1.3741102661744109e-05, + "loss": 2.6831, + "step": 37812 + }, + { + "epoch": 2.3473213731454465, + "grad_norm": 0.14958998975336998, + "learning_rate": 1.3738615999809056e-05, + "loss": 2.7885, + "step": 37813 + }, + { + "epoch": 2.3473834502452045, + "grad_norm": 0.14052746622320145, + "learning_rate": 1.3736129527058433e-05, + "loss": 2.7337, + "step": 37814 + }, + { + "epoch": 2.3474455273449624, + "grad_norm": 0.14663495271795468, + "learning_rate": 1.3733643243505234e-05, + "loss": 2.7428, + "step": 37815 + }, + { + "epoch": 2.3475076044447203, + "grad_norm": 0.1519982258762171, + "learning_rate": 1.3731157149162426e-05, + "loss": 2.6925, + "step": 37816 + }, + { + "epoch": 2.347569681544478, + "grad_norm": 0.15440076034884004, + "learning_rate": 1.3728671244042979e-05, + "loss": 2.6729, + "step": 37817 + }, + { + "epoch": 2.347631758644236, + "grad_norm": 0.14886369505413866, + "learning_rate": 1.3726185528159857e-05, + "loss": 2.6805, + "step": 37818 + }, + { + "epoch": 2.347693835743994, + "grad_norm": 0.146993755967822, + "learning_rate": 1.3723700001526013e-05, + "loss": 2.8702, + "step": 37819 + }, + { + "epoch": 2.347755912843752, + "grad_norm": 0.1382532180240013, + "learning_rate": 1.3721214664154453e-05, + "loss": 2.6614, + "step": 37820 + }, + { + "epoch": 2.34781798994351, + "grad_norm": 0.1480768553339363, + "learning_rate": 1.371872951605812e-05, + "loss": 2.7423, + "step": 37821 + }, + { + "epoch": 2.347880067043268, + "grad_norm": 0.13777412702075847, + "learning_rate": 1.371624455724998e-05, + "loss": 2.7053, + "step": 37822 + }, + { + "epoch": 2.3479421441430257, + "grad_norm": 0.1416186881397014, + "learning_rate": 1.3713759787742991e-05, + "loss": 2.7405, + "step": 37823 + }, + { + "epoch": 2.3480042212427836, + "grad_norm": 0.15113546415967083, + "learning_rate": 1.371127520755014e-05, + "loss": 2.7503, + "step": 37824 + }, + { + "epoch": 2.3480662983425415, + "grad_norm": 0.14311390631797152, + "learning_rate": 1.370879081668437e-05, + "loss": 2.7852, + "step": 37825 + }, + { + "epoch": 2.3481283754422995, + "grad_norm": 0.1407051917416251, + "learning_rate": 1.3706306615158655e-05, + "loss": 2.7434, + "step": 37826 + }, + { + "epoch": 2.3481904525420574, + "grad_norm": 0.15643725499466576, + "learning_rate": 1.3703822602985944e-05, + "loss": 2.6264, + "step": 37827 + }, + { + "epoch": 2.3482525296418153, + "grad_norm": 0.13867194887615977, + "learning_rate": 1.370133878017919e-05, + "loss": 2.6854, + "step": 37828 + }, + { + "epoch": 2.348314606741573, + "grad_norm": 0.1369558322332087, + "learning_rate": 1.3698855146751377e-05, + "loss": 2.7314, + "step": 37829 + }, + { + "epoch": 2.348376683841331, + "grad_norm": 0.14664924244667127, + "learning_rate": 1.3696371702715449e-05, + "loss": 2.6813, + "step": 37830 + }, + { + "epoch": 2.3484387609410886, + "grad_norm": 0.1620129562799742, + "learning_rate": 1.3693888448084346e-05, + "loss": 2.7834, + "step": 37831 + }, + { + "epoch": 2.348500838040847, + "grad_norm": 0.15293284372748583, + "learning_rate": 1.3691405382871058e-05, + "loss": 2.7052, + "step": 37832 + }, + { + "epoch": 2.3485629151406044, + "grad_norm": 0.14376820366081552, + "learning_rate": 1.3688922507088509e-05, + "loss": 2.7022, + "step": 37833 + }, + { + "epoch": 2.3486249922403624, + "grad_norm": 0.14776892424993748, + "learning_rate": 1.3686439820749674e-05, + "loss": 2.6982, + "step": 37834 + }, + { + "epoch": 2.3486870693401203, + "grad_norm": 0.14501825325656434, + "learning_rate": 1.3683957323867508e-05, + "loss": 2.7025, + "step": 37835 + }, + { + "epoch": 2.348749146439878, + "grad_norm": 0.13799044586449224, + "learning_rate": 1.3681475016454948e-05, + "loss": 2.7311, + "step": 37836 + }, + { + "epoch": 2.348811223539636, + "grad_norm": 0.14463560100466288, + "learning_rate": 1.3678992898524934e-05, + "loss": 2.6789, + "step": 37837 + }, + { + "epoch": 2.348873300639394, + "grad_norm": 0.15462442831744794, + "learning_rate": 1.3676510970090445e-05, + "loss": 2.7047, + "step": 37838 + }, + { + "epoch": 2.348935377739152, + "grad_norm": 0.14206380703883262, + "learning_rate": 1.3674029231164415e-05, + "loss": 2.8446, + "step": 37839 + }, + { + "epoch": 2.34899745483891, + "grad_norm": 0.13405945178659096, + "learning_rate": 1.3671547681759794e-05, + "loss": 2.6856, + "step": 37840 + }, + { + "epoch": 2.349059531938668, + "grad_norm": 0.15048813445463455, + "learning_rate": 1.3669066321889535e-05, + "loss": 2.7559, + "step": 37841 + }, + { + "epoch": 2.3491216090384257, + "grad_norm": 0.14525370202993318, + "learning_rate": 1.3666585151566552e-05, + "loss": 2.6659, + "step": 37842 + }, + { + "epoch": 2.3491836861381836, + "grad_norm": 0.15144343762913606, + "learning_rate": 1.3664104170803832e-05, + "loss": 2.6727, + "step": 37843 + }, + { + "epoch": 2.3492457632379415, + "grad_norm": 0.14777976425627576, + "learning_rate": 1.3661623379614303e-05, + "loss": 2.8418, + "step": 37844 + }, + { + "epoch": 2.3493078403376995, + "grad_norm": 0.1403271779792006, + "learning_rate": 1.3659142778010908e-05, + "loss": 2.6646, + "step": 37845 + }, + { + "epoch": 2.3493699174374574, + "grad_norm": 0.1591536260696029, + "learning_rate": 1.3656662366006567e-05, + "loss": 2.7595, + "step": 37846 + }, + { + "epoch": 2.3494319945372153, + "grad_norm": 0.1645690783280772, + "learning_rate": 1.3654182143614263e-05, + "loss": 2.7001, + "step": 37847 + }, + { + "epoch": 2.349494071636973, + "grad_norm": 0.13868229786561526, + "learning_rate": 1.365170211084691e-05, + "loss": 2.6517, + "step": 37848 + }, + { + "epoch": 2.349556148736731, + "grad_norm": 0.14143423520094794, + "learning_rate": 1.3649222267717448e-05, + "loss": 2.7754, + "step": 37849 + }, + { + "epoch": 2.349618225836489, + "grad_norm": 0.15162044381028528, + "learning_rate": 1.3646742614238806e-05, + "loss": 2.6889, + "step": 37850 + }, + { + "epoch": 2.349680302936247, + "grad_norm": 0.1448427336388641, + "learning_rate": 1.3644263150423952e-05, + "loss": 2.682, + "step": 37851 + }, + { + "epoch": 2.349742380036005, + "grad_norm": 0.15423686031340603, + "learning_rate": 1.3641783876285796e-05, + "loss": 2.666, + "step": 37852 + }, + { + "epoch": 2.349804457135763, + "grad_norm": 0.14760106735861206, + "learning_rate": 1.3639304791837281e-05, + "loss": 2.7145, + "step": 37853 + }, + { + "epoch": 2.3498665342355203, + "grad_norm": 0.15469479856708665, + "learning_rate": 1.3636825897091344e-05, + "loss": 2.6617, + "step": 37854 + }, + { + "epoch": 2.3499286113352786, + "grad_norm": 0.14028080567416812, + "learning_rate": 1.3634347192060898e-05, + "loss": 2.7681, + "step": 37855 + }, + { + "epoch": 2.349990688435036, + "grad_norm": 0.14185769823211317, + "learning_rate": 1.3631868676758903e-05, + "loss": 2.7431, + "step": 37856 + }, + { + "epoch": 2.350052765534794, + "grad_norm": 0.14326050516666358, + "learning_rate": 1.3629390351198279e-05, + "loss": 2.7371, + "step": 37857 + }, + { + "epoch": 2.350114842634552, + "grad_norm": 0.1388204074618635, + "learning_rate": 1.3626912215391952e-05, + "loss": 2.6459, + "step": 37858 + }, + { + "epoch": 2.35017691973431, + "grad_norm": 0.15064621546429588, + "learning_rate": 1.3624434269352843e-05, + "loss": 2.76, + "step": 37859 + }, + { + "epoch": 2.350238996834068, + "grad_norm": 0.13667175984075222, + "learning_rate": 1.3621956513093903e-05, + "loss": 2.801, + "step": 37860 + }, + { + "epoch": 2.3503010739338257, + "grad_norm": 0.15868495047715894, + "learning_rate": 1.361947894662805e-05, + "loss": 2.7322, + "step": 37861 + }, + { + "epoch": 2.3503631510335836, + "grad_norm": 0.19265060376311044, + "learning_rate": 1.3617001569968202e-05, + "loss": 2.6783, + "step": 37862 + }, + { + "epoch": 2.3504252281333415, + "grad_norm": 0.14977397454656832, + "learning_rate": 1.361452438312728e-05, + "loss": 2.7702, + "step": 37863 + }, + { + "epoch": 2.3504873052330995, + "grad_norm": 0.14949576942492596, + "learning_rate": 1.361204738611821e-05, + "loss": 2.6164, + "step": 37864 + }, + { + "epoch": 2.3505493823328574, + "grad_norm": 0.1401323989042848, + "learning_rate": 1.3609570578953945e-05, + "loss": 2.6514, + "step": 37865 + }, + { + "epoch": 2.3506114594326153, + "grad_norm": 0.15159598255970913, + "learning_rate": 1.3607093961647382e-05, + "loss": 2.7137, + "step": 37866 + }, + { + "epoch": 2.350673536532373, + "grad_norm": 0.16131698024947014, + "learning_rate": 1.3604617534211439e-05, + "loss": 2.695, + "step": 37867 + }, + { + "epoch": 2.350735613632131, + "grad_norm": 0.14682628148248944, + "learning_rate": 1.360214129665905e-05, + "loss": 2.6838, + "step": 37868 + }, + { + "epoch": 2.350797690731889, + "grad_norm": 0.14415498677479024, + "learning_rate": 1.3599665249003108e-05, + "loss": 2.723, + "step": 37869 + }, + { + "epoch": 2.350859767831647, + "grad_norm": 0.1460658898537642, + "learning_rate": 1.3597189391256559e-05, + "loss": 2.673, + "step": 37870 + }, + { + "epoch": 2.350921844931405, + "grad_norm": 0.1409043742368257, + "learning_rate": 1.3594713723432317e-05, + "loss": 2.7415, + "step": 37871 + }, + { + "epoch": 2.350983922031163, + "grad_norm": 0.1388903159988782, + "learning_rate": 1.3592238245543287e-05, + "loss": 2.6798, + "step": 37872 + }, + { + "epoch": 2.3510459991309207, + "grad_norm": 0.13976585680290435, + "learning_rate": 1.3589762957602376e-05, + "loss": 2.7372, + "step": 37873 + }, + { + "epoch": 2.3511080762306786, + "grad_norm": 0.14868678317392292, + "learning_rate": 1.3587287859622522e-05, + "loss": 2.7288, + "step": 37874 + }, + { + "epoch": 2.3511701533304366, + "grad_norm": 0.15592644092278737, + "learning_rate": 1.3584812951616627e-05, + "loss": 2.6723, + "step": 37875 + }, + { + "epoch": 2.3512322304301945, + "grad_norm": 0.15649111145230044, + "learning_rate": 1.3582338233597608e-05, + "loss": 2.7458, + "step": 37876 + }, + { + "epoch": 2.3512943075299524, + "grad_norm": 0.13822907956151032, + "learning_rate": 1.3579863705578366e-05, + "loss": 2.6118, + "step": 37877 + }, + { + "epoch": 2.3513563846297103, + "grad_norm": 0.1460919591722528, + "learning_rate": 1.3577389367571802e-05, + "loss": 2.7328, + "step": 37878 + }, + { + "epoch": 2.351418461729468, + "grad_norm": 0.14000802327905384, + "learning_rate": 1.3574915219590856e-05, + "loss": 2.7069, + "step": 37879 + }, + { + "epoch": 2.351480538829226, + "grad_norm": 0.13618546429583625, + "learning_rate": 1.3572441261648416e-05, + "loss": 2.6117, + "step": 37880 + }, + { + "epoch": 2.3515426159289836, + "grad_norm": 0.130499151822235, + "learning_rate": 1.3569967493757401e-05, + "loss": 2.6492, + "step": 37881 + }, + { + "epoch": 2.3516046930287415, + "grad_norm": 0.13733206512644328, + "learning_rate": 1.356749391593069e-05, + "loss": 2.6883, + "step": 37882 + }, + { + "epoch": 2.3516667701284995, + "grad_norm": 0.13813609058207202, + "learning_rate": 1.356502052818122e-05, + "loss": 2.7051, + "step": 37883 + }, + { + "epoch": 2.3517288472282574, + "grad_norm": 0.1373300781215278, + "learning_rate": 1.3562547330521885e-05, + "loss": 2.7651, + "step": 37884 + }, + { + "epoch": 2.3517909243280153, + "grad_norm": 0.13853217193387954, + "learning_rate": 1.3560074322965589e-05, + "loss": 2.7483, + "step": 37885 + }, + { + "epoch": 2.351853001427773, + "grad_norm": 0.13633561947178713, + "learning_rate": 1.3557601505525214e-05, + "loss": 2.7547, + "step": 37886 + }, + { + "epoch": 2.351915078527531, + "grad_norm": 0.1381662917341526, + "learning_rate": 1.355512887821369e-05, + "loss": 2.6773, + "step": 37887 + }, + { + "epoch": 2.351977155627289, + "grad_norm": 0.1361700970177453, + "learning_rate": 1.3552656441043909e-05, + "loss": 2.6284, + "step": 37888 + }, + { + "epoch": 2.352039232727047, + "grad_norm": 0.14331778616873697, + "learning_rate": 1.355018419402877e-05, + "loss": 2.7728, + "step": 37889 + }, + { + "epoch": 2.352101309826805, + "grad_norm": 0.15796700859619264, + "learning_rate": 1.3547712137181162e-05, + "loss": 2.7144, + "step": 37890 + }, + { + "epoch": 2.352163386926563, + "grad_norm": 0.13665580662991933, + "learning_rate": 1.3545240270513981e-05, + "loss": 2.6625, + "step": 37891 + }, + { + "epoch": 2.3522254640263207, + "grad_norm": 0.16127281464667712, + "learning_rate": 1.354276859404014e-05, + "loss": 2.6842, + "step": 37892 + }, + { + "epoch": 2.3522875411260786, + "grad_norm": 0.15532537107652036, + "learning_rate": 1.3540297107772532e-05, + "loss": 2.7452, + "step": 37893 + }, + { + "epoch": 2.3523496182258365, + "grad_norm": 0.1423510807821201, + "learning_rate": 1.3537825811724036e-05, + "loss": 2.6398, + "step": 37894 + }, + { + "epoch": 2.3524116953255945, + "grad_norm": 0.14002218450670878, + "learning_rate": 1.353535470590755e-05, + "loss": 2.7379, + "step": 37895 + }, + { + "epoch": 2.3524737724253524, + "grad_norm": 0.1573363092892491, + "learning_rate": 1.3532883790335966e-05, + "loss": 2.7654, + "step": 37896 + }, + { + "epoch": 2.3525358495251103, + "grad_norm": 0.1377598158264999, + "learning_rate": 1.353041306502219e-05, + "loss": 2.7044, + "step": 37897 + }, + { + "epoch": 2.352597926624868, + "grad_norm": 0.1408871485473836, + "learning_rate": 1.352794252997911e-05, + "loss": 2.7829, + "step": 37898 + }, + { + "epoch": 2.352660003724626, + "grad_norm": 0.14524172448804734, + "learning_rate": 1.3525472185219601e-05, + "loss": 2.6697, + "step": 37899 + }, + { + "epoch": 2.352722080824384, + "grad_norm": 0.15325951433790105, + "learning_rate": 1.3523002030756549e-05, + "loss": 2.7109, + "step": 37900 + }, + { + "epoch": 2.352784157924142, + "grad_norm": 0.15369682489582837, + "learning_rate": 1.352053206660286e-05, + "loss": 2.6713, + "step": 37901 + }, + { + "epoch": 2.3528462350238994, + "grad_norm": 0.15871794258673894, + "learning_rate": 1.3518062292771417e-05, + "loss": 2.7362, + "step": 37902 + }, + { + "epoch": 2.352908312123658, + "grad_norm": 0.13643365835879157, + "learning_rate": 1.3515592709275093e-05, + "loss": 2.7171, + "step": 37903 + }, + { + "epoch": 2.3529703892234153, + "grad_norm": 0.14504353488362573, + "learning_rate": 1.351312331612678e-05, + "loss": 2.7194, + "step": 37904 + }, + { + "epoch": 2.353032466323173, + "grad_norm": 0.15566243322804918, + "learning_rate": 1.351065411333935e-05, + "loss": 2.7655, + "step": 37905 + }, + { + "epoch": 2.353094543422931, + "grad_norm": 0.14375879737637293, + "learning_rate": 1.3508185100925708e-05, + "loss": 2.6034, + "step": 37906 + }, + { + "epoch": 2.353156620522689, + "grad_norm": 0.1326827439202814, + "learning_rate": 1.3505716278898722e-05, + "loss": 2.6377, + "step": 37907 + }, + { + "epoch": 2.353218697622447, + "grad_norm": 0.1369422924307852, + "learning_rate": 1.3503247647271272e-05, + "loss": 2.7803, + "step": 37908 + }, + { + "epoch": 2.353280774722205, + "grad_norm": 0.1489408266566018, + "learning_rate": 1.350077920605623e-05, + "loss": 2.7142, + "step": 37909 + }, + { + "epoch": 2.353342851821963, + "grad_norm": 0.15236967428638937, + "learning_rate": 1.3498310955266492e-05, + "loss": 2.6609, + "step": 37910 + }, + { + "epoch": 2.3534049289217207, + "grad_norm": 0.1443564672671687, + "learning_rate": 1.3495842894914929e-05, + "loss": 2.6302, + "step": 37911 + }, + { + "epoch": 2.3534670060214786, + "grad_norm": 0.15605779986088772, + "learning_rate": 1.3493375025014415e-05, + "loss": 2.6956, + "step": 37912 + }, + { + "epoch": 2.3535290831212365, + "grad_norm": 0.14017995639178465, + "learning_rate": 1.349090734557783e-05, + "loss": 2.71, + "step": 37913 + }, + { + "epoch": 2.3535911602209945, + "grad_norm": 0.14210440046591047, + "learning_rate": 1.3488439856618024e-05, + "loss": 2.6033, + "step": 37914 + }, + { + "epoch": 2.3536532373207524, + "grad_norm": 0.13758066829213048, + "learning_rate": 1.3485972558147908e-05, + "loss": 2.669, + "step": 37915 + }, + { + "epoch": 2.3537153144205103, + "grad_norm": 0.14122146366717309, + "learning_rate": 1.3483505450180333e-05, + "loss": 2.7175, + "step": 37916 + }, + { + "epoch": 2.353777391520268, + "grad_norm": 0.13879579058035327, + "learning_rate": 1.348103853272818e-05, + "loss": 2.6994, + "step": 37917 + }, + { + "epoch": 2.353839468620026, + "grad_norm": 0.16166165007246378, + "learning_rate": 1.3478571805804297e-05, + "loss": 2.6379, + "step": 37918 + }, + { + "epoch": 2.353901545719784, + "grad_norm": 0.13796076652135633, + "learning_rate": 1.3476105269421585e-05, + "loss": 2.6767, + "step": 37919 + }, + { + "epoch": 2.353963622819542, + "grad_norm": 0.13657511250239723, + "learning_rate": 1.3473638923592901e-05, + "loss": 2.6893, + "step": 37920 + }, + { + "epoch": 2.3540256999193, + "grad_norm": 0.15083560507426483, + "learning_rate": 1.3471172768331109e-05, + "loss": 2.8637, + "step": 37921 + }, + { + "epoch": 2.354087777019058, + "grad_norm": 0.142917946611888, + "learning_rate": 1.3468706803649062e-05, + "loss": 2.7588, + "step": 37922 + }, + { + "epoch": 2.3541498541188157, + "grad_norm": 0.1446343968886279, + "learning_rate": 1.3466241029559651e-05, + "loss": 2.7224, + "step": 37923 + }, + { + "epoch": 2.3542119312185736, + "grad_norm": 0.14571506026936848, + "learning_rate": 1.3463775446075732e-05, + "loss": 2.8288, + "step": 37924 + }, + { + "epoch": 2.3542740083183316, + "grad_norm": 0.142214906849293, + "learning_rate": 1.346131005321017e-05, + "loss": 2.7553, + "step": 37925 + }, + { + "epoch": 2.3543360854180895, + "grad_norm": 0.13459278854671503, + "learning_rate": 1.3458844850975815e-05, + "loss": 2.6832, + "step": 37926 + }, + { + "epoch": 2.354398162517847, + "grad_norm": 0.15514516630469444, + "learning_rate": 1.345637983938553e-05, + "loss": 2.7175, + "step": 37927 + }, + { + "epoch": 2.3544602396176053, + "grad_norm": 0.1508556953032771, + "learning_rate": 1.3453915018452195e-05, + "loss": 2.7398, + "step": 37928 + }, + { + "epoch": 2.354522316717363, + "grad_norm": 0.14015335223541792, + "learning_rate": 1.3451450388188642e-05, + "loss": 2.7238, + "step": 37929 + }, + { + "epoch": 2.3545843938171207, + "grad_norm": 0.13758055998354873, + "learning_rate": 1.3448985948607756e-05, + "loss": 2.7684, + "step": 37930 + }, + { + "epoch": 2.3546464709168786, + "grad_norm": 0.15131164885489928, + "learning_rate": 1.3446521699722386e-05, + "loss": 2.6907, + "step": 37931 + }, + { + "epoch": 2.3547085480166365, + "grad_norm": 0.14016616990314743, + "learning_rate": 1.3444057641545376e-05, + "loss": 2.7585, + "step": 37932 + }, + { + "epoch": 2.3547706251163945, + "grad_norm": 0.13688752792019526, + "learning_rate": 1.34415937740896e-05, + "loss": 2.79, + "step": 37933 + }, + { + "epoch": 2.3548327022161524, + "grad_norm": 0.15201907528649275, + "learning_rate": 1.3439130097367908e-05, + "loss": 2.664, + "step": 37934 + }, + { + "epoch": 2.3548947793159103, + "grad_norm": 0.15247012056710849, + "learning_rate": 1.3436666611393145e-05, + "loss": 2.6707, + "step": 37935 + }, + { + "epoch": 2.354956856415668, + "grad_norm": 0.1445617514721614, + "learning_rate": 1.3434203316178157e-05, + "loss": 2.7852, + "step": 37936 + }, + { + "epoch": 2.355018933515426, + "grad_norm": 0.139485570995835, + "learning_rate": 1.3431740211735821e-05, + "loss": 2.7253, + "step": 37937 + }, + { + "epoch": 2.355081010615184, + "grad_norm": 0.1427198189327235, + "learning_rate": 1.3429277298078979e-05, + "loss": 2.7371, + "step": 37938 + }, + { + "epoch": 2.355143087714942, + "grad_norm": 0.14286508814821838, + "learning_rate": 1.3426814575220469e-05, + "loss": 2.7376, + "step": 37939 + }, + { + "epoch": 2.3552051648147, + "grad_norm": 0.14417663450065346, + "learning_rate": 1.3424352043173144e-05, + "loss": 2.7769, + "step": 37940 + }, + { + "epoch": 2.355267241914458, + "grad_norm": 0.15379736723119114, + "learning_rate": 1.3421889701949842e-05, + "loss": 2.7611, + "step": 37941 + }, + { + "epoch": 2.3553293190142157, + "grad_norm": 0.1389163441310545, + "learning_rate": 1.3419427551563435e-05, + "loss": 2.7142, + "step": 37942 + }, + { + "epoch": 2.3553913961139736, + "grad_norm": 0.13715288851997745, + "learning_rate": 1.341696559202676e-05, + "loss": 2.7316, + "step": 37943 + }, + { + "epoch": 2.3554534732137316, + "grad_norm": 0.13834530025903527, + "learning_rate": 1.3414503823352647e-05, + "loss": 2.7266, + "step": 37944 + }, + { + "epoch": 2.3555155503134895, + "grad_norm": 0.14104188611137, + "learning_rate": 1.3412042245553941e-05, + "loss": 2.7091, + "step": 37945 + }, + { + "epoch": 2.3555776274132474, + "grad_norm": 0.1350303068624664, + "learning_rate": 1.3409580858643505e-05, + "loss": 2.7113, + "step": 37946 + }, + { + "epoch": 2.3556397045130053, + "grad_norm": 0.13802964629221234, + "learning_rate": 1.340711966263417e-05, + "loss": 2.7499, + "step": 37947 + }, + { + "epoch": 2.3557017816127632, + "grad_norm": 0.1387799338965747, + "learning_rate": 1.340465865753877e-05, + "loss": 2.7122, + "step": 37948 + }, + { + "epoch": 2.355763858712521, + "grad_norm": 0.14466483798630095, + "learning_rate": 1.340219784337014e-05, + "loss": 2.6969, + "step": 37949 + }, + { + "epoch": 2.3558259358122786, + "grad_norm": 0.13881359783195354, + "learning_rate": 1.339973722014114e-05, + "loss": 2.7971, + "step": 37950 + }, + { + "epoch": 2.355888012912037, + "grad_norm": 0.1518359892393377, + "learning_rate": 1.3397276787864598e-05, + "loss": 2.6885, + "step": 37951 + }, + { + "epoch": 2.3559500900117945, + "grad_norm": 0.14148775328379187, + "learning_rate": 1.3394816546553352e-05, + "loss": 2.6618, + "step": 37952 + }, + { + "epoch": 2.3560121671115524, + "grad_norm": 0.13752146493738482, + "learning_rate": 1.3392356496220226e-05, + "loss": 2.7476, + "step": 37953 + }, + { + "epoch": 2.3560742442113103, + "grad_norm": 0.13529221993943058, + "learning_rate": 1.3389896636878052e-05, + "loss": 2.7284, + "step": 37954 + }, + { + "epoch": 2.356136321311068, + "grad_norm": 0.1339488641374265, + "learning_rate": 1.338743696853969e-05, + "loss": 2.6995, + "step": 37955 + }, + { + "epoch": 2.356198398410826, + "grad_norm": 0.1573477627395001, + "learning_rate": 1.3384977491217959e-05, + "loss": 2.6875, + "step": 37956 + }, + { + "epoch": 2.356260475510584, + "grad_norm": 0.13987132605056293, + "learning_rate": 1.3382518204925686e-05, + "loss": 2.5918, + "step": 37957 + }, + { + "epoch": 2.356322552610342, + "grad_norm": 0.15758613081923, + "learning_rate": 1.3380059109675692e-05, + "loss": 2.6398, + "step": 37958 + }, + { + "epoch": 2.3563846297101, + "grad_norm": 0.13886831366274172, + "learning_rate": 1.3377600205480834e-05, + "loss": 2.6969, + "step": 37959 + }, + { + "epoch": 2.356446706809858, + "grad_norm": 0.13934421747843145, + "learning_rate": 1.3375141492353927e-05, + "loss": 2.7184, + "step": 37960 + }, + { + "epoch": 2.3565087839096157, + "grad_norm": 0.13977234685096088, + "learning_rate": 1.3372682970307792e-05, + "loss": 2.653, + "step": 37961 + }, + { + "epoch": 2.3565708610093736, + "grad_norm": 0.14787301219118454, + "learning_rate": 1.3370224639355256e-05, + "loss": 2.715, + "step": 37962 + }, + { + "epoch": 2.3566329381091315, + "grad_norm": 0.15361127761145235, + "learning_rate": 1.3367766499509154e-05, + "loss": 2.6764, + "step": 37963 + }, + { + "epoch": 2.3566950152088895, + "grad_norm": 0.14212007626993736, + "learning_rate": 1.3365308550782302e-05, + "loss": 2.7279, + "step": 37964 + }, + { + "epoch": 2.3567570923086474, + "grad_norm": 0.1458273904872451, + "learning_rate": 1.3362850793187537e-05, + "loss": 2.7296, + "step": 37965 + }, + { + "epoch": 2.3568191694084053, + "grad_norm": 0.14601787034742789, + "learning_rate": 1.3360393226737678e-05, + "loss": 2.6718, + "step": 37966 + }, + { + "epoch": 2.356881246508163, + "grad_norm": 0.13801961274477023, + "learning_rate": 1.335793585144554e-05, + "loss": 2.7183, + "step": 37967 + }, + { + "epoch": 2.356943323607921, + "grad_norm": 0.1516815919582161, + "learning_rate": 1.335547866732393e-05, + "loss": 2.7305, + "step": 37968 + }, + { + "epoch": 2.357005400707679, + "grad_norm": 0.1404528425012044, + "learning_rate": 1.3353021674385701e-05, + "loss": 2.6535, + "step": 37969 + }, + { + "epoch": 2.357067477807437, + "grad_norm": 0.14126127725498502, + "learning_rate": 1.3350564872643651e-05, + "loss": 2.7378, + "step": 37970 + }, + { + "epoch": 2.357129554907195, + "grad_norm": 0.16021783737814058, + "learning_rate": 1.3348108262110604e-05, + "loss": 2.8111, + "step": 37971 + }, + { + "epoch": 2.357191632006953, + "grad_norm": 0.1450294337118843, + "learning_rate": 1.3345651842799356e-05, + "loss": 2.8003, + "step": 37972 + }, + { + "epoch": 2.3572537091067107, + "grad_norm": 0.15320382886935968, + "learning_rate": 1.334319561472276e-05, + "loss": 2.6014, + "step": 37973 + }, + { + "epoch": 2.3573157862064686, + "grad_norm": 0.14492599604831743, + "learning_rate": 1.3340739577893608e-05, + "loss": 2.7028, + "step": 37974 + }, + { + "epoch": 2.357377863306226, + "grad_norm": 0.13879063047880977, + "learning_rate": 1.3338283732324719e-05, + "loss": 2.7498, + "step": 37975 + }, + { + "epoch": 2.3574399404059845, + "grad_norm": 0.14148367216503335, + "learning_rate": 1.33358280780289e-05, + "loss": 2.7065, + "step": 37976 + }, + { + "epoch": 2.357502017505742, + "grad_norm": 0.1467208162087093, + "learning_rate": 1.3333372615018953e-05, + "loss": 2.7069, + "step": 37977 + }, + { + "epoch": 2.3575640946055, + "grad_norm": 0.180028519242058, + "learning_rate": 1.333091734330772e-05, + "loss": 2.7687, + "step": 37978 + }, + { + "epoch": 2.357626171705258, + "grad_norm": 0.16106897881473747, + "learning_rate": 1.3328462262907992e-05, + "loss": 2.7857, + "step": 37979 + }, + { + "epoch": 2.3576882488050157, + "grad_norm": 0.14382182457140574, + "learning_rate": 1.3326007373832577e-05, + "loss": 2.6787, + "step": 37980 + }, + { + "epoch": 2.3577503259047736, + "grad_norm": 0.17751016203240086, + "learning_rate": 1.332355267609427e-05, + "loss": 2.6981, + "step": 37981 + }, + { + "epoch": 2.3578124030045315, + "grad_norm": 0.13371712776451714, + "learning_rate": 1.3321098169705904e-05, + "loss": 2.6642, + "step": 37982 + }, + { + "epoch": 2.3578744801042895, + "grad_norm": 0.14226822290871308, + "learning_rate": 1.331864385468028e-05, + "loss": 2.7433, + "step": 37983 + }, + { + "epoch": 2.3579365572040474, + "grad_norm": 0.14913240700846017, + "learning_rate": 1.3316189731030187e-05, + "loss": 2.6729, + "step": 37984 + }, + { + "epoch": 2.3579986343038053, + "grad_norm": 0.14524534734982447, + "learning_rate": 1.3313735798768429e-05, + "loss": 2.7533, + "step": 37985 + }, + { + "epoch": 2.358060711403563, + "grad_norm": 0.15501401931097197, + "learning_rate": 1.331128205790783e-05, + "loss": 2.8464, + "step": 37986 + }, + { + "epoch": 2.358122788503321, + "grad_norm": 0.14413692836621658, + "learning_rate": 1.330882850846118e-05, + "loss": 2.6825, + "step": 37987 + }, + { + "epoch": 2.358184865603079, + "grad_norm": 0.1417361770585132, + "learning_rate": 1.330637515044128e-05, + "loss": 2.7148, + "step": 37988 + }, + { + "epoch": 2.358246942702837, + "grad_norm": 0.15377504497552846, + "learning_rate": 1.3303921983860922e-05, + "loss": 2.6077, + "step": 37989 + }, + { + "epoch": 2.358309019802595, + "grad_norm": 0.1387533029386526, + "learning_rate": 1.3301469008732903e-05, + "loss": 2.6601, + "step": 37990 + }, + { + "epoch": 2.358371096902353, + "grad_norm": 0.1623119673949718, + "learning_rate": 1.3299016225070044e-05, + "loss": 2.7505, + "step": 37991 + }, + { + "epoch": 2.3584331740021107, + "grad_norm": 0.13944780168604334, + "learning_rate": 1.3296563632885123e-05, + "loss": 2.7736, + "step": 37992 + }, + { + "epoch": 2.3584952511018686, + "grad_norm": 0.1370813669903826, + "learning_rate": 1.3294111232190942e-05, + "loss": 2.706, + "step": 37993 + }, + { + "epoch": 2.3585573282016266, + "grad_norm": 0.14979700494038206, + "learning_rate": 1.329165902300028e-05, + "loss": 2.7012, + "step": 37994 + }, + { + "epoch": 2.3586194053013845, + "grad_norm": 0.1404345467778545, + "learning_rate": 1.3289207005325944e-05, + "loss": 2.6346, + "step": 37995 + }, + { + "epoch": 2.3586814824011424, + "grad_norm": 0.1380666568250348, + "learning_rate": 1.3286755179180738e-05, + "loss": 2.781, + "step": 37996 + }, + { + "epoch": 2.3587435595009003, + "grad_norm": 0.1814662137019362, + "learning_rate": 1.3284303544577442e-05, + "loss": 2.7394, + "step": 37997 + }, + { + "epoch": 2.358805636600658, + "grad_norm": 0.14307548051510457, + "learning_rate": 1.3281852101528851e-05, + "loss": 2.6769, + "step": 37998 + }, + { + "epoch": 2.358867713700416, + "grad_norm": 0.14432696777884188, + "learning_rate": 1.3279400850047734e-05, + "loss": 2.7234, + "step": 37999 + }, + { + "epoch": 2.3589297908001736, + "grad_norm": 0.16648143605502785, + "learning_rate": 1.3276949790146915e-05, + "loss": 2.7479, + "step": 38000 + }, + { + "epoch": 2.3589918678999315, + "grad_norm": 0.1329315087112111, + "learning_rate": 1.3274498921839163e-05, + "loss": 2.7327, + "step": 38001 + }, + { + "epoch": 2.3590539449996895, + "grad_norm": 0.13958318559676744, + "learning_rate": 1.3272048245137265e-05, + "loss": 2.7118, + "step": 38002 + }, + { + "epoch": 2.3591160220994474, + "grad_norm": 0.1390055885550819, + "learning_rate": 1.326959776005401e-05, + "loss": 2.655, + "step": 38003 + }, + { + "epoch": 2.3591780991992053, + "grad_norm": 0.1377688204924828, + "learning_rate": 1.3267147466602159e-05, + "loss": 2.7582, + "step": 38004 + }, + { + "epoch": 2.359240176298963, + "grad_norm": 0.145488008009057, + "learning_rate": 1.3264697364794538e-05, + "loss": 2.7359, + "step": 38005 + }, + { + "epoch": 2.359302253398721, + "grad_norm": 0.13424218150899594, + "learning_rate": 1.32622474546439e-05, + "loss": 2.7668, + "step": 38006 + }, + { + "epoch": 2.359364330498479, + "grad_norm": 0.15716261185099212, + "learning_rate": 1.3259797736163043e-05, + "loss": 2.5683, + "step": 38007 + }, + { + "epoch": 2.359426407598237, + "grad_norm": 0.14726862461667728, + "learning_rate": 1.3257348209364722e-05, + "loss": 2.6549, + "step": 38008 + }, + { + "epoch": 2.359488484697995, + "grad_norm": 0.13779428996446932, + "learning_rate": 1.325489887426175e-05, + "loss": 2.7475, + "step": 38009 + }, + { + "epoch": 2.359550561797753, + "grad_norm": 0.13001769813092307, + "learning_rate": 1.3252449730866894e-05, + "loss": 2.7016, + "step": 38010 + }, + { + "epoch": 2.3596126388975107, + "grad_norm": 0.13565803224455708, + "learning_rate": 1.3250000779192928e-05, + "loss": 2.7249, + "step": 38011 + }, + { + "epoch": 2.3596747159972686, + "grad_norm": 0.1373897028831028, + "learning_rate": 1.3247552019252629e-05, + "loss": 2.7307, + "step": 38012 + }, + { + "epoch": 2.3597367930970266, + "grad_norm": 0.1464003830755054, + "learning_rate": 1.3245103451058754e-05, + "loss": 2.8054, + "step": 38013 + }, + { + "epoch": 2.3597988701967845, + "grad_norm": 0.13869650044531712, + "learning_rate": 1.3242655074624116e-05, + "loss": 2.8163, + "step": 38014 + }, + { + "epoch": 2.3598609472965424, + "grad_norm": 0.14179055328569104, + "learning_rate": 1.324020688996147e-05, + "loss": 2.7088, + "step": 38015 + }, + { + "epoch": 2.3599230243963003, + "grad_norm": 0.14266208893691568, + "learning_rate": 1.3237758897083586e-05, + "loss": 2.6831, + "step": 38016 + }, + { + "epoch": 2.3599851014960582, + "grad_norm": 0.15637805578850383, + "learning_rate": 1.3235311096003223e-05, + "loss": 2.72, + "step": 38017 + }, + { + "epoch": 2.360047178595816, + "grad_norm": 0.1636247073014995, + "learning_rate": 1.3232863486733182e-05, + "loss": 2.6668, + "step": 38018 + }, + { + "epoch": 2.360109255695574, + "grad_norm": 0.14336420903384176, + "learning_rate": 1.3230416069286223e-05, + "loss": 2.8171, + "step": 38019 + }, + { + "epoch": 2.360171332795332, + "grad_norm": 0.1423490329854965, + "learning_rate": 1.3227968843675104e-05, + "loss": 2.6976, + "step": 38020 + }, + { + "epoch": 2.36023340989509, + "grad_norm": 0.14538490942164808, + "learning_rate": 1.3225521809912584e-05, + "loss": 2.7156, + "step": 38021 + }, + { + "epoch": 2.360295486994848, + "grad_norm": 0.14247182907463876, + "learning_rate": 1.3223074968011456e-05, + "loss": 2.6839, + "step": 38022 + }, + { + "epoch": 2.3603575640946053, + "grad_norm": 0.14743732743376448, + "learning_rate": 1.322062831798448e-05, + "loss": 2.7922, + "step": 38023 + }, + { + "epoch": 2.3604196411943636, + "grad_norm": 0.1335830703727508, + "learning_rate": 1.3218181859844408e-05, + "loss": 2.6153, + "step": 38024 + }, + { + "epoch": 2.360481718294121, + "grad_norm": 0.13894411684411406, + "learning_rate": 1.3215735593604017e-05, + "loss": 2.7494, + "step": 38025 + }, + { + "epoch": 2.360543795393879, + "grad_norm": 0.14641446034586195, + "learning_rate": 1.3213289519276045e-05, + "loss": 2.8045, + "step": 38026 + }, + { + "epoch": 2.360605872493637, + "grad_norm": 0.15513926533128097, + "learning_rate": 1.3210843636873272e-05, + "loss": 2.6787, + "step": 38027 + }, + { + "epoch": 2.360667949593395, + "grad_norm": 0.14004918597884344, + "learning_rate": 1.3208397946408474e-05, + "loss": 2.8133, + "step": 38028 + }, + { + "epoch": 2.360730026693153, + "grad_norm": 0.15378592793412202, + "learning_rate": 1.3205952447894393e-05, + "loss": 2.692, + "step": 38029 + }, + { + "epoch": 2.3607921037929107, + "grad_norm": 0.14732104733343462, + "learning_rate": 1.3203507141343796e-05, + "loss": 2.718, + "step": 38030 + }, + { + "epoch": 2.3608541808926686, + "grad_norm": 0.16178083448962738, + "learning_rate": 1.3201062026769417e-05, + "loss": 2.6774, + "step": 38031 + }, + { + "epoch": 2.3609162579924265, + "grad_norm": 0.12854406743561525, + "learning_rate": 1.3198617104184042e-05, + "loss": 2.6425, + "step": 38032 + }, + { + "epoch": 2.3609783350921845, + "grad_norm": 0.15720411697247616, + "learning_rate": 1.3196172373600418e-05, + "loss": 2.7384, + "step": 38033 + }, + { + "epoch": 2.3610404121919424, + "grad_norm": 0.13499115303420742, + "learning_rate": 1.31937278350313e-05, + "loss": 2.6559, + "step": 38034 + }, + { + "epoch": 2.3611024892917003, + "grad_norm": 0.13745878750723262, + "learning_rate": 1.319128348848942e-05, + "loss": 2.7311, + "step": 38035 + }, + { + "epoch": 2.361164566391458, + "grad_norm": 0.1625638359569314, + "learning_rate": 1.3188839333987568e-05, + "loss": 2.7491, + "step": 38036 + }, + { + "epoch": 2.361226643491216, + "grad_norm": 0.13569307467941064, + "learning_rate": 1.3186395371538474e-05, + "loss": 2.6966, + "step": 38037 + }, + { + "epoch": 2.361288720590974, + "grad_norm": 0.14331014395921987, + "learning_rate": 1.3183951601154892e-05, + "loss": 2.6268, + "step": 38038 + }, + { + "epoch": 2.361350797690732, + "grad_norm": 0.13967837833004595, + "learning_rate": 1.3181508022849575e-05, + "loss": 2.7416, + "step": 38039 + }, + { + "epoch": 2.36141287479049, + "grad_norm": 0.13976172540383663, + "learning_rate": 1.3179064636635252e-05, + "loss": 2.7177, + "step": 38040 + }, + { + "epoch": 2.361474951890248, + "grad_norm": 0.13386123687412865, + "learning_rate": 1.31766214425247e-05, + "loss": 2.7612, + "step": 38041 + }, + { + "epoch": 2.3615370289900057, + "grad_norm": 0.15090731627995443, + "learning_rate": 1.317417844053066e-05, + "loss": 2.739, + "step": 38042 + }, + { + "epoch": 2.3615991060897636, + "grad_norm": 0.13615665008165698, + "learning_rate": 1.3171735630665866e-05, + "loss": 2.6562, + "step": 38043 + }, + { + "epoch": 2.3616611831895216, + "grad_norm": 0.1370610516045888, + "learning_rate": 1.3169293012943052e-05, + "loss": 2.6622, + "step": 38044 + }, + { + "epoch": 2.3617232602892795, + "grad_norm": 0.14073157907611736, + "learning_rate": 1.316685058737499e-05, + "loss": 2.7754, + "step": 38045 + }, + { + "epoch": 2.361785337389037, + "grad_norm": 0.13850391989326835, + "learning_rate": 1.3164408353974417e-05, + "loss": 2.683, + "step": 38046 + }, + { + "epoch": 2.3618474144887953, + "grad_norm": 0.14030801624661512, + "learning_rate": 1.3161966312754065e-05, + "loss": 2.6836, + "step": 38047 + }, + { + "epoch": 2.361909491588553, + "grad_norm": 0.1462356426556205, + "learning_rate": 1.3159524463726675e-05, + "loss": 2.7486, + "step": 38048 + }, + { + "epoch": 2.3619715686883107, + "grad_norm": 0.1458400223821453, + "learning_rate": 1.3157082806904974e-05, + "loss": 2.7385, + "step": 38049 + }, + { + "epoch": 2.3620336457880686, + "grad_norm": 0.13669153351320382, + "learning_rate": 1.3154641342301733e-05, + "loss": 2.7636, + "step": 38050 + }, + { + "epoch": 2.3620957228878265, + "grad_norm": 0.13810843860366176, + "learning_rate": 1.3152200069929677e-05, + "loss": 2.6079, + "step": 38051 + }, + { + "epoch": 2.3621577999875845, + "grad_norm": 0.16676591299920854, + "learning_rate": 1.314975898980153e-05, + "loss": 2.5892, + "step": 38052 + }, + { + "epoch": 2.3622198770873424, + "grad_norm": 0.14192960821466483, + "learning_rate": 1.3147318101930028e-05, + "loss": 2.6443, + "step": 38053 + }, + { + "epoch": 2.3622819541871003, + "grad_norm": 0.14816233578879381, + "learning_rate": 1.3144877406327921e-05, + "loss": 2.7298, + "step": 38054 + }, + { + "epoch": 2.362344031286858, + "grad_norm": 0.13525038070766154, + "learning_rate": 1.3142436903007944e-05, + "loss": 2.7766, + "step": 38055 + }, + { + "epoch": 2.362406108386616, + "grad_norm": 0.1512230463680909, + "learning_rate": 1.3139996591982818e-05, + "loss": 2.71, + "step": 38056 + }, + { + "epoch": 2.362468185486374, + "grad_norm": 0.14720809176947328, + "learning_rate": 1.313755647326526e-05, + "loss": 2.7553, + "step": 38057 + }, + { + "epoch": 2.362530262586132, + "grad_norm": 0.13533591115275567, + "learning_rate": 1.3135116546868036e-05, + "loss": 2.65, + "step": 38058 + }, + { + "epoch": 2.36259233968589, + "grad_norm": 0.14260517804316833, + "learning_rate": 1.3132676812803856e-05, + "loss": 2.7501, + "step": 38059 + }, + { + "epoch": 2.362654416785648, + "grad_norm": 0.1344537789607616, + "learning_rate": 1.3130237271085439e-05, + "loss": 2.6653, + "step": 38060 + }, + { + "epoch": 2.3627164938854057, + "grad_norm": 0.14266496130343323, + "learning_rate": 1.3127797921725537e-05, + "loss": 2.6868, + "step": 38061 + }, + { + "epoch": 2.3627785709851636, + "grad_norm": 0.14159545227548012, + "learning_rate": 1.3125358764736867e-05, + "loss": 2.7304, + "step": 38062 + }, + { + "epoch": 2.3628406480849216, + "grad_norm": 0.15119309411968873, + "learning_rate": 1.3122919800132138e-05, + "loss": 2.7365, + "step": 38063 + }, + { + "epoch": 2.3629027251846795, + "grad_norm": 0.1399346666117364, + "learning_rate": 1.3120481027924097e-05, + "loss": 2.7904, + "step": 38064 + }, + { + "epoch": 2.3629648022844374, + "grad_norm": 0.13953186718504143, + "learning_rate": 1.3118042448125468e-05, + "loss": 2.7292, + "step": 38065 + }, + { + "epoch": 2.3630268793841953, + "grad_norm": 0.1344129259349127, + "learning_rate": 1.311560406074896e-05, + "loss": 2.7225, + "step": 38066 + }, + { + "epoch": 2.3630889564839532, + "grad_norm": 0.15141803133649093, + "learning_rate": 1.3113165865807286e-05, + "loss": 2.687, + "step": 38067 + }, + { + "epoch": 2.363151033583711, + "grad_norm": 0.14829233876763168, + "learning_rate": 1.3110727863313193e-05, + "loss": 2.7833, + "step": 38068 + }, + { + "epoch": 2.3632131106834686, + "grad_norm": 0.1644105228596812, + "learning_rate": 1.3108290053279388e-05, + "loss": 2.832, + "step": 38069 + }, + { + "epoch": 2.363275187783227, + "grad_norm": 0.14454367945038402, + "learning_rate": 1.3105852435718591e-05, + "loss": 2.732, + "step": 38070 + }, + { + "epoch": 2.3633372648829845, + "grad_norm": 0.13974134649054692, + "learning_rate": 1.3103415010643504e-05, + "loss": 2.6388, + "step": 38071 + }, + { + "epoch": 2.3633993419827424, + "grad_norm": 0.15056567098992493, + "learning_rate": 1.310097777806687e-05, + "loss": 2.6609, + "step": 38072 + }, + { + "epoch": 2.3634614190825003, + "grad_norm": 0.1706170190090408, + "learning_rate": 1.3098540738001397e-05, + "loss": 2.7628, + "step": 38073 + }, + { + "epoch": 2.363523496182258, + "grad_norm": 0.1632624829837214, + "learning_rate": 1.3096103890459793e-05, + "loss": 2.8719, + "step": 38074 + }, + { + "epoch": 2.363585573282016, + "grad_norm": 0.1409540829344221, + "learning_rate": 1.309366723545477e-05, + "loss": 2.774, + "step": 38075 + }, + { + "epoch": 2.363647650381774, + "grad_norm": 0.1399094070511218, + "learning_rate": 1.3091230772999035e-05, + "loss": 2.7606, + "step": 38076 + }, + { + "epoch": 2.363709727481532, + "grad_norm": 0.13835904605466662, + "learning_rate": 1.3088794503105323e-05, + "loss": 2.701, + "step": 38077 + }, + { + "epoch": 2.36377180458129, + "grad_norm": 0.14361893909255471, + "learning_rate": 1.3086358425786326e-05, + "loss": 2.6656, + "step": 38078 + }, + { + "epoch": 2.363833881681048, + "grad_norm": 0.14468088008204327, + "learning_rate": 1.3083922541054765e-05, + "loss": 2.7271, + "step": 38079 + }, + { + "epoch": 2.3638959587808057, + "grad_norm": 0.13264185798180697, + "learning_rate": 1.3081486848923325e-05, + "loss": 2.6561, + "step": 38080 + }, + { + "epoch": 2.3639580358805636, + "grad_norm": 0.1557242608546893, + "learning_rate": 1.3079051349404742e-05, + "loss": 2.7453, + "step": 38081 + }, + { + "epoch": 2.3640201129803216, + "grad_norm": 0.1425869102906762, + "learning_rate": 1.3076616042511713e-05, + "loss": 2.74, + "step": 38082 + }, + { + "epoch": 2.3640821900800795, + "grad_norm": 0.14521612463093111, + "learning_rate": 1.307418092825694e-05, + "loss": 2.6987, + "step": 38083 + }, + { + "epoch": 2.3641442671798374, + "grad_norm": 0.15507325277674083, + "learning_rate": 1.3071746006653113e-05, + "loss": 2.8017, + "step": 38084 + }, + { + "epoch": 2.3642063442795953, + "grad_norm": 0.14081117438984686, + "learning_rate": 1.306931127771297e-05, + "loss": 2.7015, + "step": 38085 + }, + { + "epoch": 2.3642684213793532, + "grad_norm": 0.1426703272514042, + "learning_rate": 1.3066876741449197e-05, + "loss": 2.7456, + "step": 38086 + }, + { + "epoch": 2.364330498479111, + "grad_norm": 0.13873662242965865, + "learning_rate": 1.3064442397874487e-05, + "loss": 2.6477, + "step": 38087 + }, + { + "epoch": 2.364392575578869, + "grad_norm": 0.1455287790234084, + "learning_rate": 1.3062008247001556e-05, + "loss": 2.7384, + "step": 38088 + }, + { + "epoch": 2.364454652678627, + "grad_norm": 0.14635768498120705, + "learning_rate": 1.3059574288843073e-05, + "loss": 2.684, + "step": 38089 + }, + { + "epoch": 2.364516729778385, + "grad_norm": 0.1529374718664102, + "learning_rate": 1.3057140523411775e-05, + "loss": 2.6458, + "step": 38090 + }, + { + "epoch": 2.364578806878143, + "grad_norm": 0.1393751925736538, + "learning_rate": 1.3054706950720346e-05, + "loss": 2.7006, + "step": 38091 + }, + { + "epoch": 2.3646408839779007, + "grad_norm": 0.1448091400075263, + "learning_rate": 1.3052273570781476e-05, + "loss": 2.7266, + "step": 38092 + }, + { + "epoch": 2.3647029610776586, + "grad_norm": 0.1378180585765236, + "learning_rate": 1.3049840383607853e-05, + "loss": 2.7213, + "step": 38093 + }, + { + "epoch": 2.364765038177416, + "grad_norm": 0.14298429508851046, + "learning_rate": 1.304740738921218e-05, + "loss": 2.7151, + "step": 38094 + }, + { + "epoch": 2.3648271152771745, + "grad_norm": 0.1375741154694617, + "learning_rate": 1.3044974587607167e-05, + "loss": 2.7446, + "step": 38095 + }, + { + "epoch": 2.364889192376932, + "grad_norm": 0.13221328561635445, + "learning_rate": 1.3042541978805495e-05, + "loss": 2.6663, + "step": 38096 + }, + { + "epoch": 2.36495126947669, + "grad_norm": 0.1428648665056008, + "learning_rate": 1.3040109562819853e-05, + "loss": 2.6713, + "step": 38097 + }, + { + "epoch": 2.365013346576448, + "grad_norm": 0.13906722060774346, + "learning_rate": 1.3037677339662935e-05, + "loss": 2.7599, + "step": 38098 + }, + { + "epoch": 2.3650754236762057, + "grad_norm": 0.13685284577887522, + "learning_rate": 1.3035245309347405e-05, + "loss": 2.7727, + "step": 38099 + }, + { + "epoch": 2.3651375007759636, + "grad_norm": 0.1391955815783836, + "learning_rate": 1.3032813471885996e-05, + "loss": 2.7292, + "step": 38100 + }, + { + "epoch": 2.3651995778757215, + "grad_norm": 0.13681994507792064, + "learning_rate": 1.3030381827291372e-05, + "loss": 2.6612, + "step": 38101 + }, + { + "epoch": 2.3652616549754795, + "grad_norm": 0.14044191442536733, + "learning_rate": 1.3027950375576214e-05, + "loss": 2.6559, + "step": 38102 + }, + { + "epoch": 2.3653237320752374, + "grad_norm": 0.13985341374343155, + "learning_rate": 1.3025519116753205e-05, + "loss": 2.672, + "step": 38103 + }, + { + "epoch": 2.3653858091749953, + "grad_norm": 0.1364760902065128, + "learning_rate": 1.3023088050835053e-05, + "loss": 2.6155, + "step": 38104 + }, + { + "epoch": 2.365447886274753, + "grad_norm": 0.13875626964408297, + "learning_rate": 1.3020657177834428e-05, + "loss": 2.704, + "step": 38105 + }, + { + "epoch": 2.365509963374511, + "grad_norm": 0.140482492559662, + "learning_rate": 1.3018226497764008e-05, + "loss": 2.7906, + "step": 38106 + }, + { + "epoch": 2.365572040474269, + "grad_norm": 0.1415174629135881, + "learning_rate": 1.3015796010636466e-05, + "loss": 2.7478, + "step": 38107 + }, + { + "epoch": 2.365634117574027, + "grad_norm": 0.13932458632945266, + "learning_rate": 1.3013365716464505e-05, + "loss": 2.7687, + "step": 38108 + }, + { + "epoch": 2.365696194673785, + "grad_norm": 0.13668411361099653, + "learning_rate": 1.30109356152608e-05, + "loss": 2.7322, + "step": 38109 + }, + { + "epoch": 2.365758271773543, + "grad_norm": 0.14317019867074388, + "learning_rate": 1.3008505707038021e-05, + "loss": 2.8108, + "step": 38110 + }, + { + "epoch": 2.3658203488733007, + "grad_norm": 0.14431032487800377, + "learning_rate": 1.3006075991808848e-05, + "loss": 2.7657, + "step": 38111 + }, + { + "epoch": 2.3658824259730586, + "grad_norm": 0.13687038867838186, + "learning_rate": 1.300364646958594e-05, + "loss": 2.7039, + "step": 38112 + }, + { + "epoch": 2.3659445030728166, + "grad_norm": 0.1344774870049822, + "learning_rate": 1.3001217140382004e-05, + "loss": 2.7561, + "step": 38113 + }, + { + "epoch": 2.3660065801725745, + "grad_norm": 0.1450407224642865, + "learning_rate": 1.2998788004209705e-05, + "loss": 2.7241, + "step": 38114 + }, + { + "epoch": 2.3660686572723324, + "grad_norm": 0.14428462431392597, + "learning_rate": 1.2996359061081708e-05, + "loss": 2.6982, + "step": 38115 + }, + { + "epoch": 2.3661307343720903, + "grad_norm": 0.1448534325775791, + "learning_rate": 1.299393031101067e-05, + "loss": 2.7762, + "step": 38116 + }, + { + "epoch": 2.366192811471848, + "grad_norm": 0.18479921386456585, + "learning_rate": 1.2991501754009299e-05, + "loss": 2.6641, + "step": 38117 + }, + { + "epoch": 2.366254888571606, + "grad_norm": 0.14718654821578137, + "learning_rate": 1.298907339009025e-05, + "loss": 2.7661, + "step": 38118 + }, + { + "epoch": 2.3663169656713636, + "grad_norm": 0.14774676908915543, + "learning_rate": 1.2986645219266191e-05, + "loss": 2.7285, + "step": 38119 + }, + { + "epoch": 2.3663790427711215, + "grad_norm": 0.13856375199751575, + "learning_rate": 1.2984217241549773e-05, + "loss": 2.6862, + "step": 38120 + }, + { + "epoch": 2.3664411198708795, + "grad_norm": 0.13494827498174064, + "learning_rate": 1.2981789456953692e-05, + "loss": 2.6038, + "step": 38121 + }, + { + "epoch": 2.3665031969706374, + "grad_norm": 0.15542553703670303, + "learning_rate": 1.2979361865490603e-05, + "loss": 2.7694, + "step": 38122 + }, + { + "epoch": 2.3665652740703953, + "grad_norm": 0.1377833673280309, + "learning_rate": 1.2976934467173169e-05, + "loss": 2.6755, + "step": 38123 + }, + { + "epoch": 2.366627351170153, + "grad_norm": 0.14331642151835383, + "learning_rate": 1.2974507262014057e-05, + "loss": 2.7026, + "step": 38124 + }, + { + "epoch": 2.366689428269911, + "grad_norm": 0.1351624947687351, + "learning_rate": 1.2972080250025914e-05, + "loss": 2.6861, + "step": 38125 + }, + { + "epoch": 2.366751505369669, + "grad_norm": 0.15021344064313408, + "learning_rate": 1.2969653431221418e-05, + "loss": 2.727, + "step": 38126 + }, + { + "epoch": 2.366813582469427, + "grad_norm": 0.159129322017654, + "learning_rate": 1.2967226805613242e-05, + "loss": 2.7919, + "step": 38127 + }, + { + "epoch": 2.366875659569185, + "grad_norm": 0.13948243284662765, + "learning_rate": 1.2964800373214036e-05, + "loss": 2.7322, + "step": 38128 + }, + { + "epoch": 2.366937736668943, + "grad_norm": 0.16516562129404247, + "learning_rate": 1.2962374134036454e-05, + "loss": 2.6716, + "step": 38129 + }, + { + "epoch": 2.3669998137687007, + "grad_norm": 0.13544972563348276, + "learning_rate": 1.2959948088093143e-05, + "loss": 2.7123, + "step": 38130 + }, + { + "epoch": 2.3670618908684586, + "grad_norm": 0.14886414550059504, + "learning_rate": 1.2957522235396796e-05, + "loss": 2.6856, + "step": 38131 + }, + { + "epoch": 2.3671239679682166, + "grad_norm": 0.1522130466634785, + "learning_rate": 1.295509657596004e-05, + "loss": 2.6958, + "step": 38132 + }, + { + "epoch": 2.3671860450679745, + "grad_norm": 0.14034406760355436, + "learning_rate": 1.2952671109795544e-05, + "loss": 2.8282, + "step": 38133 + }, + { + "epoch": 2.3672481221677324, + "grad_norm": 0.1451412550630917, + "learning_rate": 1.2950245836915953e-05, + "loss": 2.6616, + "step": 38134 + }, + { + "epoch": 2.3673101992674903, + "grad_norm": 0.13975518821484245, + "learning_rate": 1.2947820757333906e-05, + "loss": 2.7217, + "step": 38135 + }, + { + "epoch": 2.3673722763672482, + "grad_norm": 0.1519291444502844, + "learning_rate": 1.2945395871062094e-05, + "loss": 2.717, + "step": 38136 + }, + { + "epoch": 2.367434353467006, + "grad_norm": 0.13856037788546963, + "learning_rate": 1.2942971178113145e-05, + "loss": 2.6607, + "step": 38137 + }, + { + "epoch": 2.367496430566764, + "grad_norm": 0.1415439422708697, + "learning_rate": 1.294054667849971e-05, + "loss": 2.6427, + "step": 38138 + }, + { + "epoch": 2.367558507666522, + "grad_norm": 0.14219854725900088, + "learning_rate": 1.2938122372234423e-05, + "loss": 2.6839, + "step": 38139 + }, + { + "epoch": 2.36762058476628, + "grad_norm": 0.13632246896547787, + "learning_rate": 1.2935698259329965e-05, + "loss": 2.7388, + "step": 38140 + }, + { + "epoch": 2.367682661866038, + "grad_norm": 0.13968538582458545, + "learning_rate": 1.2933274339798967e-05, + "loss": 2.742, + "step": 38141 + }, + { + "epoch": 2.3677447389657953, + "grad_norm": 0.14783930708813767, + "learning_rate": 1.2930850613654078e-05, + "loss": 2.6906, + "step": 38142 + }, + { + "epoch": 2.3678068160655537, + "grad_norm": 0.15327987016023736, + "learning_rate": 1.2928427080907918e-05, + "loss": 2.6675, + "step": 38143 + }, + { + "epoch": 2.367868893165311, + "grad_norm": 0.13980329370563302, + "learning_rate": 1.2926003741573172e-05, + "loss": 2.6883, + "step": 38144 + }, + { + "epoch": 2.367930970265069, + "grad_norm": 0.1516954124329727, + "learning_rate": 1.2923580595662466e-05, + "loss": 2.7284, + "step": 38145 + }, + { + "epoch": 2.367993047364827, + "grad_norm": 0.14046217843618347, + "learning_rate": 1.2921157643188443e-05, + "loss": 2.6625, + "step": 38146 + }, + { + "epoch": 2.368055124464585, + "grad_norm": 0.1508576831890681, + "learning_rate": 1.2918734884163736e-05, + "loss": 2.7057, + "step": 38147 + }, + { + "epoch": 2.368117201564343, + "grad_norm": 0.1402764637088707, + "learning_rate": 1.2916312318600982e-05, + "loss": 2.7936, + "step": 38148 + }, + { + "epoch": 2.3681792786641007, + "grad_norm": 0.14268394358092315, + "learning_rate": 1.2913889946512837e-05, + "loss": 2.7468, + "step": 38149 + }, + { + "epoch": 2.3682413557638586, + "grad_norm": 0.1395887167151068, + "learning_rate": 1.2911467767911938e-05, + "loss": 2.7209, + "step": 38150 + }, + { + "epoch": 2.3683034328636166, + "grad_norm": 0.14120132840967925, + "learning_rate": 1.2909045782810909e-05, + "loss": 2.7695, + "step": 38151 + }, + { + "epoch": 2.3683655099633745, + "grad_norm": 0.157027029104109, + "learning_rate": 1.2906623991222382e-05, + "loss": 2.6402, + "step": 38152 + }, + { + "epoch": 2.3684275870631324, + "grad_norm": 0.14981873880309685, + "learning_rate": 1.2904202393159016e-05, + "loss": 2.6995, + "step": 38153 + }, + { + "epoch": 2.3684896641628903, + "grad_norm": 0.16812078670093522, + "learning_rate": 1.2901780988633433e-05, + "loss": 2.6878, + "step": 38154 + }, + { + "epoch": 2.3685517412626482, + "grad_norm": 0.13870894245591875, + "learning_rate": 1.289935977765826e-05, + "loss": 2.7602, + "step": 38155 + }, + { + "epoch": 2.368613818362406, + "grad_norm": 0.14713439406543163, + "learning_rate": 1.2896938760246118e-05, + "loss": 2.7611, + "step": 38156 + }, + { + "epoch": 2.368675895462164, + "grad_norm": 0.13936853033496946, + "learning_rate": 1.2894517936409673e-05, + "loss": 2.7733, + "step": 38157 + }, + { + "epoch": 2.368737972561922, + "grad_norm": 0.1393674076796445, + "learning_rate": 1.289209730616152e-05, + "loss": 2.6494, + "step": 38158 + }, + { + "epoch": 2.36880004966168, + "grad_norm": 0.13808467276289402, + "learning_rate": 1.2889676869514317e-05, + "loss": 2.6502, + "step": 38159 + }, + { + "epoch": 2.368862126761438, + "grad_norm": 0.1633638481045837, + "learning_rate": 1.2887256626480681e-05, + "loss": 2.7133, + "step": 38160 + }, + { + "epoch": 2.3689242038611957, + "grad_norm": 0.1412916936568753, + "learning_rate": 1.2884836577073234e-05, + "loss": 2.7252, + "step": 38161 + }, + { + "epoch": 2.3689862809609536, + "grad_norm": 0.15882730387162536, + "learning_rate": 1.288241672130459e-05, + "loss": 2.6771, + "step": 38162 + }, + { + "epoch": 2.3690483580607116, + "grad_norm": 0.13856713274849528, + "learning_rate": 1.2879997059187403e-05, + "loss": 2.7106, + "step": 38163 + }, + { + "epoch": 2.3691104351604695, + "grad_norm": 0.14171682467935034, + "learning_rate": 1.2877577590734285e-05, + "loss": 2.6328, + "step": 38164 + }, + { + "epoch": 2.369172512260227, + "grad_norm": 0.14519476658777747, + "learning_rate": 1.2875158315957858e-05, + "loss": 2.6573, + "step": 38165 + }, + { + "epoch": 2.3692345893599853, + "grad_norm": 0.13662670306673835, + "learning_rate": 1.2872739234870723e-05, + "loss": 2.6797, + "step": 38166 + }, + { + "epoch": 2.369296666459743, + "grad_norm": 0.14122103500670144, + "learning_rate": 1.2870320347485537e-05, + "loss": 2.7116, + "step": 38167 + }, + { + "epoch": 2.3693587435595007, + "grad_norm": 0.1398447830545639, + "learning_rate": 1.2867901653814907e-05, + "loss": 2.7436, + "step": 38168 + }, + { + "epoch": 2.3694208206592586, + "grad_norm": 0.14638477754228296, + "learning_rate": 1.2865483153871444e-05, + "loss": 2.7864, + "step": 38169 + }, + { + "epoch": 2.3694828977590165, + "grad_norm": 0.1416787496198644, + "learning_rate": 1.2863064847667755e-05, + "loss": 2.7603, + "step": 38170 + }, + { + "epoch": 2.3695449748587745, + "grad_norm": 0.14005758463632076, + "learning_rate": 1.2860646735216486e-05, + "loss": 2.7244, + "step": 38171 + }, + { + "epoch": 2.3696070519585324, + "grad_norm": 0.1424333739019943, + "learning_rate": 1.2858228816530244e-05, + "loss": 2.6825, + "step": 38172 + }, + { + "epoch": 2.3696691290582903, + "grad_norm": 0.14063344373784378, + "learning_rate": 1.2855811091621633e-05, + "loss": 2.7883, + "step": 38173 + }, + { + "epoch": 2.369731206158048, + "grad_norm": 0.14833179901589477, + "learning_rate": 1.2853393560503274e-05, + "loss": 2.5861, + "step": 38174 + }, + { + "epoch": 2.369793283257806, + "grad_norm": 0.1384153612853939, + "learning_rate": 1.2850976223187766e-05, + "loss": 2.7012, + "step": 38175 + }, + { + "epoch": 2.369855360357564, + "grad_norm": 0.15082002002971265, + "learning_rate": 1.2848559079687744e-05, + "loss": 2.7083, + "step": 38176 + }, + { + "epoch": 2.369917437457322, + "grad_norm": 0.13314632927697723, + "learning_rate": 1.2846142130015814e-05, + "loss": 2.7885, + "step": 38177 + }, + { + "epoch": 2.36997951455708, + "grad_norm": 0.14723013189598652, + "learning_rate": 1.2843725374184574e-05, + "loss": 2.7033, + "step": 38178 + }, + { + "epoch": 2.370041591656838, + "grad_norm": 0.15907891717137818, + "learning_rate": 1.2841308812206626e-05, + "loss": 2.7327, + "step": 38179 + }, + { + "epoch": 2.3701036687565957, + "grad_norm": 0.1485678514540638, + "learning_rate": 1.2838892444094609e-05, + "loss": 2.7349, + "step": 38180 + }, + { + "epoch": 2.3701657458563536, + "grad_norm": 0.14238193155861265, + "learning_rate": 1.2836476269861108e-05, + "loss": 2.8276, + "step": 38181 + }, + { + "epoch": 2.3702278229561116, + "grad_norm": 0.1520227571802822, + "learning_rate": 1.283406028951873e-05, + "loss": 2.7711, + "step": 38182 + }, + { + "epoch": 2.3702899000558695, + "grad_norm": 0.14762653122397693, + "learning_rate": 1.2831644503080087e-05, + "loss": 2.6994, + "step": 38183 + }, + { + "epoch": 2.3703519771556274, + "grad_norm": 0.1378860505293554, + "learning_rate": 1.2829228910557756e-05, + "loss": 2.6925, + "step": 38184 + }, + { + "epoch": 2.3704140542553853, + "grad_norm": 0.1336031408522866, + "learning_rate": 1.282681351196438e-05, + "loss": 2.6795, + "step": 38185 + }, + { + "epoch": 2.3704761313551432, + "grad_norm": 0.1505814122411897, + "learning_rate": 1.2824398307312547e-05, + "loss": 2.7536, + "step": 38186 + }, + { + "epoch": 2.370538208454901, + "grad_norm": 0.14426649175462253, + "learning_rate": 1.2821983296614842e-05, + "loss": 2.7485, + "step": 38187 + }, + { + "epoch": 2.370600285554659, + "grad_norm": 0.1396897528293899, + "learning_rate": 1.281956847988387e-05, + "loss": 2.7091, + "step": 38188 + }, + { + "epoch": 2.370662362654417, + "grad_norm": 0.14141575033193143, + "learning_rate": 1.2817153857132248e-05, + "loss": 2.6744, + "step": 38189 + }, + { + "epoch": 2.3707244397541745, + "grad_norm": 0.15718775323540252, + "learning_rate": 1.2814739428372557e-05, + "loss": 2.7184, + "step": 38190 + }, + { + "epoch": 2.370786516853933, + "grad_norm": 0.20044847623748324, + "learning_rate": 1.2812325193617386e-05, + "loss": 2.7541, + "step": 38191 + }, + { + "epoch": 2.3708485939536903, + "grad_norm": 0.1670931739408381, + "learning_rate": 1.2809911152879356e-05, + "loss": 2.7701, + "step": 38192 + }, + { + "epoch": 2.370910671053448, + "grad_norm": 0.13780382628620227, + "learning_rate": 1.2807497306171029e-05, + "loss": 2.7428, + "step": 38193 + }, + { + "epoch": 2.370972748153206, + "grad_norm": 0.14841506813609426, + "learning_rate": 1.2805083653505035e-05, + "loss": 2.7485, + "step": 38194 + }, + { + "epoch": 2.371034825252964, + "grad_norm": 0.14436068651743364, + "learning_rate": 1.2802670194893951e-05, + "loss": 2.6409, + "step": 38195 + }, + { + "epoch": 2.371096902352722, + "grad_norm": 0.1406153635855907, + "learning_rate": 1.2800256930350362e-05, + "loss": 2.6617, + "step": 38196 + }, + { + "epoch": 2.37115897945248, + "grad_norm": 0.14070617134486166, + "learning_rate": 1.2797843859886871e-05, + "loss": 2.6373, + "step": 38197 + }, + { + "epoch": 2.371221056552238, + "grad_norm": 0.150291032140399, + "learning_rate": 1.2795430983516038e-05, + "loss": 2.6629, + "step": 38198 + }, + { + "epoch": 2.3712831336519957, + "grad_norm": 0.1427194730791719, + "learning_rate": 1.2793018301250492e-05, + "loss": 2.7887, + "step": 38199 + }, + { + "epoch": 2.3713452107517536, + "grad_norm": 0.14487280953300738, + "learning_rate": 1.2790605813102796e-05, + "loss": 2.7525, + "step": 38200 + }, + { + "epoch": 2.3714072878515116, + "grad_norm": 0.14219716531789167, + "learning_rate": 1.2788193519085551e-05, + "loss": 2.7857, + "step": 38201 + }, + { + "epoch": 2.3714693649512695, + "grad_norm": 0.14615187100817406, + "learning_rate": 1.2785781419211312e-05, + "loss": 2.7689, + "step": 38202 + }, + { + "epoch": 2.3715314420510274, + "grad_norm": 0.1459616681085604, + "learning_rate": 1.2783369513492705e-05, + "loss": 2.6608, + "step": 38203 + }, + { + "epoch": 2.3715935191507853, + "grad_norm": 0.1533329648671267, + "learning_rate": 1.278095780194229e-05, + "loss": 2.6402, + "step": 38204 + }, + { + "epoch": 2.3716555962505432, + "grad_norm": 0.14627412966818257, + "learning_rate": 1.2778546284572657e-05, + "loss": 2.749, + "step": 38205 + }, + { + "epoch": 2.371717673350301, + "grad_norm": 0.1462775295891135, + "learning_rate": 1.2776134961396364e-05, + "loss": 2.7185, + "step": 38206 + }, + { + "epoch": 2.371779750450059, + "grad_norm": 0.14967708250759876, + "learning_rate": 1.2773723832426032e-05, + "loss": 2.7385, + "step": 38207 + }, + { + "epoch": 2.371841827549817, + "grad_norm": 0.13717359087824946, + "learning_rate": 1.2771312897674215e-05, + "loss": 2.6923, + "step": 38208 + }, + { + "epoch": 2.371903904649575, + "grad_norm": 0.14212626878462278, + "learning_rate": 1.2768902157153495e-05, + "loss": 2.7311, + "step": 38209 + }, + { + "epoch": 2.371965981749333, + "grad_norm": 0.1385905403380544, + "learning_rate": 1.2766491610876458e-05, + "loss": 2.7289, + "step": 38210 + }, + { + "epoch": 2.3720280588490907, + "grad_norm": 0.14244528029359926, + "learning_rate": 1.2764081258855654e-05, + "loss": 2.7137, + "step": 38211 + }, + { + "epoch": 2.3720901359488487, + "grad_norm": 0.1419222981115794, + "learning_rate": 1.276167110110369e-05, + "loss": 2.7313, + "step": 38212 + }, + { + "epoch": 2.372152213048606, + "grad_norm": 0.13511664521416492, + "learning_rate": 1.2759261137633133e-05, + "loss": 2.6126, + "step": 38213 + }, + { + "epoch": 2.3722142901483645, + "grad_norm": 0.14824542995721512, + "learning_rate": 1.2756851368456546e-05, + "loss": 2.6944, + "step": 38214 + }, + { + "epoch": 2.372276367248122, + "grad_norm": 0.14645656957577233, + "learning_rate": 1.2754441793586491e-05, + "loss": 2.7446, + "step": 38215 + }, + { + "epoch": 2.37233844434788, + "grad_norm": 0.14350889734230604, + "learning_rate": 1.2752032413035574e-05, + "loss": 2.6669, + "step": 38216 + }, + { + "epoch": 2.372400521447638, + "grad_norm": 0.14078500043041645, + "learning_rate": 1.2749623226816343e-05, + "loss": 2.6381, + "step": 38217 + }, + { + "epoch": 2.3724625985473957, + "grad_norm": 0.13886555725728852, + "learning_rate": 1.2747214234941373e-05, + "loss": 2.6656, + "step": 38218 + }, + { + "epoch": 2.3725246756471536, + "grad_norm": 0.14014539126898304, + "learning_rate": 1.2744805437423229e-05, + "loss": 2.716, + "step": 38219 + }, + { + "epoch": 2.3725867527469116, + "grad_norm": 0.14991115332944635, + "learning_rate": 1.2742396834274467e-05, + "loss": 2.7417, + "step": 38220 + }, + { + "epoch": 2.3726488298466695, + "grad_norm": 0.15483366269379267, + "learning_rate": 1.2739988425507676e-05, + "loss": 2.7185, + "step": 38221 + }, + { + "epoch": 2.3727109069464274, + "grad_norm": 0.1363311928455819, + "learning_rate": 1.2737580211135414e-05, + "loss": 2.7387, + "step": 38222 + }, + { + "epoch": 2.3727729840461853, + "grad_norm": 0.15217753659715122, + "learning_rate": 1.2735172191170237e-05, + "loss": 2.6581, + "step": 38223 + }, + { + "epoch": 2.3728350611459432, + "grad_norm": 0.1390757321373771, + "learning_rate": 1.27327643656247e-05, + "loss": 2.721, + "step": 38224 + }, + { + "epoch": 2.372897138245701, + "grad_norm": 0.13581984514666753, + "learning_rate": 1.2730356734511378e-05, + "loss": 2.6853, + "step": 38225 + }, + { + "epoch": 2.372959215345459, + "grad_norm": 0.13839775177874308, + "learning_rate": 1.272794929784285e-05, + "loss": 2.7044, + "step": 38226 + }, + { + "epoch": 2.373021292445217, + "grad_norm": 0.13808503022496113, + "learning_rate": 1.2725542055631655e-05, + "loss": 2.6904, + "step": 38227 + }, + { + "epoch": 2.373083369544975, + "grad_norm": 0.1379372385573059, + "learning_rate": 1.2723135007890359e-05, + "loss": 2.7051, + "step": 38228 + }, + { + "epoch": 2.373145446644733, + "grad_norm": 0.14307921680993269, + "learning_rate": 1.27207281546315e-05, + "loss": 2.68, + "step": 38229 + }, + { + "epoch": 2.3732075237444907, + "grad_norm": 0.13990423476834263, + "learning_rate": 1.2718321495867669e-05, + "loss": 2.6907, + "step": 38230 + }, + { + "epoch": 2.3732696008442487, + "grad_norm": 0.15134958349426736, + "learning_rate": 1.2715915031611402e-05, + "loss": 2.7153, + "step": 38231 + }, + { + "epoch": 2.3733316779440066, + "grad_norm": 0.1549079239681528, + "learning_rate": 1.271350876187526e-05, + "loss": 2.7138, + "step": 38232 + }, + { + "epoch": 2.3733937550437645, + "grad_norm": 0.14604521087852562, + "learning_rate": 1.2711102686671794e-05, + "loss": 2.7428, + "step": 38233 + }, + { + "epoch": 2.3734558321435224, + "grad_norm": 0.14229074675312436, + "learning_rate": 1.2708696806013542e-05, + "loss": 2.7559, + "step": 38234 + }, + { + "epoch": 2.3735179092432803, + "grad_norm": 0.14097902980219235, + "learning_rate": 1.2706291119913088e-05, + "loss": 2.709, + "step": 38235 + }, + { + "epoch": 2.3735799863430382, + "grad_norm": 0.13703637693115978, + "learning_rate": 1.2703885628382967e-05, + "loss": 2.6619, + "step": 38236 + }, + { + "epoch": 2.373642063442796, + "grad_norm": 0.14437293712785348, + "learning_rate": 1.2701480331435722e-05, + "loss": 2.7188, + "step": 38237 + }, + { + "epoch": 2.3737041405425536, + "grad_norm": 0.14739015282399898, + "learning_rate": 1.2699075229083901e-05, + "loss": 2.8196, + "step": 38238 + }, + { + "epoch": 2.373766217642312, + "grad_norm": 0.14986299252988738, + "learning_rate": 1.2696670321340071e-05, + "loss": 2.6974, + "step": 38239 + }, + { + "epoch": 2.3738282947420695, + "grad_norm": 0.14724108747054288, + "learning_rate": 1.269426560821677e-05, + "loss": 2.6781, + "step": 38240 + }, + { + "epoch": 2.3738903718418274, + "grad_norm": 0.13190486660150338, + "learning_rate": 1.2691861089726542e-05, + "loss": 2.6849, + "step": 38241 + }, + { + "epoch": 2.3739524489415853, + "grad_norm": 0.13531209196773616, + "learning_rate": 1.2689456765881913e-05, + "loss": 2.6533, + "step": 38242 + }, + { + "epoch": 2.374014526041343, + "grad_norm": 0.14349866928877544, + "learning_rate": 1.2687052636695462e-05, + "loss": 2.7622, + "step": 38243 + }, + { + "epoch": 2.374076603141101, + "grad_norm": 0.16707716558750507, + "learning_rate": 1.2684648702179719e-05, + "loss": 2.7999, + "step": 38244 + }, + { + "epoch": 2.374138680240859, + "grad_norm": 0.14537152045424484, + "learning_rate": 1.2682244962347217e-05, + "loss": 2.6854, + "step": 38245 + }, + { + "epoch": 2.374200757340617, + "grad_norm": 0.1518388046001804, + "learning_rate": 1.2679841417210504e-05, + "loss": 2.7172, + "step": 38246 + }, + { + "epoch": 2.374262834440375, + "grad_norm": 0.15759928576829801, + "learning_rate": 1.2677438066782104e-05, + "loss": 2.7702, + "step": 38247 + }, + { + "epoch": 2.374324911540133, + "grad_norm": 0.14822213963383263, + "learning_rate": 1.2675034911074585e-05, + "loss": 2.7274, + "step": 38248 + }, + { + "epoch": 2.3743869886398907, + "grad_norm": 0.13583394935264423, + "learning_rate": 1.2672631950100466e-05, + "loss": 2.6373, + "step": 38249 + }, + { + "epoch": 2.3744490657396486, + "grad_norm": 0.14044726584059095, + "learning_rate": 1.2670229183872295e-05, + "loss": 2.7369, + "step": 38250 + }, + { + "epoch": 2.3745111428394066, + "grad_norm": 0.15707117303870105, + "learning_rate": 1.2667826612402578e-05, + "loss": 2.6616, + "step": 38251 + }, + { + "epoch": 2.3745732199391645, + "grad_norm": 0.13404580701479704, + "learning_rate": 1.2665424235703894e-05, + "loss": 2.619, + "step": 38252 + }, + { + "epoch": 2.3746352970389224, + "grad_norm": 0.14664407925203202, + "learning_rate": 1.2663022053788748e-05, + "loss": 2.7601, + "step": 38253 + }, + { + "epoch": 2.3746973741386803, + "grad_norm": 0.15236015720536336, + "learning_rate": 1.2660620066669687e-05, + "loss": 2.7637, + "step": 38254 + }, + { + "epoch": 2.3747594512384382, + "grad_norm": 0.1393486620090076, + "learning_rate": 1.2658218274359218e-05, + "loss": 2.7325, + "step": 38255 + }, + { + "epoch": 2.374821528338196, + "grad_norm": 0.13432185735675553, + "learning_rate": 1.2655816676869887e-05, + "loss": 2.6419, + "step": 38256 + }, + { + "epoch": 2.374883605437954, + "grad_norm": 0.1460173919860647, + "learning_rate": 1.2653415274214243e-05, + "loss": 2.6271, + "step": 38257 + }, + { + "epoch": 2.374945682537712, + "grad_norm": 0.14785716526194712, + "learning_rate": 1.2651014066404798e-05, + "loss": 2.7003, + "step": 38258 + }, + { + "epoch": 2.37500775963747, + "grad_norm": 0.14999606653857647, + "learning_rate": 1.2648613053454078e-05, + "loss": 2.6919, + "step": 38259 + }, + { + "epoch": 2.375069836737228, + "grad_norm": 0.1447999621377416, + "learning_rate": 1.264621223537461e-05, + "loss": 2.7152, + "step": 38260 + }, + { + "epoch": 2.3751319138369853, + "grad_norm": 0.14177846710268738, + "learning_rate": 1.2643811612178908e-05, + "loss": 2.7645, + "step": 38261 + }, + { + "epoch": 2.3751939909367437, + "grad_norm": 0.15788065458411746, + "learning_rate": 1.2641411183879525e-05, + "loss": 2.7413, + "step": 38262 + }, + { + "epoch": 2.375256068036501, + "grad_norm": 0.1360408187531145, + "learning_rate": 1.2639010950488966e-05, + "loss": 2.7737, + "step": 38263 + }, + { + "epoch": 2.375318145136259, + "grad_norm": 0.140403798758401, + "learning_rate": 1.2636610912019759e-05, + "loss": 2.7402, + "step": 38264 + }, + { + "epoch": 2.375380222236017, + "grad_norm": 0.14103902030947996, + "learning_rate": 1.2634211068484408e-05, + "loss": 2.7057, + "step": 38265 + }, + { + "epoch": 2.375442299335775, + "grad_norm": 0.14402703665862798, + "learning_rate": 1.2631811419895462e-05, + "loss": 2.7246, + "step": 38266 + }, + { + "epoch": 2.375504376435533, + "grad_norm": 0.14671433518321408, + "learning_rate": 1.2629411966265431e-05, + "loss": 2.7508, + "step": 38267 + }, + { + "epoch": 2.3755664535352907, + "grad_norm": 0.15369251048672697, + "learning_rate": 1.2627012707606822e-05, + "loss": 2.6482, + "step": 38268 + }, + { + "epoch": 2.3756285306350486, + "grad_norm": 0.13819893930370752, + "learning_rate": 1.262461364393217e-05, + "loss": 2.7211, + "step": 38269 + }, + { + "epoch": 2.3756906077348066, + "grad_norm": 0.1330545547741825, + "learning_rate": 1.262221477525396e-05, + "loss": 2.7497, + "step": 38270 + }, + { + "epoch": 2.3757526848345645, + "grad_norm": 0.14582507856598412, + "learning_rate": 1.2619816101584747e-05, + "loss": 2.6617, + "step": 38271 + }, + { + "epoch": 2.3758147619343224, + "grad_norm": 0.145709659440116, + "learning_rate": 1.2617417622937027e-05, + "loss": 2.7788, + "step": 38272 + }, + { + "epoch": 2.3758768390340803, + "grad_norm": 0.15323476166130348, + "learning_rate": 1.2615019339323314e-05, + "loss": 2.7576, + "step": 38273 + }, + { + "epoch": 2.3759389161338382, + "grad_norm": 0.13312194341633923, + "learning_rate": 1.2612621250756107e-05, + "loss": 2.6808, + "step": 38274 + }, + { + "epoch": 2.376000993233596, + "grad_norm": 0.15678709820771392, + "learning_rate": 1.2610223357247946e-05, + "loss": 2.7604, + "step": 38275 + }, + { + "epoch": 2.376063070333354, + "grad_norm": 0.14449468356304934, + "learning_rate": 1.2607825658811323e-05, + "loss": 2.6441, + "step": 38276 + }, + { + "epoch": 2.376125147433112, + "grad_norm": 0.14074423821040757, + "learning_rate": 1.2605428155458753e-05, + "loss": 2.7411, + "step": 38277 + }, + { + "epoch": 2.37618722453287, + "grad_norm": 0.15236982709273225, + "learning_rate": 1.260303084720273e-05, + "loss": 2.7955, + "step": 38278 + }, + { + "epoch": 2.376249301632628, + "grad_norm": 0.1400986596505909, + "learning_rate": 1.2600633734055789e-05, + "loss": 2.6151, + "step": 38279 + }, + { + "epoch": 2.3763113787323857, + "grad_norm": 0.14232853376836788, + "learning_rate": 1.2598236816030417e-05, + "loss": 2.7046, + "step": 38280 + }, + { + "epoch": 2.3763734558321437, + "grad_norm": 0.15297247676576314, + "learning_rate": 1.2595840093139121e-05, + "loss": 2.714, + "step": 38281 + }, + { + "epoch": 2.3764355329319016, + "grad_norm": 0.13844081944647255, + "learning_rate": 1.2593443565394414e-05, + "loss": 2.7776, + "step": 38282 + }, + { + "epoch": 2.3764976100316595, + "grad_norm": 0.14922075919351482, + "learning_rate": 1.2591047232808773e-05, + "loss": 2.7873, + "step": 38283 + }, + { + "epoch": 2.3765596871314174, + "grad_norm": 0.14229274302764808, + "learning_rate": 1.2588651095394732e-05, + "loss": 2.7536, + "step": 38284 + }, + { + "epoch": 2.3766217642311753, + "grad_norm": 0.14148883278584085, + "learning_rate": 1.2586255153164789e-05, + "loss": 2.6994, + "step": 38285 + }, + { + "epoch": 2.376683841330933, + "grad_norm": 0.1399804882625983, + "learning_rate": 1.2583859406131427e-05, + "loss": 2.7253, + "step": 38286 + }, + { + "epoch": 2.376745918430691, + "grad_norm": 0.16662295831315146, + "learning_rate": 1.2581463854307146e-05, + "loss": 2.7177, + "step": 38287 + }, + { + "epoch": 2.3768079955304486, + "grad_norm": 0.14977925369425432, + "learning_rate": 1.2579068497704461e-05, + "loss": 2.7312, + "step": 38288 + }, + { + "epoch": 2.3768700726302066, + "grad_norm": 0.16599400323425992, + "learning_rate": 1.2576673336335849e-05, + "loss": 2.6838, + "step": 38289 + }, + { + "epoch": 2.3769321497299645, + "grad_norm": 0.1450808937926043, + "learning_rate": 1.2574278370213826e-05, + "loss": 2.7043, + "step": 38290 + }, + { + "epoch": 2.3769942268297224, + "grad_norm": 0.1396672029491582, + "learning_rate": 1.257188359935088e-05, + "loss": 2.7624, + "step": 38291 + }, + { + "epoch": 2.3770563039294803, + "grad_norm": 0.14148893152037326, + "learning_rate": 1.2569489023759495e-05, + "loss": 2.7319, + "step": 38292 + }, + { + "epoch": 2.3771183810292382, + "grad_norm": 0.14356443151504067, + "learning_rate": 1.2567094643452182e-05, + "loss": 2.6878, + "step": 38293 + }, + { + "epoch": 2.377180458128996, + "grad_norm": 0.1436749039289712, + "learning_rate": 1.2564700458441424e-05, + "loss": 2.7166, + "step": 38294 + }, + { + "epoch": 2.377242535228754, + "grad_norm": 0.14729831267011664, + "learning_rate": 1.256230646873971e-05, + "loss": 2.7133, + "step": 38295 + }, + { + "epoch": 2.377304612328512, + "grad_norm": 0.13606173823156087, + "learning_rate": 1.255991267435953e-05, + "loss": 2.68, + "step": 38296 + }, + { + "epoch": 2.37736668942827, + "grad_norm": 0.14101995530176598, + "learning_rate": 1.2557519075313367e-05, + "loss": 2.7003, + "step": 38297 + }, + { + "epoch": 2.377428766528028, + "grad_norm": 0.15514822175613954, + "learning_rate": 1.2555125671613726e-05, + "loss": 2.7192, + "step": 38298 + }, + { + "epoch": 2.3774908436277857, + "grad_norm": 0.15181894875167495, + "learning_rate": 1.2552732463273087e-05, + "loss": 2.6492, + "step": 38299 + }, + { + "epoch": 2.3775529207275437, + "grad_norm": 0.14515080274415124, + "learning_rate": 1.2550339450303927e-05, + "loss": 2.6941, + "step": 38300 + }, + { + "epoch": 2.3776149978273016, + "grad_norm": 0.16338829177740186, + "learning_rate": 1.2547946632718732e-05, + "loss": 2.6489, + "step": 38301 + }, + { + "epoch": 2.3776770749270595, + "grad_norm": 0.1432962036753494, + "learning_rate": 1.2545554010529997e-05, + "loss": 2.748, + "step": 38302 + }, + { + "epoch": 2.3777391520268174, + "grad_norm": 0.14330074660212533, + "learning_rate": 1.2543161583750207e-05, + "loss": 2.6531, + "step": 38303 + }, + { + "epoch": 2.3778012291265753, + "grad_norm": 0.14300678429941194, + "learning_rate": 1.2540769352391829e-05, + "loss": 2.7419, + "step": 38304 + }, + { + "epoch": 2.3778633062263332, + "grad_norm": 0.1490100309608514, + "learning_rate": 1.2538377316467342e-05, + "loss": 2.6391, + "step": 38305 + }, + { + "epoch": 2.377925383326091, + "grad_norm": 0.13887721963047717, + "learning_rate": 1.2535985475989242e-05, + "loss": 2.7813, + "step": 38306 + }, + { + "epoch": 2.377987460425849, + "grad_norm": 0.13839214614276807, + "learning_rate": 1.2533593830970008e-05, + "loss": 2.6642, + "step": 38307 + }, + { + "epoch": 2.378049537525607, + "grad_norm": 0.1528276924273043, + "learning_rate": 1.2531202381422107e-05, + "loss": 2.6995, + "step": 38308 + }, + { + "epoch": 2.3781116146253645, + "grad_norm": 0.1506180220685246, + "learning_rate": 1.252881112735802e-05, + "loss": 2.6949, + "step": 38309 + }, + { + "epoch": 2.378173691725123, + "grad_norm": 0.13799585185303903, + "learning_rate": 1.2526420068790206e-05, + "loss": 2.6632, + "step": 38310 + }, + { + "epoch": 2.3782357688248803, + "grad_norm": 0.1708420469709593, + "learning_rate": 1.2524029205731169e-05, + "loss": 2.7484, + "step": 38311 + }, + { + "epoch": 2.378297845924638, + "grad_norm": 0.14285569411100899, + "learning_rate": 1.2521638538193375e-05, + "loss": 2.6668, + "step": 38312 + }, + { + "epoch": 2.378359923024396, + "grad_norm": 0.1349975552823209, + "learning_rate": 1.2519248066189287e-05, + "loss": 2.6546, + "step": 38313 + }, + { + "epoch": 2.378422000124154, + "grad_norm": 0.1383041085461297, + "learning_rate": 1.2516857789731363e-05, + "loss": 2.7102, + "step": 38314 + }, + { + "epoch": 2.378484077223912, + "grad_norm": 0.135324308340464, + "learning_rate": 1.2514467708832112e-05, + "loss": 2.7585, + "step": 38315 + }, + { + "epoch": 2.37854615432367, + "grad_norm": 0.1477999611006164, + "learning_rate": 1.2512077823503977e-05, + "loss": 2.7748, + "step": 38316 + }, + { + "epoch": 2.378608231423428, + "grad_norm": 0.14499520890977602, + "learning_rate": 1.2509688133759434e-05, + "loss": 2.7283, + "step": 38317 + }, + { + "epoch": 2.3786703085231857, + "grad_norm": 0.1350042262674121, + "learning_rate": 1.2507298639610954e-05, + "loss": 2.6747, + "step": 38318 + }, + { + "epoch": 2.3787323856229436, + "grad_norm": 0.14289372930392685, + "learning_rate": 1.250490934107098e-05, + "loss": 2.6603, + "step": 38319 + }, + { + "epoch": 2.3787944627227016, + "grad_norm": 0.1390696381762089, + "learning_rate": 1.250252023815201e-05, + "loss": 2.6913, + "step": 38320 + }, + { + "epoch": 2.3788565398224595, + "grad_norm": 0.1537748511707328, + "learning_rate": 1.2500131330866494e-05, + "loss": 2.7338, + "step": 38321 + }, + { + "epoch": 2.3789186169222174, + "grad_norm": 0.13616422184127142, + "learning_rate": 1.2497742619226887e-05, + "loss": 2.6964, + "step": 38322 + }, + { + "epoch": 2.3789806940219753, + "grad_norm": 0.1404354022671123, + "learning_rate": 1.249535410324567e-05, + "loss": 2.7382, + "step": 38323 + }, + { + "epoch": 2.3790427711217332, + "grad_norm": 0.13154051140498535, + "learning_rate": 1.2492965782935284e-05, + "loss": 2.742, + "step": 38324 + }, + { + "epoch": 2.379104848221491, + "grad_norm": 0.1393699403237864, + "learning_rate": 1.2490577658308212e-05, + "loss": 2.7536, + "step": 38325 + }, + { + "epoch": 2.379166925321249, + "grad_norm": 0.15242992922425636, + "learning_rate": 1.2488189729376903e-05, + "loss": 2.6675, + "step": 38326 + }, + { + "epoch": 2.379229002421007, + "grad_norm": 0.13749113569080046, + "learning_rate": 1.2485801996153817e-05, + "loss": 2.7184, + "step": 38327 + }, + { + "epoch": 2.379291079520765, + "grad_norm": 0.1422427753803623, + "learning_rate": 1.2483414458651393e-05, + "loss": 2.7621, + "step": 38328 + }, + { + "epoch": 2.379353156620523, + "grad_norm": 0.136003558233289, + "learning_rate": 1.2481027116882115e-05, + "loss": 2.7132, + "step": 38329 + }, + { + "epoch": 2.3794152337202807, + "grad_norm": 0.1502885781899188, + "learning_rate": 1.247863997085843e-05, + "loss": 2.656, + "step": 38330 + }, + { + "epoch": 2.3794773108200387, + "grad_norm": 0.13102716392834782, + "learning_rate": 1.2476253020592788e-05, + "loss": 2.6971, + "step": 38331 + }, + { + "epoch": 2.3795393879197966, + "grad_norm": 0.15477375385632772, + "learning_rate": 1.2473866266097645e-05, + "loss": 2.6885, + "step": 38332 + }, + { + "epoch": 2.3796014650195545, + "grad_norm": 0.1397601127923533, + "learning_rate": 1.2471479707385436e-05, + "loss": 2.6379, + "step": 38333 + }, + { + "epoch": 2.379663542119312, + "grad_norm": 0.14493292331995444, + "learning_rate": 1.2469093344468641e-05, + "loss": 2.7101, + "step": 38334 + }, + { + "epoch": 2.3797256192190703, + "grad_norm": 0.13578087097703295, + "learning_rate": 1.2466707177359698e-05, + "loss": 2.7387, + "step": 38335 + }, + { + "epoch": 2.379787696318828, + "grad_norm": 0.15161938116090917, + "learning_rate": 1.2464321206071055e-05, + "loss": 2.7404, + "step": 38336 + }, + { + "epoch": 2.3798497734185857, + "grad_norm": 0.1561612115361483, + "learning_rate": 1.2461935430615146e-05, + "loss": 2.6512, + "step": 38337 + }, + { + "epoch": 2.3799118505183436, + "grad_norm": 0.1540609318264584, + "learning_rate": 1.2459549851004443e-05, + "loss": 2.6631, + "step": 38338 + }, + { + "epoch": 2.3799739276181016, + "grad_norm": 0.1364715589479256, + "learning_rate": 1.2457164467251387e-05, + "loss": 2.7344, + "step": 38339 + }, + { + "epoch": 2.3800360047178595, + "grad_norm": 0.13926847157607358, + "learning_rate": 1.2454779279368417e-05, + "loss": 2.7321, + "step": 38340 + }, + { + "epoch": 2.3800980818176174, + "grad_norm": 0.1432637880848053, + "learning_rate": 1.2452394287367964e-05, + "loss": 2.76, + "step": 38341 + }, + { + "epoch": 2.3801601589173753, + "grad_norm": 0.13513770775585385, + "learning_rate": 1.2450009491262493e-05, + "loss": 2.6811, + "step": 38342 + }, + { + "epoch": 2.3802222360171332, + "grad_norm": 0.14489842505323072, + "learning_rate": 1.2447624891064447e-05, + "loss": 2.7519, + "step": 38343 + }, + { + "epoch": 2.380284313116891, + "grad_norm": 0.16009855975875384, + "learning_rate": 1.2445240486786252e-05, + "loss": 2.6823, + "step": 38344 + }, + { + "epoch": 2.380346390216649, + "grad_norm": 0.13535373322698965, + "learning_rate": 1.2442856278440357e-05, + "loss": 2.6931, + "step": 38345 + }, + { + "epoch": 2.380408467316407, + "grad_norm": 0.14288312479339935, + "learning_rate": 1.2440472266039182e-05, + "loss": 2.6814, + "step": 38346 + }, + { + "epoch": 2.380470544416165, + "grad_norm": 0.1362424728956678, + "learning_rate": 1.2438088449595198e-05, + "loss": 2.6356, + "step": 38347 + }, + { + "epoch": 2.380532621515923, + "grad_norm": 0.1465564613989368, + "learning_rate": 1.243570482912082e-05, + "loss": 2.6738, + "step": 38348 + }, + { + "epoch": 2.3805946986156807, + "grad_norm": 0.1542805349172301, + "learning_rate": 1.2433321404628495e-05, + "loss": 2.8187, + "step": 38349 + }, + { + "epoch": 2.3806567757154387, + "grad_norm": 0.14270892088019643, + "learning_rate": 1.2430938176130636e-05, + "loss": 2.6449, + "step": 38350 + }, + { + "epoch": 2.3807188528151966, + "grad_norm": 0.13626811830111507, + "learning_rate": 1.2428555143639703e-05, + "loss": 2.7306, + "step": 38351 + }, + { + "epoch": 2.3807809299149545, + "grad_norm": 0.13590838871481542, + "learning_rate": 1.2426172307168121e-05, + "loss": 2.7264, + "step": 38352 + }, + { + "epoch": 2.3808430070147124, + "grad_norm": 0.14941084615200098, + "learning_rate": 1.242378966672832e-05, + "loss": 2.7601, + "step": 38353 + }, + { + "epoch": 2.3809050841144703, + "grad_norm": 0.14059651934119247, + "learning_rate": 1.2421407222332732e-05, + "loss": 2.7531, + "step": 38354 + }, + { + "epoch": 2.3809671612142282, + "grad_norm": 0.1415700088071642, + "learning_rate": 1.2419024973993765e-05, + "loss": 2.6239, + "step": 38355 + }, + { + "epoch": 2.381029238313986, + "grad_norm": 0.14710427407584623, + "learning_rate": 1.2416642921723876e-05, + "loss": 2.6884, + "step": 38356 + }, + { + "epoch": 2.3810913154137436, + "grad_norm": 0.1378427487217858, + "learning_rate": 1.2414261065535494e-05, + "loss": 2.715, + "step": 38357 + }, + { + "epoch": 2.381153392513502, + "grad_norm": 0.1393633780744398, + "learning_rate": 1.2411879405441034e-05, + "loss": 2.6293, + "step": 38358 + }, + { + "epoch": 2.3812154696132595, + "grad_norm": 0.13256581621909597, + "learning_rate": 1.240949794145293e-05, + "loss": 2.7441, + "step": 38359 + }, + { + "epoch": 2.3812775467130174, + "grad_norm": 0.13887225033186512, + "learning_rate": 1.240711667358358e-05, + "loss": 2.7091, + "step": 38360 + }, + { + "epoch": 2.3813396238127753, + "grad_norm": 0.15133306672471708, + "learning_rate": 1.2404735601845446e-05, + "loss": 2.7461, + "step": 38361 + }, + { + "epoch": 2.3814017009125332, + "grad_norm": 0.14756736325373404, + "learning_rate": 1.2402354726250937e-05, + "loss": 2.7582, + "step": 38362 + }, + { + "epoch": 2.381463778012291, + "grad_norm": 0.13633998447633144, + "learning_rate": 1.2399974046812462e-05, + "loss": 2.7228, + "step": 38363 + }, + { + "epoch": 2.381525855112049, + "grad_norm": 0.1530594580863254, + "learning_rate": 1.2397593563542442e-05, + "loss": 2.7751, + "step": 38364 + }, + { + "epoch": 2.381587932211807, + "grad_norm": 0.14149111024485342, + "learning_rate": 1.2395213276453316e-05, + "loss": 2.7612, + "step": 38365 + }, + { + "epoch": 2.381650009311565, + "grad_norm": 0.14188572225511398, + "learning_rate": 1.2392833185557495e-05, + "loss": 2.7174, + "step": 38366 + }, + { + "epoch": 2.381712086411323, + "grad_norm": 0.1345101713487851, + "learning_rate": 1.2390453290867388e-05, + "loss": 2.7163, + "step": 38367 + }, + { + "epoch": 2.3817741635110807, + "grad_norm": 0.13664952296604008, + "learning_rate": 1.2388073592395416e-05, + "loss": 2.7596, + "step": 38368 + }, + { + "epoch": 2.3818362406108387, + "grad_norm": 0.17694727612191205, + "learning_rate": 1.2385694090153981e-05, + "loss": 2.7353, + "step": 38369 + }, + { + "epoch": 2.3818983177105966, + "grad_norm": 0.1358544208156375, + "learning_rate": 1.238331478415553e-05, + "loss": 2.7227, + "step": 38370 + }, + { + "epoch": 2.3819603948103545, + "grad_norm": 0.13194940417223147, + "learning_rate": 1.2380935674412453e-05, + "loss": 2.5446, + "step": 38371 + }, + { + "epoch": 2.3820224719101124, + "grad_norm": 0.14385663280978905, + "learning_rate": 1.2378556760937171e-05, + "loss": 2.7473, + "step": 38372 + }, + { + "epoch": 2.3820845490098703, + "grad_norm": 0.14633005914871158, + "learning_rate": 1.2376178043742076e-05, + "loss": 2.7437, + "step": 38373 + }, + { + "epoch": 2.3821466261096282, + "grad_norm": 0.13334561387022864, + "learning_rate": 1.2373799522839602e-05, + "loss": 2.6774, + "step": 38374 + }, + { + "epoch": 2.382208703209386, + "grad_norm": 0.13852015106543575, + "learning_rate": 1.2371421198242156e-05, + "loss": 2.7161, + "step": 38375 + }, + { + "epoch": 2.382270780309144, + "grad_norm": 0.1407520263540408, + "learning_rate": 1.2369043069962139e-05, + "loss": 2.6629, + "step": 38376 + }, + { + "epoch": 2.382332857408902, + "grad_norm": 0.1422087312923414, + "learning_rate": 1.2366665138011946e-05, + "loss": 2.7184, + "step": 38377 + }, + { + "epoch": 2.38239493450866, + "grad_norm": 0.1408121135721606, + "learning_rate": 1.2364287402404012e-05, + "loss": 2.691, + "step": 38378 + }, + { + "epoch": 2.382457011608418, + "grad_norm": 0.14132532601987807, + "learning_rate": 1.2361909863150723e-05, + "loss": 2.6853, + "step": 38379 + }, + { + "epoch": 2.3825190887081757, + "grad_norm": 0.139492341145367, + "learning_rate": 1.2359532520264489e-05, + "loss": 2.6732, + "step": 38380 + }, + { + "epoch": 2.3825811658079337, + "grad_norm": 0.16061201027286282, + "learning_rate": 1.2357155373757712e-05, + "loss": 2.756, + "step": 38381 + }, + { + "epoch": 2.382643242907691, + "grad_norm": 0.1426009460370689, + "learning_rate": 1.2354778423642777e-05, + "loss": 2.7752, + "step": 38382 + }, + { + "epoch": 2.3827053200074495, + "grad_norm": 0.14599853054296744, + "learning_rate": 1.2352401669932117e-05, + "loss": 2.7065, + "step": 38383 + }, + { + "epoch": 2.382767397107207, + "grad_norm": 0.14306218788776207, + "learning_rate": 1.235002511263812e-05, + "loss": 2.7924, + "step": 38384 + }, + { + "epoch": 2.382829474206965, + "grad_norm": 0.1359257109232636, + "learning_rate": 1.2347648751773177e-05, + "loss": 2.8383, + "step": 38385 + }, + { + "epoch": 2.382891551306723, + "grad_norm": 0.1475069088045211, + "learning_rate": 1.2345272587349681e-05, + "loss": 2.7323, + "step": 38386 + }, + { + "epoch": 2.3829536284064807, + "grad_norm": 0.14046347136121315, + "learning_rate": 1.2342896619380035e-05, + "loss": 2.7754, + "step": 38387 + }, + { + "epoch": 2.3830157055062386, + "grad_norm": 0.1369391164018302, + "learning_rate": 1.2340520847876658e-05, + "loss": 2.6329, + "step": 38388 + }, + { + "epoch": 2.3830777826059966, + "grad_norm": 0.148570039203178, + "learning_rate": 1.2338145272851919e-05, + "loss": 2.7613, + "step": 38389 + }, + { + "epoch": 2.3831398597057545, + "grad_norm": 0.14196128535184774, + "learning_rate": 1.2335769894318228e-05, + "loss": 2.6969, + "step": 38390 + }, + { + "epoch": 2.3832019368055124, + "grad_norm": 0.1344186282303987, + "learning_rate": 1.2333394712287949e-05, + "loss": 2.7145, + "step": 38391 + }, + { + "epoch": 2.3832640139052703, + "grad_norm": 0.13858459304784504, + "learning_rate": 1.2331019726773508e-05, + "loss": 2.7258, + "step": 38392 + }, + { + "epoch": 2.3833260910050282, + "grad_norm": 0.14426316064154968, + "learning_rate": 1.2328644937787282e-05, + "loss": 2.6949, + "step": 38393 + }, + { + "epoch": 2.383388168104786, + "grad_norm": 0.1331385369201457, + "learning_rate": 1.2326270345341655e-05, + "loss": 2.6759, + "step": 38394 + }, + { + "epoch": 2.383450245204544, + "grad_norm": 0.14869702389566566, + "learning_rate": 1.2323895949449033e-05, + "loss": 2.7432, + "step": 38395 + }, + { + "epoch": 2.383512322304302, + "grad_norm": 0.13734598691765634, + "learning_rate": 1.2321521750121768e-05, + "loss": 2.7134, + "step": 38396 + }, + { + "epoch": 2.38357439940406, + "grad_norm": 0.13458886510027912, + "learning_rate": 1.231914774737229e-05, + "loss": 2.7493, + "step": 38397 + }, + { + "epoch": 2.383636476503818, + "grad_norm": 0.13455690585652594, + "learning_rate": 1.2316773941212962e-05, + "loss": 2.6791, + "step": 38398 + }, + { + "epoch": 2.3836985536035757, + "grad_norm": 0.14388288880925262, + "learning_rate": 1.2314400331656173e-05, + "loss": 2.77, + "step": 38399 + }, + { + "epoch": 2.3837606307033337, + "grad_norm": 0.15166522187703718, + "learning_rate": 1.2312026918714292e-05, + "loss": 2.731, + "step": 38400 + }, + { + "epoch": 2.3838227078030916, + "grad_norm": 0.1429187937523539, + "learning_rate": 1.2309653702399732e-05, + "loss": 2.6682, + "step": 38401 + }, + { + "epoch": 2.3838847849028495, + "grad_norm": 0.1419526319227541, + "learning_rate": 1.230728068272486e-05, + "loss": 2.7124, + "step": 38402 + }, + { + "epoch": 2.3839468620026074, + "grad_norm": 0.1473206111340324, + "learning_rate": 1.2304907859702053e-05, + "loss": 2.743, + "step": 38403 + }, + { + "epoch": 2.3840089391023653, + "grad_norm": 0.13812361719768393, + "learning_rate": 1.2302535233343688e-05, + "loss": 2.7356, + "step": 38404 + }, + { + "epoch": 2.384071016202123, + "grad_norm": 0.16770270258669842, + "learning_rate": 1.2300162803662135e-05, + "loss": 2.5817, + "step": 38405 + }, + { + "epoch": 2.384133093301881, + "grad_norm": 0.14000539574485743, + "learning_rate": 1.2297790570669804e-05, + "loss": 2.7395, + "step": 38406 + }, + { + "epoch": 2.3841951704016386, + "grad_norm": 0.14336906161311633, + "learning_rate": 1.229541853437905e-05, + "loss": 2.7606, + "step": 38407 + }, + { + "epoch": 2.3842572475013966, + "grad_norm": 0.14158069851511557, + "learning_rate": 1.2293046694802251e-05, + "loss": 2.7083, + "step": 38408 + }, + { + "epoch": 2.3843193246011545, + "grad_norm": 0.13421628083787798, + "learning_rate": 1.229067505195176e-05, + "loss": 2.703, + "step": 38409 + }, + { + "epoch": 2.3843814017009124, + "grad_norm": 0.16291280224998253, + "learning_rate": 1.2288303605839996e-05, + "loss": 2.7053, + "step": 38410 + }, + { + "epoch": 2.3844434788006703, + "grad_norm": 0.1416468251991677, + "learning_rate": 1.22859323564793e-05, + "loss": 2.6847, + "step": 38411 + }, + { + "epoch": 2.3845055559004282, + "grad_norm": 0.14059511502924513, + "learning_rate": 1.2283561303882052e-05, + "loss": 2.6945, + "step": 38412 + }, + { + "epoch": 2.384567633000186, + "grad_norm": 0.13841385410157717, + "learning_rate": 1.2281190448060608e-05, + "loss": 2.747, + "step": 38413 + }, + { + "epoch": 2.384629710099944, + "grad_norm": 0.1360428314344112, + "learning_rate": 1.2278819789027362e-05, + "loss": 2.6776, + "step": 38414 + }, + { + "epoch": 2.384691787199702, + "grad_norm": 0.14305627025332615, + "learning_rate": 1.2276449326794676e-05, + "loss": 2.7477, + "step": 38415 + }, + { + "epoch": 2.38475386429946, + "grad_norm": 0.14075688958965127, + "learning_rate": 1.2274079061374905e-05, + "loss": 2.6879, + "step": 38416 + }, + { + "epoch": 2.384815941399218, + "grad_norm": 0.13528580409954394, + "learning_rate": 1.2271708992780422e-05, + "loss": 2.6725, + "step": 38417 + }, + { + "epoch": 2.3848780184989757, + "grad_norm": 0.14032713152163484, + "learning_rate": 1.2269339121023583e-05, + "loss": 2.6675, + "step": 38418 + }, + { + "epoch": 2.3849400955987337, + "grad_norm": 0.1672292109179253, + "learning_rate": 1.2266969446116771e-05, + "loss": 2.7051, + "step": 38419 + }, + { + "epoch": 2.3850021726984916, + "grad_norm": 0.14196741263162516, + "learning_rate": 1.2264599968072327e-05, + "loss": 2.6946, + "step": 38420 + }, + { + "epoch": 2.3850642497982495, + "grad_norm": 0.1558412330987187, + "learning_rate": 1.2262230686902637e-05, + "loss": 2.6818, + "step": 38421 + }, + { + "epoch": 2.3851263268980074, + "grad_norm": 0.14018213552950173, + "learning_rate": 1.2259861602620055e-05, + "loss": 2.7704, + "step": 38422 + }, + { + "epoch": 2.3851884039977653, + "grad_norm": 0.14270490078963036, + "learning_rate": 1.2257492715236918e-05, + "loss": 2.6374, + "step": 38423 + }, + { + "epoch": 2.3852504810975232, + "grad_norm": 0.137554827509523, + "learning_rate": 1.2255124024765623e-05, + "loss": 2.7242, + "step": 38424 + }, + { + "epoch": 2.385312558197281, + "grad_norm": 0.14515301633051925, + "learning_rate": 1.2252755531218508e-05, + "loss": 2.696, + "step": 38425 + }, + { + "epoch": 2.385374635297039, + "grad_norm": 0.13756352062791297, + "learning_rate": 1.225038723460793e-05, + "loss": 2.7258, + "step": 38426 + }, + { + "epoch": 2.385436712396797, + "grad_norm": 0.1370196505349019, + "learning_rate": 1.2248019134946225e-05, + "loss": 2.5957, + "step": 38427 + }, + { + "epoch": 2.3854987894965545, + "grad_norm": 0.14125987295864653, + "learning_rate": 1.2245651232245792e-05, + "loss": 2.6293, + "step": 38428 + }, + { + "epoch": 2.385560866596313, + "grad_norm": 0.1603687841666991, + "learning_rate": 1.2243283526518957e-05, + "loss": 2.804, + "step": 38429 + }, + { + "epoch": 2.3856229436960703, + "grad_norm": 0.1329806819699252, + "learning_rate": 1.2240916017778082e-05, + "loss": 2.6473, + "step": 38430 + }, + { + "epoch": 2.3856850207958282, + "grad_norm": 0.1415177195712947, + "learning_rate": 1.2238548706035507e-05, + "loss": 2.7307, + "step": 38431 + }, + { + "epoch": 2.385747097895586, + "grad_norm": 0.1467596202004906, + "learning_rate": 1.223618159130358e-05, + "loss": 2.7053, + "step": 38432 + }, + { + "epoch": 2.385809174995344, + "grad_norm": 0.13831643098837795, + "learning_rate": 1.2233814673594679e-05, + "loss": 2.6679, + "step": 38433 + }, + { + "epoch": 2.385871252095102, + "grad_norm": 0.14533744028855802, + "learning_rate": 1.2231447952921132e-05, + "loss": 2.6623, + "step": 38434 + }, + { + "epoch": 2.38593332919486, + "grad_norm": 0.17460975753605731, + "learning_rate": 1.222908142929529e-05, + "loss": 2.7761, + "step": 38435 + }, + { + "epoch": 2.385995406294618, + "grad_norm": 0.14672588783995852, + "learning_rate": 1.2226715102729486e-05, + "loss": 2.6916, + "step": 38436 + }, + { + "epoch": 2.3860574833943757, + "grad_norm": 0.1362629239926007, + "learning_rate": 1.2224348973236088e-05, + "loss": 2.7074, + "step": 38437 + }, + { + "epoch": 2.3861195604941337, + "grad_norm": 0.1430924167358507, + "learning_rate": 1.222198304082744e-05, + "loss": 2.7127, + "step": 38438 + }, + { + "epoch": 2.3861816375938916, + "grad_norm": 0.13672528251300348, + "learning_rate": 1.2219617305515874e-05, + "loss": 2.7181, + "step": 38439 + }, + { + "epoch": 2.3862437146936495, + "grad_norm": 0.13890166786797734, + "learning_rate": 1.2217251767313732e-05, + "loss": 2.6658, + "step": 38440 + }, + { + "epoch": 2.3863057917934074, + "grad_norm": 0.14702071864632799, + "learning_rate": 1.2214886426233351e-05, + "loss": 2.7491, + "step": 38441 + }, + { + "epoch": 2.3863678688931653, + "grad_norm": 0.14828679942649114, + "learning_rate": 1.2212521282287092e-05, + "loss": 2.7172, + "step": 38442 + }, + { + "epoch": 2.3864299459929232, + "grad_norm": 0.13663777950938089, + "learning_rate": 1.2210156335487282e-05, + "loss": 2.781, + "step": 38443 + }, + { + "epoch": 2.386492023092681, + "grad_norm": 0.1480281166230634, + "learning_rate": 1.2207791585846263e-05, + "loss": 2.7361, + "step": 38444 + }, + { + "epoch": 2.386554100192439, + "grad_norm": 0.1572676539775798, + "learning_rate": 1.2205427033376348e-05, + "loss": 2.6867, + "step": 38445 + }, + { + "epoch": 2.386616177292197, + "grad_norm": 0.144763013236649, + "learning_rate": 1.2203062678089916e-05, + "loss": 2.6992, + "step": 38446 + }, + { + "epoch": 2.386678254391955, + "grad_norm": 0.1396643022715591, + "learning_rate": 1.2200698519999282e-05, + "loss": 2.64, + "step": 38447 + }, + { + "epoch": 2.386740331491713, + "grad_norm": 0.14284770771570104, + "learning_rate": 1.2198334559116782e-05, + "loss": 2.7437, + "step": 38448 + }, + { + "epoch": 2.3868024085914707, + "grad_norm": 0.13681059885599262, + "learning_rate": 1.2195970795454725e-05, + "loss": 2.6655, + "step": 38449 + }, + { + "epoch": 2.3868644856912287, + "grad_norm": 0.16968364019201623, + "learning_rate": 1.2193607229025483e-05, + "loss": 2.7335, + "step": 38450 + }, + { + "epoch": 2.3869265627909866, + "grad_norm": 0.14015484071318604, + "learning_rate": 1.2191243859841373e-05, + "loss": 2.696, + "step": 38451 + }, + { + "epoch": 2.3869886398907445, + "grad_norm": 0.14788364934011894, + "learning_rate": 1.2188880687914723e-05, + "loss": 2.71, + "step": 38452 + }, + { + "epoch": 2.387050716990502, + "grad_norm": 0.13087596361148018, + "learning_rate": 1.218651771325784e-05, + "loss": 2.65, + "step": 38453 + }, + { + "epoch": 2.3871127940902603, + "grad_norm": 0.14551179995609678, + "learning_rate": 1.218415493588309e-05, + "loss": 2.7258, + "step": 38454 + }, + { + "epoch": 2.387174871190018, + "grad_norm": 0.13645791649451422, + "learning_rate": 1.2181792355802773e-05, + "loss": 2.736, + "step": 38455 + }, + { + "epoch": 2.3872369482897757, + "grad_norm": 0.14093330812669785, + "learning_rate": 1.2179429973029244e-05, + "loss": 2.7421, + "step": 38456 + }, + { + "epoch": 2.3872990253895336, + "grad_norm": 0.1400655505934594, + "learning_rate": 1.2177067787574802e-05, + "loss": 2.6784, + "step": 38457 + }, + { + "epoch": 2.3873611024892916, + "grad_norm": 0.13501304912195883, + "learning_rate": 1.2174705799451785e-05, + "loss": 2.6997, + "step": 38458 + }, + { + "epoch": 2.3874231795890495, + "grad_norm": 0.14065319017142938, + "learning_rate": 1.2172344008672493e-05, + "loss": 2.7425, + "step": 38459 + }, + { + "epoch": 2.3874852566888074, + "grad_norm": 0.1640785413801626, + "learning_rate": 1.2169982415249276e-05, + "loss": 2.7279, + "step": 38460 + }, + { + "epoch": 2.3875473337885653, + "grad_norm": 0.15212629766819769, + "learning_rate": 1.2167621019194447e-05, + "loss": 2.687, + "step": 38461 + }, + { + "epoch": 2.3876094108883232, + "grad_norm": 0.13674680558218602, + "learning_rate": 1.2165259820520324e-05, + "loss": 2.7115, + "step": 38462 + }, + { + "epoch": 2.387671487988081, + "grad_norm": 0.14728915712056798, + "learning_rate": 1.2162898819239216e-05, + "loss": 2.7293, + "step": 38463 + }, + { + "epoch": 2.387733565087839, + "grad_norm": 0.20350313752389426, + "learning_rate": 1.216053801536346e-05, + "loss": 2.7637, + "step": 38464 + }, + { + "epoch": 2.387795642187597, + "grad_norm": 0.16650178633875742, + "learning_rate": 1.2158177408905364e-05, + "loss": 2.6917, + "step": 38465 + }, + { + "epoch": 2.387857719287355, + "grad_norm": 0.13671030972177117, + "learning_rate": 1.2155816999877239e-05, + "loss": 2.7248, + "step": 38466 + }, + { + "epoch": 2.387919796387113, + "grad_norm": 0.1344917181351953, + "learning_rate": 1.2153456788291407e-05, + "loss": 2.6514, + "step": 38467 + }, + { + "epoch": 2.3879818734868707, + "grad_norm": 0.14962006996313837, + "learning_rate": 1.2151096774160164e-05, + "loss": 2.7523, + "step": 38468 + }, + { + "epoch": 2.3880439505866287, + "grad_norm": 0.1408184297411975, + "learning_rate": 1.2148736957495848e-05, + "loss": 2.7791, + "step": 38469 + }, + { + "epoch": 2.3881060276863866, + "grad_norm": 0.1487302027478911, + "learning_rate": 1.2146377338310766e-05, + "loss": 2.7705, + "step": 38470 + }, + { + "epoch": 2.3881681047861445, + "grad_norm": 0.14868780414603106, + "learning_rate": 1.2144017916617218e-05, + "loss": 2.748, + "step": 38471 + }, + { + "epoch": 2.3882301818859024, + "grad_norm": 0.1350958275653983, + "learning_rate": 1.214165869242751e-05, + "loss": 2.7383, + "step": 38472 + }, + { + "epoch": 2.3882922589856603, + "grad_norm": 0.14833965967002433, + "learning_rate": 1.2139299665753968e-05, + "loss": 2.6731, + "step": 38473 + }, + { + "epoch": 2.3883543360854183, + "grad_norm": 0.13710230431433998, + "learning_rate": 1.213694083660889e-05, + "loss": 2.7413, + "step": 38474 + }, + { + "epoch": 2.388416413185176, + "grad_norm": 0.13581873430109181, + "learning_rate": 1.2134582205004585e-05, + "loss": 2.6743, + "step": 38475 + }, + { + "epoch": 2.3884784902849336, + "grad_norm": 0.13529495965818036, + "learning_rate": 1.2132223770953344e-05, + "loss": 2.7367, + "step": 38476 + }, + { + "epoch": 2.388540567384692, + "grad_norm": 0.17343865922592003, + "learning_rate": 1.21298655344675e-05, + "loss": 2.7201, + "step": 38477 + }, + { + "epoch": 2.3886026444844495, + "grad_norm": 0.13941644191448307, + "learning_rate": 1.2127507495559337e-05, + "loss": 2.64, + "step": 38478 + }, + { + "epoch": 2.3886647215842074, + "grad_norm": 0.16429318377184315, + "learning_rate": 1.2125149654241158e-05, + "loss": 2.7172, + "step": 38479 + }, + { + "epoch": 2.3887267986839653, + "grad_norm": 0.14291597862003885, + "learning_rate": 1.2122792010525274e-05, + "loss": 2.7393, + "step": 38480 + }, + { + "epoch": 2.3887888757837232, + "grad_norm": 0.1519342812864511, + "learning_rate": 1.2120434564423965e-05, + "loss": 2.6725, + "step": 38481 + }, + { + "epoch": 2.388850952883481, + "grad_norm": 0.14507623328196312, + "learning_rate": 1.2118077315949554e-05, + "loss": 2.7195, + "step": 38482 + }, + { + "epoch": 2.388913029983239, + "grad_norm": 0.1380678035478123, + "learning_rate": 1.2115720265114333e-05, + "loss": 2.6063, + "step": 38483 + }, + { + "epoch": 2.388975107082997, + "grad_norm": 0.13709383330332842, + "learning_rate": 1.2113363411930595e-05, + "loss": 2.7457, + "step": 38484 + }, + { + "epoch": 2.389037184182755, + "grad_norm": 0.13557163336717165, + "learning_rate": 1.2111006756410625e-05, + "loss": 2.7568, + "step": 38485 + }, + { + "epoch": 2.389099261282513, + "grad_norm": 0.13761599941644947, + "learning_rate": 1.2108650298566727e-05, + "loss": 2.7534, + "step": 38486 + }, + { + "epoch": 2.3891613383822707, + "grad_norm": 0.13497620177448477, + "learning_rate": 1.2106294038411214e-05, + "loss": 2.7106, + "step": 38487 + }, + { + "epoch": 2.3892234154820287, + "grad_norm": 0.14009550199576795, + "learning_rate": 1.210393797595637e-05, + "loss": 2.6817, + "step": 38488 + }, + { + "epoch": 2.3892854925817866, + "grad_norm": 0.13695774306689543, + "learning_rate": 1.2101582111214478e-05, + "loss": 2.6955, + "step": 38489 + }, + { + "epoch": 2.3893475696815445, + "grad_norm": 0.17751200882154913, + "learning_rate": 1.2099226444197825e-05, + "loss": 2.7363, + "step": 38490 + }, + { + "epoch": 2.3894096467813024, + "grad_norm": 0.14426091403348876, + "learning_rate": 1.20968709749187e-05, + "loss": 2.7093, + "step": 38491 + }, + { + "epoch": 2.3894717238810603, + "grad_norm": 0.1410212695294464, + "learning_rate": 1.2094515703389419e-05, + "loss": 2.791, + "step": 38492 + }, + { + "epoch": 2.3895338009808182, + "grad_norm": 0.13771317413490902, + "learning_rate": 1.2092160629622246e-05, + "loss": 2.6873, + "step": 38493 + }, + { + "epoch": 2.389595878080576, + "grad_norm": 0.15171133726497435, + "learning_rate": 1.2089805753629473e-05, + "loss": 2.7627, + "step": 38494 + }, + { + "epoch": 2.389657955180334, + "grad_norm": 0.1854527284793277, + "learning_rate": 1.2087451075423373e-05, + "loss": 2.762, + "step": 38495 + }, + { + "epoch": 2.389720032280092, + "grad_norm": 0.14176634047277348, + "learning_rate": 1.2085096595016259e-05, + "loss": 2.6976, + "step": 38496 + }, + { + "epoch": 2.38978210937985, + "grad_norm": 0.14456300773193348, + "learning_rate": 1.2082742312420403e-05, + "loss": 2.6387, + "step": 38497 + }, + { + "epoch": 2.389844186479608, + "grad_norm": 0.14605640836825703, + "learning_rate": 1.2080388227648081e-05, + "loss": 2.7521, + "step": 38498 + }, + { + "epoch": 2.3899062635793658, + "grad_norm": 0.13312489570022581, + "learning_rate": 1.207803434071157e-05, + "loss": 2.7089, + "step": 38499 + }, + { + "epoch": 2.3899683406791237, + "grad_norm": 0.14007979243616328, + "learning_rate": 1.2075680651623167e-05, + "loss": 2.6903, + "step": 38500 + }, + { + "epoch": 2.390030417778881, + "grad_norm": 0.1429555482970657, + "learning_rate": 1.2073327160395149e-05, + "loss": 2.7426, + "step": 38501 + }, + { + "epoch": 2.3900924948786395, + "grad_norm": 0.14796017773015874, + "learning_rate": 1.2070973867039787e-05, + "loss": 2.7509, + "step": 38502 + }, + { + "epoch": 2.390154571978397, + "grad_norm": 0.15137082372420171, + "learning_rate": 1.2068620771569367e-05, + "loss": 2.5816, + "step": 38503 + }, + { + "epoch": 2.390216649078155, + "grad_norm": 0.13834317970373755, + "learning_rate": 1.206626787399614e-05, + "loss": 2.7614, + "step": 38504 + }, + { + "epoch": 2.390278726177913, + "grad_norm": 0.14641867753843402, + "learning_rate": 1.2063915174332419e-05, + "loss": 2.7544, + "step": 38505 + }, + { + "epoch": 2.3903408032776707, + "grad_norm": 0.13855311183931532, + "learning_rate": 1.2061562672590465e-05, + "loss": 2.7073, + "step": 38506 + }, + { + "epoch": 2.3904028803774287, + "grad_norm": 0.1575832644784277, + "learning_rate": 1.2059210368782548e-05, + "loss": 2.7648, + "step": 38507 + }, + { + "epoch": 2.3904649574771866, + "grad_norm": 0.156751156809902, + "learning_rate": 1.2056858262920922e-05, + "loss": 2.6423, + "step": 38508 + }, + { + "epoch": 2.3905270345769445, + "grad_norm": 0.1423551566602547, + "learning_rate": 1.2054506355017898e-05, + "loss": 2.6776, + "step": 38509 + }, + { + "epoch": 2.3905891116767024, + "grad_norm": 0.15099000389796735, + "learning_rate": 1.2052154645085717e-05, + "loss": 2.7425, + "step": 38510 + }, + { + "epoch": 2.3906511887764603, + "grad_norm": 0.15413621795626306, + "learning_rate": 1.2049803133136667e-05, + "loss": 2.7814, + "step": 38511 + }, + { + "epoch": 2.3907132658762182, + "grad_norm": 0.1617823715193193, + "learning_rate": 1.2047451819182987e-05, + "loss": 2.6364, + "step": 38512 + }, + { + "epoch": 2.390775342975976, + "grad_norm": 0.14236587899902575, + "learning_rate": 1.2045100703236978e-05, + "loss": 2.7148, + "step": 38513 + }, + { + "epoch": 2.390837420075734, + "grad_norm": 0.14849172404858108, + "learning_rate": 1.2042749785310898e-05, + "loss": 2.7233, + "step": 38514 + }, + { + "epoch": 2.390899497175492, + "grad_norm": 0.14468973723467718, + "learning_rate": 1.2040399065417001e-05, + "loss": 2.6634, + "step": 38515 + }, + { + "epoch": 2.39096157427525, + "grad_norm": 0.14348844050617512, + "learning_rate": 1.203804854356756e-05, + "loss": 2.7417, + "step": 38516 + }, + { + "epoch": 2.391023651375008, + "grad_norm": 0.1396361525674011, + "learning_rate": 1.2035698219774816e-05, + "loss": 2.7593, + "step": 38517 + }, + { + "epoch": 2.3910857284747657, + "grad_norm": 0.13724863365149856, + "learning_rate": 1.2033348094051056e-05, + "loss": 2.7816, + "step": 38518 + }, + { + "epoch": 2.3911478055745237, + "grad_norm": 0.1436366217208999, + "learning_rate": 1.2030998166408547e-05, + "loss": 2.6632, + "step": 38519 + }, + { + "epoch": 2.3912098826742816, + "grad_norm": 0.1525413136920563, + "learning_rate": 1.2028648436859541e-05, + "loss": 2.6418, + "step": 38520 + }, + { + "epoch": 2.3912719597740395, + "grad_norm": 0.14054038230526084, + "learning_rate": 1.2026298905416294e-05, + "loss": 2.7532, + "step": 38521 + }, + { + "epoch": 2.3913340368737974, + "grad_norm": 0.15721848863994337, + "learning_rate": 1.2023949572091048e-05, + "loss": 2.7188, + "step": 38522 + }, + { + "epoch": 2.3913961139735553, + "grad_norm": 0.1432841989968768, + "learning_rate": 1.2021600436896091e-05, + "loss": 2.7709, + "step": 38523 + }, + { + "epoch": 2.391458191073313, + "grad_norm": 0.18559931967567955, + "learning_rate": 1.2019251499843664e-05, + "loss": 2.6666, + "step": 38524 + }, + { + "epoch": 2.391520268173071, + "grad_norm": 0.16102723795599053, + "learning_rate": 1.201690276094602e-05, + "loss": 2.7186, + "step": 38525 + }, + { + "epoch": 2.3915823452728286, + "grad_norm": 0.14936387085455924, + "learning_rate": 1.2014554220215418e-05, + "loss": 2.6337, + "step": 38526 + }, + { + "epoch": 2.3916444223725866, + "grad_norm": 0.15651232038049967, + "learning_rate": 1.201220587766409e-05, + "loss": 2.659, + "step": 38527 + }, + { + "epoch": 2.3917064994723445, + "grad_norm": 0.15405766740690344, + "learning_rate": 1.2009857733304324e-05, + "loss": 2.6997, + "step": 38528 + }, + { + "epoch": 2.3917685765721024, + "grad_norm": 0.13439489594365564, + "learning_rate": 1.2007509787148353e-05, + "loss": 2.68, + "step": 38529 + }, + { + "epoch": 2.3918306536718603, + "grad_norm": 0.1386624588254384, + "learning_rate": 1.2005162039208423e-05, + "loss": 2.6462, + "step": 38530 + }, + { + "epoch": 2.3918927307716182, + "grad_norm": 0.13573961461325207, + "learning_rate": 1.2002814489496772e-05, + "loss": 2.6225, + "step": 38531 + }, + { + "epoch": 2.391954807871376, + "grad_norm": 0.14904207785480966, + "learning_rate": 1.2000467138025678e-05, + "loss": 2.6896, + "step": 38532 + }, + { + "epoch": 2.392016884971134, + "grad_norm": 0.14508618322660463, + "learning_rate": 1.199811998480737e-05, + "loss": 2.787, + "step": 38533 + }, + { + "epoch": 2.392078962070892, + "grad_norm": 0.13898881771112448, + "learning_rate": 1.1995773029854102e-05, + "loss": 2.7837, + "step": 38534 + }, + { + "epoch": 2.39214103917065, + "grad_norm": 0.1436725444062462, + "learning_rate": 1.1993426273178087e-05, + "loss": 2.6917, + "step": 38535 + }, + { + "epoch": 2.392203116270408, + "grad_norm": 0.13618269462081828, + "learning_rate": 1.1991079714791614e-05, + "loss": 2.6963, + "step": 38536 + }, + { + "epoch": 2.3922651933701657, + "grad_norm": 0.13879969575723908, + "learning_rate": 1.1988733354706905e-05, + "loss": 2.7101, + "step": 38537 + }, + { + "epoch": 2.3923272704699237, + "grad_norm": 0.1384298065407213, + "learning_rate": 1.1986387192936205e-05, + "loss": 2.7324, + "step": 38538 + }, + { + "epoch": 2.3923893475696816, + "grad_norm": 0.1415259915973465, + "learning_rate": 1.1984041229491744e-05, + "loss": 2.7483, + "step": 38539 + }, + { + "epoch": 2.3924514246694395, + "grad_norm": 0.13818521801967823, + "learning_rate": 1.1981695464385756e-05, + "loss": 2.7564, + "step": 38540 + }, + { + "epoch": 2.3925135017691974, + "grad_norm": 0.15119968498576553, + "learning_rate": 1.197934989763051e-05, + "loss": 2.701, + "step": 38541 + }, + { + "epoch": 2.3925755788689553, + "grad_norm": 0.1349823149492875, + "learning_rate": 1.1977004529238223e-05, + "loss": 2.6254, + "step": 38542 + }, + { + "epoch": 2.3926376559687133, + "grad_norm": 0.1524862149038949, + "learning_rate": 1.1974659359221135e-05, + "loss": 2.7046, + "step": 38543 + }, + { + "epoch": 2.392699733068471, + "grad_norm": 0.14765335943026664, + "learning_rate": 1.1972314387591466e-05, + "loss": 2.6659, + "step": 38544 + }, + { + "epoch": 2.392761810168229, + "grad_norm": 0.14920229022133796, + "learning_rate": 1.1969969614361476e-05, + "loss": 2.658, + "step": 38545 + }, + { + "epoch": 2.392823887267987, + "grad_norm": 0.13718966716974987, + "learning_rate": 1.196762503954339e-05, + "loss": 2.6899, + "step": 38546 + }, + { + "epoch": 2.392885964367745, + "grad_norm": 0.14655100260863063, + "learning_rate": 1.196528066314943e-05, + "loss": 2.7644, + "step": 38547 + }, + { + "epoch": 2.392948041467503, + "grad_norm": 0.1376808578263106, + "learning_rate": 1.1962936485191828e-05, + "loss": 2.7652, + "step": 38548 + }, + { + "epoch": 2.3930101185672603, + "grad_norm": 0.1618770347655119, + "learning_rate": 1.1960592505682832e-05, + "loss": 2.7302, + "step": 38549 + }, + { + "epoch": 2.3930721956670187, + "grad_norm": 0.1416766723846275, + "learning_rate": 1.1958248724634658e-05, + "loss": 2.6431, + "step": 38550 + }, + { + "epoch": 2.393134272766776, + "grad_norm": 0.14519426627163187, + "learning_rate": 1.195590514205952e-05, + "loss": 2.6474, + "step": 38551 + }, + { + "epoch": 2.393196349866534, + "grad_norm": 0.15175630944291973, + "learning_rate": 1.1953561757969678e-05, + "loss": 2.7712, + "step": 38552 + }, + { + "epoch": 2.393258426966292, + "grad_norm": 0.1353277424911627, + "learning_rate": 1.1951218572377338e-05, + "loss": 2.6489, + "step": 38553 + }, + { + "epoch": 2.39332050406605, + "grad_norm": 0.1573019972202429, + "learning_rate": 1.1948875585294711e-05, + "loss": 2.7124, + "step": 38554 + }, + { + "epoch": 2.393382581165808, + "grad_norm": 0.143917630508763, + "learning_rate": 1.1946532796734056e-05, + "loss": 2.7192, + "step": 38555 + }, + { + "epoch": 2.3934446582655657, + "grad_norm": 0.13103434266903793, + "learning_rate": 1.1944190206707573e-05, + "loss": 2.6436, + "step": 38556 + }, + { + "epoch": 2.3935067353653237, + "grad_norm": 0.13147584693709888, + "learning_rate": 1.1941847815227492e-05, + "loss": 2.7095, + "step": 38557 + }, + { + "epoch": 2.3935688124650816, + "grad_norm": 0.1425721611447844, + "learning_rate": 1.1939505622306013e-05, + "loss": 2.6873, + "step": 38558 + }, + { + "epoch": 2.3936308895648395, + "grad_norm": 0.1419561681489478, + "learning_rate": 1.1937163627955389e-05, + "loss": 2.6872, + "step": 38559 + }, + { + "epoch": 2.3936929666645974, + "grad_norm": 0.1433090391843339, + "learning_rate": 1.193482183218782e-05, + "loss": 2.6245, + "step": 38560 + }, + { + "epoch": 2.3937550437643553, + "grad_norm": 0.15332335572350816, + "learning_rate": 1.193248023501553e-05, + "loss": 2.7202, + "step": 38561 + }, + { + "epoch": 2.3938171208641132, + "grad_norm": 0.14628438649261136, + "learning_rate": 1.1930138836450716e-05, + "loss": 2.7012, + "step": 38562 + }, + { + "epoch": 2.393879197963871, + "grad_norm": 0.13904432194015537, + "learning_rate": 1.1927797636505622e-05, + "loss": 2.7426, + "step": 38563 + }, + { + "epoch": 2.393941275063629, + "grad_norm": 0.13563947427879142, + "learning_rate": 1.1925456635192451e-05, + "loss": 2.7486, + "step": 38564 + }, + { + "epoch": 2.394003352163387, + "grad_norm": 0.14783223251306898, + "learning_rate": 1.1923115832523419e-05, + "loss": 2.6972, + "step": 38565 + }, + { + "epoch": 2.394065429263145, + "grad_norm": 0.13446385694870377, + "learning_rate": 1.1920775228510728e-05, + "loss": 2.667, + "step": 38566 + }, + { + "epoch": 2.394127506362903, + "grad_norm": 0.17067740724567979, + "learning_rate": 1.1918434823166585e-05, + "loss": 2.7039, + "step": 38567 + }, + { + "epoch": 2.3941895834626608, + "grad_norm": 0.14712270256969553, + "learning_rate": 1.1916094616503232e-05, + "loss": 2.7692, + "step": 38568 + }, + { + "epoch": 2.3942516605624187, + "grad_norm": 0.1510469124694222, + "learning_rate": 1.191375460853285e-05, + "loss": 2.7444, + "step": 38569 + }, + { + "epoch": 2.3943137376621766, + "grad_norm": 0.3771249251145898, + "learning_rate": 1.1911414799267662e-05, + "loss": 2.6568, + "step": 38570 + }, + { + "epoch": 2.3943758147619345, + "grad_norm": 0.14702560256871178, + "learning_rate": 1.1909075188719853e-05, + "loss": 2.6851, + "step": 38571 + }, + { + "epoch": 2.394437891861692, + "grad_norm": 0.1392253856704454, + "learning_rate": 1.1906735776901662e-05, + "loss": 2.7679, + "step": 38572 + }, + { + "epoch": 2.3944999689614503, + "grad_norm": 0.14144550142272957, + "learning_rate": 1.1904396563825277e-05, + "loss": 2.8198, + "step": 38573 + }, + { + "epoch": 2.394562046061208, + "grad_norm": 0.1516611137286222, + "learning_rate": 1.1902057549502899e-05, + "loss": 2.7827, + "step": 38574 + }, + { + "epoch": 2.3946241231609657, + "grad_norm": 0.13839802768053344, + "learning_rate": 1.189971873394674e-05, + "loss": 2.7126, + "step": 38575 + }, + { + "epoch": 2.3946862002607237, + "grad_norm": 0.13615558986490656, + "learning_rate": 1.1897380117168983e-05, + "loss": 2.7558, + "step": 38576 + }, + { + "epoch": 2.3947482773604816, + "grad_norm": 0.14063323844513304, + "learning_rate": 1.1895041699181853e-05, + "loss": 2.65, + "step": 38577 + }, + { + "epoch": 2.3948103544602395, + "grad_norm": 0.13862537894012283, + "learning_rate": 1.1892703479997547e-05, + "loss": 2.6992, + "step": 38578 + }, + { + "epoch": 2.3948724315599974, + "grad_norm": 0.13313440972415416, + "learning_rate": 1.1890365459628256e-05, + "loss": 2.6826, + "step": 38579 + }, + { + "epoch": 2.3949345086597553, + "grad_norm": 0.13921626783442323, + "learning_rate": 1.1888027638086163e-05, + "loss": 2.7331, + "step": 38580 + }, + { + "epoch": 2.3949965857595132, + "grad_norm": 0.14927093656815418, + "learning_rate": 1.1885690015383499e-05, + "loss": 2.6841, + "step": 38581 + }, + { + "epoch": 2.395058662859271, + "grad_norm": 0.14757542239653276, + "learning_rate": 1.1883352591532442e-05, + "loss": 2.8144, + "step": 38582 + }, + { + "epoch": 2.395120739959029, + "grad_norm": 0.2545653957368018, + "learning_rate": 1.188101536654519e-05, + "loss": 2.7224, + "step": 38583 + }, + { + "epoch": 2.395182817058787, + "grad_norm": 0.1381258355197406, + "learning_rate": 1.1878678340433914e-05, + "loss": 2.7409, + "step": 38584 + }, + { + "epoch": 2.395244894158545, + "grad_norm": 0.13819782062527275, + "learning_rate": 1.1876341513210831e-05, + "loss": 2.5326, + "step": 38585 + }, + { + "epoch": 2.395306971258303, + "grad_norm": 0.13809041227781715, + "learning_rate": 1.1874004884888145e-05, + "loss": 2.7273, + "step": 38586 + }, + { + "epoch": 2.3953690483580607, + "grad_norm": 0.13565128356297848, + "learning_rate": 1.187166845547803e-05, + "loss": 2.6253, + "step": 38587 + }, + { + "epoch": 2.3954311254578187, + "grad_norm": 0.1366328787202842, + "learning_rate": 1.1869332224992674e-05, + "loss": 2.7296, + "step": 38588 + }, + { + "epoch": 2.3954932025575766, + "grad_norm": 0.1410357318291134, + "learning_rate": 1.1866996193444274e-05, + "loss": 2.6302, + "step": 38589 + }, + { + "epoch": 2.3955552796573345, + "grad_norm": 0.13540263210220035, + "learning_rate": 1.1864660360844993e-05, + "loss": 2.7959, + "step": 38590 + }, + { + "epoch": 2.3956173567570924, + "grad_norm": 0.14918127812738452, + "learning_rate": 1.1862324727207052e-05, + "loss": 2.7946, + "step": 38591 + }, + { + "epoch": 2.3956794338568503, + "grad_norm": 0.1584915839756473, + "learning_rate": 1.1859989292542617e-05, + "loss": 2.6302, + "step": 38592 + }, + { + "epoch": 2.3957415109566083, + "grad_norm": 0.1638188937728511, + "learning_rate": 1.1857654056863882e-05, + "loss": 2.6708, + "step": 38593 + }, + { + "epoch": 2.395803588056366, + "grad_norm": 0.13901787561586607, + "learning_rate": 1.185531902018301e-05, + "loss": 2.7964, + "step": 38594 + }, + { + "epoch": 2.395865665156124, + "grad_norm": 0.138852136603776, + "learning_rate": 1.1852984182512211e-05, + "loss": 2.6849, + "step": 38595 + }, + { + "epoch": 2.395927742255882, + "grad_norm": 0.13989510126151947, + "learning_rate": 1.1850649543863657e-05, + "loss": 2.5471, + "step": 38596 + }, + { + "epoch": 2.3959898193556395, + "grad_norm": 0.157341855586068, + "learning_rate": 1.1848315104249524e-05, + "loss": 2.6517, + "step": 38597 + }, + { + "epoch": 2.396051896455398, + "grad_norm": 0.16282180137204308, + "learning_rate": 1.1845980863681983e-05, + "loss": 2.7406, + "step": 38598 + }, + { + "epoch": 2.3961139735551553, + "grad_norm": 0.14463534987968613, + "learning_rate": 1.1843646822173227e-05, + "loss": 2.7606, + "step": 38599 + }, + { + "epoch": 2.3961760506549132, + "grad_norm": 0.1382470339130104, + "learning_rate": 1.1841312979735437e-05, + "loss": 2.6783, + "step": 38600 + }, + { + "epoch": 2.396238127754671, + "grad_norm": 0.1408720495495529, + "learning_rate": 1.1838979336380774e-05, + "loss": 2.6898, + "step": 38601 + }, + { + "epoch": 2.396300204854429, + "grad_norm": 0.15323923482002355, + "learning_rate": 1.1836645892121423e-05, + "loss": 2.6644, + "step": 38602 + }, + { + "epoch": 2.396362281954187, + "grad_norm": 0.14444890769673918, + "learning_rate": 1.1834312646969543e-05, + "loss": 2.6618, + "step": 38603 + }, + { + "epoch": 2.396424359053945, + "grad_norm": 0.14640000138642684, + "learning_rate": 1.1831979600937327e-05, + "loss": 2.6774, + "step": 38604 + }, + { + "epoch": 2.396486436153703, + "grad_norm": 0.14433769204359923, + "learning_rate": 1.1829646754036944e-05, + "loss": 2.7387, + "step": 38605 + }, + { + "epoch": 2.3965485132534607, + "grad_norm": 0.14408802685512637, + "learning_rate": 1.1827314106280557e-05, + "loss": 2.7453, + "step": 38606 + }, + { + "epoch": 2.3966105903532187, + "grad_norm": 0.1575563715253975, + "learning_rate": 1.1824981657680323e-05, + "loss": 2.7916, + "step": 38607 + }, + { + "epoch": 2.3966726674529766, + "grad_norm": 0.14042540133836404, + "learning_rate": 1.1822649408248449e-05, + "loss": 2.7248, + "step": 38608 + }, + { + "epoch": 2.3967347445527345, + "grad_norm": 0.149706460910614, + "learning_rate": 1.1820317357997073e-05, + "loss": 2.6228, + "step": 38609 + }, + { + "epoch": 2.3967968216524924, + "grad_norm": 0.15167343784005355, + "learning_rate": 1.1817985506938374e-05, + "loss": 2.6234, + "step": 38610 + }, + { + "epoch": 2.3968588987522503, + "grad_norm": 0.1440880203915595, + "learning_rate": 1.1815653855084513e-05, + "loss": 2.7328, + "step": 38611 + }, + { + "epoch": 2.3969209758520083, + "grad_norm": 0.14495529010640582, + "learning_rate": 1.1813322402447641e-05, + "loss": 2.6438, + "step": 38612 + }, + { + "epoch": 2.396983052951766, + "grad_norm": 0.13569676030177452, + "learning_rate": 1.1810991149039951e-05, + "loss": 2.7374, + "step": 38613 + }, + { + "epoch": 2.397045130051524, + "grad_norm": 0.15504135925944582, + "learning_rate": 1.1808660094873585e-05, + "loss": 2.749, + "step": 38614 + }, + { + "epoch": 2.397107207151282, + "grad_norm": 0.1459621083690861, + "learning_rate": 1.1806329239960717e-05, + "loss": 2.6794, + "step": 38615 + }, + { + "epoch": 2.39716928425104, + "grad_norm": 0.17062073078683276, + "learning_rate": 1.1803998584313485e-05, + "loss": 2.7457, + "step": 38616 + }, + { + "epoch": 2.397231361350798, + "grad_norm": 0.15549683844282394, + "learning_rate": 1.1801668127944066e-05, + "loss": 2.6234, + "step": 38617 + }, + { + "epoch": 2.3972934384505558, + "grad_norm": 0.13681252533270688, + "learning_rate": 1.1799337870864634e-05, + "loss": 2.6551, + "step": 38618 + }, + { + "epoch": 2.3973555155503137, + "grad_norm": 0.13694163954862074, + "learning_rate": 1.1797007813087324e-05, + "loss": 2.738, + "step": 38619 + }, + { + "epoch": 2.397417592650071, + "grad_norm": 0.1393917699419816, + "learning_rate": 1.17946779546243e-05, + "loss": 2.6914, + "step": 38620 + }, + { + "epoch": 2.3974796697498295, + "grad_norm": 0.1373879268541387, + "learning_rate": 1.1792348295487705e-05, + "loss": 2.6308, + "step": 38621 + }, + { + "epoch": 2.397541746849587, + "grad_norm": 0.16355307143428585, + "learning_rate": 1.1790018835689714e-05, + "loss": 2.6736, + "step": 38622 + }, + { + "epoch": 2.397603823949345, + "grad_norm": 0.14088379041634536, + "learning_rate": 1.1787689575242477e-05, + "loss": 2.748, + "step": 38623 + }, + { + "epoch": 2.397665901049103, + "grad_norm": 0.1381774671834174, + "learning_rate": 1.1785360514158133e-05, + "loss": 2.7201, + "step": 38624 + }, + { + "epoch": 2.3977279781488607, + "grad_norm": 0.13749830884326955, + "learning_rate": 1.1783031652448844e-05, + "loss": 2.6283, + "step": 38625 + }, + { + "epoch": 2.3977900552486187, + "grad_norm": 0.14561032518031178, + "learning_rate": 1.178070299012674e-05, + "loss": 2.6808, + "step": 38626 + }, + { + "epoch": 2.3978521323483766, + "grad_norm": 0.17020862872212092, + "learning_rate": 1.1778374527204e-05, + "loss": 2.6837, + "step": 38627 + }, + { + "epoch": 2.3979142094481345, + "grad_norm": 0.13542570640273394, + "learning_rate": 1.1776046263692764e-05, + "loss": 2.6135, + "step": 38628 + }, + { + "epoch": 2.3979762865478924, + "grad_norm": 0.13951432517376797, + "learning_rate": 1.1773718199605165e-05, + "loss": 2.7156, + "step": 38629 + }, + { + "epoch": 2.3980383636476503, + "grad_norm": 0.15344729905002782, + "learning_rate": 1.1771390334953353e-05, + "loss": 2.821, + "step": 38630 + }, + { + "epoch": 2.3981004407474082, + "grad_norm": 0.15994961131407295, + "learning_rate": 1.1769062669749487e-05, + "loss": 2.7042, + "step": 38631 + }, + { + "epoch": 2.398162517847166, + "grad_norm": 0.140669808931145, + "learning_rate": 1.1766735204005703e-05, + "loss": 2.7756, + "step": 38632 + }, + { + "epoch": 2.398224594946924, + "grad_norm": 0.16465917577151393, + "learning_rate": 1.176440793773414e-05, + "loss": 2.7891, + "step": 38633 + }, + { + "epoch": 2.398286672046682, + "grad_norm": 0.15212618747152115, + "learning_rate": 1.1762080870946934e-05, + "loss": 2.7687, + "step": 38634 + }, + { + "epoch": 2.39834874914644, + "grad_norm": 0.14217042113006315, + "learning_rate": 1.1759754003656243e-05, + "loss": 2.7039, + "step": 38635 + }, + { + "epoch": 2.398410826246198, + "grad_norm": 0.1407631089930652, + "learning_rate": 1.1757427335874204e-05, + "loss": 2.688, + "step": 38636 + }, + { + "epoch": 2.3984729033459558, + "grad_norm": 0.17122217305537557, + "learning_rate": 1.175510086761295e-05, + "loss": 2.6596, + "step": 38637 + }, + { + "epoch": 2.3985349804457137, + "grad_norm": 0.13748127961600434, + "learning_rate": 1.1752774598884614e-05, + "loss": 2.6675, + "step": 38638 + }, + { + "epoch": 2.3985970575454716, + "grad_norm": 0.13730003897122164, + "learning_rate": 1.175044852970133e-05, + "loss": 2.6534, + "step": 38639 + }, + { + "epoch": 2.3986591346452295, + "grad_norm": 0.18351443586293073, + "learning_rate": 1.1748122660075251e-05, + "loss": 2.7185, + "step": 38640 + }, + { + "epoch": 2.3987212117449874, + "grad_norm": 0.1500812543893244, + "learning_rate": 1.1745796990018503e-05, + "loss": 2.7133, + "step": 38641 + }, + { + "epoch": 2.3987832888447453, + "grad_norm": 0.13317525633641733, + "learning_rate": 1.174347151954322e-05, + "loss": 2.7415, + "step": 38642 + }, + { + "epoch": 2.3988453659445033, + "grad_norm": 0.1627574112932571, + "learning_rate": 1.1741146248661517e-05, + "loss": 2.715, + "step": 38643 + }, + { + "epoch": 2.398907443044261, + "grad_norm": 0.15245286994952625, + "learning_rate": 1.1738821177385556e-05, + "loss": 2.8608, + "step": 38644 + }, + { + "epoch": 2.3989695201440187, + "grad_norm": 0.13367823026610914, + "learning_rate": 1.1736496305727451e-05, + "loss": 2.6284, + "step": 38645 + }, + { + "epoch": 2.399031597243777, + "grad_norm": 0.13716202808993538, + "learning_rate": 1.1734171633699337e-05, + "loss": 2.6888, + "step": 38646 + }, + { + "epoch": 2.3990936743435345, + "grad_norm": 0.13950588045877524, + "learning_rate": 1.1731847161313325e-05, + "loss": 2.7153, + "step": 38647 + }, + { + "epoch": 2.3991557514432924, + "grad_norm": 0.16471594154107336, + "learning_rate": 1.1729522888581574e-05, + "loss": 2.7119, + "step": 38648 + }, + { + "epoch": 2.3992178285430503, + "grad_norm": 0.1549484582832765, + "learning_rate": 1.1727198815516172e-05, + "loss": 2.6914, + "step": 38649 + }, + { + "epoch": 2.3992799056428082, + "grad_norm": 0.13765603716051186, + "learning_rate": 1.1724874942129283e-05, + "loss": 2.6029, + "step": 38650 + }, + { + "epoch": 2.399341982742566, + "grad_norm": 0.1514939792336867, + "learning_rate": 1.1722551268433008e-05, + "loss": 2.7037, + "step": 38651 + }, + { + "epoch": 2.399404059842324, + "grad_norm": 0.14199435246849085, + "learning_rate": 1.1720227794439481e-05, + "loss": 2.6919, + "step": 38652 + }, + { + "epoch": 2.399466136942082, + "grad_norm": 0.14414894602051523, + "learning_rate": 1.17179045201608e-05, + "loss": 2.7487, + "step": 38653 + }, + { + "epoch": 2.39952821404184, + "grad_norm": 0.14244456109994388, + "learning_rate": 1.1715581445609119e-05, + "loss": 2.6622, + "step": 38654 + }, + { + "epoch": 2.399590291141598, + "grad_norm": 0.14105841946266923, + "learning_rate": 1.1713258570796542e-05, + "loss": 2.6239, + "step": 38655 + }, + { + "epoch": 2.3996523682413557, + "grad_norm": 0.13703680508922886, + "learning_rate": 1.1710935895735193e-05, + "loss": 2.7011, + "step": 38656 + }, + { + "epoch": 2.3997144453411137, + "grad_norm": 0.14050956465153874, + "learning_rate": 1.1708613420437176e-05, + "loss": 2.7199, + "step": 38657 + }, + { + "epoch": 2.3997765224408716, + "grad_norm": 0.1469307392963217, + "learning_rate": 1.1706291144914627e-05, + "loss": 2.7222, + "step": 38658 + }, + { + "epoch": 2.3998385995406295, + "grad_norm": 0.1331257491918983, + "learning_rate": 1.1703969069179654e-05, + "loss": 2.6277, + "step": 38659 + }, + { + "epoch": 2.3999006766403874, + "grad_norm": 0.14029763449911414, + "learning_rate": 1.170164719324437e-05, + "loss": 2.6924, + "step": 38660 + }, + { + "epoch": 2.3999627537401453, + "grad_norm": 0.14335523750092977, + "learning_rate": 1.169932551712089e-05, + "loss": 2.7276, + "step": 38661 + }, + { + "epoch": 2.4000248308399033, + "grad_norm": 0.17921492380449605, + "learning_rate": 1.1697004040821314e-05, + "loss": 2.7156, + "step": 38662 + }, + { + "epoch": 2.400086907939661, + "grad_norm": 0.1435524752180427, + "learning_rate": 1.1694682764357779e-05, + "loss": 2.661, + "step": 38663 + }, + { + "epoch": 2.400148985039419, + "grad_norm": 0.1392381483000582, + "learning_rate": 1.1692361687742381e-05, + "loss": 2.6694, + "step": 38664 + }, + { + "epoch": 2.400211062139177, + "grad_norm": 0.15288254631230228, + "learning_rate": 1.1690040810987236e-05, + "loss": 2.7358, + "step": 38665 + }, + { + "epoch": 2.400273139238935, + "grad_norm": 0.1382350623587043, + "learning_rate": 1.1687720134104434e-05, + "loss": 2.7112, + "step": 38666 + }, + { + "epoch": 2.400335216338693, + "grad_norm": 0.17045836873354467, + "learning_rate": 1.1685399657106111e-05, + "loss": 2.7773, + "step": 38667 + }, + { + "epoch": 2.4003972934384503, + "grad_norm": 0.1352459736505005, + "learning_rate": 1.1683079380004353e-05, + "loss": 2.7117, + "step": 38668 + }, + { + "epoch": 2.4004593705382087, + "grad_norm": 0.14071706566172137, + "learning_rate": 1.1680759302811278e-05, + "loss": 2.7046, + "step": 38669 + }, + { + "epoch": 2.400521447637966, + "grad_norm": 0.15420843559487818, + "learning_rate": 1.1678439425538968e-05, + "loss": 2.7471, + "step": 38670 + }, + { + "epoch": 2.400583524737724, + "grad_norm": 0.14104407834447433, + "learning_rate": 1.1676119748199554e-05, + "loss": 2.8222, + "step": 38671 + }, + { + "epoch": 2.400645601837482, + "grad_norm": 0.13794180943999956, + "learning_rate": 1.167380027080513e-05, + "loss": 2.7352, + "step": 38672 + }, + { + "epoch": 2.40070767893724, + "grad_norm": 0.1391646601968684, + "learning_rate": 1.1671480993367794e-05, + "loss": 2.7618, + "step": 38673 + }, + { + "epoch": 2.400769756036998, + "grad_norm": 0.13700266377877007, + "learning_rate": 1.1669161915899646e-05, + "loss": 2.6406, + "step": 38674 + }, + { + "epoch": 2.4008318331367557, + "grad_norm": 0.14014973729502295, + "learning_rate": 1.1666843038412772e-05, + "loss": 2.6214, + "step": 38675 + }, + { + "epoch": 2.4008939102365137, + "grad_norm": 0.14200087183614174, + "learning_rate": 1.1664524360919293e-05, + "loss": 2.6615, + "step": 38676 + }, + { + "epoch": 2.4009559873362716, + "grad_norm": 0.17023820043837518, + "learning_rate": 1.1662205883431303e-05, + "loss": 2.6961, + "step": 38677 + }, + { + "epoch": 2.4010180644360295, + "grad_norm": 0.141297012887972, + "learning_rate": 1.165988760596089e-05, + "loss": 2.7664, + "step": 38678 + }, + { + "epoch": 2.4010801415357874, + "grad_norm": 0.14171390680787635, + "learning_rate": 1.1657569528520135e-05, + "loss": 2.7172, + "step": 38679 + }, + { + "epoch": 2.4011422186355453, + "grad_norm": 0.15609525528030185, + "learning_rate": 1.1655251651121158e-05, + "loss": 2.6058, + "step": 38680 + }, + { + "epoch": 2.4012042957353033, + "grad_norm": 0.13859217327816586, + "learning_rate": 1.1652933973776048e-05, + "loss": 2.6629, + "step": 38681 + }, + { + "epoch": 2.401266372835061, + "grad_norm": 0.143086975493794, + "learning_rate": 1.1650616496496875e-05, + "loss": 2.6871, + "step": 38682 + }, + { + "epoch": 2.401328449934819, + "grad_norm": 0.14304795650737945, + "learning_rate": 1.1648299219295755e-05, + "loss": 2.7342, + "step": 38683 + }, + { + "epoch": 2.401390527034577, + "grad_norm": 0.13841816029751372, + "learning_rate": 1.1645982142184759e-05, + "loss": 2.6612, + "step": 38684 + }, + { + "epoch": 2.401452604134335, + "grad_norm": 0.13522045809859531, + "learning_rate": 1.1643665265176001e-05, + "loss": 2.7137, + "step": 38685 + }, + { + "epoch": 2.401514681234093, + "grad_norm": 0.14254704881385913, + "learning_rate": 1.1641348588281547e-05, + "loss": 2.7472, + "step": 38686 + }, + { + "epoch": 2.4015767583338508, + "grad_norm": 0.16887872960470937, + "learning_rate": 1.1639032111513493e-05, + "loss": 2.7073, + "step": 38687 + }, + { + "epoch": 2.4016388354336087, + "grad_norm": 0.132560013144771, + "learning_rate": 1.163671583488392e-05, + "loss": 2.6896, + "step": 38688 + }, + { + "epoch": 2.4017009125333666, + "grad_norm": 0.13636608961456168, + "learning_rate": 1.1634399758404895e-05, + "loss": 2.7105, + "step": 38689 + }, + { + "epoch": 2.4017629896331245, + "grad_norm": 0.133767878115188, + "learning_rate": 1.1632083882088541e-05, + "loss": 2.7515, + "step": 38690 + }, + { + "epoch": 2.4018250667328824, + "grad_norm": 0.13820318480814345, + "learning_rate": 1.1629768205946917e-05, + "loss": 2.7302, + "step": 38691 + }, + { + "epoch": 2.4018871438326403, + "grad_norm": 0.1446944681224013, + "learning_rate": 1.1627452729992105e-05, + "loss": 2.767, + "step": 38692 + }, + { + "epoch": 2.401949220932398, + "grad_norm": 0.1375736889836148, + "learning_rate": 1.162513745423618e-05, + "loss": 2.7036, + "step": 38693 + }, + { + "epoch": 2.402011298032156, + "grad_norm": 0.13913755400629788, + "learning_rate": 1.1622822378691234e-05, + "loss": 2.7071, + "step": 38694 + }, + { + "epoch": 2.4020733751319137, + "grad_norm": 0.13345833434824367, + "learning_rate": 1.1620507503369343e-05, + "loss": 2.6955, + "step": 38695 + }, + { + "epoch": 2.4021354522316716, + "grad_norm": 0.14824897313325225, + "learning_rate": 1.1618192828282587e-05, + "loss": 2.7363, + "step": 38696 + }, + { + "epoch": 2.4021975293314295, + "grad_norm": 0.15217555983269487, + "learning_rate": 1.1615878353443017e-05, + "loss": 2.7822, + "step": 38697 + }, + { + "epoch": 2.4022596064311874, + "grad_norm": 0.13816187657597234, + "learning_rate": 1.1613564078862738e-05, + "loss": 2.6691, + "step": 38698 + }, + { + "epoch": 2.4023216835309453, + "grad_norm": 0.15169454677160826, + "learning_rate": 1.161125000455382e-05, + "loss": 2.724, + "step": 38699 + }, + { + "epoch": 2.4023837606307032, + "grad_norm": 0.1519730839210359, + "learning_rate": 1.1608936130528331e-05, + "loss": 2.7582, + "step": 38700 + }, + { + "epoch": 2.402445837730461, + "grad_norm": 0.15149231937582316, + "learning_rate": 1.1606622456798338e-05, + "loss": 2.7889, + "step": 38701 + }, + { + "epoch": 2.402507914830219, + "grad_norm": 0.1400001737900064, + "learning_rate": 1.16043089833759e-05, + "loss": 2.7321, + "step": 38702 + }, + { + "epoch": 2.402569991929977, + "grad_norm": 0.13542581643468823, + "learning_rate": 1.1601995710273123e-05, + "loss": 2.7124, + "step": 38703 + }, + { + "epoch": 2.402632069029735, + "grad_norm": 0.13751650082673705, + "learning_rate": 1.1599682637502052e-05, + "loss": 2.7158, + "step": 38704 + }, + { + "epoch": 2.402694146129493, + "grad_norm": 0.1746673310720463, + "learning_rate": 1.1597369765074761e-05, + "loss": 2.7146, + "step": 38705 + }, + { + "epoch": 2.4027562232292508, + "grad_norm": 0.1616257784991386, + "learning_rate": 1.1595057093003297e-05, + "loss": 2.7729, + "step": 38706 + }, + { + "epoch": 2.4028183003290087, + "grad_norm": 0.13828531975872665, + "learning_rate": 1.1592744621299762e-05, + "loss": 2.6155, + "step": 38707 + }, + { + "epoch": 2.4028803774287666, + "grad_norm": 0.1435358852607256, + "learning_rate": 1.1590432349976193e-05, + "loss": 2.7458, + "step": 38708 + }, + { + "epoch": 2.4029424545285245, + "grad_norm": 0.1539683768459697, + "learning_rate": 1.1588120279044674e-05, + "loss": 2.7553, + "step": 38709 + }, + { + "epoch": 2.4030045316282824, + "grad_norm": 0.13297958242488944, + "learning_rate": 1.1585808408517252e-05, + "loss": 2.741, + "step": 38710 + }, + { + "epoch": 2.4030666087280403, + "grad_norm": 0.1382664206468242, + "learning_rate": 1.1583496738405975e-05, + "loss": 2.7374, + "step": 38711 + }, + { + "epoch": 2.4031286858277983, + "grad_norm": 0.1443039937475869, + "learning_rate": 1.1581185268722938e-05, + "loss": 2.7352, + "step": 38712 + }, + { + "epoch": 2.403190762927556, + "grad_norm": 0.14155983145516488, + "learning_rate": 1.1578873999480183e-05, + "loss": 2.614, + "step": 38713 + }, + { + "epoch": 2.403252840027314, + "grad_norm": 0.14461571570807655, + "learning_rate": 1.1576562930689771e-05, + "loss": 2.6627, + "step": 38714 + }, + { + "epoch": 2.403314917127072, + "grad_norm": 0.15174468559334314, + "learning_rate": 1.1574252062363743e-05, + "loss": 2.7325, + "step": 38715 + }, + { + "epoch": 2.4033769942268295, + "grad_norm": 0.13577756489317275, + "learning_rate": 1.1571941394514175e-05, + "loss": 2.6995, + "step": 38716 + }, + { + "epoch": 2.403439071326588, + "grad_norm": 0.1356703890822677, + "learning_rate": 1.1569630927153125e-05, + "loss": 2.6939, + "step": 38717 + }, + { + "epoch": 2.4035011484263453, + "grad_norm": 0.15031435506354057, + "learning_rate": 1.1567320660292642e-05, + "loss": 2.7111, + "step": 38718 + }, + { + "epoch": 2.4035632255261032, + "grad_norm": 0.14067350319345187, + "learning_rate": 1.156501059394478e-05, + "loss": 2.744, + "step": 38719 + }, + { + "epoch": 2.403625302625861, + "grad_norm": 0.146307195957989, + "learning_rate": 1.1562700728121567e-05, + "loss": 2.7307, + "step": 38720 + }, + { + "epoch": 2.403687379725619, + "grad_norm": 0.1333612786812493, + "learning_rate": 1.1560391062835097e-05, + "loss": 2.7978, + "step": 38721 + }, + { + "epoch": 2.403749456825377, + "grad_norm": 0.14158781576844637, + "learning_rate": 1.1558081598097393e-05, + "loss": 2.6609, + "step": 38722 + }, + { + "epoch": 2.403811533925135, + "grad_norm": 0.1512493105253946, + "learning_rate": 1.1555772333920512e-05, + "loss": 2.6491, + "step": 38723 + }, + { + "epoch": 2.403873611024893, + "grad_norm": 0.13819784084243325, + "learning_rate": 1.15534632703165e-05, + "loss": 2.7924, + "step": 38724 + }, + { + "epoch": 2.4039356881246507, + "grad_norm": 0.13449007695758247, + "learning_rate": 1.1551154407297388e-05, + "loss": 2.667, + "step": 38725 + }, + { + "epoch": 2.4039977652244087, + "grad_norm": 0.14348711642263814, + "learning_rate": 1.1548845744875253e-05, + "loss": 2.7983, + "step": 38726 + }, + { + "epoch": 2.4040598423241666, + "grad_norm": 0.15592985137307006, + "learning_rate": 1.154653728306212e-05, + "loss": 2.7481, + "step": 38727 + }, + { + "epoch": 2.4041219194239245, + "grad_norm": 0.14565944451530555, + "learning_rate": 1.154422902187004e-05, + "loss": 2.698, + "step": 38728 + }, + { + "epoch": 2.4041839965236824, + "grad_norm": 0.153852081206136, + "learning_rate": 1.1541920961311036e-05, + "loss": 2.7362, + "step": 38729 + }, + { + "epoch": 2.4042460736234403, + "grad_norm": 0.15249174216847572, + "learning_rate": 1.1539613101397183e-05, + "loss": 2.7336, + "step": 38730 + }, + { + "epoch": 2.4043081507231983, + "grad_norm": 0.16474580951297574, + "learning_rate": 1.1537305442140506e-05, + "loss": 2.6858, + "step": 38731 + }, + { + "epoch": 2.404370227822956, + "grad_norm": 0.1454542752797323, + "learning_rate": 1.1534997983553043e-05, + "loss": 2.7029, + "step": 38732 + }, + { + "epoch": 2.404432304922714, + "grad_norm": 0.14780755389610015, + "learning_rate": 1.1532690725646817e-05, + "loss": 2.7806, + "step": 38733 + }, + { + "epoch": 2.404494382022472, + "grad_norm": 0.14298075823565282, + "learning_rate": 1.1530383668433898e-05, + "loss": 2.7613, + "step": 38734 + }, + { + "epoch": 2.40455645912223, + "grad_norm": 0.13790178713048523, + "learning_rate": 1.1528076811926303e-05, + "loss": 2.7343, + "step": 38735 + }, + { + "epoch": 2.404618536221988, + "grad_norm": 0.1463758957051172, + "learning_rate": 1.1525770156136074e-05, + "loss": 2.6961, + "step": 38736 + }, + { + "epoch": 2.4046806133217458, + "grad_norm": 0.14439135301957728, + "learning_rate": 1.1523463701075238e-05, + "loss": 2.7674, + "step": 38737 + }, + { + "epoch": 2.4047426904215037, + "grad_norm": 0.17565107293737844, + "learning_rate": 1.152115744675582e-05, + "loss": 2.8097, + "step": 38738 + }, + { + "epoch": 2.4048047675212616, + "grad_norm": 0.1381211695851985, + "learning_rate": 1.1518851393189884e-05, + "loss": 2.7204, + "step": 38739 + }, + { + "epoch": 2.4048668446210195, + "grad_norm": 0.15915034322040988, + "learning_rate": 1.1516545540389435e-05, + "loss": 2.6756, + "step": 38740 + }, + { + "epoch": 2.404928921720777, + "grad_norm": 0.13792609090985558, + "learning_rate": 1.1514239888366513e-05, + "loss": 2.7087, + "step": 38741 + }, + { + "epoch": 2.4049909988205354, + "grad_norm": 0.1366890398169023, + "learning_rate": 1.1511934437133132e-05, + "loss": 2.6548, + "step": 38742 + }, + { + "epoch": 2.405053075920293, + "grad_norm": 0.1343868225463612, + "learning_rate": 1.1509629186701342e-05, + "loss": 2.7003, + "step": 38743 + }, + { + "epoch": 2.4051151530200507, + "grad_norm": 0.15023412240255607, + "learning_rate": 1.1507324137083165e-05, + "loss": 2.6455, + "step": 38744 + }, + { + "epoch": 2.4051772301198087, + "grad_norm": 0.1468607707943028, + "learning_rate": 1.150501928829062e-05, + "loss": 2.728, + "step": 38745 + }, + { + "epoch": 2.4052393072195666, + "grad_norm": 0.13666057029967793, + "learning_rate": 1.1502714640335738e-05, + "loss": 2.7481, + "step": 38746 + }, + { + "epoch": 2.4053013843193245, + "grad_norm": 0.14026075449762504, + "learning_rate": 1.1500410193230521e-05, + "loss": 2.7381, + "step": 38747 + }, + { + "epoch": 2.4053634614190824, + "grad_norm": 0.14091173714693772, + "learning_rate": 1.1498105946987009e-05, + "loss": 2.6736, + "step": 38748 + }, + { + "epoch": 2.4054255385188403, + "grad_norm": 0.14161691253596695, + "learning_rate": 1.1495801901617243e-05, + "loss": 2.7191, + "step": 38749 + }, + { + "epoch": 2.4054876156185983, + "grad_norm": 0.13888052569151071, + "learning_rate": 1.1493498057133223e-05, + "loss": 2.7378, + "step": 38750 + }, + { + "epoch": 2.405549692718356, + "grad_norm": 0.13676751493257086, + "learning_rate": 1.1491194413546968e-05, + "loss": 2.66, + "step": 38751 + }, + { + "epoch": 2.405611769818114, + "grad_norm": 0.14341646793436863, + "learning_rate": 1.1488890970870491e-05, + "loss": 2.6768, + "step": 38752 + }, + { + "epoch": 2.405673846917872, + "grad_norm": 0.14319796583597946, + "learning_rate": 1.1486587729115833e-05, + "loss": 2.7245, + "step": 38753 + }, + { + "epoch": 2.40573592401763, + "grad_norm": 0.14271136812013657, + "learning_rate": 1.1484284688294994e-05, + "loss": 2.6629, + "step": 38754 + }, + { + "epoch": 2.405798001117388, + "grad_norm": 0.14372734793193215, + "learning_rate": 1.1481981848419993e-05, + "loss": 2.6902, + "step": 38755 + }, + { + "epoch": 2.4058600782171458, + "grad_norm": 0.14637329340289268, + "learning_rate": 1.147967920950283e-05, + "loss": 2.6955, + "step": 38756 + }, + { + "epoch": 2.4059221553169037, + "grad_norm": 0.14236206510529145, + "learning_rate": 1.1477376771555548e-05, + "loss": 2.714, + "step": 38757 + }, + { + "epoch": 2.4059842324166616, + "grad_norm": 0.1350882028159557, + "learning_rate": 1.1475074534590135e-05, + "loss": 2.7275, + "step": 38758 + }, + { + "epoch": 2.4060463095164195, + "grad_norm": 0.14376134335294968, + "learning_rate": 1.1472772498618617e-05, + "loss": 2.7937, + "step": 38759 + }, + { + "epoch": 2.4061083866161774, + "grad_norm": 0.13882295679402062, + "learning_rate": 1.1470470663652999e-05, + "loss": 2.6955, + "step": 38760 + }, + { + "epoch": 2.4061704637159353, + "grad_norm": 0.14207392862336524, + "learning_rate": 1.1468169029705273e-05, + "loss": 2.7055, + "step": 38761 + }, + { + "epoch": 2.4062325408156933, + "grad_norm": 0.14022282868989552, + "learning_rate": 1.1465867596787473e-05, + "loss": 2.728, + "step": 38762 + }, + { + "epoch": 2.406294617915451, + "grad_norm": 0.1368892764240705, + "learning_rate": 1.1463566364911599e-05, + "loss": 2.708, + "step": 38763 + }, + { + "epoch": 2.4063566950152087, + "grad_norm": 0.13380748733288028, + "learning_rate": 1.1461265334089655e-05, + "loss": 2.6492, + "step": 38764 + }, + { + "epoch": 2.406418772114967, + "grad_norm": 0.13651676925724524, + "learning_rate": 1.145896450433363e-05, + "loss": 2.6794, + "step": 38765 + }, + { + "epoch": 2.4064808492147245, + "grad_norm": 0.14919763360727623, + "learning_rate": 1.145666387565556e-05, + "loss": 2.7785, + "step": 38766 + }, + { + "epoch": 2.4065429263144824, + "grad_norm": 0.16362708077371732, + "learning_rate": 1.145436344806743e-05, + "loss": 2.6976, + "step": 38767 + }, + { + "epoch": 2.4066050034142403, + "grad_norm": 0.13879938710464562, + "learning_rate": 1.1452063221581244e-05, + "loss": 2.5952, + "step": 38768 + }, + { + "epoch": 2.4066670805139982, + "grad_norm": 0.1521148306637738, + "learning_rate": 1.1449763196208984e-05, + "loss": 2.7451, + "step": 38769 + }, + { + "epoch": 2.406729157613756, + "grad_norm": 0.13629767424867203, + "learning_rate": 1.1447463371962686e-05, + "loss": 2.7545, + "step": 38770 + }, + { + "epoch": 2.406791234713514, + "grad_norm": 0.14860015665797902, + "learning_rate": 1.1445163748854326e-05, + "loss": 2.723, + "step": 38771 + }, + { + "epoch": 2.406853311813272, + "grad_norm": 0.1396878460318698, + "learning_rate": 1.1442864326895913e-05, + "loss": 2.7161, + "step": 38772 + }, + { + "epoch": 2.40691538891303, + "grad_norm": 0.1451210796712189, + "learning_rate": 1.1440565106099433e-05, + "loss": 2.6983, + "step": 38773 + }, + { + "epoch": 2.406977466012788, + "grad_norm": 0.1391293340982131, + "learning_rate": 1.1438266086476874e-05, + "loss": 2.6919, + "step": 38774 + }, + { + "epoch": 2.4070395431125458, + "grad_norm": 0.13936856374719173, + "learning_rate": 1.1435967268040249e-05, + "loss": 2.6347, + "step": 38775 + }, + { + "epoch": 2.4071016202123037, + "grad_norm": 0.14463001176101214, + "learning_rate": 1.1433668650801549e-05, + "loss": 2.711, + "step": 38776 + }, + { + "epoch": 2.4071636973120616, + "grad_norm": 0.13756159789826466, + "learning_rate": 1.1431370234772764e-05, + "loss": 2.6999, + "step": 38777 + }, + { + "epoch": 2.4072257744118195, + "grad_norm": 0.13365966219378303, + "learning_rate": 1.142907201996587e-05, + "loss": 2.6595, + "step": 38778 + }, + { + "epoch": 2.4072878515115774, + "grad_norm": 0.14650012885257274, + "learning_rate": 1.1426774006392882e-05, + "loss": 2.7294, + "step": 38779 + }, + { + "epoch": 2.4073499286113353, + "grad_norm": 0.13993755502985103, + "learning_rate": 1.1424476194065765e-05, + "loss": 2.7503, + "step": 38780 + }, + { + "epoch": 2.4074120057110933, + "grad_norm": 0.14501330172841032, + "learning_rate": 1.1422178582996528e-05, + "loss": 2.7664, + "step": 38781 + }, + { + "epoch": 2.407474082810851, + "grad_norm": 0.13842403400191558, + "learning_rate": 1.141988117319716e-05, + "loss": 2.6771, + "step": 38782 + }, + { + "epoch": 2.407536159910609, + "grad_norm": 0.13946420347729588, + "learning_rate": 1.1417583964679617e-05, + "loss": 2.6963, + "step": 38783 + }, + { + "epoch": 2.407598237010367, + "grad_norm": 0.1411173466819495, + "learning_rate": 1.1415286957455924e-05, + "loss": 2.831, + "step": 38784 + }, + { + "epoch": 2.407660314110125, + "grad_norm": 0.13567310057107063, + "learning_rate": 1.1412990151538038e-05, + "loss": 2.739, + "step": 38785 + }, + { + "epoch": 2.407722391209883, + "grad_norm": 0.14067059678375438, + "learning_rate": 1.1410693546937956e-05, + "loss": 2.6972, + "step": 38786 + }, + { + "epoch": 2.4077844683096403, + "grad_norm": 0.13958598787953003, + "learning_rate": 1.1408397143667654e-05, + "loss": 2.813, + "step": 38787 + }, + { + "epoch": 2.4078465454093987, + "grad_norm": 0.15065257061506937, + "learning_rate": 1.1406100941739095e-05, + "loss": 2.7322, + "step": 38788 + }, + { + "epoch": 2.407908622509156, + "grad_norm": 0.14753599972180675, + "learning_rate": 1.1403804941164291e-05, + "loss": 2.7397, + "step": 38789 + }, + { + "epoch": 2.407970699608914, + "grad_norm": 0.14559447504777212, + "learning_rate": 1.1401509141955208e-05, + "loss": 2.6922, + "step": 38790 + }, + { + "epoch": 2.408032776708672, + "grad_norm": 0.15292154073634828, + "learning_rate": 1.1399213544123821e-05, + "loss": 2.7769, + "step": 38791 + }, + { + "epoch": 2.40809485380843, + "grad_norm": 0.14280532997337847, + "learning_rate": 1.1396918147682095e-05, + "loss": 2.8057, + "step": 38792 + }, + { + "epoch": 2.408156930908188, + "grad_norm": 0.14563854796090112, + "learning_rate": 1.1394622952642031e-05, + "loss": 2.697, + "step": 38793 + }, + { + "epoch": 2.4082190080079457, + "grad_norm": 0.13929082524656636, + "learning_rate": 1.1392327959015592e-05, + "loss": 2.6643, + "step": 38794 + }, + { + "epoch": 2.4082810851077037, + "grad_norm": 0.15545094737963303, + "learning_rate": 1.1390033166814751e-05, + "loss": 2.812, + "step": 38795 + }, + { + "epoch": 2.4083431622074616, + "grad_norm": 0.15618304964039514, + "learning_rate": 1.1387738576051476e-05, + "loss": 2.6684, + "step": 38796 + }, + { + "epoch": 2.4084052393072195, + "grad_norm": 0.14593384605424092, + "learning_rate": 1.1385444186737732e-05, + "loss": 2.7737, + "step": 38797 + }, + { + "epoch": 2.4084673164069774, + "grad_norm": 0.14863014895525106, + "learning_rate": 1.1383149998885511e-05, + "loss": 2.7406, + "step": 38798 + }, + { + "epoch": 2.4085293935067353, + "grad_norm": 0.1443600607342809, + "learning_rate": 1.1380856012506775e-05, + "loss": 2.6572, + "step": 38799 + }, + { + "epoch": 2.4085914706064933, + "grad_norm": 0.15798048554059055, + "learning_rate": 1.1378562227613481e-05, + "loss": 2.7094, + "step": 38800 + }, + { + "epoch": 2.408653547706251, + "grad_norm": 0.14096104022468792, + "learning_rate": 1.1376268644217597e-05, + "loss": 2.6944, + "step": 38801 + }, + { + "epoch": 2.408715624806009, + "grad_norm": 0.14887983520160344, + "learning_rate": 1.1373975262331104e-05, + "loss": 2.6717, + "step": 38802 + }, + { + "epoch": 2.408777701905767, + "grad_norm": 0.1401577378835133, + "learning_rate": 1.1371682081965963e-05, + "loss": 2.6681, + "step": 38803 + }, + { + "epoch": 2.408839779005525, + "grad_norm": 0.14224189147650232, + "learning_rate": 1.1369389103134132e-05, + "loss": 2.7551, + "step": 38804 + }, + { + "epoch": 2.408901856105283, + "grad_norm": 0.1642337881293167, + "learning_rate": 1.1367096325847565e-05, + "loss": 2.7225, + "step": 38805 + }, + { + "epoch": 2.4089639332050408, + "grad_norm": 0.1416016677329889, + "learning_rate": 1.1364803750118241e-05, + "loss": 2.7263, + "step": 38806 + }, + { + "epoch": 2.4090260103047987, + "grad_norm": 0.1486429436598147, + "learning_rate": 1.1362511375958123e-05, + "loss": 2.7263, + "step": 38807 + }, + { + "epoch": 2.4090880874045566, + "grad_norm": 0.14690196596312552, + "learning_rate": 1.1360219203379157e-05, + "loss": 2.7861, + "step": 38808 + }, + { + "epoch": 2.4091501645043145, + "grad_norm": 0.13755488844450772, + "learning_rate": 1.1357927232393312e-05, + "loss": 2.6731, + "step": 38809 + }, + { + "epoch": 2.4092122416040724, + "grad_norm": 0.14577280786730867, + "learning_rate": 1.1355635463012527e-05, + "loss": 2.7036, + "step": 38810 + }, + { + "epoch": 2.4092743187038304, + "grad_norm": 0.15982586231060228, + "learning_rate": 1.1353343895248786e-05, + "loss": 2.7607, + "step": 38811 + }, + { + "epoch": 2.409336395803588, + "grad_norm": 0.1633935186460566, + "learning_rate": 1.135105252911403e-05, + "loss": 2.76, + "step": 38812 + }, + { + "epoch": 2.409398472903346, + "grad_norm": 0.16010141015291027, + "learning_rate": 1.1348761364620203e-05, + "loss": 2.6257, + "step": 38813 + }, + { + "epoch": 2.4094605500031037, + "grad_norm": 0.1479427600621681, + "learning_rate": 1.1346470401779286e-05, + "loss": 2.6394, + "step": 38814 + }, + { + "epoch": 2.4095226271028616, + "grad_norm": 0.14607915779921352, + "learning_rate": 1.1344179640603203e-05, + "loss": 2.7321, + "step": 38815 + }, + { + "epoch": 2.4095847042026195, + "grad_norm": 0.15565406882108235, + "learning_rate": 1.1341889081103929e-05, + "loss": 2.6648, + "step": 38816 + }, + { + "epoch": 2.4096467813023774, + "grad_norm": 0.14010243544755696, + "learning_rate": 1.1339598723293404e-05, + "loss": 2.6599, + "step": 38817 + }, + { + "epoch": 2.4097088584021353, + "grad_norm": 0.13879942736371848, + "learning_rate": 1.1337308567183586e-05, + "loss": 2.7054, + "step": 38818 + }, + { + "epoch": 2.4097709355018933, + "grad_norm": 0.15312034784762688, + "learning_rate": 1.133501861278639e-05, + "loss": 2.6666, + "step": 38819 + }, + { + "epoch": 2.409833012601651, + "grad_norm": 0.1597569477271117, + "learning_rate": 1.1332728860113807e-05, + "loss": 2.822, + "step": 38820 + }, + { + "epoch": 2.409895089701409, + "grad_norm": 0.15435152655298573, + "learning_rate": 1.1330439309177765e-05, + "loss": 2.6945, + "step": 38821 + }, + { + "epoch": 2.409957166801167, + "grad_norm": 0.1356998075563982, + "learning_rate": 1.1328149959990208e-05, + "loss": 2.7069, + "step": 38822 + }, + { + "epoch": 2.410019243900925, + "grad_norm": 0.1418123255212046, + "learning_rate": 1.1325860812563082e-05, + "loss": 2.6663, + "step": 38823 + }, + { + "epoch": 2.410081321000683, + "grad_norm": 0.14010645044396422, + "learning_rate": 1.1323571866908311e-05, + "loss": 2.7161, + "step": 38824 + }, + { + "epoch": 2.4101433981004408, + "grad_norm": 0.13791195080460306, + "learning_rate": 1.1321283123037873e-05, + "loss": 2.7311, + "step": 38825 + }, + { + "epoch": 2.4102054752001987, + "grad_norm": 0.13770396969682344, + "learning_rate": 1.1318994580963688e-05, + "loss": 2.7575, + "step": 38826 + }, + { + "epoch": 2.4102675522999566, + "grad_norm": 0.1341388543089304, + "learning_rate": 1.1316706240697694e-05, + "loss": 2.6791, + "step": 38827 + }, + { + "epoch": 2.4103296293997145, + "grad_norm": 0.168362394646562, + "learning_rate": 1.1314418102251827e-05, + "loss": 2.7687, + "step": 38828 + }, + { + "epoch": 2.4103917064994724, + "grad_norm": 0.13515111136177096, + "learning_rate": 1.1312130165638047e-05, + "loss": 2.7065, + "step": 38829 + }, + { + "epoch": 2.4104537835992303, + "grad_norm": 0.13590009684480286, + "learning_rate": 1.1309842430868267e-05, + "loss": 2.815, + "step": 38830 + }, + { + "epoch": 2.4105158606989883, + "grad_norm": 0.13876689422485713, + "learning_rate": 1.1307554897954436e-05, + "loss": 2.6845, + "step": 38831 + }, + { + "epoch": 2.410577937798746, + "grad_norm": 0.13925927630881693, + "learning_rate": 1.1305267566908489e-05, + "loss": 2.781, + "step": 38832 + }, + { + "epoch": 2.410640014898504, + "grad_norm": 0.1499918692093814, + "learning_rate": 1.1302980437742333e-05, + "loss": 2.6581, + "step": 38833 + }, + { + "epoch": 2.410702091998262, + "grad_norm": 0.1555106731900046, + "learning_rate": 1.130069351046794e-05, + "loss": 2.7402, + "step": 38834 + }, + { + "epoch": 2.4107641690980195, + "grad_norm": 0.14582053764156247, + "learning_rate": 1.1298406785097227e-05, + "loss": 2.7341, + "step": 38835 + }, + { + "epoch": 2.410826246197778, + "grad_norm": 0.1480447441893858, + "learning_rate": 1.1296120261642113e-05, + "loss": 2.6702, + "step": 38836 + }, + { + "epoch": 2.4108883232975353, + "grad_norm": 0.1435682977984546, + "learning_rate": 1.1293833940114528e-05, + "loss": 2.7485, + "step": 38837 + }, + { + "epoch": 2.4109504003972932, + "grad_norm": 0.13316968263116993, + "learning_rate": 1.1291547820526415e-05, + "loss": 2.621, + "step": 38838 + }, + { + "epoch": 2.411012477497051, + "grad_norm": 0.1440663334921896, + "learning_rate": 1.1289261902889704e-05, + "loss": 2.6417, + "step": 38839 + }, + { + "epoch": 2.411074554596809, + "grad_norm": 0.14206505260951766, + "learning_rate": 1.1286976187216302e-05, + "loss": 2.697, + "step": 38840 + }, + { + "epoch": 2.411136631696567, + "grad_norm": 0.13913207858406842, + "learning_rate": 1.1284690673518133e-05, + "loss": 2.634, + "step": 38841 + }, + { + "epoch": 2.411198708796325, + "grad_norm": 0.13589632079254232, + "learning_rate": 1.1282405361807141e-05, + "loss": 2.6407, + "step": 38842 + }, + { + "epoch": 2.411260785896083, + "grad_norm": 0.14337796082968957, + "learning_rate": 1.1280120252095244e-05, + "loss": 2.6963, + "step": 38843 + }, + { + "epoch": 2.4113228629958408, + "grad_norm": 0.15250998988027892, + "learning_rate": 1.1277835344394355e-05, + "loss": 2.7714, + "step": 38844 + }, + { + "epoch": 2.4113849400955987, + "grad_norm": 0.13937575387109344, + "learning_rate": 1.1275550638716403e-05, + "loss": 2.7202, + "step": 38845 + }, + { + "epoch": 2.4114470171953566, + "grad_norm": 0.1343642213845469, + "learning_rate": 1.1273266135073291e-05, + "loss": 2.6679, + "step": 38846 + }, + { + "epoch": 2.4115090942951145, + "grad_norm": 0.1335043906680139, + "learning_rate": 1.1270981833476946e-05, + "loss": 2.7801, + "step": 38847 + }, + { + "epoch": 2.4115711713948724, + "grad_norm": 0.1452683712495671, + "learning_rate": 1.126869773393931e-05, + "loss": 2.6203, + "step": 38848 + }, + { + "epoch": 2.4116332484946303, + "grad_norm": 0.144532287439283, + "learning_rate": 1.1266413836472278e-05, + "loss": 2.72, + "step": 38849 + }, + { + "epoch": 2.4116953255943883, + "grad_norm": 0.13636656768404457, + "learning_rate": 1.1264130141087764e-05, + "loss": 2.6223, + "step": 38850 + }, + { + "epoch": 2.411757402694146, + "grad_norm": 0.13419633675996243, + "learning_rate": 1.1261846647797674e-05, + "loss": 2.6406, + "step": 38851 + }, + { + "epoch": 2.411819479793904, + "grad_norm": 0.14123609010178553, + "learning_rate": 1.1259563356613951e-05, + "loss": 2.6473, + "step": 38852 + }, + { + "epoch": 2.411881556893662, + "grad_norm": 0.13866756996671922, + "learning_rate": 1.1257280267548492e-05, + "loss": 2.7428, + "step": 38853 + }, + { + "epoch": 2.41194363399342, + "grad_norm": 0.13625146164022614, + "learning_rate": 1.1254997380613203e-05, + "loss": 2.6368, + "step": 38854 + }, + { + "epoch": 2.412005711093178, + "grad_norm": 0.14609709088182526, + "learning_rate": 1.1252714695819988e-05, + "loss": 2.7137, + "step": 38855 + }, + { + "epoch": 2.4120677881929358, + "grad_norm": 0.14047033361023886, + "learning_rate": 1.1250432213180779e-05, + "loss": 2.7931, + "step": 38856 + }, + { + "epoch": 2.4121298652926937, + "grad_norm": 0.13737573806644263, + "learning_rate": 1.124814993270747e-05, + "loss": 2.6997, + "step": 38857 + }, + { + "epoch": 2.4121919423924516, + "grad_norm": 0.14638073115608063, + "learning_rate": 1.1245867854411968e-05, + "loss": 2.7437, + "step": 38858 + }, + { + "epoch": 2.4122540194922095, + "grad_norm": 0.1495294561647225, + "learning_rate": 1.1243585978306187e-05, + "loss": 2.6099, + "step": 38859 + }, + { + "epoch": 2.412316096591967, + "grad_norm": 0.14592833206329042, + "learning_rate": 1.1241304304402011e-05, + "loss": 2.7051, + "step": 38860 + }, + { + "epoch": 2.4123781736917254, + "grad_norm": 0.1434881873735024, + "learning_rate": 1.123902283271137e-05, + "loss": 2.7341, + "step": 38861 + }, + { + "epoch": 2.412440250791483, + "grad_norm": 0.14285617654018046, + "learning_rate": 1.1236741563246162e-05, + "loss": 2.6984, + "step": 38862 + }, + { + "epoch": 2.4125023278912407, + "grad_norm": 0.15372429611536864, + "learning_rate": 1.1234460496018279e-05, + "loss": 2.6694, + "step": 38863 + }, + { + "epoch": 2.4125644049909987, + "grad_norm": 0.13438426529191244, + "learning_rate": 1.1232179631039613e-05, + "loss": 2.7278, + "step": 38864 + }, + { + "epoch": 2.4126264820907566, + "grad_norm": 0.1432574757140726, + "learning_rate": 1.1229898968322094e-05, + "loss": 2.6897, + "step": 38865 + }, + { + "epoch": 2.4126885591905145, + "grad_norm": 0.14854034826069007, + "learning_rate": 1.1227618507877603e-05, + "loss": 2.6753, + "step": 38866 + }, + { + "epoch": 2.4127506362902724, + "grad_norm": 0.14040816331787037, + "learning_rate": 1.1225338249718032e-05, + "loss": 2.7226, + "step": 38867 + }, + { + "epoch": 2.4128127133900303, + "grad_norm": 0.1417034703253752, + "learning_rate": 1.1223058193855278e-05, + "loss": 2.7521, + "step": 38868 + }, + { + "epoch": 2.4128747904897883, + "grad_norm": 0.1433560300847376, + "learning_rate": 1.1220778340301252e-05, + "loss": 2.571, + "step": 38869 + }, + { + "epoch": 2.412936867589546, + "grad_norm": 0.13743834494981122, + "learning_rate": 1.121849868906784e-05, + "loss": 2.6742, + "step": 38870 + }, + { + "epoch": 2.412998944689304, + "grad_norm": 0.1659696459210017, + "learning_rate": 1.1216219240166942e-05, + "loss": 2.7183, + "step": 38871 + }, + { + "epoch": 2.413061021789062, + "grad_norm": 0.14632527294118816, + "learning_rate": 1.1213939993610435e-05, + "loss": 2.7278, + "step": 38872 + }, + { + "epoch": 2.41312309888882, + "grad_norm": 0.13204543810572586, + "learning_rate": 1.1211660949410208e-05, + "loss": 2.6615, + "step": 38873 + }, + { + "epoch": 2.413185175988578, + "grad_norm": 0.1454101977530805, + "learning_rate": 1.1209382107578176e-05, + "loss": 2.7093, + "step": 38874 + }, + { + "epoch": 2.4132472530883358, + "grad_norm": 0.15752597400711968, + "learning_rate": 1.1207103468126212e-05, + "loss": 2.7025, + "step": 38875 + }, + { + "epoch": 2.4133093301880937, + "grad_norm": 0.14980915291707128, + "learning_rate": 1.1204825031066207e-05, + "loss": 2.8074, + "step": 38876 + }, + { + "epoch": 2.4133714072878516, + "grad_norm": 0.13760269377822817, + "learning_rate": 1.1202546796410036e-05, + "loss": 2.6836, + "step": 38877 + }, + { + "epoch": 2.4134334843876095, + "grad_norm": 0.14242469030455054, + "learning_rate": 1.1200268764169614e-05, + "loss": 2.6806, + "step": 38878 + }, + { + "epoch": 2.4134955614873674, + "grad_norm": 0.1352557240718602, + "learning_rate": 1.1197990934356789e-05, + "loss": 2.6259, + "step": 38879 + }, + { + "epoch": 2.4135576385871254, + "grad_norm": 0.14324920615255876, + "learning_rate": 1.1195713306983479e-05, + "loss": 2.723, + "step": 38880 + }, + { + "epoch": 2.4136197156868833, + "grad_norm": 0.13759694068782194, + "learning_rate": 1.1193435882061554e-05, + "loss": 2.7116, + "step": 38881 + }, + { + "epoch": 2.413681792786641, + "grad_norm": 0.13866655581306342, + "learning_rate": 1.1191158659602897e-05, + "loss": 2.6326, + "step": 38882 + }, + { + "epoch": 2.4137438698863987, + "grad_norm": 0.13524850083990447, + "learning_rate": 1.118888163961937e-05, + "loss": 2.7107, + "step": 38883 + }, + { + "epoch": 2.413805946986157, + "grad_norm": 0.1382681517135739, + "learning_rate": 1.1186604822122882e-05, + "loss": 2.7472, + "step": 38884 + }, + { + "epoch": 2.4138680240859145, + "grad_norm": 0.1410226762017852, + "learning_rate": 1.11843282071253e-05, + "loss": 2.7635, + "step": 38885 + }, + { + "epoch": 2.4139301011856724, + "grad_norm": 0.13869045697214913, + "learning_rate": 1.1182051794638498e-05, + "loss": 2.7534, + "step": 38886 + }, + { + "epoch": 2.4139921782854303, + "grad_norm": 0.15538488143463366, + "learning_rate": 1.1179775584674345e-05, + "loss": 2.7306, + "step": 38887 + }, + { + "epoch": 2.4140542553851883, + "grad_norm": 0.1397397603026625, + "learning_rate": 1.1177499577244744e-05, + "loss": 2.6364, + "step": 38888 + }, + { + "epoch": 2.414116332484946, + "grad_norm": 0.14342627328681842, + "learning_rate": 1.1175223772361549e-05, + "loss": 2.761, + "step": 38889 + }, + { + "epoch": 2.414178409584704, + "grad_norm": 0.13542623593069394, + "learning_rate": 1.1172948170036635e-05, + "loss": 2.7663, + "step": 38890 + }, + { + "epoch": 2.414240486684462, + "grad_norm": 0.14401140675425708, + "learning_rate": 1.1170672770281858e-05, + "loss": 2.7325, + "step": 38891 + }, + { + "epoch": 2.41430256378422, + "grad_norm": 0.15156550797692545, + "learning_rate": 1.116839757310913e-05, + "loss": 2.7168, + "step": 38892 + }, + { + "epoch": 2.414364640883978, + "grad_norm": 0.13526683704246967, + "learning_rate": 1.1166122578530291e-05, + "loss": 2.6132, + "step": 38893 + }, + { + "epoch": 2.4144267179837358, + "grad_norm": 0.14070732303183633, + "learning_rate": 1.1163847786557224e-05, + "loss": 2.8195, + "step": 38894 + }, + { + "epoch": 2.4144887950834937, + "grad_norm": 0.14151808810462344, + "learning_rate": 1.1161573197201785e-05, + "loss": 2.6795, + "step": 38895 + }, + { + "epoch": 2.4145508721832516, + "grad_norm": 0.14515878432887602, + "learning_rate": 1.1159298810475838e-05, + "loss": 2.7615, + "step": 38896 + }, + { + "epoch": 2.4146129492830095, + "grad_norm": 0.13681363491652387, + "learning_rate": 1.1157024626391272e-05, + "loss": 2.6814, + "step": 38897 + }, + { + "epoch": 2.4146750263827674, + "grad_norm": 0.13547474428794395, + "learning_rate": 1.1154750644959933e-05, + "loss": 2.6921, + "step": 38898 + }, + { + "epoch": 2.4147371034825253, + "grad_norm": 0.13808743801138, + "learning_rate": 1.1152476866193695e-05, + "loss": 2.635, + "step": 38899 + }, + { + "epoch": 2.4147991805822833, + "grad_norm": 0.15000240577318227, + "learning_rate": 1.1150203290104399e-05, + "loss": 2.704, + "step": 38900 + }, + { + "epoch": 2.414861257682041, + "grad_norm": 0.1438150769106032, + "learning_rate": 1.1147929916703936e-05, + "loss": 2.6623, + "step": 38901 + }, + { + "epoch": 2.414923334781799, + "grad_norm": 0.1439366611070325, + "learning_rate": 1.1145656746004157e-05, + "loss": 2.7312, + "step": 38902 + }, + { + "epoch": 2.414985411881557, + "grad_norm": 0.13637483802081243, + "learning_rate": 1.1143383778016919e-05, + "loss": 2.7468, + "step": 38903 + }, + { + "epoch": 2.415047488981315, + "grad_norm": 0.1363871572291616, + "learning_rate": 1.1141111012754069e-05, + "loss": 2.6669, + "step": 38904 + }, + { + "epoch": 2.415109566081073, + "grad_norm": 0.17030539475939893, + "learning_rate": 1.1138838450227485e-05, + "loss": 2.7708, + "step": 38905 + }, + { + "epoch": 2.4151716431808308, + "grad_norm": 0.13781390933313106, + "learning_rate": 1.1136566090449019e-05, + "loss": 2.8079, + "step": 38906 + }, + { + "epoch": 2.4152337202805887, + "grad_norm": 0.14098105125648305, + "learning_rate": 1.1134293933430518e-05, + "loss": 2.6579, + "step": 38907 + }, + { + "epoch": 2.415295797380346, + "grad_norm": 0.1383774008228197, + "learning_rate": 1.1132021979183843e-05, + "loss": 2.7552, + "step": 38908 + }, + { + "epoch": 2.4153578744801045, + "grad_norm": 0.14968247706839605, + "learning_rate": 1.1129750227720832e-05, + "loss": 2.6238, + "step": 38909 + }, + { + "epoch": 2.415419951579862, + "grad_norm": 0.14672233327322912, + "learning_rate": 1.112747867905336e-05, + "loss": 2.6984, + "step": 38910 + }, + { + "epoch": 2.41548202867962, + "grad_norm": 0.14348518220021622, + "learning_rate": 1.112520733319326e-05, + "loss": 2.7145, + "step": 38911 + }, + { + "epoch": 2.415544105779378, + "grad_norm": 0.1729030047663181, + "learning_rate": 1.11229361901524e-05, + "loss": 2.7436, + "step": 38912 + }, + { + "epoch": 2.4156061828791358, + "grad_norm": 0.14267450497870415, + "learning_rate": 1.1120665249942624e-05, + "loss": 2.7853, + "step": 38913 + }, + { + "epoch": 2.4156682599788937, + "grad_norm": 0.14205814283228446, + "learning_rate": 1.1118394512575758e-05, + "loss": 2.673, + "step": 38914 + }, + { + "epoch": 2.4157303370786516, + "grad_norm": 0.15203556653527517, + "learning_rate": 1.111612397806368e-05, + "loss": 2.6641, + "step": 38915 + }, + { + "epoch": 2.4157924141784095, + "grad_norm": 0.14027497652166154, + "learning_rate": 1.1113853646418227e-05, + "loss": 2.6833, + "step": 38916 + }, + { + "epoch": 2.4158544912781674, + "grad_norm": 0.13707571431580357, + "learning_rate": 1.1111583517651237e-05, + "loss": 2.6388, + "step": 38917 + }, + { + "epoch": 2.4159165683779253, + "grad_norm": 0.14863297491277777, + "learning_rate": 1.110931359177454e-05, + "loss": 2.7406, + "step": 38918 + }, + { + "epoch": 2.4159786454776833, + "grad_norm": 0.15544988095971277, + "learning_rate": 1.110704386880001e-05, + "loss": 2.696, + "step": 38919 + }, + { + "epoch": 2.416040722577441, + "grad_norm": 0.14777090324066042, + "learning_rate": 1.1104774348739478e-05, + "loss": 2.7567, + "step": 38920 + }, + { + "epoch": 2.416102799677199, + "grad_norm": 0.14688451157877558, + "learning_rate": 1.1102505031604776e-05, + "loss": 2.6788, + "step": 38921 + }, + { + "epoch": 2.416164876776957, + "grad_norm": 0.1406101046726706, + "learning_rate": 1.1100235917407748e-05, + "loss": 2.7399, + "step": 38922 + }, + { + "epoch": 2.416226953876715, + "grad_norm": 0.156978464744136, + "learning_rate": 1.109796700616022e-05, + "loss": 2.7056, + "step": 38923 + }, + { + "epoch": 2.416289030976473, + "grad_norm": 0.13639048267837356, + "learning_rate": 1.1095698297874053e-05, + "loss": 2.7218, + "step": 38924 + }, + { + "epoch": 2.4163511080762308, + "grad_norm": 0.13949461114167166, + "learning_rate": 1.1093429792561077e-05, + "loss": 2.7411, + "step": 38925 + }, + { + "epoch": 2.4164131851759887, + "grad_norm": 0.13429498036475584, + "learning_rate": 1.109116149023312e-05, + "loss": 2.668, + "step": 38926 + }, + { + "epoch": 2.4164752622757466, + "grad_norm": 0.13560734370222605, + "learning_rate": 1.1088893390902006e-05, + "loss": 2.6662, + "step": 38927 + }, + { + "epoch": 2.4165373393755045, + "grad_norm": 0.13867001465100265, + "learning_rate": 1.1086625494579595e-05, + "loss": 2.7186, + "step": 38928 + }, + { + "epoch": 2.4165994164752624, + "grad_norm": 0.1417749461512429, + "learning_rate": 1.10843578012777e-05, + "loss": 2.7529, + "step": 38929 + }, + { + "epoch": 2.4166614935750204, + "grad_norm": 0.13627171319344006, + "learning_rate": 1.1082090311008165e-05, + "loss": 2.6883, + "step": 38930 + }, + { + "epoch": 2.416723570674778, + "grad_norm": 0.1390503768244174, + "learning_rate": 1.107982302378281e-05, + "loss": 2.6604, + "step": 38931 + }, + { + "epoch": 2.416785647774536, + "grad_norm": 0.16150991643844373, + "learning_rate": 1.1077555939613454e-05, + "loss": 2.6381, + "step": 38932 + }, + { + "epoch": 2.4168477248742937, + "grad_norm": 0.13961852360162352, + "learning_rate": 1.107528905851195e-05, + "loss": 2.7289, + "step": 38933 + }, + { + "epoch": 2.4169098019740516, + "grad_norm": 0.15800934568022038, + "learning_rate": 1.1073022380490111e-05, + "loss": 2.751, + "step": 38934 + }, + { + "epoch": 2.4169718790738095, + "grad_norm": 0.1375694714412773, + "learning_rate": 1.1070755905559766e-05, + "loss": 2.7153, + "step": 38935 + }, + { + "epoch": 2.4170339561735674, + "grad_norm": 0.14690483150474917, + "learning_rate": 1.106848963373272e-05, + "loss": 2.7566, + "step": 38936 + }, + { + "epoch": 2.4170960332733253, + "grad_norm": 0.13694650888646712, + "learning_rate": 1.1066223565020833e-05, + "loss": 2.7023, + "step": 38937 + }, + { + "epoch": 2.4171581103730833, + "grad_norm": 0.14369138709614293, + "learning_rate": 1.1063957699435906e-05, + "loss": 2.6147, + "step": 38938 + }, + { + "epoch": 2.417220187472841, + "grad_norm": 0.14037488841751297, + "learning_rate": 1.1061692036989769e-05, + "loss": 2.7227, + "step": 38939 + }, + { + "epoch": 2.417282264572599, + "grad_norm": 0.14275565264937484, + "learning_rate": 1.1059426577694216e-05, + "loss": 2.7882, + "step": 38940 + }, + { + "epoch": 2.417344341672357, + "grad_norm": 0.13979977630277057, + "learning_rate": 1.1057161321561105e-05, + "loss": 2.7465, + "step": 38941 + }, + { + "epoch": 2.417406418772115, + "grad_norm": 0.14955242457488954, + "learning_rate": 1.1054896268602239e-05, + "loss": 2.732, + "step": 38942 + }, + { + "epoch": 2.417468495871873, + "grad_norm": 0.14079586218894147, + "learning_rate": 1.1052631418829435e-05, + "loss": 2.7164, + "step": 38943 + }, + { + "epoch": 2.4175305729716308, + "grad_norm": 0.14405330685516074, + "learning_rate": 1.1050366772254489e-05, + "loss": 2.674, + "step": 38944 + }, + { + "epoch": 2.4175926500713887, + "grad_norm": 0.13870637759956236, + "learning_rate": 1.1048102328889248e-05, + "loss": 2.6395, + "step": 38945 + }, + { + "epoch": 2.4176547271711466, + "grad_norm": 0.1492438874366786, + "learning_rate": 1.1045838088745503e-05, + "loss": 2.6918, + "step": 38946 + }, + { + "epoch": 2.4177168042709045, + "grad_norm": 0.13741335170656793, + "learning_rate": 1.1043574051835087e-05, + "loss": 2.6929, + "step": 38947 + }, + { + "epoch": 2.4177788813706624, + "grad_norm": 0.13467533431681736, + "learning_rate": 1.104131021816981e-05, + "loss": 2.7987, + "step": 38948 + }, + { + "epoch": 2.4178409584704204, + "grad_norm": 0.14845950816782838, + "learning_rate": 1.1039046587761464e-05, + "loss": 2.7083, + "step": 38949 + }, + { + "epoch": 2.4179030355701783, + "grad_norm": 0.1477554991863785, + "learning_rate": 1.1036783160621866e-05, + "loss": 2.7232, + "step": 38950 + }, + { + "epoch": 2.417965112669936, + "grad_norm": 0.1390320171207281, + "learning_rate": 1.1034519936762838e-05, + "loss": 2.7091, + "step": 38951 + }, + { + "epoch": 2.418027189769694, + "grad_norm": 0.13398903846966045, + "learning_rate": 1.1032256916196182e-05, + "loss": 2.6788, + "step": 38952 + }, + { + "epoch": 2.418089266869452, + "grad_norm": 0.1414099416198223, + "learning_rate": 1.1029994098933699e-05, + "loss": 2.7167, + "step": 38953 + }, + { + "epoch": 2.41815134396921, + "grad_norm": 0.13611565814308282, + "learning_rate": 1.102773148498718e-05, + "loss": 2.5907, + "step": 38954 + }, + { + "epoch": 2.418213421068968, + "grad_norm": 0.167422687806236, + "learning_rate": 1.1025469074368466e-05, + "loss": 2.6931, + "step": 38955 + }, + { + "epoch": 2.4182754981687253, + "grad_norm": 0.13537562570621026, + "learning_rate": 1.1023206867089337e-05, + "loss": 2.6645, + "step": 38956 + }, + { + "epoch": 2.4183375752684837, + "grad_norm": 0.14159063099726402, + "learning_rate": 1.1020944863161603e-05, + "loss": 2.7665, + "step": 38957 + }, + { + "epoch": 2.418399652368241, + "grad_norm": 0.14392061371002635, + "learning_rate": 1.1018683062597063e-05, + "loss": 2.7175, + "step": 38958 + }, + { + "epoch": 2.418461729467999, + "grad_norm": 0.1507895492088116, + "learning_rate": 1.10164214654075e-05, + "loss": 2.6654, + "step": 38959 + }, + { + "epoch": 2.418523806567757, + "grad_norm": 0.13351388462510472, + "learning_rate": 1.101416007160474e-05, + "loss": 2.7061, + "step": 38960 + }, + { + "epoch": 2.418585883667515, + "grad_norm": 0.1352824170742331, + "learning_rate": 1.1011898881200577e-05, + "loss": 2.7207, + "step": 38961 + }, + { + "epoch": 2.418647960767273, + "grad_norm": 0.13813073727831152, + "learning_rate": 1.1009637894206804e-05, + "loss": 2.6156, + "step": 38962 + }, + { + "epoch": 2.4187100378670308, + "grad_norm": 0.1341265855128422, + "learning_rate": 1.1007377110635193e-05, + "loss": 2.7269, + "step": 38963 + }, + { + "epoch": 2.4187721149667887, + "grad_norm": 0.14560692247244547, + "learning_rate": 1.1005116530497583e-05, + "loss": 2.6846, + "step": 38964 + }, + { + "epoch": 2.4188341920665466, + "grad_norm": 0.14838311177739066, + "learning_rate": 1.100285615380574e-05, + "loss": 2.6775, + "step": 38965 + }, + { + "epoch": 2.4188962691663045, + "grad_norm": 0.13702277711926322, + "learning_rate": 1.1000595980571472e-05, + "loss": 2.6966, + "step": 38966 + }, + { + "epoch": 2.4189583462660624, + "grad_norm": 0.14378693665847847, + "learning_rate": 1.0998336010806554e-05, + "loss": 2.5433, + "step": 38967 + }, + { + "epoch": 2.4190204233658203, + "grad_norm": 0.1407485260437375, + "learning_rate": 1.0996076244522775e-05, + "loss": 2.7049, + "step": 38968 + }, + { + "epoch": 2.4190825004655783, + "grad_norm": 0.14062052958058532, + "learning_rate": 1.099381668173195e-05, + "loss": 2.8046, + "step": 38969 + }, + { + "epoch": 2.419144577565336, + "grad_norm": 0.1433153427855221, + "learning_rate": 1.099155732244585e-05, + "loss": 2.605, + "step": 38970 + }, + { + "epoch": 2.419206654665094, + "grad_norm": 0.14742963976644746, + "learning_rate": 1.0989298166676265e-05, + "loss": 2.6836, + "step": 38971 + }, + { + "epoch": 2.419268731764852, + "grad_norm": 0.14332850147182485, + "learning_rate": 1.098703921443497e-05, + "loss": 2.7451, + "step": 38972 + }, + { + "epoch": 2.41933080886461, + "grad_norm": 0.14521081427580576, + "learning_rate": 1.0984780465733769e-05, + "loss": 2.7064, + "step": 38973 + }, + { + "epoch": 2.419392885964368, + "grad_norm": 0.1413268219237725, + "learning_rate": 1.098252192058445e-05, + "loss": 2.6813, + "step": 38974 + }, + { + "epoch": 2.4194549630641258, + "grad_norm": 0.1579422683950798, + "learning_rate": 1.098026357899878e-05, + "loss": 2.6908, + "step": 38975 + }, + { + "epoch": 2.4195170401638837, + "grad_norm": 0.14897688942373802, + "learning_rate": 1.0978005440988532e-05, + "loss": 2.627, + "step": 38976 + }, + { + "epoch": 2.4195791172636416, + "grad_norm": 0.15249577908263728, + "learning_rate": 1.0975747506565504e-05, + "loss": 2.6559, + "step": 38977 + }, + { + "epoch": 2.4196411943633995, + "grad_norm": 0.15775958037907756, + "learning_rate": 1.0973489775741492e-05, + "loss": 2.7424, + "step": 38978 + }, + { + "epoch": 2.419703271463157, + "grad_norm": 0.15509888296263089, + "learning_rate": 1.0971232248528252e-05, + "loss": 2.726, + "step": 38979 + }, + { + "epoch": 2.4197653485629154, + "grad_norm": 0.15474967068427609, + "learning_rate": 1.096897492493757e-05, + "loss": 2.6894, + "step": 38980 + }, + { + "epoch": 2.419827425662673, + "grad_norm": 0.14916784906273892, + "learning_rate": 1.0966717804981225e-05, + "loss": 2.8064, + "step": 38981 + }, + { + "epoch": 2.4198895027624308, + "grad_norm": 0.1560263642639278, + "learning_rate": 1.0964460888670968e-05, + "loss": 2.7802, + "step": 38982 + }, + { + "epoch": 2.4199515798621887, + "grad_norm": 0.1507044218431741, + "learning_rate": 1.0962204176018614e-05, + "loss": 2.6981, + "step": 38983 + }, + { + "epoch": 2.4200136569619466, + "grad_norm": 0.16073530467467154, + "learning_rate": 1.0959947667035919e-05, + "loss": 2.8101, + "step": 38984 + }, + { + "epoch": 2.4200757340617045, + "grad_norm": 0.13626120162553854, + "learning_rate": 1.0957691361734652e-05, + "loss": 2.6982, + "step": 38985 + }, + { + "epoch": 2.4201378111614624, + "grad_norm": 0.14326900159426564, + "learning_rate": 1.0955435260126574e-05, + "loss": 2.7832, + "step": 38986 + }, + { + "epoch": 2.4201998882612203, + "grad_norm": 0.15196129279261578, + "learning_rate": 1.0953179362223482e-05, + "loss": 2.6644, + "step": 38987 + }, + { + "epoch": 2.4202619653609783, + "grad_norm": 0.1388064188456055, + "learning_rate": 1.0950923668037127e-05, + "loss": 2.8023, + "step": 38988 + }, + { + "epoch": 2.420324042460736, + "grad_norm": 0.1418380406947885, + "learning_rate": 1.0948668177579285e-05, + "loss": 2.6992, + "step": 38989 + }, + { + "epoch": 2.420386119560494, + "grad_norm": 0.1934268910606352, + "learning_rate": 1.0946412890861712e-05, + "loss": 2.7076, + "step": 38990 + }, + { + "epoch": 2.420448196660252, + "grad_norm": 0.16574885869855913, + "learning_rate": 1.0944157807896193e-05, + "loss": 2.7707, + "step": 38991 + }, + { + "epoch": 2.42051027376001, + "grad_norm": 0.14354321051151694, + "learning_rate": 1.0941902928694486e-05, + "loss": 2.6101, + "step": 38992 + }, + { + "epoch": 2.420572350859768, + "grad_norm": 0.14504862663652107, + "learning_rate": 1.0939648253268353e-05, + "loss": 2.6993, + "step": 38993 + }, + { + "epoch": 2.4206344279595258, + "grad_norm": 0.1436342356333544, + "learning_rate": 1.0937393781629556e-05, + "loss": 2.7862, + "step": 38994 + }, + { + "epoch": 2.4206965050592837, + "grad_norm": 0.14245392340967428, + "learning_rate": 1.093513951378985e-05, + "loss": 2.7213, + "step": 38995 + }, + { + "epoch": 2.4207585821590416, + "grad_norm": 0.14046908715900358, + "learning_rate": 1.093288544976101e-05, + "loss": 2.6471, + "step": 38996 + }, + { + "epoch": 2.4208206592587995, + "grad_norm": 0.15028803286214845, + "learning_rate": 1.0930631589554796e-05, + "loss": 2.7361, + "step": 38997 + }, + { + "epoch": 2.4208827363585574, + "grad_norm": 0.17019080114262525, + "learning_rate": 1.092837793318296e-05, + "loss": 2.6637, + "step": 38998 + }, + { + "epoch": 2.4209448134583154, + "grad_norm": 0.1362649675665499, + "learning_rate": 1.0926124480657246e-05, + "loss": 2.6125, + "step": 38999 + }, + { + "epoch": 2.4210068905580733, + "grad_norm": 0.15663730184973124, + "learning_rate": 1.092387123198944e-05, + "loss": 2.6985, + "step": 39000 + }, + { + "epoch": 2.421068967657831, + "grad_norm": 0.16086142834635386, + "learning_rate": 1.0921618187191291e-05, + "loss": 2.7195, + "step": 39001 + }, + { + "epoch": 2.421131044757589, + "grad_norm": 0.14310968284811226, + "learning_rate": 1.091936534627454e-05, + "loss": 2.7047, + "step": 39002 + }, + { + "epoch": 2.421193121857347, + "grad_norm": 0.1612231781464718, + "learning_rate": 1.0917112709250948e-05, + "loss": 2.7373, + "step": 39003 + }, + { + "epoch": 2.4212551989571045, + "grad_norm": 0.13878902000471688, + "learning_rate": 1.091486027613225e-05, + "loss": 2.6947, + "step": 39004 + }, + { + "epoch": 2.421317276056863, + "grad_norm": 0.14681624647616195, + "learning_rate": 1.0912608046930235e-05, + "loss": 2.7526, + "step": 39005 + }, + { + "epoch": 2.4213793531566203, + "grad_norm": 0.13773793041208043, + "learning_rate": 1.0910356021656626e-05, + "loss": 2.7396, + "step": 39006 + }, + { + "epoch": 2.4214414302563783, + "grad_norm": 0.16098240276869863, + "learning_rate": 1.0908104200323183e-05, + "loss": 2.7938, + "step": 39007 + }, + { + "epoch": 2.421503507356136, + "grad_norm": 0.13696852075765173, + "learning_rate": 1.0905852582941634e-05, + "loss": 2.6841, + "step": 39008 + }, + { + "epoch": 2.421565584455894, + "grad_norm": 0.14494562674164344, + "learning_rate": 1.0903601169523752e-05, + "loss": 2.6286, + "step": 39009 + }, + { + "epoch": 2.421627661555652, + "grad_norm": 0.1385844855238653, + "learning_rate": 1.090134996008127e-05, + "loss": 2.7195, + "step": 39010 + }, + { + "epoch": 2.42168973865541, + "grad_norm": 0.13770691843022534, + "learning_rate": 1.0899098954625942e-05, + "loss": 2.6505, + "step": 39011 + }, + { + "epoch": 2.421751815755168, + "grad_norm": 0.15342746926168774, + "learning_rate": 1.0896848153169508e-05, + "loss": 2.6867, + "step": 39012 + }, + { + "epoch": 2.4218138928549258, + "grad_norm": 0.1456067241919037, + "learning_rate": 1.0894597555723701e-05, + "loss": 2.6759, + "step": 39013 + }, + { + "epoch": 2.4218759699546837, + "grad_norm": 0.13690226363904823, + "learning_rate": 1.0892347162300282e-05, + "loss": 2.7285, + "step": 39014 + }, + { + "epoch": 2.4219380470544416, + "grad_norm": 0.13675203599364358, + "learning_rate": 1.0890096972910985e-05, + "loss": 2.7372, + "step": 39015 + }, + { + "epoch": 2.4220001241541995, + "grad_norm": 0.15038985091171203, + "learning_rate": 1.088784698756754e-05, + "loss": 2.7469, + "step": 39016 + }, + { + "epoch": 2.4220622012539574, + "grad_norm": 0.14753589872160122, + "learning_rate": 1.0885597206281701e-05, + "loss": 2.723, + "step": 39017 + }, + { + "epoch": 2.4221242783537154, + "grad_norm": 0.14436666680310273, + "learning_rate": 1.0883347629065178e-05, + "loss": 2.7349, + "step": 39018 + }, + { + "epoch": 2.4221863554534733, + "grad_norm": 0.15958158973802902, + "learning_rate": 1.0881098255929739e-05, + "loss": 2.7974, + "step": 39019 + }, + { + "epoch": 2.422248432553231, + "grad_norm": 0.13525142047677324, + "learning_rate": 1.0878849086887106e-05, + "loss": 2.684, + "step": 39020 + }, + { + "epoch": 2.422310509652989, + "grad_norm": 0.14766003260737617, + "learning_rate": 1.0876600121949016e-05, + "loss": 2.7263, + "step": 39021 + }, + { + "epoch": 2.422372586752747, + "grad_norm": 0.13994617999733322, + "learning_rate": 1.087435136112719e-05, + "loss": 2.7758, + "step": 39022 + }, + { + "epoch": 2.422434663852505, + "grad_norm": 0.13986185745134616, + "learning_rate": 1.0872102804433376e-05, + "loss": 2.6645, + "step": 39023 + }, + { + "epoch": 2.422496740952263, + "grad_norm": 0.13867057208671446, + "learning_rate": 1.0869854451879307e-05, + "loss": 2.7482, + "step": 39024 + }, + { + "epoch": 2.422558818052021, + "grad_norm": 0.1369612314596335, + "learning_rate": 1.0867606303476708e-05, + "loss": 2.7248, + "step": 39025 + }, + { + "epoch": 2.4226208951517787, + "grad_norm": 0.15111824607310725, + "learning_rate": 1.0865358359237287e-05, + "loss": 2.639, + "step": 39026 + }, + { + "epoch": 2.422682972251536, + "grad_norm": 0.14072600684323355, + "learning_rate": 1.0863110619172812e-05, + "loss": 2.716, + "step": 39027 + }, + { + "epoch": 2.4227450493512945, + "grad_norm": 0.1455516621103919, + "learning_rate": 1.0860863083294986e-05, + "loss": 2.7341, + "step": 39028 + }, + { + "epoch": 2.422807126451052, + "grad_norm": 0.13866917513515986, + "learning_rate": 1.0858615751615542e-05, + "loss": 2.6899, + "step": 39029 + }, + { + "epoch": 2.42286920355081, + "grad_norm": 0.143104483058404, + "learning_rate": 1.0856368624146196e-05, + "loss": 2.7265, + "step": 39030 + }, + { + "epoch": 2.422931280650568, + "grad_norm": 0.14924914165390582, + "learning_rate": 1.0854121700898667e-05, + "loss": 2.6998, + "step": 39031 + }, + { + "epoch": 2.4229933577503258, + "grad_norm": 0.13389884315089629, + "learning_rate": 1.0851874981884701e-05, + "loss": 2.6169, + "step": 39032 + }, + { + "epoch": 2.4230554348500837, + "grad_norm": 0.13330339422215087, + "learning_rate": 1.0849628467116007e-05, + "loss": 2.6709, + "step": 39033 + }, + { + "epoch": 2.4231175119498416, + "grad_norm": 0.14477805381242392, + "learning_rate": 1.0847382156604308e-05, + "loss": 2.6958, + "step": 39034 + }, + { + "epoch": 2.4231795890495995, + "grad_norm": 0.1542053192458765, + "learning_rate": 1.0845136050361303e-05, + "loss": 2.6827, + "step": 39035 + }, + { + "epoch": 2.4232416661493574, + "grad_norm": 0.15649641395381647, + "learning_rate": 1.0842890148398743e-05, + "loss": 2.8518, + "step": 39036 + }, + { + "epoch": 2.4233037432491153, + "grad_norm": 0.15306424667900748, + "learning_rate": 1.0840644450728332e-05, + "loss": 2.7425, + "step": 39037 + }, + { + "epoch": 2.4233658203488733, + "grad_norm": 0.1442420101523155, + "learning_rate": 1.0838398957361784e-05, + "loss": 2.7633, + "step": 39038 + }, + { + "epoch": 2.423427897448631, + "grad_norm": 0.14908899837462813, + "learning_rate": 1.08361536683108e-05, + "loss": 2.7325, + "step": 39039 + }, + { + "epoch": 2.423489974548389, + "grad_norm": 0.13606541386788537, + "learning_rate": 1.0833908583587127e-05, + "loss": 2.6444, + "step": 39040 + }, + { + "epoch": 2.423552051648147, + "grad_norm": 0.14548165130974508, + "learning_rate": 1.083166370320246e-05, + "loss": 2.6617, + "step": 39041 + }, + { + "epoch": 2.423614128747905, + "grad_norm": 0.1380208543261626, + "learning_rate": 1.0829419027168497e-05, + "loss": 2.7187, + "step": 39042 + }, + { + "epoch": 2.423676205847663, + "grad_norm": 0.1395995314532153, + "learning_rate": 1.0827174555496978e-05, + "loss": 2.7175, + "step": 39043 + }, + { + "epoch": 2.4237382829474208, + "grad_norm": 0.14164934338744997, + "learning_rate": 1.0824930288199603e-05, + "loss": 2.7595, + "step": 39044 + }, + { + "epoch": 2.4238003600471787, + "grad_norm": 0.1387059277376687, + "learning_rate": 1.0822686225288059e-05, + "loss": 2.7358, + "step": 39045 + }, + { + "epoch": 2.4238624371469366, + "grad_norm": 0.15144777306306037, + "learning_rate": 1.0820442366774081e-05, + "loss": 2.6622, + "step": 39046 + }, + { + "epoch": 2.4239245142466945, + "grad_norm": 0.13558540621087145, + "learning_rate": 1.0818198712669375e-05, + "loss": 2.6909, + "step": 39047 + }, + { + "epoch": 2.4239865913464524, + "grad_norm": 0.14672055595757102, + "learning_rate": 1.0815955262985634e-05, + "loss": 2.7387, + "step": 39048 + }, + { + "epoch": 2.4240486684462104, + "grad_norm": 0.1392926572459596, + "learning_rate": 1.0813712017734551e-05, + "loss": 2.6839, + "step": 39049 + }, + { + "epoch": 2.4241107455459683, + "grad_norm": 0.14051723324561513, + "learning_rate": 1.0811468976927863e-05, + "loss": 2.7121, + "step": 39050 + }, + { + "epoch": 2.424172822645726, + "grad_norm": 0.15587893174504266, + "learning_rate": 1.080922614057725e-05, + "loss": 2.7393, + "step": 39051 + }, + { + "epoch": 2.4242348997454837, + "grad_norm": 0.1533432779302924, + "learning_rate": 1.0806983508694424e-05, + "loss": 2.6574, + "step": 39052 + }, + { + "epoch": 2.424296976845242, + "grad_norm": 0.1365347509649075, + "learning_rate": 1.0804741081291076e-05, + "loss": 2.7532, + "step": 39053 + }, + { + "epoch": 2.4243590539449995, + "grad_norm": 0.1463594094340835, + "learning_rate": 1.080249885837889e-05, + "loss": 2.7207, + "step": 39054 + }, + { + "epoch": 2.4244211310447574, + "grad_norm": 0.15438055835400766, + "learning_rate": 1.0800256839969602e-05, + "loss": 2.6788, + "step": 39055 + }, + { + "epoch": 2.4244832081445153, + "grad_norm": 0.13905105999170841, + "learning_rate": 1.0798015026074893e-05, + "loss": 2.689, + "step": 39056 + }, + { + "epoch": 2.4245452852442733, + "grad_norm": 0.1511988349649697, + "learning_rate": 1.0795773416706451e-05, + "loss": 2.7488, + "step": 39057 + }, + { + "epoch": 2.424607362344031, + "grad_norm": 0.1373332112015294, + "learning_rate": 1.0793532011875968e-05, + "loss": 2.727, + "step": 39058 + }, + { + "epoch": 2.424669439443789, + "grad_norm": 0.13900264727092143, + "learning_rate": 1.0791290811595156e-05, + "loss": 2.7447, + "step": 39059 + }, + { + "epoch": 2.424731516543547, + "grad_norm": 0.16054974458269328, + "learning_rate": 1.0789049815875696e-05, + "loss": 2.7657, + "step": 39060 + }, + { + "epoch": 2.424793593643305, + "grad_norm": 0.14339437421311932, + "learning_rate": 1.0786809024729289e-05, + "loss": 2.7515, + "step": 39061 + }, + { + "epoch": 2.424855670743063, + "grad_norm": 0.1391226868629952, + "learning_rate": 1.0784568438167597e-05, + "loss": 2.6422, + "step": 39062 + }, + { + "epoch": 2.4249177478428208, + "grad_norm": 0.1369803174469719, + "learning_rate": 1.078232805620235e-05, + "loss": 2.5676, + "step": 39063 + }, + { + "epoch": 2.4249798249425787, + "grad_norm": 0.1544045121153613, + "learning_rate": 1.078008787884522e-05, + "loss": 2.6636, + "step": 39064 + }, + { + "epoch": 2.4250419020423366, + "grad_norm": 0.13591170532117383, + "learning_rate": 1.0777847906107885e-05, + "loss": 2.6881, + "step": 39065 + }, + { + "epoch": 2.4251039791420945, + "grad_norm": 0.14404221227414907, + "learning_rate": 1.077560813800204e-05, + "loss": 2.712, + "step": 39066 + }, + { + "epoch": 2.4251660562418524, + "grad_norm": 0.16645833619594771, + "learning_rate": 1.0773368574539362e-05, + "loss": 2.7574, + "step": 39067 + }, + { + "epoch": 2.4252281333416104, + "grad_norm": 0.1513574720156541, + "learning_rate": 1.0771129215731552e-05, + "loss": 2.6709, + "step": 39068 + }, + { + "epoch": 2.4252902104413683, + "grad_norm": 0.13961484144808792, + "learning_rate": 1.0768890061590287e-05, + "loss": 2.6244, + "step": 39069 + }, + { + "epoch": 2.425352287541126, + "grad_norm": 0.13631797359238979, + "learning_rate": 1.0766651112127246e-05, + "loss": 2.8104, + "step": 39070 + }, + { + "epoch": 2.425414364640884, + "grad_norm": 0.14153956009508403, + "learning_rate": 1.0764412367354092e-05, + "loss": 2.7201, + "step": 39071 + }, + { + "epoch": 2.425476441740642, + "grad_norm": 0.14218933188128194, + "learning_rate": 1.076217382728254e-05, + "loss": 2.7959, + "step": 39072 + }, + { + "epoch": 2.4255385188404, + "grad_norm": 0.14097164398094603, + "learning_rate": 1.0759935491924256e-05, + "loss": 2.7237, + "step": 39073 + }, + { + "epoch": 2.425600595940158, + "grad_norm": 0.15007992641593068, + "learning_rate": 1.0757697361290908e-05, + "loss": 2.716, + "step": 39074 + }, + { + "epoch": 2.4256626730399153, + "grad_norm": 0.1445838085184158, + "learning_rate": 1.0755459435394171e-05, + "loss": 2.7374, + "step": 39075 + }, + { + "epoch": 2.4257247501396737, + "grad_norm": 0.14447138809610865, + "learning_rate": 1.0753221714245726e-05, + "loss": 2.6799, + "step": 39076 + }, + { + "epoch": 2.425786827239431, + "grad_norm": 0.13678832327507726, + "learning_rate": 1.0750984197857266e-05, + "loss": 2.6883, + "step": 39077 + }, + { + "epoch": 2.425848904339189, + "grad_norm": 0.13971657857826517, + "learning_rate": 1.0748746886240446e-05, + "loss": 2.712, + "step": 39078 + }, + { + "epoch": 2.425910981438947, + "grad_norm": 0.13907077662879802, + "learning_rate": 1.0746509779406949e-05, + "loss": 2.7397, + "step": 39079 + }, + { + "epoch": 2.425973058538705, + "grad_norm": 0.13925476873423343, + "learning_rate": 1.0744272877368433e-05, + "loss": 2.651, + "step": 39080 + }, + { + "epoch": 2.426035135638463, + "grad_norm": 0.13576545790307396, + "learning_rate": 1.074203618013656e-05, + "loss": 2.6333, + "step": 39081 + }, + { + "epoch": 2.4260972127382208, + "grad_norm": 0.14689542952958895, + "learning_rate": 1.0739799687723035e-05, + "loss": 2.7591, + "step": 39082 + }, + { + "epoch": 2.4261592898379787, + "grad_norm": 0.13721966261094362, + "learning_rate": 1.0737563400139505e-05, + "loss": 2.7125, + "step": 39083 + }, + { + "epoch": 2.4262213669377366, + "grad_norm": 0.16894496520783997, + "learning_rate": 1.0735327317397637e-05, + "loss": 2.7427, + "step": 39084 + }, + { + "epoch": 2.4262834440374945, + "grad_norm": 0.1373881980050839, + "learning_rate": 1.073309143950908e-05, + "loss": 2.6614, + "step": 39085 + }, + { + "epoch": 2.4263455211372524, + "grad_norm": 0.14605853808974986, + "learning_rate": 1.0730855766485536e-05, + "loss": 2.7831, + "step": 39086 + }, + { + "epoch": 2.4264075982370104, + "grad_norm": 0.14229314882429, + "learning_rate": 1.0728620298338649e-05, + "loss": 2.7318, + "step": 39087 + }, + { + "epoch": 2.4264696753367683, + "grad_norm": 0.13846747687358651, + "learning_rate": 1.0726385035080083e-05, + "loss": 2.7227, + "step": 39088 + }, + { + "epoch": 2.426531752436526, + "grad_norm": 0.15061038544893, + "learning_rate": 1.0724149976721488e-05, + "loss": 2.6749, + "step": 39089 + }, + { + "epoch": 2.426593829536284, + "grad_norm": 0.14245168749498302, + "learning_rate": 1.0721915123274545e-05, + "loss": 2.6628, + "step": 39090 + }, + { + "epoch": 2.426655906636042, + "grad_norm": 0.14266723957120353, + "learning_rate": 1.071968047475091e-05, + "loss": 2.7602, + "step": 39091 + }, + { + "epoch": 2.4267179837358, + "grad_norm": 0.16425990535387094, + "learning_rate": 1.0717446031162236e-05, + "loss": 2.76, + "step": 39092 + }, + { + "epoch": 2.426780060835558, + "grad_norm": 0.14081231199015778, + "learning_rate": 1.0715211792520186e-05, + "loss": 2.7289, + "step": 39093 + }, + { + "epoch": 2.426842137935316, + "grad_norm": 0.14241025134398727, + "learning_rate": 1.0712977758836402e-05, + "loss": 2.632, + "step": 39094 + }, + { + "epoch": 2.4269042150350737, + "grad_norm": 0.13981894773972584, + "learning_rate": 1.0710743930122558e-05, + "loss": 2.7475, + "step": 39095 + }, + { + "epoch": 2.4269662921348316, + "grad_norm": 0.13869844101584552, + "learning_rate": 1.0708510306390301e-05, + "loss": 2.7415, + "step": 39096 + }, + { + "epoch": 2.4270283692345895, + "grad_norm": 0.13655885474405052, + "learning_rate": 1.0706276887651284e-05, + "loss": 2.673, + "step": 39097 + }, + { + "epoch": 2.4270904463343475, + "grad_norm": 0.1444414800795435, + "learning_rate": 1.0704043673917147e-05, + "loss": 2.7606, + "step": 39098 + }, + { + "epoch": 2.4271525234341054, + "grad_norm": 0.13988665956023671, + "learning_rate": 1.0701810665199568e-05, + "loss": 2.7599, + "step": 39099 + }, + { + "epoch": 2.427214600533863, + "grad_norm": 0.14992423006506955, + "learning_rate": 1.0699577861510179e-05, + "loss": 2.6987, + "step": 39100 + }, + { + "epoch": 2.427276677633621, + "grad_norm": 0.14224156410330843, + "learning_rate": 1.0697345262860636e-05, + "loss": 2.7146, + "step": 39101 + }, + { + "epoch": 2.4273387547333787, + "grad_norm": 0.13531062592889112, + "learning_rate": 1.0695112869262586e-05, + "loss": 2.6503, + "step": 39102 + }, + { + "epoch": 2.4274008318331366, + "grad_norm": 0.14756594954509683, + "learning_rate": 1.0692880680727656e-05, + "loss": 2.685, + "step": 39103 + }, + { + "epoch": 2.4274629089328945, + "grad_norm": 0.1377821438847541, + "learning_rate": 1.0690648697267524e-05, + "loss": 2.7043, + "step": 39104 + }, + { + "epoch": 2.4275249860326524, + "grad_norm": 0.13950918497334489, + "learning_rate": 1.0688416918893823e-05, + "loss": 2.7572, + "step": 39105 + }, + { + "epoch": 2.4275870631324103, + "grad_norm": 0.1503912256897659, + "learning_rate": 1.0686185345618187e-05, + "loss": 2.6626, + "step": 39106 + }, + { + "epoch": 2.4276491402321683, + "grad_norm": 0.14516640620430896, + "learning_rate": 1.068395397745226e-05, + "loss": 2.776, + "step": 39107 + }, + { + "epoch": 2.427711217331926, + "grad_norm": 0.14215174372120468, + "learning_rate": 1.0681722814407686e-05, + "loss": 2.7277, + "step": 39108 + }, + { + "epoch": 2.427773294431684, + "grad_norm": 0.14091527305768708, + "learning_rate": 1.0679491856496127e-05, + "loss": 2.7366, + "step": 39109 + }, + { + "epoch": 2.427835371531442, + "grad_norm": 0.13503685898399595, + "learning_rate": 1.0677261103729197e-05, + "loss": 2.7063, + "step": 39110 + }, + { + "epoch": 2.4278974486312, + "grad_norm": 0.13382212376542135, + "learning_rate": 1.0675030556118549e-05, + "loss": 2.6797, + "step": 39111 + }, + { + "epoch": 2.427959525730958, + "grad_norm": 0.1349084001073335, + "learning_rate": 1.0672800213675793e-05, + "loss": 2.7336, + "step": 39112 + }, + { + "epoch": 2.4280216028307158, + "grad_norm": 0.13490768215496424, + "learning_rate": 1.0670570076412601e-05, + "loss": 2.7522, + "step": 39113 + }, + { + "epoch": 2.4280836799304737, + "grad_norm": 0.15268325877144961, + "learning_rate": 1.0668340144340594e-05, + "loss": 2.6999, + "step": 39114 + }, + { + "epoch": 2.4281457570302316, + "grad_norm": 0.14103468187646206, + "learning_rate": 1.0666110417471403e-05, + "loss": 2.6424, + "step": 39115 + }, + { + "epoch": 2.4282078341299895, + "grad_norm": 0.13751024972036532, + "learning_rate": 1.0663880895816663e-05, + "loss": 2.7792, + "step": 39116 + }, + { + "epoch": 2.4282699112297474, + "grad_norm": 0.13230622792142635, + "learning_rate": 1.0661651579387994e-05, + "loss": 2.6174, + "step": 39117 + }, + { + "epoch": 2.4283319883295054, + "grad_norm": 0.1370280853523716, + "learning_rate": 1.065942246819705e-05, + "loss": 2.6528, + "step": 39118 + }, + { + "epoch": 2.4283940654292633, + "grad_norm": 0.13904410760313718, + "learning_rate": 1.0657193562255452e-05, + "loss": 2.6567, + "step": 39119 + }, + { + "epoch": 2.428456142529021, + "grad_norm": 0.151474477829248, + "learning_rate": 1.0654964861574823e-05, + "loss": 2.6862, + "step": 39120 + }, + { + "epoch": 2.428518219628779, + "grad_norm": 0.14752076684713875, + "learning_rate": 1.0652736366166782e-05, + "loss": 2.7445, + "step": 39121 + }, + { + "epoch": 2.428580296728537, + "grad_norm": 0.13472705095511728, + "learning_rate": 1.0650508076042975e-05, + "loss": 2.7124, + "step": 39122 + }, + { + "epoch": 2.4286423738282945, + "grad_norm": 0.13675420165232308, + "learning_rate": 1.0648279991215026e-05, + "loss": 2.6946, + "step": 39123 + }, + { + "epoch": 2.428704450928053, + "grad_norm": 0.1402568966411682, + "learning_rate": 1.0646052111694555e-05, + "loss": 2.6958, + "step": 39124 + }, + { + "epoch": 2.4287665280278103, + "grad_norm": 0.1355896167810924, + "learning_rate": 1.0643824437493166e-05, + "loss": 2.7226, + "step": 39125 + }, + { + "epoch": 2.4288286051275683, + "grad_norm": 0.1372780124153152, + "learning_rate": 1.0641596968622513e-05, + "loss": 2.7505, + "step": 39126 + }, + { + "epoch": 2.428890682227326, + "grad_norm": 0.1497821885462962, + "learning_rate": 1.0639369705094203e-05, + "loss": 2.7617, + "step": 39127 + }, + { + "epoch": 2.428952759327084, + "grad_norm": 0.13852497836277863, + "learning_rate": 1.0637142646919857e-05, + "loss": 2.6461, + "step": 39128 + }, + { + "epoch": 2.429014836426842, + "grad_norm": 0.15095753765773934, + "learning_rate": 1.0634915794111094e-05, + "loss": 2.7877, + "step": 39129 + }, + { + "epoch": 2.4290769135266, + "grad_norm": 0.14745517113128878, + "learning_rate": 1.0632689146679514e-05, + "loss": 2.6691, + "step": 39130 + }, + { + "epoch": 2.429138990626358, + "grad_norm": 0.13601220670564504, + "learning_rate": 1.0630462704636768e-05, + "loss": 2.683, + "step": 39131 + }, + { + "epoch": 2.4292010677261158, + "grad_norm": 0.1403331907857562, + "learning_rate": 1.0628236467994456e-05, + "loss": 2.6877, + "step": 39132 + }, + { + "epoch": 2.4292631448258737, + "grad_norm": 0.1368757776669397, + "learning_rate": 1.0626010436764195e-05, + "loss": 2.7815, + "step": 39133 + }, + { + "epoch": 2.4293252219256316, + "grad_norm": 0.13599306017992926, + "learning_rate": 1.0623784610957576e-05, + "loss": 2.6571, + "step": 39134 + }, + { + "epoch": 2.4293872990253895, + "grad_norm": 0.1440059937688365, + "learning_rate": 1.062155899058625e-05, + "loss": 2.662, + "step": 39135 + }, + { + "epoch": 2.4294493761251474, + "grad_norm": 0.14889695560833352, + "learning_rate": 1.0619333575661804e-05, + "loss": 2.7399, + "step": 39136 + }, + { + "epoch": 2.4295114532249054, + "grad_norm": 0.1426171745753746, + "learning_rate": 1.061710836619586e-05, + "loss": 2.7094, + "step": 39137 + }, + { + "epoch": 2.4295735303246633, + "grad_norm": 0.13963016977404152, + "learning_rate": 1.0614883362200024e-05, + "loss": 2.7583, + "step": 39138 + }, + { + "epoch": 2.429635607424421, + "grad_norm": 0.14693088508207552, + "learning_rate": 1.0612658563685884e-05, + "loss": 2.7131, + "step": 39139 + }, + { + "epoch": 2.429697684524179, + "grad_norm": 0.13437235851796575, + "learning_rate": 1.0610433970665085e-05, + "loss": 2.6397, + "step": 39140 + }, + { + "epoch": 2.429759761623937, + "grad_norm": 0.13602192958876527, + "learning_rate": 1.0608209583149198e-05, + "loss": 2.657, + "step": 39141 + }, + { + "epoch": 2.429821838723695, + "grad_norm": 0.14607753842209834, + "learning_rate": 1.0605985401149854e-05, + "loss": 2.7198, + "step": 39142 + }, + { + "epoch": 2.429883915823453, + "grad_norm": 0.14374588540656955, + "learning_rate": 1.0603761424678654e-05, + "loss": 2.8327, + "step": 39143 + }, + { + "epoch": 2.429945992923211, + "grad_norm": 0.14173352242332057, + "learning_rate": 1.0601537653747174e-05, + "loss": 2.728, + "step": 39144 + }, + { + "epoch": 2.4300080700229687, + "grad_norm": 0.15003199360311006, + "learning_rate": 1.0599314088367056e-05, + "loss": 2.6556, + "step": 39145 + }, + { + "epoch": 2.4300701471227266, + "grad_norm": 0.13959552856330215, + "learning_rate": 1.059709072854988e-05, + "loss": 2.7021, + "step": 39146 + }, + { + "epoch": 2.4301322242224845, + "grad_norm": 0.1484686542744329, + "learning_rate": 1.0594867574307243e-05, + "loss": 2.7285, + "step": 39147 + }, + { + "epoch": 2.430194301322242, + "grad_norm": 0.13878981182348119, + "learning_rate": 1.0592644625650739e-05, + "loss": 2.7865, + "step": 39148 + }, + { + "epoch": 2.430256378422, + "grad_norm": 0.1441831714619499, + "learning_rate": 1.0590421882591984e-05, + "loss": 2.7154, + "step": 39149 + }, + { + "epoch": 2.430318455521758, + "grad_norm": 0.13377644834431246, + "learning_rate": 1.0588199345142569e-05, + "loss": 2.7272, + "step": 39150 + }, + { + "epoch": 2.4303805326215158, + "grad_norm": 0.15104701728770692, + "learning_rate": 1.058597701331408e-05, + "loss": 2.6846, + "step": 39151 + }, + { + "epoch": 2.4304426097212737, + "grad_norm": 0.13873391711183866, + "learning_rate": 1.0583754887118125e-05, + "loss": 2.746, + "step": 39152 + }, + { + "epoch": 2.4305046868210316, + "grad_norm": 0.1418344292945278, + "learning_rate": 1.0581532966566272e-05, + "loss": 2.6608, + "step": 39153 + }, + { + "epoch": 2.4305667639207895, + "grad_norm": 0.15281629633722205, + "learning_rate": 1.0579311251670144e-05, + "loss": 2.7353, + "step": 39154 + }, + { + "epoch": 2.4306288410205474, + "grad_norm": 0.14172152994913853, + "learning_rate": 1.0577089742441315e-05, + "loss": 2.674, + "step": 39155 + }, + { + "epoch": 2.4306909181203054, + "grad_norm": 0.1398049191422101, + "learning_rate": 1.0574868438891384e-05, + "loss": 2.7191, + "step": 39156 + }, + { + "epoch": 2.4307529952200633, + "grad_norm": 0.14728748781538764, + "learning_rate": 1.0572647341031922e-05, + "loss": 2.7296, + "step": 39157 + }, + { + "epoch": 2.430815072319821, + "grad_norm": 0.13946424354445652, + "learning_rate": 1.0570426448874537e-05, + "loss": 2.6412, + "step": 39158 + }, + { + "epoch": 2.430877149419579, + "grad_norm": 0.13829381206802588, + "learning_rate": 1.0568205762430816e-05, + "loss": 2.6633, + "step": 39159 + }, + { + "epoch": 2.430939226519337, + "grad_norm": 0.13768133809487162, + "learning_rate": 1.0565985281712337e-05, + "loss": 2.6392, + "step": 39160 + }, + { + "epoch": 2.431001303619095, + "grad_norm": 0.1382368207667509, + "learning_rate": 1.0563765006730669e-05, + "loss": 2.691, + "step": 39161 + }, + { + "epoch": 2.431063380718853, + "grad_norm": 0.14061209831394061, + "learning_rate": 1.056154493749743e-05, + "loss": 2.7876, + "step": 39162 + }, + { + "epoch": 2.431125457818611, + "grad_norm": 0.14175065840083817, + "learning_rate": 1.0559325074024185e-05, + "loss": 2.7293, + "step": 39163 + }, + { + "epoch": 2.4311875349183687, + "grad_norm": 0.1571495625803871, + "learning_rate": 1.055710541632251e-05, + "loss": 2.7032, + "step": 39164 + }, + { + "epoch": 2.4312496120181266, + "grad_norm": 0.14322153345633173, + "learning_rate": 1.0554885964403994e-05, + "loss": 2.7152, + "step": 39165 + }, + { + "epoch": 2.4313116891178845, + "grad_norm": 0.14040882661290452, + "learning_rate": 1.0552666718280202e-05, + "loss": 2.6428, + "step": 39166 + }, + { + "epoch": 2.4313737662176425, + "grad_norm": 0.13756273529088073, + "learning_rate": 1.0550447677962738e-05, + "loss": 2.7226, + "step": 39167 + }, + { + "epoch": 2.4314358433174004, + "grad_norm": 0.14169080487121216, + "learning_rate": 1.0548228843463164e-05, + "loss": 2.6929, + "step": 39168 + }, + { + "epoch": 2.4314979204171583, + "grad_norm": 0.13821462006579513, + "learning_rate": 1.0546010214793057e-05, + "loss": 2.7392, + "step": 39169 + }, + { + "epoch": 2.431559997516916, + "grad_norm": 0.15251453314774235, + "learning_rate": 1.0543791791963975e-05, + "loss": 2.6745, + "step": 39170 + }, + { + "epoch": 2.4316220746166737, + "grad_norm": 0.1442417518854188, + "learning_rate": 1.0541573574987529e-05, + "loss": 2.7869, + "step": 39171 + }, + { + "epoch": 2.431684151716432, + "grad_norm": 0.14127253090031155, + "learning_rate": 1.0539355563875269e-05, + "loss": 2.7416, + "step": 39172 + }, + { + "epoch": 2.4317462288161895, + "grad_norm": 0.13468159944792651, + "learning_rate": 1.053713775863876e-05, + "loss": 2.73, + "step": 39173 + }, + { + "epoch": 2.4318083059159474, + "grad_norm": 0.15555109234835263, + "learning_rate": 1.0534920159289597e-05, + "loss": 2.6969, + "step": 39174 + }, + { + "epoch": 2.4318703830157054, + "grad_norm": 0.14020324085145358, + "learning_rate": 1.0532702765839319e-05, + "loss": 2.7601, + "step": 39175 + }, + { + "epoch": 2.4319324601154633, + "grad_norm": 0.13594274318931338, + "learning_rate": 1.0530485578299526e-05, + "loss": 2.7679, + "step": 39176 + }, + { + "epoch": 2.431994537215221, + "grad_norm": 0.13495169109683788, + "learning_rate": 1.0528268596681773e-05, + "loss": 2.6798, + "step": 39177 + }, + { + "epoch": 2.432056614314979, + "grad_norm": 0.14129461365611007, + "learning_rate": 1.0526051820997623e-05, + "loss": 2.6838, + "step": 39178 + }, + { + "epoch": 2.432118691414737, + "grad_norm": 0.14985481403343937, + "learning_rate": 1.0523835251258646e-05, + "loss": 2.6851, + "step": 39179 + }, + { + "epoch": 2.432180768514495, + "grad_norm": 0.14151483708107868, + "learning_rate": 1.0521618887476387e-05, + "loss": 2.7379, + "step": 39180 + }, + { + "epoch": 2.432242845614253, + "grad_norm": 0.15381127006987655, + "learning_rate": 1.0519402729662441e-05, + "loss": 2.6533, + "step": 39181 + }, + { + "epoch": 2.4323049227140108, + "grad_norm": 0.14479063574563453, + "learning_rate": 1.0517186777828358e-05, + "loss": 2.6841, + "step": 39182 + }, + { + "epoch": 2.4323669998137687, + "grad_norm": 0.14547215095133773, + "learning_rate": 1.0514971031985693e-05, + "loss": 2.8255, + "step": 39183 + }, + { + "epoch": 2.4324290769135266, + "grad_norm": 0.13982133898891383, + "learning_rate": 1.0512755492145997e-05, + "loss": 2.6832, + "step": 39184 + }, + { + "epoch": 2.4324911540132845, + "grad_norm": 0.1418316320376764, + "learning_rate": 1.0510540158320852e-05, + "loss": 2.7224, + "step": 39185 + }, + { + "epoch": 2.4325532311130424, + "grad_norm": 0.1389056706389627, + "learning_rate": 1.050832503052181e-05, + "loss": 2.6918, + "step": 39186 + }, + { + "epoch": 2.4326153082128004, + "grad_norm": 0.1442574730400928, + "learning_rate": 1.050611010876042e-05, + "loss": 2.6675, + "step": 39187 + }, + { + "epoch": 2.4326773853125583, + "grad_norm": 0.15734926612791658, + "learning_rate": 1.050389539304824e-05, + "loss": 2.6752, + "step": 39188 + }, + { + "epoch": 2.432739462412316, + "grad_norm": 0.14795997001435976, + "learning_rate": 1.0501680883396819e-05, + "loss": 2.6125, + "step": 39189 + }, + { + "epoch": 2.432801539512074, + "grad_norm": 0.14264356078320156, + "learning_rate": 1.0499466579817724e-05, + "loss": 2.6866, + "step": 39190 + }, + { + "epoch": 2.432863616611832, + "grad_norm": 0.13627402316925857, + "learning_rate": 1.0497252482322506e-05, + "loss": 2.6797, + "step": 39191 + }, + { + "epoch": 2.43292569371159, + "grad_norm": 0.1415302294249829, + "learning_rate": 1.0495038590922706e-05, + "loss": 2.7283, + "step": 39192 + }, + { + "epoch": 2.432987770811348, + "grad_norm": 0.14633826280253157, + "learning_rate": 1.0492824905629866e-05, + "loss": 2.7521, + "step": 39193 + }, + { + "epoch": 2.4330498479111053, + "grad_norm": 0.13937250633318285, + "learning_rate": 1.049061142645557e-05, + "loss": 2.6989, + "step": 39194 + }, + { + "epoch": 2.4331119250108637, + "grad_norm": 0.14021707683123533, + "learning_rate": 1.0488398153411338e-05, + "loss": 2.7658, + "step": 39195 + }, + { + "epoch": 2.433174002110621, + "grad_norm": 0.14018222189706703, + "learning_rate": 1.0486185086508732e-05, + "loss": 2.7131, + "step": 39196 + }, + { + "epoch": 2.433236079210379, + "grad_norm": 0.15608816110115517, + "learning_rate": 1.048397222575927e-05, + "loss": 2.6885, + "step": 39197 + }, + { + "epoch": 2.433298156310137, + "grad_norm": 0.13802440356789192, + "learning_rate": 1.0481759571174531e-05, + "loss": 2.7129, + "step": 39198 + }, + { + "epoch": 2.433360233409895, + "grad_norm": 0.15415790190353923, + "learning_rate": 1.0479547122766054e-05, + "loss": 2.7631, + "step": 39199 + }, + { + "epoch": 2.433422310509653, + "grad_norm": 0.1497497465305858, + "learning_rate": 1.0477334880545364e-05, + "loss": 2.7001, + "step": 39200 + }, + { + "epoch": 2.4334843876094108, + "grad_norm": 0.1441879512658269, + "learning_rate": 1.0475122844524021e-05, + "loss": 2.7512, + "step": 39201 + }, + { + "epoch": 2.4335464647091687, + "grad_norm": 0.15777468062928973, + "learning_rate": 1.0472911014713538e-05, + "loss": 2.6056, + "step": 39202 + }, + { + "epoch": 2.4336085418089266, + "grad_norm": 0.1353304401997309, + "learning_rate": 1.0470699391125488e-05, + "loss": 2.6635, + "step": 39203 + }, + { + "epoch": 2.4336706189086845, + "grad_norm": 0.14432183765924417, + "learning_rate": 1.0468487973771396e-05, + "loss": 2.6817, + "step": 39204 + }, + { + "epoch": 2.4337326960084424, + "grad_norm": 0.16024384777947362, + "learning_rate": 1.0466276762662802e-05, + "loss": 2.7665, + "step": 39205 + }, + { + "epoch": 2.4337947731082004, + "grad_norm": 0.13746633496843788, + "learning_rate": 1.046406575781122e-05, + "loss": 2.7511, + "step": 39206 + }, + { + "epoch": 2.4338568502079583, + "grad_norm": 0.1386820896846006, + "learning_rate": 1.0461854959228207e-05, + "loss": 2.6104, + "step": 39207 + }, + { + "epoch": 2.433918927307716, + "grad_norm": 0.13913874545104976, + "learning_rate": 1.0459644366925309e-05, + "loss": 2.665, + "step": 39208 + }, + { + "epoch": 2.433981004407474, + "grad_norm": 0.1368394455172841, + "learning_rate": 1.0457433980914044e-05, + "loss": 2.6262, + "step": 39209 + }, + { + "epoch": 2.434043081507232, + "grad_norm": 0.14029499247399255, + "learning_rate": 1.0455223801205948e-05, + "loss": 2.698, + "step": 39210 + }, + { + "epoch": 2.43410515860699, + "grad_norm": 0.13642596502161994, + "learning_rate": 1.045301382781253e-05, + "loss": 2.784, + "step": 39211 + }, + { + "epoch": 2.434167235706748, + "grad_norm": 0.13851282911926832, + "learning_rate": 1.0450804060745362e-05, + "loss": 2.744, + "step": 39212 + }, + { + "epoch": 2.434229312806506, + "grad_norm": 0.1440222644473612, + "learning_rate": 1.044859450001594e-05, + "loss": 2.6584, + "step": 39213 + }, + { + "epoch": 2.4342913899062637, + "grad_norm": 0.16252902876640385, + "learning_rate": 1.0446385145635807e-05, + "loss": 2.675, + "step": 39214 + }, + { + "epoch": 2.4343534670060216, + "grad_norm": 0.13876155854529398, + "learning_rate": 1.0444175997616485e-05, + "loss": 2.611, + "step": 39215 + }, + { + "epoch": 2.4344155441057795, + "grad_norm": 0.14612875687294985, + "learning_rate": 1.0441967055969487e-05, + "loss": 2.7117, + "step": 39216 + }, + { + "epoch": 2.4344776212055375, + "grad_norm": 0.14008910669574798, + "learning_rate": 1.043975832070636e-05, + "loss": 2.7072, + "step": 39217 + }, + { + "epoch": 2.4345396983052954, + "grad_norm": 0.13479553802886668, + "learning_rate": 1.0437549791838619e-05, + "loss": 2.7702, + "step": 39218 + }, + { + "epoch": 2.434601775405053, + "grad_norm": 0.1385603375569403, + "learning_rate": 1.0435341469377785e-05, + "loss": 2.701, + "step": 39219 + }, + { + "epoch": 2.434663852504811, + "grad_norm": 0.13781805181889978, + "learning_rate": 1.0433133353335368e-05, + "loss": 2.6751, + "step": 39220 + }, + { + "epoch": 2.4347259296045687, + "grad_norm": 0.13213625011349148, + "learning_rate": 1.0430925443722906e-05, + "loss": 2.6525, + "step": 39221 + }, + { + "epoch": 2.4347880067043266, + "grad_norm": 0.1409552722391426, + "learning_rate": 1.042871774055192e-05, + "loss": 2.7411, + "step": 39222 + }, + { + "epoch": 2.4348500838040845, + "grad_norm": 0.15886267589750253, + "learning_rate": 1.0426510243833914e-05, + "loss": 2.6682, + "step": 39223 + }, + { + "epoch": 2.4349121609038424, + "grad_norm": 0.14342716937169092, + "learning_rate": 1.0424302953580412e-05, + "loss": 2.6139, + "step": 39224 + }, + { + "epoch": 2.4349742380036004, + "grad_norm": 0.13880737159226397, + "learning_rate": 1.0422095869802917e-05, + "loss": 2.6975, + "step": 39225 + }, + { + "epoch": 2.4350363151033583, + "grad_norm": 0.1542361236436812, + "learning_rate": 1.0419888992512966e-05, + "loss": 2.7423, + "step": 39226 + }, + { + "epoch": 2.435098392203116, + "grad_norm": 0.1370601342836808, + "learning_rate": 1.0417682321722066e-05, + "loss": 2.6562, + "step": 39227 + }, + { + "epoch": 2.435160469302874, + "grad_norm": 0.1370405904920561, + "learning_rate": 1.0415475857441726e-05, + "loss": 2.6969, + "step": 39228 + }, + { + "epoch": 2.435222546402632, + "grad_norm": 0.13928663295291807, + "learning_rate": 1.0413269599683446e-05, + "loss": 2.7699, + "step": 39229 + }, + { + "epoch": 2.43528462350239, + "grad_norm": 0.13918256077355395, + "learning_rate": 1.0411063548458754e-05, + "loss": 2.7163, + "step": 39230 + }, + { + "epoch": 2.435346700602148, + "grad_norm": 0.13762883685200683, + "learning_rate": 1.0408857703779163e-05, + "loss": 2.7284, + "step": 39231 + }, + { + "epoch": 2.435408777701906, + "grad_norm": 0.13805487875628072, + "learning_rate": 1.040665206565617e-05, + "loss": 2.6171, + "step": 39232 + }, + { + "epoch": 2.4354708548016637, + "grad_norm": 0.1450948615889124, + "learning_rate": 1.0404446634101267e-05, + "loss": 2.5779, + "step": 39233 + }, + { + "epoch": 2.4355329319014216, + "grad_norm": 0.1405094917415486, + "learning_rate": 1.0402241409125996e-05, + "loss": 2.7578, + "step": 39234 + }, + { + "epoch": 2.4355950090011795, + "grad_norm": 0.13799062134310158, + "learning_rate": 1.0400036390741847e-05, + "loss": 2.6635, + "step": 39235 + }, + { + "epoch": 2.4356570861009375, + "grad_norm": 0.13848408219940175, + "learning_rate": 1.0397831578960315e-05, + "loss": 2.7518, + "step": 39236 + }, + { + "epoch": 2.4357191632006954, + "grad_norm": 0.13862158306621422, + "learning_rate": 1.039562697379291e-05, + "loss": 2.658, + "step": 39237 + }, + { + "epoch": 2.4357812403004533, + "grad_norm": 0.13671614780674407, + "learning_rate": 1.0393422575251121e-05, + "loss": 2.7296, + "step": 39238 + }, + { + "epoch": 2.435843317400211, + "grad_norm": 0.15085571382407306, + "learning_rate": 1.0391218383346462e-05, + "loss": 2.732, + "step": 39239 + }, + { + "epoch": 2.435905394499969, + "grad_norm": 0.13999200453236155, + "learning_rate": 1.0389014398090447e-05, + "loss": 2.6792, + "step": 39240 + }, + { + "epoch": 2.435967471599727, + "grad_norm": 0.13294837816896504, + "learning_rate": 1.0386810619494559e-05, + "loss": 2.6838, + "step": 39241 + }, + { + "epoch": 2.4360295486994845, + "grad_norm": 0.13176292941044554, + "learning_rate": 1.0384607047570293e-05, + "loss": 2.7783, + "step": 39242 + }, + { + "epoch": 2.436091625799243, + "grad_norm": 0.15029791662091405, + "learning_rate": 1.038240368232914e-05, + "loss": 2.7532, + "step": 39243 + }, + { + "epoch": 2.4361537028990004, + "grad_norm": 0.14329314248037808, + "learning_rate": 1.0380200523782619e-05, + "loss": 2.71, + "step": 39244 + }, + { + "epoch": 2.4362157799987583, + "grad_norm": 0.14053868585322218, + "learning_rate": 1.0377997571942206e-05, + "loss": 2.7354, + "step": 39245 + }, + { + "epoch": 2.436277857098516, + "grad_norm": 0.14219645796843078, + "learning_rate": 1.0375794826819402e-05, + "loss": 2.6124, + "step": 39246 + }, + { + "epoch": 2.436339934198274, + "grad_norm": 0.14043658270109757, + "learning_rate": 1.0373592288425676e-05, + "loss": 2.6767, + "step": 39247 + }, + { + "epoch": 2.436402011298032, + "grad_norm": 0.14884718399964025, + "learning_rate": 1.037138995677256e-05, + "loss": 2.667, + "step": 39248 + }, + { + "epoch": 2.43646408839779, + "grad_norm": 0.157463316041834, + "learning_rate": 1.0369187831871518e-05, + "loss": 2.7554, + "step": 39249 + }, + { + "epoch": 2.436526165497548, + "grad_norm": 0.14114336003415745, + "learning_rate": 1.036698591373404e-05, + "loss": 2.6454, + "step": 39250 + }, + { + "epoch": 2.4365882425973058, + "grad_norm": 0.13431160923106392, + "learning_rate": 1.0364784202371625e-05, + "loss": 2.6732, + "step": 39251 + }, + { + "epoch": 2.4366503196970637, + "grad_norm": 0.1789018309767908, + "learning_rate": 1.0362582697795736e-05, + "loss": 2.7333, + "step": 39252 + }, + { + "epoch": 2.4367123967968216, + "grad_norm": 0.14359140250992503, + "learning_rate": 1.0360381400017887e-05, + "loss": 2.6218, + "step": 39253 + }, + { + "epoch": 2.4367744738965795, + "grad_norm": 0.14031181295987083, + "learning_rate": 1.0358180309049553e-05, + "loss": 2.7674, + "step": 39254 + }, + { + "epoch": 2.4368365509963374, + "grad_norm": 0.1408808619020895, + "learning_rate": 1.0355979424902218e-05, + "loss": 2.7625, + "step": 39255 + }, + { + "epoch": 2.4368986280960954, + "grad_norm": 0.14575321829548651, + "learning_rate": 1.0353778747587344e-05, + "loss": 2.6642, + "step": 39256 + }, + { + "epoch": 2.4369607051958533, + "grad_norm": 0.14406321108553605, + "learning_rate": 1.0351578277116447e-05, + "loss": 2.6741, + "step": 39257 + }, + { + "epoch": 2.437022782295611, + "grad_norm": 0.1421332538794772, + "learning_rate": 1.0349378013500993e-05, + "loss": 2.7136, + "step": 39258 + }, + { + "epoch": 2.437084859395369, + "grad_norm": 0.15082902299753354, + "learning_rate": 1.0347177956752457e-05, + "loss": 2.6764, + "step": 39259 + }, + { + "epoch": 2.437146936495127, + "grad_norm": 0.148456528347485, + "learning_rate": 1.0344978106882309e-05, + "loss": 2.5658, + "step": 39260 + }, + { + "epoch": 2.437209013594885, + "grad_norm": 0.1641698043593182, + "learning_rate": 1.0342778463902048e-05, + "loss": 2.7354, + "step": 39261 + }, + { + "epoch": 2.437271090694643, + "grad_norm": 0.14934346769018444, + "learning_rate": 1.0340579027823133e-05, + "loss": 2.7362, + "step": 39262 + }, + { + "epoch": 2.437333167794401, + "grad_norm": 0.14498243274820494, + "learning_rate": 1.0338379798657056e-05, + "loss": 2.7414, + "step": 39263 + }, + { + "epoch": 2.4373952448941587, + "grad_norm": 0.1514726087103473, + "learning_rate": 1.0336180776415271e-05, + "loss": 2.6986, + "step": 39264 + }, + { + "epoch": 2.4374573219939166, + "grad_norm": 0.14235986700005412, + "learning_rate": 1.0333981961109252e-05, + "loss": 2.7309, + "step": 39265 + }, + { + "epoch": 2.4375193990936745, + "grad_norm": 0.13603457514412154, + "learning_rate": 1.0331783352750484e-05, + "loss": 2.7008, + "step": 39266 + }, + { + "epoch": 2.437581476193432, + "grad_norm": 0.1412120526218309, + "learning_rate": 1.032958495135044e-05, + "loss": 2.73, + "step": 39267 + }, + { + "epoch": 2.4376435532931904, + "grad_norm": 0.1358002188703967, + "learning_rate": 1.0327386756920582e-05, + "loss": 2.7357, + "step": 39268 + }, + { + "epoch": 2.437705630392948, + "grad_norm": 0.13926431873418957, + "learning_rate": 1.032518876947236e-05, + "loss": 2.7133, + "step": 39269 + }, + { + "epoch": 2.4377677074927058, + "grad_norm": 0.14601065011323433, + "learning_rate": 1.0322990989017272e-05, + "loss": 2.6915, + "step": 39270 + }, + { + "epoch": 2.4378297845924637, + "grad_norm": 0.140739334833487, + "learning_rate": 1.0320793415566777e-05, + "loss": 2.7432, + "step": 39271 + }, + { + "epoch": 2.4378918616922216, + "grad_norm": 0.13705127332712386, + "learning_rate": 1.0318596049132318e-05, + "loss": 2.7362, + "step": 39272 + }, + { + "epoch": 2.4379539387919795, + "grad_norm": 0.1367104255321815, + "learning_rate": 1.0316398889725393e-05, + "loss": 2.7356, + "step": 39273 + }, + { + "epoch": 2.4380160158917374, + "grad_norm": 0.1401901342690393, + "learning_rate": 1.0314201937357449e-05, + "loss": 2.7983, + "step": 39274 + }, + { + "epoch": 2.4380780929914954, + "grad_norm": 0.14230400018266867, + "learning_rate": 1.031200519203993e-05, + "loss": 2.7127, + "step": 39275 + }, + { + "epoch": 2.4381401700912533, + "grad_norm": 0.1676842975619902, + "learning_rate": 1.0309808653784331e-05, + "loss": 2.706, + "step": 39276 + }, + { + "epoch": 2.438202247191011, + "grad_norm": 0.15970052451377229, + "learning_rate": 1.0307612322602095e-05, + "loss": 2.7834, + "step": 39277 + }, + { + "epoch": 2.438264324290769, + "grad_norm": 0.1374080786901077, + "learning_rate": 1.030541619850468e-05, + "loss": 2.7138, + "step": 39278 + }, + { + "epoch": 2.438326401390527, + "grad_norm": 0.1388955461542401, + "learning_rate": 1.0303220281503529e-05, + "loss": 2.7334, + "step": 39279 + }, + { + "epoch": 2.438388478490285, + "grad_norm": 0.1348264252991744, + "learning_rate": 1.030102457161013e-05, + "loss": 2.6385, + "step": 39280 + }, + { + "epoch": 2.438450555590043, + "grad_norm": 0.1415399351514738, + "learning_rate": 1.0298829068835924e-05, + "loss": 2.6641, + "step": 39281 + }, + { + "epoch": 2.438512632689801, + "grad_norm": 0.1395718424489264, + "learning_rate": 1.0296633773192367e-05, + "loss": 2.7252, + "step": 39282 + }, + { + "epoch": 2.4385747097895587, + "grad_norm": 0.14486405357884263, + "learning_rate": 1.0294438684690894e-05, + "loss": 2.6388, + "step": 39283 + }, + { + "epoch": 2.4386367868893166, + "grad_norm": 0.13948703321571437, + "learning_rate": 1.0292243803342988e-05, + "loss": 2.739, + "step": 39284 + }, + { + "epoch": 2.4386988639890745, + "grad_norm": 0.14713108357375976, + "learning_rate": 1.0290049129160084e-05, + "loss": 2.7109, + "step": 39285 + }, + { + "epoch": 2.4387609410888325, + "grad_norm": 0.15751244043612922, + "learning_rate": 1.0287854662153628e-05, + "loss": 2.7214, + "step": 39286 + }, + { + "epoch": 2.4388230181885904, + "grad_norm": 0.13767621739870825, + "learning_rate": 1.0285660402335084e-05, + "loss": 2.7644, + "step": 39287 + }, + { + "epoch": 2.4388850952883483, + "grad_norm": 0.14357687331054755, + "learning_rate": 1.0283466349715876e-05, + "loss": 2.714, + "step": 39288 + }, + { + "epoch": 2.438947172388106, + "grad_norm": 0.1429124335597229, + "learning_rate": 1.0281272504307477e-05, + "loss": 2.7284, + "step": 39289 + }, + { + "epoch": 2.4390092494878637, + "grad_norm": 0.15712392311895546, + "learning_rate": 1.0279078866121317e-05, + "loss": 2.6758, + "step": 39290 + }, + { + "epoch": 2.439071326587622, + "grad_norm": 0.14744275972606935, + "learning_rate": 1.0276885435168849e-05, + "loss": 2.6194, + "step": 39291 + }, + { + "epoch": 2.4391334036873795, + "grad_norm": 0.13982061962019993, + "learning_rate": 1.02746922114615e-05, + "loss": 2.7027, + "step": 39292 + }, + { + "epoch": 2.4391954807871374, + "grad_norm": 0.16402321299687225, + "learning_rate": 1.0272499195010738e-05, + "loss": 2.7889, + "step": 39293 + }, + { + "epoch": 2.4392575578868954, + "grad_norm": 0.14461296580971048, + "learning_rate": 1.0270306385827994e-05, + "loss": 2.7465, + "step": 39294 + }, + { + "epoch": 2.4393196349866533, + "grad_norm": 0.13912873164285217, + "learning_rate": 1.0268113783924705e-05, + "loss": 2.6754, + "step": 39295 + }, + { + "epoch": 2.439381712086411, + "grad_norm": 0.13559158807979693, + "learning_rate": 1.02659213893123e-05, + "loss": 2.7226, + "step": 39296 + }, + { + "epoch": 2.439443789186169, + "grad_norm": 0.137918149940384, + "learning_rate": 1.0263729202002237e-05, + "loss": 2.7515, + "step": 39297 + }, + { + "epoch": 2.439505866285927, + "grad_norm": 0.13784937660384533, + "learning_rate": 1.0261537222005946e-05, + "loss": 2.7687, + "step": 39298 + }, + { + "epoch": 2.439567943385685, + "grad_norm": 0.15360870694032747, + "learning_rate": 1.0259345449334868e-05, + "loss": 2.7677, + "step": 39299 + }, + { + "epoch": 2.439630020485443, + "grad_norm": 0.1472558939119328, + "learning_rate": 1.0257153884000425e-05, + "loss": 2.7313, + "step": 39300 + }, + { + "epoch": 2.439692097585201, + "grad_norm": 0.1358383030533232, + "learning_rate": 1.025496252601405e-05, + "loss": 2.7244, + "step": 39301 + }, + { + "epoch": 2.4397541746849587, + "grad_norm": 0.17256685290084625, + "learning_rate": 1.0252771375387193e-05, + "loss": 2.7185, + "step": 39302 + }, + { + "epoch": 2.4398162517847166, + "grad_norm": 0.150798609579235, + "learning_rate": 1.0250580432131285e-05, + "loss": 2.7124, + "step": 39303 + }, + { + "epoch": 2.4398783288844745, + "grad_norm": 0.1345578956135177, + "learning_rate": 1.0248389696257727e-05, + "loss": 2.6711, + "step": 39304 + }, + { + "epoch": 2.4399404059842325, + "grad_norm": 0.14293570944461453, + "learning_rate": 1.0246199167777987e-05, + "loss": 2.7849, + "step": 39305 + }, + { + "epoch": 2.4400024830839904, + "grad_norm": 0.15339131138867454, + "learning_rate": 1.0244008846703462e-05, + "loss": 2.7301, + "step": 39306 + }, + { + "epoch": 2.4400645601837483, + "grad_norm": 0.15456354660986357, + "learning_rate": 1.0241818733045604e-05, + "loss": 2.7404, + "step": 39307 + }, + { + "epoch": 2.440126637283506, + "grad_norm": 0.15881446175412617, + "learning_rate": 1.0239628826815834e-05, + "loss": 2.6541, + "step": 39308 + }, + { + "epoch": 2.440188714383264, + "grad_norm": 0.13220886889676936, + "learning_rate": 1.0237439128025572e-05, + "loss": 2.7, + "step": 39309 + }, + { + "epoch": 2.440250791483022, + "grad_norm": 0.14587229963106088, + "learning_rate": 1.0235249636686229e-05, + "loss": 2.6723, + "step": 39310 + }, + { + "epoch": 2.44031286858278, + "grad_norm": 0.14887356077078268, + "learning_rate": 1.0233060352809254e-05, + "loss": 2.7874, + "step": 39311 + }, + { + "epoch": 2.440374945682538, + "grad_norm": 0.1466965156481864, + "learning_rate": 1.0230871276406056e-05, + "loss": 2.7187, + "step": 39312 + }, + { + "epoch": 2.440437022782296, + "grad_norm": 0.13461346951598788, + "learning_rate": 1.022868240748806e-05, + "loss": 2.6665, + "step": 39313 + }, + { + "epoch": 2.4404990998820537, + "grad_norm": 0.14280184086226153, + "learning_rate": 1.022649374606668e-05, + "loss": 2.7132, + "step": 39314 + }, + { + "epoch": 2.440561176981811, + "grad_norm": 0.1339348465153333, + "learning_rate": 1.0224305292153324e-05, + "loss": 2.6465, + "step": 39315 + }, + { + "epoch": 2.4406232540815695, + "grad_norm": 0.13692188158985347, + "learning_rate": 1.0222117045759439e-05, + "loss": 2.6733, + "step": 39316 + }, + { + "epoch": 2.440685331181327, + "grad_norm": 0.13486916255073766, + "learning_rate": 1.0219929006896423e-05, + "loss": 2.6733, + "step": 39317 + }, + { + "epoch": 2.440747408281085, + "grad_norm": 0.1423243066235948, + "learning_rate": 1.0217741175575696e-05, + "loss": 2.7369, + "step": 39318 + }, + { + "epoch": 2.440809485380843, + "grad_norm": 0.13526576985134087, + "learning_rate": 1.021555355180866e-05, + "loss": 2.6984, + "step": 39319 + }, + { + "epoch": 2.4408715624806008, + "grad_norm": 0.14144653515733718, + "learning_rate": 1.0213366135606751e-05, + "loss": 2.6989, + "step": 39320 + }, + { + "epoch": 2.4409336395803587, + "grad_norm": 0.13594435312820344, + "learning_rate": 1.0211178926981363e-05, + "loss": 2.7066, + "step": 39321 + }, + { + "epoch": 2.4409957166801166, + "grad_norm": 0.14414494670808978, + "learning_rate": 1.0208991925943923e-05, + "loss": 2.6933, + "step": 39322 + }, + { + "epoch": 2.4410577937798745, + "grad_norm": 0.13402600434084763, + "learning_rate": 1.0206805132505825e-05, + "loss": 2.717, + "step": 39323 + }, + { + "epoch": 2.4411198708796324, + "grad_norm": 0.16037720464757985, + "learning_rate": 1.0204618546678474e-05, + "loss": 2.7345, + "step": 39324 + }, + { + "epoch": 2.4411819479793904, + "grad_norm": 0.13955324436413877, + "learning_rate": 1.0202432168473303e-05, + "loss": 2.6498, + "step": 39325 + }, + { + "epoch": 2.4412440250791483, + "grad_norm": 0.13779809511031443, + "learning_rate": 1.0200245997901703e-05, + "loss": 2.729, + "step": 39326 + }, + { + "epoch": 2.441306102178906, + "grad_norm": 0.1525458865525882, + "learning_rate": 1.0198060034975082e-05, + "loss": 2.7134, + "step": 39327 + }, + { + "epoch": 2.441368179278664, + "grad_norm": 0.14360268756999978, + "learning_rate": 1.0195874279704832e-05, + "loss": 2.6426, + "step": 39328 + }, + { + "epoch": 2.441430256378422, + "grad_norm": 0.13651612116210582, + "learning_rate": 1.019368873210238e-05, + "loss": 2.6839, + "step": 39329 + }, + { + "epoch": 2.44149233347818, + "grad_norm": 0.15178487430869017, + "learning_rate": 1.019150339217912e-05, + "loss": 2.7082, + "step": 39330 + }, + { + "epoch": 2.441554410577938, + "grad_norm": 0.13629081373983826, + "learning_rate": 1.0189318259946446e-05, + "loss": 2.6841, + "step": 39331 + }, + { + "epoch": 2.441616487677696, + "grad_norm": 0.1351914244288075, + "learning_rate": 1.018713333541575e-05, + "loss": 2.7557, + "step": 39332 + }, + { + "epoch": 2.4416785647774537, + "grad_norm": 0.13844323449926826, + "learning_rate": 1.018494861859846e-05, + "loss": 2.765, + "step": 39333 + }, + { + "epoch": 2.4417406418772116, + "grad_norm": 0.13746667371371227, + "learning_rate": 1.018276410950596e-05, + "loss": 2.7811, + "step": 39334 + }, + { + "epoch": 2.4418027189769695, + "grad_norm": 0.141317378353035, + "learning_rate": 1.0180579808149644e-05, + "loss": 2.8112, + "step": 39335 + }, + { + "epoch": 2.4418647960767275, + "grad_norm": 0.1375744201014, + "learning_rate": 1.0178395714540911e-05, + "loss": 2.7122, + "step": 39336 + }, + { + "epoch": 2.4419268731764854, + "grad_norm": 0.14777605857611947, + "learning_rate": 1.017621182869114e-05, + "loss": 2.6726, + "step": 39337 + }, + { + "epoch": 2.441988950276243, + "grad_norm": 0.13664762826882443, + "learning_rate": 1.0174028150611736e-05, + "loss": 2.8143, + "step": 39338 + }, + { + "epoch": 2.442051027376001, + "grad_norm": 0.13270164519553573, + "learning_rate": 1.0171844680314114e-05, + "loss": 2.7687, + "step": 39339 + }, + { + "epoch": 2.4421131044757587, + "grad_norm": 0.13482976161278687, + "learning_rate": 1.0169661417809645e-05, + "loss": 2.6413, + "step": 39340 + }, + { + "epoch": 2.4421751815755166, + "grad_norm": 0.14611423139769306, + "learning_rate": 1.0167478363109723e-05, + "loss": 2.8, + "step": 39341 + }, + { + "epoch": 2.4422372586752745, + "grad_norm": 0.1387033829690387, + "learning_rate": 1.0165295516225721e-05, + "loss": 2.6682, + "step": 39342 + }, + { + "epoch": 2.4422993357750324, + "grad_norm": 0.13719112670597114, + "learning_rate": 1.016311287716905e-05, + "loss": 2.789, + "step": 39343 + }, + { + "epoch": 2.4423614128747904, + "grad_norm": 0.14065370001914904, + "learning_rate": 1.01609304459511e-05, + "loss": 2.7658, + "step": 39344 + }, + { + "epoch": 2.4424234899745483, + "grad_norm": 0.14543113352230608, + "learning_rate": 1.0158748222583243e-05, + "loss": 2.671, + "step": 39345 + }, + { + "epoch": 2.442485567074306, + "grad_norm": 0.14535263286669076, + "learning_rate": 1.015656620707685e-05, + "loss": 2.6852, + "step": 39346 + }, + { + "epoch": 2.442547644174064, + "grad_norm": 0.1441899083560174, + "learning_rate": 1.0154384399443345e-05, + "loss": 2.7954, + "step": 39347 + }, + { + "epoch": 2.442609721273822, + "grad_norm": 0.13782493766573597, + "learning_rate": 1.0152202799694088e-05, + "loss": 2.6859, + "step": 39348 + }, + { + "epoch": 2.44267179837358, + "grad_norm": 0.15160564590101314, + "learning_rate": 1.015002140784046e-05, + "loss": 2.7418, + "step": 39349 + }, + { + "epoch": 2.442733875473338, + "grad_norm": 0.1599668685689175, + "learning_rate": 1.0147840223893845e-05, + "loss": 2.7101, + "step": 39350 + }, + { + "epoch": 2.442795952573096, + "grad_norm": 0.1322127995730989, + "learning_rate": 1.0145659247865608e-05, + "loss": 2.7322, + "step": 39351 + }, + { + "epoch": 2.4428580296728537, + "grad_norm": 0.34191327973874014, + "learning_rate": 1.0143478479767154e-05, + "loss": 2.7237, + "step": 39352 + }, + { + "epoch": 2.4429201067726116, + "grad_norm": 0.14250605865197313, + "learning_rate": 1.014129791960985e-05, + "loss": 2.7708, + "step": 39353 + }, + { + "epoch": 2.4429821838723695, + "grad_norm": 0.13481166302019254, + "learning_rate": 1.0139117567405071e-05, + "loss": 2.642, + "step": 39354 + }, + { + "epoch": 2.4430442609721275, + "grad_norm": 0.1421172453162119, + "learning_rate": 1.0136937423164178e-05, + "loss": 2.6677, + "step": 39355 + }, + { + "epoch": 2.4431063380718854, + "grad_norm": 0.14018756329538837, + "learning_rate": 1.0134757486898572e-05, + "loss": 2.6785, + "step": 39356 + }, + { + "epoch": 2.4431684151716433, + "grad_norm": 0.14534436717340127, + "learning_rate": 1.0132577758619615e-05, + "loss": 2.6179, + "step": 39357 + }, + { + "epoch": 2.443230492271401, + "grad_norm": 0.13864704366447528, + "learning_rate": 1.0130398238338678e-05, + "loss": 2.6289, + "step": 39358 + }, + { + "epoch": 2.443292569371159, + "grad_norm": 0.13793811628655991, + "learning_rate": 1.0128218926067135e-05, + "loss": 2.6737, + "step": 39359 + }, + { + "epoch": 2.443354646470917, + "grad_norm": 0.1438366851455695, + "learning_rate": 1.0126039821816335e-05, + "loss": 2.8076, + "step": 39360 + }, + { + "epoch": 2.443416723570675, + "grad_norm": 0.13999350803088143, + "learning_rate": 1.0123860925597678e-05, + "loss": 2.6611, + "step": 39361 + }, + { + "epoch": 2.443478800670433, + "grad_norm": 0.1695473478734463, + "learning_rate": 1.012168223742252e-05, + "loss": 2.7087, + "step": 39362 + }, + { + "epoch": 2.4435408777701904, + "grad_norm": 0.1693647558116107, + "learning_rate": 1.011950375730223e-05, + "loss": 2.6183, + "step": 39363 + }, + { + "epoch": 2.4436029548699487, + "grad_norm": 0.14788404609286177, + "learning_rate": 1.0117325485248152e-05, + "loss": 2.746, + "step": 39364 + }, + { + "epoch": 2.443665031969706, + "grad_norm": 0.13963036320169403, + "learning_rate": 1.0115147421271687e-05, + "loss": 2.81, + "step": 39365 + }, + { + "epoch": 2.443727109069464, + "grad_norm": 0.1491197604736201, + "learning_rate": 1.0112969565384179e-05, + "loss": 2.7353, + "step": 39366 + }, + { + "epoch": 2.443789186169222, + "grad_norm": 0.1577456830937105, + "learning_rate": 1.011079191759699e-05, + "loss": 2.7273, + "step": 39367 + }, + { + "epoch": 2.44385126326898, + "grad_norm": 0.13744850223855476, + "learning_rate": 1.010861447792147e-05, + "loss": 2.7124, + "step": 39368 + }, + { + "epoch": 2.443913340368738, + "grad_norm": 0.16611548393713282, + "learning_rate": 1.0106437246369004e-05, + "loss": 2.6616, + "step": 39369 + }, + { + "epoch": 2.443975417468496, + "grad_norm": 0.13637603311470803, + "learning_rate": 1.0104260222950928e-05, + "loss": 2.718, + "step": 39370 + }, + { + "epoch": 2.4440374945682537, + "grad_norm": 0.15016489303753736, + "learning_rate": 1.0102083407678626e-05, + "loss": 2.7494, + "step": 39371 + }, + { + "epoch": 2.4440995716680116, + "grad_norm": 0.1355388751678694, + "learning_rate": 1.0099906800563435e-05, + "loss": 2.6553, + "step": 39372 + }, + { + "epoch": 2.4441616487677695, + "grad_norm": 0.14560493325864798, + "learning_rate": 1.0097730401616723e-05, + "loss": 2.664, + "step": 39373 + }, + { + "epoch": 2.4442237258675275, + "grad_norm": 0.15007969060634252, + "learning_rate": 1.0095554210849822e-05, + "loss": 2.7573, + "step": 39374 + }, + { + "epoch": 2.4442858029672854, + "grad_norm": 0.15079008654657838, + "learning_rate": 1.0093378228274114e-05, + "loss": 2.6568, + "step": 39375 + }, + { + "epoch": 2.4443478800670433, + "grad_norm": 0.1593439623585715, + "learning_rate": 1.0091202453900944e-05, + "loss": 2.7404, + "step": 39376 + }, + { + "epoch": 2.444409957166801, + "grad_norm": 0.13205897220777715, + "learning_rate": 1.0089026887741655e-05, + "loss": 2.6566, + "step": 39377 + }, + { + "epoch": 2.444472034266559, + "grad_norm": 0.14080946135808564, + "learning_rate": 1.008685152980759e-05, + "loss": 2.8274, + "step": 39378 + }, + { + "epoch": 2.444534111366317, + "grad_norm": 0.141450057699425, + "learning_rate": 1.0084676380110119e-05, + "loss": 2.6834, + "step": 39379 + }, + { + "epoch": 2.444596188466075, + "grad_norm": 0.14411150707986775, + "learning_rate": 1.0082501438660586e-05, + "loss": 2.7886, + "step": 39380 + }, + { + "epoch": 2.444658265565833, + "grad_norm": 0.13658436573967742, + "learning_rate": 1.0080326705470333e-05, + "loss": 2.7088, + "step": 39381 + }, + { + "epoch": 2.444720342665591, + "grad_norm": 0.15333192016008224, + "learning_rate": 1.0078152180550687e-05, + "loss": 2.6804, + "step": 39382 + }, + { + "epoch": 2.4447824197653487, + "grad_norm": 0.14460367244067543, + "learning_rate": 1.0075977863913033e-05, + "loss": 2.7143, + "step": 39383 + }, + { + "epoch": 2.4448444968651066, + "grad_norm": 0.13896912965578992, + "learning_rate": 1.007380375556869e-05, + "loss": 2.6643, + "step": 39384 + }, + { + "epoch": 2.4449065739648645, + "grad_norm": 0.14269162585500492, + "learning_rate": 1.0071629855529007e-05, + "loss": 2.6594, + "step": 39385 + }, + { + "epoch": 2.444968651064622, + "grad_norm": 0.14231904541974363, + "learning_rate": 1.0069456163805324e-05, + "loss": 2.7224, + "step": 39386 + }, + { + "epoch": 2.4450307281643804, + "grad_norm": 0.1455185457405702, + "learning_rate": 1.0067282680408968e-05, + "loss": 2.6405, + "step": 39387 + }, + { + "epoch": 2.445092805264138, + "grad_norm": 0.14208188640952177, + "learning_rate": 1.006510940535131e-05, + "loss": 2.7565, + "step": 39388 + }, + { + "epoch": 2.4451548823638958, + "grad_norm": 0.16361771759579744, + "learning_rate": 1.0062936338643664e-05, + "loss": 2.7246, + "step": 39389 + }, + { + "epoch": 2.4452169594636537, + "grad_norm": 0.15373632155995787, + "learning_rate": 1.0060763480297375e-05, + "loss": 2.6956, + "step": 39390 + }, + { + "epoch": 2.4452790365634116, + "grad_norm": 0.1500611970162533, + "learning_rate": 1.0058590830323767e-05, + "loss": 2.7723, + "step": 39391 + }, + { + "epoch": 2.4453411136631695, + "grad_norm": 0.15354602139089052, + "learning_rate": 1.00564183887342e-05, + "loss": 2.6952, + "step": 39392 + }, + { + "epoch": 2.4454031907629274, + "grad_norm": 0.14823796011183216, + "learning_rate": 1.0054246155539998e-05, + "loss": 2.7692, + "step": 39393 + }, + { + "epoch": 2.4454652678626854, + "grad_norm": 0.15387316362678138, + "learning_rate": 1.0052074130752487e-05, + "loss": 2.7198, + "step": 39394 + }, + { + "epoch": 2.4455273449624433, + "grad_norm": 0.13923597445472022, + "learning_rate": 1.0049902314383003e-05, + "loss": 2.7566, + "step": 39395 + }, + { + "epoch": 2.445589422062201, + "grad_norm": 0.13752816247800648, + "learning_rate": 1.0047730706442865e-05, + "loss": 2.6662, + "step": 39396 + }, + { + "epoch": 2.445651499161959, + "grad_norm": 0.1529210413392362, + "learning_rate": 1.0045559306943426e-05, + "loss": 2.753, + "step": 39397 + }, + { + "epoch": 2.445713576261717, + "grad_norm": 0.15241085921876124, + "learning_rate": 1.0043388115896008e-05, + "loss": 2.6781, + "step": 39398 + }, + { + "epoch": 2.445775653361475, + "grad_norm": 0.13828823589200526, + "learning_rate": 1.0041217133311925e-05, + "loss": 2.7357, + "step": 39399 + }, + { + "epoch": 2.445837730461233, + "grad_norm": 0.1439496853602887, + "learning_rate": 1.0039046359202508e-05, + "loss": 2.7051, + "step": 39400 + }, + { + "epoch": 2.445899807560991, + "grad_norm": 0.15300173980410908, + "learning_rate": 1.0036875793579099e-05, + "loss": 2.577, + "step": 39401 + }, + { + "epoch": 2.4459618846607487, + "grad_norm": 0.1401259654102877, + "learning_rate": 1.0034705436453007e-05, + "loss": 2.6971, + "step": 39402 + }, + { + "epoch": 2.4460239617605066, + "grad_norm": 0.23783453513430805, + "learning_rate": 1.0032535287835548e-05, + "loss": 2.721, + "step": 39403 + }, + { + "epoch": 2.4460860388602645, + "grad_norm": 0.17982546029771124, + "learning_rate": 1.0030365347738062e-05, + "loss": 2.6965, + "step": 39404 + }, + { + "epoch": 2.4461481159600225, + "grad_norm": 0.13709942409346076, + "learning_rate": 1.0028195616171859e-05, + "loss": 2.6733, + "step": 39405 + }, + { + "epoch": 2.4462101930597804, + "grad_norm": 0.13764370296032133, + "learning_rate": 1.0026026093148272e-05, + "loss": 2.6537, + "step": 39406 + }, + { + "epoch": 2.4462722701595383, + "grad_norm": 0.15518354416881797, + "learning_rate": 1.0023856778678608e-05, + "loss": 2.6733, + "step": 39407 + }, + { + "epoch": 2.446334347259296, + "grad_norm": 0.15860510019737242, + "learning_rate": 1.0021687672774194e-05, + "loss": 2.6619, + "step": 39408 + }, + { + "epoch": 2.446396424359054, + "grad_norm": 0.1420909055689828, + "learning_rate": 1.0019518775446334e-05, + "loss": 2.6763, + "step": 39409 + }, + { + "epoch": 2.446458501458812, + "grad_norm": 0.15655486049211367, + "learning_rate": 1.001735008670634e-05, + "loss": 2.6839, + "step": 39410 + }, + { + "epoch": 2.4465205785585695, + "grad_norm": 0.14476841078299651, + "learning_rate": 1.0015181606565555e-05, + "loss": 2.7351, + "step": 39411 + }, + { + "epoch": 2.446582655658328, + "grad_norm": 0.13871818091715563, + "learning_rate": 1.0013013335035266e-05, + "loss": 2.6306, + "step": 39412 + }, + { + "epoch": 2.4466447327580854, + "grad_norm": 0.16694695713465413, + "learning_rate": 1.0010845272126796e-05, + "loss": 2.753, + "step": 39413 + }, + { + "epoch": 2.4467068098578433, + "grad_norm": 0.156113356207614, + "learning_rate": 1.0008677417851443e-05, + "loss": 2.712, + "step": 39414 + }, + { + "epoch": 2.446768886957601, + "grad_norm": 0.14266265035855485, + "learning_rate": 1.000650977222054e-05, + "loss": 2.7907, + "step": 39415 + }, + { + "epoch": 2.446830964057359, + "grad_norm": 0.1438326059251576, + "learning_rate": 1.0004342335245381e-05, + "loss": 2.7475, + "step": 39416 + }, + { + "epoch": 2.446893041157117, + "grad_norm": 0.14654733576337634, + "learning_rate": 1.0002175106937283e-05, + "loss": 2.6914, + "step": 39417 + }, + { + "epoch": 2.446955118256875, + "grad_norm": 0.1412200062162607, + "learning_rate": 1.0000008087307532e-05, + "loss": 2.7199, + "step": 39418 + }, + { + "epoch": 2.447017195356633, + "grad_norm": 0.1512837579980653, + "learning_rate": 9.997841276367464e-06, + "loss": 2.7011, + "step": 39419 + }, + { + "epoch": 2.447079272456391, + "grad_norm": 0.13669918465263028, + "learning_rate": 9.995674674128363e-06, + "loss": 2.6862, + "step": 39420 + }, + { + "epoch": 2.4471413495561487, + "grad_norm": 0.16466739948089695, + "learning_rate": 9.993508280601543e-06, + "loss": 2.7484, + "step": 39421 + }, + { + "epoch": 2.4472034266559066, + "grad_norm": 0.16778622735048204, + "learning_rate": 9.9913420957983e-06, + "loss": 2.686, + "step": 39422 + }, + { + "epoch": 2.4472655037556645, + "grad_norm": 0.13893774228068448, + "learning_rate": 9.989176119729932e-06, + "loss": 2.6266, + "step": 39423 + }, + { + "epoch": 2.4473275808554225, + "grad_norm": 0.13425686766819467, + "learning_rate": 9.98701035240775e-06, + "loss": 2.5915, + "step": 39424 + }, + { + "epoch": 2.4473896579551804, + "grad_norm": 0.14648765560291294, + "learning_rate": 9.984844793843051e-06, + "loss": 2.6935, + "step": 39425 + }, + { + "epoch": 2.4474517350549383, + "grad_norm": 0.1421574959170992, + "learning_rate": 9.982679444047132e-06, + "loss": 2.7061, + "step": 39426 + }, + { + "epoch": 2.447513812154696, + "grad_norm": 0.13920340284319163, + "learning_rate": 9.980514303031274e-06, + "loss": 2.6376, + "step": 39427 + }, + { + "epoch": 2.447575889254454, + "grad_norm": 0.13306586557309535, + "learning_rate": 9.9783493708068e-06, + "loss": 2.6423, + "step": 39428 + }, + { + "epoch": 2.447637966354212, + "grad_norm": 0.14153930347698157, + "learning_rate": 9.976184647384995e-06, + "loss": 2.6763, + "step": 39429 + }, + { + "epoch": 2.44770004345397, + "grad_norm": 0.13888174616474092, + "learning_rate": 9.97402013277715e-06, + "loss": 2.7445, + "step": 39430 + }, + { + "epoch": 2.447762120553728, + "grad_norm": 0.13974327255163024, + "learning_rate": 9.971855826994547e-06, + "loss": 2.6692, + "step": 39431 + }, + { + "epoch": 2.447824197653486, + "grad_norm": 0.13538928779122192, + "learning_rate": 9.969691730048503e-06, + "loss": 2.7185, + "step": 39432 + }, + { + "epoch": 2.4478862747532437, + "grad_norm": 0.14229063548454002, + "learning_rate": 9.967527841950292e-06, + "loss": 2.707, + "step": 39433 + }, + { + "epoch": 2.447948351853001, + "grad_norm": 0.13861876128728268, + "learning_rate": 9.965364162711205e-06, + "loss": 2.6966, + "step": 39434 + }, + { + "epoch": 2.4480104289527596, + "grad_norm": 0.13735743929631944, + "learning_rate": 9.963200692342523e-06, + "loss": 2.6705, + "step": 39435 + }, + { + "epoch": 2.448072506052517, + "grad_norm": 0.13690467181903726, + "learning_rate": 9.961037430855553e-06, + "loss": 2.742, + "step": 39436 + }, + { + "epoch": 2.448134583152275, + "grad_norm": 0.14344843351127765, + "learning_rate": 9.958874378261557e-06, + "loss": 2.7361, + "step": 39437 + }, + { + "epoch": 2.448196660252033, + "grad_norm": 0.13977743739645296, + "learning_rate": 9.95671153457185e-06, + "loss": 2.7162, + "step": 39438 + }, + { + "epoch": 2.448258737351791, + "grad_norm": 0.13683619223398624, + "learning_rate": 9.9545488997977e-06, + "loss": 2.5935, + "step": 39439 + }, + { + "epoch": 2.4483208144515487, + "grad_norm": 0.13551644552601055, + "learning_rate": 9.952386473950387e-06, + "loss": 2.7868, + "step": 39440 + }, + { + "epoch": 2.4483828915513066, + "grad_norm": 0.13492501549525418, + "learning_rate": 9.95022425704118e-06, + "loss": 2.6643, + "step": 39441 + }, + { + "epoch": 2.4484449686510645, + "grad_norm": 0.14159441305067158, + "learning_rate": 9.948062249081392e-06, + "loss": 2.674, + "step": 39442 + }, + { + "epoch": 2.4485070457508225, + "grad_norm": 0.15613134170156326, + "learning_rate": 9.945900450082286e-06, + "loss": 2.6965, + "step": 39443 + }, + { + "epoch": 2.4485691228505804, + "grad_norm": 0.1424130110773261, + "learning_rate": 9.94373886005514e-06, + "loss": 2.7517, + "step": 39444 + }, + { + "epoch": 2.4486311999503383, + "grad_norm": 0.1408469382628352, + "learning_rate": 9.941577479011233e-06, + "loss": 2.6926, + "step": 39445 + }, + { + "epoch": 2.448693277050096, + "grad_norm": 0.13206575636859094, + "learning_rate": 9.93941630696183e-06, + "loss": 2.6251, + "step": 39446 + }, + { + "epoch": 2.448755354149854, + "grad_norm": 0.16363850369440291, + "learning_rate": 9.937255343918229e-06, + "loss": 2.6629, + "step": 39447 + }, + { + "epoch": 2.448817431249612, + "grad_norm": 0.1448526867637254, + "learning_rate": 9.93509458989169e-06, + "loss": 2.6467, + "step": 39448 + }, + { + "epoch": 2.44887950834937, + "grad_norm": 0.14674601394992745, + "learning_rate": 9.932934044893489e-06, + "loss": 2.6381, + "step": 39449 + }, + { + "epoch": 2.448941585449128, + "grad_norm": 0.13893705855619903, + "learning_rate": 9.930773708934887e-06, + "loss": 2.7131, + "step": 39450 + }, + { + "epoch": 2.449003662548886, + "grad_norm": 0.1500586151794315, + "learning_rate": 9.92861358202718e-06, + "loss": 2.7072, + "step": 39451 + }, + { + "epoch": 2.4490657396486437, + "grad_norm": 0.1378128753817024, + "learning_rate": 9.926453664181618e-06, + "loss": 2.668, + "step": 39452 + }, + { + "epoch": 2.4491278167484016, + "grad_norm": 0.1400147814695995, + "learning_rate": 9.92429395540948e-06, + "loss": 2.7313, + "step": 39453 + }, + { + "epoch": 2.4491898938481595, + "grad_norm": 0.13724082992055114, + "learning_rate": 9.922134455722016e-06, + "loss": 2.7027, + "step": 39454 + }, + { + "epoch": 2.4492519709479175, + "grad_norm": 0.14231742252159446, + "learning_rate": 9.919975165130513e-06, + "loss": 2.7285, + "step": 39455 + }, + { + "epoch": 2.4493140480476754, + "grad_norm": 0.13791785282040786, + "learning_rate": 9.917816083646237e-06, + "loss": 2.8091, + "step": 39456 + }, + { + "epoch": 2.4493761251474333, + "grad_norm": 0.14583290829483037, + "learning_rate": 9.915657211280439e-06, + "loss": 2.6804, + "step": 39457 + }, + { + "epoch": 2.449438202247191, + "grad_norm": 0.13582323248387393, + "learning_rate": 9.913498548044392e-06, + "loss": 2.6521, + "step": 39458 + }, + { + "epoch": 2.4495002793469487, + "grad_norm": 0.14038339363796037, + "learning_rate": 9.91134009394934e-06, + "loss": 2.7673, + "step": 39459 + }, + { + "epoch": 2.449562356446707, + "grad_norm": 0.14486980736751998, + "learning_rate": 9.909181849006571e-06, + "loss": 2.7351, + "step": 39460 + }, + { + "epoch": 2.4496244335464645, + "grad_norm": 0.1419690592109794, + "learning_rate": 9.907023813227334e-06, + "loss": 2.7412, + "step": 39461 + }, + { + "epoch": 2.4496865106462224, + "grad_norm": 0.150957383421809, + "learning_rate": 9.904865986622886e-06, + "loss": 2.7093, + "step": 39462 + }, + { + "epoch": 2.4497485877459804, + "grad_norm": 0.1342529135934862, + "learning_rate": 9.902708369204471e-06, + "loss": 2.6777, + "step": 39463 + }, + { + "epoch": 2.4498106648457383, + "grad_norm": 0.13635277812120591, + "learning_rate": 9.90055096098338e-06, + "loss": 2.7216, + "step": 39464 + }, + { + "epoch": 2.449872741945496, + "grad_norm": 0.15285035410480644, + "learning_rate": 9.898393761970842e-06, + "loss": 2.7744, + "step": 39465 + }, + { + "epoch": 2.449934819045254, + "grad_norm": 0.13936321768973212, + "learning_rate": 9.896236772178119e-06, + "loss": 2.7327, + "step": 39466 + }, + { + "epoch": 2.449996896145012, + "grad_norm": 0.13565043909597285, + "learning_rate": 9.894079991616457e-06, + "loss": 2.7093, + "step": 39467 + }, + { + "epoch": 2.45005897324477, + "grad_norm": 0.13432402059592743, + "learning_rate": 9.891923420297112e-06, + "loss": 2.7356, + "step": 39468 + }, + { + "epoch": 2.450121050344528, + "grad_norm": 0.13969509305734606, + "learning_rate": 9.889767058231348e-06, + "loss": 2.6832, + "step": 39469 + }, + { + "epoch": 2.450183127444286, + "grad_norm": 0.1442424104650915, + "learning_rate": 9.887610905430411e-06, + "loss": 2.7988, + "step": 39470 + }, + { + "epoch": 2.4502452045440437, + "grad_norm": 0.13795879532228678, + "learning_rate": 9.885454961905544e-06, + "loss": 2.7138, + "step": 39471 + }, + { + "epoch": 2.4503072816438016, + "grad_norm": 0.13876770630400917, + "learning_rate": 9.883299227667997e-06, + "loss": 2.746, + "step": 39472 + }, + { + "epoch": 2.4503693587435595, + "grad_norm": 0.13681622164139964, + "learning_rate": 9.881143702729006e-06, + "loss": 2.6742, + "step": 39473 + }, + { + "epoch": 2.4504314358433175, + "grad_norm": 0.13257322074168076, + "learning_rate": 9.87898838709984e-06, + "loss": 2.677, + "step": 39474 + }, + { + "epoch": 2.4504935129430754, + "grad_norm": 0.14058271408095416, + "learning_rate": 9.87683328079173e-06, + "loss": 2.7298, + "step": 39475 + }, + { + "epoch": 2.4505555900428333, + "grad_norm": 0.1423326560951402, + "learning_rate": 9.874678383815921e-06, + "loss": 2.7662, + "step": 39476 + }, + { + "epoch": 2.450617667142591, + "grad_norm": 0.14476760019888987, + "learning_rate": 9.872523696183639e-06, + "loss": 2.7502, + "step": 39477 + }, + { + "epoch": 2.450679744242349, + "grad_norm": 0.14627956059031824, + "learning_rate": 9.870369217906161e-06, + "loss": 2.7551, + "step": 39478 + }, + { + "epoch": 2.450741821342107, + "grad_norm": 0.1371648526778136, + "learning_rate": 9.86821494899471e-06, + "loss": 2.6581, + "step": 39479 + }, + { + "epoch": 2.450803898441865, + "grad_norm": 0.14476376594589088, + "learning_rate": 9.866060889460515e-06, + "loss": 2.7833, + "step": 39480 + }, + { + "epoch": 2.450865975541623, + "grad_norm": 0.1529886095678283, + "learning_rate": 9.863907039314819e-06, + "loss": 2.7893, + "step": 39481 + }, + { + "epoch": 2.4509280526413804, + "grad_norm": 0.17170230360647828, + "learning_rate": 9.861753398568874e-06, + "loss": 2.7938, + "step": 39482 + }, + { + "epoch": 2.4509901297411387, + "grad_norm": 0.14014034731749483, + "learning_rate": 9.8595999672339e-06, + "loss": 2.6622, + "step": 39483 + }, + { + "epoch": 2.451052206840896, + "grad_norm": 0.14186109895916, + "learning_rate": 9.857446745321142e-06, + "loss": 2.7137, + "step": 39484 + }, + { + "epoch": 2.451114283940654, + "grad_norm": 0.14345402983868416, + "learning_rate": 9.85529373284183e-06, + "loss": 2.7611, + "step": 39485 + }, + { + "epoch": 2.451176361040412, + "grad_norm": 0.1400517195936506, + "learning_rate": 9.85314092980718e-06, + "loss": 2.7142, + "step": 39486 + }, + { + "epoch": 2.45123843814017, + "grad_norm": 0.1360777337124173, + "learning_rate": 9.850988336228456e-06, + "loss": 2.7412, + "step": 39487 + }, + { + "epoch": 2.451300515239928, + "grad_norm": 0.1478179563378209, + "learning_rate": 9.84883595211687e-06, + "loss": 2.7829, + "step": 39488 + }, + { + "epoch": 2.451362592339686, + "grad_norm": 0.142735662042153, + "learning_rate": 9.846683777483657e-06, + "loss": 2.7708, + "step": 39489 + }, + { + "epoch": 2.4514246694394437, + "grad_norm": 0.17268148772880423, + "learning_rate": 9.844531812340025e-06, + "loss": 2.701, + "step": 39490 + }, + { + "epoch": 2.4514867465392016, + "grad_norm": 0.14238371724494062, + "learning_rate": 9.842380056697237e-06, + "loss": 2.7448, + "step": 39491 + }, + { + "epoch": 2.4515488236389595, + "grad_norm": 0.15745306581596585, + "learning_rate": 9.840228510566496e-06, + "loss": 2.7188, + "step": 39492 + }, + { + "epoch": 2.4516109007387175, + "grad_norm": 0.13989743795158713, + "learning_rate": 9.838077173959031e-06, + "loss": 2.7066, + "step": 39493 + }, + { + "epoch": 2.4516729778384754, + "grad_norm": 0.1694794518582274, + "learning_rate": 9.835926046886074e-06, + "loss": 2.7896, + "step": 39494 + }, + { + "epoch": 2.4517350549382333, + "grad_norm": 0.14177162218773653, + "learning_rate": 9.833775129358825e-06, + "loss": 2.6904, + "step": 39495 + }, + { + "epoch": 2.451797132037991, + "grad_norm": 0.15190927588684874, + "learning_rate": 9.83162442138853e-06, + "loss": 2.6345, + "step": 39496 + }, + { + "epoch": 2.451859209137749, + "grad_norm": 0.15353773579058003, + "learning_rate": 9.829473922986404e-06, + "loss": 2.7399, + "step": 39497 + }, + { + "epoch": 2.451921286237507, + "grad_norm": 0.15072944176004435, + "learning_rate": 9.827323634163665e-06, + "loss": 2.6521, + "step": 39498 + }, + { + "epoch": 2.451983363337265, + "grad_norm": 0.1490453521493956, + "learning_rate": 9.825173554931521e-06, + "loss": 2.6521, + "step": 39499 + }, + { + "epoch": 2.452045440437023, + "grad_norm": 0.14369014266032637, + "learning_rate": 9.823023685301208e-06, + "loss": 2.7181, + "step": 39500 + }, + { + "epoch": 2.452107517536781, + "grad_norm": 0.16015360992860808, + "learning_rate": 9.820874025283922e-06, + "loss": 2.6362, + "step": 39501 + }, + { + "epoch": 2.4521695946365387, + "grad_norm": 0.1357106371103201, + "learning_rate": 9.818724574890903e-06, + "loss": 2.7263, + "step": 39502 + }, + { + "epoch": 2.4522316717362966, + "grad_norm": 0.14368232580138368, + "learning_rate": 9.816575334133354e-06, + "loss": 2.7022, + "step": 39503 + }, + { + "epoch": 2.4522937488360546, + "grad_norm": 0.1444177697562022, + "learning_rate": 9.814426303022472e-06, + "loss": 2.7449, + "step": 39504 + }, + { + "epoch": 2.4523558259358125, + "grad_norm": 0.14111418542275275, + "learning_rate": 9.8122774815695e-06, + "loss": 2.689, + "step": 39505 + }, + { + "epoch": 2.4524179030355704, + "grad_norm": 0.1347942598300797, + "learning_rate": 9.810128869785629e-06, + "loss": 2.6972, + "step": 39506 + }, + { + "epoch": 2.452479980135328, + "grad_norm": 0.13805063543719204, + "learning_rate": 9.807980467682071e-06, + "loss": 2.7148, + "step": 39507 + }, + { + "epoch": 2.4525420572350862, + "grad_norm": 0.1494525722666927, + "learning_rate": 9.805832275270032e-06, + "loss": 2.6629, + "step": 39508 + }, + { + "epoch": 2.4526041343348437, + "grad_norm": 0.15277674470236727, + "learning_rate": 9.803684292560716e-06, + "loss": 2.6526, + "step": 39509 + }, + { + "epoch": 2.4526662114346016, + "grad_norm": 0.16506687471694453, + "learning_rate": 9.801536519565352e-06, + "loss": 2.6839, + "step": 39510 + }, + { + "epoch": 2.4527282885343595, + "grad_norm": 0.1392393388840437, + "learning_rate": 9.799388956295124e-06, + "loss": 2.7434, + "step": 39511 + }, + { + "epoch": 2.4527903656341175, + "grad_norm": 0.13923480390855722, + "learning_rate": 9.797241602761248e-06, + "loss": 2.7155, + "step": 39512 + }, + { + "epoch": 2.4528524427338754, + "grad_norm": 0.13938769427039568, + "learning_rate": 9.795094458974907e-06, + "loss": 2.6646, + "step": 39513 + }, + { + "epoch": 2.4529145198336333, + "grad_norm": 0.14045430128195063, + "learning_rate": 9.792947524947332e-06, + "loss": 2.7501, + "step": 39514 + }, + { + "epoch": 2.452976596933391, + "grad_norm": 0.13936148017690134, + "learning_rate": 9.790800800689709e-06, + "loss": 2.656, + "step": 39515 + }, + { + "epoch": 2.453038674033149, + "grad_norm": 0.14517457297181693, + "learning_rate": 9.788654286213245e-06, + "loss": 2.7257, + "step": 39516 + }, + { + "epoch": 2.453100751132907, + "grad_norm": 0.142722735815185, + "learning_rate": 9.786507981529114e-06, + "loss": 2.7727, + "step": 39517 + }, + { + "epoch": 2.453162828232665, + "grad_norm": 0.14422743026218598, + "learning_rate": 9.784361886648547e-06, + "loss": 2.6348, + "step": 39518 + }, + { + "epoch": 2.453224905332423, + "grad_norm": 0.13983816316835584, + "learning_rate": 9.782216001582728e-06, + "loss": 2.7677, + "step": 39519 + }, + { + "epoch": 2.453286982432181, + "grad_norm": 0.14160583752723052, + "learning_rate": 9.78007032634285e-06, + "loss": 2.7004, + "step": 39520 + }, + { + "epoch": 2.4533490595319387, + "grad_norm": 0.13830456644886957, + "learning_rate": 9.777924860940113e-06, + "loss": 2.6593, + "step": 39521 + }, + { + "epoch": 2.4534111366316966, + "grad_norm": 0.14411651544088008, + "learning_rate": 9.775779605385687e-06, + "loss": 2.7234, + "step": 39522 + }, + { + "epoch": 2.4534732137314545, + "grad_norm": 0.14474448377511467, + "learning_rate": 9.773634559690803e-06, + "loss": 2.6172, + "step": 39523 + }, + { + "epoch": 2.4535352908312125, + "grad_norm": 0.15587553810502439, + "learning_rate": 9.771489723866628e-06, + "loss": 2.5917, + "step": 39524 + }, + { + "epoch": 2.4535973679309704, + "grad_norm": 0.14613975677672814, + "learning_rate": 9.769345097924359e-06, + "loss": 2.6554, + "step": 39525 + }, + { + "epoch": 2.4536594450307283, + "grad_norm": 0.14451845859938195, + "learning_rate": 9.767200681875167e-06, + "loss": 2.7399, + "step": 39526 + }, + { + "epoch": 2.453721522130486, + "grad_norm": 0.1387944418597167, + "learning_rate": 9.765056475730272e-06, + "loss": 2.7894, + "step": 39527 + }, + { + "epoch": 2.453783599230244, + "grad_norm": 0.13737547367029906, + "learning_rate": 9.76291247950084e-06, + "loss": 2.6455, + "step": 39528 + }, + { + "epoch": 2.453845676330002, + "grad_norm": 0.13917780981689604, + "learning_rate": 9.76076869319807e-06, + "loss": 2.7413, + "step": 39529 + }, + { + "epoch": 2.4539077534297595, + "grad_norm": 0.13656419464910463, + "learning_rate": 9.758625116833132e-06, + "loss": 2.6281, + "step": 39530 + }, + { + "epoch": 2.453969830529518, + "grad_norm": 0.1396757245914338, + "learning_rate": 9.756481750417207e-06, + "loss": 2.6699, + "step": 39531 + }, + { + "epoch": 2.4540319076292754, + "grad_norm": 0.14358715416845114, + "learning_rate": 9.754338593961493e-06, + "loss": 2.6872, + "step": 39532 + }, + { + "epoch": 2.4540939847290333, + "grad_norm": 0.14988944529356749, + "learning_rate": 9.75219564747717e-06, + "loss": 2.7502, + "step": 39533 + }, + { + "epoch": 2.454156061828791, + "grad_norm": 0.1363762857903619, + "learning_rate": 9.750052910975399e-06, + "loss": 2.7076, + "step": 39534 + }, + { + "epoch": 2.454218138928549, + "grad_norm": 0.1375843778095864, + "learning_rate": 9.747910384467385e-06, + "loss": 2.7583, + "step": 39535 + }, + { + "epoch": 2.454280216028307, + "grad_norm": 0.14934676655693993, + "learning_rate": 9.745768067964279e-06, + "loss": 2.7398, + "step": 39536 + }, + { + "epoch": 2.454342293128065, + "grad_norm": 0.1531141863624729, + "learning_rate": 9.743625961477288e-06, + "loss": 2.6239, + "step": 39537 + }, + { + "epoch": 2.454404370227823, + "grad_norm": 0.13753415545068803, + "learning_rate": 9.741484065017569e-06, + "loss": 2.6718, + "step": 39538 + }, + { + "epoch": 2.454466447327581, + "grad_norm": 0.15369378906722017, + "learning_rate": 9.739342378596306e-06, + "loss": 2.6598, + "step": 39539 + }, + { + "epoch": 2.4545285244273387, + "grad_norm": 0.16329570231748, + "learning_rate": 9.737200902224652e-06, + "loss": 2.6645, + "step": 39540 + }, + { + "epoch": 2.4545906015270966, + "grad_norm": 0.13845110499177674, + "learning_rate": 9.735059635913813e-06, + "loss": 2.8034, + "step": 39541 + }, + { + "epoch": 2.4546526786268545, + "grad_norm": 0.13860912649489088, + "learning_rate": 9.732918579674938e-06, + "loss": 2.7218, + "step": 39542 + }, + { + "epoch": 2.4547147557266125, + "grad_norm": 0.15137697618889268, + "learning_rate": 9.730777733519203e-06, + "loss": 2.7782, + "step": 39543 + }, + { + "epoch": 2.4547768328263704, + "grad_norm": 0.15709115944747376, + "learning_rate": 9.728637097457771e-06, + "loss": 2.6379, + "step": 39544 + }, + { + "epoch": 2.4548389099261283, + "grad_norm": 0.1340913698482569, + "learning_rate": 9.726496671501811e-06, + "loss": 2.6886, + "step": 39545 + }, + { + "epoch": 2.454900987025886, + "grad_norm": 0.16234327011478053, + "learning_rate": 9.724356455662503e-06, + "loss": 2.8011, + "step": 39546 + }, + { + "epoch": 2.454963064125644, + "grad_norm": 0.1340611260239234, + "learning_rate": 9.722216449951005e-06, + "loss": 2.7121, + "step": 39547 + }, + { + "epoch": 2.455025141225402, + "grad_norm": 0.13879615292097738, + "learning_rate": 9.72007665437848e-06, + "loss": 2.7258, + "step": 39548 + }, + { + "epoch": 2.45508721832516, + "grad_norm": 0.15830227017511378, + "learning_rate": 9.717937068956085e-06, + "loss": 2.6596, + "step": 39549 + }, + { + "epoch": 2.455149295424918, + "grad_norm": 0.149674543823476, + "learning_rate": 9.715797693695001e-06, + "loss": 2.6064, + "step": 39550 + }, + { + "epoch": 2.455211372524676, + "grad_norm": 0.1442125259990591, + "learning_rate": 9.71365852860638e-06, + "loss": 2.7257, + "step": 39551 + }, + { + "epoch": 2.4552734496244337, + "grad_norm": 0.1404444674803698, + "learning_rate": 9.711519573701384e-06, + "loss": 2.734, + "step": 39552 + }, + { + "epoch": 2.455335526724191, + "grad_norm": 0.1349996111122464, + "learning_rate": 9.709380828991155e-06, + "loss": 2.6488, + "step": 39553 + }, + { + "epoch": 2.4553976038239496, + "grad_norm": 0.14071015587692812, + "learning_rate": 9.707242294486878e-06, + "loss": 2.7255, + "step": 39554 + }, + { + "epoch": 2.455459680923707, + "grad_norm": 0.14560722309081633, + "learning_rate": 9.705103970199703e-06, + "loss": 2.6595, + "step": 39555 + }, + { + "epoch": 2.455521758023465, + "grad_norm": 0.1382822823351241, + "learning_rate": 9.702965856140784e-06, + "loss": 2.6479, + "step": 39556 + }, + { + "epoch": 2.455583835123223, + "grad_norm": 0.14972799021306166, + "learning_rate": 9.700827952321273e-06, + "loss": 2.6564, + "step": 39557 + }, + { + "epoch": 2.455645912222981, + "grad_norm": 0.14987801220772437, + "learning_rate": 9.698690258752313e-06, + "loss": 2.7029, + "step": 39558 + }, + { + "epoch": 2.4557079893227387, + "grad_norm": 0.13530039074567152, + "learning_rate": 9.696552775445084e-06, + "loss": 2.731, + "step": 39559 + }, + { + "epoch": 2.4557700664224966, + "grad_norm": 0.1456586836468551, + "learning_rate": 9.694415502410719e-06, + "loss": 2.5824, + "step": 39560 + }, + { + "epoch": 2.4558321435222545, + "grad_norm": 0.14619795453957055, + "learning_rate": 9.692278439660375e-06, + "loss": 2.7105, + "step": 39561 + }, + { + "epoch": 2.4558942206220125, + "grad_norm": 0.14619939421679579, + "learning_rate": 9.690141587205187e-06, + "loss": 2.7738, + "step": 39562 + }, + { + "epoch": 2.4559562977217704, + "grad_norm": 0.14525495230526467, + "learning_rate": 9.688004945056328e-06, + "loss": 2.7195, + "step": 39563 + }, + { + "epoch": 2.4560183748215283, + "grad_norm": 0.14075863634346802, + "learning_rate": 9.685868513224932e-06, + "loss": 2.7149, + "step": 39564 + }, + { + "epoch": 2.456080451921286, + "grad_norm": 0.13325640879583517, + "learning_rate": 9.683732291722148e-06, + "loss": 2.687, + "step": 39565 + }, + { + "epoch": 2.456142529021044, + "grad_norm": 0.1391670760726427, + "learning_rate": 9.681596280559108e-06, + "loss": 2.736, + "step": 39566 + }, + { + "epoch": 2.456204606120802, + "grad_norm": 0.15410616714308842, + "learning_rate": 9.679460479746966e-06, + "loss": 2.7437, + "step": 39567 + }, + { + "epoch": 2.45626668322056, + "grad_norm": 0.14621689216907532, + "learning_rate": 9.67732488929688e-06, + "loss": 2.6659, + "step": 39568 + }, + { + "epoch": 2.456328760320318, + "grad_norm": 0.1391092173723178, + "learning_rate": 9.675189509219979e-06, + "loss": 2.7272, + "step": 39569 + }, + { + "epoch": 2.456390837420076, + "grad_norm": 0.13864346334044883, + "learning_rate": 9.673054339527404e-06, + "loss": 2.7116, + "step": 39570 + }, + { + "epoch": 2.4564529145198337, + "grad_norm": 0.15313398373778053, + "learning_rate": 9.670919380230293e-06, + "loss": 2.7053, + "step": 39571 + }, + { + "epoch": 2.4565149916195916, + "grad_norm": 0.14657327497481565, + "learning_rate": 9.668784631339772e-06, + "loss": 2.7242, + "step": 39572 + }, + { + "epoch": 2.4565770687193496, + "grad_norm": 0.13465001496269946, + "learning_rate": 9.666650092867008e-06, + "loss": 2.6869, + "step": 39573 + }, + { + "epoch": 2.4566391458191075, + "grad_norm": 0.13519672190260282, + "learning_rate": 9.664515764823117e-06, + "loss": 2.6452, + "step": 39574 + }, + { + "epoch": 2.4567012229188654, + "grad_norm": 0.15547534722559153, + "learning_rate": 9.662381647219243e-06, + "loss": 2.7032, + "step": 39575 + }, + { + "epoch": 2.4567633000186233, + "grad_norm": 0.13194871952704743, + "learning_rate": 9.660247740066508e-06, + "loss": 2.6228, + "step": 39576 + }, + { + "epoch": 2.4568253771183812, + "grad_norm": 0.13886900443217698, + "learning_rate": 9.658114043376059e-06, + "loss": 2.6615, + "step": 39577 + }, + { + "epoch": 2.4568874542181387, + "grad_norm": 0.13979563924410168, + "learning_rate": 9.655980557159022e-06, + "loss": 2.7922, + "step": 39578 + }, + { + "epoch": 2.456949531317897, + "grad_norm": 0.14506818937627436, + "learning_rate": 9.653847281426532e-06, + "loss": 2.7252, + "step": 39579 + }, + { + "epoch": 2.4570116084176545, + "grad_norm": 0.13757641034679735, + "learning_rate": 9.651714216189716e-06, + "loss": 2.8093, + "step": 39580 + }, + { + "epoch": 2.4570736855174125, + "grad_norm": 0.13881901872701707, + "learning_rate": 9.649581361459687e-06, + "loss": 2.674, + "step": 39581 + }, + { + "epoch": 2.4571357626171704, + "grad_norm": 0.13529992955913317, + "learning_rate": 9.647448717247598e-06, + "loss": 2.6508, + "step": 39582 + }, + { + "epoch": 2.4571978397169283, + "grad_norm": 0.1369693027022166, + "learning_rate": 9.64531628356457e-06, + "loss": 2.7842, + "step": 39583 + }, + { + "epoch": 2.457259916816686, + "grad_norm": 0.13616522043396678, + "learning_rate": 9.643184060421717e-06, + "loss": 2.6579, + "step": 39584 + }, + { + "epoch": 2.457321993916444, + "grad_norm": 0.13641237262648923, + "learning_rate": 9.641052047830162e-06, + "loss": 2.7439, + "step": 39585 + }, + { + "epoch": 2.457384071016202, + "grad_norm": 0.13871924168945812, + "learning_rate": 9.638920245801048e-06, + "loss": 2.6697, + "step": 39586 + }, + { + "epoch": 2.45744614811596, + "grad_norm": 0.13775101339941884, + "learning_rate": 9.636788654345486e-06, + "loss": 2.7255, + "step": 39587 + }, + { + "epoch": 2.457508225215718, + "grad_norm": 0.13393123757456077, + "learning_rate": 9.634657273474596e-06, + "loss": 2.7047, + "step": 39588 + }, + { + "epoch": 2.457570302315476, + "grad_norm": 0.15355992879100652, + "learning_rate": 9.63252610319948e-06, + "loss": 2.6715, + "step": 39589 + }, + { + "epoch": 2.4576323794152337, + "grad_norm": 0.14553808371038496, + "learning_rate": 9.63039514353129e-06, + "loss": 2.6868, + "step": 39590 + }, + { + "epoch": 2.4576944565149916, + "grad_norm": 0.15501653663806297, + "learning_rate": 9.628264394481135e-06, + "loss": 2.6503, + "step": 39591 + }, + { + "epoch": 2.4577565336147496, + "grad_norm": 0.13039049439969388, + "learning_rate": 9.626133856060116e-06, + "loss": 2.7006, + "step": 39592 + }, + { + "epoch": 2.4578186107145075, + "grad_norm": 0.13418054733111928, + "learning_rate": 9.624003528279363e-06, + "loss": 2.6497, + "step": 39593 + }, + { + "epoch": 2.4578806878142654, + "grad_norm": 0.1563883169222834, + "learning_rate": 9.621873411149973e-06, + "loss": 2.7467, + "step": 39594 + }, + { + "epoch": 2.4579427649140233, + "grad_norm": 0.13490662592962965, + "learning_rate": 9.619743504683082e-06, + "loss": 2.7257, + "step": 39595 + }, + { + "epoch": 2.458004842013781, + "grad_norm": 0.13699069905295055, + "learning_rate": 9.617613808889791e-06, + "loss": 2.628, + "step": 39596 + }, + { + "epoch": 2.458066919113539, + "grad_norm": 0.14282786040474071, + "learning_rate": 9.615484323781215e-06, + "loss": 2.6796, + "step": 39597 + }, + { + "epoch": 2.458128996213297, + "grad_norm": 0.13390834391636008, + "learning_rate": 9.613355049368445e-06, + "loss": 2.7461, + "step": 39598 + }, + { + "epoch": 2.458191073313055, + "grad_norm": 0.1358285601692421, + "learning_rate": 9.611225985662609e-06, + "loss": 2.6877, + "step": 39599 + }, + { + "epoch": 2.458253150412813, + "grad_norm": 0.14494310156742257, + "learning_rate": 9.609097132674822e-06, + "loss": 2.6642, + "step": 39600 + }, + { + "epoch": 2.4583152275125704, + "grad_norm": 0.14900904344710192, + "learning_rate": 9.60696849041618e-06, + "loss": 2.6928, + "step": 39601 + }, + { + "epoch": 2.4583773046123287, + "grad_norm": 0.14093402842413902, + "learning_rate": 9.60484005889779e-06, + "loss": 2.778, + "step": 39602 + }, + { + "epoch": 2.458439381712086, + "grad_norm": 0.1397455851227723, + "learning_rate": 9.602711838130745e-06, + "loss": 2.7058, + "step": 39603 + }, + { + "epoch": 2.458501458811844, + "grad_norm": 0.15863249051893408, + "learning_rate": 9.60058382812617e-06, + "loss": 2.6657, + "step": 39604 + }, + { + "epoch": 2.458563535911602, + "grad_norm": 0.1361963988329578, + "learning_rate": 9.598456028895158e-06, + "loss": 2.711, + "step": 39605 + }, + { + "epoch": 2.45862561301136, + "grad_norm": 0.13573365905096887, + "learning_rate": 9.59632844044881e-06, + "loss": 2.6358, + "step": 39606 + }, + { + "epoch": 2.458687690111118, + "grad_norm": 0.15666950654015988, + "learning_rate": 9.59420106279822e-06, + "loss": 2.7096, + "step": 39607 + }, + { + "epoch": 2.458749767210876, + "grad_norm": 0.13497120615293612, + "learning_rate": 9.592073895954484e-06, + "loss": 2.6883, + "step": 39608 + }, + { + "epoch": 2.4588118443106337, + "grad_norm": 0.14344406406907892, + "learning_rate": 9.589946939928718e-06, + "loss": 2.6933, + "step": 39609 + }, + { + "epoch": 2.4588739214103916, + "grad_norm": 0.13901586581107797, + "learning_rate": 9.587820194732005e-06, + "loss": 2.6648, + "step": 39610 + }, + { + "epoch": 2.4589359985101495, + "grad_norm": 0.14249022269378, + "learning_rate": 9.58569366037545e-06, + "loss": 2.6286, + "step": 39611 + }, + { + "epoch": 2.4589980756099075, + "grad_norm": 0.13571410952194507, + "learning_rate": 9.58356733687013e-06, + "loss": 2.6866, + "step": 39612 + }, + { + "epoch": 2.4590601527096654, + "grad_norm": 0.14132354673031147, + "learning_rate": 9.581441224227161e-06, + "loss": 2.7537, + "step": 39613 + }, + { + "epoch": 2.4591222298094233, + "grad_norm": 0.14956287378486827, + "learning_rate": 9.57931532245762e-06, + "loss": 2.7998, + "step": 39614 + }, + { + "epoch": 2.459184306909181, + "grad_norm": 0.16041001707596073, + "learning_rate": 9.577189631572614e-06, + "loss": 2.7221, + "step": 39615 + }, + { + "epoch": 2.459246384008939, + "grad_norm": 0.13744861065119712, + "learning_rate": 9.575064151583213e-06, + "loss": 2.6984, + "step": 39616 + }, + { + "epoch": 2.459308461108697, + "grad_norm": 0.1354649546310458, + "learning_rate": 9.572938882500509e-06, + "loss": 2.6978, + "step": 39617 + }, + { + "epoch": 2.459370538208455, + "grad_norm": 0.14013350878504943, + "learning_rate": 9.570813824335606e-06, + "loss": 2.6954, + "step": 39618 + }, + { + "epoch": 2.459432615308213, + "grad_norm": 0.14537747837669943, + "learning_rate": 9.568688977099589e-06, + "loss": 2.6694, + "step": 39619 + }, + { + "epoch": 2.459494692407971, + "grad_norm": 0.16326676126472234, + "learning_rate": 9.56656434080353e-06, + "loss": 2.7465, + "step": 39620 + }, + { + "epoch": 2.4595567695077287, + "grad_norm": 0.13797957925429075, + "learning_rate": 9.564439915458506e-06, + "loss": 2.7208, + "step": 39621 + }, + { + "epoch": 2.4596188466074866, + "grad_norm": 0.13889398383358753, + "learning_rate": 9.56231570107563e-06, + "loss": 2.7044, + "step": 39622 + }, + { + "epoch": 2.4596809237072446, + "grad_norm": 0.14061411179669936, + "learning_rate": 9.56019169766597e-06, + "loss": 2.7283, + "step": 39623 + }, + { + "epoch": 2.4597430008070025, + "grad_norm": 0.1409813749507197, + "learning_rate": 9.558067905240608e-06, + "loss": 2.6503, + "step": 39624 + }, + { + "epoch": 2.4598050779067604, + "grad_norm": 0.1338427568154333, + "learning_rate": 9.555944323810612e-06, + "loss": 2.7234, + "step": 39625 + }, + { + "epoch": 2.459867155006518, + "grad_norm": 0.14332628569862196, + "learning_rate": 9.553820953387083e-06, + "loss": 2.6321, + "step": 39626 + }, + { + "epoch": 2.4599292321062762, + "grad_norm": 0.1373938378234202, + "learning_rate": 9.55169779398109e-06, + "loss": 2.7054, + "step": 39627 + }, + { + "epoch": 2.4599913092060337, + "grad_norm": 0.1479786632685383, + "learning_rate": 9.54957484560371e-06, + "loss": 2.6771, + "step": 39628 + }, + { + "epoch": 2.4600533863057916, + "grad_norm": 0.1363525322322451, + "learning_rate": 9.547452108266013e-06, + "loss": 2.7141, + "step": 39629 + }, + { + "epoch": 2.4601154634055495, + "grad_norm": 0.13810195802939582, + "learning_rate": 9.545329581979068e-06, + "loss": 2.6477, + "step": 39630 + }, + { + "epoch": 2.4601775405053075, + "grad_norm": 0.13544309721894982, + "learning_rate": 9.543207266753973e-06, + "loss": 2.8172, + "step": 39631 + }, + { + "epoch": 2.4602396176050654, + "grad_norm": 0.1574982020298194, + "learning_rate": 9.541085162601776e-06, + "loss": 2.727, + "step": 39632 + }, + { + "epoch": 2.4603016947048233, + "grad_norm": 0.13563667285585426, + "learning_rate": 9.538963269533568e-06, + "loss": 2.6583, + "step": 39633 + }, + { + "epoch": 2.460363771804581, + "grad_norm": 0.13818613461156767, + "learning_rate": 9.536841587560413e-06, + "loss": 2.7482, + "step": 39634 + }, + { + "epoch": 2.460425848904339, + "grad_norm": 0.135702381197879, + "learning_rate": 9.534720116693363e-06, + "loss": 2.7074, + "step": 39635 + }, + { + "epoch": 2.460487926004097, + "grad_norm": 0.13818337133814299, + "learning_rate": 9.53259885694352e-06, + "loss": 2.7569, + "step": 39636 + }, + { + "epoch": 2.460550003103855, + "grad_norm": 0.13605482475455646, + "learning_rate": 9.530477808321925e-06, + "loss": 2.6544, + "step": 39637 + }, + { + "epoch": 2.460612080203613, + "grad_norm": 0.13310014206099866, + "learning_rate": 9.528356970839658e-06, + "loss": 2.6652, + "step": 39638 + }, + { + "epoch": 2.460674157303371, + "grad_norm": 0.14914345389391387, + "learning_rate": 9.52623634450776e-06, + "loss": 2.6659, + "step": 39639 + }, + { + "epoch": 2.4607362344031287, + "grad_norm": 0.1404150879636286, + "learning_rate": 9.52411592933733e-06, + "loss": 2.6833, + "step": 39640 + }, + { + "epoch": 2.4607983115028866, + "grad_norm": 0.1383169427163356, + "learning_rate": 9.521995725339411e-06, + "loss": 2.7549, + "step": 39641 + }, + { + "epoch": 2.4608603886026446, + "grad_norm": 0.13445418070944462, + "learning_rate": 9.519875732525064e-06, + "loss": 2.7474, + "step": 39642 + }, + { + "epoch": 2.4609224657024025, + "grad_norm": 0.15225418832730633, + "learning_rate": 9.517755950905356e-06, + "loss": 2.7023, + "step": 39643 + }, + { + "epoch": 2.4609845428021604, + "grad_norm": 0.13802963279768207, + "learning_rate": 9.515636380491328e-06, + "loss": 2.711, + "step": 39644 + }, + { + "epoch": 2.4610466199019183, + "grad_norm": 0.13288156032202164, + "learning_rate": 9.513517021294066e-06, + "loss": 2.6422, + "step": 39645 + }, + { + "epoch": 2.4611086970016762, + "grad_norm": 0.1335199740746719, + "learning_rate": 9.511397873324613e-06, + "loss": 2.7814, + "step": 39646 + }, + { + "epoch": 2.461170774101434, + "grad_norm": 0.14995740444654926, + "learning_rate": 9.509278936594029e-06, + "loss": 2.6697, + "step": 39647 + }, + { + "epoch": 2.461232851201192, + "grad_norm": 0.13939000605939772, + "learning_rate": 9.507160211113353e-06, + "loss": 2.6991, + "step": 39648 + }, + { + "epoch": 2.4612949283009495, + "grad_norm": 0.14631947454656288, + "learning_rate": 9.505041696893662e-06, + "loss": 2.7272, + "step": 39649 + }, + { + "epoch": 2.461357005400708, + "grad_norm": 0.13849053144279855, + "learning_rate": 9.502923393946001e-06, + "loss": 2.7542, + "step": 39650 + }, + { + "epoch": 2.4614190825004654, + "grad_norm": 0.1479331029795337, + "learning_rate": 9.500805302281418e-06, + "loss": 2.7967, + "step": 39651 + }, + { + "epoch": 2.4614811596002233, + "grad_norm": 0.15154020818985325, + "learning_rate": 9.498687421910952e-06, + "loss": 2.7298, + "step": 39652 + }, + { + "epoch": 2.461543236699981, + "grad_norm": 0.15801974842082164, + "learning_rate": 9.49656975284568e-06, + "loss": 2.7153, + "step": 39653 + }, + { + "epoch": 2.461605313799739, + "grad_norm": 0.13877845751558424, + "learning_rate": 9.49445229509664e-06, + "loss": 2.7053, + "step": 39654 + }, + { + "epoch": 2.461667390899497, + "grad_norm": 0.14440910229717124, + "learning_rate": 9.492335048674872e-06, + "loss": 2.6823, + "step": 39655 + }, + { + "epoch": 2.461729467999255, + "grad_norm": 0.13847795547624075, + "learning_rate": 9.490218013591423e-06, + "loss": 2.717, + "step": 39656 + }, + { + "epoch": 2.461791545099013, + "grad_norm": 0.13563162602881876, + "learning_rate": 9.488101189857328e-06, + "loss": 2.7704, + "step": 39657 + }, + { + "epoch": 2.461853622198771, + "grad_norm": 0.18240267348267344, + "learning_rate": 9.485984577483665e-06, + "loss": 2.7699, + "step": 39658 + }, + { + "epoch": 2.4619156992985287, + "grad_norm": 0.15682803759981498, + "learning_rate": 9.483868176481447e-06, + "loss": 2.5874, + "step": 39659 + }, + { + "epoch": 2.4619777763982866, + "grad_norm": 0.13616941991487072, + "learning_rate": 9.481751986861726e-06, + "loss": 2.713, + "step": 39660 + }, + { + "epoch": 2.4620398534980446, + "grad_norm": 0.17503615989227903, + "learning_rate": 9.479636008635528e-06, + "loss": 2.792, + "step": 39661 + }, + { + "epoch": 2.4621019305978025, + "grad_norm": 0.14847251203110917, + "learning_rate": 9.47752024181392e-06, + "loss": 2.6296, + "step": 39662 + }, + { + "epoch": 2.4621640076975604, + "grad_norm": 0.13207428896586415, + "learning_rate": 9.47540468640793e-06, + "loss": 2.7059, + "step": 39663 + }, + { + "epoch": 2.4622260847973183, + "grad_norm": 0.140891815423352, + "learning_rate": 9.473289342428581e-06, + "loss": 2.6824, + "step": 39664 + }, + { + "epoch": 2.462288161897076, + "grad_norm": 0.13796257567927683, + "learning_rate": 9.471174209886913e-06, + "loss": 2.674, + "step": 39665 + }, + { + "epoch": 2.462350238996834, + "grad_norm": 0.13715927134471675, + "learning_rate": 9.469059288793986e-06, + "loss": 2.7184, + "step": 39666 + }, + { + "epoch": 2.462412316096592, + "grad_norm": 0.13795444778368154, + "learning_rate": 9.466944579160796e-06, + "loss": 2.6996, + "step": 39667 + }, + { + "epoch": 2.46247439319635, + "grad_norm": 0.1395967894860083, + "learning_rate": 9.46483008099841e-06, + "loss": 2.676, + "step": 39668 + }, + { + "epoch": 2.462536470296108, + "grad_norm": 0.1378814507801442, + "learning_rate": 9.462715794317844e-06, + "loss": 2.6202, + "step": 39669 + }, + { + "epoch": 2.462598547395866, + "grad_norm": 0.14434473142560728, + "learning_rate": 9.460601719130135e-06, + "loss": 2.6783, + "step": 39670 + }, + { + "epoch": 2.4626606244956237, + "grad_norm": 0.13611097803793237, + "learning_rate": 9.45848785544629e-06, + "loss": 2.715, + "step": 39671 + }, + { + "epoch": 2.4627227015953816, + "grad_norm": 0.15207116487128455, + "learning_rate": 9.456374203277374e-06, + "loss": 2.7039, + "step": 39672 + }, + { + "epoch": 2.4627847786951396, + "grad_norm": 0.14050783468611636, + "learning_rate": 9.454260762634392e-06, + "loss": 2.7554, + "step": 39673 + }, + { + "epoch": 2.462846855794897, + "grad_norm": 0.13410580168062128, + "learning_rate": 9.452147533528377e-06, + "loss": 2.605, + "step": 39674 + }, + { + "epoch": 2.4629089328946554, + "grad_norm": 0.1454989347373828, + "learning_rate": 9.450034515970336e-06, + "loss": 2.6404, + "step": 39675 + }, + { + "epoch": 2.462971009994413, + "grad_norm": 0.14927525399200356, + "learning_rate": 9.447921709971325e-06, + "loss": 2.7246, + "step": 39676 + }, + { + "epoch": 2.463033087094171, + "grad_norm": 0.15369063804396663, + "learning_rate": 9.445809115542348e-06, + "loss": 2.7741, + "step": 39677 + }, + { + "epoch": 2.4630951641939287, + "grad_norm": 0.13575710928155746, + "learning_rate": 9.44369673269443e-06, + "loss": 2.6604, + "step": 39678 + }, + { + "epoch": 2.4631572412936866, + "grad_norm": 0.13912962863099568, + "learning_rate": 9.441584561438593e-06, + "loss": 2.7189, + "step": 39679 + }, + { + "epoch": 2.4632193183934445, + "grad_norm": 0.14762566693782753, + "learning_rate": 9.439472601785843e-06, + "loss": 2.6779, + "step": 39680 + }, + { + "epoch": 2.4632813954932025, + "grad_norm": 0.1672122074747874, + "learning_rate": 9.437360853747224e-06, + "loss": 2.728, + "step": 39681 + }, + { + "epoch": 2.4633434725929604, + "grad_norm": 0.13961502155568276, + "learning_rate": 9.43524931733374e-06, + "loss": 2.7482, + "step": 39682 + }, + { + "epoch": 2.4634055496927183, + "grad_norm": 0.13456995898949153, + "learning_rate": 9.433137992556402e-06, + "loss": 2.7469, + "step": 39683 + }, + { + "epoch": 2.463467626792476, + "grad_norm": 0.14259868630336467, + "learning_rate": 9.431026879426224e-06, + "loss": 2.731, + "step": 39684 + }, + { + "epoch": 2.463529703892234, + "grad_norm": 0.14849479723897088, + "learning_rate": 9.428915977954239e-06, + "loss": 2.7989, + "step": 39685 + }, + { + "epoch": 2.463591780991992, + "grad_norm": 0.15364961430913654, + "learning_rate": 9.426805288151447e-06, + "loss": 2.8072, + "step": 39686 + }, + { + "epoch": 2.46365385809175, + "grad_norm": 0.13724249249173234, + "learning_rate": 9.424694810028861e-06, + "loss": 2.7294, + "step": 39687 + }, + { + "epoch": 2.463715935191508, + "grad_norm": 0.16501737485965048, + "learning_rate": 9.422584543597485e-06, + "loss": 2.7116, + "step": 39688 + }, + { + "epoch": 2.463778012291266, + "grad_norm": 0.13996545115205453, + "learning_rate": 9.420474488868341e-06, + "loss": 2.7208, + "step": 39689 + }, + { + "epoch": 2.4638400893910237, + "grad_norm": 0.13831496985997338, + "learning_rate": 9.41836464585244e-06, + "loss": 2.636, + "step": 39690 + }, + { + "epoch": 2.4639021664907816, + "grad_norm": 0.14264405698781235, + "learning_rate": 9.416255014560776e-06, + "loss": 2.6337, + "step": 39691 + }, + { + "epoch": 2.4639642435905396, + "grad_norm": 0.13089733854155236, + "learning_rate": 9.414145595004364e-06, + "loss": 2.6046, + "step": 39692 + }, + { + "epoch": 2.4640263206902975, + "grad_norm": 0.15666576740225777, + "learning_rate": 9.412036387194196e-06, + "loss": 2.5829, + "step": 39693 + }, + { + "epoch": 2.4640883977900554, + "grad_norm": 0.14281585548557402, + "learning_rate": 9.409927391141294e-06, + "loss": 2.7619, + "step": 39694 + }, + { + "epoch": 2.4641504748898133, + "grad_norm": 0.14529653225608183, + "learning_rate": 9.407818606856661e-06, + "loss": 2.724, + "step": 39695 + }, + { + "epoch": 2.4642125519895712, + "grad_norm": 0.13739993832779768, + "learning_rate": 9.40571003435129e-06, + "loss": 2.644, + "step": 39696 + }, + { + "epoch": 2.4642746290893287, + "grad_norm": 0.1428318574732982, + "learning_rate": 9.40360167363617e-06, + "loss": 2.7206, + "step": 39697 + }, + { + "epoch": 2.464336706189087, + "grad_norm": 0.1477332663633163, + "learning_rate": 9.401493524722315e-06, + "loss": 2.6608, + "step": 39698 + }, + { + "epoch": 2.4643987832888445, + "grad_norm": 0.16985135423750944, + "learning_rate": 9.399385587620741e-06, + "loss": 2.7444, + "step": 39699 + }, + { + "epoch": 2.4644608603886025, + "grad_norm": 0.14351871584749956, + "learning_rate": 9.397277862342423e-06, + "loss": 2.6836, + "step": 39700 + }, + { + "epoch": 2.4645229374883604, + "grad_norm": 0.1376711168065904, + "learning_rate": 9.395170348898369e-06, + "loss": 2.6719, + "step": 39701 + }, + { + "epoch": 2.4645850145881183, + "grad_norm": 0.14233098100532873, + "learning_rate": 9.393063047299555e-06, + "loss": 2.679, + "step": 39702 + }, + { + "epoch": 2.464647091687876, + "grad_norm": 0.16308581757683827, + "learning_rate": 9.390955957556997e-06, + "loss": 2.787, + "step": 39703 + }, + { + "epoch": 2.464709168787634, + "grad_norm": 0.15120240748397723, + "learning_rate": 9.388849079681683e-06, + "loss": 2.6814, + "step": 39704 + }, + { + "epoch": 2.464771245887392, + "grad_norm": 0.1396818121129709, + "learning_rate": 9.386742413684602e-06, + "loss": 2.6924, + "step": 39705 + }, + { + "epoch": 2.46483332298715, + "grad_norm": 0.13855775650664937, + "learning_rate": 9.384635959576743e-06, + "loss": 2.7204, + "step": 39706 + }, + { + "epoch": 2.464895400086908, + "grad_norm": 0.15678658142286075, + "learning_rate": 9.382529717369087e-06, + "loss": 2.6966, + "step": 39707 + }, + { + "epoch": 2.464957477186666, + "grad_norm": 0.13597323287836777, + "learning_rate": 9.380423687072642e-06, + "loss": 2.696, + "step": 39708 + }, + { + "epoch": 2.4650195542864237, + "grad_norm": 0.13990193813360502, + "learning_rate": 9.378317868698394e-06, + "loss": 2.5574, + "step": 39709 + }, + { + "epoch": 2.4650816313861816, + "grad_norm": 0.1388494134143284, + "learning_rate": 9.376212262257323e-06, + "loss": 2.7156, + "step": 39710 + }, + { + "epoch": 2.4651437084859396, + "grad_norm": 0.1391861004654128, + "learning_rate": 9.374106867760396e-06, + "loss": 2.6552, + "step": 39711 + }, + { + "epoch": 2.4652057855856975, + "grad_norm": 0.13290960604675003, + "learning_rate": 9.372001685218635e-06, + "loss": 2.7161, + "step": 39712 + }, + { + "epoch": 2.4652678626854554, + "grad_norm": 0.14212510893732827, + "learning_rate": 9.369896714642996e-06, + "loss": 2.723, + "step": 39713 + }, + { + "epoch": 2.4653299397852133, + "grad_norm": 0.16495741003710665, + "learning_rate": 9.367791956044476e-06, + "loss": 2.7322, + "step": 39714 + }, + { + "epoch": 2.4653920168849712, + "grad_norm": 0.15700813782878387, + "learning_rate": 9.365687409434043e-06, + "loss": 2.7292, + "step": 39715 + }, + { + "epoch": 2.465454093984729, + "grad_norm": 0.13960727671928674, + "learning_rate": 9.363583074822674e-06, + "loss": 2.7334, + "step": 39716 + }, + { + "epoch": 2.465516171084487, + "grad_norm": 0.15234471589784818, + "learning_rate": 9.36147895222137e-06, + "loss": 2.6791, + "step": 39717 + }, + { + "epoch": 2.465578248184245, + "grad_norm": 0.14248571275103358, + "learning_rate": 9.359375041641099e-06, + "loss": 2.7343, + "step": 39718 + }, + { + "epoch": 2.465640325284003, + "grad_norm": 0.13851823489101933, + "learning_rate": 9.357271343092826e-06, + "loss": 2.6878, + "step": 39719 + }, + { + "epoch": 2.465702402383761, + "grad_norm": 0.15429379061742687, + "learning_rate": 9.355167856587526e-06, + "loss": 2.7026, + "step": 39720 + }, + { + "epoch": 2.4657644794835187, + "grad_norm": 0.15084781758522317, + "learning_rate": 9.3530645821362e-06, + "loss": 2.7222, + "step": 39721 + }, + { + "epoch": 2.465826556583276, + "grad_norm": 0.1419887445234425, + "learning_rate": 9.350961519749795e-06, + "loss": 2.7888, + "step": 39722 + }, + { + "epoch": 2.4658886336830346, + "grad_norm": 0.13613478742614554, + "learning_rate": 9.348858669439298e-06, + "loss": 2.7127, + "step": 39723 + }, + { + "epoch": 2.465950710782792, + "grad_norm": 0.14832489862552678, + "learning_rate": 9.346756031215659e-06, + "loss": 2.6124, + "step": 39724 + }, + { + "epoch": 2.46601278788255, + "grad_norm": 0.1420398243958742, + "learning_rate": 9.344653605089875e-06, + "loss": 2.6361, + "step": 39725 + }, + { + "epoch": 2.466074864982308, + "grad_norm": 0.18155438416507919, + "learning_rate": 9.342551391072901e-06, + "loss": 2.805, + "step": 39726 + }, + { + "epoch": 2.466136942082066, + "grad_norm": 0.13779959551194043, + "learning_rate": 9.34044938917571e-06, + "loss": 2.7573, + "step": 39727 + }, + { + "epoch": 2.4661990191818237, + "grad_norm": 0.14496326318230582, + "learning_rate": 9.338347599409264e-06, + "loss": 2.6227, + "step": 39728 + }, + { + "epoch": 2.4662610962815816, + "grad_norm": 0.14043264346135328, + "learning_rate": 9.336246021784516e-06, + "loss": 2.7911, + "step": 39729 + }, + { + "epoch": 2.4663231733813396, + "grad_norm": 0.14044552847658956, + "learning_rate": 9.334144656312444e-06, + "loss": 2.7672, + "step": 39730 + }, + { + "epoch": 2.4663852504810975, + "grad_norm": 0.1332181318001628, + "learning_rate": 9.332043503004023e-06, + "loss": 2.7299, + "step": 39731 + }, + { + "epoch": 2.4664473275808554, + "grad_norm": 0.14672845849973865, + "learning_rate": 9.329942561870203e-06, + "loss": 2.7539, + "step": 39732 + }, + { + "epoch": 2.4665094046806133, + "grad_norm": 0.13911805436224212, + "learning_rate": 9.327841832921947e-06, + "loss": 2.7209, + "step": 39733 + }, + { + "epoch": 2.4665714817803712, + "grad_norm": 0.13911239071966056, + "learning_rate": 9.3257413161702e-06, + "loss": 2.6599, + "step": 39734 + }, + { + "epoch": 2.466633558880129, + "grad_norm": 0.14384769846286918, + "learning_rate": 9.32364101162595e-06, + "loss": 2.6078, + "step": 39735 + }, + { + "epoch": 2.466695635979887, + "grad_norm": 0.15073025117675998, + "learning_rate": 9.321540919300136e-06, + "loss": 2.7145, + "step": 39736 + }, + { + "epoch": 2.466757713079645, + "grad_norm": 0.15387017364587108, + "learning_rate": 9.319441039203719e-06, + "loss": 2.6723, + "step": 39737 + }, + { + "epoch": 2.466819790179403, + "grad_norm": 0.1413971971807001, + "learning_rate": 9.317341371347643e-06, + "loss": 2.7721, + "step": 39738 + }, + { + "epoch": 2.466881867279161, + "grad_norm": 0.1566730494247372, + "learning_rate": 9.315241915742883e-06, + "loss": 2.7307, + "step": 39739 + }, + { + "epoch": 2.4669439443789187, + "grad_norm": 0.1473667589286652, + "learning_rate": 9.313142672400389e-06, + "loss": 2.6445, + "step": 39740 + }, + { + "epoch": 2.4670060214786766, + "grad_norm": 0.1360708621144796, + "learning_rate": 9.311043641331096e-06, + "loss": 2.617, + "step": 39741 + }, + { + "epoch": 2.4670680985784346, + "grad_norm": 0.14732423976800438, + "learning_rate": 9.308944822545974e-06, + "loss": 2.642, + "step": 39742 + }, + { + "epoch": 2.4671301756781925, + "grad_norm": 0.1340750193030034, + "learning_rate": 9.306846216055948e-06, + "loss": 2.7383, + "step": 39743 + }, + { + "epoch": 2.4671922527779504, + "grad_norm": 0.13551713963615178, + "learning_rate": 9.304747821872e-06, + "loss": 2.6964, + "step": 39744 + }, + { + "epoch": 2.467254329877708, + "grad_norm": 0.1545321867290529, + "learning_rate": 9.302649640005063e-06, + "loss": 2.7227, + "step": 39745 + }, + { + "epoch": 2.4673164069774662, + "grad_norm": 0.13708571504298858, + "learning_rate": 9.30055167046608e-06, + "loss": 2.5935, + "step": 39746 + }, + { + "epoch": 2.4673784840772237, + "grad_norm": 0.13556029118698967, + "learning_rate": 9.298453913265981e-06, + "loss": 2.7414, + "step": 39747 + }, + { + "epoch": 2.4674405611769816, + "grad_norm": 0.13590515426041402, + "learning_rate": 9.296356368415748e-06, + "loss": 2.7092, + "step": 39748 + }, + { + "epoch": 2.4675026382767395, + "grad_norm": 0.14492769255361904, + "learning_rate": 9.294259035926306e-06, + "loss": 2.7466, + "step": 39749 + }, + { + "epoch": 2.4675647153764975, + "grad_norm": 0.13940926725007977, + "learning_rate": 9.29216191580859e-06, + "loss": 2.7382, + "step": 39750 + }, + { + "epoch": 2.4676267924762554, + "grad_norm": 0.14574136487881512, + "learning_rate": 9.29006500807355e-06, + "loss": 2.753, + "step": 39751 + }, + { + "epoch": 2.4676888695760133, + "grad_norm": 0.13626265060245712, + "learning_rate": 9.287968312732115e-06, + "loss": 2.7186, + "step": 39752 + }, + { + "epoch": 2.467750946675771, + "grad_norm": 0.14858702607289864, + "learning_rate": 9.28587182979524e-06, + "loss": 2.6365, + "step": 39753 + }, + { + "epoch": 2.467813023775529, + "grad_norm": 0.13829132706083339, + "learning_rate": 9.28377555927386e-06, + "loss": 2.7022, + "step": 39754 + }, + { + "epoch": 2.467875100875287, + "grad_norm": 0.13213577788332037, + "learning_rate": 9.281679501178902e-06, + "loss": 2.6518, + "step": 39755 + }, + { + "epoch": 2.467937177975045, + "grad_norm": 0.13967761821911354, + "learning_rate": 9.279583655521295e-06, + "loss": 2.6942, + "step": 39756 + }, + { + "epoch": 2.467999255074803, + "grad_norm": 0.1432427110914695, + "learning_rate": 9.277488022312003e-06, + "loss": 2.7689, + "step": 39757 + }, + { + "epoch": 2.468061332174561, + "grad_norm": 0.14372655739576617, + "learning_rate": 9.27539260156194e-06, + "loss": 2.6639, + "step": 39758 + }, + { + "epoch": 2.4681234092743187, + "grad_norm": 0.15423141370490162, + "learning_rate": 9.273297393282037e-06, + "loss": 2.7058, + "step": 39759 + }, + { + "epoch": 2.4681854863740766, + "grad_norm": 0.13394095159258396, + "learning_rate": 9.271202397483215e-06, + "loss": 2.6803, + "step": 39760 + }, + { + "epoch": 2.4682475634738346, + "grad_norm": 0.14118878941413657, + "learning_rate": 9.26910761417643e-06, + "loss": 2.7525, + "step": 39761 + }, + { + "epoch": 2.4683096405735925, + "grad_norm": 0.14593838983464613, + "learning_rate": 9.267013043372602e-06, + "loss": 2.7189, + "step": 39762 + }, + { + "epoch": 2.4683717176733504, + "grad_norm": 0.1404178404817938, + "learning_rate": 9.26491868508264e-06, + "loss": 2.6499, + "step": 39763 + }, + { + "epoch": 2.4684337947731083, + "grad_norm": 0.13464914346614457, + "learning_rate": 9.2628245393175e-06, + "loss": 2.7008, + "step": 39764 + }, + { + "epoch": 2.4684958718728662, + "grad_norm": 0.14001227379325873, + "learning_rate": 9.260730606088098e-06, + "loss": 2.7886, + "step": 39765 + }, + { + "epoch": 2.468557948972624, + "grad_norm": 0.13668495169175415, + "learning_rate": 9.258636885405337e-06, + "loss": 2.7299, + "step": 39766 + }, + { + "epoch": 2.468620026072382, + "grad_norm": 0.17155472761277307, + "learning_rate": 9.256543377280168e-06, + "loss": 2.7275, + "step": 39767 + }, + { + "epoch": 2.46868210317214, + "grad_norm": 0.14274482257651672, + "learning_rate": 9.254450081723503e-06, + "loss": 2.7163, + "step": 39768 + }, + { + "epoch": 2.468744180271898, + "grad_norm": 0.1409361298213254, + "learning_rate": 9.252356998746265e-06, + "loss": 2.7895, + "step": 39769 + }, + { + "epoch": 2.4688062573716554, + "grad_norm": 0.1391721486060765, + "learning_rate": 9.25026412835936e-06, + "loss": 2.6867, + "step": 39770 + }, + { + "epoch": 2.4688683344714137, + "grad_norm": 0.15542839523357285, + "learning_rate": 9.248171470573736e-06, + "loss": 2.7492, + "step": 39771 + }, + { + "epoch": 2.468930411571171, + "grad_norm": 0.13826204236571055, + "learning_rate": 9.246079025400289e-06, + "loss": 2.6255, + "step": 39772 + }, + { + "epoch": 2.468992488670929, + "grad_norm": 0.1357369593429934, + "learning_rate": 9.24398679284994e-06, + "loss": 2.7695, + "step": 39773 + }, + { + "epoch": 2.469054565770687, + "grad_norm": 0.15903494988835826, + "learning_rate": 9.241894772933596e-06, + "loss": 2.6591, + "step": 39774 + }, + { + "epoch": 2.469116642870445, + "grad_norm": 0.15280470436197907, + "learning_rate": 9.239802965662191e-06, + "loss": 2.7335, + "step": 39775 + }, + { + "epoch": 2.469178719970203, + "grad_norm": 0.14110748648707686, + "learning_rate": 9.237711371046636e-06, + "loss": 2.6816, + "step": 39776 + }, + { + "epoch": 2.469240797069961, + "grad_norm": 0.14380631484018983, + "learning_rate": 9.235619989097832e-06, + "loss": 2.6754, + "step": 39777 + }, + { + "epoch": 2.4693028741697187, + "grad_norm": 0.14319399849861483, + "learning_rate": 9.233528819826692e-06, + "loss": 2.7081, + "step": 39778 + }, + { + "epoch": 2.4693649512694766, + "grad_norm": 0.13691044719673512, + "learning_rate": 9.231437863244118e-06, + "loss": 2.6607, + "step": 39779 + }, + { + "epoch": 2.4694270283692346, + "grad_norm": 0.13500752369569605, + "learning_rate": 9.229347119361042e-06, + "loss": 2.6673, + "step": 39780 + }, + { + "epoch": 2.4694891054689925, + "grad_norm": 0.13865840873344615, + "learning_rate": 9.22725658818836e-06, + "loss": 2.6905, + "step": 39781 + }, + { + "epoch": 2.4695511825687504, + "grad_norm": 0.1422131976286629, + "learning_rate": 9.22516626973698e-06, + "loss": 2.7188, + "step": 39782 + }, + { + "epoch": 2.4696132596685083, + "grad_norm": 0.15354294012176528, + "learning_rate": 9.22307616401779e-06, + "loss": 2.7362, + "step": 39783 + }, + { + "epoch": 2.4696753367682662, + "grad_norm": 0.18002011777789087, + "learning_rate": 9.220986271041726e-06, + "loss": 2.6524, + "step": 39784 + }, + { + "epoch": 2.469737413868024, + "grad_norm": 0.1472677202859862, + "learning_rate": 9.218896590819675e-06, + "loss": 2.7313, + "step": 39785 + }, + { + "epoch": 2.469799490967782, + "grad_norm": 0.1562456189971908, + "learning_rate": 9.216807123362536e-06, + "loss": 2.7017, + "step": 39786 + }, + { + "epoch": 2.46986156806754, + "grad_norm": 0.15267553635180234, + "learning_rate": 9.214717868681205e-06, + "loss": 2.7553, + "step": 39787 + }, + { + "epoch": 2.469923645167298, + "grad_norm": 0.14962161987632863, + "learning_rate": 9.212628826786607e-06, + "loss": 2.646, + "step": 39788 + }, + { + "epoch": 2.469985722267056, + "grad_norm": 0.14467439135867882, + "learning_rate": 9.210539997689627e-06, + "loss": 2.7695, + "step": 39789 + }, + { + "epoch": 2.4700477993668137, + "grad_norm": 0.13560395097219677, + "learning_rate": 9.208451381401156e-06, + "loss": 2.6613, + "step": 39790 + }, + { + "epoch": 2.4701098764665717, + "grad_norm": 0.1475657034069409, + "learning_rate": 9.206362977932104e-06, + "loss": 2.7082, + "step": 39791 + }, + { + "epoch": 2.4701719535663296, + "grad_norm": 0.16504143256116502, + "learning_rate": 9.204274787293343e-06, + "loss": 2.7121, + "step": 39792 + }, + { + "epoch": 2.470234030666087, + "grad_norm": 0.14148903025483675, + "learning_rate": 9.202186809495794e-06, + "loss": 2.7586, + "step": 39793 + }, + { + "epoch": 2.4702961077658454, + "grad_norm": 0.16147120806419774, + "learning_rate": 9.200099044550342e-06, + "loss": 2.6249, + "step": 39794 + }, + { + "epoch": 2.470358184865603, + "grad_norm": 0.13596793141049596, + "learning_rate": 9.198011492467879e-06, + "loss": 2.6925, + "step": 39795 + }, + { + "epoch": 2.470420261965361, + "grad_norm": 0.13847394034158783, + "learning_rate": 9.195924153259284e-06, + "loss": 2.7948, + "step": 39796 + }, + { + "epoch": 2.4704823390651187, + "grad_norm": 0.17117158586798883, + "learning_rate": 9.193837026935454e-06, + "loss": 2.6963, + "step": 39797 + }, + { + "epoch": 2.4705444161648766, + "grad_norm": 0.1424844708576752, + "learning_rate": 9.191750113507298e-06, + "loss": 2.631, + "step": 39798 + }, + { + "epoch": 2.4706064932646346, + "grad_norm": 0.14891193513134685, + "learning_rate": 9.189663412985689e-06, + "loss": 2.6662, + "step": 39799 + }, + { + "epoch": 2.4706685703643925, + "grad_norm": 0.15372696178625012, + "learning_rate": 9.187576925381509e-06, + "loss": 2.7132, + "step": 39800 + }, + { + "epoch": 2.4707306474641504, + "grad_norm": 0.13441821251836816, + "learning_rate": 9.185490650705653e-06, + "loss": 2.7471, + "step": 39801 + }, + { + "epoch": 2.4707927245639083, + "grad_norm": 0.1376607526871956, + "learning_rate": 9.18340458896898e-06, + "loss": 2.7206, + "step": 39802 + }, + { + "epoch": 2.4708548016636662, + "grad_norm": 0.13827395770139264, + "learning_rate": 9.181318740182415e-06, + "loss": 2.7167, + "step": 39803 + }, + { + "epoch": 2.470916878763424, + "grad_norm": 0.15034707169479966, + "learning_rate": 9.179233104356816e-06, + "loss": 2.7033, + "step": 39804 + }, + { + "epoch": 2.470978955863182, + "grad_norm": 0.17033450133277186, + "learning_rate": 9.177147681503062e-06, + "loss": 2.6978, + "step": 39805 + }, + { + "epoch": 2.47104103296294, + "grad_norm": 0.15149393005297443, + "learning_rate": 9.175062471632035e-06, + "loss": 2.6923, + "step": 39806 + }, + { + "epoch": 2.471103110062698, + "grad_norm": 0.1402597983434368, + "learning_rate": 9.172977474754624e-06, + "loss": 2.6767, + "step": 39807 + }, + { + "epoch": 2.471165187162456, + "grad_norm": 0.13514626692136897, + "learning_rate": 9.170892690881705e-06, + "loss": 2.6912, + "step": 39808 + }, + { + "epoch": 2.4712272642622137, + "grad_norm": 0.1397866385534412, + "learning_rate": 9.168808120024146e-06, + "loss": 2.6945, + "step": 39809 + }, + { + "epoch": 2.4712893413619716, + "grad_norm": 0.14973099448961716, + "learning_rate": 9.166723762192819e-06, + "loss": 2.7574, + "step": 39810 + }, + { + "epoch": 2.4713514184617296, + "grad_norm": 0.1385886721772108, + "learning_rate": 9.164639617398617e-06, + "loss": 2.6849, + "step": 39811 + }, + { + "epoch": 2.4714134955614875, + "grad_norm": 0.1520306168981153, + "learning_rate": 9.162555685652403e-06, + "loss": 2.649, + "step": 39812 + }, + { + "epoch": 2.4714755726612454, + "grad_norm": 0.1556964725656172, + "learning_rate": 9.160471966965051e-06, + "loss": 2.7142, + "step": 39813 + }, + { + "epoch": 2.4715376497610033, + "grad_norm": 0.15283343280966144, + "learning_rate": 9.158388461347427e-06, + "loss": 2.6993, + "step": 39814 + }, + { + "epoch": 2.4715997268607612, + "grad_norm": 0.14621828707405474, + "learning_rate": 9.156305168810398e-06, + "loss": 2.6327, + "step": 39815 + }, + { + "epoch": 2.471661803960519, + "grad_norm": 0.13842648971509372, + "learning_rate": 9.15422208936485e-06, + "loss": 2.6662, + "step": 39816 + }, + { + "epoch": 2.471723881060277, + "grad_norm": 0.1379867135166058, + "learning_rate": 9.152139223021639e-06, + "loss": 2.7308, + "step": 39817 + }, + { + "epoch": 2.4717859581600345, + "grad_norm": 0.13608698655106338, + "learning_rate": 9.150056569791637e-06, + "loss": 2.6165, + "step": 39818 + }, + { + "epoch": 2.471848035259793, + "grad_norm": 0.13807207332004817, + "learning_rate": 9.147974129685694e-06, + "loss": 2.6317, + "step": 39819 + }, + { + "epoch": 2.4719101123595504, + "grad_norm": 0.13971888492554677, + "learning_rate": 9.145891902714698e-06, + "loss": 2.6964, + "step": 39820 + }, + { + "epoch": 2.4719721894593083, + "grad_norm": 0.13538036562905184, + "learning_rate": 9.143809888889499e-06, + "loss": 2.7116, + "step": 39821 + }, + { + "epoch": 2.472034266559066, + "grad_norm": 0.1479392347334833, + "learning_rate": 9.14172808822097e-06, + "loss": 2.7465, + "step": 39822 + }, + { + "epoch": 2.472096343658824, + "grad_norm": 0.17184380768453958, + "learning_rate": 9.139646500719944e-06, + "loss": 2.737, + "step": 39823 + }, + { + "epoch": 2.472158420758582, + "grad_norm": 0.13509461425250663, + "learning_rate": 9.137565126397319e-06, + "loss": 2.7703, + "step": 39824 + }, + { + "epoch": 2.47222049785834, + "grad_norm": 0.16872218176468756, + "learning_rate": 9.13548396526393e-06, + "loss": 2.6721, + "step": 39825 + }, + { + "epoch": 2.472282574958098, + "grad_norm": 0.1414526781504686, + "learning_rate": 9.13340301733065e-06, + "loss": 2.682, + "step": 39826 + }, + { + "epoch": 2.472344652057856, + "grad_norm": 0.14168333126206922, + "learning_rate": 9.131322282608323e-06, + "loss": 2.6122, + "step": 39827 + }, + { + "epoch": 2.4724067291576137, + "grad_norm": 0.14957075068643005, + "learning_rate": 9.129241761107793e-06, + "loss": 2.7237, + "step": 39828 + }, + { + "epoch": 2.4724688062573716, + "grad_norm": 0.15655938155862328, + "learning_rate": 9.127161452839932e-06, + "loss": 2.7342, + "step": 39829 + }, + { + "epoch": 2.4725308833571296, + "grad_norm": 0.13289658605402557, + "learning_rate": 9.125081357815607e-06, + "loss": 2.5243, + "step": 39830 + }, + { + "epoch": 2.4725929604568875, + "grad_norm": 0.13881067260757105, + "learning_rate": 9.123001476045655e-06, + "loss": 2.5646, + "step": 39831 + }, + { + "epoch": 2.4726550375566454, + "grad_norm": 0.14040758624864177, + "learning_rate": 9.120921807540927e-06, + "loss": 2.6504, + "step": 39832 + }, + { + "epoch": 2.4727171146564033, + "grad_norm": 0.14616537964547666, + "learning_rate": 9.118842352312263e-06, + "loss": 2.7305, + "step": 39833 + }, + { + "epoch": 2.4727791917561612, + "grad_norm": 0.13483212392157135, + "learning_rate": 9.11676311037053e-06, + "loss": 2.7174, + "step": 39834 + }, + { + "epoch": 2.472841268855919, + "grad_norm": 0.13991816687768915, + "learning_rate": 9.114684081726576e-06, + "loss": 2.6769, + "step": 39835 + }, + { + "epoch": 2.472903345955677, + "grad_norm": 0.15781525170410554, + "learning_rate": 9.112605266391234e-06, + "loss": 2.7451, + "step": 39836 + }, + { + "epoch": 2.472965423055435, + "grad_norm": 0.1340730743306013, + "learning_rate": 9.110526664375357e-06, + "loss": 2.6299, + "step": 39837 + }, + { + "epoch": 2.473027500155193, + "grad_norm": 0.14036863854329656, + "learning_rate": 9.108448275689779e-06, + "loss": 2.7387, + "step": 39838 + }, + { + "epoch": 2.473089577254951, + "grad_norm": 0.1352999777428749, + "learning_rate": 9.106370100345363e-06, + "loss": 2.7186, + "step": 39839 + }, + { + "epoch": 2.4731516543547087, + "grad_norm": 0.1347073553412456, + "learning_rate": 9.104292138352944e-06, + "loss": 2.6375, + "step": 39840 + }, + { + "epoch": 2.473213731454466, + "grad_norm": 0.14101972415492753, + "learning_rate": 9.102214389723356e-06, + "loss": 2.7046, + "step": 39841 + }, + { + "epoch": 2.4732758085542246, + "grad_norm": 0.1408374492889791, + "learning_rate": 9.10013685446744e-06, + "loss": 2.7141, + "step": 39842 + }, + { + "epoch": 2.473337885653982, + "grad_norm": 0.1383498240017056, + "learning_rate": 9.098059532596043e-06, + "loss": 2.6573, + "step": 39843 + }, + { + "epoch": 2.47339996275374, + "grad_norm": 0.14182784317692956, + "learning_rate": 9.095982424120003e-06, + "loss": 2.6399, + "step": 39844 + }, + { + "epoch": 2.473462039853498, + "grad_norm": 0.15396339256128247, + "learning_rate": 9.093905529050151e-06, + "loss": 2.775, + "step": 39845 + }, + { + "epoch": 2.473524116953256, + "grad_norm": 0.13545691750889596, + "learning_rate": 9.09182884739731e-06, + "loss": 2.679, + "step": 39846 + }, + { + "epoch": 2.4735861940530137, + "grad_norm": 0.1372197101205845, + "learning_rate": 9.089752379172345e-06, + "loss": 2.6659, + "step": 39847 + }, + { + "epoch": 2.4736482711527716, + "grad_norm": 0.15930798397831897, + "learning_rate": 9.087676124386069e-06, + "loss": 2.7238, + "step": 39848 + }, + { + "epoch": 2.4737103482525296, + "grad_norm": 0.14225953577700948, + "learning_rate": 9.085600083049322e-06, + "loss": 2.6881, + "step": 39849 + }, + { + "epoch": 2.4737724253522875, + "grad_norm": 0.14251516206078738, + "learning_rate": 9.083524255172932e-06, + "loss": 2.6877, + "step": 39850 + }, + { + "epoch": 2.4738345024520454, + "grad_norm": 0.14866158235796434, + "learning_rate": 9.081448640767714e-06, + "loss": 2.7482, + "step": 39851 + }, + { + "epoch": 2.4738965795518033, + "grad_norm": 0.14824238929181946, + "learning_rate": 9.079373239844524e-06, + "loss": 2.685, + "step": 39852 + }, + { + "epoch": 2.4739586566515612, + "grad_norm": 0.14854392202019628, + "learning_rate": 9.07729805241418e-06, + "loss": 2.7571, + "step": 39853 + }, + { + "epoch": 2.474020733751319, + "grad_norm": 0.14595538946611974, + "learning_rate": 9.075223078487505e-06, + "loss": 2.6665, + "step": 39854 + }, + { + "epoch": 2.474082810851077, + "grad_norm": 0.13989809701095987, + "learning_rate": 9.073148318075315e-06, + "loss": 2.6621, + "step": 39855 + }, + { + "epoch": 2.474144887950835, + "grad_norm": 0.1591485232873144, + "learning_rate": 9.071073771188455e-06, + "loss": 2.7201, + "step": 39856 + }, + { + "epoch": 2.474206965050593, + "grad_norm": 0.14143534803544439, + "learning_rate": 9.068999437837739e-06, + "loss": 2.7119, + "step": 39857 + }, + { + "epoch": 2.474269042150351, + "grad_norm": 0.15836593703726187, + "learning_rate": 9.066925318033992e-06, + "loss": 2.7336, + "step": 39858 + }, + { + "epoch": 2.4743311192501087, + "grad_norm": 0.13506635338812872, + "learning_rate": 9.064851411788016e-06, + "loss": 2.708, + "step": 39859 + }, + { + "epoch": 2.4743931963498667, + "grad_norm": 0.14798089119587798, + "learning_rate": 9.062777719110665e-06, + "loss": 2.7189, + "step": 39860 + }, + { + "epoch": 2.4744552734496246, + "grad_norm": 0.14517417522944687, + "learning_rate": 9.060704240012724e-06, + "loss": 2.7362, + "step": 39861 + }, + { + "epoch": 2.4745173505493825, + "grad_norm": 0.1416467265746947, + "learning_rate": 9.058630974505045e-06, + "loss": 2.7291, + "step": 39862 + }, + { + "epoch": 2.4745794276491404, + "grad_norm": 0.13795750592683645, + "learning_rate": 9.056557922598418e-06, + "loss": 2.6753, + "step": 39863 + }, + { + "epoch": 2.4746415047488983, + "grad_norm": 0.14209656846955357, + "learning_rate": 9.054485084303677e-06, + "loss": 2.7408, + "step": 39864 + }, + { + "epoch": 2.4747035818486562, + "grad_norm": 0.1576882567302495, + "learning_rate": 9.05241245963161e-06, + "loss": 2.7373, + "step": 39865 + }, + { + "epoch": 2.4747656589484137, + "grad_norm": 0.1401940471074438, + "learning_rate": 9.050340048593059e-06, + "loss": 2.6966, + "step": 39866 + }, + { + "epoch": 2.474827736048172, + "grad_norm": 0.16003978378993688, + "learning_rate": 9.048267851198827e-06, + "loss": 2.7299, + "step": 39867 + }, + { + "epoch": 2.4748898131479296, + "grad_norm": 0.14601212990763457, + "learning_rate": 9.046195867459722e-06, + "loss": 2.6466, + "step": 39868 + }, + { + "epoch": 2.4749518902476875, + "grad_norm": 0.1453036277379984, + "learning_rate": 9.044124097386547e-06, + "loss": 2.7282, + "step": 39869 + }, + { + "epoch": 2.4750139673474454, + "grad_norm": 0.13700986930267797, + "learning_rate": 9.042052540990126e-06, + "loss": 2.7358, + "step": 39870 + }, + { + "epoch": 2.4750760444472033, + "grad_norm": 0.13503200354759812, + "learning_rate": 9.039981198281261e-06, + "loss": 2.6271, + "step": 39871 + }, + { + "epoch": 2.4751381215469612, + "grad_norm": 0.14761043069784396, + "learning_rate": 9.03791006927076e-06, + "loss": 2.739, + "step": 39872 + }, + { + "epoch": 2.475200198646719, + "grad_norm": 0.1376230510184748, + "learning_rate": 9.035839153969411e-06, + "loss": 2.7501, + "step": 39873 + }, + { + "epoch": 2.475262275746477, + "grad_norm": 0.156084497538412, + "learning_rate": 9.033768452388048e-06, + "loss": 2.6929, + "step": 39874 + }, + { + "epoch": 2.475324352846235, + "grad_norm": 0.13428234437821027, + "learning_rate": 9.031697964537461e-06, + "loss": 2.7529, + "step": 39875 + }, + { + "epoch": 2.475386429945993, + "grad_norm": 0.13552263741266124, + "learning_rate": 9.029627690428444e-06, + "loss": 2.7444, + "step": 39876 + }, + { + "epoch": 2.475448507045751, + "grad_norm": 0.133896937352493, + "learning_rate": 9.027557630071814e-06, + "loss": 2.6902, + "step": 39877 + }, + { + "epoch": 2.4755105841455087, + "grad_norm": 0.14531627952623685, + "learning_rate": 9.025487783478342e-06, + "loss": 2.7063, + "step": 39878 + }, + { + "epoch": 2.4755726612452666, + "grad_norm": 0.1325146968693453, + "learning_rate": 9.023418150658864e-06, + "loss": 2.6584, + "step": 39879 + }, + { + "epoch": 2.4756347383450246, + "grad_norm": 0.13514335879549882, + "learning_rate": 9.021348731624164e-06, + "loss": 2.636, + "step": 39880 + }, + { + "epoch": 2.4756968154447825, + "grad_norm": 0.13522049942321937, + "learning_rate": 9.019279526385028e-06, + "loss": 2.6924, + "step": 39881 + }, + { + "epoch": 2.4757588925445404, + "grad_norm": 0.13836990305620317, + "learning_rate": 9.017210534952253e-06, + "loss": 2.7267, + "step": 39882 + }, + { + "epoch": 2.4758209696442983, + "grad_norm": 0.13774851183900919, + "learning_rate": 9.015141757336647e-06, + "loss": 2.6924, + "step": 39883 + }, + { + "epoch": 2.4758830467440562, + "grad_norm": 0.13722742682536868, + "learning_rate": 9.013073193548998e-06, + "loss": 2.6604, + "step": 39884 + }, + { + "epoch": 2.475945123843814, + "grad_norm": 0.1513690948366159, + "learning_rate": 9.011004843600102e-06, + "loss": 2.656, + "step": 39885 + }, + { + "epoch": 2.476007200943572, + "grad_norm": 0.1497332025633518, + "learning_rate": 9.008936707500737e-06, + "loss": 2.7081, + "step": 39886 + }, + { + "epoch": 2.47606927804333, + "grad_norm": 0.13369692114635356, + "learning_rate": 9.00686878526169e-06, + "loss": 2.6169, + "step": 39887 + }, + { + "epoch": 2.476131355143088, + "grad_norm": 0.13665239222367875, + "learning_rate": 9.004801076893771e-06, + "loss": 2.7428, + "step": 39888 + }, + { + "epoch": 2.4761934322428454, + "grad_norm": 0.13800009685664993, + "learning_rate": 9.002733582407757e-06, + "loss": 2.7016, + "step": 39889 + }, + { + "epoch": 2.4762555093426037, + "grad_norm": 0.1370627910999698, + "learning_rate": 9.000666301814436e-06, + "loss": 2.7383, + "step": 39890 + }, + { + "epoch": 2.476317586442361, + "grad_norm": 0.1380523017481576, + "learning_rate": 8.998599235124577e-06, + "loss": 2.7851, + "step": 39891 + }, + { + "epoch": 2.476379663542119, + "grad_norm": 0.13378186450167526, + "learning_rate": 8.996532382348994e-06, + "loss": 2.7608, + "step": 39892 + }, + { + "epoch": 2.476441740641877, + "grad_norm": 0.13772277017174223, + "learning_rate": 8.994465743498454e-06, + "loss": 2.7867, + "step": 39893 + }, + { + "epoch": 2.476503817741635, + "grad_norm": 0.15227396907422974, + "learning_rate": 8.992399318583728e-06, + "loss": 2.679, + "step": 39894 + }, + { + "epoch": 2.476565894841393, + "grad_norm": 0.13551781999808746, + "learning_rate": 8.990333107615622e-06, + "loss": 2.6062, + "step": 39895 + }, + { + "epoch": 2.476627971941151, + "grad_norm": 0.15149506120531772, + "learning_rate": 8.988267110604893e-06, + "loss": 2.6799, + "step": 39896 + }, + { + "epoch": 2.4766900490409087, + "grad_norm": 0.13785725397784762, + "learning_rate": 8.986201327562343e-06, + "loss": 2.7658, + "step": 39897 + }, + { + "epoch": 2.4767521261406666, + "grad_norm": 0.14314813199994642, + "learning_rate": 8.984135758498736e-06, + "loss": 2.6699, + "step": 39898 + }, + { + "epoch": 2.4768142032404246, + "grad_norm": 0.13924741853336398, + "learning_rate": 8.982070403424847e-06, + "loss": 2.7257, + "step": 39899 + }, + { + "epoch": 2.4768762803401825, + "grad_norm": 0.14246825988837516, + "learning_rate": 8.98000526235146e-06, + "loss": 2.6784, + "step": 39900 + }, + { + "epoch": 2.4769383574399404, + "grad_norm": 0.13271389835293215, + "learning_rate": 8.977940335289326e-06, + "loss": 2.6657, + "step": 39901 + }, + { + "epoch": 2.4770004345396983, + "grad_norm": 0.14688724431275432, + "learning_rate": 8.975875622249246e-06, + "loss": 2.6435, + "step": 39902 + }, + { + "epoch": 2.4770625116394562, + "grad_norm": 0.1501109011284528, + "learning_rate": 8.973811123241988e-06, + "loss": 2.7956, + "step": 39903 + }, + { + "epoch": 2.477124588739214, + "grad_norm": 0.14650195969891802, + "learning_rate": 8.971746838278316e-06, + "loss": 2.8002, + "step": 39904 + }, + { + "epoch": 2.477186665838972, + "grad_norm": 0.13886990980438899, + "learning_rate": 8.969682767368986e-06, + "loss": 2.7875, + "step": 39905 + }, + { + "epoch": 2.47724874293873, + "grad_norm": 0.13770736479262316, + "learning_rate": 8.967618910524795e-06, + "loss": 2.695, + "step": 39906 + }, + { + "epoch": 2.477310820038488, + "grad_norm": 0.13756668224112598, + "learning_rate": 8.965555267756498e-06, + "loss": 2.7356, + "step": 39907 + }, + { + "epoch": 2.477372897138246, + "grad_norm": 0.13978450655069988, + "learning_rate": 8.963491839074861e-06, + "loss": 2.6557, + "step": 39908 + }, + { + "epoch": 2.4774349742380037, + "grad_norm": 0.1374560434890845, + "learning_rate": 8.961428624490631e-06, + "loss": 2.7124, + "step": 39909 + }, + { + "epoch": 2.4774970513377617, + "grad_norm": 0.1443599252550968, + "learning_rate": 8.959365624014609e-06, + "loss": 2.6911, + "step": 39910 + }, + { + "epoch": 2.4775591284375196, + "grad_norm": 0.13419037516755833, + "learning_rate": 8.957302837657538e-06, + "loss": 2.7692, + "step": 39911 + }, + { + "epoch": 2.477621205537277, + "grad_norm": 0.1435441642589158, + "learning_rate": 8.955240265430181e-06, + "loss": 2.6464, + "step": 39912 + }, + { + "epoch": 2.4776832826370354, + "grad_norm": 0.13853873322065427, + "learning_rate": 8.953177907343297e-06, + "loss": 2.6617, + "step": 39913 + }, + { + "epoch": 2.477745359736793, + "grad_norm": 0.14668096066873498, + "learning_rate": 8.951115763407642e-06, + "loss": 2.6025, + "step": 39914 + }, + { + "epoch": 2.477807436836551, + "grad_norm": 0.15287870846029944, + "learning_rate": 8.949053833633986e-06, + "loss": 2.7617, + "step": 39915 + }, + { + "epoch": 2.4778695139363087, + "grad_norm": 0.1448426307606174, + "learning_rate": 8.946992118033088e-06, + "loss": 2.6568, + "step": 39916 + }, + { + "epoch": 2.4779315910360666, + "grad_norm": 0.14014965090743894, + "learning_rate": 8.944930616615693e-06, + "loss": 2.7803, + "step": 39917 + }, + { + "epoch": 2.4779936681358246, + "grad_norm": 0.1436047110042478, + "learning_rate": 8.942869329392555e-06, + "loss": 2.6775, + "step": 39918 + }, + { + "epoch": 2.4780557452355825, + "grad_norm": 0.13695864067351662, + "learning_rate": 8.940808256374445e-06, + "loss": 2.7724, + "step": 39919 + }, + { + "epoch": 2.4781178223353404, + "grad_norm": 0.14176176808300064, + "learning_rate": 8.9387473975721e-06, + "loss": 2.7236, + "step": 39920 + }, + { + "epoch": 2.4781798994350983, + "grad_norm": 0.1391684881024983, + "learning_rate": 8.936686752996282e-06, + "loss": 2.717, + "step": 39921 + }, + { + "epoch": 2.4782419765348562, + "grad_norm": 0.1358224919396355, + "learning_rate": 8.93462632265774e-06, + "loss": 2.7852, + "step": 39922 + }, + { + "epoch": 2.478304053634614, + "grad_norm": 0.1372283158813314, + "learning_rate": 8.93256610656721e-06, + "loss": 2.7158, + "step": 39923 + }, + { + "epoch": 2.478366130734372, + "grad_norm": 0.13562974457857854, + "learning_rate": 8.930506104735458e-06, + "loss": 2.7533, + "step": 39924 + }, + { + "epoch": 2.47842820783413, + "grad_norm": 0.14111695070346642, + "learning_rate": 8.928446317173229e-06, + "loss": 2.6285, + "step": 39925 + }, + { + "epoch": 2.478490284933888, + "grad_norm": 0.1350892507274634, + "learning_rate": 8.926386743891268e-06, + "loss": 2.6019, + "step": 39926 + }, + { + "epoch": 2.478552362033646, + "grad_norm": 0.15196663691012324, + "learning_rate": 8.9243273849003e-06, + "loss": 2.6895, + "step": 39927 + }, + { + "epoch": 2.4786144391334037, + "grad_norm": 0.14403242945503708, + "learning_rate": 8.922268240211095e-06, + "loss": 2.6839, + "step": 39928 + }, + { + "epoch": 2.4786765162331617, + "grad_norm": 0.14646030864111542, + "learning_rate": 8.920209309834399e-06, + "loss": 2.7008, + "step": 39929 + }, + { + "epoch": 2.4787385933329196, + "grad_norm": 0.1483477584706326, + "learning_rate": 8.918150593780938e-06, + "loss": 2.6925, + "step": 39930 + }, + { + "epoch": 2.4788006704326775, + "grad_norm": 0.13430575677063164, + "learning_rate": 8.916092092061462e-06, + "loss": 2.6859, + "step": 39931 + }, + { + "epoch": 2.4788627475324354, + "grad_norm": 0.1506517175072952, + "learning_rate": 8.914033804686694e-06, + "loss": 2.7549, + "step": 39932 + }, + { + "epoch": 2.4789248246321933, + "grad_norm": 0.14159320280457227, + "learning_rate": 8.911975731667399e-06, + "loss": 2.7633, + "step": 39933 + }, + { + "epoch": 2.4789869017319512, + "grad_norm": 0.13658190418064572, + "learning_rate": 8.909917873014301e-06, + "loss": 2.7746, + "step": 39934 + }, + { + "epoch": 2.479048978831709, + "grad_norm": 0.14545261693093903, + "learning_rate": 8.907860228738141e-06, + "loss": 2.684, + "step": 39935 + }, + { + "epoch": 2.479111055931467, + "grad_norm": 0.13696787480007894, + "learning_rate": 8.905802798849643e-06, + "loss": 2.6277, + "step": 39936 + }, + { + "epoch": 2.4791731330312246, + "grad_norm": 0.1460110774691918, + "learning_rate": 8.903745583359535e-06, + "loss": 2.6283, + "step": 39937 + }, + { + "epoch": 2.479235210130983, + "grad_norm": 0.13611075223931646, + "learning_rate": 8.901688582278578e-06, + "loss": 2.7232, + "step": 39938 + }, + { + "epoch": 2.4792972872307404, + "grad_norm": 0.14098276220331601, + "learning_rate": 8.899631795617492e-06, + "loss": 2.6287, + "step": 39939 + }, + { + "epoch": 2.4793593643304983, + "grad_norm": 0.14862737933942105, + "learning_rate": 8.897575223387e-06, + "loss": 2.7034, + "step": 39940 + }, + { + "epoch": 2.4794214414302562, + "grad_norm": 0.13644187002265096, + "learning_rate": 8.895518865597823e-06, + "loss": 2.7402, + "step": 39941 + }, + { + "epoch": 2.479483518530014, + "grad_norm": 0.15054658754140177, + "learning_rate": 8.893462722260715e-06, + "loss": 2.7891, + "step": 39942 + }, + { + "epoch": 2.479545595629772, + "grad_norm": 0.1378821194756851, + "learning_rate": 8.891406793386392e-06, + "loss": 2.6549, + "step": 39943 + }, + { + "epoch": 2.47960767272953, + "grad_norm": 0.13928575034870302, + "learning_rate": 8.889351078985581e-06, + "loss": 2.7397, + "step": 39944 + }, + { + "epoch": 2.479669749829288, + "grad_norm": 0.14732359496501307, + "learning_rate": 8.887295579068989e-06, + "loss": 2.7598, + "step": 39945 + }, + { + "epoch": 2.479731826929046, + "grad_norm": 0.14087417831536672, + "learning_rate": 8.885240293647373e-06, + "loss": 2.606, + "step": 39946 + }, + { + "epoch": 2.4797939040288037, + "grad_norm": 0.13418637054991445, + "learning_rate": 8.883185222731439e-06, + "loss": 2.7205, + "step": 39947 + }, + { + "epoch": 2.4798559811285616, + "grad_norm": 0.1409230649665038, + "learning_rate": 8.881130366331908e-06, + "loss": 2.6459, + "step": 39948 + }, + { + "epoch": 2.4799180582283196, + "grad_norm": 0.1367956422110824, + "learning_rate": 8.879075724459502e-06, + "loss": 2.7676, + "step": 39949 + }, + { + "epoch": 2.4799801353280775, + "grad_norm": 0.139107242362214, + "learning_rate": 8.877021297124922e-06, + "loss": 2.6447, + "step": 39950 + }, + { + "epoch": 2.4800422124278354, + "grad_norm": 0.13949778907444096, + "learning_rate": 8.874967084338925e-06, + "loss": 2.5888, + "step": 39951 + }, + { + "epoch": 2.4801042895275933, + "grad_norm": 0.13105764598889752, + "learning_rate": 8.872913086112205e-06, + "loss": 2.7385, + "step": 39952 + }, + { + "epoch": 2.4801663666273512, + "grad_norm": 0.13317911653255865, + "learning_rate": 8.87085930245548e-06, + "loss": 2.6878, + "step": 39953 + }, + { + "epoch": 2.480228443727109, + "grad_norm": 0.14460371752429535, + "learning_rate": 8.86880573337946e-06, + "loss": 2.7188, + "step": 39954 + }, + { + "epoch": 2.480290520826867, + "grad_norm": 0.6422648724355629, + "learning_rate": 8.866752378894871e-06, + "loss": 2.7374, + "step": 39955 + }, + { + "epoch": 2.480352597926625, + "grad_norm": 0.13652818204960687, + "learning_rate": 8.864699239012425e-06, + "loss": 2.6776, + "step": 39956 + }, + { + "epoch": 2.480414675026383, + "grad_norm": 0.1342100911350522, + "learning_rate": 8.862646313742828e-06, + "loss": 2.6492, + "step": 39957 + }, + { + "epoch": 2.480476752126141, + "grad_norm": 0.1407212484317117, + "learning_rate": 8.860593603096778e-06, + "loss": 2.6617, + "step": 39958 + }, + { + "epoch": 2.4805388292258987, + "grad_norm": 0.14475957774477144, + "learning_rate": 8.858541107085e-06, + "loss": 2.7736, + "step": 39959 + }, + { + "epoch": 2.480600906325656, + "grad_norm": 0.13496857716734115, + "learning_rate": 8.856488825718211e-06, + "loss": 2.6012, + "step": 39960 + }, + { + "epoch": 2.4806629834254146, + "grad_norm": 0.14011599556953266, + "learning_rate": 8.854436759007106e-06, + "loss": 2.7431, + "step": 39961 + }, + { + "epoch": 2.480725060525172, + "grad_norm": 0.1429559652417633, + "learning_rate": 8.852384906962397e-06, + "loss": 2.7305, + "step": 39962 + }, + { + "epoch": 2.48078713762493, + "grad_norm": 0.1360661462453828, + "learning_rate": 8.850333269594786e-06, + "loss": 2.7105, + "step": 39963 + }, + { + "epoch": 2.480849214724688, + "grad_norm": 0.13825824998863004, + "learning_rate": 8.848281846914958e-06, + "loss": 2.6597, + "step": 39964 + }, + { + "epoch": 2.480911291824446, + "grad_norm": 0.1415679957626189, + "learning_rate": 8.846230638933651e-06, + "loss": 2.766, + "step": 39965 + }, + { + "epoch": 2.4809733689242037, + "grad_norm": 0.13587238736435175, + "learning_rate": 8.844179645661543e-06, + "loss": 2.6552, + "step": 39966 + }, + { + "epoch": 2.4810354460239616, + "grad_norm": 0.13905189050428693, + "learning_rate": 8.842128867109345e-06, + "loss": 2.6802, + "step": 39967 + }, + { + "epoch": 2.4810975231237196, + "grad_norm": 0.13591968814306649, + "learning_rate": 8.840078303287741e-06, + "loss": 2.7199, + "step": 39968 + }, + { + "epoch": 2.4811596002234775, + "grad_norm": 0.15272399935395323, + "learning_rate": 8.838027954207451e-06, + "loss": 2.7151, + "step": 39969 + }, + { + "epoch": 2.4812216773232354, + "grad_norm": 0.15445920396804177, + "learning_rate": 8.83597781987916e-06, + "loss": 2.7393, + "step": 39970 + }, + { + "epoch": 2.4812837544229933, + "grad_norm": 0.14222273887726802, + "learning_rate": 8.833927900313565e-06, + "loss": 2.7045, + "step": 39971 + }, + { + "epoch": 2.4813458315227512, + "grad_norm": 0.1477998854857538, + "learning_rate": 8.831878195521365e-06, + "loss": 2.7131, + "step": 39972 + }, + { + "epoch": 2.481407908622509, + "grad_norm": 0.14005889459529, + "learning_rate": 8.829828705513238e-06, + "loss": 2.6775, + "step": 39973 + }, + { + "epoch": 2.481469985722267, + "grad_norm": 0.14113284176747778, + "learning_rate": 8.827779430299898e-06, + "loss": 2.6727, + "step": 39974 + }, + { + "epoch": 2.481532062822025, + "grad_norm": 0.14873693405796762, + "learning_rate": 8.825730369892032e-06, + "loss": 2.6444, + "step": 39975 + }, + { + "epoch": 2.481594139921783, + "grad_norm": 0.15357881366677403, + "learning_rate": 8.82368152430032e-06, + "loss": 2.676, + "step": 39976 + }, + { + "epoch": 2.481656217021541, + "grad_norm": 0.1397385473233081, + "learning_rate": 8.821632893535447e-06, + "loss": 2.7722, + "step": 39977 + }, + { + "epoch": 2.4817182941212987, + "grad_norm": 0.14422560928663689, + "learning_rate": 8.819584477608129e-06, + "loss": 2.6735, + "step": 39978 + }, + { + "epoch": 2.4817803712210567, + "grad_norm": 0.1402161668735496, + "learning_rate": 8.817536276529032e-06, + "loss": 2.7572, + "step": 39979 + }, + { + "epoch": 2.4818424483208146, + "grad_norm": 0.16153355097446404, + "learning_rate": 8.815488290308848e-06, + "loss": 2.79, + "step": 39980 + }, + { + "epoch": 2.4819045254205725, + "grad_norm": 0.14085276358095136, + "learning_rate": 8.813440518958244e-06, + "loss": 2.6901, + "step": 39981 + }, + { + "epoch": 2.4819666025203304, + "grad_norm": 0.1351823100836701, + "learning_rate": 8.81139296248793e-06, + "loss": 2.7502, + "step": 39982 + }, + { + "epoch": 2.4820286796200883, + "grad_norm": 0.15420350738610947, + "learning_rate": 8.809345620908583e-06, + "loss": 2.7154, + "step": 39983 + }, + { + "epoch": 2.4820907567198462, + "grad_norm": 0.1504786840469217, + "learning_rate": 8.807298494230876e-06, + "loss": 2.7487, + "step": 39984 + }, + { + "epoch": 2.4821528338196037, + "grad_norm": 0.13690571262948134, + "learning_rate": 8.805251582465496e-06, + "loss": 2.6967, + "step": 39985 + }, + { + "epoch": 2.482214910919362, + "grad_norm": 0.13726568495608538, + "learning_rate": 8.8032048856231e-06, + "loss": 2.6071, + "step": 39986 + }, + { + "epoch": 2.4822769880191196, + "grad_norm": 0.15279008210430028, + "learning_rate": 8.801158403714405e-06, + "loss": 2.7955, + "step": 39987 + }, + { + "epoch": 2.4823390651188775, + "grad_norm": 0.13674819492974166, + "learning_rate": 8.799112136750059e-06, + "loss": 2.6864, + "step": 39988 + }, + { + "epoch": 2.4824011422186354, + "grad_norm": 0.14630267636319386, + "learning_rate": 8.797066084740752e-06, + "loss": 2.7459, + "step": 39989 + }, + { + "epoch": 2.4824632193183933, + "grad_norm": 0.14275599841527456, + "learning_rate": 8.795020247697144e-06, + "loss": 2.7003, + "step": 39990 + }, + { + "epoch": 2.4825252964181512, + "grad_norm": 0.13925259514835384, + "learning_rate": 8.79297462562993e-06, + "loss": 2.7301, + "step": 39991 + }, + { + "epoch": 2.482587373517909, + "grad_norm": 0.14438009090035103, + "learning_rate": 8.790929218549754e-06, + "loss": 2.6952, + "step": 39992 + }, + { + "epoch": 2.482649450617667, + "grad_norm": 0.1473639782079517, + "learning_rate": 8.78888402646732e-06, + "loss": 2.7995, + "step": 39993 + }, + { + "epoch": 2.482711527717425, + "grad_norm": 0.1397260970144434, + "learning_rate": 8.78683904939328e-06, + "loss": 2.713, + "step": 39994 + }, + { + "epoch": 2.482773604817183, + "grad_norm": 0.14393384000072967, + "learning_rate": 8.784794287338293e-06, + "loss": 2.7218, + "step": 39995 + }, + { + "epoch": 2.482835681916941, + "grad_norm": 0.14636439177971372, + "learning_rate": 8.782749740313051e-06, + "loss": 2.6479, + "step": 39996 + }, + { + "epoch": 2.4828977590166987, + "grad_norm": 0.1481561252631286, + "learning_rate": 8.780705408328217e-06, + "loss": 2.7608, + "step": 39997 + }, + { + "epoch": 2.4829598361164567, + "grad_norm": 0.13913328346319673, + "learning_rate": 8.778661291394441e-06, + "loss": 2.7022, + "step": 39998 + }, + { + "epoch": 2.4830219132162146, + "grad_norm": 0.1494198405326657, + "learning_rate": 8.776617389522395e-06, + "loss": 2.759, + "step": 39999 + }, + { + "epoch": 2.4830839903159725, + "grad_norm": 0.1363780476778538, + "learning_rate": 8.774573702722738e-06, + "loss": 2.7173, + "step": 40000 + }, + { + "epoch": 2.4831460674157304, + "grad_norm": 0.13713214228780377, + "learning_rate": 8.772530231006149e-06, + "loss": 2.6645, + "step": 40001 + }, + { + "epoch": 2.4832081445154883, + "grad_norm": 0.13764067167666075, + "learning_rate": 8.770486974383274e-06, + "loss": 2.6679, + "step": 40002 + }, + { + "epoch": 2.4832702216152462, + "grad_norm": 0.1853141429339395, + "learning_rate": 8.768443932864778e-06, + "loss": 2.6566, + "step": 40003 + }, + { + "epoch": 2.483332298715004, + "grad_norm": 0.1346289176282348, + "learning_rate": 8.766401106461309e-06, + "loss": 2.6829, + "step": 40004 + }, + { + "epoch": 2.483394375814762, + "grad_norm": 0.15866872772145718, + "learning_rate": 8.764358495183544e-06, + "loss": 2.815, + "step": 40005 + }, + { + "epoch": 2.48345645291452, + "grad_norm": 0.1373383785929031, + "learning_rate": 8.762316099042139e-06, + "loss": 2.6718, + "step": 40006 + }, + { + "epoch": 2.483518530014278, + "grad_norm": 0.13629170207219765, + "learning_rate": 8.760273918047735e-06, + "loss": 2.7567, + "step": 40007 + }, + { + "epoch": 2.4835806071140354, + "grad_norm": 0.13581164387513417, + "learning_rate": 8.758231952210982e-06, + "loss": 2.7237, + "step": 40008 + }, + { + "epoch": 2.4836426842137937, + "grad_norm": 0.14133405080447672, + "learning_rate": 8.756190201542557e-06, + "loss": 2.772, + "step": 40009 + }, + { + "epoch": 2.4837047613135512, + "grad_norm": 0.13902758256392736, + "learning_rate": 8.754148666053102e-06, + "loss": 2.7864, + "step": 40010 + }, + { + "epoch": 2.483766838413309, + "grad_norm": 0.14876682365670996, + "learning_rate": 8.752107345753263e-06, + "loss": 2.7527, + "step": 40011 + }, + { + "epoch": 2.483828915513067, + "grad_norm": 0.1381229159577686, + "learning_rate": 8.750066240653693e-06, + "loss": 2.6701, + "step": 40012 + }, + { + "epoch": 2.483890992612825, + "grad_norm": 0.15319276473143387, + "learning_rate": 8.748025350765032e-06, + "loss": 2.6612, + "step": 40013 + }, + { + "epoch": 2.483953069712583, + "grad_norm": 0.1626575863569845, + "learning_rate": 8.745984676097951e-06, + "loss": 2.725, + "step": 40014 + }, + { + "epoch": 2.484015146812341, + "grad_norm": 0.14378653507762837, + "learning_rate": 8.74394421666308e-06, + "loss": 2.6758, + "step": 40015 + }, + { + "epoch": 2.4840772239120987, + "grad_norm": 0.1499595160291493, + "learning_rate": 8.741903972471066e-06, + "loss": 2.6926, + "step": 40016 + }, + { + "epoch": 2.4841393010118566, + "grad_norm": 0.13862629262168466, + "learning_rate": 8.739863943532544e-06, + "loss": 2.6831, + "step": 40017 + }, + { + "epoch": 2.4842013781116146, + "grad_norm": 0.14188540062403543, + "learning_rate": 8.73782412985818e-06, + "loss": 2.7459, + "step": 40018 + }, + { + "epoch": 2.4842634552113725, + "grad_norm": 0.13777063217097446, + "learning_rate": 8.735784531458602e-06, + "loss": 2.6318, + "step": 40019 + }, + { + "epoch": 2.4843255323111304, + "grad_norm": 0.14513792477659654, + "learning_rate": 8.733745148344458e-06, + "loss": 2.5846, + "step": 40020 + }, + { + "epoch": 2.4843876094108883, + "grad_norm": 0.1413511100242717, + "learning_rate": 8.731705980526378e-06, + "loss": 2.6666, + "step": 40021 + }, + { + "epoch": 2.4844496865106462, + "grad_norm": 0.14797674999343, + "learning_rate": 8.729667028014998e-06, + "loss": 2.678, + "step": 40022 + }, + { + "epoch": 2.484511763610404, + "grad_norm": 0.13885467864622072, + "learning_rate": 8.727628290820972e-06, + "loss": 2.6998, + "step": 40023 + }, + { + "epoch": 2.484573840710162, + "grad_norm": 0.14471119555848974, + "learning_rate": 8.725589768954934e-06, + "loss": 2.6599, + "step": 40024 + }, + { + "epoch": 2.48463591780992, + "grad_norm": 0.13547328688384352, + "learning_rate": 8.723551462427498e-06, + "loss": 2.6797, + "step": 40025 + }, + { + "epoch": 2.484697994909678, + "grad_norm": 0.15923430085164086, + "learning_rate": 8.72151337124933e-06, + "loss": 2.7298, + "step": 40026 + }, + { + "epoch": 2.484760072009436, + "grad_norm": 0.1395340230715335, + "learning_rate": 8.71947549543103e-06, + "loss": 2.76, + "step": 40027 + }, + { + "epoch": 2.4848221491091937, + "grad_norm": 0.13646199094684905, + "learning_rate": 8.717437834983262e-06, + "loss": 2.6723, + "step": 40028 + }, + { + "epoch": 2.4848842262089517, + "grad_norm": 0.14752841820167645, + "learning_rate": 8.715400389916645e-06, + "loss": 2.673, + "step": 40029 + }, + { + "epoch": 2.4849463033087096, + "grad_norm": 0.14612712529874625, + "learning_rate": 8.7133631602418e-06, + "loss": 2.6639, + "step": 40030 + }, + { + "epoch": 2.4850083804084675, + "grad_norm": 0.14659902536385688, + "learning_rate": 8.711326145969356e-06, + "loss": 2.7512, + "step": 40031 + }, + { + "epoch": 2.4850704575082254, + "grad_norm": 0.13397850073790912, + "learning_rate": 8.709289347109956e-06, + "loss": 2.8114, + "step": 40032 + }, + { + "epoch": 2.485132534607983, + "grad_norm": 0.13624100319661364, + "learning_rate": 8.707252763674218e-06, + "loss": 2.8032, + "step": 40033 + }, + { + "epoch": 2.4851946117077413, + "grad_norm": 0.13669844204076592, + "learning_rate": 8.705216395672766e-06, + "loss": 2.6852, + "step": 40034 + }, + { + "epoch": 2.4852566888074987, + "grad_norm": 0.15680249999048895, + "learning_rate": 8.703180243116227e-06, + "loss": 2.6546, + "step": 40035 + }, + { + "epoch": 2.4853187659072566, + "grad_norm": 0.15313741380625115, + "learning_rate": 8.701144306015207e-06, + "loss": 2.7775, + "step": 40036 + }, + { + "epoch": 2.4853808430070146, + "grad_norm": 0.15432651452389345, + "learning_rate": 8.699108584380356e-06, + "loss": 2.6535, + "step": 40037 + }, + { + "epoch": 2.4854429201067725, + "grad_norm": 0.1445545358199867, + "learning_rate": 8.69707307822228e-06, + "loss": 2.7992, + "step": 40038 + }, + { + "epoch": 2.4855049972065304, + "grad_norm": 0.13555775606965972, + "learning_rate": 8.695037787551602e-06, + "loss": 2.7202, + "step": 40039 + }, + { + "epoch": 2.4855670743062883, + "grad_norm": 0.19200372864969528, + "learning_rate": 8.693002712378929e-06, + "loss": 2.7338, + "step": 40040 + }, + { + "epoch": 2.4856291514060462, + "grad_norm": 0.15289352934685402, + "learning_rate": 8.690967852714899e-06, + "loss": 2.7317, + "step": 40041 + }, + { + "epoch": 2.485691228505804, + "grad_norm": 0.14053465669755605, + "learning_rate": 8.688933208570116e-06, + "loss": 2.7968, + "step": 40042 + }, + { + "epoch": 2.485753305605562, + "grad_norm": 0.1604794056520148, + "learning_rate": 8.6868987799552e-06, + "loss": 2.743, + "step": 40043 + }, + { + "epoch": 2.48581538270532, + "grad_norm": 0.1373090059624141, + "learning_rate": 8.684864566880746e-06, + "loss": 2.6297, + "step": 40044 + }, + { + "epoch": 2.485877459805078, + "grad_norm": 0.1427197406263324, + "learning_rate": 8.682830569357398e-06, + "loss": 2.7182, + "step": 40045 + }, + { + "epoch": 2.485939536904836, + "grad_norm": 0.15413444153785605, + "learning_rate": 8.68079678739575e-06, + "loss": 2.7806, + "step": 40046 + }, + { + "epoch": 2.4860016140045937, + "grad_norm": 0.13390113841953535, + "learning_rate": 8.67876322100642e-06, + "loss": 2.62, + "step": 40047 + }, + { + "epoch": 2.4860636911043517, + "grad_norm": 0.14369684433845178, + "learning_rate": 8.676729870200012e-06, + "loss": 2.6984, + "step": 40048 + }, + { + "epoch": 2.4861257682041096, + "grad_norm": 0.1603280227890559, + "learning_rate": 8.67469673498712e-06, + "loss": 2.6882, + "step": 40049 + }, + { + "epoch": 2.4861878453038675, + "grad_norm": 0.14316403830034954, + "learning_rate": 8.672663815378385e-06, + "loss": 2.73, + "step": 40050 + }, + { + "epoch": 2.4862499224036254, + "grad_norm": 0.1384715392757925, + "learning_rate": 8.670631111384392e-06, + "loss": 2.7144, + "step": 40051 + }, + { + "epoch": 2.4863119995033833, + "grad_norm": 0.13463645771098812, + "learning_rate": 8.66859862301575e-06, + "loss": 2.7388, + "step": 40052 + }, + { + "epoch": 2.4863740766031412, + "grad_norm": 0.1592438808258117, + "learning_rate": 8.666566350283052e-06, + "loss": 2.7142, + "step": 40053 + }, + { + "epoch": 2.486436153702899, + "grad_norm": 0.1356505214832292, + "learning_rate": 8.664534293196924e-06, + "loss": 2.6598, + "step": 40054 + }, + { + "epoch": 2.486498230802657, + "grad_norm": 0.14078310846717257, + "learning_rate": 8.66250245176795e-06, + "loss": 2.7173, + "step": 40055 + }, + { + "epoch": 2.4865603079024146, + "grad_norm": 0.13365062456643306, + "learning_rate": 8.660470826006744e-06, + "loss": 2.627, + "step": 40056 + }, + { + "epoch": 2.486622385002173, + "grad_norm": 0.14957804812544223, + "learning_rate": 8.65843941592389e-06, + "loss": 2.7762, + "step": 40057 + }, + { + "epoch": 2.4866844621019304, + "grad_norm": 0.13743203607568857, + "learning_rate": 8.656408221529988e-06, + "loss": 2.6799, + "step": 40058 + }, + { + "epoch": 2.4867465392016883, + "grad_norm": 0.14075520898029092, + "learning_rate": 8.654377242835632e-06, + "loss": 2.7038, + "step": 40059 + }, + { + "epoch": 2.4868086163014462, + "grad_norm": 0.13630892090950156, + "learning_rate": 8.652346479851447e-06, + "loss": 2.6972, + "step": 40060 + }, + { + "epoch": 2.486870693401204, + "grad_norm": 0.155154206424789, + "learning_rate": 8.650315932588005e-06, + "loss": 2.7124, + "step": 40061 + }, + { + "epoch": 2.486932770500962, + "grad_norm": 0.13608139522896723, + "learning_rate": 8.6482856010559e-06, + "loss": 2.6213, + "step": 40062 + }, + { + "epoch": 2.48699484760072, + "grad_norm": 0.13795613550903715, + "learning_rate": 8.64625548526572e-06, + "loss": 2.6419, + "step": 40063 + }, + { + "epoch": 2.487056924700478, + "grad_norm": 0.14766453588539108, + "learning_rate": 8.644225585228071e-06, + "loss": 2.761, + "step": 40064 + }, + { + "epoch": 2.487119001800236, + "grad_norm": 0.14499198446439954, + "learning_rate": 8.642195900953543e-06, + "loss": 2.7608, + "step": 40065 + }, + { + "epoch": 2.4871810788999937, + "grad_norm": 0.13511390877258442, + "learning_rate": 8.640166432452718e-06, + "loss": 2.7409, + "step": 40066 + }, + { + "epoch": 2.4872431559997517, + "grad_norm": 0.13169233483770793, + "learning_rate": 8.638137179736167e-06, + "loss": 2.7014, + "step": 40067 + }, + { + "epoch": 2.4873052330995096, + "grad_norm": 0.13603418490904462, + "learning_rate": 8.636108142814514e-06, + "loss": 2.6715, + "step": 40068 + }, + { + "epoch": 2.4873673101992675, + "grad_norm": 0.17539052453494858, + "learning_rate": 8.634079321698324e-06, + "loss": 2.65, + "step": 40069 + }, + { + "epoch": 2.4874293872990254, + "grad_norm": 0.14035532180036542, + "learning_rate": 8.632050716398183e-06, + "loss": 2.6146, + "step": 40070 + }, + { + "epoch": 2.4874914643987833, + "grad_norm": 0.13606412706422982, + "learning_rate": 8.630022326924676e-06, + "loss": 2.7625, + "step": 40071 + }, + { + "epoch": 2.4875535414985412, + "grad_norm": 0.1455357159991519, + "learning_rate": 8.627994153288372e-06, + "loss": 2.5747, + "step": 40072 + }, + { + "epoch": 2.487615618598299, + "grad_norm": 0.1471697667346822, + "learning_rate": 8.625966195499879e-06, + "loss": 2.7131, + "step": 40073 + }, + { + "epoch": 2.487677695698057, + "grad_norm": 0.168718097019928, + "learning_rate": 8.623938453569768e-06, + "loss": 2.7417, + "step": 40074 + }, + { + "epoch": 2.487739772797815, + "grad_norm": 0.14144077380129905, + "learning_rate": 8.62191092750861e-06, + "loss": 2.6888, + "step": 40075 + }, + { + "epoch": 2.487801849897573, + "grad_norm": 0.13210681355597065, + "learning_rate": 8.619883617326979e-06, + "loss": 2.6767, + "step": 40076 + }, + { + "epoch": 2.487863926997331, + "grad_norm": 0.14724618545652834, + "learning_rate": 8.617856523035466e-06, + "loss": 2.7537, + "step": 40077 + }, + { + "epoch": 2.4879260040970888, + "grad_norm": 0.13108002152016193, + "learning_rate": 8.615829644644646e-06, + "loss": 2.6167, + "step": 40078 + }, + { + "epoch": 2.4879880811968467, + "grad_norm": 0.14311071757721713, + "learning_rate": 8.613802982165093e-06, + "loss": 2.6674, + "step": 40079 + }, + { + "epoch": 2.4880501582966046, + "grad_norm": 0.13830769092121747, + "learning_rate": 8.611776535607358e-06, + "loss": 2.6839, + "step": 40080 + }, + { + "epoch": 2.488112235396362, + "grad_norm": 0.14148057175258144, + "learning_rate": 8.609750304982051e-06, + "loss": 2.648, + "step": 40081 + }, + { + "epoch": 2.4881743124961204, + "grad_norm": 0.13326304813433656, + "learning_rate": 8.607724290299717e-06, + "loss": 2.6503, + "step": 40082 + }, + { + "epoch": 2.488236389595878, + "grad_norm": 0.13807342235175676, + "learning_rate": 8.605698491570935e-06, + "loss": 2.7955, + "step": 40083 + }, + { + "epoch": 2.488298466695636, + "grad_norm": 0.1343584058739732, + "learning_rate": 8.603672908806276e-06, + "loss": 2.6865, + "step": 40084 + }, + { + "epoch": 2.4883605437953937, + "grad_norm": 0.13992700602518104, + "learning_rate": 8.601647542016294e-06, + "loss": 2.5634, + "step": 40085 + }, + { + "epoch": 2.4884226208951516, + "grad_norm": 0.141784608851604, + "learning_rate": 8.599622391211576e-06, + "loss": 2.7376, + "step": 40086 + }, + { + "epoch": 2.4884846979949096, + "grad_norm": 0.13613553311271492, + "learning_rate": 8.59759745640268e-06, + "loss": 2.613, + "step": 40087 + }, + { + "epoch": 2.4885467750946675, + "grad_norm": 0.13520811524567064, + "learning_rate": 8.595572737600166e-06, + "loss": 2.7449, + "step": 40088 + }, + { + "epoch": 2.4886088521944254, + "grad_norm": 0.13578802473772242, + "learning_rate": 8.593548234814592e-06, + "loss": 2.6914, + "step": 40089 + }, + { + "epoch": 2.4886709292941833, + "grad_norm": 0.13813194414953997, + "learning_rate": 8.591523948056524e-06, + "loss": 2.6798, + "step": 40090 + }, + { + "epoch": 2.4887330063939412, + "grad_norm": 0.14724645110356516, + "learning_rate": 8.589499877336544e-06, + "loss": 2.7485, + "step": 40091 + }, + { + "epoch": 2.488795083493699, + "grad_norm": 0.15115101667906772, + "learning_rate": 8.587476022665192e-06, + "loss": 2.6632, + "step": 40092 + }, + { + "epoch": 2.488857160593457, + "grad_norm": 0.13549509131155094, + "learning_rate": 8.585452384053034e-06, + "loss": 2.749, + "step": 40093 + }, + { + "epoch": 2.488919237693215, + "grad_norm": 0.13794205249599023, + "learning_rate": 8.583428961510609e-06, + "loss": 2.6161, + "step": 40094 + }, + { + "epoch": 2.488981314792973, + "grad_norm": 0.1365059695440632, + "learning_rate": 8.581405755048506e-06, + "loss": 2.7357, + "step": 40095 + }, + { + "epoch": 2.489043391892731, + "grad_norm": 0.13630907805553957, + "learning_rate": 8.579382764677257e-06, + "loss": 2.6563, + "step": 40096 + }, + { + "epoch": 2.4891054689924887, + "grad_norm": 0.14077463401211, + "learning_rate": 8.577359990407429e-06, + "loss": 2.703, + "step": 40097 + }, + { + "epoch": 2.4891675460922467, + "grad_norm": 0.14400622658960113, + "learning_rate": 8.57533743224957e-06, + "loss": 2.7364, + "step": 40098 + }, + { + "epoch": 2.4892296231920046, + "grad_norm": 0.136485193214082, + "learning_rate": 8.573315090214217e-06, + "loss": 2.7239, + "step": 40099 + }, + { + "epoch": 2.4892917002917625, + "grad_norm": 0.1605209292494652, + "learning_rate": 8.57129296431195e-06, + "loss": 2.7582, + "step": 40100 + }, + { + "epoch": 2.4893537773915204, + "grad_norm": 0.13424072459880748, + "learning_rate": 8.569271054553308e-06, + "loss": 2.7038, + "step": 40101 + }, + { + "epoch": 2.4894158544912783, + "grad_norm": 0.15428120497312114, + "learning_rate": 8.567249360948832e-06, + "loss": 2.6521, + "step": 40102 + }, + { + "epoch": 2.4894779315910363, + "grad_norm": 0.13788738787258065, + "learning_rate": 8.565227883509059e-06, + "loss": 2.7548, + "step": 40103 + }, + { + "epoch": 2.4895400086907937, + "grad_norm": 0.15178155480123118, + "learning_rate": 8.563206622244568e-06, + "loss": 2.6468, + "step": 40104 + }, + { + "epoch": 2.489602085790552, + "grad_norm": 0.13992478963477614, + "learning_rate": 8.561185577165886e-06, + "loss": 2.76, + "step": 40105 + }, + { + "epoch": 2.4896641628903096, + "grad_norm": 0.14459897078600253, + "learning_rate": 8.55916474828356e-06, + "loss": 2.7595, + "step": 40106 + }, + { + "epoch": 2.4897262399900675, + "grad_norm": 0.13900313637260167, + "learning_rate": 8.55714413560813e-06, + "loss": 2.7047, + "step": 40107 + }, + { + "epoch": 2.4897883170898254, + "grad_norm": 0.13936488835471786, + "learning_rate": 8.555123739150123e-06, + "loss": 2.6551, + "step": 40108 + }, + { + "epoch": 2.4898503941895833, + "grad_norm": 0.13795303007842624, + "learning_rate": 8.553103558920117e-06, + "loss": 2.7365, + "step": 40109 + }, + { + "epoch": 2.4899124712893412, + "grad_norm": 0.13411213508395958, + "learning_rate": 8.551083594928627e-06, + "loss": 2.6685, + "step": 40110 + }, + { + "epoch": 2.489974548389099, + "grad_norm": 0.14558294775789307, + "learning_rate": 8.549063847186195e-06, + "loss": 2.6554, + "step": 40111 + }, + { + "epoch": 2.490036625488857, + "grad_norm": 0.1448925695581929, + "learning_rate": 8.54704431570335e-06, + "loss": 2.6454, + "step": 40112 + }, + { + "epoch": 2.490098702588615, + "grad_norm": 0.13591876997110608, + "learning_rate": 8.545025000490648e-06, + "loss": 2.6388, + "step": 40113 + }, + { + "epoch": 2.490160779688373, + "grad_norm": 0.13694433266748743, + "learning_rate": 8.543005901558614e-06, + "loss": 2.6187, + "step": 40114 + }, + { + "epoch": 2.490222856788131, + "grad_norm": 0.13774402919955447, + "learning_rate": 8.540987018917784e-06, + "loss": 2.6153, + "step": 40115 + }, + { + "epoch": 2.4902849338878887, + "grad_norm": 0.14174958089273718, + "learning_rate": 8.538968352578674e-06, + "loss": 2.6634, + "step": 40116 + }, + { + "epoch": 2.4903470109876467, + "grad_norm": 0.14281938338020542, + "learning_rate": 8.536949902551849e-06, + "loss": 2.675, + "step": 40117 + }, + { + "epoch": 2.4904090880874046, + "grad_norm": 0.1558144040827236, + "learning_rate": 8.534931668847817e-06, + "loss": 2.6366, + "step": 40118 + }, + { + "epoch": 2.4904711651871625, + "grad_norm": 0.13449509737454987, + "learning_rate": 8.532913651477114e-06, + "loss": 2.6746, + "step": 40119 + }, + { + "epoch": 2.4905332422869204, + "grad_norm": 0.14838257199974256, + "learning_rate": 8.530895850450266e-06, + "loss": 2.7627, + "step": 40120 + }, + { + "epoch": 2.4905953193866783, + "grad_norm": 0.1325541817124907, + "learning_rate": 8.52887826577779e-06, + "loss": 2.6969, + "step": 40121 + }, + { + "epoch": 2.4906573964864362, + "grad_norm": 0.1606807725834215, + "learning_rate": 8.526860897470234e-06, + "loss": 2.739, + "step": 40122 + }, + { + "epoch": 2.490719473586194, + "grad_norm": 0.13546364837332908, + "learning_rate": 8.524843745538103e-06, + "loss": 2.666, + "step": 40123 + }, + { + "epoch": 2.490781550685952, + "grad_norm": 0.13387668830811442, + "learning_rate": 8.522826809991941e-06, + "loss": 2.6959, + "step": 40124 + }, + { + "epoch": 2.49084362778571, + "grad_norm": 0.13014721310121635, + "learning_rate": 8.520810090842263e-06, + "loss": 2.6174, + "step": 40125 + }, + { + "epoch": 2.490905704885468, + "grad_norm": 0.1350370383007473, + "learning_rate": 8.518793588099571e-06, + "loss": 2.682, + "step": 40126 + }, + { + "epoch": 2.490967781985226, + "grad_norm": 0.14092303853157884, + "learning_rate": 8.51677730177442e-06, + "loss": 2.7729, + "step": 40127 + }, + { + "epoch": 2.4910298590849838, + "grad_norm": 0.1388492926805549, + "learning_rate": 8.51476123187731e-06, + "loss": 2.7703, + "step": 40128 + }, + { + "epoch": 2.4910919361847412, + "grad_norm": 0.13326612308430139, + "learning_rate": 8.512745378418763e-06, + "loss": 2.6381, + "step": 40129 + }, + { + "epoch": 2.4911540132844996, + "grad_norm": 0.1474844238070207, + "learning_rate": 8.510729741409285e-06, + "loss": 2.6262, + "step": 40130 + }, + { + "epoch": 2.491216090384257, + "grad_norm": 0.13802370182391419, + "learning_rate": 8.508714320859406e-06, + "loss": 2.7613, + "step": 40131 + }, + { + "epoch": 2.491278167484015, + "grad_norm": 0.15202134814660312, + "learning_rate": 8.506699116779643e-06, + "loss": 2.6901, + "step": 40132 + }, + { + "epoch": 2.491340244583773, + "grad_norm": 0.13576646628850692, + "learning_rate": 8.504684129180501e-06, + "loss": 2.6859, + "step": 40133 + }, + { + "epoch": 2.491402321683531, + "grad_norm": 0.13963737310580157, + "learning_rate": 8.502669358072497e-06, + "loss": 2.6847, + "step": 40134 + }, + { + "epoch": 2.4914643987832887, + "grad_norm": 0.15741479161988353, + "learning_rate": 8.50065480346613e-06, + "loss": 2.8255, + "step": 40135 + }, + { + "epoch": 2.4915264758830467, + "grad_norm": 0.14882219808396155, + "learning_rate": 8.498640465371932e-06, + "loss": 2.6259, + "step": 40136 + }, + { + "epoch": 2.4915885529828046, + "grad_norm": 0.13622899892143978, + "learning_rate": 8.496626343800401e-06, + "loss": 2.6975, + "step": 40137 + }, + { + "epoch": 2.4916506300825625, + "grad_norm": 0.1392297871706786, + "learning_rate": 8.494612438762045e-06, + "loss": 2.7014, + "step": 40138 + }, + { + "epoch": 2.4917127071823204, + "grad_norm": 0.1343650046223993, + "learning_rate": 8.492598750267361e-06, + "loss": 2.635, + "step": 40139 + }, + { + "epoch": 2.4917747842820783, + "grad_norm": 0.13552355139638458, + "learning_rate": 8.490585278326873e-06, + "loss": 2.8086, + "step": 40140 + }, + { + "epoch": 2.4918368613818362, + "grad_norm": 0.15242646490525255, + "learning_rate": 8.488572022951085e-06, + "loss": 2.7974, + "step": 40141 + }, + { + "epoch": 2.491898938481594, + "grad_norm": 0.13774987080043544, + "learning_rate": 8.48655898415049e-06, + "loss": 2.7293, + "step": 40142 + }, + { + "epoch": 2.491961015581352, + "grad_norm": 0.143108856361, + "learning_rate": 8.484546161935597e-06, + "loss": 2.6972, + "step": 40143 + }, + { + "epoch": 2.49202309268111, + "grad_norm": 0.13821633830501065, + "learning_rate": 8.48253355631689e-06, + "loss": 2.5902, + "step": 40144 + }, + { + "epoch": 2.492085169780868, + "grad_norm": 0.13801206180002656, + "learning_rate": 8.480521167304894e-06, + "loss": 2.7, + "step": 40145 + }, + { + "epoch": 2.492147246880626, + "grad_norm": 0.14290738300022, + "learning_rate": 8.478508994910106e-06, + "loss": 2.6214, + "step": 40146 + }, + { + "epoch": 2.4922093239803838, + "grad_norm": 0.1374545732148469, + "learning_rate": 8.476497039143011e-06, + "loss": 2.7978, + "step": 40147 + }, + { + "epoch": 2.4922714010801417, + "grad_norm": 0.1363637675389468, + "learning_rate": 8.474485300014102e-06, + "loss": 2.6543, + "step": 40148 + }, + { + "epoch": 2.4923334781798996, + "grad_norm": 0.15052442048648204, + "learning_rate": 8.47247377753389e-06, + "loss": 2.6387, + "step": 40149 + }, + { + "epoch": 2.4923955552796575, + "grad_norm": 0.15854589393254293, + "learning_rate": 8.47046247171287e-06, + "loss": 2.733, + "step": 40150 + }, + { + "epoch": 2.4924576323794154, + "grad_norm": 0.15634815590067982, + "learning_rate": 8.468451382561526e-06, + "loss": 2.6667, + "step": 40151 + }, + { + "epoch": 2.492519709479173, + "grad_norm": 0.13796733473141531, + "learning_rate": 8.46644051009034e-06, + "loss": 2.7679, + "step": 40152 + }, + { + "epoch": 2.4925817865789313, + "grad_norm": 0.14426511671145517, + "learning_rate": 8.46442985430983e-06, + "loss": 2.7311, + "step": 40153 + }, + { + "epoch": 2.4926438636786887, + "grad_norm": 0.139359749325274, + "learning_rate": 8.462419415230476e-06, + "loss": 2.791, + "step": 40154 + }, + { + "epoch": 2.4927059407784466, + "grad_norm": 0.14544617544314584, + "learning_rate": 8.460409192862762e-06, + "loss": 2.6179, + "step": 40155 + }, + { + "epoch": 2.4927680178782046, + "grad_norm": 0.14108444367996287, + "learning_rate": 8.458399187217159e-06, + "loss": 2.8011, + "step": 40156 + }, + { + "epoch": 2.4928300949779625, + "grad_norm": 0.1492332286700824, + "learning_rate": 8.456389398304188e-06, + "loss": 2.6802, + "step": 40157 + }, + { + "epoch": 2.4928921720777204, + "grad_norm": 0.15145767337593197, + "learning_rate": 8.454379826134306e-06, + "loss": 2.8109, + "step": 40158 + }, + { + "epoch": 2.4929542491774783, + "grad_norm": 0.13804208094115097, + "learning_rate": 8.452370470718023e-06, + "loss": 2.697, + "step": 40159 + }, + { + "epoch": 2.4930163262772362, + "grad_norm": 0.17096168437645226, + "learning_rate": 8.450361332065814e-06, + "loss": 2.7716, + "step": 40160 + }, + { + "epoch": 2.493078403376994, + "grad_norm": 0.13884149842257895, + "learning_rate": 8.448352410188148e-06, + "loss": 2.6732, + "step": 40161 + }, + { + "epoch": 2.493140480476752, + "grad_norm": 0.16878265753619812, + "learning_rate": 8.446343705095511e-06, + "loss": 2.7868, + "step": 40162 + }, + { + "epoch": 2.49320255757651, + "grad_norm": 0.13577382658934456, + "learning_rate": 8.444335216798393e-06, + "loss": 2.6194, + "step": 40163 + }, + { + "epoch": 2.493264634676268, + "grad_norm": 0.14902862999745384, + "learning_rate": 8.442326945307267e-06, + "loss": 2.6919, + "step": 40164 + }, + { + "epoch": 2.493326711776026, + "grad_norm": 0.13487573631057354, + "learning_rate": 8.44031889063261e-06, + "loss": 2.6963, + "step": 40165 + }, + { + "epoch": 2.4933887888757837, + "grad_norm": 0.13602241571467774, + "learning_rate": 8.438311052784886e-06, + "loss": 2.7252, + "step": 40166 + }, + { + "epoch": 2.4934508659755417, + "grad_norm": 0.15065965493213332, + "learning_rate": 8.436303431774595e-06, + "loss": 2.7225, + "step": 40167 + }, + { + "epoch": 2.4935129430752996, + "grad_norm": 0.1742825355000152, + "learning_rate": 8.434296027612199e-06, + "loss": 2.791, + "step": 40168 + }, + { + "epoch": 2.4935750201750575, + "grad_norm": 0.15556683202039903, + "learning_rate": 8.432288840308173e-06, + "loss": 2.7339, + "step": 40169 + }, + { + "epoch": 2.4936370972748154, + "grad_norm": 0.1445919953098186, + "learning_rate": 8.43028186987298e-06, + "loss": 2.6473, + "step": 40170 + }, + { + "epoch": 2.4936991743745733, + "grad_norm": 0.15311910097281353, + "learning_rate": 8.428275116317092e-06, + "loss": 2.6867, + "step": 40171 + }, + { + "epoch": 2.4937612514743313, + "grad_norm": 0.14029052481579857, + "learning_rate": 8.426268579650997e-06, + "loss": 2.7641, + "step": 40172 + }, + { + "epoch": 2.493823328574089, + "grad_norm": 0.1357651423523523, + "learning_rate": 8.424262259885146e-06, + "loss": 2.6756, + "step": 40173 + }, + { + "epoch": 2.493885405673847, + "grad_norm": 0.15636070025497956, + "learning_rate": 8.422256157030017e-06, + "loss": 2.796, + "step": 40174 + }, + { + "epoch": 2.493947482773605, + "grad_norm": 0.15737512334435488, + "learning_rate": 8.420250271096059e-06, + "loss": 2.7019, + "step": 40175 + }, + { + "epoch": 2.494009559873363, + "grad_norm": 0.1491772201925383, + "learning_rate": 8.418244602093755e-06, + "loss": 2.698, + "step": 40176 + }, + { + "epoch": 2.4940716369731204, + "grad_norm": 0.13825977234077252, + "learning_rate": 8.416239150033573e-06, + "loss": 2.6669, + "step": 40177 + }, + { + "epoch": 2.4941337140728788, + "grad_norm": 0.13513061605590349, + "learning_rate": 8.414233914925957e-06, + "loss": 2.6558, + "step": 40178 + }, + { + "epoch": 2.4941957911726362, + "grad_norm": 0.13551125676827191, + "learning_rate": 8.412228896781371e-06, + "loss": 2.6965, + "step": 40179 + }, + { + "epoch": 2.494257868272394, + "grad_norm": 0.15049931079169657, + "learning_rate": 8.410224095610293e-06, + "loss": 2.762, + "step": 40180 + }, + { + "epoch": 2.494319945372152, + "grad_norm": 0.13946692134026736, + "learning_rate": 8.408219511423171e-06, + "loss": 2.7846, + "step": 40181 + }, + { + "epoch": 2.49438202247191, + "grad_norm": 0.13746997982376027, + "learning_rate": 8.406215144230467e-06, + "loss": 2.6552, + "step": 40182 + }, + { + "epoch": 2.494444099571668, + "grad_norm": 0.15012571609891656, + "learning_rate": 8.404210994042633e-06, + "loss": 2.7251, + "step": 40183 + }, + { + "epoch": 2.494506176671426, + "grad_norm": 0.1548349679406191, + "learning_rate": 8.40220706087012e-06, + "loss": 2.738, + "step": 40184 + }, + { + "epoch": 2.4945682537711837, + "grad_norm": 0.17698310488614485, + "learning_rate": 8.4002033447234e-06, + "loss": 2.7167, + "step": 40185 + }, + { + "epoch": 2.4946303308709417, + "grad_norm": 0.1450034045876165, + "learning_rate": 8.39819984561292e-06, + "loss": 2.7113, + "step": 40186 + }, + { + "epoch": 2.4946924079706996, + "grad_norm": 0.13791521922900127, + "learning_rate": 8.39619656354913e-06, + "loss": 2.7134, + "step": 40187 + }, + { + "epoch": 2.4947544850704575, + "grad_norm": 0.15891156110824511, + "learning_rate": 8.394193498542469e-06, + "loss": 2.652, + "step": 40188 + }, + { + "epoch": 2.4948165621702154, + "grad_norm": 0.147237595944058, + "learning_rate": 8.392190650603398e-06, + "loss": 2.8157, + "step": 40189 + }, + { + "epoch": 2.4948786392699733, + "grad_norm": 0.14019441912035058, + "learning_rate": 8.390188019742385e-06, + "loss": 2.7622, + "step": 40190 + }, + { + "epoch": 2.4949407163697312, + "grad_norm": 0.13581890572847105, + "learning_rate": 8.388185605969862e-06, + "loss": 2.6349, + "step": 40191 + }, + { + "epoch": 2.495002793469489, + "grad_norm": 0.14722191468005277, + "learning_rate": 8.386183409296277e-06, + "loss": 2.7565, + "step": 40192 + }, + { + "epoch": 2.495064870569247, + "grad_norm": 0.1391502376836391, + "learning_rate": 8.384181429732074e-06, + "loss": 2.7505, + "step": 40193 + }, + { + "epoch": 2.495126947669005, + "grad_norm": 0.14715073654853733, + "learning_rate": 8.382179667287687e-06, + "loss": 2.7594, + "step": 40194 + }, + { + "epoch": 2.495189024768763, + "grad_norm": 0.14056865570019741, + "learning_rate": 8.380178121973581e-06, + "loss": 2.6907, + "step": 40195 + }, + { + "epoch": 2.495251101868521, + "grad_norm": 0.14566987248825947, + "learning_rate": 8.378176793800196e-06, + "loss": 2.6019, + "step": 40196 + }, + { + "epoch": 2.4953131789682788, + "grad_norm": 0.14051059864369034, + "learning_rate": 8.376175682777959e-06, + "loss": 2.7069, + "step": 40197 + }, + { + "epoch": 2.4953752560680367, + "grad_norm": 0.13675141625481693, + "learning_rate": 8.374174788917311e-06, + "loss": 2.7018, + "step": 40198 + }, + { + "epoch": 2.4954373331677946, + "grad_norm": 0.13640061558014016, + "learning_rate": 8.37217411222871e-06, + "loss": 2.7664, + "step": 40199 + }, + { + "epoch": 2.495499410267552, + "grad_norm": 0.1515398947581839, + "learning_rate": 8.370173652722579e-06, + "loss": 2.5752, + "step": 40200 + }, + { + "epoch": 2.4955614873673104, + "grad_norm": 0.134924518512922, + "learning_rate": 8.368173410409364e-06, + "loss": 2.7392, + "step": 40201 + }, + { + "epoch": 2.495623564467068, + "grad_norm": 0.14047996008444935, + "learning_rate": 8.366173385299474e-06, + "loss": 2.6481, + "step": 40202 + }, + { + "epoch": 2.495685641566826, + "grad_norm": 0.13626207648137015, + "learning_rate": 8.364173577403383e-06, + "loss": 2.7444, + "step": 40203 + }, + { + "epoch": 2.4957477186665837, + "grad_norm": 0.14777532121030978, + "learning_rate": 8.362173986731508e-06, + "loss": 2.6848, + "step": 40204 + }, + { + "epoch": 2.4958097957663417, + "grad_norm": 0.13248350262278766, + "learning_rate": 8.360174613294275e-06, + "loss": 2.6938, + "step": 40205 + }, + { + "epoch": 2.4958718728660996, + "grad_norm": 0.13582873158422026, + "learning_rate": 8.358175457102118e-06, + "loss": 2.6717, + "step": 40206 + }, + { + "epoch": 2.4959339499658575, + "grad_norm": 0.1372014855726845, + "learning_rate": 8.356176518165459e-06, + "loss": 2.8014, + "step": 40207 + }, + { + "epoch": 2.4959960270656154, + "grad_norm": 0.14452994835757174, + "learning_rate": 8.35417779649475e-06, + "loss": 2.6959, + "step": 40208 + }, + { + "epoch": 2.4960581041653733, + "grad_norm": 0.15876657319665666, + "learning_rate": 8.352179292100403e-06, + "loss": 2.7634, + "step": 40209 + }, + { + "epoch": 2.4961201812651312, + "grad_norm": 0.13258315367369805, + "learning_rate": 8.350181004992846e-06, + "loss": 2.7521, + "step": 40210 + }, + { + "epoch": 2.496182258364889, + "grad_norm": 0.13522248299032172, + "learning_rate": 8.348182935182497e-06, + "loss": 2.6748, + "step": 40211 + }, + { + "epoch": 2.496244335464647, + "grad_norm": 0.1521396369984274, + "learning_rate": 8.346185082679798e-06, + "loss": 2.7662, + "step": 40212 + }, + { + "epoch": 2.496306412564405, + "grad_norm": 0.14091517392121145, + "learning_rate": 8.344187447495161e-06, + "loss": 2.6218, + "step": 40213 + }, + { + "epoch": 2.496368489664163, + "grad_norm": 0.1363376414630975, + "learning_rate": 8.342190029639013e-06, + "loss": 2.771, + "step": 40214 + }, + { + "epoch": 2.496430566763921, + "grad_norm": 0.14257909173507072, + "learning_rate": 8.340192829121763e-06, + "loss": 2.8025, + "step": 40215 + }, + { + "epoch": 2.4964926438636788, + "grad_norm": 0.14549844186807184, + "learning_rate": 8.33819584595385e-06, + "loss": 2.7194, + "step": 40216 + }, + { + "epoch": 2.4965547209634367, + "grad_norm": 0.14046048767011024, + "learning_rate": 8.336199080145685e-06, + "loss": 2.8012, + "step": 40217 + }, + { + "epoch": 2.4966167980631946, + "grad_norm": 0.15085528784519067, + "learning_rate": 8.33420253170768e-06, + "loss": 2.6717, + "step": 40218 + }, + { + "epoch": 2.4966788751629525, + "grad_norm": 0.14445383344056706, + "learning_rate": 8.332206200650256e-06, + "loss": 2.779, + "step": 40219 + }, + { + "epoch": 2.4967409522627104, + "grad_norm": 0.14354243193751373, + "learning_rate": 8.33021008698382e-06, + "loss": 2.6964, + "step": 40220 + }, + { + "epoch": 2.4968030293624683, + "grad_norm": 0.13762657668399833, + "learning_rate": 8.328214190718797e-06, + "loss": 2.7301, + "step": 40221 + }, + { + "epoch": 2.4968651064622263, + "grad_norm": 0.1499835921790789, + "learning_rate": 8.32621851186559e-06, + "loss": 2.6404, + "step": 40222 + }, + { + "epoch": 2.496927183561984, + "grad_norm": 0.16915005201714697, + "learning_rate": 8.324223050434632e-06, + "loss": 2.8242, + "step": 40223 + }, + { + "epoch": 2.496989260661742, + "grad_norm": 0.15371277261532856, + "learning_rate": 8.322227806436317e-06, + "loss": 2.7157, + "step": 40224 + }, + { + "epoch": 2.4970513377614996, + "grad_norm": 0.14506229579275107, + "learning_rate": 8.320232779881044e-06, + "loss": 2.7558, + "step": 40225 + }, + { + "epoch": 2.497113414861258, + "grad_norm": 0.15955916030741737, + "learning_rate": 8.318237970779247e-06, + "loss": 2.746, + "step": 40226 + }, + { + "epoch": 2.4971754919610154, + "grad_norm": 0.13956129249404006, + "learning_rate": 8.316243379141325e-06, + "loss": 2.6817, + "step": 40227 + }, + { + "epoch": 2.4972375690607733, + "grad_norm": 0.13682070745096275, + "learning_rate": 8.314249004977675e-06, + "loss": 2.6508, + "step": 40228 + }, + { + "epoch": 2.4972996461605312, + "grad_norm": 0.13528156342032807, + "learning_rate": 8.312254848298712e-06, + "loss": 2.6463, + "step": 40229 + }, + { + "epoch": 2.497361723260289, + "grad_norm": 0.15132520154887272, + "learning_rate": 8.310260909114819e-06, + "loss": 2.6727, + "step": 40230 + }, + { + "epoch": 2.497423800360047, + "grad_norm": 0.14072911064387783, + "learning_rate": 8.30826718743643e-06, + "loss": 2.7281, + "step": 40231 + }, + { + "epoch": 2.497485877459805, + "grad_norm": 0.18195105247250817, + "learning_rate": 8.306273683273935e-06, + "loss": 2.6487, + "step": 40232 + }, + { + "epoch": 2.497547954559563, + "grad_norm": 0.14559044507793883, + "learning_rate": 8.304280396637726e-06, + "loss": 2.6113, + "step": 40233 + }, + { + "epoch": 2.497610031659321, + "grad_norm": 0.13524418324250798, + "learning_rate": 8.3022873275382e-06, + "loss": 2.7273, + "step": 40234 + }, + { + "epoch": 2.4976721087590787, + "grad_norm": 0.18509021579447496, + "learning_rate": 8.300294475985776e-06, + "loss": 2.7398, + "step": 40235 + }, + { + "epoch": 2.4977341858588367, + "grad_norm": 0.1492044374582194, + "learning_rate": 8.298301841990835e-06, + "loss": 2.7006, + "step": 40236 + }, + { + "epoch": 2.4977962629585946, + "grad_norm": 0.1724085329646899, + "learning_rate": 8.296309425563781e-06, + "loss": 2.7277, + "step": 40237 + }, + { + "epoch": 2.4978583400583525, + "grad_norm": 0.13540292786326755, + "learning_rate": 8.294317226714986e-06, + "loss": 2.7292, + "step": 40238 + }, + { + "epoch": 2.4979204171581104, + "grad_norm": 0.15834277664955962, + "learning_rate": 8.29232524545488e-06, + "loss": 2.6988, + "step": 40239 + }, + { + "epoch": 2.4979824942578683, + "grad_norm": 0.1387169321494927, + "learning_rate": 8.290333481793837e-06, + "loss": 2.6738, + "step": 40240 + }, + { + "epoch": 2.4980445713576263, + "grad_norm": 0.13460141695867037, + "learning_rate": 8.288341935742249e-06, + "loss": 2.72, + "step": 40241 + }, + { + "epoch": 2.498106648457384, + "grad_norm": 0.14066731293197998, + "learning_rate": 8.286350607310506e-06, + "loss": 2.6785, + "step": 40242 + }, + { + "epoch": 2.498168725557142, + "grad_norm": 0.13777860192141914, + "learning_rate": 8.284359496508987e-06, + "loss": 2.669, + "step": 40243 + }, + { + "epoch": 2.4982308026569, + "grad_norm": 0.13501319398027958, + "learning_rate": 8.282368603348107e-06, + "loss": 2.7441, + "step": 40244 + }, + { + "epoch": 2.498292879756658, + "grad_norm": 0.15298472567512156, + "learning_rate": 8.280377927838234e-06, + "loss": 2.6655, + "step": 40245 + }, + { + "epoch": 2.498354956856416, + "grad_norm": 0.16216771673077032, + "learning_rate": 8.278387469989758e-06, + "loss": 2.7187, + "step": 40246 + }, + { + "epoch": 2.4984170339561738, + "grad_norm": 0.14460528900169112, + "learning_rate": 8.276397229813048e-06, + "loss": 2.6546, + "step": 40247 + }, + { + "epoch": 2.4984791110559312, + "grad_norm": 0.1674165909734779, + "learning_rate": 8.274407207318518e-06, + "loss": 2.715, + "step": 40248 + }, + { + "epoch": 2.4985411881556896, + "grad_norm": 0.1411123308723944, + "learning_rate": 8.27241740251653e-06, + "loss": 2.691, + "step": 40249 + }, + { + "epoch": 2.498603265255447, + "grad_norm": 0.14647951118023797, + "learning_rate": 8.270427815417475e-06, + "loss": 2.74, + "step": 40250 + }, + { + "epoch": 2.498665342355205, + "grad_norm": 0.15390365932433578, + "learning_rate": 8.268438446031718e-06, + "loss": 2.7318, + "step": 40251 + }, + { + "epoch": 2.498727419454963, + "grad_norm": 0.13662614410860188, + "learning_rate": 8.266449294369655e-06, + "loss": 2.6783, + "step": 40252 + }, + { + "epoch": 2.498789496554721, + "grad_norm": 0.13087508121577224, + "learning_rate": 8.264460360441661e-06, + "loss": 2.6507, + "step": 40253 + }, + { + "epoch": 2.4988515736544787, + "grad_norm": 0.14001009201178855, + "learning_rate": 8.262471644258096e-06, + "loss": 2.7303, + "step": 40254 + }, + { + "epoch": 2.4989136507542367, + "grad_norm": 0.13471784988771585, + "learning_rate": 8.260483145829356e-06, + "loss": 2.738, + "step": 40255 + }, + { + "epoch": 2.4989757278539946, + "grad_norm": 0.13262927515732878, + "learning_rate": 8.258494865165816e-06, + "loss": 2.7439, + "step": 40256 + }, + { + "epoch": 2.4990378049537525, + "grad_norm": 0.13485700853746777, + "learning_rate": 8.256506802277825e-06, + "loss": 2.6616, + "step": 40257 + }, + { + "epoch": 2.4990998820535104, + "grad_norm": 0.17070858900809732, + "learning_rate": 8.254518957175783e-06, + "loss": 2.7264, + "step": 40258 + }, + { + "epoch": 2.4991619591532683, + "grad_norm": 0.13710760946848807, + "learning_rate": 8.25253132987005e-06, + "loss": 2.7176, + "step": 40259 + }, + { + "epoch": 2.4992240362530262, + "grad_norm": 0.13577692698805957, + "learning_rate": 8.250543920370995e-06, + "loss": 2.7108, + "step": 40260 + }, + { + "epoch": 2.499286113352784, + "grad_norm": 0.13685933106150935, + "learning_rate": 8.248556728688977e-06, + "loss": 2.7394, + "step": 40261 + }, + { + "epoch": 2.499348190452542, + "grad_norm": 0.1440823323403225, + "learning_rate": 8.246569754834389e-06, + "loss": 2.8014, + "step": 40262 + }, + { + "epoch": 2.4994102675523, + "grad_norm": 0.14314867199740788, + "learning_rate": 8.24458299881758e-06, + "loss": 2.7135, + "step": 40263 + }, + { + "epoch": 2.499472344652058, + "grad_norm": 0.14165220341645107, + "learning_rate": 8.242596460648916e-06, + "loss": 2.7253, + "step": 40264 + }, + { + "epoch": 2.499534421751816, + "grad_norm": 0.13803679820762083, + "learning_rate": 8.24061014033875e-06, + "loss": 2.7492, + "step": 40265 + }, + { + "epoch": 2.4995964988515738, + "grad_norm": 0.14441934325696074, + "learning_rate": 8.238624037897474e-06, + "loss": 2.694, + "step": 40266 + }, + { + "epoch": 2.4996585759513317, + "grad_norm": 0.14825955818138742, + "learning_rate": 8.236638153335429e-06, + "loss": 2.7285, + "step": 40267 + }, + { + "epoch": 2.4997206530510896, + "grad_norm": 0.13800512455369526, + "learning_rate": 8.234652486662991e-06, + "loss": 2.7225, + "step": 40268 + }, + { + "epoch": 2.4997827301508475, + "grad_norm": 0.1424266650848155, + "learning_rate": 8.2326670378905e-06, + "loss": 2.7162, + "step": 40269 + }, + { + "epoch": 2.4998448072506054, + "grad_norm": 0.14270579487800658, + "learning_rate": 8.230681807028318e-06, + "loss": 2.7297, + "step": 40270 + }, + { + "epoch": 2.4999068843503633, + "grad_norm": 0.14193433923868054, + "learning_rate": 8.22869679408682e-06, + "loss": 2.7904, + "step": 40271 + }, + { + "epoch": 2.4999689614501213, + "grad_norm": 0.16250171064430105, + "learning_rate": 8.226711999076353e-06, + "loss": 2.6625, + "step": 40272 + }, + { + "epoch": 2.5000310385498787, + "grad_norm": 0.1451875054203021, + "learning_rate": 8.224727422007267e-06, + "loss": 2.7132, + "step": 40273 + }, + { + "epoch": 2.500093115649637, + "grad_norm": 0.1567381327025283, + "learning_rate": 8.222743062889909e-06, + "loss": 2.6371, + "step": 40274 + }, + { + "epoch": 2.5001551927493946, + "grad_norm": 0.15101674027679882, + "learning_rate": 8.22075892173465e-06, + "loss": 2.7111, + "step": 40275 + }, + { + "epoch": 2.5002172698491525, + "grad_norm": 0.13878932197003277, + "learning_rate": 8.218774998551836e-06, + "loss": 2.741, + "step": 40276 + }, + { + "epoch": 2.5002793469489104, + "grad_norm": 0.14570394520873886, + "learning_rate": 8.216791293351822e-06, + "loss": 2.6349, + "step": 40277 + }, + { + "epoch": 2.5003414240486683, + "grad_norm": 0.13807774590956814, + "learning_rate": 8.214807806144941e-06, + "loss": 2.6239, + "step": 40278 + }, + { + "epoch": 2.5004035011484262, + "grad_norm": 0.136509435369503, + "learning_rate": 8.212824536941549e-06, + "loss": 2.65, + "step": 40279 + }, + { + "epoch": 2.500465578248184, + "grad_norm": 0.15486673557885433, + "learning_rate": 8.210841485752002e-06, + "loss": 2.6682, + "step": 40280 + }, + { + "epoch": 2.500527655347942, + "grad_norm": 0.174156543077749, + "learning_rate": 8.208858652586643e-06, + "loss": 2.6789, + "step": 40281 + }, + { + "epoch": 2.5005897324477, + "grad_norm": 0.17641349049483848, + "learning_rate": 8.206876037455807e-06, + "loss": 2.6474, + "step": 40282 + }, + { + "epoch": 2.500651809547458, + "grad_norm": 0.13659459334920182, + "learning_rate": 8.204893640369838e-06, + "loss": 2.7293, + "step": 40283 + }, + { + "epoch": 2.500713886647216, + "grad_norm": 0.1463274242080877, + "learning_rate": 8.202911461339096e-06, + "loss": 2.7286, + "step": 40284 + }, + { + "epoch": 2.5007759637469738, + "grad_norm": 0.1424246641483368, + "learning_rate": 8.200929500373911e-06, + "loss": 2.7681, + "step": 40285 + }, + { + "epoch": 2.5008380408467317, + "grad_norm": 0.1579897053249171, + "learning_rate": 8.198947757484626e-06, + "loss": 2.7502, + "step": 40286 + }, + { + "epoch": 2.5009001179464896, + "grad_norm": 0.13673223701438508, + "learning_rate": 8.196966232681563e-06, + "loss": 2.6992, + "step": 40287 + }, + { + "epoch": 2.5009621950462475, + "grad_norm": 0.1497414685559116, + "learning_rate": 8.194984925975074e-06, + "loss": 2.7021, + "step": 40288 + }, + { + "epoch": 2.5010242721460054, + "grad_norm": 0.14230374494309048, + "learning_rate": 8.193003837375512e-06, + "loss": 2.7475, + "step": 40289 + }, + { + "epoch": 2.5010863492457633, + "grad_norm": 0.1577128064759182, + "learning_rate": 8.191022966893197e-06, + "loss": 2.7025, + "step": 40290 + }, + { + "epoch": 2.5011484263455213, + "grad_norm": 0.14157581095187896, + "learning_rate": 8.189042314538464e-06, + "loss": 2.6294, + "step": 40291 + }, + { + "epoch": 2.501210503445279, + "grad_norm": 0.15355192294042433, + "learning_rate": 8.18706188032165e-06, + "loss": 2.6612, + "step": 40292 + }, + { + "epoch": 2.501272580545037, + "grad_norm": 0.16605632507478735, + "learning_rate": 8.18508166425307e-06, + "loss": 2.7638, + "step": 40293 + }, + { + "epoch": 2.5013346576447946, + "grad_norm": 0.13259154059128966, + "learning_rate": 8.183101666343085e-06, + "loss": 2.7959, + "step": 40294 + }, + { + "epoch": 2.501396734744553, + "grad_norm": 0.14790888195529106, + "learning_rate": 8.181121886602006e-06, + "loss": 2.701, + "step": 40295 + }, + { + "epoch": 2.5014588118443104, + "grad_norm": 0.16202660251378814, + "learning_rate": 8.179142325040168e-06, + "loss": 2.6904, + "step": 40296 + }, + { + "epoch": 2.5015208889440688, + "grad_norm": 0.14050517010000382, + "learning_rate": 8.177162981667885e-06, + "loss": 2.636, + "step": 40297 + }, + { + "epoch": 2.5015829660438262, + "grad_norm": 0.13674231053749855, + "learning_rate": 8.175183856495511e-06, + "loss": 2.6455, + "step": 40298 + }, + { + "epoch": 2.5016450431435846, + "grad_norm": 0.1416042656476059, + "learning_rate": 8.173204949533347e-06, + "loss": 2.7739, + "step": 40299 + }, + { + "epoch": 2.501707120243342, + "grad_norm": 0.14776949147928742, + "learning_rate": 8.171226260791736e-06, + "loss": 2.6715, + "step": 40300 + }, + { + "epoch": 2.5017691973431, + "grad_norm": 0.1450374605032313, + "learning_rate": 8.169247790280976e-06, + "loss": 2.6715, + "step": 40301 + }, + { + "epoch": 2.501831274442858, + "grad_norm": 0.14275231237718203, + "learning_rate": 8.167269538011419e-06, + "loss": 2.6608, + "step": 40302 + }, + { + "epoch": 2.501893351542616, + "grad_norm": 0.13908370076063928, + "learning_rate": 8.165291503993367e-06, + "loss": 2.7501, + "step": 40303 + }, + { + "epoch": 2.5019554286423737, + "grad_norm": 0.14251099273251616, + "learning_rate": 8.163313688237145e-06, + "loss": 2.684, + "step": 40304 + }, + { + "epoch": 2.5020175057421317, + "grad_norm": 0.1382564110187762, + "learning_rate": 8.16133609075308e-06, + "loss": 2.7279, + "step": 40305 + }, + { + "epoch": 2.5020795828418896, + "grad_norm": 0.1415984909776821, + "learning_rate": 8.159358711551463e-06, + "loss": 2.6758, + "step": 40306 + }, + { + "epoch": 2.5021416599416475, + "grad_norm": 0.14805039323637303, + "learning_rate": 8.15738155064264e-06, + "loss": 2.7561, + "step": 40307 + }, + { + "epoch": 2.5022037370414054, + "grad_norm": 0.1618003772815189, + "learning_rate": 8.155404608036915e-06, + "loss": 2.7081, + "step": 40308 + }, + { + "epoch": 2.5022658141411633, + "grad_norm": 0.1394427525378686, + "learning_rate": 8.153427883744608e-06, + "loss": 2.6386, + "step": 40309 + }, + { + "epoch": 2.5023278912409213, + "grad_norm": 0.1386010835507177, + "learning_rate": 8.151451377776009e-06, + "loss": 2.7372, + "step": 40310 + }, + { + "epoch": 2.502389968340679, + "grad_norm": 0.13755700084062433, + "learning_rate": 8.149475090141462e-06, + "loss": 2.6076, + "step": 40311 + }, + { + "epoch": 2.502452045440437, + "grad_norm": 0.13658091545331089, + "learning_rate": 8.147499020851263e-06, + "loss": 2.5935, + "step": 40312 + }, + { + "epoch": 2.502514122540195, + "grad_norm": 0.1331004989155211, + "learning_rate": 8.14552316991572e-06, + "loss": 2.7171, + "step": 40313 + }, + { + "epoch": 2.502576199639953, + "grad_norm": 0.1502517083375579, + "learning_rate": 8.143547537345141e-06, + "loss": 2.6596, + "step": 40314 + }, + { + "epoch": 2.502638276739711, + "grad_norm": 0.14882312425837582, + "learning_rate": 8.141572123149827e-06, + "loss": 2.7359, + "step": 40315 + }, + { + "epoch": 2.5027003538394688, + "grad_norm": 0.14046597098252517, + "learning_rate": 8.139596927340104e-06, + "loss": 2.7069, + "step": 40316 + }, + { + "epoch": 2.5027624309392267, + "grad_norm": 0.14390402736252084, + "learning_rate": 8.137621949926267e-06, + "loss": 2.7089, + "step": 40317 + }, + { + "epoch": 2.5028245080389846, + "grad_norm": 0.14508807042943872, + "learning_rate": 8.135647190918617e-06, + "loss": 2.6567, + "step": 40318 + }, + { + "epoch": 2.502886585138742, + "grad_norm": 0.15457463915642566, + "learning_rate": 8.133672650327445e-06, + "loss": 2.6755, + "step": 40319 + }, + { + "epoch": 2.5029486622385004, + "grad_norm": 0.15380861797029818, + "learning_rate": 8.131698328163068e-06, + "loss": 2.6304, + "step": 40320 + }, + { + "epoch": 2.503010739338258, + "grad_norm": 0.14653165689966316, + "learning_rate": 8.129724224435798e-06, + "loss": 2.7155, + "step": 40321 + }, + { + "epoch": 2.5030728164380163, + "grad_norm": 0.1365502477142695, + "learning_rate": 8.127750339155921e-06, + "loss": 2.701, + "step": 40322 + }, + { + "epoch": 2.5031348935377737, + "grad_norm": 0.15442581463035848, + "learning_rate": 8.125776672333729e-06, + "loss": 2.6906, + "step": 40323 + }, + { + "epoch": 2.5031969706375317, + "grad_norm": 0.13741230118601028, + "learning_rate": 8.12380322397952e-06, + "loss": 2.7485, + "step": 40324 + }, + { + "epoch": 2.5032590477372896, + "grad_norm": 0.13727955920599927, + "learning_rate": 8.121829994103608e-06, + "loss": 2.7246, + "step": 40325 + }, + { + "epoch": 2.5033211248370475, + "grad_norm": 0.14191622794630984, + "learning_rate": 8.119856982716273e-06, + "loss": 2.6957, + "step": 40326 + }, + { + "epoch": 2.5033832019368054, + "grad_norm": 0.14524562948040032, + "learning_rate": 8.117884189827812e-06, + "loss": 2.6562, + "step": 40327 + }, + { + "epoch": 2.5034452790365633, + "grad_norm": 0.13632570035029806, + "learning_rate": 8.115911615448513e-06, + "loss": 2.6236, + "step": 40328 + }, + { + "epoch": 2.5035073561363212, + "grad_norm": 0.1525815611480208, + "learning_rate": 8.11393925958866e-06, + "loss": 2.699, + "step": 40329 + }, + { + "epoch": 2.503569433236079, + "grad_norm": 0.14247122767955414, + "learning_rate": 8.111967122258568e-06, + "loss": 2.7555, + "step": 40330 + }, + { + "epoch": 2.503631510335837, + "grad_norm": 0.13716904871299695, + "learning_rate": 8.109995203468512e-06, + "loss": 2.7578, + "step": 40331 + }, + { + "epoch": 2.503693587435595, + "grad_norm": 0.14041597010183138, + "learning_rate": 8.108023503228778e-06, + "loss": 2.7216, + "step": 40332 + }, + { + "epoch": 2.503755664535353, + "grad_norm": 0.14997233393957934, + "learning_rate": 8.106052021549649e-06, + "loss": 2.7368, + "step": 40333 + }, + { + "epoch": 2.503817741635111, + "grad_norm": 0.14950565572513452, + "learning_rate": 8.104080758441424e-06, + "loss": 2.7212, + "step": 40334 + }, + { + "epoch": 2.5038798187348688, + "grad_norm": 0.135711769428221, + "learning_rate": 8.10210971391438e-06, + "loss": 2.6676, + "step": 40335 + }, + { + "epoch": 2.5039418958346267, + "grad_norm": 0.14077336379221342, + "learning_rate": 8.100138887978802e-06, + "loss": 2.6425, + "step": 40336 + }, + { + "epoch": 2.5040039729343846, + "grad_norm": 0.1451919186113088, + "learning_rate": 8.098168280644957e-06, + "loss": 2.7931, + "step": 40337 + }, + { + "epoch": 2.5040660500341425, + "grad_norm": 0.14089156423566393, + "learning_rate": 8.096197891923151e-06, + "loss": 2.6968, + "step": 40338 + }, + { + "epoch": 2.5041281271339004, + "grad_norm": 0.16847231146023953, + "learning_rate": 8.094227721823661e-06, + "loss": 2.7145, + "step": 40339 + }, + { + "epoch": 2.5041902042336583, + "grad_norm": 0.14418501881034215, + "learning_rate": 8.092257770356748e-06, + "loss": 2.7317, + "step": 40340 + }, + { + "epoch": 2.5042522813334163, + "grad_norm": 0.1535817486798022, + "learning_rate": 8.090288037532706e-06, + "loss": 2.6869, + "step": 40341 + }, + { + "epoch": 2.5043143584331737, + "grad_norm": 0.13573909316892777, + "learning_rate": 8.088318523361793e-06, + "loss": 2.7157, + "step": 40342 + }, + { + "epoch": 2.504376435532932, + "grad_norm": 0.1417733235938128, + "learning_rate": 8.086349227854306e-06, + "loss": 2.6829, + "step": 40343 + }, + { + "epoch": 2.5044385126326896, + "grad_norm": 0.15537255800181815, + "learning_rate": 8.08438015102051e-06, + "loss": 2.7213, + "step": 40344 + }, + { + "epoch": 2.504500589732448, + "grad_norm": 0.1683184565142798, + "learning_rate": 8.082411292870678e-06, + "loss": 2.696, + "step": 40345 + }, + { + "epoch": 2.5045626668322054, + "grad_norm": 0.1402803011101786, + "learning_rate": 8.080442653415071e-06, + "loss": 2.6388, + "step": 40346 + }, + { + "epoch": 2.5046247439319638, + "grad_norm": 0.13468002973493454, + "learning_rate": 8.078474232663985e-06, + "loss": 2.7249, + "step": 40347 + }, + { + "epoch": 2.5046868210317212, + "grad_norm": 0.14486472218725063, + "learning_rate": 8.076506030627672e-06, + "loss": 2.6517, + "step": 40348 + }, + { + "epoch": 2.504748898131479, + "grad_norm": 0.13976908187472, + "learning_rate": 8.07453804731641e-06, + "loss": 2.6608, + "step": 40349 + }, + { + "epoch": 2.504810975231237, + "grad_norm": 0.1600933300260761, + "learning_rate": 8.072570282740444e-06, + "loss": 2.8163, + "step": 40350 + }, + { + "epoch": 2.504873052330995, + "grad_norm": 0.1452826671843026, + "learning_rate": 8.070602736910071e-06, + "loss": 2.7295, + "step": 40351 + }, + { + "epoch": 2.504935129430753, + "grad_norm": 0.15356774015805275, + "learning_rate": 8.06863540983554e-06, + "loss": 2.6789, + "step": 40352 + }, + { + "epoch": 2.504997206530511, + "grad_norm": 0.1470961764568876, + "learning_rate": 8.066668301527108e-06, + "loss": 2.6246, + "step": 40353 + }, + { + "epoch": 2.5050592836302688, + "grad_norm": 0.18419785233015984, + "learning_rate": 8.064701411995057e-06, + "loss": 2.6231, + "step": 40354 + }, + { + "epoch": 2.5051213607300267, + "grad_norm": 0.13411680161082914, + "learning_rate": 8.06273474124964e-06, + "loss": 2.7094, + "step": 40355 + }, + { + "epoch": 2.5051834378297846, + "grad_norm": 0.13505961651528584, + "learning_rate": 8.060768289301108e-06, + "loss": 2.6402, + "step": 40356 + }, + { + "epoch": 2.5052455149295425, + "grad_norm": 0.13314404897559107, + "learning_rate": 8.058802056159737e-06, + "loss": 2.6295, + "step": 40357 + }, + { + "epoch": 2.5053075920293004, + "grad_norm": 0.1495275378189974, + "learning_rate": 8.056836041835774e-06, + "loss": 2.7071, + "step": 40358 + }, + { + "epoch": 2.5053696691290583, + "grad_norm": 0.137960044201679, + "learning_rate": 8.054870246339486e-06, + "loss": 2.6739, + "step": 40359 + }, + { + "epoch": 2.5054317462288163, + "grad_norm": 0.1407695067521829, + "learning_rate": 8.052904669681105e-06, + "loss": 2.7497, + "step": 40360 + }, + { + "epoch": 2.505493823328574, + "grad_norm": 0.13876859891445398, + "learning_rate": 8.05093931187092e-06, + "loss": 2.7552, + "step": 40361 + }, + { + "epoch": 2.505555900428332, + "grad_norm": 0.13540776999910417, + "learning_rate": 8.048974172919167e-06, + "loss": 2.7899, + "step": 40362 + }, + { + "epoch": 2.50561797752809, + "grad_norm": 0.13168156380619386, + "learning_rate": 8.047009252836096e-06, + "loss": 2.6523, + "step": 40363 + }, + { + "epoch": 2.505680054627848, + "grad_norm": 0.16142261915115694, + "learning_rate": 8.045044551631964e-06, + "loss": 2.7125, + "step": 40364 + }, + { + "epoch": 2.505742131727606, + "grad_norm": 0.14568563131290138, + "learning_rate": 8.04308006931701e-06, + "loss": 2.6072, + "step": 40365 + }, + { + "epoch": 2.5058042088273638, + "grad_norm": 0.15790180073314364, + "learning_rate": 8.0411158059015e-06, + "loss": 2.6178, + "step": 40366 + }, + { + "epoch": 2.5058662859271212, + "grad_norm": 0.1536591788061875, + "learning_rate": 8.03915176139568e-06, + "loss": 2.712, + "step": 40367 + }, + { + "epoch": 2.5059283630268796, + "grad_norm": 0.1471326027378857, + "learning_rate": 8.037187935809787e-06, + "loss": 2.6796, + "step": 40368 + }, + { + "epoch": 2.505990440126637, + "grad_norm": 0.13824681160321609, + "learning_rate": 8.035224329154062e-06, + "loss": 2.75, + "step": 40369 + }, + { + "epoch": 2.5060525172263954, + "grad_norm": 0.13984723379814476, + "learning_rate": 8.03326094143877e-06, + "loss": 2.6985, + "step": 40370 + }, + { + "epoch": 2.506114594326153, + "grad_norm": 0.14597524538739814, + "learning_rate": 8.031297772674146e-06, + "loss": 2.6809, + "step": 40371 + }, + { + "epoch": 2.506176671425911, + "grad_norm": 0.15716724579991947, + "learning_rate": 8.029334822870427e-06, + "loss": 2.7585, + "step": 40372 + }, + { + "epoch": 2.5062387485256687, + "grad_norm": 0.14340344720837633, + "learning_rate": 8.027372092037843e-06, + "loss": 2.6842, + "step": 40373 + }, + { + "epoch": 2.5063008256254267, + "grad_norm": 0.14746960873403306, + "learning_rate": 8.025409580186666e-06, + "loss": 2.7153, + "step": 40374 + }, + { + "epoch": 2.5063629027251846, + "grad_norm": 0.1410919490120513, + "learning_rate": 8.023447287327113e-06, + "loss": 2.7358, + "step": 40375 + }, + { + "epoch": 2.5064249798249425, + "grad_norm": 0.13808351943557567, + "learning_rate": 8.021485213469426e-06, + "loss": 2.7366, + "step": 40376 + }, + { + "epoch": 2.5064870569247004, + "grad_norm": 0.13814873137656103, + "learning_rate": 8.019523358623843e-06, + "loss": 2.7241, + "step": 40377 + }, + { + "epoch": 2.5065491340244583, + "grad_norm": 0.1604624124629409, + "learning_rate": 8.017561722800582e-06, + "loss": 2.7691, + "step": 40378 + }, + { + "epoch": 2.5066112111242163, + "grad_norm": 0.13664207351946253, + "learning_rate": 8.015600306009907e-06, + "loss": 2.7097, + "step": 40379 + }, + { + "epoch": 2.506673288223974, + "grad_norm": 0.14265888356872897, + "learning_rate": 8.01363910826204e-06, + "loss": 2.7358, + "step": 40380 + }, + { + "epoch": 2.506735365323732, + "grad_norm": 0.13671545978401792, + "learning_rate": 8.011678129567208e-06, + "loss": 2.7194, + "step": 40381 + }, + { + "epoch": 2.50679744242349, + "grad_norm": 0.1369840364231169, + "learning_rate": 8.009717369935631e-06, + "loss": 2.6369, + "step": 40382 + }, + { + "epoch": 2.506859519523248, + "grad_norm": 0.14515635910276453, + "learning_rate": 8.007756829377566e-06, + "loss": 2.6667, + "step": 40383 + }, + { + "epoch": 2.506921596623006, + "grad_norm": 0.14273356756390143, + "learning_rate": 8.005796507903223e-06, + "loss": 2.7877, + "step": 40384 + }, + { + "epoch": 2.5069836737227638, + "grad_norm": 0.13940819168662627, + "learning_rate": 8.00383640552283e-06, + "loss": 2.5961, + "step": 40385 + }, + { + "epoch": 2.5070457508225217, + "grad_norm": 0.1631524812544026, + "learning_rate": 8.001876522246627e-06, + "loss": 2.6743, + "step": 40386 + }, + { + "epoch": 2.5071078279222796, + "grad_norm": 0.17160586938939126, + "learning_rate": 7.999916858084811e-06, + "loss": 2.672, + "step": 40387 + }, + { + "epoch": 2.5071699050220375, + "grad_norm": 0.15126581796973274, + "learning_rate": 7.99795741304764e-06, + "loss": 2.7623, + "step": 40388 + }, + { + "epoch": 2.5072319821217954, + "grad_norm": 0.1400494918770273, + "learning_rate": 7.995998187145327e-06, + "loss": 2.7004, + "step": 40389 + }, + { + "epoch": 2.507294059221553, + "grad_norm": 0.14987793142738473, + "learning_rate": 7.994039180388085e-06, + "loss": 2.7182, + "step": 40390 + }, + { + "epoch": 2.5073561363213113, + "grad_norm": 0.1374882703893244, + "learning_rate": 7.992080392786133e-06, + "loss": 2.7547, + "step": 40391 + }, + { + "epoch": 2.5074182134210687, + "grad_norm": 0.13903693382047078, + "learning_rate": 7.990121824349689e-06, + "loss": 2.713, + "step": 40392 + }, + { + "epoch": 2.507480290520827, + "grad_norm": 0.14112927171334155, + "learning_rate": 7.988163475088989e-06, + "loss": 2.6496, + "step": 40393 + }, + { + "epoch": 2.5075423676205846, + "grad_norm": 0.16050147437929174, + "learning_rate": 7.986205345014241e-06, + "loss": 2.6825, + "step": 40394 + }, + { + "epoch": 2.507604444720343, + "grad_norm": 0.14418445039797317, + "learning_rate": 7.984247434135655e-06, + "loss": 2.8111, + "step": 40395 + }, + { + "epoch": 2.5076665218201004, + "grad_norm": 0.1370480046854723, + "learning_rate": 7.98228974246344e-06, + "loss": 2.6892, + "step": 40396 + }, + { + "epoch": 2.5077285989198583, + "grad_norm": 0.18062959290641206, + "learning_rate": 7.980332270007828e-06, + "loss": 2.7604, + "step": 40397 + }, + { + "epoch": 2.5077906760196162, + "grad_norm": 0.13805155966883514, + "learning_rate": 7.978375016779028e-06, + "loss": 2.7056, + "step": 40398 + }, + { + "epoch": 2.507852753119374, + "grad_norm": 0.13122991510391688, + "learning_rate": 7.976417982787243e-06, + "loss": 2.6725, + "step": 40399 + }, + { + "epoch": 2.507914830219132, + "grad_norm": 0.1668251886837351, + "learning_rate": 7.974461168042675e-06, + "loss": 2.6826, + "step": 40400 + }, + { + "epoch": 2.50797690731889, + "grad_norm": 0.1366150121855419, + "learning_rate": 7.972504572555556e-06, + "loss": 2.7597, + "step": 40401 + }, + { + "epoch": 2.508038984418648, + "grad_norm": 0.16604247717517948, + "learning_rate": 7.970548196336087e-06, + "loss": 2.7495, + "step": 40402 + }, + { + "epoch": 2.508101061518406, + "grad_norm": 0.13664836434136687, + "learning_rate": 7.968592039394467e-06, + "loss": 2.6602, + "step": 40403 + }, + { + "epoch": 2.5081631386181638, + "grad_norm": 0.13592055149313237, + "learning_rate": 7.96663610174091e-06, + "loss": 2.7001, + "step": 40404 + }, + { + "epoch": 2.5082252157179217, + "grad_norm": 0.14106690987956125, + "learning_rate": 7.964680383385603e-06, + "loss": 2.7359, + "step": 40405 + }, + { + "epoch": 2.5082872928176796, + "grad_norm": 0.1457068534864921, + "learning_rate": 7.962724884338779e-06, + "loss": 2.6444, + "step": 40406 + }, + { + "epoch": 2.5083493699174375, + "grad_norm": 0.1380938922905518, + "learning_rate": 7.960769604610619e-06, + "loss": 2.671, + "step": 40407 + }, + { + "epoch": 2.5084114470171954, + "grad_norm": 0.1422265892479297, + "learning_rate": 7.958814544211329e-06, + "loss": 2.731, + "step": 40408 + }, + { + "epoch": 2.5084735241169533, + "grad_norm": 0.13212849686722022, + "learning_rate": 7.956859703151099e-06, + "loss": 2.6193, + "step": 40409 + }, + { + "epoch": 2.5085356012167113, + "grad_norm": 0.15064517685385745, + "learning_rate": 7.95490508144015e-06, + "loss": 2.7034, + "step": 40410 + }, + { + "epoch": 2.508597678316469, + "grad_norm": 0.15564438158205918, + "learning_rate": 7.952950679088667e-06, + "loss": 2.6374, + "step": 40411 + }, + { + "epoch": 2.508659755416227, + "grad_norm": 0.14067380111360406, + "learning_rate": 7.95099649610685e-06, + "loss": 2.7598, + "step": 40412 + }, + { + "epoch": 2.508721832515985, + "grad_norm": 0.1426124270210678, + "learning_rate": 7.949042532504892e-06, + "loss": 2.6454, + "step": 40413 + }, + { + "epoch": 2.508783909615743, + "grad_norm": 0.1877212312012699, + "learning_rate": 7.947088788292972e-06, + "loss": 2.627, + "step": 40414 + }, + { + "epoch": 2.5088459867155004, + "grad_norm": 0.1569076229366643, + "learning_rate": 7.945135263481313e-06, + "loss": 2.6933, + "step": 40415 + }, + { + "epoch": 2.5089080638152588, + "grad_norm": 0.14098360776010754, + "learning_rate": 7.943181958080087e-06, + "loss": 2.672, + "step": 40416 + }, + { + "epoch": 2.5089701409150162, + "grad_norm": 0.14021705690521785, + "learning_rate": 7.9412288720995e-06, + "loss": 2.6577, + "step": 40417 + }, + { + "epoch": 2.5090322180147746, + "grad_norm": 0.13671468319975147, + "learning_rate": 7.939276005549712e-06, + "loss": 2.6756, + "step": 40418 + }, + { + "epoch": 2.509094295114532, + "grad_norm": 0.13864658689282175, + "learning_rate": 7.937323358440935e-06, + "loss": 2.7805, + "step": 40419 + }, + { + "epoch": 2.50915637221429, + "grad_norm": 0.14747586711564603, + "learning_rate": 7.935370930783365e-06, + "loss": 2.7007, + "step": 40420 + }, + { + "epoch": 2.509218449314048, + "grad_norm": 0.1390373491195701, + "learning_rate": 7.933418722587177e-06, + "loss": 2.7075, + "step": 40421 + }, + { + "epoch": 2.509280526413806, + "grad_norm": 0.14054505406928797, + "learning_rate": 7.931466733862558e-06, + "loss": 2.7119, + "step": 40422 + }, + { + "epoch": 2.5093426035135638, + "grad_norm": 0.1370227363381873, + "learning_rate": 7.929514964619671e-06, + "loss": 2.7067, + "step": 40423 + }, + { + "epoch": 2.5094046806133217, + "grad_norm": 0.1354628164868496, + "learning_rate": 7.927563414868732e-06, + "loss": 2.6364, + "step": 40424 + }, + { + "epoch": 2.5094667577130796, + "grad_norm": 0.13879646158076295, + "learning_rate": 7.92561208461991e-06, + "loss": 2.7399, + "step": 40425 + }, + { + "epoch": 2.5095288348128375, + "grad_norm": 0.14766493322685773, + "learning_rate": 7.923660973883384e-06, + "loss": 2.7002, + "step": 40426 + }, + { + "epoch": 2.5095909119125954, + "grad_norm": 0.14583230160166583, + "learning_rate": 7.921710082669331e-06, + "loss": 2.7124, + "step": 40427 + }, + { + "epoch": 2.5096529890123533, + "grad_norm": 0.13675616978408095, + "learning_rate": 7.91975941098792e-06, + "loss": 2.7084, + "step": 40428 + }, + { + "epoch": 2.5097150661121113, + "grad_norm": 0.14142451563515002, + "learning_rate": 7.917808958849354e-06, + "loss": 2.708, + "step": 40429 + }, + { + "epoch": 2.509777143211869, + "grad_norm": 0.143776061193871, + "learning_rate": 7.915858726263791e-06, + "loss": 2.7122, + "step": 40430 + }, + { + "epoch": 2.509839220311627, + "grad_norm": 0.13402867960983336, + "learning_rate": 7.913908713241407e-06, + "loss": 2.6729, + "step": 40431 + }, + { + "epoch": 2.509901297411385, + "grad_norm": 0.13548472570972894, + "learning_rate": 7.911958919792373e-06, + "loss": 2.6937, + "step": 40432 + }, + { + "epoch": 2.509963374511143, + "grad_norm": 0.1369893325603898, + "learning_rate": 7.910009345926872e-06, + "loss": 2.6589, + "step": 40433 + }, + { + "epoch": 2.510025451610901, + "grad_norm": 0.1400592071217618, + "learning_rate": 7.908059991655076e-06, + "loss": 2.735, + "step": 40434 + }, + { + "epoch": 2.5100875287106588, + "grad_norm": 0.1684061775295847, + "learning_rate": 7.906110856987142e-06, + "loss": 2.7434, + "step": 40435 + }, + { + "epoch": 2.5101496058104167, + "grad_norm": 0.1378698392768549, + "learning_rate": 7.904161941933241e-06, + "loss": 2.7579, + "step": 40436 + }, + { + "epoch": 2.5102116829101746, + "grad_norm": 0.15619820093809766, + "learning_rate": 7.902213246503553e-06, + "loss": 2.6966, + "step": 40437 + }, + { + "epoch": 2.510273760009932, + "grad_norm": 0.14160890231347728, + "learning_rate": 7.900264770708243e-06, + "loss": 2.6844, + "step": 40438 + }, + { + "epoch": 2.5103358371096904, + "grad_norm": 0.13922553283531564, + "learning_rate": 7.89831651455747e-06, + "loss": 2.6611, + "step": 40439 + }, + { + "epoch": 2.510397914209448, + "grad_norm": 0.13335013262025527, + "learning_rate": 7.8963684780614e-06, + "loss": 2.6735, + "step": 40440 + }, + { + "epoch": 2.5104599913092063, + "grad_norm": 0.14438005864788905, + "learning_rate": 7.894420661230183e-06, + "loss": 2.8214, + "step": 40441 + }, + { + "epoch": 2.5105220684089637, + "grad_norm": 0.15183690929764296, + "learning_rate": 7.892473064074008e-06, + "loss": 2.7216, + "step": 40442 + }, + { + "epoch": 2.510584145508722, + "grad_norm": 0.140740208322043, + "learning_rate": 7.890525686603023e-06, + "loss": 2.7294, + "step": 40443 + }, + { + "epoch": 2.5106462226084796, + "grad_norm": 0.13686299208220645, + "learning_rate": 7.88857852882739e-06, + "loss": 2.6849, + "step": 40444 + }, + { + "epoch": 2.5107082997082375, + "grad_norm": 0.14217352615061724, + "learning_rate": 7.886631590757254e-06, + "loss": 2.6599, + "step": 40445 + }, + { + "epoch": 2.5107703768079954, + "grad_norm": 0.13223511345625708, + "learning_rate": 7.884684872402792e-06, + "loss": 2.7419, + "step": 40446 + }, + { + "epoch": 2.5108324539077533, + "grad_norm": 0.15148988488472526, + "learning_rate": 7.882738373774156e-06, + "loss": 2.7342, + "step": 40447 + }, + { + "epoch": 2.5108945310075113, + "grad_norm": 0.13485378339664764, + "learning_rate": 7.880792094881496e-06, + "loss": 2.6434, + "step": 40448 + }, + { + "epoch": 2.510956608107269, + "grad_norm": 0.14075128527481331, + "learning_rate": 7.87884603573497e-06, + "loss": 2.6299, + "step": 40449 + }, + { + "epoch": 2.511018685207027, + "grad_norm": 0.13224512812270775, + "learning_rate": 7.876900196344716e-06, + "loss": 2.6862, + "step": 40450 + }, + { + "epoch": 2.511080762306785, + "grad_norm": 0.139093898593135, + "learning_rate": 7.874954576720906e-06, + "loss": 2.7342, + "step": 40451 + }, + { + "epoch": 2.511142839406543, + "grad_norm": 0.13456752979075617, + "learning_rate": 7.873009176873687e-06, + "loss": 2.7319, + "step": 40452 + }, + { + "epoch": 2.511204916506301, + "grad_norm": 0.14021060072646452, + "learning_rate": 7.871063996813211e-06, + "loss": 2.6992, + "step": 40453 + }, + { + "epoch": 2.5112669936060588, + "grad_norm": 0.13493476150105016, + "learning_rate": 7.869119036549622e-06, + "loss": 2.7434, + "step": 40454 + }, + { + "epoch": 2.5113290707058167, + "grad_norm": 0.14031035269769992, + "learning_rate": 7.867174296093049e-06, + "loss": 2.7885, + "step": 40455 + }, + { + "epoch": 2.5113911478055746, + "grad_norm": 0.13139840744465478, + "learning_rate": 7.865229775453676e-06, + "loss": 2.7598, + "step": 40456 + }, + { + "epoch": 2.5114532249053325, + "grad_norm": 0.13481411545271485, + "learning_rate": 7.86328547464162e-06, + "loss": 2.6862, + "step": 40457 + }, + { + "epoch": 2.5115153020050904, + "grad_norm": 0.15408558200608916, + "learning_rate": 7.861341393667037e-06, + "loss": 2.6715, + "step": 40458 + }, + { + "epoch": 2.5115773791048484, + "grad_norm": 0.15040192004307265, + "learning_rate": 7.859397532540053e-06, + "loss": 2.8343, + "step": 40459 + }, + { + "epoch": 2.5116394562046063, + "grad_norm": 0.1385527488630946, + "learning_rate": 7.857453891270832e-06, + "loss": 2.664, + "step": 40460 + }, + { + "epoch": 2.5117015333043637, + "grad_norm": 0.14425485189061235, + "learning_rate": 7.85551046986951e-06, + "loss": 2.6549, + "step": 40461 + }, + { + "epoch": 2.511763610404122, + "grad_norm": 0.13359574461446216, + "learning_rate": 7.853567268346213e-06, + "loss": 2.6909, + "step": 40462 + }, + { + "epoch": 2.5118256875038796, + "grad_norm": 0.13807636994017775, + "learning_rate": 7.85162428671109e-06, + "loss": 2.7396, + "step": 40463 + }, + { + "epoch": 2.511887764603638, + "grad_norm": 0.1398050590354585, + "learning_rate": 7.849681524974267e-06, + "loss": 2.713, + "step": 40464 + }, + { + "epoch": 2.5119498417033954, + "grad_norm": 0.13451317624780962, + "learning_rate": 7.847738983145897e-06, + "loss": 2.7673, + "step": 40465 + }, + { + "epoch": 2.5120119188031538, + "grad_norm": 0.14695022259715154, + "learning_rate": 7.845796661236104e-06, + "loss": 2.6657, + "step": 40466 + }, + { + "epoch": 2.5120739959029112, + "grad_norm": 0.14120390071193603, + "learning_rate": 7.84385455925502e-06, + "loss": 2.777, + "step": 40467 + }, + { + "epoch": 2.512136073002669, + "grad_norm": 0.14686810776589448, + "learning_rate": 7.841912677212777e-06, + "loss": 2.7567, + "step": 40468 + }, + { + "epoch": 2.512198150102427, + "grad_norm": 0.1369817996104436, + "learning_rate": 7.839971015119518e-06, + "loss": 2.7443, + "step": 40469 + }, + { + "epoch": 2.512260227202185, + "grad_norm": 0.1405242916939507, + "learning_rate": 7.838029572985362e-06, + "loss": 2.7298, + "step": 40470 + }, + { + "epoch": 2.512322304301943, + "grad_norm": 0.21426508742494313, + "learning_rate": 7.836088350820442e-06, + "loss": 2.7378, + "step": 40471 + }, + { + "epoch": 2.512384381401701, + "grad_norm": 0.14792170756664896, + "learning_rate": 7.834147348634879e-06, + "loss": 2.6768, + "step": 40472 + }, + { + "epoch": 2.5124464585014588, + "grad_norm": 0.16579292731725767, + "learning_rate": 7.83220656643881e-06, + "loss": 2.7269, + "step": 40473 + }, + { + "epoch": 2.5125085356012167, + "grad_norm": 0.14661152090010282, + "learning_rate": 7.83026600424236e-06, + "loss": 2.6497, + "step": 40474 + }, + { + "epoch": 2.5125706127009746, + "grad_norm": 0.1350712972253501, + "learning_rate": 7.828325662055647e-06, + "loss": 2.639, + "step": 40475 + }, + { + "epoch": 2.5126326898007325, + "grad_norm": 0.12991123318744113, + "learning_rate": 7.826385539888798e-06, + "loss": 2.6915, + "step": 40476 + }, + { + "epoch": 2.5126947669004904, + "grad_norm": 0.14206142075712305, + "learning_rate": 7.82444563775192e-06, + "loss": 2.6695, + "step": 40477 + }, + { + "epoch": 2.5127568440002483, + "grad_norm": 0.1419407563812339, + "learning_rate": 7.822505955655162e-06, + "loss": 2.6288, + "step": 40478 + }, + { + "epoch": 2.5128189211000063, + "grad_norm": 0.16123191791255156, + "learning_rate": 7.820566493608628e-06, + "loss": 2.6209, + "step": 40479 + }, + { + "epoch": 2.512880998199764, + "grad_norm": 0.13391981905199246, + "learning_rate": 7.818627251622435e-06, + "loss": 2.7261, + "step": 40480 + }, + { + "epoch": 2.512943075299522, + "grad_norm": 0.15233912216395543, + "learning_rate": 7.816688229706699e-06, + "loss": 2.7078, + "step": 40481 + }, + { + "epoch": 2.51300515239928, + "grad_norm": 0.1388891626560639, + "learning_rate": 7.814749427871543e-06, + "loss": 2.6603, + "step": 40482 + }, + { + "epoch": 2.513067229499038, + "grad_norm": 0.15818493938709732, + "learning_rate": 7.81281084612709e-06, + "loss": 2.7083, + "step": 40483 + }, + { + "epoch": 2.513129306598796, + "grad_norm": 0.1360317887042066, + "learning_rate": 7.810872484483422e-06, + "loss": 2.7642, + "step": 40484 + }, + { + "epoch": 2.5131913836985538, + "grad_norm": 0.13140712512084365, + "learning_rate": 7.808934342950696e-06, + "loss": 2.6582, + "step": 40485 + }, + { + "epoch": 2.5132534607983112, + "grad_norm": 0.16629830024479095, + "learning_rate": 7.806996421538981e-06, + "loss": 2.6908, + "step": 40486 + }, + { + "epoch": 2.5133155378980696, + "grad_norm": 0.15107927320097092, + "learning_rate": 7.805058720258428e-06, + "loss": 2.7001, + "step": 40487 + }, + { + "epoch": 2.513377614997827, + "grad_norm": 0.13391716247169888, + "learning_rate": 7.803121239119121e-06, + "loss": 2.7179, + "step": 40488 + }, + { + "epoch": 2.5134396920975854, + "grad_norm": 0.14031127531965737, + "learning_rate": 7.801183978131177e-06, + "loss": 2.7389, + "step": 40489 + }, + { + "epoch": 2.513501769197343, + "grad_norm": 0.13769968174705075, + "learning_rate": 7.7992469373047e-06, + "loss": 2.7085, + "step": 40490 + }, + { + "epoch": 2.513563846297101, + "grad_norm": 0.13977066772984847, + "learning_rate": 7.797310116649786e-06, + "loss": 2.703, + "step": 40491 + }, + { + "epoch": 2.5136259233968588, + "grad_norm": 0.13982768659131845, + "learning_rate": 7.795373516176563e-06, + "loss": 2.7621, + "step": 40492 + }, + { + "epoch": 2.5136880004966167, + "grad_norm": 0.1432099517302215, + "learning_rate": 7.793437135895115e-06, + "loss": 2.7144, + "step": 40493 + }, + { + "epoch": 2.5137500775963746, + "grad_norm": 0.1501500695273511, + "learning_rate": 7.791500975815557e-06, + "loss": 2.7719, + "step": 40494 + }, + { + "epoch": 2.5138121546961325, + "grad_norm": 0.1404751270403935, + "learning_rate": 7.789565035947972e-06, + "loss": 2.6987, + "step": 40495 + }, + { + "epoch": 2.5138742317958904, + "grad_norm": 0.13724707972654313, + "learning_rate": 7.787629316302486e-06, + "loss": 2.7177, + "step": 40496 + }, + { + "epoch": 2.5139363088956483, + "grad_norm": 0.1539876591851382, + "learning_rate": 7.785693816889184e-06, + "loss": 2.6388, + "step": 40497 + }, + { + "epoch": 2.5139983859954063, + "grad_norm": 0.1487836691273717, + "learning_rate": 7.783758537718167e-06, + "loss": 2.7384, + "step": 40498 + }, + { + "epoch": 2.514060463095164, + "grad_norm": 0.14383271600087796, + "learning_rate": 7.781823478799528e-06, + "loss": 2.6704, + "step": 40499 + }, + { + "epoch": 2.514122540194922, + "grad_norm": 0.1373104777953367, + "learning_rate": 7.779888640143351e-06, + "loss": 2.6882, + "step": 40500 + }, + { + "epoch": 2.51418461729468, + "grad_norm": 0.13750065239838236, + "learning_rate": 7.77795402175976e-06, + "loss": 2.7165, + "step": 40501 + }, + { + "epoch": 2.514246694394438, + "grad_norm": 0.15084192754918108, + "learning_rate": 7.776019623658831e-06, + "loss": 2.7408, + "step": 40502 + }, + { + "epoch": 2.514308771494196, + "grad_norm": 0.14826173791645295, + "learning_rate": 7.774085445850653e-06, + "loss": 2.6806, + "step": 40503 + }, + { + "epoch": 2.5143708485939538, + "grad_norm": 0.14178268424660895, + "learning_rate": 7.77215148834532e-06, + "loss": 2.6129, + "step": 40504 + }, + { + "epoch": 2.5144329256937117, + "grad_norm": 0.14803867342533056, + "learning_rate": 7.770217751152931e-06, + "loss": 2.7489, + "step": 40505 + }, + { + "epoch": 2.5144950027934696, + "grad_norm": 0.14029121522100987, + "learning_rate": 7.768284234283563e-06, + "loss": 2.7204, + "step": 40506 + }, + { + "epoch": 2.5145570798932275, + "grad_norm": 0.13636974338877925, + "learning_rate": 7.766350937747314e-06, + "loss": 2.7266, + "step": 40507 + }, + { + "epoch": 2.5146191569929854, + "grad_norm": 0.13805209936328344, + "learning_rate": 7.764417861554251e-06, + "loss": 2.7171, + "step": 40508 + }, + { + "epoch": 2.514681234092743, + "grad_norm": 0.15857393465135303, + "learning_rate": 7.762485005714487e-06, + "loss": 2.7541, + "step": 40509 + }, + { + "epoch": 2.5147433111925013, + "grad_norm": 0.14299432546604096, + "learning_rate": 7.76055237023809e-06, + "loss": 2.683, + "step": 40510 + }, + { + "epoch": 2.5148053882922587, + "grad_norm": 0.14276719294928353, + "learning_rate": 7.758619955135143e-06, + "loss": 2.7333, + "step": 40511 + }, + { + "epoch": 2.514867465392017, + "grad_norm": 0.15279210577653665, + "learning_rate": 7.756687760415737e-06, + "loss": 2.7066, + "step": 40512 + }, + { + "epoch": 2.5149295424917746, + "grad_norm": 0.137227562559476, + "learning_rate": 7.754755786089929e-06, + "loss": 2.6564, + "step": 40513 + }, + { + "epoch": 2.514991619591533, + "grad_norm": 0.13943483783056307, + "learning_rate": 7.752824032167827e-06, + "loss": 2.643, + "step": 40514 + }, + { + "epoch": 2.5150536966912904, + "grad_norm": 0.13841933774964146, + "learning_rate": 7.750892498659501e-06, + "loss": 2.7682, + "step": 40515 + }, + { + "epoch": 2.5151157737910483, + "grad_norm": 0.13851153143370348, + "learning_rate": 7.74896118557501e-06, + "loss": 2.7064, + "step": 40516 + }, + { + "epoch": 2.5151778508908063, + "grad_norm": 0.14672793167683562, + "learning_rate": 7.747030092924457e-06, + "loss": 2.7149, + "step": 40517 + }, + { + "epoch": 2.515239927990564, + "grad_norm": 0.15307824652085752, + "learning_rate": 7.745099220717894e-06, + "loss": 2.7075, + "step": 40518 + }, + { + "epoch": 2.515302005090322, + "grad_norm": 0.14162402796937806, + "learning_rate": 7.74316856896542e-06, + "loss": 2.6417, + "step": 40519 + }, + { + "epoch": 2.51536408219008, + "grad_norm": 0.15453476011804462, + "learning_rate": 7.741238137677093e-06, + "loss": 2.7679, + "step": 40520 + }, + { + "epoch": 2.515426159289838, + "grad_norm": 0.15122260294808781, + "learning_rate": 7.739307926862987e-06, + "loss": 2.6175, + "step": 40521 + }, + { + "epoch": 2.515488236389596, + "grad_norm": 0.1513305312040706, + "learning_rate": 7.737377936533158e-06, + "loss": 2.7129, + "step": 40522 + }, + { + "epoch": 2.5155503134893538, + "grad_norm": 0.15956285498946043, + "learning_rate": 7.735448166697695e-06, + "loss": 2.7941, + "step": 40523 + }, + { + "epoch": 2.5156123905891117, + "grad_norm": 0.1357187827261987, + "learning_rate": 7.733518617366664e-06, + "loss": 2.6345, + "step": 40524 + }, + { + "epoch": 2.5156744676888696, + "grad_norm": 0.1579831383469701, + "learning_rate": 7.73158928855013e-06, + "loss": 2.5625, + "step": 40525 + }, + { + "epoch": 2.5157365447886275, + "grad_norm": 0.14553860204202906, + "learning_rate": 7.729660180258148e-06, + "loss": 2.7449, + "step": 40526 + }, + { + "epoch": 2.5157986218883854, + "grad_norm": 0.14665765685774917, + "learning_rate": 7.727731292500784e-06, + "loss": 2.6156, + "step": 40527 + }, + { + "epoch": 2.5158606989881434, + "grad_norm": 0.13511005560345182, + "learning_rate": 7.725802625288114e-06, + "loss": 2.7046, + "step": 40528 + }, + { + "epoch": 2.5159227760879013, + "grad_norm": 0.14076866652532768, + "learning_rate": 7.723874178630197e-06, + "loss": 2.8061, + "step": 40529 + }, + { + "epoch": 2.515984853187659, + "grad_norm": 0.14183563091609497, + "learning_rate": 7.72194595253709e-06, + "loss": 2.7138, + "step": 40530 + }, + { + "epoch": 2.516046930287417, + "grad_norm": 0.14558865394442844, + "learning_rate": 7.720017947018848e-06, + "loss": 2.8705, + "step": 40531 + }, + { + "epoch": 2.516109007387175, + "grad_norm": 0.13279378282264084, + "learning_rate": 7.718090162085546e-06, + "loss": 2.741, + "step": 40532 + }, + { + "epoch": 2.516171084486933, + "grad_norm": 0.15019632768225888, + "learning_rate": 7.71616259774723e-06, + "loss": 2.6692, + "step": 40533 + }, + { + "epoch": 2.5162331615866904, + "grad_norm": 0.13249694980403384, + "learning_rate": 7.714235254013957e-06, + "loss": 2.6714, + "step": 40534 + }, + { + "epoch": 2.5162952386864488, + "grad_norm": 0.1623472800527311, + "learning_rate": 7.712308130895785e-06, + "loss": 2.7222, + "step": 40535 + }, + { + "epoch": 2.5163573157862063, + "grad_norm": 0.13310997270337294, + "learning_rate": 7.710381228402757e-06, + "loss": 2.7667, + "step": 40536 + }, + { + "epoch": 2.5164193928859646, + "grad_norm": 0.1345662079006559, + "learning_rate": 7.708454546544946e-06, + "loss": 2.6946, + "step": 40537 + }, + { + "epoch": 2.516481469985722, + "grad_norm": 0.1400465924673945, + "learning_rate": 7.706528085332393e-06, + "loss": 2.7014, + "step": 40538 + }, + { + "epoch": 2.51654354708548, + "grad_norm": 0.1350588166186079, + "learning_rate": 7.704601844775156e-06, + "loss": 2.7273, + "step": 40539 + }, + { + "epoch": 2.516605624185238, + "grad_norm": 0.13613013534431073, + "learning_rate": 7.70267582488326e-06, + "loss": 2.7886, + "step": 40540 + }, + { + "epoch": 2.516667701284996, + "grad_norm": 0.1465147241306588, + "learning_rate": 7.700750025666791e-06, + "loss": 2.6823, + "step": 40541 + }, + { + "epoch": 2.5167297783847538, + "grad_norm": 0.1389938498468305, + "learning_rate": 7.698824447135771e-06, + "loss": 2.8249, + "step": 40542 + }, + { + "epoch": 2.5167918554845117, + "grad_norm": 0.13657445104580498, + "learning_rate": 7.696899089300257e-06, + "loss": 2.7293, + "step": 40543 + }, + { + "epoch": 2.5168539325842696, + "grad_norm": 0.15039658844258846, + "learning_rate": 7.694973952170276e-06, + "loss": 2.7265, + "step": 40544 + }, + { + "epoch": 2.5169160096840275, + "grad_norm": 0.133034814566846, + "learning_rate": 7.6930490357559e-06, + "loss": 2.5893, + "step": 40545 + }, + { + "epoch": 2.5169780867837854, + "grad_norm": 0.13608257236850005, + "learning_rate": 7.691124340067158e-06, + "loss": 2.648, + "step": 40546 + }, + { + "epoch": 2.5170401638835433, + "grad_norm": 0.14795464483703777, + "learning_rate": 7.689199865114088e-06, + "loss": 2.7373, + "step": 40547 + }, + { + "epoch": 2.5171022409833013, + "grad_norm": 0.1349807211346834, + "learning_rate": 7.687275610906736e-06, + "loss": 2.7446, + "step": 40548 + }, + { + "epoch": 2.517164318083059, + "grad_norm": 0.13856383937382794, + "learning_rate": 7.685351577455124e-06, + "loss": 2.6833, + "step": 40549 + }, + { + "epoch": 2.517226395182817, + "grad_norm": 0.15071074364182876, + "learning_rate": 7.683427764769308e-06, + "loss": 2.7088, + "step": 40550 + }, + { + "epoch": 2.517288472282575, + "grad_norm": 0.1384356730235081, + "learning_rate": 7.681504172859333e-06, + "loss": 2.7379, + "step": 40551 + }, + { + "epoch": 2.517350549382333, + "grad_norm": 0.1437356094226895, + "learning_rate": 7.679580801735225e-06, + "loss": 2.7828, + "step": 40552 + }, + { + "epoch": 2.517412626482091, + "grad_norm": 0.15127347691058723, + "learning_rate": 7.677657651407017e-06, + "loss": 2.6804, + "step": 40553 + }, + { + "epoch": 2.5174747035818488, + "grad_norm": 0.13634189711034567, + "learning_rate": 7.67573472188473e-06, + "loss": 2.6797, + "step": 40554 + }, + { + "epoch": 2.5175367806816067, + "grad_norm": 0.13364923786020236, + "learning_rate": 7.673812013178422e-06, + "loss": 2.613, + "step": 40555 + }, + { + "epoch": 2.5175988577813646, + "grad_norm": 0.13938017066849667, + "learning_rate": 7.67188952529811e-06, + "loss": 2.7269, + "step": 40556 + }, + { + "epoch": 2.517660934881122, + "grad_norm": 0.15191253129693078, + "learning_rate": 7.66996725825383e-06, + "loss": 2.6822, + "step": 40557 + }, + { + "epoch": 2.5177230119808804, + "grad_norm": 0.14319630737225483, + "learning_rate": 7.668045212055591e-06, + "loss": 2.7041, + "step": 40558 + }, + { + "epoch": 2.517785089080638, + "grad_norm": 0.14047105628168768, + "learning_rate": 7.666123386713447e-06, + "loss": 2.7148, + "step": 40559 + }, + { + "epoch": 2.5178471661803963, + "grad_norm": 0.1395385416481528, + "learning_rate": 7.664201782237418e-06, + "loss": 2.6904, + "step": 40560 + }, + { + "epoch": 2.5179092432801538, + "grad_norm": 0.1394391525705812, + "learning_rate": 7.662280398637528e-06, + "loss": 2.6703, + "step": 40561 + }, + { + "epoch": 2.517971320379912, + "grad_norm": 0.13717597392721784, + "learning_rate": 7.660359235923797e-06, + "loss": 2.7164, + "step": 40562 + }, + { + "epoch": 2.5180333974796696, + "grad_norm": 0.14252506862744785, + "learning_rate": 7.658438294106234e-06, + "loss": 2.6874, + "step": 40563 + }, + { + "epoch": 2.5180954745794275, + "grad_norm": 0.13359035576864164, + "learning_rate": 7.656517573194893e-06, + "loss": 2.8349, + "step": 40564 + }, + { + "epoch": 2.5181575516791854, + "grad_norm": 0.14185254446526904, + "learning_rate": 7.654597073199775e-06, + "loss": 2.7068, + "step": 40565 + }, + { + "epoch": 2.5182196287789433, + "grad_norm": 0.14461069243504374, + "learning_rate": 7.652676794130908e-06, + "loss": 2.7837, + "step": 40566 + }, + { + "epoch": 2.5182817058787013, + "grad_norm": 0.14417191891965675, + "learning_rate": 7.6507567359983e-06, + "loss": 2.5913, + "step": 40567 + }, + { + "epoch": 2.518343782978459, + "grad_norm": 0.13455099484851288, + "learning_rate": 7.648836898811979e-06, + "loss": 2.627, + "step": 40568 + }, + { + "epoch": 2.518405860078217, + "grad_norm": 0.1565059234896646, + "learning_rate": 7.646917282581956e-06, + "loss": 2.6926, + "step": 40569 + }, + { + "epoch": 2.518467937177975, + "grad_norm": 0.13438793829795959, + "learning_rate": 7.64499788731825e-06, + "loss": 2.7368, + "step": 40570 + }, + { + "epoch": 2.518530014277733, + "grad_norm": 0.13707375756644602, + "learning_rate": 7.64307871303086e-06, + "loss": 2.7542, + "step": 40571 + }, + { + "epoch": 2.518592091377491, + "grad_norm": 0.13324546364833229, + "learning_rate": 7.64115975972982e-06, + "loss": 2.8028, + "step": 40572 + }, + { + "epoch": 2.5186541684772488, + "grad_norm": 0.13774740302472924, + "learning_rate": 7.639241027425136e-06, + "loss": 2.6693, + "step": 40573 + }, + { + "epoch": 2.5187162455770067, + "grad_norm": 0.142136608699815, + "learning_rate": 7.637322516126817e-06, + "loss": 2.7388, + "step": 40574 + }, + { + "epoch": 2.5187783226767646, + "grad_norm": 0.14504591062916172, + "learning_rate": 7.635404225844861e-06, + "loss": 2.721, + "step": 40575 + }, + { + "epoch": 2.5188403997765225, + "grad_norm": 0.14020631636968, + "learning_rate": 7.63348615658928e-06, + "loss": 2.7232, + "step": 40576 + }, + { + "epoch": 2.5189024768762804, + "grad_norm": 0.15552330305979276, + "learning_rate": 7.631568308370102e-06, + "loss": 2.7444, + "step": 40577 + }, + { + "epoch": 2.5189645539760384, + "grad_norm": 0.1386821702708369, + "learning_rate": 7.629650681197309e-06, + "loss": 2.7185, + "step": 40578 + }, + { + "epoch": 2.5190266310757963, + "grad_norm": 0.13072155341995925, + "learning_rate": 7.6277332750809225e-06, + "loss": 2.6858, + "step": 40579 + }, + { + "epoch": 2.519088708175554, + "grad_norm": 0.1551774285951034, + "learning_rate": 7.625816090030918e-06, + "loss": 2.7374, + "step": 40580 + }, + { + "epoch": 2.519150785275312, + "grad_norm": 0.13920316867996818, + "learning_rate": 7.623899126057338e-06, + "loss": 2.7325, + "step": 40581 + }, + { + "epoch": 2.5192128623750696, + "grad_norm": 0.13791351749703826, + "learning_rate": 7.621982383170145e-06, + "loss": 2.7108, + "step": 40582 + }, + { + "epoch": 2.519274939474828, + "grad_norm": 0.15601890282585423, + "learning_rate": 7.6200658613793715e-06, + "loss": 2.7328, + "step": 40583 + }, + { + "epoch": 2.5193370165745854, + "grad_norm": 0.13500919307428677, + "learning_rate": 7.618149560695003e-06, + "loss": 2.6223, + "step": 40584 + }, + { + "epoch": 2.519399093674344, + "grad_norm": 0.13349565646833172, + "learning_rate": 7.616233481127039e-06, + "loss": 2.7008, + "step": 40585 + }, + { + "epoch": 2.5194611707741013, + "grad_norm": 0.14065172683199556, + "learning_rate": 7.614317622685457e-06, + "loss": 2.7207, + "step": 40586 + }, + { + "epoch": 2.519523247873859, + "grad_norm": 0.1610551473540209, + "learning_rate": 7.612401985380286e-06, + "loss": 2.7887, + "step": 40587 + }, + { + "epoch": 2.519585324973617, + "grad_norm": 0.13649521673820483, + "learning_rate": 7.6104865692214975e-06, + "loss": 2.7606, + "step": 40588 + }, + { + "epoch": 2.519647402073375, + "grad_norm": 0.14087288915978702, + "learning_rate": 7.608571374219098e-06, + "loss": 2.7004, + "step": 40589 + }, + { + "epoch": 2.519709479173133, + "grad_norm": 0.13484081990052144, + "learning_rate": 7.606656400383055e-06, + "loss": 2.6529, + "step": 40590 + }, + { + "epoch": 2.519771556272891, + "grad_norm": 0.15145719989744297, + "learning_rate": 7.60474164772339e-06, + "loss": 2.7383, + "step": 40591 + }, + { + "epoch": 2.5198336333726488, + "grad_norm": 0.14157868562256248, + "learning_rate": 7.602827116250083e-06, + "loss": 2.7739, + "step": 40592 + }, + { + "epoch": 2.5198957104724067, + "grad_norm": 0.13782926901824832, + "learning_rate": 7.600912805973121e-06, + "loss": 2.7134, + "step": 40593 + }, + { + "epoch": 2.5199577875721646, + "grad_norm": 0.15485938665924784, + "learning_rate": 7.5989987169024725e-06, + "loss": 2.7286, + "step": 40594 + }, + { + "epoch": 2.5200198646719225, + "grad_norm": 0.13289910886484058, + "learning_rate": 7.597084849048153e-06, + "loss": 2.7404, + "step": 40595 + }, + { + "epoch": 2.5200819417716804, + "grad_norm": 0.15865485724130313, + "learning_rate": 7.595171202420137e-06, + "loss": 2.6948, + "step": 40596 + }, + { + "epoch": 2.5201440188714384, + "grad_norm": 0.13317357096207003, + "learning_rate": 7.593257777028406e-06, + "loss": 2.6557, + "step": 40597 + }, + { + "epoch": 2.5202060959711963, + "grad_norm": 0.1406348211779583, + "learning_rate": 7.591344572882947e-06, + "loss": 2.7588, + "step": 40598 + }, + { + "epoch": 2.520268173070954, + "grad_norm": 0.1428405619582435, + "learning_rate": 7.589431589993723e-06, + "loss": 2.7317, + "step": 40599 + }, + { + "epoch": 2.520330250170712, + "grad_norm": 0.16980296940325942, + "learning_rate": 7.587518828370743e-06, + "loss": 2.6801, + "step": 40600 + }, + { + "epoch": 2.52039232727047, + "grad_norm": 0.1367192064005182, + "learning_rate": 7.585606288023972e-06, + "loss": 2.7322, + "step": 40601 + }, + { + "epoch": 2.520454404370228, + "grad_norm": 0.13870015997605836, + "learning_rate": 7.58369396896339e-06, + "loss": 2.7073, + "step": 40602 + }, + { + "epoch": 2.520516481469986, + "grad_norm": 0.13071116552115844, + "learning_rate": 7.581781871198962e-06, + "loss": 2.657, + "step": 40603 + }, + { + "epoch": 2.5205785585697438, + "grad_norm": 0.15005681531498472, + "learning_rate": 7.579869994740691e-06, + "loss": 2.6715, + "step": 40604 + }, + { + "epoch": 2.5206406356695013, + "grad_norm": 0.1478866847864039, + "learning_rate": 7.577958339598529e-06, + "loss": 2.714, + "step": 40605 + }, + { + "epoch": 2.5207027127692596, + "grad_norm": 0.15008490935822297, + "learning_rate": 7.576046905782458e-06, + "loss": 2.6875, + "step": 40606 + }, + { + "epoch": 2.520764789869017, + "grad_norm": 0.16108570566266603, + "learning_rate": 7.574135693302442e-06, + "loss": 2.6537, + "step": 40607 + }, + { + "epoch": 2.5208268669687754, + "grad_norm": 0.13471572753880048, + "learning_rate": 7.572224702168468e-06, + "loss": 2.7234, + "step": 40608 + }, + { + "epoch": 2.520888944068533, + "grad_norm": 0.15552428514040056, + "learning_rate": 7.5703139323904994e-06, + "loss": 2.7413, + "step": 40609 + }, + { + "epoch": 2.5209510211682913, + "grad_norm": 0.14959482714884995, + "learning_rate": 7.5684033839785004e-06, + "loss": 2.6508, + "step": 40610 + }, + { + "epoch": 2.5210130982680488, + "grad_norm": 0.15006655915079722, + "learning_rate": 7.566493056942442e-06, + "loss": 2.7047, + "step": 40611 + }, + { + "epoch": 2.5210751753678067, + "grad_norm": 0.13444577837062038, + "learning_rate": 7.5645829512922815e-06, + "loss": 2.707, + "step": 40612 + }, + { + "epoch": 2.5211372524675646, + "grad_norm": 0.15084765088925228, + "learning_rate": 7.562673067038001e-06, + "loss": 2.7194, + "step": 40613 + }, + { + "epoch": 2.5211993295673225, + "grad_norm": 0.13763881092259653, + "learning_rate": 7.560763404189558e-06, + "loss": 2.7013, + "step": 40614 + }, + { + "epoch": 2.5212614066670804, + "grad_norm": 0.143095710018686, + "learning_rate": 7.558853962756906e-06, + "loss": 2.6861, + "step": 40615 + }, + { + "epoch": 2.5213234837668383, + "grad_norm": 0.1360530039189998, + "learning_rate": 7.556944742750028e-06, + "loss": 2.7334, + "step": 40616 + }, + { + "epoch": 2.5213855608665963, + "grad_norm": 0.1407077267819349, + "learning_rate": 7.555035744178857e-06, + "loss": 2.6848, + "step": 40617 + }, + { + "epoch": 2.521447637966354, + "grad_norm": 0.1606611341807334, + "learning_rate": 7.553126967053381e-06, + "loss": 2.7322, + "step": 40618 + }, + { + "epoch": 2.521509715066112, + "grad_norm": 0.14572382260908914, + "learning_rate": 7.551218411383548e-06, + "loss": 2.6528, + "step": 40619 + }, + { + "epoch": 2.52157179216587, + "grad_norm": 0.1493315063485698, + "learning_rate": 7.549310077179312e-06, + "loss": 2.7951, + "step": 40620 + }, + { + "epoch": 2.521633869265628, + "grad_norm": 0.13624321115711746, + "learning_rate": 7.547401964450618e-06, + "loss": 2.6317, + "step": 40621 + }, + { + "epoch": 2.521695946365386, + "grad_norm": 0.1484589247550804, + "learning_rate": 7.5454940732074485e-06, + "loss": 2.7419, + "step": 40622 + }, + { + "epoch": 2.5217580234651438, + "grad_norm": 0.1452417629613905, + "learning_rate": 7.543586403459741e-06, + "loss": 2.7421, + "step": 40623 + }, + { + "epoch": 2.5218201005649017, + "grad_norm": 0.1474177189833972, + "learning_rate": 7.541678955217451e-06, + "loss": 2.614, + "step": 40624 + }, + { + "epoch": 2.5218821776646596, + "grad_norm": 0.13207992299858756, + "learning_rate": 7.5397717284905246e-06, + "loss": 2.6331, + "step": 40625 + }, + { + "epoch": 2.5219442547644175, + "grad_norm": 0.14322047351696554, + "learning_rate": 7.537864723288906e-06, + "loss": 2.7467, + "step": 40626 + }, + { + "epoch": 2.5220063318641754, + "grad_norm": 0.13699526072792154, + "learning_rate": 7.535957939622573e-06, + "loss": 2.7088, + "step": 40627 + }, + { + "epoch": 2.5220684089639334, + "grad_norm": 0.1345197604807978, + "learning_rate": 7.53405137750145e-06, + "loss": 2.7003, + "step": 40628 + }, + { + "epoch": 2.5221304860636913, + "grad_norm": 0.13545196024369396, + "learning_rate": 7.532145036935489e-06, + "loss": 2.6902, + "step": 40629 + }, + { + "epoch": 2.5221925631634488, + "grad_norm": 0.13306108519804039, + "learning_rate": 7.530238917934629e-06, + "loss": 2.6391, + "step": 40630 + }, + { + "epoch": 2.522254640263207, + "grad_norm": 0.17045436384090226, + "learning_rate": 7.528333020508832e-06, + "loss": 2.7318, + "step": 40631 + }, + { + "epoch": 2.5223167173629646, + "grad_norm": 0.13377071867438212, + "learning_rate": 7.526427344668036e-06, + "loss": 2.7059, + "step": 40632 + }, + { + "epoch": 2.522378794462723, + "grad_norm": 0.1462172106417041, + "learning_rate": 7.524521890422176e-06, + "loss": 2.6796, + "step": 40633 + }, + { + "epoch": 2.5224408715624804, + "grad_norm": 0.14979167046124697, + "learning_rate": 7.522616657781195e-06, + "loss": 2.7469, + "step": 40634 + }, + { + "epoch": 2.5225029486622383, + "grad_norm": 0.14607198522487, + "learning_rate": 7.520711646755024e-06, + "loss": 2.7113, + "step": 40635 + }, + { + "epoch": 2.5225650257619963, + "grad_norm": 0.14640878634459362, + "learning_rate": 7.5188068573536275e-06, + "loss": 2.6802, + "step": 40636 + }, + { + "epoch": 2.522627102861754, + "grad_norm": 0.14596861641325867, + "learning_rate": 7.5169022895869245e-06, + "loss": 2.6725, + "step": 40637 + }, + { + "epoch": 2.522689179961512, + "grad_norm": 0.13418616233429015, + "learning_rate": 7.5149979434648575e-06, + "loss": 2.5739, + "step": 40638 + }, + { + "epoch": 2.52275125706127, + "grad_norm": 0.13539918609424526, + "learning_rate": 7.513093818997347e-06, + "loss": 2.6762, + "step": 40639 + }, + { + "epoch": 2.522813334161028, + "grad_norm": 0.1424741496514405, + "learning_rate": 7.51118991619435e-06, + "loss": 2.5906, + "step": 40640 + }, + { + "epoch": 2.522875411260786, + "grad_norm": 0.15289483897386594, + "learning_rate": 7.509286235065794e-06, + "loss": 2.7554, + "step": 40641 + }, + { + "epoch": 2.5229374883605438, + "grad_norm": 0.13902857398769408, + "learning_rate": 7.507382775621602e-06, + "loss": 2.6811, + "step": 40642 + }, + { + "epoch": 2.5229995654603017, + "grad_norm": 0.13366098608170804, + "learning_rate": 7.505479537871701e-06, + "loss": 2.6854, + "step": 40643 + }, + { + "epoch": 2.5230616425600596, + "grad_norm": 0.1389216269380515, + "learning_rate": 7.503576521826045e-06, + "loss": 2.7366, + "step": 40644 + }, + { + "epoch": 2.5231237196598175, + "grad_norm": 0.1407786099909294, + "learning_rate": 7.50167372749454e-06, + "loss": 2.6731, + "step": 40645 + }, + { + "epoch": 2.5231857967595754, + "grad_norm": 0.1319902999846833, + "learning_rate": 7.4997711548871196e-06, + "loss": 2.6397, + "step": 40646 + }, + { + "epoch": 2.5232478738593334, + "grad_norm": 0.13757649835018085, + "learning_rate": 7.497868804013702e-06, + "loss": 2.7385, + "step": 40647 + }, + { + "epoch": 2.5233099509590913, + "grad_norm": 0.13706768331237457, + "learning_rate": 7.49596667488423e-06, + "loss": 2.6157, + "step": 40648 + }, + { + "epoch": 2.523372028058849, + "grad_norm": 0.14694720583346033, + "learning_rate": 7.4940647675086074e-06, + "loss": 2.6975, + "step": 40649 + }, + { + "epoch": 2.523434105158607, + "grad_norm": 0.1361272003461885, + "learning_rate": 7.49216308189678e-06, + "loss": 2.7176, + "step": 40650 + }, + { + "epoch": 2.523496182258365, + "grad_norm": 0.1520166002195815, + "learning_rate": 7.490261618058658e-06, + "loss": 2.7458, + "step": 40651 + }, + { + "epoch": 2.523558259358123, + "grad_norm": 0.13411915564535923, + "learning_rate": 7.488360376004155e-06, + "loss": 2.6446, + "step": 40652 + }, + { + "epoch": 2.5236203364578804, + "grad_norm": 0.14992979587346078, + "learning_rate": 7.486459355743192e-06, + "loss": 2.6723, + "step": 40653 + }, + { + "epoch": 2.523682413557639, + "grad_norm": 0.14203922772847719, + "learning_rate": 7.484558557285698e-06, + "loss": 2.648, + "step": 40654 + }, + { + "epoch": 2.5237444906573963, + "grad_norm": 0.13391116758639945, + "learning_rate": 7.482657980641583e-06, + "loss": 2.6462, + "step": 40655 + }, + { + "epoch": 2.5238065677571546, + "grad_norm": 0.15600618174344993, + "learning_rate": 7.480757625820767e-06, + "loss": 2.6925, + "step": 40656 + }, + { + "epoch": 2.523868644856912, + "grad_norm": 0.13834156402074108, + "learning_rate": 7.478857492833147e-06, + "loss": 2.7099, + "step": 40657 + }, + { + "epoch": 2.5239307219566705, + "grad_norm": 0.1416533211132411, + "learning_rate": 7.47695758168866e-06, + "loss": 2.6267, + "step": 40658 + }, + { + "epoch": 2.523992799056428, + "grad_norm": 0.14172450680590556, + "learning_rate": 7.475057892397208e-06, + "loss": 2.7306, + "step": 40659 + }, + { + "epoch": 2.524054876156186, + "grad_norm": 0.15374271253434812, + "learning_rate": 7.4731584249687014e-06, + "loss": 2.7736, + "step": 40660 + }, + { + "epoch": 2.5241169532559438, + "grad_norm": 0.13599409427074174, + "learning_rate": 7.471259179413048e-06, + "loss": 2.6969, + "step": 40661 + }, + { + "epoch": 2.5241790303557017, + "grad_norm": 0.13680503030080166, + "learning_rate": 7.469360155740157e-06, + "loss": 2.6972, + "step": 40662 + }, + { + "epoch": 2.5242411074554596, + "grad_norm": 0.13682705812559945, + "learning_rate": 7.467461353959942e-06, + "loss": 2.751, + "step": 40663 + }, + { + "epoch": 2.5243031845552175, + "grad_norm": 0.14078543041851688, + "learning_rate": 7.465562774082313e-06, + "loss": 2.7056, + "step": 40664 + }, + { + "epoch": 2.5243652616549754, + "grad_norm": 0.1591194249387889, + "learning_rate": 7.463664416117161e-06, + "loss": 2.7328, + "step": 40665 + }, + { + "epoch": 2.5244273387547334, + "grad_norm": 0.13363658959905814, + "learning_rate": 7.461766280074395e-06, + "loss": 2.6445, + "step": 40666 + }, + { + "epoch": 2.5244894158544913, + "grad_norm": 0.13571357425479907, + "learning_rate": 7.459868365963923e-06, + "loss": 2.6884, + "step": 40667 + }, + { + "epoch": 2.524551492954249, + "grad_norm": 0.13988666621793056, + "learning_rate": 7.4579706737956505e-06, + "loss": 2.8084, + "step": 40668 + }, + { + "epoch": 2.524613570054007, + "grad_norm": 0.13468592816517672, + "learning_rate": 7.456073203579472e-06, + "loss": 2.7213, + "step": 40669 + }, + { + "epoch": 2.524675647153765, + "grad_norm": 0.16164017763440047, + "learning_rate": 7.454175955325288e-06, + "loss": 2.7102, + "step": 40670 + }, + { + "epoch": 2.524737724253523, + "grad_norm": 0.14375330361426306, + "learning_rate": 7.452278929042983e-06, + "loss": 2.6558, + "step": 40671 + }, + { + "epoch": 2.524799801353281, + "grad_norm": 0.1380416559011043, + "learning_rate": 7.4503821247424835e-06, + "loss": 2.6952, + "step": 40672 + }, + { + "epoch": 2.5248618784530388, + "grad_norm": 0.13626407222521403, + "learning_rate": 7.448485542433664e-06, + "loss": 2.7203, + "step": 40673 + }, + { + "epoch": 2.5249239555527967, + "grad_norm": 0.13815136051650528, + "learning_rate": 7.446589182126429e-06, + "loss": 2.7857, + "step": 40674 + }, + { + "epoch": 2.5249860326525546, + "grad_norm": 0.14275790989843817, + "learning_rate": 7.4446930438306525e-06, + "loss": 2.7763, + "step": 40675 + }, + { + "epoch": 2.5250481097523125, + "grad_norm": 0.12823619233595165, + "learning_rate": 7.442797127556261e-06, + "loss": 2.6186, + "step": 40676 + }, + { + "epoch": 2.5251101868520704, + "grad_norm": 0.14941736604117675, + "learning_rate": 7.440901433313124e-06, + "loss": 2.7653, + "step": 40677 + }, + { + "epoch": 2.525172263951828, + "grad_norm": 0.13422587015008833, + "learning_rate": 7.439005961111134e-06, + "loss": 2.6587, + "step": 40678 + }, + { + "epoch": 2.5252343410515863, + "grad_norm": 0.14217270732266651, + "learning_rate": 7.437110710960177e-06, + "loss": 2.6917, + "step": 40679 + }, + { + "epoch": 2.5252964181513438, + "grad_norm": 0.15169616758034146, + "learning_rate": 7.4352156828701395e-06, + "loss": 2.6976, + "step": 40680 + }, + { + "epoch": 2.525358495251102, + "grad_norm": 0.15573945080620505, + "learning_rate": 7.43332087685093e-06, + "loss": 2.6953, + "step": 40681 + }, + { + "epoch": 2.5254205723508596, + "grad_norm": 0.14089141220079307, + "learning_rate": 7.4314262929124135e-06, + "loss": 2.7983, + "step": 40682 + }, + { + "epoch": 2.5254826494506175, + "grad_norm": 0.14350769675195796, + "learning_rate": 7.429531931064488e-06, + "loss": 2.7374, + "step": 40683 + }, + { + "epoch": 2.5255447265503754, + "grad_norm": 0.13814585274800867, + "learning_rate": 7.427637791317022e-06, + "loss": 2.5848, + "step": 40684 + }, + { + "epoch": 2.5256068036501333, + "grad_norm": 0.13285957231521534, + "learning_rate": 7.4257438736798965e-06, + "loss": 2.6766, + "step": 40685 + }, + { + "epoch": 2.5256688807498913, + "grad_norm": 0.16761591914169782, + "learning_rate": 7.423850178163011e-06, + "loss": 2.7561, + "step": 40686 + }, + { + "epoch": 2.525730957849649, + "grad_norm": 0.14723602726027576, + "learning_rate": 7.421956704776234e-06, + "loss": 2.6692, + "step": 40687 + }, + { + "epoch": 2.525793034949407, + "grad_norm": 0.15868326019506723, + "learning_rate": 7.420063453529447e-06, + "loss": 2.6568, + "step": 40688 + }, + { + "epoch": 2.525855112049165, + "grad_norm": 0.14210016009971382, + "learning_rate": 7.4181704244325135e-06, + "loss": 2.797, + "step": 40689 + }, + { + "epoch": 2.525917189148923, + "grad_norm": 0.14221903901813657, + "learning_rate": 7.416277617495332e-06, + "loss": 2.7158, + "step": 40690 + }, + { + "epoch": 2.525979266248681, + "grad_norm": 0.13578580251684624, + "learning_rate": 7.414385032727766e-06, + "loss": 2.7553, + "step": 40691 + }, + { + "epoch": 2.5260413433484388, + "grad_norm": 0.1372542995607164, + "learning_rate": 7.412492670139698e-06, + "loss": 2.7639, + "step": 40692 + }, + { + "epoch": 2.5261034204481967, + "grad_norm": 0.13340232809936328, + "learning_rate": 7.41060052974098e-06, + "loss": 2.7224, + "step": 40693 + }, + { + "epoch": 2.5261654975479546, + "grad_norm": 0.13904597634289684, + "learning_rate": 7.408708611541504e-06, + "loss": 2.6772, + "step": 40694 + }, + { + "epoch": 2.5262275746477125, + "grad_norm": 0.14224152481847455, + "learning_rate": 7.40681691555114e-06, + "loss": 2.7154, + "step": 40695 + }, + { + "epoch": 2.5262896517474704, + "grad_norm": 0.1598306229870062, + "learning_rate": 7.4049254417797534e-06, + "loss": 2.6852, + "step": 40696 + }, + { + "epoch": 2.5263517288472284, + "grad_norm": 0.13871783180044647, + "learning_rate": 7.403034190237207e-06, + "loss": 2.7456, + "step": 40697 + }, + { + "epoch": 2.5264138059469863, + "grad_norm": 0.13500924826000343, + "learning_rate": 7.40114316093336e-06, + "loss": 2.7187, + "step": 40698 + }, + { + "epoch": 2.526475883046744, + "grad_norm": 0.14363916337925178, + "learning_rate": 7.399252353878105e-06, + "loss": 2.7615, + "step": 40699 + }, + { + "epoch": 2.526537960146502, + "grad_norm": 0.1526173311442277, + "learning_rate": 7.39736176908129e-06, + "loss": 2.7062, + "step": 40700 + }, + { + "epoch": 2.5266000372462596, + "grad_norm": 0.13920858109495046, + "learning_rate": 7.395471406552784e-06, + "loss": 2.6564, + "step": 40701 + }, + { + "epoch": 2.526662114346018, + "grad_norm": 0.13176762965860125, + "learning_rate": 7.393581266302429e-06, + "loss": 2.65, + "step": 40702 + }, + { + "epoch": 2.5267241914457754, + "grad_norm": 0.14102602442175066, + "learning_rate": 7.391691348340119e-06, + "loss": 2.6704, + "step": 40703 + }, + { + "epoch": 2.526786268545534, + "grad_norm": 0.15766387446542424, + "learning_rate": 7.389801652675699e-06, + "loss": 2.6451, + "step": 40704 + }, + { + "epoch": 2.5268483456452913, + "grad_norm": 0.15429950663579806, + "learning_rate": 7.3879121793190284e-06, + "loss": 2.699, + "step": 40705 + }, + { + "epoch": 2.5269104227450496, + "grad_norm": 0.13350644158733146, + "learning_rate": 7.386022928279962e-06, + "loss": 2.7364, + "step": 40706 + }, + { + "epoch": 2.526972499844807, + "grad_norm": 0.136593631986516, + "learning_rate": 7.384133899568352e-06, + "loss": 2.6446, + "step": 40707 + }, + { + "epoch": 2.527034576944565, + "grad_norm": 0.13406348798294415, + "learning_rate": 7.382245093194068e-06, + "loss": 2.6824, + "step": 40708 + }, + { + "epoch": 2.527096654044323, + "grad_norm": 0.13900179636389687, + "learning_rate": 7.380356509166958e-06, + "loss": 2.7501, + "step": 40709 + }, + { + "epoch": 2.527158731144081, + "grad_norm": 0.13933111030038753, + "learning_rate": 7.378468147496875e-06, + "loss": 2.7687, + "step": 40710 + }, + { + "epoch": 2.5272208082438388, + "grad_norm": 0.1436578094770367, + "learning_rate": 7.3765800081936564e-06, + "loss": 2.7189, + "step": 40711 + }, + { + "epoch": 2.5272828853435967, + "grad_norm": 0.1521164898513768, + "learning_rate": 7.374692091267182e-06, + "loss": 2.6758, + "step": 40712 + }, + { + "epoch": 2.5273449624433546, + "grad_norm": 0.1333025418678802, + "learning_rate": 7.372804396727273e-06, + "loss": 2.7462, + "step": 40713 + }, + { + "epoch": 2.5274070395431125, + "grad_norm": 0.1301707539371052, + "learning_rate": 7.370916924583804e-06, + "loss": 2.6307, + "step": 40714 + }, + { + "epoch": 2.5274691166428704, + "grad_norm": 0.1340767628078527, + "learning_rate": 7.369029674846606e-06, + "loss": 2.7038, + "step": 40715 + }, + { + "epoch": 2.5275311937426284, + "grad_norm": 0.16243245206433082, + "learning_rate": 7.367142647525521e-06, + "loss": 2.7129, + "step": 40716 + }, + { + "epoch": 2.5275932708423863, + "grad_norm": 0.1328162375092539, + "learning_rate": 7.365255842630414e-06, + "loss": 2.6982, + "step": 40717 + }, + { + "epoch": 2.527655347942144, + "grad_norm": 0.129419842529641, + "learning_rate": 7.363369260171121e-06, + "loss": 2.6304, + "step": 40718 + }, + { + "epoch": 2.527717425041902, + "grad_norm": 0.14710378658520382, + "learning_rate": 7.3614829001574795e-06, + "loss": 2.6255, + "step": 40719 + }, + { + "epoch": 2.52777950214166, + "grad_norm": 0.14323744460951304, + "learning_rate": 7.35959676259933e-06, + "loss": 2.6739, + "step": 40720 + }, + { + "epoch": 2.527841579241418, + "grad_norm": 0.13675109616882855, + "learning_rate": 7.3577108475065045e-06, + "loss": 2.7355, + "step": 40721 + }, + { + "epoch": 2.527903656341176, + "grad_norm": 0.15744923881503944, + "learning_rate": 7.355825154888862e-06, + "loss": 2.7152, + "step": 40722 + }, + { + "epoch": 2.527965733440934, + "grad_norm": 0.13363346741980556, + "learning_rate": 7.353939684756234e-06, + "loss": 2.6394, + "step": 40723 + }, + { + "epoch": 2.5280278105406917, + "grad_norm": 0.15977065840908988, + "learning_rate": 7.352054437118455e-06, + "loss": 2.8059, + "step": 40724 + }, + { + "epoch": 2.5280898876404496, + "grad_norm": 0.13473287129085618, + "learning_rate": 7.350169411985352e-06, + "loss": 2.7142, + "step": 40725 + }, + { + "epoch": 2.528151964740207, + "grad_norm": 0.14028873240179307, + "learning_rate": 7.348284609366779e-06, + "loss": 2.7643, + "step": 40726 + }, + { + "epoch": 2.5282140418399655, + "grad_norm": 0.13874052927197053, + "learning_rate": 7.34640002927256e-06, + "loss": 2.7117, + "step": 40727 + }, + { + "epoch": 2.528276118939723, + "grad_norm": 0.15141886782550862, + "learning_rate": 7.344515671712526e-06, + "loss": 2.6806, + "step": 40728 + }, + { + "epoch": 2.5283381960394813, + "grad_norm": 0.15384666335039868, + "learning_rate": 7.342631536696498e-06, + "loss": 2.6975, + "step": 40729 + }, + { + "epoch": 2.5284002731392388, + "grad_norm": 0.13746807611026687, + "learning_rate": 7.340747624234323e-06, + "loss": 2.7015, + "step": 40730 + }, + { + "epoch": 2.5284623502389967, + "grad_norm": 0.13219250391108223, + "learning_rate": 7.3388639343358325e-06, + "loss": 2.685, + "step": 40731 + }, + { + "epoch": 2.5285244273387546, + "grad_norm": 0.1554704292245015, + "learning_rate": 7.33698046701084e-06, + "loss": 2.6115, + "step": 40732 + }, + { + "epoch": 2.5285865044385125, + "grad_norm": 0.15186634822032485, + "learning_rate": 7.3350972222691775e-06, + "loss": 2.6714, + "step": 40733 + }, + { + "epoch": 2.5286485815382704, + "grad_norm": 0.13097779765742693, + "learning_rate": 7.333214200120658e-06, + "loss": 2.6887, + "step": 40734 + }, + { + "epoch": 2.5287106586380284, + "grad_norm": 0.13440136817114404, + "learning_rate": 7.331331400575131e-06, + "loss": 2.6969, + "step": 40735 + }, + { + "epoch": 2.5287727357377863, + "grad_norm": 0.13689316795563958, + "learning_rate": 7.3294488236424044e-06, + "loss": 2.5961, + "step": 40736 + }, + { + "epoch": 2.528834812837544, + "grad_norm": 0.14500911431195515, + "learning_rate": 7.327566469332303e-06, + "loss": 2.6951, + "step": 40737 + }, + { + "epoch": 2.528896889937302, + "grad_norm": 0.1342625765916659, + "learning_rate": 7.325684337654631e-06, + "loss": 2.7287, + "step": 40738 + }, + { + "epoch": 2.52895896703706, + "grad_norm": 0.1479407896655974, + "learning_rate": 7.323802428619242e-06, + "loss": 2.7549, + "step": 40739 + }, + { + "epoch": 2.529021044136818, + "grad_norm": 0.1395261402411964, + "learning_rate": 7.321920742235933e-06, + "loss": 2.7739, + "step": 40740 + }, + { + "epoch": 2.529083121236576, + "grad_norm": 0.1322328244843756, + "learning_rate": 7.320039278514523e-06, + "loss": 2.5338, + "step": 40741 + }, + { + "epoch": 2.5291451983363338, + "grad_norm": 0.13357882444203278, + "learning_rate": 7.318158037464818e-06, + "loss": 2.716, + "step": 40742 + }, + { + "epoch": 2.5292072754360917, + "grad_norm": 0.14211969619146447, + "learning_rate": 7.316277019096651e-06, + "loss": 2.6137, + "step": 40743 + }, + { + "epoch": 2.5292693525358496, + "grad_norm": 0.17720607169106042, + "learning_rate": 7.314396223419834e-06, + "loss": 2.732, + "step": 40744 + }, + { + "epoch": 2.5293314296356075, + "grad_norm": 0.13774526650375332, + "learning_rate": 7.312515650444174e-06, + "loss": 2.6954, + "step": 40745 + }, + { + "epoch": 2.5293935067353654, + "grad_norm": 0.13613655928021498, + "learning_rate": 7.310635300179469e-06, + "loss": 2.7311, + "step": 40746 + }, + { + "epoch": 2.5294555838351234, + "grad_norm": 0.13877692072050613, + "learning_rate": 7.308755172635557e-06, + "loss": 2.7458, + "step": 40747 + }, + { + "epoch": 2.5295176609348813, + "grad_norm": 0.16480425207781726, + "learning_rate": 7.306875267822222e-06, + "loss": 2.6618, + "step": 40748 + }, + { + "epoch": 2.5295797380346388, + "grad_norm": 0.14324375786296317, + "learning_rate": 7.304995585749291e-06, + "loss": 2.6375, + "step": 40749 + }, + { + "epoch": 2.529641815134397, + "grad_norm": 0.1497525700175875, + "learning_rate": 7.303116126426562e-06, + "loss": 2.6927, + "step": 40750 + }, + { + "epoch": 2.5297038922341546, + "grad_norm": 0.14877772867185665, + "learning_rate": 7.301236889863844e-06, + "loss": 2.565, + "step": 40751 + }, + { + "epoch": 2.529765969333913, + "grad_norm": 0.13303444353490637, + "learning_rate": 7.299357876070922e-06, + "loss": 2.6459, + "step": 40752 + }, + { + "epoch": 2.5298280464336704, + "grad_norm": 0.15132366292989224, + "learning_rate": 7.297479085057635e-06, + "loss": 2.6647, + "step": 40753 + }, + { + "epoch": 2.529890123533429, + "grad_norm": 0.13587983102992335, + "learning_rate": 7.2956005168337615e-06, + "loss": 2.7208, + "step": 40754 + }, + { + "epoch": 2.5299522006331863, + "grad_norm": 0.13765687608955365, + "learning_rate": 7.293722171409106e-06, + "loss": 2.7088, + "step": 40755 + }, + { + "epoch": 2.530014277732944, + "grad_norm": 0.13703274772808582, + "learning_rate": 7.2918440487934724e-06, + "loss": 2.7552, + "step": 40756 + }, + { + "epoch": 2.530076354832702, + "grad_norm": 0.137325303766931, + "learning_rate": 7.289966148996646e-06, + "loss": 2.6936, + "step": 40757 + }, + { + "epoch": 2.53013843193246, + "grad_norm": 0.13524238592449161, + "learning_rate": 7.288088472028443e-06, + "loss": 2.6779, + "step": 40758 + }, + { + "epoch": 2.530200509032218, + "grad_norm": 0.15385469623694928, + "learning_rate": 7.286211017898653e-06, + "loss": 2.7177, + "step": 40759 + }, + { + "epoch": 2.530262586131976, + "grad_norm": 0.1393953176836746, + "learning_rate": 7.284333786617065e-06, + "loss": 2.6577, + "step": 40760 + }, + { + "epoch": 2.5303246632317338, + "grad_norm": 0.13734213534068945, + "learning_rate": 7.28245677819347e-06, + "loss": 2.62, + "step": 40761 + }, + { + "epoch": 2.5303867403314917, + "grad_norm": 0.13599098512630034, + "learning_rate": 7.280579992637676e-06, + "loss": 2.6975, + "step": 40762 + }, + { + "epoch": 2.5304488174312496, + "grad_norm": 0.1453200158872459, + "learning_rate": 7.278703429959471e-06, + "loss": 2.7114, + "step": 40763 + }, + { + "epoch": 2.5305108945310075, + "grad_norm": 0.13715656886581934, + "learning_rate": 7.276827090168637e-06, + "loss": 2.7607, + "step": 40764 + }, + { + "epoch": 2.5305729716307654, + "grad_norm": 0.13260657109280433, + "learning_rate": 7.274950973274958e-06, + "loss": 2.6349, + "step": 40765 + }, + { + "epoch": 2.5306350487305234, + "grad_norm": 0.1363400801086694, + "learning_rate": 7.273075079288244e-06, + "loss": 2.6586, + "step": 40766 + }, + { + "epoch": 2.5306971258302813, + "grad_norm": 0.14616588938092998, + "learning_rate": 7.2711994082182655e-06, + "loss": 2.655, + "step": 40767 + }, + { + "epoch": 2.530759202930039, + "grad_norm": 0.14686927454522902, + "learning_rate": 7.26932396007482e-06, + "loss": 2.8668, + "step": 40768 + }, + { + "epoch": 2.530821280029797, + "grad_norm": 0.14006110221415544, + "learning_rate": 7.267448734867677e-06, + "loss": 2.7782, + "step": 40769 + }, + { + "epoch": 2.530883357129555, + "grad_norm": 0.13621482622586753, + "learning_rate": 7.265573732606623e-06, + "loss": 2.7015, + "step": 40770 + }, + { + "epoch": 2.530945434229313, + "grad_norm": 0.1418210663139497, + "learning_rate": 7.263698953301451e-06, + "loss": 2.7654, + "step": 40771 + }, + { + "epoch": 2.531007511329071, + "grad_norm": 0.1444040460391532, + "learning_rate": 7.261824396961936e-06, + "loss": 2.7274, + "step": 40772 + }, + { + "epoch": 2.531069588428829, + "grad_norm": 0.1440890739491298, + "learning_rate": 7.259950063597859e-06, + "loss": 2.8067, + "step": 40773 + }, + { + "epoch": 2.5311316655285863, + "grad_norm": 0.1618663908627998, + "learning_rate": 7.2580759532189906e-06, + "loss": 2.6703, + "step": 40774 + }, + { + "epoch": 2.5311937426283446, + "grad_norm": 0.15663595810975234, + "learning_rate": 7.256202065835121e-06, + "loss": 2.7482, + "step": 40775 + }, + { + "epoch": 2.531255819728102, + "grad_norm": 0.16187034357215097, + "learning_rate": 7.254328401456028e-06, + "loss": 2.6996, + "step": 40776 + }, + { + "epoch": 2.5313178968278605, + "grad_norm": 0.13815633552132725, + "learning_rate": 7.252454960091476e-06, + "loss": 2.7215, + "step": 40777 + }, + { + "epoch": 2.531379973927618, + "grad_norm": 0.144952829350586, + "learning_rate": 7.250581741751239e-06, + "loss": 2.6273, + "step": 40778 + }, + { + "epoch": 2.531442051027376, + "grad_norm": 0.13392386641071582, + "learning_rate": 7.248708746445088e-06, + "loss": 2.6579, + "step": 40779 + }, + { + "epoch": 2.5315041281271338, + "grad_norm": 0.13236436551570144, + "learning_rate": 7.246835974182814e-06, + "loss": 2.6876, + "step": 40780 + }, + { + "epoch": 2.5315662052268917, + "grad_norm": 0.14421487668705893, + "learning_rate": 7.244963424974177e-06, + "loss": 2.7019, + "step": 40781 + }, + { + "epoch": 2.5316282823266496, + "grad_norm": 0.13566267992057107, + "learning_rate": 7.243091098828947e-06, + "loss": 2.7349, + "step": 40782 + }, + { + "epoch": 2.5316903594264075, + "grad_norm": 0.14197649154069367, + "learning_rate": 7.241218995756887e-06, + "loss": 2.692, + "step": 40783 + }, + { + "epoch": 2.5317524365261654, + "grad_norm": 0.1369893189633804, + "learning_rate": 7.2393471157677526e-06, + "loss": 2.726, + "step": 40784 + }, + { + "epoch": 2.5318145136259234, + "grad_norm": 0.15021081182226895, + "learning_rate": 7.237475458871346e-06, + "loss": 2.739, + "step": 40785 + }, + { + "epoch": 2.5318765907256813, + "grad_norm": 0.1418164103163485, + "learning_rate": 7.235604025077403e-06, + "loss": 2.7559, + "step": 40786 + }, + { + "epoch": 2.531938667825439, + "grad_norm": 0.1399189988995652, + "learning_rate": 7.2337328143957e-06, + "loss": 2.7348, + "step": 40787 + }, + { + "epoch": 2.532000744925197, + "grad_norm": 0.1700775157162324, + "learning_rate": 7.231861826835979e-06, + "loss": 2.7053, + "step": 40788 + }, + { + "epoch": 2.532062822024955, + "grad_norm": 0.1351017492031951, + "learning_rate": 7.229991062408031e-06, + "loss": 2.6923, + "step": 40789 + }, + { + "epoch": 2.532124899124713, + "grad_norm": 0.15096660645340465, + "learning_rate": 7.2281205211216006e-06, + "loss": 2.7868, + "step": 40790 + }, + { + "epoch": 2.532186976224471, + "grad_norm": 0.13755790807771656, + "learning_rate": 7.226250202986451e-06, + "loss": 2.7363, + "step": 40791 + }, + { + "epoch": 2.532249053324229, + "grad_norm": 0.14343379243409807, + "learning_rate": 7.224380108012324e-06, + "loss": 2.7335, + "step": 40792 + }, + { + "epoch": 2.5323111304239867, + "grad_norm": 0.1380915183378834, + "learning_rate": 7.222510236209001e-06, + "loss": 2.7008, + "step": 40793 + }, + { + "epoch": 2.5323732075237446, + "grad_norm": 0.13860751391191187, + "learning_rate": 7.2206405875862306e-06, + "loss": 2.7689, + "step": 40794 + }, + { + "epoch": 2.5324352846235025, + "grad_norm": 0.15619709788259745, + "learning_rate": 7.218771162153765e-06, + "loss": 2.6967, + "step": 40795 + }, + { + "epoch": 2.5324973617232605, + "grad_norm": 0.14303803406692114, + "learning_rate": 7.216901959921352e-06, + "loss": 2.6986, + "step": 40796 + }, + { + "epoch": 2.532559438823018, + "grad_norm": 0.1345167141880352, + "learning_rate": 7.2150329808987335e-06, + "loss": 2.6281, + "step": 40797 + }, + { + "epoch": 2.5326215159227763, + "grad_norm": 0.15059115305575277, + "learning_rate": 7.213164225095687e-06, + "loss": 2.7207, + "step": 40798 + }, + { + "epoch": 2.5326835930225338, + "grad_norm": 0.14153832963994947, + "learning_rate": 7.211295692521952e-06, + "loss": 2.6788, + "step": 40799 + }, + { + "epoch": 2.532745670122292, + "grad_norm": 0.1340456124764404, + "learning_rate": 7.209427383187273e-06, + "loss": 2.7129, + "step": 40800 + }, + { + "epoch": 2.5328077472220496, + "grad_norm": 0.13964761053520716, + "learning_rate": 7.20755929710139e-06, + "loss": 2.7585, + "step": 40801 + }, + { + "epoch": 2.532869824321808, + "grad_norm": 0.14925292307820331, + "learning_rate": 7.20569143427407e-06, + "loss": 2.6201, + "step": 40802 + }, + { + "epoch": 2.5329319014215654, + "grad_norm": 0.1559751059437646, + "learning_rate": 7.2038237947150424e-06, + "loss": 2.7735, + "step": 40803 + }, + { + "epoch": 2.5329939785213234, + "grad_norm": 0.14207185716604792, + "learning_rate": 7.20195637843406e-06, + "loss": 2.7041, + "step": 40804 + }, + { + "epoch": 2.5330560556210813, + "grad_norm": 0.13360989539980014, + "learning_rate": 7.200089185440856e-06, + "loss": 2.6692, + "step": 40805 + }, + { + "epoch": 2.533118132720839, + "grad_norm": 0.14510604897798288, + "learning_rate": 7.198222215745165e-06, + "loss": 2.7788, + "step": 40806 + }, + { + "epoch": 2.533180209820597, + "grad_norm": 0.1374452091637954, + "learning_rate": 7.196355469356758e-06, + "loss": 2.7782, + "step": 40807 + }, + { + "epoch": 2.533242286920355, + "grad_norm": 0.16065880964184562, + "learning_rate": 7.194488946285349e-06, + "loss": 2.6804, + "step": 40808 + }, + { + "epoch": 2.533304364020113, + "grad_norm": 0.13655361011297698, + "learning_rate": 7.192622646540686e-06, + "loss": 2.6751, + "step": 40809 + }, + { + "epoch": 2.533366441119871, + "grad_norm": 0.14168784045248772, + "learning_rate": 7.19075657013249e-06, + "loss": 2.7446, + "step": 40810 + }, + { + "epoch": 2.5334285182196288, + "grad_norm": 0.14486756373855836, + "learning_rate": 7.188890717070507e-06, + "loss": 2.6944, + "step": 40811 + }, + { + "epoch": 2.5334905953193867, + "grad_norm": 0.14278635731072625, + "learning_rate": 7.187025087364485e-06, + "loss": 2.7448, + "step": 40812 + }, + { + "epoch": 2.5335526724191446, + "grad_norm": 0.14304024128347687, + "learning_rate": 7.185159681024151e-06, + "loss": 2.6364, + "step": 40813 + }, + { + "epoch": 2.5336147495189025, + "grad_norm": 0.13487393408266632, + "learning_rate": 7.183294498059228e-06, + "loss": 2.6531, + "step": 40814 + }, + { + "epoch": 2.5336768266186604, + "grad_norm": 0.1355299284947313, + "learning_rate": 7.181429538479439e-06, + "loss": 2.7273, + "step": 40815 + }, + { + "epoch": 2.5337389037184184, + "grad_norm": 0.1503213747779145, + "learning_rate": 7.179564802294542e-06, + "loss": 2.7778, + "step": 40816 + }, + { + "epoch": 2.5338009808181763, + "grad_norm": 0.14729733897158653, + "learning_rate": 7.177700289514249e-06, + "loss": 2.6781, + "step": 40817 + }, + { + "epoch": 2.533863057917934, + "grad_norm": 0.15979302901114564, + "learning_rate": 7.175836000148295e-06, + "loss": 2.7201, + "step": 40818 + }, + { + "epoch": 2.533925135017692, + "grad_norm": 0.1645103880434316, + "learning_rate": 7.173971934206392e-06, + "loss": 2.6655, + "step": 40819 + }, + { + "epoch": 2.53398721211745, + "grad_norm": 0.1515458682855013, + "learning_rate": 7.172108091698265e-06, + "loss": 2.6992, + "step": 40820 + }, + { + "epoch": 2.534049289217208, + "grad_norm": 0.135500012632665, + "learning_rate": 7.1702444726336585e-06, + "loss": 2.6547, + "step": 40821 + }, + { + "epoch": 2.5341113663169654, + "grad_norm": 0.14651419018269363, + "learning_rate": 7.168381077022285e-06, + "loss": 2.7136, + "step": 40822 + }, + { + "epoch": 2.534173443416724, + "grad_norm": 0.14206303346838353, + "learning_rate": 7.166517904873865e-06, + "loss": 2.6876, + "step": 40823 + }, + { + "epoch": 2.5342355205164813, + "grad_norm": 0.1420919542713009, + "learning_rate": 7.164654956198108e-06, + "loss": 2.7327, + "step": 40824 + }, + { + "epoch": 2.5342975976162396, + "grad_norm": 0.1796621221782693, + "learning_rate": 7.162792231004756e-06, + "loss": 2.6668, + "step": 40825 + }, + { + "epoch": 2.534359674715997, + "grad_norm": 0.17104955803601635, + "learning_rate": 7.160929729303517e-06, + "loss": 2.6801, + "step": 40826 + }, + { + "epoch": 2.534421751815755, + "grad_norm": 0.1359982647956042, + "learning_rate": 7.159067451104107e-06, + "loss": 2.7259, + "step": 40827 + }, + { + "epoch": 2.534483828915513, + "grad_norm": 0.13627333291431437, + "learning_rate": 7.157205396416228e-06, + "loss": 2.6815, + "step": 40828 + }, + { + "epoch": 2.534545906015271, + "grad_norm": 0.13957627305873668, + "learning_rate": 7.155343565249623e-06, + "loss": 2.7042, + "step": 40829 + }, + { + "epoch": 2.5346079831150288, + "grad_norm": 0.13333166554148007, + "learning_rate": 7.153481957613989e-06, + "loss": 2.7027, + "step": 40830 + }, + { + "epoch": 2.5346700602147867, + "grad_norm": 0.1486897896974799, + "learning_rate": 7.151620573519041e-06, + "loss": 2.7172, + "step": 40831 + }, + { + "epoch": 2.5347321373145446, + "grad_norm": 0.13303470955791147, + "learning_rate": 7.149759412974494e-06, + "loss": 2.6948, + "step": 40832 + }, + { + "epoch": 2.5347942144143025, + "grad_norm": 0.13586708196153677, + "learning_rate": 7.1478984759900395e-06, + "loss": 2.668, + "step": 40833 + }, + { + "epoch": 2.5348562915140604, + "grad_norm": 0.13650245587136098, + "learning_rate": 7.1460377625754134e-06, + "loss": 2.6349, + "step": 40834 + }, + { + "epoch": 2.5349183686138184, + "grad_norm": 0.13696987385890363, + "learning_rate": 7.14417727274031e-06, + "loss": 2.7259, + "step": 40835 + }, + { + "epoch": 2.5349804457135763, + "grad_norm": 0.1337802494237097, + "learning_rate": 7.142317006494442e-06, + "loss": 2.7084, + "step": 40836 + }, + { + "epoch": 2.535042522813334, + "grad_norm": 0.13627802111179896, + "learning_rate": 7.1404569638474915e-06, + "loss": 2.7493, + "step": 40837 + }, + { + "epoch": 2.535104599913092, + "grad_norm": 0.13441608543835953, + "learning_rate": 7.138597144809201e-06, + "loss": 2.7625, + "step": 40838 + }, + { + "epoch": 2.53516667701285, + "grad_norm": 0.13494169095233363, + "learning_rate": 7.136737549389249e-06, + "loss": 2.711, + "step": 40839 + }, + { + "epoch": 2.535228754112608, + "grad_norm": 0.1372475207987335, + "learning_rate": 7.134878177597343e-06, + "loss": 2.7717, + "step": 40840 + }, + { + "epoch": 2.535290831212366, + "grad_norm": 0.13559087374490145, + "learning_rate": 7.1330190294431884e-06, + "loss": 2.6525, + "step": 40841 + }, + { + "epoch": 2.535352908312124, + "grad_norm": 0.13559770098413965, + "learning_rate": 7.131160104936463e-06, + "loss": 2.698, + "step": 40842 + }, + { + "epoch": 2.5354149854118817, + "grad_norm": 0.14540804572588079, + "learning_rate": 7.129301404086896e-06, + "loss": 2.6856, + "step": 40843 + }, + { + "epoch": 2.5354770625116396, + "grad_norm": 0.13197448655807403, + "learning_rate": 7.12744292690416e-06, + "loss": 2.6193, + "step": 40844 + }, + { + "epoch": 2.535539139611397, + "grad_norm": 0.14032129101942756, + "learning_rate": 7.125584673397967e-06, + "loss": 2.6055, + "step": 40845 + }, + { + "epoch": 2.5356012167111555, + "grad_norm": 0.14301620097228257, + "learning_rate": 7.1237266435780156e-06, + "loss": 2.7749, + "step": 40846 + }, + { + "epoch": 2.535663293810913, + "grad_norm": 0.13311515711273555, + "learning_rate": 7.1218688374539765e-06, + "loss": 2.6952, + "step": 40847 + }, + { + "epoch": 2.5357253709106713, + "grad_norm": 0.14558517396489712, + "learning_rate": 7.120011255035564e-06, + "loss": 2.6425, + "step": 40848 + }, + { + "epoch": 2.5357874480104288, + "grad_norm": 0.15562260553134172, + "learning_rate": 7.118153896332469e-06, + "loss": 2.6894, + "step": 40849 + }, + { + "epoch": 2.5358495251101867, + "grad_norm": 0.1422411319695388, + "learning_rate": 7.1162967613543685e-06, + "loss": 2.7049, + "step": 40850 + }, + { + "epoch": 2.5359116022099446, + "grad_norm": 0.15383433777452638, + "learning_rate": 7.114439850110949e-06, + "loss": 2.7268, + "step": 40851 + }, + { + "epoch": 2.5359736793097025, + "grad_norm": 0.13130104911403953, + "learning_rate": 7.1125831626119246e-06, + "loss": 2.7515, + "step": 40852 + }, + { + "epoch": 2.5360357564094604, + "grad_norm": 0.13183984397469314, + "learning_rate": 7.11072669886696e-06, + "loss": 2.7417, + "step": 40853 + }, + { + "epoch": 2.5360978335092184, + "grad_norm": 0.13942509243878123, + "learning_rate": 7.108870458885747e-06, + "loss": 2.6805, + "step": 40854 + }, + { + "epoch": 2.5361599106089763, + "grad_norm": 0.14288704210130748, + "learning_rate": 7.107014442677967e-06, + "loss": 2.7403, + "step": 40855 + }, + { + "epoch": 2.536221987708734, + "grad_norm": 0.15692612868228709, + "learning_rate": 7.105158650253297e-06, + "loss": 2.7674, + "step": 40856 + }, + { + "epoch": 2.536284064808492, + "grad_norm": 0.1482280395250052, + "learning_rate": 7.1033030816214315e-06, + "loss": 2.7257, + "step": 40857 + }, + { + "epoch": 2.53634614190825, + "grad_norm": 0.13358335621873862, + "learning_rate": 7.101447736792055e-06, + "loss": 2.6706, + "step": 40858 + }, + { + "epoch": 2.536408219008008, + "grad_norm": 0.15542593850568362, + "learning_rate": 7.099592615774836e-06, + "loss": 2.758, + "step": 40859 + }, + { + "epoch": 2.536470296107766, + "grad_norm": 0.1384233208277336, + "learning_rate": 7.097737718579451e-06, + "loss": 2.6815, + "step": 40860 + }, + { + "epoch": 2.536532373207524, + "grad_norm": 0.14475439218501884, + "learning_rate": 7.09588304521559e-06, + "loss": 2.6847, + "step": 40861 + }, + { + "epoch": 2.5365944503072817, + "grad_norm": 0.16733032146860632, + "learning_rate": 7.094028595692919e-06, + "loss": 2.7979, + "step": 40862 + }, + { + "epoch": 2.5366565274070396, + "grad_norm": 0.13455078027542208, + "learning_rate": 7.09217437002112e-06, + "loss": 2.762, + "step": 40863 + }, + { + "epoch": 2.5367186045067975, + "grad_norm": 0.13251042373384722, + "learning_rate": 7.090320368209857e-06, + "loss": 2.5497, + "step": 40864 + }, + { + "epoch": 2.5367806816065555, + "grad_norm": 0.13936716710946973, + "learning_rate": 7.088466590268816e-06, + "loss": 2.7304, + "step": 40865 + }, + { + "epoch": 2.5368427587063134, + "grad_norm": 0.1316150297983334, + "learning_rate": 7.086613036207662e-06, + "loss": 2.6635, + "step": 40866 + }, + { + "epoch": 2.5369048358060713, + "grad_norm": 0.13702744647220427, + "learning_rate": 7.084759706036065e-06, + "loss": 2.7296, + "step": 40867 + }, + { + "epoch": 2.5369669129058288, + "grad_norm": 0.14895770871980685, + "learning_rate": 7.0829065997636945e-06, + "loss": 2.761, + "step": 40868 + }, + { + "epoch": 2.537028990005587, + "grad_norm": 0.13385177451221417, + "learning_rate": 7.08105371740021e-06, + "loss": 2.7942, + "step": 40869 + }, + { + "epoch": 2.5370910671053446, + "grad_norm": 0.15006435598213105, + "learning_rate": 7.079201058955298e-06, + "loss": 2.7348, + "step": 40870 + }, + { + "epoch": 2.537153144205103, + "grad_norm": 0.14814774566965214, + "learning_rate": 7.0773486244386125e-06, + "loss": 2.6225, + "step": 40871 + }, + { + "epoch": 2.5372152213048604, + "grad_norm": 0.13544701654889618, + "learning_rate": 7.075496413859817e-06, + "loss": 2.7314, + "step": 40872 + }, + { + "epoch": 2.537277298404619, + "grad_norm": 0.13572892460419664, + "learning_rate": 7.073644427228571e-06, + "loss": 2.7051, + "step": 40873 + }, + { + "epoch": 2.5373393755043763, + "grad_norm": 0.13158916407058655, + "learning_rate": 7.071792664554549e-06, + "loss": 2.7167, + "step": 40874 + }, + { + "epoch": 2.537401452604134, + "grad_norm": 0.14073239305366347, + "learning_rate": 7.069941125847407e-06, + "loss": 2.6905, + "step": 40875 + }, + { + "epoch": 2.537463529703892, + "grad_norm": 0.15122008405145457, + "learning_rate": 7.068089811116807e-06, + "loss": 2.6652, + "step": 40876 + }, + { + "epoch": 2.53752560680365, + "grad_norm": 0.1346108473809605, + "learning_rate": 7.066238720372387e-06, + "loss": 2.7063, + "step": 40877 + }, + { + "epoch": 2.537587683903408, + "grad_norm": 0.15528748406747372, + "learning_rate": 7.0643878536238274e-06, + "loss": 2.7215, + "step": 40878 + }, + { + "epoch": 2.537649761003166, + "grad_norm": 0.1351192161780521, + "learning_rate": 7.062537210880788e-06, + "loss": 2.7025, + "step": 40879 + }, + { + "epoch": 2.537711838102924, + "grad_norm": 0.13912139486601338, + "learning_rate": 7.060686792152915e-06, + "loss": 2.6655, + "step": 40880 + }, + { + "epoch": 2.5377739152026817, + "grad_norm": 0.14431089279457782, + "learning_rate": 7.058836597449864e-06, + "loss": 2.6942, + "step": 40881 + }, + { + "epoch": 2.5378359923024396, + "grad_norm": 0.13890221766915073, + "learning_rate": 7.056986626781292e-06, + "loss": 2.6468, + "step": 40882 + }, + { + "epoch": 2.5378980694021975, + "grad_norm": 0.13839186349964727, + "learning_rate": 7.055136880156826e-06, + "loss": 2.7427, + "step": 40883 + }, + { + "epoch": 2.5379601465019554, + "grad_norm": 0.16396508844481175, + "learning_rate": 7.053287357586152e-06, + "loss": 2.6508, + "step": 40884 + }, + { + "epoch": 2.5380222236017134, + "grad_norm": 0.1398848886024215, + "learning_rate": 7.051438059078902e-06, + "loss": 2.6552, + "step": 40885 + }, + { + "epoch": 2.5380843007014713, + "grad_norm": 0.14075825258880567, + "learning_rate": 7.0495889846447285e-06, + "loss": 2.7063, + "step": 40886 + }, + { + "epoch": 2.538146377801229, + "grad_norm": 0.1468217080912173, + "learning_rate": 7.047740134293257e-06, + "loss": 2.7943, + "step": 40887 + }, + { + "epoch": 2.538208454900987, + "grad_norm": 0.13765743762791066, + "learning_rate": 7.045891508034169e-06, + "loss": 2.7698, + "step": 40888 + }, + { + "epoch": 2.538270532000745, + "grad_norm": 0.14436708612344012, + "learning_rate": 7.0440431058770965e-06, + "loss": 2.7918, + "step": 40889 + }, + { + "epoch": 2.538332609100503, + "grad_norm": 0.1450017988878507, + "learning_rate": 7.0421949278316746e-06, + "loss": 2.6335, + "step": 40890 + }, + { + "epoch": 2.538394686200261, + "grad_norm": 0.1525052632729024, + "learning_rate": 7.040346973907547e-06, + "loss": 2.614, + "step": 40891 + }, + { + "epoch": 2.538456763300019, + "grad_norm": 0.14289896934927104, + "learning_rate": 7.038499244114349e-06, + "loss": 2.8008, + "step": 40892 + }, + { + "epoch": 2.5385188403997763, + "grad_norm": 0.32329887284947567, + "learning_rate": 7.036651738461741e-06, + "loss": 2.6862, + "step": 40893 + }, + { + "epoch": 2.5385809174995346, + "grad_norm": 0.1617845532704733, + "learning_rate": 7.034804456959354e-06, + "loss": 2.6184, + "step": 40894 + }, + { + "epoch": 2.538642994599292, + "grad_norm": 0.1324721491306236, + "learning_rate": 7.032957399616818e-06, + "loss": 2.6475, + "step": 40895 + }, + { + "epoch": 2.5387050716990505, + "grad_norm": 0.13203594436426097, + "learning_rate": 7.031110566443766e-06, + "loss": 2.662, + "step": 40896 + }, + { + "epoch": 2.538767148798808, + "grad_norm": 0.14449497360468108, + "learning_rate": 7.02926395744985e-06, + "loss": 2.6703, + "step": 40897 + }, + { + "epoch": 2.538829225898566, + "grad_norm": 0.13736158876801552, + "learning_rate": 7.027417572644701e-06, + "loss": 2.6735, + "step": 40898 + }, + { + "epoch": 2.5388913029983238, + "grad_norm": 0.15924258832324767, + "learning_rate": 7.0255714120379405e-06, + "loss": 2.6792, + "step": 40899 + }, + { + "epoch": 2.5389533800980817, + "grad_norm": 0.13964287540056222, + "learning_rate": 7.023725475639198e-06, + "loss": 2.7176, + "step": 40900 + }, + { + "epoch": 2.5390154571978396, + "grad_norm": 0.15054855476396445, + "learning_rate": 7.021879763458117e-06, + "loss": 2.6908, + "step": 40901 + }, + { + "epoch": 2.5390775342975975, + "grad_norm": 0.22444517519210158, + "learning_rate": 7.020034275504328e-06, + "loss": 2.6873, + "step": 40902 + }, + { + "epoch": 2.5391396113973554, + "grad_norm": 0.14853261736042386, + "learning_rate": 7.018189011787457e-06, + "loss": 2.7537, + "step": 40903 + }, + { + "epoch": 2.5392016884971134, + "grad_norm": 0.15264316625761107, + "learning_rate": 7.016343972317124e-06, + "loss": 2.6895, + "step": 40904 + }, + { + "epoch": 2.5392637655968713, + "grad_norm": 0.1546435385186077, + "learning_rate": 7.01449915710295e-06, + "loss": 2.7438, + "step": 40905 + }, + { + "epoch": 2.539325842696629, + "grad_norm": 0.13699173920979288, + "learning_rate": 7.0126545661545805e-06, + "loss": 2.6696, + "step": 40906 + }, + { + "epoch": 2.539387919796387, + "grad_norm": 0.13691760315717985, + "learning_rate": 7.010810199481621e-06, + "loss": 2.7437, + "step": 40907 + }, + { + "epoch": 2.539449996896145, + "grad_norm": 0.16240972822126368, + "learning_rate": 7.008966057093707e-06, + "loss": 2.7711, + "step": 40908 + }, + { + "epoch": 2.539512073995903, + "grad_norm": 0.14361553459625317, + "learning_rate": 7.007122139000444e-06, + "loss": 2.6305, + "step": 40909 + }, + { + "epoch": 2.539574151095661, + "grad_norm": 0.14514778707941559, + "learning_rate": 7.005278445211455e-06, + "loss": 2.6599, + "step": 40910 + }, + { + "epoch": 2.539636228195419, + "grad_norm": 0.1415982739296874, + "learning_rate": 7.003434975736383e-06, + "loss": 2.7542, + "step": 40911 + }, + { + "epoch": 2.5396983052951767, + "grad_norm": 0.1524226522303659, + "learning_rate": 7.001591730584827e-06, + "loss": 2.7273, + "step": 40912 + }, + { + "epoch": 2.5397603823949346, + "grad_norm": 0.14579321246932564, + "learning_rate": 6.999748709766402e-06, + "loss": 2.6697, + "step": 40913 + }, + { + "epoch": 2.5398224594946925, + "grad_norm": 0.14923354694638774, + "learning_rate": 6.99790591329072e-06, + "loss": 2.6722, + "step": 40914 + }, + { + "epoch": 2.5398845365944505, + "grad_norm": 0.1637577564462708, + "learning_rate": 6.996063341167414e-06, + "loss": 2.7637, + "step": 40915 + }, + { + "epoch": 2.539946613694208, + "grad_norm": 0.13592466260869213, + "learning_rate": 6.994220993406081e-06, + "loss": 2.7812, + "step": 40916 + }, + { + "epoch": 2.5400086907939663, + "grad_norm": 0.1365513457886717, + "learning_rate": 6.992378870016336e-06, + "loss": 2.7727, + "step": 40917 + }, + { + "epoch": 2.5400707678937238, + "grad_norm": 0.13616215623587896, + "learning_rate": 6.990536971007794e-06, + "loss": 2.6393, + "step": 40918 + }, + { + "epoch": 2.540132844993482, + "grad_norm": 0.1405167626697862, + "learning_rate": 6.988695296390046e-06, + "loss": 2.8143, + "step": 40919 + }, + { + "epoch": 2.5401949220932396, + "grad_norm": 0.14741500240088762, + "learning_rate": 6.986853846172736e-06, + "loss": 2.6742, + "step": 40920 + }, + { + "epoch": 2.540256999192998, + "grad_norm": 0.1409273407503491, + "learning_rate": 6.985012620365444e-06, + "loss": 2.7635, + "step": 40921 + }, + { + "epoch": 2.5403190762927554, + "grad_norm": 0.1368154252076356, + "learning_rate": 6.983171618977785e-06, + "loss": 2.7396, + "step": 40922 + }, + { + "epoch": 2.5403811533925134, + "grad_norm": 0.13633955412998056, + "learning_rate": 6.9813308420193516e-06, + "loss": 2.8519, + "step": 40923 + }, + { + "epoch": 2.5404432304922713, + "grad_norm": 0.13212184280946943, + "learning_rate": 6.979490289499769e-06, + "loss": 2.6372, + "step": 40924 + }, + { + "epoch": 2.540505307592029, + "grad_norm": 0.1538607493784986, + "learning_rate": 6.9776499614286285e-06, + "loss": 2.7538, + "step": 40925 + }, + { + "epoch": 2.540567384691787, + "grad_norm": 0.15144803748976626, + "learning_rate": 6.975809857815535e-06, + "loss": 2.6908, + "step": 40926 + }, + { + "epoch": 2.540629461791545, + "grad_norm": 0.1380535430355622, + "learning_rate": 6.973969978670086e-06, + "loss": 2.7327, + "step": 40927 + }, + { + "epoch": 2.540691538891303, + "grad_norm": 0.17232472472087934, + "learning_rate": 6.9721303240018686e-06, + "loss": 2.7485, + "step": 40928 + }, + { + "epoch": 2.540753615991061, + "grad_norm": 0.1368211635113769, + "learning_rate": 6.970290893820502e-06, + "loss": 2.7095, + "step": 40929 + }, + { + "epoch": 2.540815693090819, + "grad_norm": 0.1838713029182471, + "learning_rate": 6.968451688135575e-06, + "loss": 2.8284, + "step": 40930 + }, + { + "epoch": 2.5408777701905767, + "grad_norm": 0.13915221208350328, + "learning_rate": 6.966612706956682e-06, + "loss": 2.5779, + "step": 40931 + }, + { + "epoch": 2.5409398472903346, + "grad_norm": 0.13785732829055344, + "learning_rate": 6.9647739502934016e-06, + "loss": 2.6636, + "step": 40932 + }, + { + "epoch": 2.5410019243900925, + "grad_norm": 0.13709581693904585, + "learning_rate": 6.962935418155359e-06, + "loss": 2.7841, + "step": 40933 + }, + { + "epoch": 2.5410640014898505, + "grad_norm": 0.13097155445648817, + "learning_rate": 6.961097110552128e-06, + "loss": 2.6532, + "step": 40934 + }, + { + "epoch": 2.5411260785896084, + "grad_norm": 0.13533951016225684, + "learning_rate": 6.959259027493304e-06, + "loss": 2.6386, + "step": 40935 + }, + { + "epoch": 2.5411881556893663, + "grad_norm": 0.1337609783684947, + "learning_rate": 6.95742116898846e-06, + "loss": 2.6874, + "step": 40936 + }, + { + "epoch": 2.541250232789124, + "grad_norm": 0.1389717365734875, + "learning_rate": 6.955583535047211e-06, + "loss": 2.6876, + "step": 40937 + }, + { + "epoch": 2.541312309888882, + "grad_norm": 0.1367799417613136, + "learning_rate": 6.953746125679134e-06, + "loss": 2.6714, + "step": 40938 + }, + { + "epoch": 2.54137438698864, + "grad_norm": 0.1325303615116631, + "learning_rate": 6.951908940893809e-06, + "loss": 2.6356, + "step": 40939 + }, + { + "epoch": 2.541436464088398, + "grad_norm": 0.1577449450982747, + "learning_rate": 6.9500719807008285e-06, + "loss": 2.6978, + "step": 40940 + }, + { + "epoch": 2.5414985411881554, + "grad_norm": 0.15806921248099648, + "learning_rate": 6.948235245109763e-06, + "loss": 2.6912, + "step": 40941 + }, + { + "epoch": 2.541560618287914, + "grad_norm": 0.16141149523429782, + "learning_rate": 6.946398734130199e-06, + "loss": 2.6559, + "step": 40942 + }, + { + "epoch": 2.5416226953876713, + "grad_norm": 0.14161346648592962, + "learning_rate": 6.94456244777174e-06, + "loss": 2.6801, + "step": 40943 + }, + { + "epoch": 2.5416847724874296, + "grad_norm": 0.13555532396132045, + "learning_rate": 6.942726386043952e-06, + "loss": 2.6386, + "step": 40944 + }, + { + "epoch": 2.541746849587187, + "grad_norm": 0.13603250072376613, + "learning_rate": 6.940890548956414e-06, + "loss": 2.6356, + "step": 40945 + }, + { + "epoch": 2.541808926686945, + "grad_norm": 0.13574749092335192, + "learning_rate": 6.939054936518691e-06, + "loss": 2.7096, + "step": 40946 + }, + { + "epoch": 2.541871003786703, + "grad_norm": 0.14188065483990459, + "learning_rate": 6.937219548740381e-06, + "loss": 2.6782, + "step": 40947 + }, + { + "epoch": 2.541933080886461, + "grad_norm": 0.14343007186903806, + "learning_rate": 6.93538438563105e-06, + "loss": 2.6702, + "step": 40948 + }, + { + "epoch": 2.541995157986219, + "grad_norm": 0.14730203036893685, + "learning_rate": 6.933549447200277e-06, + "loss": 2.7328, + "step": 40949 + }, + { + "epoch": 2.5420572350859767, + "grad_norm": 0.13876110886229634, + "learning_rate": 6.931714733457617e-06, + "loss": 2.6705, + "step": 40950 + }, + { + "epoch": 2.5421193121857346, + "grad_norm": 0.15725363030435383, + "learning_rate": 6.929880244412668e-06, + "loss": 2.6965, + "step": 40951 + }, + { + "epoch": 2.5421813892854925, + "grad_norm": 0.13899784996321832, + "learning_rate": 6.928045980074988e-06, + "loss": 2.7046, + "step": 40952 + }, + { + "epoch": 2.5422434663852504, + "grad_norm": 0.15574369057149368, + "learning_rate": 6.926211940454147e-06, + "loss": 2.7968, + "step": 40953 + }, + { + "epoch": 2.5423055434850084, + "grad_norm": 0.14326025163124217, + "learning_rate": 6.924378125559716e-06, + "loss": 2.7954, + "step": 40954 + }, + { + "epoch": 2.5423676205847663, + "grad_norm": 0.13482612136588107, + "learning_rate": 6.922544535401254e-06, + "loss": 2.6741, + "step": 40955 + }, + { + "epoch": 2.542429697684524, + "grad_norm": 0.14039211725621126, + "learning_rate": 6.9207111699883424e-06, + "loss": 2.615, + "step": 40956 + }, + { + "epoch": 2.542491774784282, + "grad_norm": 0.14646843505599252, + "learning_rate": 6.918878029330539e-06, + "loss": 2.7536, + "step": 40957 + }, + { + "epoch": 2.54255385188404, + "grad_norm": 0.13226767595281633, + "learning_rate": 6.917045113437409e-06, + "loss": 2.771, + "step": 40958 + }, + { + "epoch": 2.542615928983798, + "grad_norm": 0.13486154570350178, + "learning_rate": 6.915212422318501e-06, + "loss": 2.7429, + "step": 40959 + }, + { + "epoch": 2.542678006083556, + "grad_norm": 0.15657616190207427, + "learning_rate": 6.913379955983401e-06, + "loss": 2.7016, + "step": 40960 + }, + { + "epoch": 2.542740083183314, + "grad_norm": 0.1337760793693162, + "learning_rate": 6.911547714441657e-06, + "loss": 2.7072, + "step": 40961 + }, + { + "epoch": 2.5428021602830717, + "grad_norm": 0.15228979671424375, + "learning_rate": 6.909715697702824e-06, + "loss": 2.6821, + "step": 40962 + }, + { + "epoch": 2.5428642373828296, + "grad_norm": 0.1463864571414414, + "learning_rate": 6.907883905776458e-06, + "loss": 2.7588, + "step": 40963 + }, + { + "epoch": 2.542926314482587, + "grad_norm": 0.15052950626738906, + "learning_rate": 6.906052338672136e-06, + "loss": 2.7075, + "step": 40964 + }, + { + "epoch": 2.5429883915823455, + "grad_norm": 0.15062183714153682, + "learning_rate": 6.904220996399397e-06, + "loss": 2.7385, + "step": 40965 + }, + { + "epoch": 2.543050468682103, + "grad_norm": 0.13733560504349027, + "learning_rate": 6.902389878967796e-06, + "loss": 2.6991, + "step": 40966 + }, + { + "epoch": 2.5431125457818613, + "grad_norm": 0.14281421866937447, + "learning_rate": 6.9005589863868956e-06, + "loss": 2.7258, + "step": 40967 + }, + { + "epoch": 2.5431746228816188, + "grad_norm": 0.14135357418325006, + "learning_rate": 6.8987283186662254e-06, + "loss": 2.6638, + "step": 40968 + }, + { + "epoch": 2.543236699981377, + "grad_norm": 0.14442419907539464, + "learning_rate": 6.896897875815367e-06, + "loss": 2.7396, + "step": 40969 + }, + { + "epoch": 2.5432987770811346, + "grad_norm": 0.1643172114742586, + "learning_rate": 6.895067657843857e-06, + "loss": 2.6503, + "step": 40970 + }, + { + "epoch": 2.5433608541808925, + "grad_norm": 0.14116349696266117, + "learning_rate": 6.893237664761243e-06, + "loss": 2.751, + "step": 40971 + }, + { + "epoch": 2.5434229312806504, + "grad_norm": 0.14007477271754154, + "learning_rate": 6.891407896577057e-06, + "loss": 2.6808, + "step": 40972 + }, + { + "epoch": 2.5434850083804084, + "grad_norm": 0.1487208848596818, + "learning_rate": 6.8895783533008794e-06, + "loss": 2.801, + "step": 40973 + }, + { + "epoch": 2.5435470854801663, + "grad_norm": 0.14099786916096657, + "learning_rate": 6.887749034942232e-06, + "loss": 2.7719, + "step": 40974 + }, + { + "epoch": 2.543609162579924, + "grad_norm": 0.1440077657840752, + "learning_rate": 6.885919941510655e-06, + "loss": 2.6595, + "step": 40975 + }, + { + "epoch": 2.543671239679682, + "grad_norm": 0.13911132625045589, + "learning_rate": 6.884091073015708e-06, + "loss": 2.7677, + "step": 40976 + }, + { + "epoch": 2.54373331677944, + "grad_norm": 0.15680962124436504, + "learning_rate": 6.882262429466924e-06, + "loss": 2.7902, + "step": 40977 + }, + { + "epoch": 2.543795393879198, + "grad_norm": 0.14048873736889397, + "learning_rate": 6.880434010873837e-06, + "loss": 2.6603, + "step": 40978 + }, + { + "epoch": 2.543857470978956, + "grad_norm": 0.1363936987954539, + "learning_rate": 6.878605817246004e-06, + "loss": 2.7081, + "step": 40979 + }, + { + "epoch": 2.543919548078714, + "grad_norm": 0.14354308723758114, + "learning_rate": 6.876777848592947e-06, + "loss": 2.7111, + "step": 40980 + }, + { + "epoch": 2.5439816251784717, + "grad_norm": 0.14658091225092132, + "learning_rate": 6.874950104924216e-06, + "loss": 2.7395, + "step": 40981 + }, + { + "epoch": 2.5440437022782296, + "grad_norm": 0.13902039453017143, + "learning_rate": 6.87312258624932e-06, + "loss": 2.7347, + "step": 40982 + }, + { + "epoch": 2.5441057793779875, + "grad_norm": 0.13605884282800465, + "learning_rate": 6.871295292577828e-06, + "loss": 2.6734, + "step": 40983 + }, + { + "epoch": 2.5441678564777455, + "grad_norm": 0.14024419351702536, + "learning_rate": 6.869468223919262e-06, + "loss": 2.7485, + "step": 40984 + }, + { + "epoch": 2.5442299335775034, + "grad_norm": 0.14066863707498353, + "learning_rate": 6.867641380283146e-06, + "loss": 2.6558, + "step": 40985 + }, + { + "epoch": 2.5442920106772613, + "grad_norm": 0.13602618826657548, + "learning_rate": 6.865814761679007e-06, + "loss": 2.6968, + "step": 40986 + }, + { + "epoch": 2.544354087777019, + "grad_norm": 0.1437111864349623, + "learning_rate": 6.8639883681163976e-06, + "loss": 2.6577, + "step": 40987 + }, + { + "epoch": 2.544416164876777, + "grad_norm": 0.1401340138775767, + "learning_rate": 6.862162199604827e-06, + "loss": 2.7143, + "step": 40988 + }, + { + "epoch": 2.5444782419765346, + "grad_norm": 0.14244487493034913, + "learning_rate": 6.860336256153827e-06, + "loss": 2.7345, + "step": 40989 + }, + { + "epoch": 2.544540319076293, + "grad_norm": 0.14963344599453732, + "learning_rate": 6.858510537772933e-06, + "loss": 2.7566, + "step": 40990 + }, + { + "epoch": 2.5446023961760504, + "grad_norm": 0.13575922907057128, + "learning_rate": 6.856685044471644e-06, + "loss": 2.6437, + "step": 40991 + }, + { + "epoch": 2.544664473275809, + "grad_norm": 0.13812735933127135, + "learning_rate": 6.854859776259515e-06, + "loss": 2.7692, + "step": 40992 + }, + { + "epoch": 2.5447265503755663, + "grad_norm": 0.1365779765042854, + "learning_rate": 6.853034733146057e-06, + "loss": 2.6894, + "step": 40993 + }, + { + "epoch": 2.544788627475324, + "grad_norm": 0.14441514506719752, + "learning_rate": 6.851209915140794e-06, + "loss": 2.7291, + "step": 40994 + }, + { + "epoch": 2.544850704575082, + "grad_norm": 0.16759919387579703, + "learning_rate": 6.849385322253232e-06, + "loss": 2.7143, + "step": 40995 + }, + { + "epoch": 2.54491278167484, + "grad_norm": 0.13313792834166008, + "learning_rate": 6.8475609544929096e-06, + "loss": 2.7093, + "step": 40996 + }, + { + "epoch": 2.544974858774598, + "grad_norm": 0.1352557240718602, + "learning_rate": 6.8457368118693414e-06, + "loss": 2.7687, + "step": 40997 + }, + { + "epoch": 2.545036935874356, + "grad_norm": 0.14039054505479137, + "learning_rate": 6.843912894392035e-06, + "loss": 2.6614, + "step": 40998 + }, + { + "epoch": 2.545099012974114, + "grad_norm": 0.1371631755857761, + "learning_rate": 6.842089202070507e-06, + "loss": 2.6521, + "step": 40999 + }, + { + "epoch": 2.5451610900738717, + "grad_norm": 0.14227574434484516, + "learning_rate": 6.8402657349142806e-06, + "loss": 2.7356, + "step": 41000 + }, + { + "epoch": 2.5452231671736296, + "grad_norm": 0.14438041987505176, + "learning_rate": 6.838442492932867e-06, + "loss": 2.6975, + "step": 41001 + }, + { + "epoch": 2.5452852442733875, + "grad_norm": 0.14006825678798415, + "learning_rate": 6.83661947613578e-06, + "loss": 2.6576, + "step": 41002 + }, + { + "epoch": 2.5453473213731455, + "grad_norm": 0.1339457909549635, + "learning_rate": 6.834796684532529e-06, + "loss": 2.6876, + "step": 41003 + }, + { + "epoch": 2.5454093984729034, + "grad_norm": 0.13833192338665398, + "learning_rate": 6.832974118132606e-06, + "loss": 2.7529, + "step": 41004 + }, + { + "epoch": 2.5454714755726613, + "grad_norm": 0.13223168350513242, + "learning_rate": 6.831151776945549e-06, + "loss": 2.7032, + "step": 41005 + }, + { + "epoch": 2.545533552672419, + "grad_norm": 0.14167295826470117, + "learning_rate": 6.829329660980854e-06, + "loss": 2.7961, + "step": 41006 + }, + { + "epoch": 2.545595629772177, + "grad_norm": 0.14321941356975493, + "learning_rate": 6.827507770248026e-06, + "loss": 2.7639, + "step": 41007 + }, + { + "epoch": 2.545657706871935, + "grad_norm": 0.13743057909796597, + "learning_rate": 6.825686104756556e-06, + "loss": 2.7905, + "step": 41008 + }, + { + "epoch": 2.545719783971693, + "grad_norm": 0.155771936781033, + "learning_rate": 6.8238646645159665e-06, + "loss": 2.7477, + "step": 41009 + }, + { + "epoch": 2.545781861071451, + "grad_norm": 0.17755465800965342, + "learning_rate": 6.82204344953577e-06, + "loss": 2.6662, + "step": 41010 + }, + { + "epoch": 2.545843938171209, + "grad_norm": 0.14369524348243887, + "learning_rate": 6.820222459825449e-06, + "loss": 2.7019, + "step": 41011 + }, + { + "epoch": 2.5459060152709663, + "grad_norm": 0.14213807640881368, + "learning_rate": 6.81840169539451e-06, + "loss": 2.6491, + "step": 41012 + }, + { + "epoch": 2.5459680923707246, + "grad_norm": 0.13789114317107706, + "learning_rate": 6.816581156252444e-06, + "loss": 2.6928, + "step": 41013 + }, + { + "epoch": 2.546030169470482, + "grad_norm": 0.1528264005068597, + "learning_rate": 6.814760842408768e-06, + "loss": 2.6369, + "step": 41014 + }, + { + "epoch": 2.5460922465702405, + "grad_norm": 0.14889084453088192, + "learning_rate": 6.812940753872965e-06, + "loss": 2.7658, + "step": 41015 + }, + { + "epoch": 2.546154323669998, + "grad_norm": 0.15040413065125938, + "learning_rate": 6.811120890654538e-06, + "loss": 2.7092, + "step": 41016 + }, + { + "epoch": 2.5462164007697563, + "grad_norm": 0.13957344389038967, + "learning_rate": 6.8093012527629785e-06, + "loss": 2.7672, + "step": 41017 + }, + { + "epoch": 2.546278477869514, + "grad_norm": 0.1306928956360561, + "learning_rate": 6.8074818402077634e-06, + "loss": 2.6626, + "step": 41018 + }, + { + "epoch": 2.5463405549692717, + "grad_norm": 0.13851049596603443, + "learning_rate": 6.805662652998418e-06, + "loss": 2.6205, + "step": 41019 + }, + { + "epoch": 2.5464026320690296, + "grad_norm": 0.14332167860010642, + "learning_rate": 6.8038436911444125e-06, + "loss": 2.7642, + "step": 41020 + }, + { + "epoch": 2.5464647091687875, + "grad_norm": 0.14840987851993911, + "learning_rate": 6.802024954655239e-06, + "loss": 2.7362, + "step": 41021 + }, + { + "epoch": 2.5465267862685455, + "grad_norm": 0.14142018902820683, + "learning_rate": 6.80020644354038e-06, + "loss": 2.7363, + "step": 41022 + }, + { + "epoch": 2.5465888633683034, + "grad_norm": 0.13886243863040462, + "learning_rate": 6.7983881578093376e-06, + "loss": 2.7447, + "step": 41023 + }, + { + "epoch": 2.5466509404680613, + "grad_norm": 0.14185131015955785, + "learning_rate": 6.7965700974716e-06, + "loss": 2.6848, + "step": 41024 + }, + { + "epoch": 2.546713017567819, + "grad_norm": 0.1377704902119529, + "learning_rate": 6.794752262536636e-06, + "loss": 2.6586, + "step": 41025 + }, + { + "epoch": 2.546775094667577, + "grad_norm": 0.14804631059483664, + "learning_rate": 6.792934653013944e-06, + "loss": 2.7647, + "step": 41026 + }, + { + "epoch": 2.546837171767335, + "grad_norm": 0.15226219512967493, + "learning_rate": 6.7911172689129885e-06, + "loss": 2.6628, + "step": 41027 + }, + { + "epoch": 2.546899248867093, + "grad_norm": 0.12945900516358194, + "learning_rate": 6.789300110243269e-06, + "loss": 2.6818, + "step": 41028 + }, + { + "epoch": 2.546961325966851, + "grad_norm": 0.13555915760379972, + "learning_rate": 6.787483177014264e-06, + "loss": 2.6862, + "step": 41029 + }, + { + "epoch": 2.547023403066609, + "grad_norm": 0.1342998763165682, + "learning_rate": 6.785666469235452e-06, + "loss": 2.6105, + "step": 41030 + }, + { + "epoch": 2.5470854801663667, + "grad_norm": 0.13422957524352844, + "learning_rate": 6.7838499869162965e-06, + "loss": 2.6949, + "step": 41031 + }, + { + "epoch": 2.5471475572661246, + "grad_norm": 0.1507731687524702, + "learning_rate": 6.782033730066295e-06, + "loss": 2.756, + "step": 41032 + }, + { + "epoch": 2.5472096343658825, + "grad_norm": 0.14621668197676035, + "learning_rate": 6.780217698694918e-06, + "loss": 2.7051, + "step": 41033 + }, + { + "epoch": 2.5472717114656405, + "grad_norm": 0.15174815320237633, + "learning_rate": 6.778401892811637e-06, + "loss": 2.6305, + "step": 41034 + }, + { + "epoch": 2.5473337885653984, + "grad_norm": 0.13747277098591756, + "learning_rate": 6.776586312425914e-06, + "loss": 2.7452, + "step": 41035 + }, + { + "epoch": 2.5473958656651563, + "grad_norm": 0.13790886463856505, + "learning_rate": 6.774770957547244e-06, + "loss": 2.7451, + "step": 41036 + }, + { + "epoch": 2.5474579427649138, + "grad_norm": 0.14141975438457635, + "learning_rate": 6.77295582818509e-06, + "loss": 2.7047, + "step": 41037 + }, + { + "epoch": 2.547520019864672, + "grad_norm": 0.13336915578658423, + "learning_rate": 6.7711409243489175e-06, + "loss": 2.6322, + "step": 41038 + }, + { + "epoch": 2.5475820969644296, + "grad_norm": 0.13747552144464528, + "learning_rate": 6.769326246048191e-06, + "loss": 2.7623, + "step": 41039 + }, + { + "epoch": 2.547644174064188, + "grad_norm": 0.1401349775359011, + "learning_rate": 6.7675117932923804e-06, + "loss": 2.7525, + "step": 41040 + }, + { + "epoch": 2.5477062511639454, + "grad_norm": 0.1390859305541353, + "learning_rate": 6.765697566090956e-06, + "loss": 2.6911, + "step": 41041 + }, + { + "epoch": 2.5477683282637034, + "grad_norm": 0.13452400440548776, + "learning_rate": 6.763883564453389e-06, + "loss": 2.6485, + "step": 41042 + }, + { + "epoch": 2.5478304053634613, + "grad_norm": 0.14811746705895706, + "learning_rate": 6.7620697883891415e-06, + "loss": 2.6368, + "step": 41043 + }, + { + "epoch": 2.547892482463219, + "grad_norm": 0.1431975886184669, + "learning_rate": 6.760256237907669e-06, + "loss": 2.8274, + "step": 41044 + }, + { + "epoch": 2.547954559562977, + "grad_norm": 0.13810977379346098, + "learning_rate": 6.758442913018426e-06, + "loss": 2.7209, + "step": 41045 + }, + { + "epoch": 2.548016636662735, + "grad_norm": 0.15128654669953234, + "learning_rate": 6.756629813730897e-06, + "loss": 2.7661, + "step": 41046 + }, + { + "epoch": 2.548078713762493, + "grad_norm": 0.15602148153920284, + "learning_rate": 6.754816940054526e-06, + "loss": 2.8411, + "step": 41047 + }, + { + "epoch": 2.548140790862251, + "grad_norm": 0.15063120437443472, + "learning_rate": 6.753004291998777e-06, + "loss": 2.6656, + "step": 41048 + }, + { + "epoch": 2.548202867962009, + "grad_norm": 0.13995797857105385, + "learning_rate": 6.751191869573087e-06, + "loss": 2.5877, + "step": 41049 + }, + { + "epoch": 2.5482649450617667, + "grad_norm": 0.15032141195116921, + "learning_rate": 6.749379672786943e-06, + "loss": 2.7652, + "step": 41050 + }, + { + "epoch": 2.5483270221615246, + "grad_norm": 0.14359098092440417, + "learning_rate": 6.747567701649782e-06, + "loss": 2.7446, + "step": 41051 + }, + { + "epoch": 2.5483890992612825, + "grad_norm": 0.13541728869319763, + "learning_rate": 6.745755956171057e-06, + "loss": 2.6831, + "step": 41052 + }, + { + "epoch": 2.5484511763610405, + "grad_norm": 0.13838808813975387, + "learning_rate": 6.743944436360228e-06, + "loss": 2.7489, + "step": 41053 + }, + { + "epoch": 2.5485132534607984, + "grad_norm": 0.13741325004363916, + "learning_rate": 6.742133142226731e-06, + "loss": 2.724, + "step": 41054 + }, + { + "epoch": 2.5485753305605563, + "grad_norm": 0.14878058312476344, + "learning_rate": 6.7403220737800364e-06, + "loss": 2.7956, + "step": 41055 + }, + { + "epoch": 2.548637407660314, + "grad_norm": 0.13699104577277552, + "learning_rate": 6.7385112310295815e-06, + "loss": 2.6201, + "step": 41056 + }, + { + "epoch": 2.548699484760072, + "grad_norm": 0.14160834329088898, + "learning_rate": 6.73670061398482e-06, + "loss": 2.7597, + "step": 41057 + }, + { + "epoch": 2.54876156185983, + "grad_norm": 0.13785166014281156, + "learning_rate": 6.7348902226551825e-06, + "loss": 2.6945, + "step": 41058 + }, + { + "epoch": 2.548823638959588, + "grad_norm": 0.13492251675999287, + "learning_rate": 6.733080057050134e-06, + "loss": 2.7431, + "step": 41059 + }, + { + "epoch": 2.5488857160593454, + "grad_norm": 0.1397269635065439, + "learning_rate": 6.731270117179111e-06, + "loss": 2.6672, + "step": 41060 + }, + { + "epoch": 2.548947793159104, + "grad_norm": 0.16234356842544873, + "learning_rate": 6.729460403051557e-06, + "loss": 2.7209, + "step": 41061 + }, + { + "epoch": 2.5490098702588613, + "grad_norm": 0.13687534900516277, + "learning_rate": 6.7276509146769085e-06, + "loss": 2.616, + "step": 41062 + }, + { + "epoch": 2.5490719473586196, + "grad_norm": 0.1361368329235753, + "learning_rate": 6.7258416520646015e-06, + "loss": 2.7431, + "step": 41063 + }, + { + "epoch": 2.549134024458377, + "grad_norm": 0.13808419389716525, + "learning_rate": 6.724032615224096e-06, + "loss": 2.7619, + "step": 41064 + }, + { + "epoch": 2.5491961015581355, + "grad_norm": 0.13163854869031422, + "learning_rate": 6.7222238041648175e-06, + "loss": 2.7154, + "step": 41065 + }, + { + "epoch": 2.549258178657893, + "grad_norm": 0.14960719074616924, + "learning_rate": 6.720415218896198e-06, + "loss": 2.6716, + "step": 41066 + }, + { + "epoch": 2.549320255757651, + "grad_norm": 0.13406077867349872, + "learning_rate": 6.718606859427673e-06, + "loss": 2.6851, + "step": 41067 + }, + { + "epoch": 2.549382332857409, + "grad_norm": 0.13649518262262508, + "learning_rate": 6.716798725768697e-06, + "loss": 2.6591, + "step": 41068 + }, + { + "epoch": 2.5494444099571667, + "grad_norm": 0.13415194117648377, + "learning_rate": 6.7149908179286845e-06, + "loss": 2.628, + "step": 41069 + }, + { + "epoch": 2.5495064870569246, + "grad_norm": 0.14011297124427274, + "learning_rate": 6.713183135917073e-06, + "loss": 2.7148, + "step": 41070 + }, + { + "epoch": 2.5495685641566825, + "grad_norm": 0.13977993596024532, + "learning_rate": 6.711375679743282e-06, + "loss": 2.7225, + "step": 41071 + }, + { + "epoch": 2.5496306412564405, + "grad_norm": 0.13500948279904754, + "learning_rate": 6.709568449416764e-06, + "loss": 2.6943, + "step": 41072 + }, + { + "epoch": 2.5496927183561984, + "grad_norm": 0.14412167225525283, + "learning_rate": 6.70776144494693e-06, + "loss": 2.7442, + "step": 41073 + }, + { + "epoch": 2.5497547954559563, + "grad_norm": 0.13649867601372614, + "learning_rate": 6.705954666343222e-06, + "loss": 2.7755, + "step": 41074 + }, + { + "epoch": 2.549816872555714, + "grad_norm": 0.1804162155410759, + "learning_rate": 6.7041481136150554e-06, + "loss": 2.746, + "step": 41075 + }, + { + "epoch": 2.549878949655472, + "grad_norm": 0.19615253046230663, + "learning_rate": 6.7023417867718596e-06, + "loss": 2.7568, + "step": 41076 + }, + { + "epoch": 2.54994102675523, + "grad_norm": 0.13615348992736043, + "learning_rate": 6.70053568582305e-06, + "loss": 2.7236, + "step": 41077 + }, + { + "epoch": 2.550003103854988, + "grad_norm": 0.1583539808659751, + "learning_rate": 6.698729810778065e-06, + "loss": 2.6993, + "step": 41078 + }, + { + "epoch": 2.550065180954746, + "grad_norm": 0.140479482754165, + "learning_rate": 6.696924161646323e-06, + "loss": 2.6827, + "step": 41079 + }, + { + "epoch": 2.550127258054504, + "grad_norm": 0.13981412515177477, + "learning_rate": 6.695118738437234e-06, + "loss": 2.6834, + "step": 41080 + }, + { + "epoch": 2.5501893351542617, + "grad_norm": 0.18343413293013078, + "learning_rate": 6.693313541160218e-06, + "loss": 2.7781, + "step": 41081 + }, + { + "epoch": 2.5502514122540196, + "grad_norm": 0.17146976339180836, + "learning_rate": 6.691508569824706e-06, + "loss": 2.8249, + "step": 41082 + }, + { + "epoch": 2.5503134893537776, + "grad_norm": 0.1525467168570186, + "learning_rate": 6.689703824440102e-06, + "loss": 2.722, + "step": 41083 + }, + { + "epoch": 2.5503755664535355, + "grad_norm": 0.13766092858607018, + "learning_rate": 6.687899305015832e-06, + "loss": 2.6618, + "step": 41084 + }, + { + "epoch": 2.550437643553293, + "grad_norm": 0.13531855473836624, + "learning_rate": 6.6860950115613e-06, + "loss": 2.7441, + "step": 41085 + }, + { + "epoch": 2.5504997206530513, + "grad_norm": 0.14348005443740727, + "learning_rate": 6.6842909440859305e-06, + "loss": 2.6931, + "step": 41086 + }, + { + "epoch": 2.550561797752809, + "grad_norm": 0.1369618434491984, + "learning_rate": 6.6824871025991276e-06, + "loss": 2.698, + "step": 41087 + }, + { + "epoch": 2.550623874852567, + "grad_norm": 0.14664458732352922, + "learning_rate": 6.680683487110312e-06, + "loss": 2.6311, + "step": 41088 + }, + { + "epoch": 2.5506859519523246, + "grad_norm": 0.15824159107458396, + "learning_rate": 6.678880097628881e-06, + "loss": 2.7223, + "step": 41089 + }, + { + "epoch": 2.5507480290520825, + "grad_norm": 0.15027978453830904, + "learning_rate": 6.677076934164239e-06, + "loss": 2.6808, + "step": 41090 + }, + { + "epoch": 2.5508101061518405, + "grad_norm": 0.14088076274139905, + "learning_rate": 6.675273996725817e-06, + "loss": 2.7227, + "step": 41091 + }, + { + "epoch": 2.5508721832515984, + "grad_norm": 0.13815256046631194, + "learning_rate": 6.673471285323007e-06, + "loss": 2.6297, + "step": 41092 + }, + { + "epoch": 2.5509342603513563, + "grad_norm": 0.13451535025732622, + "learning_rate": 6.6716687999652135e-06, + "loss": 2.6741, + "step": 41093 + }, + { + "epoch": 2.550996337451114, + "grad_norm": 0.14010976738235087, + "learning_rate": 6.669866540661835e-06, + "loss": 2.7279, + "step": 41094 + }, + { + "epoch": 2.551058414550872, + "grad_norm": 0.1482613610193095, + "learning_rate": 6.668064507422289e-06, + "loss": 2.6967, + "step": 41095 + }, + { + "epoch": 2.55112049165063, + "grad_norm": 0.13841742017962114, + "learning_rate": 6.6662627002559766e-06, + "loss": 2.7436, + "step": 41096 + }, + { + "epoch": 2.551182568750388, + "grad_norm": 0.17051427442228498, + "learning_rate": 6.664461119172283e-06, + "loss": 2.785, + "step": 41097 + }, + { + "epoch": 2.551244645850146, + "grad_norm": 0.14568153993606436, + "learning_rate": 6.662659764180618e-06, + "loss": 2.6802, + "step": 41098 + }, + { + "epoch": 2.551306722949904, + "grad_norm": 0.16755570037118736, + "learning_rate": 6.660858635290368e-06, + "loss": 2.7232, + "step": 41099 + }, + { + "epoch": 2.5513688000496617, + "grad_norm": 0.13431812707404125, + "learning_rate": 6.659057732510948e-06, + "loss": 2.6684, + "step": 41100 + }, + { + "epoch": 2.5514308771494196, + "grad_norm": 0.13024717149070367, + "learning_rate": 6.657257055851751e-06, + "loss": 2.7174, + "step": 41101 + }, + { + "epoch": 2.5514929542491775, + "grad_norm": 0.13770560638692161, + "learning_rate": 6.655456605322158e-06, + "loss": 2.8028, + "step": 41102 + }, + { + "epoch": 2.5515550313489355, + "grad_norm": 0.17277828107129717, + "learning_rate": 6.653656380931561e-06, + "loss": 2.6998, + "step": 41103 + }, + { + "epoch": 2.5516171084486934, + "grad_norm": 0.15681715196001503, + "learning_rate": 6.651856382689376e-06, + "loss": 2.7991, + "step": 41104 + }, + { + "epoch": 2.5516791855484513, + "grad_norm": 0.16575756773027356, + "learning_rate": 6.650056610604976e-06, + "loss": 2.7646, + "step": 41105 + }, + { + "epoch": 2.551741262648209, + "grad_norm": 0.14480512033937573, + "learning_rate": 6.648257064687746e-06, + "loss": 2.7488, + "step": 41106 + }, + { + "epoch": 2.551803339747967, + "grad_norm": 0.1382773118616747, + "learning_rate": 6.646457744947094e-06, + "loss": 2.7145, + "step": 41107 + }, + { + "epoch": 2.5518654168477246, + "grad_norm": 0.1340612093878914, + "learning_rate": 6.644658651392382e-06, + "loss": 2.6819, + "step": 41108 + }, + { + "epoch": 2.551927493947483, + "grad_norm": 0.1378417420125346, + "learning_rate": 6.6428597840330235e-06, + "loss": 2.6553, + "step": 41109 + }, + { + "epoch": 2.5519895710472404, + "grad_norm": 0.18373066281405437, + "learning_rate": 6.641061142878391e-06, + "loss": 2.7111, + "step": 41110 + }, + { + "epoch": 2.552051648146999, + "grad_norm": 0.1511141600329971, + "learning_rate": 6.639262727937867e-06, + "loss": 2.667, + "step": 41111 + }, + { + "epoch": 2.5521137252467563, + "grad_norm": 0.1373237371456603, + "learning_rate": 6.637464539220839e-06, + "loss": 2.7209, + "step": 41112 + }, + { + "epoch": 2.5521758023465146, + "grad_norm": 0.17747192078444865, + "learning_rate": 6.635666576736671e-06, + "loss": 2.7099, + "step": 41113 + }, + { + "epoch": 2.552237879446272, + "grad_norm": 0.13645009484307, + "learning_rate": 6.633868840494772e-06, + "loss": 2.7021, + "step": 41114 + }, + { + "epoch": 2.55229995654603, + "grad_norm": 0.13403516257266074, + "learning_rate": 6.6320713305045025e-06, + "loss": 2.6233, + "step": 41115 + }, + { + "epoch": 2.552362033645788, + "grad_norm": 0.151862037001474, + "learning_rate": 6.630274046775248e-06, + "loss": 2.8422, + "step": 41116 + }, + { + "epoch": 2.552424110745546, + "grad_norm": 0.14223559923178394, + "learning_rate": 6.628476989316368e-06, + "loss": 2.7164, + "step": 41117 + }, + { + "epoch": 2.552486187845304, + "grad_norm": 0.1351219869744416, + "learning_rate": 6.626680158137266e-06, + "loss": 2.6012, + "step": 41118 + }, + { + "epoch": 2.5525482649450617, + "grad_norm": 0.16105251045245256, + "learning_rate": 6.624883553247302e-06, + "loss": 2.7356, + "step": 41119 + }, + { + "epoch": 2.5526103420448196, + "grad_norm": 0.13571642898851613, + "learning_rate": 6.623087174655851e-06, + "loss": 2.6885, + "step": 41120 + }, + { + "epoch": 2.5526724191445775, + "grad_norm": 0.13080056135647003, + "learning_rate": 6.621291022372272e-06, + "loss": 2.7194, + "step": 41121 + }, + { + "epoch": 2.5527344962443355, + "grad_norm": 0.12946953668891695, + "learning_rate": 6.619495096405959e-06, + "loss": 2.6393, + "step": 41122 + }, + { + "epoch": 2.5527965733440934, + "grad_norm": 0.1447669633081935, + "learning_rate": 6.617699396766269e-06, + "loss": 2.7072, + "step": 41123 + }, + { + "epoch": 2.5528586504438513, + "grad_norm": 0.13939251156776641, + "learning_rate": 6.6159039234625795e-06, + "loss": 2.7525, + "step": 41124 + }, + { + "epoch": 2.552920727543609, + "grad_norm": 0.13839775177874308, + "learning_rate": 6.614108676504243e-06, + "loss": 2.6154, + "step": 41125 + }, + { + "epoch": 2.552982804643367, + "grad_norm": 0.13419050703335125, + "learning_rate": 6.612313655900632e-06, + "loss": 2.6617, + "step": 41126 + }, + { + "epoch": 2.553044881743125, + "grad_norm": 0.14726966806826663, + "learning_rate": 6.610518861661114e-06, + "loss": 2.7742, + "step": 41127 + }, + { + "epoch": 2.553106958842883, + "grad_norm": 0.1347151399118549, + "learning_rate": 6.60872429379506e-06, + "loss": 2.7955, + "step": 41128 + }, + { + "epoch": 2.553169035942641, + "grad_norm": 0.17120658341549627, + "learning_rate": 6.606929952311825e-06, + "loss": 2.634, + "step": 41129 + }, + { + "epoch": 2.553231113042399, + "grad_norm": 0.15453336796210754, + "learning_rate": 6.605135837220755e-06, + "loss": 2.696, + "step": 41130 + }, + { + "epoch": 2.5532931901421567, + "grad_norm": 0.15884942037235303, + "learning_rate": 6.603341948531239e-06, + "loss": 2.697, + "step": 41131 + }, + { + "epoch": 2.5533552672419146, + "grad_norm": 0.1321743683233151, + "learning_rate": 6.601548286252624e-06, + "loss": 2.7387, + "step": 41132 + }, + { + "epoch": 2.553417344341672, + "grad_norm": 0.14252951851671058, + "learning_rate": 6.599754850394263e-06, + "loss": 2.5398, + "step": 41133 + }, + { + "epoch": 2.5534794214414305, + "grad_norm": 0.15893334945355037, + "learning_rate": 6.597961640965511e-06, + "loss": 2.8113, + "step": 41134 + }, + { + "epoch": 2.553541498541188, + "grad_norm": 0.13490554208196415, + "learning_rate": 6.596168657975738e-06, + "loss": 2.8186, + "step": 41135 + }, + { + "epoch": 2.5536035756409463, + "grad_norm": 0.1639629584289817, + "learning_rate": 6.594375901434285e-06, + "loss": 2.7539, + "step": 41136 + }, + { + "epoch": 2.553665652740704, + "grad_norm": 0.15183806242951306, + "learning_rate": 6.592583371350513e-06, + "loss": 2.8036, + "step": 41137 + }, + { + "epoch": 2.5537277298404617, + "grad_norm": 0.1406812025717248, + "learning_rate": 6.590791067733776e-06, + "loss": 2.7217, + "step": 41138 + }, + { + "epoch": 2.5537898069402196, + "grad_norm": 0.13602624988633694, + "learning_rate": 6.588998990593403e-06, + "loss": 2.6096, + "step": 41139 + }, + { + "epoch": 2.5538518840399775, + "grad_norm": 0.14377398187200227, + "learning_rate": 6.587207139938767e-06, + "loss": 2.6706, + "step": 41140 + }, + { + "epoch": 2.5539139611397355, + "grad_norm": 0.14787156991374317, + "learning_rate": 6.585415515779214e-06, + "loss": 2.6875, + "step": 41141 + }, + { + "epoch": 2.5539760382394934, + "grad_norm": 0.148657773361917, + "learning_rate": 6.583624118124093e-06, + "loss": 2.6898, + "step": 41142 + }, + { + "epoch": 2.5540381153392513, + "grad_norm": 0.15564614076724007, + "learning_rate": 6.581832946982741e-06, + "loss": 2.6713, + "step": 41143 + }, + { + "epoch": 2.554100192439009, + "grad_norm": 0.1613831857929122, + "learning_rate": 6.580042002364495e-06, + "loss": 2.6695, + "step": 41144 + }, + { + "epoch": 2.554162269538767, + "grad_norm": 0.1482241125735224, + "learning_rate": 6.578251284278725e-06, + "loss": 2.756, + "step": 41145 + }, + { + "epoch": 2.554224346638525, + "grad_norm": 0.13784354597624435, + "learning_rate": 6.576460792734757e-06, + "loss": 2.6822, + "step": 41146 + }, + { + "epoch": 2.554286423738283, + "grad_norm": 0.14088320868472434, + "learning_rate": 6.5746705277419384e-06, + "loss": 2.6667, + "step": 41147 + }, + { + "epoch": 2.554348500838041, + "grad_norm": 0.13714124928549973, + "learning_rate": 6.572880489309602e-06, + "loss": 2.7747, + "step": 41148 + }, + { + "epoch": 2.554410577937799, + "grad_norm": 0.15282192745265175, + "learning_rate": 6.571090677447078e-06, + "loss": 2.752, + "step": 41149 + }, + { + "epoch": 2.5544726550375567, + "grad_norm": 0.1757093123418952, + "learning_rate": 6.5693010921637265e-06, + "loss": 2.6566, + "step": 41150 + }, + { + "epoch": 2.5545347321373146, + "grad_norm": 0.13994570750130347, + "learning_rate": 6.567511733468879e-06, + "loss": 2.7823, + "step": 41151 + }, + { + "epoch": 2.5545968092370726, + "grad_norm": 0.1381765572741239, + "learning_rate": 6.565722601371865e-06, + "loss": 2.7327, + "step": 41152 + }, + { + "epoch": 2.5546588863368305, + "grad_norm": 0.13534264803508048, + "learning_rate": 6.563933695882008e-06, + "loss": 2.7145, + "step": 41153 + }, + { + "epoch": 2.5547209634365884, + "grad_norm": 0.1612408535880582, + "learning_rate": 6.562145017008664e-06, + "loss": 2.6925, + "step": 41154 + }, + { + "epoch": 2.5547830405363463, + "grad_norm": 0.1402611396134428, + "learning_rate": 6.560356564761155e-06, + "loss": 2.7171, + "step": 41155 + }, + { + "epoch": 2.554845117636104, + "grad_norm": 0.13379827871391503, + "learning_rate": 6.5585683391488125e-06, + "loss": 2.6909, + "step": 41156 + }, + { + "epoch": 2.554907194735862, + "grad_norm": 0.15232518889989743, + "learning_rate": 6.5567803401809515e-06, + "loss": 2.7208, + "step": 41157 + }, + { + "epoch": 2.5549692718356196, + "grad_norm": 0.13905268752379063, + "learning_rate": 6.55499256786693e-06, + "loss": 2.7201, + "step": 41158 + }, + { + "epoch": 2.555031348935378, + "grad_norm": 0.1373523472079564, + "learning_rate": 6.553205022216053e-06, + "loss": 2.7459, + "step": 41159 + }, + { + "epoch": 2.5550934260351355, + "grad_norm": 0.16579673586001825, + "learning_rate": 6.5514177032376554e-06, + "loss": 2.7055, + "step": 41160 + }, + { + "epoch": 2.555155503134894, + "grad_norm": 0.1467412666962524, + "learning_rate": 6.54963061094106e-06, + "loss": 2.7321, + "step": 41161 + }, + { + "epoch": 2.5552175802346513, + "grad_norm": 0.1459095165534929, + "learning_rate": 6.547843745335575e-06, + "loss": 2.7393, + "step": 41162 + }, + { + "epoch": 2.555279657334409, + "grad_norm": 0.13614314709133984, + "learning_rate": 6.546057106430548e-06, + "loss": 2.6301, + "step": 41163 + }, + { + "epoch": 2.555341734434167, + "grad_norm": 0.13357021364269847, + "learning_rate": 6.544270694235294e-06, + "loss": 2.62, + "step": 41164 + }, + { + "epoch": 2.555403811533925, + "grad_norm": 0.13273147607907942, + "learning_rate": 6.542484508759128e-06, + "loss": 2.828, + "step": 41165 + }, + { + "epoch": 2.555465888633683, + "grad_norm": 0.132281433733464, + "learning_rate": 6.5406985500113595e-06, + "loss": 2.6626, + "step": 41166 + }, + { + "epoch": 2.555527965733441, + "grad_norm": 0.15052648699239313, + "learning_rate": 6.538912818001325e-06, + "loss": 2.7145, + "step": 41167 + }, + { + "epoch": 2.555590042833199, + "grad_norm": 0.1357276620571659, + "learning_rate": 6.537127312738334e-06, + "loss": 2.7324, + "step": 41168 + }, + { + "epoch": 2.5556521199329567, + "grad_norm": 0.1307763647718906, + "learning_rate": 6.5353420342317005e-06, + "loss": 2.6433, + "step": 41169 + }, + { + "epoch": 2.5557141970327146, + "grad_norm": 0.13343367746448184, + "learning_rate": 6.53355698249073e-06, + "loss": 2.6054, + "step": 41170 + }, + { + "epoch": 2.5557762741324725, + "grad_norm": 0.13584702374743549, + "learning_rate": 6.531772157524741e-06, + "loss": 2.743, + "step": 41171 + }, + { + "epoch": 2.5558383512322305, + "grad_norm": 0.134919272476657, + "learning_rate": 6.529987559343065e-06, + "loss": 2.6776, + "step": 41172 + }, + { + "epoch": 2.5559004283319884, + "grad_norm": 0.14787739562055477, + "learning_rate": 6.528203187954995e-06, + "loss": 2.7558, + "step": 41173 + }, + { + "epoch": 2.5559625054317463, + "grad_norm": 0.13654791512940548, + "learning_rate": 6.526419043369841e-06, + "loss": 2.7069, + "step": 41174 + }, + { + "epoch": 2.556024582531504, + "grad_norm": 0.13936993364140862, + "learning_rate": 6.524635125596912e-06, + "loss": 2.7905, + "step": 41175 + }, + { + "epoch": 2.556086659631262, + "grad_norm": 0.14251111689922688, + "learning_rate": 6.522851434645505e-06, + "loss": 2.6862, + "step": 41176 + }, + { + "epoch": 2.55614873673102, + "grad_norm": 0.13355567512247454, + "learning_rate": 6.521067970524947e-06, + "loss": 2.8452, + "step": 41177 + }, + { + "epoch": 2.556210813830778, + "grad_norm": 0.14500926845212775, + "learning_rate": 6.519284733244529e-06, + "loss": 2.6627, + "step": 41178 + }, + { + "epoch": 2.556272890930536, + "grad_norm": 0.1730791361080926, + "learning_rate": 6.517501722813562e-06, + "loss": 2.8023, + "step": 41179 + }, + { + "epoch": 2.556334968030294, + "grad_norm": 0.1377038614852223, + "learning_rate": 6.515718939241333e-06, + "loss": 2.7014, + "step": 41180 + }, + { + "epoch": 2.5563970451300513, + "grad_norm": 0.14316536537191665, + "learning_rate": 6.513936382537167e-06, + "loss": 2.6546, + "step": 41181 + }, + { + "epoch": 2.5564591222298096, + "grad_norm": 0.13464491731951936, + "learning_rate": 6.5121540527103505e-06, + "loss": 2.5754, + "step": 41182 + }, + { + "epoch": 2.556521199329567, + "grad_norm": 0.13033206226834992, + "learning_rate": 6.5103719497701835e-06, + "loss": 2.674, + "step": 41183 + }, + { + "epoch": 2.5565832764293255, + "grad_norm": 0.17276486415407424, + "learning_rate": 6.508590073725951e-06, + "loss": 2.7262, + "step": 41184 + }, + { + "epoch": 2.556645353529083, + "grad_norm": 0.15094481574863733, + "learning_rate": 6.5068084245869745e-06, + "loss": 2.7136, + "step": 41185 + }, + { + "epoch": 2.556707430628841, + "grad_norm": 0.13544229958703663, + "learning_rate": 6.505027002362535e-06, + "loss": 2.7115, + "step": 41186 + }, + { + "epoch": 2.556769507728599, + "grad_norm": 0.14330727802558219, + "learning_rate": 6.5032458070619305e-06, + "loss": 2.6654, + "step": 41187 + }, + { + "epoch": 2.5568315848283567, + "grad_norm": 0.13918377859830217, + "learning_rate": 6.5014648386944485e-06, + "loss": 2.7321, + "step": 41188 + }, + { + "epoch": 2.5568936619281146, + "grad_norm": 0.12856702530341874, + "learning_rate": 6.499684097269376e-06, + "loss": 2.6195, + "step": 41189 + }, + { + "epoch": 2.5569557390278725, + "grad_norm": 0.1561510845846876, + "learning_rate": 6.497903582796022e-06, + "loss": 2.7217, + "step": 41190 + }, + { + "epoch": 2.5570178161276305, + "grad_norm": 0.16616302005721162, + "learning_rate": 6.496123295283663e-06, + "loss": 2.6676, + "step": 41191 + }, + { + "epoch": 2.5570798932273884, + "grad_norm": 0.14944718187064257, + "learning_rate": 6.494343234741596e-06, + "loss": 2.6684, + "step": 41192 + }, + { + "epoch": 2.5571419703271463, + "grad_norm": 0.14191123381026044, + "learning_rate": 6.492563401179086e-06, + "loss": 2.7658, + "step": 41193 + }, + { + "epoch": 2.557204047426904, + "grad_norm": 0.14817989105912144, + "learning_rate": 6.490783794605443e-06, + "loss": 2.6942, + "step": 41194 + }, + { + "epoch": 2.557266124526662, + "grad_norm": 0.13263708338984379, + "learning_rate": 6.489004415029948e-06, + "loss": 2.6136, + "step": 41195 + }, + { + "epoch": 2.55732820162642, + "grad_norm": 0.14472241904109992, + "learning_rate": 6.487225262461877e-06, + "loss": 2.7268, + "step": 41196 + }, + { + "epoch": 2.557390278726178, + "grad_norm": 0.13449715396021944, + "learning_rate": 6.4854463369105165e-06, + "loss": 2.684, + "step": 41197 + }, + { + "epoch": 2.557452355825936, + "grad_norm": 0.14240993089755122, + "learning_rate": 6.483667638385138e-06, + "loss": 2.7202, + "step": 41198 + }, + { + "epoch": 2.557514432925694, + "grad_norm": 0.13811857358516524, + "learning_rate": 6.481889166895033e-06, + "loss": 2.6999, + "step": 41199 + }, + { + "epoch": 2.5575765100254517, + "grad_norm": 0.13370080807537235, + "learning_rate": 6.4801109224494836e-06, + "loss": 2.6983, + "step": 41200 + }, + { + "epoch": 2.5576385871252096, + "grad_norm": 0.14991096695442363, + "learning_rate": 6.478332905057755e-06, + "loss": 2.6762, + "step": 41201 + }, + { + "epoch": 2.5577006642249676, + "grad_norm": 0.139431311132746, + "learning_rate": 6.476555114729121e-06, + "loss": 2.679, + "step": 41202 + }, + { + "epoch": 2.5577627413247255, + "grad_norm": 0.14552191211566357, + "learning_rate": 6.474777551472877e-06, + "loss": 2.7482, + "step": 41203 + }, + { + "epoch": 2.557824818424483, + "grad_norm": 0.13965244554223208, + "learning_rate": 6.473000215298269e-06, + "loss": 2.6965, + "step": 41204 + }, + { + "epoch": 2.5578868955242413, + "grad_norm": 0.13657323723052925, + "learning_rate": 6.471223106214602e-06, + "loss": 2.7345, + "step": 41205 + }, + { + "epoch": 2.557948972623999, + "grad_norm": 0.13581056039252873, + "learning_rate": 6.469446224231124e-06, + "loss": 2.6814, + "step": 41206 + }, + { + "epoch": 2.558011049723757, + "grad_norm": 0.15692396840628076, + "learning_rate": 6.46766956935711e-06, + "loss": 2.708, + "step": 41207 + }, + { + "epoch": 2.5580731268235146, + "grad_norm": 0.152161024011291, + "learning_rate": 6.465893141601836e-06, + "loss": 2.6457, + "step": 41208 + }, + { + "epoch": 2.558135203923273, + "grad_norm": 0.13684674810795794, + "learning_rate": 6.464116940974569e-06, + "loss": 2.697, + "step": 41209 + }, + { + "epoch": 2.5581972810230305, + "grad_norm": 0.14206303346838353, + "learning_rate": 6.462340967484571e-06, + "loss": 2.7138, + "step": 41210 + }, + { + "epoch": 2.5582593581227884, + "grad_norm": 0.1348757915508953, + "learning_rate": 6.460565221141107e-06, + "loss": 2.7552, + "step": 41211 + }, + { + "epoch": 2.5583214352225463, + "grad_norm": 0.13415237159818688, + "learning_rate": 6.458789701953439e-06, + "loss": 2.8051, + "step": 41212 + }, + { + "epoch": 2.558383512322304, + "grad_norm": 0.1335075019142154, + "learning_rate": 6.4570144099308395e-06, + "loss": 2.7381, + "step": 41213 + }, + { + "epoch": 2.558445589422062, + "grad_norm": 0.13563027330993235, + "learning_rate": 6.45523934508257e-06, + "loss": 2.7406, + "step": 41214 + }, + { + "epoch": 2.55850766652182, + "grad_norm": 0.13808754592239336, + "learning_rate": 6.453464507417889e-06, + "loss": 2.7754, + "step": 41215 + }, + { + "epoch": 2.558569743621578, + "grad_norm": 0.1452527466840933, + "learning_rate": 6.4516898969460394e-06, + "loss": 2.7483, + "step": 41216 + }, + { + "epoch": 2.558631820721336, + "grad_norm": 0.13553216178258384, + "learning_rate": 6.449915513676308e-06, + "loss": 2.7046, + "step": 41217 + }, + { + "epoch": 2.558693897821094, + "grad_norm": 0.14410439813956621, + "learning_rate": 6.448141357617943e-06, + "loss": 2.7349, + "step": 41218 + }, + { + "epoch": 2.5587559749208517, + "grad_norm": 0.13803101598017214, + "learning_rate": 6.4463674287801925e-06, + "loss": 2.6823, + "step": 41219 + }, + { + "epoch": 2.5588180520206096, + "grad_norm": 0.1543485941098737, + "learning_rate": 6.44459372717231e-06, + "loss": 2.6792, + "step": 41220 + }, + { + "epoch": 2.5588801291203676, + "grad_norm": 0.16229137292593773, + "learning_rate": 6.44282025280356e-06, + "loss": 2.7216, + "step": 41221 + }, + { + "epoch": 2.5589422062201255, + "grad_norm": 0.1432443300081241, + "learning_rate": 6.441047005683198e-06, + "loss": 2.61, + "step": 41222 + }, + { + "epoch": 2.5590042833198834, + "grad_norm": 0.13769041553203554, + "learning_rate": 6.439273985820465e-06, + "loss": 2.6322, + "step": 41223 + }, + { + "epoch": 2.5590663604196413, + "grad_norm": 0.14214168009429828, + "learning_rate": 6.4375011932246155e-06, + "loss": 2.7104, + "step": 41224 + }, + { + "epoch": 2.5591284375193992, + "grad_norm": 0.15116633964478318, + "learning_rate": 6.43572862790488e-06, + "loss": 2.6574, + "step": 41225 + }, + { + "epoch": 2.559190514619157, + "grad_norm": 0.17359050303906767, + "learning_rate": 6.4339562898705405e-06, + "loss": 2.6257, + "step": 41226 + }, + { + "epoch": 2.5592525917189146, + "grad_norm": 0.1371864765016093, + "learning_rate": 6.432184179130829e-06, + "loss": 2.6889, + "step": 41227 + }, + { + "epoch": 2.559314668818673, + "grad_norm": 0.13395057452065465, + "learning_rate": 6.430412295694988e-06, + "loss": 2.628, + "step": 41228 + }, + { + "epoch": 2.5593767459184305, + "grad_norm": 0.1420009898903421, + "learning_rate": 6.428640639572247e-06, + "loss": 2.6891, + "step": 41229 + }, + { + "epoch": 2.559438823018189, + "grad_norm": 0.16223379324311168, + "learning_rate": 6.42686921077188e-06, + "loss": 2.6406, + "step": 41230 + }, + { + "epoch": 2.5595009001179463, + "grad_norm": 0.1354452769304792, + "learning_rate": 6.42509800930311e-06, + "loss": 2.7027, + "step": 41231 + }, + { + "epoch": 2.5595629772177046, + "grad_norm": 0.13672582063102565, + "learning_rate": 6.423327035175186e-06, + "loss": 2.7109, + "step": 41232 + }, + { + "epoch": 2.559625054317462, + "grad_norm": 0.1423489217624722, + "learning_rate": 6.421556288397346e-06, + "loss": 2.6346, + "step": 41233 + }, + { + "epoch": 2.55968713141722, + "grad_norm": 0.1393302881364472, + "learning_rate": 6.419785768978808e-06, + "loss": 2.6939, + "step": 41234 + }, + { + "epoch": 2.559749208516978, + "grad_norm": 0.1428851518995852, + "learning_rate": 6.418015476928846e-06, + "loss": 2.7532, + "step": 41235 + }, + { + "epoch": 2.559811285616736, + "grad_norm": 0.14498944725079385, + "learning_rate": 6.416245412256672e-06, + "loss": 2.7093, + "step": 41236 + }, + { + "epoch": 2.559873362716494, + "grad_norm": 0.13927584741345217, + "learning_rate": 6.414475574971513e-06, + "loss": 2.6998, + "step": 41237 + }, + { + "epoch": 2.5599354398162517, + "grad_norm": 0.14482463880337834, + "learning_rate": 6.412705965082632e-06, + "loss": 2.7565, + "step": 41238 + }, + { + "epoch": 2.5599975169160096, + "grad_norm": 0.13820529403407658, + "learning_rate": 6.410936582599231e-06, + "loss": 2.7117, + "step": 41239 + }, + { + "epoch": 2.5600595940157675, + "grad_norm": 0.1398044528303711, + "learning_rate": 6.4091674275305654e-06, + "loss": 2.6859, + "step": 41240 + }, + { + "epoch": 2.5601216711155255, + "grad_norm": 0.14966209867052907, + "learning_rate": 6.407398499885858e-06, + "loss": 2.6616, + "step": 41241 + }, + { + "epoch": 2.5601837482152834, + "grad_norm": 0.13835674396471, + "learning_rate": 6.405629799674334e-06, + "loss": 2.6962, + "step": 41242 + }, + { + "epoch": 2.5602458253150413, + "grad_norm": 0.13631725623190488, + "learning_rate": 6.403861326905214e-06, + "loss": 2.6636, + "step": 41243 + }, + { + "epoch": 2.560307902414799, + "grad_norm": 0.1569365082750847, + "learning_rate": 6.40209308158774e-06, + "loss": 2.6893, + "step": 41244 + }, + { + "epoch": 2.560369979514557, + "grad_norm": 0.14572285756321054, + "learning_rate": 6.400325063731133e-06, + "loss": 2.6742, + "step": 41245 + }, + { + "epoch": 2.560432056614315, + "grad_norm": 0.1549759480154158, + "learning_rate": 6.398557273344613e-06, + "loss": 2.6625, + "step": 41246 + }, + { + "epoch": 2.560494133714073, + "grad_norm": 0.1364651917294403, + "learning_rate": 6.396789710437401e-06, + "loss": 2.7571, + "step": 41247 + }, + { + "epoch": 2.560556210813831, + "grad_norm": 0.14962220497994477, + "learning_rate": 6.395022375018716e-06, + "loss": 2.6659, + "step": 41248 + }, + { + "epoch": 2.560618287913589, + "grad_norm": 0.16879337843797812, + "learning_rate": 6.3932552670977915e-06, + "loss": 2.6754, + "step": 41249 + }, + { + "epoch": 2.5606803650133467, + "grad_norm": 0.14519725531426922, + "learning_rate": 6.391488386683842e-06, + "loss": 2.6266, + "step": 41250 + }, + { + "epoch": 2.5607424421131046, + "grad_norm": 0.15180174070286584, + "learning_rate": 6.389721733786086e-06, + "loss": 2.6217, + "step": 41251 + }, + { + "epoch": 2.560804519212862, + "grad_norm": 0.14629088019959383, + "learning_rate": 6.387955308413729e-06, + "loss": 2.6985, + "step": 41252 + }, + { + "epoch": 2.5608665963126205, + "grad_norm": 0.1334181119001936, + "learning_rate": 6.3861891105760015e-06, + "loss": 2.6763, + "step": 41253 + }, + { + "epoch": 2.560928673412378, + "grad_norm": 0.1480558533575563, + "learning_rate": 6.384423140282114e-06, + "loss": 2.7486, + "step": 41254 + }, + { + "epoch": 2.5609907505121363, + "grad_norm": 0.1508974847805472, + "learning_rate": 6.3826573975412806e-06, + "loss": 2.7035, + "step": 41255 + }, + { + "epoch": 2.561052827611894, + "grad_norm": 0.1422309502561545, + "learning_rate": 6.380891882362705e-06, + "loss": 2.6569, + "step": 41256 + }, + { + "epoch": 2.5611149047116517, + "grad_norm": 0.13951881101210742, + "learning_rate": 6.379126594755614e-06, + "loss": 2.6822, + "step": 41257 + }, + { + "epoch": 2.5611769818114096, + "grad_norm": 0.1358920511914571, + "learning_rate": 6.377361534729204e-06, + "loss": 2.6976, + "step": 41258 + }, + { + "epoch": 2.5612390589111675, + "grad_norm": 0.1389805622017422, + "learning_rate": 6.375596702292691e-06, + "loss": 2.7718, + "step": 41259 + }, + { + "epoch": 2.5613011360109255, + "grad_norm": 0.13689720904441255, + "learning_rate": 6.373832097455284e-06, + "loss": 2.7873, + "step": 41260 + }, + { + "epoch": 2.5613632131106834, + "grad_norm": 0.1330185301911654, + "learning_rate": 6.3720677202261715e-06, + "loss": 2.5825, + "step": 41261 + }, + { + "epoch": 2.5614252902104413, + "grad_norm": 0.14743772538755298, + "learning_rate": 6.370303570614583e-06, + "loss": 2.6687, + "step": 41262 + }, + { + "epoch": 2.561487367310199, + "grad_norm": 0.1376752500595515, + "learning_rate": 6.368539648629718e-06, + "loss": 2.8014, + "step": 41263 + }, + { + "epoch": 2.561549444409957, + "grad_norm": 0.13982279103670936, + "learning_rate": 6.366775954280768e-06, + "loss": 2.5836, + "step": 41264 + }, + { + "epoch": 2.561611521509715, + "grad_norm": 0.14465955246002657, + "learning_rate": 6.365012487576927e-06, + "loss": 2.6983, + "step": 41265 + }, + { + "epoch": 2.561673598609473, + "grad_norm": 0.1544112433521902, + "learning_rate": 6.363249248527426e-06, + "loss": 2.6689, + "step": 41266 + }, + { + "epoch": 2.561735675709231, + "grad_norm": 0.13528808272129728, + "learning_rate": 6.36148623714144e-06, + "loss": 2.6814, + "step": 41267 + }, + { + "epoch": 2.561797752808989, + "grad_norm": 0.1344157251478584, + "learning_rate": 6.359723453428179e-06, + "loss": 2.6524, + "step": 41268 + }, + { + "epoch": 2.5618598299087467, + "grad_norm": 0.13588902195492333, + "learning_rate": 6.3579608973968255e-06, + "loss": 2.7289, + "step": 41269 + }, + { + "epoch": 2.5619219070085046, + "grad_norm": 0.13657027765817734, + "learning_rate": 6.356198569056581e-06, + "loss": 2.7523, + "step": 41270 + }, + { + "epoch": 2.5619839841082626, + "grad_norm": 0.13381876232980439, + "learning_rate": 6.354436468416652e-06, + "loss": 2.6829, + "step": 41271 + }, + { + "epoch": 2.5620460612080205, + "grad_norm": 0.13545124517243734, + "learning_rate": 6.3526745954862245e-06, + "loss": 2.6633, + "step": 41272 + }, + { + "epoch": 2.5621081383077784, + "grad_norm": 0.13844602622089344, + "learning_rate": 6.350912950274496e-06, + "loss": 2.7794, + "step": 41273 + }, + { + "epoch": 2.5621702154075363, + "grad_norm": 0.14019586730405093, + "learning_rate": 6.349151532790643e-06, + "loss": 2.7594, + "step": 41274 + }, + { + "epoch": 2.562232292507294, + "grad_norm": 0.13141231292681177, + "learning_rate": 6.3473903430438575e-06, + "loss": 2.7204, + "step": 41275 + }, + { + "epoch": 2.562294369607052, + "grad_norm": 0.13987934922120457, + "learning_rate": 6.345629381043339e-06, + "loss": 2.7715, + "step": 41276 + }, + { + "epoch": 2.5623564467068096, + "grad_norm": 0.1355237850453697, + "learning_rate": 6.343868646798273e-06, + "loss": 2.6854, + "step": 41277 + }, + { + "epoch": 2.562418523806568, + "grad_norm": 0.13866970570976353, + "learning_rate": 6.3421081403178424e-06, + "loss": 2.5722, + "step": 41278 + }, + { + "epoch": 2.5624806009063255, + "grad_norm": 0.13526492297755008, + "learning_rate": 6.340347861611218e-06, + "loss": 2.6899, + "step": 41279 + }, + { + "epoch": 2.562542678006084, + "grad_norm": 0.1503095594126423, + "learning_rate": 6.338587810687613e-06, + "loss": 2.7861, + "step": 41280 + }, + { + "epoch": 2.5626047551058413, + "grad_norm": 0.13501455288172085, + "learning_rate": 6.336827987556193e-06, + "loss": 2.7286, + "step": 41281 + }, + { + "epoch": 2.562666832205599, + "grad_norm": 0.1364522243423293, + "learning_rate": 6.33506839222614e-06, + "loss": 2.692, + "step": 41282 + }, + { + "epoch": 2.562728909305357, + "grad_norm": 0.14209098422192515, + "learning_rate": 6.333309024706635e-06, + "loss": 2.6885, + "step": 41283 + }, + { + "epoch": 2.562790986405115, + "grad_norm": 0.1396017530075892, + "learning_rate": 6.331549885006849e-06, + "loss": 2.7003, + "step": 41284 + }, + { + "epoch": 2.562853063504873, + "grad_norm": 0.1420565366215021, + "learning_rate": 6.329790973135974e-06, + "loss": 2.7263, + "step": 41285 + }, + { + "epoch": 2.562915140604631, + "grad_norm": 0.13829000035849298, + "learning_rate": 6.328032289103186e-06, + "loss": 2.6775, + "step": 41286 + }, + { + "epoch": 2.562977217704389, + "grad_norm": 0.13809572668057152, + "learning_rate": 6.326273832917651e-06, + "loss": 2.6688, + "step": 41287 + }, + { + "epoch": 2.5630392948041467, + "grad_norm": 0.1393596089850107, + "learning_rate": 6.324515604588538e-06, + "loss": 2.7475, + "step": 41288 + }, + { + "epoch": 2.5631013719039046, + "grad_norm": 0.1397642975472815, + "learning_rate": 6.3227576041250456e-06, + "loss": 2.7206, + "step": 41289 + }, + { + "epoch": 2.5631634490036626, + "grad_norm": 0.13608807467639492, + "learning_rate": 6.320999831536323e-06, + "loss": 2.7778, + "step": 41290 + }, + { + "epoch": 2.5632255261034205, + "grad_norm": 0.14042186635697473, + "learning_rate": 6.319242286831551e-06, + "loss": 2.6536, + "step": 41291 + }, + { + "epoch": 2.5632876032031784, + "grad_norm": 0.16587992339168783, + "learning_rate": 6.317484970019882e-06, + "loss": 2.7138, + "step": 41292 + }, + { + "epoch": 2.5633496803029363, + "grad_norm": 0.14152280656261718, + "learning_rate": 6.315727881110512e-06, + "loss": 2.5725, + "step": 41293 + }, + { + "epoch": 2.5634117574026942, + "grad_norm": 0.13474923877332962, + "learning_rate": 6.3139710201125964e-06, + "loss": 2.7197, + "step": 41294 + }, + { + "epoch": 2.563473834502452, + "grad_norm": 0.15043795447762293, + "learning_rate": 6.3122143870352926e-06, + "loss": 2.6318, + "step": 41295 + }, + { + "epoch": 2.56353591160221, + "grad_norm": 0.14263444598091513, + "learning_rate": 6.310457981887779e-06, + "loss": 2.6617, + "step": 41296 + }, + { + "epoch": 2.563597988701968, + "grad_norm": 0.14804342311166457, + "learning_rate": 6.3087018046791955e-06, + "loss": 2.767, + "step": 41297 + }, + { + "epoch": 2.563660065801726, + "grad_norm": 0.15166346564185534, + "learning_rate": 6.306945855418733e-06, + "loss": 2.5545, + "step": 41298 + }, + { + "epoch": 2.563722142901484, + "grad_norm": 0.13831390598718468, + "learning_rate": 6.3051901341155465e-06, + "loss": 2.7141, + "step": 41299 + }, + { + "epoch": 2.5637842200012413, + "grad_norm": 0.14039324498478245, + "learning_rate": 6.303434640778782e-06, + "loss": 2.6846, + "step": 41300 + }, + { + "epoch": 2.5638462971009996, + "grad_norm": 0.13906446144778078, + "learning_rate": 6.301679375417602e-06, + "loss": 2.7162, + "step": 41301 + }, + { + "epoch": 2.563908374200757, + "grad_norm": 0.14029768760463665, + "learning_rate": 6.299924338041169e-06, + "loss": 2.7187, + "step": 41302 + }, + { + "epoch": 2.5639704513005155, + "grad_norm": 0.1376980246959901, + "learning_rate": 6.298169528658648e-06, + "loss": 2.7296, + "step": 41303 + }, + { + "epoch": 2.564032528400273, + "grad_norm": 0.13670045866542224, + "learning_rate": 6.296414947279183e-06, + "loss": 2.6602, + "step": 41304 + }, + { + "epoch": 2.564094605500031, + "grad_norm": 0.13425398189883625, + "learning_rate": 6.294660593911933e-06, + "loss": 2.6982, + "step": 41305 + }, + { + "epoch": 2.564156682599789, + "grad_norm": 0.13387858743829711, + "learning_rate": 6.29290646856604e-06, + "loss": 2.72, + "step": 41306 + }, + { + "epoch": 2.5642187596995467, + "grad_norm": 0.16359816418643125, + "learning_rate": 6.291152571250674e-06, + "loss": 2.7096, + "step": 41307 + }, + { + "epoch": 2.5642808367993046, + "grad_norm": 0.14983046234094724, + "learning_rate": 6.289398901974974e-06, + "loss": 2.7701, + "step": 41308 + }, + { + "epoch": 2.5643429138990625, + "grad_norm": 0.1484532410657626, + "learning_rate": 6.287645460748093e-06, + "loss": 2.705, + "step": 41309 + }, + { + "epoch": 2.5644049909988205, + "grad_norm": 0.13241901028890618, + "learning_rate": 6.285892247579173e-06, + "loss": 2.6436, + "step": 41310 + }, + { + "epoch": 2.5644670680985784, + "grad_norm": 0.1605763740076608, + "learning_rate": 6.2841392624773564e-06, + "loss": 2.693, + "step": 41311 + }, + { + "epoch": 2.5645291451983363, + "grad_norm": 0.1422557452194773, + "learning_rate": 6.28238650545181e-06, + "loss": 2.7227, + "step": 41312 + }, + { + "epoch": 2.564591222298094, + "grad_norm": 0.13422820839511962, + "learning_rate": 6.280633976511668e-06, + "loss": 2.7037, + "step": 41313 + }, + { + "epoch": 2.564653299397852, + "grad_norm": 0.1329305769055184, + "learning_rate": 6.278881675666065e-06, + "loss": 2.6913, + "step": 41314 + }, + { + "epoch": 2.56471537649761, + "grad_norm": 0.15426082431367472, + "learning_rate": 6.277129602924148e-06, + "loss": 2.8171, + "step": 41315 + }, + { + "epoch": 2.564777453597368, + "grad_norm": 0.15983668288402922, + "learning_rate": 6.275377758295065e-06, + "loss": 2.7112, + "step": 41316 + }, + { + "epoch": 2.564839530697126, + "grad_norm": 0.16205677075194796, + "learning_rate": 6.273626141787953e-06, + "loss": 2.7341, + "step": 41317 + }, + { + "epoch": 2.564901607796884, + "grad_norm": 0.13963575905821088, + "learning_rate": 6.271874753411944e-06, + "loss": 2.7697, + "step": 41318 + }, + { + "epoch": 2.5649636848966417, + "grad_norm": 0.13604850648765068, + "learning_rate": 6.27012359317618e-06, + "loss": 2.784, + "step": 41319 + }, + { + "epoch": 2.5650257619963996, + "grad_norm": 0.13440852606406545, + "learning_rate": 6.268372661089794e-06, + "loss": 2.6326, + "step": 41320 + }, + { + "epoch": 2.5650878390961576, + "grad_norm": 0.1459061463552349, + "learning_rate": 6.2666219571619275e-06, + "loss": 2.7637, + "step": 41321 + }, + { + "epoch": 2.5651499161959155, + "grad_norm": 0.133438242088327, + "learning_rate": 6.264871481401713e-06, + "loss": 2.6391, + "step": 41322 + }, + { + "epoch": 2.565211993295673, + "grad_norm": 0.13719046821945888, + "learning_rate": 6.263121233818281e-06, + "loss": 2.7106, + "step": 41323 + }, + { + "epoch": 2.5652740703954313, + "grad_norm": 0.14777614050542676, + "learning_rate": 6.2613712144207525e-06, + "loss": 2.7075, + "step": 41324 + }, + { + "epoch": 2.565336147495189, + "grad_norm": 0.15545829828871646, + "learning_rate": 6.259621423218282e-06, + "loss": 2.7724, + "step": 41325 + }, + { + "epoch": 2.565398224594947, + "grad_norm": 0.12931019895883805, + "learning_rate": 6.257871860219983e-06, + "loss": 2.6818, + "step": 41326 + }, + { + "epoch": 2.5654603016947046, + "grad_norm": 0.13565298621217523, + "learning_rate": 6.256122525434982e-06, + "loss": 2.6884, + "step": 41327 + }, + { + "epoch": 2.565522378794463, + "grad_norm": 0.1334027469771111, + "learning_rate": 6.2543734188724004e-06, + "loss": 2.7008, + "step": 41328 + }, + { + "epoch": 2.5655844558942205, + "grad_norm": 0.1479153547150452, + "learning_rate": 6.252624540541385e-06, + "loss": 2.6973, + "step": 41329 + }, + { + "epoch": 2.5656465329939784, + "grad_norm": 0.1418837596377955, + "learning_rate": 6.250875890451047e-06, + "loss": 2.6636, + "step": 41330 + }, + { + "epoch": 2.5657086100937363, + "grad_norm": 0.13398741198731778, + "learning_rate": 6.249127468610505e-06, + "loss": 2.7575, + "step": 41331 + }, + { + "epoch": 2.565770687193494, + "grad_norm": 0.16933079660041175, + "learning_rate": 6.2473792750288914e-06, + "loss": 2.7059, + "step": 41332 + }, + { + "epoch": 2.565832764293252, + "grad_norm": 0.13639007980485937, + "learning_rate": 6.245631309715311e-06, + "loss": 2.7189, + "step": 41333 + }, + { + "epoch": 2.56589484139301, + "grad_norm": 0.14280360173273277, + "learning_rate": 6.243883572678899e-06, + "loss": 2.7015, + "step": 41334 + }, + { + "epoch": 2.565956918492768, + "grad_norm": 0.14879379051630323, + "learning_rate": 6.242136063928761e-06, + "loss": 2.6909, + "step": 41335 + }, + { + "epoch": 2.566018995592526, + "grad_norm": 0.14376532094620845, + "learning_rate": 6.240388783474033e-06, + "loss": 2.6973, + "step": 41336 + }, + { + "epoch": 2.566081072692284, + "grad_norm": 0.13487701373722646, + "learning_rate": 6.238641731323819e-06, + "loss": 2.6903, + "step": 41337 + }, + { + "epoch": 2.5661431497920417, + "grad_norm": 0.13285247117101648, + "learning_rate": 6.236894907487223e-06, + "loss": 2.6739, + "step": 41338 + }, + { + "epoch": 2.5662052268917996, + "grad_norm": 0.1347822234394769, + "learning_rate": 6.2351483119733826e-06, + "loss": 2.7785, + "step": 41339 + }, + { + "epoch": 2.5662673039915576, + "grad_norm": 0.13271874737138883, + "learning_rate": 6.2334019447914e-06, + "loss": 2.6135, + "step": 41340 + }, + { + "epoch": 2.5663293810913155, + "grad_norm": 0.15185493518208174, + "learning_rate": 6.231655805950382e-06, + "loss": 2.7396, + "step": 41341 + }, + { + "epoch": 2.5663914581910734, + "grad_norm": 0.13644894135042815, + "learning_rate": 6.229909895459429e-06, + "loss": 2.6335, + "step": 41342 + }, + { + "epoch": 2.5664535352908313, + "grad_norm": 0.13600303095313407, + "learning_rate": 6.228164213327669e-06, + "loss": 2.7197, + "step": 41343 + }, + { + "epoch": 2.5665156123905892, + "grad_norm": 0.15063917997498272, + "learning_rate": 6.226418759564206e-06, + "loss": 2.6878, + "step": 41344 + }, + { + "epoch": 2.566577689490347, + "grad_norm": 0.1530494544944681, + "learning_rate": 6.2246735341781435e-06, + "loss": 2.7295, + "step": 41345 + }, + { + "epoch": 2.566639766590105, + "grad_norm": 0.1337390513818682, + "learning_rate": 6.222928537178585e-06, + "loss": 2.6736, + "step": 41346 + }, + { + "epoch": 2.566701843689863, + "grad_norm": 0.1308219272494505, + "learning_rate": 6.221183768574629e-06, + "loss": 2.6827, + "step": 41347 + }, + { + "epoch": 2.5667639207896205, + "grad_norm": 0.1429893885239913, + "learning_rate": 6.21943922837539e-06, + "loss": 2.7266, + "step": 41348 + }, + { + "epoch": 2.566825997889379, + "grad_norm": 0.1553433338927868, + "learning_rate": 6.2176949165899665e-06, + "loss": 2.6982, + "step": 41349 + }, + { + "epoch": 2.5668880749891363, + "grad_norm": 0.13383301479360876, + "learning_rate": 6.215950833227457e-06, + "loss": 2.7189, + "step": 41350 + }, + { + "epoch": 2.5669501520888947, + "grad_norm": 0.13535691893356314, + "learning_rate": 6.214206978296949e-06, + "loss": 2.7744, + "step": 41351 + }, + { + "epoch": 2.567012229188652, + "grad_norm": 0.13508097752062795, + "learning_rate": 6.212463351807568e-06, + "loss": 2.6479, + "step": 41352 + }, + { + "epoch": 2.56707430628841, + "grad_norm": 0.15394281253173817, + "learning_rate": 6.2107199537683905e-06, + "loss": 2.6493, + "step": 41353 + }, + { + "epoch": 2.567136383388168, + "grad_norm": 0.1368649586495471, + "learning_rate": 6.20897678418852e-06, + "loss": 2.6142, + "step": 41354 + }, + { + "epoch": 2.567198460487926, + "grad_norm": 0.13849547409028737, + "learning_rate": 6.207233843077037e-06, + "loss": 2.6756, + "step": 41355 + }, + { + "epoch": 2.567260537587684, + "grad_norm": 0.15330415992935695, + "learning_rate": 6.205491130443053e-06, + "loss": 2.7136, + "step": 41356 + }, + { + "epoch": 2.5673226146874417, + "grad_norm": 0.1324725990708106, + "learning_rate": 6.2037486462956595e-06, + "loss": 2.7635, + "step": 41357 + }, + { + "epoch": 2.5673846917871996, + "grad_norm": 0.16234929358936948, + "learning_rate": 6.202006390643939e-06, + "loss": 2.5989, + "step": 41358 + }, + { + "epoch": 2.5674467688869576, + "grad_norm": 0.14620206331538413, + "learning_rate": 6.200264363496982e-06, + "loss": 2.6858, + "step": 41359 + }, + { + "epoch": 2.5675088459867155, + "grad_norm": 0.1339138103763367, + "learning_rate": 6.198522564863868e-06, + "loss": 2.702, + "step": 41360 + }, + { + "epoch": 2.5675709230864734, + "grad_norm": 0.14599857519586223, + "learning_rate": 6.1967809947537035e-06, + "loss": 2.7287, + "step": 41361 + }, + { + "epoch": 2.5676330001862313, + "grad_norm": 0.14139146674844968, + "learning_rate": 6.195039653175572e-06, + "loss": 2.7274, + "step": 41362 + }, + { + "epoch": 2.5676950772859892, + "grad_norm": 0.14389082422117372, + "learning_rate": 6.193298540138548e-06, + "loss": 2.7533, + "step": 41363 + }, + { + "epoch": 2.567757154385747, + "grad_norm": 0.13622782304717335, + "learning_rate": 6.191557655651708e-06, + "loss": 2.7328, + "step": 41364 + }, + { + "epoch": 2.567819231485505, + "grad_norm": 0.149509767037478, + "learning_rate": 6.189816999724163e-06, + "loss": 2.6543, + "step": 41365 + }, + { + "epoch": 2.567881308585263, + "grad_norm": 0.15530871344465572, + "learning_rate": 6.188076572364976e-06, + "loss": 2.6684, + "step": 41366 + }, + { + "epoch": 2.567943385685021, + "grad_norm": 0.13621600904942893, + "learning_rate": 6.186336373583229e-06, + "loss": 2.6981, + "step": 41367 + }, + { + "epoch": 2.568005462784779, + "grad_norm": 0.13771981501529953, + "learning_rate": 6.1845964033879875e-06, + "loss": 2.6848, + "step": 41368 + }, + { + "epoch": 2.5680675398845367, + "grad_norm": 0.14251222132468266, + "learning_rate": 6.1828566617883555e-06, + "loss": 2.6463, + "step": 41369 + }, + { + "epoch": 2.5681296169842946, + "grad_norm": 0.139349250146794, + "learning_rate": 6.181117148793386e-06, + "loss": 2.7069, + "step": 41370 + }, + { + "epoch": 2.568191694084052, + "grad_norm": 0.1412411147040528, + "learning_rate": 6.179377864412178e-06, + "loss": 2.6937, + "step": 41371 + }, + { + "epoch": 2.5682537711838105, + "grad_norm": 0.1367640692666955, + "learning_rate": 6.177638808653796e-06, + "loss": 2.6886, + "step": 41372 + }, + { + "epoch": 2.568315848283568, + "grad_norm": 0.14123146757359026, + "learning_rate": 6.175899981527305e-06, + "loss": 2.701, + "step": 41373 + }, + { + "epoch": 2.5683779253833263, + "grad_norm": 0.13036855773745185, + "learning_rate": 6.174161383041777e-06, + "loss": 2.5869, + "step": 41374 + }, + { + "epoch": 2.568440002483084, + "grad_norm": 0.1384226682028568, + "learning_rate": 6.1724230132062965e-06, + "loss": 2.7102, + "step": 41375 + }, + { + "epoch": 2.568502079582842, + "grad_norm": 0.1580234201595059, + "learning_rate": 6.170684872029925e-06, + "loss": 2.7286, + "step": 41376 + }, + { + "epoch": 2.5685641566825996, + "grad_norm": 0.1391418779649817, + "learning_rate": 6.168946959521732e-06, + "loss": 2.7358, + "step": 41377 + }, + { + "epoch": 2.5686262337823575, + "grad_norm": 0.14615833874266776, + "learning_rate": 6.167209275690766e-06, + "loss": 2.724, + "step": 41378 + }, + { + "epoch": 2.5686883108821155, + "grad_norm": 0.13948681955897135, + "learning_rate": 6.165471820546126e-06, + "loss": 2.6735, + "step": 41379 + }, + { + "epoch": 2.5687503879818734, + "grad_norm": 0.13781293620258941, + "learning_rate": 6.16373459409686e-06, + "loss": 2.6998, + "step": 41380 + }, + { + "epoch": 2.5688124650816313, + "grad_norm": 0.1581371075439046, + "learning_rate": 6.161997596352032e-06, + "loss": 2.7056, + "step": 41381 + }, + { + "epoch": 2.568874542181389, + "grad_norm": 0.1482096416446297, + "learning_rate": 6.1602608273207074e-06, + "loss": 2.7476, + "step": 41382 + }, + { + "epoch": 2.568936619281147, + "grad_norm": 0.1416949391786834, + "learning_rate": 6.15852428701193e-06, + "loss": 2.752, + "step": 41383 + }, + { + "epoch": 2.568998696380905, + "grad_norm": 0.14338351444040764, + "learning_rate": 6.156787975434786e-06, + "loss": 2.6891, + "step": 41384 + }, + { + "epoch": 2.569060773480663, + "grad_norm": 0.14121868724319336, + "learning_rate": 6.155051892598324e-06, + "loss": 2.7046, + "step": 41385 + }, + { + "epoch": 2.569122850580421, + "grad_norm": 0.15112378639446986, + "learning_rate": 6.153316038511598e-06, + "loss": 2.7346, + "step": 41386 + }, + { + "epoch": 2.569184927680179, + "grad_norm": 0.13186325214136038, + "learning_rate": 6.151580413183655e-06, + "loss": 2.7506, + "step": 41387 + }, + { + "epoch": 2.5692470047799367, + "grad_norm": 0.13650207379664417, + "learning_rate": 6.149845016623574e-06, + "loss": 2.6421, + "step": 41388 + }, + { + "epoch": 2.5693090818796946, + "grad_norm": 0.16558779397559562, + "learning_rate": 6.1481098488403945e-06, + "loss": 2.7515, + "step": 41389 + }, + { + "epoch": 2.5693711589794526, + "grad_norm": 0.14721776478007959, + "learning_rate": 6.146374909843172e-06, + "loss": 2.7132, + "step": 41390 + }, + { + "epoch": 2.5694332360792105, + "grad_norm": 0.15695524796245902, + "learning_rate": 6.14464019964095e-06, + "loss": 2.6749, + "step": 41391 + }, + { + "epoch": 2.5694953131789684, + "grad_norm": 0.13730851082613332, + "learning_rate": 6.142905718242792e-06, + "loss": 2.5927, + "step": 41392 + }, + { + "epoch": 2.5695573902787263, + "grad_norm": 0.15459058066196682, + "learning_rate": 6.141171465657747e-06, + "loss": 2.7913, + "step": 41393 + }, + { + "epoch": 2.5696194673784842, + "grad_norm": 0.14218007005415464, + "learning_rate": 6.139437441894852e-06, + "loss": 2.6759, + "step": 41394 + }, + { + "epoch": 2.569681544478242, + "grad_norm": 0.14248166021694697, + "learning_rate": 6.137703646963167e-06, + "loss": 2.7357, + "step": 41395 + }, + { + "epoch": 2.5697436215779996, + "grad_norm": 0.14643118846445405, + "learning_rate": 6.135970080871711e-06, + "loss": 2.6959, + "step": 41396 + }, + { + "epoch": 2.569805698677758, + "grad_norm": 0.1395454960990815, + "learning_rate": 6.134236743629562e-06, + "loss": 2.7205, + "step": 41397 + }, + { + "epoch": 2.5698677757775155, + "grad_norm": 0.14524719401273864, + "learning_rate": 6.132503635245751e-06, + "loss": 2.6832, + "step": 41398 + }, + { + "epoch": 2.569929852877274, + "grad_norm": 0.13549551746653754, + "learning_rate": 6.1307707557293146e-06, + "loss": 2.6602, + "step": 41399 + }, + { + "epoch": 2.5699919299770313, + "grad_norm": 0.12974740621054268, + "learning_rate": 6.1290381050892855e-06, + "loss": 2.6709, + "step": 41400 + }, + { + "epoch": 2.570054007076789, + "grad_norm": 0.1351476107024686, + "learning_rate": 6.127305683334716e-06, + "loss": 2.7485, + "step": 41401 + }, + { + "epoch": 2.570116084176547, + "grad_norm": 0.14312865169314382, + "learning_rate": 6.125573490474651e-06, + "loss": 2.6557, + "step": 41402 + }, + { + "epoch": 2.570178161276305, + "grad_norm": 0.16414359903886752, + "learning_rate": 6.123841526518126e-06, + "loss": 2.7538, + "step": 41403 + }, + { + "epoch": 2.570240238376063, + "grad_norm": 0.13946794970703952, + "learning_rate": 6.122109791474168e-06, + "loss": 2.807, + "step": 41404 + }, + { + "epoch": 2.570302315475821, + "grad_norm": 0.13422130455721318, + "learning_rate": 6.120378285351802e-06, + "loss": 2.6639, + "step": 41405 + }, + { + "epoch": 2.570364392575579, + "grad_norm": 0.13927879630168413, + "learning_rate": 6.118647008160089e-06, + "loss": 2.6126, + "step": 41406 + }, + { + "epoch": 2.5704264696753367, + "grad_norm": 0.1476069920742126, + "learning_rate": 6.116915959908043e-06, + "loss": 2.7287, + "step": 41407 + }, + { + "epoch": 2.5704885467750946, + "grad_norm": 0.14963866163807793, + "learning_rate": 6.115185140604701e-06, + "loss": 2.6883, + "step": 41408 + }, + { + "epoch": 2.5705506238748526, + "grad_norm": 0.13849124427087292, + "learning_rate": 6.113454550259096e-06, + "loss": 2.6642, + "step": 41409 + }, + { + "epoch": 2.5706127009746105, + "grad_norm": 0.17062432240082614, + "learning_rate": 6.111724188880235e-06, + "loss": 2.7603, + "step": 41410 + }, + { + "epoch": 2.5706747780743684, + "grad_norm": 0.16183900692565895, + "learning_rate": 6.1099940564771756e-06, + "loss": 2.6923, + "step": 41411 + }, + { + "epoch": 2.5707368551741263, + "grad_norm": 0.13786495526341314, + "learning_rate": 6.108264153058934e-06, + "loss": 2.8177, + "step": 41412 + }, + { + "epoch": 2.5707989322738842, + "grad_norm": 0.13061906327669526, + "learning_rate": 6.106534478634535e-06, + "loss": 2.6554, + "step": 41413 + }, + { + "epoch": 2.570861009373642, + "grad_norm": 0.1333245406442973, + "learning_rate": 6.104805033212985e-06, + "loss": 2.6751, + "step": 41414 + }, + { + "epoch": 2.5709230864734, + "grad_norm": 0.1421460437075166, + "learning_rate": 6.103075816803338e-06, + "loss": 2.7123, + "step": 41415 + }, + { + "epoch": 2.570985163573158, + "grad_norm": 0.13795908560337022, + "learning_rate": 6.1013468294146005e-06, + "loss": 2.6844, + "step": 41416 + }, + { + "epoch": 2.571047240672916, + "grad_norm": 0.13911220326655502, + "learning_rate": 6.099618071055791e-06, + "loss": 2.7142, + "step": 41417 + }, + { + "epoch": 2.571109317772674, + "grad_norm": 0.1418969329399696, + "learning_rate": 6.097889541735935e-06, + "loss": 2.6887, + "step": 41418 + }, + { + "epoch": 2.5711713948724313, + "grad_norm": 0.15272470672914587, + "learning_rate": 6.096161241464032e-06, + "loss": 2.7466, + "step": 41419 + }, + { + "epoch": 2.5712334719721897, + "grad_norm": 0.13580799565874935, + "learning_rate": 6.094433170249125e-06, + "loss": 2.623, + "step": 41420 + }, + { + "epoch": 2.571295549071947, + "grad_norm": 0.14391950067904263, + "learning_rate": 6.0927053281002235e-06, + "loss": 2.6832, + "step": 41421 + }, + { + "epoch": 2.5713576261717055, + "grad_norm": 0.13672828641057205, + "learning_rate": 6.09097771502633e-06, + "loss": 2.7386, + "step": 41422 + }, + { + "epoch": 2.571419703271463, + "grad_norm": 0.1435526374099602, + "learning_rate": 6.089250331036461e-06, + "loss": 2.7499, + "step": 41423 + }, + { + "epoch": 2.5714817803712213, + "grad_norm": 0.13538423174221445, + "learning_rate": 6.087523176139643e-06, + "loss": 2.6988, + "step": 41424 + }, + { + "epoch": 2.571543857470979, + "grad_norm": 0.1389225520790118, + "learning_rate": 6.085796250344872e-06, + "loss": 2.7257, + "step": 41425 + }, + { + "epoch": 2.5716059345707367, + "grad_norm": 0.15153650227981072, + "learning_rate": 6.084069553661165e-06, + "loss": 2.7048, + "step": 41426 + }, + { + "epoch": 2.5716680116704946, + "grad_norm": 0.14659932394780942, + "learning_rate": 6.082343086097514e-06, + "loss": 2.7802, + "step": 41427 + }, + { + "epoch": 2.5717300887702526, + "grad_norm": 0.15384424190066787, + "learning_rate": 6.080616847662956e-06, + "loss": 2.7939, + "step": 41428 + }, + { + "epoch": 2.5717921658700105, + "grad_norm": 0.1561948500102499, + "learning_rate": 6.07889083836648e-06, + "loss": 2.7279, + "step": 41429 + }, + { + "epoch": 2.5718542429697684, + "grad_norm": 0.13771644727687396, + "learning_rate": 6.077165058217094e-06, + "loss": 2.7025, + "step": 41430 + }, + { + "epoch": 2.5719163200695263, + "grad_norm": 0.13595390960700193, + "learning_rate": 6.075439507223796e-06, + "loss": 2.6593, + "step": 41431 + }, + { + "epoch": 2.5719783971692842, + "grad_norm": 0.14093302397164884, + "learning_rate": 6.073714185395591e-06, + "loss": 2.725, + "step": 41432 + }, + { + "epoch": 2.572040474269042, + "grad_norm": 0.1539321040207914, + "learning_rate": 6.071989092741476e-06, + "loss": 2.7238, + "step": 41433 + }, + { + "epoch": 2.5721025513688, + "grad_norm": 0.13416111163694527, + "learning_rate": 6.070264229270467e-06, + "loss": 2.7376, + "step": 41434 + }, + { + "epoch": 2.572164628468558, + "grad_norm": 0.13612806238017253, + "learning_rate": 6.068539594991557e-06, + "loss": 2.6791, + "step": 41435 + }, + { + "epoch": 2.572226705568316, + "grad_norm": 0.13907641518011846, + "learning_rate": 6.0668151899137385e-06, + "loss": 2.6306, + "step": 41436 + }, + { + "epoch": 2.572288782668074, + "grad_norm": 0.136111751224295, + "learning_rate": 6.065091014046004e-06, + "loss": 2.6638, + "step": 41437 + }, + { + "epoch": 2.5723508597678317, + "grad_norm": 0.14011775031294463, + "learning_rate": 6.063367067397363e-06, + "loss": 2.6138, + "step": 41438 + }, + { + "epoch": 2.5724129368675897, + "grad_norm": 0.13552762645181235, + "learning_rate": 6.061643349976804e-06, + "loss": 2.7065, + "step": 41439 + }, + { + "epoch": 2.5724750139673476, + "grad_norm": 0.1411837233704891, + "learning_rate": 6.059919861793318e-06, + "loss": 2.6966, + "step": 41440 + }, + { + "epoch": 2.5725370910671055, + "grad_norm": 0.13676610535247796, + "learning_rate": 6.058196602855881e-06, + "loss": 2.7726, + "step": 41441 + }, + { + "epoch": 2.5725991681668634, + "grad_norm": 0.1462772112478103, + "learning_rate": 6.056473573173516e-06, + "loss": 2.6526, + "step": 41442 + }, + { + "epoch": 2.5726612452666213, + "grad_norm": 0.1316067717175945, + "learning_rate": 6.0547507727551915e-06, + "loss": 2.6453, + "step": 41443 + }, + { + "epoch": 2.572723322366379, + "grad_norm": 0.13460357570498785, + "learning_rate": 6.053028201609901e-06, + "loss": 2.6462, + "step": 41444 + }, + { + "epoch": 2.572785399466137, + "grad_norm": 0.14799884555541992, + "learning_rate": 6.051305859746637e-06, + "loss": 2.6512, + "step": 41445 + }, + { + "epoch": 2.5728474765658946, + "grad_norm": 0.13503051377642877, + "learning_rate": 6.049583747174364e-06, + "loss": 2.6734, + "step": 41446 + }, + { + "epoch": 2.572909553665653, + "grad_norm": 0.1354025220514058, + "learning_rate": 6.047861863902088e-06, + "loss": 2.7062, + "step": 41447 + }, + { + "epoch": 2.5729716307654105, + "grad_norm": 0.13831681478452368, + "learning_rate": 6.046140209938795e-06, + "loss": 2.7556, + "step": 41448 + }, + { + "epoch": 2.5730337078651684, + "grad_norm": 0.14566587017857804, + "learning_rate": 6.0444187852934545e-06, + "loss": 2.7173, + "step": 41449 + }, + { + "epoch": 2.5730957849649263, + "grad_norm": 0.13339391535945463, + "learning_rate": 6.0426975899750385e-06, + "loss": 2.7202, + "step": 41450 + }, + { + "epoch": 2.573157862064684, + "grad_norm": 0.13700816992081016, + "learning_rate": 6.040976623992556e-06, + "loss": 2.6952, + "step": 41451 + }, + { + "epoch": 2.573219939164442, + "grad_norm": 0.14245210591443658, + "learning_rate": 6.039255887354967e-06, + "loss": 2.7925, + "step": 41452 + }, + { + "epoch": 2.5732820162642, + "grad_norm": 0.1352160087401916, + "learning_rate": 6.037535380071252e-06, + "loss": 2.7083, + "step": 41453 + }, + { + "epoch": 2.573344093363958, + "grad_norm": 0.15250178234434578, + "learning_rate": 6.035815102150388e-06, + "loss": 2.7788, + "step": 41454 + }, + { + "epoch": 2.573406170463716, + "grad_norm": 0.14308745063441972, + "learning_rate": 6.034095053601341e-06, + "loss": 2.7513, + "step": 41455 + }, + { + "epoch": 2.573468247563474, + "grad_norm": 0.1355512428679276, + "learning_rate": 6.032375234433096e-06, + "loss": 2.6698, + "step": 41456 + }, + { + "epoch": 2.5735303246632317, + "grad_norm": 0.13480143831372532, + "learning_rate": 6.030655644654631e-06, + "loss": 2.6973, + "step": 41457 + }, + { + "epoch": 2.5735924017629896, + "grad_norm": 0.1348131344850595, + "learning_rate": 6.028936284274905e-06, + "loss": 2.6875, + "step": 41458 + }, + { + "epoch": 2.5736544788627476, + "grad_norm": 0.13875613540538836, + "learning_rate": 6.027217153302883e-06, + "loss": 2.8082, + "step": 41459 + }, + { + "epoch": 2.5737165559625055, + "grad_norm": 0.13712641020582617, + "learning_rate": 6.025498251747558e-06, + "loss": 2.6401, + "step": 41460 + }, + { + "epoch": 2.5737786330622634, + "grad_norm": 0.14987652708529148, + "learning_rate": 6.023779579617883e-06, + "loss": 2.6846, + "step": 41461 + }, + { + "epoch": 2.5738407101620213, + "grad_norm": 0.1532945550555039, + "learning_rate": 6.022061136922819e-06, + "loss": 2.6739, + "step": 41462 + }, + { + "epoch": 2.5739027872617792, + "grad_norm": 0.13669651395626523, + "learning_rate": 6.020342923671335e-06, + "loss": 2.6611, + "step": 41463 + }, + { + "epoch": 2.573964864361537, + "grad_norm": 0.14091126788856195, + "learning_rate": 6.018624939872403e-06, + "loss": 2.6296, + "step": 41464 + }, + { + "epoch": 2.574026941461295, + "grad_norm": 0.15333238784958098, + "learning_rate": 6.016907185534987e-06, + "loss": 2.7168, + "step": 41465 + }, + { + "epoch": 2.574089018561053, + "grad_norm": 0.13990093292798841, + "learning_rate": 6.01518966066803e-06, + "loss": 2.6472, + "step": 41466 + }, + { + "epoch": 2.5741510956608105, + "grad_norm": 0.15447662241348126, + "learning_rate": 6.013472365280515e-06, + "loss": 2.6073, + "step": 41467 + }, + { + "epoch": 2.574213172760569, + "grad_norm": 0.13292580567778145, + "learning_rate": 6.0117552993814e-06, + "loss": 2.7481, + "step": 41468 + }, + { + "epoch": 2.5742752498603263, + "grad_norm": 0.13428993859610877, + "learning_rate": 6.0100384629796165e-06, + "loss": 2.7343, + "step": 41469 + }, + { + "epoch": 2.5743373269600847, + "grad_norm": 0.13156562219299311, + "learning_rate": 6.008321856084159e-06, + "loss": 2.7603, + "step": 41470 + }, + { + "epoch": 2.574399404059842, + "grad_norm": 0.1784342515653088, + "learning_rate": 6.006605478703958e-06, + "loss": 2.8066, + "step": 41471 + }, + { + "epoch": 2.5744614811596005, + "grad_norm": 0.13789754585426076, + "learning_rate": 6.004889330847985e-06, + "loss": 2.7394, + "step": 41472 + }, + { + "epoch": 2.574523558259358, + "grad_norm": 0.13428985537413046, + "learning_rate": 6.003173412525171e-06, + "loss": 2.6468, + "step": 41473 + }, + { + "epoch": 2.574585635359116, + "grad_norm": 0.1409879081306195, + "learning_rate": 6.001457723744486e-06, + "loss": 2.7629, + "step": 41474 + }, + { + "epoch": 2.574647712458874, + "grad_norm": 0.13580056862970147, + "learning_rate": 5.9997422645148846e-06, + "loss": 2.7444, + "step": 41475 + }, + { + "epoch": 2.5747097895586317, + "grad_norm": 0.13325656255267887, + "learning_rate": 5.998027034845305e-06, + "loss": 2.7209, + "step": 41476 + }, + { + "epoch": 2.5747718666583896, + "grad_norm": 0.1381234553734041, + "learning_rate": 5.996312034744694e-06, + "loss": 2.7044, + "step": 41477 + }, + { + "epoch": 2.5748339437581476, + "grad_norm": 0.13749230076084906, + "learning_rate": 5.994597264222013e-06, + "loss": 2.7826, + "step": 41478 + }, + { + "epoch": 2.5748960208579055, + "grad_norm": 0.1342653997484411, + "learning_rate": 5.992882723286197e-06, + "loss": 2.7591, + "step": 41479 + }, + { + "epoch": 2.5749580979576634, + "grad_norm": 0.14825614089534037, + "learning_rate": 5.991168411946202e-06, + "loss": 2.6361, + "step": 41480 + }, + { + "epoch": 2.5750201750574213, + "grad_norm": 0.14462792539811306, + "learning_rate": 5.989454330210964e-06, + "loss": 2.7042, + "step": 41481 + }, + { + "epoch": 2.5750822521571792, + "grad_norm": 0.1399745268403566, + "learning_rate": 5.987740478089409e-06, + "loss": 2.5927, + "step": 41482 + }, + { + "epoch": 2.575144329256937, + "grad_norm": 0.13311279231947556, + "learning_rate": 5.9860268555905154e-06, + "loss": 2.7664, + "step": 41483 + }, + { + "epoch": 2.575206406356695, + "grad_norm": 0.1419711321681902, + "learning_rate": 5.984313462723201e-06, + "loss": 2.7197, + "step": 41484 + }, + { + "epoch": 2.575268483456453, + "grad_norm": 0.14937904047189995, + "learning_rate": 5.98260029949641e-06, + "loss": 2.709, + "step": 41485 + }, + { + "epoch": 2.575330560556211, + "grad_norm": 0.13509945365147977, + "learning_rate": 5.980887365919063e-06, + "loss": 2.6475, + "step": 41486 + }, + { + "epoch": 2.575392637655969, + "grad_norm": 0.1611598425648372, + "learning_rate": 5.979174662000131e-06, + "loss": 2.6039, + "step": 41487 + }, + { + "epoch": 2.5754547147557267, + "grad_norm": 0.15451758931821138, + "learning_rate": 5.977462187748522e-06, + "loss": 2.7079, + "step": 41488 + }, + { + "epoch": 2.5755167918554847, + "grad_norm": 0.13999330845236135, + "learning_rate": 5.975749943173187e-06, + "loss": 2.7175, + "step": 41489 + }, + { + "epoch": 2.5755788689552426, + "grad_norm": 0.1489497181150542, + "learning_rate": 5.974037928283038e-06, + "loss": 2.684, + "step": 41490 + }, + { + "epoch": 2.5756409460550005, + "grad_norm": 0.13722235706030356, + "learning_rate": 5.972326143087032e-06, + "loss": 2.7353, + "step": 41491 + }, + { + "epoch": 2.575703023154758, + "grad_norm": 0.13587218173244286, + "learning_rate": 5.9706145875940925e-06, + "loss": 2.7958, + "step": 41492 + }, + { + "epoch": 2.5757651002545163, + "grad_norm": 0.13482397999825932, + "learning_rate": 5.968903261813141e-06, + "loss": 2.6997, + "step": 41493 + }, + { + "epoch": 2.575827177354274, + "grad_norm": 0.13388656627331932, + "learning_rate": 5.96719216575311e-06, + "loss": 2.7343, + "step": 41494 + }, + { + "epoch": 2.575889254454032, + "grad_norm": 0.14322911537053895, + "learning_rate": 5.965481299422915e-06, + "loss": 2.7201, + "step": 41495 + }, + { + "epoch": 2.5759513315537896, + "grad_norm": 0.1489264128624702, + "learning_rate": 5.963770662831509e-06, + "loss": 2.6379, + "step": 41496 + }, + { + "epoch": 2.5760134086535476, + "grad_norm": 0.13493812964532342, + "learning_rate": 5.9620602559878e-06, + "loss": 2.7234, + "step": 41497 + }, + { + "epoch": 2.5760754857533055, + "grad_norm": 0.14243053609781345, + "learning_rate": 5.960350078900711e-06, + "loss": 2.7339, + "step": 41498 + }, + { + "epoch": 2.5761375628530634, + "grad_norm": 0.13568422740449515, + "learning_rate": 5.958640131579157e-06, + "loss": 2.7624, + "step": 41499 + }, + { + "epoch": 2.5761996399528213, + "grad_norm": 0.13675181125351454, + "learning_rate": 5.95693041403207e-06, + "loss": 2.6744, + "step": 41500 + }, + { + "epoch": 2.5762617170525792, + "grad_norm": 0.13430663742967439, + "learning_rate": 5.955220926268379e-06, + "loss": 2.6937, + "step": 41501 + }, + { + "epoch": 2.576323794152337, + "grad_norm": 0.1385727850723569, + "learning_rate": 5.95351166829699e-06, + "loss": 2.7893, + "step": 41502 + }, + { + "epoch": 2.576385871252095, + "grad_norm": 0.1333586319252949, + "learning_rate": 5.951802640126824e-06, + "loss": 2.6464, + "step": 41503 + }, + { + "epoch": 2.576447948351853, + "grad_norm": 0.1397028263589312, + "learning_rate": 5.950093841766802e-06, + "loss": 2.6861, + "step": 41504 + }, + { + "epoch": 2.576510025451611, + "grad_norm": 0.1587859532332202, + "learning_rate": 5.948385273225815e-06, + "loss": 2.6388, + "step": 41505 + }, + { + "epoch": 2.576572102551369, + "grad_norm": 0.14456423176941596, + "learning_rate": 5.946676934512813e-06, + "loss": 2.7692, + "step": 41506 + }, + { + "epoch": 2.5766341796511267, + "grad_norm": 0.13308292794190738, + "learning_rate": 5.944968825636687e-06, + "loss": 2.642, + "step": 41507 + }, + { + "epoch": 2.5766962567508847, + "grad_norm": 0.14170635555074795, + "learning_rate": 5.94326094660636e-06, + "loss": 2.727, + "step": 41508 + }, + { + "epoch": 2.5767583338506426, + "grad_norm": 0.13619244636238037, + "learning_rate": 5.941553297430718e-06, + "loss": 2.694, + "step": 41509 + }, + { + "epoch": 2.5768204109504005, + "grad_norm": 0.13663251745905505, + "learning_rate": 5.939845878118705e-06, + "loss": 2.7254, + "step": 41510 + }, + { + "epoch": 2.5768824880501584, + "grad_norm": 0.13433780349407165, + "learning_rate": 5.938138688679207e-06, + "loss": 2.7053, + "step": 41511 + }, + { + "epoch": 2.5769445651499163, + "grad_norm": 0.1418204884269286, + "learning_rate": 5.936431729121139e-06, + "loss": 2.6814, + "step": 41512 + }, + { + "epoch": 2.5770066422496742, + "grad_norm": 0.13764606433816498, + "learning_rate": 5.9347249994533895e-06, + "loss": 2.6981, + "step": 41513 + }, + { + "epoch": 2.577068719349432, + "grad_norm": 0.13224995207605414, + "learning_rate": 5.933018499684895e-06, + "loss": 2.6881, + "step": 41514 + }, + { + "epoch": 2.5771307964491896, + "grad_norm": 0.13460900009421878, + "learning_rate": 5.931312229824532e-06, + "loss": 2.6212, + "step": 41515 + }, + { + "epoch": 2.577192873548948, + "grad_norm": 0.14431793348377028, + "learning_rate": 5.929606189881215e-06, + "loss": 2.6088, + "step": 41516 + }, + { + "epoch": 2.5772549506487055, + "grad_norm": 0.1467812708305846, + "learning_rate": 5.9279003798638375e-06, + "loss": 2.6947, + "step": 41517 + }, + { + "epoch": 2.577317027748464, + "grad_norm": 0.1323831082315707, + "learning_rate": 5.9261947997812985e-06, + "loss": 2.7024, + "step": 41518 + }, + { + "epoch": 2.5773791048482213, + "grad_norm": 0.12962152256297063, + "learning_rate": 5.924489449642507e-06, + "loss": 2.6344, + "step": 41519 + }, + { + "epoch": 2.5774411819479797, + "grad_norm": 0.13986306270246154, + "learning_rate": 5.922784329456355e-06, + "loss": 2.7245, + "step": 41520 + }, + { + "epoch": 2.577503259047737, + "grad_norm": 0.14905136316710055, + "learning_rate": 5.921079439231736e-06, + "loss": 2.7302, + "step": 41521 + }, + { + "epoch": 2.577565336147495, + "grad_norm": 0.14112641428516695, + "learning_rate": 5.919374778977538e-06, + "loss": 2.7279, + "step": 41522 + }, + { + "epoch": 2.577627413247253, + "grad_norm": 0.13406300864756732, + "learning_rate": 5.9176703487026694e-06, + "loss": 2.6046, + "step": 41523 + }, + { + "epoch": 2.577689490347011, + "grad_norm": 0.1377257252647767, + "learning_rate": 5.915966148416019e-06, + "loss": 2.7569, + "step": 41524 + }, + { + "epoch": 2.577751567446769, + "grad_norm": 0.13680524133819905, + "learning_rate": 5.914262178126473e-06, + "loss": 2.699, + "step": 41525 + }, + { + "epoch": 2.5778136445465267, + "grad_norm": 0.15490079946088456, + "learning_rate": 5.912558437842914e-06, + "loss": 2.6299, + "step": 41526 + }, + { + "epoch": 2.5778757216462846, + "grad_norm": 0.13193708707418658, + "learning_rate": 5.9108549275742495e-06, + "loss": 2.7552, + "step": 41527 + }, + { + "epoch": 2.5779377987460426, + "grad_norm": 0.1335051789502547, + "learning_rate": 5.909151647329358e-06, + "loss": 2.7826, + "step": 41528 + }, + { + "epoch": 2.5779998758458005, + "grad_norm": 0.1502823749697647, + "learning_rate": 5.907448597117126e-06, + "loss": 2.6647, + "step": 41529 + }, + { + "epoch": 2.5780619529455584, + "grad_norm": 0.1346371010195985, + "learning_rate": 5.905745776946442e-06, + "loss": 2.6866, + "step": 41530 + }, + { + "epoch": 2.5781240300453163, + "grad_norm": 0.13935819889166404, + "learning_rate": 5.904043186826175e-06, + "loss": 2.6632, + "step": 41531 + }, + { + "epoch": 2.5781861071450742, + "grad_norm": 0.13848189651697135, + "learning_rate": 5.902340826765213e-06, + "loss": 2.7565, + "step": 41532 + }, + { + "epoch": 2.578248184244832, + "grad_norm": 0.13784416080492337, + "learning_rate": 5.900638696772459e-06, + "loss": 2.6969, + "step": 41533 + }, + { + "epoch": 2.57831026134459, + "grad_norm": 0.15447108176414484, + "learning_rate": 5.89893679685678e-06, + "loss": 2.7155, + "step": 41534 + }, + { + "epoch": 2.578372338444348, + "grad_norm": 0.1359529437138557, + "learning_rate": 5.897235127027051e-06, + "loss": 2.6568, + "step": 41535 + }, + { + "epoch": 2.578434415544106, + "grad_norm": 0.1336702180934754, + "learning_rate": 5.895533687292143e-06, + "loss": 2.6718, + "step": 41536 + }, + { + "epoch": 2.578496492643864, + "grad_norm": 0.13560947956767283, + "learning_rate": 5.893832477660954e-06, + "loss": 2.6654, + "step": 41537 + }, + { + "epoch": 2.5785585697436217, + "grad_norm": 0.13831152234376184, + "learning_rate": 5.89213149814235e-06, + "loss": 2.6892, + "step": 41538 + }, + { + "epoch": 2.5786206468433797, + "grad_norm": 0.1513684426532512, + "learning_rate": 5.8904307487452015e-06, + "loss": 2.7412, + "step": 41539 + }, + { + "epoch": 2.578682723943137, + "grad_norm": 0.13682094569166725, + "learning_rate": 5.888730229478378e-06, + "loss": 2.7221, + "step": 41540 + }, + { + "epoch": 2.5787448010428955, + "grad_norm": 0.1434930357601365, + "learning_rate": 5.8870299403507524e-06, + "loss": 2.7156, + "step": 41541 + }, + { + "epoch": 2.578806878142653, + "grad_norm": 0.12997933436700504, + "learning_rate": 5.88532988137121e-06, + "loss": 2.6362, + "step": 41542 + }, + { + "epoch": 2.5788689552424113, + "grad_norm": 0.1423162773202311, + "learning_rate": 5.8836300525486165e-06, + "loss": 2.6257, + "step": 41543 + }, + { + "epoch": 2.578931032342169, + "grad_norm": 0.13052423462339205, + "learning_rate": 5.881930453891826e-06, + "loss": 2.702, + "step": 41544 + }, + { + "epoch": 2.5789931094419267, + "grad_norm": 0.140285366584727, + "learning_rate": 5.88023108540971e-06, + "loss": 2.7429, + "step": 41545 + }, + { + "epoch": 2.5790551865416846, + "grad_norm": 0.1383046674566228, + "learning_rate": 5.878531947111143e-06, + "loss": 2.7134, + "step": 41546 + }, + { + "epoch": 2.5791172636414426, + "grad_norm": 0.13686661898198058, + "learning_rate": 5.876833039004992e-06, + "loss": 2.7086, + "step": 41547 + }, + { + "epoch": 2.5791793407412005, + "grad_norm": 0.13383417691489055, + "learning_rate": 5.8751343611001095e-06, + "loss": 2.688, + "step": 41548 + }, + { + "epoch": 2.5792414178409584, + "grad_norm": 0.1358093191771488, + "learning_rate": 5.873435913405357e-06, + "loss": 2.6708, + "step": 41549 + }, + { + "epoch": 2.5793034949407163, + "grad_norm": 0.13780078500792606, + "learning_rate": 5.871737695929608e-06, + "loss": 2.6661, + "step": 41550 + }, + { + "epoch": 2.5793655720404742, + "grad_norm": 0.14272687938240944, + "learning_rate": 5.8700397086817185e-06, + "loss": 2.7101, + "step": 41551 + }, + { + "epoch": 2.579427649140232, + "grad_norm": 0.1324414230366921, + "learning_rate": 5.868341951670541e-06, + "loss": 2.7199, + "step": 41552 + }, + { + "epoch": 2.57948972623999, + "grad_norm": 0.13627381130920202, + "learning_rate": 5.866644424904938e-06, + "loss": 2.6879, + "step": 41553 + }, + { + "epoch": 2.579551803339748, + "grad_norm": 0.1383902349256583, + "learning_rate": 5.864947128393755e-06, + "loss": 2.7971, + "step": 41554 + }, + { + "epoch": 2.579613880439506, + "grad_norm": 0.13531807296632542, + "learning_rate": 5.86325006214587e-06, + "loss": 2.6679, + "step": 41555 + }, + { + "epoch": 2.579675957539264, + "grad_norm": 0.15358245816833507, + "learning_rate": 5.861553226170124e-06, + "loss": 2.708, + "step": 41556 + }, + { + "epoch": 2.5797380346390217, + "grad_norm": 0.1424877455110139, + "learning_rate": 5.859856620475368e-06, + "loss": 2.6885, + "step": 41557 + }, + { + "epoch": 2.5798001117387797, + "grad_norm": 0.14806191712711975, + "learning_rate": 5.858160245070443e-06, + "loss": 2.756, + "step": 41558 + }, + { + "epoch": 2.5798621888385376, + "grad_norm": 0.14734211612333545, + "learning_rate": 5.856464099964226e-06, + "loss": 2.6934, + "step": 41559 + }, + { + "epoch": 2.5799242659382955, + "grad_norm": 0.15597556570772578, + "learning_rate": 5.854768185165549e-06, + "loss": 2.7178, + "step": 41560 + }, + { + "epoch": 2.5799863430380534, + "grad_norm": 0.14467155245069208, + "learning_rate": 5.853072500683266e-06, + "loss": 2.7556, + "step": 41561 + }, + { + "epoch": 2.5800484201378113, + "grad_norm": 0.14144016143766017, + "learning_rate": 5.851377046526208e-06, + "loss": 2.6739, + "step": 41562 + }, + { + "epoch": 2.580110497237569, + "grad_norm": 0.13586043277918777, + "learning_rate": 5.84968182270324e-06, + "loss": 2.6582, + "step": 41563 + }, + { + "epoch": 2.580172574337327, + "grad_norm": 0.13564176755744065, + "learning_rate": 5.847986829223195e-06, + "loss": 2.7151, + "step": 41564 + }, + { + "epoch": 2.5802346514370846, + "grad_norm": 0.15655296279503167, + "learning_rate": 5.846292066094933e-06, + "loss": 2.6777, + "step": 41565 + }, + { + "epoch": 2.580296728536843, + "grad_norm": 0.1343513354267203, + "learning_rate": 5.8445975333272785e-06, + "loss": 2.6996, + "step": 41566 + }, + { + "epoch": 2.5803588056366005, + "grad_norm": 0.1434187212766749, + "learning_rate": 5.842903230929081e-06, + "loss": 2.7128, + "step": 41567 + }, + { + "epoch": 2.580420882736359, + "grad_norm": 0.1406911985562779, + "learning_rate": 5.841209158909167e-06, + "loss": 2.7941, + "step": 41568 + }, + { + "epoch": 2.5804829598361163, + "grad_norm": 0.15203480082224077, + "learning_rate": 5.839515317276395e-06, + "loss": 2.6444, + "step": 41569 + }, + { + "epoch": 2.5805450369358742, + "grad_norm": 0.15006308992127712, + "learning_rate": 5.837821706039587e-06, + "loss": 2.7702, + "step": 41570 + }, + { + "epoch": 2.580607114035632, + "grad_norm": 0.15261500003322034, + "learning_rate": 5.83612832520759e-06, + "loss": 2.7154, + "step": 41571 + }, + { + "epoch": 2.58066919113539, + "grad_norm": 0.1555179194711625, + "learning_rate": 5.8344351747892216e-06, + "loss": 2.7371, + "step": 41572 + }, + { + "epoch": 2.580731268235148, + "grad_norm": 0.15808137871550706, + "learning_rate": 5.832742254793333e-06, + "loss": 2.6328, + "step": 41573 + }, + { + "epoch": 2.580793345334906, + "grad_norm": 0.15461949529771585, + "learning_rate": 5.831049565228752e-06, + "loss": 2.7065, + "step": 41574 + }, + { + "epoch": 2.580855422434664, + "grad_norm": 0.14021635285078302, + "learning_rate": 5.829357106104311e-06, + "loss": 2.7073, + "step": 41575 + }, + { + "epoch": 2.5809174995344217, + "grad_norm": 0.15000165451647315, + "learning_rate": 5.827664877428824e-06, + "loss": 2.7455, + "step": 41576 + }, + { + "epoch": 2.5809795766341797, + "grad_norm": 0.13814777408754014, + "learning_rate": 5.825972879211139e-06, + "loss": 2.6804, + "step": 41577 + }, + { + "epoch": 2.5810416537339376, + "grad_norm": 0.15021678860310173, + "learning_rate": 5.824281111460084e-06, + "loss": 2.6244, + "step": 41578 + }, + { + "epoch": 2.5811037308336955, + "grad_norm": 0.1537678922054424, + "learning_rate": 5.822589574184472e-06, + "loss": 2.8361, + "step": 41579 + }, + { + "epoch": 2.5811658079334534, + "grad_norm": 0.16417495529212825, + "learning_rate": 5.820898267393138e-06, + "loss": 2.6658, + "step": 41580 + }, + { + "epoch": 2.5812278850332113, + "grad_norm": 0.1339092271890034, + "learning_rate": 5.8192071910948944e-06, + "loss": 2.6937, + "step": 41581 + }, + { + "epoch": 2.5812899621329692, + "grad_norm": 0.13530043204588185, + "learning_rate": 5.817516345298579e-06, + "loss": 2.743, + "step": 41582 + }, + { + "epoch": 2.581352039232727, + "grad_norm": 0.13383745446007073, + "learning_rate": 5.815825730013008e-06, + "loss": 2.6898, + "step": 41583 + }, + { + "epoch": 2.581414116332485, + "grad_norm": 0.13714360573302117, + "learning_rate": 5.814135345246996e-06, + "loss": 2.7577, + "step": 41584 + }, + { + "epoch": 2.581476193432243, + "grad_norm": 0.15429398377039574, + "learning_rate": 5.812445191009364e-06, + "loss": 2.7692, + "step": 41585 + }, + { + "epoch": 2.5815382705320005, + "grad_norm": 0.13956133253332706, + "learning_rate": 5.8107552673089374e-06, + "loss": 2.6805, + "step": 41586 + }, + { + "epoch": 2.581600347631759, + "grad_norm": 0.13796996057784988, + "learning_rate": 5.809065574154527e-06, + "loss": 2.775, + "step": 41587 + }, + { + "epoch": 2.5816624247315163, + "grad_norm": 0.13714804009076034, + "learning_rate": 5.807376111554952e-06, + "loss": 2.8414, + "step": 41588 + }, + { + "epoch": 2.5817245018312747, + "grad_norm": 0.13731506275898697, + "learning_rate": 5.805686879519023e-06, + "loss": 2.7291, + "step": 41589 + }, + { + "epoch": 2.581786578931032, + "grad_norm": 0.13787469609789932, + "learning_rate": 5.803997878055545e-06, + "loss": 2.7472, + "step": 41590 + }, + { + "epoch": 2.5818486560307905, + "grad_norm": 0.1370267328259422, + "learning_rate": 5.802309107173348e-06, + "loss": 2.6966, + "step": 41591 + }, + { + "epoch": 2.581910733130548, + "grad_norm": 0.15525772806987925, + "learning_rate": 5.800620566881232e-06, + "loss": 2.7525, + "step": 41592 + }, + { + "epoch": 2.581972810230306, + "grad_norm": 0.14161658371801747, + "learning_rate": 5.798932257188011e-06, + "loss": 2.6868, + "step": 41593 + }, + { + "epoch": 2.582034887330064, + "grad_norm": 0.1336037194285523, + "learning_rate": 5.797244178102479e-06, + "loss": 2.7412, + "step": 41594 + }, + { + "epoch": 2.5820969644298217, + "grad_norm": 0.14059445261308054, + "learning_rate": 5.795556329633467e-06, + "loss": 2.7454, + "step": 41595 + }, + { + "epoch": 2.5821590415295796, + "grad_norm": 0.1395000256013676, + "learning_rate": 5.793868711789769e-06, + "loss": 2.768, + "step": 41596 + }, + { + "epoch": 2.5822211186293376, + "grad_norm": 0.1327027330072745, + "learning_rate": 5.792181324580176e-06, + "loss": 2.7234, + "step": 41597 + }, + { + "epoch": 2.5822831957290955, + "grad_norm": 0.14995845403102875, + "learning_rate": 5.790494168013516e-06, + "loss": 2.7237, + "step": 41598 + }, + { + "epoch": 2.5823452728288534, + "grad_norm": 0.14772598496956182, + "learning_rate": 5.78880724209857e-06, + "loss": 2.6739, + "step": 41599 + }, + { + "epoch": 2.5824073499286113, + "grad_norm": 0.16718776582536332, + "learning_rate": 5.7871205468441605e-06, + "loss": 2.8146, + "step": 41600 + }, + { + "epoch": 2.5824694270283692, + "grad_norm": 0.13552695988283112, + "learning_rate": 5.785434082259072e-06, + "loss": 2.7154, + "step": 41601 + }, + { + "epoch": 2.582531504128127, + "grad_norm": 0.14161612337160556, + "learning_rate": 5.783747848352111e-06, + "loss": 2.7676, + "step": 41602 + }, + { + "epoch": 2.582593581227885, + "grad_norm": 0.1450786405892113, + "learning_rate": 5.782061845132075e-06, + "loss": 2.7763, + "step": 41603 + }, + { + "epoch": 2.582655658327643, + "grad_norm": 0.13359515206119912, + "learning_rate": 5.78037607260774e-06, + "loss": 2.6931, + "step": 41604 + }, + { + "epoch": 2.582717735427401, + "grad_norm": 0.14541332966433287, + "learning_rate": 5.778690530787933e-06, + "loss": 2.7118, + "step": 41605 + }, + { + "epoch": 2.582779812527159, + "grad_norm": 0.13143915584254026, + "learning_rate": 5.777005219681431e-06, + "loss": 2.7273, + "step": 41606 + }, + { + "epoch": 2.5828418896269167, + "grad_norm": 0.14120535173356255, + "learning_rate": 5.775320139297025e-06, + "loss": 2.7591, + "step": 41607 + }, + { + "epoch": 2.5829039667266747, + "grad_norm": 0.1409241752288745, + "learning_rate": 5.773635289643503e-06, + "loss": 2.6204, + "step": 41608 + }, + { + "epoch": 2.5829660438264326, + "grad_norm": 0.14145786623697407, + "learning_rate": 5.77195067072967e-06, + "loss": 2.7269, + "step": 41609 + }, + { + "epoch": 2.5830281209261905, + "grad_norm": 0.14546057555220585, + "learning_rate": 5.770266282564307e-06, + "loss": 2.6772, + "step": 41610 + }, + { + "epoch": 2.583090198025948, + "grad_norm": 0.13904346459010028, + "learning_rate": 5.768582125156203e-06, + "loss": 2.7175, + "step": 41611 + }, + { + "epoch": 2.5831522751257063, + "grad_norm": 0.13554271614664395, + "learning_rate": 5.7668981985141365e-06, + "loss": 2.6993, + "step": 41612 + }, + { + "epoch": 2.583214352225464, + "grad_norm": 0.14160091794623642, + "learning_rate": 5.765214502646904e-06, + "loss": 2.7371, + "step": 41613 + }, + { + "epoch": 2.583276429325222, + "grad_norm": 0.13494441708314228, + "learning_rate": 5.763531037563291e-06, + "loss": 2.7056, + "step": 41614 + }, + { + "epoch": 2.5833385064249796, + "grad_norm": 0.14501641652265562, + "learning_rate": 5.761847803272075e-06, + "loss": 2.6644, + "step": 41615 + }, + { + "epoch": 2.5834005835247376, + "grad_norm": 0.13869019508235875, + "learning_rate": 5.760164799782036e-06, + "loss": 2.7329, + "step": 41616 + }, + { + "epoch": 2.5834626606244955, + "grad_norm": 0.16859614944164714, + "learning_rate": 5.7584820271019465e-06, + "loss": 2.6315, + "step": 41617 + }, + { + "epoch": 2.5835247377242534, + "grad_norm": 0.14097410156013093, + "learning_rate": 5.756799485240605e-06, + "loss": 2.7333, + "step": 41618 + }, + { + "epoch": 2.5835868148240113, + "grad_norm": 0.13502652717829214, + "learning_rate": 5.755117174206787e-06, + "loss": 2.7601, + "step": 41619 + }, + { + "epoch": 2.5836488919237692, + "grad_norm": 0.15321138485162433, + "learning_rate": 5.7534350940092575e-06, + "loss": 2.7028, + "step": 41620 + }, + { + "epoch": 2.583710969023527, + "grad_norm": 0.13565054208003546, + "learning_rate": 5.751753244656788e-06, + "loss": 2.754, + "step": 41621 + }, + { + "epoch": 2.583773046123285, + "grad_norm": 0.13769881602490255, + "learning_rate": 5.750071626158177e-06, + "loss": 2.7434, + "step": 41622 + }, + { + "epoch": 2.583835123223043, + "grad_norm": 0.13838567885447128, + "learning_rate": 5.748390238522183e-06, + "loss": 2.7049, + "step": 41623 + }, + { + "epoch": 2.583897200322801, + "grad_norm": 0.14576956867751453, + "learning_rate": 5.746709081757584e-06, + "loss": 2.7027, + "step": 41624 + }, + { + "epoch": 2.583959277422559, + "grad_norm": 0.13694276848873452, + "learning_rate": 5.745028155873139e-06, + "loss": 2.6631, + "step": 41625 + }, + { + "epoch": 2.5840213545223167, + "grad_norm": 0.13570269689453976, + "learning_rate": 5.743347460877618e-06, + "loss": 2.7084, + "step": 41626 + }, + { + "epoch": 2.5840834316220747, + "grad_norm": 0.14706720118378855, + "learning_rate": 5.74166699677981e-06, + "loss": 2.6739, + "step": 41627 + }, + { + "epoch": 2.5841455087218326, + "grad_norm": 0.14369364260859116, + "learning_rate": 5.739986763588467e-06, + "loss": 2.6509, + "step": 41628 + }, + { + "epoch": 2.5842075858215905, + "grad_norm": 0.1361904017002308, + "learning_rate": 5.738306761312356e-06, + "loss": 2.726, + "step": 41629 + }, + { + "epoch": 2.5842696629213484, + "grad_norm": 0.13409229358761401, + "learning_rate": 5.736626989960236e-06, + "loss": 2.6576, + "step": 41630 + }, + { + "epoch": 2.5843317400211063, + "grad_norm": 0.14703403344595567, + "learning_rate": 5.734947449540878e-06, + "loss": 2.6903, + "step": 41631 + }, + { + "epoch": 2.5843938171208642, + "grad_norm": 0.1543836771981977, + "learning_rate": 5.733268140063058e-06, + "loss": 2.7199, + "step": 41632 + }, + { + "epoch": 2.584455894220622, + "grad_norm": 0.17673766827527077, + "learning_rate": 5.731589061535519e-06, + "loss": 2.7339, + "step": 41633 + }, + { + "epoch": 2.5845179713203796, + "grad_norm": 0.14311299525643448, + "learning_rate": 5.729910213967027e-06, + "loss": 2.696, + "step": 41634 + }, + { + "epoch": 2.584580048420138, + "grad_norm": 0.13923463668687347, + "learning_rate": 5.728231597366335e-06, + "loss": 2.7964, + "step": 41635 + }, + { + "epoch": 2.5846421255198955, + "grad_norm": 0.15919066315238972, + "learning_rate": 5.7265532117422085e-06, + "loss": 2.6512, + "step": 41636 + }, + { + "epoch": 2.584704202619654, + "grad_norm": 0.15715638365581516, + "learning_rate": 5.724875057103407e-06, + "loss": 2.6907, + "step": 41637 + }, + { + "epoch": 2.5847662797194113, + "grad_norm": 0.140012579772088, + "learning_rate": 5.7231971334586795e-06, + "loss": 2.7332, + "step": 41638 + }, + { + "epoch": 2.5848283568191697, + "grad_norm": 0.13994310541711957, + "learning_rate": 5.721519440816781e-06, + "loss": 2.7514, + "step": 41639 + }, + { + "epoch": 2.584890433918927, + "grad_norm": 0.1448861931627294, + "learning_rate": 5.719841979186452e-06, + "loss": 2.682, + "step": 41640 + }, + { + "epoch": 2.584952511018685, + "grad_norm": 0.14817582455610478, + "learning_rate": 5.718164748576471e-06, + "loss": 2.7389, + "step": 41641 + }, + { + "epoch": 2.585014588118443, + "grad_norm": 0.1363472933259622, + "learning_rate": 5.716487748995575e-06, + "loss": 2.7121, + "step": 41642 + }, + { + "epoch": 2.585076665218201, + "grad_norm": 0.1325684015263443, + "learning_rate": 5.7148109804525065e-06, + "loss": 2.7321, + "step": 41643 + }, + { + "epoch": 2.585138742317959, + "grad_norm": 0.13217870869046683, + "learning_rate": 5.713134442956014e-06, + "loss": 2.7129, + "step": 41644 + }, + { + "epoch": 2.5852008194177167, + "grad_norm": 0.1350934078226338, + "learning_rate": 5.711458136514863e-06, + "loss": 2.7446, + "step": 41645 + }, + { + "epoch": 2.5852628965174747, + "grad_norm": 0.14333149693480737, + "learning_rate": 5.709782061137781e-06, + "loss": 2.686, + "step": 41646 + }, + { + "epoch": 2.5853249736172326, + "grad_norm": 0.1397796028209868, + "learning_rate": 5.7081062168335195e-06, + "loss": 2.6405, + "step": 41647 + }, + { + "epoch": 2.5853870507169905, + "grad_norm": 0.1399729965236059, + "learning_rate": 5.706430603610808e-06, + "loss": 2.6903, + "step": 41648 + }, + { + "epoch": 2.5854491278167484, + "grad_norm": 0.1387682029451156, + "learning_rate": 5.704755221478414e-06, + "loss": 2.7171, + "step": 41649 + }, + { + "epoch": 2.5855112049165063, + "grad_norm": 0.13564334673875855, + "learning_rate": 5.703080070445066e-06, + "loss": 2.736, + "step": 41650 + }, + { + "epoch": 2.5855732820162642, + "grad_norm": 0.14586947126747563, + "learning_rate": 5.701405150519501e-06, + "loss": 2.7045, + "step": 41651 + }, + { + "epoch": 2.585635359116022, + "grad_norm": 0.14000180359300218, + "learning_rate": 5.699730461710457e-06, + "loss": 2.8125, + "step": 41652 + }, + { + "epoch": 2.58569743621578, + "grad_norm": 0.16115607469433416, + "learning_rate": 5.698056004026664e-06, + "loss": 2.7181, + "step": 41653 + }, + { + "epoch": 2.585759513315538, + "grad_norm": 0.15614727338869513, + "learning_rate": 5.696381777476878e-06, + "loss": 2.6554, + "step": 41654 + }, + { + "epoch": 2.585821590415296, + "grad_norm": 0.15549147190653248, + "learning_rate": 5.694707782069825e-06, + "loss": 2.6922, + "step": 41655 + }, + { + "epoch": 2.585883667515054, + "grad_norm": 0.15082601589315695, + "learning_rate": 5.693034017814236e-06, + "loss": 2.7971, + "step": 41656 + }, + { + "epoch": 2.5859457446148117, + "grad_norm": 0.14047453035675636, + "learning_rate": 5.691360484718838e-06, + "loss": 2.7475, + "step": 41657 + }, + { + "epoch": 2.5860078217145697, + "grad_norm": 0.13696201344581446, + "learning_rate": 5.689687182792375e-06, + "loss": 2.775, + "step": 41658 + }, + { + "epoch": 2.586069898814327, + "grad_norm": 0.14287945507142413, + "learning_rate": 5.688014112043571e-06, + "loss": 2.7489, + "step": 41659 + }, + { + "epoch": 2.5861319759140855, + "grad_norm": 0.14028313593957437, + "learning_rate": 5.686341272481155e-06, + "loss": 2.7153, + "step": 41660 + }, + { + "epoch": 2.586194053013843, + "grad_norm": 0.13151049535264284, + "learning_rate": 5.68466866411384e-06, + "loss": 2.6479, + "step": 41661 + }, + { + "epoch": 2.5862561301136013, + "grad_norm": 0.13762908046071398, + "learning_rate": 5.682996286950382e-06, + "loss": 2.7451, + "step": 41662 + }, + { + "epoch": 2.586318207213359, + "grad_norm": 0.1306340712039568, + "learning_rate": 5.681324140999478e-06, + "loss": 2.653, + "step": 41663 + }, + { + "epoch": 2.5863802843131167, + "grad_norm": 0.15339650247362197, + "learning_rate": 5.679652226269877e-06, + "loss": 2.7409, + "step": 41664 + }, + { + "epoch": 2.5864423614128746, + "grad_norm": 0.13054801423347917, + "learning_rate": 5.677980542770284e-06, + "loss": 2.6629, + "step": 41665 + }, + { + "epoch": 2.5865044385126326, + "grad_norm": 0.13522676684499513, + "learning_rate": 5.676309090509429e-06, + "loss": 2.6977, + "step": 41666 + }, + { + "epoch": 2.5865665156123905, + "grad_norm": 0.13779843304003397, + "learning_rate": 5.674637869496024e-06, + "loss": 2.6737, + "step": 41667 + }, + { + "epoch": 2.5866285927121484, + "grad_norm": 0.13298739810179197, + "learning_rate": 5.6729668797388e-06, + "loss": 2.7698, + "step": 41668 + }, + { + "epoch": 2.5866906698119063, + "grad_norm": 0.13432335498758513, + "learning_rate": 5.671296121246466e-06, + "loss": 2.6767, + "step": 41669 + }, + { + "epoch": 2.5867527469116642, + "grad_norm": 0.13417483491743495, + "learning_rate": 5.669625594027744e-06, + "loss": 2.6441, + "step": 41670 + }, + { + "epoch": 2.586814824011422, + "grad_norm": 0.1458947075261053, + "learning_rate": 5.667955298091337e-06, + "loss": 2.6935, + "step": 41671 + }, + { + "epoch": 2.58687690111118, + "grad_norm": 0.1678023456630479, + "learning_rate": 5.6662852334459785e-06, + "loss": 2.6915, + "step": 41672 + }, + { + "epoch": 2.586938978210938, + "grad_norm": 0.13980158164749115, + "learning_rate": 5.664615400100376e-06, + "loss": 2.6761, + "step": 41673 + }, + { + "epoch": 2.587001055310696, + "grad_norm": 0.14924208398693667, + "learning_rate": 5.662945798063229e-06, + "loss": 2.6423, + "step": 41674 + }, + { + "epoch": 2.587063132410454, + "grad_norm": 0.13820035448634413, + "learning_rate": 5.661276427343265e-06, + "loss": 2.6797, + "step": 41675 + }, + { + "epoch": 2.5871252095102117, + "grad_norm": 0.13311368087098255, + "learning_rate": 5.6596072879491706e-06, + "loss": 2.6726, + "step": 41676 + }, + { + "epoch": 2.5871872866099697, + "grad_norm": 0.1348683753353257, + "learning_rate": 5.657938379889682e-06, + "loss": 2.6348, + "step": 41677 + }, + { + "epoch": 2.5872493637097276, + "grad_norm": 0.1510371147215252, + "learning_rate": 5.656269703173495e-06, + "loss": 2.7425, + "step": 41678 + }, + { + "epoch": 2.5873114408094855, + "grad_norm": 0.14443171140773786, + "learning_rate": 5.654601257809311e-06, + "loss": 2.7316, + "step": 41679 + }, + { + "epoch": 2.5873735179092434, + "grad_norm": 0.13617899479420942, + "learning_rate": 5.652933043805825e-06, + "loss": 2.7431, + "step": 41680 + }, + { + "epoch": 2.5874355950090013, + "grad_norm": 0.15476297646826534, + "learning_rate": 5.651265061171768e-06, + "loss": 2.7529, + "step": 41681 + }, + { + "epoch": 2.587497672108759, + "grad_norm": 0.13278729536210784, + "learning_rate": 5.649597309915822e-06, + "loss": 2.6133, + "step": 41682 + }, + { + "epoch": 2.587559749208517, + "grad_norm": 0.13607466070005667, + "learning_rate": 5.6479297900466965e-06, + "loss": 2.6734, + "step": 41683 + }, + { + "epoch": 2.5876218263082746, + "grad_norm": 0.1324224142890684, + "learning_rate": 5.6462625015730795e-06, + "loss": 2.7086, + "step": 41684 + }, + { + "epoch": 2.587683903408033, + "grad_norm": 0.14269997341376967, + "learning_rate": 5.6445954445036866e-06, + "loss": 2.6867, + "step": 41685 + }, + { + "epoch": 2.5877459805077905, + "grad_norm": 0.13795664182261763, + "learning_rate": 5.64292861884721e-06, + "loss": 2.6609, + "step": 41686 + }, + { + "epoch": 2.587808057607549, + "grad_norm": 0.15017713528530108, + "learning_rate": 5.641262024612337e-06, + "loss": 2.7417, + "step": 41687 + }, + { + "epoch": 2.5878701347073063, + "grad_norm": 0.13407236579792917, + "learning_rate": 5.6395956618077726e-06, + "loss": 2.7666, + "step": 41688 + }, + { + "epoch": 2.5879322118070642, + "grad_norm": 0.13232138936864238, + "learning_rate": 5.637929530442193e-06, + "loss": 2.657, + "step": 41689 + }, + { + "epoch": 2.587994288906822, + "grad_norm": 0.15941332421742926, + "learning_rate": 5.636263630524319e-06, + "loss": 2.7034, + "step": 41690 + }, + { + "epoch": 2.58805636600658, + "grad_norm": 0.13681386636232074, + "learning_rate": 5.6345979620628204e-06, + "loss": 2.7434, + "step": 41691 + }, + { + "epoch": 2.588118443106338, + "grad_norm": 0.1602147274721443, + "learning_rate": 5.632932525066398e-06, + "loss": 2.646, + "step": 41692 + }, + { + "epoch": 2.588180520206096, + "grad_norm": 0.13831575765923346, + "learning_rate": 5.631267319543732e-06, + "loss": 2.7469, + "step": 41693 + }, + { + "epoch": 2.588242597305854, + "grad_norm": 0.14503887317253678, + "learning_rate": 5.629602345503521e-06, + "loss": 2.7877, + "step": 41694 + }, + { + "epoch": 2.5883046744056117, + "grad_norm": 0.13144857224026024, + "learning_rate": 5.6279376029544365e-06, + "loss": 2.6697, + "step": 41695 + }, + { + "epoch": 2.5883667515053697, + "grad_norm": 0.1556264714665535, + "learning_rate": 5.626273091905182e-06, + "loss": 2.69, + "step": 41696 + }, + { + "epoch": 2.5884288286051276, + "grad_norm": 0.13647334690226598, + "learning_rate": 5.624608812364429e-06, + "loss": 2.7283, + "step": 41697 + }, + { + "epoch": 2.5884909057048855, + "grad_norm": 0.14234192107959187, + "learning_rate": 5.62294476434086e-06, + "loss": 2.7492, + "step": 41698 + }, + { + "epoch": 2.5885529828046434, + "grad_norm": 0.15359322743797468, + "learning_rate": 5.621280947843172e-06, + "loss": 2.7809, + "step": 41699 + }, + { + "epoch": 2.5886150599044013, + "grad_norm": 0.13437098619099727, + "learning_rate": 5.61961736288003e-06, + "loss": 2.7532, + "step": 41700 + }, + { + "epoch": 2.5886771370041592, + "grad_norm": 0.13670368792807877, + "learning_rate": 5.617954009460119e-06, + "loss": 2.7149, + "step": 41701 + }, + { + "epoch": 2.588739214103917, + "grad_norm": 0.14699250781054524, + "learning_rate": 5.616290887592118e-06, + "loss": 2.7399, + "step": 41702 + }, + { + "epoch": 2.588801291203675, + "grad_norm": 0.1324705321454619, + "learning_rate": 5.614627997284694e-06, + "loss": 2.7154, + "step": 41703 + }, + { + "epoch": 2.588863368303433, + "grad_norm": 0.14070263017930665, + "learning_rate": 5.612965338546538e-06, + "loss": 2.7766, + "step": 41704 + }, + { + "epoch": 2.588925445403191, + "grad_norm": 0.14230604208281364, + "learning_rate": 5.611302911386318e-06, + "loss": 2.7297, + "step": 41705 + }, + { + "epoch": 2.588987522502949, + "grad_norm": 0.1451099191131528, + "learning_rate": 5.609640715812703e-06, + "loss": 2.7704, + "step": 41706 + }, + { + "epoch": 2.5890495996027063, + "grad_norm": 0.13790808126814305, + "learning_rate": 5.607978751834364e-06, + "loss": 2.6512, + "step": 41707 + }, + { + "epoch": 2.5891116767024647, + "grad_norm": 0.13536645496342148, + "learning_rate": 5.606317019459984e-06, + "loss": 2.764, + "step": 41708 + }, + { + "epoch": 2.589173753802222, + "grad_norm": 0.13288732832957736, + "learning_rate": 5.604655518698221e-06, + "loss": 2.7408, + "step": 41709 + }, + { + "epoch": 2.5892358309019805, + "grad_norm": 0.15831894226196574, + "learning_rate": 5.602994249557753e-06, + "loss": 2.7163, + "step": 41710 + }, + { + "epoch": 2.589297908001738, + "grad_norm": 0.13825796707178306, + "learning_rate": 5.601333212047227e-06, + "loss": 2.7251, + "step": 41711 + }, + { + "epoch": 2.589359985101496, + "grad_norm": 0.13445261526814667, + "learning_rate": 5.599672406175338e-06, + "loss": 2.7257, + "step": 41712 + }, + { + "epoch": 2.589422062201254, + "grad_norm": 0.13605592682739734, + "learning_rate": 5.598011831950733e-06, + "loss": 2.6904, + "step": 41713 + }, + { + "epoch": 2.5894841393010117, + "grad_norm": 0.1372226760470275, + "learning_rate": 5.596351489382085e-06, + "loss": 2.7316, + "step": 41714 + }, + { + "epoch": 2.5895462164007697, + "grad_norm": 0.13717999989595508, + "learning_rate": 5.594691378478045e-06, + "loss": 2.666, + "step": 41715 + }, + { + "epoch": 2.5896082935005276, + "grad_norm": 0.15151965553302, + "learning_rate": 5.593031499247276e-06, + "loss": 2.7326, + "step": 41716 + }, + { + "epoch": 2.5896703706002855, + "grad_norm": 0.1446755307686672, + "learning_rate": 5.591371851698446e-06, + "loss": 2.7526, + "step": 41717 + }, + { + "epoch": 2.5897324477000434, + "grad_norm": 0.13921893033102667, + "learning_rate": 5.5897124358402104e-06, + "loss": 2.7451, + "step": 41718 + }, + { + "epoch": 2.5897945247998013, + "grad_norm": 0.1632041445655074, + "learning_rate": 5.588053251681224e-06, + "loss": 2.7365, + "step": 41719 + }, + { + "epoch": 2.5898566018995592, + "grad_norm": 0.14083410981159855, + "learning_rate": 5.586394299230141e-06, + "loss": 2.7478, + "step": 41720 + }, + { + "epoch": 2.589918678999317, + "grad_norm": 0.1466830495761425, + "learning_rate": 5.584735578495625e-06, + "loss": 2.6964, + "step": 41721 + }, + { + "epoch": 2.589980756099075, + "grad_norm": 0.13343683223646619, + "learning_rate": 5.583077089486333e-06, + "loss": 2.6606, + "step": 41722 + }, + { + "epoch": 2.590042833198833, + "grad_norm": 0.13709111594753726, + "learning_rate": 5.5814188322109054e-06, + "loss": 2.7045, + "step": 41723 + }, + { + "epoch": 2.590104910298591, + "grad_norm": 0.14785652278304642, + "learning_rate": 5.579760806677997e-06, + "loss": 2.7021, + "step": 41724 + }, + { + "epoch": 2.590166987398349, + "grad_norm": 0.13731122387973266, + "learning_rate": 5.578103012896252e-06, + "loss": 2.6098, + "step": 41725 + }, + { + "epoch": 2.5902290644981067, + "grad_norm": 0.14375099720816328, + "learning_rate": 5.5764454508743404e-06, + "loss": 2.7478, + "step": 41726 + }, + { + "epoch": 2.5902911415978647, + "grad_norm": 0.13494278141126592, + "learning_rate": 5.574788120620894e-06, + "loss": 2.7594, + "step": 41727 + }, + { + "epoch": 2.5903532186976226, + "grad_norm": 0.14997531469197298, + "learning_rate": 5.5731310221445506e-06, + "loss": 2.7953, + "step": 41728 + }, + { + "epoch": 2.5904152957973805, + "grad_norm": 0.14502146427394685, + "learning_rate": 5.571474155453982e-06, + "loss": 2.7026, + "step": 41729 + }, + { + "epoch": 2.590477372897138, + "grad_norm": 0.13292921771858507, + "learning_rate": 5.569817520557802e-06, + "loss": 2.6621, + "step": 41730 + }, + { + "epoch": 2.5905394499968963, + "grad_norm": 0.1434223772005456, + "learning_rate": 5.568161117464687e-06, + "loss": 2.7393, + "step": 41731 + }, + { + "epoch": 2.590601527096654, + "grad_norm": 0.1344966415478763, + "learning_rate": 5.56650494618326e-06, + "loss": 2.6004, + "step": 41732 + }, + { + "epoch": 2.590663604196412, + "grad_norm": 0.1446040717522484, + "learning_rate": 5.564849006722161e-06, + "loss": 2.7123, + "step": 41733 + }, + { + "epoch": 2.5907256812961696, + "grad_norm": 0.13996675531942365, + "learning_rate": 5.563193299090019e-06, + "loss": 2.7301, + "step": 41734 + }, + { + "epoch": 2.590787758395928, + "grad_norm": 0.16056244210835768, + "learning_rate": 5.561537823295498e-06, + "loss": 2.6951, + "step": 41735 + }, + { + "epoch": 2.5908498354956855, + "grad_norm": 0.13388251083072025, + "learning_rate": 5.559882579347225e-06, + "loss": 2.6943, + "step": 41736 + }, + { + "epoch": 2.5909119125954434, + "grad_norm": 0.13852197981068468, + "learning_rate": 5.558227567253832e-06, + "loss": 2.7535, + "step": 41737 + }, + { + "epoch": 2.5909739896952013, + "grad_norm": 0.13423019968740824, + "learning_rate": 5.556572787023951e-06, + "loss": 2.6659, + "step": 41738 + }, + { + "epoch": 2.5910360667949592, + "grad_norm": 0.14240146169466472, + "learning_rate": 5.554918238666212e-06, + "loss": 2.8264, + "step": 41739 + }, + { + "epoch": 2.591098143894717, + "grad_norm": 0.15198408365520744, + "learning_rate": 5.553263922189261e-06, + "loss": 2.693, + "step": 41740 + }, + { + "epoch": 2.591160220994475, + "grad_norm": 0.14080291989835164, + "learning_rate": 5.551609837601723e-06, + "loss": 2.7014, + "step": 41741 + }, + { + "epoch": 2.591222298094233, + "grad_norm": 0.13108424181778966, + "learning_rate": 5.5499559849122255e-06, + "loss": 2.5323, + "step": 41742 + }, + { + "epoch": 2.591284375193991, + "grad_norm": 0.135640868102472, + "learning_rate": 5.548302364129393e-06, + "loss": 2.66, + "step": 41743 + }, + { + "epoch": 2.591346452293749, + "grad_norm": 0.1552964679264775, + "learning_rate": 5.546648975261864e-06, + "loss": 2.64, + "step": 41744 + }, + { + "epoch": 2.5914085293935067, + "grad_norm": 0.13379610001369538, + "learning_rate": 5.54499581831826e-06, + "loss": 2.608, + "step": 41745 + }, + { + "epoch": 2.5914706064932647, + "grad_norm": 0.14356332220980497, + "learning_rate": 5.5433428933072015e-06, + "loss": 2.6423, + "step": 41746 + }, + { + "epoch": 2.5915326835930226, + "grad_norm": 0.15108655942041965, + "learning_rate": 5.54169020023731e-06, + "loss": 2.7475, + "step": 41747 + }, + { + "epoch": 2.5915947606927805, + "grad_norm": 0.13322308829829893, + "learning_rate": 5.540037739117221e-06, + "loss": 2.6952, + "step": 41748 + }, + { + "epoch": 2.5916568377925384, + "grad_norm": 0.14548546024147582, + "learning_rate": 5.5383855099555455e-06, + "loss": 2.7098, + "step": 41749 + }, + { + "epoch": 2.5917189148922963, + "grad_norm": 0.13964191501564083, + "learning_rate": 5.536733512760911e-06, + "loss": 2.6817, + "step": 41750 + }, + { + "epoch": 2.5917809919920543, + "grad_norm": 0.1357720568609937, + "learning_rate": 5.535081747541931e-06, + "loss": 2.6445, + "step": 41751 + }, + { + "epoch": 2.591843069091812, + "grad_norm": 0.13512775583835865, + "learning_rate": 5.533430214307211e-06, + "loss": 2.663, + "step": 41752 + }, + { + "epoch": 2.59190514619157, + "grad_norm": 0.1392694545978139, + "learning_rate": 5.531778913065394e-06, + "loss": 2.7614, + "step": 41753 + }, + { + "epoch": 2.591967223291328, + "grad_norm": 0.13487709659691582, + "learning_rate": 5.530127843825078e-06, + "loss": 2.7054, + "step": 41754 + }, + { + "epoch": 2.5920293003910855, + "grad_norm": 0.135758316673532, + "learning_rate": 5.528477006594884e-06, + "loss": 2.6808, + "step": 41755 + }, + { + "epoch": 2.592091377490844, + "grad_norm": 0.14579272059485493, + "learning_rate": 5.526826401383406e-06, + "loss": 2.6569, + "step": 41756 + }, + { + "epoch": 2.5921534545906013, + "grad_norm": 0.13219662528859724, + "learning_rate": 5.525176028199286e-06, + "loss": 2.7025, + "step": 41757 + }, + { + "epoch": 2.5922155316903597, + "grad_norm": 0.13190554441377636, + "learning_rate": 5.523525887051118e-06, + "loss": 2.7968, + "step": 41758 + }, + { + "epoch": 2.592277608790117, + "grad_norm": 0.14810166515168294, + "learning_rate": 5.5218759779475106e-06, + "loss": 2.6897, + "step": 41759 + }, + { + "epoch": 2.592339685889875, + "grad_norm": 0.13986577947351372, + "learning_rate": 5.520226300897074e-06, + "loss": 2.7888, + "step": 41760 + }, + { + "epoch": 2.592401762989633, + "grad_norm": 0.13726639736003837, + "learning_rate": 5.5185768559084085e-06, + "loss": 2.648, + "step": 41761 + }, + { + "epoch": 2.592463840089391, + "grad_norm": 0.13358857802925814, + "learning_rate": 5.516927642990122e-06, + "loss": 2.7622, + "step": 41762 + }, + { + "epoch": 2.592525917189149, + "grad_norm": 0.15766537483961707, + "learning_rate": 5.51527866215083e-06, + "loss": 2.6964, + "step": 41763 + }, + { + "epoch": 2.5925879942889067, + "grad_norm": 0.13919653835207535, + "learning_rate": 5.513629913399132e-06, + "loss": 2.6278, + "step": 41764 + }, + { + "epoch": 2.5926500713886647, + "grad_norm": 0.15491114041048776, + "learning_rate": 5.511981396743626e-06, + "loss": 2.7335, + "step": 41765 + }, + { + "epoch": 2.5927121484884226, + "grad_norm": 0.1381035090790141, + "learning_rate": 5.510333112192906e-06, + "loss": 2.7107, + "step": 41766 + }, + { + "epoch": 2.5927742255881805, + "grad_norm": 0.13400200791731673, + "learning_rate": 5.508685059755581e-06, + "loss": 2.6247, + "step": 41767 + }, + { + "epoch": 2.5928363026879384, + "grad_norm": 0.13657781284865253, + "learning_rate": 5.507037239440255e-06, + "loss": 2.711, + "step": 41768 + }, + { + "epoch": 2.5928983797876963, + "grad_norm": 0.13784080286103387, + "learning_rate": 5.5053896512555106e-06, + "loss": 2.7059, + "step": 41769 + }, + { + "epoch": 2.5929604568874542, + "grad_norm": 0.13135444878808303, + "learning_rate": 5.503742295209941e-06, + "loss": 2.6552, + "step": 41770 + }, + { + "epoch": 2.593022533987212, + "grad_norm": 0.1364757148820735, + "learning_rate": 5.50209517131216e-06, + "loss": 2.6994, + "step": 41771 + }, + { + "epoch": 2.59308461108697, + "grad_norm": 0.15309386326041455, + "learning_rate": 5.500448279570752e-06, + "loss": 2.7435, + "step": 41772 + }, + { + "epoch": 2.593146688186728, + "grad_norm": 0.13297481295550423, + "learning_rate": 5.498801619994309e-06, + "loss": 2.6553, + "step": 41773 + }, + { + "epoch": 2.593208765286486, + "grad_norm": 0.13598431460740718, + "learning_rate": 5.4971551925914185e-06, + "loss": 2.6613, + "step": 41774 + }, + { + "epoch": 2.593270842386244, + "grad_norm": 0.1428849628780377, + "learning_rate": 5.495508997370663e-06, + "loss": 2.6744, + "step": 41775 + }, + { + "epoch": 2.5933329194860018, + "grad_norm": 0.14044740509402714, + "learning_rate": 5.493863034340652e-06, + "loss": 2.7547, + "step": 41776 + }, + { + "epoch": 2.5933949965857597, + "grad_norm": 0.1309939091450363, + "learning_rate": 5.492217303509961e-06, + "loss": 2.6536, + "step": 41777 + }, + { + "epoch": 2.593457073685517, + "grad_norm": 0.13625159151127686, + "learning_rate": 5.490571804887173e-06, + "loss": 2.7321, + "step": 41778 + }, + { + "epoch": 2.5935191507852755, + "grad_norm": 0.16730551297956983, + "learning_rate": 5.488926538480876e-06, + "loss": 2.5979, + "step": 41779 + }, + { + "epoch": 2.593581227885033, + "grad_norm": 0.13728791021780526, + "learning_rate": 5.487281504299657e-06, + "loss": 2.6731, + "step": 41780 + }, + { + "epoch": 2.5936433049847913, + "grad_norm": 0.1501444684645379, + "learning_rate": 5.485636702352098e-06, + "loss": 2.648, + "step": 41781 + }, + { + "epoch": 2.593705382084549, + "grad_norm": 0.14360879021289885, + "learning_rate": 5.483992132646781e-06, + "loss": 2.742, + "step": 41782 + }, + { + "epoch": 2.593767459184307, + "grad_norm": 0.14842035796918931, + "learning_rate": 5.482347795192272e-06, + "loss": 2.6739, + "step": 41783 + }, + { + "epoch": 2.5938295362840647, + "grad_norm": 0.13487519081118135, + "learning_rate": 5.480703689997174e-06, + "loss": 2.6769, + "step": 41784 + }, + { + "epoch": 2.5938916133838226, + "grad_norm": 0.13284168203301666, + "learning_rate": 5.479059817070048e-06, + "loss": 2.6014, + "step": 41785 + }, + { + "epoch": 2.5939536904835805, + "grad_norm": 0.1520734369528741, + "learning_rate": 5.477416176419475e-06, + "loss": 2.6162, + "step": 41786 + }, + { + "epoch": 2.5940157675833384, + "grad_norm": 0.14691396663046977, + "learning_rate": 5.475772768054033e-06, + "loss": 2.6631, + "step": 41787 + }, + { + "epoch": 2.5940778446830963, + "grad_norm": 0.13723365688255526, + "learning_rate": 5.474129591982286e-06, + "loss": 2.6751, + "step": 41788 + }, + { + "epoch": 2.5941399217828542, + "grad_norm": 0.1351573337665832, + "learning_rate": 5.472486648212816e-06, + "loss": 2.783, + "step": 41789 + }, + { + "epoch": 2.594201998882612, + "grad_norm": 0.14276502717552506, + "learning_rate": 5.470843936754199e-06, + "loss": 2.677, + "step": 41790 + }, + { + "epoch": 2.59426407598237, + "grad_norm": 0.1402265679356619, + "learning_rate": 5.469201457614997e-06, + "loss": 2.747, + "step": 41791 + }, + { + "epoch": 2.594326153082128, + "grad_norm": 0.14230405253944864, + "learning_rate": 5.467559210803769e-06, + "loss": 2.6767, + "step": 41792 + }, + { + "epoch": 2.594388230181886, + "grad_norm": 0.14392146789075744, + "learning_rate": 5.465917196329107e-06, + "loss": 2.7366, + "step": 41793 + }, + { + "epoch": 2.594450307281644, + "grad_norm": 0.1449544484426667, + "learning_rate": 5.4642754141995545e-06, + "loss": 2.7659, + "step": 41794 + }, + { + "epoch": 2.5945123843814017, + "grad_norm": 0.13266761674922173, + "learning_rate": 5.4626338644236995e-06, + "loss": 2.7357, + "step": 41795 + }, + { + "epoch": 2.5945744614811597, + "grad_norm": 0.13413323732538007, + "learning_rate": 5.460992547010097e-06, + "loss": 2.6919, + "step": 41796 + }, + { + "epoch": 2.5946365385809176, + "grad_norm": 0.14862936569808965, + "learning_rate": 5.4593514619672995e-06, + "loss": 2.738, + "step": 41797 + }, + { + "epoch": 2.5946986156806755, + "grad_norm": 0.14053575014760322, + "learning_rate": 5.457710609303884e-06, + "loss": 2.6497, + "step": 41798 + }, + { + "epoch": 2.5947606927804334, + "grad_norm": 0.1539269915028441, + "learning_rate": 5.456069989028412e-06, + "loss": 2.6929, + "step": 41799 + }, + { + "epoch": 2.5948227698801913, + "grad_norm": 0.13990832201923215, + "learning_rate": 5.45442960114943e-06, + "loss": 2.7094, + "step": 41800 + }, + { + "epoch": 2.5948848469799493, + "grad_norm": 0.13908118968913116, + "learning_rate": 5.452789445675505e-06, + "loss": 2.7563, + "step": 41801 + }, + { + "epoch": 2.594946924079707, + "grad_norm": 0.13362106161183798, + "learning_rate": 5.451149522615179e-06, + "loss": 2.6689, + "step": 41802 + }, + { + "epoch": 2.5950090011794646, + "grad_norm": 0.14163952680268285, + "learning_rate": 5.4495098319770346e-06, + "loss": 2.7195, + "step": 41803 + }, + { + "epoch": 2.595071078279223, + "grad_norm": 0.1323770860866941, + "learning_rate": 5.447870373769609e-06, + "loss": 2.6921, + "step": 41804 + }, + { + "epoch": 2.5951331553789805, + "grad_norm": 0.13637618335434243, + "learning_rate": 5.446231148001462e-06, + "loss": 2.7163, + "step": 41805 + }, + { + "epoch": 2.595195232478739, + "grad_norm": 0.14332403740106325, + "learning_rate": 5.444592154681128e-06, + "loss": 2.6868, + "step": 41806 + }, + { + "epoch": 2.5952573095784963, + "grad_norm": 0.13313806824500649, + "learning_rate": 5.44295339381719e-06, + "loss": 2.7525, + "step": 41807 + }, + { + "epoch": 2.5953193866782542, + "grad_norm": 0.13242344109838505, + "learning_rate": 5.441314865418173e-06, + "loss": 2.7844, + "step": 41808 + }, + { + "epoch": 2.595381463778012, + "grad_norm": 0.13536918629986758, + "learning_rate": 5.439676569492641e-06, + "loss": 2.6887, + "step": 41809 + }, + { + "epoch": 2.59544354087777, + "grad_norm": 0.13370720245467163, + "learning_rate": 5.438038506049131e-06, + "loss": 2.6776, + "step": 41810 + }, + { + "epoch": 2.595505617977528, + "grad_norm": 0.13734116564848253, + "learning_rate": 5.436400675096176e-06, + "loss": 2.7192, + "step": 41811 + }, + { + "epoch": 2.595567695077286, + "grad_norm": 0.13314242616063435, + "learning_rate": 5.434763076642352e-06, + "loss": 2.6336, + "step": 41812 + }, + { + "epoch": 2.595629772177044, + "grad_norm": 0.14691553241386918, + "learning_rate": 5.433125710696191e-06, + "loss": 2.7006, + "step": 41813 + }, + { + "epoch": 2.5956918492768017, + "grad_norm": 0.13195073110329839, + "learning_rate": 5.431488577266231e-06, + "loss": 2.6853, + "step": 41814 + }, + { + "epoch": 2.5957539263765597, + "grad_norm": 0.13375860410493606, + "learning_rate": 5.429851676360998e-06, + "loss": 2.6697, + "step": 41815 + }, + { + "epoch": 2.5958160034763176, + "grad_norm": 0.1550408787043101, + "learning_rate": 5.4282150079890625e-06, + "loss": 2.703, + "step": 41816 + }, + { + "epoch": 2.5958780805760755, + "grad_norm": 0.13471691661136695, + "learning_rate": 5.426578572158952e-06, + "loss": 2.6642, + "step": 41817 + }, + { + "epoch": 2.5959401576758334, + "grad_norm": 0.1401254735814445, + "learning_rate": 5.424942368879199e-06, + "loss": 2.7002, + "step": 41818 + }, + { + "epoch": 2.5960022347755913, + "grad_norm": 0.13785589607498763, + "learning_rate": 5.423306398158334e-06, + "loss": 2.7126, + "step": 41819 + }, + { + "epoch": 2.5960643118753493, + "grad_norm": 0.13571010869206648, + "learning_rate": 5.421670660004907e-06, + "loss": 2.6912, + "step": 41820 + }, + { + "epoch": 2.596126388975107, + "grad_norm": 0.15230983576716153, + "learning_rate": 5.420035154427449e-06, + "loss": 2.7053, + "step": 41821 + }, + { + "epoch": 2.596188466074865, + "grad_norm": 0.13835139246723424, + "learning_rate": 5.4183998814344885e-06, + "loss": 2.6294, + "step": 41822 + }, + { + "epoch": 2.596250543174623, + "grad_norm": 0.14761710581558926, + "learning_rate": 5.416764841034561e-06, + "loss": 2.6731, + "step": 41823 + }, + { + "epoch": 2.596312620274381, + "grad_norm": 0.1317945627350006, + "learning_rate": 5.4151300332361825e-06, + "loss": 2.6971, + "step": 41824 + }, + { + "epoch": 2.596374697374139, + "grad_norm": 0.14295221269570568, + "learning_rate": 5.413495458047907e-06, + "loss": 2.6244, + "step": 41825 + }, + { + "epoch": 2.5964367744738963, + "grad_norm": 0.13075268367352694, + "learning_rate": 5.411861115478234e-06, + "loss": 2.653, + "step": 41826 + }, + { + "epoch": 2.5964988515736547, + "grad_norm": 0.14053495491204904, + "learning_rate": 5.410227005535718e-06, + "loss": 2.7788, + "step": 41827 + }, + { + "epoch": 2.596560928673412, + "grad_norm": 0.13225400124411896, + "learning_rate": 5.408593128228872e-06, + "loss": 2.6853, + "step": 41828 + }, + { + "epoch": 2.5966230057731705, + "grad_norm": 0.1463647735616548, + "learning_rate": 5.406959483566215e-06, + "loss": 2.6947, + "step": 41829 + }, + { + "epoch": 2.596685082872928, + "grad_norm": 0.14467000744334513, + "learning_rate": 5.405326071556283e-06, + "loss": 2.6787, + "step": 41830 + }, + { + "epoch": 2.5967471599726863, + "grad_norm": 0.13348985195566992, + "learning_rate": 5.4036928922075846e-06, + "loss": 2.6921, + "step": 41831 + }, + { + "epoch": 2.596809237072444, + "grad_norm": 0.13408961958781304, + "learning_rate": 5.402059945528654e-06, + "loss": 2.6454, + "step": 41832 + }, + { + "epoch": 2.5968713141722017, + "grad_norm": 0.1321317462376276, + "learning_rate": 5.400427231527988e-06, + "loss": 2.6485, + "step": 41833 + }, + { + "epoch": 2.5969333912719597, + "grad_norm": 0.134902573537403, + "learning_rate": 5.398794750214136e-06, + "loss": 2.7644, + "step": 41834 + }, + { + "epoch": 2.5969954683717176, + "grad_norm": 0.14507973830805618, + "learning_rate": 5.397162501595593e-06, + "loss": 2.648, + "step": 41835 + }, + { + "epoch": 2.5970575454714755, + "grad_norm": 0.13336915578658423, + "learning_rate": 5.395530485680883e-06, + "loss": 2.7349, + "step": 41836 + }, + { + "epoch": 2.5971196225712334, + "grad_norm": 0.13599873046203564, + "learning_rate": 5.393898702478523e-06, + "loss": 2.7201, + "step": 41837 + }, + { + "epoch": 2.5971816996709913, + "grad_norm": 0.13678559304145377, + "learning_rate": 5.392267151997004e-06, + "loss": 2.7528, + "step": 41838 + }, + { + "epoch": 2.5972437767707492, + "grad_norm": 0.1362105187468988, + "learning_rate": 5.390635834244873e-06, + "loss": 2.7082, + "step": 41839 + }, + { + "epoch": 2.597305853870507, + "grad_norm": 0.13783022851942117, + "learning_rate": 5.38900474923062e-06, + "loss": 2.778, + "step": 41840 + }, + { + "epoch": 2.597367930970265, + "grad_norm": 0.15470838598412603, + "learning_rate": 5.387373896962761e-06, + "loss": 2.6543, + "step": 41841 + }, + { + "epoch": 2.597430008070023, + "grad_norm": 0.15763233376752786, + "learning_rate": 5.385743277449795e-06, + "loss": 2.824, + "step": 41842 + }, + { + "epoch": 2.597492085169781, + "grad_norm": 0.14637619474421842, + "learning_rate": 5.384112890700244e-06, + "loss": 2.6522, + "step": 41843 + }, + { + "epoch": 2.597554162269539, + "grad_norm": 0.13244176760220452, + "learning_rate": 5.382482736722605e-06, + "loss": 2.677, + "step": 41844 + }, + { + "epoch": 2.5976162393692968, + "grad_norm": 0.13674290988532758, + "learning_rate": 5.380852815525389e-06, + "loss": 2.8109, + "step": 41845 + }, + { + "epoch": 2.5976783164690547, + "grad_norm": 0.13569811922107394, + "learning_rate": 5.3792231271170886e-06, + "loss": 2.7299, + "step": 41846 + }, + { + "epoch": 2.5977403935688126, + "grad_norm": 0.15721698400096631, + "learning_rate": 5.377593671506209e-06, + "loss": 2.7008, + "step": 41847 + }, + { + "epoch": 2.5978024706685705, + "grad_norm": 0.13647919512364923, + "learning_rate": 5.375964448701265e-06, + "loss": 2.653, + "step": 41848 + }, + { + "epoch": 2.5978645477683284, + "grad_norm": 0.1371282847044974, + "learning_rate": 5.37433545871075e-06, + "loss": 2.6333, + "step": 41849 + }, + { + "epoch": 2.5979266248680863, + "grad_norm": 0.13742132862322928, + "learning_rate": 5.372706701543156e-06, + "loss": 2.7214, + "step": 41850 + }, + { + "epoch": 2.597988701967844, + "grad_norm": 0.15395985386316546, + "learning_rate": 5.3710781772069726e-06, + "loss": 2.7048, + "step": 41851 + }, + { + "epoch": 2.598050779067602, + "grad_norm": 0.13653462136319663, + "learning_rate": 5.369449885710725e-06, + "loss": 2.7136, + "step": 41852 + }, + { + "epoch": 2.5981128561673597, + "grad_norm": 0.13307091872574875, + "learning_rate": 5.367821827062886e-06, + "loss": 2.7456, + "step": 41853 + }, + { + "epoch": 2.598174933267118, + "grad_norm": 0.15346570016078884, + "learning_rate": 5.366194001271957e-06, + "loss": 2.7463, + "step": 41854 + }, + { + "epoch": 2.5982370103668755, + "grad_norm": 0.1339904216591949, + "learning_rate": 5.364566408346422e-06, + "loss": 2.6115, + "step": 41855 + }, + { + "epoch": 2.5982990874666334, + "grad_norm": 0.13842530559582192, + "learning_rate": 5.362939048294785e-06, + "loss": 2.684, + "step": 41856 + }, + { + "epoch": 2.5983611645663913, + "grad_norm": 0.14437349189655962, + "learning_rate": 5.361311921125534e-06, + "loss": 2.7283, + "step": 41857 + }, + { + "epoch": 2.5984232416661492, + "grad_norm": 0.13358635407883765, + "learning_rate": 5.359685026847155e-06, + "loss": 2.6957, + "step": 41858 + }, + { + "epoch": 2.598485318765907, + "grad_norm": 0.13134924451583874, + "learning_rate": 5.358058365468127e-06, + "loss": 2.7164, + "step": 41859 + }, + { + "epoch": 2.598547395865665, + "grad_norm": 0.15804163025107928, + "learning_rate": 5.356431936996953e-06, + "loss": 2.6878, + "step": 41860 + }, + { + "epoch": 2.598609472965423, + "grad_norm": 0.13390248078568737, + "learning_rate": 5.354805741442104e-06, + "loss": 2.6884, + "step": 41861 + }, + { + "epoch": 2.598671550065181, + "grad_norm": 0.13883572287690765, + "learning_rate": 5.353179778812078e-06, + "loss": 2.6676, + "step": 41862 + }, + { + "epoch": 2.598733627164939, + "grad_norm": 0.13494340255481974, + "learning_rate": 5.351554049115353e-06, + "loss": 2.6898, + "step": 41863 + }, + { + "epoch": 2.5987957042646967, + "grad_norm": 0.1340955370430782, + "learning_rate": 5.349928552360411e-06, + "loss": 2.651, + "step": 41864 + }, + { + "epoch": 2.5988577813644547, + "grad_norm": 0.14581655862811455, + "learning_rate": 5.348303288555722e-06, + "loss": 2.7335, + "step": 41865 + }, + { + "epoch": 2.5989198584642126, + "grad_norm": 0.14661548469112806, + "learning_rate": 5.346678257709781e-06, + "loss": 2.5796, + "step": 41866 + }, + { + "epoch": 2.5989819355639705, + "grad_norm": 0.13217656670890618, + "learning_rate": 5.345053459831062e-06, + "loss": 2.6955, + "step": 41867 + }, + { + "epoch": 2.5990440126637284, + "grad_norm": 0.14946377619045131, + "learning_rate": 5.343428894928038e-06, + "loss": 2.7511, + "step": 41868 + }, + { + "epoch": 2.5991060897634863, + "grad_norm": 0.13433766484016618, + "learning_rate": 5.3418045630091725e-06, + "loss": 2.607, + "step": 41869 + }, + { + "epoch": 2.5991681668632443, + "grad_norm": 0.14291350881708784, + "learning_rate": 5.340180464082972e-06, + "loss": 2.7619, + "step": 41870 + }, + { + "epoch": 2.599230243963002, + "grad_norm": 0.13892272638024214, + "learning_rate": 5.338556598157885e-06, + "loss": 2.8062, + "step": 41871 + }, + { + "epoch": 2.59929232106276, + "grad_norm": 0.13403855332221554, + "learning_rate": 5.336932965242392e-06, + "loss": 2.6939, + "step": 41872 + }, + { + "epoch": 2.599354398162518, + "grad_norm": 0.14234864697580435, + "learning_rate": 5.335309565344965e-06, + "loss": 2.6321, + "step": 41873 + }, + { + "epoch": 2.5994164752622755, + "grad_norm": 0.1346488252993301, + "learning_rate": 5.3336863984740524e-06, + "loss": 2.611, + "step": 41874 + }, + { + "epoch": 2.599478552362034, + "grad_norm": 0.14041320428154602, + "learning_rate": 5.3320634646381595e-06, + "loss": 2.7491, + "step": 41875 + }, + { + "epoch": 2.5995406294617913, + "grad_norm": 0.13314385311966306, + "learning_rate": 5.330440763845734e-06, + "loss": 2.6967, + "step": 41876 + }, + { + "epoch": 2.5996027065615497, + "grad_norm": 0.13758060736856373, + "learning_rate": 5.328818296105237e-06, + "loss": 2.7417, + "step": 41877 + }, + { + "epoch": 2.599664783661307, + "grad_norm": 0.1470992471467247, + "learning_rate": 5.327196061425138e-06, + "loss": 2.6484, + "step": 41878 + }, + { + "epoch": 2.5997268607610655, + "grad_norm": 0.13425894177660852, + "learning_rate": 5.325574059813903e-06, + "loss": 2.6945, + "step": 41879 + }, + { + "epoch": 2.599788937860823, + "grad_norm": 0.1429965919733753, + "learning_rate": 5.323952291279999e-06, + "loss": 2.7323, + "step": 41880 + }, + { + "epoch": 2.599851014960581, + "grad_norm": 0.1643476109204411, + "learning_rate": 5.322330755831878e-06, + "loss": 2.6714, + "step": 41881 + }, + { + "epoch": 2.599913092060339, + "grad_norm": 0.1341310224056468, + "learning_rate": 5.320709453477995e-06, + "loss": 2.6976, + "step": 41882 + }, + { + "epoch": 2.5999751691600967, + "grad_norm": 0.1415541931371932, + "learning_rate": 5.319088384226828e-06, + "loss": 2.7046, + "step": 41883 + }, + { + "epoch": 2.6000372462598547, + "grad_norm": 0.15624092194888548, + "learning_rate": 5.317467548086824e-06, + "loss": 2.736, + "step": 41884 + }, + { + "epoch": 2.6000993233596126, + "grad_norm": 0.14717115260820804, + "learning_rate": 5.315846945066438e-06, + "loss": 2.71, + "step": 41885 + }, + { + "epoch": 2.6001614004593705, + "grad_norm": 0.13708532780014773, + "learning_rate": 5.314226575174125e-06, + "loss": 2.7774, + "step": 41886 + }, + { + "epoch": 2.6002234775591284, + "grad_norm": 0.13884179356596327, + "learning_rate": 5.312606438418333e-06, + "loss": 2.7279, + "step": 41887 + }, + { + "epoch": 2.6002855546588863, + "grad_norm": 0.14498026794765032, + "learning_rate": 5.3109865348075275e-06, + "loss": 2.743, + "step": 41888 + }, + { + "epoch": 2.6003476317586443, + "grad_norm": 0.15859465363907468, + "learning_rate": 5.309366864350151e-06, + "loss": 2.7656, + "step": 41889 + }, + { + "epoch": 2.600409708858402, + "grad_norm": 0.1482928977145983, + "learning_rate": 5.307747427054666e-06, + "loss": 2.6792, + "step": 41890 + }, + { + "epoch": 2.60047178595816, + "grad_norm": 0.13324684057775418, + "learning_rate": 5.306128222929496e-06, + "loss": 2.6138, + "step": 41891 + }, + { + "epoch": 2.600533863057918, + "grad_norm": 0.14826419400603885, + "learning_rate": 5.304509251983103e-06, + "loss": 2.6382, + "step": 41892 + }, + { + "epoch": 2.600595940157676, + "grad_norm": 0.14172141166214752, + "learning_rate": 5.302890514223952e-06, + "loss": 2.7426, + "step": 41893 + }, + { + "epoch": 2.600658017257434, + "grad_norm": 0.13408887641476247, + "learning_rate": 5.301272009660469e-06, + "loss": 2.7162, + "step": 41894 + }, + { + "epoch": 2.6007200943571918, + "grad_norm": 0.13865938936358838, + "learning_rate": 5.299653738301097e-06, + "loss": 2.8019, + "step": 41895 + }, + { + "epoch": 2.6007821714569497, + "grad_norm": 0.13195747848249306, + "learning_rate": 5.298035700154286e-06, + "loss": 2.7369, + "step": 41896 + }, + { + "epoch": 2.6008442485567076, + "grad_norm": 0.1457106757061174, + "learning_rate": 5.296417895228462e-06, + "loss": 2.6523, + "step": 41897 + }, + { + "epoch": 2.6009063256564655, + "grad_norm": 0.13134062227041612, + "learning_rate": 5.29480032353209e-06, + "loss": 2.6363, + "step": 41898 + }, + { + "epoch": 2.600968402756223, + "grad_norm": 0.1377919243679034, + "learning_rate": 5.293182985073597e-06, + "loss": 2.6355, + "step": 41899 + }, + { + "epoch": 2.6010304798559813, + "grad_norm": 0.13956627062398827, + "learning_rate": 5.2915658798614264e-06, + "loss": 2.6898, + "step": 41900 + }, + { + "epoch": 2.601092556955739, + "grad_norm": 0.13622091116244267, + "learning_rate": 5.289949007903994e-06, + "loss": 2.6574, + "step": 41901 + }, + { + "epoch": 2.601154634055497, + "grad_norm": 0.14137325297042694, + "learning_rate": 5.288332369209764e-06, + "loss": 2.6843, + "step": 41902 + }, + { + "epoch": 2.6012167111552547, + "grad_norm": 0.14079995662952022, + "learning_rate": 5.286715963787153e-06, + "loss": 2.7671, + "step": 41903 + }, + { + "epoch": 2.6012787882550126, + "grad_norm": 0.13725949706336235, + "learning_rate": 5.285099791644604e-06, + "loss": 2.6728, + "step": 41904 + }, + { + "epoch": 2.6013408653547705, + "grad_norm": 0.1428624479711613, + "learning_rate": 5.283483852790538e-06, + "loss": 2.6778, + "step": 41905 + }, + { + "epoch": 2.6014029424545284, + "grad_norm": 0.1371122827248752, + "learning_rate": 5.281868147233393e-06, + "loss": 2.7641, + "step": 41906 + }, + { + "epoch": 2.6014650195542863, + "grad_norm": 0.1392102736858604, + "learning_rate": 5.280252674981606e-06, + "loss": 2.7289, + "step": 41907 + }, + { + "epoch": 2.6015270966540442, + "grad_norm": 0.14680937000752836, + "learning_rate": 5.278637436043593e-06, + "loss": 2.6354, + "step": 41908 + }, + { + "epoch": 2.601589173753802, + "grad_norm": 0.1597086420079443, + "learning_rate": 5.277022430427786e-06, + "loss": 2.7083, + "step": 41909 + }, + { + "epoch": 2.60165125085356, + "grad_norm": 0.13706124865790872, + "learning_rate": 5.2754076581425946e-06, + "loss": 2.7528, + "step": 41910 + }, + { + "epoch": 2.601713327953318, + "grad_norm": 0.14377630734213276, + "learning_rate": 5.273793119196474e-06, + "loss": 2.5747, + "step": 41911 + }, + { + "epoch": 2.601775405053076, + "grad_norm": 0.13952062666757464, + "learning_rate": 5.2721788135978335e-06, + "loss": 2.7429, + "step": 41912 + }, + { + "epoch": 2.601837482152834, + "grad_norm": 0.1592283525556743, + "learning_rate": 5.270564741355094e-06, + "loss": 2.8244, + "step": 41913 + }, + { + "epoch": 2.6018995592525918, + "grad_norm": 0.13284412876946494, + "learning_rate": 5.26895090247666e-06, + "loss": 2.7258, + "step": 41914 + }, + { + "epoch": 2.6019616363523497, + "grad_norm": 0.134225675872704, + "learning_rate": 5.267337296970987e-06, + "loss": 2.6883, + "step": 41915 + }, + { + "epoch": 2.6020237134521076, + "grad_norm": 0.13804070461625742, + "learning_rate": 5.2657239248464665e-06, + "loss": 2.7069, + "step": 41916 + }, + { + "epoch": 2.6020857905518655, + "grad_norm": 0.13395313310194865, + "learning_rate": 5.2641107861115315e-06, + "loss": 2.6306, + "step": 41917 + }, + { + "epoch": 2.6021478676516234, + "grad_norm": 0.1348180530679515, + "learning_rate": 5.262497880774575e-06, + "loss": 2.6794, + "step": 41918 + }, + { + "epoch": 2.6022099447513813, + "grad_norm": 0.13071289689495427, + "learning_rate": 5.260885208844041e-06, + "loss": 2.7186, + "step": 41919 + }, + { + "epoch": 2.6022720218511393, + "grad_norm": 0.14158579639928118, + "learning_rate": 5.259272770328328e-06, + "loss": 2.7343, + "step": 41920 + }, + { + "epoch": 2.602334098950897, + "grad_norm": 0.13874774522925384, + "learning_rate": 5.25766056523585e-06, + "loss": 2.682, + "step": 41921 + }, + { + "epoch": 2.6023961760506547, + "grad_norm": 0.13640988065425716, + "learning_rate": 5.256048593575019e-06, + "loss": 2.7633, + "step": 41922 + }, + { + "epoch": 2.602458253150413, + "grad_norm": 0.13524785355338403, + "learning_rate": 5.254436855354233e-06, + "loss": 2.6924, + "step": 41923 + }, + { + "epoch": 2.6025203302501705, + "grad_norm": 0.14730795446721817, + "learning_rate": 5.252825350581925e-06, + "loss": 2.7818, + "step": 41924 + }, + { + "epoch": 2.602582407349929, + "grad_norm": 0.13751484834087394, + "learning_rate": 5.2512140792664754e-06, + "loss": 2.7473, + "step": 41925 + }, + { + "epoch": 2.6026444844496863, + "grad_norm": 0.13876850495573015, + "learning_rate": 5.2496030414163176e-06, + "loss": 2.7725, + "step": 41926 + }, + { + "epoch": 2.6027065615494447, + "grad_norm": 0.13309877061118472, + "learning_rate": 5.2479922370398445e-06, + "loss": 2.7366, + "step": 41927 + }, + { + "epoch": 2.602768638649202, + "grad_norm": 0.1787326987103146, + "learning_rate": 5.2463816661454504e-06, + "loss": 2.6703, + "step": 41928 + }, + { + "epoch": 2.60283071574896, + "grad_norm": 0.1362159064928794, + "learning_rate": 5.24477132874156e-06, + "loss": 2.7882, + "step": 41929 + }, + { + "epoch": 2.602892792848718, + "grad_norm": 0.1419033583371843, + "learning_rate": 5.243161224836557e-06, + "loss": 2.6587, + "step": 41930 + }, + { + "epoch": 2.602954869948476, + "grad_norm": 0.13085031478034606, + "learning_rate": 5.241551354438851e-06, + "loss": 2.7097, + "step": 41931 + }, + { + "epoch": 2.603016947048234, + "grad_norm": 0.13351137343100045, + "learning_rate": 5.239941717556834e-06, + "loss": 2.6835, + "step": 41932 + }, + { + "epoch": 2.6030790241479917, + "grad_norm": 0.14557623054767813, + "learning_rate": 5.238332314198896e-06, + "loss": 2.7795, + "step": 41933 + }, + { + "epoch": 2.6031411012477497, + "grad_norm": 0.13693183914124693, + "learning_rate": 5.236723144373456e-06, + "loss": 2.79, + "step": 41934 + }, + { + "epoch": 2.6032031783475076, + "grad_norm": 0.1451494360783214, + "learning_rate": 5.235114208088904e-06, + "loss": 2.7324, + "step": 41935 + }, + { + "epoch": 2.6032652554472655, + "grad_norm": 0.1338781561363709, + "learning_rate": 5.23350550535362e-06, + "loss": 2.6697, + "step": 41936 + }, + { + "epoch": 2.6033273325470234, + "grad_norm": 0.13262904343117501, + "learning_rate": 5.231897036175998e-06, + "loss": 2.7267, + "step": 41937 + }, + { + "epoch": 2.6033894096467813, + "grad_norm": 0.13807027234217334, + "learning_rate": 5.230288800564448e-06, + "loss": 2.7086, + "step": 41938 + }, + { + "epoch": 2.6034514867465393, + "grad_norm": 0.13699814313871, + "learning_rate": 5.228680798527347e-06, + "loss": 2.686, + "step": 41939 + }, + { + "epoch": 2.603513563846297, + "grad_norm": 0.13298492598842482, + "learning_rate": 5.227073030073093e-06, + "loss": 2.6662, + "step": 41940 + }, + { + "epoch": 2.603575640946055, + "grad_norm": 0.13908056693638118, + "learning_rate": 5.225465495210052e-06, + "loss": 2.6543, + "step": 41941 + }, + { + "epoch": 2.603637718045813, + "grad_norm": 0.15078322453762488, + "learning_rate": 5.22385819394664e-06, + "loss": 2.7172, + "step": 41942 + }, + { + "epoch": 2.603699795145571, + "grad_norm": 0.13167224894708515, + "learning_rate": 5.222251126291228e-06, + "loss": 2.6061, + "step": 41943 + }, + { + "epoch": 2.603761872245329, + "grad_norm": 0.13621558514852422, + "learning_rate": 5.220644292252202e-06, + "loss": 2.6123, + "step": 41944 + }, + { + "epoch": 2.6038239493450868, + "grad_norm": 0.15487271909789105, + "learning_rate": 5.219037691837947e-06, + "loss": 2.702, + "step": 41945 + }, + { + "epoch": 2.6038860264448447, + "grad_norm": 0.13836188659240786, + "learning_rate": 5.217431325056832e-06, + "loss": 2.7017, + "step": 41946 + }, + { + "epoch": 2.603948103544602, + "grad_norm": 0.1430633661773878, + "learning_rate": 5.215825191917256e-06, + "loss": 2.7295, + "step": 41947 + }, + { + "epoch": 2.6040101806443605, + "grad_norm": 0.13893467890062836, + "learning_rate": 5.214219292427597e-06, + "loss": 2.7626, + "step": 41948 + }, + { + "epoch": 2.604072257744118, + "grad_norm": 0.13018335977116022, + "learning_rate": 5.212613626596225e-06, + "loss": 2.7421, + "step": 41949 + }, + { + "epoch": 2.6041343348438764, + "grad_norm": 0.1403856094220144, + "learning_rate": 5.21100819443151e-06, + "loss": 2.7199, + "step": 41950 + }, + { + "epoch": 2.604196411943634, + "grad_norm": 0.13123691243814894, + "learning_rate": 5.209402995941848e-06, + "loss": 2.6671, + "step": 41951 + }, + { + "epoch": 2.6042584890433917, + "grad_norm": 0.13478537428622264, + "learning_rate": 5.207798031135608e-06, + "loss": 2.6204, + "step": 41952 + }, + { + "epoch": 2.6043205661431497, + "grad_norm": 0.14062788086165529, + "learning_rate": 5.206193300021156e-06, + "loss": 2.7742, + "step": 41953 + }, + { + "epoch": 2.6043826432429076, + "grad_norm": 0.13554347883251802, + "learning_rate": 5.2045888026068575e-06, + "loss": 2.8008, + "step": 41954 + }, + { + "epoch": 2.6044447203426655, + "grad_norm": 0.13471778766949374, + "learning_rate": 5.202984538901107e-06, + "loss": 2.6607, + "step": 41955 + }, + { + "epoch": 2.6045067974424234, + "grad_norm": 0.1364202988437073, + "learning_rate": 5.201380508912257e-06, + "loss": 2.6971, + "step": 41956 + }, + { + "epoch": 2.6045688745421813, + "grad_norm": 0.13148824973922102, + "learning_rate": 5.199776712648669e-06, + "loss": 2.6443, + "step": 41957 + }, + { + "epoch": 2.6046309516419393, + "grad_norm": 0.1474585880319739, + "learning_rate": 5.198173150118734e-06, + "loss": 2.6584, + "step": 41958 + }, + { + "epoch": 2.604693028741697, + "grad_norm": 0.14224722100612208, + "learning_rate": 5.1965698213308045e-06, + "loss": 2.6146, + "step": 41959 + }, + { + "epoch": 2.604755105841455, + "grad_norm": 0.13674395873770734, + "learning_rate": 5.194966726293238e-06, + "loss": 2.6986, + "step": 41960 + }, + { + "epoch": 2.604817182941213, + "grad_norm": 0.1346624573817018, + "learning_rate": 5.193363865014417e-06, + "loss": 2.6217, + "step": 41961 + }, + { + "epoch": 2.604879260040971, + "grad_norm": 0.14746324903684824, + "learning_rate": 5.191761237502696e-06, + "loss": 2.778, + "step": 41962 + }, + { + "epoch": 2.604941337140729, + "grad_norm": 0.1485775424991189, + "learning_rate": 5.190158843766429e-06, + "loss": 2.744, + "step": 41963 + }, + { + "epoch": 2.6050034142404868, + "grad_norm": 0.1363175021844964, + "learning_rate": 5.188556683813978e-06, + "loss": 2.6738, + "step": 41964 + }, + { + "epoch": 2.6050654913402447, + "grad_norm": 0.13824070804922087, + "learning_rate": 5.186954757653712e-06, + "loss": 2.6609, + "step": 41965 + }, + { + "epoch": 2.6051275684400026, + "grad_norm": 0.15217494782610957, + "learning_rate": 5.1853530652939796e-06, + "loss": 2.6841, + "step": 41966 + }, + { + "epoch": 2.6051896455397605, + "grad_norm": 0.14365658419928887, + "learning_rate": 5.183751606743142e-06, + "loss": 2.6809, + "step": 41967 + }, + { + "epoch": 2.6052517226395184, + "grad_norm": 0.13471539570272273, + "learning_rate": 5.1821503820095375e-06, + "loss": 2.8255, + "step": 41968 + }, + { + "epoch": 2.6053137997392763, + "grad_norm": 0.1400692873888591, + "learning_rate": 5.1805493911015465e-06, + "loss": 2.6561, + "step": 41969 + }, + { + "epoch": 2.605375876839034, + "grad_norm": 0.1429387848099518, + "learning_rate": 5.178948634027514e-06, + "loss": 2.7128, + "step": 41970 + }, + { + "epoch": 2.605437953938792, + "grad_norm": 0.15178209476302623, + "learning_rate": 5.177348110795782e-06, + "loss": 2.7035, + "step": 41971 + }, + { + "epoch": 2.6055000310385497, + "grad_norm": 0.15058370680090685, + "learning_rate": 5.175747821414706e-06, + "loss": 2.7681, + "step": 41972 + }, + { + "epoch": 2.605562108138308, + "grad_norm": 0.13810550518680093, + "learning_rate": 5.174147765892628e-06, + "loss": 2.7219, + "step": 41973 + }, + { + "epoch": 2.6056241852380655, + "grad_norm": 0.14483000833124537, + "learning_rate": 5.17254794423791e-06, + "loss": 2.6838, + "step": 41974 + }, + { + "epoch": 2.6056862623378234, + "grad_norm": 0.13436947522893228, + "learning_rate": 5.170948356458893e-06, + "loss": 2.7258, + "step": 41975 + }, + { + "epoch": 2.6057483394375813, + "grad_norm": 0.1570610988298663, + "learning_rate": 5.169349002563923e-06, + "loss": 2.59, + "step": 41976 + }, + { + "epoch": 2.6058104165373392, + "grad_norm": 0.14146479875099596, + "learning_rate": 5.167749882561335e-06, + "loss": 2.7017, + "step": 41977 + }, + { + "epoch": 2.605872493637097, + "grad_norm": 0.1362828594905228, + "learning_rate": 5.166150996459484e-06, + "loss": 2.6559, + "step": 41978 + }, + { + "epoch": 2.605934570736855, + "grad_norm": 0.133604799895593, + "learning_rate": 5.164552344266704e-06, + "loss": 2.6612, + "step": 41979 + }, + { + "epoch": 2.605996647836613, + "grad_norm": 0.13974694465682047, + "learning_rate": 5.162953925991343e-06, + "loss": 2.6979, + "step": 41980 + }, + { + "epoch": 2.606058724936371, + "grad_norm": 0.14281923339754932, + "learning_rate": 5.161355741641738e-06, + "loss": 2.6612, + "step": 41981 + }, + { + "epoch": 2.606120802036129, + "grad_norm": 0.14431964359015886, + "learning_rate": 5.15975779122621e-06, + "loss": 2.7444, + "step": 41982 + }, + { + "epoch": 2.6061828791358868, + "grad_norm": 0.13511634882644327, + "learning_rate": 5.1581600747531265e-06, + "loss": 2.7188, + "step": 41983 + }, + { + "epoch": 2.6062449562356447, + "grad_norm": 0.1462055286117347, + "learning_rate": 5.156562592230802e-06, + "loss": 2.7018, + "step": 41984 + }, + { + "epoch": 2.6063070333354026, + "grad_norm": 0.1594359318810498, + "learning_rate": 5.154965343667584e-06, + "loss": 2.6673, + "step": 41985 + }, + { + "epoch": 2.6063691104351605, + "grad_norm": 0.13602943353603505, + "learning_rate": 5.153368329071778e-06, + "loss": 2.7118, + "step": 41986 + }, + { + "epoch": 2.6064311875349184, + "grad_norm": 0.13339952856597856, + "learning_rate": 5.151771548451756e-06, + "loss": 2.6246, + "step": 41987 + }, + { + "epoch": 2.6064932646346763, + "grad_norm": 0.14241256638457314, + "learning_rate": 5.150175001815821e-06, + "loss": 2.6824, + "step": 41988 + }, + { + "epoch": 2.6065553417344343, + "grad_norm": 0.13513803166329075, + "learning_rate": 5.148578689172317e-06, + "loss": 2.619, + "step": 41989 + }, + { + "epoch": 2.606617418834192, + "grad_norm": 0.1335009444945389, + "learning_rate": 5.146982610529555e-06, + "loss": 2.701, + "step": 41990 + }, + { + "epoch": 2.60667949593395, + "grad_norm": 0.13523556140751677, + "learning_rate": 5.145386765895871e-06, + "loss": 2.6398, + "step": 41991 + }, + { + "epoch": 2.606741573033708, + "grad_norm": 0.13265498020296776, + "learning_rate": 5.143791155279604e-06, + "loss": 2.6808, + "step": 41992 + }, + { + "epoch": 2.6068036501334655, + "grad_norm": 0.14906783911102162, + "learning_rate": 5.142195778689068e-06, + "loss": 2.6514, + "step": 41993 + }, + { + "epoch": 2.606865727233224, + "grad_norm": 0.1323158711971125, + "learning_rate": 5.140600636132587e-06, + "loss": 2.5875, + "step": 41994 + }, + { + "epoch": 2.6069278043329813, + "grad_norm": 0.15149112057827782, + "learning_rate": 5.139005727618479e-06, + "loss": 2.6661, + "step": 41995 + }, + { + "epoch": 2.6069898814327397, + "grad_norm": 0.13940386930757548, + "learning_rate": 5.1374110531550616e-06, + "loss": 2.7397, + "step": 41996 + }, + { + "epoch": 2.607051958532497, + "grad_norm": 0.13729768520857227, + "learning_rate": 5.135816612750672e-06, + "loss": 2.7063, + "step": 41997 + }, + { + "epoch": 2.6071140356322555, + "grad_norm": 0.13678490536803442, + "learning_rate": 5.134222406413619e-06, + "loss": 2.6767, + "step": 41998 + }, + { + "epoch": 2.607176112732013, + "grad_norm": 0.15103699756399203, + "learning_rate": 5.132628434152215e-06, + "loss": 2.7218, + "step": 41999 + }, + { + "epoch": 2.607238189831771, + "grad_norm": 0.13231024017260146, + "learning_rate": 5.131034695974774e-06, + "loss": 2.6572, + "step": 42000 + }, + { + "epoch": 2.607300266931529, + "grad_norm": 0.14739849966578325, + "learning_rate": 5.129441191889628e-06, + "loss": 2.7042, + "step": 42001 + }, + { + "epoch": 2.6073623440312867, + "grad_norm": 0.14783447524671373, + "learning_rate": 5.127847921905077e-06, + "loss": 2.7517, + "step": 42002 + }, + { + "epoch": 2.6074244211310447, + "grad_norm": 0.1325731575169577, + "learning_rate": 5.126254886029436e-06, + "loss": 2.6854, + "step": 42003 + }, + { + "epoch": 2.6074864982308026, + "grad_norm": 0.13386269791451583, + "learning_rate": 5.124662084271009e-06, + "loss": 2.6389, + "step": 42004 + }, + { + "epoch": 2.6075485753305605, + "grad_norm": 0.13862507661769183, + "learning_rate": 5.123069516638123e-06, + "loss": 2.687, + "step": 42005 + }, + { + "epoch": 2.6076106524303184, + "grad_norm": 0.14034402778757685, + "learning_rate": 5.1214771831390785e-06, + "loss": 2.6907, + "step": 42006 + }, + { + "epoch": 2.6076727295300763, + "grad_norm": 0.15622702668102872, + "learning_rate": 5.119885083782183e-06, + "loss": 2.6555, + "step": 42007 + }, + { + "epoch": 2.6077348066298343, + "grad_norm": 0.1335190952021929, + "learning_rate": 5.118293218575737e-06, + "loss": 2.628, + "step": 42008 + }, + { + "epoch": 2.607796883729592, + "grad_norm": 0.13434072212557008, + "learning_rate": 5.116701587528044e-06, + "loss": 2.8119, + "step": 42009 + }, + { + "epoch": 2.60785896082935, + "grad_norm": 0.14024807164452568, + "learning_rate": 5.1151101906474265e-06, + "loss": 2.6612, + "step": 42010 + }, + { + "epoch": 2.607921037929108, + "grad_norm": 0.13551413638153786, + "learning_rate": 5.113519027942177e-06, + "loss": 2.6934, + "step": 42011 + }, + { + "epoch": 2.607983115028866, + "grad_norm": 0.17756638605174874, + "learning_rate": 5.111928099420588e-06, + "loss": 2.7934, + "step": 42012 + }, + { + "epoch": 2.608045192128624, + "grad_norm": 0.14047869382972272, + "learning_rate": 5.1103374050909595e-06, + "loss": 2.6043, + "step": 42013 + }, + { + "epoch": 2.6081072692283818, + "grad_norm": 0.13234980008641986, + "learning_rate": 5.108746944961612e-06, + "loss": 2.6252, + "step": 42014 + }, + { + "epoch": 2.6081693463281397, + "grad_norm": 0.14714940111550082, + "learning_rate": 5.107156719040829e-06, + "loss": 2.72, + "step": 42015 + }, + { + "epoch": 2.6082314234278976, + "grad_norm": 0.13450197331051628, + "learning_rate": 5.105566727336902e-06, + "loss": 2.6899, + "step": 42016 + }, + { + "epoch": 2.6082935005276555, + "grad_norm": 0.1430258384186759, + "learning_rate": 5.103976969858137e-06, + "loss": 2.7317, + "step": 42017 + }, + { + "epoch": 2.608355577627413, + "grad_norm": 0.16876434821483, + "learning_rate": 5.102387446612811e-06, + "loss": 2.7496, + "step": 42018 + }, + { + "epoch": 2.6084176547271714, + "grad_norm": 0.13730096147237747, + "learning_rate": 5.100798157609238e-06, + "loss": 2.7008, + "step": 42019 + }, + { + "epoch": 2.608479731826929, + "grad_norm": 0.14141245085466628, + "learning_rate": 5.099209102855701e-06, + "loss": 2.6417, + "step": 42020 + }, + { + "epoch": 2.608541808926687, + "grad_norm": 0.13312472779958057, + "learning_rate": 5.097620282360494e-06, + "loss": 2.6539, + "step": 42021 + }, + { + "epoch": 2.6086038860264447, + "grad_norm": 0.13208871555819562, + "learning_rate": 5.096031696131892e-06, + "loss": 2.7118, + "step": 42022 + }, + { + "epoch": 2.6086659631262026, + "grad_norm": 0.12962578316105247, + "learning_rate": 5.09444334417819e-06, + "loss": 2.7201, + "step": 42023 + }, + { + "epoch": 2.6087280402259605, + "grad_norm": 0.13598962228507344, + "learning_rate": 5.092855226507687e-06, + "loss": 2.6542, + "step": 42024 + }, + { + "epoch": 2.6087901173257184, + "grad_norm": 0.14445659281527776, + "learning_rate": 5.091267343128664e-06, + "loss": 2.8164, + "step": 42025 + }, + { + "epoch": 2.6088521944254763, + "grad_norm": 0.1356786057563149, + "learning_rate": 5.089679694049398e-06, + "loss": 2.7103, + "step": 42026 + }, + { + "epoch": 2.6089142715252343, + "grad_norm": 0.14608637464745455, + "learning_rate": 5.088092279278167e-06, + "loss": 2.6661, + "step": 42027 + }, + { + "epoch": 2.608976348624992, + "grad_norm": 0.13170150684698156, + "learning_rate": 5.0865050988232745e-06, + "loss": 2.7154, + "step": 42028 + }, + { + "epoch": 2.60903842572475, + "grad_norm": 0.13340262829521607, + "learning_rate": 5.0849181526929805e-06, + "loss": 2.6748, + "step": 42029 + }, + { + "epoch": 2.609100502824508, + "grad_norm": 0.14438232275322097, + "learning_rate": 5.083331440895578e-06, + "loss": 2.7419, + "step": 42030 + }, + { + "epoch": 2.609162579924266, + "grad_norm": 0.13589498441497194, + "learning_rate": 5.081744963439339e-06, + "loss": 2.7075, + "step": 42031 + }, + { + "epoch": 2.609224657024024, + "grad_norm": 0.1626160413640739, + "learning_rate": 5.0801587203325285e-06, + "loss": 2.7573, + "step": 42032 + }, + { + "epoch": 2.6092867341237818, + "grad_norm": 0.1600192286347571, + "learning_rate": 5.078572711583446e-06, + "loss": 2.806, + "step": 42033 + }, + { + "epoch": 2.6093488112235397, + "grad_norm": 0.13316026204942116, + "learning_rate": 5.0769869372003565e-06, + "loss": 2.6904, + "step": 42034 + }, + { + "epoch": 2.6094108883232976, + "grad_norm": 0.1384185774531275, + "learning_rate": 5.075401397191526e-06, + "loss": 2.6834, + "step": 42035 + }, + { + "epoch": 2.6094729654230555, + "grad_norm": 0.13077988274063526, + "learning_rate": 5.0738160915652256e-06, + "loss": 2.5831, + "step": 42036 + }, + { + "epoch": 2.6095350425228134, + "grad_norm": 0.13550826025608717, + "learning_rate": 5.072231020329743e-06, + "loss": 2.7185, + "step": 42037 + }, + { + "epoch": 2.6095971196225713, + "grad_norm": 0.1360999134652047, + "learning_rate": 5.070646183493338e-06, + "loss": 2.671, + "step": 42038 + }, + { + "epoch": 2.6096591967223293, + "grad_norm": 0.13161649454518257, + "learning_rate": 5.069061581064277e-06, + "loss": 2.6946, + "step": 42039 + }, + { + "epoch": 2.609721273822087, + "grad_norm": 0.15028006341465358, + "learning_rate": 5.06747721305082e-06, + "loss": 2.7421, + "step": 42040 + }, + { + "epoch": 2.6097833509218447, + "grad_norm": 0.1302216705803094, + "learning_rate": 5.0658930794612535e-06, + "loss": 2.6958, + "step": 42041 + }, + { + "epoch": 2.609845428021603, + "grad_norm": 0.13604166079687172, + "learning_rate": 5.064309180303833e-06, + "loss": 2.6203, + "step": 42042 + }, + { + "epoch": 2.6099075051213605, + "grad_norm": 0.14686846287362973, + "learning_rate": 5.0627255155868136e-06, + "loss": 2.666, + "step": 42043 + }, + { + "epoch": 2.609969582221119, + "grad_norm": 0.13270069072111704, + "learning_rate": 5.061142085318471e-06, + "loss": 2.7135, + "step": 42044 + }, + { + "epoch": 2.6100316593208763, + "grad_norm": 0.33759012255264226, + "learning_rate": 5.059558889507043e-06, + "loss": 2.5712, + "step": 42045 + }, + { + "epoch": 2.6100937364206347, + "grad_norm": 0.133721369303818, + "learning_rate": 5.057975928160819e-06, + "loss": 2.7619, + "step": 42046 + }, + { + "epoch": 2.610155813520392, + "grad_norm": 0.1457652879809821, + "learning_rate": 5.056393201288045e-06, + "loss": 2.7452, + "step": 42047 + }, + { + "epoch": 2.61021789062015, + "grad_norm": 0.13405267125893336, + "learning_rate": 5.054810708896979e-06, + "loss": 2.7495, + "step": 42048 + }, + { + "epoch": 2.610279967719908, + "grad_norm": 0.14174469258238775, + "learning_rate": 5.0532284509958675e-06, + "loss": 2.7398, + "step": 42049 + }, + { + "epoch": 2.610342044819666, + "grad_norm": 0.14062356285844352, + "learning_rate": 5.0516464275929764e-06, + "loss": 2.7235, + "step": 42050 + }, + { + "epoch": 2.610404121919424, + "grad_norm": 0.15172845735737248, + "learning_rate": 5.050064638696567e-06, + "loss": 2.7185, + "step": 42051 + }, + { + "epoch": 2.6104661990191818, + "grad_norm": 0.13847294494681456, + "learning_rate": 5.048483084314875e-06, + "loss": 2.7369, + "step": 42052 + }, + { + "epoch": 2.6105282761189397, + "grad_norm": 0.1380488004480007, + "learning_rate": 5.046901764456153e-06, + "loss": 2.5214, + "step": 42053 + }, + { + "epoch": 2.6105903532186976, + "grad_norm": 0.14745691432896202, + "learning_rate": 5.045320679128668e-06, + "loss": 2.7571, + "step": 42054 + }, + { + "epoch": 2.6106524303184555, + "grad_norm": 0.1348722630299103, + "learning_rate": 5.04373982834066e-06, + "loss": 2.6119, + "step": 42055 + }, + { + "epoch": 2.6107145074182134, + "grad_norm": 0.1331560446019839, + "learning_rate": 5.042159212100361e-06, + "loss": 2.726, + "step": 42056 + }, + { + "epoch": 2.6107765845179713, + "grad_norm": 0.13177040732042206, + "learning_rate": 5.040578830416043e-06, + "loss": 2.7342, + "step": 42057 + }, + { + "epoch": 2.6108386616177293, + "grad_norm": 0.1570877919957718, + "learning_rate": 5.038998683295937e-06, + "loss": 2.7312, + "step": 42058 + }, + { + "epoch": 2.610900738717487, + "grad_norm": 0.13888028427795282, + "learning_rate": 5.037418770748281e-06, + "loss": 2.8228, + "step": 42059 + }, + { + "epoch": 2.610962815817245, + "grad_norm": 0.1299864491749924, + "learning_rate": 5.035839092781336e-06, + "loss": 2.6902, + "step": 42060 + }, + { + "epoch": 2.611024892917003, + "grad_norm": 0.13677721136038523, + "learning_rate": 5.034259649403333e-06, + "loss": 2.67, + "step": 42061 + }, + { + "epoch": 2.611086970016761, + "grad_norm": 0.1436663926112094, + "learning_rate": 5.032680440622517e-06, + "loss": 2.7627, + "step": 42062 + }, + { + "epoch": 2.611149047116519, + "grad_norm": 0.13212955415486502, + "learning_rate": 5.031101466447108e-06, + "loss": 2.7131, + "step": 42063 + }, + { + "epoch": 2.6112111242162768, + "grad_norm": 0.13260353001199232, + "learning_rate": 5.029522726885372e-06, + "loss": 2.6021, + "step": 42064 + }, + { + "epoch": 2.6112732013160347, + "grad_norm": 0.13848441173006792, + "learning_rate": 5.027944221945535e-06, + "loss": 2.8054, + "step": 42065 + }, + { + "epoch": 2.611335278415792, + "grad_norm": 0.13260471695483372, + "learning_rate": 5.026365951635825e-06, + "loss": 2.6657, + "step": 42066 + }, + { + "epoch": 2.6113973555155505, + "grad_norm": 0.16242974004927652, + "learning_rate": 5.024787915964485e-06, + "loss": 2.6415, + "step": 42067 + }, + { + "epoch": 2.611459432615308, + "grad_norm": 0.14400115618556175, + "learning_rate": 5.023210114939736e-06, + "loss": 2.6593, + "step": 42068 + }, + { + "epoch": 2.6115215097150664, + "grad_norm": 0.13203996482844607, + "learning_rate": 5.021632548569826e-06, + "loss": 2.7395, + "step": 42069 + }, + { + "epoch": 2.611583586814824, + "grad_norm": 0.13807276807302418, + "learning_rate": 5.020055216862979e-06, + "loss": 2.7488, + "step": 42070 + }, + { + "epoch": 2.6116456639145817, + "grad_norm": 0.1347782916874938, + "learning_rate": 5.018478119827424e-06, + "loss": 2.6919, + "step": 42071 + }, + { + "epoch": 2.6117077410143397, + "grad_norm": 0.13705882284386806, + "learning_rate": 5.016901257471379e-06, + "loss": 2.6986, + "step": 42072 + }, + { + "epoch": 2.6117698181140976, + "grad_norm": 0.139631010181895, + "learning_rate": 5.015324629803087e-06, + "loss": 2.658, + "step": 42073 + }, + { + "epoch": 2.6118318952138555, + "grad_norm": 0.14548051181162955, + "learning_rate": 5.013748236830767e-06, + "loss": 2.7513, + "step": 42074 + }, + { + "epoch": 2.6118939723136134, + "grad_norm": 0.1376274225596052, + "learning_rate": 5.012172078562644e-06, + "loss": 2.6669, + "step": 42075 + }, + { + "epoch": 2.6119560494133713, + "grad_norm": 0.148036521861636, + "learning_rate": 5.01059615500693e-06, + "loss": 2.7302, + "step": 42076 + }, + { + "epoch": 2.6120181265131293, + "grad_norm": 0.1378136592933006, + "learning_rate": 5.009020466171871e-06, + "loss": 2.7447, + "step": 42077 + }, + { + "epoch": 2.612080203612887, + "grad_norm": 0.13340099466310837, + "learning_rate": 5.00744501206567e-06, + "loss": 2.7401, + "step": 42078 + }, + { + "epoch": 2.612142280712645, + "grad_norm": 0.13759817254624113, + "learning_rate": 5.005869792696549e-06, + "loss": 2.6917, + "step": 42079 + }, + { + "epoch": 2.612204357812403, + "grad_norm": 0.13251707936368076, + "learning_rate": 5.004294808072735e-06, + "loss": 2.6097, + "step": 42080 + }, + { + "epoch": 2.612266434912161, + "grad_norm": 0.13844007272245723, + "learning_rate": 5.002720058202426e-06, + "loss": 2.7595, + "step": 42081 + }, + { + "epoch": 2.612328512011919, + "grad_norm": 0.13825258480530486, + "learning_rate": 5.001145543093855e-06, + "loss": 2.7353, + "step": 42082 + }, + { + "epoch": 2.6123905891116768, + "grad_norm": 0.14765475968713856, + "learning_rate": 4.999571262755237e-06, + "loss": 2.7235, + "step": 42083 + }, + { + "epoch": 2.6124526662114347, + "grad_norm": 0.14865778589166925, + "learning_rate": 4.9979972171947776e-06, + "loss": 2.7617, + "step": 42084 + }, + { + "epoch": 2.6125147433111926, + "grad_norm": 0.14748146848929547, + "learning_rate": 4.99642340642068e-06, + "loss": 2.7449, + "step": 42085 + }, + { + "epoch": 2.6125768204109505, + "grad_norm": 0.16075397230404698, + "learning_rate": 4.9948498304411785e-06, + "loss": 2.6627, + "step": 42086 + }, + { + "epoch": 2.6126388975107084, + "grad_norm": 0.1371396601885197, + "learning_rate": 4.99327648926447e-06, + "loss": 2.7, + "step": 42087 + }, + { + "epoch": 2.6127009746104664, + "grad_norm": 0.13590834759935266, + "learning_rate": 4.991703382898755e-06, + "loss": 2.753, + "step": 42088 + }, + { + "epoch": 2.612763051710224, + "grad_norm": 0.14100615847400186, + "learning_rate": 4.990130511352253e-06, + "loss": 2.6967, + "step": 42089 + }, + { + "epoch": 2.612825128809982, + "grad_norm": 0.13207725056496772, + "learning_rate": 4.988557874633165e-06, + "loss": 2.6816, + "step": 42090 + }, + { + "epoch": 2.6128872059097397, + "grad_norm": 0.13325676523188362, + "learning_rate": 4.986985472749706e-06, + "loss": 2.6818, + "step": 42091 + }, + { + "epoch": 2.612949283009498, + "grad_norm": 0.13624667683100955, + "learning_rate": 4.985413305710069e-06, + "loss": 2.7059, + "step": 42092 + }, + { + "epoch": 2.6130113601092555, + "grad_norm": 0.13713621029066994, + "learning_rate": 4.983841373522457e-06, + "loss": 2.7711, + "step": 42093 + }, + { + "epoch": 2.613073437209014, + "grad_norm": 0.1455356584056961, + "learning_rate": 4.982269676195078e-06, + "loss": 2.6167, + "step": 42094 + }, + { + "epoch": 2.6131355143087713, + "grad_norm": 0.13529108411011287, + "learning_rate": 4.980698213736112e-06, + "loss": 2.7601, + "step": 42095 + }, + { + "epoch": 2.6131975914085293, + "grad_norm": 0.13185849879742506, + "learning_rate": 4.979126986153781e-06, + "loss": 2.7116, + "step": 42096 + }, + { + "epoch": 2.613259668508287, + "grad_norm": 0.1362516598644117, + "learning_rate": 4.977555993456278e-06, + "loss": 2.7259, + "step": 42097 + }, + { + "epoch": 2.613321745608045, + "grad_norm": 0.14373619256880077, + "learning_rate": 4.975985235651792e-06, + "loss": 2.677, + "step": 42098 + }, + { + "epoch": 2.613383822707803, + "grad_norm": 0.13698118771003664, + "learning_rate": 4.974414712748516e-06, + "loss": 2.5918, + "step": 42099 + }, + { + "epoch": 2.613445899807561, + "grad_norm": 0.13248656051782046, + "learning_rate": 4.972844424754652e-06, + "loss": 2.6768, + "step": 42100 + }, + { + "epoch": 2.613507976907319, + "grad_norm": 0.14125166446836004, + "learning_rate": 4.971274371678392e-06, + "loss": 2.6658, + "step": 42101 + }, + { + "epoch": 2.6135700540070768, + "grad_norm": 0.13282277263855632, + "learning_rate": 4.969704553527926e-06, + "loss": 2.6971, + "step": 42102 + }, + { + "epoch": 2.6136321311068347, + "grad_norm": 0.13201640457571492, + "learning_rate": 4.968134970311433e-06, + "loss": 2.6274, + "step": 42103 + }, + { + "epoch": 2.6136942082065926, + "grad_norm": 0.15226785285302424, + "learning_rate": 4.966565622037123e-06, + "loss": 2.7734, + "step": 42104 + }, + { + "epoch": 2.6137562853063505, + "grad_norm": 0.14225384662938137, + "learning_rate": 4.964996508713166e-06, + "loss": 2.7872, + "step": 42105 + }, + { + "epoch": 2.6138183624061084, + "grad_norm": 0.13537087185783292, + "learning_rate": 4.9634276303477615e-06, + "loss": 2.7334, + "step": 42106 + }, + { + "epoch": 2.6138804395058663, + "grad_norm": 0.15269705567690886, + "learning_rate": 4.961858986949086e-06, + "loss": 2.6665, + "step": 42107 + }, + { + "epoch": 2.6139425166056243, + "grad_norm": 0.13400894390200807, + "learning_rate": 4.960290578525312e-06, + "loss": 2.709, + "step": 42108 + }, + { + "epoch": 2.614004593705382, + "grad_norm": 0.15311795748844945, + "learning_rate": 4.958722405084648e-06, + "loss": 2.785, + "step": 42109 + }, + { + "epoch": 2.61406667080514, + "grad_norm": 0.14168778129494183, + "learning_rate": 4.957154466635267e-06, + "loss": 2.7334, + "step": 42110 + }, + { + "epoch": 2.614128747904898, + "grad_norm": 0.13681085072904797, + "learning_rate": 4.9555867631853435e-06, + "loss": 2.7853, + "step": 42111 + }, + { + "epoch": 2.614190825004656, + "grad_norm": 0.1424221596686217, + "learning_rate": 4.9540192947430455e-06, + "loss": 2.6792, + "step": 42112 + }, + { + "epoch": 2.614252902104414, + "grad_norm": 0.14220935344437277, + "learning_rate": 4.952452061316576e-06, + "loss": 2.7413, + "step": 42113 + }, + { + "epoch": 2.6143149792041713, + "grad_norm": 0.13445003155746602, + "learning_rate": 4.9508850629141016e-06, + "loss": 2.6529, + "step": 42114 + }, + { + "epoch": 2.6143770563039297, + "grad_norm": 0.13370716762772594, + "learning_rate": 4.949318299543792e-06, + "loss": 2.667, + "step": 42115 + }, + { + "epoch": 2.614439133403687, + "grad_norm": 0.1471188347398963, + "learning_rate": 4.9477517712138326e-06, + "loss": 2.6641, + "step": 42116 + }, + { + "epoch": 2.6145012105034455, + "grad_norm": 0.1297725697220209, + "learning_rate": 4.946185477932375e-06, + "loss": 2.6594, + "step": 42117 + }, + { + "epoch": 2.614563287603203, + "grad_norm": 0.13151518337814186, + "learning_rate": 4.944619419707613e-06, + "loss": 2.6276, + "step": 42118 + }, + { + "epoch": 2.614625364702961, + "grad_norm": 0.1381728029924164, + "learning_rate": 4.943053596547708e-06, + "loss": 2.623, + "step": 42119 + }, + { + "epoch": 2.614687441802719, + "grad_norm": 0.14664765476779798, + "learning_rate": 4.941488008460837e-06, + "loss": 2.6845, + "step": 42120 + }, + { + "epoch": 2.6147495189024768, + "grad_norm": 0.13498410883065992, + "learning_rate": 4.939922655455148e-06, + "loss": 2.6337, + "step": 42121 + }, + { + "epoch": 2.6148115960022347, + "grad_norm": 0.13323620220509255, + "learning_rate": 4.9383575375388184e-06, + "loss": 2.7072, + "step": 42122 + }, + { + "epoch": 2.6148736731019926, + "grad_norm": 0.1431774710794164, + "learning_rate": 4.936792654720029e-06, + "loss": 2.687, + "step": 42123 + }, + { + "epoch": 2.6149357502017505, + "grad_norm": 0.13393441539457526, + "learning_rate": 4.9352280070069315e-06, + "loss": 2.7208, + "step": 42124 + }, + { + "epoch": 2.6149978273015084, + "grad_norm": 0.1323050875752034, + "learning_rate": 4.933663594407689e-06, + "loss": 2.7356, + "step": 42125 + }, + { + "epoch": 2.6150599044012663, + "grad_norm": 0.1462424824180464, + "learning_rate": 4.932099416930452e-06, + "loss": 2.7332, + "step": 42126 + }, + { + "epoch": 2.6151219815010243, + "grad_norm": 0.14443854632343217, + "learning_rate": 4.9305354745834085e-06, + "loss": 2.7741, + "step": 42127 + }, + { + "epoch": 2.615184058600782, + "grad_norm": 0.13163854869031422, + "learning_rate": 4.928971767374696e-06, + "loss": 2.7349, + "step": 42128 + }, + { + "epoch": 2.61524613570054, + "grad_norm": 0.1481493927089444, + "learning_rate": 4.9274082953124854e-06, + "loss": 2.7329, + "step": 42129 + }, + { + "epoch": 2.615308212800298, + "grad_norm": 0.1414077484725885, + "learning_rate": 4.925845058404921e-06, + "loss": 2.6852, + "step": 42130 + }, + { + "epoch": 2.615370289900056, + "grad_norm": 0.14162356764716363, + "learning_rate": 4.924282056660162e-06, + "loss": 2.722, + "step": 42131 + }, + { + "epoch": 2.615432366999814, + "grad_norm": 0.13659155241996943, + "learning_rate": 4.922719290086369e-06, + "loss": 2.7372, + "step": 42132 + }, + { + "epoch": 2.6154944440995718, + "grad_norm": 0.15473318581296047, + "learning_rate": 4.921156758691698e-06, + "loss": 2.6992, + "step": 42133 + }, + { + "epoch": 2.6155565211993297, + "grad_norm": 0.1341700732300186, + "learning_rate": 4.919594462484289e-06, + "loss": 2.6738, + "step": 42134 + }, + { + "epoch": 2.6156185982990876, + "grad_norm": 0.15700432370999035, + "learning_rate": 4.918032401472295e-06, + "loss": 2.7868, + "step": 42135 + }, + { + "epoch": 2.6156806753988455, + "grad_norm": 0.13912154883519537, + "learning_rate": 4.916470575663878e-06, + "loss": 2.6763, + "step": 42136 + }, + { + "epoch": 2.615742752498603, + "grad_norm": 0.14959996320835914, + "learning_rate": 4.914908985067179e-06, + "loss": 2.7479, + "step": 42137 + }, + { + "epoch": 2.6158048295983614, + "grad_norm": 0.13529515240012419, + "learning_rate": 4.913347629690346e-06, + "loss": 2.7376, + "step": 42138 + }, + { + "epoch": 2.615866906698119, + "grad_norm": 0.13592458723935913, + "learning_rate": 4.911786509541511e-06, + "loss": 2.6506, + "step": 42139 + }, + { + "epoch": 2.615928983797877, + "grad_norm": 0.15298622323790761, + "learning_rate": 4.910225624628845e-06, + "loss": 2.7054, + "step": 42140 + }, + { + "epoch": 2.6159910608976347, + "grad_norm": 0.1356886957279305, + "learning_rate": 4.9086649749604755e-06, + "loss": 2.6894, + "step": 42141 + }, + { + "epoch": 2.616053137997393, + "grad_norm": 0.1363817557620483, + "learning_rate": 4.907104560544545e-06, + "loss": 2.7706, + "step": 42142 + }, + { + "epoch": 2.6161152150971505, + "grad_norm": 0.1368569152786071, + "learning_rate": 4.905544381389199e-06, + "loss": 2.5773, + "step": 42143 + }, + { + "epoch": 2.6161772921969084, + "grad_norm": 0.13424089110362927, + "learning_rate": 4.903984437502563e-06, + "loss": 2.6392, + "step": 42144 + }, + { + "epoch": 2.6162393692966663, + "grad_norm": 0.13501033817328514, + "learning_rate": 4.902424728892801e-06, + "loss": 2.6382, + "step": 42145 + }, + { + "epoch": 2.6163014463964243, + "grad_norm": 0.13429595124757504, + "learning_rate": 4.900865255568032e-06, + "loss": 2.6902, + "step": 42146 + }, + { + "epoch": 2.616363523496182, + "grad_norm": 0.14843605693316458, + "learning_rate": 4.899306017536404e-06, + "loss": 2.7069, + "step": 42147 + }, + { + "epoch": 2.61642560059594, + "grad_norm": 0.14700600255296062, + "learning_rate": 4.897747014806031e-06, + "loss": 2.684, + "step": 42148 + }, + { + "epoch": 2.616487677695698, + "grad_norm": 0.1520694990648316, + "learning_rate": 4.89618824738507e-06, + "loss": 2.6958, + "step": 42149 + }, + { + "epoch": 2.616549754795456, + "grad_norm": 0.13538267705517534, + "learning_rate": 4.894629715281645e-06, + "loss": 2.7532, + "step": 42150 + }, + { + "epoch": 2.616611831895214, + "grad_norm": 0.14964082750822755, + "learning_rate": 4.8930714185038864e-06, + "loss": 2.8158, + "step": 42151 + }, + { + "epoch": 2.6166739089949718, + "grad_norm": 0.13839920530394248, + "learning_rate": 4.891513357059924e-06, + "loss": 2.7333, + "step": 42152 + }, + { + "epoch": 2.6167359860947297, + "grad_norm": 0.13532352377276452, + "learning_rate": 4.889955530957879e-06, + "loss": 2.6888, + "step": 42153 + }, + { + "epoch": 2.6167980631944876, + "grad_norm": 0.14001726914657892, + "learning_rate": 4.888397940205885e-06, + "loss": 2.6926, + "step": 42154 + }, + { + "epoch": 2.6168601402942455, + "grad_norm": 0.15407186712801146, + "learning_rate": 4.88684058481208e-06, + "loss": 2.7337, + "step": 42155 + }, + { + "epoch": 2.6169222173940034, + "grad_norm": 0.1332466728307685, + "learning_rate": 4.885283464784579e-06, + "loss": 2.6494, + "step": 42156 + }, + { + "epoch": 2.6169842944937614, + "grad_norm": 0.13899728713973097, + "learning_rate": 4.883726580131509e-06, + "loss": 2.7266, + "step": 42157 + }, + { + "epoch": 2.6170463715935193, + "grad_norm": 0.13213820245016772, + "learning_rate": 4.882169930860981e-06, + "loss": 2.688, + "step": 42158 + }, + { + "epoch": 2.617108448693277, + "grad_norm": 0.1338169389070441, + "learning_rate": 4.880613516981131e-06, + "loss": 2.6893, + "step": 42159 + }, + { + "epoch": 2.617170525793035, + "grad_norm": 0.13945150828559688, + "learning_rate": 4.879057338500076e-06, + "loss": 2.727, + "step": 42160 + }, + { + "epoch": 2.617232602892793, + "grad_norm": 0.13337929477989774, + "learning_rate": 4.877501395425927e-06, + "loss": 2.6833, + "step": 42161 + }, + { + "epoch": 2.6172946799925505, + "grad_norm": 0.13948151809622858, + "learning_rate": 4.875945687766803e-06, + "loss": 2.6891, + "step": 42162 + }, + { + "epoch": 2.617356757092309, + "grad_norm": 0.16093381095102505, + "learning_rate": 4.874390215530833e-06, + "loss": 2.706, + "step": 42163 + }, + { + "epoch": 2.6174188341920663, + "grad_norm": 0.13307374616788661, + "learning_rate": 4.8728349787261205e-06, + "loss": 2.7224, + "step": 42164 + }, + { + "epoch": 2.6174809112918247, + "grad_norm": 0.13816703993781776, + "learning_rate": 4.871279977360788e-06, + "loss": 2.6306, + "step": 42165 + }, + { + "epoch": 2.617542988391582, + "grad_norm": 0.1388380975205881, + "learning_rate": 4.869725211442938e-06, + "loss": 2.7097, + "step": 42166 + }, + { + "epoch": 2.61760506549134, + "grad_norm": 0.1343989012775115, + "learning_rate": 4.868170680980683e-06, + "loss": 2.766, + "step": 42167 + }, + { + "epoch": 2.617667142591098, + "grad_norm": 0.13862617841181876, + "learning_rate": 4.866616385982148e-06, + "loss": 2.7757, + "step": 42168 + }, + { + "epoch": 2.617729219690856, + "grad_norm": 0.13634230012623336, + "learning_rate": 4.865062326455428e-06, + "loss": 2.6508, + "step": 42169 + }, + { + "epoch": 2.617791296790614, + "grad_norm": 0.15523321002509632, + "learning_rate": 4.863508502408631e-06, + "loss": 2.7742, + "step": 42170 + }, + { + "epoch": 2.6178533738903718, + "grad_norm": 0.14397973434495667, + "learning_rate": 4.861954913849864e-06, + "loss": 2.7292, + "step": 42171 + }, + { + "epoch": 2.6179154509901297, + "grad_norm": 0.1418299904293289, + "learning_rate": 4.8604015607872475e-06, + "loss": 2.7066, + "step": 42172 + }, + { + "epoch": 2.6179775280898876, + "grad_norm": 0.13129374308756572, + "learning_rate": 4.85884844322887e-06, + "loss": 2.6975, + "step": 42173 + }, + { + "epoch": 2.6180396051896455, + "grad_norm": 0.13320666613304608, + "learning_rate": 4.857295561182834e-06, + "loss": 2.7005, + "step": 42174 + }, + { + "epoch": 2.6181016822894034, + "grad_norm": 0.1318478755490816, + "learning_rate": 4.855742914657246e-06, + "loss": 2.7345, + "step": 42175 + }, + { + "epoch": 2.6181637593891613, + "grad_norm": 0.13243718269790797, + "learning_rate": 4.8541905036602105e-06, + "loss": 2.63, + "step": 42176 + }, + { + "epoch": 2.6182258364889193, + "grad_norm": 0.13738727608968684, + "learning_rate": 4.852638328199821e-06, + "loss": 2.6319, + "step": 42177 + }, + { + "epoch": 2.618287913588677, + "grad_norm": 0.135406366898015, + "learning_rate": 4.8510863882841814e-06, + "loss": 2.6703, + "step": 42178 + }, + { + "epoch": 2.618349990688435, + "grad_norm": 0.14305729234848924, + "learning_rate": 4.849534683921381e-06, + "loss": 2.702, + "step": 42179 + }, + { + "epoch": 2.618412067788193, + "grad_norm": 0.1353685189507102, + "learning_rate": 4.847983215119512e-06, + "loss": 2.7055, + "step": 42180 + }, + { + "epoch": 2.618474144887951, + "grad_norm": 0.1288510695753162, + "learning_rate": 4.846431981886684e-06, + "loss": 2.6565, + "step": 42181 + }, + { + "epoch": 2.618536221987709, + "grad_norm": 0.1414820857629038, + "learning_rate": 4.844880984230982e-06, + "loss": 2.6214, + "step": 42182 + }, + { + "epoch": 2.6185982990874668, + "grad_norm": 0.14011762402532688, + "learning_rate": 4.843330222160497e-06, + "loss": 2.8676, + "step": 42183 + }, + { + "epoch": 2.6186603761872247, + "grad_norm": 0.14665260826903714, + "learning_rate": 4.841779695683313e-06, + "loss": 2.6951, + "step": 42184 + }, + { + "epoch": 2.618722453286982, + "grad_norm": 0.1380234319214343, + "learning_rate": 4.8402294048075335e-06, + "loss": 2.6455, + "step": 42185 + }, + { + "epoch": 2.6187845303867405, + "grad_norm": 0.13622192984998172, + "learning_rate": 4.838679349541242e-06, + "loss": 2.589, + "step": 42186 + }, + { + "epoch": 2.618846607486498, + "grad_norm": 0.1520855010762162, + "learning_rate": 4.837129529892515e-06, + "loss": 2.7433, + "step": 42187 + }, + { + "epoch": 2.6189086845862564, + "grad_norm": 0.14546827124651052, + "learning_rate": 4.835579945869451e-06, + "loss": 2.5776, + "step": 42188 + }, + { + "epoch": 2.618970761686014, + "grad_norm": 0.1412123362154035, + "learning_rate": 4.834030597480127e-06, + "loss": 2.704, + "step": 42189 + }, + { + "epoch": 2.619032838785772, + "grad_norm": 0.13409807896832893, + "learning_rate": 4.832481484732637e-06, + "loss": 2.7083, + "step": 42190 + }, + { + "epoch": 2.6190949158855297, + "grad_norm": 0.1592273640759147, + "learning_rate": 4.830932607635053e-06, + "loss": 2.7544, + "step": 42191 + }, + { + "epoch": 2.6191569929852876, + "grad_norm": 0.144608650875494, + "learning_rate": 4.829383966195461e-06, + "loss": 2.7067, + "step": 42192 + }, + { + "epoch": 2.6192190700850455, + "grad_norm": 0.13398820437861775, + "learning_rate": 4.827835560421939e-06, + "loss": 2.6845, + "step": 42193 + }, + { + "epoch": 2.6192811471848034, + "grad_norm": 0.13948369479171102, + "learning_rate": 4.8262873903225595e-06, + "loss": 2.6755, + "step": 42194 + }, + { + "epoch": 2.6193432242845613, + "grad_norm": 0.13585218597002285, + "learning_rate": 4.824739455905408e-06, + "loss": 2.7147, + "step": 42195 + }, + { + "epoch": 2.6194053013843193, + "grad_norm": 0.13607434586636336, + "learning_rate": 4.823191757178563e-06, + "loss": 2.6614, + "step": 42196 + }, + { + "epoch": 2.619467378484077, + "grad_norm": 0.1498320722355386, + "learning_rate": 4.8216442941500905e-06, + "loss": 2.6523, + "step": 42197 + }, + { + "epoch": 2.619529455583835, + "grad_norm": 0.13736544657941938, + "learning_rate": 4.820097066828055e-06, + "loss": 2.6849, + "step": 42198 + }, + { + "epoch": 2.619591532683593, + "grad_norm": 0.13523442510232597, + "learning_rate": 4.8185500752205515e-06, + "loss": 2.7281, + "step": 42199 + }, + { + "epoch": 2.619653609783351, + "grad_norm": 0.1318388832610893, + "learning_rate": 4.817003319335645e-06, + "loss": 2.6935, + "step": 42200 + }, + { + "epoch": 2.619715686883109, + "grad_norm": 0.14257890230776918, + "learning_rate": 4.815456799181395e-06, + "loss": 2.6492, + "step": 42201 + }, + { + "epoch": 2.6197777639828668, + "grad_norm": 0.14042827303071923, + "learning_rate": 4.813910514765879e-06, + "loss": 2.6421, + "step": 42202 + }, + { + "epoch": 2.6198398410826247, + "grad_norm": 0.14295460365223656, + "learning_rate": 4.8123644660971525e-06, + "loss": 2.6327, + "step": 42203 + }, + { + "epoch": 2.6199019181823826, + "grad_norm": 0.1385383769292491, + "learning_rate": 4.810818653183297e-06, + "loss": 2.7215, + "step": 42204 + }, + { + "epoch": 2.6199639952821405, + "grad_norm": 0.13531273206422212, + "learning_rate": 4.809273076032372e-06, + "loss": 2.7693, + "step": 42205 + }, + { + "epoch": 2.6200260723818984, + "grad_norm": 0.13631244640314308, + "learning_rate": 4.80772773465244e-06, + "loss": 2.7078, + "step": 42206 + }, + { + "epoch": 2.6200881494816564, + "grad_norm": 0.14477580232867263, + "learning_rate": 4.806182629051553e-06, + "loss": 2.7576, + "step": 42207 + }, + { + "epoch": 2.6201502265814143, + "grad_norm": 0.1505363612816826, + "learning_rate": 4.804637759237795e-06, + "loss": 2.7249, + "step": 42208 + }, + { + "epoch": 2.620212303681172, + "grad_norm": 0.14021440007752917, + "learning_rate": 4.80309312521921e-06, + "loss": 2.6371, + "step": 42209 + }, + { + "epoch": 2.6202743807809297, + "grad_norm": 0.14001235361388736, + "learning_rate": 4.801548727003862e-06, + "loss": 2.7373, + "step": 42210 + }, + { + "epoch": 2.620336457880688, + "grad_norm": 0.1319624541007321, + "learning_rate": 4.800004564599797e-06, + "loss": 2.6606, + "step": 42211 + }, + { + "epoch": 2.6203985349804455, + "grad_norm": 0.13342412195883496, + "learning_rate": 4.7984606380150895e-06, + "loss": 2.7363, + "step": 42212 + }, + { + "epoch": 2.620460612080204, + "grad_norm": 0.14155526555335593, + "learning_rate": 4.796916947257785e-06, + "loss": 2.7158, + "step": 42213 + }, + { + "epoch": 2.6205226891799613, + "grad_norm": 0.14262922233596875, + "learning_rate": 4.7953734923359365e-06, + "loss": 2.618, + "step": 42214 + }, + { + "epoch": 2.6205847662797193, + "grad_norm": 0.15244177574489273, + "learning_rate": 4.7938302732576045e-06, + "loss": 2.6801, + "step": 42215 + }, + { + "epoch": 2.620646843379477, + "grad_norm": 0.13369577176397412, + "learning_rate": 4.792287290030817e-06, + "loss": 2.6987, + "step": 42216 + }, + { + "epoch": 2.620708920479235, + "grad_norm": 0.14760381206473447, + "learning_rate": 4.790744542663655e-06, + "loss": 2.7138, + "step": 42217 + }, + { + "epoch": 2.620770997578993, + "grad_norm": 0.13977687104911757, + "learning_rate": 4.7892020311641585e-06, + "loss": 2.7102, + "step": 42218 + }, + { + "epoch": 2.620833074678751, + "grad_norm": 0.13007369391881538, + "learning_rate": 4.7876597555403525e-06, + "loss": 2.6491, + "step": 42219 + }, + { + "epoch": 2.620895151778509, + "grad_norm": 0.13222967620293857, + "learning_rate": 4.786117715800315e-06, + "loss": 2.6375, + "step": 42220 + }, + { + "epoch": 2.6209572288782668, + "grad_norm": 0.14148456080575642, + "learning_rate": 4.784575911952066e-06, + "loss": 2.6492, + "step": 42221 + }, + { + "epoch": 2.6210193059780247, + "grad_norm": 0.14107703697139082, + "learning_rate": 4.783034344003673e-06, + "loss": 2.7253, + "step": 42222 + }, + { + "epoch": 2.6210813830777826, + "grad_norm": 0.1472737026781857, + "learning_rate": 4.781493011963162e-06, + "loss": 2.7556, + "step": 42223 + }, + { + "epoch": 2.6211434601775405, + "grad_norm": 0.14088059086270363, + "learning_rate": 4.779951915838582e-06, + "loss": 2.7409, + "step": 42224 + }, + { + "epoch": 2.6212055372772984, + "grad_norm": 0.13785330183532232, + "learning_rate": 4.7784110556379605e-06, + "loss": 2.6392, + "step": 42225 + }, + { + "epoch": 2.6212676143770564, + "grad_norm": 0.14656216780984807, + "learning_rate": 4.776870431369357e-06, + "loss": 2.7441, + "step": 42226 + }, + { + "epoch": 2.6213296914768143, + "grad_norm": 0.13692966949058819, + "learning_rate": 4.775330043040799e-06, + "loss": 2.7079, + "step": 42227 + }, + { + "epoch": 2.621391768576572, + "grad_norm": 0.14516548877752622, + "learning_rate": 4.773789890660324e-06, + "loss": 2.6783, + "step": 42228 + }, + { + "epoch": 2.62145384567633, + "grad_norm": 0.155671754405285, + "learning_rate": 4.772249974235965e-06, + "loss": 2.6919, + "step": 42229 + }, + { + "epoch": 2.621515922776088, + "grad_norm": 0.14882154099726697, + "learning_rate": 4.770710293775743e-06, + "loss": 2.6642, + "step": 42230 + }, + { + "epoch": 2.621577999875846, + "grad_norm": 0.1351868638884754, + "learning_rate": 4.769170849287719e-06, + "loss": 2.7254, + "step": 42231 + }, + { + "epoch": 2.621640076975604, + "grad_norm": 0.15756853598344503, + "learning_rate": 4.767631640779913e-06, + "loss": 2.6521, + "step": 42232 + }, + { + "epoch": 2.6217021540753613, + "grad_norm": 0.14764517207697586, + "learning_rate": 4.766092668260353e-06, + "loss": 2.7228, + "step": 42233 + }, + { + "epoch": 2.6217642311751197, + "grad_norm": 0.1348708060207353, + "learning_rate": 4.764553931737053e-06, + "loss": 2.6432, + "step": 42234 + }, + { + "epoch": 2.621826308274877, + "grad_norm": 0.13524951996559445, + "learning_rate": 4.763015431218071e-06, + "loss": 2.6483, + "step": 42235 + }, + { + "epoch": 2.6218883853746355, + "grad_norm": 0.133843062981769, + "learning_rate": 4.761477166711414e-06, + "loss": 2.6866, + "step": 42236 + }, + { + "epoch": 2.621950462474393, + "grad_norm": 0.13119045763239282, + "learning_rate": 4.759939138225117e-06, + "loss": 2.6574, + "step": 42237 + }, + { + "epoch": 2.6220125395741514, + "grad_norm": 0.14628893211776567, + "learning_rate": 4.758401345767199e-06, + "loss": 2.6493, + "step": 42238 + }, + { + "epoch": 2.622074616673909, + "grad_norm": 0.13398223352528144, + "learning_rate": 4.756863789345678e-06, + "loss": 2.6663, + "step": 42239 + }, + { + "epoch": 2.6221366937736668, + "grad_norm": 0.14050112009747406, + "learning_rate": 4.7553264689685905e-06, + "loss": 2.685, + "step": 42240 + }, + { + "epoch": 2.6221987708734247, + "grad_norm": 0.13151660674858243, + "learning_rate": 4.753789384643942e-06, + "loss": 2.6992, + "step": 42241 + }, + { + "epoch": 2.6222608479731826, + "grad_norm": 0.14540381843686687, + "learning_rate": 4.752252536379765e-06, + "loss": 2.7102, + "step": 42242 + }, + { + "epoch": 2.6223229250729405, + "grad_norm": 0.13531590497091522, + "learning_rate": 4.750715924184057e-06, + "loss": 2.6976, + "step": 42243 + }, + { + "epoch": 2.6223850021726984, + "grad_norm": 0.16760144993927584, + "learning_rate": 4.749179548064858e-06, + "loss": 2.7155, + "step": 42244 + }, + { + "epoch": 2.6224470792724563, + "grad_norm": 0.13457982067074947, + "learning_rate": 4.747643408030178e-06, + "loss": 2.688, + "step": 42245 + }, + { + "epoch": 2.6225091563722143, + "grad_norm": 0.1341269465800274, + "learning_rate": 4.7461075040880265e-06, + "loss": 2.7806, + "step": 42246 + }, + { + "epoch": 2.622571233471972, + "grad_norm": 0.1317684636700827, + "learning_rate": 4.744571836246409e-06, + "loss": 2.7467, + "step": 42247 + }, + { + "epoch": 2.62263331057173, + "grad_norm": 0.12760508465598894, + "learning_rate": 4.743036404513352e-06, + "loss": 2.6609, + "step": 42248 + }, + { + "epoch": 2.622695387671488, + "grad_norm": 0.13958046999765367, + "learning_rate": 4.741501208896864e-06, + "loss": 2.767, + "step": 42249 + }, + { + "epoch": 2.622757464771246, + "grad_norm": 0.13847491555250094, + "learning_rate": 4.739966249404948e-06, + "loss": 2.6377, + "step": 42250 + }, + { + "epoch": 2.622819541871004, + "grad_norm": 0.17033032949546947, + "learning_rate": 4.7384315260456166e-06, + "loss": 2.7286, + "step": 42251 + }, + { + "epoch": 2.6228816189707618, + "grad_norm": 0.14075173521623818, + "learning_rate": 4.736897038826871e-06, + "loss": 2.6245, + "step": 42252 + }, + { + "epoch": 2.6229436960705197, + "grad_norm": 0.1284163949964279, + "learning_rate": 4.735362787756714e-06, + "loss": 2.6551, + "step": 42253 + }, + { + "epoch": 2.6230057731702776, + "grad_norm": 0.1363838999857236, + "learning_rate": 4.733828772843174e-06, + "loss": 2.7125, + "step": 42254 + }, + { + "epoch": 2.6230678502700355, + "grad_norm": 0.13545380290991924, + "learning_rate": 4.732294994094233e-06, + "loss": 2.6525, + "step": 42255 + }, + { + "epoch": 2.6231299273697934, + "grad_norm": 0.14016231608792515, + "learning_rate": 4.730761451517901e-06, + "loss": 2.7125, + "step": 42256 + }, + { + "epoch": 2.6231920044695514, + "grad_norm": 0.13740053480702227, + "learning_rate": 4.729228145122166e-06, + "loss": 2.777, + "step": 42257 + }, + { + "epoch": 2.623254081569309, + "grad_norm": 0.14193141271538268, + "learning_rate": 4.727695074915045e-06, + "loss": 2.7259, + "step": 42258 + }, + { + "epoch": 2.623316158669067, + "grad_norm": 0.1328685375810534, + "learning_rate": 4.72616224090453e-06, + "loss": 2.7758, + "step": 42259 + }, + { + "epoch": 2.6233782357688247, + "grad_norm": 0.1374419363384858, + "learning_rate": 4.7246296430986155e-06, + "loss": 2.6384, + "step": 42260 + }, + { + "epoch": 2.623440312868583, + "grad_norm": 0.14259581913304784, + "learning_rate": 4.723097281505295e-06, + "loss": 2.6547, + "step": 42261 + }, + { + "epoch": 2.6235023899683405, + "grad_norm": 0.14120917708274372, + "learning_rate": 4.721565156132573e-06, + "loss": 2.6843, + "step": 42262 + }, + { + "epoch": 2.6235644670680984, + "grad_norm": 0.1587799646790647, + "learning_rate": 4.720033266988438e-06, + "loss": 2.7207, + "step": 42263 + }, + { + "epoch": 2.6236265441678563, + "grad_norm": 0.16615794196495123, + "learning_rate": 4.718501614080883e-06, + "loss": 2.6816, + "step": 42264 + }, + { + "epoch": 2.6236886212676143, + "grad_norm": 0.13176948143965425, + "learning_rate": 4.7169701974178914e-06, + "loss": 2.6679, + "step": 42265 + }, + { + "epoch": 2.623750698367372, + "grad_norm": 0.14342946799858997, + "learning_rate": 4.715439017007456e-06, + "loss": 2.6566, + "step": 42266 + }, + { + "epoch": 2.62381277546713, + "grad_norm": 0.13118670220152923, + "learning_rate": 4.7139080728575755e-06, + "loss": 2.704, + "step": 42267 + }, + { + "epoch": 2.623874852566888, + "grad_norm": 0.1443574027384941, + "learning_rate": 4.712377364976234e-06, + "loss": 2.6742, + "step": 42268 + }, + { + "epoch": 2.623936929666646, + "grad_norm": 0.15529403909730088, + "learning_rate": 4.710846893371407e-06, + "loss": 2.6835, + "step": 42269 + }, + { + "epoch": 2.623999006766404, + "grad_norm": 0.14090307874784772, + "learning_rate": 4.709316658051083e-06, + "loss": 2.667, + "step": 42270 + }, + { + "epoch": 2.6240610838661618, + "grad_norm": 0.14606376023946574, + "learning_rate": 4.707786659023255e-06, + "loss": 2.6108, + "step": 42271 + }, + { + "epoch": 2.6241231609659197, + "grad_norm": 0.1444773831380027, + "learning_rate": 4.706256896295902e-06, + "loss": 2.7384, + "step": 42272 + }, + { + "epoch": 2.6241852380656776, + "grad_norm": 0.13020219470494826, + "learning_rate": 4.704727369876999e-06, + "loss": 2.6124, + "step": 42273 + }, + { + "epoch": 2.6242473151654355, + "grad_norm": 0.13316827692964842, + "learning_rate": 4.703198079774518e-06, + "loss": 2.6944, + "step": 42274 + }, + { + "epoch": 2.6243093922651934, + "grad_norm": 0.13430856515136144, + "learning_rate": 4.701669025996463e-06, + "loss": 2.6952, + "step": 42275 + }, + { + "epoch": 2.6243714693649514, + "grad_norm": 0.13351256625408925, + "learning_rate": 4.70014020855079e-06, + "loss": 2.7522, + "step": 42276 + }, + { + "epoch": 2.6244335464647093, + "grad_norm": 0.14731025576025025, + "learning_rate": 4.698611627445493e-06, + "loss": 2.7405, + "step": 42277 + }, + { + "epoch": 2.624495623564467, + "grad_norm": 0.13468090794666035, + "learning_rate": 4.697083282688531e-06, + "loss": 2.6271, + "step": 42278 + }, + { + "epoch": 2.624557700664225, + "grad_norm": 0.14371370085298896, + "learning_rate": 4.695555174287869e-06, + "loss": 2.6997, + "step": 42279 + }, + { + "epoch": 2.624619777763983, + "grad_norm": 0.13478219580014447, + "learning_rate": 4.694027302251508e-06, + "loss": 2.6757, + "step": 42280 + }, + { + "epoch": 2.6246818548637405, + "grad_norm": 0.15128098155767922, + "learning_rate": 4.692499666587408e-06, + "loss": 2.68, + "step": 42281 + }, + { + "epoch": 2.624743931963499, + "grad_norm": 0.1475897977543333, + "learning_rate": 4.690972267303528e-06, + "loss": 2.7335, + "step": 42282 + }, + { + "epoch": 2.6248060090632563, + "grad_norm": 0.13484987445020294, + "learning_rate": 4.689445104407841e-06, + "loss": 2.6761, + "step": 42283 + }, + { + "epoch": 2.6248680861630147, + "grad_norm": 0.14166868526292165, + "learning_rate": 4.687918177908329e-06, + "loss": 2.5922, + "step": 42284 + }, + { + "epoch": 2.624930163262772, + "grad_norm": 0.143438415410148, + "learning_rate": 4.6863914878129355e-06, + "loss": 2.7378, + "step": 42285 + }, + { + "epoch": 2.6249922403625305, + "grad_norm": 0.15510757152307816, + "learning_rate": 4.684865034129648e-06, + "loss": 2.6905, + "step": 42286 + }, + { + "epoch": 2.625054317462288, + "grad_norm": 0.13201045036129866, + "learning_rate": 4.683338816866417e-06, + "loss": 2.6512, + "step": 42287 + }, + { + "epoch": 2.625116394562046, + "grad_norm": 0.13666397087221885, + "learning_rate": 4.681812836031213e-06, + "loss": 2.6791, + "step": 42288 + }, + { + "epoch": 2.625178471661804, + "grad_norm": 0.1357627619787094, + "learning_rate": 4.6802870916319854e-06, + "loss": 2.6631, + "step": 42289 + }, + { + "epoch": 2.6252405487615618, + "grad_norm": 0.14186623928564693, + "learning_rate": 4.678761583676705e-06, + "loss": 2.7332, + "step": 42290 + }, + { + "epoch": 2.6253026258613197, + "grad_norm": 0.16188640082100073, + "learning_rate": 4.677236312173333e-06, + "loss": 2.6526, + "step": 42291 + }, + { + "epoch": 2.6253647029610776, + "grad_norm": 0.1351618263990572, + "learning_rate": 4.675711277129818e-06, + "loss": 2.6307, + "step": 42292 + }, + { + "epoch": 2.6254267800608355, + "grad_norm": 0.1484129659555273, + "learning_rate": 4.67418647855411e-06, + "loss": 2.7242, + "step": 42293 + }, + { + "epoch": 2.6254888571605934, + "grad_norm": 0.13300837770137489, + "learning_rate": 4.6726619164541855e-06, + "loss": 2.692, + "step": 42294 + }, + { + "epoch": 2.6255509342603514, + "grad_norm": 0.143341301602514, + "learning_rate": 4.671137590837987e-06, + "loss": 2.7044, + "step": 42295 + }, + { + "epoch": 2.6256130113601093, + "grad_norm": 0.15551789551707026, + "learning_rate": 4.669613501713471e-06, + "loss": 2.6428, + "step": 42296 + }, + { + "epoch": 2.625675088459867, + "grad_norm": 0.1389833364292429, + "learning_rate": 4.6680896490885694e-06, + "loss": 2.7338, + "step": 42297 + }, + { + "epoch": 2.625737165559625, + "grad_norm": 0.13323005784337907, + "learning_rate": 4.666566032971265e-06, + "loss": 2.5833, + "step": 42298 + }, + { + "epoch": 2.625799242659383, + "grad_norm": 0.13117678422357834, + "learning_rate": 4.665042653369489e-06, + "loss": 2.6443, + "step": 42299 + }, + { + "epoch": 2.625861319759141, + "grad_norm": 0.13409387712599294, + "learning_rate": 4.663519510291186e-06, + "loss": 2.7736, + "step": 42300 + }, + { + "epoch": 2.625923396858899, + "grad_norm": 0.1434739398275198, + "learning_rate": 4.661996603744312e-06, + "loss": 2.6835, + "step": 42301 + }, + { + "epoch": 2.625985473958657, + "grad_norm": 0.13959239288602932, + "learning_rate": 4.660473933736797e-06, + "loss": 2.6978, + "step": 42302 + }, + { + "epoch": 2.6260475510584147, + "grad_norm": 0.1449590036483713, + "learning_rate": 4.658951500276609e-06, + "loss": 2.6481, + "step": 42303 + }, + { + "epoch": 2.6261096281581726, + "grad_norm": 0.13102664505363112, + "learning_rate": 4.6574293033716744e-06, + "loss": 2.6795, + "step": 42304 + }, + { + "epoch": 2.6261717052579305, + "grad_norm": 0.14964976450292664, + "learning_rate": 4.655907343029936e-06, + "loss": 2.6882, + "step": 42305 + }, + { + "epoch": 2.626233782357688, + "grad_norm": 0.13306063724779996, + "learning_rate": 4.654385619259333e-06, + "loss": 2.7616, + "step": 42306 + }, + { + "epoch": 2.6262958594574464, + "grad_norm": 0.13575810400867924, + "learning_rate": 4.652864132067813e-06, + "loss": 2.6441, + "step": 42307 + }, + { + "epoch": 2.626357936557204, + "grad_norm": 0.13721278711375637, + "learning_rate": 4.651342881463311e-06, + "loss": 2.7087, + "step": 42308 + }, + { + "epoch": 2.626420013656962, + "grad_norm": 0.13239655154093685, + "learning_rate": 4.649821867453763e-06, + "loss": 2.67, + "step": 42309 + }, + { + "epoch": 2.6264820907567197, + "grad_norm": 0.14466058254119904, + "learning_rate": 4.648301090047097e-06, + "loss": 2.6876, + "step": 42310 + }, + { + "epoch": 2.6265441678564776, + "grad_norm": 0.135394708216355, + "learning_rate": 4.646780549251256e-06, + "loss": 2.7072, + "step": 42311 + }, + { + "epoch": 2.6266062449562355, + "grad_norm": 0.15498793042590261, + "learning_rate": 4.645260245074174e-06, + "loss": 2.6108, + "step": 42312 + }, + { + "epoch": 2.6266683220559934, + "grad_norm": 0.13642575339751892, + "learning_rate": 4.643740177523781e-06, + "loss": 2.6946, + "step": 42313 + }, + { + "epoch": 2.6267303991557513, + "grad_norm": 0.13313178643985812, + "learning_rate": 4.642220346608006e-06, + "loss": 2.7255, + "step": 42314 + }, + { + "epoch": 2.6267924762555093, + "grad_norm": 0.14280482128585106, + "learning_rate": 4.640700752334765e-06, + "loss": 2.7355, + "step": 42315 + }, + { + "epoch": 2.626854553355267, + "grad_norm": 0.14099962613972683, + "learning_rate": 4.639181394712011e-06, + "loss": 2.7768, + "step": 42316 + }, + { + "epoch": 2.626916630455025, + "grad_norm": 0.13184093183311626, + "learning_rate": 4.637662273747662e-06, + "loss": 2.7144, + "step": 42317 + }, + { + "epoch": 2.626978707554783, + "grad_norm": 0.13507988128270496, + "learning_rate": 4.636143389449632e-06, + "loss": 2.6779, + "step": 42318 + }, + { + "epoch": 2.627040784654541, + "grad_norm": 0.1420697791231394, + "learning_rate": 4.63462474182586e-06, + "loss": 2.744, + "step": 42319 + }, + { + "epoch": 2.627102861754299, + "grad_norm": 0.14364356578819573, + "learning_rate": 4.633106330884257e-06, + "loss": 2.7885, + "step": 42320 + }, + { + "epoch": 2.6271649388540568, + "grad_norm": 0.1408063527144073, + "learning_rate": 4.63158815663276e-06, + "loss": 2.624, + "step": 42321 + }, + { + "epoch": 2.6272270159538147, + "grad_norm": 0.13417283586224876, + "learning_rate": 4.63007021907928e-06, + "loss": 2.6777, + "step": 42322 + }, + { + "epoch": 2.6272890930535726, + "grad_norm": 0.13881391987123806, + "learning_rate": 4.628552518231738e-06, + "loss": 2.7265, + "step": 42323 + }, + { + "epoch": 2.6273511701533305, + "grad_norm": 0.13570171548729584, + "learning_rate": 4.627035054098044e-06, + "loss": 2.6506, + "step": 42324 + }, + { + "epoch": 2.6274132472530884, + "grad_norm": 0.16822468310740274, + "learning_rate": 4.6255178266861264e-06, + "loss": 2.7431, + "step": 42325 + }, + { + "epoch": 2.6274753243528464, + "grad_norm": 0.1282569325197254, + "learning_rate": 4.624000836003906e-06, + "loss": 2.5929, + "step": 42326 + }, + { + "epoch": 2.6275374014526043, + "grad_norm": 0.14921104126636067, + "learning_rate": 4.622484082059281e-06, + "loss": 2.6915, + "step": 42327 + }, + { + "epoch": 2.627599478552362, + "grad_norm": 0.14780105121058695, + "learning_rate": 4.620967564860179e-06, + "loss": 2.7287, + "step": 42328 + }, + { + "epoch": 2.6276615556521197, + "grad_norm": 0.14185442216506264, + "learning_rate": 4.619451284414494e-06, + "loss": 2.7218, + "step": 42329 + }, + { + "epoch": 2.627723632751878, + "grad_norm": 0.13749914873466543, + "learning_rate": 4.617935240730154e-06, + "loss": 2.678, + "step": 42330 + }, + { + "epoch": 2.6277857098516355, + "grad_norm": 0.16526365516724276, + "learning_rate": 4.616419433815067e-06, + "loss": 2.7495, + "step": 42331 + }, + { + "epoch": 2.627847786951394, + "grad_norm": 0.12962864264075896, + "learning_rate": 4.614903863677133e-06, + "loss": 2.6172, + "step": 42332 + }, + { + "epoch": 2.6279098640511513, + "grad_norm": 0.14306032604017319, + "learning_rate": 4.613388530324259e-06, + "loss": 2.6956, + "step": 42333 + }, + { + "epoch": 2.6279719411509093, + "grad_norm": 0.14621046524318096, + "learning_rate": 4.611873433764358e-06, + "loss": 2.7493, + "step": 42334 + }, + { + "epoch": 2.628034018250667, + "grad_norm": 0.13388921650477228, + "learning_rate": 4.610358574005335e-06, + "loss": 2.7645, + "step": 42335 + }, + { + "epoch": 2.628096095350425, + "grad_norm": 0.14937046137901624, + "learning_rate": 4.6088439510550916e-06, + "loss": 2.6868, + "step": 42336 + }, + { + "epoch": 2.628158172450183, + "grad_norm": 0.16543608093517792, + "learning_rate": 4.607329564921525e-06, + "loss": 2.7015, + "step": 42337 + }, + { + "epoch": 2.628220249549941, + "grad_norm": 0.1327901358568064, + "learning_rate": 4.605815415612536e-06, + "loss": 2.5867, + "step": 42338 + }, + { + "epoch": 2.628282326649699, + "grad_norm": 0.13014857987406903, + "learning_rate": 4.604301503136033e-06, + "loss": 2.6547, + "step": 42339 + }, + { + "epoch": 2.6283444037494568, + "grad_norm": 0.13720702446228228, + "learning_rate": 4.602787827499905e-06, + "loss": 2.7832, + "step": 42340 + }, + { + "epoch": 2.6284064808492147, + "grad_norm": 0.13508059142526743, + "learning_rate": 4.601274388712057e-06, + "loss": 2.6726, + "step": 42341 + }, + { + "epoch": 2.6284685579489726, + "grad_norm": 0.13197371030360877, + "learning_rate": 4.5997611867803765e-06, + "loss": 2.6012, + "step": 42342 + }, + { + "epoch": 2.6285306350487305, + "grad_norm": 0.13101709881376103, + "learning_rate": 4.5982482217127695e-06, + "loss": 2.7054, + "step": 42343 + }, + { + "epoch": 2.6285927121484884, + "grad_norm": 0.1339314322790495, + "learning_rate": 4.596735493517123e-06, + "loss": 2.6206, + "step": 42344 + }, + { + "epoch": 2.6286547892482464, + "grad_norm": 0.13203087981584108, + "learning_rate": 4.595223002201327e-06, + "loss": 2.6919, + "step": 42345 + }, + { + "epoch": 2.6287168663480043, + "grad_norm": 0.13597582874138475, + "learning_rate": 4.593710747773266e-06, + "loss": 2.6295, + "step": 42346 + }, + { + "epoch": 2.628778943447762, + "grad_norm": 0.13305531072389423, + "learning_rate": 4.592198730240854e-06, + "loss": 2.7696, + "step": 42347 + }, + { + "epoch": 2.62884102054752, + "grad_norm": 0.1331803053385455, + "learning_rate": 4.590686949611955e-06, + "loss": 2.7526, + "step": 42348 + }, + { + "epoch": 2.628903097647278, + "grad_norm": 0.13407504708811901, + "learning_rate": 4.589175405894475e-06, + "loss": 2.622, + "step": 42349 + }, + { + "epoch": 2.628965174747036, + "grad_norm": 0.13126332294873463, + "learning_rate": 4.5876640990962785e-06, + "loss": 2.6442, + "step": 42350 + }, + { + "epoch": 2.629027251846794, + "grad_norm": 0.15011375504718005, + "learning_rate": 4.586153029225271e-06, + "loss": 2.7376, + "step": 42351 + }, + { + "epoch": 2.6290893289465513, + "grad_norm": 0.13671139288589423, + "learning_rate": 4.584642196289318e-06, + "loss": 2.7474, + "step": 42352 + }, + { + "epoch": 2.6291514060463097, + "grad_norm": 0.13496771462810234, + "learning_rate": 4.58313160029632e-06, + "loss": 2.7104, + "step": 42353 + }, + { + "epoch": 2.629213483146067, + "grad_norm": 0.1337656641081049, + "learning_rate": 4.581621241254153e-06, + "loss": 2.6816, + "step": 42354 + }, + { + "epoch": 2.6292755602458255, + "grad_norm": 0.14739736866797248, + "learning_rate": 4.580111119170688e-06, + "loss": 2.7381, + "step": 42355 + }, + { + "epoch": 2.629337637345583, + "grad_norm": 0.1457077291543572, + "learning_rate": 4.578601234053803e-06, + "loss": 2.7416, + "step": 42356 + }, + { + "epoch": 2.6293997144453414, + "grad_norm": 0.15488038007021546, + "learning_rate": 4.577091585911391e-06, + "loss": 2.7564, + "step": 42357 + }, + { + "epoch": 2.629461791545099, + "grad_norm": 0.14465811677215662, + "learning_rate": 4.57558217475132e-06, + "loss": 2.6939, + "step": 42358 + }, + { + "epoch": 2.6295238686448568, + "grad_norm": 0.13177446415639715, + "learning_rate": 4.574073000581464e-06, + "loss": 2.7149, + "step": 42359 + }, + { + "epoch": 2.6295859457446147, + "grad_norm": 0.13739509861653018, + "learning_rate": 4.57256406340969e-06, + "loss": 2.6528, + "step": 42360 + }, + { + "epoch": 2.6296480228443726, + "grad_norm": 0.1400161250867194, + "learning_rate": 4.571055363243881e-06, + "loss": 2.6747, + "step": 42361 + }, + { + "epoch": 2.6297100999441305, + "grad_norm": 0.13617075359913558, + "learning_rate": 4.5695469000919085e-06, + "loss": 2.6673, + "step": 42362 + }, + { + "epoch": 2.6297721770438884, + "grad_norm": 0.1683484653646683, + "learning_rate": 4.568038673961639e-06, + "loss": 2.7147, + "step": 42363 + }, + { + "epoch": 2.6298342541436464, + "grad_norm": 0.13446527681069587, + "learning_rate": 4.566530684860937e-06, + "loss": 2.7154, + "step": 42364 + }, + { + "epoch": 2.6298963312434043, + "grad_norm": 0.15852897547401032, + "learning_rate": 4.565022932797664e-06, + "loss": 2.682, + "step": 42365 + }, + { + "epoch": 2.629958408343162, + "grad_norm": 0.13473382519363508, + "learning_rate": 4.5635154177797065e-06, + "loss": 2.698, + "step": 42366 + }, + { + "epoch": 2.63002048544292, + "grad_norm": 0.13227235122404954, + "learning_rate": 4.562008139814922e-06, + "loss": 2.7605, + "step": 42367 + }, + { + "epoch": 2.630082562542678, + "grad_norm": 0.1355115866554727, + "learning_rate": 4.560501098911169e-06, + "loss": 2.5947, + "step": 42368 + }, + { + "epoch": 2.630144639642436, + "grad_norm": 0.13626617045862544, + "learning_rate": 4.5589942950763076e-06, + "loss": 2.6622, + "step": 42369 + }, + { + "epoch": 2.630206716742194, + "grad_norm": 0.15893027887242306, + "learning_rate": 4.557487728318205e-06, + "loss": 2.7658, + "step": 42370 + }, + { + "epoch": 2.630268793841952, + "grad_norm": 0.13724927828769243, + "learning_rate": 4.5559813986447265e-06, + "loss": 2.7573, + "step": 42371 + }, + { + "epoch": 2.6303308709417097, + "grad_norm": 0.1691625499390816, + "learning_rate": 4.554475306063727e-06, + "loss": 2.6433, + "step": 42372 + }, + { + "epoch": 2.6303929480414676, + "grad_norm": 0.13391308709164296, + "learning_rate": 4.552969450583055e-06, + "loss": 2.6228, + "step": 42373 + }, + { + "epoch": 2.6304550251412255, + "grad_norm": 0.1332032891661228, + "learning_rate": 4.551463832210573e-06, + "loss": 2.6923, + "step": 42374 + }, + { + "epoch": 2.6305171022409835, + "grad_norm": 0.15556231802856543, + "learning_rate": 4.549958450954139e-06, + "loss": 2.656, + "step": 42375 + }, + { + "epoch": 2.6305791793407414, + "grad_norm": 0.1405130510305482, + "learning_rate": 4.54845330682161e-06, + "loss": 2.6008, + "step": 42376 + }, + { + "epoch": 2.630641256440499, + "grad_norm": 0.14405111516125624, + "learning_rate": 4.546948399820833e-06, + "loss": 2.6655, + "step": 42377 + }, + { + "epoch": 2.630703333540257, + "grad_norm": 0.13977125409189414, + "learning_rate": 4.545443729959648e-06, + "loss": 2.7174, + "step": 42378 + }, + { + "epoch": 2.6307654106400147, + "grad_norm": 0.139684499072583, + "learning_rate": 4.5439392972459305e-06, + "loss": 2.7485, + "step": 42379 + }, + { + "epoch": 2.630827487739773, + "grad_norm": 0.1414207158671801, + "learning_rate": 4.542435101687515e-06, + "loss": 2.6267, + "step": 42380 + }, + { + "epoch": 2.6308895648395305, + "grad_norm": 0.14338914577689263, + "learning_rate": 4.540931143292248e-06, + "loss": 2.6598, + "step": 42381 + }, + { + "epoch": 2.6309516419392884, + "grad_norm": 0.13794928321583028, + "learning_rate": 4.539427422067971e-06, + "loss": 2.6716, + "step": 42382 + }, + { + "epoch": 2.6310137190390464, + "grad_norm": 0.13918638818688198, + "learning_rate": 4.5379239380225425e-06, + "loss": 2.7549, + "step": 42383 + }, + { + "epoch": 2.6310757961388043, + "grad_norm": 0.15161309724964198, + "learning_rate": 4.536420691163806e-06, + "loss": 2.664, + "step": 42384 + }, + { + "epoch": 2.631137873238562, + "grad_norm": 0.14893555530930516, + "learning_rate": 4.534917681499601e-06, + "loss": 2.7519, + "step": 42385 + }, + { + "epoch": 2.63119995033832, + "grad_norm": 0.13446346908135723, + "learning_rate": 4.533414909037764e-06, + "loss": 2.7552, + "step": 42386 + }, + { + "epoch": 2.631262027438078, + "grad_norm": 0.15567186209213293, + "learning_rate": 4.53191237378614e-06, + "loss": 2.6205, + "step": 42387 + }, + { + "epoch": 2.631324104537836, + "grad_norm": 0.14052345662841395, + "learning_rate": 4.530410075752562e-06, + "loss": 2.6714, + "step": 42388 + }, + { + "epoch": 2.631386181637594, + "grad_norm": 0.14058409864278215, + "learning_rate": 4.528908014944877e-06, + "loss": 2.6981, + "step": 42389 + }, + { + "epoch": 2.6314482587373518, + "grad_norm": 0.14427993808230033, + "learning_rate": 4.52740619137092e-06, + "loss": 2.6393, + "step": 42390 + }, + { + "epoch": 2.6315103358371097, + "grad_norm": 0.1374259505499793, + "learning_rate": 4.525904605038522e-06, + "loss": 2.788, + "step": 42391 + }, + { + "epoch": 2.6315724129368676, + "grad_norm": 0.14838060116700538, + "learning_rate": 4.5244032559555115e-06, + "loss": 2.7164, + "step": 42392 + }, + { + "epoch": 2.6316344900366255, + "grad_norm": 0.13753230002716862, + "learning_rate": 4.522902144129737e-06, + "loss": 2.6976, + "step": 42393 + }, + { + "epoch": 2.6316965671363834, + "grad_norm": 0.137173767401815, + "learning_rate": 4.52140126956902e-06, + "loss": 2.7578, + "step": 42394 + }, + { + "epoch": 2.6317586442361414, + "grad_norm": 0.1386117670590066, + "learning_rate": 4.519900632281193e-06, + "loss": 2.7075, + "step": 42395 + }, + { + "epoch": 2.6318207213358993, + "grad_norm": 0.14344862179034268, + "learning_rate": 4.5184002322740785e-06, + "loss": 2.7512, + "step": 42396 + }, + { + "epoch": 2.631882798435657, + "grad_norm": 0.1703942135673894, + "learning_rate": 4.516900069555519e-06, + "loss": 2.6957, + "step": 42397 + }, + { + "epoch": 2.631944875535415, + "grad_norm": 0.1358017619193736, + "learning_rate": 4.515400144133336e-06, + "loss": 2.6028, + "step": 42398 + }, + { + "epoch": 2.632006952635173, + "grad_norm": 0.1436731796661764, + "learning_rate": 4.513900456015347e-06, + "loss": 2.6889, + "step": 42399 + }, + { + "epoch": 2.6320690297349305, + "grad_norm": 0.13744323057064647, + "learning_rate": 4.512401005209388e-06, + "loss": 2.7028, + "step": 42400 + }, + { + "epoch": 2.632131106834689, + "grad_norm": 0.13215336907972744, + "learning_rate": 4.51090179172326e-06, + "loss": 2.7931, + "step": 42401 + }, + { + "epoch": 2.6321931839344463, + "grad_norm": 0.13505429987174053, + "learning_rate": 4.509402815564812e-06, + "loss": 2.7057, + "step": 42402 + }, + { + "epoch": 2.6322552610342047, + "grad_norm": 0.13153765808577383, + "learning_rate": 4.507904076741853e-06, + "loss": 2.639, + "step": 42403 + }, + { + "epoch": 2.632317338133962, + "grad_norm": 0.13633792153385366, + "learning_rate": 4.506405575262202e-06, + "loss": 2.6767, + "step": 42404 + }, + { + "epoch": 2.6323794152337205, + "grad_norm": 0.13509536568052524, + "learning_rate": 4.504907311133666e-06, + "loss": 2.7117, + "step": 42405 + }, + { + "epoch": 2.632441492333478, + "grad_norm": 0.1336244349904868, + "learning_rate": 4.503409284364085e-06, + "loss": 2.7383, + "step": 42406 + }, + { + "epoch": 2.632503569433236, + "grad_norm": 0.16245196233063475, + "learning_rate": 4.501911494961264e-06, + "loss": 2.646, + "step": 42407 + }, + { + "epoch": 2.632565646532994, + "grad_norm": 0.13233531750518523, + "learning_rate": 4.500413942933013e-06, + "loss": 2.5893, + "step": 42408 + }, + { + "epoch": 2.6326277236327518, + "grad_norm": 0.14113715078832492, + "learning_rate": 4.4989166282871476e-06, + "loss": 2.6291, + "step": 42409 + }, + { + "epoch": 2.6326898007325097, + "grad_norm": 0.13377199969114667, + "learning_rate": 4.497419551031468e-06, + "loss": 2.665, + "step": 42410 + }, + { + "epoch": 2.6327518778322676, + "grad_norm": 0.13689787574456447, + "learning_rate": 4.4959227111738115e-06, + "loss": 2.6919, + "step": 42411 + }, + { + "epoch": 2.6328139549320255, + "grad_norm": 0.1320480900244649, + "learning_rate": 4.494426108721966e-06, + "loss": 2.6883, + "step": 42412 + }, + { + "epoch": 2.6328760320317834, + "grad_norm": 0.13359799629289282, + "learning_rate": 4.4929297436837544e-06, + "loss": 2.6603, + "step": 42413 + }, + { + "epoch": 2.6329381091315414, + "grad_norm": 0.13587929641560145, + "learning_rate": 4.491433616066959e-06, + "loss": 2.6839, + "step": 42414 + }, + { + "epoch": 2.6330001862312993, + "grad_norm": 0.14074679901546855, + "learning_rate": 4.489937725879417e-06, + "loss": 2.7057, + "step": 42415 + }, + { + "epoch": 2.633062263331057, + "grad_norm": 0.13221013686964406, + "learning_rate": 4.488442073128907e-06, + "loss": 2.6993, + "step": 42416 + }, + { + "epoch": 2.633124340430815, + "grad_norm": 0.13597873961062742, + "learning_rate": 4.486946657823249e-06, + "loss": 2.7284, + "step": 42417 + }, + { + "epoch": 2.633186417530573, + "grad_norm": 0.14651234677595346, + "learning_rate": 4.485451479970243e-06, + "loss": 2.7999, + "step": 42418 + }, + { + "epoch": 2.633248494630331, + "grad_norm": 0.13436569774943422, + "learning_rate": 4.483956539577672e-06, + "loss": 2.6787, + "step": 42419 + }, + { + "epoch": 2.633310571730089, + "grad_norm": 0.13687479786661977, + "learning_rate": 4.482461836653362e-06, + "loss": 2.7378, + "step": 42420 + }, + { + "epoch": 2.633372648829847, + "grad_norm": 0.14306820943171258, + "learning_rate": 4.480967371205097e-06, + "loss": 2.713, + "step": 42421 + }, + { + "epoch": 2.6334347259296047, + "grad_norm": 0.13768055342987634, + "learning_rate": 4.479473143240676e-06, + "loss": 2.616, + "step": 42422 + }, + { + "epoch": 2.6334968030293626, + "grad_norm": 0.13566887199780178, + "learning_rate": 4.477979152767897e-06, + "loss": 2.7284, + "step": 42423 + }, + { + "epoch": 2.6335588801291205, + "grad_norm": 0.14058413839076725, + "learning_rate": 4.476485399794539e-06, + "loss": 2.7738, + "step": 42424 + }, + { + "epoch": 2.633620957228878, + "grad_norm": 0.14725831618246407, + "learning_rate": 4.474991884328417e-06, + "loss": 2.7858, + "step": 42425 + }, + { + "epoch": 2.6336830343286364, + "grad_norm": 0.14378143749419675, + "learning_rate": 4.473498606377313e-06, + "loss": 2.7629, + "step": 42426 + }, + { + "epoch": 2.633745111428394, + "grad_norm": 0.13920910292359132, + "learning_rate": 4.472005565949022e-06, + "loss": 2.6405, + "step": 42427 + }, + { + "epoch": 2.633807188528152, + "grad_norm": 0.13353442581465746, + "learning_rate": 4.470512763051321e-06, + "loss": 2.7056, + "step": 42428 + }, + { + "epoch": 2.6338692656279097, + "grad_norm": 0.1394439346960945, + "learning_rate": 4.469020197692014e-06, + "loss": 2.6764, + "step": 42429 + }, + { + "epoch": 2.6339313427276676, + "grad_norm": 0.14126659103871667, + "learning_rate": 4.467527869878885e-06, + "loss": 2.6847, + "step": 42430 + }, + { + "epoch": 2.6339934198274255, + "grad_norm": 0.13815078076174084, + "learning_rate": 4.466035779619709e-06, + "loss": 2.7331, + "step": 42431 + }, + { + "epoch": 2.6340554969271834, + "grad_norm": 0.13482594176861296, + "learning_rate": 4.464543926922276e-06, + "loss": 2.7956, + "step": 42432 + }, + { + "epoch": 2.6341175740269414, + "grad_norm": 0.14031819147236108, + "learning_rate": 4.46305231179438e-06, + "loss": 2.7452, + "step": 42433 + }, + { + "epoch": 2.6341796511266993, + "grad_norm": 0.13325235514232453, + "learning_rate": 4.461560934243791e-06, + "loss": 2.7431, + "step": 42434 + }, + { + "epoch": 2.634241728226457, + "grad_norm": 0.13339916552991465, + "learning_rate": 4.460069794278299e-06, + "loss": 2.7218, + "step": 42435 + }, + { + "epoch": 2.634303805326215, + "grad_norm": 0.13226986574651325, + "learning_rate": 4.458578891905674e-06, + "loss": 2.6831, + "step": 42436 + }, + { + "epoch": 2.634365882425973, + "grad_norm": 0.1320961397738634, + "learning_rate": 4.457088227133688e-06, + "loss": 2.7314, + "step": 42437 + }, + { + "epoch": 2.634427959525731, + "grad_norm": 0.1334410128923908, + "learning_rate": 4.45559779997014e-06, + "loss": 2.5398, + "step": 42438 + }, + { + "epoch": 2.634490036625489, + "grad_norm": 0.16788662521830822, + "learning_rate": 4.454107610422797e-06, + "loss": 2.7267, + "step": 42439 + }, + { + "epoch": 2.634552113725247, + "grad_norm": 0.14082887890668006, + "learning_rate": 4.452617658499425e-06, + "loss": 2.7158, + "step": 42440 + }, + { + "epoch": 2.6346141908250047, + "grad_norm": 0.1342139701414031, + "learning_rate": 4.4511279442078e-06, + "loss": 2.7193, + "step": 42441 + }, + { + "epoch": 2.6346762679247626, + "grad_norm": 0.13402954124354027, + "learning_rate": 4.449638467555706e-06, + "loss": 2.7289, + "step": 42442 + }, + { + "epoch": 2.6347383450245205, + "grad_norm": 0.1334736789171961, + "learning_rate": 4.448149228550908e-06, + "loss": 2.7673, + "step": 42443 + }, + { + "epoch": 2.6348004221242785, + "grad_norm": 0.15206729429293828, + "learning_rate": 4.446660227201166e-06, + "loss": 2.6075, + "step": 42444 + }, + { + "epoch": 2.6348624992240364, + "grad_norm": 0.1328480616811811, + "learning_rate": 4.445171463514253e-06, + "loss": 2.754, + "step": 42445 + }, + { + "epoch": 2.6349245763237943, + "grad_norm": 0.148340431900315, + "learning_rate": 4.443682937497951e-06, + "loss": 2.637, + "step": 42446 + }, + { + "epoch": 2.634986653423552, + "grad_norm": 0.1559543554450308, + "learning_rate": 4.442194649160009e-06, + "loss": 2.601, + "step": 42447 + }, + { + "epoch": 2.6350487305233097, + "grad_norm": 0.139586494953257, + "learning_rate": 4.440706598508199e-06, + "loss": 2.733, + "step": 42448 + }, + { + "epoch": 2.635110807623068, + "grad_norm": 0.13795201742289523, + "learning_rate": 4.4392187855502695e-06, + "loss": 2.7757, + "step": 42449 + }, + { + "epoch": 2.6351728847228255, + "grad_norm": 0.15866232385443024, + "learning_rate": 4.4377312102940104e-06, + "loss": 2.7225, + "step": 42450 + }, + { + "epoch": 2.635234961822584, + "grad_norm": 0.1316310986657412, + "learning_rate": 4.436243872747159e-06, + "loss": 2.6916, + "step": 42451 + }, + { + "epoch": 2.6352970389223414, + "grad_norm": 0.13697367471000546, + "learning_rate": 4.434756772917486e-06, + "loss": 2.6672, + "step": 42452 + }, + { + "epoch": 2.6353591160220997, + "grad_norm": 0.1354317511507758, + "learning_rate": 4.433269910812759e-06, + "loss": 2.6333, + "step": 42453 + }, + { + "epoch": 2.635421193121857, + "grad_norm": 0.1453313525593786, + "learning_rate": 4.431783286440716e-06, + "loss": 2.682, + "step": 42454 + }, + { + "epoch": 2.635483270221615, + "grad_norm": 0.1429793382590634, + "learning_rate": 4.430296899809117e-06, + "loss": 2.7858, + "step": 42455 + }, + { + "epoch": 2.635545347321373, + "grad_norm": 0.13449986140384057, + "learning_rate": 4.4288107509257326e-06, + "loss": 2.6798, + "step": 42456 + }, + { + "epoch": 2.635607424421131, + "grad_norm": 0.14554786770861397, + "learning_rate": 4.427324839798297e-06, + "loss": 2.7813, + "step": 42457 + }, + { + "epoch": 2.635669501520889, + "grad_norm": 0.1333068035847345, + "learning_rate": 4.425839166434581e-06, + "loss": 2.5824, + "step": 42458 + }, + { + "epoch": 2.6357315786206468, + "grad_norm": 0.14663232325618053, + "learning_rate": 4.4243537308423175e-06, + "loss": 2.6774, + "step": 42459 + }, + { + "epoch": 2.6357936557204047, + "grad_norm": 0.13315792603367876, + "learning_rate": 4.422868533029262e-06, + "loss": 2.7289, + "step": 42460 + }, + { + "epoch": 2.6358557328201626, + "grad_norm": 0.1335026745684556, + "learning_rate": 4.421383573003168e-06, + "loss": 2.8885, + "step": 42461 + }, + { + "epoch": 2.6359178099199205, + "grad_norm": 0.13272939915168402, + "learning_rate": 4.4198988507717864e-06, + "loss": 2.6999, + "step": 42462 + }, + { + "epoch": 2.6359798870196784, + "grad_norm": 0.1325511675342733, + "learning_rate": 4.418414366342855e-06, + "loss": 2.6465, + "step": 42463 + }, + { + "epoch": 2.6360419641194364, + "grad_norm": 0.1352030799748381, + "learning_rate": 4.416930119724111e-06, + "loss": 2.7043, + "step": 42464 + }, + { + "epoch": 2.6361040412191943, + "grad_norm": 0.13504577625546668, + "learning_rate": 4.415446110923316e-06, + "loss": 2.7277, + "step": 42465 + }, + { + "epoch": 2.636166118318952, + "grad_norm": 0.13189460720498988, + "learning_rate": 4.413962339948208e-06, + "loss": 2.6032, + "step": 42466 + }, + { + "epoch": 2.63622819541871, + "grad_norm": 0.13626528879445018, + "learning_rate": 4.4124788068065245e-06, + "loss": 2.775, + "step": 42467 + }, + { + "epoch": 2.636290272518468, + "grad_norm": 0.1296679361363166, + "learning_rate": 4.410995511505994e-06, + "loss": 2.7798, + "step": 42468 + }, + { + "epoch": 2.636352349618226, + "grad_norm": 0.1366125852649844, + "learning_rate": 4.409512454054382e-06, + "loss": 2.7641, + "step": 42469 + }, + { + "epoch": 2.636414426717984, + "grad_norm": 0.14365970896103508, + "learning_rate": 4.408029634459404e-06, + "loss": 2.7736, + "step": 42470 + }, + { + "epoch": 2.636476503817742, + "grad_norm": 0.13457535013145863, + "learning_rate": 4.406547052728804e-06, + "loss": 2.6805, + "step": 42471 + }, + { + "epoch": 2.6365385809174997, + "grad_norm": 0.14828177490181668, + "learning_rate": 4.405064708870321e-06, + "loss": 2.722, + "step": 42472 + }, + { + "epoch": 2.636600658017257, + "grad_norm": 0.13408245856209364, + "learning_rate": 4.403582602891671e-06, + "loss": 2.7815, + "step": 42473 + }, + { + "epoch": 2.6366627351170155, + "grad_norm": 0.13464188077548414, + "learning_rate": 4.402100734800613e-06, + "loss": 2.7475, + "step": 42474 + }, + { + "epoch": 2.636724812216773, + "grad_norm": 0.14392269091343285, + "learning_rate": 4.400619104604858e-06, + "loss": 2.7394, + "step": 42475 + }, + { + "epoch": 2.6367868893165314, + "grad_norm": 0.13276318027058787, + "learning_rate": 4.3991377123121515e-06, + "loss": 2.6965, + "step": 42476 + }, + { + "epoch": 2.636848966416289, + "grad_norm": 0.13135572500864964, + "learning_rate": 4.3976565579301965e-06, + "loss": 2.6517, + "step": 42477 + }, + { + "epoch": 2.6369110435160468, + "grad_norm": 0.1345728518242287, + "learning_rate": 4.396175641466749e-06, + "loss": 2.6597, + "step": 42478 + }, + { + "epoch": 2.6369731206158047, + "grad_norm": 0.14479385824283011, + "learning_rate": 4.394694962929524e-06, + "loss": 2.7257, + "step": 42479 + }, + { + "epoch": 2.6370351977155626, + "grad_norm": 0.13467747113269402, + "learning_rate": 4.39321452232625e-06, + "loss": 2.725, + "step": 42480 + }, + { + "epoch": 2.6370972748153205, + "grad_norm": 0.13593849559945514, + "learning_rate": 4.391734319664637e-06, + "loss": 2.6613, + "step": 42481 + }, + { + "epoch": 2.6371593519150784, + "grad_norm": 0.1363596491722934, + "learning_rate": 4.390254354952411e-06, + "loss": 2.7283, + "step": 42482 + }, + { + "epoch": 2.6372214290148364, + "grad_norm": 0.14126162008577386, + "learning_rate": 4.388774628197317e-06, + "loss": 2.7567, + "step": 42483 + }, + { + "epoch": 2.6372835061145943, + "grad_norm": 0.13370348985120026, + "learning_rate": 4.387295139407055e-06, + "loss": 2.7029, + "step": 42484 + }, + { + "epoch": 2.637345583214352, + "grad_norm": 0.1402585234610464, + "learning_rate": 4.385815888589345e-06, + "loss": 2.7572, + "step": 42485 + }, + { + "epoch": 2.63740766031411, + "grad_norm": 0.13974476539732228, + "learning_rate": 4.38433687575191e-06, + "loss": 2.6811, + "step": 42486 + }, + { + "epoch": 2.637469737413868, + "grad_norm": 0.15610789154763663, + "learning_rate": 4.3828581009024535e-06, + "loss": 2.752, + "step": 42487 + }, + { + "epoch": 2.637531814513626, + "grad_norm": 0.14188870223084393, + "learning_rate": 4.3813795640487096e-06, + "loss": 2.7268, + "step": 42488 + }, + { + "epoch": 2.637593891613384, + "grad_norm": 0.13703345454670815, + "learning_rate": 4.379901265198383e-06, + "loss": 2.7617, + "step": 42489 + }, + { + "epoch": 2.637655968713142, + "grad_norm": 0.15852288908692258, + "learning_rate": 4.378423204359189e-06, + "loss": 2.6962, + "step": 42490 + }, + { + "epoch": 2.6377180458128997, + "grad_norm": 0.13743772152938788, + "learning_rate": 4.376945381538822e-06, + "loss": 2.6918, + "step": 42491 + }, + { + "epoch": 2.6377801229126576, + "grad_norm": 0.1441166576111683, + "learning_rate": 4.375467796745014e-06, + "loss": 2.7669, + "step": 42492 + }, + { + "epoch": 2.6378422000124155, + "grad_norm": 0.1505668771558022, + "learning_rate": 4.3739904499854715e-06, + "loss": 2.67, + "step": 42493 + }, + { + "epoch": 2.6379042771121735, + "grad_norm": 0.1419946148230699, + "learning_rate": 4.372513341267898e-06, + "loss": 2.7042, + "step": 42494 + }, + { + "epoch": 2.6379663542119314, + "grad_norm": 0.14364575073340322, + "learning_rate": 4.371036470599982e-06, + "loss": 2.6369, + "step": 42495 + }, + { + "epoch": 2.638028431311689, + "grad_norm": 0.16607601525946603, + "learning_rate": 4.369559837989462e-06, + "loss": 2.7173, + "step": 42496 + }, + { + "epoch": 2.638090508411447, + "grad_norm": 0.12954014915524495, + "learning_rate": 4.368083443444021e-06, + "loss": 2.6651, + "step": 42497 + }, + { + "epoch": 2.6381525855112047, + "grad_norm": 0.14418343629306787, + "learning_rate": 4.366607286971369e-06, + "loss": 2.6491, + "step": 42498 + }, + { + "epoch": 2.638214662610963, + "grad_norm": 0.13413257771360973, + "learning_rate": 4.3651313685792064e-06, + "loss": 2.6811, + "step": 42499 + }, + { + "epoch": 2.6382767397107205, + "grad_norm": 0.13448311731098994, + "learning_rate": 4.36365568827522e-06, + "loss": 2.6437, + "step": 42500 + }, + { + "epoch": 2.638338816810479, + "grad_norm": 0.13615800441104564, + "learning_rate": 4.3621802460671266e-06, + "loss": 2.7694, + "step": 42501 + }, + { + "epoch": 2.6384008939102364, + "grad_norm": 0.1356125425185725, + "learning_rate": 4.36070504196262e-06, + "loss": 2.7142, + "step": 42502 + }, + { + "epoch": 2.6384629710099943, + "grad_norm": 0.14356804480658264, + "learning_rate": 4.359230075969395e-06, + "loss": 2.7219, + "step": 42503 + }, + { + "epoch": 2.638525048109752, + "grad_norm": 0.1331572406081098, + "learning_rate": 4.357755348095138e-06, + "loss": 2.6824, + "step": 42504 + }, + { + "epoch": 2.63858712520951, + "grad_norm": 0.13274525594989947, + "learning_rate": 4.356280858347561e-06, + "loss": 2.6298, + "step": 42505 + }, + { + "epoch": 2.638649202309268, + "grad_norm": 0.14186336387692583, + "learning_rate": 4.354806606734346e-06, + "loss": 2.758, + "step": 42506 + }, + { + "epoch": 2.638711279409026, + "grad_norm": 0.13157170271457536, + "learning_rate": 4.353332593263188e-06, + "loss": 2.6373, + "step": 42507 + }, + { + "epoch": 2.638773356508784, + "grad_norm": 0.13430905747872365, + "learning_rate": 4.3518588179417674e-06, + "loss": 2.7057, + "step": 42508 + }, + { + "epoch": 2.638835433608542, + "grad_norm": 0.1423514406155445, + "learning_rate": 4.35038528077778e-06, + "loss": 2.6763, + "step": 42509 + }, + { + "epoch": 2.6388975107082997, + "grad_norm": 0.1520576112912279, + "learning_rate": 4.348911981778919e-06, + "loss": 2.7207, + "step": 42510 + }, + { + "epoch": 2.6389595878080576, + "grad_norm": 0.1373187930158746, + "learning_rate": 4.347438920952868e-06, + "loss": 2.6908, + "step": 42511 + }, + { + "epoch": 2.6390216649078155, + "grad_norm": 0.13574857490934883, + "learning_rate": 4.345966098307314e-06, + "loss": 2.7296, + "step": 42512 + }, + { + "epoch": 2.6390837420075735, + "grad_norm": 0.130518133845147, + "learning_rate": 4.344493513849923e-06, + "loss": 2.6306, + "step": 42513 + }, + { + "epoch": 2.6391458191073314, + "grad_norm": 0.1373990232694085, + "learning_rate": 4.343021167588396e-06, + "loss": 2.5886, + "step": 42514 + }, + { + "epoch": 2.6392078962070893, + "grad_norm": 0.13396884505684756, + "learning_rate": 4.34154905953042e-06, + "loss": 2.6366, + "step": 42515 + }, + { + "epoch": 2.639269973306847, + "grad_norm": 0.136562673854499, + "learning_rate": 4.340077189683667e-06, + "loss": 2.6673, + "step": 42516 + }, + { + "epoch": 2.639332050406605, + "grad_norm": 0.1400631568643061, + "learning_rate": 4.338605558055814e-06, + "loss": 2.7346, + "step": 42517 + }, + { + "epoch": 2.639394127506363, + "grad_norm": 0.14270586013971248, + "learning_rate": 4.337134164654527e-06, + "loss": 2.6285, + "step": 42518 + }, + { + "epoch": 2.639456204606121, + "grad_norm": 0.15029790422789313, + "learning_rate": 4.335663009487511e-06, + "loss": 2.661, + "step": 42519 + }, + { + "epoch": 2.639518281705879, + "grad_norm": 0.13256630799337957, + "learning_rate": 4.334192092562428e-06, + "loss": 2.6829, + "step": 42520 + }, + { + "epoch": 2.6395803588056364, + "grad_norm": 0.1343098687748058, + "learning_rate": 4.332721413886947e-06, + "loss": 2.6117, + "step": 42521 + }, + { + "epoch": 2.6396424359053947, + "grad_norm": 0.13938905061367837, + "learning_rate": 4.3312509734687475e-06, + "loss": 2.7721, + "step": 42522 + }, + { + "epoch": 2.639704513005152, + "grad_norm": 0.13251537156235424, + "learning_rate": 4.329780771315484e-06, + "loss": 2.6679, + "step": 42523 + }, + { + "epoch": 2.6397665901049105, + "grad_norm": 0.13523252435230004, + "learning_rate": 4.3283108074348545e-06, + "loss": 2.7051, + "step": 42524 + }, + { + "epoch": 2.639828667204668, + "grad_norm": 0.12918370904190932, + "learning_rate": 4.326841081834515e-06, + "loss": 2.732, + "step": 42525 + }, + { + "epoch": 2.639890744304426, + "grad_norm": 0.13195214271744435, + "learning_rate": 4.325371594522132e-06, + "loss": 2.6477, + "step": 42526 + }, + { + "epoch": 2.639952821404184, + "grad_norm": 0.13394691733319194, + "learning_rate": 4.32390234550536e-06, + "loss": 2.6539, + "step": 42527 + }, + { + "epoch": 2.6400148985039418, + "grad_norm": 0.15660454341632846, + "learning_rate": 4.322433334791892e-06, + "loss": 2.7553, + "step": 42528 + }, + { + "epoch": 2.6400769756036997, + "grad_norm": 0.14270937117549592, + "learning_rate": 4.320964562389379e-06, + "loss": 2.7118, + "step": 42529 + }, + { + "epoch": 2.6401390527034576, + "grad_norm": 0.15409875173386514, + "learning_rate": 4.319496028305481e-06, + "loss": 2.6774, + "step": 42530 + }, + { + "epoch": 2.6402011298032155, + "grad_norm": 0.14068458539832263, + "learning_rate": 4.318027732547847e-06, + "loss": 2.696, + "step": 42531 + }, + { + "epoch": 2.6402632069029734, + "grad_norm": 0.13632841246828742, + "learning_rate": 4.3165596751241655e-06, + "loss": 2.7426, + "step": 42532 + }, + { + "epoch": 2.6403252840027314, + "grad_norm": 0.1430922995821008, + "learning_rate": 4.315091856042086e-06, + "loss": 2.7069, + "step": 42533 + }, + { + "epoch": 2.6403873611024893, + "grad_norm": 0.1358378505491994, + "learning_rate": 4.313624275309258e-06, + "loss": 2.6493, + "step": 42534 + }, + { + "epoch": 2.640449438202247, + "grad_norm": 0.1353649275925669, + "learning_rate": 4.312156932933342e-06, + "loss": 2.7433, + "step": 42535 + }, + { + "epoch": 2.640511515302005, + "grad_norm": 0.17298940776429297, + "learning_rate": 4.310689828921988e-06, + "loss": 2.6993, + "step": 42536 + }, + { + "epoch": 2.640573592401763, + "grad_norm": 0.1425576260606122, + "learning_rate": 4.309222963282866e-06, + "loss": 2.8155, + "step": 42537 + }, + { + "epoch": 2.640635669501521, + "grad_norm": 0.13591120509440602, + "learning_rate": 4.307756336023622e-06, + "loss": 2.6962, + "step": 42538 + }, + { + "epoch": 2.640697746601279, + "grad_norm": 0.13897055710007258, + "learning_rate": 4.306289947151898e-06, + "loss": 2.7623, + "step": 42539 + }, + { + "epoch": 2.640759823701037, + "grad_norm": 0.15029848669877208, + "learning_rate": 4.3048237966753506e-06, + "loss": 2.7397, + "step": 42540 + }, + { + "epoch": 2.6408219008007947, + "grad_norm": 0.14094770016098093, + "learning_rate": 4.303357884601633e-06, + "loss": 2.7131, + "step": 42541 + }, + { + "epoch": 2.6408839779005526, + "grad_norm": 0.1462671258366696, + "learning_rate": 4.301892210938396e-06, + "loss": 2.8077, + "step": 42542 + }, + { + "epoch": 2.6409460550003105, + "grad_norm": 0.1388737391282432, + "learning_rate": 4.3004267756932784e-06, + "loss": 2.73, + "step": 42543 + }, + { + "epoch": 2.641008132100068, + "grad_norm": 0.1367091243625188, + "learning_rate": 4.298961578873928e-06, + "loss": 2.7344, + "step": 42544 + }, + { + "epoch": 2.6410702091998264, + "grad_norm": 0.1345543103053597, + "learning_rate": 4.297496620487978e-06, + "loss": 2.6872, + "step": 42545 + }, + { + "epoch": 2.641132286299584, + "grad_norm": 0.13642721427978574, + "learning_rate": 4.296031900543096e-06, + "loss": 2.617, + "step": 42546 + }, + { + "epoch": 2.641194363399342, + "grad_norm": 0.14672447237102546, + "learning_rate": 4.294567419046902e-06, + "loss": 2.6772, + "step": 42547 + }, + { + "epoch": 2.6412564404990997, + "grad_norm": 0.14370950149104858, + "learning_rate": 4.293103176007052e-06, + "loss": 2.6584, + "step": 42548 + }, + { + "epoch": 2.641318517598858, + "grad_norm": 0.13228993129968716, + "learning_rate": 4.2916391714311775e-06, + "loss": 2.6768, + "step": 42549 + }, + { + "epoch": 2.6413805946986155, + "grad_norm": 0.1308175845814296, + "learning_rate": 4.290175405326907e-06, + "loss": 2.6531, + "step": 42550 + }, + { + "epoch": 2.6414426717983734, + "grad_norm": 0.1333489732646821, + "learning_rate": 4.2887118777019005e-06, + "loss": 2.6866, + "step": 42551 + }, + { + "epoch": 2.6415047488981314, + "grad_norm": 0.1398678569687122, + "learning_rate": 4.28724858856378e-06, + "loss": 2.7195, + "step": 42552 + }, + { + "epoch": 2.6415668259978893, + "grad_norm": 0.14099728129476385, + "learning_rate": 4.285785537920184e-06, + "loss": 2.6944, + "step": 42553 + }, + { + "epoch": 2.641628903097647, + "grad_norm": 0.16574037397598565, + "learning_rate": 4.284322725778727e-06, + "loss": 2.7046, + "step": 42554 + }, + { + "epoch": 2.641690980197405, + "grad_norm": 0.13287378047417567, + "learning_rate": 4.2828601521470724e-06, + "loss": 2.6718, + "step": 42555 + }, + { + "epoch": 2.641753057297163, + "grad_norm": 0.14306597009794134, + "learning_rate": 4.281397817032829e-06, + "loss": 2.7322, + "step": 42556 + }, + { + "epoch": 2.641815134396921, + "grad_norm": 0.14189867222329736, + "learning_rate": 4.279935720443634e-06, + "loss": 2.6904, + "step": 42557 + }, + { + "epoch": 2.641877211496679, + "grad_norm": 0.13445866220388908, + "learning_rate": 4.278473862387111e-06, + "loss": 2.7908, + "step": 42558 + }, + { + "epoch": 2.641939288596437, + "grad_norm": 0.13482601084451362, + "learning_rate": 4.277012242870881e-06, + "loss": 2.7874, + "step": 42559 + }, + { + "epoch": 2.6420013656961947, + "grad_norm": 0.13229916043220602, + "learning_rate": 4.275550861902589e-06, + "loss": 2.7347, + "step": 42560 + }, + { + "epoch": 2.6420634427959526, + "grad_norm": 0.136579810797754, + "learning_rate": 4.274089719489843e-06, + "loss": 2.803, + "step": 42561 + }, + { + "epoch": 2.6421255198957105, + "grad_norm": 0.1438073898901255, + "learning_rate": 4.2726288156402786e-06, + "loss": 2.7164, + "step": 42562 + }, + { + "epoch": 2.6421875969954685, + "grad_norm": 0.1346858590174323, + "learning_rate": 4.271168150361499e-06, + "loss": 2.6915, + "step": 42563 + }, + { + "epoch": 2.6422496740952264, + "grad_norm": 0.13172390734258643, + "learning_rate": 4.269707723661142e-06, + "loss": 2.6838, + "step": 42564 + }, + { + "epoch": 2.6423117511949843, + "grad_norm": 0.13588938519273147, + "learning_rate": 4.268247535546821e-06, + "loss": 2.7294, + "step": 42565 + }, + { + "epoch": 2.642373828294742, + "grad_norm": 0.14007965281728268, + "learning_rate": 4.2667875860261556e-06, + "loss": 2.7031, + "step": 42566 + }, + { + "epoch": 2.6424359053945, + "grad_norm": 0.14106490946187872, + "learning_rate": 4.265327875106756e-06, + "loss": 2.7016, + "step": 42567 + }, + { + "epoch": 2.642497982494258, + "grad_norm": 0.162107421679391, + "learning_rate": 4.26386840279625e-06, + "loss": 2.7633, + "step": 42568 + }, + { + "epoch": 2.6425600595940155, + "grad_norm": 0.17802891063576096, + "learning_rate": 4.262409169102244e-06, + "loss": 2.827, + "step": 42569 + }, + { + "epoch": 2.642622136693774, + "grad_norm": 0.1421079525763662, + "learning_rate": 4.260950174032352e-06, + "loss": 2.7677, + "step": 42570 + }, + { + "epoch": 2.6426842137935314, + "grad_norm": 0.14304764400006717, + "learning_rate": 4.259491417594191e-06, + "loss": 2.7093, + "step": 42571 + }, + { + "epoch": 2.6427462908932897, + "grad_norm": 0.13425978112241138, + "learning_rate": 4.258032899795355e-06, + "loss": 2.8403, + "step": 42572 + }, + { + "epoch": 2.642808367993047, + "grad_norm": 0.14734689458361613, + "learning_rate": 4.256574620643472e-06, + "loss": 2.73, + "step": 42573 + }, + { + "epoch": 2.642870445092805, + "grad_norm": 0.13103010654883065, + "learning_rate": 4.255116580146151e-06, + "loss": 2.6717, + "step": 42574 + }, + { + "epoch": 2.642932522192563, + "grad_norm": 0.16264328874731424, + "learning_rate": 4.253658778310987e-06, + "loss": 2.6801, + "step": 42575 + }, + { + "epoch": 2.642994599292321, + "grad_norm": 0.14321943307805665, + "learning_rate": 4.252201215145579e-06, + "loss": 2.7306, + "step": 42576 + }, + { + "epoch": 2.643056676392079, + "grad_norm": 0.14456581656044462, + "learning_rate": 4.250743890657555e-06, + "loss": 2.7158, + "step": 42577 + }, + { + "epoch": 2.643118753491837, + "grad_norm": 0.13081180362588965, + "learning_rate": 4.249286804854507e-06, + "loss": 2.6731, + "step": 42578 + }, + { + "epoch": 2.6431808305915947, + "grad_norm": 0.13106321713565627, + "learning_rate": 4.2478299577440375e-06, + "loss": 2.5613, + "step": 42579 + }, + { + "epoch": 2.6432429076913526, + "grad_norm": 0.16385466603940452, + "learning_rate": 4.246373349333732e-06, + "loss": 2.6898, + "step": 42580 + }, + { + "epoch": 2.6433049847911105, + "grad_norm": 0.15727007601409052, + "learning_rate": 4.244916979631209e-06, + "loss": 2.719, + "step": 42581 + }, + { + "epoch": 2.6433670618908685, + "grad_norm": 0.16453146891928053, + "learning_rate": 4.243460848644066e-06, + "loss": 2.663, + "step": 42582 + }, + { + "epoch": 2.6434291389906264, + "grad_norm": 0.14986395577326528, + "learning_rate": 4.242004956379897e-06, + "loss": 2.7159, + "step": 42583 + }, + { + "epoch": 2.6434912160903843, + "grad_norm": 0.14835175120495228, + "learning_rate": 4.2405493028462975e-06, + "loss": 2.671, + "step": 42584 + }, + { + "epoch": 2.643553293190142, + "grad_norm": 0.13038595879206794, + "learning_rate": 4.2390938880508606e-06, + "loss": 2.7198, + "step": 42585 + }, + { + "epoch": 2.6436153702899, + "grad_norm": 0.1366408739365785, + "learning_rate": 4.237638712001169e-06, + "loss": 2.7401, + "step": 42586 + }, + { + "epoch": 2.643677447389658, + "grad_norm": 0.134593556615435, + "learning_rate": 4.236183774704833e-06, + "loss": 2.6888, + "step": 42587 + }, + { + "epoch": 2.643739524489416, + "grad_norm": 0.14268355847728306, + "learning_rate": 4.234729076169442e-06, + "loss": 2.7169, + "step": 42588 + }, + { + "epoch": 2.643801601589174, + "grad_norm": 0.13198795031824684, + "learning_rate": 4.233274616402572e-06, + "loss": 2.758, + "step": 42589 + }, + { + "epoch": 2.643863678688932, + "grad_norm": 0.14164581922632144, + "learning_rate": 4.2318203954118175e-06, + "loss": 2.6807, + "step": 42590 + }, + { + "epoch": 2.6439257557886897, + "grad_norm": 0.1349958305686364, + "learning_rate": 4.230366413204767e-06, + "loss": 2.644, + "step": 42591 + }, + { + "epoch": 2.643987832888447, + "grad_norm": 0.13708706019383088, + "learning_rate": 4.228912669789015e-06, + "loss": 2.7719, + "step": 42592 + }, + { + "epoch": 2.6440499099882055, + "grad_norm": 0.15050223160234572, + "learning_rate": 4.227459165172132e-06, + "loss": 2.7243, + "step": 42593 + }, + { + "epoch": 2.644111987087963, + "grad_norm": 0.141521681254285, + "learning_rate": 4.226005899361701e-06, + "loss": 2.7511, + "step": 42594 + }, + { + "epoch": 2.6441740641877214, + "grad_norm": 0.14099243956191543, + "learning_rate": 4.224552872365306e-06, + "loss": 2.7394, + "step": 42595 + }, + { + "epoch": 2.644236141287479, + "grad_norm": 0.1301789886468671, + "learning_rate": 4.22310008419054e-06, + "loss": 2.7216, + "step": 42596 + }, + { + "epoch": 2.644298218387237, + "grad_norm": 0.13151103356345745, + "learning_rate": 4.221647534844975e-06, + "loss": 2.6208, + "step": 42597 + }, + { + "epoch": 2.6443602954869947, + "grad_norm": 0.13236232504485912, + "learning_rate": 4.220195224336188e-06, + "loss": 2.7022, + "step": 42598 + }, + { + "epoch": 2.6444223725867526, + "grad_norm": 0.13138997271962546, + "learning_rate": 4.218743152671744e-06, + "loss": 2.7072, + "step": 42599 + }, + { + "epoch": 2.6444844496865105, + "grad_norm": 0.15575524319668177, + "learning_rate": 4.21729131985924e-06, + "loss": 2.6777, + "step": 42600 + }, + { + "epoch": 2.6445465267862684, + "grad_norm": 0.1507517763593818, + "learning_rate": 4.21583972590624e-06, + "loss": 2.7117, + "step": 42601 + }, + { + "epoch": 2.6446086038860264, + "grad_norm": 0.14116104927839296, + "learning_rate": 4.2143883708203215e-06, + "loss": 2.8016, + "step": 42602 + }, + { + "epoch": 2.6446706809857843, + "grad_norm": 0.15378680604689599, + "learning_rate": 4.212937254609045e-06, + "loss": 2.6905, + "step": 42603 + }, + { + "epoch": 2.644732758085542, + "grad_norm": 0.14379816751376934, + "learning_rate": 4.21148637728e-06, + "loss": 2.7416, + "step": 42604 + }, + { + "epoch": 2.6447948351853, + "grad_norm": 0.131584018626018, + "learning_rate": 4.210035738840746e-06, + "loss": 2.6332, + "step": 42605 + }, + { + "epoch": 2.644856912285058, + "grad_norm": 0.1323160823558699, + "learning_rate": 4.208585339298849e-06, + "loss": 2.5973, + "step": 42606 + }, + { + "epoch": 2.644918989384816, + "grad_norm": 0.15579439137431952, + "learning_rate": 4.207135178661875e-06, + "loss": 2.6763, + "step": 42607 + }, + { + "epoch": 2.644981066484574, + "grad_norm": 0.15249148565897247, + "learning_rate": 4.205685256937392e-06, + "loss": 2.7565, + "step": 42608 + }, + { + "epoch": 2.645043143584332, + "grad_norm": 0.14223517362765778, + "learning_rate": 4.20423557413297e-06, + "loss": 2.7046, + "step": 42609 + }, + { + "epoch": 2.6451052206840897, + "grad_norm": 0.13657254166859895, + "learning_rate": 4.20278613025617e-06, + "loss": 2.7027, + "step": 42610 + }, + { + "epoch": 2.6451672977838476, + "grad_norm": 0.1433346937589993, + "learning_rate": 4.201336925314553e-06, + "loss": 2.7029, + "step": 42611 + }, + { + "epoch": 2.6452293748836055, + "grad_norm": 0.13356613464925834, + "learning_rate": 4.199887959315668e-06, + "loss": 2.7694, + "step": 42612 + }, + { + "epoch": 2.6452914519833635, + "grad_norm": 0.1440262994981082, + "learning_rate": 4.198439232267088e-06, + "loss": 2.6716, + "step": 42613 + }, + { + "epoch": 2.6453535290831214, + "grad_norm": 0.1409506272865463, + "learning_rate": 4.196990744176377e-06, + "loss": 2.7131, + "step": 42614 + }, + { + "epoch": 2.6454156061828793, + "grad_norm": 0.14899004190806395, + "learning_rate": 4.195542495051086e-06, + "loss": 2.7315, + "step": 42615 + }, + { + "epoch": 2.645477683282637, + "grad_norm": 0.13403534322951344, + "learning_rate": 4.19409448489877e-06, + "loss": 2.6828, + "step": 42616 + }, + { + "epoch": 2.6455397603823947, + "grad_norm": 0.1372686363198207, + "learning_rate": 4.192646713726972e-06, + "loss": 2.6969, + "step": 42617 + }, + { + "epoch": 2.645601837482153, + "grad_norm": 0.13838990517074062, + "learning_rate": 4.191199181543265e-06, + "loss": 2.7362, + "step": 42618 + }, + { + "epoch": 2.6456639145819105, + "grad_norm": 0.13069273886312036, + "learning_rate": 4.189751888355192e-06, + "loss": 2.6139, + "step": 42619 + }, + { + "epoch": 2.645725991681669, + "grad_norm": 0.1367489917550586, + "learning_rate": 4.188304834170303e-06, + "loss": 2.7178, + "step": 42620 + }, + { + "epoch": 2.6457880687814264, + "grad_norm": 0.13995790537371916, + "learning_rate": 4.186858018996154e-06, + "loss": 2.6958, + "step": 42621 + }, + { + "epoch": 2.6458501458811843, + "grad_norm": 0.13258615307618724, + "learning_rate": 4.185411442840276e-06, + "loss": 2.7772, + "step": 42622 + }, + { + "epoch": 2.645912222980942, + "grad_norm": 0.14522764897277407, + "learning_rate": 4.1839651057102355e-06, + "loss": 2.6677, + "step": 42623 + }, + { + "epoch": 2.6459743000807, + "grad_norm": 0.144200771984264, + "learning_rate": 4.182519007613572e-06, + "loss": 2.8393, + "step": 42624 + }, + { + "epoch": 2.646036377180458, + "grad_norm": 0.13355577972179716, + "learning_rate": 4.181073148557829e-06, + "loss": 2.643, + "step": 42625 + }, + { + "epoch": 2.646098454280216, + "grad_norm": 0.14080930923456653, + "learning_rate": 4.179627528550545e-06, + "loss": 2.7032, + "step": 42626 + }, + { + "epoch": 2.646160531379974, + "grad_norm": 0.13635861102375696, + "learning_rate": 4.178182147599275e-06, + "loss": 2.6911, + "step": 42627 + }, + { + "epoch": 2.646222608479732, + "grad_norm": 0.13315380643300354, + "learning_rate": 4.1767370057115505e-06, + "loss": 2.6543, + "step": 42628 + }, + { + "epoch": 2.6462846855794897, + "grad_norm": 0.15071498274440204, + "learning_rate": 4.175292102894912e-06, + "loss": 2.7314, + "step": 42629 + }, + { + "epoch": 2.6463467626792476, + "grad_norm": 0.14236324918958282, + "learning_rate": 4.173847439156903e-06, + "loss": 2.6869, + "step": 42630 + }, + { + "epoch": 2.6464088397790055, + "grad_norm": 0.136563833206181, + "learning_rate": 4.1724030145050444e-06, + "loss": 2.6024, + "step": 42631 + }, + { + "epoch": 2.6464709168787635, + "grad_norm": 0.12902972780901772, + "learning_rate": 4.170958828946897e-06, + "loss": 2.7315, + "step": 42632 + }, + { + "epoch": 2.6465329939785214, + "grad_norm": 0.13894541048953227, + "learning_rate": 4.169514882489977e-06, + "loss": 2.7597, + "step": 42633 + }, + { + "epoch": 2.6465950710782793, + "grad_norm": 0.15348290975702916, + "learning_rate": 4.16807117514183e-06, + "loss": 2.7325, + "step": 42634 + }, + { + "epoch": 2.646657148178037, + "grad_norm": 0.12942917557937092, + "learning_rate": 4.1666277069099704e-06, + "loss": 2.6962, + "step": 42635 + }, + { + "epoch": 2.646719225277795, + "grad_norm": 0.14303830101822662, + "learning_rate": 4.165184477801948e-06, + "loss": 2.6786, + "step": 42636 + }, + { + "epoch": 2.646781302377553, + "grad_norm": 0.15005695185718226, + "learning_rate": 4.163741487825284e-06, + "loss": 2.6186, + "step": 42637 + }, + { + "epoch": 2.646843379477311, + "grad_norm": 0.1490937708197223, + "learning_rate": 4.162298736987513e-06, + "loss": 2.6923, + "step": 42638 + }, + { + "epoch": 2.646905456577069, + "grad_norm": 0.13818576393175633, + "learning_rate": 4.160856225296145e-06, + "loss": 2.6878, + "step": 42639 + }, + { + "epoch": 2.6469675336768264, + "grad_norm": 0.14694304817528273, + "learning_rate": 4.159413952758723e-06, + "loss": 2.7084, + "step": 42640 + }, + { + "epoch": 2.6470296107765847, + "grad_norm": 0.1323533747459849, + "learning_rate": 4.157971919382769e-06, + "loss": 2.7113, + "step": 42641 + }, + { + "epoch": 2.647091687876342, + "grad_norm": 0.13502985165063314, + "learning_rate": 4.156530125175806e-06, + "loss": 2.7197, + "step": 42642 + }, + { + "epoch": 2.6471537649761006, + "grad_norm": 0.1408734577119071, + "learning_rate": 4.155088570145354e-06, + "loss": 2.6652, + "step": 42643 + }, + { + "epoch": 2.647215842075858, + "grad_norm": 0.1340055107039551, + "learning_rate": 4.1536472542989256e-06, + "loss": 2.7422, + "step": 42644 + }, + { + "epoch": 2.6472779191756164, + "grad_norm": 0.13539042279668062, + "learning_rate": 4.152206177644047e-06, + "loss": 2.7403, + "step": 42645 + }, + { + "epoch": 2.647339996275374, + "grad_norm": 0.1462155673154947, + "learning_rate": 4.150765340188251e-06, + "loss": 2.7354, + "step": 42646 + }, + { + "epoch": 2.647402073375132, + "grad_norm": 0.1564160061174318, + "learning_rate": 4.149324741939037e-06, + "loss": 2.7596, + "step": 42647 + }, + { + "epoch": 2.6474641504748897, + "grad_norm": 0.14525847866488478, + "learning_rate": 4.147884382903932e-06, + "loss": 2.6817, + "step": 42648 + }, + { + "epoch": 2.6475262275746476, + "grad_norm": 0.1435470449268701, + "learning_rate": 4.146444263090432e-06, + "loss": 2.6516, + "step": 42649 + }, + { + "epoch": 2.6475883046744055, + "grad_norm": 0.13873411850207934, + "learning_rate": 4.1450043825060735e-06, + "loss": 2.688, + "step": 42650 + }, + { + "epoch": 2.6476503817741635, + "grad_norm": 0.13390879598576366, + "learning_rate": 4.1435647411583625e-06, + "loss": 2.733, + "step": 42651 + }, + { + "epoch": 2.6477124588739214, + "grad_norm": 0.1336084873592445, + "learning_rate": 4.142125339054803e-06, + "loss": 2.7591, + "step": 42652 + }, + { + "epoch": 2.6477745359736793, + "grad_norm": 0.13982454945972358, + "learning_rate": 4.1406861762029015e-06, + "loss": 2.732, + "step": 42653 + }, + { + "epoch": 2.647836613073437, + "grad_norm": 0.13433338729687325, + "learning_rate": 4.139247252610179e-06, + "loss": 2.7647, + "step": 42654 + }, + { + "epoch": 2.647898690173195, + "grad_norm": 0.14824866530748745, + "learning_rate": 4.137808568284135e-06, + "loss": 2.7351, + "step": 42655 + }, + { + "epoch": 2.647960767272953, + "grad_norm": 0.1316299383200707, + "learning_rate": 4.13637012323228e-06, + "loss": 2.6316, + "step": 42656 + }, + { + "epoch": 2.648022844372711, + "grad_norm": 0.13960840411667816, + "learning_rate": 4.1349319174621205e-06, + "loss": 2.7101, + "step": 42657 + }, + { + "epoch": 2.648084921472469, + "grad_norm": 0.13743891415729328, + "learning_rate": 4.1334939509811375e-06, + "loss": 2.6648, + "step": 42658 + }, + { + "epoch": 2.648146998572227, + "grad_norm": 0.14522673193287436, + "learning_rate": 4.132056223796865e-06, + "loss": 2.6168, + "step": 42659 + }, + { + "epoch": 2.6482090756719847, + "grad_norm": 0.13107903392247888, + "learning_rate": 4.130618735916791e-06, + "loss": 2.5986, + "step": 42660 + }, + { + "epoch": 2.6482711527717426, + "grad_norm": 0.1333859698946839, + "learning_rate": 4.1291814873484106e-06, + "loss": 2.7254, + "step": 42661 + }, + { + "epoch": 2.6483332298715005, + "grad_norm": 0.13285972653131278, + "learning_rate": 4.1277444780992215e-06, + "loss": 2.7109, + "step": 42662 + }, + { + "epoch": 2.6483953069712585, + "grad_norm": 0.13860347565305192, + "learning_rate": 4.12630770817673e-06, + "loss": 2.6109, + "step": 42663 + }, + { + "epoch": 2.6484573840710164, + "grad_norm": 0.13444169130535338, + "learning_rate": 4.12487117758843e-06, + "loss": 2.6183, + "step": 42664 + }, + { + "epoch": 2.648519461170774, + "grad_norm": 0.13367083121582812, + "learning_rate": 4.123434886341809e-06, + "loss": 2.7067, + "step": 42665 + }, + { + "epoch": 2.648581538270532, + "grad_norm": 0.1529323504570518, + "learning_rate": 4.121998834444363e-06, + "loss": 2.8023, + "step": 42666 + }, + { + "epoch": 2.6486436153702897, + "grad_norm": 0.13308918406224574, + "learning_rate": 4.12056302190359e-06, + "loss": 2.7521, + "step": 42667 + }, + { + "epoch": 2.648705692470048, + "grad_norm": 0.13073861542658186, + "learning_rate": 4.1191274487269784e-06, + "loss": 2.6562, + "step": 42668 + }, + { + "epoch": 2.6487677695698055, + "grad_norm": 0.13258667287206205, + "learning_rate": 4.117692114922017e-06, + "loss": 2.6623, + "step": 42669 + }, + { + "epoch": 2.6488298466695634, + "grad_norm": 0.13058822923087687, + "learning_rate": 4.116257020496195e-06, + "loss": 2.6629, + "step": 42670 + }, + { + "epoch": 2.6488919237693214, + "grad_norm": 0.1353314724719469, + "learning_rate": 4.114822165456994e-06, + "loss": 2.6989, + "step": 42671 + }, + { + "epoch": 2.6489540008690793, + "grad_norm": 0.13958133739377657, + "learning_rate": 4.113387549811909e-06, + "loss": 2.6061, + "step": 42672 + }, + { + "epoch": 2.649016077968837, + "grad_norm": 0.13877945742988063, + "learning_rate": 4.111953173568417e-06, + "loss": 2.7085, + "step": 42673 + }, + { + "epoch": 2.649078155068595, + "grad_norm": 0.13628324901354197, + "learning_rate": 4.1105190367340115e-06, + "loss": 2.6894, + "step": 42674 + }, + { + "epoch": 2.649140232168353, + "grad_norm": 0.1363217584604459, + "learning_rate": 4.109085139316155e-06, + "loss": 2.6968, + "step": 42675 + }, + { + "epoch": 2.649202309268111, + "grad_norm": 0.13305260188421456, + "learning_rate": 4.1076514813223555e-06, + "loss": 2.7347, + "step": 42676 + }, + { + "epoch": 2.649264386367869, + "grad_norm": 0.14922563352236268, + "learning_rate": 4.106218062760075e-06, + "loss": 2.6729, + "step": 42677 + }, + { + "epoch": 2.649326463467627, + "grad_norm": 0.1348375805584996, + "learning_rate": 4.104784883636786e-06, + "loss": 2.7268, + "step": 42678 + }, + { + "epoch": 2.6493885405673847, + "grad_norm": 0.13443301107492886, + "learning_rate": 4.1033519439599876e-06, + "loss": 2.7036, + "step": 42679 + }, + { + "epoch": 2.6494506176671426, + "grad_norm": 0.14787905826667097, + "learning_rate": 4.101919243737146e-06, + "loss": 2.6362, + "step": 42680 + }, + { + "epoch": 2.6495126947669005, + "grad_norm": 0.13655768853010866, + "learning_rate": 4.100486782975721e-06, + "loss": 2.6887, + "step": 42681 + }, + { + "epoch": 2.6495747718666585, + "grad_norm": 0.16167166803143412, + "learning_rate": 4.0990545616832075e-06, + "loss": 2.7101, + "step": 42682 + }, + { + "epoch": 2.6496368489664164, + "grad_norm": 0.13792281599059805, + "learning_rate": 4.097622579867072e-06, + "loss": 2.6272, + "step": 42683 + }, + { + "epoch": 2.6496989260661743, + "grad_norm": 0.15003191290567724, + "learning_rate": 4.0961908375347806e-06, + "loss": 2.7599, + "step": 42684 + }, + { + "epoch": 2.649761003165932, + "grad_norm": 0.1957943598961903, + "learning_rate": 4.094759334693793e-06, + "loss": 2.7257, + "step": 42685 + }, + { + "epoch": 2.64982308026569, + "grad_norm": 0.14772167900680752, + "learning_rate": 4.093328071351604e-06, + "loss": 2.7725, + "step": 42686 + }, + { + "epoch": 2.649885157365448, + "grad_norm": 0.13363976048770063, + "learning_rate": 4.091897047515664e-06, + "loss": 2.5732, + "step": 42687 + }, + { + "epoch": 2.6499472344652055, + "grad_norm": 0.13405001731148208, + "learning_rate": 4.090466263193443e-06, + "loss": 2.7368, + "step": 42688 + }, + { + "epoch": 2.650009311564964, + "grad_norm": 0.1516484324191085, + "learning_rate": 4.089035718392398e-06, + "loss": 2.7246, + "step": 42689 + }, + { + "epoch": 2.6500713886647214, + "grad_norm": 0.1480572246466776, + "learning_rate": 4.087605413120005e-06, + "loss": 2.6444, + "step": 42690 + }, + { + "epoch": 2.6501334657644797, + "grad_norm": 0.14137708036408175, + "learning_rate": 4.086175347383719e-06, + "loss": 2.7411, + "step": 42691 + }, + { + "epoch": 2.650195542864237, + "grad_norm": 0.14019604666521124, + "learning_rate": 4.084745521191002e-06, + "loss": 2.5901, + "step": 42692 + }, + { + "epoch": 2.6502576199639956, + "grad_norm": 0.13606025973531566, + "learning_rate": 4.08331593454932e-06, + "loss": 2.7423, + "step": 42693 + }, + { + "epoch": 2.650319697063753, + "grad_norm": 0.1337849136043372, + "learning_rate": 4.081886587466111e-06, + "loss": 2.6734, + "step": 42694 + }, + { + "epoch": 2.650381774163511, + "grad_norm": 0.14010220943440468, + "learning_rate": 4.080457479948857e-06, + "loss": 2.6093, + "step": 42695 + }, + { + "epoch": 2.650443851263269, + "grad_norm": 0.14335347690974412, + "learning_rate": 4.079028612005004e-06, + "loss": 2.7068, + "step": 42696 + }, + { + "epoch": 2.650505928363027, + "grad_norm": 0.14768222600533734, + "learning_rate": 4.077599983642005e-06, + "loss": 2.7789, + "step": 42697 + }, + { + "epoch": 2.6505680054627847, + "grad_norm": 0.13613721602335568, + "learning_rate": 4.076171594867306e-06, + "loss": 2.7229, + "step": 42698 + }, + { + "epoch": 2.6506300825625426, + "grad_norm": 0.1361022810993917, + "learning_rate": 4.074743445688378e-06, + "loss": 2.6907, + "step": 42699 + }, + { + "epoch": 2.6506921596623005, + "grad_norm": 0.13688047244615456, + "learning_rate": 4.07331553611266e-06, + "loss": 2.7303, + "step": 42700 + }, + { + "epoch": 2.6507542367620585, + "grad_norm": 0.15451691426007289, + "learning_rate": 4.071887866147605e-06, + "loss": 2.6843, + "step": 42701 + }, + { + "epoch": 2.6508163138618164, + "grad_norm": 0.14129052695859295, + "learning_rate": 4.0704604358006545e-06, + "loss": 2.6049, + "step": 42702 + }, + { + "epoch": 2.6508783909615743, + "grad_norm": 0.14600205170074895, + "learning_rate": 4.069033245079268e-06, + "loss": 2.6495, + "step": 42703 + }, + { + "epoch": 2.650940468061332, + "grad_norm": 0.13248736188553653, + "learning_rate": 4.0676062939908885e-06, + "loss": 2.7519, + "step": 42704 + }, + { + "epoch": 2.65100254516109, + "grad_norm": 0.13347621175280608, + "learning_rate": 4.066179582542956e-06, + "loss": 2.6995, + "step": 42705 + }, + { + "epoch": 2.651064622260848, + "grad_norm": 0.13046783972325543, + "learning_rate": 4.064753110742914e-06, + "loss": 2.621, + "step": 42706 + }, + { + "epoch": 2.651126699360606, + "grad_norm": 0.14291053717701782, + "learning_rate": 4.063326878598195e-06, + "loss": 2.7064, + "step": 42707 + }, + { + "epoch": 2.651188776460364, + "grad_norm": 0.14089843868449153, + "learning_rate": 4.0619008861162655e-06, + "loss": 2.6266, + "step": 42708 + }, + { + "epoch": 2.651250853560122, + "grad_norm": 0.15271469340626193, + "learning_rate": 4.060475133304548e-06, + "loss": 2.6875, + "step": 42709 + }, + { + "epoch": 2.6513129306598797, + "grad_norm": 0.13607099900228817, + "learning_rate": 4.059049620170479e-06, + "loss": 2.6146, + "step": 42710 + }, + { + "epoch": 2.651375007759637, + "grad_norm": 0.1381883519300393, + "learning_rate": 4.0576243467215e-06, + "loss": 2.7135, + "step": 42711 + }, + { + "epoch": 2.6514370848593956, + "grad_norm": 0.14250073880288527, + "learning_rate": 4.056199312965042e-06, + "loss": 2.6237, + "step": 42712 + }, + { + "epoch": 2.651499161959153, + "grad_norm": 0.13268358621786225, + "learning_rate": 4.05477451890856e-06, + "loss": 2.693, + "step": 42713 + }, + { + "epoch": 2.6515612390589114, + "grad_norm": 0.13530836833566265, + "learning_rate": 4.053349964559466e-06, + "loss": 2.6806, + "step": 42714 + }, + { + "epoch": 2.651623316158669, + "grad_norm": 0.14166539167667236, + "learning_rate": 4.051925649925203e-06, + "loss": 2.7557, + "step": 42715 + }, + { + "epoch": 2.6516853932584272, + "grad_norm": 0.1338743160966738, + "learning_rate": 4.050501575013193e-06, + "loss": 2.7062, + "step": 42716 + }, + { + "epoch": 2.6517474703581847, + "grad_norm": 0.13598810191341673, + "learning_rate": 4.0490777398308755e-06, + "loss": 2.689, + "step": 42717 + }, + { + "epoch": 2.6518095474579426, + "grad_norm": 0.14203634270790183, + "learning_rate": 4.047654144385676e-06, + "loss": 2.7501, + "step": 42718 + }, + { + "epoch": 2.6518716245577005, + "grad_norm": 0.13453839675183044, + "learning_rate": 4.046230788685018e-06, + "loss": 2.7524, + "step": 42719 + }, + { + "epoch": 2.6519337016574585, + "grad_norm": 0.13446434870748106, + "learning_rate": 4.044807672736328e-06, + "loss": 2.6863, + "step": 42720 + }, + { + "epoch": 2.6519957787572164, + "grad_norm": 0.13937846009488358, + "learning_rate": 4.043384796547029e-06, + "loss": 2.8173, + "step": 42721 + }, + { + "epoch": 2.6520578558569743, + "grad_norm": 0.15150564690647653, + "learning_rate": 4.0419621601245515e-06, + "loss": 2.7086, + "step": 42722 + }, + { + "epoch": 2.652119932956732, + "grad_norm": 0.1353590174682045, + "learning_rate": 4.04053976347632e-06, + "loss": 2.7175, + "step": 42723 + }, + { + "epoch": 2.65218201005649, + "grad_norm": 0.13812701546376632, + "learning_rate": 4.039117606609744e-06, + "loss": 2.7389, + "step": 42724 + }, + { + "epoch": 2.652244087156248, + "grad_norm": 0.1339397208647812, + "learning_rate": 4.037695689532245e-06, + "loss": 2.6702, + "step": 42725 + }, + { + "epoch": 2.652306164256006, + "grad_norm": 0.13111521492868553, + "learning_rate": 4.036274012251251e-06, + "loss": 2.693, + "step": 42726 + }, + { + "epoch": 2.652368241355764, + "grad_norm": 0.13903436831627572, + "learning_rate": 4.034852574774173e-06, + "loss": 2.6293, + "step": 42727 + }, + { + "epoch": 2.652430318455522, + "grad_norm": 0.14419122598593553, + "learning_rate": 4.033431377108426e-06, + "loss": 2.7264, + "step": 42728 + }, + { + "epoch": 2.6524923955552797, + "grad_norm": 0.13682603033008242, + "learning_rate": 4.0320104192614286e-06, + "loss": 2.7501, + "step": 42729 + }, + { + "epoch": 2.6525544726550376, + "grad_norm": 0.13123710404347316, + "learning_rate": 4.030589701240583e-06, + "loss": 2.5824, + "step": 42730 + }, + { + "epoch": 2.6526165497547955, + "grad_norm": 0.13874938974375164, + "learning_rate": 4.029169223053319e-06, + "loss": 2.7437, + "step": 42731 + }, + { + "epoch": 2.6526786268545535, + "grad_norm": 0.1360883621042751, + "learning_rate": 4.0277489847070415e-06, + "loss": 2.6865, + "step": 42732 + }, + { + "epoch": 2.6527407039543114, + "grad_norm": 0.13430733779219245, + "learning_rate": 4.026328986209154e-06, + "loss": 2.6491, + "step": 42733 + }, + { + "epoch": 2.6528027810540693, + "grad_norm": 0.13657976988441364, + "learning_rate": 4.024909227567064e-06, + "loss": 2.7036, + "step": 42734 + }, + { + "epoch": 2.652864858153827, + "grad_norm": 0.1318266688726834, + "learning_rate": 4.023489708788186e-06, + "loss": 2.6945, + "step": 42735 + }, + { + "epoch": 2.6529269352535847, + "grad_norm": 0.13224190266595925, + "learning_rate": 4.022070429879932e-06, + "loss": 2.6965, + "step": 42736 + }, + { + "epoch": 2.652989012353343, + "grad_norm": 0.14204169306326334, + "learning_rate": 4.020651390849689e-06, + "loss": 2.7076, + "step": 42737 + }, + { + "epoch": 2.6530510894531005, + "grad_norm": 0.1314688056481427, + "learning_rate": 4.019232591704869e-06, + "loss": 2.6719, + "step": 42738 + }, + { + "epoch": 2.653113166552859, + "grad_norm": 0.1307789925753011, + "learning_rate": 4.017814032452877e-06, + "loss": 2.7318, + "step": 42739 + }, + { + "epoch": 2.6531752436526164, + "grad_norm": 0.13785773363188, + "learning_rate": 4.016395713101112e-06, + "loss": 2.7366, + "step": 42740 + }, + { + "epoch": 2.6532373207523743, + "grad_norm": 0.14075864957636874, + "learning_rate": 4.01497763365698e-06, + "loss": 2.6936, + "step": 42741 + }, + { + "epoch": 2.653299397852132, + "grad_norm": 0.13178088840247756, + "learning_rate": 4.013559794127869e-06, + "loss": 2.6459, + "step": 42742 + }, + { + "epoch": 2.65336147495189, + "grad_norm": 0.13254794952476873, + "learning_rate": 4.012142194521168e-06, + "loss": 2.7482, + "step": 42743 + }, + { + "epoch": 2.653423552051648, + "grad_norm": 0.13372662751141132, + "learning_rate": 4.010724834844287e-06, + "loss": 2.7489, + "step": 42744 + }, + { + "epoch": 2.653485629151406, + "grad_norm": 0.19252505586787047, + "learning_rate": 4.009307715104632e-06, + "loss": 2.6581, + "step": 42745 + }, + { + "epoch": 2.653547706251164, + "grad_norm": 0.14982286638871134, + "learning_rate": 4.0078908353095805e-06, + "loss": 2.7377, + "step": 42746 + }, + { + "epoch": 2.653609783350922, + "grad_norm": 0.16388880526837768, + "learning_rate": 4.006474195466526e-06, + "loss": 2.7732, + "step": 42747 + }, + { + "epoch": 2.6536718604506797, + "grad_norm": 0.1321843946376225, + "learning_rate": 4.0050577955828505e-06, + "loss": 2.8303, + "step": 42748 + }, + { + "epoch": 2.6537339375504376, + "grad_norm": 0.1364195069253994, + "learning_rate": 4.0036416356659666e-06, + "loss": 2.604, + "step": 42749 + }, + { + "epoch": 2.6537960146501955, + "grad_norm": 0.13533857429136503, + "learning_rate": 4.0022257157232516e-06, + "loss": 2.6663, + "step": 42750 + }, + { + "epoch": 2.6538580917499535, + "grad_norm": 0.1404938218440068, + "learning_rate": 4.000810035762087e-06, + "loss": 2.6723, + "step": 42751 + }, + { + "epoch": 2.6539201688497114, + "grad_norm": 0.13420275610640894, + "learning_rate": 3.999394595789857e-06, + "loss": 2.7094, + "step": 42752 + }, + { + "epoch": 2.6539822459494693, + "grad_norm": 0.13314247512516814, + "learning_rate": 3.997979395813961e-06, + "loss": 2.6005, + "step": 42753 + }, + { + "epoch": 2.654044323049227, + "grad_norm": 0.14300014146009196, + "learning_rate": 3.996564435841771e-06, + "loss": 2.6613, + "step": 42754 + }, + { + "epoch": 2.654106400148985, + "grad_norm": 0.13714399960214452, + "learning_rate": 3.995149715880675e-06, + "loss": 2.7008, + "step": 42755 + }, + { + "epoch": 2.654168477248743, + "grad_norm": 0.12921235557657684, + "learning_rate": 3.993735235938051e-06, + "loss": 2.688, + "step": 42756 + }, + { + "epoch": 2.654230554348501, + "grad_norm": 0.14962112813854128, + "learning_rate": 3.99232099602127e-06, + "loss": 2.6929, + "step": 42757 + }, + { + "epoch": 2.654292631448259, + "grad_norm": 0.14487665375686407, + "learning_rate": 3.990906996137727e-06, + "loss": 2.6133, + "step": 42758 + }, + { + "epoch": 2.6543547085480164, + "grad_norm": 0.14384967313078975, + "learning_rate": 3.989493236294795e-06, + "loss": 2.6893, + "step": 42759 + }, + { + "epoch": 2.6544167856477747, + "grad_norm": 0.1442415194448164, + "learning_rate": 3.988079716499843e-06, + "loss": 2.7296, + "step": 42760 + }, + { + "epoch": 2.654478862747532, + "grad_norm": 0.13109711501766605, + "learning_rate": 3.986666436760239e-06, + "loss": 2.5939, + "step": 42761 + }, + { + "epoch": 2.6545409398472906, + "grad_norm": 0.1338987457749931, + "learning_rate": 3.985253397083377e-06, + "loss": 2.7229, + "step": 42762 + }, + { + "epoch": 2.654603016947048, + "grad_norm": 0.13517156222047866, + "learning_rate": 3.983840597476618e-06, + "loss": 2.6965, + "step": 42763 + }, + { + "epoch": 2.6546650940468064, + "grad_norm": 0.14846735578744302, + "learning_rate": 3.982428037947334e-06, + "loss": 2.6387, + "step": 42764 + }, + { + "epoch": 2.654727171146564, + "grad_norm": 0.13091969167516648, + "learning_rate": 3.9810157185028895e-06, + "loss": 2.6989, + "step": 42765 + }, + { + "epoch": 2.654789248246322, + "grad_norm": 0.15085388642791558, + "learning_rate": 3.9796036391506534e-06, + "loss": 2.7701, + "step": 42766 + }, + { + "epoch": 2.6548513253460797, + "grad_norm": 0.14295440820769945, + "learning_rate": 3.978191799898006e-06, + "loss": 2.6838, + "step": 42767 + }, + { + "epoch": 2.6549134024458376, + "grad_norm": 0.14666093358458587, + "learning_rate": 3.9767802007523e-06, + "loss": 2.7509, + "step": 42768 + }, + { + "epoch": 2.6549754795455955, + "grad_norm": 0.14558938319430284, + "learning_rate": 3.9753688417209055e-06, + "loss": 2.6978, + "step": 42769 + }, + { + "epoch": 2.6550375566453535, + "grad_norm": 0.14673248894971858, + "learning_rate": 3.973957722811178e-06, + "loss": 2.6904, + "step": 42770 + }, + { + "epoch": 2.6550996337451114, + "grad_norm": 0.13319219281482642, + "learning_rate": 3.972546844030495e-06, + "loss": 2.7112, + "step": 42771 + }, + { + "epoch": 2.6551617108448693, + "grad_norm": 0.13562617387131753, + "learning_rate": 3.971136205386211e-06, + "loss": 2.6687, + "step": 42772 + }, + { + "epoch": 2.655223787944627, + "grad_norm": 0.1347915652091437, + "learning_rate": 3.969725806885677e-06, + "loss": 2.7272, + "step": 42773 + }, + { + "epoch": 2.655285865044385, + "grad_norm": 0.13576001798036125, + "learning_rate": 3.968315648536253e-06, + "loss": 2.7461, + "step": 42774 + }, + { + "epoch": 2.655347942144143, + "grad_norm": 0.14750298791016958, + "learning_rate": 3.96690573034531e-06, + "loss": 2.7388, + "step": 42775 + }, + { + "epoch": 2.655410019243901, + "grad_norm": 0.13425626415905945, + "learning_rate": 3.965496052320184e-06, + "loss": 2.6888, + "step": 42776 + }, + { + "epoch": 2.655472096343659, + "grad_norm": 0.13469421872244208, + "learning_rate": 3.964086614468254e-06, + "loss": 2.7009, + "step": 42777 + }, + { + "epoch": 2.655534173443417, + "grad_norm": 0.13319319271298455, + "learning_rate": 3.962677416796856e-06, + "loss": 2.7179, + "step": 42778 + }, + { + "epoch": 2.6555962505431747, + "grad_norm": 0.1382114395823358, + "learning_rate": 3.961268459313344e-06, + "loss": 2.6858, + "step": 42779 + }, + { + "epoch": 2.6556583276429326, + "grad_norm": 0.13172789491016115, + "learning_rate": 3.959859742025063e-06, + "loss": 2.6892, + "step": 42780 + }, + { + "epoch": 2.6557204047426906, + "grad_norm": 0.13267914303783176, + "learning_rate": 3.958451264939378e-06, + "loss": 2.6413, + "step": 42781 + }, + { + "epoch": 2.6557824818424485, + "grad_norm": 0.13257793441111365, + "learning_rate": 3.9570430280636336e-06, + "loss": 2.718, + "step": 42782 + }, + { + "epoch": 2.6558445589422064, + "grad_norm": 0.13003809688183246, + "learning_rate": 3.955635031405169e-06, + "loss": 2.6815, + "step": 42783 + }, + { + "epoch": 2.655906636041964, + "grad_norm": 0.13459176444827023, + "learning_rate": 3.954227274971328e-06, + "loss": 2.6789, + "step": 42784 + }, + { + "epoch": 2.6559687131417222, + "grad_norm": 0.13193470822420536, + "learning_rate": 3.952819758769471e-06, + "loss": 2.734, + "step": 42785 + }, + { + "epoch": 2.6560307902414797, + "grad_norm": 0.13933209956438913, + "learning_rate": 3.951412482806927e-06, + "loss": 2.7391, + "step": 42786 + }, + { + "epoch": 2.656092867341238, + "grad_norm": 0.13037068656257733, + "learning_rate": 3.950005447091049e-06, + "loss": 2.7381, + "step": 42787 + }, + { + "epoch": 2.6561549444409955, + "grad_norm": 0.14181769746214387, + "learning_rate": 3.948598651629159e-06, + "loss": 2.6523, + "step": 42788 + }, + { + "epoch": 2.6562170215407535, + "grad_norm": 0.15420493875709132, + "learning_rate": 3.94719209642862e-06, + "loss": 2.7374, + "step": 42789 + }, + { + "epoch": 2.6562790986405114, + "grad_norm": 0.13412857831383884, + "learning_rate": 3.945785781496758e-06, + "loss": 2.7093, + "step": 42790 + }, + { + "epoch": 2.6563411757402693, + "grad_norm": 0.1423097135026427, + "learning_rate": 3.944379706840912e-06, + "loss": 2.7364, + "step": 42791 + }, + { + "epoch": 2.656403252840027, + "grad_norm": 0.13887823895180437, + "learning_rate": 3.942973872468414e-06, + "loss": 2.6279, + "step": 42792 + }, + { + "epoch": 2.656465329939785, + "grad_norm": 0.13580626066431592, + "learning_rate": 3.9415682783865985e-06, + "loss": 2.7923, + "step": 42793 + }, + { + "epoch": 2.656527407039543, + "grad_norm": 0.13395009478122125, + "learning_rate": 3.9401629246028085e-06, + "loss": 2.7721, + "step": 42794 + }, + { + "epoch": 2.656589484139301, + "grad_norm": 0.1428040778162054, + "learning_rate": 3.938757811124371e-06, + "loss": 2.6259, + "step": 42795 + }, + { + "epoch": 2.656651561239059, + "grad_norm": 0.13546105644572423, + "learning_rate": 3.9373529379586165e-06, + "loss": 2.6623, + "step": 42796 + }, + { + "epoch": 2.656713638338817, + "grad_norm": 0.14674218696464664, + "learning_rate": 3.935948305112863e-06, + "loss": 2.7319, + "step": 42797 + }, + { + "epoch": 2.6567757154385747, + "grad_norm": 0.13654161285875246, + "learning_rate": 3.934543912594457e-06, + "loss": 2.6876, + "step": 42798 + }, + { + "epoch": 2.6568377925383326, + "grad_norm": 0.1438845069956563, + "learning_rate": 3.93313976041072e-06, + "loss": 2.7872, + "step": 42799 + }, + { + "epoch": 2.6568998696380905, + "grad_norm": 0.15988948166892406, + "learning_rate": 3.9317358485689796e-06, + "loss": 2.7209, + "step": 42800 + }, + { + "epoch": 2.6569619467378485, + "grad_norm": 0.13561943733875856, + "learning_rate": 3.930332177076551e-06, + "loss": 2.6514, + "step": 42801 + }, + { + "epoch": 2.6570240238376064, + "grad_norm": 0.13953924244825877, + "learning_rate": 3.928928745940757e-06, + "loss": 2.7796, + "step": 42802 + }, + { + "epoch": 2.6570861009373643, + "grad_norm": 0.13805866322813679, + "learning_rate": 3.927525555168937e-06, + "loss": 2.7401, + "step": 42803 + }, + { + "epoch": 2.657148178037122, + "grad_norm": 0.13198891700289628, + "learning_rate": 3.926122604768395e-06, + "loss": 2.6973, + "step": 42804 + }, + { + "epoch": 2.65721025513688, + "grad_norm": 0.15854945362829095, + "learning_rate": 3.924719894746459e-06, + "loss": 2.6828, + "step": 42805 + }, + { + "epoch": 2.657272332236638, + "grad_norm": 0.1438490645475007, + "learning_rate": 3.92331742511044e-06, + "loss": 2.5904, + "step": 42806 + }, + { + "epoch": 2.6573344093363955, + "grad_norm": 0.13239076918584347, + "learning_rate": 3.921915195867665e-06, + "loss": 2.7408, + "step": 42807 + }, + { + "epoch": 2.657396486436154, + "grad_norm": 0.13106666345354778, + "learning_rate": 3.920513207025445e-06, + "loss": 2.618, + "step": 42808 + }, + { + "epoch": 2.6574585635359114, + "grad_norm": 0.13739568155997076, + "learning_rate": 3.919111458591079e-06, + "loss": 2.7171, + "step": 42809 + }, + { + "epoch": 2.6575206406356697, + "grad_norm": 0.13290903846332353, + "learning_rate": 3.917709950571913e-06, + "loss": 2.6187, + "step": 42810 + }, + { + "epoch": 2.657582717735427, + "grad_norm": 0.14076748226006316, + "learning_rate": 3.916308682975228e-06, + "loss": 2.7863, + "step": 42811 + }, + { + "epoch": 2.6576447948351856, + "grad_norm": 0.13432861736349808, + "learning_rate": 3.914907655808353e-06, + "loss": 2.7363, + "step": 42812 + }, + { + "epoch": 2.657706871934943, + "grad_norm": 0.15804925547196344, + "learning_rate": 3.913506869078598e-06, + "loss": 2.7169, + "step": 42813 + }, + { + "epoch": 2.657768949034701, + "grad_norm": 0.1342175159686272, + "learning_rate": 3.912106322793263e-06, + "loss": 2.5621, + "step": 42814 + }, + { + "epoch": 2.657831026134459, + "grad_norm": 0.14286686127678033, + "learning_rate": 3.910706016959659e-06, + "loss": 2.7371, + "step": 42815 + }, + { + "epoch": 2.657893103234217, + "grad_norm": 0.13413972912079875, + "learning_rate": 3.90930595158508e-06, + "loss": 2.736, + "step": 42816 + }, + { + "epoch": 2.6579551803339747, + "grad_norm": 0.13393324718941735, + "learning_rate": 3.907906126676847e-06, + "loss": 2.6914, + "step": 42817 + }, + { + "epoch": 2.6580172574337326, + "grad_norm": 0.14183452122222043, + "learning_rate": 3.906506542242261e-06, + "loss": 2.678, + "step": 42818 + }, + { + "epoch": 2.6580793345334905, + "grad_norm": 0.13651766976327875, + "learning_rate": 3.9051071982886216e-06, + "loss": 2.7016, + "step": 42819 + }, + { + "epoch": 2.6581414116332485, + "grad_norm": 0.13402063279537427, + "learning_rate": 3.903708094823216e-06, + "loss": 2.6597, + "step": 42820 + }, + { + "epoch": 2.6582034887330064, + "grad_norm": 0.13316589209859875, + "learning_rate": 3.902309231853363e-06, + "loss": 2.6709, + "step": 42821 + }, + { + "epoch": 2.6582655658327643, + "grad_norm": 0.1297700866033276, + "learning_rate": 3.900910609386355e-06, + "loss": 2.6521, + "step": 42822 + }, + { + "epoch": 2.658327642932522, + "grad_norm": 0.13274287755092073, + "learning_rate": 3.899512227429486e-06, + "loss": 2.7276, + "step": 42823 + }, + { + "epoch": 2.65838972003228, + "grad_norm": 0.13319672376180505, + "learning_rate": 3.898114085990045e-06, + "loss": 2.6811, + "step": 42824 + }, + { + "epoch": 2.658451797132038, + "grad_norm": 0.13657245983755073, + "learning_rate": 3.8967161850753445e-06, + "loss": 2.6993, + "step": 42825 + }, + { + "epoch": 2.658513874231796, + "grad_norm": 0.21356324148900624, + "learning_rate": 3.895318524692665e-06, + "loss": 2.6733, + "step": 42826 + }, + { + "epoch": 2.658575951331554, + "grad_norm": 0.13156249334013642, + "learning_rate": 3.893921104849308e-06, + "loss": 2.7088, + "step": 42827 + }, + { + "epoch": 2.658638028431312, + "grad_norm": 0.1432493296793954, + "learning_rate": 3.892523925552549e-06, + "loss": 2.673, + "step": 42828 + }, + { + "epoch": 2.6587001055310697, + "grad_norm": 0.15879800590676216, + "learning_rate": 3.891126986809679e-06, + "loss": 2.7944, + "step": 42829 + }, + { + "epoch": 2.6587621826308276, + "grad_norm": 0.13228323607978082, + "learning_rate": 3.889730288628002e-06, + "loss": 2.6769, + "step": 42830 + }, + { + "epoch": 2.6588242597305856, + "grad_norm": 0.15203423112924125, + "learning_rate": 3.888333831014801e-06, + "loss": 2.7241, + "step": 42831 + }, + { + "epoch": 2.658886336830343, + "grad_norm": 0.12936455001413905, + "learning_rate": 3.886937613977348e-06, + "loss": 2.6764, + "step": 42832 + }, + { + "epoch": 2.6589484139301014, + "grad_norm": 0.14950434132718676, + "learning_rate": 3.885541637522932e-06, + "loss": 2.6611, + "step": 42833 + }, + { + "epoch": 2.659010491029859, + "grad_norm": 0.1385816697099472, + "learning_rate": 3.884145901658848e-06, + "loss": 2.5783, + "step": 42834 + }, + { + "epoch": 2.6590725681296172, + "grad_norm": 0.1342720793645775, + "learning_rate": 3.882750406392371e-06, + "loss": 2.6771, + "step": 42835 + }, + { + "epoch": 2.6591346452293747, + "grad_norm": 0.14059220037477543, + "learning_rate": 3.881355151730775e-06, + "loss": 2.679, + "step": 42836 + }, + { + "epoch": 2.6591967223291326, + "grad_norm": 0.15203958493405584, + "learning_rate": 3.879960137681343e-06, + "loss": 2.763, + "step": 42837 + }, + { + "epoch": 2.6592587994288905, + "grad_norm": 0.13592534778252993, + "learning_rate": 3.878565364251358e-06, + "loss": 2.7078, + "step": 42838 + }, + { + "epoch": 2.6593208765286485, + "grad_norm": 0.14064715131499883, + "learning_rate": 3.877170831448096e-06, + "loss": 2.8163, + "step": 42839 + }, + { + "epoch": 2.6593829536284064, + "grad_norm": 0.13318341716279702, + "learning_rate": 3.8757765392788305e-06, + "loss": 2.6763, + "step": 42840 + }, + { + "epoch": 2.6594450307281643, + "grad_norm": 0.1370705234333545, + "learning_rate": 3.874382487750838e-06, + "loss": 2.7237, + "step": 42841 + }, + { + "epoch": 2.659507107827922, + "grad_norm": 0.13192398521961243, + "learning_rate": 3.872988676871381e-06, + "loss": 2.6176, + "step": 42842 + }, + { + "epoch": 2.65956918492768, + "grad_norm": 0.13808764708889182, + "learning_rate": 3.8715951066477356e-06, + "loss": 2.6887, + "step": 42843 + }, + { + "epoch": 2.659631262027438, + "grad_norm": 0.1313249291820288, + "learning_rate": 3.87020177708719e-06, + "loss": 2.6607, + "step": 42844 + }, + { + "epoch": 2.659693339127196, + "grad_norm": 0.13460843967637506, + "learning_rate": 3.868808688196996e-06, + "loss": 2.7896, + "step": 42845 + }, + { + "epoch": 2.659755416226954, + "grad_norm": 0.1313163337088879, + "learning_rate": 3.8674158399844285e-06, + "loss": 2.7614, + "step": 42846 + }, + { + "epoch": 2.659817493326712, + "grad_norm": 0.15097983234610216, + "learning_rate": 3.866023232456739e-06, + "loss": 2.6608, + "step": 42847 + }, + { + "epoch": 2.6598795704264697, + "grad_norm": 0.1333855928573527, + "learning_rate": 3.8646308656212214e-06, + "loss": 2.7032, + "step": 42848 + }, + { + "epoch": 2.6599416475262276, + "grad_norm": 0.1415853162193128, + "learning_rate": 3.86323873948512e-06, + "loss": 2.6926, + "step": 42849 + }, + { + "epoch": 2.6600037246259856, + "grad_norm": 0.14983172415168727, + "learning_rate": 3.861846854055701e-06, + "loss": 2.7796, + "step": 42850 + }, + { + "epoch": 2.6600658017257435, + "grad_norm": 0.13302967603243857, + "learning_rate": 3.8604552093402304e-06, + "loss": 2.6411, + "step": 42851 + }, + { + "epoch": 2.6601278788255014, + "grad_norm": 0.13628043348876465, + "learning_rate": 3.859063805345953e-06, + "loss": 2.5815, + "step": 42852 + }, + { + "epoch": 2.6601899559252593, + "grad_norm": 0.13206928935383797, + "learning_rate": 3.857672642080151e-06, + "loss": 2.6723, + "step": 42853 + }, + { + "epoch": 2.6602520330250172, + "grad_norm": 0.1358321392690895, + "learning_rate": 3.856281719550075e-06, + "loss": 2.7481, + "step": 42854 + }, + { + "epoch": 2.6603141101247747, + "grad_norm": 0.14979441233264215, + "learning_rate": 3.854891037762981e-06, + "loss": 2.6602, + "step": 42855 + }, + { + "epoch": 2.660376187224533, + "grad_norm": 0.1496561184064279, + "learning_rate": 3.853500596726106e-06, + "loss": 2.7743, + "step": 42856 + }, + { + "epoch": 2.6604382643242905, + "grad_norm": 0.14739475280738595, + "learning_rate": 3.8521103964467344e-06, + "loss": 2.601, + "step": 42857 + }, + { + "epoch": 2.660500341424049, + "grad_norm": 0.13504212114193287, + "learning_rate": 3.850720436932104e-06, + "loss": 2.6658, + "step": 42858 + }, + { + "epoch": 2.6605624185238064, + "grad_norm": 0.1343779169919314, + "learning_rate": 3.84933071818947e-06, + "loss": 2.6924, + "step": 42859 + }, + { + "epoch": 2.6606244956235647, + "grad_norm": 0.1286058465753785, + "learning_rate": 3.847941240226072e-06, + "loss": 2.652, + "step": 42860 + }, + { + "epoch": 2.660686572723322, + "grad_norm": 0.1368029879865878, + "learning_rate": 3.8465520030491756e-06, + "loss": 2.7806, + "step": 42861 + }, + { + "epoch": 2.66074864982308, + "grad_norm": 0.1408084295605591, + "learning_rate": 3.845163006666025e-06, + "loss": 2.648, + "step": 42862 + }, + { + "epoch": 2.660810726922838, + "grad_norm": 0.13742151160568933, + "learning_rate": 3.843774251083859e-06, + "loss": 2.6565, + "step": 42863 + }, + { + "epoch": 2.660872804022596, + "grad_norm": 0.14558184743611327, + "learning_rate": 3.842385736309933e-06, + "loss": 2.7912, + "step": 42864 + }, + { + "epoch": 2.660934881122354, + "grad_norm": 0.130919293307718, + "learning_rate": 3.84099746235147e-06, + "loss": 2.5743, + "step": 42865 + }, + { + "epoch": 2.660996958222112, + "grad_norm": 0.1573544035830693, + "learning_rate": 3.839609429215746e-06, + "loss": 2.6764, + "step": 42866 + }, + { + "epoch": 2.6610590353218697, + "grad_norm": 0.1502452246546574, + "learning_rate": 3.838221636909978e-06, + "loss": 2.6704, + "step": 42867 + }, + { + "epoch": 2.6611211124216276, + "grad_norm": 0.1314945038210743, + "learning_rate": 3.836834085441415e-06, + "loss": 2.7827, + "step": 42868 + }, + { + "epoch": 2.6611831895213856, + "grad_norm": 0.13564338793454686, + "learning_rate": 3.835446774817292e-06, + "loss": 2.7652, + "step": 42869 + }, + { + "epoch": 2.6612452666211435, + "grad_norm": 0.13369947761597295, + "learning_rate": 3.834059705044851e-06, + "loss": 2.6276, + "step": 42870 + }, + { + "epoch": 2.6613073437209014, + "grad_norm": 0.13247440584648448, + "learning_rate": 3.8326728761313325e-06, + "loss": 2.6867, + "step": 42871 + }, + { + "epoch": 2.6613694208206593, + "grad_norm": 0.13041050633098833, + "learning_rate": 3.8312862880839624e-06, + "loss": 2.7029, + "step": 42872 + }, + { + "epoch": 2.661431497920417, + "grad_norm": 0.14435393823847914, + "learning_rate": 3.829899940909976e-06, + "loss": 2.7003, + "step": 42873 + }, + { + "epoch": 2.661493575020175, + "grad_norm": 0.14165040850844693, + "learning_rate": 3.828513834616609e-06, + "loss": 2.6699, + "step": 42874 + }, + { + "epoch": 2.661555652119933, + "grad_norm": 0.14542745765249132, + "learning_rate": 3.827127969211103e-06, + "loss": 2.6232, + "step": 42875 + }, + { + "epoch": 2.661617729219691, + "grad_norm": 0.13420426894551013, + "learning_rate": 3.825742344700678e-06, + "loss": 2.7199, + "step": 42876 + }, + { + "epoch": 2.661679806319449, + "grad_norm": 0.1476507039517106, + "learning_rate": 3.824356961092562e-06, + "loss": 2.6852, + "step": 42877 + }, + { + "epoch": 2.661741883419207, + "grad_norm": 0.1387838260984478, + "learning_rate": 3.822971818393989e-06, + "loss": 2.595, + "step": 42878 + }, + { + "epoch": 2.6618039605189647, + "grad_norm": 0.15083314763456404, + "learning_rate": 3.821586916612174e-06, + "loss": 2.6924, + "step": 42879 + }, + { + "epoch": 2.661866037618722, + "grad_norm": 0.13053474441820886, + "learning_rate": 3.820202255754357e-06, + "loss": 2.6784, + "step": 42880 + }, + { + "epoch": 2.6619281147184806, + "grad_norm": 0.1291953587261781, + "learning_rate": 3.818817835827754e-06, + "loss": 2.7115, + "step": 42881 + }, + { + "epoch": 2.661990191818238, + "grad_norm": 0.14220823356875656, + "learning_rate": 3.817433656839586e-06, + "loss": 2.7806, + "step": 42882 + }, + { + "epoch": 2.6620522689179964, + "grad_norm": 0.15573235835969151, + "learning_rate": 3.816049718797076e-06, + "loss": 2.7234, + "step": 42883 + }, + { + "epoch": 2.662114346017754, + "grad_norm": 0.1583543631480873, + "learning_rate": 3.8146660217074525e-06, + "loss": 2.7664, + "step": 42884 + }, + { + "epoch": 2.662176423117512, + "grad_norm": 0.14374887217203627, + "learning_rate": 3.8132825655779303e-06, + "loss": 2.8094, + "step": 42885 + }, + { + "epoch": 2.6622385002172697, + "grad_norm": 0.13463437558355137, + "learning_rate": 3.8118993504157207e-06, + "loss": 2.7039, + "step": 42886 + }, + { + "epoch": 2.6623005773170276, + "grad_norm": 0.141284581261765, + "learning_rate": 3.8105163762280406e-06, + "loss": 2.6822, + "step": 42887 + }, + { + "epoch": 2.6623626544167855, + "grad_norm": 0.14627106712493593, + "learning_rate": 3.809133643022111e-06, + "loss": 2.6844, + "step": 42888 + }, + { + "epoch": 2.6624247315165435, + "grad_norm": 0.14053321200505384, + "learning_rate": 3.80775115080515e-06, + "loss": 2.6445, + "step": 42889 + }, + { + "epoch": 2.6624868086163014, + "grad_norm": 0.13496583082323885, + "learning_rate": 3.8063688995843672e-06, + "loss": 2.6442, + "step": 42890 + }, + { + "epoch": 2.6625488857160593, + "grad_norm": 0.1313881510314403, + "learning_rate": 3.8049868893669683e-06, + "loss": 2.6601, + "step": 42891 + }, + { + "epoch": 2.662610962815817, + "grad_norm": 0.1387212826462604, + "learning_rate": 3.8036051201601585e-06, + "loss": 2.6492, + "step": 42892 + }, + { + "epoch": 2.662673039915575, + "grad_norm": 0.13630422694214303, + "learning_rate": 3.80222359197116e-06, + "loss": 2.7094, + "step": 42893 + }, + { + "epoch": 2.662735117015333, + "grad_norm": 0.13290473596197985, + "learning_rate": 3.800842304807184e-06, + "loss": 2.6471, + "step": 42894 + }, + { + "epoch": 2.662797194115091, + "grad_norm": 0.15125258013991522, + "learning_rate": 3.799461258675424e-06, + "loss": 2.7069, + "step": 42895 + }, + { + "epoch": 2.662859271214849, + "grad_norm": 0.1334098258121209, + "learning_rate": 3.7980804535830804e-06, + "loss": 2.7815, + "step": 42896 + }, + { + "epoch": 2.662921348314607, + "grad_norm": 0.13711046913992841, + "learning_rate": 3.7966998895373805e-06, + "loss": 2.7398, + "step": 42897 + }, + { + "epoch": 2.6629834254143647, + "grad_norm": 0.13555540640638547, + "learning_rate": 3.7953195665455078e-06, + "loss": 2.7258, + "step": 42898 + }, + { + "epoch": 2.6630455025141226, + "grad_norm": 0.14414814487473265, + "learning_rate": 3.7939394846146726e-06, + "loss": 2.7005, + "step": 42899 + }, + { + "epoch": 2.6631075796138806, + "grad_norm": 0.14023148924214696, + "learning_rate": 3.7925596437520748e-06, + "loss": 2.603, + "step": 42900 + }, + { + "epoch": 2.6631696567136385, + "grad_norm": 0.13334907104204732, + "learning_rate": 3.791180043964898e-06, + "loss": 2.6166, + "step": 42901 + }, + { + "epoch": 2.6632317338133964, + "grad_norm": 0.1432398893215135, + "learning_rate": 3.789800685260364e-06, + "loss": 2.7002, + "step": 42902 + }, + { + "epoch": 2.663293810913154, + "grad_norm": 0.13942513251718136, + "learning_rate": 3.788421567645656e-06, + "loss": 2.7477, + "step": 42903 + }, + { + "epoch": 2.6633558880129122, + "grad_norm": 0.1341164266280624, + "learning_rate": 3.787042691127968e-06, + "loss": 2.681, + "step": 42904 + }, + { + "epoch": 2.6634179651126697, + "grad_norm": 0.14124376540645392, + "learning_rate": 3.785664055714494e-06, + "loss": 2.7614, + "step": 42905 + }, + { + "epoch": 2.663480042212428, + "grad_norm": 0.14154336983218715, + "learning_rate": 3.7842856614124346e-06, + "loss": 2.7161, + "step": 42906 + }, + { + "epoch": 2.6635421193121855, + "grad_norm": 0.1438145070360761, + "learning_rate": 3.7829075082289665e-06, + "loss": 2.6267, + "step": 42907 + }, + { + "epoch": 2.663604196411944, + "grad_norm": 0.13077739738386457, + "learning_rate": 3.7815295961713017e-06, + "loss": 2.7079, + "step": 42908 + }, + { + "epoch": 2.6636662735117014, + "grad_norm": 0.13800209446022527, + "learning_rate": 3.780151925246611e-06, + "loss": 2.6756, + "step": 42909 + }, + { + "epoch": 2.6637283506114593, + "grad_norm": 0.14513060301101804, + "learning_rate": 3.778774495462084e-06, + "loss": 2.7684, + "step": 42910 + }, + { + "epoch": 2.663790427711217, + "grad_norm": 0.12928962773644326, + "learning_rate": 3.77739730682492e-06, + "loss": 2.6287, + "step": 42911 + }, + { + "epoch": 2.663852504810975, + "grad_norm": 0.134328804558844, + "learning_rate": 3.7760203593422972e-06, + "loss": 2.6347, + "step": 42912 + }, + { + "epoch": 2.663914581910733, + "grad_norm": 0.13714194875397515, + "learning_rate": 3.7746436530213923e-06, + "loss": 2.742, + "step": 42913 + }, + { + "epoch": 2.663976659010491, + "grad_norm": 0.1412605388473831, + "learning_rate": 3.7732671878693946e-06, + "loss": 2.7423, + "step": 42914 + }, + { + "epoch": 2.664038736110249, + "grad_norm": 0.12953828707401321, + "learning_rate": 3.7718909638934764e-06, + "loss": 2.6035, + "step": 42915 + }, + { + "epoch": 2.664100813210007, + "grad_norm": 0.13353054101703526, + "learning_rate": 3.770514981100831e-06, + "loss": 2.7136, + "step": 42916 + }, + { + "epoch": 2.6641628903097647, + "grad_norm": 0.13074340236203108, + "learning_rate": 3.769139239498637e-06, + "loss": 2.7528, + "step": 42917 + }, + { + "epoch": 2.6642249674095226, + "grad_norm": 0.14918815139289499, + "learning_rate": 3.7677637390940656e-06, + "loss": 2.6882, + "step": 42918 + }, + { + "epoch": 2.6642870445092806, + "grad_norm": 0.13122101531829455, + "learning_rate": 3.766388479894278e-06, + "loss": 2.6795, + "step": 42919 + }, + { + "epoch": 2.6643491216090385, + "grad_norm": 0.14571780854349908, + "learning_rate": 3.7650134619064802e-06, + "loss": 2.6601, + "step": 42920 + }, + { + "epoch": 2.6644111987087964, + "grad_norm": 0.14732605405147547, + "learning_rate": 3.7636386851378325e-06, + "loss": 2.6339, + "step": 42921 + }, + { + "epoch": 2.6644732758085543, + "grad_norm": 0.13100237648024085, + "learning_rate": 3.7622641495955013e-06, + "loss": 2.7235, + "step": 42922 + }, + { + "epoch": 2.6645353529083122, + "grad_norm": 0.1338592261781886, + "learning_rate": 3.760889855286659e-06, + "loss": 2.7485, + "step": 42923 + }, + { + "epoch": 2.66459743000807, + "grad_norm": 0.1336640588447222, + "learning_rate": 3.759515802218483e-06, + "loss": 2.7494, + "step": 42924 + }, + { + "epoch": 2.664659507107828, + "grad_norm": 0.13643898268596058, + "learning_rate": 3.7581419903981395e-06, + "loss": 2.7106, + "step": 42925 + }, + { + "epoch": 2.664721584207586, + "grad_norm": 0.1364246611388746, + "learning_rate": 3.756768419832801e-06, + "loss": 2.783, + "step": 42926 + }, + { + "epoch": 2.664783661307344, + "grad_norm": 0.1331824731339093, + "learning_rate": 3.7553950905296178e-06, + "loss": 2.8081, + "step": 42927 + }, + { + "epoch": 2.6648457384071014, + "grad_norm": 0.1320803249166443, + "learning_rate": 3.7540220024957662e-06, + "loss": 2.6778, + "step": 42928 + }, + { + "epoch": 2.6649078155068597, + "grad_norm": 0.14062822523657076, + "learning_rate": 3.752649155738408e-06, + "loss": 2.8011, + "step": 42929 + }, + { + "epoch": 2.664969892606617, + "grad_norm": 0.1325959656355186, + "learning_rate": 3.7512765502647095e-06, + "loss": 2.7379, + "step": 42930 + }, + { + "epoch": 2.6650319697063756, + "grad_norm": 0.1334049809695533, + "learning_rate": 3.749904186081832e-06, + "loss": 2.7066, + "step": 42931 + }, + { + "epoch": 2.665094046806133, + "grad_norm": 0.1369649849525844, + "learning_rate": 3.7485320631969188e-06, + "loss": 2.7439, + "step": 42932 + }, + { + "epoch": 2.665156123905891, + "grad_norm": 0.1310439800430726, + "learning_rate": 3.747160181617149e-06, + "loss": 2.6942, + "step": 42933 + }, + { + "epoch": 2.665218201005649, + "grad_norm": 0.14274275433061018, + "learning_rate": 3.7457885413496764e-06, + "loss": 2.7663, + "step": 42934 + }, + { + "epoch": 2.665280278105407, + "grad_norm": 0.13965451287345623, + "learning_rate": 3.744417142401652e-06, + "loss": 2.6759, + "step": 42935 + }, + { + "epoch": 2.6653423552051647, + "grad_norm": 0.1622012464030339, + "learning_rate": 3.743045984780236e-06, + "loss": 2.7124, + "step": 42936 + }, + { + "epoch": 2.6654044323049226, + "grad_norm": 0.13952322328084235, + "learning_rate": 3.7416750684925673e-06, + "loss": 2.6659, + "step": 42937 + }, + { + "epoch": 2.6654665094046806, + "grad_norm": 0.14037621531890315, + "learning_rate": 3.7403043935458125e-06, + "loss": 2.685, + "step": 42938 + }, + { + "epoch": 2.6655285865044385, + "grad_norm": 0.1320681540135954, + "learning_rate": 3.738933959947127e-06, + "loss": 2.6906, + "step": 42939 + }, + { + "epoch": 2.6655906636041964, + "grad_norm": 0.14244996150174596, + "learning_rate": 3.7375637677036446e-06, + "loss": 2.6597, + "step": 42940 + }, + { + "epoch": 2.6656527407039543, + "grad_norm": 0.13626820714940677, + "learning_rate": 3.7361938168225252e-06, + "loss": 2.6871, + "step": 42941 + }, + { + "epoch": 2.6657148178037122, + "grad_norm": 0.1347652034476501, + "learning_rate": 3.734824107310908e-06, + "loss": 2.7709, + "step": 42942 + }, + { + "epoch": 2.66577689490347, + "grad_norm": 0.1368777780708154, + "learning_rate": 3.733454639175954e-06, + "loss": 2.807, + "step": 42943 + }, + { + "epoch": 2.665838972003228, + "grad_norm": 0.13796416204822998, + "learning_rate": 3.7320854124248017e-06, + "loss": 2.7346, + "step": 42944 + }, + { + "epoch": 2.665901049102986, + "grad_norm": 0.14677665796582895, + "learning_rate": 3.73071642706459e-06, + "loss": 2.6901, + "step": 42945 + }, + { + "epoch": 2.665963126202744, + "grad_norm": 0.12943216172533223, + "learning_rate": 3.729347683102452e-06, + "loss": 2.7344, + "step": 42946 + }, + { + "epoch": 2.666025203302502, + "grad_norm": 0.13305125094355744, + "learning_rate": 3.727979180545549e-06, + "loss": 2.6612, + "step": 42947 + }, + { + "epoch": 2.6660872804022597, + "grad_norm": 0.1344177760193524, + "learning_rate": 3.7266109194010133e-06, + "loss": 2.7379, + "step": 42948 + }, + { + "epoch": 2.6661493575020176, + "grad_norm": 0.13746420762903305, + "learning_rate": 3.725242899675979e-06, + "loss": 2.7673, + "step": 42949 + }, + { + "epoch": 2.6662114346017756, + "grad_norm": 0.14937423973209243, + "learning_rate": 3.7238751213775847e-06, + "loss": 2.7106, + "step": 42950 + }, + { + "epoch": 2.666273511701533, + "grad_norm": 0.13852616835312995, + "learning_rate": 3.722507584512963e-06, + "loss": 2.6633, + "step": 42951 + }, + { + "epoch": 2.6663355888012914, + "grad_norm": 0.1324950730446903, + "learning_rate": 3.721140289089259e-06, + "loss": 2.6782, + "step": 42952 + }, + { + "epoch": 2.666397665901049, + "grad_norm": 0.15146274626685802, + "learning_rate": 3.719773235113605e-06, + "loss": 2.7002, + "step": 42953 + }, + { + "epoch": 2.6664597430008072, + "grad_norm": 0.14754976035337464, + "learning_rate": 3.7184064225931294e-06, + "loss": 2.6516, + "step": 42954 + }, + { + "epoch": 2.6665218201005647, + "grad_norm": 0.14080685539335583, + "learning_rate": 3.717039851534948e-06, + "loss": 2.701, + "step": 42955 + }, + { + "epoch": 2.666583897200323, + "grad_norm": 0.1429607600183843, + "learning_rate": 3.715673521946217e-06, + "loss": 2.6813, + "step": 42956 + }, + { + "epoch": 2.6666459743000805, + "grad_norm": 0.13498492296895626, + "learning_rate": 3.714307433834052e-06, + "loss": 2.7784, + "step": 42957 + }, + { + "epoch": 2.6667080513998385, + "grad_norm": 0.13169809128517565, + "learning_rate": 3.7129415872055874e-06, + "loss": 2.6861, + "step": 42958 + }, + { + "epoch": 2.6667701284995964, + "grad_norm": 0.1364070950533285, + "learning_rate": 3.7115759820679275e-06, + "loss": 2.7486, + "step": 42959 + }, + { + "epoch": 2.6668322055993543, + "grad_norm": 0.13220548757631628, + "learning_rate": 3.7102106184282224e-06, + "loss": 2.6939, + "step": 42960 + }, + { + "epoch": 2.666894282699112, + "grad_norm": 0.12793196610612753, + "learning_rate": 3.708845496293589e-06, + "loss": 2.5475, + "step": 42961 + }, + { + "epoch": 2.66695635979887, + "grad_norm": 0.13022910826570924, + "learning_rate": 3.7074806156711438e-06, + "loss": 2.6407, + "step": 42962 + }, + { + "epoch": 2.667018436898628, + "grad_norm": 0.13292825086581256, + "learning_rate": 3.7061159765680088e-06, + "loss": 2.7327, + "step": 42963 + }, + { + "epoch": 2.667080513998386, + "grad_norm": 0.1421306852916337, + "learning_rate": 3.7047515789913e-06, + "loss": 2.7168, + "step": 42964 + }, + { + "epoch": 2.667142591098144, + "grad_norm": 0.13596029391961442, + "learning_rate": 3.7033874229481457e-06, + "loss": 2.7257, + "step": 42965 + }, + { + "epoch": 2.667204668197902, + "grad_norm": 0.145825257389931, + "learning_rate": 3.702023508445662e-06, + "loss": 2.7213, + "step": 42966 + }, + { + "epoch": 2.6672667452976597, + "grad_norm": 0.13219028465458224, + "learning_rate": 3.7006598354909548e-06, + "loss": 2.7652, + "step": 42967 + }, + { + "epoch": 2.6673288223974176, + "grad_norm": 0.13658011764741598, + "learning_rate": 3.6992964040911403e-06, + "loss": 2.6616, + "step": 42968 + }, + { + "epoch": 2.6673908994971756, + "grad_norm": 0.13772255377799875, + "learning_rate": 3.697933214253346e-06, + "loss": 2.6727, + "step": 42969 + }, + { + "epoch": 2.6674529765969335, + "grad_norm": 0.1317526448240411, + "learning_rate": 3.6965702659846723e-06, + "loss": 2.6712, + "step": 42970 + }, + { + "epoch": 2.6675150536966914, + "grad_norm": 0.13749221947721255, + "learning_rate": 3.6952075592922355e-06, + "loss": 2.7217, + "step": 42971 + }, + { + "epoch": 2.6675771307964493, + "grad_norm": 0.1422988233109323, + "learning_rate": 3.6938450941831294e-06, + "loss": 2.7887, + "step": 42972 + }, + { + "epoch": 2.6676392078962072, + "grad_norm": 0.12795674423752484, + "learning_rate": 3.6924828706644767e-06, + "loss": 2.6346, + "step": 42973 + }, + { + "epoch": 2.667701284995965, + "grad_norm": 0.13076345287309843, + "learning_rate": 3.691120888743388e-06, + "loss": 2.6819, + "step": 42974 + }, + { + "epoch": 2.667763362095723, + "grad_norm": 0.13395034508027043, + "learning_rate": 3.689759148426969e-06, + "loss": 2.6482, + "step": 42975 + }, + { + "epoch": 2.6678254391954805, + "grad_norm": 0.1294342124121752, + "learning_rate": 3.6883976497223137e-06, + "loss": 2.6187, + "step": 42976 + }, + { + "epoch": 2.667887516295239, + "grad_norm": 0.12935503950800892, + "learning_rate": 3.6870363926365337e-06, + "loss": 2.6756, + "step": 42977 + }, + { + "epoch": 2.6679495933949964, + "grad_norm": 0.13689979419015316, + "learning_rate": 3.685675377176717e-06, + "loss": 2.6133, + "step": 42978 + }, + { + "epoch": 2.6680116704947547, + "grad_norm": 0.14663115459136766, + "learning_rate": 3.684314603349981e-06, + "loss": 2.6938, + "step": 42979 + }, + { + "epoch": 2.668073747594512, + "grad_norm": 0.13138345138458887, + "learning_rate": 3.682954071163425e-06, + "loss": 2.6742, + "step": 42980 + }, + { + "epoch": 2.66813582469427, + "grad_norm": 0.14346550746985118, + "learning_rate": 3.6815937806241374e-06, + "loss": 2.716, + "step": 42981 + }, + { + "epoch": 2.668197901794028, + "grad_norm": 0.1323776067034945, + "learning_rate": 3.6802337317392133e-06, + "loss": 2.6019, + "step": 42982 + }, + { + "epoch": 2.668259978893786, + "grad_norm": 0.13687117118357497, + "learning_rate": 3.678873924515758e-06, + "loss": 2.6567, + "step": 42983 + }, + { + "epoch": 2.668322055993544, + "grad_norm": 0.14042221786942755, + "learning_rate": 3.677514358960865e-06, + "loss": 2.6725, + "step": 42984 + }, + { + "epoch": 2.668384133093302, + "grad_norm": 0.14466322209072435, + "learning_rate": 3.67615503508163e-06, + "loss": 2.6635, + "step": 42985 + }, + { + "epoch": 2.6684462101930597, + "grad_norm": 0.1341760426655105, + "learning_rate": 3.674795952885135e-06, + "loss": 2.6506, + "step": 42986 + }, + { + "epoch": 2.6685082872928176, + "grad_norm": 0.15249360490311517, + "learning_rate": 3.673437112378464e-06, + "loss": 2.7073, + "step": 42987 + }, + { + "epoch": 2.6685703643925756, + "grad_norm": 0.13506486399583956, + "learning_rate": 3.672078513568733e-06, + "loss": 2.7133, + "step": 42988 + }, + { + "epoch": 2.6686324414923335, + "grad_norm": 0.12872902651896115, + "learning_rate": 3.670720156463009e-06, + "loss": 2.674, + "step": 42989 + }, + { + "epoch": 2.6686945185920914, + "grad_norm": 0.13767031855358747, + "learning_rate": 3.669362041068386e-06, + "loss": 2.6064, + "step": 42990 + }, + { + "epoch": 2.6687565956918493, + "grad_norm": 0.14810978325414184, + "learning_rate": 3.668004167391942e-06, + "loss": 2.7256, + "step": 42991 + }, + { + "epoch": 2.6688186727916072, + "grad_norm": 0.13918082769688642, + "learning_rate": 3.6666465354407766e-06, + "loss": 2.6598, + "step": 42992 + }, + { + "epoch": 2.668880749891365, + "grad_norm": 0.13187032180551606, + "learning_rate": 3.665289145221962e-06, + "loss": 2.6799, + "step": 42993 + }, + { + "epoch": 2.668942826991123, + "grad_norm": 0.13545970202350693, + "learning_rate": 3.6639319967425813e-06, + "loss": 2.6753, + "step": 42994 + }, + { + "epoch": 2.669004904090881, + "grad_norm": 0.14571720137111707, + "learning_rate": 3.662575090009712e-06, + "loss": 2.7145, + "step": 42995 + }, + { + "epoch": 2.669066981190639, + "grad_norm": 0.1408597589340521, + "learning_rate": 3.661218425030438e-06, + "loss": 2.7433, + "step": 42996 + }, + { + "epoch": 2.669129058290397, + "grad_norm": 0.1842296726686899, + "learning_rate": 3.6598620018118425e-06, + "loss": 2.7249, + "step": 42997 + }, + { + "epoch": 2.6691911353901547, + "grad_norm": 0.15146652776592387, + "learning_rate": 3.6585058203609967e-06, + "loss": 2.6617, + "step": 42998 + }, + { + "epoch": 2.669253212489912, + "grad_norm": 0.16052738660790225, + "learning_rate": 3.6571498806849737e-06, + "loss": 2.6438, + "step": 42999 + }, + { + "epoch": 2.6693152895896706, + "grad_norm": 0.15511569521706603, + "learning_rate": 3.655794182790839e-06, + "loss": 2.5869, + "step": 43000 + }, + { + "epoch": 2.669377366689428, + "grad_norm": 0.13635934865642463, + "learning_rate": 3.654438726685688e-06, + "loss": 2.7645, + "step": 43001 + }, + { + "epoch": 2.6694394437891864, + "grad_norm": 0.1304426675411726, + "learning_rate": 3.6530835123765817e-06, + "loss": 2.6423, + "step": 43002 + }, + { + "epoch": 2.669501520888944, + "grad_norm": 0.16486476972089018, + "learning_rate": 3.6517285398705915e-06, + "loss": 2.7714, + "step": 43003 + }, + { + "epoch": 2.6695635979887022, + "grad_norm": 0.1365854499025745, + "learning_rate": 3.650373809174773e-06, + "loss": 2.6831, + "step": 43004 + }, + { + "epoch": 2.6696256750884597, + "grad_norm": 0.13728455901982728, + "learning_rate": 3.6490193202962044e-06, + "loss": 2.7703, + "step": 43005 + }, + { + "epoch": 2.6696877521882176, + "grad_norm": 0.14290912953647, + "learning_rate": 3.647665073241968e-06, + "loss": 2.7513, + "step": 43006 + }, + { + "epoch": 2.6697498292879756, + "grad_norm": 0.1321159920369078, + "learning_rate": 3.646311068019109e-06, + "loss": 2.7456, + "step": 43007 + }, + { + "epoch": 2.6698119063877335, + "grad_norm": 0.15911294555965796, + "learning_rate": 3.644957304634705e-06, + "loss": 2.7464, + "step": 43008 + }, + { + "epoch": 2.6698739834874914, + "grad_norm": 0.14492286645802052, + "learning_rate": 3.6436037830958003e-06, + "loss": 2.7644, + "step": 43009 + }, + { + "epoch": 2.6699360605872493, + "grad_norm": 0.13625625311649966, + "learning_rate": 3.642250503409472e-06, + "loss": 2.6156, + "step": 43010 + }, + { + "epoch": 2.6699981376870072, + "grad_norm": 0.14039425993275143, + "learning_rate": 3.640897465582782e-06, + "loss": 2.7203, + "step": 43011 + }, + { + "epoch": 2.670060214786765, + "grad_norm": 0.13262771626449466, + "learning_rate": 3.6395446696227854e-06, + "loss": 2.7637, + "step": 43012 + }, + { + "epoch": 2.670122291886523, + "grad_norm": 0.1330362986842566, + "learning_rate": 3.638192115536532e-06, + "loss": 2.6736, + "step": 43013 + }, + { + "epoch": 2.670184368986281, + "grad_norm": 0.13179399741603587, + "learning_rate": 3.6368398033310824e-06, + "loss": 2.6701, + "step": 43014 + }, + { + "epoch": 2.670246446086039, + "grad_norm": 0.13403686490641728, + "learning_rate": 3.6354877330134984e-06, + "loss": 2.7961, + "step": 43015 + }, + { + "epoch": 2.670308523185797, + "grad_norm": 0.1317719480846833, + "learning_rate": 3.634135904590835e-06, + "loss": 2.6323, + "step": 43016 + }, + { + "epoch": 2.6703706002855547, + "grad_norm": 0.13983334790334107, + "learning_rate": 3.632784318070137e-06, + "loss": 2.6901, + "step": 43017 + }, + { + "epoch": 2.6704326773853126, + "grad_norm": 0.1445603083705397, + "learning_rate": 3.631432973458454e-06, + "loss": 2.6961, + "step": 43018 + }, + { + "epoch": 2.6704947544850706, + "grad_norm": 0.14129201004796932, + "learning_rate": 3.630081870762847e-06, + "loss": 2.6597, + "step": 43019 + }, + { + "epoch": 2.6705568315848285, + "grad_norm": 0.14278791617730494, + "learning_rate": 3.6287310099903605e-06, + "loss": 2.7055, + "step": 43020 + }, + { + "epoch": 2.6706189086845864, + "grad_norm": 0.1325637718235895, + "learning_rate": 3.6273803911480443e-06, + "loss": 2.6616, + "step": 43021 + }, + { + "epoch": 2.6706809857843443, + "grad_norm": 0.14206820582122448, + "learning_rate": 3.626030014242937e-06, + "loss": 2.6316, + "step": 43022 + }, + { + "epoch": 2.6707430628841022, + "grad_norm": 0.1327743335187486, + "learning_rate": 3.6246798792820833e-06, + "loss": 2.7047, + "step": 43023 + }, + { + "epoch": 2.6708051399838597, + "grad_norm": 0.14443281404337657, + "learning_rate": 3.623329986272539e-06, + "loss": 2.7201, + "step": 43024 + }, + { + "epoch": 2.670867217083618, + "grad_norm": 0.13346047669279626, + "learning_rate": 3.621980335221342e-06, + "loss": 2.6704, + "step": 43025 + }, + { + "epoch": 2.6709292941833755, + "grad_norm": 0.1356590757543629, + "learning_rate": 3.620630926135532e-06, + "loss": 2.7573, + "step": 43026 + }, + { + "epoch": 2.670991371283134, + "grad_norm": 0.14974872036022568, + "learning_rate": 3.6192817590221417e-06, + "loss": 2.7347, + "step": 43027 + }, + { + "epoch": 2.6710534483828914, + "grad_norm": 0.13223825456548396, + "learning_rate": 3.6179328338882213e-06, + "loss": 2.6924, + "step": 43028 + }, + { + "epoch": 2.6711155254826493, + "grad_norm": 0.15066254172671512, + "learning_rate": 3.6165841507408092e-06, + "loss": 2.7721, + "step": 43029 + }, + { + "epoch": 2.671177602582407, + "grad_norm": 0.13490048861356788, + "learning_rate": 3.615235709586939e-06, + "loss": 2.6588, + "step": 43030 + }, + { + "epoch": 2.671239679682165, + "grad_norm": 0.14934936693551915, + "learning_rate": 3.6138875104336335e-06, + "loss": 2.7497, + "step": 43031 + }, + { + "epoch": 2.671301756781923, + "grad_norm": 0.15659816813435823, + "learning_rate": 3.612539553287947e-06, + "loss": 2.7539, + "step": 43032 + }, + { + "epoch": 2.671363833881681, + "grad_norm": 0.13509675822415199, + "learning_rate": 3.611191838156902e-06, + "loss": 2.7552, + "step": 43033 + }, + { + "epoch": 2.671425910981439, + "grad_norm": 0.1461124913031492, + "learning_rate": 3.609844365047532e-06, + "loss": 2.7644, + "step": 43034 + }, + { + "epoch": 2.671487988081197, + "grad_norm": 0.13042393160147145, + "learning_rate": 3.608497133966865e-06, + "loss": 2.629, + "step": 43035 + }, + { + "epoch": 2.6715500651809547, + "grad_norm": 0.13626592441338292, + "learning_rate": 3.6071501449219225e-06, + "loss": 2.6965, + "step": 43036 + }, + { + "epoch": 2.6716121422807126, + "grad_norm": 0.15904956015544322, + "learning_rate": 3.6058033979197437e-06, + "loss": 2.6843, + "step": 43037 + }, + { + "epoch": 2.6716742193804706, + "grad_norm": 0.1368034713370391, + "learning_rate": 3.6044568929673505e-06, + "loss": 2.7614, + "step": 43038 + }, + { + "epoch": 2.6717362964802285, + "grad_norm": 0.1361226711573151, + "learning_rate": 3.603110630071771e-06, + "loss": 2.7168, + "step": 43039 + }, + { + "epoch": 2.6717983735799864, + "grad_norm": 0.13081254405810566, + "learning_rate": 3.601764609240027e-06, + "loss": 2.764, + "step": 43040 + }, + { + "epoch": 2.6718604506797443, + "grad_norm": 0.15370084833675454, + "learning_rate": 3.6004188304791354e-06, + "loss": 2.7348, + "step": 43041 + }, + { + "epoch": 2.6719225277795022, + "grad_norm": 0.1365324931490849, + "learning_rate": 3.5990732937961292e-06, + "loss": 2.7073, + "step": 43042 + }, + { + "epoch": 2.67198460487926, + "grad_norm": 0.13027770022577154, + "learning_rate": 3.5977279991980197e-06, + "loss": 2.6815, + "step": 43043 + }, + { + "epoch": 2.672046681979018, + "grad_norm": 0.1378269580795558, + "learning_rate": 3.5963829466918342e-06, + "loss": 2.6804, + "step": 43044 + }, + { + "epoch": 2.672108759078776, + "grad_norm": 0.1343725109978749, + "learning_rate": 3.5950381362845677e-06, + "loss": 2.7061, + "step": 43045 + }, + { + "epoch": 2.672170836178534, + "grad_norm": 0.17973205801760322, + "learning_rate": 3.593693567983264e-06, + "loss": 2.6396, + "step": 43046 + }, + { + "epoch": 2.6722329132782914, + "grad_norm": 0.13100024370178093, + "learning_rate": 3.592349241794929e-06, + "loss": 2.6698, + "step": 43047 + }, + { + "epoch": 2.6722949903780497, + "grad_norm": 0.13167097579442594, + "learning_rate": 3.591005157726568e-06, + "loss": 2.6928, + "step": 43048 + }, + { + "epoch": 2.672357067477807, + "grad_norm": 0.14803095407357733, + "learning_rate": 3.5896613157852034e-06, + "loss": 2.7954, + "step": 43049 + }, + { + "epoch": 2.6724191445775656, + "grad_norm": 0.13225147316937852, + "learning_rate": 3.5883177159778347e-06, + "loss": 2.7038, + "step": 43050 + }, + { + "epoch": 2.672481221677323, + "grad_norm": 0.15429656716810977, + "learning_rate": 3.5869743583114846e-06, + "loss": 2.724, + "step": 43051 + }, + { + "epoch": 2.6725432987770814, + "grad_norm": 0.13693825947347657, + "learning_rate": 3.5856312427931583e-06, + "loss": 2.8131, + "step": 43052 + }, + { + "epoch": 2.672605375876839, + "grad_norm": 0.15664588129827786, + "learning_rate": 3.5842883694298614e-06, + "loss": 2.7602, + "step": 43053 + }, + { + "epoch": 2.672667452976597, + "grad_norm": 0.13331149830198172, + "learning_rate": 3.5829457382285936e-06, + "loss": 2.7045, + "step": 43054 + }, + { + "epoch": 2.6727295300763547, + "grad_norm": 0.1615131455617054, + "learning_rate": 3.581603349196372e-06, + "loss": 2.6713, + "step": 43055 + }, + { + "epoch": 2.6727916071761126, + "grad_norm": 0.13622001553153468, + "learning_rate": 3.5802612023401904e-06, + "loss": 2.7648, + "step": 43056 + }, + { + "epoch": 2.6728536842758706, + "grad_norm": 0.1360701982066545, + "learning_rate": 3.5789192976670606e-06, + "loss": 2.6143, + "step": 43057 + }, + { + "epoch": 2.6729157613756285, + "grad_norm": 0.13899207421388715, + "learning_rate": 3.577577635183965e-06, + "loss": 2.6557, + "step": 43058 + }, + { + "epoch": 2.6729778384753864, + "grad_norm": 0.13374525592091088, + "learning_rate": 3.5762362148979325e-06, + "loss": 2.7261, + "step": 43059 + }, + { + "epoch": 2.6730399155751443, + "grad_norm": 0.13867054522239422, + "learning_rate": 3.5748950368159397e-06, + "loss": 2.6906, + "step": 43060 + }, + { + "epoch": 2.6731019926749022, + "grad_norm": 0.13292460058289304, + "learning_rate": 3.5735541009449867e-06, + "loss": 2.7457, + "step": 43061 + }, + { + "epoch": 2.67316406977466, + "grad_norm": 0.1396001785763067, + "learning_rate": 3.5722134072920742e-06, + "loss": 2.698, + "step": 43062 + }, + { + "epoch": 2.673226146874418, + "grad_norm": 0.13263840344128253, + "learning_rate": 3.5708729558641907e-06, + "loss": 2.7602, + "step": 43063 + }, + { + "epoch": 2.673288223974176, + "grad_norm": 0.13039412277220322, + "learning_rate": 3.5695327466683415e-06, + "loss": 2.6395, + "step": 43064 + }, + { + "epoch": 2.673350301073934, + "grad_norm": 0.13433894045069836, + "learning_rate": 3.56819277971151e-06, + "loss": 2.7199, + "step": 43065 + }, + { + "epoch": 2.673412378173692, + "grad_norm": 0.1355787569527858, + "learning_rate": 3.566853055000685e-06, + "loss": 2.7312, + "step": 43066 + }, + { + "epoch": 2.6734744552734497, + "grad_norm": 0.14016770475522336, + "learning_rate": 3.5655135725428555e-06, + "loss": 2.6413, + "step": 43067 + }, + { + "epoch": 2.6735365323732077, + "grad_norm": 0.13780686749736012, + "learning_rate": 3.5641743323450217e-06, + "loss": 2.7319, + "step": 43068 + }, + { + "epoch": 2.6735986094729656, + "grad_norm": 0.1324027416168686, + "learning_rate": 3.562835334414166e-06, + "loss": 2.6758, + "step": 43069 + }, + { + "epoch": 2.6736606865727235, + "grad_norm": 0.14638872838692651, + "learning_rate": 3.5614965787572676e-06, + "loss": 2.7344, + "step": 43070 + }, + { + "epoch": 2.6737227636724814, + "grad_norm": 0.13215434864883713, + "learning_rate": 3.5601580653813085e-06, + "loss": 2.6303, + "step": 43071 + }, + { + "epoch": 2.673784840772239, + "grad_norm": 0.13298766421890482, + "learning_rate": 3.558819794293283e-06, + "loss": 2.6514, + "step": 43072 + }, + { + "epoch": 2.6738469178719972, + "grad_norm": 0.15771959138225947, + "learning_rate": 3.5574817655001645e-06, + "loss": 2.7767, + "step": 43073 + }, + { + "epoch": 2.6739089949717547, + "grad_norm": 0.13755396087570695, + "learning_rate": 3.5561439790089414e-06, + "loss": 2.7197, + "step": 43074 + }, + { + "epoch": 2.673971072071513, + "grad_norm": 0.13613573834683373, + "learning_rate": 3.5548064348265962e-06, + "loss": 2.6863, + "step": 43075 + }, + { + "epoch": 2.6740331491712706, + "grad_norm": 0.1422652377898945, + "learning_rate": 3.553469132960097e-06, + "loss": 2.7776, + "step": 43076 + }, + { + "epoch": 2.6740952262710285, + "grad_norm": 0.14711418181480748, + "learning_rate": 3.552132073416414e-06, + "loss": 2.7021, + "step": 43077 + }, + { + "epoch": 2.6741573033707864, + "grad_norm": 0.14034228914554248, + "learning_rate": 3.550795256202544e-06, + "loss": 2.7304, + "step": 43078 + }, + { + "epoch": 2.6742193804705443, + "grad_norm": 0.13603983978512105, + "learning_rate": 3.5494586813254517e-06, + "loss": 2.6874, + "step": 43079 + }, + { + "epoch": 2.6742814575703022, + "grad_norm": 0.13277404593103564, + "learning_rate": 3.54812234879211e-06, + "loss": 2.6841, + "step": 43080 + }, + { + "epoch": 2.67434353467006, + "grad_norm": 0.14394559640222346, + "learning_rate": 3.546786258609486e-06, + "loss": 2.7628, + "step": 43081 + }, + { + "epoch": 2.674405611769818, + "grad_norm": 0.1497733775994311, + "learning_rate": 3.5454504107845567e-06, + "loss": 2.7166, + "step": 43082 + }, + { + "epoch": 2.674467688869576, + "grad_norm": 0.13168268126210933, + "learning_rate": 3.5441148053242945e-06, + "loss": 2.6581, + "step": 43083 + }, + { + "epoch": 2.674529765969334, + "grad_norm": 0.13002801966854635, + "learning_rate": 3.542779442235661e-06, + "loss": 2.6447, + "step": 43084 + }, + { + "epoch": 2.674591843069092, + "grad_norm": 0.13124514411844532, + "learning_rate": 3.5414443215256223e-06, + "loss": 2.7096, + "step": 43085 + }, + { + "epoch": 2.6746539201688497, + "grad_norm": 0.1345744712308792, + "learning_rate": 3.540109443201145e-06, + "loss": 2.7193, + "step": 43086 + }, + { + "epoch": 2.6747159972686076, + "grad_norm": 0.13532719881379224, + "learning_rate": 3.538774807269196e-06, + "loss": 2.7559, + "step": 43087 + }, + { + "epoch": 2.6747780743683656, + "grad_norm": 0.13894641590511433, + "learning_rate": 3.5374404137367422e-06, + "loss": 2.6987, + "step": 43088 + }, + { + "epoch": 2.6748401514681235, + "grad_norm": 0.15661456374077462, + "learning_rate": 3.5361062626107387e-06, + "loss": 2.7134, + "step": 43089 + }, + { + "epoch": 2.6749022285678814, + "grad_norm": 0.1624839545544477, + "learning_rate": 3.534772353898147e-06, + "loss": 2.6521, + "step": 43090 + }, + { + "epoch": 2.6749643056676393, + "grad_norm": 0.13545661499028122, + "learning_rate": 3.5334386876059277e-06, + "loss": 2.7033, + "step": 43091 + }, + { + "epoch": 2.6750263827673972, + "grad_norm": 0.13871887914793804, + "learning_rate": 3.532105263741042e-06, + "loss": 2.6675, + "step": 43092 + }, + { + "epoch": 2.675088459867155, + "grad_norm": 0.15301038919577112, + "learning_rate": 3.5307720823104463e-06, + "loss": 2.6639, + "step": 43093 + }, + { + "epoch": 2.675150536966913, + "grad_norm": 0.15366936088642594, + "learning_rate": 3.5294391433210783e-06, + "loss": 2.7166, + "step": 43094 + }, + { + "epoch": 2.6752126140666705, + "grad_norm": 0.144334594864853, + "learning_rate": 3.5281064467799217e-06, + "loss": 2.7293, + "step": 43095 + }, + { + "epoch": 2.675274691166429, + "grad_norm": 0.13811907930328415, + "learning_rate": 3.52677399269391e-06, + "loss": 2.7012, + "step": 43096 + }, + { + "epoch": 2.6753367682661864, + "grad_norm": 0.13487784923676324, + "learning_rate": 3.5254417810700047e-06, + "loss": 2.7124, + "step": 43097 + }, + { + "epoch": 2.6753988453659447, + "grad_norm": 0.1435521054197857, + "learning_rate": 3.5241098119151496e-06, + "loss": 2.6382, + "step": 43098 + }, + { + "epoch": 2.675460922465702, + "grad_norm": 0.13703102144420184, + "learning_rate": 3.522778085236289e-06, + "loss": 2.6979, + "step": 43099 + }, + { + "epoch": 2.67552299956546, + "grad_norm": 0.1509249348474905, + "learning_rate": 3.5214466010403847e-06, + "loss": 2.7504, + "step": 43100 + }, + { + "epoch": 2.675585076665218, + "grad_norm": 0.13235714632935808, + "learning_rate": 3.5201153593343805e-06, + "loss": 2.6351, + "step": 43101 + }, + { + "epoch": 2.675647153764976, + "grad_norm": 0.13603712191397302, + "learning_rate": 3.5187843601252157e-06, + "loss": 2.6612, + "step": 43102 + }, + { + "epoch": 2.675709230864734, + "grad_norm": 0.15097015979213613, + "learning_rate": 3.5174536034198236e-06, + "loss": 2.6359, + "step": 43103 + }, + { + "epoch": 2.675771307964492, + "grad_norm": 0.15092685394375868, + "learning_rate": 3.516123089225165e-06, + "loss": 2.585, + "step": 43104 + }, + { + "epoch": 2.6758333850642497, + "grad_norm": 0.1473471347666827, + "learning_rate": 3.5147928175481848e-06, + "loss": 2.7202, + "step": 43105 + }, + { + "epoch": 2.6758954621640076, + "grad_norm": 0.1427052858356762, + "learning_rate": 3.513462788395816e-06, + "loss": 2.636, + "step": 43106 + }, + { + "epoch": 2.6759575392637656, + "grad_norm": 0.14763710412526365, + "learning_rate": 3.512133001774992e-06, + "loss": 2.7208, + "step": 43107 + }, + { + "epoch": 2.6760196163635235, + "grad_norm": 0.13402139024496731, + "learning_rate": 3.510803457692652e-06, + "loss": 2.6547, + "step": 43108 + }, + { + "epoch": 2.6760816934632814, + "grad_norm": 0.13704131086208, + "learning_rate": 3.5094741561557455e-06, + "loss": 2.7073, + "step": 43109 + }, + { + "epoch": 2.6761437705630393, + "grad_norm": 0.13625717584935765, + "learning_rate": 3.508145097171195e-06, + "loss": 2.6285, + "step": 43110 + }, + { + "epoch": 2.6762058476627972, + "grad_norm": 0.1321573577833528, + "learning_rate": 3.506816280745934e-06, + "loss": 2.6217, + "step": 43111 + }, + { + "epoch": 2.676267924762555, + "grad_norm": 0.13201009056011698, + "learning_rate": 3.5054877068869064e-06, + "loss": 2.6538, + "step": 43112 + }, + { + "epoch": 2.676330001862313, + "grad_norm": 0.14069470029281825, + "learning_rate": 3.5041593756010236e-06, + "loss": 2.6675, + "step": 43113 + }, + { + "epoch": 2.676392078962071, + "grad_norm": 0.14379430741099986, + "learning_rate": 3.5028312868952363e-06, + "loss": 2.7747, + "step": 43114 + }, + { + "epoch": 2.676454156061829, + "grad_norm": 0.151812366366573, + "learning_rate": 3.5015034407764713e-06, + "loss": 2.7965, + "step": 43115 + }, + { + "epoch": 2.676516233161587, + "grad_norm": 0.13399731654177588, + "learning_rate": 3.5001758372516457e-06, + "loss": 2.682, + "step": 43116 + }, + { + "epoch": 2.6765783102613447, + "grad_norm": 0.13539034712994605, + "learning_rate": 3.498848476327682e-06, + "loss": 2.6617, + "step": 43117 + }, + { + "epoch": 2.676640387361102, + "grad_norm": 0.13689248762593229, + "learning_rate": 3.497521358011524e-06, + "loss": 2.633, + "step": 43118 + }, + { + "epoch": 2.6767024644608606, + "grad_norm": 0.15527588464499983, + "learning_rate": 3.4961944823100888e-06, + "loss": 2.6501, + "step": 43119 + }, + { + "epoch": 2.676764541560618, + "grad_norm": 0.1324131304461483, + "learning_rate": 3.4948678492302934e-06, + "loss": 2.6852, + "step": 43120 + }, + { + "epoch": 2.6768266186603764, + "grad_norm": 0.13670404900122915, + "learning_rate": 3.493541458779065e-06, + "loss": 2.7355, + "step": 43121 + }, + { + "epoch": 2.676888695760134, + "grad_norm": 0.1363459067238569, + "learning_rate": 3.4922153109633094e-06, + "loss": 2.8113, + "step": 43122 + }, + { + "epoch": 2.6769507728598922, + "grad_norm": 0.14872606362364718, + "learning_rate": 3.4908894057899653e-06, + "loss": 2.6836, + "step": 43123 + }, + { + "epoch": 2.6770128499596497, + "grad_norm": 0.1358667597921616, + "learning_rate": 3.489563743265939e-06, + "loss": 2.7678, + "step": 43124 + }, + { + "epoch": 2.6770749270594076, + "grad_norm": 0.14341439638671977, + "learning_rate": 3.4882383233981518e-06, + "loss": 2.7322, + "step": 43125 + }, + { + "epoch": 2.6771370041591656, + "grad_norm": 0.13757852241246427, + "learning_rate": 3.4869131461935102e-06, + "loss": 2.6762, + "step": 43126 + }, + { + "epoch": 2.6771990812589235, + "grad_norm": 0.13562372925598135, + "learning_rate": 3.485588211658941e-06, + "loss": 2.6306, + "step": 43127 + }, + { + "epoch": 2.6772611583586814, + "grad_norm": 0.13402707445484993, + "learning_rate": 3.4842635198013505e-06, + "loss": 2.7199, + "step": 43128 + }, + { + "epoch": 2.6773232354584393, + "grad_norm": 0.1311236176095174, + "learning_rate": 3.48293907062765e-06, + "loss": 2.7052, + "step": 43129 + }, + { + "epoch": 2.6773853125581972, + "grad_norm": 0.1307414149502877, + "learning_rate": 3.481614864144739e-06, + "loss": 2.6644, + "step": 43130 + }, + { + "epoch": 2.677447389657955, + "grad_norm": 0.13604276982216026, + "learning_rate": 3.4802909003595396e-06, + "loss": 2.8735, + "step": 43131 + }, + { + "epoch": 2.677509466757713, + "grad_norm": 0.1301425831409607, + "learning_rate": 3.4789671792789636e-06, + "loss": 2.762, + "step": 43132 + }, + { + "epoch": 2.677571543857471, + "grad_norm": 0.15085329375413759, + "learning_rate": 3.477643700909905e-06, + "loss": 2.6765, + "step": 43133 + }, + { + "epoch": 2.677633620957229, + "grad_norm": 0.14478598518911326, + "learning_rate": 3.476320465259275e-06, + "loss": 2.7164, + "step": 43134 + }, + { + "epoch": 2.677695698056987, + "grad_norm": 0.15495048982957338, + "learning_rate": 3.474997472333963e-06, + "loss": 2.773, + "step": 43135 + }, + { + "epoch": 2.6777577751567447, + "grad_norm": 0.13449186355503057, + "learning_rate": 3.4736747221408907e-06, + "loss": 2.7016, + "step": 43136 + }, + { + "epoch": 2.6778198522565027, + "grad_norm": 0.1308439089047761, + "learning_rate": 3.4723522146869527e-06, + "loss": 2.6499, + "step": 43137 + }, + { + "epoch": 2.6778819293562606, + "grad_norm": 0.1379672604779618, + "learning_rate": 3.4710299499790545e-06, + "loss": 2.7148, + "step": 43138 + }, + { + "epoch": 2.6779440064560185, + "grad_norm": 0.1367315354499744, + "learning_rate": 3.4697079280240853e-06, + "loss": 2.7191, + "step": 43139 + }, + { + "epoch": 2.6780060835557764, + "grad_norm": 0.13519652902033227, + "learning_rate": 3.468386148828934e-06, + "loss": 2.723, + "step": 43140 + }, + { + "epoch": 2.6780681606555343, + "grad_norm": 0.13160379245416604, + "learning_rate": 3.4670646124005223e-06, + "loss": 2.6834, + "step": 43141 + }, + { + "epoch": 2.6781302377552922, + "grad_norm": 0.1333560968564884, + "learning_rate": 3.4657433187457235e-06, + "loss": 2.6653, + "step": 43142 + }, + { + "epoch": 2.6781923148550497, + "grad_norm": 0.1429159525537398, + "learning_rate": 3.464422267871442e-06, + "loss": 2.7306, + "step": 43143 + }, + { + "epoch": 2.678254391954808, + "grad_norm": 0.13395057452065465, + "learning_rate": 3.4631014597845624e-06, + "loss": 2.7024, + "step": 43144 + }, + { + "epoch": 2.6783164690545656, + "grad_norm": 0.14474661993122317, + "learning_rate": 3.461780894491984e-06, + "loss": 2.5943, + "step": 43145 + }, + { + "epoch": 2.678378546154324, + "grad_norm": 0.13143743403546923, + "learning_rate": 3.4604605720005902e-06, + "loss": 2.6383, + "step": 43146 + }, + { + "epoch": 2.6784406232540814, + "grad_norm": 0.16652563518243535, + "learning_rate": 3.4591404923172756e-06, + "loss": 2.6931, + "step": 43147 + }, + { + "epoch": 2.6785027003538393, + "grad_norm": 0.1398317560987709, + "learning_rate": 3.4578206554489235e-06, + "loss": 2.8097, + "step": 43148 + }, + { + "epoch": 2.6785647774535972, + "grad_norm": 0.14422153460912998, + "learning_rate": 3.456501061402406e-06, + "loss": 2.7137, + "step": 43149 + }, + { + "epoch": 2.678626854553355, + "grad_norm": 0.13248762197752526, + "learning_rate": 3.4551817101846353e-06, + "loss": 2.6826, + "step": 43150 + }, + { + "epoch": 2.678688931653113, + "grad_norm": 0.14468476802685457, + "learning_rate": 3.4538626018024767e-06, + "loss": 2.7639, + "step": 43151 + }, + { + "epoch": 2.678751008752871, + "grad_norm": 0.152229535252708, + "learning_rate": 3.4525437362628197e-06, + "loss": 2.712, + "step": 43152 + }, + { + "epoch": 2.678813085852629, + "grad_norm": 0.13708933605029971, + "learning_rate": 3.4512251135725316e-06, + "loss": 2.7349, + "step": 43153 + }, + { + "epoch": 2.678875162952387, + "grad_norm": 0.14900636837698059, + "learning_rate": 3.4499067337385115e-06, + "loss": 2.714, + "step": 43154 + }, + { + "epoch": 2.6789372400521447, + "grad_norm": 0.14580912403273213, + "learning_rate": 3.4485885967676266e-06, + "loss": 2.7103, + "step": 43155 + }, + { + "epoch": 2.6789993171519026, + "grad_norm": 0.13612369742177494, + "learning_rate": 3.4472707026667604e-06, + "loss": 2.7508, + "step": 43156 + }, + { + "epoch": 2.6790613942516606, + "grad_norm": 0.15641491650627878, + "learning_rate": 3.4459530514427795e-06, + "loss": 2.7238, + "step": 43157 + }, + { + "epoch": 2.6791234713514185, + "grad_norm": 0.13437581006626542, + "learning_rate": 3.444635643102556e-06, + "loss": 2.6114, + "step": 43158 + }, + { + "epoch": 2.6791855484511764, + "grad_norm": 0.14774568488222778, + "learning_rate": 3.4433184776529736e-06, + "loss": 2.6388, + "step": 43159 + }, + { + "epoch": 2.6792476255509343, + "grad_norm": 0.1403553947904418, + "learning_rate": 3.44200155510091e-06, + "loss": 2.7448, + "step": 43160 + }, + { + "epoch": 2.6793097026506922, + "grad_norm": 0.136678682995703, + "learning_rate": 3.4406848754532205e-06, + "loss": 2.6521, + "step": 43161 + }, + { + "epoch": 2.67937177975045, + "grad_norm": 0.13767351830254732, + "learning_rate": 3.4393684387167725e-06, + "loss": 2.6834, + "step": 43162 + }, + { + "epoch": 2.679433856850208, + "grad_norm": 0.14881179699978045, + "learning_rate": 3.4380522448984544e-06, + "loss": 2.7481, + "step": 43163 + }, + { + "epoch": 2.679495933949966, + "grad_norm": 0.1343930110393596, + "learning_rate": 3.4367362940051163e-06, + "loss": 2.6915, + "step": 43164 + }, + { + "epoch": 2.679558011049724, + "grad_norm": 0.13616488529065948, + "learning_rate": 3.4354205860436308e-06, + "loss": 2.7565, + "step": 43165 + }, + { + "epoch": 2.6796200881494814, + "grad_norm": 0.13720335904566164, + "learning_rate": 3.4341051210208532e-06, + "loss": 2.7331, + "step": 43166 + }, + { + "epoch": 2.6796821652492397, + "grad_norm": 0.14502409083330148, + "learning_rate": 3.4327898989436557e-06, + "loss": 2.7333, + "step": 43167 + }, + { + "epoch": 2.679744242348997, + "grad_norm": 0.1371299758063125, + "learning_rate": 3.4314749198189054e-06, + "loss": 2.7023, + "step": 43168 + }, + { + "epoch": 2.6798063194487556, + "grad_norm": 0.14879778380007794, + "learning_rate": 3.4301601836534413e-06, + "loss": 2.6852, + "step": 43169 + }, + { + "epoch": 2.679868396548513, + "grad_norm": 0.14412596299775277, + "learning_rate": 3.428845690454152e-06, + "loss": 2.647, + "step": 43170 + }, + { + "epoch": 2.6799304736482714, + "grad_norm": 0.14476220905578419, + "learning_rate": 3.4275314402278713e-06, + "loss": 2.6622, + "step": 43171 + }, + { + "epoch": 2.679992550748029, + "grad_norm": 0.14038442855941613, + "learning_rate": 3.4262174329814656e-06, + "loss": 2.7652, + "step": 43172 + }, + { + "epoch": 2.680054627847787, + "grad_norm": 0.16156307343025195, + "learning_rate": 3.4249036687217904e-06, + "loss": 2.7672, + "step": 43173 + }, + { + "epoch": 2.6801167049475447, + "grad_norm": 0.14444092557196636, + "learning_rate": 3.423590147455702e-06, + "loss": 2.6626, + "step": 43174 + }, + { + "epoch": 2.6801787820473026, + "grad_norm": 0.13700842822821252, + "learning_rate": 3.4222768691900552e-06, + "loss": 2.7573, + "step": 43175 + }, + { + "epoch": 2.6802408591470606, + "grad_norm": 0.1386928273880132, + "learning_rate": 3.420963833931684e-06, + "loss": 2.7235, + "step": 43176 + }, + { + "epoch": 2.6803029362468185, + "grad_norm": 0.13118417485672418, + "learning_rate": 3.4196510416874606e-06, + "loss": 2.6953, + "step": 43177 + }, + { + "epoch": 2.6803650133465764, + "grad_norm": 0.13137108122391786, + "learning_rate": 3.418338492464224e-06, + "loss": 2.6687, + "step": 43178 + }, + { + "epoch": 2.6804270904463343, + "grad_norm": 0.1335363088895322, + "learning_rate": 3.417026186268829e-06, + "loss": 2.7199, + "step": 43179 + }, + { + "epoch": 2.6804891675460922, + "grad_norm": 0.14574147351288672, + "learning_rate": 3.4157141231081045e-06, + "loss": 2.6926, + "step": 43180 + }, + { + "epoch": 2.68055124464585, + "grad_norm": 0.14637401238099326, + "learning_rate": 3.4144023029889173e-06, + "loss": 2.7094, + "step": 43181 + }, + { + "epoch": 2.680613321745608, + "grad_norm": 0.13074717052572837, + "learning_rate": 3.4130907259181e-06, + "loss": 2.7297, + "step": 43182 + }, + { + "epoch": 2.680675398845366, + "grad_norm": 0.14028887181259259, + "learning_rate": 3.411779391902503e-06, + "loss": 2.7389, + "step": 43183 + }, + { + "epoch": 2.680737475945124, + "grad_norm": 0.12877623185496337, + "learning_rate": 3.4104683009489547e-06, + "loss": 2.6705, + "step": 43184 + }, + { + "epoch": 2.680799553044882, + "grad_norm": 0.13311642344925093, + "learning_rate": 3.4091574530643042e-06, + "loss": 2.7891, + "step": 43185 + }, + { + "epoch": 2.6808616301446397, + "grad_norm": 0.1335724448342691, + "learning_rate": 3.4078468482553918e-06, + "loss": 2.7216, + "step": 43186 + }, + { + "epoch": 2.6809237072443977, + "grad_norm": 0.16046537247161172, + "learning_rate": 3.406536486529055e-06, + "loss": 2.6557, + "step": 43187 + }, + { + "epoch": 2.6809857843441556, + "grad_norm": 0.1379403848532208, + "learning_rate": 3.405226367892128e-06, + "loss": 2.7218, + "step": 43188 + }, + { + "epoch": 2.6810478614439135, + "grad_norm": 0.14241560073106352, + "learning_rate": 3.403916492351439e-06, + "loss": 2.648, + "step": 43189 + }, + { + "epoch": 2.6811099385436714, + "grad_norm": 0.13017792982544724, + "learning_rate": 3.4026068599138315e-06, + "loss": 2.7521, + "step": 43190 + }, + { + "epoch": 2.681172015643429, + "grad_norm": 0.144159115016326, + "learning_rate": 3.4012974705861454e-06, + "loss": 2.754, + "step": 43191 + }, + { + "epoch": 2.6812340927431872, + "grad_norm": 0.16030197375765173, + "learning_rate": 3.399988324375197e-06, + "loss": 2.6254, + "step": 43192 + }, + { + "epoch": 2.6812961698429447, + "grad_norm": 0.1568485062575936, + "learning_rate": 3.398679421287815e-06, + "loss": 2.7582, + "step": 43193 + }, + { + "epoch": 2.681358246942703, + "grad_norm": 0.13530651680578512, + "learning_rate": 3.3973707613308426e-06, + "loss": 2.609, + "step": 43194 + }, + { + "epoch": 2.6814203240424606, + "grad_norm": 0.13200752959396336, + "learning_rate": 3.396062344511103e-06, + "loss": 2.7575, + "step": 43195 + }, + { + "epoch": 2.6814824011422185, + "grad_norm": 0.13853433665505463, + "learning_rate": 3.394754170835418e-06, + "loss": 2.6903, + "step": 43196 + }, + { + "epoch": 2.6815444782419764, + "grad_norm": 0.13221134847680693, + "learning_rate": 3.3934462403106104e-06, + "loss": 2.7565, + "step": 43197 + }, + { + "epoch": 2.6816065553417343, + "grad_norm": 0.17917717114601212, + "learning_rate": 3.3921385529435026e-06, + "loss": 2.7646, + "step": 43198 + }, + { + "epoch": 2.6816686324414922, + "grad_norm": 0.1361396172134959, + "learning_rate": 3.3908311087409327e-06, + "loss": 2.6984, + "step": 43199 + }, + { + "epoch": 2.68173070954125, + "grad_norm": 0.1379833657913874, + "learning_rate": 3.389523907709713e-06, + "loss": 2.7915, + "step": 43200 + }, + { + "epoch": 2.681792786641008, + "grad_norm": 0.14417604021674785, + "learning_rate": 3.3882169498566595e-06, + "loss": 2.8644, + "step": 43201 + }, + { + "epoch": 2.681854863740766, + "grad_norm": 0.13125260185887105, + "learning_rate": 3.386910235188584e-06, + "loss": 2.7114, + "step": 43202 + }, + { + "epoch": 2.681916940840524, + "grad_norm": 0.1419883312976775, + "learning_rate": 3.385603763712314e-06, + "loss": 2.6904, + "step": 43203 + }, + { + "epoch": 2.681979017940282, + "grad_norm": 0.13219229255492548, + "learning_rate": 3.384297535434672e-06, + "loss": 2.7349, + "step": 43204 + }, + { + "epoch": 2.6820410950400397, + "grad_norm": 0.13350745308355705, + "learning_rate": 3.382991550362469e-06, + "loss": 2.7771, + "step": 43205 + }, + { + "epoch": 2.6821031721397977, + "grad_norm": 0.1324813233901516, + "learning_rate": 3.381685808502516e-06, + "loss": 2.6875, + "step": 43206 + }, + { + "epoch": 2.6821652492395556, + "grad_norm": 0.16231759613337607, + "learning_rate": 3.380380309861625e-06, + "loss": 2.6658, + "step": 43207 + }, + { + "epoch": 2.6822273263393135, + "grad_norm": 0.13188928301919017, + "learning_rate": 3.379075054446601e-06, + "loss": 2.7389, + "step": 43208 + }, + { + "epoch": 2.6822894034390714, + "grad_norm": 0.13515213811275445, + "learning_rate": 3.377770042264261e-06, + "loss": 2.7783, + "step": 43209 + }, + { + "epoch": 2.6823514805388293, + "grad_norm": 0.13043976878746102, + "learning_rate": 3.3764652733214163e-06, + "loss": 2.6386, + "step": 43210 + }, + { + "epoch": 2.6824135576385872, + "grad_norm": 0.14307956830303126, + "learning_rate": 3.3751607476248726e-06, + "loss": 2.7518, + "step": 43211 + }, + { + "epoch": 2.682475634738345, + "grad_norm": 0.1486608869728761, + "learning_rate": 3.3738564651814242e-06, + "loss": 2.7445, + "step": 43212 + }, + { + "epoch": 2.682537711838103, + "grad_norm": 0.1301474492462428, + "learning_rate": 3.3725524259978936e-06, + "loss": 2.7362, + "step": 43213 + }, + { + "epoch": 2.6825997889378606, + "grad_norm": 0.14442669463171265, + "learning_rate": 3.371248630081081e-06, + "loss": 2.6353, + "step": 43214 + }, + { + "epoch": 2.682661866037619, + "grad_norm": 0.13403151464180446, + "learning_rate": 3.369945077437775e-06, + "loss": 2.6561, + "step": 43215 + }, + { + "epoch": 2.6827239431373764, + "grad_norm": 0.1328590676067358, + "learning_rate": 3.3686417680747817e-06, + "loss": 2.6168, + "step": 43216 + }, + { + "epoch": 2.6827860202371347, + "grad_norm": 0.1384780698083536, + "learning_rate": 3.3673387019989123e-06, + "loss": 2.6441, + "step": 43217 + }, + { + "epoch": 2.6828480973368922, + "grad_norm": 0.13330193403814505, + "learning_rate": 3.366035879216961e-06, + "loss": 2.7097, + "step": 43218 + }, + { + "epoch": 2.6829101744366506, + "grad_norm": 0.13434089543898525, + "learning_rate": 3.364733299735717e-06, + "loss": 2.5773, + "step": 43219 + }, + { + "epoch": 2.682972251536408, + "grad_norm": 0.13507795077919962, + "learning_rate": 3.363430963561981e-06, + "loss": 2.6972, + "step": 43220 + }, + { + "epoch": 2.683034328636166, + "grad_norm": 0.13054628780696356, + "learning_rate": 3.3621288707025413e-06, + "loss": 2.649, + "step": 43221 + }, + { + "epoch": 2.683096405735924, + "grad_norm": 0.1515814464564201, + "learning_rate": 3.3608270211642034e-06, + "loss": 2.7473, + "step": 43222 + }, + { + "epoch": 2.683158482835682, + "grad_norm": 0.12998232220389405, + "learning_rate": 3.3595254149537513e-06, + "loss": 2.6497, + "step": 43223 + }, + { + "epoch": 2.6832205599354397, + "grad_norm": 0.15929408146278212, + "learning_rate": 3.3582240520779794e-06, + "loss": 2.6789, + "step": 43224 + }, + { + "epoch": 2.6832826370351976, + "grad_norm": 0.12954843832721477, + "learning_rate": 3.3569229325436657e-06, + "loss": 2.6127, + "step": 43225 + }, + { + "epoch": 2.6833447141349556, + "grad_norm": 0.14557122762401928, + "learning_rate": 3.3556220563576157e-06, + "loss": 2.5396, + "step": 43226 + }, + { + "epoch": 2.6834067912347135, + "grad_norm": 0.1348410201984329, + "learning_rate": 3.3543214235266128e-06, + "loss": 2.7837, + "step": 43227 + }, + { + "epoch": 2.6834688683344714, + "grad_norm": 0.146088095927857, + "learning_rate": 3.3530210340574354e-06, + "loss": 2.7271, + "step": 43228 + }, + { + "epoch": 2.6835309454342293, + "grad_norm": 0.14885936569369196, + "learning_rate": 3.3517208879568606e-06, + "loss": 2.7543, + "step": 43229 + }, + { + "epoch": 2.6835930225339872, + "grad_norm": 0.14844044883208263, + "learning_rate": 3.3504209852316948e-06, + "loss": 2.7572, + "step": 43230 + }, + { + "epoch": 2.683655099633745, + "grad_norm": 0.14113571886184229, + "learning_rate": 3.3491213258887044e-06, + "loss": 2.6895, + "step": 43231 + }, + { + "epoch": 2.683717176733503, + "grad_norm": 0.14214098557293042, + "learning_rate": 3.3478219099346785e-06, + "loss": 2.7443, + "step": 43232 + }, + { + "epoch": 2.683779253833261, + "grad_norm": 0.13234935676561072, + "learning_rate": 3.346522737376384e-06, + "loss": 2.7077, + "step": 43233 + }, + { + "epoch": 2.683841330933019, + "grad_norm": 0.14114288494742147, + "learning_rate": 3.3452238082206044e-06, + "loss": 2.6531, + "step": 43234 + }, + { + "epoch": 2.683903408032777, + "grad_norm": 0.14440890237196335, + "learning_rate": 3.343925122474112e-06, + "loss": 2.71, + "step": 43235 + }, + { + "epoch": 2.6839654851325347, + "grad_norm": 0.15021080562216535, + "learning_rate": 3.342626680143701e-06, + "loss": 2.6743, + "step": 43236 + }, + { + "epoch": 2.6840275622322927, + "grad_norm": 0.14850591036806418, + "learning_rate": 3.3413284812361278e-06, + "loss": 2.6914, + "step": 43237 + }, + { + "epoch": 2.6840896393320506, + "grad_norm": 0.15141277857189303, + "learning_rate": 3.3400305257581753e-06, + "loss": 2.6986, + "step": 43238 + }, + { + "epoch": 2.684151716431808, + "grad_norm": 0.13752431600314247, + "learning_rate": 3.3387328137165994e-06, + "loss": 2.7869, + "step": 43239 + }, + { + "epoch": 2.6842137935315664, + "grad_norm": 0.1520852316341636, + "learning_rate": 3.337435345118195e-06, + "loss": 2.7239, + "step": 43240 + }, + { + "epoch": 2.684275870631324, + "grad_norm": 0.13470409204817116, + "learning_rate": 3.3361381199697117e-06, + "loss": 2.6892, + "step": 43241 + }, + { + "epoch": 2.6843379477310823, + "grad_norm": 0.13046321400301453, + "learning_rate": 3.3348411382779278e-06, + "loss": 2.6733, + "step": 43242 + }, + { + "epoch": 2.6844000248308397, + "grad_norm": 0.15184528157203092, + "learning_rate": 3.333544400049604e-06, + "loss": 2.7635, + "step": 43243 + }, + { + "epoch": 2.6844621019305976, + "grad_norm": 0.15738431941655692, + "learning_rate": 3.3322479052915022e-06, + "loss": 2.7468, + "step": 43244 + }, + { + "epoch": 2.6845241790303556, + "grad_norm": 0.1372854137865973, + "learning_rate": 3.3309516540104004e-06, + "loss": 2.6775, + "step": 43245 + }, + { + "epoch": 2.6845862561301135, + "grad_norm": 0.14059639348361846, + "learning_rate": 3.3296556462130477e-06, + "loss": 2.6582, + "step": 43246 + }, + { + "epoch": 2.6846483332298714, + "grad_norm": 0.14623178957595728, + "learning_rate": 3.328359881906218e-06, + "loss": 2.7477, + "step": 43247 + }, + { + "epoch": 2.6847104103296293, + "grad_norm": 0.13376869965566462, + "learning_rate": 3.3270643610966546e-06, + "loss": 2.7033, + "step": 43248 + }, + { + "epoch": 2.6847724874293872, + "grad_norm": 0.1572971955384497, + "learning_rate": 3.3257690837911302e-06, + "loss": 2.7513, + "step": 43249 + }, + { + "epoch": 2.684834564529145, + "grad_norm": 0.13716330459358392, + "learning_rate": 3.3244740499964067e-06, + "loss": 2.7196, + "step": 43250 + }, + { + "epoch": 2.684896641628903, + "grad_norm": 0.15497519683043867, + "learning_rate": 3.323179259719228e-06, + "loss": 2.6911, + "step": 43251 + }, + { + "epoch": 2.684958718728661, + "grad_norm": 0.13275675445190693, + "learning_rate": 3.3218847129663442e-06, + "loss": 2.6497, + "step": 43252 + }, + { + "epoch": 2.685020795828419, + "grad_norm": 0.15037244835533414, + "learning_rate": 3.3205904097445338e-06, + "loss": 2.7754, + "step": 43253 + }, + { + "epoch": 2.685082872928177, + "grad_norm": 0.1408896208210468, + "learning_rate": 3.31929635006053e-06, + "loss": 2.7503, + "step": 43254 + }, + { + "epoch": 2.6851449500279347, + "grad_norm": 0.14023190100355065, + "learning_rate": 3.3180025339210886e-06, + "loss": 2.6749, + "step": 43255 + }, + { + "epoch": 2.6852070271276927, + "grad_norm": 0.14617005003009578, + "learning_rate": 3.316708961332965e-06, + "loss": 2.7155, + "step": 43256 + }, + { + "epoch": 2.6852691042274506, + "grad_norm": 0.13684660519053718, + "learning_rate": 3.315415632302893e-06, + "loss": 2.6758, + "step": 43257 + }, + { + "epoch": 2.6853311813272085, + "grad_norm": 0.1382655719447026, + "learning_rate": 3.314122546837639e-06, + "loss": 2.8318, + "step": 43258 + }, + { + "epoch": 2.6853932584269664, + "grad_norm": 0.15780298232658638, + "learning_rate": 3.312829704943943e-06, + "loss": 2.6475, + "step": 43259 + }, + { + "epoch": 2.6854553355267243, + "grad_norm": 0.14473978669448664, + "learning_rate": 3.3115371066285426e-06, + "loss": 2.7251, + "step": 43260 + }, + { + "epoch": 2.6855174126264822, + "grad_norm": 0.14445914583465477, + "learning_rate": 3.310244751898184e-06, + "loss": 2.677, + "step": 43261 + }, + { + "epoch": 2.6855794897262397, + "grad_norm": 0.14345736026401587, + "learning_rate": 3.308952640759616e-06, + "loss": 2.7061, + "step": 43262 + }, + { + "epoch": 2.685641566825998, + "grad_norm": 0.1491515215374581, + "learning_rate": 3.307660773219584e-06, + "loss": 2.6941, + "step": 43263 + }, + { + "epoch": 2.6857036439257556, + "grad_norm": 0.13198828195538426, + "learning_rate": 3.3063691492848157e-06, + "loss": 2.6952, + "step": 43264 + }, + { + "epoch": 2.685765721025514, + "grad_norm": 0.13689719543825318, + "learning_rate": 3.3050777689620504e-06, + "loss": 2.6994, + "step": 43265 + }, + { + "epoch": 2.6858277981252714, + "grad_norm": 0.15215618250945845, + "learning_rate": 3.3037866322580323e-06, + "loss": 2.7357, + "step": 43266 + }, + { + "epoch": 2.6858898752250298, + "grad_norm": 0.14332200350612487, + "learning_rate": 3.3024957391794897e-06, + "loss": 2.7166, + "step": 43267 + }, + { + "epoch": 2.6859519523247872, + "grad_norm": 0.14011951168101194, + "learning_rate": 3.3012050897331726e-06, + "loss": 2.6843, + "step": 43268 + }, + { + "epoch": 2.686014029424545, + "grad_norm": 0.13961304035936056, + "learning_rate": 3.299914683925803e-06, + "loss": 2.607, + "step": 43269 + }, + { + "epoch": 2.686076106524303, + "grad_norm": 0.13540155910313786, + "learning_rate": 3.298624521764121e-06, + "loss": 2.7342, + "step": 43270 + }, + { + "epoch": 2.686138183624061, + "grad_norm": 0.13225721936405824, + "learning_rate": 3.297334603254837e-06, + "loss": 2.6208, + "step": 43271 + }, + { + "epoch": 2.686200260723819, + "grad_norm": 0.1692884796676873, + "learning_rate": 3.2960449284047066e-06, + "loss": 2.68, + "step": 43272 + }, + { + "epoch": 2.686262337823577, + "grad_norm": 0.14205304223082177, + "learning_rate": 3.2947554972204475e-06, + "loss": 2.7076, + "step": 43273 + }, + { + "epoch": 2.6863244149233347, + "grad_norm": 0.13432354219026477, + "learning_rate": 3.293466309708787e-06, + "loss": 2.681, + "step": 43274 + }, + { + "epoch": 2.6863864920230927, + "grad_norm": 0.12946655859854708, + "learning_rate": 3.2921773658764423e-06, + "loss": 2.652, + "step": 43275 + }, + { + "epoch": 2.6864485691228506, + "grad_norm": 0.13257757615002222, + "learning_rate": 3.2908886657301576e-06, + "loss": 2.742, + "step": 43276 + }, + { + "epoch": 2.6865106462226085, + "grad_norm": 0.13755712947908355, + "learning_rate": 3.2896002092766444e-06, + "loss": 2.693, + "step": 43277 + }, + { + "epoch": 2.6865727233223664, + "grad_norm": 0.13696221064162475, + "learning_rate": 3.288311996522625e-06, + "loss": 2.7247, + "step": 43278 + }, + { + "epoch": 2.6866348004221243, + "grad_norm": 0.1364359724188201, + "learning_rate": 3.2870240274748167e-06, + "loss": 2.7995, + "step": 43279 + }, + { + "epoch": 2.6866968775218822, + "grad_norm": 0.14878409477788063, + "learning_rate": 3.2857363021399467e-06, + "loss": 2.5872, + "step": 43280 + }, + { + "epoch": 2.68675895462164, + "grad_norm": 0.14367461223183234, + "learning_rate": 3.284448820524738e-06, + "loss": 2.7856, + "step": 43281 + }, + { + "epoch": 2.686821031721398, + "grad_norm": 0.13689810704794017, + "learning_rate": 3.2831615826358964e-06, + "loss": 2.6841, + "step": 43282 + }, + { + "epoch": 2.686883108821156, + "grad_norm": 0.14663673742239267, + "learning_rate": 3.281874588480138e-06, + "loss": 2.5996, + "step": 43283 + }, + { + "epoch": 2.686945185920914, + "grad_norm": 0.13855730616221917, + "learning_rate": 3.2805878380641807e-06, + "loss": 2.7153, + "step": 43284 + }, + { + "epoch": 2.687007263020672, + "grad_norm": 0.13300475062404463, + "learning_rate": 3.2793013313947406e-06, + "loss": 2.7381, + "step": 43285 + }, + { + "epoch": 2.6870693401204298, + "grad_norm": 0.13881997809605165, + "learning_rate": 3.2780150684785294e-06, + "loss": 2.7133, + "step": 43286 + }, + { + "epoch": 2.6871314172201872, + "grad_norm": 0.13396703062873971, + "learning_rate": 3.2767290493222526e-06, + "loss": 2.6981, + "step": 43287 + }, + { + "epoch": 2.6871934943199456, + "grad_norm": 0.14167583753783536, + "learning_rate": 3.275443273932621e-06, + "loss": 2.6425, + "step": 43288 + }, + { + "epoch": 2.687255571419703, + "grad_norm": 0.13481973860844487, + "learning_rate": 3.274157742316347e-06, + "loss": 2.6645, + "step": 43289 + }, + { + "epoch": 2.6873176485194614, + "grad_norm": 0.15032550095307054, + "learning_rate": 3.272872454480136e-06, + "loss": 2.6857, + "step": 43290 + }, + { + "epoch": 2.687379725619219, + "grad_norm": 0.1439819594670653, + "learning_rate": 3.271587410430693e-06, + "loss": 2.6351, + "step": 43291 + }, + { + "epoch": 2.687441802718977, + "grad_norm": 0.13907886607057632, + "learning_rate": 3.270302610174725e-06, + "loss": 2.6464, + "step": 43292 + }, + { + "epoch": 2.6875038798187347, + "grad_norm": 0.140526591412278, + "learning_rate": 3.2690180537189253e-06, + "loss": 2.7282, + "step": 43293 + }, + { + "epoch": 2.6875659569184926, + "grad_norm": 0.1353233379534326, + "learning_rate": 3.267733741070006e-06, + "loss": 2.7093, + "step": 43294 + }, + { + "epoch": 2.6876280340182506, + "grad_norm": 0.13420724599651032, + "learning_rate": 3.266449672234667e-06, + "loss": 2.7703, + "step": 43295 + }, + { + "epoch": 2.6876901111180085, + "grad_norm": 0.1369761904210641, + "learning_rate": 3.2651658472196034e-06, + "loss": 2.6255, + "step": 43296 + }, + { + "epoch": 2.6877521882177664, + "grad_norm": 0.13784464726083606, + "learning_rate": 3.2638822660315092e-06, + "loss": 2.702, + "step": 43297 + }, + { + "epoch": 2.6878142653175243, + "grad_norm": 0.13083889786793637, + "learning_rate": 3.262598928677091e-06, + "loss": 2.6916, + "step": 43298 + }, + { + "epoch": 2.6878763424172822, + "grad_norm": 0.14747838049789225, + "learning_rate": 3.2613158351630424e-06, + "loss": 2.6982, + "step": 43299 + }, + { + "epoch": 2.68793841951704, + "grad_norm": 0.1584439798963687, + "learning_rate": 3.2600329854960477e-06, + "loss": 2.7393, + "step": 43300 + }, + { + "epoch": 2.688000496616798, + "grad_norm": 0.13925454803321474, + "learning_rate": 3.258750379682818e-06, + "loss": 2.6952, + "step": 43301 + }, + { + "epoch": 2.688062573716556, + "grad_norm": 0.13900997021606873, + "learning_rate": 3.2574680177300197e-06, + "loss": 2.7518, + "step": 43302 + }, + { + "epoch": 2.688124650816314, + "grad_norm": 0.13314004086654055, + "learning_rate": 3.2561858996443706e-06, + "loss": 2.6059, + "step": 43303 + }, + { + "epoch": 2.688186727916072, + "grad_norm": 0.15070699879604993, + "learning_rate": 3.254904025432548e-06, + "loss": 2.7101, + "step": 43304 + }, + { + "epoch": 2.6882488050158297, + "grad_norm": 0.14589309886882998, + "learning_rate": 3.2536223951012358e-06, + "loss": 2.7135, + "step": 43305 + }, + { + "epoch": 2.6883108821155877, + "grad_norm": 0.13079166795853736, + "learning_rate": 3.252341008657128e-06, + "loss": 2.7174, + "step": 43306 + }, + { + "epoch": 2.6883729592153456, + "grad_norm": 0.15433263967780753, + "learning_rate": 3.2510598661068926e-06, + "loss": 2.7301, + "step": 43307 + }, + { + "epoch": 2.6884350363151035, + "grad_norm": 0.1338109604115761, + "learning_rate": 3.249778967457234e-06, + "loss": 2.7104, + "step": 43308 + }, + { + "epoch": 2.6884971134148614, + "grad_norm": 0.14215251680890725, + "learning_rate": 3.2484983127148316e-06, + "loss": 2.7795, + "step": 43309 + }, + { + "epoch": 2.688559190514619, + "grad_norm": 0.13329380841585037, + "learning_rate": 3.2472179018863624e-06, + "loss": 2.6962, + "step": 43310 + }, + { + "epoch": 2.6886212676143773, + "grad_norm": 0.13488382877659674, + "learning_rate": 3.2459377349784992e-06, + "loss": 2.6535, + "step": 43311 + }, + { + "epoch": 2.6886833447141347, + "grad_norm": 0.14538672228273822, + "learning_rate": 3.2446578119979365e-06, + "loss": 2.7472, + "step": 43312 + }, + { + "epoch": 2.688745421813893, + "grad_norm": 0.13928643236154212, + "learning_rate": 3.2433781329513413e-06, + "loss": 2.6934, + "step": 43313 + }, + { + "epoch": 2.6888074989136506, + "grad_norm": 0.1374804667123154, + "learning_rate": 3.242098697845397e-06, + "loss": 2.7537, + "step": 43314 + }, + { + "epoch": 2.688869576013409, + "grad_norm": 0.1347154786618188, + "learning_rate": 3.240819506686765e-06, + "loss": 2.6822, + "step": 43315 + }, + { + "epoch": 2.6889316531131664, + "grad_norm": 0.13396513970433163, + "learning_rate": 3.239540559482135e-06, + "loss": 2.7989, + "step": 43316 + }, + { + "epoch": 2.6889937302129243, + "grad_norm": 0.13761363752270753, + "learning_rate": 3.2382618562381726e-06, + "loss": 2.6997, + "step": 43317 + }, + { + "epoch": 2.6890558073126822, + "grad_norm": 0.13492296543133514, + "learning_rate": 3.2369833969615514e-06, + "loss": 2.7229, + "step": 43318 + }, + { + "epoch": 2.68911788441244, + "grad_norm": 0.15457744681220073, + "learning_rate": 3.2357051816589436e-06, + "loss": 2.7181, + "step": 43319 + }, + { + "epoch": 2.689179961512198, + "grad_norm": 0.13755604620408907, + "learning_rate": 3.2344272103370045e-06, + "loss": 2.7016, + "step": 43320 + }, + { + "epoch": 2.689242038611956, + "grad_norm": 0.13219381431171143, + "learning_rate": 3.233149483002418e-06, + "loss": 2.6517, + "step": 43321 + }, + { + "epoch": 2.689304115711714, + "grad_norm": 0.13632063804114064, + "learning_rate": 3.231871999661845e-06, + "loss": 2.7394, + "step": 43322 + }, + { + "epoch": 2.689366192811472, + "grad_norm": 0.1358340590539026, + "learning_rate": 3.2305947603219476e-06, + "loss": 2.641, + "step": 43323 + }, + { + "epoch": 2.6894282699112297, + "grad_norm": 0.13174432467772293, + "learning_rate": 3.2293177649893813e-06, + "loss": 2.5857, + "step": 43324 + }, + { + "epoch": 2.6894903470109877, + "grad_norm": 0.15518941345256584, + "learning_rate": 3.2280410136708296e-06, + "loss": 2.6547, + "step": 43325 + }, + { + "epoch": 2.6895524241107456, + "grad_norm": 0.1343668067452531, + "learning_rate": 3.226764506372942e-06, + "loss": 2.7295, + "step": 43326 + }, + { + "epoch": 2.6896145012105035, + "grad_norm": 0.1320631118567068, + "learning_rate": 3.225488243102376e-06, + "loss": 2.6975, + "step": 43327 + }, + { + "epoch": 2.6896765783102614, + "grad_norm": 0.1694523803673472, + "learning_rate": 3.2242122238657913e-06, + "loss": 2.7312, + "step": 43328 + }, + { + "epoch": 2.6897386554100193, + "grad_norm": 0.13468166859785788, + "learning_rate": 3.2229364486698444e-06, + "loss": 2.6764, + "step": 43329 + }, + { + "epoch": 2.6898007325097772, + "grad_norm": 0.142784922355736, + "learning_rate": 3.2216609175211963e-06, + "loss": 2.5922, + "step": 43330 + }, + { + "epoch": 2.689862809609535, + "grad_norm": 0.15547547900910785, + "learning_rate": 3.2203856304265034e-06, + "loss": 2.7336, + "step": 43331 + }, + { + "epoch": 2.689924886709293, + "grad_norm": 0.16635619180693137, + "learning_rate": 3.2191105873924153e-06, + "loss": 2.6987, + "step": 43332 + }, + { + "epoch": 2.689986963809051, + "grad_norm": 0.1386970242005916, + "learning_rate": 3.2178357884255715e-06, + "loss": 2.6129, + "step": 43333 + }, + { + "epoch": 2.690049040908809, + "grad_norm": 0.13026311596154747, + "learning_rate": 3.216561233532639e-06, + "loss": 2.6565, + "step": 43334 + }, + { + "epoch": 2.6901111180085664, + "grad_norm": 0.13100064182306265, + "learning_rate": 3.2152869227202674e-06, + "loss": 2.6537, + "step": 43335 + }, + { + "epoch": 2.6901731951083248, + "grad_norm": 0.13773299438857842, + "learning_rate": 3.214012855995102e-06, + "loss": 2.7264, + "step": 43336 + }, + { + "epoch": 2.6902352722080822, + "grad_norm": 0.15516085111743308, + "learning_rate": 3.212739033363793e-06, + "loss": 2.7806, + "step": 43337 + }, + { + "epoch": 2.6902973493078406, + "grad_norm": 0.13327530558076486, + "learning_rate": 3.211465454832974e-06, + "loss": 2.6666, + "step": 43338 + }, + { + "epoch": 2.690359426407598, + "grad_norm": 0.13360325935606313, + "learning_rate": 3.2101921204093054e-06, + "loss": 2.708, + "step": 43339 + }, + { + "epoch": 2.690421503507356, + "grad_norm": 0.13582892356873896, + "learning_rate": 3.2089190300994223e-06, + "loss": 2.7118, + "step": 43340 + }, + { + "epoch": 2.690483580607114, + "grad_norm": 0.1515892368998201, + "learning_rate": 3.2076461839099623e-06, + "loss": 2.77, + "step": 43341 + }, + { + "epoch": 2.690545657706872, + "grad_norm": 0.13458161299696456, + "learning_rate": 3.2063735818475773e-06, + "loss": 2.6542, + "step": 43342 + }, + { + "epoch": 2.6906077348066297, + "grad_norm": 0.14806264677559763, + "learning_rate": 3.205101223918894e-06, + "loss": 2.762, + "step": 43343 + }, + { + "epoch": 2.6906698119063877, + "grad_norm": 0.13956550990248742, + "learning_rate": 3.203829110130563e-06, + "loss": 2.6488, + "step": 43344 + }, + { + "epoch": 2.6907318890061456, + "grad_norm": 0.14074184278985383, + "learning_rate": 3.202557240489218e-06, + "loss": 2.7412, + "step": 43345 + }, + { + "epoch": 2.6907939661059035, + "grad_norm": 0.13463622943945203, + "learning_rate": 3.201285615001487e-06, + "loss": 2.6555, + "step": 43346 + }, + { + "epoch": 2.6908560432056614, + "grad_norm": 0.13399604462943887, + "learning_rate": 3.2000142336740092e-06, + "loss": 2.6958, + "step": 43347 + }, + { + "epoch": 2.6909181203054193, + "grad_norm": 0.12975712481361334, + "learning_rate": 3.198743096513418e-06, + "loss": 2.6148, + "step": 43348 + }, + { + "epoch": 2.6909801974051772, + "grad_norm": 0.14689195514779105, + "learning_rate": 3.1974722035263526e-06, + "loss": 2.7184, + "step": 43349 + }, + { + "epoch": 2.691042274504935, + "grad_norm": 0.13143504614224105, + "learning_rate": 3.1962015547194303e-06, + "loss": 2.6443, + "step": 43350 + }, + { + "epoch": 2.691104351604693, + "grad_norm": 0.17518266379158695, + "learning_rate": 3.194931150099284e-06, + "loss": 2.7299, + "step": 43351 + }, + { + "epoch": 2.691166428704451, + "grad_norm": 0.15408961342825728, + "learning_rate": 3.193660989672548e-06, + "loss": 2.7372, + "step": 43352 + }, + { + "epoch": 2.691228505804209, + "grad_norm": 0.13293550909602372, + "learning_rate": 3.1923910734458496e-06, + "loss": 2.6542, + "step": 43353 + }, + { + "epoch": 2.691290582903967, + "grad_norm": 0.1342981773166857, + "learning_rate": 3.1911214014258063e-06, + "loss": 2.7508, + "step": 43354 + }, + { + "epoch": 2.6913526600037248, + "grad_norm": 0.13152111044610362, + "learning_rate": 3.189851973619046e-06, + "loss": 2.7579, + "step": 43355 + }, + { + "epoch": 2.6914147371034827, + "grad_norm": 0.1330037773181219, + "learning_rate": 3.188582790032185e-06, + "loss": 2.683, + "step": 43356 + }, + { + "epoch": 2.6914768142032406, + "grad_norm": 0.13798177289361302, + "learning_rate": 3.187313850671858e-06, + "loss": 2.7266, + "step": 43357 + }, + { + "epoch": 2.691538891302998, + "grad_norm": 0.1316417889303035, + "learning_rate": 3.1860451555446812e-06, + "loss": 2.743, + "step": 43358 + }, + { + "epoch": 2.6916009684027564, + "grad_norm": 0.13282703572557944, + "learning_rate": 3.184776704657272e-06, + "loss": 2.7358, + "step": 43359 + }, + { + "epoch": 2.691663045502514, + "grad_norm": 0.1308838977895099, + "learning_rate": 3.1835084980162357e-06, + "loss": 2.6005, + "step": 43360 + }, + { + "epoch": 2.6917251226022723, + "grad_norm": 0.13909605457228244, + "learning_rate": 3.1822405356282113e-06, + "loss": 2.6278, + "step": 43361 + }, + { + "epoch": 2.6917871997020297, + "grad_norm": 0.1361965902992079, + "learning_rate": 3.1809728174998054e-06, + "loss": 2.7807, + "step": 43362 + }, + { + "epoch": 2.691849276801788, + "grad_norm": 0.14920725878063423, + "learning_rate": 3.1797053436376234e-06, + "loss": 2.6108, + "step": 43363 + }, + { + "epoch": 2.6919113539015456, + "grad_norm": 0.14438019410818098, + "learning_rate": 3.178438114048282e-06, + "loss": 2.725, + "step": 43364 + }, + { + "epoch": 2.6919734310013035, + "grad_norm": 0.14638664800376278, + "learning_rate": 3.177171128738404e-06, + "loss": 2.7017, + "step": 43365 + }, + { + "epoch": 2.6920355081010614, + "grad_norm": 0.12892250478736547, + "learning_rate": 3.175904387714579e-06, + "loss": 2.64, + "step": 43366 + }, + { + "epoch": 2.6920975852008193, + "grad_norm": 0.13381453779718813, + "learning_rate": 3.1746378909834396e-06, + "loss": 2.7018, + "step": 43367 + }, + { + "epoch": 2.6921596623005772, + "grad_norm": 0.13476296436060678, + "learning_rate": 3.1733716385515754e-06, + "loss": 2.8191, + "step": 43368 + }, + { + "epoch": 2.692221739400335, + "grad_norm": 0.13193777883853477, + "learning_rate": 3.1721056304256035e-06, + "loss": 2.7198, + "step": 43369 + }, + { + "epoch": 2.692283816500093, + "grad_norm": 0.13948056995444688, + "learning_rate": 3.1708398666121187e-06, + "loss": 2.7747, + "step": 43370 + }, + { + "epoch": 2.692345893599851, + "grad_norm": 0.15963049982681088, + "learning_rate": 3.169574347117732e-06, + "loss": 2.8028, + "step": 43371 + }, + { + "epoch": 2.692407970699609, + "grad_norm": 0.14903362309355642, + "learning_rate": 3.1683090719490494e-06, + "loss": 2.726, + "step": 43372 + }, + { + "epoch": 2.692470047799367, + "grad_norm": 0.13934133681222197, + "learning_rate": 3.1670440411126655e-06, + "loss": 2.6421, + "step": 43373 + }, + { + "epoch": 2.6925321248991247, + "grad_norm": 0.12885920070563783, + "learning_rate": 3.1657792546151756e-06, + "loss": 2.8012, + "step": 43374 + }, + { + "epoch": 2.6925942019988827, + "grad_norm": 0.15497200575606862, + "learning_rate": 3.16451471246319e-06, + "loss": 2.6905, + "step": 43375 + }, + { + "epoch": 2.6926562790986406, + "grad_norm": 0.15371838906632168, + "learning_rate": 3.1632504146633045e-06, + "loss": 2.6043, + "step": 43376 + }, + { + "epoch": 2.6927183561983985, + "grad_norm": 0.13260814428044906, + "learning_rate": 3.161986361222108e-06, + "loss": 2.739, + "step": 43377 + }, + { + "epoch": 2.6927804332981564, + "grad_norm": 0.13406842017495754, + "learning_rate": 3.1607225521462e-06, + "loss": 2.7017, + "step": 43378 + }, + { + "epoch": 2.6928425103979143, + "grad_norm": 0.13144937285093747, + "learning_rate": 3.1594589874421656e-06, + "loss": 2.7326, + "step": 43379 + }, + { + "epoch": 2.6929045874976723, + "grad_norm": 0.1338967495533719, + "learning_rate": 3.1581956671166146e-06, + "loss": 2.657, + "step": 43380 + }, + { + "epoch": 2.69296666459743, + "grad_norm": 0.13755656753124504, + "learning_rate": 3.156932591176126e-06, + "loss": 2.7224, + "step": 43381 + }, + { + "epoch": 2.693028741697188, + "grad_norm": 0.14053050813697543, + "learning_rate": 3.1556697596272887e-06, + "loss": 2.7508, + "step": 43382 + }, + { + "epoch": 2.6930908187969456, + "grad_norm": 0.1348797135559137, + "learning_rate": 3.1544071724766864e-06, + "loss": 2.7109, + "step": 43383 + }, + { + "epoch": 2.693152895896704, + "grad_norm": 0.1328856883626223, + "learning_rate": 3.1531448297309253e-06, + "loss": 2.7021, + "step": 43384 + }, + { + "epoch": 2.6932149729964614, + "grad_norm": 0.14139329787733732, + "learning_rate": 3.1518827313965825e-06, + "loss": 2.6958, + "step": 43385 + }, + { + "epoch": 2.6932770500962198, + "grad_norm": 0.13021684300907807, + "learning_rate": 3.150620877480237e-06, + "loss": 2.6569, + "step": 43386 + }, + { + "epoch": 2.6933391271959772, + "grad_norm": 0.13014913802848854, + "learning_rate": 3.1493592679884663e-06, + "loss": 2.6345, + "step": 43387 + }, + { + "epoch": 2.693401204295735, + "grad_norm": 0.1385926503887228, + "learning_rate": 3.148097902927871e-06, + "loss": 2.6596, + "step": 43388 + }, + { + "epoch": 2.693463281395493, + "grad_norm": 0.1387861076832798, + "learning_rate": 3.1468367823050237e-06, + "loss": 2.7409, + "step": 43389 + }, + { + "epoch": 2.693525358495251, + "grad_norm": 0.1307162527023179, + "learning_rate": 3.1455759061265023e-06, + "loss": 2.7099, + "step": 43390 + }, + { + "epoch": 2.693587435595009, + "grad_norm": 0.14523473499493567, + "learning_rate": 3.1443152743988846e-06, + "loss": 2.6947, + "step": 43391 + }, + { + "epoch": 2.693649512694767, + "grad_norm": 0.13350932258744397, + "learning_rate": 3.143054887128738e-06, + "loss": 2.7229, + "step": 43392 + }, + { + "epoch": 2.6937115897945247, + "grad_norm": 0.132032050745415, + "learning_rate": 3.1417947443226625e-06, + "loss": 2.7145, + "step": 43393 + }, + { + "epoch": 2.6937736668942827, + "grad_norm": 0.13409123788497213, + "learning_rate": 3.1405348459872143e-06, + "loss": 2.7157, + "step": 43394 + }, + { + "epoch": 2.6938357439940406, + "grad_norm": 0.13986601918607303, + "learning_rate": 3.1392751921289764e-06, + "loss": 2.7483, + "step": 43395 + }, + { + "epoch": 2.6938978210937985, + "grad_norm": 0.13729082718589153, + "learning_rate": 3.1380157827545053e-06, + "loss": 2.6833, + "step": 43396 + }, + { + "epoch": 2.6939598981935564, + "grad_norm": 0.128044528229803, + "learning_rate": 3.13675661787039e-06, + "loss": 2.701, + "step": 43397 + }, + { + "epoch": 2.6940219752933143, + "grad_norm": 0.1322428674942376, + "learning_rate": 3.1354976974831805e-06, + "loss": 2.6719, + "step": 43398 + }, + { + "epoch": 2.6940840523930722, + "grad_norm": 0.13313427681232629, + "learning_rate": 3.1342390215994723e-06, + "loss": 2.6704, + "step": 43399 + }, + { + "epoch": 2.69414612949283, + "grad_norm": 0.13490481721083175, + "learning_rate": 3.1329805902258093e-06, + "loss": 2.735, + "step": 43400 + }, + { + "epoch": 2.694208206592588, + "grad_norm": 0.13739458345515196, + "learning_rate": 3.131722403368759e-06, + "loss": 2.7673, + "step": 43401 + }, + { + "epoch": 2.694270283692346, + "grad_norm": 0.13035124724123942, + "learning_rate": 3.1304644610348987e-06, + "loss": 2.6045, + "step": 43402 + }, + { + "epoch": 2.694332360792104, + "grad_norm": 0.150790654763743, + "learning_rate": 3.1292067632307797e-06, + "loss": 2.6174, + "step": 43403 + }, + { + "epoch": 2.694394437891862, + "grad_norm": 0.13667475810051363, + "learning_rate": 3.1279493099629687e-06, + "loss": 2.6689, + "step": 43404 + }, + { + "epoch": 2.6944565149916198, + "grad_norm": 0.14054474262318412, + "learning_rate": 3.126692101238027e-06, + "loss": 2.6252, + "step": 43405 + }, + { + "epoch": 2.6945185920913772, + "grad_norm": 0.1348022811904802, + "learning_rate": 3.1254351370625047e-06, + "loss": 2.6975, + "step": 43406 + }, + { + "epoch": 2.6945806691911356, + "grad_norm": 0.13891128907434067, + "learning_rate": 3.124178417442969e-06, + "loss": 2.6716, + "step": 43407 + }, + { + "epoch": 2.694642746290893, + "grad_norm": 0.14167700106311729, + "learning_rate": 3.1229219423859755e-06, + "loss": 2.6831, + "step": 43408 + }, + { + "epoch": 2.6947048233906514, + "grad_norm": 0.13584539208780982, + "learning_rate": 3.1216657118980806e-06, + "loss": 2.7185, + "step": 43409 + }, + { + "epoch": 2.694766900490409, + "grad_norm": 0.13332072657228286, + "learning_rate": 3.120409725985829e-06, + "loss": 2.7437, + "step": 43410 + }, + { + "epoch": 2.6948289775901673, + "grad_norm": 0.13028630703522018, + "learning_rate": 3.119153984655787e-06, + "loss": 2.6736, + "step": 43411 + }, + { + "epoch": 2.6948910546899247, + "grad_norm": 0.14714164144013617, + "learning_rate": 3.1178984879144944e-06, + "loss": 2.6704, + "step": 43412 + }, + { + "epoch": 2.6949531317896827, + "grad_norm": 0.15915245572115164, + "learning_rate": 3.1166432357685128e-06, + "loss": 2.7228, + "step": 43413 + }, + { + "epoch": 2.6950152088894406, + "grad_norm": 0.13197456418326944, + "learning_rate": 3.115388228224375e-06, + "loss": 2.6838, + "step": 43414 + }, + { + "epoch": 2.6950772859891985, + "grad_norm": 0.1503804252930874, + "learning_rate": 3.1141334652886435e-06, + "loss": 2.7863, + "step": 43415 + }, + { + "epoch": 2.6951393630889564, + "grad_norm": 0.1379163739637321, + "learning_rate": 3.112878946967862e-06, + "loss": 2.7234, + "step": 43416 + }, + { + "epoch": 2.6952014401887143, + "grad_norm": 0.13950338366252008, + "learning_rate": 3.111624673268576e-06, + "loss": 2.7423, + "step": 43417 + }, + { + "epoch": 2.6952635172884722, + "grad_norm": 0.1432365408357364, + "learning_rate": 3.1103706441973245e-06, + "loss": 2.6221, + "step": 43418 + }, + { + "epoch": 2.69532559438823, + "grad_norm": 0.12968846171378695, + "learning_rate": 3.1091168597606414e-06, + "loss": 2.7283, + "step": 43419 + }, + { + "epoch": 2.695387671487988, + "grad_norm": 0.13159627676788854, + "learning_rate": 3.1078633199650877e-06, + "loss": 2.7618, + "step": 43420 + }, + { + "epoch": 2.695449748587746, + "grad_norm": 0.1301366576965088, + "learning_rate": 3.106610024817197e-06, + "loss": 2.6592, + "step": 43421 + }, + { + "epoch": 2.695511825687504, + "grad_norm": 0.15146918398198972, + "learning_rate": 3.1053569743235034e-06, + "loss": 2.6468, + "step": 43422 + }, + { + "epoch": 2.695573902787262, + "grad_norm": 0.14379134102000293, + "learning_rate": 3.1041041684905404e-06, + "loss": 2.7561, + "step": 43423 + }, + { + "epoch": 2.6956359798870198, + "grad_norm": 0.14750027921213782, + "learning_rate": 3.102851607324858e-06, + "loss": 2.7568, + "step": 43424 + }, + { + "epoch": 2.6956980569867777, + "grad_norm": 0.13477550000485577, + "learning_rate": 3.1015992908329847e-06, + "loss": 2.7705, + "step": 43425 + }, + { + "epoch": 2.6957601340865356, + "grad_norm": 0.13284715733211097, + "learning_rate": 3.100347219021449e-06, + "loss": 2.7001, + "step": 43426 + }, + { + "epoch": 2.6958222111862935, + "grad_norm": 0.13449157963996794, + "learning_rate": 3.099095391896789e-06, + "loss": 2.7551, + "step": 43427 + }, + { + "epoch": 2.6958842882860514, + "grad_norm": 0.13297592654736445, + "learning_rate": 3.0978438094655283e-06, + "loss": 2.7323, + "step": 43428 + }, + { + "epoch": 2.6959463653858093, + "grad_norm": 0.1545652096387821, + "learning_rate": 3.0965924717342052e-06, + "loss": 2.8144, + "step": 43429 + }, + { + "epoch": 2.6960084424855673, + "grad_norm": 0.13326176222512987, + "learning_rate": 3.095341378709349e-06, + "loss": 2.6853, + "step": 43430 + }, + { + "epoch": 2.6960705195853247, + "grad_norm": 0.15486383093299644, + "learning_rate": 3.0940905303974756e-06, + "loss": 2.6495, + "step": 43431 + }, + { + "epoch": 2.696132596685083, + "grad_norm": 0.147435596638126, + "learning_rate": 3.0928399268051243e-06, + "loss": 2.6925, + "step": 43432 + }, + { + "epoch": 2.6961946737848406, + "grad_norm": 0.13869860888399246, + "learning_rate": 3.091589567938813e-06, + "loss": 2.8182, + "step": 43433 + }, + { + "epoch": 2.696256750884599, + "grad_norm": 0.13665908465224946, + "learning_rate": 3.090339453805069e-06, + "loss": 2.7666, + "step": 43434 + }, + { + "epoch": 2.6963188279843564, + "grad_norm": 0.13363044970848006, + "learning_rate": 3.0890895844104096e-06, + "loss": 2.7239, + "step": 43435 + }, + { + "epoch": 2.6963809050841143, + "grad_norm": 0.14334390047421822, + "learning_rate": 3.087839959761363e-06, + "loss": 2.727, + "step": 43436 + }, + { + "epoch": 2.6964429821838722, + "grad_norm": 0.14959784655064948, + "learning_rate": 3.086590579864429e-06, + "loss": 2.7627, + "step": 43437 + }, + { + "epoch": 2.69650505928363, + "grad_norm": 0.12898204482507838, + "learning_rate": 3.085341444726153e-06, + "loss": 2.7316, + "step": 43438 + }, + { + "epoch": 2.696567136383388, + "grad_norm": 0.136721822159174, + "learning_rate": 3.084092554353041e-06, + "loss": 2.6312, + "step": 43439 + }, + { + "epoch": 2.696629213483146, + "grad_norm": 0.1412182519793777, + "learning_rate": 3.0828439087516037e-06, + "loss": 2.6271, + "step": 43440 + }, + { + "epoch": 2.696691290582904, + "grad_norm": 0.14984479538848233, + "learning_rate": 3.081595507928359e-06, + "loss": 2.7439, + "step": 43441 + }, + { + "epoch": 2.696753367682662, + "grad_norm": 0.1327866080233961, + "learning_rate": 3.0803473518898173e-06, + "loss": 2.7176, + "step": 43442 + }, + { + "epoch": 2.6968154447824197, + "grad_norm": 0.13354052130162847, + "learning_rate": 3.0790994406424965e-06, + "loss": 2.6946, + "step": 43443 + }, + { + "epoch": 2.6968775218821777, + "grad_norm": 0.13336366700169458, + "learning_rate": 3.077851774192908e-06, + "loss": 2.6448, + "step": 43444 + }, + { + "epoch": 2.6969395989819356, + "grad_norm": 0.14209407786978956, + "learning_rate": 3.0766043525475573e-06, + "loss": 2.6771, + "step": 43445 + }, + { + "epoch": 2.6970016760816935, + "grad_norm": 0.16084808277551882, + "learning_rate": 3.0753571757129396e-06, + "loss": 2.6608, + "step": 43446 + }, + { + "epoch": 2.6970637531814514, + "grad_norm": 0.13145557210483835, + "learning_rate": 3.074110243695588e-06, + "loss": 2.7424, + "step": 43447 + }, + { + "epoch": 2.6971258302812093, + "grad_norm": 0.15580292757508682, + "learning_rate": 3.0728635565019926e-06, + "loss": 2.7307, + "step": 43448 + }, + { + "epoch": 2.6971879073809673, + "grad_norm": 0.13592577258857486, + "learning_rate": 3.071617114138664e-06, + "loss": 2.6538, + "step": 43449 + }, + { + "epoch": 2.697249984480725, + "grad_norm": 0.1426063535781299, + "learning_rate": 3.070370916612092e-06, + "loss": 2.7372, + "step": 43450 + }, + { + "epoch": 2.697312061580483, + "grad_norm": 0.13804905006193427, + "learning_rate": 3.0691249639287932e-06, + "loss": 2.673, + "step": 43451 + }, + { + "epoch": 2.697374138680241, + "grad_norm": 0.15162862534682114, + "learning_rate": 3.067879256095263e-06, + "loss": 2.741, + "step": 43452 + }, + { + "epoch": 2.697436215779999, + "grad_norm": 0.13872167874952882, + "learning_rate": 3.0666337931179956e-06, + "loss": 2.7771, + "step": 43453 + }, + { + "epoch": 2.6974982928797564, + "grad_norm": 0.12971452695662153, + "learning_rate": 3.065388575003497e-06, + "loss": 2.6998, + "step": 43454 + }, + { + "epoch": 2.6975603699795148, + "grad_norm": 0.13493656981846838, + "learning_rate": 3.064143601758257e-06, + "loss": 2.6606, + "step": 43455 + }, + { + "epoch": 2.6976224470792722, + "grad_norm": 0.13051744882767735, + "learning_rate": 3.0628988733887754e-06, + "loss": 2.618, + "step": 43456 + }, + { + "epoch": 2.6976845241790306, + "grad_norm": 0.13380239239299754, + "learning_rate": 3.061654389901547e-06, + "loss": 2.7604, + "step": 43457 + }, + { + "epoch": 2.697746601278788, + "grad_norm": 0.138346040762752, + "learning_rate": 3.0604101513030612e-06, + "loss": 2.6806, + "step": 43458 + }, + { + "epoch": 2.697808678378546, + "grad_norm": 0.13326626984150303, + "learning_rate": 3.059166157599802e-06, + "loss": 2.7716, + "step": 43459 + }, + { + "epoch": 2.697870755478304, + "grad_norm": 0.1421940018665933, + "learning_rate": 3.0579224087982806e-06, + "loss": 2.6771, + "step": 43460 + }, + { + "epoch": 2.697932832578062, + "grad_norm": 0.15302960966826853, + "learning_rate": 3.056678904904969e-06, + "loss": 2.6959, + "step": 43461 + }, + { + "epoch": 2.6979949096778197, + "grad_norm": 0.13733567285710238, + "learning_rate": 3.0554356459263576e-06, + "loss": 2.7145, + "step": 43462 + }, + { + "epoch": 2.6980569867775777, + "grad_norm": 0.13505670652360474, + "learning_rate": 3.0541926318689406e-06, + "loss": 2.7765, + "step": 43463 + }, + { + "epoch": 2.6981190638773356, + "grad_norm": 0.15154753370227425, + "learning_rate": 3.052949862739185e-06, + "loss": 2.6953, + "step": 43464 + }, + { + "epoch": 2.6981811409770935, + "grad_norm": 0.13064861405856887, + "learning_rate": 3.051707338543591e-06, + "loss": 2.6619, + "step": 43465 + }, + { + "epoch": 2.6982432180768514, + "grad_norm": 0.13698309819002938, + "learning_rate": 3.050465059288643e-06, + "loss": 2.7942, + "step": 43466 + }, + { + "epoch": 2.6983052951766093, + "grad_norm": 0.14015860170711258, + "learning_rate": 3.049223024980813e-06, + "loss": 2.7537, + "step": 43467 + }, + { + "epoch": 2.6983673722763672, + "grad_norm": 0.1519897639861528, + "learning_rate": 3.0479812356265847e-06, + "loss": 2.7265, + "step": 43468 + }, + { + "epoch": 2.698429449376125, + "grad_norm": 0.13735263877070736, + "learning_rate": 3.046739691232431e-06, + "loss": 2.7046, + "step": 43469 + }, + { + "epoch": 2.698491526475883, + "grad_norm": 0.13489502071322412, + "learning_rate": 3.045498391804846e-06, + "loss": 2.6123, + "step": 43470 + }, + { + "epoch": 2.698553603575641, + "grad_norm": 0.1351287551983578, + "learning_rate": 3.044257337350287e-06, + "loss": 2.6695, + "step": 43471 + }, + { + "epoch": 2.698615680675399, + "grad_norm": 0.1455904066967375, + "learning_rate": 3.043016527875242e-06, + "loss": 2.834, + "step": 43472 + }, + { + "epoch": 2.698677757775157, + "grad_norm": 0.13230303915082153, + "learning_rate": 3.0417759633861674e-06, + "loss": 2.7181, + "step": 43473 + }, + { + "epoch": 2.6987398348749148, + "grad_norm": 0.13399028264581675, + "learning_rate": 3.0405356438895584e-06, + "loss": 2.7921, + "step": 43474 + }, + { + "epoch": 2.6988019119746727, + "grad_norm": 0.13281099935693605, + "learning_rate": 3.0392955693918702e-06, + "loss": 2.8353, + "step": 43475 + }, + { + "epoch": 2.6988639890744306, + "grad_norm": 0.12833196421637066, + "learning_rate": 3.0380557398995814e-06, + "loss": 2.6277, + "step": 43476 + }, + { + "epoch": 2.698926066174188, + "grad_norm": 0.1430678969686394, + "learning_rate": 3.036816155419153e-06, + "loss": 2.6638, + "step": 43477 + }, + { + "epoch": 2.6989881432739464, + "grad_norm": 0.1561542575282328, + "learning_rate": 3.0355768159570473e-06, + "loss": 2.7527, + "step": 43478 + }, + { + "epoch": 2.699050220373704, + "grad_norm": 0.15081724122740667, + "learning_rate": 3.0343377215197477e-06, + "loss": 2.6689, + "step": 43479 + }, + { + "epoch": 2.6991122974734623, + "grad_norm": 0.1335294948224165, + "learning_rate": 3.03309887211371e-06, + "loss": 2.6428, + "step": 43480 + }, + { + "epoch": 2.6991743745732197, + "grad_norm": 0.1415761859122348, + "learning_rate": 3.0318602677453955e-06, + "loss": 2.6169, + "step": 43481 + }, + { + "epoch": 2.699236451672978, + "grad_norm": 0.13473899550568363, + "learning_rate": 3.030621908421266e-06, + "loss": 2.6484, + "step": 43482 + }, + { + "epoch": 2.6992985287727356, + "grad_norm": 0.13955822278116886, + "learning_rate": 3.029383794147783e-06, + "loss": 2.7123, + "step": 43483 + }, + { + "epoch": 2.6993606058724935, + "grad_norm": 0.1312625779641827, + "learning_rate": 3.0281459249314135e-06, + "loss": 2.6641, + "step": 43484 + }, + { + "epoch": 2.6994226829722514, + "grad_norm": 0.13394323918653384, + "learning_rate": 3.026908300778608e-06, + "loss": 2.727, + "step": 43485 + }, + { + "epoch": 2.6994847600720093, + "grad_norm": 0.13808194118258219, + "learning_rate": 3.0256709216958168e-06, + "loss": 2.6629, + "step": 43486 + }, + { + "epoch": 2.6995468371717672, + "grad_norm": 0.16212085315559743, + "learning_rate": 3.0244337876895124e-06, + "loss": 2.7267, + "step": 43487 + }, + { + "epoch": 2.699608914271525, + "grad_norm": 0.13179453446910996, + "learning_rate": 3.0231968987661397e-06, + "loss": 2.7022, + "step": 43488 + }, + { + "epoch": 2.699670991371283, + "grad_norm": 0.1343893866821796, + "learning_rate": 3.0219602549321547e-06, + "loss": 2.7375, + "step": 43489 + }, + { + "epoch": 2.699733068471041, + "grad_norm": 0.13864279831698653, + "learning_rate": 3.020723856194002e-06, + "loss": 2.749, + "step": 43490 + }, + { + "epoch": 2.699795145570799, + "grad_norm": 0.13146955654802725, + "learning_rate": 3.0194877025581326e-06, + "loss": 2.7492, + "step": 43491 + }, + { + "epoch": 2.699857222670557, + "grad_norm": 0.13945870748934527, + "learning_rate": 3.018251794031013e-06, + "loss": 2.6616, + "step": 43492 + }, + { + "epoch": 2.6999192997703148, + "grad_norm": 0.13738413068471408, + "learning_rate": 3.017016130619077e-06, + "loss": 2.7682, + "step": 43493 + }, + { + "epoch": 2.6999813768700727, + "grad_norm": 0.14548537702218425, + "learning_rate": 3.0157807123287694e-06, + "loss": 2.6715, + "step": 43494 + }, + { + "epoch": 2.7000434539698306, + "grad_norm": 0.1433024298531612, + "learning_rate": 3.0145455391665354e-06, + "loss": 2.6099, + "step": 43495 + }, + { + "epoch": 2.7001055310695885, + "grad_norm": 0.13943879190083283, + "learning_rate": 3.013310611138831e-06, + "loss": 2.7182, + "step": 43496 + }, + { + "epoch": 2.7001676081693464, + "grad_norm": 0.14239407116786298, + "learning_rate": 3.012075928252078e-06, + "loss": 2.7514, + "step": 43497 + }, + { + "epoch": 2.7002296852691043, + "grad_norm": 0.14592993395018863, + "learning_rate": 3.0108414905127446e-06, + "loss": 2.7382, + "step": 43498 + }, + { + "epoch": 2.7002917623688623, + "grad_norm": 0.13922969352318626, + "learning_rate": 3.0096072979272585e-06, + "loss": 2.6836, + "step": 43499 + }, + { + "epoch": 2.70035383946862, + "grad_norm": 0.1333687926678442, + "learning_rate": 3.008373350502053e-06, + "loss": 2.7154, + "step": 43500 + }, + { + "epoch": 2.700415916568378, + "grad_norm": 0.146470647804478, + "learning_rate": 3.007139648243573e-06, + "loss": 2.7601, + "step": 43501 + }, + { + "epoch": 2.7004779936681356, + "grad_norm": 0.16578334378278445, + "learning_rate": 3.005906191158259e-06, + "loss": 2.7322, + "step": 43502 + }, + { + "epoch": 2.700540070767894, + "grad_norm": 0.13456148082614736, + "learning_rate": 3.0046729792525375e-06, + "loss": 2.7116, + "step": 43503 + }, + { + "epoch": 2.7006021478676514, + "grad_norm": 0.1397591465492856, + "learning_rate": 3.0034400125328432e-06, + "loss": 2.6429, + "step": 43504 + }, + { + "epoch": 2.7006642249674098, + "grad_norm": 0.14126502856970533, + "learning_rate": 3.00220729100561e-06, + "loss": 2.7696, + "step": 43505 + }, + { + "epoch": 2.7007263020671672, + "grad_norm": 0.13025880471195248, + "learning_rate": 3.0009748146772766e-06, + "loss": 2.6595, + "step": 43506 + }, + { + "epoch": 2.700788379166925, + "grad_norm": 0.14337660324768406, + "learning_rate": 2.9997425835542605e-06, + "loss": 2.6948, + "step": 43507 + }, + { + "epoch": 2.700850456266683, + "grad_norm": 0.14589767583523117, + "learning_rate": 2.9985105976430005e-06, + "loss": 2.6893, + "step": 43508 + }, + { + "epoch": 2.700912533366441, + "grad_norm": 0.1380593175737138, + "learning_rate": 2.9972788569499145e-06, + "loss": 2.7025, + "step": 43509 + }, + { + "epoch": 2.700974610466199, + "grad_norm": 0.13158314097852258, + "learning_rate": 2.9960473614814467e-06, + "loss": 2.6979, + "step": 43510 + }, + { + "epoch": 2.701036687565957, + "grad_norm": 0.1321163726974607, + "learning_rate": 2.9948161112440033e-06, + "loss": 2.7432, + "step": 43511 + }, + { + "epoch": 2.7010987646657147, + "grad_norm": 0.15295964236319776, + "learning_rate": 2.993585106244018e-06, + "loss": 2.7533, + "step": 43512 + }, + { + "epoch": 2.7011608417654727, + "grad_norm": 0.14558268547264233, + "learning_rate": 2.9923543464879076e-06, + "loss": 2.7077, + "step": 43513 + }, + { + "epoch": 2.7012229188652306, + "grad_norm": 0.13707550369481838, + "learning_rate": 2.9911238319820955e-06, + "loss": 2.6969, + "step": 43514 + }, + { + "epoch": 2.7012849959649885, + "grad_norm": 0.15896162648266576, + "learning_rate": 2.9898935627330038e-06, + "loss": 2.6734, + "step": 43515 + }, + { + "epoch": 2.7013470730647464, + "grad_norm": 0.14435548017705915, + "learning_rate": 2.9886635387470495e-06, + "loss": 2.7519, + "step": 43516 + }, + { + "epoch": 2.7014091501645043, + "grad_norm": 0.13185200769885305, + "learning_rate": 2.987433760030656e-06, + "loss": 2.6964, + "step": 43517 + }, + { + "epoch": 2.7014712272642623, + "grad_norm": 0.1385433716477861, + "learning_rate": 2.986204226590217e-06, + "loss": 2.7196, + "step": 43518 + }, + { + "epoch": 2.70153330436402, + "grad_norm": 0.14984345910418997, + "learning_rate": 2.984974938432178e-06, + "loss": 2.7643, + "step": 43519 + }, + { + "epoch": 2.701595381463778, + "grad_norm": 0.146985785319295, + "learning_rate": 2.983745895562934e-06, + "loss": 2.6778, + "step": 43520 + }, + { + "epoch": 2.701657458563536, + "grad_norm": 0.14919108538799997, + "learning_rate": 2.9825170979889015e-06, + "loss": 2.7526, + "step": 43521 + }, + { + "epoch": 2.701719535663294, + "grad_norm": 0.13597191780484008, + "learning_rate": 2.9812885457164874e-06, + "loss": 2.7275, + "step": 43522 + }, + { + "epoch": 2.701781612763052, + "grad_norm": 0.1346804308087166, + "learning_rate": 2.980060238752108e-06, + "loss": 2.7002, + "step": 43523 + }, + { + "epoch": 2.7018436898628098, + "grad_norm": 0.14188079268649537, + "learning_rate": 2.97883217710217e-06, + "loss": 2.7589, + "step": 43524 + }, + { + "epoch": 2.7019057669625672, + "grad_norm": 0.1494663496093885, + "learning_rate": 2.9776043607730787e-06, + "loss": 2.6833, + "step": 43525 + }, + { + "epoch": 2.7019678440623256, + "grad_norm": 0.13721491835111035, + "learning_rate": 2.976376789771246e-06, + "loss": 2.749, + "step": 43526 + }, + { + "epoch": 2.702029921162083, + "grad_norm": 0.1343917636616864, + "learning_rate": 2.975149464103061e-06, + "loss": 2.8273, + "step": 43527 + }, + { + "epoch": 2.7020919982618414, + "grad_norm": 0.13662511480166054, + "learning_rate": 2.9739223837749407e-06, + "loss": 2.6365, + "step": 43528 + }, + { + "epoch": 2.702154075361599, + "grad_norm": 0.1335421532268954, + "learning_rate": 2.972695548793275e-06, + "loss": 2.6683, + "step": 43529 + }, + { + "epoch": 2.7022161524613573, + "grad_norm": 0.13425787351072477, + "learning_rate": 2.971468959164486e-06, + "loss": 2.7338, + "step": 43530 + }, + { + "epoch": 2.7022782295611147, + "grad_norm": 0.129010981583029, + "learning_rate": 2.9702426148949582e-06, + "loss": 2.6414, + "step": 43531 + }, + { + "epoch": 2.7023403066608727, + "grad_norm": 0.13256834532454892, + "learning_rate": 2.96901651599108e-06, + "loss": 2.6497, + "step": 43532 + }, + { + "epoch": 2.7024023837606306, + "grad_norm": 0.14376923365044572, + "learning_rate": 2.9677906624592744e-06, + "loss": 2.7634, + "step": 43533 + }, + { + "epoch": 2.7024644608603885, + "grad_norm": 0.13975311570882643, + "learning_rate": 2.966565054305914e-06, + "loss": 2.7995, + "step": 43534 + }, + { + "epoch": 2.7025265379601464, + "grad_norm": 0.13421662084691466, + "learning_rate": 2.965339691537411e-06, + "loss": 2.7047, + "step": 43535 + }, + { + "epoch": 2.7025886150599043, + "grad_norm": 0.13996573061751375, + "learning_rate": 2.964114574160132e-06, + "loss": 2.6703, + "step": 43536 + }, + { + "epoch": 2.7026506921596622, + "grad_norm": 0.1311162377689998, + "learning_rate": 2.962889702180499e-06, + "loss": 2.6389, + "step": 43537 + }, + { + "epoch": 2.70271276925942, + "grad_norm": 0.1456659852625006, + "learning_rate": 2.9616650756048857e-06, + "loss": 2.6676, + "step": 43538 + }, + { + "epoch": 2.702774846359178, + "grad_norm": 0.13639569944149166, + "learning_rate": 2.960440694439687e-06, + "loss": 2.7338, + "step": 43539 + }, + { + "epoch": 2.702836923458936, + "grad_norm": 0.1471431415071663, + "learning_rate": 2.9592165586912912e-06, + "loss": 2.6829, + "step": 43540 + }, + { + "epoch": 2.702899000558694, + "grad_norm": 0.13137946754022398, + "learning_rate": 2.957992668366072e-06, + "loss": 2.6495, + "step": 43541 + }, + { + "epoch": 2.702961077658452, + "grad_norm": 0.13444631175810837, + "learning_rate": 2.956769023470429e-06, + "loss": 2.77, + "step": 43542 + }, + { + "epoch": 2.7030231547582098, + "grad_norm": 0.14464533015206724, + "learning_rate": 2.955545624010747e-06, + "loss": 2.7737, + "step": 43543 + }, + { + "epoch": 2.7030852318579677, + "grad_norm": 0.15128402271145025, + "learning_rate": 2.9543224699934035e-06, + "loss": 2.6959, + "step": 43544 + }, + { + "epoch": 2.7031473089577256, + "grad_norm": 0.13662026127635923, + "learning_rate": 2.9530995614247715e-06, + "loss": 2.6964, + "step": 43545 + }, + { + "epoch": 2.7032093860574835, + "grad_norm": 0.140150634393709, + "learning_rate": 2.9518768983112455e-06, + "loss": 2.6688, + "step": 43546 + }, + { + "epoch": 2.7032714631572414, + "grad_norm": 0.13991947148578757, + "learning_rate": 2.9506544806591983e-06, + "loss": 2.6571, + "step": 43547 + }, + { + "epoch": 2.7033335402569993, + "grad_norm": 0.14629041546347465, + "learning_rate": 2.9494323084750144e-06, + "loss": 2.5113, + "step": 43548 + }, + { + "epoch": 2.7033956173567573, + "grad_norm": 0.1481361907341202, + "learning_rate": 2.9482103817650595e-06, + "loss": 2.7315, + "step": 43549 + }, + { + "epoch": 2.7034576944565147, + "grad_norm": 0.15322400366614347, + "learning_rate": 2.9469887005357077e-06, + "loss": 2.6545, + "step": 43550 + }, + { + "epoch": 2.703519771556273, + "grad_norm": 0.14720228386191117, + "learning_rate": 2.945767264793342e-06, + "loss": 2.7578, + "step": 43551 + }, + { + "epoch": 2.7035818486560306, + "grad_norm": 0.13049181517286568, + "learning_rate": 2.9445460745443355e-06, + "loss": 2.6874, + "step": 43552 + }, + { + "epoch": 2.703643925755789, + "grad_norm": 0.14887658854509042, + "learning_rate": 2.9433251297950546e-06, + "loss": 2.8049, + "step": 43553 + }, + { + "epoch": 2.7037060028555464, + "grad_norm": 0.1486752450754472, + "learning_rate": 2.942104430551862e-06, + "loss": 2.7411, + "step": 43554 + }, + { + "epoch": 2.7037680799553043, + "grad_norm": 0.14368640928820994, + "learning_rate": 2.9408839768211404e-06, + "loss": 2.7177, + "step": 43555 + }, + { + "epoch": 2.7038301570550622, + "grad_norm": 0.1350960481665715, + "learning_rate": 2.939663768609252e-06, + "loss": 2.7248, + "step": 43556 + }, + { + "epoch": 2.70389223415482, + "grad_norm": 0.14432935531949168, + "learning_rate": 2.938443805922564e-06, + "loss": 2.6176, + "step": 43557 + }, + { + "epoch": 2.703954311254578, + "grad_norm": 0.13754389939907827, + "learning_rate": 2.937224088767432e-06, + "loss": 2.6395, + "step": 43558 + }, + { + "epoch": 2.704016388354336, + "grad_norm": 0.14990687285797458, + "learning_rate": 2.936004617150234e-06, + "loss": 2.7855, + "step": 43559 + }, + { + "epoch": 2.704078465454094, + "grad_norm": 0.1311290297020864, + "learning_rate": 2.9347853910773217e-06, + "loss": 2.6631, + "step": 43560 + }, + { + "epoch": 2.704140542553852, + "grad_norm": 0.13429994566304562, + "learning_rate": 2.9335664105550608e-06, + "loss": 2.78, + "step": 43561 + }, + { + "epoch": 2.7042026196536098, + "grad_norm": 0.13719332616378313, + "learning_rate": 2.9323476755898027e-06, + "loss": 2.7114, + "step": 43562 + }, + { + "epoch": 2.7042646967533677, + "grad_norm": 0.1301983821563107, + "learning_rate": 2.9311291861879197e-06, + "loss": 2.6861, + "step": 43563 + }, + { + "epoch": 2.7043267738531256, + "grad_norm": 0.13319332556602104, + "learning_rate": 2.929910942355757e-06, + "loss": 2.6703, + "step": 43564 + }, + { + "epoch": 2.7043888509528835, + "grad_norm": 0.15360297734163028, + "learning_rate": 2.928692944099681e-06, + "loss": 2.7215, + "step": 43565 + }, + { + "epoch": 2.7044509280526414, + "grad_norm": 0.13369651015621342, + "learning_rate": 2.9274751914260434e-06, + "loss": 2.6947, + "step": 43566 + }, + { + "epoch": 2.7045130051523993, + "grad_norm": 0.14021221479902282, + "learning_rate": 2.9262576843411883e-06, + "loss": 2.6787, + "step": 43567 + }, + { + "epoch": 2.7045750822521573, + "grad_norm": 0.1419024788822286, + "learning_rate": 2.9250404228514717e-06, + "loss": 2.7176, + "step": 43568 + }, + { + "epoch": 2.704637159351915, + "grad_norm": 0.14297860872432266, + "learning_rate": 2.9238234069632555e-06, + "loss": 2.6364, + "step": 43569 + }, + { + "epoch": 2.704699236451673, + "grad_norm": 0.1358975132414333, + "learning_rate": 2.9226066366828785e-06, + "loss": 2.7603, + "step": 43570 + }, + { + "epoch": 2.704761313551431, + "grad_norm": 0.13363362074281127, + "learning_rate": 2.921390112016692e-06, + "loss": 2.7901, + "step": 43571 + }, + { + "epoch": 2.704823390651189, + "grad_norm": 0.14439853812805872, + "learning_rate": 2.9201738329710347e-06, + "loss": 2.6936, + "step": 43572 + }, + { + "epoch": 2.7048854677509464, + "grad_norm": 0.13432679392851907, + "learning_rate": 2.9189577995522687e-06, + "loss": 2.7229, + "step": 43573 + }, + { + "epoch": 2.7049475448507048, + "grad_norm": 0.1447353404133468, + "learning_rate": 2.9177420117667275e-06, + "loss": 2.6433, + "step": 43574 + }, + { + "epoch": 2.7050096219504622, + "grad_norm": 0.13525686019811642, + "learning_rate": 2.9165264696207563e-06, + "loss": 2.6595, + "step": 43575 + }, + { + "epoch": 2.7050716990502206, + "grad_norm": 0.13776856361076817, + "learning_rate": 2.9153111731206938e-06, + "loss": 2.6354, + "step": 43576 + }, + { + "epoch": 2.705133776149978, + "grad_norm": 0.146421864205192, + "learning_rate": 2.91409612227288e-06, + "loss": 2.6611, + "step": 43577 + }, + { + "epoch": 2.7051958532497364, + "grad_norm": 0.13025772508979663, + "learning_rate": 2.91288131708366e-06, + "loss": 2.7155, + "step": 43578 + }, + { + "epoch": 2.705257930349494, + "grad_norm": 0.13287040906530936, + "learning_rate": 2.9116667575593725e-06, + "loss": 2.7448, + "step": 43579 + }, + { + "epoch": 2.705320007449252, + "grad_norm": 0.14361236996350546, + "learning_rate": 2.9104524437063517e-06, + "loss": 2.6961, + "step": 43580 + }, + { + "epoch": 2.7053820845490097, + "grad_norm": 0.15717821384817787, + "learning_rate": 2.9092383755309207e-06, + "loss": 2.7268, + "step": 43581 + }, + { + "epoch": 2.7054441616487677, + "grad_norm": 0.13922008094194216, + "learning_rate": 2.9080245530394344e-06, + "loss": 2.7057, + "step": 43582 + }, + { + "epoch": 2.7055062387485256, + "grad_norm": 0.13570365770594478, + "learning_rate": 2.906810976238217e-06, + "loss": 2.7838, + "step": 43583 + }, + { + "epoch": 2.7055683158482835, + "grad_norm": 0.13466500243750626, + "learning_rate": 2.9055976451335953e-06, + "loss": 2.6383, + "step": 43584 + }, + { + "epoch": 2.7056303929480414, + "grad_norm": 0.13228661541293835, + "learning_rate": 2.9043845597318984e-06, + "loss": 2.8179, + "step": 43585 + }, + { + "epoch": 2.7056924700477993, + "grad_norm": 0.13233079929106595, + "learning_rate": 2.9031717200394657e-06, + "loss": 2.7727, + "step": 43586 + }, + { + "epoch": 2.7057545471475573, + "grad_norm": 0.1418831623140926, + "learning_rate": 2.90195912606262e-06, + "loss": 2.6353, + "step": 43587 + }, + { + "epoch": 2.705816624247315, + "grad_norm": 0.1341227178668314, + "learning_rate": 2.9007467778076893e-06, + "loss": 2.8366, + "step": 43588 + }, + { + "epoch": 2.705878701347073, + "grad_norm": 0.13592367595229796, + "learning_rate": 2.899534675280996e-06, + "loss": 2.7048, + "step": 43589 + }, + { + "epoch": 2.705940778446831, + "grad_norm": 0.1496621857901964, + "learning_rate": 2.8983228184888577e-06, + "loss": 2.7442, + "step": 43590 + }, + { + "epoch": 2.706002855546589, + "grad_norm": 0.14496107239692516, + "learning_rate": 2.8971112074376084e-06, + "loss": 2.776, + "step": 43591 + }, + { + "epoch": 2.706064932646347, + "grad_norm": 0.13485893529785592, + "learning_rate": 2.895899842133565e-06, + "loss": 2.7295, + "step": 43592 + }, + { + "epoch": 2.7061270097461048, + "grad_norm": 0.1323432416016633, + "learning_rate": 2.8946887225830445e-06, + "loss": 2.7369, + "step": 43593 + }, + { + "epoch": 2.7061890868458627, + "grad_norm": 0.14726087754790815, + "learning_rate": 2.893477848792364e-06, + "loss": 2.7467, + "step": 43594 + }, + { + "epoch": 2.7062511639456206, + "grad_norm": 0.13681550008622112, + "learning_rate": 2.892267220767847e-06, + "loss": 2.6731, + "step": 43595 + }, + { + "epoch": 2.7063132410453785, + "grad_norm": 0.1489215975314461, + "learning_rate": 2.8910568385158097e-06, + "loss": 2.7799, + "step": 43596 + }, + { + "epoch": 2.7063753181451364, + "grad_norm": 0.1451679395130519, + "learning_rate": 2.8898467020425702e-06, + "loss": 2.6876, + "step": 43597 + }, + { + "epoch": 2.706437395244894, + "grad_norm": 0.13696378139811366, + "learning_rate": 2.8886368113544337e-06, + "loss": 2.6695, + "step": 43598 + }, + { + "epoch": 2.7064994723446523, + "grad_norm": 0.13873084251786325, + "learning_rate": 2.887427166457718e-06, + "loss": 2.7349, + "step": 43599 + }, + { + "epoch": 2.7065615494444097, + "grad_norm": 0.14073425922831623, + "learning_rate": 2.8862177673587233e-06, + "loss": 2.7498, + "step": 43600 + }, + { + "epoch": 2.706623626544168, + "grad_norm": 0.13321550317342737, + "learning_rate": 2.8850086140637723e-06, + "loss": 2.7549, + "step": 43601 + }, + { + "epoch": 2.7066857036439256, + "grad_norm": 0.1328285712447892, + "learning_rate": 2.8837997065791713e-06, + "loss": 2.6211, + "step": 43602 + }, + { + "epoch": 2.7067477807436835, + "grad_norm": 0.13200900409569533, + "learning_rate": 2.882591044911226e-06, + "loss": 2.7532, + "step": 43603 + }, + { + "epoch": 2.7068098578434414, + "grad_norm": 0.13072070560456175, + "learning_rate": 2.8813826290662317e-06, + "loss": 2.7317, + "step": 43604 + }, + { + "epoch": 2.7068719349431993, + "grad_norm": 0.13126420273457007, + "learning_rate": 2.880174459050511e-06, + "loss": 2.6874, + "step": 43605 + }, + { + "epoch": 2.7069340120429572, + "grad_norm": 0.1378600305435521, + "learning_rate": 2.8789665348703588e-06, + "loss": 2.6735, + "step": 43606 + }, + { + "epoch": 2.706996089142715, + "grad_norm": 0.14609373138550014, + "learning_rate": 2.877758856532081e-06, + "loss": 2.7017, + "step": 43607 + }, + { + "epoch": 2.707058166242473, + "grad_norm": 0.1310979319827255, + "learning_rate": 2.876551424041962e-06, + "loss": 2.697, + "step": 43608 + }, + { + "epoch": 2.707120243342231, + "grad_norm": 0.14142740655140823, + "learning_rate": 2.8753442374063243e-06, + "loss": 2.7236, + "step": 43609 + }, + { + "epoch": 2.707182320441989, + "grad_norm": 0.15888699729065023, + "learning_rate": 2.8741372966314515e-06, + "loss": 2.6757, + "step": 43610 + }, + { + "epoch": 2.707244397541747, + "grad_norm": 0.1341975026748945, + "learning_rate": 2.8729306017236503e-06, + "loss": 2.6684, + "step": 43611 + }, + { + "epoch": 2.7073064746415048, + "grad_norm": 0.15054687210977785, + "learning_rate": 2.871724152689209e-06, + "loss": 2.7521, + "step": 43612 + }, + { + "epoch": 2.7073685517412627, + "grad_norm": 0.13798495192132326, + "learning_rate": 2.8705179495344127e-06, + "loss": 2.7552, + "step": 43613 + }, + { + "epoch": 2.7074306288410206, + "grad_norm": 0.1415770476594643, + "learning_rate": 2.8693119922655775e-06, + "loss": 2.6578, + "step": 43614 + }, + { + "epoch": 2.7074927059407785, + "grad_norm": 0.12970865375852153, + "learning_rate": 2.8681062808889768e-06, + "loss": 2.6809, + "step": 43615 + }, + { + "epoch": 2.7075547830405364, + "grad_norm": 0.13162403030391528, + "learning_rate": 2.866900815410911e-06, + "loss": 2.6602, + "step": 43616 + }, + { + "epoch": 2.7076168601402943, + "grad_norm": 0.1290948889157381, + "learning_rate": 2.8656955958376586e-06, + "loss": 2.6493, + "step": 43617 + }, + { + "epoch": 2.7076789372400523, + "grad_norm": 0.1332255210395076, + "learning_rate": 2.864490622175525e-06, + "loss": 2.7269, + "step": 43618 + }, + { + "epoch": 2.70774101433981, + "grad_norm": 0.16806424553483543, + "learning_rate": 2.863285894430784e-06, + "loss": 2.7172, + "step": 43619 + }, + { + "epoch": 2.707803091439568, + "grad_norm": 0.13770338129521617, + "learning_rate": 2.8620814126097185e-06, + "loss": 2.7501, + "step": 43620 + }, + { + "epoch": 2.7078651685393256, + "grad_norm": 0.13408702193694438, + "learning_rate": 2.8608771767186183e-06, + "loss": 2.6532, + "step": 43621 + }, + { + "epoch": 2.707927245639084, + "grad_norm": 0.14928159888108763, + "learning_rate": 2.8596731867637673e-06, + "loss": 2.6876, + "step": 43622 + }, + { + "epoch": 2.7079893227388414, + "grad_norm": 0.14734178111964563, + "learning_rate": 2.8584694427514437e-06, + "loss": 2.8071, + "step": 43623 + }, + { + "epoch": 2.7080513998385998, + "grad_norm": 0.13910350650370562, + "learning_rate": 2.8572659446879312e-06, + "loss": 2.7502, + "step": 43624 + }, + { + "epoch": 2.7081134769383572, + "grad_norm": 0.13024596306334624, + "learning_rate": 2.856062692579509e-06, + "loss": 2.6406, + "step": 43625 + }, + { + "epoch": 2.7081755540381156, + "grad_norm": 0.1339728283702936, + "learning_rate": 2.854859686432443e-06, + "loss": 2.7373, + "step": 43626 + }, + { + "epoch": 2.708237631137873, + "grad_norm": 0.13280954077172702, + "learning_rate": 2.8536569262530233e-06, + "loss": 2.7104, + "step": 43627 + }, + { + "epoch": 2.708299708237631, + "grad_norm": 0.14987358786655014, + "learning_rate": 2.8524544120475173e-06, + "loss": 2.7728, + "step": 43628 + }, + { + "epoch": 2.708361785337389, + "grad_norm": 0.1335060579143446, + "learning_rate": 2.8512521438222084e-06, + "loss": 2.6694, + "step": 43629 + }, + { + "epoch": 2.708423862437147, + "grad_norm": 0.1302475933650698, + "learning_rate": 2.850050121583364e-06, + "loss": 2.7263, + "step": 43630 + }, + { + "epoch": 2.7084859395369048, + "grad_norm": 0.14491184487811312, + "learning_rate": 2.8488483453372407e-06, + "loss": 2.7079, + "step": 43631 + }, + { + "epoch": 2.7085480166366627, + "grad_norm": 0.14802530428473895, + "learning_rate": 2.847646815090138e-06, + "loss": 2.6736, + "step": 43632 + }, + { + "epoch": 2.7086100937364206, + "grad_norm": 0.12981837689206202, + "learning_rate": 2.846445530848302e-06, + "loss": 2.6731, + "step": 43633 + }, + { + "epoch": 2.7086721708361785, + "grad_norm": 0.13702750764171095, + "learning_rate": 2.84524449261801e-06, + "loss": 2.6173, + "step": 43634 + }, + { + "epoch": 2.7087342479359364, + "grad_norm": 0.12976061299731192, + "learning_rate": 2.844043700405513e-06, + "loss": 2.6286, + "step": 43635 + }, + { + "epoch": 2.7087963250356943, + "grad_norm": 0.1338393193544091, + "learning_rate": 2.8428431542171006e-06, + "loss": 2.708, + "step": 43636 + }, + { + "epoch": 2.7088584021354523, + "grad_norm": 0.14154084975850312, + "learning_rate": 2.8416428540590177e-06, + "loss": 2.6125, + "step": 43637 + }, + { + "epoch": 2.70892047923521, + "grad_norm": 0.13606152604095756, + "learning_rate": 2.840442799937537e-06, + "loss": 2.631, + "step": 43638 + }, + { + "epoch": 2.708982556334968, + "grad_norm": 0.13885646274320446, + "learning_rate": 2.8392429918589148e-06, + "loss": 2.7153, + "step": 43639 + }, + { + "epoch": 2.709044633434726, + "grad_norm": 0.14839864523467175, + "learning_rate": 2.838043429829401e-06, + "loss": 2.6354, + "step": 43640 + }, + { + "epoch": 2.709106710534484, + "grad_norm": 0.13337571270295542, + "learning_rate": 2.8368441138552694e-06, + "loss": 2.7252, + "step": 43641 + }, + { + "epoch": 2.709168787634242, + "grad_norm": 0.13980521891328526, + "learning_rate": 2.8356450439427752e-06, + "loss": 2.7215, + "step": 43642 + }, + { + "epoch": 2.7092308647339998, + "grad_norm": 0.15587351264029856, + "learning_rate": 2.8344462200981693e-06, + "loss": 2.6705, + "step": 43643 + }, + { + "epoch": 2.7092929418337577, + "grad_norm": 0.14954838916608673, + "learning_rate": 2.8332476423276968e-06, + "loss": 2.6792, + "step": 43644 + }, + { + "epoch": 2.7093550189335156, + "grad_norm": 0.13614705310918193, + "learning_rate": 2.83204931063763e-06, + "loss": 2.703, + "step": 43645 + }, + { + "epoch": 2.709417096033273, + "grad_norm": 0.14199622173428963, + "learning_rate": 2.8308512250342145e-06, + "loss": 2.6465, + "step": 43646 + }, + { + "epoch": 2.7094791731330314, + "grad_norm": 0.13557361179686034, + "learning_rate": 2.829653385523695e-06, + "loss": 2.771, + "step": 43647 + }, + { + "epoch": 2.709541250232789, + "grad_norm": 0.153583440531662, + "learning_rate": 2.828455792112328e-06, + "loss": 2.6831, + "step": 43648 + }, + { + "epoch": 2.7096033273325473, + "grad_norm": 0.14462597423520418, + "learning_rate": 2.827258444806352e-06, + "loss": 2.6932, + "step": 43649 + }, + { + "epoch": 2.7096654044323047, + "grad_norm": 0.14841332364208415, + "learning_rate": 2.826061343612024e-06, + "loss": 2.631, + "step": 43650 + }, + { + "epoch": 2.7097274815320627, + "grad_norm": 0.15498437906583112, + "learning_rate": 2.824864488535589e-06, + "loss": 2.7347, + "step": 43651 + }, + { + "epoch": 2.7097895586318206, + "grad_norm": 0.1418870875381068, + "learning_rate": 2.823667879583286e-06, + "loss": 2.6829, + "step": 43652 + }, + { + "epoch": 2.7098516357315785, + "grad_norm": 0.13204475395847276, + "learning_rate": 2.822471516761355e-06, + "loss": 2.6602, + "step": 43653 + }, + { + "epoch": 2.7099137128313364, + "grad_norm": 0.13255921924196748, + "learning_rate": 2.821275400076051e-06, + "loss": 2.6658, + "step": 43654 + }, + { + "epoch": 2.7099757899310943, + "grad_norm": 0.13371979528358677, + "learning_rate": 2.8200795295336037e-06, + "loss": 2.69, + "step": 43655 + }, + { + "epoch": 2.7100378670308523, + "grad_norm": 0.1513335344374769, + "learning_rate": 2.818883905140257e-06, + "loss": 2.6942, + "step": 43656 + }, + { + "epoch": 2.71009994413061, + "grad_norm": 0.15563564519782658, + "learning_rate": 2.8176885269022347e-06, + "loss": 2.7422, + "step": 43657 + }, + { + "epoch": 2.710162021230368, + "grad_norm": 0.1280614087419945, + "learning_rate": 2.816493394825798e-06, + "loss": 2.7371, + "step": 43658 + }, + { + "epoch": 2.710224098330126, + "grad_norm": 0.12850388673320162, + "learning_rate": 2.8152985089171635e-06, + "loss": 2.6428, + "step": 43659 + }, + { + "epoch": 2.710286175429884, + "grad_norm": 0.1368250093335142, + "learning_rate": 2.8141038691825714e-06, + "loss": 2.7121, + "step": 43660 + }, + { + "epoch": 2.710348252529642, + "grad_norm": 0.13387104641083522, + "learning_rate": 2.8129094756282557e-06, + "loss": 2.7146, + "step": 43661 + }, + { + "epoch": 2.7104103296293998, + "grad_norm": 0.14158092216807716, + "learning_rate": 2.81171532826045e-06, + "loss": 2.8254, + "step": 43662 + }, + { + "epoch": 2.7104724067291577, + "grad_norm": 0.15586201658376617, + "learning_rate": 2.810521427085372e-06, + "loss": 2.6228, + "step": 43663 + }, + { + "epoch": 2.7105344838289156, + "grad_norm": 0.15694414565023695, + "learning_rate": 2.809327772109266e-06, + "loss": 2.6556, + "step": 43664 + }, + { + "epoch": 2.7105965609286735, + "grad_norm": 0.14352719050089505, + "learning_rate": 2.8081343633383495e-06, + "loss": 2.6494, + "step": 43665 + }, + { + "epoch": 2.7106586380284314, + "grad_norm": 0.14322010286147097, + "learning_rate": 2.8069412007788567e-06, + "loss": 2.6191, + "step": 43666 + }, + { + "epoch": 2.7107207151281894, + "grad_norm": 0.13375098668162325, + "learning_rate": 2.8057482844369986e-06, + "loss": 2.7381, + "step": 43667 + }, + { + "epoch": 2.7107827922279473, + "grad_norm": 0.1308674738640983, + "learning_rate": 2.804555614319021e-06, + "loss": 2.6814, + "step": 43668 + }, + { + "epoch": 2.7108448693277047, + "grad_norm": 0.13672322538407855, + "learning_rate": 2.80336319043113e-06, + "loss": 2.7896, + "step": 43669 + }, + { + "epoch": 2.710906946427463, + "grad_norm": 0.13472588962288465, + "learning_rate": 2.8021710127795476e-06, + "loss": 2.6946, + "step": 43670 + }, + { + "epoch": 2.7109690235272206, + "grad_norm": 0.13548570868876475, + "learning_rate": 2.800979081370497e-06, + "loss": 2.6201, + "step": 43671 + }, + { + "epoch": 2.711031100626979, + "grad_norm": 0.154103495944698, + "learning_rate": 2.799787396210196e-06, + "loss": 2.7884, + "step": 43672 + }, + { + "epoch": 2.7110931777267364, + "grad_norm": 0.1354320674782842, + "learning_rate": 2.798595957304867e-06, + "loss": 2.7402, + "step": 43673 + }, + { + "epoch": 2.7111552548264948, + "grad_norm": 0.14321004927814812, + "learning_rate": 2.7974047646607217e-06, + "loss": 2.7254, + "step": 43674 + }, + { + "epoch": 2.7112173319262522, + "grad_norm": 0.1439448394146499, + "learning_rate": 2.7962138182839716e-06, + "loss": 2.6999, + "step": 43675 + }, + { + "epoch": 2.71127940902601, + "grad_norm": 0.13793580717146467, + "learning_rate": 2.7950231181808285e-06, + "loss": 2.7082, + "step": 43676 + }, + { + "epoch": 2.711341486125768, + "grad_norm": 0.14466089156412046, + "learning_rate": 2.7938326643575154e-06, + "loss": 2.7305, + "step": 43677 + }, + { + "epoch": 2.711403563225526, + "grad_norm": 0.13676927858961838, + "learning_rate": 2.792642456820238e-06, + "loss": 2.7098, + "step": 43678 + }, + { + "epoch": 2.711465640325284, + "grad_norm": 0.1338260627316003, + "learning_rate": 2.791452495575203e-06, + "loss": 2.7085, + "step": 43679 + }, + { + "epoch": 2.711527717425042, + "grad_norm": 0.1294914170784565, + "learning_rate": 2.790262780628611e-06, + "loss": 2.6632, + "step": 43680 + }, + { + "epoch": 2.7115897945247998, + "grad_norm": 0.1512440765320427, + "learning_rate": 2.7890733119866896e-06, + "loss": 2.7171, + "step": 43681 + }, + { + "epoch": 2.7116518716245577, + "grad_norm": 0.13909973037271606, + "learning_rate": 2.7878840896556346e-06, + "loss": 2.59, + "step": 43682 + }, + { + "epoch": 2.7117139487243156, + "grad_norm": 0.14383303975250847, + "learning_rate": 2.7866951136416464e-06, + "loss": 2.7449, + "step": 43683 + }, + { + "epoch": 2.7117760258240735, + "grad_norm": 0.13310099571140038, + "learning_rate": 2.785506383950931e-06, + "loss": 2.6699, + "step": 43684 + }, + { + "epoch": 2.7118381029238314, + "grad_norm": 0.15334700698262266, + "learning_rate": 2.784317900589689e-06, + "loss": 2.7011, + "step": 43685 + }, + { + "epoch": 2.7119001800235893, + "grad_norm": 0.13330245104292351, + "learning_rate": 2.7831296635641267e-06, + "loss": 2.6768, + "step": 43686 + }, + { + "epoch": 2.7119622571233473, + "grad_norm": 0.14082178281738672, + "learning_rate": 2.7819416728804335e-06, + "loss": 2.6638, + "step": 43687 + }, + { + "epoch": 2.712024334223105, + "grad_norm": 0.1413880942475764, + "learning_rate": 2.7807539285448216e-06, + "loss": 2.7972, + "step": 43688 + }, + { + "epoch": 2.712086411322863, + "grad_norm": 0.1334736370517409, + "learning_rate": 2.7795664305634684e-06, + "loss": 2.6679, + "step": 43689 + }, + { + "epoch": 2.712148488422621, + "grad_norm": 0.14066945141674075, + "learning_rate": 2.7783791789425863e-06, + "loss": 2.6727, + "step": 43690 + }, + { + "epoch": 2.712210565522379, + "grad_norm": 0.13585402320775658, + "learning_rate": 2.7771921736883643e-06, + "loss": 2.7448, + "step": 43691 + }, + { + "epoch": 2.712272642622137, + "grad_norm": 0.13151599774619985, + "learning_rate": 2.7760054148069926e-06, + "loss": 2.6906, + "step": 43692 + }, + { + "epoch": 2.7123347197218948, + "grad_norm": 0.13546087081505365, + "learning_rate": 2.77481890230466e-06, + "loss": 2.6398, + "step": 43693 + }, + { + "epoch": 2.7123967968216522, + "grad_norm": 0.15745212534058772, + "learning_rate": 2.773632636187562e-06, + "loss": 2.692, + "step": 43694 + }, + { + "epoch": 2.7124588739214106, + "grad_norm": 0.13304202498210407, + "learning_rate": 2.7724466164618935e-06, + "loss": 2.7375, + "step": 43695 + }, + { + "epoch": 2.712520951021168, + "grad_norm": 0.13579957421363154, + "learning_rate": 2.7712608431338384e-06, + "loss": 2.6853, + "step": 43696 + }, + { + "epoch": 2.7125830281209264, + "grad_norm": 0.13912049113042288, + "learning_rate": 2.7700753162095748e-06, + "loss": 2.6356, + "step": 43697 + }, + { + "epoch": 2.712645105220684, + "grad_norm": 0.1385588521145988, + "learning_rate": 2.7688900356952987e-06, + "loss": 2.7164, + "step": 43698 + }, + { + "epoch": 2.712707182320442, + "grad_norm": 0.1531477035507209, + "learning_rate": 2.7677050015971815e-06, + "loss": 2.7551, + "step": 43699 + }, + { + "epoch": 2.7127692594201998, + "grad_norm": 0.128917382944417, + "learning_rate": 2.76652021392142e-06, + "loss": 2.6944, + "step": 43700 + }, + { + "epoch": 2.7128313365199577, + "grad_norm": 0.1343407983835003, + "learning_rate": 2.765335672674185e-06, + "loss": 2.7388, + "step": 43701 + }, + { + "epoch": 2.7128934136197156, + "grad_norm": 0.13222150581641728, + "learning_rate": 2.764151377861668e-06, + "loss": 2.6539, + "step": 43702 + }, + { + "epoch": 2.7129554907194735, + "grad_norm": 0.1354709770037185, + "learning_rate": 2.7629673294900293e-06, + "loss": 2.7791, + "step": 43703 + }, + { + "epoch": 2.7130175678192314, + "grad_norm": 0.12842631584859368, + "learning_rate": 2.761783527565459e-06, + "loss": 2.7031, + "step": 43704 + }, + { + "epoch": 2.7130796449189893, + "grad_norm": 0.13904485778125528, + "learning_rate": 2.7605999720941356e-06, + "loss": 2.6211, + "step": 43705 + }, + { + "epoch": 2.7131417220187473, + "grad_norm": 0.14951503059810423, + "learning_rate": 2.7594166630822314e-06, + "loss": 2.6851, + "step": 43706 + }, + { + "epoch": 2.713203799118505, + "grad_norm": 0.15142266887180753, + "learning_rate": 2.7582336005359144e-06, + "loss": 2.7429, + "step": 43707 + }, + { + "epoch": 2.713265876218263, + "grad_norm": 0.14185597157694502, + "learning_rate": 2.7570507844613623e-06, + "loss": 2.6682, + "step": 43708 + }, + { + "epoch": 2.713327953318021, + "grad_norm": 0.14147474595305085, + "learning_rate": 2.7558682148647485e-06, + "loss": 2.7247, + "step": 43709 + }, + { + "epoch": 2.713390030417779, + "grad_norm": 0.16459526683201878, + "learning_rate": 2.7546858917522345e-06, + "loss": 2.736, + "step": 43710 + }, + { + "epoch": 2.713452107517537, + "grad_norm": 0.13138012679683195, + "learning_rate": 2.7535038151299988e-06, + "loss": 2.6927, + "step": 43711 + }, + { + "epoch": 2.7135141846172948, + "grad_norm": 0.13087230590016657, + "learning_rate": 2.7523219850041915e-06, + "loss": 2.6664, + "step": 43712 + }, + { + "epoch": 2.7135762617170527, + "grad_norm": 0.13473281599193726, + "learning_rate": 2.7511404013810028e-06, + "loss": 2.784, + "step": 43713 + }, + { + "epoch": 2.7136383388168106, + "grad_norm": 0.12871964270367148, + "learning_rate": 2.7499590642665774e-06, + "loss": 2.6937, + "step": 43714 + }, + { + "epoch": 2.7137004159165685, + "grad_norm": 0.13435052438104084, + "learning_rate": 2.748777973667094e-06, + "loss": 2.789, + "step": 43715 + }, + { + "epoch": 2.7137624930163264, + "grad_norm": 0.14748569305991646, + "learning_rate": 2.7475971295886915e-06, + "loss": 2.6937, + "step": 43716 + }, + { + "epoch": 2.713824570116084, + "grad_norm": 0.14346857147853798, + "learning_rate": 2.7464165320375603e-06, + "loss": 2.7261, + "step": 43717 + }, + { + "epoch": 2.7138866472158423, + "grad_norm": 0.13185639399307142, + "learning_rate": 2.745236181019839e-06, + "loss": 2.6908, + "step": 43718 + }, + { + "epoch": 2.7139487243155997, + "grad_norm": 0.1493477954891291, + "learning_rate": 2.744056076541696e-06, + "loss": 2.7847, + "step": 43719 + }, + { + "epoch": 2.714010801415358, + "grad_norm": 0.13292267380964623, + "learning_rate": 2.7428762186092805e-06, + "loss": 2.7716, + "step": 43720 + }, + { + "epoch": 2.7140728785151156, + "grad_norm": 0.13356755010895838, + "learning_rate": 2.7416966072287497e-06, + "loss": 2.6983, + "step": 43721 + }, + { + "epoch": 2.714134955614874, + "grad_norm": 0.14405350080876747, + "learning_rate": 2.7405172424062653e-06, + "loss": 2.7937, + "step": 43722 + }, + { + "epoch": 2.7141970327146314, + "grad_norm": 0.13481221568493668, + "learning_rate": 2.739338124147972e-06, + "loss": 2.7144, + "step": 43723 + }, + { + "epoch": 2.7142591098143893, + "grad_norm": 0.14068539964776475, + "learning_rate": 2.7381592524600264e-06, + "loss": 2.7568, + "step": 43724 + }, + { + "epoch": 2.7143211869141473, + "grad_norm": 0.1469789864833907, + "learning_rate": 2.736980627348573e-06, + "loss": 2.6881, + "step": 43725 + }, + { + "epoch": 2.714383264013905, + "grad_norm": 0.1328870760282862, + "learning_rate": 2.7358022488197634e-06, + "loss": 2.6495, + "step": 43726 + }, + { + "epoch": 2.714445341113663, + "grad_norm": 0.14048451452934593, + "learning_rate": 2.7346241168797524e-06, + "loss": 2.712, + "step": 43727 + }, + { + "epoch": 2.714507418213421, + "grad_norm": 0.1394681300043287, + "learning_rate": 2.733446231534681e-06, + "loss": 2.6834, + "step": 43728 + }, + { + "epoch": 2.714569495313179, + "grad_norm": 0.13363877089976098, + "learning_rate": 2.732268592790699e-06, + "loss": 2.6575, + "step": 43729 + }, + { + "epoch": 2.714631572412937, + "grad_norm": 0.13644674354455855, + "learning_rate": 2.7310912006539347e-06, + "loss": 2.6598, + "step": 43730 + }, + { + "epoch": 2.7146936495126948, + "grad_norm": 0.13249849617752457, + "learning_rate": 2.729914055130556e-06, + "loss": 2.7051, + "step": 43731 + }, + { + "epoch": 2.7147557266124527, + "grad_norm": 0.14247503865113129, + "learning_rate": 2.728737156226685e-06, + "loss": 2.723, + "step": 43732 + }, + { + "epoch": 2.7148178037122106, + "grad_norm": 0.13070332061424597, + "learning_rate": 2.7275605039484676e-06, + "loss": 2.7628, + "step": 43733 + }, + { + "epoch": 2.7148798808119685, + "grad_norm": 0.13227180907004268, + "learning_rate": 2.7263840983020485e-06, + "loss": 2.5963, + "step": 43734 + }, + { + "epoch": 2.7149419579117264, + "grad_norm": 0.1483089618292976, + "learning_rate": 2.7252079392935446e-06, + "loss": 2.8063, + "step": 43735 + }, + { + "epoch": 2.7150040350114844, + "grad_norm": 0.13880180933807792, + "learning_rate": 2.7240320269291185e-06, + "loss": 2.686, + "step": 43736 + }, + { + "epoch": 2.7150661121112423, + "grad_norm": 0.15284731365004506, + "learning_rate": 2.7228563612148984e-06, + "loss": 2.7416, + "step": 43737 + }, + { + "epoch": 2.715128189211, + "grad_norm": 0.13550895440815536, + "learning_rate": 2.7216809421570067e-06, + "loss": 2.6924, + "step": 43738 + }, + { + "epoch": 2.715190266310758, + "grad_norm": 0.13420337373544142, + "learning_rate": 2.7205057697615776e-06, + "loss": 2.7061, + "step": 43739 + }, + { + "epoch": 2.715252343410516, + "grad_norm": 0.13491854767929334, + "learning_rate": 2.7193308440347563e-06, + "loss": 2.6984, + "step": 43740 + }, + { + "epoch": 2.715314420510274, + "grad_norm": 0.13547178134144575, + "learning_rate": 2.71815616498266e-06, + "loss": 2.7531, + "step": 43741 + }, + { + "epoch": 2.7153764976100314, + "grad_norm": 0.13347923295115816, + "learning_rate": 2.7169817326114233e-06, + "loss": 2.7054, + "step": 43742 + }, + { + "epoch": 2.7154385747097898, + "grad_norm": 0.14389569140249103, + "learning_rate": 2.715807546927168e-06, + "loss": 2.6384, + "step": 43743 + }, + { + "epoch": 2.7155006518095473, + "grad_norm": 0.13054108698744918, + "learning_rate": 2.7146336079360234e-06, + "loss": 2.7239, + "step": 43744 + }, + { + "epoch": 2.7155627289093056, + "grad_norm": 0.13341992681286924, + "learning_rate": 2.713459915644123e-06, + "loss": 2.6172, + "step": 43745 + }, + { + "epoch": 2.715624806009063, + "grad_norm": 0.15231834713345416, + "learning_rate": 2.7122864700575734e-06, + "loss": 2.7411, + "step": 43746 + }, + { + "epoch": 2.715686883108821, + "grad_norm": 0.14096146306886728, + "learning_rate": 2.7111132711825082e-06, + "loss": 2.6594, + "step": 43747 + }, + { + "epoch": 2.715748960208579, + "grad_norm": 0.144798296295218, + "learning_rate": 2.7099403190250395e-06, + "loss": 2.749, + "step": 43748 + }, + { + "epoch": 2.715811037308337, + "grad_norm": 0.13980312716393278, + "learning_rate": 2.7087676135912954e-06, + "loss": 2.7325, + "step": 43749 + }, + { + "epoch": 2.7158731144080948, + "grad_norm": 0.14120330051226745, + "learning_rate": 2.7075951548873933e-06, + "loss": 2.6948, + "step": 43750 + }, + { + "epoch": 2.7159351915078527, + "grad_norm": 0.1444135005817207, + "learning_rate": 2.7064229429194454e-06, + "loss": 2.7404, + "step": 43751 + }, + { + "epoch": 2.7159972686076106, + "grad_norm": 0.1622692491790589, + "learning_rate": 2.705250977693563e-06, + "loss": 2.773, + "step": 43752 + }, + { + "epoch": 2.7160593457073685, + "grad_norm": 0.14286142448199352, + "learning_rate": 2.7040792592158747e-06, + "loss": 2.747, + "step": 43753 + }, + { + "epoch": 2.7161214228071264, + "grad_norm": 0.13314988255221652, + "learning_rate": 2.7029077874924925e-06, + "loss": 2.7911, + "step": 43754 + }, + { + "epoch": 2.7161834999068843, + "grad_norm": 0.141260525661498, + "learning_rate": 2.7017365625295165e-06, + "loss": 2.6284, + "step": 43755 + }, + { + "epoch": 2.7162455770066423, + "grad_norm": 0.14547128666961617, + "learning_rate": 2.7005655843330535e-06, + "loss": 2.682, + "step": 43756 + }, + { + "epoch": 2.7163076541064, + "grad_norm": 0.13763272100613275, + "learning_rate": 2.6993948529092373e-06, + "loss": 2.7217, + "step": 43757 + }, + { + "epoch": 2.716369731206158, + "grad_norm": 0.14654874658818098, + "learning_rate": 2.6982243682641516e-06, + "loss": 2.713, + "step": 43758 + }, + { + "epoch": 2.716431808305916, + "grad_norm": 0.13097243620223056, + "learning_rate": 2.6970541304039087e-06, + "loss": 2.6434, + "step": 43759 + }, + { + "epoch": 2.716493885405674, + "grad_norm": 0.1449631796632356, + "learning_rate": 2.6958841393346255e-06, + "loss": 2.6667, + "step": 43760 + }, + { + "epoch": 2.716555962505432, + "grad_norm": 0.13359299096232052, + "learning_rate": 2.694714395062403e-06, + "loss": 2.6673, + "step": 43761 + }, + { + "epoch": 2.7166180396051898, + "grad_norm": 0.15209363922599486, + "learning_rate": 2.6935448975933252e-06, + "loss": 2.6381, + "step": 43762 + }, + { + "epoch": 2.7166801167049477, + "grad_norm": 0.1346950276986182, + "learning_rate": 2.692375646933515e-06, + "loss": 2.7004, + "step": 43763 + }, + { + "epoch": 2.7167421938047056, + "grad_norm": 0.1341565091299224, + "learning_rate": 2.6912066430890725e-06, + "loss": 2.7062, + "step": 43764 + }, + { + "epoch": 2.716804270904463, + "grad_norm": 0.1589166070344223, + "learning_rate": 2.6900378860660826e-06, + "loss": 2.6757, + "step": 43765 + }, + { + "epoch": 2.7168663480042214, + "grad_norm": 0.13720223224976325, + "learning_rate": 2.688869375870645e-06, + "loss": 2.7219, + "step": 43766 + }, + { + "epoch": 2.716928425103979, + "grad_norm": 0.14281206013008674, + "learning_rate": 2.687701112508867e-06, + "loss": 2.7255, + "step": 43767 + }, + { + "epoch": 2.7169905022037373, + "grad_norm": 0.14226544727396037, + "learning_rate": 2.6865330959868373e-06, + "loss": 2.7603, + "step": 43768 + }, + { + "epoch": 2.7170525793034948, + "grad_norm": 0.15257368097899307, + "learning_rate": 2.6853653263106514e-06, + "loss": 2.7602, + "step": 43769 + }, + { + "epoch": 2.717114656403253, + "grad_norm": 0.14266315955301465, + "learning_rate": 2.6841978034864047e-06, + "loss": 2.6682, + "step": 43770 + }, + { + "epoch": 2.7171767335030106, + "grad_norm": 0.1365566041468059, + "learning_rate": 2.6830305275201696e-06, + "loss": 2.7099, + "step": 43771 + }, + { + "epoch": 2.7172388106027685, + "grad_norm": 0.13953300186629475, + "learning_rate": 2.6818634984180633e-06, + "loss": 2.7231, + "step": 43772 + }, + { + "epoch": 2.7173008877025264, + "grad_norm": 0.13073213283877025, + "learning_rate": 2.6806967161861596e-06, + "loss": 2.7742, + "step": 43773 + }, + { + "epoch": 2.7173629648022843, + "grad_norm": 0.14990035562097714, + "learning_rate": 2.6795301808305473e-06, + "loss": 2.7666, + "step": 43774 + }, + { + "epoch": 2.7174250419020423, + "grad_norm": 0.13236953691298803, + "learning_rate": 2.6783638923573106e-06, + "loss": 2.7214, + "step": 43775 + }, + { + "epoch": 2.7174871190018, + "grad_norm": 0.1345395527801875, + "learning_rate": 2.6771978507725394e-06, + "loss": 2.7756, + "step": 43776 + }, + { + "epoch": 2.717549196101558, + "grad_norm": 0.14390560000891003, + "learning_rate": 2.6760320560823173e-06, + "loss": 2.6978, + "step": 43777 + }, + { + "epoch": 2.717611273201316, + "grad_norm": 0.1343595842456752, + "learning_rate": 2.6748665082927283e-06, + "loss": 2.7263, + "step": 43778 + }, + { + "epoch": 2.717673350301074, + "grad_norm": 0.12917888593814333, + "learning_rate": 2.6737012074098345e-06, + "loss": 2.6543, + "step": 43779 + }, + { + "epoch": 2.717735427400832, + "grad_norm": 0.13227706153352195, + "learning_rate": 2.672536153439742e-06, + "loss": 2.6384, + "step": 43780 + }, + { + "epoch": 2.7177975045005898, + "grad_norm": 0.1317992618550432, + "learning_rate": 2.6713713463885183e-06, + "loss": 2.6622, + "step": 43781 + }, + { + "epoch": 2.7178595816003477, + "grad_norm": 0.16188888607153418, + "learning_rate": 2.670206786262236e-06, + "loss": 2.6578, + "step": 43782 + }, + { + "epoch": 2.7179216587001056, + "grad_norm": 0.13725161252394733, + "learning_rate": 2.6690424730669796e-06, + "loss": 2.664, + "step": 43783 + }, + { + "epoch": 2.7179837357998635, + "grad_norm": 0.1353034193943267, + "learning_rate": 2.6678784068088103e-06, + "loss": 2.6975, + "step": 43784 + }, + { + "epoch": 2.7180458128996214, + "grad_norm": 0.13288401333274558, + "learning_rate": 2.6667145874938125e-06, + "loss": 2.6929, + "step": 43785 + }, + { + "epoch": 2.7181078899993794, + "grad_norm": 0.14019367509354982, + "learning_rate": 2.6655510151280593e-06, + "loss": 2.7216, + "step": 43786 + }, + { + "epoch": 2.7181699670991373, + "grad_norm": 0.12910288926913005, + "learning_rate": 2.6643876897176122e-06, + "loss": 2.7138, + "step": 43787 + }, + { + "epoch": 2.718232044198895, + "grad_norm": 0.1303748583875163, + "learning_rate": 2.6632246112685444e-06, + "loss": 2.6377, + "step": 43788 + }, + { + "epoch": 2.718294121298653, + "grad_norm": 0.1320999257516869, + "learning_rate": 2.6620617797869286e-06, + "loss": 2.6621, + "step": 43789 + }, + { + "epoch": 2.7183561983984106, + "grad_norm": 0.13021412518653333, + "learning_rate": 2.660899195278832e-06, + "loss": 2.6763, + "step": 43790 + }, + { + "epoch": 2.718418275498169, + "grad_norm": 0.13237842280138548, + "learning_rate": 2.6597368577503055e-06, + "loss": 2.7157, + "step": 43791 + }, + { + "epoch": 2.7184803525979264, + "grad_norm": 0.14038576200465916, + "learning_rate": 2.6585747672074336e-06, + "loss": 2.7189, + "step": 43792 + }, + { + "epoch": 2.718542429697685, + "grad_norm": 0.13574605016907185, + "learning_rate": 2.6574129236562605e-06, + "loss": 2.7346, + "step": 43793 + }, + { + "epoch": 2.7186045067974423, + "grad_norm": 0.15699756721301916, + "learning_rate": 2.6562513271028654e-06, + "loss": 2.6887, + "step": 43794 + }, + { + "epoch": 2.7186665838972, + "grad_norm": 0.13689286861098493, + "learning_rate": 2.655089977553299e-06, + "loss": 2.7604, + "step": 43795 + }, + { + "epoch": 2.718728660996958, + "grad_norm": 0.1345851837378905, + "learning_rate": 2.6539288750136228e-06, + "loss": 2.703, + "step": 43796 + }, + { + "epoch": 2.718790738096716, + "grad_norm": 0.12903710428327084, + "learning_rate": 2.652768019489893e-06, + "loss": 2.7722, + "step": 43797 + }, + { + "epoch": 2.718852815196474, + "grad_norm": 0.1324246999838745, + "learning_rate": 2.6516074109881606e-06, + "loss": 2.7386, + "step": 43798 + }, + { + "epoch": 2.718914892296232, + "grad_norm": 0.13324956643667416, + "learning_rate": 2.650447049514493e-06, + "loss": 2.6557, + "step": 43799 + }, + { + "epoch": 2.7189769693959898, + "grad_norm": 0.13607583789784727, + "learning_rate": 2.649286935074935e-06, + "loss": 2.7225, + "step": 43800 + }, + { + "epoch": 2.7190390464957477, + "grad_norm": 0.1520365405166824, + "learning_rate": 2.6481270676755432e-06, + "loss": 2.7163, + "step": 43801 + }, + { + "epoch": 2.7191011235955056, + "grad_norm": 0.1448619384415371, + "learning_rate": 2.6469674473223625e-06, + "loss": 2.7415, + "step": 43802 + }, + { + "epoch": 2.7191632006952635, + "grad_norm": 0.14150732785254963, + "learning_rate": 2.6458080740214553e-06, + "loss": 2.7304, + "step": 43803 + }, + { + "epoch": 2.7192252777950214, + "grad_norm": 0.13623193172640055, + "learning_rate": 2.6446489477788606e-06, + "loss": 2.6839, + "step": 43804 + }, + { + "epoch": 2.7192873548947794, + "grad_norm": 0.13460750564145063, + "learning_rate": 2.643490068600629e-06, + "loss": 2.6316, + "step": 43805 + }, + { + "epoch": 2.7193494319945373, + "grad_norm": 0.16027695480194987, + "learning_rate": 2.642331436492801e-06, + "loss": 2.6624, + "step": 43806 + }, + { + "epoch": 2.719411509094295, + "grad_norm": 0.13350471853818383, + "learning_rate": 2.6411730514614318e-06, + "loss": 2.6453, + "step": 43807 + }, + { + "epoch": 2.719473586194053, + "grad_norm": 0.13923701121596094, + "learning_rate": 2.640014913512556e-06, + "loss": 2.6791, + "step": 43808 + }, + { + "epoch": 2.719535663293811, + "grad_norm": 0.13195870652485794, + "learning_rate": 2.6388570226522247e-06, + "loss": 2.7314, + "step": 43809 + }, + { + "epoch": 2.719597740393569, + "grad_norm": 0.14516860672184126, + "learning_rate": 2.637699378886471e-06, + "loss": 2.6664, + "step": 43810 + }, + { + "epoch": 2.719659817493327, + "grad_norm": 0.129575272052048, + "learning_rate": 2.63654198222133e-06, + "loss": 2.702, + "step": 43811 + }, + { + "epoch": 2.7197218945930848, + "grad_norm": 0.12850995993375208, + "learning_rate": 2.635384832662852e-06, + "loss": 2.7244, + "step": 43812 + }, + { + "epoch": 2.7197839716928423, + "grad_norm": 0.13348044699312897, + "learning_rate": 2.634227930217076e-06, + "loss": 2.6338, + "step": 43813 + }, + { + "epoch": 2.7198460487926006, + "grad_norm": 0.15251624294453225, + "learning_rate": 2.6330712748900254e-06, + "loss": 2.658, + "step": 43814 + }, + { + "epoch": 2.719908125892358, + "grad_norm": 0.12944046500764267, + "learning_rate": 2.6319148666877345e-06, + "loss": 2.7104, + "step": 43815 + }, + { + "epoch": 2.7199702029921164, + "grad_norm": 0.14336593051650598, + "learning_rate": 2.630758705616254e-06, + "loss": 2.7487, + "step": 43816 + }, + { + "epoch": 2.720032280091874, + "grad_norm": 0.137206522170239, + "learning_rate": 2.6296027916816013e-06, + "loss": 2.7213, + "step": 43817 + }, + { + "epoch": 2.7200943571916323, + "grad_norm": 0.13216477109643898, + "learning_rate": 2.62844712488981e-06, + "loss": 2.7385, + "step": 43818 + }, + { + "epoch": 2.7201564342913898, + "grad_norm": 0.13596996571911996, + "learning_rate": 2.627291705246915e-06, + "loss": 2.8198, + "step": 43819 + }, + { + "epoch": 2.7202185113911477, + "grad_norm": 0.14294234876285733, + "learning_rate": 2.6261365327589272e-06, + "loss": 2.8084, + "step": 43820 + }, + { + "epoch": 2.7202805884909056, + "grad_norm": 0.18755245468419277, + "learning_rate": 2.624981607431892e-06, + "loss": 2.6998, + "step": 43821 + }, + { + "epoch": 2.7203426655906635, + "grad_norm": 0.13892971836079662, + "learning_rate": 2.623826929271833e-06, + "loss": 2.6608, + "step": 43822 + }, + { + "epoch": 2.7204047426904214, + "grad_norm": 0.13300491167397815, + "learning_rate": 2.6226724982847673e-06, + "loss": 2.6905, + "step": 43823 + }, + { + "epoch": 2.7204668197901793, + "grad_norm": 0.13338672396614903, + "learning_rate": 2.621518314476712e-06, + "loss": 2.7022, + "step": 43824 + }, + { + "epoch": 2.7205288968899373, + "grad_norm": 0.13221868128088415, + "learning_rate": 2.620364377853696e-06, + "loss": 2.6296, + "step": 43825 + }, + { + "epoch": 2.720590973989695, + "grad_norm": 0.1301791746551461, + "learning_rate": 2.6192106884217526e-06, + "loss": 2.6882, + "step": 43826 + }, + { + "epoch": 2.720653051089453, + "grad_norm": 0.1321672233176064, + "learning_rate": 2.6180572461868835e-06, + "loss": 2.7192, + "step": 43827 + }, + { + "epoch": 2.720715128189211, + "grad_norm": 0.14259270371407054, + "learning_rate": 2.616904051155117e-06, + "loss": 2.6866, + "step": 43828 + }, + { + "epoch": 2.720777205288969, + "grad_norm": 0.14338026674095697, + "learning_rate": 2.6157511033324534e-06, + "loss": 2.645, + "step": 43829 + }, + { + "epoch": 2.720839282388727, + "grad_norm": 0.14140510743032592, + "learning_rate": 2.6145984027249326e-06, + "loss": 2.7266, + "step": 43830 + }, + { + "epoch": 2.7209013594884848, + "grad_norm": 0.1378118211487293, + "learning_rate": 2.6134459493385497e-06, + "loss": 2.6558, + "step": 43831 + }, + { + "epoch": 2.7209634365882427, + "grad_norm": 0.13982426305142204, + "learning_rate": 2.6122937431793226e-06, + "loss": 2.5702, + "step": 43832 + }, + { + "epoch": 2.7210255136880006, + "grad_norm": 0.13276106174901411, + "learning_rate": 2.611141784253268e-06, + "loss": 2.6883, + "step": 43833 + }, + { + "epoch": 2.7210875907877585, + "grad_norm": 0.14314291408820906, + "learning_rate": 2.609990072566376e-06, + "loss": 2.7855, + "step": 43834 + }, + { + "epoch": 2.7211496678875164, + "grad_norm": 0.13842307861679817, + "learning_rate": 2.608838608124681e-06, + "loss": 2.7191, + "step": 43835 + }, + { + "epoch": 2.721211744987274, + "grad_norm": 0.13449466804966803, + "learning_rate": 2.6076873909341827e-06, + "loss": 2.665, + "step": 43836 + }, + { + "epoch": 2.7212738220870323, + "grad_norm": 0.1383510356933759, + "learning_rate": 2.6065364210008824e-06, + "loss": 2.6768, + "step": 43837 + }, + { + "epoch": 2.7213358991867898, + "grad_norm": 0.1333986558814656, + "learning_rate": 2.6053856983307758e-06, + "loss": 2.6866, + "step": 43838 + }, + { + "epoch": 2.721397976286548, + "grad_norm": 0.13517926494252575, + "learning_rate": 2.6042352229298906e-06, + "loss": 2.8141, + "step": 43839 + }, + { + "epoch": 2.7214600533863056, + "grad_norm": 0.13169932881780214, + "learning_rate": 2.6030849948042113e-06, + "loss": 2.6834, + "step": 43840 + }, + { + "epoch": 2.721522130486064, + "grad_norm": 0.13037409403916855, + "learning_rate": 2.601935013959744e-06, + "loss": 2.785, + "step": 43841 + }, + { + "epoch": 2.7215842075858214, + "grad_norm": 0.13486690447316385, + "learning_rate": 2.6007852804024848e-06, + "loss": 2.6359, + "step": 43842 + }, + { + "epoch": 2.7216462846855793, + "grad_norm": 0.15260724364575612, + "learning_rate": 2.5996357941384387e-06, + "loss": 2.6437, + "step": 43843 + }, + { + "epoch": 2.7217083617853373, + "grad_norm": 0.13803679146070544, + "learning_rate": 2.5984865551736013e-06, + "loss": 2.749, + "step": 43844 + }, + { + "epoch": 2.721770438885095, + "grad_norm": 0.13375542907079688, + "learning_rate": 2.5973375635139627e-06, + "loss": 2.7181, + "step": 43845 + }, + { + "epoch": 2.721832515984853, + "grad_norm": 0.1340228217435171, + "learning_rate": 2.5961888191655293e-06, + "loss": 2.7462, + "step": 43846 + }, + { + "epoch": 2.721894593084611, + "grad_norm": 0.1331175988226401, + "learning_rate": 2.5950403221342734e-06, + "loss": 2.6948, + "step": 43847 + }, + { + "epoch": 2.721956670184369, + "grad_norm": 0.1342379425501897, + "learning_rate": 2.5938920724262074e-06, + "loss": 2.6732, + "step": 43848 + }, + { + "epoch": 2.722018747284127, + "grad_norm": 0.12887726082099893, + "learning_rate": 2.592744070047315e-06, + "loss": 2.7282, + "step": 43849 + }, + { + "epoch": 2.7220808243838848, + "grad_norm": 0.1493819831867818, + "learning_rate": 2.591596315003586e-06, + "loss": 2.7145, + "step": 43850 + }, + { + "epoch": 2.7221429014836427, + "grad_norm": 0.13610937006992738, + "learning_rate": 2.590448807300999e-06, + "loss": 2.6871, + "step": 43851 + }, + { + "epoch": 2.7222049785834006, + "grad_norm": 0.14188730414429726, + "learning_rate": 2.5893015469455605e-06, + "loss": 2.6389, + "step": 43852 + }, + { + "epoch": 2.7222670556831585, + "grad_norm": 0.14784004413481192, + "learning_rate": 2.5881545339432434e-06, + "loss": 2.7467, + "step": 43853 + }, + { + "epoch": 2.7223291327829164, + "grad_norm": 0.1286783009136478, + "learning_rate": 2.5870077683000315e-06, + "loss": 2.7003, + "step": 43854 + }, + { + "epoch": 2.7223912098826744, + "grad_norm": 0.14632808613000162, + "learning_rate": 2.5858612500219148e-06, + "loss": 2.6487, + "step": 43855 + }, + { + "epoch": 2.7224532869824323, + "grad_norm": 0.14839137766192195, + "learning_rate": 2.5847149791148604e-06, + "loss": 2.7034, + "step": 43856 + }, + { + "epoch": 2.72251536408219, + "grad_norm": 0.14181201685826939, + "learning_rate": 2.5835689555848587e-06, + "loss": 2.7007, + "step": 43857 + }, + { + "epoch": 2.722577441181948, + "grad_norm": 0.14658355534197678, + "learning_rate": 2.5824231794378927e-06, + "loss": 2.7479, + "step": 43858 + }, + { + "epoch": 2.722639518281706, + "grad_norm": 0.13400567055116083, + "learning_rate": 2.581277650679942e-06, + "loss": 2.7175, + "step": 43859 + }, + { + "epoch": 2.722701595381464, + "grad_norm": 0.13872473340623284, + "learning_rate": 2.5801323693169734e-06, + "loss": 2.633, + "step": 43860 + }, + { + "epoch": 2.7227636724812214, + "grad_norm": 0.13301551253203014, + "learning_rate": 2.57898733535496e-06, + "loss": 2.7202, + "step": 43861 + }, + { + "epoch": 2.72282574958098, + "grad_norm": 0.14890504911604963, + "learning_rate": 2.5778425487998857e-06, + "loss": 2.6687, + "step": 43862 + }, + { + "epoch": 2.7228878266807373, + "grad_norm": 0.1446135711782422, + "learning_rate": 2.5766980096577187e-06, + "loss": 2.7202, + "step": 43863 + }, + { + "epoch": 2.7229499037804956, + "grad_norm": 0.13866426554473776, + "learning_rate": 2.575553717934431e-06, + "loss": 2.6289, + "step": 43864 + }, + { + "epoch": 2.723011980880253, + "grad_norm": 0.15108529576118387, + "learning_rate": 2.5744096736359914e-06, + "loss": 2.7236, + "step": 43865 + }, + { + "epoch": 2.723074057980011, + "grad_norm": 0.13866683118002138, + "learning_rate": 2.5732658767683714e-06, + "loss": 2.7481, + "step": 43866 + }, + { + "epoch": 2.723136135079769, + "grad_norm": 0.13366180131143307, + "learning_rate": 2.572122327337534e-06, + "loss": 2.6516, + "step": 43867 + }, + { + "epoch": 2.723198212179527, + "grad_norm": 0.13444440679307723, + "learning_rate": 2.5709790253494516e-06, + "loss": 2.8105, + "step": 43868 + }, + { + "epoch": 2.7232602892792848, + "grad_norm": 0.1393938344580184, + "learning_rate": 2.569835970810086e-06, + "loss": 2.7904, + "step": 43869 + }, + { + "epoch": 2.7233223663790427, + "grad_norm": 0.13702476858037022, + "learning_rate": 2.5686931637253943e-06, + "loss": 2.7204, + "step": 43870 + }, + { + "epoch": 2.7233844434788006, + "grad_norm": 0.13003611301458723, + "learning_rate": 2.5675506041013485e-06, + "loss": 2.6984, + "step": 43871 + }, + { + "epoch": 2.7234465205785585, + "grad_norm": 0.13261048295794375, + "learning_rate": 2.5664082919439058e-06, + "loss": 2.6818, + "step": 43872 + }, + { + "epoch": 2.7235085976783164, + "grad_norm": 0.13286432489270067, + "learning_rate": 2.5652662272590276e-06, + "loss": 2.7027, + "step": 43873 + }, + { + "epoch": 2.7235706747780744, + "grad_norm": 0.13883032946749613, + "learning_rate": 2.564124410052665e-06, + "loss": 2.7637, + "step": 43874 + }, + { + "epoch": 2.7236327518778323, + "grad_norm": 0.14004536884531749, + "learning_rate": 2.5629828403307853e-06, + "loss": 2.6877, + "step": 43875 + }, + { + "epoch": 2.72369482897759, + "grad_norm": 0.13284987036085283, + "learning_rate": 2.5618415180993394e-06, + "loss": 2.6431, + "step": 43876 + }, + { + "epoch": 2.723756906077348, + "grad_norm": 0.1436381972950237, + "learning_rate": 2.5607004433642834e-06, + "loss": 2.731, + "step": 43877 + }, + { + "epoch": 2.723818983177106, + "grad_norm": 0.13572752482308534, + "learning_rate": 2.5595596161315625e-06, + "loss": 2.6824, + "step": 43878 + }, + { + "epoch": 2.723881060276864, + "grad_norm": 0.13791646175018754, + "learning_rate": 2.558419036407139e-06, + "loss": 2.6966, + "step": 43879 + }, + { + "epoch": 2.723943137376622, + "grad_norm": 0.13509058818165837, + "learning_rate": 2.5572787041969694e-06, + "loss": 2.7487, + "step": 43880 + }, + { + "epoch": 2.7240052144763798, + "grad_norm": 0.1490114309691284, + "learning_rate": 2.556138619506987e-06, + "loss": 2.7107, + "step": 43881 + }, + { + "epoch": 2.7240672915761377, + "grad_norm": 0.1778470300657843, + "learning_rate": 2.554998782343149e-06, + "loss": 2.7768, + "step": 43882 + }, + { + "epoch": 2.7241293686758956, + "grad_norm": 0.14552093933086135, + "learning_rate": 2.5538591927113887e-06, + "loss": 2.691, + "step": 43883 + }, + { + "epoch": 2.724191445775653, + "grad_norm": 0.13880557344520736, + "learning_rate": 2.552719850617674e-06, + "loss": 2.6901, + "step": 43884 + }, + { + "epoch": 2.7242535228754114, + "grad_norm": 0.13412256510293996, + "learning_rate": 2.5515807560679396e-06, + "loss": 2.6957, + "step": 43885 + }, + { + "epoch": 2.724315599975169, + "grad_norm": 0.13263210499254535, + "learning_rate": 2.5504419090681244e-06, + "loss": 2.6551, + "step": 43886 + }, + { + "epoch": 2.7243776770749273, + "grad_norm": 0.14593477779602324, + "learning_rate": 2.5493033096241624e-06, + "loss": 2.7404, + "step": 43887 + }, + { + "epoch": 2.7244397541746848, + "grad_norm": 0.14135470741841336, + "learning_rate": 2.5481649577420164e-06, + "loss": 2.6731, + "step": 43888 + }, + { + "epoch": 2.724501831274443, + "grad_norm": 0.1306942638282404, + "learning_rate": 2.5470268534276142e-06, + "loss": 2.8032, + "step": 43889 + }, + { + "epoch": 2.7245639083742006, + "grad_norm": 0.13687368877862793, + "learning_rate": 2.5458889966868793e-06, + "loss": 2.6241, + "step": 43890 + }, + { + "epoch": 2.7246259854739585, + "grad_norm": 0.14378666461996062, + "learning_rate": 2.5447513875257734e-06, + "loss": 2.7181, + "step": 43891 + }, + { + "epoch": 2.7246880625737164, + "grad_norm": 0.15325067846193152, + "learning_rate": 2.5436140259502084e-06, + "loss": 2.7116, + "step": 43892 + }, + { + "epoch": 2.7247501396734743, + "grad_norm": 0.15586061238153828, + "learning_rate": 2.5424769119661406e-06, + "loss": 2.6759, + "step": 43893 + }, + { + "epoch": 2.7248122167732323, + "grad_norm": 0.13782609315686392, + "learning_rate": 2.5413400455794934e-06, + "loss": 2.623, + "step": 43894 + }, + { + "epoch": 2.72487429387299, + "grad_norm": 0.12989038430995478, + "learning_rate": 2.5402034267961896e-06, + "loss": 2.6279, + "step": 43895 + }, + { + "epoch": 2.724936370972748, + "grad_norm": 0.13225295199333809, + "learning_rate": 2.5390670556221686e-06, + "loss": 2.738, + "step": 43896 + }, + { + "epoch": 2.724998448072506, + "grad_norm": 0.13171161158259276, + "learning_rate": 2.537930932063348e-06, + "loss": 2.7577, + "step": 43897 + }, + { + "epoch": 2.725060525172264, + "grad_norm": 0.13169918738609052, + "learning_rate": 2.5367950561256737e-06, + "loss": 2.6959, + "step": 43898 + }, + { + "epoch": 2.725122602272022, + "grad_norm": 0.14358854219002468, + "learning_rate": 2.5356594278150626e-06, + "loss": 2.7081, + "step": 43899 + }, + { + "epoch": 2.7251846793717798, + "grad_norm": 0.1312380336801491, + "learning_rate": 2.5345240471374376e-06, + "loss": 2.672, + "step": 43900 + }, + { + "epoch": 2.7252467564715377, + "grad_norm": 0.1552906146834824, + "learning_rate": 2.533388914098711e-06, + "loss": 2.6755, + "step": 43901 + }, + { + "epoch": 2.7253088335712956, + "grad_norm": 0.1276990179537746, + "learning_rate": 2.532254028704828e-06, + "loss": 2.6646, + "step": 43902 + }, + { + "epoch": 2.7253709106710535, + "grad_norm": 0.13642459287241898, + "learning_rate": 2.5311193909617005e-06, + "loss": 2.8274, + "step": 43903 + }, + { + "epoch": 2.7254329877708114, + "grad_norm": 0.13454606650280895, + "learning_rate": 2.5299850008752456e-06, + "loss": 2.6888, + "step": 43904 + }, + { + "epoch": 2.7254950648705694, + "grad_norm": 0.13069548236234574, + "learning_rate": 2.5288508584513816e-06, + "loss": 2.6874, + "step": 43905 + }, + { + "epoch": 2.7255571419703273, + "grad_norm": 0.14182579439203988, + "learning_rate": 2.5277169636960194e-06, + "loss": 2.6487, + "step": 43906 + }, + { + "epoch": 2.725619219070085, + "grad_norm": 0.15668601355933662, + "learning_rate": 2.5265833166150943e-06, + "loss": 2.7006, + "step": 43907 + }, + { + "epoch": 2.725681296169843, + "grad_norm": 0.1314196265348837, + "learning_rate": 2.5254499172145006e-06, + "loss": 2.6753, + "step": 43908 + }, + { + "epoch": 2.7257433732696006, + "grad_norm": 0.13685198150499192, + "learning_rate": 2.5243167655001677e-06, + "loss": 2.6599, + "step": 43909 + }, + { + "epoch": 2.725805450369359, + "grad_norm": 0.1453103511405308, + "learning_rate": 2.5231838614779846e-06, + "loss": 2.6926, + "step": 43910 + }, + { + "epoch": 2.7258675274691164, + "grad_norm": 0.1404360256435993, + "learning_rate": 2.522051205153886e-06, + "loss": 2.781, + "step": 43911 + }, + { + "epoch": 2.725929604568875, + "grad_norm": 0.13887149922132871, + "learning_rate": 2.5209187965337776e-06, + "loss": 2.6857, + "step": 43912 + }, + { + "epoch": 2.7259916816686323, + "grad_norm": 0.1403162932148705, + "learning_rate": 2.5197866356235555e-06, + "loss": 2.7224, + "step": 43913 + }, + { + "epoch": 2.72605375876839, + "grad_norm": 0.15887502757401606, + "learning_rate": 2.5186547224291255e-06, + "loss": 2.7358, + "step": 43914 + }, + { + "epoch": 2.726115835868148, + "grad_norm": 0.131412929496691, + "learning_rate": 2.5175230569564114e-06, + "loss": 2.6706, + "step": 43915 + }, + { + "epoch": 2.726177912967906, + "grad_norm": 0.13569652695062676, + "learning_rate": 2.5163916392113017e-06, + "loss": 2.5828, + "step": 43916 + }, + { + "epoch": 2.726239990067664, + "grad_norm": 0.13039279428532335, + "learning_rate": 2.5152604691997096e-06, + "loss": 2.7336, + "step": 43917 + }, + { + "epoch": 2.726302067167422, + "grad_norm": 0.14668882725021315, + "learning_rate": 2.5141295469275295e-06, + "loss": 2.6506, + "step": 43918 + }, + { + "epoch": 2.7263641442671798, + "grad_norm": 0.14046081919447378, + "learning_rate": 2.5129988724006513e-06, + "loss": 2.7509, + "step": 43919 + }, + { + "epoch": 2.7264262213669377, + "grad_norm": 0.13171516113776993, + "learning_rate": 2.5118684456249987e-06, + "loss": 2.7705, + "step": 43920 + }, + { + "epoch": 2.7264882984666956, + "grad_norm": 0.1299879107791695, + "learning_rate": 2.5107382666064494e-06, + "loss": 2.7147, + "step": 43921 + }, + { + "epoch": 2.7265503755664535, + "grad_norm": 0.1522692779545172, + "learning_rate": 2.509608335350905e-06, + "loss": 2.6498, + "step": 43922 + }, + { + "epoch": 2.7266124526662114, + "grad_norm": 0.13243904621343208, + "learning_rate": 2.5084786518642713e-06, + "loss": 2.5798, + "step": 43923 + }, + { + "epoch": 2.7266745297659694, + "grad_norm": 0.13941811862174477, + "learning_rate": 2.5073492161524214e-06, + "loss": 2.6541, + "step": 43924 + }, + { + "epoch": 2.7267366068657273, + "grad_norm": 0.13539652414782036, + "learning_rate": 2.5062200282212734e-06, + "loss": 2.7423, + "step": 43925 + }, + { + "epoch": 2.726798683965485, + "grad_norm": 0.13502514770515978, + "learning_rate": 2.5050910880767e-06, + "loss": 2.7034, + "step": 43926 + }, + { + "epoch": 2.726860761065243, + "grad_norm": 0.15503466138794625, + "learning_rate": 2.503962395724596e-06, + "loss": 2.6609, + "step": 43927 + }, + { + "epoch": 2.726922838165001, + "grad_norm": 0.13560059936606889, + "learning_rate": 2.5028339511708466e-06, + "loss": 2.7446, + "step": 43928 + }, + { + "epoch": 2.726984915264759, + "grad_norm": 0.13835173577705953, + "learning_rate": 2.5017057544213464e-06, + "loss": 2.7427, + "step": 43929 + }, + { + "epoch": 2.727046992364517, + "grad_norm": 0.13263324954894568, + "learning_rate": 2.5005778054819796e-06, + "loss": 2.664, + "step": 43930 + }, + { + "epoch": 2.727109069464275, + "grad_norm": 0.1495173913538609, + "learning_rate": 2.4994501043586306e-06, + "loss": 2.6757, + "step": 43931 + }, + { + "epoch": 2.7271711465640323, + "grad_norm": 0.13321555910219582, + "learning_rate": 2.4983226510571777e-06, + "loss": 2.7006, + "step": 43932 + }, + { + "epoch": 2.7272332236637906, + "grad_norm": 0.14333215969690094, + "learning_rate": 2.4971954455835056e-06, + "loss": 2.616, + "step": 43933 + }, + { + "epoch": 2.727295300763548, + "grad_norm": 0.14763165375042375, + "learning_rate": 2.496068487943498e-06, + "loss": 2.6946, + "step": 43934 + }, + { + "epoch": 2.7273573778633065, + "grad_norm": 0.13539648975540525, + "learning_rate": 2.494941778143034e-06, + "loss": 2.6199, + "step": 43935 + }, + { + "epoch": 2.727419454963064, + "grad_norm": 0.13488445709622307, + "learning_rate": 2.4938153161879975e-06, + "loss": 2.6786, + "step": 43936 + }, + { + "epoch": 2.7274815320628223, + "grad_norm": 0.14625411692147422, + "learning_rate": 2.4926891020842446e-06, + "loss": 2.738, + "step": 43937 + }, + { + "epoch": 2.7275436091625798, + "grad_norm": 0.13197150148183182, + "learning_rate": 2.491563135837677e-06, + "loss": 2.657, + "step": 43938 + }, + { + "epoch": 2.7276056862623377, + "grad_norm": 0.14253760114669722, + "learning_rate": 2.49043741745415e-06, + "loss": 2.7438, + "step": 43939 + }, + { + "epoch": 2.7276677633620956, + "grad_norm": 0.128567438203238, + "learning_rate": 2.4893119469395543e-06, + "loss": 2.7337, + "step": 43940 + }, + { + "epoch": 2.7277298404618535, + "grad_norm": 0.1300741736353173, + "learning_rate": 2.4881867242997458e-06, + "loss": 2.5872, + "step": 43941 + }, + { + "epoch": 2.7277919175616114, + "grad_norm": 0.1396750444794518, + "learning_rate": 2.4870617495405924e-06, + "loss": 2.7211, + "step": 43942 + }, + { + "epoch": 2.7278539946613694, + "grad_norm": 0.14157971180665563, + "learning_rate": 2.4859370226679833e-06, + "loss": 2.5864, + "step": 43943 + }, + { + "epoch": 2.7279160717611273, + "grad_norm": 0.13589867141453502, + "learning_rate": 2.4848125436877757e-06, + "loss": 2.6957, + "step": 43944 + }, + { + "epoch": 2.727978148860885, + "grad_norm": 0.13186073776383497, + "learning_rate": 2.4836883126058365e-06, + "loss": 2.667, + "step": 43945 + }, + { + "epoch": 2.728040225960643, + "grad_norm": 0.13392893586790364, + "learning_rate": 2.4825643294280166e-06, + "loss": 2.7997, + "step": 43946 + }, + { + "epoch": 2.728102303060401, + "grad_norm": 0.1404023129199663, + "learning_rate": 2.481440594160206e-06, + "loss": 2.6859, + "step": 43947 + }, + { + "epoch": 2.728164380160159, + "grad_norm": 0.13007550537633294, + "learning_rate": 2.480317106808255e-06, + "loss": 2.6485, + "step": 43948 + }, + { + "epoch": 2.728226457259917, + "grad_norm": 0.13105706327874492, + "learning_rate": 2.4791938673780268e-06, + "loss": 2.7479, + "step": 43949 + }, + { + "epoch": 2.7282885343596748, + "grad_norm": 0.15517981723844698, + "learning_rate": 2.4780708758753712e-06, + "loss": 2.6687, + "step": 43950 + }, + { + "epoch": 2.7283506114594327, + "grad_norm": 0.13322315121457942, + "learning_rate": 2.476948132306162e-06, + "loss": 2.7341, + "step": 43951 + }, + { + "epoch": 2.7284126885591906, + "grad_norm": 0.13400542730533727, + "learning_rate": 2.4758256366762545e-06, + "loss": 2.7409, + "step": 43952 + }, + { + "epoch": 2.7284747656589485, + "grad_norm": 0.13217541115180786, + "learning_rate": 2.474703388991495e-06, + "loss": 2.682, + "step": 43953 + }, + { + "epoch": 2.7285368427587064, + "grad_norm": 0.14477827895887332, + "learning_rate": 2.473581389257751e-06, + "loss": 2.6381, + "step": 43954 + }, + { + "epoch": 2.7285989198584644, + "grad_norm": 0.14115012324829815, + "learning_rate": 2.472459637480862e-06, + "loss": 2.6272, + "step": 43955 + }, + { + "epoch": 2.7286609969582223, + "grad_norm": 0.13971420553380898, + "learning_rate": 2.4713381336666896e-06, + "loss": 2.6302, + "step": 43956 + }, + { + "epoch": 2.7287230740579798, + "grad_norm": 0.1328458253338971, + "learning_rate": 2.470216877821091e-06, + "loss": 2.6077, + "step": 43957 + }, + { + "epoch": 2.728785151157738, + "grad_norm": 0.13373397472672452, + "learning_rate": 2.4690958699499055e-06, + "loss": 2.6133, + "step": 43958 + }, + { + "epoch": 2.7288472282574956, + "grad_norm": 0.13177789894078643, + "learning_rate": 2.46797511005899e-06, + "loss": 2.7436, + "step": 43959 + }, + { + "epoch": 2.728909305357254, + "grad_norm": 0.1328690702908662, + "learning_rate": 2.466854598154178e-06, + "loss": 2.7912, + "step": 43960 + }, + { + "epoch": 2.7289713824570114, + "grad_norm": 0.1485641591625782, + "learning_rate": 2.4657343342413317e-06, + "loss": 2.7568, + "step": 43961 + }, + { + "epoch": 2.7290334595567693, + "grad_norm": 0.1393885561846508, + "learning_rate": 2.4646143183262916e-06, + "loss": 2.7458, + "step": 43962 + }, + { + "epoch": 2.7290955366565273, + "grad_norm": 0.1336910557097917, + "learning_rate": 2.4634945504148964e-06, + "loss": 2.7258, + "step": 43963 + }, + { + "epoch": 2.729157613756285, + "grad_norm": 0.16236297465139088, + "learning_rate": 2.462375030512981e-06, + "loss": 2.7042, + "step": 43964 + }, + { + "epoch": 2.729219690856043, + "grad_norm": 0.12829952071545234, + "learning_rate": 2.461255758626402e-06, + "loss": 2.7232, + "step": 43965 + }, + { + "epoch": 2.729281767955801, + "grad_norm": 0.13219718184084173, + "learning_rate": 2.4601367347609926e-06, + "loss": 2.7148, + "step": 43966 + }, + { + "epoch": 2.729343845055559, + "grad_norm": 0.13854140873815673, + "learning_rate": 2.4590179589225936e-06, + "loss": 2.6008, + "step": 43967 + }, + { + "epoch": 2.729405922155317, + "grad_norm": 0.12919677882006284, + "learning_rate": 2.4578994311170387e-06, + "loss": 2.7326, + "step": 43968 + }, + { + "epoch": 2.7294679992550748, + "grad_norm": 0.1380169878429945, + "learning_rate": 2.4567811513501514e-06, + "loss": 2.7007, + "step": 43969 + }, + { + "epoch": 2.7295300763548327, + "grad_norm": 0.14346568274329913, + "learning_rate": 2.4556631196277935e-06, + "loss": 2.7207, + "step": 43970 + }, + { + "epoch": 2.7295921534545906, + "grad_norm": 0.15141023823511565, + "learning_rate": 2.454545335955777e-06, + "loss": 2.7064, + "step": 43971 + }, + { + "epoch": 2.7296542305543485, + "grad_norm": 0.13144926657546455, + "learning_rate": 2.453427800339941e-06, + "loss": 2.6487, + "step": 43972 + }, + { + "epoch": 2.7297163076541064, + "grad_norm": 0.1310047935871871, + "learning_rate": 2.4523105127861044e-06, + "loss": 2.6823, + "step": 43973 + }, + { + "epoch": 2.7297783847538644, + "grad_norm": 0.1465008408510925, + "learning_rate": 2.451193473300117e-06, + "loss": 2.6845, + "step": 43974 + }, + { + "epoch": 2.7298404618536223, + "grad_norm": 0.13614350965112917, + "learning_rate": 2.4500766818877973e-06, + "loss": 2.6636, + "step": 43975 + }, + { + "epoch": 2.72990253895338, + "grad_norm": 0.141512823251706, + "learning_rate": 2.4489601385549677e-06, + "loss": 2.6293, + "step": 43976 + }, + { + "epoch": 2.729964616053138, + "grad_norm": 0.13854712933213378, + "learning_rate": 2.4478438433074514e-06, + "loss": 2.626, + "step": 43977 + }, + { + "epoch": 2.730026693152896, + "grad_norm": 0.13843804780192953, + "learning_rate": 2.4467277961510826e-06, + "loss": 2.7292, + "step": 43978 + }, + { + "epoch": 2.730088770252654, + "grad_norm": 0.14104195214298532, + "learning_rate": 2.4456119970916847e-06, + "loss": 2.665, + "step": 43979 + }, + { + "epoch": 2.7301508473524114, + "grad_norm": 0.13452794358934356, + "learning_rate": 2.4444964461350637e-06, + "loss": 2.7126, + "step": 43980 + }, + { + "epoch": 2.73021292445217, + "grad_norm": 0.14483477321860308, + "learning_rate": 2.4433811432870545e-06, + "loss": 2.6977, + "step": 43981 + }, + { + "epoch": 2.7302750015519273, + "grad_norm": 0.13209569560175552, + "learning_rate": 2.4422660885534633e-06, + "loss": 2.7101, + "step": 43982 + }, + { + "epoch": 2.7303370786516856, + "grad_norm": 0.13252341138968435, + "learning_rate": 2.4411512819401238e-06, + "loss": 2.8348, + "step": 43983 + }, + { + "epoch": 2.730399155751443, + "grad_norm": 0.13677710922462963, + "learning_rate": 2.440036723452843e-06, + "loss": 2.6414, + "step": 43984 + }, + { + "epoch": 2.7304612328512015, + "grad_norm": 0.13038120873462786, + "learning_rate": 2.438922413097433e-06, + "loss": 2.698, + "step": 43985 + }, + { + "epoch": 2.730523309950959, + "grad_norm": 0.13620679232583424, + "learning_rate": 2.4378083508797057e-06, + "loss": 2.7295, + "step": 43986 + }, + { + "epoch": 2.730585387050717, + "grad_norm": 0.1422187574319957, + "learning_rate": 2.436694536805484e-06, + "loss": 2.6199, + "step": 43987 + }, + { + "epoch": 2.7306474641504748, + "grad_norm": 0.14110021299681344, + "learning_rate": 2.435580970880569e-06, + "loss": 2.6998, + "step": 43988 + }, + { + "epoch": 2.7307095412502327, + "grad_norm": 0.130157946533192, + "learning_rate": 2.434467653110778e-06, + "loss": 2.5833, + "step": 43989 + }, + { + "epoch": 2.7307716183499906, + "grad_norm": 0.13800849874430038, + "learning_rate": 2.433354583501918e-06, + "loss": 2.6567, + "step": 43990 + }, + { + "epoch": 2.7308336954497485, + "grad_norm": 0.1331989192578358, + "learning_rate": 2.432241762059795e-06, + "loss": 2.6208, + "step": 43991 + }, + { + "epoch": 2.7308957725495064, + "grad_norm": 0.13054054477760324, + "learning_rate": 2.43112918879021e-06, + "loss": 2.648, + "step": 43992 + }, + { + "epoch": 2.7309578496492644, + "grad_norm": 0.15713560541736096, + "learning_rate": 2.430016863698975e-06, + "loss": 2.7213, + "step": 43993 + }, + { + "epoch": 2.7310199267490223, + "grad_norm": 0.1359231483622595, + "learning_rate": 2.4289047867918915e-06, + "loss": 2.6595, + "step": 43994 + }, + { + "epoch": 2.73108200384878, + "grad_norm": 0.15701465068214573, + "learning_rate": 2.427792958074754e-06, + "loss": 2.7157, + "step": 43995 + }, + { + "epoch": 2.731144080948538, + "grad_norm": 0.13220950992902034, + "learning_rate": 2.4266813775533693e-06, + "loss": 2.6781, + "step": 43996 + }, + { + "epoch": 2.731206158048296, + "grad_norm": 0.13065463746295414, + "learning_rate": 2.425570045233544e-06, + "loss": 2.6789, + "step": 43997 + }, + { + "epoch": 2.731268235148054, + "grad_norm": 0.1334273956167213, + "learning_rate": 2.424458961121062e-06, + "loss": 2.7332, + "step": 43998 + }, + { + "epoch": 2.731330312247812, + "grad_norm": 0.15880119047500904, + "learning_rate": 2.423348125221736e-06, + "loss": 2.6296, + "step": 43999 + }, + { + "epoch": 2.73139238934757, + "grad_norm": 0.13199511208388917, + "learning_rate": 2.4222375375413387e-06, + "loss": 2.5902, + "step": 44000 + }, + { + "epoch": 2.7314544664473277, + "grad_norm": 0.13615894148965677, + "learning_rate": 2.4211271980856877e-06, + "loss": 2.6979, + "step": 44001 + }, + { + "epoch": 2.7315165435470856, + "grad_norm": 0.12795262458704348, + "learning_rate": 2.420017106860567e-06, + "loss": 2.7199, + "step": 44002 + }, + { + "epoch": 2.7315786206468435, + "grad_norm": 0.1549253641005918, + "learning_rate": 2.4189072638717668e-06, + "loss": 2.708, + "step": 44003 + }, + { + "epoch": 2.7316406977466015, + "grad_norm": 0.14986483822305485, + "learning_rate": 2.417797669125077e-06, + "loss": 2.6879, + "step": 44004 + }, + { + "epoch": 2.731702774846359, + "grad_norm": 0.14453041231067287, + "learning_rate": 2.4166883226262816e-06, + "loss": 2.6956, + "step": 44005 + }, + { + "epoch": 2.7317648519461173, + "grad_norm": 0.1295728785916542, + "learning_rate": 2.4155792243811816e-06, + "loss": 2.6141, + "step": 44006 + }, + { + "epoch": 2.7318269290458748, + "grad_norm": 0.13839115688931983, + "learning_rate": 2.4144703743955553e-06, + "loss": 2.733, + "step": 44007 + }, + { + "epoch": 2.731889006145633, + "grad_norm": 0.14726107360120538, + "learning_rate": 2.4133617726751876e-06, + "loss": 2.6787, + "step": 44008 + }, + { + "epoch": 2.7319510832453906, + "grad_norm": 0.134136445075325, + "learning_rate": 2.412253419225863e-06, + "loss": 2.7659, + "step": 44009 + }, + { + "epoch": 2.7320131603451485, + "grad_norm": 0.13217143708564755, + "learning_rate": 2.4111453140533647e-06, + "loss": 2.676, + "step": 44010 + }, + { + "epoch": 2.7320752374449064, + "grad_norm": 0.13927776653828508, + "learning_rate": 2.410037457163478e-06, + "loss": 2.7348, + "step": 44011 + }, + { + "epoch": 2.7321373145446644, + "grad_norm": 0.13087986315998065, + "learning_rate": 2.408929848561975e-06, + "loss": 2.7148, + "step": 44012 + }, + { + "epoch": 2.7321993916444223, + "grad_norm": 0.13063889761509195, + "learning_rate": 2.4078224882546354e-06, + "loss": 2.6674, + "step": 44013 + }, + { + "epoch": 2.73226146874418, + "grad_norm": 0.13639147278619126, + "learning_rate": 2.4067153762472426e-06, + "loss": 2.6249, + "step": 44014 + }, + { + "epoch": 2.732323545843938, + "grad_norm": 0.1324410995662116, + "learning_rate": 2.405608512545571e-06, + "loss": 2.7578, + "step": 44015 + }, + { + "epoch": 2.732385622943696, + "grad_norm": 0.1272103711074452, + "learning_rate": 2.4045018971553924e-06, + "loss": 2.7013, + "step": 44016 + }, + { + "epoch": 2.732447700043454, + "grad_norm": 0.13252168258686126, + "learning_rate": 2.403395530082486e-06, + "loss": 2.8062, + "step": 44017 + }, + { + "epoch": 2.732509777143212, + "grad_norm": 0.13152295861696595, + "learning_rate": 2.4022894113326087e-06, + "loss": 2.6613, + "step": 44018 + }, + { + "epoch": 2.7325718542429698, + "grad_norm": 0.14173375897698928, + "learning_rate": 2.40118354091155e-06, + "loss": 2.7503, + "step": 44019 + }, + { + "epoch": 2.7326339313427277, + "grad_norm": 0.13338514599691342, + "learning_rate": 2.400077918825072e-06, + "loss": 2.656, + "step": 44020 + }, + { + "epoch": 2.7326960084424856, + "grad_norm": 0.1345210620575799, + "learning_rate": 2.3989725450789313e-06, + "loss": 2.7527, + "step": 44021 + }, + { + "epoch": 2.7327580855422435, + "grad_norm": 0.1366641684980815, + "learning_rate": 2.397867419678923e-06, + "loss": 2.7917, + "step": 44022 + }, + { + "epoch": 2.7328201626420014, + "grad_norm": 0.14116394559021578, + "learning_rate": 2.3967625426307816e-06, + "loss": 2.7124, + "step": 44023 + }, + { + "epoch": 2.7328822397417594, + "grad_norm": 0.1305934353044401, + "learning_rate": 2.3956579139402967e-06, + "loss": 2.6236, + "step": 44024 + }, + { + "epoch": 2.7329443168415173, + "grad_norm": 0.14169096262124659, + "learning_rate": 2.3945535336132195e-06, + "loss": 2.7606, + "step": 44025 + }, + { + "epoch": 2.733006393941275, + "grad_norm": 0.1574084905703864, + "learning_rate": 2.3934494016553123e-06, + "loss": 2.7309, + "step": 44026 + }, + { + "epoch": 2.733068471041033, + "grad_norm": 0.13904073844971743, + "learning_rate": 2.392345518072331e-06, + "loss": 2.7213, + "step": 44027 + }, + { + "epoch": 2.7331305481407906, + "grad_norm": 0.1311718569034748, + "learning_rate": 2.391241882870043e-06, + "loss": 2.717, + "step": 44028 + }, + { + "epoch": 2.733192625240549, + "grad_norm": 0.1326838108296474, + "learning_rate": 2.390138496054212e-06, + "loss": 2.7464, + "step": 44029 + }, + { + "epoch": 2.7332547023403064, + "grad_norm": 0.13082063870234126, + "learning_rate": 2.3890353576305755e-06, + "loss": 2.6697, + "step": 44030 + }, + { + "epoch": 2.733316779440065, + "grad_norm": 0.14982546472012703, + "learning_rate": 2.3879324676049088e-06, + "loss": 2.8173, + "step": 44031 + }, + { + "epoch": 2.7333788565398223, + "grad_norm": 0.13705273433716345, + "learning_rate": 2.3868298259829447e-06, + "loss": 2.7904, + "step": 44032 + }, + { + "epoch": 2.7334409336395806, + "grad_norm": 0.14640722150286903, + "learning_rate": 2.3857274327704572e-06, + "loss": 2.7274, + "step": 44033 + }, + { + "epoch": 2.733503010739338, + "grad_norm": 0.12926577513842935, + "learning_rate": 2.3846252879731857e-06, + "loss": 2.7061, + "step": 44034 + }, + { + "epoch": 2.733565087839096, + "grad_norm": 0.13402447559173689, + "learning_rate": 2.3835233915968867e-06, + "loss": 2.7281, + "step": 44035 + }, + { + "epoch": 2.733627164938854, + "grad_norm": 0.13673784258907917, + "learning_rate": 2.382421743647295e-06, + "loss": 2.5831, + "step": 44036 + }, + { + "epoch": 2.733689242038612, + "grad_norm": 0.1349708197435654, + "learning_rate": 2.381320344130178e-06, + "loss": 2.7258, + "step": 44037 + }, + { + "epoch": 2.7337513191383698, + "grad_norm": 0.1382165269731132, + "learning_rate": 2.3802191930512753e-06, + "loss": 2.7814, + "step": 44038 + }, + { + "epoch": 2.7338133962381277, + "grad_norm": 0.14103089801998833, + "learning_rate": 2.3791182904163267e-06, + "loss": 2.7169, + "step": 44039 + }, + { + "epoch": 2.7338754733378856, + "grad_norm": 0.13120356885156473, + "learning_rate": 2.3780176362310722e-06, + "loss": 2.6824, + "step": 44040 + }, + { + "epoch": 2.7339375504376435, + "grad_norm": 0.150291732376496, + "learning_rate": 2.376917230501263e-06, + "loss": 2.7569, + "step": 44041 + }, + { + "epoch": 2.7339996275374014, + "grad_norm": 0.13990276359506823, + "learning_rate": 2.375817073232639e-06, + "loss": 2.71, + "step": 44042 + }, + { + "epoch": 2.7340617046371594, + "grad_norm": 0.13978972989965913, + "learning_rate": 2.3747171644309398e-06, + "loss": 2.6484, + "step": 44043 + }, + { + "epoch": 2.7341237817369173, + "grad_norm": 0.14436696355300593, + "learning_rate": 2.3736175041018993e-06, + "loss": 2.7155, + "step": 44044 + }, + { + "epoch": 2.734185858836675, + "grad_norm": 0.14709663864729103, + "learning_rate": 2.372518092251258e-06, + "loss": 2.728, + "step": 44045 + }, + { + "epoch": 2.734247935936433, + "grad_norm": 0.13562315243020712, + "learning_rate": 2.3714189288847555e-06, + "loss": 2.6388, + "step": 44046 + }, + { + "epoch": 2.734310013036191, + "grad_norm": 0.13165133939205495, + "learning_rate": 2.3703200140081205e-06, + "loss": 2.6974, + "step": 44047 + }, + { + "epoch": 2.734372090135949, + "grad_norm": 0.14487484094160102, + "learning_rate": 2.3692213476270874e-06, + "loss": 2.6412, + "step": 44048 + }, + { + "epoch": 2.734434167235707, + "grad_norm": 0.13543503129361267, + "learning_rate": 2.368122929747385e-06, + "loss": 2.6692, + "step": 44049 + }, + { + "epoch": 2.734496244335465, + "grad_norm": 0.1311661128629645, + "learning_rate": 2.3670247603747587e-06, + "loss": 2.6821, + "step": 44050 + }, + { + "epoch": 2.7345583214352227, + "grad_norm": 0.137702082744841, + "learning_rate": 2.365926839514926e-06, + "loss": 2.763, + "step": 44051 + }, + { + "epoch": 2.7346203985349806, + "grad_norm": 0.1400309040241526, + "learning_rate": 2.3648291671736102e-06, + "loss": 2.6953, + "step": 44052 + }, + { + "epoch": 2.734682475634738, + "grad_norm": 0.13004535170541473, + "learning_rate": 2.3637317433565454e-06, + "loss": 2.6032, + "step": 44053 + }, + { + "epoch": 2.7347445527344965, + "grad_norm": 0.1381484954256578, + "learning_rate": 2.3626345680694606e-06, + "loss": 2.7196, + "step": 44054 + }, + { + "epoch": 2.734806629834254, + "grad_norm": 0.14007710641042775, + "learning_rate": 2.361537641318068e-06, + "loss": 2.8298, + "step": 44055 + }, + { + "epoch": 2.7348687069340123, + "grad_norm": 0.13558011018566463, + "learning_rate": 2.3604409631081017e-06, + "loss": 2.7619, + "step": 44056 + }, + { + "epoch": 2.7349307840337698, + "grad_norm": 0.1336813792668461, + "learning_rate": 2.3593445334452845e-06, + "loss": 2.7476, + "step": 44057 + }, + { + "epoch": 2.7349928611335277, + "grad_norm": 0.13503337605371218, + "learning_rate": 2.3582483523353295e-06, + "loss": 2.7167, + "step": 44058 + }, + { + "epoch": 2.7350549382332856, + "grad_norm": 0.1460998574678534, + "learning_rate": 2.3571524197839535e-06, + "loss": 2.6932, + "step": 44059 + }, + { + "epoch": 2.7351170153330435, + "grad_norm": 0.1296176713791091, + "learning_rate": 2.3560567357968853e-06, + "loss": 2.8095, + "step": 44060 + }, + { + "epoch": 2.7351790924328014, + "grad_norm": 0.13351612373334795, + "learning_rate": 2.354961300379832e-06, + "loss": 2.6487, + "step": 44061 + }, + { + "epoch": 2.7352411695325594, + "grad_norm": 0.1412842846295389, + "learning_rate": 2.3538661135385167e-06, + "loss": 2.6182, + "step": 44062 + }, + { + "epoch": 2.7353032466323173, + "grad_norm": 0.13207333700919138, + "learning_rate": 2.35277117527864e-06, + "loss": 2.6585, + "step": 44063 + }, + { + "epoch": 2.735365323732075, + "grad_norm": 0.15137674239980498, + "learning_rate": 2.3516764856059305e-06, + "loss": 2.6994, + "step": 44064 + }, + { + "epoch": 2.735427400831833, + "grad_norm": 0.14085538853205307, + "learning_rate": 2.3505820445260897e-06, + "loss": 2.7207, + "step": 44065 + }, + { + "epoch": 2.735489477931591, + "grad_norm": 0.13546852958775665, + "learning_rate": 2.3494878520448293e-06, + "loss": 2.5847, + "step": 44066 + }, + { + "epoch": 2.735551555031349, + "grad_norm": 0.14412655748814418, + "learning_rate": 2.3483939081678618e-06, + "loss": 2.7033, + "step": 44067 + }, + { + "epoch": 2.735613632131107, + "grad_norm": 0.15976532467391832, + "learning_rate": 2.347300212900877e-06, + "loss": 2.7295, + "step": 44068 + }, + { + "epoch": 2.735675709230865, + "grad_norm": 0.13564105348680847, + "learning_rate": 2.346206766249609e-06, + "loss": 2.7275, + "step": 44069 + }, + { + "epoch": 2.7357377863306227, + "grad_norm": 0.15120506218050375, + "learning_rate": 2.3451135682197478e-06, + "loss": 2.7463, + "step": 44070 + }, + { + "epoch": 2.7357998634303806, + "grad_norm": 0.1395156068565826, + "learning_rate": 2.3440206188169946e-06, + "loss": 2.6182, + "step": 44071 + }, + { + "epoch": 2.7358619405301385, + "grad_norm": 0.12906471540336914, + "learning_rate": 2.3429279180470496e-06, + "loss": 2.5827, + "step": 44072 + }, + { + "epoch": 2.7359240176298965, + "grad_norm": 0.141379774628969, + "learning_rate": 2.341835465915626e-06, + "loss": 2.7703, + "step": 44073 + }, + { + "epoch": 2.7359860947296544, + "grad_norm": 0.1450761498279288, + "learning_rate": 2.3407432624284186e-06, + "loss": 2.693, + "step": 44074 + }, + { + "epoch": 2.7360481718294123, + "grad_norm": 0.13317894870027042, + "learning_rate": 2.3396513075911175e-06, + "loss": 2.7538, + "step": 44075 + }, + { + "epoch": 2.7361102489291698, + "grad_norm": 0.15108848879193845, + "learning_rate": 2.3385596014094293e-06, + "loss": 2.711, + "step": 44076 + }, + { + "epoch": 2.736172326028928, + "grad_norm": 0.14549953001887, + "learning_rate": 2.337468143889038e-06, + "loss": 2.6407, + "step": 44077 + }, + { + "epoch": 2.7362344031286856, + "grad_norm": 0.13527872016323048, + "learning_rate": 2.3363769350356555e-06, + "loss": 2.7792, + "step": 44078 + }, + { + "epoch": 2.736296480228444, + "grad_norm": 0.1281597674695662, + "learning_rate": 2.3352859748549617e-06, + "loss": 2.6481, + "step": 44079 + }, + { + "epoch": 2.7363585573282014, + "grad_norm": 0.13955115551160915, + "learning_rate": 2.334195263352651e-06, + "loss": 2.7241, + "step": 44080 + }, + { + "epoch": 2.73642063442796, + "grad_norm": 0.13327120358480918, + "learning_rate": 2.3331048005344136e-06, + "loss": 2.7081, + "step": 44081 + }, + { + "epoch": 2.7364827115277173, + "grad_norm": 0.13300324514784298, + "learning_rate": 2.3320145864059393e-06, + "loss": 2.6995, + "step": 44082 + }, + { + "epoch": 2.736544788627475, + "grad_norm": 0.1568775271500919, + "learning_rate": 2.330924620972924e-06, + "loss": 2.6958, + "step": 44083 + }, + { + "epoch": 2.736606865727233, + "grad_norm": 0.133807654376805, + "learning_rate": 2.3298349042410407e-06, + "loss": 2.6328, + "step": 44084 + }, + { + "epoch": 2.736668942826991, + "grad_norm": 0.15168339096291159, + "learning_rate": 2.328745436215979e-06, + "loss": 2.8002, + "step": 44085 + }, + { + "epoch": 2.736731019926749, + "grad_norm": 0.13294485453468322, + "learning_rate": 2.3276562169034235e-06, + "loss": 2.6639, + "step": 44086 + }, + { + "epoch": 2.736793097026507, + "grad_norm": 0.13956486261864712, + "learning_rate": 2.3265672463090636e-06, + "loss": 2.7224, + "step": 44087 + }, + { + "epoch": 2.736855174126265, + "grad_norm": 0.14618588872268654, + "learning_rate": 2.325478524438579e-06, + "loss": 2.642, + "step": 44088 + }, + { + "epoch": 2.7369172512260227, + "grad_norm": 0.13161011888560323, + "learning_rate": 2.3243900512976425e-06, + "loss": 2.7235, + "step": 44089 + }, + { + "epoch": 2.7369793283257806, + "grad_norm": 0.15944468783831414, + "learning_rate": 2.3233018268919383e-06, + "loss": 2.6631, + "step": 44090 + }, + { + "epoch": 2.7370414054255385, + "grad_norm": 0.1448549306231135, + "learning_rate": 2.3222138512271394e-06, + "loss": 2.7835, + "step": 44091 + }, + { + "epoch": 2.7371034825252964, + "grad_norm": 0.13435770577765627, + "learning_rate": 2.3211261243089256e-06, + "loss": 2.6779, + "step": 44092 + }, + { + "epoch": 2.7371655596250544, + "grad_norm": 0.1315523700710483, + "learning_rate": 2.3200386461429745e-06, + "loss": 2.6795, + "step": 44093 + }, + { + "epoch": 2.7372276367248123, + "grad_norm": 0.13577731796251674, + "learning_rate": 2.3189514167349545e-06, + "loss": 2.7026, + "step": 44094 + }, + { + "epoch": 2.73728971382457, + "grad_norm": 0.1322375150733901, + "learning_rate": 2.3178644360905335e-06, + "loss": 2.6632, + "step": 44095 + }, + { + "epoch": 2.737351790924328, + "grad_norm": 0.13214577893484505, + "learning_rate": 2.3167777042154005e-06, + "loss": 2.7423, + "step": 44096 + }, + { + "epoch": 2.737413868024086, + "grad_norm": 0.1360516006268496, + "learning_rate": 2.315691221115207e-06, + "loss": 2.689, + "step": 44097 + }, + { + "epoch": 2.737475945123844, + "grad_norm": 0.1308083363538832, + "learning_rate": 2.314604986795632e-06, + "loss": 2.6798, + "step": 44098 + }, + { + "epoch": 2.737538022223602, + "grad_norm": 0.14217457424351276, + "learning_rate": 2.313519001262332e-06, + "loss": 2.7286, + "step": 44099 + }, + { + "epoch": 2.73760009932336, + "grad_norm": 0.13421349134346422, + "learning_rate": 2.3124332645209855e-06, + "loss": 2.8162, + "step": 44100 + }, + { + "epoch": 2.7376621764231173, + "grad_norm": 0.13075572506513097, + "learning_rate": 2.3113477765772494e-06, + "loss": 2.6608, + "step": 44101 + }, + { + "epoch": 2.7377242535228756, + "grad_norm": 0.13353628796662392, + "learning_rate": 2.3102625374367913e-06, + "loss": 2.6628, + "step": 44102 + }, + { + "epoch": 2.737786330622633, + "grad_norm": 0.13897407538605136, + "learning_rate": 2.3091775471052733e-06, + "loss": 2.6877, + "step": 44103 + }, + { + "epoch": 2.7378484077223915, + "grad_norm": 0.13036597881212889, + "learning_rate": 2.308092805588341e-06, + "loss": 2.7483, + "step": 44104 + }, + { + "epoch": 2.737910484822149, + "grad_norm": 0.13284324542568726, + "learning_rate": 2.3070083128916785e-06, + "loss": 2.7048, + "step": 44105 + }, + { + "epoch": 2.737972561921907, + "grad_norm": 0.1351450333874693, + "learning_rate": 2.305924069020926e-06, + "loss": 2.6839, + "step": 44106 + }, + { + "epoch": 2.7380346390216648, + "grad_norm": 0.14066831928179943, + "learning_rate": 2.3048400739817454e-06, + "loss": 2.7068, + "step": 44107 + }, + { + "epoch": 2.7380967161214227, + "grad_norm": 0.13448097048164553, + "learning_rate": 2.3037563277797937e-06, + "loss": 2.747, + "step": 44108 + }, + { + "epoch": 2.7381587932211806, + "grad_norm": 0.13009023237730183, + "learning_rate": 2.302672830420721e-06, + "loss": 2.4942, + "step": 44109 + }, + { + "epoch": 2.7382208703209385, + "grad_norm": 0.12865312606460158, + "learning_rate": 2.3015895819101906e-06, + "loss": 2.6417, + "step": 44110 + }, + { + "epoch": 2.7382829474206964, + "grad_norm": 0.1331478401300054, + "learning_rate": 2.300506582253842e-06, + "loss": 2.7404, + "step": 44111 + }, + { + "epoch": 2.7383450245204544, + "grad_norm": 0.13093001323773015, + "learning_rate": 2.2994238314573314e-06, + "loss": 2.6772, + "step": 44112 + }, + { + "epoch": 2.7384071016202123, + "grad_norm": 0.1301690439711521, + "learning_rate": 2.298341329526299e-06, + "loss": 2.7211, + "step": 44113 + }, + { + "epoch": 2.73846917871997, + "grad_norm": 0.14504299551975963, + "learning_rate": 2.297259076466407e-06, + "loss": 2.6023, + "step": 44114 + }, + { + "epoch": 2.738531255819728, + "grad_norm": 0.13696510734736186, + "learning_rate": 2.2961770722832952e-06, + "loss": 2.7836, + "step": 44115 + }, + { + "epoch": 2.738593332919486, + "grad_norm": 0.12970265104655057, + "learning_rate": 2.2950953169826097e-06, + "loss": 2.6742, + "step": 44116 + }, + { + "epoch": 2.738655410019244, + "grad_norm": 0.13960674303687326, + "learning_rate": 2.2940138105699836e-06, + "loss": 2.6943, + "step": 44117 + }, + { + "epoch": 2.738717487119002, + "grad_norm": 0.14444413652817703, + "learning_rate": 2.2929325530510747e-06, + "loss": 2.706, + "step": 44118 + }, + { + "epoch": 2.73877956421876, + "grad_norm": 0.14490612489399993, + "learning_rate": 2.291851544431511e-06, + "loss": 2.6437, + "step": 44119 + }, + { + "epoch": 2.7388416413185177, + "grad_norm": 0.13282535294441547, + "learning_rate": 2.2907707847169503e-06, + "loss": 2.697, + "step": 44120 + }, + { + "epoch": 2.7389037184182756, + "grad_norm": 0.1283560266249571, + "learning_rate": 2.2896902739130143e-06, + "loss": 2.7736, + "step": 44121 + }, + { + "epoch": 2.7389657955180335, + "grad_norm": 0.13929581972836072, + "learning_rate": 2.288610012025344e-06, + "loss": 2.7089, + "step": 44122 + }, + { + "epoch": 2.7390278726177915, + "grad_norm": 0.15437454369319242, + "learning_rate": 2.2875299990595843e-06, + "loss": 2.7056, + "step": 44123 + }, + { + "epoch": 2.739089949717549, + "grad_norm": 0.14510891147595767, + "learning_rate": 2.286450235021359e-06, + "loss": 2.6296, + "step": 44124 + }, + { + "epoch": 2.7391520268173073, + "grad_norm": 0.13995780555911017, + "learning_rate": 2.2853707199163134e-06, + "loss": 2.5976, + "step": 44125 + }, + { + "epoch": 2.7392141039170648, + "grad_norm": 0.1392614230400807, + "learning_rate": 2.2842914537500647e-06, + "loss": 2.6944, + "step": 44126 + }, + { + "epoch": 2.739276181016823, + "grad_norm": 0.14201315550535754, + "learning_rate": 2.283212436528248e-06, + "loss": 2.684, + "step": 44127 + }, + { + "epoch": 2.7393382581165806, + "grad_norm": 0.13205392264771088, + "learning_rate": 2.2821336682564975e-06, + "loss": 2.691, + "step": 44128 + }, + { + "epoch": 2.739400335216339, + "grad_norm": 0.13473115010136394, + "learning_rate": 2.281055148940442e-06, + "loss": 2.7184, + "step": 44129 + }, + { + "epoch": 2.7394624123160964, + "grad_norm": 0.15174714668331463, + "learning_rate": 2.27997687858571e-06, + "loss": 2.7845, + "step": 44130 + }, + { + "epoch": 2.7395244894158544, + "grad_norm": 0.14206562294613362, + "learning_rate": 2.2788988571979142e-06, + "loss": 2.749, + "step": 44131 + }, + { + "epoch": 2.7395865665156123, + "grad_norm": 0.12972221627976804, + "learning_rate": 2.277821084782689e-06, + "loss": 2.7777, + "step": 44132 + }, + { + "epoch": 2.73964864361537, + "grad_norm": 0.13837894878507942, + "learning_rate": 2.2767435613456624e-06, + "loss": 2.7848, + "step": 44133 + }, + { + "epoch": 2.739710720715128, + "grad_norm": 0.14828672405987875, + "learning_rate": 2.275666286892447e-06, + "loss": 2.6771, + "step": 44134 + }, + { + "epoch": 2.739772797814886, + "grad_norm": 0.13712752404145925, + "learning_rate": 2.274589261428661e-06, + "loss": 2.6674, + "step": 44135 + }, + { + "epoch": 2.739834874914644, + "grad_norm": 0.14001749529683993, + "learning_rate": 2.273512484959933e-06, + "loss": 2.6985, + "step": 44136 + }, + { + "epoch": 2.739896952014402, + "grad_norm": 0.14687561557519033, + "learning_rate": 2.272435957491881e-06, + "loss": 2.7728, + "step": 44137 + }, + { + "epoch": 2.73995902911416, + "grad_norm": 0.14803557818348348, + "learning_rate": 2.2713596790301107e-06, + "loss": 2.6329, + "step": 44138 + }, + { + "epoch": 2.7400211062139177, + "grad_norm": 0.13716464898351843, + "learning_rate": 2.2702836495802514e-06, + "loss": 2.6902, + "step": 44139 + }, + { + "epoch": 2.7400831833136756, + "grad_norm": 0.13755266768540703, + "learning_rate": 2.2692078691478935e-06, + "loss": 2.645, + "step": 44140 + }, + { + "epoch": 2.7401452604134335, + "grad_norm": 0.1327174351364116, + "learning_rate": 2.2681323377386765e-06, + "loss": 2.6542, + "step": 44141 + }, + { + "epoch": 2.7402073375131915, + "grad_norm": 0.13872302817759735, + "learning_rate": 2.2670570553582014e-06, + "loss": 2.6767, + "step": 44142 + }, + { + "epoch": 2.7402694146129494, + "grad_norm": 0.13420503229821612, + "learning_rate": 2.2659820220120755e-06, + "loss": 2.686, + "step": 44143 + }, + { + "epoch": 2.7403314917127073, + "grad_norm": 0.13334554402745005, + "learning_rate": 2.2649072377059046e-06, + "loss": 2.7843, + "step": 44144 + }, + { + "epoch": 2.740393568812465, + "grad_norm": 0.15477658798618224, + "learning_rate": 2.2638327024453065e-06, + "loss": 2.7479, + "step": 44145 + }, + { + "epoch": 2.740455645912223, + "grad_norm": 0.14225288423093088, + "learning_rate": 2.262758416235877e-06, + "loss": 2.6843, + "step": 44146 + }, + { + "epoch": 2.740517723011981, + "grad_norm": 0.13519817539943435, + "learning_rate": 2.2616843790832287e-06, + "loss": 2.7091, + "step": 44147 + }, + { + "epoch": 2.740579800111739, + "grad_norm": 0.1401778768781684, + "learning_rate": 2.2606105909929565e-06, + "loss": 2.6657, + "step": 44148 + }, + { + "epoch": 2.7406418772114964, + "grad_norm": 0.18697800095778327, + "learning_rate": 2.259537051970678e-06, + "loss": 2.7321, + "step": 44149 + }, + { + "epoch": 2.740703954311255, + "grad_norm": 0.141493249443197, + "learning_rate": 2.2584637620219785e-06, + "loss": 2.6734, + "step": 44150 + }, + { + "epoch": 2.7407660314110123, + "grad_norm": 0.13888108228228116, + "learning_rate": 2.2573907211524637e-06, + "loss": 2.7493, + "step": 44151 + }, + { + "epoch": 2.7408281085107706, + "grad_norm": 0.17006004131345517, + "learning_rate": 2.2563179293677237e-06, + "loss": 2.6294, + "step": 44152 + }, + { + "epoch": 2.740890185610528, + "grad_norm": 0.13849806975016318, + "learning_rate": 2.255245386673377e-06, + "loss": 2.7814, + "step": 44153 + }, + { + "epoch": 2.740952262710286, + "grad_norm": 0.13053035652123354, + "learning_rate": 2.2541730930749904e-06, + "loss": 2.7779, + "step": 44154 + }, + { + "epoch": 2.741014339810044, + "grad_norm": 0.13620484360633217, + "learning_rate": 2.2531010485781822e-06, + "loss": 2.636, + "step": 44155 + }, + { + "epoch": 2.741076416909802, + "grad_norm": 0.13347803285284016, + "learning_rate": 2.2520292531885422e-06, + "loss": 2.6626, + "step": 44156 + }, + { + "epoch": 2.74113849400956, + "grad_norm": 0.16367300665926224, + "learning_rate": 2.2509577069116495e-06, + "loss": 2.668, + "step": 44157 + }, + { + "epoch": 2.7412005711093177, + "grad_norm": 0.1373996672000214, + "learning_rate": 2.249886409753099e-06, + "loss": 2.7139, + "step": 44158 + }, + { + "epoch": 2.7412626482090756, + "grad_norm": 0.13129719754104194, + "learning_rate": 2.2488153617184924e-06, + "loss": 2.6773, + "step": 44159 + }, + { + "epoch": 2.7413247253088335, + "grad_norm": 0.13432473473454362, + "learning_rate": 2.247744562813403e-06, + "loss": 2.6562, + "step": 44160 + }, + { + "epoch": 2.7413868024085914, + "grad_norm": 0.15322442305976433, + "learning_rate": 2.246674013043426e-06, + "loss": 2.6463, + "step": 44161 + }, + { + "epoch": 2.7414488795083494, + "grad_norm": 0.13502841703360427, + "learning_rate": 2.2456037124141403e-06, + "loss": 2.6698, + "step": 44162 + }, + { + "epoch": 2.7415109566081073, + "grad_norm": 0.1442907820232572, + "learning_rate": 2.244533660931125e-06, + "loss": 2.668, + "step": 44163 + }, + { + "epoch": 2.741573033707865, + "grad_norm": 0.13338913975898964, + "learning_rate": 2.243463858599981e-06, + "loss": 2.6848, + "step": 44164 + }, + { + "epoch": 2.741635110807623, + "grad_norm": 0.13463678974162246, + "learning_rate": 2.2423943054262764e-06, + "loss": 2.7221, + "step": 44165 + }, + { + "epoch": 2.741697187907381, + "grad_norm": 0.13465821090433744, + "learning_rate": 2.241325001415595e-06, + "loss": 2.7556, + "step": 44166 + }, + { + "epoch": 2.741759265007139, + "grad_norm": 0.13993133886718587, + "learning_rate": 2.240255946573511e-06, + "loss": 2.6804, + "step": 44167 + }, + { + "epoch": 2.741821342106897, + "grad_norm": 0.15710848763934693, + "learning_rate": 2.239187140905613e-06, + "loss": 2.6405, + "step": 44168 + }, + { + "epoch": 2.741883419206655, + "grad_norm": 0.1462885819695886, + "learning_rate": 2.2381185844174647e-06, + "loss": 2.7252, + "step": 44169 + }, + { + "epoch": 2.7419454963064127, + "grad_norm": 0.13406632227884038, + "learning_rate": 2.2370502771146494e-06, + "loss": 2.6875, + "step": 44170 + }, + { + "epoch": 2.7420075734061706, + "grad_norm": 0.13486046839935867, + "learning_rate": 2.2359822190027302e-06, + "loss": 2.698, + "step": 44171 + }, + { + "epoch": 2.742069650505928, + "grad_norm": 0.13394258559224914, + "learning_rate": 2.2349144100872964e-06, + "loss": 2.6465, + "step": 44172 + }, + { + "epoch": 2.7421317276056865, + "grad_norm": 0.1364198892313634, + "learning_rate": 2.2338468503739053e-06, + "loss": 2.7367, + "step": 44173 + }, + { + "epoch": 2.742193804705444, + "grad_norm": 0.15243360119424618, + "learning_rate": 2.232779539868135e-06, + "loss": 2.7259, + "step": 44174 + }, + { + "epoch": 2.7422558818052023, + "grad_norm": 0.13421180512842706, + "learning_rate": 2.2317124785755427e-06, + "loss": 2.6058, + "step": 44175 + }, + { + "epoch": 2.7423179589049598, + "grad_norm": 0.1319388658894683, + "learning_rate": 2.2306456665017016e-06, + "loss": 2.6521, + "step": 44176 + }, + { + "epoch": 2.742380036004718, + "grad_norm": 0.15017483451347385, + "learning_rate": 2.229579103652185e-06, + "loss": 2.6372, + "step": 44177 + }, + { + "epoch": 2.7424421131044756, + "grad_norm": 0.26013386948004136, + "learning_rate": 2.2285127900325497e-06, + "loss": 2.7561, + "step": 44178 + }, + { + "epoch": 2.7425041902042335, + "grad_norm": 0.14705554234476578, + "learning_rate": 2.2274467256483635e-06, + "loss": 2.6917, + "step": 44179 + }, + { + "epoch": 2.7425662673039914, + "grad_norm": 0.14502439265981323, + "learning_rate": 2.2263809105051713e-06, + "loss": 2.7486, + "step": 44180 + }, + { + "epoch": 2.7426283444037494, + "grad_norm": 0.14493339240893965, + "learning_rate": 2.2253153446085583e-06, + "loss": 2.7168, + "step": 44181 + }, + { + "epoch": 2.7426904215035073, + "grad_norm": 0.14328006504696206, + "learning_rate": 2.2242500279640752e-06, + "loss": 2.7958, + "step": 44182 + }, + { + "epoch": 2.742752498603265, + "grad_norm": 0.15436862532778473, + "learning_rate": 2.2231849605772735e-06, + "loss": 2.6975, + "step": 44183 + }, + { + "epoch": 2.742814575703023, + "grad_norm": 0.14636467175323456, + "learning_rate": 2.2221201424537097e-06, + "loss": 2.7241, + "step": 44184 + }, + { + "epoch": 2.742876652802781, + "grad_norm": 0.14331492038792404, + "learning_rate": 2.2210555735989458e-06, + "loss": 2.6764, + "step": 44185 + }, + { + "epoch": 2.742938729902539, + "grad_norm": 0.14601008880798683, + "learning_rate": 2.219991254018533e-06, + "loss": 2.7057, + "step": 44186 + }, + { + "epoch": 2.743000807002297, + "grad_norm": 0.13393753751192622, + "learning_rate": 2.2189271837180337e-06, + "loss": 2.7428, + "step": 44187 + }, + { + "epoch": 2.743062884102055, + "grad_norm": 0.13315461077931148, + "learning_rate": 2.217863362702982e-06, + "loss": 2.6912, + "step": 44188 + }, + { + "epoch": 2.7431249612018127, + "grad_norm": 0.14836740721121414, + "learning_rate": 2.2167997909789405e-06, + "loss": 2.6184, + "step": 44189 + }, + { + "epoch": 2.7431870383015706, + "grad_norm": 0.1455657190910365, + "learning_rate": 2.2157364685514493e-06, + "loss": 2.7484, + "step": 44190 + }, + { + "epoch": 2.7432491154013285, + "grad_norm": 0.13387084466164642, + "learning_rate": 2.2146733954260647e-06, + "loss": 2.7067, + "step": 44191 + }, + { + "epoch": 2.7433111925010865, + "grad_norm": 0.1391663064768788, + "learning_rate": 2.213610571608332e-06, + "loss": 2.7576, + "step": 44192 + }, + { + "epoch": 2.7433732696008444, + "grad_norm": 0.1303715580849826, + "learning_rate": 2.212547997103792e-06, + "loss": 2.6897, + "step": 44193 + }, + { + "epoch": 2.7434353467006023, + "grad_norm": 0.14150414897932106, + "learning_rate": 2.2114856719179844e-06, + "loss": 2.6934, + "step": 44194 + }, + { + "epoch": 2.74349742380036, + "grad_norm": 0.13400576784936663, + "learning_rate": 2.21042359605646e-06, + "loss": 2.7217, + "step": 44195 + }, + { + "epoch": 2.743559500900118, + "grad_norm": 0.12940431953690795, + "learning_rate": 2.209361769524765e-06, + "loss": 2.7129, + "step": 44196 + }, + { + "epoch": 2.7436215779998756, + "grad_norm": 0.13541320343968746, + "learning_rate": 2.2083001923284217e-06, + "loss": 2.763, + "step": 44197 + }, + { + "epoch": 2.743683655099634, + "grad_norm": 0.1352474403901555, + "learning_rate": 2.207238864472977e-06, + "loss": 2.7997, + "step": 44198 + }, + { + "epoch": 2.7437457321993914, + "grad_norm": 0.14944146721534513, + "learning_rate": 2.2061777859639755e-06, + "loss": 2.6673, + "step": 44199 + }, + { + "epoch": 2.74380780929915, + "grad_norm": 0.13212948366928523, + "learning_rate": 2.2051169568069462e-06, + "loss": 2.7428, + "step": 44200 + }, + { + "epoch": 2.7438698863989073, + "grad_norm": 0.13485686351137832, + "learning_rate": 2.204056377007424e-06, + "loss": 2.7788, + "step": 44201 + }, + { + "epoch": 2.743931963498665, + "grad_norm": 0.1496564544525789, + "learning_rate": 2.202996046570943e-06, + "loss": 2.8046, + "step": 44202 + }, + { + "epoch": 2.743994040598423, + "grad_norm": 0.14222645173648668, + "learning_rate": 2.201935965503027e-06, + "loss": 2.6951, + "step": 44203 + }, + { + "epoch": 2.744056117698181, + "grad_norm": 0.13361920761127463, + "learning_rate": 2.200876133809221e-06, + "loss": 2.7678, + "step": 44204 + }, + { + "epoch": 2.744118194797939, + "grad_norm": 0.1427072893635881, + "learning_rate": 2.1998165514950485e-06, + "loss": 2.6495, + "step": 44205 + }, + { + "epoch": 2.744180271897697, + "grad_norm": 0.13477585933358627, + "learning_rate": 2.1987572185660387e-06, + "loss": 2.7332, + "step": 44206 + }, + { + "epoch": 2.744242348997455, + "grad_norm": 0.13156296762804662, + "learning_rate": 2.1976981350277093e-06, + "loss": 2.6686, + "step": 44207 + }, + { + "epoch": 2.7443044260972127, + "grad_norm": 0.13788811057286438, + "learning_rate": 2.1966393008856e-06, + "loss": 2.6408, + "step": 44208 + }, + { + "epoch": 2.7443665031969706, + "grad_norm": 0.13115659097595828, + "learning_rate": 2.195580716145229e-06, + "loss": 2.6334, + "step": 44209 + }, + { + "epoch": 2.7444285802967285, + "grad_norm": 0.13223092988998655, + "learning_rate": 2.1945223808121142e-06, + "loss": 2.7794, + "step": 44210 + }, + { + "epoch": 2.7444906573964865, + "grad_norm": 0.12992558456235812, + "learning_rate": 2.193464294891784e-06, + "loss": 2.7393, + "step": 44211 + }, + { + "epoch": 2.7445527344962444, + "grad_norm": 0.12961551581580788, + "learning_rate": 2.1924064583897507e-06, + "loss": 2.6905, + "step": 44212 + }, + { + "epoch": 2.7446148115960023, + "grad_norm": 0.1325695958088634, + "learning_rate": 2.191348871311544e-06, + "loss": 2.7163, + "step": 44213 + }, + { + "epoch": 2.74467688869576, + "grad_norm": 0.15025845206269406, + "learning_rate": 2.190291533662675e-06, + "loss": 2.7112, + "step": 44214 + }, + { + "epoch": 2.744738965795518, + "grad_norm": 0.1381899828832268, + "learning_rate": 2.1892344454486624e-06, + "loss": 2.6596, + "step": 44215 + }, + { + "epoch": 2.744801042895276, + "grad_norm": 0.14352570455267252, + "learning_rate": 2.1881776066750128e-06, + "loss": 2.7022, + "step": 44216 + }, + { + "epoch": 2.744863119995034, + "grad_norm": 0.13767355212613488, + "learning_rate": 2.187121017347249e-06, + "loss": 2.7364, + "step": 44217 + }, + { + "epoch": 2.744925197094792, + "grad_norm": 0.1506287930632272, + "learning_rate": 2.1860646774708836e-06, + "loss": 2.7582, + "step": 44218 + }, + { + "epoch": 2.74498727419455, + "grad_norm": 0.13968832606726342, + "learning_rate": 2.185008587051429e-06, + "loss": 2.7038, + "step": 44219 + }, + { + "epoch": 2.7450493512943073, + "grad_norm": 0.1330685111534036, + "learning_rate": 2.1839527460943977e-06, + "loss": 2.7307, + "step": 44220 + }, + { + "epoch": 2.7451114283940656, + "grad_norm": 0.1348869220141861, + "learning_rate": 2.182897154605279e-06, + "loss": 2.6695, + "step": 44221 + }, + { + "epoch": 2.745173505493823, + "grad_norm": 0.1447814309724022, + "learning_rate": 2.1818418125896023e-06, + "loss": 2.7273, + "step": 44222 + }, + { + "epoch": 2.7452355825935815, + "grad_norm": 0.1469363360977482, + "learning_rate": 2.180786720052863e-06, + "loss": 2.72, + "step": 44223 + }, + { + "epoch": 2.745297659693339, + "grad_norm": 0.1302969932706575, + "learning_rate": 2.179731877000574e-06, + "loss": 2.6831, + "step": 44224 + }, + { + "epoch": 2.745359736793097, + "grad_norm": 0.13989309074481027, + "learning_rate": 2.1786772834382297e-06, + "loss": 2.6555, + "step": 44225 + }, + { + "epoch": 2.745421813892855, + "grad_norm": 0.1451858055362359, + "learning_rate": 2.1776229393713265e-06, + "loss": 2.6672, + "step": 44226 + }, + { + "epoch": 2.7454838909926127, + "grad_norm": 0.13537938188512835, + "learning_rate": 2.1765688448053823e-06, + "loss": 2.6159, + "step": 44227 + }, + { + "epoch": 2.7455459680923706, + "grad_norm": 0.12971401719079745, + "learning_rate": 2.1755149997458867e-06, + "loss": 2.6205, + "step": 44228 + }, + { + "epoch": 2.7456080451921285, + "grad_norm": 0.14571061179022557, + "learning_rate": 2.1744614041983356e-06, + "loss": 2.744, + "step": 44229 + }, + { + "epoch": 2.7456701222918865, + "grad_norm": 0.14191080067059927, + "learning_rate": 2.1734080581682305e-06, + "loss": 2.7214, + "step": 44230 + }, + { + "epoch": 2.7457321993916444, + "grad_norm": 0.1311541198587482, + "learning_rate": 2.172354961661066e-06, + "loss": 2.7445, + "step": 44231 + }, + { + "epoch": 2.7457942764914023, + "grad_norm": 0.14156508798107292, + "learning_rate": 2.1713021146823385e-06, + "loss": 2.769, + "step": 44232 + }, + { + "epoch": 2.74585635359116, + "grad_norm": 0.1389351146152599, + "learning_rate": 2.1702495172375436e-06, + "loss": 2.6506, + "step": 44233 + }, + { + "epoch": 2.745918430690918, + "grad_norm": 0.12843247973476019, + "learning_rate": 2.1691971693321544e-06, + "loss": 2.6602, + "step": 44234 + }, + { + "epoch": 2.745980507790676, + "grad_norm": 0.13580296891435215, + "learning_rate": 2.1681450709716834e-06, + "loss": 2.7322, + "step": 44235 + }, + { + "epoch": 2.746042584890434, + "grad_norm": 0.13095928761972717, + "learning_rate": 2.167093222161615e-06, + "loss": 2.712, + "step": 44236 + }, + { + "epoch": 2.746104661990192, + "grad_norm": 0.1401967707505253, + "learning_rate": 2.1660416229074278e-06, + "loss": 2.7328, + "step": 44237 + }, + { + "epoch": 2.74616673908995, + "grad_norm": 0.14106476421592584, + "learning_rate": 2.1649902732146176e-06, + "loss": 2.7102, + "step": 44238 + }, + { + "epoch": 2.7462288161897077, + "grad_norm": 0.13595476588951383, + "learning_rate": 2.163939173088653e-06, + "loss": 2.6487, + "step": 44239 + }, + { + "epoch": 2.7462908932894656, + "grad_norm": 0.12882177879752432, + "learning_rate": 2.1628883225350447e-06, + "loss": 2.6307, + "step": 44240 + }, + { + "epoch": 2.7463529703892235, + "grad_norm": 0.13601629450949324, + "learning_rate": 2.1618377215592566e-06, + "loss": 2.7304, + "step": 44241 + }, + { + "epoch": 2.7464150474889815, + "grad_norm": 0.15750788168448937, + "learning_rate": 2.1607873701667726e-06, + "loss": 2.7597, + "step": 44242 + }, + { + "epoch": 2.746477124588739, + "grad_norm": 0.1361256815111249, + "learning_rate": 2.1597372683630713e-06, + "loss": 2.7356, + "step": 44243 + }, + { + "epoch": 2.7465392016884973, + "grad_norm": 0.1385039669623832, + "learning_rate": 2.1586874161536377e-06, + "loss": 2.6202, + "step": 44244 + }, + { + "epoch": 2.7466012787882548, + "grad_norm": 0.14856148235059163, + "learning_rate": 2.1576378135439503e-06, + "loss": 2.7554, + "step": 44245 + }, + { + "epoch": 2.746663355888013, + "grad_norm": 0.13027851518060451, + "learning_rate": 2.156588460539477e-06, + "loss": 2.7207, + "step": 44246 + }, + { + "epoch": 2.7467254329877706, + "grad_norm": 0.13011103491550613, + "learning_rate": 2.155539357145697e-06, + "loss": 2.6633, + "step": 44247 + }, + { + "epoch": 2.746787510087529, + "grad_norm": 0.16335708668555066, + "learning_rate": 2.154490503368073e-06, + "loss": 2.739, + "step": 44248 + }, + { + "epoch": 2.7468495871872864, + "grad_norm": 0.13048619108063406, + "learning_rate": 2.1534418992120996e-06, + "loss": 2.7199, + "step": 44249 + }, + { + "epoch": 2.7469116642870444, + "grad_norm": 0.1327150211515497, + "learning_rate": 2.1523935446832234e-06, + "loss": 2.6515, + "step": 44250 + }, + { + "epoch": 2.7469737413868023, + "grad_norm": 0.14792670025061924, + "learning_rate": 2.1513454397869337e-06, + "loss": 2.7163, + "step": 44251 + }, + { + "epoch": 2.74703581848656, + "grad_norm": 0.14593562018758263, + "learning_rate": 2.150297584528693e-06, + "loss": 2.7368, + "step": 44252 + }, + { + "epoch": 2.747097895586318, + "grad_norm": 0.13394876679788675, + "learning_rate": 2.1492499789139586e-06, + "loss": 2.673, + "step": 44253 + }, + { + "epoch": 2.747159972686076, + "grad_norm": 0.14087618144139893, + "learning_rate": 2.148202622948209e-06, + "loss": 2.6495, + "step": 44254 + }, + { + "epoch": 2.747222049785834, + "grad_norm": 0.15211830207988844, + "learning_rate": 2.1471555166369005e-06, + "loss": 2.7387, + "step": 44255 + }, + { + "epoch": 2.747284126885592, + "grad_norm": 0.14231045301068404, + "learning_rate": 2.1461086599854964e-06, + "loss": 2.7247, + "step": 44256 + }, + { + "epoch": 2.74734620398535, + "grad_norm": 0.1408495897712157, + "learning_rate": 2.1450620529994582e-06, + "loss": 2.757, + "step": 44257 + }, + { + "epoch": 2.7474082810851077, + "grad_norm": 0.13490971864452123, + "learning_rate": 2.144015695684254e-06, + "loss": 2.6371, + "step": 44258 + }, + { + "epoch": 2.7474703581848656, + "grad_norm": 0.13360223464346146, + "learning_rate": 2.1429695880453403e-06, + "loss": 2.6584, + "step": 44259 + }, + { + "epoch": 2.7475324352846235, + "grad_norm": 0.14004819513013883, + "learning_rate": 2.141923730088169e-06, + "loss": 2.6618, + "step": 44260 + }, + { + "epoch": 2.7475945123843815, + "grad_norm": 0.14186733560193604, + "learning_rate": 2.140878121818196e-06, + "loss": 2.662, + "step": 44261 + }, + { + "epoch": 2.7476565894841394, + "grad_norm": 0.14150557059605154, + "learning_rate": 2.1398327632408786e-06, + "loss": 2.7839, + "step": 44262 + }, + { + "epoch": 2.7477186665838973, + "grad_norm": 0.12827108417240904, + "learning_rate": 2.1387876543616793e-06, + "loss": 2.7094, + "step": 44263 + }, + { + "epoch": 2.747780743683655, + "grad_norm": 0.1307045960671318, + "learning_rate": 2.1377427951860375e-06, + "loss": 2.7167, + "step": 44264 + }, + { + "epoch": 2.747842820783413, + "grad_norm": 0.1559635815472015, + "learning_rate": 2.136698185719416e-06, + "loss": 2.8061, + "step": 44265 + }, + { + "epoch": 2.747904897883171, + "grad_norm": 0.13005665925617052, + "learning_rate": 2.1356538259672543e-06, + "loss": 2.7327, + "step": 44266 + }, + { + "epoch": 2.747966974982929, + "grad_norm": 0.13620510343721, + "learning_rate": 2.134609715935004e-06, + "loss": 2.7468, + "step": 44267 + }, + { + "epoch": 2.7480290520826864, + "grad_norm": 0.14224706387297054, + "learning_rate": 2.1335658556281225e-06, + "loss": 2.6761, + "step": 44268 + }, + { + "epoch": 2.748091129182445, + "grad_norm": 0.13130670923176965, + "learning_rate": 2.1325222450520434e-06, + "loss": 2.7301, + "step": 44269 + }, + { + "epoch": 2.7481532062822023, + "grad_norm": 0.1403943926051463, + "learning_rate": 2.1314788842122126e-06, + "loss": 2.6841, + "step": 44270 + }, + { + "epoch": 2.7482152833819606, + "grad_norm": 0.1345042721351719, + "learning_rate": 2.130435773114081e-06, + "loss": 2.7061, + "step": 44271 + }, + { + "epoch": 2.748277360481718, + "grad_norm": 0.1340688925443897, + "learning_rate": 2.1293929117630896e-06, + "loss": 2.8047, + "step": 44272 + }, + { + "epoch": 2.748339437581476, + "grad_norm": 0.13348948218849965, + "learning_rate": 2.1283503001646776e-06, + "loss": 2.7447, + "step": 44273 + }, + { + "epoch": 2.748401514681234, + "grad_norm": 0.1317867893866675, + "learning_rate": 2.12730793832428e-06, + "loss": 2.6487, + "step": 44274 + }, + { + "epoch": 2.748463591780992, + "grad_norm": 0.14929069462334724, + "learning_rate": 2.1262658262473313e-06, + "loss": 2.6889, + "step": 44275 + }, + { + "epoch": 2.74852566888075, + "grad_norm": 0.1395519563533354, + "learning_rate": 2.1252239639392822e-06, + "loss": 2.7464, + "step": 44276 + }, + { + "epoch": 2.7485877459805077, + "grad_norm": 0.1348907608443369, + "learning_rate": 2.1241823514055627e-06, + "loss": 2.741, + "step": 44277 + }, + { + "epoch": 2.7486498230802656, + "grad_norm": 0.13089178879534566, + "learning_rate": 2.1231409886516064e-06, + "loss": 2.6601, + "step": 44278 + }, + { + "epoch": 2.7487119001800235, + "grad_norm": 0.13047062364249384, + "learning_rate": 2.1220998756828424e-06, + "loss": 2.713, + "step": 44279 + }, + { + "epoch": 2.7487739772797815, + "grad_norm": 0.13063077745026822, + "learning_rate": 2.1210590125047057e-06, + "loss": 2.7172, + "step": 44280 + }, + { + "epoch": 2.7488360543795394, + "grad_norm": 0.13420195110250435, + "learning_rate": 2.120018399122631e-06, + "loss": 2.6605, + "step": 44281 + }, + { + "epoch": 2.7488981314792973, + "grad_norm": 0.14114462032594546, + "learning_rate": 2.1189780355420465e-06, + "loss": 2.7379, + "step": 44282 + }, + { + "epoch": 2.748960208579055, + "grad_norm": 0.13484955675699728, + "learning_rate": 2.1179379217683647e-06, + "loss": 2.7319, + "step": 44283 + }, + { + "epoch": 2.749022285678813, + "grad_norm": 0.1323807022205681, + "learning_rate": 2.1168980578070264e-06, + "loss": 2.6731, + "step": 44284 + }, + { + "epoch": 2.749084362778571, + "grad_norm": 0.1339566858335172, + "learning_rate": 2.1158584436634653e-06, + "loss": 2.6722, + "step": 44285 + }, + { + "epoch": 2.749146439878329, + "grad_norm": 0.13070155348576912, + "learning_rate": 2.1148190793430944e-06, + "loss": 2.6642, + "step": 44286 + }, + { + "epoch": 2.749208516978087, + "grad_norm": 0.1367525944372136, + "learning_rate": 2.1137799648513314e-06, + "loss": 2.739, + "step": 44287 + }, + { + "epoch": 2.749270594077845, + "grad_norm": 0.13210682765550874, + "learning_rate": 2.1127411001936048e-06, + "loss": 2.7351, + "step": 44288 + }, + { + "epoch": 2.7493326711776027, + "grad_norm": 0.13510159065282906, + "learning_rate": 2.1117024853753276e-06, + "loss": 2.6975, + "step": 44289 + }, + { + "epoch": 2.7493947482773606, + "grad_norm": 0.13147005242295823, + "learning_rate": 2.110664120401934e-06, + "loss": 2.6966, + "step": 44290 + }, + { + "epoch": 2.749456825377118, + "grad_norm": 0.13202348720143955, + "learning_rate": 2.1096260052788253e-06, + "loss": 2.7381, + "step": 44291 + }, + { + "epoch": 2.7495189024768765, + "grad_norm": 0.13116926537122253, + "learning_rate": 2.1085881400114305e-06, + "loss": 2.7399, + "step": 44292 + }, + { + "epoch": 2.749580979576634, + "grad_norm": 0.13659329107613707, + "learning_rate": 2.1075505246051454e-06, + "loss": 2.6954, + "step": 44293 + }, + { + "epoch": 2.7496430566763923, + "grad_norm": 0.14419289238300076, + "learning_rate": 2.106513159065404e-06, + "loss": 2.6524, + "step": 44294 + }, + { + "epoch": 2.74970513377615, + "grad_norm": 0.14038745367069616, + "learning_rate": 2.1054760433976083e-06, + "loss": 2.5707, + "step": 44295 + }, + { + "epoch": 2.749767210875908, + "grad_norm": 0.14178106177773445, + "learning_rate": 2.1044391776071704e-06, + "loss": 2.6661, + "step": 44296 + }, + { + "epoch": 2.7498292879756656, + "grad_norm": 0.1545695900677237, + "learning_rate": 2.103402561699502e-06, + "loss": 2.6444, + "step": 44297 + }, + { + "epoch": 2.7498913650754235, + "grad_norm": 0.13510403092915999, + "learning_rate": 2.1023661956800055e-06, + "loss": 2.7131, + "step": 44298 + }, + { + "epoch": 2.7499534421751815, + "grad_norm": 0.13417046194552998, + "learning_rate": 2.1013300795540926e-06, + "loss": 2.6622, + "step": 44299 + }, + { + "epoch": 2.7500155192749394, + "grad_norm": 0.13068338202871407, + "learning_rate": 2.100294213327175e-06, + "loss": 2.6859, + "step": 44300 + }, + { + "epoch": 2.7500775963746973, + "grad_norm": 0.13192527710900148, + "learning_rate": 2.0992585970046443e-06, + "loss": 2.7642, + "step": 44301 + }, + { + "epoch": 2.750139673474455, + "grad_norm": 0.13975055668793354, + "learning_rate": 2.0982232305919056e-06, + "loss": 2.7017, + "step": 44302 + }, + { + "epoch": 2.750201750574213, + "grad_norm": 0.1391829154159265, + "learning_rate": 2.097188114094373e-06, + "loss": 2.7204, + "step": 44303 + }, + { + "epoch": 2.750263827673971, + "grad_norm": 0.13403522510775268, + "learning_rate": 2.0961532475174404e-06, + "loss": 2.7199, + "step": 44304 + }, + { + "epoch": 2.750325904773729, + "grad_norm": 0.13060202130661935, + "learning_rate": 2.095118630866505e-06, + "loss": 2.5952, + "step": 44305 + }, + { + "epoch": 2.750387981873487, + "grad_norm": 0.13172778885939454, + "learning_rate": 2.094084264146956e-06, + "loss": 2.7343, + "step": 44306 + }, + { + "epoch": 2.750450058973245, + "grad_norm": 0.13615853109105877, + "learning_rate": 2.093050147364206e-06, + "loss": 2.7401, + "step": 44307 + }, + { + "epoch": 2.7505121360730027, + "grad_norm": 0.13023064581114885, + "learning_rate": 2.09201628052364e-06, + "loss": 2.828, + "step": 44308 + }, + { + "epoch": 2.7505742131727606, + "grad_norm": 0.16042238894594177, + "learning_rate": 2.0909826636306582e-06, + "loss": 2.6712, + "step": 44309 + }, + { + "epoch": 2.7506362902725185, + "grad_norm": 0.13801116429655066, + "learning_rate": 2.089949296690652e-06, + "loss": 2.6978, + "step": 44310 + }, + { + "epoch": 2.7506983673722765, + "grad_norm": 0.13514473017070858, + "learning_rate": 2.088916179709005e-06, + "loss": 2.7853, + "step": 44311 + }, + { + "epoch": 2.7507604444720344, + "grad_norm": 0.13681333539785265, + "learning_rate": 2.0878833126911136e-06, + "loss": 2.6278, + "step": 44312 + }, + { + "epoch": 2.7508225215717923, + "grad_norm": 0.1351475348997878, + "learning_rate": 2.0868506956423727e-06, + "loss": 2.743, + "step": 44313 + }, + { + "epoch": 2.75088459867155, + "grad_norm": 0.13497297258168608, + "learning_rate": 2.085818328568162e-06, + "loss": 2.7122, + "step": 44314 + }, + { + "epoch": 2.750946675771308, + "grad_norm": 0.15514984850436592, + "learning_rate": 2.0847862114738603e-06, + "loss": 2.7863, + "step": 44315 + }, + { + "epoch": 2.7510087528710656, + "grad_norm": 0.15381885070509185, + "learning_rate": 2.0837543443648576e-06, + "loss": 2.7675, + "step": 44316 + }, + { + "epoch": 2.751070829970824, + "grad_norm": 0.1438846493951886, + "learning_rate": 2.08272272724655e-06, + "loss": 2.7391, + "step": 44317 + }, + { + "epoch": 2.7511329070705814, + "grad_norm": 0.13556824661322517, + "learning_rate": 2.0816913601243105e-06, + "loss": 2.684, + "step": 44318 + }, + { + "epoch": 2.75119498417034, + "grad_norm": 0.136746785150499, + "learning_rate": 2.0806602430035183e-06, + "loss": 2.6305, + "step": 44319 + }, + { + "epoch": 2.7512570612700973, + "grad_norm": 0.14694410027673185, + "learning_rate": 2.0796293758895526e-06, + "loss": 2.687, + "step": 44320 + }, + { + "epoch": 2.751319138369855, + "grad_norm": 0.15603701857896177, + "learning_rate": 2.0785987587877976e-06, + "loss": 2.7239, + "step": 44321 + }, + { + "epoch": 2.751381215469613, + "grad_norm": 0.13582493297758508, + "learning_rate": 2.0775683917036214e-06, + "loss": 2.6599, + "step": 44322 + }, + { + "epoch": 2.751443292569371, + "grad_norm": 0.1323788378837274, + "learning_rate": 2.076538274642409e-06, + "loss": 2.6746, + "step": 44323 + }, + { + "epoch": 2.751505369669129, + "grad_norm": 0.13168635182588487, + "learning_rate": 2.0755084076095334e-06, + "loss": 2.6866, + "step": 44324 + }, + { + "epoch": 2.751567446768887, + "grad_norm": 0.14797916046427026, + "learning_rate": 2.0744787906103513e-06, + "loss": 2.6835, + "step": 44325 + }, + { + "epoch": 2.751629523868645, + "grad_norm": 0.13024365343430502, + "learning_rate": 2.073449423650259e-06, + "loss": 2.6291, + "step": 44326 + }, + { + "epoch": 2.7516916009684027, + "grad_norm": 0.1324538760495265, + "learning_rate": 2.0724203067346126e-06, + "loss": 2.738, + "step": 44327 + }, + { + "epoch": 2.7517536780681606, + "grad_norm": 0.12840755405910387, + "learning_rate": 2.071391439868786e-06, + "loss": 2.6907, + "step": 44328 + }, + { + "epoch": 2.7518157551679185, + "grad_norm": 0.13106910069170286, + "learning_rate": 2.070362823058136e-06, + "loss": 2.7261, + "step": 44329 + }, + { + "epoch": 2.7518778322676765, + "grad_norm": 0.1427301093238204, + "learning_rate": 2.0693344563080475e-06, + "loss": 2.6712, + "step": 44330 + }, + { + "epoch": 2.7519399093674344, + "grad_norm": 0.1379237883458755, + "learning_rate": 2.0683063396238766e-06, + "loss": 2.7381, + "step": 44331 + }, + { + "epoch": 2.7520019864671923, + "grad_norm": 0.13126540888298668, + "learning_rate": 2.0672784730109918e-06, + "loss": 2.7174, + "step": 44332 + }, + { + "epoch": 2.75206406356695, + "grad_norm": 0.13688946011773373, + "learning_rate": 2.0662508564747442e-06, + "loss": 2.6471, + "step": 44333 + }, + { + "epoch": 2.752126140666708, + "grad_norm": 0.1297169034440943, + "learning_rate": 2.0652234900205016e-06, + "loss": 2.6938, + "step": 44334 + }, + { + "epoch": 2.752188217766466, + "grad_norm": 0.1366476214533194, + "learning_rate": 2.0641963736536265e-06, + "loss": 2.6583, + "step": 44335 + }, + { + "epoch": 2.752250294866224, + "grad_norm": 0.13014723456896374, + "learning_rate": 2.0631695073794754e-06, + "loss": 2.7146, + "step": 44336 + }, + { + "epoch": 2.752312371965982, + "grad_norm": 0.13818606047568494, + "learning_rate": 2.0621428912034056e-06, + "loss": 2.6876, + "step": 44337 + }, + { + "epoch": 2.75237444906574, + "grad_norm": 0.1336168099182643, + "learning_rate": 2.061116525130774e-06, + "loss": 2.7588, + "step": 44338 + }, + { + "epoch": 2.7524365261654973, + "grad_norm": 0.13250287512817777, + "learning_rate": 2.060090409166937e-06, + "loss": 2.7052, + "step": 44339 + }, + { + "epoch": 2.7524986032652556, + "grad_norm": 0.131995951713722, + "learning_rate": 2.0590645433172407e-06, + "loss": 2.7444, + "step": 44340 + }, + { + "epoch": 2.752560680365013, + "grad_norm": 0.13812474996233443, + "learning_rate": 2.0580389275870526e-06, + "loss": 2.6843, + "step": 44341 + }, + { + "epoch": 2.7526227574647715, + "grad_norm": 0.1493015613748613, + "learning_rate": 2.057013561981702e-06, + "loss": 2.6688, + "step": 44342 + }, + { + "epoch": 2.752684834564529, + "grad_norm": 0.13336769632257275, + "learning_rate": 2.055988446506557e-06, + "loss": 2.6578, + "step": 44343 + }, + { + "epoch": 2.7527469116642873, + "grad_norm": 0.13058832194359105, + "learning_rate": 2.0549635811669575e-06, + "loss": 2.7139, + "step": 44344 + }, + { + "epoch": 2.752808988764045, + "grad_norm": 0.13666700337555548, + "learning_rate": 2.053938965968255e-06, + "loss": 2.6309, + "step": 44345 + }, + { + "epoch": 2.7528710658638027, + "grad_norm": 0.14702262535906727, + "learning_rate": 2.0529146009157895e-06, + "loss": 2.7094, + "step": 44346 + }, + { + "epoch": 2.7529331429635606, + "grad_norm": 0.14730379434626872, + "learning_rate": 2.0518904860149067e-06, + "loss": 2.6827, + "step": 44347 + }, + { + "epoch": 2.7529952200633185, + "grad_norm": 0.13036772905786922, + "learning_rate": 2.0508666212709473e-06, + "loss": 2.6552, + "step": 44348 + }, + { + "epoch": 2.7530572971630765, + "grad_norm": 0.13300281100735414, + "learning_rate": 2.0498430066892614e-06, + "loss": 2.6786, + "step": 44349 + }, + { + "epoch": 2.7531193742628344, + "grad_norm": 0.15373142668142448, + "learning_rate": 2.0488196422751905e-06, + "loss": 2.7255, + "step": 44350 + }, + { + "epoch": 2.7531814513625923, + "grad_norm": 0.13921638156019356, + "learning_rate": 2.047796528034063e-06, + "loss": 2.6517, + "step": 44351 + }, + { + "epoch": 2.75324352846235, + "grad_norm": 0.13069011645158501, + "learning_rate": 2.046773663971219e-06, + "loss": 2.7231, + "step": 44352 + }, + { + "epoch": 2.753305605562108, + "grad_norm": 0.13155062850706314, + "learning_rate": 2.0457510500919984e-06, + "loss": 2.6494, + "step": 44353 + }, + { + "epoch": 2.753367682661866, + "grad_norm": 0.14816637123374893, + "learning_rate": 2.0447286864017424e-06, + "loss": 2.7294, + "step": 44354 + }, + { + "epoch": 2.753429759761624, + "grad_norm": 0.14447653224397908, + "learning_rate": 2.0437065729057736e-06, + "loss": 2.7114, + "step": 44355 + }, + { + "epoch": 2.753491836861382, + "grad_norm": 0.12812896670039464, + "learning_rate": 2.042684709609427e-06, + "loss": 2.7239, + "step": 44356 + }, + { + "epoch": 2.75355391396114, + "grad_norm": 0.14829138415670465, + "learning_rate": 2.0416630965180426e-06, + "loss": 2.6648, + "step": 44357 + }, + { + "epoch": 2.7536159910608977, + "grad_norm": 0.12828897299617356, + "learning_rate": 2.0406417336369388e-06, + "loss": 2.6535, + "step": 44358 + }, + { + "epoch": 2.7536780681606556, + "grad_norm": 0.12943588171923082, + "learning_rate": 2.039620620971455e-06, + "loss": 2.565, + "step": 44359 + }, + { + "epoch": 2.7537401452604136, + "grad_norm": 0.13076640142783932, + "learning_rate": 2.03859975852691e-06, + "loss": 2.6853, + "step": 44360 + }, + { + "epoch": 2.7538022223601715, + "grad_norm": 0.13325443789694455, + "learning_rate": 2.0375791463086268e-06, + "loss": 2.6796, + "step": 44361 + }, + { + "epoch": 2.7538642994599294, + "grad_norm": 0.1492333347622596, + "learning_rate": 2.036558784321946e-06, + "loss": 2.7086, + "step": 44362 + }, + { + "epoch": 2.7539263765596873, + "grad_norm": 0.14663485109711952, + "learning_rate": 2.035538672572179e-06, + "loss": 2.6628, + "step": 44363 + }, + { + "epoch": 2.753988453659445, + "grad_norm": 0.14448256574793103, + "learning_rate": 2.0345188110646505e-06, + "loss": 2.7375, + "step": 44364 + }, + { + "epoch": 2.754050530759203, + "grad_norm": 0.13455932832577686, + "learning_rate": 2.033499199804678e-06, + "loss": 2.7144, + "step": 44365 + }, + { + "epoch": 2.7541126078589606, + "grad_norm": 0.14735371436632097, + "learning_rate": 2.0324798387975907e-06, + "loss": 2.6127, + "step": 44366 + }, + { + "epoch": 2.754174684958719, + "grad_norm": 0.1306193983893791, + "learning_rate": 2.0314607280486953e-06, + "loss": 2.7119, + "step": 44367 + }, + { + "epoch": 2.7542367620584765, + "grad_norm": 0.14126198269435664, + "learning_rate": 2.030441867563321e-06, + "loss": 2.6301, + "step": 44368 + }, + { + "epoch": 2.7542988391582344, + "grad_norm": 0.1310917868597924, + "learning_rate": 2.029423257346763e-06, + "loss": 2.6589, + "step": 44369 + }, + { + "epoch": 2.7543609162579923, + "grad_norm": 0.13660045683328498, + "learning_rate": 2.0284048974043623e-06, + "loss": 2.6065, + "step": 44370 + }, + { + "epoch": 2.75442299335775, + "grad_norm": 0.13593509058077652, + "learning_rate": 2.027386787741414e-06, + "loss": 2.739, + "step": 44371 + }, + { + "epoch": 2.754485070457508, + "grad_norm": 0.13531684099859137, + "learning_rate": 2.026368928363237e-06, + "loss": 2.7318, + "step": 44372 + }, + { + "epoch": 2.754547147557266, + "grad_norm": 0.13340808057210454, + "learning_rate": 2.0253513192751373e-06, + "loss": 2.6988, + "step": 44373 + }, + { + "epoch": 2.754609224657024, + "grad_norm": 0.1398356190240981, + "learning_rate": 2.024333960482422e-06, + "loss": 2.7649, + "step": 44374 + }, + { + "epoch": 2.754671301756782, + "grad_norm": 0.1601261366469565, + "learning_rate": 2.0233168519904034e-06, + "loss": 2.6161, + "step": 44375 + }, + { + "epoch": 2.75473337885654, + "grad_norm": 0.13756125938674155, + "learning_rate": 2.022299993804394e-06, + "loss": 2.6542, + "step": 44376 + }, + { + "epoch": 2.7547954559562977, + "grad_norm": 0.1317830792178666, + "learning_rate": 2.0212833859296896e-06, + "loss": 2.6569, + "step": 44377 + }, + { + "epoch": 2.7548575330560556, + "grad_norm": 0.14032221356946525, + "learning_rate": 2.020267028371592e-06, + "loss": 2.6931, + "step": 44378 + }, + { + "epoch": 2.7549196101558135, + "grad_norm": 0.13452075051109233, + "learning_rate": 2.0192509211354127e-06, + "loss": 2.6499, + "step": 44379 + }, + { + "epoch": 2.7549816872555715, + "grad_norm": 0.1506117458467569, + "learning_rate": 2.0182350642264536e-06, + "loss": 2.7682, + "step": 44380 + }, + { + "epoch": 2.7550437643553294, + "grad_norm": 0.16164419925253953, + "learning_rate": 2.017219457649999e-06, + "loss": 2.6824, + "step": 44381 + }, + { + "epoch": 2.7551058414550873, + "grad_norm": 0.1359543069227581, + "learning_rate": 2.0162041014113677e-06, + "loss": 2.6879, + "step": 44382 + }, + { + "epoch": 2.755167918554845, + "grad_norm": 0.14546742614856653, + "learning_rate": 2.0151889955158488e-06, + "loss": 2.6739, + "step": 44383 + }, + { + "epoch": 2.755229995654603, + "grad_norm": 0.15207553752640812, + "learning_rate": 2.014174139968733e-06, + "loss": 2.656, + "step": 44384 + }, + { + "epoch": 2.755292072754361, + "grad_norm": 0.13360406796726207, + "learning_rate": 2.013159534775322e-06, + "loss": 2.6058, + "step": 44385 + }, + { + "epoch": 2.755354149854119, + "grad_norm": 0.13573588898680466, + "learning_rate": 2.0121451799409107e-06, + "loss": 2.6305, + "step": 44386 + }, + { + "epoch": 2.7554162269538764, + "grad_norm": 0.13285936903008613, + "learning_rate": 2.0111310754707846e-06, + "loss": 2.736, + "step": 44387 + }, + { + "epoch": 2.755478304053635, + "grad_norm": 0.13598641030875827, + "learning_rate": 2.010117221370228e-06, + "loss": 2.7398, + "step": 44388 + }, + { + "epoch": 2.7555403811533923, + "grad_norm": 0.136285258114592, + "learning_rate": 2.0091036176445534e-06, + "loss": 2.6602, + "step": 44389 + }, + { + "epoch": 2.7556024582531506, + "grad_norm": 0.13640293702846693, + "learning_rate": 2.0080902642990286e-06, + "loss": 2.6193, + "step": 44390 + }, + { + "epoch": 2.755664535352908, + "grad_norm": 0.13276240862757122, + "learning_rate": 2.0070771613389494e-06, + "loss": 2.6231, + "step": 44391 + }, + { + "epoch": 2.7557266124526665, + "grad_norm": 0.1296930719810206, + "learning_rate": 2.0060643087695954e-06, + "loss": 2.6043, + "step": 44392 + }, + { + "epoch": 2.755788689552424, + "grad_norm": 0.13370500833937576, + "learning_rate": 2.0050517065962617e-06, + "loss": 2.7409, + "step": 44393 + }, + { + "epoch": 2.755850766652182, + "grad_norm": 0.14376811944750056, + "learning_rate": 2.004039354824222e-06, + "loss": 2.705, + "step": 44394 + }, + { + "epoch": 2.75591284375194, + "grad_norm": 0.1490308109824109, + "learning_rate": 2.0030272534587613e-06, + "loss": 2.7383, + "step": 44395 + }, + { + "epoch": 2.7559749208516977, + "grad_norm": 0.13742725170952813, + "learning_rate": 2.0020154025051584e-06, + "loss": 2.81, + "step": 44396 + }, + { + "epoch": 2.7560369979514556, + "grad_norm": 0.1349206461295495, + "learning_rate": 2.0010038019686873e-06, + "loss": 2.7577, + "step": 44397 + }, + { + "epoch": 2.7560990750512135, + "grad_norm": 0.13273690680996275, + "learning_rate": 1.9999924518546378e-06, + "loss": 2.7995, + "step": 44398 + }, + { + "epoch": 2.7561611521509715, + "grad_norm": 0.13632201123669896, + "learning_rate": 1.9989813521682774e-06, + "loss": 2.5713, + "step": 44399 + }, + { + "epoch": 2.7562232292507294, + "grad_norm": 0.1366904501917109, + "learning_rate": 1.9979705029148863e-06, + "loss": 2.6152, + "step": 44400 + }, + { + "epoch": 2.7562853063504873, + "grad_norm": 0.13441214297630427, + "learning_rate": 1.996959904099732e-06, + "loss": 2.7132, + "step": 44401 + }, + { + "epoch": 2.756347383450245, + "grad_norm": 0.15272335905461756, + "learning_rate": 1.995949555728094e-06, + "loss": 2.7697, + "step": 44402 + }, + { + "epoch": 2.756409460550003, + "grad_norm": 0.1371953898205313, + "learning_rate": 1.9949394578052395e-06, + "loss": 2.7719, + "step": 44403 + }, + { + "epoch": 2.756471537649761, + "grad_norm": 0.1594977798808156, + "learning_rate": 1.9939296103364425e-06, + "loss": 2.7249, + "step": 44404 + }, + { + "epoch": 2.756533614749519, + "grad_norm": 0.13309157726541285, + "learning_rate": 1.99292001332696e-06, + "loss": 2.7004, + "step": 44405 + }, + { + "epoch": 2.756595691849277, + "grad_norm": 0.14878393202930043, + "learning_rate": 1.9919106667820764e-06, + "loss": 2.7407, + "step": 44406 + }, + { + "epoch": 2.756657768949035, + "grad_norm": 0.13318924202265334, + "learning_rate": 1.9909015707070487e-06, + "loss": 2.7736, + "step": 44407 + }, + { + "epoch": 2.7567198460487927, + "grad_norm": 0.13058383599911635, + "learning_rate": 1.98989272510714e-06, + "loss": 2.6475, + "step": 44408 + }, + { + "epoch": 2.7567819231485506, + "grad_norm": 0.13215960578058533, + "learning_rate": 1.988884129987617e-06, + "loss": 2.6797, + "step": 44409 + }, + { + "epoch": 2.7568440002483086, + "grad_norm": 0.13022550390198767, + "learning_rate": 1.9878757853537377e-06, + "loss": 2.6726, + "step": 44410 + }, + { + "epoch": 2.7569060773480665, + "grad_norm": 0.15026121640667459, + "learning_rate": 1.98686769121077e-06, + "loss": 2.7276, + "step": 44411 + }, + { + "epoch": 2.756968154447824, + "grad_norm": 0.13419484465167117, + "learning_rate": 1.98585984756397e-06, + "loss": 2.7018, + "step": 44412 + }, + { + "epoch": 2.7570302315475823, + "grad_norm": 0.14460461275326572, + "learning_rate": 1.9848522544185898e-06, + "loss": 2.6937, + "step": 44413 + }, + { + "epoch": 2.75709230864734, + "grad_norm": 0.12962453301694177, + "learning_rate": 1.983844911779892e-06, + "loss": 2.7087, + "step": 44414 + }, + { + "epoch": 2.757154385747098, + "grad_norm": 0.13459973560342, + "learning_rate": 1.982837819653127e-06, + "loss": 2.6066, + "step": 44415 + }, + { + "epoch": 2.7572164628468556, + "grad_norm": 0.13543027953047918, + "learning_rate": 1.9818309780435584e-06, + "loss": 2.6871, + "step": 44416 + }, + { + "epoch": 2.7572785399466135, + "grad_norm": 0.136353263067579, + "learning_rate": 1.9808243869564424e-06, + "loss": 2.6594, + "step": 44417 + }, + { + "epoch": 2.7573406170463715, + "grad_norm": 0.15391584017932206, + "learning_rate": 1.9798180463970138e-06, + "loss": 2.6911, + "step": 44418 + }, + { + "epoch": 2.7574026941461294, + "grad_norm": 0.13312627387668596, + "learning_rate": 1.9788119563705297e-06, + "loss": 2.6786, + "step": 44419 + }, + { + "epoch": 2.7574647712458873, + "grad_norm": 0.12870416551062613, + "learning_rate": 1.977806116882247e-06, + "loss": 2.7052, + "step": 44420 + }, + { + "epoch": 2.757526848345645, + "grad_norm": 0.15216910915953608, + "learning_rate": 1.976800527937411e-06, + "loss": 2.748, + "step": 44421 + }, + { + "epoch": 2.757588925445403, + "grad_norm": 0.1337810012727745, + "learning_rate": 1.9757951895412575e-06, + "loss": 2.6964, + "step": 44422 + }, + { + "epoch": 2.757651002545161, + "grad_norm": 0.13302884992800604, + "learning_rate": 1.974790101699042e-06, + "loss": 2.7649, + "step": 44423 + }, + { + "epoch": 2.757713079644919, + "grad_norm": 0.13339608666511457, + "learning_rate": 1.9737852644160003e-06, + "loss": 2.7586, + "step": 44424 + }, + { + "epoch": 2.757775156744677, + "grad_norm": 0.13144200421459487, + "learning_rate": 1.972780677697389e-06, + "loss": 2.7084, + "step": 44425 + }, + { + "epoch": 2.757837233844435, + "grad_norm": 0.13600957045207024, + "learning_rate": 1.9717763415484313e-06, + "loss": 2.7149, + "step": 44426 + }, + { + "epoch": 2.7578993109441927, + "grad_norm": 0.12678441061029883, + "learning_rate": 1.9707722559743844e-06, + "loss": 2.6219, + "step": 44427 + }, + { + "epoch": 2.7579613880439506, + "grad_norm": 0.13408846662599153, + "learning_rate": 1.969768420980467e-06, + "loss": 2.6663, + "step": 44428 + }, + { + "epoch": 2.7580234651437086, + "grad_norm": 0.13586176264169086, + "learning_rate": 1.9687648365719345e-06, + "loss": 2.8169, + "step": 44429 + }, + { + "epoch": 2.7580855422434665, + "grad_norm": 0.13267621593272166, + "learning_rate": 1.9677615027540176e-06, + "loss": 2.7988, + "step": 44430 + }, + { + "epoch": 2.7581476193432244, + "grad_norm": 0.14227481482422863, + "learning_rate": 1.9667584195319443e-06, + "loss": 2.6783, + "step": 44431 + }, + { + "epoch": 2.7582096964429823, + "grad_norm": 0.1332771853288904, + "learning_rate": 1.9657555869109555e-06, + "loss": 2.7242, + "step": 44432 + }, + { + "epoch": 2.7582717735427402, + "grad_norm": 0.14046754233973047, + "learning_rate": 1.964753004896275e-06, + "loss": 2.678, + "step": 44433 + }, + { + "epoch": 2.758333850642498, + "grad_norm": 0.1324193619458684, + "learning_rate": 1.9637506734931475e-06, + "loss": 2.6536, + "step": 44434 + }, + { + "epoch": 2.7583959277422556, + "grad_norm": 0.136300817394342, + "learning_rate": 1.962748592706792e-06, + "loss": 2.7718, + "step": 44435 + }, + { + "epoch": 2.758458004842014, + "grad_norm": 0.150372411194696, + "learning_rate": 1.961746762542438e-06, + "loss": 2.7096, + "step": 44436 + }, + { + "epoch": 2.7585200819417715, + "grad_norm": 0.13115140725694058, + "learning_rate": 1.9607451830053027e-06, + "loss": 2.7476, + "step": 44437 + }, + { + "epoch": 2.75858215904153, + "grad_norm": 0.1407606940445879, + "learning_rate": 1.959743854100632e-06, + "loss": 2.6791, + "step": 44438 + }, + { + "epoch": 2.7586442361412873, + "grad_norm": 0.1309812817745841, + "learning_rate": 1.958742775833644e-06, + "loss": 2.7387, + "step": 44439 + }, + { + "epoch": 2.7587063132410456, + "grad_norm": 0.1339305978292478, + "learning_rate": 1.9577419482095517e-06, + "loss": 2.7642, + "step": 44440 + }, + { + "epoch": 2.758768390340803, + "grad_norm": 0.13767568299539898, + "learning_rate": 1.956741371233578e-06, + "loss": 2.7105, + "step": 44441 + }, + { + "epoch": 2.758830467440561, + "grad_norm": 0.1418094227496048, + "learning_rate": 1.9557410449109526e-06, + "loss": 2.6654, + "step": 44442 + }, + { + "epoch": 2.758892544540319, + "grad_norm": 0.14490509655830475, + "learning_rate": 1.9547409692468876e-06, + "loss": 2.6713, + "step": 44443 + }, + { + "epoch": 2.758954621640077, + "grad_norm": 0.14033265322744512, + "learning_rate": 1.953741144246607e-06, + "loss": 2.6006, + "step": 44444 + }, + { + "epoch": 2.759016698739835, + "grad_norm": 0.13344212957297663, + "learning_rate": 1.9527415699153172e-06, + "loss": 2.7131, + "step": 44445 + }, + { + "epoch": 2.7590787758395927, + "grad_norm": 0.1317683576519669, + "learning_rate": 1.951742246258237e-06, + "loss": 2.7111, + "step": 44446 + }, + { + "epoch": 2.7591408529393506, + "grad_norm": 0.1355780975044577, + "learning_rate": 1.950743173280578e-06, + "loss": 2.7168, + "step": 44447 + }, + { + "epoch": 2.7592029300391085, + "grad_norm": 0.14141074510669177, + "learning_rate": 1.9497443509875645e-06, + "loss": 2.8455, + "step": 44448 + }, + { + "epoch": 2.7592650071388665, + "grad_norm": 0.13137739759475062, + "learning_rate": 1.9487457793844033e-06, + "loss": 2.7104, + "step": 44449 + }, + { + "epoch": 2.7593270842386244, + "grad_norm": 0.131432027556374, + "learning_rate": 1.9477474584762955e-06, + "loss": 2.691, + "step": 44450 + }, + { + "epoch": 2.7593891613383823, + "grad_norm": 0.13041239166514867, + "learning_rate": 1.9467493882684484e-06, + "loss": 2.7743, + "step": 44451 + }, + { + "epoch": 2.75945123843814, + "grad_norm": 0.14396439041565218, + "learning_rate": 1.94575156876608e-06, + "loss": 2.7669, + "step": 44452 + }, + { + "epoch": 2.759513315537898, + "grad_norm": 0.13908666041871523, + "learning_rate": 1.9447539999743912e-06, + "loss": 2.7171, + "step": 44453 + }, + { + "epoch": 2.759575392637656, + "grad_norm": 0.1401005874469675, + "learning_rate": 1.9437566818985895e-06, + "loss": 2.6882, + "step": 44454 + }, + { + "epoch": 2.759637469737414, + "grad_norm": 0.1453429318423261, + "learning_rate": 1.9427596145438656e-06, + "loss": 2.6996, + "step": 44455 + }, + { + "epoch": 2.759699546837172, + "grad_norm": 0.1302771569197173, + "learning_rate": 1.9417627979154417e-06, + "loss": 2.6212, + "step": 44456 + }, + { + "epoch": 2.75976162393693, + "grad_norm": 0.13753655256664263, + "learning_rate": 1.940766232018504e-06, + "loss": 2.734, + "step": 44457 + }, + { + "epoch": 2.7598237010366877, + "grad_norm": 0.1456051507302898, + "learning_rate": 1.9397699168582584e-06, + "loss": 2.7697, + "step": 44458 + }, + { + "epoch": 2.7598857781364456, + "grad_norm": 0.14009511642461905, + "learning_rate": 1.9387738524398955e-06, + "loss": 2.6593, + "step": 44459 + }, + { + "epoch": 2.759947855236203, + "grad_norm": 0.1292535625760893, + "learning_rate": 1.9377780387686172e-06, + "loss": 2.6325, + "step": 44460 + }, + { + "epoch": 2.7600099323359615, + "grad_norm": 0.13590576415283206, + "learning_rate": 1.936782475849619e-06, + "loss": 2.6638, + "step": 44461 + }, + { + "epoch": 2.760072009435719, + "grad_norm": 0.1393635585070153, + "learning_rate": 1.9357871636880964e-06, + "loss": 2.7167, + "step": 44462 + }, + { + "epoch": 2.7601340865354773, + "grad_norm": 0.1316515799132586, + "learning_rate": 1.93479210228924e-06, + "loss": 2.6668, + "step": 44463 + }, + { + "epoch": 2.760196163635235, + "grad_norm": 0.14319469441656096, + "learning_rate": 1.9337972916582404e-06, + "loss": 2.7155, + "step": 44464 + }, + { + "epoch": 2.7602582407349927, + "grad_norm": 0.1385900901001543, + "learning_rate": 1.932802731800287e-06, + "loss": 2.6774, + "step": 44465 + }, + { + "epoch": 2.7603203178347506, + "grad_norm": 0.13225600113530503, + "learning_rate": 1.931808422720577e-06, + "loss": 2.6341, + "step": 44466 + }, + { + "epoch": 2.7603823949345085, + "grad_norm": 0.13642250390236066, + "learning_rate": 1.9308143644242883e-06, + "loss": 2.7045, + "step": 44467 + }, + { + "epoch": 2.7604444720342665, + "grad_norm": 0.13028345484159193, + "learning_rate": 1.9298205569166063e-06, + "loss": 2.6521, + "step": 44468 + }, + { + "epoch": 2.7605065491340244, + "grad_norm": 0.13094987159859625, + "learning_rate": 1.928827000202721e-06, + "loss": 2.653, + "step": 44469 + }, + { + "epoch": 2.7605686262337823, + "grad_norm": 0.13533103203676447, + "learning_rate": 1.927833694287817e-06, + "loss": 2.7121, + "step": 44470 + }, + { + "epoch": 2.76063070333354, + "grad_norm": 0.15260220880087896, + "learning_rate": 1.9268406391770744e-06, + "loss": 2.6892, + "step": 44471 + }, + { + "epoch": 2.760692780433298, + "grad_norm": 0.1491164628304703, + "learning_rate": 1.925847834875677e-06, + "loss": 2.6943, + "step": 44472 + }, + { + "epoch": 2.760754857533056, + "grad_norm": 0.13508617591141928, + "learning_rate": 1.924855281388793e-06, + "loss": 2.7684, + "step": 44473 + }, + { + "epoch": 2.760816934632814, + "grad_norm": 0.14492829018143835, + "learning_rate": 1.9238629787216133e-06, + "loss": 2.7718, + "step": 44474 + }, + { + "epoch": 2.760879011732572, + "grad_norm": 0.13157141957661864, + "learning_rate": 1.9228709268793164e-06, + "loss": 2.7179, + "step": 44475 + }, + { + "epoch": 2.76094108883233, + "grad_norm": 0.13452305593801475, + "learning_rate": 1.921879125867071e-06, + "loss": 2.6873, + "step": 44476 + }, + { + "epoch": 2.7610031659320877, + "grad_norm": 0.13423767891166907, + "learning_rate": 1.9208875756900502e-06, + "loss": 2.7276, + "step": 44477 + }, + { + "epoch": 2.7610652430318456, + "grad_norm": 0.13241480439925968, + "learning_rate": 1.9198962763534335e-06, + "loss": 2.6818, + "step": 44478 + }, + { + "epoch": 2.7611273201316036, + "grad_norm": 0.13423197586745006, + "learning_rate": 1.9189052278623833e-06, + "loss": 2.5884, + "step": 44479 + }, + { + "epoch": 2.7611893972313615, + "grad_norm": 0.1354872484457864, + "learning_rate": 1.9179144302220843e-06, + "loss": 2.6007, + "step": 44480 + }, + { + "epoch": 2.7612514743311194, + "grad_norm": 0.13879801157930785, + "learning_rate": 1.9169238834376934e-06, + "loss": 2.6295, + "step": 44481 + }, + { + "epoch": 2.7613135514308773, + "grad_norm": 0.14376045582933872, + "learning_rate": 1.91593358751439e-06, + "loss": 2.6674, + "step": 44482 + }, + { + "epoch": 2.761375628530635, + "grad_norm": 0.1347499091894732, + "learning_rate": 1.914943542457326e-06, + "loss": 2.6438, + "step": 44483 + }, + { + "epoch": 2.761437705630393, + "grad_norm": 0.13555300860860023, + "learning_rate": 1.9139537482716796e-06, + "loss": 2.7464, + "step": 44484 + }, + { + "epoch": 2.7614997827301506, + "grad_norm": 0.13794401043140583, + "learning_rate": 1.9129642049626084e-06, + "loss": 2.6445, + "step": 44485 + }, + { + "epoch": 2.761561859829909, + "grad_norm": 0.13583643817814825, + "learning_rate": 1.9119749125352805e-06, + "loss": 2.6791, + "step": 44486 + }, + { + "epoch": 2.7616239369296665, + "grad_norm": 0.12781877295949143, + "learning_rate": 1.9109858709948412e-06, + "loss": 2.6906, + "step": 44487 + }, + { + "epoch": 2.761686014029425, + "grad_norm": 0.13074916497295758, + "learning_rate": 1.909997080346476e-06, + "loss": 2.6483, + "step": 44488 + }, + { + "epoch": 2.7617480911291823, + "grad_norm": 0.13066295571991754, + "learning_rate": 1.909008540595325e-06, + "loss": 2.7797, + "step": 44489 + }, + { + "epoch": 2.76181016822894, + "grad_norm": 0.15005175075297741, + "learning_rate": 1.9080202517465508e-06, + "loss": 2.7184, + "step": 44490 + }, + { + "epoch": 2.761872245328698, + "grad_norm": 0.131780224083518, + "learning_rate": 1.9070322138053043e-06, + "loss": 2.7065, + "step": 44491 + }, + { + "epoch": 2.761934322428456, + "grad_norm": 0.14855537001126767, + "learning_rate": 1.906044426776754e-06, + "loss": 2.6941, + "step": 44492 + }, + { + "epoch": 2.761996399528214, + "grad_norm": 0.1372782430781896, + "learning_rate": 1.905056890666046e-06, + "loss": 2.6612, + "step": 44493 + }, + { + "epoch": 2.762058476627972, + "grad_norm": 0.13673999485074934, + "learning_rate": 1.9040696054783257e-06, + "loss": 2.6416, + "step": 44494 + }, + { + "epoch": 2.76212055372773, + "grad_norm": 0.14186312753937047, + "learning_rate": 1.903082571218756e-06, + "loss": 2.7252, + "step": 44495 + }, + { + "epoch": 2.7621826308274877, + "grad_norm": 0.12579313127308014, + "learning_rate": 1.9020957878924716e-06, + "loss": 2.6084, + "step": 44496 + }, + { + "epoch": 2.7622447079272456, + "grad_norm": 0.13966696289536754, + "learning_rate": 1.901109255504635e-06, + "loss": 2.6977, + "step": 44497 + }, + { + "epoch": 2.7623067850270036, + "grad_norm": 0.1349695639054704, + "learning_rate": 1.9001229740603921e-06, + "loss": 2.675, + "step": 44498 + }, + { + "epoch": 2.7623688621267615, + "grad_norm": 0.12948749010130395, + "learning_rate": 1.8991369435648776e-06, + "loss": 2.7701, + "step": 44499 + }, + { + "epoch": 2.7624309392265194, + "grad_norm": 0.1488556681199783, + "learning_rate": 1.898151164023243e-06, + "loss": 2.6591, + "step": 44500 + }, + { + "epoch": 2.7624930163262773, + "grad_norm": 0.13056152522936199, + "learning_rate": 1.8971656354406341e-06, + "loss": 2.7057, + "step": 44501 + }, + { + "epoch": 2.7625550934260352, + "grad_norm": 0.13246590605281672, + "learning_rate": 1.896180357822186e-06, + "loss": 2.68, + "step": 44502 + }, + { + "epoch": 2.762617170525793, + "grad_norm": 0.12984387810903902, + "learning_rate": 1.8951953311730497e-06, + "loss": 2.6458, + "step": 44503 + }, + { + "epoch": 2.762679247625551, + "grad_norm": 0.13943101723723478, + "learning_rate": 1.8942105554983547e-06, + "loss": 2.6502, + "step": 44504 + }, + { + "epoch": 2.762741324725309, + "grad_norm": 0.13210763837641806, + "learning_rate": 1.89322603080323e-06, + "loss": 2.6515, + "step": 44505 + }, + { + "epoch": 2.762803401825067, + "grad_norm": 0.144559625469215, + "learning_rate": 1.892241757092833e-06, + "loss": 2.7112, + "step": 44506 + }, + { + "epoch": 2.762865478924825, + "grad_norm": 0.13551338040307176, + "learning_rate": 1.8912577343722926e-06, + "loss": 2.6743, + "step": 44507 + }, + { + "epoch": 2.7629275560245823, + "grad_norm": 0.15107073517859745, + "learning_rate": 1.8902739626467382e-06, + "loss": 2.7598, + "step": 44508 + }, + { + "epoch": 2.7629896331243406, + "grad_norm": 0.13246355779199734, + "learning_rate": 1.8892904419212932e-06, + "loss": 2.7575, + "step": 44509 + }, + { + "epoch": 2.763051710224098, + "grad_norm": 0.1299044655555029, + "learning_rate": 1.8883071722011093e-06, + "loss": 2.6576, + "step": 44510 + }, + { + "epoch": 2.7631137873238565, + "grad_norm": 0.13370961245728066, + "learning_rate": 1.8873241534913045e-06, + "loss": 2.7065, + "step": 44511 + }, + { + "epoch": 2.763175864423614, + "grad_norm": 0.13848102895961206, + "learning_rate": 1.8863413857970025e-06, + "loss": 2.6731, + "step": 44512 + }, + { + "epoch": 2.763237941523372, + "grad_norm": 0.14302554539778362, + "learning_rate": 1.8853588691233493e-06, + "loss": 2.6703, + "step": 44513 + }, + { + "epoch": 2.76330001862313, + "grad_norm": 0.14827487218343513, + "learning_rate": 1.8843766034754462e-06, + "loss": 2.7153, + "step": 44514 + }, + { + "epoch": 2.7633620957228877, + "grad_norm": 0.1342584700758377, + "learning_rate": 1.8833945888584392e-06, + "loss": 2.7528, + "step": 44515 + }, + { + "epoch": 2.7634241728226456, + "grad_norm": 0.1391515494914178, + "learning_rate": 1.8824128252774465e-06, + "loss": 2.7717, + "step": 44516 + }, + { + "epoch": 2.7634862499224035, + "grad_norm": 0.13026947175585393, + "learning_rate": 1.8814313127375804e-06, + "loss": 2.6489, + "step": 44517 + }, + { + "epoch": 2.7635483270221615, + "grad_norm": 0.13239611541103724, + "learning_rate": 1.880450051243976e-06, + "loss": 2.6673, + "step": 44518 + }, + { + "epoch": 2.7636104041219194, + "grad_norm": 0.13509310448999937, + "learning_rate": 1.8794690408017346e-06, + "loss": 2.7278, + "step": 44519 + }, + { + "epoch": 2.7636724812216773, + "grad_norm": 0.13487280163595233, + "learning_rate": 1.878488281415991e-06, + "loss": 2.642, + "step": 44520 + }, + { + "epoch": 2.763734558321435, + "grad_norm": 0.13283880057500055, + "learning_rate": 1.8775077730918577e-06, + "loss": 2.773, + "step": 44521 + }, + { + "epoch": 2.763796635421193, + "grad_norm": 0.13168792186238593, + "learning_rate": 1.8765275158344474e-06, + "loss": 2.676, + "step": 44522 + }, + { + "epoch": 2.763858712520951, + "grad_norm": 0.15143399763800053, + "learning_rate": 1.8755475096488672e-06, + "loss": 2.7292, + "step": 44523 + }, + { + "epoch": 2.763920789620709, + "grad_norm": 0.13335500040684145, + "learning_rate": 1.8745677545402462e-06, + "loss": 2.6578, + "step": 44524 + }, + { + "epoch": 2.763982866720467, + "grad_norm": 0.13210830809863408, + "learning_rate": 1.8735882505136915e-06, + "loss": 2.7574, + "step": 44525 + }, + { + "epoch": 2.764044943820225, + "grad_norm": 0.13597174657037195, + "learning_rate": 1.8726089975743044e-06, + "loss": 2.757, + "step": 44526 + }, + { + "epoch": 2.7641070209199827, + "grad_norm": 0.15345696112208873, + "learning_rate": 1.8716299957271978e-06, + "loss": 2.791, + "step": 44527 + }, + { + "epoch": 2.7641690980197406, + "grad_norm": 0.13205034693463302, + "learning_rate": 1.8706512449774837e-06, + "loss": 2.7803, + "step": 44528 + }, + { + "epoch": 2.7642311751194986, + "grad_norm": 0.1426201588580523, + "learning_rate": 1.8696727453302643e-06, + "loss": 2.6099, + "step": 44529 + }, + { + "epoch": 2.7642932522192565, + "grad_norm": 0.13141465871172323, + "learning_rate": 1.8686944967906516e-06, + "loss": 2.61, + "step": 44530 + }, + { + "epoch": 2.764355329319014, + "grad_norm": 0.13174315826007685, + "learning_rate": 1.867716499363742e-06, + "loss": 2.6918, + "step": 44531 + }, + { + "epoch": 2.7644174064187723, + "grad_norm": 0.12979258364628535, + "learning_rate": 1.866738753054631e-06, + "loss": 2.6898, + "step": 44532 + }, + { + "epoch": 2.76447948351853, + "grad_norm": 0.1366131715474956, + "learning_rate": 1.865761257868437e-06, + "loss": 2.6468, + "step": 44533 + }, + { + "epoch": 2.764541560618288, + "grad_norm": 0.15504720989876564, + "learning_rate": 1.8647840138102557e-06, + "loss": 2.6268, + "step": 44534 + }, + { + "epoch": 2.7646036377180456, + "grad_norm": 0.13490131016025056, + "learning_rate": 1.8638070208851777e-06, + "loss": 2.6456, + "step": 44535 + }, + { + "epoch": 2.764665714817804, + "grad_norm": 0.15302950012206776, + "learning_rate": 1.8628302790982933e-06, + "loss": 2.6867, + "step": 44536 + }, + { + "epoch": 2.7647277919175615, + "grad_norm": 0.14991223430000883, + "learning_rate": 1.8618537884547205e-06, + "loss": 2.7142, + "step": 44537 + }, + { + "epoch": 2.7647898690173194, + "grad_norm": 0.13955404519387798, + "learning_rate": 1.8608775489595443e-06, + "loss": 2.7122, + "step": 44538 + }, + { + "epoch": 2.7648519461170773, + "grad_norm": 0.13125289278021432, + "learning_rate": 1.8599015606178495e-06, + "loss": 2.6856, + "step": 44539 + }, + { + "epoch": 2.764914023216835, + "grad_norm": 0.13130056678696125, + "learning_rate": 1.8589258234347317e-06, + "loss": 2.6318, + "step": 44540 + }, + { + "epoch": 2.764976100316593, + "grad_norm": 0.13542268048895587, + "learning_rate": 1.8579503374152928e-06, + "loss": 2.7108, + "step": 44541 + }, + { + "epoch": 2.765038177416351, + "grad_norm": 0.13478948547769193, + "learning_rate": 1.8569751025646121e-06, + "loss": 2.724, + "step": 44542 + }, + { + "epoch": 2.765100254516109, + "grad_norm": 0.13018665769087526, + "learning_rate": 1.8560001188877796e-06, + "loss": 2.5356, + "step": 44543 + }, + { + "epoch": 2.765162331615867, + "grad_norm": 0.127278402739648, + "learning_rate": 1.8550253863898859e-06, + "loss": 2.7018, + "step": 44544 + }, + { + "epoch": 2.765224408715625, + "grad_norm": 0.13151173465055735, + "learning_rate": 1.854050905075999e-06, + "loss": 2.6702, + "step": 44545 + }, + { + "epoch": 2.7652864858153827, + "grad_norm": 0.13122032687272434, + "learning_rate": 1.8530766749512207e-06, + "loss": 2.7194, + "step": 44546 + }, + { + "epoch": 2.7653485629151406, + "grad_norm": 0.13287938761756396, + "learning_rate": 1.8521026960206355e-06, + "loss": 2.7138, + "step": 44547 + }, + { + "epoch": 2.7654106400148986, + "grad_norm": 0.13341096369631608, + "learning_rate": 1.8511289682893174e-06, + "loss": 2.6878, + "step": 44548 + }, + { + "epoch": 2.7654727171146565, + "grad_norm": 0.13317613747800042, + "learning_rate": 1.8501554917623454e-06, + "loss": 2.6528, + "step": 44549 + }, + { + "epoch": 2.7655347942144144, + "grad_norm": 0.151962181449036, + "learning_rate": 1.8491822664447989e-06, + "loss": 2.7195, + "step": 44550 + }, + { + "epoch": 2.7655968713141723, + "grad_norm": 0.12999327701827545, + "learning_rate": 1.8482092923417627e-06, + "loss": 2.7033, + "step": 44551 + }, + { + "epoch": 2.7656589484139302, + "grad_norm": 0.13968947947889607, + "learning_rate": 1.8472365694583105e-06, + "loss": 2.7099, + "step": 44552 + }, + { + "epoch": 2.765721025513688, + "grad_norm": 0.15304728818042201, + "learning_rate": 1.8462640977995105e-06, + "loss": 2.7403, + "step": 44553 + }, + { + "epoch": 2.765783102613446, + "grad_norm": 0.1285645985846995, + "learning_rate": 1.845291877370442e-06, + "loss": 2.7756, + "step": 44554 + }, + { + "epoch": 2.765845179713204, + "grad_norm": 0.1475294156262483, + "learning_rate": 1.8443199081761675e-06, + "loss": 2.7985, + "step": 44555 + }, + { + "epoch": 2.7659072568129615, + "grad_norm": 0.1323029898756176, + "learning_rate": 1.8433481902217775e-06, + "loss": 2.7357, + "step": 44556 + }, + { + "epoch": 2.76596933391272, + "grad_norm": 0.1535701780965341, + "learning_rate": 1.8423767235123235e-06, + "loss": 2.7144, + "step": 44557 + }, + { + "epoch": 2.7660314110124773, + "grad_norm": 0.12940682406782925, + "learning_rate": 1.8414055080528846e-06, + "loss": 2.766, + "step": 44558 + }, + { + "epoch": 2.7660934881122357, + "grad_norm": 0.14542014406025988, + "learning_rate": 1.8404345438485183e-06, + "loss": 2.7131, + "step": 44559 + }, + { + "epoch": 2.766155565211993, + "grad_norm": 0.1314040421063414, + "learning_rate": 1.8394638309043034e-06, + "loss": 2.66, + "step": 44560 + }, + { + "epoch": 2.766217642311751, + "grad_norm": 0.1367938108117371, + "learning_rate": 1.838493369225297e-06, + "loss": 2.6818, + "step": 44561 + }, + { + "epoch": 2.766279719411509, + "grad_norm": 0.1437846567006952, + "learning_rate": 1.8375231588165564e-06, + "loss": 2.6762, + "step": 44562 + }, + { + "epoch": 2.766341796511267, + "grad_norm": 0.13139533131354683, + "learning_rate": 1.8365531996831497e-06, + "loss": 2.7606, + "step": 44563 + }, + { + "epoch": 2.766403873611025, + "grad_norm": 0.1354789239259544, + "learning_rate": 1.8355834918301395e-06, + "loss": 2.6684, + "step": 44564 + }, + { + "epoch": 2.7664659507107827, + "grad_norm": 0.1435508338255268, + "learning_rate": 1.8346140352625884e-06, + "loss": 2.6446, + "step": 44565 + }, + { + "epoch": 2.7665280278105406, + "grad_norm": 0.13191533700035304, + "learning_rate": 1.8336448299855425e-06, + "loss": 2.7235, + "step": 44566 + }, + { + "epoch": 2.7665901049102986, + "grad_norm": 0.1314752448295487, + "learning_rate": 1.832675876004064e-06, + "loss": 2.6899, + "step": 44567 + }, + { + "epoch": 2.7666521820100565, + "grad_norm": 0.1322965839428289, + "learning_rate": 1.831707173323205e-06, + "loss": 2.6929, + "step": 44568 + }, + { + "epoch": 2.7667142591098144, + "grad_norm": 0.13694146952671288, + "learning_rate": 1.8307387219480277e-06, + "loss": 2.6631, + "step": 44569 + }, + { + "epoch": 2.7667763362095723, + "grad_norm": 0.14819302629562225, + "learning_rate": 1.8297705218835836e-06, + "loss": 2.7111, + "step": 44570 + }, + { + "epoch": 2.7668384133093302, + "grad_norm": 0.14122923208841887, + "learning_rate": 1.8288025731349134e-06, + "loss": 2.7427, + "step": 44571 + }, + { + "epoch": 2.766900490409088, + "grad_norm": 0.147712631662973, + "learning_rate": 1.827834875707074e-06, + "loss": 2.6768, + "step": 44572 + }, + { + "epoch": 2.766962567508846, + "grad_norm": 0.13490120660422159, + "learning_rate": 1.8268674296051169e-06, + "loss": 2.6665, + "step": 44573 + }, + { + "epoch": 2.767024644608604, + "grad_norm": 0.13321325901221895, + "learning_rate": 1.8259002348340881e-06, + "loss": 2.7292, + "step": 44574 + }, + { + "epoch": 2.767086721708362, + "grad_norm": 0.14586337381856826, + "learning_rate": 1.8249332913990337e-06, + "loss": 2.73, + "step": 44575 + }, + { + "epoch": 2.76714879880812, + "grad_norm": 0.12951781679494756, + "learning_rate": 1.8239665993049938e-06, + "loss": 2.6971, + "step": 44576 + }, + { + "epoch": 2.7672108759078777, + "grad_norm": 0.13282352289570018, + "learning_rate": 1.823000158557009e-06, + "loss": 2.7088, + "step": 44577 + }, + { + "epoch": 2.7672729530076356, + "grad_norm": 0.1321979920076426, + "learning_rate": 1.8220339691601362e-06, + "loss": 2.6777, + "step": 44578 + }, + { + "epoch": 2.767335030107393, + "grad_norm": 0.14640507140850592, + "learning_rate": 1.8210680311194107e-06, + "loss": 2.7711, + "step": 44579 + }, + { + "epoch": 2.7673971072071515, + "grad_norm": 0.13202453122170424, + "learning_rate": 1.8201023444398667e-06, + "loss": 2.6774, + "step": 44580 + }, + { + "epoch": 2.767459184306909, + "grad_norm": 0.14641892560480269, + "learning_rate": 1.8191369091265453e-06, + "loss": 2.7462, + "step": 44581 + }, + { + "epoch": 2.7675212614066673, + "grad_norm": 0.13359854003617494, + "learning_rate": 1.8181717251844754e-06, + "loss": 2.6382, + "step": 44582 + }, + { + "epoch": 2.767583338506425, + "grad_norm": 0.15014673868793565, + "learning_rate": 1.8172067926187087e-06, + "loss": 2.7153, + "step": 44583 + }, + { + "epoch": 2.7676454156061827, + "grad_norm": 0.14058713933118847, + "learning_rate": 1.8162421114342688e-06, + "loss": 2.7797, + "step": 44584 + }, + { + "epoch": 2.7677074927059406, + "grad_norm": 0.13185309545691898, + "learning_rate": 1.8152776816361904e-06, + "loss": 2.6231, + "step": 44585 + }, + { + "epoch": 2.7677695698056985, + "grad_norm": 0.1326035440587153, + "learning_rate": 1.8143135032294978e-06, + "loss": 2.6443, + "step": 44586 + }, + { + "epoch": 2.7678316469054565, + "grad_norm": 0.12890358401922902, + "learning_rate": 1.8133495762192421e-06, + "loss": 2.6105, + "step": 44587 + }, + { + "epoch": 2.7678937240052144, + "grad_norm": 0.15103321147689241, + "learning_rate": 1.8123859006104305e-06, + "loss": 2.7249, + "step": 44588 + }, + { + "epoch": 2.7679558011049723, + "grad_norm": 0.13295485776954966, + "learning_rate": 1.8114224764081034e-06, + "loss": 2.6541, + "step": 44589 + }, + { + "epoch": 2.76801787820473, + "grad_norm": 0.13395376578706905, + "learning_rate": 1.810459303617279e-06, + "loss": 2.6744, + "step": 44590 + }, + { + "epoch": 2.768079955304488, + "grad_norm": 0.13859306701913338, + "learning_rate": 1.8094963822429922e-06, + "loss": 2.6946, + "step": 44591 + }, + { + "epoch": 2.768142032404246, + "grad_norm": 0.15899954004153713, + "learning_rate": 1.8085337122902613e-06, + "loss": 2.6855, + "step": 44592 + }, + { + "epoch": 2.768204109504004, + "grad_norm": 0.13271271940416013, + "learning_rate": 1.80757129376411e-06, + "loss": 2.6501, + "step": 44593 + }, + { + "epoch": 2.768266186603762, + "grad_norm": 0.13016068699430494, + "learning_rate": 1.806609126669556e-06, + "loss": 2.668, + "step": 44594 + }, + { + "epoch": 2.76832826370352, + "grad_norm": 0.13244152148407284, + "learning_rate": 1.8056472110116185e-06, + "loss": 2.6848, + "step": 44595 + }, + { + "epoch": 2.7683903408032777, + "grad_norm": 0.13348974032795288, + "learning_rate": 1.8046855467953262e-06, + "loss": 2.5948, + "step": 44596 + }, + { + "epoch": 2.7684524179030356, + "grad_norm": 0.14767117702378513, + "learning_rate": 1.8037241340256861e-06, + "loss": 2.7581, + "step": 44597 + }, + { + "epoch": 2.7685144950027936, + "grad_norm": 0.14119637494691928, + "learning_rate": 1.8027629727077166e-06, + "loss": 2.676, + "step": 44598 + }, + { + "epoch": 2.7685765721025515, + "grad_norm": 0.13145949696702414, + "learning_rate": 1.801802062846425e-06, + "loss": 2.688, + "step": 44599 + }, + { + "epoch": 2.7686386492023094, + "grad_norm": 0.1292964275502187, + "learning_rate": 1.8008414044468457e-06, + "loss": 2.6665, + "step": 44600 + }, + { + "epoch": 2.7687007263020673, + "grad_norm": 0.13009403377770495, + "learning_rate": 1.7998809975139698e-06, + "loss": 2.7163, + "step": 44601 + }, + { + "epoch": 2.768762803401825, + "grad_norm": 0.1302828472233526, + "learning_rate": 1.7989208420528204e-06, + "loss": 2.6577, + "step": 44602 + }, + { + "epoch": 2.768824880501583, + "grad_norm": 0.13008092527840656, + "learning_rate": 1.7979609380683993e-06, + "loss": 2.6391, + "step": 44603 + }, + { + "epoch": 2.7688869576013406, + "grad_norm": 0.13577517101871997, + "learning_rate": 1.7970012855657136e-06, + "loss": 2.6333, + "step": 44604 + }, + { + "epoch": 2.768949034701099, + "grad_norm": 0.14032818013151807, + "learning_rate": 1.7960418845497817e-06, + "loss": 2.7071, + "step": 44605 + }, + { + "epoch": 2.7690111118008565, + "grad_norm": 0.1513196378286442, + "learning_rate": 1.7950827350255994e-06, + "loss": 2.8473, + "step": 44606 + }, + { + "epoch": 2.769073188900615, + "grad_norm": 0.1290699973595797, + "learning_rate": 1.7941238369981739e-06, + "loss": 2.6715, + "step": 44607 + }, + { + "epoch": 2.7691352660003723, + "grad_norm": 0.1440366064703228, + "learning_rate": 1.7931651904724955e-06, + "loss": 2.6577, + "step": 44608 + }, + { + "epoch": 2.76919734310013, + "grad_norm": 0.14652779889207038, + "learning_rate": 1.7922067954535882e-06, + "loss": 2.7147, + "step": 44609 + }, + { + "epoch": 2.769259420199888, + "grad_norm": 0.13424281976943103, + "learning_rate": 1.791248651946431e-06, + "loss": 2.6858, + "step": 44610 + }, + { + "epoch": 2.769321497299646, + "grad_norm": 0.127935081834121, + "learning_rate": 1.7902907599560426e-06, + "loss": 2.6685, + "step": 44611 + }, + { + "epoch": 2.769383574399404, + "grad_norm": 0.15167698076000752, + "learning_rate": 1.7893331194874075e-06, + "loss": 2.7471, + "step": 44612 + }, + { + "epoch": 2.769445651499162, + "grad_norm": 0.13168857957481636, + "learning_rate": 1.7883757305455218e-06, + "loss": 2.6479, + "step": 44613 + }, + { + "epoch": 2.76950772859892, + "grad_norm": 0.13137246362059793, + "learning_rate": 1.787418593135387e-06, + "loss": 2.7737, + "step": 44614 + }, + { + "epoch": 2.7695698056986777, + "grad_norm": 0.14118063616100102, + "learning_rate": 1.7864617072619937e-06, + "loss": 2.5873, + "step": 44615 + }, + { + "epoch": 2.7696318827984356, + "grad_norm": 0.13047723342817114, + "learning_rate": 1.785505072930338e-06, + "loss": 2.6875, + "step": 44616 + }, + { + "epoch": 2.7696939598981936, + "grad_norm": 0.132911596073197, + "learning_rate": 1.7845486901454045e-06, + "loss": 2.7578, + "step": 44617 + }, + { + "epoch": 2.7697560369979515, + "grad_norm": 0.1305620602188994, + "learning_rate": 1.7835925589121783e-06, + "loss": 2.5666, + "step": 44618 + }, + { + "epoch": 2.7698181140977094, + "grad_norm": 0.1346308130649181, + "learning_rate": 1.7826366792356607e-06, + "loss": 2.6692, + "step": 44619 + }, + { + "epoch": 2.7698801911974673, + "grad_norm": 0.1443863477417044, + "learning_rate": 1.7816810511208316e-06, + "loss": 2.784, + "step": 44620 + }, + { + "epoch": 2.7699422682972252, + "grad_norm": 0.14341638351451638, + "learning_rate": 1.7807256745726808e-06, + "loss": 2.6638, + "step": 44621 + }, + { + "epoch": 2.770004345396983, + "grad_norm": 0.1296433844850242, + "learning_rate": 1.7797705495961825e-06, + "loss": 2.654, + "step": 44622 + }, + { + "epoch": 2.770066422496741, + "grad_norm": 0.12974337930867552, + "learning_rate": 1.7788156761963326e-06, + "loss": 2.7596, + "step": 44623 + }, + { + "epoch": 2.770128499596499, + "grad_norm": 0.13285681041479636, + "learning_rate": 1.7778610543781048e-06, + "loss": 2.723, + "step": 44624 + }, + { + "epoch": 2.770190576696257, + "grad_norm": 0.1330140491877549, + "learning_rate": 1.7769066841464844e-06, + "loss": 2.671, + "step": 44625 + }, + { + "epoch": 2.770252653796015, + "grad_norm": 0.14646407940455666, + "learning_rate": 1.7759525655064446e-06, + "loss": 2.7465, + "step": 44626 + }, + { + "epoch": 2.7703147308957723, + "grad_norm": 0.12844867121709674, + "learning_rate": 1.7749986984629707e-06, + "loss": 2.5708, + "step": 44627 + }, + { + "epoch": 2.7703768079955307, + "grad_norm": 0.13136163091236838, + "learning_rate": 1.7740450830210365e-06, + "loss": 2.6251, + "step": 44628 + }, + { + "epoch": 2.770438885095288, + "grad_norm": 0.1320962737302377, + "learning_rate": 1.7730917191856156e-06, + "loss": 2.6836, + "step": 44629 + }, + { + "epoch": 2.7705009621950465, + "grad_norm": 0.13646486414746375, + "learning_rate": 1.7721386069616818e-06, + "loss": 2.7213, + "step": 44630 + }, + { + "epoch": 2.770563039294804, + "grad_norm": 0.13124662718526395, + "learning_rate": 1.7711857463542091e-06, + "loss": 2.7077, + "step": 44631 + }, + { + "epoch": 2.770625116394562, + "grad_norm": 0.13827289351305608, + "learning_rate": 1.7702331373681657e-06, + "loss": 2.631, + "step": 44632 + }, + { + "epoch": 2.77068719349432, + "grad_norm": 0.131964303147923, + "learning_rate": 1.769280780008531e-06, + "loss": 2.6392, + "step": 44633 + }, + { + "epoch": 2.7707492705940777, + "grad_norm": 0.14819811667233113, + "learning_rate": 1.768328674280262e-06, + "loss": 2.7603, + "step": 44634 + }, + { + "epoch": 2.7708113476938356, + "grad_norm": 0.13313430479377475, + "learning_rate": 1.7673768201883268e-06, + "loss": 2.7254, + "step": 44635 + }, + { + "epoch": 2.7708734247935936, + "grad_norm": 0.13534573766772148, + "learning_rate": 1.7664252177376995e-06, + "loss": 2.6987, + "step": 44636 + }, + { + "epoch": 2.7709355018933515, + "grad_norm": 0.14942931427746725, + "learning_rate": 1.7654738669333426e-06, + "loss": 2.5905, + "step": 44637 + }, + { + "epoch": 2.7709975789931094, + "grad_norm": 0.13165854776325425, + "learning_rate": 1.7645227677802189e-06, + "loss": 2.6404, + "step": 44638 + }, + { + "epoch": 2.7710596560928673, + "grad_norm": 0.1453874589512803, + "learning_rate": 1.7635719202832912e-06, + "loss": 2.6914, + "step": 44639 + }, + { + "epoch": 2.7711217331926252, + "grad_norm": 0.13129343097640928, + "learning_rate": 1.762621324447511e-06, + "loss": 2.7096, + "step": 44640 + }, + { + "epoch": 2.771183810292383, + "grad_norm": 0.14093579280690505, + "learning_rate": 1.7616709802778465e-06, + "loss": 2.6725, + "step": 44641 + }, + { + "epoch": 2.771245887392141, + "grad_norm": 0.13401436456196755, + "learning_rate": 1.7607208877792604e-06, + "loss": 2.787, + "step": 44642 + }, + { + "epoch": 2.771307964491899, + "grad_norm": 0.14351966975767558, + "learning_rate": 1.7597710469566986e-06, + "loss": 2.6257, + "step": 44643 + }, + { + "epoch": 2.771370041591657, + "grad_norm": 0.1312745965321773, + "learning_rate": 1.7588214578151242e-06, + "loss": 2.6826, + "step": 44644 + }, + { + "epoch": 2.771432118691415, + "grad_norm": 0.15315557242778535, + "learning_rate": 1.7578721203594827e-06, + "loss": 2.6274, + "step": 44645 + }, + { + "epoch": 2.7714941957911727, + "grad_norm": 0.1452109874426229, + "learning_rate": 1.7569230345947429e-06, + "loss": 2.8017, + "step": 44646 + }, + { + "epoch": 2.7715562728909307, + "grad_norm": 0.13326977798935966, + "learning_rate": 1.755974200525845e-06, + "loss": 2.7244, + "step": 44647 + }, + { + "epoch": 2.7716183499906886, + "grad_norm": 0.13828703711948684, + "learning_rate": 1.7550256181577407e-06, + "loss": 2.6532, + "step": 44648 + }, + { + "epoch": 2.7716804270904465, + "grad_norm": 0.13155876976669, + "learning_rate": 1.754077287495376e-06, + "loss": 2.6398, + "step": 44649 + }, + { + "epoch": 2.771742504190204, + "grad_norm": 0.1288452003872934, + "learning_rate": 1.753129208543708e-06, + "loss": 2.681, + "step": 44650 + }, + { + "epoch": 2.7718045812899623, + "grad_norm": 0.13343240017939068, + "learning_rate": 1.7521813813076771e-06, + "loss": 2.6032, + "step": 44651 + }, + { + "epoch": 2.77186665838972, + "grad_norm": 0.13697973953485057, + "learning_rate": 1.7512338057922295e-06, + "loss": 2.7156, + "step": 44652 + }, + { + "epoch": 2.771928735489478, + "grad_norm": 0.1284888546898135, + "learning_rate": 1.7502864820023057e-06, + "loss": 2.681, + "step": 44653 + }, + { + "epoch": 2.7719908125892356, + "grad_norm": 0.1489611536761583, + "learning_rate": 1.7493394099428407e-06, + "loss": 2.7672, + "step": 44654 + }, + { + "epoch": 2.772052889688994, + "grad_norm": 0.1332634604612919, + "learning_rate": 1.748392589618797e-06, + "loss": 2.6985, + "step": 44655 + }, + { + "epoch": 2.7721149667887515, + "grad_norm": 0.13429509132314757, + "learning_rate": 1.747446021035104e-06, + "loss": 2.759, + "step": 44656 + }, + { + "epoch": 2.7721770438885094, + "grad_norm": 0.14800869341155465, + "learning_rate": 1.7464997041966913e-06, + "loss": 2.7845, + "step": 44657 + }, + { + "epoch": 2.7722391209882673, + "grad_norm": 0.13823184192122137, + "learning_rate": 1.7455536391085048e-06, + "loss": 2.7914, + "step": 44658 + }, + { + "epoch": 2.772301198088025, + "grad_norm": 0.16144203801114435, + "learning_rate": 1.744607825775485e-06, + "loss": 2.6377, + "step": 44659 + }, + { + "epoch": 2.772363275187783, + "grad_norm": 0.13752753269283235, + "learning_rate": 1.7436622642025557e-06, + "loss": 2.7428, + "step": 44660 + }, + { + "epoch": 2.772425352287541, + "grad_norm": 0.13098357839705022, + "learning_rate": 1.7427169543946576e-06, + "loss": 2.7226, + "step": 44661 + }, + { + "epoch": 2.772487429387299, + "grad_norm": 0.13599938102276274, + "learning_rate": 1.7417718963567143e-06, + "loss": 2.6655, + "step": 44662 + }, + { + "epoch": 2.772549506487057, + "grad_norm": 0.13135799381459715, + "learning_rate": 1.7408270900936663e-06, + "loss": 2.698, + "step": 44663 + }, + { + "epoch": 2.772611583586815, + "grad_norm": 0.14220579731765687, + "learning_rate": 1.7398825356104432e-06, + "loss": 2.742, + "step": 44664 + }, + { + "epoch": 2.7726736606865727, + "grad_norm": 0.13436433228575922, + "learning_rate": 1.738938232911963e-06, + "loss": 2.633, + "step": 44665 + }, + { + "epoch": 2.7727357377863306, + "grad_norm": 0.13933709256672222, + "learning_rate": 1.737994182003161e-06, + "loss": 2.7597, + "step": 44666 + }, + { + "epoch": 2.7727978148860886, + "grad_norm": 0.132656784494624, + "learning_rate": 1.7370503828889495e-06, + "loss": 2.7312, + "step": 44667 + }, + { + "epoch": 2.7728598919858465, + "grad_norm": 0.14909738128532016, + "learning_rate": 1.736106835574275e-06, + "loss": 2.7369, + "step": 44668 + }, + { + "epoch": 2.7729219690856044, + "grad_norm": 0.13204045856122176, + "learning_rate": 1.7351635400640442e-06, + "loss": 2.7291, + "step": 44669 + }, + { + "epoch": 2.7729840461853623, + "grad_norm": 0.13963619925485857, + "learning_rate": 1.7342204963631813e-06, + "loss": 2.6631, + "step": 44670 + }, + { + "epoch": 2.7730461232851202, + "grad_norm": 0.14175342440547578, + "learning_rate": 1.7332777044765992e-06, + "loss": 2.7034, + "step": 44671 + }, + { + "epoch": 2.773108200384878, + "grad_norm": 0.1335299900222244, + "learning_rate": 1.7323351644092323e-06, + "loss": 2.7832, + "step": 44672 + }, + { + "epoch": 2.773170277484636, + "grad_norm": 0.1326751489622812, + "learning_rate": 1.7313928761659937e-06, + "loss": 2.7035, + "step": 44673 + }, + { + "epoch": 2.773232354584394, + "grad_norm": 0.1306644311395072, + "learning_rate": 1.7304508397517905e-06, + "loss": 2.7509, + "step": 44674 + }, + { + "epoch": 2.7732944316841515, + "grad_norm": 0.14555973688240859, + "learning_rate": 1.729509055171541e-06, + "loss": 2.7171, + "step": 44675 + }, + { + "epoch": 2.77335650878391, + "grad_norm": 0.15058185136448785, + "learning_rate": 1.7285675224301578e-06, + "loss": 2.6735, + "step": 44676 + }, + { + "epoch": 2.7734185858836673, + "grad_norm": 0.14110805409395813, + "learning_rate": 1.727626241532565e-06, + "loss": 2.6917, + "step": 44677 + }, + { + "epoch": 2.7734806629834257, + "grad_norm": 0.1309679349702357, + "learning_rate": 1.7266852124836641e-06, + "loss": 2.7134, + "step": 44678 + }, + { + "epoch": 2.773542740083183, + "grad_norm": 0.1273914107550301, + "learning_rate": 1.7257444352883622e-06, + "loss": 2.6761, + "step": 44679 + }, + { + "epoch": 2.773604817182941, + "grad_norm": 0.12812496888927918, + "learning_rate": 1.7248039099515724e-06, + "loss": 2.7044, + "step": 44680 + }, + { + "epoch": 2.773666894282699, + "grad_norm": 0.13484608280201754, + "learning_rate": 1.7238636364781958e-06, + "loss": 2.7816, + "step": 44681 + }, + { + "epoch": 2.773728971382457, + "grad_norm": 0.13913639601931854, + "learning_rate": 1.7229236148731454e-06, + "loss": 2.7284, + "step": 44682 + }, + { + "epoch": 2.773791048482215, + "grad_norm": 0.1331596325881349, + "learning_rate": 1.7219838451413227e-06, + "loss": 2.7648, + "step": 44683 + }, + { + "epoch": 2.7738531255819727, + "grad_norm": 0.13675100082392125, + "learning_rate": 1.7210443272876298e-06, + "loss": 2.7082, + "step": 44684 + }, + { + "epoch": 2.7739152026817306, + "grad_norm": 0.13490278755094, + "learning_rate": 1.7201050613169679e-06, + "loss": 2.6628, + "step": 44685 + }, + { + "epoch": 2.7739772797814886, + "grad_norm": 0.13696103426241338, + "learning_rate": 1.7191660472342387e-06, + "loss": 2.6949, + "step": 44686 + }, + { + "epoch": 2.7740393568812465, + "grad_norm": 0.13640871999411663, + "learning_rate": 1.7182272850443383e-06, + "loss": 2.6895, + "step": 44687 + }, + { + "epoch": 2.7741014339810044, + "grad_norm": 0.13530467213372277, + "learning_rate": 1.7172887747521737e-06, + "loss": 2.7212, + "step": 44688 + }, + { + "epoch": 2.7741635110807623, + "grad_norm": 0.14090541855090244, + "learning_rate": 1.7163505163626303e-06, + "loss": 2.6688, + "step": 44689 + }, + { + "epoch": 2.7742255881805202, + "grad_norm": 0.13318100463122784, + "learning_rate": 1.715412509880604e-06, + "loss": 2.5992, + "step": 44690 + }, + { + "epoch": 2.774287665280278, + "grad_norm": 0.13204782201382012, + "learning_rate": 1.7144747553109963e-06, + "loss": 2.7239, + "step": 44691 + }, + { + "epoch": 2.774349742380036, + "grad_norm": 0.1419393850530592, + "learning_rate": 1.713537252658698e-06, + "loss": 2.6928, + "step": 44692 + }, + { + "epoch": 2.774411819479794, + "grad_norm": 0.13846946100875904, + "learning_rate": 1.7126000019285938e-06, + "loss": 2.6349, + "step": 44693 + }, + { + "epoch": 2.774473896579552, + "grad_norm": 0.13157640271565293, + "learning_rate": 1.7116630031255743e-06, + "loss": 2.7021, + "step": 44694 + }, + { + "epoch": 2.77453597367931, + "grad_norm": 0.13024366773553964, + "learning_rate": 1.7107262562545357e-06, + "loss": 2.6326, + "step": 44695 + }, + { + "epoch": 2.7745980507790677, + "grad_norm": 0.13023670284843017, + "learning_rate": 1.7097897613203628e-06, + "loss": 2.6088, + "step": 44696 + }, + { + "epoch": 2.7746601278788257, + "grad_norm": 0.13367396645662746, + "learning_rate": 1.7088535183279409e-06, + "loss": 2.6801, + "step": 44697 + }, + { + "epoch": 2.774722204978583, + "grad_norm": 0.13058353645520498, + "learning_rate": 1.7079175272821434e-06, + "loss": 2.6763, + "step": 44698 + }, + { + "epoch": 2.7747842820783415, + "grad_norm": 0.14771894279541642, + "learning_rate": 1.7069817881878725e-06, + "loss": 2.7028, + "step": 44699 + }, + { + "epoch": 2.774846359178099, + "grad_norm": 0.13143117723798092, + "learning_rate": 1.7060463010500016e-06, + "loss": 2.8, + "step": 44700 + }, + { + "epoch": 2.7749084362778573, + "grad_norm": 0.1329549488319563, + "learning_rate": 1.7051110658734049e-06, + "loss": 2.7604, + "step": 44701 + }, + { + "epoch": 2.774970513377615, + "grad_norm": 0.13275121929979086, + "learning_rate": 1.7041760826629727e-06, + "loss": 2.6649, + "step": 44702 + }, + { + "epoch": 2.775032590477373, + "grad_norm": 0.13550666575470313, + "learning_rate": 1.7032413514235735e-06, + "loss": 2.734, + "step": 44703 + }, + { + "epoch": 2.7750946675771306, + "grad_norm": 0.1318148843414855, + "learning_rate": 1.7023068721600921e-06, + "loss": 2.7034, + "step": 44704 + }, + { + "epoch": 2.7751567446768886, + "grad_norm": 0.13280779465586673, + "learning_rate": 1.7013726448774025e-06, + "loss": 2.7635, + "step": 44705 + }, + { + "epoch": 2.7752188217766465, + "grad_norm": 0.153962533602719, + "learning_rate": 1.700438669580373e-06, + "loss": 2.6973, + "step": 44706 + }, + { + "epoch": 2.7752808988764044, + "grad_norm": 0.1481119589054527, + "learning_rate": 1.6995049462738777e-06, + "loss": 2.6689, + "step": 44707 + }, + { + "epoch": 2.7753429759761623, + "grad_norm": 0.13360489748573418, + "learning_rate": 1.6985714749627902e-06, + "loss": 2.7889, + "step": 44708 + }, + { + "epoch": 2.7754050530759202, + "grad_norm": 0.13913252037429366, + "learning_rate": 1.6976382556519844e-06, + "loss": 2.7307, + "step": 44709 + }, + { + "epoch": 2.775467130175678, + "grad_norm": 0.14113711779479068, + "learning_rate": 1.6967052883463286e-06, + "loss": 2.731, + "step": 44710 + }, + { + "epoch": 2.775529207275436, + "grad_norm": 0.14715643890111127, + "learning_rate": 1.6957725730506857e-06, + "loss": 2.6461, + "step": 44711 + }, + { + "epoch": 2.775591284375194, + "grad_norm": 0.13299653681883694, + "learning_rate": 1.694840109769913e-06, + "loss": 2.7229, + "step": 44712 + }, + { + "epoch": 2.775653361474952, + "grad_norm": 0.13178004033512944, + "learning_rate": 1.6939078985089008e-06, + "loss": 2.7318, + "step": 44713 + }, + { + "epoch": 2.77571543857471, + "grad_norm": 0.13483003100096713, + "learning_rate": 1.6929759392724899e-06, + "loss": 2.6748, + "step": 44714 + }, + { + "epoch": 2.7757775156744677, + "grad_norm": 0.13727613996116822, + "learning_rate": 1.6920442320655538e-06, + "loss": 2.7246, + "step": 44715 + }, + { + "epoch": 2.7758395927742257, + "grad_norm": 0.1383687790292418, + "learning_rate": 1.6911127768929501e-06, + "loss": 2.7015, + "step": 44716 + }, + { + "epoch": 2.7759016698739836, + "grad_norm": 0.14499268459907189, + "learning_rate": 1.690181573759536e-06, + "loss": 2.7085, + "step": 44717 + }, + { + "epoch": 2.7759637469737415, + "grad_norm": 0.1320478149609011, + "learning_rate": 1.6892506226701743e-06, + "loss": 2.7127, + "step": 44718 + }, + { + "epoch": 2.7760258240734994, + "grad_norm": 0.13457398679503513, + "learning_rate": 1.6883199236297164e-06, + "loss": 2.7065, + "step": 44719 + }, + { + "epoch": 2.7760879011732573, + "grad_norm": 0.14253249156647843, + "learning_rate": 1.6873894766430254e-06, + "loss": 2.7207, + "step": 44720 + }, + { + "epoch": 2.7761499782730152, + "grad_norm": 0.14131830758093905, + "learning_rate": 1.6864592817149471e-06, + "loss": 2.5573, + "step": 44721 + }, + { + "epoch": 2.776212055372773, + "grad_norm": 0.13023672430142638, + "learning_rate": 1.6855293388503445e-06, + "loss": 2.6904, + "step": 44722 + }, + { + "epoch": 2.7762741324725306, + "grad_norm": 0.13892211632497917, + "learning_rate": 1.6845996480540637e-06, + "loss": 2.7313, + "step": 44723 + }, + { + "epoch": 2.776336209572289, + "grad_norm": 0.15222149614141023, + "learning_rate": 1.6836702093309564e-06, + "loss": 2.7135, + "step": 44724 + }, + { + "epoch": 2.7763982866720465, + "grad_norm": 0.13273870998800646, + "learning_rate": 1.6827410226858686e-06, + "loss": 2.7003, + "step": 44725 + }, + { + "epoch": 2.776460363771805, + "grad_norm": 0.14637742270688864, + "learning_rate": 1.6818120881236466e-06, + "loss": 2.7105, + "step": 44726 + }, + { + "epoch": 2.7765224408715623, + "grad_norm": 0.13222240035748414, + "learning_rate": 1.6808834056491473e-06, + "loss": 2.7628, + "step": 44727 + }, + { + "epoch": 2.77658451797132, + "grad_norm": 0.13229942793369237, + "learning_rate": 1.6799549752672061e-06, + "loss": 2.6919, + "step": 44728 + }, + { + "epoch": 2.776646595071078, + "grad_norm": 0.14469510533219285, + "learning_rate": 1.6790267969826744e-06, + "loss": 2.6777, + "step": 44729 + }, + { + "epoch": 2.776708672170836, + "grad_norm": 0.12794171342308255, + "learning_rate": 1.6780988708003765e-06, + "loss": 2.6544, + "step": 44730 + }, + { + "epoch": 2.776770749270594, + "grad_norm": 0.14773087709704769, + "learning_rate": 1.6771711967251802e-06, + "loss": 2.6508, + "step": 44731 + }, + { + "epoch": 2.776832826370352, + "grad_norm": 0.1319861862769157, + "learning_rate": 1.6762437747619096e-06, + "loss": 2.6905, + "step": 44732 + }, + { + "epoch": 2.77689490347011, + "grad_norm": 0.13615742984870782, + "learning_rate": 1.6753166049154056e-06, + "loss": 2.6671, + "step": 44733 + }, + { + "epoch": 2.7769569805698677, + "grad_norm": 0.13626841218370495, + "learning_rate": 1.674389687190503e-06, + "loss": 2.7921, + "step": 44734 + }, + { + "epoch": 2.7770190576696256, + "grad_norm": 0.1337300400123281, + "learning_rate": 1.6734630215920476e-06, + "loss": 2.6892, + "step": 44735 + }, + { + "epoch": 2.7770811347693836, + "grad_norm": 0.14765198439032567, + "learning_rate": 1.672536608124864e-06, + "loss": 2.6798, + "step": 44736 + }, + { + "epoch": 2.7771432118691415, + "grad_norm": 0.14771300996022688, + "learning_rate": 1.6716104467937866e-06, + "loss": 2.7369, + "step": 44737 + }, + { + "epoch": 2.7772052889688994, + "grad_norm": 0.13404029035331028, + "learning_rate": 1.670684537603656e-06, + "loss": 2.6683, + "step": 44738 + }, + { + "epoch": 2.7772673660686573, + "grad_norm": 0.14384547126389902, + "learning_rate": 1.6697588805592857e-06, + "loss": 2.67, + "step": 44739 + }, + { + "epoch": 2.7773294431684152, + "grad_norm": 0.13307457899211209, + "learning_rate": 1.6688334756655267e-06, + "loss": 2.6709, + "step": 44740 + }, + { + "epoch": 2.777391520268173, + "grad_norm": 0.14607081845280503, + "learning_rate": 1.6679083229271863e-06, + "loss": 2.679, + "step": 44741 + }, + { + "epoch": 2.777453597367931, + "grad_norm": 0.14407774941751533, + "learning_rate": 1.6669834223491054e-06, + "loss": 2.7716, + "step": 44742 + }, + { + "epoch": 2.777515674467689, + "grad_norm": 0.1633254764746859, + "learning_rate": 1.6660587739361079e-06, + "loss": 2.709, + "step": 44743 + }, + { + "epoch": 2.777577751567447, + "grad_norm": 0.1398725844870112, + "learning_rate": 1.6651343776930117e-06, + "loss": 2.6987, + "step": 44744 + }, + { + "epoch": 2.777639828667205, + "grad_norm": 0.14935183631786866, + "learning_rate": 1.6642102336246468e-06, + "loss": 2.6873, + "step": 44745 + }, + { + "epoch": 2.7777019057669623, + "grad_norm": 0.13499861769498336, + "learning_rate": 1.6632863417358312e-06, + "loss": 2.7249, + "step": 44746 + }, + { + "epoch": 2.7777639828667207, + "grad_norm": 0.1400749721799915, + "learning_rate": 1.6623627020313836e-06, + "loss": 2.6371, + "step": 44747 + }, + { + "epoch": 2.777826059966478, + "grad_norm": 0.1402523680064603, + "learning_rate": 1.6614393145161166e-06, + "loss": 2.6719, + "step": 44748 + }, + { + "epoch": 2.7778881370662365, + "grad_norm": 0.13964471611986662, + "learning_rate": 1.6605161791948654e-06, + "loss": 2.7583, + "step": 44749 + }, + { + "epoch": 2.777950214165994, + "grad_norm": 0.13001817805402763, + "learning_rate": 1.659593296072437e-06, + "loss": 2.5935, + "step": 44750 + }, + { + "epoch": 2.7780122912657523, + "grad_norm": 0.13572727093967052, + "learning_rate": 1.6586706651536387e-06, + "loss": 2.6866, + "step": 44751 + }, + { + "epoch": 2.77807436836551, + "grad_norm": 0.14085866799920282, + "learning_rate": 1.6577482864432948e-06, + "loss": 2.7509, + "step": 44752 + }, + { + "epoch": 2.7781364454652677, + "grad_norm": 0.13183756933261997, + "learning_rate": 1.6568261599462121e-06, + "loss": 2.7586, + "step": 44753 + }, + { + "epoch": 2.7781985225650256, + "grad_norm": 0.13095063969818918, + "learning_rate": 1.6559042856672035e-06, + "loss": 2.5852, + "step": 44754 + }, + { + "epoch": 2.7782605996647836, + "grad_norm": 0.13010596701926627, + "learning_rate": 1.6549826636110766e-06, + "loss": 2.6884, + "step": 44755 + }, + { + "epoch": 2.7783226767645415, + "grad_norm": 0.1371764491741975, + "learning_rate": 1.6540612937826439e-06, + "loss": 2.7187, + "step": 44756 + }, + { + "epoch": 2.7783847538642994, + "grad_norm": 0.14195062430430253, + "learning_rate": 1.6531401761867072e-06, + "loss": 2.663, + "step": 44757 + }, + { + "epoch": 2.7784468309640573, + "grad_norm": 0.1418556958353903, + "learning_rate": 1.6522193108280793e-06, + "loss": 2.6794, + "step": 44758 + }, + { + "epoch": 2.7785089080638152, + "grad_norm": 0.13068099461030222, + "learning_rate": 1.6512986977115564e-06, + "loss": 2.7389, + "step": 44759 + }, + { + "epoch": 2.778570985163573, + "grad_norm": 0.13261987236799624, + "learning_rate": 1.6503783368419455e-06, + "loss": 2.7151, + "step": 44760 + }, + { + "epoch": 2.778633062263331, + "grad_norm": 0.13132958837416617, + "learning_rate": 1.6494582282240433e-06, + "loss": 2.748, + "step": 44761 + }, + { + "epoch": 2.778695139363089, + "grad_norm": 0.15886304523066191, + "learning_rate": 1.6485383718626623e-06, + "loss": 2.7186, + "step": 44762 + }, + { + "epoch": 2.778757216462847, + "grad_norm": 0.1301122732273742, + "learning_rate": 1.6476187677625932e-06, + "loss": 2.707, + "step": 44763 + }, + { + "epoch": 2.778819293562605, + "grad_norm": 0.13866456106535696, + "learning_rate": 1.6466994159286375e-06, + "loss": 2.6597, + "step": 44764 + }, + { + "epoch": 2.7788813706623627, + "grad_norm": 0.1360323158728925, + "learning_rate": 1.6457803163655861e-06, + "loss": 2.6926, + "step": 44765 + }, + { + "epoch": 2.7789434477621207, + "grad_norm": 0.12932486908148513, + "learning_rate": 1.644861469078235e-06, + "loss": 2.6542, + "step": 44766 + }, + { + "epoch": 2.7790055248618786, + "grad_norm": 0.13986845624043887, + "learning_rate": 1.6439428740713802e-06, + "loss": 2.7266, + "step": 44767 + }, + { + "epoch": 2.7790676019616365, + "grad_norm": 0.13666638325113278, + "learning_rate": 1.6430245313498182e-06, + "loss": 2.6817, + "step": 44768 + }, + { + "epoch": 2.7791296790613944, + "grad_norm": 0.1317452931499821, + "learning_rate": 1.642106440918334e-06, + "loss": 2.6972, + "step": 44769 + }, + { + "epoch": 2.7791917561611523, + "grad_norm": 0.13678079287265443, + "learning_rate": 1.641188602781718e-06, + "loss": 2.6249, + "step": 44770 + }, + { + "epoch": 2.77925383326091, + "grad_norm": 0.14124720069550453, + "learning_rate": 1.640271016944761e-06, + "loss": 2.7176, + "step": 44771 + }, + { + "epoch": 2.779315910360668, + "grad_norm": 0.16307513835124196, + "learning_rate": 1.6393536834122536e-06, + "loss": 2.7183, + "step": 44772 + }, + { + "epoch": 2.7793779874604256, + "grad_norm": 0.13987157241202877, + "learning_rate": 1.6384366021889752e-06, + "loss": 2.6372, + "step": 44773 + }, + { + "epoch": 2.779440064560184, + "grad_norm": 0.12825709953140146, + "learning_rate": 1.6375197732797109e-06, + "loss": 2.7051, + "step": 44774 + }, + { + "epoch": 2.7795021416599415, + "grad_norm": 0.13118563021571827, + "learning_rate": 1.6366031966892515e-06, + "loss": 2.6653, + "step": 44775 + }, + { + "epoch": 2.7795642187596994, + "grad_norm": 0.1327470449829953, + "learning_rate": 1.635686872422365e-06, + "loss": 2.6763, + "step": 44776 + }, + { + "epoch": 2.7796262958594573, + "grad_norm": 0.13418689802805, + "learning_rate": 1.6347708004838536e-06, + "loss": 2.7889, + "step": 44777 + }, + { + "epoch": 2.7796883729592152, + "grad_norm": 0.13616544614225992, + "learning_rate": 1.6338549808784743e-06, + "loss": 2.721, + "step": 44778 + }, + { + "epoch": 2.779750450058973, + "grad_norm": 0.1458883940828578, + "learning_rate": 1.6329394136110233e-06, + "loss": 2.7815, + "step": 44779 + }, + { + "epoch": 2.779812527158731, + "grad_norm": 0.13199358098044423, + "learning_rate": 1.6320240986862578e-06, + "loss": 2.7077, + "step": 44780 + }, + { + "epoch": 2.779874604258489, + "grad_norm": 0.13266626890830135, + "learning_rate": 1.6311090361089743e-06, + "loss": 2.6701, + "step": 44781 + }, + { + "epoch": 2.779936681358247, + "grad_norm": 0.1322989422069087, + "learning_rate": 1.6301942258839297e-06, + "loss": 2.7047, + "step": 44782 + }, + { + "epoch": 2.779998758458005, + "grad_norm": 0.14131570440933644, + "learning_rate": 1.6292796680159096e-06, + "loss": 2.6772, + "step": 44783 + }, + { + "epoch": 2.7800608355577627, + "grad_norm": 0.13663747278921134, + "learning_rate": 1.6283653625096761e-06, + "loss": 2.7054, + "step": 44784 + }, + { + "epoch": 2.7801229126575207, + "grad_norm": 0.13123517378445768, + "learning_rate": 1.6274513093700039e-06, + "loss": 2.6487, + "step": 44785 + }, + { + "epoch": 2.7801849897572786, + "grad_norm": 0.14032908936285884, + "learning_rate": 1.6265375086016665e-06, + "loss": 2.6501, + "step": 44786 + }, + { + "epoch": 2.7802470668570365, + "grad_norm": 0.13913794892450895, + "learning_rate": 1.6256239602094215e-06, + "loss": 2.7733, + "step": 44787 + }, + { + "epoch": 2.7803091439567944, + "grad_norm": 0.1362242816901035, + "learning_rate": 1.6247106641980425e-06, + "loss": 2.6598, + "step": 44788 + }, + { + "epoch": 2.7803712210565523, + "grad_norm": 0.14919516791497334, + "learning_rate": 1.6237976205722816e-06, + "loss": 2.7661, + "step": 44789 + }, + { + "epoch": 2.7804332981563102, + "grad_norm": 0.13420127100923576, + "learning_rate": 1.622884829336918e-06, + "loss": 2.7384, + "step": 44790 + }, + { + "epoch": 2.780495375256068, + "grad_norm": 0.14480967381106394, + "learning_rate": 1.621972290496715e-06, + "loss": 2.6949, + "step": 44791 + }, + { + "epoch": 2.780557452355826, + "grad_norm": 0.13374070873921484, + "learning_rate": 1.6210600040564183e-06, + "loss": 2.733, + "step": 44792 + }, + { + "epoch": 2.780619529455584, + "grad_norm": 0.1461721589816101, + "learning_rate": 1.6201479700207967e-06, + "loss": 2.7513, + "step": 44793 + }, + { + "epoch": 2.7806816065553415, + "grad_norm": 0.13025340651168746, + "learning_rate": 1.6192361883946128e-06, + "loss": 2.7096, + "step": 44794 + }, + { + "epoch": 2.7807436836551, + "grad_norm": 0.12925670408878642, + "learning_rate": 1.618324659182613e-06, + "loss": 2.5767, + "step": 44795 + }, + { + "epoch": 2.7808057607548573, + "grad_norm": 0.13343258863205826, + "learning_rate": 1.6174133823895655e-06, + "loss": 2.7051, + "step": 44796 + }, + { + "epoch": 2.7808678378546157, + "grad_norm": 0.1352600413008652, + "learning_rate": 1.6165023580202055e-06, + "loss": 2.7285, + "step": 44797 + }, + { + "epoch": 2.780929914954373, + "grad_norm": 0.13324269576746015, + "learning_rate": 1.6155915860793069e-06, + "loss": 2.6704, + "step": 44798 + }, + { + "epoch": 2.7809919920541315, + "grad_norm": 0.13631770714465036, + "learning_rate": 1.614681066571616e-06, + "loss": 2.6694, + "step": 44799 + }, + { + "epoch": 2.781054069153889, + "grad_norm": 0.1453229638915958, + "learning_rate": 1.6137707995018735e-06, + "loss": 2.675, + "step": 44800 + }, + { + "epoch": 2.781116146253647, + "grad_norm": 0.13420322106336077, + "learning_rate": 1.6128607848748422e-06, + "loss": 2.6687, + "step": 44801 + }, + { + "epoch": 2.781178223353405, + "grad_norm": 0.13060158631510013, + "learning_rate": 1.6119510226952516e-06, + "loss": 2.6858, + "step": 44802 + }, + { + "epoch": 2.7812403004531627, + "grad_norm": 0.14589119016119712, + "learning_rate": 1.6110415129678647e-06, + "loss": 2.7408, + "step": 44803 + }, + { + "epoch": 2.7813023775529206, + "grad_norm": 0.1308602290339144, + "learning_rate": 1.6101322556974218e-06, + "loss": 2.7102, + "step": 44804 + }, + { + "epoch": 2.7813644546526786, + "grad_norm": 0.13170847912675432, + "learning_rate": 1.6092232508886695e-06, + "loss": 2.7367, + "step": 44805 + }, + { + "epoch": 2.7814265317524365, + "grad_norm": 0.13255236899814163, + "learning_rate": 1.6083144985463373e-06, + "loss": 2.7396, + "step": 44806 + }, + { + "epoch": 2.7814886088521944, + "grad_norm": 0.1328293495148704, + "learning_rate": 1.6074059986751765e-06, + "loss": 2.6056, + "step": 44807 + }, + { + "epoch": 2.7815506859519523, + "grad_norm": 0.13406505102117997, + "learning_rate": 1.606497751279934e-06, + "loss": 2.6978, + "step": 44808 + }, + { + "epoch": 2.7816127630517102, + "grad_norm": 0.12860930081104543, + "learning_rate": 1.605589756365339e-06, + "loss": 2.7344, + "step": 44809 + }, + { + "epoch": 2.781674840151468, + "grad_norm": 0.14599188348829636, + "learning_rate": 1.6046820139361319e-06, + "loss": 2.7063, + "step": 44810 + }, + { + "epoch": 2.781736917251226, + "grad_norm": 0.1481727573208001, + "learning_rate": 1.603774523997037e-06, + "loss": 2.731, + "step": 44811 + }, + { + "epoch": 2.781798994350984, + "grad_norm": 0.13756461061412292, + "learning_rate": 1.6028672865528117e-06, + "loss": 2.6925, + "step": 44812 + }, + { + "epoch": 2.781861071450742, + "grad_norm": 0.13284375019427946, + "learning_rate": 1.6019603016081686e-06, + "loss": 2.6456, + "step": 44813 + }, + { + "epoch": 2.7819231485505, + "grad_norm": 0.13357091089406795, + "learning_rate": 1.601053569167854e-06, + "loss": 2.6675, + "step": 44814 + }, + { + "epoch": 2.7819852256502577, + "grad_norm": 0.1440082055509033, + "learning_rate": 1.6001470892365866e-06, + "loss": 2.7035, + "step": 44815 + }, + { + "epoch": 2.7820473027500157, + "grad_norm": 0.13229343717872125, + "learning_rate": 1.5992408618191014e-06, + "loss": 2.7476, + "step": 44816 + }, + { + "epoch": 2.7821093798497736, + "grad_norm": 0.13163825862127282, + "learning_rate": 1.5983348869201276e-06, + "loss": 2.703, + "step": 44817 + }, + { + "epoch": 2.7821714569495315, + "grad_norm": 0.13351927655730514, + "learning_rate": 1.5974291645443951e-06, + "loss": 2.6744, + "step": 44818 + }, + { + "epoch": 2.782233534049289, + "grad_norm": 0.13026951465104997, + "learning_rate": 1.5965236946966222e-06, + "loss": 2.7241, + "step": 44819 + }, + { + "epoch": 2.7822956111490473, + "grad_norm": 0.129318351633027, + "learning_rate": 1.5956184773815274e-06, + "loss": 2.6736, + "step": 44820 + }, + { + "epoch": 2.782357688248805, + "grad_norm": 0.1330092389426575, + "learning_rate": 1.5947135126038515e-06, + "loss": 2.6661, + "step": 44821 + }, + { + "epoch": 2.782419765348563, + "grad_norm": 0.13624969811581056, + "learning_rate": 1.5938088003683016e-06, + "loss": 2.6968, + "step": 44822 + }, + { + "epoch": 2.7824818424483206, + "grad_norm": 0.1410540023951873, + "learning_rate": 1.5929043406796075e-06, + "loss": 2.6988, + "step": 44823 + }, + { + "epoch": 2.7825439195480786, + "grad_norm": 0.13659986367831473, + "learning_rate": 1.592000133542476e-06, + "loss": 2.7153, + "step": 44824 + }, + { + "epoch": 2.7826059966478365, + "grad_norm": 0.13238515541702625, + "learning_rate": 1.5910961789616319e-06, + "loss": 2.6693, + "step": 44825 + }, + { + "epoch": 2.7826680737475944, + "grad_norm": 0.1523031828738236, + "learning_rate": 1.5901924769417931e-06, + "loss": 2.6698, + "step": 44826 + }, + { + "epoch": 2.7827301508473523, + "grad_norm": 0.1318588095710602, + "learning_rate": 1.5892890274876725e-06, + "loss": 2.601, + "step": 44827 + }, + { + "epoch": 2.7827922279471102, + "grad_norm": 0.13634945858282052, + "learning_rate": 1.5883858306039834e-06, + "loss": 2.6546, + "step": 44828 + }, + { + "epoch": 2.782854305046868, + "grad_norm": 0.13249109450813423, + "learning_rate": 1.5874828862954328e-06, + "loss": 2.658, + "step": 44829 + }, + { + "epoch": 2.782916382146626, + "grad_norm": 0.1382281969422585, + "learning_rate": 1.5865801945667446e-06, + "loss": 2.7462, + "step": 44830 + }, + { + "epoch": 2.782978459246384, + "grad_norm": 0.13741943101298226, + "learning_rate": 1.5856777554226154e-06, + "loss": 2.7355, + "step": 44831 + }, + { + "epoch": 2.783040536346142, + "grad_norm": 0.13332159976689847, + "learning_rate": 1.5847755688677579e-06, + "loss": 2.6055, + "step": 44832 + }, + { + "epoch": 2.7831026134459, + "grad_norm": 0.1326711407269241, + "learning_rate": 1.5838736349068795e-06, + "loss": 2.68, + "step": 44833 + }, + { + "epoch": 2.7831646905456577, + "grad_norm": 0.13271373694984998, + "learning_rate": 1.5829719535446874e-06, + "loss": 2.7031, + "step": 44834 + }, + { + "epoch": 2.7832267676454157, + "grad_norm": 0.12910320667599168, + "learning_rate": 1.5820705247858836e-06, + "loss": 2.737, + "step": 44835 + }, + { + "epoch": 2.7832888447451736, + "grad_norm": 0.13785161285096803, + "learning_rate": 1.5811693486351754e-06, + "loss": 2.6244, + "step": 44836 + }, + { + "epoch": 2.7833509218449315, + "grad_norm": 0.14219347134302493, + "learning_rate": 1.5802684250972587e-06, + "loss": 2.6812, + "step": 44837 + }, + { + "epoch": 2.7834129989446894, + "grad_norm": 0.13671711511996837, + "learning_rate": 1.5793677541768304e-06, + "loss": 2.7024, + "step": 44838 + }, + { + "epoch": 2.7834750760444473, + "grad_norm": 0.14426563316112997, + "learning_rate": 1.5784673358785917e-06, + "loss": 2.6917, + "step": 44839 + }, + { + "epoch": 2.7835371531442052, + "grad_norm": 0.1281540919049865, + "learning_rate": 1.5775671702072504e-06, + "loss": 2.6167, + "step": 44840 + }, + { + "epoch": 2.783599230243963, + "grad_norm": 0.13679577837362325, + "learning_rate": 1.5766672571675022e-06, + "loss": 2.6217, + "step": 44841 + }, + { + "epoch": 2.7836613073437206, + "grad_norm": 0.14269501976110557, + "learning_rate": 1.5757675967640273e-06, + "loss": 2.6029, + "step": 44842 + }, + { + "epoch": 2.783723384443479, + "grad_norm": 0.1298798869097493, + "learning_rate": 1.5748681890015326e-06, + "loss": 2.6987, + "step": 44843 + }, + { + "epoch": 2.7837854615432365, + "grad_norm": 0.1270796292252433, + "learning_rate": 1.5739690338847036e-06, + "loss": 2.6417, + "step": 44844 + }, + { + "epoch": 2.783847538642995, + "grad_norm": 0.13166323759069212, + "learning_rate": 1.573070131418236e-06, + "loss": 2.6929, + "step": 44845 + }, + { + "epoch": 2.7839096157427523, + "grad_norm": 0.14045820013067484, + "learning_rate": 1.5721714816068211e-06, + "loss": 2.6226, + "step": 44846 + }, + { + "epoch": 2.7839716928425107, + "grad_norm": 0.13267191992051242, + "learning_rate": 1.5712730844551327e-06, + "loss": 2.6777, + "step": 44847 + }, + { + "epoch": 2.784033769942268, + "grad_norm": 0.1413792015255783, + "learning_rate": 1.570374939967878e-06, + "loss": 2.6707, + "step": 44848 + }, + { + "epoch": 2.784095847042026, + "grad_norm": 0.13522779990860997, + "learning_rate": 1.5694770481497312e-06, + "loss": 2.7401, + "step": 44849 + }, + { + "epoch": 2.784157924141784, + "grad_norm": 0.13140246867795907, + "learning_rate": 1.568579409005383e-06, + "loss": 2.7343, + "step": 44850 + }, + { + "epoch": 2.784220001241542, + "grad_norm": 0.1299699476700064, + "learning_rate": 1.5676820225395128e-06, + "loss": 2.7178, + "step": 44851 + }, + { + "epoch": 2.7842820783413, + "grad_norm": 0.14725779125515626, + "learning_rate": 1.566784888756795e-06, + "loss": 2.6589, + "step": 44852 + }, + { + "epoch": 2.7843441554410577, + "grad_norm": 0.13362642831056853, + "learning_rate": 1.5658880076619253e-06, + "loss": 2.7335, + "step": 44853 + }, + { + "epoch": 2.7844062325408157, + "grad_norm": 0.1295768461055058, + "learning_rate": 1.5649913792595783e-06, + "loss": 2.634, + "step": 44854 + }, + { + "epoch": 2.7844683096405736, + "grad_norm": 0.1343729753674414, + "learning_rate": 1.5640950035544277e-06, + "loss": 2.6941, + "step": 44855 + }, + { + "epoch": 2.7845303867403315, + "grad_norm": 0.14519696026129547, + "learning_rate": 1.5631988805511422e-06, + "loss": 2.7597, + "step": 44856 + }, + { + "epoch": 2.7845924638400894, + "grad_norm": 0.13629384771262987, + "learning_rate": 1.5623030102544122e-06, + "loss": 2.6779, + "step": 44857 + }, + { + "epoch": 2.7846545409398473, + "grad_norm": 0.13275979903795682, + "learning_rate": 1.5614073926689122e-06, + "loss": 2.7261, + "step": 44858 + }, + { + "epoch": 2.7847166180396052, + "grad_norm": 0.13099136386088436, + "learning_rate": 1.5605120277993046e-06, + "loss": 2.6643, + "step": 44859 + }, + { + "epoch": 2.784778695139363, + "grad_norm": 0.13102488228582093, + "learning_rate": 1.5596169156502637e-06, + "loss": 2.7445, + "step": 44860 + }, + { + "epoch": 2.784840772239121, + "grad_norm": 0.1313049360381749, + "learning_rate": 1.5587220562264582e-06, + "loss": 2.6201, + "step": 44861 + }, + { + "epoch": 2.784902849338879, + "grad_norm": 0.1333357097990126, + "learning_rate": 1.5578274495325617e-06, + "loss": 2.7838, + "step": 44862 + }, + { + "epoch": 2.784964926438637, + "grad_norm": 0.1317007219120456, + "learning_rate": 1.556933095573243e-06, + "loss": 2.6686, + "step": 44863 + }, + { + "epoch": 2.785027003538395, + "grad_norm": 0.13169415231820406, + "learning_rate": 1.5560389943531594e-06, + "loss": 2.7764, + "step": 44864 + }, + { + "epoch": 2.7850890806381527, + "grad_norm": 0.13043691280947506, + "learning_rate": 1.5551451458769795e-06, + "loss": 2.6742, + "step": 44865 + }, + { + "epoch": 2.7851511577379107, + "grad_norm": 0.14008126839867532, + "learning_rate": 1.5542515501493716e-06, + "loss": 2.6192, + "step": 44866 + }, + { + "epoch": 2.785213234837668, + "grad_norm": 0.13108699843715493, + "learning_rate": 1.5533582071749931e-06, + "loss": 2.5729, + "step": 44867 + }, + { + "epoch": 2.7852753119374265, + "grad_norm": 0.14216516733324597, + "learning_rate": 1.552465116958507e-06, + "loss": 2.7306, + "step": 44868 + }, + { + "epoch": 2.785337389037184, + "grad_norm": 0.1341322236049002, + "learning_rate": 1.551572279504565e-06, + "loss": 2.667, + "step": 44869 + }, + { + "epoch": 2.7853994661369423, + "grad_norm": 0.13351954161433374, + "learning_rate": 1.550679694817836e-06, + "loss": 2.7538, + "step": 44870 + }, + { + "epoch": 2.7854615432367, + "grad_norm": 0.1363376414630975, + "learning_rate": 1.5497873629029768e-06, + "loss": 2.6276, + "step": 44871 + }, + { + "epoch": 2.7855236203364577, + "grad_norm": 0.13372252543076973, + "learning_rate": 1.5488952837646288e-06, + "loss": 2.7848, + "step": 44872 + }, + { + "epoch": 2.7855856974362156, + "grad_norm": 0.1393362971740478, + "learning_rate": 1.5480034574074654e-06, + "loss": 2.689, + "step": 44873 + }, + { + "epoch": 2.7856477745359736, + "grad_norm": 0.13157200000877414, + "learning_rate": 1.5471118838361276e-06, + "loss": 2.6965, + "step": 44874 + }, + { + "epoch": 2.7857098516357315, + "grad_norm": 0.13488011403592443, + "learning_rate": 1.5462205630552618e-06, + "loss": 2.727, + "step": 44875 + }, + { + "epoch": 2.7857719287354894, + "grad_norm": 0.12609638016379054, + "learning_rate": 1.5453294950695363e-06, + "loss": 2.6476, + "step": 44876 + }, + { + "epoch": 2.7858340058352473, + "grad_norm": 0.1352653085467627, + "learning_rate": 1.5444386798835864e-06, + "loss": 2.6416, + "step": 44877 + }, + { + "epoch": 2.7858960829350052, + "grad_norm": 0.14201195538589895, + "learning_rate": 1.5435481175020638e-06, + "loss": 2.7235, + "step": 44878 + }, + { + "epoch": 2.785958160034763, + "grad_norm": 0.1361424630136361, + "learning_rate": 1.5426578079296094e-06, + "loss": 2.6927, + "step": 44879 + }, + { + "epoch": 2.786020237134521, + "grad_norm": 0.13326028061870276, + "learning_rate": 1.541767751170875e-06, + "loss": 2.6312, + "step": 44880 + }, + { + "epoch": 2.786082314234279, + "grad_norm": 0.12990667367755515, + "learning_rate": 1.5408779472305013e-06, + "loss": 2.6849, + "step": 44881 + }, + { + "epoch": 2.786144391334037, + "grad_norm": 0.1310191175851723, + "learning_rate": 1.5399883961131345e-06, + "loss": 2.7667, + "step": 44882 + }, + { + "epoch": 2.786206468433795, + "grad_norm": 0.13778448937143056, + "learning_rate": 1.5390990978234043e-06, + "loss": 2.6407, + "step": 44883 + }, + { + "epoch": 2.7862685455335527, + "grad_norm": 0.1520957150274546, + "learning_rate": 1.5382100523659682e-06, + "loss": 2.6436, + "step": 44884 + }, + { + "epoch": 2.7863306226333107, + "grad_norm": 0.14154028388722525, + "learning_rate": 1.53732125974545e-06, + "loss": 2.683, + "step": 44885 + }, + { + "epoch": 2.7863926997330686, + "grad_norm": 0.13294690708052145, + "learning_rate": 1.5364327199664964e-06, + "loss": 2.6018, + "step": 44886 + }, + { + "epoch": 2.7864547768328265, + "grad_norm": 0.13928888624285934, + "learning_rate": 1.5355444330337366e-06, + "loss": 2.7545, + "step": 44887 + }, + { + "epoch": 2.7865168539325844, + "grad_norm": 0.13585082173637733, + "learning_rate": 1.5346563989518005e-06, + "loss": 2.7636, + "step": 44888 + }, + { + "epoch": 2.7865789310323423, + "grad_norm": 0.13226732389256904, + "learning_rate": 1.5337686177253285e-06, + "loss": 2.7851, + "step": 44889 + }, + { + "epoch": 2.7866410081321, + "grad_norm": 0.1333853135697536, + "learning_rate": 1.5328810893589563e-06, + "loss": 2.6593, + "step": 44890 + }, + { + "epoch": 2.786703085231858, + "grad_norm": 0.13135417936215601, + "learning_rate": 1.5319938138573075e-06, + "loss": 2.7474, + "step": 44891 + }, + { + "epoch": 2.7867651623316156, + "grad_norm": 0.13289424540338526, + "learning_rate": 1.5311067912250065e-06, + "loss": 2.7196, + "step": 44892 + }, + { + "epoch": 2.786827239431374, + "grad_norm": 0.13229747093663874, + "learning_rate": 1.5302200214666996e-06, + "loss": 2.6364, + "step": 44893 + }, + { + "epoch": 2.7868893165311315, + "grad_norm": 0.17050904189428004, + "learning_rate": 1.529333504586994e-06, + "loss": 2.7123, + "step": 44894 + }, + { + "epoch": 2.78695139363089, + "grad_norm": 0.13570206550048383, + "learning_rate": 1.5284472405905248e-06, + "loss": 2.6999, + "step": 44895 + }, + { + "epoch": 2.7870134707306473, + "grad_norm": 0.13420724599651032, + "learning_rate": 1.5275612294819053e-06, + "loss": 2.6731, + "step": 44896 + }, + { + "epoch": 2.7870755478304052, + "grad_norm": 0.13287948574046674, + "learning_rate": 1.526675471265776e-06, + "loss": 2.6758, + "step": 44897 + }, + { + "epoch": 2.787137624930163, + "grad_norm": 0.13084377366624486, + "learning_rate": 1.5257899659467446e-06, + "loss": 2.7881, + "step": 44898 + }, + { + "epoch": 2.787199702029921, + "grad_norm": 0.14250284323867432, + "learning_rate": 1.5249047135294348e-06, + "loss": 2.6737, + "step": 44899 + }, + { + "epoch": 2.787261779129679, + "grad_norm": 0.13507761983297129, + "learning_rate": 1.5240197140184654e-06, + "loss": 2.7015, + "step": 44900 + }, + { + "epoch": 2.787323856229437, + "grad_norm": 0.12890781054852624, + "learning_rate": 1.5231349674184493e-06, + "loss": 2.7219, + "step": 44901 + }, + { + "epoch": 2.787385933329195, + "grad_norm": 0.1345657649607544, + "learning_rate": 1.5222504737340104e-06, + "loss": 2.7481, + "step": 44902 + }, + { + "epoch": 2.7874480104289527, + "grad_norm": 0.1307780881611163, + "learning_rate": 1.521366232969762e-06, + "loss": 2.6392, + "step": 44903 + }, + { + "epoch": 2.7875100875287107, + "grad_norm": 0.13213317706303027, + "learning_rate": 1.5204822451303113e-06, + "loss": 2.6824, + "step": 44904 + }, + { + "epoch": 2.7875721646284686, + "grad_norm": 0.14374291801629838, + "learning_rate": 1.519598510220266e-06, + "loss": 2.6377, + "step": 44905 + }, + { + "epoch": 2.7876342417282265, + "grad_norm": 0.1538874651003189, + "learning_rate": 1.5187150282442441e-06, + "loss": 2.6616, + "step": 44906 + }, + { + "epoch": 2.7876963188279844, + "grad_norm": 0.13240184829423385, + "learning_rate": 1.5178317992068647e-06, + "loss": 2.7601, + "step": 44907 + }, + { + "epoch": 2.7877583959277423, + "grad_norm": 0.13647465714400928, + "learning_rate": 1.5169488231127238e-06, + "loss": 2.7105, + "step": 44908 + }, + { + "epoch": 2.7878204730275002, + "grad_norm": 0.13022712730831268, + "learning_rate": 1.5160660999664345e-06, + "loss": 2.7411, + "step": 44909 + }, + { + "epoch": 2.787882550127258, + "grad_norm": 0.15002796488595654, + "learning_rate": 1.515183629772593e-06, + "loss": 2.7293, + "step": 44910 + }, + { + "epoch": 2.787944627227016, + "grad_norm": 0.1344497129194657, + "learning_rate": 1.5143014125358069e-06, + "loss": 2.755, + "step": 44911 + }, + { + "epoch": 2.788006704326774, + "grad_norm": 0.16488398652968042, + "learning_rate": 1.5134194482606833e-06, + "loss": 2.7642, + "step": 44912 + }, + { + "epoch": 2.788068781426532, + "grad_norm": 0.13722147475273702, + "learning_rate": 1.5125377369518245e-06, + "loss": 2.7237, + "step": 44913 + }, + { + "epoch": 2.78813085852629, + "grad_norm": 0.1484937122218724, + "learning_rate": 1.5116562786138211e-06, + "loss": 2.7145, + "step": 44914 + }, + { + "epoch": 2.7881929356260473, + "grad_norm": 0.13006025397768745, + "learning_rate": 1.5107750732512805e-06, + "loss": 2.7038, + "step": 44915 + }, + { + "epoch": 2.7882550127258057, + "grad_norm": 0.13364748877805557, + "learning_rate": 1.509894120868799e-06, + "loss": 2.7314, + "step": 44916 + }, + { + "epoch": 2.788317089825563, + "grad_norm": 0.1304021362608368, + "learning_rate": 1.5090134214709672e-06, + "loss": 2.6633, + "step": 44917 + }, + { + "epoch": 2.7883791669253215, + "grad_norm": 0.1345070832950554, + "learning_rate": 1.5081329750623875e-06, + "loss": 2.7583, + "step": 44918 + }, + { + "epoch": 2.788441244025079, + "grad_norm": 0.14691468930180465, + "learning_rate": 1.5072527816476444e-06, + "loss": 2.7077, + "step": 44919 + }, + { + "epoch": 2.788503321124837, + "grad_norm": 0.1375229006358432, + "learning_rate": 1.5063728412313405e-06, + "loss": 2.7623, + "step": 44920 + }, + { + "epoch": 2.788565398224595, + "grad_norm": 0.13106893726315672, + "learning_rate": 1.5054931538180662e-06, + "loss": 2.7305, + "step": 44921 + }, + { + "epoch": 2.7886274753243527, + "grad_norm": 0.13049190795408072, + "learning_rate": 1.5046137194124011e-06, + "loss": 2.5831, + "step": 44922 + }, + { + "epoch": 2.7886895524241107, + "grad_norm": 0.1399493543307084, + "learning_rate": 1.5037345380189416e-06, + "loss": 2.7144, + "step": 44923 + }, + { + "epoch": 2.7887516295238686, + "grad_norm": 0.13238540867488216, + "learning_rate": 1.502855609642262e-06, + "loss": 2.6662, + "step": 44924 + }, + { + "epoch": 2.7888137066236265, + "grad_norm": 0.13001746175023532, + "learning_rate": 1.5019769342869694e-06, + "loss": 2.7484, + "step": 44925 + }, + { + "epoch": 2.7888757837233844, + "grad_norm": 0.12766485247114592, + "learning_rate": 1.5010985119576327e-06, + "loss": 2.6695, + "step": 44926 + }, + { + "epoch": 2.7889378608231423, + "grad_norm": 0.13029950208585275, + "learning_rate": 1.5002203426588423e-06, + "loss": 2.7236, + "step": 44927 + }, + { + "epoch": 2.7889999379229002, + "grad_norm": 0.13177355243933186, + "learning_rate": 1.4993424263951673e-06, + "loss": 2.7063, + "step": 44928 + }, + { + "epoch": 2.789062015022658, + "grad_norm": 0.1395998249939584, + "learning_rate": 1.4984647631712035e-06, + "loss": 2.6762, + "step": 44929 + }, + { + "epoch": 2.789124092122416, + "grad_norm": 0.13428570114483704, + "learning_rate": 1.4975873529915195e-06, + "loss": 2.728, + "step": 44930 + }, + { + "epoch": 2.789186169222174, + "grad_norm": 0.14748831362213355, + "learning_rate": 1.4967101958607011e-06, + "loss": 2.6915, + "step": 44931 + }, + { + "epoch": 2.789248246321932, + "grad_norm": 0.13661119453826684, + "learning_rate": 1.4958332917833108e-06, + "loss": 2.6818, + "step": 44932 + }, + { + "epoch": 2.78931032342169, + "grad_norm": 0.14968960107511403, + "learning_rate": 1.4949566407639449e-06, + "loss": 2.6003, + "step": 44933 + }, + { + "epoch": 2.7893724005214477, + "grad_norm": 0.13624431170611095, + "learning_rate": 1.494080242807161e-06, + "loss": 2.6555, + "step": 44934 + }, + { + "epoch": 2.7894344776212057, + "grad_norm": 0.131757599903685, + "learning_rate": 1.4932040979175333e-06, + "loss": 2.719, + "step": 44935 + }, + { + "epoch": 2.7894965547209636, + "grad_norm": 0.13829365717728354, + "learning_rate": 1.4923282060996413e-06, + "loss": 2.7533, + "step": 44936 + }, + { + "epoch": 2.7895586318207215, + "grad_norm": 0.15376865534494785, + "learning_rate": 1.491452567358037e-06, + "loss": 2.6931, + "step": 44937 + }, + { + "epoch": 2.789620708920479, + "grad_norm": 0.14620806383167162, + "learning_rate": 1.4905771816973058e-06, + "loss": 2.7163, + "step": 44938 + }, + { + "epoch": 2.7896827860202373, + "grad_norm": 0.13176741055301444, + "learning_rate": 1.4897020491220105e-06, + "loss": 2.7543, + "step": 44939 + }, + { + "epoch": 2.789744863119995, + "grad_norm": 0.13772786208888854, + "learning_rate": 1.4888271696367196e-06, + "loss": 2.7644, + "step": 44940 + }, + { + "epoch": 2.789806940219753, + "grad_norm": 0.1301789958010366, + "learning_rate": 1.4879525432459907e-06, + "loss": 2.6495, + "step": 44941 + }, + { + "epoch": 2.7898690173195106, + "grad_norm": 0.14235475758263771, + "learning_rate": 1.4870781699543868e-06, + "loss": 2.7595, + "step": 44942 + }, + { + "epoch": 2.789931094419269, + "grad_norm": 0.1508447367669661, + "learning_rate": 1.4862040497664763e-06, + "loss": 2.7865, + "step": 44943 + }, + { + "epoch": 2.7899931715190265, + "grad_norm": 0.13175508350992157, + "learning_rate": 1.485330182686817e-06, + "loss": 2.6905, + "step": 44944 + }, + { + "epoch": 2.7900552486187844, + "grad_norm": 0.13027173803270428, + "learning_rate": 1.4844565687199718e-06, + "loss": 2.7051, + "step": 44945 + }, + { + "epoch": 2.7901173257185423, + "grad_norm": 0.14098194306533815, + "learning_rate": 1.483583207870487e-06, + "loss": 2.76, + "step": 44946 + }, + { + "epoch": 2.7901794028183002, + "grad_norm": 0.153079834427283, + "learning_rate": 1.4827101001429256e-06, + "loss": 2.5894, + "step": 44947 + }, + { + "epoch": 2.790241479918058, + "grad_norm": 0.1354963697724898, + "learning_rate": 1.4818372455418506e-06, + "loss": 2.7414, + "step": 44948 + }, + { + "epoch": 2.790303557017816, + "grad_norm": 0.14265304713293983, + "learning_rate": 1.4809646440718027e-06, + "loss": 2.7204, + "step": 44949 + }, + { + "epoch": 2.790365634117574, + "grad_norm": 0.12840062738680122, + "learning_rate": 1.4800922957373453e-06, + "loss": 2.7033, + "step": 44950 + }, + { + "epoch": 2.790427711217332, + "grad_norm": 0.13335057261400893, + "learning_rate": 1.4792202005430188e-06, + "loss": 2.6496, + "step": 44951 + }, + { + "epoch": 2.79048978831709, + "grad_norm": 0.13470627680397573, + "learning_rate": 1.4783483584933865e-06, + "loss": 2.7137, + "step": 44952 + }, + { + "epoch": 2.7905518654168477, + "grad_norm": 0.16227218196846546, + "learning_rate": 1.4774767695929893e-06, + "loss": 2.6239, + "step": 44953 + }, + { + "epoch": 2.7906139425166057, + "grad_norm": 0.13059854133389787, + "learning_rate": 1.476605433846373e-06, + "loss": 2.6687, + "step": 44954 + }, + { + "epoch": 2.7906760196163636, + "grad_norm": 0.1294154096349625, + "learning_rate": 1.475734351258079e-06, + "loss": 2.7816, + "step": 44955 + }, + { + "epoch": 2.7907380967161215, + "grad_norm": 0.12714934956682, + "learning_rate": 1.47486352183267e-06, + "loss": 2.6608, + "step": 44956 + }, + { + "epoch": 2.7908001738158794, + "grad_norm": 0.1470031643250383, + "learning_rate": 1.4739929455746705e-06, + "loss": 2.7848, + "step": 44957 + }, + { + "epoch": 2.7908622509156373, + "grad_norm": 0.12980262173832144, + "learning_rate": 1.473122622488632e-06, + "loss": 2.7071, + "step": 44958 + }, + { + "epoch": 2.7909243280153953, + "grad_norm": 0.13317563396923823, + "learning_rate": 1.4722525525790954e-06, + "loss": 2.7078, + "step": 44959 + }, + { + "epoch": 2.790986405115153, + "grad_norm": 0.12990328261753514, + "learning_rate": 1.4713827358505904e-06, + "loss": 2.6788, + "step": 44960 + }, + { + "epoch": 2.7910484822149106, + "grad_norm": 0.14630694770218047, + "learning_rate": 1.4705131723076693e-06, + "loss": 2.6684, + "step": 44961 + }, + { + "epoch": 2.791110559314669, + "grad_norm": 0.12926712961703385, + "learning_rate": 1.4696438619548614e-06, + "loss": 2.7143, + "step": 44962 + }, + { + "epoch": 2.7911726364144265, + "grad_norm": 0.1328283959581057, + "learning_rate": 1.4687748047967021e-06, + "loss": 2.5846, + "step": 44963 + }, + { + "epoch": 2.791234713514185, + "grad_norm": 0.1428837179367709, + "learning_rate": 1.467906000837721e-06, + "loss": 2.7287, + "step": 44964 + }, + { + "epoch": 2.7912967906139423, + "grad_norm": 0.1311298109568418, + "learning_rate": 1.4670374500824646e-06, + "loss": 2.7189, + "step": 44965 + }, + { + "epoch": 2.7913588677137007, + "grad_norm": 0.13334173754069015, + "learning_rate": 1.4661691525354514e-06, + "loss": 2.7035, + "step": 44966 + }, + { + "epoch": 2.791420944813458, + "grad_norm": 0.12958675717182958, + "learning_rate": 1.4653011082012169e-06, + "loss": 2.7098, + "step": 44967 + }, + { + "epoch": 2.791483021913216, + "grad_norm": 0.1287285128505484, + "learning_rate": 1.464433317084285e-06, + "loss": 2.7085, + "step": 44968 + }, + { + "epoch": 2.791545099012974, + "grad_norm": 0.13392481216763213, + "learning_rate": 1.4635657791891856e-06, + "loss": 2.7035, + "step": 44969 + }, + { + "epoch": 2.791607176112732, + "grad_norm": 0.1356062448515915, + "learning_rate": 1.462698494520448e-06, + "loss": 2.679, + "step": 44970 + }, + { + "epoch": 2.79166925321249, + "grad_norm": 0.13784910636004433, + "learning_rate": 1.461831463082597e-06, + "loss": 2.6257, + "step": 44971 + }, + { + "epoch": 2.7917313303122477, + "grad_norm": 0.1358605698790167, + "learning_rate": 1.4609646848801562e-06, + "loss": 2.6685, + "step": 44972 + }, + { + "epoch": 2.7917934074120057, + "grad_norm": 0.13547796839508183, + "learning_rate": 1.4600981599176444e-06, + "loss": 2.7319, + "step": 44973 + }, + { + "epoch": 2.7918554845117636, + "grad_norm": 0.14343314312777578, + "learning_rate": 1.4592318881995804e-06, + "loss": 2.6873, + "step": 44974 + }, + { + "epoch": 2.7919175616115215, + "grad_norm": 0.14882049591054752, + "learning_rate": 1.4583658697304935e-06, + "loss": 2.7101, + "step": 44975 + }, + { + "epoch": 2.7919796387112794, + "grad_norm": 0.13742692642079588, + "learning_rate": 1.4575001045148918e-06, + "loss": 2.6886, + "step": 44976 + }, + { + "epoch": 2.7920417158110373, + "grad_norm": 0.138120589703709, + "learning_rate": 1.4566345925572988e-06, + "loss": 2.725, + "step": 44977 + }, + { + "epoch": 2.7921037929107952, + "grad_norm": 0.14078878428062983, + "learning_rate": 1.4557693338622225e-06, + "loss": 2.7743, + "step": 44978 + }, + { + "epoch": 2.792165870010553, + "grad_norm": 0.12900999980341196, + "learning_rate": 1.4549043284341868e-06, + "loss": 2.6138, + "step": 44979 + }, + { + "epoch": 2.792227947110311, + "grad_norm": 0.14143159465328892, + "learning_rate": 1.4540395762776993e-06, + "loss": 2.7628, + "step": 44980 + }, + { + "epoch": 2.792290024210069, + "grad_norm": 0.13552042458443522, + "learning_rate": 1.453175077397273e-06, + "loss": 2.713, + "step": 44981 + }, + { + "epoch": 2.792352101309827, + "grad_norm": 0.14097520481427234, + "learning_rate": 1.4523108317974098e-06, + "loss": 2.7257, + "step": 44982 + }, + { + "epoch": 2.792414178409585, + "grad_norm": 0.147570663958229, + "learning_rate": 1.4514468394826342e-06, + "loss": 2.7561, + "step": 44983 + }, + { + "epoch": 2.7924762555093428, + "grad_norm": 0.13406469673411034, + "learning_rate": 1.4505831004574478e-06, + "loss": 2.7017, + "step": 44984 + }, + { + "epoch": 2.7925383326091007, + "grad_norm": 0.1354178044992719, + "learning_rate": 1.4497196147263526e-06, + "loss": 2.8519, + "step": 44985 + }, + { + "epoch": 2.792600409708858, + "grad_norm": 0.13155671680089798, + "learning_rate": 1.448856382293856e-06, + "loss": 2.647, + "step": 44986 + }, + { + "epoch": 2.7926624868086165, + "grad_norm": 0.1377970407642631, + "learning_rate": 1.4479934031644549e-06, + "loss": 2.701, + "step": 44987 + }, + { + "epoch": 2.792724563908374, + "grad_norm": 0.1387280162480108, + "learning_rate": 1.4471306773426675e-06, + "loss": 2.6673, + "step": 44988 + }, + { + "epoch": 2.7927866410081323, + "grad_norm": 0.13111348176438223, + "learning_rate": 1.4462682048329846e-06, + "loss": 2.734, + "step": 44989 + }, + { + "epoch": 2.79284871810789, + "grad_norm": 0.15027351278249126, + "learning_rate": 1.4454059856399082e-06, + "loss": 2.72, + "step": 44990 + }, + { + "epoch": 2.7929107952076477, + "grad_norm": 0.13632777714070096, + "learning_rate": 1.4445440197679294e-06, + "loss": 2.6985, + "step": 44991 + }, + { + "epoch": 2.7929728723074057, + "grad_norm": 0.13231723668445397, + "learning_rate": 1.4436823072215556e-06, + "loss": 2.7497, + "step": 44992 + }, + { + "epoch": 2.7930349494071636, + "grad_norm": 0.13507511702586655, + "learning_rate": 1.4428208480052774e-06, + "loss": 2.7638, + "step": 44993 + }, + { + "epoch": 2.7930970265069215, + "grad_norm": 0.12995575885642285, + "learning_rate": 1.441959642123597e-06, + "loss": 2.705, + "step": 44994 + }, + { + "epoch": 2.7931591036066794, + "grad_norm": 0.1414393120256427, + "learning_rate": 1.4410986895809942e-06, + "loss": 2.7199, + "step": 44995 + }, + { + "epoch": 2.7932211807064373, + "grad_norm": 0.14730393344035697, + "learning_rate": 1.4402379903819652e-06, + "loss": 2.7521, + "step": 44996 + }, + { + "epoch": 2.7932832578061952, + "grad_norm": 0.1345634256602163, + "learning_rate": 1.439377544531012e-06, + "loss": 2.7281, + "step": 44997 + }, + { + "epoch": 2.793345334905953, + "grad_norm": 0.14430427771861154, + "learning_rate": 1.438517352032609e-06, + "loss": 2.7575, + "step": 44998 + }, + { + "epoch": 2.793407412005711, + "grad_norm": 0.1330677692764017, + "learning_rate": 1.4376574128912524e-06, + "loss": 2.5726, + "step": 44999 + }, + { + "epoch": 2.793469489105469, + "grad_norm": 0.13434706525090595, + "learning_rate": 1.436797727111422e-06, + "loss": 2.6091, + "step": 45000 + }, + { + "epoch": 2.793531566205227, + "grad_norm": 0.13384162260222304, + "learning_rate": 1.4359382946976086e-06, + "loss": 2.711, + "step": 45001 + }, + { + "epoch": 2.793593643304985, + "grad_norm": 0.15621498430814318, + "learning_rate": 1.4350791156542976e-06, + "loss": 2.7228, + "step": 45002 + }, + { + "epoch": 2.7936557204047427, + "grad_norm": 0.1344054911085823, + "learning_rate": 1.4342201899859576e-06, + "loss": 2.6659, + "step": 45003 + }, + { + "epoch": 2.7937177975045007, + "grad_norm": 0.14266451739602232, + "learning_rate": 1.4333615176970905e-06, + "loss": 2.7158, + "step": 45004 + }, + { + "epoch": 2.7937798746042586, + "grad_norm": 0.15407021690834957, + "learning_rate": 1.4325030987921595e-06, + "loss": 2.7478, + "step": 45005 + }, + { + "epoch": 2.7938419517040165, + "grad_norm": 0.1370502268266384, + "learning_rate": 1.431644933275661e-06, + "loss": 2.7987, + "step": 45006 + }, + { + "epoch": 2.7939040288037744, + "grad_norm": 0.13544932683762995, + "learning_rate": 1.4307870211520525e-06, + "loss": 2.6814, + "step": 45007 + }, + { + "epoch": 2.7939661059035323, + "grad_norm": 0.1392942886411959, + "learning_rate": 1.4299293624258247e-06, + "loss": 2.7627, + "step": 45008 + }, + { + "epoch": 2.79402818300329, + "grad_norm": 0.133042052982955, + "learning_rate": 1.4290719571014411e-06, + "loss": 2.7713, + "step": 45009 + }, + { + "epoch": 2.794090260103048, + "grad_norm": 0.13489442005914545, + "learning_rate": 1.4282148051833811e-06, + "loss": 2.7354, + "step": 45010 + }, + { + "epoch": 2.7941523372028056, + "grad_norm": 0.16820921983852213, + "learning_rate": 1.427357906676119e-06, + "loss": 2.6915, + "step": 45011 + }, + { + "epoch": 2.794214414302564, + "grad_norm": 0.16086995040494295, + "learning_rate": 1.4265012615841178e-06, + "loss": 2.6457, + "step": 45012 + }, + { + "epoch": 2.7942764914023215, + "grad_norm": 0.13245990876518587, + "learning_rate": 1.4256448699118574e-06, + "loss": 2.7076, + "step": 45013 + }, + { + "epoch": 2.79433856850208, + "grad_norm": 0.14247415618822318, + "learning_rate": 1.4247887316637899e-06, + "loss": 2.7021, + "step": 45014 + }, + { + "epoch": 2.7944006456018373, + "grad_norm": 0.15466681929781526, + "learning_rate": 1.4239328468444001e-06, + "loss": 2.7295, + "step": 45015 + }, + { + "epoch": 2.7944627227015952, + "grad_norm": 0.13563457175068, + "learning_rate": 1.4230772154581407e-06, + "loss": 2.6609, + "step": 45016 + }, + { + "epoch": 2.794524799801353, + "grad_norm": 0.15091619678877394, + "learning_rate": 1.4222218375094798e-06, + "loss": 2.6874, + "step": 45017 + }, + { + "epoch": 2.794586876901111, + "grad_norm": 0.14110753928781328, + "learning_rate": 1.4213667130028807e-06, + "loss": 2.7016, + "step": 45018 + }, + { + "epoch": 2.794648954000869, + "grad_norm": 0.13560386168868127, + "learning_rate": 1.4205118419428066e-06, + "loss": 2.6753, + "step": 45019 + }, + { + "epoch": 2.794711031100627, + "grad_norm": 0.13898005961837168, + "learning_rate": 1.4196572243337147e-06, + "loss": 2.6595, + "step": 45020 + }, + { + "epoch": 2.794773108200385, + "grad_norm": 0.12898031187907213, + "learning_rate": 1.418802860180063e-06, + "loss": 2.7471, + "step": 45021 + }, + { + "epoch": 2.7948351853001427, + "grad_norm": 0.14050887531921047, + "learning_rate": 1.4179487494863141e-06, + "loss": 2.6699, + "step": 45022 + }, + { + "epoch": 2.7948972623999007, + "grad_norm": 0.13180385481798795, + "learning_rate": 1.4170948922569094e-06, + "loss": 2.6232, + "step": 45023 + }, + { + "epoch": 2.7949593394996586, + "grad_norm": 0.1316143363376177, + "learning_rate": 1.4162412884963227e-06, + "loss": 2.6618, + "step": 45024 + }, + { + "epoch": 2.7950214165994165, + "grad_norm": 0.13438964309307064, + "learning_rate": 1.4153879382090008e-06, + "loss": 2.6836, + "step": 45025 + }, + { + "epoch": 2.7950834936991744, + "grad_norm": 0.1346174199184935, + "learning_rate": 1.4145348413993953e-06, + "loss": 2.7315, + "step": 45026 + }, + { + "epoch": 2.7951455707989323, + "grad_norm": 0.13699696026992172, + "learning_rate": 1.4136819980719474e-06, + "loss": 2.7479, + "step": 45027 + }, + { + "epoch": 2.7952076478986903, + "grad_norm": 0.13120009063740834, + "learning_rate": 1.4128294082311254e-06, + "loss": 2.7589, + "step": 45028 + }, + { + "epoch": 2.795269724998448, + "grad_norm": 0.13199576120994264, + "learning_rate": 1.4119770718813597e-06, + "loss": 2.683, + "step": 45029 + }, + { + "epoch": 2.795331802098206, + "grad_norm": 0.13388461856321712, + "learning_rate": 1.4111249890271127e-06, + "loss": 2.6942, + "step": 45030 + }, + { + "epoch": 2.795393879197964, + "grad_norm": 0.13006171475246284, + "learning_rate": 1.4102731596728148e-06, + "loss": 2.6661, + "step": 45031 + }, + { + "epoch": 2.795455956297722, + "grad_norm": 0.1306134660547647, + "learning_rate": 1.4094215838229176e-06, + "loss": 2.6428, + "step": 45032 + }, + { + "epoch": 2.79551803339748, + "grad_norm": 0.1487800573086061, + "learning_rate": 1.4085702614818675e-06, + "loss": 2.7261, + "step": 45033 + }, + { + "epoch": 2.7955801104972373, + "grad_norm": 0.13513885865748046, + "learning_rate": 1.4077191926541e-06, + "loss": 2.7908, + "step": 45034 + }, + { + "epoch": 2.7956421875969957, + "grad_norm": 0.13476723517962516, + "learning_rate": 1.4068683773440616e-06, + "loss": 2.7454, + "step": 45035 + }, + { + "epoch": 2.795704264696753, + "grad_norm": 0.13837285779011568, + "learning_rate": 1.4060178155561766e-06, + "loss": 2.7314, + "step": 45036 + }, + { + "epoch": 2.7957663417965115, + "grad_norm": 0.13078872709423262, + "learning_rate": 1.4051675072948967e-06, + "loss": 2.7241, + "step": 45037 + }, + { + "epoch": 2.795828418896269, + "grad_norm": 0.13131761739201644, + "learning_rate": 1.404317452564663e-06, + "loss": 2.6854, + "step": 45038 + }, + { + "epoch": 2.795890495996027, + "grad_norm": 0.12830779567619774, + "learning_rate": 1.4034676513698996e-06, + "loss": 2.734, + "step": 45039 + }, + { + "epoch": 2.795952573095785, + "grad_norm": 0.14863068156776313, + "learning_rate": 1.4026181037150422e-06, + "loss": 2.6729, + "step": 45040 + }, + { + "epoch": 2.7960146501955427, + "grad_norm": 0.14687737833288889, + "learning_rate": 1.4017688096045146e-06, + "loss": 2.6885, + "step": 45041 + }, + { + "epoch": 2.7960767272953007, + "grad_norm": 0.13173191770619744, + "learning_rate": 1.4009197690427688e-06, + "loss": 2.6606, + "step": 45042 + }, + { + "epoch": 2.7961388043950586, + "grad_norm": 0.12832558503488567, + "learning_rate": 1.4000709820342184e-06, + "loss": 2.6476, + "step": 45043 + }, + { + "epoch": 2.7962008814948165, + "grad_norm": 0.14914633256912424, + "learning_rate": 1.3992224485832982e-06, + "loss": 2.5932, + "step": 45044 + }, + { + "epoch": 2.7962629585945744, + "grad_norm": 0.13273809256015004, + "learning_rate": 1.3983741686944274e-06, + "loss": 2.6911, + "step": 45045 + }, + { + "epoch": 2.7963250356943323, + "grad_norm": 0.15075580426685525, + "learning_rate": 1.3975261423720354e-06, + "loss": 2.6487, + "step": 45046 + }, + { + "epoch": 2.7963871127940902, + "grad_norm": 0.14088965387253988, + "learning_rate": 1.3966783696205576e-06, + "loss": 2.6541, + "step": 45047 + }, + { + "epoch": 2.796449189893848, + "grad_norm": 0.1327686798509257, + "learning_rate": 1.3958308504444017e-06, + "loss": 2.7502, + "step": 45048 + }, + { + "epoch": 2.796511266993606, + "grad_norm": 0.13990847512240134, + "learning_rate": 1.3949835848479974e-06, + "loss": 2.6416, + "step": 45049 + }, + { + "epoch": 2.796573344093364, + "grad_norm": 0.13532091539657276, + "learning_rate": 1.3941365728357581e-06, + "loss": 2.7178, + "step": 45050 + }, + { + "epoch": 2.796635421193122, + "grad_norm": 0.132862887921863, + "learning_rate": 1.393289814412113e-06, + "loss": 2.7365, + "step": 45051 + }, + { + "epoch": 2.79669749829288, + "grad_norm": 0.13540285908168168, + "learning_rate": 1.3924433095814704e-06, + "loss": 2.7165, + "step": 45052 + }, + { + "epoch": 2.7967595753926378, + "grad_norm": 0.1367876968921303, + "learning_rate": 1.3915970583482596e-06, + "loss": 2.6683, + "step": 45053 + }, + { + "epoch": 2.7968216524923957, + "grad_norm": 0.13339925628902324, + "learning_rate": 1.3907510607168717e-06, + "loss": 2.754, + "step": 45054 + }, + { + "epoch": 2.7968837295921536, + "grad_norm": 0.14397556214831406, + "learning_rate": 1.3899053166917475e-06, + "loss": 2.7744, + "step": 45055 + }, + { + "epoch": 2.7969458066919115, + "grad_norm": 0.13221313769033538, + "learning_rate": 1.389059826277289e-06, + "loss": 2.6551, + "step": 45056 + }, + { + "epoch": 2.797007883791669, + "grad_norm": 0.1546298430138463, + "learning_rate": 1.3882145894779042e-06, + "loss": 2.697, + "step": 45057 + }, + { + "epoch": 2.7970699608914273, + "grad_norm": 0.13668241017898572, + "learning_rate": 1.387369606298e-06, + "loss": 2.7307, + "step": 45058 + }, + { + "epoch": 2.797132037991185, + "grad_norm": 0.13608333887248933, + "learning_rate": 1.3865248767419903e-06, + "loss": 2.6417, + "step": 45059 + }, + { + "epoch": 2.797194115090943, + "grad_norm": 0.13083761660611762, + "learning_rate": 1.3856804008142877e-06, + "loss": 2.7391, + "step": 45060 + }, + { + "epoch": 2.7972561921907007, + "grad_norm": 0.14368985099412285, + "learning_rate": 1.3848361785192888e-06, + "loss": 2.7141, + "step": 45061 + }, + { + "epoch": 2.797318269290459, + "grad_norm": 0.13563722902530528, + "learning_rate": 1.3839922098614012e-06, + "loss": 2.6724, + "step": 45062 + }, + { + "epoch": 2.7973803463902165, + "grad_norm": 0.12758411441299564, + "learning_rate": 1.3831484948450213e-06, + "loss": 2.6381, + "step": 45063 + }, + { + "epoch": 2.7974424234899744, + "grad_norm": 0.12998370504040394, + "learning_rate": 1.382305033474568e-06, + "loss": 2.7012, + "step": 45064 + }, + { + "epoch": 2.7975045005897323, + "grad_norm": 0.13623268371750252, + "learning_rate": 1.3814618257544266e-06, + "loss": 2.7497, + "step": 45065 + }, + { + "epoch": 2.7975665776894902, + "grad_norm": 0.13685330173218913, + "learning_rate": 1.3806188716890045e-06, + "loss": 2.6933, + "step": 45066 + }, + { + "epoch": 2.797628654789248, + "grad_norm": 0.13223325410981077, + "learning_rate": 1.379776171282693e-06, + "loss": 2.6736, + "step": 45067 + }, + { + "epoch": 2.797690731889006, + "grad_norm": 0.1306298363625317, + "learning_rate": 1.3789337245398937e-06, + "loss": 2.6998, + "step": 45068 + }, + { + "epoch": 2.797752808988764, + "grad_norm": 0.13123369058820458, + "learning_rate": 1.3780915314649979e-06, + "loss": 2.7103, + "step": 45069 + }, + { + "epoch": 2.797814886088522, + "grad_norm": 0.13890224448866448, + "learning_rate": 1.3772495920624073e-06, + "loss": 2.6744, + "step": 45070 + }, + { + "epoch": 2.79787696318828, + "grad_norm": 0.13979900352816318, + "learning_rate": 1.3764079063365077e-06, + "loss": 2.6089, + "step": 45071 + }, + { + "epoch": 2.7979390402880377, + "grad_norm": 0.13657929256120394, + "learning_rate": 1.3755664742916897e-06, + "loss": 2.7231, + "step": 45072 + }, + { + "epoch": 2.7980011173877957, + "grad_norm": 0.13591659785304422, + "learning_rate": 1.3747252959323443e-06, + "loss": 2.7386, + "step": 45073 + }, + { + "epoch": 2.7980631944875536, + "grad_norm": 0.14431056366138106, + "learning_rate": 1.3738843712628623e-06, + "loss": 2.6287, + "step": 45074 + }, + { + "epoch": 2.7981252715873115, + "grad_norm": 0.13348685890520331, + "learning_rate": 1.373043700287635e-06, + "loss": 2.6933, + "step": 45075 + }, + { + "epoch": 2.7981873486870694, + "grad_norm": 0.1309732966096922, + "learning_rate": 1.372203283011042e-06, + "loss": 2.7357, + "step": 45076 + }, + { + "epoch": 2.7982494257868273, + "grad_norm": 0.12965487793743274, + "learning_rate": 1.3713631194374576e-06, + "loss": 2.821, + "step": 45077 + }, + { + "epoch": 2.7983115028865853, + "grad_norm": 0.13159843527162943, + "learning_rate": 1.3705232095712895e-06, + "loss": 2.6948, + "step": 45078 + }, + { + "epoch": 2.798373579986343, + "grad_norm": 0.1345237620969153, + "learning_rate": 1.3696835534169062e-06, + "loss": 2.8477, + "step": 45079 + }, + { + "epoch": 2.798435657086101, + "grad_norm": 0.13355682571051766, + "learning_rate": 1.3688441509786875e-06, + "loss": 2.7597, + "step": 45080 + }, + { + "epoch": 2.798497734185859, + "grad_norm": 0.1326878186822659, + "learning_rate": 1.3680050022610136e-06, + "loss": 2.6394, + "step": 45081 + }, + { + "epoch": 2.7985598112856165, + "grad_norm": 0.1370592305467084, + "learning_rate": 1.3671661072682585e-06, + "loss": 2.6825, + "step": 45082 + }, + { + "epoch": 2.798621888385375, + "grad_norm": 0.13467033445010682, + "learning_rate": 1.3663274660048076e-06, + "loss": 2.6772, + "step": 45083 + }, + { + "epoch": 2.7986839654851323, + "grad_norm": 0.13037754428635726, + "learning_rate": 1.3654890784750352e-06, + "loss": 2.8093, + "step": 45084 + }, + { + "epoch": 2.7987460425848907, + "grad_norm": 0.1317923862436827, + "learning_rate": 1.3646509446833156e-06, + "loss": 2.7352, + "step": 45085 + }, + { + "epoch": 2.798808119684648, + "grad_norm": 0.14066894162611127, + "learning_rate": 1.3638130646340064e-06, + "loss": 2.8259, + "step": 45086 + }, + { + "epoch": 2.798870196784406, + "grad_norm": 0.15811302421475987, + "learning_rate": 1.362975438331504e-06, + "loss": 2.7269, + "step": 45087 + }, + { + "epoch": 2.798932273884164, + "grad_norm": 0.1307435020879892, + "learning_rate": 1.3621380657801607e-06, + "loss": 2.6176, + "step": 45088 + }, + { + "epoch": 2.798994350983922, + "grad_norm": 0.13662781415965788, + "learning_rate": 1.3613009469843508e-06, + "loss": 2.7078, + "step": 45089 + }, + { + "epoch": 2.79905642808368, + "grad_norm": 0.14463201438351025, + "learning_rate": 1.3604640819484372e-06, + "loss": 2.7114, + "step": 45090 + }, + { + "epoch": 2.7991185051834377, + "grad_norm": 0.1369267108206496, + "learning_rate": 1.3596274706767941e-06, + "loss": 2.6705, + "step": 45091 + }, + { + "epoch": 2.7991805822831957, + "grad_norm": 0.13074942139967982, + "learning_rate": 1.3587911131737852e-06, + "loss": 2.6438, + "step": 45092 + }, + { + "epoch": 2.7992426593829536, + "grad_norm": 0.1293054093831733, + "learning_rate": 1.3579550094437677e-06, + "loss": 2.6538, + "step": 45093 + }, + { + "epoch": 2.7993047364827115, + "grad_norm": 0.13348513560327904, + "learning_rate": 1.3571191594911104e-06, + "loss": 2.6588, + "step": 45094 + }, + { + "epoch": 2.7993668135824694, + "grad_norm": 0.1453307437724352, + "learning_rate": 1.3562835633201654e-06, + "loss": 2.7051, + "step": 45095 + }, + { + "epoch": 2.7994288906822273, + "grad_norm": 0.13031972809054457, + "learning_rate": 1.3554482209353014e-06, + "loss": 2.6785, + "step": 45096 + }, + { + "epoch": 2.7994909677819853, + "grad_norm": 0.12979280608561325, + "learning_rate": 1.3546131323408705e-06, + "loss": 2.706, + "step": 45097 + }, + { + "epoch": 2.799553044881743, + "grad_norm": 0.13248803671774836, + "learning_rate": 1.353778297541236e-06, + "loss": 2.6686, + "step": 45098 + }, + { + "epoch": 2.799615121981501, + "grad_norm": 0.13104655983931202, + "learning_rate": 1.3529437165407443e-06, + "loss": 2.7244, + "step": 45099 + }, + { + "epoch": 2.799677199081259, + "grad_norm": 0.13433596631821004, + "learning_rate": 1.3521093893437586e-06, + "loss": 2.7211, + "step": 45100 + }, + { + "epoch": 2.799739276181017, + "grad_norm": 0.14651343375307863, + "learning_rate": 1.35127531595462e-06, + "loss": 2.6776, + "step": 45101 + }, + { + "epoch": 2.799801353280775, + "grad_norm": 0.135378088560562, + "learning_rate": 1.3504414963776968e-06, + "loss": 2.721, + "step": 45102 + }, + { + "epoch": 2.7998634303805328, + "grad_norm": 0.13993145201180018, + "learning_rate": 1.3496079306173305e-06, + "loss": 2.6557, + "step": 45103 + }, + { + "epoch": 2.7999255074802907, + "grad_norm": 0.13299166291456163, + "learning_rate": 1.3487746186778616e-06, + "loss": 2.6554, + "step": 45104 + }, + { + "epoch": 2.799987584580048, + "grad_norm": 0.14368337585193752, + "learning_rate": 1.3479415605636537e-06, + "loss": 2.6711, + "step": 45105 + }, + { + "epoch": 2.8000496616798065, + "grad_norm": 0.15517958317703068, + "learning_rate": 1.3471087562790419e-06, + "loss": 2.7279, + "step": 45106 + }, + { + "epoch": 2.800111738779564, + "grad_norm": 0.1519749836534503, + "learning_rate": 1.3462762058283784e-06, + "loss": 2.6716, + "step": 45107 + }, + { + "epoch": 2.8001738158793223, + "grad_norm": 0.14332286775254852, + "learning_rate": 1.3454439092160042e-06, + "loss": 2.7358, + "step": 45108 + }, + { + "epoch": 2.80023589297908, + "grad_norm": 0.1326140084538734, + "learning_rate": 1.3446118664462547e-06, + "loss": 2.6546, + "step": 45109 + }, + { + "epoch": 2.800297970078838, + "grad_norm": 0.13129972980341453, + "learning_rate": 1.3437800775234765e-06, + "loss": 2.6843, + "step": 45110 + }, + { + "epoch": 2.8003600471785957, + "grad_norm": 0.14485873031719634, + "learning_rate": 1.3429485424520162e-06, + "loss": 2.6856, + "step": 45111 + }, + { + "epoch": 2.8004221242783536, + "grad_norm": 0.15491705007344914, + "learning_rate": 1.3421172612362033e-06, + "loss": 2.7629, + "step": 45112 + }, + { + "epoch": 2.8004842013781115, + "grad_norm": 0.13103447060342382, + "learning_rate": 1.3412862338803678e-06, + "loss": 2.7072, + "step": 45113 + }, + { + "epoch": 2.8005462784778694, + "grad_norm": 0.1467695956237764, + "learning_rate": 1.340455460388862e-06, + "loss": 2.6819, + "step": 45114 + }, + { + "epoch": 2.8006083555776273, + "grad_norm": 0.13197369618984894, + "learning_rate": 1.3396249407660156e-06, + "loss": 2.6565, + "step": 45115 + }, + { + "epoch": 2.8006704326773852, + "grad_norm": 0.1278963336707469, + "learning_rate": 1.3387946750161528e-06, + "loss": 2.708, + "step": 45116 + }, + { + "epoch": 2.800732509777143, + "grad_norm": 0.13288282888464967, + "learning_rate": 1.3379646631436093e-06, + "loss": 2.6918, + "step": 45117 + }, + { + "epoch": 2.800794586876901, + "grad_norm": 0.1267948851720118, + "learning_rate": 1.3371349051527204e-06, + "loss": 2.6675, + "step": 45118 + }, + { + "epoch": 2.800856663976659, + "grad_norm": 0.13345702938535944, + "learning_rate": 1.3363054010478159e-06, + "loss": 2.6727, + "step": 45119 + }, + { + "epoch": 2.800918741076417, + "grad_norm": 0.14362716142888907, + "learning_rate": 1.3354761508332147e-06, + "loss": 2.707, + "step": 45120 + }, + { + "epoch": 2.800980818176175, + "grad_norm": 0.13139151794505785, + "learning_rate": 1.3346471545132523e-06, + "loss": 2.7565, + "step": 45121 + }, + { + "epoch": 2.8010428952759328, + "grad_norm": 0.1339641038557488, + "learning_rate": 1.3338184120922414e-06, + "loss": 2.6231, + "step": 45122 + }, + { + "epoch": 2.8011049723756907, + "grad_norm": 0.13046652626381072, + "learning_rate": 1.3329899235745236e-06, + "loss": 2.7319, + "step": 45123 + }, + { + "epoch": 2.8011670494754486, + "grad_norm": 0.1484658251891619, + "learning_rate": 1.3321616889644062e-06, + "loss": 2.5826, + "step": 45124 + }, + { + "epoch": 2.8012291265752065, + "grad_norm": 0.1395098658938936, + "learning_rate": 1.3313337082662192e-06, + "loss": 2.7271, + "step": 45125 + }, + { + "epoch": 2.8012912036749644, + "grad_norm": 0.1302742759300143, + "learning_rate": 1.3305059814842758e-06, + "loss": 2.6047, + "step": 45126 + }, + { + "epoch": 2.8013532807747223, + "grad_norm": 0.13037277963526936, + "learning_rate": 1.3296785086229003e-06, + "loss": 2.6751, + "step": 45127 + }, + { + "epoch": 2.8014153578744803, + "grad_norm": 0.13966669616844954, + "learning_rate": 1.3288512896864058e-06, + "loss": 2.7331, + "step": 45128 + }, + { + "epoch": 2.801477434974238, + "grad_norm": 0.1313495068614729, + "learning_rate": 1.3280243246791114e-06, + "loss": 2.7384, + "step": 45129 + }, + { + "epoch": 2.8015395120739957, + "grad_norm": 0.139648444169051, + "learning_rate": 1.3271976136053244e-06, + "loss": 2.7048, + "step": 45130 + }, + { + "epoch": 2.801601589173754, + "grad_norm": 0.12875964758198158, + "learning_rate": 1.3263711564693637e-06, + "loss": 2.6683, + "step": 45131 + }, + { + "epoch": 2.8016636662735115, + "grad_norm": 0.13382815743494392, + "learning_rate": 1.3255449532755483e-06, + "loss": 2.6846, + "step": 45132 + }, + { + "epoch": 2.80172574337327, + "grad_norm": 0.13945994293685687, + "learning_rate": 1.3247190040281743e-06, + "loss": 2.7633, + "step": 45133 + }, + { + "epoch": 2.8017878204730273, + "grad_norm": 0.14182964240591814, + "learning_rate": 1.3238933087315553e-06, + "loss": 2.6879, + "step": 45134 + }, + { + "epoch": 2.8018498975727852, + "grad_norm": 0.14050726465453012, + "learning_rate": 1.3230678673900043e-06, + "loss": 2.7805, + "step": 45135 + }, + { + "epoch": 2.801911974672543, + "grad_norm": 0.12810250614007376, + "learning_rate": 1.3222426800078237e-06, + "loss": 2.654, + "step": 45136 + }, + { + "epoch": 2.801974051772301, + "grad_norm": 0.14320387111163524, + "learning_rate": 1.3214177465893263e-06, + "loss": 2.7371, + "step": 45137 + }, + { + "epoch": 2.802036128872059, + "grad_norm": 0.15590067199237378, + "learning_rate": 1.3205930671388033e-06, + "loss": 2.7343, + "step": 45138 + }, + { + "epoch": 2.802098205971817, + "grad_norm": 0.13388182215845493, + "learning_rate": 1.319768641660568e-06, + "loss": 2.7691, + "step": 45139 + }, + { + "epoch": 2.802160283071575, + "grad_norm": 0.13639927731550458, + "learning_rate": 1.3189444701589116e-06, + "loss": 2.7653, + "step": 45140 + }, + { + "epoch": 2.8022223601713327, + "grad_norm": 0.1327766832974558, + "learning_rate": 1.3181205526381467e-06, + "loss": 2.7708, + "step": 45141 + }, + { + "epoch": 2.8022844372710907, + "grad_norm": 0.1519942431481223, + "learning_rate": 1.3172968891025595e-06, + "loss": 2.7052, + "step": 45142 + }, + { + "epoch": 2.8023465143708486, + "grad_norm": 0.1495041980508826, + "learning_rate": 1.316473479556457e-06, + "loss": 2.6371, + "step": 45143 + }, + { + "epoch": 2.8024085914706065, + "grad_norm": 0.1482301254793237, + "learning_rate": 1.3156503240041307e-06, + "loss": 2.7671, + "step": 45144 + }, + { + "epoch": 2.8024706685703644, + "grad_norm": 0.12676882203757428, + "learning_rate": 1.3148274224498658e-06, + "loss": 2.6249, + "step": 45145 + }, + { + "epoch": 2.8025327456701223, + "grad_norm": 0.1285688579889884, + "learning_rate": 1.3140047748979756e-06, + "loss": 2.6935, + "step": 45146 + }, + { + "epoch": 2.8025948227698803, + "grad_norm": 0.14839915985068178, + "learning_rate": 1.31318238135274e-06, + "loss": 2.7131, + "step": 45147 + }, + { + "epoch": 2.802656899869638, + "grad_norm": 0.14459347030005806, + "learning_rate": 1.31236024181845e-06, + "loss": 2.6767, + "step": 45148 + }, + { + "epoch": 2.802718976969396, + "grad_norm": 0.13743632559948926, + "learning_rate": 1.311538356299391e-06, + "loss": 2.7786, + "step": 45149 + }, + { + "epoch": 2.802781054069154, + "grad_norm": 0.12999013897744222, + "learning_rate": 1.3107167247998597e-06, + "loss": 2.7254, + "step": 45150 + }, + { + "epoch": 2.802843131168912, + "grad_norm": 0.1332102247902231, + "learning_rate": 1.3098953473241416e-06, + "loss": 2.651, + "step": 45151 + }, + { + "epoch": 2.80290520826867, + "grad_norm": 0.131087609432568, + "learning_rate": 1.3090742238765163e-06, + "loss": 2.7372, + "step": 45152 + }, + { + "epoch": 2.8029672853684273, + "grad_norm": 0.14576248948753337, + "learning_rate": 1.308253354461264e-06, + "loss": 2.6996, + "step": 45153 + }, + { + "epoch": 2.8030293624681857, + "grad_norm": 0.14368954636434486, + "learning_rate": 1.3074327390826868e-06, + "loss": 2.7713, + "step": 45154 + }, + { + "epoch": 2.803091439567943, + "grad_norm": 0.14092903252390263, + "learning_rate": 1.3066123777450479e-06, + "loss": 2.7408, + "step": 45155 + }, + { + "epoch": 2.8031535166677015, + "grad_norm": 0.1336853572126228, + "learning_rate": 1.3057922704526326e-06, + "loss": 2.7351, + "step": 45156 + }, + { + "epoch": 2.803215593767459, + "grad_norm": 0.13582030456779925, + "learning_rate": 1.3049724172097211e-06, + "loss": 2.7236, + "step": 45157 + }, + { + "epoch": 2.8032776708672174, + "grad_norm": 0.13077340219849184, + "learning_rate": 1.3041528180205874e-06, + "loss": 2.7047, + "step": 45158 + }, + { + "epoch": 2.803339747966975, + "grad_norm": 0.15421523578122145, + "learning_rate": 1.3033334728895118e-06, + "loss": 2.7259, + "step": 45159 + }, + { + "epoch": 2.8034018250667327, + "grad_norm": 0.13712258642075525, + "learning_rate": 1.3025143818207686e-06, + "loss": 2.7492, + "step": 45160 + }, + { + "epoch": 2.8034639021664907, + "grad_norm": 0.13447520849926597, + "learning_rate": 1.3016955448186318e-06, + "loss": 2.7339, + "step": 45161 + }, + { + "epoch": 2.8035259792662486, + "grad_norm": 0.12810189544723802, + "learning_rate": 1.3008769618873596e-06, + "loss": 2.6624, + "step": 45162 + }, + { + "epoch": 2.8035880563660065, + "grad_norm": 0.13445379281418443, + "learning_rate": 1.3000586330312481e-06, + "loss": 2.6204, + "step": 45163 + }, + { + "epoch": 2.8036501334657644, + "grad_norm": 0.13164843188039008, + "learning_rate": 1.2992405582545441e-06, + "loss": 2.7505, + "step": 45164 + }, + { + "epoch": 2.8037122105655223, + "grad_norm": 0.13199135132338757, + "learning_rate": 1.2984227375615277e-06, + "loss": 2.8002, + "step": 45165 + }, + { + "epoch": 2.8037742876652803, + "grad_norm": 0.1306733545782684, + "learning_rate": 1.2976051709564675e-06, + "loss": 2.7034, + "step": 45166 + }, + { + "epoch": 2.803836364765038, + "grad_norm": 0.13995621517007034, + "learning_rate": 1.2967878584436155e-06, + "loss": 2.6885, + "step": 45167 + }, + { + "epoch": 2.803898441864796, + "grad_norm": 0.1434748161411121, + "learning_rate": 1.2959708000272407e-06, + "loss": 2.729, + "step": 45168 + }, + { + "epoch": 2.803960518964554, + "grad_norm": 0.13284715733211097, + "learning_rate": 1.2951539957116176e-06, + "loss": 2.6605, + "step": 45169 + }, + { + "epoch": 2.804022596064312, + "grad_norm": 0.13572080704519907, + "learning_rate": 1.2943374455010037e-06, + "loss": 2.7113, + "step": 45170 + }, + { + "epoch": 2.80408467316407, + "grad_norm": 0.13846531783318272, + "learning_rate": 1.2935211493996513e-06, + "loss": 2.7288, + "step": 45171 + }, + { + "epoch": 2.8041467502638278, + "grad_norm": 0.129194097208366, + "learning_rate": 1.292705107411818e-06, + "loss": 2.6193, + "step": 45172 + }, + { + "epoch": 2.8042088273635857, + "grad_norm": 0.13378645901339603, + "learning_rate": 1.291889319541767e-06, + "loss": 2.6248, + "step": 45173 + }, + { + "epoch": 2.8042709044633436, + "grad_norm": 0.13286642073937543, + "learning_rate": 1.2910737857937617e-06, + "loss": 2.621, + "step": 45174 + }, + { + "epoch": 2.8043329815631015, + "grad_norm": 0.13170871247253388, + "learning_rate": 1.290258506172043e-06, + "loss": 2.7555, + "step": 45175 + }, + { + "epoch": 2.8043950586628594, + "grad_norm": 0.13564869522064094, + "learning_rate": 1.2894434806808686e-06, + "loss": 2.7244, + "step": 45176 + }, + { + "epoch": 2.8044571357626173, + "grad_norm": 0.14413615946068178, + "learning_rate": 1.2886287093244965e-06, + "loss": 2.6772, + "step": 45177 + }, + { + "epoch": 2.804519212862375, + "grad_norm": 0.1357632970519254, + "learning_rate": 1.287814192107173e-06, + "loss": 2.7354, + "step": 45178 + }, + { + "epoch": 2.804581289962133, + "grad_norm": 0.1569644093005399, + "learning_rate": 1.2869999290331503e-06, + "loss": 2.7296, + "step": 45179 + }, + { + "epoch": 2.8046433670618907, + "grad_norm": 0.14622288570763617, + "learning_rate": 1.2861859201066694e-06, + "loss": 2.7047, + "step": 45180 + }, + { + "epoch": 2.804705444161649, + "grad_norm": 0.14037223457710404, + "learning_rate": 1.2853721653319829e-06, + "loss": 2.6362, + "step": 45181 + }, + { + "epoch": 2.8047675212614065, + "grad_norm": 0.15522679642069256, + "learning_rate": 1.2845586647133368e-06, + "loss": 2.6913, + "step": 45182 + }, + { + "epoch": 2.8048295983611644, + "grad_norm": 0.15568047679862412, + "learning_rate": 1.283745418254978e-06, + "loss": 2.7038, + "step": 45183 + }, + { + "epoch": 2.8048916754609223, + "grad_norm": 0.13720214400632022, + "learning_rate": 1.2829324259611419e-06, + "loss": 2.7074, + "step": 45184 + }, + { + "epoch": 2.8049537525606802, + "grad_norm": 0.14230558396674067, + "learning_rate": 1.2821196878360697e-06, + "loss": 2.6762, + "step": 45185 + }, + { + "epoch": 2.805015829660438, + "grad_norm": 0.12748382856156296, + "learning_rate": 1.2813072038840134e-06, + "loss": 2.5955, + "step": 45186 + }, + { + "epoch": 2.805077906760196, + "grad_norm": 0.14237216548593218, + "learning_rate": 1.2804949741092031e-06, + "loss": 2.6952, + "step": 45187 + }, + { + "epoch": 2.805139983859954, + "grad_norm": 0.13053892526899613, + "learning_rate": 1.2796829985158744e-06, + "loss": 2.7414, + "step": 45188 + }, + { + "epoch": 2.805202060959712, + "grad_norm": 0.13482461550445735, + "learning_rate": 1.2788712771082623e-06, + "loss": 2.7233, + "step": 45189 + }, + { + "epoch": 2.80526413805947, + "grad_norm": 0.14340045974290644, + "learning_rate": 1.2780598098906137e-06, + "loss": 2.7329, + "step": 45190 + }, + { + "epoch": 2.8053262151592278, + "grad_norm": 0.13136706864654762, + "learning_rate": 1.2772485968671588e-06, + "loss": 2.7328, + "step": 45191 + }, + { + "epoch": 2.8053882922589857, + "grad_norm": 0.13274406324777374, + "learning_rate": 1.276437638042116e-06, + "loss": 2.7061, + "step": 45192 + }, + { + "epoch": 2.8054503693587436, + "grad_norm": 0.13222548542119325, + "learning_rate": 1.2756269334197324e-06, + "loss": 2.6648, + "step": 45193 + }, + { + "epoch": 2.8055124464585015, + "grad_norm": 0.13120414381277892, + "learning_rate": 1.2748164830042263e-06, + "loss": 2.676, + "step": 45194 + }, + { + "epoch": 2.8055745235582594, + "grad_norm": 0.13757943627698174, + "learning_rate": 1.2740062867998337e-06, + "loss": 2.6454, + "step": 45195 + }, + { + "epoch": 2.8056366006580173, + "grad_norm": 0.13918901110866438, + "learning_rate": 1.2731963448107842e-06, + "loss": 2.6826, + "step": 45196 + }, + { + "epoch": 2.8056986777577753, + "grad_norm": 0.12849736388309763, + "learning_rate": 1.2723866570412912e-06, + "loss": 2.6391, + "step": 45197 + }, + { + "epoch": 2.805760754857533, + "grad_norm": 0.13627386598279656, + "learning_rate": 1.2715772234955847e-06, + "loss": 2.7365, + "step": 45198 + }, + { + "epoch": 2.805822831957291, + "grad_norm": 0.15748063876722304, + "learning_rate": 1.2707680441778946e-06, + "loss": 2.6113, + "step": 45199 + }, + { + "epoch": 2.805884909057049, + "grad_norm": 0.13165207510258833, + "learning_rate": 1.2699591190924286e-06, + "loss": 2.7733, + "step": 45200 + }, + { + "epoch": 2.8059469861568065, + "grad_norm": 0.13956213331665507, + "learning_rate": 1.2691504482434225e-06, + "loss": 2.8177, + "step": 45201 + }, + { + "epoch": 2.806009063256565, + "grad_norm": 0.1320798736401454, + "learning_rate": 1.268342031635089e-06, + "loss": 2.797, + "step": 45202 + }, + { + "epoch": 2.8060711403563223, + "grad_norm": 0.1351074431060565, + "learning_rate": 1.267533869271642e-06, + "loss": 2.6956, + "step": 45203 + }, + { + "epoch": 2.8061332174560807, + "grad_norm": 0.12971663779659864, + "learning_rate": 1.2667259611573056e-06, + "loss": 2.7798, + "step": 45204 + }, + { + "epoch": 2.806195294555838, + "grad_norm": 0.13614699154409568, + "learning_rate": 1.2659183072962877e-06, + "loss": 2.6596, + "step": 45205 + }, + { + "epoch": 2.8062573716555965, + "grad_norm": 0.14363673843068694, + "learning_rate": 1.2651109076928014e-06, + "loss": 2.7276, + "step": 45206 + }, + { + "epoch": 2.806319448755354, + "grad_norm": 0.13103714298204022, + "learning_rate": 1.2643037623510712e-06, + "loss": 2.7053, + "step": 45207 + }, + { + "epoch": 2.806381525855112, + "grad_norm": 0.13070832259839132, + "learning_rate": 1.263496871275288e-06, + "loss": 2.6242, + "step": 45208 + }, + { + "epoch": 2.80644360295487, + "grad_norm": 0.13108524358565612, + "learning_rate": 1.2626902344696767e-06, + "loss": 2.6654, + "step": 45209 + }, + { + "epoch": 2.8065056800546277, + "grad_norm": 0.1360595273261325, + "learning_rate": 1.2618838519384447e-06, + "loss": 2.6927, + "step": 45210 + }, + { + "epoch": 2.8065677571543857, + "grad_norm": 0.13320997310050173, + "learning_rate": 1.261077723685794e-06, + "loss": 2.7449, + "step": 45211 + }, + { + "epoch": 2.8066298342541436, + "grad_norm": 0.1314072526922991, + "learning_rate": 1.2602718497159327e-06, + "loss": 2.7634, + "step": 45212 + }, + { + "epoch": 2.8066919113539015, + "grad_norm": 0.13520076547827797, + "learning_rate": 1.2594662300330685e-06, + "loss": 2.7676, + "step": 45213 + }, + { + "epoch": 2.8067539884536594, + "grad_norm": 0.14717496211111553, + "learning_rate": 1.2586608646413977e-06, + "loss": 2.7197, + "step": 45214 + }, + { + "epoch": 2.8068160655534173, + "grad_norm": 0.14880816708809133, + "learning_rate": 1.2578557535451286e-06, + "loss": 2.6966, + "step": 45215 + }, + { + "epoch": 2.8068781426531753, + "grad_norm": 0.13091279832457578, + "learning_rate": 1.2570508967484573e-06, + "loss": 2.7144, + "step": 45216 + }, + { + "epoch": 2.806940219752933, + "grad_norm": 0.1358305691391969, + "learning_rate": 1.2562462942555752e-06, + "loss": 2.76, + "step": 45217 + }, + { + "epoch": 2.807002296852691, + "grad_norm": 0.1394815781895046, + "learning_rate": 1.2554419460707012e-06, + "loss": 2.794, + "step": 45218 + }, + { + "epoch": 2.807064373952449, + "grad_norm": 0.1297063991839692, + "learning_rate": 1.254637852198015e-06, + "loss": 2.6794, + "step": 45219 + }, + { + "epoch": 2.807126451052207, + "grad_norm": 0.1282544926721838, + "learning_rate": 1.2538340126417192e-06, + "loss": 2.7274, + "step": 45220 + }, + { + "epoch": 2.807188528151965, + "grad_norm": 0.14231096346532038, + "learning_rate": 1.253030427405999e-06, + "loss": 2.6415, + "step": 45221 + }, + { + "epoch": 2.8072506052517228, + "grad_norm": 0.1337679268507808, + "learning_rate": 1.2522270964950568e-06, + "loss": 2.6467, + "step": 45222 + }, + { + "epoch": 2.8073126823514807, + "grad_norm": 0.14809346485050115, + "learning_rate": 1.2514240199130833e-06, + "loss": 2.7016, + "step": 45223 + }, + { + "epoch": 2.8073747594512386, + "grad_norm": 0.13793264726711238, + "learning_rate": 1.250621197664259e-06, + "loss": 2.7368, + "step": 45224 + }, + { + "epoch": 2.8074368365509965, + "grad_norm": 0.1358318101603971, + "learning_rate": 1.2498186297527802e-06, + "loss": 2.6575, + "step": 45225 + }, + { + "epoch": 2.807498913650754, + "grad_norm": 0.1412195577668, + "learning_rate": 1.2490163161828328e-06, + "loss": 2.6664, + "step": 45226 + }, + { + "epoch": 2.8075609907505124, + "grad_norm": 0.1459272534920867, + "learning_rate": 1.248214256958602e-06, + "loss": 2.7412, + "step": 45227 + }, + { + "epoch": 2.80762306785027, + "grad_norm": 0.13021846652337662, + "learning_rate": 1.2474124520842734e-06, + "loss": 2.6481, + "step": 45228 + }, + { + "epoch": 2.807685144950028, + "grad_norm": 0.14390044839535984, + "learning_rate": 1.246610901564027e-06, + "loss": 2.7306, + "step": 45229 + }, + { + "epoch": 2.8077472220497857, + "grad_norm": 0.15498353778417998, + "learning_rate": 1.245809605402043e-06, + "loss": 2.7788, + "step": 45230 + }, + { + "epoch": 2.8078092991495436, + "grad_norm": 0.13614498724330373, + "learning_rate": 1.245008563602512e-06, + "loss": 2.7055, + "step": 45231 + }, + { + "epoch": 2.8078713762493015, + "grad_norm": 0.1451459006530239, + "learning_rate": 1.2442077761696037e-06, + "loss": 2.7374, + "step": 45232 + }, + { + "epoch": 2.8079334533490594, + "grad_norm": 0.1304332856272922, + "learning_rate": 1.2434072431075028e-06, + "loss": 2.6858, + "step": 45233 + }, + { + "epoch": 2.8079955304488173, + "grad_norm": 0.15305173639323805, + "learning_rate": 1.2426069644203787e-06, + "loss": 2.65, + "step": 45234 + }, + { + "epoch": 2.8080576075485753, + "grad_norm": 0.13750324651830198, + "learning_rate": 1.2418069401124111e-06, + "loss": 2.6973, + "step": 45235 + }, + { + "epoch": 2.808119684648333, + "grad_norm": 0.130922530008151, + "learning_rate": 1.2410071701877745e-06, + "loss": 2.714, + "step": 45236 + }, + { + "epoch": 2.808181761748091, + "grad_norm": 0.1345904012781159, + "learning_rate": 1.240207654650638e-06, + "loss": 2.6941, + "step": 45237 + }, + { + "epoch": 2.808243838847849, + "grad_norm": 0.1319246346956407, + "learning_rate": 1.2394083935051815e-06, + "loss": 2.7219, + "step": 45238 + }, + { + "epoch": 2.808305915947607, + "grad_norm": 0.13422682765594576, + "learning_rate": 1.238609386755557e-06, + "loss": 2.741, + "step": 45239 + }, + { + "epoch": 2.808367993047365, + "grad_norm": 0.128801910481515, + "learning_rate": 1.2378106344059503e-06, + "loss": 2.6569, + "step": 45240 + }, + { + "epoch": 2.8084300701471228, + "grad_norm": 0.13280233175958778, + "learning_rate": 1.2370121364605303e-06, + "loss": 2.6469, + "step": 45241 + }, + { + "epoch": 2.8084921472468807, + "grad_norm": 0.14482615000799776, + "learning_rate": 1.236213892923449e-06, + "loss": 2.6822, + "step": 45242 + }, + { + "epoch": 2.8085542243466386, + "grad_norm": 0.1357844924868833, + "learning_rate": 1.2354159037988756e-06, + "loss": 2.7535, + "step": 45243 + }, + { + "epoch": 2.8086163014463965, + "grad_norm": 0.12944851594868423, + "learning_rate": 1.2346181690909787e-06, + "loss": 2.618, + "step": 45244 + }, + { + "epoch": 2.8086783785461544, + "grad_norm": 0.13755813150085838, + "learning_rate": 1.233820688803916e-06, + "loss": 2.7339, + "step": 45245 + }, + { + "epoch": 2.8087404556459123, + "grad_norm": 0.12955733799254646, + "learning_rate": 1.2330234629418514e-06, + "loss": 2.719, + "step": 45246 + }, + { + "epoch": 2.8088025327456703, + "grad_norm": 0.13956721151063514, + "learning_rate": 1.2322264915089365e-06, + "loss": 2.717, + "step": 45247 + }, + { + "epoch": 2.808864609845428, + "grad_norm": 0.12952223180385375, + "learning_rate": 1.231429774509335e-06, + "loss": 2.6703, + "step": 45248 + }, + { + "epoch": 2.8089266869451857, + "grad_norm": 0.13930577472747915, + "learning_rate": 1.2306333119472102e-06, + "loss": 2.7042, + "step": 45249 + }, + { + "epoch": 2.808988764044944, + "grad_norm": 0.1356007092561079, + "learning_rate": 1.2298371038267087e-06, + "loss": 2.6201, + "step": 45250 + }, + { + "epoch": 2.8090508411447015, + "grad_norm": 0.13399327140166414, + "learning_rate": 1.2290411501519827e-06, + "loss": 2.6949, + "step": 45251 + }, + { + "epoch": 2.80911291824446, + "grad_norm": 0.1362720685768839, + "learning_rate": 1.22824545092719e-06, + "loss": 2.7218, + "step": 45252 + }, + { + "epoch": 2.8091749953442173, + "grad_norm": 0.13248850066290013, + "learning_rate": 1.2274500061564775e-06, + "loss": 2.6709, + "step": 45253 + }, + { + "epoch": 2.8092370724439757, + "grad_norm": 0.14191147006770025, + "learning_rate": 1.2266548158440028e-06, + "loss": 2.7252, + "step": 45254 + }, + { + "epoch": 2.809299149543733, + "grad_norm": 0.12834094823682188, + "learning_rate": 1.2258598799939125e-06, + "loss": 2.6664, + "step": 45255 + }, + { + "epoch": 2.809361226643491, + "grad_norm": 0.13612117279730904, + "learning_rate": 1.225065198610348e-06, + "loss": 2.7297, + "step": 45256 + }, + { + "epoch": 2.809423303743249, + "grad_norm": 0.1317650003675133, + "learning_rate": 1.2242707716974554e-06, + "loss": 2.7256, + "step": 45257 + }, + { + "epoch": 2.809485380843007, + "grad_norm": 0.1446863064397436, + "learning_rate": 1.2234765992593877e-06, + "loss": 2.6561, + "step": 45258 + }, + { + "epoch": 2.809547457942765, + "grad_norm": 0.12942211648811894, + "learning_rate": 1.2226826813002856e-06, + "loss": 2.6358, + "step": 45259 + }, + { + "epoch": 2.8096095350425228, + "grad_norm": 0.1313279856874543, + "learning_rate": 1.2218890178242848e-06, + "loss": 2.6942, + "step": 45260 + }, + { + "epoch": 2.8096716121422807, + "grad_norm": 0.16240754339963054, + "learning_rate": 1.2210956088355262e-06, + "loss": 2.7336, + "step": 45261 + }, + { + "epoch": 2.8097336892420386, + "grad_norm": 0.13085293398136158, + "learning_rate": 1.2203024543381624e-06, + "loss": 2.707, + "step": 45262 + }, + { + "epoch": 2.8097957663417965, + "grad_norm": 0.1435744343365382, + "learning_rate": 1.2195095543363177e-06, + "loss": 2.6437, + "step": 45263 + }, + { + "epoch": 2.8098578434415544, + "grad_norm": 0.13105036903236242, + "learning_rate": 1.2187169088341333e-06, + "loss": 2.7086, + "step": 45264 + }, + { + "epoch": 2.8099199205413123, + "grad_norm": 0.13180015926192276, + "learning_rate": 1.2179245178357447e-06, + "loss": 2.6899, + "step": 45265 + }, + { + "epoch": 2.8099819976410703, + "grad_norm": 0.13437626056301663, + "learning_rate": 1.2171323813452873e-06, + "loss": 2.6389, + "step": 45266 + }, + { + "epoch": 2.810044074740828, + "grad_norm": 0.13283469912185017, + "learning_rate": 1.216340499366886e-06, + "loss": 2.7168, + "step": 45267 + }, + { + "epoch": 2.810106151840586, + "grad_norm": 0.13327572485773698, + "learning_rate": 1.215548871904687e-06, + "loss": 2.6815, + "step": 45268 + }, + { + "epoch": 2.810168228940344, + "grad_norm": 0.14144990626484072, + "learning_rate": 1.2147574989628151e-06, + "loss": 2.6716, + "step": 45269 + }, + { + "epoch": 2.810230306040102, + "grad_norm": 0.14169547814151665, + "learning_rate": 1.2139663805453894e-06, + "loss": 2.7724, + "step": 45270 + }, + { + "epoch": 2.81029238313986, + "grad_norm": 0.1461985469748356, + "learning_rate": 1.213175516656545e-06, + "loss": 2.7264, + "step": 45271 + }, + { + "epoch": 2.8103544602396178, + "grad_norm": 0.143261876850522, + "learning_rate": 1.2123849073004123e-06, + "loss": 2.7305, + "step": 45272 + }, + { + "epoch": 2.8104165373393757, + "grad_norm": 0.13262854486747153, + "learning_rate": 1.2115945524811102e-06, + "loss": 2.68, + "step": 45273 + }, + { + "epoch": 2.810478614439133, + "grad_norm": 0.1577277104470457, + "learning_rate": 1.2108044522027572e-06, + "loss": 2.6449, + "step": 45274 + }, + { + "epoch": 2.8105406915388915, + "grad_norm": 0.13819211252865943, + "learning_rate": 1.210014606469484e-06, + "loss": 2.6039, + "step": 45275 + }, + { + "epoch": 2.810602768638649, + "grad_norm": 0.1299164663899101, + "learning_rate": 1.209225015285409e-06, + "loss": 2.6954, + "step": 45276 + }, + { + "epoch": 2.8106648457384074, + "grad_norm": 0.1382346513865491, + "learning_rate": 1.208435678654657e-06, + "loss": 2.6838, + "step": 45277 + }, + { + "epoch": 2.810726922838165, + "grad_norm": 0.13463042567856853, + "learning_rate": 1.2076465965813356e-06, + "loss": 2.6608, + "step": 45278 + }, + { + "epoch": 2.8107889999379227, + "grad_norm": 0.13501614629695394, + "learning_rate": 1.2068577690695692e-06, + "loss": 2.7189, + "step": 45279 + }, + { + "epoch": 2.8108510770376807, + "grad_norm": 0.14927328870616863, + "learning_rate": 1.2060691961234661e-06, + "loss": 2.7151, + "step": 45280 + }, + { + "epoch": 2.8109131541374386, + "grad_norm": 0.13386577996169904, + "learning_rate": 1.2052808777471503e-06, + "loss": 2.7818, + "step": 45281 + }, + { + "epoch": 2.8109752312371965, + "grad_norm": 0.14700924617491742, + "learning_rate": 1.2044928139447242e-06, + "loss": 2.7525, + "step": 45282 + }, + { + "epoch": 2.8110373083369544, + "grad_norm": 0.127490191421297, + "learning_rate": 1.2037050047203125e-06, + "loss": 2.653, + "step": 45283 + }, + { + "epoch": 2.8110993854367123, + "grad_norm": 0.14059985783781404, + "learning_rate": 1.202917450078006e-06, + "loss": 2.7175, + "step": 45284 + }, + { + "epoch": 2.8111614625364703, + "grad_norm": 0.13290559086747175, + "learning_rate": 1.202130150021935e-06, + "loss": 2.616, + "step": 45285 + }, + { + "epoch": 2.811223539636228, + "grad_norm": 0.13951674167008019, + "learning_rate": 1.2013431045561908e-06, + "loss": 2.6917, + "step": 45286 + }, + { + "epoch": 2.811285616735986, + "grad_norm": 0.13369432283150104, + "learning_rate": 1.2005563136848918e-06, + "loss": 2.7502, + "step": 45287 + }, + { + "epoch": 2.811347693835744, + "grad_norm": 0.13310121961879615, + "learning_rate": 1.1997697774121297e-06, + "loss": 2.6168, + "step": 45288 + }, + { + "epoch": 2.811409770935502, + "grad_norm": 0.13396484076898563, + "learning_rate": 1.1989834957420231e-06, + "loss": 2.8469, + "step": 45289 + }, + { + "epoch": 2.81147184803526, + "grad_norm": 0.13286039947218864, + "learning_rate": 1.1981974686786635e-06, + "loss": 2.6723, + "step": 45290 + }, + { + "epoch": 2.8115339251350178, + "grad_norm": 0.14150366194067504, + "learning_rate": 1.1974116962261528e-06, + "loss": 2.6873, + "step": 45291 + }, + { + "epoch": 2.8115960022347757, + "grad_norm": 0.15996565177221372, + "learning_rate": 1.1966261783885935e-06, + "loss": 2.7524, + "step": 45292 + }, + { + "epoch": 2.8116580793345336, + "grad_norm": 0.12868046493902216, + "learning_rate": 1.1958409151700822e-06, + "loss": 2.6903, + "step": 45293 + }, + { + "epoch": 2.8117201564342915, + "grad_norm": 0.13851233828626147, + "learning_rate": 1.1950559065747158e-06, + "loss": 2.7037, + "step": 45294 + }, + { + "epoch": 2.8117822335340494, + "grad_norm": 0.1552175625248202, + "learning_rate": 1.1942711526065964e-06, + "loss": 2.7694, + "step": 45295 + }, + { + "epoch": 2.8118443106338074, + "grad_norm": 0.13370812188275716, + "learning_rate": 1.1934866532698097e-06, + "loss": 2.7119, + "step": 45296 + }, + { + "epoch": 2.811906387733565, + "grad_norm": 0.12834840783406704, + "learning_rate": 1.1927024085684469e-06, + "loss": 2.7222, + "step": 45297 + }, + { + "epoch": 2.811968464833323, + "grad_norm": 0.15665594059373125, + "learning_rate": 1.191918418506599e-06, + "loss": 2.6068, + "step": 45298 + }, + { + "epoch": 2.8120305419330807, + "grad_norm": 0.16139268437753074, + "learning_rate": 1.191134683088374e-06, + "loss": 2.7663, + "step": 45299 + }, + { + "epoch": 2.812092619032839, + "grad_norm": 0.142447300554532, + "learning_rate": 1.190351202317841e-06, + "loss": 2.687, + "step": 45300 + }, + { + "epoch": 2.8121546961325965, + "grad_norm": 0.14697275763726778, + "learning_rate": 1.189567976199102e-06, + "loss": 2.7212, + "step": 45301 + }, + { + "epoch": 2.812216773232355, + "grad_norm": 0.13111661422905108, + "learning_rate": 1.1887850047362315e-06, + "loss": 2.6954, + "step": 45302 + }, + { + "epoch": 2.8122788503321123, + "grad_norm": 0.14944007746788274, + "learning_rate": 1.1880022879333151e-06, + "loss": 2.7772, + "step": 45303 + }, + { + "epoch": 2.8123409274318703, + "grad_norm": 0.1401804945356512, + "learning_rate": 1.1872198257944445e-06, + "loss": 2.6819, + "step": 45304 + }, + { + "epoch": 2.812403004531628, + "grad_norm": 0.12708791766210148, + "learning_rate": 1.1864376183236992e-06, + "loss": 2.697, + "step": 45305 + }, + { + "epoch": 2.812465081631386, + "grad_norm": 0.12972268293737074, + "learning_rate": 1.1856556655251538e-06, + "loss": 2.7324, + "step": 45306 + }, + { + "epoch": 2.812527158731144, + "grad_norm": 0.12748539921662033, + "learning_rate": 1.1848739674028942e-06, + "loss": 2.6259, + "step": 45307 + }, + { + "epoch": 2.812589235830902, + "grad_norm": 0.12851061216972706, + "learning_rate": 1.184092523961e-06, + "loss": 2.6685, + "step": 45308 + }, + { + "epoch": 2.81265131293066, + "grad_norm": 0.13530411459729202, + "learning_rate": 1.1833113352035464e-06, + "loss": 2.767, + "step": 45309 + }, + { + "epoch": 2.8127133900304178, + "grad_norm": 0.14612274035280345, + "learning_rate": 1.1825304011346073e-06, + "loss": 2.7222, + "step": 45310 + }, + { + "epoch": 2.8127754671301757, + "grad_norm": 0.12841675761416832, + "learning_rate": 1.181749721758252e-06, + "loss": 2.668, + "step": 45311 + }, + { + "epoch": 2.8128375442299336, + "grad_norm": 0.13352391497934132, + "learning_rate": 1.1809692970785658e-06, + "loss": 2.6797, + "step": 45312 + }, + { + "epoch": 2.8128996213296915, + "grad_norm": 0.1337223722097001, + "learning_rate": 1.1801891270996123e-06, + "loss": 2.6767, + "step": 45313 + }, + { + "epoch": 2.8129616984294494, + "grad_norm": 0.14272354496217363, + "learning_rate": 1.1794092118254662e-06, + "loss": 2.7397, + "step": 45314 + }, + { + "epoch": 2.8130237755292073, + "grad_norm": 0.1485766147938342, + "learning_rate": 1.1786295512601907e-06, + "loss": 2.6765, + "step": 45315 + }, + { + "epoch": 2.8130858526289653, + "grad_norm": 0.12951662313477302, + "learning_rate": 1.177850145407855e-06, + "loss": 2.644, + "step": 45316 + }, + { + "epoch": 2.813147929728723, + "grad_norm": 0.13253981983835264, + "learning_rate": 1.1770709942725332e-06, + "loss": 2.6944, + "step": 45317 + }, + { + "epoch": 2.813210006828481, + "grad_norm": 0.14023652991549188, + "learning_rate": 1.176292097858278e-06, + "loss": 2.7407, + "step": 45318 + }, + { + "epoch": 2.813272083928239, + "grad_norm": 0.13362732738577002, + "learning_rate": 1.1755134561691638e-06, + "loss": 2.7121, + "step": 45319 + }, + { + "epoch": 2.813334161027997, + "grad_norm": 0.14747746482339072, + "learning_rate": 1.174735069209243e-06, + "loss": 2.7366, + "step": 45320 + }, + { + "epoch": 2.813396238127755, + "grad_norm": 0.1344271361875473, + "learning_rate": 1.1739569369825897e-06, + "loss": 2.6878, + "step": 45321 + }, + { + "epoch": 2.8134583152275123, + "grad_norm": 0.14573057136058606, + "learning_rate": 1.1731790594932512e-06, + "loss": 2.7084, + "step": 45322 + }, + { + "epoch": 2.8135203923272707, + "grad_norm": 0.13283662016011757, + "learning_rate": 1.1724014367452906e-06, + "loss": 2.7092, + "step": 45323 + }, + { + "epoch": 2.813582469427028, + "grad_norm": 0.13575242367577756, + "learning_rate": 1.171624068742766e-06, + "loss": 2.7625, + "step": 45324 + }, + { + "epoch": 2.8136445465267865, + "grad_norm": 0.1463018233535845, + "learning_rate": 1.1708469554897295e-06, + "loss": 2.7206, + "step": 45325 + }, + { + "epoch": 2.813706623626544, + "grad_norm": 0.13004803008685223, + "learning_rate": 1.1700700969902446e-06, + "loss": 2.62, + "step": 45326 + }, + { + "epoch": 2.813768700726302, + "grad_norm": 0.1405745389259846, + "learning_rate": 1.1692934932483524e-06, + "loss": 2.7524, + "step": 45327 + }, + { + "epoch": 2.81383077782606, + "grad_norm": 0.15246692592931604, + "learning_rate": 1.168517144268111e-06, + "loss": 2.6503, + "step": 45328 + }, + { + "epoch": 2.8138928549258178, + "grad_norm": 0.1351849969159512, + "learning_rate": 1.1677410500535669e-06, + "loss": 2.706, + "step": 45329 + }, + { + "epoch": 2.8139549320255757, + "grad_norm": 0.14238683723653456, + "learning_rate": 1.166965210608778e-06, + "loss": 2.7413, + "step": 45330 + }, + { + "epoch": 2.8140170091253336, + "grad_norm": 0.13424856397679458, + "learning_rate": 1.1661896259377801e-06, + "loss": 2.7282, + "step": 45331 + }, + { + "epoch": 2.8140790862250915, + "grad_norm": 0.13178846422865023, + "learning_rate": 1.1654142960446313e-06, + "loss": 2.815, + "step": 45332 + }, + { + "epoch": 2.8141411633248494, + "grad_norm": 0.12916818651207576, + "learning_rate": 1.1646392209333723e-06, + "loss": 2.686, + "step": 45333 + }, + { + "epoch": 2.8142032404246073, + "grad_norm": 0.13206305543987637, + "learning_rate": 1.163864400608039e-06, + "loss": 2.742, + "step": 45334 + }, + { + "epoch": 2.8142653175243653, + "grad_norm": 0.1400863078291552, + "learning_rate": 1.1630898350726837e-06, + "loss": 2.7535, + "step": 45335 + }, + { + "epoch": 2.814327394624123, + "grad_norm": 0.13121899965302933, + "learning_rate": 1.1623155243313477e-06, + "loss": 2.7047, + "step": 45336 + }, + { + "epoch": 2.814389471723881, + "grad_norm": 0.13516267391880876, + "learning_rate": 1.1615414683880665e-06, + "loss": 2.7091, + "step": 45337 + }, + { + "epoch": 2.814451548823639, + "grad_norm": 0.13109548817642805, + "learning_rate": 1.1607676672468704e-06, + "loss": 2.7596, + "step": 45338 + }, + { + "epoch": 2.814513625923397, + "grad_norm": 0.13148999921224283, + "learning_rate": 1.159994120911817e-06, + "loss": 2.6958, + "step": 45339 + }, + { + "epoch": 2.814575703023155, + "grad_norm": 0.13556943507809957, + "learning_rate": 1.1592208293869256e-06, + "loss": 2.7283, + "step": 45340 + }, + { + "epoch": 2.8146377801229128, + "grad_norm": 0.1299781449451981, + "learning_rate": 1.1584477926762372e-06, + "loss": 2.6649, + "step": 45341 + }, + { + "epoch": 2.8146998572226707, + "grad_norm": 0.14868994002206196, + "learning_rate": 1.157675010783782e-06, + "loss": 2.7331, + "step": 45342 + }, + { + "epoch": 2.8147619343224286, + "grad_norm": 0.13907890624874858, + "learning_rate": 1.15690248371359e-06, + "loss": 2.7791, + "step": 45343 + }, + { + "epoch": 2.8148240114221865, + "grad_norm": 0.13240170761295464, + "learning_rate": 1.1561302114697025e-06, + "loss": 2.7139, + "step": 45344 + }, + { + "epoch": 2.814886088521944, + "grad_norm": 0.12782798976352477, + "learning_rate": 1.1553581940561388e-06, + "loss": 2.6446, + "step": 45345 + }, + { + "epoch": 2.8149481656217024, + "grad_norm": 0.12921768194954206, + "learning_rate": 1.154586431476923e-06, + "loss": 2.6205, + "step": 45346 + }, + { + "epoch": 2.81501024272146, + "grad_norm": 0.14255261519596152, + "learning_rate": 1.1538149237360906e-06, + "loss": 2.7298, + "step": 45347 + }, + { + "epoch": 2.815072319821218, + "grad_norm": 0.1454912022576379, + "learning_rate": 1.153043670837667e-06, + "loss": 2.6793, + "step": 45348 + }, + { + "epoch": 2.8151343969209757, + "grad_norm": 0.12918516530989474, + "learning_rate": 1.1522726727856702e-06, + "loss": 2.687, + "step": 45349 + }, + { + "epoch": 2.8151964740207336, + "grad_norm": 0.13856546590742624, + "learning_rate": 1.1515019295841256e-06, + "loss": 2.7147, + "step": 45350 + }, + { + "epoch": 2.8152585511204915, + "grad_norm": 0.15089282493216105, + "learning_rate": 1.1507314412370518e-06, + "loss": 2.6686, + "step": 45351 + }, + { + "epoch": 2.8153206282202494, + "grad_norm": 0.12797302502115895, + "learning_rate": 1.1499612077484677e-06, + "loss": 2.721, + "step": 45352 + }, + { + "epoch": 2.8153827053200073, + "grad_norm": 0.1470802964852925, + "learning_rate": 1.1491912291224039e-06, + "loss": 2.7322, + "step": 45353 + }, + { + "epoch": 2.8154447824197653, + "grad_norm": 0.14519476658777747, + "learning_rate": 1.1484215053628622e-06, + "loss": 2.6673, + "step": 45354 + }, + { + "epoch": 2.815506859519523, + "grad_norm": 0.13342430344303824, + "learning_rate": 1.1476520364738674e-06, + "loss": 2.6857, + "step": 45355 + }, + { + "epoch": 2.815568936619281, + "grad_norm": 0.13883613206981676, + "learning_rate": 1.1468828224594274e-06, + "loss": 2.6667, + "step": 45356 + }, + { + "epoch": 2.815631013719039, + "grad_norm": 0.15130543833660698, + "learning_rate": 1.1461138633235612e-06, + "loss": 2.7159, + "step": 45357 + }, + { + "epoch": 2.815693090818797, + "grad_norm": 0.13660537923837368, + "learning_rate": 1.145345159070277e-06, + "loss": 2.6722, + "step": 45358 + }, + { + "epoch": 2.815755167918555, + "grad_norm": 0.13254747876136927, + "learning_rate": 1.1445767097035931e-06, + "loss": 2.6453, + "step": 45359 + }, + { + "epoch": 2.8158172450183128, + "grad_norm": 0.14293122007429815, + "learning_rate": 1.1438085152275012e-06, + "loss": 2.6051, + "step": 45360 + }, + { + "epoch": 2.8158793221180707, + "grad_norm": 0.13286145794521104, + "learning_rate": 1.1430405756460316e-06, + "loss": 2.7025, + "step": 45361 + }, + { + "epoch": 2.8159413992178286, + "grad_norm": 0.13458251953125, + "learning_rate": 1.1422728909631753e-06, + "loss": 2.6409, + "step": 45362 + }, + { + "epoch": 2.8160034763175865, + "grad_norm": 0.14308594059299776, + "learning_rate": 1.14150546118294e-06, + "loss": 2.7203, + "step": 45363 + }, + { + "epoch": 2.8160655534173444, + "grad_norm": 0.13058387165906005, + "learning_rate": 1.140738286309334e-06, + "loss": 2.6355, + "step": 45364 + }, + { + "epoch": 2.8161276305171024, + "grad_norm": 0.1381741308200472, + "learning_rate": 1.1399713663463596e-06, + "loss": 2.6136, + "step": 45365 + }, + { + "epoch": 2.8161897076168603, + "grad_norm": 0.14720670624371018, + "learning_rate": 1.1392047012980078e-06, + "loss": 2.7523, + "step": 45366 + }, + { + "epoch": 2.816251784716618, + "grad_norm": 0.14170770941757452, + "learning_rate": 1.1384382911682979e-06, + "loss": 2.6979, + "step": 45367 + }, + { + "epoch": 2.8163138618163757, + "grad_norm": 0.1316442225952279, + "learning_rate": 1.1376721359612098e-06, + "loss": 2.6868, + "step": 45368 + }, + { + "epoch": 2.816375938916134, + "grad_norm": 0.13062684907699595, + "learning_rate": 1.1369062356807514e-06, + "loss": 2.682, + "step": 45369 + }, + { + "epoch": 2.8164380160158915, + "grad_norm": 0.13104804515350305, + "learning_rate": 1.1361405903309085e-06, + "loss": 2.7331, + "step": 45370 + }, + { + "epoch": 2.81650009311565, + "grad_norm": 0.14057255800245486, + "learning_rate": 1.135375199915689e-06, + "loss": 2.7222, + "step": 45371 + }, + { + "epoch": 2.8165621702154073, + "grad_norm": 0.13026336619553397, + "learning_rate": 1.134610064439079e-06, + "loss": 2.7782, + "step": 45372 + }, + { + "epoch": 2.8166242473151657, + "grad_norm": 0.13052456284431355, + "learning_rate": 1.133845183905069e-06, + "loss": 2.6814, + "step": 45373 + }, + { + "epoch": 2.816686324414923, + "grad_norm": 0.15033305292326277, + "learning_rate": 1.1330805583176452e-06, + "loss": 2.6835, + "step": 45374 + }, + { + "epoch": 2.816748401514681, + "grad_norm": 0.13902731461034437, + "learning_rate": 1.1323161876808096e-06, + "loss": 2.7384, + "step": 45375 + }, + { + "epoch": 2.816810478614439, + "grad_norm": 0.13209342536543445, + "learning_rate": 1.1315520719985485e-06, + "loss": 2.709, + "step": 45376 + }, + { + "epoch": 2.816872555714197, + "grad_norm": 0.1460347586883117, + "learning_rate": 1.1307882112748359e-06, + "loss": 2.5876, + "step": 45377 + }, + { + "epoch": 2.816934632813955, + "grad_norm": 0.13521408706799354, + "learning_rate": 1.1300246055136687e-06, + "loss": 2.6801, + "step": 45378 + }, + { + "epoch": 2.8169967099137128, + "grad_norm": 0.12924653712804274, + "learning_rate": 1.1292612547190217e-06, + "loss": 2.6876, + "step": 45379 + }, + { + "epoch": 2.8170587870134707, + "grad_norm": 0.13321997740070723, + "learning_rate": 1.1284981588948862e-06, + "loss": 2.6705, + "step": 45380 + }, + { + "epoch": 2.8171208641132286, + "grad_norm": 0.14128444942529697, + "learning_rate": 1.1277353180452422e-06, + "loss": 2.6529, + "step": 45381 + }, + { + "epoch": 2.8171829412129865, + "grad_norm": 0.13276224026849884, + "learning_rate": 1.1269727321740642e-06, + "loss": 2.6402, + "step": 45382 + }, + { + "epoch": 2.8172450183127444, + "grad_norm": 0.14319165056358013, + "learning_rate": 1.1262104012853324e-06, + "loss": 2.7012, + "step": 45383 + }, + { + "epoch": 2.8173070954125023, + "grad_norm": 0.12919911437199788, + "learning_rate": 1.1254483253830272e-06, + "loss": 2.7645, + "step": 45384 + }, + { + "epoch": 2.8173691725122603, + "grad_norm": 0.13147455063153912, + "learning_rate": 1.124686504471123e-06, + "loss": 2.6672, + "step": 45385 + }, + { + "epoch": 2.817431249612018, + "grad_norm": 0.13843388350350486, + "learning_rate": 1.1239249385535945e-06, + "loss": 2.6858, + "step": 45386 + }, + { + "epoch": 2.817493326711776, + "grad_norm": 0.15384404818304293, + "learning_rate": 1.123163627634416e-06, + "loss": 2.7028, + "step": 45387 + }, + { + "epoch": 2.817555403811534, + "grad_norm": 0.1488762820170219, + "learning_rate": 1.1224025717175512e-06, + "loss": 2.7717, + "step": 45388 + }, + { + "epoch": 2.817617480911292, + "grad_norm": 0.13895511579744269, + "learning_rate": 1.1216417708069805e-06, + "loss": 2.7375, + "step": 45389 + }, + { + "epoch": 2.81767955801105, + "grad_norm": 0.14316646475032, + "learning_rate": 1.1208812249066725e-06, + "loss": 2.806, + "step": 45390 + }, + { + "epoch": 2.8177416351108078, + "grad_norm": 0.14030014371308902, + "learning_rate": 1.1201209340205965e-06, + "loss": 2.6208, + "step": 45391 + }, + { + "epoch": 2.8178037122105657, + "grad_norm": 0.14185151368943055, + "learning_rate": 1.1193608981527104e-06, + "loss": 2.7358, + "step": 45392 + }, + { + "epoch": 2.817865789310323, + "grad_norm": 0.13426568414177842, + "learning_rate": 1.1186011173069833e-06, + "loss": 2.7204, + "step": 45393 + }, + { + "epoch": 2.8179278664100815, + "grad_norm": 0.12899214599277858, + "learning_rate": 1.1178415914873841e-06, + "loss": 2.6067, + "step": 45394 + }, + { + "epoch": 2.817989943509839, + "grad_norm": 0.1306822346479957, + "learning_rate": 1.117082320697871e-06, + "loss": 2.6998, + "step": 45395 + }, + { + "epoch": 2.8180520206095974, + "grad_norm": 0.13009558723704664, + "learning_rate": 1.1163233049424072e-06, + "loss": 2.6815, + "step": 45396 + }, + { + "epoch": 2.818114097709355, + "grad_norm": 0.1531096426405457, + "learning_rate": 1.1155645442249451e-06, + "loss": 2.7205, + "step": 45397 + }, + { + "epoch": 2.8181761748091128, + "grad_norm": 0.1324657865315649, + "learning_rate": 1.1148060385494542e-06, + "loss": 2.7111, + "step": 45398 + }, + { + "epoch": 2.8182382519088707, + "grad_norm": 0.1334223559650455, + "learning_rate": 1.1140477879198918e-06, + "loss": 2.6511, + "step": 45399 + }, + { + "epoch": 2.8183003290086286, + "grad_norm": 0.13131975920001943, + "learning_rate": 1.1132897923402053e-06, + "loss": 2.6759, + "step": 45400 + }, + { + "epoch": 2.8183624061083865, + "grad_norm": 0.13978353381364655, + "learning_rate": 1.1125320518143579e-06, + "loss": 2.7193, + "step": 45401 + }, + { + "epoch": 2.8184244832081444, + "grad_norm": 0.13103125088339398, + "learning_rate": 1.111774566346291e-06, + "loss": 2.72, + "step": 45402 + }, + { + "epoch": 2.8184865603079023, + "grad_norm": 0.13213624306529237, + "learning_rate": 1.1110173359399733e-06, + "loss": 2.6315, + "step": 45403 + }, + { + "epoch": 2.8185486374076603, + "grad_norm": 0.14208965367045584, + "learning_rate": 1.1102603605993412e-06, + "loss": 2.6922, + "step": 45404 + }, + { + "epoch": 2.818610714507418, + "grad_norm": 0.13410930175628138, + "learning_rate": 1.1095036403283521e-06, + "loss": 2.6674, + "step": 45405 + }, + { + "epoch": 2.818672791607176, + "grad_norm": 0.14882646594787388, + "learning_rate": 1.108747175130942e-06, + "loss": 2.6694, + "step": 45406 + }, + { + "epoch": 2.818734868706934, + "grad_norm": 0.12713036269530023, + "learning_rate": 1.1079909650110743e-06, + "loss": 2.6293, + "step": 45407 + }, + { + "epoch": 2.818796945806692, + "grad_norm": 0.1276410099933864, + "learning_rate": 1.1072350099726903e-06, + "loss": 2.6255, + "step": 45408 + }, + { + "epoch": 2.81885902290645, + "grad_norm": 0.14483467676494793, + "learning_rate": 1.1064793100197257e-06, + "loss": 2.6609, + "step": 45409 + }, + { + "epoch": 2.8189211000062078, + "grad_norm": 0.13082539416125677, + "learning_rate": 1.1057238651561219e-06, + "loss": 2.6097, + "step": 45410 + }, + { + "epoch": 2.8189831771059657, + "grad_norm": 0.13157154698877455, + "learning_rate": 1.1049686753858368e-06, + "loss": 2.7238, + "step": 45411 + }, + { + "epoch": 2.8190452542057236, + "grad_norm": 0.14323620923385597, + "learning_rate": 1.104213740712795e-06, + "loss": 2.8037, + "step": 45412 + }, + { + "epoch": 2.8191073313054815, + "grad_norm": 0.1336502134339255, + "learning_rate": 1.1034590611409435e-06, + "loss": 2.7869, + "step": 45413 + }, + { + "epoch": 2.8191694084052394, + "grad_norm": 0.13469898262331786, + "learning_rate": 1.102704636674212e-06, + "loss": 2.729, + "step": 45414 + }, + { + "epoch": 2.8192314855049974, + "grad_norm": 0.14085903164509136, + "learning_rate": 1.101950467316537e-06, + "loss": 2.6885, + "step": 45415 + }, + { + "epoch": 2.819293562604755, + "grad_norm": 0.12767182635664814, + "learning_rate": 1.1011965530718648e-06, + "loss": 2.6517, + "step": 45416 + }, + { + "epoch": 2.819355639704513, + "grad_norm": 0.13548992923098918, + "learning_rate": 1.1004428939441202e-06, + "loss": 2.6452, + "step": 45417 + }, + { + "epoch": 2.8194177168042707, + "grad_norm": 0.1454952861847356, + "learning_rate": 1.0996894899372334e-06, + "loss": 2.7552, + "step": 45418 + }, + { + "epoch": 2.819479793904029, + "grad_norm": 0.13017218485369383, + "learning_rate": 1.0989363410551346e-06, + "loss": 2.7595, + "step": 45419 + }, + { + "epoch": 2.8195418710037865, + "grad_norm": 0.1422740031252022, + "learning_rate": 1.0981834473017594e-06, + "loss": 2.7415, + "step": 45420 + }, + { + "epoch": 2.819603948103545, + "grad_norm": 0.14546423137788614, + "learning_rate": 1.0974308086810325e-06, + "loss": 2.7343, + "step": 45421 + }, + { + "epoch": 2.8196660252033023, + "grad_norm": 0.13937145053295355, + "learning_rate": 1.0966784251968786e-06, + "loss": 2.6955, + "step": 45422 + }, + { + "epoch": 2.8197281023030603, + "grad_norm": 0.12693563487543544, + "learning_rate": 1.095926296853228e-06, + "loss": 2.6733, + "step": 45423 + }, + { + "epoch": 2.819790179402818, + "grad_norm": 0.14248127456656795, + "learning_rate": 1.0951744236539941e-06, + "loss": 2.7346, + "step": 45424 + }, + { + "epoch": 2.819852256502576, + "grad_norm": 0.13885049330618926, + "learning_rate": 1.094422805603118e-06, + "loss": 2.6776, + "step": 45425 + }, + { + "epoch": 2.819914333602334, + "grad_norm": 0.13752537244005364, + "learning_rate": 1.0936714427045025e-06, + "loss": 2.6235, + "step": 45426 + }, + { + "epoch": 2.819976410702092, + "grad_norm": 0.14844344779696902, + "learning_rate": 1.0929203349620831e-06, + "loss": 2.6704, + "step": 45427 + }, + { + "epoch": 2.82003848780185, + "grad_norm": 0.1356331229304401, + "learning_rate": 1.0921694823797623e-06, + "loss": 2.7102, + "step": 45428 + }, + { + "epoch": 2.8201005649016078, + "grad_norm": 0.13890113147449143, + "learning_rate": 1.091418884961465e-06, + "loss": 2.6652, + "step": 45429 + }, + { + "epoch": 2.8201626420013657, + "grad_norm": 0.14062814576551128, + "learning_rate": 1.090668542711115e-06, + "loss": 2.733, + "step": 45430 + }, + { + "epoch": 2.8202247191011236, + "grad_norm": 0.1287765500669662, + "learning_rate": 1.0899184556326214e-06, + "loss": 2.6295, + "step": 45431 + }, + { + "epoch": 2.8202867962008815, + "grad_norm": 0.13228702374318244, + "learning_rate": 1.089168623729897e-06, + "loss": 2.6914, + "step": 45432 + }, + { + "epoch": 2.8203488733006394, + "grad_norm": 0.1561667578213316, + "learning_rate": 1.0884190470068444e-06, + "loss": 2.7599, + "step": 45433 + }, + { + "epoch": 2.8204109504003974, + "grad_norm": 0.1511515773778904, + "learning_rate": 1.0876697254673885e-06, + "loss": 2.6834, + "step": 45434 + }, + { + "epoch": 2.8204730275001553, + "grad_norm": 0.1461242381325661, + "learning_rate": 1.0869206591154368e-06, + "loss": 2.7315, + "step": 45435 + }, + { + "epoch": 2.820535104599913, + "grad_norm": 0.13371479451230223, + "learning_rate": 1.0861718479548921e-06, + "loss": 2.7265, + "step": 45436 + }, + { + "epoch": 2.820597181699671, + "grad_norm": 0.13431712861648287, + "learning_rate": 1.0854232919896624e-06, + "loss": 2.6731, + "step": 45437 + }, + { + "epoch": 2.820659258799429, + "grad_norm": 0.13998297654901196, + "learning_rate": 1.0846749912236554e-06, + "loss": 2.6107, + "step": 45438 + }, + { + "epoch": 2.820721335899187, + "grad_norm": 0.14320554249455988, + "learning_rate": 1.0839269456607737e-06, + "loss": 2.6664, + "step": 45439 + }, + { + "epoch": 2.820783412998945, + "grad_norm": 0.1409247634000384, + "learning_rate": 1.0831791553049142e-06, + "loss": 2.7178, + "step": 45440 + }, + { + "epoch": 2.8208454900987023, + "grad_norm": 0.131104872428366, + "learning_rate": 1.0824316201599905e-06, + "loss": 2.8021, + "step": 45441 + }, + { + "epoch": 2.8209075671984607, + "grad_norm": 0.13671597069231384, + "learning_rate": 1.0816843402298937e-06, + "loss": 2.6465, + "step": 45442 + }, + { + "epoch": 2.820969644298218, + "grad_norm": 0.1333327272582684, + "learning_rate": 1.0809373155185265e-06, + "loss": 2.6427, + "step": 45443 + }, + { + "epoch": 2.8210317213979765, + "grad_norm": 0.12879110742530034, + "learning_rate": 1.0801905460297856e-06, + "loss": 2.6903, + "step": 45444 + }, + { + "epoch": 2.821093798497734, + "grad_norm": 0.14577320397630883, + "learning_rate": 1.0794440317675626e-06, + "loss": 2.7853, + "step": 45445 + }, + { + "epoch": 2.821155875597492, + "grad_norm": 0.13200886299576373, + "learning_rate": 1.078697772735754e-06, + "loss": 2.7204, + "step": 45446 + }, + { + "epoch": 2.82121795269725, + "grad_norm": 0.13186413498726496, + "learning_rate": 1.0779517689382623e-06, + "loss": 2.6334, + "step": 45447 + }, + { + "epoch": 2.8212800297970078, + "grad_norm": 0.1321299065821999, + "learning_rate": 1.0772060203789681e-06, + "loss": 2.6157, + "step": 45448 + }, + { + "epoch": 2.8213421068967657, + "grad_norm": 0.13227117538072455, + "learning_rate": 1.0764605270617678e-06, + "loss": 2.6329, + "step": 45449 + }, + { + "epoch": 2.8214041839965236, + "grad_norm": 0.13415911931919897, + "learning_rate": 1.075715288990553e-06, + "loss": 2.7408, + "step": 45450 + }, + { + "epoch": 2.8214662610962815, + "grad_norm": 0.14412920681768326, + "learning_rate": 1.0749703061691984e-06, + "loss": 2.6906, + "step": 45451 + }, + { + "epoch": 2.8215283381960394, + "grad_norm": 0.13432867282881295, + "learning_rate": 1.074225578601612e-06, + "loss": 2.6441, + "step": 45452 + }, + { + "epoch": 2.8215904152957973, + "grad_norm": 0.13404407700307408, + "learning_rate": 1.0734811062916627e-06, + "loss": 2.7435, + "step": 45453 + }, + { + "epoch": 2.8216524923955553, + "grad_norm": 0.13056874383675576, + "learning_rate": 1.072736889243242e-06, + "loss": 2.6373, + "step": 45454 + }, + { + "epoch": 2.821714569495313, + "grad_norm": 0.13099774119744761, + "learning_rate": 1.0719929274602247e-06, + "loss": 2.7004, + "step": 45455 + }, + { + "epoch": 2.821776646595071, + "grad_norm": 0.1404278485810442, + "learning_rate": 1.071249220946502e-06, + "loss": 2.561, + "step": 45456 + }, + { + "epoch": 2.821838723694829, + "grad_norm": 0.14145036715220174, + "learning_rate": 1.0705057697059484e-06, + "loss": 2.6693, + "step": 45457 + }, + { + "epoch": 2.821900800794587, + "grad_norm": 0.130830612153053, + "learning_rate": 1.0697625737424445e-06, + "loss": 2.7088, + "step": 45458 + }, + { + "epoch": 2.821962877894345, + "grad_norm": 0.13646955258959143, + "learning_rate": 1.06901963305987e-06, + "loss": 2.6827, + "step": 45459 + }, + { + "epoch": 2.8220249549941028, + "grad_norm": 0.1458635589804551, + "learning_rate": 1.0682769476620947e-06, + "loss": 2.7065, + "step": 45460 + }, + { + "epoch": 2.8220870320938607, + "grad_norm": 0.14520245071405632, + "learning_rate": 1.0675345175530038e-06, + "loss": 2.7576, + "step": 45461 + }, + { + "epoch": 2.8221491091936186, + "grad_norm": 0.13338064938041727, + "learning_rate": 1.0667923427364556e-06, + "loss": 2.6724, + "step": 45462 + }, + { + "epoch": 2.8222111862933765, + "grad_norm": 0.13110097957268033, + "learning_rate": 1.066050423216336e-06, + "loss": 2.7119, + "step": 45463 + }, + { + "epoch": 2.822273263393134, + "grad_norm": 0.1498166003706506, + "learning_rate": 1.0653087589965082e-06, + "loss": 2.7055, + "step": 45464 + }, + { + "epoch": 2.8223353404928924, + "grad_norm": 0.1495786832099492, + "learning_rate": 1.0645673500808418e-06, + "loss": 2.6754, + "step": 45465 + }, + { + "epoch": 2.82239741759265, + "grad_norm": 0.13265685470002903, + "learning_rate": 1.063826196473211e-06, + "loss": 2.6683, + "step": 45466 + }, + { + "epoch": 2.822459494692408, + "grad_norm": 0.13724179353568955, + "learning_rate": 1.0630852981774797e-06, + "loss": 2.7657, + "step": 45467 + }, + { + "epoch": 2.8225215717921657, + "grad_norm": 0.1425933829726479, + "learning_rate": 1.0623446551975114e-06, + "loss": 2.7242, + "step": 45468 + }, + { + "epoch": 2.822583648891924, + "grad_norm": 0.1324226393438465, + "learning_rate": 1.061604267537164e-06, + "loss": 2.6989, + "step": 45469 + }, + { + "epoch": 2.8226457259916815, + "grad_norm": 0.1310293815615135, + "learning_rate": 1.060864135200318e-06, + "loss": 2.6915, + "step": 45470 + }, + { + "epoch": 2.8227078030914394, + "grad_norm": 0.13038060157195516, + "learning_rate": 1.06012425819082e-06, + "loss": 2.649, + "step": 45471 + }, + { + "epoch": 2.8227698801911973, + "grad_norm": 0.12821592093890558, + "learning_rate": 1.0593846365125336e-06, + "loss": 2.6639, + "step": 45472 + }, + { + "epoch": 2.8228319572909553, + "grad_norm": 0.13159719679020568, + "learning_rate": 1.0586452701693172e-06, + "loss": 2.7413, + "step": 45473 + }, + { + "epoch": 2.822894034390713, + "grad_norm": 0.1291125770406302, + "learning_rate": 1.0579061591650286e-06, + "loss": 2.6969, + "step": 45474 + }, + { + "epoch": 2.822956111490471, + "grad_norm": 0.13119011687939564, + "learning_rate": 1.0571673035035311e-06, + "loss": 2.735, + "step": 45475 + }, + { + "epoch": 2.823018188590229, + "grad_norm": 0.13151575697703818, + "learning_rate": 1.0564287031886666e-06, + "loss": 2.6893, + "step": 45476 + }, + { + "epoch": 2.823080265689987, + "grad_norm": 0.13747909153929833, + "learning_rate": 1.0556903582242982e-06, + "loss": 2.6291, + "step": 45477 + }, + { + "epoch": 2.823142342789745, + "grad_norm": 0.13159980820312844, + "learning_rate": 1.0549522686142733e-06, + "loss": 2.6539, + "step": 45478 + }, + { + "epoch": 2.8232044198895028, + "grad_norm": 0.15683191538934743, + "learning_rate": 1.0542144343624438e-06, + "loss": 2.8429, + "step": 45479 + }, + { + "epoch": 2.8232664969892607, + "grad_norm": 0.1449779168220038, + "learning_rate": 1.0534768554726626e-06, + "loss": 2.7482, + "step": 45480 + }, + { + "epoch": 2.8233285740890186, + "grad_norm": 0.13089777255024576, + "learning_rate": 1.052739531948771e-06, + "loss": 2.6378, + "step": 45481 + }, + { + "epoch": 2.8233906511887765, + "grad_norm": 0.16836914312963272, + "learning_rate": 1.0520024637946213e-06, + "loss": 2.6221, + "step": 45482 + }, + { + "epoch": 2.8234527282885344, + "grad_norm": 0.14063216562028572, + "learning_rate": 1.051265651014055e-06, + "loss": 2.7771, + "step": 45483 + }, + { + "epoch": 2.8235148053882924, + "grad_norm": 0.14823613814122338, + "learning_rate": 1.0505290936109248e-06, + "loss": 2.688, + "step": 45484 + }, + { + "epoch": 2.8235768824880503, + "grad_norm": 0.13017526127109716, + "learning_rate": 1.0497927915890605e-06, + "loss": 2.7255, + "step": 45485 + }, + { + "epoch": 2.823638959587808, + "grad_norm": 0.13979168860520116, + "learning_rate": 1.0490567449523147e-06, + "loss": 2.6813, + "step": 45486 + }, + { + "epoch": 2.823701036687566, + "grad_norm": 0.13265963480419998, + "learning_rate": 1.048320953704518e-06, + "loss": 2.7462, + "step": 45487 + }, + { + "epoch": 2.823763113787324, + "grad_norm": 0.13147401935507622, + "learning_rate": 1.0475854178495227e-06, + "loss": 2.7196, + "step": 45488 + }, + { + "epoch": 2.8238251908870815, + "grad_norm": 0.1466567614603543, + "learning_rate": 1.0468501373911532e-06, + "loss": 2.6874, + "step": 45489 + }, + { + "epoch": 2.82388726798684, + "grad_norm": 0.13449763174834814, + "learning_rate": 1.0461151123332514e-06, + "loss": 2.7412, + "step": 45490 + }, + { + "epoch": 2.8239493450865973, + "grad_norm": 0.13279231002157227, + "learning_rate": 1.0453803426796472e-06, + "loss": 2.6714, + "step": 45491 + }, + { + "epoch": 2.8240114221863557, + "grad_norm": 0.13657011399331873, + "learning_rate": 1.044645828434182e-06, + "loss": 2.7612, + "step": 45492 + }, + { + "epoch": 2.824073499286113, + "grad_norm": 0.1263541617305699, + "learning_rate": 1.0439115696006806e-06, + "loss": 2.5979, + "step": 45493 + }, + { + "epoch": 2.824135576385871, + "grad_norm": 0.14568016546337403, + "learning_rate": 1.0431775661829735e-06, + "loss": 2.6721, + "step": 45494 + }, + { + "epoch": 2.824197653485629, + "grad_norm": 0.14429534527143845, + "learning_rate": 1.0424438181849017e-06, + "loss": 2.8144, + "step": 45495 + }, + { + "epoch": 2.824259730585387, + "grad_norm": 0.13190146336640268, + "learning_rate": 1.041710325610279e-06, + "loss": 2.6631, + "step": 45496 + }, + { + "epoch": 2.824321807685145, + "grad_norm": 0.13059404147759832, + "learning_rate": 1.0409770884629412e-06, + "loss": 2.7099, + "step": 45497 + }, + { + "epoch": 2.8243838847849028, + "grad_norm": 0.13296958105004641, + "learning_rate": 1.0402441067467183e-06, + "loss": 2.7133, + "step": 45498 + }, + { + "epoch": 2.8244459618846607, + "grad_norm": 0.13016503726838538, + "learning_rate": 1.039511380465419e-06, + "loss": 2.7261, + "step": 45499 + }, + { + "epoch": 2.8245080389844186, + "grad_norm": 0.14438544470917936, + "learning_rate": 1.0387789096228783e-06, + "loss": 2.7329, + "step": 45500 + }, + { + "epoch": 2.8245701160841765, + "grad_norm": 0.14136920148559076, + "learning_rate": 1.0380466942229105e-06, + "loss": 2.6965, + "step": 45501 + }, + { + "epoch": 2.8246321931839344, + "grad_norm": 0.15280082188977834, + "learning_rate": 1.03731473426934e-06, + "loss": 2.6748, + "step": 45502 + }, + { + "epoch": 2.8246942702836924, + "grad_norm": 0.13056118283490756, + "learning_rate": 1.0365830297659862e-06, + "loss": 2.7362, + "step": 45503 + }, + { + "epoch": 2.8247563473834503, + "grad_norm": 0.14998260486590279, + "learning_rate": 1.0358515807166625e-06, + "loss": 2.6685, + "step": 45504 + }, + { + "epoch": 2.824818424483208, + "grad_norm": 0.13459341822499837, + "learning_rate": 1.035120387125188e-06, + "loss": 2.7861, + "step": 45505 + }, + { + "epoch": 2.824880501582966, + "grad_norm": 0.15691392629331186, + "learning_rate": 1.0343894489953764e-06, + "loss": 2.8059, + "step": 45506 + }, + { + "epoch": 2.824942578682724, + "grad_norm": 0.1310632313474625, + "learning_rate": 1.033658766331047e-06, + "loss": 2.7539, + "step": 45507 + }, + { + "epoch": 2.825004655782482, + "grad_norm": 0.13611967442080952, + "learning_rate": 1.0329283391360022e-06, + "loss": 2.6554, + "step": 45508 + }, + { + "epoch": 2.82506673288224, + "grad_norm": 0.1381716773620455, + "learning_rate": 1.0321981674140558e-06, + "loss": 2.7058, + "step": 45509 + }, + { + "epoch": 2.825128809981998, + "grad_norm": 0.13070664816632346, + "learning_rate": 1.0314682511690266e-06, + "loss": 2.6989, + "step": 45510 + }, + { + "epoch": 2.8251908870817557, + "grad_norm": 0.13534566197600512, + "learning_rate": 1.0307385904047119e-06, + "loss": 2.7564, + "step": 45511 + }, + { + "epoch": 2.825252964181513, + "grad_norm": 0.162576605319061, + "learning_rate": 1.0300091851249194e-06, + "loss": 2.729, + "step": 45512 + }, + { + "epoch": 2.8253150412812715, + "grad_norm": 0.13488236498765369, + "learning_rate": 1.0292800353334575e-06, + "loss": 2.6841, + "step": 45513 + }, + { + "epoch": 2.825377118381029, + "grad_norm": 0.12546139737125753, + "learning_rate": 1.0285511410341285e-06, + "loss": 2.668, + "step": 45514 + }, + { + "epoch": 2.8254391954807874, + "grad_norm": 0.1318684008153954, + "learning_rate": 1.0278225022307352e-06, + "loss": 2.6736, + "step": 45515 + }, + { + "epoch": 2.825501272580545, + "grad_norm": 0.13235134817918606, + "learning_rate": 1.0270941189270855e-06, + "loss": 2.6988, + "step": 45516 + }, + { + "epoch": 2.825563349680303, + "grad_norm": 0.141585921377362, + "learning_rate": 1.0263659911269707e-06, + "loss": 2.7388, + "step": 45517 + }, + { + "epoch": 2.8256254267800607, + "grad_norm": 0.14406034720378336, + "learning_rate": 1.0256381188341934e-06, + "loss": 2.6355, + "step": 45518 + }, + { + "epoch": 2.8256875038798186, + "grad_norm": 0.15768450631230488, + "learning_rate": 1.0249105020525507e-06, + "loss": 2.7196, + "step": 45519 + }, + { + "epoch": 2.8257495809795765, + "grad_norm": 0.1339480158904884, + "learning_rate": 1.0241831407858394e-06, + "loss": 2.7154, + "step": 45520 + }, + { + "epoch": 2.8258116580793344, + "grad_norm": 0.13516535425117368, + "learning_rate": 1.0234560350378563e-06, + "loss": 2.7436, + "step": 45521 + }, + { + "epoch": 2.8258737351790923, + "grad_norm": 0.13252316542317913, + "learning_rate": 1.0227291848123933e-06, + "loss": 2.8606, + "step": 45522 + }, + { + "epoch": 2.8259358122788503, + "grad_norm": 0.1444016016876763, + "learning_rate": 1.0220025901132358e-06, + "loss": 2.6387, + "step": 45523 + }, + { + "epoch": 2.825997889378608, + "grad_norm": 0.14882497658991298, + "learning_rate": 1.0212762509441865e-06, + "loss": 2.6897, + "step": 45524 + }, + { + "epoch": 2.826059966478366, + "grad_norm": 0.13696420298336903, + "learning_rate": 1.0205501673090256e-06, + "loss": 2.6961, + "step": 45525 + }, + { + "epoch": 2.826122043578124, + "grad_norm": 0.129481534677929, + "learning_rate": 1.0198243392115503e-06, + "loss": 2.644, + "step": 45526 + }, + { + "epoch": 2.826184120677882, + "grad_norm": 0.1415003381862407, + "learning_rate": 1.0190987666555297e-06, + "loss": 2.7219, + "step": 45527 + }, + { + "epoch": 2.82624619777764, + "grad_norm": 0.13859038577894014, + "learning_rate": 1.0183734496447661e-06, + "loss": 2.6567, + "step": 45528 + }, + { + "epoch": 2.8263082748773978, + "grad_norm": 0.12885083828232627, + "learning_rate": 1.0176483881830457e-06, + "loss": 2.6029, + "step": 45529 + }, + { + "epoch": 2.8263703519771557, + "grad_norm": 0.13721990694606437, + "learning_rate": 1.0169235822741375e-06, + "loss": 2.6913, + "step": 45530 + }, + { + "epoch": 2.8264324290769136, + "grad_norm": 0.1348923350092885, + "learning_rate": 1.0161990319218383e-06, + "loss": 2.767, + "step": 45531 + }, + { + "epoch": 2.8264945061766715, + "grad_norm": 0.13424596940148964, + "learning_rate": 1.0154747371299067e-06, + "loss": 2.6582, + "step": 45532 + }, + { + "epoch": 2.8265565832764294, + "grad_norm": 0.14265890968199904, + "learning_rate": 1.014750697902145e-06, + "loss": 2.6253, + "step": 45533 + }, + { + "epoch": 2.8266186603761874, + "grad_norm": 0.1335509053063209, + "learning_rate": 1.014026914242322e-06, + "loss": 2.6355, + "step": 45534 + }, + { + "epoch": 2.8266807374759453, + "grad_norm": 0.12814231121766467, + "learning_rate": 1.0133033861542073e-06, + "loss": 2.622, + "step": 45535 + }, + { + "epoch": 2.826742814575703, + "grad_norm": 0.14543815840181532, + "learning_rate": 1.0125801136415868e-06, + "loss": 2.6828, + "step": 45536 + }, + { + "epoch": 2.8268048916754607, + "grad_norm": 0.12923590816021457, + "learning_rate": 1.0118570967082187e-06, + "loss": 2.6784, + "step": 45537 + }, + { + "epoch": 2.826866968775219, + "grad_norm": 0.14105839305311027, + "learning_rate": 1.0111343353578884e-06, + "loss": 2.6313, + "step": 45538 + }, + { + "epoch": 2.8269290458749765, + "grad_norm": 0.15928305446694577, + "learning_rate": 1.0104118295943655e-06, + "loss": 2.7548, + "step": 45539 + }, + { + "epoch": 2.826991122974735, + "grad_norm": 0.14754323368170424, + "learning_rate": 1.0096895794214133e-06, + "loss": 2.6625, + "step": 45540 + }, + { + "epoch": 2.8270532000744923, + "grad_norm": 0.13736969750447836, + "learning_rate": 1.0089675848428015e-06, + "loss": 2.685, + "step": 45541 + }, + { + "epoch": 2.8271152771742503, + "grad_norm": 0.13512698391385178, + "learning_rate": 1.0082458458623044e-06, + "loss": 2.7072, + "step": 45542 + }, + { + "epoch": 2.827177354274008, + "grad_norm": 0.12924493743418888, + "learning_rate": 1.0075243624836805e-06, + "loss": 2.7181, + "step": 45543 + }, + { + "epoch": 2.827239431373766, + "grad_norm": 0.12850213284383025, + "learning_rate": 1.0068031347106932e-06, + "loss": 2.6803, + "step": 45544 + }, + { + "epoch": 2.827301508473524, + "grad_norm": 0.14177154335761644, + "learning_rate": 1.0060821625471063e-06, + "loss": 2.6752, + "step": 45545 + }, + { + "epoch": 2.827363585573282, + "grad_norm": 0.1512511516164649, + "learning_rate": 1.0053614459966888e-06, + "loss": 2.7299, + "step": 45546 + }, + { + "epoch": 2.82742566267304, + "grad_norm": 0.13661450089968327, + "learning_rate": 1.0046409850631878e-06, + "loss": 2.5768, + "step": 45547 + }, + { + "epoch": 2.8274877397727978, + "grad_norm": 0.1355989578730104, + "learning_rate": 1.0039207797503724e-06, + "loss": 2.7213, + "step": 45548 + }, + { + "epoch": 2.8275498168725557, + "grad_norm": 0.13147227675320847, + "learning_rate": 1.0032008300619954e-06, + "loss": 2.663, + "step": 45549 + }, + { + "epoch": 2.8276118939723136, + "grad_norm": 0.13132749636770244, + "learning_rate": 1.0024811360018149e-06, + "loss": 2.7274, + "step": 45550 + }, + { + "epoch": 2.8276739710720715, + "grad_norm": 0.13731763325595184, + "learning_rate": 1.0017616975735833e-06, + "loss": 2.7219, + "step": 45551 + }, + { + "epoch": 2.8277360481718294, + "grad_norm": 0.1306444438466963, + "learning_rate": 1.0010425147810587e-06, + "loss": 2.7052, + "step": 45552 + }, + { + "epoch": 2.8277981252715874, + "grad_norm": 0.1330490740103218, + "learning_rate": 1.0003235876279882e-06, + "loss": 2.8014, + "step": 45553 + }, + { + "epoch": 2.8278602023713453, + "grad_norm": 0.14209620143654147, + "learning_rate": 9.996049161181187e-07, + "loss": 2.7448, + "step": 45554 + }, + { + "epoch": 2.827922279471103, + "grad_norm": 0.1420222642781798, + "learning_rate": 9.988865002552138e-07, + "loss": 2.6504, + "step": 45555 + }, + { + "epoch": 2.827984356570861, + "grad_norm": 0.14172492737224007, + "learning_rate": 9.981683400430097e-07, + "loss": 2.6756, + "step": 45556 + }, + { + "epoch": 2.828046433670619, + "grad_norm": 0.13669669790896355, + "learning_rate": 9.974504354852642e-07, + "loss": 2.7609, + "step": 45557 + }, + { + "epoch": 2.828108510770377, + "grad_norm": 0.12914686432551165, + "learning_rate": 9.967327865857079e-07, + "loss": 2.6835, + "step": 45558 + }, + { + "epoch": 2.828170587870135, + "grad_norm": 0.13590097402522874, + "learning_rate": 9.960153933480932e-07, + "loss": 2.7451, + "step": 45559 + }, + { + "epoch": 2.8282326649698923, + "grad_norm": 0.13996743401396136, + "learning_rate": 9.95298255776156e-07, + "loss": 2.6861, + "step": 45560 + }, + { + "epoch": 2.8282947420696507, + "grad_norm": 0.12769415335889078, + "learning_rate": 9.945813738736542e-07, + "loss": 2.6232, + "step": 45561 + }, + { + "epoch": 2.828356819169408, + "grad_norm": 0.1307598418712885, + "learning_rate": 9.938647476443185e-07, + "loss": 2.7057, + "step": 45562 + }, + { + "epoch": 2.8284188962691665, + "grad_norm": 0.1302451765069714, + "learning_rate": 9.931483770918848e-07, + "loss": 2.6615, + "step": 45563 + }, + { + "epoch": 2.828480973368924, + "grad_norm": 0.14174819457453472, + "learning_rate": 9.924322622200888e-07, + "loss": 2.7226, + "step": 45564 + }, + { + "epoch": 2.8285430504686824, + "grad_norm": 0.12916108431468976, + "learning_rate": 9.91716403032672e-07, + "loss": 2.657, + "step": 45565 + }, + { + "epoch": 2.82860512756844, + "grad_norm": 0.13247366064238303, + "learning_rate": 9.910007995333702e-07, + "loss": 2.6626, + "step": 45566 + }, + { + "epoch": 2.8286672046681978, + "grad_norm": 0.14409916961329405, + "learning_rate": 9.902854517259141e-07, + "loss": 2.7136, + "step": 45567 + }, + { + "epoch": 2.8287292817679557, + "grad_norm": 0.1394957795077231, + "learning_rate": 9.895703596140338e-07, + "loss": 2.7219, + "step": 45568 + }, + { + "epoch": 2.8287913588677136, + "grad_norm": 0.1464927035189334, + "learning_rate": 9.888555232014652e-07, + "loss": 2.7147, + "step": 45569 + }, + { + "epoch": 2.8288534359674715, + "grad_norm": 0.1460262573620563, + "learning_rate": 9.881409424919385e-07, + "loss": 2.5708, + "step": 45570 + }, + { + "epoch": 2.8289155130672294, + "grad_norm": 0.1324770210578974, + "learning_rate": 9.874266174891733e-07, + "loss": 2.707, + "step": 45571 + }, + { + "epoch": 2.8289775901669874, + "grad_norm": 0.15149275585872746, + "learning_rate": 9.867125481969053e-07, + "loss": 2.7328, + "step": 45572 + }, + { + "epoch": 2.8290396672667453, + "grad_norm": 0.13303440153122534, + "learning_rate": 9.859987346188537e-07, + "loss": 2.6685, + "step": 45573 + }, + { + "epoch": 2.829101744366503, + "grad_norm": 0.1310683119694466, + "learning_rate": 9.85285176758749e-07, + "loss": 2.687, + "step": 45574 + }, + { + "epoch": 2.829163821466261, + "grad_norm": 0.15298610148538305, + "learning_rate": 9.845718746203048e-07, + "loss": 2.6822, + "step": 45575 + }, + { + "epoch": 2.829225898566019, + "grad_norm": 0.14563860551366878, + "learning_rate": 9.838588282072513e-07, + "loss": 2.7835, + "step": 45576 + }, + { + "epoch": 2.829287975665777, + "grad_norm": 0.13385692323327567, + "learning_rate": 9.831460375233027e-07, + "loss": 2.798, + "step": 45577 + }, + { + "epoch": 2.829350052765535, + "grad_norm": 0.1433731215432091, + "learning_rate": 9.824335025721831e-07, + "loss": 2.7239, + "step": 45578 + }, + { + "epoch": 2.829412129865293, + "grad_norm": 0.13309735016612267, + "learning_rate": 9.817212233576068e-07, + "loss": 2.6687, + "step": 45579 + }, + { + "epoch": 2.8294742069650507, + "grad_norm": 0.13151232242777097, + "learning_rate": 9.810091998832925e-07, + "loss": 2.702, + "step": 45580 + }, + { + "epoch": 2.8295362840648086, + "grad_norm": 0.13228291222248686, + "learning_rate": 9.80297432152949e-07, + "loss": 2.6886, + "step": 45581 + }, + { + "epoch": 2.8295983611645665, + "grad_norm": 0.14041866954295693, + "learning_rate": 9.79585920170295e-07, + "loss": 2.686, + "step": 45582 + }, + { + "epoch": 2.8296604382643245, + "grad_norm": 0.13309669941529204, + "learning_rate": 9.788746639390444e-07, + "loss": 2.7642, + "step": 45583 + }, + { + "epoch": 2.8297225153640824, + "grad_norm": 0.1272218647488639, + "learning_rate": 9.781636634629e-07, + "loss": 2.6143, + "step": 45584 + }, + { + "epoch": 2.82978459246384, + "grad_norm": 0.1442921180953035, + "learning_rate": 9.774529187455805e-07, + "loss": 2.7496, + "step": 45585 + }, + { + "epoch": 2.829846669563598, + "grad_norm": 0.13497634667235056, + "learning_rate": 9.767424297907834e-07, + "loss": 2.7054, + "step": 45586 + }, + { + "epoch": 2.8299087466633557, + "grad_norm": 0.13852930127319363, + "learning_rate": 9.760321966022279e-07, + "loss": 2.6305, + "step": 45587 + }, + { + "epoch": 2.829970823763114, + "grad_norm": 0.13512989928417946, + "learning_rate": 9.753222191836164e-07, + "loss": 2.6928, + "step": 45588 + }, + { + "epoch": 2.8300329008628715, + "grad_norm": 0.14082428930542767, + "learning_rate": 9.746124975386461e-07, + "loss": 2.6928, + "step": 45589 + }, + { + "epoch": 2.8300949779626294, + "grad_norm": 0.1311639117424594, + "learning_rate": 9.739030316710195e-07, + "loss": 2.7832, + "step": 45590 + }, + { + "epoch": 2.8301570550623874, + "grad_norm": 0.1371772163551333, + "learning_rate": 9.731938215844449e-07, + "loss": 2.685, + "step": 45591 + }, + { + "epoch": 2.8302191321621453, + "grad_norm": 0.13908697512978352, + "learning_rate": 9.724848672826247e-07, + "loss": 2.6592, + "step": 45592 + }, + { + "epoch": 2.830281209261903, + "grad_norm": 0.1486863948268634, + "learning_rate": 9.71776168769245e-07, + "loss": 2.6886, + "step": 45593 + }, + { + "epoch": 2.830343286361661, + "grad_norm": 0.1327951784619133, + "learning_rate": 9.710677260480138e-07, + "loss": 2.652, + "step": 45594 + }, + { + "epoch": 2.830405363461419, + "grad_norm": 0.12852097503054277, + "learning_rate": 9.703595391226228e-07, + "loss": 2.6528, + "step": 45595 + }, + { + "epoch": 2.830467440561177, + "grad_norm": 0.12917946991096121, + "learning_rate": 9.696516079967743e-07, + "loss": 2.7728, + "step": 45596 + }, + { + "epoch": 2.830529517660935, + "grad_norm": 0.13698627999792193, + "learning_rate": 9.689439326741544e-07, + "loss": 2.71, + "step": 45597 + }, + { + "epoch": 2.8305915947606928, + "grad_norm": 0.12847409635399726, + "learning_rate": 9.682365131584548e-07, + "loss": 2.6684, + "step": 45598 + }, + { + "epoch": 2.8306536718604507, + "grad_norm": 0.14959141546276147, + "learning_rate": 9.675293494533722e-07, + "loss": 2.7969, + "step": 45599 + }, + { + "epoch": 2.8307157489602086, + "grad_norm": 0.1319600192471619, + "learning_rate": 9.668224415625815e-07, + "loss": 2.6203, + "step": 45600 + }, + { + "epoch": 2.8307778260599665, + "grad_norm": 0.13474138705350436, + "learning_rate": 9.66115789489791e-07, + "loss": 2.7238, + "step": 45601 + }, + { + "epoch": 2.8308399031597244, + "grad_norm": 0.12946428542066676, + "learning_rate": 9.654093932386754e-07, + "loss": 2.6307, + "step": 45602 + }, + { + "epoch": 2.8309019802594824, + "grad_norm": 0.13333780522099228, + "learning_rate": 9.647032528129263e-07, + "loss": 2.6893, + "step": 45603 + }, + { + "epoch": 2.8309640573592403, + "grad_norm": 0.14198221148008222, + "learning_rate": 9.63997368216213e-07, + "loss": 2.7989, + "step": 45604 + }, + { + "epoch": 2.831026134458998, + "grad_norm": 0.13156793692991162, + "learning_rate": 9.632917394522378e-07, + "loss": 2.6844, + "step": 45605 + }, + { + "epoch": 2.831088211558756, + "grad_norm": 0.14235844083028787, + "learning_rate": 9.62586366524676e-07, + "loss": 2.7444, + "step": 45606 + }, + { + "epoch": 2.831150288658514, + "grad_norm": 0.14288384177958172, + "learning_rate": 9.61881249437202e-07, + "loss": 2.7308, + "step": 45607 + }, + { + "epoch": 2.8312123657582715, + "grad_norm": 0.12904736713011203, + "learning_rate": 9.611763881934965e-07, + "loss": 2.7296, + "step": 45608 + }, + { + "epoch": 2.83127444285803, + "grad_norm": 0.1392543473956215, + "learning_rate": 9.604717827972398e-07, + "loss": 2.7039, + "step": 45609 + }, + { + "epoch": 2.8313365199577873, + "grad_norm": 0.1378827273779081, + "learning_rate": 9.597674332521067e-07, + "loss": 2.708, + "step": 45610 + }, + { + "epoch": 2.8313985970575457, + "grad_norm": 0.15233143120868653, + "learning_rate": 9.59063339561772e-07, + "loss": 2.6845, + "step": 45611 + }, + { + "epoch": 2.831460674157303, + "grad_norm": 0.13892371855231098, + "learning_rate": 9.583595017299052e-07, + "loss": 2.6286, + "step": 45612 + }, + { + "epoch": 2.8315227512570615, + "grad_norm": 0.1312550569309425, + "learning_rate": 9.576559197601865e-07, + "loss": 2.6751, + "step": 45613 + }, + { + "epoch": 2.831584828356819, + "grad_norm": 0.13012874237032823, + "learning_rate": 9.569525936562796e-07, + "loss": 2.6526, + "step": 45614 + }, + { + "epoch": 2.831646905456577, + "grad_norm": 0.1343720466267036, + "learning_rate": 9.562495234218593e-07, + "loss": 2.7515, + "step": 45615 + }, + { + "epoch": 2.831708982556335, + "grad_norm": 0.13761177640309386, + "learning_rate": 9.555467090605897e-07, + "loss": 2.6323, + "step": 45616 + }, + { + "epoch": 2.8317710596560928, + "grad_norm": 0.1406118466265685, + "learning_rate": 9.548441505761395e-07, + "loss": 2.8082, + "step": 45617 + }, + { + "epoch": 2.8318331367558507, + "grad_norm": 0.1327230348508378, + "learning_rate": 9.541418479721731e-07, + "loss": 2.6831, + "step": 45618 + }, + { + "epoch": 2.8318952138556086, + "grad_norm": 0.1361696182584857, + "learning_rate": 9.534398012523538e-07, + "loss": 2.7371, + "step": 45619 + }, + { + "epoch": 2.8319572909553665, + "grad_norm": 0.14060513038133532, + "learning_rate": 9.527380104203454e-07, + "loss": 2.6604, + "step": 45620 + }, + { + "epoch": 2.8320193680551244, + "grad_norm": 0.12823219786773957, + "learning_rate": 9.520364754798117e-07, + "loss": 2.6709, + "step": 45621 + }, + { + "epoch": 2.8320814451548824, + "grad_norm": 0.13057255983652608, + "learning_rate": 9.513351964344108e-07, + "loss": 2.658, + "step": 45622 + }, + { + "epoch": 2.8321435222546403, + "grad_norm": 0.12779388062438765, + "learning_rate": 9.50634173287801e-07, + "loss": 2.7098, + "step": 45623 + }, + { + "epoch": 2.832205599354398, + "grad_norm": 0.13363709137106297, + "learning_rate": 9.499334060436349e-07, + "loss": 2.717, + "step": 45624 + }, + { + "epoch": 2.832267676454156, + "grad_norm": 0.13236347897016537, + "learning_rate": 9.492328947055818e-07, + "loss": 2.7697, + "step": 45625 + }, + { + "epoch": 2.832329753553914, + "grad_norm": 0.14696778324197757, + "learning_rate": 9.485326392772775e-07, + "loss": 2.7184, + "step": 45626 + }, + { + "epoch": 2.832391830653672, + "grad_norm": 0.14371070039547612, + "learning_rate": 9.478326397623916e-07, + "loss": 2.7853, + "step": 45627 + }, + { + "epoch": 2.83245390775343, + "grad_norm": 0.14517345672433868, + "learning_rate": 9.471328961645709e-07, + "loss": 2.7097, + "step": 45628 + }, + { + "epoch": 2.832515984853188, + "grad_norm": 0.13952359708237222, + "learning_rate": 9.464334084874682e-07, + "loss": 2.7139, + "step": 45629 + }, + { + "epoch": 2.8325780619529457, + "grad_norm": 0.13385629009041927, + "learning_rate": 9.457341767347306e-07, + "loss": 2.7212, + "step": 45630 + }, + { + "epoch": 2.8326401390527036, + "grad_norm": 0.13464062186926548, + "learning_rate": 9.450352009099994e-07, + "loss": 2.7106, + "step": 45631 + }, + { + "epoch": 2.8327022161524615, + "grad_norm": 0.13859304013978985, + "learning_rate": 9.44336481016933e-07, + "loss": 2.6926, + "step": 45632 + }, + { + "epoch": 2.832764293252219, + "grad_norm": 0.13355660256694468, + "learning_rate": 9.43638017059173e-07, + "loss": 2.6993, + "step": 45633 + }, + { + "epoch": 2.8328263703519774, + "grad_norm": 0.1385437951486135, + "learning_rate": 9.429398090403607e-07, + "loss": 2.7326, + "step": 45634 + }, + { + "epoch": 2.832888447451735, + "grad_norm": 0.12911864324969952, + "learning_rate": 9.422418569641434e-07, + "loss": 2.6961, + "step": 45635 + }, + { + "epoch": 2.832950524551493, + "grad_norm": 0.1501049883806961, + "learning_rate": 9.415441608341569e-07, + "loss": 2.7312, + "step": 45636 + }, + { + "epoch": 2.8330126016512507, + "grad_norm": 0.14378955986063363, + "learning_rate": 9.408467206540428e-07, + "loss": 2.737, + "step": 45637 + }, + { + "epoch": 2.8330746787510086, + "grad_norm": 0.14524460354840676, + "learning_rate": 9.401495364274426e-07, + "loss": 2.7263, + "step": 45638 + }, + { + "epoch": 2.8331367558507665, + "grad_norm": 0.14507954572640344, + "learning_rate": 9.394526081579924e-07, + "loss": 2.7305, + "step": 45639 + }, + { + "epoch": 2.8331988329505244, + "grad_norm": 0.15222410247514337, + "learning_rate": 9.387559358493225e-07, + "loss": 2.7184, + "step": 45640 + }, + { + "epoch": 2.8332609100502824, + "grad_norm": 0.13663277647663702, + "learning_rate": 9.380595195050801e-07, + "loss": 2.7068, + "step": 45641 + }, + { + "epoch": 2.8333229871500403, + "grad_norm": 0.14626489090712355, + "learning_rate": 9.373633591288899e-07, + "loss": 2.7347, + "step": 45642 + }, + { + "epoch": 2.833385064249798, + "grad_norm": 0.1543672859720182, + "learning_rate": 9.366674547243825e-07, + "loss": 2.6719, + "step": 45643 + }, + { + "epoch": 2.833447141349556, + "grad_norm": 0.13007506862406154, + "learning_rate": 9.359718062951939e-07, + "loss": 2.6555, + "step": 45644 + }, + { + "epoch": 2.833509218449314, + "grad_norm": 0.13767256447395682, + "learning_rate": 9.352764138449488e-07, + "loss": 2.7604, + "step": 45645 + }, + { + "epoch": 2.833571295549072, + "grad_norm": 0.15521015821147893, + "learning_rate": 9.345812773772778e-07, + "loss": 2.6632, + "step": 45646 + }, + { + "epoch": 2.83363337264883, + "grad_norm": 0.13114901415588526, + "learning_rate": 9.338863968958111e-07, + "loss": 2.7719, + "step": 45647 + }, + { + "epoch": 2.833695449748588, + "grad_norm": 0.14575378697965288, + "learning_rate": 9.331917724041684e-07, + "loss": 2.6659, + "step": 45648 + }, + { + "epoch": 2.8337575268483457, + "grad_norm": 0.1393877544041213, + "learning_rate": 9.324974039059686e-07, + "loss": 2.6621, + "step": 45649 + }, + { + "epoch": 2.8338196039481036, + "grad_norm": 0.12817860185725263, + "learning_rate": 9.318032914048536e-07, + "loss": 2.6609, + "step": 45650 + }, + { + "epoch": 2.8338816810478615, + "grad_norm": 0.1517637043009297, + "learning_rate": 9.311094349044258e-07, + "loss": 2.7968, + "step": 45651 + }, + { + "epoch": 2.8339437581476195, + "grad_norm": 0.12962236319796505, + "learning_rate": 9.304158344083103e-07, + "loss": 2.8047, + "step": 45652 + }, + { + "epoch": 2.8340058352473774, + "grad_norm": 0.14163775803557035, + "learning_rate": 9.297224899201262e-07, + "loss": 2.7116, + "step": 45653 + }, + { + "epoch": 2.8340679123471353, + "grad_norm": 0.1290157387769802, + "learning_rate": 9.290294014434986e-07, + "loss": 2.7339, + "step": 45654 + }, + { + "epoch": 2.834129989446893, + "grad_norm": 0.14547868731572386, + "learning_rate": 9.283365689820356e-07, + "loss": 2.805, + "step": 45655 + }, + { + "epoch": 2.8341920665466507, + "grad_norm": 0.13966112812773063, + "learning_rate": 9.276439925393509e-07, + "loss": 2.6728, + "step": 45656 + }, + { + "epoch": 2.834254143646409, + "grad_norm": 0.12895746369702898, + "learning_rate": 9.26951672119064e-07, + "loss": 2.6361, + "step": 45657 + }, + { + "epoch": 2.8343162207461665, + "grad_norm": 0.13421290845671144, + "learning_rate": 9.262596077247777e-07, + "loss": 2.685, + "step": 45658 + }, + { + "epoch": 2.834378297845925, + "grad_norm": 0.13250605206118293, + "learning_rate": 9.255677993601053e-07, + "loss": 2.7109, + "step": 45659 + }, + { + "epoch": 2.8344403749456824, + "grad_norm": 0.13912721210496493, + "learning_rate": 9.248762470286665e-07, + "loss": 2.713, + "step": 45660 + }, + { + "epoch": 2.8345024520454407, + "grad_norm": 0.13434698899653294, + "learning_rate": 9.241849507340639e-07, + "loss": 2.6974, + "step": 45661 + }, + { + "epoch": 2.834564529145198, + "grad_norm": 0.1294856416477593, + "learning_rate": 9.234939104798945e-07, + "loss": 2.7307, + "step": 45662 + }, + { + "epoch": 2.834626606244956, + "grad_norm": 0.13009817868093326, + "learning_rate": 9.228031262697723e-07, + "loss": 2.7224, + "step": 45663 + }, + { + "epoch": 2.834688683344714, + "grad_norm": 0.1445624665752163, + "learning_rate": 9.221125981073053e-07, + "loss": 2.6864, + "step": 45664 + }, + { + "epoch": 2.834750760444472, + "grad_norm": 0.13625152999342616, + "learning_rate": 9.214223259960852e-07, + "loss": 2.6646, + "step": 45665 + }, + { + "epoch": 2.83481283754423, + "grad_norm": 0.14024991770179224, + "learning_rate": 9.207323099397258e-07, + "loss": 2.7349, + "step": 45666 + }, + { + "epoch": 2.8348749146439878, + "grad_norm": 0.1309986725327704, + "learning_rate": 9.20042549941813e-07, + "loss": 2.7599, + "step": 45667 + }, + { + "epoch": 2.8349369917437457, + "grad_norm": 0.13229603484844257, + "learning_rate": 9.193530460059551e-07, + "loss": 2.6878, + "step": 45668 + }, + { + "epoch": 2.8349990688435036, + "grad_norm": 0.14718053697002456, + "learning_rate": 9.186637981357494e-07, + "loss": 2.7302, + "step": 45669 + }, + { + "epoch": 2.8350611459432615, + "grad_norm": 0.1409193772465667, + "learning_rate": 9.17974806334787e-07, + "loss": 2.7159, + "step": 45670 + }, + { + "epoch": 2.8351232230430194, + "grad_norm": 0.15653894648914365, + "learning_rate": 9.172860706066655e-07, + "loss": 2.7712, + "step": 45671 + }, + { + "epoch": 2.8351853001427774, + "grad_norm": 0.13389585924284328, + "learning_rate": 9.165975909549763e-07, + "loss": 2.6455, + "step": 45672 + }, + { + "epoch": 2.8352473772425353, + "grad_norm": 0.1334312345588286, + "learning_rate": 9.159093673833164e-07, + "loss": 2.7794, + "step": 45673 + }, + { + "epoch": 2.835309454342293, + "grad_norm": 0.13082069565496776, + "learning_rate": 9.152213998952718e-07, + "loss": 2.7131, + "step": 45674 + }, + { + "epoch": 2.835371531442051, + "grad_norm": 0.13261754790132116, + "learning_rate": 9.145336884944344e-07, + "loss": 2.7647, + "step": 45675 + }, + { + "epoch": 2.835433608541809, + "grad_norm": 0.13514352418822187, + "learning_rate": 9.138462331843844e-07, + "loss": 2.6449, + "step": 45676 + }, + { + "epoch": 2.835495685641567, + "grad_norm": 0.13006986328327877, + "learning_rate": 9.131590339687246e-07, + "loss": 2.6245, + "step": 45677 + }, + { + "epoch": 2.835557762741325, + "grad_norm": 0.12944267385006994, + "learning_rate": 9.124720908510243e-07, + "loss": 2.6481, + "step": 45678 + }, + { + "epoch": 2.835619839841083, + "grad_norm": 0.13113820560059325, + "learning_rate": 9.117854038348749e-07, + "loss": 2.716, + "step": 45679 + }, + { + "epoch": 2.8356819169408407, + "grad_norm": 0.13611513129717834, + "learning_rate": 9.110989729238517e-07, + "loss": 2.76, + "step": 45680 + }, + { + "epoch": 2.835743994040598, + "grad_norm": 0.1310433262023499, + "learning_rate": 9.104127981215516e-07, + "loss": 2.741, + "step": 45681 + }, + { + "epoch": 2.8358060711403565, + "grad_norm": 0.14169072599612909, + "learning_rate": 9.097268794315383e-07, + "loss": 2.5837, + "step": 45682 + }, + { + "epoch": 2.835868148240114, + "grad_norm": 0.13320779176966277, + "learning_rate": 9.090412168574036e-07, + "loss": 2.6744, + "step": 45683 + }, + { + "epoch": 2.8359302253398724, + "grad_norm": 0.1315727998684026, + "learning_rate": 9.083558104027112e-07, + "loss": 2.6437, + "step": 45684 + }, + { + "epoch": 2.83599230243963, + "grad_norm": 0.12813603888618158, + "learning_rate": 9.076706600710416e-07, + "loss": 2.662, + "step": 45685 + }, + { + "epoch": 2.8360543795393878, + "grad_norm": 0.12954574242748856, + "learning_rate": 9.069857658659808e-07, + "loss": 2.7667, + "step": 45686 + }, + { + "epoch": 2.8361164566391457, + "grad_norm": 0.13893402197444685, + "learning_rate": 9.063011277910871e-07, + "loss": 2.7305, + "step": 45687 + }, + { + "epoch": 2.8361785337389036, + "grad_norm": 0.1394204967058288, + "learning_rate": 9.05616745849941e-07, + "loss": 2.7077, + "step": 45688 + }, + { + "epoch": 2.8362406108386615, + "grad_norm": 0.13443278938572675, + "learning_rate": 9.049326200461061e-07, + "loss": 2.7281, + "step": 45689 + }, + { + "epoch": 2.8363026879384194, + "grad_norm": 0.15760055038209606, + "learning_rate": 9.042487503831632e-07, + "loss": 2.7368, + "step": 45690 + }, + { + "epoch": 2.8363647650381774, + "grad_norm": 0.13222271027581226, + "learning_rate": 9.035651368646648e-07, + "loss": 2.7515, + "step": 45691 + }, + { + "epoch": 2.8364268421379353, + "grad_norm": 0.13096881674034622, + "learning_rate": 9.028817794941913e-07, + "loss": 2.7158, + "step": 45692 + }, + { + "epoch": 2.836488919237693, + "grad_norm": 0.14184077873151893, + "learning_rate": 9.021986782753012e-07, + "loss": 2.7189, + "step": 45693 + }, + { + "epoch": 2.836550996337451, + "grad_norm": 0.1447883587283252, + "learning_rate": 9.01515833211558e-07, + "loss": 2.5902, + "step": 45694 + }, + { + "epoch": 2.836613073437209, + "grad_norm": 0.13735358804049247, + "learning_rate": 9.008332443065259e-07, + "loss": 2.6237, + "step": 45695 + }, + { + "epoch": 2.836675150536967, + "grad_norm": 0.14691821385842113, + "learning_rate": 9.001509115637685e-07, + "loss": 2.7175, + "step": 45696 + }, + { + "epoch": 2.836737227636725, + "grad_norm": 0.13237971729116904, + "learning_rate": 8.994688349868385e-07, + "loss": 2.6686, + "step": 45697 + }, + { + "epoch": 2.836799304736483, + "grad_norm": 0.1374986678297359, + "learning_rate": 8.987870145793054e-07, + "loss": 2.7704, + "step": 45698 + }, + { + "epoch": 2.8368613818362407, + "grad_norm": 0.14138417493163, + "learning_rate": 8.981054503447162e-07, + "loss": 2.7403, + "step": 45699 + }, + { + "epoch": 2.8369234589359986, + "grad_norm": 0.1415242543145619, + "learning_rate": 8.974241422866292e-07, + "loss": 2.716, + "step": 45700 + }, + { + "epoch": 2.8369855360357565, + "grad_norm": 0.13135020881072815, + "learning_rate": 8.967430904086027e-07, + "loss": 2.6182, + "step": 45701 + }, + { + "epoch": 2.8370476131355145, + "grad_norm": 0.13914104129615465, + "learning_rate": 8.960622947141895e-07, + "loss": 2.7506, + "step": 45702 + }, + { + "epoch": 2.8371096902352724, + "grad_norm": 0.1324857450860461, + "learning_rate": 8.953817552069365e-07, + "loss": 2.6049, + "step": 45703 + }, + { + "epoch": 2.83717176733503, + "grad_norm": 0.14754627613130705, + "learning_rate": 8.947014718904023e-07, + "loss": 2.598, + "step": 45704 + }, + { + "epoch": 2.837233844434788, + "grad_norm": 0.1328837610251604, + "learning_rate": 8.940214447681283e-07, + "loss": 2.6516, + "step": 45705 + }, + { + "epoch": 2.8372959215345457, + "grad_norm": 0.15079532394544318, + "learning_rate": 8.933416738436673e-07, + "loss": 2.7318, + "step": 45706 + }, + { + "epoch": 2.837357998634304, + "grad_norm": 0.1340543386335984, + "learning_rate": 8.926621591205608e-07, + "loss": 2.7231, + "step": 45707 + }, + { + "epoch": 2.8374200757340615, + "grad_norm": 0.13205959986306462, + "learning_rate": 8.919829006023562e-07, + "loss": 2.7269, + "step": 45708 + }, + { + "epoch": 2.8374821528338194, + "grad_norm": 0.14412738460113134, + "learning_rate": 8.913038982926003e-07, + "loss": 2.7381, + "step": 45709 + }, + { + "epoch": 2.8375442299335774, + "grad_norm": 0.1323139003657933, + "learning_rate": 8.906251521948294e-07, + "loss": 2.7116, + "step": 45710 + }, + { + "epoch": 2.8376063070333353, + "grad_norm": 0.143055931723382, + "learning_rate": 8.899466623125963e-07, + "loss": 2.6615, + "step": 45711 + }, + { + "epoch": 2.837668384133093, + "grad_norm": 0.13328976987415486, + "learning_rate": 8.892684286494257e-07, + "loss": 2.6327, + "step": 45712 + }, + { + "epoch": 2.837730461232851, + "grad_norm": 0.13820069817136882, + "learning_rate": 8.885904512088705e-07, + "loss": 2.6823, + "step": 45713 + }, + { + "epoch": 2.837792538332609, + "grad_norm": 0.13014088711739097, + "learning_rate": 8.879127299944557e-07, + "loss": 2.677, + "step": 45714 + }, + { + "epoch": 2.837854615432367, + "grad_norm": 0.1324468023756074, + "learning_rate": 8.872352650097283e-07, + "loss": 2.6558, + "step": 45715 + }, + { + "epoch": 2.837916692532125, + "grad_norm": 0.14506385588227988, + "learning_rate": 8.865580562582076e-07, + "loss": 2.75, + "step": 45716 + }, + { + "epoch": 2.837978769631883, + "grad_norm": 0.14380260391329946, + "learning_rate": 8.858811037434467e-07, + "loss": 2.6028, + "step": 45717 + }, + { + "epoch": 2.8380408467316407, + "grad_norm": 0.13167756069008205, + "learning_rate": 8.852044074689647e-07, + "loss": 2.7545, + "step": 45718 + }, + { + "epoch": 2.8381029238313986, + "grad_norm": 0.13026907140000987, + "learning_rate": 8.845279674382922e-07, + "loss": 2.7454, + "step": 45719 + }, + { + "epoch": 2.8381650009311565, + "grad_norm": 0.131213683542247, + "learning_rate": 8.838517836549598e-07, + "loss": 2.6569, + "step": 45720 + }, + { + "epoch": 2.8382270780309145, + "grad_norm": 0.14394226433309973, + "learning_rate": 8.831758561224923e-07, + "loss": 2.7336, + "step": 45721 + }, + { + "epoch": 2.8382891551306724, + "grad_norm": 0.13057349420451966, + "learning_rate": 8.825001848444258e-07, + "loss": 2.606, + "step": 45722 + }, + { + "epoch": 2.8383512322304303, + "grad_norm": 0.1289643460185712, + "learning_rate": 8.818247698242799e-07, + "loss": 2.6554, + "step": 45723 + }, + { + "epoch": 2.838413309330188, + "grad_norm": 0.13242413735496725, + "learning_rate": 8.811496110655737e-07, + "loss": 2.7627, + "step": 45724 + }, + { + "epoch": 2.838475386429946, + "grad_norm": 0.1294169136698188, + "learning_rate": 8.804747085718379e-07, + "loss": 2.731, + "step": 45725 + }, + { + "epoch": 2.838537463529704, + "grad_norm": 0.13455121634295736, + "learning_rate": 8.798000623465863e-07, + "loss": 2.7334, + "step": 45726 + }, + { + "epoch": 2.8385995406294615, + "grad_norm": 0.13305607366878563, + "learning_rate": 8.791256723933438e-07, + "loss": 2.8187, + "step": 45727 + }, + { + "epoch": 2.83866161772922, + "grad_norm": 0.1506602421860481, + "learning_rate": 8.7845153871563e-07, + "loss": 2.7059, + "step": 45728 + }, + { + "epoch": 2.8387236948289774, + "grad_norm": 0.12861831614632796, + "learning_rate": 8.777776613169586e-07, + "loss": 2.6353, + "step": 45729 + }, + { + "epoch": 2.8387857719287357, + "grad_norm": 0.14447001500311288, + "learning_rate": 8.77104040200838e-07, + "loss": 2.7408, + "step": 45730 + }, + { + "epoch": 2.838847849028493, + "grad_norm": 0.14142252686118195, + "learning_rate": 8.764306753707985e-07, + "loss": 2.7216, + "step": 45731 + }, + { + "epoch": 2.8389099261282515, + "grad_norm": 0.13537975336968774, + "learning_rate": 8.757575668303431e-07, + "loss": 2.6941, + "step": 45732 + }, + { + "epoch": 2.838972003228009, + "grad_norm": 0.13255298026505283, + "learning_rate": 8.75084714582991e-07, + "loss": 2.6896, + "step": 45733 + }, + { + "epoch": 2.839034080327767, + "grad_norm": 0.14971512027991335, + "learning_rate": 8.744121186322396e-07, + "loss": 2.7649, + "step": 45734 + }, + { + "epoch": 2.839096157427525, + "grad_norm": 0.13562038500576545, + "learning_rate": 8.737397789816081e-07, + "loss": 2.7568, + "step": 45735 + }, + { + "epoch": 2.8391582345272828, + "grad_norm": 0.1490286424960259, + "learning_rate": 8.730676956346051e-07, + "loss": 2.7947, + "step": 45736 + }, + { + "epoch": 2.8392203116270407, + "grad_norm": 0.13130358839502765, + "learning_rate": 8.723958685947331e-07, + "loss": 2.6497, + "step": 45737 + }, + { + "epoch": 2.8392823887267986, + "grad_norm": 0.1367984471320634, + "learning_rate": 8.717242978654949e-07, + "loss": 2.6697, + "step": 45738 + }, + { + "epoch": 2.8393444658265565, + "grad_norm": 0.1325975670437043, + "learning_rate": 8.710529834503989e-07, + "loss": 2.6817, + "step": 45739 + }, + { + "epoch": 2.8394065429263144, + "grad_norm": 0.13934960436494492, + "learning_rate": 8.703819253529421e-07, + "loss": 2.7271, + "step": 45740 + }, + { + "epoch": 2.8394686200260724, + "grad_norm": 0.13356759194497744, + "learning_rate": 8.697111235766387e-07, + "loss": 2.6857, + "step": 45741 + }, + { + "epoch": 2.8395306971258303, + "grad_norm": 0.13934505292732877, + "learning_rate": 8.690405781249744e-07, + "loss": 2.6487, + "step": 45742 + }, + { + "epoch": 2.839592774225588, + "grad_norm": 0.1495878293730695, + "learning_rate": 8.683702890014523e-07, + "loss": 2.7545, + "step": 45743 + }, + { + "epoch": 2.839654851325346, + "grad_norm": 0.1316079959559367, + "learning_rate": 8.677002562095637e-07, + "loss": 2.6817, + "step": 45744 + }, + { + "epoch": 2.839716928425104, + "grad_norm": 0.12816444725948528, + "learning_rate": 8.670304797528173e-07, + "loss": 2.6775, + "step": 45745 + }, + { + "epoch": 2.839779005524862, + "grad_norm": 0.1403466887972245, + "learning_rate": 8.663609596346989e-07, + "loss": 2.6844, + "step": 45746 + }, + { + "epoch": 2.83984108262462, + "grad_norm": 0.13987318373848334, + "learning_rate": 8.656916958587058e-07, + "loss": 2.6815, + "step": 45747 + }, + { + "epoch": 2.839903159724378, + "grad_norm": 0.14602295363737236, + "learning_rate": 8.650226884283185e-07, + "loss": 2.6977, + "step": 45748 + }, + { + "epoch": 2.8399652368241357, + "grad_norm": 0.1443420150778105, + "learning_rate": 8.643539373470399e-07, + "loss": 2.7683, + "step": 45749 + }, + { + "epoch": 2.8400273139238936, + "grad_norm": 0.1441846829305768, + "learning_rate": 8.636854426183616e-07, + "loss": 2.7003, + "step": 45750 + }, + { + "epoch": 2.8400893910236515, + "grad_norm": 0.142486457881853, + "learning_rate": 8.630172042457585e-07, + "loss": 2.6876, + "step": 45751 + }, + { + "epoch": 2.840151468123409, + "grad_norm": 0.13938543589613964, + "learning_rate": 8.623492222327167e-07, + "loss": 2.7131, + "step": 45752 + }, + { + "epoch": 2.8402135452231674, + "grad_norm": 0.14763239183379598, + "learning_rate": 8.616814965827391e-07, + "loss": 2.7103, + "step": 45753 + }, + { + "epoch": 2.840275622322925, + "grad_norm": 0.1303912515093801, + "learning_rate": 8.61014027299295e-07, + "loss": 2.7165, + "step": 45754 + }, + { + "epoch": 2.840337699422683, + "grad_norm": 0.14003645071150547, + "learning_rate": 8.60346814385865e-07, + "loss": 2.763, + "step": 45755 + }, + { + "epoch": 2.8403997765224407, + "grad_norm": 0.15089950297485832, + "learning_rate": 8.596798578459409e-07, + "loss": 2.6858, + "step": 45756 + }, + { + "epoch": 2.8404618536221986, + "grad_norm": 0.1435567310732752, + "learning_rate": 8.590131576829863e-07, + "loss": 2.7515, + "step": 45757 + }, + { + "epoch": 2.8405239307219565, + "grad_norm": 0.13270344183464142, + "learning_rate": 8.583467139004931e-07, + "loss": 2.6781, + "step": 45758 + }, + { + "epoch": 2.8405860078217144, + "grad_norm": 0.13923749949438738, + "learning_rate": 8.576805265019416e-07, + "loss": 2.7077, + "step": 45759 + }, + { + "epoch": 2.8406480849214724, + "grad_norm": 0.13944086907863554, + "learning_rate": 8.57014595490796e-07, + "loss": 2.7576, + "step": 45760 + }, + { + "epoch": 2.8407101620212303, + "grad_norm": 0.13125575938531128, + "learning_rate": 8.563489208705366e-07, + "loss": 2.7037, + "step": 45761 + }, + { + "epoch": 2.840772239120988, + "grad_norm": 0.13222559107278525, + "learning_rate": 8.55683502644633e-07, + "loss": 2.658, + "step": 45762 + }, + { + "epoch": 2.840834316220746, + "grad_norm": 0.16434554819776034, + "learning_rate": 8.550183408165602e-07, + "loss": 2.7326, + "step": 45763 + }, + { + "epoch": 2.840896393320504, + "grad_norm": 0.1322338457228796, + "learning_rate": 8.543534353897875e-07, + "loss": 2.6579, + "step": 45764 + }, + { + "epoch": 2.840958470420262, + "grad_norm": 0.1273611407896419, + "learning_rate": 8.536887863677845e-07, + "loss": 2.6966, + "step": 45765 + }, + { + "epoch": 2.84102054752002, + "grad_norm": 0.1473084665712501, + "learning_rate": 8.530243937540095e-07, + "loss": 2.7152, + "step": 45766 + }, + { + "epoch": 2.841082624619778, + "grad_norm": 0.13582956808621, + "learning_rate": 8.523602575519429e-07, + "loss": 2.7448, + "step": 45767 + }, + { + "epoch": 2.8411447017195357, + "grad_norm": 0.13232424690274436, + "learning_rate": 8.516963777650489e-07, + "loss": 2.7826, + "step": 45768 + }, + { + "epoch": 2.8412067788192936, + "grad_norm": 0.13238773018265254, + "learning_rate": 8.5103275439678e-07, + "loss": 2.6481, + "step": 45769 + }, + { + "epoch": 2.8412688559190515, + "grad_norm": 0.13123350607457968, + "learning_rate": 8.503693874506058e-07, + "loss": 2.728, + "step": 45770 + }, + { + "epoch": 2.8413309330188095, + "grad_norm": 0.13472092619730813, + "learning_rate": 8.497062769299846e-07, + "loss": 2.6989, + "step": 45771 + }, + { + "epoch": 2.8413930101185674, + "grad_norm": 0.14695871483465128, + "learning_rate": 8.490434228383803e-07, + "loss": 2.7127, + "step": 45772 + }, + { + "epoch": 2.8414550872183253, + "grad_norm": 0.13731665660841952, + "learning_rate": 8.483808251792457e-07, + "loss": 2.687, + "step": 45773 + }, + { + "epoch": 2.841517164318083, + "grad_norm": 0.13207710953794483, + "learning_rate": 8.477184839560448e-07, + "loss": 2.7341, + "step": 45774 + }, + { + "epoch": 2.8415792414178407, + "grad_norm": 0.13092863328269044, + "learning_rate": 8.470563991722247e-07, + "loss": 2.7144, + "step": 45775 + }, + { + "epoch": 2.841641318517599, + "grad_norm": 0.13522100909254417, + "learning_rate": 8.463945708312438e-07, + "loss": 2.7595, + "step": 45776 + }, + { + "epoch": 2.8417033956173565, + "grad_norm": 0.14523590207130724, + "learning_rate": 8.457329989365548e-07, + "loss": 2.7592, + "step": 45777 + }, + { + "epoch": 2.841765472717115, + "grad_norm": 0.13006043299508616, + "learning_rate": 8.450716834916106e-07, + "loss": 2.7008, + "step": 45778 + }, + { + "epoch": 2.8418275498168724, + "grad_norm": 0.1393087363528371, + "learning_rate": 8.44410624499864e-07, + "loss": 2.684, + "step": 45779 + }, + { + "epoch": 2.8418896269166307, + "grad_norm": 0.129079160896488, + "learning_rate": 8.437498219647511e-07, + "loss": 2.6321, + "step": 45780 + }, + { + "epoch": 2.841951704016388, + "grad_norm": 0.13184122145624838, + "learning_rate": 8.430892758897358e-07, + "loss": 2.6811, + "step": 45781 + }, + { + "epoch": 2.842013781116146, + "grad_norm": 0.1312435545928182, + "learning_rate": 8.424289862782542e-07, + "loss": 2.7136, + "step": 45782 + }, + { + "epoch": 2.842075858215904, + "grad_norm": 0.13081670179193858, + "learning_rate": 8.417689531337536e-07, + "loss": 2.6774, + "step": 45783 + }, + { + "epoch": 2.842137935315662, + "grad_norm": 0.1377011155873862, + "learning_rate": 8.411091764596757e-07, + "loss": 2.6814, + "step": 45784 + }, + { + "epoch": 2.84220001241542, + "grad_norm": 0.13185045374319185, + "learning_rate": 8.404496562594677e-07, + "loss": 2.7404, + "step": 45785 + }, + { + "epoch": 2.842262089515178, + "grad_norm": 0.12885475575130736, + "learning_rate": 8.397903925365713e-07, + "loss": 2.5891, + "step": 45786 + }, + { + "epoch": 2.8423241666149357, + "grad_norm": 0.148199549487574, + "learning_rate": 8.391313852944172e-07, + "loss": 2.7578, + "step": 45787 + }, + { + "epoch": 2.8423862437146936, + "grad_norm": 0.13391840036121982, + "learning_rate": 8.384726345364469e-07, + "loss": 2.8051, + "step": 45788 + }, + { + "epoch": 2.8424483208144515, + "grad_norm": 0.13308502734393407, + "learning_rate": 8.378141402661022e-07, + "loss": 2.7098, + "step": 45789 + }, + { + "epoch": 2.8425103979142095, + "grad_norm": 0.15264916372299717, + "learning_rate": 8.371559024868192e-07, + "loss": 2.7264, + "step": 45790 + }, + { + "epoch": 2.8425724750139674, + "grad_norm": 0.1442212698479713, + "learning_rate": 8.36497921202023e-07, + "loss": 2.7141, + "step": 45791 + }, + { + "epoch": 2.8426345521137253, + "grad_norm": 0.13900939404223542, + "learning_rate": 8.358401964151552e-07, + "loss": 2.7474, + "step": 45792 + }, + { + "epoch": 2.842696629213483, + "grad_norm": 0.1385699555787682, + "learning_rate": 8.351827281296465e-07, + "loss": 2.6479, + "step": 45793 + }, + { + "epoch": 2.842758706313241, + "grad_norm": 0.12982956074283358, + "learning_rate": 8.345255163489163e-07, + "loss": 2.7171, + "step": 45794 + }, + { + "epoch": 2.842820783412999, + "grad_norm": 0.13014072967948281, + "learning_rate": 8.338685610764119e-07, + "loss": 2.599, + "step": 45795 + }, + { + "epoch": 2.842882860512757, + "grad_norm": 0.13318528422244877, + "learning_rate": 8.332118623155472e-07, + "loss": 2.6905, + "step": 45796 + }, + { + "epoch": 2.842944937612515, + "grad_norm": 0.13039094438050905, + "learning_rate": 8.325554200697527e-07, + "loss": 2.7236, + "step": 45797 + }, + { + "epoch": 2.843007014712273, + "grad_norm": 0.14059434000202217, + "learning_rate": 8.318992343424481e-07, + "loss": 2.6899, + "step": 45798 + }, + { + "epoch": 2.8430690918120307, + "grad_norm": 0.15462085656186264, + "learning_rate": 8.312433051370638e-07, + "loss": 2.6676, + "step": 45799 + }, + { + "epoch": 2.843131168911788, + "grad_norm": 0.14471590643815455, + "learning_rate": 8.305876324570194e-07, + "loss": 2.8032, + "step": 45800 + }, + { + "epoch": 2.8431932460115465, + "grad_norm": 0.15476697821027938, + "learning_rate": 8.299322163057344e-07, + "loss": 2.7075, + "step": 45801 + }, + { + "epoch": 2.843255323111304, + "grad_norm": 0.1267662506939128, + "learning_rate": 8.292770566866281e-07, + "loss": 2.6826, + "step": 45802 + }, + { + "epoch": 2.8433174002110624, + "grad_norm": 0.13945884105176606, + "learning_rate": 8.286221536031258e-07, + "loss": 2.6117, + "step": 45803 + }, + { + "epoch": 2.84337947731082, + "grad_norm": 0.1357133066204344, + "learning_rate": 8.279675070586357e-07, + "loss": 2.6855, + "step": 45804 + }, + { + "epoch": 2.8434415544105778, + "grad_norm": 0.14299883239359445, + "learning_rate": 8.273131170565718e-07, + "loss": 2.7387, + "step": 45805 + }, + { + "epoch": 2.8435036315103357, + "grad_norm": 0.15528464726704413, + "learning_rate": 8.266589836003591e-07, + "loss": 2.6777, + "step": 45806 + }, + { + "epoch": 2.8435657086100936, + "grad_norm": 0.1472336742032649, + "learning_rate": 8.260051066934004e-07, + "loss": 2.6619, + "step": 45807 + }, + { + "epoch": 2.8436277857098515, + "grad_norm": 0.13654393190991745, + "learning_rate": 8.253514863391099e-07, + "loss": 2.5778, + "step": 45808 + }, + { + "epoch": 2.8436898628096094, + "grad_norm": 0.13347503953734144, + "learning_rate": 8.246981225409011e-07, + "loss": 2.6371, + "step": 45809 + }, + { + "epoch": 2.8437519399093674, + "grad_norm": 0.13753378301366836, + "learning_rate": 8.240450153021773e-07, + "loss": 2.8044, + "step": 45810 + }, + { + "epoch": 2.8438140170091253, + "grad_norm": 0.129579721039635, + "learning_rate": 8.233921646263465e-07, + "loss": 2.6368, + "step": 45811 + }, + { + "epoch": 2.843876094108883, + "grad_norm": 0.13573818749770883, + "learning_rate": 8.227395705168173e-07, + "loss": 2.688, + "step": 45812 + }, + { + "epoch": 2.843938171208641, + "grad_norm": 0.12985943461960575, + "learning_rate": 8.22087232976998e-07, + "loss": 2.6916, + "step": 45813 + }, + { + "epoch": 2.844000248308399, + "grad_norm": 0.13889960274169227, + "learning_rate": 8.214351520102859e-07, + "loss": 2.8008, + "step": 45814 + }, + { + "epoch": 2.844062325408157, + "grad_norm": 0.1409886876000185, + "learning_rate": 8.207833276200893e-07, + "loss": 2.7365, + "step": 45815 + }, + { + "epoch": 2.844124402507915, + "grad_norm": 0.12865408885068175, + "learning_rate": 8.201317598097946e-07, + "loss": 2.7006, + "step": 45816 + }, + { + "epoch": 2.844186479607673, + "grad_norm": 0.14030128545697326, + "learning_rate": 8.194804485828211e-07, + "loss": 2.747, + "step": 45817 + }, + { + "epoch": 2.8442485567074307, + "grad_norm": 0.13350379073327784, + "learning_rate": 8.188293939425551e-07, + "loss": 2.7284, + "step": 45818 + }, + { + "epoch": 2.8443106338071886, + "grad_norm": 0.13011094902060422, + "learning_rate": 8.181785958923938e-07, + "loss": 2.6965, + "step": 45819 + }, + { + "epoch": 2.8443727109069465, + "grad_norm": 0.13172431034692042, + "learning_rate": 8.175280544357289e-07, + "loss": 2.6621, + "step": 45820 + }, + { + "epoch": 2.8444347880067045, + "grad_norm": 0.13870005925632098, + "learning_rate": 8.168777695759689e-07, + "loss": 2.7154, + "step": 45821 + }, + { + "epoch": 2.8444968651064624, + "grad_norm": 0.14901006221117966, + "learning_rate": 8.162277413164887e-07, + "loss": 2.7649, + "step": 45822 + }, + { + "epoch": 2.84455894220622, + "grad_norm": 0.14073093053779204, + "learning_rate": 8.155779696606913e-07, + "loss": 2.684, + "step": 45823 + }, + { + "epoch": 2.844621019305978, + "grad_norm": 0.13719018988915802, + "learning_rate": 8.149284546119684e-07, + "loss": 2.6582, + "step": 45824 + }, + { + "epoch": 2.8446830964057357, + "grad_norm": 0.1526801966896969, + "learning_rate": 8.142791961736951e-07, + "loss": 2.7612, + "step": 45825 + }, + { + "epoch": 2.844745173505494, + "grad_norm": 0.14519216235922858, + "learning_rate": 8.136301943492741e-07, + "loss": 2.7083, + "step": 45826 + }, + { + "epoch": 2.8448072506052515, + "grad_norm": 0.14874832962805268, + "learning_rate": 8.129814491420862e-07, + "loss": 2.7068, + "step": 45827 + }, + { + "epoch": 2.84486932770501, + "grad_norm": 0.132536510203482, + "learning_rate": 8.123329605555119e-07, + "loss": 2.7189, + "step": 45828 + }, + { + "epoch": 2.8449314048047674, + "grad_norm": 0.14067030547742135, + "learning_rate": 8.116847285929374e-07, + "loss": 2.6321, + "step": 45829 + }, + { + "epoch": 2.8449934819045253, + "grad_norm": 0.13655854784661037, + "learning_rate": 8.110367532577379e-07, + "loss": 2.505, + "step": 45830 + }, + { + "epoch": 2.845055559004283, + "grad_norm": 0.1301889683310546, + "learning_rate": 8.103890345533105e-07, + "loss": 2.8, + "step": 45831 + }, + { + "epoch": 2.845117636104041, + "grad_norm": 0.1355492778532118, + "learning_rate": 8.097415724830192e-07, + "loss": 2.726, + "step": 45832 + }, + { + "epoch": 2.845179713203799, + "grad_norm": 0.12884941437571146, + "learning_rate": 8.090943670502505e-07, + "loss": 2.6751, + "step": 45833 + }, + { + "epoch": 2.845241790303557, + "grad_norm": 0.1264704110234672, + "learning_rate": 8.084474182583734e-07, + "loss": 2.6425, + "step": 45834 + }, + { + "epoch": 2.845303867403315, + "grad_norm": 0.15174447692053733, + "learning_rate": 8.078007261107746e-07, + "loss": 2.7325, + "step": 45835 + }, + { + "epoch": 2.845365944503073, + "grad_norm": 0.14658914633909872, + "learning_rate": 8.071542906108176e-07, + "loss": 2.6794, + "step": 45836 + }, + { + "epoch": 2.8454280216028307, + "grad_norm": 0.13147056246379313, + "learning_rate": 8.065081117618834e-07, + "loss": 2.7278, + "step": 45837 + }, + { + "epoch": 2.8454900987025886, + "grad_norm": 0.14464228463227807, + "learning_rate": 8.058621895673302e-07, + "loss": 2.7105, + "step": 45838 + }, + { + "epoch": 2.8455521758023465, + "grad_norm": 0.1287300249108575, + "learning_rate": 8.052165240305387e-07, + "loss": 2.6971, + "step": 45839 + }, + { + "epoch": 2.8456142529021045, + "grad_norm": 0.14103398190368532, + "learning_rate": 8.045711151548785e-07, + "loss": 2.6526, + "step": 45840 + }, + { + "epoch": 2.8456763300018624, + "grad_norm": 0.12820867882088913, + "learning_rate": 8.039259629437135e-07, + "loss": 2.6738, + "step": 45841 + }, + { + "epoch": 2.8457384071016203, + "grad_norm": 0.12582869316170808, + "learning_rate": 8.032810674004077e-07, + "loss": 2.6678, + "step": 45842 + }, + { + "epoch": 2.845800484201378, + "grad_norm": 0.1339053810852596, + "learning_rate": 8.02636428528325e-07, + "loss": 2.6867, + "step": 45843 + }, + { + "epoch": 2.845862561301136, + "grad_norm": 0.1346293741957151, + "learning_rate": 8.01992046330835e-07, + "loss": 2.6551, + "step": 45844 + }, + { + "epoch": 2.845924638400894, + "grad_norm": 0.13306372387432053, + "learning_rate": 8.013479208112906e-07, + "loss": 2.7286, + "step": 45845 + }, + { + "epoch": 2.845986715500652, + "grad_norm": 0.12871650978958074, + "learning_rate": 8.007040519730613e-07, + "loss": 2.6556, + "step": 45846 + }, + { + "epoch": 2.84604879260041, + "grad_norm": 0.13211203733183274, + "learning_rate": 8.000604398195e-07, + "loss": 2.6972, + "step": 45847 + }, + { + "epoch": 2.8461108697001674, + "grad_norm": 0.1419315439509036, + "learning_rate": 7.994170843539706e-07, + "loss": 2.7303, + "step": 45848 + }, + { + "epoch": 2.8461729467999257, + "grad_norm": 0.13822639126088695, + "learning_rate": 7.98773985579826e-07, + "loss": 2.7684, + "step": 45849 + }, + { + "epoch": 2.846235023899683, + "grad_norm": 0.13480019471544627, + "learning_rate": 7.981311435004247e-07, + "loss": 2.6948, + "step": 45850 + }, + { + "epoch": 2.8462971009994416, + "grad_norm": 0.1376628566936145, + "learning_rate": 7.974885581191083e-07, + "loss": 2.6793, + "step": 45851 + }, + { + "epoch": 2.846359178099199, + "grad_norm": 0.12848156271807984, + "learning_rate": 7.968462294392409e-07, + "loss": 2.6277, + "step": 45852 + }, + { + "epoch": 2.846421255198957, + "grad_norm": 0.1367474457734996, + "learning_rate": 7.962041574641755e-07, + "loss": 2.7352, + "step": 45853 + }, + { + "epoch": 2.846483332298715, + "grad_norm": 0.12980329617843217, + "learning_rate": 7.955623421972591e-07, + "loss": 2.7167, + "step": 45854 + }, + { + "epoch": 2.846545409398473, + "grad_norm": 0.13072603463857582, + "learning_rate": 7.949207836418282e-07, + "loss": 2.5724, + "step": 45855 + }, + { + "epoch": 2.8466074864982307, + "grad_norm": 0.1383988284655418, + "learning_rate": 7.942794818012523e-07, + "loss": 2.7026, + "step": 45856 + }, + { + "epoch": 2.8466695635979886, + "grad_norm": 0.14788460658301777, + "learning_rate": 7.936384366788562e-07, + "loss": 2.7595, + "step": 45857 + }, + { + "epoch": 2.8467316406977465, + "grad_norm": 0.15191405782113493, + "learning_rate": 7.929976482779989e-07, + "loss": 2.6911, + "step": 45858 + }, + { + "epoch": 2.8467937177975045, + "grad_norm": 0.14611767965569508, + "learning_rate": 7.923571166020161e-07, + "loss": 2.7358, + "step": 45859 + }, + { + "epoch": 2.8468557948972624, + "grad_norm": 0.136353392841668, + "learning_rate": 7.917168416542553e-07, + "loss": 2.6113, + "step": 45860 + }, + { + "epoch": 2.8469178719970203, + "grad_norm": 0.12706689138700114, + "learning_rate": 7.910768234380472e-07, + "loss": 2.7672, + "step": 45861 + }, + { + "epoch": 2.846979949096778, + "grad_norm": 0.13019126462323513, + "learning_rate": 7.904370619567447e-07, + "loss": 2.7635, + "step": 45862 + }, + { + "epoch": 2.847042026196536, + "grad_norm": 0.1281600944790733, + "learning_rate": 7.897975572136729e-07, + "loss": 2.6927, + "step": 45863 + }, + { + "epoch": 2.847104103296294, + "grad_norm": 0.13226382436275838, + "learning_rate": 7.891583092121791e-07, + "loss": 2.7058, + "step": 45864 + }, + { + "epoch": 2.847166180396052, + "grad_norm": 0.1392350915293835, + "learning_rate": 7.885193179555883e-07, + "loss": 2.7459, + "step": 45865 + }, + { + "epoch": 2.84722825749581, + "grad_norm": 0.14534933305325876, + "learning_rate": 7.878805834472369e-07, + "loss": 2.6055, + "step": 45866 + }, + { + "epoch": 2.847290334595568, + "grad_norm": 0.1279108601467903, + "learning_rate": 7.87242105690461e-07, + "loss": 2.7228, + "step": 45867 + }, + { + "epoch": 2.8473524116953257, + "grad_norm": 0.1402944746843438, + "learning_rate": 7.866038846885914e-07, + "loss": 2.6279, + "step": 45868 + }, + { + "epoch": 2.8474144887950836, + "grad_norm": 0.1366988099407259, + "learning_rate": 7.859659204449588e-07, + "loss": 2.7475, + "step": 45869 + }, + { + "epoch": 2.8474765658948415, + "grad_norm": 0.14419152309690053, + "learning_rate": 7.853282129628825e-07, + "loss": 2.7839, + "step": 45870 + }, + { + "epoch": 2.847538642994599, + "grad_norm": 0.14288500850395136, + "learning_rate": 7.846907622456989e-07, + "loss": 2.654, + "step": 45871 + }, + { + "epoch": 2.8476007200943574, + "grad_norm": 0.13844404174986114, + "learning_rate": 7.840535682967332e-07, + "loss": 2.6559, + "step": 45872 + }, + { + "epoch": 2.847662797194115, + "grad_norm": 0.13074283962127012, + "learning_rate": 7.834166311193048e-07, + "loss": 2.5849, + "step": 45873 + }, + { + "epoch": 2.847724874293873, + "grad_norm": 0.13013976357905785, + "learning_rate": 7.827799507167388e-07, + "loss": 2.6823, + "step": 45874 + }, + { + "epoch": 2.8477869513936307, + "grad_norm": 0.13304511904047728, + "learning_rate": 7.821435270923605e-07, + "loss": 2.6343, + "step": 45875 + }, + { + "epoch": 2.847849028493389, + "grad_norm": 0.1286895476586393, + "learning_rate": 7.815073602494894e-07, + "loss": 2.7214, + "step": 45876 + }, + { + "epoch": 2.8479111055931465, + "grad_norm": 0.12935264917244255, + "learning_rate": 7.808714501914394e-07, + "loss": 2.6723, + "step": 45877 + }, + { + "epoch": 2.8479731826929044, + "grad_norm": 0.14154734395643545, + "learning_rate": 7.802357969215301e-07, + "loss": 2.5768, + "step": 45878 + }, + { + "epoch": 2.8480352597926624, + "grad_norm": 0.14514318004933685, + "learning_rate": 7.796004004430812e-07, + "loss": 2.6991, + "step": 45879 + }, + { + "epoch": 2.8480973368924203, + "grad_norm": 0.13002362899660735, + "learning_rate": 7.789652607594067e-07, + "loss": 2.647, + "step": 45880 + }, + { + "epoch": 2.848159413992178, + "grad_norm": 0.13702028945141909, + "learning_rate": 7.783303778738149e-07, + "loss": 2.6667, + "step": 45881 + }, + { + "epoch": 2.848221491091936, + "grad_norm": 0.13185996083973497, + "learning_rate": 7.77695751789631e-07, + "loss": 2.7538, + "step": 45882 + }, + { + "epoch": 2.848283568191694, + "grad_norm": 0.12573308111221668, + "learning_rate": 7.770613825101469e-07, + "loss": 2.6172, + "step": 45883 + }, + { + "epoch": 2.848345645291452, + "grad_norm": 0.14974381330292857, + "learning_rate": 7.764272700386876e-07, + "loss": 2.6846, + "step": 45884 + }, + { + "epoch": 2.84840772239121, + "grad_norm": 0.1348802935610605, + "learning_rate": 7.757934143785562e-07, + "loss": 2.6607, + "step": 45885 + }, + { + "epoch": 2.848469799490968, + "grad_norm": 0.12607165008221144, + "learning_rate": 7.751598155330663e-07, + "loss": 2.6949, + "step": 45886 + }, + { + "epoch": 2.8485318765907257, + "grad_norm": 0.1440107277170677, + "learning_rate": 7.745264735055047e-07, + "loss": 2.5919, + "step": 45887 + }, + { + "epoch": 2.8485939536904836, + "grad_norm": 0.14488661097949126, + "learning_rate": 7.738933882991961e-07, + "loss": 2.5867, + "step": 45888 + }, + { + "epoch": 2.8486560307902415, + "grad_norm": 0.13588244238503508, + "learning_rate": 7.732605599174325e-07, + "loss": 2.6651, + "step": 45889 + }, + { + "epoch": 2.8487181078899995, + "grad_norm": 0.12739089900379993, + "learning_rate": 7.726279883635223e-07, + "loss": 2.6722, + "step": 45890 + }, + { + "epoch": 2.8487801849897574, + "grad_norm": 0.13285488966847345, + "learning_rate": 7.71995673640763e-07, + "loss": 2.6624, + "step": 45891 + }, + { + "epoch": 2.8488422620895153, + "grad_norm": 0.13046438472549066, + "learning_rate": 7.713636157524517e-07, + "loss": 2.8531, + "step": 45892 + }, + { + "epoch": 2.848904339189273, + "grad_norm": 0.13394185550975587, + "learning_rate": 7.70731814701886e-07, + "loss": 2.7471, + "step": 45893 + }, + { + "epoch": 2.848966416289031, + "grad_norm": 0.13188299822620517, + "learning_rate": 7.701002704923632e-07, + "loss": 2.7401, + "step": 45894 + }, + { + "epoch": 2.849028493388789, + "grad_norm": 0.12931355515727586, + "learning_rate": 7.694689831271807e-07, + "loss": 2.6682, + "step": 45895 + }, + { + "epoch": 2.8490905704885465, + "grad_norm": 0.13333080638289016, + "learning_rate": 7.688379526096301e-07, + "loss": 2.7361, + "step": 45896 + }, + { + "epoch": 2.849152647588305, + "grad_norm": 0.12710910165214573, + "learning_rate": 7.682071789429979e-07, + "loss": 2.6459, + "step": 45897 + }, + { + "epoch": 2.8492147246880624, + "grad_norm": 0.13608218911488637, + "learning_rate": 7.675766621305814e-07, + "loss": 2.7107, + "step": 45898 + }, + { + "epoch": 2.8492768017878207, + "grad_norm": 0.13090731326309082, + "learning_rate": 7.669464021756723e-07, + "loss": 2.5973, + "step": 45899 + }, + { + "epoch": 2.849338878887578, + "grad_norm": 0.1322608880661073, + "learning_rate": 7.66316399081557e-07, + "loss": 2.7227, + "step": 45900 + }, + { + "epoch": 2.849400955987336, + "grad_norm": 0.1457290439716538, + "learning_rate": 7.656866528515161e-07, + "loss": 2.7915, + "step": 45901 + }, + { + "epoch": 2.849463033087094, + "grad_norm": 0.1276758894176588, + "learning_rate": 7.650571634888415e-07, + "loss": 2.7942, + "step": 45902 + }, + { + "epoch": 2.849525110186852, + "grad_norm": 0.13304252199633168, + "learning_rate": 7.644279309968194e-07, + "loss": 2.61, + "step": 45903 + }, + { + "epoch": 2.84958718728661, + "grad_norm": 0.12884079109234298, + "learning_rate": 7.637989553787251e-07, + "loss": 2.7189, + "step": 45904 + }, + { + "epoch": 2.849649264386368, + "grad_norm": 0.13131947551884723, + "learning_rate": 7.631702366378501e-07, + "loss": 2.7679, + "step": 45905 + }, + { + "epoch": 2.8497113414861257, + "grad_norm": 0.1305052320789856, + "learning_rate": 7.625417747774589e-07, + "loss": 2.7304, + "step": 45906 + }, + { + "epoch": 2.8497734185858836, + "grad_norm": 0.13013446779006183, + "learning_rate": 7.619135698008428e-07, + "loss": 2.6602, + "step": 45907 + }, + { + "epoch": 2.8498354956856415, + "grad_norm": 0.13060484515075907, + "learning_rate": 7.61285621711283e-07, + "loss": 2.7301, + "step": 45908 + }, + { + "epoch": 2.8498975727853995, + "grad_norm": 0.13465639884880304, + "learning_rate": 7.606579305120431e-07, + "loss": 2.6399, + "step": 45909 + }, + { + "epoch": 2.8499596498851574, + "grad_norm": 0.1368730968077391, + "learning_rate": 7.600304962064043e-07, + "loss": 2.6689, + "step": 45910 + }, + { + "epoch": 2.8500217269849153, + "grad_norm": 0.1355823357778782, + "learning_rate": 7.594033187976412e-07, + "loss": 2.6383, + "step": 45911 + }, + { + "epoch": 2.850083804084673, + "grad_norm": 0.12930198094638384, + "learning_rate": 7.587763982890294e-07, + "loss": 2.7205, + "step": 45912 + }, + { + "epoch": 2.850145881184431, + "grad_norm": 0.12844175401311464, + "learning_rate": 7.581497346838274e-07, + "loss": 2.696, + "step": 45913 + }, + { + "epoch": 2.850207958284189, + "grad_norm": 0.14142157856176066, + "learning_rate": 7.575233279853156e-07, + "loss": 2.6642, + "step": 45914 + }, + { + "epoch": 2.850270035383947, + "grad_norm": 0.12968590517097553, + "learning_rate": 7.568971781967582e-07, + "loss": 2.7153, + "step": 45915 + }, + { + "epoch": 2.850332112483705, + "grad_norm": 0.13103957366206856, + "learning_rate": 7.562712853214193e-07, + "loss": 2.6929, + "step": 45916 + }, + { + "epoch": 2.850394189583463, + "grad_norm": 0.13982708714486097, + "learning_rate": 7.556456493625741e-07, + "loss": 2.6634, + "step": 45917 + }, + { + "epoch": 2.8504562666832207, + "grad_norm": 0.149266325770981, + "learning_rate": 7.550202703234754e-07, + "loss": 2.7552, + "step": 45918 + }, + { + "epoch": 2.850518343782978, + "grad_norm": 0.1321649402166009, + "learning_rate": 7.543951482073874e-07, + "loss": 2.74, + "step": 45919 + }, + { + "epoch": 2.8505804208827366, + "grad_norm": 0.14275069440474028, + "learning_rate": 7.537702830175797e-07, + "loss": 2.6396, + "step": 45920 + }, + { + "epoch": 2.850642497982494, + "grad_norm": 0.13226382436275838, + "learning_rate": 7.531456747573052e-07, + "loss": 2.7492, + "step": 45921 + }, + { + "epoch": 2.8507045750822524, + "grad_norm": 0.12674512696821094, + "learning_rate": 7.525213234298279e-07, + "loss": 2.606, + "step": 45922 + }, + { + "epoch": 2.85076665218201, + "grad_norm": 0.12652255181346758, + "learning_rate": 7.518972290384007e-07, + "loss": 2.6908, + "step": 45923 + }, + { + "epoch": 2.8508287292817682, + "grad_norm": 0.13176671789424688, + "learning_rate": 7.512733915862769e-07, + "loss": 2.7335, + "step": 45924 + }, + { + "epoch": 2.8508908063815257, + "grad_norm": 0.14024816461201944, + "learning_rate": 7.506498110767201e-07, + "loss": 2.701, + "step": 45925 + }, + { + "epoch": 2.8509528834812836, + "grad_norm": 0.1490641716997308, + "learning_rate": 7.500264875129781e-07, + "loss": 2.7659, + "step": 45926 + }, + { + "epoch": 2.8510149605810415, + "grad_norm": 0.1359256972198573, + "learning_rate": 7.494034208983036e-07, + "loss": 2.6855, + "step": 45927 + }, + { + "epoch": 2.8510770376807995, + "grad_norm": 0.13540144217323907, + "learning_rate": 7.487806112359497e-07, + "loss": 2.7478, + "step": 45928 + }, + { + "epoch": 2.8511391147805574, + "grad_norm": 0.13489975681146923, + "learning_rate": 7.481580585291581e-07, + "loss": 2.7274, + "step": 45929 + }, + { + "epoch": 2.8512011918803153, + "grad_norm": 0.1304444453163838, + "learning_rate": 7.475357627811874e-07, + "loss": 2.7131, + "step": 45930 + }, + { + "epoch": 2.851263268980073, + "grad_norm": 0.1384115529231995, + "learning_rate": 7.469137239952795e-07, + "loss": 2.7398, + "step": 45931 + }, + { + "epoch": 2.851325346079831, + "grad_norm": 0.14553370661399698, + "learning_rate": 7.462919421746761e-07, + "loss": 2.7051, + "step": 45932 + }, + { + "epoch": 2.851387423179589, + "grad_norm": 0.13902408572906258, + "learning_rate": 7.456704173226192e-07, + "loss": 2.6801, + "step": 45933 + }, + { + "epoch": 2.851449500279347, + "grad_norm": 0.13737878875436704, + "learning_rate": 7.450491494423672e-07, + "loss": 2.7066, + "step": 45934 + }, + { + "epoch": 2.851511577379105, + "grad_norm": 0.14028152932198518, + "learning_rate": 7.444281385371454e-07, + "loss": 2.6875, + "step": 45935 + }, + { + "epoch": 2.851573654478863, + "grad_norm": 0.13223848697643084, + "learning_rate": 7.438073846102011e-07, + "loss": 2.7393, + "step": 45936 + }, + { + "epoch": 2.8516357315786207, + "grad_norm": 0.14198059785079115, + "learning_rate": 7.431868876647652e-07, + "loss": 2.7404, + "step": 45937 + }, + { + "epoch": 2.8516978086783786, + "grad_norm": 0.13729972694736914, + "learning_rate": 7.425666477040904e-07, + "loss": 2.7192, + "step": 45938 + }, + { + "epoch": 2.8517598857781365, + "grad_norm": 0.1402369682263507, + "learning_rate": 7.419466647313967e-07, + "loss": 2.631, + "step": 45939 + }, + { + "epoch": 2.8518219628778945, + "grad_norm": 0.1293524403759731, + "learning_rate": 7.413269387499256e-07, + "loss": 2.6664, + "step": 45940 + }, + { + "epoch": 2.8518840399776524, + "grad_norm": 0.13205254033224184, + "learning_rate": 7.407074697629135e-07, + "loss": 2.7991, + "step": 45941 + }, + { + "epoch": 2.8519461170774103, + "grad_norm": 0.13027387558893438, + "learning_rate": 7.400882577735857e-07, + "loss": 2.7333, + "step": 45942 + }, + { + "epoch": 2.852008194177168, + "grad_norm": 0.14433583374432665, + "learning_rate": 7.394693027851729e-07, + "loss": 2.7149, + "step": 45943 + }, + { + "epoch": 2.8520702712769257, + "grad_norm": 0.13050885012550126, + "learning_rate": 7.388506048009114e-07, + "loss": 2.7677, + "step": 45944 + }, + { + "epoch": 2.852132348376684, + "grad_norm": 0.14538364104603127, + "learning_rate": 7.382321638240263e-07, + "loss": 2.7367, + "step": 45945 + }, + { + "epoch": 2.8521944254764415, + "grad_norm": 0.13938031766993697, + "learning_rate": 7.376139798577375e-07, + "loss": 2.6177, + "step": 45946 + }, + { + "epoch": 2.8522565025762, + "grad_norm": 0.1344791560378638, + "learning_rate": 7.369960529052755e-07, + "loss": 2.6775, + "step": 45947 + }, + { + "epoch": 2.8523185796759574, + "grad_norm": 0.13126223030327813, + "learning_rate": 7.363783829698711e-07, + "loss": 2.8153, + "step": 45948 + }, + { + "epoch": 2.8523806567757153, + "grad_norm": 0.12737765855373184, + "learning_rate": 7.35760970054733e-07, + "loss": 2.7228, + "step": 45949 + }, + { + "epoch": 2.852442733875473, + "grad_norm": 0.1410276027463946, + "learning_rate": 7.351438141630917e-07, + "loss": 2.7428, + "step": 45950 + }, + { + "epoch": 2.852504810975231, + "grad_norm": 0.14078020432027782, + "learning_rate": 7.345269152981616e-07, + "loss": 2.6774, + "step": 45951 + }, + { + "epoch": 2.852566888074989, + "grad_norm": 0.1347756796693408, + "learning_rate": 7.339102734631675e-07, + "loss": 2.6836, + "step": 45952 + }, + { + "epoch": 2.852628965174747, + "grad_norm": 0.14122838140777594, + "learning_rate": 7.332938886613238e-07, + "loss": 2.5914, + "step": 45953 + }, + { + "epoch": 2.852691042274505, + "grad_norm": 0.13371535171056145, + "learning_rate": 7.326777608958446e-07, + "loss": 2.7334, + "step": 45954 + }, + { + "epoch": 2.852753119374263, + "grad_norm": 0.12535139700218015, + "learning_rate": 7.320618901699439e-07, + "loss": 2.6655, + "step": 45955 + }, + { + "epoch": 2.8528151964740207, + "grad_norm": 0.12787710812817538, + "learning_rate": 7.314462764868357e-07, + "loss": 2.6863, + "step": 45956 + }, + { + "epoch": 2.8528772735737786, + "grad_norm": 0.1409005273928901, + "learning_rate": 7.308309198497398e-07, + "loss": 2.6957, + "step": 45957 + }, + { + "epoch": 2.8529393506735365, + "grad_norm": 0.14504391372133255, + "learning_rate": 7.302158202618537e-07, + "loss": 2.7089, + "step": 45958 + }, + { + "epoch": 2.8530014277732945, + "grad_norm": 0.1399018848778595, + "learning_rate": 7.296009777263968e-07, + "loss": 2.7497, + "step": 45959 + }, + { + "epoch": 2.8530635048730524, + "grad_norm": 0.1281758698800362, + "learning_rate": 7.289863922465667e-07, + "loss": 2.7388, + "step": 45960 + }, + { + "epoch": 2.8531255819728103, + "grad_norm": 0.12877866181760636, + "learning_rate": 7.283720638255776e-07, + "loss": 2.6443, + "step": 45961 + }, + { + "epoch": 2.853187659072568, + "grad_norm": 0.16201973930312602, + "learning_rate": 7.277579924666323e-07, + "loss": 2.7103, + "step": 45962 + }, + { + "epoch": 2.853249736172326, + "grad_norm": 0.14248532057535296, + "learning_rate": 7.271441781729393e-07, + "loss": 2.6957, + "step": 45963 + }, + { + "epoch": 2.853311813272084, + "grad_norm": 0.12931040782351438, + "learning_rate": 7.265306209476908e-07, + "loss": 2.7621, + "step": 45964 + }, + { + "epoch": 2.853373890371842, + "grad_norm": 0.12919836469326176, + "learning_rate": 7.25917320794095e-07, + "loss": 2.7224, + "step": 45965 + }, + { + "epoch": 2.8534359674716, + "grad_norm": 0.13179566510000734, + "learning_rate": 7.253042777153496e-07, + "loss": 2.6766, + "step": 45966 + }, + { + "epoch": 2.8534980445713574, + "grad_norm": 0.14281307093049167, + "learning_rate": 7.246914917146574e-07, + "loss": 2.7568, + "step": 45967 + }, + { + "epoch": 2.8535601216711157, + "grad_norm": 0.13092542518667233, + "learning_rate": 7.240789627952105e-07, + "loss": 2.7033, + "step": 45968 + }, + { + "epoch": 2.853622198770873, + "grad_norm": 0.13776446695869674, + "learning_rate": 7.234666909602006e-07, + "loss": 2.7202, + "step": 45969 + }, + { + "epoch": 2.8536842758706316, + "grad_norm": 0.14426928053432247, + "learning_rate": 7.228546762128308e-07, + "loss": 2.6827, + "step": 45970 + }, + { + "epoch": 2.853746352970389, + "grad_norm": 0.13215609635122932, + "learning_rate": 7.222429185562929e-07, + "loss": 2.7567, + "step": 45971 + }, + { + "epoch": 2.8538084300701474, + "grad_norm": 0.13138128226011103, + "learning_rate": 7.216314179937733e-07, + "loss": 2.7217, + "step": 45972 + }, + { + "epoch": 2.853870507169905, + "grad_norm": 0.14396192566074012, + "learning_rate": 7.210201745284639e-07, + "loss": 2.6528, + "step": 45973 + }, + { + "epoch": 2.853932584269663, + "grad_norm": 0.12912814952964308, + "learning_rate": 7.204091881635567e-07, + "loss": 2.677, + "step": 45974 + }, + { + "epoch": 2.8539946613694207, + "grad_norm": 0.1414086112462795, + "learning_rate": 7.197984589022433e-07, + "loss": 2.694, + "step": 45975 + }, + { + "epoch": 2.8540567384691786, + "grad_norm": 0.13893469230725278, + "learning_rate": 7.191879867476991e-07, + "loss": 2.6472, + "step": 45976 + }, + { + "epoch": 2.8541188155689365, + "grad_norm": 0.13889444649689983, + "learning_rate": 7.185777717031162e-07, + "loss": 2.7283, + "step": 45977 + }, + { + "epoch": 2.8541808926686945, + "grad_norm": 0.1365487949687275, + "learning_rate": 7.17967813771675e-07, + "loss": 2.6904, + "step": 45978 + }, + { + "epoch": 2.8542429697684524, + "grad_norm": 0.14404234158654183, + "learning_rate": 7.173581129565565e-07, + "loss": 2.6944, + "step": 45979 + }, + { + "epoch": 2.8543050468682103, + "grad_norm": 0.1315012533627636, + "learning_rate": 7.167486692609526e-07, + "loss": 2.7161, + "step": 45980 + }, + { + "epoch": 2.854367123967968, + "grad_norm": 0.14238656906384206, + "learning_rate": 7.161394826880275e-07, + "loss": 2.7663, + "step": 45981 + }, + { + "epoch": 2.854429201067726, + "grad_norm": 0.13705416815047833, + "learning_rate": 7.155305532409673e-07, + "loss": 2.7253, + "step": 45982 + }, + { + "epoch": 2.854491278167484, + "grad_norm": 0.13508694806911445, + "learning_rate": 7.14921880922953e-07, + "loss": 2.7534, + "step": 45983 + }, + { + "epoch": 2.854553355267242, + "grad_norm": 0.13770799375536433, + "learning_rate": 7.143134657371542e-07, + "loss": 2.7384, + "step": 45984 + }, + { + "epoch": 2.854615432367, + "grad_norm": 0.13185333561008786, + "learning_rate": 7.137053076867462e-07, + "loss": 2.6981, + "step": 45985 + }, + { + "epoch": 2.854677509466758, + "grad_norm": 0.12967699996541487, + "learning_rate": 7.130974067748986e-07, + "loss": 2.6985, + "step": 45986 + }, + { + "epoch": 2.8547395865665157, + "grad_norm": 0.13235825808155938, + "learning_rate": 7.124897630047867e-07, + "loss": 2.7147, + "step": 45987 + }, + { + "epoch": 2.8548016636662736, + "grad_norm": 0.13758764047767133, + "learning_rate": 7.118823763795857e-07, + "loss": 2.713, + "step": 45988 + }, + { + "epoch": 2.8548637407660316, + "grad_norm": 0.1360771724993039, + "learning_rate": 7.112752469024597e-07, + "loss": 2.6477, + "step": 45989 + }, + { + "epoch": 2.8549258178657895, + "grad_norm": 0.1461494176546985, + "learning_rate": 7.10668374576573e-07, + "loss": 2.6934, + "step": 45990 + }, + { + "epoch": 2.8549878949655474, + "grad_norm": 0.13769889042311342, + "learning_rate": 7.100617594050952e-07, + "loss": 2.6688, + "step": 45991 + }, + { + "epoch": 2.855049972065305, + "grad_norm": 0.13951322371819183, + "learning_rate": 7.094554013911902e-07, + "loss": 2.6388, + "step": 45992 + }, + { + "epoch": 2.8551120491650632, + "grad_norm": 0.13091410019258184, + "learning_rate": 7.088493005380225e-07, + "loss": 2.8235, + "step": 45993 + }, + { + "epoch": 2.8551741262648207, + "grad_norm": 0.1365749079285356, + "learning_rate": 7.082434568487562e-07, + "loss": 2.6494, + "step": 45994 + }, + { + "epoch": 2.855236203364579, + "grad_norm": 0.13049066610773474, + "learning_rate": 7.076378703265496e-07, + "loss": 2.7632, + "step": 45995 + }, + { + "epoch": 2.8552982804643365, + "grad_norm": 0.131249288148312, + "learning_rate": 7.070325409745559e-07, + "loss": 2.6888, + "step": 45996 + }, + { + "epoch": 2.8553603575640945, + "grad_norm": 0.1328877978890383, + "learning_rate": 7.064274687959449e-07, + "loss": 2.6541, + "step": 45997 + }, + { + "epoch": 2.8554224346638524, + "grad_norm": 0.1308337442719127, + "learning_rate": 7.058226537938694e-07, + "loss": 2.6495, + "step": 45998 + }, + { + "epoch": 2.8554845117636103, + "grad_norm": 0.1464420702563399, + "learning_rate": 7.052180959714827e-07, + "loss": 2.8235, + "step": 45999 + }, + { + "epoch": 2.855546588863368, + "grad_norm": 0.1279937205329644, + "learning_rate": 7.046137953319376e-07, + "loss": 2.6038, + "step": 46000 + }, + { + "epoch": 2.855608665963126, + "grad_norm": 0.14624347587648445, + "learning_rate": 7.040097518783928e-07, + "loss": 2.6351, + "step": 46001 + }, + { + "epoch": 2.855670743062884, + "grad_norm": 0.12894551083868167, + "learning_rate": 7.034059656139902e-07, + "loss": 2.6298, + "step": 46002 + }, + { + "epoch": 2.855732820162642, + "grad_norm": 0.1489376563668881, + "learning_rate": 7.028024365418939e-07, + "loss": 2.6967, + "step": 46003 + }, + { + "epoch": 2.8557948972624, + "grad_norm": 0.13105856268621074, + "learning_rate": 7.021991646652404e-07, + "loss": 2.7508, + "step": 46004 + }, + { + "epoch": 2.855856974362158, + "grad_norm": 0.12978763965443765, + "learning_rate": 7.015961499871771e-07, + "loss": 2.6797, + "step": 46005 + }, + { + "epoch": 2.8559190514619157, + "grad_norm": 0.15247682114825617, + "learning_rate": 7.009933925108625e-07, + "loss": 2.5963, + "step": 46006 + }, + { + "epoch": 2.8559811285616736, + "grad_norm": 0.12891500616678947, + "learning_rate": 7.003908922394275e-07, + "loss": 2.6916, + "step": 46007 + }, + { + "epoch": 2.8560432056614315, + "grad_norm": 0.14562208693582704, + "learning_rate": 6.997886491760253e-07, + "loss": 2.6845, + "step": 46008 + }, + { + "epoch": 2.8561052827611895, + "grad_norm": 0.12941086144234892, + "learning_rate": 6.991866633237864e-07, + "loss": 2.6916, + "step": 46009 + }, + { + "epoch": 2.8561673598609474, + "grad_norm": 0.16029530396729516, + "learning_rate": 6.985849346858641e-07, + "loss": 2.7632, + "step": 46010 + }, + { + "epoch": 2.8562294369607053, + "grad_norm": 0.13005970976328227, + "learning_rate": 6.979834632653948e-07, + "loss": 2.6944, + "step": 46011 + }, + { + "epoch": 2.856291514060463, + "grad_norm": 0.14316985390516876, + "learning_rate": 6.973822490655147e-07, + "loss": 2.7785, + "step": 46012 + }, + { + "epoch": 2.856353591160221, + "grad_norm": 0.142068192710302, + "learning_rate": 6.967812920893546e-07, + "loss": 2.5984, + "step": 46013 + }, + { + "epoch": 2.856415668259979, + "grad_norm": 0.13552152413092716, + "learning_rate": 6.961805923400566e-07, + "loss": 2.7093, + "step": 46014 + }, + { + "epoch": 2.8564777453597365, + "grad_norm": 0.14213051492447193, + "learning_rate": 6.955801498207571e-07, + "loss": 2.7064, + "step": 46015 + }, + { + "epoch": 2.856539822459495, + "grad_norm": 0.1298690731267619, + "learning_rate": 6.949799645345811e-07, + "loss": 2.6813, + "step": 46016 + }, + { + "epoch": 2.8566018995592524, + "grad_norm": 0.14217150199934278, + "learning_rate": 6.943800364846653e-07, + "loss": 2.7815, + "step": 46017 + }, + { + "epoch": 2.8566639766590107, + "grad_norm": 0.1428479424242986, + "learning_rate": 6.937803656741348e-07, + "loss": 2.6209, + "step": 46018 + }, + { + "epoch": 2.856726053758768, + "grad_norm": 0.14258824926513733, + "learning_rate": 6.931809521061261e-07, + "loss": 2.7212, + "step": 46019 + }, + { + "epoch": 2.8567881308585266, + "grad_norm": 0.13745830646109725, + "learning_rate": 6.925817957837588e-07, + "loss": 2.6424, + "step": 46020 + }, + { + "epoch": 2.856850207958284, + "grad_norm": 0.1436815285356528, + "learning_rate": 6.919828967101639e-07, + "loss": 2.7414, + "step": 46021 + }, + { + "epoch": 2.856912285058042, + "grad_norm": 0.13861774005789437, + "learning_rate": 6.913842548884663e-07, + "loss": 2.7498, + "step": 46022 + }, + { + "epoch": 2.8569743621578, + "grad_norm": 0.1394822659218198, + "learning_rate": 6.907858703217806e-07, + "loss": 2.6761, + "step": 46023 + }, + { + "epoch": 2.857036439257558, + "grad_norm": 0.13355326234202716, + "learning_rate": 6.90187743013243e-07, + "loss": 2.5565, + "step": 46024 + }, + { + "epoch": 2.8570985163573157, + "grad_norm": 0.13515829156425185, + "learning_rate": 6.895898729659622e-07, + "loss": 2.7374, + "step": 46025 + }, + { + "epoch": 2.8571605934570736, + "grad_norm": 0.12860070487875966, + "learning_rate": 6.889922601830634e-07, + "loss": 2.6779, + "step": 46026 + }, + { + "epoch": 2.8572226705568315, + "grad_norm": 0.13585483898818776, + "learning_rate": 6.883949046676663e-07, + "loss": 2.7854, + "step": 46027 + }, + { + "epoch": 2.8572847476565895, + "grad_norm": 0.12758717294017408, + "learning_rate": 6.877978064228741e-07, + "loss": 2.6373, + "step": 46028 + }, + { + "epoch": 2.8573468247563474, + "grad_norm": 0.139513410632478, + "learning_rate": 6.872009654518174e-07, + "loss": 2.7543, + "step": 46029 + }, + { + "epoch": 2.8574089018561053, + "grad_norm": 0.15034390008538698, + "learning_rate": 6.866043817576051e-07, + "loss": 2.6374, + "step": 46030 + }, + { + "epoch": 2.857470978955863, + "grad_norm": 0.12944117730948115, + "learning_rate": 6.860080553433512e-07, + "loss": 2.6994, + "step": 46031 + }, + { + "epoch": 2.857533056055621, + "grad_norm": 0.13382293801351924, + "learning_rate": 6.854119862121533e-07, + "loss": 2.7792, + "step": 46032 + }, + { + "epoch": 2.857595133155379, + "grad_norm": 0.13849630793451487, + "learning_rate": 6.848161743671422e-07, + "loss": 2.759, + "step": 46033 + }, + { + "epoch": 2.857657210255137, + "grad_norm": 0.12984189844704902, + "learning_rate": 6.842206198114154e-07, + "loss": 2.7009, + "step": 46034 + }, + { + "epoch": 2.857719287354895, + "grad_norm": 0.14159001270414293, + "learning_rate": 6.836253225480815e-07, + "loss": 2.6636, + "step": 46035 + }, + { + "epoch": 2.857781364454653, + "grad_norm": 0.13219846401557672, + "learning_rate": 6.830302825802492e-07, + "loss": 2.7019, + "step": 46036 + }, + { + "epoch": 2.8578434415544107, + "grad_norm": 0.14786400559950622, + "learning_rate": 6.824354999110161e-07, + "loss": 2.6294, + "step": 46037 + }, + { + "epoch": 2.8579055186541686, + "grad_norm": 0.13224419148070943, + "learning_rate": 6.818409745434906e-07, + "loss": 2.6361, + "step": 46038 + }, + { + "epoch": 2.8579675957539266, + "grad_norm": 0.1316717043221765, + "learning_rate": 6.812467064807703e-07, + "loss": 2.7893, + "step": 46039 + }, + { + "epoch": 2.858029672853684, + "grad_norm": 0.14241458057006107, + "learning_rate": 6.80652695725964e-07, + "loss": 2.6537, + "step": 46040 + }, + { + "epoch": 2.8580917499534424, + "grad_norm": 0.13423868489797938, + "learning_rate": 6.800589422821579e-07, + "loss": 2.7464, + "step": 46041 + }, + { + "epoch": 2.8581538270532, + "grad_norm": 0.14915041632106632, + "learning_rate": 6.794654461524608e-07, + "loss": 2.756, + "step": 46042 + }, + { + "epoch": 2.8582159041529582, + "grad_norm": 0.14301532836061429, + "learning_rate": 6.788722073399645e-07, + "loss": 2.6754, + "step": 46043 + }, + { + "epoch": 2.8582779812527157, + "grad_norm": 0.16382954725558616, + "learning_rate": 6.782792258477666e-07, + "loss": 2.7138, + "step": 46044 + }, + { + "epoch": 2.8583400583524736, + "grad_norm": 0.1375753001454327, + "learning_rate": 6.776865016789591e-07, + "loss": 2.7266, + "step": 46045 + }, + { + "epoch": 2.8584021354522315, + "grad_norm": 0.13642748051365383, + "learning_rate": 6.770940348366339e-07, + "loss": 2.7957, + "step": 46046 + }, + { + "epoch": 2.8584642125519895, + "grad_norm": 0.1508937445692665, + "learning_rate": 6.765018253238831e-07, + "loss": 2.7734, + "step": 46047 + }, + { + "epoch": 2.8585262896517474, + "grad_norm": 0.15242217563929938, + "learning_rate": 6.759098731437985e-07, + "loss": 2.6781, + "step": 46048 + }, + { + "epoch": 2.8585883667515053, + "grad_norm": 0.1312956724855176, + "learning_rate": 6.753181782994611e-07, + "loss": 2.7097, + "step": 46049 + }, + { + "epoch": 2.858650443851263, + "grad_norm": 0.12963995780235207, + "learning_rate": 6.747267407939628e-07, + "loss": 2.7655, + "step": 46050 + }, + { + "epoch": 2.858712520951021, + "grad_norm": 0.13149706769618305, + "learning_rate": 6.7413556063039e-07, + "loss": 2.682, + "step": 46051 + }, + { + "epoch": 2.858774598050779, + "grad_norm": 0.12964566170622185, + "learning_rate": 6.735446378118293e-07, + "loss": 2.6257, + "step": 46052 + }, + { + "epoch": 2.858836675150537, + "grad_norm": 0.13353386786144675, + "learning_rate": 6.729539723413613e-07, + "loss": 2.6795, + "step": 46053 + }, + { + "epoch": 2.858898752250295, + "grad_norm": 0.1338973964159023, + "learning_rate": 6.723635642220616e-07, + "loss": 2.6979, + "step": 46054 + }, + { + "epoch": 2.858960829350053, + "grad_norm": 0.12885961989667094, + "learning_rate": 6.717734134570164e-07, + "loss": 2.6652, + "step": 46055 + }, + { + "epoch": 2.8590229064498107, + "grad_norm": 0.14117605799529673, + "learning_rate": 6.711835200493122e-07, + "loss": 2.6422, + "step": 46056 + }, + { + "epoch": 2.8590849835495686, + "grad_norm": 0.13079652415629425, + "learning_rate": 6.705938840020132e-07, + "loss": 2.7051, + "step": 46057 + }, + { + "epoch": 2.8591470606493266, + "grad_norm": 0.14457273532266862, + "learning_rate": 6.700045053182003e-07, + "loss": 2.7651, + "step": 46058 + }, + { + "epoch": 2.8592091377490845, + "grad_norm": 0.13619689117419992, + "learning_rate": 6.694153840009487e-07, + "loss": 2.6998, + "step": 46059 + }, + { + "epoch": 2.8592712148488424, + "grad_norm": 0.12707144285555907, + "learning_rate": 6.688265200533339e-07, + "loss": 2.7031, + "step": 46060 + }, + { + "epoch": 2.8593332919486003, + "grad_norm": 0.13050108583133777, + "learning_rate": 6.682379134784256e-07, + "loss": 2.6485, + "step": 46061 + }, + { + "epoch": 2.8593953690483582, + "grad_norm": 0.1385691087357243, + "learning_rate": 6.67649564279299e-07, + "loss": 2.7223, + "step": 46062 + }, + { + "epoch": 2.8594574461481157, + "grad_norm": 0.13968220549242166, + "learning_rate": 6.670614724590185e-07, + "loss": 2.694, + "step": 46063 + }, + { + "epoch": 2.859519523247874, + "grad_norm": 0.1354966790757434, + "learning_rate": 6.664736380206482e-07, + "loss": 2.7671, + "step": 46064 + }, + { + "epoch": 2.8595816003476315, + "grad_norm": 0.1340535535806105, + "learning_rate": 6.65886060967269e-07, + "loss": 2.7921, + "step": 46065 + }, + { + "epoch": 2.85964367744739, + "grad_norm": 0.1438672691384342, + "learning_rate": 6.652987413019396e-07, + "loss": 2.7072, + "step": 46066 + }, + { + "epoch": 2.8597057545471474, + "grad_norm": 0.1304113204559012, + "learning_rate": 6.647116790277186e-07, + "loss": 2.6644, + "step": 46067 + }, + { + "epoch": 2.8597678316469057, + "grad_norm": 0.1289236822767935, + "learning_rate": 6.641248741476702e-07, + "loss": 2.7231, + "step": 46068 + }, + { + "epoch": 2.859829908746663, + "grad_norm": 0.12838284117209064, + "learning_rate": 6.635383266648643e-07, + "loss": 2.6415, + "step": 46069 + }, + { + "epoch": 2.859891985846421, + "grad_norm": 0.14713177985926457, + "learning_rate": 6.629520365823538e-07, + "loss": 2.7248, + "step": 46070 + }, + { + "epoch": 2.859954062946179, + "grad_norm": 0.14465327524433963, + "learning_rate": 6.623660039032031e-07, + "loss": 2.7389, + "step": 46071 + }, + { + "epoch": 2.860016140045937, + "grad_norm": 0.1355218058882798, + "learning_rate": 6.617802286304598e-07, + "loss": 2.6675, + "step": 46072 + }, + { + "epoch": 2.860078217145695, + "grad_norm": 0.15149399152886228, + "learning_rate": 6.611947107671934e-07, + "loss": 2.7369, + "step": 46073 + }, + { + "epoch": 2.860140294245453, + "grad_norm": 0.13690950165687643, + "learning_rate": 6.606094503164462e-07, + "loss": 2.7214, + "step": 46074 + }, + { + "epoch": 2.8602023713452107, + "grad_norm": 0.13795641229402475, + "learning_rate": 6.600244472812822e-07, + "loss": 2.7209, + "step": 46075 + }, + { + "epoch": 2.8602644484449686, + "grad_norm": 0.1314192084229109, + "learning_rate": 6.594397016647435e-07, + "loss": 2.6866, + "step": 46076 + }, + { + "epoch": 2.8603265255447266, + "grad_norm": 0.13104998527539144, + "learning_rate": 6.588552134698889e-07, + "loss": 2.675, + "step": 46077 + }, + { + "epoch": 2.8603886026444845, + "grad_norm": 0.1329347244415931, + "learning_rate": 6.582709826997602e-07, + "loss": 2.7456, + "step": 46078 + }, + { + "epoch": 2.8604506797442424, + "grad_norm": 0.1281576019188827, + "learning_rate": 6.576870093574161e-07, + "loss": 2.6747, + "step": 46079 + }, + { + "epoch": 2.8605127568440003, + "grad_norm": 0.16365234445111396, + "learning_rate": 6.571032934458932e-07, + "loss": 2.7443, + "step": 46080 + }, + { + "epoch": 2.860574833943758, + "grad_norm": 0.1301812779610375, + "learning_rate": 6.565198349682388e-07, + "loss": 2.6003, + "step": 46081 + }, + { + "epoch": 2.860636911043516, + "grad_norm": 0.13023773973920516, + "learning_rate": 6.559366339275008e-07, + "loss": 2.6011, + "step": 46082 + }, + { + "epoch": 2.860698988143274, + "grad_norm": 0.12875238541365464, + "learning_rate": 6.55353690326721e-07, + "loss": 2.6942, + "step": 46083 + }, + { + "epoch": 2.860761065243032, + "grad_norm": 0.13346692445391217, + "learning_rate": 6.547710041689415e-07, + "loss": 2.8099, + "step": 46084 + }, + { + "epoch": 2.86082314234279, + "grad_norm": 0.13979699162940637, + "learning_rate": 6.541885754571986e-07, + "loss": 2.765, + "step": 46085 + }, + { + "epoch": 2.8608852194425474, + "grad_norm": 0.1332882396680873, + "learning_rate": 6.536064041945345e-07, + "loss": 2.6919, + "step": 46086 + }, + { + "epoch": 2.8609472965423057, + "grad_norm": 0.1311614975678188, + "learning_rate": 6.5302449038398e-07, + "loss": 2.6119, + "step": 46087 + }, + { + "epoch": 2.861009373642063, + "grad_norm": 0.12852001849378059, + "learning_rate": 6.524428340285826e-07, + "loss": 2.6404, + "step": 46088 + }, + { + "epoch": 2.8610714507418216, + "grad_norm": 0.12995906255187642, + "learning_rate": 6.518614351313679e-07, + "loss": 2.609, + "step": 46089 + }, + { + "epoch": 2.861133527841579, + "grad_norm": 0.14549661759700516, + "learning_rate": 6.512802936953721e-07, + "loss": 2.6944, + "step": 46090 + }, + { + "epoch": 2.8611956049413374, + "grad_norm": 0.1379756508653402, + "learning_rate": 6.506994097236207e-07, + "loss": 2.6524, + "step": 46091 + }, + { + "epoch": 2.861257682041095, + "grad_norm": 0.1309606743632245, + "learning_rate": 6.501187832191613e-07, + "loss": 2.6275, + "step": 46092 + }, + { + "epoch": 2.861319759140853, + "grad_norm": 0.1285560213968534, + "learning_rate": 6.495384141850081e-07, + "loss": 2.7634, + "step": 46093 + }, + { + "epoch": 2.8613818362406107, + "grad_norm": 0.1281831429139509, + "learning_rate": 6.489583026241919e-07, + "loss": 2.7299, + "step": 46094 + }, + { + "epoch": 2.8614439133403686, + "grad_norm": 0.14673939440825579, + "learning_rate": 6.483784485397382e-07, + "loss": 2.7791, + "step": 46095 + }, + { + "epoch": 2.8615059904401265, + "grad_norm": 0.13103840097119265, + "learning_rate": 6.477988519346778e-07, + "loss": 2.693, + "step": 46096 + }, + { + "epoch": 2.8615680675398845, + "grad_norm": 0.13888354332557734, + "learning_rate": 6.472195128120307e-07, + "loss": 2.6043, + "step": 46097 + }, + { + "epoch": 2.8616301446396424, + "grad_norm": 0.1281594767937483, + "learning_rate": 6.46640431174822e-07, + "loss": 2.6139, + "step": 46098 + }, + { + "epoch": 2.8616922217394003, + "grad_norm": 0.12987439407945262, + "learning_rate": 6.460616070260717e-07, + "loss": 2.6548, + "step": 46099 + }, + { + "epoch": 2.861754298839158, + "grad_norm": 0.13176572130738706, + "learning_rate": 6.454830403687939e-07, + "loss": 2.5715, + "step": 46100 + }, + { + "epoch": 2.861816375938916, + "grad_norm": 0.13792764393716753, + "learning_rate": 6.449047312060142e-07, + "loss": 2.667, + "step": 46101 + }, + { + "epoch": 2.861878453038674, + "grad_norm": 0.1416873146067695, + "learning_rate": 6.443266795407521e-07, + "loss": 2.7197, + "step": 46102 + }, + { + "epoch": 2.861940530138432, + "grad_norm": 0.12895444489236565, + "learning_rate": 6.437488853760165e-07, + "loss": 2.716, + "step": 46103 + }, + { + "epoch": 2.86200260723819, + "grad_norm": 0.13201243277767374, + "learning_rate": 6.431713487148217e-07, + "loss": 2.7884, + "step": 46104 + }, + { + "epoch": 2.862064684337948, + "grad_norm": 0.1310940033997734, + "learning_rate": 6.425940695601873e-07, + "loss": 2.716, + "step": 46105 + }, + { + "epoch": 2.8621267614377057, + "grad_norm": 0.12694171707543023, + "learning_rate": 6.420170479151222e-07, + "loss": 2.7131, + "step": 46106 + }, + { + "epoch": 2.8621888385374636, + "grad_norm": 0.1365348259974207, + "learning_rate": 6.41440283782635e-07, + "loss": 2.5986, + "step": 46107 + }, + { + "epoch": 2.8622509156372216, + "grad_norm": 0.13465749853426898, + "learning_rate": 6.408637771657345e-07, + "loss": 2.728, + "step": 46108 + }, + { + "epoch": 2.8623129927369795, + "grad_norm": 0.1419914730951317, + "learning_rate": 6.402875280674292e-07, + "loss": 2.6473, + "step": 46109 + }, + { + "epoch": 2.8623750698367374, + "grad_norm": 0.14872662094058153, + "learning_rate": 6.397115364907335e-07, + "loss": 2.7741, + "step": 46110 + }, + { + "epoch": 2.862437146936495, + "grad_norm": 0.1446984458103363, + "learning_rate": 6.391358024386396e-07, + "loss": 2.7129, + "step": 46111 + }, + { + "epoch": 2.8624992240362532, + "grad_norm": 0.12888406071275538, + "learning_rate": 6.38560325914156e-07, + "loss": 2.632, + "step": 46112 + }, + { + "epoch": 2.8625613011360107, + "grad_norm": 0.1471237470562136, + "learning_rate": 6.379851069202858e-07, + "loss": 2.6686, + "step": 46113 + }, + { + "epoch": 2.862623378235769, + "grad_norm": 0.13453752453224394, + "learning_rate": 6.374101454600323e-07, + "loss": 2.6514, + "step": 46114 + }, + { + "epoch": 2.8626854553355265, + "grad_norm": 0.12845529079200696, + "learning_rate": 6.368354415363987e-07, + "loss": 2.7043, + "step": 46115 + }, + { + "epoch": 2.8627475324352845, + "grad_norm": 0.1351295615731072, + "learning_rate": 6.362609951523713e-07, + "loss": 2.6444, + "step": 46116 + }, + { + "epoch": 2.8628096095350424, + "grad_norm": 0.12917981596768166, + "learning_rate": 6.356868063109533e-07, + "loss": 2.6192, + "step": 46117 + }, + { + "epoch": 2.8628716866348003, + "grad_norm": 0.13221632158523786, + "learning_rate": 6.35112875015137e-07, + "loss": 2.6376, + "step": 46118 + }, + { + "epoch": 2.862933763734558, + "grad_norm": 0.13203066820095472, + "learning_rate": 6.345392012679307e-07, + "loss": 2.6793, + "step": 46119 + }, + { + "epoch": 2.862995840834316, + "grad_norm": 0.1421902619607142, + "learning_rate": 6.339657850723102e-07, + "loss": 2.6965, + "step": 46120 + }, + { + "epoch": 2.863057917934074, + "grad_norm": 0.13239985060605974, + "learning_rate": 6.333926264312783e-07, + "loss": 2.6944, + "step": 46121 + }, + { + "epoch": 2.863119995033832, + "grad_norm": 0.13038273734342493, + "learning_rate": 6.328197253478163e-07, + "loss": 2.6208, + "step": 46122 + }, + { + "epoch": 2.86318207213359, + "grad_norm": 0.13801057720465312, + "learning_rate": 6.322470818249215e-07, + "loss": 2.7485, + "step": 46123 + }, + { + "epoch": 2.863244149233348, + "grad_norm": 0.1366960574799815, + "learning_rate": 6.316746958655806e-07, + "loss": 2.6466, + "step": 46124 + }, + { + "epoch": 2.8633062263331057, + "grad_norm": 0.13464948238214172, + "learning_rate": 6.311025674727744e-07, + "loss": 2.6885, + "step": 46125 + }, + { + "epoch": 2.8633683034328636, + "grad_norm": 0.1336343873552011, + "learning_rate": 6.30530696649495e-07, + "loss": 2.7644, + "step": 46126 + }, + { + "epoch": 2.8634303805326216, + "grad_norm": 0.14401508643343583, + "learning_rate": 6.299590833987123e-07, + "loss": 2.6769, + "step": 46127 + }, + { + "epoch": 2.8634924576323795, + "grad_norm": 0.1374675137983907, + "learning_rate": 6.29387727723424e-07, + "loss": 2.8256, + "step": 46128 + }, + { + "epoch": 2.8635545347321374, + "grad_norm": 0.1533974131721663, + "learning_rate": 6.288166296266051e-07, + "loss": 2.7879, + "step": 46129 + }, + { + "epoch": 2.8636166118318953, + "grad_norm": 0.134638823411391, + "learning_rate": 6.28245789111237e-07, + "loss": 2.6857, + "step": 46130 + }, + { + "epoch": 2.8636786889316532, + "grad_norm": 0.13603768329231206, + "learning_rate": 6.276752061802949e-07, + "loss": 2.6999, + "step": 46131 + }, + { + "epoch": 2.863740766031411, + "grad_norm": 0.12812070199689105, + "learning_rate": 6.271048808367541e-07, + "loss": 2.7268, + "step": 46132 + }, + { + "epoch": 2.863802843131169, + "grad_norm": 0.13201802006861799, + "learning_rate": 6.265348130835957e-07, + "loss": 2.7599, + "step": 46133 + }, + { + "epoch": 2.8638649202309265, + "grad_norm": 0.1292367801287949, + "learning_rate": 6.259650029237896e-07, + "loss": 2.871, + "step": 46134 + }, + { + "epoch": 2.863926997330685, + "grad_norm": 0.1370785814467044, + "learning_rate": 6.25395450360311e-07, + "loss": 2.7363, + "step": 46135 + }, + { + "epoch": 2.8639890744304424, + "grad_norm": 0.14083602092814723, + "learning_rate": 6.248261553961298e-07, + "loss": 2.7255, + "step": 46136 + }, + { + "epoch": 2.8640511515302007, + "grad_norm": 0.1357035273105102, + "learning_rate": 6.242571180342161e-07, + "loss": 2.7504, + "step": 46137 + }, + { + "epoch": 2.864113228629958, + "grad_norm": 0.14352468579325514, + "learning_rate": 6.236883382775393e-07, + "loss": 2.6229, + "step": 46138 + }, + { + "epoch": 2.8641753057297166, + "grad_norm": 0.13725308497232824, + "learning_rate": 6.231198161290696e-07, + "loss": 2.6749, + "step": 46139 + }, + { + "epoch": 2.864237382829474, + "grad_norm": 0.13161674928210104, + "learning_rate": 6.225515515917657e-07, + "loss": 2.7675, + "step": 46140 + }, + { + "epoch": 2.864299459929232, + "grad_norm": 0.14003718892311237, + "learning_rate": 6.219835446686029e-07, + "loss": 2.6572, + "step": 46141 + }, + { + "epoch": 2.86436153702899, + "grad_norm": 0.14301285376125328, + "learning_rate": 6.214157953625343e-07, + "loss": 2.772, + "step": 46142 + }, + { + "epoch": 2.864423614128748, + "grad_norm": 0.12876214295363594, + "learning_rate": 6.2084830367653e-07, + "loss": 2.6639, + "step": 46143 + }, + { + "epoch": 2.8644856912285057, + "grad_norm": 0.12689126781878102, + "learning_rate": 6.202810696135431e-07, + "loss": 2.6062, + "step": 46144 + }, + { + "epoch": 2.8645477683282636, + "grad_norm": 0.13026197202863038, + "learning_rate": 6.197140931765377e-07, + "loss": 2.7836, + "step": 46145 + }, + { + "epoch": 2.8646098454280216, + "grad_norm": 0.12907098589840924, + "learning_rate": 6.191473743684783e-07, + "loss": 2.6854, + "step": 46146 + }, + { + "epoch": 2.8646719225277795, + "grad_norm": 0.1383335189849209, + "learning_rate": 6.185809131923071e-07, + "loss": 2.6524, + "step": 46147 + }, + { + "epoch": 2.8647339996275374, + "grad_norm": 0.13533304839234392, + "learning_rate": 6.180147096509936e-07, + "loss": 2.6832, + "step": 46148 + }, + { + "epoch": 2.8647960767272953, + "grad_norm": 0.12961593256084145, + "learning_rate": 6.174487637474802e-07, + "loss": 2.783, + "step": 46149 + }, + { + "epoch": 2.8648581538270532, + "grad_norm": 0.13473961067666376, + "learning_rate": 6.168830754847199e-07, + "loss": 2.6453, + "step": 46150 + }, + { + "epoch": 2.864920230926811, + "grad_norm": 0.13024492623803782, + "learning_rate": 6.163176448656771e-07, + "loss": 2.7409, + "step": 46151 + }, + { + "epoch": 2.864982308026569, + "grad_norm": 0.12849308036474963, + "learning_rate": 6.157524718932883e-07, + "loss": 2.7003, + "step": 46152 + }, + { + "epoch": 2.865044385126327, + "grad_norm": 0.1305692716639309, + "learning_rate": 6.151875565705123e-07, + "loss": 2.7146, + "step": 46153 + }, + { + "epoch": 2.865106462226085, + "grad_norm": 0.1361879740514641, + "learning_rate": 6.146228989002855e-07, + "loss": 2.6577, + "step": 46154 + }, + { + "epoch": 2.865168539325843, + "grad_norm": 0.14350526958187323, + "learning_rate": 6.140584988855669e-07, + "loss": 2.7793, + "step": 46155 + }, + { + "epoch": 2.8652306164256007, + "grad_norm": 0.14221905866366047, + "learning_rate": 6.134943565292872e-07, + "loss": 2.6759, + "step": 46156 + }, + { + "epoch": 2.8652926935253586, + "grad_norm": 0.13392759376691288, + "learning_rate": 6.129304718344054e-07, + "loss": 2.6648, + "step": 46157 + }, + { + "epoch": 2.8653547706251166, + "grad_norm": 0.13075672222730658, + "learning_rate": 6.123668448038411e-07, + "loss": 2.7055, + "step": 46158 + }, + { + "epoch": 2.865416847724874, + "grad_norm": 0.13960575571903, + "learning_rate": 6.118034754405588e-07, + "loss": 2.6797, + "step": 46159 + }, + { + "epoch": 2.8654789248246324, + "grad_norm": 0.1504507254364505, + "learning_rate": 6.11240363747484e-07, + "loss": 2.6856, + "step": 46160 + }, + { + "epoch": 2.86554100192439, + "grad_norm": 0.13635546921029465, + "learning_rate": 6.106775097275585e-07, + "loss": 2.6827, + "step": 46161 + }, + { + "epoch": 2.8656030790241482, + "grad_norm": 0.12856706876662005, + "learning_rate": 6.101149133837191e-07, + "loss": 2.6466, + "step": 46162 + }, + { + "epoch": 2.8656651561239057, + "grad_norm": 0.12745105955314642, + "learning_rate": 6.095525747188968e-07, + "loss": 2.6603, + "step": 46163 + }, + { + "epoch": 2.8657272332236636, + "grad_norm": 0.13512458540599645, + "learning_rate": 6.089904937360336e-07, + "loss": 2.7873, + "step": 46164 + }, + { + "epoch": 2.8657893103234215, + "grad_norm": 0.1359588759705009, + "learning_rate": 6.084286704380493e-07, + "loss": 2.707, + "step": 46165 + }, + { + "epoch": 2.8658513874231795, + "grad_norm": 0.13700052244154656, + "learning_rate": 6.078671048278916e-07, + "loss": 2.7136, + "step": 46166 + }, + { + "epoch": 2.8659134645229374, + "grad_norm": 0.14391211045268057, + "learning_rate": 6.07305796908475e-07, + "loss": 2.7499, + "step": 46167 + }, + { + "epoch": 2.8659755416226953, + "grad_norm": 0.13424818242474756, + "learning_rate": 6.067447466827358e-07, + "loss": 2.6433, + "step": 46168 + }, + { + "epoch": 2.866037618722453, + "grad_norm": 0.13729066438000345, + "learning_rate": 6.061839541536052e-07, + "loss": 2.639, + "step": 46169 + }, + { + "epoch": 2.866099695822211, + "grad_norm": 0.160349572200713, + "learning_rate": 6.056234193239973e-07, + "loss": 2.7793, + "step": 46170 + }, + { + "epoch": 2.866161772921969, + "grad_norm": 0.1434634236468969, + "learning_rate": 6.050631421968433e-07, + "loss": 2.7859, + "step": 46171 + }, + { + "epoch": 2.866223850021727, + "grad_norm": 0.13227945534849278, + "learning_rate": 6.045031227750686e-07, + "loss": 2.5948, + "step": 46172 + }, + { + "epoch": 2.866285927121485, + "grad_norm": 0.1422514438945552, + "learning_rate": 6.03943361061593e-07, + "loss": 2.7536, + "step": 46173 + }, + { + "epoch": 2.866348004221243, + "grad_norm": 0.13578182437630382, + "learning_rate": 6.033838570593309e-07, + "loss": 2.6701, + "step": 46174 + }, + { + "epoch": 2.8664100813210007, + "grad_norm": 0.14290808683082873, + "learning_rate": 6.028246107712132e-07, + "loss": 2.6798, + "step": 46175 + }, + { + "epoch": 2.8664721584207586, + "grad_norm": 0.13063442766582595, + "learning_rate": 6.022656222001489e-07, + "loss": 2.723, + "step": 46176 + }, + { + "epoch": 2.8665342355205166, + "grad_norm": 0.14807205640274818, + "learning_rate": 6.01706891349052e-07, + "loss": 2.755, + "step": 46177 + }, + { + "epoch": 2.8665963126202745, + "grad_norm": 0.16510780302389447, + "learning_rate": 6.011484182208482e-07, + "loss": 2.7136, + "step": 46178 + }, + { + "epoch": 2.8666583897200324, + "grad_norm": 0.14412672549585387, + "learning_rate": 6.005902028184463e-07, + "loss": 2.6813, + "step": 46179 + }, + { + "epoch": 2.8667204668197903, + "grad_norm": 0.13098072716621845, + "learning_rate": 6.000322451447493e-07, + "loss": 2.7479, + "step": 46180 + }, + { + "epoch": 2.8667825439195482, + "grad_norm": 0.14127354612358417, + "learning_rate": 5.994745452026828e-07, + "loss": 2.6653, + "step": 46181 + }, + { + "epoch": 2.8668446210193057, + "grad_norm": 0.13785314644962607, + "learning_rate": 5.989171029951446e-07, + "loss": 2.6407, + "step": 46182 + }, + { + "epoch": 2.866906698119064, + "grad_norm": 0.15465311983270344, + "learning_rate": 5.983599185250544e-07, + "loss": 2.6463, + "step": 46183 + }, + { + "epoch": 2.8669687752188215, + "grad_norm": 0.1396704969789435, + "learning_rate": 5.978029917953099e-07, + "loss": 2.6723, + "step": 46184 + }, + { + "epoch": 2.86703085231858, + "grad_norm": 0.12951883067838296, + "learning_rate": 5.972463228088199e-07, + "loss": 2.7755, + "step": 46185 + }, + { + "epoch": 2.8670929294183374, + "grad_norm": 0.1290616125140954, + "learning_rate": 5.966899115684876e-07, + "loss": 2.6572, + "step": 46186 + }, + { + "epoch": 2.8671550065180957, + "grad_norm": 0.13988489526019962, + "learning_rate": 5.961337580772219e-07, + "loss": 2.8253, + "step": 46187 + }, + { + "epoch": 2.867217083617853, + "grad_norm": 0.13695492780796886, + "learning_rate": 5.955778623379149e-07, + "loss": 2.7107, + "step": 46188 + }, + { + "epoch": 2.867279160717611, + "grad_norm": 0.1347508975288388, + "learning_rate": 5.950222243534753e-07, + "loss": 2.6815, + "step": 46189 + }, + { + "epoch": 2.867341237817369, + "grad_norm": 0.14744896240115074, + "learning_rate": 5.944668441267953e-07, + "loss": 2.7748, + "step": 46190 + }, + { + "epoch": 2.867403314917127, + "grad_norm": 0.13488674940235007, + "learning_rate": 5.939117216607781e-07, + "loss": 2.5979, + "step": 46191 + }, + { + "epoch": 2.867465392016885, + "grad_norm": 0.13117652153250867, + "learning_rate": 5.933568569583159e-07, + "loss": 2.6792, + "step": 46192 + }, + { + "epoch": 2.867527469116643, + "grad_norm": 0.13142019346424752, + "learning_rate": 5.928022500223063e-07, + "loss": 2.709, + "step": 46193 + }, + { + "epoch": 2.8675895462164007, + "grad_norm": 0.12883235518460265, + "learning_rate": 5.922479008556359e-07, + "loss": 2.7051, + "step": 46194 + }, + { + "epoch": 2.8676516233161586, + "grad_norm": 0.13278164926012379, + "learning_rate": 5.916938094612079e-07, + "loss": 2.6799, + "step": 46195 + }, + { + "epoch": 2.8677137004159166, + "grad_norm": 0.1451599520158859, + "learning_rate": 5.911399758419089e-07, + "loss": 2.7188, + "step": 46196 + }, + { + "epoch": 2.8677757775156745, + "grad_norm": 0.1315925399957457, + "learning_rate": 5.9058640000062e-07, + "loss": 2.7103, + "step": 46197 + }, + { + "epoch": 2.8678378546154324, + "grad_norm": 0.12947588828082848, + "learning_rate": 5.900330819402444e-07, + "loss": 2.6999, + "step": 46198 + }, + { + "epoch": 2.8678999317151903, + "grad_norm": 0.1302688140277461, + "learning_rate": 5.89480021663652e-07, + "loss": 2.7395, + "step": 46199 + }, + { + "epoch": 2.8679620088149482, + "grad_norm": 0.14025201606831417, + "learning_rate": 5.889272191737461e-07, + "loss": 2.7348, + "step": 46200 + }, + { + "epoch": 2.868024085914706, + "grad_norm": 0.14200865664464432, + "learning_rate": 5.883746744733964e-07, + "loss": 2.5767, + "step": 46201 + }, + { + "epoch": 2.868086163014464, + "grad_norm": 0.13534573766772148, + "learning_rate": 5.878223875654953e-07, + "loss": 2.7747, + "step": 46202 + }, + { + "epoch": 2.868148240114222, + "grad_norm": 0.14681300493375196, + "learning_rate": 5.872703584529127e-07, + "loss": 2.6235, + "step": 46203 + }, + { + "epoch": 2.86821031721398, + "grad_norm": 0.13184315697011514, + "learning_rate": 5.867185871385405e-07, + "loss": 2.6137, + "step": 46204 + }, + { + "epoch": 2.868272394313738, + "grad_norm": 0.12958059788185047, + "learning_rate": 5.861670736252545e-07, + "loss": 2.7321, + "step": 46205 + }, + { + "epoch": 2.8683344714134957, + "grad_norm": 0.14311777175741583, + "learning_rate": 5.856158179159299e-07, + "loss": 2.7173, + "step": 46206 + }, + { + "epoch": 2.868396548513253, + "grad_norm": 0.1461954828482619, + "learning_rate": 5.850648200134423e-07, + "loss": 2.7184, + "step": 46207 + }, + { + "epoch": 2.8684586256130116, + "grad_norm": 0.12722591289884994, + "learning_rate": 5.845140799206672e-07, + "loss": 2.7025, + "step": 46208 + }, + { + "epoch": 2.868520702712769, + "grad_norm": 0.13215251635538236, + "learning_rate": 5.839635976404801e-07, + "loss": 2.6676, + "step": 46209 + }, + { + "epoch": 2.8685827798125274, + "grad_norm": 0.1316050945633438, + "learning_rate": 5.834133731757508e-07, + "loss": 2.6697, + "step": 46210 + }, + { + "epoch": 2.868644856912285, + "grad_norm": 0.1312181692467139, + "learning_rate": 5.828634065293492e-07, + "loss": 2.6859, + "step": 46211 + }, + { + "epoch": 2.868706934012043, + "grad_norm": 0.13693703528081194, + "learning_rate": 5.823136977041399e-07, + "loss": 2.6903, + "step": 46212 + }, + { + "epoch": 2.8687690111118007, + "grad_norm": 0.13368792086976847, + "learning_rate": 5.817642467030038e-07, + "loss": 2.6979, + "step": 46213 + }, + { + "epoch": 2.8688310882115586, + "grad_norm": 0.14202735287027266, + "learning_rate": 5.812150535287997e-07, + "loss": 2.6793, + "step": 46214 + }, + { + "epoch": 2.8688931653113166, + "grad_norm": 0.1357044332130461, + "learning_rate": 5.80666118184392e-07, + "loss": 2.7608, + "step": 46215 + }, + { + "epoch": 2.8689552424110745, + "grad_norm": 0.1365087531273566, + "learning_rate": 5.801174406726451e-07, + "loss": 2.7432, + "step": 46216 + }, + { + "epoch": 2.8690173195108324, + "grad_norm": 0.1333946274957525, + "learning_rate": 5.795690209964233e-07, + "loss": 2.6839, + "step": 46217 + }, + { + "epoch": 2.8690793966105903, + "grad_norm": 0.1490669394477794, + "learning_rate": 5.790208591585855e-07, + "loss": 2.7893, + "step": 46218 + }, + { + "epoch": 2.8691414737103482, + "grad_norm": 0.1298809409903016, + "learning_rate": 5.784729551619961e-07, + "loss": 2.652, + "step": 46219 + }, + { + "epoch": 2.869203550810106, + "grad_norm": 0.14577172814537484, + "learning_rate": 5.779253090095138e-07, + "loss": 2.6858, + "step": 46220 + }, + { + "epoch": 2.869265627909864, + "grad_norm": 0.1328493726258796, + "learning_rate": 5.773779207039865e-07, + "loss": 2.7314, + "step": 46221 + }, + { + "epoch": 2.869327705009622, + "grad_norm": 0.1447328501788541, + "learning_rate": 5.768307902482784e-07, + "loss": 2.7259, + "step": 46222 + }, + { + "epoch": 2.86938978210938, + "grad_norm": 0.1451599135208595, + "learning_rate": 5.762839176452428e-07, + "loss": 2.6884, + "step": 46223 + }, + { + "epoch": 2.869451859209138, + "grad_norm": 0.14093727302110076, + "learning_rate": 5.757373028977331e-07, + "loss": 2.6621, + "step": 46224 + }, + { + "epoch": 2.8695139363088957, + "grad_norm": 0.12980343967587596, + "learning_rate": 5.751909460086025e-07, + "loss": 2.6721, + "step": 46225 + }, + { + "epoch": 2.8695760134086536, + "grad_norm": 0.1309283345276223, + "learning_rate": 5.746448469806931e-07, + "loss": 2.6989, + "step": 46226 + }, + { + "epoch": 2.8696380905084116, + "grad_norm": 0.14306419292729855, + "learning_rate": 5.740990058168583e-07, + "loss": 2.6053, + "step": 46227 + }, + { + "epoch": 2.8697001676081695, + "grad_norm": 0.13526117050549366, + "learning_rate": 5.735534225199568e-07, + "loss": 2.7643, + "step": 46228 + }, + { + "epoch": 2.8697622447079274, + "grad_norm": 0.1294053918851043, + "learning_rate": 5.730080970928198e-07, + "loss": 2.6772, + "step": 46229 + }, + { + "epoch": 2.869824321807685, + "grad_norm": 0.13713663813704655, + "learning_rate": 5.724630295382949e-07, + "loss": 2.6775, + "step": 46230 + }, + { + "epoch": 2.8698863989074432, + "grad_norm": 0.13912335628676423, + "learning_rate": 5.719182198592354e-07, + "loss": 2.6868, + "step": 46231 + }, + { + "epoch": 2.8699484760072007, + "grad_norm": 0.14896499242658887, + "learning_rate": 5.71373668058478e-07, + "loss": 2.6347, + "step": 46232 + }, + { + "epoch": 2.870010553106959, + "grad_norm": 0.12907543783712078, + "learning_rate": 5.708293741388592e-07, + "loss": 2.7012, + "step": 46233 + }, + { + "epoch": 2.8700726302067165, + "grad_norm": 0.1276230011553696, + "learning_rate": 5.702853381032269e-07, + "loss": 2.6782, + "step": 46234 + }, + { + "epoch": 2.870134707306475, + "grad_norm": 0.13469405277801053, + "learning_rate": 5.697415599544121e-07, + "loss": 2.733, + "step": 46235 + }, + { + "epoch": 2.8701967844062324, + "grad_norm": 0.14254028000942798, + "learning_rate": 5.691980396952568e-07, + "loss": 2.7317, + "step": 46236 + }, + { + "epoch": 2.8702588615059903, + "grad_norm": 0.13653589691152426, + "learning_rate": 5.686547773285922e-07, + "loss": 2.7864, + "step": 46237 + }, + { + "epoch": 2.870320938605748, + "grad_norm": 0.13098473735842478, + "learning_rate": 5.681117728572605e-07, + "loss": 2.6921, + "step": 46238 + }, + { + "epoch": 2.870383015705506, + "grad_norm": 0.15024082972387257, + "learning_rate": 5.675690262840872e-07, + "loss": 2.6923, + "step": 46239 + }, + { + "epoch": 2.870445092805264, + "grad_norm": 0.13832826084676234, + "learning_rate": 5.670265376119033e-07, + "loss": 2.671, + "step": 46240 + }, + { + "epoch": 2.870507169905022, + "grad_norm": 0.12990240795344404, + "learning_rate": 5.66484306843551e-07, + "loss": 2.6294, + "step": 46241 + }, + { + "epoch": 2.87056924700478, + "grad_norm": 0.12978378622395412, + "learning_rate": 5.659423339818448e-07, + "loss": 2.7191, + "step": 46242 + }, + { + "epoch": 2.870631324104538, + "grad_norm": 0.14673523720882078, + "learning_rate": 5.654006190296157e-07, + "loss": 2.6624, + "step": 46243 + }, + { + "epoch": 2.8706934012042957, + "grad_norm": 0.1323727662953464, + "learning_rate": 5.648591619896948e-07, + "loss": 2.7023, + "step": 46244 + }, + { + "epoch": 2.8707554783040536, + "grad_norm": 0.13059097492567584, + "learning_rate": 5.64317962864902e-07, + "loss": 2.777, + "step": 46245 + }, + { + "epoch": 2.8708175554038116, + "grad_norm": 0.1672736411688275, + "learning_rate": 5.637770216580684e-07, + "loss": 2.7291, + "step": 46246 + }, + { + "epoch": 2.8708796325035695, + "grad_norm": 0.1341743282152925, + "learning_rate": 5.632363383720085e-07, + "loss": 2.7253, + "step": 46247 + }, + { + "epoch": 2.8709417096033274, + "grad_norm": 0.14041898126868801, + "learning_rate": 5.626959130095422e-07, + "loss": 2.6714, + "step": 46248 + }, + { + "epoch": 2.8710037867030853, + "grad_norm": 0.1294391914885262, + "learning_rate": 5.62155745573495e-07, + "loss": 2.7354, + "step": 46249 + }, + { + "epoch": 2.8710658638028432, + "grad_norm": 0.13518129045128785, + "learning_rate": 5.616158360666868e-07, + "loss": 2.6738, + "step": 46250 + }, + { + "epoch": 2.871127940902601, + "grad_norm": 0.14393691992135543, + "learning_rate": 5.610761844919266e-07, + "loss": 2.6674, + "step": 46251 + }, + { + "epoch": 2.871190018002359, + "grad_norm": 0.13114115993282027, + "learning_rate": 5.605367908520343e-07, + "loss": 2.6777, + "step": 46252 + }, + { + "epoch": 2.871252095102117, + "grad_norm": 0.14129238576147324, + "learning_rate": 5.599976551498187e-07, + "loss": 2.6392, + "step": 46253 + }, + { + "epoch": 2.871314172201875, + "grad_norm": 0.13154753469629568, + "learning_rate": 5.594587773881055e-07, + "loss": 2.6505, + "step": 46254 + }, + { + "epoch": 2.8713762493016324, + "grad_norm": 0.132474602692151, + "learning_rate": 5.589201575696922e-07, + "loss": 2.6944, + "step": 46255 + }, + { + "epoch": 2.8714383264013907, + "grad_norm": 0.1310195938400902, + "learning_rate": 5.583817956973991e-07, + "loss": 2.7029, + "step": 46256 + }, + { + "epoch": 2.871500403501148, + "grad_norm": 0.1311721551039013, + "learning_rate": 5.578436917740293e-07, + "loss": 2.72, + "step": 46257 + }, + { + "epoch": 2.8715624806009066, + "grad_norm": 0.13014086564859653, + "learning_rate": 5.573058458023916e-07, + "loss": 2.741, + "step": 46258 + }, + { + "epoch": 2.871624557700664, + "grad_norm": 0.12862835175222295, + "learning_rate": 5.567682577852895e-07, + "loss": 2.7406, + "step": 46259 + }, + { + "epoch": 2.871686634800422, + "grad_norm": 0.12739323842121333, + "learning_rate": 5.562309277255317e-07, + "loss": 2.7228, + "step": 46260 + }, + { + "epoch": 2.87174871190018, + "grad_norm": 0.14125337213736108, + "learning_rate": 5.556938556259216e-07, + "loss": 2.679, + "step": 46261 + }, + { + "epoch": 2.871810788999938, + "grad_norm": 0.14985183088412724, + "learning_rate": 5.55157041489257e-07, + "loss": 2.6744, + "step": 46262 + }, + { + "epoch": 2.8718728660996957, + "grad_norm": 0.12860940943330396, + "learning_rate": 5.546204853183412e-07, + "loss": 2.7846, + "step": 46263 + }, + { + "epoch": 2.8719349431994536, + "grad_norm": 0.1371882211952978, + "learning_rate": 5.540841871159774e-07, + "loss": 2.7165, + "step": 46264 + }, + { + "epoch": 2.8719970202992116, + "grad_norm": 0.14207673421993114, + "learning_rate": 5.535481468849579e-07, + "loss": 2.6953, + "step": 46265 + }, + { + "epoch": 2.8720590973989695, + "grad_norm": 0.13154146721768092, + "learning_rate": 5.530123646280749e-07, + "loss": 2.6367, + "step": 46266 + }, + { + "epoch": 2.8721211744987274, + "grad_norm": 0.12965055364552341, + "learning_rate": 5.524768403481373e-07, + "loss": 2.6922, + "step": 46267 + }, + { + "epoch": 2.8721832515984853, + "grad_norm": 0.1313627369104366, + "learning_rate": 5.519415740479317e-07, + "loss": 2.791, + "step": 46268 + }, + { + "epoch": 2.8722453286982432, + "grad_norm": 0.13496943280081986, + "learning_rate": 5.514065657302503e-07, + "loss": 2.7055, + "step": 46269 + }, + { + "epoch": 2.872307405798001, + "grad_norm": 0.14722498274735207, + "learning_rate": 5.508718153978854e-07, + "loss": 2.7426, + "step": 46270 + }, + { + "epoch": 2.872369482897759, + "grad_norm": 0.12910564491177084, + "learning_rate": 5.503373230536236e-07, + "loss": 2.7394, + "step": 46271 + }, + { + "epoch": 2.872431559997517, + "grad_norm": 0.134947446819603, + "learning_rate": 5.49803088700257e-07, + "loss": 2.8193, + "step": 46272 + }, + { + "epoch": 2.872493637097275, + "grad_norm": 0.14058493334810918, + "learning_rate": 5.492691123405724e-07, + "loss": 2.6659, + "step": 46273 + }, + { + "epoch": 2.872555714197033, + "grad_norm": 0.1326120982366368, + "learning_rate": 5.487353939773565e-07, + "loss": 2.7126, + "step": 46274 + }, + { + "epoch": 2.8726177912967907, + "grad_norm": 0.14631204641003798, + "learning_rate": 5.482019336133904e-07, + "loss": 2.6938, + "step": 46275 + }, + { + "epoch": 2.8726798683965487, + "grad_norm": 0.13251669985417688, + "learning_rate": 5.476687312514606e-07, + "loss": 2.7455, + "step": 46276 + }, + { + "epoch": 2.8727419454963066, + "grad_norm": 0.13084926850876438, + "learning_rate": 5.471357868943538e-07, + "loss": 2.6786, + "step": 46277 + }, + { + "epoch": 2.872804022596064, + "grad_norm": 0.139891805863038, + "learning_rate": 5.466031005448402e-07, + "loss": 2.7848, + "step": 46278 + }, + { + "epoch": 2.8728660996958224, + "grad_norm": 0.14710723697123648, + "learning_rate": 5.460706722057008e-07, + "loss": 2.7586, + "step": 46279 + }, + { + "epoch": 2.87292817679558, + "grad_norm": 0.1343425523039481, + "learning_rate": 5.455385018797166e-07, + "loss": 2.663, + "step": 46280 + }, + { + "epoch": 2.8729902538953382, + "grad_norm": 0.1291715968701091, + "learning_rate": 5.450065895696633e-07, + "loss": 2.6004, + "step": 46281 + }, + { + "epoch": 2.8730523309950957, + "grad_norm": 0.14762774878639054, + "learning_rate": 5.444749352783219e-07, + "loss": 2.6993, + "step": 46282 + }, + { + "epoch": 2.873114408094854, + "grad_norm": 0.13333344670628655, + "learning_rate": 5.439435390084568e-07, + "loss": 2.7163, + "step": 46283 + }, + { + "epoch": 2.8731764851946116, + "grad_norm": 0.12958561445520098, + "learning_rate": 5.434124007628438e-07, + "loss": 2.6732, + "step": 46284 + }, + { + "epoch": 2.8732385622943695, + "grad_norm": 0.12768170294436934, + "learning_rate": 5.428815205442528e-07, + "loss": 2.6976, + "step": 46285 + }, + { + "epoch": 2.8733006393941274, + "grad_norm": 0.1382873334462448, + "learning_rate": 5.423508983554537e-07, + "loss": 2.7659, + "step": 46286 + }, + { + "epoch": 2.8733627164938853, + "grad_norm": 0.14923721019538763, + "learning_rate": 5.418205341992221e-07, + "loss": 2.6725, + "step": 46287 + }, + { + "epoch": 2.8734247935936432, + "grad_norm": 0.14329995371254775, + "learning_rate": 5.412904280783171e-07, + "loss": 2.7051, + "step": 46288 + }, + { + "epoch": 2.873486870693401, + "grad_norm": 0.14463500216746, + "learning_rate": 5.407605799955029e-07, + "loss": 2.7099, + "step": 46289 + }, + { + "epoch": 2.873548947793159, + "grad_norm": 0.14259724945984256, + "learning_rate": 5.402309899535496e-07, + "loss": 2.7663, + "step": 46290 + }, + { + "epoch": 2.873611024892917, + "grad_norm": 0.14381066032395143, + "learning_rate": 5.397016579552217e-07, + "loss": 2.7569, + "step": 46291 + }, + { + "epoch": 2.873673101992675, + "grad_norm": 0.13417175302825354, + "learning_rate": 5.391725840032724e-07, + "loss": 2.7813, + "step": 46292 + }, + { + "epoch": 2.873735179092433, + "grad_norm": 0.1309284056598432, + "learning_rate": 5.386437681004663e-07, + "loss": 2.7469, + "step": 46293 + }, + { + "epoch": 2.8737972561921907, + "grad_norm": 0.14193345341400562, + "learning_rate": 5.381152102495679e-07, + "loss": 2.6051, + "step": 46294 + }, + { + "epoch": 2.8738593332919486, + "grad_norm": 0.1321630164619889, + "learning_rate": 5.375869104533248e-07, + "loss": 2.7309, + "step": 46295 + }, + { + "epoch": 2.8739214103917066, + "grad_norm": 0.14144550800705036, + "learning_rate": 5.370588687145017e-07, + "loss": 2.6851, + "step": 46296 + }, + { + "epoch": 2.8739834874914645, + "grad_norm": 0.12867566639885325, + "learning_rate": 5.365310850358518e-07, + "loss": 2.6809, + "step": 46297 + }, + { + "epoch": 2.8740455645912224, + "grad_norm": 0.1279689422834207, + "learning_rate": 5.36003559420123e-07, + "loss": 2.7283, + "step": 46298 + }, + { + "epoch": 2.8741076416909803, + "grad_norm": 0.1346631420617699, + "learning_rate": 5.354762918700684e-07, + "loss": 2.7241, + "step": 46299 + }, + { + "epoch": 2.8741697187907382, + "grad_norm": 0.13801418744287255, + "learning_rate": 5.349492823884472e-07, + "loss": 2.6684, + "step": 46300 + }, + { + "epoch": 2.874231795890496, + "grad_norm": 0.13942771754965488, + "learning_rate": 5.344225309780015e-07, + "loss": 2.7237, + "step": 46301 + }, + { + "epoch": 2.874293872990254, + "grad_norm": 0.1302065578975433, + "learning_rate": 5.338960376414847e-07, + "loss": 2.7631, + "step": 46302 + }, + { + "epoch": 2.8743559500900115, + "grad_norm": 0.15712727201476115, + "learning_rate": 5.333698023816392e-07, + "loss": 2.7935, + "step": 46303 + }, + { + "epoch": 2.87441802718977, + "grad_norm": 0.13039389421628467, + "learning_rate": 5.328438252012069e-07, + "loss": 2.7293, + "step": 46304 + }, + { + "epoch": 2.8744801042895274, + "grad_norm": 0.13569410419294015, + "learning_rate": 5.323181061029414e-07, + "loss": 2.7091, + "step": 46305 + }, + { + "epoch": 2.8745421813892857, + "grad_norm": 0.13577354535498545, + "learning_rate": 5.317926450895849e-07, + "loss": 2.7205, + "step": 46306 + }, + { + "epoch": 2.874604258489043, + "grad_norm": 0.13992833717371508, + "learning_rate": 5.312674421638686e-07, + "loss": 2.6535, + "step": 46307 + }, + { + "epoch": 2.874666335588801, + "grad_norm": 0.13957461159653953, + "learning_rate": 5.307424973285402e-07, + "loss": 2.7584, + "step": 46308 + }, + { + "epoch": 2.874728412688559, + "grad_norm": 0.13251557537519176, + "learning_rate": 5.302178105863365e-07, + "loss": 2.7095, + "step": 46309 + }, + { + "epoch": 2.874790489788317, + "grad_norm": 0.1273301614637586, + "learning_rate": 5.296933819399996e-07, + "loss": 2.6348, + "step": 46310 + }, + { + "epoch": 2.874852566888075, + "grad_norm": 0.1303593776739002, + "learning_rate": 5.291692113922553e-07, + "loss": 2.6671, + "step": 46311 + }, + { + "epoch": 2.874914643987833, + "grad_norm": 0.13396018980523594, + "learning_rate": 5.286452989458512e-07, + "loss": 2.7165, + "step": 46312 + }, + { + "epoch": 2.8749767210875907, + "grad_norm": 0.13088428203345479, + "learning_rate": 5.281216446035075e-07, + "loss": 2.755, + "step": 46313 + }, + { + "epoch": 2.8750387981873486, + "grad_norm": 0.13253955282204366, + "learning_rate": 5.27598248367972e-07, + "loss": 2.7533, + "step": 46314 + }, + { + "epoch": 2.8751008752871066, + "grad_norm": 0.12794251414053345, + "learning_rate": 5.270751102419591e-07, + "loss": 2.6515, + "step": 46315 + }, + { + "epoch": 2.8751629523868645, + "grad_norm": 0.1379332549466483, + "learning_rate": 5.265522302282055e-07, + "loss": 2.5998, + "step": 46316 + }, + { + "epoch": 2.8752250294866224, + "grad_norm": 0.13499391266114372, + "learning_rate": 5.260296083294425e-07, + "loss": 2.6534, + "step": 46317 + }, + { + "epoch": 2.8752871065863803, + "grad_norm": 0.16231804366994887, + "learning_rate": 5.2550724454839e-07, + "loss": 2.7683, + "step": 46318 + }, + { + "epoch": 2.8753491836861382, + "grad_norm": 0.14345922994259594, + "learning_rate": 5.249851388877791e-07, + "loss": 2.686, + "step": 46319 + }, + { + "epoch": 2.875411260785896, + "grad_norm": 0.1385257851370141, + "learning_rate": 5.244632913503355e-07, + "loss": 2.7543, + "step": 46320 + }, + { + "epoch": 2.875473337885654, + "grad_norm": 0.1437453800474888, + "learning_rate": 5.239417019387683e-07, + "loss": 2.7847, + "step": 46321 + }, + { + "epoch": 2.875535414985412, + "grad_norm": 0.13663317863454125, + "learning_rate": 5.234203706558138e-07, + "loss": 2.6829, + "step": 46322 + }, + { + "epoch": 2.87559749208517, + "grad_norm": 0.14637732090726613, + "learning_rate": 5.228992975041868e-07, + "loss": 2.7531, + "step": 46323 + }, + { + "epoch": 2.875659569184928, + "grad_norm": 0.13882456687442007, + "learning_rate": 5.223784824866018e-07, + "loss": 2.6494, + "step": 46324 + }, + { + "epoch": 2.8757216462846857, + "grad_norm": 0.1289073048172767, + "learning_rate": 5.218579256057788e-07, + "loss": 2.6542, + "step": 46325 + }, + { + "epoch": 2.875783723384443, + "grad_norm": 0.13485414251678635, + "learning_rate": 5.213376268644377e-07, + "loss": 2.6, + "step": 46326 + }, + { + "epoch": 2.8758458004842016, + "grad_norm": 0.13803588062410127, + "learning_rate": 5.208175862652875e-07, + "loss": 2.7051, + "step": 46327 + }, + { + "epoch": 2.875907877583959, + "grad_norm": 0.12998591181638375, + "learning_rate": 5.202978038110428e-07, + "loss": 2.7144, + "step": 46328 + }, + { + "epoch": 2.8759699546837174, + "grad_norm": 0.14543368224160527, + "learning_rate": 5.197782795044126e-07, + "loss": 2.6148, + "step": 46329 + }, + { + "epoch": 2.876032031783475, + "grad_norm": 0.141670913813776, + "learning_rate": 5.192590133481168e-07, + "loss": 2.7844, + "step": 46330 + }, + { + "epoch": 2.8760941088832332, + "grad_norm": 0.1349352584456562, + "learning_rate": 5.187400053448587e-07, + "loss": 2.8167, + "step": 46331 + }, + { + "epoch": 2.8761561859829907, + "grad_norm": 0.13106109955930367, + "learning_rate": 5.182212554973476e-07, + "loss": 2.7137, + "step": 46332 + }, + { + "epoch": 2.8762182630827486, + "grad_norm": 0.1381400750853336, + "learning_rate": 5.17702763808281e-07, + "loss": 2.73, + "step": 46333 + }, + { + "epoch": 2.8762803401825066, + "grad_norm": 0.13277970638441777, + "learning_rate": 5.171845302803735e-07, + "loss": 2.5872, + "step": 46334 + }, + { + "epoch": 2.8763424172822645, + "grad_norm": 0.14396105878152768, + "learning_rate": 5.166665549163286e-07, + "loss": 2.7744, + "step": 46335 + }, + { + "epoch": 2.8764044943820224, + "grad_norm": 0.1423622940726352, + "learning_rate": 5.161488377188495e-07, + "loss": 2.7176, + "step": 46336 + }, + { + "epoch": 2.8764665714817803, + "grad_norm": 0.1301187151221114, + "learning_rate": 5.156313786906341e-07, + "loss": 2.7686, + "step": 46337 + }, + { + "epoch": 2.8765286485815382, + "grad_norm": 0.15383652327528058, + "learning_rate": 5.151141778343749e-07, + "loss": 2.7933, + "step": 46338 + }, + { + "epoch": 2.876590725681296, + "grad_norm": 0.12924701270888383, + "learning_rate": 5.145972351527862e-07, + "loss": 2.6982, + "step": 46339 + }, + { + "epoch": 2.876652802781054, + "grad_norm": 0.12931517560817216, + "learning_rate": 5.140805506485546e-07, + "loss": 2.6683, + "step": 46340 + }, + { + "epoch": 2.876714879880812, + "grad_norm": 0.13181439682919274, + "learning_rate": 5.135641243243783e-07, + "loss": 2.7289, + "step": 46341 + }, + { + "epoch": 2.87677695698057, + "grad_norm": 0.12875153909846596, + "learning_rate": 5.130479561829493e-07, + "loss": 2.6173, + "step": 46342 + }, + { + "epoch": 2.876839034080328, + "grad_norm": 0.13928637887045975, + "learning_rate": 5.1253204622696e-07, + "loss": 2.674, + "step": 46343 + }, + { + "epoch": 2.8769011111800857, + "grad_norm": 0.1323118168836212, + "learning_rate": 5.120163944591083e-07, + "loss": 2.7561, + "step": 46344 + }, + { + "epoch": 2.8769631882798437, + "grad_norm": 0.12643265038258364, + "learning_rate": 5.115010008820809e-07, + "loss": 2.6696, + "step": 46345 + }, + { + "epoch": 2.8770252653796016, + "grad_norm": 0.14875842211826654, + "learning_rate": 5.109858654985644e-07, + "loss": 2.6215, + "step": 46346 + }, + { + "epoch": 2.8770873424793595, + "grad_norm": 0.131031869246359, + "learning_rate": 5.104709883112513e-07, + "loss": 2.6726, + "step": 46347 + }, + { + "epoch": 2.8771494195791174, + "grad_norm": 0.13039813672022418, + "learning_rate": 5.099563693228171e-07, + "loss": 2.6934, + "step": 46348 + }, + { + "epoch": 2.8772114966788753, + "grad_norm": 0.1305163142595846, + "learning_rate": 5.094420085359653e-07, + "loss": 2.6222, + "step": 46349 + }, + { + "epoch": 2.8772735737786332, + "grad_norm": 0.12805836154139508, + "learning_rate": 5.089279059533658e-07, + "loss": 2.712, + "step": 46350 + }, + { + "epoch": 2.8773356508783907, + "grad_norm": 0.14248295442584377, + "learning_rate": 5.084140615777e-07, + "loss": 2.7148, + "step": 46351 + }, + { + "epoch": 2.877397727978149, + "grad_norm": 0.13219021420138477, + "learning_rate": 5.079004754116545e-07, + "loss": 2.6903, + "step": 46352 + }, + { + "epoch": 2.8774598050779066, + "grad_norm": 0.1323190596584765, + "learning_rate": 5.07387147457905e-07, + "loss": 2.7225, + "step": 46353 + }, + { + "epoch": 2.877521882177665, + "grad_norm": 0.12980116522272234, + "learning_rate": 5.068740777191383e-07, + "loss": 2.7859, + "step": 46354 + }, + { + "epoch": 2.8775839592774224, + "grad_norm": 0.13113657926911274, + "learning_rate": 5.063612661980189e-07, + "loss": 2.7089, + "step": 46355 + }, + { + "epoch": 2.8776460363771803, + "grad_norm": 0.13221972375987676, + "learning_rate": 5.058487128972278e-07, + "loss": 2.6929, + "step": 46356 + }, + { + "epoch": 2.8777081134769382, + "grad_norm": 0.12830293238673432, + "learning_rate": 5.053364178194409e-07, + "loss": 2.7195, + "step": 46357 + }, + { + "epoch": 2.877770190576696, + "grad_norm": 0.1379640202883423, + "learning_rate": 5.048243809673281e-07, + "loss": 2.695, + "step": 46358 + }, + { + "epoch": 2.877832267676454, + "grad_norm": 0.14792977888347744, + "learning_rate": 5.043126023435651e-07, + "loss": 2.6592, + "step": 46359 + }, + { + "epoch": 2.877894344776212, + "grad_norm": 0.1494583488105778, + "learning_rate": 5.038010819508166e-07, + "loss": 2.7307, + "step": 46360 + }, + { + "epoch": 2.87795642187597, + "grad_norm": 0.12750105357202293, + "learning_rate": 5.032898197917468e-07, + "loss": 2.6558, + "step": 46361 + }, + { + "epoch": 2.878018498975728, + "grad_norm": 0.13257407076003014, + "learning_rate": 5.027788158690372e-07, + "loss": 2.7298, + "step": 46362 + }, + { + "epoch": 2.8780805760754857, + "grad_norm": 0.15060854270864185, + "learning_rate": 5.022680701853466e-07, + "loss": 2.6975, + "step": 46363 + }, + { + "epoch": 2.8781426531752436, + "grad_norm": 0.1329387107183314, + "learning_rate": 5.017575827433396e-07, + "loss": 2.7132, + "step": 46364 + }, + { + "epoch": 2.8782047302750016, + "grad_norm": 0.13063166149620853, + "learning_rate": 5.012473535456752e-07, + "loss": 2.7109, + "step": 46365 + }, + { + "epoch": 2.8782668073747595, + "grad_norm": 0.148030878576707, + "learning_rate": 5.007373825950235e-07, + "loss": 2.7409, + "step": 46366 + }, + { + "epoch": 2.8783288844745174, + "grad_norm": 0.13898271323803274, + "learning_rate": 5.002276698940378e-07, + "loss": 2.7217, + "step": 46367 + }, + { + "epoch": 2.8783909615742753, + "grad_norm": 0.14068041477851106, + "learning_rate": 4.997182154453828e-07, + "loss": 2.7056, + "step": 46368 + }, + { + "epoch": 2.8784530386740332, + "grad_norm": 0.12984387093640512, + "learning_rate": 4.992090192517174e-07, + "loss": 2.6461, + "step": 46369 + }, + { + "epoch": 2.878515115773791, + "grad_norm": 0.13665549995021126, + "learning_rate": 4.987000813156895e-07, + "loss": 2.6267, + "step": 46370 + }, + { + "epoch": 2.878577192873549, + "grad_norm": 0.12838106386596385, + "learning_rate": 4.981914016399636e-07, + "loss": 2.7098, + "step": 46371 + }, + { + "epoch": 2.878639269973307, + "grad_norm": 0.1292599824208521, + "learning_rate": 4.976829802271876e-07, + "loss": 2.7388, + "step": 46372 + }, + { + "epoch": 2.878701347073065, + "grad_norm": 0.13335020246063056, + "learning_rate": 4.971748170800206e-07, + "loss": 2.6177, + "step": 46373 + }, + { + "epoch": 2.8787634241728224, + "grad_norm": 0.13094852741346746, + "learning_rate": 4.966669122011048e-07, + "loss": 2.7787, + "step": 46374 + }, + { + "epoch": 2.8788255012725807, + "grad_norm": 0.13530103778085156, + "learning_rate": 4.961592655930991e-07, + "loss": 2.7224, + "step": 46375 + }, + { + "epoch": 2.878887578372338, + "grad_norm": 0.1394375228702282, + "learning_rate": 4.956518772586516e-07, + "loss": 2.6678, + "step": 46376 + }, + { + "epoch": 2.8789496554720966, + "grad_norm": 0.1328658739999519, + "learning_rate": 4.951447472003989e-07, + "loss": 2.7177, + "step": 46377 + }, + { + "epoch": 2.879011732571854, + "grad_norm": 0.1398870923096776, + "learning_rate": 4.946378754209946e-07, + "loss": 2.7954, + "step": 46378 + }, + { + "epoch": 2.8790738096716124, + "grad_norm": 0.13216849169001252, + "learning_rate": 4.941312619230809e-07, + "loss": 2.6897, + "step": 46379 + }, + { + "epoch": 2.87913588677137, + "grad_norm": 0.12956914096747713, + "learning_rate": 4.936249067093058e-07, + "loss": 2.7794, + "step": 46380 + }, + { + "epoch": 2.879197963871128, + "grad_norm": 0.13388962690337736, + "learning_rate": 4.931188097823059e-07, + "loss": 2.6833, + "step": 46381 + }, + { + "epoch": 2.8792600409708857, + "grad_norm": 0.13875470575523444, + "learning_rate": 4.926129711447291e-07, + "loss": 2.7443, + "step": 46382 + }, + { + "epoch": 2.8793221180706436, + "grad_norm": 0.12981891494414693, + "learning_rate": 4.921073907992014e-07, + "loss": 2.6941, + "step": 46383 + }, + { + "epoch": 2.8793841951704016, + "grad_norm": 0.13211558317805525, + "learning_rate": 4.916020687483702e-07, + "loss": 2.7149, + "step": 46384 + }, + { + "epoch": 2.8794462722701595, + "grad_norm": 0.134312573059687, + "learning_rate": 4.910970049948727e-07, + "loss": 2.7361, + "step": 46385 + }, + { + "epoch": 2.8795083493699174, + "grad_norm": 0.1405048055382726, + "learning_rate": 4.905921995413343e-07, + "loss": 2.6604, + "step": 46386 + }, + { + "epoch": 2.8795704264696753, + "grad_norm": 0.14316781781984264, + "learning_rate": 4.90087652390403e-07, + "loss": 2.6892, + "step": 46387 + }, + { + "epoch": 2.8796325035694332, + "grad_norm": 0.12798737542937766, + "learning_rate": 4.895833635446989e-07, + "loss": 2.6343, + "step": 46388 + }, + { + "epoch": 2.879694580669191, + "grad_norm": 0.1455341737665241, + "learning_rate": 4.890793330068588e-07, + "loss": 2.75, + "step": 46389 + }, + { + "epoch": 2.879756657768949, + "grad_norm": 0.1376503471923181, + "learning_rate": 4.885755607795084e-07, + "loss": 2.7326, + "step": 46390 + }, + { + "epoch": 2.879818734868707, + "grad_norm": 0.13137751810616885, + "learning_rate": 4.880720468652788e-07, + "loss": 2.661, + "step": 46391 + }, + { + "epoch": 2.879880811968465, + "grad_norm": 0.14521199437150506, + "learning_rate": 4.875687912668015e-07, + "loss": 2.6404, + "step": 46392 + }, + { + "epoch": 2.879942889068223, + "grad_norm": 0.12779814385370605, + "learning_rate": 4.870657939866908e-07, + "loss": 2.6413, + "step": 46393 + }, + { + "epoch": 2.8800049661679807, + "grad_norm": 0.13260358619887527, + "learning_rate": 4.865630550275835e-07, + "loss": 2.7674, + "step": 46394 + }, + { + "epoch": 2.8800670432677387, + "grad_norm": 0.14510914252714296, + "learning_rate": 4.860605743920943e-07, + "loss": 2.6518, + "step": 46395 + }, + { + "epoch": 2.8801291203674966, + "grad_norm": 0.1466573456918984, + "learning_rate": 4.855583520828488e-07, + "loss": 2.7578, + "step": 46396 + }, + { + "epoch": 2.8801911974672545, + "grad_norm": 0.12749716755581855, + "learning_rate": 4.850563881024617e-07, + "loss": 2.6678, + "step": 46397 + }, + { + "epoch": 2.8802532745670124, + "grad_norm": 0.127388391393055, + "learning_rate": 4.845546824535585e-07, + "loss": 2.668, + "step": 46398 + }, + { + "epoch": 2.88031535166677, + "grad_norm": 0.13224329004455176, + "learning_rate": 4.840532351387539e-07, + "loss": 2.7407, + "step": 46399 + }, + { + "epoch": 2.8803774287665282, + "grad_norm": 0.12715344397185555, + "learning_rate": 4.835520461606624e-07, + "loss": 2.7347, + "step": 46400 + }, + { + "epoch": 2.8804395058662857, + "grad_norm": 0.12802284430679925, + "learning_rate": 4.830511155218986e-07, + "loss": 2.7084, + "step": 46401 + }, + { + "epoch": 2.880501582966044, + "grad_norm": 0.13073280960747016, + "learning_rate": 4.825504432250772e-07, + "loss": 2.7247, + "step": 46402 + }, + { + "epoch": 2.8805636600658016, + "grad_norm": 0.13442038113532076, + "learning_rate": 4.820500292728181e-07, + "loss": 2.6662, + "step": 46403 + }, + { + "epoch": 2.8806257371655595, + "grad_norm": 0.13516698722966952, + "learning_rate": 4.815498736677193e-07, + "loss": 2.6882, + "step": 46404 + }, + { + "epoch": 2.8806878142653174, + "grad_norm": 0.13895726722763949, + "learning_rate": 4.81049976412401e-07, + "loss": 2.6811, + "step": 46405 + }, + { + "epoch": 2.8807498913650753, + "grad_norm": 0.13078618493790306, + "learning_rate": 4.805503375094555e-07, + "loss": 2.7127, + "step": 46406 + }, + { + "epoch": 2.8808119684648332, + "grad_norm": 0.14749751995111104, + "learning_rate": 4.800509569615086e-07, + "loss": 2.7337, + "step": 46407 + }, + { + "epoch": 2.880874045564591, + "grad_norm": 0.13550871386081156, + "learning_rate": 4.795518347711581e-07, + "loss": 2.663, + "step": 46408 + }, + { + "epoch": 2.880936122664349, + "grad_norm": 0.12894174058341493, + "learning_rate": 4.790529709410019e-07, + "loss": 2.6603, + "step": 46409 + }, + { + "epoch": 2.880998199764107, + "grad_norm": 0.1375362139934766, + "learning_rate": 4.785543654736491e-07, + "loss": 2.7188, + "step": 46410 + }, + { + "epoch": 2.881060276863865, + "grad_norm": 0.12799174863652402, + "learning_rate": 4.780560183717031e-07, + "loss": 2.5578, + "step": 46411 + }, + { + "epoch": 2.881122353963623, + "grad_norm": 0.1277607100284203, + "learning_rate": 4.775579296377564e-07, + "loss": 2.5665, + "step": 46412 + }, + { + "epoch": 2.8811844310633807, + "grad_norm": 0.12908771053916507, + "learning_rate": 4.770600992744179e-07, + "loss": 2.7072, + "step": 46413 + }, + { + "epoch": 2.8812465081631387, + "grad_norm": 0.13171701365646654, + "learning_rate": 4.765625272842744e-07, + "loss": 2.7098, + "step": 46414 + }, + { + "epoch": 2.8813085852628966, + "grad_norm": 0.1437035197805988, + "learning_rate": 4.7606521366992397e-07, + "loss": 2.662, + "step": 46415 + }, + { + "epoch": 2.8813706623626545, + "grad_norm": 0.14779958302591645, + "learning_rate": 4.7556815843397e-07, + "loss": 2.7181, + "step": 46416 + }, + { + "epoch": 2.8814327394624124, + "grad_norm": 0.13864038674050225, + "learning_rate": 4.750713615789992e-07, + "loss": 2.6682, + "step": 46417 + }, + { + "epoch": 2.8814948165621703, + "grad_norm": 0.13484589632476598, + "learning_rate": 4.745748231075986e-07, + "loss": 2.6539, + "step": 46418 + }, + { + "epoch": 2.8815568936619282, + "grad_norm": 0.13094852741346746, + "learning_rate": 4.7407854302237155e-07, + "loss": 2.6885, + "step": 46419 + }, + { + "epoch": 2.881618970761686, + "grad_norm": 0.12998008670650427, + "learning_rate": 4.735825213258882e-07, + "loss": 2.6892, + "step": 46420 + }, + { + "epoch": 2.881681047861444, + "grad_norm": 0.14871790401412904, + "learning_rate": 4.7308675802075766e-07, + "loss": 2.7242, + "step": 46421 + }, + { + "epoch": 2.8817431249612016, + "grad_norm": 0.12779424500852027, + "learning_rate": 4.7259125310955e-07, + "loss": 2.7126, + "step": 46422 + }, + { + "epoch": 2.88180520206096, + "grad_norm": 0.140136559250372, + "learning_rate": 4.720960065948632e-07, + "loss": 2.642, + "step": 46423 + }, + { + "epoch": 2.8818672791607174, + "grad_norm": 0.13177517090947186, + "learning_rate": 4.716010184792674e-07, + "loss": 2.7273, + "step": 46424 + }, + { + "epoch": 2.8819293562604757, + "grad_norm": 0.1334647822128575, + "learning_rate": 4.7110628876536057e-07, + "loss": 2.6641, + "step": 46425 + }, + { + "epoch": 2.8819914333602332, + "grad_norm": 0.12753494162569054, + "learning_rate": 4.706118174557128e-07, + "loss": 2.6608, + "step": 46426 + }, + { + "epoch": 2.8820535104599916, + "grad_norm": 0.1293217868374057, + "learning_rate": 4.7011760455290544e-07, + "loss": 2.7424, + "step": 46427 + }, + { + "epoch": 2.882115587559749, + "grad_norm": 0.1367623464012033, + "learning_rate": 4.696236500595197e-07, + "loss": 2.7399, + "step": 46428 + }, + { + "epoch": 2.882177664659507, + "grad_norm": 0.12999523288211703, + "learning_rate": 4.691299539781313e-07, + "loss": 2.7157, + "step": 46429 + }, + { + "epoch": 2.882239741759265, + "grad_norm": 0.12913425827947927, + "learning_rate": 4.68636516311316e-07, + "loss": 2.7038, + "step": 46430 + }, + { + "epoch": 2.882301818859023, + "grad_norm": 0.13150376044950415, + "learning_rate": 4.681433370616439e-07, + "loss": 2.6935, + "step": 46431 + }, + { + "epoch": 2.8823638959587807, + "grad_norm": 0.1307820689609823, + "learning_rate": 4.676504162317019e-07, + "loss": 2.5709, + "step": 46432 + }, + { + "epoch": 2.8824259730585386, + "grad_norm": 0.13282418199661017, + "learning_rate": 4.671577538240435e-07, + "loss": 2.6849, + "step": 46433 + }, + { + "epoch": 2.8824880501582966, + "grad_norm": 0.13879878321715158, + "learning_rate": 4.666653498412499e-07, + "loss": 2.598, + "step": 46434 + }, + { + "epoch": 2.8825501272580545, + "grad_norm": 0.13159574598285664, + "learning_rate": 4.661732042858857e-07, + "loss": 2.697, + "step": 46435 + }, + { + "epoch": 2.8826122043578124, + "grad_norm": 0.14279607548997536, + "learning_rate": 4.6568131716052676e-07, + "loss": 2.7418, + "step": 46436 + }, + { + "epoch": 2.8826742814575703, + "grad_norm": 0.14796824694853108, + "learning_rate": 4.651896884677265e-07, + "loss": 2.7734, + "step": 46437 + }, + { + "epoch": 2.8827363585573282, + "grad_norm": 0.13220132420671254, + "learning_rate": 4.646983182100606e-07, + "loss": 2.754, + "step": 46438 + }, + { + "epoch": 2.882798435657086, + "grad_norm": 0.13340123202969956, + "learning_rate": 4.642072063900882e-07, + "loss": 2.6501, + "step": 46439 + }, + { + "epoch": 2.882860512756844, + "grad_norm": 0.1289271207733215, + "learning_rate": 4.6371635301037386e-07, + "loss": 2.613, + "step": 46440 + }, + { + "epoch": 2.882922589856602, + "grad_norm": 0.1458878578425315, + "learning_rate": 4.632257580734711e-07, + "loss": 2.6889, + "step": 46441 + }, + { + "epoch": 2.88298466695636, + "grad_norm": 0.13140869140625, + "learning_rate": 4.6273542158195016e-07, + "loss": 2.7229, + "step": 46442 + }, + { + "epoch": 2.883046744056118, + "grad_norm": 0.1477800982670987, + "learning_rate": 4.622453435383589e-07, + "loss": 2.6147, + "step": 46443 + }, + { + "epoch": 2.8831088211558757, + "grad_norm": 0.13415305193935262, + "learning_rate": 4.61755523945262e-07, + "loss": 2.6436, + "step": 46444 + }, + { + "epoch": 2.8831708982556337, + "grad_norm": 0.12941202729176685, + "learning_rate": 4.6126596280521294e-07, + "loss": 2.6201, + "step": 46445 + }, + { + "epoch": 2.8832329753553916, + "grad_norm": 0.1410625062823347, + "learning_rate": 4.607766601207653e-07, + "loss": 2.6957, + "step": 46446 + }, + { + "epoch": 2.883295052455149, + "grad_norm": 0.14080731838553656, + "learning_rate": 4.6028761589447244e-07, + "loss": 2.6517, + "step": 46447 + }, + { + "epoch": 2.8833571295549074, + "grad_norm": 0.13838386176799147, + "learning_rate": 4.597988301288825e-07, + "loss": 2.6294, + "step": 46448 + }, + { + "epoch": 2.883419206654665, + "grad_norm": 0.1457776952659715, + "learning_rate": 4.5931030282655443e-07, + "loss": 2.6836, + "step": 46449 + }, + { + "epoch": 2.8834812837544233, + "grad_norm": 0.13073410614403652, + "learning_rate": 4.588220339900251e-07, + "loss": 2.704, + "step": 46450 + }, + { + "epoch": 2.8835433608541807, + "grad_norm": 0.13243789997868538, + "learning_rate": 4.5833402362184806e-07, + "loss": 2.6898, + "step": 46451 + }, + { + "epoch": 2.8836054379539386, + "grad_norm": 0.13155242670688838, + "learning_rate": 4.5784627172457127e-07, + "loss": 2.7433, + "step": 46452 + }, + { + "epoch": 2.8836675150536966, + "grad_norm": 0.14270609508160662, + "learning_rate": 4.573587783007427e-07, + "loss": 2.7345, + "step": 46453 + }, + { + "epoch": 2.8837295921534545, + "grad_norm": 0.13464582342759374, + "learning_rate": 4.568715433528936e-07, + "loss": 2.7373, + "step": 46454 + }, + { + "epoch": 2.8837916692532124, + "grad_norm": 0.13650170536644016, + "learning_rate": 4.56384566883572e-07, + "loss": 2.6926, + "step": 46455 + }, + { + "epoch": 2.8838537463529703, + "grad_norm": 0.1392306634405906, + "learning_rate": 4.5589784889532026e-07, + "loss": 2.6592, + "step": 46456 + }, + { + "epoch": 2.8839158234527282, + "grad_norm": 0.14219967376030077, + "learning_rate": 4.554113893906753e-07, + "loss": 2.6989, + "step": 46457 + }, + { + "epoch": 2.883977900552486, + "grad_norm": 0.13938597042589615, + "learning_rate": 4.549251883721795e-07, + "loss": 2.6593, + "step": 46458 + }, + { + "epoch": 2.884039977652244, + "grad_norm": 0.13591979777512486, + "learning_rate": 4.544392458423696e-07, + "loss": 2.6415, + "step": 46459 + }, + { + "epoch": 2.884102054752002, + "grad_norm": 0.13364844345919608, + "learning_rate": 4.539535618037716e-07, + "loss": 2.6993, + "step": 46460 + }, + { + "epoch": 2.88416413185176, + "grad_norm": 0.13789639096186465, + "learning_rate": 4.534681362589277e-07, + "loss": 2.747, + "step": 46461 + }, + { + "epoch": 2.884226208951518, + "grad_norm": 0.15310165582525445, + "learning_rate": 4.529829692103693e-07, + "loss": 2.7132, + "step": 46462 + }, + { + "epoch": 2.8842882860512757, + "grad_norm": 0.1434871358945434, + "learning_rate": 4.5249806066062774e-07, + "loss": 2.7102, + "step": 46463 + }, + { + "epoch": 2.8843503631510337, + "grad_norm": 0.13358769960812406, + "learning_rate": 4.5201341061222315e-07, + "loss": 2.711, + "step": 46464 + }, + { + "epoch": 2.8844124402507916, + "grad_norm": 0.1372501332741839, + "learning_rate": 4.515290190677035e-07, + "loss": 2.7064, + "step": 46465 + }, + { + "epoch": 2.8844745173505495, + "grad_norm": 0.128901878918607, + "learning_rate": 4.5104488602957795e-07, + "loss": 2.6657, + "step": 46466 + }, + { + "epoch": 2.8845365944503074, + "grad_norm": 0.1302857566167506, + "learning_rate": 4.5056101150038333e-07, + "loss": 2.7749, + "step": 46467 + }, + { + "epoch": 2.8845986715500653, + "grad_norm": 0.1460089279198332, + "learning_rate": 4.500773954826454e-07, + "loss": 2.6747, + "step": 46468 + }, + { + "epoch": 2.8846607486498232, + "grad_norm": 0.15106335571924218, + "learning_rate": 4.495940379788732e-07, + "loss": 2.6731, + "step": 46469 + }, + { + "epoch": 2.8847228257495807, + "grad_norm": 0.13219024942798818, + "learning_rate": 4.4911093899160373e-07, + "loss": 2.6744, + "step": 46470 + }, + { + "epoch": 2.884784902849339, + "grad_norm": 0.12976201972799054, + "learning_rate": 4.48628098523346e-07, + "loss": 2.7003, + "step": 46471 + }, + { + "epoch": 2.8848469799490966, + "grad_norm": 0.1299479400148033, + "learning_rate": 4.4814551657663127e-07, + "loss": 2.6567, + "step": 46472 + }, + { + "epoch": 2.884909057048855, + "grad_norm": 0.14256682416469638, + "learning_rate": 4.4766319315396324e-07, + "loss": 2.7159, + "step": 46473 + }, + { + "epoch": 2.8849711341486124, + "grad_norm": 0.13108536436566187, + "learning_rate": 4.471811282578675e-07, + "loss": 2.7252, + "step": 46474 + }, + { + "epoch": 2.8850332112483703, + "grad_norm": 0.13139723086844995, + "learning_rate": 4.466993218908644e-07, + "loss": 2.7127, + "step": 46475 + }, + { + "epoch": 2.8850952883481282, + "grad_norm": 0.1347474002954495, + "learning_rate": 4.4621777405545187e-07, + "loss": 2.7002, + "step": 46476 + }, + { + "epoch": 2.885157365447886, + "grad_norm": 0.13517305732697665, + "learning_rate": 4.4573648475415565e-07, + "loss": 2.8043, + "step": 46477 + }, + { + "epoch": 2.885219442547644, + "grad_norm": 0.12784599158028986, + "learning_rate": 4.452554539894793e-07, + "loss": 2.5859, + "step": 46478 + }, + { + "epoch": 2.885281519647402, + "grad_norm": 0.12664780949964055, + "learning_rate": 4.44774681763932e-07, + "loss": 2.6009, + "step": 46479 + }, + { + "epoch": 2.88534359674716, + "grad_norm": 0.1339320302982093, + "learning_rate": 4.442941680800283e-07, + "loss": 2.7826, + "step": 46480 + }, + { + "epoch": 2.885405673846918, + "grad_norm": 0.1308385490812397, + "learning_rate": 4.438139129402774e-07, + "loss": 2.7391, + "step": 46481 + }, + { + "epoch": 2.8854677509466757, + "grad_norm": 0.12867577496508895, + "learning_rate": 4.433339163471717e-07, + "loss": 2.7305, + "step": 46482 + }, + { + "epoch": 2.8855298280464337, + "grad_norm": 0.12872675478680623, + "learning_rate": 4.428541783032258e-07, + "loss": 2.7166, + "step": 46483 + }, + { + "epoch": 2.8855919051461916, + "grad_norm": 0.13527542246090063, + "learning_rate": 4.423746988109434e-07, + "loss": 2.7777, + "step": 46484 + }, + { + "epoch": 2.8856539822459495, + "grad_norm": 0.12621405489916696, + "learning_rate": 4.418954778728168e-07, + "loss": 2.5982, + "step": 46485 + }, + { + "epoch": 2.8857160593457074, + "grad_norm": 0.13169348048981452, + "learning_rate": 4.4141651549136074e-07, + "loss": 2.7076, + "step": 46486 + }, + { + "epoch": 2.8857781364454653, + "grad_norm": 0.14276132832328506, + "learning_rate": 4.409378116690566e-07, + "loss": 2.685, + "step": 46487 + }, + { + "epoch": 2.8858402135452232, + "grad_norm": 0.1278755277182764, + "learning_rate": 4.4045936640841334e-07, + "loss": 2.6597, + "step": 46488 + }, + { + "epoch": 2.885902290644981, + "grad_norm": 0.13898081014220223, + "learning_rate": 4.399811797119291e-07, + "loss": 2.6299, + "step": 46489 + }, + { + "epoch": 2.885964367744739, + "grad_norm": 0.14160966521133103, + "learning_rate": 4.395032515820907e-07, + "loss": 2.6155, + "step": 46490 + }, + { + "epoch": 2.886026444844497, + "grad_norm": 0.13556084079966962, + "learning_rate": 4.390255820213962e-07, + "loss": 2.7143, + "step": 46491 + }, + { + "epoch": 2.886088521944255, + "grad_norm": 0.1306075120586085, + "learning_rate": 4.3854817103233247e-07, + "loss": 2.7187, + "step": 46492 + }, + { + "epoch": 2.8861505990440124, + "grad_norm": 0.13571754067145897, + "learning_rate": 4.3807101861739754e-07, + "loss": 2.6919, + "step": 46493 + }, + { + "epoch": 2.8862126761437708, + "grad_norm": 0.1427893445798556, + "learning_rate": 4.375941247790782e-07, + "loss": 2.6558, + "step": 46494 + }, + { + "epoch": 2.8862747532435282, + "grad_norm": 0.1310424378265752, + "learning_rate": 4.3711748951986153e-07, + "loss": 2.735, + "step": 46495 + }, + { + "epoch": 2.8863368303432866, + "grad_norm": 0.12928338225341918, + "learning_rate": 4.3664111284222877e-07, + "loss": 2.6571, + "step": 46496 + }, + { + "epoch": 2.886398907443044, + "grad_norm": 0.13140115038561975, + "learning_rate": 4.361649947486779e-07, + "loss": 2.6765, + "step": 46497 + }, + { + "epoch": 2.8864609845428024, + "grad_norm": 0.14355203405510011, + "learning_rate": 4.3568913524168475e-07, + "loss": 2.7259, + "step": 46498 + }, + { + "epoch": 2.88652306164256, + "grad_norm": 0.13759370531483864, + "learning_rate": 4.3521353432373626e-07, + "loss": 2.6741, + "step": 46499 + }, + { + "epoch": 2.886585138742318, + "grad_norm": 0.13461083354370085, + "learning_rate": 4.347381919973081e-07, + "loss": 2.7053, + "step": 46500 + }, + { + "epoch": 2.8866472158420757, + "grad_norm": 0.12911899668234525, + "learning_rate": 4.3426310826488184e-07, + "loss": 2.6684, + "step": 46501 + }, + { + "epoch": 2.8867092929418336, + "grad_norm": 0.13756322274198024, + "learning_rate": 4.3378828312894413e-07, + "loss": 2.6824, + "step": 46502 + }, + { + "epoch": 2.8867713700415916, + "grad_norm": 0.1390851605093713, + "learning_rate": 4.3331371659195987e-07, + "loss": 2.669, + "step": 46503 + }, + { + "epoch": 2.8868334471413495, + "grad_norm": 0.13848258921390635, + "learning_rate": 4.3283940865641027e-07, + "loss": 2.6794, + "step": 46504 + }, + { + "epoch": 2.8868955242411074, + "grad_norm": 0.1514035150558479, + "learning_rate": 4.3236535932477117e-07, + "loss": 2.7572, + "step": 46505 + }, + { + "epoch": 2.8869576013408653, + "grad_norm": 0.13704856192303322, + "learning_rate": 4.318915685995184e-07, + "loss": 2.6831, + "step": 46506 + }, + { + "epoch": 2.8870196784406232, + "grad_norm": 0.13407331050732663, + "learning_rate": 4.3141803648311663e-07, + "loss": 2.7212, + "step": 46507 + }, + { + "epoch": 2.887081755540381, + "grad_norm": 0.1358369935296824, + "learning_rate": 4.3094476297804166e-07, + "loss": 2.6608, + "step": 46508 + }, + { + "epoch": 2.887143832640139, + "grad_norm": 0.1344096485644864, + "learning_rate": 4.3047174808675815e-07, + "loss": 2.6861, + "step": 46509 + }, + { + "epoch": 2.887205909739897, + "grad_norm": 0.12820457453027403, + "learning_rate": 4.299989918117364e-07, + "loss": 2.6914, + "step": 46510 + }, + { + "epoch": 2.887267986839655, + "grad_norm": 0.151523841273219, + "learning_rate": 4.2952649415544664e-07, + "loss": 2.8414, + "step": 46511 + }, + { + "epoch": 2.887330063939413, + "grad_norm": 0.1314185918824902, + "learning_rate": 4.2905425512035356e-07, + "loss": 2.6666, + "step": 46512 + }, + { + "epoch": 2.8873921410391707, + "grad_norm": 0.13695927307464917, + "learning_rate": 4.285822747089163e-07, + "loss": 2.7214, + "step": 46513 + }, + { + "epoch": 2.8874542181389287, + "grad_norm": 0.1448794050785347, + "learning_rate": 4.28110552923594e-07, + "loss": 2.7857, + "step": 46514 + }, + { + "epoch": 2.8875162952386866, + "grad_norm": 0.1293524763753884, + "learning_rate": 4.2763908976685695e-07, + "loss": 2.6951, + "step": 46515 + }, + { + "epoch": 2.8875783723384445, + "grad_norm": 0.13172876452322713, + "learning_rate": 4.2716788524116424e-07, + "loss": 2.6827, + "step": 46516 + }, + { + "epoch": 2.8876404494382024, + "grad_norm": 0.139619957747466, + "learning_rate": 4.2669693934896946e-07, + "loss": 2.7389, + "step": 46517 + }, + { + "epoch": 2.88770252653796, + "grad_norm": 0.13528868162759425, + "learning_rate": 4.2622625209272627e-07, + "loss": 2.6877, + "step": 46518 + }, + { + "epoch": 2.8877646036377183, + "grad_norm": 0.1313517119621725, + "learning_rate": 4.257558234748993e-07, + "loss": 2.7699, + "step": 46519 + }, + { + "epoch": 2.8878266807374757, + "grad_norm": 0.13463472145412653, + "learning_rate": 4.252856534979366e-07, + "loss": 2.7255, + "step": 46520 + }, + { + "epoch": 2.887888757837234, + "grad_norm": 0.14290110702402853, + "learning_rate": 4.248157421642973e-07, + "loss": 2.7288, + "step": 46521 + }, + { + "epoch": 2.8879508349369916, + "grad_norm": 0.1276929791185674, + "learning_rate": 4.2434608947642396e-07, + "loss": 2.5718, + "step": 46522 + }, + { + "epoch": 2.8880129120367495, + "grad_norm": 0.13125898069356046, + "learning_rate": 4.238766954367701e-07, + "loss": 2.7527, + "step": 46523 + }, + { + "epoch": 2.8880749891365074, + "grad_norm": 0.13521748958006377, + "learning_rate": 4.2340756004779494e-07, + "loss": 2.6861, + "step": 46524 + }, + { + "epoch": 2.8881370662362653, + "grad_norm": 0.14121583162429505, + "learning_rate": 4.2293868331193533e-07, + "loss": 2.7325, + "step": 46525 + }, + { + "epoch": 2.8881991433360232, + "grad_norm": 0.1313160003747525, + "learning_rate": 4.224700652316338e-07, + "loss": 2.7032, + "step": 46526 + }, + { + "epoch": 2.888261220435781, + "grad_norm": 0.1309265064162854, + "learning_rate": 4.220017058093495e-07, + "loss": 2.6786, + "step": 46527 + }, + { + "epoch": 2.888323297535539, + "grad_norm": 0.13786934615471963, + "learning_rate": 4.2153360504750827e-07, + "loss": 2.693, + "step": 46528 + }, + { + "epoch": 2.888385374635297, + "grad_norm": 0.1297035557792926, + "learning_rate": 4.2106576294856925e-07, + "loss": 2.6878, + "step": 46529 + }, + { + "epoch": 2.888447451735055, + "grad_norm": 0.14172220024022275, + "learning_rate": 4.2059817951496385e-07, + "loss": 2.6626, + "step": 46530 + }, + { + "epoch": 2.888509528834813, + "grad_norm": 0.14551987054006166, + "learning_rate": 4.201308547491345e-07, + "loss": 2.6995, + "step": 46531 + }, + { + "epoch": 2.8885716059345707, + "grad_norm": 0.12882957923813268, + "learning_rate": 4.196637886535182e-07, + "loss": 2.6546, + "step": 46532 + }, + { + "epoch": 2.8886336830343287, + "grad_norm": 0.13999944868788136, + "learning_rate": 4.1919698123055184e-07, + "loss": 2.7169, + "step": 46533 + }, + { + "epoch": 2.8886957601340866, + "grad_norm": 0.13822357488986972, + "learning_rate": 4.1873043248266687e-07, + "loss": 2.7318, + "step": 46534 + }, + { + "epoch": 2.8887578372338445, + "grad_norm": 0.12891530958764996, + "learning_rate": 4.182641424123057e-07, + "loss": 2.7362, + "step": 46535 + }, + { + "epoch": 2.8888199143336024, + "grad_norm": 0.1367151191774266, + "learning_rate": 4.1779811102189424e-07, + "loss": 2.7667, + "step": 46536 + }, + { + "epoch": 2.8888819914333603, + "grad_norm": 0.13820813099327184, + "learning_rate": 4.1733233831386944e-07, + "loss": 2.7255, + "step": 46537 + }, + { + "epoch": 2.8889440685331182, + "grad_norm": 0.13216551099554974, + "learning_rate": 4.168668242906626e-07, + "loss": 2.7097, + "step": 46538 + }, + { + "epoch": 2.889006145632876, + "grad_norm": 0.12975218664836813, + "learning_rate": 4.164015689546885e-07, + "loss": 2.8322, + "step": 46539 + }, + { + "epoch": 2.889068222732634, + "grad_norm": 0.12808372600352932, + "learning_rate": 4.1593657230838967e-07, + "loss": 2.7164, + "step": 46540 + }, + { + "epoch": 2.8891302998323916, + "grad_norm": 0.12906786872836665, + "learning_rate": 4.1547183435418633e-07, + "loss": 2.6882, + "step": 46541 + }, + { + "epoch": 2.88919237693215, + "grad_norm": 0.12715931800427954, + "learning_rate": 4.150073550945044e-07, + "loss": 2.6925, + "step": 46542 + }, + { + "epoch": 2.8892544540319074, + "grad_norm": 0.12820958683203668, + "learning_rate": 4.145431345317641e-07, + "loss": 2.6838, + "step": 46543 + }, + { + "epoch": 2.8893165311316658, + "grad_norm": 0.13068581928849002, + "learning_rate": 4.1407917266839124e-07, + "loss": 2.6348, + "step": 46544 + }, + { + "epoch": 2.8893786082314232, + "grad_norm": 0.12773552202882135, + "learning_rate": 4.136154695068006e-07, + "loss": 2.6363, + "step": 46545 + }, + { + "epoch": 2.8894406853311816, + "grad_norm": 0.14200988302459583, + "learning_rate": 4.131520250494181e-07, + "loss": 2.7353, + "step": 46546 + }, + { + "epoch": 2.889502762430939, + "grad_norm": 0.13098327265706405, + "learning_rate": 4.126888392986583e-07, + "loss": 2.6725, + "step": 46547 + }, + { + "epoch": 2.889564839530697, + "grad_norm": 0.14369979971808344, + "learning_rate": 4.1222591225694165e-07, + "loss": 2.6694, + "step": 46548 + }, + { + "epoch": 2.889626916630455, + "grad_norm": 0.1309014366646437, + "learning_rate": 4.117632439266827e-07, + "loss": 2.672, + "step": 46549 + }, + { + "epoch": 2.889688993730213, + "grad_norm": 0.13003966533306877, + "learning_rate": 4.113008343102853e-07, + "loss": 2.7416, + "step": 46550 + }, + { + "epoch": 2.8897510708299707, + "grad_norm": 0.13229081833810216, + "learning_rate": 4.1083868341017515e-07, + "loss": 2.7648, + "step": 46551 + }, + { + "epoch": 2.8898131479297287, + "grad_norm": 0.12993105372511735, + "learning_rate": 4.103767912287559e-07, + "loss": 2.6976, + "step": 46552 + }, + { + "epoch": 2.8898752250294866, + "grad_norm": 0.12803900030568577, + "learning_rate": 4.0991515776843683e-07, + "loss": 2.6574, + "step": 46553 + }, + { + "epoch": 2.8899373021292445, + "grad_norm": 0.12915814958712696, + "learning_rate": 4.094537830316325e-07, + "loss": 2.701, + "step": 46554 + }, + { + "epoch": 2.8899993792290024, + "grad_norm": 0.1380855090877896, + "learning_rate": 4.089926670207467e-07, + "loss": 2.7652, + "step": 46555 + }, + { + "epoch": 2.8900614563287603, + "grad_norm": 0.1411059486569606, + "learning_rate": 4.0853180973818293e-07, + "loss": 2.7145, + "step": 46556 + }, + { + "epoch": 2.8901235334285182, + "grad_norm": 0.13611475497742664, + "learning_rate": 4.0807121118634494e-07, + "loss": 2.7187, + "step": 46557 + }, + { + "epoch": 2.890185610528276, + "grad_norm": 0.14339969987763038, + "learning_rate": 4.0761087136764186e-07, + "loss": 2.724, + "step": 46558 + }, + { + "epoch": 2.890247687628034, + "grad_norm": 0.15167098169445895, + "learning_rate": 4.071507902844718e-07, + "loss": 2.7238, + "step": 46559 + }, + { + "epoch": 2.890309764727792, + "grad_norm": 0.1384365879571291, + "learning_rate": 4.0669096793923276e-07, + "loss": 2.7501, + "step": 46560 + }, + { + "epoch": 2.89037184182755, + "grad_norm": 0.1305763471951847, + "learning_rate": 4.0623140433432847e-07, + "loss": 2.6752, + "step": 46561 + }, + { + "epoch": 2.890433918927308, + "grad_norm": 0.13622119147319633, + "learning_rate": 4.0577209947215147e-07, + "loss": 2.6268, + "step": 46562 + }, + { + "epoch": 2.8904959960270658, + "grad_norm": 0.1319046194815791, + "learning_rate": 4.0531305335510525e-07, + "loss": 2.676, + "step": 46563 + }, + { + "epoch": 2.8905580731268237, + "grad_norm": 0.13460931835516537, + "learning_rate": 4.04854265985577e-07, + "loss": 2.648, + "step": 46564 + }, + { + "epoch": 2.8906201502265816, + "grad_norm": 0.13122440071457853, + "learning_rate": 4.043957373659646e-07, + "loss": 2.6778, + "step": 46565 + }, + { + "epoch": 2.890682227326339, + "grad_norm": 0.13138031110453927, + "learning_rate": 4.0393746749866066e-07, + "loss": 2.7393, + "step": 46566 + }, + { + "epoch": 2.8907443044260974, + "grad_norm": 0.1309446370221362, + "learning_rate": 4.034794563860522e-07, + "loss": 2.7564, + "step": 46567 + }, + { + "epoch": 2.890806381525855, + "grad_norm": 0.12771549931153897, + "learning_rate": 4.0302170403053173e-07, + "loss": 2.6638, + "step": 46568 + }, + { + "epoch": 2.8908684586256133, + "grad_norm": 0.13132820552616953, + "learning_rate": 4.0256421043448624e-07, + "loss": 2.744, + "step": 46569 + }, + { + "epoch": 2.8909305357253707, + "grad_norm": 0.14681719799165738, + "learning_rate": 4.021069756003082e-07, + "loss": 2.7649, + "step": 46570 + }, + { + "epoch": 2.8909926128251286, + "grad_norm": 0.1286669952796761, + "learning_rate": 4.0164999953037905e-07, + "loss": 2.7028, + "step": 46571 + }, + { + "epoch": 2.8910546899248866, + "grad_norm": 0.1324612094895482, + "learning_rate": 4.0119328222707476e-07, + "loss": 2.7006, + "step": 46572 + }, + { + "epoch": 2.8911167670246445, + "grad_norm": 0.1310676298274515, + "learning_rate": 4.0073682369278776e-07, + "loss": 2.7717, + "step": 46573 + }, + { + "epoch": 2.8911788441244024, + "grad_norm": 0.12947899562791662, + "learning_rate": 4.002806239298995e-07, + "loss": 2.7502, + "step": 46574 + }, + { + "epoch": 2.8912409212241603, + "grad_norm": 0.1294568109879655, + "learning_rate": 3.998246829407859e-07, + "loss": 2.6987, + "step": 46575 + }, + { + "epoch": 2.8913029983239182, + "grad_norm": 0.14063035769322968, + "learning_rate": 3.993690007278339e-07, + "loss": 2.695, + "step": 46576 + }, + { + "epoch": 2.891365075423676, + "grad_norm": 0.14305502680286766, + "learning_rate": 3.9891357729340826e-07, + "loss": 2.6792, + "step": 46577 + }, + { + "epoch": 2.891427152523434, + "grad_norm": 0.13314921806897656, + "learning_rate": 3.984584126398905e-07, + "loss": 2.6723, + "step": 46578 + }, + { + "epoch": 2.891489229623192, + "grad_norm": 0.1283456867277161, + "learning_rate": 3.980035067696619e-07, + "loss": 2.7034, + "step": 46579 + }, + { + "epoch": 2.89155130672295, + "grad_norm": 0.1361975407883932, + "learning_rate": 3.975488596850874e-07, + "loss": 2.6562, + "step": 46580 + }, + { + "epoch": 2.891613383822708, + "grad_norm": 0.1303814944572602, + "learning_rate": 3.9709447138854273e-07, + "loss": 2.6833, + "step": 46581 + }, + { + "epoch": 2.8916754609224657, + "grad_norm": 0.133826702976633, + "learning_rate": 3.9664034188239827e-07, + "loss": 2.7591, + "step": 46582 + }, + { + "epoch": 2.8917375380222237, + "grad_norm": 0.13018733729518514, + "learning_rate": 3.9618647116902443e-07, + "loss": 2.7211, + "step": 46583 + }, + { + "epoch": 2.8917996151219816, + "grad_norm": 0.13437436846651224, + "learning_rate": 3.9573285925078587e-07, + "loss": 2.7725, + "step": 46584 + }, + { + "epoch": 2.8918616922217395, + "grad_norm": 0.1323426434404215, + "learning_rate": 3.952795061300474e-07, + "loss": 2.6315, + "step": 46585 + }, + { + "epoch": 2.8919237693214974, + "grad_norm": 0.13172124182673436, + "learning_rate": 3.9482641180917936e-07, + "loss": 2.6222, + "step": 46586 + }, + { + "epoch": 2.8919858464212553, + "grad_norm": 0.13004505808270414, + "learning_rate": 3.943735762905465e-07, + "loss": 2.6998, + "step": 46587 + }, + { + "epoch": 2.8920479235210133, + "grad_norm": 0.13109282409150025, + "learning_rate": 3.939209995765081e-07, + "loss": 2.7146, + "step": 46588 + }, + { + "epoch": 2.8921100006207707, + "grad_norm": 0.13438650375848873, + "learning_rate": 3.934686816694233e-07, + "loss": 2.7172, + "step": 46589 + }, + { + "epoch": 2.892172077720529, + "grad_norm": 0.12793238105559074, + "learning_rate": 3.930166225716625e-07, + "loss": 2.6582, + "step": 46590 + }, + { + "epoch": 2.8922341548202866, + "grad_norm": 0.13967864503465596, + "learning_rate": 3.9256482228556825e-07, + "loss": 2.7633, + "step": 46591 + }, + { + "epoch": 2.892296231920045, + "grad_norm": 0.13437857539942735, + "learning_rate": 3.921132808135053e-07, + "loss": 2.7519, + "step": 46592 + }, + { + "epoch": 2.8923583090198024, + "grad_norm": 0.13495177391067575, + "learning_rate": 3.9166199815783844e-07, + "loss": 2.852, + "step": 46593 + }, + { + "epoch": 2.8924203861195608, + "grad_norm": 0.13141580678454504, + "learning_rate": 3.912109743209047e-07, + "loss": 2.7176, + "step": 46594 + }, + { + "epoch": 2.8924824632193182, + "grad_norm": 0.12621185596383352, + "learning_rate": 3.9076020930506887e-07, + "loss": 2.682, + "step": 46595 + }, + { + "epoch": 2.892544540319076, + "grad_norm": 0.12833597735007657, + "learning_rate": 3.9030970311268454e-07, + "loss": 2.7357, + "step": 46596 + }, + { + "epoch": 2.892606617418834, + "grad_norm": 0.1467043686989866, + "learning_rate": 3.8985945574609437e-07, + "loss": 2.6572, + "step": 46597 + }, + { + "epoch": 2.892668694518592, + "grad_norm": 0.14148462663077313, + "learning_rate": 3.8940946720765205e-07, + "loss": 2.7091, + "step": 46598 + }, + { + "epoch": 2.89273077161835, + "grad_norm": 0.1391947519231452, + "learning_rate": 3.8895973749970006e-07, + "loss": 2.6601, + "step": 46599 + }, + { + "epoch": 2.892792848718108, + "grad_norm": 0.12993248011372688, + "learning_rate": 3.8851026662459213e-07, + "loss": 2.6793, + "step": 46600 + }, + { + "epoch": 2.8928549258178657, + "grad_norm": 0.13844277032804597, + "learning_rate": 3.880610545846708e-07, + "loss": 2.6411, + "step": 46601 + }, + { + "epoch": 2.8929170029176237, + "grad_norm": 0.14334331573219292, + "learning_rate": 3.8761210138227863e-07, + "loss": 2.7654, + "step": 46602 + }, + { + "epoch": 2.8929790800173816, + "grad_norm": 0.12928473654755906, + "learning_rate": 3.8716340701975827e-07, + "loss": 2.7331, + "step": 46603 + }, + { + "epoch": 2.8930411571171395, + "grad_norm": 0.13116293187693345, + "learning_rate": 3.867149714994467e-07, + "loss": 2.6778, + "step": 46604 + }, + { + "epoch": 2.8931032342168974, + "grad_norm": 0.13155313465283205, + "learning_rate": 3.8626679482368644e-07, + "loss": 2.6525, + "step": 46605 + }, + { + "epoch": 2.8931653113166553, + "grad_norm": 0.1309343237252655, + "learning_rate": 3.8581887699482013e-07, + "loss": 2.5913, + "step": 46606 + }, + { + "epoch": 2.8932273884164132, + "grad_norm": 0.13407949263245605, + "learning_rate": 3.853712180151792e-07, + "loss": 2.6951, + "step": 46607 + }, + { + "epoch": 2.893289465516171, + "grad_norm": 0.13153639071073916, + "learning_rate": 3.8492381788710063e-07, + "loss": 2.6397, + "step": 46608 + }, + { + "epoch": 2.893351542615929, + "grad_norm": 0.14505404565763358, + "learning_rate": 3.84476676612916e-07, + "loss": 2.7819, + "step": 46609 + }, + { + "epoch": 2.893413619715687, + "grad_norm": 0.13848089445410847, + "learning_rate": 3.8402979419496774e-07, + "loss": 2.7576, + "step": 46610 + }, + { + "epoch": 2.893475696815445, + "grad_norm": 0.14682914850118375, + "learning_rate": 3.8358317063557637e-07, + "loss": 2.7447, + "step": 46611 + }, + { + "epoch": 2.893537773915203, + "grad_norm": 0.1329144619392138, + "learning_rate": 3.831368059370788e-07, + "loss": 2.7737, + "step": 46612 + }, + { + "epoch": 2.8935998510149608, + "grad_norm": 0.13163508196969798, + "learning_rate": 3.8269070010180096e-07, + "loss": 2.7324, + "step": 46613 + }, + { + "epoch": 2.8936619281147182, + "grad_norm": 0.13224369146609963, + "learning_rate": 3.8224485313206884e-07, + "loss": 2.6737, + "step": 46614 + }, + { + "epoch": 2.8937240052144766, + "grad_norm": 0.131319886856347, + "learning_rate": 3.817992650302138e-07, + "loss": 2.6078, + "step": 46615 + }, + { + "epoch": 2.893786082314234, + "grad_norm": 0.1308408837989599, + "learning_rate": 3.813539357985563e-07, + "loss": 2.7238, + "step": 46616 + }, + { + "epoch": 2.8938481594139924, + "grad_norm": 0.13461172604403449, + "learning_rate": 3.8090886543942216e-07, + "loss": 2.7048, + "step": 46617 + }, + { + "epoch": 2.89391023651375, + "grad_norm": 0.13256868956017165, + "learning_rate": 3.804640539551263e-07, + "loss": 2.7227, + "step": 46618 + }, + { + "epoch": 2.893972313613508, + "grad_norm": 0.1406632543560226, + "learning_rate": 3.800195013480001e-07, + "loss": 2.73, + "step": 46619 + }, + { + "epoch": 2.8940343907132657, + "grad_norm": 0.1305292434702548, + "learning_rate": 3.7957520762035845e-07, + "loss": 2.6909, + "step": 46620 + }, + { + "epoch": 2.8940964678130237, + "grad_norm": 0.13189880849963112, + "learning_rate": 3.791311727745162e-07, + "loss": 2.6586, + "step": 46621 + }, + { + "epoch": 2.8941585449127816, + "grad_norm": 0.13267597726902605, + "learning_rate": 3.78687396812788e-07, + "loss": 2.7064, + "step": 46622 + }, + { + "epoch": 2.8942206220125395, + "grad_norm": 0.128445618702352, + "learning_rate": 3.7824387973749985e-07, + "loss": 2.7105, + "step": 46623 + }, + { + "epoch": 2.8942826991122974, + "grad_norm": 0.13523114698051475, + "learning_rate": 3.778006215509611e-07, + "loss": 2.6274, + "step": 46624 + }, + { + "epoch": 2.8943447762120553, + "grad_norm": 0.1371577978892059, + "learning_rate": 3.773576222554809e-07, + "loss": 2.7468, + "step": 46625 + }, + { + "epoch": 2.8944068533118132, + "grad_norm": 0.13199443473156033, + "learning_rate": 3.769148818533685e-07, + "loss": 2.6733, + "step": 46626 + }, + { + "epoch": 2.894468930411571, + "grad_norm": 0.1312338183282545, + "learning_rate": 3.764724003469389e-07, + "loss": 2.6172, + "step": 46627 + }, + { + "epoch": 2.894531007511329, + "grad_norm": 0.14117194147747017, + "learning_rate": 3.7603017773850115e-07, + "loss": 2.7708, + "step": 46628 + }, + { + "epoch": 2.894593084611087, + "grad_norm": 0.14272148293337775, + "learning_rate": 3.7558821403035906e-07, + "loss": 2.6591, + "step": 46629 + }, + { + "epoch": 2.894655161710845, + "grad_norm": 0.13046238592467585, + "learning_rate": 3.751465092248219e-07, + "loss": 2.5981, + "step": 46630 + }, + { + "epoch": 2.894717238810603, + "grad_norm": 0.1396557732636054, + "learning_rate": 3.7470506332418776e-07, + "loss": 2.6318, + "step": 46631 + }, + { + "epoch": 2.8947793159103608, + "grad_norm": 0.1285444948994542, + "learning_rate": 3.7426387633077155e-07, + "loss": 2.7125, + "step": 46632 + }, + { + "epoch": 2.8948413930101187, + "grad_norm": 0.14695474128909142, + "learning_rate": 3.738229482468658e-07, + "loss": 2.7447, + "step": 46633 + }, + { + "epoch": 2.8949034701098766, + "grad_norm": 0.1335587572814899, + "learning_rate": 3.7338227907476874e-07, + "loss": 2.7298, + "step": 46634 + }, + { + "epoch": 2.8949655472096345, + "grad_norm": 0.14125649072283866, + "learning_rate": 3.729418688167841e-07, + "loss": 2.6953, + "step": 46635 + }, + { + "epoch": 2.8950276243093924, + "grad_norm": 0.13319005314739624, + "learning_rate": 3.7250171747521546e-07, + "loss": 2.6229, + "step": 46636 + }, + { + "epoch": 2.89508970140915, + "grad_norm": 0.13646801709134884, + "learning_rate": 3.720618250523444e-07, + "loss": 2.699, + "step": 46637 + }, + { + "epoch": 2.8951517785089083, + "grad_norm": 0.13012869942872848, + "learning_rate": 3.716221915504803e-07, + "loss": 2.7342, + "step": 46638 + }, + { + "epoch": 2.8952138556086657, + "grad_norm": 0.1331205931793771, + "learning_rate": 3.7118281697191e-07, + "loss": 2.735, + "step": 46639 + }, + { + "epoch": 2.895275932708424, + "grad_norm": 0.13499641697376147, + "learning_rate": 3.707437013189263e-07, + "loss": 2.684, + "step": 46640 + }, + { + "epoch": 2.8953380098081816, + "grad_norm": 0.1364908839913485, + "learning_rate": 3.7030484459381623e-07, + "loss": 2.6863, + "step": 46641 + }, + { + "epoch": 2.89540008690794, + "grad_norm": 0.13322213755977666, + "learning_rate": 3.698662467988778e-07, + "loss": 2.6087, + "step": 46642 + }, + { + "epoch": 2.8954621640076974, + "grad_norm": 0.14488180922805996, + "learning_rate": 3.6942790793639823e-07, + "loss": 2.6783, + "step": 46643 + }, + { + "epoch": 2.8955242411074553, + "grad_norm": 0.1362954535042296, + "learning_rate": 3.68989828008659e-07, + "loss": 2.6715, + "step": 46644 + }, + { + "epoch": 2.8955863182072132, + "grad_norm": 0.13526782848990937, + "learning_rate": 3.6855200701794156e-07, + "loss": 2.7259, + "step": 46645 + }, + { + "epoch": 2.895648395306971, + "grad_norm": 0.1374077465781843, + "learning_rate": 3.681144449665441e-07, + "loss": 2.709, + "step": 46646 + }, + { + "epoch": 2.895710472406729, + "grad_norm": 0.13118253489626625, + "learning_rate": 3.676771418567426e-07, + "loss": 2.6822, + "step": 46647 + }, + { + "epoch": 2.895772549506487, + "grad_norm": 0.12951990926906962, + "learning_rate": 3.6724009769081856e-07, + "loss": 2.7345, + "step": 46648 + }, + { + "epoch": 2.895834626606245, + "grad_norm": 0.14347100576082725, + "learning_rate": 3.668033124710479e-07, + "loss": 2.7323, + "step": 46649 + }, + { + "epoch": 2.895896703706003, + "grad_norm": 0.1384622843581674, + "learning_rate": 3.663667861997122e-07, + "loss": 2.7041, + "step": 46650 + }, + { + "epoch": 2.8959587808057607, + "grad_norm": 0.12808294798261088, + "learning_rate": 3.659305188790929e-07, + "loss": 2.6401, + "step": 46651 + }, + { + "epoch": 2.8960208579055187, + "grad_norm": 0.13973323541607585, + "learning_rate": 3.65494510511466e-07, + "loss": 2.7069, + "step": 46652 + }, + { + "epoch": 2.8960829350052766, + "grad_norm": 0.13192885621188977, + "learning_rate": 3.6505876109910187e-07, + "loss": 2.6863, + "step": 46653 + }, + { + "epoch": 2.8961450121050345, + "grad_norm": 0.13010232345210998, + "learning_rate": 3.6462327064427094e-07, + "loss": 2.6591, + "step": 46654 + }, + { + "epoch": 2.8962070892047924, + "grad_norm": 0.1338682566795697, + "learning_rate": 3.641880391492547e-07, + "loss": 2.7747, + "step": 46655 + }, + { + "epoch": 2.8962691663045503, + "grad_norm": 0.14272302945776763, + "learning_rate": 3.63753066616318e-07, + "loss": 2.7316, + "step": 46656 + }, + { + "epoch": 2.8963312434043083, + "grad_norm": 0.1294125454436924, + "learning_rate": 3.633183530477313e-07, + "loss": 2.6621, + "step": 46657 + }, + { + "epoch": 2.896393320504066, + "grad_norm": 0.1445767872186862, + "learning_rate": 3.6288389844575944e-07, + "loss": 2.6386, + "step": 46658 + }, + { + "epoch": 2.896455397603824, + "grad_norm": 0.1397615454852958, + "learning_rate": 3.624497028126728e-07, + "loss": 2.7342, + "step": 46659 + }, + { + "epoch": 2.896517474703582, + "grad_norm": 0.12852422865154026, + "learning_rate": 3.620157661507362e-07, + "loss": 2.7583, + "step": 46660 + }, + { + "epoch": 2.89657955180334, + "grad_norm": 0.1285616429967509, + "learning_rate": 3.6158208846221454e-07, + "loss": 2.6892, + "step": 46661 + }, + { + "epoch": 2.8966416289030974, + "grad_norm": 0.1456128771590297, + "learning_rate": 3.6114866974937266e-07, + "loss": 2.7512, + "step": 46662 + }, + { + "epoch": 2.8967037060028558, + "grad_norm": 0.15032649220950872, + "learning_rate": 3.6071551001446434e-07, + "loss": 2.6971, + "step": 46663 + }, + { + "epoch": 2.8967657831026132, + "grad_norm": 0.12950051483720268, + "learning_rate": 3.6028260925975444e-07, + "loss": 2.6926, + "step": 46664 + }, + { + "epoch": 2.8968278602023716, + "grad_norm": 0.1418850133529938, + "learning_rate": 3.5984996748749665e-07, + "loss": 2.7368, + "step": 46665 + }, + { + "epoch": 2.896889937302129, + "grad_norm": 0.12914783064210605, + "learning_rate": 3.5941758469996145e-07, + "loss": 2.7269, + "step": 46666 + }, + { + "epoch": 2.896952014401887, + "grad_norm": 0.13262777244113125, + "learning_rate": 3.589854608993859e-07, + "loss": 2.7705, + "step": 46667 + }, + { + "epoch": 2.897014091501645, + "grad_norm": 0.14385504020597875, + "learning_rate": 3.585535960880404e-07, + "loss": 2.6994, + "step": 46668 + }, + { + "epoch": 2.897076168601403, + "grad_norm": 0.13486079987847313, + "learning_rate": 3.581219902681676e-07, + "loss": 2.7689, + "step": 46669 + }, + { + "epoch": 2.8971382457011607, + "grad_norm": 0.15838825320089658, + "learning_rate": 3.5769064344202687e-07, + "loss": 2.6694, + "step": 46670 + }, + { + "epoch": 2.8972003228009187, + "grad_norm": 0.13147735573567124, + "learning_rate": 3.5725955561186075e-07, + "loss": 2.7891, + "step": 46671 + }, + { + "epoch": 2.8972623999006766, + "grad_norm": 0.13550458325265835, + "learning_rate": 3.568287267799231e-07, + "loss": 2.691, + "step": 46672 + }, + { + "epoch": 2.8973244770004345, + "grad_norm": 0.1297154459660036, + "learning_rate": 3.5639815694846755e-07, + "loss": 2.7127, + "step": 46673 + }, + { + "epoch": 2.8973865541001924, + "grad_norm": 0.1320312607217818, + "learning_rate": 3.559678461197258e-07, + "loss": 2.6943, + "step": 46674 + }, + { + "epoch": 2.8974486311999503, + "grad_norm": 0.12882765628181167, + "learning_rate": 3.5553779429595703e-07, + "loss": 2.6756, + "step": 46675 + }, + { + "epoch": 2.8975107082997082, + "grad_norm": 0.1322388672843719, + "learning_rate": 3.5510800147939837e-07, + "loss": 2.7628, + "step": 46676 + }, + { + "epoch": 2.897572785399466, + "grad_norm": 0.1315038454346578, + "learning_rate": 3.5467846767229253e-07, + "loss": 2.6728, + "step": 46677 + }, + { + "epoch": 2.897634862499224, + "grad_norm": 0.13964076121113292, + "learning_rate": 3.5424919287687653e-07, + "loss": 2.7225, + "step": 46678 + }, + { + "epoch": 2.897696939598982, + "grad_norm": 0.13086420021840123, + "learning_rate": 3.538201770953986e-07, + "loss": 2.6551, + "step": 46679 + }, + { + "epoch": 2.89775901669874, + "grad_norm": 0.13971947150490985, + "learning_rate": 3.5339142033009587e-07, + "loss": 2.7124, + "step": 46680 + }, + { + "epoch": 2.897821093798498, + "grad_norm": 0.14467446860709718, + "learning_rate": 3.5296292258319983e-07, + "loss": 2.7195, + "step": 46681 + }, + { + "epoch": 2.8978831708982558, + "grad_norm": 0.1413070773535657, + "learning_rate": 3.5253468385694767e-07, + "loss": 2.6581, + "step": 46682 + }, + { + "epoch": 2.8979452479980137, + "grad_norm": 0.12998794660259494, + "learning_rate": 3.521067041535764e-07, + "loss": 2.7629, + "step": 46683 + }, + { + "epoch": 2.8980073250977716, + "grad_norm": 0.13588276451722783, + "learning_rate": 3.51678983475312e-07, + "loss": 2.7852, + "step": 46684 + }, + { + "epoch": 2.898069402197529, + "grad_norm": 0.1579500163390606, + "learning_rate": 3.512515218243917e-07, + "loss": 2.6768, + "step": 46685 + }, + { + "epoch": 2.8981314792972874, + "grad_norm": 0.12965627144928057, + "learning_rate": 3.5082431920304694e-07, + "loss": 2.7446, + "step": 46686 + }, + { + "epoch": 2.898193556397045, + "grad_norm": 0.1424232713230171, + "learning_rate": 3.5039737561350927e-07, + "loss": 2.6925, + "step": 46687 + }, + { + "epoch": 2.8982556334968033, + "grad_norm": 0.13018972661270226, + "learning_rate": 3.499706910579936e-07, + "loss": 2.6623, + "step": 46688 + }, + { + "epoch": 2.8983177105965607, + "grad_norm": 0.13077089535319386, + "learning_rate": 3.495442655387371e-07, + "loss": 2.6832, + "step": 46689 + }, + { + "epoch": 2.898379787696319, + "grad_norm": 0.1304262737411226, + "learning_rate": 3.491180990579601e-07, + "loss": 2.6941, + "step": 46690 + }, + { + "epoch": 2.8984418647960766, + "grad_norm": 0.13028096716308638, + "learning_rate": 3.486921916178887e-07, + "loss": 2.7218, + "step": 46691 + }, + { + "epoch": 2.8985039418958345, + "grad_norm": 0.13929616739646403, + "learning_rate": 3.4826654322074323e-07, + "loss": 2.6967, + "step": 46692 + }, + { + "epoch": 2.8985660189955924, + "grad_norm": 0.12834271884194678, + "learning_rate": 3.478411538687443e-07, + "loss": 2.6451, + "step": 46693 + }, + { + "epoch": 2.8986280960953503, + "grad_norm": 0.13053958163613935, + "learning_rate": 3.474160235641122e-07, + "loss": 2.7267, + "step": 46694 + }, + { + "epoch": 2.8986901731951082, + "grad_norm": 0.1528634779390295, + "learning_rate": 3.469911523090619e-07, + "loss": 2.6713, + "step": 46695 + }, + { + "epoch": 2.898752250294866, + "grad_norm": 0.13116108573359053, + "learning_rate": 3.4656654010581936e-07, + "loss": 2.7102, + "step": 46696 + }, + { + "epoch": 2.898814327394624, + "grad_norm": 0.12848112054743988, + "learning_rate": 3.461421869565884e-07, + "loss": 2.6588, + "step": 46697 + }, + { + "epoch": 2.898876404494382, + "grad_norm": 0.13176565062718168, + "learning_rate": 3.4571809286358393e-07, + "loss": 2.7539, + "step": 46698 + }, + { + "epoch": 2.89893848159414, + "grad_norm": 0.13413903482613232, + "learning_rate": 3.452942578290264e-07, + "loss": 2.685, + "step": 46699 + }, + { + "epoch": 2.899000558693898, + "grad_norm": 0.12937279284709188, + "learning_rate": 3.448706818551195e-07, + "loss": 2.6681, + "step": 46700 + }, + { + "epoch": 2.8990626357936558, + "grad_norm": 0.14162265357433107, + "learning_rate": 3.4444736494407826e-07, + "loss": 2.7469, + "step": 46701 + }, + { + "epoch": 2.8991247128934137, + "grad_norm": 0.14480034804447245, + "learning_rate": 3.4402430709811197e-07, + "loss": 2.7436, + "step": 46702 + }, + { + "epoch": 2.8991867899931716, + "grad_norm": 0.1455484116002709, + "learning_rate": 3.436015083194188e-07, + "loss": 2.6777, + "step": 46703 + }, + { + "epoch": 2.8992488670929295, + "grad_norm": 0.13765136883294424, + "learning_rate": 3.4317896861021937e-07, + "loss": 2.7359, + "step": 46704 + }, + { + "epoch": 2.8993109441926874, + "grad_norm": 0.1520074225157657, + "learning_rate": 3.427566879727062e-07, + "loss": 2.6433, + "step": 46705 + }, + { + "epoch": 2.8993730212924453, + "grad_norm": 0.15606425092514686, + "learning_rate": 3.423346664090832e-07, + "loss": 2.7587, + "step": 46706 + }, + { + "epoch": 2.8994350983922033, + "grad_norm": 0.14336657363114105, + "learning_rate": 3.419129039215541e-07, + "loss": 2.6845, + "step": 46707 + }, + { + "epoch": 2.899497175491961, + "grad_norm": 0.13128790506739718, + "learning_rate": 3.4149140051232266e-07, + "loss": 2.7322, + "step": 46708 + }, + { + "epoch": 2.899559252591719, + "grad_norm": 0.14238142135840554, + "learning_rate": 3.410701561835872e-07, + "loss": 2.7166, + "step": 46709 + }, + { + "epoch": 2.8996213296914766, + "grad_norm": 0.14311252670824273, + "learning_rate": 3.406491709375459e-07, + "loss": 2.7264, + "step": 46710 + }, + { + "epoch": 2.899683406791235, + "grad_norm": 0.12701909478650428, + "learning_rate": 3.402284447763915e-07, + "loss": 2.7403, + "step": 46711 + }, + { + "epoch": 2.8997454838909924, + "grad_norm": 0.1295198876973439, + "learning_rate": 3.398079777023222e-07, + "loss": 2.7305, + "step": 46712 + }, + { + "epoch": 2.8998075609907508, + "grad_norm": 0.14042711905516778, + "learning_rate": 3.393877697175252e-07, + "loss": 2.6652, + "step": 46713 + }, + { + "epoch": 2.8998696380905082, + "grad_norm": 0.13019308875170865, + "learning_rate": 3.3896782082419864e-07, + "loss": 2.7286, + "step": 46714 + }, + { + "epoch": 2.899931715190266, + "grad_norm": 0.13362732041621808, + "learning_rate": 3.385481310245353e-07, + "loss": 2.5754, + "step": 46715 + }, + { + "epoch": 2.899993792290024, + "grad_norm": 0.13331971365936385, + "learning_rate": 3.381287003207223e-07, + "loss": 2.6975, + "step": 46716 + }, + { + "epoch": 2.900055869389782, + "grad_norm": 0.13110702481537112, + "learning_rate": 3.3770952871494124e-07, + "loss": 2.6805, + "step": 46717 + }, + { + "epoch": 2.90011794648954, + "grad_norm": 0.13683110116682104, + "learning_rate": 3.372906162093903e-07, + "loss": 2.7488, + "step": 46718 + }, + { + "epoch": 2.900180023589298, + "grad_norm": 0.12816870542916017, + "learning_rate": 3.368719628062511e-07, + "loss": 2.6626, + "step": 46719 + }, + { + "epoch": 2.9002421006890557, + "grad_norm": 0.1290241843492707, + "learning_rate": 3.364535685077108e-07, + "loss": 2.7409, + "step": 46720 + }, + { + "epoch": 2.9003041777888137, + "grad_norm": 0.14082564503894304, + "learning_rate": 3.360354333159399e-07, + "loss": 2.6896, + "step": 46721 + }, + { + "epoch": 2.9003662548885716, + "grad_norm": 0.13775075648449703, + "learning_rate": 3.3561755723313657e-07, + "loss": 2.7053, + "step": 46722 + }, + { + "epoch": 2.9004283319883295, + "grad_norm": 0.13155003382139738, + "learning_rate": 3.351999402614658e-07, + "loss": 2.6947, + "step": 46723 + }, + { + "epoch": 2.9004904090880874, + "grad_norm": 0.1459662237836894, + "learning_rate": 3.347825824031203e-07, + "loss": 2.6134, + "step": 46724 + }, + { + "epoch": 2.9005524861878453, + "grad_norm": 0.13590859429194257, + "learning_rate": 3.34365483660265e-07, + "loss": 2.7936, + "step": 46725 + }, + { + "epoch": 2.9006145632876033, + "grad_norm": 0.1331862352240454, + "learning_rate": 3.339486440350814e-07, + "loss": 2.7195, + "step": 46726 + }, + { + "epoch": 2.900676640387361, + "grad_norm": 0.14608203948102236, + "learning_rate": 3.335320635297512e-07, + "loss": 2.7978, + "step": 46727 + }, + { + "epoch": 2.900738717487119, + "grad_norm": 0.13047919631299656, + "learning_rate": 3.331157421464337e-07, + "loss": 2.6426, + "step": 46728 + }, + { + "epoch": 2.900800794586877, + "grad_norm": 0.13065503663733877, + "learning_rate": 3.326996798873161e-07, + "loss": 2.8066, + "step": 46729 + }, + { + "epoch": 2.900862871686635, + "grad_norm": 0.13975638105806745, + "learning_rate": 3.322838767545522e-07, + "loss": 2.6715, + "step": 46730 + }, + { + "epoch": 2.900924948786393, + "grad_norm": 0.12983970356912453, + "learning_rate": 3.3186833275032916e-07, + "loss": 2.7674, + "step": 46731 + }, + { + "epoch": 2.9009870258861508, + "grad_norm": 0.13063697990439513, + "learning_rate": 3.3145304787680076e-07, + "loss": 2.6678, + "step": 46732 + }, + { + "epoch": 2.9010491029859082, + "grad_norm": 0.13329923021021997, + "learning_rate": 3.310380221361431e-07, + "loss": 2.66, + "step": 46733 + }, + { + "epoch": 2.9011111800856666, + "grad_norm": 0.13237662878646414, + "learning_rate": 3.3062325553051556e-07, + "loss": 2.7401, + "step": 46734 + }, + { + "epoch": 2.901173257185424, + "grad_norm": 0.15424181764685166, + "learning_rate": 3.30208748062083e-07, + "loss": 2.6855, + "step": 46735 + }, + { + "epoch": 2.9012353342851824, + "grad_norm": 0.13500519204972633, + "learning_rate": 3.2979449973301046e-07, + "loss": 2.6712, + "step": 46736 + }, + { + "epoch": 2.90129741138494, + "grad_norm": 0.14226855022056256, + "learning_rate": 3.2938051054545724e-07, + "loss": 2.7958, + "step": 46737 + }, + { + "epoch": 2.9013594884846983, + "grad_norm": 0.13392767721399873, + "learning_rate": 3.289667805015884e-07, + "loss": 2.6478, + "step": 46738 + }, + { + "epoch": 2.9014215655844557, + "grad_norm": 0.13379517422937917, + "learning_rate": 3.28553309603552e-07, + "loss": 2.8073, + "step": 46739 + }, + { + "epoch": 2.9014836426842137, + "grad_norm": 0.1302007998830313, + "learning_rate": 3.281400978535076e-07, + "loss": 2.6969, + "step": 46740 + }, + { + "epoch": 2.9015457197839716, + "grad_norm": 0.1415567985022298, + "learning_rate": 3.2772714525362014e-07, + "loss": 2.7868, + "step": 46741 + }, + { + "epoch": 2.9016077968837295, + "grad_norm": 0.12915370771045923, + "learning_rate": 3.2731445180604337e-07, + "loss": 2.7162, + "step": 46742 + }, + { + "epoch": 2.9016698739834874, + "grad_norm": 0.14075982068315654, + "learning_rate": 3.2690201751292007e-07, + "loss": 2.6411, + "step": 46743 + }, + { + "epoch": 2.9017319510832453, + "grad_norm": 0.13345970908143776, + "learning_rate": 3.26489842376404e-07, + "loss": 2.6697, + "step": 46744 + }, + { + "epoch": 2.9017940281830032, + "grad_norm": 0.12942295122234518, + "learning_rate": 3.2607792639865464e-07, + "loss": 2.7421, + "step": 46745 + }, + { + "epoch": 2.901856105282761, + "grad_norm": 0.14063019875347593, + "learning_rate": 3.256662695818147e-07, + "loss": 2.7095, + "step": 46746 + }, + { + "epoch": 2.901918182382519, + "grad_norm": 0.13056172495904572, + "learning_rate": 3.252548719280324e-07, + "loss": 2.6686, + "step": 46747 + }, + { + "epoch": 2.901980259482277, + "grad_norm": 0.1313527967730469, + "learning_rate": 3.248437334394561e-07, + "loss": 2.6945, + "step": 46748 + }, + { + "epoch": 2.902042336582035, + "grad_norm": 0.12872808599777286, + "learning_rate": 3.244328541182229e-07, + "loss": 2.6993, + "step": 46749 + }, + { + "epoch": 2.902104413681793, + "grad_norm": 0.12816878535932014, + "learning_rate": 3.240222339664867e-07, + "loss": 2.6872, + "step": 46750 + }, + { + "epoch": 2.9021664907815508, + "grad_norm": 0.13407197679800714, + "learning_rate": 3.236118729863902e-07, + "loss": 2.7158, + "step": 46751 + }, + { + "epoch": 2.9022285678813087, + "grad_norm": 0.15223460077603587, + "learning_rate": 3.23201771180065e-07, + "loss": 2.7097, + "step": 46752 + }, + { + "epoch": 2.9022906449810666, + "grad_norm": 0.13235060228220466, + "learning_rate": 3.227919285496539e-07, + "loss": 2.7054, + "step": 46753 + }, + { + "epoch": 2.9023527220808245, + "grad_norm": 0.1408641688694247, + "learning_rate": 3.2238234509729956e-07, + "loss": 2.7579, + "step": 46754 + }, + { + "epoch": 2.9024147991805824, + "grad_norm": 0.1523910351265354, + "learning_rate": 3.219730208251392e-07, + "loss": 2.7568, + "step": 46755 + }, + { + "epoch": 2.9024768762803403, + "grad_norm": 0.13060917350040668, + "learning_rate": 3.2156395573529895e-07, + "loss": 2.721, + "step": 46756 + }, + { + "epoch": 2.9025389533800983, + "grad_norm": 0.12499624112557231, + "learning_rate": 3.2115514982992143e-07, + "loss": 2.7126, + "step": 46757 + }, + { + "epoch": 2.9026010304798557, + "grad_norm": 0.13554823349813352, + "learning_rate": 3.2074660311113837e-07, + "loss": 2.6958, + "step": 46758 + }, + { + "epoch": 2.902663107579614, + "grad_norm": 0.12942503803435723, + "learning_rate": 3.203383155810813e-07, + "loss": 2.7834, + "step": 46759 + }, + { + "epoch": 2.9027251846793716, + "grad_norm": 0.1403173750920421, + "learning_rate": 3.199302872418819e-07, + "loss": 2.7343, + "step": 46760 + }, + { + "epoch": 2.90278726177913, + "grad_norm": 0.12884733269711415, + "learning_rate": 3.195225180956607e-07, + "loss": 2.7034, + "step": 46761 + }, + { + "epoch": 2.9028493388788874, + "grad_norm": 0.13015315237629582, + "learning_rate": 3.1911500814455487e-07, + "loss": 2.6407, + "step": 46762 + }, + { + "epoch": 2.9029114159786453, + "grad_norm": 0.15890572966993996, + "learning_rate": 3.1870775739068494e-07, + "loss": 2.7158, + "step": 46763 + }, + { + "epoch": 2.9029734930784032, + "grad_norm": 0.1282670690083691, + "learning_rate": 3.1830076583617696e-07, + "loss": 2.6756, + "step": 46764 + }, + { + "epoch": 2.903035570178161, + "grad_norm": 0.13563985878330628, + "learning_rate": 3.1789403348315703e-07, + "loss": 2.6693, + "step": 46765 + }, + { + "epoch": 2.903097647277919, + "grad_norm": 0.13955629416615442, + "learning_rate": 3.1748756033374015e-07, + "loss": 2.6362, + "step": 46766 + }, + { + "epoch": 2.903159724377677, + "grad_norm": 0.14310953967745824, + "learning_rate": 3.1708134639005236e-07, + "loss": 2.6675, + "step": 46767 + }, + { + "epoch": 2.903221801477435, + "grad_norm": 0.13237433522609537, + "learning_rate": 3.166753916542142e-07, + "loss": 2.7258, + "step": 46768 + }, + { + "epoch": 2.903283878577193, + "grad_norm": 0.1309019275770339, + "learning_rate": 3.1626969612834065e-07, + "loss": 2.6925, + "step": 46769 + }, + { + "epoch": 2.9033459556769508, + "grad_norm": 0.12934639953218438, + "learning_rate": 3.158642598145467e-07, + "loss": 2.701, + "step": 46770 + }, + { + "epoch": 2.9034080327767087, + "grad_norm": 0.15324918348342223, + "learning_rate": 3.154590827149584e-07, + "loss": 2.7582, + "step": 46771 + }, + { + "epoch": 2.9034701098764666, + "grad_norm": 0.13979683174217053, + "learning_rate": 3.1505416483167404e-07, + "loss": 2.695, + "step": 46772 + }, + { + "epoch": 2.9035321869762245, + "grad_norm": 0.13762489168429637, + "learning_rate": 3.1464950616681423e-07, + "loss": 2.8119, + "step": 46773 + }, + { + "epoch": 2.9035942640759824, + "grad_norm": 0.13871895299906528, + "learning_rate": 3.142451067224883e-07, + "loss": 2.7226, + "step": 46774 + }, + { + "epoch": 2.9036563411757403, + "grad_norm": 0.14176507913860054, + "learning_rate": 3.138409665008113e-07, + "loss": 2.6255, + "step": 46775 + }, + { + "epoch": 2.9037184182754983, + "grad_norm": 0.133582136140134, + "learning_rate": 3.134370855038871e-07, + "loss": 2.6708, + "step": 46776 + }, + { + "epoch": 2.903780495375256, + "grad_norm": 0.12927020595986438, + "learning_rate": 3.130334637338195e-07, + "loss": 2.7247, + "step": 46777 + }, + { + "epoch": 2.903842572475014, + "grad_norm": 0.14257243550066276, + "learning_rate": 3.126301011927235e-07, + "loss": 2.7217, + "step": 46778 + }, + { + "epoch": 2.903904649574772, + "grad_norm": 0.13055484837553005, + "learning_rate": 3.122269978826975e-07, + "loss": 2.7457, + "step": 46779 + }, + { + "epoch": 2.90396672667453, + "grad_norm": 0.13055591127361824, + "learning_rate": 3.1182415380583975e-07, + "loss": 2.702, + "step": 46780 + }, + { + "epoch": 2.9040288037742874, + "grad_norm": 0.13645942480679668, + "learning_rate": 3.114215689642652e-07, + "loss": 2.7422, + "step": 46781 + }, + { + "epoch": 2.9040908808740458, + "grad_norm": 0.1472984644147624, + "learning_rate": 3.1101924336006115e-07, + "loss": 2.688, + "step": 46782 + }, + { + "epoch": 2.9041529579738032, + "grad_norm": 0.13161293525277828, + "learning_rate": 3.1061717699533697e-07, + "loss": 2.718, + "step": 46783 + }, + { + "epoch": 2.9042150350735616, + "grad_norm": 0.13294921878362448, + "learning_rate": 3.1021536987218545e-07, + "loss": 2.7434, + "step": 46784 + }, + { + "epoch": 2.904277112173319, + "grad_norm": 0.13405387316025774, + "learning_rate": 3.098138219926938e-07, + "loss": 2.7547, + "step": 46785 + }, + { + "epoch": 2.9043391892730774, + "grad_norm": 0.15368443276314211, + "learning_rate": 3.09412533358977e-07, + "loss": 2.7056, + "step": 46786 + }, + { + "epoch": 2.904401266372835, + "grad_norm": 0.1339956554079505, + "learning_rate": 3.090115039731112e-07, + "loss": 2.7725, + "step": 46787 + }, + { + "epoch": 2.904463343472593, + "grad_norm": 0.12736836528999823, + "learning_rate": 3.086107338372002e-07, + "loss": 2.6643, + "step": 46788 + }, + { + "epoch": 2.9045254205723507, + "grad_norm": 0.14546315576796112, + "learning_rate": 3.0821022295332567e-07, + "loss": 2.7436, + "step": 46789 + }, + { + "epoch": 2.9045874976721087, + "grad_norm": 0.1294110269648196, + "learning_rate": 3.0780997132358046e-07, + "loss": 2.6114, + "step": 46790 + }, + { + "epoch": 2.9046495747718666, + "grad_norm": 0.1370945398070117, + "learning_rate": 3.074099789500573e-07, + "loss": 2.6375, + "step": 46791 + }, + { + "epoch": 2.9047116518716245, + "grad_norm": 0.1467095234237688, + "learning_rate": 3.070102458348434e-07, + "loss": 2.6939, + "step": 46792 + }, + { + "epoch": 2.9047737289713824, + "grad_norm": 0.14423503032680504, + "learning_rate": 3.066107719800093e-07, + "loss": 2.699, + "step": 46793 + }, + { + "epoch": 2.9048358060711403, + "grad_norm": 0.14094866486370694, + "learning_rate": 3.0621155738765895e-07, + "loss": 2.705, + "step": 46794 + }, + { + "epoch": 2.9048978831708983, + "grad_norm": 0.1332500207410728, + "learning_rate": 3.0581260205986837e-07, + "loss": 2.6373, + "step": 46795 + }, + { + "epoch": 2.904959960270656, + "grad_norm": 0.13347171122660664, + "learning_rate": 3.054139059987138e-07, + "loss": 2.6984, + "step": 46796 + }, + { + "epoch": 2.905022037370414, + "grad_norm": 0.13116696489969512, + "learning_rate": 3.0501546920627676e-07, + "loss": 2.5854, + "step": 46797 + }, + { + "epoch": 2.905084114470172, + "grad_norm": 0.1321555255316187, + "learning_rate": 3.0461729168463905e-07, + "loss": 2.6422, + "step": 46798 + }, + { + "epoch": 2.90514619156993, + "grad_norm": 0.13032925395158754, + "learning_rate": 3.042193734358767e-07, + "loss": 2.6288, + "step": 46799 + }, + { + "epoch": 2.905208268669688, + "grad_norm": 0.13661726182058767, + "learning_rate": 3.0382171446206584e-07, + "loss": 2.7127, + "step": 46800 + }, + { + "epoch": 2.9052703457694458, + "grad_norm": 0.15243057686598718, + "learning_rate": 3.0342431476528265e-07, + "loss": 2.5787, + "step": 46801 + }, + { + "epoch": 2.9053324228692037, + "grad_norm": 0.15313050493738167, + "learning_rate": 3.0302717434759765e-07, + "loss": 2.682, + "step": 46802 + }, + { + "epoch": 2.9053944999689616, + "grad_norm": 0.12838018608528978, + "learning_rate": 3.0263029321108136e-07, + "loss": 2.6414, + "step": 46803 + }, + { + "epoch": 2.9054565770687195, + "grad_norm": 0.12925353375451842, + "learning_rate": 3.0223367135781e-07, + "loss": 2.7148, + "step": 46804 + }, + { + "epoch": 2.9055186541684774, + "grad_norm": 0.14320838445109074, + "learning_rate": 3.018373087898541e-07, + "loss": 2.6615, + "step": 46805 + }, + { + "epoch": 2.905580731268235, + "grad_norm": 0.12879619813496168, + "learning_rate": 3.01441205509273e-07, + "loss": 2.5725, + "step": 46806 + }, + { + "epoch": 2.9056428083679933, + "grad_norm": 0.13115206765992965, + "learning_rate": 3.0104536151813743e-07, + "loss": 2.6049, + "step": 46807 + }, + { + "epoch": 2.9057048854677507, + "grad_norm": 0.15657927270043637, + "learning_rate": 3.0064977681851236e-07, + "loss": 2.6398, + "step": 46808 + }, + { + "epoch": 2.905766962567509, + "grad_norm": 0.14410271133043495, + "learning_rate": 3.002544514124683e-07, + "loss": 2.7091, + "step": 46809 + }, + { + "epoch": 2.9058290396672666, + "grad_norm": 0.13773535423558197, + "learning_rate": 2.9985938530205923e-07, + "loss": 2.706, + "step": 46810 + }, + { + "epoch": 2.9058911167670245, + "grad_norm": 0.1373615684277737, + "learning_rate": 2.9946457848934464e-07, + "loss": 2.7089, + "step": 46811 + }, + { + "epoch": 2.9059531938667824, + "grad_norm": 0.15108648545041417, + "learning_rate": 2.99070030976395e-07, + "loss": 2.6576, + "step": 46812 + }, + { + "epoch": 2.9060152709665403, + "grad_norm": 0.137215264504371, + "learning_rate": 2.986757427652587e-07, + "loss": 2.7041, + "step": 46813 + }, + { + "epoch": 2.9060773480662982, + "grad_norm": 0.14678577568925874, + "learning_rate": 2.982817138579952e-07, + "loss": 2.6984, + "step": 46814 + }, + { + "epoch": 2.906139425166056, + "grad_norm": 0.12962963410392, + "learning_rate": 2.97887944256664e-07, + "loss": 2.7273, + "step": 46815 + }, + { + "epoch": 2.906201502265814, + "grad_norm": 0.1338965408873729, + "learning_rate": 2.9749443396331346e-07, + "loss": 2.7243, + "step": 46816 + }, + { + "epoch": 2.906263579365572, + "grad_norm": 0.12846325123097627, + "learning_rate": 2.97101182980003e-07, + "loss": 2.7453, + "step": 46817 + }, + { + "epoch": 2.90632565646533, + "grad_norm": 0.130716074583073, + "learning_rate": 2.9670819130878657e-07, + "loss": 2.7211, + "step": 46818 + }, + { + "epoch": 2.906387733565088, + "grad_norm": 0.13497669856651986, + "learning_rate": 2.9631545895170143e-07, + "loss": 2.6808, + "step": 46819 + }, + { + "epoch": 2.9064498106648458, + "grad_norm": 0.15422392581382263, + "learning_rate": 2.95922985910807e-07, + "loss": 2.7534, + "step": 46820 + }, + { + "epoch": 2.9065118877646037, + "grad_norm": 0.12976831394328617, + "learning_rate": 2.955307721881462e-07, + "loss": 2.7717, + "step": 46821 + }, + { + "epoch": 2.9065739648643616, + "grad_norm": 0.13954031032736677, + "learning_rate": 2.951388177857728e-07, + "loss": 2.7441, + "step": 46822 + }, + { + "epoch": 2.9066360419641195, + "grad_norm": 0.14942556848042904, + "learning_rate": 2.9474712270572414e-07, + "loss": 2.6902, + "step": 46823 + }, + { + "epoch": 2.9066981190638774, + "grad_norm": 0.15271119285714757, + "learning_rate": 2.943556869500486e-07, + "loss": 2.7641, + "step": 46824 + }, + { + "epoch": 2.9067601961636353, + "grad_norm": 0.13960398786896072, + "learning_rate": 2.939645105207778e-07, + "loss": 2.6801, + "step": 46825 + }, + { + "epoch": 2.9068222732633933, + "grad_norm": 0.14245114485542373, + "learning_rate": 2.935735934199657e-07, + "loss": 2.635, + "step": 46826 + }, + { + "epoch": 2.906884350363151, + "grad_norm": 0.12803172635768906, + "learning_rate": 2.931829356496496e-07, + "loss": 2.6732, + "step": 46827 + }, + { + "epoch": 2.906946427462909, + "grad_norm": 0.13303078916504363, + "learning_rate": 2.9279253721186116e-07, + "loss": 2.7551, + "step": 46828 + }, + { + "epoch": 2.9070085045626666, + "grad_norm": 0.1318143473713232, + "learning_rate": 2.924023981086432e-07, + "loss": 2.6318, + "step": 46829 + }, + { + "epoch": 2.907070581662425, + "grad_norm": 0.12867690404851015, + "learning_rate": 2.9201251834202746e-07, + "loss": 2.7126, + "step": 46830 + }, + { + "epoch": 2.9071326587621824, + "grad_norm": 0.13045794562186022, + "learning_rate": 2.916228979140512e-07, + "loss": 2.7273, + "step": 46831 + }, + { + "epoch": 2.9071947358619408, + "grad_norm": 0.14130363033833757, + "learning_rate": 2.9123353682674604e-07, + "loss": 2.6935, + "step": 46832 + }, + { + "epoch": 2.9072568129616982, + "grad_norm": 0.1286967120608873, + "learning_rate": 2.9084443508213824e-07, + "loss": 2.6544, + "step": 46833 + }, + { + "epoch": 2.907318890061456, + "grad_norm": 0.133259714528879, + "learning_rate": 2.9045559268226494e-07, + "loss": 2.7213, + "step": 46834 + }, + { + "epoch": 2.907380967161214, + "grad_norm": 0.13887100965602434, + "learning_rate": 2.9006700962915247e-07, + "loss": 2.6763, + "step": 46835 + }, + { + "epoch": 2.907443044260972, + "grad_norm": 0.13229790035214042, + "learning_rate": 2.896786859248268e-07, + "loss": 2.6562, + "step": 46836 + }, + { + "epoch": 2.90750512136073, + "grad_norm": 0.15476098458517396, + "learning_rate": 2.8929062157131424e-07, + "loss": 2.8551, + "step": 46837 + }, + { + "epoch": 2.907567198460488, + "grad_norm": 0.13361236292023415, + "learning_rate": 2.889028165706409e-07, + "loss": 2.5926, + "step": 46838 + }, + { + "epoch": 2.9076292755602458, + "grad_norm": 0.1296395052148504, + "learning_rate": 2.8851527092482735e-07, + "loss": 2.6893, + "step": 46839 + }, + { + "epoch": 2.9076913526600037, + "grad_norm": 0.1314917486619882, + "learning_rate": 2.881279846359053e-07, + "loss": 2.7314, + "step": 46840 + }, + { + "epoch": 2.9077534297597616, + "grad_norm": 0.14759140053889627, + "learning_rate": 2.877409577058787e-07, + "loss": 2.6392, + "step": 46841 + }, + { + "epoch": 2.9078155068595195, + "grad_norm": 0.13154650812973517, + "learning_rate": 2.8735419013677935e-07, + "loss": 2.6327, + "step": 46842 + }, + { + "epoch": 2.9078775839592774, + "grad_norm": 0.14248981745838463, + "learning_rate": 2.8696768193062217e-07, + "loss": 2.7679, + "step": 46843 + }, + { + "epoch": 2.9079396610590353, + "grad_norm": 0.13725269820170258, + "learning_rate": 2.865814330894223e-07, + "loss": 2.675, + "step": 46844 + }, + { + "epoch": 2.9080017381587933, + "grad_norm": 0.14101553702946082, + "learning_rate": 2.861954436151948e-07, + "loss": 2.7097, + "step": 46845 + }, + { + "epoch": 2.908063815258551, + "grad_norm": 0.1286609584460093, + "learning_rate": 2.8580971350995466e-07, + "loss": 2.7884, + "step": 46846 + }, + { + "epoch": 2.908125892358309, + "grad_norm": 0.13254092303161089, + "learning_rate": 2.85424242775717e-07, + "loss": 2.743, + "step": 46847 + }, + { + "epoch": 2.908187969458067, + "grad_norm": 0.13194955945208828, + "learning_rate": 2.850390314144857e-07, + "loss": 2.7235, + "step": 46848 + }, + { + "epoch": 2.908250046557825, + "grad_norm": 0.12818781458519882, + "learning_rate": 2.846540794282759e-07, + "loss": 2.6662, + "step": 46849 + }, + { + "epoch": 2.908312123657583, + "grad_norm": 0.14029742871502454, + "learning_rate": 2.8426938681909154e-07, + "loss": 2.674, + "step": 46850 + }, + { + "epoch": 2.9083742007573408, + "grad_norm": 0.1318425706678197, + "learning_rate": 2.838849535889476e-07, + "loss": 2.7037, + "step": 46851 + }, + { + "epoch": 2.9084362778570982, + "grad_norm": 0.1319489171569498, + "learning_rate": 2.83500779739837e-07, + "loss": 2.6059, + "step": 46852 + }, + { + "epoch": 2.9084983549568566, + "grad_norm": 0.13137894296808647, + "learning_rate": 2.8311686527378034e-07, + "loss": 2.6338, + "step": 46853 + }, + { + "epoch": 2.908560432056614, + "grad_norm": 0.1282208601628969, + "learning_rate": 2.8273321019277045e-07, + "loss": 2.7104, + "step": 46854 + }, + { + "epoch": 2.9086225091563724, + "grad_norm": 0.14168882640794933, + "learning_rate": 2.8234981449881124e-07, + "loss": 2.673, + "step": 46855 + }, + { + "epoch": 2.90868458625613, + "grad_norm": 0.13749917582785084, + "learning_rate": 2.819666781938957e-07, + "loss": 2.7254, + "step": 46856 + }, + { + "epoch": 2.9087466633558883, + "grad_norm": 0.13386536948998506, + "learning_rate": 2.8158380128003315e-07, + "loss": 2.7047, + "step": 46857 + }, + { + "epoch": 2.9088087404556457, + "grad_norm": 0.13043024385738616, + "learning_rate": 2.812011837592221e-07, + "loss": 2.658, + "step": 46858 + }, + { + "epoch": 2.9088708175554037, + "grad_norm": 0.15332316134776455, + "learning_rate": 2.8081882563344985e-07, + "loss": 2.8013, + "step": 46859 + }, + { + "epoch": 2.9089328946551616, + "grad_norm": 0.1267177894781659, + "learning_rate": 2.8043672690472036e-07, + "loss": 2.6549, + "step": 46860 + }, + { + "epoch": 2.9089949717549195, + "grad_norm": 0.13003775310917087, + "learning_rate": 2.800548875750153e-07, + "loss": 2.7244, + "step": 46861 + }, + { + "epoch": 2.9090570488546774, + "grad_norm": 0.13773560441756436, + "learning_rate": 2.796733076463387e-07, + "loss": 2.7149, + "step": 46862 + }, + { + "epoch": 2.9091191259544353, + "grad_norm": 0.14521733033668596, + "learning_rate": 2.792919871206723e-07, + "loss": 2.7314, + "step": 46863 + }, + { + "epoch": 2.9091812030541933, + "grad_norm": 0.13530226300920464, + "learning_rate": 2.789109260000144e-07, + "loss": 2.6164, + "step": 46864 + }, + { + "epoch": 2.909243280153951, + "grad_norm": 0.1299026660516572, + "learning_rate": 2.785301242863414e-07, + "loss": 2.7481, + "step": 46865 + }, + { + "epoch": 2.909305357253709, + "grad_norm": 0.13094022018547752, + "learning_rate": 2.781495819816515e-07, + "loss": 2.7411, + "step": 46866 + }, + { + "epoch": 2.909367434353467, + "grad_norm": 0.13394191808841122, + "learning_rate": 2.777692990879266e-07, + "loss": 2.7071, + "step": 46867 + }, + { + "epoch": 2.909429511453225, + "grad_norm": 0.14792629731642376, + "learning_rate": 2.773892756071539e-07, + "loss": 2.6851, + "step": 46868 + }, + { + "epoch": 2.909491588552983, + "grad_norm": 0.12890583818543283, + "learning_rate": 2.7700951154130407e-07, + "loss": 2.719, + "step": 46869 + }, + { + "epoch": 2.9095536656527408, + "grad_norm": 0.12824860344418268, + "learning_rate": 2.7663000689236994e-07, + "loss": 2.6939, + "step": 46870 + }, + { + "epoch": 2.9096157427524987, + "grad_norm": 0.13321390220247384, + "learning_rate": 2.7625076166232223e-07, + "loss": 2.7051, + "step": 46871 + }, + { + "epoch": 2.9096778198522566, + "grad_norm": 0.12795708632267835, + "learning_rate": 2.758717758531537e-07, + "loss": 2.7322, + "step": 46872 + }, + { + "epoch": 2.9097398969520145, + "grad_norm": 0.12959576918760835, + "learning_rate": 2.7549304946682954e-07, + "loss": 2.6308, + "step": 46873 + }, + { + "epoch": 2.9098019740517724, + "grad_norm": 0.13688228227455948, + "learning_rate": 2.751145825053314e-07, + "loss": 2.7311, + "step": 46874 + }, + { + "epoch": 2.9098640511515304, + "grad_norm": 0.13258008395733206, + "learning_rate": 2.7473637497062444e-07, + "loss": 2.6921, + "step": 46875 + }, + { + "epoch": 2.9099261282512883, + "grad_norm": 0.129666004065471, + "learning_rate": 2.7435842686469595e-07, + "loss": 2.6904, + "step": 46876 + }, + { + "epoch": 2.9099882053510457, + "grad_norm": 0.13341573851559585, + "learning_rate": 2.7398073818951097e-07, + "loss": 2.6849, + "step": 46877 + }, + { + "epoch": 2.910050282450804, + "grad_norm": 0.12883032383381127, + "learning_rate": 2.7360330894703466e-07, + "loss": 2.699, + "step": 46878 + }, + { + "epoch": 2.9101123595505616, + "grad_norm": 0.14618063270964993, + "learning_rate": 2.732261391392432e-07, + "loss": 2.711, + "step": 46879 + }, + { + "epoch": 2.91017443665032, + "grad_norm": 0.12942482215881287, + "learning_rate": 2.728492287681073e-07, + "loss": 2.7301, + "step": 46880 + }, + { + "epoch": 2.9102365137500774, + "grad_norm": 0.1373400196397747, + "learning_rate": 2.724725778355808e-07, + "loss": 2.5341, + "step": 46881 + }, + { + "epoch": 2.9102985908498353, + "grad_norm": 0.1351523861856476, + "learning_rate": 2.7209618634364554e-07, + "loss": 2.5648, + "step": 46882 + }, + { + "epoch": 2.9103606679495932, + "grad_norm": 0.1465386862207235, + "learning_rate": 2.7172005429424997e-07, + "loss": 2.673, + "step": 46883 + }, + { + "epoch": 2.910422745049351, + "grad_norm": 0.13082515923995, + "learning_rate": 2.713441816893647e-07, + "loss": 2.6749, + "step": 46884 + }, + { + "epoch": 2.910484822149109, + "grad_norm": 0.13156816344632152, + "learning_rate": 2.709685685309493e-07, + "loss": 2.7077, + "step": 46885 + }, + { + "epoch": 2.910546899248867, + "grad_norm": 0.14135705951123742, + "learning_rate": 2.7059321482096334e-07, + "loss": 2.7294, + "step": 46886 + }, + { + "epoch": 2.910608976348625, + "grad_norm": 0.1387312922989661, + "learning_rate": 2.702181205613663e-07, + "loss": 2.683, + "step": 46887 + }, + { + "epoch": 2.910671053448383, + "grad_norm": 0.1435916619465534, + "learning_rate": 2.6984328575410665e-07, + "loss": 2.6795, + "step": 46888 + }, + { + "epoch": 2.9107331305481408, + "grad_norm": 0.13077452029297199, + "learning_rate": 2.694687104011551e-07, + "loss": 2.6967, + "step": 46889 + }, + { + "epoch": 2.9107952076478987, + "grad_norm": 0.13492703101594636, + "learning_rate": 2.690943945044544e-07, + "loss": 2.6839, + "step": 46890 + }, + { + "epoch": 2.9108572847476566, + "grad_norm": 0.1422477513292271, + "learning_rate": 2.6872033806595866e-07, + "loss": 2.7769, + "step": 46891 + }, + { + "epoch": 2.9109193618474145, + "grad_norm": 0.13523287557986052, + "learning_rate": 2.683465410876218e-07, + "loss": 2.6818, + "step": 46892 + }, + { + "epoch": 2.9109814389471724, + "grad_norm": 0.1327941966066684, + "learning_rate": 2.6797300357139786e-07, + "loss": 2.7302, + "step": 46893 + }, + { + "epoch": 2.9110435160469303, + "grad_norm": 0.13077835877389032, + "learning_rate": 2.6759972551922976e-07, + "loss": 2.709, + "step": 46894 + }, + { + "epoch": 2.9111055931466883, + "grad_norm": 0.12965465526147124, + "learning_rate": 2.672267069330658e-07, + "loss": 2.751, + "step": 46895 + }, + { + "epoch": 2.911167670246446, + "grad_norm": 0.13607212832145432, + "learning_rate": 2.668539478148546e-07, + "loss": 2.7271, + "step": 46896 + }, + { + "epoch": 2.911229747346204, + "grad_norm": 0.13754743386436427, + "learning_rate": 2.6648144816653896e-07, + "loss": 2.7537, + "step": 46897 + }, + { + "epoch": 2.911291824445962, + "grad_norm": 0.13187777243994836, + "learning_rate": 2.661092079900618e-07, + "loss": 2.6931, + "step": 46898 + }, + { + "epoch": 2.91135390154572, + "grad_norm": 0.13875953160444143, + "learning_rate": 2.657372272873715e-07, + "loss": 2.6529, + "step": 46899 + }, + { + "epoch": 2.9114159786454774, + "grad_norm": 0.13121369064000177, + "learning_rate": 2.653655060603999e-07, + "loss": 2.6536, + "step": 46900 + }, + { + "epoch": 2.9114780557452358, + "grad_norm": 0.13963769991481936, + "learning_rate": 2.6499404431108434e-07, + "loss": 2.7177, + "step": 46901 + }, + { + "epoch": 2.9115401328449932, + "grad_norm": 0.12880811423674457, + "learning_rate": 2.646228420413732e-07, + "loss": 2.6555, + "step": 46902 + }, + { + "epoch": 2.9116022099447516, + "grad_norm": 0.12771580558212825, + "learning_rate": 2.642518992531984e-07, + "loss": 2.638, + "step": 46903 + }, + { + "epoch": 2.911664287044509, + "grad_norm": 0.1405157950018617, + "learning_rate": 2.6388121594849714e-07, + "loss": 2.6544, + "step": 46904 + }, + { + "epoch": 2.9117263641442674, + "grad_norm": 0.1363120296344224, + "learning_rate": 2.635107921292013e-07, + "loss": 2.6499, + "step": 46905 + }, + { + "epoch": 2.911788441244025, + "grad_norm": 0.1299675328146569, + "learning_rate": 2.63140627797237e-07, + "loss": 2.5901, + "step": 46906 + }, + { + "epoch": 2.911850518343783, + "grad_norm": 0.12778636679196803, + "learning_rate": 2.627707229545473e-07, + "loss": 2.6673, + "step": 46907 + }, + { + "epoch": 2.9119125954435408, + "grad_norm": 0.13413395942296735, + "learning_rate": 2.624010776030583e-07, + "loss": 2.733, + "step": 46908 + }, + { + "epoch": 2.9119746725432987, + "grad_norm": 0.19319408750394435, + "learning_rate": 2.6203169174469635e-07, + "loss": 2.7347, + "step": 46909 + }, + { + "epoch": 2.9120367496430566, + "grad_norm": 0.13175061608574015, + "learning_rate": 2.6166256538138756e-07, + "loss": 2.824, + "step": 46910 + }, + { + "epoch": 2.9120988267428145, + "grad_norm": 0.1517877458388348, + "learning_rate": 2.612936985150638e-07, + "loss": 2.761, + "step": 46911 + }, + { + "epoch": 2.9121609038425724, + "grad_norm": 0.12847538668870678, + "learning_rate": 2.609250911476402e-07, + "loss": 2.7407, + "step": 46912 + }, + { + "epoch": 2.9122229809423303, + "grad_norm": 0.1470189259337229, + "learning_rate": 2.605567432810485e-07, + "loss": 2.772, + "step": 46913 + }, + { + "epoch": 2.9122850580420883, + "grad_norm": 0.14131696316412426, + "learning_rate": 2.601886549172039e-07, + "loss": 2.701, + "step": 46914 + }, + { + "epoch": 2.912347135141846, + "grad_norm": 0.1262379529370161, + "learning_rate": 2.598208260580326e-07, + "loss": 2.7419, + "step": 46915 + }, + { + "epoch": 2.912409212241604, + "grad_norm": 0.13606259383547503, + "learning_rate": 2.594532567054497e-07, + "loss": 2.6952, + "step": 46916 + }, + { + "epoch": 2.912471289341362, + "grad_norm": 0.1469982416435423, + "learning_rate": 2.5908594686137045e-07, + "loss": 2.8029, + "step": 46917 + }, + { + "epoch": 2.91253336644112, + "grad_norm": 0.12997832407568446, + "learning_rate": 2.5871889652772095e-07, + "loss": 2.6899, + "step": 46918 + }, + { + "epoch": 2.912595443540878, + "grad_norm": 0.13461790419803918, + "learning_rate": 2.5835210570641086e-07, + "loss": 2.6832, + "step": 46919 + }, + { + "epoch": 2.9126575206406358, + "grad_norm": 0.13636747601144475, + "learning_rate": 2.5798557439934425e-07, + "loss": 2.6851, + "step": 46920 + }, + { + "epoch": 2.9127195977403937, + "grad_norm": 0.20421359988944546, + "learning_rate": 2.5761930260844723e-07, + "loss": 2.7722, + "step": 46921 + }, + { + "epoch": 2.9127816748401516, + "grad_norm": 0.1500870439678167, + "learning_rate": 2.5725329033562953e-07, + "loss": 2.7285, + "step": 46922 + }, + { + "epoch": 2.9128437519399095, + "grad_norm": 0.12874924606269159, + "learning_rate": 2.5688753758279507e-07, + "loss": 2.6423, + "step": 46923 + }, + { + "epoch": 2.9129058290396674, + "grad_norm": 0.14710253303041945, + "learning_rate": 2.565220443518479e-07, + "loss": 2.7163, + "step": 46924 + }, + { + "epoch": 2.912967906139425, + "grad_norm": 0.1411069716788612, + "learning_rate": 2.5615681064470876e-07, + "loss": 2.7379, + "step": 46925 + }, + { + "epoch": 2.9130299832391833, + "grad_norm": 0.1276865388383134, + "learning_rate": 2.557918364632705e-07, + "loss": 2.6469, + "step": 46926 + }, + { + "epoch": 2.9130920603389407, + "grad_norm": 0.13203428676882437, + "learning_rate": 2.5542712180944283e-07, + "loss": 2.7546, + "step": 46927 + }, + { + "epoch": 2.913154137438699, + "grad_norm": 0.13230995157613942, + "learning_rate": 2.550626666851241e-07, + "loss": 2.6415, + "step": 46928 + }, + { + "epoch": 2.9132162145384566, + "grad_norm": 0.12900163993606262, + "learning_rate": 2.54698471092224e-07, + "loss": 2.5723, + "step": 46929 + }, + { + "epoch": 2.9132782916382145, + "grad_norm": 0.1362948658550306, + "learning_rate": 2.543345350326409e-07, + "loss": 2.7741, + "step": 46930 + }, + { + "epoch": 2.9133403687379724, + "grad_norm": 0.13242351142749897, + "learning_rate": 2.5397085850826787e-07, + "loss": 2.6862, + "step": 46931 + }, + { + "epoch": 2.9134024458377303, + "grad_norm": 0.14259119496469352, + "learning_rate": 2.5360744152100323e-07, + "loss": 2.6578, + "step": 46932 + }, + { + "epoch": 2.9134645229374883, + "grad_norm": 0.13286637868257659, + "learning_rate": 2.532442840727456e-07, + "loss": 2.7179, + "step": 46933 + }, + { + "epoch": 2.913526600037246, + "grad_norm": 0.12883774786797905, + "learning_rate": 2.5288138616539337e-07, + "loss": 2.734, + "step": 46934 + }, + { + "epoch": 2.913588677137004, + "grad_norm": 0.12659107815727466, + "learning_rate": 2.525187478008284e-07, + "loss": 2.6873, + "step": 46935 + }, + { + "epoch": 2.913650754236762, + "grad_norm": 0.12950672108375044, + "learning_rate": 2.5215636898095474e-07, + "loss": 2.7048, + "step": 46936 + }, + { + "epoch": 2.91371283133652, + "grad_norm": 0.13097275618913679, + "learning_rate": 2.517942497076542e-07, + "loss": 2.7648, + "step": 46937 + }, + { + "epoch": 2.913774908436278, + "grad_norm": 0.13611230545252498, + "learning_rate": 2.514323899828197e-07, + "loss": 2.6924, + "step": 46938 + }, + { + "epoch": 2.9138369855360358, + "grad_norm": 0.1440778075937536, + "learning_rate": 2.510707898083442e-07, + "loss": 2.6995, + "step": 46939 + }, + { + "epoch": 2.9138990626357937, + "grad_norm": 0.1294018077603752, + "learning_rate": 2.507094491861095e-07, + "loss": 2.6572, + "step": 46940 + }, + { + "epoch": 2.9139611397355516, + "grad_norm": 0.1454936859134815, + "learning_rate": 2.5034836811799743e-07, + "loss": 2.701, + "step": 46941 + }, + { + "epoch": 2.9140232168353095, + "grad_norm": 0.1454502606344986, + "learning_rate": 2.499875466058954e-07, + "loss": 2.7728, + "step": 46942 + }, + { + "epoch": 2.9140852939350674, + "grad_norm": 0.13065605595210916, + "learning_rate": 2.496269846516852e-07, + "loss": 2.6954, + "step": 46943 + }, + { + "epoch": 2.9141473710348254, + "grad_norm": 0.12909304926933016, + "learning_rate": 2.492666822572487e-07, + "loss": 2.7503, + "step": 46944 + }, + { + "epoch": 2.9142094481345833, + "grad_norm": 0.13025367821453096, + "learning_rate": 2.489066394244677e-07, + "loss": 2.6058, + "step": 46945 + }, + { + "epoch": 2.914271525234341, + "grad_norm": 0.13730659809112705, + "learning_rate": 2.4854685615521843e-07, + "loss": 2.6967, + "step": 46946 + }, + { + "epoch": 2.914333602334099, + "grad_norm": 0.15116655527656297, + "learning_rate": 2.481873324513717e-07, + "loss": 2.6723, + "step": 46947 + }, + { + "epoch": 2.9143956794338566, + "grad_norm": 0.14416170560518193, + "learning_rate": 2.478280683148149e-07, + "loss": 2.6205, + "step": 46948 + }, + { + "epoch": 2.914457756533615, + "grad_norm": 0.1457192977029454, + "learning_rate": 2.4746906374741305e-07, + "loss": 2.5968, + "step": 46949 + }, + { + "epoch": 2.9145198336333724, + "grad_norm": 0.13971327230303687, + "learning_rate": 2.4711031875104816e-07, + "loss": 2.7295, + "step": 46950 + }, + { + "epoch": 2.9145819107331308, + "grad_norm": 0.14239977433384002, + "learning_rate": 2.4675183332757976e-07, + "loss": 2.721, + "step": 46951 + }, + { + "epoch": 2.9146439878328883, + "grad_norm": 0.14275800775524936, + "learning_rate": 2.4639360747888975e-07, + "loss": 2.669, + "step": 46952 + }, + { + "epoch": 2.9147060649326466, + "grad_norm": 0.13696603890402834, + "learning_rate": 2.460356412068432e-07, + "loss": 2.6923, + "step": 46953 + }, + { + "epoch": 2.914768142032404, + "grad_norm": 0.139147259298636, + "learning_rate": 2.4567793451330534e-07, + "loss": 2.6949, + "step": 46954 + }, + { + "epoch": 2.914830219132162, + "grad_norm": 0.13332246597030661, + "learning_rate": 2.453204874001469e-07, + "loss": 2.6344, + "step": 46955 + }, + { + "epoch": 2.91489229623192, + "grad_norm": 0.14557690228264716, + "learning_rate": 2.44963299869222e-07, + "loss": 2.8023, + "step": 46956 + }, + { + "epoch": 2.914954373331678, + "grad_norm": 0.1381096321830336, + "learning_rate": 2.4460637192240674e-07, + "loss": 2.6821, + "step": 46957 + }, + { + "epoch": 2.9150164504314358, + "grad_norm": 0.12926011931642656, + "learning_rate": 2.4424970356156094e-07, + "loss": 2.698, + "step": 46958 + }, + { + "epoch": 2.9150785275311937, + "grad_norm": 0.13592921894983076, + "learning_rate": 2.4389329478853863e-07, + "loss": 2.7616, + "step": 46959 + }, + { + "epoch": 2.9151406046309516, + "grad_norm": 0.14970106721408774, + "learning_rate": 2.435371456052049e-07, + "loss": 2.6765, + "step": 46960 + }, + { + "epoch": 2.9152026817307095, + "grad_norm": 0.13511513569782932, + "learning_rate": 2.431812560134195e-07, + "loss": 2.6583, + "step": 46961 + }, + { + "epoch": 2.9152647588304674, + "grad_norm": 0.12952253380222103, + "learning_rate": 2.4282562601503634e-07, + "loss": 2.7159, + "step": 46962 + }, + { + "epoch": 2.9153268359302253, + "grad_norm": 0.13026222226481438, + "learning_rate": 2.424702556119096e-07, + "loss": 2.6768, + "step": 46963 + }, + { + "epoch": 2.9153889130299833, + "grad_norm": 0.13216679347752952, + "learning_rate": 2.4211514480589337e-07, + "loss": 2.7439, + "step": 46964 + }, + { + "epoch": 2.915450990129741, + "grad_norm": 0.14745625115836283, + "learning_rate": 2.4176029359884167e-07, + "loss": 2.6051, + "step": 46965 + }, + { + "epoch": 2.915513067229499, + "grad_norm": 0.127727341239525, + "learning_rate": 2.414057019926086e-07, + "loss": 2.6057, + "step": 46966 + }, + { + "epoch": 2.915575144329257, + "grad_norm": 0.1378996867695509, + "learning_rate": 2.410513699890371e-07, + "loss": 2.7002, + "step": 46967 + }, + { + "epoch": 2.915637221429015, + "grad_norm": 0.14877131847276137, + "learning_rate": 2.406972975899813e-07, + "loss": 2.7677, + "step": 46968 + }, + { + "epoch": 2.915699298528773, + "grad_norm": 0.1263990415137466, + "learning_rate": 2.4034348479728965e-07, + "loss": 2.5261, + "step": 46969 + }, + { + "epoch": 2.9157613756285308, + "grad_norm": 0.14187838363393665, + "learning_rate": 2.3998993161279957e-07, + "loss": 2.6299, + "step": 46970 + }, + { + "epoch": 2.9158234527282887, + "grad_norm": 0.12786313138345895, + "learning_rate": 2.396366380383652e-07, + "loss": 2.6751, + "step": 46971 + }, + { + "epoch": 2.9158855298280466, + "grad_norm": 0.12987184836783136, + "learning_rate": 2.392836040758295e-07, + "loss": 2.6758, + "step": 46972 + }, + { + "epoch": 2.915947606927804, + "grad_norm": 0.13266252717397323, + "learning_rate": 2.3893082972702984e-07, + "loss": 2.7216, + "step": 46973 + }, + { + "epoch": 2.9160096840275624, + "grad_norm": 0.13135166233008808, + "learning_rate": 2.385783149938037e-07, + "loss": 2.7102, + "step": 46974 + }, + { + "epoch": 2.91607176112732, + "grad_norm": 0.14117838008060551, + "learning_rate": 2.382260598779995e-07, + "loss": 2.747, + "step": 46975 + }, + { + "epoch": 2.9161338382270783, + "grad_norm": 0.13646124021938058, + "learning_rate": 2.3787406438144921e-07, + "loss": 2.6921, + "step": 46976 + }, + { + "epoch": 2.9161959153268358, + "grad_norm": 0.14489827722256843, + "learning_rate": 2.3752232850599022e-07, + "loss": 2.8103, + "step": 46977 + }, + { + "epoch": 2.9162579924265937, + "grad_norm": 0.12683672336984594, + "learning_rate": 2.3717085225345438e-07, + "loss": 2.6204, + "step": 46978 + }, + { + "epoch": 2.9163200695263516, + "grad_norm": 0.14311602125989467, + "learning_rate": 2.368196356256791e-07, + "loss": 2.7904, + "step": 46979 + }, + { + "epoch": 2.9163821466261095, + "grad_norm": 0.12746356172283124, + "learning_rate": 2.364686786244963e-07, + "loss": 2.701, + "step": 46980 + }, + { + "epoch": 2.9164442237258674, + "grad_norm": 0.13213775137133205, + "learning_rate": 2.3611798125173778e-07, + "loss": 2.7837, + "step": 46981 + }, + { + "epoch": 2.9165063008256253, + "grad_norm": 0.13023126082424194, + "learning_rate": 2.3576754350923547e-07, + "loss": 2.6435, + "step": 46982 + }, + { + "epoch": 2.9165683779253833, + "grad_norm": 0.13167590565599052, + "learning_rate": 2.354173653988101e-07, + "loss": 2.7059, + "step": 46983 + }, + { + "epoch": 2.916630455025141, + "grad_norm": 0.1310761989784409, + "learning_rate": 2.3506744692229354e-07, + "loss": 2.7158, + "step": 46984 + }, + { + "epoch": 2.916692532124899, + "grad_norm": 0.13664184178268066, + "learning_rate": 2.3471778808151212e-07, + "loss": 2.6481, + "step": 46985 + }, + { + "epoch": 2.916754609224657, + "grad_norm": 0.12673357539347385, + "learning_rate": 2.3436838887829214e-07, + "loss": 2.7759, + "step": 46986 + }, + { + "epoch": 2.916816686324415, + "grad_norm": 0.13597032874069093, + "learning_rate": 2.3401924931444886e-07, + "loss": 2.6923, + "step": 46987 + }, + { + "epoch": 2.916878763424173, + "grad_norm": 0.13146085009631778, + "learning_rate": 2.3367036939180853e-07, + "loss": 2.6764, + "step": 46988 + }, + { + "epoch": 2.9169408405239308, + "grad_norm": 0.1334566665057124, + "learning_rate": 2.3332174911219197e-07, + "loss": 2.7255, + "step": 46989 + }, + { + "epoch": 2.9170029176236887, + "grad_norm": 0.137833113739749, + "learning_rate": 2.3297338847741989e-07, + "loss": 2.7576, + "step": 46990 + }, + { + "epoch": 2.9170649947234466, + "grad_norm": 0.14162348215858764, + "learning_rate": 2.3262528748930202e-07, + "loss": 2.6975, + "step": 46991 + }, + { + "epoch": 2.9171270718232045, + "grad_norm": 0.13847291804415382, + "learning_rate": 2.3227744614966462e-07, + "loss": 2.7216, + "step": 46992 + }, + { + "epoch": 2.9171891489229624, + "grad_norm": 0.12972584180623103, + "learning_rate": 2.3192986446031185e-07, + "loss": 2.6782, + "step": 46993 + }, + { + "epoch": 2.9172512260227204, + "grad_norm": 0.13566712835940775, + "learning_rate": 2.3158254242306998e-07, + "loss": 2.7106, + "step": 46994 + }, + { + "epoch": 2.9173133031224783, + "grad_norm": 0.14366562766991467, + "learning_rate": 2.312354800397376e-07, + "loss": 2.7525, + "step": 46995 + }, + { + "epoch": 2.9173753802222357, + "grad_norm": 0.13111610281135974, + "learning_rate": 2.3088867731212994e-07, + "loss": 2.7492, + "step": 46996 + }, + { + "epoch": 2.917437457321994, + "grad_norm": 0.12964848483028163, + "learning_rate": 2.3054213424206217e-07, + "loss": 2.6376, + "step": 46997 + }, + { + "epoch": 2.9174995344217516, + "grad_norm": 0.12911424329291538, + "learning_rate": 2.3019585083133842e-07, + "loss": 2.6749, + "step": 46998 + }, + { + "epoch": 2.91756161152151, + "grad_norm": 0.13200427716588323, + "learning_rate": 2.2984982708176284e-07, + "loss": 2.8054, + "step": 46999 + }, + { + "epoch": 2.9176236886212674, + "grad_norm": 0.15382067315034495, + "learning_rate": 2.2950406299514505e-07, + "loss": 2.7168, + "step": 47000 + }, + { + "epoch": 2.917685765721026, + "grad_norm": 0.13863507978101408, + "learning_rate": 2.291585585732836e-07, + "loss": 2.649, + "step": 47001 + }, + { + "epoch": 2.9177478428207833, + "grad_norm": 0.1495797541306577, + "learning_rate": 2.2881331381798265e-07, + "loss": 2.6887, + "step": 47002 + }, + { + "epoch": 2.917809919920541, + "grad_norm": 0.1398296247839434, + "learning_rate": 2.2846832873105185e-07, + "loss": 2.7046, + "step": 47003 + }, + { + "epoch": 2.917871997020299, + "grad_norm": 0.1381125924850929, + "learning_rate": 2.2812360331427863e-07, + "loss": 2.7215, + "step": 47004 + }, + { + "epoch": 2.917934074120057, + "grad_norm": 0.133540556172035, + "learning_rate": 2.2777913756946712e-07, + "loss": 2.7459, + "step": 47005 + }, + { + "epoch": 2.917996151219815, + "grad_norm": 0.13211582990337628, + "learning_rate": 2.2743493149841588e-07, + "loss": 2.7458, + "step": 47006 + }, + { + "epoch": 2.918058228319573, + "grad_norm": 0.12758488087657185, + "learning_rate": 2.2709098510292348e-07, + "loss": 2.6448, + "step": 47007 + }, + { + "epoch": 2.9181203054193308, + "grad_norm": 0.1313841106212068, + "learning_rate": 2.2674729838477738e-07, + "loss": 2.6912, + "step": 47008 + }, + { + "epoch": 2.9181823825190887, + "grad_norm": 0.13153585260367945, + "learning_rate": 2.2640387134577058e-07, + "loss": 2.6977, + "step": 47009 + }, + { + "epoch": 2.9182444596188466, + "grad_norm": 0.14300477845496132, + "learning_rate": 2.2606070398769607e-07, + "loss": 2.7524, + "step": 47010 + }, + { + "epoch": 2.9183065367186045, + "grad_norm": 0.12922955198008618, + "learning_rate": 2.2571779631235246e-07, + "loss": 2.7223, + "step": 47011 + }, + { + "epoch": 2.9183686138183624, + "grad_norm": 0.12983559345282641, + "learning_rate": 2.2537514832152164e-07, + "loss": 2.6508, + "step": 47012 + }, + { + "epoch": 2.9184306909181204, + "grad_norm": 0.1308060651341062, + "learning_rate": 2.2503276001699103e-07, + "loss": 2.7096, + "step": 47013 + }, + { + "epoch": 2.9184927680178783, + "grad_norm": 0.1350117040015582, + "learning_rate": 2.2469063140054258e-07, + "loss": 2.726, + "step": 47014 + }, + { + "epoch": 2.918554845117636, + "grad_norm": 0.13056023411224918, + "learning_rate": 2.243487624739693e-07, + "loss": 2.6789, + "step": 47015 + }, + { + "epoch": 2.918616922217394, + "grad_norm": 0.13058919914524456, + "learning_rate": 2.2400715323905308e-07, + "loss": 2.7679, + "step": 47016 + }, + { + "epoch": 2.918678999317152, + "grad_norm": 0.12751802793841613, + "learning_rate": 2.2366580369757585e-07, + "loss": 2.709, + "step": 47017 + }, + { + "epoch": 2.91874107641691, + "grad_norm": 0.13428807995964004, + "learning_rate": 2.2332471385131948e-07, + "loss": 2.7556, + "step": 47018 + }, + { + "epoch": 2.918803153516668, + "grad_norm": 0.1317329216185672, + "learning_rate": 2.2298388370205482e-07, + "loss": 2.7542, + "step": 47019 + }, + { + "epoch": 2.9188652306164258, + "grad_norm": 0.1322103904627597, + "learning_rate": 2.2264331325157483e-07, + "loss": 2.7431, + "step": 47020 + }, + { + "epoch": 2.9189273077161833, + "grad_norm": 0.13069494079303484, + "learning_rate": 2.2230300250164482e-07, + "loss": 2.6181, + "step": 47021 + }, + { + "epoch": 2.9189893848159416, + "grad_norm": 0.1515992692706613, + "learning_rate": 2.2196295145404666e-07, + "loss": 2.7363, + "step": 47022 + }, + { + "epoch": 2.919051461915699, + "grad_norm": 0.1337300400123281, + "learning_rate": 2.2162316011054563e-07, + "loss": 2.7099, + "step": 47023 + }, + { + "epoch": 2.9191135390154574, + "grad_norm": 0.13402011856110077, + "learning_rate": 2.212836284729236e-07, + "loss": 2.7509, + "step": 47024 + }, + { + "epoch": 2.919175616115215, + "grad_norm": 0.12723610224693885, + "learning_rate": 2.2094435654295142e-07, + "loss": 2.6336, + "step": 47025 + }, + { + "epoch": 2.919237693214973, + "grad_norm": 0.1385471158880065, + "learning_rate": 2.2060534432239433e-07, + "loss": 2.613, + "step": 47026 + }, + { + "epoch": 2.9192997703147308, + "grad_norm": 0.13340043615181596, + "learning_rate": 2.2026659181302311e-07, + "loss": 2.758, + "step": 47027 + }, + { + "epoch": 2.9193618474144887, + "grad_norm": 0.1443304910006418, + "learning_rate": 2.199280990166086e-07, + "loss": 2.7236, + "step": 47028 + }, + { + "epoch": 2.9194239245142466, + "grad_norm": 0.12986809066286756, + "learning_rate": 2.1958986593491604e-07, + "loss": 2.7583, + "step": 47029 + }, + { + "epoch": 2.9194860016140045, + "grad_norm": 0.13458056113273587, + "learning_rate": 2.1925189256970514e-07, + "loss": 2.7563, + "step": 47030 + }, + { + "epoch": 2.9195480787137624, + "grad_norm": 0.13182146911348805, + "learning_rate": 2.1891417892274113e-07, + "loss": 2.8268, + "step": 47031 + }, + { + "epoch": 2.9196101558135203, + "grad_norm": 0.12905985177165705, + "learning_rate": 2.1857672499578376e-07, + "loss": 2.7067, + "step": 47032 + }, + { + "epoch": 2.9196722329132783, + "grad_norm": 0.13061492777280384, + "learning_rate": 2.1823953079059823e-07, + "loss": 2.6307, + "step": 47033 + }, + { + "epoch": 2.919734310013036, + "grad_norm": 0.13196259525028603, + "learning_rate": 2.1790259630893873e-07, + "loss": 2.7255, + "step": 47034 + }, + { + "epoch": 2.919796387112794, + "grad_norm": 0.13109569419674055, + "learning_rate": 2.175659215525705e-07, + "loss": 2.674, + "step": 47035 + }, + { + "epoch": 2.919858464212552, + "grad_norm": 0.14108715673526398, + "learning_rate": 2.172295065232477e-07, + "loss": 2.8044, + "step": 47036 + }, + { + "epoch": 2.91992054131231, + "grad_norm": 0.14281471428104056, + "learning_rate": 2.1689335122271893e-07, + "loss": 2.6431, + "step": 47037 + }, + { + "epoch": 2.919982618412068, + "grad_norm": 0.1395253526011613, + "learning_rate": 2.1655745565274943e-07, + "loss": 2.7465, + "step": 47038 + }, + { + "epoch": 2.9200446955118258, + "grad_norm": 0.13488105308783838, + "learning_rate": 2.1622181981508228e-07, + "loss": 2.6662, + "step": 47039 + }, + { + "epoch": 2.9201067726115837, + "grad_norm": 0.1461535214227957, + "learning_rate": 2.1588644371146604e-07, + "loss": 2.7004, + "step": 47040 + }, + { + "epoch": 2.9201688497113416, + "grad_norm": 0.12983539977921504, + "learning_rate": 2.1555132734366045e-07, + "loss": 2.7034, + "step": 47041 + }, + { + "epoch": 2.9202309268110995, + "grad_norm": 0.14870541011437002, + "learning_rate": 2.1521647071340855e-07, + "loss": 2.7166, + "step": 47042 + }, + { + "epoch": 2.9202930039108574, + "grad_norm": 0.146508030570822, + "learning_rate": 2.1488187382245895e-07, + "loss": 2.7531, + "step": 47043 + }, + { + "epoch": 2.920355081010615, + "grad_norm": 0.12818875907035965, + "learning_rate": 2.1454753667255468e-07, + "loss": 2.6188, + "step": 47044 + }, + { + "epoch": 2.9204171581103733, + "grad_norm": 0.1437045243105835, + "learning_rate": 2.1421345926544433e-07, + "loss": 2.7609, + "step": 47045 + }, + { + "epoch": 2.9204792352101308, + "grad_norm": 0.13213429069966542, + "learning_rate": 2.1387964160286543e-07, + "loss": 2.7387, + "step": 47046 + }, + { + "epoch": 2.920541312309889, + "grad_norm": 0.12908892980915712, + "learning_rate": 2.135460836865666e-07, + "loss": 2.7702, + "step": 47047 + }, + { + "epoch": 2.9206033894096466, + "grad_norm": 0.12956627298658943, + "learning_rate": 2.1321278551827972e-07, + "loss": 2.7005, + "step": 47048 + }, + { + "epoch": 2.920665466509405, + "grad_norm": 0.14417467723027788, + "learning_rate": 2.128797470997479e-07, + "loss": 2.6786, + "step": 47049 + }, + { + "epoch": 2.9207275436091624, + "grad_norm": 0.14066952424382273, + "learning_rate": 2.125469684327086e-07, + "loss": 2.6861, + "step": 47050 + }, + { + "epoch": 2.9207896207089203, + "grad_norm": 0.13112106774046026, + "learning_rate": 2.1221444951889936e-07, + "loss": 2.6988, + "step": 47051 + }, + { + "epoch": 2.9208516978086783, + "grad_norm": 0.1396348519819548, + "learning_rate": 2.1188219036005763e-07, + "loss": 2.6889, + "step": 47052 + }, + { + "epoch": 2.920913774908436, + "grad_norm": 0.13046111524252157, + "learning_rate": 2.1155019095790984e-07, + "loss": 2.7379, + "step": 47053 + }, + { + "epoch": 2.920975852008194, + "grad_norm": 0.14662200819720983, + "learning_rate": 2.1121845131418795e-07, + "loss": 2.7776, + "step": 47054 + }, + { + "epoch": 2.921037929107952, + "grad_norm": 0.13492991619522532, + "learning_rate": 2.108869714306294e-07, + "loss": 2.7823, + "step": 47055 + }, + { + "epoch": 2.92110000620771, + "grad_norm": 0.14720925585409747, + "learning_rate": 2.1055575130896066e-07, + "loss": 2.6433, + "step": 47056 + }, + { + "epoch": 2.921162083307468, + "grad_norm": 0.13135826323270042, + "learning_rate": 2.102247909509081e-07, + "loss": 2.7472, + "step": 47057 + }, + { + "epoch": 2.9212241604072258, + "grad_norm": 0.12769041179776328, + "learning_rate": 2.0989409035819808e-07, + "loss": 2.6624, + "step": 47058 + }, + { + "epoch": 2.9212862375069837, + "grad_norm": 0.13142867585281331, + "learning_rate": 2.0956364953256258e-07, + "loss": 2.6861, + "step": 47059 + }, + { + "epoch": 2.9213483146067416, + "grad_norm": 0.13432746645304638, + "learning_rate": 2.092334684757169e-07, + "loss": 2.7902, + "step": 47060 + }, + { + "epoch": 2.9214103917064995, + "grad_norm": 0.1293897879828771, + "learning_rate": 2.0890354718938744e-07, + "loss": 2.6489, + "step": 47061 + }, + { + "epoch": 2.9214724688062574, + "grad_norm": 0.13662863213640075, + "learning_rate": 2.08573885675295e-07, + "loss": 2.7226, + "step": 47062 + }, + { + "epoch": 2.9215345459060154, + "grad_norm": 0.13105627448404514, + "learning_rate": 2.082444839351605e-07, + "loss": 2.7206, + "step": 47063 + }, + { + "epoch": 2.9215966230057733, + "grad_norm": 0.14209733530332294, + "learning_rate": 2.0791534197070472e-07, + "loss": 2.8231, + "step": 47064 + }, + { + "epoch": 2.921658700105531, + "grad_norm": 0.1457091097586961, + "learning_rate": 2.0758645978363744e-07, + "loss": 2.7006, + "step": 47065 + }, + { + "epoch": 2.921720777205289, + "grad_norm": 0.13438544343686362, + "learning_rate": 2.072578373756795e-07, + "loss": 2.7006, + "step": 47066 + }, + { + "epoch": 2.921782854305047, + "grad_norm": 0.13138126808270906, + "learning_rate": 2.069294747485462e-07, + "loss": 2.6898, + "step": 47067 + }, + { + "epoch": 2.921844931404805, + "grad_norm": 0.14064376758801977, + "learning_rate": 2.066013719039528e-07, + "loss": 2.7737, + "step": 47068 + }, + { + "epoch": 2.9219070085045624, + "grad_norm": 0.12843296558144482, + "learning_rate": 2.0627352884360352e-07, + "loss": 2.7311, + "step": 47069 + }, + { + "epoch": 2.921969085604321, + "grad_norm": 0.13303895185290018, + "learning_rate": 2.0594594556921365e-07, + "loss": 2.7036, + "step": 47070 + }, + { + "epoch": 2.9220311627040783, + "grad_norm": 0.12818822870647217, + "learning_rate": 2.0561862208249295e-07, + "loss": 2.6634, + "step": 47071 + }, + { + "epoch": 2.9220932398038366, + "grad_norm": 0.1343906618017779, + "learning_rate": 2.0529155838514558e-07, + "loss": 2.7175, + "step": 47072 + }, + { + "epoch": 2.922155316903594, + "grad_norm": 0.1367952132981928, + "learning_rate": 2.0496475447888132e-07, + "loss": 2.712, + "step": 47073 + }, + { + "epoch": 2.922217394003352, + "grad_norm": 0.13241865159784263, + "learning_rate": 2.0463821036540433e-07, + "loss": 2.709, + "step": 47074 + }, + { + "epoch": 2.92227947110311, + "grad_norm": 0.14612173969989675, + "learning_rate": 2.043119260464188e-07, + "loss": 2.7696, + "step": 47075 + }, + { + "epoch": 2.922341548202868, + "grad_norm": 0.14323828986040263, + "learning_rate": 2.0398590152362896e-07, + "loss": 2.6543, + "step": 47076 + }, + { + "epoch": 2.9224036253026258, + "grad_norm": 0.12935503950800892, + "learning_rate": 2.0366013679872785e-07, + "loss": 2.7025, + "step": 47077 + }, + { + "epoch": 2.9224657024023837, + "grad_norm": 0.14350360168986268, + "learning_rate": 2.0333463187342528e-07, + "loss": 2.7146, + "step": 47078 + }, + { + "epoch": 2.9225277795021416, + "grad_norm": 0.13678402704745127, + "learning_rate": 2.0300938674941427e-07, + "loss": 2.7092, + "step": 47079 + }, + { + "epoch": 2.9225898566018995, + "grad_norm": 0.12731657089200749, + "learning_rate": 2.0268440142838796e-07, + "loss": 2.7208, + "step": 47080 + }, + { + "epoch": 2.9226519337016574, + "grad_norm": 0.13054917706161157, + "learning_rate": 2.0235967591204496e-07, + "loss": 2.7529, + "step": 47081 + }, + { + "epoch": 2.9227140108014154, + "grad_norm": 0.12838516976859576, + "learning_rate": 2.0203521020208395e-07, + "loss": 2.6117, + "step": 47082 + }, + { + "epoch": 2.9227760879011733, + "grad_norm": 0.1298004333712767, + "learning_rate": 2.017110043001924e-07, + "loss": 2.6121, + "step": 47083 + }, + { + "epoch": 2.922838165000931, + "grad_norm": 0.1317269404501878, + "learning_rate": 2.0138705820806348e-07, + "loss": 2.6235, + "step": 47084 + }, + { + "epoch": 2.922900242100689, + "grad_norm": 0.13613915202897015, + "learning_rate": 2.0106337192739022e-07, + "loss": 2.6388, + "step": 47085 + }, + { + "epoch": 2.922962319200447, + "grad_norm": 0.1520522274960416, + "learning_rate": 2.0073994545985463e-07, + "loss": 2.6785, + "step": 47086 + }, + { + "epoch": 2.923024396300205, + "grad_norm": 0.13132078753913154, + "learning_rate": 2.0041677880714982e-07, + "loss": 2.7454, + "step": 47087 + }, + { + "epoch": 2.923086473399963, + "grad_norm": 0.13018518401039494, + "learning_rate": 2.0009387197095776e-07, + "loss": 2.6398, + "step": 47088 + }, + { + "epoch": 2.9231485504997208, + "grad_norm": 0.1476543244735871, + "learning_rate": 1.997712249529715e-07, + "loss": 2.6754, + "step": 47089 + }, + { + "epoch": 2.9232106275994787, + "grad_norm": 0.12944265946033898, + "learning_rate": 1.9944883775486202e-07, + "loss": 2.7105, + "step": 47090 + }, + { + "epoch": 2.9232727046992366, + "grad_norm": 0.13055661748971076, + "learning_rate": 1.9912671037831677e-07, + "loss": 2.7081, + "step": 47091 + }, + { + "epoch": 2.923334781798994, + "grad_norm": 0.12934650753544413, + "learning_rate": 1.9880484282502332e-07, + "loss": 2.7114, + "step": 47092 + }, + { + "epoch": 2.9233968588987524, + "grad_norm": 0.13327507497786797, + "learning_rate": 1.9848323509665257e-07, + "loss": 2.6623, + "step": 47093 + }, + { + "epoch": 2.92345893599851, + "grad_norm": 0.14487750872924038, + "learning_rate": 1.9816188719488093e-07, + "loss": 2.6758, + "step": 47094 + }, + { + "epoch": 2.9235210130982683, + "grad_norm": 0.13781164544244973, + "learning_rate": 1.9784079912139043e-07, + "loss": 2.7247, + "step": 47095 + }, + { + "epoch": 2.9235830901980258, + "grad_norm": 0.1270231420667326, + "learning_rate": 1.975199708778519e-07, + "loss": 2.7527, + "step": 47096 + }, + { + "epoch": 2.923645167297784, + "grad_norm": 0.13813477587303752, + "learning_rate": 1.9719940246594737e-07, + "loss": 2.7495, + "step": 47097 + }, + { + "epoch": 2.9237072443975416, + "grad_norm": 0.14351349843068886, + "learning_rate": 1.9687909388734215e-07, + "loss": 2.6905, + "step": 47098 + }, + { + "epoch": 2.9237693214972995, + "grad_norm": 0.1367577973872168, + "learning_rate": 1.965590451437016e-07, + "loss": 2.7058, + "step": 47099 + }, + { + "epoch": 2.9238313985970574, + "grad_norm": 0.12918431462068922, + "learning_rate": 1.962392562367077e-07, + "loss": 2.6081, + "step": 47100 + }, + { + "epoch": 2.9238934756968153, + "grad_norm": 0.13292307317948648, + "learning_rate": 1.9591972716802575e-07, + "loss": 2.6892, + "step": 47101 + }, + { + "epoch": 2.9239555527965733, + "grad_norm": 0.14050845111301982, + "learning_rate": 1.9560045793932113e-07, + "loss": 2.7333, + "step": 47102 + }, + { + "epoch": 2.924017629896331, + "grad_norm": 0.1283091457507323, + "learning_rate": 1.9528144855225915e-07, + "loss": 2.669, + "step": 47103 + }, + { + "epoch": 2.924079706996089, + "grad_norm": 0.13788968429028417, + "learning_rate": 1.9496269900849962e-07, + "loss": 2.7636, + "step": 47104 + }, + { + "epoch": 2.924141784095847, + "grad_norm": 0.14274739316215213, + "learning_rate": 1.9464420930971338e-07, + "loss": 2.6338, + "step": 47105 + }, + { + "epoch": 2.924203861195605, + "grad_norm": 0.13077917060885266, + "learning_rate": 1.9432597945756026e-07, + "loss": 2.7278, + "step": 47106 + }, + { + "epoch": 2.924265938295363, + "grad_norm": 0.14476092235710822, + "learning_rate": 1.9400800945370002e-07, + "loss": 2.7504, + "step": 47107 + }, + { + "epoch": 2.9243280153951208, + "grad_norm": 0.13701313205677962, + "learning_rate": 1.9369029929978689e-07, + "loss": 2.7262, + "step": 47108 + }, + { + "epoch": 2.9243900924948787, + "grad_norm": 0.13270257860873202, + "learning_rate": 1.933728489974862e-07, + "loss": 2.6256, + "step": 47109 + }, + { + "epoch": 2.9244521695946366, + "grad_norm": 0.1377680971664303, + "learning_rate": 1.9305565854844664e-07, + "loss": 2.6522, + "step": 47110 + }, + { + "epoch": 2.9245142466943945, + "grad_norm": 0.1430856997659527, + "learning_rate": 1.9273872795432802e-07, + "loss": 2.7587, + "step": 47111 + }, + { + "epoch": 2.9245763237941524, + "grad_norm": 0.1303803444198532, + "learning_rate": 1.9242205721678453e-07, + "loss": 2.6669, + "step": 47112 + }, + { + "epoch": 2.9246384008939104, + "grad_norm": 0.13484604136265058, + "learning_rate": 1.9210564633746486e-07, + "loss": 2.8655, + "step": 47113 + }, + { + "epoch": 2.9247004779936683, + "grad_norm": 0.1437929602366475, + "learning_rate": 1.9178949531802325e-07, + "loss": 2.6203, + "step": 47114 + }, + { + "epoch": 2.924762555093426, + "grad_norm": 0.14303873725465352, + "learning_rate": 1.9147360416010284e-07, + "loss": 2.6726, + "step": 47115 + }, + { + "epoch": 2.924824632193184, + "grad_norm": 0.1328690422535609, + "learning_rate": 1.911579728653634e-07, + "loss": 2.633, + "step": 47116 + }, + { + "epoch": 2.9248867092929416, + "grad_norm": 0.12890394526646534, + "learning_rate": 1.9084260143543697e-07, + "loss": 2.644, + "step": 47117 + }, + { + "epoch": 2.9249487863927, + "grad_norm": 0.13121019849827475, + "learning_rate": 1.9052748987197777e-07, + "loss": 2.8081, + "step": 47118 + }, + { + "epoch": 2.9250108634924574, + "grad_norm": 0.14324797737980483, + "learning_rate": 1.9021263817663447e-07, + "loss": 2.6653, + "step": 47119 + }, + { + "epoch": 2.925072940592216, + "grad_norm": 0.13011928771954265, + "learning_rate": 1.8989804635103913e-07, + "loss": 2.7247, + "step": 47120 + }, + { + "epoch": 2.9251350176919733, + "grad_norm": 0.14617786128122706, + "learning_rate": 1.8958371439683486e-07, + "loss": 2.7095, + "step": 47121 + }, + { + "epoch": 2.925197094791731, + "grad_norm": 0.12921635578248425, + "learning_rate": 1.892696423156648e-07, + "loss": 2.6453, + "step": 47122 + }, + { + "epoch": 2.925259171891489, + "grad_norm": 0.12905057861282657, + "learning_rate": 1.8895583010917207e-07, + "loss": 2.6711, + "step": 47123 + }, + { + "epoch": 2.925321248991247, + "grad_norm": 0.1360179309660453, + "learning_rate": 1.8864227777898314e-07, + "loss": 2.731, + "step": 47124 + }, + { + "epoch": 2.925383326091005, + "grad_norm": 0.13423579873773211, + "learning_rate": 1.8832898532674115e-07, + "loss": 2.7084, + "step": 47125 + }, + { + "epoch": 2.925445403190763, + "grad_norm": 0.1368411333827471, + "learning_rate": 1.8801595275408367e-07, + "loss": 2.7251, + "step": 47126 + }, + { + "epoch": 2.9255074802905208, + "grad_norm": 0.14386269878559663, + "learning_rate": 1.8770318006263167e-07, + "loss": 2.6455, + "step": 47127 + }, + { + "epoch": 2.9255695573902787, + "grad_norm": 0.15756611262405068, + "learning_rate": 1.873906672540282e-07, + "loss": 2.6786, + "step": 47128 + }, + { + "epoch": 2.9256316344900366, + "grad_norm": 0.1282798401340414, + "learning_rate": 1.8707841432989982e-07, + "loss": 2.6632, + "step": 47129 + }, + { + "epoch": 2.9256937115897945, + "grad_norm": 0.12814372117807324, + "learning_rate": 1.8676642129187849e-07, + "loss": 2.6048, + "step": 47130 + }, + { + "epoch": 2.9257557886895524, + "grad_norm": 0.15488573170204287, + "learning_rate": 1.8645468814158517e-07, + "loss": 2.7298, + "step": 47131 + }, + { + "epoch": 2.9258178657893104, + "grad_norm": 0.13650450268199074, + "learning_rate": 1.861432148806519e-07, + "loss": 2.6553, + "step": 47132 + }, + { + "epoch": 2.9258799428890683, + "grad_norm": 0.13145583423827517, + "learning_rate": 1.8583200151070512e-07, + "loss": 2.656, + "step": 47133 + }, + { + "epoch": 2.925942019988826, + "grad_norm": 0.1327449753150693, + "learning_rate": 1.8552104803336023e-07, + "loss": 2.6546, + "step": 47134 + }, + { + "epoch": 2.926004097088584, + "grad_norm": 0.14904788280489858, + "learning_rate": 1.8521035445024925e-07, + "loss": 2.6794, + "step": 47135 + }, + { + "epoch": 2.926066174188342, + "grad_norm": 0.13030844336766068, + "learning_rate": 1.8489992076298755e-07, + "loss": 2.7355, + "step": 47136 + }, + { + "epoch": 2.9261282512881, + "grad_norm": 0.1364756534652488, + "learning_rate": 1.845897469731961e-07, + "loss": 2.677, + "step": 47137 + }, + { + "epoch": 2.926190328387858, + "grad_norm": 0.13091942846823817, + "learning_rate": 1.842798330824902e-07, + "loss": 2.6993, + "step": 47138 + }, + { + "epoch": 2.926252405487616, + "grad_norm": 0.14924475483208022, + "learning_rate": 1.8397017909249635e-07, + "loss": 2.6523, + "step": 47139 + }, + { + "epoch": 2.9263144825873733, + "grad_norm": 0.12839436767460002, + "learning_rate": 1.8366078500481886e-07, + "loss": 2.5914, + "step": 47140 + }, + { + "epoch": 2.9263765596871316, + "grad_norm": 0.12857642025709248, + "learning_rate": 1.833516508210731e-07, + "loss": 2.6729, + "step": 47141 + }, + { + "epoch": 2.926438636786889, + "grad_norm": 0.1388283907405304, + "learning_rate": 1.8304277654287995e-07, + "loss": 2.6721, + "step": 47142 + }, + { + "epoch": 2.9265007138866475, + "grad_norm": 0.14798019890566394, + "learning_rate": 1.827341621718437e-07, + "loss": 2.6648, + "step": 47143 + }, + { + "epoch": 2.926562790986405, + "grad_norm": 0.13169647186777358, + "learning_rate": 1.8242580770957973e-07, + "loss": 2.738, + "step": 47144 + }, + { + "epoch": 2.9266248680861633, + "grad_norm": 0.13073221120164116, + "learning_rate": 1.8211771315769232e-07, + "loss": 2.6985, + "step": 47145 + }, + { + "epoch": 2.9266869451859208, + "grad_norm": 0.14135360712626094, + "learning_rate": 1.8180987851779129e-07, + "loss": 2.7698, + "step": 47146 + }, + { + "epoch": 2.9267490222856787, + "grad_norm": 0.1281270986478172, + "learning_rate": 1.8150230379148092e-07, + "loss": 2.7098, + "step": 47147 + }, + { + "epoch": 2.9268110993854366, + "grad_norm": 0.12749648091658233, + "learning_rate": 1.8119498898036547e-07, + "loss": 2.6013, + "step": 47148 + }, + { + "epoch": 2.9268731764851945, + "grad_norm": 0.1285607229836939, + "learning_rate": 1.808879340860492e-07, + "loss": 2.6766, + "step": 47149 + }, + { + "epoch": 2.9269352535849524, + "grad_norm": 0.13715779109905168, + "learning_rate": 1.8058113911014197e-07, + "loss": 2.6753, + "step": 47150 + }, + { + "epoch": 2.9269973306847104, + "grad_norm": 0.13340427586972917, + "learning_rate": 1.8027460405423135e-07, + "loss": 2.6386, + "step": 47151 + }, + { + "epoch": 2.9270594077844683, + "grad_norm": 0.1374229619024948, + "learning_rate": 1.7996832891992165e-07, + "loss": 2.8032, + "step": 47152 + }, + { + "epoch": 2.927121484884226, + "grad_norm": 0.13153816786446695, + "learning_rate": 1.796623137088116e-07, + "loss": 2.7257, + "step": 47153 + }, + { + "epoch": 2.927183561983984, + "grad_norm": 0.12758801967936959, + "learning_rate": 1.7935655842249987e-07, + "loss": 2.667, + "step": 47154 + }, + { + "epoch": 2.927245639083742, + "grad_norm": 0.1309060824719393, + "learning_rate": 1.7905106306257413e-07, + "loss": 2.7234, + "step": 47155 + }, + { + "epoch": 2.9273077161835, + "grad_norm": 0.12944039305774405, + "learning_rate": 1.7874582763063862e-07, + "loss": 2.7154, + "step": 47156 + }, + { + "epoch": 2.927369793283258, + "grad_norm": 0.1294644940367559, + "learning_rate": 1.78440852128281e-07, + "loss": 2.7395, + "step": 47157 + }, + { + "epoch": 2.9274318703830158, + "grad_norm": 0.13041820460723885, + "learning_rate": 1.7813613655708882e-07, + "loss": 2.6327, + "step": 47158 + }, + { + "epoch": 2.9274939474827737, + "grad_norm": 0.13391998595580118, + "learning_rate": 1.7783168091865533e-07, + "loss": 2.6958, + "step": 47159 + }, + { + "epoch": 2.9275560245825316, + "grad_norm": 0.14035342404505655, + "learning_rate": 1.7752748521457363e-07, + "loss": 2.7631, + "step": 47160 + }, + { + "epoch": 2.9276181016822895, + "grad_norm": 0.13141611860506092, + "learning_rate": 1.7722354944642028e-07, + "loss": 2.6882, + "step": 47161 + }, + { + "epoch": 2.9276801787820474, + "grad_norm": 0.1298022199425472, + "learning_rate": 1.7691987361578844e-07, + "loss": 2.6349, + "step": 47162 + }, + { + "epoch": 2.9277422558818054, + "grad_norm": 0.13644178811323232, + "learning_rate": 1.7661645772426017e-07, + "loss": 2.7599, + "step": 47163 + }, + { + "epoch": 2.9278043329815633, + "grad_norm": 0.12725213856951656, + "learning_rate": 1.763133017734231e-07, + "loss": 2.5859, + "step": 47164 + }, + { + "epoch": 2.9278664100813208, + "grad_norm": 0.12834514249952292, + "learning_rate": 1.7601040576484819e-07, + "loss": 2.6802, + "step": 47165 + }, + { + "epoch": 2.927928487181079, + "grad_norm": 0.12557514316331198, + "learning_rate": 1.7570776970012303e-07, + "loss": 2.6532, + "step": 47166 + }, + { + "epoch": 2.9279905642808366, + "grad_norm": 0.14084969556602575, + "learning_rate": 1.7540539358082976e-07, + "loss": 2.656, + "step": 47167 + }, + { + "epoch": 2.928052641380595, + "grad_norm": 0.13037540842981635, + "learning_rate": 1.7510327740853927e-07, + "loss": 2.7724, + "step": 47168 + }, + { + "epoch": 2.9281147184803524, + "grad_norm": 0.14977355792746294, + "learning_rate": 1.7480142118482812e-07, + "loss": 2.719, + "step": 47169 + }, + { + "epoch": 2.9281767955801103, + "grad_norm": 0.1322441210562309, + "learning_rate": 1.7449982491127836e-07, + "loss": 2.6568, + "step": 47170 + }, + { + "epoch": 2.9282388726798683, + "grad_norm": 0.12906957885180867, + "learning_rate": 1.7419848858945542e-07, + "loss": 2.6828, + "step": 47171 + }, + { + "epoch": 2.928300949779626, + "grad_norm": 0.12917564880637467, + "learning_rate": 1.738974122209358e-07, + "loss": 2.7794, + "step": 47172 + }, + { + "epoch": 2.928363026879384, + "grad_norm": 0.13694968474398145, + "learning_rate": 1.7359659580729048e-07, + "loss": 2.6729, + "step": 47173 + }, + { + "epoch": 2.928425103979142, + "grad_norm": 0.14143611845419735, + "learning_rate": 1.7329603935008486e-07, + "loss": 2.6815, + "step": 47174 + }, + { + "epoch": 2.9284871810789, + "grad_norm": 0.13355142832079367, + "learning_rate": 1.7299574285088992e-07, + "loss": 2.7251, + "step": 47175 + }, + { + "epoch": 2.928549258178658, + "grad_norm": 0.13232982102812896, + "learning_rate": 1.7269570631127663e-07, + "loss": 2.6304, + "step": 47176 + }, + { + "epoch": 2.9286113352784158, + "grad_norm": 0.12932282386230548, + "learning_rate": 1.723959297327993e-07, + "loss": 2.6549, + "step": 47177 + }, + { + "epoch": 2.9286734123781737, + "grad_norm": 0.15631627822894756, + "learning_rate": 1.720964131170344e-07, + "loss": 2.7469, + "step": 47178 + }, + { + "epoch": 2.9287354894779316, + "grad_norm": 0.13370098221829294, + "learning_rate": 1.717971564655363e-07, + "loss": 2.76, + "step": 47179 + }, + { + "epoch": 2.9287975665776895, + "grad_norm": 0.14325811931558322, + "learning_rate": 1.7149815977986484e-07, + "loss": 2.7624, + "step": 47180 + }, + { + "epoch": 2.9288596436774474, + "grad_norm": 0.13686180124049585, + "learning_rate": 1.7119942306158543e-07, + "loss": 2.7054, + "step": 47181 + }, + { + "epoch": 2.9289217207772054, + "grad_norm": 0.13585952791684688, + "learning_rate": 1.7090094631225795e-07, + "loss": 2.6676, + "step": 47182 + }, + { + "epoch": 2.9289837978769633, + "grad_norm": 0.13567356048911242, + "learning_rate": 1.706027295334367e-07, + "loss": 2.6643, + "step": 47183 + }, + { + "epoch": 2.929045874976721, + "grad_norm": 0.13382007072471877, + "learning_rate": 1.70304772726676e-07, + "loss": 2.7398, + "step": 47184 + }, + { + "epoch": 2.929107952076479, + "grad_norm": 0.13550238387581548, + "learning_rate": 1.7000707589353016e-07, + "loss": 2.7505, + "step": 47185 + }, + { + "epoch": 2.929170029176237, + "grad_norm": 0.12889689353740938, + "learning_rate": 1.6970963903555348e-07, + "loss": 2.6819, + "step": 47186 + }, + { + "epoch": 2.929232106275995, + "grad_norm": 0.14168925365318535, + "learning_rate": 1.6941246215430584e-07, + "loss": 2.7004, + "step": 47187 + }, + { + "epoch": 2.9292941833757524, + "grad_norm": 0.12924931132494666, + "learning_rate": 1.691155452513249e-07, + "loss": 2.7115, + "step": 47188 + }, + { + "epoch": 2.929356260475511, + "grad_norm": 0.13128013718970394, + "learning_rate": 1.6881888832816496e-07, + "loss": 2.7794, + "step": 47189 + }, + { + "epoch": 2.9294183375752683, + "grad_norm": 0.13555664994670066, + "learning_rate": 1.6852249138636922e-07, + "loss": 2.6704, + "step": 47190 + }, + { + "epoch": 2.9294804146750266, + "grad_norm": 0.1426038980051895, + "learning_rate": 1.6822635442749756e-07, + "loss": 2.6621, + "step": 47191 + }, + { + "epoch": 2.929542491774784, + "grad_norm": 0.1370880316834528, + "learning_rate": 1.679304774530821e-07, + "loss": 2.755, + "step": 47192 + }, + { + "epoch": 2.9296045688745425, + "grad_norm": 0.12864619089635457, + "learning_rate": 1.6763486046467158e-07, + "loss": 2.6772, + "step": 47193 + }, + { + "epoch": 2.9296666459743, + "grad_norm": 0.14015135871364343, + "learning_rate": 1.6733950346380368e-07, + "loss": 2.7604, + "step": 47194 + }, + { + "epoch": 2.929728723074058, + "grad_norm": 0.14337743468699288, + "learning_rate": 1.6704440645202158e-07, + "loss": 2.6531, + "step": 47195 + }, + { + "epoch": 2.9297908001738158, + "grad_norm": 0.12621827556703225, + "learning_rate": 1.6674956943087405e-07, + "loss": 2.6682, + "step": 47196 + }, + { + "epoch": 2.9298528772735737, + "grad_norm": 0.13887051338258444, + "learning_rate": 1.6645499240188766e-07, + "loss": 2.6841, + "step": 47197 + }, + { + "epoch": 2.9299149543733316, + "grad_norm": 0.14038399734307008, + "learning_rate": 1.6616067536660008e-07, + "loss": 2.7225, + "step": 47198 + }, + { + "epoch": 2.9299770314730895, + "grad_norm": 0.13865689075804236, + "learning_rate": 1.6586661832654894e-07, + "loss": 2.6921, + "step": 47199 + }, + { + "epoch": 2.9300391085728474, + "grad_norm": 0.1583966143370291, + "learning_rate": 1.655728212832719e-07, + "loss": 2.8209, + "step": 47200 + }, + { + "epoch": 2.9301011856726054, + "grad_norm": 0.13101527904949337, + "learning_rate": 1.6527928423830108e-07, + "loss": 2.7102, + "step": 47201 + }, + { + "epoch": 2.9301632627723633, + "grad_norm": 0.1356974191734374, + "learning_rate": 1.6498600719316858e-07, + "loss": 2.6672, + "step": 47202 + }, + { + "epoch": 2.930225339872121, + "grad_norm": 0.12834862552008292, + "learning_rate": 1.6469299014939543e-07, + "loss": 2.5989, + "step": 47203 + }, + { + "epoch": 2.930287416971879, + "grad_norm": 0.1333452646559699, + "learning_rate": 1.6440023310851926e-07, + "loss": 2.7214, + "step": 47204 + }, + { + "epoch": 2.930349494071637, + "grad_norm": 0.1323393781360986, + "learning_rate": 1.6410773607206665e-07, + "loss": 2.741, + "step": 47205 + }, + { + "epoch": 2.930411571171395, + "grad_norm": 0.14775222152082285, + "learning_rate": 1.6381549904156412e-07, + "loss": 2.6417, + "step": 47206 + }, + { + "epoch": 2.930473648271153, + "grad_norm": 0.12835264538881602, + "learning_rate": 1.6352352201852717e-07, + "loss": 2.6828, + "step": 47207 + }, + { + "epoch": 2.930535725370911, + "grad_norm": 0.13201137455199627, + "learning_rate": 1.6323180500449343e-07, + "loss": 2.6819, + "step": 47208 + }, + { + "epoch": 2.9305978024706687, + "grad_norm": 0.12781299481377736, + "learning_rate": 1.629403480009728e-07, + "loss": 2.6223, + "step": 47209 + }, + { + "epoch": 2.9306598795704266, + "grad_norm": 0.1364960559787823, + "learning_rate": 1.6264915100949186e-07, + "loss": 2.7079, + "step": 47210 + }, + { + "epoch": 2.930721956670184, + "grad_norm": 0.15479084209023136, + "learning_rate": 1.6235821403157158e-07, + "loss": 2.6015, + "step": 47211 + }, + { + "epoch": 2.9307840337699425, + "grad_norm": 0.14337117928699686, + "learning_rate": 1.620675370687219e-07, + "loss": 2.718, + "step": 47212 + }, + { + "epoch": 2.9308461108697, + "grad_norm": 0.13257606582301293, + "learning_rate": 1.6177712012246936e-07, + "loss": 2.835, + "step": 47213 + }, + { + "epoch": 2.9309081879694583, + "grad_norm": 0.14004785597897182, + "learning_rate": 1.6148696319432388e-07, + "loss": 2.7665, + "step": 47214 + }, + { + "epoch": 2.9309702650692158, + "grad_norm": 0.13395523277074112, + "learning_rate": 1.6119706628579533e-07, + "loss": 2.6492, + "step": 47215 + }, + { + "epoch": 2.931032342168974, + "grad_norm": 0.13340283075250284, + "learning_rate": 1.6090742939840475e-07, + "loss": 2.6519, + "step": 47216 + }, + { + "epoch": 2.9310944192687316, + "grad_norm": 0.13215409494818173, + "learning_rate": 1.6061805253365648e-07, + "loss": 2.6599, + "step": 47217 + }, + { + "epoch": 2.9311564963684895, + "grad_norm": 0.14225703493691705, + "learning_rate": 1.603289356930604e-07, + "loss": 2.6842, + "step": 47218 + }, + { + "epoch": 2.9312185734682474, + "grad_norm": 0.1278876169931061, + "learning_rate": 1.60040078878132e-07, + "loss": 2.582, + "step": 47219 + }, + { + "epoch": 2.9312806505680054, + "grad_norm": 0.12994865670181374, + "learning_rate": 1.597514820903756e-07, + "loss": 2.7031, + "step": 47220 + }, + { + "epoch": 2.9313427276677633, + "grad_norm": 0.13224221253858243, + "learning_rate": 1.5946314533129004e-07, + "loss": 2.7661, + "step": 47221 + }, + { + "epoch": 2.931404804767521, + "grad_norm": 0.13155827422615615, + "learning_rate": 1.591750686023852e-07, + "loss": 2.6545, + "step": 47222 + }, + { + "epoch": 2.931466881867279, + "grad_norm": 0.1397767644423037, + "learning_rate": 1.588872519051654e-07, + "loss": 2.6865, + "step": 47223 + }, + { + "epoch": 2.931528958967037, + "grad_norm": 0.13183553483689592, + "learning_rate": 1.5859969524112395e-07, + "loss": 2.6466, + "step": 47224 + }, + { + "epoch": 2.931591036066795, + "grad_norm": 0.1406714375963776, + "learning_rate": 1.5831239861177628e-07, + "loss": 2.7516, + "step": 47225 + }, + { + "epoch": 2.931653113166553, + "grad_norm": 0.1369224121333958, + "learning_rate": 1.5802536201860453e-07, + "loss": 2.699, + "step": 47226 + }, + { + "epoch": 2.9317151902663108, + "grad_norm": 0.12919102626208662, + "learning_rate": 1.5773858546311858e-07, + "loss": 2.6213, + "step": 47227 + }, + { + "epoch": 2.9317772673660687, + "grad_norm": 0.13139233307959267, + "learning_rate": 1.5745206894681174e-07, + "loss": 2.7268, + "step": 47228 + }, + { + "epoch": 2.9318393444658266, + "grad_norm": 0.13359246113919493, + "learning_rate": 1.5716581247117168e-07, + "loss": 2.7075, + "step": 47229 + }, + { + "epoch": 2.9319014215655845, + "grad_norm": 0.13551796431684682, + "learning_rate": 1.5687981603769718e-07, + "loss": 2.69, + "step": 47230 + }, + { + "epoch": 2.9319634986653424, + "grad_norm": 0.13125635540425182, + "learning_rate": 1.5659407964788153e-07, + "loss": 2.6936, + "step": 47231 + }, + { + "epoch": 2.9320255757651004, + "grad_norm": 0.14060015591401467, + "learning_rate": 1.5630860330321796e-07, + "loss": 2.6985, + "step": 47232 + }, + { + "epoch": 2.9320876528648583, + "grad_norm": 0.13277561713436015, + "learning_rate": 1.5602338700519413e-07, + "loss": 2.7447, + "step": 47233 + }, + { + "epoch": 2.932149729964616, + "grad_norm": 0.14830014499350475, + "learning_rate": 1.5573843075529225e-07, + "loss": 2.7338, + "step": 47234 + }, + { + "epoch": 2.932211807064374, + "grad_norm": 0.13888952475309516, + "learning_rate": 1.5545373455500002e-07, + "loss": 2.6333, + "step": 47235 + }, + { + "epoch": 2.9322738841641316, + "grad_norm": 0.13272356112764191, + "learning_rate": 1.5516929840581063e-07, + "loss": 2.6308, + "step": 47236 + }, + { + "epoch": 2.93233596126389, + "grad_norm": 0.13169218632651994, + "learning_rate": 1.5488512230920627e-07, + "loss": 2.7023, + "step": 47237 + }, + { + "epoch": 2.9323980383636474, + "grad_norm": 0.14762737027065367, + "learning_rate": 1.5460120626665796e-07, + "loss": 2.6785, + "step": 47238 + }, + { + "epoch": 2.932460115463406, + "grad_norm": 0.14651994909020313, + "learning_rate": 1.5431755027966455e-07, + "loss": 2.715, + "step": 47239 + }, + { + "epoch": 2.9325221925631633, + "grad_norm": 0.1297666345588047, + "learning_rate": 1.5403415434969148e-07, + "loss": 2.5687, + "step": 47240 + }, + { + "epoch": 2.932584269662921, + "grad_norm": 0.1312617620238696, + "learning_rate": 1.5375101847822093e-07, + "loss": 2.7343, + "step": 47241 + }, + { + "epoch": 2.932646346762679, + "grad_norm": 0.1416236334075716, + "learning_rate": 1.5346814266673504e-07, + "loss": 2.7546, + "step": 47242 + }, + { + "epoch": 2.932708423862437, + "grad_norm": 0.13030384772550607, + "learning_rate": 1.531855269167104e-07, + "loss": 2.6484, + "step": 47243 + }, + { + "epoch": 2.932770500962195, + "grad_norm": 0.13220257111796177, + "learning_rate": 1.5290317122961251e-07, + "loss": 2.6877, + "step": 47244 + }, + { + "epoch": 2.932832578061953, + "grad_norm": 0.1408185620139602, + "learning_rate": 1.5262107560691796e-07, + "loss": 2.6628, + "step": 47245 + }, + { + "epoch": 2.9328946551617108, + "grad_norm": 0.14466487661306596, + "learning_rate": 1.5233924005010337e-07, + "loss": 2.7617, + "step": 47246 + }, + { + "epoch": 2.9329567322614687, + "grad_norm": 0.13982430967606924, + "learning_rate": 1.520576645606342e-07, + "loss": 2.6579, + "step": 47247 + }, + { + "epoch": 2.9330188093612266, + "grad_norm": 0.1359604651685065, + "learning_rate": 1.517763491399815e-07, + "loss": 2.7243, + "step": 47248 + }, + { + "epoch": 2.9330808864609845, + "grad_norm": 0.1391934070682622, + "learning_rate": 1.5149529378961637e-07, + "loss": 2.6547, + "step": 47249 + }, + { + "epoch": 2.9331429635607424, + "grad_norm": 0.13562554212101469, + "learning_rate": 1.5121449851099867e-07, + "loss": 2.7068, + "step": 47250 + }, + { + "epoch": 2.9332050406605004, + "grad_norm": 0.13006665548673102, + "learning_rate": 1.5093396330559394e-07, + "loss": 2.7894, + "step": 47251 + }, + { + "epoch": 2.9332671177602583, + "grad_norm": 0.14626166898277634, + "learning_rate": 1.5065368817487323e-07, + "loss": 2.7355, + "step": 47252 + }, + { + "epoch": 2.933329194860016, + "grad_norm": 0.14465377743161884, + "learning_rate": 1.5037367312028537e-07, + "loss": 2.8568, + "step": 47253 + }, + { + "epoch": 2.933391271959774, + "grad_norm": 0.1262855953256236, + "learning_rate": 1.5009391814330698e-07, + "loss": 2.7258, + "step": 47254 + }, + { + "epoch": 2.933453349059532, + "grad_norm": 0.13723926912111314, + "learning_rate": 1.4981442324538687e-07, + "loss": 2.7193, + "step": 47255 + }, + { + "epoch": 2.93351542615929, + "grad_norm": 0.14140631928626624, + "learning_rate": 1.4953518842799052e-07, + "loss": 2.7087, + "step": 47256 + }, + { + "epoch": 2.933577503259048, + "grad_norm": 0.13071833311712558, + "learning_rate": 1.4925621369256127e-07, + "loss": 2.777, + "step": 47257 + }, + { + "epoch": 2.933639580358806, + "grad_norm": 0.15534609767979252, + "learning_rate": 1.4897749904057567e-07, + "loss": 2.7114, + "step": 47258 + }, + { + "epoch": 2.9337016574585633, + "grad_norm": 0.12930780059199673, + "learning_rate": 1.486990444734715e-07, + "loss": 2.7807, + "step": 47259 + }, + { + "epoch": 2.9337637345583216, + "grad_norm": 0.13882217858180013, + "learning_rate": 1.4842084999270312e-07, + "loss": 2.7004, + "step": 47260 + }, + { + "epoch": 2.933825811658079, + "grad_norm": 0.15325894312060642, + "learning_rate": 1.4814291559972494e-07, + "loss": 2.6861, + "step": 47261 + }, + { + "epoch": 2.9338878887578375, + "grad_norm": 0.13280017178481981, + "learning_rate": 1.4786524129599133e-07, + "loss": 2.6737, + "step": 47262 + }, + { + "epoch": 2.933949965857595, + "grad_norm": 0.12975731860405662, + "learning_rate": 1.4758782708294561e-07, + "loss": 2.6814, + "step": 47263 + }, + { + "epoch": 2.9340120429573533, + "grad_norm": 0.168673903694692, + "learning_rate": 1.473106729620366e-07, + "loss": 2.7714, + "step": 47264 + }, + { + "epoch": 2.9340741200571108, + "grad_norm": 0.1277400569643615, + "learning_rate": 1.470337789347076e-07, + "loss": 2.6931, + "step": 47265 + }, + { + "epoch": 2.9341361971568687, + "grad_norm": 0.13490755099017632, + "learning_rate": 1.4675714500240744e-07, + "loss": 2.7027, + "step": 47266 + }, + { + "epoch": 2.9341982742566266, + "grad_norm": 0.1361200849365594, + "learning_rate": 1.464807711665739e-07, + "loss": 2.7409, + "step": 47267 + }, + { + "epoch": 2.9342603513563845, + "grad_norm": 0.1428994255630314, + "learning_rate": 1.4620465742865574e-07, + "loss": 2.7811, + "step": 47268 + }, + { + "epoch": 2.9343224284561424, + "grad_norm": 0.13284928850147454, + "learning_rate": 1.4592880379009077e-07, + "loss": 2.6233, + "step": 47269 + }, + { + "epoch": 2.9343845055559004, + "grad_norm": 0.1426411646129586, + "learning_rate": 1.456532102523167e-07, + "loss": 2.6598, + "step": 47270 + }, + { + "epoch": 2.9344465826556583, + "grad_norm": 0.14789824662113873, + "learning_rate": 1.4537787681677684e-07, + "loss": 2.6895, + "step": 47271 + }, + { + "epoch": 2.934508659755416, + "grad_norm": 0.14039841915268683, + "learning_rate": 1.451028034848978e-07, + "loss": 2.615, + "step": 47272 + }, + { + "epoch": 2.934570736855174, + "grad_norm": 0.1314796366095613, + "learning_rate": 1.4482799025812288e-07, + "loss": 2.7167, + "step": 47273 + }, + { + "epoch": 2.934632813954932, + "grad_norm": 0.13084215791087608, + "learning_rate": 1.4455343713787873e-07, + "loss": 2.6757, + "step": 47274 + }, + { + "epoch": 2.93469489105469, + "grad_norm": 0.1313082909002504, + "learning_rate": 1.4427914412560862e-07, + "loss": 2.7769, + "step": 47275 + }, + { + "epoch": 2.934756968154448, + "grad_norm": 0.12965168142326258, + "learning_rate": 1.4400511122273364e-07, + "loss": 2.7372, + "step": 47276 + }, + { + "epoch": 2.934819045254206, + "grad_norm": 0.13172777471928585, + "learning_rate": 1.4373133843068599e-07, + "loss": 2.743, + "step": 47277 + }, + { + "epoch": 2.9348811223539637, + "grad_norm": 0.13331742933784377, + "learning_rate": 1.4345782575089784e-07, + "loss": 2.7899, + "step": 47278 + }, + { + "epoch": 2.9349431994537216, + "grad_norm": 0.1289613201532934, + "learning_rate": 1.4318457318479028e-07, + "loss": 2.7267, + "step": 47279 + }, + { + "epoch": 2.9350052765534795, + "grad_norm": 0.1282081267469678, + "learning_rate": 1.429115807337955e-07, + "loss": 2.7084, + "step": 47280 + }, + { + "epoch": 2.9350673536532375, + "grad_norm": 0.12752000716020256, + "learning_rate": 1.4263884839932907e-07, + "loss": 2.6939, + "step": 47281 + }, + { + "epoch": 2.9351294307529954, + "grad_norm": 0.14229197724696158, + "learning_rate": 1.4236637618282312e-07, + "loss": 2.7787, + "step": 47282 + }, + { + "epoch": 2.9351915078527533, + "grad_norm": 0.12808791414709325, + "learning_rate": 1.420941640856932e-07, + "loss": 2.7834, + "step": 47283 + }, + { + "epoch": 2.9352535849525108, + "grad_norm": 0.13560125183687022, + "learning_rate": 1.418222121093604e-07, + "loss": 2.7102, + "step": 47284 + }, + { + "epoch": 2.935315662052269, + "grad_norm": 0.12748333909766005, + "learning_rate": 1.415505202552514e-07, + "loss": 2.7142, + "step": 47285 + }, + { + "epoch": 2.9353777391520266, + "grad_norm": 0.14697455724963226, + "learning_rate": 1.4127908852477056e-07, + "loss": 2.7243, + "step": 47286 + }, + { + "epoch": 2.935439816251785, + "grad_norm": 0.13102702887896184, + "learning_rate": 1.4100791691933902e-07, + "loss": 2.6266, + "step": 47287 + }, + { + "epoch": 2.9355018933515424, + "grad_norm": 0.14200605956978524, + "learning_rate": 1.4073700544037782e-07, + "loss": 2.6854, + "step": 47288 + }, + { + "epoch": 2.9355639704513004, + "grad_norm": 0.1305023918065125, + "learning_rate": 1.4046635408929143e-07, + "loss": 2.6441, + "step": 47289 + }, + { + "epoch": 2.9356260475510583, + "grad_norm": 0.1340011044059086, + "learning_rate": 1.4019596286750093e-07, + "loss": 2.7554, + "step": 47290 + }, + { + "epoch": 2.935688124650816, + "grad_norm": 0.13664863014436612, + "learning_rate": 1.3992583177640517e-07, + "loss": 2.6812, + "step": 47291 + }, + { + "epoch": 2.935750201750574, + "grad_norm": 0.13402165430895668, + "learning_rate": 1.3965596081742526e-07, + "loss": 2.7488, + "step": 47292 + }, + { + "epoch": 2.935812278850332, + "grad_norm": 0.13591593319168532, + "learning_rate": 1.3938634999196565e-07, + "loss": 2.6657, + "step": 47293 + }, + { + "epoch": 2.93587435595009, + "grad_norm": 0.12609123215649123, + "learning_rate": 1.3911699930142518e-07, + "loss": 2.6293, + "step": 47294 + }, + { + "epoch": 2.935936433049848, + "grad_norm": 0.12950776381905052, + "learning_rate": 1.3884790874721942e-07, + "loss": 2.753, + "step": 47295 + }, + { + "epoch": 2.935998510149606, + "grad_norm": 0.13045968035622155, + "learning_rate": 1.3857907833074723e-07, + "loss": 2.6706, + "step": 47296 + }, + { + "epoch": 2.9360605872493637, + "grad_norm": 0.12770266445510473, + "learning_rate": 1.3831050805341307e-07, + "loss": 2.6515, + "step": 47297 + }, + { + "epoch": 2.9361226643491216, + "grad_norm": 0.13142328318629584, + "learning_rate": 1.3804219791661576e-07, + "loss": 2.7038, + "step": 47298 + }, + { + "epoch": 2.9361847414488795, + "grad_norm": 0.13026894271394154, + "learning_rate": 1.377741479217598e-07, + "loss": 2.6438, + "step": 47299 + }, + { + "epoch": 2.9362468185486374, + "grad_norm": 0.13954883971842452, + "learning_rate": 1.3750635807023848e-07, + "loss": 2.7746, + "step": 47300 + }, + { + "epoch": 2.9363088956483954, + "grad_norm": 0.13170677498292072, + "learning_rate": 1.3723882836345624e-07, + "loss": 2.6602, + "step": 47301 + }, + { + "epoch": 2.9363709727481533, + "grad_norm": 0.1451679395130519, + "learning_rate": 1.3697155880279533e-07, + "loss": 2.6875, + "step": 47302 + }, + { + "epoch": 2.936433049847911, + "grad_norm": 0.1432173911947257, + "learning_rate": 1.3670454938966015e-07, + "loss": 2.7658, + "step": 47303 + }, + { + "epoch": 2.936495126947669, + "grad_norm": 0.15148303613174338, + "learning_rate": 1.3643780012544405e-07, + "loss": 2.8148, + "step": 47304 + }, + { + "epoch": 2.936557204047427, + "grad_norm": 0.14656164674368197, + "learning_rate": 1.3617131101154035e-07, + "loss": 2.7265, + "step": 47305 + }, + { + "epoch": 2.936619281147185, + "grad_norm": 0.13539853952809103, + "learning_rate": 1.3590508204933128e-07, + "loss": 2.6983, + "step": 47306 + }, + { + "epoch": 2.9366813582469424, + "grad_norm": 0.15123191449512452, + "learning_rate": 1.3563911324021018e-07, + "loss": 2.6941, + "step": 47307 + }, + { + "epoch": 2.936743435346701, + "grad_norm": 0.1285232069219694, + "learning_rate": 1.353734045855648e-07, + "loss": 2.6278, + "step": 47308 + }, + { + "epoch": 2.9368055124464583, + "grad_norm": 0.13270733680854982, + "learning_rate": 1.3510795608678294e-07, + "loss": 2.6147, + "step": 47309 + }, + { + "epoch": 2.9368675895462166, + "grad_norm": 0.12848036667923335, + "learning_rate": 1.348427677452524e-07, + "loss": 2.643, + "step": 47310 + }, + { + "epoch": 2.936929666645974, + "grad_norm": 0.13699736815686447, + "learning_rate": 1.3457783956234427e-07, + "loss": 2.6168, + "step": 47311 + }, + { + "epoch": 2.9369917437457325, + "grad_norm": 0.13194461863977522, + "learning_rate": 1.343131715394519e-07, + "loss": 2.7639, + "step": 47312 + }, + { + "epoch": 2.93705382084549, + "grad_norm": 0.12695883224569973, + "learning_rate": 1.3404876367795193e-07, + "loss": 2.6092, + "step": 47313 + }, + { + "epoch": 2.937115897945248, + "grad_norm": 0.14446791988301816, + "learning_rate": 1.3378461597922664e-07, + "loss": 2.6829, + "step": 47314 + }, + { + "epoch": 2.9371779750450058, + "grad_norm": 0.13900851637750447, + "learning_rate": 1.3352072844465269e-07, + "loss": 2.6752, + "step": 47315 + }, + { + "epoch": 2.9372400521447637, + "grad_norm": 0.1359972170403036, + "learning_rate": 1.3325710107560673e-07, + "loss": 2.6544, + "step": 47316 + }, + { + "epoch": 2.9373021292445216, + "grad_norm": 0.13097839494123606, + "learning_rate": 1.3299373387346547e-07, + "loss": 2.5853, + "step": 47317 + }, + { + "epoch": 2.9373642063442795, + "grad_norm": 0.1299432026142471, + "learning_rate": 1.3273062683959448e-07, + "loss": 2.6942, + "step": 47318 + }, + { + "epoch": 2.9374262834440374, + "grad_norm": 0.14408900931391783, + "learning_rate": 1.324677799753815e-07, + "loss": 2.6349, + "step": 47319 + }, + { + "epoch": 2.9374883605437954, + "grad_norm": 0.13356395913515462, + "learning_rate": 1.3220519328218661e-07, + "loss": 2.6908, + "step": 47320 + }, + { + "epoch": 2.9375504376435533, + "grad_norm": 0.1462244588911865, + "learning_rate": 1.319428667613809e-07, + "loss": 2.6634, + "step": 47321 + }, + { + "epoch": 2.937612514743311, + "grad_norm": 0.1322286549322322, + "learning_rate": 1.3168080041434106e-07, + "loss": 2.6717, + "step": 47322 + }, + { + "epoch": 2.937674591843069, + "grad_norm": 0.12604814166653433, + "learning_rate": 1.3141899424242154e-07, + "loss": 2.5612, + "step": 47323 + }, + { + "epoch": 2.937736668942827, + "grad_norm": 0.12726725082576665, + "learning_rate": 1.3115744824699904e-07, + "loss": 2.6774, + "step": 47324 + }, + { + "epoch": 2.937798746042585, + "grad_norm": 0.13660218174057434, + "learning_rate": 1.308961624294336e-07, + "loss": 2.6687, + "step": 47325 + }, + { + "epoch": 2.937860823142343, + "grad_norm": 0.14329935579291553, + "learning_rate": 1.3063513679109073e-07, + "loss": 2.6967, + "step": 47326 + }, + { + "epoch": 2.937922900242101, + "grad_norm": 0.131713124751278, + "learning_rate": 1.3037437133333052e-07, + "loss": 2.755, + "step": 47327 + }, + { + "epoch": 2.9379849773418587, + "grad_norm": 0.13858298689971346, + "learning_rate": 1.301138660575074e-07, + "loss": 2.7696, + "step": 47328 + }, + { + "epoch": 2.9380470544416166, + "grad_norm": 0.13714336805283053, + "learning_rate": 1.2985362096499255e-07, + "loss": 2.7923, + "step": 47329 + }, + { + "epoch": 2.9381091315413745, + "grad_norm": 0.13803473363612423, + "learning_rate": 1.2959363605713483e-07, + "loss": 2.6855, + "step": 47330 + }, + { + "epoch": 2.9381712086411325, + "grad_norm": 0.13691186209310366, + "learning_rate": 1.2933391133529427e-07, + "loss": 2.6932, + "step": 47331 + }, + { + "epoch": 2.93823328574089, + "grad_norm": 0.1293557306811448, + "learning_rate": 1.2907444680082538e-07, + "loss": 2.6609, + "step": 47332 + }, + { + "epoch": 2.9382953628406483, + "grad_norm": 0.130319513697063, + "learning_rate": 1.288152424550826e-07, + "loss": 2.6316, + "step": 47333 + }, + { + "epoch": 2.9383574399404058, + "grad_norm": 0.12829905614043008, + "learning_rate": 1.2855629829941484e-07, + "loss": 2.6779, + "step": 47334 + }, + { + "epoch": 2.938419517040164, + "grad_norm": 0.1343776674893006, + "learning_rate": 1.2829761433518216e-07, + "loss": 2.7465, + "step": 47335 + }, + { + "epoch": 2.9384815941399216, + "grad_norm": 0.1348092105430559, + "learning_rate": 1.2803919056372237e-07, + "loss": 2.6683, + "step": 47336 + }, + { + "epoch": 2.9385436712396795, + "grad_norm": 0.14032291045576017, + "learning_rate": 1.2778102698638994e-07, + "loss": 2.6697, + "step": 47337 + }, + { + "epoch": 2.9386057483394374, + "grad_norm": 0.13523258633370053, + "learning_rate": 1.2752312360452823e-07, + "loss": 2.6895, + "step": 47338 + }, + { + "epoch": 2.9386678254391954, + "grad_norm": 0.1357908984817828, + "learning_rate": 1.2726548041948615e-07, + "loss": 2.7206, + "step": 47339 + }, + { + "epoch": 2.9387299025389533, + "grad_norm": 0.13842143022556416, + "learning_rate": 1.2700809743261266e-07, + "loss": 2.7655, + "step": 47340 + }, + { + "epoch": 2.938791979638711, + "grad_norm": 0.14185202579556044, + "learning_rate": 1.2675097464524e-07, + "loss": 2.7128, + "step": 47341 + }, + { + "epoch": 2.938854056738469, + "grad_norm": 0.13750879357135729, + "learning_rate": 1.264941120587171e-07, + "loss": 2.807, + "step": 47342 + }, + { + "epoch": 2.938916133838227, + "grad_norm": 0.1270415365035576, + "learning_rate": 1.262375096743762e-07, + "loss": 2.694, + "step": 47343 + }, + { + "epoch": 2.938978210937985, + "grad_norm": 0.13091913680588876, + "learning_rate": 1.2598116749356625e-07, + "loss": 2.7004, + "step": 47344 + }, + { + "epoch": 2.939040288037743, + "grad_norm": 0.1335571813448165, + "learning_rate": 1.2572508551761952e-07, + "loss": 2.6863, + "step": 47345 + }, + { + "epoch": 2.939102365137501, + "grad_norm": 0.13104687253843547, + "learning_rate": 1.2546926374787382e-07, + "loss": 2.6901, + "step": 47346 + }, + { + "epoch": 2.9391644422372587, + "grad_norm": 0.132576915823121, + "learning_rate": 1.2521370218566142e-07, + "loss": 2.6728, + "step": 47347 + }, + { + "epoch": 2.9392265193370166, + "grad_norm": 0.13780964507824298, + "learning_rate": 1.2495840083231458e-07, + "loss": 2.7242, + "step": 47348 + }, + { + "epoch": 2.9392885964367745, + "grad_norm": 0.1284215803327474, + "learning_rate": 1.2470335968917113e-07, + "loss": 2.6441, + "step": 47349 + }, + { + "epoch": 2.9393506735365325, + "grad_norm": 0.13086068451638974, + "learning_rate": 1.2444857875755224e-07, + "loss": 2.6544, + "step": 47350 + }, + { + "epoch": 2.9394127506362904, + "grad_norm": 0.13064723825979285, + "learning_rate": 1.241940580387957e-07, + "loss": 2.8265, + "step": 47351 + }, + { + "epoch": 2.9394748277360483, + "grad_norm": 0.13720516461497517, + "learning_rate": 1.2393979753422825e-07, + "loss": 2.7132, + "step": 47352 + }, + { + "epoch": 2.939536904835806, + "grad_norm": 0.13546200522073443, + "learning_rate": 1.2368579724517103e-07, + "loss": 2.7712, + "step": 47353 + }, + { + "epoch": 2.939598981935564, + "grad_norm": 0.12914954691793398, + "learning_rate": 1.2343205717295636e-07, + "loss": 2.732, + "step": 47354 + }, + { + "epoch": 2.9396610590353216, + "grad_norm": 0.13184481696849706, + "learning_rate": 1.2317857731890536e-07, + "loss": 2.7115, + "step": 47355 + }, + { + "epoch": 2.93972313613508, + "grad_norm": 0.13174132024767649, + "learning_rate": 1.2292535768433366e-07, + "loss": 2.7475, + "step": 47356 + }, + { + "epoch": 2.9397852132348374, + "grad_norm": 0.13694180276945367, + "learning_rate": 1.2267239827057352e-07, + "loss": 2.704, + "step": 47357 + }, + { + "epoch": 2.939847290334596, + "grad_norm": 0.13236422479458423, + "learning_rate": 1.2241969907894059e-07, + "loss": 2.6991, + "step": 47358 + }, + { + "epoch": 2.9399093674343533, + "grad_norm": 0.14627170383359328, + "learning_rate": 1.221672601107504e-07, + "loss": 2.8383, + "step": 47359 + }, + { + "epoch": 2.9399714445341116, + "grad_norm": 0.12816319739443033, + "learning_rate": 1.2191508136732422e-07, + "loss": 2.7301, + "step": 47360 + }, + { + "epoch": 2.940033521633869, + "grad_norm": 0.1306344062781412, + "learning_rate": 1.2166316284997205e-07, + "loss": 2.6468, + "step": 47361 + }, + { + "epoch": 2.940095598733627, + "grad_norm": 0.13013550549542302, + "learning_rate": 1.2141150456000948e-07, + "loss": 2.7116, + "step": 47362 + }, + { + "epoch": 2.940157675833385, + "grad_norm": 0.14360847244086986, + "learning_rate": 1.2116010649875774e-07, + "loss": 2.7468, + "step": 47363 + }, + { + "epoch": 2.940219752933143, + "grad_norm": 0.12916290136073905, + "learning_rate": 1.2090896866751578e-07, + "loss": 2.658, + "step": 47364 + }, + { + "epoch": 2.940281830032901, + "grad_norm": 0.13238459262005625, + "learning_rate": 1.2065809106760472e-07, + "loss": 2.7967, + "step": 47365 + }, + { + "epoch": 2.9403439071326587, + "grad_norm": 0.13937594096973766, + "learning_rate": 1.2040747370032911e-07, + "loss": 2.7081, + "step": 47366 + }, + { + "epoch": 2.9404059842324166, + "grad_norm": 0.12989691607547385, + "learning_rate": 1.2015711656699346e-07, + "loss": 2.6832, + "step": 47367 + }, + { + "epoch": 2.9404680613321745, + "grad_norm": 0.13709515799474792, + "learning_rate": 1.1990701966890783e-07, + "loss": 2.7276, + "step": 47368 + }, + { + "epoch": 2.9405301384319324, + "grad_norm": 0.1318882449897606, + "learning_rate": 1.196571830073767e-07, + "loss": 2.7101, + "step": 47369 + }, + { + "epoch": 2.9405922155316904, + "grad_norm": 0.12953203201057886, + "learning_rate": 1.1940760658370465e-07, + "loss": 2.685, + "step": 47370 + }, + { + "epoch": 2.9406542926314483, + "grad_norm": 0.13016794929496237, + "learning_rate": 1.1915829039919058e-07, + "loss": 2.7271, + "step": 47371 + }, + { + "epoch": 2.940716369731206, + "grad_norm": 0.13284588842885067, + "learning_rate": 1.1890923445513902e-07, + "loss": 2.7043, + "step": 47372 + }, + { + "epoch": 2.940778446830964, + "grad_norm": 0.1363745033926449, + "learning_rate": 1.1866043875284338e-07, + "loss": 2.6752, + "step": 47373 + }, + { + "epoch": 2.940840523930722, + "grad_norm": 0.12862421016849138, + "learning_rate": 1.1841190329360819e-07, + "loss": 2.6725, + "step": 47374 + }, + { + "epoch": 2.94090260103048, + "grad_norm": 0.1370696129703582, + "learning_rate": 1.1816362807872128e-07, + "loss": 2.7166, + "step": 47375 + }, + { + "epoch": 2.940964678130238, + "grad_norm": 0.1496422216149591, + "learning_rate": 1.1791561310949273e-07, + "loss": 2.6824, + "step": 47376 + }, + { + "epoch": 2.941026755229996, + "grad_norm": 0.1309142211304826, + "learning_rate": 1.176678583871993e-07, + "loss": 2.6513, + "step": 47377 + }, + { + "epoch": 2.9410888323297537, + "grad_norm": 0.1405046000576073, + "learning_rate": 1.1742036391315104e-07, + "loss": 2.7455, + "step": 47378 + }, + { + "epoch": 2.9411509094295116, + "grad_norm": 0.14790278042726115, + "learning_rate": 1.1717312968861916e-07, + "loss": 2.7306, + "step": 47379 + }, + { + "epoch": 2.941212986529269, + "grad_norm": 0.14161829356301767, + "learning_rate": 1.1692615571491372e-07, + "loss": 2.7299, + "step": 47380 + }, + { + "epoch": 2.9412750636290275, + "grad_norm": 0.13458720434775856, + "learning_rate": 1.166794419933115e-07, + "loss": 2.7616, + "step": 47381 + }, + { + "epoch": 2.941337140728785, + "grad_norm": 0.1314720996582298, + "learning_rate": 1.1643298852510032e-07, + "loss": 2.7103, + "step": 47382 + }, + { + "epoch": 2.9413992178285433, + "grad_norm": 0.13979100905740793, + "learning_rate": 1.1618679531156806e-07, + "loss": 2.6346, + "step": 47383 + }, + { + "epoch": 2.9414612949283008, + "grad_norm": 0.14397406789034464, + "learning_rate": 1.1594086235400259e-07, + "loss": 2.6506, + "step": 47384 + }, + { + "epoch": 2.9415233720280587, + "grad_norm": 0.13018090595022433, + "learning_rate": 1.1569518965368065e-07, + "loss": 2.6475, + "step": 47385 + }, + { + "epoch": 2.9415854491278166, + "grad_norm": 0.12903421004178262, + "learning_rate": 1.1544977721188454e-07, + "loss": 2.7275, + "step": 47386 + }, + { + "epoch": 2.9416475262275745, + "grad_norm": 0.1457435950561705, + "learning_rate": 1.1520462502990215e-07, + "loss": 2.7604, + "step": 47387 + }, + { + "epoch": 2.9417096033273324, + "grad_norm": 0.14147046696442786, + "learning_rate": 1.1495973310900465e-07, + "loss": 2.6865, + "step": 47388 + }, + { + "epoch": 2.9417716804270904, + "grad_norm": 0.13461652053757223, + "learning_rate": 1.1471510145046882e-07, + "loss": 2.666, + "step": 47389 + }, + { + "epoch": 2.9418337575268483, + "grad_norm": 0.13457213208168434, + "learning_rate": 1.1447073005558251e-07, + "loss": 2.7154, + "step": 47390 + }, + { + "epoch": 2.941895834626606, + "grad_norm": 0.16261478711862126, + "learning_rate": 1.1422661892560583e-07, + "loss": 2.7624, + "step": 47391 + }, + { + "epoch": 2.941957911726364, + "grad_norm": 0.13572527416436594, + "learning_rate": 1.1398276806182107e-07, + "loss": 2.546, + "step": 47392 + }, + { + "epoch": 2.942019988826122, + "grad_norm": 0.14144086598484307, + "learning_rate": 1.1373917746549944e-07, + "loss": 2.7233, + "step": 47393 + }, + { + "epoch": 2.94208206592588, + "grad_norm": 0.1367881871051154, + "learning_rate": 1.1349584713790663e-07, + "loss": 2.6857, + "step": 47394 + }, + { + "epoch": 2.942144143025638, + "grad_norm": 0.1383781075033908, + "learning_rate": 1.1325277708031934e-07, + "loss": 2.7802, + "step": 47395 + }, + { + "epoch": 2.942206220125396, + "grad_norm": 0.13821104201675732, + "learning_rate": 1.1300996729400326e-07, + "loss": 2.6631, + "step": 47396 + }, + { + "epoch": 2.9422682972251537, + "grad_norm": 0.13268308785910615, + "learning_rate": 1.1276741778022404e-07, + "loss": 2.7234, + "step": 47397 + }, + { + "epoch": 2.9423303743249116, + "grad_norm": 0.13625474255514794, + "learning_rate": 1.1252512854024733e-07, + "loss": 2.7481, + "step": 47398 + }, + { + "epoch": 2.9423924514246695, + "grad_norm": 0.1308411400459286, + "learning_rate": 1.1228309957533878e-07, + "loss": 2.6623, + "step": 47399 + }, + { + "epoch": 2.9424545285244275, + "grad_norm": 0.13215615272834105, + "learning_rate": 1.120413308867585e-07, + "loss": 2.7148, + "step": 47400 + }, + { + "epoch": 2.9425166056241854, + "grad_norm": 0.14216014918642114, + "learning_rate": 1.1179982247577214e-07, + "loss": 2.6549, + "step": 47401 + }, + { + "epoch": 2.9425786827239433, + "grad_norm": 0.12952778270898654, + "learning_rate": 1.1155857434363426e-07, + "loss": 2.7164, + "step": 47402 + }, + { + "epoch": 2.9426407598237008, + "grad_norm": 0.13253556859374652, + "learning_rate": 1.1131758649160495e-07, + "loss": 2.7968, + "step": 47403 + }, + { + "epoch": 2.942702836923459, + "grad_norm": 0.13148540236898454, + "learning_rate": 1.1107685892094432e-07, + "loss": 2.6846, + "step": 47404 + }, + { + "epoch": 2.9427649140232166, + "grad_norm": 0.1392184754357162, + "learning_rate": 1.1083639163290694e-07, + "loss": 2.6745, + "step": 47405 + }, + { + "epoch": 2.942826991122975, + "grad_norm": 0.13535742808921947, + "learning_rate": 1.1059618462874733e-07, + "loss": 2.7057, + "step": 47406 + }, + { + "epoch": 2.9428890682227324, + "grad_norm": 0.1295974939038702, + "learning_rate": 1.1035623790972005e-07, + "loss": 2.6624, + "step": 47407 + }, + { + "epoch": 2.942951145322491, + "grad_norm": 0.1468739922975912, + "learning_rate": 1.1011655147707412e-07, + "loss": 2.6797, + "step": 47408 + }, + { + "epoch": 2.9430132224222483, + "grad_norm": 0.12897901937501696, + "learning_rate": 1.0987712533205852e-07, + "loss": 2.7896, + "step": 47409 + }, + { + "epoch": 2.943075299522006, + "grad_norm": 0.13648048483769512, + "learning_rate": 1.096379594759278e-07, + "loss": 2.6555, + "step": 47410 + }, + { + "epoch": 2.943137376621764, + "grad_norm": 0.1334097141174437, + "learning_rate": 1.09399053909931e-07, + "loss": 2.7434, + "step": 47411 + }, + { + "epoch": 2.943199453721522, + "grad_norm": 0.13301185064025545, + "learning_rate": 1.0916040863530596e-07, + "loss": 2.6709, + "step": 47412 + }, + { + "epoch": 2.94326153082128, + "grad_norm": 0.12722302137628944, + "learning_rate": 1.0892202365330173e-07, + "loss": 2.6867, + "step": 47413 + }, + { + "epoch": 2.943323607921038, + "grad_norm": 0.1396856858521839, + "learning_rate": 1.0868389896516729e-07, + "loss": 2.594, + "step": 47414 + }, + { + "epoch": 2.943385685020796, + "grad_norm": 0.13040652133045907, + "learning_rate": 1.0844603457213498e-07, + "loss": 2.6972, + "step": 47415 + }, + { + "epoch": 2.9434477621205537, + "grad_norm": 0.13849934066433048, + "learning_rate": 1.0820843047545382e-07, + "loss": 2.6927, + "step": 47416 + }, + { + "epoch": 2.9435098392203116, + "grad_norm": 0.14868078247223548, + "learning_rate": 1.0797108667636169e-07, + "loss": 2.641, + "step": 47417 + }, + { + "epoch": 2.9435719163200695, + "grad_norm": 0.13368781637383503, + "learning_rate": 1.077340031760965e-07, + "loss": 2.6587, + "step": 47418 + }, + { + "epoch": 2.9436339934198275, + "grad_norm": 0.13361963974904348, + "learning_rate": 1.0749717997589059e-07, + "loss": 2.7078, + "step": 47419 + }, + { + "epoch": 2.9436960705195854, + "grad_norm": 0.14203223800945025, + "learning_rate": 1.0726061707698743e-07, + "loss": 2.759, + "step": 47420 + }, + { + "epoch": 2.9437581476193433, + "grad_norm": 0.12994109545472288, + "learning_rate": 1.0702431448061378e-07, + "loss": 2.6946, + "step": 47421 + }, + { + "epoch": 2.943820224719101, + "grad_norm": 0.14206752405165046, + "learning_rate": 1.0678827218801312e-07, + "loss": 2.5984, + "step": 47422 + }, + { + "epoch": 2.943882301818859, + "grad_norm": 0.13344164800562, + "learning_rate": 1.065524902004067e-07, + "loss": 2.5452, + "step": 47423 + }, + { + "epoch": 2.943944378918617, + "grad_norm": 0.14477421983628225, + "learning_rate": 1.0631696851902684e-07, + "loss": 2.7491, + "step": 47424 + }, + { + "epoch": 2.944006456018375, + "grad_norm": 0.13019670831920369, + "learning_rate": 1.0608170714510035e-07, + "loss": 2.7538, + "step": 47425 + }, + { + "epoch": 2.944068533118133, + "grad_norm": 0.1298036262223157, + "learning_rate": 1.0584670607986514e-07, + "loss": 2.6893, + "step": 47426 + }, + { + "epoch": 2.944130610217891, + "grad_norm": 0.13022435963867407, + "learning_rate": 1.0561196532453133e-07, + "loss": 2.6479, + "step": 47427 + }, + { + "epoch": 2.9441926873176483, + "grad_norm": 0.13010124253142208, + "learning_rate": 1.0537748488034239e-07, + "loss": 2.6595, + "step": 47428 + }, + { + "epoch": 2.9442547644174066, + "grad_norm": 0.13588007777428557, + "learning_rate": 1.051432647485029e-07, + "loss": 2.7461, + "step": 47429 + }, + { + "epoch": 2.944316841517164, + "grad_norm": 0.13431020854692272, + "learning_rate": 1.0490930493024521e-07, + "loss": 2.6944, + "step": 47430 + }, + { + "epoch": 2.9443789186169225, + "grad_norm": 0.13546204647167107, + "learning_rate": 1.0467560542679056e-07, + "loss": 2.7132, + "step": 47431 + }, + { + "epoch": 2.94444099571668, + "grad_norm": 0.12569686238660324, + "learning_rate": 1.0444216623935466e-07, + "loss": 2.5646, + "step": 47432 + }, + { + "epoch": 2.944503072816438, + "grad_norm": 0.12745284252081665, + "learning_rate": 1.0420898736915874e-07, + "loss": 2.7098, + "step": 47433 + }, + { + "epoch": 2.944565149916196, + "grad_norm": 0.13224471966310314, + "learning_rate": 1.0397606881741296e-07, + "loss": 2.7642, + "step": 47434 + }, + { + "epoch": 2.9446272270159537, + "grad_norm": 0.12941509299381806, + "learning_rate": 1.0374341058533854e-07, + "loss": 2.6564, + "step": 47435 + }, + { + "epoch": 2.9446893041157116, + "grad_norm": 0.13909497658688597, + "learning_rate": 1.035110126741512e-07, + "loss": 2.6613, + "step": 47436 + }, + { + "epoch": 2.9447513812154695, + "grad_norm": 0.14355618612324628, + "learning_rate": 1.0327887508505552e-07, + "loss": 2.6776, + "step": 47437 + }, + { + "epoch": 2.9448134583152275, + "grad_norm": 0.13646560803040084, + "learning_rate": 1.0304699781926719e-07, + "loss": 2.6651, + "step": 47438 + }, + { + "epoch": 2.9448755354149854, + "grad_norm": 0.13128581949057566, + "learning_rate": 1.0281538087799636e-07, + "loss": 2.7392, + "step": 47439 + }, + { + "epoch": 2.9449376125147433, + "grad_norm": 0.13585119193178627, + "learning_rate": 1.0258402426244762e-07, + "loss": 2.6901, + "step": 47440 + }, + { + "epoch": 2.944999689614501, + "grad_norm": 0.1358550309358126, + "learning_rate": 1.0235292797383667e-07, + "loss": 2.7088, + "step": 47441 + }, + { + "epoch": 2.945061766714259, + "grad_norm": 0.1342283957304962, + "learning_rate": 1.0212209201335699e-07, + "loss": 2.6804, + "step": 47442 + }, + { + "epoch": 2.945123843814017, + "grad_norm": 0.1412025024406248, + "learning_rate": 1.0189151638222427e-07, + "loss": 2.7007, + "step": 47443 + }, + { + "epoch": 2.945185920913775, + "grad_norm": 0.13209323500223077, + "learning_rate": 1.0166120108163757e-07, + "loss": 2.6159, + "step": 47444 + }, + { + "epoch": 2.945247998013533, + "grad_norm": 0.14289334476566823, + "learning_rate": 1.0143114611279591e-07, + "loss": 2.7358, + "step": 47445 + }, + { + "epoch": 2.945310075113291, + "grad_norm": 0.14701638569221, + "learning_rate": 1.0120135147689835e-07, + "loss": 2.7579, + "step": 47446 + }, + { + "epoch": 2.9453721522130487, + "grad_norm": 0.14309686199893865, + "learning_rate": 1.0097181717514947e-07, + "loss": 2.6342, + "step": 47447 + }, + { + "epoch": 2.9454342293128066, + "grad_norm": 0.13181821209462857, + "learning_rate": 1.0074254320874277e-07, + "loss": 2.733, + "step": 47448 + }, + { + "epoch": 2.9454963064125645, + "grad_norm": 0.13000910935672894, + "learning_rate": 1.0051352957887728e-07, + "loss": 2.6893, + "step": 47449 + }, + { + "epoch": 2.9455583835123225, + "grad_norm": 0.13516585034799292, + "learning_rate": 1.0028477628674648e-07, + "loss": 2.6889, + "step": 47450 + }, + { + "epoch": 2.94562046061208, + "grad_norm": 0.13639555605145498, + "learning_rate": 1.0005628333354389e-07, + "loss": 2.5668, + "step": 47451 + }, + { + "epoch": 2.9456825377118383, + "grad_norm": 0.1309113470466201, + "learning_rate": 9.982805072045742e-08, + "loss": 2.6821, + "step": 47452 + }, + { + "epoch": 2.9457446148115958, + "grad_norm": 0.13210829399925397, + "learning_rate": 9.960007844868613e-08, + "loss": 2.6755, + "step": 47453 + }, + { + "epoch": 2.945806691911354, + "grad_norm": 0.1364797137402862, + "learning_rate": 9.93723665194124e-08, + "loss": 2.6158, + "step": 47454 + }, + { + "epoch": 2.9458687690111116, + "grad_norm": 0.13806418796602293, + "learning_rate": 9.914491493382971e-08, + "loss": 2.7314, + "step": 47455 + }, + { + "epoch": 2.94593084611087, + "grad_norm": 0.13573577234491513, + "learning_rate": 9.891772369312047e-08, + "loss": 2.7039, + "step": 47456 + }, + { + "epoch": 2.9459929232106274, + "grad_norm": 0.13115451041197973, + "learning_rate": 9.869079279846705e-08, + "loss": 2.7188, + "step": 47457 + }, + { + "epoch": 2.9460550003103854, + "grad_norm": 0.1260821616982312, + "learning_rate": 9.846412225106294e-08, + "loss": 2.7178, + "step": 47458 + }, + { + "epoch": 2.9461170774101433, + "grad_norm": 0.13121845314690012, + "learning_rate": 9.823771205208498e-08, + "loss": 2.7178, + "step": 47459 + }, + { + "epoch": 2.946179154509901, + "grad_norm": 0.1313856062963215, + "learning_rate": 9.801156220271557e-08, + "loss": 2.7147, + "step": 47460 + }, + { + "epoch": 2.946241231609659, + "grad_norm": 0.14656436008013185, + "learning_rate": 9.778567270413153e-08, + "loss": 2.6782, + "step": 47461 + }, + { + "epoch": 2.946303308709417, + "grad_norm": 0.14660635002444408, + "learning_rate": 9.756004355751525e-08, + "loss": 2.6582, + "step": 47462 + }, + { + "epoch": 2.946365385809175, + "grad_norm": 0.13174792990333997, + "learning_rate": 9.733467476403801e-08, + "loss": 2.6025, + "step": 47463 + }, + { + "epoch": 2.946427462908933, + "grad_norm": 0.13157410936260724, + "learning_rate": 9.710956632488777e-08, + "loss": 2.6643, + "step": 47464 + }, + { + "epoch": 2.946489540008691, + "grad_norm": 0.13326420823743065, + "learning_rate": 9.688471824122469e-08, + "loss": 2.6047, + "step": 47465 + }, + { + "epoch": 2.9465516171084487, + "grad_norm": 0.14616125072278774, + "learning_rate": 9.666013051423118e-08, + "loss": 2.7705, + "step": 47466 + }, + { + "epoch": 2.9466136942082066, + "grad_norm": 0.13050506794431954, + "learning_rate": 9.64358031450785e-08, + "loss": 2.6497, + "step": 47467 + }, + { + "epoch": 2.9466757713079645, + "grad_norm": 0.13871231966825961, + "learning_rate": 9.62117361349324e-08, + "loss": 2.6799, + "step": 47468 + }, + { + "epoch": 2.9467378484077225, + "grad_norm": 0.13393895599999683, + "learning_rate": 9.598792948496415e-08, + "loss": 2.7326, + "step": 47469 + }, + { + "epoch": 2.9467999255074804, + "grad_norm": 0.13010963911632356, + "learning_rate": 9.576438319633951e-08, + "loss": 2.6993, + "step": 47470 + }, + { + "epoch": 2.9468620026072383, + "grad_norm": 0.13915796780120288, + "learning_rate": 9.554109727022976e-08, + "loss": 2.6717, + "step": 47471 + }, + { + "epoch": 2.946924079706996, + "grad_norm": 0.13823135009048287, + "learning_rate": 9.531807170779505e-08, + "loss": 2.7674, + "step": 47472 + }, + { + "epoch": 2.946986156806754, + "grad_norm": 0.13170550216401306, + "learning_rate": 9.509530651020115e-08, + "loss": 2.5977, + "step": 47473 + }, + { + "epoch": 2.947048233906512, + "grad_norm": 0.1315579627425795, + "learning_rate": 9.487280167861378e-08, + "loss": 2.6572, + "step": 47474 + }, + { + "epoch": 2.94711031100627, + "grad_norm": 0.15424995674769507, + "learning_rate": 9.465055721418758e-08, + "loss": 2.7608, + "step": 47475 + }, + { + "epoch": 2.9471723881060274, + "grad_norm": 0.13756604586229443, + "learning_rate": 9.442857311808274e-08, + "loss": 2.7803, + "step": 47476 + }, + { + "epoch": 2.947234465205786, + "grad_norm": 0.134088737503455, + "learning_rate": 9.420684939145941e-08, + "loss": 2.674, + "step": 47477 + }, + { + "epoch": 2.9472965423055433, + "grad_norm": 0.13670329279033075, + "learning_rate": 9.398538603547225e-08, + "loss": 2.7415, + "step": 47478 + }, + { + "epoch": 2.9473586194053016, + "grad_norm": 0.13346684071875123, + "learning_rate": 9.376418305128143e-08, + "loss": 2.6571, + "step": 47479 + }, + { + "epoch": 2.947420696505059, + "grad_norm": 0.13931451234395223, + "learning_rate": 9.354324044003604e-08, + "loss": 2.733, + "step": 47480 + }, + { + "epoch": 2.947482773604817, + "grad_norm": 0.14707214688680684, + "learning_rate": 9.332255820289071e-08, + "loss": 2.6936, + "step": 47481 + }, + { + "epoch": 2.947544850704575, + "grad_norm": 0.15971067714833218, + "learning_rate": 9.310213634100007e-08, + "loss": 2.7093, + "step": 47482 + }, + { + "epoch": 2.947606927804333, + "grad_norm": 0.13584947120012864, + "learning_rate": 9.288197485550765e-08, + "loss": 2.6858, + "step": 47483 + }, + { + "epoch": 2.947669004904091, + "grad_norm": 0.140209278900119, + "learning_rate": 9.266207374756808e-08, + "loss": 2.6727, + "step": 47484 + }, + { + "epoch": 2.9477310820038487, + "grad_norm": 0.13961328050586666, + "learning_rate": 9.244243301832489e-08, + "loss": 2.6935, + "step": 47485 + }, + { + "epoch": 2.9477931591036066, + "grad_norm": 0.1488655093440548, + "learning_rate": 9.222305266892162e-08, + "loss": 2.7185, + "step": 47486 + }, + { + "epoch": 2.9478552362033645, + "grad_norm": 0.12623284759239578, + "learning_rate": 9.200393270050734e-08, + "loss": 2.6233, + "step": 47487 + }, + { + "epoch": 2.9479173133031225, + "grad_norm": 0.13674391106277367, + "learning_rate": 9.178507311422557e-08, + "loss": 2.6171, + "step": 47488 + }, + { + "epoch": 2.9479793904028804, + "grad_norm": 0.13308282996900397, + "learning_rate": 9.156647391121987e-08, + "loss": 2.7385, + "step": 47489 + }, + { + "epoch": 2.9480414675026383, + "grad_norm": 0.12673554482035934, + "learning_rate": 9.134813509262263e-08, + "loss": 2.619, + "step": 47490 + }, + { + "epoch": 2.948103544602396, + "grad_norm": 0.13055956358230247, + "learning_rate": 9.113005665958297e-08, + "loss": 2.6626, + "step": 47491 + }, + { + "epoch": 2.948165621702154, + "grad_norm": 0.1426195058474862, + "learning_rate": 9.091223861322773e-08, + "loss": 2.7465, + "step": 47492 + }, + { + "epoch": 2.948227698801912, + "grad_norm": 0.1279401993254784, + "learning_rate": 9.069468095470047e-08, + "loss": 2.6774, + "step": 47493 + }, + { + "epoch": 2.94828977590167, + "grad_norm": 0.13837771041665697, + "learning_rate": 9.047738368513914e-08, + "loss": 2.7431, + "step": 47494 + }, + { + "epoch": 2.948351853001428, + "grad_norm": 0.13545922075346825, + "learning_rate": 9.026034680567063e-08, + "loss": 2.7446, + "step": 47495 + }, + { + "epoch": 2.948413930101186, + "grad_norm": 0.12619362841410361, + "learning_rate": 9.004357031742738e-08, + "loss": 2.625, + "step": 47496 + }, + { + "epoch": 2.9484760072009437, + "grad_norm": 0.15513717622484197, + "learning_rate": 8.982705422154736e-08, + "loss": 2.6786, + "step": 47497 + }, + { + "epoch": 2.9485380843007016, + "grad_norm": 0.12948495116807054, + "learning_rate": 8.961079851915188e-08, + "loss": 2.7696, + "step": 47498 + }, + { + "epoch": 2.948600161400459, + "grad_norm": 0.13705045787288747, + "learning_rate": 8.939480321137894e-08, + "loss": 2.7052, + "step": 47499 + }, + { + "epoch": 2.9486622385002175, + "grad_norm": 0.1296990320524716, + "learning_rate": 8.91790682993443e-08, + "loss": 2.6833, + "step": 47500 + }, + { + "epoch": 2.948724315599975, + "grad_norm": 0.12882861053440076, + "learning_rate": 8.89635937841804e-08, + "loss": 2.6368, + "step": 47501 + }, + { + "epoch": 2.9487863926997333, + "grad_norm": 0.13302609154214506, + "learning_rate": 8.874837966700856e-08, + "loss": 2.7, + "step": 47502 + }, + { + "epoch": 2.948848469799491, + "grad_norm": 0.12832364726850162, + "learning_rate": 8.853342594895564e-08, + "loss": 2.6851, + "step": 47503 + }, + { + "epoch": 2.948910546899249, + "grad_norm": 0.1282425033421364, + "learning_rate": 8.831873263113744e-08, + "loss": 2.6568, + "step": 47504 + }, + { + "epoch": 2.9489726239990066, + "grad_norm": 0.13042876578865312, + "learning_rate": 8.810429971467527e-08, + "loss": 2.7987, + "step": 47505 + }, + { + "epoch": 2.9490347010987645, + "grad_norm": 0.1309355257996196, + "learning_rate": 8.789012720069045e-08, + "loss": 2.675, + "step": 47506 + }, + { + "epoch": 2.9490967781985225, + "grad_norm": 0.13989466187954042, + "learning_rate": 8.767621509030432e-08, + "loss": 2.6951, + "step": 47507 + }, + { + "epoch": 2.9491588552982804, + "grad_norm": 0.13332046810557965, + "learning_rate": 8.746256338462155e-08, + "loss": 2.7913, + "step": 47508 + }, + { + "epoch": 2.9492209323980383, + "grad_norm": 0.132277138981155, + "learning_rate": 8.724917208476901e-08, + "loss": 2.7648, + "step": 47509 + }, + { + "epoch": 2.949283009497796, + "grad_norm": 0.13573914805789833, + "learning_rate": 8.703604119185138e-08, + "loss": 2.6493, + "step": 47510 + }, + { + "epoch": 2.949345086597554, + "grad_norm": 0.14628292218552164, + "learning_rate": 8.682317070698443e-08, + "loss": 2.6828, + "step": 47511 + }, + { + "epoch": 2.949407163697312, + "grad_norm": 0.1275615929277421, + "learning_rate": 8.661056063127282e-08, + "loss": 2.6986, + "step": 47512 + }, + { + "epoch": 2.94946924079707, + "grad_norm": 0.13081052209868843, + "learning_rate": 8.639821096583789e-08, + "loss": 2.706, + "step": 47513 + }, + { + "epoch": 2.949531317896828, + "grad_norm": 0.1298587819864913, + "learning_rate": 8.618612171177876e-08, + "loss": 2.6852, + "step": 47514 + }, + { + "epoch": 2.949593394996586, + "grad_norm": 0.1281344907432197, + "learning_rate": 8.597429287020009e-08, + "loss": 2.7247, + "step": 47515 + }, + { + "epoch": 2.9496554720963437, + "grad_norm": 0.13013512619718157, + "learning_rate": 8.576272444221212e-08, + "loss": 2.716, + "step": 47516 + }, + { + "epoch": 2.9497175491961016, + "grad_norm": 0.1292649826177707, + "learning_rate": 8.555141642891951e-08, + "loss": 2.7238, + "step": 47517 + }, + { + "epoch": 2.9497796262958595, + "grad_norm": 0.14126740852616548, + "learning_rate": 8.534036883142138e-08, + "loss": 2.662, + "step": 47518 + }, + { + "epoch": 2.9498417033956175, + "grad_norm": 0.13622130086260228, + "learning_rate": 8.512958165081686e-08, + "loss": 2.7072, + "step": 47519 + }, + { + "epoch": 2.9499037804953754, + "grad_norm": 0.1349547482739954, + "learning_rate": 8.491905488821061e-08, + "loss": 2.7788, + "step": 47520 + }, + { + "epoch": 2.9499658575951333, + "grad_norm": 0.1311060019031808, + "learning_rate": 8.470878854470176e-08, + "loss": 2.6782, + "step": 47521 + }, + { + "epoch": 2.950027934694891, + "grad_norm": 0.14492532772259445, + "learning_rate": 8.449878262138388e-08, + "loss": 2.7442, + "step": 47522 + }, + { + "epoch": 2.950090011794649, + "grad_norm": 0.13514988476312237, + "learning_rate": 8.428903711935054e-08, + "loss": 2.6796, + "step": 47523 + }, + { + "epoch": 2.9501520888944066, + "grad_norm": 0.14033744472396562, + "learning_rate": 8.407955203969531e-08, + "loss": 2.6695, + "step": 47524 + }, + { + "epoch": 2.950214165994165, + "grad_norm": 0.14565321039196794, + "learning_rate": 8.387032738352285e-08, + "loss": 2.6803, + "step": 47525 + }, + { + "epoch": 2.9502762430939224, + "grad_norm": 0.1408534313943095, + "learning_rate": 8.366136315191008e-08, + "loss": 2.7251, + "step": 47526 + }, + { + "epoch": 2.950338320193681, + "grad_norm": 0.13338050973151522, + "learning_rate": 8.345265934595614e-08, + "loss": 2.7201, + "step": 47527 + }, + { + "epoch": 2.9504003972934383, + "grad_norm": 0.12627459911965536, + "learning_rate": 8.324421596674902e-08, + "loss": 2.7055, + "step": 47528 + }, + { + "epoch": 2.950462474393196, + "grad_norm": 0.1321564628020256, + "learning_rate": 8.303603301537122e-08, + "loss": 2.8066, + "step": 47529 + }, + { + "epoch": 2.950524551492954, + "grad_norm": 0.14559467334499557, + "learning_rate": 8.282811049291073e-08, + "loss": 2.7864, + "step": 47530 + }, + { + "epoch": 2.950586628592712, + "grad_norm": 0.12818554052699394, + "learning_rate": 8.262044840046113e-08, + "loss": 2.7052, + "step": 47531 + }, + { + "epoch": 2.95064870569247, + "grad_norm": 0.1311198673671086, + "learning_rate": 8.241304673909378e-08, + "loss": 2.629, + "step": 47532 + }, + { + "epoch": 2.950710782792228, + "grad_norm": 0.13483299423542733, + "learning_rate": 8.220590550989671e-08, + "loss": 2.7351, + "step": 47533 + }, + { + "epoch": 2.950772859891986, + "grad_norm": 0.13464251714118175, + "learning_rate": 8.199902471394683e-08, + "loss": 2.6908, + "step": 47534 + }, + { + "epoch": 2.9508349369917437, + "grad_norm": 0.1302746405252846, + "learning_rate": 8.17924043523266e-08, + "loss": 2.7287, + "step": 47535 + }, + { + "epoch": 2.9508970140915016, + "grad_norm": 0.12587051955548104, + "learning_rate": 8.158604442611295e-08, + "loss": 2.7224, + "step": 47536 + }, + { + "epoch": 2.9509590911912595, + "grad_norm": 0.12951615573471953, + "learning_rate": 8.137994493638835e-08, + "loss": 2.6278, + "step": 47537 + }, + { + "epoch": 2.9510211682910175, + "grad_norm": 0.140785661950027, + "learning_rate": 8.117410588421859e-08, + "loss": 2.7522, + "step": 47538 + }, + { + "epoch": 2.9510832453907754, + "grad_norm": 0.12768971161040357, + "learning_rate": 8.096852727068061e-08, + "loss": 2.6622, + "step": 47539 + }, + { + "epoch": 2.9511453224905333, + "grad_norm": 0.14182532159132383, + "learning_rate": 8.07632090968513e-08, + "loss": 2.726, + "step": 47540 + }, + { + "epoch": 2.951207399590291, + "grad_norm": 0.13357752065625603, + "learning_rate": 8.055815136379651e-08, + "loss": 2.6085, + "step": 47541 + }, + { + "epoch": 2.951269476690049, + "grad_norm": 0.15129715314526546, + "learning_rate": 8.035335407258759e-08, + "loss": 2.6806, + "step": 47542 + }, + { + "epoch": 2.951331553789807, + "grad_norm": 0.13865386819115627, + "learning_rate": 8.01488172242959e-08, + "loss": 2.7421, + "step": 47543 + }, + { + "epoch": 2.951393630889565, + "grad_norm": 0.13738785906631878, + "learning_rate": 7.994454081998726e-08, + "loss": 2.7629, + "step": 47544 + }, + { + "epoch": 2.951455707989323, + "grad_norm": 0.1268376705731506, + "learning_rate": 7.974052486072192e-08, + "loss": 2.7247, + "step": 47545 + }, + { + "epoch": 2.951517785089081, + "grad_norm": 0.13041264161270702, + "learning_rate": 7.953676934757126e-08, + "loss": 2.6128, + "step": 47546 + }, + { + "epoch": 2.9515798621888383, + "grad_norm": 0.1297557754499104, + "learning_rate": 7.933327428159554e-08, + "loss": 2.715, + "step": 47547 + }, + { + "epoch": 2.9516419392885966, + "grad_norm": 0.12889468979296134, + "learning_rate": 7.913003966385502e-08, + "loss": 2.734, + "step": 47548 + }, + { + "epoch": 2.951704016388354, + "grad_norm": 0.13586880932473264, + "learning_rate": 7.89270654954155e-08, + "loss": 2.677, + "step": 47549 + }, + { + "epoch": 2.9517660934881125, + "grad_norm": 0.1272078086891355, + "learning_rate": 7.872435177733173e-08, + "loss": 2.6555, + "step": 47550 + }, + { + "epoch": 2.95182817058787, + "grad_norm": 0.13159578136859204, + "learning_rate": 7.852189851065838e-08, + "loss": 2.6083, + "step": 47551 + }, + { + "epoch": 2.9518902476876283, + "grad_norm": 0.13118309575085085, + "learning_rate": 7.83197056964613e-08, + "loss": 2.6854, + "step": 47552 + }, + { + "epoch": 2.951952324787386, + "grad_norm": 0.1489166882349855, + "learning_rate": 7.811777333578408e-08, + "loss": 2.7235, + "step": 47553 + }, + { + "epoch": 2.9520144018871437, + "grad_norm": 0.12836786750519086, + "learning_rate": 7.791610142968697e-08, + "loss": 2.7615, + "step": 47554 + }, + { + "epoch": 2.9520764789869016, + "grad_norm": 0.14698548118424046, + "learning_rate": 7.771468997921916e-08, + "loss": 2.7015, + "step": 47555 + }, + { + "epoch": 2.9521385560866595, + "grad_norm": 0.1299309318720981, + "learning_rate": 7.751353898543534e-08, + "loss": 2.71, + "step": 47556 + }, + { + "epoch": 2.9522006331864175, + "grad_norm": 0.13470452761940324, + "learning_rate": 7.731264844937913e-08, + "loss": 2.711, + "step": 47557 + }, + { + "epoch": 2.9522627102861754, + "grad_norm": 0.13334746469053194, + "learning_rate": 7.711201837210525e-08, + "loss": 2.6667, + "step": 47558 + }, + { + "epoch": 2.9523247873859333, + "grad_norm": 0.13193131281803203, + "learning_rate": 7.691164875465728e-08, + "loss": 2.5868, + "step": 47559 + }, + { + "epoch": 2.952386864485691, + "grad_norm": 0.12779848636339383, + "learning_rate": 7.671153959807886e-08, + "loss": 2.6743, + "step": 47560 + }, + { + "epoch": 2.952448941585449, + "grad_norm": 0.12791907288289764, + "learning_rate": 7.65116909034136e-08, + "loss": 2.7072, + "step": 47561 + }, + { + "epoch": 2.952511018685207, + "grad_norm": 0.13019892579108347, + "learning_rate": 7.631210267171063e-08, + "loss": 2.6962, + "step": 47562 + }, + { + "epoch": 2.952573095784965, + "grad_norm": 0.14031262273506964, + "learning_rate": 7.611277490400248e-08, + "loss": 2.6659, + "step": 47563 + }, + { + "epoch": 2.952635172884723, + "grad_norm": 0.12691490625093366, + "learning_rate": 7.591370760133832e-08, + "loss": 2.6073, + "step": 47564 + }, + { + "epoch": 2.952697249984481, + "grad_norm": 0.12709702625793248, + "learning_rate": 7.571490076475063e-08, + "loss": 2.663, + "step": 47565 + }, + { + "epoch": 2.9527593270842387, + "grad_norm": 0.12989191869492758, + "learning_rate": 7.551635439527193e-08, + "loss": 2.7195, + "step": 47566 + }, + { + "epoch": 2.9528214041839966, + "grad_norm": 0.14009364060800347, + "learning_rate": 7.53180684939514e-08, + "loss": 2.819, + "step": 47567 + }, + { + "epoch": 2.9528834812837546, + "grad_norm": 0.13659864327090682, + "learning_rate": 7.512004306181597e-08, + "loss": 2.7031, + "step": 47568 + }, + { + "epoch": 2.9529455583835125, + "grad_norm": 0.14302277143692851, + "learning_rate": 7.492227809989261e-08, + "loss": 2.699, + "step": 47569 + }, + { + "epoch": 2.9530076354832704, + "grad_norm": 0.12638664773801395, + "learning_rate": 7.472477360922491e-08, + "loss": 2.5631, + "step": 47570 + }, + { + "epoch": 2.9530697125830283, + "grad_norm": 0.132891869670908, + "learning_rate": 7.452752959083431e-08, + "loss": 2.7608, + "step": 47571 + }, + { + "epoch": 2.953131789682786, + "grad_norm": 0.13834385963135729, + "learning_rate": 7.433054604575884e-08, + "loss": 2.785, + "step": 47572 + }, + { + "epoch": 2.953193866782544, + "grad_norm": 0.1322402265241841, + "learning_rate": 7.413382297501437e-08, + "loss": 2.7326, + "step": 47573 + }, + { + "epoch": 2.9532559438823016, + "grad_norm": 0.1370995667473342, + "learning_rate": 7.393736037963894e-08, + "loss": 2.6977, + "step": 47574 + }, + { + "epoch": 2.95331802098206, + "grad_norm": 0.1286781778743224, + "learning_rate": 7.374115826064843e-08, + "loss": 2.7005, + "step": 47575 + }, + { + "epoch": 2.9533800980818175, + "grad_norm": 0.13580002684529732, + "learning_rate": 7.354521661906977e-08, + "loss": 2.7393, + "step": 47576 + }, + { + "epoch": 2.9534421751815754, + "grad_norm": 0.12869498250939027, + "learning_rate": 7.334953545592994e-08, + "loss": 2.6458, + "step": 47577 + }, + { + "epoch": 2.9535042522813333, + "grad_norm": 0.13165176384094135, + "learning_rate": 7.315411477223921e-08, + "loss": 2.6656, + "step": 47578 + }, + { + "epoch": 2.953566329381091, + "grad_norm": 0.14031437501950175, + "learning_rate": 7.295895456903012e-08, + "loss": 2.7516, + "step": 47579 + }, + { + "epoch": 2.953628406480849, + "grad_norm": 0.1341518856380989, + "learning_rate": 7.276405484731297e-08, + "loss": 2.7191, + "step": 47580 + }, + { + "epoch": 2.953690483580607, + "grad_norm": 0.14405042338074034, + "learning_rate": 7.25694156081036e-08, + "loss": 2.6817, + "step": 47581 + }, + { + "epoch": 2.953752560680365, + "grad_norm": 0.14008259143293436, + "learning_rate": 7.237503685242342e-08, + "loss": 2.5654, + "step": 47582 + }, + { + "epoch": 2.953814637780123, + "grad_norm": 0.1440017253214408, + "learning_rate": 7.218091858128274e-08, + "loss": 2.7151, + "step": 47583 + }, + { + "epoch": 2.953876714879881, + "grad_norm": 0.1487028360527352, + "learning_rate": 7.198706079569185e-08, + "loss": 2.6728, + "step": 47584 + }, + { + "epoch": 2.9539387919796387, + "grad_norm": 0.132706606948269, + "learning_rate": 7.179346349666661e-08, + "loss": 2.6744, + "step": 47585 + }, + { + "epoch": 2.9540008690793966, + "grad_norm": 0.13489793419275978, + "learning_rate": 7.160012668521176e-08, + "loss": 2.7236, + "step": 47586 + }, + { + "epoch": 2.9540629461791545, + "grad_norm": 0.14047148723078207, + "learning_rate": 7.140705036234318e-08, + "loss": 2.7527, + "step": 47587 + }, + { + "epoch": 2.9541250232789125, + "grad_norm": 0.130349653959949, + "learning_rate": 7.121423452906561e-08, + "loss": 2.6348, + "step": 47588 + }, + { + "epoch": 2.9541871003786704, + "grad_norm": 0.14158592795515268, + "learning_rate": 7.10216791863838e-08, + "loss": 2.7359, + "step": 47589 + }, + { + "epoch": 2.9542491774784283, + "grad_norm": 0.16292089117061714, + "learning_rate": 7.082938433529696e-08, + "loss": 2.7001, + "step": 47590 + }, + { + "epoch": 2.954311254578186, + "grad_norm": 0.15544708906167826, + "learning_rate": 7.063734997682092e-08, + "loss": 2.7222, + "step": 47591 + }, + { + "epoch": 2.954373331677944, + "grad_norm": 0.14903857228030074, + "learning_rate": 7.04455761119438e-08, + "loss": 2.5938, + "step": 47592 + }, + { + "epoch": 2.954435408777702, + "grad_norm": 0.1397782635931561, + "learning_rate": 7.025406274168145e-08, + "loss": 2.6881, + "step": 47593 + }, + { + "epoch": 2.95449748587746, + "grad_norm": 0.13635627516065119, + "learning_rate": 7.00628098670164e-08, + "loss": 2.7073, + "step": 47594 + }, + { + "epoch": 2.9545595629772174, + "grad_norm": 0.1299811041491861, + "learning_rate": 6.9871817488959e-08, + "loss": 2.7276, + "step": 47595 + }, + { + "epoch": 2.954621640076976, + "grad_norm": 0.14933351452393728, + "learning_rate": 6.968108560850284e-08, + "loss": 2.7, + "step": 47596 + }, + { + "epoch": 2.9546837171767333, + "grad_norm": 0.12830982078267253, + "learning_rate": 6.949061422663605e-08, + "loss": 2.6949, + "step": 47597 + }, + { + "epoch": 2.9547457942764916, + "grad_norm": 0.13143906372997582, + "learning_rate": 6.930040334436339e-08, + "loss": 2.6648, + "step": 47598 + }, + { + "epoch": 2.954807871376249, + "grad_norm": 0.13248653239956695, + "learning_rate": 6.911045296266738e-08, + "loss": 2.6909, + "step": 47599 + }, + { + "epoch": 2.954869948476007, + "grad_norm": 0.13075850997759753, + "learning_rate": 6.892076308254725e-08, + "loss": 2.71, + "step": 47600 + }, + { + "epoch": 2.954932025575765, + "grad_norm": 0.12704559773009622, + "learning_rate": 6.873133370498552e-08, + "loss": 2.6693, + "step": 47601 + }, + { + "epoch": 2.954994102675523, + "grad_norm": 0.13261277243409753, + "learning_rate": 6.854216483097587e-08, + "loss": 2.7342, + "step": 47602 + }, + { + "epoch": 2.955056179775281, + "grad_norm": 0.13023560158987757, + "learning_rate": 6.835325646150081e-08, + "loss": 2.6134, + "step": 47603 + }, + { + "epoch": 2.9551182568750387, + "grad_norm": 0.1284305000718048, + "learning_rate": 6.816460859754847e-08, + "loss": 2.68, + "step": 47604 + }, + { + "epoch": 2.9551803339747966, + "grad_norm": 0.1421633330442419, + "learning_rate": 6.79762212401014e-08, + "loss": 2.7149, + "step": 47605 + }, + { + "epoch": 2.9552424110745545, + "grad_norm": 0.135745453280676, + "learning_rate": 6.778809439014766e-08, + "loss": 2.7544, + "step": 47606 + }, + { + "epoch": 2.9553044881743125, + "grad_norm": 0.13048494917987966, + "learning_rate": 6.760022804866428e-08, + "loss": 2.7947, + "step": 47607 + }, + { + "epoch": 2.9553665652740704, + "grad_norm": 0.13252171069765664, + "learning_rate": 6.741262221662825e-08, + "loss": 2.6857, + "step": 47608 + }, + { + "epoch": 2.9554286423738283, + "grad_norm": 0.13314098519629194, + "learning_rate": 6.722527689502766e-08, + "loss": 2.6586, + "step": 47609 + }, + { + "epoch": 2.955490719473586, + "grad_norm": 0.13963410497355475, + "learning_rate": 6.703819208482842e-08, + "loss": 2.6247, + "step": 47610 + }, + { + "epoch": 2.955552796573344, + "grad_norm": 0.13919457127175222, + "learning_rate": 6.685136778701861e-08, + "loss": 2.7062, + "step": 47611 + }, + { + "epoch": 2.955614873673102, + "grad_norm": 0.130058084267224, + "learning_rate": 6.666480400256414e-08, + "loss": 2.7341, + "step": 47612 + }, + { + "epoch": 2.95567695077286, + "grad_norm": 0.13962436014966256, + "learning_rate": 6.647850073243644e-08, + "loss": 2.6666, + "step": 47613 + }, + { + "epoch": 2.955739027872618, + "grad_norm": 0.13945476734038434, + "learning_rate": 6.629245797761807e-08, + "loss": 2.6557, + "step": 47614 + }, + { + "epoch": 2.955801104972376, + "grad_norm": 0.12540549579272842, + "learning_rate": 6.610667573907492e-08, + "loss": 2.6611, + "step": 47615 + }, + { + "epoch": 2.9558631820721337, + "grad_norm": 0.1284013309506706, + "learning_rate": 6.592115401777288e-08, + "loss": 2.6748, + "step": 47616 + }, + { + "epoch": 2.9559252591718916, + "grad_norm": 0.14381801043351738, + "learning_rate": 6.573589281467784e-08, + "loss": 2.7377, + "step": 47617 + }, + { + "epoch": 2.955987336271649, + "grad_norm": 0.12897673760439843, + "learning_rate": 6.555089213076682e-08, + "loss": 2.811, + "step": 47618 + }, + { + "epoch": 2.9560494133714075, + "grad_norm": 0.13567124715420564, + "learning_rate": 6.536615196699458e-08, + "loss": 2.7229, + "step": 47619 + }, + { + "epoch": 2.956111490471165, + "grad_norm": 0.12721858514050283, + "learning_rate": 6.518167232433259e-08, + "loss": 2.7631, + "step": 47620 + }, + { + "epoch": 2.9561735675709233, + "grad_norm": 0.13061550532517896, + "learning_rate": 6.499745320374117e-08, + "loss": 2.7797, + "step": 47621 + }, + { + "epoch": 2.956235644670681, + "grad_norm": 0.1472441108631832, + "learning_rate": 6.481349460617514e-08, + "loss": 2.6703, + "step": 47622 + }, + { + "epoch": 2.956297721770439, + "grad_norm": 0.13562607086875153, + "learning_rate": 6.462979653260037e-08, + "loss": 2.805, + "step": 47623 + }, + { + "epoch": 2.9563597988701966, + "grad_norm": 0.13056270933659464, + "learning_rate": 6.444635898397722e-08, + "loss": 2.7456, + "step": 47624 + }, + { + "epoch": 2.9564218759699545, + "grad_norm": 0.14138199455925554, + "learning_rate": 6.426318196125491e-08, + "loss": 2.6312, + "step": 47625 + }, + { + "epoch": 2.9564839530697125, + "grad_norm": 0.1301813924256892, + "learning_rate": 6.408026546539936e-08, + "loss": 2.7074, + "step": 47626 + }, + { + "epoch": 2.9565460301694704, + "grad_norm": 0.14636089842870809, + "learning_rate": 6.389760949735423e-08, + "loss": 2.7165, + "step": 47627 + }, + { + "epoch": 2.9566081072692283, + "grad_norm": 0.14151158598284638, + "learning_rate": 6.371521405807989e-08, + "loss": 2.7535, + "step": 47628 + }, + { + "epoch": 2.956670184368986, + "grad_norm": 0.13263159941847502, + "learning_rate": 6.353307914852557e-08, + "loss": 2.6825, + "step": 47629 + }, + { + "epoch": 2.956732261468744, + "grad_norm": 0.1311010435073752, + "learning_rate": 6.335120476964607e-08, + "loss": 2.6223, + "step": 47630 + }, + { + "epoch": 2.956794338568502, + "grad_norm": 0.12856647476826363, + "learning_rate": 6.31695909223795e-08, + "loss": 2.734, + "step": 47631 + }, + { + "epoch": 2.95685641566826, + "grad_norm": 0.1295459221558824, + "learning_rate": 6.298823760768069e-08, + "loss": 2.6826, + "step": 47632 + }, + { + "epoch": 2.956918492768018, + "grad_norm": 0.1296000450045896, + "learning_rate": 6.280714482649885e-08, + "loss": 2.7366, + "step": 47633 + }, + { + "epoch": 2.956980569867776, + "grad_norm": 0.13810162758613923, + "learning_rate": 6.262631257977214e-08, + "loss": 2.7208, + "step": 47634 + }, + { + "epoch": 2.9570426469675337, + "grad_norm": 0.13352748610577067, + "learning_rate": 6.244574086844979e-08, + "loss": 2.7185, + "step": 47635 + }, + { + "epoch": 2.9571047240672916, + "grad_norm": 0.146879439083951, + "learning_rate": 6.226542969346438e-08, + "loss": 2.7429, + "step": 47636 + }, + { + "epoch": 2.9571668011670496, + "grad_norm": 0.12831128696884653, + "learning_rate": 6.208537905577072e-08, + "loss": 2.6775, + "step": 47637 + }, + { + "epoch": 2.9572288782668075, + "grad_norm": 0.13690405277116421, + "learning_rate": 6.190558895629583e-08, + "loss": 2.8103, + "step": 47638 + }, + { + "epoch": 2.9572909553665654, + "grad_norm": 0.14613303966856356, + "learning_rate": 6.172605939598341e-08, + "loss": 2.69, + "step": 47639 + }, + { + "epoch": 2.9573530324663233, + "grad_norm": 0.13295545317646382, + "learning_rate": 6.154679037576604e-08, + "loss": 2.7399, + "step": 47640 + }, + { + "epoch": 2.9574151095660812, + "grad_norm": 0.13775369745560775, + "learning_rate": 6.136778189658743e-08, + "loss": 2.7237, + "step": 47641 + }, + { + "epoch": 2.957477186665839, + "grad_norm": 0.14107464720169657, + "learning_rate": 6.118903395936903e-08, + "loss": 2.6432, + "step": 47642 + }, + { + "epoch": 2.9575392637655966, + "grad_norm": 0.1277318545918184, + "learning_rate": 6.101054656505456e-08, + "loss": 2.6676, + "step": 47643 + }, + { + "epoch": 2.957601340865355, + "grad_norm": 0.13700296288352773, + "learning_rate": 6.083231971457659e-08, + "loss": 2.7033, + "step": 47644 + }, + { + "epoch": 2.9576634179651125, + "grad_norm": 0.12937202257839822, + "learning_rate": 6.065435340885106e-08, + "loss": 2.757, + "step": 47645 + }, + { + "epoch": 2.957725495064871, + "grad_norm": 0.132341897494379, + "learning_rate": 6.04766476488161e-08, + "loss": 2.6994, + "step": 47646 + }, + { + "epoch": 2.9577875721646283, + "grad_norm": 0.1545350011827339, + "learning_rate": 6.029920243540432e-08, + "loss": 2.6903, + "step": 47647 + }, + { + "epoch": 2.957849649264386, + "grad_norm": 0.13805798189594115, + "learning_rate": 6.012201776953164e-08, + "loss": 2.7482, + "step": 47648 + }, + { + "epoch": 2.957911726364144, + "grad_norm": 0.13133225474765914, + "learning_rate": 5.994509365211953e-08, + "loss": 2.6689, + "step": 47649 + }, + { + "epoch": 2.957973803463902, + "grad_norm": 0.12806587395987196, + "learning_rate": 5.976843008410616e-08, + "loss": 2.7329, + "step": 47650 + }, + { + "epoch": 2.95803588056366, + "grad_norm": 0.13424877209563588, + "learning_rate": 5.959202706640188e-08, + "loss": 2.6222, + "step": 47651 + }, + { + "epoch": 2.958097957663418, + "grad_norm": 0.1330268476536044, + "learning_rate": 5.9415884599928195e-08, + "loss": 2.7868, + "step": 47652 + }, + { + "epoch": 2.958160034763176, + "grad_norm": 0.1288660738209742, + "learning_rate": 5.9240002685601034e-08, + "loss": 2.586, + "step": 47653 + }, + { + "epoch": 2.9582221118629337, + "grad_norm": 0.150428110875674, + "learning_rate": 5.906438132434744e-08, + "loss": 2.677, + "step": 47654 + }, + { + "epoch": 2.9582841889626916, + "grad_norm": 0.13605808303105005, + "learning_rate": 5.8889020517077786e-08, + "loss": 2.683, + "step": 47655 + }, + { + "epoch": 2.9583462660624495, + "grad_norm": 0.13244554370019962, + "learning_rate": 5.871392026470801e-08, + "loss": 2.6589, + "step": 47656 + }, + { + "epoch": 2.9584083431622075, + "grad_norm": 0.1448482825260373, + "learning_rate": 5.85390805681485e-08, + "loss": 2.6385, + "step": 47657 + }, + { + "epoch": 2.9584704202619654, + "grad_norm": 0.13422791698401415, + "learning_rate": 5.836450142832073e-08, + "loss": 2.6822, + "step": 47658 + }, + { + "epoch": 2.9585324973617233, + "grad_norm": 0.12845959007031185, + "learning_rate": 5.819018284612399e-08, + "loss": 2.6101, + "step": 47659 + }, + { + "epoch": 2.958594574461481, + "grad_norm": 0.1291802341183153, + "learning_rate": 5.801612482247421e-08, + "loss": 2.6323, + "step": 47660 + }, + { + "epoch": 2.958656651561239, + "grad_norm": 0.13012646644601475, + "learning_rate": 5.7842327358276214e-08, + "loss": 2.676, + "step": 47661 + }, + { + "epoch": 2.958718728660997, + "grad_norm": 0.12870013491227786, + "learning_rate": 5.76687904544404e-08, + "loss": 2.7576, + "step": 47662 + }, + { + "epoch": 2.958780805760755, + "grad_norm": 0.13232303632932846, + "learning_rate": 5.7495514111871597e-08, + "loss": 2.6986, + "step": 47663 + }, + { + "epoch": 2.958842882860513, + "grad_norm": 0.13673030259505606, + "learning_rate": 5.7322498331469074e-08, + "loss": 2.6961, + "step": 47664 + }, + { + "epoch": 2.958904959960271, + "grad_norm": 0.1294079395760563, + "learning_rate": 5.714974311414323e-08, + "loss": 2.6932, + "step": 47665 + }, + { + "epoch": 2.9589670370600283, + "grad_norm": 0.12814868498740323, + "learning_rate": 5.697724846078778e-08, + "loss": 2.7761, + "step": 47666 + }, + { + "epoch": 2.9590291141597866, + "grad_norm": 0.13583981824555147, + "learning_rate": 5.6805014372307564e-08, + "loss": 2.6551, + "step": 47667 + }, + { + "epoch": 2.959091191259544, + "grad_norm": 0.14324406344077661, + "learning_rate": 5.663304084960186e-08, + "loss": 2.6846, + "step": 47668 + }, + { + "epoch": 2.9591532683593025, + "grad_norm": 0.1346870690978342, + "learning_rate": 5.64613278935644e-08, + "loss": 2.6841, + "step": 47669 + }, + { + "epoch": 2.95921534545906, + "grad_norm": 0.1368045741866048, + "learning_rate": 5.628987550508891e-08, + "loss": 2.7261, + "step": 47670 + }, + { + "epoch": 2.9592774225588183, + "grad_norm": 0.13772329763006758, + "learning_rate": 5.611868368508022e-08, + "loss": 2.7291, + "step": 47671 + }, + { + "epoch": 2.959339499658576, + "grad_norm": 0.15466516338299002, + "learning_rate": 5.5947752434420964e-08, + "loss": 2.7069, + "step": 47672 + }, + { + "epoch": 2.9594015767583337, + "grad_norm": 0.1281477038690337, + "learning_rate": 5.577708175400487e-08, + "loss": 2.6776, + "step": 47673 + }, + { + "epoch": 2.9594636538580916, + "grad_norm": 0.13578318930317593, + "learning_rate": 5.560667164472566e-08, + "loss": 2.7435, + "step": 47674 + }, + { + "epoch": 2.9595257309578495, + "grad_norm": 0.1297017606684842, + "learning_rate": 5.543652210747152e-08, + "loss": 2.6036, + "step": 47675 + }, + { + "epoch": 2.9595878080576075, + "grad_norm": 0.14177541255095474, + "learning_rate": 5.526663314312508e-08, + "loss": 2.6621, + "step": 47676 + }, + { + "epoch": 2.9596498851573654, + "grad_norm": 0.1304049644333717, + "learning_rate": 5.509700475258561e-08, + "loss": 2.671, + "step": 47677 + }, + { + "epoch": 2.9597119622571233, + "grad_norm": 0.12943291004990992, + "learning_rate": 5.492763693672465e-08, + "loss": 2.6922, + "step": 47678 + }, + { + "epoch": 2.959774039356881, + "grad_norm": 0.13748186219402472, + "learning_rate": 5.475852969643036e-08, + "loss": 2.7222, + "step": 47679 + }, + { + "epoch": 2.959836116456639, + "grad_norm": 0.12765180824004269, + "learning_rate": 5.458968303258538e-08, + "loss": 2.7441, + "step": 47680 + }, + { + "epoch": 2.959898193556397, + "grad_norm": 0.1325585236948957, + "learning_rate": 5.442109694607233e-08, + "loss": 2.6621, + "step": 47681 + }, + { + "epoch": 2.959960270656155, + "grad_norm": 0.12951421420874942, + "learning_rate": 5.42527714377683e-08, + "loss": 2.6218, + "step": 47682 + }, + { + "epoch": 2.960022347755913, + "grad_norm": 0.1387541486575241, + "learning_rate": 5.408470650855035e-08, + "loss": 2.7453, + "step": 47683 + }, + { + "epoch": 2.960084424855671, + "grad_norm": 0.14360310912115262, + "learning_rate": 5.3916902159301117e-08, + "loss": 2.7572, + "step": 47684 + }, + { + "epoch": 2.9601465019554287, + "grad_norm": 0.14035153289972976, + "learning_rate": 5.374935839089212e-08, + "loss": 2.6444, + "step": 47685 + }, + { + "epoch": 2.9602085790551866, + "grad_norm": 0.1507921864689127, + "learning_rate": 5.358207520419489e-08, + "loss": 2.7969, + "step": 47686 + }, + { + "epoch": 2.9602706561549446, + "grad_norm": 0.1351241649735099, + "learning_rate": 5.34150526000865e-08, + "loss": 2.6579, + "step": 47687 + }, + { + "epoch": 2.9603327332547025, + "grad_norm": 0.14164810073616418, + "learning_rate": 5.3248290579432926e-08, + "loss": 2.7278, + "step": 47688 + }, + { + "epoch": 2.9603948103544604, + "grad_norm": 0.15503149556680895, + "learning_rate": 5.308178914311124e-08, + "loss": 2.674, + "step": 47689 + }, + { + "epoch": 2.9604568874542183, + "grad_norm": 0.13791008695766263, + "learning_rate": 5.2915548291987415e-08, + "loss": 2.7151, + "step": 47690 + }, + { + "epoch": 2.960518964553976, + "grad_norm": 0.14630660396267325, + "learning_rate": 5.274956802692743e-08, + "loss": 2.7132, + "step": 47691 + }, + { + "epoch": 2.960581041653734, + "grad_norm": 0.1297388713272978, + "learning_rate": 5.258384834879726e-08, + "loss": 2.7118, + "step": 47692 + }, + { + "epoch": 2.9606431187534916, + "grad_norm": 0.14376473791824923, + "learning_rate": 5.2418389258462874e-08, + "loss": 2.7393, + "step": 47693 + }, + { + "epoch": 2.96070519585325, + "grad_norm": 0.1451244488487161, + "learning_rate": 5.225319075679025e-08, + "loss": 2.7846, + "step": 47694 + }, + { + "epoch": 2.9607672729530075, + "grad_norm": 0.13512025695649305, + "learning_rate": 5.208825284463425e-08, + "loss": 2.6702, + "step": 47695 + }, + { + "epoch": 2.9608293500527654, + "grad_norm": 0.1304915653769588, + "learning_rate": 5.192357552286087e-08, + "loss": 2.7513, + "step": 47696 + }, + { + "epoch": 2.9608914271525233, + "grad_norm": 0.1466646483898373, + "learning_rate": 5.175915879232496e-08, + "loss": 2.759, + "step": 47697 + }, + { + "epoch": 2.960953504252281, + "grad_norm": 0.14105476169166947, + "learning_rate": 5.159500265388695e-08, + "loss": 2.6784, + "step": 47698 + }, + { + "epoch": 2.961015581352039, + "grad_norm": 0.1280124702223192, + "learning_rate": 5.1431107108407264e-08, + "loss": 2.6599, + "step": 47699 + }, + { + "epoch": 2.961077658451797, + "grad_norm": 0.1301445009806479, + "learning_rate": 5.126747215673522e-08, + "loss": 2.6048, + "step": 47700 + }, + { + "epoch": 2.961139735551555, + "grad_norm": 0.13216517275647022, + "learning_rate": 5.1104097799725694e-08, + "loss": 2.7199, + "step": 47701 + }, + { + "epoch": 2.961201812651313, + "grad_norm": 0.13889342058921414, + "learning_rate": 5.0940984038228e-08, + "loss": 2.6357, + "step": 47702 + }, + { + "epoch": 2.961263889751071, + "grad_norm": 0.13032563806531652, + "learning_rate": 5.077813087310257e-08, + "loss": 2.6448, + "step": 47703 + }, + { + "epoch": 2.9613259668508287, + "grad_norm": 0.13847717532518636, + "learning_rate": 5.0615538305193164e-08, + "loss": 2.7241, + "step": 47704 + }, + { + "epoch": 2.9613880439505866, + "grad_norm": 0.12767987211319456, + "learning_rate": 5.045320633534356e-08, + "loss": 2.6617, + "step": 47705 + }, + { + "epoch": 2.9614501210503446, + "grad_norm": 0.1278089871191952, + "learning_rate": 5.0291134964408625e-08, + "loss": 2.6967, + "step": 47706 + }, + { + "epoch": 2.9615121981501025, + "grad_norm": 0.14467054176025235, + "learning_rate": 5.012932419322658e-08, + "loss": 2.784, + "step": 47707 + }, + { + "epoch": 2.9615742752498604, + "grad_norm": 0.14731808873824775, + "learning_rate": 4.996777402265229e-08, + "loss": 2.7603, + "step": 47708 + }, + { + "epoch": 2.9616363523496183, + "grad_norm": 0.13048405700250174, + "learning_rate": 4.980648445351843e-08, + "loss": 2.7129, + "step": 47709 + }, + { + "epoch": 2.9616984294493762, + "grad_norm": 0.12746067559147825, + "learning_rate": 4.964545548666877e-08, + "loss": 2.7088, + "step": 47710 + }, + { + "epoch": 2.961760506549134, + "grad_norm": 0.13176920579450485, + "learning_rate": 4.948468712294707e-08, + "loss": 2.6505, + "step": 47711 + }, + { + "epoch": 2.961822583648892, + "grad_norm": 0.1303163191924069, + "learning_rate": 4.932417936318601e-08, + "loss": 2.6673, + "step": 47712 + }, + { + "epoch": 2.96188466074865, + "grad_norm": 0.13010465706482643, + "learning_rate": 4.9163932208229346e-08, + "loss": 2.6721, + "step": 47713 + }, + { + "epoch": 2.9619467378484075, + "grad_norm": 0.1298806254840946, + "learning_rate": 4.9003945658909754e-08, + "loss": 2.7321, + "step": 47714 + }, + { + "epoch": 2.962008814948166, + "grad_norm": 0.1468637639678596, + "learning_rate": 4.88442197160599e-08, + "loss": 2.6501, + "step": 47715 + }, + { + "epoch": 2.9620708920479233, + "grad_norm": 0.12770915496986066, + "learning_rate": 4.868475438051801e-08, + "loss": 2.7217, + "step": 47716 + }, + { + "epoch": 2.9621329691476816, + "grad_norm": 0.13457512867673546, + "learning_rate": 4.852554965311673e-08, + "loss": 2.7006, + "step": 47717 + }, + { + "epoch": 2.962195046247439, + "grad_norm": 0.14007499877496338, + "learning_rate": 4.83666055346832e-08, + "loss": 2.6185, + "step": 47718 + }, + { + "epoch": 2.9622571233471975, + "grad_norm": 0.1275551168084433, + "learning_rate": 4.8207922026044515e-08, + "loss": 2.7358, + "step": 47719 + }, + { + "epoch": 2.962319200446955, + "grad_norm": 0.13606571503294096, + "learning_rate": 4.804949912803336e-08, + "loss": 2.7511, + "step": 47720 + }, + { + "epoch": 2.962381277546713, + "grad_norm": 0.13036499294908063, + "learning_rate": 4.789133684147129e-08, + "loss": 2.8325, + "step": 47721 + }, + { + "epoch": 2.962443354646471, + "grad_norm": 0.14266065273196954, + "learning_rate": 4.773343516718542e-08, + "loss": 2.6216, + "step": 47722 + }, + { + "epoch": 2.9625054317462287, + "grad_norm": 0.1330158556104158, + "learning_rate": 4.757579410600843e-08, + "loss": 2.7498, + "step": 47723 + }, + { + "epoch": 2.9625675088459866, + "grad_norm": 0.13591374047136998, + "learning_rate": 4.7418413658745217e-08, + "loss": 2.6686, + "step": 47724 + }, + { + "epoch": 2.9626295859457445, + "grad_norm": 0.13657093231565046, + "learning_rate": 4.726129382623401e-08, + "loss": 2.6552, + "step": 47725 + }, + { + "epoch": 2.9626916630455025, + "grad_norm": 0.1288039061276846, + "learning_rate": 4.710443460928527e-08, + "loss": 2.6815, + "step": 47726 + }, + { + "epoch": 2.9627537401452604, + "grad_norm": 0.13384304210681383, + "learning_rate": 4.6947836008714995e-08, + "loss": 2.6841, + "step": 47727 + }, + { + "epoch": 2.9628158172450183, + "grad_norm": 0.1442982109433443, + "learning_rate": 4.6791498025350325e-08, + "loss": 2.6794, + "step": 47728 + }, + { + "epoch": 2.962877894344776, + "grad_norm": 0.1332566603978507, + "learning_rate": 4.6635420659996156e-08, + "loss": 2.6896, + "step": 47729 + }, + { + "epoch": 2.962939971444534, + "grad_norm": 0.1384523897903806, + "learning_rate": 4.647960391347406e-08, + "loss": 2.6764, + "step": 47730 + }, + { + "epoch": 2.963002048544292, + "grad_norm": 0.12949237363177654, + "learning_rate": 4.6324047786594496e-08, + "loss": 2.7467, + "step": 47731 + }, + { + "epoch": 2.96306412564405, + "grad_norm": 0.128670585396563, + "learning_rate": 4.616875228016793e-08, + "loss": 2.6843, + "step": 47732 + }, + { + "epoch": 2.963126202743808, + "grad_norm": 0.13008101119313373, + "learning_rate": 4.601371739501037e-08, + "loss": 2.6876, + "step": 47733 + }, + { + "epoch": 2.963188279843566, + "grad_norm": 0.1490566554113311, + "learning_rate": 4.5858943131921185e-08, + "loss": 2.7747, + "step": 47734 + }, + { + "epoch": 2.9632503569433237, + "grad_norm": 0.12694593555927414, + "learning_rate": 4.570442949171083e-08, + "loss": 2.7236, + "step": 47735 + }, + { + "epoch": 2.9633124340430816, + "grad_norm": 0.1360693905618403, + "learning_rate": 4.5550176475189776e-08, + "loss": 2.7914, + "step": 47736 + }, + { + "epoch": 2.9633745111428396, + "grad_norm": 0.14163989501908095, + "learning_rate": 4.539618408316293e-08, + "loss": 2.745, + "step": 47737 + }, + { + "epoch": 2.9634365882425975, + "grad_norm": 0.14788313293019262, + "learning_rate": 4.524245231642965e-08, + "loss": 2.6907, + "step": 47738 + }, + { + "epoch": 2.963498665342355, + "grad_norm": 0.12750972363615254, + "learning_rate": 4.508898117579485e-08, + "loss": 2.6754, + "step": 47739 + }, + { + "epoch": 2.9635607424421133, + "grad_norm": 0.13863692716337944, + "learning_rate": 4.4935770662052346e-08, + "loss": 2.6693, + "step": 47740 + }, + { + "epoch": 2.963622819541871, + "grad_norm": 0.1265678501764374, + "learning_rate": 4.478282077601259e-08, + "loss": 2.6823, + "step": 47741 + }, + { + "epoch": 2.963684896641629, + "grad_norm": 0.13424803674095262, + "learning_rate": 4.4630131518463846e-08, + "loss": 2.663, + "step": 47742 + }, + { + "epoch": 2.9637469737413866, + "grad_norm": 0.1328611915753107, + "learning_rate": 4.447770289021103e-08, + "loss": 2.6586, + "step": 47743 + }, + { + "epoch": 2.9638090508411445, + "grad_norm": 0.1332463583046011, + "learning_rate": 4.4325534892042385e-08, + "loss": 2.6913, + "step": 47744 + }, + { + "epoch": 2.9638711279409025, + "grad_norm": 0.12996434399033263, + "learning_rate": 4.417362752475174e-08, + "loss": 2.6718, + "step": 47745 + }, + { + "epoch": 2.9639332050406604, + "grad_norm": 0.14966609367740238, + "learning_rate": 4.402198078913844e-08, + "loss": 2.7891, + "step": 47746 + }, + { + "epoch": 2.9639952821404183, + "grad_norm": 0.12922238829125338, + "learning_rate": 4.3870594685990754e-08, + "loss": 2.7342, + "step": 47747 + }, + { + "epoch": 2.964057359240176, + "grad_norm": 0.1278627234930311, + "learning_rate": 4.371946921609138e-08, + "loss": 2.6711, + "step": 47748 + }, + { + "epoch": 2.964119436339934, + "grad_norm": 0.14219778097103994, + "learning_rate": 4.3568604380239685e-08, + "loss": 2.6847, + "step": 47749 + }, + { + "epoch": 2.964181513439692, + "grad_norm": 0.1317674176209423, + "learning_rate": 4.341800017921838e-08, + "loss": 2.662, + "step": 47750 + }, + { + "epoch": 2.96424359053945, + "grad_norm": 0.14219129028089222, + "learning_rate": 4.326765661381016e-08, + "loss": 2.7836, + "step": 47751 + }, + { + "epoch": 2.964305667639208, + "grad_norm": 0.13825160128966402, + "learning_rate": 4.311757368480329e-08, + "loss": 2.7167, + "step": 47752 + }, + { + "epoch": 2.964367744738966, + "grad_norm": 0.12855427546335152, + "learning_rate": 4.296775139298048e-08, + "loss": 2.7295, + "step": 47753 + }, + { + "epoch": 2.9644298218387237, + "grad_norm": 0.13257406373510744, + "learning_rate": 4.281818973911888e-08, + "loss": 2.661, + "step": 47754 + }, + { + "epoch": 2.9644918989384816, + "grad_norm": 0.14340439539092714, + "learning_rate": 4.266888872400676e-08, + "loss": 2.7754, + "step": 47755 + }, + { + "epoch": 2.9645539760382396, + "grad_norm": 0.14701152046872398, + "learning_rate": 4.251984834841571e-08, + "loss": 2.6446, + "step": 47756 + }, + { + "epoch": 2.9646160531379975, + "grad_norm": 0.1372312544757064, + "learning_rate": 4.237106861312845e-08, + "loss": 2.683, + "step": 47757 + }, + { + "epoch": 2.9646781302377554, + "grad_norm": 0.13314898724770666, + "learning_rate": 4.2222549518916576e-08, + "loss": 2.8088, + "step": 47758 + }, + { + "epoch": 2.9647402073375133, + "grad_norm": 0.12801447090011425, + "learning_rate": 4.20742910665628e-08, + "loss": 2.7462, + "step": 47759 + }, + { + "epoch": 2.9648022844372712, + "grad_norm": 0.13106562601480898, + "learning_rate": 4.192629325682762e-08, + "loss": 2.7244, + "step": 47760 + }, + { + "epoch": 2.964864361537029, + "grad_norm": 0.1356279043027299, + "learning_rate": 4.177855609049375e-08, + "loss": 2.657, + "step": 47761 + }, + { + "epoch": 2.9649264386367866, + "grad_norm": 0.13972394409157812, + "learning_rate": 4.163107956833279e-08, + "loss": 2.7311, + "step": 47762 + }, + { + "epoch": 2.964988515736545, + "grad_norm": 0.13086883311466468, + "learning_rate": 4.148386369110524e-08, + "loss": 2.6672, + "step": 47763 + }, + { + "epoch": 2.9650505928363025, + "grad_norm": 0.13056895068820026, + "learning_rate": 4.1336908459588264e-08, + "loss": 2.6795, + "step": 47764 + }, + { + "epoch": 2.965112669936061, + "grad_norm": 0.14241564650734745, + "learning_rate": 4.119021387454236e-08, + "loss": 2.7105, + "step": 47765 + }, + { + "epoch": 2.9651747470358183, + "grad_norm": 0.14234276510540678, + "learning_rate": 4.1043779936733583e-08, + "loss": 2.7066, + "step": 47766 + }, + { + "epoch": 2.9652368241355767, + "grad_norm": 0.13425662487748943, + "learning_rate": 4.089760664692799e-08, + "loss": 2.6571, + "step": 47767 + }, + { + "epoch": 2.965298901235334, + "grad_norm": 0.13781531495400468, + "learning_rate": 4.075169400588607e-08, + "loss": 2.6637, + "step": 47768 + }, + { + "epoch": 2.965360978335092, + "grad_norm": 0.1375309998614657, + "learning_rate": 4.0606042014373904e-08, + "loss": 2.6881, + "step": 47769 + }, + { + "epoch": 2.96542305543485, + "grad_norm": 0.13699399625499448, + "learning_rate": 4.0460650673146416e-08, + "loss": 2.6345, + "step": 47770 + }, + { + "epoch": 2.965485132534608, + "grad_norm": 0.12898460809835607, + "learning_rate": 4.0315519982964124e-08, + "loss": 2.7593, + "step": 47771 + }, + { + "epoch": 2.965547209634366, + "grad_norm": 0.14011805606144256, + "learning_rate": 4.017064994458197e-08, + "loss": 2.6939, + "step": 47772 + }, + { + "epoch": 2.9656092867341237, + "grad_norm": 0.1340982595404208, + "learning_rate": 4.002604055876047e-08, + "loss": 2.7035, + "step": 47773 + }, + { + "epoch": 2.9656713638338816, + "grad_norm": 0.13345681305338092, + "learning_rate": 3.988169182624901e-08, + "loss": 2.7929, + "step": 47774 + }, + { + "epoch": 2.9657334409336396, + "grad_norm": 0.132372027554698, + "learning_rate": 3.9737603747808104e-08, + "loss": 2.6571, + "step": 47775 + }, + { + "epoch": 2.9657955180333975, + "grad_norm": 0.13564143798684675, + "learning_rate": 3.9593776324181596e-08, + "loss": 2.7157, + "step": 47776 + }, + { + "epoch": 2.9658575951331554, + "grad_norm": 0.1431728917166656, + "learning_rate": 3.945020955611889e-08, + "loss": 2.6701, + "step": 47777 + }, + { + "epoch": 2.9659196722329133, + "grad_norm": 0.1313440754909326, + "learning_rate": 3.930690344437493e-08, + "loss": 2.7045, + "step": 47778 + }, + { + "epoch": 2.9659817493326712, + "grad_norm": 0.12942793792759721, + "learning_rate": 3.9163857989699127e-08, + "loss": 2.7363, + "step": 47779 + }, + { + "epoch": 2.966043826432429, + "grad_norm": 0.12688993201684856, + "learning_rate": 3.902107319282977e-08, + "loss": 2.5777, + "step": 47780 + }, + { + "epoch": 2.966105903532187, + "grad_norm": 0.12855450728991202, + "learning_rate": 3.8878549054516263e-08, + "loss": 2.6758, + "step": 47781 + }, + { + "epoch": 2.966167980631945, + "grad_norm": 0.1332217111233824, + "learning_rate": 3.873628557550246e-08, + "loss": 2.7338, + "step": 47782 + }, + { + "epoch": 2.966230057731703, + "grad_norm": 0.13162195005306726, + "learning_rate": 3.859428275653221e-08, + "loss": 2.6503, + "step": 47783 + }, + { + "epoch": 2.966292134831461, + "grad_norm": 0.13411876678114715, + "learning_rate": 3.845254059833825e-08, + "loss": 2.7866, + "step": 47784 + }, + { + "epoch": 2.9663542119312187, + "grad_norm": 0.13322723371859765, + "learning_rate": 3.831105910166999e-08, + "loss": 2.6596, + "step": 47785 + }, + { + "epoch": 2.9664162890309766, + "grad_norm": 0.12787778544072437, + "learning_rate": 3.816983826726572e-08, + "loss": 2.633, + "step": 47786 + }, + { + "epoch": 2.966478366130734, + "grad_norm": 0.1432471776966364, + "learning_rate": 3.802887809585265e-08, + "loss": 2.696, + "step": 47787 + }, + { + "epoch": 2.9665404432304925, + "grad_norm": 0.14265858326577935, + "learning_rate": 3.788817858816906e-08, + "loss": 2.6922, + "step": 47788 + }, + { + "epoch": 2.96660252033025, + "grad_norm": 0.143570678495003, + "learning_rate": 3.774773974495882e-08, + "loss": 2.6897, + "step": 47789 + }, + { + "epoch": 2.9666645974300083, + "grad_norm": 0.13841998366198302, + "learning_rate": 3.760756156694356e-08, + "loss": 2.6338, + "step": 47790 + }, + { + "epoch": 2.966726674529766, + "grad_norm": 0.12981113090674753, + "learning_rate": 3.746764405485603e-08, + "loss": 2.6301, + "step": 47791 + }, + { + "epoch": 2.9667887516295237, + "grad_norm": 0.1453894511405543, + "learning_rate": 3.7327987209434535e-08, + "loss": 2.7627, + "step": 47792 + }, + { + "epoch": 2.9668508287292816, + "grad_norm": 0.1350560996927942, + "learning_rate": 3.718859103139516e-08, + "loss": 2.5874, + "step": 47793 + }, + { + "epoch": 2.9669129058290395, + "grad_norm": 0.12446597137768843, + "learning_rate": 3.7049455521476204e-08, + "loss": 2.7022, + "step": 47794 + }, + { + "epoch": 2.9669749829287975, + "grad_norm": 0.12933288401251386, + "learning_rate": 3.691058068039932e-08, + "loss": 2.7341, + "step": 47795 + }, + { + "epoch": 2.9670370600285554, + "grad_norm": 0.1302264193053994, + "learning_rate": 3.677196650888615e-08, + "loss": 2.6891, + "step": 47796 + }, + { + "epoch": 2.9670991371283133, + "grad_norm": 0.12961475417487056, + "learning_rate": 3.663361300766388e-08, + "loss": 2.7248, + "step": 47797 + }, + { + "epoch": 2.967161214228071, + "grad_norm": 0.1291299381899754, + "learning_rate": 3.649552017745417e-08, + "loss": 2.6756, + "step": 47798 + }, + { + "epoch": 2.967223291327829, + "grad_norm": 0.13365449890557599, + "learning_rate": 3.63576880189731e-08, + "loss": 2.7654, + "step": 47799 + }, + { + "epoch": 2.967285368427587, + "grad_norm": 0.13513872082546696, + "learning_rate": 3.6220116532947876e-08, + "loss": 2.7194, + "step": 47800 + }, + { + "epoch": 2.967347445527345, + "grad_norm": 0.16480829255292476, + "learning_rate": 3.608280572008904e-08, + "loss": 2.7717, + "step": 47801 + }, + { + "epoch": 2.967409522627103, + "grad_norm": 0.1311957746842993, + "learning_rate": 3.5945755581112685e-08, + "loss": 2.6753, + "step": 47802 + }, + { + "epoch": 2.967471599726861, + "grad_norm": 0.13029485609368524, + "learning_rate": 3.580896611674045e-08, + "loss": 2.7283, + "step": 47803 + }, + { + "epoch": 2.9675336768266187, + "grad_norm": 0.12817196799245473, + "learning_rate": 3.567243732768288e-08, + "loss": 2.7277, + "step": 47804 + }, + { + "epoch": 2.9675957539263766, + "grad_norm": 0.1361567390026866, + "learning_rate": 3.553616921465053e-08, + "loss": 2.6507, + "step": 47805 + }, + { + "epoch": 2.9676578310261346, + "grad_norm": 0.1310093788587014, + "learning_rate": 3.540016177835392e-08, + "loss": 2.7106, + "step": 47806 + }, + { + "epoch": 2.9677199081258925, + "grad_norm": 0.13022045476441574, + "learning_rate": 3.5264415019503616e-08, + "loss": 2.7225, + "step": 47807 + }, + { + "epoch": 2.9677819852256504, + "grad_norm": 0.134622926766026, + "learning_rate": 3.512892893880459e-08, + "loss": 2.7023, + "step": 47808 + }, + { + "epoch": 2.9678440623254083, + "grad_norm": 0.12679833732744208, + "learning_rate": 3.4993703536972954e-08, + "loss": 2.6353, + "step": 47809 + }, + { + "epoch": 2.967906139425166, + "grad_norm": 0.13198197365083839, + "learning_rate": 3.4858738814708136e-08, + "loss": 2.6843, + "step": 47810 + }, + { + "epoch": 2.967968216524924, + "grad_norm": 0.13196493125347813, + "learning_rate": 3.472403477271513e-08, + "loss": 2.7584, + "step": 47811 + }, + { + "epoch": 2.9680302936246816, + "grad_norm": 0.14177392794935778, + "learning_rate": 3.4589591411693375e-08, + "loss": 2.7021, + "step": 47812 + }, + { + "epoch": 2.96809237072444, + "grad_norm": 0.12996142740160163, + "learning_rate": 3.4455408732347873e-08, + "loss": 2.6666, + "step": 47813 + }, + { + "epoch": 2.9681544478241975, + "grad_norm": 0.13004482175174822, + "learning_rate": 3.43214867353836e-08, + "loss": 2.671, + "step": 47814 + }, + { + "epoch": 2.968216524923956, + "grad_norm": 0.1365619032211707, + "learning_rate": 3.4187825421488904e-08, + "loss": 2.7313, + "step": 47815 + }, + { + "epoch": 2.9682786020237133, + "grad_norm": 0.13790922255632856, + "learning_rate": 3.405442479136323e-08, + "loss": 2.7609, + "step": 47816 + }, + { + "epoch": 2.968340679123471, + "grad_norm": 0.13277342866749972, + "learning_rate": 3.392128484571156e-08, + "loss": 2.6558, + "step": 47817 + }, + { + "epoch": 2.968402756223229, + "grad_norm": 0.13301756398757042, + "learning_rate": 3.3788405585216676e-08, + "loss": 2.6089, + "step": 47818 + }, + { + "epoch": 2.968464833322987, + "grad_norm": 0.1313735837057013, + "learning_rate": 3.3655787010578034e-08, + "loss": 2.7164, + "step": 47819 + }, + { + "epoch": 2.968526910422745, + "grad_norm": 0.1622707127107309, + "learning_rate": 3.352342912248952e-08, + "loss": 2.7006, + "step": 47820 + }, + { + "epoch": 2.968588987522503, + "grad_norm": 0.13424120329961353, + "learning_rate": 3.339133192163946e-08, + "loss": 2.8071, + "step": 47821 + }, + { + "epoch": 2.968651064622261, + "grad_norm": 0.1344029342093361, + "learning_rate": 3.32594954087162e-08, + "loss": 2.8689, + "step": 47822 + }, + { + "epoch": 2.9687131417220187, + "grad_norm": 0.12853900296770893, + "learning_rate": 3.3127919584402535e-08, + "loss": 2.6542, + "step": 47823 + }, + { + "epoch": 2.9687752188217766, + "grad_norm": 0.1300076623180717, + "learning_rate": 3.2996604449397897e-08, + "loss": 2.7966, + "step": 47824 + }, + { + "epoch": 2.9688372959215346, + "grad_norm": 0.13043331418817583, + "learning_rate": 3.2865550004373966e-08, + "loss": 2.6921, + "step": 47825 + }, + { + "epoch": 2.9688993730212925, + "grad_norm": 0.1294439545297197, + "learning_rate": 3.273475625002465e-08, + "loss": 2.7284, + "step": 47826 + }, + { + "epoch": 2.9689614501210504, + "grad_norm": 0.14997165084214512, + "learning_rate": 3.260422318702716e-08, + "loss": 2.618, + "step": 47827 + }, + { + "epoch": 2.9690235272208083, + "grad_norm": 0.12778750373387324, + "learning_rate": 3.2473950816064305e-08, + "loss": 2.6809, + "step": 47828 + }, + { + "epoch": 2.9690856043205662, + "grad_norm": 0.13123185963382564, + "learning_rate": 3.2343939137813305e-08, + "loss": 2.689, + "step": 47829 + }, + { + "epoch": 2.969147681420324, + "grad_norm": 0.13905967966712982, + "learning_rate": 3.221418815295696e-08, + "loss": 2.6815, + "step": 47830 + }, + { + "epoch": 2.969209758520082, + "grad_norm": 0.14180872660097585, + "learning_rate": 3.208469786216694e-08, + "loss": 2.6626, + "step": 47831 + }, + { + "epoch": 2.96927183561984, + "grad_norm": 0.1322364797775112, + "learning_rate": 3.19554682661205e-08, + "loss": 2.6649, + "step": 47832 + }, + { + "epoch": 2.969333912719598, + "grad_norm": 0.13814397857352673, + "learning_rate": 3.182649936548932e-08, + "loss": 2.7073, + "step": 47833 + }, + { + "epoch": 2.969395989819356, + "grad_norm": 0.1380509592562821, + "learning_rate": 3.169779116095617e-08, + "loss": 2.6572, + "step": 47834 + }, + { + "epoch": 2.9694580669191133, + "grad_norm": 0.13042245346120201, + "learning_rate": 3.156934365318165e-08, + "loss": 2.6153, + "step": 47835 + }, + { + "epoch": 2.9695201440188717, + "grad_norm": 0.13056327998684789, + "learning_rate": 3.1441156842842986e-08, + "loss": 2.6978, + "step": 47836 + }, + { + "epoch": 2.969582221118629, + "grad_norm": 0.13192618071810436, + "learning_rate": 3.1313230730600775e-08, + "loss": 2.7312, + "step": 47837 + }, + { + "epoch": 2.9696442982183875, + "grad_norm": 0.13443975856511828, + "learning_rate": 3.1185565317132235e-08, + "loss": 2.6872, + "step": 47838 + }, + { + "epoch": 2.969706375318145, + "grad_norm": 0.12929394249025153, + "learning_rate": 3.1058160603097964e-08, + "loss": 2.7369, + "step": 47839 + }, + { + "epoch": 2.969768452417903, + "grad_norm": 0.14923774688189698, + "learning_rate": 3.093101658916409e-08, + "loss": 2.7172, + "step": 47840 + }, + { + "epoch": 2.969830529517661, + "grad_norm": 0.14077850414269863, + "learning_rate": 3.080413327599674e-08, + "loss": 2.6874, + "step": 47841 + }, + { + "epoch": 2.9698926066174187, + "grad_norm": 0.24052711421755804, + "learning_rate": 3.0677510664250954e-08, + "loss": 2.7724, + "step": 47842 + }, + { + "epoch": 2.9699546837171766, + "grad_norm": 0.13014407877209802, + "learning_rate": 3.055114875458731e-08, + "loss": 2.6916, + "step": 47843 + }, + { + "epoch": 2.9700167608169346, + "grad_norm": 0.13438423064043398, + "learning_rate": 3.0425047547677496e-08, + "loss": 2.7359, + "step": 47844 + }, + { + "epoch": 2.9700788379166925, + "grad_norm": 0.14091521357581008, + "learning_rate": 3.0299207044165445e-08, + "loss": 2.6802, + "step": 47845 + }, + { + "epoch": 2.9701409150164504, + "grad_norm": 0.14135166347548336, + "learning_rate": 3.0173627244711736e-08, + "loss": 2.7587, + "step": 47846 + }, + { + "epoch": 2.9702029921162083, + "grad_norm": 0.15347805534135808, + "learning_rate": 3.004830814997694e-08, + "loss": 2.6824, + "step": 47847 + }, + { + "epoch": 2.9702650692159662, + "grad_norm": 0.1312821022579004, + "learning_rate": 2.992324976060501e-08, + "loss": 2.5599, + "step": 47848 + }, + { + "epoch": 2.970327146315724, + "grad_norm": 0.13328671643181458, + "learning_rate": 2.9798452077262062e-08, + "loss": 2.6194, + "step": 47849 + }, + { + "epoch": 2.970389223415482, + "grad_norm": 0.12827340753723274, + "learning_rate": 2.967391510058648e-08, + "loss": 2.6351, + "step": 47850 + }, + { + "epoch": 2.97045130051524, + "grad_norm": 0.1287656508580994, + "learning_rate": 2.9549638831233295e-08, + "loss": 2.6013, + "step": 47851 + }, + { + "epoch": 2.970513377614998, + "grad_norm": 0.13393791994891457, + "learning_rate": 2.9425623269846437e-08, + "loss": 2.7044, + "step": 47852 + }, + { + "epoch": 2.970575454714756, + "grad_norm": 0.1293497259911996, + "learning_rate": 2.9301868417080937e-08, + "loss": 2.6905, + "step": 47853 + }, + { + "epoch": 2.9706375318145137, + "grad_norm": 0.140715093363117, + "learning_rate": 2.917837427358072e-08, + "loss": 2.6762, + "step": 47854 + }, + { + "epoch": 2.9706996089142716, + "grad_norm": 0.13095018452860388, + "learning_rate": 2.9055140839984173e-08, + "loss": 2.7599, + "step": 47855 + }, + { + "epoch": 2.9707616860140296, + "grad_norm": 0.13389439160865857, + "learning_rate": 2.893216811694077e-08, + "loss": 2.6703, + "step": 47856 + }, + { + "epoch": 2.9708237631137875, + "grad_norm": 0.12894681812367678, + "learning_rate": 2.8809456105083344e-08, + "loss": 2.7445, + "step": 47857 + }, + { + "epoch": 2.970885840213545, + "grad_norm": 0.12937010048605024, + "learning_rate": 2.868700480506692e-08, + "loss": 2.7308, + "step": 47858 + }, + { + "epoch": 2.9709479173133033, + "grad_norm": 0.13464313966993213, + "learning_rate": 2.8564814217513225e-08, + "loss": 2.6618, + "step": 47859 + }, + { + "epoch": 2.971009994413061, + "grad_norm": 0.13096617852337308, + "learning_rate": 2.844288434307174e-08, + "loss": 2.6479, + "step": 47860 + }, + { + "epoch": 2.971072071512819, + "grad_norm": 0.14639363339350314, + "learning_rate": 2.8321215182375294e-08, + "loss": 2.7212, + "step": 47861 + }, + { + "epoch": 2.9711341486125766, + "grad_norm": 0.15236857407619697, + "learning_rate": 2.819980673605671e-08, + "loss": 2.7102, + "step": 47862 + }, + { + "epoch": 2.971196225712335, + "grad_norm": 0.1415651998199291, + "learning_rate": 2.807865900474882e-08, + "loss": 2.7162, + "step": 47863 + }, + { + "epoch": 2.9712583028120925, + "grad_norm": 0.13991216286116182, + "learning_rate": 2.795777198909e-08, + "loss": 2.7603, + "step": 47864 + }, + { + "epoch": 2.9713203799118504, + "grad_norm": 0.1352160913921467, + "learning_rate": 2.783714568970197e-08, + "loss": 2.6624, + "step": 47865 + }, + { + "epoch": 2.9713824570116083, + "grad_norm": 0.13690167859457616, + "learning_rate": 2.7716780107223117e-08, + "loss": 2.6627, + "step": 47866 + }, + { + "epoch": 2.971444534111366, + "grad_norm": 0.14017702646115673, + "learning_rate": 2.759667524227516e-08, + "loss": 2.7086, + "step": 47867 + }, + { + "epoch": 2.971506611211124, + "grad_norm": 0.1288847038295371, + "learning_rate": 2.7476831095485376e-08, + "loss": 2.6088, + "step": 47868 + }, + { + "epoch": 2.971568688310882, + "grad_norm": 0.12815126492949136, + "learning_rate": 2.7357247667481045e-08, + "loss": 2.6408, + "step": 47869 + }, + { + "epoch": 2.97163076541064, + "grad_norm": 0.13782721485243507, + "learning_rate": 2.7237924958883887e-08, + "loss": 2.7171, + "step": 47870 + }, + { + "epoch": 2.971692842510398, + "grad_norm": 0.1464717287326766, + "learning_rate": 2.711886297032118e-08, + "loss": 2.6573, + "step": 47871 + }, + { + "epoch": 2.971754919610156, + "grad_norm": 0.13678954199020024, + "learning_rate": 2.7000061702409096e-08, + "loss": 2.8008, + "step": 47872 + }, + { + "epoch": 2.9718169967099137, + "grad_norm": 0.13145054896043434, + "learning_rate": 2.6881521155769364e-08, + "loss": 2.6738, + "step": 47873 + }, + { + "epoch": 2.9718790738096716, + "grad_norm": 0.14639180119237696, + "learning_rate": 2.6763241331018153e-08, + "loss": 2.6402, + "step": 47874 + }, + { + "epoch": 2.9719411509094296, + "grad_norm": 0.1410539627796065, + "learning_rate": 2.664522222877719e-08, + "loss": 2.7595, + "step": 47875 + }, + { + "epoch": 2.9720032280091875, + "grad_norm": 0.14555024801943728, + "learning_rate": 2.6527463849657097e-08, + "loss": 2.7595, + "step": 47876 + }, + { + "epoch": 2.9720653051089454, + "grad_norm": 0.1355186721637743, + "learning_rate": 2.64099661942796e-08, + "loss": 2.7216, + "step": 47877 + }, + { + "epoch": 2.9721273822087033, + "grad_norm": 0.1348893247480061, + "learning_rate": 2.6292729263249772e-08, + "loss": 2.7182, + "step": 47878 + }, + { + "epoch": 2.9721894593084612, + "grad_norm": 0.1293413232772468, + "learning_rate": 2.617575305717823e-08, + "loss": 2.6974, + "step": 47879 + }, + { + "epoch": 2.972251536408219, + "grad_norm": 0.13018377469840853, + "learning_rate": 2.6059037576681157e-08, + "loss": 2.6289, + "step": 47880 + }, + { + "epoch": 2.972313613507977, + "grad_norm": 0.13187382471546194, + "learning_rate": 2.5942582822369166e-08, + "loss": 2.7896, + "step": 47881 + }, + { + "epoch": 2.972375690607735, + "grad_norm": 0.1457986740808315, + "learning_rate": 2.5826388794847334e-08, + "loss": 2.7607, + "step": 47882 + }, + { + "epoch": 2.9724377677074925, + "grad_norm": 0.1490227618020914, + "learning_rate": 2.571045549471518e-08, + "loss": 2.726, + "step": 47883 + }, + { + "epoch": 2.972499844807251, + "grad_norm": 0.14247456800492717, + "learning_rate": 2.5594782922588877e-08, + "loss": 2.6177, + "step": 47884 + }, + { + "epoch": 2.9725619219070083, + "grad_norm": 0.12768698376053145, + "learning_rate": 2.5479371079062398e-08, + "loss": 2.6548, + "step": 47885 + }, + { + "epoch": 2.9726239990067667, + "grad_norm": 0.13236475249800236, + "learning_rate": 2.536421996474081e-08, + "loss": 2.749, + "step": 47886 + }, + { + "epoch": 2.972686076106524, + "grad_norm": 0.12963467033230652, + "learning_rate": 2.5249329580229185e-08, + "loss": 2.5836, + "step": 47887 + }, + { + "epoch": 2.972748153206282, + "grad_norm": 0.1327527627114178, + "learning_rate": 2.5134699926121495e-08, + "loss": 2.5953, + "step": 47888 + }, + { + "epoch": 2.97281023030604, + "grad_norm": 0.13005756868713794, + "learning_rate": 2.5020331003022814e-08, + "loss": 2.7208, + "step": 47889 + }, + { + "epoch": 2.972872307405798, + "grad_norm": 0.1362346183664428, + "learning_rate": 2.4906222811516e-08, + "loss": 2.7251, + "step": 47890 + }, + { + "epoch": 2.972934384505556, + "grad_norm": 0.12863061074168994, + "learning_rate": 2.4792375352211683e-08, + "loss": 2.7475, + "step": 47891 + }, + { + "epoch": 2.9729964616053137, + "grad_norm": 0.12530672171819435, + "learning_rate": 2.467878862569828e-08, + "loss": 2.5805, + "step": 47892 + }, + { + "epoch": 2.9730585387050716, + "grad_norm": 0.13153395505068982, + "learning_rate": 2.4565462632564205e-08, + "loss": 2.7783, + "step": 47893 + }, + { + "epoch": 2.9731206158048296, + "grad_norm": 0.13196603219532854, + "learning_rate": 2.4452397373403434e-08, + "loss": 2.67, + "step": 47894 + }, + { + "epoch": 2.9731826929045875, + "grad_norm": 0.1478534048310848, + "learning_rate": 2.4339592848809934e-08, + "loss": 2.87, + "step": 47895 + }, + { + "epoch": 2.9732447700043454, + "grad_norm": 0.13864696305782226, + "learning_rate": 2.422704905936657e-08, + "loss": 2.7023, + "step": 47896 + }, + { + "epoch": 2.9733068471041033, + "grad_norm": 0.13751819392690293, + "learning_rate": 2.411476600566176e-08, + "loss": 2.666, + "step": 47897 + }, + { + "epoch": 2.9733689242038612, + "grad_norm": 0.13989247160636215, + "learning_rate": 2.4002743688283926e-08, + "loss": 2.6817, + "step": 47898 + }, + { + "epoch": 2.973431001303619, + "grad_norm": 0.13532523054580387, + "learning_rate": 2.3890982107815928e-08, + "loss": 2.6007, + "step": 47899 + }, + { + "epoch": 2.973493078403377, + "grad_norm": 0.13798888679474117, + "learning_rate": 2.3779481264840642e-08, + "loss": 2.7436, + "step": 47900 + }, + { + "epoch": 2.973555155503135, + "grad_norm": 0.13000083523032666, + "learning_rate": 2.3668241159940928e-08, + "loss": 2.6271, + "step": 47901 + }, + { + "epoch": 2.973617232602893, + "grad_norm": 0.12671689282619175, + "learning_rate": 2.355726179369411e-08, + "loss": 2.6, + "step": 47902 + }, + { + "epoch": 2.973679309702651, + "grad_norm": 0.12939625864855228, + "learning_rate": 2.3446543166683042e-08, + "loss": 2.6636, + "step": 47903 + }, + { + "epoch": 2.9737413868024087, + "grad_norm": 0.12997615299755352, + "learning_rate": 2.3336085279485054e-08, + "loss": 2.657, + "step": 47904 + }, + { + "epoch": 2.9738034639021667, + "grad_norm": 0.14942363010266474, + "learning_rate": 2.3225888132671902e-08, + "loss": 2.6841, + "step": 47905 + }, + { + "epoch": 2.973865541001924, + "grad_norm": 0.1323219242811984, + "learning_rate": 2.3115951726820907e-08, + "loss": 2.7301, + "step": 47906 + }, + { + "epoch": 2.9739276181016825, + "grad_norm": 0.13324119297977205, + "learning_rate": 2.300627606250938e-08, + "loss": 2.722, + "step": 47907 + }, + { + "epoch": 2.97398969520144, + "grad_norm": 0.13176796185024758, + "learning_rate": 2.289686114030354e-08, + "loss": 2.6651, + "step": 47908 + }, + { + "epoch": 2.9740517723011983, + "grad_norm": 0.13352414515247904, + "learning_rate": 2.2787706960780698e-08, + "loss": 2.787, + "step": 47909 + }, + { + "epoch": 2.974113849400956, + "grad_norm": 0.13236183251270167, + "learning_rate": 2.267881352450707e-08, + "loss": 2.6498, + "step": 47910 + }, + { + "epoch": 2.974175926500714, + "grad_norm": 0.12847490100244383, + "learning_rate": 2.257018083204332e-08, + "loss": 2.6283, + "step": 47911 + }, + { + "epoch": 2.9742380036004716, + "grad_norm": 0.13039555123761798, + "learning_rate": 2.2461808883972313e-08, + "loss": 2.7098, + "step": 47912 + }, + { + "epoch": 2.9743000807002296, + "grad_norm": 0.1626240247818883, + "learning_rate": 2.235369768084361e-08, + "loss": 2.6914, + "step": 47913 + }, + { + "epoch": 2.9743621577999875, + "grad_norm": 0.12939116995422267, + "learning_rate": 2.2245847223234528e-08, + "loss": 2.6983, + "step": 47914 + }, + { + "epoch": 2.9744242348997454, + "grad_norm": 0.1446120191112881, + "learning_rate": 2.2138257511694627e-08, + "loss": 2.7682, + "step": 47915 + }, + { + "epoch": 2.9744863119995033, + "grad_norm": 0.13091594270495482, + "learning_rate": 2.203092854679567e-08, + "loss": 2.7207, + "step": 47916 + }, + { + "epoch": 2.9745483890992612, + "grad_norm": 0.1358375831596904, + "learning_rate": 2.192386032909277e-08, + "loss": 2.6578, + "step": 47917 + }, + { + "epoch": 2.974610466199019, + "grad_norm": 0.12858170054265985, + "learning_rate": 2.1817052859146593e-08, + "loss": 2.7231, + "step": 47918 + }, + { + "epoch": 2.974672543298777, + "grad_norm": 0.1280102439768181, + "learning_rate": 2.1710506137512242e-08, + "loss": 2.6064, + "step": 47919 + }, + { + "epoch": 2.974734620398535, + "grad_norm": 0.15075160337935015, + "learning_rate": 2.160422016475039e-08, + "loss": 2.6975, + "step": 47920 + }, + { + "epoch": 2.974796697498293, + "grad_norm": 0.12682172136964134, + "learning_rate": 2.1498194941405038e-08, + "loss": 2.6786, + "step": 47921 + }, + { + "epoch": 2.974858774598051, + "grad_norm": 0.13740395095633512, + "learning_rate": 2.13924304680424e-08, + "loss": 2.6124, + "step": 47922 + }, + { + "epoch": 2.9749208516978087, + "grad_norm": 0.14906828894060659, + "learning_rate": 2.128692674520649e-08, + "loss": 2.7696, + "step": 47923 + }, + { + "epoch": 2.9749829287975667, + "grad_norm": 0.13923540590495584, + "learning_rate": 2.118168377345242e-08, + "loss": 2.6422, + "step": 47924 + }, + { + "epoch": 2.9750450058973246, + "grad_norm": 0.13173377705898598, + "learning_rate": 2.1076701553318646e-08, + "loss": 2.696, + "step": 47925 + }, + { + "epoch": 2.9751070829970825, + "grad_norm": 0.13705954311806468, + "learning_rate": 2.0971980085365828e-08, + "loss": 2.7099, + "step": 47926 + }, + { + "epoch": 2.9751691600968404, + "grad_norm": 0.13389432205213678, + "learning_rate": 2.0867519370137977e-08, + "loss": 2.7062, + "step": 47927 + }, + { + "epoch": 2.9752312371965983, + "grad_norm": 0.1452289251235579, + "learning_rate": 2.0763319408173555e-08, + "loss": 2.72, + "step": 47928 + }, + { + "epoch": 2.9752933142963562, + "grad_norm": 0.14114088560654858, + "learning_rate": 2.0659380200016565e-08, + "loss": 2.6827, + "step": 47929 + }, + { + "epoch": 2.975355391396114, + "grad_norm": 0.13670821830512528, + "learning_rate": 2.055570174621657e-08, + "loss": 2.6992, + "step": 47930 + }, + { + "epoch": 2.9754174684958716, + "grad_norm": 0.12861450733478036, + "learning_rate": 2.0452284047312032e-08, + "loss": 2.6395, + "step": 47931 + }, + { + "epoch": 2.97547954559563, + "grad_norm": 0.15211162856420785, + "learning_rate": 2.0349127103835853e-08, + "loss": 2.6894, + "step": 47932 + }, + { + "epoch": 2.9755416226953875, + "grad_norm": 0.14144625203332326, + "learning_rate": 2.0246230916332043e-08, + "loss": 2.7064, + "step": 47933 + }, + { + "epoch": 2.975603699795146, + "grad_norm": 0.14690256823868147, + "learning_rate": 2.014359548533906e-08, + "loss": 2.7383, + "step": 47934 + }, + { + "epoch": 2.9756657768949033, + "grad_norm": 0.12624850234474738, + "learning_rate": 2.0041220811389815e-08, + "loss": 2.6234, + "step": 47935 + }, + { + "epoch": 2.975727853994661, + "grad_norm": 0.13137058497540754, + "learning_rate": 1.9939106895017213e-08, + "loss": 2.6372, + "step": 47936 + }, + { + "epoch": 2.975789931094419, + "grad_norm": 0.13232431024639724, + "learning_rate": 1.983725373675971e-08, + "loss": 2.7356, + "step": 47937 + }, + { + "epoch": 2.975852008194177, + "grad_norm": 0.1348185089949182, + "learning_rate": 1.973566133713911e-08, + "loss": 2.7026, + "step": 47938 + }, + { + "epoch": 2.975914085293935, + "grad_norm": 0.1339230945012237, + "learning_rate": 1.9634329696693875e-08, + "loss": 2.6863, + "step": 47939 + }, + { + "epoch": 2.975976162393693, + "grad_norm": 0.14533790807193442, + "learning_rate": 1.953325881594581e-08, + "loss": 2.7479, + "step": 47940 + }, + { + "epoch": 2.976038239493451, + "grad_norm": 0.13055203771741308, + "learning_rate": 1.943244869542782e-08, + "loss": 2.7119, + "step": 47941 + }, + { + "epoch": 2.9761003165932087, + "grad_norm": 0.13016285499446315, + "learning_rate": 1.9331899335661708e-08, + "loss": 2.7255, + "step": 47942 + }, + { + "epoch": 2.9761623936929666, + "grad_norm": 0.13002169504839223, + "learning_rate": 1.9231610737180382e-08, + "loss": 2.719, + "step": 47943 + }, + { + "epoch": 2.9762244707927246, + "grad_norm": 0.14331228199936813, + "learning_rate": 1.9131582900494548e-08, + "loss": 2.6681, + "step": 47944 + }, + { + "epoch": 2.9762865478924825, + "grad_norm": 0.1428528060212226, + "learning_rate": 1.9031815826137113e-08, + "loss": 2.7335, + "step": 47945 + }, + { + "epoch": 2.9763486249922404, + "grad_norm": 0.14399479852618585, + "learning_rate": 1.8932309514624323e-08, + "loss": 2.789, + "step": 47946 + }, + { + "epoch": 2.9764107020919983, + "grad_norm": 0.14017355165679005, + "learning_rate": 1.8833063966472442e-08, + "loss": 2.7235, + "step": 47947 + }, + { + "epoch": 2.9764727791917562, + "grad_norm": 0.1336054342302364, + "learning_rate": 1.8734079182203268e-08, + "loss": 2.7081, + "step": 47948 + }, + { + "epoch": 2.976534856291514, + "grad_norm": 0.1299050175895347, + "learning_rate": 1.8635355162327507e-08, + "loss": 2.7012, + "step": 47949 + }, + { + "epoch": 2.976596933391272, + "grad_norm": 0.13680974112265004, + "learning_rate": 1.853689190737251e-08, + "loss": 2.6102, + "step": 47950 + }, + { + "epoch": 2.97665901049103, + "grad_norm": 0.13604669241314096, + "learning_rate": 1.8438689417837886e-08, + "loss": 2.6935, + "step": 47951 + }, + { + "epoch": 2.976721087590788, + "grad_norm": 0.13314505623009848, + "learning_rate": 1.834074769423988e-08, + "loss": 2.6537, + "step": 47952 + }, + { + "epoch": 2.976783164690546, + "grad_norm": 0.13250399268630433, + "learning_rate": 1.824306673709475e-08, + "loss": 2.7181, + "step": 47953 + }, + { + "epoch": 2.9768452417903033, + "grad_norm": 0.1328599017766608, + "learning_rate": 1.81456465469132e-08, + "loss": 2.729, + "step": 47954 + }, + { + "epoch": 2.9769073188900617, + "grad_norm": 0.13729425284835012, + "learning_rate": 1.8048487124194823e-08, + "loss": 2.7556, + "step": 47955 + }, + { + "epoch": 2.976969395989819, + "grad_norm": 0.12729949650301553, + "learning_rate": 1.795158846945033e-08, + "loss": 2.6647, + "step": 47956 + }, + { + "epoch": 2.9770314730895775, + "grad_norm": 0.12616135845703477, + "learning_rate": 1.785495058319042e-08, + "loss": 2.6688, + "step": 47957 + }, + { + "epoch": 2.977093550189335, + "grad_norm": 0.13003266803240532, + "learning_rate": 1.7758573465909146e-08, + "loss": 2.6856, + "step": 47958 + }, + { + "epoch": 2.977155627289093, + "grad_norm": 0.14102521874294685, + "learning_rate": 1.7662457118117202e-08, + "loss": 2.6647, + "step": 47959 + }, + { + "epoch": 2.977217704388851, + "grad_norm": 0.1315229940222845, + "learning_rate": 1.756660154031975e-08, + "loss": 2.6635, + "step": 47960 + }, + { + "epoch": 2.9772797814886087, + "grad_norm": 0.13638364732417643, + "learning_rate": 1.747100673300528e-08, + "loss": 2.7162, + "step": 47961 + }, + { + "epoch": 2.9773418585883666, + "grad_norm": 0.1297995651892108, + "learning_rate": 1.7375672696673397e-08, + "loss": 2.7195, + "step": 47962 + }, + { + "epoch": 2.9774039356881246, + "grad_norm": 0.13344002183054385, + "learning_rate": 1.72805994318348e-08, + "loss": 2.7116, + "step": 47963 + }, + { + "epoch": 2.9774660127878825, + "grad_norm": 0.1393999676942446, + "learning_rate": 1.7185786938972438e-08, + "loss": 2.6422, + "step": 47964 + }, + { + "epoch": 2.9775280898876404, + "grad_norm": 0.132599870789977, + "learning_rate": 1.709123521858591e-08, + "loss": 2.6863, + "step": 47965 + }, + { + "epoch": 2.9775901669873983, + "grad_norm": 0.1333228012733408, + "learning_rate": 1.699694427116927e-08, + "loss": 2.6902, + "step": 47966 + }, + { + "epoch": 2.9776522440871562, + "grad_norm": 0.1460076203149803, + "learning_rate": 1.690291409721656e-08, + "loss": 2.7285, + "step": 47967 + }, + { + "epoch": 2.977714321186914, + "grad_norm": 0.13053686339856813, + "learning_rate": 1.6809144697210734e-08, + "loss": 2.7476, + "step": 47968 + }, + { + "epoch": 2.977776398286672, + "grad_norm": 0.12939858340270285, + "learning_rate": 1.6715636071645834e-08, + "loss": 2.7713, + "step": 47969 + }, + { + "epoch": 2.97783847538643, + "grad_norm": 0.13533461741276148, + "learning_rate": 1.6622388221010364e-08, + "loss": 2.7866, + "step": 47970 + }, + { + "epoch": 2.977900552486188, + "grad_norm": 0.15200916252354296, + "learning_rate": 1.6529401145792824e-08, + "loss": 2.6874, + "step": 47971 + }, + { + "epoch": 2.977962629585946, + "grad_norm": 0.14266812083980895, + "learning_rate": 1.6436674846470602e-08, + "loss": 2.6645, + "step": 47972 + }, + { + "epoch": 2.9780247066857037, + "grad_norm": 0.13962804205217896, + "learning_rate": 1.63442093235322e-08, + "loss": 2.7096, + "step": 47973 + }, + { + "epoch": 2.9780867837854617, + "grad_norm": 0.1444114175399004, + "learning_rate": 1.6252004577460566e-08, + "loss": 2.5887, + "step": 47974 + }, + { + "epoch": 2.9781488608852196, + "grad_norm": 0.13961847690825546, + "learning_rate": 1.6160060608738647e-08, + "loss": 2.6712, + "step": 47975 + }, + { + "epoch": 2.9782109379849775, + "grad_norm": 0.13798696999220436, + "learning_rate": 1.6068377417843837e-08, + "loss": 2.7388, + "step": 47976 + }, + { + "epoch": 2.978273015084735, + "grad_norm": 0.1458598110026957, + "learning_rate": 1.5976955005253537e-08, + "loss": 2.7019, + "step": 47977 + }, + { + "epoch": 2.9783350921844933, + "grad_norm": 0.14261150622538613, + "learning_rate": 1.5885793371439582e-08, + "loss": 2.6901, + "step": 47978 + }, + { + "epoch": 2.978397169284251, + "grad_norm": 0.12782848519345735, + "learning_rate": 1.579489251689048e-08, + "loss": 2.7126, + "step": 47979 + }, + { + "epoch": 2.978459246384009, + "grad_norm": 0.12915155882568458, + "learning_rate": 1.570425244207252e-08, + "loss": 2.7432, + "step": 47980 + }, + { + "epoch": 2.9785213234837666, + "grad_norm": 0.13599779227950717, + "learning_rate": 1.5613873147457546e-08, + "loss": 2.686, + "step": 47981 + }, + { + "epoch": 2.978583400583525, + "grad_norm": 0.15864028694186696, + "learning_rate": 1.552375463352296e-08, + "loss": 2.7426, + "step": 47982 + }, + { + "epoch": 2.9786454776832825, + "grad_norm": 0.13119820952398556, + "learning_rate": 1.5433896900729494e-08, + "loss": 2.7466, + "step": 47983 + }, + { + "epoch": 2.9787075547830404, + "grad_norm": 0.13537771707143426, + "learning_rate": 1.5344299949554554e-08, + "loss": 2.7136, + "step": 47984 + }, + { + "epoch": 2.9787696318827983, + "grad_norm": 0.13545813445195257, + "learning_rate": 1.5254963780464427e-08, + "loss": 2.6898, + "step": 47985 + }, + { + "epoch": 2.9788317089825562, + "grad_norm": 0.14230609443884237, + "learning_rate": 1.5165888393919858e-08, + "loss": 2.685, + "step": 47986 + }, + { + "epoch": 2.978893786082314, + "grad_norm": 0.13070962650297585, + "learning_rate": 1.507707379039269e-08, + "loss": 2.6064, + "step": 47987 + }, + { + "epoch": 2.978955863182072, + "grad_norm": 0.1286264692306945, + "learning_rate": 1.4988519970338122e-08, + "loss": 2.7208, + "step": 47988 + }, + { + "epoch": 2.97901794028183, + "grad_norm": 0.13730267757936127, + "learning_rate": 1.490022693422799e-08, + "loss": 2.7848, + "step": 47989 + }, + { + "epoch": 2.979080017381588, + "grad_norm": 0.12472118634907922, + "learning_rate": 1.4812194682511937e-08, + "loss": 2.7203, + "step": 47990 + }, + { + "epoch": 2.979142094481346, + "grad_norm": 0.12753004156108927, + "learning_rate": 1.4724423215661809e-08, + "loss": 2.6727, + "step": 47991 + }, + { + "epoch": 2.9792041715811037, + "grad_norm": 0.1417140711040375, + "learning_rate": 1.4636912534121694e-08, + "loss": 2.6993, + "step": 47992 + }, + { + "epoch": 2.9792662486808617, + "grad_norm": 0.1424372578096193, + "learning_rate": 1.4549662638357887e-08, + "loss": 2.7044, + "step": 47993 + }, + { + "epoch": 2.9793283257806196, + "grad_norm": 0.13905182352764997, + "learning_rate": 1.4462673528825577e-08, + "loss": 2.7262, + "step": 47994 + }, + { + "epoch": 2.9793904028803775, + "grad_norm": 0.13514253182884736, + "learning_rate": 1.4375945205974406e-08, + "loss": 2.7247, + "step": 47995 + }, + { + "epoch": 2.9794524799801354, + "grad_norm": 0.12914579704206416, + "learning_rate": 1.4289477670254014e-08, + "loss": 2.7334, + "step": 47996 + }, + { + "epoch": 2.9795145570798933, + "grad_norm": 0.12702780507755082, + "learning_rate": 1.4203270922125144e-08, + "loss": 2.8143, + "step": 47997 + }, + { + "epoch": 2.9795766341796512, + "grad_norm": 0.12886418754506168, + "learning_rate": 1.4117324962031886e-08, + "loss": 2.6797, + "step": 47998 + }, + { + "epoch": 2.979638711279409, + "grad_norm": 0.13198979900826086, + "learning_rate": 1.4031639790423878e-08, + "loss": 2.6271, + "step": 47999 + }, + { + "epoch": 2.979700788379167, + "grad_norm": 0.14408240990616772, + "learning_rate": 1.3946215407745212e-08, + "loss": 2.7493, + "step": 48000 + }, + { + "epoch": 2.979762865478925, + "grad_norm": 0.12911177636629861, + "learning_rate": 1.3861051814439974e-08, + "loss": 2.688, + "step": 48001 + }, + { + "epoch": 2.9798249425786825, + "grad_norm": 0.14831561806749224, + "learning_rate": 1.3776149010963357e-08, + "loss": 2.7033, + "step": 48002 + }, + { + "epoch": 2.979887019678441, + "grad_norm": 0.1346935549434892, + "learning_rate": 1.369150699774835e-08, + "loss": 2.7078, + "step": 48003 + }, + { + "epoch": 2.9799490967781983, + "grad_norm": 0.16385136369909495, + "learning_rate": 1.3607125775239038e-08, + "loss": 2.6769, + "step": 48004 + }, + { + "epoch": 2.9800111738779567, + "grad_norm": 0.13748501213707331, + "learning_rate": 1.3523005343873962e-08, + "loss": 2.6874, + "step": 48005 + }, + { + "epoch": 2.980073250977714, + "grad_norm": 0.14088864249334032, + "learning_rate": 1.3439145704097211e-08, + "loss": 2.7761, + "step": 48006 + }, + { + "epoch": 2.980135328077472, + "grad_norm": 0.14658473709323264, + "learning_rate": 1.3355546856341771e-08, + "loss": 2.6603, + "step": 48007 + }, + { + "epoch": 2.98019740517723, + "grad_norm": 0.13428617968531156, + "learning_rate": 1.3272208801040631e-08, + "loss": 2.6622, + "step": 48008 + }, + { + "epoch": 2.980259482276988, + "grad_norm": 0.1296345338322623, + "learning_rate": 1.3189131538637878e-08, + "loss": 2.6765, + "step": 48009 + }, + { + "epoch": 2.980321559376746, + "grad_norm": 0.13609098998817523, + "learning_rate": 1.3106315069560947e-08, + "loss": 2.7554, + "step": 48010 + }, + { + "epoch": 2.9803836364765037, + "grad_norm": 0.1409241818375529, + "learning_rate": 1.302375939424838e-08, + "loss": 2.6163, + "step": 48011 + }, + { + "epoch": 2.9804457135762616, + "grad_norm": 0.1281847849204502, + "learning_rate": 1.2941464513116508e-08, + "loss": 2.7012, + "step": 48012 + }, + { + "epoch": 2.9805077906760196, + "grad_norm": 0.13818785994451357, + "learning_rate": 1.285943042660942e-08, + "loss": 2.6464, + "step": 48013 + }, + { + "epoch": 2.9805698677757775, + "grad_norm": 0.12662142188532635, + "learning_rate": 1.2777657135149002e-08, + "loss": 2.7304, + "step": 48014 + }, + { + "epoch": 2.9806319448755354, + "grad_norm": 0.13017380892496427, + "learning_rate": 1.2696144639162689e-08, + "loss": 2.749, + "step": 48015 + }, + { + "epoch": 2.9806940219752933, + "grad_norm": 0.13932484702677383, + "learning_rate": 1.2614892939072364e-08, + "loss": 2.7675, + "step": 48016 + }, + { + "epoch": 2.9807560990750512, + "grad_norm": 0.13343940764767687, + "learning_rate": 1.2533902035311019e-08, + "loss": 2.6647, + "step": 48017 + }, + { + "epoch": 2.980818176174809, + "grad_norm": 0.13643165144309286, + "learning_rate": 1.245317192828943e-08, + "loss": 2.5753, + "step": 48018 + }, + { + "epoch": 2.980880253274567, + "grad_norm": 0.12663992615070993, + "learning_rate": 1.2372702618435039e-08, + "loss": 2.6799, + "step": 48019 + }, + { + "epoch": 2.980942330374325, + "grad_norm": 0.14044308154198434, + "learning_rate": 1.2292494106169727e-08, + "loss": 2.7234, + "step": 48020 + }, + { + "epoch": 2.981004407474083, + "grad_norm": 0.13974425223341444, + "learning_rate": 1.2212546391909829e-08, + "loss": 2.6892, + "step": 48021 + }, + { + "epoch": 2.981066484573841, + "grad_norm": 0.13234768902164684, + "learning_rate": 1.2132859476071678e-08, + "loss": 2.7408, + "step": 48022 + }, + { + "epoch": 2.9811285616735987, + "grad_norm": 0.14359330287236946, + "learning_rate": 1.2053433359066057e-08, + "loss": 2.7133, + "step": 48023 + }, + { + "epoch": 2.9811906387733567, + "grad_norm": 0.1277342752578766, + "learning_rate": 1.1974268041320403e-08, + "loss": 2.7339, + "step": 48024 + }, + { + "epoch": 2.981252715873114, + "grad_norm": 0.12859281088809427, + "learning_rate": 1.1895363523234393e-08, + "loss": 2.8102, + "step": 48025 + }, + { + "epoch": 2.9813147929728725, + "grad_norm": 0.1286206984064807, + "learning_rate": 1.1816719805229915e-08, + "loss": 2.6714, + "step": 48026 + }, + { + "epoch": 2.98137687007263, + "grad_norm": 0.1356761208998603, + "learning_rate": 1.1738336887706647e-08, + "loss": 2.6326, + "step": 48027 + }, + { + "epoch": 2.9814389471723883, + "grad_norm": 0.12623138677660747, + "learning_rate": 1.1660214771086475e-08, + "loss": 2.6846, + "step": 48028 + }, + { + "epoch": 2.981501024272146, + "grad_norm": 0.1282125577995681, + "learning_rate": 1.1582353455763528e-08, + "loss": 2.7031, + "step": 48029 + }, + { + "epoch": 2.981563101371904, + "grad_norm": 0.14548535781772562, + "learning_rate": 1.150475294215414e-08, + "loss": 2.5926, + "step": 48030 + }, + { + "epoch": 2.9816251784716616, + "grad_norm": 0.12992541969542387, + "learning_rate": 1.142741323065799e-08, + "loss": 2.779, + "step": 48031 + }, + { + "epoch": 2.9816872555714196, + "grad_norm": 0.1321451798797794, + "learning_rate": 1.1350334321680311e-08, + "loss": 2.7316, + "step": 48032 + }, + { + "epoch": 2.9817493326711775, + "grad_norm": 0.1274142254603761, + "learning_rate": 1.1273516215626334e-08, + "loss": 2.5752, + "step": 48033 + }, + { + "epoch": 2.9818114097709354, + "grad_norm": 0.13770612038475677, + "learning_rate": 1.1196958912890188e-08, + "loss": 2.7174, + "step": 48034 + }, + { + "epoch": 2.9818734868706933, + "grad_norm": 0.13896620770533635, + "learning_rate": 1.1120662413877104e-08, + "loss": 2.7476, + "step": 48035 + }, + { + "epoch": 2.9819355639704512, + "grad_norm": 0.14169353260748654, + "learning_rate": 1.104462671898121e-08, + "loss": 2.7072, + "step": 48036 + }, + { + "epoch": 2.981997641070209, + "grad_norm": 0.13717718919835928, + "learning_rate": 1.096885182860219e-08, + "loss": 2.6414, + "step": 48037 + }, + { + "epoch": 2.982059718169967, + "grad_norm": 0.13202251371564655, + "learning_rate": 1.0893337743134169e-08, + "loss": 2.7454, + "step": 48038 + }, + { + "epoch": 2.982121795269725, + "grad_norm": 0.1344582119832212, + "learning_rate": 1.0818084462965728e-08, + "loss": 2.6338, + "step": 48039 + }, + { + "epoch": 2.982183872369483, + "grad_norm": 0.13326438993941014, + "learning_rate": 1.0743091988502097e-08, + "loss": 2.5984, + "step": 48040 + }, + { + "epoch": 2.982245949469241, + "grad_norm": 0.14491304026200014, + "learning_rate": 1.0668360320120752e-08, + "loss": 2.6614, + "step": 48041 + }, + { + "epoch": 2.9823080265689987, + "grad_norm": 0.1439024093956242, + "learning_rate": 1.0593889458221374e-08, + "loss": 2.6673, + "step": 48042 + }, + { + "epoch": 2.9823701036687567, + "grad_norm": 0.1290204524850709, + "learning_rate": 1.0519679403186988e-08, + "loss": 2.7497, + "step": 48043 + }, + { + "epoch": 2.9824321807685146, + "grad_norm": 0.1378646512734544, + "learning_rate": 1.0445730155411725e-08, + "loss": 2.6467, + "step": 48044 + }, + { + "epoch": 2.9824942578682725, + "grad_norm": 0.1339458257198653, + "learning_rate": 1.0372041715267511e-08, + "loss": 2.6722, + "step": 48045 + }, + { + "epoch": 2.9825563349680304, + "grad_norm": 0.12955261506373336, + "learning_rate": 1.0298614083154023e-08, + "loss": 2.5964, + "step": 48046 + }, + { + "epoch": 2.9826184120677883, + "grad_norm": 0.13250884938603275, + "learning_rate": 1.0225447259448739e-08, + "loss": 2.6632, + "step": 48047 + }, + { + "epoch": 2.9826804891675462, + "grad_norm": 0.13312935198578885, + "learning_rate": 1.0152541244529135e-08, + "loss": 2.7867, + "step": 48048 + }, + { + "epoch": 2.982742566267304, + "grad_norm": 0.13758657098088958, + "learning_rate": 1.0079896038783787e-08, + "loss": 2.7184, + "step": 48049 + }, + { + "epoch": 2.9828046433670616, + "grad_norm": 0.1413546678871071, + "learning_rate": 1.0007511642584622e-08, + "loss": 2.7155, + "step": 48050 + }, + { + "epoch": 2.98286672046682, + "grad_norm": 0.1320416223526027, + "learning_rate": 9.935388056309115e-09, + "loss": 2.7686, + "step": 48051 + }, + { + "epoch": 2.9829287975665775, + "grad_norm": 0.12917418522128385, + "learning_rate": 9.863525280340292e-09, + "loss": 2.6875, + "step": 48052 + }, + { + "epoch": 2.982990874666336, + "grad_norm": 0.1411583244478704, + "learning_rate": 9.791923315044527e-09, + "loss": 2.6881, + "step": 48053 + }, + { + "epoch": 2.9830529517660933, + "grad_norm": 0.13518810392977645, + "learning_rate": 9.720582160804848e-09, + "loss": 2.6816, + "step": 48054 + }, + { + "epoch": 2.9831150288658512, + "grad_norm": 0.13771685979544446, + "learning_rate": 9.649501817987627e-09, + "loss": 2.6794, + "step": 48055 + }, + { + "epoch": 2.983177105965609, + "grad_norm": 0.13344691025609512, + "learning_rate": 9.57868228695924e-09, + "loss": 2.6653, + "step": 48056 + }, + { + "epoch": 2.983239183065367, + "grad_norm": 0.13758322028841488, + "learning_rate": 9.50812356810271e-09, + "loss": 2.7544, + "step": 48057 + }, + { + "epoch": 2.983301260165125, + "grad_norm": 0.1408670117830572, + "learning_rate": 9.437825661773314e-09, + "loss": 2.6451, + "step": 48058 + }, + { + "epoch": 2.983363337264883, + "grad_norm": 0.1515799473030788, + "learning_rate": 9.367788568342973e-09, + "loss": 2.6986, + "step": 48059 + }, + { + "epoch": 2.983425414364641, + "grad_norm": 0.13354814375594593, + "learning_rate": 9.298012288172509e-09, + "loss": 2.6168, + "step": 48060 + }, + { + "epoch": 2.9834874914643987, + "grad_norm": 0.1283777558347576, + "learning_rate": 9.228496821633848e-09, + "loss": 2.7006, + "step": 48061 + }, + { + "epoch": 2.9835495685641567, + "grad_norm": 0.1289692493583416, + "learning_rate": 9.159242169087811e-09, + "loss": 2.6551, + "step": 48062 + }, + { + "epoch": 2.9836116456639146, + "grad_norm": 0.13151439733114667, + "learning_rate": 9.090248330889672e-09, + "loss": 2.7609, + "step": 48063 + }, + { + "epoch": 2.9836737227636725, + "grad_norm": 0.12824696225478208, + "learning_rate": 9.021515307405803e-09, + "loss": 2.7107, + "step": 48064 + }, + { + "epoch": 2.9837357998634304, + "grad_norm": 0.13519679767770523, + "learning_rate": 8.953043098991476e-09, + "loss": 2.6979, + "step": 48065 + }, + { + "epoch": 2.9837978769631883, + "grad_norm": 0.13070998988390312, + "learning_rate": 8.884831706007512e-09, + "loss": 2.6539, + "step": 48066 + }, + { + "epoch": 2.9838599540629462, + "grad_norm": 0.1315600015307879, + "learning_rate": 8.816881128803634e-09, + "loss": 2.7263, + "step": 48067 + }, + { + "epoch": 2.983922031162704, + "grad_norm": 0.13107521845720965, + "learning_rate": 8.749191367740662e-09, + "loss": 2.7362, + "step": 48068 + }, + { + "epoch": 2.983984108262462, + "grad_norm": 0.1350732002441641, + "learning_rate": 8.681762423168316e-09, + "loss": 2.7083, + "step": 48069 + }, + { + "epoch": 2.98404618536222, + "grad_norm": 0.13951089394411162, + "learning_rate": 8.614594295436318e-09, + "loss": 2.568, + "step": 48070 + }, + { + "epoch": 2.984108262461978, + "grad_norm": 0.12731395208984814, + "learning_rate": 8.547686984899938e-09, + "loss": 2.6736, + "step": 48071 + }, + { + "epoch": 2.984170339561736, + "grad_norm": 0.14546620971108826, + "learning_rate": 8.481040491903347e-09, + "loss": 2.743, + "step": 48072 + }, + { + "epoch": 2.9842324166614933, + "grad_norm": 0.14870098219972347, + "learning_rate": 8.414654816801815e-09, + "loss": 2.7506, + "step": 48073 + }, + { + "epoch": 2.9842944937612517, + "grad_norm": 0.13237785294045712, + "learning_rate": 8.34852995993396e-09, + "loss": 2.7117, + "step": 48074 + }, + { + "epoch": 2.984356570861009, + "grad_norm": 0.1292555368383959, + "learning_rate": 8.282665921649502e-09, + "loss": 2.6151, + "step": 48075 + }, + { + "epoch": 2.9844186479607675, + "grad_norm": 0.14806563452280638, + "learning_rate": 8.217062702287059e-09, + "loss": 2.7822, + "step": 48076 + }, + { + "epoch": 2.984480725060525, + "grad_norm": 0.1377791629636654, + "learning_rate": 8.151720302196352e-09, + "loss": 2.7521, + "step": 48077 + }, + { + "epoch": 2.9845428021602833, + "grad_norm": 0.13138781079115905, + "learning_rate": 8.08663872171045e-09, + "loss": 2.7074, + "step": 48078 + }, + { + "epoch": 2.984604879260041, + "grad_norm": 0.13752948298594397, + "learning_rate": 8.021817961173517e-09, + "loss": 2.7205, + "step": 48079 + }, + { + "epoch": 2.9846669563597987, + "grad_norm": 0.14155968671702196, + "learning_rate": 7.957258020924175e-09, + "loss": 2.7016, + "step": 48080 + }, + { + "epoch": 2.9847290334595566, + "grad_norm": 0.13279163673564326, + "learning_rate": 7.892958901295489e-09, + "loss": 2.6769, + "step": 48081 + }, + { + "epoch": 2.9847911105593146, + "grad_norm": 0.13124131928983357, + "learning_rate": 7.828920602626078e-09, + "loss": 2.7133, + "step": 48082 + }, + { + "epoch": 2.9848531876590725, + "grad_norm": 0.13718065164369414, + "learning_rate": 7.76514312525456e-09, + "loss": 2.6963, + "step": 48083 + }, + { + "epoch": 2.9849152647588304, + "grad_norm": 0.13872908365835576, + "learning_rate": 7.701626469502898e-09, + "loss": 2.7263, + "step": 48084 + }, + { + "epoch": 2.9849773418585883, + "grad_norm": 0.15381225097183474, + "learning_rate": 7.638370635709712e-09, + "loss": 2.6699, + "step": 48085 + }, + { + "epoch": 2.9850394189583462, + "grad_norm": 0.1292573021203966, + "learning_rate": 7.575375624196967e-09, + "loss": 2.7212, + "step": 48086 + }, + { + "epoch": 2.985101496058104, + "grad_norm": 0.13410153063121744, + "learning_rate": 7.512641435308832e-09, + "loss": 2.6808, + "step": 48087 + }, + { + "epoch": 2.985163573157862, + "grad_norm": 0.12810930353640274, + "learning_rate": 7.450168069356167e-09, + "loss": 2.7052, + "step": 48088 + }, + { + "epoch": 2.98522565025762, + "grad_norm": 0.13222342872005177, + "learning_rate": 7.387955526677593e-09, + "loss": 2.7126, + "step": 48089 + }, + { + "epoch": 2.985287727357378, + "grad_norm": 0.1330413179586655, + "learning_rate": 7.326003807589521e-09, + "loss": 2.7844, + "step": 48090 + }, + { + "epoch": 2.985349804457136, + "grad_norm": 0.13164655717336918, + "learning_rate": 7.264312912419469e-09, + "loss": 2.7629, + "step": 48091 + }, + { + "epoch": 2.9854118815568937, + "grad_norm": 0.1284944937241519, + "learning_rate": 7.202882841483849e-09, + "loss": 2.6446, + "step": 48092 + }, + { + "epoch": 2.9854739586566517, + "grad_norm": 0.1309818292702827, + "learning_rate": 7.141713595110177e-09, + "loss": 2.6766, + "step": 48093 + }, + { + "epoch": 2.9855360357564096, + "grad_norm": 0.13508265978039077, + "learning_rate": 7.080805173614868e-09, + "loss": 2.7204, + "step": 48094 + }, + { + "epoch": 2.9855981128561675, + "grad_norm": 0.12933039245193964, + "learning_rate": 7.020157577314335e-09, + "loss": 2.6376, + "step": 48095 + }, + { + "epoch": 2.9856601899559254, + "grad_norm": 0.14216454498784373, + "learning_rate": 6.95977080652499e-09, + "loss": 2.6305, + "step": 48096 + }, + { + "epoch": 2.9857222670556833, + "grad_norm": 0.12854590044849643, + "learning_rate": 6.899644861563248e-09, + "loss": 2.6898, + "step": 48097 + }, + { + "epoch": 2.985784344155441, + "grad_norm": 0.12836508876925254, + "learning_rate": 6.839779742739971e-09, + "loss": 2.6398, + "step": 48098 + }, + { + "epoch": 2.985846421255199, + "grad_norm": 0.14045970526950988, + "learning_rate": 6.780175450371573e-09, + "loss": 2.697, + "step": 48099 + }, + { + "epoch": 2.9859084983549566, + "grad_norm": 0.1292827915462126, + "learning_rate": 6.720831984768916e-09, + "loss": 2.6766, + "step": 48100 + }, + { + "epoch": 2.985970575454715, + "grad_norm": 0.12876971556807845, + "learning_rate": 6.661749346237312e-09, + "loss": 2.6598, + "step": 48101 + }, + { + "epoch": 2.9860326525544725, + "grad_norm": 0.1350380590223348, + "learning_rate": 6.6029275350876215e-09, + "loss": 2.7311, + "step": 48102 + }, + { + "epoch": 2.9860947296542304, + "grad_norm": 0.12798961662574987, + "learning_rate": 6.54436655163071e-09, + "loss": 2.5769, + "step": 48103 + }, + { + "epoch": 2.9861568067539883, + "grad_norm": 0.13166451081817723, + "learning_rate": 6.4860663961663345e-09, + "loss": 2.772, + "step": 48104 + }, + { + "epoch": 2.9862188838537462, + "grad_norm": 0.14077393274629255, + "learning_rate": 6.428027068999809e-09, + "loss": 2.7662, + "step": 48105 + }, + { + "epoch": 2.986280960953504, + "grad_norm": 0.12980875614435394, + "learning_rate": 6.370248570436443e-09, + "loss": 2.6626, + "step": 48106 + }, + { + "epoch": 2.986343038053262, + "grad_norm": 0.12802765276629344, + "learning_rate": 6.312730900770447e-09, + "loss": 2.6692, + "step": 48107 + }, + { + "epoch": 2.98640511515302, + "grad_norm": 0.13063389297265743, + "learning_rate": 6.2554740603126825e-09, + "loss": 2.7212, + "step": 48108 + }, + { + "epoch": 2.986467192252778, + "grad_norm": 0.13870776746216582, + "learning_rate": 6.198478049357359e-09, + "loss": 2.7912, + "step": 48109 + }, + { + "epoch": 2.986529269352536, + "grad_norm": 0.13234867418939802, + "learning_rate": 6.141742868193134e-09, + "loss": 2.7357, + "step": 48110 + }, + { + "epoch": 2.9865913464522937, + "grad_norm": 0.13049294995396518, + "learning_rate": 6.085268517130871e-09, + "loss": 2.7787, + "step": 48111 + }, + { + "epoch": 2.9866534235520517, + "grad_norm": 0.14174903556539373, + "learning_rate": 6.029054996459227e-09, + "loss": 2.6876, + "step": 48112 + }, + { + "epoch": 2.9867155006518096, + "grad_norm": 0.13695789946847234, + "learning_rate": 5.97310230646686e-09, + "loss": 2.636, + "step": 48113 + }, + { + "epoch": 2.9867775777515675, + "grad_norm": 0.1384706380246634, + "learning_rate": 5.917410447447979e-09, + "loss": 2.6876, + "step": 48114 + }, + { + "epoch": 2.9868396548513254, + "grad_norm": 0.12742784216010475, + "learning_rate": 5.861979419696795e-09, + "loss": 2.6031, + "step": 48115 + }, + { + "epoch": 2.9869017319510833, + "grad_norm": 0.12642647002698612, + "learning_rate": 5.806809223501963e-09, + "loss": 2.6521, + "step": 48116 + }, + { + "epoch": 2.9869638090508412, + "grad_norm": 0.13251103519424987, + "learning_rate": 5.751899859146592e-09, + "loss": 2.6729, + "step": 48117 + }, + { + "epoch": 2.987025886150599, + "grad_norm": 0.13240144735219403, + "learning_rate": 5.697251326919339e-09, + "loss": 2.6831, + "step": 48118 + }, + { + "epoch": 2.987087963250357, + "grad_norm": 0.1415919925481285, + "learning_rate": 5.6428636271088635e-09, + "loss": 2.6967, + "step": 48119 + }, + { + "epoch": 2.987150040350115, + "grad_norm": 0.13556760085337577, + "learning_rate": 5.5887367599982695e-09, + "loss": 2.6776, + "step": 48120 + }, + { + "epoch": 2.9872121174498725, + "grad_norm": 0.13183309763922682, + "learning_rate": 5.534870725865115e-09, + "loss": 2.7012, + "step": 48121 + }, + { + "epoch": 2.987274194549631, + "grad_norm": 0.1330420599831668, + "learning_rate": 5.481265524992507e-09, + "loss": 2.6774, + "step": 48122 + }, + { + "epoch": 2.9873362716493883, + "grad_norm": 0.12707245427087732, + "learning_rate": 5.4279211576580004e-09, + "loss": 2.6596, + "step": 48123 + }, + { + "epoch": 2.9873983487491467, + "grad_norm": 0.13483342248304026, + "learning_rate": 5.374837624144702e-09, + "loss": 2.784, + "step": 48124 + }, + { + "epoch": 2.987460425848904, + "grad_norm": 0.1346494547155617, + "learning_rate": 5.322014924730168e-09, + "loss": 2.6117, + "step": 48125 + }, + { + "epoch": 2.9875225029486625, + "grad_norm": 0.12873192762179958, + "learning_rate": 5.269453059686402e-09, + "loss": 2.7038, + "step": 48126 + }, + { + "epoch": 2.98758458004842, + "grad_norm": 0.12934861718103302, + "learning_rate": 5.217152029285411e-09, + "loss": 2.6955, + "step": 48127 + }, + { + "epoch": 2.987646657148178, + "grad_norm": 0.14079190654198898, + "learning_rate": 5.165111833804748e-09, + "loss": 2.648, + "step": 48128 + }, + { + "epoch": 2.987708734247936, + "grad_norm": 0.13706528479116706, + "learning_rate": 5.113332473516419e-09, + "loss": 2.774, + "step": 48129 + }, + { + "epoch": 2.9877708113476937, + "grad_norm": 0.14094809661495517, + "learning_rate": 5.061813948686878e-09, + "loss": 2.6998, + "step": 48130 + }, + { + "epoch": 2.9878328884474517, + "grad_norm": 0.13628238112739716, + "learning_rate": 5.0105562595881286e-09, + "loss": 2.6955, + "step": 48131 + }, + { + "epoch": 2.9878949655472096, + "grad_norm": 0.13907427228968314, + "learning_rate": 4.959559406486625e-09, + "loss": 2.7918, + "step": 48132 + }, + { + "epoch": 2.9879570426469675, + "grad_norm": 0.14123367005239904, + "learning_rate": 4.908823389643269e-09, + "loss": 2.7094, + "step": 48133 + }, + { + "epoch": 2.9880191197467254, + "grad_norm": 0.13979361397267637, + "learning_rate": 4.858348209330066e-09, + "loss": 2.7568, + "step": 48134 + }, + { + "epoch": 2.9880811968464833, + "grad_norm": 0.14467671523055475, + "learning_rate": 4.808133865807917e-09, + "loss": 2.7139, + "step": 48135 + }, + { + "epoch": 2.9881432739462412, + "grad_norm": 0.13393196771493396, + "learning_rate": 4.7581803593377275e-09, + "loss": 2.708, + "step": 48136 + }, + { + "epoch": 2.988205351045999, + "grad_norm": 0.14570226413380447, + "learning_rate": 4.7084876901803965e-09, + "loss": 2.6754, + "step": 48137 + }, + { + "epoch": 2.988267428145757, + "grad_norm": 0.14120791736712984, + "learning_rate": 4.659055858596828e-09, + "loss": 2.6028, + "step": 48138 + }, + { + "epoch": 2.988329505245515, + "grad_norm": 0.13142637283570743, + "learning_rate": 4.609884864842373e-09, + "loss": 2.6727, + "step": 48139 + }, + { + "epoch": 2.988391582345273, + "grad_norm": 0.13489699525835944, + "learning_rate": 4.560974709177934e-09, + "loss": 2.6838, + "step": 48140 + }, + { + "epoch": 2.988453659445031, + "grad_norm": 0.13330045986486125, + "learning_rate": 4.512325391853311e-09, + "loss": 2.7127, + "step": 48141 + }, + { + "epoch": 2.9885157365447887, + "grad_norm": 0.1305925509997277, + "learning_rate": 4.4639369131238566e-09, + "loss": 2.7607, + "step": 48142 + }, + { + "epoch": 2.9885778136445467, + "grad_norm": 0.13201229168140688, + "learning_rate": 4.41580927324492e-09, + "loss": 2.6996, + "step": 48143 + }, + { + "epoch": 2.9886398907443046, + "grad_norm": 0.13166530304239846, + "learning_rate": 4.367942472466302e-09, + "loss": 2.7149, + "step": 48144 + }, + { + "epoch": 2.9887019678440625, + "grad_norm": 0.1314478495609483, + "learning_rate": 4.320336511032252e-09, + "loss": 2.7751, + "step": 48145 + }, + { + "epoch": 2.98876404494382, + "grad_norm": 0.13055073937267336, + "learning_rate": 4.2729913892036736e-09, + "loss": 2.708, + "step": 48146 + }, + { + "epoch": 2.9888261220435783, + "grad_norm": 0.1313014463232442, + "learning_rate": 4.225907107213712e-09, + "loss": 2.7445, + "step": 48147 + }, + { + "epoch": 2.988888199143336, + "grad_norm": 0.13213830112345779, + "learning_rate": 4.179083665312167e-09, + "loss": 2.7235, + "step": 48148 + }, + { + "epoch": 2.988950276243094, + "grad_norm": 0.12731156000839966, + "learning_rate": 4.1325210637488395e-09, + "loss": 2.7573, + "step": 48149 + }, + { + "epoch": 2.9890123533428516, + "grad_norm": 0.12881655173197035, + "learning_rate": 4.086219302762428e-09, + "loss": 2.6438, + "step": 48150 + }, + { + "epoch": 2.9890744304426096, + "grad_norm": 0.13307467697109007, + "learning_rate": 4.040178382597182e-09, + "loss": 2.5973, + "step": 48151 + }, + { + "epoch": 2.9891365075423675, + "grad_norm": 0.14933151258518107, + "learning_rate": 3.994398303491798e-09, + "loss": 2.6536, + "step": 48152 + }, + { + "epoch": 2.9891985846421254, + "grad_norm": 0.12783862651367026, + "learning_rate": 3.9488790656794226e-09, + "loss": 2.6196, + "step": 48153 + }, + { + "epoch": 2.9892606617418833, + "grad_norm": 0.12774271806540596, + "learning_rate": 3.903620669409857e-09, + "loss": 2.7472, + "step": 48154 + }, + { + "epoch": 2.9893227388416412, + "grad_norm": 0.13488906928696087, + "learning_rate": 3.858623114905147e-09, + "loss": 2.676, + "step": 48155 + }, + { + "epoch": 2.989384815941399, + "grad_norm": 0.14226914592619622, + "learning_rate": 3.813886402409539e-09, + "loss": 2.688, + "step": 48156 + }, + { + "epoch": 2.989446893041157, + "grad_norm": 0.1270881081945237, + "learning_rate": 3.769410532156181e-09, + "loss": 2.6847, + "step": 48157 + }, + { + "epoch": 2.989508970140915, + "grad_norm": 0.13916155496530141, + "learning_rate": 3.7251955043726696e-09, + "loss": 2.7207, + "step": 48158 + }, + { + "epoch": 2.989571047240673, + "grad_norm": 0.12758404871590343, + "learning_rate": 3.68124131929215e-09, + "loss": 2.7341, + "step": 48159 + }, + { + "epoch": 2.989633124340431, + "grad_norm": 0.14831986911236222, + "learning_rate": 3.6375479771422194e-09, + "loss": 2.7775, + "step": 48160 + }, + { + "epoch": 2.9896952014401887, + "grad_norm": 0.1420605422806852, + "learning_rate": 3.594115478156024e-09, + "loss": 2.81, + "step": 48161 + }, + { + "epoch": 2.9897572785399467, + "grad_norm": 0.1269190522466686, + "learning_rate": 3.550943822550057e-09, + "loss": 2.5755, + "step": 48162 + }, + { + "epoch": 2.9898193556397046, + "grad_norm": 0.1295218794715353, + "learning_rate": 3.5080330105630167e-09, + "loss": 2.6969, + "step": 48163 + }, + { + "epoch": 2.9898814327394625, + "grad_norm": 0.12901378250146203, + "learning_rate": 3.4653830424058455e-09, + "loss": 2.7024, + "step": 48164 + }, + { + "epoch": 2.9899435098392204, + "grad_norm": 0.15905685014752036, + "learning_rate": 3.42299391831169e-09, + "loss": 2.7052, + "step": 48165 + }, + { + "epoch": 2.9900055869389783, + "grad_norm": 0.1357628305779575, + "learning_rate": 3.3808656384914928e-09, + "loss": 2.6998, + "step": 48166 + }, + { + "epoch": 2.9900676640387363, + "grad_norm": 0.13301042226508109, + "learning_rate": 3.3389982031728493e-09, + "loss": 2.6971, + "step": 48167 + }, + { + "epoch": 2.990129741138494, + "grad_norm": 0.1389700075692676, + "learning_rate": 3.2973916125722536e-09, + "loss": 2.7874, + "step": 48168 + }, + { + "epoch": 2.9901918182382516, + "grad_norm": 0.1300619582133303, + "learning_rate": 3.2560458669061987e-09, + "loss": 2.6571, + "step": 48169 + }, + { + "epoch": 2.99025389533801, + "grad_norm": 0.13698252708848196, + "learning_rate": 3.2149609663911783e-09, + "loss": 2.6356, + "step": 48170 + }, + { + "epoch": 2.9903159724377675, + "grad_norm": 0.12672092771011226, + "learning_rate": 3.174136911243686e-09, + "loss": 2.6657, + "step": 48171 + }, + { + "epoch": 2.990378049537526, + "grad_norm": 0.13763150975728727, + "learning_rate": 3.1335737016691127e-09, + "loss": 2.6563, + "step": 48172 + }, + { + "epoch": 2.9904401266372833, + "grad_norm": 0.13164599829328033, + "learning_rate": 3.093271337883952e-09, + "loss": 2.7119, + "step": 48173 + }, + { + "epoch": 2.9905022037370417, + "grad_norm": 0.14633205759860787, + "learning_rate": 3.053229820099146e-09, + "loss": 2.696, + "step": 48174 + }, + { + "epoch": 2.990564280836799, + "grad_norm": 0.14866425110297157, + "learning_rate": 3.0134491485256377e-09, + "loss": 2.669, + "step": 48175 + }, + { + "epoch": 2.990626357936557, + "grad_norm": 0.13051605023917986, + "learning_rate": 2.973929323363267e-09, + "loss": 2.6619, + "step": 48176 + }, + { + "epoch": 2.990688435036315, + "grad_norm": 0.12960940819177094, + "learning_rate": 2.9346703448285275e-09, + "loss": 2.63, + "step": 48177 + }, + { + "epoch": 2.990750512136073, + "grad_norm": 0.13391006872681077, + "learning_rate": 2.8956722131157076e-09, + "loss": 2.7014, + "step": 48178 + }, + { + "epoch": 2.990812589235831, + "grad_norm": 0.15008793131112733, + "learning_rate": 2.8569349284357504e-09, + "loss": 2.6652, + "step": 48179 + }, + { + "epoch": 2.9908746663355887, + "grad_norm": 0.14211377207835718, + "learning_rate": 2.818458490988496e-09, + "loss": 2.6042, + "step": 48180 + }, + { + "epoch": 2.9909367434353467, + "grad_norm": 0.1445077864184977, + "learning_rate": 2.780242900973784e-09, + "loss": 2.6706, + "step": 48181 + }, + { + "epoch": 2.9909988205351046, + "grad_norm": 0.12758681526417054, + "learning_rate": 2.742288158591455e-09, + "loss": 2.6365, + "step": 48182 + }, + { + "epoch": 2.9910608976348625, + "grad_norm": 0.14404556789316172, + "learning_rate": 2.704594264041349e-09, + "loss": 2.7657, + "step": 48183 + }, + { + "epoch": 2.9911229747346204, + "grad_norm": 0.13289284379644126, + "learning_rate": 2.6671612175177552e-09, + "loss": 2.702, + "step": 48184 + }, + { + "epoch": 2.9911850518343783, + "grad_norm": 0.1276663260644701, + "learning_rate": 2.6299890192205135e-09, + "loss": 2.683, + "step": 48185 + }, + { + "epoch": 2.9912471289341362, + "grad_norm": 0.14011562334840802, + "learning_rate": 2.5930776693328107e-09, + "loss": 2.6956, + "step": 48186 + }, + { + "epoch": 2.991309206033894, + "grad_norm": 0.1393848545927027, + "learning_rate": 2.556427168060038e-09, + "loss": 2.7915, + "step": 48187 + }, + { + "epoch": 2.991371283133652, + "grad_norm": 0.13420421342878033, + "learning_rate": 2.520037515585383e-09, + "loss": 2.6551, + "step": 48188 + }, + { + "epoch": 2.99143336023341, + "grad_norm": 0.13250709930780372, + "learning_rate": 2.4839087121031334e-09, + "loss": 2.7451, + "step": 48189 + }, + { + "epoch": 2.991495437333168, + "grad_norm": 0.13217767293398064, + "learning_rate": 2.448040757796477e-09, + "loss": 2.634, + "step": 48190 + }, + { + "epoch": 2.991557514432926, + "grad_norm": 0.12861116185972865, + "learning_rate": 2.4124336528597024e-09, + "loss": 2.7593, + "step": 48191 + }, + { + "epoch": 2.9916195915326838, + "grad_norm": 0.14124031025219108, + "learning_rate": 2.3770873974704454e-09, + "loss": 2.7815, + "step": 48192 + }, + { + "epoch": 2.9916816686324417, + "grad_norm": 0.1275377384485096, + "learning_rate": 2.342001991817444e-09, + "loss": 2.6893, + "step": 48193 + }, + { + "epoch": 2.991743745732199, + "grad_norm": 0.13769904598287902, + "learning_rate": 2.307177436083885e-09, + "loss": 2.7694, + "step": 48194 + }, + { + "epoch": 2.9918058228319575, + "grad_norm": 0.13298594145131834, + "learning_rate": 2.272613730452955e-09, + "loss": 2.7786, + "step": 48195 + }, + { + "epoch": 2.991867899931715, + "grad_norm": 0.13724174603369338, + "learning_rate": 2.2383108751022897e-09, + "loss": 2.6097, + "step": 48196 + }, + { + "epoch": 2.9919299770314733, + "grad_norm": 0.12943722722261017, + "learning_rate": 2.204268870215076e-09, + "loss": 2.7284, + "step": 48197 + }, + { + "epoch": 2.991992054131231, + "grad_norm": 0.13585647053435454, + "learning_rate": 2.1704877159633985e-09, + "loss": 2.6231, + "step": 48198 + }, + { + "epoch": 2.9920541312309887, + "grad_norm": 0.12804647022018087, + "learning_rate": 2.1369674125193417e-09, + "loss": 2.6739, + "step": 48199 + }, + { + "epoch": 2.9921162083307467, + "grad_norm": 0.14175604581848875, + "learning_rate": 2.1037079600716437e-09, + "loss": 2.6406, + "step": 48200 + }, + { + "epoch": 2.9921782854305046, + "grad_norm": 0.13049673248636212, + "learning_rate": 2.0707093587868377e-09, + "loss": 2.7154, + "step": 48201 + }, + { + "epoch": 2.9922403625302625, + "grad_norm": 0.13235673117898952, + "learning_rate": 2.037971608831457e-09, + "loss": 2.7288, + "step": 48202 + }, + { + "epoch": 2.9923024396300204, + "grad_norm": 0.13022515347241595, + "learning_rate": 2.005494710383138e-09, + "loss": 2.6896, + "step": 48203 + }, + { + "epoch": 2.9923645167297783, + "grad_norm": 0.1342767472687391, + "learning_rate": 1.9732786636139644e-09, + "loss": 2.64, + "step": 48204 + }, + { + "epoch": 2.9924265938295362, + "grad_norm": 0.13209808565311323, + "learning_rate": 1.941323468679368e-09, + "loss": 2.7136, + "step": 48205 + }, + { + "epoch": 2.992488670929294, + "grad_norm": 0.13602342904198048, + "learning_rate": 1.9096291257625355e-09, + "loss": 2.7198, + "step": 48206 + }, + { + "epoch": 2.992550748029052, + "grad_norm": 0.12961186564686944, + "learning_rate": 1.8781956350133466e-09, + "loss": 2.755, + "step": 48207 + }, + { + "epoch": 2.99261282512881, + "grad_norm": 0.12843514099182263, + "learning_rate": 1.8470229966094378e-09, + "loss": 2.6165, + "step": 48208 + }, + { + "epoch": 2.992674902228568, + "grad_norm": 0.14254270401011934, + "learning_rate": 1.8161112107062394e-09, + "loss": 2.7473, + "step": 48209 + }, + { + "epoch": 2.992736979328326, + "grad_norm": 0.13455010194533593, + "learning_rate": 1.7854602774591834e-09, + "loss": 2.7729, + "step": 48210 + }, + { + "epoch": 2.9927990564280837, + "grad_norm": 0.1318363048407064, + "learning_rate": 1.7550701970403538e-09, + "loss": 2.7271, + "step": 48211 + }, + { + "epoch": 2.9928611335278417, + "grad_norm": 0.1479743898338058, + "learning_rate": 1.7249409696051822e-09, + "loss": 2.7753, + "step": 48212 + }, + { + "epoch": 2.9929232106275996, + "grad_norm": 0.12778928929529357, + "learning_rate": 1.6950725953035485e-09, + "loss": 2.6758, + "step": 48213 + }, + { + "epoch": 2.9929852877273575, + "grad_norm": 0.13091393657053832, + "learning_rate": 1.665465074296435e-09, + "loss": 2.7757, + "step": 48214 + }, + { + "epoch": 2.9930473648271154, + "grad_norm": 0.1303666646255045, + "learning_rate": 1.6361184067392731e-09, + "loss": 2.7469, + "step": 48215 + }, + { + "epoch": 2.9931094419268733, + "grad_norm": 0.12968495004513983, + "learning_rate": 1.6070325927874942e-09, + "loss": 2.6876, + "step": 48216 + }, + { + "epoch": 2.993171519026631, + "grad_norm": 0.1311667589913257, + "learning_rate": 1.5782076325854267e-09, + "loss": 2.6569, + "step": 48217 + }, + { + "epoch": 2.993233596126389, + "grad_norm": 0.13250422463114947, + "learning_rate": 1.5496435262885023e-09, + "loss": 2.759, + "step": 48218 + }, + { + "epoch": 2.9932956732261466, + "grad_norm": 0.13777711481043597, + "learning_rate": 1.5213402740466009e-09, + "loss": 2.6825, + "step": 48219 + }, + { + "epoch": 2.993357750325905, + "grad_norm": 0.13146031168004735, + "learning_rate": 1.4932978760040516e-09, + "loss": 2.688, + "step": 48220 + }, + { + "epoch": 2.9934198274256625, + "grad_norm": 0.13407396346434325, + "learning_rate": 1.4655163323051835e-09, + "loss": 2.7245, + "step": 48221 + }, + { + "epoch": 2.993481904525421, + "grad_norm": 0.12988304912574267, + "learning_rate": 1.4379956430998765e-09, + "loss": 2.7343, + "step": 48222 + }, + { + "epoch": 2.9935439816251783, + "grad_norm": 0.1282114827385812, + "learning_rate": 1.4107358085324596e-09, + "loss": 2.6581, + "step": 48223 + }, + { + "epoch": 2.9936060587249362, + "grad_norm": 0.13126144274149687, + "learning_rate": 1.3837368287417108e-09, + "loss": 2.6823, + "step": 48224 + }, + { + "epoch": 2.993668135824694, + "grad_norm": 0.13191301423916676, + "learning_rate": 1.356998703871959e-09, + "loss": 2.674, + "step": 48225 + }, + { + "epoch": 2.993730212924452, + "grad_norm": 0.14015746544573535, + "learning_rate": 1.3305214340564309e-09, + "loss": 2.7102, + "step": 48226 + }, + { + "epoch": 2.99379229002421, + "grad_norm": 0.12986861416645162, + "learning_rate": 1.3043050194394556e-09, + "loss": 2.6621, + "step": 48227 + }, + { + "epoch": 2.993854367123968, + "grad_norm": 0.1321907637353292, + "learning_rate": 1.27834946015426e-09, + "loss": 2.6188, + "step": 48228 + }, + { + "epoch": 2.993916444223726, + "grad_norm": 0.13226135984958104, + "learning_rate": 1.2526547563340707e-09, + "loss": 2.7207, + "step": 48229 + }, + { + "epoch": 2.9939785213234837, + "grad_norm": 0.14610190368787115, + "learning_rate": 1.2272209081232167e-09, + "loss": 2.6304, + "step": 48230 + }, + { + "epoch": 2.9940405984232417, + "grad_norm": 0.12652984627642, + "learning_rate": 1.2020479156438225e-09, + "loss": 2.712, + "step": 48231 + }, + { + "epoch": 2.9941026755229996, + "grad_norm": 0.13172574559795475, + "learning_rate": 1.177135779029115e-09, + "loss": 2.7067, + "step": 48232 + }, + { + "epoch": 2.9941647526227575, + "grad_norm": 0.13039980797175738, + "learning_rate": 1.1524844984178718e-09, + "loss": 2.7292, + "step": 48233 + }, + { + "epoch": 2.9942268297225154, + "grad_norm": 0.13115147116705259, + "learning_rate": 1.1280940739266666e-09, + "loss": 2.6634, + "step": 48234 + }, + { + "epoch": 2.9942889068222733, + "grad_norm": 0.12860906184175377, + "learning_rate": 1.103964505688726e-09, + "loss": 2.7027, + "step": 48235 + }, + { + "epoch": 2.9943509839220313, + "grad_norm": 0.12909758701638296, + "learning_rate": 1.0800957938261747e-09, + "loss": 2.6977, + "step": 48236 + }, + { + "epoch": 2.994413061021789, + "grad_norm": 0.12939828831238506, + "learning_rate": 1.0564879384722392e-09, + "loss": 2.6495, + "step": 48237 + }, + { + "epoch": 2.994475138121547, + "grad_norm": 0.12923105097137, + "learning_rate": 1.0331409397379422e-09, + "loss": 2.7074, + "step": 48238 + }, + { + "epoch": 2.994537215221305, + "grad_norm": 0.1315650771266939, + "learning_rate": 1.0100547977565101e-09, + "loss": 2.7646, + "step": 48239 + }, + { + "epoch": 2.994599292321063, + "grad_norm": 0.1383278838150919, + "learning_rate": 9.872295126389653e-10, + "loss": 2.7406, + "step": 48240 + }, + { + "epoch": 2.994661369420821, + "grad_norm": 0.13469761362406293, + "learning_rate": 9.646650845129834e-10, + "loss": 2.6582, + "step": 48241 + }, + { + "epoch": 2.9947234465205783, + "grad_norm": 0.13853548622925657, + "learning_rate": 9.423615134895869e-10, + "loss": 2.649, + "step": 48242 + }, + { + "epoch": 2.9947855236203367, + "grad_norm": 0.13703367882493214, + "learning_rate": 9.203187996909002e-10, + "loss": 2.7189, + "step": 48243 + }, + { + "epoch": 2.994847600720094, + "grad_norm": 0.15373126917036856, + "learning_rate": 8.985369432223945e-10, + "loss": 2.774, + "step": 48244 + }, + { + "epoch": 2.9949096778198525, + "grad_norm": 0.13120389537305724, + "learning_rate": 8.770159442117453e-10, + "loss": 2.608, + "step": 48245 + }, + { + "epoch": 2.99497175491961, + "grad_norm": 0.1386627812157378, + "learning_rate": 8.55755802758873e-10, + "loss": 2.7186, + "step": 48246 + }, + { + "epoch": 2.995033832019368, + "grad_norm": 0.12842532234749227, + "learning_rate": 8.347565189803508e-10, + "loss": 2.7171, + "step": 48247 + }, + { + "epoch": 2.995095909119126, + "grad_norm": 0.13070811596750911, + "learning_rate": 8.140180929816499e-10, + "loss": 2.7185, + "step": 48248 + }, + { + "epoch": 2.9951579862188837, + "grad_norm": 0.13447806874493584, + "learning_rate": 7.935405248737926e-10, + "loss": 2.6385, + "step": 48249 + }, + { + "epoch": 2.9952200633186417, + "grad_norm": 0.21062672645472533, + "learning_rate": 7.733238147678012e-10, + "loss": 2.6717, + "step": 48250 + }, + { + "epoch": 2.9952821404183996, + "grad_norm": 0.12855327570652048, + "learning_rate": 7.533679627635959e-10, + "loss": 2.7447, + "step": 48251 + }, + { + "epoch": 2.9953442175181575, + "grad_norm": 0.12596200518814224, + "learning_rate": 7.336729689666477e-10, + "loss": 2.7068, + "step": 48252 + }, + { + "epoch": 2.9954062946179154, + "grad_norm": 0.1439897535916374, + "learning_rate": 7.14238833482428e-10, + "loss": 2.693, + "step": 48253 + }, + { + "epoch": 2.9954683717176733, + "grad_norm": 0.12809393436356264, + "learning_rate": 6.950655564053054e-10, + "loss": 2.6956, + "step": 48254 + }, + { + "epoch": 2.9955304488174312, + "grad_norm": 0.13646819452758419, + "learning_rate": 6.761531378407515e-10, + "loss": 2.7203, + "step": 48255 + }, + { + "epoch": 2.995592525917189, + "grad_norm": 0.14311617743898208, + "learning_rate": 6.575015778886862e-10, + "loss": 2.6866, + "step": 48256 + }, + { + "epoch": 2.995654603016947, + "grad_norm": 0.1318287105698764, + "learning_rate": 6.391108766434784e-10, + "loss": 2.7926, + "step": 48257 + }, + { + "epoch": 2.995716680116705, + "grad_norm": 0.13153909537341069, + "learning_rate": 6.20981034199497e-10, + "loss": 2.6687, + "step": 48258 + }, + { + "epoch": 2.995778757216463, + "grad_norm": 0.13716608162697627, + "learning_rate": 6.031120506566623e-10, + "loss": 2.7053, + "step": 48259 + }, + { + "epoch": 2.995840834316221, + "grad_norm": 0.128802655237144, + "learning_rate": 5.85503926103792e-10, + "loss": 2.6769, + "step": 48260 + }, + { + "epoch": 2.9959029114159788, + "grad_norm": 0.1282625599672798, + "learning_rate": 5.681566606352551e-10, + "loss": 2.7406, + "step": 48261 + }, + { + "epoch": 2.9959649885157367, + "grad_norm": 0.12897666539580852, + "learning_rate": 5.510702543398694e-10, + "loss": 2.5973, + "step": 48262 + }, + { + "epoch": 2.9960270656154946, + "grad_norm": 0.13029405553676435, + "learning_rate": 5.342447073064527e-10, + "loss": 2.6638, + "step": 48263 + }, + { + "epoch": 2.9960891427152525, + "grad_norm": 0.129905390390411, + "learning_rate": 5.17680019623823e-10, + "loss": 2.7128, + "step": 48264 + }, + { + "epoch": 2.99615121981501, + "grad_norm": 0.12930583433014092, + "learning_rate": 5.013761913752469e-10, + "loss": 2.6652, + "step": 48265 + }, + { + "epoch": 2.9962132969147683, + "grad_norm": 0.12962357743888794, + "learning_rate": 4.853332226550933e-10, + "loss": 2.7317, + "step": 48266 + }, + { + "epoch": 2.996275374014526, + "grad_norm": 0.12582531802699817, + "learning_rate": 4.695511135355269e-10, + "loss": 2.6236, + "step": 48267 + }, + { + "epoch": 2.996337451114284, + "grad_norm": 0.12953955961890898, + "learning_rate": 4.540298641053653e-10, + "loss": 2.744, + "step": 48268 + }, + { + "epoch": 2.9963995282140417, + "grad_norm": 0.13320368769462515, + "learning_rate": 4.387694744478754e-10, + "loss": 2.7312, + "step": 48269 + }, + { + "epoch": 2.9964616053138, + "grad_norm": 0.12977882036827795, + "learning_rate": 4.2376994463522166e-10, + "loss": 2.6707, + "step": 48270 + }, + { + "epoch": 2.9965236824135575, + "grad_norm": 0.12978153295460862, + "learning_rate": 4.0903127475067083e-10, + "loss": 2.7137, + "step": 48271 + }, + { + "epoch": 2.9965857595133154, + "grad_norm": 0.13002488246619764, + "learning_rate": 3.9455346487193846e-10, + "loss": 2.6597, + "step": 48272 + }, + { + "epoch": 2.9966478366130733, + "grad_norm": 0.12896340721355173, + "learning_rate": 3.8033651507118906e-10, + "loss": 2.7088, + "step": 48273 + }, + { + "epoch": 2.9967099137128312, + "grad_norm": 0.1305202031469189, + "learning_rate": 3.6638042542058713e-10, + "loss": 2.6922, + "step": 48274 + }, + { + "epoch": 2.996771990812589, + "grad_norm": 0.14524120509691718, + "learning_rate": 3.5268519600339945e-10, + "loss": 2.7561, + "step": 48275 + }, + { + "epoch": 2.996834067912347, + "grad_norm": 0.12993058781589742, + "learning_rate": 3.3925082688068823e-10, + "loss": 2.6658, + "step": 48276 + }, + { + "epoch": 2.996896145012105, + "grad_norm": 0.13944806215301414, + "learning_rate": 3.2607731812461795e-10, + "loss": 2.7232, + "step": 48277 + }, + { + "epoch": 2.996958222111863, + "grad_norm": 0.13422136006687374, + "learning_rate": 3.1316466980735315e-10, + "loss": 2.7574, + "step": 48278 + }, + { + "epoch": 2.997020299211621, + "grad_norm": 0.13583066513019154, + "learning_rate": 3.005128819899561e-10, + "loss": 2.7848, + "step": 48279 + }, + { + "epoch": 2.9970823763113787, + "grad_norm": 0.1288764008728946, + "learning_rate": 2.8812195474459124e-10, + "loss": 2.6759, + "step": 48280 + }, + { + "epoch": 2.9971444534111367, + "grad_norm": 0.14176421196484668, + "learning_rate": 2.7599188813787203e-10, + "loss": 2.7124, + "step": 48281 + }, + { + "epoch": 2.9972065305108946, + "grad_norm": 0.12897812400148523, + "learning_rate": 2.6412268222530957e-10, + "loss": 2.598, + "step": 48282 + }, + { + "epoch": 2.9972686076106525, + "grad_norm": 0.1286964370709899, + "learning_rate": 2.5251433707351723e-10, + "loss": 2.6763, + "step": 48283 + }, + { + "epoch": 2.9973306847104104, + "grad_norm": 0.12590799017873167, + "learning_rate": 2.4116685274355733e-10, + "loss": 2.7693, + "step": 48284 + }, + { + "epoch": 2.9973927618101683, + "grad_norm": 0.13063715100240406, + "learning_rate": 2.3008022929094097e-10, + "loss": 2.7094, + "step": 48285 + }, + { + "epoch": 2.9974548389099263, + "grad_norm": 0.13777099044785462, + "learning_rate": 2.1925446677673045e-10, + "loss": 2.7042, + "step": 48286 + }, + { + "epoch": 2.997516916009684, + "grad_norm": 0.13998909727924716, + "learning_rate": 2.086895652564369e-10, + "loss": 2.7123, + "step": 48287 + }, + { + "epoch": 2.997578993109442, + "grad_norm": 0.13105792313062192, + "learning_rate": 1.983855247855715e-10, + "loss": 2.6992, + "step": 48288 + }, + { + "epoch": 2.9976410702092, + "grad_norm": 0.1279530321908834, + "learning_rate": 1.8834234541964534e-10, + "loss": 2.6986, + "step": 48289 + }, + { + "epoch": 2.9977031473089575, + "grad_norm": 0.14078890335118888, + "learning_rate": 1.7856002720861854e-10, + "loss": 2.6859, + "step": 48290 + }, + { + "epoch": 2.997765224408716, + "grad_norm": 0.1358273602583379, + "learning_rate": 1.6903857020245107e-10, + "loss": 2.6679, + "step": 48291 + }, + { + "epoch": 2.9978273015084733, + "grad_norm": 0.13401545561701594, + "learning_rate": 1.59777974451103e-10, + "loss": 2.6192, + "step": 48292 + }, + { + "epoch": 2.9978893786082317, + "grad_norm": 0.13792797479724836, + "learning_rate": 1.5077824000453433e-10, + "loss": 2.7671, + "step": 48293 + }, + { + "epoch": 2.997951455707989, + "grad_norm": 0.13932927880660212, + "learning_rate": 1.4203936691270513e-10, + "loss": 2.6909, + "step": 48294 + }, + { + "epoch": 2.998013532807747, + "grad_norm": 0.12940624112099472, + "learning_rate": 1.3356135521447321e-10, + "loss": 2.6417, + "step": 48295 + }, + { + "epoch": 2.998075609907505, + "grad_norm": 0.13433211856659388, + "learning_rate": 1.2534420495424747e-10, + "loss": 2.6301, + "step": 48296 + }, + { + "epoch": 2.998137687007263, + "grad_norm": 0.13754083883085894, + "learning_rate": 1.1738791618198796e-10, + "loss": 2.6647, + "step": 48297 + }, + { + "epoch": 2.998199764107021, + "grad_norm": 0.16127980203575412, + "learning_rate": 1.0969248893655249e-10, + "loss": 2.7087, + "step": 48298 + }, + { + "epoch": 2.9982618412067787, + "grad_norm": 0.1335307084074136, + "learning_rate": 1.0225792325679884e-10, + "loss": 2.6412, + "step": 48299 + }, + { + "epoch": 2.9983239183065367, + "grad_norm": 0.1344890867014871, + "learning_rate": 9.508421917603372e-11, + "loss": 2.81, + "step": 48300 + }, + { + "epoch": 2.9983859954062946, + "grad_norm": 0.13569997227091496, + "learning_rate": 8.817137674421716e-11, + "loss": 2.5392, + "step": 48301 + }, + { + "epoch": 2.9984480725060525, + "grad_norm": 0.1396654292086329, + "learning_rate": 8.151939598355363e-11, + "loss": 2.7386, + "step": 48302 + }, + { + "epoch": 2.9985101496058104, + "grad_norm": 0.12921802069655386, + "learning_rate": 7.512827693845203e-11, + "loss": 2.7421, + "step": 48303 + }, + { + "epoch": 2.9985722267055683, + "grad_norm": 0.13109769754979453, + "learning_rate": 6.899801964221908e-11, + "loss": 2.6759, + "step": 48304 + }, + { + "epoch": 2.9986343038053263, + "grad_norm": 0.1427863703557055, + "learning_rate": 6.312862412261033e-11, + "loss": 2.6495, + "step": 48305 + }, + { + "epoch": 2.998696380905084, + "grad_norm": 0.12964749351127916, + "learning_rate": 5.752009040738138e-11, + "loss": 2.7405, + "step": 48306 + }, + { + "epoch": 2.998758458004842, + "grad_norm": 0.13640120960156166, + "learning_rate": 5.217241853539001e-11, + "loss": 2.8931, + "step": 48307 + }, + { + "epoch": 2.9988205351046, + "grad_norm": 0.14303300096491486, + "learning_rate": 4.7085608523289584e-11, + "loss": 2.7059, + "step": 48308 + }, + { + "epoch": 2.998882612204358, + "grad_norm": 0.13306664245530753, + "learning_rate": 4.22596604099379e-11, + "loss": 2.6548, + "step": 48309 + }, + { + "epoch": 2.998944689304116, + "grad_norm": 0.1584425397970486, + "learning_rate": 3.76945742119883e-11, + "loss": 2.6922, + "step": 48310 + }, + { + "epoch": 2.9990067664038738, + "grad_norm": 0.13630234794338353, + "learning_rate": 3.339034995164525e-11, + "loss": 2.7491, + "step": 48311 + }, + { + "epoch": 2.9990688435036317, + "grad_norm": 0.12950626803062443, + "learning_rate": 2.934698765666433e-11, + "loss": 2.7404, + "step": 48312 + }, + { + "epoch": 2.999130920603389, + "grad_norm": 0.12893857695083077, + "learning_rate": 2.5564487349249987e-11, + "loss": 2.6346, + "step": 48313 + }, + { + "epoch": 2.9991929977031475, + "grad_norm": 0.134526732077593, + "learning_rate": 2.204284904605558e-11, + "loss": 2.7381, + "step": 48314 + }, + { + "epoch": 2.999255074802905, + "grad_norm": 0.12847408185578083, + "learning_rate": 1.878207276373445e-11, + "loss": 2.6699, + "step": 48315 + }, + { + "epoch": 2.9993171519026633, + "grad_norm": 0.13838662103806768, + "learning_rate": 1.5782158518939937e-11, + "loss": 2.9118, + "step": 48316 + }, + { + "epoch": 2.999379229002421, + "grad_norm": 0.1367726151758742, + "learning_rate": 1.3043106333876509e-11, + "loss": 2.6868, + "step": 48317 + }, + { + "epoch": 2.999441306102179, + "grad_norm": 0.13438779969578046, + "learning_rate": 1.0564916219646392e-11, + "loss": 2.7856, + "step": 48318 + }, + { + "epoch": 2.9995033832019367, + "grad_norm": 0.14764634533112414, + "learning_rate": 8.347588187351818e-12, + "loss": 2.7157, + "step": 48319 + }, + { + "epoch": 2.9995654603016946, + "grad_norm": 0.1290680058277059, + "learning_rate": 6.391122248095016e-12, + "loss": 2.7709, + "step": 48320 + }, + { + "epoch": 2.9996275374014525, + "grad_norm": 0.13377797296605365, + "learning_rate": 4.695518412978217e-12, + "loss": 2.7774, + "step": 48321 + }, + { + "epoch": 2.9996896145012104, + "grad_norm": 0.13070778820750867, + "learning_rate": 3.260776687552536e-12, + "loss": 2.7322, + "step": 48322 + }, + { + "epoch": 2.9997516916009683, + "grad_norm": 0.12900251348813557, + "learning_rate": 2.086897088471318e-12, + "loss": 2.7664, + "step": 48323 + }, + { + "epoch": 2.9998137687007262, + "grad_norm": 0.13436770779761628, + "learning_rate": 1.1738796157345634e-12, + "loss": 2.6893, + "step": 48324 + }, + { + "epoch": 2.999875845800484, + "grad_norm": 0.14590170370222333, + "learning_rate": 5.217242748933871e-13, + "loss": 2.737, + "step": 48325 + }, + { + "epoch": 2.999937922900242, + "grad_norm": 0.13276853955834048, + "learning_rate": 1.3043107149890433e-13, + "loss": 2.691, + "step": 48326 + }, + { + "epoch": 3.0, + "grad_norm": 0.12824245976893525, + "learning_rate": 0.0, + "loss": 2.6147, + "step": 48327 + } + ], + "logging_steps": 1, + "max_steps": 48327, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.9230181811489341e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}