{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 8124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012309207287050715, "grad_norm": 67.18803405761719, "learning_rate": 0.0001, "loss": 1.0674, "step": 1 }, { "epoch": 0.0002461841457410143, "grad_norm": 65.83597564697266, "learning_rate": 0.0002, "loss": 1.3004, "step": 2 }, { "epoch": 0.00036927621861152144, "grad_norm": 1588.667236328125, "learning_rate": 0.0003, "loss": 9.2074, "step": 3 }, { "epoch": 0.0004923682914820286, "grad_norm": 413.274658203125, "learning_rate": 0.0004, "loss": 8.919, "step": 4 }, { "epoch": 0.0006154603643525357, "grad_norm": 544.9371337890625, "learning_rate": 0.0005, "loss": 33.035, "step": 5 }, { "epoch": 0.0007385524372230429, "grad_norm": 1369.8475341796875, "learning_rate": 0.0004999384160610913, "loss": 23.5435, "step": 6 }, { "epoch": 0.0008616445100935499, "grad_norm": 194.28961181640625, "learning_rate": 0.0004998768321221826, "loss": 28.951, "step": 7 }, { "epoch": 0.0009847365829640572, "grad_norm": 61.753360748291016, "learning_rate": 0.0004998152481832738, "loss": 31.007, "step": 8 }, { "epoch": 0.0011078286558345643, "grad_norm": 24.919086456298828, "learning_rate": 0.000499753664244365, "loss": 11.5416, "step": 9 }, { "epoch": 0.0012309207287050715, "grad_norm": 18.152099609375, "learning_rate": 0.0004996920803054563, "loss": 11.2042, "step": 10 }, { "epoch": 0.0013540128015755786, "grad_norm": 11.662219047546387, "learning_rate": 0.0004996304963665476, "loss": 9.8677, "step": 11 }, { "epoch": 0.0014771048744460858, "grad_norm": 6.349699020385742, "learning_rate": 0.0004995689124276389, "loss": 10.9044, "step": 12 }, { "epoch": 0.001600196947316593, "grad_norm": 11.353816032409668, "learning_rate": 0.0004995073284887302, "loss": 11.4622, "step": 13 }, { "epoch": 0.0017232890201870998, "grad_norm": 10.972694396972656, "learning_rate": 
0.0004994457445498214, "loss": 10.694, "step": 14 }, { "epoch": 0.001846381093057607, "grad_norm": 6.9816107749938965, "learning_rate": 0.0004993841606109127, "loss": 9.3735, "step": 15 }, { "epoch": 0.0019694731659281144, "grad_norm": 4.182920932769775, "learning_rate": 0.000499322576672004, "loss": 9.6567, "step": 16 }, { "epoch": 0.0020925652387986213, "grad_norm": 2.884909152984619, "learning_rate": 0.0004992609927330952, "loss": 8.5667, "step": 17 }, { "epoch": 0.0022156573116691287, "grad_norm": 2.9517102241516113, "learning_rate": 0.0004991994087941864, "loss": 8.4023, "step": 18 }, { "epoch": 0.0023387493845396356, "grad_norm": 3.1726136207580566, "learning_rate": 0.0004991378248552778, "loss": 8.7405, "step": 19 }, { "epoch": 0.002461841457410143, "grad_norm": 3.6285274028778076, "learning_rate": 0.000499076240916369, "loss": 8.6383, "step": 20 }, { "epoch": 0.00258493353028065, "grad_norm": 2.3453266620635986, "learning_rate": 0.0004990146569774603, "loss": 8.3686, "step": 21 }, { "epoch": 0.0027080256031511572, "grad_norm": 5.714865207672119, "learning_rate": 0.0004989530730385516, "loss": 8.401, "step": 22 }, { "epoch": 0.002831117676021664, "grad_norm": 5.644321918487549, "learning_rate": 0.0004988914890996428, "loss": 9.0976, "step": 23 }, { "epoch": 0.0029542097488921715, "grad_norm": 1.393264889717102, "learning_rate": 0.000498829905160734, "loss": 8.1006, "step": 24 }, { "epoch": 0.0030773018217626785, "grad_norm": 0.8631675243377686, "learning_rate": 0.0004987683212218253, "loss": 9.4932, "step": 25 }, { "epoch": 0.003200393894633186, "grad_norm": 0.9844844341278076, "learning_rate": 0.0004987067372829166, "loss": 8.7586, "step": 26 }, { "epoch": 0.0033234859675036928, "grad_norm": 1.3519920110702515, "learning_rate": 0.0004986451533440079, "loss": 8.2175, "step": 27 }, { "epoch": 0.0034465780403741997, "grad_norm": 1.7748432159423828, "learning_rate": 0.0004985835694050992, "loss": 7.7755, "step": 28 }, { "epoch": 0.003569670113244707, 
"grad_norm": 1.1122276782989502, "learning_rate": 0.0004985219854661905, "loss": 8.2933, "step": 29 }, { "epoch": 0.003692762186115214, "grad_norm": 2.2256362438201904, "learning_rate": 0.0004984604015272816, "loss": 8.0617, "step": 30 }, { "epoch": 0.0038158542589857214, "grad_norm": 1.0208213329315186, "learning_rate": 0.0004983988175883729, "loss": 8.4026, "step": 31 }, { "epoch": 0.003938946331856229, "grad_norm": 1.7649627923965454, "learning_rate": 0.0004983372336494642, "loss": 8.0613, "step": 32 }, { "epoch": 0.004062038404726735, "grad_norm": 2.430633544921875, "learning_rate": 0.0004982756497105555, "loss": 8.1856, "step": 33 }, { "epoch": 0.004185130477597243, "grad_norm": 1.074214220046997, "learning_rate": 0.0004982140657716468, "loss": 8.141, "step": 34 }, { "epoch": 0.00430822255046775, "grad_norm": 1.8853709697723389, "learning_rate": 0.0004981524818327381, "loss": 7.8964, "step": 35 }, { "epoch": 0.004431314623338257, "grad_norm": 1.248002290725708, "learning_rate": 0.0004980908978938292, "loss": 7.7137, "step": 36 }, { "epoch": 0.004554406696208764, "grad_norm": 0.7906699180603027, "learning_rate": 0.0004980293139549205, "loss": 7.7371, "step": 37 }, { "epoch": 0.004677498769079271, "grad_norm": 0.5662963390350342, "learning_rate": 0.0004979677300160118, "loss": 8.3785, "step": 38 }, { "epoch": 0.0048005908419497785, "grad_norm": 0.9610182642936707, "learning_rate": 0.0004979061460771031, "loss": 8.2624, "step": 39 }, { "epoch": 0.004923682914820286, "grad_norm": 0.99391770362854, "learning_rate": 0.0004978445621381944, "loss": 7.8316, "step": 40 }, { "epoch": 0.005046774987690792, "grad_norm": 0.38373059034347534, "learning_rate": 0.0004977829781992857, "loss": 7.7102, "step": 41 }, { "epoch": 0.0051698670605613, "grad_norm": 1.0226860046386719, "learning_rate": 0.0004977213942603769, "loss": 8.0299, "step": 42 }, { "epoch": 0.005292959133431807, "grad_norm": 0.45533594489097595, "learning_rate": 0.0004976598103214681, "loss": 7.7715, "step": 43 
}, { "epoch": 0.0054160512063023145, "grad_norm": 0.4333859086036682, "learning_rate": 0.0004975982263825594, "loss": 7.8781, "step": 44 }, { "epoch": 0.005539143279172821, "grad_norm": 0.8118147253990173, "learning_rate": 0.0004975366424436507, "loss": 8.0237, "step": 45 }, { "epoch": 0.005662235352043328, "grad_norm": 1.0570735931396484, "learning_rate": 0.000497475058504742, "loss": 8.9348, "step": 46 }, { "epoch": 0.005785327424913836, "grad_norm": 1.386533260345459, "learning_rate": 0.0004974134745658333, "loss": 8.27, "step": 47 }, { "epoch": 0.005908419497784343, "grad_norm": 1.5302668809890747, "learning_rate": 0.0004973518906269245, "loss": 7.8318, "step": 48 }, { "epoch": 0.00603151157065485, "grad_norm": 0.9379439353942871, "learning_rate": 0.0004972903066880158, "loss": 7.6452, "step": 49 }, { "epoch": 0.006154603643525357, "grad_norm": 1.1155481338500977, "learning_rate": 0.000497228722749107, "loss": 7.7851, "step": 50 }, { "epoch": 0.006277695716395864, "grad_norm": 1.3718804121017456, "learning_rate": 0.0004971671388101983, "loss": 7.7412, "step": 51 }, { "epoch": 0.006400787789266372, "grad_norm": 0.9373340606689453, "learning_rate": 0.0004971055548712895, "loss": 7.945, "step": 52 }, { "epoch": 0.006523879862136878, "grad_norm": 0.7068069577217102, "learning_rate": 0.0004970439709323809, "loss": 7.9435, "step": 53 }, { "epoch": 0.0066469719350073855, "grad_norm": 0.5499038100242615, "learning_rate": 0.0004969823869934721, "loss": 7.8493, "step": 54 }, { "epoch": 0.006770064007877893, "grad_norm": 0.9119729399681091, "learning_rate": 0.0004969208030545634, "loss": 7.664, "step": 55 }, { "epoch": 0.006893156080748399, "grad_norm": 0.6819908022880554, "learning_rate": 0.0004968592191156547, "loss": 7.6827, "step": 56 }, { "epoch": 0.007016248153618907, "grad_norm": 1.0215924978256226, "learning_rate": 0.0004967976351767459, "loss": 7.7707, "step": 57 }, { "epoch": 0.007139340226489414, "grad_norm": 1.0048410892486572, "learning_rate": 
0.0004967360512378371, "loss": 8.1011, "step": 58 }, { "epoch": 0.0072624322993599215, "grad_norm": 2.032409191131592, "learning_rate": 0.0004966744672989284, "loss": 7.7723, "step": 59 }, { "epoch": 0.007385524372230428, "grad_norm": 1.6728098392486572, "learning_rate": 0.0004966128833600197, "loss": 7.7826, "step": 60 }, { "epoch": 0.007508616445100935, "grad_norm": 1.3133116960525513, "learning_rate": 0.000496551299421111, "loss": 7.3452, "step": 61 }, { "epoch": 0.007631708517971443, "grad_norm": 2.6686975955963135, "learning_rate": 0.0004964897154822023, "loss": 7.7267, "step": 62 }, { "epoch": 0.00775480059084195, "grad_norm": 2.551522970199585, "learning_rate": 0.0004964281315432935, "loss": 7.9516, "step": 63 }, { "epoch": 0.007877892663712457, "grad_norm": 0.8896097540855408, "learning_rate": 0.0004963665476043847, "loss": 8.0213, "step": 64 }, { "epoch": 0.008000984736582964, "grad_norm": 1.597769856452942, "learning_rate": 0.000496304963665476, "loss": 7.7656, "step": 65 }, { "epoch": 0.00812407680945347, "grad_norm": 1.1121389865875244, "learning_rate": 0.0004962433797265673, "loss": 8.0689, "step": 66 }, { "epoch": 0.008247168882323979, "grad_norm": 0.8726708292961121, "learning_rate": 0.0004961817957876586, "loss": 7.7369, "step": 67 }, { "epoch": 0.008370260955194485, "grad_norm": 1.783186912536621, "learning_rate": 0.0004961202118487499, "loss": 8.548, "step": 68 }, { "epoch": 0.008493353028064993, "grad_norm": 0.7138088345527649, "learning_rate": 0.0004960586279098412, "loss": 8.077, "step": 69 }, { "epoch": 0.0086164451009355, "grad_norm": 1.5829122066497803, "learning_rate": 0.0004959970439709323, "loss": 7.7631, "step": 70 }, { "epoch": 0.008739537173806006, "grad_norm": 1.3023746013641357, "learning_rate": 0.0004959354600320236, "loss": 7.8863, "step": 71 }, { "epoch": 0.008862629246676515, "grad_norm": 0.9675659537315369, "learning_rate": 0.0004958738760931149, "loss": 7.7433, "step": 72 }, { "epoch": 0.008985721319547021, "grad_norm": 
0.8761181831359863, "learning_rate": 0.0004958122921542062, "loss": 8.0624, "step": 73 }, { "epoch": 0.009108813392417528, "grad_norm": 0.6872352361679077, "learning_rate": 0.0004957507082152975, "loss": 7.622, "step": 74 }, { "epoch": 0.009231905465288036, "grad_norm": 0.4918942451477051, "learning_rate": 0.0004956891242763888, "loss": 7.7969, "step": 75 }, { "epoch": 0.009354997538158542, "grad_norm": 1.4368852376937866, "learning_rate": 0.00049562754033748, "loss": 8.0859, "step": 76 }, { "epoch": 0.00947808961102905, "grad_norm": 1.156982421875, "learning_rate": 0.0004955659563985712, "loss": 7.6839, "step": 77 }, { "epoch": 0.009601181683899557, "grad_norm": 1.1094448566436768, "learning_rate": 0.0004955043724596625, "loss": 8.0645, "step": 78 }, { "epoch": 0.009724273756770064, "grad_norm": 1.4256435632705688, "learning_rate": 0.0004954427885207538, "loss": 7.7975, "step": 79 }, { "epoch": 0.009847365829640572, "grad_norm": 0.7130182385444641, "learning_rate": 0.0004953812045818451, "loss": 9.6286, "step": 80 }, { "epoch": 0.009970457902511078, "grad_norm": 1.0782781839370728, "learning_rate": 0.0004953196206429364, "loss": 7.8134, "step": 81 }, { "epoch": 0.010093549975381585, "grad_norm": 0.4732576310634613, "learning_rate": 0.0004952580367040276, "loss": 7.805, "step": 82 }, { "epoch": 0.010216642048252093, "grad_norm": 1.9529255628585815, "learning_rate": 0.0004951964527651188, "loss": 9.3028, "step": 83 }, { "epoch": 0.0103397341211226, "grad_norm": 1.0106065273284912, "learning_rate": 0.0004951348688262101, "loss": 8.1388, "step": 84 }, { "epoch": 0.010462826193993106, "grad_norm": 0.8011863827705383, "learning_rate": 0.0004950732848873014, "loss": 7.464, "step": 85 }, { "epoch": 0.010585918266863614, "grad_norm": 0.9249316453933716, "learning_rate": 0.0004950117009483926, "loss": 8.8035, "step": 86 }, { "epoch": 0.01070901033973412, "grad_norm": 0.4636135697364807, "learning_rate": 0.000494950117009484, "loss": 8.0954, "step": 87 }, { "epoch": 
0.010832102412604629, "grad_norm": 0.3696820139884949, "learning_rate": 0.0004948885330705752, "loss": 8.0594, "step": 88 }, { "epoch": 0.010955194485475135, "grad_norm": 0.701042115688324, "learning_rate": 0.0004948269491316665, "loss": 8.1427, "step": 89 }, { "epoch": 0.011078286558345642, "grad_norm": null, "learning_rate": 0.0004947653651927577, "loss": 10.6119, "step": 90 }, { "epoch": 0.01120137863121615, "grad_norm": 0.8016136288642883, "learning_rate": 0.000494703781253849, "loss": 7.746, "step": 91 }, { "epoch": 0.011324470704086657, "grad_norm": 9.934540748596191, "learning_rate": 0.0004946421973149402, "loss": 12.3912, "step": 92 }, { "epoch": 0.011447562776957163, "grad_norm": 3.9382100105285645, "learning_rate": 0.0004945806133760316, "loss": 8.8184, "step": 93 }, { "epoch": 0.011570654849827671, "grad_norm": 1.8458195924758911, "learning_rate": 0.0004945190294371228, "loss": 8.4923, "step": 94 }, { "epoch": 0.011693746922698178, "grad_norm": 2.1188364028930664, "learning_rate": 0.0004944574454982141, "loss": 7.8212, "step": 95 }, { "epoch": 0.011816838995568686, "grad_norm": 8.921860694885254, "learning_rate": 0.0004943958615593054, "loss": 9.5383, "step": 96 }, { "epoch": 0.011939931068439193, "grad_norm": 5.567935466766357, "learning_rate": 0.0004943342776203966, "loss": 8.518, "step": 97 }, { "epoch": 0.0120630231413097, "grad_norm": 1.862424373626709, "learning_rate": 0.0004942726936814878, "loss": 8.1595, "step": 98 }, { "epoch": 0.012186115214180207, "grad_norm": 2.7225453853607178, "learning_rate": 0.0004942111097425791, "loss": 8.4499, "step": 99 }, { "epoch": 0.012309207287050714, "grad_norm": 2.736734628677368, "learning_rate": 0.0004941495258036704, "loss": 8.0254, "step": 100 }, { "epoch": 0.01243229935992122, "grad_norm": 5.167956829071045, "learning_rate": 0.0004940879418647617, "loss": 8.2687, "step": 101 }, { "epoch": 0.012555391432791729, "grad_norm": 7.4710893630981445, "learning_rate": 0.000494026357925853, "loss": 9.5004, "step": 
102 }, { "epoch": 0.012678483505662235, "grad_norm": 3.2742466926574707, "learning_rate": 0.0004939647739869443, "loss": 7.8542, "step": 103 }, { "epoch": 0.012801575578532743, "grad_norm": 2.616835355758667, "learning_rate": 0.0004939031900480354, "loss": 7.924, "step": 104 }, { "epoch": 0.01292466765140325, "grad_norm": 2.403167486190796, "learning_rate": 0.0004938416061091267, "loss": 8.1872, "step": 105 }, { "epoch": 0.013047759724273756, "grad_norm": 1.349604845046997, "learning_rate": 0.000493780022170218, "loss": 7.5532, "step": 106 }, { "epoch": 0.013170851797144265, "grad_norm": 4.185287952423096, "learning_rate": 0.0004937184382313093, "loss": 8.3643, "step": 107 }, { "epoch": 0.013293943870014771, "grad_norm": 5.292579650878906, "learning_rate": 0.0004936568542924006, "loss": 9.5915, "step": 108 }, { "epoch": 0.013417035942885278, "grad_norm": 2.6628496646881104, "learning_rate": 0.0004935952703534919, "loss": 8.6394, "step": 109 }, { "epoch": 0.013540128015755786, "grad_norm": 2.831953525543213, "learning_rate": 0.000493533686414583, "loss": 8.131, "step": 110 }, { "epoch": 0.013663220088626292, "grad_norm": 3.558497428894043, "learning_rate": 0.0004934721024756743, "loss": 8.0735, "step": 111 }, { "epoch": 0.013786312161496799, "grad_norm": 2.7799148559570312, "learning_rate": 0.0004934105185367656, "loss": 7.8656, "step": 112 }, { "epoch": 0.013909404234367307, "grad_norm": 2.1898086071014404, "learning_rate": 0.0004933489345978569, "loss": 9.0681, "step": 113 }, { "epoch": 0.014032496307237814, "grad_norm": 2.7354018688201904, "learning_rate": 0.0004932873506589482, "loss": 8.2997, "step": 114 }, { "epoch": 0.014155588380108322, "grad_norm": 2.169008255004883, "learning_rate": 0.0004932257667200395, "loss": 9.5147, "step": 115 }, { "epoch": 0.014278680452978828, "grad_norm": 2.779736042022705, "learning_rate": 0.0004931641827811307, "loss": 7.8001, "step": 116 }, { "epoch": 0.014401772525849335, "grad_norm": 3.8463668823242188, "learning_rate": 
0.0004931025988422219, "loss": 7.4348, "step": 117 }, { "epoch": 0.014524864598719843, "grad_norm": 1.6728037595748901, "learning_rate": 0.0004930410149033132, "loss": 7.6556, "step": 118 }, { "epoch": 0.01464795667159035, "grad_norm": 3.7422118186950684, "learning_rate": 0.0004929794309644045, "loss": 8.2815, "step": 119 }, { "epoch": 0.014771048744460856, "grad_norm": 5.163923740386963, "learning_rate": 0.0004929178470254958, "loss": 8.1548, "step": 120 }, { "epoch": 0.014894140817331364, "grad_norm": 2.896857976913452, "learning_rate": 0.0004928562630865871, "loss": 7.8052, "step": 121 }, { "epoch": 0.01501723289020187, "grad_norm": 0.7407937049865723, "learning_rate": 0.0004927946791476783, "loss": 7.871, "step": 122 }, { "epoch": 0.015140324963072379, "grad_norm": 2.1374764442443848, "learning_rate": 0.0004927330952087696, "loss": 7.6314, "step": 123 }, { "epoch": 0.015263417035942885, "grad_norm": 0.5735248327255249, "learning_rate": 0.0004926715112698608, "loss": 8.2982, "step": 124 }, { "epoch": 0.015386509108813392, "grad_norm": 0.6375786662101746, "learning_rate": 0.0004926099273309521, "loss": 7.379, "step": 125 }, { "epoch": 0.0155096011816839, "grad_norm": 1.8949096202850342, "learning_rate": 0.0004925483433920433, "loss": 8.1135, "step": 126 }, { "epoch": 0.01563269325455441, "grad_norm": 1.7778562307357788, "learning_rate": 0.0004924867594531347, "loss": 7.7228, "step": 127 }, { "epoch": 0.015755785327424915, "grad_norm": null, "learning_rate": 0.0004924251755142259, "loss": 7.4158, "step": 128 }, { "epoch": 0.01587887740029542, "grad_norm": 0.5879265666007996, "learning_rate": 0.0004923635915753172, "loss": 7.5777, "step": 129 }, { "epoch": 0.016001969473165928, "grad_norm": 1.96914803981781, "learning_rate": 0.0004923020076364085, "loss": 7.7597, "step": 130 }, { "epoch": 0.016125061546036434, "grad_norm": 4.1608099937438965, "learning_rate": 0.0004922404236974997, "loss": 8.5917, "step": 131 }, { "epoch": 0.01624815361890694, "grad_norm": 
2.906613826751709, "learning_rate": 0.0004921788397585909, "loss": 8.6265, "step": 132 }, { "epoch": 0.01637124569177745, "grad_norm": 4.361425399780273, "learning_rate": 0.0004921172558196822, "loss": 8.3542, "step": 133 }, { "epoch": 0.016494337764647957, "grad_norm": 4.758860111236572, "learning_rate": 0.0004920556718807735, "loss": 8.5501, "step": 134 }, { "epoch": 0.016617429837518464, "grad_norm": 1.4529486894607544, "learning_rate": 0.0004919940879418648, "loss": 8.2007, "step": 135 }, { "epoch": 0.01674052191038897, "grad_norm": 2.196946382522583, "learning_rate": 0.0004919325040029561, "loss": 9.0115, "step": 136 }, { "epoch": 0.016863613983259477, "grad_norm": 2.4166197776794434, "learning_rate": 0.0004918709200640473, "loss": 9.0066, "step": 137 }, { "epoch": 0.016986706056129987, "grad_norm": 2.11981201171875, "learning_rate": 0.0004918093361251385, "loss": 7.7991, "step": 138 }, { "epoch": 0.017109798129000493, "grad_norm": 5.232426643371582, "learning_rate": 0.0004917477521862298, "loss": 8.9869, "step": 139 }, { "epoch": 0.017232890201871, "grad_norm": 5.863483428955078, "learning_rate": 0.0004916861682473211, "loss": 8.4794, "step": 140 }, { "epoch": 0.017355982274741506, "grad_norm": 3.181109666824341, "learning_rate": 0.0004916245843084124, "loss": 8.0508, "step": 141 }, { "epoch": 0.017479074347612013, "grad_norm": 1.1101082563400269, "learning_rate": 0.0004915630003695037, "loss": 7.7267, "step": 142 }, { "epoch": 0.01760216642048252, "grad_norm": 1.5619341135025024, "learning_rate": 0.000491501416430595, "loss": 7.8597, "step": 143 }, { "epoch": 0.01772525849335303, "grad_norm": 1.109195590019226, "learning_rate": 0.0004914398324916861, "loss": 7.5139, "step": 144 }, { "epoch": 0.017848350566223536, "grad_norm": 2.072960138320923, "learning_rate": 0.0004913782485527774, "loss": 8.3308, "step": 145 }, { "epoch": 0.017971442639094042, "grad_norm": 2.7134573459625244, "learning_rate": 0.0004913166646138687, "loss": 8.2893, "step": 146 }, { 
"epoch": 0.01809453471196455, "grad_norm": 1.6227017641067505, "learning_rate": 0.00049125508067496, "loss": 7.6635, "step": 147 }, { "epoch": 0.018217626784835055, "grad_norm": 1.3147410154342651, "learning_rate": 0.0004911934967360513, "loss": 7.6619, "step": 148 }, { "epoch": 0.018340718857705565, "grad_norm": 1.6304398775100708, "learning_rate": 0.0004911319127971426, "loss": 7.708, "step": 149 }, { "epoch": 0.01846381093057607, "grad_norm": 0.927314281463623, "learning_rate": 0.0004910703288582337, "loss": 8.0661, "step": 150 }, { "epoch": 0.018586903003446578, "grad_norm": 1.384406328201294, "learning_rate": 0.000491008744919325, "loss": 7.8715, "step": 151 }, { "epoch": 0.018709995076317085, "grad_norm": 0.9301437139511108, "learning_rate": 0.0004909471609804163, "loss": 7.9151, "step": 152 }, { "epoch": 0.01883308714918759, "grad_norm": 0.8856925368309021, "learning_rate": 0.0004908855770415076, "loss": 8.0011, "step": 153 }, { "epoch": 0.0189561792220581, "grad_norm": 1.0197592973709106, "learning_rate": 0.0004908239931025989, "loss": 7.6881, "step": 154 }, { "epoch": 0.019079271294928608, "grad_norm": 0.747929036617279, "learning_rate": 0.0004907624091636902, "loss": 7.6387, "step": 155 }, { "epoch": 0.019202363367799114, "grad_norm": 1.1613069772720337, "learning_rate": 0.0004907008252247814, "loss": 7.998, "step": 156 }, { "epoch": 0.01932545544066962, "grad_norm": 0.40295490622520447, "learning_rate": 0.0004906392412858726, "loss": 7.4298, "step": 157 }, { "epoch": 0.019448547513540127, "grad_norm": 0.7746351957321167, "learning_rate": 0.0004905776573469639, "loss": 8.1046, "step": 158 }, { "epoch": 0.019571639586410634, "grad_norm": 1.664730429649353, "learning_rate": 0.0004905160734080552, "loss": 7.4666, "step": 159 }, { "epoch": 0.019694731659281144, "grad_norm": 1.1430929899215698, "learning_rate": 0.0004904544894691464, "loss": 7.6992, "step": 160 }, { "epoch": 0.01981782373215165, "grad_norm": 0.8084683418273926, "learning_rate": 
0.0004903929055302378, "loss": 8.0732, "step": 161 }, { "epoch": 0.019940915805022157, "grad_norm": 1.0873490571975708, "learning_rate": 0.000490331321591329, "loss": 7.9313, "step": 162 }, { "epoch": 0.020064007877892663, "grad_norm": 1.0168075561523438, "learning_rate": 0.0004902697376524203, "loss": 9.1689, "step": 163 }, { "epoch": 0.02018709995076317, "grad_norm": 1.718609094619751, "learning_rate": 0.0004902081537135115, "loss": 7.4564, "step": 164 }, { "epoch": 0.02031019202363368, "grad_norm": 1.4576982259750366, "learning_rate": 0.0004901465697746028, "loss": 7.9774, "step": 165 }, { "epoch": 0.020433284096504186, "grad_norm": 0.7225480675697327, "learning_rate": 0.000490084985835694, "loss": 7.8935, "step": 166 }, { "epoch": 0.020556376169374693, "grad_norm": 0.767625629901886, "learning_rate": 0.0004900234018967853, "loss": 7.7945, "step": 167 }, { "epoch": 0.0206794682422452, "grad_norm": 1.499491572380066, "learning_rate": 0.0004899618179578766, "loss": 7.5844, "step": 168 }, { "epoch": 0.020802560315115706, "grad_norm": 3.1662116050720215, "learning_rate": 0.0004899002340189679, "loss": 10.3839, "step": 169 }, { "epoch": 0.020925652387986212, "grad_norm": 0.9420777559280396, "learning_rate": 0.0004898386500800592, "loss": 7.6332, "step": 170 }, { "epoch": 0.021048744460856722, "grad_norm": 1.1049147844314575, "learning_rate": 0.0004897770661411504, "loss": 7.8463, "step": 171 }, { "epoch": 0.02117183653372723, "grad_norm": 0.4747467041015625, "learning_rate": 0.0004897154822022416, "loss": 7.8231, "step": 172 }, { "epoch": 0.021294928606597735, "grad_norm": 0.8083981871604919, "learning_rate": 0.0004896538982633329, "loss": 7.6605, "step": 173 }, { "epoch": 0.02141802067946824, "grad_norm": 0.7175167798995972, "learning_rate": 0.0004895923143244242, "loss": 7.6152, "step": 174 }, { "epoch": 0.021541112752338748, "grad_norm": 0.6376844644546509, "learning_rate": 0.0004895307303855155, "loss": 7.9181, "step": 175 }, { "epoch": 0.021664204825209258, 
"grad_norm": 0.5885112285614014, "learning_rate": 0.0004894691464466068, "loss": 8.4683, "step": 176 }, { "epoch": 0.021787296898079764, "grad_norm": 0.9872096180915833, "learning_rate": 0.000489407562507698, "loss": 8.8505, "step": 177 }, { "epoch": 0.02191038897095027, "grad_norm": 1.0755162239074707, "learning_rate": 0.0004893459785687892, "loss": 7.6958, "step": 178 }, { "epoch": 0.022033481043820777, "grad_norm": 0.8951033353805542, "learning_rate": 0.0004892843946298805, "loss": 7.7329, "step": 179 }, { "epoch": 0.022156573116691284, "grad_norm": 1.6454609632492065, "learning_rate": 0.0004892228106909718, "loss": 7.9179, "step": 180 }, { "epoch": 0.022279665189561794, "grad_norm": 1.5746022462844849, "learning_rate": 0.0004891612267520631, "loss": 7.9517, "step": 181 }, { "epoch": 0.0224027572624323, "grad_norm": 0.5614102482795715, "learning_rate": 0.0004890996428131544, "loss": 7.453, "step": 182 }, { "epoch": 0.022525849335302807, "grad_norm": 0.4619376063346863, "learning_rate": 0.0004890380588742457, "loss": 7.7688, "step": 183 }, { "epoch": 0.022648941408173313, "grad_norm": 0.7536031603813171, "learning_rate": 0.0004889764749353368, "loss": 7.8874, "step": 184 }, { "epoch": 0.02277203348104382, "grad_norm": 0.5676535367965698, "learning_rate": 0.0004889148909964281, "loss": 7.8151, "step": 185 }, { "epoch": 0.022895125553914326, "grad_norm": 0.5502355098724365, "learning_rate": 0.0004888533070575194, "loss": 7.6636, "step": 186 }, { "epoch": 0.023018217626784836, "grad_norm": 0.37870335578918457, "learning_rate": 0.0004887917231186107, "loss": 7.4391, "step": 187 }, { "epoch": 0.023141309699655343, "grad_norm": 0.3620382249355316, "learning_rate": 0.000488730139179702, "loss": 7.7492, "step": 188 }, { "epoch": 0.02326440177252585, "grad_norm": 0.39196500182151794, "learning_rate": 0.0004886685552407933, "loss": 7.7763, "step": 189 }, { "epoch": 0.023387493845396356, "grad_norm": 0.9361664652824402, "learning_rate": 0.0004886069713018845, "loss": 
7.4523, "step": 190 }, { "epoch": 0.023510585918266862, "grad_norm": 0.7639418244361877, "learning_rate": 0.0004885453873629757, "loss": 7.667, "step": 191 }, { "epoch": 0.023633677991137372, "grad_norm": 1.008725643157959, "learning_rate": 0.000488483803424067, "loss": 8.0597, "step": 192 }, { "epoch": 0.02375677006400788, "grad_norm": 0.7230353355407715, "learning_rate": 0.0004884222194851583, "loss": 8.2973, "step": 193 }, { "epoch": 0.023879862136878385, "grad_norm": 0.7845500111579895, "learning_rate": 0.0004883606355462495, "loss": 7.9803, "step": 194 }, { "epoch": 0.024002954209748892, "grad_norm": 1.641990065574646, "learning_rate": 0.0004882990516073409, "loss": 7.5227, "step": 195 }, { "epoch": 0.0241260462826194, "grad_norm": 1.0393086671829224, "learning_rate": 0.00048823746766843206, "loss": 8.7823, "step": 196 }, { "epoch": 0.024249138355489905, "grad_norm": 0.7385181784629822, "learning_rate": 0.00048817588372952335, "loss": 7.4173, "step": 197 }, { "epoch": 0.024372230428360415, "grad_norm": 1.52896249294281, "learning_rate": 0.00048811429979061464, "loss": 7.852, "step": 198 }, { "epoch": 0.02449532250123092, "grad_norm": 1.9660006761550903, "learning_rate": 0.0004880527158517059, "loss": 8.1045, "step": 199 }, { "epoch": 0.024618414574101428, "grad_norm": 1.733223795890808, "learning_rate": 0.00048799113191279716, "loss": 8.6133, "step": 200 }, { "epoch": 0.024741506646971934, "grad_norm": 0.749427318572998, "learning_rate": 0.00048792954797388844, "loss": 8.5738, "step": 201 }, { "epoch": 0.02486459871984244, "grad_norm": 1.4221640825271606, "learning_rate": 0.0004878679640349797, "loss": 7.7612, "step": 202 }, { "epoch": 0.02498769079271295, "grad_norm": 1.1180578470230103, "learning_rate": 0.00048780638009607096, "loss": 9.6108, "step": 203 }, { "epoch": 0.025110782865583457, "grad_norm": 2.120840072631836, "learning_rate": 0.0004877447961571622, "loss": 7.703, "step": 204 }, { "epoch": 0.025233874938453964, "grad_norm": 1.5335320234298706, 
"learning_rate": 0.00048768321221825354, "loss": 7.8722, "step": 205 }, { "epoch": 0.02535696701132447, "grad_norm": 0.514214813709259, "learning_rate": 0.00048762162827934477, "loss": 8.0056, "step": 206 }, { "epoch": 0.025480059084194977, "grad_norm": 1.1976497173309326, "learning_rate": 0.00048756004434043605, "loss": 8.145, "step": 207 }, { "epoch": 0.025603151157065487, "grad_norm": 2.3461978435516357, "learning_rate": 0.0004874984604015273, "loss": 8.4438, "step": 208 }, { "epoch": 0.025726243229935993, "grad_norm": 1.938178539276123, "learning_rate": 0.0004874368764626186, "loss": 7.8262, "step": 209 }, { "epoch": 0.0258493353028065, "grad_norm": 1.4575886726379395, "learning_rate": 0.0004873752925237098, "loss": 8.2198, "step": 210 }, { "epoch": 0.025972427375677006, "grad_norm": 0.2946975529193878, "learning_rate": 0.0004873137085848011, "loss": 8.2865, "step": 211 }, { "epoch": 0.026095519448547513, "grad_norm": 1.7003166675567627, "learning_rate": 0.0004872521246458924, "loss": 7.5067, "step": 212 }, { "epoch": 0.02621861152141802, "grad_norm": 1.6047685146331787, "learning_rate": 0.00048719054070698367, "loss": 7.9109, "step": 213 }, { "epoch": 0.02634170359428853, "grad_norm": 0.6833516359329224, "learning_rate": 0.0004871289567680749, "loss": 9.2132, "step": 214 }, { "epoch": 0.026464795667159036, "grad_norm": 1.0659126043319702, "learning_rate": 0.0004870673728291662, "loss": 7.8962, "step": 215 }, { "epoch": 0.026587887740029542, "grad_norm": 0.39954128861427307, "learning_rate": 0.0004870057888902574, "loss": 7.7583, "step": 216 }, { "epoch": 0.02671097981290005, "grad_norm": 1.401207685470581, "learning_rate": 0.0004869442049513487, "loss": 7.9522, "step": 217 }, { "epoch": 0.026834071885770555, "grad_norm": 1.116788625717163, "learning_rate": 0.00048688262101244, "loss": 7.5548, "step": 218 }, { "epoch": 0.026957163958641065, "grad_norm": 0.255120187997818, "learning_rate": 0.0004868210370735313, "loss": 7.5916, "step": 219 }, { "epoch": 
0.02708025603151157, "grad_norm": 0.6038568615913391, "learning_rate": 0.0004867594531346225, "loss": 8.2011, "step": 220 }, { "epoch": 0.027203348104382078, "grad_norm": 1.206920862197876, "learning_rate": 0.0004866978691957138, "loss": 7.8519, "step": 221 }, { "epoch": 0.027326440177252585, "grad_norm": 1.1075549125671387, "learning_rate": 0.00048663628525680503, "loss": 7.5481, "step": 222 }, { "epoch": 0.02744953225012309, "grad_norm": 0.6261852979660034, "learning_rate": 0.0004865747013178963, "loss": 7.9745, "step": 223 }, { "epoch": 0.027572624322993598, "grad_norm": 0.8952301740646362, "learning_rate": 0.00048651311737898755, "loss": 7.7392, "step": 224 }, { "epoch": 0.027695716395864108, "grad_norm": 1.3804584741592407, "learning_rate": 0.0004864515334400789, "loss": 7.6754, "step": 225 }, { "epoch": 0.027818808468734614, "grad_norm": 0.8678548336029053, "learning_rate": 0.0004863899495011701, "loss": 7.7169, "step": 226 }, { "epoch": 0.02794190054160512, "grad_norm": 0.47208836674690247, "learning_rate": 0.0004863283655622614, "loss": 7.4241, "step": 227 }, { "epoch": 0.028064992614475627, "grad_norm": 0.971363365650177, "learning_rate": 0.00048626678162335264, "loss": 7.4603, "step": 228 }, { "epoch": 0.028188084687346134, "grad_norm": 0.46632614731788635, "learning_rate": 0.0004862051976844439, "loss": 7.6197, "step": 229 }, { "epoch": 0.028311176760216644, "grad_norm": 0.2898561954498291, "learning_rate": 0.00048614361374553516, "loss": 7.4356, "step": 230 }, { "epoch": 0.02843426883308715, "grad_norm": 0.9809649586677551, "learning_rate": 0.00048608202980662644, "loss": 7.9315, "step": 231 }, { "epoch": 0.028557360905957656, "grad_norm": 0.8789273500442505, "learning_rate": 0.00048602044586771773, "loss": 8.1691, "step": 232 }, { "epoch": 0.028680452978828163, "grad_norm": 0.43661728501319885, "learning_rate": 0.000485958861928809, "loss": 7.8623, "step": 233 }, { "epoch": 0.02880354505169867, "grad_norm": 0.5587073564529419, "learning_rate": 
0.00048589727798990025, "loss": 7.8074, "step": 234 }, { "epoch": 0.02892663712456918, "grad_norm": 0.7547456622123718, "learning_rate": 0.00048583569405099154, "loss": 7.5855, "step": 235 }, { "epoch": 0.029049729197439686, "grad_norm": 0.5775458812713623, "learning_rate": 0.00048577411011208277, "loss": 7.8555, "step": 236 }, { "epoch": 0.029172821270310192, "grad_norm": 0.3567148447036743, "learning_rate": 0.000485712526173174, "loss": 7.3506, "step": 237 }, { "epoch": 0.0292959133431807, "grad_norm": 0.35975250601768494, "learning_rate": 0.0004856509422342653, "loss": 7.7163, "step": 238 }, { "epoch": 0.029419005416051205, "grad_norm": 0.3819860816001892, "learning_rate": 0.0004855893582953566, "loss": 7.3888, "step": 239 }, { "epoch": 0.029542097488921712, "grad_norm": 0.5201364755630493, "learning_rate": 0.00048552777435644786, "loss": 8.0555, "step": 240 }, { "epoch": 0.029665189561792222, "grad_norm": 0.7190734148025513, "learning_rate": 0.0004854661904175391, "loss": 8.782, "step": 241 }, { "epoch": 0.02978828163466273, "grad_norm": 0.6297115087509155, "learning_rate": 0.0004854046064786304, "loss": 7.8168, "step": 242 }, { "epoch": 0.029911373707533235, "grad_norm": 0.6847550868988037, "learning_rate": 0.0004853430225397216, "loss": 7.4935, "step": 243 }, { "epoch": 0.03003446578040374, "grad_norm": 0.9390683770179749, "learning_rate": 0.0004852814386008129, "loss": 8.7533, "step": 244 }, { "epoch": 0.030157557853274248, "grad_norm": 0.495339959859848, "learning_rate": 0.00048521985466190413, "loss": 8.0542, "step": 245 }, { "epoch": 0.030280649926144758, "grad_norm": 0.31351810693740845, "learning_rate": 0.0004851582707229955, "loss": 8.2716, "step": 246 }, { "epoch": 0.030403741999015264, "grad_norm": 0.9747763872146606, "learning_rate": 0.0004850966867840867, "loss": 7.8738, "step": 247 }, { "epoch": 0.03052683407188577, "grad_norm": 0.8266571164131165, "learning_rate": 0.000485035102845178, "loss": 7.7608, "step": 248 }, { "epoch": 
0.030649926144756277, "grad_norm": 0.5511370301246643, "learning_rate": 0.0004849735189062692, "loss": 8.3272, "step": 249 }, { "epoch": 0.030773018217626784, "grad_norm": 0.40367957949638367, "learning_rate": 0.0004849119349673605, "loss": 7.3757, "step": 250 }, { "epoch": 0.03089611029049729, "grad_norm": 1.0139446258544922, "learning_rate": 0.00048485035102845174, "loss": 8.1199, "step": 251 }, { "epoch": 0.0310192023633678, "grad_norm": 0.849685549736023, "learning_rate": 0.0004847887670895431, "loss": 7.465, "step": 252 }, { "epoch": 0.031142294436238307, "grad_norm": 0.5569213032722473, "learning_rate": 0.0004847271831506343, "loss": 7.8766, "step": 253 }, { "epoch": 0.03126538650910882, "grad_norm": 0.4826586842536926, "learning_rate": 0.0004846655992117256, "loss": 8.3104, "step": 254 }, { "epoch": 0.03138847858197932, "grad_norm": 1.1445728540420532, "learning_rate": 0.00048460401527281684, "loss": 7.6283, "step": 255 }, { "epoch": 0.03151157065484983, "grad_norm": 1.0860871076583862, "learning_rate": 0.0004845424313339081, "loss": 7.6693, "step": 256 }, { "epoch": 0.031634662727720336, "grad_norm": 0.33310654759407043, "learning_rate": 0.00048448084739499935, "loss": 7.9473, "step": 257 }, { "epoch": 0.03175775480059084, "grad_norm": 0.6745867133140564, "learning_rate": 0.00048441926345609064, "loss": 7.3179, "step": 258 }, { "epoch": 0.03188084687346135, "grad_norm": 1.4749622344970703, "learning_rate": 0.00048435767951718193, "loss": 8.2041, "step": 259 }, { "epoch": 0.032003938946331856, "grad_norm": 1.3992525339126587, "learning_rate": 0.0004842960955782732, "loss": 8.2224, "step": 260 }, { "epoch": 0.03212703101920236, "grad_norm": 0.5247831344604492, "learning_rate": 0.00048423451163936445, "loss": 7.6514, "step": 261 }, { "epoch": 0.03225012309207287, "grad_norm": 0.7900456786155701, "learning_rate": 0.00048417292770045573, "loss": 7.7632, "step": 262 }, { "epoch": 0.032373215164943375, "grad_norm": 0.8934734463691711, "learning_rate": 
0.00048411134376154697, "loss": 7.7083, "step": 263 }, { "epoch": 0.03249630723781388, "grad_norm": 0.9917834401130676, "learning_rate": 0.00048404975982263825, "loss": 7.5924, "step": 264 }, { "epoch": 0.032619399310684395, "grad_norm": 0.33104217052459717, "learning_rate": 0.0004839881758837295, "loss": 7.9701, "step": 265 }, { "epoch": 0.0327424913835549, "grad_norm": 0.5470057725906372, "learning_rate": 0.0004839265919448208, "loss": 7.3749, "step": 266 }, { "epoch": 0.03286558345642541, "grad_norm": 1.0406028032302856, "learning_rate": 0.00048386500800591206, "loss": 8.241, "step": 267 }, { "epoch": 0.032988675529295915, "grad_norm": 0.4318017363548279, "learning_rate": 0.00048380342406700334, "loss": 7.4454, "step": 268 }, { "epoch": 0.03311176760216642, "grad_norm": 0.2971910238265991, "learning_rate": 0.0004837418401280946, "loss": 7.8207, "step": 269 }, { "epoch": 0.03323485967503693, "grad_norm": 0.7013093829154968, "learning_rate": 0.00048368025618918586, "loss": 7.5918, "step": 270 }, { "epoch": 0.033357951747907434, "grad_norm": 0.5134764313697815, "learning_rate": 0.0004836186722502771, "loss": 8.1373, "step": 271 }, { "epoch": 0.03348104382077794, "grad_norm": 0.6355268359184265, "learning_rate": 0.0004835570883113684, "loss": 7.4787, "step": 272 }, { "epoch": 0.03360413589364845, "grad_norm": 0.5621976256370544, "learning_rate": 0.00048349550437245967, "loss": 7.8235, "step": 273 }, { "epoch": 0.033727227966518954, "grad_norm": 0.5104890465736389, "learning_rate": 0.00048343392043355096, "loss": 7.6542, "step": 274 }, { "epoch": 0.03385032003938946, "grad_norm": 0.5982881188392639, "learning_rate": 0.0004833723364946422, "loss": 7.7903, "step": 275 }, { "epoch": 0.033973412112259974, "grad_norm": 0.3248158097267151, "learning_rate": 0.0004833107525557335, "loss": 7.932, "step": 276 }, { "epoch": 0.03409650418513048, "grad_norm": 0.5209702849388123, "learning_rate": 0.0004832491686168247, "loss": 7.7517, "step": 277 }, { "epoch": 0.03421959625800099, 
"grad_norm": 0.4844845235347748, "learning_rate": 0.000483187584677916, "loss": 7.724, "step": 278 }, { "epoch": 0.03434268833087149, "grad_norm": 1.3293578624725342, "learning_rate": 0.0004831260007390073, "loss": 8.8833, "step": 279 }, { "epoch": 0.034465780403742, "grad_norm": 0.280003160238266, "learning_rate": 0.00048306441680009857, "loss": 7.4136, "step": 280 }, { "epoch": 0.034588872476612506, "grad_norm": 0.5604228973388672, "learning_rate": 0.0004830028328611898, "loss": 8.2859, "step": 281 }, { "epoch": 0.03471196454948301, "grad_norm": 0.5561869740486145, "learning_rate": 0.0004829412489222811, "loss": 7.5926, "step": 282 }, { "epoch": 0.03483505662235352, "grad_norm": 0.7577869892120361, "learning_rate": 0.0004828796649833723, "loss": 7.298, "step": 283 }, { "epoch": 0.034958148695224026, "grad_norm": 0.3240264654159546, "learning_rate": 0.0004828180810444636, "loss": 8.061, "step": 284 }, { "epoch": 0.03508124076809453, "grad_norm": 0.39500701427459717, "learning_rate": 0.00048275649710555484, "loss": 7.5866, "step": 285 }, { "epoch": 0.03520433284096504, "grad_norm": 0.7078671455383301, "learning_rate": 0.0004826949131666462, "loss": 7.5944, "step": 286 }, { "epoch": 0.03532742491383555, "grad_norm": 0.42373692989349365, "learning_rate": 0.0004826333292277374, "loss": 7.6896, "step": 287 }, { "epoch": 0.03545051698670606, "grad_norm": 0.6994956135749817, "learning_rate": 0.0004825717452888287, "loss": 7.5668, "step": 288 }, { "epoch": 0.035573609059576565, "grad_norm": 0.6113752722740173, "learning_rate": 0.00048251016134991993, "loss": 7.7836, "step": 289 }, { "epoch": 0.03569670113244707, "grad_norm": 0.5025691390037537, "learning_rate": 0.0004824485774110112, "loss": 7.6319, "step": 290 }, { "epoch": 0.03581979320531758, "grad_norm": 0.5655583143234253, "learning_rate": 0.00048238699347210245, "loss": 7.8227, "step": 291 }, { "epoch": 0.035942885278188084, "grad_norm": 0.5184053182601929, "learning_rate": 0.00048232540953319373, "loss": 7.5832, 
"step": 292 }, { "epoch": 0.03606597735105859, "grad_norm": 0.5597822666168213, "learning_rate": 0.000482263825594285, "loss": 7.4647, "step": 293 }, { "epoch": 0.0361890694239291, "grad_norm": 0.42806652188301086, "learning_rate": 0.0004822022416553763, "loss": 7.7697, "step": 294 }, { "epoch": 0.036312161496799604, "grad_norm": 0.35130617022514343, "learning_rate": 0.00048214065771646754, "loss": 7.9834, "step": 295 }, { "epoch": 0.03643525356967011, "grad_norm": 0.6315492987632751, "learning_rate": 0.0004820790737775588, "loss": 7.7088, "step": 296 }, { "epoch": 0.03655834564254062, "grad_norm": 0.49824273586273193, "learning_rate": 0.00048201748983865006, "loss": 7.528, "step": 297 }, { "epoch": 0.03668143771541113, "grad_norm": 0.3899374306201935, "learning_rate": 0.00048195590589974135, "loss": 7.8743, "step": 298 }, { "epoch": 0.03680452978828164, "grad_norm": 0.6681570410728455, "learning_rate": 0.0004818943219608326, "loss": 7.7622, "step": 299 }, { "epoch": 0.03692762186115214, "grad_norm": 0.4388914108276367, "learning_rate": 0.0004818327380219239, "loss": 7.9651, "step": 300 }, { "epoch": 0.03705071393402265, "grad_norm": 0.5288096070289612, "learning_rate": 0.00048177115408301515, "loss": 7.502, "step": 301 }, { "epoch": 0.037173806006893156, "grad_norm": 0.6654837131500244, "learning_rate": 0.00048170957014410644, "loss": 7.6718, "step": 302 }, { "epoch": 0.03729689807976366, "grad_norm": 0.43736258149147034, "learning_rate": 0.00048164798620519767, "loss": 8.0776, "step": 303 }, { "epoch": 0.03741999015263417, "grad_norm": 0.3925165832042694, "learning_rate": 0.00048158640226628896, "loss": 7.7674, "step": 304 }, { "epoch": 0.037543082225504676, "grad_norm": 0.4725094139575958, "learning_rate": 0.0004815248183273802, "loss": 7.6929, "step": 305 }, { "epoch": 0.03766617429837518, "grad_norm": 0.6214683651924133, "learning_rate": 0.00048146323438847153, "loss": 7.6028, "step": 306 }, { "epoch": 0.03778926637124569, "grad_norm": 0.3635413646697998, 
"learning_rate": 0.00048140165044956276, "loss": 8.1847, "step": 307 }, { "epoch": 0.0379123584441162, "grad_norm": 0.5353173613548279, "learning_rate": 0.00048134006651065405, "loss": 8.6318, "step": 308 }, { "epoch": 0.03803545051698671, "grad_norm": 0.3924868702888489, "learning_rate": 0.0004812784825717453, "loss": 8.1143, "step": 309 }, { "epoch": 0.038158542589857215, "grad_norm": 0.7202489972114563, "learning_rate": 0.00048121689863283657, "loss": 7.5285, "step": 310 }, { "epoch": 0.03828163466272772, "grad_norm": 0.46855440735816956, "learning_rate": 0.0004811553146939278, "loss": 7.529, "step": 311 }, { "epoch": 0.03840472673559823, "grad_norm": 0.4398728013038635, "learning_rate": 0.0004810937307550191, "loss": 7.2515, "step": 312 }, { "epoch": 0.038527818808468735, "grad_norm": 1.2357743978500366, "learning_rate": 0.0004810321468161104, "loss": 8.1559, "step": 313 }, { "epoch": 0.03865091088133924, "grad_norm": 0.7787540555000305, "learning_rate": 0.00048097056287720166, "loss": 7.2298, "step": 314 }, { "epoch": 0.03877400295420975, "grad_norm": 1.7763208150863647, "learning_rate": 0.0004809089789382929, "loss": 9.0416, "step": 315 }, { "epoch": 0.038897095027080254, "grad_norm": 0.4907105565071106, "learning_rate": 0.0004808473949993842, "loss": 7.9332, "step": 316 }, { "epoch": 0.03902018709995076, "grad_norm": 0.7231312990188599, "learning_rate": 0.0004807858110604754, "loss": 8.1533, "step": 317 }, { "epoch": 0.03914327917282127, "grad_norm": 1.0320204496383667, "learning_rate": 0.0004807242271215667, "loss": 7.5991, "step": 318 }, { "epoch": 0.03926637124569178, "grad_norm": 0.9107394814491272, "learning_rate": 0.00048066264318265793, "loss": 7.3794, "step": 319 }, { "epoch": 0.03938946331856229, "grad_norm": 0.42319872975349426, "learning_rate": 0.00048060105924374927, "loss": 7.6422, "step": 320 }, { "epoch": 0.039512555391432794, "grad_norm": 0.8296293020248413, "learning_rate": 0.0004805394753048405, "loss": 7.863, "step": 321 }, { "epoch": 
0.0396356474643033, "grad_norm": 1.1320585012435913, "learning_rate": 0.0004804778913659318, "loss": 7.7213, "step": 322 }, { "epoch": 0.03975873953717381, "grad_norm": 1.1031274795532227, "learning_rate": 0.000480416307427023, "loss": 7.7528, "step": 323 }, { "epoch": 0.03988183161004431, "grad_norm": 0.5428522229194641, "learning_rate": 0.0004803547234881143, "loss": 7.3579, "step": 324 }, { "epoch": 0.04000492368291482, "grad_norm": 0.4160130023956299, "learning_rate": 0.00048029313954920554, "loss": 7.6229, "step": 325 }, { "epoch": 0.040128015755785326, "grad_norm": 0.5282074213027954, "learning_rate": 0.00048023155561029683, "loss": 7.7539, "step": 326 }, { "epoch": 0.04025110782865583, "grad_norm": 0.4684397280216217, "learning_rate": 0.0004801699716713881, "loss": 8.5749, "step": 327 }, { "epoch": 0.04037419990152634, "grad_norm": 0.7501415014266968, "learning_rate": 0.0004801083877324794, "loss": 7.4035, "step": 328 }, { "epoch": 0.040497291974396846, "grad_norm": 0.4649221897125244, "learning_rate": 0.00048004680379357063, "loss": 7.7289, "step": 329 }, { "epoch": 0.04062038404726736, "grad_norm": 0.7912390232086182, "learning_rate": 0.0004799852198546619, "loss": 7.7514, "step": 330 }, { "epoch": 0.040743476120137866, "grad_norm": 0.7938594222068787, "learning_rate": 0.00047992363591575315, "loss": 7.445, "step": 331 }, { "epoch": 0.04086656819300837, "grad_norm": 0.6261814832687378, "learning_rate": 0.00047986205197684444, "loss": 7.7, "step": 332 }, { "epoch": 0.04098966026587888, "grad_norm": 0.26375752687454224, "learning_rate": 0.0004798004680379357, "loss": 7.7014, "step": 333 }, { "epoch": 0.041112752338749385, "grad_norm": 0.5962406992912292, "learning_rate": 0.000479738884099027, "loss": 7.3896, "step": 334 }, { "epoch": 0.04123584441161989, "grad_norm": 0.6550663709640503, "learning_rate": 0.00047967730016011825, "loss": 8.1916, "step": 335 }, { "epoch": 0.0413589364844904, "grad_norm": 0.4812748432159424, "learning_rate": 
0.00047961571622120953, "loss": 7.6521, "step": 336 }, { "epoch": 0.041482028557360905, "grad_norm": 0.6059079170227051, "learning_rate": 0.00047955413228230076, "loss": 7.8022, "step": 337 }, { "epoch": 0.04160512063023141, "grad_norm": 0.8000144362449646, "learning_rate": 0.00047949254834339205, "loss": 7.7587, "step": 338 }, { "epoch": 0.04172821270310192, "grad_norm": 0.7994040250778198, "learning_rate": 0.0004794309644044833, "loss": 8.1225, "step": 339 }, { "epoch": 0.041851304775972424, "grad_norm": 0.4281293749809265, "learning_rate": 0.0004793693804655746, "loss": 7.6654, "step": 340 }, { "epoch": 0.04197439684884294, "grad_norm": 0.37506020069122314, "learning_rate": 0.00047930779652666586, "loss": 8.5062, "step": 341 }, { "epoch": 0.042097488921713444, "grad_norm": 0.7481719851493835, "learning_rate": 0.00047924621258775714, "loss": 7.4745, "step": 342 }, { "epoch": 0.04222058099458395, "grad_norm": 0.37436461448669434, "learning_rate": 0.0004791846286488484, "loss": 7.6525, "step": 343 }, { "epoch": 0.04234367306745446, "grad_norm": 0.2156219184398651, "learning_rate": 0.00047912304470993966, "loss": 7.4468, "step": 344 }, { "epoch": 0.042466765140324964, "grad_norm": 0.3866279721260071, "learning_rate": 0.0004790614607710309, "loss": 7.482, "step": 345 }, { "epoch": 0.04258985721319547, "grad_norm": 1.1048855781555176, "learning_rate": 0.0004789998768321222, "loss": 8.0697, "step": 346 }, { "epoch": 0.042712949286065977, "grad_norm": 0.2409275472164154, "learning_rate": 0.00047893829289321347, "loss": 7.4456, "step": 347 }, { "epoch": 0.04283604135893648, "grad_norm": 0.4019905626773834, "learning_rate": 0.00047887670895430475, "loss": 7.4943, "step": 348 }, { "epoch": 0.04295913343180699, "grad_norm": 0.4027179479598999, "learning_rate": 0.000478815125015396, "loss": 7.5704, "step": 349 }, { "epoch": 0.043082225504677496, "grad_norm": 0.45118850469589233, "learning_rate": 0.0004787535410764873, "loss": 7.8824, "step": 350 }, { "epoch": 
0.043205317577548, "grad_norm": 0.39324140548706055, "learning_rate": 0.0004786919571375785, "loss": 7.9048, "step": 351 }, { "epoch": 0.043328409650418516, "grad_norm": 0.30330365896224976, "learning_rate": 0.0004786303731986698, "loss": 7.3989, "step": 352 }, { "epoch": 0.04345150172328902, "grad_norm": 0.32772931456565857, "learning_rate": 0.000478568789259761, "loss": 7.4173, "step": 353 }, { "epoch": 0.04357459379615953, "grad_norm": 1.0508077144622803, "learning_rate": 0.00047850720532085237, "loss": 8.7626, "step": 354 }, { "epoch": 0.043697685869030035, "grad_norm": 0.3494257926940918, "learning_rate": 0.0004784456213819436, "loss": 7.4943, "step": 355 }, { "epoch": 0.04382077794190054, "grad_norm": 0.5180625915527344, "learning_rate": 0.0004783840374430349, "loss": 8.4429, "step": 356 }, { "epoch": 0.04394387001477105, "grad_norm": 0.5596400499343872, "learning_rate": 0.0004783224535041261, "loss": 8.1144, "step": 357 }, { "epoch": 0.044066962087641555, "grad_norm": 1.1752979755401611, "learning_rate": 0.0004782608695652174, "loss": 7.2496, "step": 358 }, { "epoch": 0.04419005416051206, "grad_norm": 0.5196439623832703, "learning_rate": 0.00047819928562630864, "loss": 7.5186, "step": 359 }, { "epoch": 0.04431314623338257, "grad_norm": 0.7561269402503967, "learning_rate": 0.0004781377016874, "loss": 8.1294, "step": 360 }, { "epoch": 0.044436238306253074, "grad_norm": 1.0005320310592651, "learning_rate": 0.0004780761177484912, "loss": 8.1888, "step": 361 }, { "epoch": 0.04455933037912359, "grad_norm": 0.6500119566917419, "learning_rate": 0.0004780145338095825, "loss": 7.2209, "step": 362 }, { "epoch": 0.044682422451994094, "grad_norm": 0.6755048036575317, "learning_rate": 0.00047795294987067373, "loss": 8.4119, "step": 363 }, { "epoch": 0.0448055145248646, "grad_norm": 0.42433926463127136, "learning_rate": 0.000477891365931765, "loss": 7.5024, "step": 364 }, { "epoch": 0.04492860659773511, "grad_norm": 0.8202070593833923, "learning_rate": 
0.00047782978199285625, "loss": 7.4808, "step": 365 }, { "epoch": 0.045051698670605614, "grad_norm": 0.6912183165550232, "learning_rate": 0.00047776819805394753, "loss": 7.4212, "step": 366 }, { "epoch": 0.04517479074347612, "grad_norm": 0.3156851828098297, "learning_rate": 0.0004777066141150388, "loss": 7.6878, "step": 367 }, { "epoch": 0.04529788281634663, "grad_norm": 0.8174312710762024, "learning_rate": 0.0004776450301761301, "loss": 7.7488, "step": 368 }, { "epoch": 0.04542097488921713, "grad_norm": 0.4010213017463684, "learning_rate": 0.00047758344623722134, "loss": 7.5966, "step": 369 }, { "epoch": 0.04554406696208764, "grad_norm": 0.42809727787971497, "learning_rate": 0.0004775218622983126, "loss": 7.8056, "step": 370 }, { "epoch": 0.045667159034958146, "grad_norm": 0.5404561161994934, "learning_rate": 0.00047746027835940386, "loss": 7.4435, "step": 371 }, { "epoch": 0.04579025110782865, "grad_norm": 1.0359350442886353, "learning_rate": 0.00047739869442049514, "loss": 7.7422, "step": 372 }, { "epoch": 0.045913343180699166, "grad_norm": 0.5314866304397583, "learning_rate": 0.0004773371104815864, "loss": 7.5559, "step": 373 }, { "epoch": 0.04603643525356967, "grad_norm": 0.7220731973648071, "learning_rate": 0.0004772755265426777, "loss": 8.2565, "step": 374 }, { "epoch": 0.04615952732644018, "grad_norm": 0.5539658069610596, "learning_rate": 0.00047721394260376895, "loss": 7.8137, "step": 375 }, { "epoch": 0.046282619399310686, "grad_norm": 0.49949052929878235, "learning_rate": 0.00047715235866486024, "loss": 7.4, "step": 376 }, { "epoch": 0.04640571147218119, "grad_norm": 0.2689746618270874, "learning_rate": 0.00047709077472595147, "loss": 7.4938, "step": 377 }, { "epoch": 0.0465288035450517, "grad_norm": 0.3294529616832733, "learning_rate": 0.00047702919078704276, "loss": 7.7008, "step": 378 }, { "epoch": 0.046651895617922205, "grad_norm": 0.36485108733177185, "learning_rate": 0.000476967606848134, "loss": 7.4508, "step": 379 }, { "epoch": 
0.04677498769079271, "grad_norm": 0.9981898665428162, "learning_rate": 0.0004769060229092253, "loss": 8.6104, "step": 380 }, { "epoch": 0.04689807976366322, "grad_norm": 0.4802452325820923, "learning_rate": 0.00047684443897031656, "loss": 7.14, "step": 381 }, { "epoch": 0.047021171836533725, "grad_norm": 0.31775522232055664, "learning_rate": 0.00047678285503140785, "loss": 7.6493, "step": 382 }, { "epoch": 0.04714426390940423, "grad_norm": 0.6240155100822449, "learning_rate": 0.0004767212710924991, "loss": 8.0742, "step": 383 }, { "epoch": 0.047267355982274745, "grad_norm": 0.35226771235466003, "learning_rate": 0.00047665968715359037, "loss": 7.8809, "step": 384 }, { "epoch": 0.04739044805514525, "grad_norm": 0.5080077648162842, "learning_rate": 0.0004765981032146816, "loss": 8.7614, "step": 385 }, { "epoch": 0.04751354012801576, "grad_norm": 0.3340292274951935, "learning_rate": 0.0004765365192757729, "loss": 8.394, "step": 386 }, { "epoch": 0.047636632200886264, "grad_norm": 0.5907297134399414, "learning_rate": 0.00047647493533686417, "loss": 8.1179, "step": 387 }, { "epoch": 0.04775972427375677, "grad_norm": 0.646808922290802, "learning_rate": 0.00047641335139795546, "loss": 7.7105, "step": 388 }, { "epoch": 0.04788281634662728, "grad_norm": 0.4450981020927429, "learning_rate": 0.0004763517674590467, "loss": 7.6639, "step": 389 }, { "epoch": 0.048005908419497784, "grad_norm": 0.7513502240180969, "learning_rate": 0.000476290183520138, "loss": 7.4554, "step": 390 }, { "epoch": 0.04812900049236829, "grad_norm": 1.2398974895477295, "learning_rate": 0.0004762285995812292, "loss": 8.5605, "step": 391 }, { "epoch": 0.0482520925652388, "grad_norm": 0.4683971405029297, "learning_rate": 0.0004761670156423205, "loss": 7.4983, "step": 392 }, { "epoch": 0.0483751846381093, "grad_norm": 0.3542233407497406, "learning_rate": 0.00047610543170341173, "loss": 7.653, "step": 393 }, { "epoch": 0.04849827671097981, "grad_norm": 0.5674006938934326, "learning_rate": 
0.00047604384776450307, "loss": 7.3166, "step": 394 }, { "epoch": 0.04862136878385032, "grad_norm": 0.5738288760185242, "learning_rate": 0.0004759822638255943, "loss": 7.4763, "step": 395 }, { "epoch": 0.04874446085672083, "grad_norm": 0.26595649123191833, "learning_rate": 0.0004759206798866856, "loss": 7.5798, "step": 396 }, { "epoch": 0.048867552929591336, "grad_norm": 0.4777490496635437, "learning_rate": 0.0004758590959477768, "loss": 7.644, "step": 397 }, { "epoch": 0.04899064500246184, "grad_norm": 0.3459051251411438, "learning_rate": 0.0004757975120088681, "loss": 7.8828, "step": 398 }, { "epoch": 0.04911373707533235, "grad_norm": 0.4868071675300598, "learning_rate": 0.00047573592806995934, "loss": 8.5887, "step": 399 }, { "epoch": 0.049236829148202856, "grad_norm": 0.45203498005867004, "learning_rate": 0.00047567434413105063, "loss": 8.962, "step": 400 }, { "epoch": 0.04935992122107336, "grad_norm": 0.5884661078453064, "learning_rate": 0.0004756127601921419, "loss": 8.4989, "step": 401 }, { "epoch": 0.04948301329394387, "grad_norm": 0.9380660653114319, "learning_rate": 0.0004755511762532332, "loss": 7.805, "step": 402 }, { "epoch": 0.049606105366814375, "grad_norm": 0.8197565078735352, "learning_rate": 0.00047548959231432443, "loss": 7.5513, "step": 403 }, { "epoch": 0.04972919743968488, "grad_norm": 0.34440290927886963, "learning_rate": 0.0004754280083754157, "loss": 7.6706, "step": 404 }, { "epoch": 0.04985228951255539, "grad_norm": 1.1984570026397705, "learning_rate": 0.00047536642443650695, "loss": 8.068, "step": 405 }, { "epoch": 0.0499753815854259, "grad_norm": 1.2253295183181763, "learning_rate": 0.00047530484049759824, "loss": 8.1208, "step": 406 }, { "epoch": 0.05009847365829641, "grad_norm": 0.7848016619682312, "learning_rate": 0.00047524325655868947, "loss": 8.3776, "step": 407 }, { "epoch": 0.050221565731166914, "grad_norm": 0.4425124526023865, "learning_rate": 0.0004751816726197808, "loss": 7.2663, "step": 408 }, { "epoch": 0.05034465780403742, 
"grad_norm": 0.4661727845668793, "learning_rate": 0.00047512008868087204, "loss": 8.3324, "step": 409 }, { "epoch": 0.05046774987690793, "grad_norm": 0.4668848216533661, "learning_rate": 0.00047505850474196333, "loss": 7.716, "step": 410 }, { "epoch": 0.050590841949778434, "grad_norm": 0.3739874064922333, "learning_rate": 0.00047499692080305456, "loss": 7.6389, "step": 411 }, { "epoch": 0.05071393402264894, "grad_norm": 0.28330519795417786, "learning_rate": 0.00047493533686414585, "loss": 7.4642, "step": 412 }, { "epoch": 0.05083702609551945, "grad_norm": 0.8962852954864502, "learning_rate": 0.0004748737529252371, "loss": 8.0283, "step": 413 }, { "epoch": 0.05096011816838995, "grad_norm": 0.5045072436332703, "learning_rate": 0.0004748121689863284, "loss": 7.6087, "step": 414 }, { "epoch": 0.05108321024126046, "grad_norm": 0.5268784761428833, "learning_rate": 0.00047475058504741966, "loss": 8.1209, "step": 415 }, { "epoch": 0.05120630231413097, "grad_norm": 0.35750433802604675, "learning_rate": 0.00047468900110851094, "loss": 7.586, "step": 416 }, { "epoch": 0.05132939438700148, "grad_norm": 0.43777668476104736, "learning_rate": 0.0004746274171696022, "loss": 7.9828, "step": 417 }, { "epoch": 0.051452486459871986, "grad_norm": 0.5562515258789062, "learning_rate": 0.00047456583323069346, "loss": 7.5748, "step": 418 }, { "epoch": 0.05157557853274249, "grad_norm": 0.5644679069519043, "learning_rate": 0.0004745042492917847, "loss": 7.9145, "step": 419 }, { "epoch": 0.051698670605613, "grad_norm": 0.5197036862373352, "learning_rate": 0.000474442665352876, "loss": 7.9928, "step": 420 }, { "epoch": 0.051821762678483506, "grad_norm": 0.3695710599422455, "learning_rate": 0.00047438108141396727, "loss": 7.9679, "step": 421 }, { "epoch": 0.05194485475135401, "grad_norm": 0.3318750262260437, "learning_rate": 0.00047431949747505855, "loss": 7.9456, "step": 422 }, { "epoch": 0.05206794682422452, "grad_norm": 0.4931640326976776, "learning_rate": 0.0004742579135361498, "loss": 
7.5293, "step": 423 }, { "epoch": 0.052191038897095025, "grad_norm": 0.40148425102233887, "learning_rate": 0.00047419632959724107, "loss": 8.1362, "step": 424 }, { "epoch": 0.05231413096996553, "grad_norm": 0.433015912771225, "learning_rate": 0.0004741347456583323, "loss": 7.6068, "step": 425 }, { "epoch": 0.05243722304283604, "grad_norm": 0.2919592559337616, "learning_rate": 0.0004740731617194236, "loss": 7.6168, "step": 426 }, { "epoch": 0.05256031511570655, "grad_norm": 0.6159984469413757, "learning_rate": 0.0004740115777805148, "loss": 8.4416, "step": 427 }, { "epoch": 0.05268340718857706, "grad_norm": 0.46631088852882385, "learning_rate": 0.00047394999384160616, "loss": 7.7918, "step": 428 }, { "epoch": 0.052806499261447565, "grad_norm": 0.6238308548927307, "learning_rate": 0.0004738884099026974, "loss": 7.6023, "step": 429 }, { "epoch": 0.05292959133431807, "grad_norm": 0.3193671405315399, "learning_rate": 0.0004738268259637887, "loss": 7.5798, "step": 430 }, { "epoch": 0.05305268340718858, "grad_norm": 0.8802396059036255, "learning_rate": 0.0004737652420248799, "loss": 8.3553, "step": 431 }, { "epoch": 0.053175775480059084, "grad_norm": 0.7648723721504211, "learning_rate": 0.0004737036580859712, "loss": 7.6113, "step": 432 }, { "epoch": 0.05329886755292959, "grad_norm": 0.5989680886268616, "learning_rate": 0.00047364207414706243, "loss": 7.6193, "step": 433 }, { "epoch": 0.0534219596258001, "grad_norm": 0.46868836879730225, "learning_rate": 0.0004735804902081537, "loss": 7.474, "step": 434 }, { "epoch": 0.053545051698670604, "grad_norm": 0.6513462066650391, "learning_rate": 0.000473518906269245, "loss": 7.5113, "step": 435 }, { "epoch": 0.05366814377154111, "grad_norm": 0.5229458808898926, "learning_rate": 0.0004734573223303363, "loss": 8.1237, "step": 436 }, { "epoch": 0.05379123584441162, "grad_norm": 1.2272813320159912, "learning_rate": 0.0004733957383914275, "loss": 9.4157, "step": 437 }, { "epoch": 0.05391432791728213, "grad_norm": 0.31113871932029724, 
"learning_rate": 0.0004733341544525188, "loss": 7.7539, "step": 438 }, { "epoch": 0.05403741999015264, "grad_norm": 0.28036051988601685, "learning_rate": 0.00047327257051361005, "loss": 7.389, "step": 439 }, { "epoch": 0.05416051206302314, "grad_norm": 0.8118959665298462, "learning_rate": 0.00047321098657470133, "loss": 8.3128, "step": 440 }, { "epoch": 0.05428360413589365, "grad_norm": 1.0279942750930786, "learning_rate": 0.00047314940263579257, "loss": 9.1734, "step": 441 }, { "epoch": 0.054406696208764156, "grad_norm": 0.5421402454376221, "learning_rate": 0.0004730878186968839, "loss": 7.3648, "step": 442 }, { "epoch": 0.05452978828163466, "grad_norm": 0.27979353070259094, "learning_rate": 0.00047302623475797514, "loss": 7.8665, "step": 443 }, { "epoch": 0.05465288035450517, "grad_norm": 0.28642627596855164, "learning_rate": 0.0004729646508190664, "loss": 7.8376, "step": 444 }, { "epoch": 0.054775972427375676, "grad_norm": 0.7685189843177795, "learning_rate": 0.00047290306688015766, "loss": 8.5688, "step": 445 }, { "epoch": 0.05489906450024618, "grad_norm": 0.23107019066810608, "learning_rate": 0.00047284148294124894, "loss": 7.503, "step": 446 }, { "epoch": 0.05502215657311669, "grad_norm": 0.4042167663574219, "learning_rate": 0.0004727798990023402, "loss": 7.6475, "step": 447 }, { "epoch": 0.055145248645987195, "grad_norm": 0.7872878909111023, "learning_rate": 0.0004727183150634315, "loss": 9.1888, "step": 448 }, { "epoch": 0.05526834071885771, "grad_norm": 0.5653325915336609, "learning_rate": 0.00047265673112452275, "loss": 7.7297, "step": 449 }, { "epoch": 0.055391432791728215, "grad_norm": 0.5535159111022949, "learning_rate": 0.00047259514718561404, "loss": 7.8175, "step": 450 }, { "epoch": 0.05551452486459872, "grad_norm": 0.3953549265861511, "learning_rate": 0.00047253356324670527, "loss": 8.6165, "step": 451 }, { "epoch": 0.05563761693746923, "grad_norm": 0.4145948588848114, "learning_rate": 0.00047247197930779655, "loss": 7.458, "step": 452 }, { 
"epoch": 0.055760709010339735, "grad_norm": 0.3409458100795746, "learning_rate": 0.0004724103953688878, "loss": 7.4054, "step": 453 }, { "epoch": 0.05588380108321024, "grad_norm": 0.5042685270309448, "learning_rate": 0.0004723488114299791, "loss": 7.789, "step": 454 }, { "epoch": 0.05600689315608075, "grad_norm": 0.8563180565834045, "learning_rate": 0.00047228722749107036, "loss": 8.0985, "step": 455 }, { "epoch": 0.056129985228951254, "grad_norm": 0.4350353181362152, "learning_rate": 0.00047222564355216165, "loss": 7.7113, "step": 456 }, { "epoch": 0.05625307730182176, "grad_norm": 0.29980939626693726, "learning_rate": 0.0004721640596132529, "loss": 8.1132, "step": 457 }, { "epoch": 0.05637616937469227, "grad_norm": 0.8152088522911072, "learning_rate": 0.00047210247567434417, "loss": 7.7025, "step": 458 }, { "epoch": 0.056499261447562774, "grad_norm": 0.7342106103897095, "learning_rate": 0.0004720408917354354, "loss": 7.7772, "step": 459 }, { "epoch": 0.05662235352043329, "grad_norm": 0.5307797789573669, "learning_rate": 0.0004719793077965267, "loss": 7.5177, "step": 460 }, { "epoch": 0.056745445593303794, "grad_norm": 0.26388755440711975, "learning_rate": 0.0004719177238576179, "loss": 7.7158, "step": 461 }, { "epoch": 0.0568685376661743, "grad_norm": 0.37353307008743286, "learning_rate": 0.00047185613991870926, "loss": 7.5353, "step": 462 }, { "epoch": 0.056991629739044807, "grad_norm": 0.8779540657997131, "learning_rate": 0.0004717945559798005, "loss": 8.0821, "step": 463 }, { "epoch": 0.05711472181191531, "grad_norm": 0.4760213792324066, "learning_rate": 0.0004717329720408918, "loss": 7.8556, "step": 464 }, { "epoch": 0.05723781388478582, "grad_norm": 0.4092307388782501, "learning_rate": 0.000471671388101983, "loss": 7.4424, "step": 465 }, { "epoch": 0.057360905957656326, "grad_norm": 0.6796973347663879, "learning_rate": 0.0004716098041630743, "loss": 7.4294, "step": 466 }, { "epoch": 0.05748399803052683, "grad_norm": 0.38378846645355225, "learning_rate": 
0.00047154822022416553, "loss": 8.2068, "step": 467 }, { "epoch": 0.05760709010339734, "grad_norm": 0.3309083580970764, "learning_rate": 0.00047148663628525687, "loss": 7.9224, "step": 468 }, { "epoch": 0.057730182176267845, "grad_norm": 0.4546399414539337, "learning_rate": 0.0004714250523463481, "loss": 7.5996, "step": 469 }, { "epoch": 0.05785327424913836, "grad_norm": 0.7223706245422363, "learning_rate": 0.00047136346840743933, "loss": 8.2762, "step": 470 }, { "epoch": 0.057976366322008865, "grad_norm": 0.23591729998588562, "learning_rate": 0.0004713018844685306, "loss": 7.4476, "step": 471 }, { "epoch": 0.05809945839487937, "grad_norm": 0.29082417488098145, "learning_rate": 0.00047124030052962185, "loss": 7.8341, "step": 472 }, { "epoch": 0.05822255046774988, "grad_norm": 0.30466869473457336, "learning_rate": 0.00047117871659071314, "loss": 7.6438, "step": 473 }, { "epoch": 0.058345642540620385, "grad_norm": 0.30324316024780273, "learning_rate": 0.00047111713265180437, "loss": 7.5106, "step": 474 }, { "epoch": 0.05846873461349089, "grad_norm": 1.1733297109603882, "learning_rate": 0.0004710555487128957, "loss": 8.7099, "step": 475 }, { "epoch": 0.0585918266863614, "grad_norm": 0.47736358642578125, "learning_rate": 0.00047099396477398695, "loss": 7.4428, "step": 476 }, { "epoch": 0.058714918759231904, "grad_norm": 1.500143051147461, "learning_rate": 0.00047093238083507823, "loss": 9.9656, "step": 477 }, { "epoch": 0.05883801083210241, "grad_norm": 0.5462232232093811, "learning_rate": 0.00047087079689616946, "loss": 7.3284, "step": 478 }, { "epoch": 0.05896110290497292, "grad_norm": 0.3649827241897583, "learning_rate": 0.00047080921295726075, "loss": 7.8216, "step": 479 }, { "epoch": 0.059084194977843424, "grad_norm": 0.5668533444404602, "learning_rate": 0.000470747629018352, "loss": 7.7585, "step": 480 }, { "epoch": 0.05920728705071394, "grad_norm": 0.27960389852523804, "learning_rate": 0.00047068604507944327, "loss": 7.7233, "step": 481 }, { "epoch": 
0.059330379123584444, "grad_norm": 0.2789870500564575, "learning_rate": 0.00047062446114053456, "loss": 7.3816, "step": 482 }, { "epoch": 0.05945347119645495, "grad_norm": 0.6823089718818665, "learning_rate": 0.00047056287720162584, "loss": 7.7521, "step": 483 }, { "epoch": 0.05957656326932546, "grad_norm": 0.45745909214019775, "learning_rate": 0.0004705012932627171, "loss": 7.6322, "step": 484 }, { "epoch": 0.05969965534219596, "grad_norm": 0.24471120536327362, "learning_rate": 0.00047043970932380836, "loss": 7.3742, "step": 485 }, { "epoch": 0.05982274741506647, "grad_norm": 0.9006953239440918, "learning_rate": 0.0004703781253848996, "loss": 9.0276, "step": 486 }, { "epoch": 0.059945839487936976, "grad_norm": 0.46821409463882446, "learning_rate": 0.0004703165414459909, "loss": 7.5964, "step": 487 }, { "epoch": 0.06006893156080748, "grad_norm": 0.27841198444366455, "learning_rate": 0.0004702549575070821, "loss": 8.0911, "step": 488 }, { "epoch": 0.06019202363367799, "grad_norm": 0.43933579325675964, "learning_rate": 0.00047019337356817345, "loss": 7.5327, "step": 489 }, { "epoch": 0.060315115706548496, "grad_norm": 0.38255012035369873, "learning_rate": 0.0004701317896292647, "loss": 7.5393, "step": 490 }, { "epoch": 0.060438207779419, "grad_norm": 0.4419342577457428, "learning_rate": 0.000470070205690356, "loss": 7.4021, "step": 491 }, { "epoch": 0.060561299852289516, "grad_norm": 0.8487022519111633, "learning_rate": 0.0004700086217514472, "loss": 8.4302, "step": 492 }, { "epoch": 0.06068439192516002, "grad_norm": 0.33099591732025146, "learning_rate": 0.0004699470378125385, "loss": 8.3837, "step": 493 }, { "epoch": 0.06080748399803053, "grad_norm": 0.49656444787979126, "learning_rate": 0.0004698854538736297, "loss": 7.7085, "step": 494 }, { "epoch": 0.060930576070901035, "grad_norm": 0.7481573224067688, "learning_rate": 0.000469823869934721, "loss": 7.5505, "step": 495 }, { "epoch": 0.06105366814377154, "grad_norm": 0.37397071719169617, "learning_rate": 
0.0004697622859958123, "loss": 7.8597, "step": 496 }, { "epoch": 0.06117676021664205, "grad_norm": 0.4182277023792267, "learning_rate": 0.0004697007020569036, "loss": 8.0967, "step": 497 }, { "epoch": 0.061299852289512555, "grad_norm": 0.34149694442749023, "learning_rate": 0.0004696391181179948, "loss": 7.5533, "step": 498 }, { "epoch": 0.06142294436238306, "grad_norm": 0.6487600207328796, "learning_rate": 0.0004695775341790861, "loss": 7.7158, "step": 499 }, { "epoch": 0.06154603643525357, "grad_norm": 2.0499024391174316, "learning_rate": 0.00046951595024017734, "loss": 10.06, "step": 500 }, { "epoch": 0.061669128508124074, "grad_norm": 0.18486887216567993, "learning_rate": 0.0004694543663012686, "loss": 7.4494, "step": 501 }, { "epoch": 0.06179222058099458, "grad_norm": 0.5259673595428467, "learning_rate": 0.0004693927823623599, "loss": 7.6678, "step": 502 }, { "epoch": 0.061915312653865094, "grad_norm": 0.38699421286582947, "learning_rate": 0.0004693311984234512, "loss": 7.7694, "step": 503 }, { "epoch": 0.0620384047267356, "grad_norm": 0.7580873370170593, "learning_rate": 0.00046926961448454243, "loss": 9.4577, "step": 504 }, { "epoch": 0.06216149679960611, "grad_norm": 0.4401310682296753, "learning_rate": 0.0004692080305456337, "loss": 7.9707, "step": 505 }, { "epoch": 0.062284588872476614, "grad_norm": 0.4708208441734314, "learning_rate": 0.00046914644660672495, "loss": 8.0478, "step": 506 }, { "epoch": 0.06240768094534712, "grad_norm": 0.2567194402217865, "learning_rate": 0.00046908486266781623, "loss": 7.3474, "step": 507 }, { "epoch": 0.06253077301821763, "grad_norm": 1.2998569011688232, "learning_rate": 0.00046902327872890747, "loss": 8.8729, "step": 508 }, { "epoch": 0.06265386509108814, "grad_norm": 0.5110471248626709, "learning_rate": 0.0004689616947899988, "loss": 8.0215, "step": 509 }, { "epoch": 0.06277695716395865, "grad_norm": 0.34356972575187683, "learning_rate": 0.00046890011085109004, "loss": 7.6808, "step": 510 }, { "epoch": 
0.06290004923682915, "grad_norm": 0.29698529839515686, "learning_rate": 0.0004688385269121813, "loss": 8.1664, "step": 511 }, { "epoch": 0.06302314130969966, "grad_norm": 0.3999924063682556, "learning_rate": 0.00046877694297327256, "loss": 8.4944, "step": 512 }, { "epoch": 0.06314623338257017, "grad_norm": 0.9707795977592468, "learning_rate": 0.00046871535903436384, "loss": 7.5601, "step": 513 }, { "epoch": 0.06326932545544067, "grad_norm": 0.7683845162391663, "learning_rate": 0.0004686537750954551, "loss": 7.9149, "step": 514 }, { "epoch": 0.06339241752831118, "grad_norm": 1.399021029472351, "learning_rate": 0.00046859219115654636, "loss": 9.895, "step": 515 }, { "epoch": 0.06351550960118169, "grad_norm": 0.4193505346775055, "learning_rate": 0.00046853060721763765, "loss": 8.5021, "step": 516 }, { "epoch": 0.06363860167405219, "grad_norm": 0.2573324739933014, "learning_rate": 0.00046846902327872894, "loss": 8.0534, "step": 517 }, { "epoch": 0.0637616937469227, "grad_norm": 0.4352104067802429, "learning_rate": 0.00046840743933982017, "loss": 7.7439, "step": 518 }, { "epoch": 0.0638847858197932, "grad_norm": 0.2454584389925003, "learning_rate": 0.00046834585540091146, "loss": 7.7156, "step": 519 }, { "epoch": 0.06400787789266371, "grad_norm": 0.4299732446670532, "learning_rate": 0.0004682842714620027, "loss": 7.5598, "step": 520 }, { "epoch": 0.06413096996553422, "grad_norm": 0.35441356897354126, "learning_rate": 0.000468222687523094, "loss": 7.4754, "step": 521 }, { "epoch": 0.06425406203840472, "grad_norm": 0.3189389407634735, "learning_rate": 0.0004681611035841852, "loss": 7.4883, "step": 522 }, { "epoch": 0.06437715411127523, "grad_norm": 0.32929837703704834, "learning_rate": 0.00046809951964527655, "loss": 7.9086, "step": 523 }, { "epoch": 0.06450024618414574, "grad_norm": 0.29893749952316284, "learning_rate": 0.0004680379357063678, "loss": 7.5446, "step": 524 }, { "epoch": 0.06462333825701624, "grad_norm": 0.26884329319000244, "learning_rate": 
0.00046797635176745907, "loss": 7.6583, "step": 525 }, { "epoch": 0.06474643032988675, "grad_norm": 0.3886944055557251, "learning_rate": 0.0004679147678285503, "loss": 7.5105, "step": 526 }, { "epoch": 0.06486952240275726, "grad_norm": 0.4439445436000824, "learning_rate": 0.0004678531838896416, "loss": 7.683, "step": 527 }, { "epoch": 0.06499261447562776, "grad_norm": 0.34466513991355896, "learning_rate": 0.0004677915999507328, "loss": 7.8102, "step": 528 }, { "epoch": 0.06511570654849827, "grad_norm": 0.6455214619636536, "learning_rate": 0.00046773001601182416, "loss": 7.9344, "step": 529 }, { "epoch": 0.06523879862136879, "grad_norm": 0.5176227688789368, "learning_rate": 0.0004676684320729154, "loss": 7.9482, "step": 530 }, { "epoch": 0.0653618906942393, "grad_norm": 0.23740005493164062, "learning_rate": 0.0004676068481340067, "loss": 7.7147, "step": 531 }, { "epoch": 0.0654849827671098, "grad_norm": 0.5810745358467102, "learning_rate": 0.0004675452641950979, "loss": 7.5342, "step": 532 }, { "epoch": 0.06560807483998031, "grad_norm": 0.4654946029186249, "learning_rate": 0.0004674836802561892, "loss": 7.5675, "step": 533 }, { "epoch": 0.06573116691285082, "grad_norm": 0.31830352544784546, "learning_rate": 0.00046742209631728043, "loss": 7.6571, "step": 534 }, { "epoch": 0.06585425898572132, "grad_norm": 0.6225033402442932, "learning_rate": 0.0004673605123783717, "loss": 7.7369, "step": 535 }, { "epoch": 0.06597735105859183, "grad_norm": 1.0393290519714355, "learning_rate": 0.000467298928439463, "loss": 8.4573, "step": 536 }, { "epoch": 0.06610044313146234, "grad_norm": 0.30523431301116943, "learning_rate": 0.0004672373445005543, "loss": 7.4443, "step": 537 }, { "epoch": 0.06622353520433284, "grad_norm": 0.21377842128276825, "learning_rate": 0.0004671757605616455, "loss": 7.7287, "step": 538 }, { "epoch": 0.06634662727720335, "grad_norm": 0.4967181980609894, "learning_rate": 0.0004671141766227368, "loss": 7.452, "step": 539 }, { "epoch": 0.06646971935007386, 
"grad_norm": 0.5772116780281067, "learning_rate": 0.00046705259268382804, "loss": 7.6228, "step": 540 }, { "epoch": 0.06659281142294436, "grad_norm": 0.4721943438053131, "learning_rate": 0.00046699100874491933, "loss": 7.9103, "step": 541 }, { "epoch": 0.06671590349581487, "grad_norm": 1.5918265581130981, "learning_rate": 0.00046692942480601056, "loss": 10.3496, "step": 542 }, { "epoch": 0.06683899556868537, "grad_norm": 0.38969939947128296, "learning_rate": 0.0004668678408671019, "loss": 7.7798, "step": 543 }, { "epoch": 0.06696208764155588, "grad_norm": 0.33267951011657715, "learning_rate": 0.00046680625692819313, "loss": 7.5967, "step": 544 }, { "epoch": 0.06708517971442639, "grad_norm": 0.28528666496276855, "learning_rate": 0.0004667446729892844, "loss": 8.2273, "step": 545 }, { "epoch": 0.0672082717872969, "grad_norm": 0.31125307083129883, "learning_rate": 0.00046668308905037565, "loss": 7.812, "step": 546 }, { "epoch": 0.0673313638601674, "grad_norm": 0.5697647929191589, "learning_rate": 0.00046662150511146694, "loss": 7.2263, "step": 547 }, { "epoch": 0.06745445593303791, "grad_norm": 0.28070056438446045, "learning_rate": 0.00046655992117255817, "loss": 7.5637, "step": 548 }, { "epoch": 0.06757754800590841, "grad_norm": 0.6477612257003784, "learning_rate": 0.00046649833723364946, "loss": 7.8832, "step": 549 }, { "epoch": 0.06770064007877892, "grad_norm": 0.6121154427528381, "learning_rate": 0.00046643675329474074, "loss": 7.9879, "step": 550 }, { "epoch": 0.06782373215164944, "grad_norm": 0.2539771497249603, "learning_rate": 0.00046637516935583203, "loss": 7.5691, "step": 551 }, { "epoch": 0.06794682422451995, "grad_norm": 0.3409113585948944, "learning_rate": 0.00046631358541692326, "loss": 7.7483, "step": 552 }, { "epoch": 0.06806991629739045, "grad_norm": 0.3335685729980469, "learning_rate": 0.00046625200147801455, "loss": 8.51, "step": 553 }, { "epoch": 0.06819300837026096, "grad_norm": 0.47053229808807373, "learning_rate": 0.0004661904175391058, "loss": 
7.419, "step": 554 }, { "epoch": 0.06831610044313147, "grad_norm": 0.5002686977386475, "learning_rate": 0.00046612883360019707, "loss": 8.3754, "step": 555 }, { "epoch": 0.06843919251600197, "grad_norm": 0.39415642619132996, "learning_rate": 0.0004660672496612883, "loss": 7.5416, "step": 556 }, { "epoch": 0.06856228458887248, "grad_norm": 0.2879919111728668, "learning_rate": 0.00046600566572237964, "loss": 7.3626, "step": 557 }, { "epoch": 0.06868537666174299, "grad_norm": 0.36448127031326294, "learning_rate": 0.0004659440817834709, "loss": 7.5159, "step": 558 }, { "epoch": 0.06880846873461349, "grad_norm": 0.4740031659603119, "learning_rate": 0.00046588249784456216, "loss": 8.0764, "step": 559 }, { "epoch": 0.068931560807484, "grad_norm": 0.33838653564453125, "learning_rate": 0.0004658209139056534, "loss": 7.4257, "step": 560 }, { "epoch": 0.0690546528803545, "grad_norm": 0.36808648705482483, "learning_rate": 0.0004657593299667447, "loss": 8.2807, "step": 561 }, { "epoch": 0.06917774495322501, "grad_norm": 0.2838933765888214, "learning_rate": 0.0004656977460278359, "loss": 7.8147, "step": 562 }, { "epoch": 0.06930083702609552, "grad_norm": 0.6201586723327637, "learning_rate": 0.00046563616208892725, "loss": 7.5215, "step": 563 }, { "epoch": 0.06942392909896603, "grad_norm": 0.32492050528526306, "learning_rate": 0.0004655745781500185, "loss": 7.563, "step": 564 }, { "epoch": 0.06954702117183653, "grad_norm": 0.6371105313301086, "learning_rate": 0.00046551299421110977, "loss": 7.6973, "step": 565 }, { "epoch": 0.06967011324470704, "grad_norm": 1.0042698383331299, "learning_rate": 0.000465451410272201, "loss": 8.0928, "step": 566 }, { "epoch": 0.06979320531757754, "grad_norm": 0.35701891779899597, "learning_rate": 0.0004653898263332923, "loss": 7.2664, "step": 567 }, { "epoch": 0.06991629739044805, "grad_norm": 0.23754729330539703, "learning_rate": 0.0004653282423943835, "loss": 7.5262, "step": 568 }, { "epoch": 0.07003938946331856, "grad_norm": 0.3480207622051239, 
"learning_rate": 0.0004652666584554748, "loss": 7.6601, "step": 569 }, { "epoch": 0.07016248153618906, "grad_norm": 0.6436832547187805, "learning_rate": 0.0004652050745165661, "loss": 7.0744, "step": 570 }, { "epoch": 0.07028557360905957, "grad_norm": 0.8647512793540955, "learning_rate": 0.0004651434905776574, "loss": 8.7064, "step": 571 }, { "epoch": 0.07040866568193008, "grad_norm": 0.37596333026885986, "learning_rate": 0.0004650819066387486, "loss": 7.5977, "step": 572 }, { "epoch": 0.0705317577548006, "grad_norm": 0.3627234995365143, "learning_rate": 0.0004650203226998399, "loss": 7.2081, "step": 573 }, { "epoch": 0.0706548498276711, "grad_norm": 0.5254992842674255, "learning_rate": 0.00046495873876093113, "loss": 8.0076, "step": 574 }, { "epoch": 0.07077794190054161, "grad_norm": 0.3704444169998169, "learning_rate": 0.0004648971548220224, "loss": 7.3505, "step": 575 }, { "epoch": 0.07090103397341212, "grad_norm": 1.0429104566574097, "learning_rate": 0.00046483557088311365, "loss": 9.5257, "step": 576 }, { "epoch": 0.07102412604628262, "grad_norm": 0.40340960025787354, "learning_rate": 0.000464773986944205, "loss": 8.3188, "step": 577 }, { "epoch": 0.07114721811915313, "grad_norm": 1.3751311302185059, "learning_rate": 0.0004647124030052962, "loss": 7.1171, "step": 578 }, { "epoch": 0.07127031019202364, "grad_norm": 0.7836093306541443, "learning_rate": 0.0004646508190663875, "loss": 7.4209, "step": 579 }, { "epoch": 0.07139340226489414, "grad_norm": 0.5707764625549316, "learning_rate": 0.00046458923512747875, "loss": 7.4342, "step": 580 }, { "epoch": 0.07151649433776465, "grad_norm": 0.4732872247695923, "learning_rate": 0.00046452765118857003, "loss": 7.7206, "step": 581 }, { "epoch": 0.07163958641063516, "grad_norm": 0.3478972613811493, "learning_rate": 0.00046446606724966127, "loss": 7.558, "step": 582 }, { "epoch": 0.07176267848350566, "grad_norm": 0.6957452297210693, "learning_rate": 0.0004644044833107526, "loss": 8.1208, "step": 583 }, { "epoch": 
0.07188577055637617, "grad_norm": 0.3007359206676483, "learning_rate": 0.00046434289937184384, "loss": 7.3467, "step": 584 }, { "epoch": 0.07200886262924668, "grad_norm": 0.600166380405426, "learning_rate": 0.0004642813154329351, "loss": 8.2409, "step": 585 }, { "epoch": 0.07213195470211718, "grad_norm": 0.4016733467578888, "learning_rate": 0.00046421973149402636, "loss": 8.1313, "step": 586 }, { "epoch": 0.07225504677498769, "grad_norm": 0.584907054901123, "learning_rate": 0.00046415814755511764, "loss": 8.8486, "step": 587 }, { "epoch": 0.0723781388478582, "grad_norm": 0.483656644821167, "learning_rate": 0.0004640965636162089, "loss": 7.9042, "step": 588 }, { "epoch": 0.0725012309207287, "grad_norm": 0.40756744146347046, "learning_rate": 0.00046403497967730016, "loss": 8.8783, "step": 589 }, { "epoch": 0.07262432299359921, "grad_norm": 0.6880820393562317, "learning_rate": 0.00046397339573839145, "loss": 7.4915, "step": 590 }, { "epoch": 0.07274741506646971, "grad_norm": 0.48929598927497864, "learning_rate": 0.00046391181179948274, "loss": 7.9418, "step": 591 }, { "epoch": 0.07287050713934022, "grad_norm": 0.42976993322372437, "learning_rate": 0.00046385022786057397, "loss": 7.753, "step": 592 }, { "epoch": 0.07299359921221073, "grad_norm": 0.5301759243011475, "learning_rate": 0.00046378864392166525, "loss": 7.7889, "step": 593 }, { "epoch": 0.07311669128508123, "grad_norm": 0.5863567590713501, "learning_rate": 0.0004637270599827565, "loss": 8.4828, "step": 594 }, { "epoch": 0.07323978335795175, "grad_norm": 0.44650235772132874, "learning_rate": 0.0004636654760438478, "loss": 7.4786, "step": 595 }, { "epoch": 0.07336287543082226, "grad_norm": 0.47883233428001404, "learning_rate": 0.000463603892104939, "loss": 7.6694, "step": 596 }, { "epoch": 0.07348596750369277, "grad_norm": 0.24481579661369324, "learning_rate": 0.00046354230816603035, "loss": 7.8882, "step": 597 }, { "epoch": 0.07360905957656327, "grad_norm": 0.2819245159626007, "learning_rate": 
0.0004634807242271216, "loss": 7.7064, "step": 598 }, { "epoch": 0.07373215164943378, "grad_norm": 0.236419215798378, "learning_rate": 0.00046341914028821287, "loss": 7.5773, "step": 599 }, { "epoch": 0.07385524372230429, "grad_norm": 0.6090278625488281, "learning_rate": 0.0004633575563493041, "loss": 8.1493, "step": 600 }, { "epoch": 0.0739783357951748, "grad_norm": 0.3171910047531128, "learning_rate": 0.0004632959724103954, "loss": 7.7386, "step": 601 }, { "epoch": 0.0741014278680453, "grad_norm": 0.3363146483898163, "learning_rate": 0.0004632343884714866, "loss": 7.7494, "step": 602 }, { "epoch": 0.0742245199409158, "grad_norm": 0.5068879127502441, "learning_rate": 0.0004631728045325779, "loss": 7.4532, "step": 603 }, { "epoch": 0.07434761201378631, "grad_norm": 0.21819807589054108, "learning_rate": 0.0004631112205936692, "loss": 7.6363, "step": 604 }, { "epoch": 0.07447070408665682, "grad_norm": 0.32754021883010864, "learning_rate": 0.0004630496366547605, "loss": 7.5422, "step": 605 }, { "epoch": 0.07459379615952733, "grad_norm": 0.4479741156101227, "learning_rate": 0.0004629880527158517, "loss": 7.7197, "step": 606 }, { "epoch": 0.07471688823239783, "grad_norm": 0.4705367982387543, "learning_rate": 0.000462926468776943, "loss": 8.0884, "step": 607 }, { "epoch": 0.07483998030526834, "grad_norm": 0.5308294296264648, "learning_rate": 0.00046286488483803423, "loss": 7.4511, "step": 608 }, { "epoch": 0.07496307237813885, "grad_norm": 0.9123960733413696, "learning_rate": 0.0004628033008991255, "loss": 7.4116, "step": 609 }, { "epoch": 0.07508616445100935, "grad_norm": 0.5676689147949219, "learning_rate": 0.00046274171696021675, "loss": 8.3899, "step": 610 }, { "epoch": 0.07520925652387986, "grad_norm": 0.5030512809753418, "learning_rate": 0.0004626801330213081, "loss": 8.3462, "step": 611 }, { "epoch": 0.07533234859675036, "grad_norm": 0.4742419719696045, "learning_rate": 0.0004626185490823993, "loss": 7.7519, "step": 612 }, { "epoch": 0.07545544066962087, 
"grad_norm": 0.465253084897995, "learning_rate": 0.0004625569651434906, "loss": 7.8386, "step": 613 }, { "epoch": 0.07557853274249138, "grad_norm": 0.4528944790363312, "learning_rate": 0.00046249538120458184, "loss": 7.6424, "step": 614 }, { "epoch": 0.07570162481536188, "grad_norm": 0.4394485354423523, "learning_rate": 0.0004624337972656731, "loss": 7.6861, "step": 615 }, { "epoch": 0.0758247168882324, "grad_norm": 0.26038655638694763, "learning_rate": 0.00046237221332676436, "loss": 7.3293, "step": 616 }, { "epoch": 0.07594780896110291, "grad_norm": 0.28609272837638855, "learning_rate": 0.0004623106293878557, "loss": 7.3727, "step": 617 }, { "epoch": 0.07607090103397342, "grad_norm": 0.7581633925437927, "learning_rate": 0.00046224904544894693, "loss": 8.3728, "step": 618 }, { "epoch": 0.07619399310684392, "grad_norm": 0.4440300762653351, "learning_rate": 0.0004621874615100382, "loss": 7.4334, "step": 619 }, { "epoch": 0.07631708517971443, "grad_norm": 0.6041553616523743, "learning_rate": 0.00046212587757112945, "loss": 8.9418, "step": 620 }, { "epoch": 0.07644017725258494, "grad_norm": 0.5109373927116394, "learning_rate": 0.00046206429363222074, "loss": 7.7129, "step": 621 }, { "epoch": 0.07656326932545544, "grad_norm": 0.38103240728378296, "learning_rate": 0.00046200270969331197, "loss": 7.9102, "step": 622 }, { "epoch": 0.07668636139832595, "grad_norm": 0.32101839780807495, "learning_rate": 0.00046194112575440326, "loss": 7.8919, "step": 623 }, { "epoch": 0.07680945347119646, "grad_norm": 0.332836776971817, "learning_rate": 0.00046187954181549454, "loss": 7.428, "step": 624 }, { "epoch": 0.07693254554406696, "grad_norm": 0.6630201935768127, "learning_rate": 0.00046181795787658583, "loss": 7.5014, "step": 625 }, { "epoch": 0.07705563761693747, "grad_norm": 0.7625936269760132, "learning_rate": 0.00046175637393767706, "loss": 7.4771, "step": 626 }, { "epoch": 0.07717872968980798, "grad_norm": 0.7760499119758606, "learning_rate": 0.00046169478999876835, "loss": 
7.8721, "step": 627 }, { "epoch": 0.07730182176267848, "grad_norm": 0.5199168920516968, "learning_rate": 0.0004616332060598596, "loss": 7.955, "step": 628 }, { "epoch": 0.07742491383554899, "grad_norm": 0.24776297807693481, "learning_rate": 0.00046157162212095087, "loss": 7.6378, "step": 629 }, { "epoch": 0.0775480059084195, "grad_norm": 0.4903216063976288, "learning_rate": 0.0004615100381820421, "loss": 7.853, "step": 630 }, { "epoch": 0.07767109798129, "grad_norm": 0.8267338871955872, "learning_rate": 0.00046144845424313344, "loss": 7.309, "step": 631 }, { "epoch": 0.07779419005416051, "grad_norm": 0.340452641248703, "learning_rate": 0.0004613868703042247, "loss": 7.893, "step": 632 }, { "epoch": 0.07791728212703102, "grad_norm": 0.4384312033653259, "learning_rate": 0.00046132528636531596, "loss": 7.9084, "step": 633 }, { "epoch": 0.07804037419990152, "grad_norm": 0.3808216154575348, "learning_rate": 0.0004612637024264072, "loss": 7.8055, "step": 634 }, { "epoch": 0.07816346627277203, "grad_norm": 0.5082446336746216, "learning_rate": 0.0004612021184874985, "loss": 7.8138, "step": 635 }, { "epoch": 0.07828655834564253, "grad_norm": 0.3460979759693146, "learning_rate": 0.0004611405345485897, "loss": 7.3436, "step": 636 }, { "epoch": 0.07840965041851304, "grad_norm": 0.3883151113986969, "learning_rate": 0.000461078950609681, "loss": 7.3764, "step": 637 }, { "epoch": 0.07853274249138356, "grad_norm": 0.20130054652690887, "learning_rate": 0.0004610173666707723, "loss": 7.622, "step": 638 }, { "epoch": 0.07865583456425407, "grad_norm": 0.2619245946407318, "learning_rate": 0.00046095578273186357, "loss": 7.5051, "step": 639 }, { "epoch": 0.07877892663712457, "grad_norm": 0.7577888369560242, "learning_rate": 0.0004608941987929548, "loss": 8.0959, "step": 640 }, { "epoch": 0.07890201870999508, "grad_norm": 0.4080001711845398, "learning_rate": 0.0004608326148540461, "loss": 7.8555, "step": 641 }, { "epoch": 0.07902511078286559, "grad_norm": 0.2260061353445053, 
"learning_rate": 0.0004607710309151373, "loss": 7.4757, "step": 642 }, { "epoch": 0.0791482028557361, "grad_norm": 0.31091129779815674, "learning_rate": 0.0004607094469762286, "loss": 7.5037, "step": 643 }, { "epoch": 0.0792712949286066, "grad_norm": 0.3062364459037781, "learning_rate": 0.0004606478630373199, "loss": 8.1371, "step": 644 }, { "epoch": 0.0793943870014771, "grad_norm": 0.26548057794570923, "learning_rate": 0.0004605862790984112, "loss": 7.5398, "step": 645 }, { "epoch": 0.07951747907434761, "grad_norm": 0.3061646819114685, "learning_rate": 0.0004605246951595024, "loss": 7.9175, "step": 646 }, { "epoch": 0.07964057114721812, "grad_norm": 0.2504066824913025, "learning_rate": 0.0004604631112205937, "loss": 7.767, "step": 647 }, { "epoch": 0.07976366322008863, "grad_norm": 0.19148948788642883, "learning_rate": 0.00046040152728168493, "loss": 7.7871, "step": 648 }, { "epoch": 0.07988675529295913, "grad_norm": 0.2792893350124359, "learning_rate": 0.0004603399433427762, "loss": 7.631, "step": 649 }, { "epoch": 0.08000984736582964, "grad_norm": 0.2197357416152954, "learning_rate": 0.00046027835940386745, "loss": 7.4598, "step": 650 }, { "epoch": 0.08013293943870015, "grad_norm": 0.30291804671287537, "learning_rate": 0.0004602167754649588, "loss": 7.6172, "step": 651 }, { "epoch": 0.08025603151157065, "grad_norm": 0.30482226610183716, "learning_rate": 0.00046015519152605, "loss": 7.8627, "step": 652 }, { "epoch": 0.08037912358444116, "grad_norm": 0.23387014865875244, "learning_rate": 0.0004600936075871413, "loss": 7.3532, "step": 653 }, { "epoch": 0.08050221565731167, "grad_norm": 0.46033281087875366, "learning_rate": 0.00046003202364823254, "loss": 8.0687, "step": 654 }, { "epoch": 0.08062530773018217, "grad_norm": 0.3004489839076996, "learning_rate": 0.00045997043970932383, "loss": 7.6768, "step": 655 }, { "epoch": 0.08074839980305268, "grad_norm": 0.3911416530609131, "learning_rate": 0.00045990885577041506, "loss": 7.7161, "step": 656 }, { "epoch": 
0.08087149187592318, "grad_norm": 0.5755597352981567, "learning_rate": 0.00045984727183150635, "loss": 9.1413, "step": 657 }, { "epoch": 0.08099458394879369, "grad_norm": 1.2643924951553345, "learning_rate": 0.00045978568789259764, "loss": 7.3811, "step": 658 }, { "epoch": 0.08111767602166421, "grad_norm": 0.8765921592712402, "learning_rate": 0.0004597241039536889, "loss": 7.2324, "step": 659 }, { "epoch": 0.08124076809453472, "grad_norm": 0.3996867835521698, "learning_rate": 0.00045966252001478016, "loss": 7.7735, "step": 660 }, { "epoch": 0.08136386016740522, "grad_norm": 0.36174309253692627, "learning_rate": 0.00045960093607587144, "loss": 7.9553, "step": 661 }, { "epoch": 0.08148695224027573, "grad_norm": 0.4949360489845276, "learning_rate": 0.0004595393521369627, "loss": 7.8923, "step": 662 }, { "epoch": 0.08161004431314624, "grad_norm": 0.6458953022956848, "learning_rate": 0.00045947776819805396, "loss": 8.3924, "step": 663 }, { "epoch": 0.08173313638601674, "grad_norm": 0.19764401018619537, "learning_rate": 0.0004594161842591452, "loss": 7.5161, "step": 664 }, { "epoch": 0.08185622845888725, "grad_norm": 0.2769499123096466, "learning_rate": 0.00045935460032023653, "loss": 7.4039, "step": 665 }, { "epoch": 0.08197932053175776, "grad_norm": 0.23562276363372803, "learning_rate": 0.00045929301638132777, "loss": 7.4562, "step": 666 }, { "epoch": 0.08210241260462826, "grad_norm": 0.19402645528316498, "learning_rate": 0.00045923143244241905, "loss": 7.5314, "step": 667 }, { "epoch": 0.08222550467749877, "grad_norm": 0.20985941588878632, "learning_rate": 0.0004591698485035103, "loss": 7.6571, "step": 668 }, { "epoch": 0.08234859675036928, "grad_norm": 0.3966018557548523, "learning_rate": 0.00045910826456460157, "loss": 7.5721, "step": 669 }, { "epoch": 0.08247168882323978, "grad_norm": 0.6370438933372498, "learning_rate": 0.0004590466806256928, "loss": 8.4975, "step": 670 }, { "epoch": 0.08259478089611029, "grad_norm": 0.6214179396629333, "learning_rate": 
0.00045898509668678415, "loss": 7.821, "step": 671 }, { "epoch": 0.0827178729689808, "grad_norm": 0.5288426280021667, "learning_rate": 0.0004589235127478754, "loss": 8.0773, "step": 672 }, { "epoch": 0.0828409650418513, "grad_norm": 0.38691815733909607, "learning_rate": 0.00045886192880896667, "loss": 7.8686, "step": 673 }, { "epoch": 0.08296405711472181, "grad_norm": 0.33472922444343567, "learning_rate": 0.0004588003448700579, "loss": 7.6214, "step": 674 }, { "epoch": 0.08308714918759232, "grad_norm": 0.5748097896575928, "learning_rate": 0.0004587387609311492, "loss": 8.1027, "step": 675 }, { "epoch": 0.08321024126046282, "grad_norm": 0.3928294777870178, "learning_rate": 0.0004586771769922404, "loss": 7.6137, "step": 676 }, { "epoch": 0.08333333333333333, "grad_norm": 0.2617354393005371, "learning_rate": 0.0004586155930533317, "loss": 7.4187, "step": 677 }, { "epoch": 0.08345642540620384, "grad_norm": 0.2061844766139984, "learning_rate": 0.000458554009114423, "loss": 7.5096, "step": 678 }, { "epoch": 0.08357951747907434, "grad_norm": 0.2731180787086487, "learning_rate": 0.0004584924251755143, "loss": 7.4262, "step": 679 }, { "epoch": 0.08370260955194485, "grad_norm": 0.40353843569755554, "learning_rate": 0.0004584308412366055, "loss": 7.8062, "step": 680 }, { "epoch": 0.08382570162481537, "grad_norm": 0.19027206301689148, "learning_rate": 0.0004583692572976968, "loss": 7.4342, "step": 681 }, { "epoch": 0.08394879369768588, "grad_norm": 0.21518848836421967, "learning_rate": 0.00045830767335878803, "loss": 7.5011, "step": 682 }, { "epoch": 0.08407188577055638, "grad_norm": 0.23490849137306213, "learning_rate": 0.0004582460894198793, "loss": 7.6485, "step": 683 }, { "epoch": 0.08419497784342689, "grad_norm": 0.2553724944591522, "learning_rate": 0.00045818450548097055, "loss": 7.7835, "step": 684 }, { "epoch": 0.0843180699162974, "grad_norm": 0.4409460127353668, "learning_rate": 0.0004581229215420619, "loss": 7.8294, "step": 685 }, { "epoch": 0.0844411619891679, 
"grad_norm": 0.3710136413574219, "learning_rate": 0.0004580613376031531, "loss": 7.5763, "step": 686 }, { "epoch": 0.08456425406203841, "grad_norm": 0.2854559123516083, "learning_rate": 0.0004579997536642444, "loss": 7.6049, "step": 687 }, { "epoch": 0.08468734613490891, "grad_norm": 0.3343944847583771, "learning_rate": 0.00045793816972533564, "loss": 7.4058, "step": 688 }, { "epoch": 0.08481043820777942, "grad_norm": 0.644161581993103, "learning_rate": 0.0004578765857864269, "loss": 7.6624, "step": 689 }, { "epoch": 0.08493353028064993, "grad_norm": 1.0684337615966797, "learning_rate": 0.00045781500184751816, "loss": 8.1245, "step": 690 }, { "epoch": 0.08505662235352043, "grad_norm": 0.6700904965400696, "learning_rate": 0.00045775341790860944, "loss": 8.1465, "step": 691 }, { "epoch": 0.08517971442639094, "grad_norm": 0.6015821099281311, "learning_rate": 0.00045769183396970073, "loss": 8.2293, "step": 692 }, { "epoch": 0.08530280649926145, "grad_norm": 0.7512471079826355, "learning_rate": 0.000457630250030792, "loss": 7.5398, "step": 693 }, { "epoch": 0.08542589857213195, "grad_norm": 0.38843607902526855, "learning_rate": 0.00045756866609188325, "loss": 7.8461, "step": 694 }, { "epoch": 0.08554899064500246, "grad_norm": 0.5979195833206177, "learning_rate": 0.00045750708215297454, "loss": 7.4504, "step": 695 }, { "epoch": 0.08567208271787297, "grad_norm": 0.36697593331336975, "learning_rate": 0.00045744549821406577, "loss": 7.6352, "step": 696 }, { "epoch": 0.08579517479074347, "grad_norm": 0.7016244530677795, "learning_rate": 0.00045738391427515706, "loss": 8.2549, "step": 697 }, { "epoch": 0.08591826686361398, "grad_norm": 1.0306533575057983, "learning_rate": 0.00045732233033624834, "loss": 8.2864, "step": 698 }, { "epoch": 0.08604135893648449, "grad_norm": 0.3260517120361328, "learning_rate": 0.00045726074639733963, "loss": 7.1234, "step": 699 }, { "epoch": 0.08616445100935499, "grad_norm": 0.3230474293231964, "learning_rate": 0.00045719916245843086, "loss": 
7.5999, "step": 700 }, { "epoch": 0.0862875430822255, "grad_norm": 0.24334917962551117, "learning_rate": 0.0004571375785195221, "loss": 7.8803, "step": 701 }, { "epoch": 0.086410635155096, "grad_norm": 0.4236052632331848, "learning_rate": 0.0004570759945806134, "loss": 7.4785, "step": 702 }, { "epoch": 0.08653372722796653, "grad_norm": 0.3514898717403412, "learning_rate": 0.0004570144106417046, "loss": 7.9218, "step": 703 }, { "epoch": 0.08665681930083703, "grad_norm": 0.4607546925544739, "learning_rate": 0.0004569528267027959, "loss": 7.4893, "step": 704 }, { "epoch": 0.08677991137370754, "grad_norm": 0.37587711215019226, "learning_rate": 0.0004568912427638872, "loss": 7.3685, "step": 705 }, { "epoch": 0.08690300344657804, "grad_norm": 0.4320785701274872, "learning_rate": 0.00045682965882497847, "loss": 7.6086, "step": 706 }, { "epoch": 0.08702609551944855, "grad_norm": 0.26065364480018616, "learning_rate": 0.0004567680748860697, "loss": 7.3001, "step": 707 }, { "epoch": 0.08714918759231906, "grad_norm": 0.2325505167245865, "learning_rate": 0.000456706490947161, "loss": 7.3411, "step": 708 }, { "epoch": 0.08727227966518956, "grad_norm": 0.32938462495803833, "learning_rate": 0.0004566449070082522, "loss": 7.8918, "step": 709 }, { "epoch": 0.08739537173806007, "grad_norm": 0.42619967460632324, "learning_rate": 0.0004565833230693435, "loss": 7.6035, "step": 710 }, { "epoch": 0.08751846381093058, "grad_norm": 0.8107537031173706, "learning_rate": 0.00045652173913043474, "loss": 7.7669, "step": 711 }, { "epoch": 0.08764155588380108, "grad_norm": 0.3093187212944031, "learning_rate": 0.0004564601551915261, "loss": 7.7804, "step": 712 }, { "epoch": 0.08776464795667159, "grad_norm": 0.6488503217697144, "learning_rate": 0.0004563985712526173, "loss": 7.4711, "step": 713 }, { "epoch": 0.0878877400295421, "grad_norm": 0.4094676077365875, "learning_rate": 0.0004563369873137086, "loss": 7.7082, "step": 714 }, { "epoch": 0.0880108321024126, "grad_norm": 0.20677463710308075, 
"learning_rate": 0.00045627540337479983, "loss": 7.1789, "step": 715 }, { "epoch": 0.08813392417528311, "grad_norm": 0.5031416416168213, "learning_rate": 0.0004562138194358911, "loss": 7.5559, "step": 716 }, { "epoch": 0.08825701624815362, "grad_norm": 0.3678782880306244, "learning_rate": 0.00045615223549698235, "loss": 7.789, "step": 717 }, { "epoch": 0.08838010832102412, "grad_norm": 0.34534457325935364, "learning_rate": 0.00045609065155807364, "loss": 8.0999, "step": 718 }, { "epoch": 0.08850320039389463, "grad_norm": 0.5362476110458374, "learning_rate": 0.0004560290676191649, "loss": 7.5441, "step": 719 }, { "epoch": 0.08862629246676514, "grad_norm": 0.7540899515151978, "learning_rate": 0.0004559674836802562, "loss": 7.7558, "step": 720 }, { "epoch": 0.08874938453963564, "grad_norm": 0.593177318572998, "learning_rate": 0.00045590589974134745, "loss": 7.7335, "step": 721 }, { "epoch": 0.08887247661250615, "grad_norm": 0.4261304438114166, "learning_rate": 0.00045584431580243873, "loss": 7.6836, "step": 722 }, { "epoch": 0.08899556868537666, "grad_norm": 0.35448601841926575, "learning_rate": 0.00045578273186352996, "loss": 7.695, "step": 723 }, { "epoch": 0.08911866075824718, "grad_norm": 0.47357621788978577, "learning_rate": 0.00045572114792462125, "loss": 7.5692, "step": 724 }, { "epoch": 0.08924175283111768, "grad_norm": 0.4401455223560333, "learning_rate": 0.0004556595639857125, "loss": 7.4316, "step": 725 }, { "epoch": 0.08936484490398819, "grad_norm": 0.32496845722198486, "learning_rate": 0.0004555979800468038, "loss": 7.4723, "step": 726 }, { "epoch": 0.0894879369768587, "grad_norm": 0.34001579880714417, "learning_rate": 0.00045553639610789506, "loss": 7.251, "step": 727 }, { "epoch": 0.0896110290497292, "grad_norm": 0.46867942810058594, "learning_rate": 0.00045547481216898634, "loss": 7.9666, "step": 728 }, { "epoch": 0.08973412112259971, "grad_norm": 0.5176891088485718, "learning_rate": 0.0004554132282300776, "loss": 7.427, "step": 729 }, { "epoch": 
0.08985721319547021, "grad_norm": 0.4607340693473816, "learning_rate": 0.00045535164429116886, "loss": 7.1533, "step": 730 }, { "epoch": 0.08998030526834072, "grad_norm": 0.549744725227356, "learning_rate": 0.0004552900603522601, "loss": 7.5655, "step": 731 }, { "epoch": 0.09010339734121123, "grad_norm": 0.7251145243644714, "learning_rate": 0.00045522847641335144, "loss": 7.915, "step": 732 }, { "epoch": 0.09022648941408173, "grad_norm": 1.0756474733352661, "learning_rate": 0.00045516689247444267, "loss": 8.7391, "step": 733 }, { "epoch": 0.09034958148695224, "grad_norm": 0.3990667760372162, "learning_rate": 0.00045510530853553395, "loss": 8.1278, "step": 734 }, { "epoch": 0.09047267355982275, "grad_norm": 0.266660213470459, "learning_rate": 0.0004550437245966252, "loss": 7.691, "step": 735 }, { "epoch": 0.09059576563269325, "grad_norm": 0.37782710790634155, "learning_rate": 0.0004549821406577165, "loss": 7.8096, "step": 736 }, { "epoch": 0.09071885770556376, "grad_norm": 0.40948107838630676, "learning_rate": 0.0004549205567188077, "loss": 8.0547, "step": 737 }, { "epoch": 0.09084194977843427, "grad_norm": 0.6503539681434631, "learning_rate": 0.000454858972779899, "loss": 7.4373, "step": 738 }, { "epoch": 0.09096504185130477, "grad_norm": 0.43858715891838074, "learning_rate": 0.0004547973888409903, "loss": 7.5965, "step": 739 }, { "epoch": 0.09108813392417528, "grad_norm": 0.5861532092094421, "learning_rate": 0.00045473580490208157, "loss": 8.2216, "step": 740 }, { "epoch": 0.09121122599704579, "grad_norm": 0.4315558671951294, "learning_rate": 0.0004546742209631728, "loss": 7.5397, "step": 741 }, { "epoch": 0.09133431806991629, "grad_norm": 0.37516912817955017, "learning_rate": 0.0004546126370242641, "loss": 7.7077, "step": 742 }, { "epoch": 0.0914574101427868, "grad_norm": 0.5200655460357666, "learning_rate": 0.0004545510530853553, "loss": 7.3141, "step": 743 }, { "epoch": 0.0915805022156573, "grad_norm": 0.5071552395820618, "learning_rate": 0.0004544894691464466, 
"loss": 7.444, "step": 744 }, { "epoch": 0.09170359428852781, "grad_norm": 0.4373703598976135, "learning_rate": 0.00045442788520753784, "loss": 8.2788, "step": 745 }, { "epoch": 0.09182668636139833, "grad_norm": 0.25664475560188293, "learning_rate": 0.0004543663012686292, "loss": 7.6122, "step": 746 }, { "epoch": 0.09194977843426884, "grad_norm": 0.6505091190338135, "learning_rate": 0.0004543047173297204, "loss": 8.4112, "step": 747 }, { "epoch": 0.09207287050713935, "grad_norm": 0.3387986719608307, "learning_rate": 0.0004542431333908117, "loss": 7.5935, "step": 748 }, { "epoch": 0.09219596258000985, "grad_norm": 0.5906630754470825, "learning_rate": 0.00045418154945190293, "loss": 7.2533, "step": 749 }, { "epoch": 0.09231905465288036, "grad_norm": 0.27711308002471924, "learning_rate": 0.0004541199655129942, "loss": 7.377, "step": 750 }, { "epoch": 0.09244214672575086, "grad_norm": 0.4310401380062103, "learning_rate": 0.00045405838157408545, "loss": 7.5895, "step": 751 }, { "epoch": 0.09256523879862137, "grad_norm": 0.5313851237297058, "learning_rate": 0.0004539967976351768, "loss": 7.9765, "step": 752 }, { "epoch": 0.09268833087149188, "grad_norm": 0.23829545080661774, "learning_rate": 0.000453935213696268, "loss": 7.42, "step": 753 }, { "epoch": 0.09281142294436238, "grad_norm": 0.18719474971294403, "learning_rate": 0.0004538736297573593, "loss": 7.9558, "step": 754 }, { "epoch": 0.09293451501723289, "grad_norm": 0.35093170404434204, "learning_rate": 0.00045381204581845054, "loss": 8.0294, "step": 755 }, { "epoch": 0.0930576070901034, "grad_norm": 0.42901191115379333, "learning_rate": 0.0004537504618795418, "loss": 8.8142, "step": 756 }, { "epoch": 0.0931806991629739, "grad_norm": 0.36167076230049133, "learning_rate": 0.00045368887794063306, "loss": 7.4833, "step": 757 }, { "epoch": 0.09330379123584441, "grad_norm": 0.33803385496139526, "learning_rate": 0.00045362729400172435, "loss": 8.3419, "step": 758 }, { "epoch": 0.09342688330871492, "grad_norm": 
0.461340993642807, "learning_rate": 0.00045356571006281563, "loss": 7.2324, "step": 759 }, { "epoch": 0.09354997538158542, "grad_norm": 0.3593752980232239, "learning_rate": 0.0004535041261239069, "loss": 7.5208, "step": 760 }, { "epoch": 0.09367306745445593, "grad_norm": 0.6311232447624207, "learning_rate": 0.00045344254218499815, "loss": 8.0532, "step": 761 }, { "epoch": 0.09379615952732644, "grad_norm": 0.35970309376716614, "learning_rate": 0.00045338095824608944, "loss": 7.5947, "step": 762 }, { "epoch": 0.09391925160019694, "grad_norm": 0.31282320618629456, "learning_rate": 0.00045331937430718067, "loss": 7.7732, "step": 763 }, { "epoch": 0.09404234367306745, "grad_norm": 0.25274473428726196, "learning_rate": 0.00045325779036827196, "loss": 8.1711, "step": 764 }, { "epoch": 0.09416543574593796, "grad_norm": 0.8914788961410522, "learning_rate": 0.0004531962064293632, "loss": 7.4883, "step": 765 }, { "epoch": 0.09428852781880846, "grad_norm": 0.7991496920585632, "learning_rate": 0.00045313462249045453, "loss": 7.8027, "step": 766 }, { "epoch": 0.09441161989167897, "grad_norm": 0.5602808594703674, "learning_rate": 0.00045307303855154576, "loss": 7.9202, "step": 767 }, { "epoch": 0.09453471196454949, "grad_norm": 0.6502956748008728, "learning_rate": 0.00045301145461263705, "loss": 8.9053, "step": 768 }, { "epoch": 0.09465780403742, "grad_norm": 0.24325662851333618, "learning_rate": 0.0004529498706737283, "loss": 7.6877, "step": 769 }, { "epoch": 0.0947808961102905, "grad_norm": 0.24597319960594177, "learning_rate": 0.00045288828673481957, "loss": 7.3672, "step": 770 }, { "epoch": 0.09490398818316101, "grad_norm": 0.2879861891269684, "learning_rate": 0.0004528267027959108, "loss": 7.3627, "step": 771 }, { "epoch": 0.09502708025603152, "grad_norm": 0.4777382016181946, "learning_rate": 0.0004527651188570021, "loss": 7.748, "step": 772 }, { "epoch": 0.09515017232890202, "grad_norm": 0.6104176044464111, "learning_rate": 0.0004527035349180934, "loss": 8.1087, "step": 773 
}, { "epoch": 0.09527326440177253, "grad_norm": 0.2758658826351166, "learning_rate": 0.00045264195097918466, "loss": 7.8011, "step": 774 }, { "epoch": 0.09539635647464303, "grad_norm": 0.25181344151496887, "learning_rate": 0.0004525803670402759, "loss": 7.7375, "step": 775 }, { "epoch": 0.09551944854751354, "grad_norm": 0.36917030811309814, "learning_rate": 0.0004525187831013672, "loss": 8.7874, "step": 776 }, { "epoch": 0.09564254062038405, "grad_norm": 0.7285192012786865, "learning_rate": 0.0004524571991624584, "loss": 7.3839, "step": 777 }, { "epoch": 0.09576563269325455, "grad_norm": 0.38289767503738403, "learning_rate": 0.0004523956152235497, "loss": 8.3905, "step": 778 }, { "epoch": 0.09588872476612506, "grad_norm": 0.4589630365371704, "learning_rate": 0.00045233403128464093, "loss": 7.6781, "step": 779 }, { "epoch": 0.09601181683899557, "grad_norm": 0.4157741367816925, "learning_rate": 0.00045227244734573227, "loss": 7.7839, "step": 780 }, { "epoch": 0.09613490891186607, "grad_norm": 0.2813003957271576, "learning_rate": 0.0004522108634068235, "loss": 7.3874, "step": 781 }, { "epoch": 0.09625800098473658, "grad_norm": 0.5774465799331665, "learning_rate": 0.0004521492794679148, "loss": 7.6869, "step": 782 }, { "epoch": 0.09638109305760709, "grad_norm": 0.4205627739429474, "learning_rate": 0.000452087695529006, "loss": 7.4518, "step": 783 }, { "epoch": 0.0965041851304776, "grad_norm": 0.5358959436416626, "learning_rate": 0.0004520261115900973, "loss": 7.8145, "step": 784 }, { "epoch": 0.0966272772033481, "grad_norm": 0.22721369564533234, "learning_rate": 0.00045196452765118854, "loss": 7.5299, "step": 785 }, { "epoch": 0.0967503692762186, "grad_norm": 0.27114415168762207, "learning_rate": 0.0004519029437122799, "loss": 8.1377, "step": 786 }, { "epoch": 0.09687346134908911, "grad_norm": 0.5925058722496033, "learning_rate": 0.0004518413597733711, "loss": 7.5241, "step": 787 }, { "epoch": 0.09699655342195962, "grad_norm": 0.5923349857330322, "learning_rate": 
0.0004517797758344624, "loss": 7.6027, "step": 788 }, { "epoch": 0.09711964549483014, "grad_norm": 0.35086360573768616, "learning_rate": 0.00045171819189555363, "loss": 8.141, "step": 789 }, { "epoch": 0.09724273756770065, "grad_norm": 0.3024654686450958, "learning_rate": 0.0004516566079566449, "loss": 7.6576, "step": 790 }, { "epoch": 0.09736582964057115, "grad_norm": 0.2837193012237549, "learning_rate": 0.00045159502401773615, "loss": 7.7421, "step": 791 }, { "epoch": 0.09748892171344166, "grad_norm": 0.3944026529788971, "learning_rate": 0.00045153344007882744, "loss": 7.9471, "step": 792 }, { "epoch": 0.09761201378631217, "grad_norm": 0.27893033623695374, "learning_rate": 0.0004514718561399187, "loss": 7.4185, "step": 793 }, { "epoch": 0.09773510585918267, "grad_norm": 0.5380989909172058, "learning_rate": 0.00045141027220101, "loss": 8.3235, "step": 794 }, { "epoch": 0.09785819793205318, "grad_norm": 0.3128379285335541, "learning_rate": 0.00045134868826210124, "loss": 8.0524, "step": 795 }, { "epoch": 0.09798129000492369, "grad_norm": 1.0374077558517456, "learning_rate": 0.00045128710432319253, "loss": 9.9427, "step": 796 }, { "epoch": 0.09810438207779419, "grad_norm": 0.7038276791572571, "learning_rate": 0.00045122552038428376, "loss": 7.52, "step": 797 }, { "epoch": 0.0982274741506647, "grad_norm": 0.8019376397132874, "learning_rate": 0.00045116393644537505, "loss": 7.2921, "step": 798 }, { "epoch": 0.0983505662235352, "grad_norm": 0.5486743450164795, "learning_rate": 0.0004511023525064663, "loss": 7.6272, "step": 799 }, { "epoch": 0.09847365829640571, "grad_norm": 0.5308101177215576, "learning_rate": 0.0004510407685675576, "loss": 7.5713, "step": 800 }, { "epoch": 0.09859675036927622, "grad_norm": 0.6808559894561768, "learning_rate": 0.00045097918462864886, "loss": 7.6197, "step": 801 }, { "epoch": 0.09871984244214672, "grad_norm": 0.9671376943588257, "learning_rate": 0.00045091760068974014, "loss": 8.3289, "step": 802 }, { "epoch": 0.09884293451501723, 
"grad_norm": 0.7335740923881531, "learning_rate": 0.0004508560167508314, "loss": 8.1247, "step": 803 }, { "epoch": 0.09896602658788774, "grad_norm": 1.0132482051849365, "learning_rate": 0.00045079443281192266, "loss": 8.8222, "step": 804 }, { "epoch": 0.09908911866075824, "grad_norm": 0.422561913728714, "learning_rate": 0.0004507328488730139, "loss": 7.5458, "step": 805 }, { "epoch": 0.09921221073362875, "grad_norm": 0.5768774747848511, "learning_rate": 0.0004506712649341052, "loss": 8.5408, "step": 806 }, { "epoch": 0.09933530280649926, "grad_norm": 0.5148844122886658, "learning_rate": 0.00045060968099519647, "loss": 8.7785, "step": 807 }, { "epoch": 0.09945839487936976, "grad_norm": 0.7023916244506836, "learning_rate": 0.00045054809705628775, "loss": 8.2399, "step": 808 }, { "epoch": 0.09958148695224027, "grad_norm": 1.0971624851226807, "learning_rate": 0.000450486513117379, "loss": 7.6223, "step": 809 }, { "epoch": 0.09970457902511078, "grad_norm": 1.1799451112747192, "learning_rate": 0.00045042492917847027, "loss": 7.5327, "step": 810 }, { "epoch": 0.0998276710979813, "grad_norm": 0.572324275970459, "learning_rate": 0.0004503633452395615, "loss": 7.7207, "step": 811 }, { "epoch": 0.0999507631708518, "grad_norm": 0.47164228558540344, "learning_rate": 0.0004503017613006528, "loss": 7.2996, "step": 812 }, { "epoch": 0.10007385524372231, "grad_norm": 0.8729771375656128, "learning_rate": 0.0004502401773617441, "loss": 7.5602, "step": 813 }, { "epoch": 0.10019694731659282, "grad_norm": 0.9762377738952637, "learning_rate": 0.00045017859342283536, "loss": 7.6674, "step": 814 }, { "epoch": 0.10032003938946332, "grad_norm": 0.9933874607086182, "learning_rate": 0.0004501170094839266, "loss": 8.168, "step": 815 }, { "epoch": 0.10044313146233383, "grad_norm": 0.6455259919166565, "learning_rate": 0.0004500554255450179, "loss": 7.8821, "step": 816 }, { "epoch": 0.10056622353520434, "grad_norm": 0.56917405128479, "learning_rate": 0.0004499938416061091, "loss": 8.6961, "step": 
817 }, { "epoch": 0.10068931560807484, "grad_norm": 0.5218147039413452, "learning_rate": 0.0004499322576672004, "loss": 7.4707, "step": 818 }, { "epoch": 0.10081240768094535, "grad_norm": 0.5224952101707458, "learning_rate": 0.00044987067372829164, "loss": 7.9723, "step": 819 }, { "epoch": 0.10093549975381585, "grad_norm": 0.8668727278709412, "learning_rate": 0.000449809089789383, "loss": 7.3876, "step": 820 }, { "epoch": 0.10105859182668636, "grad_norm": 0.5270757675170898, "learning_rate": 0.0004497475058504742, "loss": 7.7933, "step": 821 }, { "epoch": 0.10118168389955687, "grad_norm": 0.35669881105422974, "learning_rate": 0.0004496859219115655, "loss": 7.6139, "step": 822 }, { "epoch": 0.10130477597242737, "grad_norm": 0.3397178649902344, "learning_rate": 0.00044962433797265673, "loss": 7.4702, "step": 823 }, { "epoch": 0.10142786804529788, "grad_norm": 0.8376403450965881, "learning_rate": 0.000449562754033748, "loss": 8.0007, "step": 824 }, { "epoch": 0.10155096011816839, "grad_norm": 0.5327519178390503, "learning_rate": 0.00044950117009483925, "loss": 7.975, "step": 825 }, { "epoch": 0.1016740521910389, "grad_norm": 1.390215277671814, "learning_rate": 0.00044943958615593053, "loss": 9.2354, "step": 826 }, { "epoch": 0.1017971442639094, "grad_norm": 0.30302366614341736, "learning_rate": 0.0004493780022170218, "loss": 7.6171, "step": 827 }, { "epoch": 0.1019202363367799, "grad_norm": 1.1935219764709473, "learning_rate": 0.0004493164182781131, "loss": 9.8792, "step": 828 }, { "epoch": 0.10204332840965041, "grad_norm": 0.6592932343482971, "learning_rate": 0.00044925483433920434, "loss": 7.5613, "step": 829 }, { "epoch": 0.10216642048252092, "grad_norm": 0.6552746891975403, "learning_rate": 0.0004491932504002956, "loss": 7.7262, "step": 830 }, { "epoch": 0.10228951255539143, "grad_norm": 0.7634150981903076, "learning_rate": 0.00044913166646138686, "loss": 7.5014, "step": 831 }, { "epoch": 0.10241260462826195, "grad_norm": 0.3424353301525116, "learning_rate": 
0.00044907008252247814, "loss": 7.954, "step": 832 }, { "epoch": 0.10253569670113245, "grad_norm": 0.35784611105918884, "learning_rate": 0.0004490084985835694, "loss": 8.066, "step": 833 }, { "epoch": 0.10265878877400296, "grad_norm": 0.5473178029060364, "learning_rate": 0.0004489469146446607, "loss": 7.7902, "step": 834 }, { "epoch": 0.10278188084687347, "grad_norm": 0.7147846817970276, "learning_rate": 0.00044888533070575195, "loss": 8.5531, "step": 835 }, { "epoch": 0.10290497291974397, "grad_norm": 0.22536078095436096, "learning_rate": 0.00044882374676684324, "loss": 7.4788, "step": 836 }, { "epoch": 0.10302806499261448, "grad_norm": 0.22688928246498108, "learning_rate": 0.00044876216282793447, "loss": 7.7729, "step": 837 }, { "epoch": 0.10315115706548499, "grad_norm": 0.32227322459220886, "learning_rate": 0.00044870057888902576, "loss": 8.019, "step": 838 }, { "epoch": 0.10327424913835549, "grad_norm": 0.2825881242752075, "learning_rate": 0.000448638994950117, "loss": 7.6881, "step": 839 }, { "epoch": 0.103397341211226, "grad_norm": 0.466103196144104, "learning_rate": 0.00044857741101120833, "loss": 7.6107, "step": 840 }, { "epoch": 0.1035204332840965, "grad_norm": 0.7441082000732422, "learning_rate": 0.00044851582707229956, "loss": 7.2269, "step": 841 }, { "epoch": 0.10364352535696701, "grad_norm": 0.18000520765781403, "learning_rate": 0.00044845424313339085, "loss": 7.4214, "step": 842 }, { "epoch": 0.10376661742983752, "grad_norm": 0.29723411798477173, "learning_rate": 0.0004483926591944821, "loss": 7.501, "step": 843 }, { "epoch": 0.10388970950270802, "grad_norm": 0.37798747420310974, "learning_rate": 0.00044833107525557337, "loss": 7.5469, "step": 844 }, { "epoch": 0.10401280157557853, "grad_norm": 0.3125651478767395, "learning_rate": 0.0004482694913166646, "loss": 7.7315, "step": 845 }, { "epoch": 0.10413589364844904, "grad_norm": 0.176555335521698, "learning_rate": 0.0004482079073777559, "loss": 7.7411, "step": 846 }, { "epoch": 0.10425898572131954, 
"grad_norm": 0.2514618933200836, "learning_rate": 0.00044814632343884717, "loss": 8.0085, "step": 847 }, { "epoch": 0.10438207779419005, "grad_norm": 0.9113054275512695, "learning_rate": 0.00044808473949993846, "loss": 7.0304, "step": 848 }, { "epoch": 0.10450516986706056, "grad_norm": 0.5068264007568359, "learning_rate": 0.0004480231555610297, "loss": 7.4375, "step": 849 }, { "epoch": 0.10462826193993106, "grad_norm": 0.20756866037845612, "learning_rate": 0.000447961571622121, "loss": 7.6719, "step": 850 }, { "epoch": 0.10475135401280157, "grad_norm": 0.4786818027496338, "learning_rate": 0.0004478999876832122, "loss": 8.1949, "step": 851 }, { "epoch": 0.10487444608567208, "grad_norm": 0.772101640701294, "learning_rate": 0.0004478384037443035, "loss": 8.2405, "step": 852 }, { "epoch": 0.10499753815854258, "grad_norm": 0.5171286463737488, "learning_rate": 0.00044777681980539473, "loss": 7.8669, "step": 853 }, { "epoch": 0.1051206302314131, "grad_norm": 0.44134241342544556, "learning_rate": 0.00044771523586648607, "loss": 7.77, "step": 854 }, { "epoch": 0.10524372230428361, "grad_norm": 0.38046717643737793, "learning_rate": 0.0004476536519275773, "loss": 7.5759, "step": 855 }, { "epoch": 0.10536681437715412, "grad_norm": 0.44468414783477783, "learning_rate": 0.0004475920679886686, "loss": 7.6705, "step": 856 }, { "epoch": 0.10548990645002462, "grad_norm": 0.5747100710868835, "learning_rate": 0.0004475304840497598, "loss": 7.4591, "step": 857 }, { "epoch": 0.10561299852289513, "grad_norm": 0.22970815002918243, "learning_rate": 0.0004474689001108511, "loss": 8.0323, "step": 858 }, { "epoch": 0.10573609059576564, "grad_norm": 0.40302321314811707, "learning_rate": 0.00044740731617194234, "loss": 7.7409, "step": 859 }, { "epoch": 0.10585918266863614, "grad_norm": 0.3220136761665344, "learning_rate": 0.0004473457322330336, "loss": 7.7638, "step": 860 }, { "epoch": 0.10598227474150665, "grad_norm": 0.61216139793396, "learning_rate": 0.0004472841482941249, "loss": 7.9695, 
"step": 861 }, { "epoch": 0.10610536681437716, "grad_norm": 0.35537126660346985, "learning_rate": 0.0004472225643552162, "loss": 7.5193, "step": 862 }, { "epoch": 0.10622845888724766, "grad_norm": 0.28727927803993225, "learning_rate": 0.00044716098041630743, "loss": 7.4404, "step": 863 }, { "epoch": 0.10635155096011817, "grad_norm": 0.34985271096229553, "learning_rate": 0.0004470993964773987, "loss": 7.2531, "step": 864 }, { "epoch": 0.10647464303298868, "grad_norm": 0.326869398355484, "learning_rate": 0.00044703781253848995, "loss": 7.367, "step": 865 }, { "epoch": 0.10659773510585918, "grad_norm": 0.4156915545463562, "learning_rate": 0.00044697622859958124, "loss": 7.8445, "step": 866 }, { "epoch": 0.10672082717872969, "grad_norm": 0.2126128077507019, "learning_rate": 0.0004469146446606725, "loss": 7.2486, "step": 867 }, { "epoch": 0.1068439192516002, "grad_norm": 0.3219527006149292, "learning_rate": 0.0004468530607217638, "loss": 7.4666, "step": 868 }, { "epoch": 0.1069670113244707, "grad_norm": 0.7814860343933105, "learning_rate": 0.00044679147678285504, "loss": 8.3763, "step": 869 }, { "epoch": 0.10709010339734121, "grad_norm": 0.291374146938324, "learning_rate": 0.00044672989284394633, "loss": 8.1273, "step": 870 }, { "epoch": 0.10721319547021171, "grad_norm": 0.35902857780456543, "learning_rate": 0.00044666830890503756, "loss": 8.9836, "step": 871 }, { "epoch": 0.10733628754308222, "grad_norm": 0.47156664729118347, "learning_rate": 0.00044660672496612885, "loss": 7.7323, "step": 872 }, { "epoch": 0.10745937961595273, "grad_norm": 0.6585397124290466, "learning_rate": 0.0004465451410272201, "loss": 7.6998, "step": 873 }, { "epoch": 0.10758247168882323, "grad_norm": 0.3347184658050537, "learning_rate": 0.0004464835570883114, "loss": 8.0764, "step": 874 }, { "epoch": 0.10770556376169374, "grad_norm": 0.49151283502578735, "learning_rate": 0.00044642197314940265, "loss": 7.4939, "step": 875 }, { "epoch": 0.10782865583456426, "grad_norm": 0.8771236538887024, 
"learning_rate": 0.00044636038921049394, "loss": 9.2593, "step": 876 }, { "epoch": 0.10795174790743477, "grad_norm": 0.9971374273300171, "learning_rate": 0.0004462988052715852, "loss": 8.6704, "step": 877 }, { "epoch": 0.10807483998030527, "grad_norm": 0.21502500772476196, "learning_rate": 0.00044623722133267646, "loss": 7.332, "step": 878 }, { "epoch": 0.10819793205317578, "grad_norm": 0.3188892900943756, "learning_rate": 0.0004461756373937677, "loss": 7.9859, "step": 879 }, { "epoch": 0.10832102412604629, "grad_norm": 0.454812616109848, "learning_rate": 0.000446114053454859, "loss": 7.3765, "step": 880 }, { "epoch": 0.10844411619891679, "grad_norm": 0.3148365020751953, "learning_rate": 0.00044605246951595027, "loss": 7.7276, "step": 881 }, { "epoch": 0.1085672082717873, "grad_norm": 0.3424195945262909, "learning_rate": 0.00044599088557704155, "loss": 7.3254, "step": 882 }, { "epoch": 0.1086903003446578, "grad_norm": 0.5287424325942993, "learning_rate": 0.0004459293016381328, "loss": 7.8457, "step": 883 }, { "epoch": 0.10881339241752831, "grad_norm": 0.4485209882259369, "learning_rate": 0.00044586771769922407, "loss": 8.0172, "step": 884 }, { "epoch": 0.10893648449039882, "grad_norm": 0.42303377389907837, "learning_rate": 0.0004458061337603153, "loss": 7.7621, "step": 885 }, { "epoch": 0.10905957656326933, "grad_norm": 0.4930860996246338, "learning_rate": 0.0004457445498214066, "loss": 7.7997, "step": 886 }, { "epoch": 0.10918266863613983, "grad_norm": 0.34663456678390503, "learning_rate": 0.0004456829658824978, "loss": 8.0282, "step": 887 }, { "epoch": 0.10930576070901034, "grad_norm": 0.4849244952201843, "learning_rate": 0.00044562138194358916, "loss": 7.2912, "step": 888 }, { "epoch": 0.10942885278188084, "grad_norm": 0.3080669641494751, "learning_rate": 0.0004455597980046804, "loss": 7.6881, "step": 889 }, { "epoch": 0.10955194485475135, "grad_norm": 0.3769862949848175, "learning_rate": 0.0004454982140657717, "loss": 8.4058, "step": 890 }, { "epoch": 
0.10967503692762186, "grad_norm": 0.2622573673725128, "learning_rate": 0.0004454366301268629, "loss": 7.7224, "step": 891 }, { "epoch": 0.10979812900049236, "grad_norm": 0.4295133352279663, "learning_rate": 0.0004453750461879542, "loss": 8.0431, "step": 892 }, { "epoch": 0.10992122107336287, "grad_norm": 0.3357365131378174, "learning_rate": 0.00044531346224904543, "loss": 7.4193, "step": 893 }, { "epoch": 0.11004431314623338, "grad_norm": 0.2094503939151764, "learning_rate": 0.0004452518783101368, "loss": 7.7293, "step": 894 }, { "epoch": 0.11016740521910388, "grad_norm": 0.21888315677642822, "learning_rate": 0.000445190294371228, "loss": 7.4376, "step": 895 }, { "epoch": 0.11029049729197439, "grad_norm": 0.27555733919143677, "learning_rate": 0.0004451287104323193, "loss": 7.6401, "step": 896 }, { "epoch": 0.11041358936484491, "grad_norm": 0.18884485960006714, "learning_rate": 0.0004450671264934105, "loss": 7.5462, "step": 897 }, { "epoch": 0.11053668143771542, "grad_norm": 0.43108639121055603, "learning_rate": 0.0004450055425545018, "loss": 7.3741, "step": 898 }, { "epoch": 0.11065977351058592, "grad_norm": 0.4137783348560333, "learning_rate": 0.00044494395861559305, "loss": 7.3315, "step": 899 }, { "epoch": 0.11078286558345643, "grad_norm": 0.44931289553642273, "learning_rate": 0.00044488237467668433, "loss": 7.8992, "step": 900 }, { "epoch": 0.11090595765632694, "grad_norm": 0.5848280787467957, "learning_rate": 0.0004448207907377756, "loss": 8.4194, "step": 901 }, { "epoch": 0.11102904972919744, "grad_norm": 0.2191595733165741, "learning_rate": 0.0004447592067988669, "loss": 7.4782, "step": 902 }, { "epoch": 0.11115214180206795, "grad_norm": 0.31576138734817505, "learning_rate": 0.00044469762285995814, "loss": 7.9203, "step": 903 }, { "epoch": 0.11127523387493846, "grad_norm": 0.33844929933547974, "learning_rate": 0.0004446360389210494, "loss": 7.4582, "step": 904 }, { "epoch": 0.11139832594780896, "grad_norm": 0.24956445395946503, "learning_rate": 
0.00044457445498214066, "loss": 7.6616, "step": 905 }, { "epoch": 0.11152141802067947, "grad_norm": 0.641548216342926, "learning_rate": 0.00044451287104323194, "loss": 8.3815, "step": 906 }, { "epoch": 0.11164451009354998, "grad_norm": 0.7533461451530457, "learning_rate": 0.0004444512871043232, "loss": 8.99, "step": 907 }, { "epoch": 0.11176760216642048, "grad_norm": 0.41389864683151245, "learning_rate": 0.0004443897031654145, "loss": 7.398, "step": 908 }, { "epoch": 0.11189069423929099, "grad_norm": 0.2403981238603592, "learning_rate": 0.00044432811922650575, "loss": 8.3674, "step": 909 }, { "epoch": 0.1120137863121615, "grad_norm": 0.4024316966533661, "learning_rate": 0.00044426653528759704, "loss": 7.5062, "step": 910 }, { "epoch": 0.112136878385032, "grad_norm": 0.4562370777130127, "learning_rate": 0.00044420495134868827, "loss": 7.2676, "step": 911 }, { "epoch": 0.11225997045790251, "grad_norm": 0.34804314374923706, "learning_rate": 0.00044414336740977955, "loss": 7.901, "step": 912 }, { "epoch": 0.11238306253077301, "grad_norm": 0.456240713596344, "learning_rate": 0.0004440817834708708, "loss": 7.4827, "step": 913 }, { "epoch": 0.11250615460364352, "grad_norm": 0.7326076626777649, "learning_rate": 0.0004440201995319621, "loss": 8.1633, "step": 914 }, { "epoch": 0.11262924667651403, "grad_norm": 0.6238304376602173, "learning_rate": 0.00044395861559305336, "loss": 8.3353, "step": 915 }, { "epoch": 0.11275233874938453, "grad_norm": 0.30062630772590637, "learning_rate": 0.00044389703165414465, "loss": 7.6724, "step": 916 }, { "epoch": 0.11287543082225504, "grad_norm": 0.5999611616134644, "learning_rate": 0.0004438354477152359, "loss": 8.5053, "step": 917 }, { "epoch": 0.11299852289512555, "grad_norm": 0.2810717821121216, "learning_rate": 0.00044377386377632717, "loss": 8.8287, "step": 918 }, { "epoch": 0.11312161496799607, "grad_norm": 0.8900923728942871, "learning_rate": 0.0004437122798374184, "loss": 7.2813, "step": 919 }, { "epoch": 0.11324470704086657, 
"grad_norm": 0.5480354428291321, "learning_rate": 0.0004436506958985097, "loss": 8.4955, "step": 920 }, { "epoch": 0.11336779911373708, "grad_norm": 0.3968753218650818, "learning_rate": 0.0004435891119596009, "loss": 7.5479, "step": 921 }, { "epoch": 0.11349089118660759, "grad_norm": 0.479797899723053, "learning_rate": 0.00044352752802069226, "loss": 8.3407, "step": 922 }, { "epoch": 0.1136139832594781, "grad_norm": 0.39239269495010376, "learning_rate": 0.0004434659440817835, "loss": 8.2231, "step": 923 }, { "epoch": 0.1137370753323486, "grad_norm": 0.3328399956226349, "learning_rate": 0.0004434043601428748, "loss": 7.4638, "step": 924 }, { "epoch": 0.1138601674052191, "grad_norm": 0.4002281129360199, "learning_rate": 0.000443342776203966, "loss": 7.6421, "step": 925 }, { "epoch": 0.11398325947808961, "grad_norm": 0.29614537954330444, "learning_rate": 0.0004432811922650573, "loss": 7.851, "step": 926 }, { "epoch": 0.11410635155096012, "grad_norm": 0.35103878378868103, "learning_rate": 0.00044321960832614853, "loss": 8.472, "step": 927 }, { "epoch": 0.11422944362383063, "grad_norm": 0.7603022456169128, "learning_rate": 0.00044315802438723987, "loss": 7.6672, "step": 928 }, { "epoch": 0.11435253569670113, "grad_norm": 1.246246337890625, "learning_rate": 0.0004430964404483311, "loss": 7.1831, "step": 929 }, { "epoch": 0.11447562776957164, "grad_norm": 0.7463754415512085, "learning_rate": 0.0004430348565094224, "loss": 7.4695, "step": 930 }, { "epoch": 0.11459871984244215, "grad_norm": 0.6034074425697327, "learning_rate": 0.0004429732725705136, "loss": 7.5524, "step": 931 }, { "epoch": 0.11472181191531265, "grad_norm": 0.3545913100242615, "learning_rate": 0.0004429116886316049, "loss": 7.4543, "step": 932 }, { "epoch": 0.11484490398818316, "grad_norm": 0.5369048714637756, "learning_rate": 0.00044285010469269614, "loss": 7.673, "step": 933 }, { "epoch": 0.11496799606105366, "grad_norm": 0.6562362313270569, "learning_rate": 0.00044278852075378737, "loss": 7.9381, "step": 
934 }, { "epoch": 0.11509108813392417, "grad_norm": 0.5476993918418884, "learning_rate": 0.0004427269368148787, "loss": 7.5022, "step": 935 }, { "epoch": 0.11521418020679468, "grad_norm": 0.3074606955051422, "learning_rate": 0.00044266535287596994, "loss": 7.3487, "step": 936 }, { "epoch": 0.11533727227966518, "grad_norm": 0.5372899174690247, "learning_rate": 0.00044260376893706123, "loss": 7.3166, "step": 937 }, { "epoch": 0.11546036435253569, "grad_norm": 0.27206215262413025, "learning_rate": 0.00044254218499815246, "loss": 7.7946, "step": 938 }, { "epoch": 0.1155834564254062, "grad_norm": 0.5547351241111755, "learning_rate": 0.00044248060105924375, "loss": 7.4456, "step": 939 }, { "epoch": 0.11570654849827672, "grad_norm": 0.6189630031585693, "learning_rate": 0.000442419017120335, "loss": 7.4565, "step": 940 }, { "epoch": 0.11582964057114722, "grad_norm": 0.34758713841438293, "learning_rate": 0.00044235743318142627, "loss": 7.7053, "step": 941 }, { "epoch": 0.11595273264401773, "grad_norm": 0.3935239017009735, "learning_rate": 0.00044229584924251756, "loss": 8.0183, "step": 942 }, { "epoch": 0.11607582471688824, "grad_norm": 0.25488367676734924, "learning_rate": 0.00044223426530360884, "loss": 7.4404, "step": 943 }, { "epoch": 0.11619891678975874, "grad_norm": 0.3225860297679901, "learning_rate": 0.0004421726813647001, "loss": 7.7003, "step": 944 }, { "epoch": 0.11632200886262925, "grad_norm": 0.32196998596191406, "learning_rate": 0.00044211109742579136, "loss": 8.2536, "step": 945 }, { "epoch": 0.11644510093549976, "grad_norm": 0.5372653603553772, "learning_rate": 0.0004420495134868826, "loss": 7.3148, "step": 946 }, { "epoch": 0.11656819300837026, "grad_norm": 0.37561336159706116, "learning_rate": 0.0004419879295479739, "loss": 7.8737, "step": 947 }, { "epoch": 0.11669128508124077, "grad_norm": 0.2562252879142761, "learning_rate": 0.0004419263456090651, "loss": 7.7234, "step": 948 }, { "epoch": 0.11681437715411128, "grad_norm": 0.3238879144191742, 
"learning_rate": 0.00044186476167015645, "loss": 7.7201, "step": 949 }, { "epoch": 0.11693746922698178, "grad_norm": 0.35579344630241394, "learning_rate": 0.0004418031777312477, "loss": 7.5057, "step": 950 }, { "epoch": 0.11706056129985229, "grad_norm": 0.3437248170375824, "learning_rate": 0.00044174159379233897, "loss": 7.5865, "step": 951 }, { "epoch": 0.1171836533727228, "grad_norm": 0.5695905089378357, "learning_rate": 0.0004416800098534302, "loss": 8.3355, "step": 952 }, { "epoch": 0.1173067454455933, "grad_norm": 0.4330349564552307, "learning_rate": 0.0004416184259145215, "loss": 7.437, "step": 953 }, { "epoch": 0.11742983751846381, "grad_norm": 0.24199652671813965, "learning_rate": 0.0004415568419756127, "loss": 8.1814, "step": 954 }, { "epoch": 0.11755292959133432, "grad_norm": 0.6704843640327454, "learning_rate": 0.00044149525803670406, "loss": 7.3529, "step": 955 }, { "epoch": 0.11767602166420482, "grad_norm": 0.1879255473613739, "learning_rate": 0.0004414336740977953, "loss": 7.8902, "step": 956 }, { "epoch": 0.11779911373707533, "grad_norm": 0.23511162400245667, "learning_rate": 0.0004413720901588866, "loss": 7.6988, "step": 957 }, { "epoch": 0.11792220580994583, "grad_norm": 0.5316047072410583, "learning_rate": 0.0004413105062199778, "loss": 7.8248, "step": 958 }, { "epoch": 0.11804529788281634, "grad_norm": 0.4379274845123291, "learning_rate": 0.0004412489222810691, "loss": 7.7572, "step": 959 }, { "epoch": 0.11816838995568685, "grad_norm": 0.320591002702713, "learning_rate": 0.00044118733834216034, "loss": 7.9156, "step": 960 }, { "epoch": 0.11829148202855735, "grad_norm": 0.47437676787376404, "learning_rate": 0.0004411257544032516, "loss": 7.4291, "step": 961 }, { "epoch": 0.11841457410142787, "grad_norm": 0.4683731198310852, "learning_rate": 0.0004410641704643429, "loss": 7.6121, "step": 962 }, { "epoch": 0.11853766617429838, "grad_norm": 0.31288743019104004, "learning_rate": 0.0004410025865254342, "loss": 8.1027, "step": 963 }, { "epoch": 
0.11866075824716889, "grad_norm": 0.2601418197154999, "learning_rate": 0.00044094100258652543, "loss": 7.4781, "step": 964 }, { "epoch": 0.1187838503200394, "grad_norm": 0.3822157084941864, "learning_rate": 0.0004408794186476167, "loss": 7.5354, "step": 965 }, { "epoch": 0.1189069423929099, "grad_norm": 0.5657556653022766, "learning_rate": 0.00044081783470870795, "loss": 7.8772, "step": 966 }, { "epoch": 0.11903003446578041, "grad_norm": 0.39486029744148254, "learning_rate": 0.00044075625076979923, "loss": 7.3476, "step": 967 }, { "epoch": 0.11915312653865091, "grad_norm": 0.44939425587654114, "learning_rate": 0.00044069466683089047, "loss": 7.7583, "step": 968 }, { "epoch": 0.11927621861152142, "grad_norm": 0.26832789182662964, "learning_rate": 0.0004406330828919818, "loss": 7.6808, "step": 969 }, { "epoch": 0.11939931068439193, "grad_norm": 0.3957786560058594, "learning_rate": 0.00044057149895307304, "loss": 7.7589, "step": 970 }, { "epoch": 0.11952240275726243, "grad_norm": 0.3962816894054413, "learning_rate": 0.0004405099150141643, "loss": 7.7054, "step": 971 }, { "epoch": 0.11964549483013294, "grad_norm": 0.5807939767837524, "learning_rate": 0.00044044833107525556, "loss": 8.5136, "step": 972 }, { "epoch": 0.11976858690300345, "grad_norm": 0.4303355813026428, "learning_rate": 0.00044038674713634684, "loss": 7.3605, "step": 973 }, { "epoch": 0.11989167897587395, "grad_norm": 0.33439624309539795, "learning_rate": 0.0004403251631974381, "loss": 7.8875, "step": 974 }, { "epoch": 0.12001477104874446, "grad_norm": 0.27256372570991516, "learning_rate": 0.00044026357925852936, "loss": 7.6269, "step": 975 }, { "epoch": 0.12013786312161497, "grad_norm": 0.37750592827796936, "learning_rate": 0.00044020199531962065, "loss": 7.2713, "step": 976 }, { "epoch": 0.12026095519448547, "grad_norm": 1.1277146339416504, "learning_rate": 0.00044014041138071194, "loss": 9.5535, "step": 977 }, { "epoch": 0.12038404726735598, "grad_norm": 0.21650156378746033, "learning_rate": 
0.00044007882744180317, "loss": 7.5397, "step": 978 }, { "epoch": 0.12050713934022649, "grad_norm": 0.7778486013412476, "learning_rate": 0.00044001724350289446, "loss": 7.5415, "step": 979 }, { "epoch": 0.12063023141309699, "grad_norm": 0.33651936054229736, "learning_rate": 0.0004399556595639857, "loss": 7.4587, "step": 980 }, { "epoch": 0.1207533234859675, "grad_norm": 0.3013482391834259, "learning_rate": 0.000439894075625077, "loss": 7.5113, "step": 981 }, { "epoch": 0.120876415558838, "grad_norm": 0.42502108216285706, "learning_rate": 0.00043983249168616826, "loss": 8.1543, "step": 982 }, { "epoch": 0.12099950763170851, "grad_norm": 0.6041283011436462, "learning_rate": 0.00043977090774725955, "loss": 8.527, "step": 983 }, { "epoch": 0.12112259970457903, "grad_norm": 0.2375735193490982, "learning_rate": 0.0004397093238083508, "loss": 7.5074, "step": 984 }, { "epoch": 0.12124569177744954, "grad_norm": 0.19483423233032227, "learning_rate": 0.00043964773986944207, "loss": 7.8922, "step": 985 }, { "epoch": 0.12136878385032004, "grad_norm": 0.371509850025177, "learning_rate": 0.0004395861559305333, "loss": 7.4659, "step": 986 }, { "epoch": 0.12149187592319055, "grad_norm": 0.2392905056476593, "learning_rate": 0.0004395245719916246, "loss": 7.545, "step": 987 }, { "epoch": 0.12161496799606106, "grad_norm": 0.759840726852417, "learning_rate": 0.0004394629880527158, "loss": 8.3558, "step": 988 }, { "epoch": 0.12173806006893156, "grad_norm": 0.33480000495910645, "learning_rate": 0.00043940140411380716, "loss": 7.5431, "step": 989 }, { "epoch": 0.12186115214180207, "grad_norm": 0.3319475054740906, "learning_rate": 0.0004393398201748984, "loss": 7.4581, "step": 990 }, { "epoch": 0.12198424421467258, "grad_norm": 0.27305448055267334, "learning_rate": 0.0004392782362359897, "loss": 7.6383, "step": 991 }, { "epoch": 0.12210733628754308, "grad_norm": 0.3676181137561798, "learning_rate": 0.0004392166522970809, "loss": 7.629, "step": 992 }, { "epoch": 0.12223042836041359, 
"grad_norm": 0.21538446843624115, "learning_rate": 0.0004391550683581722, "loss": 7.6268, "step": 993 }, { "epoch": 0.1223535204332841, "grad_norm": 0.4082282483577728, "learning_rate": 0.00043909348441926343, "loss": 8.439, "step": 994 }, { "epoch": 0.1224766125061546, "grad_norm": 0.6019589304924011, "learning_rate": 0.0004390319004803547, "loss": 7.7372, "step": 995 }, { "epoch": 0.12259970457902511, "grad_norm": 0.31598255038261414, "learning_rate": 0.000438970316541446, "loss": 8.0351, "step": 996 }, { "epoch": 0.12272279665189562, "grad_norm": 0.30219027400016785, "learning_rate": 0.0004389087326025373, "loss": 7.7785, "step": 997 }, { "epoch": 0.12284588872476612, "grad_norm": 0.21373538672924042, "learning_rate": 0.0004388471486636285, "loss": 7.4897, "step": 998 }, { "epoch": 0.12296898079763663, "grad_norm": 0.26139160990715027, "learning_rate": 0.0004387855647247198, "loss": 7.8557, "step": 999 }, { "epoch": 0.12309207287050714, "grad_norm": 0.25895044207572937, "learning_rate": 0.00043872398078581104, "loss": 7.7969, "step": 1000 }, { "epoch": 0.12321516494337764, "grad_norm": 0.3325593173503876, "learning_rate": 0.0004386623968469023, "loss": 7.8764, "step": 1001 }, { "epoch": 0.12333825701624815, "grad_norm": 0.40126198530197144, "learning_rate": 0.00043860081290799356, "loss": 7.4264, "step": 1002 }, { "epoch": 0.12346134908911865, "grad_norm": 0.2620031535625458, "learning_rate": 0.0004385392289690849, "loss": 8.1359, "step": 1003 }, { "epoch": 0.12358444116198916, "grad_norm": 0.23083841800689697, "learning_rate": 0.00043847764503017613, "loss": 8.105, "step": 1004 }, { "epoch": 0.12370753323485968, "grad_norm": 0.2664806544780731, "learning_rate": 0.0004384160610912674, "loss": 7.6173, "step": 1005 }, { "epoch": 0.12383062530773019, "grad_norm": 0.3474976420402527, "learning_rate": 0.00043835447715235865, "loss": 8.0505, "step": 1006 }, { "epoch": 0.1239537173806007, "grad_norm": 0.22864046692848206, "learning_rate": 0.00043829289321344994, 
"loss": 7.5955, "step": 1007 }, { "epoch": 0.1240768094534712, "grad_norm": 0.21676971018314362, "learning_rate": 0.00043823130927454117, "loss": 7.944, "step": 1008 }, { "epoch": 0.12419990152634171, "grad_norm": 0.32742053270339966, "learning_rate": 0.0004381697253356325, "loss": 7.5482, "step": 1009 }, { "epoch": 0.12432299359921221, "grad_norm": 0.3370482921600342, "learning_rate": 0.00043810814139672374, "loss": 7.6076, "step": 1010 }, { "epoch": 0.12444608567208272, "grad_norm": 0.3023282587528229, "learning_rate": 0.00043804655745781503, "loss": 7.428, "step": 1011 }, { "epoch": 0.12456917774495323, "grad_norm": 0.2619742155075073, "learning_rate": 0.00043798497351890626, "loss": 7.7617, "step": 1012 }, { "epoch": 0.12469226981782373, "grad_norm": 0.23673497140407562, "learning_rate": 0.00043792338957999755, "loss": 7.4379, "step": 1013 }, { "epoch": 0.12481536189069424, "grad_norm": 0.16787441074848175, "learning_rate": 0.0004378618056410888, "loss": 7.5846, "step": 1014 }, { "epoch": 0.12493845396356475, "grad_norm": 0.26481878757476807, "learning_rate": 0.00043780022170218007, "loss": 7.375, "step": 1015 }, { "epoch": 0.12506154603643527, "grad_norm": 0.18562868237495422, "learning_rate": 0.00043773863776327135, "loss": 7.516, "step": 1016 }, { "epoch": 0.12518463810930577, "grad_norm": 0.3816431164741516, "learning_rate": 0.00043767705382436264, "loss": 8.3064, "step": 1017 }, { "epoch": 0.12530773018217628, "grad_norm": 0.18496286869049072, "learning_rate": 0.0004376154698854539, "loss": 7.5346, "step": 1018 }, { "epoch": 0.1254308222550468, "grad_norm": 0.256944864988327, "learning_rate": 0.00043755388594654516, "loss": 7.354, "step": 1019 }, { "epoch": 0.1255539143279173, "grad_norm": 0.20327870547771454, "learning_rate": 0.0004374923020076364, "loss": 7.3851, "step": 1020 }, { "epoch": 0.1256770064007878, "grad_norm": 0.22396694123744965, "learning_rate": 0.0004374307180687277, "loss": 7.5049, "step": 1021 }, { "epoch": 0.1258000984736583, 
"grad_norm": 0.6732802391052246, "learning_rate": 0.0004373691341298189, "loss": 8.6312, "step": 1022 }, { "epoch": 0.1259231905465288, "grad_norm": 0.3362504541873932, "learning_rate": 0.00043730755019091025, "loss": 8.0323, "step": 1023 }, { "epoch": 0.12604628261939932, "grad_norm": 0.6310123205184937, "learning_rate": 0.0004372459662520015, "loss": 7.3201, "step": 1024 }, { "epoch": 0.12616937469226983, "grad_norm": 0.4363902807235718, "learning_rate": 0.00043718438231309277, "loss": 7.5548, "step": 1025 }, { "epoch": 0.12629246676514033, "grad_norm": 0.40726518630981445, "learning_rate": 0.000437122798374184, "loss": 7.8537, "step": 1026 }, { "epoch": 0.12641555883801084, "grad_norm": 0.2905251085758209, "learning_rate": 0.0004370612144352753, "loss": 7.4944, "step": 1027 }, { "epoch": 0.12653865091088135, "grad_norm": 0.4798360764980316, "learning_rate": 0.0004369996304963665, "loss": 7.6112, "step": 1028 }, { "epoch": 0.12666174298375185, "grad_norm": 0.5053649544715881, "learning_rate": 0.0004369380465574578, "loss": 7.4268, "step": 1029 }, { "epoch": 0.12678483505662236, "grad_norm": 0.707440197467804, "learning_rate": 0.0004368764626185491, "loss": 8.2712, "step": 1030 }, { "epoch": 0.12690792712949286, "grad_norm": 0.35442084074020386, "learning_rate": 0.0004368148786796404, "loss": 7.5831, "step": 1031 }, { "epoch": 0.12703101920236337, "grad_norm": 0.3623262345790863, "learning_rate": 0.0004367532947407316, "loss": 7.911, "step": 1032 }, { "epoch": 0.12715411127523388, "grad_norm": 0.38225266337394714, "learning_rate": 0.0004366917108018229, "loss": 7.3449, "step": 1033 }, { "epoch": 0.12727720334810438, "grad_norm": 0.5896698832511902, "learning_rate": 0.00043663012686291413, "loss": 7.5138, "step": 1034 }, { "epoch": 0.1274002954209749, "grad_norm": 0.45828893780708313, "learning_rate": 0.0004365685429240054, "loss": 7.5144, "step": 1035 }, { "epoch": 0.1275233874938454, "grad_norm": 0.32318028807640076, "learning_rate": 0.0004365069589850967, 
"loss": 7.5818, "step": 1036 }, { "epoch": 0.1276464795667159, "grad_norm": 0.31038329005241394, "learning_rate": 0.000436445375046188, "loss": 7.3873, "step": 1037 }, { "epoch": 0.1277695716395864, "grad_norm": 1.2449700832366943, "learning_rate": 0.0004363837911072792, "loss": 9.5023, "step": 1038 }, { "epoch": 0.12789266371245692, "grad_norm": 0.4684135615825653, "learning_rate": 0.0004363222071683705, "loss": 7.6487, "step": 1039 }, { "epoch": 0.12801575578532742, "grad_norm": 0.29058223962783813, "learning_rate": 0.00043626062322946175, "loss": 7.3631, "step": 1040 }, { "epoch": 0.12813884785819793, "grad_norm": 0.17569656670093536, "learning_rate": 0.00043619903929055303, "loss": 7.7446, "step": 1041 }, { "epoch": 0.12826193993106844, "grad_norm": 0.4784836173057556, "learning_rate": 0.00043613745535164426, "loss": 7.3977, "step": 1042 }, { "epoch": 0.12838503200393894, "grad_norm": 0.5599404573440552, "learning_rate": 0.0004360758714127356, "loss": 7.4867, "step": 1043 }, { "epoch": 0.12850812407680945, "grad_norm": 0.22246553003787994, "learning_rate": 0.00043601428747382684, "loss": 7.7663, "step": 1044 }, { "epoch": 0.12863121614967996, "grad_norm": 0.18720121681690216, "learning_rate": 0.0004359527035349181, "loss": 7.6307, "step": 1045 }, { "epoch": 0.12875430822255046, "grad_norm": 0.2745111286640167, "learning_rate": 0.00043589111959600936, "loss": 7.2506, "step": 1046 }, { "epoch": 0.12887740029542097, "grad_norm": 0.42826589941978455, "learning_rate": 0.00043582953565710064, "loss": 7.4348, "step": 1047 }, { "epoch": 0.12900049236829148, "grad_norm": 0.6244133114814758, "learning_rate": 0.0004357679517181919, "loss": 7.601, "step": 1048 }, { "epoch": 0.12912358444116198, "grad_norm": 0.6917316317558289, "learning_rate": 0.00043570636777928316, "loss": 7.8557, "step": 1049 }, { "epoch": 0.1292466765140325, "grad_norm": 0.6295480728149414, "learning_rate": 0.00043564478384037445, "loss": 8.4243, "step": 1050 }, { "epoch": 0.129369768586903, 
"grad_norm": 0.3153901696205139, "learning_rate": 0.00043558319990146574, "loss": 8.149, "step": 1051 }, { "epoch": 0.1294928606597735, "grad_norm": 0.44313323497772217, "learning_rate": 0.00043552161596255697, "loss": 7.4579, "step": 1052 }, { "epoch": 0.129615952732644, "grad_norm": 0.5393359661102295, "learning_rate": 0.00043546003202364825, "loss": 7.6489, "step": 1053 }, { "epoch": 0.12973904480551451, "grad_norm": 0.4356921315193176, "learning_rate": 0.0004353984480847395, "loss": 7.6549, "step": 1054 }, { "epoch": 0.12986213687838502, "grad_norm": 0.19918233156204224, "learning_rate": 0.0004353368641458308, "loss": 7.9401, "step": 1055 }, { "epoch": 0.12998522895125553, "grad_norm": 0.36078643798828125, "learning_rate": 0.000435275280206922, "loss": 8.2861, "step": 1056 }, { "epoch": 0.13010832102412603, "grad_norm": 0.3222188353538513, "learning_rate": 0.00043521369626801335, "loss": 7.4362, "step": 1057 }, { "epoch": 0.13023141309699654, "grad_norm": 0.3767267167568207, "learning_rate": 0.0004351521123291046, "loss": 7.9442, "step": 1058 }, { "epoch": 0.13035450516986707, "grad_norm": 0.34959912300109863, "learning_rate": 0.00043509052839019587, "loss": 7.6967, "step": 1059 }, { "epoch": 0.13047759724273758, "grad_norm": 0.3330856263637543, "learning_rate": 0.0004350289444512871, "loss": 7.7874, "step": 1060 }, { "epoch": 0.1306006893156081, "grad_norm": 0.2918563783168793, "learning_rate": 0.0004349673605123784, "loss": 8.0673, "step": 1061 }, { "epoch": 0.1307237813884786, "grad_norm": 0.5932014584541321, "learning_rate": 0.0004349057765734696, "loss": 7.2871, "step": 1062 }, { "epoch": 0.1308468734613491, "grad_norm": 0.5256215929985046, "learning_rate": 0.00043484419263456096, "loss": 7.6006, "step": 1063 }, { "epoch": 0.1309699655342196, "grad_norm": 0.3243432641029358, "learning_rate": 0.0004347826086956522, "loss": 7.9028, "step": 1064 }, { "epoch": 0.1310930576070901, "grad_norm": 0.23635753989219666, "learning_rate": 0.0004347210247567435, "loss": 
7.3174, "step": 1065 }, { "epoch": 0.13121614967996062, "grad_norm": 0.542179524898529, "learning_rate": 0.0004346594408178347, "loss": 7.8675, "step": 1066 }, { "epoch": 0.13133924175283113, "grad_norm": 0.4172567129135132, "learning_rate": 0.000434597856878926, "loss": 7.3905, "step": 1067 }, { "epoch": 0.13146233382570163, "grad_norm": 0.43983203172683716, "learning_rate": 0.00043453627294001723, "loss": 7.5836, "step": 1068 }, { "epoch": 0.13158542589857214, "grad_norm": 0.23258867859840393, "learning_rate": 0.0004344746890011085, "loss": 7.8024, "step": 1069 }, { "epoch": 0.13170851797144265, "grad_norm": 0.3653273582458496, "learning_rate": 0.0004344131050621998, "loss": 7.7634, "step": 1070 }, { "epoch": 0.13183161004431315, "grad_norm": 0.3890158236026764, "learning_rate": 0.0004343515211232911, "loss": 7.5991, "step": 1071 }, { "epoch": 0.13195470211718366, "grad_norm": 0.5049852132797241, "learning_rate": 0.0004342899371843823, "loss": 7.7481, "step": 1072 }, { "epoch": 0.13207779419005417, "grad_norm": 0.2284635454416275, "learning_rate": 0.0004342283532454736, "loss": 7.6419, "step": 1073 }, { "epoch": 0.13220088626292467, "grad_norm": 0.35195910930633545, "learning_rate": 0.00043416676930656484, "loss": 7.1398, "step": 1074 }, { "epoch": 0.13232397833579518, "grad_norm": 0.46398329734802246, "learning_rate": 0.0004341051853676561, "loss": 7.8448, "step": 1075 }, { "epoch": 0.13244707040866568, "grad_norm": 0.752812385559082, "learning_rate": 0.00043404360142874736, "loss": 7.7808, "step": 1076 }, { "epoch": 0.1325701624815362, "grad_norm": 0.6575344800949097, "learning_rate": 0.0004339820174898387, "loss": 7.5141, "step": 1077 }, { "epoch": 0.1326932545544067, "grad_norm": 0.6524214148521423, "learning_rate": 0.00043392043355092993, "loss": 7.8934, "step": 1078 }, { "epoch": 0.1328163466272772, "grad_norm": 0.3740961253643036, "learning_rate": 0.0004338588496120212, "loss": 7.658, "step": 1079 }, { "epoch": 0.1329394387001477, "grad_norm": 
0.3446483910083771, "learning_rate": 0.00043379726567311245, "loss": 7.7407, "step": 1080 }, { "epoch": 0.13306253077301822, "grad_norm": 0.43493327498435974, "learning_rate": 0.00043373568173420374, "loss": 7.5315, "step": 1081 }, { "epoch": 0.13318562284588872, "grad_norm": 0.6747305989265442, "learning_rate": 0.00043367409779529497, "loss": 7.2844, "step": 1082 }, { "epoch": 0.13330871491875923, "grad_norm": 0.22653591632843018, "learning_rate": 0.00043361251385638626, "loss": 7.8187, "step": 1083 }, { "epoch": 0.13343180699162974, "grad_norm": 0.21464203298091888, "learning_rate": 0.00043355092991747754, "loss": 7.3754, "step": 1084 }, { "epoch": 0.13355489906450024, "grad_norm": 1.019890308380127, "learning_rate": 0.00043348934597856883, "loss": 9.0181, "step": 1085 }, { "epoch": 0.13367799113737075, "grad_norm": 0.7712394595146179, "learning_rate": 0.00043342776203966006, "loss": 7.9375, "step": 1086 }, { "epoch": 0.13380108321024126, "grad_norm": 0.5911339521408081, "learning_rate": 0.00043336617810075135, "loss": 7.6466, "step": 1087 }, { "epoch": 0.13392417528311176, "grad_norm": 0.6278889179229736, "learning_rate": 0.0004333045941618426, "loss": 7.9521, "step": 1088 }, { "epoch": 0.13404726735598227, "grad_norm": 0.4269164800643921, "learning_rate": 0.00043324301022293387, "loss": 7.9297, "step": 1089 }, { "epoch": 0.13417035942885278, "grad_norm": 0.6335381269454956, "learning_rate": 0.0004331814262840251, "loss": 7.5208, "step": 1090 }, { "epoch": 0.13429345150172328, "grad_norm": 0.6528658866882324, "learning_rate": 0.00043311984234511644, "loss": 8.179, "step": 1091 }, { "epoch": 0.1344165435745938, "grad_norm": 0.5928058624267578, "learning_rate": 0.00043305825840620767, "loss": 7.6276, "step": 1092 }, { "epoch": 0.1345396356474643, "grad_norm": 0.5730672478675842, "learning_rate": 0.00043299667446729896, "loss": 7.474, "step": 1093 }, { "epoch": 0.1346627277203348, "grad_norm": 0.3885171413421631, "learning_rate": 0.0004329350905283902, "loss": 
7.4691, "step": 1094 }, { "epoch": 0.1347858197932053, "grad_norm": 0.5064272880554199, "learning_rate": 0.0004328735065894815, "loss": 7.7547, "step": 1095 }, { "epoch": 0.13490891186607581, "grad_norm": 0.5396714806556702, "learning_rate": 0.0004328119226505727, "loss": 7.5731, "step": 1096 }, { "epoch": 0.13503200393894632, "grad_norm": 0.6501994132995605, "learning_rate": 0.00043275033871166405, "loss": 7.6742, "step": 1097 }, { "epoch": 0.13515509601181683, "grad_norm": 0.4133816957473755, "learning_rate": 0.0004326887547727553, "loss": 7.8958, "step": 1098 }, { "epoch": 0.13527818808468733, "grad_norm": 0.3168531358242035, "learning_rate": 0.00043262717083384657, "loss": 8.1136, "step": 1099 }, { "epoch": 0.13540128015755784, "grad_norm": 0.49173682928085327, "learning_rate": 0.0004325655868949378, "loss": 7.4793, "step": 1100 }, { "epoch": 0.13552437223042835, "grad_norm": 0.774402916431427, "learning_rate": 0.0004325040029560291, "loss": 7.6247, "step": 1101 }, { "epoch": 0.13564746430329888, "grad_norm": 0.6633387207984924, "learning_rate": 0.0004324424190171203, "loss": 7.4885, "step": 1102 }, { "epoch": 0.1357705563761694, "grad_norm": 0.3587431311607361, "learning_rate": 0.0004323808350782116, "loss": 8.4028, "step": 1103 }, { "epoch": 0.1358936484490399, "grad_norm": 0.17308488488197327, "learning_rate": 0.0004323192511393029, "loss": 7.8696, "step": 1104 }, { "epoch": 0.1360167405219104, "grad_norm": 0.3091298043727875, "learning_rate": 0.0004322576672003942, "loss": 7.8008, "step": 1105 }, { "epoch": 0.1361398325947809, "grad_norm": 0.8517840504646301, "learning_rate": 0.0004321960832614854, "loss": 8.957, "step": 1106 }, { "epoch": 0.1362629246676514, "grad_norm": 0.3040328323841095, "learning_rate": 0.0004321344993225767, "loss": 7.8901, "step": 1107 }, { "epoch": 0.13638601674052192, "grad_norm": 0.2202225774526596, "learning_rate": 0.00043207291538366793, "loss": 7.5549, "step": 1108 }, { "epoch": 0.13650910881339243, "grad_norm": 
0.24515004456043243, "learning_rate": 0.0004320113314447592, "loss": 7.5185, "step": 1109 }, { "epoch": 0.13663220088626293, "grad_norm": 0.3757321238517761, "learning_rate": 0.00043194974750585045, "loss": 7.6555, "step": 1110 }, { "epoch": 0.13675529295913344, "grad_norm": 0.3457186222076416, "learning_rate": 0.0004318881635669418, "loss": 7.4687, "step": 1111 }, { "epoch": 0.13687838503200395, "grad_norm": 0.21879664063453674, "learning_rate": 0.000431826579628033, "loss": 7.6097, "step": 1112 }, { "epoch": 0.13700147710487445, "grad_norm": 0.2518247663974762, "learning_rate": 0.0004317649956891243, "loss": 7.5294, "step": 1113 }, { "epoch": 0.13712456917774496, "grad_norm": 0.2509412169456482, "learning_rate": 0.00043170341175021554, "loss": 7.3661, "step": 1114 }, { "epoch": 0.13724766125061547, "grad_norm": 0.3680810332298279, "learning_rate": 0.00043164182781130683, "loss": 7.4977, "step": 1115 }, { "epoch": 0.13737075332348597, "grad_norm": 0.2693149447441101, "learning_rate": 0.00043158024387239806, "loss": 7.5247, "step": 1116 }, { "epoch": 0.13749384539635648, "grad_norm": 0.8541855812072754, "learning_rate": 0.0004315186599334894, "loss": 9.4824, "step": 1117 }, { "epoch": 0.13761693746922699, "grad_norm": 0.3841574192047119, "learning_rate": 0.00043145707599458064, "loss": 7.6328, "step": 1118 }, { "epoch": 0.1377400295420975, "grad_norm": 0.4807462990283966, "learning_rate": 0.0004313954920556719, "loss": 7.6767, "step": 1119 }, { "epoch": 0.137863121614968, "grad_norm": 0.6657018065452576, "learning_rate": 0.00043133390811676316, "loss": 7.7456, "step": 1120 }, { "epoch": 0.1379862136878385, "grad_norm": 0.38497379422187805, "learning_rate": 0.00043127232417785444, "loss": 7.5099, "step": 1121 }, { "epoch": 0.138109305760709, "grad_norm": 0.3483254313468933, "learning_rate": 0.0004312107402389457, "loss": 7.8679, "step": 1122 }, { "epoch": 0.13823239783357952, "grad_norm": 0.35210615396499634, "learning_rate": 0.00043114915630003696, "loss": 7.4985, 
"step": 1123 }, { "epoch": 0.13835548990645002, "grad_norm": 0.645313024520874, "learning_rate": 0.00043108757236112825, "loss": 7.6841, "step": 1124 }, { "epoch": 0.13847858197932053, "grad_norm": 0.515137255191803, "learning_rate": 0.00043102598842221953, "loss": 7.7996, "step": 1125 }, { "epoch": 0.13860167405219104, "grad_norm": 0.4362637996673584, "learning_rate": 0.00043096440448331077, "loss": 7.8593, "step": 1126 }, { "epoch": 0.13872476612506154, "grad_norm": 0.32644543051719666, "learning_rate": 0.00043090282054440205, "loss": 7.4099, "step": 1127 }, { "epoch": 0.13884785819793205, "grad_norm": 0.38090336322784424, "learning_rate": 0.0004308412366054933, "loss": 7.6383, "step": 1128 }, { "epoch": 0.13897095027080256, "grad_norm": 0.2520347833633423, "learning_rate": 0.00043077965266658457, "loss": 8.0764, "step": 1129 }, { "epoch": 0.13909404234367306, "grad_norm": 0.2601851224899292, "learning_rate": 0.0004307180687276758, "loss": 7.7508, "step": 1130 }, { "epoch": 0.13921713441654357, "grad_norm": 1.329053521156311, "learning_rate": 0.00043065648478876715, "loss": 10.1877, "step": 1131 }, { "epoch": 0.13934022648941408, "grad_norm": 0.17841708660125732, "learning_rate": 0.0004305949008498584, "loss": 7.6139, "step": 1132 }, { "epoch": 0.13946331856228458, "grad_norm": 0.3515723645687103, "learning_rate": 0.00043053331691094966, "loss": 8.0658, "step": 1133 }, { "epoch": 0.1395864106351551, "grad_norm": 0.17948248982429504, "learning_rate": 0.0004304717329720409, "loss": 8.0576, "step": 1134 }, { "epoch": 0.1397095027080256, "grad_norm": 0.22266703844070435, "learning_rate": 0.0004304101490331322, "loss": 7.9656, "step": 1135 }, { "epoch": 0.1398325947808961, "grad_norm": 0.42295023798942566, "learning_rate": 0.0004303485650942234, "loss": 7.539, "step": 1136 }, { "epoch": 0.1399556868537666, "grad_norm": 0.3937053978443146, "learning_rate": 0.0004302869811553147, "loss": 7.3907, "step": 1137 }, { "epoch": 0.14007877892663712, "grad_norm": 
0.3883720636367798, "learning_rate": 0.000430225397216406, "loss": 8.6956, "step": 1138 }, { "epoch": 0.14020187099950762, "grad_norm": 0.49305567145347595, "learning_rate": 0.0004301638132774973, "loss": 8.2946, "step": 1139 }, { "epoch": 0.14032496307237813, "grad_norm": 0.2542566955089569, "learning_rate": 0.0004301022293385885, "loss": 7.469, "step": 1140 }, { "epoch": 0.14044805514524863, "grad_norm": 0.22791799902915955, "learning_rate": 0.0004300406453996798, "loss": 7.3845, "step": 1141 }, { "epoch": 0.14057114721811914, "grad_norm": 0.3672398030757904, "learning_rate": 0.000429979061460771, "loss": 7.7183, "step": 1142 }, { "epoch": 0.14069423929098965, "grad_norm": 0.189431831240654, "learning_rate": 0.0004299174775218623, "loss": 7.4818, "step": 1143 }, { "epoch": 0.14081733136386015, "grad_norm": 0.39960381388664246, "learning_rate": 0.00042985589358295355, "loss": 7.8484, "step": 1144 }, { "epoch": 0.14094042343673066, "grad_norm": 0.28188198804855347, "learning_rate": 0.0004297943096440449, "loss": 7.8085, "step": 1145 }, { "epoch": 0.1410635155096012, "grad_norm": 0.40829789638519287, "learning_rate": 0.0004297327257051361, "loss": 8.5487, "step": 1146 }, { "epoch": 0.1411866075824717, "grad_norm": 0.22656631469726562, "learning_rate": 0.0004296711417662274, "loss": 7.5401, "step": 1147 }, { "epoch": 0.1413096996553422, "grad_norm": 0.27487608790397644, "learning_rate": 0.00042960955782731864, "loss": 7.7539, "step": 1148 }, { "epoch": 0.14143279172821271, "grad_norm": 0.20521917939186096, "learning_rate": 0.0004295479738884099, "loss": 7.7346, "step": 1149 }, { "epoch": 0.14155588380108322, "grad_norm": 0.2180599868297577, "learning_rate": 0.00042948638994950116, "loss": 7.5696, "step": 1150 }, { "epoch": 0.14167897587395373, "grad_norm": 0.20427361130714417, "learning_rate": 0.0004294248060105925, "loss": 7.4414, "step": 1151 }, { "epoch": 0.14180206794682423, "grad_norm": 0.3342544138431549, "learning_rate": 0.00042936322207168373, "loss": 7.9213, 
"step": 1152 }, { "epoch": 0.14192516001969474, "grad_norm": 0.14960439503192902, "learning_rate": 0.000429301638132775, "loss": 7.4288, "step": 1153 }, { "epoch": 0.14204825209256525, "grad_norm": 0.567064642906189, "learning_rate": 0.00042924005419386625, "loss": 8.8842, "step": 1154 }, { "epoch": 0.14217134416543575, "grad_norm": 0.397429496049881, "learning_rate": 0.00042917847025495754, "loss": 7.843, "step": 1155 }, { "epoch": 0.14229443623830626, "grad_norm": 0.4446791708469391, "learning_rate": 0.00042911688631604877, "loss": 7.5821, "step": 1156 }, { "epoch": 0.14241752831117677, "grad_norm": 0.2789526581764221, "learning_rate": 0.00042905530237714005, "loss": 8.1722, "step": 1157 }, { "epoch": 0.14254062038404727, "grad_norm": 0.39570116996765137, "learning_rate": 0.00042899371843823134, "loss": 7.455, "step": 1158 }, { "epoch": 0.14266371245691778, "grad_norm": 0.32187986373901367, "learning_rate": 0.00042893213449932263, "loss": 7.8824, "step": 1159 }, { "epoch": 0.14278680452978829, "grad_norm": 0.9120309948921204, "learning_rate": 0.00042887055056041386, "loss": 9.424, "step": 1160 }, { "epoch": 0.1429098966026588, "grad_norm": 0.3329271674156189, "learning_rate": 0.00042880896662150515, "loss": 7.4967, "step": 1161 }, { "epoch": 0.1430329886755293, "grad_norm": 0.2605212926864624, "learning_rate": 0.0004287473826825964, "loss": 7.5076, "step": 1162 }, { "epoch": 0.1431560807483998, "grad_norm": 0.2906285226345062, "learning_rate": 0.00042868579874368767, "loss": 7.4939, "step": 1163 }, { "epoch": 0.1432791728212703, "grad_norm": 0.31972506642341614, "learning_rate": 0.0004286242148047789, "loss": 7.8859, "step": 1164 }, { "epoch": 0.14340226489414082, "grad_norm": 0.32657086849212646, "learning_rate": 0.0004285626308658702, "loss": 7.7241, "step": 1165 }, { "epoch": 0.14352535696701132, "grad_norm": 0.23381227254867554, "learning_rate": 0.00042850104692696147, "loss": 7.4626, "step": 1166 }, { "epoch": 0.14364844903988183, "grad_norm": 
0.2319536805152893, "learning_rate": 0.0004284394629880527, "loss": 7.7861, "step": 1167 }, { "epoch": 0.14377154111275234, "grad_norm": 0.26847347617149353, "learning_rate": 0.000428377879049144, "loss": 7.5098, "step": 1168 }, { "epoch": 0.14389463318562284, "grad_norm": 0.2960135042667389, "learning_rate": 0.0004283162951102352, "loss": 8.1217, "step": 1169 }, { "epoch": 0.14401772525849335, "grad_norm": 0.3366859257221222, "learning_rate": 0.0004282547111713265, "loss": 7.5972, "step": 1170 }, { "epoch": 0.14414081733136386, "grad_norm": 0.2810179591178894, "learning_rate": 0.00042819312723241774, "loss": 7.7108, "step": 1171 }, { "epoch": 0.14426390940423436, "grad_norm": 0.4520522356033325, "learning_rate": 0.0004281315432935091, "loss": 8.7218, "step": 1172 }, { "epoch": 0.14438700147710487, "grad_norm": 0.1588023155927658, "learning_rate": 0.0004280699593546003, "loss": 7.7143, "step": 1173 }, { "epoch": 0.14451009354997538, "grad_norm": 0.3624514639377594, "learning_rate": 0.0004280083754156916, "loss": 8.1646, "step": 1174 }, { "epoch": 0.14463318562284588, "grad_norm": 0.40381932258605957, "learning_rate": 0.00042794679147678283, "loss": 7.3624, "step": 1175 }, { "epoch": 0.1447562776957164, "grad_norm": 0.454972505569458, "learning_rate": 0.0004278852075378741, "loss": 8.5832, "step": 1176 }, { "epoch": 0.1448793697685869, "grad_norm": 0.39789795875549316, "learning_rate": 0.00042782362359896535, "loss": 8.4494, "step": 1177 }, { "epoch": 0.1450024618414574, "grad_norm": 0.4122267961502075, "learning_rate": 0.0004277620396600567, "loss": 8.2462, "step": 1178 }, { "epoch": 0.1451255539143279, "grad_norm": 0.4028611183166504, "learning_rate": 0.0004277004557211479, "loss": 7.5699, "step": 1179 }, { "epoch": 0.14524864598719842, "grad_norm": 0.39228755235671997, "learning_rate": 0.0004276388717822392, "loss": 8.5009, "step": 1180 }, { "epoch": 0.14537173806006892, "grad_norm": 0.28448671102523804, "learning_rate": 0.00042757728784333045, "loss": 8.4288, 
"step": 1181 }, { "epoch": 0.14549483013293943, "grad_norm": 0.4962462782859802, "learning_rate": 0.00042751570390442173, "loss": 7.4144, "step": 1182 }, { "epoch": 0.14561792220580994, "grad_norm": 0.32040488719940186, "learning_rate": 0.00042745411996551296, "loss": 7.9913, "step": 1183 }, { "epoch": 0.14574101427868044, "grad_norm": 0.47033801674842834, "learning_rate": 0.00042739253602660425, "loss": 7.5722, "step": 1184 }, { "epoch": 0.14586410635155095, "grad_norm": 0.4147973656654358, "learning_rate": 0.00042733095208769554, "loss": 7.9484, "step": 1185 }, { "epoch": 0.14598719842442145, "grad_norm": 0.23029370605945587, "learning_rate": 0.0004272693681487868, "loss": 7.5468, "step": 1186 }, { "epoch": 0.14611029049729196, "grad_norm": 0.31682637333869934, "learning_rate": 0.00042720778420987806, "loss": 8.1098, "step": 1187 }, { "epoch": 0.14623338257016247, "grad_norm": 0.5573036074638367, "learning_rate": 0.00042714620027096934, "loss": 7.5271, "step": 1188 }, { "epoch": 0.146356474643033, "grad_norm": 0.1897362619638443, "learning_rate": 0.0004270846163320606, "loss": 8.1285, "step": 1189 }, { "epoch": 0.1464795667159035, "grad_norm": 0.5581684112548828, "learning_rate": 0.00042702303239315186, "loss": 8.868, "step": 1190 }, { "epoch": 0.14660265878877402, "grad_norm": 0.42389625310897827, "learning_rate": 0.0004269614484542431, "loss": 7.5451, "step": 1191 }, { "epoch": 0.14672575086164452, "grad_norm": 0.3705877959728241, "learning_rate": 0.00042689986451533444, "loss": 7.5746, "step": 1192 }, { "epoch": 0.14684884293451503, "grad_norm": 0.31879723072052, "learning_rate": 0.00042683828057642567, "loss": 7.6164, "step": 1193 }, { "epoch": 0.14697193500738553, "grad_norm": null, "learning_rate": 0.00042677669663751695, "loss": 10.7951, "step": 1194 }, { "epoch": 0.14709502708025604, "grad_norm": 0.30904901027679443, "learning_rate": 0.0004267151126986082, "loss": 7.6708, "step": 1195 }, { "epoch": 0.14721811915312655, "grad_norm": 1.3813018798828125,
"learning_rate": 0.0004266535287596995, "loss": 9.2141, "step": 1196 }, { "epoch": 0.14734121122599705, "grad_norm": 0.5787645578384399, "learning_rate": 0.0004265919448207907, "loss": 8.1793, "step": 1197 }, { "epoch": 0.14746430329886756, "grad_norm": 2.886486768722534, "learning_rate": 0.000426530360881882, "loss": 8.5108, "step": 1198 }, { "epoch": 0.14758739537173807, "grad_norm": 2.1628940105438232, "learning_rate": 0.0004264687769429733, "loss": 8.8212, "step": 1199 }, { "epoch": 0.14771048744460857, "grad_norm": 1.030109167098999, "learning_rate": 0.00042640719300406457, "loss": 8.4152, "step": 1200 }, { "epoch": 0.14783357951747908, "grad_norm": 1.6400182247161865, "learning_rate": 0.0004263456090651558, "loss": 8.0356, "step": 1201 }, { "epoch": 0.1479566715903496, "grad_norm": 1.8061259984970093, "learning_rate": 0.0004262840251262471, "loss": 8.1551, "step": 1202 }, { "epoch": 0.1480797636632201, "grad_norm": 1.25200355052948, "learning_rate": 0.0004262224411873383, "loss": 8.1945, "step": 1203 }, { "epoch": 0.1482028557360906, "grad_norm": 0.4163260757923126, "learning_rate": 0.0004261608572484296, "loss": 7.7571, "step": 1204 }, { "epoch": 0.1483259478089611, "grad_norm": 1.314691424369812, "learning_rate": 0.00042609927330952084, "loss": 8.1285, "step": 1205 }, { "epoch": 0.1484490398818316, "grad_norm": 0.7824684381484985, "learning_rate": 0.0004260376893706122, "loss": 7.7925, "step": 1206 }, { "epoch": 0.14857213195470212, "grad_norm": 0.6471862196922302, "learning_rate": 0.0004259761054317034, "loss": 7.5877, "step": 1207 }, { "epoch": 0.14869522402757263, "grad_norm": 0.7285593748092651, "learning_rate": 0.0004259145214927947, "loss": 7.6144, "step": 1208 }, { "epoch": 0.14881831610044313, "grad_norm": 0.3690866529941559, "learning_rate": 0.00042585293755388593, "loss": 7.6209, "step": 1209 }, { "epoch": 0.14894140817331364, "grad_norm": 0.5600637793540955, "learning_rate": 0.0004257913536149772, "loss": 8.1635, "step": 1210 }, { "epoch": 
0.14906450024618415, "grad_norm": 0.3678147792816162, "learning_rate": 0.00042572976967606845, "loss": 7.4611, "step": 1211 }, { "epoch": 0.14918759231905465, "grad_norm": 0.6411478519439697, "learning_rate": 0.0004256681857371598, "loss": 8.6695, "step": 1212 }, { "epoch": 0.14931068439192516, "grad_norm": 0.3964221775531769, "learning_rate": 0.000425606601798251, "loss": 7.5604, "step": 1213 }, { "epoch": 0.14943377646479566, "grad_norm": 0.26683786511421204, "learning_rate": 0.0004255450178593423, "loss": 7.462, "step": 1214 }, { "epoch": 0.14955686853766617, "grad_norm": 0.2858467996120453, "learning_rate": 0.00042548343392043354, "loss": 7.4611, "step": 1215 }, { "epoch": 0.14967996061053668, "grad_norm": 0.3794986605644226, "learning_rate": 0.0004254218499815248, "loss": 7.4805, "step": 1216 }, { "epoch": 0.14980305268340718, "grad_norm": 0.3536168038845062, "learning_rate": 0.00042536026604261606, "loss": 8.0612, "step": 1217 }, { "epoch": 0.1499261447562777, "grad_norm": 0.5410565137863159, "learning_rate": 0.00042529868210370734, "loss": 7.5619, "step": 1218 }, { "epoch": 0.1500492368291482, "grad_norm": 0.9432839155197144, "learning_rate": 0.00042523709816479863, "loss": 9.2595, "step": 1219 }, { "epoch": 0.1501723289020187, "grad_norm": 0.3294771909713745, "learning_rate": 0.0004251755142258899, "loss": 8.2445, "step": 1220 }, { "epoch": 0.1502954209748892, "grad_norm": 1.3696547746658325, "learning_rate": 0.00042511393028698115, "loss": 8.2353, "step": 1221 }, { "epoch": 0.15041851304775972, "grad_norm": 0.4940473139286041, "learning_rate": 0.00042505234634807244, "loss": 7.9501, "step": 1222 }, { "epoch": 0.15054160512063022, "grad_norm": 0.44160592555999756, "learning_rate": 0.00042499076240916367, "loss": 7.8008, "step": 1223 }, { "epoch": 0.15066469719350073, "grad_norm": 0.6653614640235901, "learning_rate": 0.00042492917847025496, "loss": 7.5975, "step": 1224 }, { "epoch": 0.15078778926637124, "grad_norm": 0.43134158849716187, "learning_rate": 
0.0004248675945313462, "loss": 8.0433, "step": 1225 }, { "epoch": 0.15091088133924174, "grad_norm": 0.4229229986667633, "learning_rate": 0.00042480601059243753, "loss": 7.653, "step": 1226 }, { "epoch": 0.15103397341211225, "grad_norm": 0.243229478597641, "learning_rate": 0.00042474442665352876, "loss": 7.9436, "step": 1227 }, { "epoch": 0.15115706548498276, "grad_norm": 0.5505337715148926, "learning_rate": 0.00042468284271462005, "loss": 8.2797, "step": 1228 }, { "epoch": 0.15128015755785326, "grad_norm": 0.3020777404308319, "learning_rate": 0.0004246212587757113, "loss": 7.6761, "step": 1229 }, { "epoch": 0.15140324963072377, "grad_norm": 0.19728484749794006, "learning_rate": 0.00042455967483680257, "loss": 7.9791, "step": 1230 }, { "epoch": 0.15152634170359428, "grad_norm": 0.49667829275131226, "learning_rate": 0.0004244980908978938, "loss": 7.3983, "step": 1231 }, { "epoch": 0.1516494337764648, "grad_norm": 0.30677440762519836, "learning_rate": 0.00042443650695898514, "loss": 8.2272, "step": 1232 }, { "epoch": 0.15177252584933532, "grad_norm": 0.2908315658569336, "learning_rate": 0.00042437492302007637, "loss": 7.7373, "step": 1233 }, { "epoch": 0.15189561792220582, "grad_norm": 0.4859324097633362, "learning_rate": 0.00042431333908116766, "loss": 8.8261, "step": 1234 }, { "epoch": 0.15201870999507633, "grad_norm": 0.6398124694824219, "learning_rate": 0.0004242517551422589, "loss": 7.6538, "step": 1235 }, { "epoch": 0.15214180206794684, "grad_norm": 0.5435010194778442, "learning_rate": 0.0004241901712033502, "loss": 7.7678, "step": 1236 }, { "epoch": 0.15226489414081734, "grad_norm": 0.332849383354187, "learning_rate": 0.0004241285872644414, "loss": 8.5103, "step": 1237 }, { "epoch": 0.15238798621368785, "grad_norm": 0.33831122517585754, "learning_rate": 0.0004240670033255327, "loss": 8.0464, "step": 1238 }, { "epoch": 0.15251107828655835, "grad_norm": 0.3068463206291199, "learning_rate": 0.000424005419386624, "loss": 7.4486, "step": 1239 }, { "epoch": 
0.15263417035942886, "grad_norm": 0.4823313355445862, "learning_rate": 0.00042394383544771527, "loss": 7.7408, "step": 1240 }, { "epoch": 0.15275726243229937, "grad_norm": 0.3100503385066986, "learning_rate": 0.0004238822515088065, "loss": 7.5474, "step": 1241 }, { "epoch": 0.15288035450516987, "grad_norm": 0.3892628252506256, "learning_rate": 0.0004238206675698978, "loss": 7.9316, "step": 1242 }, { "epoch": 0.15300344657804038, "grad_norm": 0.38173606991767883, "learning_rate": 0.000423759083630989, "loss": 7.5529, "step": 1243 }, { "epoch": 0.1531265386509109, "grad_norm": 0.6199668645858765, "learning_rate": 0.0004236974996920803, "loss": 8.5991, "step": 1244 }, { "epoch": 0.1532496307237814, "grad_norm": 0.31615984439849854, "learning_rate": 0.00042363591575317154, "loss": 8.4155, "step": 1245 }, { "epoch": 0.1533727227966519, "grad_norm": 0.3535023033618927, "learning_rate": 0.0004235743318142629, "loss": 7.6691, "step": 1246 }, { "epoch": 0.1534958148695224, "grad_norm": 0.3733571469783783, "learning_rate": 0.0004235127478753541, "loss": 7.4528, "step": 1247 }, { "epoch": 0.1536189069423929, "grad_norm": 0.22362086176872253, "learning_rate": 0.0004234511639364454, "loss": 7.5071, "step": 1248 }, { "epoch": 0.15374199901526342, "grad_norm": 0.4656035602092743, "learning_rate": 0.00042338957999753663, "loss": 7.9967, "step": 1249 }, { "epoch": 0.15386509108813393, "grad_norm": 0.41038528084754944, "learning_rate": 0.0004233279960586279, "loss": 8.3023, "step": 1250 }, { "epoch": 0.15398818316100443, "grad_norm": 0.3944229185581207, "learning_rate": 0.00042326641211971915, "loss": 7.7163, "step": 1251 }, { "epoch": 0.15411127523387494, "grad_norm": 0.5371823906898499, "learning_rate": 0.00042320482818081044, "loss": 7.9424, "step": 1252 }, { "epoch": 0.15423436730674545, "grad_norm": 0.13308994472026825, "learning_rate": 0.0004231432442419017, "loss": 8.0528, "step": 1253 }, { "epoch": 0.15435745937961595, "grad_norm": 0.4449875056743622, "learning_rate": 
0.000423081660302993, "loss": 8.3037, "step": 1254 }, { "epoch": 0.15448055145248646, "grad_norm": 0.26389366388320923, "learning_rate": 0.00042302007636408424, "loss": 7.5128, "step": 1255 }, { "epoch": 0.15460364352535697, "grad_norm": 0.39243826270103455, "learning_rate": 0.00042295849242517553, "loss": 8.3022, "step": 1256 }, { "epoch": 0.15472673559822747, "grad_norm": 0.2616882026195526, "learning_rate": 0.00042289690848626676, "loss": 7.6472, "step": 1257 }, { "epoch": 0.15484982767109798, "grad_norm": 0.357521116733551, "learning_rate": 0.00042283532454735805, "loss": 7.7814, "step": 1258 }, { "epoch": 0.15497291974396848, "grad_norm": 0.2726127803325653, "learning_rate": 0.0004227737406084493, "loss": 7.4842, "step": 1259 }, { "epoch": 0.155096011816839, "grad_norm": 0.4871644079685211, "learning_rate": 0.0004227121566695406, "loss": 7.6352, "step": 1260 }, { "epoch": 0.1552191038897095, "grad_norm": 0.32736513018608093, "learning_rate": 0.00042265057273063186, "loss": 7.7721, "step": 1261 }, { "epoch": 0.15534219596258, "grad_norm": 0.16521932184696198, "learning_rate": 0.00042258898879172314, "loss": 8.0422, "step": 1262 }, { "epoch": 0.1554652880354505, "grad_norm": 0.433927446603775, "learning_rate": 0.0004225274048528144, "loss": 7.7135, "step": 1263 }, { "epoch": 0.15558838010832102, "grad_norm": 0.34309741854667664, "learning_rate": 0.00042246582091390566, "loss": 7.7614, "step": 1264 }, { "epoch": 0.15571147218119152, "grad_norm": 0.21425703167915344, "learning_rate": 0.0004224042369749969, "loss": 7.9892, "step": 1265 }, { "epoch": 0.15583456425406203, "grad_norm": 0.44511574506759644, "learning_rate": 0.00042234265303608823, "loss": 7.659, "step": 1266 }, { "epoch": 0.15595765632693254, "grad_norm": 0.34230586886405945, "learning_rate": 0.00042228106909717947, "loss": 7.8759, "step": 1267 }, { "epoch": 0.15608074839980304, "grad_norm": 0.2585383951663971, "learning_rate": 0.00042221948515827075, "loss": 8.402, "step": 1268 }, { "epoch": 
0.15620384047267355, "grad_norm": 0.3059520125389099, "learning_rate": 0.000422157901219362, "loss": 8.0908, "step": 1269 }, { "epoch": 0.15632693254554406, "grad_norm": 0.5619118213653564, "learning_rate": 0.00042209631728045327, "loss": 7.6885, "step": 1270 }, { "epoch": 0.15645002461841456, "grad_norm": 0.4681781530380249, "learning_rate": 0.0004220347333415445, "loss": 7.7374, "step": 1271 }, { "epoch": 0.15657311669128507, "grad_norm": 0.26376768946647644, "learning_rate": 0.0004219731494026358, "loss": 7.6323, "step": 1272 }, { "epoch": 0.15669620876415558, "grad_norm": 0.7622837424278259, "learning_rate": 0.0004219115654637271, "loss": 8.7881, "step": 1273 }, { "epoch": 0.15681930083702608, "grad_norm": 0.2619265019893646, "learning_rate": 0.00042184998152481836, "loss": 7.3733, "step": 1274 }, { "epoch": 0.15694239290989662, "grad_norm": 0.2057582587003708, "learning_rate": 0.0004217883975859096, "loss": 7.7622, "step": 1275 }, { "epoch": 0.15706548498276712, "grad_norm": 0.350011944770813, "learning_rate": 0.0004217268136470009, "loss": 7.6567, "step": 1276 }, { "epoch": 0.15718857705563763, "grad_norm": 0.5585086345672607, "learning_rate": 0.0004216652297080921, "loss": 7.3899, "step": 1277 }, { "epoch": 0.15731166912850814, "grad_norm": 0.3244166374206543, "learning_rate": 0.0004216036457691834, "loss": 7.9765, "step": 1278 }, { "epoch": 0.15743476120137864, "grad_norm": 0.462766170501709, "learning_rate": 0.00042154206183027463, "loss": 8.2154, "step": 1279 }, { "epoch": 0.15755785327424915, "grad_norm": 0.16598673164844513, "learning_rate": 0.000421480477891366, "loss": 7.3006, "step": 1280 }, { "epoch": 0.15768094534711966, "grad_norm": 0.8396050333976746, "learning_rate": 0.0004214188939524572, "loss": 9.4524, "step": 1281 }, { "epoch": 0.15780403741999016, "grad_norm": 0.3267558217048645, "learning_rate": 0.0004213573100135485, "loss": 7.5498, "step": 1282 }, { "epoch": 0.15792712949286067, "grad_norm": 0.3591454029083252, "learning_rate": 
0.0004212957260746397, "loss": 8.4561, "step": 1283 }, { "epoch": 0.15805022156573117, "grad_norm": 0.5334994792938232, "learning_rate": 0.000421234142135731, "loss": 7.8433, "step": 1284 }, { "epoch": 0.15817331363860168, "grad_norm": 0.7246116399765015, "learning_rate": 0.00042117255819682225, "loss": 7.6675, "step": 1285 }, { "epoch": 0.1582964057114722, "grad_norm": 0.7906237244606018, "learning_rate": 0.00042111097425791353, "loss": 7.357, "step": 1286 }, { "epoch": 0.1584194977843427, "grad_norm": 0.395465224981308, "learning_rate": 0.0004210493903190048, "loss": 7.8121, "step": 1287 }, { "epoch": 0.1585425898572132, "grad_norm": 0.32700315117836, "learning_rate": 0.0004209878063800961, "loss": 7.576, "step": 1288 }, { "epoch": 0.1586656819300837, "grad_norm": 0.61037278175354, "learning_rate": 0.00042092622244118734, "loss": 7.7536, "step": 1289 }, { "epoch": 0.1587887740029542, "grad_norm": 0.5675337910652161, "learning_rate": 0.0004208646385022786, "loss": 7.4937, "step": 1290 }, { "epoch": 0.15891186607582472, "grad_norm": 0.5465804934501648, "learning_rate": 0.00042080305456336986, "loss": 8.3384, "step": 1291 }, { "epoch": 0.15903495814869523, "grad_norm": 0.2364872545003891, "learning_rate": 0.00042074147062446114, "loss": 7.6387, "step": 1292 }, { "epoch": 0.15915805022156573, "grad_norm": 0.3747563660144806, "learning_rate": 0.00042067988668555243, "loss": 8.2659, "step": 1293 }, { "epoch": 0.15928114229443624, "grad_norm": 0.32011914253234863, "learning_rate": 0.0004206183027466437, "loss": 8.1381, "step": 1294 }, { "epoch": 0.15940423436730675, "grad_norm": 0.21116794645786285, "learning_rate": 0.00042055671880773495, "loss": 8.1368, "step": 1295 }, { "epoch": 0.15952732644017725, "grad_norm": 0.2321891039609909, "learning_rate": 0.00042049513486882624, "loss": 7.9045, "step": 1296 }, { "epoch": 0.15965041851304776, "grad_norm": 0.2758432924747467, "learning_rate": 0.00042043355092991747, "loss": 8.2189, "step": 1297 }, { "epoch": 
0.15977351058591827, "grad_norm": 0.21770362555980682, "learning_rate": 0.00042037196699100875, "loss": 7.9537, "step": 1298 }, { "epoch": 0.15989660265878877, "grad_norm": 0.32044124603271484, "learning_rate": 0.0004203103830521, "loss": 7.587, "step": 1299 }, { "epoch": 0.16001969473165928, "grad_norm": 0.2007421851158142, "learning_rate": 0.00042024879911319133, "loss": 7.508, "step": 1300 }, { "epoch": 0.16014278680452979, "grad_norm": 0.8738301992416382, "learning_rate": 0.00042018721517428256, "loss": 9.1556, "step": 1301 }, { "epoch": 0.1602658788774003, "grad_norm": 0.282461553812027, "learning_rate": 0.00042012563123537385, "loss": 7.7358, "step": 1302 }, { "epoch": 0.1603889709502708, "grad_norm": 0.3177146315574646, "learning_rate": 0.0004200640472964651, "loss": 8.1217, "step": 1303 }, { "epoch": 0.1605120630231413, "grad_norm": 0.18925733864307404, "learning_rate": 0.00042000246335755637, "loss": 8.2442, "step": 1304 }, { "epoch": 0.1606351550960118, "grad_norm": 0.5521374344825745, "learning_rate": 0.0004199408794186476, "loss": 7.6719, "step": 1305 }, { "epoch": 0.16075824716888232, "grad_norm": 0.6422029137611389, "learning_rate": 0.0004198792954797389, "loss": 7.6331, "step": 1306 }, { "epoch": 0.16088133924175282, "grad_norm": 0.3401394784450531, "learning_rate": 0.00041981771154083017, "loss": 7.8351, "step": 1307 }, { "epoch": 0.16100443131462333, "grad_norm": 0.1823761910200119, "learning_rate": 0.00041975612760192146, "loss": 7.8426, "step": 1308 }, { "epoch": 0.16112752338749384, "grad_norm": 0.6190038323402405, "learning_rate": 0.0004196945436630127, "loss": 8.3297, "step": 1309 }, { "epoch": 0.16125061546036434, "grad_norm": 0.3872681260108948, "learning_rate": 0.000419632959724104, "loss": 7.8725, "step": 1310 }, { "epoch": 0.16137370753323485, "grad_norm": 0.24601098895072937, "learning_rate": 0.0004195713757851952, "loss": 7.5693, "step": 1311 }, { "epoch": 0.16149679960610536, "grad_norm": 0.3815496265888214, "learning_rate": 
0.0004195097918462865, "loss": 7.3798, "step": 1312 }, { "epoch": 0.16161989167897586, "grad_norm": 0.3952881991863251, "learning_rate": 0.00041944820790737773, "loss": 7.4388, "step": 1313 }, { "epoch": 0.16174298375184637, "grad_norm": 0.14168256521224976, "learning_rate": 0.00041938662396846907, "loss": 7.7522, "step": 1314 }, { "epoch": 0.16186607582471688, "grad_norm": 0.190973162651062, "learning_rate": 0.0004193250400295603, "loss": 7.7321, "step": 1315 }, { "epoch": 0.16198916789758738, "grad_norm": 0.3142707943916321, "learning_rate": 0.0004192634560906516, "loss": 7.0409, "step": 1316 }, { "epoch": 0.1621122599704579, "grad_norm": 0.3008851110935211, "learning_rate": 0.0004192018721517428, "loss": 7.5476, "step": 1317 }, { "epoch": 0.16223535204332842, "grad_norm": 0.2710198163986206, "learning_rate": 0.0004191402882128341, "loss": 7.5198, "step": 1318 }, { "epoch": 0.16235844411619893, "grad_norm": 0.19912542402744293, "learning_rate": 0.00041907870427392534, "loss": 8.0462, "step": 1319 }, { "epoch": 0.16248153618906944, "grad_norm": 0.17758390307426453, "learning_rate": 0.0004190171203350167, "loss": 7.6356, "step": 1320 }, { "epoch": 0.16260462826193994, "grad_norm": 0.39951831102371216, "learning_rate": 0.0004189555363961079, "loss": 7.2394, "step": 1321 }, { "epoch": 0.16272772033481045, "grad_norm": 0.1728070229291916, "learning_rate": 0.0004188939524571992, "loss": 7.5978, "step": 1322 }, { "epoch": 0.16285081240768096, "grad_norm": 0.2412927746772766, "learning_rate": 0.00041883236851829043, "loss": 7.6299, "step": 1323 }, { "epoch": 0.16297390448055146, "grad_norm": 0.20206764340400696, "learning_rate": 0.0004187707845793817, "loss": 7.4888, "step": 1324 }, { "epoch": 0.16309699655342197, "grad_norm": 0.19127525389194489, "learning_rate": 0.00041870920064047295, "loss": 7.621, "step": 1325 }, { "epoch": 0.16322008862629248, "grad_norm": 0.6389787197113037, "learning_rate": 0.00041864761670156424, "loss": 8.0996, "step": 1326 }, { "epoch": 
0.16334318069916298, "grad_norm": 0.28099325299263, "learning_rate": 0.0004185860327626555, "loss": 7.6887, "step": 1327 }, { "epoch": 0.1634662727720335, "grad_norm": 0.27529096603393555, "learning_rate": 0.0004185244488237468, "loss": 7.7174, "step": 1328 }, { "epoch": 0.163589364844904, "grad_norm": 0.15396250784397125, "learning_rate": 0.00041846286488483804, "loss": 7.6332, "step": 1329 }, { "epoch": 0.1637124569177745, "grad_norm": 0.45115000009536743, "learning_rate": 0.00041840128094592933, "loss": 8.2432, "step": 1330 }, { "epoch": 0.163835548990645, "grad_norm": 0.20226003229618073, "learning_rate": 0.00041833969700702056, "loss": 7.6544, "step": 1331 }, { "epoch": 0.16395864106351551, "grad_norm": 0.3870770037174225, "learning_rate": 0.00041827811306811185, "loss": 7.4372, "step": 1332 }, { "epoch": 0.16408173313638602, "grad_norm": 0.22012612223625183, "learning_rate": 0.0004182165291292031, "loss": 7.6049, "step": 1333 }, { "epoch": 0.16420482520925653, "grad_norm": 0.30837732553482056, "learning_rate": 0.0004181549451902944, "loss": 7.4765, "step": 1334 }, { "epoch": 0.16432791728212703, "grad_norm": 0.15487059950828552, "learning_rate": 0.00041809336125138565, "loss": 7.6587, "step": 1335 }, { "epoch": 0.16445100935499754, "grad_norm": 0.1714780181646347, "learning_rate": 0.00041803177731247694, "loss": 7.5265, "step": 1336 }, { "epoch": 0.16457410142786805, "grad_norm": 0.12136055529117584, "learning_rate": 0.0004179701933735682, "loss": 7.5749, "step": 1337 }, { "epoch": 0.16469719350073855, "grad_norm": 0.566595196723938, "learning_rate": 0.00041790860943465946, "loss": 8.8493, "step": 1338 }, { "epoch": 0.16482028557360906, "grad_norm": 0.16120564937591553, "learning_rate": 0.0004178470254957507, "loss": 7.7737, "step": 1339 }, { "epoch": 0.16494337764647957, "grad_norm": 0.38400182127952576, "learning_rate": 0.000417785441556842, "loss": 7.2833, "step": 1340 }, { "epoch": 0.16506646971935007, "grad_norm": 0.2217247635126114, "learning_rate": 
0.00041772385761793327, "loss": 7.4976, "step": 1341 }, { "epoch": 0.16518956179222058, "grad_norm": 0.34315261244773865, "learning_rate": 0.00041766227367902455, "loss": 8.2689, "step": 1342 }, { "epoch": 0.16531265386509109, "grad_norm": 0.1582586169242859, "learning_rate": 0.0004176006897401158, "loss": 7.5597, "step": 1343 }, { "epoch": 0.1654357459379616, "grad_norm": 0.2526699900627136, "learning_rate": 0.00041753910580120707, "loss": 7.7226, "step": 1344 }, { "epoch": 0.1655588380108321, "grad_norm": 0.42595043778419495, "learning_rate": 0.0004174775218622983, "loss": 7.2843, "step": 1345 }, { "epoch": 0.1656819300837026, "grad_norm": 0.33534929156303406, "learning_rate": 0.0004174159379233896, "loss": 7.5462, "step": 1346 }, { "epoch": 0.1658050221565731, "grad_norm": 0.2090773731470108, "learning_rate": 0.0004173543539844809, "loss": 7.4516, "step": 1347 }, { "epoch": 0.16592811422944362, "grad_norm": 0.1671341508626938, "learning_rate": 0.00041729277004557216, "loss": 7.7253, "step": 1348 }, { "epoch": 0.16605120630231412, "grad_norm": 0.5024161338806152, "learning_rate": 0.0004172311861066634, "loss": 7.9465, "step": 1349 }, { "epoch": 0.16617429837518463, "grad_norm": 0.16168341040611267, "learning_rate": 0.0004171696021677547, "loss": 7.3115, "step": 1350 }, { "epoch": 0.16629739044805514, "grad_norm": 0.3397960066795349, "learning_rate": 0.0004171080182288459, "loss": 7.8454, "step": 1351 }, { "epoch": 0.16642048252092564, "grad_norm": 0.28787389397621155, "learning_rate": 0.0004170464342899372, "loss": 7.4631, "step": 1352 }, { "epoch": 0.16654357459379615, "grad_norm": 0.21456079185009003, "learning_rate": 0.00041698485035102843, "loss": 7.6861, "step": 1353 }, { "epoch": 0.16666666666666666, "grad_norm": 0.23532086610794067, "learning_rate": 0.0004169232664121198, "loss": 7.3053, "step": 1354 }, { "epoch": 0.16678975873953716, "grad_norm": 0.25569993257522583, "learning_rate": 0.000416861682473211, "loss": 7.7032, "step": 1355 }, { "epoch": 
0.16691285081240767, "grad_norm": 0.6268933415412903, "learning_rate": 0.0004168000985343023, "loss": 8.5021, "step": 1356 }, { "epoch": 0.16703594288527818, "grad_norm": 0.38767683506011963, "learning_rate": 0.0004167385145953935, "loss": 8.0424, "step": 1357 }, { "epoch": 0.16715903495814868, "grad_norm": 0.11340180039405823, "learning_rate": 0.0004166769306564848, "loss": 7.7285, "step": 1358 }, { "epoch": 0.1672821270310192, "grad_norm": 0.23860646784305573, "learning_rate": 0.00041661534671757604, "loss": 8.0384, "step": 1359 }, { "epoch": 0.1674052191038897, "grad_norm": 0.23746947944164276, "learning_rate": 0.00041655376277866733, "loss": 7.6196, "step": 1360 }, { "epoch": 0.1675283111767602, "grad_norm": 0.2933744192123413, "learning_rate": 0.0004164921788397586, "loss": 7.3987, "step": 1361 }, { "epoch": 0.16765140324963074, "grad_norm": 0.3318029046058655, "learning_rate": 0.0004164305949008499, "loss": 7.968, "step": 1362 }, { "epoch": 0.16777449532250124, "grad_norm": 0.2781837582588196, "learning_rate": 0.00041636901096194114, "loss": 7.9919, "step": 1363 }, { "epoch": 0.16789758739537175, "grad_norm": 0.193125918507576, "learning_rate": 0.0004163074270230324, "loss": 7.4964, "step": 1364 }, { "epoch": 0.16802067946824226, "grad_norm": 0.18196727335453033, "learning_rate": 0.00041624584308412366, "loss": 7.357, "step": 1365 }, { "epoch": 0.16814377154111276, "grad_norm": 0.27016857266426086, "learning_rate": 0.00041618425914521494, "loss": 7.346, "step": 1366 }, { "epoch": 0.16826686361398327, "grad_norm": 0.2825358211994171, "learning_rate": 0.0004161226752063062, "loss": 7.6837, "step": 1367 }, { "epoch": 0.16838995568685378, "grad_norm": 0.2062843143939972, "learning_rate": 0.0004160610912673975, "loss": 8.0157, "step": 1368 }, { "epoch": 0.16851304775972428, "grad_norm": 0.1413881480693817, "learning_rate": 0.00041599950732848875, "loss": 7.7837, "step": 1369 }, { "epoch": 0.1686361398325948, "grad_norm": 0.17036794126033783, "learning_rate": 
0.00041593792338958003, "loss": 7.5527, "step": 1370 }, { "epoch": 0.1687592319054653, "grad_norm": 0.19909456372261047, "learning_rate": 0.00041587633945067127, "loss": 7.8132, "step": 1371 }, { "epoch": 0.1688823239783358, "grad_norm": 0.19609291851520538, "learning_rate": 0.00041581475551176255, "loss": 7.6605, "step": 1372 }, { "epoch": 0.1690054160512063, "grad_norm": 0.18404719233512878, "learning_rate": 0.0004157531715728538, "loss": 7.3953, "step": 1373 }, { "epoch": 0.16912850812407682, "grad_norm": 0.12705567479133606, "learning_rate": 0.0004156915876339451, "loss": 7.499, "step": 1374 }, { "epoch": 0.16925160019694732, "grad_norm": 0.21627090871334076, "learning_rate": 0.00041563000369503636, "loss": 7.4784, "step": 1375 }, { "epoch": 0.16937469226981783, "grad_norm": 0.2824324667453766, "learning_rate": 0.00041556841975612765, "loss": 7.6186, "step": 1376 }, { "epoch": 0.16949778434268833, "grad_norm": 0.3367098867893219, "learning_rate": 0.0004155068358172189, "loss": 8.3258, "step": 1377 }, { "epoch": 0.16962087641555884, "grad_norm": 0.5478821992874146, "learning_rate": 0.00041544525187831016, "loss": 7.2475, "step": 1378 }, { "epoch": 0.16974396848842935, "grad_norm": 0.3598996698856354, "learning_rate": 0.0004153836679394014, "loss": 7.7147, "step": 1379 }, { "epoch": 0.16986706056129985, "grad_norm": 0.2806112468242645, "learning_rate": 0.0004153220840004927, "loss": 7.6816, "step": 1380 }, { "epoch": 0.16999015263417036, "grad_norm": 0.2553592026233673, "learning_rate": 0.00041526050006158397, "loss": 8.4682, "step": 1381 }, { "epoch": 0.17011324470704087, "grad_norm": 0.20642559230327606, "learning_rate": 0.00041519891612267526, "loss": 7.3988, "step": 1382 }, { "epoch": 0.17023633677991137, "grad_norm": 0.1458241045475006, "learning_rate": 0.0004151373321837665, "loss": 7.4195, "step": 1383 }, { "epoch": 0.17035942885278188, "grad_norm": 0.45071929693222046, "learning_rate": 0.0004150757482448578, "loss": 7.7694, "step": 1384 }, { "epoch": 
0.1704825209256524, "grad_norm": 0.3850494623184204, "learning_rate": 0.000415014164305949, "loss": 7.6373, "step": 1385 }, { "epoch": 0.1706056129985229, "grad_norm": 0.3765520453453064, "learning_rate": 0.0004149525803670403, "loss": 7.8605, "step": 1386 }, { "epoch": 0.1707287050713934, "grad_norm": 0.28515568375587463, "learning_rate": 0.00041489099642813153, "loss": 7.2782, "step": 1387 }, { "epoch": 0.1708517971442639, "grad_norm": 0.2815493643283844, "learning_rate": 0.00041482941248922287, "loss": 8.4516, "step": 1388 }, { "epoch": 0.1709748892171344, "grad_norm": 0.294325053691864, "learning_rate": 0.0004147678285503141, "loss": 7.6618, "step": 1389 }, { "epoch": 0.17109798129000492, "grad_norm": 0.2861303389072418, "learning_rate": 0.0004147062446114054, "loss": 8.254, "step": 1390 }, { "epoch": 0.17122107336287543, "grad_norm": 0.18935318291187286, "learning_rate": 0.0004146446606724966, "loss": 7.6184, "step": 1391 }, { "epoch": 0.17134416543574593, "grad_norm": 0.1882530003786087, "learning_rate": 0.0004145830767335879, "loss": 7.5017, "step": 1392 }, { "epoch": 0.17146725750861644, "grad_norm": 0.17046061158180237, "learning_rate": 0.00041452149279467914, "loss": 7.5921, "step": 1393 }, { "epoch": 0.17159034958148695, "grad_norm": 0.18467742204666138, "learning_rate": 0.0004144599088557704, "loss": 7.6912, "step": 1394 }, { "epoch": 0.17171344165435745, "grad_norm": 0.11854463815689087, "learning_rate": 0.0004143983249168617, "loss": 7.7251, "step": 1395 }, { "epoch": 0.17183653372722796, "grad_norm": 0.16131921112537384, "learning_rate": 0.000414336740977953, "loss": 7.804, "step": 1396 }, { "epoch": 0.17195962580009846, "grad_norm": 0.28578153252601624, "learning_rate": 0.00041427515703904423, "loss": 7.9046, "step": 1397 }, { "epoch": 0.17208271787296897, "grad_norm": 0.12090400606393814, "learning_rate": 0.00041421357310013546, "loss": 7.528, "step": 1398 }, { "epoch": 0.17220580994583948, "grad_norm": 0.35189977288246155, "learning_rate": 
0.00041415198916122675, "loss": 7.9838, "step": 1399 }, { "epoch": 0.17232890201870998, "grad_norm": 0.22193314135074615, "learning_rate": 0.000414090405222318, "loss": 7.7197, "step": 1400 }, { "epoch": 0.1724519940915805, "grad_norm": 0.16148212552070618, "learning_rate": 0.0004140288212834093, "loss": 8.0711, "step": 1401 }, { "epoch": 0.172575086164451, "grad_norm": 0.2610238790512085, "learning_rate": 0.00041396723734450056, "loss": 7.49, "step": 1402 }, { "epoch": 0.1726981782373215, "grad_norm": 0.4175146520137787, "learning_rate": 0.00041390565340559184, "loss": 7.3445, "step": 1403 }, { "epoch": 0.172821270310192, "grad_norm": 0.3135475814342499, "learning_rate": 0.0004138440694666831, "loss": 7.2031, "step": 1404 }, { "epoch": 0.17294436238306254, "grad_norm": 0.16727930307388306, "learning_rate": 0.00041378248552777436, "loss": 7.4347, "step": 1405 }, { "epoch": 0.17306745445593305, "grad_norm": 1.214796543121338, "learning_rate": 0.0004137209015888656, "loss": 10.0355, "step": 1406 }, { "epoch": 0.17319054652880356, "grad_norm": 0.1797735095024109, "learning_rate": 0.0004136593176499569, "loss": 7.3004, "step": 1407 }, { "epoch": 0.17331363860167406, "grad_norm": 0.1778462827205658, "learning_rate": 0.00041359773371104817, "loss": 7.4001, "step": 1408 }, { "epoch": 0.17343673067454457, "grad_norm": 0.1341068595647812, "learning_rate": 0.00041353614977213945, "loss": 7.6046, "step": 1409 }, { "epoch": 0.17355982274741508, "grad_norm": 0.25024187564849854, "learning_rate": 0.0004134745658332307, "loss": 7.8236, "step": 1410 }, { "epoch": 0.17368291482028558, "grad_norm": 0.4157401919364929, "learning_rate": 0.00041341298189432197, "loss": 9.1988, "step": 1411 }, { "epoch": 0.1738060068931561, "grad_norm": 0.15439976751804352, "learning_rate": 0.0004133513979554132, "loss": 8.3342, "step": 1412 }, { "epoch": 0.1739290989660266, "grad_norm": 0.41617995500564575, "learning_rate": 0.0004132898140165045, "loss": 7.6936, "step": 1413 }, { "epoch": 
0.1740521910388971, "grad_norm": 0.2517079710960388, "learning_rate": 0.0004132282300775957, "loss": 8.067, "step": 1414 }, { "epoch": 0.1741752831117676, "grad_norm": 0.15202371776103973, "learning_rate": 0.00041316664613868706, "loss": 7.6475, "step": 1415 }, { "epoch": 0.17429837518463812, "grad_norm": 0.1800868660211563, "learning_rate": 0.0004131050621997783, "loss": 7.5687, "step": 1416 }, { "epoch": 0.17442146725750862, "grad_norm": 0.221969336271286, "learning_rate": 0.0004130434782608696, "loss": 7.6402, "step": 1417 }, { "epoch": 0.17454455933037913, "grad_norm": 0.318665474653244, "learning_rate": 0.0004129818943219608, "loss": 7.396, "step": 1418 }, { "epoch": 0.17466765140324964, "grad_norm": 0.4638115167617798, "learning_rate": 0.0004129203103830521, "loss": 9.0898, "step": 1419 }, { "epoch": 0.17479074347612014, "grad_norm": 0.27493688464164734, "learning_rate": 0.00041285872644414333, "loss": 7.3876, "step": 1420 }, { "epoch": 0.17491383554899065, "grad_norm": 0.210158571600914, "learning_rate": 0.0004127971425052346, "loss": 7.7349, "step": 1421 }, { "epoch": 0.17503692762186115, "grad_norm": 0.16887865960597992, "learning_rate": 0.0004127355585663259, "loss": 7.641, "step": 1422 }, { "epoch": 0.17516001969473166, "grad_norm": 0.22844120860099792, "learning_rate": 0.0004126739746274172, "loss": 7.9215, "step": 1423 }, { "epoch": 0.17528311176760217, "grad_norm": 0.22224518656730652, "learning_rate": 0.0004126123906885084, "loss": 7.6465, "step": 1424 }, { "epoch": 0.17540620384047267, "grad_norm": 0.38461193442344666, "learning_rate": 0.0004125508067495997, "loss": 7.9793, "step": 1425 }, { "epoch": 0.17552929591334318, "grad_norm": 0.4787311553955078, "learning_rate": 0.00041248922281069095, "loss": 8.4546, "step": 1426 }, { "epoch": 0.1756523879862137, "grad_norm": 0.24276742339134216, "learning_rate": 0.00041242763887178223, "loss": 7.5153, "step": 1427 }, { "epoch": 0.1757754800590842, "grad_norm": 0.24212129414081573, "learning_rate": 
0.00041236605493287346, "loss": 7.8337, "step": 1428 }, { "epoch": 0.1758985721319547, "grad_norm": 0.32808366417884827, "learning_rate": 0.0004123044709939648, "loss": 7.409, "step": 1429 }, { "epoch": 0.1760216642048252, "grad_norm": 0.15673963725566864, "learning_rate": 0.00041224288705505604, "loss": 7.6, "step": 1430 }, { "epoch": 0.1761447562776957, "grad_norm": 0.27773281931877136, "learning_rate": 0.0004121813031161473, "loss": 7.7076, "step": 1431 }, { "epoch": 0.17626784835056622, "grad_norm": 0.19667980074882507, "learning_rate": 0.00041211971917723856, "loss": 7.4026, "step": 1432 }, { "epoch": 0.17639094042343673, "grad_norm": 0.19481492042541504, "learning_rate": 0.00041205813523832984, "loss": 7.4239, "step": 1433 }, { "epoch": 0.17651403249630723, "grad_norm": 0.5225419998168945, "learning_rate": 0.0004119965512994211, "loss": 8.7008, "step": 1434 }, { "epoch": 0.17663712456917774, "grad_norm": 0.9533184766769409, "learning_rate": 0.0004119349673605124, "loss": 7.2546, "step": 1435 }, { "epoch": 0.17676021664204825, "grad_norm": 0.4197896420955658, "learning_rate": 0.00041187338342160365, "loss": 7.7071, "step": 1436 }, { "epoch": 0.17688330871491875, "grad_norm": 0.328389436006546, "learning_rate": 0.00041181179948269494, "loss": 7.6499, "step": 1437 }, { "epoch": 0.17700640078778926, "grad_norm": 0.22602038085460663, "learning_rate": 0.00041175021554378617, "loss": 7.4592, "step": 1438 }, { "epoch": 0.17712949286065977, "grad_norm": 0.3154953718185425, "learning_rate": 0.00041168863160487745, "loss": 7.3742, "step": 1439 }, { "epoch": 0.17725258493353027, "grad_norm": 0.41252610087394714, "learning_rate": 0.0004116270476659687, "loss": 7.9693, "step": 1440 }, { "epoch": 0.17737567700640078, "grad_norm": 0.4152511656284332, "learning_rate": 0.00041156546372706, "loss": 7.5713, "step": 1441 }, { "epoch": 0.17749876907927128, "grad_norm": 0.29512935876846313, "learning_rate": 0.00041150387978815126, "loss": 7.4425, "step": 1442 }, { "epoch": 
0.1776218611521418, "grad_norm": 0.1669824868440628, "learning_rate": 0.00041144229584924255, "loss": 7.6205, "step": 1443 }, { "epoch": 0.1777449532250123, "grad_norm": 0.30470263957977295, "learning_rate": 0.0004113807119103338, "loss": 8.6538, "step": 1444 }, { "epoch": 0.1778680452978828, "grad_norm": 0.3950280547142029, "learning_rate": 0.00041131912797142507, "loss": 7.555, "step": 1445 }, { "epoch": 0.1779911373707533, "grad_norm": 0.4858088791370392, "learning_rate": 0.0004112575440325163, "loss": 7.6612, "step": 1446 }, { "epoch": 0.17811422944362382, "grad_norm": 0.507634699344635, "learning_rate": 0.0004111959600936076, "loss": 7.4541, "step": 1447 }, { "epoch": 0.17823732151649435, "grad_norm": 0.6112698912620544, "learning_rate": 0.0004111343761546988, "loss": 7.7078, "step": 1448 }, { "epoch": 0.17836041358936486, "grad_norm": 0.2567288875579834, "learning_rate": 0.00041107279221579016, "loss": 7.2653, "step": 1449 }, { "epoch": 0.17848350566223536, "grad_norm": 0.6584529280662537, "learning_rate": 0.0004110112082768814, "loss": 8.7417, "step": 1450 }, { "epoch": 0.17860659773510587, "grad_norm": 0.5824444890022278, "learning_rate": 0.0004109496243379727, "loss": 7.7531, "step": 1451 }, { "epoch": 0.17872968980797638, "grad_norm": 0.5360903739929199, "learning_rate": 0.0004108880403990639, "loss": 7.7768, "step": 1452 }, { "epoch": 0.17885278188084688, "grad_norm": 0.3553244173526764, "learning_rate": 0.0004108264564601552, "loss": 7.6263, "step": 1453 }, { "epoch": 0.1789758739537174, "grad_norm": 0.29785260558128357, "learning_rate": 0.00041076487252124643, "loss": 7.7354, "step": 1454 }, { "epoch": 0.1790989660265879, "grad_norm": 0.4753749668598175, "learning_rate": 0.0004107032885823377, "loss": 7.1137, "step": 1455 }, { "epoch": 0.1792220580994584, "grad_norm": 0.28995755314826965, "learning_rate": 0.000410641704643429, "loss": 7.5703, "step": 1456 }, { "epoch": 0.1793451501723289, "grad_norm": 0.2742173373699188, "learning_rate": 
0.0004105801207045203, "loss": 8.0178, "step": 1457 }, { "epoch": 0.17946824224519942, "grad_norm": 0.18013812601566315, "learning_rate": 0.0004105185367656115, "loss": 7.7615, "step": 1458 }, { "epoch": 0.17959133431806992, "grad_norm": 0.2620892822742462, "learning_rate": 0.0004104569528267028, "loss": 8.5092, "step": 1459 }, { "epoch": 0.17971442639094043, "grad_norm": 0.20668669044971466, "learning_rate": 0.00041039536888779404, "loss": 7.7374, "step": 1460 }, { "epoch": 0.17983751846381094, "grad_norm": 0.27064478397369385, "learning_rate": 0.0004103337849488853, "loss": 7.6826, "step": 1461 }, { "epoch": 0.17996061053668144, "grad_norm": 0.17847153544425964, "learning_rate": 0.0004102722010099766, "loss": 7.4749, "step": 1462 }, { "epoch": 0.18008370260955195, "grad_norm": 0.3338124454021454, "learning_rate": 0.0004102106170710679, "loss": 8.5705, "step": 1463 }, { "epoch": 0.18020679468242246, "grad_norm": 0.2700590491294861, "learning_rate": 0.00041014903313215913, "loss": 8.3235, "step": 1464 }, { "epoch": 0.18032988675529296, "grad_norm": 0.40336382389068604, "learning_rate": 0.0004100874491932504, "loss": 7.6533, "step": 1465 }, { "epoch": 0.18045297882816347, "grad_norm": 0.4047597348690033, "learning_rate": 0.00041002586525434165, "loss": 7.7858, "step": 1466 }, { "epoch": 0.18057607090103397, "grad_norm": 0.3468552827835083, "learning_rate": 0.00040996428131543294, "loss": 8.2867, "step": 1467 }, { "epoch": 0.18069916297390448, "grad_norm": 0.4934690594673157, "learning_rate": 0.00040990269737652417, "loss": 7.9297, "step": 1468 }, { "epoch": 0.180822255046775, "grad_norm": 0.19106414914131165, "learning_rate": 0.0004098411134376155, "loss": 7.997, "step": 1469 }, { "epoch": 0.1809453471196455, "grad_norm": 0.17893770337104797, "learning_rate": 0.00040977952949870674, "loss": 7.5091, "step": 1470 }, { "epoch": 0.181068439192516, "grad_norm": 0.19754886627197266, "learning_rate": 0.00040971794555979803, "loss": 7.4246, "step": 1471 }, { "epoch": 
0.1811915312653865, "grad_norm": 0.15021920204162598, "learning_rate": 0.00040965636162088926, "loss": 7.2386, "step": 1472 }, { "epoch": 0.181314623338257, "grad_norm": 0.23872791230678558, "learning_rate": 0.00040959477768198055, "loss": 7.5728, "step": 1473 }, { "epoch": 0.18143771541112752, "grad_norm": 0.15370863676071167, "learning_rate": 0.0004095331937430718, "loss": 7.3852, "step": 1474 }, { "epoch": 0.18156080748399803, "grad_norm": 0.18470804393291473, "learning_rate": 0.00040947160980416307, "loss": 7.8427, "step": 1475 }, { "epoch": 0.18168389955686853, "grad_norm": 0.15074841678142548, "learning_rate": 0.00040941002586525435, "loss": 7.7042, "step": 1476 }, { "epoch": 0.18180699162973904, "grad_norm": 0.2684825360774994, "learning_rate": 0.00040934844192634564, "loss": 7.3489, "step": 1477 }, { "epoch": 0.18193008370260955, "grad_norm": 0.19138628244400024, "learning_rate": 0.0004092868579874369, "loss": 7.4967, "step": 1478 }, { "epoch": 0.18205317577548005, "grad_norm": 0.1507454663515091, "learning_rate": 0.00040922527404852816, "loss": 7.2408, "step": 1479 }, { "epoch": 0.18217626784835056, "grad_norm": 0.20156532526016235, "learning_rate": 0.0004091636901096194, "loss": 7.5674, "step": 1480 }, { "epoch": 0.18229935992122107, "grad_norm": 0.5884694457054138, "learning_rate": 0.0004091021061707107, "loss": 8.7994, "step": 1481 }, { "epoch": 0.18242245199409157, "grad_norm": 0.17148075997829437, "learning_rate": 0.0004090405222318019, "loss": 7.8555, "step": 1482 }, { "epoch": 0.18254554406696208, "grad_norm": 0.29129689931869507, "learning_rate": 0.00040897893829289325, "loss": 7.5973, "step": 1483 }, { "epoch": 0.18266863613983259, "grad_norm": 0.3813183605670929, "learning_rate": 0.0004089173543539845, "loss": 7.5117, "step": 1484 }, { "epoch": 0.1827917282127031, "grad_norm": 0.2933667302131653, "learning_rate": 0.00040885577041507577, "loss": 7.6328, "step": 1485 }, { "epoch": 0.1829148202855736, "grad_norm": 0.22997649013996124, 
"learning_rate": 0.000408794186476167, "loss": 7.5199, "step": 1486 }, { "epoch": 0.1830379123584441, "grad_norm": 0.11580491065979004, "learning_rate": 0.0004087326025372583, "loss": 7.6503, "step": 1487 }, { "epoch": 0.1831610044313146, "grad_norm": 0.33655592799186707, "learning_rate": 0.0004086710185983495, "loss": 7.2528, "step": 1488 }, { "epoch": 0.18328409650418512, "grad_norm": 0.3031530976295471, "learning_rate": 0.00040860943465944086, "loss": 7.5859, "step": 1489 }, { "epoch": 0.18340718857705562, "grad_norm": 0.3134365677833557, "learning_rate": 0.0004085478507205321, "loss": 7.5805, "step": 1490 }, { "epoch": 0.18353028064992616, "grad_norm": 0.5902944207191467, "learning_rate": 0.0004084862667816234, "loss": 8.9182, "step": 1491 }, { "epoch": 0.18365337272279667, "grad_norm": 0.3532886803150177, "learning_rate": 0.0004084246828427146, "loss": 8.795, "step": 1492 }, { "epoch": 0.18377646479566717, "grad_norm": 0.7834241986274719, "learning_rate": 0.0004083630989038059, "loss": 10.1971, "step": 1493 }, { "epoch": 0.18389955686853768, "grad_norm": 0.6527999043464661, "learning_rate": 0.00040830151496489713, "loss": 7.4489, "step": 1494 }, { "epoch": 0.18402264894140818, "grad_norm": 0.5573955774307251, "learning_rate": 0.0004082399310259884, "loss": 7.9067, "step": 1495 }, { "epoch": 0.1841457410142787, "grad_norm": 0.40051552653312683, "learning_rate": 0.0004081783470870797, "loss": 8.5194, "step": 1496 }, { "epoch": 0.1842688330871492, "grad_norm": 0.5098590850830078, "learning_rate": 0.000408116763148171, "loss": 8.0347, "step": 1497 }, { "epoch": 0.1843919251600197, "grad_norm": 0.4603579044342041, "learning_rate": 0.0004080551792092622, "loss": 7.637, "step": 1498 }, { "epoch": 0.1845150172328902, "grad_norm": 0.12389310449361801, "learning_rate": 0.0004079935952703535, "loss": 7.9828, "step": 1499 }, { "epoch": 0.18463810930576072, "grad_norm": 0.3621811270713806, "learning_rate": 0.00040793201133144474, "loss": 7.4077, "step": 1500 }, { "epoch": 
0.18476120137863122, "grad_norm": 0.5899243950843811, "learning_rate": 0.00040787042739253603, "loss": 8.2333, "step": 1501 }, { "epoch": 0.18488429345150173, "grad_norm": 0.4641565978527069, "learning_rate": 0.00040780884345362726, "loss": 7.8841, "step": 1502 }, { "epoch": 0.18500738552437224, "grad_norm": 0.27995920181274414, "learning_rate": 0.0004077472595147186, "loss": 7.5789, "step": 1503 }, { "epoch": 0.18513047759724274, "grad_norm": 0.16273728013038635, "learning_rate": 0.00040768567557580984, "loss": 7.6966, "step": 1504 }, { "epoch": 0.18525356967011325, "grad_norm": 0.31993547081947327, "learning_rate": 0.0004076240916369011, "loss": 7.5862, "step": 1505 }, { "epoch": 0.18537666174298376, "grad_norm": 0.2185204029083252, "learning_rate": 0.00040756250769799236, "loss": 7.7555, "step": 1506 }, { "epoch": 0.18549975381585426, "grad_norm": 0.31481215357780457, "learning_rate": 0.00040750092375908364, "loss": 8.5917, "step": 1507 }, { "epoch": 0.18562284588872477, "grad_norm": 0.26397979259490967, "learning_rate": 0.0004074393398201749, "loss": 8.6255, "step": 1508 }, { "epoch": 0.18574593796159528, "grad_norm": 0.26523467898368835, "learning_rate": 0.00040737775588126616, "loss": 7.6392, "step": 1509 }, { "epoch": 0.18586903003446578, "grad_norm": 0.34400925040245056, "learning_rate": 0.00040731617194235745, "loss": 7.2771, "step": 1510 }, { "epoch": 0.1859921221073363, "grad_norm": 0.2438497394323349, "learning_rate": 0.00040725458800344873, "loss": 7.659, "step": 1511 }, { "epoch": 0.1861152141802068, "grad_norm": 0.21994517743587494, "learning_rate": 0.00040719300406453997, "loss": 7.3168, "step": 1512 }, { "epoch": 0.1862383062530773, "grad_norm": 0.22798900306224823, "learning_rate": 0.00040713142012563125, "loss": 6.9268, "step": 1513 }, { "epoch": 0.1863613983259478, "grad_norm": 0.6719649434089661, "learning_rate": 0.0004070698361867225, "loss": 8.5096, "step": 1514 }, { "epoch": 0.18648449039881831, "grad_norm": 0.19626693427562714, 
"learning_rate": 0.00040700825224781377, "loss": 7.5035, "step": 1515 }, { "epoch": 0.18660758247168882, "grad_norm": 0.184451162815094, "learning_rate": 0.00040694666830890506, "loss": 7.5687, "step": 1516 }, { "epoch": 0.18673067454455933, "grad_norm": 0.34573036432266235, "learning_rate": 0.00040688508436999635, "loss": 7.3818, "step": 1517 }, { "epoch": 0.18685376661742983, "grad_norm": 0.2605118453502655, "learning_rate": 0.0004068235004310876, "loss": 7.6832, "step": 1518 }, { "epoch": 0.18697685869030034, "grad_norm": 0.2849428653717041, "learning_rate": 0.00040676191649217886, "loss": 7.6157, "step": 1519 }, { "epoch": 0.18709995076317085, "grad_norm": 0.23517630994319916, "learning_rate": 0.0004067003325532701, "loss": 7.3606, "step": 1520 }, { "epoch": 0.18722304283604135, "grad_norm": 0.3271069824695587, "learning_rate": 0.0004066387486143614, "loss": 8.3172, "step": 1521 }, { "epoch": 0.18734613490891186, "grad_norm": 0.2749461531639099, "learning_rate": 0.0004065771646754526, "loss": 7.9335, "step": 1522 }, { "epoch": 0.18746922698178237, "grad_norm": 0.17228223383426666, "learning_rate": 0.00040651558073654396, "loss": 7.8023, "step": 1523 }, { "epoch": 0.18759231905465287, "grad_norm": 0.16696804761886597, "learning_rate": 0.0004064539967976352, "loss": 7.5736, "step": 1524 }, { "epoch": 0.18771541112752338, "grad_norm": 0.33007165789604187, "learning_rate": 0.0004063924128587265, "loss": 7.2813, "step": 1525 }, { "epoch": 0.18783850320039389, "grad_norm": 0.26611804962158203, "learning_rate": 0.0004063308289198177, "loss": 7.7449, "step": 1526 }, { "epoch": 0.1879615952732644, "grad_norm": 0.19994305074214935, "learning_rate": 0.000406269244980909, "loss": 7.3322, "step": 1527 }, { "epoch": 0.1880846873461349, "grad_norm": 0.20420433580875397, "learning_rate": 0.00040620766104200023, "loss": 7.7527, "step": 1528 }, { "epoch": 0.1882077794190054, "grad_norm": 0.7887313365936279, "learning_rate": 0.0004061460771030915, "loss": 9.1191, "step": 1529 }, 
{ "epoch": 0.1883308714918759, "grad_norm": 0.08918537199497223, "learning_rate": 0.0004060844931641828, "loss": 7.4499, "step": 1530 }, { "epoch": 0.18845396356474642, "grad_norm": 0.16971921920776367, "learning_rate": 0.0004060229092252741, "loss": 7.7147, "step": 1531 }, { "epoch": 0.18857705563761692, "grad_norm": 0.21036319434642792, "learning_rate": 0.0004059613252863653, "loss": 7.6258, "step": 1532 }, { "epoch": 0.18870014771048743, "grad_norm": 0.39729195833206177, "learning_rate": 0.0004058997413474566, "loss": 9.1518, "step": 1533 }, { "epoch": 0.18882323978335794, "grad_norm": 0.28364476561546326, "learning_rate": 0.00040583815740854784, "loss": 7.6067, "step": 1534 }, { "epoch": 0.18894633185622847, "grad_norm": 0.3078083395957947, "learning_rate": 0.0004057765734696391, "loss": 7.5712, "step": 1535 }, { "epoch": 0.18906942392909898, "grad_norm": 0.33466747403144836, "learning_rate": 0.00040571498953073036, "loss": 7.5623, "step": 1536 }, { "epoch": 0.18919251600196949, "grad_norm": 0.2497331202030182, "learning_rate": 0.0004056534055918217, "loss": 7.8674, "step": 1537 }, { "epoch": 0.18931560807484, "grad_norm": 0.23477548360824585, "learning_rate": 0.00040559182165291293, "loss": 7.3068, "step": 1538 }, { "epoch": 0.1894387001477105, "grad_norm": 0.42905914783477783, "learning_rate": 0.0004055302377140042, "loss": 8.2977, "step": 1539 }, { "epoch": 0.189561792220581, "grad_norm": 0.25721457600593567, "learning_rate": 0.00040546865377509545, "loss": 7.6951, "step": 1540 }, { "epoch": 0.1896848842934515, "grad_norm": 0.19355463981628418, "learning_rate": 0.00040540706983618674, "loss": 7.6403, "step": 1541 }, { "epoch": 0.18980797636632202, "grad_norm": 0.20576459169387817, "learning_rate": 0.00040534548589727797, "loss": 7.6119, "step": 1542 }, { "epoch": 0.18993106843919252, "grad_norm": 0.30844300985336304, "learning_rate": 0.0004052839019583693, "loss": 7.5873, "step": 1543 }, { "epoch": 0.19005416051206303, "grad_norm": 0.14868104457855225, 
"learning_rate": 0.00040522231801946054, "loss": 7.7223, "step": 1544 }, { "epoch": 0.19017725258493354, "grad_norm": 0.3442416191101074, "learning_rate": 0.00040516073408055183, "loss": 7.9923, "step": 1545 }, { "epoch": 0.19030034465780404, "grad_norm": 0.22260332107543945, "learning_rate": 0.00040509915014164306, "loss": 7.5532, "step": 1546 }, { "epoch": 0.19042343673067455, "grad_norm": 0.1755305379629135, "learning_rate": 0.00040503756620273435, "loss": 7.5075, "step": 1547 }, { "epoch": 0.19054652880354506, "grad_norm": 0.17349523305892944, "learning_rate": 0.0004049759822638256, "loss": 7.623, "step": 1548 }, { "epoch": 0.19066962087641556, "grad_norm": 0.2110152244567871, "learning_rate": 0.00040491439832491687, "loss": 7.525, "step": 1549 }, { "epoch": 0.19079271294928607, "grad_norm": 0.27456843852996826, "learning_rate": 0.00040485281438600815, "loss": 8.3479, "step": 1550 }, { "epoch": 0.19091580502215658, "grad_norm": 0.24309027194976807, "learning_rate": 0.00040479123044709944, "loss": 7.676, "step": 1551 }, { "epoch": 0.19103889709502708, "grad_norm": 0.18023701012134552, "learning_rate": 0.00040472964650819067, "loss": 8.1442, "step": 1552 }, { "epoch": 0.1911619891678976, "grad_norm": 0.13763687014579773, "learning_rate": 0.00040466806256928196, "loss": 8.0848, "step": 1553 }, { "epoch": 0.1912850812407681, "grad_norm": 0.22126007080078125, "learning_rate": 0.0004046064786303732, "loss": 7.5385, "step": 1554 }, { "epoch": 0.1914081733136386, "grad_norm": 0.15932220220565796, "learning_rate": 0.0004045448946914645, "loss": 7.4895, "step": 1555 }, { "epoch": 0.1915312653865091, "grad_norm": 0.2564429044723511, "learning_rate": 0.0004044833107525557, "loss": 7.7527, "step": 1556 }, { "epoch": 0.19165435745937962, "grad_norm": 0.3809904158115387, "learning_rate": 0.00040442172681364705, "loss": 8.1812, "step": 1557 }, { "epoch": 0.19177744953225012, "grad_norm": 0.5215671062469482, "learning_rate": 0.0004043601428747383, "loss": 8.4248, "step": 1558 
}, { "epoch": 0.19190054160512063, "grad_norm": 0.20178812742233276, "learning_rate": 0.00040429855893582957, "loss": 7.5768, "step": 1559 }, { "epoch": 0.19202363367799113, "grad_norm": 0.21534189581871033, "learning_rate": 0.0004042369749969208, "loss": 7.5411, "step": 1560 }, { "epoch": 0.19214672575086164, "grad_norm": 0.1888691484928131, "learning_rate": 0.0004041753910580121, "loss": 7.4828, "step": 1561 }, { "epoch": 0.19226981782373215, "grad_norm": 0.19269748032093048, "learning_rate": 0.0004041138071191033, "loss": 7.7338, "step": 1562 }, { "epoch": 0.19239290989660265, "grad_norm": 0.17764557898044586, "learning_rate": 0.0004040522231801946, "loss": 7.3169, "step": 1563 }, { "epoch": 0.19251600196947316, "grad_norm": 0.26235321164131165, "learning_rate": 0.0004039906392412859, "loss": 7.5083, "step": 1564 }, { "epoch": 0.19263909404234367, "grad_norm": 0.3452274203300476, "learning_rate": 0.0004039290553023772, "loss": 7.8844, "step": 1565 }, { "epoch": 0.19276218611521417, "grad_norm": 0.3709310293197632, "learning_rate": 0.0004038674713634684, "loss": 8.0655, "step": 1566 }, { "epoch": 0.19288527818808468, "grad_norm": 0.18056993186473846, "learning_rate": 0.0004038058874245597, "loss": 7.5371, "step": 1567 }, { "epoch": 0.1930083702609552, "grad_norm": 0.17250923812389374, "learning_rate": 0.00040374430348565093, "loss": 8.1537, "step": 1568 }, { "epoch": 0.1931314623338257, "grad_norm": 0.36274564266204834, "learning_rate": 0.0004036827195467422, "loss": 7.4886, "step": 1569 }, { "epoch": 0.1932545544066962, "grad_norm": 0.4440489411354065, "learning_rate": 0.00040362113560783345, "loss": 7.2855, "step": 1570 }, { "epoch": 0.1933776464795667, "grad_norm": 0.19006440043449402, "learning_rate": 0.0004035595516689248, "loss": 8.1367, "step": 1571 }, { "epoch": 0.1935007385524372, "grad_norm": 0.23883859813213348, "learning_rate": 0.000403497967730016, "loss": 7.6054, "step": 1572 }, { "epoch": 0.19362383062530772, "grad_norm": 0.1850305199623108, 
"learning_rate": 0.0004034363837911073, "loss": 8.0129, "step": 1573 }, { "epoch": 0.19374692269817823, "grad_norm": 0.1532246172428131, "learning_rate": 0.00040337479985219854, "loss": 7.6417, "step": 1574 }, { "epoch": 0.19387001477104873, "grad_norm": 0.15651513636112213, "learning_rate": 0.00040331321591328983, "loss": 7.6138, "step": 1575 }, { "epoch": 0.19399310684391924, "grad_norm": 0.1787426471710205, "learning_rate": 0.00040325163197438106, "loss": 7.2127, "step": 1576 }, { "epoch": 0.19411619891678975, "grad_norm": 0.12827490270137787, "learning_rate": 0.0004031900480354724, "loss": 7.4493, "step": 1577 }, { "epoch": 0.19423929098966028, "grad_norm": 0.1723065972328186, "learning_rate": 0.00040312846409656364, "loss": 7.42, "step": 1578 }, { "epoch": 0.19436238306253079, "grad_norm": 0.10837485641241074, "learning_rate": 0.0004030668801576549, "loss": 7.596, "step": 1579 }, { "epoch": 0.1944854751354013, "grad_norm": 0.3963494896888733, "learning_rate": 0.00040300529621874615, "loss": 7.552, "step": 1580 }, { "epoch": 0.1946085672082718, "grad_norm": 0.12491583079099655, "learning_rate": 0.00040294371227983744, "loss": 7.6861, "step": 1581 }, { "epoch": 0.1947316592811423, "grad_norm": 0.8282606601715088, "learning_rate": 0.0004028821283409287, "loss": 9.954, "step": 1582 }, { "epoch": 0.1948547513540128, "grad_norm": 0.317691445350647, "learning_rate": 0.00040282054440201996, "loss": 7.9201, "step": 1583 }, { "epoch": 0.19497784342688332, "grad_norm": 0.16730229556560516, "learning_rate": 0.00040275896046311125, "loss": 7.5436, "step": 1584 }, { "epoch": 0.19510093549975382, "grad_norm": 0.3971235752105713, "learning_rate": 0.00040269737652420253, "loss": 8.6991, "step": 1585 }, { "epoch": 0.19522402757262433, "grad_norm": 0.20863664150238037, "learning_rate": 0.00040263579258529377, "loss": 7.7073, "step": 1586 }, { "epoch": 0.19534711964549484, "grad_norm": 0.2823196351528168, "learning_rate": 0.00040257420864638505, "loss": 7.3108, "step": 1587 }, { 
"epoch": 0.19547021171836534, "grad_norm": 0.24253064393997192, "learning_rate": 0.0004025126247074763, "loss": 7.7249, "step": 1588 }, { "epoch": 0.19559330379123585, "grad_norm": 0.3935747444629669, "learning_rate": 0.00040245104076856757, "loss": 8.2583, "step": 1589 }, { "epoch": 0.19571639586410636, "grad_norm": 0.16742415726184845, "learning_rate": 0.0004023894568296588, "loss": 7.6021, "step": 1590 }, { "epoch": 0.19583948793697686, "grad_norm": 0.20006464421749115, "learning_rate": 0.00040232787289075014, "loss": 7.3083, "step": 1591 }, { "epoch": 0.19596258000984737, "grad_norm": 0.21899862587451935, "learning_rate": 0.0004022662889518414, "loss": 7.6517, "step": 1592 }, { "epoch": 0.19608567208271788, "grad_norm": 0.12090947479009628, "learning_rate": 0.00040220470501293266, "loss": 7.7037, "step": 1593 }, { "epoch": 0.19620876415558838, "grad_norm": 0.1741248220205307, "learning_rate": 0.0004021431210740239, "loss": 7.7382, "step": 1594 }, { "epoch": 0.1963318562284589, "grad_norm": 0.5093694925308228, "learning_rate": 0.0004020815371351152, "loss": 7.6165, "step": 1595 }, { "epoch": 0.1964549483013294, "grad_norm": 0.14918482303619385, "learning_rate": 0.0004020199531962064, "loss": 7.5163, "step": 1596 }, { "epoch": 0.1965780403741999, "grad_norm": 0.19246619939804077, "learning_rate": 0.00040195836925729776, "loss": 7.3133, "step": 1597 }, { "epoch": 0.1967011324470704, "grad_norm": 0.2124374806880951, "learning_rate": 0.000401896785318389, "loss": 7.7218, "step": 1598 }, { "epoch": 0.19682422451994092, "grad_norm": 0.25851964950561523, "learning_rate": 0.0004018352013794803, "loss": 7.6638, "step": 1599 }, { "epoch": 0.19694731659281142, "grad_norm": 0.4392491281032562, "learning_rate": 0.0004017736174405715, "loss": 7.6357, "step": 1600 }, { "epoch": 0.19707040866568193, "grad_norm": 0.19537964463233948, "learning_rate": 0.0004017120335016628, "loss": 7.4953, "step": 1601 }, { "epoch": 0.19719350073855244, "grad_norm": 0.17222633957862854, 
"learning_rate": 0.000401650449562754, "loss": 7.5114, "step": 1602 }, { "epoch": 0.19731659281142294, "grad_norm": 0.23992429673671722, "learning_rate": 0.0004015888656238453, "loss": 7.8794, "step": 1603 }, { "epoch": 0.19743968488429345, "grad_norm": 0.9526592493057251, "learning_rate": 0.0004015272816849366, "loss": 7.9311, "step": 1604 }, { "epoch": 0.19756277695716395, "grad_norm": 0.21589791774749756, "learning_rate": 0.0004014656977460279, "loss": 8.1536, "step": 1605 }, { "epoch": 0.19768586903003446, "grad_norm": 0.26518160104751587, "learning_rate": 0.0004014041138071191, "loss": 7.7626, "step": 1606 }, { "epoch": 0.19780896110290497, "grad_norm": 0.3289664089679718, "learning_rate": 0.0004013425298682104, "loss": 7.5804, "step": 1607 }, { "epoch": 0.19793205317577547, "grad_norm": 0.3526057004928589, "learning_rate": 0.00040128094592930164, "loss": 7.3008, "step": 1608 }, { "epoch": 0.19805514524864598, "grad_norm": 0.20968639850616455, "learning_rate": 0.0004012193619903929, "loss": 7.7588, "step": 1609 }, { "epoch": 0.1981782373215165, "grad_norm": 0.14564648270606995, "learning_rate": 0.00040115777805148416, "loss": 7.4054, "step": 1610 }, { "epoch": 0.198301329394387, "grad_norm": 0.31930986046791077, "learning_rate": 0.0004010961941125755, "loss": 7.6514, "step": 1611 }, { "epoch": 0.1984244214672575, "grad_norm": 0.17694392800331116, "learning_rate": 0.00040103461017366673, "loss": 7.4035, "step": 1612 }, { "epoch": 0.198547513540128, "grad_norm": 0.3066386580467224, "learning_rate": 0.000400973026234758, "loss": 8.2244, "step": 1613 }, { "epoch": 0.1986706056129985, "grad_norm": 0.18386416137218475, "learning_rate": 0.00040091144229584925, "loss": 8.0131, "step": 1614 }, { "epoch": 0.19879369768586902, "grad_norm": 0.2971845269203186, "learning_rate": 0.00040084985835694054, "loss": 7.4726, "step": 1615 }, { "epoch": 0.19891678975873953, "grad_norm": 0.4496195316314697, "learning_rate": 0.00040078827441803177, "loss": 9.1412, "step": 1616 }, { 
"epoch": 0.19903988183161003, "grad_norm": 0.41573357582092285, "learning_rate": 0.00040072669047912305, "loss": 7.7055, "step": 1617 }, { "epoch": 0.19916297390448054, "grad_norm": 0.506908655166626, "learning_rate": 0.00040066510654021434, "loss": 7.585, "step": 1618 }, { "epoch": 0.19928606597735105, "grad_norm": 0.6846131086349487, "learning_rate": 0.00040060352260130563, "loss": 7.1816, "step": 1619 }, { "epoch": 0.19940915805022155, "grad_norm": 0.4162299931049347, "learning_rate": 0.00040054193866239686, "loss": 7.421, "step": 1620 }, { "epoch": 0.1995322501230921, "grad_norm": 0.14973993599414825, "learning_rate": 0.00040048035472348815, "loss": 7.5697, "step": 1621 }, { "epoch": 0.1996553421959626, "grad_norm": 0.5524736046791077, "learning_rate": 0.0004004187707845794, "loss": 8.301, "step": 1622 }, { "epoch": 0.1997784342688331, "grad_norm": 0.49791646003723145, "learning_rate": 0.00040035718684567067, "loss": 8.0784, "step": 1623 }, { "epoch": 0.1999015263417036, "grad_norm": 0.46946486830711365, "learning_rate": 0.0004002956029067619, "loss": 7.7182, "step": 1624 }, { "epoch": 0.2000246184145741, "grad_norm": 0.34573912620544434, "learning_rate": 0.00040023401896785324, "loss": 8.294, "step": 1625 }, { "epoch": 0.20014771048744462, "grad_norm": 0.11790741235017776, "learning_rate": 0.00040017243502894447, "loss": 7.6063, "step": 1626 }, { "epoch": 0.20027080256031513, "grad_norm": 0.2537757456302643, "learning_rate": 0.00040011085109003576, "loss": 7.3922, "step": 1627 }, { "epoch": 0.20039389463318563, "grad_norm": 0.2553461790084839, "learning_rate": 0.000400049267151127, "loss": 7.6595, "step": 1628 }, { "epoch": 0.20051698670605614, "grad_norm": 0.2833736836910248, "learning_rate": 0.0003999876832122182, "loss": 7.4868, "step": 1629 }, { "epoch": 0.20064007877892664, "grad_norm": 0.2488880604505539, "learning_rate": 0.0003999260992733095, "loss": 7.3244, "step": 1630 }, { "epoch": 0.20076317085179715, "grad_norm": 0.16175006330013275, 
"learning_rate": 0.0003998645153344008, "loss": 7.6091, "step": 1631 }, { "epoch": 0.20088626292466766, "grad_norm": 0.22241666913032532, "learning_rate": 0.0003998029313954921, "loss": 7.4317, "step": 1632 }, { "epoch": 0.20100935499753816, "grad_norm": 0.2047000378370285, "learning_rate": 0.0003997413474565833, "loss": 7.4634, "step": 1633 }, { "epoch": 0.20113244707040867, "grad_norm": 0.34257593750953674, "learning_rate": 0.0003996797635176746, "loss": 7.6876, "step": 1634 }, { "epoch": 0.20125553914327918, "grad_norm": 0.2513234615325928, "learning_rate": 0.00039961817957876583, "loss": 7.7452, "step": 1635 }, { "epoch": 0.20137863121614968, "grad_norm": 0.15196828544139862, "learning_rate": 0.0003995565956398571, "loss": 7.4793, "step": 1636 }, { "epoch": 0.2015017232890202, "grad_norm": 0.15997372567653656, "learning_rate": 0.00039949501170094835, "loss": 8.0721, "step": 1637 }, { "epoch": 0.2016248153618907, "grad_norm": 0.2545030415058136, "learning_rate": 0.0003994334277620397, "loss": 8.627, "step": 1638 }, { "epoch": 0.2017479074347612, "grad_norm": 0.268767386674881, "learning_rate": 0.0003993718438231309, "loss": 7.8826, "step": 1639 }, { "epoch": 0.2018709995076317, "grad_norm": 0.5382040739059448, "learning_rate": 0.0003993102598842222, "loss": 7.2948, "step": 1640 }, { "epoch": 0.20199409158050222, "grad_norm": 0.22154201567173004, "learning_rate": 0.00039924867594531344, "loss": 7.825, "step": 1641 }, { "epoch": 0.20211718365337272, "grad_norm": 0.33692237734794617, "learning_rate": 0.00039918709200640473, "loss": 7.4523, "step": 1642 }, { "epoch": 0.20224027572624323, "grad_norm": 0.25904881954193115, "learning_rate": 0.00039912550806749596, "loss": 7.5302, "step": 1643 }, { "epoch": 0.20236336779911374, "grad_norm": 0.19651523232460022, "learning_rate": 0.00039906392412858725, "loss": 7.2276, "step": 1644 }, { "epoch": 0.20248645987198424, "grad_norm": 0.2940760850906372, "learning_rate": 0.00039900234018967854, "loss": 7.767, "step": 1645 }, { 
"epoch": 0.20260955194485475, "grad_norm": 0.33367106318473816, "learning_rate": 0.0003989407562507698, "loss": 8.088, "step": 1646 }, { "epoch": 0.20273264401772526, "grad_norm": 0.11405465751886368, "learning_rate": 0.00039887917231186106, "loss": 7.7128, "step": 1647 }, { "epoch": 0.20285573609059576, "grad_norm": 0.29557883739471436, "learning_rate": 0.00039881758837295234, "loss": 7.3895, "step": 1648 }, { "epoch": 0.20297882816346627, "grad_norm": 0.2197394222021103, "learning_rate": 0.0003987560044340436, "loss": 7.8395, "step": 1649 }, { "epoch": 0.20310192023633677, "grad_norm": 0.16999901831150055, "learning_rate": 0.00039869442049513486, "loss": 7.8963, "step": 1650 }, { "epoch": 0.20322501230920728, "grad_norm": 0.28450706601142883, "learning_rate": 0.0003986328365562261, "loss": 7.5814, "step": 1651 }, { "epoch": 0.2033481043820778, "grad_norm": 0.6115584373474121, "learning_rate": 0.00039857125261731743, "loss": 9.2912, "step": 1652 }, { "epoch": 0.2034711964549483, "grad_norm": 0.29049959778785706, "learning_rate": 0.00039850966867840867, "loss": 8.2901, "step": 1653 }, { "epoch": 0.2035942885278188, "grad_norm": 0.30649450421333313, "learning_rate": 0.00039844808473949995, "loss": 7.5206, "step": 1654 }, { "epoch": 0.2037173806006893, "grad_norm": 0.17810508608818054, "learning_rate": 0.0003983865008005912, "loss": 7.5629, "step": 1655 }, { "epoch": 0.2038404726735598, "grad_norm": 0.19335061311721802, "learning_rate": 0.00039832491686168247, "loss": 8.1889, "step": 1656 }, { "epoch": 0.20396356474643032, "grad_norm": 0.22575511038303375, "learning_rate": 0.0003982633329227737, "loss": 8.1661, "step": 1657 }, { "epoch": 0.20408665681930083, "grad_norm": 0.2063148319721222, "learning_rate": 0.00039820174898386505, "loss": 7.4585, "step": 1658 }, { "epoch": 0.20420974889217133, "grad_norm": 0.2088218480348587, "learning_rate": 0.0003981401650449563, "loss": 7.7808, "step": 1659 }, { "epoch": 0.20433284096504184, "grad_norm": 0.22505764663219452, 
"learning_rate": 0.00039807858110604756, "loss": 7.5586, "step": 1660 }, { "epoch": 0.20445593303791235, "grad_norm": 0.11078498512506485, "learning_rate": 0.0003980169971671388, "loss": 7.6257, "step": 1661 }, { "epoch": 0.20457902511078285, "grad_norm": 0.43348172307014465, "learning_rate": 0.0003979554132282301, "loss": 8.3089, "step": 1662 }, { "epoch": 0.20470211718365336, "grad_norm": 0.18843433260917664, "learning_rate": 0.0003978938292893213, "loss": 7.5937, "step": 1663 }, { "epoch": 0.2048252092565239, "grad_norm": 0.18334577977657318, "learning_rate": 0.0003978322453504126, "loss": 7.9672, "step": 1664 }, { "epoch": 0.2049483013293944, "grad_norm": 0.21318252384662628, "learning_rate": 0.0003977706614115039, "loss": 7.8965, "step": 1665 }, { "epoch": 0.2050713934022649, "grad_norm": 0.3247815668582916, "learning_rate": 0.0003977090774725952, "loss": 7.5142, "step": 1666 }, { "epoch": 0.2051944854751354, "grad_norm": 0.22383293509483337, "learning_rate": 0.0003976474935336864, "loss": 7.7048, "step": 1667 }, { "epoch": 0.20531757754800592, "grad_norm": 0.29157137870788574, "learning_rate": 0.0003975859095947777, "loss": 7.53, "step": 1668 }, { "epoch": 0.20544066962087643, "grad_norm": 0.3328604996204376, "learning_rate": 0.00039752432565586893, "loss": 7.3414, "step": 1669 }, { "epoch": 0.20556376169374693, "grad_norm": 0.1971876174211502, "learning_rate": 0.0003974627417169602, "loss": 7.7654, "step": 1670 }, { "epoch": 0.20568685376661744, "grad_norm": 0.2241748720407486, "learning_rate": 0.00039740115777805145, "loss": 8.0288, "step": 1671 }, { "epoch": 0.20580994583948795, "grad_norm": 0.7387012243270874, "learning_rate": 0.0003973395738391428, "loss": 9.0429, "step": 1672 }, { "epoch": 0.20593303791235845, "grad_norm": 0.18309521675109863, "learning_rate": 0.000397277989900234, "loss": 7.4434, "step": 1673 }, { "epoch": 0.20605612998522896, "grad_norm": 0.17131339013576508, "learning_rate": 0.0003972164059613253, "loss": 7.6658, "step": 1674 }, { 
"epoch": 0.20617922205809947, "grad_norm": 0.13033531606197357, "learning_rate": 0.00039715482202241654, "loss": 7.5289, "step": 1675 }, { "epoch": 0.20630231413096997, "grad_norm": 0.130008727312088, "learning_rate": 0.0003970932380835078, "loss": 7.7658, "step": 1676 }, { "epoch": 0.20642540620384048, "grad_norm": 0.11053793132305145, "learning_rate": 0.00039703165414459906, "loss": 7.661, "step": 1677 }, { "epoch": 0.20654849827671098, "grad_norm": 0.18266141414642334, "learning_rate": 0.00039697007020569034, "loss": 7.7894, "step": 1678 }, { "epoch": 0.2066715903495815, "grad_norm": 0.33539798855781555, "learning_rate": 0.00039690848626678163, "loss": 7.2977, "step": 1679 }, { "epoch": 0.206794682422452, "grad_norm": 0.8375624418258667, "learning_rate": 0.0003968469023278729, "loss": 10.2038, "step": 1680 }, { "epoch": 0.2069177744953225, "grad_norm": 0.265384703874588, "learning_rate": 0.00039678531838896415, "loss": 7.3412, "step": 1681 }, { "epoch": 0.207040866568193, "grad_norm": 0.20445053279399872, "learning_rate": 0.00039672373445005544, "loss": 7.6001, "step": 1682 }, { "epoch": 0.20716395864106352, "grad_norm": 0.1894654631614685, "learning_rate": 0.00039666215051114667, "loss": 7.8003, "step": 1683 }, { "epoch": 0.20728705071393402, "grad_norm": 0.4383001923561096, "learning_rate": 0.00039660056657223796, "loss": 8.4424, "step": 1684 }, { "epoch": 0.20741014278680453, "grad_norm": 0.4110206067562103, "learning_rate": 0.00039653898263332924, "loss": 8.3348, "step": 1685 }, { "epoch": 0.20753323485967504, "grad_norm": 0.11073437333106995, "learning_rate": 0.00039647739869442053, "loss": 8.0438, "step": 1686 }, { "epoch": 0.20765632693254554, "grad_norm": 0.22306987643241882, "learning_rate": 0.00039641581475551176, "loss": 7.664, "step": 1687 }, { "epoch": 0.20777941900541605, "grad_norm": 0.29284006357192993, "learning_rate": 0.00039635423081660305, "loss": 7.4038, "step": 1688 }, { "epoch": 0.20790251107828656, "grad_norm": 0.316342294216156, 
"learning_rate": 0.0003962926468776943, "loss": 7.4404, "step": 1689 }, { "epoch": 0.20802560315115706, "grad_norm": 0.1586236208677292, "learning_rate": 0.00039623106293878557, "loss": 7.6198, "step": 1690 }, { "epoch": 0.20814869522402757, "grad_norm": 0.12916487455368042, "learning_rate": 0.0003961694789998768, "loss": 7.3724, "step": 1691 }, { "epoch": 0.20827178729689808, "grad_norm": 0.3801705539226532, "learning_rate": 0.00039610789506096814, "loss": 7.6387, "step": 1692 }, { "epoch": 0.20839487936976858, "grad_norm": 0.7734689116477966, "learning_rate": 0.00039604631112205937, "loss": 8.3887, "step": 1693 }, { "epoch": 0.2085179714426391, "grad_norm": 0.4223499000072479, "learning_rate": 0.00039598472718315066, "loss": 7.553, "step": 1694 }, { "epoch": 0.2086410635155096, "grad_norm": 0.23680727183818817, "learning_rate": 0.0003959231432442419, "loss": 7.4689, "step": 1695 }, { "epoch": 0.2087641555883801, "grad_norm": 0.1525106281042099, "learning_rate": 0.0003958615593053332, "loss": 7.6786, "step": 1696 }, { "epoch": 0.2088872476612506, "grad_norm": 0.2813016176223755, "learning_rate": 0.0003957999753664244, "loss": 7.3885, "step": 1697 }, { "epoch": 0.20901033973412111, "grad_norm": 0.17039234936237335, "learning_rate": 0.0003957383914275157, "loss": 8.3938, "step": 1698 }, { "epoch": 0.20913343180699162, "grad_norm": 0.3315683901309967, "learning_rate": 0.000395676807488607, "loss": 7.6939, "step": 1699 }, { "epoch": 0.20925652387986213, "grad_norm": 0.29731282591819763, "learning_rate": 0.00039561522354969827, "loss": 8.4591, "step": 1700 }, { "epoch": 0.20937961595273263, "grad_norm": 0.24953724443912506, "learning_rate": 0.0003955536396107895, "loss": 7.9087, "step": 1701 }, { "epoch": 0.20950270802560314, "grad_norm": 0.1864500343799591, "learning_rate": 0.0003954920556718808, "loss": 7.6673, "step": 1702 }, { "epoch": 0.20962580009847365, "grad_norm": 0.13031117618083954, "learning_rate": 0.000395430471732972, "loss": 7.8081, "step": 1703 }, { 
"epoch": 0.20974889217134415, "grad_norm": 0.18291398882865906, "learning_rate": 0.0003953688877940633, "loss": 7.5079, "step": 1704 }, { "epoch": 0.20987198424421466, "grad_norm": 0.2826051115989685, "learning_rate": 0.00039530730385515454, "loss": 7.8108, "step": 1705 }, { "epoch": 0.20999507631708517, "grad_norm": 0.7394905686378479, "learning_rate": 0.0003952457199162459, "loss": 9.0606, "step": 1706 }, { "epoch": 0.2101181683899557, "grad_norm": 0.41098105907440186, "learning_rate": 0.0003951841359773371, "loss": 8.5975, "step": 1707 }, { "epoch": 0.2102412604628262, "grad_norm": 0.22516122460365295, "learning_rate": 0.0003951225520384284, "loss": 7.7319, "step": 1708 }, { "epoch": 0.2103643525356967, "grad_norm": 0.262451708316803, "learning_rate": 0.00039506096809951963, "loss": 7.8202, "step": 1709 }, { "epoch": 0.21048744460856722, "grad_norm": 0.36483290791511536, "learning_rate": 0.0003949993841606109, "loss": 7.5008, "step": 1710 }, { "epoch": 0.21061053668143773, "grad_norm": 0.2822088897228241, "learning_rate": 0.00039493780022170215, "loss": 7.6904, "step": 1711 }, { "epoch": 0.21073362875430823, "grad_norm": 0.16571268439292908, "learning_rate": 0.0003948762162827935, "loss": 7.4503, "step": 1712 }, { "epoch": 0.21085672082717874, "grad_norm": 0.11661708354949951, "learning_rate": 0.0003948146323438847, "loss": 7.3402, "step": 1713 }, { "epoch": 0.21097981290004925, "grad_norm": 0.3814723491668701, "learning_rate": 0.000394753048404976, "loss": 8.1103, "step": 1714 }, { "epoch": 0.21110290497291975, "grad_norm": 0.6292868256568909, "learning_rate": 0.00039469146446606724, "loss": 8.6075, "step": 1715 }, { "epoch": 0.21122599704579026, "grad_norm": 0.16304217278957367, "learning_rate": 0.00039462988052715853, "loss": 7.3624, "step": 1716 }, { "epoch": 0.21134908911866077, "grad_norm": 0.21683165431022644, "learning_rate": 0.00039456829658824976, "loss": 8.0987, "step": 1717 }, { "epoch": 0.21147218119153127, "grad_norm": 0.21510685980319977, 
"learning_rate": 0.00039450671264934105, "loss": 7.8203, "step": 1718 }, { "epoch": 0.21159527326440178, "grad_norm": 0.3739349842071533, "learning_rate": 0.00039444512871043234, "loss": 7.4936, "step": 1719 }, { "epoch": 0.21171836533727229, "grad_norm": 0.23494234681129456, "learning_rate": 0.0003943835447715236, "loss": 8.0474, "step": 1720 }, { "epoch": 0.2118414574101428, "grad_norm": 0.3507143259048462, "learning_rate": 0.00039432196083261485, "loss": 7.5127, "step": 1721 }, { "epoch": 0.2119645494830133, "grad_norm": 0.26920241117477417, "learning_rate": 0.00039426037689370614, "loss": 8.4935, "step": 1722 }, { "epoch": 0.2120876415558838, "grad_norm": 0.17727921903133392, "learning_rate": 0.0003941987929547974, "loss": 7.4046, "step": 1723 }, { "epoch": 0.2122107336287543, "grad_norm": 0.19817444682121277, "learning_rate": 0.00039413720901588866, "loss": 8.0547, "step": 1724 }, { "epoch": 0.21233382570162482, "grad_norm": 0.3969350755214691, "learning_rate": 0.0003940756250769799, "loss": 8.235, "step": 1725 }, { "epoch": 0.21245691777449532, "grad_norm": 0.5401732325553894, "learning_rate": 0.00039401404113807123, "loss": 9.2441, "step": 1726 }, { "epoch": 0.21258000984736583, "grad_norm": 0.4380800724029541, "learning_rate": 0.00039395245719916247, "loss": 8.8562, "step": 1727 }, { "epoch": 0.21270310192023634, "grad_norm": 0.6802027225494385, "learning_rate": 0.00039389087326025375, "loss": 7.5675, "step": 1728 }, { "epoch": 0.21282619399310684, "grad_norm": 0.35537371039390564, "learning_rate": 0.000393829289321345, "loss": 7.7621, "step": 1729 }, { "epoch": 0.21294928606597735, "grad_norm": 0.5673653483390808, "learning_rate": 0.00039376770538243627, "loss": 7.3208, "step": 1730 }, { "epoch": 0.21307237813884786, "grad_norm": 0.6322338581085205, "learning_rate": 0.0003937061214435275, "loss": 7.5451, "step": 1731 }, { "epoch": 0.21319547021171836, "grad_norm": 0.18195410072803497, "learning_rate": 0.0003936445375046188, "loss": 7.9677, "step": 1732 }, 
{ "epoch": 0.21331856228458887, "grad_norm": 0.29154932498931885, "learning_rate": 0.0003935829535657101, "loss": 7.5123, "step": 1733 }, { "epoch": 0.21344165435745938, "grad_norm": 0.20146945118904114, "learning_rate": 0.00039352136962680136, "loss": 7.5649, "step": 1734 }, { "epoch": 0.21356474643032988, "grad_norm": 0.2907654643058777, "learning_rate": 0.0003934597856878926, "loss": 7.456, "step": 1735 }, { "epoch": 0.2136878385032004, "grad_norm": 0.49029040336608887, "learning_rate": 0.0003933982017489839, "loss": 8.3346, "step": 1736 }, { "epoch": 0.2138109305760709, "grad_norm": 0.17813830077648163, "learning_rate": 0.0003933366178100751, "loss": 7.5209, "step": 1737 }, { "epoch": 0.2139340226489414, "grad_norm": 0.20870712399482727, "learning_rate": 0.0003932750338711664, "loss": 8.0654, "step": 1738 }, { "epoch": 0.2140571147218119, "grad_norm": 0.15332381427288055, "learning_rate": 0.00039321344993225763, "loss": 7.8511, "step": 1739 }, { "epoch": 0.21418020679468242, "grad_norm": 0.17378604412078857, "learning_rate": 0.000393151865993349, "loss": 7.9576, "step": 1740 }, { "epoch": 0.21430329886755292, "grad_norm": 0.2874845862388611, "learning_rate": 0.0003930902820544402, "loss": 7.7228, "step": 1741 }, { "epoch": 0.21442639094042343, "grad_norm": 0.37630757689476013, "learning_rate": 0.0003930286981155315, "loss": 7.7422, "step": 1742 }, { "epoch": 0.21454948301329393, "grad_norm": 0.3337346613407135, "learning_rate": 0.0003929671141766227, "loss": 7.5182, "step": 1743 }, { "epoch": 0.21467257508616444, "grad_norm": 0.1882089227437973, "learning_rate": 0.000392905530237714, "loss": 7.6955, "step": 1744 }, { "epoch": 0.21479566715903495, "grad_norm": 0.15958867967128754, "learning_rate": 0.00039284394629880525, "loss": 7.7192, "step": 1745 }, { "epoch": 0.21491875923190545, "grad_norm": 0.17893026769161224, "learning_rate": 0.0003927823623598966, "loss": 7.5855, "step": 1746 }, { "epoch": 0.21504185130477596, "grad_norm": 0.24934951961040497, 
"learning_rate": 0.0003927207784209878, "loss": 7.5282, "step": 1747 }, { "epoch": 0.21516494337764647, "grad_norm": 0.2426667958498001, "learning_rate": 0.0003926591944820791, "loss": 8.2069, "step": 1748 }, { "epoch": 0.21528803545051697, "grad_norm": 0.2789213955402374, "learning_rate": 0.00039259761054317034, "loss": 7.2681, "step": 1749 }, { "epoch": 0.21541112752338748, "grad_norm": 0.19549663364887238, "learning_rate": 0.0003925360266042616, "loss": 7.3549, "step": 1750 }, { "epoch": 0.21553421959625801, "grad_norm": 0.23791450262069702, "learning_rate": 0.00039247444266535286, "loss": 8.2951, "step": 1751 }, { "epoch": 0.21565731166912852, "grad_norm": 0.157976433634758, "learning_rate": 0.00039241285872644414, "loss": 7.5106, "step": 1752 }, { "epoch": 0.21578040374199903, "grad_norm": 0.13087444007396698, "learning_rate": 0.00039235127478753543, "loss": 7.6796, "step": 1753 }, { "epoch": 0.21590349581486953, "grad_norm": 0.17479896545410156, "learning_rate": 0.0003922896908486267, "loss": 7.3295, "step": 1754 }, { "epoch": 0.21602658788774004, "grad_norm": 0.17392967641353607, "learning_rate": 0.00039222810690971795, "loss": 7.5017, "step": 1755 }, { "epoch": 0.21614967996061055, "grad_norm": 0.15357470512390137, "learning_rate": 0.00039216652297080924, "loss": 7.5982, "step": 1756 }, { "epoch": 0.21627277203348105, "grad_norm": 0.16137263178825378, "learning_rate": 0.00039210493903190047, "loss": 7.7825, "step": 1757 }, { "epoch": 0.21639586410635156, "grad_norm": 0.15042054653167725, "learning_rate": 0.00039204335509299175, "loss": 7.4985, "step": 1758 }, { "epoch": 0.21651895617922207, "grad_norm": 0.2192244678735733, "learning_rate": 0.000391981771154083, "loss": 8.3315, "step": 1759 }, { "epoch": 0.21664204825209257, "grad_norm": 0.23588478565216064, "learning_rate": 0.00039192018721517433, "loss": 7.578, "step": 1760 }, { "epoch": 0.21676514032496308, "grad_norm": 0.25113582611083984, "learning_rate": 0.00039185860327626556, "loss": 7.5849, "step": 
1761 }, { "epoch": 0.21688823239783359, "grad_norm": 0.17704501748085022, "learning_rate": 0.00039179701933735685, "loss": 8.1495, "step": 1762 }, { "epoch": 0.2170113244707041, "grad_norm": 0.14952315390110016, "learning_rate": 0.0003917354353984481, "loss": 7.5305, "step": 1763 }, { "epoch": 0.2171344165435746, "grad_norm": 0.2010071575641632, "learning_rate": 0.00039167385145953937, "loss": 7.5255, "step": 1764 }, { "epoch": 0.2172575086164451, "grad_norm": 0.30967259407043457, "learning_rate": 0.0003916122675206306, "loss": 7.7231, "step": 1765 }, { "epoch": 0.2173806006893156, "grad_norm": 0.21394230425357819, "learning_rate": 0.00039155068358172194, "loss": 8.1603, "step": 1766 }, { "epoch": 0.21750369276218612, "grad_norm": 0.11887723952531815, "learning_rate": 0.00039148909964281317, "loss": 7.465, "step": 1767 }, { "epoch": 0.21762678483505662, "grad_norm": 0.12502025067806244, "learning_rate": 0.00039142751570390446, "loss": 7.5248, "step": 1768 }, { "epoch": 0.21774987690792713, "grad_norm": 0.16088078916072845, "learning_rate": 0.0003913659317649957, "loss": 7.9417, "step": 1769 }, { "epoch": 0.21787296898079764, "grad_norm": 0.14140745997428894, "learning_rate": 0.000391304347826087, "loss": 7.6993, "step": 1770 }, { "epoch": 0.21799606105366814, "grad_norm": 0.2110222429037094, "learning_rate": 0.0003912427638871782, "loss": 7.7849, "step": 1771 }, { "epoch": 0.21811915312653865, "grad_norm": 0.14948512613773346, "learning_rate": 0.0003911811799482695, "loss": 7.5709, "step": 1772 }, { "epoch": 0.21824224519940916, "grad_norm": 0.14478197693824768, "learning_rate": 0.0003911195960093608, "loss": 7.5289, "step": 1773 }, { "epoch": 0.21836533727227966, "grad_norm": 0.14457762241363525, "learning_rate": 0.00039105801207045207, "loss": 7.5144, "step": 1774 }, { "epoch": 0.21848842934515017, "grad_norm": 0.11705343425273895, "learning_rate": 0.0003909964281315433, "loss": 7.5559, "step": 1775 }, { "epoch": 0.21861152141802068, "grad_norm": 
0.1521262526512146, "learning_rate": 0.0003909348441926346, "loss": 7.5636, "step": 1776 }, { "epoch": 0.21873461349089118, "grad_norm": 0.1281946897506714, "learning_rate": 0.0003908732602537258, "loss": 7.9825, "step": 1777 }, { "epoch": 0.2188577055637617, "grad_norm": 0.3395736515522003, "learning_rate": 0.0003908116763148171, "loss": 9.0238, "step": 1778 }, { "epoch": 0.2189807976366322, "grad_norm": 0.2930881381034851, "learning_rate": 0.00039075009237590834, "loss": 7.4775, "step": 1779 }, { "epoch": 0.2191038897095027, "grad_norm": 0.2409442514181137, "learning_rate": 0.0003906885084369997, "loss": 7.3432, "step": 1780 }, { "epoch": 0.2192269817823732, "grad_norm": 0.17860674858093262, "learning_rate": 0.0003906269244980909, "loss": 7.566, "step": 1781 }, { "epoch": 0.21935007385524372, "grad_norm": 0.17093788087368011, "learning_rate": 0.0003905653405591822, "loss": 7.4497, "step": 1782 }, { "epoch": 0.21947316592811422, "grad_norm": 0.2212570607662201, "learning_rate": 0.00039050375662027343, "loss": 7.5142, "step": 1783 }, { "epoch": 0.21959625800098473, "grad_norm": 0.37258610129356384, "learning_rate": 0.0003904421726813647, "loss": 7.7311, "step": 1784 }, { "epoch": 0.21971935007385524, "grad_norm": 0.21790944039821625, "learning_rate": 0.00039038058874245595, "loss": 7.3924, "step": 1785 }, { "epoch": 0.21984244214672574, "grad_norm": 0.2868238687515259, "learning_rate": 0.00039031900480354724, "loss": 7.8099, "step": 1786 }, { "epoch": 0.21996553421959625, "grad_norm": 0.12222366780042648, "learning_rate": 0.0003902574208646385, "loss": 7.3785, "step": 1787 }, { "epoch": 0.22008862629246675, "grad_norm": 0.15412795543670654, "learning_rate": 0.0003901958369257298, "loss": 7.4033, "step": 1788 }, { "epoch": 0.22021171836533726, "grad_norm": 0.1986434906721115, "learning_rate": 0.00039013425298682104, "loss": 7.6259, "step": 1789 }, { "epoch": 0.22033481043820777, "grad_norm": 0.22574491798877716, "learning_rate": 0.00039007266904791233, "loss": 
7.7177, "step": 1790 }, { "epoch": 0.22045790251107827, "grad_norm": 0.19948016107082367, "learning_rate": 0.00039001108510900356, "loss": 7.4463, "step": 1791 }, { "epoch": 0.22058099458394878, "grad_norm": 0.29150256514549255, "learning_rate": 0.00038994950117009485, "loss": 8.3736, "step": 1792 }, { "epoch": 0.2207040866568193, "grad_norm": 0.33282405138015747, "learning_rate": 0.0003898879172311861, "loss": 7.6933, "step": 1793 }, { "epoch": 0.22082717872968982, "grad_norm": 0.5389174818992615, "learning_rate": 0.0003898263332922774, "loss": 8.8275, "step": 1794 }, { "epoch": 0.22095027080256033, "grad_norm": 0.2641206383705139, "learning_rate": 0.00038976474935336865, "loss": 8.0431, "step": 1795 }, { "epoch": 0.22107336287543083, "grad_norm": 0.23208416998386383, "learning_rate": 0.00038970316541445994, "loss": 8.141, "step": 1796 }, { "epoch": 0.22119645494830134, "grad_norm": 0.26761341094970703, "learning_rate": 0.00038964158147555117, "loss": 8.613, "step": 1797 }, { "epoch": 0.22131954702117185, "grad_norm": 0.3171042501926422, "learning_rate": 0.00038957999753664246, "loss": 7.9106, "step": 1798 }, { "epoch": 0.22144263909404235, "grad_norm": 0.5122343897819519, "learning_rate": 0.0003895184135977337, "loss": 7.763, "step": 1799 }, { "epoch": 0.22156573116691286, "grad_norm": 0.5145678520202637, "learning_rate": 0.00038945682965882503, "loss": 7.5132, "step": 1800 }, { "epoch": 0.22168882323978337, "grad_norm": 0.23576690256595612, "learning_rate": 0.00038939524571991626, "loss": 8.065, "step": 1801 }, { "epoch": 0.22181191531265387, "grad_norm": 0.3608900010585785, "learning_rate": 0.00038933366178100755, "loss": 7.385, "step": 1802 }, { "epoch": 0.22193500738552438, "grad_norm": 0.22189010679721832, "learning_rate": 0.0003892720778420988, "loss": 7.5031, "step": 1803 }, { "epoch": 0.2220580994583949, "grad_norm": 0.2668802738189697, "learning_rate": 0.00038921049390319007, "loss": 7.5623, "step": 1804 }, { "epoch": 0.2221811915312654, "grad_norm": 
0.6705156564712524, "learning_rate": 0.0003891489099642813, "loss": 8.7073, "step": 1805 }, { "epoch": 0.2223042836041359, "grad_norm": 0.3987334072589874, "learning_rate": 0.0003890873260253726, "loss": 7.8192, "step": 1806 }, { "epoch": 0.2224273756770064, "grad_norm": 0.31965315341949463, "learning_rate": 0.0003890257420864639, "loss": 7.7825, "step": 1807 }, { "epoch": 0.2225504677498769, "grad_norm": 0.17168736457824707, "learning_rate": 0.00038896415814755516, "loss": 7.7596, "step": 1808 }, { "epoch": 0.22267355982274742, "grad_norm": 0.24258513748645782, "learning_rate": 0.0003889025742086464, "loss": 8.5267, "step": 1809 }, { "epoch": 0.22279665189561793, "grad_norm": 0.4703367352485657, "learning_rate": 0.0003888409902697377, "loss": 7.4696, "step": 1810 }, { "epoch": 0.22291974396848843, "grad_norm": 0.5411856174468994, "learning_rate": 0.0003887794063308289, "loss": 7.4363, "step": 1811 }, { "epoch": 0.22304283604135894, "grad_norm": 0.49856841564178467, "learning_rate": 0.0003887178223919202, "loss": 7.4934, "step": 1812 }, { "epoch": 0.22316592811422944, "grad_norm": 0.4421762228012085, "learning_rate": 0.00038865623845301143, "loss": 7.5366, "step": 1813 }, { "epoch": 0.22328902018709995, "grad_norm": 0.22384458780288696, "learning_rate": 0.0003885946545141028, "loss": 7.6656, "step": 1814 }, { "epoch": 0.22341211225997046, "grad_norm": 0.12943631410598755, "learning_rate": 0.000388533070575194, "loss": 7.6696, "step": 1815 }, { "epoch": 0.22353520433284096, "grad_norm": 0.29656925797462463, "learning_rate": 0.0003884714866362853, "loss": 7.5098, "step": 1816 }, { "epoch": 0.22365829640571147, "grad_norm": 0.36001139879226685, "learning_rate": 0.0003884099026973765, "loss": 7.3778, "step": 1817 }, { "epoch": 0.22378138847858198, "grad_norm": 0.2935301661491394, "learning_rate": 0.0003883483187584678, "loss": 7.332, "step": 1818 }, { "epoch": 0.22390448055145248, "grad_norm": 0.5122889280319214, "learning_rate": 0.00038828673481955904, "loss": 8.2554, 
"step": 1819 }, { "epoch": 0.224027572624323, "grad_norm": 0.190052330493927, "learning_rate": 0.00038822515088065033, "loss": 7.911, "step": 1820 }, { "epoch": 0.2241506646971935, "grad_norm": 0.194923534989357, "learning_rate": 0.0003881635669417416, "loss": 7.4611, "step": 1821 }, { "epoch": 0.224273756770064, "grad_norm": 0.3387841284275055, "learning_rate": 0.0003881019830028329, "loss": 7.386, "step": 1822 }, { "epoch": 0.2243968488429345, "grad_norm": 0.33558329939842224, "learning_rate": 0.00038804039906392414, "loss": 8.0792, "step": 1823 }, { "epoch": 0.22451994091580502, "grad_norm": 0.23991310596466064, "learning_rate": 0.0003879788151250154, "loss": 8.8524, "step": 1824 }, { "epoch": 0.22464303298867552, "grad_norm": 0.25065386295318604, "learning_rate": 0.00038791723118610666, "loss": 7.5122, "step": 1825 }, { "epoch": 0.22476612506154603, "grad_norm": 0.1587425172328949, "learning_rate": 0.00038785564724719794, "loss": 7.5407, "step": 1826 }, { "epoch": 0.22488921713441654, "grad_norm": 0.22321225702762604, "learning_rate": 0.00038779406330828923, "loss": 7.4834, "step": 1827 }, { "epoch": 0.22501230920728704, "grad_norm": 0.33692917227745056, "learning_rate": 0.0003877324793693805, "loss": 8.2558, "step": 1828 }, { "epoch": 0.22513540128015755, "grad_norm": 0.5445417761802673, "learning_rate": 0.00038767089543047175, "loss": 8.7097, "step": 1829 }, { "epoch": 0.22525849335302806, "grad_norm": 0.10602885484695435, "learning_rate": 0.00038760931149156303, "loss": 7.4091, "step": 1830 }, { "epoch": 0.22538158542589856, "grad_norm": 0.10785025358200073, "learning_rate": 0.00038754772755265427, "loss": 7.5768, "step": 1831 }, { "epoch": 0.22550467749876907, "grad_norm": 0.1232968419790268, "learning_rate": 0.00038748614361374555, "loss": 7.5783, "step": 1832 }, { "epoch": 0.22562776957163957, "grad_norm": 0.16317839920520782, "learning_rate": 0.0003874245596748368, "loss": 7.5407, "step": 1833 }, { "epoch": 0.22575086164451008, "grad_norm": 
0.281544029712677, "learning_rate": 0.0003873629757359281, "loss": 7.5453, "step": 1834 }, { "epoch": 0.2258739537173806, "grad_norm": 0.3755822777748108, "learning_rate": 0.00038730139179701936, "loss": 8.3584, "step": 1835 }, { "epoch": 0.2259970457902511, "grad_norm": 0.18415741622447968, "learning_rate": 0.00038723980785811065, "loss": 8.2181, "step": 1836 }, { "epoch": 0.22612013786312163, "grad_norm": 0.268178790807724, "learning_rate": 0.0003871782239192019, "loss": 7.096, "step": 1837 }, { "epoch": 0.22624322993599214, "grad_norm": 0.5945543646812439, "learning_rate": 0.00038711663998029316, "loss": 9.2636, "step": 1838 }, { "epoch": 0.22636632200886264, "grad_norm": 0.16400571167469025, "learning_rate": 0.0003870550560413844, "loss": 7.6521, "step": 1839 }, { "epoch": 0.22648941408173315, "grad_norm": 0.3063548505306244, "learning_rate": 0.0003869934721024757, "loss": 7.4578, "step": 1840 }, { "epoch": 0.22661250615460365, "grad_norm": 0.2286129742860794, "learning_rate": 0.00038693188816356697, "loss": 7.5643, "step": 1841 }, { "epoch": 0.22673559822747416, "grad_norm": 0.5296679139137268, "learning_rate": 0.00038687030422465826, "loss": 9.4661, "step": 1842 }, { "epoch": 0.22685869030034467, "grad_norm": 0.3270336091518402, "learning_rate": 0.0003868087202857495, "loss": 8.5664, "step": 1843 }, { "epoch": 0.22698178237321517, "grad_norm": 0.18689297139644623, "learning_rate": 0.0003867471363468408, "loss": 8.114, "step": 1844 }, { "epoch": 0.22710487444608568, "grad_norm": 0.27298685908317566, "learning_rate": 0.000386685552407932, "loss": 7.7342, "step": 1845 }, { "epoch": 0.2272279665189562, "grad_norm": 0.2446880042552948, "learning_rate": 0.0003866239684690233, "loss": 8.6871, "step": 1846 }, { "epoch": 0.2273510585918267, "grad_norm": 0.17233465611934662, "learning_rate": 0.0003865623845301145, "loss": 7.5145, "step": 1847 }, { "epoch": 0.2274741506646972, "grad_norm": 0.12697714567184448, "learning_rate": 0.00038650080059120587, "loss": 7.8094, 
"step": 1848 }, { "epoch": 0.2275972427375677, "grad_norm": 0.18825408816337585, "learning_rate": 0.0003864392166522971, "loss": 7.9817, "step": 1849 }, { "epoch": 0.2277203348104382, "grad_norm": 0.22567759454250336, "learning_rate": 0.0003863776327133884, "loss": 7.5634, "step": 1850 }, { "epoch": 0.22784342688330872, "grad_norm": 0.12320993095636368, "learning_rate": 0.0003863160487744796, "loss": 7.6097, "step": 1851 }, { "epoch": 0.22796651895617923, "grad_norm": 0.3698708713054657, "learning_rate": 0.0003862544648355709, "loss": 8.1521, "step": 1852 }, { "epoch": 0.22808961102904973, "grad_norm": 0.1601128876209259, "learning_rate": 0.00038619288089666214, "loss": 7.5337, "step": 1853 }, { "epoch": 0.22821270310192024, "grad_norm": 0.12990692257881165, "learning_rate": 0.0003861312969577535, "loss": 7.6689, "step": 1854 }, { "epoch": 0.22833579517479075, "grad_norm": 0.16792337596416473, "learning_rate": 0.0003860697130188447, "loss": 7.458, "step": 1855 }, { "epoch": 0.22845888724766125, "grad_norm": 0.1117393895983696, "learning_rate": 0.000386008129079936, "loss": 7.8573, "step": 1856 }, { "epoch": 0.22858197932053176, "grad_norm": 0.15052977204322815, "learning_rate": 0.00038594654514102723, "loss": 7.7153, "step": 1857 }, { "epoch": 0.22870507139340226, "grad_norm": 0.1393532156944275, "learning_rate": 0.0003858849612021185, "loss": 7.5483, "step": 1858 }, { "epoch": 0.22882816346627277, "grad_norm": 0.245570108294487, "learning_rate": 0.00038582337726320975, "loss": 7.7321, "step": 1859 }, { "epoch": 0.22895125553914328, "grad_norm": 0.16180789470672607, "learning_rate": 0.00038576179332430104, "loss": 7.6324, "step": 1860 }, { "epoch": 0.22907434761201378, "grad_norm": 0.25227120518684387, "learning_rate": 0.0003857002093853923, "loss": 8.2153, "step": 1861 }, { "epoch": 0.2291974396848843, "grad_norm": 0.18527980148792267, "learning_rate": 0.00038563862544648355, "loss": 8.0557, "step": 1862 }, { "epoch": 0.2293205317577548, "grad_norm": 
0.1696685254573822, "learning_rate": 0.00038557704150757484, "loss": 7.3318, "step": 1863 }, { "epoch": 0.2294436238306253, "grad_norm": 0.13283300399780273, "learning_rate": 0.0003855154575686661, "loss": 7.4093, "step": 1864 }, { "epoch": 0.2295667159034958, "grad_norm": 0.1664082407951355, "learning_rate": 0.00038545387362975736, "loss": 7.7908, "step": 1865 }, { "epoch": 0.22968980797636632, "grad_norm": 0.1997220814228058, "learning_rate": 0.0003853922896908486, "loss": 7.8957, "step": 1866 }, { "epoch": 0.22981290004923682, "grad_norm": 0.18136273324489594, "learning_rate": 0.0003853307057519399, "loss": 7.4506, "step": 1867 }, { "epoch": 0.22993599212210733, "grad_norm": 0.17078830301761627, "learning_rate": 0.00038526912181303117, "loss": 7.4599, "step": 1868 }, { "epoch": 0.23005908419497784, "grad_norm": 0.1790163367986679, "learning_rate": 0.00038520753787412245, "loss": 7.7832, "step": 1869 }, { "epoch": 0.23018217626784834, "grad_norm": 0.14948444068431854, "learning_rate": 0.0003851459539352137, "loss": 8.0369, "step": 1870 }, { "epoch": 0.23030526834071885, "grad_norm": 0.33605533838272095, "learning_rate": 0.00038508436999630497, "loss": 7.955, "step": 1871 }, { "epoch": 0.23042836041358936, "grad_norm": 0.12960994243621826, "learning_rate": 0.0003850227860573962, "loss": 7.255, "step": 1872 }, { "epoch": 0.23055145248645986, "grad_norm": 0.17267842590808868, "learning_rate": 0.0003849612021184875, "loss": 7.3356, "step": 1873 }, { "epoch": 0.23067454455933037, "grad_norm": 0.25702449679374695, "learning_rate": 0.0003848996181795787, "loss": 7.7034, "step": 1874 }, { "epoch": 0.23079763663220088, "grad_norm": 0.132584348320961, "learning_rate": 0.00038483803424067006, "loss": 7.4337, "step": 1875 }, { "epoch": 0.23092072870507138, "grad_norm": 0.625232458114624, "learning_rate": 0.0003847764503017613, "loss": 8.8778, "step": 1876 }, { "epoch": 0.2310438207779419, "grad_norm": 0.1667965203523636, "learning_rate": 0.0003847148663628526, "loss": 7.706, 
"step": 1877 }, { "epoch": 0.2311669128508124, "grad_norm": 0.15568025410175323, "learning_rate": 0.0003846532824239438, "loss": 7.4828, "step": 1878 }, { "epoch": 0.2312900049236829, "grad_norm": 0.3601674437522888, "learning_rate": 0.0003845916984850351, "loss": 7.9811, "step": 1879 }, { "epoch": 0.23141309699655344, "grad_norm": 0.18831153213977814, "learning_rate": 0.00038453011454612633, "loss": 7.4732, "step": 1880 }, { "epoch": 0.23153618906942394, "grad_norm": 0.3353756070137024, "learning_rate": 0.0003844685306072177, "loss": 8.5697, "step": 1881 }, { "epoch": 0.23165928114229445, "grad_norm": 0.19809968769550323, "learning_rate": 0.0003844069466683089, "loss": 8.1258, "step": 1882 }, { "epoch": 0.23178237321516496, "grad_norm": 0.3232426047325134, "learning_rate": 0.0003843453627294002, "loss": 7.5727, "step": 1883 }, { "epoch": 0.23190546528803546, "grad_norm": 0.2058699131011963, "learning_rate": 0.0003842837787904914, "loss": 7.4668, "step": 1884 }, { "epoch": 0.23202855736090597, "grad_norm": 0.10547751933336258, "learning_rate": 0.0003842221948515827, "loss": 7.583, "step": 1885 }, { "epoch": 0.23215164943377647, "grad_norm": 0.19724610447883606, "learning_rate": 0.00038416061091267395, "loss": 7.6828, "step": 1886 }, { "epoch": 0.23227474150664698, "grad_norm": 0.10685081779956818, "learning_rate": 0.00038409902697376523, "loss": 7.4262, "step": 1887 }, { "epoch": 0.2323978335795175, "grad_norm": 0.27048859000205994, "learning_rate": 0.0003840374430348565, "loss": 7.5009, "step": 1888 }, { "epoch": 0.232520925652388, "grad_norm": 0.15050533413887024, "learning_rate": 0.0003839758590959478, "loss": 7.6353, "step": 1889 }, { "epoch": 0.2326440177252585, "grad_norm": 0.23614099621772766, "learning_rate": 0.00038391427515703904, "loss": 7.5247, "step": 1890 }, { "epoch": 0.232767109798129, "grad_norm": 0.18848064541816711, "learning_rate": 0.0003838526912181303, "loss": 8.2048, "step": 1891 }, { "epoch": 0.2328902018709995, "grad_norm": 
0.17115047574043274, "learning_rate": 0.00038379110727922156, "loss": 7.598, "step": 1892 }, { "epoch": 0.23301329394387002, "grad_norm": 0.17166763544082642, "learning_rate": 0.00038372952334031284, "loss": 8.1694, "step": 1893 }, { "epoch": 0.23313638601674053, "grad_norm": 0.1724098175764084, "learning_rate": 0.0003836679394014041, "loss": 7.8223, "step": 1894 }, { "epoch": 0.23325947808961103, "grad_norm": 0.12467148154973984, "learning_rate": 0.0003836063554624954, "loss": 7.7172, "step": 1895 }, { "epoch": 0.23338257016248154, "grad_norm": 0.24064454436302185, "learning_rate": 0.00038354477152358665, "loss": 8.3097, "step": 1896 }, { "epoch": 0.23350566223535205, "grad_norm": 0.16799302399158478, "learning_rate": 0.00038348318758467794, "loss": 7.549, "step": 1897 }, { "epoch": 0.23362875430822255, "grad_norm": 0.24107703566551208, "learning_rate": 0.00038342160364576917, "loss": 7.7278, "step": 1898 }, { "epoch": 0.23375184638109306, "grad_norm": 0.1732178032398224, "learning_rate": 0.00038336001970686045, "loss": 7.6075, "step": 1899 }, { "epoch": 0.23387493845396357, "grad_norm": 0.13077926635742188, "learning_rate": 0.0003832984357679517, "loss": 7.4914, "step": 1900 }, { "epoch": 0.23399803052683407, "grad_norm": 0.18489280343055725, "learning_rate": 0.000383236851829043, "loss": 7.6636, "step": 1901 }, { "epoch": 0.23412112259970458, "grad_norm": 0.3839767575263977, "learning_rate": 0.00038317526789013426, "loss": 7.8782, "step": 1902 }, { "epoch": 0.23424421467257509, "grad_norm": 0.11311230063438416, "learning_rate": 0.00038311368395122555, "loss": 7.5495, "step": 1903 }, { "epoch": 0.2343673067454456, "grad_norm": 0.13626883924007416, "learning_rate": 0.0003830521000123168, "loss": 7.5546, "step": 1904 }, { "epoch": 0.2344903988183161, "grad_norm": 0.18092888593673706, "learning_rate": 0.00038299051607340807, "loss": 8.2604, "step": 1905 }, { "epoch": 0.2346134908911866, "grad_norm": 0.6745687127113342, "learning_rate": 0.0003829289321344993, "loss": 
9.1007, "step": 1906 }, { "epoch": 0.2347365829640571, "grad_norm": 0.16984669864177704, "learning_rate": 0.0003828673481955906, "loss": 7.6424, "step": 1907 }, { "epoch": 0.23485967503692762, "grad_norm": 0.4057793617248535, "learning_rate": 0.0003828057642566818, "loss": 9.0736, "step": 1908 }, { "epoch": 0.23498276710979812, "grad_norm": 0.29964661598205566, "learning_rate": 0.00038274418031777316, "loss": 7.3594, "step": 1909 }, { "epoch": 0.23510585918266863, "grad_norm": 0.23557475209236145, "learning_rate": 0.0003826825963788644, "loss": 7.9275, "step": 1910 }, { "epoch": 0.23522895125553914, "grad_norm": 0.40947964787483215, "learning_rate": 0.0003826210124399557, "loss": 8.7686, "step": 1911 }, { "epoch": 0.23535204332840964, "grad_norm": 0.23290729522705078, "learning_rate": 0.0003825594285010469, "loss": 7.4962, "step": 1912 }, { "epoch": 0.23547513540128015, "grad_norm": 0.27870482206344604, "learning_rate": 0.0003824978445621382, "loss": 8.3497, "step": 1913 }, { "epoch": 0.23559822747415066, "grad_norm": 0.3016793429851532, "learning_rate": 0.00038243626062322943, "loss": 7.6471, "step": 1914 }, { "epoch": 0.23572131954702116, "grad_norm": 0.18328110873699188, "learning_rate": 0.00038237467668432077, "loss": 7.4549, "step": 1915 }, { "epoch": 0.23584441161989167, "grad_norm": 0.13872793316841125, "learning_rate": 0.000382313092745412, "loss": 7.5157, "step": 1916 }, { "epoch": 0.23596750369276218, "grad_norm": 0.14290906488895416, "learning_rate": 0.0003822515088065033, "loss": 7.8414, "step": 1917 }, { "epoch": 0.23609059576563268, "grad_norm": 0.13969579339027405, "learning_rate": 0.0003821899248675945, "loss": 7.7495, "step": 1918 }, { "epoch": 0.2362136878385032, "grad_norm": 0.19157621264457703, "learning_rate": 0.0003821283409286858, "loss": 7.1697, "step": 1919 }, { "epoch": 0.2363367799113737, "grad_norm": 0.13108474016189575, "learning_rate": 0.00038206675698977704, "loss": 7.6799, "step": 1920 }, { "epoch": 0.2364598719842442, "grad_norm": 
0.17589916288852692, "learning_rate": 0.0003820051730508683, "loss": 7.5085, "step": 1921 }, { "epoch": 0.2365829640571147, "grad_norm": 0.16725695133209229, "learning_rate": 0.0003819435891119596, "loss": 7.4505, "step": 1922 }, { "epoch": 0.23670605612998524, "grad_norm": 0.11378682404756546, "learning_rate": 0.0003818820051730509, "loss": 7.4345, "step": 1923 }, { "epoch": 0.23682914820285575, "grad_norm": 0.24248650670051575, "learning_rate": 0.00038182042123414213, "loss": 7.9218, "step": 1924 }, { "epoch": 0.23695224027572626, "grad_norm": 0.2594069242477417, "learning_rate": 0.0003817588372952334, "loss": 7.535, "step": 1925 }, { "epoch": 0.23707533234859676, "grad_norm": 0.14901675283908844, "learning_rate": 0.00038169725335632465, "loss": 7.46, "step": 1926 }, { "epoch": 0.23719842442146727, "grad_norm": 0.21652108430862427, "learning_rate": 0.00038163566941741594, "loss": 8.1846, "step": 1927 }, { "epoch": 0.23732151649433778, "grad_norm": 0.1411413997411728, "learning_rate": 0.00038157408547850717, "loss": 7.5055, "step": 1928 }, { "epoch": 0.23744460856720828, "grad_norm": 0.2916218340396881, "learning_rate": 0.0003815125015395985, "loss": 8.0552, "step": 1929 }, { "epoch": 0.2375677006400788, "grad_norm": 0.21531541645526886, "learning_rate": 0.00038145091760068974, "loss": 8.3603, "step": 1930 }, { "epoch": 0.2376907927129493, "grad_norm": 0.19310374557971954, "learning_rate": 0.00038138933366178103, "loss": 8.1125, "step": 1931 }, { "epoch": 0.2378138847858198, "grad_norm": 0.26481232047080994, "learning_rate": 0.00038132774972287226, "loss": 7.4274, "step": 1932 }, { "epoch": 0.2379369768586903, "grad_norm": 0.19769765436649323, "learning_rate": 0.00038126616578396355, "loss": 7.6995, "step": 1933 }, { "epoch": 0.23806006893156081, "grad_norm": 0.1592164784669876, "learning_rate": 0.0003812045818450548, "loss": 8.2574, "step": 1934 }, { "epoch": 0.23818316100443132, "grad_norm": 0.15142609179019928, "learning_rate": 0.00038114299790614607, "loss": 
7.3018, "step": 1935 }, { "epoch": 0.23830625307730183, "grad_norm": 0.18185003101825714, "learning_rate": 0.00038108141396723735, "loss": 7.477, "step": 1936 }, { "epoch": 0.23842934515017233, "grad_norm": 0.2270456701517105, "learning_rate": 0.00038101983002832864, "loss": 7.5666, "step": 1937 }, { "epoch": 0.23855243722304284, "grad_norm": 0.190480038523674, "learning_rate": 0.00038095824608941987, "loss": 7.3715, "step": 1938 }, { "epoch": 0.23867552929591335, "grad_norm": 0.12764622271060944, "learning_rate": 0.00038089666215051116, "loss": 7.4431, "step": 1939 }, { "epoch": 0.23879862136878385, "grad_norm": 0.16059459745883942, "learning_rate": 0.0003808350782116024, "loss": 7.7159, "step": 1940 }, { "epoch": 0.23892171344165436, "grad_norm": 0.15414388477802277, "learning_rate": 0.0003807734942726937, "loss": 7.9067, "step": 1941 }, { "epoch": 0.23904480551452487, "grad_norm": 0.19843775033950806, "learning_rate": 0.00038071191033378496, "loss": 7.7539, "step": 1942 }, { "epoch": 0.23916789758739537, "grad_norm": 0.27228960394859314, "learning_rate": 0.00038065032639487625, "loss": 7.4909, "step": 1943 }, { "epoch": 0.23929098966026588, "grad_norm": 0.20403549075126648, "learning_rate": 0.0003805887424559675, "loss": 7.7543, "step": 1944 }, { "epoch": 0.23941408173313639, "grad_norm": 0.2392834722995758, "learning_rate": 0.00038052715851705877, "loss": 8.2523, "step": 1945 }, { "epoch": 0.2395371738060069, "grad_norm": 0.15474237501621246, "learning_rate": 0.00038046557457815, "loss": 7.5967, "step": 1946 }, { "epoch": 0.2396602658788774, "grad_norm": 0.1916494220495224, "learning_rate": 0.0003804039906392413, "loss": 7.6604, "step": 1947 }, { "epoch": 0.2397833579517479, "grad_norm": 0.14506374299526215, "learning_rate": 0.0003803424067003325, "loss": 7.6313, "step": 1948 }, { "epoch": 0.2399064500246184, "grad_norm": 0.14449436962604523, "learning_rate": 0.00038028082276142386, "loss": 7.6259, "step": 1949 }, { "epoch": 0.24002954209748892, "grad_norm": 
0.11419320106506348, "learning_rate": 0.0003802192388225151, "loss": 7.5187, "step": 1950 }, { "epoch": 0.24015263417035942, "grad_norm": 0.45232346653938293, "learning_rate": 0.0003801576548836064, "loss": 8.7964, "step": 1951 }, { "epoch": 0.24027572624322993, "grad_norm": 0.2581331431865692, "learning_rate": 0.0003800960709446976, "loss": 7.4288, "step": 1952 }, { "epoch": 0.24039881831610044, "grad_norm": 0.27208906412124634, "learning_rate": 0.0003800344870057889, "loss": 7.6631, "step": 1953 }, { "epoch": 0.24052191038897094, "grad_norm": 0.22026222944259644, "learning_rate": 0.00037997290306688013, "loss": 7.5847, "step": 1954 }, { "epoch": 0.24064500246184145, "grad_norm": 0.24304038286209106, "learning_rate": 0.0003799113191279714, "loss": 7.2263, "step": 1955 }, { "epoch": 0.24076809453471196, "grad_norm": 0.566282331943512, "learning_rate": 0.0003798497351890627, "loss": 7.4569, "step": 1956 }, { "epoch": 0.24089118660758246, "grad_norm": 0.32583048939704895, "learning_rate": 0.000379788151250154, "loss": 7.6208, "step": 1957 }, { "epoch": 0.24101427868045297, "grad_norm": 0.23265179991722107, "learning_rate": 0.0003797265673112452, "loss": 7.1535, "step": 1958 }, { "epoch": 0.24113737075332348, "grad_norm": 0.33120405673980713, "learning_rate": 0.0003796649833723365, "loss": 7.7471, "step": 1959 }, { "epoch": 0.24126046282619398, "grad_norm": 0.232996866106987, "learning_rate": 0.00037960339943342774, "loss": 7.5152, "step": 1960 }, { "epoch": 0.2413835548990645, "grad_norm": 0.21071955561637878, "learning_rate": 0.00037954181549451903, "loss": 7.5556, "step": 1961 }, { "epoch": 0.241506646971935, "grad_norm": 0.2952824831008911, "learning_rate": 0.00037948023155561026, "loss": 7.4662, "step": 1962 }, { "epoch": 0.2416297390448055, "grad_norm": 0.18555401265621185, "learning_rate": 0.0003794186476167016, "loss": 7.762, "step": 1963 }, { "epoch": 0.241752831117676, "grad_norm": 0.28557348251342773, "learning_rate": 0.00037935706367779284, "loss": 7.4842, 
"step": 1964 }, { "epoch": 0.24187592319054652, "grad_norm": 0.3880827724933624, "learning_rate": 0.0003792954797388841, "loss": 7.2939, "step": 1965 }, { "epoch": 0.24199901526341702, "grad_norm": 0.4897318482398987, "learning_rate": 0.00037923389579997536, "loss": 9.4305, "step": 1966 }, { "epoch": 0.24212210733628756, "grad_norm": 0.3369692265987396, "learning_rate": 0.00037917231186106664, "loss": 7.7585, "step": 1967 }, { "epoch": 0.24224519940915806, "grad_norm": 0.32576674222946167, "learning_rate": 0.0003791107279221579, "loss": 7.7663, "step": 1968 }, { "epoch": 0.24236829148202857, "grad_norm": 0.3232834041118622, "learning_rate": 0.0003790491439832492, "loss": 7.8473, "step": 1969 }, { "epoch": 0.24249138355489908, "grad_norm": 0.2560626268386841, "learning_rate": 0.00037898756004434045, "loss": 7.8914, "step": 1970 }, { "epoch": 0.24261447562776958, "grad_norm": 0.3762117326259613, "learning_rate": 0.00037892597610543173, "loss": 8.6899, "step": 1971 }, { "epoch": 0.2427375677006401, "grad_norm": 0.1510416865348816, "learning_rate": 0.00037886439216652297, "loss": 7.5939, "step": 1972 }, { "epoch": 0.2428606597735106, "grad_norm": 0.2093522995710373, "learning_rate": 0.00037880280822761425, "loss": 7.2151, "step": 1973 }, { "epoch": 0.2429837518463811, "grad_norm": 0.13348884880542755, "learning_rate": 0.0003787412242887055, "loss": 7.5086, "step": 1974 }, { "epoch": 0.2431068439192516, "grad_norm": null, "learning_rate": 0.00037867964034979677, "loss": 7.7292, "step": 1975 }, { "epoch": 0.24322993599212211, "grad_norm": 0.24010050296783447, "learning_rate": 0.00037861805641088806, "loss": 7.5455, "step": 1976 }, { "epoch": 0.24335302806499262, "grad_norm": 0.39240890741348267, "learning_rate": 0.00037855647247197935, "loss": 8.0101, "step": 1977 }, { "epoch": 0.24347612013786313, "grad_norm": 0.9255909323692322, "learning_rate": 0.0003784948885330706, "loss": 8.1334, "step": 1978 }, { "epoch": 0.24359921221073363, "grad_norm": 0.23192283511161804,
"learning_rate": 0.00037843330459416186, "loss": 7.9302, "step": 1979 }, { "epoch": 0.24372230428360414, "grad_norm": 0.3779674768447876, "learning_rate": 0.0003783717206552531, "loss": 8.8434, "step": 1980 }, { "epoch": 0.24384539635647465, "grad_norm": 0.5765228271484375, "learning_rate": 0.0003783101367163444, "loss": 8.2626, "step": 1981 }, { "epoch": 0.24396848842934515, "grad_norm": 0.26903393864631653, "learning_rate": 0.0003782485527774356, "loss": 7.3625, "step": 1982 }, { "epoch": 0.24409158050221566, "grad_norm": 0.8886765241622925, "learning_rate": 0.00037818696883852696, "loss": 7.8344, "step": 1983 }, { "epoch": 0.24421467257508617, "grad_norm": 1.1802423000335693, "learning_rate": 0.0003781253848996182, "loss": 8.0432, "step": 1984 }, { "epoch": 0.24433776464795667, "grad_norm": 0.6982890963554382, "learning_rate": 0.0003780638009607095, "loss": 7.7318, "step": 1985 }, { "epoch": 0.24446085672082718, "grad_norm": 0.3705436587333679, "learning_rate": 0.0003780022170218007, "loss": 7.7015, "step": 1986 }, { "epoch": 0.2445839487936977, "grad_norm": 0.678989052772522, "learning_rate": 0.000377940633082892, "loss": 7.4142, "step": 1987 }, { "epoch": 0.2447070408665682, "grad_norm": 0.2288515567779541, "learning_rate": 0.0003778790491439832, "loss": 7.9781, "step": 1988 }, { "epoch": 0.2448301329394387, "grad_norm": 0.42117950320243835, "learning_rate": 0.0003778174652050745, "loss": 7.686, "step": 1989 }, { "epoch": 0.2449532250123092, "grad_norm": 0.571668267250061, "learning_rate": 0.0003777558812661658, "loss": 8.6019, "step": 1990 }, { "epoch": 0.2450763170851797, "grad_norm": 0.4033123254776001, "learning_rate": 0.0003776942973272571, "loss": 8.9584, "step": 1991 }, { "epoch": 0.24519940915805022, "grad_norm": 860327650525184.0, "learning_rate": 0.0003776327133883483, "loss": 8.3015, "step": 1992 }, { "epoch": 0.24532250123092073, "grad_norm": 0.8039635419845581, "learning_rate": 0.0003775711294494396, "loss": 8.3226, "step": 1993 }, { "epoch": 
0.24544559330379123, "grad_norm": 0.6807208061218262, "learning_rate": 0.00037750954551053084, "loss": 8.6553, "step": 1994 }, { "epoch": 0.24556868537666174, "grad_norm": 0.7369422912597656, "learning_rate": 0.0003774479615716221, "loss": 7.7934, "step": 1995 }, { "epoch": 0.24569177744953224, "grad_norm": 0.250435471534729, "learning_rate": 0.0003773863776327134, "loss": 7.9326, "step": 1996 }, { "epoch": 0.24581486952240275, "grad_norm": 0.571788489818573, "learning_rate": 0.0003773247936938047, "loss": 7.231, "step": 1997 }, { "epoch": 0.24593796159527326, "grad_norm": 1.0424494743347168, "learning_rate": 0.00037726320975489593, "loss": 7.6592, "step": 1998 }, { "epoch": 0.24606105366814376, "grad_norm": 0.910938024520874, "learning_rate": 0.0003772016258159872, "loss": 7.3069, "step": 1999 }, { "epoch": 0.24618414574101427, "grad_norm": 0.7344673275947571, "learning_rate": 0.00037714004187707845, "loss": 7.7886, "step": 2000 }, { "epoch": 0.24630723781388478, "grad_norm": 0.18475157022476196, "learning_rate": 0.00037707845793816974, "loss": 7.2653, "step": 2001 }, { "epoch": 0.24643032988675528, "grad_norm": 0.3352241516113281, "learning_rate": 0.00037701687399926097, "loss": 7.5934, "step": 2002 }, { "epoch": 0.2465534219596258, "grad_norm": 0.5231695175170898, "learning_rate": 0.0003769552900603523, "loss": 7.5621, "step": 2003 }, { "epoch": 0.2466765140324963, "grad_norm": 0.2170712798833847, "learning_rate": 0.00037689370612144354, "loss": 7.8205, "step": 2004 }, { "epoch": 0.2467996061053668, "grad_norm": 0.3514639735221863, "learning_rate": 0.00037683212218253483, "loss": 7.9715, "step": 2005 }, { "epoch": 0.2469226981782373, "grad_norm": 0.2572392225265503, "learning_rate": 0.00037677053824362606, "loss": 7.4306, "step": 2006 }, { "epoch": 0.24704579025110782, "grad_norm": 0.10715219378471375, "learning_rate": 0.00037670895430471735, "loss": 7.3493, "step": 2007 }, { "epoch": 0.24716888232397832, "grad_norm": 0.7060626149177551, "learning_rate": 
0.0003766473703658086, "loss": 7.1322, "step": 2008 }, { "epoch": 0.24729197439684883, "grad_norm": 0.2926163077354431, "learning_rate": 0.00037658578642689987, "loss": 7.5793, "step": 2009 }, { "epoch": 0.24741506646971936, "grad_norm": 0.26288139820098877, "learning_rate": 0.00037652420248799115, "loss": 7.3422, "step": 2010 }, { "epoch": 0.24753815854258987, "grad_norm": 0.37009039521217346, "learning_rate": 0.00037646261854908244, "loss": 7.5584, "step": 2011 }, { "epoch": 0.24766125061546038, "grad_norm": 0.5136446952819824, "learning_rate": 0.00037640103461017367, "loss": 7.7257, "step": 2012 }, { "epoch": 0.24778434268833088, "grad_norm": 0.3851381540298462, "learning_rate": 0.00037633945067126496, "loss": 7.9824, "step": 2013 }, { "epoch": 0.2479074347612014, "grad_norm": 0.12507452070713043, "learning_rate": 0.0003762778667323562, "loss": 7.5825, "step": 2014 }, { "epoch": 0.2480305268340719, "grad_norm": 0.5199063420295715, "learning_rate": 0.0003762162827934475, "loss": 7.3288, "step": 2015 }, { "epoch": 0.2481536189069424, "grad_norm": 0.2469012290239334, "learning_rate": 0.0003761546988545387, "loss": 8.0153, "step": 2016 }, { "epoch": 0.2482767109798129, "grad_norm": 0.2243659347295761, "learning_rate": 0.00037609311491563005, "loss": 7.3934, "step": 2017 }, { "epoch": 0.24839980305268342, "grad_norm": 0.2684996426105499, "learning_rate": 0.0003760315309767213, "loss": 7.9521, "step": 2018 }, { "epoch": 0.24852289512555392, "grad_norm": 0.3204903304576874, "learning_rate": 0.00037596994703781257, "loss": 7.6829, "step": 2019 }, { "epoch": 0.24864598719842443, "grad_norm": 0.28044313192367554, "learning_rate": 0.0003759083630989038, "loss": 8.0302, "step": 2020 }, { "epoch": 0.24876907927129494, "grad_norm": 0.1905449479818344, "learning_rate": 0.0003758467791599951, "loss": 7.6888, "step": 2021 }, { "epoch": 0.24889217134416544, "grad_norm": 0.19687007367610931, "learning_rate": 0.0003757851952210863, "loss": 7.9818, "step": 2022 }, { "epoch": 
0.24901526341703595, "grad_norm": 0.1920355260372162, "learning_rate": 0.00037572361128217766, "loss": 7.7551, "step": 2023 }, { "epoch": 0.24913835548990645, "grad_norm": 0.22065915167331696, "learning_rate": 0.0003756620273432689, "loss": 7.373, "step": 2024 }, { "epoch": 0.24926144756277696, "grad_norm": 0.41144901514053345, "learning_rate": 0.0003756004434043602, "loss": 8.2315, "step": 2025 }, { "epoch": 0.24938453963564747, "grad_norm": 0.32030731439590454, "learning_rate": 0.0003755388594654514, "loss": 7.7871, "step": 2026 }, { "epoch": 0.24950763170851797, "grad_norm": 0.1228729709982872, "learning_rate": 0.0003754772755265427, "loss": 7.4501, "step": 2027 }, { "epoch": 0.24963072378138848, "grad_norm": 0.26390528678894043, "learning_rate": 0.00037541569158763393, "loss": 7.4564, "step": 2028 }, { "epoch": 0.249753815854259, "grad_norm": 0.1292349100112915, "learning_rate": 0.0003753541076487252, "loss": 7.8351, "step": 2029 }, { "epoch": 0.2498769079271295, "grad_norm": 0.13853324949741364, "learning_rate": 0.0003752925237098165, "loss": 7.9864, "step": 2030 }, { "epoch": 0.25, "grad_norm": 0.16773541271686554, "learning_rate": 0.0003752309397709078, "loss": 7.4531, "step": 2031 }, { "epoch": 0.25012309207287053, "grad_norm": 0.1378672868013382, "learning_rate": 0.000375169355831999, "loss": 7.4578, "step": 2032 }, { "epoch": 0.250246184145741, "grad_norm": 0.26882660388946533, "learning_rate": 0.0003751077718930903, "loss": 7.5945, "step": 2033 }, { "epoch": 0.25036927621861155, "grad_norm": 0.22890746593475342, "learning_rate": 0.00037504618795418154, "loss": 7.6811, "step": 2034 }, { "epoch": 0.250492368291482, "grad_norm": 0.2330988496541977, "learning_rate": 0.00037498460401527283, "loss": 7.4647, "step": 2035 }, { "epoch": 0.25061546036435256, "grad_norm": 0.19114917516708374, "learning_rate": 0.00037492302007636406, "loss": 8.298, "step": 2036 }, { "epoch": 0.25073855243722304, "grad_norm": 0.18642956018447876, "learning_rate": 
0.0003748614361374554, "loss": 7.5646, "step": 2037 }, { "epoch": 0.2508616445100936, "grad_norm": 0.11352791637182236, "learning_rate": 0.00037479985219854664, "loss": 7.73, "step": 2038 }, { "epoch": 0.25098473658296405, "grad_norm": 0.1370425820350647, "learning_rate": 0.0003747382682596379, "loss": 7.5127, "step": 2039 }, { "epoch": 0.2511078286558346, "grad_norm": 0.22244508564472198, "learning_rate": 0.00037467668432072915, "loss": 8.0442, "step": 2040 }, { "epoch": 0.25123092072870506, "grad_norm": 0.26535266637802124, "learning_rate": 0.00037461510038182044, "loss": 7.7742, "step": 2041 }, { "epoch": 0.2513540128015756, "grad_norm": 0.3462907373905182, "learning_rate": 0.0003745535164429117, "loss": 7.104, "step": 2042 }, { "epoch": 0.2514771048744461, "grad_norm": 0.26302844285964966, "learning_rate": 0.00037449193250400296, "loss": 7.502, "step": 2043 }, { "epoch": 0.2516001969473166, "grad_norm": 0.3047846257686615, "learning_rate": 0.00037443034856509425, "loss": 7.8433, "step": 2044 }, { "epoch": 0.2517232890201871, "grad_norm": 0.31006741523742676, "learning_rate": 0.00037436876462618553, "loss": 7.0758, "step": 2045 }, { "epoch": 0.2518463810930576, "grad_norm": 0.9444878101348877, "learning_rate": 0.00037430718068727677, "loss": 10.2305, "step": 2046 }, { "epoch": 0.2519694731659281, "grad_norm": 0.30328184366226196, "learning_rate": 0.00037424559674836805, "loss": 7.8846, "step": 2047 }, { "epoch": 0.25209256523879864, "grad_norm": 0.34785693883895874, "learning_rate": 0.0003741840128094593, "loss": 9.0726, "step": 2048 }, { "epoch": 0.2522156573116691, "grad_norm": 0.32260510325431824, "learning_rate": 0.00037412242887055057, "loss": 7.7959, "step": 2049 }, { "epoch": 0.25233874938453965, "grad_norm": 0.476485937833786, "learning_rate": 0.00037406084493164186, "loss": 7.8541, "step": 2050 }, { "epoch": 0.25246184145741013, "grad_norm": 0.1917741298675537, "learning_rate": 0.00037399926099273314, "loss": 8.8557, "step": 2051 }, { "epoch": 
0.25258493353028066, "grad_norm": 0.2529318034648895, "learning_rate": 0.0003739376770538244, "loss": 8.3108, "step": 2052 }, { "epoch": 0.25270802560315114, "grad_norm": 0.1790156066417694, "learning_rate": 0.00037387609311491566, "loss": 7.602, "step": 2053 }, { "epoch": 0.2528311176760217, "grad_norm": 0.2838972508907318, "learning_rate": 0.0003738145091760069, "loss": 7.8727, "step": 2054 }, { "epoch": 0.25295420974889216, "grad_norm": 0.19945381581783295, "learning_rate": 0.0003737529252370982, "loss": 7.3482, "step": 2055 }, { "epoch": 0.2530773018217627, "grad_norm": 0.14809872210025787, "learning_rate": 0.0003736913412981894, "loss": 7.5999, "step": 2056 }, { "epoch": 0.25320039389463317, "grad_norm": 0.2595725953578949, "learning_rate": 0.00037362975735928076, "loss": 7.8843, "step": 2057 }, { "epoch": 0.2533234859675037, "grad_norm": 0.24444526433944702, "learning_rate": 0.000373568173420372, "loss": 7.4568, "step": 2058 }, { "epoch": 0.2534465780403742, "grad_norm": 0.2746480405330658, "learning_rate": 0.0003735065894814633, "loss": 8.2409, "step": 2059 }, { "epoch": 0.2535696701132447, "grad_norm": 0.16376186907291412, "learning_rate": 0.0003734450055425545, "loss": 7.7517, "step": 2060 }, { "epoch": 0.2536927621861152, "grad_norm": 0.1554461419582367, "learning_rate": 0.0003733834216036458, "loss": 7.4016, "step": 2061 }, { "epoch": 0.25381585425898573, "grad_norm": 0.17521882057189941, "learning_rate": 0.000373321837664737, "loss": 7.6462, "step": 2062 }, { "epoch": 0.2539389463318562, "grad_norm": 0.16024868190288544, "learning_rate": 0.0003732602537258283, "loss": 7.4386, "step": 2063 }, { "epoch": 0.25406203840472674, "grad_norm": 0.21217064559459686, "learning_rate": 0.0003731986697869196, "loss": 7.6335, "step": 2064 }, { "epoch": 0.2541851304775972, "grad_norm": 0.13329093158245087, "learning_rate": 0.0003731370858480109, "loss": 7.8501, "step": 2065 }, { "epoch": 0.25430822255046776, "grad_norm": 0.16974471509456635, "learning_rate": 
0.0003730755019091021, "loss": 7.7217, "step": 2066 }, { "epoch": 0.25443131462333823, "grad_norm": 0.23089531064033508, "learning_rate": 0.0003730139179701934, "loss": 7.3661, "step": 2067 }, { "epoch": 0.25455440669620877, "grad_norm": 0.17399950325489044, "learning_rate": 0.00037295233403128464, "loss": 8.2717, "step": 2068 }, { "epoch": 0.25467749876907925, "grad_norm": 0.2558029592037201, "learning_rate": 0.0003728907500923759, "loss": 8.9503, "step": 2069 }, { "epoch": 0.2548005908419498, "grad_norm": 0.3017549216747284, "learning_rate": 0.00037282916615346716, "loss": 7.51, "step": 2070 }, { "epoch": 0.25492368291482026, "grad_norm": 0.22369243204593658, "learning_rate": 0.0003727675822145585, "loss": 8.0236, "step": 2071 }, { "epoch": 0.2550467749876908, "grad_norm": 0.14781737327575684, "learning_rate": 0.00037270599827564973, "loss": 7.7691, "step": 2072 }, { "epoch": 0.2551698670605613, "grad_norm": 0.23606817424297333, "learning_rate": 0.000372644414336741, "loss": 7.8025, "step": 2073 }, { "epoch": 0.2552929591334318, "grad_norm": 0.12777239084243774, "learning_rate": 0.00037258283039783225, "loss": 7.4106, "step": 2074 }, { "epoch": 0.25541605120630234, "grad_norm": 0.09705370664596558, "learning_rate": 0.00037252124645892353, "loss": 7.4929, "step": 2075 }, { "epoch": 0.2555391432791728, "grad_norm": 0.2641512155532837, "learning_rate": 0.00037245966252001477, "loss": 8.2855, "step": 2076 }, { "epoch": 0.25566223535204335, "grad_norm": 0.10384552925825119, "learning_rate": 0.0003723980785811061, "loss": 8.0916, "step": 2077 }, { "epoch": 0.25578532742491383, "grad_norm": 0.41846197843551636, "learning_rate": 0.00037233649464219734, "loss": 7.3637, "step": 2078 }, { "epoch": 0.25590841949778437, "grad_norm": 0.3006038963794708, "learning_rate": 0.0003722749107032886, "loss": 7.6252, "step": 2079 }, { "epoch": 0.25603151157065485, "grad_norm": 0.32001057267189026, "learning_rate": 0.00037221332676437986, "loss": 7.5223, "step": 2080 }, { "epoch": 
0.2561546036435254, "grad_norm": 0.124283067882061, "learning_rate": 0.00037215174282547115, "loss": 7.864, "step": 2081 }, { "epoch": 0.25627769571639586, "grad_norm": 0.6347967982292175, "learning_rate": 0.0003720901588865624, "loss": 9.0846, "step": 2082 }, { "epoch": 0.2564007877892664, "grad_norm": 0.2769257426261902, "learning_rate": 0.00037202857494765366, "loss": 7.6928, "step": 2083 }, { "epoch": 0.25652387986213687, "grad_norm": 0.24518699944019318, "learning_rate": 0.00037196699100874495, "loss": 7.7856, "step": 2084 }, { "epoch": 0.2566469719350074, "grad_norm": 0.11526468396186829, "learning_rate": 0.00037190540706983624, "loss": 7.8037, "step": 2085 }, { "epoch": 0.2567700640078779, "grad_norm": 0.18234838545322418, "learning_rate": 0.00037184382313092747, "loss": 7.9592, "step": 2086 }, { "epoch": 0.2568931560807484, "grad_norm": 0.23406672477722168, "learning_rate": 0.00037178223919201876, "loss": 7.6202, "step": 2087 }, { "epoch": 0.2570162481536189, "grad_norm": 0.12474343180656433, "learning_rate": 0.00037172065525311, "loss": 7.7218, "step": 2088 }, { "epoch": 0.25713934022648943, "grad_norm": 0.15037888288497925, "learning_rate": 0.0003716590713142013, "loss": 7.4371, "step": 2089 }, { "epoch": 0.2572624322993599, "grad_norm": 0.15371066331863403, "learning_rate": 0.0003715974873752925, "loss": 7.5033, "step": 2090 }, { "epoch": 0.25738552437223045, "grad_norm": 0.4257849156856537, "learning_rate": 0.00037153590343638385, "loss": 7.8327, "step": 2091 }, { "epoch": 0.2575086164451009, "grad_norm": 0.29940465092658997, "learning_rate": 0.0003714743194974751, "loss": 7.4264, "step": 2092 }, { "epoch": 0.25763170851797146, "grad_norm": 0.26558059453964233, "learning_rate": 0.0003714127355585663, "loss": 7.933, "step": 2093 }, { "epoch": 0.25775480059084194, "grad_norm": 0.10952186584472656, "learning_rate": 0.0003713511516196576, "loss": 7.7296, "step": 2094 }, { "epoch": 0.25787789266371247, "grad_norm": 0.22756798565387726, "learning_rate": 
0.00037128956768074883, "loss": 8.001, "step": 2095 }, { "epoch": 0.25800098473658295, "grad_norm": 0.24291297793388367, "learning_rate": 0.0003712279837418401, "loss": 7.4503, "step": 2096 }, { "epoch": 0.2581240768094535, "grad_norm": 0.22531507909297943, "learning_rate": 0.00037116639980293135, "loss": 7.3609, "step": 2097 }, { "epoch": 0.25824716888232396, "grad_norm": 0.11526252329349518, "learning_rate": 0.0003711048158640227, "loss": 7.4937, "step": 2098 }, { "epoch": 0.2583702609551945, "grad_norm": 0.3307863473892212, "learning_rate": 0.0003710432319251139, "loss": 8.1499, "step": 2099 }, { "epoch": 0.258493353028065, "grad_norm": 0.2835419178009033, "learning_rate": 0.0003709816479862052, "loss": 6.8272, "step": 2100 }, { "epoch": 0.2586164451009355, "grad_norm": 0.3316633105278015, "learning_rate": 0.00037092006404729644, "loss": 7.696, "step": 2101 }, { "epoch": 0.258739537173806, "grad_norm": 0.3851408362388611, "learning_rate": 0.00037085848010838773, "loss": 8.0577, "step": 2102 }, { "epoch": 0.2588626292466765, "grad_norm": 0.1574193388223648, "learning_rate": 0.00037079689616947896, "loss": 7.5373, "step": 2103 }, { "epoch": 0.258985721319547, "grad_norm": 0.2711457908153534, "learning_rate": 0.00037073531223057025, "loss": 7.6611, "step": 2104 }, { "epoch": 0.25910881339241754, "grad_norm": 0.1593765765428543, "learning_rate": 0.00037067372829166154, "loss": 7.8159, "step": 2105 }, { "epoch": 0.259231905465288, "grad_norm": 0.33597531914711, "learning_rate": 0.0003706121443527528, "loss": 7.3827, "step": 2106 }, { "epoch": 0.25935499753815855, "grad_norm": 0.13805201649665833, "learning_rate": 0.00037055056041384406, "loss": 7.5011, "step": 2107 }, { "epoch": 0.25947808961102903, "grad_norm": 0.47755277156829834, "learning_rate": 0.00037048897647493534, "loss": 8.4709, "step": 2108 }, { "epoch": 0.25960118168389956, "grad_norm": 0.22819532454013824, "learning_rate": 0.0003704273925360266, "loss": 7.4117, "step": 2109 }, { "epoch": 
0.25972427375677004, "grad_norm": 0.1820870339870453, "learning_rate": 0.00037036580859711786, "loss": 7.5539, "step": 2110 }, { "epoch": 0.2598473658296406, "grad_norm": 0.392304390668869, "learning_rate": 0.00037030422465820915, "loss": 8.797, "step": 2111 }, { "epoch": 0.25997045790251105, "grad_norm": 0.5606361031532288, "learning_rate": 0.00037024264071930043, "loss": 7.0934, "step": 2112 }, { "epoch": 0.2600935499753816, "grad_norm": 0.23417678475379944, "learning_rate": 0.00037018105678039167, "loss": 7.6898, "step": 2113 }, { "epoch": 0.26021664204825207, "grad_norm": 0.1645912379026413, "learning_rate": 0.00037011947284148295, "loss": 7.8251, "step": 2114 }, { "epoch": 0.2603397341211226, "grad_norm": 0.2571203410625458, "learning_rate": 0.0003700578889025742, "loss": 7.4076, "step": 2115 }, { "epoch": 0.2604628261939931, "grad_norm": 0.12133367359638214, "learning_rate": 0.00036999630496366547, "loss": 7.8231, "step": 2116 }, { "epoch": 0.2605859182668636, "grad_norm": 0.15435907244682312, "learning_rate": 0.0003699347210247567, "loss": 7.7601, "step": 2117 }, { "epoch": 0.26070901033973415, "grad_norm": 0.2878307104110718, "learning_rate": 0.00036987313708584805, "loss": 8.0592, "step": 2118 }, { "epoch": 0.2608321024126046, "grad_norm": 0.10246799141168594, "learning_rate": 0.0003698115531469393, "loss": 7.4164, "step": 2119 }, { "epoch": 0.26095519448547516, "grad_norm": 0.12924937903881073, "learning_rate": 0.00036974996920803056, "loss": 7.9015, "step": 2120 }, { "epoch": 0.26107828655834564, "grad_norm": 0.11433020234107971, "learning_rate": 0.0003696883852691218, "loss": 8.1133, "step": 2121 }, { "epoch": 0.2612013786312162, "grad_norm": 0.2862659990787506, "learning_rate": 0.0003696268013302131, "loss": 7.5498, "step": 2122 }, { "epoch": 0.26132447070408665, "grad_norm": 0.12423577904701233, "learning_rate": 0.0003695652173913043, "loss": 7.8661, "step": 2123 }, { "epoch": 0.2614475627769572, "grad_norm": 0.14364948868751526, "learning_rate": 
0.0003695036334523956, "loss": 7.7861, "step": 2124 }, { "epoch": 0.26157065484982767, "grad_norm": 0.1942935734987259, "learning_rate": 0.0003694420495134869, "loss": 7.7779, "step": 2125 }, { "epoch": 0.2616937469226982, "grad_norm": 0.1822557896375656, "learning_rate": 0.0003693804655745782, "loss": 7.4868, "step": 2126 }, { "epoch": 0.2618168389955687, "grad_norm": 0.2869398891925812, "learning_rate": 0.0003693188816356694, "loss": 7.8874, "step": 2127 }, { "epoch": 0.2619399310684392, "grad_norm": 0.10063500702381134, "learning_rate": 0.0003692572976967607, "loss": 7.5816, "step": 2128 }, { "epoch": 0.2620630231413097, "grad_norm": 0.2658795714378357, "learning_rate": 0.0003691957137578519, "loss": 7.5584, "step": 2129 }, { "epoch": 0.2621861152141802, "grad_norm": 0.1741270273923874, "learning_rate": 0.0003691341298189432, "loss": 8.0101, "step": 2130 }, { "epoch": 0.2623092072870507, "grad_norm": 0.23797205090522766, "learning_rate": 0.00036907254588003445, "loss": 7.4806, "step": 2131 }, { "epoch": 0.26243229935992124, "grad_norm": 0.09729763865470886, "learning_rate": 0.0003690109619411258, "loss": 7.6782, "step": 2132 }, { "epoch": 0.2625553914327917, "grad_norm": 0.19431520998477936, "learning_rate": 0.000368949378002217, "loss": 7.5181, "step": 2133 }, { "epoch": 0.26267848350566225, "grad_norm": 0.1932532787322998, "learning_rate": 0.0003688877940633083, "loss": 7.877, "step": 2134 }, { "epoch": 0.26280157557853273, "grad_norm": 0.25776806473731995, "learning_rate": 0.00036882621012439954, "loss": 7.5173, "step": 2135 }, { "epoch": 0.26292466765140327, "grad_norm": 0.31231293082237244, "learning_rate": 0.0003687646261854908, "loss": 8.6332, "step": 2136 }, { "epoch": 0.26304775972427374, "grad_norm": 0.13848532736301422, "learning_rate": 0.00036870304224658206, "loss": 7.4193, "step": 2137 }, { "epoch": 0.2631708517971443, "grad_norm": 0.12789441645145416, "learning_rate": 0.0003686414583076734, "loss": 7.7735, "step": 2138 }, { "epoch": 
0.26329394387001476, "grad_norm": 0.21536144614219666, "learning_rate": 0.00036857987436876463, "loss": 8.5863, "step": 2139 }, { "epoch": 0.2634170359428853, "grad_norm": 0.2677113711833954, "learning_rate": 0.0003685182904298559, "loss": 7.4602, "step": 2140 }, { "epoch": 0.26354012801575577, "grad_norm": 0.31018251180648804, "learning_rate": 0.00036845670649094715, "loss": 7.563, "step": 2141 }, { "epoch": 0.2636632200886263, "grad_norm": 0.1574394404888153, "learning_rate": 0.00036839512255203844, "loss": 7.551, "step": 2142 }, { "epoch": 0.2637863121614968, "grad_norm": 0.1455293744802475, "learning_rate": 0.00036833353861312967, "loss": 7.4301, "step": 2143 }, { "epoch": 0.2639094042343673, "grad_norm": 0.37420210242271423, "learning_rate": 0.00036827195467422095, "loss": 7.922, "step": 2144 }, { "epoch": 0.2640324963072378, "grad_norm": 0.1872366964817047, "learning_rate": 0.00036821037073531224, "loss": 7.3679, "step": 2145 }, { "epoch": 0.26415558838010833, "grad_norm": 0.3245837986469269, "learning_rate": 0.00036814878679640353, "loss": 8.687, "step": 2146 }, { "epoch": 0.2642786804529788, "grad_norm": 0.2094496190547943, "learning_rate": 0.00036808720285749476, "loss": 8.482, "step": 2147 }, { "epoch": 0.26440177252584934, "grad_norm": 0.35423019528388977, "learning_rate": 0.00036802561891858605, "loss": 7.4707, "step": 2148 }, { "epoch": 0.2645248645987198, "grad_norm": 0.4742509424686432, "learning_rate": 0.0003679640349796773, "loss": 7.4745, "step": 2149 }, { "epoch": 0.26464795667159036, "grad_norm": 0.4131543040275574, "learning_rate": 0.00036790245104076857, "loss": 7.478, "step": 2150 }, { "epoch": 0.26477104874446084, "grad_norm": 0.2391190379858017, "learning_rate": 0.0003678408671018598, "loss": 7.512, "step": 2151 }, { "epoch": 0.26489414081733137, "grad_norm": 0.18438145518302917, "learning_rate": 0.00036777928316295114, "loss": 7.2289, "step": 2152 }, { "epoch": 0.26501723289020185, "grad_norm": 0.30213940143585205, "learning_rate": 
0.00036771769922404237, "loss": 7.5862, "step": 2153 }, { "epoch": 0.2651403249630724, "grad_norm": 0.8350804448127747, "learning_rate": 0.00036765611528513366, "loss": 8.9889, "step": 2154 }, { "epoch": 0.26526341703594286, "grad_norm": 0.6147103309631348, "learning_rate": 0.0003675945313462249, "loss": 7.9427, "step": 2155 }, { "epoch": 0.2653865091088134, "grad_norm": 0.4818341135978699, "learning_rate": 0.0003675329474073162, "loss": 7.8737, "step": 2156 }, { "epoch": 0.2655096011816839, "grad_norm": 0.3032298982143402, "learning_rate": 0.0003674713634684074, "loss": 7.7904, "step": 2157 }, { "epoch": 0.2656326932545544, "grad_norm": 0.09841345250606537, "learning_rate": 0.0003674097795294987, "loss": 7.7698, "step": 2158 }, { "epoch": 0.2657557853274249, "grad_norm": 0.2519218921661377, "learning_rate": 0.00036734819559059, "loss": 7.5421, "step": 2159 }, { "epoch": 0.2658788774002954, "grad_norm": 0.22754526138305664, "learning_rate": 0.00036728661165168127, "loss": 8.2222, "step": 2160 }, { "epoch": 0.26600196947316596, "grad_norm": 0.4328124225139618, "learning_rate": 0.0003672250277127725, "loss": 7.6324, "step": 2161 }, { "epoch": 0.26612506154603643, "grad_norm": 0.21076054871082306, "learning_rate": 0.0003671634437738638, "loss": 8.1306, "step": 2162 }, { "epoch": 0.26624815361890697, "grad_norm": 0.12622925639152527, "learning_rate": 0.000367101859834955, "loss": 7.7187, "step": 2163 }, { "epoch": 0.26637124569177745, "grad_norm": 0.1891300529241562, "learning_rate": 0.0003670402758960463, "loss": 7.7839, "step": 2164 }, { "epoch": 0.266494337764648, "grad_norm": 0.3336397111415863, "learning_rate": 0.0003669786919571376, "loss": 8.0883, "step": 2165 }, { "epoch": 0.26661742983751846, "grad_norm": 0.19978667795658112, "learning_rate": 0.0003669171080182289, "loss": 7.4057, "step": 2166 }, { "epoch": 0.266740521910389, "grad_norm": 0.2517157196998596, "learning_rate": 0.0003668555240793201, "loss": 7.701, "step": 2167 }, { "epoch": 0.2668636139832595, 
"grad_norm": 0.09581396728754044, "learning_rate": 0.0003667939401404114, "loss": 7.6267, "step": 2168 }, { "epoch": 0.26698670605613, "grad_norm": 0.17438828945159912, "learning_rate": 0.00036673235620150263, "loss": 7.3446, "step": 2169 }, { "epoch": 0.2671097981290005, "grad_norm": 0.1712900996208191, "learning_rate": 0.0003666707722625939, "loss": 7.6082, "step": 2170 }, { "epoch": 0.267232890201871, "grad_norm": 0.20355890691280365, "learning_rate": 0.00036660918832368515, "loss": 7.4281, "step": 2171 }, { "epoch": 0.2673559822747415, "grad_norm": 0.4933617115020752, "learning_rate": 0.0003665476043847765, "loss": 8.0789, "step": 2172 }, { "epoch": 0.26747907434761203, "grad_norm": 0.4013407826423645, "learning_rate": 0.0003664860204458677, "loss": 8.3469, "step": 2173 }, { "epoch": 0.2676021664204825, "grad_norm": 0.3970308303833008, "learning_rate": 0.000366424436506959, "loss": 8.8415, "step": 2174 }, { "epoch": 0.26772525849335305, "grad_norm": 0.08087335526943207, "learning_rate": 0.00036636285256805024, "loss": 7.602, "step": 2175 }, { "epoch": 0.2678483505662235, "grad_norm": 0.18411047756671906, "learning_rate": 0.00036630126862914153, "loss": 7.576, "step": 2176 }, { "epoch": 0.26797144263909406, "grad_norm": 0.1686972975730896, "learning_rate": 0.00036623968469023276, "loss": 7.6185, "step": 2177 }, { "epoch": 0.26809453471196454, "grad_norm": 0.22550068795681, "learning_rate": 0.00036617810075132405, "loss": 7.2251, "step": 2178 }, { "epoch": 0.2682176267848351, "grad_norm": 0.12010670453310013, "learning_rate": 0.00036611651681241534, "loss": 7.3609, "step": 2179 }, { "epoch": 0.26834071885770555, "grad_norm": 0.2827216386795044, "learning_rate": 0.0003660549328735066, "loss": 7.7759, "step": 2180 }, { "epoch": 0.2684638109305761, "grad_norm": 0.2921522855758667, "learning_rate": 0.00036599334893459785, "loss": 7.4485, "step": 2181 }, { "epoch": 0.26858690300344656, "grad_norm": 0.28916773200035095, "learning_rate": 0.00036593176499568914, "loss": 
7.5937, "step": 2182 }, { "epoch": 0.2687099950763171, "grad_norm": 0.2117394208908081, "learning_rate": 0.0003658701810567804, "loss": 7.7647, "step": 2183 }, { "epoch": 0.2688330871491876, "grad_norm": 0.1290927231311798, "learning_rate": 0.00036580859711787166, "loss": 7.7026, "step": 2184 }, { "epoch": 0.2689561792220581, "grad_norm": 0.23744812607765198, "learning_rate": 0.0003657470131789629, "loss": 7.5362, "step": 2185 }, { "epoch": 0.2690792712949286, "grad_norm": null, "learning_rate": 0.00036568542924005423, "loss": 10.3049, "step": 2186 }, { "epoch": 0.2692023633677991, "grad_norm": 0.2798512578010559, "learning_rate": 0.00036562384530114547, "loss": 7.6971, "step": 2187 }, { "epoch": 0.2693254554406696, "grad_norm": 1.2423940896987915, "learning_rate": 0.00036556226136223675, "loss": 8.099, "step": 2188 }, { "epoch": 0.26944854751354014, "grad_norm": 0.7937031388282776, "learning_rate": 0.000365500677423328, "loss": 7.7709, "step": 2189 }, { "epoch": 0.2695716395864106, "grad_norm": 0.25424686074256897, "learning_rate": 0.00036543909348441927, "loss": 7.2869, "step": 2190 }, { "epoch": 0.26969473165928115, "grad_norm": 0.2702324092388153, "learning_rate": 0.0003653775095455105, "loss": 7.8151, "step": 2191 }, { "epoch": 0.26981782373215163, "grad_norm": 0.18277886509895325, "learning_rate": 0.00036531592560660184, "loss": 7.486, "step": 2192 }, { "epoch": 0.26994091580502216, "grad_norm": 0.3027799129486084, "learning_rate": 0.0003652543416676931, "loss": 7.7961, "step": 2193 }, { "epoch": 0.27006400787789264, "grad_norm": 0.1527181714773178, "learning_rate": 0.00036519275772878436, "loss": 7.3041, "step": 2194 }, { "epoch": 0.2701870999507632, "grad_norm": 0.21382412314414978, "learning_rate": 0.0003651311737898756, "loss": 7.2502, "step": 2195 }, { "epoch": 0.27031019202363366, "grad_norm": 0.37394753098487854, "learning_rate": 0.0003650695898509669, "loss": 7.8808, "step": 2196 }, { "epoch": 0.2704332840965042, "grad_norm": 0.23998548090457916, 
"learning_rate": 0.0003650080059120581, "loss": 7.2949, "step": 2197 }, { "epoch": 0.27055637616937467, "grad_norm": 0.16806437075138092, "learning_rate": 0.0003649464219731494, "loss": 7.9708, "step": 2198 }, { "epoch": 0.2706794682422452, "grad_norm": 0.16408561170101166, "learning_rate": 0.0003648848380342407, "loss": 7.8843, "step": 2199 }, { "epoch": 0.2708025603151157, "grad_norm": 0.13762685656547546, "learning_rate": 0.000364823254095332, "loss": 7.37, "step": 2200 }, { "epoch": 0.2709256523879862, "grad_norm": 0.3639402389526367, "learning_rate": 0.0003647616701564232, "loss": 7.6325, "step": 2201 }, { "epoch": 0.2710487444608567, "grad_norm": 0.3543964922428131, "learning_rate": 0.0003647000862175145, "loss": 7.6264, "step": 2202 }, { "epoch": 0.27117183653372723, "grad_norm": 0.32627397775650024, "learning_rate": 0.0003646385022786057, "loss": 8.6992, "step": 2203 }, { "epoch": 0.27129492860659776, "grad_norm": 0.4323575496673584, "learning_rate": 0.000364576918339697, "loss": 7.6602, "step": 2204 }, { "epoch": 0.27141802067946824, "grad_norm": 0.4837763011455536, "learning_rate": 0.00036451533440078824, "loss": 7.8854, "step": 2205 }, { "epoch": 0.2715411127523388, "grad_norm": 0.3995070159435272, "learning_rate": 0.0003644537504618796, "loss": 7.7107, "step": 2206 }, { "epoch": 0.27166420482520925, "grad_norm": 0.1775835007429123, "learning_rate": 0.0003643921665229708, "loss": 7.93, "step": 2207 }, { "epoch": 0.2717872968980798, "grad_norm": 0.19028954207897186, "learning_rate": 0.0003643305825840621, "loss": 7.5055, "step": 2208 }, { "epoch": 0.27191038897095027, "grad_norm": 0.3281956911087036, "learning_rate": 0.00036426899864515334, "loss": 7.896, "step": 2209 }, { "epoch": 0.2720334810438208, "grad_norm": 0.1829754114151001, "learning_rate": 0.0003642074147062446, "loss": 7.596, "step": 2210 }, { "epoch": 0.2721565731166913, "grad_norm": 0.16295424103736877, "learning_rate": 0.00036414583076733586, "loss": 7.7582, "step": 2211 }, { "epoch": 
0.2722796651895618, "grad_norm": 0.12801970541477203, "learning_rate": 0.00036408424682842714, "loss": 8.09, "step": 2212 }, { "epoch": 0.2724027572624323, "grad_norm": 0.2233521193265915, "learning_rate": 0.00036402266288951843, "loss": 8.1211, "step": 2213 }, { "epoch": 0.2725258493353028, "grad_norm": 0.17991015315055847, "learning_rate": 0.0003639610789506097, "loss": 8.4372, "step": 2214 }, { "epoch": 0.2726489414081733, "grad_norm": 0.23014894127845764, "learning_rate": 0.00036389949501170095, "loss": 7.4858, "step": 2215 }, { "epoch": 0.27277203348104384, "grad_norm": 0.19127295911312103, "learning_rate": 0.00036383791107279223, "loss": 7.5151, "step": 2216 }, { "epoch": 0.2728951255539143, "grad_norm": 0.21875154972076416, "learning_rate": 0.00036377632713388347, "loss": 7.5592, "step": 2217 }, { "epoch": 0.27301821762678485, "grad_norm": 0.32059866189956665, "learning_rate": 0.00036371474319497475, "loss": 7.7866, "step": 2218 }, { "epoch": 0.27314130969965533, "grad_norm": 0.14380863308906555, "learning_rate": 0.000363653159256066, "loss": 7.504, "step": 2219 }, { "epoch": 0.27326440177252587, "grad_norm": 0.14543823897838593, "learning_rate": 0.0003635915753171573, "loss": 7.6636, "step": 2220 }, { "epoch": 0.27338749384539635, "grad_norm": 0.1829562485218048, "learning_rate": 0.00036352999137824856, "loss": 7.6422, "step": 2221 }, { "epoch": 0.2735105859182669, "grad_norm": 0.17242664098739624, "learning_rate": 0.00036346840743933985, "loss": 7.4336, "step": 2222 }, { "epoch": 0.27363367799113736, "grad_norm": 0.14341601729393005, "learning_rate": 0.0003634068235004311, "loss": 7.5867, "step": 2223 }, { "epoch": 0.2737567700640079, "grad_norm": 0.28791478276252747, "learning_rate": 0.00036334523956152236, "loss": 8.1016, "step": 2224 }, { "epoch": 0.27387986213687837, "grad_norm": 0.12855751812458038, "learning_rate": 0.0003632836556226136, "loss": 7.694, "step": 2225 }, { "epoch": 0.2740029542097489, "grad_norm": 0.24563996493816376, "learning_rate": 
0.00036322207168370494, "loss": 7.439, "step": 2226 }, { "epoch": 0.2741260462826194, "grad_norm": 0.11360722780227661, "learning_rate": 0.00036316048774479617, "loss": 7.7425, "step": 2227 }, { "epoch": 0.2742491383554899, "grad_norm": 0.1505708396434784, "learning_rate": 0.00036309890380588746, "loss": 7.735, "step": 2228 }, { "epoch": 0.2743722304283604, "grad_norm": 0.15910908579826355, "learning_rate": 0.0003630373198669787, "loss": 7.2593, "step": 2229 }, { "epoch": 0.27449532250123093, "grad_norm": 0.1187463253736496, "learning_rate": 0.00036297573592807, "loss": 7.5778, "step": 2230 }, { "epoch": 0.2746184145741014, "grad_norm": 0.2267104536294937, "learning_rate": 0.0003629141519891612, "loss": 7.6902, "step": 2231 }, { "epoch": 0.27474150664697194, "grad_norm": 0.14671847224235535, "learning_rate": 0.0003628525680502525, "loss": 7.5021, "step": 2232 }, { "epoch": 0.2748645987198424, "grad_norm": 0.17719857394695282, "learning_rate": 0.0003627909841113438, "loss": 8.0341, "step": 2233 }, { "epoch": 0.27498769079271296, "grad_norm": 0.2824627459049225, "learning_rate": 0.00036272940017243507, "loss": 7.545, "step": 2234 }, { "epoch": 0.27511078286558344, "grad_norm": 0.16692250967025757, "learning_rate": 0.0003626678162335263, "loss": 7.5178, "step": 2235 }, { "epoch": 0.27523387493845397, "grad_norm": 0.20659984648227692, "learning_rate": 0.0003626062322946176, "loss": 7.8042, "step": 2236 }, { "epoch": 0.27535696701132445, "grad_norm": 0.32494068145751953, "learning_rate": 0.0003625446483557088, "loss": 7.917, "step": 2237 }, { "epoch": 0.275480059084195, "grad_norm": 0.4081450402736664, "learning_rate": 0.0003624830644168001, "loss": 7.7369, "step": 2238 }, { "epoch": 0.27560315115706546, "grad_norm": 0.2891211211681366, "learning_rate": 0.00036242148047789134, "loss": 7.9338, "step": 2239 }, { "epoch": 0.275726243229936, "grad_norm": 0.2288568615913391, "learning_rate": 0.0003623598965389827, "loss": 7.7795, "step": 2240 }, { "epoch": 
0.2758493353028065, "grad_norm": 0.2800292372703552, "learning_rate": 0.0003622983126000739, "loss": 7.6308, "step": 2241 }, { "epoch": 0.275972427375677, "grad_norm": 0.19878171384334564, "learning_rate": 0.0003622367286611652, "loss": 7.7192, "step": 2242 }, { "epoch": 0.2760955194485475, "grad_norm": 0.1846921741962433, "learning_rate": 0.00036217514472225643, "loss": 7.7435, "step": 2243 }, { "epoch": 0.276218611521418, "grad_norm": 0.28185421228408813, "learning_rate": 0.0003621135607833477, "loss": 8.3654, "step": 2244 }, { "epoch": 0.2763417035942885, "grad_norm": 0.4368555545806885, "learning_rate": 0.00036205197684443895, "loss": 8.7266, "step": 2245 }, { "epoch": 0.27646479566715904, "grad_norm": 0.1204216256737709, "learning_rate": 0.0003619903929055303, "loss": 7.7614, "step": 2246 }, { "epoch": 0.2765878877400295, "grad_norm": 0.13850420713424683, "learning_rate": 0.0003619288089666215, "loss": 7.7084, "step": 2247 }, { "epoch": 0.27671097981290005, "grad_norm": 0.20128725469112396, "learning_rate": 0.0003618672250277128, "loss": 7.6035, "step": 2248 }, { "epoch": 0.2768340718857706, "grad_norm": 0.17500333487987518, "learning_rate": 0.00036180564108880404, "loss": 7.4784, "step": 2249 }, { "epoch": 0.27695716395864106, "grad_norm": 0.1539985090494156, "learning_rate": 0.00036174405714989533, "loss": 7.8026, "step": 2250 }, { "epoch": 0.2770802560315116, "grad_norm": 0.24975137412548065, "learning_rate": 0.00036168247321098656, "loss": 7.3832, "step": 2251 }, { "epoch": 0.2772033481043821, "grad_norm": 0.12977845966815948, "learning_rate": 0.00036162088927207785, "loss": 7.3873, "step": 2252 }, { "epoch": 0.2773264401772526, "grad_norm": 0.10137587040662766, "learning_rate": 0.00036155930533316913, "loss": 7.2668, "step": 2253 }, { "epoch": 0.2774495322501231, "grad_norm": 0.11222922056913376, "learning_rate": 0.0003614977213942604, "loss": 7.7187, "step": 2254 }, { "epoch": 0.2775726243229936, "grad_norm": 0.13157878816127777, "learning_rate": 
0.00036143613745535165, "loss": 8.0034, "step": 2255 }, { "epoch": 0.2776957163958641, "grad_norm": 0.25375548005104065, "learning_rate": 0.00036137455351644294, "loss": 7.2571, "step": 2256 }, { "epoch": 0.27781880846873463, "grad_norm": 0.11894087493419647, "learning_rate": 0.00036131296957753417, "loss": 7.5944, "step": 2257 }, { "epoch": 0.2779419005416051, "grad_norm": 0.2428424209356308, "learning_rate": 0.00036125138563862546, "loss": 7.8157, "step": 2258 }, { "epoch": 0.27806499261447565, "grad_norm": 0.3276558518409729, "learning_rate": 0.0003611898016997167, "loss": 8.0745, "step": 2259 }, { "epoch": 0.2781880846873461, "grad_norm": 0.1443680077791214, "learning_rate": 0.00036112821776080803, "loss": 7.177, "step": 2260 }, { "epoch": 0.27831117676021666, "grad_norm": 0.40328121185302734, "learning_rate": 0.00036106663382189926, "loss": 8.4058, "step": 2261 }, { "epoch": 0.27843426883308714, "grad_norm": 0.262251615524292, "learning_rate": 0.00036100504988299055, "loss": 8.6609, "step": 2262 }, { "epoch": 0.2785573609059577, "grad_norm": 0.187088280916214, "learning_rate": 0.0003609434659440818, "loss": 8.1306, "step": 2263 }, { "epoch": 0.27868045297882815, "grad_norm": 0.25103551149368286, "learning_rate": 0.00036088188200517307, "loss": 7.8919, "step": 2264 }, { "epoch": 0.2788035450516987, "grad_norm": 0.3858506977558136, "learning_rate": 0.0003608202980662643, "loss": 7.6011, "step": 2265 }, { "epoch": 0.27892663712456917, "grad_norm": 0.16686904430389404, "learning_rate": 0.0003607587141273556, "loss": 7.8662, "step": 2266 }, { "epoch": 0.2790497291974397, "grad_norm": 0.16508737206459045, "learning_rate": 0.0003606971301884469, "loss": 7.7902, "step": 2267 }, { "epoch": 0.2791728212703102, "grad_norm": 0.29397621750831604, "learning_rate": 0.00036063554624953816, "loss": 7.8747, "step": 2268 }, { "epoch": 0.2792959133431807, "grad_norm": 0.30113399028778076, "learning_rate": 0.0003605739623106294, "loss": 7.653, "step": 2269 }, { "epoch": 
0.2794190054160512, "grad_norm": 0.4572981595993042, "learning_rate": 0.0003605123783717207, "loss": 8.6655, "step": 2270 }, { "epoch": 0.2795420974889217, "grad_norm": 0.24206842482089996, "learning_rate": 0.0003604507944328119, "loss": 8.0173, "step": 2271 }, { "epoch": 0.2796651895617922, "grad_norm": 0.24933142960071564, "learning_rate": 0.0003603892104939032, "loss": 7.4346, "step": 2272 }, { "epoch": 0.27978828163466274, "grad_norm": 0.200689435005188, "learning_rate": 0.00036032762655499443, "loss": 7.651, "step": 2273 }, { "epoch": 0.2799113737075332, "grad_norm": 0.2138361930847168, "learning_rate": 0.0003602660426160858, "loss": 7.4394, "step": 2274 }, { "epoch": 0.28003446578040375, "grad_norm": 0.27118054032325745, "learning_rate": 0.000360204458677177, "loss": 8.4362, "step": 2275 }, { "epoch": 0.28015755785327423, "grad_norm": 0.11721783131361008, "learning_rate": 0.0003601428747382683, "loss": 7.308, "step": 2276 }, { "epoch": 0.28028064992614476, "grad_norm": 0.1341080367565155, "learning_rate": 0.0003600812907993595, "loss": 7.3598, "step": 2277 }, { "epoch": 0.28040374199901524, "grad_norm": 0.16129301488399506, "learning_rate": 0.0003600197068604508, "loss": 7.564, "step": 2278 }, { "epoch": 0.2805268340718858, "grad_norm": 0.08718401938676834, "learning_rate": 0.00035995812292154204, "loss": 7.4343, "step": 2279 }, { "epoch": 0.28064992614475626, "grad_norm": 0.09879550337791443, "learning_rate": 0.0003598965389826334, "loss": 7.7493, "step": 2280 }, { "epoch": 0.2807730182176268, "grad_norm": 0.19969187676906586, "learning_rate": 0.0003598349550437246, "loss": 7.4728, "step": 2281 }, { "epoch": 0.28089611029049727, "grad_norm": 0.1529555767774582, "learning_rate": 0.0003597733711048159, "loss": 7.658, "step": 2282 }, { "epoch": 0.2810192023633678, "grad_norm": 0.2185233235359192, "learning_rate": 0.00035971178716590714, "loss": 8.1696, "step": 2283 }, { "epoch": 0.2811422944362383, "grad_norm": 0.3588813543319702, "learning_rate": 
0.0003596502032269984, "loss": 8.8373, "step": 2284 }, { "epoch": 0.2812653865091088, "grad_norm": 0.41889292001724243, "learning_rate": 0.00035958861928808965, "loss": 9.1801, "step": 2285 }, { "epoch": 0.2813884785819793, "grad_norm": 0.3620334267616272, "learning_rate": 0.00035952703534918094, "loss": 9.2476, "step": 2286 }, { "epoch": 0.28151157065484983, "grad_norm": 0.3287333548069, "learning_rate": 0.00035946545141027223, "loss": 7.5527, "step": 2287 }, { "epoch": 0.2816346627277203, "grad_norm": 0.21936795115470886, "learning_rate": 0.0003594038674713635, "loss": 7.8625, "step": 2288 }, { "epoch": 0.28175775480059084, "grad_norm": 0.3366224467754364, "learning_rate": 0.00035934228353245475, "loss": 8.0405, "step": 2289 }, { "epoch": 0.2818808468734613, "grad_norm": 0.17332404851913452, "learning_rate": 0.00035928069959354603, "loss": 7.8457, "step": 2290 }, { "epoch": 0.28200393894633186, "grad_norm": 0.21705619990825653, "learning_rate": 0.00035921911565463727, "loss": 7.5161, "step": 2291 }, { "epoch": 0.2821270310192024, "grad_norm": 0.2393525391817093, "learning_rate": 0.00035915753171572855, "loss": 7.732, "step": 2292 }, { "epoch": 0.28225012309207287, "grad_norm": 0.21235796809196472, "learning_rate": 0.0003590959477768198, "loss": 7.564, "step": 2293 }, { "epoch": 0.2823732151649434, "grad_norm": 0.2858973741531372, "learning_rate": 0.0003590343638379111, "loss": 7.7817, "step": 2294 }, { "epoch": 0.2824963072378139, "grad_norm": 0.10727456212043762, "learning_rate": 0.00035897277989900236, "loss": 7.6874, "step": 2295 }, { "epoch": 0.2826193993106844, "grad_norm": 0.16227230429649353, "learning_rate": 0.00035891119596009364, "loss": 7.688, "step": 2296 }, { "epoch": 0.2827424913835549, "grad_norm": 0.21026082336902618, "learning_rate": 0.0003588496120211849, "loss": 8.9302, "step": 2297 }, { "epoch": 0.28286558345642543, "grad_norm": 0.1692517101764679, "learning_rate": 0.00035878802808227616, "loss": 7.9106, "step": 2298 }, { "epoch": 
0.2829886755292959, "grad_norm": 0.2425704151391983, "learning_rate": 0.0003587264441433674, "loss": 7.5215, "step": 2299 }, { "epoch": 0.28311176760216644, "grad_norm": 0.14417362213134766, "learning_rate": 0.0003586648602044587, "loss": 7.4949, "step": 2300 }, { "epoch": 0.2832348596750369, "grad_norm": 0.14363543689250946, "learning_rate": 0.00035860327626554997, "loss": 7.8038, "step": 2301 }, { "epoch": 0.28335795174790745, "grad_norm": 0.5410260558128357, "learning_rate": 0.00035854169232664126, "loss": 8.7925, "step": 2302 }, { "epoch": 0.28348104382077793, "grad_norm": 0.3072183430194855, "learning_rate": 0.0003584801083877325, "loss": 7.9628, "step": 2303 }, { "epoch": 0.28360413589364847, "grad_norm": 0.2575934827327728, "learning_rate": 0.0003584185244488238, "loss": 8.3036, "step": 2304 }, { "epoch": 0.28372722796651895, "grad_norm": 0.15028652548789978, "learning_rate": 0.000358356940509915, "loss": 8.6497, "step": 2305 }, { "epoch": 0.2838503200393895, "grad_norm": 0.2995278537273407, "learning_rate": 0.0003582953565710063, "loss": 7.6364, "step": 2306 }, { "epoch": 0.28397341211225996, "grad_norm": 0.3684478998184204, "learning_rate": 0.0003582337726320976, "loss": 7.6325, "step": 2307 }, { "epoch": 0.2840965041851305, "grad_norm": 0.3794378340244293, "learning_rate": 0.00035817218869318887, "loss": 7.4204, "step": 2308 }, { "epoch": 0.284219596258001, "grad_norm": 0.22464285790920258, "learning_rate": 0.0003581106047542801, "loss": 7.7652, "step": 2309 }, { "epoch": 0.2843426883308715, "grad_norm": 0.1428339034318924, "learning_rate": 0.0003580490208153714, "loss": 7.8745, "step": 2310 }, { "epoch": 0.284465780403742, "grad_norm": 0.15422587096691132, "learning_rate": 0.0003579874368764626, "loss": 7.6311, "step": 2311 }, { "epoch": 0.2845888724766125, "grad_norm": 0.14265213906764984, "learning_rate": 0.0003579258529375539, "loss": 7.6765, "step": 2312 }, { "epoch": 0.284711964549483, "grad_norm": 0.14120754599571228, "learning_rate": 
0.00035786426899864514, "loss": 7.5045, "step": 2313 }, { "epoch": 0.28483505662235353, "grad_norm": 0.10141690075397491, "learning_rate": 0.0003578026850597365, "loss": 7.9768, "step": 2314 }, { "epoch": 0.284958148695224, "grad_norm": 0.13581900298595428, "learning_rate": 0.0003577411011208277, "loss": 8.2868, "step": 2315 }, { "epoch": 0.28508124076809455, "grad_norm": 0.20342501997947693, "learning_rate": 0.000357679517181919, "loss": 7.7611, "step": 2316 }, { "epoch": 0.285204332840965, "grad_norm": 0.2980639934539795, "learning_rate": 0.00035761793324301023, "loss": 7.5739, "step": 2317 }, { "epoch": 0.28532742491383556, "grad_norm": 0.2700968086719513, "learning_rate": 0.0003575563493041015, "loss": 7.2835, "step": 2318 }, { "epoch": 0.28545051698670604, "grad_norm": 0.12645351886749268, "learning_rate": 0.00035749476536519275, "loss": 7.4074, "step": 2319 }, { "epoch": 0.28557360905957657, "grad_norm": 0.23605307936668396, "learning_rate": 0.00035743318142628403, "loss": 7.3754, "step": 2320 }, { "epoch": 0.28569670113244705, "grad_norm": 0.35949742794036865, "learning_rate": 0.0003573715974873753, "loss": 7.747, "step": 2321 }, { "epoch": 0.2858197932053176, "grad_norm": 0.29605260491371155, "learning_rate": 0.0003573100135484666, "loss": 7.6097, "step": 2322 }, { "epoch": 0.28594288527818806, "grad_norm": 0.19830641150474548, "learning_rate": 0.00035724842960955784, "loss": 7.8124, "step": 2323 }, { "epoch": 0.2860659773510586, "grad_norm": 0.09770329296588898, "learning_rate": 0.00035718684567064913, "loss": 7.5125, "step": 2324 }, { "epoch": 0.2861890694239291, "grad_norm": 0.2013760805130005, "learning_rate": 0.00035712526173174036, "loss": 7.8911, "step": 2325 }, { "epoch": 0.2863121614967996, "grad_norm": 0.355886846780777, "learning_rate": 0.0003570636777928316, "loss": 7.4699, "step": 2326 }, { "epoch": 0.2864352535696701, "grad_norm": 0.5370150804519653, "learning_rate": 0.0003570020938539229, "loss": 7.4242, "step": 2327 }, { "epoch": 
0.2865583456425406, "grad_norm": 0.3130956292152405, "learning_rate": 0.00035694050991501417, "loss": 7.4041, "step": 2328 }, { "epoch": 0.2866814377154111, "grad_norm": 0.24045713245868683, "learning_rate": 0.00035687892597610545, "loss": 8.5485, "step": 2329 }, { "epoch": 0.28680452978828164, "grad_norm": 0.0808183029294014, "learning_rate": 0.0003568173420371967, "loss": 7.6941, "step": 2330 }, { "epoch": 0.2869276218611521, "grad_norm": 0.07256277650594711, "learning_rate": 0.00035675575809828797, "loss": 7.47, "step": 2331 }, { "epoch": 0.28705071393402265, "grad_norm": 0.25299641489982605, "learning_rate": 0.0003566941741593792, "loss": 7.4845, "step": 2332 }, { "epoch": 0.28717380600689313, "grad_norm": 0.3233044743537903, "learning_rate": 0.0003566325902204705, "loss": 7.7971, "step": 2333 }, { "epoch": 0.28729689807976366, "grad_norm": 0.17411230504512787, "learning_rate": 0.0003565710062815618, "loss": 7.4189, "step": 2334 }, { "epoch": 0.2874199901526342, "grad_norm": 0.10462432354688644, "learning_rate": 0.00035650942234265306, "loss": 7.1689, "step": 2335 }, { "epoch": 0.2875430822255047, "grad_norm": 0.22243279218673706, "learning_rate": 0.0003564478384037443, "loss": 7.2764, "step": 2336 }, { "epoch": 0.2876661742983752, "grad_norm": 0.12110043317079544, "learning_rate": 0.0003563862544648356, "loss": 7.3998, "step": 2337 }, { "epoch": 0.2877892663712457, "grad_norm": 0.09019330888986588, "learning_rate": 0.0003563246705259268, "loss": 7.3072, "step": 2338 }, { "epoch": 0.2879123584441162, "grad_norm": 0.2636736333370209, "learning_rate": 0.0003562630865870181, "loss": 8.0002, "step": 2339 }, { "epoch": 0.2880354505169867, "grad_norm": 0.18180909752845764, "learning_rate": 0.00035620150264810933, "loss": 7.5381, "step": 2340 }, { "epoch": 0.28815854258985724, "grad_norm": 0.11479004472494125, "learning_rate": 0.0003561399187092007, "loss": 7.3086, "step": 2341 }, { "epoch": 0.2882816346627277, "grad_norm": 0.13149738311767578, "learning_rate": 
0.0003560783347702919, "loss": 7.5845, "step": 2342 }, { "epoch": 0.28840472673559825, "grad_norm": 0.14506441354751587, "learning_rate": 0.0003560167508313832, "loss": 7.7526, "step": 2343 }, { "epoch": 0.28852781880846873, "grad_norm": 0.10146819055080414, "learning_rate": 0.0003559551668924744, "loss": 7.8095, "step": 2344 }, { "epoch": 0.28865091088133926, "grad_norm": 0.10038932412862778, "learning_rate": 0.0003558935829535657, "loss": 7.5725, "step": 2345 }, { "epoch": 0.28877400295420974, "grad_norm": 0.10106581449508667, "learning_rate": 0.00035583199901465694, "loss": 7.4678, "step": 2346 }, { "epoch": 0.2888970950270803, "grad_norm": 0.22264622151851654, "learning_rate": 0.00035577041507574823, "loss": 8.062, "step": 2347 }, { "epoch": 0.28902018709995075, "grad_norm": 0.24096326529979706, "learning_rate": 0.0003557088311368395, "loss": 8.1458, "step": 2348 }, { "epoch": 0.2891432791728213, "grad_norm": 0.11967795342206955, "learning_rate": 0.0003556472471979308, "loss": 7.4465, "step": 2349 }, { "epoch": 0.28926637124569177, "grad_norm": 0.17294767498970032, "learning_rate": 0.00035558566325902204, "loss": 7.615, "step": 2350 }, { "epoch": 0.2893894633185623, "grad_norm": 0.10775484144687653, "learning_rate": 0.0003555240793201133, "loss": 7.5608, "step": 2351 }, { "epoch": 0.2895125553914328, "grad_norm": 0.14314797520637512, "learning_rate": 0.00035546249538120456, "loss": 7.2193, "step": 2352 }, { "epoch": 0.2896356474643033, "grad_norm": 0.16629403829574585, "learning_rate": 0.00035540091144229584, "loss": 7.5278, "step": 2353 }, { "epoch": 0.2897587395371738, "grad_norm": 0.12320244312286377, "learning_rate": 0.0003553393275033871, "loss": 7.4791, "step": 2354 }, { "epoch": 0.2898818316100443, "grad_norm": 0.21841321885585785, "learning_rate": 0.0003552777435644784, "loss": 7.6191, "step": 2355 }, { "epoch": 0.2900049236829148, "grad_norm": 0.1194339171051979, "learning_rate": 0.00035521615962556965, "loss": 7.6158, "step": 2356 }, { "epoch": 
0.29012801575578534, "grad_norm": 0.1951664537191391, "learning_rate": 0.00035515457568666093, "loss": 7.21, "step": 2357 }, { "epoch": 0.2902511078286558, "grad_norm": 0.1536768525838852, "learning_rate": 0.00035509299174775217, "loss": 8.0123, "step": 2358 }, { "epoch": 0.29037419990152635, "grad_norm": 0.1362726241350174, "learning_rate": 0.00035503140780884345, "loss": 7.9418, "step": 2359 }, { "epoch": 0.29049729197439683, "grad_norm": 0.29238361120224, "learning_rate": 0.0003549698238699347, "loss": 7.1173, "step": 2360 }, { "epoch": 0.29062038404726737, "grad_norm": 0.1808217614889145, "learning_rate": 0.000354908239931026, "loss": 7.4939, "step": 2361 }, { "epoch": 0.29074347612013784, "grad_norm": 0.14733652770519257, "learning_rate": 0.00035484665599211726, "loss": 7.8048, "step": 2362 }, { "epoch": 0.2908665681930084, "grad_norm": 0.1604197472333908, "learning_rate": 0.00035478507205320855, "loss": 7.7304, "step": 2363 }, { "epoch": 0.29098966026587886, "grad_norm": 0.17072753608226776, "learning_rate": 0.0003547234881142998, "loss": 7.5417, "step": 2364 }, { "epoch": 0.2911127523387494, "grad_norm": 0.3921279013156891, "learning_rate": 0.00035466190417539106, "loss": 6.8432, "step": 2365 }, { "epoch": 0.29123584441161987, "grad_norm": 0.13553771376609802, "learning_rate": 0.0003546003202364823, "loss": 7.3675, "step": 2366 }, { "epoch": 0.2913589364844904, "grad_norm": 0.10405098646879196, "learning_rate": 0.0003545387362975736, "loss": 7.4698, "step": 2367 }, { "epoch": 0.2914820285573609, "grad_norm": 0.09829653799533844, "learning_rate": 0.00035447715235866487, "loss": 7.5516, "step": 2368 }, { "epoch": 0.2916051206302314, "grad_norm": 0.35828638076782227, "learning_rate": 0.00035441556841975616, "loss": 8.8818, "step": 2369 }, { "epoch": 0.2917282127031019, "grad_norm": 0.15680867433547974, "learning_rate": 0.0003543539844808474, "loss": 7.2847, "step": 2370 }, { "epoch": 0.29185130477597243, "grad_norm": 0.1304899901151657, "learning_rate": 
0.0003542924005419387, "loss": 7.8693, "step": 2371 }, { "epoch": 0.2919743968488429, "grad_norm": 0.1249828115105629, "learning_rate": 0.0003542308166030299, "loss": 7.4342, "step": 2372 }, { "epoch": 0.29209748892171344, "grad_norm": 0.16806261241436005, "learning_rate": 0.0003541692326641212, "loss": 8.0891, "step": 2373 }, { "epoch": 0.2922205809945839, "grad_norm": 0.1289566159248352, "learning_rate": 0.00035410764872521243, "loss": 7.404, "step": 2374 }, { "epoch": 0.29234367306745446, "grad_norm": 0.19696728885173798, "learning_rate": 0.00035404606478630377, "loss": 7.6504, "step": 2375 }, { "epoch": 0.29246676514032494, "grad_norm": 0.21122248470783234, "learning_rate": 0.000353984480847395, "loss": 7.6008, "step": 2376 }, { "epoch": 0.29258985721319547, "grad_norm": 0.3335706293582916, "learning_rate": 0.0003539228969084863, "loss": 7.9731, "step": 2377 }, { "epoch": 0.292712949286066, "grad_norm": 0.5797410011291504, "learning_rate": 0.0003538613129695775, "loss": 8.8711, "step": 2378 }, { "epoch": 0.2928360413589365, "grad_norm": 0.177761510014534, "learning_rate": 0.0003537997290306688, "loss": 7.326, "step": 2379 }, { "epoch": 0.292959133431807, "grad_norm": 0.26565423607826233, "learning_rate": 0.00035373814509176004, "loss": 8.6117, "step": 2380 }, { "epoch": 0.2930822255046775, "grad_norm": 0.24634337425231934, "learning_rate": 0.0003536765611528513, "loss": 7.4145, "step": 2381 }, { "epoch": 0.29320531757754803, "grad_norm": 0.1623327136039734, "learning_rate": 0.0003536149772139426, "loss": 8.591, "step": 2382 }, { "epoch": 0.2933284096504185, "grad_norm": 0.27151140570640564, "learning_rate": 0.0003535533932750339, "loss": 7.4118, "step": 2383 }, { "epoch": 0.29345150172328904, "grad_norm": 0.16727066040039062, "learning_rate": 0.00035349180933612513, "loss": 7.8588, "step": 2384 }, { "epoch": 0.2935745937961595, "grad_norm": 0.14114639163017273, "learning_rate": 0.0003534302253972164, "loss": 7.8641, "step": 2385 }, { "epoch": 
0.29369768586903006, "grad_norm": 0.1672411412000656, "learning_rate": 0.00035336864145830765, "loss": 7.3386, "step": 2386 }, { "epoch": 0.29382077794190053, "grad_norm": 0.23479004204273224, "learning_rate": 0.00035330705751939894, "loss": 7.8789, "step": 2387 }, { "epoch": 0.29394387001477107, "grad_norm": 0.2227352410554886, "learning_rate": 0.00035324547358049017, "loss": 7.3925, "step": 2388 }, { "epoch": 0.29406696208764155, "grad_norm": 0.251094788312912, "learning_rate": 0.0003531838896415815, "loss": 7.5062, "step": 2389 }, { "epoch": 0.2941900541605121, "grad_norm": 0.12976887822151184, "learning_rate": 0.00035312230570267274, "loss": 7.6236, "step": 2390 }, { "epoch": 0.29431314623338256, "grad_norm": 0.26166626811027527, "learning_rate": 0.00035306072176376403, "loss": 7.4246, "step": 2391 }, { "epoch": 0.2944362383062531, "grad_norm": 0.2907661199569702, "learning_rate": 0.00035299913782485526, "loss": 8.8321, "step": 2392 }, { "epoch": 0.2945593303791236, "grad_norm": 0.29625973105430603, "learning_rate": 0.00035293755388594655, "loss": 7.538, "step": 2393 }, { "epoch": 0.2946824224519941, "grad_norm": 0.3323107063770294, "learning_rate": 0.0003528759699470378, "loss": 7.6931, "step": 2394 }, { "epoch": 0.2948055145248646, "grad_norm": 0.2538244426250458, "learning_rate": 0.0003528143860081291, "loss": 7.8178, "step": 2395 }, { "epoch": 0.2949286065977351, "grad_norm": 0.2005842626094818, "learning_rate": 0.00035275280206922035, "loss": 7.68, "step": 2396 }, { "epoch": 0.2950516986706056, "grad_norm": 0.23132358491420746, "learning_rate": 0.00035269121813031164, "loss": 7.2814, "step": 2397 }, { "epoch": 0.29517479074347613, "grad_norm": 0.14425672590732574, "learning_rate": 0.00035262963419140287, "loss": 7.4893, "step": 2398 }, { "epoch": 0.2952978828163466, "grad_norm": 0.24990840256214142, "learning_rate": 0.00035256805025249416, "loss": 7.3735, "step": 2399 }, { "epoch": 0.29542097488921715, "grad_norm": 0.382418692111969, "learning_rate": 
0.0003525064663135854, "loss": 7.7641, "step": 2400 }, { "epoch": 0.2955440669620876, "grad_norm": 0.39682674407958984, "learning_rate": 0.0003524448823746767, "loss": 7.7521, "step": 2401 }, { "epoch": 0.29566715903495816, "grad_norm": 0.22625800967216492, "learning_rate": 0.00035238329843576796, "loss": 7.7628, "step": 2402 }, { "epoch": 0.29579025110782864, "grad_norm": 0.09480958431959152, "learning_rate": 0.00035232171449685925, "loss": 7.4926, "step": 2403 }, { "epoch": 0.2959133431806992, "grad_norm": 0.14171388745307922, "learning_rate": 0.0003522601305579505, "loss": 8.2808, "step": 2404 }, { "epoch": 0.29603643525356965, "grad_norm": 0.12197595089673996, "learning_rate": 0.00035219854661904177, "loss": 7.8155, "step": 2405 }, { "epoch": 0.2961595273264402, "grad_norm": 0.1841067224740982, "learning_rate": 0.000352136962680133, "loss": 8.1364, "step": 2406 }, { "epoch": 0.29628261939931066, "grad_norm": 0.2848881781101227, "learning_rate": 0.0003520753787412243, "loss": 7.5639, "step": 2407 }, { "epoch": 0.2964057114721812, "grad_norm": 0.29598793387413025, "learning_rate": 0.0003520137948023155, "loss": 7.3449, "step": 2408 }, { "epoch": 0.2965288035450517, "grad_norm": 0.18995749950408936, "learning_rate": 0.00035195221086340686, "loss": 7.4766, "step": 2409 }, { "epoch": 0.2966518956179222, "grad_norm": 0.08314426243305206, "learning_rate": 0.0003518906269244981, "loss": 7.5637, "step": 2410 }, { "epoch": 0.2967749876907927, "grad_norm": 0.7421520352363586, "learning_rate": 0.0003518290429855894, "loss": 10.2185, "step": 2411 }, { "epoch": 0.2968980797636632, "grad_norm": 0.27737343311309814, "learning_rate": 0.0003517674590466806, "loss": 8.0164, "step": 2412 }, { "epoch": 0.2970211718365337, "grad_norm": 0.5853886008262634, "learning_rate": 0.0003517058751077719, "loss": 9.2681, "step": 2413 }, { "epoch": 0.29714426390940424, "grad_norm": 0.13902103900909424, "learning_rate": 0.00035164429116886313, "loss": 7.3694, "step": 2414 }, { "epoch": 
0.2972673559822747, "grad_norm": 0.11462154239416122, "learning_rate": 0.0003515827072299545, "loss": 7.5316, "step": 2415 }, { "epoch": 0.29739044805514525, "grad_norm": 0.09993576258420944, "learning_rate": 0.0003515211232910457, "loss": 7.5848, "step": 2416 }, { "epoch": 0.29751354012801573, "grad_norm": 0.18345189094543457, "learning_rate": 0.000351459539352137, "loss": 7.6712, "step": 2417 }, { "epoch": 0.29763663220088626, "grad_norm": 0.21234071254730225, "learning_rate": 0.0003513979554132282, "loss": 7.2853, "step": 2418 }, { "epoch": 0.29775972427375674, "grad_norm": 0.15724648535251617, "learning_rate": 0.0003513363714743195, "loss": 7.6195, "step": 2419 }, { "epoch": 0.2978828163466273, "grad_norm": 0.22042903304100037, "learning_rate": 0.00035127478753541074, "loss": 8.22, "step": 2420 }, { "epoch": 0.2980059084194978, "grad_norm": 0.2460474669933319, "learning_rate": 0.00035121320359650203, "loss": 7.8576, "step": 2421 }, { "epoch": 0.2981290004923683, "grad_norm": 0.14659850299358368, "learning_rate": 0.0003511516196575933, "loss": 7.3908, "step": 2422 }, { "epoch": 0.2982520925652388, "grad_norm": 0.10078437626361847, "learning_rate": 0.0003510900357186846, "loss": 7.5246, "step": 2423 }, { "epoch": 0.2983751846381093, "grad_norm": 0.3085806369781494, "learning_rate": 0.00035102845177977584, "loss": 8.4579, "step": 2424 }, { "epoch": 0.29849827671097984, "grad_norm": 0.17200668156147003, "learning_rate": 0.0003509668678408671, "loss": 7.7962, "step": 2425 }, { "epoch": 0.2986213687838503, "grad_norm": 0.22950880229473114, "learning_rate": 0.00035090528390195835, "loss": 7.3531, "step": 2426 }, { "epoch": 0.29874446085672085, "grad_norm": 0.1371965855360031, "learning_rate": 0.00035084369996304964, "loss": 7.9219, "step": 2427 }, { "epoch": 0.29886755292959133, "grad_norm": 0.1229216456413269, "learning_rate": 0.0003507821160241409, "loss": 7.458, "step": 2428 }, { "epoch": 0.29899064500246186, "grad_norm": 0.18780189752578735, "learning_rate": 
0.0003507205320852322, "loss": 8.441, "step": 2429 }, { "epoch": 0.29911373707533234, "grad_norm": 0.1795937716960907, "learning_rate": 0.00035065894814632345, "loss": 7.5941, "step": 2430 }, { "epoch": 0.2992368291482029, "grad_norm": 0.17861688137054443, "learning_rate": 0.00035059736420741473, "loss": 7.5299, "step": 2431 }, { "epoch": 0.29935992122107336, "grad_norm": 0.16774721443653107, "learning_rate": 0.00035053578026850597, "loss": 7.4252, "step": 2432 }, { "epoch": 0.2994830132939439, "grad_norm": 0.10389066487550735, "learning_rate": 0.00035047419632959725, "loss": 7.7033, "step": 2433 }, { "epoch": 0.29960610536681437, "grad_norm": 0.13355936110019684, "learning_rate": 0.0003504126123906885, "loss": 7.4948, "step": 2434 }, { "epoch": 0.2997291974396849, "grad_norm": 0.2053387463092804, "learning_rate": 0.00035035102845177977, "loss": 7.7294, "step": 2435 }, { "epoch": 0.2998522895125554, "grad_norm": 0.10265034437179565, "learning_rate": 0.00035028944451287106, "loss": 7.5606, "step": 2436 }, { "epoch": 0.2999753815854259, "grad_norm": 0.24465231597423553, "learning_rate": 0.00035022786057396234, "loss": 8.3895, "step": 2437 }, { "epoch": 0.3000984736582964, "grad_norm": 0.31702858209609985, "learning_rate": 0.0003501662766350536, "loss": 7.7036, "step": 2438 }, { "epoch": 0.30022156573116693, "grad_norm": 0.2067653387784958, "learning_rate": 0.00035010469269614486, "loss": 7.338, "step": 2439 }, { "epoch": 0.3003446578040374, "grad_norm": 0.1428471803665161, "learning_rate": 0.0003500431087572361, "loss": 7.5242, "step": 2440 }, { "epoch": 0.30046774987690794, "grad_norm": 0.14979363977909088, "learning_rate": 0.0003499815248183274, "loss": 7.7151, "step": 2441 }, { "epoch": 0.3005908419497784, "grad_norm": 0.1340572088956833, "learning_rate": 0.0003499199408794186, "loss": 7.7065, "step": 2442 }, { "epoch": 0.30071393402264895, "grad_norm": 0.08995924890041351, "learning_rate": 0.00034985835694050996, "loss": 7.4582, "step": 2443 }, { "epoch": 
0.30083702609551943, "grad_norm": 0.17971689999103546, "learning_rate": 0.0003497967730016012, "loss": 7.6557, "step": 2444 }, { "epoch": 0.30096011816838997, "grad_norm": 0.12152209877967834, "learning_rate": 0.0003497351890626925, "loss": 7.4184, "step": 2445 }, { "epoch": 0.30108321024126045, "grad_norm": 0.07497064024209976, "learning_rate": 0.0003496736051237837, "loss": 7.2836, "step": 2446 }, { "epoch": 0.301206302314131, "grad_norm": 0.1936425268650055, "learning_rate": 0.000349612021184875, "loss": 8.0481, "step": 2447 }, { "epoch": 0.30132939438700146, "grad_norm": 0.584205150604248, "learning_rate": 0.0003495504372459662, "loss": 10.416, "step": 2448 }, { "epoch": 0.301452486459872, "grad_norm": 0.2548394799232483, "learning_rate": 0.00034948885330705757, "loss": 7.4607, "step": 2449 }, { "epoch": 0.30157557853274247, "grad_norm": 0.32343772053718567, "learning_rate": 0.0003494272693681488, "loss": 7.3903, "step": 2450 }, { "epoch": 0.301698670605613, "grad_norm": 0.2989088296890259, "learning_rate": 0.0003493656854292401, "loss": 7.6659, "step": 2451 }, { "epoch": 0.3018217626784835, "grad_norm": 0.17664504051208496, "learning_rate": 0.0003493041014903313, "loss": 7.5235, "step": 2452 }, { "epoch": 0.301944854751354, "grad_norm": 0.1596572995185852, "learning_rate": 0.0003492425175514226, "loss": 7.5228, "step": 2453 }, { "epoch": 0.3020679468242245, "grad_norm": 0.12323493510484695, "learning_rate": 0.00034918093361251384, "loss": 7.5153, "step": 2454 }, { "epoch": 0.30219103889709503, "grad_norm": 0.2582472562789917, "learning_rate": 0.0003491193496736051, "loss": 7.8857, "step": 2455 }, { "epoch": 0.3023141309699655, "grad_norm": 0.17713628709316254, "learning_rate": 0.0003490577657346964, "loss": 7.5735, "step": 2456 }, { "epoch": 0.30243722304283605, "grad_norm": 0.22300471365451813, "learning_rate": 0.0003489961817957877, "loss": 7.5501, "step": 2457 }, { "epoch": 0.3025603151157065, "grad_norm": 0.08911791443824768, "learning_rate": 
0.00034893459785687893, "loss": 7.2841, "step": 2458 }, { "epoch": 0.30268340718857706, "grad_norm": 0.17574000358581543, "learning_rate": 0.0003488730139179702, "loss": 8.4136, "step": 2459 }, { "epoch": 0.30280649926144754, "grad_norm": 0.12267972528934479, "learning_rate": 0.00034881142997906145, "loss": 7.5495, "step": 2460 }, { "epoch": 0.30292959133431807, "grad_norm": 0.22263795137405396, "learning_rate": 0.00034874984604015273, "loss": 7.4957, "step": 2461 }, { "epoch": 0.30305268340718855, "grad_norm": 0.20991913974285126, "learning_rate": 0.00034868826210124397, "loss": 7.5666, "step": 2462 }, { "epoch": 0.3031757754800591, "grad_norm": 0.202682226896286, "learning_rate": 0.0003486266781623353, "loss": 7.5797, "step": 2463 }, { "epoch": 0.3032988675529296, "grad_norm": 0.12613512575626373, "learning_rate": 0.00034856509422342654, "loss": 7.4985, "step": 2464 }, { "epoch": 0.3034219596258001, "grad_norm": 0.2214333862066269, "learning_rate": 0.00034850351028451783, "loss": 8.2498, "step": 2465 }, { "epoch": 0.30354505169867063, "grad_norm": 0.42060646414756775, "learning_rate": 0.00034844192634560906, "loss": 8.3702, "step": 2466 }, { "epoch": 0.3036681437715411, "grad_norm": 0.0978778675198555, "learning_rate": 0.00034838034240670035, "loss": 7.4698, "step": 2467 }, { "epoch": 0.30379123584441164, "grad_norm": 0.33068451285362244, "learning_rate": 0.0003483187584677916, "loss": 8.5069, "step": 2468 }, { "epoch": 0.3039143279172821, "grad_norm": 0.08767349272966385, "learning_rate": 0.00034825717452888287, "loss": 7.4168, "step": 2469 }, { "epoch": 0.30403741999015266, "grad_norm": 0.19913965463638306, "learning_rate": 0.00034819559058997415, "loss": 8.4622, "step": 2470 }, { "epoch": 0.30416051206302314, "grad_norm": 0.1565418690443039, "learning_rate": 0.00034813400665106544, "loss": 7.4787, "step": 2471 }, { "epoch": 0.30428360413589367, "grad_norm": 0.2364269196987152, "learning_rate": 0.00034807242271215667, "loss": 7.4893, "step": 2472 }, { "epoch": 
0.30440669620876415, "grad_norm": 0.08392380177974701, "learning_rate": 0.00034801083877324796, "loss": 7.5664, "step": 2473 }, { "epoch": 0.3045297882816347, "grad_norm": 0.13879276812076569, "learning_rate": 0.0003479492548343392, "loss": 7.6528, "step": 2474 }, { "epoch": 0.30465288035450516, "grad_norm": 0.1452028602361679, "learning_rate": 0.0003478876708954305, "loss": 7.7334, "step": 2475 }, { "epoch": 0.3047759724273757, "grad_norm": 0.1251249760389328, "learning_rate": 0.00034782608695652176, "loss": 7.8947, "step": 2476 }, { "epoch": 0.3048990645002462, "grad_norm": 0.11036959290504456, "learning_rate": 0.00034776450301761305, "loss": 7.7581, "step": 2477 }, { "epoch": 0.3050221565731167, "grad_norm": 0.12258762121200562, "learning_rate": 0.0003477029190787043, "loss": 8.1772, "step": 2478 }, { "epoch": 0.3051452486459872, "grad_norm": 0.11653777956962585, "learning_rate": 0.00034764133513979557, "loss": 7.4411, "step": 2479 }, { "epoch": 0.3052683407188577, "grad_norm": 0.15426352620124817, "learning_rate": 0.0003475797512008868, "loss": 7.4892, "step": 2480 }, { "epoch": 0.3053914327917282, "grad_norm": 0.09993752837181091, "learning_rate": 0.0003475181672619781, "loss": 7.7146, "step": 2481 }, { "epoch": 0.30551452486459874, "grad_norm": 0.08600811660289764, "learning_rate": 0.0003474565833230693, "loss": 7.6403, "step": 2482 }, { "epoch": 0.3056376169374692, "grad_norm": 0.19222474098205566, "learning_rate": 0.00034739499938416066, "loss": 7.2063, "step": 2483 }, { "epoch": 0.30576070901033975, "grad_norm": 0.2396741807460785, "learning_rate": 0.0003473334154452519, "loss": 7.7966, "step": 2484 }, { "epoch": 0.3058838010832102, "grad_norm": 0.1316172033548355, "learning_rate": 0.0003472718315063432, "loss": 7.6318, "step": 2485 }, { "epoch": 0.30600689315608076, "grad_norm": 0.1342751681804657, "learning_rate": 0.0003472102475674344, "loss": 7.9613, "step": 2486 }, { "epoch": 0.30612998522895124, "grad_norm": 0.15164291858673096, "learning_rate": 
0.0003471486636285257, "loss": 7.5809, "step": 2487 }, { "epoch": 0.3062530773018218, "grad_norm": 0.10750433802604675, "learning_rate": 0.00034708707968961693, "loss": 7.3838, "step": 2488 }, { "epoch": 0.30637616937469225, "grad_norm": 0.13682527840137482, "learning_rate": 0.0003470254957507082, "loss": 7.5635, "step": 2489 }, { "epoch": 0.3064992614475628, "grad_norm": 0.10922949016094208, "learning_rate": 0.0003469639118117995, "loss": 7.7076, "step": 2490 }, { "epoch": 0.30662235352043327, "grad_norm": 0.07923515886068344, "learning_rate": 0.0003469023278728908, "loss": 7.6546, "step": 2491 }, { "epoch": 0.3067454455933038, "grad_norm": 0.11988560855388641, "learning_rate": 0.000346840743933982, "loss": 7.9759, "step": 2492 }, { "epoch": 0.3068685376661743, "grad_norm": 0.11441375315189362, "learning_rate": 0.0003467791599950733, "loss": 7.3875, "step": 2493 }, { "epoch": 0.3069916297390448, "grad_norm": 0.14947275817394257, "learning_rate": 0.00034671757605616454, "loss": 7.5488, "step": 2494 }, { "epoch": 0.3071147218119153, "grad_norm": 0.3540812134742737, "learning_rate": 0.00034665599211725583, "loss": 8.4476, "step": 2495 }, { "epoch": 0.3072378138847858, "grad_norm": 0.15631121397018433, "learning_rate": 0.00034659440817834706, "loss": 7.9946, "step": 2496 }, { "epoch": 0.3073609059576563, "grad_norm": 0.12044944614171982, "learning_rate": 0.0003465328242394384, "loss": 8.0062, "step": 2497 }, { "epoch": 0.30748399803052684, "grad_norm": 0.0973854660987854, "learning_rate": 0.00034647124030052963, "loss": 7.7395, "step": 2498 }, { "epoch": 0.3076070901033973, "grad_norm": 0.11426074057817459, "learning_rate": 0.0003464096563616209, "loss": 7.8486, "step": 2499 }, { "epoch": 0.30773018217626785, "grad_norm": 0.1153135746717453, "learning_rate": 0.00034634807242271215, "loss": 7.9692, "step": 2500 }, { "epoch": 0.30785327424913833, "grad_norm": 0.2113054394721985, "learning_rate": 0.00034628648848380344, "loss": 7.3055, "step": 2501 }, { "epoch": 
0.30797636632200887, "grad_norm": 0.17586272954940796, "learning_rate": 0.00034622490454489467, "loss": 7.4603, "step": 2502 }, { "epoch": 0.30809945839487934, "grad_norm": 0.11784110218286514, "learning_rate": 0.000346163320605986, "loss": 7.5184, "step": 2503 }, { "epoch": 0.3082225504677499, "grad_norm": 0.13136209547519684, "learning_rate": 0.00034610173666707725, "loss": 7.6165, "step": 2504 }, { "epoch": 0.30834564254062036, "grad_norm": 0.48701173067092896, "learning_rate": 0.00034604015272816853, "loss": 9.3083, "step": 2505 }, { "epoch": 0.3084687346134909, "grad_norm": 0.18093638122081757, "learning_rate": 0.00034597856878925976, "loss": 8.1046, "step": 2506 }, { "epoch": 0.3085918266863614, "grad_norm": 0.18187901377677917, "learning_rate": 0.00034591698485035105, "loss": 8.3444, "step": 2507 }, { "epoch": 0.3087149187592319, "grad_norm": 0.10928213596343994, "learning_rate": 0.0003458554009114423, "loss": 7.9945, "step": 2508 }, { "epoch": 0.30883801083210244, "grad_norm": 0.375457763671875, "learning_rate": 0.00034579381697253357, "loss": 7.2382, "step": 2509 }, { "epoch": 0.3089611029049729, "grad_norm": 0.13753360509872437, "learning_rate": 0.00034573223303362486, "loss": 8.0602, "step": 2510 }, { "epoch": 0.30908419497784345, "grad_norm": 0.25669315457344055, "learning_rate": 0.00034567064909471614, "loss": 7.3577, "step": 2511 }, { "epoch": 0.30920728705071393, "grad_norm": 0.15099065005779266, "learning_rate": 0.0003456090651558074, "loss": 8.4431, "step": 2512 }, { "epoch": 0.30933037912358446, "grad_norm": 0.171762615442276, "learning_rate": 0.00034554748121689866, "loss": 7.3678, "step": 2513 }, { "epoch": 0.30945347119645494, "grad_norm": 0.10080911964178085, "learning_rate": 0.0003454858972779899, "loss": 7.4531, "step": 2514 }, { "epoch": 0.3095765632693255, "grad_norm": 0.12361431866884232, "learning_rate": 0.0003454243133390812, "loss": 7.7522, "step": 2515 }, { "epoch": 0.30969965534219596, "grad_norm": 0.19486989080905914, 
"learning_rate": 0.0003453627294001724, "loss": 7.684, "step": 2516 }, { "epoch": 0.3098227474150665, "grad_norm": 0.6109568476676941, "learning_rate": 0.00034530114546126375, "loss": 9.5348, "step": 2517 }, { "epoch": 0.30994583948793697, "grad_norm": 0.268147349357605, "learning_rate": 0.000345239561522355, "loss": 7.8969, "step": 2518 }, { "epoch": 0.3100689315608075, "grad_norm": 0.164151132106781, "learning_rate": 0.0003451779775834463, "loss": 7.596, "step": 2519 }, { "epoch": 0.310192023633678, "grad_norm": 0.1711287647485733, "learning_rate": 0.0003451163936445375, "loss": 7.6248, "step": 2520 }, { "epoch": 0.3103151157065485, "grad_norm": 0.1882629245519638, "learning_rate": 0.0003450548097056288, "loss": 7.6366, "step": 2521 }, { "epoch": 0.310438207779419, "grad_norm": 0.11705846339464188, "learning_rate": 0.00034499322576672, "loss": 7.538, "step": 2522 }, { "epoch": 0.31056129985228953, "grad_norm": 0.08659765869379044, "learning_rate": 0.0003449316418278113, "loss": 7.7639, "step": 2523 }, { "epoch": 0.31068439192516, "grad_norm": 0.34842947125434875, "learning_rate": 0.0003448700578889026, "loss": 8.5652, "step": 2524 }, { "epoch": 0.31080748399803054, "grad_norm": 0.23018339276313782, "learning_rate": 0.0003448084739499939, "loss": 8.0238, "step": 2525 }, { "epoch": 0.310930576070901, "grad_norm": 0.1475507915019989, "learning_rate": 0.0003447468900110851, "loss": 7.6391, "step": 2526 }, { "epoch": 0.31105366814377156, "grad_norm": 0.10167615860700607, "learning_rate": 0.0003446853060721764, "loss": 7.6432, "step": 2527 }, { "epoch": 0.31117676021664203, "grad_norm": 0.16715148091316223, "learning_rate": 0.00034462372213326764, "loss": 7.4921, "step": 2528 }, { "epoch": 0.31129985228951257, "grad_norm": 0.09306558221578598, "learning_rate": 0.0003445621381943589, "loss": 7.7529, "step": 2529 }, { "epoch": 0.31142294436238305, "grad_norm": 0.10308527201414108, "learning_rate": 0.0003445005542554502, "loss": 7.6961, "step": 2530 }, { "epoch": 
0.3115460364352536, "grad_norm": 0.1340814083814621, "learning_rate": 0.0003444389703165415, "loss": 7.8983, "step": 2531 }, { "epoch": 0.31166912850812406, "grad_norm": 0.380424439907074, "learning_rate": 0.00034437738637763273, "loss": 9.475, "step": 2532 }, { "epoch": 0.3117922205809946, "grad_norm": 0.19025516510009766, "learning_rate": 0.000344315802438724, "loss": 8.3221, "step": 2533 }, { "epoch": 0.3119153126538651, "grad_norm": 0.08663443475961685, "learning_rate": 0.00034425421849981525, "loss": 7.9328, "step": 2534 }, { "epoch": 0.3120384047267356, "grad_norm": 0.1908007264137268, "learning_rate": 0.00034419263456090653, "loss": 7.6688, "step": 2535 }, { "epoch": 0.3121614967996061, "grad_norm": 0.22786380350589752, "learning_rate": 0.00034413105062199777, "loss": 7.6001, "step": 2536 }, { "epoch": 0.3122845888724766, "grad_norm": 0.24297793209552765, "learning_rate": 0.0003440694666830891, "loss": 9.0054, "step": 2537 }, { "epoch": 0.3124076809453471, "grad_norm": 0.19801440834999084, "learning_rate": 0.00034400788274418034, "loss": 7.6183, "step": 2538 }, { "epoch": 0.31253077301821763, "grad_norm": 0.08182121068239212, "learning_rate": 0.0003439462988052716, "loss": 7.7097, "step": 2539 }, { "epoch": 0.3126538650910881, "grad_norm": 0.1463521271944046, "learning_rate": 0.00034388471486636286, "loss": 7.6751, "step": 2540 }, { "epoch": 0.31277695716395865, "grad_norm": 0.11689490079879761, "learning_rate": 0.00034382313092745414, "loss": 7.5885, "step": 2541 }, { "epoch": 0.3129000492368291, "grad_norm": 0.23485365509986877, "learning_rate": 0.0003437615469885454, "loss": 7.9591, "step": 2542 }, { "epoch": 0.31302314130969966, "grad_norm": 0.2553444504737854, "learning_rate": 0.00034369996304963666, "loss": 8.0928, "step": 2543 }, { "epoch": 0.31314623338257014, "grad_norm": 0.12679898738861084, "learning_rate": 0.00034363837911072795, "loss": 7.7323, "step": 2544 }, { "epoch": 0.3132693254554407, "grad_norm": 0.12841932475566864, "learning_rate": 
0.00034357679517181924, "loss": 7.7516, "step": 2545 }, { "epoch": 0.31339241752831115, "grad_norm": 0.24359458684921265, "learning_rate": 0.00034351521123291047, "loss": 8.7956, "step": 2546 }, { "epoch": 0.3135155096011817, "grad_norm": 0.28895723819732666, "learning_rate": 0.00034345362729400176, "loss": 9.059, "step": 2547 }, { "epoch": 0.31363860167405216, "grad_norm": 0.34329643845558167, "learning_rate": 0.000343392043355093, "loss": 7.3728, "step": 2548 }, { "epoch": 0.3137616937469227, "grad_norm": 0.32297688722610474, "learning_rate": 0.0003433304594161843, "loss": 8.3876, "step": 2549 }, { "epoch": 0.31388478581979323, "grad_norm": 0.3767348825931549, "learning_rate": 0.0003432688754772755, "loss": 7.4632, "step": 2550 }, { "epoch": 0.3140078778926637, "grad_norm": 0.3191923499107361, "learning_rate": 0.00034320729153836685, "loss": 7.5321, "step": 2551 }, { "epoch": 0.31413096996553425, "grad_norm": 0.23328174650669098, "learning_rate": 0.0003431457075994581, "loss": 8.842, "step": 2552 }, { "epoch": 0.3142540620384047, "grad_norm": 0.295292466878891, "learning_rate": 0.00034308412366054937, "loss": 9.1544, "step": 2553 }, { "epoch": 0.31437715411127526, "grad_norm": 0.1688702404499054, "learning_rate": 0.0003430225397216406, "loss": 7.4881, "step": 2554 }, { "epoch": 0.31450024618414574, "grad_norm": 0.1726287305355072, "learning_rate": 0.0003429609557827319, "loss": 7.4252, "step": 2555 }, { "epoch": 0.31462333825701627, "grad_norm": 0.13318225741386414, "learning_rate": 0.0003428993718438231, "loss": 7.7408, "step": 2556 }, { "epoch": 0.31474643032988675, "grad_norm": 0.11381927132606506, "learning_rate": 0.00034283778790491435, "loss": 7.4662, "step": 2557 }, { "epoch": 0.3148695224027573, "grad_norm": 0.2943710386753082, "learning_rate": 0.0003427762039660057, "loss": 8.3504, "step": 2558 }, { "epoch": 0.31499261447562776, "grad_norm": 0.1376105695962906, "learning_rate": 0.0003427146200270969, "loss": 7.4098, "step": 2559 }, { "epoch": 
0.3151157065484983, "grad_norm": 0.10818928480148315, "learning_rate": 0.0003426530360881882, "loss": 7.6027, "step": 2560 }, { "epoch": 0.3152387986213688, "grad_norm": 0.09395131468772888, "learning_rate": 0.00034259145214927944, "loss": 7.8743, "step": 2561 }, { "epoch": 0.3153618906942393, "grad_norm": 0.5317927002906799, "learning_rate": 0.00034252986821037073, "loss": 9.6023, "step": 2562 }, { "epoch": 0.3154849827671098, "grad_norm": 0.19348959624767303, "learning_rate": 0.00034246828427146196, "loss": 7.7707, "step": 2563 }, { "epoch": 0.3156080748399803, "grad_norm": 0.24999035894870758, "learning_rate": 0.0003424067003325533, "loss": 7.872, "step": 2564 }, { "epoch": 0.3157311669128508, "grad_norm": 0.1300775110721588, "learning_rate": 0.00034234511639364454, "loss": 8.5568, "step": 2565 }, { "epoch": 0.31585425898572134, "grad_norm": 0.24479486048221588, "learning_rate": 0.0003422835324547358, "loss": 8.0564, "step": 2566 }, { "epoch": 0.3159773510585918, "grad_norm": 0.2865169942378998, "learning_rate": 0.00034222194851582705, "loss": 7.3265, "step": 2567 }, { "epoch": 0.31610044313146235, "grad_norm": 0.10455130785703659, "learning_rate": 0.00034216036457691834, "loss": 7.505, "step": 2568 }, { "epoch": 0.31622353520433283, "grad_norm": 0.13426172733306885, "learning_rate": 0.0003420987806380096, "loss": 7.1457, "step": 2569 }, { "epoch": 0.31634662727720336, "grad_norm": 0.17553266882896423, "learning_rate": 0.00034203719669910086, "loss": 7.4897, "step": 2570 }, { "epoch": 0.31646971935007384, "grad_norm": 0.3363594710826874, "learning_rate": 0.00034197561276019215, "loss": 7.6258, "step": 2571 }, { "epoch": 0.3165928114229444, "grad_norm": 0.317435622215271, "learning_rate": 0.00034191402882128343, "loss": 7.7377, "step": 2572 }, { "epoch": 0.31671590349581485, "grad_norm": 0.30667954683303833, "learning_rate": 0.00034185244488237467, "loss": 7.5567, "step": 2573 }, { "epoch": 0.3168389955686854, "grad_norm": 0.43101048469543457, "learning_rate": 
0.00034179086094346595, "loss": 8.7559, "step": 2574 }, { "epoch": 0.31696208764155587, "grad_norm": 0.09866824746131897, "learning_rate": 0.0003417292770045572, "loss": 7.5382, "step": 2575 }, { "epoch": 0.3170851797144264, "grad_norm": 0.1616678088903427, "learning_rate": 0.00034166769306564847, "loss": 7.7302, "step": 2576 }, { "epoch": 0.3172082717872969, "grad_norm": 0.186334028840065, "learning_rate": 0.0003416061091267397, "loss": 7.5893, "step": 2577 }, { "epoch": 0.3173313638601674, "grad_norm": 0.27598199248313904, "learning_rate": 0.00034154452518783104, "loss": 9.2818, "step": 2578 }, { "epoch": 0.3174544559330379, "grad_norm": 0.20428566634655, "learning_rate": 0.0003414829412489223, "loss": 7.6345, "step": 2579 }, { "epoch": 0.3175775480059084, "grad_norm": 0.22395044565200806, "learning_rate": 0.00034142135731001356, "loss": 7.8733, "step": 2580 }, { "epoch": 0.3177006400787789, "grad_norm": 0.15447139739990234, "learning_rate": 0.0003413597733711048, "loss": 7.7765, "step": 2581 }, { "epoch": 0.31782373215164944, "grad_norm": 0.12334378063678741, "learning_rate": 0.0003412981894321961, "loss": 7.5239, "step": 2582 }, { "epoch": 0.3179468242245199, "grad_norm": 0.26867127418518066, "learning_rate": 0.0003412366054932873, "loss": 8.307, "step": 2583 }, { "epoch": 0.31806991629739045, "grad_norm": 0.15911194682121277, "learning_rate": 0.0003411750215543786, "loss": 7.5548, "step": 2584 }, { "epoch": 0.31819300837026093, "grad_norm": 0.12009849399328232, "learning_rate": 0.0003411134376154699, "loss": 7.4542, "step": 2585 }, { "epoch": 0.31831610044313147, "grad_norm": 0.09401271492242813, "learning_rate": 0.0003410518536765612, "loss": 7.4711, "step": 2586 }, { "epoch": 0.31843919251600195, "grad_norm": 0.1551409661769867, "learning_rate": 0.0003409902697376524, "loss": 7.4805, "step": 2587 }, { "epoch": 0.3185622845888725, "grad_norm": 0.1516966074705124, "learning_rate": 0.0003409286857987437, "loss": 7.4416, "step": 2588 }, { "epoch": 
0.31868537666174296, "grad_norm": 0.0850227102637291, "learning_rate": 0.0003408671018598349, "loss": 7.6086, "step": 2589 }, { "epoch": 0.3188084687346135, "grad_norm": 0.08601994812488556, "learning_rate": 0.0003408055179209262, "loss": 7.3647, "step": 2590 }, { "epoch": 0.31893156080748397, "grad_norm": 0.08292805403470993, "learning_rate": 0.0003407439339820175, "loss": 7.4667, "step": 2591 }, { "epoch": 0.3190546528803545, "grad_norm": 0.2857200801372528, "learning_rate": 0.0003406823500431088, "loss": 8.3416, "step": 2592 }, { "epoch": 0.31917774495322504, "grad_norm": 0.24413304030895233, "learning_rate": 0.0003406207661042, "loss": 7.9696, "step": 2593 }, { "epoch": 0.3193008370260955, "grad_norm": 0.09466582536697388, "learning_rate": 0.0003405591821652913, "loss": 7.4296, "step": 2594 }, { "epoch": 0.31942392909896605, "grad_norm": 0.15397164225578308, "learning_rate": 0.00034049759822638254, "loss": 7.1931, "step": 2595 }, { "epoch": 0.31954702117183653, "grad_norm": 0.29353976249694824, "learning_rate": 0.0003404360142874738, "loss": 8.4866, "step": 2596 }, { "epoch": 0.31967011324470707, "grad_norm": 0.2697232663631439, "learning_rate": 0.00034037443034856506, "loss": 8.3693, "step": 2597 }, { "epoch": 0.31979320531757754, "grad_norm": 0.12961331009864807, "learning_rate": 0.0003403128464096564, "loss": 8.2173, "step": 2598 }, { "epoch": 0.3199162973904481, "grad_norm": 0.15921783447265625, "learning_rate": 0.00034025126247074763, "loss": 7.5197, "step": 2599 }, { "epoch": 0.32003938946331856, "grad_norm": 0.20628157258033752, "learning_rate": 0.0003401896785318389, "loss": 8.7577, "step": 2600 }, { "epoch": 0.3201624815361891, "grad_norm": 0.26980888843536377, "learning_rate": 0.00034012809459293015, "loss": 7.6154, "step": 2601 }, { "epoch": 0.32028557360905957, "grad_norm": 0.13768164813518524, "learning_rate": 0.00034006651065402143, "loss": 8.5575, "step": 2602 }, { "epoch": 0.3204086656819301, "grad_norm": 0.1826036274433136, "learning_rate": 
0.00034000492671511267, "loss": 7.8916, "step": 2603 }, { "epoch": 0.3205317577548006, "grad_norm": 0.25368547439575195, "learning_rate": 0.00033994334277620395, "loss": 7.4442, "step": 2604 }, { "epoch": 0.3206548498276711, "grad_norm": 0.30667755007743835, "learning_rate": 0.00033988175883729524, "loss": 9.3401, "step": 2605 }, { "epoch": 0.3207779419005416, "grad_norm": 0.19003568589687347, "learning_rate": 0.0003398201748983865, "loss": 7.5895, "step": 2606 }, { "epoch": 0.32090103397341213, "grad_norm": 0.2511013150215149, "learning_rate": 0.00033975859095947776, "loss": 7.1821, "step": 2607 }, { "epoch": 0.3210241260462826, "grad_norm": 0.2735559046268463, "learning_rate": 0.00033969700702056905, "loss": 8.0012, "step": 2608 }, { "epoch": 0.32114721811915314, "grad_norm": 0.11071956902742386, "learning_rate": 0.0003396354230816603, "loss": 7.4161, "step": 2609 }, { "epoch": 0.3212703101920236, "grad_norm": 0.19113412499427795, "learning_rate": 0.00033957383914275157, "loss": 8.0606, "step": 2610 }, { "epoch": 0.32139340226489416, "grad_norm": 0.33173811435699463, "learning_rate": 0.0003395122552038428, "loss": 8.3185, "step": 2611 }, { "epoch": 0.32151649433776464, "grad_norm": 0.15025030076503754, "learning_rate": 0.00033945067126493414, "loss": 8.2334, "step": 2612 }, { "epoch": 0.32163958641063517, "grad_norm": 0.13600674271583557, "learning_rate": 0.00033938908732602537, "loss": 8.2405, "step": 2613 }, { "epoch": 0.32176267848350565, "grad_norm": 0.19218826293945312, "learning_rate": 0.00033932750338711666, "loss": 7.5723, "step": 2614 }, { "epoch": 0.3218857705563762, "grad_norm": 0.11086178570985794, "learning_rate": 0.0003392659194482079, "loss": 8.6763, "step": 2615 }, { "epoch": 0.32200886262924666, "grad_norm": 0.23552709817886353, "learning_rate": 0.0003392043355092992, "loss": 7.7166, "step": 2616 }, { "epoch": 0.3221319547021172, "grad_norm": 0.2791996896266937, "learning_rate": 0.0003391427515703904, "loss": 7.8932, "step": 2617 }, { "epoch": 
0.3222550467749877, "grad_norm": null, "learning_rate": 0.00033908116763148175, "loss": 7.8142, "step": 2618 }, { "epoch": 0.3223781388478582, "grad_norm": 0.276032418012619, "learning_rate": 0.000339019583692573, "loss": 7.6985, "step": 2619 }, { "epoch": 0.3225012309207287, "grad_norm": 0.7896894812583923, "learning_rate": 0.00033895799975366427, "loss": 7.6852, "step": 2620 }, { "epoch": 0.3226243229935992, "grad_norm": 0.7185707092285156, "learning_rate": 0.0003388964158147555, "loss": 8.263, "step": 2621 }, { "epoch": 0.3227474150664697, "grad_norm": 0.1447768658399582, "learning_rate": 0.0003388348318758468, "loss": 7.8401, "step": 2622 }, { "epoch": 0.32287050713934023, "grad_norm": 0.49789828062057495, "learning_rate": 0.000338773247936938, "loss": 7.4915, "step": 2623 }, { "epoch": 0.3229935992122107, "grad_norm": 0.14748379588127136, "learning_rate": 0.0003387116639980293, "loss": 8.4497, "step": 2624 }, { "epoch": 0.32311669128508125, "grad_norm": 0.24942581355571747, "learning_rate": 0.0003386500800591206, "loss": 7.4048, "step": 2625 }, { "epoch": 0.3232397833579517, "grad_norm": 0.5720309019088745, "learning_rate": 0.0003385884961202119, "loss": 8.3381, "step": 2626 }, { "epoch": 0.32336287543082226, "grad_norm": 0.3835671544075012, "learning_rate": 0.0003385269121813031, "loss": 7.2091, "step": 2627 }, { "epoch": 0.32348596750369274, "grad_norm": 0.4947988986968994, "learning_rate": 0.0003384653282423944, "loss": 8.5119, "step": 2628 }, { "epoch": 0.3236090595765633, "grad_norm": 0.31220030784606934, "learning_rate": 0.00033840374430348563, "loss": 7.4983, "step": 2629 }, { "epoch": 0.32373215164943375, "grad_norm": 0.672565221786499, "learning_rate": 0.0003383421603645769, "loss": 7.2388, "step": 2630 }, { "epoch": 0.3238552437223043, "grad_norm": 0.43291351199150085, "learning_rate": 0.00033828057642566815, "loss": 7.3955, "step": 2631 }, { "epoch": 0.32397833579517477, "grad_norm": 0.12228111922740936, "learning_rate": 0.0003382189924867595,
"loss": 7.4945, "step": 2632 }, { "epoch": 0.3241014278680453, "grad_norm": 0.35515090823173523, "learning_rate": 0.0003381574085478507, "loss": 7.5929, "step": 2633 }, { "epoch": 0.3242245199409158, "grad_norm": 0.6029056310653687, "learning_rate": 0.000338095824608942, "loss": 8.1809, "step": 2634 }, { "epoch": 0.3243476120137863, "grad_norm": 0.39543822407722473, "learning_rate": 0.00033803424067003324, "loss": 7.7556, "step": 2635 }, { "epoch": 0.32447070408665685, "grad_norm": 0.19205492734909058, "learning_rate": 0.00033797265673112453, "loss": 7.5081, "step": 2636 }, { "epoch": 0.3245937961595273, "grad_norm": 0.2347155511379242, "learning_rate": 0.00033791107279221576, "loss": 7.623, "step": 2637 }, { "epoch": 0.32471688823239786, "grad_norm": 0.1754843294620514, "learning_rate": 0.00033784948885330705, "loss": 7.8706, "step": 2638 }, { "epoch": 0.32483998030526834, "grad_norm": 0.10066582262516022, "learning_rate": 0.00033778790491439833, "loss": 7.6981, "step": 2639 }, { "epoch": 0.3249630723781389, "grad_norm": 0.18040253221988678, "learning_rate": 0.0003377263209754896, "loss": 7.3419, "step": 2640 }, { "epoch": 0.32508616445100935, "grad_norm": 0.2552955448627472, "learning_rate": 0.00033766473703658085, "loss": 7.8651, "step": 2641 }, { "epoch": 0.3252092565238799, "grad_norm": 0.17239105701446533, "learning_rate": 0.00033760315309767214, "loss": 7.3862, "step": 2642 }, { "epoch": 0.32533234859675036, "grad_norm": 0.2027781754732132, "learning_rate": 0.00033754156915876337, "loss": 8.3426, "step": 2643 }, { "epoch": 0.3254554406696209, "grad_norm": 0.22800208628177643, "learning_rate": 0.00033747998521985466, "loss": 7.5979, "step": 2644 }, { "epoch": 0.3255785327424914, "grad_norm": 0.20577014982700348, "learning_rate": 0.00033741840128094595, "loss": 8.0903, "step": 2645 }, { "epoch": 0.3257016248153619, "grad_norm": 0.2977149784564972, "learning_rate": 0.00033735681734203723, "loss": 7.5493, "step": 2646 }, { "epoch": 0.3258247168882324, 
"grad_norm": 0.1553916335105896, "learning_rate": 0.00033729523340312846, "loss": 7.5887, "step": 2647 }, { "epoch": 0.3259478089611029, "grad_norm": 0.5043690204620361, "learning_rate": 0.00033723364946421975, "loss": 9.1212, "step": 2648 }, { "epoch": 0.3260709010339734, "grad_norm": 0.1771317422389984, "learning_rate": 0.000337172065525311, "loss": 7.7719, "step": 2649 }, { "epoch": 0.32619399310684394, "grad_norm": 0.15302902460098267, "learning_rate": 0.00033711048158640227, "loss": 8.7337, "step": 2650 }, { "epoch": 0.3263170851797144, "grad_norm": 0.3500899076461792, "learning_rate": 0.0003370488976474935, "loss": 7.7305, "step": 2651 }, { "epoch": 0.32644017725258495, "grad_norm": 0.3812723755836487, "learning_rate": 0.00033698731370858484, "loss": 7.4215, "step": 2652 }, { "epoch": 0.32656326932545543, "grad_norm": 0.27290502190589905, "learning_rate": 0.0003369257297696761, "loss": 7.4353, "step": 2653 }, { "epoch": 0.32668636139832596, "grad_norm": 0.18497243523597717, "learning_rate": 0.00033686414583076736, "loss": 7.6313, "step": 2654 }, { "epoch": 0.32680945347119644, "grad_norm": 0.218612939119339, "learning_rate": 0.0003368025618918586, "loss": 7.3476, "step": 2655 }, { "epoch": 0.326932545544067, "grad_norm": 0.25923240184783936, "learning_rate": 0.0003367409779529499, "loss": 7.5148, "step": 2656 }, { "epoch": 0.32705563761693746, "grad_norm": 0.17865116894245148, "learning_rate": 0.0003366793940140411, "loss": 7.6514, "step": 2657 }, { "epoch": 0.327178729689808, "grad_norm": 0.17957602441310883, "learning_rate": 0.0003366178100751324, "loss": 7.6457, "step": 2658 }, { "epoch": 0.32730182176267847, "grad_norm": 0.20035865902900696, "learning_rate": 0.0003365562261362237, "loss": 7.6459, "step": 2659 }, { "epoch": 0.327424913835549, "grad_norm": 0.22507211565971375, "learning_rate": 0.000336494642197315, "loss": 7.5654, "step": 2660 }, { "epoch": 0.3275480059084195, "grad_norm": 0.272672563791275, "learning_rate": 0.0003364330582584062, "loss": 
8.653, "step": 2661 }, { "epoch": 0.32767109798129, "grad_norm": 0.09747762978076935, "learning_rate": 0.0003363714743194975, "loss": 7.6963, "step": 2662 }, { "epoch": 0.3277941900541605, "grad_norm": 0.3909226655960083, "learning_rate": 0.0003363098903805887, "loss": 9.0838, "step": 2663 }, { "epoch": 0.32791728212703103, "grad_norm": 0.2160932570695877, "learning_rate": 0.00033624830644168, "loss": 7.3556, "step": 2664 }, { "epoch": 0.3280403741999015, "grad_norm": 0.121767058968544, "learning_rate": 0.00033618672250277124, "loss": 7.6053, "step": 2665 }, { "epoch": 0.32816346627277204, "grad_norm": 0.10312572121620178, "learning_rate": 0.0003361251385638626, "loss": 7.2867, "step": 2666 }, { "epoch": 0.3282865583456425, "grad_norm": 0.21261370182037354, "learning_rate": 0.0003360635546249538, "loss": 7.5882, "step": 2667 }, { "epoch": 0.32840965041851305, "grad_norm": 0.18903914093971252, "learning_rate": 0.0003360019706860451, "loss": 7.7683, "step": 2668 }, { "epoch": 0.32853274249138353, "grad_norm": 0.07646719366312027, "learning_rate": 0.00033594038674713634, "loss": 7.449, "step": 2669 }, { "epoch": 0.32865583456425407, "grad_norm": 0.1973128467798233, "learning_rate": 0.0003358788028082276, "loss": 8.7021, "step": 2670 }, { "epoch": 0.32877892663712455, "grad_norm": 0.23299816250801086, "learning_rate": 0.00033581721886931885, "loss": 8.0683, "step": 2671 }, { "epoch": 0.3289020187099951, "grad_norm": 0.2277687042951584, "learning_rate": 0.0003357556349304102, "loss": 7.5824, "step": 2672 }, { "epoch": 0.32902511078286556, "grad_norm": 0.2156403362751007, "learning_rate": 0.00033569405099150143, "loss": 7.4699, "step": 2673 }, { "epoch": 0.3291482028557361, "grad_norm": 0.38407543301582336, "learning_rate": 0.0003356324670525927, "loss": 8.5235, "step": 2674 }, { "epoch": 0.3292712949286066, "grad_norm": 0.30719342827796936, "learning_rate": 0.00033557088311368395, "loss": 7.9932, "step": 2675 }, { "epoch": 0.3293943870014771, "grad_norm": 
0.17418888211250305, "learning_rate": 0.00033550929917477523, "loss": 7.2737, "step": 2676 }, { "epoch": 0.3295174790743476, "grad_norm": 0.30795374512672424, "learning_rate": 0.00033544771523586647, "loss": 8.8411, "step": 2677 }, { "epoch": 0.3296405711472181, "grad_norm": 0.2208373099565506, "learning_rate": 0.00033538613129695775, "loss": 7.5453, "step": 2678 }, { "epoch": 0.3297636632200886, "grad_norm": 0.2662060856819153, "learning_rate": 0.00033532454735804904, "loss": 7.4557, "step": 2679 }, { "epoch": 0.32988675529295913, "grad_norm": 0.2171526402235031, "learning_rate": 0.0003352629634191403, "loss": 7.983, "step": 2680 }, { "epoch": 0.33000984736582967, "grad_norm": 0.12624040246009827, "learning_rate": 0.00033520137948023156, "loss": 7.8521, "step": 2681 }, { "epoch": 0.33013293943870015, "grad_norm": 0.1432139277458191, "learning_rate": 0.00033513979554132284, "loss": 7.5422, "step": 2682 }, { "epoch": 0.3302560315115707, "grad_norm": 0.29349440336227417, "learning_rate": 0.0003350782116024141, "loss": 8.2083, "step": 2683 }, { "epoch": 0.33037912358444116, "grad_norm": 0.2452782541513443, "learning_rate": 0.00033501662766350536, "loss": 7.8084, "step": 2684 }, { "epoch": 0.3305022156573117, "grad_norm": 0.3617062568664551, "learning_rate": 0.0003349550437245966, "loss": 8.6737, "step": 2685 }, { "epoch": 0.33062530773018217, "grad_norm": 0.18143272399902344, "learning_rate": 0.00033489345978568794, "loss": 7.5798, "step": 2686 }, { "epoch": 0.3307483998030527, "grad_norm": 0.269168496131897, "learning_rate": 0.00033483187584677917, "loss": 8.1274, "step": 2687 }, { "epoch": 0.3308714918759232, "grad_norm": 0.32844477891921997, "learning_rate": 0.00033477029190787046, "loss": 7.3614, "step": 2688 }, { "epoch": 0.3309945839487937, "grad_norm": 0.20443807542324066, "learning_rate": 0.0003347087079689617, "loss": 7.5402, "step": 2689 }, { "epoch": 0.3311176760216642, "grad_norm": 0.35533082485198975, "learning_rate": 0.000334647124030053, "loss": 8.9924, 
"step": 2690 }, { "epoch": 0.33124076809453473, "grad_norm": 0.20978394150733948, "learning_rate": 0.0003345855400911442, "loss": 7.7521, "step": 2691 }, { "epoch": 0.3313638601674052, "grad_norm": 0.27442753314971924, "learning_rate": 0.0003345239561522355, "loss": 8.0262, "step": 2692 }, { "epoch": 0.33148695224027575, "grad_norm": 0.0912211611866951, "learning_rate": 0.0003344623722133268, "loss": 7.5424, "step": 2693 }, { "epoch": 0.3316100443131462, "grad_norm": 0.09644223749637604, "learning_rate": 0.00033440078827441807, "loss": 7.9582, "step": 2694 }, { "epoch": 0.33173313638601676, "grad_norm": 0.08725704997777939, "learning_rate": 0.0003343392043355093, "loss": 8.4651, "step": 2695 }, { "epoch": 0.33185622845888724, "grad_norm": 0.1277955025434494, "learning_rate": 0.0003342776203966006, "loss": 8.398, "step": 2696 }, { "epoch": 0.33197932053175777, "grad_norm": 0.2825101613998413, "learning_rate": 0.0003342160364576918, "loss": 7.4975, "step": 2697 }, { "epoch": 0.33210241260462825, "grad_norm": 0.2461472749710083, "learning_rate": 0.0003341544525187831, "loss": 7.4919, "step": 2698 }, { "epoch": 0.3322255046774988, "grad_norm": 0.09872845560312271, "learning_rate": 0.0003340928685798744, "loss": 7.6103, "step": 2699 }, { "epoch": 0.33234859675036926, "grad_norm": 0.3460959792137146, "learning_rate": 0.0003340312846409657, "loss": 8.3314, "step": 2700 }, { "epoch": 0.3324716888232398, "grad_norm": 0.26052868366241455, "learning_rate": 0.0003339697007020569, "loss": 7.8196, "step": 2701 }, { "epoch": 0.3325947808961103, "grad_norm": 0.14886200428009033, "learning_rate": 0.0003339081167631482, "loss": 7.5448, "step": 2702 }, { "epoch": 0.3327178729689808, "grad_norm": 0.10945572704076767, "learning_rate": 0.00033384653282423943, "loss": 7.4764, "step": 2703 }, { "epoch": 0.3328409650418513, "grad_norm": 0.11944165080785751, "learning_rate": 0.0003337849488853307, "loss": 7.9888, "step": 2704 }, { "epoch": 0.3329640571147218, "grad_norm": 
0.19406236708164215, "learning_rate": 0.00033372336494642195, "loss": 7.5405, "step": 2705 }, { "epoch": 0.3330871491875923, "grad_norm": 0.1320752203464508, "learning_rate": 0.0003336617810075133, "loss": 8.106, "step": 2706 }, { "epoch": 0.33321024126046284, "grad_norm": 0.11555083841085434, "learning_rate": 0.0003336001970686045, "loss": 8.0195, "step": 2707 }, { "epoch": 0.3333333333333333, "grad_norm": 0.10726267099380493, "learning_rate": 0.0003335386131296958, "loss": 7.4263, "step": 2708 }, { "epoch": 0.33345642540620385, "grad_norm": 0.08777716755867004, "learning_rate": 0.00033347702919078704, "loss": 7.3006, "step": 2709 }, { "epoch": 0.33357951747907433, "grad_norm": 0.22898465394973755, "learning_rate": 0.00033341544525187833, "loss": 7.5913, "step": 2710 }, { "epoch": 0.33370260955194486, "grad_norm": 0.29241999983787537, "learning_rate": 0.00033335386131296956, "loss": 7.9866, "step": 2711 }, { "epoch": 0.33382570162481534, "grad_norm": 0.22370566427707672, "learning_rate": 0.00033329227737406085, "loss": 7.6196, "step": 2712 }, { "epoch": 0.3339487936976859, "grad_norm": 0.18347597122192383, "learning_rate": 0.00033323069343515213, "loss": 8.0476, "step": 2713 }, { "epoch": 0.33407188577055635, "grad_norm": 0.28236547112464905, "learning_rate": 0.0003331691094962434, "loss": 7.1713, "step": 2714 }, { "epoch": 0.3341949778434269, "grad_norm": 0.19039680063724518, "learning_rate": 0.00033310752555733465, "loss": 7.5564, "step": 2715 }, { "epoch": 0.33431806991629737, "grad_norm": 0.12564627826213837, "learning_rate": 0.00033304594161842594, "loss": 7.9638, "step": 2716 }, { "epoch": 0.3344411619891679, "grad_norm": 0.21878935396671295, "learning_rate": 0.00033298435767951717, "loss": 7.2842, "step": 2717 }, { "epoch": 0.3345642540620384, "grad_norm": 0.12273790687322617, "learning_rate": 0.00033292277374060846, "loss": 7.792, "step": 2718 }, { "epoch": 0.3346873461349089, "grad_norm": 0.12110309302806854, "learning_rate": 0.0003328611898016997, 
"loss": 7.4039, "step": 2719 }, { "epoch": 0.3348104382077794, "grad_norm": 0.1444435864686966, "learning_rate": 0.00033279960586279103, "loss": 7.5244, "step": 2720 }, { "epoch": 0.3349335302806499, "grad_norm": 0.11470379680395126, "learning_rate": 0.00033273802192388226, "loss": 7.5658, "step": 2721 }, { "epoch": 0.3350566223535204, "grad_norm": 0.08522561937570572, "learning_rate": 0.00033267643798497355, "loss": 7.5761, "step": 2722 }, { "epoch": 0.33517971442639094, "grad_norm": 0.12840180099010468, "learning_rate": 0.0003326148540460648, "loss": 7.49, "step": 2723 }, { "epoch": 0.3353028064992615, "grad_norm": 0.07656507939100266, "learning_rate": 0.00033255327010715607, "loss": 7.6528, "step": 2724 }, { "epoch": 0.33542589857213195, "grad_norm": 0.08732561767101288, "learning_rate": 0.0003324916861682473, "loss": 7.7217, "step": 2725 }, { "epoch": 0.3355489906450025, "grad_norm": 0.11707284301519394, "learning_rate": 0.00033243010222933864, "loss": 7.438, "step": 2726 }, { "epoch": 0.33567208271787297, "grad_norm": 0.08352886140346527, "learning_rate": 0.0003323685182904299, "loss": 7.4981, "step": 2727 }, { "epoch": 0.3357951747907435, "grad_norm": 0.13923917710781097, "learning_rate": 0.00033230693435152116, "loss": 7.5055, "step": 2728 }, { "epoch": 0.335918266863614, "grad_norm": 0.2709885835647583, "learning_rate": 0.0003322453504126124, "loss": 8.2315, "step": 2729 }, { "epoch": 0.3360413589364845, "grad_norm": 0.4183504283428192, "learning_rate": 0.0003321837664737037, "loss": 7.7864, "step": 2730 }, { "epoch": 0.336164451009355, "grad_norm": 0.16308562457561493, "learning_rate": 0.0003321221825347949, "loss": 7.4878, "step": 2731 }, { "epoch": 0.3362875430822255, "grad_norm": 0.11838759481906891, "learning_rate": 0.0003320605985958862, "loss": 7.822, "step": 2732 }, { "epoch": 0.336410635155096, "grad_norm": 0.23689548671245575, "learning_rate": 0.0003319990146569775, "loss": 7.3488, "step": 2733 }, { "epoch": 0.33653372722796654, "grad_norm": 
0.12481234222650528, "learning_rate": 0.00033193743071806877, "loss": 7.6116, "step": 2734 }, { "epoch": 0.336656819300837, "grad_norm": 0.22409571707248688, "learning_rate": 0.00033187584677916, "loss": 8.073, "step": 2735 }, { "epoch": 0.33677991137370755, "grad_norm": 0.2608230412006378, "learning_rate": 0.0003318142628402513, "loss": 8.0673, "step": 2736 }, { "epoch": 0.33690300344657803, "grad_norm": 0.11053868383169174, "learning_rate": 0.0003317526789013425, "loss": 7.3869, "step": 2737 }, { "epoch": 0.33702609551944857, "grad_norm": 0.0814969465136528, "learning_rate": 0.0003316910949624338, "loss": 7.4279, "step": 2738 }, { "epoch": 0.33714918759231904, "grad_norm": 0.17824843525886536, "learning_rate": 0.00033162951102352504, "loss": 8.435, "step": 2739 }, { "epoch": 0.3372722796651896, "grad_norm": 0.1316239982843399, "learning_rate": 0.0003315679270846164, "loss": 7.7117, "step": 2740 }, { "epoch": 0.33739537173806006, "grad_norm": 0.17109271883964539, "learning_rate": 0.0003315063431457076, "loss": 7.9679, "step": 2741 }, { "epoch": 0.3375184638109306, "grad_norm": 0.28248801827430725, "learning_rate": 0.0003314447592067989, "loss": 7.3418, "step": 2742 }, { "epoch": 0.33764155588380107, "grad_norm": 0.10477838665246964, "learning_rate": 0.00033138317526789013, "loss": 7.6208, "step": 2743 }, { "epoch": 0.3377646479566716, "grad_norm": 0.41585686802864075, "learning_rate": 0.0003313215913289814, "loss": 9.1842, "step": 2744 }, { "epoch": 0.3378877400295421, "grad_norm": 0.09890877455472946, "learning_rate": 0.00033126000739007265, "loss": 7.4251, "step": 2745 }, { "epoch": 0.3380108321024126, "grad_norm": 0.15388411283493042, "learning_rate": 0.00033119842345116394, "loss": 7.8298, "step": 2746 }, { "epoch": 0.3381339241752831, "grad_norm": 0.12090792506933212, "learning_rate": 0.0003311368395122552, "loss": 7.3672, "step": 2747 }, { "epoch": 0.33825701624815363, "grad_norm": 0.0788734182715416, "learning_rate": 0.0003310752555733465, "loss": 7.4297, 
"step": 2748 }, { "epoch": 0.3383801083210241, "grad_norm": 0.10267747193574905, "learning_rate": 0.00033101367163443775, "loss": 7.7085, "step": 2749 }, { "epoch": 0.33850320039389464, "grad_norm": 0.08704827725887299, "learning_rate": 0.00033095208769552903, "loss": 7.7769, "step": 2750 }, { "epoch": 0.3386262924667651, "grad_norm": 0.10654357075691223, "learning_rate": 0.00033089050375662027, "loss": 7.6184, "step": 2751 }, { "epoch": 0.33874938453963566, "grad_norm": 0.12695090472698212, "learning_rate": 0.00033082891981771155, "loss": 7.5738, "step": 2752 }, { "epoch": 0.33887247661250613, "grad_norm": 0.1668541580438614, "learning_rate": 0.0003307673358788028, "loss": 7.4279, "step": 2753 }, { "epoch": 0.33899556868537667, "grad_norm": 0.07737462967634201, "learning_rate": 0.0003307057519398941, "loss": 7.5525, "step": 2754 }, { "epoch": 0.33911866075824715, "grad_norm": 0.12614668905735016, "learning_rate": 0.00033064416800098536, "loss": 7.8628, "step": 2755 }, { "epoch": 0.3392417528311177, "grad_norm": 0.10682643204927444, "learning_rate": 0.00033058258406207664, "loss": 7.5875, "step": 2756 }, { "epoch": 0.33936484490398816, "grad_norm": 0.14026300609111786, "learning_rate": 0.0003305210001231679, "loss": 7.689, "step": 2757 }, { "epoch": 0.3394879369768587, "grad_norm": 0.11626700311899185, "learning_rate": 0.00033045941618425916, "loss": 7.4205, "step": 2758 }, { "epoch": 0.3396110290497292, "grad_norm": 0.5003122687339783, "learning_rate": 0.0003303978322453504, "loss": 9.506, "step": 2759 }, { "epoch": 0.3397341211225997, "grad_norm": 0.2085382342338562, "learning_rate": 0.00033033624830644174, "loss": 7.4691, "step": 2760 }, { "epoch": 0.3398572131954702, "grad_norm": 0.2102663218975067, "learning_rate": 0.00033027466436753297, "loss": 8.4358, "step": 2761 }, { "epoch": 0.3399803052683407, "grad_norm": 0.25543519854545593, "learning_rate": 0.00033021308042862425, "loss": 7.6295, "step": 2762 }, { "epoch": 0.3401033973412112, "grad_norm": 
0.28202247619628906, "learning_rate": 0.0003301514964897155, "loss": 7.4736, "step": 2763 }, { "epoch": 0.34022648941408173, "grad_norm": 0.15398164093494415, "learning_rate": 0.0003300899125508068, "loss": 7.488, "step": 2764 }, { "epoch": 0.3403495814869522, "grad_norm": 0.14408744871616364, "learning_rate": 0.000330028328611898, "loss": 7.2927, "step": 2765 }, { "epoch": 0.34047267355982275, "grad_norm": 0.287306010723114, "learning_rate": 0.0003299667446729893, "loss": 8.1552, "step": 2766 }, { "epoch": 0.3405957656326933, "grad_norm": 0.22004728019237518, "learning_rate": 0.0003299051607340806, "loss": 7.1688, "step": 2767 }, { "epoch": 0.34071885770556376, "grad_norm": 0.31591930985450745, "learning_rate": 0.00032984357679517187, "loss": 7.548, "step": 2768 }, { "epoch": 0.3408419497784343, "grad_norm": 0.25850608944892883, "learning_rate": 0.0003297819928562631, "loss": 8.071, "step": 2769 }, { "epoch": 0.3409650418513048, "grad_norm": 0.402076780796051, "learning_rate": 0.0003297204089173544, "loss": 8.829, "step": 2770 }, { "epoch": 0.3410881339241753, "grad_norm": 0.3543354868888855, "learning_rate": 0.0003296588249784456, "loss": 8.7775, "step": 2771 }, { "epoch": 0.3412112259970458, "grad_norm": 0.22399024665355682, "learning_rate": 0.0003295972410395369, "loss": 7.7772, "step": 2772 }, { "epoch": 0.3413343180699163, "grad_norm": 0.20220239460468292, "learning_rate": 0.00032953565710062814, "loss": 8.2169, "step": 2773 }, { "epoch": 0.3414574101427868, "grad_norm": 0.29210761189460754, "learning_rate": 0.0003294740731617195, "loss": 7.9641, "step": 2774 }, { "epoch": 0.34158050221565733, "grad_norm": 0.40803012251853943, "learning_rate": 0.0003294124892228107, "loss": 7.515, "step": 2775 }, { "epoch": 0.3417035942885278, "grad_norm": 0.27437344193458557, "learning_rate": 0.000329350905283902, "loss": 7.7386, "step": 2776 }, { "epoch": 0.34182668636139835, "grad_norm": 0.3242143392562866, "learning_rate": 0.00032928932134499323, "loss": 7.1618, "step": 
2777 }, { "epoch": 0.3419497784342688, "grad_norm": 0.09974256157875061, "learning_rate": 0.0003292277374060845, "loss": 7.3307, "step": 2778 }, { "epoch": 0.34207287050713936, "grad_norm": 0.8155328035354614, "learning_rate": 0.00032916615346717575, "loss": 6.261, "step": 2779 }, { "epoch": 0.34219596258000984, "grad_norm": 0.6126415133476257, "learning_rate": 0.0003291045695282671, "loss": 8.3674, "step": 2780 }, { "epoch": 0.3423190546528804, "grad_norm": 0.642961859703064, "learning_rate": 0.0003290429855893583, "loss": 7.9459, "step": 2781 }, { "epoch": 0.34244214672575085, "grad_norm": 0.5412023663520813, "learning_rate": 0.0003289814016504496, "loss": 7.5947, "step": 2782 }, { "epoch": 0.3425652387986214, "grad_norm": 0.49275025725364685, "learning_rate": 0.00032891981771154084, "loss": 7.5963, "step": 2783 }, { "epoch": 0.34268833087149186, "grad_norm": 0.33211076259613037, "learning_rate": 0.0003288582337726321, "loss": 7.3308, "step": 2784 }, { "epoch": 0.3428114229443624, "grad_norm": 0.1314348429441452, "learning_rate": 0.00032879664983372336, "loss": 7.4746, "step": 2785 }, { "epoch": 0.3429345150172329, "grad_norm": 0.3881629407405853, "learning_rate": 0.00032873506589481465, "loss": 8.9208, "step": 2786 }, { "epoch": 0.3430576070901034, "grad_norm": 0.1525052934885025, "learning_rate": 0.00032867348195590593, "loss": 8.5113, "step": 2787 }, { "epoch": 0.3431806991629739, "grad_norm": 0.3230684995651245, "learning_rate": 0.0003286118980169972, "loss": 7.8228, "step": 2788 }, { "epoch": 0.3433037912358444, "grad_norm": 0.5908077955245972, "learning_rate": 0.00032855031407808845, "loss": 7.5821, "step": 2789 }, { "epoch": 0.3434268833087149, "grad_norm": 0.47249773144721985, "learning_rate": 0.0003284887301391797, "loss": 8.0272, "step": 2790 }, { "epoch": 0.34354997538158544, "grad_norm": 0.2919716238975525, "learning_rate": 0.00032842714620027097, "loss": 8.4289, "step": 2791 }, { "epoch": 0.3436730674544559, "grad_norm": 0.43862515687942505, 
"learning_rate": 0.0003283655622613622, "loss": 7.5697, "step": 2792 }, { "epoch": 0.34379615952732645, "grad_norm": 0.295474648475647, "learning_rate": 0.0003283039783224535, "loss": 7.7485, "step": 2793 }, { "epoch": 0.34391925160019693, "grad_norm": 0.23609915375709534, "learning_rate": 0.0003282423943835448, "loss": 7.3987, "step": 2794 }, { "epoch": 0.34404234367306746, "grad_norm": 0.0893242359161377, "learning_rate": 0.00032818081044463606, "loss": 7.707, "step": 2795 }, { "epoch": 0.34416543574593794, "grad_norm": 0.16855616867542267, "learning_rate": 0.0003281192265057273, "loss": 7.3934, "step": 2796 }, { "epoch": 0.3442885278188085, "grad_norm": 0.472986102104187, "learning_rate": 0.0003280576425668186, "loss": 8.4441, "step": 2797 }, { "epoch": 0.34441161989167896, "grad_norm": 0.27244487404823303, "learning_rate": 0.0003279960586279098, "loss": 7.5053, "step": 2798 }, { "epoch": 0.3445347119645495, "grad_norm": 0.29272130131721497, "learning_rate": 0.0003279344746890011, "loss": 7.6905, "step": 2799 }, { "epoch": 0.34465780403741997, "grad_norm": 0.18529224395751953, "learning_rate": 0.00032787289075009233, "loss": 7.52, "step": 2800 }, { "epoch": 0.3447808961102905, "grad_norm": 0.28144344687461853, "learning_rate": 0.0003278113068111837, "loss": 8.6569, "step": 2801 }, { "epoch": 0.344903988183161, "grad_norm": 0.13179773092269897, "learning_rate": 0.0003277497228722749, "loss": 7.8641, "step": 2802 }, { "epoch": 0.3450270802560315, "grad_norm": 0.36224666237831116, "learning_rate": 0.0003276881389333662, "loss": 7.4169, "step": 2803 }, { "epoch": 0.345150172328902, "grad_norm": 0.22730553150177002, "learning_rate": 0.0003276265549944574, "loss": 7.8511, "step": 2804 }, { "epoch": 0.34527326440177253, "grad_norm": 0.2954975962638855, "learning_rate": 0.0003275649710555487, "loss": 7.5932, "step": 2805 }, { "epoch": 0.345396356474643, "grad_norm": 0.12126521021127701, "learning_rate": 0.00032750338711663994, "loss": 8.2114, "step": 2806 }, { "epoch": 
0.34551944854751354, "grad_norm": 0.09290904551744461, "learning_rate": 0.00032744180317773123, "loss": 7.7434, "step": 2807 }, { "epoch": 0.345642540620384, "grad_norm": 0.09865221381187439, "learning_rate": 0.0003273802192388225, "loss": 8.0839, "step": 2808 }, { "epoch": 0.34576563269325455, "grad_norm": 0.1789507120847702, "learning_rate": 0.0003273186352999138, "loss": 7.6685, "step": 2809 }, { "epoch": 0.3458887247661251, "grad_norm": 0.09031849354505539, "learning_rate": 0.00032725705136100504, "loss": 7.4436, "step": 2810 }, { "epoch": 0.34601181683899557, "grad_norm": 0.11010771989822388, "learning_rate": 0.0003271954674220963, "loss": 7.5965, "step": 2811 }, { "epoch": 0.3461349089118661, "grad_norm": 0.11894038319587708, "learning_rate": 0.00032713388348318755, "loss": 7.6704, "step": 2812 }, { "epoch": 0.3462580009847366, "grad_norm": 0.11073404550552368, "learning_rate": 0.00032707229954427884, "loss": 7.4056, "step": 2813 }, { "epoch": 0.3463810930576071, "grad_norm": 0.15290617942810059, "learning_rate": 0.00032701071560537013, "loss": 7.6622, "step": 2814 }, { "epoch": 0.3465041851304776, "grad_norm": 0.14761412143707275, "learning_rate": 0.0003269491316664614, "loss": 7.7035, "step": 2815 }, { "epoch": 0.3466272772033481, "grad_norm": 0.0982138067483902, "learning_rate": 0.00032688754772755265, "loss": 7.548, "step": 2816 }, { "epoch": 0.3467503692762186, "grad_norm": 0.11527995765209198, "learning_rate": 0.00032682596378864393, "loss": 7.3568, "step": 2817 }, { "epoch": 0.34687346134908914, "grad_norm": 0.11325716227293015, "learning_rate": 0.00032676437984973517, "loss": 7.7915, "step": 2818 }, { "epoch": 0.3469965534219596, "grad_norm": 0.08215943723917007, "learning_rate": 0.00032670279591082645, "loss": 7.3331, "step": 2819 }, { "epoch": 0.34711964549483015, "grad_norm": 0.25868159532546997, "learning_rate": 0.0003266412119719177, "loss": 8.1908, "step": 2820 }, { "epoch": 0.34724273756770063, "grad_norm": 0.2069188803434372, "learning_rate": 
0.000326579628033009, "loss": 8.1562, "step": 2821 }, { "epoch": 0.34736582964057117, "grad_norm": 0.1631939858198166, "learning_rate": 0.00032651804409410026, "loss": 7.1838, "step": 2822 }, { "epoch": 0.34748892171344165, "grad_norm": 0.17542891204357147, "learning_rate": 0.00032645646015519154, "loss": 7.2062, "step": 2823 }, { "epoch": 0.3476120137863122, "grad_norm": 0.3300777077674866, "learning_rate": 0.0003263948762162828, "loss": 9.0769, "step": 2824 }, { "epoch": 0.34773510585918266, "grad_norm": 0.07599890232086182, "learning_rate": 0.00032633329227737406, "loss": 7.5587, "step": 2825 }, { "epoch": 0.3478581979320532, "grad_norm": 0.10020644962787628, "learning_rate": 0.0003262717083384653, "loss": 7.597, "step": 2826 }, { "epoch": 0.34798129000492367, "grad_norm": 0.08632486313581467, "learning_rate": 0.0003262101243995566, "loss": 7.4099, "step": 2827 }, { "epoch": 0.3481043820777942, "grad_norm": 0.08708427846431732, "learning_rate": 0.00032614854046064787, "loss": 7.4437, "step": 2828 }, { "epoch": 0.3482274741506647, "grad_norm": 0.1255902647972107, "learning_rate": 0.00032608695652173916, "loss": 7.2776, "step": 2829 }, { "epoch": 0.3483505662235352, "grad_norm": 0.15658673644065857, "learning_rate": 0.0003260253725828304, "loss": 7.9634, "step": 2830 }, { "epoch": 0.3484736582964057, "grad_norm": 0.08015577495098114, "learning_rate": 0.0003259637886439217, "loss": 7.2325, "step": 2831 }, { "epoch": 0.34859675036927623, "grad_norm": 0.2834670841693878, "learning_rate": 0.0003259022047050129, "loss": 8.9424, "step": 2832 }, { "epoch": 0.3487198424421467, "grad_norm": 0.16713911294937134, "learning_rate": 0.0003258406207661042, "loss": 7.2224, "step": 2833 }, { "epoch": 0.34884293451501724, "grad_norm": 0.35579854249954224, "learning_rate": 0.0003257790368271954, "loss": 9.0246, "step": 2834 }, { "epoch": 0.3489660265878877, "grad_norm": 0.14870107173919678, "learning_rate": 0.00032571745288828677, "loss": 7.2648, "step": 2835 }, { "epoch": 
0.34908911866075826, "grad_norm": 0.10805274546146393, "learning_rate": 0.000325655868949378, "loss": 7.6949, "step": 2836 }, { "epoch": 0.34921221073362874, "grad_norm": 0.10281337052583694, "learning_rate": 0.0003255942850104693, "loss": 7.7798, "step": 2837 }, { "epoch": 0.34933530280649927, "grad_norm": 0.09386008977890015, "learning_rate": 0.0003255327010715605, "loss": 7.5354, "step": 2838 }, { "epoch": 0.34945839487936975, "grad_norm": 0.3582671880722046, "learning_rate": 0.0003254711171326518, "loss": 8.6305, "step": 2839 }, { "epoch": 0.3495814869522403, "grad_norm": 0.16880832612514496, "learning_rate": 0.00032540953319374304, "loss": 8.6468, "step": 2840 }, { "epoch": 0.34970457902511076, "grad_norm": 0.15082880854606628, "learning_rate": 0.0003253479492548344, "loss": 7.5556, "step": 2841 }, { "epoch": 0.3498276710979813, "grad_norm": 0.10479459166526794, "learning_rate": 0.0003252863653159256, "loss": 7.8921, "step": 2842 }, { "epoch": 0.3499507631708518, "grad_norm": 0.14518888294696808, "learning_rate": 0.0003252247813770169, "loss": 7.473, "step": 2843 }, { "epoch": 0.3500738552437223, "grad_norm": 0.09162253886461258, "learning_rate": 0.00032516319743810813, "loss": 7.4783, "step": 2844 }, { "epoch": 0.3501969473165928, "grad_norm": 0.13488595187664032, "learning_rate": 0.0003251016134991994, "loss": 7.4387, "step": 2845 }, { "epoch": 0.3503200393894633, "grad_norm": 0.13387782871723175, "learning_rate": 0.00032504002956029065, "loss": 7.8608, "step": 2846 }, { "epoch": 0.3504431314623338, "grad_norm": 0.10713931918144226, "learning_rate": 0.00032497844562138194, "loss": 7.5756, "step": 2847 }, { "epoch": 0.35056622353520434, "grad_norm": 0.13621419668197632, "learning_rate": 0.0003249168616824732, "loss": 7.7331, "step": 2848 }, { "epoch": 0.3506893156080748, "grad_norm": 0.0853276476264, "learning_rate": 0.0003248552777435645, "loss": 7.4005, "step": 2849 }, { "epoch": 0.35081240768094535, "grad_norm": 0.3514036536216736, "learning_rate": 
0.00032479369380465574, "loss": 8.7638, "step": 2850 }, { "epoch": 0.3509354997538158, "grad_norm": 0.16340668499469757, "learning_rate": 0.00032473210986574703, "loss": 7.5451, "step": 2851 }, { "epoch": 0.35105859182668636, "grad_norm": 0.2820090651512146, "learning_rate": 0.00032467052592683826, "loss": 7.7577, "step": 2852 }, { "epoch": 0.3511816838995569, "grad_norm": 0.09007968008518219, "learning_rate": 0.00032460894198792955, "loss": 7.7824, "step": 2853 }, { "epoch": 0.3513047759724274, "grad_norm": 0.10910873115062714, "learning_rate": 0.0003245473580490208, "loss": 7.9195, "step": 2854 }, { "epoch": 0.3514278680452979, "grad_norm": 0.12198685854673386, "learning_rate": 0.0003244857741101121, "loss": 8.1655, "step": 2855 }, { "epoch": 0.3515509601181684, "grad_norm": 0.10815341770648956, "learning_rate": 0.00032442419017120335, "loss": 7.4406, "step": 2856 }, { "epoch": 0.3516740521910389, "grad_norm": 0.10623954236507416, "learning_rate": 0.00032436260623229464, "loss": 7.545, "step": 2857 }, { "epoch": 0.3517971442639094, "grad_norm": 0.1122690811753273, "learning_rate": 0.00032430102229338587, "loss": 7.4227, "step": 2858 }, { "epoch": 0.35192023633677993, "grad_norm": 0.23171576857566833, "learning_rate": 0.00032423943835447716, "loss": 8.0159, "step": 2859 }, { "epoch": 0.3520433284096504, "grad_norm": 0.17698010802268982, "learning_rate": 0.0003241778544155684, "loss": 7.4905, "step": 2860 }, { "epoch": 0.35216642048252095, "grad_norm": 0.12277734279632568, "learning_rate": 0.0003241162704766597, "loss": 7.3585, "step": 2861 }, { "epoch": 0.3522895125553914, "grad_norm": 0.08764095604419708, "learning_rate": 0.00032405468653775096, "loss": 7.4544, "step": 2862 }, { "epoch": 0.35241260462826196, "grad_norm": 0.09797696024179459, "learning_rate": 0.00032399310259884225, "loss": 7.4378, "step": 2863 }, { "epoch": 0.35253569670113244, "grad_norm": 0.2106381058692932, "learning_rate": 0.0003239315186599335, "loss": 7.9731, "step": 2864 }, { "epoch": 
0.352658788774003, "grad_norm": 0.098398856818676, "learning_rate": 0.00032386993472102477, "loss": 7.7006, "step": 2865 }, { "epoch": 0.35278188084687345, "grad_norm": 0.08410917967557907, "learning_rate": 0.000323808350782116, "loss": 7.9754, "step": 2866 }, { "epoch": 0.352904972919744, "grad_norm": 0.1950797736644745, "learning_rate": 0.0003237467668432073, "loss": 7.4545, "step": 2867 }, { "epoch": 0.35302806499261447, "grad_norm": 0.3902917504310608, "learning_rate": 0.0003236851829042986, "loss": 7.1618, "step": 2868 }, { "epoch": 0.353151157065485, "grad_norm": 0.13591597974300385, "learning_rate": 0.00032362359896538986, "loss": 7.557, "step": 2869 }, { "epoch": 0.3532742491383555, "grad_norm": 0.21071134507656097, "learning_rate": 0.0003235620150264811, "loss": 7.1518, "step": 2870 }, { "epoch": 0.353397341211226, "grad_norm": 0.19601689279079437, "learning_rate": 0.0003235004310875724, "loss": 8.1554, "step": 2871 }, { "epoch": 0.3535204332840965, "grad_norm": 0.14963923394680023, "learning_rate": 0.0003234388471486636, "loss": 7.4658, "step": 2872 }, { "epoch": 0.353643525356967, "grad_norm": 0.2235194593667984, "learning_rate": 0.0003233772632097549, "loss": 7.809, "step": 2873 }, { "epoch": 0.3537666174298375, "grad_norm": 0.11138929426670074, "learning_rate": 0.00032331567927084613, "loss": 7.4982, "step": 2874 }, { "epoch": 0.35388970950270804, "grad_norm": 0.2531939148902893, "learning_rate": 0.00032325409533193747, "loss": 8.3636, "step": 2875 }, { "epoch": 0.3540128015755785, "grad_norm": 0.10386261343955994, "learning_rate": 0.0003231925113930287, "loss": 7.9801, "step": 2876 }, { "epoch": 0.35413589364844905, "grad_norm": 0.21503183245658875, "learning_rate": 0.00032313092745412, "loss": 7.5393, "step": 2877 }, { "epoch": 0.35425898572131953, "grad_norm": 0.1982581913471222, "learning_rate": 0.0003230693435152112, "loss": 7.7249, "step": 2878 }, { "epoch": 0.35438207779419006, "grad_norm": 0.16964054107666016, "learning_rate": 
0.0003230077595763025, "loss": 7.672, "step": 2879 }, { "epoch": 0.35450516986706054, "grad_norm": 0.0842280387878418, "learning_rate": 0.00032294617563739374, "loss": 7.6775, "step": 2880 }, { "epoch": 0.3546282619399311, "grad_norm": 0.09559839963912964, "learning_rate": 0.00032288459169848503, "loss": 7.9774, "step": 2881 }, { "epoch": 0.35475135401280156, "grad_norm": 0.3197278380393982, "learning_rate": 0.0003228230077595763, "loss": 8.9979, "step": 2882 }, { "epoch": 0.3548744460856721, "grad_norm": 0.15254324674606323, "learning_rate": 0.0003227614238206676, "loss": 7.6044, "step": 2883 }, { "epoch": 0.35499753815854257, "grad_norm": 0.19959890842437744, "learning_rate": 0.00032269983988175883, "loss": 8.3687, "step": 2884 }, { "epoch": 0.3551206302314131, "grad_norm": 0.12137063592672348, "learning_rate": 0.0003226382559428501, "loss": 7.2165, "step": 2885 }, { "epoch": 0.3552437223042836, "grad_norm": 0.09268765896558762, "learning_rate": 0.00032257667200394135, "loss": 7.2079, "step": 2886 }, { "epoch": 0.3553668143771541, "grad_norm": 0.18852822482585907, "learning_rate": 0.00032251508806503264, "loss": 7.8231, "step": 2887 }, { "epoch": 0.3554899064500246, "grad_norm": 0.14935073256492615, "learning_rate": 0.00032245350412612387, "loss": 7.3608, "step": 2888 }, { "epoch": 0.35561299852289513, "grad_norm": 0.08441067487001419, "learning_rate": 0.0003223919201872152, "loss": 7.5018, "step": 2889 }, { "epoch": 0.3557360905957656, "grad_norm": 0.15998555719852448, "learning_rate": 0.00032233033624830645, "loss": 8.5381, "step": 2890 }, { "epoch": 0.35585918266863614, "grad_norm": 0.17686648666858673, "learning_rate": 0.00032226875230939773, "loss": 7.224, "step": 2891 }, { "epoch": 0.3559822747415066, "grad_norm": 0.09286241978406906, "learning_rate": 0.00032220716837048896, "loss": 7.7468, "step": 2892 }, { "epoch": 0.35610536681437716, "grad_norm": 0.10507575422525406, "learning_rate": 0.00032214558443158025, "loss": 7.6712, "step": 2893 }, { "epoch": 
0.35622845888724763, "grad_norm": 0.08165208995342255, "learning_rate": 0.0003220840004926715, "loss": 7.7874, "step": 2894 }, { "epoch": 0.35635155096011817, "grad_norm": 0.12412963062524796, "learning_rate": 0.0003220224165537628, "loss": 7.3546, "step": 2895 }, { "epoch": 0.3564746430329887, "grad_norm": 0.0956360474228859, "learning_rate": 0.00032196083261485406, "loss": 7.6422, "step": 2896 }, { "epoch": 0.3565977351058592, "grad_norm": 0.12796829640865326, "learning_rate": 0.00032189924867594534, "loss": 7.3811, "step": 2897 }, { "epoch": 0.3567208271787297, "grad_norm": 0.12215489894151688, "learning_rate": 0.0003218376647370366, "loss": 7.7046, "step": 2898 }, { "epoch": 0.3568439192516002, "grad_norm": 0.0735694095492363, "learning_rate": 0.00032177608079812786, "loss": 7.5781, "step": 2899 }, { "epoch": 0.35696701132447073, "grad_norm": 0.0852191150188446, "learning_rate": 0.0003217144968592191, "loss": 7.8274, "step": 2900 }, { "epoch": 0.3570901033973412, "grad_norm": 0.11913944780826569, "learning_rate": 0.0003216529129203104, "loss": 7.9819, "step": 2901 }, { "epoch": 0.35721319547021174, "grad_norm": 0.12155035138130188, "learning_rate": 0.00032159132898140167, "loss": 7.5931, "step": 2902 }, { "epoch": 0.3573362875430822, "grad_norm": 0.17134001851081848, "learning_rate": 0.00032152974504249295, "loss": 7.5421, "step": 2903 }, { "epoch": 0.35745937961595275, "grad_norm": 0.11244748532772064, "learning_rate": 0.0003214681611035842, "loss": 7.6876, "step": 2904 }, { "epoch": 0.35758247168882323, "grad_norm": 0.10674197971820831, "learning_rate": 0.0003214065771646755, "loss": 7.5863, "step": 2905 }, { "epoch": 0.35770556376169377, "grad_norm": 0.07402997463941574, "learning_rate": 0.0003213449932257667, "loss": 7.4543, "step": 2906 }, { "epoch": 0.35782865583456425, "grad_norm": 0.14497995376586914, "learning_rate": 0.000321283409286858, "loss": 7.6872, "step": 2907 }, { "epoch": 0.3579517479074348, "grad_norm": 0.10328619927167892, "learning_rate": 
0.0003212218253479492, "loss": 7.0788, "step": 2908 }, { "epoch": 0.35807483998030526, "grad_norm": 0.14821653068065643, "learning_rate": 0.00032116024140904057, "loss": 7.5332, "step": 2909 }, { "epoch": 0.3581979320531758, "grad_norm": 0.1494164764881134, "learning_rate": 0.0003210986574701318, "loss": 7.7277, "step": 2910 }, { "epoch": 0.3583210241260463, "grad_norm": 0.12989398837089539, "learning_rate": 0.0003210370735312231, "loss": 7.695, "step": 2911 }, { "epoch": 0.3584441161989168, "grad_norm": 0.1335039585828781, "learning_rate": 0.0003209754895923143, "loss": 7.7907, "step": 2912 }, { "epoch": 0.3585672082717873, "grad_norm": 0.16845703125, "learning_rate": 0.0003209139056534056, "loss": 7.3383, "step": 2913 }, { "epoch": 0.3586903003446578, "grad_norm": 0.17404206097126007, "learning_rate": 0.00032085232171449684, "loss": 7.4641, "step": 2914 }, { "epoch": 0.3588133924175283, "grad_norm": 0.1012432873249054, "learning_rate": 0.0003207907377755881, "loss": 8.1668, "step": 2915 }, { "epoch": 0.35893648449039883, "grad_norm": 0.10999646782875061, "learning_rate": 0.0003207291538366794, "loss": 7.7949, "step": 2916 }, { "epoch": 0.3590595765632693, "grad_norm": 0.16159872710704803, "learning_rate": 0.0003206675698977707, "loss": 7.4978, "step": 2917 }, { "epoch": 0.35918266863613985, "grad_norm": 0.13183583319187164, "learning_rate": 0.00032060598595886193, "loss": 7.482, "step": 2918 }, { "epoch": 0.3593057607090103, "grad_norm": 0.15093949437141418, "learning_rate": 0.0003205444020199532, "loss": 7.8384, "step": 2919 }, { "epoch": 0.35942885278188086, "grad_norm": 0.1145210936665535, "learning_rate": 0.00032048281808104445, "loss": 7.5214, "step": 2920 }, { "epoch": 0.35955194485475134, "grad_norm": 0.1856650859117508, "learning_rate": 0.00032042123414213573, "loss": 7.6595, "step": 2921 }, { "epoch": 0.35967503692762187, "grad_norm": 0.23507317900657654, "learning_rate": 0.00032035965020322697, "loss": 7.7075, "step": 2922 }, { "epoch": 
0.35979812900049235, "grad_norm": 0.14126235246658325, "learning_rate": 0.0003202980662643183, "loss": 7.5692, "step": 2923 }, { "epoch": 0.3599212210733629, "grad_norm": 0.07959362864494324, "learning_rate": 0.00032023648232540954, "loss": 7.3727, "step": 2924 }, { "epoch": 0.36004431314623336, "grad_norm": 0.20846153795719147, "learning_rate": 0.0003201748983865008, "loss": 7.3699, "step": 2925 }, { "epoch": 0.3601674052191039, "grad_norm": 0.11072197556495667, "learning_rate": 0.00032011331444759206, "loss": 7.8173, "step": 2926 }, { "epoch": 0.3602904972919744, "grad_norm": 0.22289779782295227, "learning_rate": 0.00032005173050868335, "loss": 7.2909, "step": 2927 }, { "epoch": 0.3604135893648449, "grad_norm": 0.1740623414516449, "learning_rate": 0.0003199901465697746, "loss": 7.3236, "step": 2928 }, { "epoch": 0.3605366814377154, "grad_norm": 0.06088363379240036, "learning_rate": 0.0003199285626308659, "loss": 7.5674, "step": 2929 }, { "epoch": 0.3606597735105859, "grad_norm": 0.1343425214290619, "learning_rate": 0.00031986697869195715, "loss": 7.5114, "step": 2930 }, { "epoch": 0.3607828655834564, "grad_norm": 0.19672058522701263, "learning_rate": 0.00031980539475304844, "loss": 7.6673, "step": 2931 }, { "epoch": 0.36090595765632694, "grad_norm": 0.12493109703063965, "learning_rate": 0.00031974381081413967, "loss": 7.4061, "step": 2932 }, { "epoch": 0.3610290497291974, "grad_norm": 0.17389655113220215, "learning_rate": 0.00031968222687523096, "loss": 7.677, "step": 2933 }, { "epoch": 0.36115214180206795, "grad_norm": 0.357427716255188, "learning_rate": 0.0003196206429363222, "loss": 8.8479, "step": 2934 }, { "epoch": 0.36127523387493843, "grad_norm": 0.2936326563358307, "learning_rate": 0.0003195590589974135, "loss": 7.15, "step": 2935 }, { "epoch": 0.36139832594780896, "grad_norm": 0.17035971581935883, "learning_rate": 0.00031949747505850476, "loss": 7.624, "step": 2936 }, { "epoch": 0.36152141802067944, "grad_norm": 0.26491841673851013, "learning_rate": 
0.00031943589111959605, "loss": 7.3555, "step": 2937 }, { "epoch": 0.36164451009355, "grad_norm": 0.09408543258905411, "learning_rate": 0.0003193743071806873, "loss": 8.3653, "step": 2938 }, { "epoch": 0.3617676021664205, "grad_norm": 0.21532629430294037, "learning_rate": 0.00031931272324177857, "loss": 7.5582, "step": 2939 }, { "epoch": 0.361890694239291, "grad_norm": 0.16448529064655304, "learning_rate": 0.0003192511393028698, "loss": 7.1724, "step": 2940 }, { "epoch": 0.3620137863121615, "grad_norm": 0.14992299675941467, "learning_rate": 0.0003191895553639611, "loss": 7.6258, "step": 2941 }, { "epoch": 0.362136878385032, "grad_norm": 0.1562049388885498, "learning_rate": 0.0003191279714250523, "loss": 8.0472, "step": 2942 }, { "epoch": 0.36225997045790254, "grad_norm": 0.19090138375759125, "learning_rate": 0.00031906638748614366, "loss": 7.9924, "step": 2943 }, { "epoch": 0.362383062530773, "grad_norm": 0.4903116524219513, "learning_rate": 0.0003190048035472349, "loss": 8.8278, "step": 2944 }, { "epoch": 0.36250615460364355, "grad_norm": 0.11475177854299545, "learning_rate": 0.0003189432196083262, "loss": 7.6715, "step": 2945 }, { "epoch": 0.362629246676514, "grad_norm": 0.07342535257339478, "learning_rate": 0.0003188816356694174, "loss": 7.6882, "step": 2946 }, { "epoch": 0.36275233874938456, "grad_norm": 0.12063624709844589, "learning_rate": 0.0003188200517305087, "loss": 7.5183, "step": 2947 }, { "epoch": 0.36287543082225504, "grad_norm": 0.11796218156814575, "learning_rate": 0.00031875846779159993, "loss": 7.654, "step": 2948 }, { "epoch": 0.3629985228951256, "grad_norm": 0.16574479639530182, "learning_rate": 0.0003186968838526912, "loss": 7.2181, "step": 2949 }, { "epoch": 0.36312161496799605, "grad_norm": 0.341294527053833, "learning_rate": 0.0003186352999137825, "loss": 8.9682, "step": 2950 }, { "epoch": 0.3632447070408666, "grad_norm": 0.10324422270059586, "learning_rate": 0.0003185737159748738, "loss": 7.8049, "step": 2951 }, { "epoch": 
0.36336779911373707, "grad_norm": 0.08110009133815765, "learning_rate": 0.000318512132035965, "loss": 7.4128, "step": 2952 }, { "epoch": 0.3634908911866076, "grad_norm": 0.07070150226354599, "learning_rate": 0.0003184505480970563, "loss": 7.4228, "step": 2953 }, { "epoch": 0.3636139832594781, "grad_norm": 0.35363686084747314, "learning_rate": 0.00031838896415814754, "loss": 9.0132, "step": 2954 }, { "epoch": 0.3637370753323486, "grad_norm": 0.16913045942783356, "learning_rate": 0.00031832738021923883, "loss": 7.7627, "step": 2955 }, { "epoch": 0.3638601674052191, "grad_norm": 0.12921875715255737, "learning_rate": 0.0003182657962803301, "loss": 7.6119, "step": 2956 }, { "epoch": 0.3639832594780896, "grad_norm": 0.13154685497283936, "learning_rate": 0.0003182042123414214, "loss": 7.9352, "step": 2957 }, { "epoch": 0.3641063515509601, "grad_norm": 0.10174552351236343, "learning_rate": 0.00031814262840251263, "loss": 8.1418, "step": 2958 }, { "epoch": 0.36422944362383064, "grad_norm": 0.20478898286819458, "learning_rate": 0.0003180810444636039, "loss": 7.4753, "step": 2959 }, { "epoch": 0.3643525356967011, "grad_norm": 0.20187979936599731, "learning_rate": 0.00031801946052469515, "loss": 7.985, "step": 2960 }, { "epoch": 0.36447562776957165, "grad_norm": 0.18163952231407166, "learning_rate": 0.00031795787658578644, "loss": 7.5504, "step": 2961 }, { "epoch": 0.36459871984244213, "grad_norm": 0.13138779997825623, "learning_rate": 0.00031789629264687767, "loss": 7.6688, "step": 2962 }, { "epoch": 0.36472181191531267, "grad_norm": 0.12132485210895538, "learning_rate": 0.000317834708707969, "loss": 7.4554, "step": 2963 }, { "epoch": 0.36484490398818314, "grad_norm": 0.17337757349014282, "learning_rate": 0.00031777312476906024, "loss": 7.6663, "step": 2964 }, { "epoch": 0.3649679960610537, "grad_norm": 0.2038554698228836, "learning_rate": 0.00031771154083015153, "loss": 7.7736, "step": 2965 }, { "epoch": 0.36509108813392416, "grad_norm": 0.1744230091571808, "learning_rate": 
0.00031764995689124276, "loss": 7.6886, "step": 2966 }, { "epoch": 0.3652141802067947, "grad_norm": 0.12081573158502579, "learning_rate": 0.00031758837295233405, "loss": 7.4751, "step": 2967 }, { "epoch": 0.36533727227966517, "grad_norm": 0.16929510235786438, "learning_rate": 0.0003175267890134253, "loss": 7.8087, "step": 2968 }, { "epoch": 0.3654603643525357, "grad_norm": 0.18168224394321442, "learning_rate": 0.00031746520507451657, "loss": 7.9454, "step": 2969 }, { "epoch": 0.3655834564254062, "grad_norm": 0.18297579884529114, "learning_rate": 0.00031740362113560786, "loss": 8.3507, "step": 2970 }, { "epoch": 0.3657065484982767, "grad_norm": 0.24828094244003296, "learning_rate": 0.00031734203719669914, "loss": 7.4157, "step": 2971 }, { "epoch": 0.3658296405711472, "grad_norm": 0.2529247999191284, "learning_rate": 0.0003172804532577904, "loss": 7.421, "step": 2972 }, { "epoch": 0.36595273264401773, "grad_norm": 0.2222829908132553, "learning_rate": 0.00031721886931888166, "loss": 7.5899, "step": 2973 }, { "epoch": 0.3660758247168882, "grad_norm": 0.26635897159576416, "learning_rate": 0.0003171572853799729, "loss": 7.3436, "step": 2974 }, { "epoch": 0.36619891678975874, "grad_norm": 0.13231158256530762, "learning_rate": 0.0003170957014410642, "loss": 7.4364, "step": 2975 }, { "epoch": 0.3663220088626292, "grad_norm": 0.07792738080024719, "learning_rate": 0.0003170341175021554, "loss": 7.4274, "step": 2976 }, { "epoch": 0.36644510093549976, "grad_norm": 0.15416455268859863, "learning_rate": 0.00031697253356324675, "loss": 7.6716, "step": 2977 }, { "epoch": 0.36656819300837024, "grad_norm": 0.2602580189704895, "learning_rate": 0.000316910949624338, "loss": 7.8896, "step": 2978 }, { "epoch": 0.36669128508124077, "grad_norm": 0.2118823379278183, "learning_rate": 0.00031684936568542927, "loss": 7.7576, "step": 2979 }, { "epoch": 0.36681437715411125, "grad_norm": 0.22385814785957336, "learning_rate": 0.0003167877817465205, "loss": 7.3232, "step": 2980 }, { "epoch": 
0.3669374692269818, "grad_norm": 0.2617186903953552, "learning_rate": 0.0003167261978076118, "loss": 7.7505, "step": 2981 }, { "epoch": 0.3670605612998523, "grad_norm": 0.11456385999917984, "learning_rate": 0.000316664613868703, "loss": 7.7553, "step": 2982 }, { "epoch": 0.3671836533727228, "grad_norm": 0.22208917140960693, "learning_rate": 0.00031660302992979436, "loss": 7.1731, "step": 2983 }, { "epoch": 0.36730674544559333, "grad_norm": 0.14574198424816132, "learning_rate": 0.0003165414459908856, "loss": 8.2616, "step": 2984 }, { "epoch": 0.3674298375184638, "grad_norm": 0.21906135976314545, "learning_rate": 0.0003164798620519769, "loss": 7.3519, "step": 2985 }, { "epoch": 0.36755292959133434, "grad_norm": 0.1360561102628708, "learning_rate": 0.0003164182781130681, "loss": 7.6491, "step": 2986 }, { "epoch": 0.3676760216642048, "grad_norm": 0.1494409292936325, "learning_rate": 0.0003163566941741594, "loss": 8.0876, "step": 2987 }, { "epoch": 0.36779911373707536, "grad_norm": 0.1505209505558014, "learning_rate": 0.00031629511023525064, "loss": 7.8648, "step": 2988 }, { "epoch": 0.36792220580994583, "grad_norm": 0.09128907322883606, "learning_rate": 0.0003162335262963419, "loss": 7.7225, "step": 2989 }, { "epoch": 0.36804529788281637, "grad_norm": 0.10313864797353745, "learning_rate": 0.0003161719423574332, "loss": 7.8, "step": 2990 }, { "epoch": 0.36816838995568685, "grad_norm": 0.15508262813091278, "learning_rate": 0.0003161103584185245, "loss": 7.5729, "step": 2991 }, { "epoch": 0.3682914820285574, "grad_norm": 0.24139250814914703, "learning_rate": 0.00031604877447961573, "loss": 8.2216, "step": 2992 }, { "epoch": 0.36841457410142786, "grad_norm": 0.10430322587490082, "learning_rate": 0.000315987190540707, "loss": 7.4683, "step": 2993 }, { "epoch": 0.3685376661742984, "grad_norm": 0.15655194222927094, "learning_rate": 0.00031592560660179825, "loss": 7.3792, "step": 2994 }, { "epoch": 0.3686607582471689, "grad_norm": 0.1633729338645935, "learning_rate": 
0.00031586402266288953, "loss": 7.2328, "step": 2995 }, { "epoch": 0.3687838503200394, "grad_norm": 0.08491567522287369, "learning_rate": 0.00031580243872398077, "loss": 7.5322, "step": 2996 }, { "epoch": 0.3689069423929099, "grad_norm": 0.09362270683050156, "learning_rate": 0.0003157408547850721, "loss": 7.5915, "step": 2997 }, { "epoch": 0.3690300344657804, "grad_norm": 0.08437331765890121, "learning_rate": 0.00031567927084616334, "loss": 7.6972, "step": 2998 }, { "epoch": 0.3691531265386509, "grad_norm": 0.14269530773162842, "learning_rate": 0.0003156176869072546, "loss": 7.2751, "step": 2999 }, { "epoch": 0.36927621861152143, "grad_norm": 0.08998528867959976, "learning_rate": 0.00031555610296834586, "loss": 8.0489, "step": 3000 }, { "epoch": 0.3693993106843919, "grad_norm": 0.09180556237697601, "learning_rate": 0.00031549451902943714, "loss": 7.8146, "step": 3001 }, { "epoch": 0.36952240275726245, "grad_norm": 0.16656583547592163, "learning_rate": 0.0003154329350905284, "loss": 8.2136, "step": 3002 }, { "epoch": 0.3696454948301329, "grad_norm": 0.14221081137657166, "learning_rate": 0.00031537135115161966, "loss": 7.481, "step": 3003 }, { "epoch": 0.36976858690300346, "grad_norm": 0.11053703725337982, "learning_rate": 0.00031530976721271095, "loss": 8.1873, "step": 3004 }, { "epoch": 0.36989167897587394, "grad_norm": 0.2616908848285675, "learning_rate": 0.00031524818327380224, "loss": 7.1986, "step": 3005 }, { "epoch": 0.3700147710487445, "grad_norm": 0.16868318617343903, "learning_rate": 0.00031518659933489347, "loss": 7.4157, "step": 3006 }, { "epoch": 0.37013786312161495, "grad_norm": 0.1046394631266594, "learning_rate": 0.00031512501539598476, "loss": 7.7586, "step": 3007 }, { "epoch": 0.3702609551944855, "grad_norm": 0.1860823631286621, "learning_rate": 0.000315063431457076, "loss": 8.2587, "step": 3008 }, { "epoch": 0.37038404726735596, "grad_norm": 0.09034112095832825, "learning_rate": 0.0003150018475181673, "loss": 7.5033, "step": 3009 }, { "epoch": 
0.3705071393402265, "grad_norm": 0.11520108580589294, "learning_rate": 0.00031494026357925856, "loss": 7.564, "step": 3010 }, { "epoch": 0.370630231413097, "grad_norm": 0.1480437070131302, "learning_rate": 0.00031487867964034985, "loss": 7.9425, "step": 3011 }, { "epoch": 0.3707533234859675, "grad_norm": 0.06915151327848434, "learning_rate": 0.0003148170957014411, "loss": 7.4866, "step": 3012 }, { "epoch": 0.370876415558838, "grad_norm": 0.13593710958957672, "learning_rate": 0.00031475551176253237, "loss": 7.957, "step": 3013 }, { "epoch": 0.3709995076317085, "grad_norm": 0.18951065838336945, "learning_rate": 0.0003146939278236236, "loss": 7.2726, "step": 3014 }, { "epoch": 0.371122599704579, "grad_norm": 0.1346450299024582, "learning_rate": 0.0003146323438847149, "loss": 7.5897, "step": 3015 }, { "epoch": 0.37124569177744954, "grad_norm": 0.15400101244449615, "learning_rate": 0.0003145707599458061, "loss": 7.4138, "step": 3016 }, { "epoch": 0.37136878385032, "grad_norm": 0.09873016923666, "learning_rate": 0.00031450917600689746, "loss": 7.4632, "step": 3017 }, { "epoch": 0.37149187592319055, "grad_norm": 0.1100592166185379, "learning_rate": 0.0003144475920679887, "loss": 7.4399, "step": 3018 }, { "epoch": 0.37161496799606103, "grad_norm": 0.10644299536943436, "learning_rate": 0.00031438600812908, "loss": 7.4816, "step": 3019 }, { "epoch": 0.37173806006893156, "grad_norm": 0.1147780492901802, "learning_rate": 0.0003143244241901712, "loss": 7.3054, "step": 3020 }, { "epoch": 0.37186115214180204, "grad_norm": 0.25682544708251953, "learning_rate": 0.00031426284025126244, "loss": 7.7875, "step": 3021 }, { "epoch": 0.3719842442146726, "grad_norm": 0.1490458995103836, "learning_rate": 0.00031420125631235373, "loss": 7.5635, "step": 3022 }, { "epoch": 0.37210733628754306, "grad_norm": 0.2803194522857666, "learning_rate": 0.00031413967237344496, "loss": 8.7853, "step": 3023 }, { "epoch": 0.3722304283604136, "grad_norm": 0.07964340597391129, "learning_rate": 
0.0003140780884345363, "loss": 7.7351, "step": 3024 }, { "epoch": 0.3723535204332841, "grad_norm": 0.09037598967552185, "learning_rate": 0.00031401650449562753, "loss": 7.8511, "step": 3025 }, { "epoch": 0.3724766125061546, "grad_norm": 0.17422468960285187, "learning_rate": 0.0003139549205567188, "loss": 7.7057, "step": 3026 }, { "epoch": 0.37259970457902514, "grad_norm": 0.13467158377170563, "learning_rate": 0.00031389333661781005, "loss": 7.7279, "step": 3027 }, { "epoch": 0.3727227966518956, "grad_norm": 0.16781237721443176, "learning_rate": 0.00031383175267890134, "loss": 8.3044, "step": 3028 }, { "epoch": 0.37284588872476615, "grad_norm": 0.1501239538192749, "learning_rate": 0.00031377016873999257, "loss": 7.7345, "step": 3029 }, { "epoch": 0.37296898079763663, "grad_norm": 0.2074326127767563, "learning_rate": 0.00031370858480108386, "loss": 8.4254, "step": 3030 }, { "epoch": 0.37309207287050716, "grad_norm": 0.18442197144031525, "learning_rate": 0.00031364700086217515, "loss": 8.0803, "step": 3031 }, { "epoch": 0.37321516494337764, "grad_norm": 0.0806954950094223, "learning_rate": 0.00031358541692326643, "loss": 7.7713, "step": 3032 }, { "epoch": 0.3733382570162482, "grad_norm": 0.2125711888074875, "learning_rate": 0.00031352383298435766, "loss": 7.3678, "step": 3033 }, { "epoch": 0.37346134908911865, "grad_norm": 0.4074193835258484, "learning_rate": 0.00031346224904544895, "loss": 9.1945, "step": 3034 }, { "epoch": 0.3735844411619892, "grad_norm": 0.241207093000412, "learning_rate": 0.0003134006651065402, "loss": 8.2942, "step": 3035 }, { "epoch": 0.37370753323485967, "grad_norm": 0.09356258809566498, "learning_rate": 0.00031333908116763147, "loss": 7.6768, "step": 3036 }, { "epoch": 0.3738306253077302, "grad_norm": 0.13973161578178406, "learning_rate": 0.0003132774972287227, "loss": 7.5473, "step": 3037 }, { "epoch": 0.3739537173806007, "grad_norm": 0.1232042983174324, "learning_rate": 0.00031321591328981404, "loss": 7.331, "step": 3038 }, { "epoch": 
0.3740768094534712, "grad_norm": 0.18357697129249573, "learning_rate": 0.0003131543293509053, "loss": 7.5757, "step": 3039 }, { "epoch": 0.3741999015263417, "grad_norm": 0.1502806842327118, "learning_rate": 0.00031309274541199656, "loss": 7.5745, "step": 3040 }, { "epoch": 0.37432299359921223, "grad_norm": 0.09743788838386536, "learning_rate": 0.0003130311614730878, "loss": 7.8197, "step": 3041 }, { "epoch": 0.3744460856720827, "grad_norm": 0.08259011805057526, "learning_rate": 0.0003129695775341791, "loss": 7.4272, "step": 3042 }, { "epoch": 0.37456917774495324, "grad_norm": 0.10089518129825592, "learning_rate": 0.0003129079935952703, "loss": 7.7459, "step": 3043 }, { "epoch": 0.3746922698178237, "grad_norm": 0.10098287463188171, "learning_rate": 0.00031284640965636165, "loss": 7.6008, "step": 3044 }, { "epoch": 0.37481536189069425, "grad_norm": 0.17513592541217804, "learning_rate": 0.0003127848257174529, "loss": 8.1365, "step": 3045 }, { "epoch": 0.37493845396356473, "grad_norm": 0.08078008890151978, "learning_rate": 0.0003127232417785442, "loss": 7.3775, "step": 3046 }, { "epoch": 0.37506154603643527, "grad_norm": 0.09994050860404968, "learning_rate": 0.0003126616578396354, "loss": 7.5696, "step": 3047 }, { "epoch": 0.37518463810930575, "grad_norm": 0.1842900514602661, "learning_rate": 0.0003126000739007267, "loss": 7.655, "step": 3048 }, { "epoch": 0.3753077301821763, "grad_norm": 0.10333237797021866, "learning_rate": 0.0003125384899618179, "loss": 7.709, "step": 3049 }, { "epoch": 0.37543082225504676, "grad_norm": 0.13472558557987213, "learning_rate": 0.0003124769060229092, "loss": 7.6922, "step": 3050 }, { "epoch": 0.3755539143279173, "grad_norm": 0.09704159945249557, "learning_rate": 0.0003124153220840005, "loss": 7.4521, "step": 3051 }, { "epoch": 0.37567700640078777, "grad_norm": 0.1433943659067154, "learning_rate": 0.0003123537381450918, "loss": 7.8194, "step": 3052 }, { "epoch": 0.3758000984736583, "grad_norm": 0.19599686563014984, "learning_rate": 
0.000312292154206183, "loss": 8.1978, "step": 3053 }, { "epoch": 0.3759231905465288, "grad_norm": 0.11104434728622437, "learning_rate": 0.0003122305702672743, "loss": 7.4638, "step": 3054 }, { "epoch": 0.3760462826193993, "grad_norm": 0.10224871337413788, "learning_rate": 0.00031216898632836554, "loss": 7.843, "step": 3055 }, { "epoch": 0.3761693746922698, "grad_norm": 0.08740703761577606, "learning_rate": 0.0003121074023894568, "loss": 8.0132, "step": 3056 }, { "epoch": 0.37629246676514033, "grad_norm": 0.15388059616088867, "learning_rate": 0.00031204581845054806, "loss": 7.5995, "step": 3057 }, { "epoch": 0.3764155588380108, "grad_norm": 0.1513707935810089, "learning_rate": 0.0003119842345116394, "loss": 7.5684, "step": 3058 }, { "epoch": 0.37653865091088135, "grad_norm": 0.0682283416390419, "learning_rate": 0.00031192265057273063, "loss": 7.6148, "step": 3059 }, { "epoch": 0.3766617429837518, "grad_norm": 0.08079870790243149, "learning_rate": 0.0003118610666338219, "loss": 7.6176, "step": 3060 }, { "epoch": 0.37678483505662236, "grad_norm": 0.09724727272987366, "learning_rate": 0.00031179948269491315, "loss": 7.6953, "step": 3061 }, { "epoch": 0.37690792712949284, "grad_norm": 0.09119800478219986, "learning_rate": 0.00031173789875600443, "loss": 7.3381, "step": 3062 }, { "epoch": 0.37703101920236337, "grad_norm": 0.12325884401798248, "learning_rate": 0.00031167631481709567, "loss": 7.5988, "step": 3063 }, { "epoch": 0.37715411127523385, "grad_norm": 0.1005515456199646, "learning_rate": 0.000311614730878187, "loss": 7.6163, "step": 3064 }, { "epoch": 0.3772772033481044, "grad_norm": 0.14121216535568237, "learning_rate": 0.00031155314693927824, "loss": 7.5041, "step": 3065 }, { "epoch": 0.37740029542097486, "grad_norm": 0.4810296297073364, "learning_rate": 0.0003114915630003695, "loss": 9.3104, "step": 3066 }, { "epoch": 0.3775233874938454, "grad_norm": 0.0896548330783844, "learning_rate": 0.00031142997906146076, "loss": 7.6895, "step": 3067 }, { "epoch": 
0.3776464795667159, "grad_norm": 0.1813962310552597, "learning_rate": 0.00031136839512255205, "loss": 7.6519, "step": 3068 }, { "epoch": 0.3777695716395864, "grad_norm": 0.13539613783359528, "learning_rate": 0.0003113068111836433, "loss": 7.5015, "step": 3069 }, { "epoch": 0.37789266371245694, "grad_norm": 0.10598202049732208, "learning_rate": 0.00031124522724473456, "loss": 7.9559, "step": 3070 }, { "epoch": 0.3780157557853274, "grad_norm": 0.1425078809261322, "learning_rate": 0.00031118364330582585, "loss": 7.8827, "step": 3071 }, { "epoch": 0.37813884785819796, "grad_norm": 0.1883867233991623, "learning_rate": 0.00031112205936691714, "loss": 7.6648, "step": 3072 }, { "epoch": 0.37826193993106844, "grad_norm": 0.09638720750808716, "learning_rate": 0.00031106047542800837, "loss": 7.8968, "step": 3073 }, { "epoch": 0.37838503200393897, "grad_norm": 0.13441091775894165, "learning_rate": 0.00031099889148909966, "loss": 7.5817, "step": 3074 }, { "epoch": 0.37850812407680945, "grad_norm": 0.5553827881813049, "learning_rate": 0.0003109373075501909, "loss": 9.986, "step": 3075 }, { "epoch": 0.37863121614968, "grad_norm": 0.21181273460388184, "learning_rate": 0.0003108757236112822, "loss": 8.1972, "step": 3076 }, { "epoch": 0.37875430822255046, "grad_norm": 0.0974247008562088, "learning_rate": 0.0003108141396723734, "loss": 7.4367, "step": 3077 }, { "epoch": 0.378877400295421, "grad_norm": 0.08175565302371979, "learning_rate": 0.00031075255573346475, "loss": 7.6547, "step": 3078 }, { "epoch": 0.3790004923682915, "grad_norm": 0.08478499203920364, "learning_rate": 0.000310690971794556, "loss": 7.5714, "step": 3079 }, { "epoch": 0.379123584441162, "grad_norm": 0.08430355042219162, "learning_rate": 0.00031062938785564727, "loss": 7.6448, "step": 3080 }, { "epoch": 0.3792466765140325, "grad_norm": 0.10222592204809189, "learning_rate": 0.0003105678039167385, "loss": 7.4027, "step": 3081 }, { "epoch": 0.379369768586903, "grad_norm": 0.18672481179237366, "learning_rate": 
0.0003105062199778298, "loss": 8.0145, "step": 3082 }, { "epoch": 0.3794928606597735, "grad_norm": 0.12898536026477814, "learning_rate": 0.000310444636038921, "loss": 7.5145, "step": 3083 }, { "epoch": 0.37961595273264404, "grad_norm": 0.10943370312452316, "learning_rate": 0.0003103830521000123, "loss": 7.3331, "step": 3084 }, { "epoch": 0.3797390448055145, "grad_norm": 0.11972256749868393, "learning_rate": 0.0003103214681611036, "loss": 7.4877, "step": 3085 }, { "epoch": 0.37986213687838505, "grad_norm": 0.214512437582016, "learning_rate": 0.0003102598842221949, "loss": 8.0901, "step": 3086 }, { "epoch": 0.3799852289512555, "grad_norm": 0.11824792623519897, "learning_rate": 0.0003101983002832861, "loss": 7.8793, "step": 3087 }, { "epoch": 0.38010832102412606, "grad_norm": 0.1367042362689972, "learning_rate": 0.0003101367163443774, "loss": 7.5764, "step": 3088 }, { "epoch": 0.38023141309699654, "grad_norm": 0.08033132553100586, "learning_rate": 0.00031007513240546863, "loss": 7.8041, "step": 3089 }, { "epoch": 0.3803545051698671, "grad_norm": 0.0932929664850235, "learning_rate": 0.0003100135484665599, "loss": 7.4671, "step": 3090 }, { "epoch": 0.38047759724273755, "grad_norm": 0.12270371615886688, "learning_rate": 0.00030995196452765115, "loss": 7.731, "step": 3091 }, { "epoch": 0.3806006893156081, "grad_norm": 0.10962265729904175, "learning_rate": 0.0003098903805887425, "loss": 7.7592, "step": 3092 }, { "epoch": 0.38072378138847857, "grad_norm": 0.09531830996274948, "learning_rate": 0.0003098287966498337, "loss": 7.5449, "step": 3093 }, { "epoch": 0.3808468734613491, "grad_norm": 0.33448001742362976, "learning_rate": 0.000309767212710925, "loss": 9.0943, "step": 3094 }, { "epoch": 0.3809699655342196, "grad_norm": 0.18456894159317017, "learning_rate": 0.00030970562877201624, "loss": 8.5458, "step": 3095 }, { "epoch": 0.3810930576070901, "grad_norm": 0.36457785964012146, "learning_rate": 0.00030964404483310753, "loss": 8.8468, "step": 3096 }, { "epoch": 
0.3812161496799606, "grad_norm": 0.08046074956655502, "learning_rate": 0.00030958246089419876, "loss": 7.7234, "step": 3097 }, { "epoch": 0.3813392417528311, "grad_norm": 0.08402067422866821, "learning_rate": 0.0003095208769552901, "loss": 8.2256, "step": 3098 }, { "epoch": 0.3814623338257016, "grad_norm": 0.3325729966163635, "learning_rate": 0.00030945929301638133, "loss": 7.8329, "step": 3099 }, { "epoch": 0.38158542589857214, "grad_norm": 0.2502494752407074, "learning_rate": 0.0003093977090774726, "loss": 7.5164, "step": 3100 }, { "epoch": 0.3817085179714426, "grad_norm": 0.10875579714775085, "learning_rate": 0.00030933612513856385, "loss": 8.1486, "step": 3101 }, { "epoch": 0.38183161004431315, "grad_norm": 0.16521498560905457, "learning_rate": 0.00030927454119965514, "loss": 7.5357, "step": 3102 }, { "epoch": 0.38195470211718363, "grad_norm": 0.12998490035533905, "learning_rate": 0.00030921295726074637, "loss": 7.6101, "step": 3103 }, { "epoch": 0.38207779419005417, "grad_norm": 0.08480270206928253, "learning_rate": 0.00030915137332183766, "loss": 7.3674, "step": 3104 }, { "epoch": 0.38220088626292464, "grad_norm": 0.19876447319984436, "learning_rate": 0.00030908978938292894, "loss": 7.5373, "step": 3105 }, { "epoch": 0.3823239783357952, "grad_norm": 0.33604487776756287, "learning_rate": 0.00030902820544402023, "loss": 8.2962, "step": 3106 }, { "epoch": 0.38244707040866566, "grad_norm": 0.1895623505115509, "learning_rate": 0.00030896662150511146, "loss": 7.5845, "step": 3107 }, { "epoch": 0.3825701624815362, "grad_norm": 0.17444422841072083, "learning_rate": 0.00030890503756620275, "loss": 7.6617, "step": 3108 }, { "epoch": 0.38269325455440667, "grad_norm": 0.25474265217781067, "learning_rate": 0.000308843453627294, "loss": 8.3729, "step": 3109 }, { "epoch": 0.3828163466272772, "grad_norm": 0.17992623150348663, "learning_rate": 0.00030878186968838527, "loss": 8.3689, "step": 3110 }, { "epoch": 0.3829394387001477, "grad_norm": 0.12140019983053207, 
"learning_rate": 0.0003087202857494765, "loss": 7.5181, "step": 3111 }, { "epoch": 0.3830625307730182, "grad_norm": 0.23391543328762054, "learning_rate": 0.00030865870181056784, "loss": 7.9337, "step": 3112 }, { "epoch": 0.38318562284588875, "grad_norm": 0.1859949678182602, "learning_rate": 0.0003085971178716591, "loss": 7.603, "step": 3113 }, { "epoch": 0.38330871491875923, "grad_norm": 0.2651079297065735, "learning_rate": 0.00030853553393275036, "loss": 7.414, "step": 3114 }, { "epoch": 0.38343180699162976, "grad_norm": 0.1838216930627823, "learning_rate": 0.0003084739499938416, "loss": 7.6165, "step": 3115 }, { "epoch": 0.38355489906450024, "grad_norm": 0.13169890642166138, "learning_rate": 0.0003084123660549329, "loss": 7.483, "step": 3116 }, { "epoch": 0.3836779911373708, "grad_norm": 0.07398952543735504, "learning_rate": 0.0003083507821160241, "loss": 7.498, "step": 3117 }, { "epoch": 0.38380108321024126, "grad_norm": 0.19152024388313293, "learning_rate": 0.0003082891981771154, "loss": 7.9405, "step": 3118 }, { "epoch": 0.3839241752831118, "grad_norm": 0.14557874202728271, "learning_rate": 0.0003082276142382067, "loss": 7.55, "step": 3119 }, { "epoch": 0.38404726735598227, "grad_norm": 0.16853627562522888, "learning_rate": 0.00030816603029929797, "loss": 7.5914, "step": 3120 }, { "epoch": 0.3841703594288528, "grad_norm": 0.09874357283115387, "learning_rate": 0.0003081044463603892, "loss": 7.4671, "step": 3121 }, { "epoch": 0.3842934515017233, "grad_norm": 0.14564067125320435, "learning_rate": 0.0003080428624214805, "loss": 7.7534, "step": 3122 }, { "epoch": 0.3844165435745938, "grad_norm": 0.08563573658466339, "learning_rate": 0.0003079812784825717, "loss": 7.6074, "step": 3123 }, { "epoch": 0.3845396356474643, "grad_norm": 0.12772436439990997, "learning_rate": 0.000307919694543663, "loss": 7.5567, "step": 3124 }, { "epoch": 0.38466272772033483, "grad_norm": 0.12185227125883102, "learning_rate": 0.0003078581106047543, "loss": 7.8357, "step": 3125 }, { 
"epoch": 0.3847858197932053, "grad_norm": 0.13667136430740356, "learning_rate": 0.0003077965266658456, "loss": 7.7075, "step": 3126 }, { "epoch": 0.38490891186607584, "grad_norm": 0.08611146360635757, "learning_rate": 0.0003077349427269368, "loss": 7.6751, "step": 3127 }, { "epoch": 0.3850320039389463, "grad_norm": 0.10589323192834854, "learning_rate": 0.0003076733587880281, "loss": 7.3937, "step": 3128 }, { "epoch": 0.38515509601181686, "grad_norm": 0.733491837978363, "learning_rate": 0.00030761177484911934, "loss": 10.3462, "step": 3129 }, { "epoch": 0.38527818808468733, "grad_norm": 0.07851874828338623, "learning_rate": 0.0003075501909102106, "loss": 7.3316, "step": 3130 }, { "epoch": 0.38540128015755787, "grad_norm": 0.11272679269313812, "learning_rate": 0.00030748860697130185, "loss": 7.3537, "step": 3131 }, { "epoch": 0.38552437223042835, "grad_norm": 0.2042165994644165, "learning_rate": 0.0003074270230323932, "loss": 7.7874, "step": 3132 }, { "epoch": 0.3856474643032989, "grad_norm": 0.11074353009462357, "learning_rate": 0.00030736543909348443, "loss": 7.8988, "step": 3133 }, { "epoch": 0.38577055637616936, "grad_norm": 0.06245617941021919, "learning_rate": 0.0003073038551545757, "loss": 7.5937, "step": 3134 }, { "epoch": 0.3858936484490399, "grad_norm": 0.14772866666316986, "learning_rate": 0.00030724227121566695, "loss": 7.2299, "step": 3135 }, { "epoch": 0.3860167405219104, "grad_norm": 0.13235417008399963, "learning_rate": 0.00030718068727675823, "loss": 7.3532, "step": 3136 }, { "epoch": 0.3861398325947809, "grad_norm": 0.06778993457555771, "learning_rate": 0.00030711910333784947, "loss": 7.7911, "step": 3137 }, { "epoch": 0.3862629246676514, "grad_norm": 0.10910683125257492, "learning_rate": 0.00030705751939894075, "loss": 7.6697, "step": 3138 }, { "epoch": 0.3863860167405219, "grad_norm": 0.0674956738948822, "learning_rate": 0.00030699593546003204, "loss": 7.5289, "step": 3139 }, { "epoch": 0.3865091088133924, "grad_norm": 0.07998859137296677, 
"learning_rate": 0.0003069343515211233, "loss": 7.439, "step": 3140 }, { "epoch": 0.38663220088626293, "grad_norm": 0.11699474602937698, "learning_rate": 0.00030687276758221456, "loss": 7.4753, "step": 3141 }, { "epoch": 0.3867552929591334, "grad_norm": 0.12446147203445435, "learning_rate": 0.00030681118364330584, "loss": 7.7802, "step": 3142 }, { "epoch": 0.38687838503200395, "grad_norm": 0.17449618875980377, "learning_rate": 0.0003067495997043971, "loss": 7.3319, "step": 3143 }, { "epoch": 0.3870014771048744, "grad_norm": 0.2933885455131531, "learning_rate": 0.00030668801576548836, "loss": 8.3903, "step": 3144 }, { "epoch": 0.38712456917774496, "grad_norm": 0.10361864417791367, "learning_rate": 0.0003066264318265796, "loss": 7.6337, "step": 3145 }, { "epoch": 0.38724766125061544, "grad_norm": 0.1452273726463318, "learning_rate": 0.00030656484788767094, "loss": 8.1014, "step": 3146 }, { "epoch": 0.387370753323486, "grad_norm": 0.7229639291763306, "learning_rate": 0.00030650326394876217, "loss": 10.6175, "step": 3147 }, { "epoch": 0.38749384539635645, "grad_norm": 0.13074128329753876, "learning_rate": 0.00030644168000985346, "loss": 7.6076, "step": 3148 }, { "epoch": 0.387616937469227, "grad_norm": 0.10383240133523941, "learning_rate": 0.0003063800960709447, "loss": 8.3787, "step": 3149 }, { "epoch": 0.38774002954209746, "grad_norm": 0.1362927258014679, "learning_rate": 0.000306318512132036, "loss": 7.5621, "step": 3150 }, { "epoch": 0.387863121614968, "grad_norm": 0.16661806404590607, "learning_rate": 0.0003062569281931272, "loss": 7.5601, "step": 3151 }, { "epoch": 0.3879862136878385, "grad_norm": 0.16281963884830475, "learning_rate": 0.00030619534425421855, "loss": 7.4406, "step": 3152 }, { "epoch": 0.388109305760709, "grad_norm": 0.10701125115156174, "learning_rate": 0.0003061337603153098, "loss": 8.1564, "step": 3153 }, { "epoch": 0.3882323978335795, "grad_norm": 0.11540091037750244, "learning_rate": 0.00030607217637640107, "loss": 7.8958, "step": 3154 }, { 
"epoch": 0.38835548990645, "grad_norm": 0.1252928525209427, "learning_rate": 0.0003060105924374923, "loss": 7.9145, "step": 3155 }, { "epoch": 0.38847858197932056, "grad_norm": 0.07162687927484512, "learning_rate": 0.0003059490084985836, "loss": 7.6093, "step": 3156 }, { "epoch": 0.38860167405219104, "grad_norm": 0.09578486531972885, "learning_rate": 0.0003058874245596748, "loss": 7.6737, "step": 3157 }, { "epoch": 0.38872476612506157, "grad_norm": 0.11694812029600143, "learning_rate": 0.0003058258406207661, "loss": 8.1366, "step": 3158 }, { "epoch": 0.38884785819793205, "grad_norm": 0.17057359218597412, "learning_rate": 0.0003057642566818574, "loss": 7.4636, "step": 3159 }, { "epoch": 0.3889709502708026, "grad_norm": 0.12225764989852905, "learning_rate": 0.0003057026727429487, "loss": 8.24, "step": 3160 }, { "epoch": 0.38909404234367306, "grad_norm": 0.11751092225313187, "learning_rate": 0.0003056410888040399, "loss": 7.5802, "step": 3161 }, { "epoch": 0.3892171344165436, "grad_norm": 0.14919281005859375, "learning_rate": 0.0003055795048651312, "loss": 7.3752, "step": 3162 }, { "epoch": 0.3893402264894141, "grad_norm": 0.15354342758655548, "learning_rate": 0.00030551792092622243, "loss": 7.8324, "step": 3163 }, { "epoch": 0.3894633185622846, "grad_norm": 0.13424146175384521, "learning_rate": 0.0003054563369873137, "loss": 7.5982, "step": 3164 }, { "epoch": 0.3895864106351551, "grad_norm": 0.10614205151796341, "learning_rate": 0.00030539475304840495, "loss": 7.094, "step": 3165 }, { "epoch": 0.3897095027080256, "grad_norm": 0.3682573139667511, "learning_rate": 0.0003053331691094963, "loss": 8.6544, "step": 3166 }, { "epoch": 0.3898325947808961, "grad_norm": 0.11981778591871262, "learning_rate": 0.0003052715851705875, "loss": 7.3184, "step": 3167 }, { "epoch": 0.38995568685376664, "grad_norm": 0.09553389251232147, "learning_rate": 0.0003052100012316788, "loss": 7.1681, "step": 3168 }, { "epoch": 0.3900787789266371, "grad_norm": 0.24781553447246552, "learning_rate": 
0.00030514841729277004, "loss": 7.8139, "step": 3169 }, { "epoch": 0.39020187099950765, "grad_norm": 0.1014440730214119, "learning_rate": 0.0003050868333538613, "loss": 7.6861, "step": 3170 }, { "epoch": 0.39032496307237813, "grad_norm": 0.23176802694797516, "learning_rate": 0.00030502524941495256, "loss": 8.3256, "step": 3171 }, { "epoch": 0.39044805514524866, "grad_norm": 0.18654410541057587, "learning_rate": 0.00030496366547604385, "loss": 7.3065, "step": 3172 }, { "epoch": 0.39057114721811914, "grad_norm": 0.23651915788650513, "learning_rate": 0.00030490208153713513, "loss": 8.597, "step": 3173 }, { "epoch": 0.3906942392909897, "grad_norm": 0.15547308325767517, "learning_rate": 0.0003048404975982264, "loss": 7.6153, "step": 3174 }, { "epoch": 0.39081733136386015, "grad_norm": 0.2264196276664734, "learning_rate": 0.00030477891365931765, "loss": 7.6756, "step": 3175 }, { "epoch": 0.3909404234367307, "grad_norm": 0.2826294004917145, "learning_rate": 0.00030471732972040894, "loss": 7.4804, "step": 3176 }, { "epoch": 0.39106351550960117, "grad_norm": 0.20514947175979614, "learning_rate": 0.00030465574578150017, "loss": 7.741, "step": 3177 }, { "epoch": 0.3911866075824717, "grad_norm": 0.15700411796569824, "learning_rate": 0.00030459416184259146, "loss": 8.3921, "step": 3178 }, { "epoch": 0.3913096996553422, "grad_norm": 0.14833541214466095, "learning_rate": 0.00030453257790368274, "loss": 7.383, "step": 3179 }, { "epoch": 0.3914327917282127, "grad_norm": 0.1191936805844307, "learning_rate": 0.00030447099396477403, "loss": 7.4914, "step": 3180 }, { "epoch": 0.3915558838010832, "grad_norm": 0.13065345585346222, "learning_rate": 0.00030440941002586526, "loss": 7.5379, "step": 3181 }, { "epoch": 0.3916789758739537, "grad_norm": 0.23358950018882751, "learning_rate": 0.00030434782608695655, "loss": 7.7206, "step": 3182 }, { "epoch": 0.3918020679468242, "grad_norm": 0.156193807721138, "learning_rate": 0.0003042862421480478, "loss": 7.3937, "step": 3183 }, { "epoch": 
0.39192516001969474, "grad_norm": 0.13826514780521393, "learning_rate": 0.00030422465820913907, "loss": 7.0809, "step": 3184 }, { "epoch": 0.3920482520925652, "grad_norm": 0.10907107591629028, "learning_rate": 0.0003041630742702303, "loss": 7.5393, "step": 3185 }, { "epoch": 0.39217134416543575, "grad_norm": 0.11853128671646118, "learning_rate": 0.00030410149033132164, "loss": 7.7218, "step": 3186 }, { "epoch": 0.39229443623830623, "grad_norm": 0.1863696128129959, "learning_rate": 0.0003040399063924129, "loss": 7.9211, "step": 3187 }, { "epoch": 0.39241752831117677, "grad_norm": 0.09503605216741562, "learning_rate": 0.00030397832245350416, "loss": 7.7469, "step": 3188 }, { "epoch": 0.39254062038404725, "grad_norm": 0.13207070529460907, "learning_rate": 0.0003039167385145954, "loss": 7.5748, "step": 3189 }, { "epoch": 0.3926637124569178, "grad_norm": 0.21760205924510956, "learning_rate": 0.0003038551545756867, "loss": 7.4526, "step": 3190 }, { "epoch": 0.39278680452978826, "grad_norm": 0.3739995062351227, "learning_rate": 0.0003037935706367779, "loss": 7.2962, "step": 3191 }, { "epoch": 0.3929098966026588, "grad_norm": 0.2073931246995926, "learning_rate": 0.0003037319866978692, "loss": 7.3157, "step": 3192 }, { "epoch": 0.39303298867552927, "grad_norm": 0.17728932201862335, "learning_rate": 0.0003036704027589605, "loss": 7.501, "step": 3193 }, { "epoch": 0.3931560807483998, "grad_norm": 0.08817242085933685, "learning_rate": 0.00030360881882005177, "loss": 7.448, "step": 3194 }, { "epoch": 0.3932791728212703, "grad_norm": 0.34072160720825195, "learning_rate": 0.000303547234881143, "loss": 8.4849, "step": 3195 }, { "epoch": 0.3934022648941408, "grad_norm": 0.337503582239151, "learning_rate": 0.0003034856509422343, "loss": 8.8161, "step": 3196 }, { "epoch": 0.3935253569670113, "grad_norm": 0.14725831151008606, "learning_rate": 0.0003034240670033255, "loss": 8.0307, "step": 3197 }, { "epoch": 0.39364844903988183, "grad_norm": 0.1204509511590004, "learning_rate": 
0.0003033624830644168, "loss": 7.3399, "step": 3198 }, { "epoch": 0.39377154111275237, "grad_norm": 0.08881990611553192, "learning_rate": 0.00030330089912550804, "loss": 7.512, "step": 3199 }, { "epoch": 0.39389463318562284, "grad_norm": null, "learning_rate": 0.0003032393151865994, "loss": 7.8125, "step": 3200 }, { "epoch": 0.3940177252584934, "grad_norm": 0.10432785004377365, "learning_rate": 0.0003031777312476906, "loss": 7.5432, "step": 3201 }, { "epoch": 0.39414081733136386, "grad_norm": 0.2808225154876709, "learning_rate": 0.0003031161473087819, "loss": 8.7862, "step": 3202 }, { "epoch": 0.3942639094042344, "grad_norm": 0.5020664930343628, "learning_rate": 0.00030305456336987313, "loss": 8.0928, "step": 3203 }, { "epoch": 0.39438700147710487, "grad_norm": 0.3753533661365509, "learning_rate": 0.0003029929794309644, "loss": 7.8877, "step": 3204 }, { "epoch": 0.3945100935499754, "grad_norm": 0.18967828154563904, "learning_rate": 0.00030293139549205565, "loss": 7.8162, "step": 3205 }, { "epoch": 0.3946331856228459, "grad_norm": 0.1963283121585846, "learning_rate": 0.000302869811553147, "loss": 7.5523, "step": 3206 }, { "epoch": 0.3947562776957164, "grad_norm": 0.18215061724185944, "learning_rate": 0.0003028082276142382, "loss": 7.937, "step": 3207 }, { "epoch": 0.3948793697685869, "grad_norm": 0.18311326205730438, "learning_rate": 0.0003027466436753295, "loss": 7.9985, "step": 3208 }, { "epoch": 0.39500246184145743, "grad_norm": 0.22652648389339447, "learning_rate": 0.00030268505973642075, "loss": 8.1424, "step": 3209 }, { "epoch": 0.3951255539143279, "grad_norm": 0.1597173511981964, "learning_rate": 0.00030262347579751203, "loss": 7.9321, "step": 3210 }, { "epoch": 0.39524864598719844, "grad_norm": 0.22289630770683289, "learning_rate": 0.00030256189185860326, "loss": 7.9326, "step": 3211 }, { "epoch": 0.3953717380600689, "grad_norm": 0.3163277804851532, "learning_rate": 0.00030250030791969455, "loss": 7.8073, "step": 3212 }, { "epoch": 0.39549483013293946,
"grad_norm": 0.08026651293039322, "learning_rate": 0.00030243872398078584, "loss": 7.2451, "step": 3213 }, { "epoch": 0.39561792220580994, "grad_norm": 0.15422941744327545, "learning_rate": 0.0003023771400418771, "loss": 7.9321, "step": 3214 }, { "epoch": 0.39574101427868047, "grad_norm": 0.2990172803401947, "learning_rate": 0.00030231555610296836, "loss": 7.6542, "step": 3215 }, { "epoch": 0.39586410635155095, "grad_norm": 0.1528536081314087, "learning_rate": 0.00030225397216405964, "loss": 7.7264, "step": 3216 }, { "epoch": 0.3959871984244215, "grad_norm": 0.1135997548699379, "learning_rate": 0.0003021923882251509, "loss": 7.5983, "step": 3217 }, { "epoch": 0.39611029049729196, "grad_norm": 0.200113907456398, "learning_rate": 0.00030213080428624216, "loss": 8.0079, "step": 3218 }, { "epoch": 0.3962333825701625, "grad_norm": 0.1277262568473816, "learning_rate": 0.0003020692203473334, "loss": 7.4153, "step": 3219 }, { "epoch": 0.396356474643033, "grad_norm": 0.09531358629465103, "learning_rate": 0.00030200763640842474, "loss": 7.4284, "step": 3220 }, { "epoch": 0.3964795667159035, "grad_norm": 0.0980452299118042, "learning_rate": 0.00030194605246951597, "loss": 7.6802, "step": 3221 }, { "epoch": 0.396602658788774, "grad_norm": 0.14292584359645844, "learning_rate": 0.00030188446853060725, "loss": 8.5096, "step": 3222 }, { "epoch": 0.3967257508616445, "grad_norm": 0.31039610505104065, "learning_rate": 0.0003018228845916985, "loss": 7.4455, "step": 3223 }, { "epoch": 0.396848842934515, "grad_norm": 0.22151197493076324, "learning_rate": 0.0003017613006527898, "loss": 7.6804, "step": 3224 }, { "epoch": 0.39697193500738553, "grad_norm": 0.18008001148700714, "learning_rate": 0.000301699716713881, "loss": 7.4757, "step": 3225 }, { "epoch": 0.397095027080256, "grad_norm": 0.16399772465229034, "learning_rate": 0.0003016381327749723, "loss": 7.6574, "step": 3226 }, { "epoch": 0.39721811915312655, "grad_norm": 0.403380811214447, "learning_rate": 0.0003015765488360636, "loss": 
8.7429, "step": 3227 }, { "epoch": 0.397341211225997, "grad_norm": 0.2089257687330246, "learning_rate": 0.00030151496489715487, "loss": 8.3785, "step": 3228 }, { "epoch": 0.39746430329886756, "grad_norm": 0.12771889567375183, "learning_rate": 0.0003014533809582461, "loss": 7.4134, "step": 3229 }, { "epoch": 0.39758739537173804, "grad_norm": 0.06799109280109406, "learning_rate": 0.0003013917970193374, "loss": 7.6771, "step": 3230 }, { "epoch": 0.3977104874446086, "grad_norm": 0.13995949923992157, "learning_rate": 0.0003013302130804286, "loss": 7.3391, "step": 3231 }, { "epoch": 0.39783357951747905, "grad_norm": 0.09038813412189484, "learning_rate": 0.0003012686291415199, "loss": 7.5554, "step": 3232 }, { "epoch": 0.3979566715903496, "grad_norm": 0.09971991926431656, "learning_rate": 0.00030120704520261114, "loss": 7.2636, "step": 3233 }, { "epoch": 0.39807976366322007, "grad_norm": 0.2109566628932953, "learning_rate": 0.0003011454612637025, "loss": 7.7657, "step": 3234 }, { "epoch": 0.3982028557360906, "grad_norm": 0.20874401926994324, "learning_rate": 0.0003010838773247937, "loss": 7.829, "step": 3235 }, { "epoch": 0.3983259478089611, "grad_norm": 0.12725205719470978, "learning_rate": 0.000301022293385885, "loss": 7.5132, "step": 3236 }, { "epoch": 0.3984490398818316, "grad_norm": 0.08595632016658783, "learning_rate": 0.00030096070944697623, "loss": 7.7838, "step": 3237 }, { "epoch": 0.3985721319547021, "grad_norm": 0.1473664492368698, "learning_rate": 0.0003008991255080675, "loss": 7.6171, "step": 3238 }, { "epoch": 0.3986952240275726, "grad_norm": 0.21142899990081787, "learning_rate": 0.00030083754156915875, "loss": 7.4907, "step": 3239 }, { "epoch": 0.3988183161004431, "grad_norm": 0.13939836621284485, "learning_rate": 0.0003007759576302501, "loss": 7.857, "step": 3240 }, { "epoch": 0.39894140817331364, "grad_norm": 0.17253988981246948, "learning_rate": 0.0003007143736913413, "loss": 7.6136, "step": 3241 }, { "epoch": 0.3990645002461842, "grad_norm": 
0.16808119416236877, "learning_rate": 0.0003006527897524326, "loss": 7.6173, "step": 3242 }, { "epoch": 0.39918759231905465, "grad_norm": 0.1442662477493286, "learning_rate": 0.00030059120581352384, "loss": 7.8711, "step": 3243 }, { "epoch": 0.3993106843919252, "grad_norm": 0.0940152257680893, "learning_rate": 0.0003005296218746151, "loss": 7.5143, "step": 3244 }, { "epoch": 0.39943377646479566, "grad_norm": 0.19924770295619965, "learning_rate": 0.00030046803793570636, "loss": 7.7513, "step": 3245 }, { "epoch": 0.3995568685376662, "grad_norm": 0.1458856165409088, "learning_rate": 0.00030040645399679764, "loss": 7.7205, "step": 3246 }, { "epoch": 0.3996799606105367, "grad_norm": 0.10171504318714142, "learning_rate": 0.00030034487005788893, "loss": 7.3971, "step": 3247 }, { "epoch": 0.3998030526834072, "grad_norm": 0.09578186273574829, "learning_rate": 0.0003002832861189802, "loss": 7.2599, "step": 3248 }, { "epoch": 0.3999261447562777, "grad_norm": 0.21613356471061707, "learning_rate": 0.00030022170218007145, "loss": 7.6368, "step": 3249 }, { "epoch": 0.4000492368291482, "grad_norm": 0.22408922016620636, "learning_rate": 0.00030016011824116274, "loss": 7.561, "step": 3250 }, { "epoch": 0.4001723289020187, "grad_norm": 0.1334226429462433, "learning_rate": 0.00030009853430225397, "loss": 7.4705, "step": 3251 }, { "epoch": 0.40029542097488924, "grad_norm": 0.09823063760995865, "learning_rate": 0.00030003695036334526, "loss": 7.3993, "step": 3252 }, { "epoch": 0.4004185130477597, "grad_norm": 0.1513681709766388, "learning_rate": 0.0002999753664244365, "loss": 7.6466, "step": 3253 }, { "epoch": 0.40054160512063025, "grad_norm": 0.095914326608181, "learning_rate": 0.0002999137824855278, "loss": 7.858, "step": 3254 }, { "epoch": 0.40066469719350073, "grad_norm": 0.08087729662656784, "learning_rate": 0.00029985219854661906, "loss": 7.806, "step": 3255 }, { "epoch": 0.40078778926637126, "grad_norm": 0.17966192960739136, "learning_rate": 0.0002997906146077103, "loss": 8.9868, 
"step": 3256 }, { "epoch": 0.40091088133924174, "grad_norm": 0.14108425378799438, "learning_rate": 0.0002997290306688016, "loss": 7.4832, "step": 3257 }, { "epoch": 0.4010339734121123, "grad_norm": 0.1160062700510025, "learning_rate": 0.0002996674467298928, "loss": 7.6471, "step": 3258 }, { "epoch": 0.40115706548498276, "grad_norm": 0.09827651083469391, "learning_rate": 0.0002996058627909841, "loss": 7.3379, "step": 3259 }, { "epoch": 0.4012801575578533, "grad_norm": 0.13865920901298523, "learning_rate": 0.00029954427885207533, "loss": 7.5177, "step": 3260 }, { "epoch": 0.40140324963072377, "grad_norm": 0.17442955076694489, "learning_rate": 0.00029948269491316667, "loss": 7.6392, "step": 3261 }, { "epoch": 0.4015263417035943, "grad_norm": 0.1499909907579422, "learning_rate": 0.0002994211109742579, "loss": 7.7354, "step": 3262 }, { "epoch": 0.4016494337764648, "grad_norm": 0.1464097648859024, "learning_rate": 0.0002993595270353492, "loss": 7.9416, "step": 3263 }, { "epoch": 0.4017725258493353, "grad_norm": 0.07562500238418579, "learning_rate": 0.0002992979430964404, "loss": 7.9151, "step": 3264 }, { "epoch": 0.4018956179222058, "grad_norm": 0.10009391605854034, "learning_rate": 0.0002992363591575317, "loss": 7.6481, "step": 3265 }, { "epoch": 0.40201870999507633, "grad_norm": 0.13462601602077484, "learning_rate": 0.00029917477521862294, "loss": 7.9938, "step": 3266 }, { "epoch": 0.4021418020679468, "grad_norm": 0.13821563124656677, "learning_rate": 0.0002991131912797143, "loss": 7.5948, "step": 3267 }, { "epoch": 0.40226489414081734, "grad_norm": 0.2604524791240692, "learning_rate": 0.0002990516073408055, "loss": 7.9251, "step": 3268 }, { "epoch": 0.4023879862136878, "grad_norm": 0.1196378767490387, "learning_rate": 0.0002989900234018968, "loss": 7.9145, "step": 3269 }, { "epoch": 0.40251107828655835, "grad_norm": 0.09886053949594498, "learning_rate": 0.00029892843946298804, "loss": 7.768, "step": 3270 }, { "epoch": 0.40263417035942883, "grad_norm": 
0.12841066718101501, "learning_rate": 0.0002988668555240793, "loss": 7.7086, "step": 3271 }, { "epoch": 0.40275726243229937, "grad_norm": 0.12002388387918472, "learning_rate": 0.00029880527158517055, "loss": 8.5246, "step": 3272 }, { "epoch": 0.40288035450516985, "grad_norm": 0.2262330800294876, "learning_rate": 0.00029874368764626184, "loss": 7.4068, "step": 3273 }, { "epoch": 0.4030034465780404, "grad_norm": 0.1605021208524704, "learning_rate": 0.00029868210370735313, "loss": 7.4706, "step": 3274 }, { "epoch": 0.40312653865091086, "grad_norm": 0.2173081487417221, "learning_rate": 0.0002986205197684444, "loss": 8.5975, "step": 3275 }, { "epoch": 0.4032496307237814, "grad_norm": 0.12586483359336853, "learning_rate": 0.00029855893582953565, "loss": 7.3074, "step": 3276 }, { "epoch": 0.4033727227966519, "grad_norm": 0.11703325808048248, "learning_rate": 0.00029849735189062693, "loss": 7.4842, "step": 3277 }, { "epoch": 0.4034958148695224, "grad_norm": 0.20446327328681946, "learning_rate": 0.00029843576795171817, "loss": 7.9029, "step": 3278 }, { "epoch": 0.4036189069423929, "grad_norm": 0.15157531201839447, "learning_rate": 0.00029837418401280945, "loss": 7.526, "step": 3279 }, { "epoch": 0.4037419990152634, "grad_norm": 0.07305626571178436, "learning_rate": 0.0002983126000739007, "loss": 7.7204, "step": 3280 }, { "epoch": 0.4038650910881339, "grad_norm": 0.24101732671260834, "learning_rate": 0.000298251016134992, "loss": 7.2219, "step": 3281 }, { "epoch": 0.40398818316100443, "grad_norm": 0.1586274355649948, "learning_rate": 0.00029818943219608326, "loss": 7.7139, "step": 3282 }, { "epoch": 0.4041112752338749, "grad_norm": 0.16291561722755432, "learning_rate": 0.00029812784825717454, "loss": 7.6095, "step": 3283 }, { "epoch": 0.40423436730674545, "grad_norm": 0.11026662588119507, "learning_rate": 0.0002980662643182658, "loss": 7.6756, "step": 3284 }, { "epoch": 0.404357459379616, "grad_norm": 0.07430389523506165, "learning_rate": 0.00029800468037935706, "loss": 
7.4736, "step": 3285 }, { "epoch": 0.40448055145248646, "grad_norm": 0.5741116404533386, "learning_rate": 0.0002979430964404483, "loss": 10.1689, "step": 3286 }, { "epoch": 0.404603643525357, "grad_norm": 0.16388723254203796, "learning_rate": 0.0002978815125015396, "loss": 7.4025, "step": 3287 }, { "epoch": 0.40472673559822747, "grad_norm": 0.2549147307872772, "learning_rate": 0.00029781992856263087, "loss": 8.8266, "step": 3288 }, { "epoch": 0.404849827671098, "grad_norm": 0.20517902076244354, "learning_rate": 0.00029775834462372216, "loss": 7.2463, "step": 3289 }, { "epoch": 0.4049729197439685, "grad_norm": 0.15995652973651886, "learning_rate": 0.0002976967606848134, "loss": 7.6932, "step": 3290 }, { "epoch": 0.405096011816839, "grad_norm": 0.11424554884433746, "learning_rate": 0.0002976351767459047, "loss": 7.4735, "step": 3291 }, { "epoch": 0.4052191038897095, "grad_norm": 0.09845732152462006, "learning_rate": 0.0002975735928069959, "loss": 7.5892, "step": 3292 }, { "epoch": 0.40534219596258003, "grad_norm": 0.15086740255355835, "learning_rate": 0.0002975120088680872, "loss": 7.5647, "step": 3293 }, { "epoch": 0.4054652880354505, "grad_norm": 0.15239864587783813, "learning_rate": 0.0002974504249291785, "loss": 7.4927, "step": 3294 }, { "epoch": 0.40558838010832104, "grad_norm": 0.08337798714637756, "learning_rate": 0.00029738884099026977, "loss": 7.7366, "step": 3295 }, { "epoch": 0.4057114721811915, "grad_norm": 0.21092931926250458, "learning_rate": 0.000297327257051361, "loss": 8.5465, "step": 3296 }, { "epoch": 0.40583456425406206, "grad_norm": 0.10628166049718857, "learning_rate": 0.0002972656731124523, "loss": 7.781, "step": 3297 }, { "epoch": 0.40595765632693254, "grad_norm": 0.15920929610729218, "learning_rate": 0.0002972040891735435, "loss": 7.4847, "step": 3298 }, { "epoch": 0.40608074839980307, "grad_norm": 0.1693803369998932, "learning_rate": 0.0002971425052346348, "loss": 7.4629, "step": 3299 }, { "epoch": 0.40620384047267355, "grad_norm": 
0.09720007330179214, "learning_rate": 0.00029708092129572604, "loss": 7.9497, "step": 3300 }, { "epoch": 0.4063269325455441, "grad_norm": 0.08559593558311462, "learning_rate": 0.0002970193373568174, "loss": 7.603, "step": 3301 }, { "epoch": 0.40645002461841456, "grad_norm": 0.09132438898086548, "learning_rate": 0.0002969577534179086, "loss": 7.604, "step": 3302 }, { "epoch": 0.4065731166912851, "grad_norm": 0.09539563208818436, "learning_rate": 0.0002968961694789999, "loss": 7.4806, "step": 3303 }, { "epoch": 0.4066962087641556, "grad_norm": 0.09991759061813354, "learning_rate": 0.00029683458554009113, "loss": 7.6617, "step": 3304 }, { "epoch": 0.4068193008370261, "grad_norm": 0.08634336292743683, "learning_rate": 0.0002967730016011824, "loss": 7.5232, "step": 3305 }, { "epoch": 0.4069423929098966, "grad_norm": 0.13952940702438354, "learning_rate": 0.00029671141766227365, "loss": 7.9846, "step": 3306 }, { "epoch": 0.4070654849827671, "grad_norm": 0.09231118857860565, "learning_rate": 0.00029664983372336493, "loss": 7.5471, "step": 3307 }, { "epoch": 0.4071885770556376, "grad_norm": 0.11505656689405441, "learning_rate": 0.0002965882497844562, "loss": 7.8013, "step": 3308 }, { "epoch": 0.40731166912850814, "grad_norm": 0.11666544526815414, "learning_rate": 0.0002965266658455475, "loss": 7.6086, "step": 3309 }, { "epoch": 0.4074347612013786, "grad_norm": 0.11373121291399002, "learning_rate": 0.00029646508190663874, "loss": 7.8013, "step": 3310 }, { "epoch": 0.40755785327424915, "grad_norm": 0.30454161763191223, "learning_rate": 0.00029640349796773, "loss": 8.756, "step": 3311 }, { "epoch": 0.4076809453471196, "grad_norm": 0.17167794704437256, "learning_rate": 0.00029634191402882126, "loss": 8.4545, "step": 3312 }, { "epoch": 0.40780403741999016, "grad_norm": 0.171731635928154, "learning_rate": 0.00029628033008991255, "loss": 7.2091, "step": 3313 }, { "epoch": 0.40792712949286064, "grad_norm": 0.1129487156867981, "learning_rate": 0.0002962187461510038, "loss": 7.8868, 
"step": 3314 }, { "epoch": 0.4080502215657312, "grad_norm": 0.12106184661388397, "learning_rate": 0.0002961571622120951, "loss": 7.9886, "step": 3315 }, { "epoch": 0.40817331363860165, "grad_norm": 0.0739557147026062, "learning_rate": 0.00029609557827318635, "loss": 7.6639, "step": 3316 }, { "epoch": 0.4082964057114722, "grad_norm": 0.08469463884830475, "learning_rate": 0.00029603399433427764, "loss": 7.652, "step": 3317 }, { "epoch": 0.40841949778434267, "grad_norm": 0.16455836594104767, "learning_rate": 0.00029597241039536887, "loss": 8.3213, "step": 3318 }, { "epoch": 0.4085425898572132, "grad_norm": 0.11540807038545609, "learning_rate": 0.00029591082645646016, "loss": 7.4967, "step": 3319 }, { "epoch": 0.4086656819300837, "grad_norm": 0.1270056664943695, "learning_rate": 0.0002958492425175514, "loss": 7.8757, "step": 3320 }, { "epoch": 0.4087887740029542, "grad_norm": 0.09787534177303314, "learning_rate": 0.00029578765857864273, "loss": 7.6205, "step": 3321 }, { "epoch": 0.4089118660758247, "grad_norm": 0.07874494045972824, "learning_rate": 0.00029572607463973396, "loss": 7.4317, "step": 3322 }, { "epoch": 0.4090349581486952, "grad_norm": 0.07475375384092331, "learning_rate": 0.00029566449070082525, "loss": 7.5555, "step": 3323 }, { "epoch": 0.4091580502215657, "grad_norm": 0.11562183499336243, "learning_rate": 0.0002956029067619165, "loss": 7.3966, "step": 3324 }, { "epoch": 0.40928114229443624, "grad_norm": 0.07492396980524063, "learning_rate": 0.00029554132282300777, "loss": 7.534, "step": 3325 }, { "epoch": 0.4094042343673067, "grad_norm": 0.09939507395029068, "learning_rate": 0.000295479738884099, "loss": 7.5051, "step": 3326 }, { "epoch": 0.40952732644017725, "grad_norm": 0.17656998336315155, "learning_rate": 0.0002954181549451903, "loss": 7.2104, "step": 3327 }, { "epoch": 0.4096504185130478, "grad_norm": 0.5054476261138916, "learning_rate": 0.0002953565710062816, "loss": 10.027, "step": 3328 }, { "epoch": 0.40977351058591827, "grad_norm": 
0.18715384602546692, "learning_rate": 0.00029529498706737286, "loss": 7.7462, "step": 3329 }, { "epoch": 0.4098966026587888, "grad_norm": 0.10599344223737717, "learning_rate": 0.0002952334031284641, "loss": 7.4228, "step": 3330 }, { "epoch": 0.4100196947316593, "grad_norm": 0.2888962924480438, "learning_rate": 0.0002951718191895554, "loss": 8.4868, "step": 3331 }, { "epoch": 0.4101427868045298, "grad_norm": 0.0950830802321434, "learning_rate": 0.0002951102352506466, "loss": 7.8389, "step": 3332 }, { "epoch": 0.4102658788774003, "grad_norm": 0.38021132349967957, "learning_rate": 0.0002950486513117379, "loss": 8.3133, "step": 3333 }, { "epoch": 0.4103889709502708, "grad_norm": 0.13114912807941437, "learning_rate": 0.00029498706737282913, "loss": 7.5028, "step": 3334 }, { "epoch": 0.4105120630231413, "grad_norm": 0.10649505257606506, "learning_rate": 0.00029492548343392047, "loss": 7.4822, "step": 3335 }, { "epoch": 0.41063515509601184, "grad_norm": 0.1222236305475235, "learning_rate": 0.0002948638994950117, "loss": 7.2516, "step": 3336 }, { "epoch": 0.4107582471688823, "grad_norm": 0.120355024933815, "learning_rate": 0.000294802315556103, "loss": 7.6385, "step": 3337 }, { "epoch": 0.41088133924175285, "grad_norm": 0.1803300976753235, "learning_rate": 0.0002947407316171942, "loss": 7.7171, "step": 3338 }, { "epoch": 0.41100443131462333, "grad_norm": 0.11300957202911377, "learning_rate": 0.0002946791476782855, "loss": 7.2201, "step": 3339 }, { "epoch": 0.41112752338749387, "grad_norm": 0.16793255507946014, "learning_rate": 0.00029461756373937674, "loss": 7.4586, "step": 3340 }, { "epoch": 0.41125061546036434, "grad_norm": 0.5749006271362305, "learning_rate": 0.00029455597980046803, "loss": 9.996, "step": 3341 }, { "epoch": 0.4113737075332349, "grad_norm": 0.12483730167150497, "learning_rate": 0.0002944943958615593, "loss": 7.8992, "step": 3342 }, { "epoch": 0.41149679960610536, "grad_norm": 0.47420868277549744, "learning_rate": 0.0002944328119226506, "loss": 9.923, 
"step": 3343 }, { "epoch": 0.4116198916789759, "grad_norm": 0.12494415789842606, "learning_rate": 0.00029437122798374183, "loss": 7.8542, "step": 3344 }, { "epoch": 0.41174298375184637, "grad_norm": 0.3356233835220337, "learning_rate": 0.0002943096440448331, "loss": 7.5129, "step": 3345 }, { "epoch": 0.4118660758247169, "grad_norm": 0.20607303082942963, "learning_rate": 0.00029424806010592435, "loss": 7.8816, "step": 3346 }, { "epoch": 0.4119891678975874, "grad_norm": 0.28391003608703613, "learning_rate": 0.00029418647616701564, "loss": 7.4841, "step": 3347 }, { "epoch": 0.4121122599704579, "grad_norm": 0.25514093041419983, "learning_rate": 0.0002941248922281069, "loss": 7.5634, "step": 3348 }, { "epoch": 0.4122353520433284, "grad_norm": 0.11647731065750122, "learning_rate": 0.0002940633082891982, "loss": 8.2855, "step": 3349 }, { "epoch": 0.41235844411619893, "grad_norm": 0.12662173807621002, "learning_rate": 0.00029400172435028945, "loss": 7.2818, "step": 3350 }, { "epoch": 0.4124815361890694, "grad_norm": 0.1869392693042755, "learning_rate": 0.00029394014041138073, "loss": 7.4897, "step": 3351 }, { "epoch": 0.41260462826193994, "grad_norm": 0.238141268491745, "learning_rate": 0.00029387855647247196, "loss": 7.4318, "step": 3352 }, { "epoch": 0.4127277203348104, "grad_norm": 0.2294534593820572, "learning_rate": 0.00029381697253356325, "loss": 7.0316, "step": 3353 }, { "epoch": 0.41285081240768096, "grad_norm": 0.3176936209201813, "learning_rate": 0.0002937553885946545, "loss": 7.5496, "step": 3354 }, { "epoch": 0.41297390448055143, "grad_norm": 0.20953258872032166, "learning_rate": 0.0002936938046557458, "loss": 7.6438, "step": 3355 }, { "epoch": 0.41309699655342197, "grad_norm": 0.1684843897819519, "learning_rate": 0.00029363222071683706, "loss": 7.6525, "step": 3356 }, { "epoch": 0.41322008862629245, "grad_norm": 0.15540418028831482, "learning_rate": 0.00029357063677792834, "loss": 7.3096, "step": 3357 }, { "epoch": 0.413343180699163, "grad_norm": 
0.15494418144226074, "learning_rate": 0.0002935090528390196, "loss": 7.6985, "step": 3358 }, { "epoch": 0.41346627277203346, "grad_norm": 0.11242635548114777, "learning_rate": 0.00029344746890011086, "loss": 8.0405, "step": 3359 }, { "epoch": 0.413589364844904, "grad_norm": 0.272353857755661, "learning_rate": 0.0002933858849612021, "loss": 7.5893, "step": 3360 }, { "epoch": 0.4137124569177745, "grad_norm": 0.1666308045387268, "learning_rate": 0.0002933243010222934, "loss": 7.6309, "step": 3361 }, { "epoch": 0.413835548990645, "grad_norm": 0.1965770274400711, "learning_rate": 0.00029326271708338467, "loss": 7.2415, "step": 3362 }, { "epoch": 0.4139586410635155, "grad_norm": 0.09589128196239471, "learning_rate": 0.00029320113314447595, "loss": 7.788, "step": 3363 }, { "epoch": 0.414081733136386, "grad_norm": 0.08526219427585602, "learning_rate": 0.0002931395492055672, "loss": 7.1973, "step": 3364 }, { "epoch": 0.4142048252092565, "grad_norm": 0.20420284569263458, "learning_rate": 0.0002930779652666585, "loss": 7.6512, "step": 3365 }, { "epoch": 0.41432791728212703, "grad_norm": 0.15682759881019592, "learning_rate": 0.0002930163813277497, "loss": 7.1125, "step": 3366 }, { "epoch": 0.4144510093549975, "grad_norm": 0.07492034137248993, "learning_rate": 0.000292954797388841, "loss": 7.2368, "step": 3367 }, { "epoch": 0.41457410142786805, "grad_norm": 0.16618981957435608, "learning_rate": 0.0002928932134499322, "loss": 7.6175, "step": 3368 }, { "epoch": 0.4146971935007385, "grad_norm": 0.3925023674964905, "learning_rate": 0.00029283162951102357, "loss": 8.9526, "step": 3369 }, { "epoch": 0.41482028557360906, "grad_norm": 0.06883891671895981, "learning_rate": 0.0002927700455721148, "loss": 7.5763, "step": 3370 }, { "epoch": 0.4149433776464796, "grad_norm": 0.15835465490818024, "learning_rate": 0.0002927084616332061, "loss": 8.3535, "step": 3371 }, { "epoch": 0.4150664697193501, "grad_norm": 0.17277269065380096, "learning_rate": 0.0002926468776942973, "loss": 7.8248, 
"step": 3372 }, { "epoch": 0.4151895617922206, "grad_norm": 0.29877209663391113, "learning_rate": 0.0002925852937553886, "loss": 7.4147, "step": 3373 }, { "epoch": 0.4153126538650911, "grad_norm": 0.171886146068573, "learning_rate": 0.00029252370981647984, "loss": 8.8567, "step": 3374 }, { "epoch": 0.4154357459379616, "grad_norm": 0.22745506465435028, "learning_rate": 0.0002924621258775712, "loss": 7.3913, "step": 3375 }, { "epoch": 0.4155588380108321, "grad_norm": 0.10136034339666367, "learning_rate": 0.0002924005419386624, "loss": 8.1182, "step": 3376 }, { "epoch": 0.41568193008370263, "grad_norm": 0.2699894607067108, "learning_rate": 0.0002923389579997537, "loss": 7.09, "step": 3377 }, { "epoch": 0.4158050221565731, "grad_norm": 0.20473940670490265, "learning_rate": 0.00029227737406084493, "loss": 8.6135, "step": 3378 }, { "epoch": 0.41592811422944365, "grad_norm": 0.09292197972536087, "learning_rate": 0.0002922157901219362, "loss": 7.7414, "step": 3379 }, { "epoch": 0.4160512063023141, "grad_norm": 0.17272062599658966, "learning_rate": 0.00029215420618302745, "loss": 7.7531, "step": 3380 }, { "epoch": 0.41617429837518466, "grad_norm": 0.09452985972166061, "learning_rate": 0.00029209262224411873, "loss": 7.6283, "step": 3381 }, { "epoch": 0.41629739044805514, "grad_norm": 0.13890966773033142, "learning_rate": 0.00029203103830521, "loss": 8.0462, "step": 3382 }, { "epoch": 0.41642048252092567, "grad_norm": 0.13176316022872925, "learning_rate": 0.0002919694543663013, "loss": 7.6618, "step": 3383 }, { "epoch": 0.41654357459379615, "grad_norm": 0.09548422694206238, "learning_rate": 0.00029190787042739254, "loss": 7.9805, "step": 3384 }, { "epoch": 0.4166666666666667, "grad_norm": 0.08766297996044159, "learning_rate": 0.0002918462864884838, "loss": 7.5453, "step": 3385 }, { "epoch": 0.41678975873953716, "grad_norm": 0.08704841881990433, "learning_rate": 0.00029178470254957506, "loss": 7.8445, "step": 3386 }, { "epoch": 0.4169128508124077, "grad_norm": 
0.08493472635746002, "learning_rate": 0.00029172311861066634, "loss": 7.7044, "step": 3387 }, { "epoch": 0.4170359428852782, "grad_norm": 0.08127643167972565, "learning_rate": 0.0002916615346717576, "loss": 7.8863, "step": 3388 }, { "epoch": 0.4171590349581487, "grad_norm": 0.08267448097467422, "learning_rate": 0.0002915999507328489, "loss": 7.5945, "step": 3389 }, { "epoch": 0.4172821270310192, "grad_norm": 0.24251040816307068, "learning_rate": 0.00029153836679394015, "loss": 8.6125, "step": 3390 }, { "epoch": 0.4174052191038897, "grad_norm": 0.12592269480228424, "learning_rate": 0.00029147678285503144, "loss": 8.2333, "step": 3391 }, { "epoch": 0.4175283111767602, "grad_norm": 0.14996644854545593, "learning_rate": 0.00029141519891612267, "loss": 7.6879, "step": 3392 }, { "epoch": 0.41765140324963074, "grad_norm": 0.25394123792648315, "learning_rate": 0.00029135361497721396, "loss": 8.8811, "step": 3393 }, { "epoch": 0.4177744953225012, "grad_norm": 0.17478875815868378, "learning_rate": 0.0002912920310383052, "loss": 7.5953, "step": 3394 }, { "epoch": 0.41789758739537175, "grad_norm": 0.1991155743598938, "learning_rate": 0.0002912304470993965, "loss": 7.7054, "step": 3395 }, { "epoch": 0.41802067946824223, "grad_norm": 0.12092678993940353, "learning_rate": 0.00029116886316048776, "loss": 8.3902, "step": 3396 }, { "epoch": 0.41814377154111276, "grad_norm": 0.21545805037021637, "learning_rate": 0.00029110727922157905, "loss": 7.4186, "step": 3397 }, { "epoch": 0.41826686361398324, "grad_norm": 0.2281295359134674, "learning_rate": 0.0002910456952826703, "loss": 7.5037, "step": 3398 }, { "epoch": 0.4183899556868538, "grad_norm": 0.08580398559570312, "learning_rate": 0.00029098411134376157, "loss": 7.9332, "step": 3399 }, { "epoch": 0.41851304775972425, "grad_norm": 0.09498590975999832, "learning_rate": 0.0002909225274048528, "loss": 7.614, "step": 3400 }, { "epoch": 0.4186361398325948, "grad_norm": 0.12235045433044434, "learning_rate": 0.0002908609434659441, "loss": 
7.6236, "step": 3401 }, { "epoch": 0.41875923190546527, "grad_norm": 0.09133118391036987, "learning_rate": 0.0002907993595270353, "loss": 7.3868, "step": 3402 }, { "epoch": 0.4188823239783358, "grad_norm": 0.24188432097434998, "learning_rate": 0.00029073777558812666, "loss": 8.3091, "step": 3403 }, { "epoch": 0.4190054160512063, "grad_norm": 0.09583727270364761, "learning_rate": 0.0002906761916492179, "loss": 7.8781, "step": 3404 }, { "epoch": 0.4191285081240768, "grad_norm": 0.17184703052043915, "learning_rate": 0.0002906146077103092, "loss": 8.474, "step": 3405 }, { "epoch": 0.4192516001969473, "grad_norm": 0.11528509110212326, "learning_rate": 0.0002905530237714004, "loss": 7.704, "step": 3406 }, { "epoch": 0.41937469226981783, "grad_norm": 0.23169520497322083, "learning_rate": 0.0002904914398324917, "loss": 7.423, "step": 3407 }, { "epoch": 0.4194977843426883, "grad_norm": 0.12669683992862701, "learning_rate": 0.00029042985589358293, "loss": 7.9222, "step": 3408 }, { "epoch": 0.41962087641555884, "grad_norm": 0.12147393077611923, "learning_rate": 0.00029036827195467427, "loss": 7.5261, "step": 3409 }, { "epoch": 0.4197439684884293, "grad_norm": 0.10477745532989502, "learning_rate": 0.0002903066880157655, "loss": 7.9682, "step": 3410 }, { "epoch": 0.41986706056129985, "grad_norm": 0.12057329714298248, "learning_rate": 0.0002902451040768568, "loss": 7.8078, "step": 3411 }, { "epoch": 0.41999015263417033, "grad_norm": null, "learning_rate": 0.000290183520137948, "loss": 7.8604, "step": 3412 }, { "epoch": 0.42011324470704087, "grad_norm": 0.19231931865215302, "learning_rate": 0.0002901219361990393, "loss": 8.6063, "step": 3413 }, { "epoch": 0.4202363367799114, "grad_norm": 0.14615002274513245, "learning_rate": 0.00029006035226013054, "loss": 9.0745, "step": 3414 }, { "epoch": 0.4203594288527819, "grad_norm": 0.31410765647888184, "learning_rate": 0.00028999876832122183, "loss": 8.1622, "step": 3415 }, { "epoch": 0.4204825209256524, "grad_norm": 0.19582074880599976, 
"learning_rate": 0.0002899371843823131, "loss": 7.6565, "step": 3416 }, { "epoch": 0.4206056129985229, "grad_norm": 0.12443999946117401, "learning_rate": 0.0002898756004434044, "loss": 7.4484, "step": 3417 }, { "epoch": 0.4207287050713934, "grad_norm": 0.11716100573539734, "learning_rate": 0.00028981401650449563, "loss": 7.705, "step": 3418 }, { "epoch": 0.4208517971442639, "grad_norm": 0.13778848946094513, "learning_rate": 0.0002897524325655869, "loss": 7.6876, "step": 3419 }, { "epoch": 0.42097488921713444, "grad_norm": 0.2982226312160492, "learning_rate": 0.00028969084862667815, "loss": 8.4945, "step": 3420 }, { "epoch": 0.4210979812900049, "grad_norm": 0.12515027821063995, "learning_rate": 0.00028962926468776944, "loss": 7.5056, "step": 3421 }, { "epoch": 0.42122107336287545, "grad_norm": 0.12707705795764923, "learning_rate": 0.00028956768074886067, "loss": 8.2644, "step": 3422 }, { "epoch": 0.42134416543574593, "grad_norm": 0.2324317842721939, "learning_rate": 0.000289506096809952, "loss": 7.974, "step": 3423 }, { "epoch": 0.42146725750861647, "grad_norm": 0.21588510274887085, "learning_rate": 0.00028944451287104324, "loss": 7.6171, "step": 3424 }, { "epoch": 0.42159034958148695, "grad_norm": 0.23459994792938232, "learning_rate": 0.00028938292893213453, "loss": 7.4855, "step": 3425 }, { "epoch": 0.4217134416543575, "grad_norm": 0.2221047580242157, "learning_rate": 0.00028932134499322576, "loss": 7.8593, "step": 3426 }, { "epoch": 0.42183653372722796, "grad_norm": 0.21128401160240173, "learning_rate": 0.00028925976105431705, "loss": 7.6783, "step": 3427 }, { "epoch": 0.4219596258000985, "grad_norm": 0.26102718710899353, "learning_rate": 0.0002891981771154083, "loss": 7.0456, "step": 3428 }, { "epoch": 0.42208271787296897, "grad_norm": 0.12080628424882889, "learning_rate": 0.0002891365931764996, "loss": 7.7359, "step": 3429 }, { "epoch": 0.4222058099458395, "grad_norm": 0.20499959588050842, "learning_rate": 0.00028907500923759086, "loss": 7.6629, "step": 3430 }, 
{ "epoch": 0.42232890201871, "grad_norm": 0.1260286569595337, "learning_rate": 0.00028901342529868214, "loss": 7.4186, "step": 3431 }, { "epoch": 0.4224519940915805, "grad_norm": 0.10735749453306198, "learning_rate": 0.0002889518413597734, "loss": 7.7487, "step": 3432 }, { "epoch": 0.422575086164451, "grad_norm": 0.1560523509979248, "learning_rate": 0.00028889025742086466, "loss": 8.4148, "step": 3433 }, { "epoch": 0.42269817823732153, "grad_norm": 0.13089226186275482, "learning_rate": 0.0002888286734819559, "loss": 7.3279, "step": 3434 }, { "epoch": 0.422821270310192, "grad_norm": 0.12279166281223297, "learning_rate": 0.0002887670895430472, "loss": 7.5609, "step": 3435 }, { "epoch": 0.42294436238306254, "grad_norm": 0.14328733086585999, "learning_rate": 0.00028870550560413847, "loss": 7.4915, "step": 3436 }, { "epoch": 0.423067454455933, "grad_norm": 0.28398042917251587, "learning_rate": 0.00028864392166522975, "loss": 8.3151, "step": 3437 }, { "epoch": 0.42319054652880356, "grad_norm": 0.10373274236917496, "learning_rate": 0.000288582337726321, "loss": 7.3762, "step": 3438 }, { "epoch": 0.42331363860167404, "grad_norm": 0.1325169950723648, "learning_rate": 0.00028852075378741227, "loss": 7.8992, "step": 3439 }, { "epoch": 0.42343673067454457, "grad_norm": 0.16254939138889313, "learning_rate": 0.0002884591698485035, "loss": 7.5543, "step": 3440 }, { "epoch": 0.42355982274741505, "grad_norm": 0.22344617545604706, "learning_rate": 0.0002883975859095948, "loss": 7.552, "step": 3441 }, { "epoch": 0.4236829148202856, "grad_norm": 0.16534267365932465, "learning_rate": 0.000288336001970686, "loss": 7.6658, "step": 3442 }, { "epoch": 0.42380600689315606, "grad_norm": 0.17177897691726685, "learning_rate": 0.00028827441803177736, "loss": 8.2546, "step": 3443 }, { "epoch": 0.4239290989660266, "grad_norm": 0.3157387971878052, "learning_rate": 0.0002882128340928686, "loss": 8.0458, "step": 3444 }, { "epoch": 0.4240521910388971, "grad_norm": 0.20474694669246674, 
"learning_rate": 0.0002881512501539599, "loss": 7.7125, "step": 3445 }, { "epoch": 0.4241752831117676, "grad_norm": 0.1943141669034958, "learning_rate": 0.0002880896662150511, "loss": 8.2059, "step": 3446 }, { "epoch": 0.4242983751846381, "grad_norm": 0.16700077056884766, "learning_rate": 0.0002880280822761424, "loss": 8.6196, "step": 3447 }, { "epoch": 0.4244214672575086, "grad_norm": 0.288864403963089, "learning_rate": 0.00028796649833723363, "loss": 7.6243, "step": 3448 }, { "epoch": 0.4245445593303791, "grad_norm": 0.4287334680557251, "learning_rate": 0.0002879049143983249, "loss": 7.5109, "step": 3449 }, { "epoch": 0.42466765140324964, "grad_norm": 0.20912764966487885, "learning_rate": 0.0002878433304594162, "loss": 8.3138, "step": 3450 }, { "epoch": 0.4247907434761201, "grad_norm": 0.28442880511283875, "learning_rate": 0.0002877817465205075, "loss": 7.5011, "step": 3451 }, { "epoch": 0.42491383554899065, "grad_norm": 0.15150785446166992, "learning_rate": 0.0002877201625815987, "loss": 7.3248, "step": 3452 }, { "epoch": 0.4250369276218611, "grad_norm": 0.2789125144481659, "learning_rate": 0.00028765857864269, "loss": 7.8011, "step": 3453 }, { "epoch": 0.42516001969473166, "grad_norm": 0.30429551005363464, "learning_rate": 0.00028759699470378125, "loss": 7.3904, "step": 3454 }, { "epoch": 0.42528311176760214, "grad_norm": 0.336586058139801, "learning_rate": 0.00028753541076487253, "loss": 7.4803, "step": 3455 }, { "epoch": 0.4254062038404727, "grad_norm": 0.4172177314758301, "learning_rate": 0.00028747382682596376, "loss": 8.598, "step": 3456 }, { "epoch": 0.42552929591334315, "grad_norm": 0.1535995602607727, "learning_rate": 0.0002874122428870551, "loss": 8.0626, "step": 3457 }, { "epoch": 0.4256523879862137, "grad_norm": 0.11193791776895523, "learning_rate": 0.00028735065894814634, "loss": 7.8377, "step": 3458 }, { "epoch": 0.4257754800590842, "grad_norm": 0.202696293592453, "learning_rate": 0.0002872890750092376, "loss": 7.6781, "step": 3459 }, { "epoch": 
0.4258985721319547, "grad_norm": 0.16578474640846252, "learning_rate": 0.00028722749107032886, "loss": 8.5284, "step": 3460 }, { "epoch": 0.42602166420482523, "grad_norm": 0.23255357146263123, "learning_rate": 0.00028716590713142014, "loss": 7.7065, "step": 3461 }, { "epoch": 0.4261447562776957, "grad_norm": 0.1492273509502411, "learning_rate": 0.0002871043231925114, "loss": 8.4967, "step": 3462 }, { "epoch": 0.42626784835056625, "grad_norm": 0.17439663410186768, "learning_rate": 0.0002870427392536027, "loss": 7.3848, "step": 3463 }, { "epoch": 0.4263909404234367, "grad_norm": 0.11157085001468658, "learning_rate": 0.00028698115531469395, "loss": 7.4679, "step": 3464 }, { "epoch": 0.42651403249630726, "grad_norm": 0.2867700457572937, "learning_rate": 0.00028691957137578524, "loss": 8.0058, "step": 3465 }, { "epoch": 0.42663712456917774, "grad_norm": 0.20932331681251526, "learning_rate": 0.00028685798743687647, "loss": 7.5868, "step": 3466 }, { "epoch": 0.4267602166420483, "grad_norm": 0.2734028100967407, "learning_rate": 0.00028679640349796775, "loss": 8.2149, "step": 3467 }, { "epoch": 0.42688330871491875, "grad_norm": 0.1506703644990921, "learning_rate": 0.000286734819559059, "loss": 7.2317, "step": 3468 }, { "epoch": 0.4270064007877893, "grad_norm": 0.08868885785341263, "learning_rate": 0.0002866732356201503, "loss": 7.8913, "step": 3469 }, { "epoch": 0.42712949286065977, "grad_norm": 0.1740623265504837, "learning_rate": 0.00028661165168124156, "loss": 8.3349, "step": 3470 }, { "epoch": 0.4272525849335303, "grad_norm": 0.2028704583644867, "learning_rate": 0.00028655006774233285, "loss": 7.5024, "step": 3471 }, { "epoch": 0.4273756770064008, "grad_norm": 0.20379899442195892, "learning_rate": 0.0002864884838034241, "loss": 7.4068, "step": 3472 }, { "epoch": 0.4274987690792713, "grad_norm": 0.17437702417373657, "learning_rate": 0.00028642689986451537, "loss": 7.3038, "step": 3473 }, { "epoch": 0.4276218611521418, "grad_norm": 0.16947327554225922, "learning_rate": 
0.0002863653159256066, "loss": 7.8544, "step": 3474 }, { "epoch": 0.4277449532250123, "grad_norm": 0.16526827216148376, "learning_rate": 0.0002863037319866979, "loss": 7.4374, "step": 3475 }, { "epoch": 0.4278680452978828, "grad_norm": 0.3491465747356415, "learning_rate": 0.0002862421480477891, "loss": 8.5575, "step": 3476 }, { "epoch": 0.42799113737075334, "grad_norm": 0.37020137906074524, "learning_rate": 0.00028618056410888046, "loss": 9.3762, "step": 3477 }, { "epoch": 0.4281142294436238, "grad_norm": 0.16371536254882812, "learning_rate": 0.0002861189801699717, "loss": 7.3168, "step": 3478 }, { "epoch": 0.42823732151649435, "grad_norm": 0.09172496944665909, "learning_rate": 0.000286057396231063, "loss": 7.7952, "step": 3479 }, { "epoch": 0.42836041358936483, "grad_norm": 0.13482171297073364, "learning_rate": 0.0002859958122921542, "loss": 8.5492, "step": 3480 }, { "epoch": 0.42848350566223536, "grad_norm": 0.1602865308523178, "learning_rate": 0.0002859342283532455, "loss": 8.7776, "step": 3481 }, { "epoch": 0.42860659773510584, "grad_norm": 0.18726509809494019, "learning_rate": 0.00028587264441433673, "loss": 7.7539, "step": 3482 }, { "epoch": 0.4287296898079764, "grad_norm": 0.1489633023738861, "learning_rate": 0.000285811060475428, "loss": 8.0898, "step": 3483 }, { "epoch": 0.42885278188084686, "grad_norm": 0.15123148262500763, "learning_rate": 0.0002857494765365193, "loss": 8.0101, "step": 3484 }, { "epoch": 0.4289758739537174, "grad_norm": 0.1588604599237442, "learning_rate": 0.00028568789259761053, "loss": 7.5383, "step": 3485 }, { "epoch": 0.42909896602658787, "grad_norm": 0.24539461731910706, "learning_rate": 0.0002856263086587018, "loss": 8.5396, "step": 3486 }, { "epoch": 0.4292220580994584, "grad_norm": 0.11840566992759705, "learning_rate": 0.00028556472471979305, "loss": 7.4089, "step": 3487 }, { "epoch": 0.4293451501723289, "grad_norm": 0.12667685747146606, "learning_rate": 0.00028550314078088434, "loss": 7.6943, "step": 3488 }, { "epoch": 
0.4294682422451994, "grad_norm": 0.14765548706054688, "learning_rate": 0.00028544155684197557, "loss": 7.6785, "step": 3489 }, { "epoch": 0.4295913343180699, "grad_norm": 0.08103347569704056, "learning_rate": 0.0002853799729030669, "loss": 7.6047, "step": 3490 }, { "epoch": 0.42971442639094043, "grad_norm": 0.13611918687820435, "learning_rate": 0.00028531838896415815, "loss": 8.2618, "step": 3491 }, { "epoch": 0.4298375184638109, "grad_norm": 0.1574949324131012, "learning_rate": 0.00028525680502524943, "loss": 7.5178, "step": 3492 }, { "epoch": 0.42996061053668144, "grad_norm": 0.15899315476417542, "learning_rate": 0.00028519522108634066, "loss": 7.4793, "step": 3493 }, { "epoch": 0.4300837026095519, "grad_norm": 0.16866478323936462, "learning_rate": 0.00028513363714743195, "loss": 7.2973, "step": 3494 }, { "epoch": 0.43020679468242246, "grad_norm": 0.12445607781410217, "learning_rate": 0.0002850720532085232, "loss": 7.7655, "step": 3495 }, { "epoch": 0.43032988675529293, "grad_norm": 0.11673648655414581, "learning_rate": 0.00028501046926961447, "loss": 7.6282, "step": 3496 }, { "epoch": 0.43045297882816347, "grad_norm": 0.08552122861146927, "learning_rate": 0.00028494888533070576, "loss": 7.3735, "step": 3497 }, { "epoch": 0.43057607090103395, "grad_norm": 0.11667358130216599, "learning_rate": 0.00028488730139179704, "loss": 7.6915, "step": 3498 }, { "epoch": 0.4306991629739045, "grad_norm": 0.13569891452789307, "learning_rate": 0.0002848257174528883, "loss": 8.1204, "step": 3499 }, { "epoch": 0.43082225504677496, "grad_norm": 0.10778703540563583, "learning_rate": 0.00028476413351397956, "loss": 7.5865, "step": 3500 }, { "epoch": 0.4309453471196455, "grad_norm": 0.12505167722702026, "learning_rate": 0.0002847025495750708, "loss": 7.8313, "step": 3501 }, { "epoch": 0.43106843919251603, "grad_norm": 0.12639930844306946, "learning_rate": 0.0002846409656361621, "loss": 7.9101, "step": 3502 }, { "epoch": 0.4311915312653865, "grad_norm": 0.1507963091135025, 
"learning_rate": 0.0002845793816972533, "loss": 7.4347, "step": 3503 }, { "epoch": 0.43131462333825704, "grad_norm": 0.10340803861618042, "learning_rate": 0.00028451779775834465, "loss": 7.8208, "step": 3504 }, { "epoch": 0.4314377154111275, "grad_norm": 0.10062813013792038, "learning_rate": 0.0002844562138194359, "loss": 7.3546, "step": 3505 }, { "epoch": 0.43156080748399805, "grad_norm": 0.24303647875785828, "learning_rate": 0.0002843946298805272, "loss": 8.6481, "step": 3506 }, { "epoch": 0.43168389955686853, "grad_norm": 0.20719288289546967, "learning_rate": 0.0002843330459416184, "loss": 7.8672, "step": 3507 }, { "epoch": 0.43180699162973907, "grad_norm": 0.08823497593402863, "learning_rate": 0.0002842714620027097, "loss": 7.5746, "step": 3508 }, { "epoch": 0.43193008370260955, "grad_norm": 0.22465036809444427, "learning_rate": 0.0002842098780638009, "loss": 7.9593, "step": 3509 }, { "epoch": 0.4320531757754801, "grad_norm": 0.13938091695308685, "learning_rate": 0.0002841482941248922, "loss": 7.9309, "step": 3510 }, { "epoch": 0.43217626784835056, "grad_norm": 0.10840766876935959, "learning_rate": 0.0002840867101859835, "loss": 7.5878, "step": 3511 }, { "epoch": 0.4322993599212211, "grad_norm": 0.11000937968492508, "learning_rate": 0.0002840251262470748, "loss": 7.5208, "step": 3512 }, { "epoch": 0.4324224519940916, "grad_norm": 0.09117737412452698, "learning_rate": 0.000283963542308166, "loss": 8.2238, "step": 3513 }, { "epoch": 0.4325455440669621, "grad_norm": 0.11725129187107086, "learning_rate": 0.0002839019583692573, "loss": 7.5336, "step": 3514 }, { "epoch": 0.4326686361398326, "grad_norm": 0.12743690609931946, "learning_rate": 0.00028384037443034854, "loss": 7.4291, "step": 3515 }, { "epoch": 0.4327917282127031, "grad_norm": 0.12053720653057098, "learning_rate": 0.0002837787904914398, "loss": 7.8709, "step": 3516 }, { "epoch": 0.4329148202855736, "grad_norm": 0.14129644632339478, "learning_rate": 0.0002837172065525311, "loss": 7.3746, "step": 3517 }, { 
"epoch": 0.43303791235844413, "grad_norm": 0.1729673147201538, "learning_rate": 0.0002836556226136224, "loss": 7.5375, "step": 3518 }, { "epoch": 0.4331610044313146, "grad_norm": 0.133347749710083, "learning_rate": 0.00028359403867471363, "loss": 7.8523, "step": 3519 }, { "epoch": 0.43328409650418515, "grad_norm": 0.09876424819231033, "learning_rate": 0.0002835324547358049, "loss": 7.6391, "step": 3520 }, { "epoch": 0.4334071885770556, "grad_norm": 0.24787913262844086, "learning_rate": 0.00028347087079689615, "loss": 9.0133, "step": 3521 }, { "epoch": 0.43353028064992616, "grad_norm": 0.15850622951984406, "learning_rate": 0.00028340928685798743, "loss": 7.6987, "step": 3522 }, { "epoch": 0.43365337272279664, "grad_norm": 0.26182442903518677, "learning_rate": 0.00028334770291907867, "loss": 7.2756, "step": 3523 }, { "epoch": 0.43377646479566717, "grad_norm": 0.2615898847579956, "learning_rate": 0.00028328611898017, "loss": 7.38, "step": 3524 }, { "epoch": 0.43389955686853765, "grad_norm": 0.29183557629585266, "learning_rate": 0.00028322453504126124, "loss": 7.233, "step": 3525 }, { "epoch": 0.4340226489414082, "grad_norm": 0.2874468266963959, "learning_rate": 0.0002831629511023525, "loss": 9.2262, "step": 3526 }, { "epoch": 0.43414574101427866, "grad_norm": 0.23869095742702484, "learning_rate": 0.00028310136716344376, "loss": 8.5639, "step": 3527 }, { "epoch": 0.4342688330871492, "grad_norm": 0.1744750440120697, "learning_rate": 0.00028303978322453504, "loss": 7.5156, "step": 3528 }, { "epoch": 0.4343919251600197, "grad_norm": 0.09766144305467606, "learning_rate": 0.0002829781992856263, "loss": 7.3725, "step": 3529 }, { "epoch": 0.4345150172328902, "grad_norm": 0.13408620655536652, "learning_rate": 0.00028291661534671756, "loss": 7.5557, "step": 3530 }, { "epoch": 0.4346381093057607, "grad_norm": 0.08318883925676346, "learning_rate": 0.00028285503140780885, "loss": 7.3337, "step": 3531 }, { "epoch": 0.4347612013786312, "grad_norm": 0.22337643802165985, 
"learning_rate": 0.00028279344746890014, "loss": 8.3095, "step": 3532 }, { "epoch": 0.4348842934515017, "grad_norm": 0.15604929625988007, "learning_rate": 0.00028273186352999137, "loss": 7.7888, "step": 3533 }, { "epoch": 0.43500738552437224, "grad_norm": 0.19694575667381287, "learning_rate": 0.00028267027959108266, "loss": 7.8768, "step": 3534 }, { "epoch": 0.4351304775972427, "grad_norm": 0.0928654596209526, "learning_rate": 0.0002826086956521739, "loss": 7.4187, "step": 3535 }, { "epoch": 0.43525356967011325, "grad_norm": 0.21606536209583282, "learning_rate": 0.0002825471117132652, "loss": 8.3452, "step": 3536 }, { "epoch": 0.43537666174298373, "grad_norm": 0.14484137296676636, "learning_rate": 0.0002824855277743564, "loss": 7.6695, "step": 3537 }, { "epoch": 0.43549975381585426, "grad_norm": 0.111353300511837, "learning_rate": 0.00028242394383544775, "loss": 7.787, "step": 3538 }, { "epoch": 0.43562284588872474, "grad_norm": 0.11068404465913773, "learning_rate": 0.000282362359896539, "loss": 7.5061, "step": 3539 }, { "epoch": 0.4357459379615953, "grad_norm": 0.08128499984741211, "learning_rate": 0.00028230077595763027, "loss": 8.0261, "step": 3540 }, { "epoch": 0.43586903003446575, "grad_norm": 0.10176361352205276, "learning_rate": 0.0002822391920187215, "loss": 7.4472, "step": 3541 }, { "epoch": 0.4359921221073363, "grad_norm": 0.06826440989971161, "learning_rate": 0.0002821776080798128, "loss": 7.6486, "step": 3542 }, { "epoch": 0.43611521418020677, "grad_norm": 0.15777552127838135, "learning_rate": 0.000282116024140904, "loss": 7.911, "step": 3543 }, { "epoch": 0.4362383062530773, "grad_norm": 0.44840484857559204, "learning_rate": 0.00028205444020199536, "loss": 9.2604, "step": 3544 }, { "epoch": 0.43636139832594784, "grad_norm": 0.20948342978954315, "learning_rate": 0.0002819928562630866, "loss": 7.2613, "step": 3545 }, { "epoch": 0.4364844903988183, "grad_norm": 0.08573415130376816, "learning_rate": 0.0002819312723241779, "loss": 8.0952, "step": 3546 }, { 
"epoch": 0.43660758247168885, "grad_norm": 0.19087527692317963, "learning_rate": 0.0002818696883852691, "loss": 7.5264, "step": 3547 }, { "epoch": 0.4367306745445593, "grad_norm": 0.3027735948562622, "learning_rate": 0.0002818081044463604, "loss": 7.089, "step": 3548 }, { "epoch": 0.43685376661742986, "grad_norm": 0.09566816687583923, "learning_rate": 0.00028174652050745163, "loss": 7.6757, "step": 3549 }, { "epoch": 0.43697685869030034, "grad_norm": 0.0929892286658287, "learning_rate": 0.0002816849365685429, "loss": 7.4979, "step": 3550 }, { "epoch": 0.4370999507631709, "grad_norm": 0.06629027426242828, "learning_rate": 0.0002816233526296342, "loss": 7.4437, "step": 3551 }, { "epoch": 0.43722304283604135, "grad_norm": 0.1166926696896553, "learning_rate": 0.0002815617686907255, "loss": 7.4425, "step": 3552 }, { "epoch": 0.4373461349089119, "grad_norm": 0.15041540563106537, "learning_rate": 0.0002815001847518167, "loss": 7.6197, "step": 3553 }, { "epoch": 0.43746922698178237, "grad_norm": 0.3703586459159851, "learning_rate": 0.000281438600812908, "loss": 8.6194, "step": 3554 }, { "epoch": 0.4375923190546529, "grad_norm": 0.10516241937875748, "learning_rate": 0.00028137701687399924, "loss": 7.5649, "step": 3555 }, { "epoch": 0.4377154111275234, "grad_norm": 0.24037490785121918, "learning_rate": 0.00028131543293509053, "loss": 7.4254, "step": 3556 }, { "epoch": 0.4378385032003939, "grad_norm": 0.12934428453445435, "learning_rate": 0.00028125384899618176, "loss": 7.9479, "step": 3557 }, { "epoch": 0.4379615952732644, "grad_norm": 0.11114798486232758, "learning_rate": 0.0002811922650572731, "loss": 7.8957, "step": 3558 }, { "epoch": 0.4380846873461349, "grad_norm": 0.14822731912136078, "learning_rate": 0.00028113068111836433, "loss": 7.7275, "step": 3559 }, { "epoch": 0.4382077794190054, "grad_norm": 0.21357038617134094, "learning_rate": 0.0002810690971794556, "loss": 7.2609, "step": 3560 }, { "epoch": 0.43833087149187594, "grad_norm": 0.060539547353982925, 
"learning_rate": 0.00028100751324054685, "loss": 7.4343, "step": 3561 }, { "epoch": 0.4384539635647464, "grad_norm": 0.32871583104133606, "learning_rate": 0.00028094592930163814, "loss": 8.7875, "step": 3562 }, { "epoch": 0.43857705563761695, "grad_norm": 0.16215652227401733, "learning_rate": 0.00028088434536272937, "loss": 7.5333, "step": 3563 }, { "epoch": 0.43870014771048743, "grad_norm": 0.14076681435108185, "learning_rate": 0.00028082276142382066, "loss": 7.7159, "step": 3564 }, { "epoch": 0.43882323978335797, "grad_norm": 0.1145651638507843, "learning_rate": 0.00028076117748491194, "loss": 7.8794, "step": 3565 }, { "epoch": 0.43894633185622844, "grad_norm": 0.09199446439743042, "learning_rate": 0.00028069959354600323, "loss": 7.4958, "step": 3566 }, { "epoch": 0.439069423929099, "grad_norm": 0.22056695818901062, "learning_rate": 0.00028063800960709446, "loss": 7.1903, "step": 3567 }, { "epoch": 0.43919251600196946, "grad_norm": 0.1293158382177353, "learning_rate": 0.00028057642566818575, "loss": 7.5863, "step": 3568 }, { "epoch": 0.43931560807484, "grad_norm": 0.12920431792736053, "learning_rate": 0.000280514841729277, "loss": 7.247, "step": 3569 }, { "epoch": 0.43943870014771047, "grad_norm": 0.10498811304569244, "learning_rate": 0.00028045325779036827, "loss": 7.4131, "step": 3570 }, { "epoch": 0.439561792220581, "grad_norm": 0.14904998242855072, "learning_rate": 0.0002803916738514595, "loss": 7.6713, "step": 3571 }, { "epoch": 0.4396848842934515, "grad_norm": 0.27432480454444885, "learning_rate": 0.00028033008991255084, "loss": 8.2196, "step": 3572 }, { "epoch": 0.439807976366322, "grad_norm": 0.16155025362968445, "learning_rate": 0.0002802685059736421, "loss": 7.7837, "step": 3573 }, { "epoch": 0.4399310684391925, "grad_norm": 0.12022847682237625, "learning_rate": 0.00028020692203473336, "loss": 7.6139, "step": 3574 }, { "epoch": 0.44005416051206303, "grad_norm": 0.10595738887786865, "learning_rate": 0.0002801453380958246, "loss": 8.0593, "step": 3575 }, 
{ "epoch": 0.4401772525849335, "grad_norm": 0.11801443248987198, "learning_rate": 0.0002800837541569159, "loss": 7.5257, "step": 3576 }, { "epoch": 0.44030034465780404, "grad_norm": 0.1273554563522339, "learning_rate": 0.0002800221702180071, "loss": 7.7286, "step": 3577 }, { "epoch": 0.4404234367306745, "grad_norm": 0.2040606141090393, "learning_rate": 0.00027996058627909845, "loss": 7.2782, "step": 3578 }, { "epoch": 0.44054652880354506, "grad_norm": 0.08626057952642441, "learning_rate": 0.0002798990023401897, "loss": 7.7495, "step": 3579 }, { "epoch": 0.44066962087641554, "grad_norm": 0.1102994754910469, "learning_rate": 0.00027983741840128097, "loss": 7.6311, "step": 3580 }, { "epoch": 0.44079271294928607, "grad_norm": 0.1952122449874878, "learning_rate": 0.0002797758344623722, "loss": 7.9067, "step": 3581 }, { "epoch": 0.44091580502215655, "grad_norm": 0.08049067109823227, "learning_rate": 0.0002797142505234635, "loss": 7.4686, "step": 3582 }, { "epoch": 0.4410388970950271, "grad_norm": 0.10905082523822784, "learning_rate": 0.0002796526665845547, "loss": 7.5265, "step": 3583 }, { "epoch": 0.44116198916789756, "grad_norm": 0.1599438190460205, "learning_rate": 0.000279591082645646, "loss": 7.5296, "step": 3584 }, { "epoch": 0.4412850812407681, "grad_norm": 0.10543502867221832, "learning_rate": 0.0002795294987067373, "loss": 7.816, "step": 3585 }, { "epoch": 0.4414081733136386, "grad_norm": 0.6797327399253845, "learning_rate": 0.0002794679147678286, "loss": 10.5903, "step": 3586 }, { "epoch": 0.4415312653865091, "grad_norm": 0.11676719784736633, "learning_rate": 0.0002794063308289198, "loss": 7.6638, "step": 3587 }, { "epoch": 0.44165435745937964, "grad_norm": 0.13043545186519623, "learning_rate": 0.0002793447468900111, "loss": 7.6978, "step": 3588 }, { "epoch": 0.4417774495322501, "grad_norm": 0.12069722265005112, "learning_rate": 0.00027928316295110233, "loss": 7.8304, "step": 3589 }, { "epoch": 0.44190054160512066, "grad_norm": 0.12034562230110168, 
"learning_rate": 0.0002792215790121936, "loss": 7.4322, "step": 3590 }, { "epoch": 0.44202363367799113, "grad_norm": 0.10082157701253891, "learning_rate": 0.00027915999507328485, "loss": 7.5864, "step": 3591 }, { "epoch": 0.44214672575086167, "grad_norm": 0.2672595679759979, "learning_rate": 0.0002790984111343762, "loss": 7.9665, "step": 3592 }, { "epoch": 0.44226981782373215, "grad_norm": 0.1192394271492958, "learning_rate": 0.0002790368271954674, "loss": 7.3658, "step": 3593 }, { "epoch": 0.4423929098966027, "grad_norm": 0.183862566947937, "learning_rate": 0.0002789752432565587, "loss": 7.8243, "step": 3594 }, { "epoch": 0.44251600196947316, "grad_norm": 0.2130056619644165, "learning_rate": 0.00027891365931764995, "loss": 8.1873, "step": 3595 }, { "epoch": 0.4426390940423437, "grad_norm": 0.1372346132993698, "learning_rate": 0.00027885207537874123, "loss": 7.7964, "step": 3596 }, { "epoch": 0.4427621861152142, "grad_norm": 0.1848737746477127, "learning_rate": 0.00027879049143983246, "loss": 8.2937, "step": 3597 }, { "epoch": 0.4428852781880847, "grad_norm": 0.14849628508090973, "learning_rate": 0.00027872890750092375, "loss": 8.7571, "step": 3598 }, { "epoch": 0.4430083702609552, "grad_norm": 0.23366640508174896, "learning_rate": 0.00027866732356201504, "loss": 7.6488, "step": 3599 }, { "epoch": 0.4431314623338257, "grad_norm": 0.29661694169044495, "learning_rate": 0.0002786057396231063, "loss": 7.5147, "step": 3600 }, { "epoch": 0.4432545544066962, "grad_norm": 0.29178324341773987, "learning_rate": 0.00027854415568419756, "loss": 7.6708, "step": 3601 }, { "epoch": 0.44337764647956673, "grad_norm": 0.3205713629722595, "learning_rate": 0.00027848257174528884, "loss": 7.4706, "step": 3602 }, { "epoch": 0.4435007385524372, "grad_norm": 0.34320175647735596, "learning_rate": 0.0002784209878063801, "loss": 7.1849, "step": 3603 }, { "epoch": 0.44362383062530775, "grad_norm": 0.1340600699186325, "learning_rate": 0.00027835940386747136, "loss": 7.6674, "step": 3604 }, { 
"epoch": 0.4437469226981782, "grad_norm": 0.10217931121587753, "learning_rate": 0.00027829781992856265, "loss": 7.501, "step": 3605 }, { "epoch": 0.44387001477104876, "grad_norm": null, "learning_rate": 0.00027823623598965394, "loss": 9.7507, "step": 3606 }, { "epoch": 0.44399310684391924, "grad_norm": 0.17736545205116272, "learning_rate": 0.00027817465205074517, "loss": 7.6441, "step": 3607 }, { "epoch": 0.4441161989167898, "grad_norm": 0.3251684010028839, "learning_rate": 0.00027811306811183645, "loss": 7.7381, "step": 3608 }, { "epoch": 0.44423929098966025, "grad_norm": 0.24958418309688568, "learning_rate": 0.0002780514841729277, "loss": 7.6486, "step": 3609 }, { "epoch": 0.4443623830625308, "grad_norm": 0.20408044755458832, "learning_rate": 0.000277989900234019, "loss": 8.3206, "step": 3610 }, { "epoch": 0.44448547513540126, "grad_norm": 0.1463671624660492, "learning_rate": 0.0002779283162951102, "loss": 7.2615, "step": 3611 }, { "epoch": 0.4446085672082718, "grad_norm": 0.19291242957115173, "learning_rate": 0.00027786673235620155, "loss": 7.9043, "step": 3612 }, { "epoch": 0.4447316592811423, "grad_norm": 0.156162291765213, "learning_rate": 0.0002778051484172928, "loss": 8.6488, "step": 3613 }, { "epoch": 0.4448547513540128, "grad_norm": 0.2948368787765503, "learning_rate": 0.00027774356447838407, "loss": 8.1419, "step": 3614 }, { "epoch": 0.4449778434268833, "grad_norm": 0.3812139928340912, "learning_rate": 0.0002776819805394753, "loss": 7.8581, "step": 3615 }, { "epoch": 0.4451009354997538, "grad_norm": 0.21025919914245605, "learning_rate": 0.0002776203966005666, "loss": 8.2847, "step": 3616 }, { "epoch": 0.4452240275726243, "grad_norm": 0.19200462102890015, "learning_rate": 0.0002775588126616578, "loss": 7.4341, "step": 3617 }, { "epoch": 0.44534711964549484, "grad_norm": 0.16701577603816986, "learning_rate": 0.0002774972287227491, "loss": 7.4356, "step": 3618 }, { "epoch": 0.4454702117183653, "grad_norm": 0.2894342839717865, "learning_rate": 
0.0002774356447838404, "loss": 7.5021, "step": 3619 }, { "epoch": 0.44559330379123585, "grad_norm": 0.3030889630317688, "learning_rate": 0.0002773740608449317, "loss": 7.5391, "step": 3620 }, { "epoch": 0.44571639586410633, "grad_norm": 0.11010071635246277, "learning_rate": 0.0002773124769060229, "loss": 7.1878, "step": 3621 }, { "epoch": 0.44583948793697686, "grad_norm": 0.22960291802883148, "learning_rate": 0.0002772508929671142, "loss": 8.608, "step": 3622 }, { "epoch": 0.44596258000984734, "grad_norm": 0.18508486449718475, "learning_rate": 0.00027718930902820543, "loss": 7.9673, "step": 3623 }, { "epoch": 0.4460856720827179, "grad_norm": 0.28591710329055786, "learning_rate": 0.0002771277250892967, "loss": 7.6403, "step": 3624 }, { "epoch": 0.44620876415558836, "grad_norm": 0.24226921796798706, "learning_rate": 0.00027706614115038795, "loss": 7.5508, "step": 3625 }, { "epoch": 0.4463318562284589, "grad_norm": 0.12382744997739792, "learning_rate": 0.0002770045572114793, "loss": 8.2684, "step": 3626 }, { "epoch": 0.44645494830132937, "grad_norm": 0.18818499147891998, "learning_rate": 0.0002769429732725705, "loss": 7.6172, "step": 3627 }, { "epoch": 0.4465780403741999, "grad_norm": 0.28512051701545715, "learning_rate": 0.0002768813893336618, "loss": 7.9557, "step": 3628 }, { "epoch": 0.4467011324470704, "grad_norm": 0.1341352015733719, "learning_rate": 0.00027681980539475304, "loss": 7.5313, "step": 3629 }, { "epoch": 0.4468242245199409, "grad_norm": 0.15290714800357819, "learning_rate": 0.0002767582214558443, "loss": 8.2525, "step": 3630 }, { "epoch": 0.44694731659281145, "grad_norm": 0.12649044394493103, "learning_rate": 0.00027669663751693556, "loss": 7.67, "step": 3631 }, { "epoch": 0.44707040866568193, "grad_norm": 0.20563296973705292, "learning_rate": 0.0002766350535780269, "loss": 7.4765, "step": 3632 }, { "epoch": 0.44719350073855246, "grad_norm": 0.17024841904640198, "learning_rate": 0.00027657346963911813, "loss": 7.6897, "step": 3633 }, { "epoch": 
0.44731659281142294, "grad_norm": 0.1833537220954895, "learning_rate": 0.0002765118857002094, "loss": 8.1535, "step": 3634 }, { "epoch": 0.4474396848842935, "grad_norm": 0.10021504014730453, "learning_rate": 0.00027645030176130065, "loss": 7.7055, "step": 3635 }, { "epoch": 0.44756277695716395, "grad_norm": 0.4179258346557617, "learning_rate": 0.00027638871782239194, "loss": 9.1669, "step": 3636 }, { "epoch": 0.4476858690300345, "grad_norm": 0.09388836473226547, "learning_rate": 0.00027632713388348317, "loss": 7.3875, "step": 3637 }, { "epoch": 0.44780896110290497, "grad_norm": 0.11490818858146667, "learning_rate": 0.00027626554994457446, "loss": 7.5531, "step": 3638 }, { "epoch": 0.4479320531757755, "grad_norm": 0.10661061853170395, "learning_rate": 0.00027620396600566574, "loss": 7.531, "step": 3639 }, { "epoch": 0.448055145248646, "grad_norm": 0.15478120744228363, "learning_rate": 0.00027614238206675703, "loss": 7.348, "step": 3640 }, { "epoch": 0.4481782373215165, "grad_norm": 0.0717693567276001, "learning_rate": 0.00027608079812784826, "loss": 7.9716, "step": 3641 }, { "epoch": 0.448301329394387, "grad_norm": 0.07965942472219467, "learning_rate": 0.00027601921418893955, "loss": 7.7854, "step": 3642 }, { "epoch": 0.44842442146725753, "grad_norm": 0.08300087600946426, "learning_rate": 0.0002759576302500308, "loss": 7.427, "step": 3643 }, { "epoch": 0.448547513540128, "grad_norm": 0.14577022194862366, "learning_rate": 0.00027589604631112207, "loss": 7.4206, "step": 3644 }, { "epoch": 0.44867060561299854, "grad_norm": 0.08322657644748688, "learning_rate": 0.0002758344623722133, "loss": 7.8534, "step": 3645 }, { "epoch": 0.448793697685869, "grad_norm": 0.10486846417188644, "learning_rate": 0.00027577287843330464, "loss": 7.7459, "step": 3646 }, { "epoch": 0.44891678975873955, "grad_norm": 0.09806596487760544, "learning_rate": 0.0002757112944943959, "loss": 7.6791, "step": 3647 }, { "epoch": 0.44903988183161003, "grad_norm": 0.10113009810447693, "learning_rate": 
0.00027564971055548716, "loss": 7.7239, "step": 3648 }, { "epoch": 0.44916297390448057, "grad_norm": 0.16864822804927826, "learning_rate": 0.0002755881266165784, "loss": 7.4402, "step": 3649 }, { "epoch": 0.44928606597735105, "grad_norm": 0.12669074535369873, "learning_rate": 0.0002755265426776697, "loss": 7.7729, "step": 3650 }, { "epoch": 0.4494091580502216, "grad_norm": 0.22209306061267853, "learning_rate": 0.0002754649587387609, "loss": 8.3916, "step": 3651 }, { "epoch": 0.44953225012309206, "grad_norm": 0.09459441900253296, "learning_rate": 0.0002754033747998522, "loss": 7.5075, "step": 3652 }, { "epoch": 0.4496553421959626, "grad_norm": 0.18606895208358765, "learning_rate": 0.0002753417908609435, "loss": 7.5717, "step": 3653 }, { "epoch": 0.44977843426883307, "grad_norm": 0.19865919649600983, "learning_rate": 0.00027528020692203477, "loss": 8.4564, "step": 3654 }, { "epoch": 0.4499015263417036, "grad_norm": 0.0898769274353981, "learning_rate": 0.000275218622983126, "loss": 7.6718, "step": 3655 }, { "epoch": 0.4500246184145741, "grad_norm": 0.11355593055486679, "learning_rate": 0.0002751570390442173, "loss": 8.1487, "step": 3656 }, { "epoch": 0.4501477104874446, "grad_norm": 0.12121743708848953, "learning_rate": 0.0002750954551053085, "loss": 8.3913, "step": 3657 }, { "epoch": 0.4502708025603151, "grad_norm": 0.20242367684841156, "learning_rate": 0.0002750338711663998, "loss": 7.4529, "step": 3658 }, { "epoch": 0.45039389463318563, "grad_norm": 0.12176156044006348, "learning_rate": 0.0002749722872274911, "loss": 7.6723, "step": 3659 }, { "epoch": 0.4505169867060561, "grad_norm": 0.10150545835494995, "learning_rate": 0.0002749107032885824, "loss": 7.7618, "step": 3660 }, { "epoch": 0.45064007877892664, "grad_norm": 0.2906351685523987, "learning_rate": 0.0002748491193496736, "loss": 8.8917, "step": 3661 }, { "epoch": 0.4507631708517971, "grad_norm": 0.11942891776561737, "learning_rate": 0.0002747875354107649, "loss": 7.3106, "step": 3662 }, { "epoch": 
0.45088626292466766, "grad_norm": 0.1304495632648468, "learning_rate": 0.00027472595147185613, "loss": 7.2763, "step": 3663 }, { "epoch": 0.45100935499753814, "grad_norm": 0.26775413751602173, "learning_rate": 0.0002746643675329474, "loss": 8.336, "step": 3664 }, { "epoch": 0.45113244707040867, "grad_norm": 0.09812011569738388, "learning_rate": 0.00027460278359403865, "loss": 7.9363, "step": 3665 }, { "epoch": 0.45125553914327915, "grad_norm": 0.13012196123600006, "learning_rate": 0.00027454119965513, "loss": 7.4053, "step": 3666 }, { "epoch": 0.4513786312161497, "grad_norm": 0.09728814661502838, "learning_rate": 0.0002744796157162212, "loss": 7.6919, "step": 3667 }, { "epoch": 0.45150172328902016, "grad_norm": 0.09662680327892303, "learning_rate": 0.0002744180317773125, "loss": 7.687, "step": 3668 }, { "epoch": 0.4516248153618907, "grad_norm": 0.10111236572265625, "learning_rate": 0.00027435644783840374, "loss": 7.7684, "step": 3669 }, { "epoch": 0.4517479074347612, "grad_norm": 0.13727155327796936, "learning_rate": 0.00027429486389949503, "loss": 7.563, "step": 3670 }, { "epoch": 0.4518709995076317, "grad_norm": 0.1307484209537506, "learning_rate": 0.00027423327996058626, "loss": 7.7925, "step": 3671 }, { "epoch": 0.4519940915805022, "grad_norm": 0.11648359149694443, "learning_rate": 0.00027417169602167755, "loss": 7.7512, "step": 3672 }, { "epoch": 0.4521171836533727, "grad_norm": 0.08550039678812027, "learning_rate": 0.00027411011208276884, "loss": 7.5288, "step": 3673 }, { "epoch": 0.45224027572624326, "grad_norm": 0.12038063257932663, "learning_rate": 0.0002740485281438601, "loss": 7.37, "step": 3674 }, { "epoch": 0.45236336779911374, "grad_norm": 0.07624052464962006, "learning_rate": 0.00027398694420495136, "loss": 7.4246, "step": 3675 }, { "epoch": 0.45248645987198427, "grad_norm": 0.0880194827914238, "learning_rate": 0.00027392536026604264, "loss": 7.4959, "step": 3676 }, { "epoch": 0.45260955194485475, "grad_norm": 0.1212758794426918, "learning_rate": 
0.0002738637763271339, "loss": 7.6607, "step": 3677 }, { "epoch": 0.4527326440177253, "grad_norm": 0.10416501760482788, "learning_rate": 0.00027380219238822516, "loss": 7.503, "step": 3678 }, { "epoch": 0.45285573609059576, "grad_norm": 0.09081722050905228, "learning_rate": 0.0002737406084493164, "loss": 8.1932, "step": 3679 }, { "epoch": 0.4529788281634663, "grad_norm": 0.1885560154914856, "learning_rate": 0.00027367902451040773, "loss": 7.8839, "step": 3680 }, { "epoch": 0.4531019202363368, "grad_norm": 0.08932794630527496, "learning_rate": 0.00027361744057149897, "loss": 7.9783, "step": 3681 }, { "epoch": 0.4532250123092073, "grad_norm": 0.12094229459762573, "learning_rate": 0.00027355585663259025, "loss": 7.9462, "step": 3682 }, { "epoch": 0.4533481043820778, "grad_norm": 0.12270506471395493, "learning_rate": 0.0002734942726936815, "loss": 7.6364, "step": 3683 }, { "epoch": 0.4534711964549483, "grad_norm": 0.11010415107011795, "learning_rate": 0.00027343268875477277, "loss": 7.4047, "step": 3684 }, { "epoch": 0.4535942885278188, "grad_norm": 0.14125986397266388, "learning_rate": 0.000273371104815864, "loss": 7.7085, "step": 3685 }, { "epoch": 0.45371738060068934, "grad_norm": 0.08936584740877151, "learning_rate": 0.00027330952087695535, "loss": 7.1736, "step": 3686 }, { "epoch": 0.4538404726735598, "grad_norm": 0.13748601078987122, "learning_rate": 0.0002732479369380466, "loss": 7.6252, "step": 3687 }, { "epoch": 0.45396356474643035, "grad_norm": 0.10842547565698624, "learning_rate": 0.00027318635299913786, "loss": 7.5984, "step": 3688 }, { "epoch": 0.4540866568193008, "grad_norm": 0.3423093855381012, "learning_rate": 0.0002731247690602291, "loss": 9.4823, "step": 3689 }, { "epoch": 0.45420974889217136, "grad_norm": 0.08839462697505951, "learning_rate": 0.0002730631851213204, "loss": 7.8775, "step": 3690 }, { "epoch": 0.45433284096504184, "grad_norm": 0.16952396929264069, "learning_rate": 0.0002730016011824116, "loss": 7.558, "step": 3691 }, { "epoch": 
0.4544559330379124, "grad_norm": 0.17068994045257568, "learning_rate": 0.0002729400172435029, "loss": 7.909, "step": 3692 }, { "epoch": 0.45457902511078285, "grad_norm": 0.0886756181716919, "learning_rate": 0.0002728784333045942, "loss": 8.0934, "step": 3693 }, { "epoch": 0.4547021171836534, "grad_norm": 0.14544470608234406, "learning_rate": 0.0002728168493656855, "loss": 7.5592, "step": 3694 }, { "epoch": 0.45482520925652387, "grad_norm": 0.10653805732727051, "learning_rate": 0.0002727552654267767, "loss": 7.3701, "step": 3695 }, { "epoch": 0.4549483013293944, "grad_norm": 0.23845286667346954, "learning_rate": 0.000272693681487868, "loss": 8.2631, "step": 3696 }, { "epoch": 0.4550713934022649, "grad_norm": 0.12019536644220352, "learning_rate": 0.00027263209754895923, "loss": 7.0931, "step": 3697 }, { "epoch": 0.4551944854751354, "grad_norm": 0.21020005643367767, "learning_rate": 0.0002725705136100505, "loss": 7.6032, "step": 3698 }, { "epoch": 0.4553175775480059, "grad_norm": 0.26593127846717834, "learning_rate": 0.00027250892967114175, "loss": 8.1502, "step": 3699 }, { "epoch": 0.4554406696208764, "grad_norm": 0.1715758591890335, "learning_rate": 0.0002724473457322331, "loss": 7.6095, "step": 3700 }, { "epoch": 0.4555637616937469, "grad_norm": 0.11931667476892471, "learning_rate": 0.0002723857617933243, "loss": 7.6652, "step": 3701 }, { "epoch": 0.45568685376661744, "grad_norm": 0.3838254511356354, "learning_rate": 0.0002723241778544156, "loss": 9.4864, "step": 3702 }, { "epoch": 0.4558099458394879, "grad_norm": 0.14535067975521088, "learning_rate": 0.00027226259391550684, "loss": 7.8554, "step": 3703 }, { "epoch": 0.45593303791235845, "grad_norm": 0.26713305711746216, "learning_rate": 0.0002722010099765981, "loss": 7.3816, "step": 3704 }, { "epoch": 0.45605612998522893, "grad_norm": 0.18628980219364166, "learning_rate": 0.00027213942603768936, "loss": 9.5227, "step": 3705 }, { "epoch": 0.45617922205809947, "grad_norm": 0.20756950974464417, "learning_rate": 
0.00027207784209878064, "loss": 8.0431, "step": 3706 }, { "epoch": 0.45630231413096994, "grad_norm": 0.3055974245071411, "learning_rate": 0.00027201625815987193, "loss": 7.3765, "step": 3707 }, { "epoch": 0.4564254062038405, "grad_norm": 0.1835140436887741, "learning_rate": 0.0002719546742209632, "loss": 7.8658, "step": 3708 }, { "epoch": 0.45654849827671096, "grad_norm": 0.22156834602355957, "learning_rate": 0.00027189309028205445, "loss": 8.7638, "step": 3709 }, { "epoch": 0.4566715903495815, "grad_norm": 0.2716052830219269, "learning_rate": 0.00027183150634314574, "loss": 8.3762, "step": 3710 }, { "epoch": 0.45679468242245197, "grad_norm": 0.14035353064537048, "learning_rate": 0.00027176992240423697, "loss": 7.4641, "step": 3711 }, { "epoch": 0.4569177744953225, "grad_norm": 0.11478424817323685, "learning_rate": 0.00027170833846532826, "loss": 7.1439, "step": 3712 }, { "epoch": 0.457040866568193, "grad_norm": 0.19596931338310242, "learning_rate": 0.00027164675452641954, "loss": 7.576, "step": 3713 }, { "epoch": 0.4571639586410635, "grad_norm": 0.2691997289657593, "learning_rate": 0.00027158517058751083, "loss": 8.3424, "step": 3714 }, { "epoch": 0.457287050713934, "grad_norm": 0.12003625184297562, "learning_rate": 0.00027152358664860206, "loss": 7.9323, "step": 3715 }, { "epoch": 0.45741014278680453, "grad_norm": 0.10655398666858673, "learning_rate": 0.00027146200270969335, "loss": 7.6838, "step": 3716 }, { "epoch": 0.45753323485967506, "grad_norm": 0.12411697208881378, "learning_rate": 0.0002714004187707846, "loss": 7.9157, "step": 3717 }, { "epoch": 0.45765632693254554, "grad_norm": 0.14311036467552185, "learning_rate": 0.0002713388348318758, "loss": 7.9092, "step": 3718 }, { "epoch": 0.4577794190054161, "grad_norm": 0.18013069033622742, "learning_rate": 0.0002712772508929671, "loss": 7.7807, "step": 3719 }, { "epoch": 0.45790251107828656, "grad_norm": 0.1299051195383072, "learning_rate": 0.0002712156669540584, "loss": 7.6437, "step": 3720 }, { "epoch": 
0.4580256031511571, "grad_norm": 0.09269379824399948, "learning_rate": 0.00027115408301514967, "loss": 7.9297, "step": 3721 }, { "epoch": 0.45814869522402757, "grad_norm": 0.18771257996559143, "learning_rate": 0.0002710924990762409, "loss": 7.5823, "step": 3722 }, { "epoch": 0.4582717872968981, "grad_norm": 0.1601364016532898, "learning_rate": 0.0002710309151373322, "loss": 7.5866, "step": 3723 }, { "epoch": 0.4583948793697686, "grad_norm": 0.16266867518424988, "learning_rate": 0.0002709693311984234, "loss": 8.022, "step": 3724 }, { "epoch": 0.4585179714426391, "grad_norm": 0.10331137478351593, "learning_rate": 0.0002709077472595147, "loss": 7.7655, "step": 3725 }, { "epoch": 0.4586410635155096, "grad_norm": 0.07098674029111862, "learning_rate": 0.00027084616332060594, "loss": 7.2734, "step": 3726 }, { "epoch": 0.45876415558838013, "grad_norm": 0.07632731646299362, "learning_rate": 0.0002707845793816973, "loss": 7.6316, "step": 3727 }, { "epoch": 0.4588872476612506, "grad_norm": 0.0886782556772232, "learning_rate": 0.0002707229954427885, "loss": 7.5051, "step": 3728 }, { "epoch": 0.45901033973412114, "grad_norm": 0.18082869052886963, "learning_rate": 0.0002706614115038798, "loss": 8.3992, "step": 3729 }, { "epoch": 0.4591334318069916, "grad_norm": 0.08885782957077026, "learning_rate": 0.00027059982756497103, "loss": 7.6841, "step": 3730 }, { "epoch": 0.45925652387986216, "grad_norm": 0.07731200009584427, "learning_rate": 0.0002705382436260623, "loss": 8.0871, "step": 3731 }, { "epoch": 0.45937961595273263, "grad_norm": 0.21231597661972046, "learning_rate": 0.00027047665968715355, "loss": 7.4873, "step": 3732 }, { "epoch": 0.45950270802560317, "grad_norm": 0.17656132578849792, "learning_rate": 0.00027041507574824484, "loss": 7.4218, "step": 3733 }, { "epoch": 0.45962580009847365, "grad_norm": 0.2104903608560562, "learning_rate": 0.0002703534918093361, "loss": 7.9378, "step": 3734 }, { "epoch": 0.4597488921713442, "grad_norm": 0.10563042014837265, "learning_rate": 
0.0002702919078704274, "loss": 7.5123, "step": 3735 }, { "epoch": 0.45987198424421466, "grad_norm": 0.10493805259466171, "learning_rate": 0.00027023032393151865, "loss": 7.6068, "step": 3736 }, { "epoch": 0.4599950763170852, "grad_norm": 0.16098585724830627, "learning_rate": 0.00027016873999260993, "loss": 8.0159, "step": 3737 }, { "epoch": 0.4601181683899557, "grad_norm": 0.0817212387919426, "learning_rate": 0.00027010715605370116, "loss": 7.5657, "step": 3738 }, { "epoch": 0.4602412604628262, "grad_norm": 0.07378866523504257, "learning_rate": 0.00027004557211479245, "loss": 7.5928, "step": 3739 }, { "epoch": 0.4603643525356967, "grad_norm": 0.2895567715167999, "learning_rate": 0.0002699839881758837, "loss": 8.9994, "step": 3740 }, { "epoch": 0.4604874446085672, "grad_norm": 0.13084746897220612, "learning_rate": 0.000269922404236975, "loss": 7.989, "step": 3741 }, { "epoch": 0.4606105366814377, "grad_norm": 0.2193392515182495, "learning_rate": 0.00026986082029806626, "loss": 7.4597, "step": 3742 }, { "epoch": 0.46073362875430823, "grad_norm": 0.25506049394607544, "learning_rate": 0.00026979923635915754, "loss": 7.3268, "step": 3743 }, { "epoch": 0.4608567208271787, "grad_norm": 0.14722779393196106, "learning_rate": 0.0002697376524202488, "loss": 8.2728, "step": 3744 }, { "epoch": 0.46097981290004925, "grad_norm": 0.14095625281333923, "learning_rate": 0.00026967606848134006, "loss": 7.4914, "step": 3745 }, { "epoch": 0.4611029049729197, "grad_norm": 0.10406982898712158, "learning_rate": 0.0002696144845424313, "loss": 7.3038, "step": 3746 }, { "epoch": 0.46122599704579026, "grad_norm": 0.1688135862350464, "learning_rate": 0.00026955290060352264, "loss": 7.8553, "step": 3747 }, { "epoch": 0.46134908911866074, "grad_norm": 0.537839412689209, "learning_rate": 0.00026949131666461387, "loss": 9.4652, "step": 3748 }, { "epoch": 0.46147218119153127, "grad_norm": 0.10688244551420212, "learning_rate": 0.00026942973272570515, "loss": 7.3775, "step": 3749 }, { "epoch": 
0.46159527326440175, "grad_norm": 0.13620226085186005, "learning_rate": 0.0002693681487867964, "loss": 8.2863, "step": 3750 }, { "epoch": 0.4617183653372723, "grad_norm": 0.0618775337934494, "learning_rate": 0.0002693065648478877, "loss": 7.5284, "step": 3751 }, { "epoch": 0.46184145741014276, "grad_norm": 0.07841742783784866, "learning_rate": 0.0002692449809089789, "loss": 7.722, "step": 3752 }, { "epoch": 0.4619645494830133, "grad_norm": 0.2676355540752411, "learning_rate": 0.0002691833969700702, "loss": 8.6079, "step": 3753 }, { "epoch": 0.4620876415558838, "grad_norm": 0.10520590841770172, "learning_rate": 0.0002691218130311615, "loss": 8.3072, "step": 3754 }, { "epoch": 0.4622107336287543, "grad_norm": 0.1924767792224884, "learning_rate": 0.00026906022909225277, "loss": 7.5532, "step": 3755 }, { "epoch": 0.4623338257016248, "grad_norm": 0.14758451282978058, "learning_rate": 0.000268998645153344, "loss": 7.7, "step": 3756 }, { "epoch": 0.4624569177744953, "grad_norm": 0.18937958776950836, "learning_rate": 0.0002689370612144353, "loss": 7.2889, "step": 3757 }, { "epoch": 0.4625800098473658, "grad_norm": 0.14166834950447083, "learning_rate": 0.0002688754772755265, "loss": 7.3883, "step": 3758 }, { "epoch": 0.46270310192023634, "grad_norm": 0.13523223996162415, "learning_rate": 0.0002688138933366178, "loss": 7.852, "step": 3759 }, { "epoch": 0.46282619399310687, "grad_norm": 0.21153315901756287, "learning_rate": 0.00026875230939770904, "loss": 8.3533, "step": 3760 }, { "epoch": 0.46294928606597735, "grad_norm": 0.1593300700187683, "learning_rate": 0.0002686907254588004, "loss": 7.8313, "step": 3761 }, { "epoch": 0.4630723781388479, "grad_norm": 0.11049626767635345, "learning_rate": 0.0002686291415198916, "loss": 7.4248, "step": 3762 }, { "epoch": 0.46319547021171836, "grad_norm": 0.30352866649627686, "learning_rate": 0.0002685675575809829, "loss": 8.8772, "step": 3763 }, { "epoch": 0.4633185622845889, "grad_norm": 0.10371140390634537, "learning_rate": 
0.00026850597364207413, "loss": 7.6585, "step": 3764 }, { "epoch": 0.4634416543574594, "grad_norm": 0.23144352436065674, "learning_rate": 0.0002684443897031654, "loss": 7.1859, "step": 3765 }, { "epoch": 0.4635647464303299, "grad_norm": 0.12651488184928894, "learning_rate": 0.00026838280576425665, "loss": 7.889, "step": 3766 }, { "epoch": 0.4636878385032004, "grad_norm": 0.1270095407962799, "learning_rate": 0.00026832122182534793, "loss": 8.1432, "step": 3767 }, { "epoch": 0.4638109305760709, "grad_norm": 0.1273881196975708, "learning_rate": 0.0002682596378864392, "loss": 7.7041, "step": 3768 }, { "epoch": 0.4639340226489414, "grad_norm": 0.08772122859954834, "learning_rate": 0.0002681980539475305, "loss": 7.2613, "step": 3769 }, { "epoch": 0.46405711472181194, "grad_norm": 0.08563613891601562, "learning_rate": 0.00026813647000862174, "loss": 7.5455, "step": 3770 }, { "epoch": 0.4641802067946824, "grad_norm": 0.10210495442152023, "learning_rate": 0.000268074886069713, "loss": 7.4431, "step": 3771 }, { "epoch": 0.46430329886755295, "grad_norm": 0.7201663255691528, "learning_rate": 0.00026801330213080426, "loss": 10.3936, "step": 3772 }, { "epoch": 0.46442639094042343, "grad_norm": 0.3190403878688812, "learning_rate": 0.00026795171819189555, "loss": 8.4267, "step": 3773 }, { "epoch": 0.46454948301329396, "grad_norm": 0.08726482093334198, "learning_rate": 0.00026789013425298683, "loss": 7.9053, "step": 3774 }, { "epoch": 0.46467257508616444, "grad_norm": 0.08068682998418808, "learning_rate": 0.0002678285503140781, "loss": 7.5489, "step": 3775 }, { "epoch": 0.464795667159035, "grad_norm": 0.09652543812990189, "learning_rate": 0.00026776696637516935, "loss": 8.0902, "step": 3776 }, { "epoch": 0.46491875923190545, "grad_norm": 0.16833990812301636, "learning_rate": 0.00026770538243626064, "loss": 7.3199, "step": 3777 }, { "epoch": 0.465041851304776, "grad_norm": 0.14861728250980377, "learning_rate": 0.00026764379849735187, "loss": 7.5324, "step": 3778 }, { "epoch": 
0.46516494337764647, "grad_norm": 0.1802988052368164, "learning_rate": 0.00026758221455844316, "loss": 7.4686, "step": 3779 }, { "epoch": 0.465288035450517, "grad_norm": 0.0888109803199768, "learning_rate": 0.0002675206306195344, "loss": 7.5417, "step": 3780 }, { "epoch": 0.4654111275233875, "grad_norm": 0.09396105259656906, "learning_rate": 0.00026745904668062573, "loss": 7.2974, "step": 3781 }, { "epoch": 0.465534219596258, "grad_norm": 0.2019539177417755, "learning_rate": 0.00026739746274171696, "loss": 8.0918, "step": 3782 }, { "epoch": 0.4656573116691285, "grad_norm": 0.1462918519973755, "learning_rate": 0.00026733587880280825, "loss": 7.896, "step": 3783 }, { "epoch": 0.465780403741999, "grad_norm": 0.19551298022270203, "learning_rate": 0.0002672742948638995, "loss": 7.812, "step": 3784 }, { "epoch": 0.4659034958148695, "grad_norm": 0.09929024428129196, "learning_rate": 0.00026721271092499077, "loss": 7.648, "step": 3785 }, { "epoch": 0.46602658788774004, "grad_norm": 0.1844749003648758, "learning_rate": 0.000267151126986082, "loss": 7.6625, "step": 3786 }, { "epoch": 0.4661496799606105, "grad_norm": 0.13623160123825073, "learning_rate": 0.0002670895430471733, "loss": 8.2004, "step": 3787 }, { "epoch": 0.46627277203348105, "grad_norm": 0.14459967613220215, "learning_rate": 0.0002670279591082646, "loss": 7.6605, "step": 3788 }, { "epoch": 0.46639586410635153, "grad_norm": 0.13699917495250702, "learning_rate": 0.00026696637516935586, "loss": 7.875, "step": 3789 }, { "epoch": 0.46651895617922207, "grad_norm": 0.1794426143169403, "learning_rate": 0.0002669047912304471, "loss": 7.5012, "step": 3790 }, { "epoch": 0.46664204825209255, "grad_norm": 0.1144331693649292, "learning_rate": 0.0002668432072915384, "loss": 7.8366, "step": 3791 }, { "epoch": 0.4667651403249631, "grad_norm": 0.13664059340953827, "learning_rate": 0.0002667816233526296, "loss": 8.3531, "step": 3792 }, { "epoch": 0.46688823239783356, "grad_norm": 0.28663116693496704, "learning_rate": 
0.0002667200394137209, "loss": 8.8328, "step": 3793 }, { "epoch": 0.4670113244707041, "grad_norm": 0.1347612589597702, "learning_rate": 0.00026665845547481213, "loss": 7.3519, "step": 3794 }, { "epoch": 0.46713441654357457, "grad_norm": 0.11516709625720978, "learning_rate": 0.00026659687153590347, "loss": 7.7291, "step": 3795 }, { "epoch": 0.4672575086164451, "grad_norm": 0.13427473604679108, "learning_rate": 0.0002665352875969947, "loss": 7.5685, "step": 3796 }, { "epoch": 0.4673806006893156, "grad_norm": 0.148048996925354, "learning_rate": 0.000266473703658086, "loss": 8.2296, "step": 3797 }, { "epoch": 0.4675036927621861, "grad_norm": 0.07345061004161835, "learning_rate": 0.0002664121197191772, "loss": 7.3695, "step": 3798 }, { "epoch": 0.4676267848350566, "grad_norm": 0.10153775662183762, "learning_rate": 0.0002663505357802685, "loss": 7.5973, "step": 3799 }, { "epoch": 0.46774987690792713, "grad_norm": 0.09861108660697937, "learning_rate": 0.00026628895184135974, "loss": 7.2294, "step": 3800 }, { "epoch": 0.4678729689807976, "grad_norm": 0.06487435102462769, "learning_rate": 0.0002662273679024511, "loss": 7.4044, "step": 3801 }, { "epoch": 0.46799606105366814, "grad_norm": 0.3616052269935608, "learning_rate": 0.0002661657839635423, "loss": 9.101, "step": 3802 }, { "epoch": 0.4681191531265387, "grad_norm": 0.18782664835453033, "learning_rate": 0.0002661042000246336, "loss": 7.6571, "step": 3803 }, { "epoch": 0.46824224519940916, "grad_norm": 0.12651021778583527, "learning_rate": 0.00026604261608572483, "loss": 7.914, "step": 3804 }, { "epoch": 0.4683653372722797, "grad_norm": 0.09782109409570694, "learning_rate": 0.0002659810321468161, "loss": 7.5579, "step": 3805 }, { "epoch": 0.46848842934515017, "grad_norm": 0.06539248675107956, "learning_rate": 0.00026591944820790735, "loss": 8.0655, "step": 3806 }, { "epoch": 0.4686115214180207, "grad_norm": 0.13225345313549042, "learning_rate": 0.00026585786426899864, "loss": 7.5432, "step": 3807 }, { "epoch": 
0.4687346134908912, "grad_norm": 0.10186829417943954, "learning_rate": 0.0002657962803300899, "loss": 7.964, "step": 3808 }, { "epoch": 0.4688577055637617, "grad_norm": 8397392818733056.0, "learning_rate": 0.0002657346963911812, "loss": 8.21, "step": 3809 }, { "epoch": 0.4689807976366322, "grad_norm": 0.12312251329421997, "learning_rate": 0.00026567311245227244, "loss": 7.4643, "step": 3810 }, { "epoch": 0.46910388970950273, "grad_norm": 0.13629165291786194, "learning_rate": 0.00026561152851336373, "loss": 7.9623, "step": 3811 }, { "epoch": 0.4692269817823732, "grad_norm": 0.07383673638105392, "learning_rate": 0.00026554994457445496, "loss": 7.7794, "step": 3812 }, { "epoch": 0.46935007385524374, "grad_norm": 0.1213977262377739, "learning_rate": 0.00026548836063554625, "loss": 7.359, "step": 3813 }, { "epoch": 0.4694731659281142, "grad_norm": 0.08796824514865875, "learning_rate": 0.0002654267766966375, "loss": 7.7874, "step": 3814 }, { "epoch": 0.46959625800098476, "grad_norm": 0.0747419223189354, "learning_rate": 0.0002653651927577288, "loss": 7.3608, "step": 3815 }, { "epoch": 0.46971935007385524, "grad_norm": 0.1134965568780899, "learning_rate": 0.00026530360881882006, "loss": 7.6152, "step": 3816 }, { "epoch": 0.46984244214672577, "grad_norm": 0.10229922086000443, "learning_rate": 0.00026524202487991134, "loss": 7.5191, "step": 3817 }, { "epoch": 0.46996553421959625, "grad_norm": 0.1271906942129135, "learning_rate": 0.0002651804409410026, "loss": 7.587, "step": 3818 }, { "epoch": 0.4700886262924668, "grad_norm": 0.1133386418223381, "learning_rate": 0.00026511885700209386, "loss": 7.206, "step": 3819 }, { "epoch": 0.47021171836533726, "grad_norm": 0.07811757922172546, "learning_rate": 0.0002650572730631851, "loss": 7.7053, "step": 3820 }, { "epoch": 0.4703348104382078, "grad_norm": 0.7448825836181641, "learning_rate": 0.0002649956891242764, "loss": 11.3615, "step": 3821 }, { "epoch": 0.4704579025110783, "grad_norm": 0.13626201450824738, "learning_rate": 
0.00026493410518536767, "loss": 7.4519, "step": 3822 }, { "epoch": 0.4705809945839488, "grad_norm": 0.13438279926776886, "learning_rate": 0.00026487252124645895, "loss": 7.5228, "step": 3823 }, { "epoch": 0.4707040866568193, "grad_norm": 0.16812074184417725, "learning_rate": 0.0002648109373075502, "loss": 8.1364, "step": 3824 }, { "epoch": 0.4708271787296898, "grad_norm": 0.2210947871208191, "learning_rate": 0.00026474935336864147, "loss": 8.4172, "step": 3825 }, { "epoch": 0.4709502708025603, "grad_norm": 0.06974206119775772, "learning_rate": 0.0002646877694297327, "loss": 7.5266, "step": 3826 }, { "epoch": 0.47107336287543083, "grad_norm": 0.1154036670923233, "learning_rate": 0.000264626185490824, "loss": 7.8238, "step": 3827 }, { "epoch": 0.4711964549483013, "grad_norm": 0.10006479918956757, "learning_rate": 0.0002645646015519153, "loss": 7.9496, "step": 3828 }, { "epoch": 0.47131954702117185, "grad_norm": 0.086053766310215, "learning_rate": 0.00026450301761300656, "loss": 7.606, "step": 3829 }, { "epoch": 0.4714426390940423, "grad_norm": 0.08500471711158752, "learning_rate": 0.0002644414336740978, "loss": 7.694, "step": 3830 }, { "epoch": 0.47156573116691286, "grad_norm": 0.11047852784395218, "learning_rate": 0.0002643798497351891, "loss": 7.7583, "step": 3831 }, { "epoch": 0.47168882323978334, "grad_norm": 0.1067190170288086, "learning_rate": 0.0002643182657962803, "loss": 7.678, "step": 3832 }, { "epoch": 0.4718119153126539, "grad_norm": 0.06808581948280334, "learning_rate": 0.0002642566818573716, "loss": 7.7255, "step": 3833 }, { "epoch": 0.47193500738552435, "grad_norm": 0.11197615414857864, "learning_rate": 0.00026419509791846284, "loss": 7.9519, "step": 3834 }, { "epoch": 0.4720580994583949, "grad_norm": 0.09606373310089111, "learning_rate": 0.0002641335139795542, "loss": 7.5311, "step": 3835 }, { "epoch": 0.47218119153126537, "grad_norm": 0.06829466670751572, "learning_rate": 0.0002640719300406454, "loss": 7.4646, "step": 3836 }, { "epoch": 
0.4723042836041359, "grad_norm": 0.11969480663537979, "learning_rate": 0.0002640103461017367, "loss": 8.1464, "step": 3837 }, { "epoch": 0.4724273756770064, "grad_norm": 0.10469920933246613, "learning_rate": 0.00026394876216282793, "loss": 7.4142, "step": 3838 }, { "epoch": 0.4725504677498769, "grad_norm": 0.12082811444997787, "learning_rate": 0.0002638871782239192, "loss": 7.7317, "step": 3839 }, { "epoch": 0.4726735598227474, "grad_norm": 0.11054522544145584, "learning_rate": 0.00026382559428501045, "loss": 7.6294, "step": 3840 }, { "epoch": 0.4727966518956179, "grad_norm": 0.11524858325719833, "learning_rate": 0.00026376401034610173, "loss": 7.2397, "step": 3841 }, { "epoch": 0.4729197439684884, "grad_norm": 0.15429235994815826, "learning_rate": 0.000263702426407193, "loss": 7.6111, "step": 3842 }, { "epoch": 0.47304283604135894, "grad_norm": 0.0829077884554863, "learning_rate": 0.0002636408424682843, "loss": 7.508, "step": 3843 }, { "epoch": 0.4731659281142294, "grad_norm": 0.3526357412338257, "learning_rate": 0.00026357925852937554, "loss": 8.6366, "step": 3844 }, { "epoch": 0.47328902018709995, "grad_norm": 0.059703242033720016, "learning_rate": 0.0002635176745904668, "loss": 7.215, "step": 3845 }, { "epoch": 0.4734121122599705, "grad_norm": 0.15642127394676208, "learning_rate": 0.00026345609065155806, "loss": 7.7283, "step": 3846 }, { "epoch": 0.47353520433284096, "grad_norm": 0.15759411454200745, "learning_rate": 0.00026339450671264934, "loss": 8.0423, "step": 3847 }, { "epoch": 0.4736582964057115, "grad_norm": 0.07423494011163712, "learning_rate": 0.0002633329227737406, "loss": 7.4677, "step": 3848 }, { "epoch": 0.473781388478582, "grad_norm": 0.14575912058353424, "learning_rate": 0.0002632713388348319, "loss": 7.2178, "step": 3849 }, { "epoch": 0.4739044805514525, "grad_norm": 0.12291260808706284, "learning_rate": 0.00026320975489592315, "loss": 8.2201, "step": 3850 }, { "epoch": 0.474027572624323, "grad_norm": 0.18400514125823975, "learning_rate": 
0.00026314817095701444, "loss": 7.4975, "step": 3851 }, { "epoch": 0.4741506646971935, "grad_norm": 0.1127953827381134, "learning_rate": 0.00026308658701810567, "loss": 7.6266, "step": 3852 }, { "epoch": 0.474273756770064, "grad_norm": 0.09160829335451126, "learning_rate": 0.00026302500307919696, "loss": 8.265, "step": 3853 }, { "epoch": 0.47439684884293454, "grad_norm": 0.1375436931848526, "learning_rate": 0.0002629634191402882, "loss": 7.4564, "step": 3854 }, { "epoch": 0.474519940915805, "grad_norm": 0.0812179297208786, "learning_rate": 0.00026290183520137953, "loss": 7.9608, "step": 3855 }, { "epoch": 0.47464303298867555, "grad_norm": 0.10528091341257095, "learning_rate": 0.00026284025126247076, "loss": 7.9769, "step": 3856 }, { "epoch": 0.47476612506154603, "grad_norm": 0.10877437144517899, "learning_rate": 0.00026277866732356205, "loss": 8.0192, "step": 3857 }, { "epoch": 0.47488921713441656, "grad_norm": 0.12130054086446762, "learning_rate": 0.0002627170833846533, "loss": 7.4059, "step": 3858 }, { "epoch": 0.47501230920728704, "grad_norm": 0.06997612118721008, "learning_rate": 0.00026265549944574457, "loss": 7.706, "step": 3859 }, { "epoch": 0.4751354012801576, "grad_norm": 0.11391416937112808, "learning_rate": 0.0002625939155068358, "loss": 7.3687, "step": 3860 }, { "epoch": 0.47525849335302806, "grad_norm": 0.31508246064186096, "learning_rate": 0.0002625323315679271, "loss": 9.0215, "step": 3861 }, { "epoch": 0.4753815854258986, "grad_norm": 0.16668424010276794, "learning_rate": 0.00026247074762901837, "loss": 7.3883, "step": 3862 }, { "epoch": 0.47550467749876907, "grad_norm": 0.09957040101289749, "learning_rate": 0.00026240916369010966, "loss": 7.2595, "step": 3863 }, { "epoch": 0.4756277695716396, "grad_norm": 0.14332593977451324, "learning_rate": 0.0002623475797512009, "loss": 8.0732, "step": 3864 }, { "epoch": 0.4757508616445101, "grad_norm": 0.06127620115876198, "learning_rate": 0.0002622859958122922, "loss": 7.7327, "step": 3865 }, { "epoch": 
0.4758739537173806, "grad_norm": 0.16165600717067719, "learning_rate": 0.0002622244118733834, "loss": 8.3357, "step": 3866 }, { "epoch": 0.4759970457902511, "grad_norm": 0.16384443640708923, "learning_rate": 0.0002621628279344747, "loss": 7.9522, "step": 3867 }, { "epoch": 0.47612013786312163, "grad_norm": 0.14403893053531647, "learning_rate": 0.00026210124399556593, "loss": 7.6042, "step": 3868 }, { "epoch": 0.4762432299359921, "grad_norm": 0.07668399810791016, "learning_rate": 0.00026203966005665727, "loss": 7.7698, "step": 3869 }, { "epoch": 0.47636632200886264, "grad_norm": 0.17022806406021118, "learning_rate": 0.0002619780761177485, "loss": 8.6069, "step": 3870 }, { "epoch": 0.4764894140817331, "grad_norm": 0.07458829879760742, "learning_rate": 0.0002619164921788398, "loss": 8.1465, "step": 3871 }, { "epoch": 0.47661250615460365, "grad_norm": 0.07125736773014069, "learning_rate": 0.000261854908239931, "loss": 8.0433, "step": 3872 }, { "epoch": 0.47673559822747413, "grad_norm": 0.06344274431467056, "learning_rate": 0.0002617933243010223, "loss": 7.759, "step": 3873 }, { "epoch": 0.47685869030034467, "grad_norm": 0.15944154560565948, "learning_rate": 0.00026173174036211354, "loss": 8.2737, "step": 3874 }, { "epoch": 0.47698178237321515, "grad_norm": 0.08264467865228653, "learning_rate": 0.0002616701564232048, "loss": 7.5845, "step": 3875 }, { "epoch": 0.4771048744460857, "grad_norm": 0.08377441018819809, "learning_rate": 0.0002616085724842961, "loss": 7.5422, "step": 3876 }, { "epoch": 0.47722796651895616, "grad_norm": 0.09966788440942764, "learning_rate": 0.0002615469885453874, "loss": 7.3331, "step": 3877 }, { "epoch": 0.4773510585918267, "grad_norm": 0.11053044348955154, "learning_rate": 0.00026148540460647863, "loss": 7.5971, "step": 3878 }, { "epoch": 0.4774741506646972, "grad_norm": 0.1106753721833229, "learning_rate": 0.0002614238206675699, "loss": 7.6392, "step": 3879 }, { "epoch": 0.4775972427375677, "grad_norm": 0.15476061403751373, "learning_rate": 
0.00026136223672866115, "loss": 7.9074, "step": 3880 }, { "epoch": 0.4777203348104382, "grad_norm": 0.08714151382446289, "learning_rate": 0.00026130065278975244, "loss": 7.2731, "step": 3881 }, { "epoch": 0.4778434268833087, "grad_norm": 0.0974523276090622, "learning_rate": 0.0002612390688508437, "loss": 7.6025, "step": 3882 }, { "epoch": 0.4779665189561792, "grad_norm": 0.10252904891967773, "learning_rate": 0.000261177484911935, "loss": 7.5992, "step": 3883 }, { "epoch": 0.47808961102904973, "grad_norm": 0.0781414732336998, "learning_rate": 0.00026111590097302624, "loss": 7.8523, "step": 3884 }, { "epoch": 0.4782127031019202, "grad_norm": 0.09157975018024445, "learning_rate": 0.00026105431703411753, "loss": 7.3984, "step": 3885 }, { "epoch": 0.47833579517479075, "grad_norm": 0.08299637585878372, "learning_rate": 0.00026099273309520876, "loss": 7.7148, "step": 3886 }, { "epoch": 0.4784588872476612, "grad_norm": 0.10662294179201126, "learning_rate": 0.00026093114915630005, "loss": 7.4498, "step": 3887 }, { "epoch": 0.47858197932053176, "grad_norm": 0.10284861922264099, "learning_rate": 0.0002608695652173913, "loss": 7.3524, "step": 3888 }, { "epoch": 0.47870507139340224, "grad_norm": 0.07196518778800964, "learning_rate": 0.0002608079812784826, "loss": 7.4112, "step": 3889 }, { "epoch": 0.47882816346627277, "grad_norm": 0.5576170086860657, "learning_rate": 0.00026074639733957385, "loss": 9.5631, "step": 3890 }, { "epoch": 0.4789512555391433, "grad_norm": 0.11391729861497879, "learning_rate": 0.00026068481340066514, "loss": 7.6055, "step": 3891 }, { "epoch": 0.4790743476120138, "grad_norm": 0.07562325149774551, "learning_rate": 0.0002606232294617564, "loss": 7.3681, "step": 3892 }, { "epoch": 0.4791974396848843, "grad_norm": 0.08137817680835724, "learning_rate": 0.00026056164552284766, "loss": 7.2598, "step": 3893 }, { "epoch": 0.4793205317577548, "grad_norm": 0.07681713998317719, "learning_rate": 0.0002605000615839389, "loss": 7.5803, "step": 3894 }, { "epoch": 
0.47944362383062533, "grad_norm": 0.09071838855743408, "learning_rate": 0.0002604384776450302, "loss": 7.2725, "step": 3895 }, { "epoch": 0.4795667159034958, "grad_norm": 0.10394696891307831, "learning_rate": 0.00026037689370612147, "loss": 7.7379, "step": 3896 }, { "epoch": 0.47968980797636634, "grad_norm": 0.11640831083059311, "learning_rate": 0.00026031530976721275, "loss": 8.0001, "step": 3897 }, { "epoch": 0.4798129000492368, "grad_norm": 0.09407320618629456, "learning_rate": 0.000260253725828304, "loss": 7.7008, "step": 3898 }, { "epoch": 0.47993599212210736, "grad_norm": 0.17058958113193512, "learning_rate": 0.00026019214188939527, "loss": 9.1226, "step": 3899 }, { "epoch": 0.48005908419497784, "grad_norm": 0.2467217594385147, "learning_rate": 0.0002601305579504865, "loss": 7.4826, "step": 3900 }, { "epoch": 0.48018217626784837, "grad_norm": 0.15256091952323914, "learning_rate": 0.0002600689740115778, "loss": 7.5861, "step": 3901 }, { "epoch": 0.48030526834071885, "grad_norm": 0.23273763060569763, "learning_rate": 0.000260007390072669, "loss": 7.5144, "step": 3902 }, { "epoch": 0.4804283604135894, "grad_norm": 0.1469721496105194, "learning_rate": 0.00025994580613376036, "loss": 7.2381, "step": 3903 }, { "epoch": 0.48055145248645986, "grad_norm": 0.09057088941335678, "learning_rate": 0.0002598842221948516, "loss": 7.5431, "step": 3904 }, { "epoch": 0.4806745445593304, "grad_norm": 0.1814483106136322, "learning_rate": 0.0002598226382559429, "loss": 7.9517, "step": 3905 }, { "epoch": 0.4807976366322009, "grad_norm": 0.09256387501955032, "learning_rate": 0.0002597610543170341, "loss": 7.5119, "step": 3906 }, { "epoch": 0.4809207287050714, "grad_norm": 0.23523148894309998, "learning_rate": 0.0002596994703781254, "loss": 8.1121, "step": 3907 }, { "epoch": 0.4810438207779419, "grad_norm": 0.0824131965637207, "learning_rate": 0.00025963788643921663, "loss": 7.5127, "step": 3908 }, { "epoch": 0.4811669128508124, "grad_norm": 0.20314322412014008, "learning_rate": 
0.000259576302500308, "loss": 8.4475, "step": 3909 }, { "epoch": 0.4812900049236829, "grad_norm": 0.11649917811155319, "learning_rate": 0.0002595147185613992, "loss": 7.356, "step": 3910 }, { "epoch": 0.48141309699655344, "grad_norm": 0.14832253754138947, "learning_rate": 0.0002594531346224905, "loss": 7.3973, "step": 3911 }, { "epoch": 0.4815361890694239, "grad_norm": 0.10806712508201599, "learning_rate": 0.0002593915506835817, "loss": 7.4628, "step": 3912 }, { "epoch": 0.48165928114229445, "grad_norm": 0.09415730088949203, "learning_rate": 0.000259329966744673, "loss": 7.6693, "step": 3913 }, { "epoch": 0.4817823732151649, "grad_norm": 0.11481054127216339, "learning_rate": 0.00025926838280576425, "loss": 7.6022, "step": 3914 }, { "epoch": 0.48190546528803546, "grad_norm": 0.1414974480867386, "learning_rate": 0.00025920679886685553, "loss": 7.8856, "step": 3915 }, { "epoch": 0.48202855736090594, "grad_norm": 0.3396669924259186, "learning_rate": 0.0002591452149279468, "loss": 8.9843, "step": 3916 }, { "epoch": 0.4821516494337765, "grad_norm": 0.11538790911436081, "learning_rate": 0.0002590836309890381, "loss": 7.6307, "step": 3917 }, { "epoch": 0.48227474150664695, "grad_norm": 0.10070092231035233, "learning_rate": 0.00025902204705012934, "loss": 7.6173, "step": 3918 }, { "epoch": 0.4823978335795175, "grad_norm": 0.0889393612742424, "learning_rate": 0.0002589604631112206, "loss": 7.4532, "step": 3919 }, { "epoch": 0.48252092565238797, "grad_norm": 0.08013827353715897, "learning_rate": 0.00025889887917231186, "loss": 7.5987, "step": 3920 }, { "epoch": 0.4826440177252585, "grad_norm": 0.0845312848687172, "learning_rate": 0.00025883729523340314, "loss": 8.0131, "step": 3921 }, { "epoch": 0.482767109798129, "grad_norm": 0.27003100514411926, "learning_rate": 0.0002587757112944944, "loss": 7.1905, "step": 3922 }, { "epoch": 0.4828902018709995, "grad_norm": 0.11888527870178223, "learning_rate": 0.0002587141273555857, "loss": 7.2434, "step": 3923 }, { "epoch": 
0.48301329394387, "grad_norm": 0.11114981025457382, "learning_rate": 0.00025865254341667695, "loss": 7.9678, "step": 3924 }, { "epoch": 0.4831363860167405, "grad_norm": 0.12800756096839905, "learning_rate": 0.00025859095947776824, "loss": 8.0055, "step": 3925 }, { "epoch": 0.483259478089611, "grad_norm": 0.13030551373958588, "learning_rate": 0.00025852937553885947, "loss": 7.5667, "step": 3926 }, { "epoch": 0.48338257016248154, "grad_norm": 0.07222087681293488, "learning_rate": 0.00025846779159995075, "loss": 7.5494, "step": 3927 }, { "epoch": 0.483505662235352, "grad_norm": 0.08035595715045929, "learning_rate": 0.000258406207661042, "loss": 7.676, "step": 3928 }, { "epoch": 0.48362875430822255, "grad_norm": 0.12763525545597076, "learning_rate": 0.0002583446237221333, "loss": 8.1078, "step": 3929 }, { "epoch": 0.48375184638109303, "grad_norm": 0.11130882054567337, "learning_rate": 0.00025828303978322456, "loss": 7.4617, "step": 3930 }, { "epoch": 0.48387493845396357, "grad_norm": 0.10121513903141022, "learning_rate": 0.00025822145584431585, "loss": 7.5501, "step": 3931 }, { "epoch": 0.48399803052683404, "grad_norm": 0.21727465093135834, "learning_rate": 0.0002581598719054071, "loss": 7.014, "step": 3932 }, { "epoch": 0.4841211225997046, "grad_norm": 0.10994616895914078, "learning_rate": 0.00025809828796649837, "loss": 7.7355, "step": 3933 }, { "epoch": 0.4842442146725751, "grad_norm": 0.10544184595346451, "learning_rate": 0.0002580367040275896, "loss": 7.8764, "step": 3934 }, { "epoch": 0.4843673067454456, "grad_norm": 0.15984413027763367, "learning_rate": 0.0002579751200886809, "loss": 7.9801, "step": 3935 }, { "epoch": 0.4844903988183161, "grad_norm": 0.11951097846031189, "learning_rate": 0.0002579135361497721, "loss": 7.4901, "step": 3936 }, { "epoch": 0.4846134908911866, "grad_norm": 0.22185303270816803, "learning_rate": 0.00025785195221086346, "loss": 8.606, "step": 3937 }, { "epoch": 0.48473658296405714, "grad_norm": 0.0548505000770092, "learning_rate": 
0.0002577903682719547, "loss": 7.6913, "step": 3938 }, { "epoch": 0.4848596750369276, "grad_norm": 0.1342313587665558, "learning_rate": 0.000257728784333046, "loss": 7.4496, "step": 3939 }, { "epoch": 0.48498276710979815, "grad_norm": 0.09816396981477737, "learning_rate": 0.0002576672003941372, "loss": 7.9099, "step": 3940 }, { "epoch": 0.48510585918266863, "grad_norm": 0.13689135015010834, "learning_rate": 0.0002576056164552285, "loss": 8.5313, "step": 3941 }, { "epoch": 0.48522895125553916, "grad_norm": 0.10997326672077179, "learning_rate": 0.00025754403251631973, "loss": 8.6729, "step": 3942 }, { "epoch": 0.48535204332840964, "grad_norm": 0.11803941428661346, "learning_rate": 0.00025748244857741107, "loss": 7.5078, "step": 3943 }, { "epoch": 0.4854751354012802, "grad_norm": 0.15561091899871826, "learning_rate": 0.0002574208646385023, "loss": 7.4261, "step": 3944 }, { "epoch": 0.48559822747415066, "grad_norm": 0.08927179127931595, "learning_rate": 0.0002573592806995936, "loss": 7.6983, "step": 3945 }, { "epoch": 0.4857213195470212, "grad_norm": 0.13099093735218048, "learning_rate": 0.0002572976967606848, "loss": 7.1054, "step": 3946 }, { "epoch": 0.48584441161989167, "grad_norm": 0.07264920324087143, "learning_rate": 0.0002572361128217761, "loss": 7.5359, "step": 3947 }, { "epoch": 0.4859675036927622, "grad_norm": 0.09629487246274948, "learning_rate": 0.00025717452888286734, "loss": 7.4953, "step": 3948 }, { "epoch": 0.4860905957656327, "grad_norm": 0.07706408202648163, "learning_rate": 0.00025711294494395857, "loss": 7.3119, "step": 3949 }, { "epoch": 0.4862136878385032, "grad_norm": 0.22590002417564392, "learning_rate": 0.0002570513610050499, "loss": 8.0589, "step": 3950 }, { "epoch": 0.4863367799113737, "grad_norm": 0.25382348895072937, "learning_rate": 0.00025698977706614114, "loss": 8.4994, "step": 3951 }, { "epoch": 0.48645987198424423, "grad_norm": 0.10059403628110886, "learning_rate": 0.00025692819312723243, "loss": 7.8161, "step": 3952 }, { "epoch": 
0.4865829640571147, "grad_norm": 0.2660585641860962, "learning_rate": 0.00025686660918832366, "loss": 8.8084, "step": 3953 }, { "epoch": 0.48670605612998524, "grad_norm": 0.15429282188415527, "learning_rate": 0.00025680502524941495, "loss": 7.5096, "step": 3954 }, { "epoch": 0.4868291482028557, "grad_norm": 0.22563232481479645, "learning_rate": 0.0002567434413105062, "loss": 7.4272, "step": 3955 }, { "epoch": 0.48695224027572626, "grad_norm": 0.2092026323080063, "learning_rate": 0.00025668185737159747, "loss": 7.4197, "step": 3956 }, { "epoch": 0.48707533234859673, "grad_norm": 0.23541124165058136, "learning_rate": 0.00025662027343268876, "loss": 7.3937, "step": 3957 }, { "epoch": 0.48719842442146727, "grad_norm": 0.7414950132369995, "learning_rate": 0.00025655868949378004, "loss": 6.7096, "step": 3958 }, { "epoch": 0.48732151649433775, "grad_norm": 0.16935716569423676, "learning_rate": 0.0002564971055548713, "loss": 7.3658, "step": 3959 }, { "epoch": 0.4874446085672083, "grad_norm": 0.09654582291841507, "learning_rate": 0.00025643552161596256, "loss": 7.3312, "step": 3960 }, { "epoch": 0.48756770064007876, "grad_norm": 0.07951689511537552, "learning_rate": 0.0002563739376770538, "loss": 7.3416, "step": 3961 }, { "epoch": 0.4876907927129493, "grad_norm": 0.08436863869428635, "learning_rate": 0.0002563123537381451, "loss": 7.2046, "step": 3962 }, { "epoch": 0.4878138847858198, "grad_norm": 0.20504888892173767, "learning_rate": 0.0002562507697992363, "loss": 7.4955, "step": 3963 }, { "epoch": 0.4879369768586903, "grad_norm": 0.18977536261081696, "learning_rate": 0.00025618918586032765, "loss": 7.4418, "step": 3964 }, { "epoch": 0.4880600689315608, "grad_norm": 0.26776787638664246, "learning_rate": 0.0002561276019214189, "loss": 7.8371, "step": 3965 }, { "epoch": 0.4881831610044313, "grad_norm": 0.2282620668411255, "learning_rate": 0.00025606601798251017, "loss": 7.6883, "step": 3966 }, { "epoch": 0.4883062530773018, "grad_norm": 0.34084996581077576, "learning_rate": 
0.0002560044340436014, "loss": 8.8087, "step": 3967 }, { "epoch": 0.48842934515017233, "grad_norm": 0.11846006661653519, "learning_rate": 0.0002559428501046927, "loss": 7.591, "step": 3968 }, { "epoch": 0.4885524372230428, "grad_norm": 0.10152161866426468, "learning_rate": 0.0002558812661657839, "loss": 7.4404, "step": 3969 }, { "epoch": 0.48867552929591335, "grad_norm": 0.09894116222858429, "learning_rate": 0.00025581968222687526, "loss": 7.4636, "step": 3970 }, { "epoch": 0.4887986213687838, "grad_norm": 0.18271653354167938, "learning_rate": 0.0002557580982879665, "loss": 8.9603, "step": 3971 }, { "epoch": 0.48892171344165436, "grad_norm": 0.13195888698101044, "learning_rate": 0.0002556965143490578, "loss": 8.4868, "step": 3972 }, { "epoch": 0.48904480551452484, "grad_norm": 0.22547303140163422, "learning_rate": 0.000255634930410149, "loss": 7.4017, "step": 3973 }, { "epoch": 0.4891678975873954, "grad_norm": 0.10722385346889496, "learning_rate": 0.0002555733464712403, "loss": 8.29, "step": 3974 }, { "epoch": 0.48929098966026585, "grad_norm": 0.3063074052333832, "learning_rate": 0.00025551176253233154, "loss": 7.2904, "step": 3975 }, { "epoch": 0.4894140817331364, "grad_norm": 0.12187045812606812, "learning_rate": 0.0002554501785934228, "loss": 8.1808, "step": 3976 }, { "epoch": 0.4895371738060069, "grad_norm": 0.103948213160038, "learning_rate": 0.0002553885946545141, "loss": 7.9392, "step": 3977 }, { "epoch": 0.4896602658788774, "grad_norm": 0.1251227855682373, "learning_rate": 0.0002553270107156054, "loss": 7.3124, "step": 3978 }, { "epoch": 0.48978335795174793, "grad_norm": 0.09976212680339813, "learning_rate": 0.00025526542677669663, "loss": 7.4514, "step": 3979 }, { "epoch": 0.4899064500246184, "grad_norm": 0.1730785369873047, "learning_rate": 0.0002552038428377879, "loss": 8.2794, "step": 3980 }, { "epoch": 0.49002954209748895, "grad_norm": 0.14877746999263763, "learning_rate": 0.00025514225889887915, "loss": 7.5848, "step": 3981 }, { "epoch": 
0.4901526341703594, "grad_norm": 0.11729136109352112, "learning_rate": 0.00025508067495997043, "loss": 7.3577, "step": 3982 }, { "epoch": 0.49027572624322996, "grad_norm": 0.11467962712049484, "learning_rate": 0.00025501909102106167, "loss": 7.5471, "step": 3983 }, { "epoch": 0.49039881831610044, "grad_norm": 0.12482839822769165, "learning_rate": 0.000254957507082153, "loss": 7.6738, "step": 3984 }, { "epoch": 0.49052191038897097, "grad_norm": 0.08986123651266098, "learning_rate": 0.00025489592314324424, "loss": 7.5789, "step": 3985 }, { "epoch": 0.49064500246184145, "grad_norm": 0.0859379917383194, "learning_rate": 0.0002548343392043355, "loss": 7.9457, "step": 3986 }, { "epoch": 0.490768094534712, "grad_norm": 0.12943825125694275, "learning_rate": 0.00025477275526542676, "loss": 7.5202, "step": 3987 }, { "epoch": 0.49089118660758246, "grad_norm": 0.14552415907382965, "learning_rate": 0.00025471117132651804, "loss": 7.5986, "step": 3988 }, { "epoch": 0.491014278680453, "grad_norm": 0.0815107598900795, "learning_rate": 0.0002546495873876093, "loss": 7.8516, "step": 3989 }, { "epoch": 0.4911373707533235, "grad_norm": 0.11184166371822357, "learning_rate": 0.00025458800344870056, "loss": 7.5544, "step": 3990 }, { "epoch": 0.491260462826194, "grad_norm": 0.11137410998344421, "learning_rate": 0.00025452641950979185, "loss": 8.0234, "step": 3991 }, { "epoch": 0.4913835548990645, "grad_norm": 0.05141978710889816, "learning_rate": 0.00025446483557088314, "loss": 7.789, "step": 3992 }, { "epoch": 0.491506646971935, "grad_norm": 0.1027928963303566, "learning_rate": 0.00025440325163197437, "loss": 7.7435, "step": 3993 }, { "epoch": 0.4916297390448055, "grad_norm": 0.17052419483661652, "learning_rate": 0.00025434166769306566, "loss": 7.6985, "step": 3994 }, { "epoch": 0.49175283111767604, "grad_norm": 0.12253033369779587, "learning_rate": 0.0002542800837541569, "loss": 7.7165, "step": 3995 }, { "epoch": 0.4918759231905465, "grad_norm": 0.10149891674518585, "learning_rate": 
0.0002542184998152482, "loss": 7.4105, "step": 3996 }, { "epoch": 0.49199901526341705, "grad_norm": 0.15167781710624695, "learning_rate": 0.00025415691587633946, "loss": 7.8026, "step": 3997 }, { "epoch": 0.49212210733628753, "grad_norm": 0.06808273494243622, "learning_rate": 0.00025409533193743075, "loss": 7.7548, "step": 3998 }, { "epoch": 0.49224519940915806, "grad_norm": 0.16054892539978027, "learning_rate": 0.000254033747998522, "loss": 8.3224, "step": 3999 }, { "epoch": 0.49236829148202854, "grad_norm": 0.14392854273319244, "learning_rate": 0.00025397216405961327, "loss": 7.3892, "step": 4000 }, { "epoch": 0.4924913835548991, "grad_norm": 0.12275011837482452, "learning_rate": 0.0002539105801207045, "loss": 7.6155, "step": 4001 }, { "epoch": 0.49261447562776955, "grad_norm": 0.07189824432134628, "learning_rate": 0.0002538489961817958, "loss": 7.7931, "step": 4002 }, { "epoch": 0.4927375677006401, "grad_norm": 0.13871516287326813, "learning_rate": 0.000253787412242887, "loss": 7.4514, "step": 4003 }, { "epoch": 0.49286065977351057, "grad_norm": 0.10221157968044281, "learning_rate": 0.00025372582830397836, "loss": 7.7973, "step": 4004 }, { "epoch": 0.4929837518463811, "grad_norm": 0.11488345265388489, "learning_rate": 0.0002536642443650696, "loss": 7.939, "step": 4005 }, { "epoch": 0.4931068439192516, "grad_norm": 0.29190686345100403, "learning_rate": 0.0002536026604261609, "loss": 9.0606, "step": 4006 }, { "epoch": 0.4932299359921221, "grad_norm": 0.08005906641483307, "learning_rate": 0.0002535410764872521, "loss": 7.4557, "step": 4007 }, { "epoch": 0.4933530280649926, "grad_norm": 0.06724277138710022, "learning_rate": 0.0002534794925483434, "loss": 7.407, "step": 4008 }, { "epoch": 0.49347612013786313, "grad_norm": 0.09250996261835098, "learning_rate": 0.00025341790860943463, "loss": 7.545, "step": 4009 }, { "epoch": 0.4935992122107336, "grad_norm": 0.06982872635126114, "learning_rate": 0.0002533563246705259, "loss": 7.5964, "step": 4010 }, { "epoch": 
0.49372230428360414, "grad_norm": 0.3516387343406677, "learning_rate": 0.0002532947407316172, "loss": 9.4578, "step": 4011 }, { "epoch": 0.4938453963564746, "grad_norm": 0.09253472089767456, "learning_rate": 0.0002532331567927085, "loss": 8.0924, "step": 4012 }, { "epoch": 0.49396848842934515, "grad_norm": 0.11447393149137497, "learning_rate": 0.0002531715728537997, "loss": 7.3879, "step": 4013 }, { "epoch": 0.49409158050221563, "grad_norm": 0.09862113744020462, "learning_rate": 0.000253109988914891, "loss": 8.1144, "step": 4014 }, { "epoch": 0.49421467257508617, "grad_norm": 0.1225368082523346, "learning_rate": 0.00025304840497598224, "loss": 7.2714, "step": 4015 }, { "epoch": 0.49433776464795665, "grad_norm": 0.0707067996263504, "learning_rate": 0.0002529868210370735, "loss": 7.7859, "step": 4016 }, { "epoch": 0.4944608567208272, "grad_norm": 0.2046532779932022, "learning_rate": 0.00025292523709816476, "loss": 7.3641, "step": 4017 }, { "epoch": 0.49458394879369766, "grad_norm": 0.13333097100257874, "learning_rate": 0.0002528636531592561, "loss": 7.7065, "step": 4018 }, { "epoch": 0.4947070408665682, "grad_norm": 0.07459823042154312, "learning_rate": 0.00025280206922034733, "loss": 7.4711, "step": 4019 }, { "epoch": 0.4948301329394387, "grad_norm": 0.1735353320837021, "learning_rate": 0.0002527404852814386, "loss": 8.0272, "step": 4020 }, { "epoch": 0.4949532250123092, "grad_norm": 0.08277415484189987, "learning_rate": 0.00025267890134252985, "loss": 7.7279, "step": 4021 }, { "epoch": 0.49507631708517974, "grad_norm": 0.12020870298147202, "learning_rate": 0.00025261731740362114, "loss": 7.4027, "step": 4022 }, { "epoch": 0.4951994091580502, "grad_norm": 0.08435216546058655, "learning_rate": 0.00025255573346471237, "loss": 7.7526, "step": 4023 }, { "epoch": 0.49532250123092075, "grad_norm": 0.06708889454603195, "learning_rate": 0.0002524941495258037, "loss": 7.6141, "step": 4024 }, { "epoch": 0.49544559330379123, "grad_norm": 0.08380791544914246, "learning_rate": 
0.00025243256558689494, "loss": 7.7669, "step": 4025 }, { "epoch": 0.49556868537666177, "grad_norm": 0.10492248088121414, "learning_rate": 0.00025237098164798623, "loss": 8.2935, "step": 4026 }, { "epoch": 0.49569177744953224, "grad_norm": 0.10862210392951965, "learning_rate": 0.00025230939770907746, "loss": 7.4854, "step": 4027 }, { "epoch": 0.4958148695224028, "grad_norm": 0.08627161383628845, "learning_rate": 0.00025224781377016875, "loss": 7.637, "step": 4028 }, { "epoch": 0.49593796159527326, "grad_norm": 0.09586064517498016, "learning_rate": 0.00025218622983126, "loss": 7.7962, "step": 4029 }, { "epoch": 0.4960610536681438, "grad_norm": 0.06779745221138, "learning_rate": 0.00025212464589235127, "loss": 7.595, "step": 4030 }, { "epoch": 0.49618414574101427, "grad_norm": 0.08464424312114716, "learning_rate": 0.00025206306195344255, "loss": 7.7121, "step": 4031 }, { "epoch": 0.4963072378138848, "grad_norm": 0.1804562658071518, "learning_rate": 0.00025200147801453384, "loss": 7.9039, "step": 4032 }, { "epoch": 0.4964303298867553, "grad_norm": 0.11799837648868561, "learning_rate": 0.0002519398940756251, "loss": 7.7868, "step": 4033 }, { "epoch": 0.4965534219596258, "grad_norm": 0.1584905982017517, "learning_rate": 0.00025187831013671636, "loss": 8.1387, "step": 4034 }, { "epoch": 0.4966765140324963, "grad_norm": 0.08834537118673325, "learning_rate": 0.0002518167261978076, "loss": 7.5999, "step": 4035 }, { "epoch": 0.49679960610536683, "grad_norm": 0.6844730377197266, "learning_rate": 0.0002517551422588989, "loss": 11.2922, "step": 4036 }, { "epoch": 0.4969226981782373, "grad_norm": 0.19614188373088837, "learning_rate": 0.0002516935583199901, "loss": 7.527, "step": 4037 }, { "epoch": 0.49704579025110784, "grad_norm": 0.22164778411388397, "learning_rate": 0.00025163197438108145, "loss": 7.8855, "step": 4038 }, { "epoch": 0.4971688823239783, "grad_norm": 0.15766854584217072, "learning_rate": 0.0002515703904421727, "loss": 8.1544, "step": 4039 }, { "epoch": 
0.49729197439684886, "grad_norm": 0.2101302295923233, "learning_rate": 0.00025150880650326397, "loss": 7.2285, "step": 4040 }, { "epoch": 0.49741506646971934, "grad_norm": 0.14257226884365082, "learning_rate": 0.0002514472225643552, "loss": 7.3581, "step": 4041 }, { "epoch": 0.49753815854258987, "grad_norm": 0.09125518053770065, "learning_rate": 0.0002513856386254465, "loss": 7.5106, "step": 4042 }, { "epoch": 0.49766125061546035, "grad_norm": 0.10405445098876953, "learning_rate": 0.0002513240546865377, "loss": 7.5066, "step": 4043 }, { "epoch": 0.4977843426883309, "grad_norm": 0.10515227168798447, "learning_rate": 0.000251262470747629, "loss": 7.4766, "step": 4044 }, { "epoch": 0.49790743476120136, "grad_norm": 0.12766632437705994, "learning_rate": 0.0002512008868087203, "loss": 7.9784, "step": 4045 }, { "epoch": 0.4980305268340719, "grad_norm": 0.10759042948484421, "learning_rate": 0.0002511393028698116, "loss": 7.6134, "step": 4046 }, { "epoch": 0.4981536189069424, "grad_norm": 0.21438109874725342, "learning_rate": 0.0002510777189309028, "loss": 8.2439, "step": 4047 }, { "epoch": 0.4982767109798129, "grad_norm": 0.13017678260803223, "learning_rate": 0.0002510161349919941, "loss": 7.7592, "step": 4048 }, { "epoch": 0.4983998030526834, "grad_norm": 0.07615959644317627, "learning_rate": 0.00025095455105308533, "loss": 7.6026, "step": 4049 }, { "epoch": 0.4985228951255539, "grad_norm": 0.074606753885746, "learning_rate": 0.0002508929671141766, "loss": 7.7466, "step": 4050 }, { "epoch": 0.4986459871984244, "grad_norm": 0.13145969808101654, "learning_rate": 0.00025083138317526785, "loss": 7.811, "step": 4051 }, { "epoch": 0.49876907927129494, "grad_norm": 0.12181295454502106, "learning_rate": 0.0002507697992363592, "loss": 7.494, "step": 4052 }, { "epoch": 0.4988921713441654, "grad_norm": 0.09681634604930878, "learning_rate": 0.0002507082152974504, "loss": 8.1687, "step": 4053 }, { "epoch": 0.49901526341703595, "grad_norm": 0.2585068941116333, "learning_rate": 
0.0002506466313585417, "loss": 9.2424, "step": 4054 }, { "epoch": 0.4991383554899064, "grad_norm": 0.26175203919410706, "learning_rate": 0.00025058504741963295, "loss": 9.1525, "step": 4055 }, { "epoch": 0.49926144756277696, "grad_norm": 0.10728990286588669, "learning_rate": 0.00025052346348072423, "loss": 7.8802, "step": 4056 }, { "epoch": 0.49938453963564744, "grad_norm": 0.1628800928592682, "learning_rate": 0.00025046187954181546, "loss": 7.4692, "step": 4057 }, { "epoch": 0.499507631708518, "grad_norm": 0.14080509543418884, "learning_rate": 0.0002504002956029068, "loss": 8.0784, "step": 4058 }, { "epoch": 0.49963072378138845, "grad_norm": 0.09413354843854904, "learning_rate": 0.00025033871166399804, "loss": 7.8353, "step": 4059 }, { "epoch": 0.499753815854259, "grad_norm": 0.08926187455654144, "learning_rate": 0.0002502771277250893, "loss": 7.6055, "step": 4060 }, { "epoch": 0.49987690792712947, "grad_norm": 0.1893797367811203, "learning_rate": 0.00025021554378618056, "loss": 8.8954, "step": 4061 }, { "epoch": 0.5, "grad_norm": 0.07056505233049393, "learning_rate": 0.00025015395984727184, "loss": 7.4639, "step": 4062 }, { "epoch": 0.5001230920728705, "grad_norm": 0.23675474524497986, "learning_rate": 0.0002500923759083631, "loss": 8.2605, "step": 4063 }, { "epoch": 0.5002461841457411, "grad_norm": 0.1208018958568573, "learning_rate": 0.00025003079196945436, "loss": 7.7956, "step": 4064 }, { "epoch": 0.5003692762186115, "grad_norm": 0.10803164541721344, "learning_rate": 0.00024996920803054565, "loss": 7.2567, "step": 4065 }, { "epoch": 0.500492368291482, "grad_norm": 0.09239895641803741, "learning_rate": 0.0002499076240916369, "loss": 7.9827, "step": 4066 }, { "epoch": 0.5006154603643526, "grad_norm": 0.08181214332580566, "learning_rate": 0.00024984604015272817, "loss": 7.6352, "step": 4067 }, { "epoch": 0.5007385524372231, "grad_norm": 0.11789634823799133, "learning_rate": 0.00024978445621381945, "loss": 7.2464, "step": 4068 }, { "epoch": 0.5008616445100935, 
"grad_norm": 0.10421223193407059, "learning_rate": 0.0002497228722749107, "loss": 7.8887, "step": 4069 }, { "epoch": 0.500984736582964, "grad_norm": 0.1199922114610672, "learning_rate": 0.000249661288336002, "loss": 7.4695, "step": 4070 }, { "epoch": 0.5011078286558346, "grad_norm": 0.06476421654224396, "learning_rate": 0.0002495997043970932, "loss": 7.5728, "step": 4071 }, { "epoch": 0.5012309207287051, "grad_norm": 0.11452736705541611, "learning_rate": 0.0002495381204581845, "loss": 7.8378, "step": 4072 }, { "epoch": 0.5013540128015755, "grad_norm": 0.15916797518730164, "learning_rate": 0.0002494765365192758, "loss": 8.1716, "step": 4073 }, { "epoch": 0.5014771048744461, "grad_norm": 0.08466701209545135, "learning_rate": 0.000249414952580367, "loss": 7.5799, "step": 4074 }, { "epoch": 0.5016001969473166, "grad_norm": 0.14067243039608002, "learning_rate": 0.0002493533686414583, "loss": 8.1449, "step": 4075 }, { "epoch": 0.5017232890201871, "grad_norm": 0.11503851413726807, "learning_rate": 0.0002492917847025496, "loss": 7.3768, "step": 4076 }, { "epoch": 0.5018463810930576, "grad_norm": 0.1356792002916336, "learning_rate": 0.0002492302007636408, "loss": 7.5916, "step": 4077 }, { "epoch": 0.5019694731659281, "grad_norm": 0.11044701188802719, "learning_rate": 0.0002491686168247321, "loss": 7.4572, "step": 4078 }, { "epoch": 0.5020925652387986, "grad_norm": 0.12458516657352448, "learning_rate": 0.0002491070328858234, "loss": 7.3475, "step": 4079 }, { "epoch": 0.5022156573116692, "grad_norm": 0.274310439825058, "learning_rate": 0.0002490454489469146, "loss": 8.6256, "step": 4080 }, { "epoch": 0.5023387493845396, "grad_norm": 0.05666125938296318, "learning_rate": 0.0002489838650080059, "loss": 7.5333, "step": 4081 }, { "epoch": 0.5024618414574101, "grad_norm": 0.16983051598072052, "learning_rate": 0.0002489222810690972, "loss": 7.8731, "step": 4082 }, { "epoch": 0.5025849335302807, "grad_norm": 0.11417347937822342, "learning_rate": 0.00024886069713018843, "loss": 
7.6616, "step": 4083 }, { "epoch": 0.5027080256031512, "grad_norm": 0.12028223276138306, "learning_rate": 0.0002487991131912797, "loss": 7.5936, "step": 4084 }, { "epoch": 0.5028311176760216, "grad_norm": 0.10332245379686356, "learning_rate": 0.000248737529252371, "loss": 7.211, "step": 4085 }, { "epoch": 0.5029542097488922, "grad_norm": 0.07872740924358368, "learning_rate": 0.00024867594531346223, "loss": 7.797, "step": 4086 }, { "epoch": 0.5030773018217627, "grad_norm": 0.14473313093185425, "learning_rate": 0.0002486143613745535, "loss": 8.1794, "step": 4087 }, { "epoch": 0.5032003938946332, "grad_norm": 0.07739289849996567, "learning_rate": 0.00024855277743564475, "loss": 7.7037, "step": 4088 }, { "epoch": 0.5033234859675036, "grad_norm": 0.19219090044498444, "learning_rate": 0.00024849119349673604, "loss": 8.5303, "step": 4089 }, { "epoch": 0.5034465780403742, "grad_norm": 0.1812019944190979, "learning_rate": 0.0002484296095578273, "loss": 7.5421, "step": 4090 }, { "epoch": 0.5035696701132447, "grad_norm": 0.19975315034389496, "learning_rate": 0.00024836802561891856, "loss": 7.2711, "step": 4091 }, { "epoch": 0.5036927621861153, "grad_norm": 0.10722135752439499, "learning_rate": 0.00024830644168000984, "loss": 7.7047, "step": 4092 }, { "epoch": 0.5038158542589857, "grad_norm": 0.10936778038740158, "learning_rate": 0.00024824485774110113, "loss": 7.5683, "step": 4093 }, { "epoch": 0.5039389463318562, "grad_norm": 0.0701480433344841, "learning_rate": 0.00024818327380219236, "loss": 8.1335, "step": 4094 }, { "epoch": 0.5040620384047267, "grad_norm": 0.09263821691274643, "learning_rate": 0.00024812168986328365, "loss": 7.3552, "step": 4095 }, { "epoch": 0.5041851304775973, "grad_norm": 0.12411665916442871, "learning_rate": 0.00024806010592437494, "loss": 7.3231, "step": 4096 }, { "epoch": 0.5043082225504677, "grad_norm": 0.17467644810676575, "learning_rate": 0.00024799852198546617, "loss": 8.2172, "step": 4097 }, { "epoch": 0.5044313146233382, "grad_norm": 
0.153028666973114, "learning_rate": 0.00024793693804655746, "loss": 7.5963, "step": 4098 }, { "epoch": 0.5045544066962088, "grad_norm": 0.070567786693573, "learning_rate": 0.00024787535410764874, "loss": 7.2719, "step": 4099 }, { "epoch": 0.5046774987690793, "grad_norm": 0.10435082763433456, "learning_rate": 0.00024781377016874, "loss": 7.4531, "step": 4100 }, { "epoch": 0.5048005908419497, "grad_norm": 0.1273137629032135, "learning_rate": 0.00024775218622983126, "loss": 7.7025, "step": 4101 }, { "epoch": 0.5049236829148203, "grad_norm": 0.08091893047094345, "learning_rate": 0.00024769060229092255, "loss": 7.3656, "step": 4102 }, { "epoch": 0.5050467749876908, "grad_norm": 0.15058359503746033, "learning_rate": 0.0002476290183520138, "loss": 8.1744, "step": 4103 }, { "epoch": 0.5051698670605613, "grad_norm": 0.13414183259010315, "learning_rate": 0.00024756743441310507, "loss": 8.2291, "step": 4104 }, { "epoch": 0.5052929591334318, "grad_norm": 0.18827363848686218, "learning_rate": 0.0002475058504741963, "loss": 7.4715, "step": 4105 }, { "epoch": 0.5054160512063023, "grad_norm": 0.09122753143310547, "learning_rate": 0.0002474442665352876, "loss": 7.7588, "step": 4106 }, { "epoch": 0.5055391432791728, "grad_norm": 0.15215958654880524, "learning_rate": 0.00024738268259637887, "loss": 7.6923, "step": 4107 }, { "epoch": 0.5056622353520434, "grad_norm": 0.11222797632217407, "learning_rate": 0.0002473210986574701, "loss": 8.1899, "step": 4108 }, { "epoch": 0.5057853274249139, "grad_norm": 0.23840808868408203, "learning_rate": 0.0002472595147185614, "loss": 7.0534, "step": 4109 }, { "epoch": 0.5059084194977843, "grad_norm": 0.10680346935987473, "learning_rate": 0.0002471979307796527, "loss": 7.5705, "step": 4110 }, { "epoch": 0.5060315115706548, "grad_norm": 0.10446225106716156, "learning_rate": 0.0002471363468407439, "loss": 7.2768, "step": 4111 }, { "epoch": 0.5061546036435254, "grad_norm": 0.19031371176242828, "learning_rate": 0.0002470747629018352, "loss": 7.984, 
"step": 4112 }, { "epoch": 0.5062776957163959, "grad_norm": 0.19643622636795044, "learning_rate": 0.0002470131789629265, "loss": 7.7664, "step": 4113 }, { "epoch": 0.5064007877892663, "grad_norm": 0.17501507699489594, "learning_rate": 0.0002469515950240177, "loss": 7.6803, "step": 4114 }, { "epoch": 0.5065238798621369, "grad_norm": 0.1530500054359436, "learning_rate": 0.000246890011085109, "loss": 7.669, "step": 4115 }, { "epoch": 0.5066469719350074, "grad_norm": 0.1024070456624031, "learning_rate": 0.0002468284271462003, "loss": 7.5931, "step": 4116 }, { "epoch": 0.5067700640078779, "grad_norm": 0.07535558193922043, "learning_rate": 0.0002467668432072915, "loss": 7.4886, "step": 4117 }, { "epoch": 0.5068931560807484, "grad_norm": 0.0947556272149086, "learning_rate": 0.0002467052592683828, "loss": 7.6348, "step": 4118 }, { "epoch": 0.5070162481536189, "grad_norm": 0.261893093585968, "learning_rate": 0.0002466436753294741, "loss": 8.6409, "step": 4119 }, { "epoch": 0.5071393402264894, "grad_norm": 0.15941700339317322, "learning_rate": 0.00024658209139056533, "loss": 7.2795, "step": 4120 }, { "epoch": 0.50726243229936, "grad_norm": 0.13200336694717407, "learning_rate": 0.0002465205074516566, "loss": 7.7254, "step": 4121 }, { "epoch": 0.5073855243722304, "grad_norm": 0.09735938906669617, "learning_rate": 0.0002464589235127479, "loss": 7.6938, "step": 4122 }, { "epoch": 0.5075086164451009, "grad_norm": 0.07828164100646973, "learning_rate": 0.00024639733957383913, "loss": 7.7393, "step": 4123 }, { "epoch": 0.5076317085179715, "grad_norm": 0.12032893300056458, "learning_rate": 0.0002463357556349304, "loss": 8.0495, "step": 4124 }, { "epoch": 0.507754800590842, "grad_norm": 0.09512098133563995, "learning_rate": 0.00024627417169602165, "loss": 7.7887, "step": 4125 }, { "epoch": 0.5078778926637124, "grad_norm": 0.09243069589138031, "learning_rate": 0.00024621258775711294, "loss": 7.4028, "step": 4126 }, { "epoch": 0.508000984736583, "grad_norm": 0.10504963248968124, 
"learning_rate": 0.0002461510038182042, "loss": 7.7971, "step": 4127 }, { "epoch": 0.5081240768094535, "grad_norm": 0.12638749182224274, "learning_rate": 0.00024608941987929546, "loss": 7.3683, "step": 4128 }, { "epoch": 0.508247168882324, "grad_norm": 0.07628407329320908, "learning_rate": 0.00024602783594038674, "loss": 7.7801, "step": 4129 }, { "epoch": 0.5083702609551944, "grad_norm": 0.0755462497472763, "learning_rate": 0.00024596625200147803, "loss": 7.9638, "step": 4130 }, { "epoch": 0.508493353028065, "grad_norm": 0.12869109213352203, "learning_rate": 0.00024590466806256926, "loss": 7.4288, "step": 4131 }, { "epoch": 0.5086164451009355, "grad_norm": 0.07902699708938599, "learning_rate": 0.00024584308412366055, "loss": 7.7132, "step": 4132 }, { "epoch": 0.508739537173806, "grad_norm": 0.08117128908634186, "learning_rate": 0.00024578150018475184, "loss": 7.6215, "step": 4133 }, { "epoch": 0.5088626292466765, "grad_norm": 0.07146675884723663, "learning_rate": 0.00024571991624584307, "loss": 7.5701, "step": 4134 }, { "epoch": 0.508985721319547, "grad_norm": 0.10526546835899353, "learning_rate": 0.00024565833230693436, "loss": 8.09, "step": 4135 }, { "epoch": 0.5091088133924175, "grad_norm": 0.09714381396770477, "learning_rate": 0.00024559674836802564, "loss": 7.463, "step": 4136 }, { "epoch": 0.5092319054652881, "grad_norm": 0.10280437767505646, "learning_rate": 0.0002455351644291169, "loss": 7.3554, "step": 4137 }, { "epoch": 0.5093549975381585, "grad_norm": 0.10084766149520874, "learning_rate": 0.00024547358049020816, "loss": 7.7673, "step": 4138 }, { "epoch": 0.509478089611029, "grad_norm": 0.0896364226937294, "learning_rate": 0.00024541199655129945, "loss": 7.4058, "step": 4139 }, { "epoch": 0.5096011816838996, "grad_norm": 0.06619415432214737, "learning_rate": 0.0002453504126123907, "loss": 7.5702, "step": 4140 }, { "epoch": 0.5097242737567701, "grad_norm": 0.1826707124710083, "learning_rate": 0.00024528882867348197, "loss": 8.3063, "step": 4141 }, { 
"epoch": 0.5098473658296405, "grad_norm": 0.24248860776424408, "learning_rate": 0.0002452272447345732, "loss": 8.5641, "step": 4142 }, { "epoch": 0.509970457902511, "grad_norm": 0.08737552165985107, "learning_rate": 0.0002451656607956645, "loss": 7.4586, "step": 4143 }, { "epoch": 0.5100935499753816, "grad_norm": 0.08785055577754974, "learning_rate": 0.00024510407685675577, "loss": 7.7926, "step": 4144 }, { "epoch": 0.5102166420482521, "grad_norm": 0.07487129420042038, "learning_rate": 0.000245042492917847, "loss": 7.5888, "step": 4145 }, { "epoch": 0.5103397341211225, "grad_norm": 0.07497038692235947, "learning_rate": 0.0002449809089789383, "loss": 7.5291, "step": 4146 }, { "epoch": 0.5104628261939931, "grad_norm": 0.10850107669830322, "learning_rate": 0.0002449193250400296, "loss": 7.5232, "step": 4147 }, { "epoch": 0.5105859182668636, "grad_norm": 0.1391795128583908, "learning_rate": 0.0002448577411011208, "loss": 8.0706, "step": 4148 }, { "epoch": 0.5107090103397341, "grad_norm": 0.26369524002075195, "learning_rate": 0.0002447961571622121, "loss": 8.7189, "step": 4149 }, { "epoch": 0.5108321024126047, "grad_norm": 0.08922313898801804, "learning_rate": 0.0002447345732233034, "loss": 7.6467, "step": 4150 }, { "epoch": 0.5109551944854751, "grad_norm": 0.2138548195362091, "learning_rate": 0.0002446729892843946, "loss": 8.5289, "step": 4151 }, { "epoch": 0.5110782865583456, "grad_norm": 0.12541580200195312, "learning_rate": 0.0002446114053454859, "loss": 7.7006, "step": 4152 }, { "epoch": 0.5112013786312162, "grad_norm": 0.16389727592468262, "learning_rate": 0.0002445498214065772, "loss": 7.3392, "step": 4153 }, { "epoch": 0.5113244707040867, "grad_norm": 0.10810064524412155, "learning_rate": 0.0002444882374676684, "loss": 7.4016, "step": 4154 }, { "epoch": 0.5114475627769571, "grad_norm": 0.11333329230546951, "learning_rate": 0.0002444266535287597, "loss": 8.2838, "step": 4155 }, { "epoch": 0.5115706548498277, "grad_norm": 0.15890491008758545, "learning_rate": 
0.000244365069589851, "loss": 8.1653, "step": 4156 }, { "epoch": 0.5116937469226982, "grad_norm": 0.07598792016506195, "learning_rate": 0.0002443034856509422, "loss": 7.3918, "step": 4157 }, { "epoch": 0.5118168389955687, "grad_norm": 0.07657922804355621, "learning_rate": 0.0002442419017120335, "loss": 7.7113, "step": 4158 }, { "epoch": 0.5119399310684392, "grad_norm": 0.06238333508372307, "learning_rate": 0.00024418031777312475, "loss": 7.6181, "step": 4159 }, { "epoch": 0.5120630231413097, "grad_norm": 0.09558926522731781, "learning_rate": 0.00024411873383421603, "loss": 7.7266, "step": 4160 }, { "epoch": 0.5121861152141802, "grad_norm": 0.2866738438606262, "learning_rate": 0.00024405714989530732, "loss": 9.0167, "step": 4161 }, { "epoch": 0.5123092072870508, "grad_norm": 0.15789593756198883, "learning_rate": 0.00024399556595639858, "loss": 8.7132, "step": 4162 }, { "epoch": 0.5124322993599212, "grad_norm": 0.11078140139579773, "learning_rate": 0.00024393398201748984, "loss": 7.6371, "step": 4163 }, { "epoch": 0.5125553914327917, "grad_norm": 0.12579195201396942, "learning_rate": 0.0002438723980785811, "loss": 8.1937, "step": 4164 }, { "epoch": 0.5126784835056623, "grad_norm": 0.11088827252388, "learning_rate": 0.00024381081413967238, "loss": 8.0676, "step": 4165 }, { "epoch": 0.5128015755785328, "grad_norm": 0.16204211115837097, "learning_rate": 0.00024374923020076364, "loss": 8.8135, "step": 4166 }, { "epoch": 0.5129246676514032, "grad_norm": 0.1935156136751175, "learning_rate": 0.0002436876462618549, "loss": 7.6529, "step": 4167 }, { "epoch": 0.5130477597242737, "grad_norm": 0.1468838006258011, "learning_rate": 0.0002436260623229462, "loss": 7.5555, "step": 4168 }, { "epoch": 0.5131708517971443, "grad_norm": 0.07199403643608093, "learning_rate": 0.00024356447838403745, "loss": 7.6331, "step": 4169 }, { "epoch": 0.5132939438700148, "grad_norm": 0.09418924152851105, "learning_rate": 0.0002435028944451287, "loss": 8.0358, "step": 4170 }, { "epoch": 
0.5134170359428852, "grad_norm": 0.09262128174304962, "learning_rate": 0.00024344131050622, "loss": 7.5518, "step": 4171 }, { "epoch": 0.5135401280157558, "grad_norm": 0.10252954065799713, "learning_rate": 0.00024337972656731125, "loss": 7.7747, "step": 4172 }, { "epoch": 0.5136632200886263, "grad_norm": 0.07544182240962982, "learning_rate": 0.00024331814262840251, "loss": 7.4765, "step": 4173 }, { "epoch": 0.5137863121614968, "grad_norm": 0.2687723636627197, "learning_rate": 0.00024325655868949377, "loss": 8.4175, "step": 4174 }, { "epoch": 0.5139094042343673, "grad_norm": 0.1880587488412857, "learning_rate": 0.00024319497475058506, "loss": 8.1614, "step": 4175 }, { "epoch": 0.5140324963072378, "grad_norm": 0.08258791267871857, "learning_rate": 0.00024313339081167632, "loss": 7.2418, "step": 4176 }, { "epoch": 0.5141555883801083, "grad_norm": 0.0914432555437088, "learning_rate": 0.00024307180687276758, "loss": 7.8584, "step": 4177 }, { "epoch": 0.5142786804529789, "grad_norm": 0.15576396882534027, "learning_rate": 0.00024301022293385887, "loss": 8.1634, "step": 4178 }, { "epoch": 0.5144017725258493, "grad_norm": 0.5177474617958069, "learning_rate": 0.00024294863899495013, "loss": 9.8332, "step": 4179 }, { "epoch": 0.5145248645987198, "grad_norm": 0.08700548857450485, "learning_rate": 0.00024288705505604138, "loss": 7.667, "step": 4180 }, { "epoch": 0.5146479566715904, "grad_norm": 0.180862694978714, "learning_rate": 0.00024282547111713264, "loss": 7.8478, "step": 4181 }, { "epoch": 0.5147710487444609, "grad_norm": 0.1550646275281906, "learning_rate": 0.00024276388717822393, "loss": 8.4288, "step": 4182 }, { "epoch": 0.5148941408173313, "grad_norm": 0.10131911933422089, "learning_rate": 0.0002427023032393152, "loss": 8.239, "step": 4183 }, { "epoch": 0.5150172328902018, "grad_norm": 0.17349541187286377, "learning_rate": 0.00024264071930040645, "loss": 7.7608, "step": 4184 }, { "epoch": 0.5151403249630724, "grad_norm": 0.15643103420734406, "learning_rate": 
0.00024257913536149774, "loss": 7.6697, "step": 4185 }, { "epoch": 0.5152634170359429, "grad_norm": 0.22809256613254547, "learning_rate": 0.000242517551422589, "loss": 7.25, "step": 4186 }, { "epoch": 0.5153865091088133, "grad_norm": 0.12625712156295776, "learning_rate": 0.00024245596748368026, "loss": 7.5974, "step": 4187 }, { "epoch": 0.5155096011816839, "grad_norm": 0.09789826720952988, "learning_rate": 0.00024239438354477154, "loss": 7.6869, "step": 4188 }, { "epoch": 0.5156326932545544, "grad_norm": 0.15732821822166443, "learning_rate": 0.0002423327996058628, "loss": 8.0889, "step": 4189 }, { "epoch": 0.5157557853274249, "grad_norm": 0.09944068640470505, "learning_rate": 0.00024227121566695406, "loss": 7.633, "step": 4190 }, { "epoch": 0.5158788774002954, "grad_norm": 0.07041653990745544, "learning_rate": 0.00024220963172804532, "loss": 7.6596, "step": 4191 }, { "epoch": 0.5160019694731659, "grad_norm": 0.19007061421871185, "learning_rate": 0.0002421480477891366, "loss": 8.1748, "step": 4192 }, { "epoch": 0.5161250615460364, "grad_norm": 0.09714693576097488, "learning_rate": 0.00024208646385022787, "loss": 7.7934, "step": 4193 }, { "epoch": 0.516248153618907, "grad_norm": 0.07208407670259476, "learning_rate": 0.00024202487991131913, "loss": 7.3713, "step": 4194 }, { "epoch": 0.5163712456917775, "grad_norm": 0.089815653860569, "learning_rate": 0.0002419632959724104, "loss": 7.9121, "step": 4195 }, { "epoch": 0.5164943377646479, "grad_norm": 0.12084385007619858, "learning_rate": 0.00024190171203350167, "loss": 7.9226, "step": 4196 }, { "epoch": 0.5166174298375185, "grad_norm": 0.12679843604564667, "learning_rate": 0.00024184012809459293, "loss": 7.3781, "step": 4197 }, { "epoch": 0.516740521910389, "grad_norm": 0.09252739697694778, "learning_rate": 0.0002417785441556842, "loss": 7.3578, "step": 4198 }, { "epoch": 0.5168636139832595, "grad_norm": 0.23490026593208313, "learning_rate": 0.00024171696021677548, "loss": 8.6288, "step": 4199 }, { "epoch": 
0.51698670605613, "grad_norm": 0.13978765904903412, "learning_rate": 0.00024165537627786674, "loss": 7.2917, "step": 4200 }, { "epoch": 0.5171097981290005, "grad_norm": 0.07652285695075989, "learning_rate": 0.000241593792338958, "loss": 7.7472, "step": 4201 }, { "epoch": 0.517232890201871, "grad_norm": 0.06359319388866425, "learning_rate": 0.00024153220840004928, "loss": 7.602, "step": 4202 }, { "epoch": 0.5173559822747416, "grad_norm": 0.07899085432291031, "learning_rate": 0.00024147062446114054, "loss": 7.5158, "step": 4203 }, { "epoch": 0.517479074347612, "grad_norm": 0.06669744104146957, "learning_rate": 0.0002414090405222318, "loss": 7.6108, "step": 4204 }, { "epoch": 0.5176021664204825, "grad_norm": 0.0945393294095993, "learning_rate": 0.0002413474565833231, "loss": 7.389, "step": 4205 }, { "epoch": 0.517725258493353, "grad_norm": 0.06507731229066849, "learning_rate": 0.00024128587264441435, "loss": 7.8527, "step": 4206 }, { "epoch": 0.5178483505662236, "grad_norm": 0.12215740978717804, "learning_rate": 0.0002412242887055056, "loss": 7.8143, "step": 4207 }, { "epoch": 0.517971442639094, "grad_norm": 0.14895056188106537, "learning_rate": 0.00024116270476659687, "loss": 8.229, "step": 4208 }, { "epoch": 0.5180945347119645, "grad_norm": 0.07147122919559479, "learning_rate": 0.00024110112082768815, "loss": 7.7189, "step": 4209 }, { "epoch": 0.5182176267848351, "grad_norm": 0.16119500994682312, "learning_rate": 0.0002410395368887794, "loss": 7.351, "step": 4210 }, { "epoch": 0.5183407188577056, "grad_norm": 0.09185168147087097, "learning_rate": 0.00024097795294987067, "loss": 7.5176, "step": 4211 }, { "epoch": 0.518463810930576, "grad_norm": 0.08592375367879868, "learning_rate": 0.00024091636901096196, "loss": 7.4544, "step": 4212 }, { "epoch": 0.5185869030034466, "grad_norm": 0.12918026745319366, "learning_rate": 0.00024085478507205322, "loss": 7.3097, "step": 4213 }, { "epoch": 0.5187099950763171, "grad_norm": 0.08994314819574356, "learning_rate": 
0.00024079320113314448, "loss": 7.8134, "step": 4214 }, { "epoch": 0.5188330871491876, "grad_norm": 0.10772155225276947, "learning_rate": 0.00024073161719423577, "loss": 7.1963, "step": 4215 }, { "epoch": 0.5189561792220581, "grad_norm": 0.16132481396198273, "learning_rate": 0.00024067003325532702, "loss": 8.0172, "step": 4216 }, { "epoch": 0.5190792712949286, "grad_norm": 0.13291393220424652, "learning_rate": 0.00024060844931641828, "loss": 7.4734, "step": 4217 }, { "epoch": 0.5192023633677991, "grad_norm": 0.06874850392341614, "learning_rate": 0.00024054686537750954, "loss": 7.2647, "step": 4218 }, { "epoch": 0.5193254554406697, "grad_norm": 0.14689300954341888, "learning_rate": 0.00024048528143860083, "loss": 7.6893, "step": 4219 }, { "epoch": 0.5194485475135401, "grad_norm": 0.08434699475765228, "learning_rate": 0.0002404236974996921, "loss": 7.4224, "step": 4220 }, { "epoch": 0.5195716395864106, "grad_norm": 0.15062817931175232, "learning_rate": 0.00024036211356078335, "loss": 8.1101, "step": 4221 }, { "epoch": 0.5196947316592812, "grad_norm": 0.09897232800722122, "learning_rate": 0.00024030052962187464, "loss": 7.3619, "step": 4222 }, { "epoch": 0.5198178237321517, "grad_norm": 0.14071407914161682, "learning_rate": 0.0002402389456829659, "loss": 7.3241, "step": 4223 }, { "epoch": 0.5199409158050221, "grad_norm": 0.12328649312257767, "learning_rate": 0.00024017736174405715, "loss": 7.2375, "step": 4224 }, { "epoch": 0.5200640078778926, "grad_norm": 0.0759349837899208, "learning_rate": 0.00024011577780514841, "loss": 7.6159, "step": 4225 }, { "epoch": 0.5201870999507632, "grad_norm": 0.06574697047472, "learning_rate": 0.0002400541938662397, "loss": 7.6046, "step": 4226 }, { "epoch": 0.5203101920236337, "grad_norm": 0.0918801799416542, "learning_rate": 0.00023999260992733096, "loss": 7.4083, "step": 4227 }, { "epoch": 0.5204332840965041, "grad_norm": 0.12216705083847046, "learning_rate": 0.00023993102598842222, "loss": 7.6179, "step": 4228 }, { "epoch": 
0.5205563761693747, "grad_norm": 0.3473002314567566, "learning_rate": 0.0002398694420495135, "loss": 9.4446, "step": 4229 }, { "epoch": 0.5206794682422452, "grad_norm": 0.12697269022464752, "learning_rate": 0.00023980785811060477, "loss": 7.3632, "step": 4230 }, { "epoch": 0.5208025603151157, "grad_norm": 0.11048030853271484, "learning_rate": 0.00023974627417169603, "loss": 7.4383, "step": 4231 }, { "epoch": 0.5209256523879862, "grad_norm": 0.2596341073513031, "learning_rate": 0.0002396846902327873, "loss": 8.7574, "step": 4232 }, { "epoch": 0.5210487444608567, "grad_norm": 0.06047637388110161, "learning_rate": 0.00023962310629387857, "loss": 7.4942, "step": 4233 }, { "epoch": 0.5211718365337272, "grad_norm": 0.06681081652641296, "learning_rate": 0.00023956152235496983, "loss": 7.4211, "step": 4234 }, { "epoch": 0.5212949286065978, "grad_norm": 0.079683318734169, "learning_rate": 0.0002394999384160611, "loss": 7.9296, "step": 4235 }, { "epoch": 0.5214180206794683, "grad_norm": 0.11358927190303802, "learning_rate": 0.00023943835447715238, "loss": 7.2856, "step": 4236 }, { "epoch": 0.5215411127523387, "grad_norm": 0.09530656039714813, "learning_rate": 0.00023937677053824364, "loss": 7.5957, "step": 4237 }, { "epoch": 0.5216642048252093, "grad_norm": 0.07537446171045303, "learning_rate": 0.0002393151865993349, "loss": 7.4726, "step": 4238 }, { "epoch": 0.5217872968980798, "grad_norm": 0.0978485569357872, "learning_rate": 0.00023925360266042618, "loss": 7.3461, "step": 4239 }, { "epoch": 0.5219103889709503, "grad_norm": 0.10066711157560349, "learning_rate": 0.00023919201872151744, "loss": 7.329, "step": 4240 }, { "epoch": 0.5220334810438207, "grad_norm": 0.07548674196004868, "learning_rate": 0.0002391304347826087, "loss": 7.44, "step": 4241 }, { "epoch": 0.5221565731166913, "grad_norm": 0.15097825229167938, "learning_rate": 0.0002390688508437, "loss": 7.6385, "step": 4242 }, { "epoch": 0.5222796651895618, "grad_norm": 0.21703344583511353, "learning_rate": 
0.00023900726690479125, "loss": 8.2695, "step": 4243 }, { "epoch": 0.5224027572624323, "grad_norm": 0.09886853396892548, "learning_rate": 0.0002389456829658825, "loss": 7.6592, "step": 4244 }, { "epoch": 0.5225258493353028, "grad_norm": 0.1515536904335022, "learning_rate": 0.00023888409902697377, "loss": 8.2037, "step": 4245 }, { "epoch": 0.5226489414081733, "grad_norm": 0.06986558437347412, "learning_rate": 0.00023882251508806505, "loss": 7.5927, "step": 4246 }, { "epoch": 0.5227720334810438, "grad_norm": 0.07631317526102066, "learning_rate": 0.0002387609311491563, "loss": 7.5077, "step": 4247 }, { "epoch": 0.5228951255539144, "grad_norm": 0.08282262831926346, "learning_rate": 0.00023869934721024757, "loss": 8.0056, "step": 4248 }, { "epoch": 0.5230182176267848, "grad_norm": 0.12259119004011154, "learning_rate": 0.00023863776327133886, "loss": 7.3808, "step": 4249 }, { "epoch": 0.5231413096996553, "grad_norm": 0.11088088154792786, "learning_rate": 0.00023857617933243012, "loss": 7.9219, "step": 4250 }, { "epoch": 0.5232644017725259, "grad_norm": 0.11527059972286224, "learning_rate": 0.00023851459539352138, "loss": 7.7518, "step": 4251 }, { "epoch": 0.5233874938453964, "grad_norm": 0.07782173901796341, "learning_rate": 0.00023845301145461264, "loss": 7.6916, "step": 4252 }, { "epoch": 0.5235105859182668, "grad_norm": 0.11092701554298401, "learning_rate": 0.00023839142751570392, "loss": 7.8813, "step": 4253 }, { "epoch": 0.5236336779911374, "grad_norm": 0.13258615136146545, "learning_rate": 0.00023832984357679518, "loss": 7.8068, "step": 4254 }, { "epoch": 0.5237567700640079, "grad_norm": 0.22398166358470917, "learning_rate": 0.00023826825963788644, "loss": 7.4097, "step": 4255 }, { "epoch": 0.5238798621368784, "grad_norm": 0.13760259747505188, "learning_rate": 0.00023820667569897773, "loss": 7.6416, "step": 4256 }, { "epoch": 0.5240029542097489, "grad_norm": 0.0869116336107254, "learning_rate": 0.000238145091760069, "loss": 7.8821, "step": 4257 }, { "epoch": 
0.5241260462826194, "grad_norm": 0.1641833633184433, "learning_rate": 0.00023808350782116025, "loss": 7.8057, "step": 4258 }, { "epoch": 0.5242491383554899, "grad_norm": 0.12019713222980499, "learning_rate": 0.00023802192388225154, "loss": 8.0283, "step": 4259 }, { "epoch": 0.5243722304283605, "grad_norm": 0.10196682810783386, "learning_rate": 0.0002379603399433428, "loss": 7.3599, "step": 4260 }, { "epoch": 0.5244953225012309, "grad_norm": 0.08129315078258514, "learning_rate": 0.00023789875600443405, "loss": 7.5632, "step": 4261 }, { "epoch": 0.5246184145741014, "grad_norm": 0.11122360825538635, "learning_rate": 0.00023783717206552531, "loss": 7.4953, "step": 4262 }, { "epoch": 0.524741506646972, "grad_norm": 0.06931335479021072, "learning_rate": 0.0002377755881266166, "loss": 7.3118, "step": 4263 }, { "epoch": 0.5248645987198425, "grad_norm": 0.0722368061542511, "learning_rate": 0.00023771400418770786, "loss": 7.6352, "step": 4264 }, { "epoch": 0.5249876907927129, "grad_norm": 0.06686769425868988, "learning_rate": 0.00023765242024879912, "loss": 7.3727, "step": 4265 }, { "epoch": 0.5251107828655834, "grad_norm": 0.08126168698072433, "learning_rate": 0.0002375908363098904, "loss": 7.3359, "step": 4266 }, { "epoch": 0.525233874938454, "grad_norm": 0.10719495266675949, "learning_rate": 0.00023752925237098167, "loss": 8.0744, "step": 4267 }, { "epoch": 0.5253569670113245, "grad_norm": 0.12294968217611313, "learning_rate": 0.00023746766843207292, "loss": 7.8333, "step": 4268 }, { "epoch": 0.5254800590841949, "grad_norm": 0.6256700158119202, "learning_rate": 0.0002374060844931642, "loss": 10.3116, "step": 4269 }, { "epoch": 0.5256031511570655, "grad_norm": 0.08349563181400299, "learning_rate": 0.00023734450055425547, "loss": 7.7101, "step": 4270 }, { "epoch": 0.525726243229936, "grad_norm": 0.12726719677448273, "learning_rate": 0.00023728291661534673, "loss": 7.6851, "step": 4271 }, { "epoch": 0.5258493353028065, "grad_norm": 0.13569240272045135, "learning_rate": 
0.000237221332676438, "loss": 7.4679, "step": 4272 }, { "epoch": 0.525972427375677, "grad_norm": 0.2505890429019928, "learning_rate": 0.00023715974873752928, "loss": 8.828, "step": 4273 }, { "epoch": 0.5260955194485475, "grad_norm": 0.06213948503136635, "learning_rate": 0.00023709816479862054, "loss": 7.6022, "step": 4274 }, { "epoch": 0.526218611521418, "grad_norm": 0.12195181101560593, "learning_rate": 0.0002370365808597118, "loss": 7.4293, "step": 4275 }, { "epoch": 0.5263417035942886, "grad_norm": 0.08947115391492844, "learning_rate": 0.00023697499692080308, "loss": 8.0091, "step": 4276 }, { "epoch": 0.526464795667159, "grad_norm": 0.0906778946518898, "learning_rate": 0.00023691341298189434, "loss": 7.6841, "step": 4277 }, { "epoch": 0.5265878877400295, "grad_norm": 0.1948612630367279, "learning_rate": 0.0002368518290429856, "loss": 8.4849, "step": 4278 }, { "epoch": 0.5267109798129, "grad_norm": 0.09521149843931198, "learning_rate": 0.00023679024510407686, "loss": 7.9631, "step": 4279 }, { "epoch": 0.5268340718857706, "grad_norm": 0.18584835529327393, "learning_rate": 0.00023672866116516815, "loss": 8.471, "step": 4280 }, { "epoch": 0.5269571639586411, "grad_norm": 0.11180959641933441, "learning_rate": 0.0002366670772262594, "loss": 8.1444, "step": 4281 }, { "epoch": 0.5270802560315115, "grad_norm": 0.1367988884449005, "learning_rate": 0.00023660549328735067, "loss": 7.3335, "step": 4282 }, { "epoch": 0.5272033481043821, "grad_norm": 0.08628519624471664, "learning_rate": 0.00023654390934844195, "loss": 7.6751, "step": 4283 }, { "epoch": 0.5273264401772526, "grad_norm": 0.2186540961265564, "learning_rate": 0.0002364823254095332, "loss": 7.211, "step": 4284 }, { "epoch": 0.5274495322501231, "grad_norm": 0.1530098021030426, "learning_rate": 0.00023642074147062447, "loss": 7.2118, "step": 4285 }, { "epoch": 0.5275726243229936, "grad_norm": 0.10290660709142685, "learning_rate": 0.00023635915753171576, "loss": 7.9802, "step": 4286 }, { "epoch": 0.5276957163958641, 
"grad_norm": 0.11058132350444794, "learning_rate": 0.00023629757359280702, "loss": 7.7594, "step": 4287 }, { "epoch": 0.5278188084687346, "grad_norm": 0.11468569934368134, "learning_rate": 0.00023623598965389828, "loss": 7.7095, "step": 4288 }, { "epoch": 0.5279419005416052, "grad_norm": 0.10103701055049896, "learning_rate": 0.00023617440571498954, "loss": 7.28, "step": 4289 }, { "epoch": 0.5280649926144756, "grad_norm": 0.14582408964633942, "learning_rate": 0.00023611282177608082, "loss": 7.6079, "step": 4290 }, { "epoch": 0.5281880846873461, "grad_norm": 0.14728769659996033, "learning_rate": 0.00023605123783717208, "loss": 7.8625, "step": 4291 }, { "epoch": 0.5283111767602167, "grad_norm": 0.0795397236943245, "learning_rate": 0.00023598965389826334, "loss": 7.8468, "step": 4292 }, { "epoch": 0.5284342688330872, "grad_norm": 0.05128752440214157, "learning_rate": 0.00023592806995935463, "loss": 7.5561, "step": 4293 }, { "epoch": 0.5285573609059576, "grad_norm": 0.11943043023347855, "learning_rate": 0.0002358664860204459, "loss": 7.3692, "step": 4294 }, { "epoch": 0.5286804529788282, "grad_norm": 0.08345980942249298, "learning_rate": 0.00023580490208153715, "loss": 7.5245, "step": 4295 }, { "epoch": 0.5288035450516987, "grad_norm": 0.12851938605308533, "learning_rate": 0.00023574331814262843, "loss": 7.9003, "step": 4296 }, { "epoch": 0.5289266371245692, "grad_norm": 0.10382503271102905, "learning_rate": 0.00023568173420371967, "loss": 7.5806, "step": 4297 }, { "epoch": 0.5290497291974396, "grad_norm": 0.0983615294098854, "learning_rate": 0.00023562015026481093, "loss": 7.4915, "step": 4298 }, { "epoch": 0.5291728212703102, "grad_norm": 0.11414606124162674, "learning_rate": 0.00023555856632590219, "loss": 7.5915, "step": 4299 }, { "epoch": 0.5292959133431807, "grad_norm": 0.1285552829504013, "learning_rate": 0.00023549698238699347, "loss": 8.3809, "step": 4300 }, { "epoch": 0.5294190054160512, "grad_norm": 0.16075663268566132, "learning_rate": 
0.00023543539844808473, "loss": 7.3845, "step": 4301 }, { "epoch": 0.5295420974889217, "grad_norm": 0.06330212950706482, "learning_rate": 0.000235373814509176, "loss": 7.6655, "step": 4302 }, { "epoch": 0.5296651895617922, "grad_norm": 0.2881353199481964, "learning_rate": 0.00023531223057026728, "loss": 9.413, "step": 4303 }, { "epoch": 0.5297882816346627, "grad_norm": 0.072247214615345, "learning_rate": 0.00023525064663135854, "loss": 7.6123, "step": 4304 }, { "epoch": 0.5299113737075333, "grad_norm": 0.11459475010633469, "learning_rate": 0.0002351890626924498, "loss": 8.0736, "step": 4305 }, { "epoch": 0.5300344657804037, "grad_norm": 0.08045434951782227, "learning_rate": 0.00023512747875354106, "loss": 8.0469, "step": 4306 }, { "epoch": 0.5301575578532742, "grad_norm": 0.11146170645952225, "learning_rate": 0.00023506589481463234, "loss": 7.8452, "step": 4307 }, { "epoch": 0.5302806499261448, "grad_norm": 0.15106621384620667, "learning_rate": 0.0002350043108757236, "loss": 8.401, "step": 4308 }, { "epoch": 0.5304037419990153, "grad_norm": 0.09370127320289612, "learning_rate": 0.00023494272693681486, "loss": 8.0307, "step": 4309 }, { "epoch": 0.5305268340718857, "grad_norm": 0.08279415965080261, "learning_rate": 0.00023488114299790615, "loss": 7.5943, "step": 4310 }, { "epoch": 0.5306499261447563, "grad_norm": 0.18292634189128876, "learning_rate": 0.0002348195590589974, "loss": 8.9218, "step": 4311 }, { "epoch": 0.5307730182176268, "grad_norm": 0.07675708085298538, "learning_rate": 0.00023475797512008867, "loss": 7.7058, "step": 4312 }, { "epoch": 0.5308961102904973, "grad_norm": 0.1768147051334381, "learning_rate": 0.00023469639118117995, "loss": 7.9345, "step": 4313 }, { "epoch": 0.5310192023633677, "grad_norm": 0.22108805179595947, "learning_rate": 0.00023463480724227121, "loss": 7.2411, "step": 4314 }, { "epoch": 0.5311422944362383, "grad_norm": 0.1942029744386673, "learning_rate": 0.00023457322330336247, "loss": 7.3843, "step": 4315 }, { "epoch": 
0.5312653865091088, "grad_norm": 0.19433876872062683, "learning_rate": 0.00023451163936445373, "loss": 7.8259, "step": 4316 }, { "epoch": 0.5313884785819794, "grad_norm": 0.12732788920402527, "learning_rate": 0.00023445005542554502, "loss": 7.4761, "step": 4317 }, { "epoch": 0.5315115706548498, "grad_norm": 0.0951835960149765, "learning_rate": 0.00023438847148663628, "loss": 7.7075, "step": 4318 }, { "epoch": 0.5316346627277203, "grad_norm": 0.11366661638021469, "learning_rate": 0.00023432688754772754, "loss": 7.5706, "step": 4319 }, { "epoch": 0.5317577548005908, "grad_norm": 0.1171022579073906, "learning_rate": 0.00023426530360881883, "loss": 7.4345, "step": 4320 }, { "epoch": 0.5318808468734614, "grad_norm": 0.12504492700099945, "learning_rate": 0.00023420371966991008, "loss": 7.9257, "step": 4321 }, { "epoch": 0.5320039389463319, "grad_norm": 0.11685119569301605, "learning_rate": 0.00023414213573100134, "loss": 7.742, "step": 4322 }, { "epoch": 0.5321270310192023, "grad_norm": 0.09848513454198837, "learning_rate": 0.0002340805517920926, "loss": 7.7719, "step": 4323 }, { "epoch": 0.5322501230920729, "grad_norm": 0.12877991795539856, "learning_rate": 0.0002340189678531839, "loss": 7.2424, "step": 4324 }, { "epoch": 0.5323732151649434, "grad_norm": 0.09580870717763901, "learning_rate": 0.00023395738391427515, "loss": 8.2722, "step": 4325 }, { "epoch": 0.5324963072378139, "grad_norm": 0.09260089695453644, "learning_rate": 0.0002338957999753664, "loss": 7.5724, "step": 4326 }, { "epoch": 0.5326193993106844, "grad_norm": 0.12293165922164917, "learning_rate": 0.0002338342160364577, "loss": 7.3616, "step": 4327 }, { "epoch": 0.5327424913835549, "grad_norm": 0.12235872447490692, "learning_rate": 0.00023377263209754896, "loss": 7.4513, "step": 4328 }, { "epoch": 0.5328655834564254, "grad_norm": 0.14222584664821625, "learning_rate": 0.00023371104815864021, "loss": 7.6311, "step": 4329 }, { "epoch": 0.532988675529296, "grad_norm": 0.06390654295682907, "learning_rate": 
0.0002336494642197315, "loss": 7.5955, "step": 4330 }, { "epoch": 0.5331117676021664, "grad_norm": 0.11017616838216782, "learning_rate": 0.00023358788028082276, "loss": 7.3679, "step": 4331 }, { "epoch": 0.5332348596750369, "grad_norm": 0.06263618916273117, "learning_rate": 0.00023352629634191402, "loss": 7.5729, "step": 4332 }, { "epoch": 0.5333579517479075, "grad_norm": 0.16596604883670807, "learning_rate": 0.00023346471240300528, "loss": 8.2741, "step": 4333 }, { "epoch": 0.533481043820778, "grad_norm": 0.0733143538236618, "learning_rate": 0.00023340312846409657, "loss": 7.5043, "step": 4334 }, { "epoch": 0.5336041358936484, "grad_norm": 0.058253757655620575, "learning_rate": 0.00023334154452518783, "loss": 7.4148, "step": 4335 }, { "epoch": 0.533727227966519, "grad_norm": 0.12186695635318756, "learning_rate": 0.00023327996058627909, "loss": 7.5066, "step": 4336 }, { "epoch": 0.5338503200393895, "grad_norm": 0.06011976674199104, "learning_rate": 0.00023321837664737037, "loss": 7.367, "step": 4337 }, { "epoch": 0.53397341211226, "grad_norm": 0.06728315353393555, "learning_rate": 0.00023315679270846163, "loss": 7.4318, "step": 4338 }, { "epoch": 0.5340965041851304, "grad_norm": 0.139456644654274, "learning_rate": 0.0002330952087695529, "loss": 7.8272, "step": 4339 }, { "epoch": 0.534219596258001, "grad_norm": 0.06155170500278473, "learning_rate": 0.00023303362483064415, "loss": 7.7444, "step": 4340 }, { "epoch": 0.5343426883308715, "grad_norm": 0.1350586861371994, "learning_rate": 0.00023297204089173544, "loss": 7.2883, "step": 4341 }, { "epoch": 0.534465780403742, "grad_norm": 0.07674496620893478, "learning_rate": 0.0002329104569528267, "loss": 7.6459, "step": 4342 }, { "epoch": 0.5345888724766125, "grad_norm": 0.19220076501369476, "learning_rate": 0.00023284887301391796, "loss": 8.703, "step": 4343 }, { "epoch": 0.534711964549483, "grad_norm": 0.08521226793527603, "learning_rate": 0.00023278728907500924, "loss": 7.8486, "step": 4344 }, { "epoch": 
0.5348350566223535, "grad_norm": 0.08124762028455734, "learning_rate": 0.0002327257051361005, "loss": 8.0596, "step": 4345 }, { "epoch": 0.5349581486952241, "grad_norm": 0.19317620992660522, "learning_rate": 0.00023266412119719176, "loss": 7.3084, "step": 4346 }, { "epoch": 0.5350812407680945, "grad_norm": 0.12948381900787354, "learning_rate": 0.00023260253725828305, "loss": 7.5636, "step": 4347 }, { "epoch": 0.535204332840965, "grad_norm": 0.5202641487121582, "learning_rate": 0.0002325409533193743, "loss": 10.1601, "step": 4348 }, { "epoch": 0.5353274249138356, "grad_norm": 0.09151837229728699, "learning_rate": 0.00023247936938046557, "loss": 7.5688, "step": 4349 }, { "epoch": 0.5354505169867061, "grad_norm": 0.05786488205194473, "learning_rate": 0.00023241778544155683, "loss": 7.7197, "step": 4350 }, { "epoch": 0.5355736090595765, "grad_norm": 0.07645738869905472, "learning_rate": 0.0002323562015026481, "loss": 7.5413, "step": 4351 }, { "epoch": 0.535696701132447, "grad_norm": 0.1746063083410263, "learning_rate": 0.00023229461756373937, "loss": 7.4902, "step": 4352 }, { "epoch": 0.5358197932053176, "grad_norm": 0.09173758327960968, "learning_rate": 0.00023223303362483063, "loss": 8.0744, "step": 4353 }, { "epoch": 0.5359428852781881, "grad_norm": 0.1378403902053833, "learning_rate": 0.00023217144968592192, "loss": 8.0457, "step": 4354 }, { "epoch": 0.5360659773510585, "grad_norm": 0.08837786316871643, "learning_rate": 0.00023210986574701318, "loss": 8.093, "step": 4355 }, { "epoch": 0.5361890694239291, "grad_norm": 0.13887840509414673, "learning_rate": 0.00023204828180810444, "loss": 7.1792, "step": 4356 }, { "epoch": 0.5363121614967996, "grad_norm": 0.0783284530043602, "learning_rate": 0.00023198669786919572, "loss": 7.6051, "step": 4357 }, { "epoch": 0.5364352535696701, "grad_norm": 0.08287108689546585, "learning_rate": 0.00023192511393028698, "loss": 7.567, "step": 4358 }, { "epoch": 0.5365583456425406, "grad_norm": 0.08771255612373352, "learning_rate": 
0.00023186352999137824, "loss": 7.4153, "step": 4359 }, { "epoch": 0.5366814377154111, "grad_norm": 0.06213485822081566, "learning_rate": 0.0002318019460524695, "loss": 7.4521, "step": 4360 }, { "epoch": 0.5368045297882816, "grad_norm": 0.08759962022304535, "learning_rate": 0.0002317403621135608, "loss": 7.5544, "step": 4361 }, { "epoch": 0.5369276218611522, "grad_norm": 0.10429254174232483, "learning_rate": 0.00023167877817465205, "loss": 7.3206, "step": 4362 }, { "epoch": 0.5370507139340226, "grad_norm": 0.08376510441303253, "learning_rate": 0.0002316171942357433, "loss": 7.4811, "step": 4363 }, { "epoch": 0.5371738060068931, "grad_norm": 0.06914902478456497, "learning_rate": 0.0002315556102968346, "loss": 7.521, "step": 4364 }, { "epoch": 0.5372968980797637, "grad_norm": 0.08627024292945862, "learning_rate": 0.00023149402635792585, "loss": 7.4458, "step": 4365 }, { "epoch": 0.5374199901526342, "grad_norm": 0.10555458068847656, "learning_rate": 0.00023143244241901711, "loss": 7.7339, "step": 4366 }, { "epoch": 0.5375430822255047, "grad_norm": 0.05823018029332161, "learning_rate": 0.00023137085848010837, "loss": 7.4447, "step": 4367 }, { "epoch": 0.5376661742983752, "grad_norm": 0.10261614620685577, "learning_rate": 0.00023130927454119966, "loss": 7.3211, "step": 4368 }, { "epoch": 0.5377892663712457, "grad_norm": 0.0813305601477623, "learning_rate": 0.00023124769060229092, "loss": 7.5501, "step": 4369 }, { "epoch": 0.5379123584441162, "grad_norm": 0.08306136727333069, "learning_rate": 0.00023118610666338218, "loss": 7.8517, "step": 4370 }, { "epoch": 0.5380354505169868, "grad_norm": 0.09670713543891907, "learning_rate": 0.00023112452272447347, "loss": 7.4075, "step": 4371 }, { "epoch": 0.5381585425898572, "grad_norm": 0.06392183154821396, "learning_rate": 0.00023106293878556473, "loss": 7.7317, "step": 4372 }, { "epoch": 0.5382816346627277, "grad_norm": 0.10673947632312775, "learning_rate": 0.00023100135484665599, "loss": 7.5025, "step": 4373 }, { "epoch": 
0.5384047267355982, "grad_norm": 0.07741989195346832, "learning_rate": 0.00023093977090774727, "loss": 7.7394, "step": 4374 }, { "epoch": 0.5385278188084688, "grad_norm": 0.14273200929164886, "learning_rate": 0.00023087818696883853, "loss": 7.1702, "step": 4375 }, { "epoch": 0.5386509108813392, "grad_norm": 0.08277490735054016, "learning_rate": 0.0002308166030299298, "loss": 7.864, "step": 4376 }, { "epoch": 0.5387740029542097, "grad_norm": 0.06778346747159958, "learning_rate": 0.00023075501909102105, "loss": 7.3712, "step": 4377 }, { "epoch": 0.5388970950270803, "grad_norm": 0.06080878525972366, "learning_rate": 0.00023069343515211234, "loss": 7.6702, "step": 4378 }, { "epoch": 0.5390201870999508, "grad_norm": 0.1944577544927597, "learning_rate": 0.0002306318512132036, "loss": 8.5345, "step": 4379 }, { "epoch": 0.5391432791728212, "grad_norm": 0.11328531056642532, "learning_rate": 0.00023057026727429486, "loss": 7.5114, "step": 4380 }, { "epoch": 0.5392663712456918, "grad_norm": 0.09249760210514069, "learning_rate": 0.00023050868333538614, "loss": 7.7069, "step": 4381 }, { "epoch": 0.5393894633185623, "grad_norm": 0.05751444771885872, "learning_rate": 0.0002304470993964774, "loss": 7.4023, "step": 4382 }, { "epoch": 0.5395125553914328, "grad_norm": 0.07557331770658493, "learning_rate": 0.00023038551545756866, "loss": 7.9464, "step": 4383 }, { "epoch": 0.5396356474643033, "grad_norm": 0.0993584394454956, "learning_rate": 0.00023032393151865995, "loss": 7.445, "step": 4384 }, { "epoch": 0.5397587395371738, "grad_norm": 0.42667579650878906, "learning_rate": 0.0002302623475797512, "loss": 9.4332, "step": 4385 }, { "epoch": 0.5398818316100443, "grad_norm": 0.10299160331487656, "learning_rate": 0.00023020076364084247, "loss": 8.0068, "step": 4386 }, { "epoch": 0.5400049236829149, "grad_norm": 0.06486932188272476, "learning_rate": 0.00023013917970193373, "loss": 7.8019, "step": 4387 }, { "epoch": 0.5401280157557853, "grad_norm": 0.07080692052841187, "learning_rate": 
0.000230077595763025, "loss": 7.9815, "step": 4388 }, { "epoch": 0.5402511078286558, "grad_norm": 0.18914921581745148, "learning_rate": 0.00023001601182411627, "loss": 7.8317, "step": 4389 }, { "epoch": 0.5403741999015264, "grad_norm": 0.10881850868463516, "learning_rate": 0.00022995442788520753, "loss": 7.5013, "step": 4390 }, { "epoch": 0.5404972919743969, "grad_norm": 0.06857867538928986, "learning_rate": 0.00022989284394629882, "loss": 7.6743, "step": 4391 }, { "epoch": 0.5406203840472673, "grad_norm": 0.20719392597675323, "learning_rate": 0.00022983126000739008, "loss": 7.5954, "step": 4392 }, { "epoch": 0.5407434761201378, "grad_norm": 0.09894908219575882, "learning_rate": 0.00022976967606848134, "loss": 7.5368, "step": 4393 }, { "epoch": 0.5408665681930084, "grad_norm": 0.07546405494213104, "learning_rate": 0.0002297080921295726, "loss": 7.3662, "step": 4394 }, { "epoch": 0.5409896602658789, "grad_norm": 0.21255697309970856, "learning_rate": 0.00022964650819066388, "loss": 8.4841, "step": 4395 }, { "epoch": 0.5411127523387493, "grad_norm": 0.24556607007980347, "learning_rate": 0.00022958492425175514, "loss": 8.6003, "step": 4396 }, { "epoch": 0.5412358444116199, "grad_norm": 0.2317822426557541, "learning_rate": 0.0002295233403128464, "loss": 8.3803, "step": 4397 }, { "epoch": 0.5413589364844904, "grad_norm": 0.1444755643606186, "learning_rate": 0.0002294617563739377, "loss": 7.3298, "step": 4398 }, { "epoch": 0.5414820285573609, "grad_norm": 0.08407875895500183, "learning_rate": 0.00022940017243502895, "loss": 7.781, "step": 4399 }, { "epoch": 0.5416051206302314, "grad_norm": 0.08484244346618652, "learning_rate": 0.0002293385884961202, "loss": 7.9782, "step": 4400 }, { "epoch": 0.5417282127031019, "grad_norm": 0.07726749032735825, "learning_rate": 0.0002292770045572115, "loss": 7.8622, "step": 4401 }, { "epoch": 0.5418513047759724, "grad_norm": 0.09609339386224747, "learning_rate": 0.00022921542061830275, "loss": 7.5025, "step": 4402 }, { "epoch": 
0.541974396848843, "grad_norm": 0.10808176547288895, "learning_rate": 0.00022915383667939401, "loss": 7.5149, "step": 4403 }, { "epoch": 0.5420974889217134, "grad_norm": 0.07440189272165298, "learning_rate": 0.00022909225274048527, "loss": 7.8466, "step": 4404 }, { "epoch": 0.5422205809945839, "grad_norm": 0.11070994287729263, "learning_rate": 0.00022903066880157656, "loss": 7.5189, "step": 4405 }, { "epoch": 0.5423436730674545, "grad_norm": 0.12918110191822052, "learning_rate": 0.00022896908486266782, "loss": 7.6518, "step": 4406 }, { "epoch": 0.542466765140325, "grad_norm": 0.09348762035369873, "learning_rate": 0.00022890750092375908, "loss": 7.4754, "step": 4407 }, { "epoch": 0.5425898572131955, "grad_norm": 0.15142853558063507, "learning_rate": 0.00022884591698485037, "loss": 8.4558, "step": 4408 }, { "epoch": 0.542712949286066, "grad_norm": 0.08160650730133057, "learning_rate": 0.00022878433304594162, "loss": 7.4591, "step": 4409 }, { "epoch": 0.5428360413589365, "grad_norm": 0.10789689421653748, "learning_rate": 0.00022872274910703288, "loss": 7.6868, "step": 4410 }, { "epoch": 0.542959133431807, "grad_norm": 0.08786112815141678, "learning_rate": 0.00022866116516812417, "loss": 7.6162, "step": 4411 }, { "epoch": 0.5430822255046776, "grad_norm": 0.13992510735988617, "learning_rate": 0.00022859958122921543, "loss": 7.8877, "step": 4412 }, { "epoch": 0.543205317577548, "grad_norm": 0.13513857126235962, "learning_rate": 0.0002285379972903067, "loss": 7.7195, "step": 4413 }, { "epoch": 0.5433284096504185, "grad_norm": 0.08917051553726196, "learning_rate": 0.00022847641335139795, "loss": 7.2282, "step": 4414 }, { "epoch": 0.543451501723289, "grad_norm": 0.07218292355537415, "learning_rate": 0.00022841482941248924, "loss": 7.3742, "step": 4415 }, { "epoch": 0.5435745937961596, "grad_norm": 0.10058338195085526, "learning_rate": 0.0002283532454735805, "loss": 7.8209, "step": 4416 }, { "epoch": 0.54369768586903, "grad_norm": 0.07391609996557236, "learning_rate": 
0.00022829166153467176, "loss": 7.4106, "step": 4417 }, { "epoch": 0.5438207779419005, "grad_norm": 0.09320231527090073, "learning_rate": 0.00022823007759576304, "loss": 7.5538, "step": 4418 }, { "epoch": 0.5439438700147711, "grad_norm": 0.18493275344371796, "learning_rate": 0.0002281684936568543, "loss": 8.3377, "step": 4419 }, { "epoch": 0.5440669620876416, "grad_norm": 0.10002464056015015, "learning_rate": 0.00022810690971794556, "loss": 7.498, "step": 4420 }, { "epoch": 0.544190054160512, "grad_norm": null, "learning_rate": 0.00022804532577903682, "loss": 7.9179, "step": 4421 }, { "epoch": 0.5443131462333826, "grad_norm": 0.12103809416294098, "learning_rate": 0.0002279837418401281, "loss": 8.2657, "step": 4422 }, { "epoch": 0.5444362383062531, "grad_norm": 0.2663402557373047, "learning_rate": 0.00022792215790121937, "loss": 7.6705, "step": 4423 }, { "epoch": 0.5445593303791236, "grad_norm": 0.22898255288600922, "learning_rate": 0.00022786057396231063, "loss": 8.8044, "step": 4424 }, { "epoch": 0.544682422451994, "grad_norm": 0.15967100858688354, "learning_rate": 0.0002277989900234019, "loss": 7.9439, "step": 4425 }, { "epoch": 0.5448055145248646, "grad_norm": 0.10162489861249924, "learning_rate": 0.00022773740608449317, "loss": 7.6775, "step": 4426 }, { "epoch": 0.5449286065977351, "grad_norm": 0.32432419061660767, "learning_rate": 0.00022767582214558443, "loss": 8.8009, "step": 4427 }, { "epoch": 0.5450516986706057, "grad_norm": 0.10819312185049057, "learning_rate": 0.00022761423820667572, "loss": 7.4182, "step": 4428 }, { "epoch": 0.5451747907434761, "grad_norm": 0.08559369295835495, "learning_rate": 0.00022755265426776698, "loss": 7.6458, "step": 4429 }, { "epoch": 0.5452978828163466, "grad_norm": 0.10932519286870956, "learning_rate": 0.00022749107032885824, "loss": 7.7732, "step": 4430 }, { "epoch": 0.5454209748892171, "grad_norm": 0.18520058691501617, "learning_rate": 0.0002274294863899495, "loss": 7.3952, "step": 4431 }, { "epoch": 0.5455440669620877, 
"grad_norm": 0.09294065088033676, "learning_rate": 0.00022736790245104078, "loss": 8.1114, "step": 4432 }, { "epoch": 0.5456671590349581, "grad_norm": 0.15369243919849396, "learning_rate": 0.00022730631851213204, "loss": 8.0051, "step": 4433 }, { "epoch": 0.5457902511078286, "grad_norm": 0.09941710531711578, "learning_rate": 0.0002272447345732233, "loss": 7.7252, "step": 4434 }, { "epoch": 0.5459133431806992, "grad_norm": 0.14583678543567657, "learning_rate": 0.0002271831506343146, "loss": 8.1897, "step": 4435 }, { "epoch": 0.5460364352535697, "grad_norm": 0.18653950095176697, "learning_rate": 0.00022712156669540585, "loss": 7.4047, "step": 4436 }, { "epoch": 0.5461595273264401, "grad_norm": 0.11997897177934647, "learning_rate": 0.0002270599827564971, "loss": 8.2032, "step": 4437 }, { "epoch": 0.5462826193993107, "grad_norm": 0.1783941686153412, "learning_rate": 0.0002269983988175884, "loss": 7.6881, "step": 4438 }, { "epoch": 0.5464057114721812, "grad_norm": 0.12716299295425415, "learning_rate": 0.00022693681487867965, "loss": 7.8102, "step": 4439 }, { "epoch": 0.5465288035450517, "grad_norm": 0.155097097158432, "learning_rate": 0.0002268752309397709, "loss": 7.4377, "step": 4440 }, { "epoch": 0.5466518956179222, "grad_norm": 0.18681955337524414, "learning_rate": 0.00022681364700086217, "loss": 7.2877, "step": 4441 }, { "epoch": 0.5467749876907927, "grad_norm": 0.16868473589420319, "learning_rate": 0.00022675206306195346, "loss": 7.6969, "step": 4442 }, { "epoch": 0.5468980797636632, "grad_norm": 0.13734667003154755, "learning_rate": 0.00022669047912304472, "loss": 7.3042, "step": 4443 }, { "epoch": 0.5470211718365338, "grad_norm": 0.12115469574928284, "learning_rate": 0.00022662889518413598, "loss": 7.2818, "step": 4444 }, { "epoch": 0.5471442639094042, "grad_norm": 0.09313696622848511, "learning_rate": 0.00022656731124522726, "loss": 7.4752, "step": 4445 }, { "epoch": 0.5472673559822747, "grad_norm": 0.13669495284557343, "learning_rate": 0.00022650572730631852, 
"loss": 8.1261, "step": 4446 }, { "epoch": 0.5473904480551453, "grad_norm": 0.1906062811613083, "learning_rate": 0.00022644414336740978, "loss": 7.6899, "step": 4447 }, { "epoch": 0.5475135401280158, "grad_norm": 0.12408885359764099, "learning_rate": 0.00022638255942850104, "loss": 7.7166, "step": 4448 }, { "epoch": 0.5476366322008862, "grad_norm": 0.1367480307817459, "learning_rate": 0.00022632097548959233, "loss": 7.6995, "step": 4449 }, { "epoch": 0.5477597242737567, "grad_norm": 0.08230333030223846, "learning_rate": 0.0002262593915506836, "loss": 7.7434, "step": 4450 }, { "epoch": 0.5478828163466273, "grad_norm": 0.1392713487148285, "learning_rate": 0.00022619780761177485, "loss": 7.4356, "step": 4451 }, { "epoch": 0.5480059084194978, "grad_norm": 0.15492463111877441, "learning_rate": 0.00022613622367286614, "loss": 7.5015, "step": 4452 }, { "epoch": 0.5481290004923683, "grad_norm": 0.0900038480758667, "learning_rate": 0.0002260746397339574, "loss": 7.4738, "step": 4453 }, { "epoch": 0.5482520925652388, "grad_norm": 0.09938862919807434, "learning_rate": 0.00022601305579504865, "loss": 7.4458, "step": 4454 }, { "epoch": 0.5483751846381093, "grad_norm": 0.16748592257499695, "learning_rate": 0.00022595147185613994, "loss": 7.172, "step": 4455 }, { "epoch": 0.5484982767109798, "grad_norm": 0.12970228493213654, "learning_rate": 0.0002258898879172312, "loss": 8.3001, "step": 4456 }, { "epoch": 0.5486213687838504, "grad_norm": 0.10052957385778427, "learning_rate": 0.00022582830397832246, "loss": 7.6091, "step": 4457 }, { "epoch": 0.5487444608567208, "grad_norm": 0.08910771459341049, "learning_rate": 0.00022576672003941372, "loss": 7.4582, "step": 4458 }, { "epoch": 0.5488675529295913, "grad_norm": 0.4941125214099884, "learning_rate": 0.000225705136100505, "loss": 9.3453, "step": 4459 }, { "epoch": 0.5489906450024619, "grad_norm": 0.07281889021396637, "learning_rate": 0.00022564355216159627, "loss": 7.4938, "step": 4460 }, { "epoch": 0.5491137370753324, "grad_norm": 
0.0856264978647232, "learning_rate": 0.00022558196822268753, "loss": 7.3604, "step": 4461 }, { "epoch": 0.5492368291482028, "grad_norm": 0.11078678071498871, "learning_rate": 0.0002255203842837788, "loss": 7.3899, "step": 4462 }, { "epoch": 0.5493599212210734, "grad_norm": 0.11004944145679474, "learning_rate": 0.00022545880034487007, "loss": 7.449, "step": 4463 }, { "epoch": 0.5494830132939439, "grad_norm": 0.11373826861381531, "learning_rate": 0.00022539721640596133, "loss": 7.5443, "step": 4464 }, { "epoch": 0.5496061053668144, "grad_norm": 0.07217881083488464, "learning_rate": 0.0002253356324670526, "loss": 7.3853, "step": 4465 }, { "epoch": 0.5497291974396848, "grad_norm": 0.1565328985452652, "learning_rate": 0.00022527404852814388, "loss": 8.4249, "step": 4466 }, { "epoch": 0.5498522895125554, "grad_norm": 0.09852331876754761, "learning_rate": 0.00022521246458923514, "loss": 7.5731, "step": 4467 }, { "epoch": 0.5499753815854259, "grad_norm": 0.16339389979839325, "learning_rate": 0.0002251508806503264, "loss": 7.8518, "step": 4468 }, { "epoch": 0.5500984736582964, "grad_norm": 0.09712909162044525, "learning_rate": 0.00022508929671141768, "loss": 7.4824, "step": 4469 }, { "epoch": 0.5502215657311669, "grad_norm": 0.19716833531856537, "learning_rate": 0.00022502771277250894, "loss": 8.5944, "step": 4470 }, { "epoch": 0.5503446578040374, "grad_norm": 0.10815032571554184, "learning_rate": 0.0002249661288336002, "loss": 7.8442, "step": 4471 }, { "epoch": 0.5504677498769079, "grad_norm": 0.23438915610313416, "learning_rate": 0.0002249045448946915, "loss": 9.0466, "step": 4472 }, { "epoch": 0.5505908419497785, "grad_norm": 0.14941291511058807, "learning_rate": 0.00022484296095578275, "loss": 7.8903, "step": 4473 }, { "epoch": 0.5507139340226489, "grad_norm": 0.15633024275302887, "learning_rate": 0.000224781377016874, "loss": 7.3409, "step": 4474 }, { "epoch": 0.5508370260955194, "grad_norm": 0.12151550501585007, "learning_rate": 0.00022471979307796527, "loss": 7.4528, 
"step": 4475 }, { "epoch": 0.55096011816839, "grad_norm": 0.10890041291713715, "learning_rate": 0.00022465820913905655, "loss": 7.9693, "step": 4476 }, { "epoch": 0.5510832102412605, "grad_norm": 0.10740502923727036, "learning_rate": 0.0002245966252001478, "loss": 8.0545, "step": 4477 }, { "epoch": 0.5512063023141309, "grad_norm": 0.10973463207483292, "learning_rate": 0.00022453504126123907, "loss": 8.0708, "step": 4478 }, { "epoch": 0.5513293943870015, "grad_norm": 0.17043934762477875, "learning_rate": 0.00022447345732233036, "loss": 8.7713, "step": 4479 }, { "epoch": 0.551452486459872, "grad_norm": 0.09359019249677658, "learning_rate": 0.00022441187338342162, "loss": 7.7582, "step": 4480 }, { "epoch": 0.5515755785327425, "grad_norm": 0.08355414867401123, "learning_rate": 0.00022435028944451288, "loss": 8.0321, "step": 4481 }, { "epoch": 0.551698670605613, "grad_norm": 0.22351376712322235, "learning_rate": 0.00022428870550560416, "loss": 7.2498, "step": 4482 }, { "epoch": 0.5518217626784835, "grad_norm": 0.11750993132591248, "learning_rate": 0.00022422712156669542, "loss": 7.865, "step": 4483 }, { "epoch": 0.551944854751354, "grad_norm": 0.14918656647205353, "learning_rate": 0.00022416553762778668, "loss": 8.43, "step": 4484 }, { "epoch": 0.5520679468242246, "grad_norm": 0.07215019315481186, "learning_rate": 0.00022410395368887794, "loss": 7.5752, "step": 4485 }, { "epoch": 0.552191038897095, "grad_norm": 0.12812013924121857, "learning_rate": 0.00022404236974996923, "loss": 7.4532, "step": 4486 }, { "epoch": 0.5523141309699655, "grad_norm": 0.09910460561513901, "learning_rate": 0.0002239807858110605, "loss": 7.6903, "step": 4487 }, { "epoch": 0.552437223042836, "grad_norm": 0.10586972534656525, "learning_rate": 0.00022391920187215175, "loss": 8.1435, "step": 4488 }, { "epoch": 0.5525603151157066, "grad_norm": 0.07749904692173004, "learning_rate": 0.00022385761793324303, "loss": 7.721, "step": 4489 }, { "epoch": 0.552683407188577, "grad_norm": 0.08926825225353241, 
"learning_rate": 0.0002237960339943343, "loss": 7.5241, "step": 4490 }, { "epoch": 0.5528064992614475, "grad_norm": 0.35022133588790894, "learning_rate": 0.00022373445005542555, "loss": 9.3035, "step": 4491 }, { "epoch": 0.5529295913343181, "grad_norm": 0.2034085988998413, "learning_rate": 0.0002236728661165168, "loss": 7.2841, "step": 4492 }, { "epoch": 0.5530526834071886, "grad_norm": 0.34849512577056885, "learning_rate": 0.0002236112821776081, "loss": 7.3566, "step": 4493 }, { "epoch": 0.553175775480059, "grad_norm": 0.20674841105937958, "learning_rate": 0.00022354969823869936, "loss": 7.8798, "step": 4494 }, { "epoch": 0.5532988675529296, "grad_norm": 0.11398623138666153, "learning_rate": 0.00022348811429979062, "loss": 7.4212, "step": 4495 }, { "epoch": 0.5534219596258001, "grad_norm": 0.15588834881782532, "learning_rate": 0.0002234265303608819, "loss": 8.0359, "step": 4496 }, { "epoch": 0.5535450516986706, "grad_norm": 0.10624371469020844, "learning_rate": 0.00022336494642197317, "loss": 7.5702, "step": 4497 }, { "epoch": 0.5536681437715412, "grad_norm": 0.07636088132858276, "learning_rate": 0.00022330336248306442, "loss": 7.4557, "step": 4498 }, { "epoch": 0.5537912358444116, "grad_norm": 0.16025365889072418, "learning_rate": 0.0002232417785441557, "loss": 7.4124, "step": 4499 }, { "epoch": 0.5539143279172821, "grad_norm": 0.08097649365663528, "learning_rate": 0.00022318019460524697, "loss": 7.2052, "step": 4500 }, { "epoch": 0.5540374199901527, "grad_norm": 0.20932230353355408, "learning_rate": 0.00022311861066633823, "loss": 8.3388, "step": 4501 }, { "epoch": 0.5541605120630232, "grad_norm": 0.0895649716258049, "learning_rate": 0.0002230570267274295, "loss": 7.37, "step": 4502 }, { "epoch": 0.5542836041358936, "grad_norm": 0.0724194273352623, "learning_rate": 0.00022299544278852078, "loss": 7.4145, "step": 4503 }, { "epoch": 0.5544066962087641, "grad_norm": 0.1215469017624855, "learning_rate": 0.00022293385884961204, "loss": 7.4988, "step": 4504 }, { 
"epoch": 0.5545297882816347, "grad_norm": 0.08158242702484131, "learning_rate": 0.0002228722749107033, "loss": 7.5767, "step": 4505 }, { "epoch": 0.5546528803545052, "grad_norm": 0.16879765689373016, "learning_rate": 0.00022281069097179458, "loss": 7.6876, "step": 4506 }, { "epoch": 0.5547759724273756, "grad_norm": 0.09604304283857346, "learning_rate": 0.00022274910703288584, "loss": 7.6237, "step": 4507 }, { "epoch": 0.5548990645002462, "grad_norm": 0.06880096346139908, "learning_rate": 0.0002226875230939771, "loss": 7.6398, "step": 4508 }, { "epoch": 0.5550221565731167, "grad_norm": 0.07437476515769958, "learning_rate": 0.0002226259391550684, "loss": 7.47, "step": 4509 }, { "epoch": 0.5551452486459872, "grad_norm": 0.10853934288024902, "learning_rate": 0.00022256435521615965, "loss": 7.6806, "step": 4510 }, { "epoch": 0.5552683407188577, "grad_norm": 0.10582578927278519, "learning_rate": 0.0002225027712772509, "loss": 8.1952, "step": 4511 }, { "epoch": 0.5553914327917282, "grad_norm": 0.10756754875183105, "learning_rate": 0.00022244118733834217, "loss": 7.6372, "step": 4512 }, { "epoch": 0.5555145248645987, "grad_norm": 0.09527909755706787, "learning_rate": 0.00022237960339943345, "loss": 7.7633, "step": 4513 }, { "epoch": 0.5556376169374693, "grad_norm": 0.09210743755102158, "learning_rate": 0.0002223180194605247, "loss": 7.7074, "step": 4514 }, { "epoch": 0.5557607090103397, "grad_norm": 0.12336267530918121, "learning_rate": 0.00022225643552161597, "loss": 7.3539, "step": 4515 }, { "epoch": 0.5558838010832102, "grad_norm": 0.11383365094661713, "learning_rate": 0.00022219485158270726, "loss": 7.9033, "step": 4516 }, { "epoch": 0.5560068931560808, "grad_norm": 0.13325288891792297, "learning_rate": 0.00022213326764379852, "loss": 7.7167, "step": 4517 }, { "epoch": 0.5561299852289513, "grad_norm": 0.16534379124641418, "learning_rate": 0.00022207168370488978, "loss": 7.2675, "step": 4518 }, { "epoch": 0.5562530773018217, "grad_norm": 0.30182257294654846, 
"learning_rate": 0.00022201009976598104, "loss": 8.9272, "step": 4519 }, { "epoch": 0.5563761693746923, "grad_norm": 0.15241359174251556, "learning_rate": 0.00022194851582707232, "loss": 7.8049, "step": 4520 }, { "epoch": 0.5564992614475628, "grad_norm": 0.15040269494056702, "learning_rate": 0.00022188693188816358, "loss": 8.1925, "step": 4521 }, { "epoch": 0.5566223535204333, "grad_norm": 0.20979663729667664, "learning_rate": 0.00022182534794925484, "loss": 7.4061, "step": 4522 }, { "epoch": 0.5567454455933037, "grad_norm": 0.12229378521442413, "learning_rate": 0.00022176376401034613, "loss": 8.5457, "step": 4523 }, { "epoch": 0.5568685376661743, "grad_norm": 0.2131873518228531, "learning_rate": 0.0002217021800714374, "loss": 7.3675, "step": 4524 }, { "epoch": 0.5569916297390448, "grad_norm": 0.15271641314029694, "learning_rate": 0.00022164059613252865, "loss": 7.6045, "step": 4525 }, { "epoch": 0.5571147218119153, "grad_norm": 0.08749763667583466, "learning_rate": 0.00022157901219361993, "loss": 7.903, "step": 4526 }, { "epoch": 0.5572378138847858, "grad_norm": 0.08158653974533081, "learning_rate": 0.0002215174282547112, "loss": 7.6395, "step": 4527 }, { "epoch": 0.5573609059576563, "grad_norm": 0.07640524208545685, "learning_rate": 0.00022145584431580245, "loss": 7.5072, "step": 4528 }, { "epoch": 0.5574839980305268, "grad_norm": 0.11684459447860718, "learning_rate": 0.00022139426037689369, "loss": 7.6066, "step": 4529 }, { "epoch": 0.5576070901033974, "grad_norm": 0.08922635018825531, "learning_rate": 0.00022133267643798497, "loss": 7.4854, "step": 4530 }, { "epoch": 0.5577301821762678, "grad_norm": 0.6797195076942444, "learning_rate": 0.00022127109249907623, "loss": 9.3924, "step": 4531 }, { "epoch": 0.5578532742491383, "grad_norm": 0.10172024369239807, "learning_rate": 0.0002212095085601675, "loss": 7.2458, "step": 4532 }, { "epoch": 0.5579763663220089, "grad_norm": 0.1186106726527214, "learning_rate": 0.00022114792462125878, "loss": 7.8157, "step": 4533 }, { 
"epoch": 0.5580994583948794, "grad_norm": 0.12236058712005615, "learning_rate": 0.00022108634068235004, "loss": 7.6733, "step": 4534 }, { "epoch": 0.5582225504677498, "grad_norm": 0.1374215930700302, "learning_rate": 0.0002210247567434413, "loss": 8.5077, "step": 4535 }, { "epoch": 0.5583456425406204, "grad_norm": 0.14625254273414612, "learning_rate": 0.00022096317280453256, "loss": 7.4117, "step": 4536 }, { "epoch": 0.5584687346134909, "grad_norm": 0.10401518642902374, "learning_rate": 0.00022090158886562384, "loss": 7.9437, "step": 4537 }, { "epoch": 0.5585918266863614, "grad_norm": 0.07626180350780487, "learning_rate": 0.0002208400049267151, "loss": 7.4835, "step": 4538 }, { "epoch": 0.558714918759232, "grad_norm": 0.31887564063072205, "learning_rate": 0.00022077842098780636, "loss": 8.8787, "step": 4539 }, { "epoch": 0.5588380108321024, "grad_norm": 0.07162728160619736, "learning_rate": 0.00022071683704889765, "loss": 7.9738, "step": 4540 }, { "epoch": 0.5589611029049729, "grad_norm": 0.1406102031469345, "learning_rate": 0.0002206552531099889, "loss": 7.8204, "step": 4541 }, { "epoch": 0.5590841949778435, "grad_norm": 0.1075543686747551, "learning_rate": 0.00022059366917108017, "loss": 7.9961, "step": 4542 }, { "epoch": 0.559207287050714, "grad_norm": 0.08806148916482925, "learning_rate": 0.00022053208523217145, "loss": 7.6572, "step": 4543 }, { "epoch": 0.5593303791235844, "grad_norm": 0.10977771878242493, "learning_rate": 0.00022047050129326271, "loss": 8.3804, "step": 4544 }, { "epoch": 0.5594534711964549, "grad_norm": 0.1431640386581421, "learning_rate": 0.00022040891735435397, "loss": 7.5868, "step": 4545 }, { "epoch": 0.5595765632693255, "grad_norm": 0.13165952265262604, "learning_rate": 0.00022034733341544523, "loss": 7.7269, "step": 4546 }, { "epoch": 0.559699655342196, "grad_norm": 0.07595368474721909, "learning_rate": 0.00022028574947653652, "loss": 8.1408, "step": 4547 }, { "epoch": 0.5598227474150664, "grad_norm": 0.06881158798933029, 
"learning_rate": 0.00022022416553762778, "loss": 8.0547, "step": 4548 }, { "epoch": 0.559945839487937, "grad_norm": 0.12378164380788803, "learning_rate": 0.00022016258159871904, "loss": 7.5731, "step": 4549 }, { "epoch": 0.5600689315608075, "grad_norm": 0.11604265123605728, "learning_rate": 0.00022010099765981032, "loss": 7.599, "step": 4550 }, { "epoch": 0.560192023633678, "grad_norm": 0.1835881471633911, "learning_rate": 0.00022003941372090158, "loss": 7.519, "step": 4551 }, { "epoch": 0.5603151157065485, "grad_norm": 0.10108360648155212, "learning_rate": 0.00021997782978199284, "loss": 7.3105, "step": 4552 }, { "epoch": 0.560438207779419, "grad_norm": 0.09449926763772964, "learning_rate": 0.00021991624584308413, "loss": 7.6383, "step": 4553 }, { "epoch": 0.5605612998522895, "grad_norm": 0.1844738870859146, "learning_rate": 0.0002198546619041754, "loss": 7.7072, "step": 4554 }, { "epoch": 0.5606843919251601, "grad_norm": 0.18481141328811646, "learning_rate": 0.00021979307796526665, "loss": 7.8235, "step": 4555 }, { "epoch": 0.5608074839980305, "grad_norm": 0.16096030175685883, "learning_rate": 0.0002197314940263579, "loss": 7.9004, "step": 4556 }, { "epoch": 0.560930576070901, "grad_norm": 0.08387553691864014, "learning_rate": 0.0002196699100874492, "loss": 7.3422, "step": 4557 }, { "epoch": 0.5610536681437716, "grad_norm": 0.09487216174602509, "learning_rate": 0.00021960832614854046, "loss": 7.6245, "step": 4558 }, { "epoch": 0.5611767602166421, "grad_norm": 0.09158190339803696, "learning_rate": 0.00021954674220963171, "loss": 7.4351, "step": 4559 }, { "epoch": 0.5612998522895125, "grad_norm": 0.09634822607040405, "learning_rate": 0.000219485158270723, "loss": 7.5975, "step": 4560 }, { "epoch": 0.561422944362383, "grad_norm": 0.09849843382835388, "learning_rate": 0.00021942357433181426, "loss": 7.8147, "step": 4561 }, { "epoch": 0.5615460364352536, "grad_norm": 0.06792136281728745, "learning_rate": 0.00021936199039290552, "loss": 7.6012, "step": 4562 }, { 
"epoch": 0.5616691285081241, "grad_norm": 0.09529685229063034, "learning_rate": 0.00021930040645399678, "loss": 7.6223, "step": 4563 }, { "epoch": 0.5617922205809945, "grad_norm": 0.09596133232116699, "learning_rate": 0.00021923882251508807, "loss": 7.4705, "step": 4564 }, { "epoch": 0.5619153126538651, "grad_norm": 0.14035360515117645, "learning_rate": 0.00021917723857617933, "loss": 7.9241, "step": 4565 }, { "epoch": 0.5620384047267356, "grad_norm": 0.07459201663732529, "learning_rate": 0.00021911565463727059, "loss": 7.4483, "step": 4566 }, { "epoch": 0.5621614967996061, "grad_norm": 0.1749979704618454, "learning_rate": 0.00021905407069836187, "loss": 8.4534, "step": 4567 }, { "epoch": 0.5622845888724766, "grad_norm": 0.41898906230926514, "learning_rate": 0.00021899248675945313, "loss": 9.4355, "step": 4568 }, { "epoch": 0.5624076809453471, "grad_norm": 0.15870870649814606, "learning_rate": 0.0002189309028205444, "loss": 8.0446, "step": 4569 }, { "epoch": 0.5625307730182176, "grad_norm": 0.15239913761615753, "learning_rate": 0.00021886931888163568, "loss": 7.6297, "step": 4570 }, { "epoch": 0.5626538650910882, "grad_norm": 0.12156452238559723, "learning_rate": 0.00021880773494272694, "loss": 7.5191, "step": 4571 }, { "epoch": 0.5627769571639586, "grad_norm": 0.07392920553684235, "learning_rate": 0.0002187461510038182, "loss": 7.9199, "step": 4572 }, { "epoch": 0.5629000492368291, "grad_norm": 0.16609184443950653, "learning_rate": 0.00021868456706490946, "loss": 7.3531, "step": 4573 }, { "epoch": 0.5630231413096997, "grad_norm": 0.11623703688383102, "learning_rate": 0.00021862298312600074, "loss": 7.669, "step": 4574 }, { "epoch": 0.5631462333825702, "grad_norm": 0.10854785144329071, "learning_rate": 0.000218561399187092, "loss": 7.2575, "step": 4575 }, { "epoch": 0.5632693254554406, "grad_norm": 0.09625726193189621, "learning_rate": 0.00021849981524818326, "loss": 7.7689, "step": 4576 }, { "epoch": 0.5633924175283112, "grad_norm": 0.15363848209381104, 
"learning_rate": 0.00021843823130927455, "loss": 7.3748, "step": 4577 }, { "epoch": 0.5635155096011817, "grad_norm": 0.09168659150600433, "learning_rate": 0.0002183766473703658, "loss": 7.4668, "step": 4578 }, { "epoch": 0.5636386016740522, "grad_norm": 0.1657167375087738, "learning_rate": 0.00021831506343145707, "loss": 7.7555, "step": 4579 }, { "epoch": 0.5637616937469226, "grad_norm": 0.09649831801652908, "learning_rate": 0.00021825347949254835, "loss": 7.4365, "step": 4580 }, { "epoch": 0.5638847858197932, "grad_norm": 0.1569208949804306, "learning_rate": 0.0002181918955536396, "loss": 8.297, "step": 4581 }, { "epoch": 0.5640078778926637, "grad_norm": 0.08755429834127426, "learning_rate": 0.00021813031161473087, "loss": 7.7097, "step": 4582 }, { "epoch": 0.5641309699655342, "grad_norm": 0.1365918666124344, "learning_rate": 0.00021806872767582213, "loss": 7.3202, "step": 4583 }, { "epoch": 0.5642540620384048, "grad_norm": 0.19874484837055206, "learning_rate": 0.00021800714373691342, "loss": 7.8577, "step": 4584 }, { "epoch": 0.5643771541112752, "grad_norm": 0.1897747814655304, "learning_rate": 0.00021794555979800468, "loss": 7.6728, "step": 4585 }, { "epoch": 0.5645002461841457, "grad_norm": 0.1872389018535614, "learning_rate": 0.00021788397585909594, "loss": 7.2734, "step": 4586 }, { "epoch": 0.5646233382570163, "grad_norm": 0.11576341837644577, "learning_rate": 0.00021782239192018722, "loss": 7.6519, "step": 4587 }, { "epoch": 0.5647464303298868, "grad_norm": 0.058960042893886566, "learning_rate": 0.00021776080798127848, "loss": 7.4977, "step": 4588 }, { "epoch": 0.5648695224027572, "grad_norm": 0.10714416205883026, "learning_rate": 0.00021769922404236974, "loss": 7.7142, "step": 4589 }, { "epoch": 0.5649926144756278, "grad_norm": 0.13018524646759033, "learning_rate": 0.000217637640103461, "loss": 7.6688, "step": 4590 }, { "epoch": 0.5651157065484983, "grad_norm": 0.14521442353725433, "learning_rate": 0.0002175760561645523, "loss": 7.7007, "step": 4591 }, { 
"epoch": 0.5652387986213688, "grad_norm": 0.10828734934329987, "learning_rate": 0.00021751447222564355, "loss": 8.149, "step": 4592 }, { "epoch": 0.5653618906942393, "grad_norm": 0.08298609405755997, "learning_rate": 0.0002174528882867348, "loss": 7.7691, "step": 4593 }, { "epoch": 0.5654849827671098, "grad_norm": 0.07556815445423126, "learning_rate": 0.0002173913043478261, "loss": 7.645, "step": 4594 }, { "epoch": 0.5656080748399803, "grad_norm": 0.13715073466300964, "learning_rate": 0.00021732972040891735, "loss": 7.5456, "step": 4595 }, { "epoch": 0.5657311669128509, "grad_norm": 0.09141524881124496, "learning_rate": 0.00021726813647000861, "loss": 8.176, "step": 4596 }, { "epoch": 0.5658542589857213, "grad_norm": 0.155990332365036, "learning_rate": 0.0002172065525310999, "loss": 7.6355, "step": 4597 }, { "epoch": 0.5659773510585918, "grad_norm": 0.09975229203701019, "learning_rate": 0.00021714496859219116, "loss": 7.535, "step": 4598 }, { "epoch": 0.5661004431314623, "grad_norm": 0.12382209300994873, "learning_rate": 0.00021708338465328242, "loss": 7.718, "step": 4599 }, { "epoch": 0.5662235352043329, "grad_norm": 0.14515414834022522, "learning_rate": 0.00021702180071437368, "loss": 7.6863, "step": 4600 }, { "epoch": 0.5663466272772033, "grad_norm": 0.152572900056839, "learning_rate": 0.00021696021677546497, "loss": 7.518, "step": 4601 }, { "epoch": 0.5664697193500738, "grad_norm": 0.10332351177930832, "learning_rate": 0.00021689863283655623, "loss": 7.3429, "step": 4602 }, { "epoch": 0.5665928114229444, "grad_norm": 0.08870221674442291, "learning_rate": 0.00021683704889764748, "loss": 7.3082, "step": 4603 }, { "epoch": 0.5667159034958149, "grad_norm": 0.13364635407924652, "learning_rate": 0.00021677546495873877, "loss": 7.6285, "step": 4604 }, { "epoch": 0.5668389955686853, "grad_norm": 0.08208437263965607, "learning_rate": 0.00021671388101983003, "loss": 7.5349, "step": 4605 }, { "epoch": 0.5669620876415559, "grad_norm": 0.09948737174272537, "learning_rate": 
0.0002166522970809213, "loss": 7.5787, "step": 4606 }, { "epoch": 0.5670851797144264, "grad_norm": 0.18121564388275146, "learning_rate": 0.00021659071314201255, "loss": 7.0583, "step": 4607 }, { "epoch": 0.5672082717872969, "grad_norm": 0.08905844390392303, "learning_rate": 0.00021652912920310384, "loss": 7.8758, "step": 4608 }, { "epoch": 0.5673313638601674, "grad_norm": 0.08787424862384796, "learning_rate": 0.0002164675452641951, "loss": 7.4608, "step": 4609 }, { "epoch": 0.5674544559330379, "grad_norm": 0.09088773280382156, "learning_rate": 0.00021640596132528636, "loss": 7.7584, "step": 4610 }, { "epoch": 0.5675775480059084, "grad_norm": 0.09836236387491226, "learning_rate": 0.00021634437738637764, "loss": 7.4615, "step": 4611 }, { "epoch": 0.567700640078779, "grad_norm": 0.0786668062210083, "learning_rate": 0.0002162827934474689, "loss": 7.572, "step": 4612 }, { "epoch": 0.5678237321516494, "grad_norm": 0.11266011744737625, "learning_rate": 0.00021622120950856016, "loss": 7.8042, "step": 4613 }, { "epoch": 0.5679468242245199, "grad_norm": 0.10745701938867569, "learning_rate": 0.00021615962556965145, "loss": 8.0918, "step": 4614 }, { "epoch": 0.5680699162973905, "grad_norm": 0.07158169150352478, "learning_rate": 0.0002160980416307427, "loss": 7.355, "step": 4615 }, { "epoch": 0.568193008370261, "grad_norm": 0.18982461094856262, "learning_rate": 0.00021603645769183397, "loss": 7.8966, "step": 4616 }, { "epoch": 0.5683161004431314, "grad_norm": 0.08919133245944977, "learning_rate": 0.00021597487375292523, "loss": 7.5219, "step": 4617 }, { "epoch": 0.568439192516002, "grad_norm": 0.09216555953025818, "learning_rate": 0.0002159132898140165, "loss": 7.2672, "step": 4618 }, { "epoch": 0.5685622845888725, "grad_norm": 0.08806459605693817, "learning_rate": 0.00021585170587510777, "loss": 7.9789, "step": 4619 }, { "epoch": 0.568685376661743, "grad_norm": 0.0807500034570694, "learning_rate": 0.00021579012193619903, "loss": 7.6922, "step": 4620 }, { "epoch": 
0.5688084687346134, "grad_norm": 0.14384135603904724, "learning_rate": 0.00021572853799729032, "loss": 8.0702, "step": 4621 }, { "epoch": 0.568931560807484, "grad_norm": 0.1453060507774353, "learning_rate": 0.00021566695405838158, "loss": 7.9527, "step": 4622 }, { "epoch": 0.5690546528803545, "grad_norm": 0.10890557616949081, "learning_rate": 0.00021560537011947284, "loss": 7.3761, "step": 4623 }, { "epoch": 0.569177744953225, "grad_norm": 0.09992636740207672, "learning_rate": 0.00021554378618056412, "loss": 7.4033, "step": 4624 }, { "epoch": 0.5693008370260956, "grad_norm": 0.16029910743236542, "learning_rate": 0.00021548220224165538, "loss": 8.0795, "step": 4625 }, { "epoch": 0.569423929098966, "grad_norm": 0.11967390030622482, "learning_rate": 0.00021542061830274664, "loss": 7.8725, "step": 4626 }, { "epoch": 0.5695470211718365, "grad_norm": 0.3162372410297394, "learning_rate": 0.0002153590343638379, "loss": 9.3012, "step": 4627 }, { "epoch": 0.5696701132447071, "grad_norm": 0.18171140551567078, "learning_rate": 0.0002152974504249292, "loss": 7.6732, "step": 4628 }, { "epoch": 0.5697932053175776, "grad_norm": 0.07538783550262451, "learning_rate": 0.00021523586648602045, "loss": 7.6051, "step": 4629 }, { "epoch": 0.569916297390448, "grad_norm": 0.21715956926345825, "learning_rate": 0.0002151742825471117, "loss": 8.0278, "step": 4630 }, { "epoch": 0.5700393894633186, "grad_norm": 0.08761396259069443, "learning_rate": 0.000215112698608203, "loss": 7.7021, "step": 4631 }, { "epoch": 0.5701624815361891, "grad_norm": 0.17186999320983887, "learning_rate": 0.00021505111466929425, "loss": 7.8794, "step": 4632 }, { "epoch": 0.5702855736090596, "grad_norm": 0.08037501573562622, "learning_rate": 0.0002149895307303855, "loss": 7.7004, "step": 4633 }, { "epoch": 0.57040866568193, "grad_norm": 0.27631592750549316, "learning_rate": 0.00021492794679147677, "loss": 8.9122, "step": 4634 }, { "epoch": 0.5705317577548006, "grad_norm": 0.07385114580392838, "learning_rate": 
0.00021486636285256806, "loss": 7.8294, "step": 4635 }, { "epoch": 0.5706548498276711, "grad_norm": 0.10905425250530243, "learning_rate": 0.00021480477891365932, "loss": 7.3232, "step": 4636 }, { "epoch": 0.5707779419005417, "grad_norm": 0.10576941072940826, "learning_rate": 0.00021474319497475058, "loss": 7.4482, "step": 4637 }, { "epoch": 0.5709010339734121, "grad_norm": 0.10071275383234024, "learning_rate": 0.00021468161103584187, "loss": 7.2647, "step": 4638 }, { "epoch": 0.5710241260462826, "grad_norm": 0.20986510813236237, "learning_rate": 0.00021462002709693312, "loss": 8.3464, "step": 4639 }, { "epoch": 0.5711472181191531, "grad_norm": 0.1493600606918335, "learning_rate": 0.00021455844315802438, "loss": 7.7312, "step": 4640 }, { "epoch": 0.5712703101920237, "grad_norm": 0.09261597692966461, "learning_rate": 0.00021449685921911567, "loss": 7.8219, "step": 4641 }, { "epoch": 0.5713934022648941, "grad_norm": 0.09301550686359406, "learning_rate": 0.00021443527528020693, "loss": 7.2902, "step": 4642 }, { "epoch": 0.5715164943377646, "grad_norm": 0.07080195099115372, "learning_rate": 0.0002143736913412982, "loss": 7.5483, "step": 4643 }, { "epoch": 0.5716395864106352, "grad_norm": 0.17374294996261597, "learning_rate": 0.00021431210740238945, "loss": 8.5384, "step": 4644 }, { "epoch": 0.5717626784835057, "grad_norm": 0.05580943822860718, "learning_rate": 0.00021425052346348074, "loss": 7.7389, "step": 4645 }, { "epoch": 0.5718857705563761, "grad_norm": 0.10046260058879852, "learning_rate": 0.000214188939524572, "loss": 7.8921, "step": 4646 }, { "epoch": 0.5720088626292467, "grad_norm": 0.1721193939447403, "learning_rate": 0.00021412735558566325, "loss": 7.4224, "step": 4647 }, { "epoch": 0.5721319547021172, "grad_norm": 0.17490321397781372, "learning_rate": 0.00021406577164675454, "loss": 7.4375, "step": 4648 }, { "epoch": 0.5722550467749877, "grad_norm": 0.15252628922462463, "learning_rate": 0.0002140041877078458, "loss": 7.471, "step": 4649 }, { "epoch": 
0.5723781388478582, "grad_norm": 0.06134847551584244, "learning_rate": 0.00021394260376893706, "loss": 7.8154, "step": 4650 }, { "epoch": 0.5725012309207287, "grad_norm": 0.17553113400936127, "learning_rate": 0.00021388101983002835, "loss": 7.1533, "step": 4651 }, { "epoch": 0.5726243229935992, "grad_norm": 0.068221315741539, "learning_rate": 0.0002138194358911196, "loss": 7.5573, "step": 4652 }, { "epoch": 0.5727474150664698, "grad_norm": 0.2951217293739319, "learning_rate": 0.00021375785195221087, "loss": 8.4767, "step": 4653 }, { "epoch": 0.5728705071393402, "grad_norm": 0.07962210476398468, "learning_rate": 0.00021369626801330213, "loss": 7.2115, "step": 4654 }, { "epoch": 0.5729935992122107, "grad_norm": 0.15993821620941162, "learning_rate": 0.0002136346840743934, "loss": 7.7542, "step": 4655 }, { "epoch": 0.5731166912850812, "grad_norm": 0.11727257817983627, "learning_rate": 0.00021357310013548467, "loss": 7.7804, "step": 4656 }, { "epoch": 0.5732397833579518, "grad_norm": 0.09555631875991821, "learning_rate": 0.00021351151619657593, "loss": 7.7498, "step": 4657 }, { "epoch": 0.5733628754308222, "grad_norm": 0.0815793052315712, "learning_rate": 0.00021344993225766722, "loss": 7.6412, "step": 4658 }, { "epoch": 0.5734859675036927, "grad_norm": 0.12318351119756699, "learning_rate": 0.00021338834831875848, "loss": 7.8761, "step": 4659 }, { "epoch": 0.5736090595765633, "grad_norm": 0.06908908486366272, "learning_rate": 0.00021332676437984974, "loss": 7.7773, "step": 4660 }, { "epoch": 0.5737321516494338, "grad_norm": 0.19958670437335968, "learning_rate": 0.000213265180440941, "loss": 7.3551, "step": 4661 }, { "epoch": 0.5738552437223042, "grad_norm": 0.18119001388549805, "learning_rate": 0.00021320359650203228, "loss": 7.3985, "step": 4662 }, { "epoch": 0.5739783357951748, "grad_norm": 0.17557267844676971, "learning_rate": 0.00021314201256312354, "loss": 7.5397, "step": 4663 }, { "epoch": 0.5741014278680453, "grad_norm": 0.2694307565689087, "learning_rate": 
0.0002130804286242148, "loss": 8.7121, "step": 4664 }, { "epoch": 0.5742245199409158, "grad_norm": 0.16389508545398712, "learning_rate": 0.0002130188446853061, "loss": 7.9557, "step": 4665 }, { "epoch": 0.5743476120137863, "grad_norm": 0.2068050503730774, "learning_rate": 0.00021295726074639735, "loss": 8.1862, "step": 4666 }, { "epoch": 0.5744707040866568, "grad_norm": 0.058507904410362244, "learning_rate": 0.0002128956768074886, "loss": 7.4688, "step": 4667 }, { "epoch": 0.5745937961595273, "grad_norm": 0.11594653129577637, "learning_rate": 0.0002128340928685799, "loss": 7.5345, "step": 4668 }, { "epoch": 0.5747168882323979, "grad_norm": 0.07805752009153366, "learning_rate": 0.00021277250892967115, "loss": 7.7228, "step": 4669 }, { "epoch": 0.5748399803052684, "grad_norm": 0.15078243613243103, "learning_rate": 0.0002127109249907624, "loss": 8.1545, "step": 4670 }, { "epoch": 0.5749630723781388, "grad_norm": 0.0814574584364891, "learning_rate": 0.00021264934105185367, "loss": 7.6319, "step": 4671 }, { "epoch": 0.5750861644510094, "grad_norm": 0.10700924694538116, "learning_rate": 0.00021258775711294496, "loss": 7.4253, "step": 4672 }, { "epoch": 0.5752092565238799, "grad_norm": 0.17306050658226013, "learning_rate": 0.00021252617317403622, "loss": 7.3632, "step": 4673 }, { "epoch": 0.5753323485967504, "grad_norm": 0.12628300487995148, "learning_rate": 0.00021246458923512748, "loss": 8.6741, "step": 4674 }, { "epoch": 0.5754554406696208, "grad_norm": 0.07424382120370865, "learning_rate": 0.00021240300529621876, "loss": 8.1599, "step": 4675 }, { "epoch": 0.5755785327424914, "grad_norm": 0.12007926404476166, "learning_rate": 0.00021234142135731002, "loss": 7.5632, "step": 4676 }, { "epoch": 0.5757016248153619, "grad_norm": 0.09395218640565872, "learning_rate": 0.00021227983741840128, "loss": 7.6459, "step": 4677 }, { "epoch": 0.5758247168882324, "grad_norm": 0.11582779884338379, "learning_rate": 0.00021221825347949257, "loss": 7.9093, "step": 4678 }, { "epoch": 
0.5759478089611029, "grad_norm": 0.38461098074913025, "learning_rate": 0.00021215666954058383, "loss": 8.614, "step": 4679 }, { "epoch": 0.5760709010339734, "grad_norm": 0.09192901849746704, "learning_rate": 0.0002120950856016751, "loss": 7.4913, "step": 4680 }, { "epoch": 0.5761939931068439, "grad_norm": 0.058914050459861755, "learning_rate": 0.00021203350166276635, "loss": 7.5267, "step": 4681 }, { "epoch": 0.5763170851797145, "grad_norm": 0.12553930282592773, "learning_rate": 0.00021197191772385764, "loss": 8.1441, "step": 4682 }, { "epoch": 0.5764401772525849, "grad_norm": 0.126142680644989, "learning_rate": 0.0002119103337849489, "loss": 8.2957, "step": 4683 }, { "epoch": 0.5765632693254554, "grad_norm": 0.0758286714553833, "learning_rate": 0.00021184874984604015, "loss": 7.6417, "step": 4684 }, { "epoch": 0.576686361398326, "grad_norm": 0.07936280220746994, "learning_rate": 0.00021178716590713144, "loss": 7.7789, "step": 4685 }, { "epoch": 0.5768094534711965, "grad_norm": 0.0944872722029686, "learning_rate": 0.0002117255819682227, "loss": 7.8987, "step": 4686 }, { "epoch": 0.5769325455440669, "grad_norm": 0.06435047835111618, "learning_rate": 0.00021166399802931396, "loss": 8.1108, "step": 4687 }, { "epoch": 0.5770556376169375, "grad_norm": 0.08935388922691345, "learning_rate": 0.00021160241409040522, "loss": 8.0041, "step": 4688 }, { "epoch": 0.577178729689808, "grad_norm": 0.15814432501792908, "learning_rate": 0.0002115408301514965, "loss": 7.6214, "step": 4689 }, { "epoch": 0.5773018217626785, "grad_norm": 0.14928790926933289, "learning_rate": 0.00021147924621258777, "loss": 7.5479, "step": 4690 }, { "epoch": 0.577424913835549, "grad_norm": 0.09009266644716263, "learning_rate": 0.00021141766227367902, "loss": 7.874, "step": 4691 }, { "epoch": 0.5775480059084195, "grad_norm": 0.10354148596525192, "learning_rate": 0.0002113560783347703, "loss": 7.4479, "step": 4692 }, { "epoch": 0.57767109798129, "grad_norm": 0.28842416405677795, "learning_rate": 
0.00021129449439586157, "loss": 9.2555, "step": 4693 }, { "epoch": 0.5777941900541606, "grad_norm": 0.1712414026260376, "learning_rate": 0.00021123291045695283, "loss": 8.0556, "step": 4694 }, { "epoch": 0.577917282127031, "grad_norm": 0.15013283491134644, "learning_rate": 0.00021117132651804412, "loss": 7.7157, "step": 4695 }, { "epoch": 0.5780403741999015, "grad_norm": 0.34591537714004517, "learning_rate": 0.00021110974257913538, "loss": 8.9263, "step": 4696 }, { "epoch": 0.578163466272772, "grad_norm": 0.10873042047023773, "learning_rate": 0.00021104815864022664, "loss": 8.4213, "step": 4697 }, { "epoch": 0.5782865583456426, "grad_norm": 0.3197067081928253, "learning_rate": 0.0002109865747013179, "loss": 7.5936, "step": 4698 }, { "epoch": 0.578409650418513, "grad_norm": 0.16623184084892273, "learning_rate": 0.00021092499076240918, "loss": 7.2149, "step": 4699 }, { "epoch": 0.5785327424913835, "grad_norm": 0.10524191707372665, "learning_rate": 0.00021086340682350044, "loss": 8.0016, "step": 4700 }, { "epoch": 0.5786558345642541, "grad_norm": 0.11125914752483368, "learning_rate": 0.0002108018228845917, "loss": 8.169, "step": 4701 }, { "epoch": 0.5787789266371246, "grad_norm": 0.07148818671703339, "learning_rate": 0.000210740238945683, "loss": 7.7167, "step": 4702 }, { "epoch": 0.578902018709995, "grad_norm": 0.10861168801784515, "learning_rate": 0.00021067865500677425, "loss": 7.4788, "step": 4703 }, { "epoch": 0.5790251107828656, "grad_norm": 0.16217735409736633, "learning_rate": 0.0002106170710678655, "loss": 8.12, "step": 4704 }, { "epoch": 0.5791482028557361, "grad_norm": 0.18909570574760437, "learning_rate": 0.00021055548712895677, "loss": 7.907, "step": 4705 }, { "epoch": 0.5792712949286066, "grad_norm": 0.1019662618637085, "learning_rate": 0.00021049390319004805, "loss": 7.5292, "step": 4706 }, { "epoch": 0.579394387001477, "grad_norm": 0.12242383509874344, "learning_rate": 0.0002104323192511393, "loss": 8.1731, "step": 4707 }, { "epoch": 
0.5795174790743476, "grad_norm": 0.21940399706363678, "learning_rate": 0.00021037073531223057, "loss": 8.2844, "step": 4708 }, { "epoch": 0.5796405711472181, "grad_norm": 0.11793792247772217, "learning_rate": 0.00021030915137332186, "loss": 7.8926, "step": 4709 }, { "epoch": 0.5797636632200887, "grad_norm": 0.08801206946372986, "learning_rate": 0.00021024756743441312, "loss": 7.7062, "step": 4710 }, { "epoch": 0.5798867552929592, "grad_norm": 0.06524569541215897, "learning_rate": 0.00021018598349550438, "loss": 7.7272, "step": 4711 }, { "epoch": 0.5800098473658296, "grad_norm": 0.10231828689575195, "learning_rate": 0.00021012439955659566, "loss": 7.7698, "step": 4712 }, { "epoch": 0.5801329394387001, "grad_norm": 0.14082925021648407, "learning_rate": 0.00021006281561768692, "loss": 7.5127, "step": 4713 }, { "epoch": 0.5802560315115707, "grad_norm": 0.14851535856723785, "learning_rate": 0.00021000123167877818, "loss": 7.3823, "step": 4714 }, { "epoch": 0.5803791235844412, "grad_norm": 0.11059240996837616, "learning_rate": 0.00020993964773986944, "loss": 7.7662, "step": 4715 }, { "epoch": 0.5805022156573116, "grad_norm": 0.09415969252586365, "learning_rate": 0.00020987806380096073, "loss": 7.3973, "step": 4716 }, { "epoch": 0.5806253077301822, "grad_norm": 0.2220468819141388, "learning_rate": 0.000209816479862052, "loss": 8.4456, "step": 4717 }, { "epoch": 0.5807483998030527, "grad_norm": 0.27802127599716187, "learning_rate": 0.00020975489592314325, "loss": 9.2606, "step": 4718 }, { "epoch": 0.5808714918759232, "grad_norm": 0.13347746431827545, "learning_rate": 0.00020969331198423453, "loss": 7.3212, "step": 4719 }, { "epoch": 0.5809945839487937, "grad_norm": 0.09781679511070251, "learning_rate": 0.0002096317280453258, "loss": 7.6783, "step": 4720 }, { "epoch": 0.5811176760216642, "grad_norm": 0.07855568081140518, "learning_rate": 0.00020957014410641705, "loss": 7.3337, "step": 4721 }, { "epoch": 0.5812407680945347, "grad_norm": 0.14719459414482117, "learning_rate": 
0.00020950856016750834, "loss": 7.903, "step": 4722 }, { "epoch": 0.5813638601674053, "grad_norm": 0.0792325809597969, "learning_rate": 0.0002094469762285996, "loss": 7.6888, "step": 4723 }, { "epoch": 0.5814869522402757, "grad_norm": 0.17027519643306732, "learning_rate": 0.00020938539228969086, "loss": 8.1496, "step": 4724 }, { "epoch": 0.5816100443131462, "grad_norm": 0.15038828551769257, "learning_rate": 0.00020932380835078212, "loss": 7.3759, "step": 4725 }, { "epoch": 0.5817331363860168, "grad_norm": 0.10222040116786957, "learning_rate": 0.0002092622244118734, "loss": 7.8944, "step": 4726 }, { "epoch": 0.5818562284588873, "grad_norm": 0.1063118427991867, "learning_rate": 0.00020920064047296466, "loss": 7.5884, "step": 4727 }, { "epoch": 0.5819793205317577, "grad_norm": 0.1678941398859024, "learning_rate": 0.00020913905653405592, "loss": 8.302, "step": 4728 }, { "epoch": 0.5821024126046282, "grad_norm": 0.10128006339073181, "learning_rate": 0.0002090774725951472, "loss": 7.7177, "step": 4729 }, { "epoch": 0.5822255046774988, "grad_norm": 0.24890923500061035, "learning_rate": 0.00020901588865623847, "loss": 8.9488, "step": 4730 }, { "epoch": 0.5823485967503693, "grad_norm": 0.1973734200000763, "learning_rate": 0.00020895430471732973, "loss": 7.8781, "step": 4731 }, { "epoch": 0.5824716888232397, "grad_norm": 0.12779568135738373, "learning_rate": 0.000208892720778421, "loss": 7.4422, "step": 4732 }, { "epoch": 0.5825947808961103, "grad_norm": 0.10476496070623398, "learning_rate": 0.00020883113683951228, "loss": 7.3873, "step": 4733 }, { "epoch": 0.5827178729689808, "grad_norm": 0.09977438300848007, "learning_rate": 0.00020876955290060354, "loss": 8.0525, "step": 4734 }, { "epoch": 0.5828409650418513, "grad_norm": 0.10129350423812866, "learning_rate": 0.0002087079689616948, "loss": 7.4991, "step": 4735 }, { "epoch": 0.5829640571147218, "grad_norm": 0.05322572588920593, "learning_rate": 0.00020864638502278608, "loss": 7.5588, "step": 4736 }, { "epoch": 
0.5830871491875923, "grad_norm": 0.11508733779191971, "learning_rate": 0.00020858480108387734, "loss": 7.2835, "step": 4737 }, { "epoch": 0.5832102412604628, "grad_norm": 0.06484857201576233, "learning_rate": 0.0002085232171449686, "loss": 7.621, "step": 4738 }, { "epoch": 0.5833333333333334, "grad_norm": 0.2802709937095642, "learning_rate": 0.0002084616332060599, "loss": 8.5866, "step": 4739 }, { "epoch": 0.5834564254062038, "grad_norm": 0.06323203444480896, "learning_rate": 0.00020840004926715115, "loss": 7.3817, "step": 4740 }, { "epoch": 0.5835795174790743, "grad_norm": 0.1069566160440445, "learning_rate": 0.0002083384653282424, "loss": 7.7369, "step": 4741 }, { "epoch": 0.5837026095519449, "grad_norm": 0.07622744143009186, "learning_rate": 0.00020827688138933367, "loss": 7.4771, "step": 4742 }, { "epoch": 0.5838257016248154, "grad_norm": 0.14012840390205383, "learning_rate": 0.00020821529745042495, "loss": 7.8429, "step": 4743 }, { "epoch": 0.5839487936976858, "grad_norm": 0.07360588014125824, "learning_rate": 0.0002081537135115162, "loss": 7.5843, "step": 4744 }, { "epoch": 0.5840718857705564, "grad_norm": 0.07420285046100616, "learning_rate": 0.00020809212957260747, "loss": 7.6731, "step": 4745 }, { "epoch": 0.5841949778434269, "grad_norm": 0.07087132334709167, "learning_rate": 0.00020803054563369876, "loss": 7.655, "step": 4746 }, { "epoch": 0.5843180699162974, "grad_norm": 0.14811569452285767, "learning_rate": 0.00020796896169479002, "loss": 8.366, "step": 4747 }, { "epoch": 0.5844411619891678, "grad_norm": 0.09459038078784943, "learning_rate": 0.00020790737775588128, "loss": 8.0023, "step": 4748 }, { "epoch": 0.5845642540620384, "grad_norm": 0.09950044006109238, "learning_rate": 0.00020784579381697256, "loss": 7.7031, "step": 4749 }, { "epoch": 0.5846873461349089, "grad_norm": 0.10350959002971649, "learning_rate": 0.00020778420987806382, "loss": 8.5304, "step": 4750 }, { "epoch": 0.5848104382077794, "grad_norm": 0.1662234514951706, "learning_rate": 
0.00020772262593915508, "loss": 8.9613, "step": 4751 }, { "epoch": 0.5849335302806499, "grad_norm": 0.10909965634346008, "learning_rate": 0.00020766104200024634, "loss": 7.7841, "step": 4752 }, { "epoch": 0.5850566223535204, "grad_norm": 0.19898584485054016, "learning_rate": 0.00020759945806133763, "loss": 7.3727, "step": 4753 }, { "epoch": 0.5851797144263909, "grad_norm": 0.13826899230480194, "learning_rate": 0.0002075378741224289, "loss": 7.5635, "step": 4754 }, { "epoch": 0.5853028064992615, "grad_norm": 0.10191610455513, "learning_rate": 0.00020747629018352015, "loss": 7.6994, "step": 4755 }, { "epoch": 0.585425898572132, "grad_norm": 0.07959669083356857, "learning_rate": 0.00020741470624461143, "loss": 7.5732, "step": 4756 }, { "epoch": 0.5855489906450024, "grad_norm": 0.08317820727825165, "learning_rate": 0.0002073531223057027, "loss": 7.9564, "step": 4757 }, { "epoch": 0.585672082717873, "grad_norm": 0.09957604110240936, "learning_rate": 0.00020729153836679395, "loss": 7.4653, "step": 4758 }, { "epoch": 0.5857951747907435, "grad_norm": 0.09428628534078598, "learning_rate": 0.0002072299544278852, "loss": 7.4682, "step": 4759 }, { "epoch": 0.585918266863614, "grad_norm": 0.06664243340492249, "learning_rate": 0.0002071683704889765, "loss": 7.4433, "step": 4760 }, { "epoch": 0.5860413589364845, "grad_norm": 0.1382572054862976, "learning_rate": 0.00020710678655006773, "loss": 8.1648, "step": 4761 }, { "epoch": 0.586164451009355, "grad_norm": 0.16041171550750732, "learning_rate": 0.000207045202611159, "loss": 7.5546, "step": 4762 }, { "epoch": 0.5862875430822255, "grad_norm": 0.17115722596645355, "learning_rate": 0.00020698361867225028, "loss": 8.4356, "step": 4763 }, { "epoch": 0.5864106351550961, "grad_norm": 0.05662921816110611, "learning_rate": 0.00020692203473334154, "loss": 7.4631, "step": 4764 }, { "epoch": 0.5865337272279665, "grad_norm": 0.1514766663312912, "learning_rate": 0.0002068604507944328, "loss": 7.8475, "step": 4765 }, { "epoch": 
0.586656819300837, "grad_norm": 0.1297329217195511, "learning_rate": 0.00020679886685552408, "loss": 7.1414, "step": 4766 }, { "epoch": 0.5867799113737076, "grad_norm": 0.11066565662622452, "learning_rate": 0.00020673728291661534, "loss": 8.1647, "step": 4767 }, { "epoch": 0.5869030034465781, "grad_norm": 0.08101850003004074, "learning_rate": 0.0002066756989777066, "loss": 7.6625, "step": 4768 }, { "epoch": 0.5870260955194485, "grad_norm": 0.06726589053869247, "learning_rate": 0.00020661411503879786, "loss": 7.666, "step": 4769 }, { "epoch": 0.587149187592319, "grad_norm": 0.06139913946390152, "learning_rate": 0.00020655253109988915, "loss": 7.5663, "step": 4770 }, { "epoch": 0.5872722796651896, "grad_norm": 0.1372561901807785, "learning_rate": 0.0002064909471609804, "loss": 7.7017, "step": 4771 }, { "epoch": 0.5873953717380601, "grad_norm": 0.06887233257293701, "learning_rate": 0.00020642936322207167, "loss": 7.493, "step": 4772 }, { "epoch": 0.5875184638109305, "grad_norm": 0.13216815888881683, "learning_rate": 0.00020636777928316295, "loss": 7.4672, "step": 4773 }, { "epoch": 0.5876415558838011, "grad_norm": 0.09330039471387863, "learning_rate": 0.0002063061953442542, "loss": 7.8304, "step": 4774 }, { "epoch": 0.5877646479566716, "grad_norm": 0.08421070128679276, "learning_rate": 0.00020624461140534547, "loss": 7.5596, "step": 4775 }, { "epoch": 0.5878877400295421, "grad_norm": 0.08552873134613037, "learning_rate": 0.00020618302746643673, "loss": 7.6035, "step": 4776 }, { "epoch": 0.5880108321024126, "grad_norm": 0.07628379762172699, "learning_rate": 0.00020612144352752802, "loss": 7.305, "step": 4777 }, { "epoch": 0.5881339241752831, "grad_norm": 0.09253044426441193, "learning_rate": 0.00020605985958861928, "loss": 7.4474, "step": 4778 }, { "epoch": 0.5882570162481536, "grad_norm": 0.07718586921691895, "learning_rate": 0.00020599827564971054, "loss": 7.5301, "step": 4779 }, { "epoch": 0.5883801083210242, "grad_norm": 0.0762898400425911, "learning_rate": 
0.00020593669171080182, "loss": 7.3209, "step": 4780 }, { "epoch": 0.5885032003938946, "grad_norm": 0.05976634472608566, "learning_rate": 0.00020587510777189308, "loss": 7.3811, "step": 4781 }, { "epoch": 0.5886262924667651, "grad_norm": 0.09878282248973846, "learning_rate": 0.00020581352383298434, "loss": 7.4745, "step": 4782 }, { "epoch": 0.5887493845396357, "grad_norm": 0.13371729850769043, "learning_rate": 0.00020575193989407563, "loss": 7.6916, "step": 4783 }, { "epoch": 0.5888724766125062, "grad_norm": 0.10044053941965103, "learning_rate": 0.0002056903559551669, "loss": 7.6172, "step": 4784 }, { "epoch": 0.5889955686853766, "grad_norm": 0.11822067201137543, "learning_rate": 0.00020562877201625815, "loss": 7.953, "step": 4785 }, { "epoch": 0.5891186607582471, "grad_norm": 0.1594630628824234, "learning_rate": 0.0002055671880773494, "loss": 7.273, "step": 4786 }, { "epoch": 0.5892417528311177, "grad_norm": 0.08550465106964111, "learning_rate": 0.0002055056041384407, "loss": 7.6452, "step": 4787 }, { "epoch": 0.5893648449039882, "grad_norm": 0.06986044347286224, "learning_rate": 0.00020544402019953195, "loss": 7.659, "step": 4788 }, { "epoch": 0.5894879369768586, "grad_norm": 0.08414243906736374, "learning_rate": 0.00020538243626062321, "loss": 7.5987, "step": 4789 }, { "epoch": 0.5896110290497292, "grad_norm": 0.07635192573070526, "learning_rate": 0.0002053208523217145, "loss": 7.8871, "step": 4790 }, { "epoch": 0.5897341211225997, "grad_norm": 0.09649258106946945, "learning_rate": 0.00020525926838280576, "loss": 7.4248, "step": 4791 }, { "epoch": 0.5898572131954702, "grad_norm": 0.10984780639410019, "learning_rate": 0.00020519768444389702, "loss": 8.1337, "step": 4792 }, { "epoch": 0.5899803052683407, "grad_norm": 0.10834801197052002, "learning_rate": 0.0002051361005049883, "loss": 7.9456, "step": 4793 }, { "epoch": 0.5901033973412112, "grad_norm": 0.1354086846113205, "learning_rate": 0.00020507451656607957, "loss": 8.386, "step": 4794 }, { "epoch": 
0.5902264894140817, "grad_norm": 0.06053768843412399, "learning_rate": 0.00020501293262717083, "loss": 7.7821, "step": 4795 }, { "epoch": 0.5903495814869523, "grad_norm": 0.1818205863237381, "learning_rate": 0.00020495134868826208, "loss": 7.381, "step": 4796 }, { "epoch": 0.5904726735598228, "grad_norm": 0.09020013362169266, "learning_rate": 0.00020488976474935337, "loss": 7.4973, "step": 4797 }, { "epoch": 0.5905957656326932, "grad_norm": 0.09244905412197113, "learning_rate": 0.00020482818081044463, "loss": 7.6542, "step": 4798 }, { "epoch": 0.5907188577055638, "grad_norm": 0.08459813892841339, "learning_rate": 0.0002047665968715359, "loss": 8.012, "step": 4799 }, { "epoch": 0.5908419497784343, "grad_norm": 0.09417328238487244, "learning_rate": 0.00020470501293262718, "loss": 7.5453, "step": 4800 }, { "epoch": 0.5909650418513048, "grad_norm": 0.09367942810058594, "learning_rate": 0.00020464342899371844, "loss": 7.305, "step": 4801 }, { "epoch": 0.5910881339241753, "grad_norm": 0.10967683047056198, "learning_rate": 0.0002045818450548097, "loss": 7.3021, "step": 4802 }, { "epoch": 0.5912112259970458, "grad_norm": 0.11692088842391968, "learning_rate": 0.00020452026111590096, "loss": 8.0989, "step": 4803 }, { "epoch": 0.5913343180699163, "grad_norm": 0.08176233619451523, "learning_rate": 0.00020445867717699224, "loss": 7.9375, "step": 4804 }, { "epoch": 0.5914574101427869, "grad_norm": 0.08285069465637207, "learning_rate": 0.0002043970932380835, "loss": 7.3476, "step": 4805 }, { "epoch": 0.5915805022156573, "grad_norm": 0.10746903717517853, "learning_rate": 0.00020433550929917476, "loss": 7.6615, "step": 4806 }, { "epoch": 0.5917035942885278, "grad_norm": 0.15935207903385162, "learning_rate": 0.00020427392536026605, "loss": 7.8714, "step": 4807 }, { "epoch": 0.5918266863613983, "grad_norm": 0.16080129146575928, "learning_rate": 0.0002042123414213573, "loss": 8.184, "step": 4808 }, { "epoch": 0.5919497784342689, "grad_norm": 0.07593020796775818, "learning_rate": 
0.00020415075748244857, "loss": 7.7132, "step": 4809 }, { "epoch": 0.5920728705071393, "grad_norm": 0.05713510140776634, "learning_rate": 0.00020408917354353985, "loss": 7.755, "step": 4810 }, { "epoch": 0.5921959625800098, "grad_norm": 0.09080737084150314, "learning_rate": 0.0002040275896046311, "loss": 7.6302, "step": 4811 }, { "epoch": 0.5923190546528804, "grad_norm": 0.14924563467502594, "learning_rate": 0.00020396600566572237, "loss": 7.8632, "step": 4812 }, { "epoch": 0.5924421467257509, "grad_norm": 0.13058413565158844, "learning_rate": 0.00020390442172681363, "loss": 7.669, "step": 4813 }, { "epoch": 0.5925652387986213, "grad_norm": 0.09985927492380142, "learning_rate": 0.00020384283778790492, "loss": 7.7205, "step": 4814 }, { "epoch": 0.5926883308714919, "grad_norm": 0.0741647332906723, "learning_rate": 0.00020378125384899618, "loss": 8.2367, "step": 4815 }, { "epoch": 0.5928114229443624, "grad_norm": 0.08112963289022446, "learning_rate": 0.00020371966991008744, "loss": 7.7389, "step": 4816 }, { "epoch": 0.5929345150172329, "grad_norm": 0.08913461863994598, "learning_rate": 0.00020365808597117872, "loss": 7.7001, "step": 4817 }, { "epoch": 0.5930576070901034, "grad_norm": 0.07108054310083389, "learning_rate": 0.00020359650203226998, "loss": 7.4624, "step": 4818 }, { "epoch": 0.5931806991629739, "grad_norm": 0.11211158335208893, "learning_rate": 0.00020353491809336124, "loss": 7.7009, "step": 4819 }, { "epoch": 0.5933037912358444, "grad_norm": 0.1088738664984703, "learning_rate": 0.00020347333415445253, "loss": 7.7518, "step": 4820 }, { "epoch": 0.593426883308715, "grad_norm": 0.1664602905511856, "learning_rate": 0.0002034117502155438, "loss": 7.6539, "step": 4821 }, { "epoch": 0.5935499753815854, "grad_norm": 0.22701358795166016, "learning_rate": 0.00020335016627663505, "loss": 8.3167, "step": 4822 }, { "epoch": 0.5936730674544559, "grad_norm": 0.12214291095733643, "learning_rate": 0.0002032885823377263, "loss": 8.0774, "step": 4823 }, { "epoch": 
0.5937961595273265, "grad_norm": 0.07866551727056503, "learning_rate": 0.0002032269983988176, "loss": 7.71, "step": 4824 }, { "epoch": 0.593919251600197, "grad_norm": 0.08132099360227585, "learning_rate": 0.00020316541445990885, "loss": 7.5778, "step": 4825 }, { "epoch": 0.5940423436730674, "grad_norm": 0.062429122626781464, "learning_rate": 0.00020310383052100011, "loss": 7.6693, "step": 4826 }, { "epoch": 0.5941654357459379, "grad_norm": 0.30652064085006714, "learning_rate": 0.0002030422465820914, "loss": 8.9244, "step": 4827 }, { "epoch": 0.5942885278188085, "grad_norm": 0.08603514730930328, "learning_rate": 0.00020298066264318266, "loss": 7.7406, "step": 4828 }, { "epoch": 0.594411619891679, "grad_norm": 0.11497411131858826, "learning_rate": 0.00020291907870427392, "loss": 7.6821, "step": 4829 }, { "epoch": 0.5945347119645494, "grad_norm": 0.12910933792591095, "learning_rate": 0.00020285749476536518, "loss": 7.9708, "step": 4830 }, { "epoch": 0.59465780403742, "grad_norm": 0.15586814284324646, "learning_rate": 0.00020279591082645647, "loss": 7.4995, "step": 4831 }, { "epoch": 0.5947808961102905, "grad_norm": 0.1509641408920288, "learning_rate": 0.00020273432688754772, "loss": 7.3955, "step": 4832 }, { "epoch": 0.594903988183161, "grad_norm": 0.05986088514328003, "learning_rate": 0.00020267274294863898, "loss": 7.889, "step": 4833 }, { "epoch": 0.5950270802560315, "grad_norm": 0.0763101577758789, "learning_rate": 0.00020261115900973027, "loss": 7.5811, "step": 4834 }, { "epoch": 0.595150172328902, "grad_norm": 0.11486952751874924, "learning_rate": 0.00020254957507082153, "loss": 7.0478, "step": 4835 }, { "epoch": 0.5952732644017725, "grad_norm": 0.2136542797088623, "learning_rate": 0.0002024879911319128, "loss": 7.8443, "step": 4836 }, { "epoch": 0.5953963564746431, "grad_norm": 0.29688581824302673, "learning_rate": 0.00020242640719300408, "loss": 8.5592, "step": 4837 }, { "epoch": 0.5955194485475135, "grad_norm": 0.1411004364490509, "learning_rate": 
0.00020236482325409534, "loss": 7.0569, "step": 4838 }, { "epoch": 0.595642540620384, "grad_norm": 0.2661360502243042, "learning_rate": 0.0002023032393151866, "loss": 8.1895, "step": 4839 }, { "epoch": 0.5957656326932546, "grad_norm": 0.1221408024430275, "learning_rate": 0.00020224165537627786, "loss": 7.675, "step": 4840 }, { "epoch": 0.5958887247661251, "grad_norm": 0.10101430863142014, "learning_rate": 0.00020218007143736914, "loss": 7.667, "step": 4841 }, { "epoch": 0.5960118168389956, "grad_norm": 0.0688178762793541, "learning_rate": 0.0002021184874984604, "loss": 7.4632, "step": 4842 }, { "epoch": 0.596134908911866, "grad_norm": 0.06503777205944061, "learning_rate": 0.00020205690355955166, "loss": 7.6183, "step": 4843 }, { "epoch": 0.5962580009847366, "grad_norm": 0.12666447460651398, "learning_rate": 0.00020199531962064295, "loss": 7.8628, "step": 4844 }, { "epoch": 0.5963810930576071, "grad_norm": 0.17551274597644806, "learning_rate": 0.0002019337356817342, "loss": 7.3505, "step": 4845 }, { "epoch": 0.5965041851304776, "grad_norm": 0.18991905450820923, "learning_rate": 0.00020187215174282547, "loss": 7.5863, "step": 4846 }, { "epoch": 0.5966272772033481, "grad_norm": 0.1779738962650299, "learning_rate": 0.00020181056780391673, "loss": 7.7146, "step": 4847 }, { "epoch": 0.5967503692762186, "grad_norm": 0.1836349368095398, "learning_rate": 0.000201748983865008, "loss": 7.3532, "step": 4848 }, { "epoch": 0.5968734613490891, "grad_norm": 0.12299703806638718, "learning_rate": 0.00020168739992609927, "loss": 7.4415, "step": 4849 }, { "epoch": 0.5969965534219597, "grad_norm": 0.1374804526567459, "learning_rate": 0.00020162581598719053, "loss": 8.1796, "step": 4850 }, { "epoch": 0.5971196454948301, "grad_norm": 0.06939522922039032, "learning_rate": 0.00020156423204828182, "loss": 7.6547, "step": 4851 }, { "epoch": 0.5972427375677006, "grad_norm": 0.10671127587556839, "learning_rate": 0.00020150264810937308, "loss": 7.8981, "step": 4852 }, { "epoch": 
0.5973658296405712, "grad_norm": 0.31355834007263184, "learning_rate": 0.00020144106417046434, "loss": 8.9358, "step": 4853 }, { "epoch": 0.5974889217134417, "grad_norm": 0.11483525484800339, "learning_rate": 0.00020137948023155562, "loss": 7.8609, "step": 4854 }, { "epoch": 0.5976120137863121, "grad_norm": 0.21049702167510986, "learning_rate": 0.00020131789629264688, "loss": 7.9322, "step": 4855 }, { "epoch": 0.5977351058591827, "grad_norm": 0.125139057636261, "learning_rate": 0.00020125631235373814, "loss": 8.1065, "step": 4856 }, { "epoch": 0.5978581979320532, "grad_norm": 0.06911729276180267, "learning_rate": 0.0002011947284148294, "loss": 7.8894, "step": 4857 }, { "epoch": 0.5979812900049237, "grad_norm": 0.09388584643602371, "learning_rate": 0.0002011331444759207, "loss": 8.1352, "step": 4858 }, { "epoch": 0.5981043820777941, "grad_norm": 0.0936715304851532, "learning_rate": 0.00020107156053701195, "loss": 8.2905, "step": 4859 }, { "epoch": 0.5982274741506647, "grad_norm": 0.22748056054115295, "learning_rate": 0.0002010099765981032, "loss": 7.3731, "step": 4860 }, { "epoch": 0.5983505662235352, "grad_norm": 0.19047914445400238, "learning_rate": 0.0002009483926591945, "loss": 7.4775, "step": 4861 }, { "epoch": 0.5984736582964058, "grad_norm": 0.09229563921689987, "learning_rate": 0.00020088680872028575, "loss": 7.7623, "step": 4862 }, { "epoch": 0.5985967503692762, "grad_norm": 0.12995821237564087, "learning_rate": 0.000200825224781377, "loss": 7.6984, "step": 4863 }, { "epoch": 0.5987198424421467, "grad_norm": 0.11458609998226166, "learning_rate": 0.0002007636408424683, "loss": 7.6812, "step": 4864 }, { "epoch": 0.5988429345150172, "grad_norm": 0.121994748711586, "learning_rate": 0.00020070205690355956, "loss": 7.3557, "step": 4865 }, { "epoch": 0.5989660265878878, "grad_norm": 0.08871182054281235, "learning_rate": 0.00020064047296465082, "loss": 7.6499, "step": 4866 }, { "epoch": 0.5990891186607582, "grad_norm": 0.11805742979049683, "learning_rate": 
0.00020057888902574208, "loss": 7.5942, "step": 4867 }, { "epoch": 0.5992122107336287, "grad_norm": 0.10876305401325226, "learning_rate": 0.00020051730508683336, "loss": 7.6235, "step": 4868 }, { "epoch": 0.5993353028064993, "grad_norm": 0.08746696263551712, "learning_rate": 0.00020045572114792462, "loss": 7.4611, "step": 4869 }, { "epoch": 0.5994583948793698, "grad_norm": 0.11022445559501648, "learning_rate": 0.00020039413720901588, "loss": 7.6631, "step": 4870 }, { "epoch": 0.5995814869522402, "grad_norm": 0.07489613443613052, "learning_rate": 0.00020033255327010717, "loss": 7.7594, "step": 4871 }, { "epoch": 0.5997045790251108, "grad_norm": 0.13561353087425232, "learning_rate": 0.00020027096933119843, "loss": 7.5881, "step": 4872 }, { "epoch": 0.5998276710979813, "grad_norm": 0.08146975189447403, "learning_rate": 0.0002002093853922897, "loss": 7.9313, "step": 4873 }, { "epoch": 0.5999507631708518, "grad_norm": 0.28408312797546387, "learning_rate": 0.00020014780145338095, "loss": 8.8931, "step": 4874 }, { "epoch": 0.6000738552437223, "grad_norm": 0.060177791863679886, "learning_rate": 0.00020008621751447224, "loss": 7.678, "step": 4875 }, { "epoch": 0.6001969473165928, "grad_norm": 0.11580001562833786, "learning_rate": 0.0002000246335755635, "loss": 7.4984, "step": 4876 }, { "epoch": 0.6003200393894633, "grad_norm": 0.08612234890460968, "learning_rate": 0.00019996304963665475, "loss": 7.6418, "step": 4877 }, { "epoch": 0.6004431314623339, "grad_norm": 0.23700596392154694, "learning_rate": 0.00019990146569774604, "loss": 8.7922, "step": 4878 }, { "epoch": 0.6005662235352043, "grad_norm": 0.17701765894889832, "learning_rate": 0.0001998398817588373, "loss": 8.4313, "step": 4879 }, { "epoch": 0.6006893156080748, "grad_norm": 0.1907842755317688, "learning_rate": 0.00019977829781992856, "loss": 7.231, "step": 4880 }, { "epoch": 0.6008124076809453, "grad_norm": 0.1353423297405243, "learning_rate": 0.00019971671388101985, "loss": 8.584, "step": 4881 }, { "epoch": 
0.6009354997538159, "grad_norm": 0.13683654367923737, "learning_rate": 0.0001996551299421111, "loss": 7.8291, "step": 4882 }, { "epoch": 0.6010585918266863, "grad_norm": 0.11396999657154083, "learning_rate": 0.00019959354600320237, "loss": 8.2205, "step": 4883 }, { "epoch": 0.6011816838995568, "grad_norm": 0.16191832721233368, "learning_rate": 0.00019953196206429363, "loss": 7.8703, "step": 4884 }, { "epoch": 0.6013047759724274, "grad_norm": 0.11087069660425186, "learning_rate": 0.0001994703781253849, "loss": 7.5438, "step": 4885 }, { "epoch": 0.6014278680452979, "grad_norm": 0.1299993395805359, "learning_rate": 0.00019940879418647617, "loss": 8.3328, "step": 4886 }, { "epoch": 0.6015509601181684, "grad_norm": 0.06015140935778618, "learning_rate": 0.00019934721024756743, "loss": 7.7651, "step": 4887 }, { "epoch": 0.6016740521910389, "grad_norm": 0.09135022014379501, "learning_rate": 0.00019928562630865872, "loss": 7.7982, "step": 4888 }, { "epoch": 0.6017971442639094, "grad_norm": 0.26241534948349, "learning_rate": 0.00019922404236974998, "loss": 7.0567, "step": 4889 }, { "epoch": 0.6019202363367799, "grad_norm": 0.07902271300554276, "learning_rate": 0.00019916245843084124, "loss": 7.5169, "step": 4890 }, { "epoch": 0.6020433284096505, "grad_norm": 0.10533146560192108, "learning_rate": 0.00019910087449193252, "loss": 7.4496, "step": 4891 }, { "epoch": 0.6021664204825209, "grad_norm": 0.35273149609565735, "learning_rate": 0.00019903929055302378, "loss": 9.0874, "step": 4892 }, { "epoch": 0.6022895125553914, "grad_norm": 0.11162488162517548, "learning_rate": 0.00019897770661411504, "loss": 7.1457, "step": 4893 }, { "epoch": 0.602412604628262, "grad_norm": 0.2349761724472046, "learning_rate": 0.0001989161226752063, "loss": 8.6962, "step": 4894 }, { "epoch": 0.6025356967011325, "grad_norm": 0.08753509074449539, "learning_rate": 0.0001988545387362976, "loss": 7.2816, "step": 4895 }, { "epoch": 0.6026587887740029, "grad_norm": 0.07300882041454315, "learning_rate": 
0.00019879295479738885, "loss": 7.3438, "step": 4896 }, { "epoch": 0.6027818808468735, "grad_norm": 0.1241801455616951, "learning_rate": 0.0001987313708584801, "loss": 8.0945, "step": 4897 }, { "epoch": 0.602904972919744, "grad_norm": 0.10736700892448425, "learning_rate": 0.0001986697869195714, "loss": 7.6281, "step": 4898 }, { "epoch": 0.6030280649926145, "grad_norm": 0.07275109738111496, "learning_rate": 0.00019860820298066265, "loss": 7.9812, "step": 4899 }, { "epoch": 0.6031511570654849, "grad_norm": 0.06533713638782501, "learning_rate": 0.0001985466190417539, "loss": 7.8254, "step": 4900 }, { "epoch": 0.6032742491383555, "grad_norm": 0.08283308148384094, "learning_rate": 0.00019848503510284517, "loss": 8.1884, "step": 4901 }, { "epoch": 0.603397341211226, "grad_norm": 0.08339851349592209, "learning_rate": 0.00019842345116393646, "loss": 7.6241, "step": 4902 }, { "epoch": 0.6035204332840965, "grad_norm": 0.1221507340669632, "learning_rate": 0.00019836186722502772, "loss": 7.4387, "step": 4903 }, { "epoch": 0.603643525356967, "grad_norm": 0.11369835585355759, "learning_rate": 0.00019830028328611898, "loss": 7.6004, "step": 4904 }, { "epoch": 0.6037666174298375, "grad_norm": 0.058539796620607376, "learning_rate": 0.00019823869934721026, "loss": 7.7439, "step": 4905 }, { "epoch": 0.603889709502708, "grad_norm": 0.07588697224855423, "learning_rate": 0.00019817711540830152, "loss": 7.4502, "step": 4906 }, { "epoch": 0.6040128015755786, "grad_norm": 0.06266207993030548, "learning_rate": 0.00019811553146939278, "loss": 7.5028, "step": 4907 }, { "epoch": 0.604135893648449, "grad_norm": 0.20843401551246643, "learning_rate": 0.00019805394753048407, "loss": 8.3844, "step": 4908 }, { "epoch": 0.6042589857213195, "grad_norm": 0.10872963815927505, "learning_rate": 0.00019799236359157533, "loss": 7.6777, "step": 4909 }, { "epoch": 0.6043820777941901, "grad_norm": 0.10496095567941666, "learning_rate": 0.0001979307796526666, "loss": 7.2499, "step": 4910 }, { "epoch": 
0.6045051698670606, "grad_norm": 0.11818698048591614, "learning_rate": 0.00019786919571375785, "loss": 7.6945, "step": 4911 }, { "epoch": 0.604628261939931, "grad_norm": 0.0686250701546669, "learning_rate": 0.00019780761177484913, "loss": 7.8348, "step": 4912 }, { "epoch": 0.6047513540128016, "grad_norm": 0.07458224892616272, "learning_rate": 0.0001977460278359404, "loss": 7.5262, "step": 4913 }, { "epoch": 0.6048744460856721, "grad_norm": 0.2948927581310272, "learning_rate": 0.00019768444389703165, "loss": 8.7444, "step": 4914 }, { "epoch": 0.6049975381585426, "grad_norm": 0.12222936749458313, "learning_rate": 0.00019762285995812294, "loss": 7.4051, "step": 4915 }, { "epoch": 0.605120630231413, "grad_norm": 0.1785040646791458, "learning_rate": 0.0001975612760192142, "loss": 7.3612, "step": 4916 }, { "epoch": 0.6052437223042836, "grad_norm": 0.19500701129436493, "learning_rate": 0.00019749969208030546, "loss": 8.5893, "step": 4917 }, { "epoch": 0.6053668143771541, "grad_norm": 0.1180332601070404, "learning_rate": 0.00019743810814139675, "loss": 7.5581, "step": 4918 }, { "epoch": 0.6054899064500247, "grad_norm": 0.09773768484592438, "learning_rate": 0.000197376524202488, "loss": 7.6099, "step": 4919 }, { "epoch": 0.6056129985228951, "grad_norm": 0.06710343062877655, "learning_rate": 0.00019731494026357927, "loss": 7.5803, "step": 4920 }, { "epoch": 0.6057360905957656, "grad_norm": 0.07379107922315598, "learning_rate": 0.00019725335632467052, "loss": 7.4468, "step": 4921 }, { "epoch": 0.6058591826686361, "grad_norm": 0.11590973287820816, "learning_rate": 0.0001971917723857618, "loss": 7.4682, "step": 4922 }, { "epoch": 0.6059822747415067, "grad_norm": 0.09893574565649033, "learning_rate": 0.00019713018844685307, "loss": 7.896, "step": 4923 }, { "epoch": 0.6061053668143771, "grad_norm": 0.08983095735311508, "learning_rate": 0.00019706860450794433, "loss": 8.0655, "step": 4924 }, { "epoch": 0.6062284588872476, "grad_norm": 0.09193252772092819, "learning_rate": 
0.00019700702056903562, "loss": 7.6585, "step": 4925 }, { "epoch": 0.6063515509601182, "grad_norm": 0.23542353510856628, "learning_rate": 0.00019694543663012688, "loss": 7.0534, "step": 4926 }, { "epoch": 0.6064746430329887, "grad_norm": 0.08984605967998505, "learning_rate": 0.00019688385269121814, "loss": 7.7377, "step": 4927 }, { "epoch": 0.6065977351058592, "grad_norm": 0.059216633439064026, "learning_rate": 0.0001968222687523094, "loss": 7.7375, "step": 4928 }, { "epoch": 0.6067208271787297, "grad_norm": 0.07588217407464981, "learning_rate": 0.00019676068481340068, "loss": 7.3668, "step": 4929 }, { "epoch": 0.6068439192516002, "grad_norm": 0.05714038386940956, "learning_rate": 0.00019669910087449194, "loss": 7.4729, "step": 4930 }, { "epoch": 0.6069670113244707, "grad_norm": 0.12422864884138107, "learning_rate": 0.0001966375169355832, "loss": 7.7848, "step": 4931 }, { "epoch": 0.6070901033973413, "grad_norm": 0.17791110277175903, "learning_rate": 0.0001965759329966745, "loss": 8.2884, "step": 4932 }, { "epoch": 0.6072131954702117, "grad_norm": 0.05992424860596657, "learning_rate": 0.00019651434905776575, "loss": 7.3968, "step": 4933 }, { "epoch": 0.6073362875430822, "grad_norm": 0.15467104315757751, "learning_rate": 0.000196452765118857, "loss": 8.1911, "step": 4934 }, { "epoch": 0.6074593796159528, "grad_norm": 0.07031266391277313, "learning_rate": 0.0001963911811799483, "loss": 7.9348, "step": 4935 }, { "epoch": 0.6075824716888233, "grad_norm": 0.13230231404304504, "learning_rate": 0.00019632959724103955, "loss": 7.3156, "step": 4936 }, { "epoch": 0.6077055637616937, "grad_norm": 0.11462609469890594, "learning_rate": 0.0001962680133021308, "loss": 7.2925, "step": 4937 }, { "epoch": 0.6078286558345642, "grad_norm": 0.11110851168632507, "learning_rate": 0.00019620642936322207, "loss": 7.651, "step": 4938 }, { "epoch": 0.6079517479074348, "grad_norm": 0.07523440569639206, "learning_rate": 0.00019614484542431336, "loss": 7.5753, "step": 4939 }, { "epoch": 
0.6080748399803053, "grad_norm": 0.3244011700153351, "learning_rate": 0.00019608326148540462, "loss": 9.2731, "step": 4940 }, { "epoch": 0.6081979320531757, "grad_norm": 0.08032174408435822, "learning_rate": 0.00019602167754649588, "loss": 7.7922, "step": 4941 }, { "epoch": 0.6083210241260463, "grad_norm": 0.08450000733137131, "learning_rate": 0.00019596009360758716, "loss": 8.0762, "step": 4942 }, { "epoch": 0.6084441161989168, "grad_norm": 0.10376399755477905, "learning_rate": 0.00019589850966867842, "loss": 7.4676, "step": 4943 }, { "epoch": 0.6085672082717873, "grad_norm": 0.07410261034965515, "learning_rate": 0.00019583692572976968, "loss": 7.6887, "step": 4944 }, { "epoch": 0.6086903003446578, "grad_norm": 0.11478124558925629, "learning_rate": 0.00019577534179086097, "loss": 7.6012, "step": 4945 }, { "epoch": 0.6088133924175283, "grad_norm": 0.06663233786821365, "learning_rate": 0.00019571375785195223, "loss": 7.842, "step": 4946 }, { "epoch": 0.6089364844903988, "grad_norm": 0.08749942481517792, "learning_rate": 0.0001956521739130435, "loss": 7.6776, "step": 4947 }, { "epoch": 0.6090595765632694, "grad_norm": 0.1452201008796692, "learning_rate": 0.00019559058997413475, "loss": 8.4575, "step": 4948 }, { "epoch": 0.6091826686361398, "grad_norm": 0.11514447629451752, "learning_rate": 0.00019552900603522603, "loss": 8.3077, "step": 4949 }, { "epoch": 0.6093057607090103, "grad_norm": 0.06008028984069824, "learning_rate": 0.0001954674220963173, "loss": 7.7207, "step": 4950 }, { "epoch": 0.6094288527818809, "grad_norm": 0.1062375009059906, "learning_rate": 0.00019540583815740855, "loss": 7.5805, "step": 4951 }, { "epoch": 0.6095519448547514, "grad_norm": 0.2807473838329315, "learning_rate": 0.00019534425421849984, "loss": 8.5911, "step": 4952 }, { "epoch": 0.6096750369276218, "grad_norm": 0.14844751358032227, "learning_rate": 0.0001952826702795911, "loss": 8.6769, "step": 4953 }, { "epoch": 0.6097981290004924, "grad_norm": 0.14971596002578735, "learning_rate": 
0.00019522108634068236, "loss": 7.471, "step": 4954 }, { "epoch": 0.6099212210733629, "grad_norm": 0.19925805926322937, "learning_rate": 0.00019515950240177362, "loss": 7.3982, "step": 4955 }, { "epoch": 0.6100443131462334, "grad_norm": 0.0937451720237732, "learning_rate": 0.0001950979184628649, "loss": 7.7043, "step": 4956 }, { "epoch": 0.6101674052191038, "grad_norm": 0.06587143242359161, "learning_rate": 0.00019503633452395616, "loss": 7.8667, "step": 4957 }, { "epoch": 0.6102904972919744, "grad_norm": 0.08897694200277328, "learning_rate": 0.00019497475058504742, "loss": 8.2623, "step": 4958 }, { "epoch": 0.6104135893648449, "grad_norm": 0.14191663265228271, "learning_rate": 0.0001949131666461387, "loss": 7.3676, "step": 4959 }, { "epoch": 0.6105366814377154, "grad_norm": 0.15231676399707794, "learning_rate": 0.00019485158270722997, "loss": 8.1074, "step": 4960 }, { "epoch": 0.6106597735105859, "grad_norm": 0.08938692510128021, "learning_rate": 0.00019478999876832123, "loss": 7.7546, "step": 4961 }, { "epoch": 0.6107828655834564, "grad_norm": 0.13274453580379486, "learning_rate": 0.00019472841482941252, "loss": 7.7106, "step": 4962 }, { "epoch": 0.6109059576563269, "grad_norm": 0.3446864187717438, "learning_rate": 0.00019466683089050378, "loss": 9.4748, "step": 4963 }, { "epoch": 0.6110290497291975, "grad_norm": 0.2126697450876236, "learning_rate": 0.00019460524695159504, "loss": 7.0544, "step": 4964 }, { "epoch": 0.6111521418020679, "grad_norm": 0.08088364452123642, "learning_rate": 0.0001945436630126863, "loss": 7.5639, "step": 4965 }, { "epoch": 0.6112752338749384, "grad_norm": 0.1516972780227661, "learning_rate": 0.00019448207907377758, "loss": 8.4866, "step": 4966 }, { "epoch": 0.611398325947809, "grad_norm": 0.1411714404821396, "learning_rate": 0.00019442049513486884, "loss": 7.3689, "step": 4967 }, { "epoch": 0.6115214180206795, "grad_norm": 0.08221255987882614, "learning_rate": 0.0001943589111959601, "loss": 7.629, "step": 4968 }, { "epoch": 
0.6116445100935499, "grad_norm": 0.08520542085170746, "learning_rate": 0.0001942973272570514, "loss": 7.3753, "step": 4969 }, { "epoch": 0.6117676021664205, "grad_norm": 0.0786987766623497, "learning_rate": 0.00019423574331814265, "loss": 7.5214, "step": 4970 }, { "epoch": 0.611890694239291, "grad_norm": 0.0901281014084816, "learning_rate": 0.0001941741593792339, "loss": 7.507, "step": 4971 }, { "epoch": 0.6120137863121615, "grad_norm": 0.1412743479013443, "learning_rate": 0.00019411257544032517, "loss": 8.0452, "step": 4972 }, { "epoch": 0.6121368783850321, "grad_norm": 0.08580835163593292, "learning_rate": 0.00019405099150141645, "loss": 7.5255, "step": 4973 }, { "epoch": 0.6122599704579025, "grad_norm": 0.38916122913360596, "learning_rate": 0.0001939894075625077, "loss": 9.4716, "step": 4974 }, { "epoch": 0.612383062530773, "grad_norm": 0.09474189579486847, "learning_rate": 0.00019392782362359897, "loss": 8.1351, "step": 4975 }, { "epoch": 0.6125061546036435, "grad_norm": 0.1316317915916443, "learning_rate": 0.00019386623968469026, "loss": 7.3387, "step": 4976 }, { "epoch": 0.6126292466765141, "grad_norm": 0.11288225650787354, "learning_rate": 0.00019380465574578152, "loss": 7.3903, "step": 4977 }, { "epoch": 0.6127523387493845, "grad_norm": 0.09818623214960098, "learning_rate": 0.00019374307180687278, "loss": 7.5835, "step": 4978 }, { "epoch": 0.612875430822255, "grad_norm": 0.07464499771595001, "learning_rate": 0.00019368148786796406, "loss": 7.5261, "step": 4979 }, { "epoch": 0.6129985228951256, "grad_norm": 0.09321746230125427, "learning_rate": 0.00019361990392905532, "loss": 7.7125, "step": 4980 }, { "epoch": 0.6131216149679961, "grad_norm": 0.09529199451208115, "learning_rate": 0.00019355831999014658, "loss": 7.461, "step": 4981 }, { "epoch": 0.6132447070408665, "grad_norm": 0.2834199070930481, "learning_rate": 0.00019349673605123784, "loss": 8.8598, "step": 4982 }, { "epoch": 0.6133677991137371, "grad_norm": 0.06053707003593445, "learning_rate": 
0.00019343515211232913, "loss": 7.6429, "step": 4983 }, { "epoch": 0.6134908911866076, "grad_norm": 0.19705231487751007, "learning_rate": 0.0001933735681734204, "loss": 8.3337, "step": 4984 }, { "epoch": 0.6136139832594781, "grad_norm": 0.1180473044514656, "learning_rate": 0.00019331198423451165, "loss": 7.58, "step": 4985 }, { "epoch": 0.6137370753323486, "grad_norm": 0.1272423416376114, "learning_rate": 0.00019325040029560293, "loss": 7.3107, "step": 4986 }, { "epoch": 0.6138601674052191, "grad_norm": 0.11138678342103958, "learning_rate": 0.0001931888163566942, "loss": 7.3963, "step": 4987 }, { "epoch": 0.6139832594780896, "grad_norm": 0.0947386622428894, "learning_rate": 0.00019312723241778545, "loss": 7.6535, "step": 4988 }, { "epoch": 0.6141063515509602, "grad_norm": 0.08057422190904617, "learning_rate": 0.00019306564847887674, "loss": 7.9054, "step": 4989 }, { "epoch": 0.6142294436238306, "grad_norm": 0.16355660557746887, "learning_rate": 0.000193004064539968, "loss": 8.3443, "step": 4990 }, { "epoch": 0.6143525356967011, "grad_norm": 0.16007520258426666, "learning_rate": 0.00019294248060105926, "loss": 7.8871, "step": 4991 }, { "epoch": 0.6144756277695717, "grad_norm": 0.1448645293712616, "learning_rate": 0.00019288089666215052, "loss": 7.5069, "step": 4992 }, { "epoch": 0.6145987198424422, "grad_norm": 0.08171181380748749, "learning_rate": 0.00019281931272324178, "loss": 7.6951, "step": 4993 }, { "epoch": 0.6147218119153126, "grad_norm": 0.14754284918308258, "learning_rate": 0.00019275772878433304, "loss": 8.0865, "step": 4994 }, { "epoch": 0.6148449039881831, "grad_norm": 0.10822958499193192, "learning_rate": 0.0001926961448454243, "loss": 7.3692, "step": 4995 }, { "epoch": 0.6149679960610537, "grad_norm": 0.08963003009557724, "learning_rate": 0.00019263456090651558, "loss": 7.7376, "step": 4996 }, { "epoch": 0.6150910881339242, "grad_norm": 0.1285608410835266, "learning_rate": 0.00019257297696760684, "loss": 7.3455, "step": 4997 }, { "epoch": 
0.6152141802067946, "grad_norm": 0.1900649517774582, "learning_rate": 0.0001925113930286981, "loss": 8.049, "step": 4998 }, { "epoch": 0.6153372722796652, "grad_norm": 0.09055013209581375, "learning_rate": 0.00019244980908978936, "loss": 7.9502, "step": 4999 }, { "epoch": 0.6154603643525357, "grad_norm": 0.16576705873012543, "learning_rate": 0.00019238822515088065, "loss": 7.8988, "step": 5000 }, { "epoch": 0.6155834564254062, "grad_norm": 0.07168092578649521, "learning_rate": 0.0001923266412119719, "loss": 7.6402, "step": 5001 }, { "epoch": 0.6157065484982767, "grad_norm": 0.09512936323881149, "learning_rate": 0.00019226505727306317, "loss": 7.8969, "step": 5002 }, { "epoch": 0.6158296405711472, "grad_norm": 0.10553756356239319, "learning_rate": 0.00019220347333415445, "loss": 7.3566, "step": 5003 }, { "epoch": 0.6159527326440177, "grad_norm": 0.05998750776052475, "learning_rate": 0.0001921418893952457, "loss": 7.8243, "step": 5004 }, { "epoch": 0.6160758247168883, "grad_norm": 0.3662252128124237, "learning_rate": 0.00019208030545633697, "loss": 9.2917, "step": 5005 }, { "epoch": 0.6161989167897587, "grad_norm": 0.07327551394701004, "learning_rate": 0.00019201872151742826, "loss": 7.6887, "step": 5006 }, { "epoch": 0.6163220088626292, "grad_norm": 0.10240907967090607, "learning_rate": 0.00019195713757851952, "loss": 7.5766, "step": 5007 }, { "epoch": 0.6164451009354998, "grad_norm": 0.1529262512922287, "learning_rate": 0.00019189555363961078, "loss": 8.0349, "step": 5008 }, { "epoch": 0.6165681930083703, "grad_norm": 0.0854339450597763, "learning_rate": 0.00019183396970070204, "loss": 7.3264, "step": 5009 }, { "epoch": 0.6166912850812407, "grad_norm": 0.07493197172880173, "learning_rate": 0.00019177238576179332, "loss": 7.8871, "step": 5010 }, { "epoch": 0.6168143771541112, "grad_norm": 0.08320832997560501, "learning_rate": 0.00019171080182288458, "loss": 7.5239, "step": 5011 }, { "epoch": 0.6169374692269818, "grad_norm": 0.10942995548248291, "learning_rate": 
0.00019164921788397584, "loss": 7.3092, "step": 5012 }, { "epoch": 0.6170605612998523, "grad_norm": 0.13953955471515656, "learning_rate": 0.00019158763394506713, "loss": 7.9062, "step": 5013 }, { "epoch": 0.6171836533727229, "grad_norm": 0.205579474568367, "learning_rate": 0.0001915260500061584, "loss": 8.2697, "step": 5014 }, { "epoch": 0.6173067454455933, "grad_norm": 0.07739115506410599, "learning_rate": 0.00019146446606724965, "loss": 7.73, "step": 5015 }, { "epoch": 0.6174298375184638, "grad_norm": 0.19155414402484894, "learning_rate": 0.0001914028821283409, "loss": 7.0002, "step": 5016 }, { "epoch": 0.6175529295913343, "grad_norm": 0.14406156539916992, "learning_rate": 0.0001913412981894322, "loss": 8.4663, "step": 5017 }, { "epoch": 0.6176760216642049, "grad_norm": 0.1733471304178238, "learning_rate": 0.00019127971425052345, "loss": 8.3277, "step": 5018 }, { "epoch": 0.6177991137370753, "grad_norm": 0.13822978734970093, "learning_rate": 0.00019121813031161471, "loss": 7.2837, "step": 5019 }, { "epoch": 0.6179222058099458, "grad_norm": 0.1180076003074646, "learning_rate": 0.000191156546372706, "loss": 7.9111, "step": 5020 }, { "epoch": 0.6180452978828164, "grad_norm": 0.08277909457683563, "learning_rate": 0.00019109496243379726, "loss": 7.7465, "step": 5021 }, { "epoch": 0.6181683899556869, "grad_norm": 0.1081719696521759, "learning_rate": 0.00019103337849488852, "loss": 7.7375, "step": 5022 }, { "epoch": 0.6182914820285573, "grad_norm": 0.09092959761619568, "learning_rate": 0.0001909717945559798, "loss": 7.59, "step": 5023 }, { "epoch": 0.6184145741014279, "grad_norm": 0.08224336802959442, "learning_rate": 0.00019091021061707107, "loss": 7.7927, "step": 5024 }, { "epoch": 0.6185376661742984, "grad_norm": 0.1316891759634018, "learning_rate": 0.00019084862667816233, "loss": 7.3428, "step": 5025 }, { "epoch": 0.6186607582471689, "grad_norm": 0.11013557016849518, "learning_rate": 0.00019078704273925358, "loss": 7.4165, "step": 5026 }, { "epoch": 
0.6187838503200394, "grad_norm": 0.08815232664346695, "learning_rate": 0.00019072545880034487, "loss": 7.5893, "step": 5027 }, { "epoch": 0.6189069423929099, "grad_norm": 0.10038752853870392, "learning_rate": 0.00019066387486143613, "loss": 8.1733, "step": 5028 }, { "epoch": 0.6190300344657804, "grad_norm": 0.34769976139068604, "learning_rate": 0.0001906022909225274, "loss": 9.0714, "step": 5029 }, { "epoch": 0.619153126538651, "grad_norm": 0.09376956522464752, "learning_rate": 0.00019054070698361868, "loss": 7.38, "step": 5030 }, { "epoch": 0.6192762186115214, "grad_norm": 0.16388383507728577, "learning_rate": 0.00019047912304470994, "loss": 8.1164, "step": 5031 }, { "epoch": 0.6193993106843919, "grad_norm": 0.06487875431776047, "learning_rate": 0.0001904175391058012, "loss": 7.5036, "step": 5032 }, { "epoch": 0.6195224027572624, "grad_norm": 0.11445873975753784, "learning_rate": 0.00019035595516689248, "loss": 7.3006, "step": 5033 }, { "epoch": 0.619645494830133, "grad_norm": 0.07271477580070496, "learning_rate": 0.00019029437122798374, "loss": 7.716, "step": 5034 }, { "epoch": 0.6197685869030034, "grad_norm": 0.08533739298582077, "learning_rate": 0.000190232787289075, "loss": 7.4669, "step": 5035 }, { "epoch": 0.6198916789758739, "grad_norm": 0.07171288132667542, "learning_rate": 0.00019017120335016626, "loss": 7.3892, "step": 5036 }, { "epoch": 0.6200147710487445, "grad_norm": 0.06303702294826508, "learning_rate": 0.00019010961941125755, "loss": 7.4023, "step": 5037 }, { "epoch": 0.620137863121615, "grad_norm": 0.07850558310747147, "learning_rate": 0.0001900480354723488, "loss": 7.5136, "step": 5038 }, { "epoch": 0.6202609551944854, "grad_norm": 0.1766311079263687, "learning_rate": 0.00018998645153344007, "loss": 7.5686, "step": 5039 }, { "epoch": 0.620384047267356, "grad_norm": 0.35145676136016846, "learning_rate": 0.00018992486759453135, "loss": 9.2357, "step": 5040 }, { "epoch": 0.6205071393402265, "grad_norm": 0.17633146047592163, "learning_rate": 
0.0001898632836556226, "loss": 7.7493, "step": 5041 }, { "epoch": 0.620630231413097, "grad_norm": 0.07186256349086761, "learning_rate": 0.00018980169971671387, "loss": 7.3667, "step": 5042 }, { "epoch": 0.6207533234859675, "grad_norm": 0.15458735823631287, "learning_rate": 0.00018974011577780513, "loss": 7.9052, "step": 5043 }, { "epoch": 0.620876415558838, "grad_norm": 0.06205077841877937, "learning_rate": 0.00018967853183889642, "loss": 7.492, "step": 5044 }, { "epoch": 0.6209995076317085, "grad_norm": 0.20204587280750275, "learning_rate": 0.00018961694789998768, "loss": 8.3074, "step": 5045 }, { "epoch": 0.6211225997045791, "grad_norm": 0.09589315205812454, "learning_rate": 0.00018955536396107894, "loss": 7.4963, "step": 5046 }, { "epoch": 0.6212456917774495, "grad_norm": 0.08212383836507797, "learning_rate": 0.00018949378002217022, "loss": 7.9992, "step": 5047 }, { "epoch": 0.62136878385032, "grad_norm": 0.11220318078994751, "learning_rate": 0.00018943219608326148, "loss": 7.6183, "step": 5048 }, { "epoch": 0.6214918759231906, "grad_norm": 0.09040732681751251, "learning_rate": 0.00018937061214435274, "loss": 7.9336, "step": 5049 }, { "epoch": 0.6216149679960611, "grad_norm": 0.10471785068511963, "learning_rate": 0.00018930902820544403, "loss": 8.2166, "step": 5050 }, { "epoch": 0.6217380600689315, "grad_norm": 0.24408936500549316, "learning_rate": 0.0001892474442665353, "loss": 8.7243, "step": 5051 }, { "epoch": 0.621861152141802, "grad_norm": 0.09516673535108566, "learning_rate": 0.00018918586032762655, "loss": 7.5584, "step": 5052 }, { "epoch": 0.6219842442146726, "grad_norm": 0.18453449010849, "learning_rate": 0.0001891242763887178, "loss": 7.5094, "step": 5053 }, { "epoch": 0.6221073362875431, "grad_norm": 0.1263701319694519, "learning_rate": 0.0001890626924498091, "loss": 7.459, "step": 5054 }, { "epoch": 0.6222304283604135, "grad_norm": 0.07689391076564789, "learning_rate": 0.00018900110851090035, "loss": 7.5619, "step": 5055 }, { "epoch": 
0.6223535204332841, "grad_norm": 0.07667049765586853, "learning_rate": 0.0001889395245719916, "loss": 7.6868, "step": 5056 }, { "epoch": 0.6224766125061546, "grad_norm": 0.08165843039751053, "learning_rate": 0.0001888779406330829, "loss": 7.5603, "step": 5057 }, { "epoch": 0.6225997045790251, "grad_norm": 0.09207708388566971, "learning_rate": 0.00018881635669417416, "loss": 7.6, "step": 5058 }, { "epoch": 0.6227227966518957, "grad_norm": 0.14047200977802277, "learning_rate": 0.00018875477275526542, "loss": 7.773, "step": 5059 }, { "epoch": 0.6228458887247661, "grad_norm": 0.0740320011973381, "learning_rate": 0.0001886931888163567, "loss": 7.4019, "step": 5060 }, { "epoch": 0.6229689807976366, "grad_norm": 0.12344365566968918, "learning_rate": 0.00018863160487744797, "loss": 8.2698, "step": 5061 }, { "epoch": 0.6230920728705072, "grad_norm": 0.07778201252222061, "learning_rate": 0.00018857002093853922, "loss": 7.5317, "step": 5062 }, { "epoch": 0.6232151649433777, "grad_norm": 0.10660771280527115, "learning_rate": 0.00018850843699963048, "loss": 8.0099, "step": 5063 }, { "epoch": 0.6233382570162481, "grad_norm": 0.1626492440700531, "learning_rate": 0.00018844685306072177, "loss": 8.4866, "step": 5064 }, { "epoch": 0.6234613490891187, "grad_norm": 0.11629030853509903, "learning_rate": 0.00018838526912181303, "loss": 7.452, "step": 5065 }, { "epoch": 0.6235844411619892, "grad_norm": 0.13595077395439148, "learning_rate": 0.0001883236851829043, "loss": 8.2949, "step": 5066 }, { "epoch": 0.6237075332348597, "grad_norm": 0.1453339010477066, "learning_rate": 0.00018826210124399558, "loss": 7.7866, "step": 5067 }, { "epoch": 0.6238306253077301, "grad_norm": 0.13545015454292297, "learning_rate": 0.00018820051730508684, "loss": 7.7329, "step": 5068 }, { "epoch": 0.6239537173806007, "grad_norm": 0.2256348431110382, "learning_rate": 0.0001881389333661781, "loss": 7.3152, "step": 5069 }, { "epoch": 0.6240768094534712, "grad_norm": 0.1000046357512474, "learning_rate": 
0.00018807734942726935, "loss": 7.6917, "step": 5070 }, { "epoch": 0.6241999015263417, "grad_norm": 0.1200985386967659, "learning_rate": 0.00018801576548836064, "loss": 7.5489, "step": 5071 }, { "epoch": 0.6243229935992122, "grad_norm": 0.08248646557331085, "learning_rate": 0.0001879541815494519, "loss": 7.9787, "step": 5072 }, { "epoch": 0.6244460856720827, "grad_norm": 0.115719735622406, "learning_rate": 0.00018789259761054316, "loss": 8.2148, "step": 5073 }, { "epoch": 0.6245691777449532, "grad_norm": 0.23453949391841888, "learning_rate": 0.00018783101367163445, "loss": 8.692, "step": 5074 }, { "epoch": 0.6246922698178238, "grad_norm": 0.14288221299648285, "learning_rate": 0.0001877694297327257, "loss": 8.0582, "step": 5075 }, { "epoch": 0.6248153618906942, "grad_norm": 0.07832474261522293, "learning_rate": 0.00018770784579381697, "loss": 7.5101, "step": 5076 }, { "epoch": 0.6249384539635647, "grad_norm": 0.13241569697856903, "learning_rate": 0.00018764626185490825, "loss": 8.2745, "step": 5077 }, { "epoch": 0.6250615460364353, "grad_norm": 0.6668148040771484, "learning_rate": 0.0001875846779159995, "loss": 10.6728, "step": 5078 }, { "epoch": 0.6251846381093058, "grad_norm": 0.06959501653909683, "learning_rate": 0.00018752309397709077, "loss": 7.8254, "step": 5079 }, { "epoch": 0.6253077301821762, "grad_norm": 0.25240957736968994, "learning_rate": 0.00018746151003818203, "loss": 8.8236, "step": 5080 }, { "epoch": 0.6254308222550468, "grad_norm": 0.06792020797729492, "learning_rate": 0.00018739992609927332, "loss": 7.9351, "step": 5081 }, { "epoch": 0.6255539143279173, "grad_norm": 0.13030347228050232, "learning_rate": 0.00018733834216036458, "loss": 7.4087, "step": 5082 }, { "epoch": 0.6256770064007878, "grad_norm": 0.22140495479106903, "learning_rate": 0.00018727675822145584, "loss": 7.2425, "step": 5083 }, { "epoch": 0.6258000984736583, "grad_norm": 0.1439809650182724, "learning_rate": 0.00018721517428254712, "loss": 8.9527, "step": 5084 }, { "epoch": 
0.6259231905465288, "grad_norm": 0.14464102685451508, "learning_rate": 0.00018715359034363838, "loss": 7.6234, "step": 5085 }, { "epoch": 0.6260462826193993, "grad_norm": 0.14218711853027344, "learning_rate": 0.00018709200640472964, "loss": 7.6792, "step": 5086 }, { "epoch": 0.6261693746922699, "grad_norm": 0.3272309899330139, "learning_rate": 0.00018703042246582093, "loss": 9.179, "step": 5087 }, { "epoch": 0.6262924667651403, "grad_norm": 0.14273308217525482, "learning_rate": 0.0001869688385269122, "loss": 7.3808, "step": 5088 }, { "epoch": 0.6264155588380108, "grad_norm": null, "learning_rate": 0.00018690725458800345, "loss": 7.424, "step": 5089 }, { "epoch": 0.6265386509108813, "grad_norm": 0.12373778969049454, "learning_rate": 0.0001868456706490947, "loss": 7.3768, "step": 5090 }, { "epoch": 0.6266617429837519, "grad_norm": 0.11878763139247894, "learning_rate": 0.000186784086710186, "loss": 7.5971, "step": 5091 }, { "epoch": 0.6267848350566223, "grad_norm": 0.10384580492973328, "learning_rate": 0.00018672250277127725, "loss": 8.0431, "step": 5092 }, { "epoch": 0.6269079271294928, "grad_norm": 0.2087331861257553, "learning_rate": 0.0001866609188323685, "loss": 7.5279, "step": 5093 }, { "epoch": 0.6270310192023634, "grad_norm": 0.198257714509964, "learning_rate": 0.0001865993348934598, "loss": 7.6156, "step": 5094 }, { "epoch": 0.6271541112752339, "grad_norm": 0.12094493955373764, "learning_rate": 0.00018653775095455106, "loss": 7.6991, "step": 5095 }, { "epoch": 0.6272772033481043, "grad_norm": 0.15792718529701233, "learning_rate": 0.00018647616701564232, "loss": 7.6855, "step": 5096 }, { "epoch": 0.6274002954209749, "grad_norm": 0.1284489929676056, "learning_rate": 0.00018641458307673358, "loss": 7.5614, "step": 5097 }, { "epoch": 0.6275233874938454, "grad_norm": 0.0829559862613678, "learning_rate": 0.00018635299913782486, "loss": 7.7735, "step": 5098 }, { "epoch": 0.6276464795667159, "grad_norm": 0.1461126059293747, "learning_rate": 0.00018629141519891612, 
"loss": 7.2956, "step": 5099 }, { "epoch": 0.6277695716395865, "grad_norm": 0.10653554648160934, "learning_rate": 0.00018622983126000738, "loss": 7.6703, "step": 5100 }, { "epoch": 0.6278926637124569, "grad_norm": 0.08545999974012375, "learning_rate": 0.00018616824732109867, "loss": 7.4579, "step": 5101 }, { "epoch": 0.6280157557853274, "grad_norm": 0.086823008954525, "learning_rate": 0.00018610666338218993, "loss": 7.3717, "step": 5102 }, { "epoch": 0.628138847858198, "grad_norm": 0.11031249165534973, "learning_rate": 0.0001860450794432812, "loss": 7.9182, "step": 5103 }, { "epoch": 0.6282619399310685, "grad_norm": 0.0792149230837822, "learning_rate": 0.00018598349550437248, "loss": 7.6817, "step": 5104 }, { "epoch": 0.6283850320039389, "grad_norm": 0.09666156768798828, "learning_rate": 0.00018592191156546374, "loss": 7.7063, "step": 5105 }, { "epoch": 0.6285081240768094, "grad_norm": 0.1982063502073288, "learning_rate": 0.000185860327626555, "loss": 7.9836, "step": 5106 }, { "epoch": 0.62863121614968, "grad_norm": 0.16489090025424957, "learning_rate": 0.00018579874368764625, "loss": 8.0637, "step": 5107 }, { "epoch": 0.6287543082225505, "grad_norm": 0.07953394204378128, "learning_rate": 0.00018573715974873754, "loss": 8.1746, "step": 5108 }, { "epoch": 0.6288774002954209, "grad_norm": 0.18486039340496063, "learning_rate": 0.0001856755758098288, "loss": 7.2585, "step": 5109 }, { "epoch": 0.6290004923682915, "grad_norm": 0.14399287104606628, "learning_rate": 0.00018561399187092006, "loss": 7.6569, "step": 5110 }, { "epoch": 0.629123584441162, "grad_norm": 0.13232481479644775, "learning_rate": 0.00018555240793201135, "loss": 7.4741, "step": 5111 }, { "epoch": 0.6292466765140325, "grad_norm": 0.1747623234987259, "learning_rate": 0.0001854908239931026, "loss": 7.4179, "step": 5112 }, { "epoch": 0.629369768586903, "grad_norm": 0.20467880368232727, "learning_rate": 0.00018542924005419387, "loss": 7.6426, "step": 5113 }, { "epoch": 0.6294928606597735, "grad_norm": 
0.18574704229831696, "learning_rate": 0.00018536765611528512, "loss": 8.2939, "step": 5114 }, { "epoch": 0.629615952732644, "grad_norm": 0.08862185478210449, "learning_rate": 0.0001853060721763764, "loss": 7.3905, "step": 5115 }, { "epoch": 0.6297390448055146, "grad_norm": 0.11896088719367981, "learning_rate": 0.00018524448823746767, "loss": 7.4401, "step": 5116 }, { "epoch": 0.629862136878385, "grad_norm": 0.09072189778089523, "learning_rate": 0.00018518290429855893, "loss": 7.7947, "step": 5117 }, { "epoch": 0.6299852289512555, "grad_norm": 0.18670278787612915, "learning_rate": 0.00018512132035965022, "loss": 7.3939, "step": 5118 }, { "epoch": 0.6301083210241261, "grad_norm": 0.0966922789812088, "learning_rate": 0.00018505973642074148, "loss": 8.055, "step": 5119 }, { "epoch": 0.6302314130969966, "grad_norm": 0.10272647440433502, "learning_rate": 0.00018499815248183274, "loss": 7.6377, "step": 5120 }, { "epoch": 0.630354505169867, "grad_norm": 0.10298725217580795, "learning_rate": 0.00018493656854292402, "loss": 7.4181, "step": 5121 }, { "epoch": 0.6304775972427376, "grad_norm": 0.27141937613487244, "learning_rate": 0.00018487498460401528, "loss": 8.6452, "step": 5122 }, { "epoch": 0.6306006893156081, "grad_norm": 0.10457223653793335, "learning_rate": 0.00018481340066510654, "loss": 7.6551, "step": 5123 }, { "epoch": 0.6307237813884786, "grad_norm": 0.0885467380285263, "learning_rate": 0.0001847518167261978, "loss": 7.9358, "step": 5124 }, { "epoch": 0.630846873461349, "grad_norm": 0.11323549598455429, "learning_rate": 0.0001846902327872891, "loss": 7.7949, "step": 5125 }, { "epoch": 0.6309699655342196, "grad_norm": 0.1917608380317688, "learning_rate": 0.00018462864884838035, "loss": 7.3188, "step": 5126 }, { "epoch": 0.6310930576070901, "grad_norm": 0.09421802312135696, "learning_rate": 0.0001845670649094716, "loss": 7.4833, "step": 5127 }, { "epoch": 0.6312161496799606, "grad_norm": 0.11242833733558655, "learning_rate": 0.0001845054809705629, "loss": 7.7208, 
"step": 5128 }, { "epoch": 0.6313392417528311, "grad_norm": 0.15790341794490814, "learning_rate": 0.00018444389703165415, "loss": 8.0555, "step": 5129 }, { "epoch": 0.6314623338257016, "grad_norm": 0.08148253709077835, "learning_rate": 0.0001843823130927454, "loss": 7.4631, "step": 5130 }, { "epoch": 0.6315854258985721, "grad_norm": 0.09759381413459778, "learning_rate": 0.0001843207291538367, "loss": 7.4505, "step": 5131 }, { "epoch": 0.6317085179714427, "grad_norm": 0.2026662975549698, "learning_rate": 0.00018425914521492796, "loss": 7.4425, "step": 5132 }, { "epoch": 0.6318316100443131, "grad_norm": 0.1831940859556198, "learning_rate": 0.00018419756127601922, "loss": 8.4225, "step": 5133 }, { "epoch": 0.6319547021171836, "grad_norm": 0.150978222489357, "learning_rate": 0.00018413597733711048, "loss": 8.1502, "step": 5134 }, { "epoch": 0.6320777941900542, "grad_norm": 0.22311128675937653, "learning_rate": 0.00018407439339820176, "loss": 9.1176, "step": 5135 }, { "epoch": 0.6322008862629247, "grad_norm": 0.10573441535234451, "learning_rate": 0.00018401280945929302, "loss": 8.1136, "step": 5136 }, { "epoch": 0.6323239783357951, "grad_norm": 0.1943885087966919, "learning_rate": 0.00018395122552038428, "loss": 7.4761, "step": 5137 }, { "epoch": 0.6324470704086657, "grad_norm": 0.13576383888721466, "learning_rate": 0.00018388964158147557, "loss": 7.8519, "step": 5138 }, { "epoch": 0.6325701624815362, "grad_norm": 0.1858345866203308, "learning_rate": 0.00018382805764256683, "loss": 7.497, "step": 5139 }, { "epoch": 0.6326932545544067, "grad_norm": 0.11043853312730789, "learning_rate": 0.0001837664737036581, "loss": 7.5176, "step": 5140 }, { "epoch": 0.6328163466272771, "grad_norm": 0.13906675577163696, "learning_rate": 0.00018370488976474935, "loss": 8.0885, "step": 5141 }, { "epoch": 0.6329394387001477, "grad_norm": 0.1377985179424286, "learning_rate": 0.00018364330582584063, "loss": 7.5695, "step": 5142 }, { "epoch": 0.6330625307730182, "grad_norm": 
0.17740070819854736, "learning_rate": 0.0001835817218869319, "loss": 7.6059, "step": 5143 }, { "epoch": 0.6331856228458888, "grad_norm": 0.19000236690044403, "learning_rate": 0.00018352013794802315, "loss": 7.8572, "step": 5144 }, { "epoch": 0.6333087149187593, "grad_norm": 0.08990421146154404, "learning_rate": 0.00018345855400911444, "loss": 7.5698, "step": 5145 }, { "epoch": 0.6334318069916297, "grad_norm": 0.11150739341974258, "learning_rate": 0.0001833969700702057, "loss": 7.4371, "step": 5146 }, { "epoch": 0.6335548990645002, "grad_norm": 0.12010860443115234, "learning_rate": 0.00018333538613129696, "loss": 7.9865, "step": 5147 }, { "epoch": 0.6336779911373708, "grad_norm": 0.22347180545330048, "learning_rate": 0.00018327380219238825, "loss": 8.8972, "step": 5148 }, { "epoch": 0.6338010832102413, "grad_norm": 0.15747405588626862, "learning_rate": 0.0001832122182534795, "loss": 7.4634, "step": 5149 }, { "epoch": 0.6339241752831117, "grad_norm": 0.15904369950294495, "learning_rate": 0.00018315063431457076, "loss": 7.5312, "step": 5150 }, { "epoch": 0.6340472673559823, "grad_norm": 0.14799872040748596, "learning_rate": 0.00018308905037566202, "loss": 8.6056, "step": 5151 }, { "epoch": 0.6341703594288528, "grad_norm": 0.1203523799777031, "learning_rate": 0.0001830274664367533, "loss": 7.587, "step": 5152 }, { "epoch": 0.6342934515017233, "grad_norm": 0.06946973502635956, "learning_rate": 0.00018296588249784457, "loss": 7.5723, "step": 5153 }, { "epoch": 0.6344165435745938, "grad_norm": 0.08038195222616196, "learning_rate": 0.00018290429855893583, "loss": 7.7321, "step": 5154 }, { "epoch": 0.6345396356474643, "grad_norm": 0.06868451088666916, "learning_rate": 0.00018284271462002712, "loss": 7.4565, "step": 5155 }, { "epoch": 0.6346627277203348, "grad_norm": 0.12693357467651367, "learning_rate": 0.00018278113068111838, "loss": 7.8139, "step": 5156 }, { "epoch": 0.6347858197932054, "grad_norm": 0.09921041131019592, "learning_rate": 0.00018271954674220964, "loss": 
7.4348, "step": 5157 }, { "epoch": 0.6349089118660758, "grad_norm": 0.10132414102554321, "learning_rate": 0.00018265796280330092, "loss": 7.9032, "step": 5158 }, { "epoch": 0.6350320039389463, "grad_norm": 0.08506953716278076, "learning_rate": 0.00018259637886439218, "loss": 7.4577, "step": 5159 }, { "epoch": 0.6351550960118169, "grad_norm": 0.07438009977340698, "learning_rate": 0.00018253479492548344, "loss": 7.8273, "step": 5160 }, { "epoch": 0.6352781880846874, "grad_norm": 0.5985499620437622, "learning_rate": 0.0001824732109865747, "loss": 8.8635, "step": 5161 }, { "epoch": 0.6354012801575578, "grad_norm": 0.10458862781524658, "learning_rate": 0.000182411627047666, "loss": 7.8616, "step": 5162 }, { "epoch": 0.6355243722304283, "grad_norm": 0.09909988939762115, "learning_rate": 0.00018235004310875725, "loss": 7.6769, "step": 5163 }, { "epoch": 0.6356474643032989, "grad_norm": 0.08905137330293655, "learning_rate": 0.0001822884591698485, "loss": 7.6684, "step": 5164 }, { "epoch": 0.6357705563761694, "grad_norm": 0.09106910973787308, "learning_rate": 0.0001822268752309398, "loss": 7.6151, "step": 5165 }, { "epoch": 0.6358936484490398, "grad_norm": 0.17960815131664276, "learning_rate": 0.00018216529129203105, "loss": 8.5136, "step": 5166 }, { "epoch": 0.6360167405219104, "grad_norm": 0.05734029412269592, "learning_rate": 0.0001821037073531223, "loss": 7.6597, "step": 5167 }, { "epoch": 0.6361398325947809, "grad_norm": 0.15171988308429718, "learning_rate": 0.00018204212341421357, "loss": 8.3058, "step": 5168 }, { "epoch": 0.6362629246676514, "grad_norm": 0.10765612125396729, "learning_rate": 0.00018198053947530486, "loss": 7.8848, "step": 5169 }, { "epoch": 0.6363860167405219, "grad_norm": 0.11001315712928772, "learning_rate": 0.00018191895553639612, "loss": 8.4043, "step": 5170 }, { "epoch": 0.6365091088133924, "grad_norm": 0.15504112839698792, "learning_rate": 0.00018185737159748738, "loss": 8.407, "step": 5171 }, { "epoch": 0.6366322008862629, "grad_norm": 
0.11808166652917862, "learning_rate": 0.00018179578765857866, "loss": 8.5151, "step": 5172 }, { "epoch": 0.6367552929591335, "grad_norm": 0.25209811329841614, "learning_rate": 0.00018173420371966992, "loss": 7.5321, "step": 5173 }, { "epoch": 0.6368783850320039, "grad_norm": 0.2240794450044632, "learning_rate": 0.00018167261978076118, "loss": 7.5577, "step": 5174 }, { "epoch": 0.6370014771048744, "grad_norm": 0.14537064731121063, "learning_rate": 0.00018161103584185247, "loss": 8.9348, "step": 5175 }, { "epoch": 0.637124569177745, "grad_norm": 0.25272488594055176, "learning_rate": 0.00018154945190294373, "loss": 7.5762, "step": 5176 }, { "epoch": 0.6372476612506155, "grad_norm": 0.2347702831029892, "learning_rate": 0.000181487867964035, "loss": 7.8328, "step": 5177 }, { "epoch": 0.6373707533234859, "grad_norm": 0.19425636529922485, "learning_rate": 0.00018142628402512625, "loss": 7.5127, "step": 5178 }, { "epoch": 0.6374938453963565, "grad_norm": 0.13486000895500183, "learning_rate": 0.00018136470008621753, "loss": 8.0691, "step": 5179 }, { "epoch": 0.637616937469227, "grad_norm": 0.13269628584384918, "learning_rate": 0.0001813031161473088, "loss": 7.2961, "step": 5180 }, { "epoch": 0.6377400295420975, "grad_norm": 0.08630036562681198, "learning_rate": 0.00018124153220840005, "loss": 7.4207, "step": 5181 }, { "epoch": 0.6378631216149679, "grad_norm": 0.11113615334033966, "learning_rate": 0.00018117994826949134, "loss": 7.3737, "step": 5182 }, { "epoch": 0.6379862136878385, "grad_norm": 0.1297132819890976, "learning_rate": 0.0001811183643305826, "loss": 7.5444, "step": 5183 }, { "epoch": 0.638109305760709, "grad_norm": 0.18653246760368347, "learning_rate": 0.00018105678039167386, "loss": 7.5168, "step": 5184 }, { "epoch": 0.6382323978335795, "grad_norm": 0.10358800739049911, "learning_rate": 0.00018099519645276515, "loss": 7.3075, "step": 5185 }, { "epoch": 0.6383554899064501, "grad_norm": 0.10709056258201599, "learning_rate": 0.0001809336125138564, "loss": 7.5902, 
"step": 5186 }, { "epoch": 0.6384785819793205, "grad_norm": 0.06331504136323929, "learning_rate": 0.00018087202857494766, "loss": 7.8544, "step": 5187 }, { "epoch": 0.638601674052191, "grad_norm": 0.07877358049154282, "learning_rate": 0.00018081044463603892, "loss": 8.0043, "step": 5188 }, { "epoch": 0.6387247661250616, "grad_norm": 0.356929212808609, "learning_rate": 0.0001807488606971302, "loss": 7.1542, "step": 5189 }, { "epoch": 0.6388478581979321, "grad_norm": 0.19438278675079346, "learning_rate": 0.00018068727675822147, "loss": 7.6407, "step": 5190 }, { "epoch": 0.6389709502708025, "grad_norm": 0.198442742228508, "learning_rate": 0.00018062569281931273, "loss": 8.8514, "step": 5191 }, { "epoch": 0.6390940423436731, "grad_norm": 0.19674097001552582, "learning_rate": 0.00018056410888040402, "loss": 7.4108, "step": 5192 }, { "epoch": 0.6392171344165436, "grad_norm": 0.17306280136108398, "learning_rate": 0.00018050252494149528, "loss": 7.3769, "step": 5193 }, { "epoch": 0.6393402264894141, "grad_norm": 0.11530034989118576, "learning_rate": 0.00018044094100258653, "loss": 7.6769, "step": 5194 }, { "epoch": 0.6394633185622846, "grad_norm": 0.10126080363988876, "learning_rate": 0.0001803793570636778, "loss": 7.4129, "step": 5195 }, { "epoch": 0.6395864106351551, "grad_norm": 0.08975058794021606, "learning_rate": 0.00018031777312476908, "loss": 7.4383, "step": 5196 }, { "epoch": 0.6397095027080256, "grad_norm": 0.11126309633255005, "learning_rate": 0.00018025618918586034, "loss": 7.4998, "step": 5197 }, { "epoch": 0.6398325947808962, "grad_norm": 0.14645099639892578, "learning_rate": 0.0001801946052469516, "loss": 7.5369, "step": 5198 }, { "epoch": 0.6399556868537666, "grad_norm": 0.2447068989276886, "learning_rate": 0.0001801330213080429, "loss": 8.1353, "step": 5199 }, { "epoch": 0.6400787789266371, "grad_norm": 0.0845671147108078, "learning_rate": 0.00018007143736913415, "loss": 7.4349, "step": 5200 }, { "epoch": 0.6402018709995076, "grad_norm": 
0.36759132146835327, "learning_rate": 0.0001800098534302254, "loss": 9.042, "step": 5201 }, { "epoch": 0.6403249630723782, "grad_norm": 0.10884305834770203, "learning_rate": 0.0001799482694913167, "loss": 7.4648, "step": 5202 }, { "epoch": 0.6404480551452486, "grad_norm": 0.20159009099006653, "learning_rate": 0.00017988668555240795, "loss": 8.0875, "step": 5203 }, { "epoch": 0.6405711472181191, "grad_norm": 0.11257980018854141, "learning_rate": 0.0001798251016134992, "loss": 7.9055, "step": 5204 }, { "epoch": 0.6406942392909897, "grad_norm": 0.1867596060037613, "learning_rate": 0.00017976351767459047, "loss": 7.4539, "step": 5205 }, { "epoch": 0.6408173313638602, "grad_norm": 0.10918845981359482, "learning_rate": 0.00017970193373568176, "loss": 7.7174, "step": 5206 }, { "epoch": 0.6409404234367306, "grad_norm": 0.11678868532180786, "learning_rate": 0.00017964034979677302, "loss": 7.6717, "step": 5207 }, { "epoch": 0.6410635155096012, "grad_norm": 0.24408964812755585, "learning_rate": 0.00017957876585786428, "loss": 7.0762, "step": 5208 }, { "epoch": 0.6411866075824717, "grad_norm": 0.06973517686128616, "learning_rate": 0.00017951718191895556, "loss": 7.567, "step": 5209 }, { "epoch": 0.6413096996553422, "grad_norm": 0.08232921361923218, "learning_rate": 0.00017945559798004682, "loss": 7.3264, "step": 5210 }, { "epoch": 0.6414327917282127, "grad_norm": 0.14261595904827118, "learning_rate": 0.00017939401404113808, "loss": 7.4417, "step": 5211 }, { "epoch": 0.6415558838010832, "grad_norm": 0.36184272170066833, "learning_rate": 0.00017933243010222934, "loss": 8.7432, "step": 5212 }, { "epoch": 0.6416789758739537, "grad_norm": 0.14673733711242676, "learning_rate": 0.00017927084616332063, "loss": 7.437, "step": 5213 }, { "epoch": 0.6418020679468243, "grad_norm": 0.19357503950595856, "learning_rate": 0.0001792092622244119, "loss": 7.909, "step": 5214 }, { "epoch": 0.6419251600196947, "grad_norm": 0.07398469001054764, "learning_rate": 0.00017914767828550315, "loss": 
7.2374, "step": 5215 }, { "epoch": 0.6420482520925652, "grad_norm": 0.13590429723262787, "learning_rate": 0.00017908609434659443, "loss": 8.1701, "step": 5216 }, { "epoch": 0.6421713441654358, "grad_norm": 0.15523099899291992, "learning_rate": 0.0001790245104076857, "loss": 7.2223, "step": 5217 }, { "epoch": 0.6422944362383063, "grad_norm": 0.14085698127746582, "learning_rate": 0.00017896292646877695, "loss": 7.3823, "step": 5218 }, { "epoch": 0.6424175283111767, "grad_norm": 0.07386387139558792, "learning_rate": 0.00017890134252986824, "loss": 7.7833, "step": 5219 }, { "epoch": 0.6425406203840472, "grad_norm": 0.12176357954740524, "learning_rate": 0.0001788397585909595, "loss": 8.4517, "step": 5220 }, { "epoch": 0.6426637124569178, "grad_norm": 0.09835074841976166, "learning_rate": 0.00017877817465205076, "loss": 7.543, "step": 5221 }, { "epoch": 0.6427868045297883, "grad_norm": 0.17724542319774628, "learning_rate": 0.00017871659071314202, "loss": 7.6714, "step": 5222 }, { "epoch": 0.6429098966026587, "grad_norm": 0.08610819280147552, "learning_rate": 0.0001786550067742333, "loss": 8.0323, "step": 5223 }, { "epoch": 0.6430329886755293, "grad_norm": 0.05912790447473526, "learning_rate": 0.00017859342283532456, "loss": 7.4136, "step": 5224 }, { "epoch": 0.6431560807483998, "grad_norm": 0.11059446632862091, "learning_rate": 0.0001785318388964158, "loss": 7.7497, "step": 5225 }, { "epoch": 0.6432791728212703, "grad_norm": 0.4561977684497833, "learning_rate": 0.00017847025495750708, "loss": 9.5719, "step": 5226 }, { "epoch": 0.6434022648941408, "grad_norm": 0.0911475345492363, "learning_rate": 0.00017840867101859834, "loss": 7.3454, "step": 5227 }, { "epoch": 0.6435253569670113, "grad_norm": 0.07331856340169907, "learning_rate": 0.0001783470870796896, "loss": 7.3827, "step": 5228 }, { "epoch": 0.6436484490398818, "grad_norm": 0.11800709366798401, "learning_rate": 0.0001782855031407809, "loss": 7.6608, "step": 5229 }, { "epoch": 0.6437715411127524, "grad_norm": 
0.07182526588439941, "learning_rate": 0.00017822391920187215, "loss": 7.3921, "step": 5230 }, { "epoch": 0.6438946331856229, "grad_norm": 0.06806154549121857, "learning_rate": 0.0001781623352629634, "loss": 7.5591, "step": 5231 }, { "epoch": 0.6440177252584933, "grad_norm": 0.08966177701950073, "learning_rate": 0.00017810075132405467, "loss": 7.6538, "step": 5232 }, { "epoch": 0.6441408173313639, "grad_norm": 0.22626939415931702, "learning_rate": 0.00017803916738514595, "loss": 8.404, "step": 5233 }, { "epoch": 0.6442639094042344, "grad_norm": 0.1033947616815567, "learning_rate": 0.0001779775834462372, "loss": 7.6018, "step": 5234 }, { "epoch": 0.6443870014771049, "grad_norm": 0.12351051717996597, "learning_rate": 0.00017791599950732847, "loss": 7.4303, "step": 5235 }, { "epoch": 0.6445100935499753, "grad_norm": 0.0795297920703888, "learning_rate": 0.00017785441556841976, "loss": 7.3673, "step": 5236 }, { "epoch": 0.6446331856228459, "grad_norm": 0.13220854103565216, "learning_rate": 0.00017779283162951102, "loss": 7.8322, "step": 5237 }, { "epoch": 0.6447562776957164, "grad_norm": 0.17215456068515778, "learning_rate": 0.00017773124769060228, "loss": 6.9906, "step": 5238 }, { "epoch": 0.644879369768587, "grad_norm": 0.1712835282087326, "learning_rate": 0.00017766966375169354, "loss": 8.0717, "step": 5239 }, { "epoch": 0.6450024618414574, "grad_norm": 0.15664686262607574, "learning_rate": 0.00017760807981278482, "loss": 8.1, "step": 5240 }, { "epoch": 0.6451255539143279, "grad_norm": 0.09964606910943985, "learning_rate": 0.00017754649587387608, "loss": 7.4436, "step": 5241 }, { "epoch": 0.6452486459871984, "grad_norm": 0.14838945865631104, "learning_rate": 0.00017748491193496734, "loss": 7.7229, "step": 5242 }, { "epoch": 0.645371738060069, "grad_norm": 0.11494069546461105, "learning_rate": 0.00017742332799605863, "loss": 7.8772, "step": 5243 }, { "epoch": 0.6454948301329394, "grad_norm": 0.1491575539112091, "learning_rate": 0.0001773617440571499, "loss": 7.9618, 
"step": 5244 }, { "epoch": 0.6456179222058099, "grad_norm": 0.12213092297315598, "learning_rate": 0.00017730016011824115, "loss": 7.9512, "step": 5245 }, { "epoch": 0.6457410142786805, "grad_norm": 0.1817307472229004, "learning_rate": 0.00017723857617933244, "loss": 7.3104, "step": 5246 }, { "epoch": 0.645864106351551, "grad_norm": 0.12366645038127899, "learning_rate": 0.0001771769922404237, "loss": 7.6181, "step": 5247 }, { "epoch": 0.6459871984244214, "grad_norm": 0.10065510869026184, "learning_rate": 0.00017711540830151495, "loss": 7.5641, "step": 5248 }, { "epoch": 0.646110290497292, "grad_norm": 0.09698314219713211, "learning_rate": 0.00017705382436260621, "loss": 7.552, "step": 5249 }, { "epoch": 0.6462333825701625, "grad_norm": 0.10573820024728775, "learning_rate": 0.0001769922404236975, "loss": 7.6025, "step": 5250 }, { "epoch": 0.646356474643033, "grad_norm": 0.4366416931152344, "learning_rate": 0.00017693065648478876, "loss": 9.5457, "step": 5251 }, { "epoch": 0.6464795667159035, "grad_norm": 0.1559775024652481, "learning_rate": 0.00017686907254588002, "loss": 8.2369, "step": 5252 }, { "epoch": 0.646602658788774, "grad_norm": 0.07745899260044098, "learning_rate": 0.0001768074886069713, "loss": 7.4824, "step": 5253 }, { "epoch": 0.6467257508616445, "grad_norm": 0.10009095817804337, "learning_rate": 0.00017674590466806257, "loss": 7.5874, "step": 5254 }, { "epoch": 0.6468488429345151, "grad_norm": 0.12676602602005005, "learning_rate": 0.00017668432072915382, "loss": 7.8026, "step": 5255 }, { "epoch": 0.6469719350073855, "grad_norm": 0.07354369014501572, "learning_rate": 0.00017662273679024508, "loss": 7.6657, "step": 5256 }, { "epoch": 0.647095027080256, "grad_norm": 0.09678672254085541, "learning_rate": 0.00017656115285133637, "loss": 7.8711, "step": 5257 }, { "epoch": 0.6472181191531265, "grad_norm": 0.1418711096048355, "learning_rate": 0.00017649956891242763, "loss": 7.5227, "step": 5258 }, { "epoch": 0.6473412112259971, "grad_norm": 0.16808785498142242, 
"learning_rate": 0.0001764379849735189, "loss": 7.3861, "step": 5259 }, { "epoch": 0.6474643032988675, "grad_norm": 0.14882925152778625, "learning_rate": 0.00017637640103461018, "loss": 7.5531, "step": 5260 }, { "epoch": 0.647587395371738, "grad_norm": 0.11850369721651077, "learning_rate": 0.00017631481709570144, "loss": 7.3903, "step": 5261 }, { "epoch": 0.6477104874446086, "grad_norm": 0.06326861679553986, "learning_rate": 0.0001762532331567927, "loss": 7.6153, "step": 5262 }, { "epoch": 0.6478335795174791, "grad_norm": 0.22400464117527008, "learning_rate": 0.00017619164921788398, "loss": 7.8636, "step": 5263 }, { "epoch": 0.6479566715903495, "grad_norm": 0.11187021434307098, "learning_rate": 0.00017613006527897524, "loss": 7.7235, "step": 5264 }, { "epoch": 0.6480797636632201, "grad_norm": 0.1306479573249817, "learning_rate": 0.0001760684813400665, "loss": 8.0102, "step": 5265 }, { "epoch": 0.6482028557360906, "grad_norm": 0.06928575038909912, "learning_rate": 0.00017600689740115776, "loss": 7.4824, "step": 5266 }, { "epoch": 0.6483259478089611, "grad_norm": 0.06536820530891418, "learning_rate": 0.00017594531346224905, "loss": 7.5697, "step": 5267 }, { "epoch": 0.6484490398818316, "grad_norm": 0.10603627562522888, "learning_rate": 0.0001758837295233403, "loss": 7.4502, "step": 5268 }, { "epoch": 0.6485721319547021, "grad_norm": 0.1305702030658722, "learning_rate": 0.00017582214558443157, "loss": 7.9865, "step": 5269 }, { "epoch": 0.6486952240275726, "grad_norm": 0.1367989033460617, "learning_rate": 0.00017576056164552285, "loss": 7.4351, "step": 5270 }, { "epoch": 0.6488183161004432, "grad_norm": 0.08304696530103683, "learning_rate": 0.0001756989777066141, "loss": 7.4989, "step": 5271 }, { "epoch": 0.6489414081733137, "grad_norm": 0.08848664909601212, "learning_rate": 0.00017563739376770537, "loss": 7.3202, "step": 5272 }, { "epoch": 0.6490645002461841, "grad_norm": 0.0697983130812645, "learning_rate": 0.00017557580982879666, "loss": 7.4073, "step": 5273 }, { 
"epoch": 0.6491875923190547, "grad_norm": 0.0913887470960617, "learning_rate": 0.00017551422588988792, "loss": 7.7262, "step": 5274 }, { "epoch": 0.6493106843919252, "grad_norm": 0.07623365521430969, "learning_rate": 0.00017545264195097918, "loss": 7.3043, "step": 5275 }, { "epoch": 0.6494337764647957, "grad_norm": 0.11323435604572296, "learning_rate": 0.00017539105801207044, "loss": 7.4914, "step": 5276 }, { "epoch": 0.6495568685376661, "grad_norm": 0.142653688788414, "learning_rate": 0.00017532947407316172, "loss": 7.5274, "step": 5277 }, { "epoch": 0.6496799606105367, "grad_norm": 0.1029205322265625, "learning_rate": 0.00017526789013425298, "loss": 7.6662, "step": 5278 }, { "epoch": 0.6498030526834072, "grad_norm": 0.07463844865560532, "learning_rate": 0.00017520630619534424, "loss": 7.5732, "step": 5279 }, { "epoch": 0.6499261447562777, "grad_norm": 0.06853317469358444, "learning_rate": 0.00017514472225643553, "loss": 7.4688, "step": 5280 }, { "epoch": 0.6500492368291482, "grad_norm": 0.0674576684832573, "learning_rate": 0.0001750831383175268, "loss": 7.9023, "step": 5281 }, { "epoch": 0.6501723289020187, "grad_norm": 0.11476194113492966, "learning_rate": 0.00017502155437861805, "loss": 7.921, "step": 5282 }, { "epoch": 0.6502954209748892, "grad_norm": 0.16217146813869476, "learning_rate": 0.0001749599704397093, "loss": 7.4297, "step": 5283 }, { "epoch": 0.6504185130477598, "grad_norm": 0.19490914046764374, "learning_rate": 0.0001748983865008006, "loss": 7.613, "step": 5284 }, { "epoch": 0.6505416051206302, "grad_norm": 0.1156824454665184, "learning_rate": 0.00017483680256189185, "loss": 7.5292, "step": 5285 }, { "epoch": 0.6506646971935007, "grad_norm": 0.1109086349606514, "learning_rate": 0.0001747752186229831, "loss": 7.4681, "step": 5286 }, { "epoch": 0.6507877892663713, "grad_norm": 0.1461983174085617, "learning_rate": 0.0001747136346840744, "loss": 7.873, "step": 5287 }, { "epoch": 0.6509108813392418, "grad_norm": 0.08843949437141418, "learning_rate": 
0.00017465205074516566, "loss": 7.6956, "step": 5288 }, { "epoch": 0.6510339734121122, "grad_norm": 0.09398074448108673, "learning_rate": 0.00017459046680625692, "loss": 7.7078, "step": 5289 }, { "epoch": 0.6511570654849828, "grad_norm": 0.13591136038303375, "learning_rate": 0.0001745288828673482, "loss": 7.6257, "step": 5290 }, { "epoch": 0.6512801575578533, "grad_norm": 0.11177407950162888, "learning_rate": 0.00017446729892843946, "loss": 7.5214, "step": 5291 }, { "epoch": 0.6514032496307238, "grad_norm": 0.1354358047246933, "learning_rate": 0.00017440571498953072, "loss": 8.0109, "step": 5292 }, { "epoch": 0.6515263417035942, "grad_norm": 0.2794124186038971, "learning_rate": 0.00017434413105062198, "loss": 8.8101, "step": 5293 }, { "epoch": 0.6516494337764648, "grad_norm": 0.11923026293516159, "learning_rate": 0.00017428254711171327, "loss": 7.3466, "step": 5294 }, { "epoch": 0.6517725258493353, "grad_norm": 0.1313410848379135, "learning_rate": 0.00017422096317280453, "loss": 7.4442, "step": 5295 }, { "epoch": 0.6518956179222059, "grad_norm": 0.07113290578126907, "learning_rate": 0.0001741593792338958, "loss": 7.4704, "step": 5296 }, { "epoch": 0.6520187099950763, "grad_norm": 0.06432677805423737, "learning_rate": 0.00017409779529498708, "loss": 7.6268, "step": 5297 }, { "epoch": 0.6521418020679468, "grad_norm": 0.12625326216220856, "learning_rate": 0.00017403621135607834, "loss": 7.6389, "step": 5298 }, { "epoch": 0.6522648941408173, "grad_norm": 0.08214548975229263, "learning_rate": 0.0001739746274171696, "loss": 7.646, "step": 5299 }, { "epoch": 0.6523879862136879, "grad_norm": 0.10478974133729935, "learning_rate": 0.00017391304347826088, "loss": 8.1595, "step": 5300 }, { "epoch": 0.6525110782865583, "grad_norm": 0.09282022714614868, "learning_rate": 0.00017385145953935214, "loss": 7.7347, "step": 5301 }, { "epoch": 0.6526341703594288, "grad_norm": 0.14356864988803864, "learning_rate": 0.0001737898756004434, "loss": 7.315, "step": 5302 }, { "epoch": 
0.6527572624322994, "grad_norm": 0.08799814432859421, "learning_rate": 0.00017372829166153466, "loss": 7.567, "step": 5303 }, { "epoch": 0.6528803545051699, "grad_norm": 0.1409814953804016, "learning_rate": 0.00017366670772262595, "loss": 7.7972, "step": 5304 }, { "epoch": 0.6530034465780403, "grad_norm": 0.22308291494846344, "learning_rate": 0.0001736051237837172, "loss": 8.755, "step": 5305 }, { "epoch": 0.6531265386509109, "grad_norm": 0.12588657438755035, "learning_rate": 0.00017354353984480847, "loss": 7.3169, "step": 5306 }, { "epoch": 0.6532496307237814, "grad_norm": 0.09646416455507278, "learning_rate": 0.00017348195590589975, "loss": 7.5829, "step": 5307 }, { "epoch": 0.6533727227966519, "grad_norm": 0.11412133276462555, "learning_rate": 0.000173420371966991, "loss": 8.21, "step": 5308 }, { "epoch": 0.6534958148695224, "grad_norm": 0.061176054179668427, "learning_rate": 0.00017335878802808227, "loss": 7.7196, "step": 5309 }, { "epoch": 0.6536189069423929, "grad_norm": 0.09757427871227264, "learning_rate": 0.00017329720408917353, "loss": 7.4004, "step": 5310 }, { "epoch": 0.6537419990152634, "grad_norm": 0.12357114255428314, "learning_rate": 0.00017323562015026482, "loss": 7.4621, "step": 5311 }, { "epoch": 0.653865091088134, "grad_norm": 0.11030375212430954, "learning_rate": 0.00017317403621135608, "loss": 7.4325, "step": 5312 }, { "epoch": 0.6539881831610044, "grad_norm": 0.1331503838300705, "learning_rate": 0.00017311245227244734, "loss": 7.7976, "step": 5313 }, { "epoch": 0.6541112752338749, "grad_norm": 0.1265646070241928, "learning_rate": 0.00017305086833353862, "loss": 7.938, "step": 5314 }, { "epoch": 0.6542343673067454, "grad_norm": 0.07049427926540375, "learning_rate": 0.00017298928439462988, "loss": 7.38, "step": 5315 }, { "epoch": 0.654357459379616, "grad_norm": 0.18321584165096283, "learning_rate": 0.00017292770045572114, "loss": 8.1659, "step": 5316 }, { "epoch": 0.6544805514524865, "grad_norm": 0.0709591954946518, "learning_rate": 
0.00017286611651681243, "loss": 7.3496, "step": 5317 }, { "epoch": 0.6546036435253569, "grad_norm": 0.06984159350395203, "learning_rate": 0.0001728045325779037, "loss": 7.4034, "step": 5318 }, { "epoch": 0.6547267355982275, "grad_norm": 0.0763370469212532, "learning_rate": 0.00017274294863899495, "loss": 7.3396, "step": 5319 }, { "epoch": 0.654849827671098, "grad_norm": 0.10485412180423737, "learning_rate": 0.0001726813647000862, "loss": 7.3, "step": 5320 }, { "epoch": 0.6549729197439685, "grad_norm": 0.07569313794374466, "learning_rate": 0.0001726197807611775, "loss": 7.5145, "step": 5321 }, { "epoch": 0.655096011816839, "grad_norm": 0.16998523473739624, "learning_rate": 0.00017255819682226875, "loss": 8.302, "step": 5322 }, { "epoch": 0.6552191038897095, "grad_norm": 0.07861205190420151, "learning_rate": 0.00017249661288336, "loss": 7.4235, "step": 5323 }, { "epoch": 0.65534219596258, "grad_norm": 0.12853559851646423, "learning_rate": 0.0001724350289444513, "loss": 7.51, "step": 5324 }, { "epoch": 0.6554652880354506, "grad_norm": 0.5210843086242676, "learning_rate": 0.00017237344500554256, "loss": 9.9314, "step": 5325 }, { "epoch": 0.655588380108321, "grad_norm": 0.16239014267921448, "learning_rate": 0.00017231186106663382, "loss": 8.329, "step": 5326 }, { "epoch": 0.6557114721811915, "grad_norm": 0.22795698046684265, "learning_rate": 0.0001722502771277251, "loss": 8.9209, "step": 5327 }, { "epoch": 0.6558345642540621, "grad_norm": 0.11080814152956009, "learning_rate": 0.00017218869318881636, "loss": 8.6258, "step": 5328 }, { "epoch": 0.6559576563269326, "grad_norm": 0.2186247557401657, "learning_rate": 0.00017212710924990762, "loss": 7.4206, "step": 5329 }, { "epoch": 0.656080748399803, "grad_norm": 0.14479245245456696, "learning_rate": 0.00017206552531099888, "loss": 7.8159, "step": 5330 }, { "epoch": 0.6562038404726735, "grad_norm": 0.10586515069007874, "learning_rate": 0.00017200394137209017, "loss": 7.9239, "step": 5331 }, { "epoch": 0.6563269325455441, 
"grad_norm": 0.21570536494255066, "learning_rate": 0.00017194235743318143, "loss": 7.6069, "step": 5332 }, { "epoch": 0.6564500246184146, "grad_norm": 0.19628873467445374, "learning_rate": 0.0001718807734942727, "loss": 7.5358, "step": 5333 }, { "epoch": 0.656573116691285, "grad_norm": 0.14669917523860931, "learning_rate": 0.00017181918955536398, "loss": 7.7841, "step": 5334 }, { "epoch": 0.6566962087641556, "grad_norm": 0.1166112869977951, "learning_rate": 0.00017175760561645523, "loss": 7.5088, "step": 5335 }, { "epoch": 0.6568193008370261, "grad_norm": 0.10307192802429199, "learning_rate": 0.0001716960216775465, "loss": 7.2332, "step": 5336 }, { "epoch": 0.6569423929098966, "grad_norm": 0.42018938064575195, "learning_rate": 0.00017163443773863775, "loss": 9.4618, "step": 5337 }, { "epoch": 0.6570654849827671, "grad_norm": 0.1440257579088211, "learning_rate": 0.00017157285379972904, "loss": 7.7026, "step": 5338 }, { "epoch": 0.6571885770556376, "grad_norm": 0.16867627203464508, "learning_rate": 0.0001715112698608203, "loss": 7.8291, "step": 5339 }, { "epoch": 0.6573116691285081, "grad_norm": 0.16331364214420319, "learning_rate": 0.00017144968592191156, "loss": 7.6303, "step": 5340 }, { "epoch": 0.6574347612013787, "grad_norm": 0.09939592331647873, "learning_rate": 0.00017138810198300285, "loss": 7.4988, "step": 5341 }, { "epoch": 0.6575578532742491, "grad_norm": 0.07845433056354523, "learning_rate": 0.0001713265180440941, "loss": 7.4637, "step": 5342 }, { "epoch": 0.6576809453471196, "grad_norm": 0.15090464055538177, "learning_rate": 0.00017126493410518536, "loss": 7.8202, "step": 5343 }, { "epoch": 0.6578040374199902, "grad_norm": 0.05791563168168068, "learning_rate": 0.00017120335016627665, "loss": 7.6283, "step": 5344 }, { "epoch": 0.6579271294928607, "grad_norm": 0.0773531049489975, "learning_rate": 0.0001711417662273679, "loss": 7.6482, "step": 5345 }, { "epoch": 0.6580502215657311, "grad_norm": 0.19348002970218658, "learning_rate": 0.00017108018228845917, 
"loss": 7.239, "step": 5346 }, { "epoch": 0.6581733136386017, "grad_norm": 0.12006113678216934, "learning_rate": 0.00017101859834955043, "loss": 7.4887, "step": 5347 }, { "epoch": 0.6582964057114722, "grad_norm": 0.11095038801431656, "learning_rate": 0.00017095701441064172, "loss": 7.5606, "step": 5348 }, { "epoch": 0.6584194977843427, "grad_norm": 0.11654459685087204, "learning_rate": 0.00017089543047173298, "loss": 7.5927, "step": 5349 }, { "epoch": 0.6585425898572131, "grad_norm": 0.15066087245941162, "learning_rate": 0.00017083384653282424, "loss": 7.6234, "step": 5350 }, { "epoch": 0.6586656819300837, "grad_norm": 0.07800497114658356, "learning_rate": 0.00017077226259391552, "loss": 7.6412, "step": 5351 }, { "epoch": 0.6587887740029542, "grad_norm": 0.07029184699058533, "learning_rate": 0.00017071067865500678, "loss": 7.5281, "step": 5352 }, { "epoch": 0.6589118660758247, "grad_norm": 0.1724889725446701, "learning_rate": 0.00017064909471609804, "loss": 7.7755, "step": 5353 }, { "epoch": 0.6590349581486952, "grad_norm": 0.12231642752885818, "learning_rate": 0.0001705875107771893, "loss": 7.6362, "step": 5354 }, { "epoch": 0.6591580502215657, "grad_norm": 0.07655524462461472, "learning_rate": 0.0001705259268382806, "loss": 7.5327, "step": 5355 }, { "epoch": 0.6592811422944362, "grad_norm": 0.17620518803596497, "learning_rate": 0.00017046434289937185, "loss": 8.2243, "step": 5356 }, { "epoch": 0.6594042343673068, "grad_norm": 0.1270238161087036, "learning_rate": 0.0001704027589604631, "loss": 7.9999, "step": 5357 }, { "epoch": 0.6595273264401772, "grad_norm": 0.0801800936460495, "learning_rate": 0.0001703411750215544, "loss": 7.8005, "step": 5358 }, { "epoch": 0.6596504185130477, "grad_norm": 0.12864762544631958, "learning_rate": 0.00017027959108264565, "loss": 7.4287, "step": 5359 }, { "epoch": 0.6597735105859183, "grad_norm": 0.10770215839147568, "learning_rate": 0.0001702180071437369, "loss": 8.3011, "step": 5360 }, { "epoch": 0.6598966026587888, "grad_norm": 
0.14536108076572418, "learning_rate": 0.0001701564232048282, "loss": 8.3286, "step": 5361 }, { "epoch": 0.6600196947316593, "grad_norm": 0.1281195431947708, "learning_rate": 0.00017009483926591946, "loss": 7.8124, "step": 5362 }, { "epoch": 0.6601427868045298, "grad_norm": 0.14106503129005432, "learning_rate": 0.00017003325532701072, "loss": 7.4124, "step": 5363 }, { "epoch": 0.6602658788774003, "grad_norm": 0.11874187737703323, "learning_rate": 0.00016997167138810198, "loss": 7.5739, "step": 5364 }, { "epoch": 0.6603889709502708, "grad_norm": 0.13174086809158325, "learning_rate": 0.00016991008744919326, "loss": 7.4617, "step": 5365 }, { "epoch": 0.6605120630231414, "grad_norm": 0.08616428077220917, "learning_rate": 0.00016984850351028452, "loss": 7.8566, "step": 5366 }, { "epoch": 0.6606351550960118, "grad_norm": 0.1709478348493576, "learning_rate": 0.00016978691957137578, "loss": 8.4889, "step": 5367 }, { "epoch": 0.6607582471688823, "grad_norm": 0.10726488381624222, "learning_rate": 0.00016972533563246707, "loss": 7.5999, "step": 5368 }, { "epoch": 0.6608813392417529, "grad_norm": 0.06449435651302338, "learning_rate": 0.00016966375169355833, "loss": 7.3963, "step": 5369 }, { "epoch": 0.6610044313146234, "grad_norm": 0.3288572430610657, "learning_rate": 0.0001696021677546496, "loss": 9.1904, "step": 5370 }, { "epoch": 0.6611275233874938, "grad_norm": 0.20571064949035645, "learning_rate": 0.00016954058381574087, "loss": 8.2131, "step": 5371 }, { "epoch": 0.6612506154603643, "grad_norm": 0.1494700312614441, "learning_rate": 0.00016947899987683213, "loss": 8.1441, "step": 5372 }, { "epoch": 0.6613737075332349, "grad_norm": 0.13258583843708038, "learning_rate": 0.0001694174159379234, "loss": 7.636, "step": 5373 }, { "epoch": 0.6614967996061054, "grad_norm": 0.12206605821847916, "learning_rate": 0.00016935583199901465, "loss": 7.5699, "step": 5374 }, { "epoch": 0.6616198916789758, "grad_norm": 0.13016797602176666, "learning_rate": 0.00016929424806010594, "loss": 
8.233, "step": 5375 }, { "epoch": 0.6617429837518464, "grad_norm": 0.10378251224756241, "learning_rate": 0.0001692326641211972, "loss": 7.5451, "step": 5376 }, { "epoch": 0.6618660758247169, "grad_norm": 0.16155044734477997, "learning_rate": 0.00016917108018228846, "loss": 8.6656, "step": 5377 }, { "epoch": 0.6619891678975874, "grad_norm": 0.14614829421043396, "learning_rate": 0.00016910949624337975, "loss": 7.527, "step": 5378 }, { "epoch": 0.6621122599704579, "grad_norm": 0.14761751890182495, "learning_rate": 0.000169047912304471, "loss": 7.4269, "step": 5379 }, { "epoch": 0.6622353520433284, "grad_norm": 0.09520264714956284, "learning_rate": 0.00016898632836556226, "loss": 7.7255, "step": 5380 }, { "epoch": 0.6623584441161989, "grad_norm": 0.10770060867071152, "learning_rate": 0.00016892474442665352, "loss": 7.4847, "step": 5381 }, { "epoch": 0.6624815361890695, "grad_norm": 0.07338284701108932, "learning_rate": 0.0001688631604877448, "loss": 7.6236, "step": 5382 }, { "epoch": 0.6626046282619399, "grad_norm": 0.1226583793759346, "learning_rate": 0.00016880157654883607, "loss": 8.0319, "step": 5383 }, { "epoch": 0.6627277203348104, "grad_norm": 0.08300391584634781, "learning_rate": 0.00016873999260992733, "loss": 7.7173, "step": 5384 }, { "epoch": 0.662850812407681, "grad_norm": 0.09563355892896652, "learning_rate": 0.00016867840867101862, "loss": 7.4676, "step": 5385 }, { "epoch": 0.6629739044805515, "grad_norm": 0.10067155212163925, "learning_rate": 0.00016861682473210988, "loss": 7.5957, "step": 5386 }, { "epoch": 0.6630969965534219, "grad_norm": 0.09173242747783661, "learning_rate": 0.00016855524079320114, "loss": 7.2443, "step": 5387 }, { "epoch": 0.6632200886262924, "grad_norm": 0.08117469400167465, "learning_rate": 0.00016849365685429242, "loss": 7.3808, "step": 5388 }, { "epoch": 0.663343180699163, "grad_norm": 0.15780490636825562, "learning_rate": 0.00016843207291538368, "loss": 7.2654, "step": 5389 }, { "epoch": 0.6634662727720335, "grad_norm": 
0.22890189290046692, "learning_rate": 0.00016837048897647494, "loss": 8.1853, "step": 5390 }, { "epoch": 0.6635893648449039, "grad_norm": 0.06413383781909943, "learning_rate": 0.0001683089050375662, "loss": 7.5083, "step": 5391 }, { "epoch": 0.6637124569177745, "grad_norm": 0.1404091715812683, "learning_rate": 0.0001682473210986575, "loss": 7.5971, "step": 5392 }, { "epoch": 0.663835548990645, "grad_norm": 0.062108851969242096, "learning_rate": 0.00016818573715974875, "loss": 7.4932, "step": 5393 }, { "epoch": 0.6639586410635155, "grad_norm": 0.06559592485427856, "learning_rate": 0.00016812415322084, "loss": 7.5159, "step": 5394 }, { "epoch": 0.664081733136386, "grad_norm": 0.08618789166212082, "learning_rate": 0.0001680625692819313, "loss": 7.4449, "step": 5395 }, { "epoch": 0.6642048252092565, "grad_norm": 0.0837077721953392, "learning_rate": 0.00016800098534302255, "loss": 7.2824, "step": 5396 }, { "epoch": 0.664327917282127, "grad_norm": 0.0698249489068985, "learning_rate": 0.0001679394014041138, "loss": 7.4198, "step": 5397 }, { "epoch": 0.6644510093549976, "grad_norm": 0.11118235439062119, "learning_rate": 0.0001678778174652051, "loss": 7.7237, "step": 5398 }, { "epoch": 0.664574101427868, "grad_norm": 0.08505404740571976, "learning_rate": 0.00016781623352629636, "loss": 7.6035, "step": 5399 }, { "epoch": 0.6646971935007385, "grad_norm": 0.11035499721765518, "learning_rate": 0.00016775464958738762, "loss": 7.8582, "step": 5400 }, { "epoch": 0.6648202855736091, "grad_norm": 0.0794353112578392, "learning_rate": 0.00016769306564847888, "loss": 7.5932, "step": 5401 }, { "epoch": 0.6649433776464796, "grad_norm": 0.1652117222547531, "learning_rate": 0.00016763148170957016, "loss": 8.3935, "step": 5402 }, { "epoch": 0.6650664697193501, "grad_norm": 0.12273979932069778, "learning_rate": 0.00016756989777066142, "loss": 7.3751, "step": 5403 }, { "epoch": 0.6651895617922206, "grad_norm": 0.10787943750619888, "learning_rate": 0.00016750831383175268, "loss": 7.2382, 
"step": 5404 }, { "epoch": 0.6653126538650911, "grad_norm": 0.087945856153965, "learning_rate": 0.00016744672989284397, "loss": 7.4915, "step": 5405 }, { "epoch": 0.6654357459379616, "grad_norm": 0.1150202751159668, "learning_rate": 0.00016738514595393523, "loss": 7.4225, "step": 5406 }, { "epoch": 0.6655588380108322, "grad_norm": 0.12453607469797134, "learning_rate": 0.0001673235620150265, "loss": 7.2355, "step": 5407 }, { "epoch": 0.6656819300837026, "grad_norm": 0.08457504957914352, "learning_rate": 0.00016726197807611775, "loss": 7.2287, "step": 5408 }, { "epoch": 0.6658050221565731, "grad_norm": 0.1314159333705902, "learning_rate": 0.00016720039413720903, "loss": 7.8285, "step": 5409 }, { "epoch": 0.6659281142294436, "grad_norm": 0.5890649557113647, "learning_rate": 0.0001671388101983003, "loss": 10.3947, "step": 5410 }, { "epoch": 0.6660512063023142, "grad_norm": 0.09822163730859756, "learning_rate": 0.00016707722625939155, "loss": 7.5649, "step": 5411 }, { "epoch": 0.6661742983751846, "grad_norm": 0.2023947387933731, "learning_rate": 0.00016701564232048284, "loss": 8.3866, "step": 5412 }, { "epoch": 0.6662973904480551, "grad_norm": 0.07017698884010315, "learning_rate": 0.0001669540583815741, "loss": 7.2807, "step": 5413 }, { "epoch": 0.6664204825209257, "grad_norm": 0.09297818690538406, "learning_rate": 0.00016689247444266536, "loss": 7.7013, "step": 5414 }, { "epoch": 0.6665435745937962, "grad_norm": 0.09733261168003082, "learning_rate": 0.00016683089050375664, "loss": 7.662, "step": 5415 }, { "epoch": 0.6666666666666666, "grad_norm": 0.1260397583246231, "learning_rate": 0.0001667693065648479, "loss": 7.8487, "step": 5416 }, { "epoch": 0.6667897587395372, "grad_norm": 0.12347127497196198, "learning_rate": 0.00016670772262593916, "loss": 7.4168, "step": 5417 }, { "epoch": 0.6669128508124077, "grad_norm": 0.1704527884721756, "learning_rate": 0.00016664613868703042, "loss": 7.2567, "step": 5418 }, { "epoch": 0.6670359428852782, "grad_norm": 
0.06680043786764145, "learning_rate": 0.0001665845547481217, "loss": 7.8414, "step": 5419 }, { "epoch": 0.6671590349581487, "grad_norm": 0.08100824058055878, "learning_rate": 0.00016652297080921297, "loss": 7.6866, "step": 5420 }, { "epoch": 0.6672821270310192, "grad_norm": 0.06890971213579178, "learning_rate": 0.00016646138687030423, "loss": 7.6776, "step": 5421 }, { "epoch": 0.6674052191038897, "grad_norm": 0.09441878646612167, "learning_rate": 0.00016639980293139552, "loss": 7.7432, "step": 5422 }, { "epoch": 0.6675283111767603, "grad_norm": 0.13740143179893494, "learning_rate": 0.00016633821899248677, "loss": 7.3291, "step": 5423 }, { "epoch": 0.6676514032496307, "grad_norm": 0.15702936053276062, "learning_rate": 0.00016627663505357803, "loss": 8.2823, "step": 5424 }, { "epoch": 0.6677744953225012, "grad_norm": 0.09443475306034088, "learning_rate": 0.00016621505111466932, "loss": 7.2564, "step": 5425 }, { "epoch": 0.6678975873953718, "grad_norm": 0.11965557932853699, "learning_rate": 0.00016615346717576058, "loss": 7.3052, "step": 5426 }, { "epoch": 0.6680206794682423, "grad_norm": 0.15487341582775116, "learning_rate": 0.00016609188323685184, "loss": 7.9674, "step": 5427 }, { "epoch": 0.6681437715411127, "grad_norm": 0.07937680184841156, "learning_rate": 0.0001660302992979431, "loss": 7.2818, "step": 5428 }, { "epoch": 0.6682668636139832, "grad_norm": 0.13321726024150848, "learning_rate": 0.00016596871535903439, "loss": 7.439, "step": 5429 }, { "epoch": 0.6683899556868538, "grad_norm": 0.11391574144363403, "learning_rate": 0.00016590713142012565, "loss": 7.7581, "step": 5430 }, { "epoch": 0.6685130477597243, "grad_norm": 0.09156854450702667, "learning_rate": 0.0001658455474812169, "loss": 7.2767, "step": 5431 }, { "epoch": 0.6686361398325947, "grad_norm": 0.09477902203798294, "learning_rate": 0.0001657839635423082, "loss": 7.3783, "step": 5432 }, { "epoch": 0.6687592319054653, "grad_norm": 0.11857406049966812, "learning_rate": 0.00016572237960339945, "loss": 
7.9074, "step": 5433 }, { "epoch": 0.6688823239783358, "grad_norm": 0.08109716325998306, "learning_rate": 0.0001656607956644907, "loss": 7.7985, "step": 5434 }, { "epoch": 0.6690054160512063, "grad_norm": 0.12481582909822464, "learning_rate": 0.00016559921172558197, "loss": 7.8228, "step": 5435 }, { "epoch": 0.6691285081240768, "grad_norm": 0.1748914271593094, "learning_rate": 0.00016553762778667326, "loss": 8.6063, "step": 5436 }, { "epoch": 0.6692516001969473, "grad_norm": 0.10309797525405884, "learning_rate": 0.00016547604384776452, "loss": 7.6722, "step": 5437 }, { "epoch": 0.6693746922698178, "grad_norm": 0.07935023307800293, "learning_rate": 0.00016541445990885578, "loss": 7.4978, "step": 5438 }, { "epoch": 0.6694977843426884, "grad_norm": 0.2551029324531555, "learning_rate": 0.00016535287596994706, "loss": 8.6093, "step": 5439 }, { "epoch": 0.6696208764155588, "grad_norm": 0.0669635683298111, "learning_rate": 0.00016529129203103832, "loss": 7.717, "step": 5440 }, { "epoch": 0.6697439684884293, "grad_norm": 0.11385392397642136, "learning_rate": 0.00016522970809212958, "loss": 8.1407, "step": 5441 }, { "epoch": 0.6698670605612999, "grad_norm": 0.08053459972143173, "learning_rate": 0.00016516812415322087, "loss": 8.0062, "step": 5442 }, { "epoch": 0.6699901526341704, "grad_norm": 0.11795877665281296, "learning_rate": 0.00016510654021431213, "loss": 7.3849, "step": 5443 }, { "epoch": 0.6701132447070408, "grad_norm": 0.10849937796592712, "learning_rate": 0.0001650449562754034, "loss": 7.3236, "step": 5444 }, { "epoch": 0.6702363367799113, "grad_norm": 0.09553787112236023, "learning_rate": 0.00016498337233649465, "loss": 8.081, "step": 5445 }, { "epoch": 0.6703594288527819, "grad_norm": 0.09410396218299866, "learning_rate": 0.00016492178839758593, "loss": 7.3452, "step": 5446 }, { "epoch": 0.6704825209256524, "grad_norm": 0.06518222391605377, "learning_rate": 0.0001648602044586772, "loss": 7.6435, "step": 5447 }, { "epoch": 0.670605612998523, "grad_norm": 
0.10408096760511398, "learning_rate": 0.00016479862051976845, "loss": 7.5682, "step": 5448 }, { "epoch": 0.6707287050713934, "grad_norm": 0.1029348149895668, "learning_rate": 0.00016473703658085974, "loss": 7.6351, "step": 5449 }, { "epoch": 0.6708517971442639, "grad_norm": 0.16476428508758545, "learning_rate": 0.000164675452641951, "loss": 8.1636, "step": 5450 }, { "epoch": 0.6709748892171344, "grad_norm": 0.1792096644639969, "learning_rate": 0.00016461386870304226, "loss": 8.3946, "step": 5451 }, { "epoch": 0.671097981290005, "grad_norm": 0.13402308523654938, "learning_rate": 0.00016455228476413354, "loss": 8.21, "step": 5452 }, { "epoch": 0.6712210733628754, "grad_norm": 0.3428170680999756, "learning_rate": 0.0001644907008252248, "loss": 9.2619, "step": 5453 }, { "epoch": 0.6713441654357459, "grad_norm": 0.1894824057817459, "learning_rate": 0.00016442911688631606, "loss": 7.381, "step": 5454 }, { "epoch": 0.6714672575086165, "grad_norm": 0.2236705869436264, "learning_rate": 0.00016436753294740732, "loss": 7.7226, "step": 5455 }, { "epoch": 0.671590349581487, "grad_norm": 0.17862924933433533, "learning_rate": 0.0001643059490084986, "loss": 7.7364, "step": 5456 }, { "epoch": 0.6717134416543574, "grad_norm": 0.0977100282907486, "learning_rate": 0.00016424436506958984, "loss": 8.2738, "step": 5457 }, { "epoch": 0.671836533727228, "grad_norm": 0.1854705810546875, "learning_rate": 0.0001641827811306811, "loss": 7.5426, "step": 5458 }, { "epoch": 0.6719596258000985, "grad_norm": 0.23124966025352478, "learning_rate": 0.0001641211971917724, "loss": 7.3352, "step": 5459 }, { "epoch": 0.672082717872969, "grad_norm": 0.1316748410463333, "learning_rate": 0.00016405961325286365, "loss": 7.4479, "step": 5460 }, { "epoch": 0.6722058099458394, "grad_norm": 0.09461259096860886, "learning_rate": 0.0001639980293139549, "loss": 7.7034, "step": 5461 }, { "epoch": 0.67232890201871, "grad_norm": 0.12237511575222015, "learning_rate": 0.00016393644537504617, "loss": 7.67, "step": 5462 }, 
{ "epoch": 0.6724519940915805, "grad_norm": 0.1227930560708046, "learning_rate": 0.00016387486143613745, "loss": 7.9332, "step": 5463 }, { "epoch": 0.672575086164451, "grad_norm": 0.10703280568122864, "learning_rate": 0.0001638132774972287, "loss": 7.5645, "step": 5464 }, { "epoch": 0.6726981782373215, "grad_norm": 0.07473401725292206, "learning_rate": 0.00016375169355831997, "loss": 7.8083, "step": 5465 }, { "epoch": 0.672821270310192, "grad_norm": 0.09599440544843674, "learning_rate": 0.00016369010961941126, "loss": 7.5167, "step": 5466 }, { "epoch": 0.6729443623830625, "grad_norm": 0.1416071653366089, "learning_rate": 0.00016362852568050252, "loss": 7.9608, "step": 5467 }, { "epoch": 0.6730674544559331, "grad_norm": 0.0763784870505333, "learning_rate": 0.00016356694174159378, "loss": 7.7736, "step": 5468 }, { "epoch": 0.6731905465288035, "grad_norm": 0.07892778515815735, "learning_rate": 0.00016350535780268506, "loss": 7.5901, "step": 5469 }, { "epoch": 0.673313638601674, "grad_norm": 0.07264762371778488, "learning_rate": 0.00016344377386377632, "loss": 7.7263, "step": 5470 }, { "epoch": 0.6734367306745446, "grad_norm": 0.07223942130804062, "learning_rate": 0.00016338218992486758, "loss": 7.5171, "step": 5471 }, { "epoch": 0.6735598227474151, "grad_norm": 0.09964683651924133, "learning_rate": 0.00016332060598595884, "loss": 8.1227, "step": 5472 }, { "epoch": 0.6736829148202855, "grad_norm": 0.10032421350479126, "learning_rate": 0.00016325902204705013, "loss": 7.5828, "step": 5473 }, { "epoch": 0.6738060068931561, "grad_norm": 0.270835816860199, "learning_rate": 0.0001631974381081414, "loss": 6.9906, "step": 5474 }, { "epoch": 0.6739290989660266, "grad_norm": 0.08294609189033508, "learning_rate": 0.00016313585416923265, "loss": 8.1034, "step": 5475 }, { "epoch": 0.6740521910388971, "grad_norm": 0.15480343997478485, "learning_rate": 0.00016307427023032393, "loss": 7.1845, "step": 5476 }, { "epoch": 0.6741752831117676, "grad_norm": 0.0781468003988266, 
"learning_rate": 0.0001630126862914152, "loss": 7.4474, "step": 5477 }, { "epoch": 0.6742983751846381, "grad_norm": 0.06995728611946106, "learning_rate": 0.00016295110235250645, "loss": 7.7275, "step": 5478 }, { "epoch": 0.6744214672575086, "grad_norm": 0.09575910121202469, "learning_rate": 0.0001628895184135977, "loss": 7.8188, "step": 5479 }, { "epoch": 0.6745445593303792, "grad_norm": 0.3515778183937073, "learning_rate": 0.000162827934474689, "loss": 9.1381, "step": 5480 }, { "epoch": 0.6746676514032496, "grad_norm": 0.07222413271665573, "learning_rate": 0.00016276635053578026, "loss": 7.4047, "step": 5481 }, { "epoch": 0.6747907434761201, "grad_norm": 0.17572888731956482, "learning_rate": 0.00016270476659687152, "loss": 7.9656, "step": 5482 }, { "epoch": 0.6749138355489906, "grad_norm": 0.1102067083120346, "learning_rate": 0.0001626431826579628, "loss": 7.5052, "step": 5483 }, { "epoch": 0.6750369276218612, "grad_norm": 0.0829615667462349, "learning_rate": 0.00016258159871905406, "loss": 7.7676, "step": 5484 }, { "epoch": 0.6751600196947316, "grad_norm": 0.08389624953269958, "learning_rate": 0.00016252001478014532, "loss": 7.4605, "step": 5485 }, { "epoch": 0.6752831117676021, "grad_norm": 0.07518648356199265, "learning_rate": 0.0001624584308412366, "loss": 7.4728, "step": 5486 }, { "epoch": 0.6754062038404727, "grad_norm": 0.06274831295013428, "learning_rate": 0.00016239684690232787, "loss": 7.6852, "step": 5487 }, { "epoch": 0.6755292959133432, "grad_norm": 0.2201526165008545, "learning_rate": 0.00016233526296341913, "loss": 8.6759, "step": 5488 }, { "epoch": 0.6756523879862137, "grad_norm": 0.19919225573539734, "learning_rate": 0.0001622736790245104, "loss": 8.4552, "step": 5489 }, { "epoch": 0.6757754800590842, "grad_norm": 0.19268374145030975, "learning_rate": 0.00016221209508560168, "loss": 7.0668, "step": 5490 }, { "epoch": 0.6758985721319547, "grad_norm": 0.14906315505504608, "learning_rate": 0.00016215051114669294, "loss": 8.4157, "step": 5491 }, { 
"epoch": 0.6760216642048252, "grad_norm": 0.09383361041545868, "learning_rate": 0.0001620889272077842, "loss": 7.5838, "step": 5492 }, { "epoch": 0.6761447562776958, "grad_norm": 0.0670541301369667, "learning_rate": 0.00016202734326887548, "loss": 7.5856, "step": 5493 }, { "epoch": 0.6762678483505662, "grad_norm": 0.09709767252206802, "learning_rate": 0.00016196575932996674, "loss": 7.7045, "step": 5494 }, { "epoch": 0.6763909404234367, "grad_norm": 0.2111280858516693, "learning_rate": 0.000161904175391058, "loss": 7.9804, "step": 5495 }, { "epoch": 0.6765140324963073, "grad_norm": 0.10516630113124847, "learning_rate": 0.0001618425914521493, "loss": 7.9413, "step": 5496 }, { "epoch": 0.6766371245691778, "grad_norm": 0.07752738147974014, "learning_rate": 0.00016178100751324055, "loss": 7.5802, "step": 5497 }, { "epoch": 0.6767602166420482, "grad_norm": 0.0886891782283783, "learning_rate": 0.0001617194235743318, "loss": 7.5346, "step": 5498 }, { "epoch": 0.6768833087149188, "grad_norm": 0.08255720138549805, "learning_rate": 0.00016165783963542307, "loss": 7.4106, "step": 5499 }, { "epoch": 0.6770064007877893, "grad_norm": 0.18484652042388916, "learning_rate": 0.00016159625569651435, "loss": 8.7348, "step": 5500 }, { "epoch": 0.6771294928606598, "grad_norm": 0.08884254842996597, "learning_rate": 0.0001615346717576056, "loss": 7.8085, "step": 5501 }, { "epoch": 0.6772525849335302, "grad_norm": 0.21172934770584106, "learning_rate": 0.00016147308781869687, "loss": 7.8887, "step": 5502 }, { "epoch": 0.6773756770064008, "grad_norm": 0.10244791209697723, "learning_rate": 0.00016141150387978816, "loss": 7.4495, "step": 5503 }, { "epoch": 0.6774987690792713, "grad_norm": 0.07476804405450821, "learning_rate": 0.00016134991994087942, "loss": 7.5112, "step": 5504 }, { "epoch": 0.6776218611521418, "grad_norm": 0.07310445606708527, "learning_rate": 0.00016128833600197068, "loss": 7.4732, "step": 5505 }, { "epoch": 0.6777449532250123, "grad_norm": 0.07238265126943588, 
"learning_rate": 0.00016122675206306194, "loss": 7.4918, "step": 5506 }, { "epoch": 0.6778680452978828, "grad_norm": 0.10181079059839249, "learning_rate": 0.00016116516812415322, "loss": 7.3603, "step": 5507 }, { "epoch": 0.6779911373707533, "grad_norm": 0.08869276940822601, "learning_rate": 0.00016110358418524448, "loss": 7.9235, "step": 5508 }, { "epoch": 0.6781142294436239, "grad_norm": 0.1417529582977295, "learning_rate": 0.00016104200024633574, "loss": 7.4295, "step": 5509 }, { "epoch": 0.6782373215164943, "grad_norm": 0.12168100476264954, "learning_rate": 0.00016098041630742703, "loss": 7.9772, "step": 5510 }, { "epoch": 0.6783604135893648, "grad_norm": 0.20424775779247284, "learning_rate": 0.0001609188323685183, "loss": 8.5318, "step": 5511 }, { "epoch": 0.6784835056622354, "grad_norm": 0.1297135204076767, "learning_rate": 0.00016085724842960955, "loss": 7.1617, "step": 5512 }, { "epoch": 0.6786065977351059, "grad_norm": 0.07099239528179169, "learning_rate": 0.00016079566449070083, "loss": 7.3878, "step": 5513 }, { "epoch": 0.6787296898079763, "grad_norm": 0.06197798252105713, "learning_rate": 0.0001607340805517921, "loss": 7.5729, "step": 5514 }, { "epoch": 0.6788527818808469, "grad_norm": 0.13191072642803192, "learning_rate": 0.00016067249661288335, "loss": 7.2861, "step": 5515 }, { "epoch": 0.6789758739537174, "grad_norm": 0.09501700103282928, "learning_rate": 0.0001606109126739746, "loss": 7.7031, "step": 5516 }, { "epoch": 0.6790989660265879, "grad_norm": 0.07638600468635559, "learning_rate": 0.0001605493287350659, "loss": 7.6346, "step": 5517 }, { "epoch": 0.6792220580994583, "grad_norm": 0.11336648464202881, "learning_rate": 0.00016048774479615716, "loss": 7.2364, "step": 5518 }, { "epoch": 0.6793451501723289, "grad_norm": 0.08644673228263855, "learning_rate": 0.00016042616085724842, "loss": 7.6285, "step": 5519 }, { "epoch": 0.6794682422451994, "grad_norm": 0.08468900620937347, "learning_rate": 0.0001603645769183397, "loss": 7.6379, "step": 5520 }, { 
"epoch": 0.67959133431807, "grad_norm": 0.0634159967303276, "learning_rate": 0.00016030299297943096, "loss": 7.5027, "step": 5521 }, { "epoch": 0.6797144263909404, "grad_norm": 0.09805236011743546, "learning_rate": 0.00016024140904052222, "loss": 7.8418, "step": 5522 }, { "epoch": 0.6798375184638109, "grad_norm": 0.20752021670341492, "learning_rate": 0.00016017982510161348, "loss": 8.4382, "step": 5523 }, { "epoch": 0.6799606105366814, "grad_norm": 0.0755678042769432, "learning_rate": 0.00016011824116270477, "loss": 7.6428, "step": 5524 }, { "epoch": 0.680083702609552, "grad_norm": 0.06830687075853348, "learning_rate": 0.00016005665722379603, "loss": 7.9571, "step": 5525 }, { "epoch": 0.6802067946824224, "grad_norm": 0.18529440462589264, "learning_rate": 0.0001599950732848873, "loss": 7.0233, "step": 5526 }, { "epoch": 0.6803298867552929, "grad_norm": 0.08485709130764008, "learning_rate": 0.00015993348934597858, "loss": 7.4144, "step": 5527 }, { "epoch": 0.6804529788281635, "grad_norm": 0.08207289129495621, "learning_rate": 0.00015987190540706984, "loss": 7.9847, "step": 5528 }, { "epoch": 0.680576070901034, "grad_norm": 0.15816889703273773, "learning_rate": 0.0001598103214681611, "loss": 7.4226, "step": 5529 }, { "epoch": 0.6806991629739044, "grad_norm": 0.14023782312870026, "learning_rate": 0.00015974873752925238, "loss": 7.5311, "step": 5530 }, { "epoch": 0.680822255046775, "grad_norm": 0.1840735524892807, "learning_rate": 0.00015968715359034364, "loss": 7.5726, "step": 5531 }, { "epoch": 0.6809453471196455, "grad_norm": 0.07882095128297806, "learning_rate": 0.0001596255696514349, "loss": 7.499, "step": 5532 }, { "epoch": 0.681068439192516, "grad_norm": 0.06698650121688843, "learning_rate": 0.00015956398571252616, "loss": 7.6336, "step": 5533 }, { "epoch": 0.6811915312653866, "grad_norm": 0.12803123891353607, "learning_rate": 0.00015950240177361745, "loss": 7.9771, "step": 5534 }, { "epoch": 0.681314623338257, "grad_norm": 0.22888952493667603, "learning_rate": 
0.0001594408178347087, "loss": 8.554, "step": 5535 }, { "epoch": 0.6814377154111275, "grad_norm": 0.08734580874443054, "learning_rate": 0.00015937923389579997, "loss": 7.8747, "step": 5536 }, { "epoch": 0.681560807483998, "grad_norm": 0.1129889190196991, "learning_rate": 0.00015931764995689125, "loss": 7.7602, "step": 5537 }, { "epoch": 0.6816838995568686, "grad_norm": 0.10995788127183914, "learning_rate": 0.0001592560660179825, "loss": 8.0871, "step": 5538 }, { "epoch": 0.681806991629739, "grad_norm": 0.18530990183353424, "learning_rate": 0.00015919448207907377, "loss": 8.484, "step": 5539 }, { "epoch": 0.6819300837026095, "grad_norm": 0.07389699667692184, "learning_rate": 0.00015913289814016506, "loss": 7.9257, "step": 5540 }, { "epoch": 0.6820531757754801, "grad_norm": 0.1136837750673294, "learning_rate": 0.00015907131420125632, "loss": 7.5427, "step": 5541 }, { "epoch": 0.6821762678483506, "grad_norm": 0.11478732526302338, "learning_rate": 0.00015900973026234758, "loss": 7.6696, "step": 5542 }, { "epoch": 0.682299359921221, "grad_norm": 0.15601466596126556, "learning_rate": 0.00015894814632343884, "loss": 7.3389, "step": 5543 }, { "epoch": 0.6824224519940916, "grad_norm": 0.11653583496809006, "learning_rate": 0.00015888656238453012, "loss": 7.649, "step": 5544 }, { "epoch": 0.6825455440669621, "grad_norm": 0.3216809034347534, "learning_rate": 0.00015882497844562138, "loss": 7.2561, "step": 5545 }, { "epoch": 0.6826686361398326, "grad_norm": 0.1197177991271019, "learning_rate": 0.00015876339450671264, "loss": 7.417, "step": 5546 }, { "epoch": 0.6827917282127031, "grad_norm": 0.13149172067642212, "learning_rate": 0.00015870181056780393, "loss": 8.4682, "step": 5547 }, { "epoch": 0.6829148202855736, "grad_norm": 0.13323290646076202, "learning_rate": 0.0001586402266288952, "loss": 8.0576, "step": 5548 }, { "epoch": 0.6830379123584441, "grad_norm": 0.09305360913276672, "learning_rate": 0.00015857864268998645, "loss": 7.8241, "step": 5549 }, { "epoch": 
0.6831610044313147, "grad_norm": 0.0944383293390274, "learning_rate": 0.0001585170587510777, "loss": 7.6899, "step": 5550 }, { "epoch": 0.6832840965041851, "grad_norm": 0.08152017742395401, "learning_rate": 0.000158455474812169, "loss": 7.7603, "step": 5551 }, { "epoch": 0.6834071885770556, "grad_norm": 0.09179756790399551, "learning_rate": 0.00015839389087326025, "loss": 7.7362, "step": 5552 }, { "epoch": 0.6835302806499262, "grad_norm": 0.05471046268939972, "learning_rate": 0.0001583323069343515, "loss": 7.5212, "step": 5553 }, { "epoch": 0.6836533727227967, "grad_norm": 0.15858930349349976, "learning_rate": 0.0001582707229954428, "loss": 8.3313, "step": 5554 }, { "epoch": 0.6837764647956671, "grad_norm": 0.12749773263931274, "learning_rate": 0.00015820913905653406, "loss": 7.4311, "step": 5555 }, { "epoch": 0.6838995568685377, "grad_norm": 0.11412325501441956, "learning_rate": 0.00015814755511762532, "loss": 7.3442, "step": 5556 }, { "epoch": 0.6840226489414082, "grad_norm": 0.0692121759057045, "learning_rate": 0.0001580859711787166, "loss": 7.6052, "step": 5557 }, { "epoch": 0.6841457410142787, "grad_norm": 0.11810428649187088, "learning_rate": 0.00015802438723980786, "loss": 7.34, "step": 5558 }, { "epoch": 0.6842688330871491, "grad_norm": 0.10325738042593002, "learning_rate": 0.00015796280330089912, "loss": 8.1956, "step": 5559 }, { "epoch": 0.6843919251600197, "grad_norm": 0.06744753569364548, "learning_rate": 0.00015790121936199038, "loss": 7.4899, "step": 5560 }, { "epoch": 0.6845150172328902, "grad_norm": 0.08300597220659256, "learning_rate": 0.00015783963542308167, "loss": 7.6366, "step": 5561 }, { "epoch": 0.6846381093057607, "grad_norm": 0.06696085631847382, "learning_rate": 0.00015777805148417293, "loss": 7.7233, "step": 5562 }, { "epoch": 0.6847612013786312, "grad_norm": 0.07233208417892456, "learning_rate": 0.0001577164675452642, "loss": 7.7082, "step": 5563 }, { "epoch": 0.6848842934515017, "grad_norm": 0.06087883934378624, "learning_rate": 
0.00015765488360635547, "loss": 7.5435, "step": 5564 }, { "epoch": 0.6850073855243722, "grad_norm": 0.0640827938914299, "learning_rate": 0.00015759329966744673, "loss": 7.5061, "step": 5565 }, { "epoch": 0.6851304775972428, "grad_norm": 0.08133956044912338, "learning_rate": 0.000157531715728538, "loss": 7.6561, "step": 5566 }, { "epoch": 0.6852535696701132, "grad_norm": 0.06614469736814499, "learning_rate": 0.00015747013178962928, "loss": 7.6789, "step": 5567 }, { "epoch": 0.6853766617429837, "grad_norm": 0.14448094367980957, "learning_rate": 0.00015740854785072054, "loss": 7.2512, "step": 5568 }, { "epoch": 0.6854997538158543, "grad_norm": 0.10887555778026581, "learning_rate": 0.0001573469639118118, "loss": 7.759, "step": 5569 }, { "epoch": 0.6856228458887248, "grad_norm": 0.06222223490476608, "learning_rate": 0.00015728537997290306, "loss": 7.7572, "step": 5570 }, { "epoch": 0.6857459379615952, "grad_norm": 0.05962382256984711, "learning_rate": 0.00015722379603399435, "loss": 7.5812, "step": 5571 }, { "epoch": 0.6858690300344658, "grad_norm": 0.09614887088537216, "learning_rate": 0.0001571622120950856, "loss": 7.3247, "step": 5572 }, { "epoch": 0.6859921221073363, "grad_norm": 0.06639881432056427, "learning_rate": 0.00015710062815617686, "loss": 7.5749, "step": 5573 }, { "epoch": 0.6861152141802068, "grad_norm": 0.1666172295808792, "learning_rate": 0.00015703904421726815, "loss": 8.2227, "step": 5574 }, { "epoch": 0.6862383062530774, "grad_norm": 0.11434435844421387, "learning_rate": 0.0001569774602783594, "loss": 8.3512, "step": 5575 }, { "epoch": 0.6863613983259478, "grad_norm": 0.1838649958372116, "learning_rate": 0.00015691587633945067, "loss": 8.6454, "step": 5576 }, { "epoch": 0.6864844903988183, "grad_norm": 0.1399032324552536, "learning_rate": 0.00015685429240054193, "loss": 7.1947, "step": 5577 }, { "epoch": 0.6866075824716888, "grad_norm": 0.07473082840442657, "learning_rate": 0.00015679270846163322, "loss": 7.5256, "step": 5578 }, { "epoch": 
0.6867306745445594, "grad_norm": 0.09921763837337494, "learning_rate": 0.00015673112452272448, "loss": 7.9291, "step": 5579 }, { "epoch": 0.6868537666174298, "grad_norm": 0.09139002859592438, "learning_rate": 0.00015666954058381574, "loss": 7.7156, "step": 5580 }, { "epoch": 0.6869768586903003, "grad_norm": 0.15614117681980133, "learning_rate": 0.00015660795664490702, "loss": 8.2042, "step": 5581 }, { "epoch": 0.6870999507631709, "grad_norm": 0.15688936412334442, "learning_rate": 0.00015654637270599828, "loss": 7.2223, "step": 5582 }, { "epoch": 0.6872230428360414, "grad_norm": 0.11859045922756195, "learning_rate": 0.00015648478876708954, "loss": 7.7053, "step": 5583 }, { "epoch": 0.6873461349089118, "grad_norm": 0.11315173655748367, "learning_rate": 0.00015642320482818083, "loss": 7.9645, "step": 5584 }, { "epoch": 0.6874692269817824, "grad_norm": 0.10488997399806976, "learning_rate": 0.0001563616208892721, "loss": 7.8746, "step": 5585 }, { "epoch": 0.6875923190546529, "grad_norm": 0.0961296483874321, "learning_rate": 0.00015630003695036335, "loss": 7.9838, "step": 5586 }, { "epoch": 0.6877154111275234, "grad_norm": 0.14101967215538025, "learning_rate": 0.0001562384530114546, "loss": 7.2952, "step": 5587 }, { "epoch": 0.6878385032003939, "grad_norm": 0.06831886619329453, "learning_rate": 0.0001561768690725459, "loss": 7.535, "step": 5588 }, { "epoch": 0.6879615952732644, "grad_norm": 0.0845433846116066, "learning_rate": 0.00015611528513363715, "loss": 7.7897, "step": 5589 }, { "epoch": 0.6880846873461349, "grad_norm": 0.11046618223190308, "learning_rate": 0.0001560537011947284, "loss": 7.2579, "step": 5590 }, { "epoch": 0.6882077794190055, "grad_norm": 0.10899553447961807, "learning_rate": 0.0001559921172558197, "loss": 7.8015, "step": 5591 }, { "epoch": 0.6883308714918759, "grad_norm": 0.09696652740240097, "learning_rate": 0.00015593053331691096, "loss": 7.4202, "step": 5592 }, { "epoch": 0.6884539635647464, "grad_norm": 0.12703149020671844, "learning_rate": 
0.00015586894937800222, "loss": 7.5667, "step": 5593 }, { "epoch": 0.688577055637617, "grad_norm": 0.09820183366537094, "learning_rate": 0.0001558073654390935, "loss": 7.8091, "step": 5594 }, { "epoch": 0.6887001477104875, "grad_norm": 0.0644463449716568, "learning_rate": 0.00015574578150018476, "loss": 7.5832, "step": 5595 }, { "epoch": 0.6888232397833579, "grad_norm": 0.0532696396112442, "learning_rate": 0.00015568419756127602, "loss": 7.5682, "step": 5596 }, { "epoch": 0.6889463318562284, "grad_norm": 0.08124374598264694, "learning_rate": 0.00015562261362236728, "loss": 7.3161, "step": 5597 }, { "epoch": 0.689069423929099, "grad_norm": 0.062487680464982986, "learning_rate": 0.00015556102968345857, "loss": 7.846, "step": 5598 }, { "epoch": 0.6891925160019695, "grad_norm": 0.09027387201786041, "learning_rate": 0.00015549944574454983, "loss": 7.3622, "step": 5599 }, { "epoch": 0.6893156080748399, "grad_norm": 0.07546699047088623, "learning_rate": 0.0001554378618056411, "loss": 7.4938, "step": 5600 }, { "epoch": 0.6894387001477105, "grad_norm": 0.0954323336482048, "learning_rate": 0.00015537627786673237, "loss": 7.7678, "step": 5601 }, { "epoch": 0.689561792220581, "grad_norm": 0.17794372141361237, "learning_rate": 0.00015531469392782363, "loss": 8.2162, "step": 5602 }, { "epoch": 0.6896848842934515, "grad_norm": 0.11519675701856613, "learning_rate": 0.0001552531099889149, "loss": 7.2383, "step": 5603 }, { "epoch": 0.689807976366322, "grad_norm": 0.25351783633232117, "learning_rate": 0.00015519152605000615, "loss": 8.4875, "step": 5604 }, { "epoch": 0.6899310684391925, "grad_norm": 0.0921446904540062, "learning_rate": 0.00015512994211109744, "loss": 7.7632, "step": 5605 }, { "epoch": 0.690054160512063, "grad_norm": 0.24274493753910065, "learning_rate": 0.0001550683581721887, "loss": 8.6746, "step": 5606 }, { "epoch": 0.6901772525849336, "grad_norm": 0.09865712374448776, "learning_rate": 0.00015500677423327996, "loss": 7.4167, "step": 5607 }, { "epoch": 
0.690300344657804, "grad_norm": 0.1125524714589119, "learning_rate": 0.00015494519029437125, "loss": 7.653, "step": 5608 }, { "epoch": 0.6904234367306745, "grad_norm": 0.07760484516620636, "learning_rate": 0.0001548836063554625, "loss": 7.9217, "step": 5609 }, { "epoch": 0.6905465288035451, "grad_norm": 0.22211678326129913, "learning_rate": 0.00015482202241655376, "loss": 8.7878, "step": 5610 }, { "epoch": 0.6906696208764156, "grad_norm": 0.14200268685817719, "learning_rate": 0.00015476043847764505, "loss": 8.2567, "step": 5611 }, { "epoch": 0.690792712949286, "grad_norm": 0.11956030875444412, "learning_rate": 0.0001546988545387363, "loss": 7.5161, "step": 5612 }, { "epoch": 0.6909158050221565, "grad_norm": 0.12318380177021027, "learning_rate": 0.00015463727059982757, "loss": 7.6648, "step": 5613 }, { "epoch": 0.6910388970950271, "grad_norm": 0.05998223274946213, "learning_rate": 0.00015457568666091883, "loss": 7.8498, "step": 5614 }, { "epoch": 0.6911619891678976, "grad_norm": 0.20316286385059357, "learning_rate": 0.00015451410272201012, "loss": 7.0546, "step": 5615 }, { "epoch": 0.691285081240768, "grad_norm": 0.11382832378149033, "learning_rate": 0.00015445251878310138, "loss": 7.6822, "step": 5616 }, { "epoch": 0.6914081733136386, "grad_norm": 0.046588025987148285, "learning_rate": 0.00015439093484419263, "loss": 7.6661, "step": 5617 }, { "epoch": 0.6915312653865091, "grad_norm": 0.077443927526474, "learning_rate": 0.00015432935090528392, "loss": 7.5067, "step": 5618 }, { "epoch": 0.6916543574593796, "grad_norm": 0.10377924889326096, "learning_rate": 0.00015426776696637518, "loss": 7.2755, "step": 5619 }, { "epoch": 0.6917774495322502, "grad_norm": 0.08080679178237915, "learning_rate": 0.00015420618302746644, "loss": 7.8482, "step": 5620 }, { "epoch": 0.6919005416051206, "grad_norm": 0.08499638736248016, "learning_rate": 0.0001541445990885577, "loss": 7.5447, "step": 5621 }, { "epoch": 0.6920236336779911, "grad_norm": 0.07143831998109818, "learning_rate": 
0.00015408301514964899, "loss": 7.3377, "step": 5622 }, { "epoch": 0.6921467257508617, "grad_norm": 0.10835324972867966, "learning_rate": 0.00015402143121074025, "loss": 7.389, "step": 5623 }, { "epoch": 0.6922698178237322, "grad_norm": 0.10528860241174698, "learning_rate": 0.0001539598472718315, "loss": 7.574, "step": 5624 }, { "epoch": 0.6923929098966026, "grad_norm": 0.1274435818195343, "learning_rate": 0.0001538982633329228, "loss": 7.3573, "step": 5625 }, { "epoch": 0.6925160019694732, "grad_norm": 0.1120176687836647, "learning_rate": 0.00015383667939401405, "loss": 7.6439, "step": 5626 }, { "epoch": 0.6926390940423437, "grad_norm": 0.061677396297454834, "learning_rate": 0.0001537750954551053, "loss": 7.637, "step": 5627 }, { "epoch": 0.6927621861152142, "grad_norm": 0.09582281112670898, "learning_rate": 0.0001537135115161966, "loss": 7.4744, "step": 5628 }, { "epoch": 0.6928852781880847, "grad_norm": 0.40962472558021545, "learning_rate": 0.00015365192757728786, "loss": 9.2922, "step": 5629 }, { "epoch": 0.6930083702609552, "grad_norm": 0.11410889029502869, "learning_rate": 0.00015359034363837912, "loss": 7.6502, "step": 5630 }, { "epoch": 0.6931314623338257, "grad_norm": 0.13243679702281952, "learning_rate": 0.00015352875969947038, "loss": 7.5545, "step": 5631 }, { "epoch": 0.6932545544066963, "grad_norm": 0.06851696223020554, "learning_rate": 0.00015346717576056166, "loss": 7.7642, "step": 5632 }, { "epoch": 0.6933776464795667, "grad_norm": 0.06552888453006744, "learning_rate": 0.00015340559182165292, "loss": 7.6338, "step": 5633 }, { "epoch": 0.6935007385524372, "grad_norm": 0.09732052683830261, "learning_rate": 0.00015334400788274418, "loss": 7.7744, "step": 5634 }, { "epoch": 0.6936238306253077, "grad_norm": 0.09224511682987213, "learning_rate": 0.00015328242394383547, "loss": 7.7002, "step": 5635 }, { "epoch": 0.6937469226981783, "grad_norm": 0.09264708310365677, "learning_rate": 0.00015322084000492673, "loss": 7.5418, "step": 5636 }, { "epoch": 
0.6938700147710487, "grad_norm": 0.08722560852766037, "learning_rate": 0.000153159256066018, "loss": 7.4626, "step": 5637 }, { "epoch": 0.6939931068439192, "grad_norm": 0.16888052225112915, "learning_rate": 0.00015309767212710927, "loss": 8.5642, "step": 5638 }, { "epoch": 0.6941161989167898, "grad_norm": 0.14124691486358643, "learning_rate": 0.00015303608818820053, "loss": 7.7673, "step": 5639 }, { "epoch": 0.6942392909896603, "grad_norm": 0.11806020885705948, "learning_rate": 0.0001529745042492918, "loss": 7.1846, "step": 5640 }, { "epoch": 0.6943623830625307, "grad_norm": 0.07711747288703918, "learning_rate": 0.00015291292031038305, "loss": 7.9789, "step": 5641 }, { "epoch": 0.6944854751354013, "grad_norm": 0.27608364820480347, "learning_rate": 0.00015285133637147434, "loss": 8.6054, "step": 5642 }, { "epoch": 0.6946085672082718, "grad_norm": 0.138520285487175, "learning_rate": 0.0001527897524325656, "loss": 7.4784, "step": 5643 }, { "epoch": 0.6947316592811423, "grad_norm": 0.09862193465232849, "learning_rate": 0.00015272816849365686, "loss": 7.2651, "step": 5644 }, { "epoch": 0.6948547513540128, "grad_norm": 0.08839597553014755, "learning_rate": 0.00015266658455474814, "loss": 7.3672, "step": 5645 }, { "epoch": 0.6949778434268833, "grad_norm": 0.200622096657753, "learning_rate": 0.0001526050006158394, "loss": 7.4468, "step": 5646 }, { "epoch": 0.6951009354997538, "grad_norm": 0.11570584774017334, "learning_rate": 0.00015254341667693066, "loss": 8.0043, "step": 5647 }, { "epoch": 0.6952240275726244, "grad_norm": 0.05663638934493065, "learning_rate": 0.00015248183273802192, "loss": 7.575, "step": 5648 }, { "epoch": 0.6953471196454948, "grad_norm": 0.08439051359891891, "learning_rate": 0.0001524202487991132, "loss": 7.6029, "step": 5649 }, { "epoch": 0.6954702117183653, "grad_norm": 0.10017139464616776, "learning_rate": 0.00015235866486020447, "loss": 7.7604, "step": 5650 }, { "epoch": 0.6955933037912359, "grad_norm": 0.08153678476810455, "learning_rate": 
0.00015229708092129573, "loss": 7.4153, "step": 5651 }, { "epoch": 0.6957163958641064, "grad_norm": 0.10161006450653076, "learning_rate": 0.00015223549698238702, "loss": 7.4187, "step": 5652 }, { "epoch": 0.6958394879369768, "grad_norm": 0.08793819695711136, "learning_rate": 0.00015217391304347827, "loss": 7.4705, "step": 5653 }, { "epoch": 0.6959625800098473, "grad_norm": 0.10048740357160568, "learning_rate": 0.00015211232910456953, "loss": 7.5058, "step": 5654 }, { "epoch": 0.6960856720827179, "grad_norm": 0.17609252035617828, "learning_rate": 0.00015205074516566082, "loss": 8.3714, "step": 5655 }, { "epoch": 0.6962087641555884, "grad_norm": 0.276230126619339, "learning_rate": 0.00015198916122675208, "loss": 9.3498, "step": 5656 }, { "epoch": 0.6963318562284588, "grad_norm": 0.20561382174491882, "learning_rate": 0.00015192757728784334, "loss": 8.5364, "step": 5657 }, { "epoch": 0.6964549483013294, "grad_norm": 0.08284345269203186, "learning_rate": 0.0001518659933489346, "loss": 7.7261, "step": 5658 }, { "epoch": 0.6965780403741999, "grad_norm": 0.28174299001693726, "learning_rate": 0.00015180440941002589, "loss": 9.3824, "step": 5659 }, { "epoch": 0.6967011324470704, "grad_norm": 0.11592406779527664, "learning_rate": 0.00015174282547111715, "loss": 7.5831, "step": 5660 }, { "epoch": 0.696824224519941, "grad_norm": 0.16717953979969025, "learning_rate": 0.0001516812415322084, "loss": 7.5412, "step": 5661 }, { "epoch": 0.6969473165928114, "grad_norm": 0.07460425049066544, "learning_rate": 0.0001516196575932997, "loss": 8.2904, "step": 5662 }, { "epoch": 0.6970704086656819, "grad_norm": 0.09270127862691879, "learning_rate": 0.00015155807365439095, "loss": 7.7368, "step": 5663 }, { "epoch": 0.6971935007385525, "grad_norm": 0.06483146548271179, "learning_rate": 0.0001514964897154822, "loss": 7.768, "step": 5664 }, { "epoch": 0.697316592811423, "grad_norm": 0.08557460457086563, "learning_rate": 0.0001514349057765735, "loss": 7.91, "step": 5665 }, { "epoch": 
0.6974396848842934, "grad_norm": 0.0893891453742981, "learning_rate": 0.00015137332183766476, "loss": 7.6443, "step": 5666 }, { "epoch": 0.697562776957164, "grad_norm": 0.1399940401315689, "learning_rate": 0.00015131173789875602, "loss": 7.298, "step": 5667 }, { "epoch": 0.6976858690300345, "grad_norm": 0.1371612399816513, "learning_rate": 0.00015125015395984728, "loss": 7.3141, "step": 5668 }, { "epoch": 0.697808961102905, "grad_norm": 0.23340818285942078, "learning_rate": 0.00015118857002093856, "loss": 7.3836, "step": 5669 }, { "epoch": 0.6979320531757754, "grad_norm": 0.08467558026313782, "learning_rate": 0.00015112698608202982, "loss": 7.2959, "step": 5670 }, { "epoch": 0.698055145248646, "grad_norm": 0.11677270382642746, "learning_rate": 0.00015106540214312108, "loss": 7.4971, "step": 5671 }, { "epoch": 0.6981782373215165, "grad_norm": 0.151045024394989, "learning_rate": 0.00015100381820421237, "loss": 7.5752, "step": 5672 }, { "epoch": 0.698301329394387, "grad_norm": 0.13414977490901947, "learning_rate": 0.00015094223426530363, "loss": 7.5498, "step": 5673 }, { "epoch": 0.6984244214672575, "grad_norm": 0.17424257099628448, "learning_rate": 0.0001508806503263949, "loss": 7.6519, "step": 5674 }, { "epoch": 0.698547513540128, "grad_norm": 0.19795584678649902, "learning_rate": 0.00015081906638748615, "loss": 8.2288, "step": 5675 }, { "epoch": 0.6986706056129985, "grad_norm": 0.10062911361455917, "learning_rate": 0.00015075748244857743, "loss": 7.5534, "step": 5676 }, { "epoch": 0.6987936976858691, "grad_norm": 0.05615542083978653, "learning_rate": 0.0001506958985096687, "loss": 7.4731, "step": 5677 }, { "epoch": 0.6989167897587395, "grad_norm": 0.09266587346792221, "learning_rate": 0.00015063431457075995, "loss": 7.4304, "step": 5678 }, { "epoch": 0.69903988183161, "grad_norm": 0.06686051189899445, "learning_rate": 0.00015057273063185124, "loss": 7.5413, "step": 5679 }, { "epoch": 0.6991629739044806, "grad_norm": 0.07547057420015335, "learning_rate": 
0.0001505111466929425, "loss": 7.708, "step": 5680 }, { "epoch": 0.6992860659773511, "grad_norm": 0.07878107577562332, "learning_rate": 0.00015044956275403376, "loss": 7.8066, "step": 5681 }, { "epoch": 0.6994091580502215, "grad_norm": 0.1096540093421936, "learning_rate": 0.00015038797881512504, "loss": 7.4113, "step": 5682 }, { "epoch": 0.6995322501230921, "grad_norm": 0.08499184250831604, "learning_rate": 0.0001503263948762163, "loss": 7.5358, "step": 5683 }, { "epoch": 0.6996553421959626, "grad_norm": 0.0887739360332489, "learning_rate": 0.00015026481093730756, "loss": 7.6547, "step": 5684 }, { "epoch": 0.6997784342688331, "grad_norm": 0.09323236346244812, "learning_rate": 0.00015020322699839882, "loss": 8.1697, "step": 5685 }, { "epoch": 0.6999015263417036, "grad_norm": 0.3933865427970886, "learning_rate": 0.0001501416430594901, "loss": 9.1762, "step": 5686 }, { "epoch": 0.7000246184145741, "grad_norm": 0.09846866130828857, "learning_rate": 0.00015008005912058137, "loss": 8.0746, "step": 5687 }, { "epoch": 0.7001477104874446, "grad_norm": 0.22194689512252808, "learning_rate": 0.00015001847518167263, "loss": 8.4718, "step": 5688 }, { "epoch": 0.7002708025603152, "grad_norm": 0.07642322778701782, "learning_rate": 0.0001499568912427639, "loss": 7.6267, "step": 5689 }, { "epoch": 0.7003938946331856, "grad_norm": 0.10981497168540955, "learning_rate": 0.00014989530730385515, "loss": 7.3794, "step": 5690 }, { "epoch": 0.7005169867060561, "grad_norm": 0.07440284639596939, "learning_rate": 0.0001498337233649464, "loss": 7.5224, "step": 5691 }, { "epoch": 0.7006400787789266, "grad_norm": 0.07229426503181458, "learning_rate": 0.00014977213942603767, "loss": 7.4905, "step": 5692 }, { "epoch": 0.7007631708517972, "grad_norm": 0.0896768569946289, "learning_rate": 0.00014971055548712895, "loss": 7.5122, "step": 5693 }, { "epoch": 0.7008862629246676, "grad_norm": 0.07411811500787735, "learning_rate": 0.0001496489715482202, "loss": 7.6779, "step": 5694 }, { "epoch": 
0.7010093549975381, "grad_norm": 0.161537766456604, "learning_rate": 0.00014958738760931147, "loss": 8.2227, "step": 5695 }, { "epoch": 0.7011324470704087, "grad_norm": 0.09456883370876312, "learning_rate": 0.00014952580367040276, "loss": 7.3932, "step": 5696 }, { "epoch": 0.7012555391432792, "grad_norm": 0.08672541379928589, "learning_rate": 0.00014946421973149402, "loss": 7.7239, "step": 5697 }, { "epoch": 0.7013786312161496, "grad_norm": 0.08769035339355469, "learning_rate": 0.00014940263579258528, "loss": 7.422, "step": 5698 }, { "epoch": 0.7015017232890202, "grad_norm": 0.10936300456523895, "learning_rate": 0.00014934105185367656, "loss": 8.2301, "step": 5699 }, { "epoch": 0.7016248153618907, "grad_norm": 0.2552002966403961, "learning_rate": 0.00014927946791476782, "loss": 8.6728, "step": 5700 }, { "epoch": 0.7017479074347612, "grad_norm": 0.09296290576457977, "learning_rate": 0.00014921788397585908, "loss": 7.4226, "step": 5701 }, { "epoch": 0.7018709995076317, "grad_norm": 0.07846072316169739, "learning_rate": 0.00014915630003695034, "loss": 7.6321, "step": 5702 }, { "epoch": 0.7019940915805022, "grad_norm": 0.10011544078588486, "learning_rate": 0.00014909471609804163, "loss": 7.2988, "step": 5703 }, { "epoch": 0.7021171836533727, "grad_norm": 0.1534227877855301, "learning_rate": 0.0001490331321591329, "loss": 8.7309, "step": 5704 }, { "epoch": 0.7022402757262433, "grad_norm": 0.10503726452589035, "learning_rate": 0.00014897154822022415, "loss": 7.4282, "step": 5705 }, { "epoch": 0.7023633677991138, "grad_norm": 0.10912304371595383, "learning_rate": 0.00014890996428131543, "loss": 7.5343, "step": 5706 }, { "epoch": 0.7024864598719842, "grad_norm": 0.08706942945718765, "learning_rate": 0.0001488483803424067, "loss": 7.5555, "step": 5707 }, { "epoch": 0.7026095519448547, "grad_norm": 0.07596494257450104, "learning_rate": 0.00014878679640349795, "loss": 7.5616, "step": 5708 }, { "epoch": 0.7027326440177253, "grad_norm": 0.05256293714046478, "learning_rate": 
0.00014872521246458924, "loss": 7.3048, "step": 5709 }, { "epoch": 0.7028557360905958, "grad_norm": 0.13299264013767242, "learning_rate": 0.0001486636285256805, "loss": 7.8997, "step": 5710 }, { "epoch": 0.7029788281634662, "grad_norm": 0.4261533319950104, "learning_rate": 0.00014860204458677176, "loss": 9.1042, "step": 5711 }, { "epoch": 0.7031019202363368, "grad_norm": 0.09263144433498383, "learning_rate": 0.00014854046064786302, "loss": 7.6644, "step": 5712 }, { "epoch": 0.7032250123092073, "grad_norm": 0.09013485163450241, "learning_rate": 0.0001484788767089543, "loss": 7.4736, "step": 5713 }, { "epoch": 0.7033481043820778, "grad_norm": 0.1688622683286667, "learning_rate": 0.00014841729277004556, "loss": 7.9147, "step": 5714 }, { "epoch": 0.7034711964549483, "grad_norm": 0.08410118520259857, "learning_rate": 0.00014835570883113682, "loss": 7.4643, "step": 5715 }, { "epoch": 0.7035942885278188, "grad_norm": 0.11950257420539856, "learning_rate": 0.0001482941248922281, "loss": 8.1439, "step": 5716 }, { "epoch": 0.7037173806006893, "grad_norm": 0.07377878576517105, "learning_rate": 0.00014823254095331937, "loss": 7.5533, "step": 5717 }, { "epoch": 0.7038404726735599, "grad_norm": 0.06717133522033691, "learning_rate": 0.00014817095701441063, "loss": 7.7348, "step": 5718 }, { "epoch": 0.7039635647464303, "grad_norm": 0.10278361290693283, "learning_rate": 0.0001481093730755019, "loss": 7.525, "step": 5719 }, { "epoch": 0.7040866568193008, "grad_norm": 0.07209102809429169, "learning_rate": 0.00014804778913659318, "loss": 7.8161, "step": 5720 }, { "epoch": 0.7042097488921714, "grad_norm": 0.11324641853570938, "learning_rate": 0.00014798620519768444, "loss": 7.6108, "step": 5721 }, { "epoch": 0.7043328409650419, "grad_norm": 0.12832540273666382, "learning_rate": 0.0001479246212587757, "loss": 7.3398, "step": 5722 }, { "epoch": 0.7044559330379123, "grad_norm": 0.13911356031894684, "learning_rate": 0.00014786303731986698, "loss": 7.1236, "step": 5723 }, { "epoch": 
0.7045790251107829, "grad_norm": 0.2738215923309326, "learning_rate": 0.00014780145338095824, "loss": 8.7772, "step": 5724 }, { "epoch": 0.7047021171836534, "grad_norm": 0.09287873655557632, "learning_rate": 0.0001477398694420495, "loss": 7.515, "step": 5725 }, { "epoch": 0.7048252092565239, "grad_norm": 0.10052882879972458, "learning_rate": 0.0001476782855031408, "loss": 7.783, "step": 5726 }, { "epoch": 0.7049483013293943, "grad_norm": 0.08985292166471481, "learning_rate": 0.00014761670156423205, "loss": 7.2351, "step": 5727 }, { "epoch": 0.7050713934022649, "grad_norm": 0.6105116009712219, "learning_rate": 0.0001475551176253233, "loss": 10.3807, "step": 5728 }, { "epoch": 0.7051944854751354, "grad_norm": 0.06125693768262863, "learning_rate": 0.00014749353368641457, "loss": 7.2964, "step": 5729 }, { "epoch": 0.705317577548006, "grad_norm": 0.07902777194976807, "learning_rate": 0.00014743194974750585, "loss": 7.5572, "step": 5730 }, { "epoch": 0.7054406696208764, "grad_norm": 0.06612709909677505, "learning_rate": 0.0001473703658085971, "loss": 7.4109, "step": 5731 }, { "epoch": 0.7055637616937469, "grad_norm": 0.22613106667995453, "learning_rate": 0.00014730878186968837, "loss": 8.3322, "step": 5732 }, { "epoch": 0.7056868537666174, "grad_norm": 0.08290378749370575, "learning_rate": 0.00014724719793077966, "loss": 7.3951, "step": 5733 }, { "epoch": 0.705809945839488, "grad_norm": 0.07060316950082779, "learning_rate": 0.00014718561399187092, "loss": 7.6889, "step": 5734 }, { "epoch": 0.7059330379123584, "grad_norm": 0.11269094794988632, "learning_rate": 0.00014712403005296218, "loss": 8.2455, "step": 5735 }, { "epoch": 0.7060561299852289, "grad_norm": 0.5297726392745972, "learning_rate": 0.00014706244611405346, "loss": 10.1537, "step": 5736 }, { "epoch": 0.7061792220580995, "grad_norm": 0.07747124135494232, "learning_rate": 0.00014700086217514472, "loss": 7.9484, "step": 5737 }, { "epoch": 0.70630231413097, "grad_norm": 0.07002104818820953, "learning_rate": 
0.00014693927823623598, "loss": 7.801, "step": 5738 }, { "epoch": 0.7064254062038404, "grad_norm": 0.13540434837341309, "learning_rate": 0.00014687769429732724, "loss": 7.4729, "step": 5739 }, { "epoch": 0.706548498276711, "grad_norm": 0.13198409974575043, "learning_rate": 0.00014681611035841853, "loss": 8.3141, "step": 5740 }, { "epoch": 0.7066715903495815, "grad_norm": 0.2461465299129486, "learning_rate": 0.0001467545264195098, "loss": 7.3498, "step": 5741 }, { "epoch": 0.706794682422452, "grad_norm": 0.08108828216791153, "learning_rate": 0.00014669294248060105, "loss": 7.7729, "step": 5742 }, { "epoch": 0.7069177744953224, "grad_norm": 0.17109085619449615, "learning_rate": 0.00014663135854169233, "loss": 7.4875, "step": 5743 }, { "epoch": 0.707040866568193, "grad_norm": 0.13212448358535767, "learning_rate": 0.0001465697746027836, "loss": 7.4575, "step": 5744 }, { "epoch": 0.7071639586410635, "grad_norm": 0.08271720260381699, "learning_rate": 0.00014650819066387485, "loss": 7.6034, "step": 5745 }, { "epoch": 0.707287050713934, "grad_norm": 0.14129638671875, "learning_rate": 0.0001464466067249661, "loss": 7.3181, "step": 5746 }, { "epoch": 0.7074101427868045, "grad_norm": 0.11081155389547348, "learning_rate": 0.0001463850227860574, "loss": 8.0345, "step": 5747 }, { "epoch": 0.707533234859675, "grad_norm": 0.09239121526479721, "learning_rate": 0.00014632343884714866, "loss": 7.7505, "step": 5748 }, { "epoch": 0.7076563269325455, "grad_norm": 0.10509057343006134, "learning_rate": 0.00014626185490823992, "loss": 7.6678, "step": 5749 }, { "epoch": 0.7077794190054161, "grad_norm": 0.11060407012701035, "learning_rate": 0.0001462002709693312, "loss": 7.5438, "step": 5750 }, { "epoch": 0.7079025110782866, "grad_norm": 0.20749658346176147, "learning_rate": 0.00014613868703042246, "loss": 7.9532, "step": 5751 }, { "epoch": 0.708025603151157, "grad_norm": 0.1458033174276352, "learning_rate": 0.00014607710309151372, "loss": 7.7473, "step": 5752 }, { "epoch": 
0.7081486952240276, "grad_norm": 0.17289935052394867, "learning_rate": 0.000146015519152605, "loss": 7.7237, "step": 5753 }, { "epoch": 0.7082717872968981, "grad_norm": 0.09264075756072998, "learning_rate": 0.00014595393521369627, "loss": 7.274, "step": 5754 }, { "epoch": 0.7083948793697686, "grad_norm": 0.11836354434490204, "learning_rate": 0.00014589235127478753, "loss": 7.6569, "step": 5755 }, { "epoch": 0.7085179714426391, "grad_norm": 0.4254089891910553, "learning_rate": 0.0001458307673358788, "loss": 9.4212, "step": 5756 }, { "epoch": 0.7086410635155096, "grad_norm": 0.08155274391174316, "learning_rate": 0.00014576918339697008, "loss": 7.2566, "step": 5757 }, { "epoch": 0.7087641555883801, "grad_norm": 0.27560195326805115, "learning_rate": 0.00014570759945806133, "loss": 9.0375, "step": 5758 }, { "epoch": 0.7088872476612507, "grad_norm": 0.061978779733181, "learning_rate": 0.0001456460155191526, "loss": 7.6828, "step": 5759 }, { "epoch": 0.7090103397341211, "grad_norm": 0.09124965965747833, "learning_rate": 0.00014558443158024388, "loss": 7.4888, "step": 5760 }, { "epoch": 0.7091334318069916, "grad_norm": 0.06896716356277466, "learning_rate": 0.00014552284764133514, "loss": 7.7378, "step": 5761 }, { "epoch": 0.7092565238798622, "grad_norm": 0.07617934793233871, "learning_rate": 0.0001454612637024264, "loss": 7.6588, "step": 5762 }, { "epoch": 0.7093796159527327, "grad_norm": 0.11964759975671768, "learning_rate": 0.00014539967976351766, "loss": 7.4361, "step": 5763 }, { "epoch": 0.7095027080256031, "grad_norm": 0.15946613252162933, "learning_rate": 0.00014533809582460895, "loss": 7.427, "step": 5764 }, { "epoch": 0.7096258000984736, "grad_norm": 0.13256186246871948, "learning_rate": 0.0001452765118857002, "loss": 7.7965, "step": 5765 }, { "epoch": 0.7097488921713442, "grad_norm": 0.0842343121767044, "learning_rate": 0.00014521492794679146, "loss": 7.6023, "step": 5766 }, { "epoch": 0.7098719842442147, "grad_norm": 0.10762349516153336, "learning_rate": 
0.00014515334400788275, "loss": 7.5941, "step": 5767 }, { "epoch": 0.7099950763170851, "grad_norm": 0.0719289779663086, "learning_rate": 0.000145091760068974, "loss": 7.771, "step": 5768 }, { "epoch": 0.7101181683899557, "grad_norm": 0.08515717834234238, "learning_rate": 0.00014503017613006527, "loss": 7.6766, "step": 5769 }, { "epoch": 0.7102412604628262, "grad_norm": 0.32493239641189575, "learning_rate": 0.00014496859219115656, "loss": 8.9468, "step": 5770 }, { "epoch": 0.7103643525356967, "grad_norm": 0.06888922303915024, "learning_rate": 0.00014490700825224782, "loss": 7.5405, "step": 5771 }, { "epoch": 0.7104874446085672, "grad_norm": 0.11516717821359634, "learning_rate": 0.00014484542431333908, "loss": 7.344, "step": 5772 }, { "epoch": 0.7106105366814377, "grad_norm": 0.11831428855657578, "learning_rate": 0.00014478384037443034, "loss": 7.4146, "step": 5773 }, { "epoch": 0.7107336287543082, "grad_norm": 0.08436868339776993, "learning_rate": 0.00014472225643552162, "loss": 7.6123, "step": 5774 }, { "epoch": 0.7108567208271788, "grad_norm": 0.07746074348688126, "learning_rate": 0.00014466067249661288, "loss": 7.4543, "step": 5775 }, { "epoch": 0.7109798129000492, "grad_norm": 0.10598372668027878, "learning_rate": 0.00014459908855770414, "loss": 7.8184, "step": 5776 }, { "epoch": 0.7111029049729197, "grad_norm": 0.11701078712940216, "learning_rate": 0.00014453750461879543, "loss": 7.9583, "step": 5777 }, { "epoch": 0.7112259970457903, "grad_norm": 0.25764429569244385, "learning_rate": 0.0001444759206798867, "loss": 9.3818, "step": 5778 }, { "epoch": 0.7113490891186608, "grad_norm": 0.08926183730363846, "learning_rate": 0.00014441433674097795, "loss": 8.082, "step": 5779 }, { "epoch": 0.7114721811915312, "grad_norm": 0.09683698415756226, "learning_rate": 0.00014435275280206923, "loss": 7.8678, "step": 5780 }, { "epoch": 0.7115952732644018, "grad_norm": 0.12383691966533661, "learning_rate": 0.0001442911688631605, "loss": 7.6473, "step": 5781 }, { "epoch": 
0.7117183653372723, "grad_norm": 0.21559806168079376, "learning_rate": 0.00014422958492425175, "loss": 8.2683, "step": 5782 }, { "epoch": 0.7118414574101428, "grad_norm": 0.17894724011421204, "learning_rate": 0.000144168000985343, "loss": 9.0747, "step": 5783 }, { "epoch": 0.7119645494830132, "grad_norm": 0.11705602705478668, "learning_rate": 0.0001441064170464343, "loss": 8.4463, "step": 5784 }, { "epoch": 0.7120876415558838, "grad_norm": 0.20157426595687866, "learning_rate": 0.00014404483310752556, "loss": 8.8031, "step": 5785 }, { "epoch": 0.7122107336287543, "grad_norm": 0.19985774159431458, "learning_rate": 0.00014398324916861682, "loss": 7.5136, "step": 5786 }, { "epoch": 0.7123338257016248, "grad_norm": 0.1835206300020218, "learning_rate": 0.0001439216652297081, "loss": 7.5092, "step": 5787 }, { "epoch": 0.7124569177744953, "grad_norm": 0.22469203174114227, "learning_rate": 0.00014386008129079936, "loss": 7.3592, "step": 5788 }, { "epoch": 0.7125800098473658, "grad_norm": 0.17347510159015656, "learning_rate": 0.00014379849735189062, "loss": 7.9227, "step": 5789 }, { "epoch": 0.7127031019202363, "grad_norm": 0.17064712941646576, "learning_rate": 0.00014373691341298188, "loss": 7.3289, "step": 5790 }, { "epoch": 0.7128261939931069, "grad_norm": 0.08492955565452576, "learning_rate": 0.00014367532947407317, "loss": 7.6795, "step": 5791 }, { "epoch": 0.7129492860659774, "grad_norm": 0.33658158779144287, "learning_rate": 0.00014361374553516443, "loss": 9.0259, "step": 5792 }, { "epoch": 0.7130723781388478, "grad_norm": 0.10312914848327637, "learning_rate": 0.0001435521615962557, "loss": 8.027, "step": 5793 }, { "epoch": 0.7131954702117184, "grad_norm": 0.08364132046699524, "learning_rate": 0.00014349057765734697, "loss": 7.7774, "step": 5794 }, { "epoch": 0.7133185622845889, "grad_norm": 0.19722670316696167, "learning_rate": 0.00014342899371843823, "loss": 8.7504, "step": 5795 }, { "epoch": 0.7134416543574594, "grad_norm": 0.16297951340675354, "learning_rate": 
0.0001433674097795295, "loss": 8.2924, "step": 5796 }, { "epoch": 0.7135647464303299, "grad_norm": 0.18472889065742493, "learning_rate": 0.00014330582584062078, "loss": 7.179, "step": 5797 }, { "epoch": 0.7136878385032004, "grad_norm": 0.11264864355325699, "learning_rate": 0.00014324424190171204, "loss": 7.8804, "step": 5798 }, { "epoch": 0.7138109305760709, "grad_norm": 0.1344481110572815, "learning_rate": 0.0001431826579628033, "loss": 7.6613, "step": 5799 }, { "epoch": 0.7139340226489415, "grad_norm": 0.08021939545869827, "learning_rate": 0.00014312107402389456, "loss": 7.529, "step": 5800 }, { "epoch": 0.7140571147218119, "grad_norm": 0.12890568375587463, "learning_rate": 0.00014305949008498585, "loss": 7.3421, "step": 5801 }, { "epoch": 0.7141802067946824, "grad_norm": 0.1297309249639511, "learning_rate": 0.0001429979061460771, "loss": 7.7097, "step": 5802 }, { "epoch": 0.714303298867553, "grad_norm": null, "learning_rate": 0.00014293632220716836, "loss": 9.6753, "step": 5803 }, { "epoch": 0.7144263909404235, "grad_norm": 0.08425939828157425, "learning_rate": 0.00014287473826825965, "loss": 7.4343, "step": 5804 }, { "epoch": 0.7145494830132939, "grad_norm": 0.12398553639650345, "learning_rate": 0.0001428131543293509, "loss": 8.3794, "step": 5805 }, { "epoch": 0.7146725750861644, "grad_norm": 0.15372078120708466, "learning_rate": 0.00014275157039044217, "loss": 7.7013, "step": 5806 }, { "epoch": 0.714795667159035, "grad_norm": 0.15299105644226074, "learning_rate": 0.00014268998645153346, "loss": 7.4379, "step": 5807 }, { "epoch": 0.7149187592319055, "grad_norm": 0.2985582649707794, "learning_rate": 0.00014262840251262472, "loss": 8.6276, "step": 5808 }, { "epoch": 0.7150418513047759, "grad_norm": 0.07749168574810028, "learning_rate": 0.00014256681857371598, "loss": 7.7204, "step": 5809 }, { "epoch": 0.7151649433776465, "grad_norm": 0.0864337831735611, "learning_rate": 0.00014250523463480723, "loss": 7.3321, "step": 5810 }, { "epoch": 0.715288035450517, 
"grad_norm": 0.07958444952964783, "learning_rate": 0.00014244365069589852, "loss": 7.4095, "step": 5811 }, { "epoch": 0.7154111275233875, "grad_norm": 0.15806932747364044, "learning_rate": 0.00014238206675698978, "loss": 7.7483, "step": 5812 }, { "epoch": 0.715534219596258, "grad_norm": 0.45068269968032837, "learning_rate": 0.00014232048281808104, "loss": 9.6608, "step": 5813 }, { "epoch": 0.7156573116691285, "grad_norm": 0.08506090939044952, "learning_rate": 0.00014225889887917233, "loss": 7.695, "step": 5814 }, { "epoch": 0.715780403741999, "grad_norm": 0.15988241136074066, "learning_rate": 0.0001421973149402636, "loss": 7.52, "step": 5815 }, { "epoch": 0.7159034958148696, "grad_norm": 0.18124765157699585, "learning_rate": 0.00014213573100135485, "loss": 7.2752, "step": 5816 }, { "epoch": 0.71602658788774, "grad_norm": 0.08457658439874649, "learning_rate": 0.0001420741470624461, "loss": 7.5878, "step": 5817 }, { "epoch": 0.7161496799606105, "grad_norm": 0.15013520419597626, "learning_rate": 0.0001420125631235374, "loss": 7.5384, "step": 5818 }, { "epoch": 0.716272772033481, "grad_norm": 0.07411982119083405, "learning_rate": 0.00014195097918462865, "loss": 7.3176, "step": 5819 }, { "epoch": 0.7163958641063516, "grad_norm": 0.35361579060554504, "learning_rate": 0.0001418893952457199, "loss": 8.8125, "step": 5820 }, { "epoch": 0.716518956179222, "grad_norm": 0.15487468242645264, "learning_rate": 0.0001418278113068112, "loss": 7.6269, "step": 5821 }, { "epoch": 0.7166420482520925, "grad_norm": 0.11374073475599289, "learning_rate": 0.00014176622736790246, "loss": 7.973, "step": 5822 }, { "epoch": 0.7167651403249631, "grad_norm": 0.09755706787109375, "learning_rate": 0.00014170464342899372, "loss": 7.5707, "step": 5823 }, { "epoch": 0.7168882323978336, "grad_norm": 0.08536829054355621, "learning_rate": 0.000141643059490085, "loss": 8.1464, "step": 5824 }, { "epoch": 0.717011324470704, "grad_norm": 0.18650494515895844, "learning_rate": 0.00014158147555117626, "loss": 
7.5185, "step": 5825 }, { "epoch": 0.7171344165435746, "grad_norm": 0.10935384035110474, "learning_rate": 0.00014151989161226752, "loss": 7.5754, "step": 5826 }, { "epoch": 0.7172575086164451, "grad_norm": 0.16277945041656494, "learning_rate": 0.00014145830767335878, "loss": 7.1206, "step": 5827 }, { "epoch": 0.7173806006893156, "grad_norm": 0.1352677047252655, "learning_rate": 0.00014139672373445007, "loss": 7.2997, "step": 5828 }, { "epoch": 0.7175036927621861, "grad_norm": 0.18266263604164124, "learning_rate": 0.00014133513979554133, "loss": 7.3675, "step": 5829 }, { "epoch": 0.7176267848350566, "grad_norm": 0.15988001227378845, "learning_rate": 0.0001412735558566326, "loss": 7.4647, "step": 5830 }, { "epoch": 0.7177498769079271, "grad_norm": 0.16620518267154694, "learning_rate": 0.00014121197191772387, "loss": 7.8912, "step": 5831 }, { "epoch": 0.7178729689807977, "grad_norm": 0.1393669992685318, "learning_rate": 0.00014115038797881513, "loss": 7.578, "step": 5832 }, { "epoch": 0.7179960610536681, "grad_norm": 0.11628592759370804, "learning_rate": 0.0001410888040399064, "loss": 7.4884, "step": 5833 }, { "epoch": 0.7181191531265386, "grad_norm": 0.15495406091213226, "learning_rate": 0.00014102722010099768, "loss": 7.3291, "step": 5834 }, { "epoch": 0.7182422451994092, "grad_norm": 0.08208876103162766, "learning_rate": 0.00014096563616208894, "loss": 7.7104, "step": 5835 }, { "epoch": 0.7183653372722797, "grad_norm": 0.12991628050804138, "learning_rate": 0.0001409040522231802, "loss": 7.4197, "step": 5836 }, { "epoch": 0.7184884293451502, "grad_norm": 0.09027121961116791, "learning_rate": 0.00014084246828427146, "loss": 7.4047, "step": 5837 }, { "epoch": 0.7186115214180206, "grad_norm": 0.06919677555561066, "learning_rate": 0.00014078088434536274, "loss": 7.5379, "step": 5838 }, { "epoch": 0.7187346134908912, "grad_norm": 0.18734654784202576, "learning_rate": 0.000140719300406454, "loss": 8.0234, "step": 5839 }, { "epoch": 0.7188577055637617, "grad_norm": 
0.18488071858882904, "learning_rate": 0.00014065771646754526, "loss": 7.7377, "step": 5840 }, { "epoch": 0.7189807976366323, "grad_norm": 0.06781422346830368, "learning_rate": 0.00014059613252863655, "loss": 7.2904, "step": 5841 }, { "epoch": 0.7191038897095027, "grad_norm": 0.1106870025396347, "learning_rate": 0.0001405345485897278, "loss": 7.363, "step": 5842 }, { "epoch": 0.7192269817823732, "grad_norm": 0.21324823796749115, "learning_rate": 0.00014047296465081907, "loss": 8.4042, "step": 5843 }, { "epoch": 0.7193500738552437, "grad_norm": 0.06490927189588547, "learning_rate": 0.00014041138071191033, "loss": 7.8884, "step": 5844 }, { "epoch": 0.7194731659281143, "grad_norm": 0.15007488429546356, "learning_rate": 0.00014034979677300162, "loss": 7.5398, "step": 5845 }, { "epoch": 0.7195962580009847, "grad_norm": 0.14279453456401825, "learning_rate": 0.00014028821283409287, "loss": 7.3593, "step": 5846 }, { "epoch": 0.7197193500738552, "grad_norm": 0.14296382665634155, "learning_rate": 0.00014022662889518413, "loss": 7.3508, "step": 5847 }, { "epoch": 0.7198424421467258, "grad_norm": 0.10909385979175568, "learning_rate": 0.00014016504495627542, "loss": 7.5388, "step": 5848 }, { "epoch": 0.7199655342195963, "grad_norm": 0.29799434542655945, "learning_rate": 0.00014010346101736668, "loss": 8.6694, "step": 5849 }, { "epoch": 0.7200886262924667, "grad_norm": 0.18908001482486725, "learning_rate": 0.00014004187707845794, "loss": 7.928, "step": 5850 }, { "epoch": 0.7202117183653373, "grad_norm": 0.2803410589694977, "learning_rate": 0.00013998029313954923, "loss": 8.7561, "step": 5851 }, { "epoch": 0.7203348104382078, "grad_norm": 0.2513260543346405, "learning_rate": 0.00013991870920064049, "loss": 8.3972, "step": 5852 }, { "epoch": 0.7204579025110783, "grad_norm": 0.10031735897064209, "learning_rate": 0.00013985712526173175, "loss": 7.4103, "step": 5853 }, { "epoch": 0.7205809945839488, "grad_norm": 0.06923572719097137, "learning_rate": 0.000139795541322823, "loss": 
7.9458, "step": 5854 }, { "epoch": 0.7207040866568193, "grad_norm": 0.09296640008687973, "learning_rate": 0.0001397339573839143, "loss": 7.5929, "step": 5855 }, { "epoch": 0.7208271787296898, "grad_norm": 0.0780370756983757, "learning_rate": 0.00013967237344500555, "loss": 8.029, "step": 5856 }, { "epoch": 0.7209502708025604, "grad_norm": 0.17040783166885376, "learning_rate": 0.0001396107895060968, "loss": 7.9369, "step": 5857 }, { "epoch": 0.7210733628754308, "grad_norm": 0.0771433562040329, "learning_rate": 0.0001395492055671881, "loss": 7.9075, "step": 5858 }, { "epoch": 0.7211964549483013, "grad_norm": 0.08599165081977844, "learning_rate": 0.00013948762162827936, "loss": 7.6575, "step": 5859 }, { "epoch": 0.7213195470211718, "grad_norm": 0.09113890677690506, "learning_rate": 0.00013942603768937062, "loss": 7.58, "step": 5860 }, { "epoch": 0.7214426390940424, "grad_norm": 0.13880254328250885, "learning_rate": 0.00013936445375046188, "loss": 7.6718, "step": 5861 }, { "epoch": 0.7215657311669128, "grad_norm": 0.10086233168840408, "learning_rate": 0.00013930286981155316, "loss": 7.4732, "step": 5862 }, { "epoch": 0.7216888232397833, "grad_norm": 0.14254875481128693, "learning_rate": 0.00013924128587264442, "loss": 7.7721, "step": 5863 }, { "epoch": 0.7218119153126539, "grad_norm": 0.1538916677236557, "learning_rate": 0.00013917970193373568, "loss": 7.8599, "step": 5864 }, { "epoch": 0.7219350073855244, "grad_norm": 0.10697381943464279, "learning_rate": 0.00013911811799482697, "loss": 7.6626, "step": 5865 }, { "epoch": 0.7220580994583948, "grad_norm": 0.07025738060474396, "learning_rate": 0.00013905653405591823, "loss": 7.2871, "step": 5866 }, { "epoch": 0.7221811915312654, "grad_norm": 0.09549406915903091, "learning_rate": 0.0001389949501170095, "loss": 7.5645, "step": 5867 }, { "epoch": 0.7223042836041359, "grad_norm": 0.09874720871448517, "learning_rate": 0.00013893336617810077, "loss": 8.0338, "step": 5868 }, { "epoch": 0.7224273756770064, "grad_norm": 
0.13226255774497986, "learning_rate": 0.00013887178223919203, "loss": 8.2488, "step": 5869 }, { "epoch": 0.7225504677498769, "grad_norm": 0.08387307822704315, "learning_rate": 0.0001388101983002833, "loss": 7.6393, "step": 5870 }, { "epoch": 0.7226735598227474, "grad_norm": 0.15891139209270477, "learning_rate": 0.00013874861436137455, "loss": 7.3246, "step": 5871 }, { "epoch": 0.7227966518956179, "grad_norm": 0.11649487167596817, "learning_rate": 0.00013868703042246584, "loss": 8.0807, "step": 5872 }, { "epoch": 0.7229197439684885, "grad_norm": 0.13749897480010986, "learning_rate": 0.0001386254464835571, "loss": 7.3159, "step": 5873 }, { "epoch": 0.7230428360413589, "grad_norm": 0.20395790040493011, "learning_rate": 0.00013856386254464836, "loss": 8.7446, "step": 5874 }, { "epoch": 0.7231659281142294, "grad_norm": 0.12230230122804642, "learning_rate": 0.00013850227860573964, "loss": 8.4284, "step": 5875 }, { "epoch": 0.7232890201871, "grad_norm": 0.15373137593269348, "learning_rate": 0.0001384406946668309, "loss": 7.2811, "step": 5876 }, { "epoch": 0.7234121122599705, "grad_norm": 0.20414595305919647, "learning_rate": 0.00013837911072792216, "loss": 7.1261, "step": 5877 }, { "epoch": 0.723535204332841, "grad_norm": 0.0998038724064827, "learning_rate": 0.00013831752678901345, "loss": 7.4394, "step": 5878 }, { "epoch": 0.7236582964057114, "grad_norm": 0.17029261589050293, "learning_rate": 0.0001382559428501047, "loss": 8.1461, "step": 5879 }, { "epoch": 0.723781388478582, "grad_norm": 0.17779289186000824, "learning_rate": 0.00013819435891119597, "loss": 8.4717, "step": 5880 }, { "epoch": 0.7239044805514525, "grad_norm": 0.10219481587409973, "learning_rate": 0.00013813277497228723, "loss": 7.4503, "step": 5881 }, { "epoch": 0.724027572624323, "grad_norm": 0.11349686980247498, "learning_rate": 0.00013807119103337851, "loss": 7.4861, "step": 5882 }, { "epoch": 0.7241506646971935, "grad_norm": 0.1248890683054924, "learning_rate": 0.00013800960709446977, "loss": 7.6924, 
"step": 5883 }, { "epoch": 0.724273756770064, "grad_norm": 0.08815639466047287, "learning_rate": 0.00013794802315556103, "loss": 7.8839, "step": 5884 }, { "epoch": 0.7243968488429345, "grad_norm": 0.07979144155979156, "learning_rate": 0.00013788643921665232, "loss": 7.6878, "step": 5885 }, { "epoch": 0.7245199409158051, "grad_norm": 0.11455494165420532, "learning_rate": 0.00013782485527774358, "loss": 8.2104, "step": 5886 }, { "epoch": 0.7246430329886755, "grad_norm": 0.07349572330713272, "learning_rate": 0.00013776327133883484, "loss": 7.9015, "step": 5887 }, { "epoch": 0.724766125061546, "grad_norm": 0.14477647840976715, "learning_rate": 0.0001377016873999261, "loss": 7.5186, "step": 5888 }, { "epoch": 0.7248892171344166, "grad_norm": 0.0985797867178917, "learning_rate": 0.00013764010346101739, "loss": 7.6006, "step": 5889 }, { "epoch": 0.7250123092072871, "grad_norm": 0.11023398488759995, "learning_rate": 0.00013757851952210864, "loss": 7.4899, "step": 5890 }, { "epoch": 0.7251354012801575, "grad_norm": 0.11746480315923691, "learning_rate": 0.0001375169355831999, "loss": 7.6845, "step": 5891 }, { "epoch": 0.725258493353028, "grad_norm": 0.12940748035907745, "learning_rate": 0.0001374553516442912, "loss": 7.8847, "step": 5892 }, { "epoch": 0.7253815854258986, "grad_norm": 0.05734112486243248, "learning_rate": 0.00013739376770538245, "loss": 7.3474, "step": 5893 }, { "epoch": 0.7255046774987691, "grad_norm": 0.13547435402870178, "learning_rate": 0.0001373321837664737, "loss": 7.5391, "step": 5894 }, { "epoch": 0.7256277695716395, "grad_norm": 0.07160394638776779, "learning_rate": 0.000137270599827565, "loss": 7.303, "step": 5895 }, { "epoch": 0.7257508616445101, "grad_norm": 0.3562943637371063, "learning_rate": 0.00013720901588865626, "loss": 8.8362, "step": 5896 }, { "epoch": 0.7258739537173806, "grad_norm": 0.1052851602435112, "learning_rate": 0.00013714743194974752, "loss": 7.5787, "step": 5897 }, { "epoch": 0.7259970457902511, "grad_norm": 0.068109892308712, 
"learning_rate": 0.00013708584801083878, "loss": 7.284, "step": 5898 }, { "epoch": 0.7261201378631216, "grad_norm": 0.07154984027147293, "learning_rate": 0.00013702426407193006, "loss": 7.4893, "step": 5899 }, { "epoch": 0.7262432299359921, "grad_norm": 0.07600881159305573, "learning_rate": 0.00013696268013302132, "loss": 7.4922, "step": 5900 }, { "epoch": 0.7263663220088626, "grad_norm": 0.05387922003865242, "learning_rate": 0.00013690109619411258, "loss": 7.6305, "step": 5901 }, { "epoch": 0.7264894140817332, "grad_norm": 0.09433631598949432, "learning_rate": 0.00013683951225520387, "loss": 7.3908, "step": 5902 }, { "epoch": 0.7266125061546036, "grad_norm": 0.09156771749258041, "learning_rate": 0.00013677792831629513, "loss": 7.4079, "step": 5903 }, { "epoch": 0.7267355982274741, "grad_norm": 0.07876113057136536, "learning_rate": 0.00013671634437738639, "loss": 7.5907, "step": 5904 }, { "epoch": 0.7268586903003447, "grad_norm": 0.1250673085451126, "learning_rate": 0.00013665476043847767, "loss": 7.872, "step": 5905 }, { "epoch": 0.7269817823732152, "grad_norm": 0.0944465920329094, "learning_rate": 0.00013659317649956893, "loss": 7.6706, "step": 5906 }, { "epoch": 0.7271048744460856, "grad_norm": 0.07874756306409836, "learning_rate": 0.0001365315925606602, "loss": 8.1024, "step": 5907 }, { "epoch": 0.7272279665189562, "grad_norm": 0.33439382910728455, "learning_rate": 0.00013647000862175145, "loss": 9.4252, "step": 5908 }, { "epoch": 0.7273510585918267, "grad_norm": 0.1332763433456421, "learning_rate": 0.00013640842468284274, "loss": 8.3148, "step": 5909 }, { "epoch": 0.7274741506646972, "grad_norm": 0.114659383893013, "learning_rate": 0.000136346840743934, "loss": 7.5382, "step": 5910 }, { "epoch": 0.7275972427375677, "grad_norm": 0.18499773740768433, "learning_rate": 0.00013628525680502526, "loss": 7.4047, "step": 5911 }, { "epoch": 0.7277203348104382, "grad_norm": 0.11310531944036484, "learning_rate": 0.00013622367286611654, "loss": 8.0147, "step": 5912 }, { 
"epoch": 0.7278434268833087, "grad_norm": 0.16399671137332916, "learning_rate": 0.0001361620889272078, "loss": 7.6607, "step": 5913 }, { "epoch": 0.7279665189561793, "grad_norm": 0.13540750741958618, "learning_rate": 0.00013610050498829906, "loss": 7.2634, "step": 5914 }, { "epoch": 0.7280896110290497, "grad_norm": 0.07550647854804993, "learning_rate": 0.00013603892104939032, "loss": 7.4847, "step": 5915 }, { "epoch": 0.7282127031019202, "grad_norm": 0.0988611951470375, "learning_rate": 0.0001359773371104816, "loss": 7.4525, "step": 5916 }, { "epoch": 0.7283357951747907, "grad_norm": 0.26913386583328247, "learning_rate": 0.00013591575317157287, "loss": 8.3071, "step": 5917 }, { "epoch": 0.7284588872476613, "grad_norm": 0.11265365034341812, "learning_rate": 0.00013585416923266413, "loss": 7.1944, "step": 5918 }, { "epoch": 0.7285819793205317, "grad_norm": 0.5082500576972961, "learning_rate": 0.00013579258529375541, "loss": 9.5486, "step": 5919 }, { "epoch": 0.7287050713934022, "grad_norm": 0.16681204736232758, "learning_rate": 0.00013573100135484667, "loss": 7.7886, "step": 5920 }, { "epoch": 0.7288281634662728, "grad_norm": 0.06544668972492218, "learning_rate": 0.0001356694174159379, "loss": 7.4595, "step": 5921 }, { "epoch": 0.7289512555391433, "grad_norm": 0.0763559639453888, "learning_rate": 0.0001356078334770292, "loss": 7.3777, "step": 5922 }, { "epoch": 0.7290743476120138, "grad_norm": 0.10266812890768051, "learning_rate": 0.00013554624953812045, "loss": 7.3617, "step": 5923 }, { "epoch": 0.7291974396848843, "grad_norm": 0.06982213258743286, "learning_rate": 0.0001354846655992117, "loss": 7.696, "step": 5924 }, { "epoch": 0.7293205317577548, "grad_norm": 0.10511001199483871, "learning_rate": 0.00013542308166030297, "loss": 7.9526, "step": 5925 }, { "epoch": 0.7294436238306253, "grad_norm": 0.11805301904678345, "learning_rate": 0.00013536149772139426, "loss": 7.7444, "step": 5926 }, { "epoch": 0.7295667159034959, "grad_norm": 0.18478089570999146, 
"learning_rate": 0.00013529991378248552, "loss": 7.9506, "step": 5927 }, { "epoch": 0.7296898079763663, "grad_norm": 0.12607762217521667, "learning_rate": 0.00013523832984357678, "loss": 7.4939, "step": 5928 }, { "epoch": 0.7298129000492368, "grad_norm": 0.08770152926445007, "learning_rate": 0.00013517674590466806, "loss": 7.6308, "step": 5929 }, { "epoch": 0.7299359921221074, "grad_norm": 0.2940041124820709, "learning_rate": 0.00013511516196575932, "loss": 9.2924, "step": 5930 }, { "epoch": 0.7300590841949779, "grad_norm": 0.09665121883153915, "learning_rate": 0.00013505357802685058, "loss": 7.9056, "step": 5931 }, { "epoch": 0.7301821762678483, "grad_norm": 0.36103227734565735, "learning_rate": 0.00013499199408794184, "loss": 9.2805, "step": 5932 }, { "epoch": 0.7303052683407188, "grad_norm": 0.10852985084056854, "learning_rate": 0.00013493041014903313, "loss": 8.0733, "step": 5933 }, { "epoch": 0.7304283604135894, "grad_norm": 0.17712990939617157, "learning_rate": 0.0001348688262101244, "loss": 7.3922, "step": 5934 }, { "epoch": 0.7305514524864599, "grad_norm": 0.18867206573486328, "learning_rate": 0.00013480724227121565, "loss": 7.3418, "step": 5935 }, { "epoch": 0.7306745445593303, "grad_norm": 0.07115432620048523, "learning_rate": 0.00013474565833230693, "loss": 7.9929, "step": 5936 }, { "epoch": 0.7307976366322009, "grad_norm": 0.12157709896564484, "learning_rate": 0.0001346840743933982, "loss": 8.3298, "step": 5937 }, { "epoch": 0.7309207287050714, "grad_norm": 0.1915845423936844, "learning_rate": 0.00013462249045448945, "loss": 8.9936, "step": 5938 }, { "epoch": 0.7310438207779419, "grad_norm": 0.07284951210021973, "learning_rate": 0.00013456090651558074, "loss": 7.7506, "step": 5939 }, { "epoch": 0.7311669128508124, "grad_norm": 0.07920480519533157, "learning_rate": 0.000134499322576672, "loss": 7.5992, "step": 5940 }, { "epoch": 0.7312900049236829, "grad_norm": 0.1063578873872757, "learning_rate": 0.00013443773863776326, "loss": 7.5311, "step": 5941 }, { 
"epoch": 0.7314130969965534, "grad_norm": 0.08372171968221664, "learning_rate": 0.00013437615469885452, "loss": 7.9059, "step": 5942 }, { "epoch": 0.731536189069424, "grad_norm": 0.13857771456241608, "learning_rate": 0.0001343145707599458, "loss": 8.2501, "step": 5943 }, { "epoch": 0.7316592811422944, "grad_norm": 0.09768909960985184, "learning_rate": 0.00013425298682103706, "loss": 7.8489, "step": 5944 }, { "epoch": 0.7317823732151649, "grad_norm": 0.06575645506381989, "learning_rate": 0.00013419140288212832, "loss": 7.7641, "step": 5945 }, { "epoch": 0.7319054652880355, "grad_norm": 0.09492092579603195, "learning_rate": 0.0001341298189432196, "loss": 8.2039, "step": 5946 }, { "epoch": 0.732028557360906, "grad_norm": 0.07945911586284637, "learning_rate": 0.00013406823500431087, "loss": 7.913, "step": 5947 }, { "epoch": 0.7321516494337764, "grad_norm": 0.17893274128437042, "learning_rate": 0.00013400665106540213, "loss": 8.8106, "step": 5948 }, { "epoch": 0.732274741506647, "grad_norm": 0.12045051902532578, "learning_rate": 0.00013394506712649342, "loss": 7.7323, "step": 5949 }, { "epoch": 0.7323978335795175, "grad_norm": 0.18312886357307434, "learning_rate": 0.00013388348318758468, "loss": 7.4049, "step": 5950 }, { "epoch": 0.732520925652388, "grad_norm": 0.11950137466192245, "learning_rate": 0.00013382189924867593, "loss": 7.611, "step": 5951 }, { "epoch": 0.7326440177252584, "grad_norm": 0.11084073781967163, "learning_rate": 0.0001337603153097672, "loss": 7.5283, "step": 5952 }, { "epoch": 0.732767109798129, "grad_norm": 0.08591354638338089, "learning_rate": 0.00013369873137085848, "loss": 7.4901, "step": 5953 }, { "epoch": 0.7328902018709995, "grad_norm": 0.0898159071803093, "learning_rate": 0.00013363714743194974, "loss": 7.723, "step": 5954 }, { "epoch": 0.73301329394387, "grad_norm": 0.06278842687606812, "learning_rate": 0.000133575563493041, "loss": 7.4851, "step": 5955 }, { "epoch": 0.7331363860167405, "grad_norm": 0.09942261129617691, "learning_rate": 
0.0001335139795541323, "loss": 7.5129, "step": 5956 }, { "epoch": 0.733259478089611, "grad_norm": 0.23933446407318115, "learning_rate": 0.00013345239561522355, "loss": 8.4958, "step": 5957 }, { "epoch": 0.7333825701624815, "grad_norm": 0.0980607345700264, "learning_rate": 0.0001333908116763148, "loss": 7.5603, "step": 5958 }, { "epoch": 0.7335056622353521, "grad_norm": 0.10929322987794876, "learning_rate": 0.00013332922773740607, "loss": 7.361, "step": 5959 }, { "epoch": 0.7336287543082225, "grad_norm": 0.214175283908844, "learning_rate": 0.00013326764379849735, "loss": 8.5234, "step": 5960 }, { "epoch": 0.733751846381093, "grad_norm": 0.1246916651725769, "learning_rate": 0.0001332060598595886, "loss": 7.4123, "step": 5961 }, { "epoch": 0.7338749384539636, "grad_norm": 0.10322580486536026, "learning_rate": 0.00013314447592067987, "loss": 7.5888, "step": 5962 }, { "epoch": 0.7339980305268341, "grad_norm": 0.13191184401512146, "learning_rate": 0.00013308289198177116, "loss": 7.2347, "step": 5963 }, { "epoch": 0.7341211225997046, "grad_norm": 0.08504802733659744, "learning_rate": 0.00013302130804286242, "loss": 7.7359, "step": 5964 }, { "epoch": 0.7342442146725751, "grad_norm": 0.07251454144716263, "learning_rate": 0.00013295972410395368, "loss": 7.7247, "step": 5965 }, { "epoch": 0.7343673067454456, "grad_norm": 0.05414232611656189, "learning_rate": 0.00013289814016504496, "loss": 7.615, "step": 5966 }, { "epoch": 0.7344903988183161, "grad_norm": 0.0854618027806282, "learning_rate": 0.00013283655622613622, "loss": 7.6498, "step": 5967 }, { "epoch": 0.7346134908911867, "grad_norm": 0.12631282210350037, "learning_rate": 0.00013277497228722748, "loss": 8.2297, "step": 5968 }, { "epoch": 0.7347365829640571, "grad_norm": 0.07563696056604385, "learning_rate": 0.00013271338834831874, "loss": 7.6963, "step": 5969 }, { "epoch": 0.7348596750369276, "grad_norm": 0.07189536839723587, "learning_rate": 0.00013265180440941003, "loss": 7.8334, "step": 5970 }, { "epoch": 
0.7349827671097982, "grad_norm": 0.15646560490131378, "learning_rate": 0.0001325902204705013, "loss": 8.3682, "step": 5971 }, { "epoch": 0.7351058591826687, "grad_norm": 0.12279963493347168, "learning_rate": 0.00013252863653159255, "loss": 7.2949, "step": 5972 }, { "epoch": 0.7352289512555391, "grad_norm": 0.10694431513547897, "learning_rate": 0.00013246705259268383, "loss": 7.4827, "step": 5973 }, { "epoch": 0.7353520433284096, "grad_norm": 0.1141417995095253, "learning_rate": 0.0001324054686537751, "loss": 7.767, "step": 5974 }, { "epoch": 0.7354751354012802, "grad_norm": 0.11803141981363297, "learning_rate": 0.00013234388471486635, "loss": 7.8714, "step": 5975 }, { "epoch": 0.7355982274741507, "grad_norm": 0.07090678811073303, "learning_rate": 0.00013228230077595764, "loss": 7.5014, "step": 5976 }, { "epoch": 0.7357213195470211, "grad_norm": 0.11155740916728973, "learning_rate": 0.0001322207168370489, "loss": 7.791, "step": 5977 }, { "epoch": 0.7358444116198917, "grad_norm": 0.09319479763507843, "learning_rate": 0.00013215913289814016, "loss": 7.7859, "step": 5978 }, { "epoch": 0.7359675036927622, "grad_norm": 0.10084939748048782, "learning_rate": 0.00013209754895923142, "loss": 7.904, "step": 5979 }, { "epoch": 0.7360905957656327, "grad_norm": 0.1609310656785965, "learning_rate": 0.0001320359650203227, "loss": 8.2074, "step": 5980 }, { "epoch": 0.7362136878385032, "grad_norm": 0.09224618226289749, "learning_rate": 0.00013197438108141396, "loss": 7.3973, "step": 5981 }, { "epoch": 0.7363367799113737, "grad_norm": 0.11694218963384628, "learning_rate": 0.00013191279714250522, "loss": 7.9595, "step": 5982 }, { "epoch": 0.7364598719842442, "grad_norm": 0.06626463681459427, "learning_rate": 0.0001318512132035965, "loss": 7.6986, "step": 5983 }, { "epoch": 0.7365829640571148, "grad_norm": 0.07497458904981613, "learning_rate": 0.00013178962926468777, "loss": 7.7685, "step": 5984 }, { "epoch": 0.7367060561299852, "grad_norm": 0.1465987116098404, "learning_rate": 
0.00013172804532577903, "loss": 7.9749, "step": 5985 }, { "epoch": 0.7368291482028557, "grad_norm": 0.18348988890647888, "learning_rate": 0.0001316664613868703, "loss": 7.2507, "step": 5986 }, { "epoch": 0.7369522402757263, "grad_norm": 0.1124839037656784, "learning_rate": 0.00013160487744796157, "loss": 7.4099, "step": 5987 }, { "epoch": 0.7370753323485968, "grad_norm": 0.13944509625434875, "learning_rate": 0.00013154329350905283, "loss": 8.1379, "step": 5988 }, { "epoch": 0.7371984244214672, "grad_norm": 0.12649118900299072, "learning_rate": 0.0001314817095701441, "loss": 7.7294, "step": 5989 }, { "epoch": 0.7373215164943377, "grad_norm": 0.09019969403743744, "learning_rate": 0.00013142012563123538, "loss": 7.5054, "step": 5990 }, { "epoch": 0.7374446085672083, "grad_norm": 0.08064274489879608, "learning_rate": 0.00013135854169232664, "loss": 7.5464, "step": 5991 }, { "epoch": 0.7375677006400788, "grad_norm": 0.10310492664575577, "learning_rate": 0.0001312969577534179, "loss": 7.2193, "step": 5992 }, { "epoch": 0.7376907927129492, "grad_norm": 0.22381632030010223, "learning_rate": 0.00013123537381450919, "loss": 8.2916, "step": 5993 }, { "epoch": 0.7378138847858198, "grad_norm": 0.094460628926754, "learning_rate": 0.00013117378987560045, "loss": 7.4237, "step": 5994 }, { "epoch": 0.7379369768586903, "grad_norm": 0.06521770358085632, "learning_rate": 0.0001311122059366917, "loss": 7.5185, "step": 5995 }, { "epoch": 0.7380600689315608, "grad_norm": 0.06739429384469986, "learning_rate": 0.00013105062199778296, "loss": 7.5079, "step": 5996 }, { "epoch": 0.7381831610044313, "grad_norm": 0.45809462666511536, "learning_rate": 0.00013098903805887425, "loss": 9.8745, "step": 5997 }, { "epoch": 0.7383062530773018, "grad_norm": 0.07635119557380676, "learning_rate": 0.0001309274541199655, "loss": 7.398, "step": 5998 }, { "epoch": 0.7384293451501723, "grad_norm": 0.0873190313577652, "learning_rate": 0.00013086587018105677, "loss": 7.9515, "step": 5999 }, { "epoch": 
0.7385524372230429, "grad_norm": 0.07999599725008011, "learning_rate": 0.00013080428624214806, "loss": 7.6003, "step": 6000 }, { "epoch": 0.7386755292959133, "grad_norm": 0.09862067550420761, "learning_rate": 0.00013074270230323932, "loss": 7.4787, "step": 6001 }, { "epoch": 0.7387986213687838, "grad_norm": 0.08531227707862854, "learning_rate": 0.00013068111836433058, "loss": 7.6867, "step": 6002 }, { "epoch": 0.7389217134416544, "grad_norm": 0.11700975149869919, "learning_rate": 0.00013061953442542186, "loss": 7.3211, "step": 6003 }, { "epoch": 0.7390448055145249, "grad_norm": 0.09785497188568115, "learning_rate": 0.00013055795048651312, "loss": 8.1809, "step": 6004 }, { "epoch": 0.7391678975873953, "grad_norm": 0.10937713086605072, "learning_rate": 0.00013049636654760438, "loss": 7.335, "step": 6005 }, { "epoch": 0.7392909896602659, "grad_norm": 0.09144663065671921, "learning_rate": 0.00013043478260869564, "loss": 7.405, "step": 6006 }, { "epoch": 0.7394140817331364, "grad_norm": 0.07428739964962006, "learning_rate": 0.00013037319866978693, "loss": 7.4925, "step": 6007 }, { "epoch": 0.7395371738060069, "grad_norm": 0.10085542500019073, "learning_rate": 0.0001303116147308782, "loss": 7.2603, "step": 6008 }, { "epoch": 0.7396602658788775, "grad_norm": 0.05335478112101555, "learning_rate": 0.00013025003079196945, "loss": 7.5194, "step": 6009 }, { "epoch": 0.7397833579517479, "grad_norm": 0.07145234197378159, "learning_rate": 0.00013018844685306073, "loss": 7.3349, "step": 6010 }, { "epoch": 0.7399064500246184, "grad_norm": 0.16286250948905945, "learning_rate": 0.000130126862914152, "loss": 7.6985, "step": 6011 }, { "epoch": 0.740029542097489, "grad_norm": 0.11473517119884491, "learning_rate": 0.00013006527897524325, "loss": 7.7011, "step": 6012 }, { "epoch": 0.7401526341703595, "grad_norm": 0.07370945811271667, "learning_rate": 0.0001300036950363345, "loss": 7.2507, "step": 6013 }, { "epoch": 0.7402757262432299, "grad_norm": 0.08957700431346893, "learning_rate": 
0.0001299421110974258, "loss": 7.6176, "step": 6014 }, { "epoch": 0.7403988183161004, "grad_norm": 0.1547587811946869, "learning_rate": 0.00012988052715851706, "loss": 7.9434, "step": 6015 }, { "epoch": 0.740521910388971, "grad_norm": 0.07492264360189438, "learning_rate": 0.00012981894321960832, "loss": 7.8645, "step": 6016 }, { "epoch": 0.7406450024618415, "grad_norm": 0.10484401881694794, "learning_rate": 0.0001297573592806996, "loss": 8.2851, "step": 6017 }, { "epoch": 0.7407680945347119, "grad_norm": 0.10282309353351593, "learning_rate": 0.00012969577534179086, "loss": 7.6977, "step": 6018 }, { "epoch": 0.7408911866075825, "grad_norm": 0.10613498836755753, "learning_rate": 0.00012963419140288212, "loss": 8.324, "step": 6019 }, { "epoch": 0.741014278680453, "grad_norm": 0.1726098209619522, "learning_rate": 0.0001295726074639734, "loss": 7.4793, "step": 6020 }, { "epoch": 0.7411373707533235, "grad_norm": 0.17896366119384766, "learning_rate": 0.00012951102352506467, "loss": 7.3335, "step": 6021 }, { "epoch": 0.741260462826194, "grad_norm": 0.14867104589939117, "learning_rate": 0.00012944943958615593, "loss": 7.4203, "step": 6022 }, { "epoch": 0.7413835548990645, "grad_norm": 0.06921887397766113, "learning_rate": 0.0001293878556472472, "loss": 8.1317, "step": 6023 }, { "epoch": 0.741506646971935, "grad_norm": 0.17486195266246796, "learning_rate": 0.00012932627170833847, "loss": 7.2358, "step": 6024 }, { "epoch": 0.7416297390448056, "grad_norm": 0.07746758311986923, "learning_rate": 0.00012926468776942973, "loss": 7.4828, "step": 6025 }, { "epoch": 0.741752831117676, "grad_norm": 0.15886208415031433, "learning_rate": 0.000129203103830521, "loss": 7.6793, "step": 6026 }, { "epoch": 0.7418759231905465, "grad_norm": 0.08462180197238922, "learning_rate": 0.00012914151989161228, "loss": 7.7271, "step": 6027 }, { "epoch": 0.741999015263417, "grad_norm": 0.21485787630081177, "learning_rate": 0.00012907993595270354, "loss": 8.3016, "step": 6028 }, { "epoch": 
0.7421221073362876, "grad_norm": 0.06700804084539413, "learning_rate": 0.0001290183520137948, "loss": 7.3714, "step": 6029 }, { "epoch": 0.742245199409158, "grad_norm": 0.1400560438632965, "learning_rate": 0.00012895676807488606, "loss": 7.6929, "step": 6030 }, { "epoch": 0.7423682914820285, "grad_norm": 0.09441197663545609, "learning_rate": 0.00012889518413597734, "loss": 7.4328, "step": 6031 }, { "epoch": 0.7424913835548991, "grad_norm": 0.08810964971780777, "learning_rate": 0.0001288336001970686, "loss": 7.5437, "step": 6032 }, { "epoch": 0.7426144756277696, "grad_norm": 0.14580681920051575, "learning_rate": 0.00012877201625815986, "loss": 7.9996, "step": 6033 }, { "epoch": 0.74273756770064, "grad_norm": 0.10302887111902237, "learning_rate": 0.00012871043231925115, "loss": 7.9361, "step": 6034 }, { "epoch": 0.7428606597735106, "grad_norm": 0.12457828968763351, "learning_rate": 0.0001286488483803424, "loss": 8.458, "step": 6035 }, { "epoch": 0.7429837518463811, "grad_norm": 0.11837948858737946, "learning_rate": 0.00012858726444143367, "loss": 7.5577, "step": 6036 }, { "epoch": 0.7431068439192516, "grad_norm": 0.18675610423088074, "learning_rate": 0.00012852568050252496, "loss": 7.8981, "step": 6037 }, { "epoch": 0.7432299359921221, "grad_norm": 0.1854451596736908, "learning_rate": 0.00012846409656361622, "loss": 7.3798, "step": 6038 }, { "epoch": 0.7433530280649926, "grad_norm": 0.1576870232820511, "learning_rate": 0.00012840251262470748, "loss": 7.5011, "step": 6039 }, { "epoch": 0.7434761201378631, "grad_norm": 0.08327517658472061, "learning_rate": 0.00012834092868579873, "loss": 8.0239, "step": 6040 }, { "epoch": 0.7435992122107337, "grad_norm": 0.1356198787689209, "learning_rate": 0.00012827934474689002, "loss": 7.5039, "step": 6041 }, { "epoch": 0.7437223042836041, "grad_norm": 0.14625807106494904, "learning_rate": 0.00012821776080798128, "loss": 7.3624, "step": 6042 }, { "epoch": 0.7438453963564746, "grad_norm": 0.0624777227640152, "learning_rate": 
0.00012815617686907254, "loss": 7.6745, "step": 6043 }, { "epoch": 0.7439684884293452, "grad_norm": 0.10723382234573364, "learning_rate": 0.00012809459293016383, "loss": 7.0703, "step": 6044 }, { "epoch": 0.7440915805022157, "grad_norm": 0.10210969299077988, "learning_rate": 0.00012803300899125509, "loss": 7.6779, "step": 6045 }, { "epoch": 0.7442146725750861, "grad_norm": 0.2593957185745239, "learning_rate": 0.00012797142505234635, "loss": 8.3894, "step": 6046 }, { "epoch": 0.7443377646479566, "grad_norm": 0.12226907163858414, "learning_rate": 0.00012790984111343763, "loss": 7.5516, "step": 6047 }, { "epoch": 0.7444608567208272, "grad_norm": 0.09205161780118942, "learning_rate": 0.0001278482571745289, "loss": 7.4816, "step": 6048 }, { "epoch": 0.7445839487936977, "grad_norm": 0.06277573108673096, "learning_rate": 0.00012778667323562015, "loss": 7.3925, "step": 6049 }, { "epoch": 0.7447070408665682, "grad_norm": 0.290622353553772, "learning_rate": 0.0001277250892967114, "loss": 8.4099, "step": 6050 }, { "epoch": 0.7448301329394387, "grad_norm": 0.10469929873943329, "learning_rate": 0.0001276635053578027, "loss": 7.0702, "step": 6051 }, { "epoch": 0.7449532250123092, "grad_norm": 0.06485572457313538, "learning_rate": 0.00012760192141889396, "loss": 7.4952, "step": 6052 }, { "epoch": 0.7450763170851797, "grad_norm": 0.10078535974025726, "learning_rate": 0.00012754033747998522, "loss": 7.8698, "step": 6053 }, { "epoch": 0.7451994091580503, "grad_norm": 0.07106305658817291, "learning_rate": 0.0001274787535410765, "loss": 7.7779, "step": 6054 }, { "epoch": 0.7453225012309207, "grad_norm": 0.08396674692630768, "learning_rate": 0.00012741716960216776, "loss": 7.5192, "step": 6055 }, { "epoch": 0.7454455933037912, "grad_norm": 0.20724503695964813, "learning_rate": 0.00012735558566325902, "loss": 8.2954, "step": 6056 }, { "epoch": 0.7455686853766618, "grad_norm": 0.14314618706703186, "learning_rate": 0.00012729400172435028, "loss": 8.2305, "step": 6057 }, { "epoch": 
0.7456917774495323, "grad_norm": 0.09651673585176468, "learning_rate": 0.00012723241778544157, "loss": 7.9872, "step": 6058 }, { "epoch": 0.7458148695224027, "grad_norm": 0.17276787757873535, "learning_rate": 0.00012717083384653283, "loss": 7.4326, "step": 6059 }, { "epoch": 0.7459379615952733, "grad_norm": 0.12090577930212021, "learning_rate": 0.0001271092499076241, "loss": 8.1161, "step": 6060 }, { "epoch": 0.7460610536681438, "grad_norm": 0.32146528363227844, "learning_rate": 0.00012704766596871537, "loss": 7.3862, "step": 6061 }, { "epoch": 0.7461841457410143, "grad_norm": 0.12091390788555145, "learning_rate": 0.00012698608202980663, "loss": 8.6506, "step": 6062 }, { "epoch": 0.7463072378138847, "grad_norm": 0.08170930296182632, "learning_rate": 0.0001269244980908979, "loss": 8.0364, "step": 6063 }, { "epoch": 0.7464303298867553, "grad_norm": 0.09284225106239319, "learning_rate": 0.00012686291415198918, "loss": 7.7596, "step": 6064 }, { "epoch": 0.7465534219596258, "grad_norm": 0.18076077103614807, "learning_rate": 0.00012680133021308044, "loss": 7.3741, "step": 6065 }, { "epoch": 0.7466765140324964, "grad_norm": 0.08214829117059708, "learning_rate": 0.0001267397462741717, "loss": 7.9162, "step": 6066 }, { "epoch": 0.7467996061053668, "grad_norm": 0.17458531260490417, "learning_rate": 0.00012667816233526296, "loss": 7.4035, "step": 6067 }, { "epoch": 0.7469226981782373, "grad_norm": 0.13149745762348175, "learning_rate": 0.00012661657839635424, "loss": 7.4702, "step": 6068 }, { "epoch": 0.7470457902511078, "grad_norm": 0.09290321171283722, "learning_rate": 0.0001265549944574455, "loss": 7.3934, "step": 6069 }, { "epoch": 0.7471688823239784, "grad_norm": 0.11032566428184509, "learning_rate": 0.00012649341051853676, "loss": 7.6595, "step": 6070 }, { "epoch": 0.7472919743968488, "grad_norm": 0.13259749114513397, "learning_rate": 0.00012643182657962805, "loss": 7.5948, "step": 6071 }, { "epoch": 0.7474150664697193, "grad_norm": 0.09986688196659088, "learning_rate": 
0.0001263702426407193, "loss": 7.7263, "step": 6072 }, { "epoch": 0.7475381585425899, "grad_norm": 0.07822816073894501, "learning_rate": 0.00012630865870181057, "loss": 7.3884, "step": 6073 }, { "epoch": 0.7476612506154604, "grad_norm": 0.09958466142416, "learning_rate": 0.00012624707476290186, "loss": 7.4174, "step": 6074 }, { "epoch": 0.7477843426883308, "grad_norm": 0.08809134364128113, "learning_rate": 0.00012618549082399312, "loss": 7.7149, "step": 6075 }, { "epoch": 0.7479074347612014, "grad_norm": 0.08948211371898651, "learning_rate": 0.00012612390688508437, "loss": 7.6964, "step": 6076 }, { "epoch": 0.7480305268340719, "grad_norm": 0.060860682278871536, "learning_rate": 0.00012606232294617563, "loss": 7.4879, "step": 6077 }, { "epoch": 0.7481536189069424, "grad_norm": 0.0676308125257492, "learning_rate": 0.00012600073900726692, "loss": 7.5439, "step": 6078 }, { "epoch": 0.7482767109798129, "grad_norm": 0.08690628409385681, "learning_rate": 0.00012593915506835818, "loss": 7.779, "step": 6079 }, { "epoch": 0.7483998030526834, "grad_norm": 0.11183514446020126, "learning_rate": 0.00012587757112944944, "loss": 8.049, "step": 6080 }, { "epoch": 0.7485228951255539, "grad_norm": 0.16698765754699707, "learning_rate": 0.00012581598719054073, "loss": 7.5518, "step": 6081 }, { "epoch": 0.7486459871984245, "grad_norm": 0.07780029624700546, "learning_rate": 0.00012575440325163199, "loss": 7.7609, "step": 6082 }, { "epoch": 0.7487690792712949, "grad_norm": 0.11227580904960632, "learning_rate": 0.00012569281931272325, "loss": 7.4859, "step": 6083 }, { "epoch": 0.7488921713441654, "grad_norm": 0.17825837433338165, "learning_rate": 0.0001256312353738145, "loss": 7.2676, "step": 6084 }, { "epoch": 0.749015263417036, "grad_norm": 0.1476118415594101, "learning_rate": 0.0001255696514349058, "loss": 8.1475, "step": 6085 }, { "epoch": 0.7491383554899065, "grad_norm": 0.10965501517057419, "learning_rate": 0.00012550806749599705, "loss": 7.434, "step": 6086 }, { "epoch": 
0.7492614475627769, "grad_norm": 0.06594327092170715, "learning_rate": 0.0001254464835570883, "loss": 7.4097, "step": 6087 }, { "epoch": 0.7493845396356474, "grad_norm": 0.09705198556184769, "learning_rate": 0.0001253848996181796, "loss": 7.8273, "step": 6088 }, { "epoch": 0.749507631708518, "grad_norm": 0.08808537572622299, "learning_rate": 0.00012532331567927086, "loss": 7.5972, "step": 6089 }, { "epoch": 0.7496307237813885, "grad_norm": 0.10953368246555328, "learning_rate": 0.00012526173174036212, "loss": 7.7572, "step": 6090 }, { "epoch": 0.7497538158542589, "grad_norm": 0.06831230968236923, "learning_rate": 0.0001252001478014534, "loss": 7.2696, "step": 6091 }, { "epoch": 0.7498769079271295, "grad_norm": 0.05340971052646637, "learning_rate": 0.00012513856386254466, "loss": 7.3166, "step": 6092 }, { "epoch": 0.75, "grad_norm": 0.07074418663978577, "learning_rate": 0.00012507697992363592, "loss": 7.4158, "step": 6093 }, { "epoch": 0.7501230920728705, "grad_norm": 0.07968258112668991, "learning_rate": 0.00012501539598472718, "loss": 7.5027, "step": 6094 }, { "epoch": 0.7502461841457411, "grad_norm": 0.11624249070882797, "learning_rate": 0.00012495381204581844, "loss": 7.3945, "step": 6095 }, { "epoch": 0.7503692762186115, "grad_norm": 0.09594614803791046, "learning_rate": 0.00012489222810690973, "loss": 7.2913, "step": 6096 }, { "epoch": 0.750492368291482, "grad_norm": 0.16686893999576569, "learning_rate": 0.000124830644168001, "loss": 8.2146, "step": 6097 }, { "epoch": 0.7506154603643526, "grad_norm": 0.1164458766579628, "learning_rate": 0.00012476906022909225, "loss": 7.96, "step": 6098 }, { "epoch": 0.7507385524372231, "grad_norm": 0.08826150000095367, "learning_rate": 0.0001247074762901835, "loss": 7.8381, "step": 6099 }, { "epoch": 0.7508616445100935, "grad_norm": 0.0774024948477745, "learning_rate": 0.0001246458923512748, "loss": 7.6592, "step": 6100 }, { "epoch": 0.750984736582964, "grad_norm": 0.18070173263549805, "learning_rate": 0.00012458430841236605, 
"loss": 7.6153, "step": 6101 }, { "epoch": 0.7511078286558346, "grad_norm": 0.07239386439323425, "learning_rate": 0.0001245227244734573, "loss": 8.1379, "step": 6102 }, { "epoch": 0.7512309207287051, "grad_norm": 0.134621262550354, "learning_rate": 0.0001244611405345486, "loss": 8.3644, "step": 6103 }, { "epoch": 0.7513540128015755, "grad_norm": 0.14923641085624695, "learning_rate": 0.00012439955659563986, "loss": 8.8185, "step": 6104 }, { "epoch": 0.7514771048744461, "grad_norm": 0.10068149864673615, "learning_rate": 0.00012433797265673112, "loss": 7.693, "step": 6105 }, { "epoch": 0.7516001969473166, "grad_norm": 0.15410193800926208, "learning_rate": 0.00012427638871782238, "loss": 7.4155, "step": 6106 }, { "epoch": 0.7517232890201871, "grad_norm": 0.10812092572450638, "learning_rate": 0.00012421480477891366, "loss": 7.4719, "step": 6107 }, { "epoch": 0.7518463810930576, "grad_norm": 0.13569319248199463, "learning_rate": 0.00012415322084000492, "loss": 7.7412, "step": 6108 }, { "epoch": 0.7519694731659281, "grad_norm": 0.12880893051624298, "learning_rate": 0.00012409163690109618, "loss": 7.6324, "step": 6109 }, { "epoch": 0.7520925652387986, "grad_norm": 0.096883624792099, "learning_rate": 0.00012403005296218747, "loss": 7.4921, "step": 6110 }, { "epoch": 0.7522156573116692, "grad_norm": 0.20717869699001312, "learning_rate": 0.00012396846902327873, "loss": 8.5346, "step": 6111 }, { "epoch": 0.7523387493845396, "grad_norm": 0.15632161498069763, "learning_rate": 0.00012390688508437, "loss": 7.1551, "step": 6112 }, { "epoch": 0.7524618414574101, "grad_norm": 0.15369145572185516, "learning_rate": 0.00012384530114546127, "loss": 7.3531, "step": 6113 }, { "epoch": 0.7525849335302807, "grad_norm": 0.13254238665103912, "learning_rate": 0.00012378371720655253, "loss": 7.6619, "step": 6114 }, { "epoch": 0.7527080256031512, "grad_norm": 0.09650858491659164, "learning_rate": 0.0001237221332676438, "loss": 7.5438, "step": 6115 }, { "epoch": 0.7528311176760216, "grad_norm": 
0.11870119720697403, "learning_rate": 0.00012366054932873505, "loss": 7.4954, "step": 6116 }, { "epoch": 0.7529542097488922, "grad_norm": 0.07966520637273788, "learning_rate": 0.00012359896538982634, "loss": 7.6083, "step": 6117 }, { "epoch": 0.7530773018217627, "grad_norm": 0.11356039345264435, "learning_rate": 0.0001235373814509176, "loss": 7.6366, "step": 6118 }, { "epoch": 0.7532003938946332, "grad_norm": 0.09163635224103928, "learning_rate": 0.00012347579751200886, "loss": 7.5683, "step": 6119 }, { "epoch": 0.7533234859675036, "grad_norm": 0.08213206380605698, "learning_rate": 0.00012341421357310014, "loss": 7.9222, "step": 6120 }, { "epoch": 0.7534465780403742, "grad_norm": 0.17497289180755615, "learning_rate": 0.0001233526296341914, "loss": 7.1555, "step": 6121 }, { "epoch": 0.7535696701132447, "grad_norm": 0.08795753866434097, "learning_rate": 0.00012329104569528266, "loss": 7.306, "step": 6122 }, { "epoch": 0.7536927621861153, "grad_norm": 0.11369410902261734, "learning_rate": 0.00012322946175637395, "loss": 7.421, "step": 6123 }, { "epoch": 0.7538158542589857, "grad_norm": 0.07807747274637222, "learning_rate": 0.0001231678778174652, "loss": 7.3451, "step": 6124 }, { "epoch": 0.7539389463318562, "grad_norm": 0.07044688612222672, "learning_rate": 0.00012310629387855647, "loss": 7.3868, "step": 6125 }, { "epoch": 0.7540620384047267, "grad_norm": 0.08431567996740341, "learning_rate": 0.00012304470993964773, "loss": 7.8355, "step": 6126 }, { "epoch": 0.7541851304775973, "grad_norm": 0.1031547337770462, "learning_rate": 0.00012298312600073902, "loss": 7.5073, "step": 6127 }, { "epoch": 0.7543082225504677, "grad_norm": 0.08572526276111603, "learning_rate": 0.00012292154206183027, "loss": 7.3014, "step": 6128 }, { "epoch": 0.7544313146233382, "grad_norm": 0.13223078846931458, "learning_rate": 0.00012285995812292153, "loss": 7.6267, "step": 6129 }, { "epoch": 0.7545544066962088, "grad_norm": 0.13448436558246613, "learning_rate": 0.00012279837418401282, "loss": 
7.3981, "step": 6130 }, { "epoch": 0.7546774987690793, "grad_norm": 0.061762429773807526, "learning_rate": 0.00012273679024510408, "loss": 7.2017, "step": 6131 }, { "epoch": 0.7548005908419497, "grad_norm": 0.07206732034683228, "learning_rate": 0.00012267520630619534, "loss": 7.5562, "step": 6132 }, { "epoch": 0.7549236829148203, "grad_norm": 0.0687856376171112, "learning_rate": 0.0001226136223672866, "loss": 7.5513, "step": 6133 }, { "epoch": 0.7550467749876908, "grad_norm": 0.07884832471609116, "learning_rate": 0.00012255203842837789, "loss": 7.2989, "step": 6134 }, { "epoch": 0.7551698670605613, "grad_norm": 0.10229744017124176, "learning_rate": 0.00012249045448946915, "loss": 7.4503, "step": 6135 }, { "epoch": 0.7552929591334318, "grad_norm": 0.11557544767856598, "learning_rate": 0.0001224288705505604, "loss": 8.117, "step": 6136 }, { "epoch": 0.7554160512063023, "grad_norm": 0.09283354133367538, "learning_rate": 0.0001223672866116517, "loss": 7.3934, "step": 6137 }, { "epoch": 0.7555391432791728, "grad_norm": 0.07586374878883362, "learning_rate": 0.00012230570267274295, "loss": 7.5658, "step": 6138 }, { "epoch": 0.7556622353520434, "grad_norm": 0.12923890352249146, "learning_rate": 0.0001222441187338342, "loss": 7.7961, "step": 6139 }, { "epoch": 0.7557853274249139, "grad_norm": 0.3579314649105072, "learning_rate": 0.0001221825347949255, "loss": 9.4591, "step": 6140 }, { "epoch": 0.7559084194977843, "grad_norm": 0.06911873817443848, "learning_rate": 0.00012212095085601676, "loss": 7.6989, "step": 6141 }, { "epoch": 0.7560315115706548, "grad_norm": 0.11577319353818893, "learning_rate": 0.00012205936691710802, "loss": 7.4191, "step": 6142 }, { "epoch": 0.7561546036435254, "grad_norm": 0.08488961309194565, "learning_rate": 0.00012199778297819929, "loss": 7.37, "step": 6143 }, { "epoch": 0.7562776957163959, "grad_norm": 0.062994584441185, "learning_rate": 0.00012193619903929055, "loss": 7.6815, "step": 6144 }, { "epoch": 0.7564007877892663, "grad_norm": 
0.09433434158563614, "learning_rate": 0.00012187461510038182, "loss": 7.8071, "step": 6145 }, { "epoch": 0.7565238798621369, "grad_norm": 0.1224590465426445, "learning_rate": 0.0001218130311614731, "loss": 7.7699, "step": 6146 }, { "epoch": 0.7566469719350074, "grad_norm": 0.053146544843912125, "learning_rate": 0.00012175144722256435, "loss": 7.5632, "step": 6147 }, { "epoch": 0.7567700640078779, "grad_norm": 0.14051975309848785, "learning_rate": 0.00012168986328365563, "loss": 8.182, "step": 6148 }, { "epoch": 0.7568931560807484, "grad_norm": 0.1288372129201889, "learning_rate": 0.00012162827934474689, "loss": 7.288, "step": 6149 }, { "epoch": 0.7570162481536189, "grad_norm": 0.06927980482578278, "learning_rate": 0.00012156669540583816, "loss": 7.4455, "step": 6150 }, { "epoch": 0.7571393402264894, "grad_norm": 0.23711957037448883, "learning_rate": 0.00012150511146692943, "loss": 8.5658, "step": 6151 }, { "epoch": 0.75726243229936, "grad_norm": 0.07139833271503448, "learning_rate": 0.00012144352752802069, "loss": 7.535, "step": 6152 }, { "epoch": 0.7573855243722304, "grad_norm": 0.1225244328379631, "learning_rate": 0.00012138194358911197, "loss": 7.5651, "step": 6153 }, { "epoch": 0.7575086164451009, "grad_norm": 0.12405409663915634, "learning_rate": 0.00012132035965020322, "loss": 7.3824, "step": 6154 }, { "epoch": 0.7576317085179715, "grad_norm": 0.06859607249498367, "learning_rate": 0.0001212587757112945, "loss": 7.8704, "step": 6155 }, { "epoch": 0.757754800590842, "grad_norm": 0.1153484433889389, "learning_rate": 0.00012119719177238577, "loss": 8.0586, "step": 6156 }, { "epoch": 0.7578778926637124, "grad_norm": 0.08202271908521652, "learning_rate": 0.00012113560783347703, "loss": 7.8666, "step": 6157 }, { "epoch": 0.758000984736583, "grad_norm": 0.07911402732133865, "learning_rate": 0.0001210740238945683, "loss": 7.5915, "step": 6158 }, { "epoch": 0.7581240768094535, "grad_norm": 0.1361684501171112, "learning_rate": 0.00012101243995565956, "loss": 7.3673, 
"step": 6159 }, { "epoch": 0.758247168882324, "grad_norm": 0.09691924601793289, "learning_rate": 0.00012095085601675084, "loss": 7.5145, "step": 6160 }, { "epoch": 0.7583702609551944, "grad_norm": 0.09705407172441483, "learning_rate": 0.0001208892720778421, "loss": 7.4544, "step": 6161 }, { "epoch": 0.758493353028065, "grad_norm": 0.08960320800542831, "learning_rate": 0.00012082768813893337, "loss": 7.7861, "step": 6162 }, { "epoch": 0.7586164451009355, "grad_norm": 0.14768488705158234, "learning_rate": 0.00012076610420002464, "loss": 7.7678, "step": 6163 }, { "epoch": 0.758739537173806, "grad_norm": 0.14170072972774506, "learning_rate": 0.0001207045202611159, "loss": 7.6525, "step": 6164 }, { "epoch": 0.7588626292466765, "grad_norm": 0.0745459794998169, "learning_rate": 0.00012064293632220717, "loss": 7.5719, "step": 6165 }, { "epoch": 0.758985721319547, "grad_norm": 0.07919031381607056, "learning_rate": 0.00012058135238329843, "loss": 7.3937, "step": 6166 }, { "epoch": 0.7591088133924175, "grad_norm": 0.09198371320962906, "learning_rate": 0.0001205197684443897, "loss": 7.4214, "step": 6167 }, { "epoch": 0.7592319054652881, "grad_norm": 0.23017355799674988, "learning_rate": 0.00012045818450548098, "loss": 8.8162, "step": 6168 }, { "epoch": 0.7593549975381585, "grad_norm": 0.5854900479316711, "learning_rate": 0.00012039660056657224, "loss": 6.5582, "step": 6169 }, { "epoch": 0.759478089611029, "grad_norm": 0.14251349866390228, "learning_rate": 0.00012033501662766351, "loss": 7.7842, "step": 6170 }, { "epoch": 0.7596011816838996, "grad_norm": 0.11210445314645767, "learning_rate": 0.00012027343268875477, "loss": 7.225, "step": 6171 }, { "epoch": 0.7597242737567701, "grad_norm": 0.11206794530153275, "learning_rate": 0.00012021184874984604, "loss": 7.6478, "step": 6172 }, { "epoch": 0.7598473658296405, "grad_norm": 0.06974564492702484, "learning_rate": 0.00012015026481093732, "loss": 7.4522, "step": 6173 }, { "epoch": 0.759970457902511, "grad_norm": 
0.14509309828281403, "learning_rate": 0.00012008868087202858, "loss": 7.2161, "step": 6174 }, { "epoch": 0.7600935499753816, "grad_norm": 0.12032464891672134, "learning_rate": 0.00012002709693311985, "loss": 8.1797, "step": 6175 }, { "epoch": 0.7602166420482521, "grad_norm": 0.10689318180084229, "learning_rate": 0.00011996551299421111, "loss": 7.2469, "step": 6176 }, { "epoch": 0.7603397341211225, "grad_norm": 0.12557609379291534, "learning_rate": 0.00011990392905530238, "loss": 7.2137, "step": 6177 }, { "epoch": 0.7604628261939931, "grad_norm": 0.36680179834365845, "learning_rate": 0.00011984234511639366, "loss": 8.9876, "step": 6178 }, { "epoch": 0.7605859182668636, "grad_norm": 0.09940292686223984, "learning_rate": 0.00011978076117748492, "loss": 7.5479, "step": 6179 }, { "epoch": 0.7607090103397341, "grad_norm": 0.1546923816204071, "learning_rate": 0.00011971917723857619, "loss": 7.8357, "step": 6180 }, { "epoch": 0.7608321024126047, "grad_norm": 0.21914361417293549, "learning_rate": 0.00011965759329966745, "loss": 8.7945, "step": 6181 }, { "epoch": 0.7609551944854751, "grad_norm": 0.12831725180149078, "learning_rate": 0.00011959600936075872, "loss": 7.6753, "step": 6182 }, { "epoch": 0.7610782865583456, "grad_norm": 0.2562469244003296, "learning_rate": 0.00011953442542185, "loss": 8.4644, "step": 6183 }, { "epoch": 0.7612013786312162, "grad_norm": 0.4623212516307831, "learning_rate": 0.00011947284148294125, "loss": 9.5815, "step": 6184 }, { "epoch": 0.7613244707040867, "grad_norm": 0.1143864244222641, "learning_rate": 0.00011941125754403253, "loss": 7.9662, "step": 6185 }, { "epoch": 0.7614475627769571, "grad_norm": 0.11175551265478134, "learning_rate": 0.00011934967360512379, "loss": 8.1598, "step": 6186 }, { "epoch": 0.7615706548498277, "grad_norm": 0.1611173450946808, "learning_rate": 0.00011928808966621506, "loss": 7.3531, "step": 6187 }, { "epoch": 0.7616937469226982, "grad_norm": 0.1358804553747177, "learning_rate": 0.00011922650572730632, "loss": 
7.7083, "step": 6188 }, { "epoch": 0.7618168389955687, "grad_norm": 0.11392305046319962, "learning_rate": 0.00011916492178839759, "loss": 7.5383, "step": 6189 }, { "epoch": 0.7619399310684392, "grad_norm": 0.15348267555236816, "learning_rate": 0.00011910333784948886, "loss": 7.3248, "step": 6190 }, { "epoch": 0.7620630231413097, "grad_norm": 0.11160396784543991, "learning_rate": 0.00011904175391058012, "loss": 7.412, "step": 6191 }, { "epoch": 0.7621861152141802, "grad_norm": 0.08306640386581421, "learning_rate": 0.0001189801699716714, "loss": 7.5488, "step": 6192 }, { "epoch": 0.7623092072870508, "grad_norm": 0.10813990980386734, "learning_rate": 0.00011891858603276266, "loss": 7.9697, "step": 6193 }, { "epoch": 0.7624322993599212, "grad_norm": 0.17259946465492249, "learning_rate": 0.00011885700209385393, "loss": 7.7971, "step": 6194 }, { "epoch": 0.7625553914327917, "grad_norm": 0.09710083156824112, "learning_rate": 0.0001187954181549452, "loss": 7.3152, "step": 6195 }, { "epoch": 0.7626784835056623, "grad_norm": 0.05773268640041351, "learning_rate": 0.00011873383421603646, "loss": 7.5899, "step": 6196 }, { "epoch": 0.7628015755785328, "grad_norm": 0.22478622198104858, "learning_rate": 0.00011867225027712774, "loss": 8.3166, "step": 6197 }, { "epoch": 0.7629246676514032, "grad_norm": 0.10167166590690613, "learning_rate": 0.000118610666338219, "loss": 7.5747, "step": 6198 }, { "epoch": 0.7630477597242737, "grad_norm": 0.07669658958911896, "learning_rate": 0.00011854908239931027, "loss": 7.4094, "step": 6199 }, { "epoch": 0.7631708517971443, "grad_norm": 0.140053853392601, "learning_rate": 0.00011848749846040154, "loss": 7.2381, "step": 6200 }, { "epoch": 0.7632939438700148, "grad_norm": 0.0891946330666542, "learning_rate": 0.0001184259145214928, "loss": 7.7394, "step": 6201 }, { "epoch": 0.7634170359428852, "grad_norm": 0.06721743941307068, "learning_rate": 0.00011836433058258407, "loss": 7.6115, "step": 6202 }, { "epoch": 0.7635401280157558, "grad_norm": 
0.28693899512290955, "learning_rate": 0.00011830274664367533, "loss": 8.7369, "step": 6203 }, { "epoch": 0.7636632200886263, "grad_norm": 0.07985086739063263, "learning_rate": 0.0001182411627047666, "loss": 7.3829, "step": 6204 }, { "epoch": 0.7637863121614968, "grad_norm": 0.06244926154613495, "learning_rate": 0.00011817957876585788, "loss": 7.4632, "step": 6205 }, { "epoch": 0.7639094042343673, "grad_norm": 0.06999839842319489, "learning_rate": 0.00011811799482694914, "loss": 7.6239, "step": 6206 }, { "epoch": 0.7640324963072378, "grad_norm": 0.3456520438194275, "learning_rate": 0.00011805641088804041, "loss": 9.2469, "step": 6207 }, { "epoch": 0.7641555883801083, "grad_norm": 0.09206107258796692, "learning_rate": 0.00011799482694913167, "loss": 7.7126, "step": 6208 }, { "epoch": 0.7642786804529789, "grad_norm": 0.10525137931108475, "learning_rate": 0.00011793324301022294, "loss": 7.6162, "step": 6209 }, { "epoch": 0.7644017725258493, "grad_norm": 0.10477486252784729, "learning_rate": 0.00011787165907131422, "loss": 7.3879, "step": 6210 }, { "epoch": 0.7645248645987198, "grad_norm": 0.08784278482198715, "learning_rate": 0.00011781007513240546, "loss": 7.6008, "step": 6211 }, { "epoch": 0.7646479566715904, "grad_norm": 0.1553206592798233, "learning_rate": 0.00011774849119349674, "loss": 8.0889, "step": 6212 }, { "epoch": 0.7647710487444609, "grad_norm": 0.11511026322841644, "learning_rate": 0.000117686907254588, "loss": 7.8603, "step": 6213 }, { "epoch": 0.7648941408173313, "grad_norm": 0.10893949121236801, "learning_rate": 0.00011762532331567927, "loss": 7.4205, "step": 6214 }, { "epoch": 0.7650172328902018, "grad_norm": 0.09141164273023605, "learning_rate": 0.00011756373937677053, "loss": 7.4019, "step": 6215 }, { "epoch": 0.7651403249630724, "grad_norm": 0.0705072432756424, "learning_rate": 0.0001175021554378618, "loss": 7.3435, "step": 6216 }, { "epoch": 0.7652634170359429, "grad_norm": 0.2256181240081787, "learning_rate": 0.00011744057149895307, "loss": 
8.9376, "step": 6217 }, { "epoch": 0.7653865091088133, "grad_norm": 0.08890677243471146, "learning_rate": 0.00011737898756004433, "loss": 7.9123, "step": 6218 }, { "epoch": 0.7655096011816839, "grad_norm": 0.08690183609724045, "learning_rate": 0.00011731740362113561, "loss": 7.3928, "step": 6219 }, { "epoch": 0.7656326932545544, "grad_norm": 0.07605884969234467, "learning_rate": 0.00011725581968222687, "loss": 7.8098, "step": 6220 }, { "epoch": 0.7657557853274249, "grad_norm": 0.11235225200653076, "learning_rate": 0.00011719423574331814, "loss": 7.8842, "step": 6221 }, { "epoch": 0.7658788774002954, "grad_norm": 0.08383841812610626, "learning_rate": 0.00011713265180440941, "loss": 7.5925, "step": 6222 }, { "epoch": 0.7660019694731659, "grad_norm": 0.17740724980831146, "learning_rate": 0.00011707106786550067, "loss": 8.3115, "step": 6223 }, { "epoch": 0.7661250615460364, "grad_norm": 0.09730927646160126, "learning_rate": 0.00011700948392659195, "loss": 7.4826, "step": 6224 }, { "epoch": 0.766248153618907, "grad_norm": 0.07628564536571503, "learning_rate": 0.0001169478999876832, "loss": 7.4416, "step": 6225 }, { "epoch": 0.7663712456917775, "grad_norm": 0.07975108176469803, "learning_rate": 0.00011688631604877448, "loss": 7.3441, "step": 6226 }, { "epoch": 0.7664943377646479, "grad_norm": 0.1705392301082611, "learning_rate": 0.00011682473210986575, "loss": 7.7307, "step": 6227 }, { "epoch": 0.7666174298375185, "grad_norm": 0.05563268065452576, "learning_rate": 0.00011676314817095701, "loss": 7.8064, "step": 6228 }, { "epoch": 0.766740521910389, "grad_norm": 0.06746072322130203, "learning_rate": 0.00011670156423204828, "loss": 7.8602, "step": 6229 }, { "epoch": 0.7668636139832595, "grad_norm": 0.06153010204434395, "learning_rate": 0.00011663998029313954, "loss": 7.6108, "step": 6230 }, { "epoch": 0.76698670605613, "grad_norm": 0.12916624546051025, "learning_rate": 0.00011657839635423082, "loss": 7.144, "step": 6231 }, { "epoch": 0.7671097981290005, "grad_norm": 
0.07572419196367264, "learning_rate": 0.00011651681241532208, "loss": 7.783, "step": 6232 }, { "epoch": 0.767232890201871, "grad_norm": 0.12212890386581421, "learning_rate": 0.00011645522847641335, "loss": 7.2176, "step": 6233 }, { "epoch": 0.7673559822747416, "grad_norm": 0.12981797754764557, "learning_rate": 0.00011639364453750462, "loss": 7.895, "step": 6234 }, { "epoch": 0.767479074347612, "grad_norm": 0.06311120092868805, "learning_rate": 0.00011633206059859588, "loss": 7.5982, "step": 6235 }, { "epoch": 0.7676021664204825, "grad_norm": 0.1107252687215805, "learning_rate": 0.00011627047665968715, "loss": 7.1089, "step": 6236 }, { "epoch": 0.767725258493353, "grad_norm": 0.182145357131958, "learning_rate": 0.00011620889272077841, "loss": 8.0128, "step": 6237 }, { "epoch": 0.7678483505662236, "grad_norm": 0.18798702955245972, "learning_rate": 0.00011614730878186969, "loss": 7.6097, "step": 6238 }, { "epoch": 0.767971442639094, "grad_norm": 0.08241322636604309, "learning_rate": 0.00011608572484296096, "loss": 7.5993, "step": 6239 }, { "epoch": 0.7680945347119645, "grad_norm": 0.15846650302410126, "learning_rate": 0.00011602414090405222, "loss": 8.1072, "step": 6240 }, { "epoch": 0.7682176267848351, "grad_norm": 0.10686665028333664, "learning_rate": 0.00011596255696514349, "loss": 7.6181, "step": 6241 }, { "epoch": 0.7683407188577056, "grad_norm": 0.0870247557759285, "learning_rate": 0.00011590097302623475, "loss": 7.9477, "step": 6242 }, { "epoch": 0.768463810930576, "grad_norm": 0.06377986073493958, "learning_rate": 0.00011583938908732602, "loss": 7.4943, "step": 6243 }, { "epoch": 0.7685869030034466, "grad_norm": 0.16139265894889832, "learning_rate": 0.0001157778051484173, "loss": 8.2579, "step": 6244 }, { "epoch": 0.7687099950763171, "grad_norm": 0.06761300563812256, "learning_rate": 0.00011571622120950856, "loss": 7.8913, "step": 6245 }, { "epoch": 0.7688330871491876, "grad_norm": 0.0733608603477478, "learning_rate": 0.00011565463727059983, "loss": 7.7345, 
"step": 6246 }, { "epoch": 0.7689561792220581, "grad_norm": 0.0940064862370491, "learning_rate": 0.00011559305333169109, "loss": 7.4348, "step": 6247 }, { "epoch": 0.7690792712949286, "grad_norm": 0.08432704210281372, "learning_rate": 0.00011553146939278236, "loss": 7.5907, "step": 6248 }, { "epoch": 0.7692023633677991, "grad_norm": 0.06613826751708984, "learning_rate": 0.00011546988545387364, "loss": 7.798, "step": 6249 }, { "epoch": 0.7693254554406697, "grad_norm": 0.0855553075671196, "learning_rate": 0.0001154083015149649, "loss": 7.8687, "step": 6250 }, { "epoch": 0.7694485475135401, "grad_norm": 0.10373859107494354, "learning_rate": 0.00011534671757605617, "loss": 7.6693, "step": 6251 }, { "epoch": 0.7695716395864106, "grad_norm": 0.11005958169698715, "learning_rate": 0.00011528513363714743, "loss": 8.1633, "step": 6252 }, { "epoch": 0.7696947316592812, "grad_norm": 0.15208148956298828, "learning_rate": 0.0001152235496982387, "loss": 7.2734, "step": 6253 }, { "epoch": 0.7698178237321517, "grad_norm": 0.09563935548067093, "learning_rate": 0.00011516196575932997, "loss": 7.3417, "step": 6254 }, { "epoch": 0.7699409158050221, "grad_norm": 0.10135012120008469, "learning_rate": 0.00011510038182042123, "loss": 7.6528, "step": 6255 }, { "epoch": 0.7700640078778926, "grad_norm": 0.0765472799539566, "learning_rate": 0.0001150387978815125, "loss": 7.6059, "step": 6256 }, { "epoch": 0.7701870999507632, "grad_norm": 0.05998764932155609, "learning_rate": 0.00011497721394260377, "loss": 7.4217, "step": 6257 }, { "epoch": 0.7703101920236337, "grad_norm": 0.12038688361644745, "learning_rate": 0.00011491563000369504, "loss": 8.1708, "step": 6258 }, { "epoch": 0.7704332840965041, "grad_norm": 0.08040834218263626, "learning_rate": 0.0001148540460647863, "loss": 7.3351, "step": 6259 }, { "epoch": 0.7705563761693747, "grad_norm": 0.07933799922466278, "learning_rate": 0.00011479246212587757, "loss": 7.6434, "step": 6260 }, { "epoch": 0.7706794682422452, "grad_norm": 
0.06611762940883636, "learning_rate": 0.00011473087818696884, "loss": 7.4097, "step": 6261 }, { "epoch": 0.7708025603151157, "grad_norm": 0.18731476366519928, "learning_rate": 0.0001146692942480601, "loss": 8.084, "step": 6262 }, { "epoch": 0.7709256523879862, "grad_norm": 0.11255943030118942, "learning_rate": 0.00011460771030915138, "loss": 7.7477, "step": 6263 }, { "epoch": 0.7710487444608567, "grad_norm": 0.13188260793685913, "learning_rate": 0.00011454612637024264, "loss": 7.7343, "step": 6264 }, { "epoch": 0.7711718365337272, "grad_norm": 0.06265470385551453, "learning_rate": 0.00011448454243133391, "loss": 7.4933, "step": 6265 }, { "epoch": 0.7712949286065978, "grad_norm": 0.06169205158948898, "learning_rate": 0.00011442295849242518, "loss": 7.392, "step": 6266 }, { "epoch": 0.7714180206794683, "grad_norm": 0.11127829551696777, "learning_rate": 0.00011436137455351644, "loss": 7.5332, "step": 6267 }, { "epoch": 0.7715411127523387, "grad_norm": 0.06580602377653122, "learning_rate": 0.00011429979061460772, "loss": 7.4862, "step": 6268 }, { "epoch": 0.7716642048252093, "grad_norm": 0.19083347916603088, "learning_rate": 0.00011423820667569897, "loss": 7.4914, "step": 6269 }, { "epoch": 0.7717872968980798, "grad_norm": 0.114068903028965, "learning_rate": 0.00011417662273679025, "loss": 7.5163, "step": 6270 }, { "epoch": 0.7719103889709503, "grad_norm": 0.06772951781749725, "learning_rate": 0.00011411503879788152, "loss": 7.7839, "step": 6271 }, { "epoch": 0.7720334810438207, "grad_norm": 0.09687387943267822, "learning_rate": 0.00011405345485897278, "loss": 8.1194, "step": 6272 }, { "epoch": 0.7721565731166913, "grad_norm": 0.12569719552993774, "learning_rate": 0.00011399187092006405, "loss": 7.4745, "step": 6273 }, { "epoch": 0.7722796651895618, "grad_norm": 0.09401848912239075, "learning_rate": 0.00011393028698115531, "loss": 7.4805, "step": 6274 }, { "epoch": 0.7724027572624323, "grad_norm": 0.0868568941950798, "learning_rate": 0.00011386870304224659, "loss": 
7.7905, "step": 6275 }, { "epoch": 0.7725258493353028, "grad_norm": 0.06197843328118324, "learning_rate": 0.00011380711910333786, "loss": 7.3649, "step": 6276 }, { "epoch": 0.7726489414081733, "grad_norm": 0.06766969710588455, "learning_rate": 0.00011374553516442912, "loss": 7.4489, "step": 6277 }, { "epoch": 0.7727720334810438, "grad_norm": 0.05994763970375061, "learning_rate": 0.00011368395122552039, "loss": 7.4659, "step": 6278 }, { "epoch": 0.7728951255539144, "grad_norm": 0.08226695656776428, "learning_rate": 0.00011362236728661165, "loss": 7.5588, "step": 6279 }, { "epoch": 0.7730182176267848, "grad_norm": 0.060224633663892746, "learning_rate": 0.00011356078334770292, "loss": 7.5622, "step": 6280 }, { "epoch": 0.7731413096996553, "grad_norm": 0.058903131633996964, "learning_rate": 0.0001134991994087942, "loss": 7.4602, "step": 6281 }, { "epoch": 0.7732644017725259, "grad_norm": 0.5052189230918884, "learning_rate": 0.00011343761546988546, "loss": 6.4873, "step": 6282 }, { "epoch": 0.7733874938453964, "grad_norm": 0.10056603699922562, "learning_rate": 0.00011337603153097673, "loss": 7.3582, "step": 6283 }, { "epoch": 0.7735105859182668, "grad_norm": 0.06248493120074272, "learning_rate": 0.00011331444759206799, "loss": 7.3383, "step": 6284 }, { "epoch": 0.7736336779911374, "grad_norm": 0.10380296409130096, "learning_rate": 0.00011325286365315926, "loss": 7.3057, "step": 6285 }, { "epoch": 0.7737567700640079, "grad_norm": 0.2378157675266266, "learning_rate": 0.00011319127971425052, "loss": 8.3136, "step": 6286 }, { "epoch": 0.7738798621368784, "grad_norm": 0.1175975427031517, "learning_rate": 0.0001131296957753418, "loss": 7.6011, "step": 6287 }, { "epoch": 0.7740029542097489, "grad_norm": 0.1266176700592041, "learning_rate": 0.00011306811183643307, "loss": 7.8978, "step": 6288 }, { "epoch": 0.7741260462826194, "grad_norm": 0.2993014454841614, "learning_rate": 0.00011300652789752433, "loss": 9.2245, "step": 6289 }, { "epoch": 0.7742491383554899, "grad_norm": 
0.3059338629245758, "learning_rate": 0.0001129449439586156, "loss": 8.8787, "step": 6290 }, { "epoch": 0.7743722304283605, "grad_norm": 0.08391368389129639, "learning_rate": 0.00011288336001970686, "loss": 7.6791, "step": 6291 }, { "epoch": 0.7744953225012309, "grad_norm": 0.2768370807170868, "learning_rate": 0.00011282177608079813, "loss": 7.1014, "step": 6292 }, { "epoch": 0.7746184145741014, "grad_norm": 0.10957684367895126, "learning_rate": 0.0001127601921418894, "loss": 7.7083, "step": 6293 }, { "epoch": 0.774741506646972, "grad_norm": 0.0920078456401825, "learning_rate": 0.00011269860820298067, "loss": 7.3657, "step": 6294 }, { "epoch": 0.7748645987198425, "grad_norm": 0.20971371233463287, "learning_rate": 0.00011263702426407194, "loss": 8.6325, "step": 6295 }, { "epoch": 0.7749876907927129, "grad_norm": 0.11781132966279984, "learning_rate": 0.0001125754403251632, "loss": 7.6086, "step": 6296 }, { "epoch": 0.7751107828655834, "grad_norm": 0.12943139672279358, "learning_rate": 0.00011251385638625447, "loss": 7.3454, "step": 6297 }, { "epoch": 0.775233874938454, "grad_norm": 0.31699255108833313, "learning_rate": 0.00011245227244734574, "loss": 9.0893, "step": 6298 }, { "epoch": 0.7753569670113245, "grad_norm": 0.09212897717952728, "learning_rate": 0.000112390688508437, "loss": 8.2538, "step": 6299 }, { "epoch": 0.7754800590841949, "grad_norm": 0.17276698350906372, "learning_rate": 0.00011232910456952828, "loss": 7.7211, "step": 6300 }, { "epoch": 0.7756031511570655, "grad_norm": 0.15354937314987183, "learning_rate": 0.00011226752063061954, "loss": 8.1831, "step": 6301 }, { "epoch": 0.775726243229936, "grad_norm": 0.23829863965511322, "learning_rate": 0.00011220593669171081, "loss": 7.0095, "step": 6302 }, { "epoch": 0.7758493353028065, "grad_norm": 0.059524308890104294, "learning_rate": 0.00011214435275280208, "loss": 7.7228, "step": 6303 }, { "epoch": 0.775972427375677, "grad_norm": 0.34037986397743225, "learning_rate": 0.00011208276881389334, "loss": 7.3037, 
"step": 6304 }, { "epoch": 0.7760955194485475, "grad_norm": 0.08937001973390579, "learning_rate": 0.00011202118487498461, "loss": 7.5018, "step": 6305 }, { "epoch": 0.776218611521418, "grad_norm": 0.0842142254114151, "learning_rate": 0.00011195960093607587, "loss": 7.634, "step": 6306 }, { "epoch": 0.7763417035942886, "grad_norm": 0.08960547298192978, "learning_rate": 0.00011189801699716715, "loss": 7.2872, "step": 6307 }, { "epoch": 0.776464795667159, "grad_norm": 0.1074209064245224, "learning_rate": 0.0001118364330582584, "loss": 7.6829, "step": 6308 }, { "epoch": 0.7765878877400295, "grad_norm": 0.09420562535524368, "learning_rate": 0.00011177484911934968, "loss": 7.5484, "step": 6309 }, { "epoch": 0.7767109798129, "grad_norm": 0.11278574913740158, "learning_rate": 0.00011171326518044095, "loss": 7.7859, "step": 6310 }, { "epoch": 0.7768340718857706, "grad_norm": 0.2638349235057831, "learning_rate": 0.00011165168124153221, "loss": 8.547, "step": 6311 }, { "epoch": 0.7769571639586411, "grad_norm": 0.17181748151779175, "learning_rate": 0.00011159009730262349, "loss": 8.2287, "step": 6312 }, { "epoch": 0.7770802560315115, "grad_norm": NaN, "learning_rate": 0.00011152851336371474, "loss": 7.8704, "step": 6313 }, { "epoch": 0.7772033481043821, "grad_norm": 0.10093763470649719, "learning_rate": 0.00011146692942480602, "loss": 7.2924, "step": 6314 }, { "epoch": 0.7773264401772526, "grad_norm": 0.10844790190458298, "learning_rate": 0.00011140534548589729, "loss": 7.1561, "step": 6315 }, { "epoch": 0.7774495322501231, "grad_norm": 0.1637260913848877, "learning_rate": 0.00011134376154698855, "loss": 7.4146, "step": 6316 }, { "epoch": 0.7775726243229936, "grad_norm": 0.14832377433776855, "learning_rate": 0.00011128217760807982, "loss": 7.846, "step": 6317 }, { "epoch": 0.7776957163958641, "grad_norm": 0.12017890810966492, "learning_rate": 0.00011122059366917108, "loss": 8.2246, "step": 6318 }, { "epoch": 0.7778188084687346, "grad_norm": 0.299213171005249, "learning_rate": 
0.00011115900973026236, "loss": 7.28, "step": 6319 }, { "epoch": 0.7779419005416052, "grad_norm": 0.3024197220802307, "learning_rate": 0.00011109742579135363, "loss": 7.2798, "step": 6320 }, { "epoch": 0.7780649926144756, "grad_norm": 0.23700232803821564, "learning_rate": 0.00011103584185244489, "loss": 7.812, "step": 6321 }, { "epoch": 0.7781880846873461, "grad_norm": 0.1490444839000702, "learning_rate": 0.00011097425791353616, "loss": 7.3889, "step": 6322 }, { "epoch": 0.7783111767602167, "grad_norm": 0.06882516294717789, "learning_rate": 0.00011091267397462742, "loss": 7.8291, "step": 6323 }, { "epoch": 0.7784342688330872, "grad_norm": 0.11512406170368195, "learning_rate": 0.0001108510900357187, "loss": 7.4444, "step": 6324 }, { "epoch": 0.7785573609059576, "grad_norm": 0.16413556039333344, "learning_rate": 0.00011078950609680997, "loss": 7.3149, "step": 6325 }, { "epoch": 0.7786804529788282, "grad_norm": 0.14331288635730743, "learning_rate": 0.00011072792215790123, "loss": 7.4596, "step": 6326 }, { "epoch": 0.7788035450516987, "grad_norm": 0.1379718780517578, "learning_rate": 0.00011066633821899249, "loss": 7.6526, "step": 6327 }, { "epoch": 0.7789266371245692, "grad_norm": 0.13980236649513245, "learning_rate": 0.00011060475428008375, "loss": 7.9109, "step": 6328 }, { "epoch": 0.7790497291974396, "grad_norm": 0.10632085055112839, "learning_rate": 0.00011054317034117502, "loss": 7.4902, "step": 6329 }, { "epoch": 0.7791728212703102, "grad_norm": 0.10253134369850159, "learning_rate": 0.00011048158640226628, "loss": 7.6084, "step": 6330 }, { "epoch": 0.7792959133431807, "grad_norm": 0.1066804900765419, "learning_rate": 0.00011042000246335755, "loss": 7.5682, "step": 6331 }, { "epoch": 0.7794190054160512, "grad_norm": 0.1827070415019989, "learning_rate": 0.00011035841852444882, "loss": 7.8937, "step": 6332 }, { "epoch": 0.7795420974889217, "grad_norm": 0.10156582295894623, "learning_rate": 0.00011029683458554008, "loss": 7.2438, "step": 6333 }, { "epoch": 
0.7796651895617922, "grad_norm": 0.16677366197109222, "learning_rate": 0.00011023525064663136, "loss": 7.9382, "step": 6334 }, { "epoch": 0.7797882816346627, "grad_norm": 0.3430347144603729, "learning_rate": 0.00011017366670772262, "loss": 8.5567, "step": 6335 }, { "epoch": 0.7799113737075333, "grad_norm": 0.144114688038826, "learning_rate": 0.00011011208276881389, "loss": 7.949, "step": 6336 }, { "epoch": 0.7800344657804037, "grad_norm": 0.06762509793043137, "learning_rate": 0.00011005049882990516, "loss": 7.401, "step": 6337 }, { "epoch": 0.7801575578532742, "grad_norm": 0.2954547107219696, "learning_rate": 0.00010998891489099642, "loss": 8.9392, "step": 6338 }, { "epoch": 0.7802806499261448, "grad_norm": 0.07451173663139343, "learning_rate": 0.0001099273309520877, "loss": 7.6858, "step": 6339 }, { "epoch": 0.7804037419990153, "grad_norm": 0.10009699314832687, "learning_rate": 0.00010986574701317895, "loss": 7.6303, "step": 6340 }, { "epoch": 0.7805268340718857, "grad_norm": 0.10491631925106049, "learning_rate": 0.00010980416307427023, "loss": 7.7601, "step": 6341 }, { "epoch": 0.7806499261447563, "grad_norm": 0.12937569618225098, "learning_rate": 0.0001097425791353615, "loss": 7.6077, "step": 6342 }, { "epoch": 0.7807730182176268, "grad_norm": 0.08304852992296219, "learning_rate": 0.00010968099519645276, "loss": 7.5667, "step": 6343 }, { "epoch": 0.7808961102904973, "grad_norm": 0.3258952796459198, "learning_rate": 0.00010961941125754403, "loss": 9.4441, "step": 6344 }, { "epoch": 0.7810192023633677, "grad_norm": 0.07482681423425674, "learning_rate": 0.00010955782731863529, "loss": 7.7752, "step": 6345 }, { "epoch": 0.7811422944362383, "grad_norm": 0.07761622220277786, "learning_rate": 0.00010949624337972657, "loss": 7.7714, "step": 6346 }, { "epoch": 0.7812653865091088, "grad_norm": 0.6000692248344421, "learning_rate": 0.00010943465944081784, "loss": 10.248, "step": 6347 }, { "epoch": 0.7813884785819794, "grad_norm": 0.0937577560544014, "learning_rate": 
0.0001093730755019091, "loss": 7.6345, "step": 6348 }, { "epoch": 0.7815115706548498, "grad_norm": 0.0841708853840828, "learning_rate": 0.00010931149156300037, "loss": 7.5444, "step": 6349 }, { "epoch": 0.7816346627277203, "grad_norm": 0.0872383713722229, "learning_rate": 0.00010924990762409163, "loss": 7.6269, "step": 6350 }, { "epoch": 0.7817577548005908, "grad_norm": 0.11692985147237778, "learning_rate": 0.0001091883236851829, "loss": 7.4824, "step": 6351 }, { "epoch": 0.7818808468734614, "grad_norm": 0.0778486430644989, "learning_rate": 0.00010912673974627418, "loss": 7.544, "step": 6352 }, { "epoch": 0.7820039389463319, "grad_norm": 0.10161389410495758, "learning_rate": 0.00010906515580736544, "loss": 7.5196, "step": 6353 }, { "epoch": 0.7821270310192023, "grad_norm": 0.15307043492794037, "learning_rate": 0.00010900357186845671, "loss": 7.6764, "step": 6354 }, { "epoch": 0.7822501230920729, "grad_norm": 0.26481425762176514, "learning_rate": 0.00010894198792954797, "loss": 8.8546, "step": 6355 }, { "epoch": 0.7823732151649434, "grad_norm": 0.06273788958787918, "learning_rate": 0.00010888040399063924, "loss": 7.6909, "step": 6356 }, { "epoch": 0.7824963072378139, "grad_norm": 0.10499484091997147, "learning_rate": 0.0001088188200517305, "loss": 7.5289, "step": 6357 }, { "epoch": 0.7826193993106844, "grad_norm": 0.10318832844495773, "learning_rate": 0.00010875723611282177, "loss": 7.4832, "step": 6358 }, { "epoch": 0.7827424913835549, "grad_norm": 0.18600496649742126, "learning_rate": 0.00010869565217391305, "loss": 8.2637, "step": 6359 }, { "epoch": 0.7828655834564254, "grad_norm": 0.11331206560134888, "learning_rate": 0.00010863406823500431, "loss": 7.3713, "step": 6360 }, { "epoch": 0.782988675529296, "grad_norm": 0.11062557995319366, "learning_rate": 0.00010857248429609558, "loss": 8.0229, "step": 6361 }, { "epoch": 0.7831117676021664, "grad_norm": 0.36567872762680054, "learning_rate": 0.00010851090035718684, "loss": 9.4089, "step": 6362 }, { "epoch": 
0.7832348596750369, "grad_norm": 0.1763904094696045, "learning_rate": 0.00010844931641827811, "loss": 8.6528, "step": 6363 }, { "epoch": 0.7833579517479075, "grad_norm": 0.16061310470104218, "learning_rate": 0.00010838773247936939, "loss": 7.5213, "step": 6364 }, { "epoch": 0.783481043820778, "grad_norm": 0.0887046679854393, "learning_rate": 0.00010832614854046065, "loss": 7.6736, "step": 6365 }, { "epoch": 0.7836041358936484, "grad_norm": 0.10534150898456573, "learning_rate": 0.00010826456460155192, "loss": 7.5233, "step": 6366 }, { "epoch": 0.783727227966519, "grad_norm": 0.11390066891908646, "learning_rate": 0.00010820298066264318, "loss": 7.7338, "step": 6367 }, { "epoch": 0.7838503200393895, "grad_norm": 0.1437213122844696, "learning_rate": 0.00010814139672373445, "loss": 7.3357, "step": 6368 }, { "epoch": 0.78397341211226, "grad_norm": 0.11058622598648071, "learning_rate": 0.00010807981278482572, "loss": 7.3154, "step": 6369 }, { "epoch": 0.7840965041851304, "grad_norm": 0.14130057394504547, "learning_rate": 0.00010801822884591698, "loss": 7.6483, "step": 6370 }, { "epoch": 0.784219596258001, "grad_norm": 0.12792077660560608, "learning_rate": 0.00010795664490700826, "loss": 7.3666, "step": 6371 }, { "epoch": 0.7843426883308715, "grad_norm": 0.3990030586719513, "learning_rate": 0.00010789506096809952, "loss": 9.3404, "step": 6372 }, { "epoch": 0.784465780403742, "grad_norm": 0.18441906571388245, "learning_rate": 0.00010783347702919079, "loss": 7.6902, "step": 6373 }, { "epoch": 0.7845888724766125, "grad_norm": 0.07457497715950012, "learning_rate": 0.00010777189309028206, "loss": 7.5005, "step": 6374 }, { "epoch": 0.784711964549483, "grad_norm": 0.07615667581558228, "learning_rate": 0.00010771030915137332, "loss": 7.5506, "step": 6375 }, { "epoch": 0.7848350566223535, "grad_norm": 0.060634415596723557, "learning_rate": 0.0001076487252124646, "loss": 7.6878, "step": 6376 }, { "epoch": 0.7849581486952241, "grad_norm": 0.0715024545788765, "learning_rate": 
0.00010758714127355585, "loss": 7.5435, "step": 6377 }, { "epoch": 0.7850812407680945, "grad_norm": 0.07436054944992065, "learning_rate": 0.00010752555733464713, "loss": 7.8599, "step": 6378 }, { "epoch": 0.785204332840965, "grad_norm": 0.07674548774957657, "learning_rate": 0.00010746397339573839, "loss": 8.3708, "step": 6379 }, { "epoch": 0.7853274249138356, "grad_norm": 0.4931749105453491, "learning_rate": 0.00010740238945682966, "loss": 10.6769, "step": 6380 }, { "epoch": 0.7854505169867061, "grad_norm": 0.34970322251319885, "learning_rate": 0.00010734080551792093, "loss": 9.6159, "step": 6381 }, { "epoch": 0.7855736090595765, "grad_norm": 0.17541195452213287, "learning_rate": 0.00010727922157901219, "loss": 8.1017, "step": 6382 }, { "epoch": 0.785696701132447, "grad_norm": 0.20791035890579224, "learning_rate": 0.00010721763764010347, "loss": 7.6424, "step": 6383 }, { "epoch": 0.7858197932053176, "grad_norm": 0.2516424059867859, "learning_rate": 0.00010715605370119472, "loss": 7.5043, "step": 6384 }, { "epoch": 0.7859428852781881, "grad_norm": 0.20793917775154114, "learning_rate": 0.000107094469762286, "loss": 7.643, "step": 6385 }, { "epoch": 0.7860659773510585, "grad_norm": 0.18730588257312775, "learning_rate": 0.00010703288582337727, "loss": 7.4741, "step": 6386 }, { "epoch": 0.7861890694239291, "grad_norm": 0.20733077824115753, "learning_rate": 0.00010697130188446853, "loss": 7.2645, "step": 6387 }, { "epoch": 0.7863121614967996, "grad_norm": 0.10571102797985077, "learning_rate": 0.0001069097179455598, "loss": 7.7291, "step": 6388 }, { "epoch": 0.7864352535696701, "grad_norm": 0.1216963529586792, "learning_rate": 0.00010684813400665106, "loss": 7.7889, "step": 6389 }, { "epoch": 0.7865583456425406, "grad_norm": 0.11096911877393723, "learning_rate": 0.00010678655006774234, "loss": 7.3675, "step": 6390 }, { "epoch": 0.7866814377154111, "grad_norm": 0.36985480785369873, "learning_rate": 0.00010672496612883361, "loss": 8.6415, "step": 6391 }, { "epoch": 
0.7868045297882816, "grad_norm": 0.44025593996047974, "learning_rate": 0.00010666338218992487, "loss": 9.0905, "step": 6392 }, { "epoch": 0.7869276218611522, "grad_norm": 0.1571093648672104, "learning_rate": 0.00010660179825101614, "loss": 7.453, "step": 6393 }, { "epoch": 0.7870507139340226, "grad_norm": 0.15613771975040436, "learning_rate": 0.0001065402143121074, "loss": 8.0211, "step": 6394 }, { "epoch": 0.7871738060068931, "grad_norm": 0.29898446798324585, "learning_rate": 0.00010647863037319867, "loss": 8.8688, "step": 6395 }, { "epoch": 0.7872968980797637, "grad_norm": 0.09215162694454193, "learning_rate": 0.00010641704643428995, "loss": 7.5011, "step": 6396 }, { "epoch": 0.7874199901526342, "grad_norm": 0.10534906387329102, "learning_rate": 0.0001063554624953812, "loss": 7.4478, "step": 6397 }, { "epoch": 0.7875430822255047, "grad_norm": 0.10190354287624359, "learning_rate": 0.00010629387855647248, "loss": 7.4676, "step": 6398 }, { "epoch": 0.7876661742983752, "grad_norm": 0.1303817480802536, "learning_rate": 0.00010623229461756374, "loss": 7.3369, "step": 6399 }, { "epoch": 0.7877892663712457, "grad_norm": 0.053441375494003296, "learning_rate": 0.00010617071067865501, "loss": 7.7972, "step": 6400 }, { "epoch": 0.7879123584441162, "grad_norm": 0.0707133412361145, "learning_rate": 0.00010610912673974629, "loss": 7.4654, "step": 6401 }, { "epoch": 0.7880354505169868, "grad_norm": 0.14079688489437103, "learning_rate": 0.00010604754280083754, "loss": 8.0856, "step": 6402 }, { "epoch": 0.7881585425898572, "grad_norm": 0.09819497913122177, "learning_rate": 0.00010598595886192882, "loss": 8.0364, "step": 6403 }, { "epoch": 0.7882816346627277, "grad_norm": 0.058472417294979095, "learning_rate": 0.00010592437492302008, "loss": 7.5689, "step": 6404 }, { "epoch": 0.7884047267355982, "grad_norm": 0.13745737075805664, "learning_rate": 0.00010586279098411135, "loss": 7.9852, "step": 6405 }, { "epoch": 0.7885278188084688, "grad_norm": 0.09691797941923141, "learning_rate": 
0.00010580120704520261, "loss": 8.2348, "step": 6406 }, { "epoch": 0.7886509108813392, "grad_norm": 0.10312214493751526, "learning_rate": 0.00010573962310629388, "loss": 7.5003, "step": 6407 }, { "epoch": 0.7887740029542097, "grad_norm": 0.11904244124889374, "learning_rate": 0.00010567803916738516, "loss": 7.3349, "step": 6408 }, { "epoch": 0.7888970950270803, "grad_norm": 0.08865570276975632, "learning_rate": 0.00010561645522847642, "loss": 8.0578, "step": 6409 }, { "epoch": 0.7890201870999508, "grad_norm": 0.08808651566505432, "learning_rate": 0.00010555487128956769, "loss": 7.4475, "step": 6410 }, { "epoch": 0.7891432791728212, "grad_norm": 0.060910288244485855, "learning_rate": 0.00010549328735065895, "loss": 7.6564, "step": 6411 }, { "epoch": 0.7892663712456918, "grad_norm": 0.10546129941940308, "learning_rate": 0.00010543170341175022, "loss": 7.6415, "step": 6412 }, { "epoch": 0.7893894633185623, "grad_norm": 0.147940531373024, "learning_rate": 0.0001053701194728415, "loss": 7.4997, "step": 6413 }, { "epoch": 0.7895125553914328, "grad_norm": 0.11152994632720947, "learning_rate": 0.00010530853553393275, "loss": 7.7407, "step": 6414 }, { "epoch": 0.7896356474643033, "grad_norm": 0.06410064548254013, "learning_rate": 0.00010524695159502403, "loss": 7.5683, "step": 6415 }, { "epoch": 0.7897587395371738, "grad_norm": 0.2882334589958191, "learning_rate": 0.00010518536765611529, "loss": 8.6431, "step": 6416 }, { "epoch": 0.7898818316100443, "grad_norm": 0.07458461821079254, "learning_rate": 0.00010512378371720656, "loss": 7.5713, "step": 6417 }, { "epoch": 0.7900049236829149, "grad_norm": 0.10688042640686035, "learning_rate": 0.00010506219977829783, "loss": 7.3453, "step": 6418 }, { "epoch": 0.7901280157557853, "grad_norm": 0.09211279451847076, "learning_rate": 0.00010500061583938909, "loss": 7.4195, "step": 6419 }, { "epoch": 0.7902511078286558, "grad_norm": 0.07735666632652283, "learning_rate": 0.00010493903190048036, "loss": 7.9081, "step": 6420 }, { "epoch": 
0.7903741999015264, "grad_norm": 0.08134257048368454, "learning_rate": 0.00010487744796157162, "loss": 7.4714, "step": 6421 }, { "epoch": 0.7904972919743969, "grad_norm": 0.06881861388683319, "learning_rate": 0.0001048158640226629, "loss": 7.5711, "step": 6422 }, { "epoch": 0.7906203840472673, "grad_norm": 0.17148461937904358, "learning_rate": 0.00010475428008375417, "loss": 8.3952, "step": 6423 }, { "epoch": 0.7907434761201378, "grad_norm": 0.0924656093120575, "learning_rate": 0.00010469269614484543, "loss": 7.3185, "step": 6424 }, { "epoch": 0.7908665681930084, "grad_norm": 0.12239021062850952, "learning_rate": 0.0001046311122059367, "loss": 8.1559, "step": 6425 }, { "epoch": 0.7909896602658789, "grad_norm": 0.12392355501651764, "learning_rate": 0.00010456952826702796, "loss": 7.8899, "step": 6426 }, { "epoch": 0.7911127523387493, "grad_norm": 0.05964861810207367, "learning_rate": 0.00010450794432811924, "loss": 7.6474, "step": 6427 }, { "epoch": 0.7912358444116199, "grad_norm": 0.07516080141067505, "learning_rate": 0.0001044463603892105, "loss": 7.684, "step": 6428 }, { "epoch": 0.7913589364844904, "grad_norm": 0.053391892462968826, "learning_rate": 0.00010438477645030177, "loss": 7.423, "step": 6429 }, { "epoch": 0.7914820285573609, "grad_norm": 0.09233944118022919, "learning_rate": 0.00010432319251139304, "loss": 7.5664, "step": 6430 }, { "epoch": 0.7916051206302314, "grad_norm": 0.10221130400896072, "learning_rate": 0.0001042616085724843, "loss": 8.0343, "step": 6431 }, { "epoch": 0.7917282127031019, "grad_norm": 0.13025027513504028, "learning_rate": 0.00010420002463357557, "loss": 7.3417, "step": 6432 }, { "epoch": 0.7918513047759724, "grad_norm": 0.13700228929519653, "learning_rate": 0.00010413844069466683, "loss": 7.1823, "step": 6433 }, { "epoch": 0.791974396848843, "grad_norm": 0.13024619221687317, "learning_rate": 0.0001040768567557581, "loss": 8.1363, "step": 6434 }, { "epoch": 0.7920974889217134, "grad_norm": 0.4131031930446625, "learning_rate": 
0.00010401527281684938, "loss": 9.4379, "step": 6435 }, { "epoch": 0.7922205809945839, "grad_norm": 0.09884779900312424, "learning_rate": 0.00010395368887794064, "loss": 7.3266, "step": 6436 }, { "epoch": 0.7923436730674545, "grad_norm": 0.10586809366941452, "learning_rate": 0.00010389210493903191, "loss": 7.2813, "step": 6437 }, { "epoch": 0.792466765140325, "grad_norm": 0.05450424551963806, "learning_rate": 0.00010383052100012317, "loss": 7.6826, "step": 6438 }, { "epoch": 0.7925898572131955, "grad_norm": 0.09307166188955307, "learning_rate": 0.00010376893706121444, "loss": 7.3808, "step": 6439 }, { "epoch": 0.792712949286066, "grad_norm": 0.07402602583169937, "learning_rate": 0.00010370735312230572, "loss": 7.5875, "step": 6440 }, { "epoch": 0.7928360413589365, "grad_norm": 0.08641255646944046, "learning_rate": 0.00010364576918339698, "loss": 7.7565, "step": 6441 }, { "epoch": 0.792959133431807, "grad_norm": 0.3639201819896698, "learning_rate": 0.00010358418524448825, "loss": 9.1856, "step": 6442 }, { "epoch": 0.7930822255046776, "grad_norm": 0.06976400315761566, "learning_rate": 0.0001035226013055795, "loss": 7.7755, "step": 6443 }, { "epoch": 0.793205317577548, "grad_norm": 0.07230427116155624, "learning_rate": 0.00010346101736667077, "loss": 7.4803, "step": 6444 }, { "epoch": 0.7933284096504185, "grad_norm": 0.2988855540752411, "learning_rate": 0.00010339943342776204, "loss": 9.3247, "step": 6445 }, { "epoch": 0.793451501723289, "grad_norm": 0.08381945639848709, "learning_rate": 0.0001033378494888533, "loss": 7.4738, "step": 6446 }, { "epoch": 0.7935745937961596, "grad_norm": 0.07834149152040482, "learning_rate": 0.00010327626554994457, "loss": 7.6172, "step": 6447 }, { "epoch": 0.79369768586903, "grad_norm": 0.12817896902561188, "learning_rate": 0.00010321468161103583, "loss": 7.3196, "step": 6448 }, { "epoch": 0.7938207779419005, "grad_norm": 0.10296103358268738, "learning_rate": 0.0001031530976721271, "loss": 7.4179, "step": 6449 }, { "epoch": 
0.7939438700147711, "grad_norm": 0.07409466058015823, "learning_rate": 0.00010309151373321837, "loss": 7.6952, "step": 6450 }, { "epoch": 0.7940669620876416, "grad_norm": 0.19431643187999725, "learning_rate": 0.00010302992979430964, "loss": 8.3973, "step": 6451 }, { "epoch": 0.794190054160512, "grad_norm": 0.06545481830835342, "learning_rate": 0.00010296834585540091, "loss": 7.7691, "step": 6452 }, { "epoch": 0.7943131462333826, "grad_norm": 0.08290800452232361, "learning_rate": 0.00010290676191649217, "loss": 7.6014, "step": 6453 }, { "epoch": 0.7944362383062531, "grad_norm": 0.11546685546636581, "learning_rate": 0.00010284517797758344, "loss": 7.2481, "step": 6454 }, { "epoch": 0.7945593303791236, "grad_norm": 0.0708012580871582, "learning_rate": 0.0001027835940386747, "loss": 7.6502, "step": 6455 }, { "epoch": 0.794682422451994, "grad_norm": 0.06817459315061569, "learning_rate": 0.00010272201009976598, "loss": 7.7165, "step": 6456 }, { "epoch": 0.7948055145248646, "grad_norm": 0.07426705956459045, "learning_rate": 0.00010266042616085725, "loss": 7.5326, "step": 6457 }, { "epoch": 0.7949286065977351, "grad_norm": 0.06419304758310318, "learning_rate": 0.00010259884222194851, "loss": 7.5129, "step": 6458 }, { "epoch": 0.7950516986706057, "grad_norm": 0.11138592660427094, "learning_rate": 0.00010253725828303978, "loss": 7.5886, "step": 6459 }, { "epoch": 0.7951747907434761, "grad_norm": 0.6583200097084045, "learning_rate": 0.00010247567434413104, "loss": 10.979, "step": 6460 }, { "epoch": 0.7952978828163466, "grad_norm": 0.0655200183391571, "learning_rate": 0.00010241409040522232, "loss": 7.4266, "step": 6461 }, { "epoch": 0.7954209748892171, "grad_norm": 0.08752407878637314, "learning_rate": 0.00010235250646631359, "loss": 7.2071, "step": 6462 }, { "epoch": 0.7955440669620877, "grad_norm": 0.14628218114376068, "learning_rate": 0.00010229092252740485, "loss": 8.381, "step": 6463 }, { "epoch": 0.7956671590349581, "grad_norm": 0.09042931348085403, "learning_rate": 
0.00010222933858849612, "loss": 7.8623, "step": 6464 }, { "epoch": 0.7957902511078286, "grad_norm": 0.08728011697530746, "learning_rate": 0.00010216775464958738, "loss": 7.688, "step": 6465 }, { "epoch": 0.7959133431806992, "grad_norm": 0.05715283751487732, "learning_rate": 0.00010210617071067865, "loss": 7.9457, "step": 6466 }, { "epoch": 0.7960364352535697, "grad_norm": 0.080800861120224, "learning_rate": 0.00010204458677176993, "loss": 7.6265, "step": 6467 }, { "epoch": 0.7961595273264401, "grad_norm": 0.16379877924919128, "learning_rate": 0.00010198300283286119, "loss": 7.3437, "step": 6468 }, { "epoch": 0.7962826193993107, "grad_norm": 0.12039568275213242, "learning_rate": 0.00010192141889395246, "loss": 8.0355, "step": 6469 }, { "epoch": 0.7964057114721812, "grad_norm": 0.19755816459655762, "learning_rate": 0.00010185983495504372, "loss": 7.1452, "step": 6470 }, { "epoch": 0.7965288035450517, "grad_norm": 0.0631256029009819, "learning_rate": 0.00010179825101613499, "loss": 7.642, "step": 6471 }, { "epoch": 0.7966518956179222, "grad_norm": 0.08772951364517212, "learning_rate": 0.00010173666707722626, "loss": 7.4443, "step": 6472 }, { "epoch": 0.7967749876907927, "grad_norm": 0.4648604989051819, "learning_rate": 0.00010167508313831752, "loss": 6.6693, "step": 6473 }, { "epoch": 0.7968980797636632, "grad_norm": 0.09652233123779297, "learning_rate": 0.0001016134991994088, "loss": 7.9172, "step": 6474 }, { "epoch": 0.7970211718365338, "grad_norm": 0.11744292825460434, "learning_rate": 0.00010155191526050006, "loss": 7.317, "step": 6475 }, { "epoch": 0.7971442639094042, "grad_norm": 0.15977148711681366, "learning_rate": 0.00010149033132159133, "loss": 8.1029, "step": 6476 }, { "epoch": 0.7972673559822747, "grad_norm": 0.2684887945652008, "learning_rate": 0.00010142874738268259, "loss": 8.5972, "step": 6477 }, { "epoch": 0.7973904480551453, "grad_norm": 0.10001447796821594, "learning_rate": 0.00010136716344377386, "loss": 7.3682, "step": 6478 }, { "epoch": 
0.7975135401280158, "grad_norm": 0.15006232261657715, "learning_rate": 0.00010130557950486514, "loss": 8.1946, "step": 6479 }, { "epoch": 0.7976366322008862, "grad_norm": 0.08776351064443588, "learning_rate": 0.0001012439955659564, "loss": 7.5742, "step": 6480 }, { "epoch": 0.7977597242737567, "grad_norm": 0.13850414752960205, "learning_rate": 0.00010118241162704767, "loss": 7.2394, "step": 6481 }, { "epoch": 0.7978828163466273, "grad_norm": 0.08544635772705078, "learning_rate": 0.00010112082768813893, "loss": 7.8686, "step": 6482 }, { "epoch": 0.7980059084194978, "grad_norm": 0.05775247514247894, "learning_rate": 0.0001010592437492302, "loss": 7.6476, "step": 6483 }, { "epoch": 0.7981290004923683, "grad_norm": 0.08509424328804016, "learning_rate": 0.00010099765981032147, "loss": 7.5369, "step": 6484 }, { "epoch": 0.7982520925652388, "grad_norm": 0.10833921283483505, "learning_rate": 0.00010093607587141273, "loss": 7.6383, "step": 6485 }, { "epoch": 0.7983751846381093, "grad_norm": 0.06449475884437561, "learning_rate": 0.000100874491932504, "loss": 7.8649, "step": 6486 }, { "epoch": 0.7984982767109798, "grad_norm": 0.11285197734832764, "learning_rate": 0.00010081290799359527, "loss": 7.8424, "step": 6487 }, { "epoch": 0.7986213687838504, "grad_norm": 0.06287034600973129, "learning_rate": 0.00010075132405468654, "loss": 7.6734, "step": 6488 }, { "epoch": 0.7987444608567208, "grad_norm": 0.1502646654844284, "learning_rate": 0.00010068974011577781, "loss": 7.8187, "step": 6489 }, { "epoch": 0.7988675529295913, "grad_norm": 0.24179373681545258, "learning_rate": 0.00010062815617686907, "loss": 9.0366, "step": 6490 }, { "epoch": 0.7989906450024619, "grad_norm": 0.12627360224723816, "learning_rate": 0.00010056657223796034, "loss": 7.3165, "step": 6491 }, { "epoch": 0.7991137370753324, "grad_norm": 0.0832831859588623, "learning_rate": 0.0001005049882990516, "loss": 7.339, "step": 6492 }, { "epoch": 0.7992368291482028, "grad_norm": 0.06612904369831085, "learning_rate": 
0.00010044340436014288, "loss": 7.4833, "step": 6493 }, { "epoch": 0.7993599212210734, "grad_norm": 0.05828707665205002, "learning_rate": 0.00010038182042123415, "loss": 7.5973, "step": 6494 }, { "epoch": 0.7994830132939439, "grad_norm": 0.05658943951129913, "learning_rate": 0.00010032023648232541, "loss": 7.8285, "step": 6495 }, { "epoch": 0.7996061053668144, "grad_norm": 0.1171034500002861, "learning_rate": 0.00010025865254341668, "loss": 8.0669, "step": 6496 }, { "epoch": 0.7997291974396848, "grad_norm": 0.12196958810091019, "learning_rate": 0.00010019706860450794, "loss": 7.4938, "step": 6497 }, { "epoch": 0.7998522895125554, "grad_norm": 0.09442540258169174, "learning_rate": 0.00010013548466559921, "loss": 7.7223, "step": 6498 }, { "epoch": 0.7999753815854259, "grad_norm": 0.11465384066104889, "learning_rate": 0.00010007390072669047, "loss": 7.2298, "step": 6499 }, { "epoch": 0.8000984736582964, "grad_norm": 0.06174980849027634, "learning_rate": 0.00010001231678778175, "loss": 7.3518, "step": 6500 }, { "epoch": 0.8002215657311669, "grad_norm": 0.1813124418258667, "learning_rate": 9.995073284887302e-05, "loss": 8.6182, "step": 6501 }, { "epoch": 0.8003446578040374, "grad_norm": 0.23940908908843994, "learning_rate": 9.988914890996428e-05, "loss": 8.4866, "step": 6502 }, { "epoch": 0.8004677498769079, "grad_norm": 0.07885553687810898, "learning_rate": 9.982756497105555e-05, "loss": 7.3655, "step": 6503 }, { "epoch": 0.8005908419497785, "grad_norm": 0.0517587810754776, "learning_rate": 9.976598103214681e-05, "loss": 7.6258, "step": 6504 }, { "epoch": 0.8007139340226489, "grad_norm": 0.13059797883033752, "learning_rate": 9.970439709323809e-05, "loss": 8.1989, "step": 6505 }, { "epoch": 0.8008370260955194, "grad_norm": 0.12613755464553833, "learning_rate": 9.964281315432936e-05, "loss": 7.4208, "step": 6506 }, { "epoch": 0.80096011816839, "grad_norm": 0.08263368904590607, "learning_rate": 9.958122921542062e-05, "loss": 7.6922, "step": 6507 }, { "epoch": 
0.8010832102412605, "grad_norm": 0.06434278190135956, "learning_rate": 9.951964527651189e-05, "loss": 7.6112, "step": 6508 }, { "epoch": 0.8012063023141309, "grad_norm": 0.09842593967914581, "learning_rate": 9.945806133760315e-05, "loss": 7.4431, "step": 6509 }, { "epoch": 0.8013293943870015, "grad_norm": 0.11731752008199692, "learning_rate": 9.939647739869442e-05, "loss": 7.6832, "step": 6510 }, { "epoch": 0.801452486459872, "grad_norm": 0.09205088764429092, "learning_rate": 9.93348934597857e-05, "loss": 7.6351, "step": 6511 }, { "epoch": 0.8015755785327425, "grad_norm": 0.05775803327560425, "learning_rate": 9.927330952087696e-05, "loss": 7.4787, "step": 6512 }, { "epoch": 0.801698670605613, "grad_norm": 0.2185763269662857, "learning_rate": 9.921172558196823e-05, "loss": 8.4719, "step": 6513 }, { "epoch": 0.8018217626784835, "grad_norm": 0.10494058579206467, "learning_rate": 9.915014164305949e-05, "loss": 7.6698, "step": 6514 }, { "epoch": 0.801944854751354, "grad_norm": 0.12359733134508133, "learning_rate": 9.908855770415076e-05, "loss": 7.7597, "step": 6515 }, { "epoch": 0.8020679468242246, "grad_norm": 0.07095761597156525, "learning_rate": 9.902697376524203e-05, "loss": 7.4892, "step": 6516 }, { "epoch": 0.802191038897095, "grad_norm": 0.11368230730295181, "learning_rate": 9.89653898263333e-05, "loss": 7.9717, "step": 6517 }, { "epoch": 0.8023141309699655, "grad_norm": 0.0625767633318901, "learning_rate": 9.890380588742457e-05, "loss": 7.6046, "step": 6518 }, { "epoch": 0.802437223042836, "grad_norm": 0.12596818804740906, "learning_rate": 9.884222194851583e-05, "loss": 7.3337, "step": 6519 }, { "epoch": 0.8025603151157066, "grad_norm": 0.08051571249961853, "learning_rate": 9.87806380096071e-05, "loss": 7.4499, "step": 6520 }, { "epoch": 0.802683407188577, "grad_norm": 0.09424842149019241, "learning_rate": 9.871905407069837e-05, "loss": 7.5234, "step": 6521 }, { "epoch": 0.8028064992614475, "grad_norm": 0.07856181263923645, "learning_rate": 
9.865747013178963e-05, "loss": 7.5564, "step": 6522 }, { "epoch": 0.8029295913343181, "grad_norm": 0.11389844119548798, "learning_rate": 9.85958861928809e-05, "loss": 7.9106, "step": 6523 }, { "epoch": 0.8030526834071886, "grad_norm": 0.24827931821346283, "learning_rate": 9.853430225397217e-05, "loss": 8.3002, "step": 6524 }, { "epoch": 0.803175775480059, "grad_norm": 0.06500038504600525, "learning_rate": 9.847271831506344e-05, "loss": 7.482, "step": 6525 }, { "epoch": 0.8032988675529296, "grad_norm": 0.11140663921833038, "learning_rate": 9.84111343761547e-05, "loss": 7.827, "step": 6526 }, { "epoch": 0.8034219596258001, "grad_norm": 0.08557234704494476, "learning_rate": 9.834955043724597e-05, "loss": 7.5185, "step": 6527 }, { "epoch": 0.8035450516986706, "grad_norm": 0.4557827115058899, "learning_rate": 9.828796649833724e-05, "loss": 9.774, "step": 6528 }, { "epoch": 0.8036681437715412, "grad_norm": 0.11049939692020416, "learning_rate": 9.82263825594285e-05, "loss": 7.2724, "step": 6529 }, { "epoch": 0.8037912358444116, "grad_norm": 0.05466119199991226, "learning_rate": 9.816479862051978e-05, "loss": 7.4715, "step": 6530 }, { "epoch": 0.8039143279172821, "grad_norm": 0.1286630928516388, "learning_rate": 9.810321468161104e-05, "loss": 8.0425, "step": 6531 }, { "epoch": 0.8040374199901527, "grad_norm": 0.13914112746715546, "learning_rate": 9.804163074270231e-05, "loss": 8.4365, "step": 6532 }, { "epoch": 0.8041605120630232, "grad_norm": 0.06911702454090118, "learning_rate": 9.798004680379358e-05, "loss": 7.8592, "step": 6533 }, { "epoch": 0.8042836041358936, "grad_norm": 0.06990862637758255, "learning_rate": 9.791846286488484e-05, "loss": 7.6762, "step": 6534 }, { "epoch": 0.8044066962087641, "grad_norm": 0.06817784160375595, "learning_rate": 9.785687892597611e-05, "loss": 7.612, "step": 6535 }, { "epoch": 0.8045297882816347, "grad_norm": 0.10266204923391342, "learning_rate": 9.779529498706737e-05, "loss": 7.5701, "step": 6536 }, { "epoch": 0.8046528803545052, 
"grad_norm": 0.101488396525383, "learning_rate": 9.773371104815865e-05, "loss": 7.5539, "step": 6537 }, { "epoch": 0.8047759724273756, "grad_norm": 0.08275124430656433, "learning_rate": 9.767212710924992e-05, "loss": 7.4726, "step": 6538 }, { "epoch": 0.8048990645002462, "grad_norm": 0.07533861696720123, "learning_rate": 9.761054317034118e-05, "loss": 7.6136, "step": 6539 }, { "epoch": 0.8050221565731167, "grad_norm": 0.07550699263811111, "learning_rate": 9.754895923143245e-05, "loss": 7.508, "step": 6540 }, { "epoch": 0.8051452486459872, "grad_norm": 0.08662143349647522, "learning_rate": 9.748737529252371e-05, "loss": 7.8372, "step": 6541 }, { "epoch": 0.8052683407188577, "grad_norm": 0.2916290760040283, "learning_rate": 9.742579135361499e-05, "loss": 8.4094, "step": 6542 }, { "epoch": 0.8053914327917282, "grad_norm": 0.24892625212669373, "learning_rate": 9.736420741470626e-05, "loss": 8.6965, "step": 6543 }, { "epoch": 0.8055145248645987, "grad_norm": 0.10532937198877335, "learning_rate": 9.730262347579752e-05, "loss": 7.2475, "step": 6544 }, { "epoch": 0.8056376169374693, "grad_norm": 0.09230407327413559, "learning_rate": 9.724103953688879e-05, "loss": 7.3516, "step": 6545 }, { "epoch": 0.8057607090103397, "grad_norm": 0.09005794674158096, "learning_rate": 9.717945559798005e-05, "loss": 7.4267, "step": 6546 }, { "epoch": 0.8058838010832102, "grad_norm": 0.09246105700731277, "learning_rate": 9.711787165907132e-05, "loss": 8.1259, "step": 6547 }, { "epoch": 0.8060068931560808, "grad_norm": 0.09339240193367004, "learning_rate": 9.705628772016258e-05, "loss": 7.8417, "step": 6548 }, { "epoch": 0.8061299852289513, "grad_norm": 0.08702624589204788, "learning_rate": 9.699470378125386e-05, "loss": 7.5197, "step": 6549 }, { "epoch": 0.8062530773018217, "grad_norm": 0.09367651492357254, "learning_rate": 9.693311984234513e-05, "loss": 7.852, "step": 6550 }, { "epoch": 0.8063761693746923, "grad_norm": 0.06359563022851944, "learning_rate": 9.687153590343639e-05, "loss": 
7.5327, "step": 6551 }, { "epoch": 0.8064992614475628, "grad_norm": 0.07995682209730148, "learning_rate": 9.680995196452766e-05, "loss": 7.5063, "step": 6552 }, { "epoch": 0.8066223535204333, "grad_norm": 0.09121885150671005, "learning_rate": 9.674836802561892e-05, "loss": 7.4234, "step": 6553 }, { "epoch": 0.8067454455933037, "grad_norm": 0.06651301681995392, "learning_rate": 9.66867840867102e-05, "loss": 7.5277, "step": 6554 }, { "epoch": 0.8068685376661743, "grad_norm": 0.11788273602724075, "learning_rate": 9.662520014780147e-05, "loss": 7.2662, "step": 6555 }, { "epoch": 0.8069916297390448, "grad_norm": 0.11372199654579163, "learning_rate": 9.656361620889273e-05, "loss": 8.0722, "step": 6556 }, { "epoch": 0.8071147218119153, "grad_norm": 0.07038227468729019, "learning_rate": 9.6502032269984e-05, "loss": 7.5042, "step": 6557 }, { "epoch": 0.8072378138847858, "grad_norm": 0.07821254432201385, "learning_rate": 9.644044833107526e-05, "loss": 7.8665, "step": 6558 }, { "epoch": 0.8073609059576563, "grad_norm": 0.08300699293613434, "learning_rate": 9.637886439216652e-05, "loss": 7.3417, "step": 6559 }, { "epoch": 0.8074839980305268, "grad_norm": 0.08960197865962982, "learning_rate": 9.631728045325779e-05, "loss": 7.7668, "step": 6560 }, { "epoch": 0.8076070901033974, "grad_norm": 0.10631300508975983, "learning_rate": 9.625569651434905e-05, "loss": 7.3135, "step": 6561 }, { "epoch": 0.8077301821762678, "grad_norm": 0.5292361378669739, "learning_rate": 9.619411257544032e-05, "loss": 10.2645, "step": 6562 }, { "epoch": 0.8078532742491383, "grad_norm": 0.10526347160339355, "learning_rate": 9.613252863653158e-05, "loss": 7.6804, "step": 6563 }, { "epoch": 0.8079763663220089, "grad_norm": 0.12715551257133484, "learning_rate": 9.607094469762286e-05, "loss": 7.8064, "step": 6564 }, { "epoch": 0.8080994583948794, "grad_norm": 0.0718497559428215, "learning_rate": 9.600936075871413e-05, "loss": 7.4667, "step": 6565 }, { "epoch": 0.8082225504677498, "grad_norm": 
0.08361497521400452, "learning_rate": 9.594777681980539e-05, "loss": 7.8385, "step": 6566 }, { "epoch": 0.8083456425406204, "grad_norm": 0.264186829328537, "learning_rate": 9.588619288089666e-05, "loss": 8.318, "step": 6567 }, { "epoch": 0.8084687346134909, "grad_norm": 0.2597755789756775, "learning_rate": 9.582460894198792e-05, "loss": 8.991, "step": 6568 }, { "epoch": 0.8085918266863614, "grad_norm": 0.09263939410448074, "learning_rate": 9.57630250030792e-05, "loss": 7.4281, "step": 6569 }, { "epoch": 0.808714918759232, "grad_norm": 0.09575526416301727, "learning_rate": 9.570144106417045e-05, "loss": 8.1922, "step": 6570 }, { "epoch": 0.8088380108321024, "grad_norm": 0.10874077677726746, "learning_rate": 9.563985712526173e-05, "loss": 7.4425, "step": 6571 }, { "epoch": 0.8089611029049729, "grad_norm": 0.06891971826553345, "learning_rate": 9.5578273186353e-05, "loss": 7.5991, "step": 6572 }, { "epoch": 0.8090841949778435, "grad_norm": 0.05721572786569595, "learning_rate": 9.551668924744426e-05, "loss": 7.724, "step": 6573 }, { "epoch": 0.809207287050714, "grad_norm": 0.16326113045215607, "learning_rate": 9.545510530853553e-05, "loss": 8.6032, "step": 6574 }, { "epoch": 0.8093303791235844, "grad_norm": 0.6540035605430603, "learning_rate": 9.539352136962679e-05, "loss": 10.9855, "step": 6575 }, { "epoch": 0.8094534711964549, "grad_norm": 0.0768023133277893, "learning_rate": 9.533193743071807e-05, "loss": 7.6443, "step": 6576 }, { "epoch": 0.8095765632693255, "grad_norm": 0.20893514156341553, "learning_rate": 9.527035349180934e-05, "loss": 7.1563, "step": 6577 }, { "epoch": 0.809699655342196, "grad_norm": 0.12598416209220886, "learning_rate": 9.52087695529006e-05, "loss": 7.5882, "step": 6578 }, { "epoch": 0.8098227474150664, "grad_norm": 0.10898783057928085, "learning_rate": 9.514718561399187e-05, "loss": 7.8467, "step": 6579 }, { "epoch": 0.809945839487937, "grad_norm": 0.15507838129997253, "learning_rate": 9.508560167508313e-05, "loss": 7.5881, "step": 6580 }, { 
"epoch": 0.8100689315608075, "grad_norm": 0.12429346144199371, "learning_rate": 9.50240177361744e-05, "loss": 8.0282, "step": 6581 }, { "epoch": 0.810192023633678, "grad_norm": 0.07709048688411713, "learning_rate": 9.496243379726568e-05, "loss": 7.6689, "step": 6582 }, { "epoch": 0.8103151157065485, "grad_norm": 0.06470634788274765, "learning_rate": 9.490084985835694e-05, "loss": 7.7541, "step": 6583 }, { "epoch": 0.810438207779419, "grad_norm": 0.07439402490854263, "learning_rate": 9.483926591944821e-05, "loss": 7.6696, "step": 6584 }, { "epoch": 0.8105612998522895, "grad_norm": 0.0926554873585701, "learning_rate": 9.477768198053947e-05, "loss": 8.0549, "step": 6585 }, { "epoch": 0.8106843919251601, "grad_norm": 0.109360471367836, "learning_rate": 9.471609804163074e-05, "loss": 8.0366, "step": 6586 }, { "epoch": 0.8108074839980305, "grad_norm": 0.08641762286424637, "learning_rate": 9.465451410272201e-05, "loss": 7.9411, "step": 6587 }, { "epoch": 0.810930576070901, "grad_norm": 0.5690281391143799, "learning_rate": 9.459293016381327e-05, "loss": 6.7802, "step": 6588 }, { "epoch": 0.8110536681437716, "grad_norm": 0.0824870690703392, "learning_rate": 9.453134622490455e-05, "loss": 7.5745, "step": 6589 }, { "epoch": 0.8111767602166421, "grad_norm": 0.06512288749217987, "learning_rate": 9.44697622859958e-05, "loss": 7.3097, "step": 6590 }, { "epoch": 0.8112998522895125, "grad_norm": 0.3103925287723541, "learning_rate": 9.440817834708708e-05, "loss": 9.2891, "step": 6591 }, { "epoch": 0.811422944362383, "grad_norm": 0.06431545317173004, "learning_rate": 9.434659440817835e-05, "loss": 7.4464, "step": 6592 }, { "epoch": 0.8115460364352536, "grad_norm": 0.07166223227977753, "learning_rate": 9.428501046926961e-05, "loss": 7.4628, "step": 6593 }, { "epoch": 0.8116691285081241, "grad_norm": 0.06959118694067001, "learning_rate": 9.422342653036089e-05, "loss": 7.5949, "step": 6594 }, { "epoch": 0.8117922205809945, "grad_norm": 0.07642317563295364, "learning_rate": 
9.416184259145214e-05, "loss": 7.7187, "step": 6595 }, { "epoch": 0.8119153126538651, "grad_norm": 0.164288729429245, "learning_rate": 9.410025865254342e-05, "loss": 8.1073, "step": 6596 }, { "epoch": 0.8120384047267356, "grad_norm": 0.2369329035282135, "learning_rate": 9.403867471363468e-05, "loss": 8.4167, "step": 6597 }, { "epoch": 0.8121614967996061, "grad_norm": 0.17225374281406403, "learning_rate": 9.397709077472595e-05, "loss": 7.7983, "step": 6598 }, { "epoch": 0.8122845888724766, "grad_norm": 0.1267446130514145, "learning_rate": 9.391550683581722e-05, "loss": 8.0263, "step": 6599 }, { "epoch": 0.8124076809453471, "grad_norm": 0.12512072920799255, "learning_rate": 9.385392289690848e-05, "loss": 7.4831, "step": 6600 }, { "epoch": 0.8125307730182176, "grad_norm": 0.10807160288095474, "learning_rate": 9.379233895799976e-05, "loss": 7.5081, "step": 6601 }, { "epoch": 0.8126538650910882, "grad_norm": 0.10413964837789536, "learning_rate": 9.373075501909102e-05, "loss": 7.5525, "step": 6602 }, { "epoch": 0.8127769571639586, "grad_norm": 0.11367016285657883, "learning_rate": 9.366917108018229e-05, "loss": 8.0203, "step": 6603 }, { "epoch": 0.8129000492368291, "grad_norm": 0.07218604534864426, "learning_rate": 9.360758714127356e-05, "loss": 7.9048, "step": 6604 }, { "epoch": 0.8130231413096997, "grad_norm": 0.08407679945230484, "learning_rate": 9.354600320236482e-05, "loss": 7.4646, "step": 6605 }, { "epoch": 0.8131462333825702, "grad_norm": 0.07583579421043396, "learning_rate": 9.34844192634561e-05, "loss": 7.8883, "step": 6606 }, { "epoch": 0.8132693254554406, "grad_norm": 0.19603653252124786, "learning_rate": 9.342283532454735e-05, "loss": 8.4234, "step": 6607 }, { "epoch": 0.8133924175283112, "grad_norm": 0.13419964909553528, "learning_rate": 9.336125138563863e-05, "loss": 8.3725, "step": 6608 }, { "epoch": 0.8135155096011817, "grad_norm": 0.09093289077281952, "learning_rate": 9.32996674467299e-05, "loss": 8.1461, "step": 6609 }, { "epoch": 0.8136386016740522, 
"grad_norm": 0.07794254273176193, "learning_rate": 9.323808350782116e-05, "loss": 7.5713, "step": 6610 }, { "epoch": 0.8137616937469226, "grad_norm": 0.11572771519422531, "learning_rate": 9.317649956891243e-05, "loss": 8.1992, "step": 6611 }, { "epoch": 0.8138847858197932, "grad_norm": 0.13711676001548767, "learning_rate": 9.311491563000369e-05, "loss": 7.6611, "step": 6612 }, { "epoch": 0.8140078778926637, "grad_norm": 0.07078258693218231, "learning_rate": 9.305333169109496e-05, "loss": 7.912, "step": 6613 }, { "epoch": 0.8141309699655342, "grad_norm": 0.0909235030412674, "learning_rate": 9.299174775218624e-05, "loss": 7.5542, "step": 6614 }, { "epoch": 0.8142540620384048, "grad_norm": 0.11656167358160019, "learning_rate": 9.29301638132775e-05, "loss": 7.3538, "step": 6615 }, { "epoch": 0.8143771541112752, "grad_norm": 0.1306883990764618, "learning_rate": 9.286857987436877e-05, "loss": 7.268, "step": 6616 }, { "epoch": 0.8145002461841457, "grad_norm": 0.11192520707845688, "learning_rate": 9.280699593546003e-05, "loss": 7.5842, "step": 6617 }, { "epoch": 0.8146233382570163, "grad_norm": 0.09347724169492722, "learning_rate": 9.27454119965513e-05, "loss": 7.701, "step": 6618 }, { "epoch": 0.8147464303298868, "grad_norm": 0.09998559951782227, "learning_rate": 9.268382805764256e-05, "loss": 8.0425, "step": 6619 }, { "epoch": 0.8148695224027572, "grad_norm": 0.07598860561847687, "learning_rate": 9.262224411873384e-05, "loss": 8.0869, "step": 6620 }, { "epoch": 0.8149926144756278, "grad_norm": 0.06115136668086052, "learning_rate": 9.256066017982511e-05, "loss": 7.6065, "step": 6621 }, { "epoch": 0.8151157065484983, "grad_norm": 0.06945616751909256, "learning_rate": 9.249907624091637e-05, "loss": 7.4707, "step": 6622 }, { "epoch": 0.8152387986213688, "grad_norm": 0.07950346916913986, "learning_rate": 9.243749230200764e-05, "loss": 8.1666, "step": 6623 }, { "epoch": 0.8153618906942393, "grad_norm": 0.13691698014736176, "learning_rate": 9.23759083630989e-05, "loss": 7.8244, 
"step": 6624 }, { "epoch": 0.8154849827671098, "grad_norm": 0.087422214448452, "learning_rate": 9.231432442419017e-05, "loss": 7.6102, "step": 6625 }, { "epoch": 0.8156080748399803, "grad_norm": 0.12234289944171906, "learning_rate": 9.225274048528145e-05, "loss": 7.29, "step": 6626 }, { "epoch": 0.8157311669128509, "grad_norm": 0.0694064125418663, "learning_rate": 9.21911565463727e-05, "loss": 7.8706, "step": 6627 }, { "epoch": 0.8158542589857213, "grad_norm": 0.08223510533571243, "learning_rate": 9.212957260746398e-05, "loss": 7.497, "step": 6628 }, { "epoch": 0.8159773510585918, "grad_norm": 0.13823337852954865, "learning_rate": 9.206798866855524e-05, "loss": 7.122, "step": 6629 }, { "epoch": 0.8161004431314623, "grad_norm": 0.053706757724285126, "learning_rate": 9.200640472964651e-05, "loss": 7.6183, "step": 6630 }, { "epoch": 0.8162235352043329, "grad_norm": 0.11511629074811935, "learning_rate": 9.194482079073778e-05, "loss": 7.9915, "step": 6631 }, { "epoch": 0.8163466272772033, "grad_norm": 0.06345325708389282, "learning_rate": 9.188323685182904e-05, "loss": 7.4558, "step": 6632 }, { "epoch": 0.8164697193500738, "grad_norm": 0.38086336851119995, "learning_rate": 9.182165291292032e-05, "loss": 9.2174, "step": 6633 }, { "epoch": 0.8165928114229444, "grad_norm": 0.10791723430156708, "learning_rate": 9.176006897401158e-05, "loss": 7.6414, "step": 6634 }, { "epoch": 0.8167159034958149, "grad_norm": 0.07225697487592697, "learning_rate": 9.169848503510285e-05, "loss": 7.4316, "step": 6635 }, { "epoch": 0.8168389955686853, "grad_norm": 0.1456984132528305, "learning_rate": 9.163690109619412e-05, "loss": 7.8166, "step": 6636 }, { "epoch": 0.8169620876415559, "grad_norm": 0.0676427111029625, "learning_rate": 9.157531715728538e-05, "loss": 7.4737, "step": 6637 }, { "epoch": 0.8170851797144264, "grad_norm": 0.08409228920936584, "learning_rate": 9.151373321837666e-05, "loss": 7.5855, "step": 6638 }, { "epoch": 0.8172082717872969, "grad_norm": 0.08740167319774628, 
"learning_rate": 9.145214927946791e-05, "loss": 7.3416, "step": 6639 }, { "epoch": 0.8173313638601674, "grad_norm": 0.0678551122546196, "learning_rate": 9.139056534055919e-05, "loss": 7.9628, "step": 6640 }, { "epoch": 0.8174544559330379, "grad_norm": 0.07752922922372818, "learning_rate": 9.132898140165046e-05, "loss": 7.8355, "step": 6641 }, { "epoch": 0.8175775480059084, "grad_norm": 0.10045070946216583, "learning_rate": 9.126739746274172e-05, "loss": 7.5643, "step": 6642 }, { "epoch": 0.817700640078779, "grad_norm": 0.11958841234445572, "learning_rate": 9.1205813523833e-05, "loss": 7.4464, "step": 6643 }, { "epoch": 0.8178237321516494, "grad_norm": 0.1082979366183281, "learning_rate": 9.114422958492425e-05, "loss": 7.7675, "step": 6644 }, { "epoch": 0.8179468242245199, "grad_norm": 0.10752226412296295, "learning_rate": 9.108264564601553e-05, "loss": 7.5624, "step": 6645 }, { "epoch": 0.8180699162973905, "grad_norm": 0.07863832265138626, "learning_rate": 9.102106170710679e-05, "loss": 7.4662, "step": 6646 }, { "epoch": 0.818193008370261, "grad_norm": 0.05526598542928696, "learning_rate": 9.095947776819806e-05, "loss": 7.6075, "step": 6647 }, { "epoch": 0.8183161004431314, "grad_norm": 0.07481855899095535, "learning_rate": 9.089789382928933e-05, "loss": 7.6184, "step": 6648 }, { "epoch": 0.818439192516002, "grad_norm": 0.0633615106344223, "learning_rate": 9.083630989038059e-05, "loss": 7.5501, "step": 6649 }, { "epoch": 0.8185622845888725, "grad_norm": 0.10072807967662811, "learning_rate": 9.077472595147186e-05, "loss": 7.6561, "step": 6650 }, { "epoch": 0.818685376661743, "grad_norm": 0.2885877192020416, "learning_rate": 9.071314201256312e-05, "loss": 8.6821, "step": 6651 }, { "epoch": 0.8188084687346134, "grad_norm": 0.0937081128358841, "learning_rate": 9.06515580736544e-05, "loss": 7.6567, "step": 6652 }, { "epoch": 0.818931560807484, "grad_norm": 0.15105175971984863, "learning_rate": 9.058997413474567e-05, "loss": 7.493, "step": 6653 }, { "epoch": 
0.8190546528803545, "grad_norm": 0.06476859003305435, "learning_rate": 9.052839019583693e-05, "loss": 7.5009, "step": 6654 }, { "epoch": 0.819177744953225, "grad_norm": 0.0681908056139946, "learning_rate": 9.04668062569282e-05, "loss": 7.5588, "step": 6655 }, { "epoch": 0.8193008370260956, "grad_norm": 0.06858361512422562, "learning_rate": 9.040522231801946e-05, "loss": 7.4336, "step": 6656 }, { "epoch": 0.819423929098966, "grad_norm": 0.16164463758468628, "learning_rate": 9.034363837911073e-05, "loss": 8.4731, "step": 6657 }, { "epoch": 0.8195470211718365, "grad_norm": 0.11302650719881058, "learning_rate": 9.028205444020201e-05, "loss": 7.7168, "step": 6658 }, { "epoch": 0.8196701132447071, "grad_norm": 0.084923155605793, "learning_rate": 9.022047050129327e-05, "loss": 7.6692, "step": 6659 }, { "epoch": 0.8197932053175776, "grad_norm": 0.11511144042015076, "learning_rate": 9.015888656238454e-05, "loss": 7.2638, "step": 6660 }, { "epoch": 0.819916297390448, "grad_norm": 0.09426581859588623, "learning_rate": 9.00973026234758e-05, "loss": 7.4379, "step": 6661 }, { "epoch": 0.8200393894633186, "grad_norm": 0.06432440131902695, "learning_rate": 9.003571868456707e-05, "loss": 7.6707, "step": 6662 }, { "epoch": 0.8201624815361891, "grad_norm": 0.0860624834895134, "learning_rate": 8.997413474565835e-05, "loss": 7.4591, "step": 6663 }, { "epoch": 0.8202855736090596, "grad_norm": 0.12182551622390747, "learning_rate": 8.99125508067496e-05, "loss": 7.4283, "step": 6664 }, { "epoch": 0.82040866568193, "grad_norm": 0.07798215746879578, "learning_rate": 8.985096686784088e-05, "loss": 7.5006, "step": 6665 }, { "epoch": 0.8205317577548006, "grad_norm": 0.14886733889579773, "learning_rate": 8.978938292893214e-05, "loss": 7.9112, "step": 6666 }, { "epoch": 0.8206548498276711, "grad_norm": 0.07366029918193817, "learning_rate": 8.972779899002341e-05, "loss": 7.6799, "step": 6667 }, { "epoch": 0.8207779419005417, "grad_norm": 0.12020090967416763, "learning_rate": 8.966621505111467e-05, 
"loss": 7.7891, "step": 6668 }, { "epoch": 0.8209010339734121, "grad_norm": 0.13981559872627258, "learning_rate": 8.960463111220594e-05, "loss": 8.0575, "step": 6669 }, { "epoch": 0.8210241260462826, "grad_norm": 0.26811936497688293, "learning_rate": 8.954304717329722e-05, "loss": 8.5899, "step": 6670 }, { "epoch": 0.8211472181191531, "grad_norm": 0.06022938713431358, "learning_rate": 8.948146323438848e-05, "loss": 7.6255, "step": 6671 }, { "epoch": 0.8212703101920237, "grad_norm": 0.12772247195243835, "learning_rate": 8.941987929547975e-05, "loss": 7.3666, "step": 6672 }, { "epoch": 0.8213934022648941, "grad_norm": 0.11578650027513504, "learning_rate": 8.935829535657101e-05, "loss": 7.5185, "step": 6673 }, { "epoch": 0.8215164943377646, "grad_norm": 0.12348074465990067, "learning_rate": 8.929671141766228e-05, "loss": 7.5147, "step": 6674 }, { "epoch": 0.8216395864106352, "grad_norm": 0.0911109447479248, "learning_rate": 8.923512747875354e-05, "loss": 7.3879, "step": 6675 }, { "epoch": 0.8217626784835057, "grad_norm": 0.07013063877820969, "learning_rate": 8.91735435398448e-05, "loss": 7.4221, "step": 6676 }, { "epoch": 0.8218857705563761, "grad_norm": 0.2543866038322449, "learning_rate": 8.911195960093607e-05, "loss": 9.3065, "step": 6677 }, { "epoch": 0.8220088626292467, "grad_norm": 0.10197903960943222, "learning_rate": 8.905037566202733e-05, "loss": 7.9727, "step": 6678 }, { "epoch": 0.8221319547021172, "grad_norm": 0.07126988470554352, "learning_rate": 8.89887917231186e-05, "loss": 7.6033, "step": 6679 }, { "epoch": 0.8222550467749877, "grad_norm": 0.07179020345211029, "learning_rate": 8.892720778420988e-05, "loss": 7.5451, "step": 6680 }, { "epoch": 0.8223781388478582, "grad_norm": 0.07755204290151596, "learning_rate": 8.886562384530114e-05, "loss": 7.5377, "step": 6681 }, { "epoch": 0.8225012309207287, "grad_norm": 0.06478018313646317, "learning_rate": 8.880403990639241e-05, "loss": 7.4994, "step": 6682 }, { "epoch": 0.8226243229935992, "grad_norm": 
0.061418283730745316, "learning_rate": 8.874245596748367e-05, "loss": 7.8289, "step": 6683 }, { "epoch": 0.8227474150664698, "grad_norm": 0.08305204659700394, "learning_rate": 8.868087202857494e-05, "loss": 8.056, "step": 6684 }, { "epoch": 0.8228705071393402, "grad_norm": 0.07954297214746475, "learning_rate": 8.861928808966622e-05, "loss": 7.4011, "step": 6685 }, { "epoch": 0.8229935992122107, "grad_norm": 0.10263330489397049, "learning_rate": 8.855770415075748e-05, "loss": 7.6921, "step": 6686 }, { "epoch": 0.8231166912850812, "grad_norm": 0.08127713948488235, "learning_rate": 8.849612021184875e-05, "loss": 7.736, "step": 6687 }, { "epoch": 0.8232397833579518, "grad_norm": 0.09087575972080231, "learning_rate": 8.843453627294001e-05, "loss": 7.3859, "step": 6688 }, { "epoch": 0.8233628754308222, "grad_norm": 0.06886154413223267, "learning_rate": 8.837295233403128e-05, "loss": 7.3987, "step": 6689 }, { "epoch": 0.8234859675036927, "grad_norm": 0.0999327078461647, "learning_rate": 8.831136839512254e-05, "loss": 7.4321, "step": 6690 }, { "epoch": 0.8236090595765633, "grad_norm": 0.14430394768714905, "learning_rate": 8.824978445621382e-05, "loss": 7.8568, "step": 6691 }, { "epoch": 0.8237321516494338, "grad_norm": 0.13083045184612274, "learning_rate": 8.818820051730509e-05, "loss": 7.7411, "step": 6692 }, { "epoch": 0.8238552437223042, "grad_norm": 0.06587351113557816, "learning_rate": 8.812661657839635e-05, "loss": 7.473, "step": 6693 }, { "epoch": 0.8239783357951748, "grad_norm": 0.11960699409246445, "learning_rate": 8.806503263948762e-05, "loss": 7.8228, "step": 6694 }, { "epoch": 0.8241014278680453, "grad_norm": 0.09448560327291489, "learning_rate": 8.800344870057888e-05, "loss": 7.6417, "step": 6695 }, { "epoch": 0.8242245199409158, "grad_norm": 0.19851435720920563, "learning_rate": 8.794186476167015e-05, "loss": 8.6675, "step": 6696 }, { "epoch": 0.8243476120137863, "grad_norm": 0.11712714284658432, "learning_rate": 8.788028082276143e-05, "loss": 7.9743, "step": 
6697 }, { "epoch": 0.8244707040866568, "grad_norm": 0.09619078785181046, "learning_rate": 8.781869688385269e-05, "loss": 7.7836, "step": 6698 }, { "epoch": 0.8245937961595273, "grad_norm": 0.0714213028550148, "learning_rate": 8.775711294494396e-05, "loss": 7.7049, "step": 6699 }, { "epoch": 0.8247168882323979, "grad_norm": 0.16913606226444244, "learning_rate": 8.769552900603522e-05, "loss": 7.3303, "step": 6700 }, { "epoch": 0.8248399803052684, "grad_norm": 0.10324226319789886, "learning_rate": 8.763394506712649e-05, "loss": 7.6574, "step": 6701 }, { "epoch": 0.8249630723781388, "grad_norm": 0.11673307418823242, "learning_rate": 8.757236112821776e-05, "loss": 8.3422, "step": 6702 }, { "epoch": 0.8250861644510094, "grad_norm": 0.12151999771595001, "learning_rate": 8.751077718930902e-05, "loss": 7.8932, "step": 6703 }, { "epoch": 0.8252092565238799, "grad_norm": 0.0682242214679718, "learning_rate": 8.74491932504003e-05, "loss": 7.6744, "step": 6704 }, { "epoch": 0.8253323485967504, "grad_norm": 0.08643493801355362, "learning_rate": 8.738760931149156e-05, "loss": 8.0741, "step": 6705 }, { "epoch": 0.8254554406696208, "grad_norm": 0.26642656326293945, "learning_rate": 8.732602537258283e-05, "loss": 8.8073, "step": 6706 }, { "epoch": 0.8255785327424914, "grad_norm": 0.120152048766613, "learning_rate": 8.72644414336741e-05, "loss": 7.3818, "step": 6707 }, { "epoch": 0.8257016248153619, "grad_norm": 0.1203724816441536, "learning_rate": 8.720285749476536e-05, "loss": 7.4566, "step": 6708 }, { "epoch": 0.8258247168882324, "grad_norm": 0.13589046895503998, "learning_rate": 8.714127355585664e-05, "loss": 7.5959, "step": 6709 }, { "epoch": 0.8259478089611029, "grad_norm": 0.17520496249198914, "learning_rate": 8.70796896169479e-05, "loss": 8.3706, "step": 6710 }, { "epoch": 0.8260709010339734, "grad_norm": 0.375379741191864, "learning_rate": 8.701810567803917e-05, "loss": 6.8291, "step": 6711 }, { "epoch": 0.8261939931068439, "grad_norm": 0.19971781969070435, "learning_rate": 
8.695652173913044e-05, "loss": 8.4165, "step": 6712 }, { "epoch": 0.8263170851797145, "grad_norm": 0.15915589034557343, "learning_rate": 8.68949378002217e-05, "loss": 7.7399, "step": 6713 }, { "epoch": 0.8264401772525849, "grad_norm": 0.09595853090286255, "learning_rate": 8.683335386131297e-05, "loss": 8.0206, "step": 6714 }, { "epoch": 0.8265632693254554, "grad_norm": 0.10852285474538803, "learning_rate": 8.677176992240423e-05, "loss": 7.8485, "step": 6715 }, { "epoch": 0.826686361398326, "grad_norm": 0.11132009327411652, "learning_rate": 8.67101859834955e-05, "loss": 8.213, "step": 6716 }, { "epoch": 0.8268094534711965, "grad_norm": 0.21978673338890076, "learning_rate": 8.664860204458677e-05, "loss": 7.3567, "step": 6717 }, { "epoch": 0.8269325455440669, "grad_norm": 0.06899858266115189, "learning_rate": 8.658701810567804e-05, "loss": 7.4898, "step": 6718 }, { "epoch": 0.8270556376169375, "grad_norm": 0.0871182456612587, "learning_rate": 8.652543416676931e-05, "loss": 7.4712, "step": 6719 }, { "epoch": 0.827178729689808, "grad_norm": 0.07881707698106766, "learning_rate": 8.646385022786057e-05, "loss": 7.2757, "step": 6720 }, { "epoch": 0.8273018217626785, "grad_norm": 0.12222957611083984, "learning_rate": 8.640226628895184e-05, "loss": 7.8693, "step": 6721 }, { "epoch": 0.827424913835549, "grad_norm": 0.10397548228502274, "learning_rate": 8.63406823500431e-05, "loss": 7.4725, "step": 6722 }, { "epoch": 0.8275480059084195, "grad_norm": 0.08442515879869461, "learning_rate": 8.627909841113438e-05, "loss": 7.788, "step": 6723 }, { "epoch": 0.82767109798129, "grad_norm": 0.10151482373476028, "learning_rate": 8.621751447222565e-05, "loss": 7.5904, "step": 6724 }, { "epoch": 0.8277941900541606, "grad_norm": 0.1236427053809166, "learning_rate": 8.615593053331691e-05, "loss": 7.8447, "step": 6725 }, { "epoch": 0.827917282127031, "grad_norm": 0.2541850209236145, "learning_rate": 8.609434659440818e-05, "loss": 8.543, "step": 6726 }, { "epoch": 0.8280403741999015, 
"grad_norm": 0.08961803466081619, "learning_rate": 8.603276265549944e-05, "loss": 7.9128, "step": 6727 }, { "epoch": 0.828163466272772, "grad_norm": 0.07474515587091446, "learning_rate": 8.597117871659071e-05, "loss": 7.459, "step": 6728 }, { "epoch": 0.8282865583456426, "grad_norm": 0.15636293590068817, "learning_rate": 8.590959477768199e-05, "loss": 7.6979, "step": 6729 }, { "epoch": 0.828409650418513, "grad_norm": 0.07757542282342911, "learning_rate": 8.584801083877325e-05, "loss": 7.5765, "step": 6730 }, { "epoch": 0.8285327424913835, "grad_norm": 0.13783055543899536, "learning_rate": 8.578642689986452e-05, "loss": 7.2868, "step": 6731 }, { "epoch": 0.8286558345642541, "grad_norm": 0.16608826816082, "learning_rate": 8.572484296095578e-05, "loss": 8.8569, "step": 6732 }, { "epoch": 0.8287789266371246, "grad_norm": 0.2938428223133087, "learning_rate": 8.566325902204705e-05, "loss": 8.9058, "step": 6733 }, { "epoch": 0.828902018709995, "grad_norm": 0.10753583163022995, "learning_rate": 8.560167508313833e-05, "loss": 8.002, "step": 6734 }, { "epoch": 0.8290251107828656, "grad_norm": 0.13331282138824463, "learning_rate": 8.554009114422959e-05, "loss": 7.5861, "step": 6735 }, { "epoch": 0.8291482028557361, "grad_norm": 0.09456172585487366, "learning_rate": 8.547850720532086e-05, "loss": 7.4944, "step": 6736 }, { "epoch": 0.8292712949286066, "grad_norm": 0.07212777435779572, "learning_rate": 8.541692326641212e-05, "loss": 7.9347, "step": 6737 }, { "epoch": 0.829394387001477, "grad_norm": 0.11804597824811935, "learning_rate": 8.535533932750339e-05, "loss": 7.4895, "step": 6738 }, { "epoch": 0.8295174790743476, "grad_norm": 0.11548922955989838, "learning_rate": 8.529375538859465e-05, "loss": 8.1094, "step": 6739 }, { "epoch": 0.8296405711472181, "grad_norm": 0.0929638147354126, "learning_rate": 8.523217144968592e-05, "loss": 7.6469, "step": 6740 }, { "epoch": 0.8297636632200887, "grad_norm": 0.07353908568620682, "learning_rate": 8.51705875107772e-05, "loss": 7.6386, 
"step": 6741 }, { "epoch": 0.8298867552929592, "grad_norm": 0.06400148570537567, "learning_rate": 8.510900357186846e-05, "loss": 7.4829, "step": 6742 }, { "epoch": 0.8300098473658296, "grad_norm": 0.13908328115940094, "learning_rate": 8.504741963295973e-05, "loss": 7.8172, "step": 6743 }, { "epoch": 0.8301329394387001, "grad_norm": 0.14604854583740234, "learning_rate": 8.498583569405099e-05, "loss": 8.2403, "step": 6744 }, { "epoch": 0.8302560315115707, "grad_norm": 0.11147890239953995, "learning_rate": 8.492425175514226e-05, "loss": 7.6094, "step": 6745 }, { "epoch": 0.8303791235844412, "grad_norm": 0.12288542091846466, "learning_rate": 8.486266781623353e-05, "loss": 7.3794, "step": 6746 }, { "epoch": 0.8305022156573116, "grad_norm": 0.11851118505001068, "learning_rate": 8.48010838773248e-05, "loss": 7.2896, "step": 6747 }, { "epoch": 0.8306253077301822, "grad_norm": 0.06214582175016403, "learning_rate": 8.473949993841607e-05, "loss": 7.4317, "step": 6748 }, { "epoch": 0.8307483998030527, "grad_norm": 0.1174163967370987, "learning_rate": 8.467791599950733e-05, "loss": 7.5404, "step": 6749 }, { "epoch": 0.8308714918759232, "grad_norm": 0.1443941742181778, "learning_rate": 8.46163320605986e-05, "loss": 7.561, "step": 6750 }, { "epoch": 0.8309945839487937, "grad_norm": 0.07347267121076584, "learning_rate": 8.455474812168987e-05, "loss": 7.9623, "step": 6751 }, { "epoch": 0.8311176760216642, "grad_norm": 0.0795431062579155, "learning_rate": 8.449316418278113e-05, "loss": 7.5484, "step": 6752 }, { "epoch": 0.8312407680945347, "grad_norm": 0.06768271327018738, "learning_rate": 8.44315802438724e-05, "loss": 7.5069, "step": 6753 }, { "epoch": 0.8313638601674053, "grad_norm": 0.17608462274074554, "learning_rate": 8.436999630496366e-05, "loss": 7.6615, "step": 6754 }, { "epoch": 0.8314869522402757, "grad_norm": 0.15335048735141754, "learning_rate": 8.430841236605494e-05, "loss": 7.6628, "step": 6755 }, { "epoch": 0.8316100443131462, "grad_norm": 0.06346677988767624, 
"learning_rate": 8.424682842714621e-05, "loss": 7.5068, "step": 6756 }, { "epoch": 0.8317331363860168, "grad_norm": 0.07271149009466171, "learning_rate": 8.418524448823747e-05, "loss": 7.3955, "step": 6757 }, { "epoch": 0.8318562284588873, "grad_norm": 0.09312809258699417, "learning_rate": 8.412366054932874e-05, "loss": 7.5496, "step": 6758 }, { "epoch": 0.8319793205317577, "grad_norm": 0.11856772750616074, "learning_rate": 8.406207661042e-05, "loss": 8.0388, "step": 6759 }, { "epoch": 0.8321024126046282, "grad_norm": 0.10290893167257309, "learning_rate": 8.400049267151128e-05, "loss": 7.5321, "step": 6760 }, { "epoch": 0.8322255046774988, "grad_norm": 0.09457536786794662, "learning_rate": 8.393890873260255e-05, "loss": 7.7403, "step": 6761 }, { "epoch": 0.8323485967503693, "grad_norm": 0.05959553271532059, "learning_rate": 8.387732479369381e-05, "loss": 7.3827, "step": 6762 }, { "epoch": 0.8324716888232397, "grad_norm": 0.0991046354174614, "learning_rate": 8.381574085478508e-05, "loss": 7.3476, "step": 6763 }, { "epoch": 0.8325947808961103, "grad_norm": 0.07896865904331207, "learning_rate": 8.375415691587634e-05, "loss": 7.5338, "step": 6764 }, { "epoch": 0.8327178729689808, "grad_norm": 0.10044057667255402, "learning_rate": 8.369257297696761e-05, "loss": 7.5819, "step": 6765 }, { "epoch": 0.8328409650418513, "grad_norm": 0.06330558657646179, "learning_rate": 8.363098903805887e-05, "loss": 7.4836, "step": 6766 }, { "epoch": 0.8329640571147218, "grad_norm": 0.06296330690383911, "learning_rate": 8.356940509915015e-05, "loss": 7.7545, "step": 6767 }, { "epoch": 0.8330871491875923, "grad_norm": 0.08664972335100174, "learning_rate": 8.350782116024142e-05, "loss": 7.2756, "step": 6768 }, { "epoch": 0.8332102412604628, "grad_norm": 0.09977825731039047, "learning_rate": 8.344623722133268e-05, "loss": 7.5668, "step": 6769 }, { "epoch": 0.8333333333333334, "grad_norm": 0.1021561399102211, "learning_rate": 8.338465328242395e-05, "loss": 7.8576, "step": 6770 }, { "epoch": 
0.8334564254062038, "grad_norm": 0.0669650137424469, "learning_rate": 8.332306934351521e-05, "loss": 7.7033, "step": 6771 }, { "epoch": 0.8335795174790743, "grad_norm": 0.10734309256076813, "learning_rate": 8.326148540460648e-05, "loss": 8.0566, "step": 6772 }, { "epoch": 0.8337026095519449, "grad_norm": 0.0922446921467781, "learning_rate": 8.319990146569776e-05, "loss": 7.2991, "step": 6773 }, { "epoch": 0.8338257016248154, "grad_norm": 0.1038103848695755, "learning_rate": 8.313831752678902e-05, "loss": 8.0082, "step": 6774 }, { "epoch": 0.8339487936976858, "grad_norm": 0.05941910669207573, "learning_rate": 8.307673358788029e-05, "loss": 7.5345, "step": 6775 }, { "epoch": 0.8340718857705564, "grad_norm": 0.06335527449846268, "learning_rate": 8.301514964897155e-05, "loss": 7.5683, "step": 6776 }, { "epoch": 0.8341949778434269, "grad_norm": 0.08872654289007187, "learning_rate": 8.295356571006282e-05, "loss": 7.8034, "step": 6777 }, { "epoch": 0.8343180699162974, "grad_norm": 0.10267794877290726, "learning_rate": 8.28919817711541e-05, "loss": 7.9464, "step": 6778 }, { "epoch": 0.8344411619891678, "grad_norm": 0.0670291930437088, "learning_rate": 8.283039783224536e-05, "loss": 7.7403, "step": 6779 }, { "epoch": 0.8345642540620384, "grad_norm": 0.2151390165090561, "learning_rate": 8.276881389333663e-05, "loss": 8.5767, "step": 6780 }, { "epoch": 0.8346873461349089, "grad_norm": 0.06335441768169403, "learning_rate": 8.270722995442789e-05, "loss": 7.6039, "step": 6781 }, { "epoch": 0.8348104382077794, "grad_norm": 0.14545142650604248, "learning_rate": 8.264564601551916e-05, "loss": 8.5793, "step": 6782 }, { "epoch": 0.8349335302806499, "grad_norm": 0.09798445552587509, "learning_rate": 8.258406207661043e-05, "loss": 7.4104, "step": 6783 }, { "epoch": 0.8350566223535204, "grad_norm": 0.07631273567676544, "learning_rate": 8.25224781377017e-05, "loss": 7.752, "step": 6784 }, { "epoch": 0.8351797144263909, "grad_norm": 0.15590330958366394, "learning_rate": 
8.246089419879297e-05, "loss": 8.0047, "step": 6785 }, { "epoch": 0.8353028064992615, "grad_norm": 0.12482140958309174, "learning_rate": 8.239931025988423e-05, "loss": 7.3802, "step": 6786 }, { "epoch": 0.835425898572132, "grad_norm": 0.08569995313882828, "learning_rate": 8.23377263209755e-05, "loss": 8.2876, "step": 6787 }, { "epoch": 0.8355489906450024, "grad_norm": 0.11386669427156448, "learning_rate": 8.227614238206677e-05, "loss": 8.129, "step": 6788 }, { "epoch": 0.835672082717873, "grad_norm": 0.12013647705316544, "learning_rate": 8.221455844315803e-05, "loss": 7.5013, "step": 6789 }, { "epoch": 0.8357951747907435, "grad_norm": 0.10435806959867477, "learning_rate": 8.21529745042493e-05, "loss": 7.4349, "step": 6790 }, { "epoch": 0.835918266863614, "grad_norm": 0.19147944450378418, "learning_rate": 8.209139056534055e-05, "loss": 8.7903, "step": 6791 }, { "epoch": 0.8360413589364845, "grad_norm": 0.11533601582050323, "learning_rate": 8.202980662643182e-05, "loss": 7.5522, "step": 6792 }, { "epoch": 0.836164451009355, "grad_norm": 0.07385073602199554, "learning_rate": 8.196822268752308e-05, "loss": 7.64, "step": 6793 }, { "epoch": 0.8362875430822255, "grad_norm": 0.0976574644446373, "learning_rate": 8.190663874861436e-05, "loss": 7.5205, "step": 6794 }, { "epoch": 0.8364106351550961, "grad_norm": 0.09810719639062881, "learning_rate": 8.184505480970563e-05, "loss": 7.7921, "step": 6795 }, { "epoch": 0.8365337272279665, "grad_norm": 0.1187417283654213, "learning_rate": 8.178347087079689e-05, "loss": 7.9671, "step": 6796 }, { "epoch": 0.836656819300837, "grad_norm": 0.12193269282579422, "learning_rate": 8.172188693188816e-05, "loss": 7.7966, "step": 6797 }, { "epoch": 0.8367799113737076, "grad_norm": 0.0988103449344635, "learning_rate": 8.166030299297942e-05, "loss": 7.4179, "step": 6798 }, { "epoch": 0.8369030034465781, "grad_norm": 0.345148503780365, "learning_rate": 8.15987190540707e-05, "loss": 8.8962, "step": 6799 }, { "epoch": 0.8370260955194485, 
"grad_norm": 0.06815279275178909, "learning_rate": 8.153713511516197e-05, "loss": 7.7279, "step": 6800 }, { "epoch": 0.837149187592319, "grad_norm": 0.0949854627251625, "learning_rate": 8.147555117625323e-05, "loss": 7.7589, "step": 6801 }, { "epoch": 0.8372722796651896, "grad_norm": 0.0857350304722786, "learning_rate": 8.14139672373445e-05, "loss": 7.4425, "step": 6802 }, { "epoch": 0.8373953717380601, "grad_norm": 0.07005292922258377, "learning_rate": 8.135238329843576e-05, "loss": 7.2995, "step": 6803 }, { "epoch": 0.8375184638109305, "grad_norm": 0.0833285003900528, "learning_rate": 8.129079935952703e-05, "loss": 7.7468, "step": 6804 }, { "epoch": 0.8376415558838011, "grad_norm": 0.08134221285581589, "learning_rate": 8.12292154206183e-05, "loss": 7.7594, "step": 6805 }, { "epoch": 0.8377646479566716, "grad_norm": 0.13595663011074066, "learning_rate": 8.116763148170956e-05, "loss": 8.0014, "step": 6806 }, { "epoch": 0.8378877400295421, "grad_norm": 0.057545728981494904, "learning_rate": 8.110604754280084e-05, "loss": 7.6392, "step": 6807 }, { "epoch": 0.8380108321024126, "grad_norm": 0.0696403905749321, "learning_rate": 8.10444636038921e-05, "loss": 7.4946, "step": 6808 }, { "epoch": 0.8381339241752831, "grad_norm": 0.10707900673151016, "learning_rate": 8.098287966498337e-05, "loss": 7.308, "step": 6809 }, { "epoch": 0.8382570162481536, "grad_norm": 0.14789925515651703, "learning_rate": 8.092129572607464e-05, "loss": 8.2029, "step": 6810 }, { "epoch": 0.8383801083210242, "grad_norm": 0.07575731724500656, "learning_rate": 8.08597117871659e-05, "loss": 7.3777, "step": 6811 }, { "epoch": 0.8385032003938946, "grad_norm": 0.3412100672721863, "learning_rate": 8.079812784825718e-05, "loss": 9.013, "step": 6812 }, { "epoch": 0.8386262924667651, "grad_norm": 0.12171174585819244, "learning_rate": 8.073654390934844e-05, "loss": 7.5771, "step": 6813 }, { "epoch": 0.8387493845396357, "grad_norm": 0.07706243544816971, "learning_rate": 8.067495997043971e-05, "loss": 7.8003, 
"step": 6814 }, { "epoch": 0.8388724766125062, "grad_norm": 0.07356082648038864, "learning_rate": 8.061337603153097e-05, "loss": 7.6967, "step": 6815 }, { "epoch": 0.8389955686853766, "grad_norm": 0.2045922428369522, "learning_rate": 8.055179209262224e-05, "loss": 8.9405, "step": 6816 }, { "epoch": 0.8391186607582471, "grad_norm": 0.06268294900655746, "learning_rate": 8.049020815371351e-05, "loss": 7.5405, "step": 6817 }, { "epoch": 0.8392417528311177, "grad_norm": 0.09673362225294113, "learning_rate": 8.042862421480477e-05, "loss": 7.6015, "step": 6818 }, { "epoch": 0.8393648449039882, "grad_norm": 0.10395915806293488, "learning_rate": 8.036704027589605e-05, "loss": 8.3765, "step": 6819 }, { "epoch": 0.8394879369768586, "grad_norm": 0.085755854845047, "learning_rate": 8.03054563369873e-05, "loss": 7.9243, "step": 6820 }, { "epoch": 0.8396110290497292, "grad_norm": 0.12278252094984055, "learning_rate": 8.024387239807858e-05, "loss": 7.724, "step": 6821 }, { "epoch": 0.8397341211225997, "grad_norm": 0.08906576037406921, "learning_rate": 8.018228845916985e-05, "loss": 8.2017, "step": 6822 }, { "epoch": 0.8398572131954702, "grad_norm": 0.06530417501926422, "learning_rate": 8.012070452026111e-05, "loss": 7.8093, "step": 6823 }, { "epoch": 0.8399803052683407, "grad_norm": 0.08609622716903687, "learning_rate": 8.005912058135238e-05, "loss": 7.6173, "step": 6824 }, { "epoch": 0.8401033973412112, "grad_norm": 0.07952910661697388, "learning_rate": 7.999753664244364e-05, "loss": 7.739, "step": 6825 }, { "epoch": 0.8402264894140817, "grad_norm": 0.11641790717840195, "learning_rate": 7.993595270353492e-05, "loss": 7.4785, "step": 6826 }, { "epoch": 0.8403495814869523, "grad_norm": 0.0805387794971466, "learning_rate": 7.987436876462619e-05, "loss": 7.56, "step": 6827 }, { "epoch": 0.8404726735598228, "grad_norm": 0.11393080651760101, "learning_rate": 7.981278482571745e-05, "loss": 7.3828, "step": 6828 }, { "epoch": 0.8405957656326932, "grad_norm": 0.07446388900279999, 
"learning_rate": 7.975120088680872e-05, "loss": 7.4804, "step": 6829 }, { "epoch": 0.8407188577055638, "grad_norm": 0.13937923312187195, "learning_rate": 7.968961694789998e-05, "loss": 7.3039, "step": 6830 }, { "epoch": 0.8408419497784343, "grad_norm": 0.08182182908058167, "learning_rate": 7.962803300899126e-05, "loss": 7.7588, "step": 6831 }, { "epoch": 0.8409650418513048, "grad_norm": 0.21245445311069489, "learning_rate": 7.956644907008253e-05, "loss": 8.5445, "step": 6832 }, { "epoch": 0.8410881339241753, "grad_norm": 0.05681060999631882, "learning_rate": 7.950486513117379e-05, "loss": 7.6877, "step": 6833 }, { "epoch": 0.8412112259970458, "grad_norm": 0.0754372775554657, "learning_rate": 7.944328119226506e-05, "loss": 7.708, "step": 6834 }, { "epoch": 0.8413343180699163, "grad_norm": 0.09117292612791061, "learning_rate": 7.938169725335632e-05, "loss": 7.8603, "step": 6835 }, { "epoch": 0.8414574101427869, "grad_norm": 0.05719399452209473, "learning_rate": 7.93201133144476e-05, "loss": 7.3591, "step": 6836 }, { "epoch": 0.8415805022156573, "grad_norm": 0.07405290752649307, "learning_rate": 7.925852937553885e-05, "loss": 7.5512, "step": 6837 }, { "epoch": 0.8417035942885278, "grad_norm": 0.074874147772789, "learning_rate": 7.919694543663013e-05, "loss": 7.5836, "step": 6838 }, { "epoch": 0.8418266863613983, "grad_norm": 0.058504484593868256, "learning_rate": 7.91353614977214e-05, "loss": 7.4806, "step": 6839 }, { "epoch": 0.8419497784342689, "grad_norm": 0.06393919140100479, "learning_rate": 7.907377755881266e-05, "loss": 7.4332, "step": 6840 }, { "epoch": 0.8420728705071393, "grad_norm": 0.08332102000713348, "learning_rate": 7.901219361990393e-05, "loss": 7.6575, "step": 6841 }, { "epoch": 0.8421959625800098, "grad_norm": 0.10581506043672562, "learning_rate": 7.895060968099519e-05, "loss": 7.6167, "step": 6842 }, { "epoch": 0.8423190546528804, "grad_norm": 0.08588666468858719, "learning_rate": 7.888902574208646e-05, "loss": 7.7728, "step": 6843 }, { "epoch": 
0.8424421467257509, "grad_norm": 0.16351532936096191, "learning_rate": 7.882744180317774e-05, "loss": 8.4813, "step": 6844 }, { "epoch": 0.8425652387986213, "grad_norm": 0.10850383341312408, "learning_rate": 7.8765857864269e-05, "loss": 7.3721, "step": 6845 }, { "epoch": 0.8426883308714919, "grad_norm": 0.0656936839222908, "learning_rate": 7.870427392536027e-05, "loss": 7.7825, "step": 6846 }, { "epoch": 0.8428114229443624, "grad_norm": 0.08844460546970367, "learning_rate": 7.864268998645153e-05, "loss": 7.7218, "step": 6847 }, { "epoch": 0.8429345150172329, "grad_norm": 0.2012166529893875, "learning_rate": 7.85811060475428e-05, "loss": 8.3201, "step": 6848 }, { "epoch": 0.8430576070901034, "grad_norm": 0.12100131809711456, "learning_rate": 7.851952210863408e-05, "loss": 7.8399, "step": 6849 }, { "epoch": 0.8431806991629739, "grad_norm": 0.10040783882141113, "learning_rate": 7.845793816972534e-05, "loss": 8.0235, "step": 6850 }, { "epoch": 0.8433037912358444, "grad_norm": 0.15135259926319122, "learning_rate": 7.839635423081661e-05, "loss": 8.2555, "step": 6851 }, { "epoch": 0.843426883308715, "grad_norm": 0.09231957048177719, "learning_rate": 7.833477029190787e-05, "loss": 7.936, "step": 6852 }, { "epoch": 0.8435499753815854, "grad_norm": 0.12673817574977875, "learning_rate": 7.827318635299914e-05, "loss": 7.3972, "step": 6853 }, { "epoch": 0.8436730674544559, "grad_norm": 0.22701266407966614, "learning_rate": 7.821160241409041e-05, "loss": 7.1632, "step": 6854 }, { "epoch": 0.8437961595273265, "grad_norm": 0.07946857064962387, "learning_rate": 7.815001847518167e-05, "loss": 7.636, "step": 6855 }, { "epoch": 0.843919251600197, "grad_norm": 0.06324291974306107, "learning_rate": 7.808843453627295e-05, "loss": 7.7489, "step": 6856 }, { "epoch": 0.8440423436730674, "grad_norm": 0.13617749512195587, "learning_rate": 7.80268505973642e-05, "loss": 7.302, "step": 6857 }, { "epoch": 0.8441654357459379, "grad_norm": 0.36874574422836304, "learning_rate": 
7.796526665845548e-05, "loss": 9.2652, "step": 6858 }, { "epoch": 0.8442885278188085, "grad_norm": 0.07804039865732193, "learning_rate": 7.790368271954675e-05, "loss": 7.7316, "step": 6859 }, { "epoch": 0.844411619891679, "grad_norm": 0.0765245333313942, "learning_rate": 7.784209878063801e-05, "loss": 7.3709, "step": 6860 }, { "epoch": 0.8445347119645494, "grad_norm": 0.14042027294635773, "learning_rate": 7.778051484172928e-05, "loss": 7.8049, "step": 6861 }, { "epoch": 0.84465780403742, "grad_norm": 0.1307448446750641, "learning_rate": 7.771893090282054e-05, "loss": 7.7624, "step": 6862 }, { "epoch": 0.8447808961102905, "grad_norm": 0.0858030617237091, "learning_rate": 7.765734696391182e-05, "loss": 7.3631, "step": 6863 }, { "epoch": 0.844903988183161, "grad_norm": 0.1772802323102951, "learning_rate": 7.759576302500308e-05, "loss": 7.157, "step": 6864 }, { "epoch": 0.8450270802560315, "grad_norm": 0.08758719265460968, "learning_rate": 7.753417908609435e-05, "loss": 7.371, "step": 6865 }, { "epoch": 0.845150172328902, "grad_norm": 0.22402258217334747, "learning_rate": 7.747259514718562e-05, "loss": 9.1552, "step": 6866 }, { "epoch": 0.8452732644017725, "grad_norm": 0.07316718995571136, "learning_rate": 7.741101120827688e-05, "loss": 7.6132, "step": 6867 }, { "epoch": 0.8453963564746431, "grad_norm": 0.23466384410858154, "learning_rate": 7.734942726936816e-05, "loss": 8.7106, "step": 6868 }, { "epoch": 0.8455194485475135, "grad_norm": 0.08573633432388306, "learning_rate": 7.728784333045941e-05, "loss": 7.2783, "step": 6869 }, { "epoch": 0.845642540620384, "grad_norm": 0.12935175001621246, "learning_rate": 7.722625939155069e-05, "loss": 7.2412, "step": 6870 }, { "epoch": 0.8457656326932546, "grad_norm": 0.05682196095585823, "learning_rate": 7.716467545264196e-05, "loss": 7.6459, "step": 6871 }, { "epoch": 0.8458887247661251, "grad_norm": 0.11210248619318008, "learning_rate": 7.710309151373322e-05, "loss": 7.3061, "step": 6872 }, { "epoch": 0.8460118168389956, 
"grad_norm": 0.08372226357460022, "learning_rate": 7.704150757482449e-05, "loss": 7.5021, "step": 6873 }, { "epoch": 0.846134908911866, "grad_norm": 0.10663481801748276, "learning_rate": 7.697992363591575e-05, "loss": 7.9699, "step": 6874 }, { "epoch": 0.8462580009847366, "grad_norm": 0.10023104399442673, "learning_rate": 7.691833969700703e-05, "loss": 7.7617, "step": 6875 }, { "epoch": 0.8463810930576071, "grad_norm": 0.07745157927274704, "learning_rate": 7.68567557580983e-05, "loss": 7.5047, "step": 6876 }, { "epoch": 0.8465041851304776, "grad_norm": 0.12897464632987976, "learning_rate": 7.679517181918956e-05, "loss": 8.028, "step": 6877 }, { "epoch": 0.8466272772033481, "grad_norm": 0.06777868419885635, "learning_rate": 7.673358788028083e-05, "loss": 7.5981, "step": 6878 }, { "epoch": 0.8467503692762186, "grad_norm": 0.07655522972345352, "learning_rate": 7.667200394137209e-05, "loss": 7.987, "step": 6879 }, { "epoch": 0.8468734613490891, "grad_norm": 0.12635889649391174, "learning_rate": 7.661042000246336e-05, "loss": 8.116, "step": 6880 }, { "epoch": 0.8469965534219597, "grad_norm": 0.08873490244150162, "learning_rate": 7.654883606355464e-05, "loss": 7.5769, "step": 6881 }, { "epoch": 0.8471196454948301, "grad_norm": 0.12913724780082703, "learning_rate": 7.64872521246459e-05, "loss": 7.5031, "step": 6882 }, { "epoch": 0.8472427375677006, "grad_norm": 0.10074828565120697, "learning_rate": 7.642566818573717e-05, "loss": 7.937, "step": 6883 }, { "epoch": 0.8473658296405712, "grad_norm": 0.22625122964382172, "learning_rate": 7.636408424682843e-05, "loss": 7.054, "step": 6884 }, { "epoch": 0.8474889217134417, "grad_norm": 0.10917902737855911, "learning_rate": 7.63025003079197e-05, "loss": 7.6793, "step": 6885 }, { "epoch": 0.8476120137863121, "grad_norm": 0.12485040724277496, "learning_rate": 7.624091636901096e-05, "loss": 7.332, "step": 6886 }, { "epoch": 0.8477351058591827, "grad_norm": 0.1021607369184494, "learning_rate": 7.617933243010223e-05, "loss": 7.2942, 
"step": 6887 }, { "epoch": 0.8478581979320532, "grad_norm": 0.15286611020565033, "learning_rate": 7.611774849119351e-05, "loss": 7.3745, "step": 6888 }, { "epoch": 0.8479812900049237, "grad_norm": 0.08021018654108047, "learning_rate": 7.605616455228477e-05, "loss": 7.4884, "step": 6889 }, { "epoch": 0.8481043820777941, "grad_norm": 0.11506327986717224, "learning_rate": 7.599458061337604e-05, "loss": 8.0107, "step": 6890 }, { "epoch": 0.8482274741506647, "grad_norm": 0.2512814700603485, "learning_rate": 7.59329966744673e-05, "loss": 8.8106, "step": 6891 }, { "epoch": 0.8483505662235352, "grad_norm": 0.07483712583780289, "learning_rate": 7.587141273555857e-05, "loss": 7.617, "step": 6892 }, { "epoch": 0.8484736582964058, "grad_norm": 0.0608176626265049, "learning_rate": 7.580982879664985e-05, "loss": 7.5943, "step": 6893 }, { "epoch": 0.8485967503692762, "grad_norm": 0.13937394320964813, "learning_rate": 7.57482448577411e-05, "loss": 7.8507, "step": 6894 }, { "epoch": 0.8487198424421467, "grad_norm": 0.07623682916164398, "learning_rate": 7.568666091883238e-05, "loss": 7.5647, "step": 6895 }, { "epoch": 0.8488429345150172, "grad_norm": 0.1052161306142807, "learning_rate": 7.562507697992364e-05, "loss": 7.2875, "step": 6896 }, { "epoch": 0.8489660265878878, "grad_norm": NaN, "learning_rate": 7.556349304101491e-05, "loss": 9.3617, "step": 6897 }, { "epoch": 0.8490891186607582, "grad_norm": 0.06696859747171402, "learning_rate": 7.550190910210618e-05, "loss": 7.7176, "step": 6898 }, { "epoch": 0.8492122107336287, "grad_norm": 0.07203834503889084, "learning_rate": 7.544032516319744e-05, "loss": 7.69, "step": 6899 }, { "epoch": 0.8493353028064993, "grad_norm": 0.15225526690483093, "learning_rate": 7.537874122428872e-05, "loss": 7.288, "step": 6900 }, { "epoch": 0.8494583948793698, "grad_norm": 0.09136024862527847, "learning_rate": 7.531715728537998e-05, "loss": 7.6726, "step": 6901 }, { "epoch": 0.8495814869522402, "grad_norm": 0.11153087764978409, "learning_rate": 
7.525557334647125e-05, "loss": 7.1884, "step": 6902 }, { "epoch": 0.8497045790251108, "grad_norm": 0.1509222388267517, "learning_rate": 7.519398940756252e-05, "loss": 7.9135, "step": 6903 }, { "epoch": 0.8498276710979813, "grad_norm": 0.21434623003005981, "learning_rate": 7.513240546865378e-05, "loss": 7.8507, "step": 6904 }, { "epoch": 0.8499507631708518, "grad_norm": 0.06809432804584503, "learning_rate": 7.507082152974505e-05, "loss": 7.6628, "step": 6905 }, { "epoch": 0.8500738552437223, "grad_norm": 0.07003334909677505, "learning_rate": 7.500923759083631e-05, "loss": 7.7797, "step": 6906 }, { "epoch": 0.8501969473165928, "grad_norm": 0.10113772004842758, "learning_rate": 7.494765365192757e-05, "loss": 7.5393, "step": 6907 }, { "epoch": 0.8503200393894633, "grad_norm": 0.1154279038310051, "learning_rate": 7.488606971301883e-05, "loss": 7.5133, "step": 6908 }, { "epoch": 0.8504431314623339, "grad_norm": 0.11895035952329636, "learning_rate": 7.48244857741101e-05, "loss": 7.3213, "step": 6909 }, { "epoch": 0.8505662235352043, "grad_norm": 0.07779937237501144, "learning_rate": 7.476290183520138e-05, "loss": 7.7519, "step": 6910 }, { "epoch": 0.8506893156080748, "grad_norm": 0.09973413497209549, "learning_rate": 7.470131789629264e-05, "loss": 7.5887, "step": 6911 }, { "epoch": 0.8508124076809453, "grad_norm": 0.09061292558908463, "learning_rate": 7.463973395738391e-05, "loss": 7.7382, "step": 6912 }, { "epoch": 0.8509354997538159, "grad_norm": 0.13782759010791779, "learning_rate": 7.457815001847517e-05, "loss": 8.0046, "step": 6913 }, { "epoch": 0.8510585918266863, "grad_norm": 0.054990027099847794, "learning_rate": 7.451656607956644e-05, "loss": 7.7948, "step": 6914 }, { "epoch": 0.8511816838995568, "grad_norm": 0.17214693129062653, "learning_rate": 7.445498214065772e-05, "loss": 8.5955, "step": 6915 }, { "epoch": 0.8513047759724274, "grad_norm": 0.1741938441991806, "learning_rate": 7.439339820174898e-05, "loss": 7.3447, "step": 6916 }, { "epoch": 
0.8514278680452979, "grad_norm": 0.1913989633321762, "learning_rate": 7.433181426284025e-05, "loss": 7.4666, "step": 6917 }, { "epoch": 0.8515509601181684, "grad_norm": 0.15354055166244507, "learning_rate": 7.427023032393151e-05, "loss": 8.1874, "step": 6918 }, { "epoch": 0.8516740521910389, "grad_norm": 0.23512546718120575, "learning_rate": 7.420864638502278e-05, "loss": 7.7859, "step": 6919 }, { "epoch": 0.8517971442639094, "grad_norm": 0.24109603464603424, "learning_rate": 7.414706244611406e-05, "loss": 8.853, "step": 6920 }, { "epoch": 0.8519202363367799, "grad_norm": 0.15959852933883667, "learning_rate": 7.408547850720531e-05, "loss": 8.6298, "step": 6921 }, { "epoch": 0.8520433284096505, "grad_norm": 0.07151663303375244, "learning_rate": 7.402389456829659e-05, "loss": 7.5098, "step": 6922 }, { "epoch": 0.8521664204825209, "grad_norm": 0.09318672865629196, "learning_rate": 7.396231062938785e-05, "loss": 7.8252, "step": 6923 }, { "epoch": 0.8522895125553914, "grad_norm": 0.11084605008363724, "learning_rate": 7.390072669047912e-05, "loss": 7.5766, "step": 6924 }, { "epoch": 0.852412604628262, "grad_norm": 0.06484384834766388, "learning_rate": 7.38391427515704e-05, "loss": 7.8474, "step": 6925 }, { "epoch": 0.8525356967011325, "grad_norm": 0.18043865263462067, "learning_rate": 7.377755881266165e-05, "loss": 8.6342, "step": 6926 }, { "epoch": 0.8526587887740029, "grad_norm": 0.0772632360458374, "learning_rate": 7.371597487375293e-05, "loss": 8.1621, "step": 6927 }, { "epoch": 0.8527818808468735, "grad_norm": 0.10564049333333969, "learning_rate": 7.365439093484419e-05, "loss": 7.5711, "step": 6928 }, { "epoch": 0.852904972919744, "grad_norm": 0.10157984495162964, "learning_rate": 7.359280699593546e-05, "loss": 7.6121, "step": 6929 }, { "epoch": 0.8530280649926145, "grad_norm": 0.11887141317129135, "learning_rate": 7.353122305702673e-05, "loss": 8.3914, "step": 6930 }, { "epoch": 0.8531511570654849, "grad_norm": 0.12583573162555695, "learning_rate": 
7.346963911811799e-05, "loss": 7.5638, "step": 6931 }, { "epoch": 0.8532742491383555, "grad_norm": 0.06612127274274826, "learning_rate": 7.340805517920926e-05, "loss": 7.8523, "step": 6932 }, { "epoch": 0.853397341211226, "grad_norm": 0.12184087932109833, "learning_rate": 7.334647124030052e-05, "loss": 7.6277, "step": 6933 }, { "epoch": 0.8535204332840965, "grad_norm": 0.20053106546401978, "learning_rate": 7.32848873013918e-05, "loss": 8.2656, "step": 6934 }, { "epoch": 0.853643525356967, "grad_norm": 0.10690491646528244, "learning_rate": 7.322330336248306e-05, "loss": 7.3094, "step": 6935 }, { "epoch": 0.8537666174298375, "grad_norm": 0.0782744288444519, "learning_rate": 7.316171942357433e-05, "loss": 7.3183, "step": 6936 }, { "epoch": 0.853889709502708, "grad_norm": 0.1214609369635582, "learning_rate": 7.31001354846656e-05, "loss": 7.7406, "step": 6937 }, { "epoch": 0.8540128015755786, "grad_norm": 0.07227133214473724, "learning_rate": 7.303855154575686e-05, "loss": 7.4126, "step": 6938 }, { "epoch": 0.854135893648449, "grad_norm": 0.0861000344157219, "learning_rate": 7.297696760684813e-05, "loss": 7.6278, "step": 6939 }, { "epoch": 0.8542589857213195, "grad_norm": 0.077320896089077, "learning_rate": 7.29153836679394e-05, "loss": 7.2922, "step": 6940 }, { "epoch": 0.8543820777941901, "grad_norm": 0.06604098528623581, "learning_rate": 7.285379972903067e-05, "loss": 7.5211, "step": 6941 }, { "epoch": 0.8545051698670606, "grad_norm": 0.07548611611127853, "learning_rate": 7.279221579012194e-05, "loss": 7.6813, "step": 6942 }, { "epoch": 0.854628261939931, "grad_norm": 0.29094332456588745, "learning_rate": 7.27306318512132e-05, "loss": 9.1015, "step": 6943 }, { "epoch": 0.8547513540128016, "grad_norm": 0.17675581574440002, "learning_rate": 7.266904791230447e-05, "loss": 8.0842, "step": 6944 }, { "epoch": 0.8548744460856721, "grad_norm": 0.10512765496969223, "learning_rate": 7.260746397339573e-05, "loss": 7.7574, "step": 6945 }, { "epoch": 0.8549975381585426, 
"grad_norm": 0.1322246938943863, "learning_rate": 7.2545880034487e-05, "loss": 8.1649, "step": 6946 }, { "epoch": 0.855120630231413, "grad_norm": 0.15257571637630463, "learning_rate": 7.248429609557828e-05, "loss": 7.4137, "step": 6947 }, { "epoch": 0.8552437223042836, "grad_norm": 0.18145453929901123, "learning_rate": 7.242271215666954e-05, "loss": 7.335, "step": 6948 }, { "epoch": 0.8553668143771541, "grad_norm": 0.16945499181747437, "learning_rate": 7.236112821776081e-05, "loss": 7.6261, "step": 6949 }, { "epoch": 0.8554899064500247, "grad_norm": 0.17115047574043274, "learning_rate": 7.229954427885207e-05, "loss": 7.3797, "step": 6950 }, { "epoch": 0.8556129985228951, "grad_norm": 0.1462535709142685, "learning_rate": 7.223796033994334e-05, "loss": 7.1663, "step": 6951 }, { "epoch": 0.8557360905957656, "grad_norm": 0.06353044509887695, "learning_rate": 7.217637640103462e-05, "loss": 7.3995, "step": 6952 }, { "epoch": 0.8558591826686361, "grad_norm": 0.14129739999771118, "learning_rate": 7.211479246212588e-05, "loss": 8.0047, "step": 6953 }, { "epoch": 0.8559822747415067, "grad_norm": 0.3823738396167755, "learning_rate": 7.205320852321715e-05, "loss": 9.1802, "step": 6954 }, { "epoch": 0.8561053668143771, "grad_norm": 0.23312179744243622, "learning_rate": 7.199162458430841e-05, "loss": 8.2533, "step": 6955 }, { "epoch": 0.8562284588872476, "grad_norm": 0.08591598272323608, "learning_rate": 7.193004064539968e-05, "loss": 7.434, "step": 6956 }, { "epoch": 0.8563515509601182, "grad_norm": 0.09362530708312988, "learning_rate": 7.186845670649094e-05, "loss": 7.6558, "step": 6957 }, { "epoch": 0.8564746430329887, "grad_norm": 0.10882221907377243, "learning_rate": 7.180687276758221e-05, "loss": 7.925, "step": 6958 }, { "epoch": 0.8565977351058592, "grad_norm": 0.07739377021789551, "learning_rate": 7.174528882867349e-05, "loss": 7.5851, "step": 6959 }, { "epoch": 0.8567208271787297, "grad_norm": 0.10411658138036728, "learning_rate": 7.168370488976475e-05, "loss": 7.4165, 
"step": 6960 }, { "epoch": 0.8568439192516002, "grad_norm": 0.12354591488838196, "learning_rate": 7.162212095085602e-05, "loss": 7.421, "step": 6961 }, { "epoch": 0.8569670113244707, "grad_norm": 0.15791110694408417, "learning_rate": 7.156053701194728e-05, "loss": 7.2739, "step": 6962 }, { "epoch": 0.8570901033973413, "grad_norm": 0.1354595273733139, "learning_rate": 7.149895307303855e-05, "loss": 7.251, "step": 6963 }, { "epoch": 0.8572131954702117, "grad_norm": 0.06582758575677872, "learning_rate": 7.143736913412983e-05, "loss": 7.4066, "step": 6964 }, { "epoch": 0.8573362875430822, "grad_norm": 0.09914844483137131, "learning_rate": 7.137578519522108e-05, "loss": 7.7542, "step": 6965 }, { "epoch": 0.8574593796159528, "grad_norm": 0.08864890038967133, "learning_rate": 7.131420125631236e-05, "loss": 7.3324, "step": 6966 }, { "epoch": 0.8575824716888233, "grad_norm": 0.07083312422037125, "learning_rate": 7.125261731740362e-05, "loss": 7.6236, "step": 6967 }, { "epoch": 0.8577055637616937, "grad_norm": 0.11944448202848434, "learning_rate": 7.119103337849489e-05, "loss": 7.7013, "step": 6968 }, { "epoch": 0.8578286558345642, "grad_norm": 0.08268231153488159, "learning_rate": 7.112944943958616e-05, "loss": 7.3001, "step": 6969 }, { "epoch": 0.8579517479074348, "grad_norm": 0.23526328802108765, "learning_rate": 7.106786550067742e-05, "loss": 8.372, "step": 6970 }, { "epoch": 0.8580748399803053, "grad_norm": 0.13001205027103424, "learning_rate": 7.10062815617687e-05, "loss": 8.0376, "step": 6971 }, { "epoch": 0.8581979320531757, "grad_norm": 0.08298807591199875, "learning_rate": 7.094469762285996e-05, "loss": 7.6213, "step": 6972 }, { "epoch": 0.8583210241260463, "grad_norm": 0.06350888311862946, "learning_rate": 7.088311368395123e-05, "loss": 7.5916, "step": 6973 }, { "epoch": 0.8584441161989168, "grad_norm": 0.3132721185684204, "learning_rate": 7.08215297450425e-05, "loss": 8.7847, "step": 6974 }, { "epoch": 0.8585672082717873, "grad_norm": 0.09534011781215668, 
"learning_rate": 7.075994580613376e-05, "loss": 7.5519, "step": 6975 }, { "epoch": 0.8586903003446578, "grad_norm": 0.10142374783754349, "learning_rate": 7.069836186722503e-05, "loss": 7.3626, "step": 6976 }, { "epoch": 0.8588133924175283, "grad_norm": 0.08789430558681488, "learning_rate": 7.06367779283163e-05, "loss": 7.783, "step": 6977 }, { "epoch": 0.8589364844903988, "grad_norm": 0.06420809030532837, "learning_rate": 7.057519398940757e-05, "loss": 7.6504, "step": 6978 }, { "epoch": 0.8590595765632694, "grad_norm": 0.13836489617824554, "learning_rate": 7.051361005049884e-05, "loss": 7.3355, "step": 6979 }, { "epoch": 0.8591826686361398, "grad_norm": 0.07577797025442123, "learning_rate": 7.04520261115901e-05, "loss": 7.6219, "step": 6980 }, { "epoch": 0.8593057607090103, "grad_norm": 0.08036445081233978, "learning_rate": 7.039044217268137e-05, "loss": 7.6714, "step": 6981 }, { "epoch": 0.8594288527818809, "grad_norm": 0.15221525728702545, "learning_rate": 7.032885823377263e-05, "loss": 7.83, "step": 6982 }, { "epoch": 0.8595519448547514, "grad_norm": 0.08263673633337021, "learning_rate": 7.02672742948639e-05, "loss": 7.5767, "step": 6983 }, { "epoch": 0.8596750369276218, "grad_norm": 0.143678680062294, "learning_rate": 7.020569035595516e-05, "loss": 8.063, "step": 6984 }, { "epoch": 0.8597981290004924, "grad_norm": 0.3909357488155365, "learning_rate": 7.014410641704644e-05, "loss": 9.234, "step": 6985 }, { "epoch": 0.8599212210733629, "grad_norm": 0.07471297681331635, "learning_rate": 7.008252247813771e-05, "loss": 7.7037, "step": 6986 }, { "epoch": 0.8600443131462334, "grad_norm": 0.09261202812194824, "learning_rate": 7.002093853922897e-05, "loss": 7.6634, "step": 6987 }, { "epoch": 0.8601674052191038, "grad_norm": 0.16140954196453094, "learning_rate": 6.995935460032024e-05, "loss": 7.3529, "step": 6988 }, { "epoch": 0.8602904972919744, "grad_norm": 0.12296965718269348, "learning_rate": 6.98977706614115e-05, "loss": 7.5246, "step": 6989 }, { "epoch": 
0.8604135893648449, "grad_norm": 0.14680157601833344, "learning_rate": 6.983618672250278e-05, "loss": 7.3813, "step": 6990 }, { "epoch": 0.8605366814377154, "grad_norm": 0.10738542675971985, "learning_rate": 6.977460278359405e-05, "loss": 8.0803, "step": 6991 }, { "epoch": 0.8606597735105859, "grad_norm": 0.11551697552204132, "learning_rate": 6.971301884468531e-05, "loss": 8.2017, "step": 6992 }, { "epoch": 0.8607828655834564, "grad_norm": 0.09516989439725876, "learning_rate": 6.965143490577658e-05, "loss": 7.9224, "step": 6993 }, { "epoch": 0.8609059576563269, "grad_norm": 0.10051680356264114, "learning_rate": 6.958985096686784e-05, "loss": 7.5358, "step": 6994 }, { "epoch": 0.8610290497291975, "grad_norm": 0.12108054757118225, "learning_rate": 6.952826702795911e-05, "loss": 7.5323, "step": 6995 }, { "epoch": 0.8611521418020679, "grad_norm": 0.1316792517900467, "learning_rate": 6.946668308905039e-05, "loss": 8.1231, "step": 6996 }, { "epoch": 0.8612752338749384, "grad_norm": 0.09182258695363998, "learning_rate": 6.940509915014165e-05, "loss": 7.2936, "step": 6997 }, { "epoch": 0.861398325947809, "grad_norm": 0.1030087098479271, "learning_rate": 6.934351521123292e-05, "loss": 7.4029, "step": 6998 }, { "epoch": 0.8615214180206795, "grad_norm": 0.1022259071469307, "learning_rate": 6.928193127232418e-05, "loss": 7.8015, "step": 6999 }, { "epoch": 0.8616445100935499, "grad_norm": 0.26878440380096436, "learning_rate": 6.922034733341545e-05, "loss": 8.7542, "step": 7000 }, { "epoch": 0.8617676021664205, "grad_norm": 0.09017272293567657, "learning_rate": 6.915876339450672e-05, "loss": 7.2557, "step": 7001 }, { "epoch": 0.861890694239291, "grad_norm": 0.07392356544733047, "learning_rate": 6.909717945559798e-05, "loss": 7.5972, "step": 7002 }, { "epoch": 0.8620137863121615, "grad_norm": 0.38265475630760193, "learning_rate": 6.903559551668926e-05, "loss": 9.0316, "step": 7003 }, { "epoch": 0.8621368783850321, "grad_norm": 0.08177386969327927, "learning_rate": 
6.897401157778052e-05, "loss": 7.4898, "step": 7004 }, { "epoch": 0.8622599704579025, "grad_norm": 0.2893977761268616, "learning_rate": 6.891242763887179e-05, "loss": 8.8402, "step": 7005 }, { "epoch": 0.862383062530773, "grad_norm": 0.20057213306427002, "learning_rate": 6.885084369996305e-05, "loss": 7.8247, "step": 7006 }, { "epoch": 0.8625061546036435, "grad_norm": 0.07649008929729462, "learning_rate": 6.878925976105432e-05, "loss": 7.8783, "step": 7007 }, { "epoch": 0.8626292466765141, "grad_norm": 0.052709292620420456, "learning_rate": 6.87276758221456e-05, "loss": 7.7357, "step": 7008 }, { "epoch": 0.8627523387493845, "grad_norm": 0.0911756381392479, "learning_rate": 6.866609188323686e-05, "loss": 7.5638, "step": 7009 }, { "epoch": 0.862875430822255, "grad_norm": 0.17468690872192383, "learning_rate": 6.860450794432813e-05, "loss": 7.1597, "step": 7010 }, { "epoch": 0.8629985228951256, "grad_norm": 0.09626085311174393, "learning_rate": 6.854292400541939e-05, "loss": 7.6199, "step": 7011 }, { "epoch": 0.8631216149679961, "grad_norm": 0.10545080155134201, "learning_rate": 6.848134006651066e-05, "loss": 7.6272, "step": 7012 }, { "epoch": 0.8632447070408665, "grad_norm": 0.10904142260551453, "learning_rate": 6.841975612760193e-05, "loss": 7.5235, "step": 7013 }, { "epoch": 0.8633677991137371, "grad_norm": 0.12832866609096527, "learning_rate": 6.835817218869319e-05, "loss": 7.6603, "step": 7014 }, { "epoch": 0.8634908911866076, "grad_norm": 0.1558859944343567, "learning_rate": 6.829658824978447e-05, "loss": 7.9271, "step": 7015 }, { "epoch": 0.8636139832594781, "grad_norm": 0.12074125558137894, "learning_rate": 6.823500431087573e-05, "loss": 8.0205, "step": 7016 }, { "epoch": 0.8637370753323486, "grad_norm": 0.09544069319963455, "learning_rate": 6.8173420371967e-05, "loss": 7.8634, "step": 7017 }, { "epoch": 0.8638601674052191, "grad_norm": 0.1287021040916443, "learning_rate": 6.811183643305827e-05, "loss": 7.203, "step": 7018 }, { "epoch": 0.8639832594780896, 
"grad_norm": 0.0673656091094017, "learning_rate": 6.805025249414953e-05, "loss": 7.8161, "step": 7019 }, { "epoch": 0.8641063515509602, "grad_norm": 0.0958981141448021, "learning_rate": 6.79886685552408e-05, "loss": 7.6455, "step": 7020 }, { "epoch": 0.8642294436238306, "grad_norm": 0.13190332055091858, "learning_rate": 6.792708461633206e-05, "loss": 7.6685, "step": 7021 }, { "epoch": 0.8643525356967011, "grad_norm": 0.08034541457891464, "learning_rate": 6.786550067742334e-05, "loss": 7.5677, "step": 7022 }, { "epoch": 0.8644756277695717, "grad_norm": 0.07717027515172958, "learning_rate": 6.78039167385146e-05, "loss": 7.7672, "step": 7023 }, { "epoch": 0.8645987198424422, "grad_norm": 0.10624109953641891, "learning_rate": 6.774233279960586e-05, "loss": 8.2237, "step": 7024 }, { "epoch": 0.8647218119153126, "grad_norm": 0.11598687618970871, "learning_rate": 6.768074886069713e-05, "loss": 7.5921, "step": 7025 }, { "epoch": 0.8648449039881831, "grad_norm": 0.07146746665239334, "learning_rate": 6.761916492178839e-05, "loss": 7.5254, "step": 7026 }, { "epoch": 0.8649679960610537, "grad_norm": 0.11915435642004013, "learning_rate": 6.755758098287966e-05, "loss": 8.2149, "step": 7027 }, { "epoch": 0.8650910881339242, "grad_norm": 0.05974860489368439, "learning_rate": 6.749599704397092e-05, "loss": 7.5848, "step": 7028 }, { "epoch": 0.8652141802067946, "grad_norm": 0.1017051413655281, "learning_rate": 6.74344131050622e-05, "loss": 7.3, "step": 7029 }, { "epoch": 0.8653372722796652, "grad_norm": 0.17117427289485931, "learning_rate": 6.737282916615347e-05, "loss": 8.3308, "step": 7030 }, { "epoch": 0.8654603643525357, "grad_norm": 0.1391238123178482, "learning_rate": 6.731124522724473e-05, "loss": 8.0974, "step": 7031 }, { "epoch": 0.8655834564254062, "grad_norm": 0.1888299286365509, "learning_rate": 6.7249661288336e-05, "loss": 6.9868, "step": 7032 }, { "epoch": 0.8657065484982767, "grad_norm": 0.07035387307405472, "learning_rate": 6.718807734942726e-05, "loss": 7.5096, 
"step": 7033 }, { "epoch": 0.8658296405711472, "grad_norm": 0.08438973128795624, "learning_rate": 6.712649341051853e-05, "loss": 7.3496, "step": 7034 }, { "epoch": 0.8659527326440177, "grad_norm": 0.12746472656726837, "learning_rate": 6.70649094716098e-05, "loss": 8.1288, "step": 7035 }, { "epoch": 0.8660758247168883, "grad_norm": 0.1377355456352234, "learning_rate": 6.700332553270106e-05, "loss": 8.1807, "step": 7036 }, { "epoch": 0.8661989167897587, "grad_norm": 0.18493865430355072, "learning_rate": 6.694174159379234e-05, "loss": 7.5864, "step": 7037 }, { "epoch": 0.8663220088626292, "grad_norm": 0.08059316128492355, "learning_rate": 6.68801576548836e-05, "loss": 7.5585, "step": 7038 }, { "epoch": 0.8664451009354998, "grad_norm": 0.0850229561328888, "learning_rate": 6.681857371597487e-05, "loss": 7.3017, "step": 7039 }, { "epoch": 0.8665681930083703, "grad_norm": 0.1212160587310791, "learning_rate": 6.675698977706614e-05, "loss": 8.0284, "step": 7040 }, { "epoch": 0.8666912850812407, "grad_norm": 0.1571168303489685, "learning_rate": 6.66954058381574e-05, "loss": 8.051, "step": 7041 }, { "epoch": 0.8668143771541112, "grad_norm": 0.10159645974636078, "learning_rate": 6.663382189924868e-05, "loss": 7.9129, "step": 7042 }, { "epoch": 0.8669374692269818, "grad_norm": 0.11353214830160141, "learning_rate": 6.657223796033994e-05, "loss": 7.4993, "step": 7043 }, { "epoch": 0.8670605612998523, "grad_norm": 0.1932736337184906, "learning_rate": 6.651065402143121e-05, "loss": 8.7157, "step": 7044 }, { "epoch": 0.8671836533727229, "grad_norm": 0.07672878354787827, "learning_rate": 6.644907008252248e-05, "loss": 7.4717, "step": 7045 }, { "epoch": 0.8673067454455933, "grad_norm": 0.12265711277723312, "learning_rate": 6.638748614361374e-05, "loss": 7.6353, "step": 7046 }, { "epoch": 0.8674298375184638, "grad_norm": 0.13243256509304047, "learning_rate": 6.632590220470501e-05, "loss": 7.5404, "step": 7047 }, { "epoch": 0.8675529295913343, "grad_norm": 0.08406104147434235, 
"learning_rate": 6.626431826579627e-05, "loss": 7.5832, "step": 7048 }, { "epoch": 0.8676760216642049, "grad_norm": 0.07203350216150284, "learning_rate": 6.620273432688755e-05, "loss": 7.8189, "step": 7049 }, { "epoch": 0.8677991137370753, "grad_norm": 0.07190341502428055, "learning_rate": 6.614115038797882e-05, "loss": 7.6306, "step": 7050 }, { "epoch": 0.8679222058099458, "grad_norm": 0.073602594435215, "learning_rate": 6.607956644907008e-05, "loss": 7.4715, "step": 7051 }, { "epoch": 0.8680452978828164, "grad_norm": 0.11559364199638367, "learning_rate": 6.601798251016135e-05, "loss": 7.5408, "step": 7052 }, { "epoch": 0.8681683899556869, "grad_norm": 0.08092624694108963, "learning_rate": 6.595639857125261e-05, "loss": 7.6419, "step": 7053 }, { "epoch": 0.8682914820285573, "grad_norm": 0.08256089687347412, "learning_rate": 6.589481463234388e-05, "loss": 7.4548, "step": 7054 }, { "epoch": 0.8684145741014279, "grad_norm": 0.06824791431427002, "learning_rate": 6.583323069343514e-05, "loss": 7.8457, "step": 7055 }, { "epoch": 0.8685376661742984, "grad_norm": 0.10950068384408951, "learning_rate": 6.577164675452642e-05, "loss": 7.8125, "step": 7056 }, { "epoch": 0.8686607582471689, "grad_norm": 0.11028241366147995, "learning_rate": 6.571006281561769e-05, "loss": 7.7238, "step": 7057 }, { "epoch": 0.8687838503200394, "grad_norm": 0.10586852580308914, "learning_rate": 6.564847887670895e-05, "loss": 7.4375, "step": 7058 }, { "epoch": 0.8689069423929099, "grad_norm": 0.12484091520309448, "learning_rate": 6.558689493780022e-05, "loss": 7.578, "step": 7059 }, { "epoch": 0.8690300344657804, "grad_norm": 0.06735672801733017, "learning_rate": 6.552531099889148e-05, "loss": 7.5209, "step": 7060 }, { "epoch": 0.869153126538651, "grad_norm": 0.10639950633049011, "learning_rate": 6.546372705998276e-05, "loss": 7.9967, "step": 7061 }, { "epoch": 0.8692762186115214, "grad_norm": 0.08351641148328781, "learning_rate": 6.540214312107403e-05, "loss": 7.4596, "step": 7062 }, { "epoch": 
0.8693993106843919, "grad_norm": 0.08712663501501083, "learning_rate": 6.534055918216529e-05, "loss": 7.8567, "step": 7063 }, { "epoch": 0.8695224027572624, "grad_norm": 0.11392668634653091, "learning_rate": 6.527897524325656e-05, "loss": 8.2592, "step": 7064 }, { "epoch": 0.869645494830133, "grad_norm": 0.33911556005477905, "learning_rate": 6.521739130434782e-05, "loss": 9.3284, "step": 7065 }, { "epoch": 0.8697685869030034, "grad_norm": 0.09514735639095306, "learning_rate": 6.51558073654391e-05, "loss": 8.1927, "step": 7066 }, { "epoch": 0.8698916789758739, "grad_norm": 0.11581002175807953, "learning_rate": 6.509422342653037e-05, "loss": 7.6463, "step": 7067 }, { "epoch": 0.8700147710487445, "grad_norm": 0.17588229477405548, "learning_rate": 6.503263948762163e-05, "loss": 7.5994, "step": 7068 }, { "epoch": 0.870137863121615, "grad_norm": 0.08933007717132568, "learning_rate": 6.49710555487129e-05, "loss": 7.6949, "step": 7069 }, { "epoch": 0.8702609551944854, "grad_norm": 0.08796907961368561, "learning_rate": 6.490947160980416e-05, "loss": 8.1538, "step": 7070 }, { "epoch": 0.870384047267356, "grad_norm": 0.1813465654850006, "learning_rate": 6.484788767089543e-05, "loss": 8.6241, "step": 7071 }, { "epoch": 0.8705071393402265, "grad_norm": 0.15926755964756012, "learning_rate": 6.47863037319867e-05, "loss": 8.1644, "step": 7072 }, { "epoch": 0.870630231413097, "grad_norm": 0.09665487706661224, "learning_rate": 6.472471979307796e-05, "loss": 8.1281, "step": 7073 }, { "epoch": 0.8707533234859675, "grad_norm": 0.13289418816566467, "learning_rate": 6.466313585416924e-05, "loss": 7.467, "step": 7074 }, { "epoch": 0.870876415558838, "grad_norm": 0.1281023770570755, "learning_rate": 6.46015519152605e-05, "loss": 7.4701, "step": 7075 }, { "epoch": 0.8709995076317085, "grad_norm": 0.11552028357982635, "learning_rate": 6.453996797635177e-05, "loss": 7.787, "step": 7076 }, { "epoch": 0.8711225997045791, "grad_norm": 0.0935002788901329, "learning_rate": 6.447838403744303e-05, 
"loss": 7.5959, "step": 7077 }, { "epoch": 0.8712456917774495, "grad_norm": 0.1857926845550537, "learning_rate": 6.44168000985343e-05, "loss": 7.3098, "step": 7078 }, { "epoch": 0.87136878385032, "grad_norm": 0.11676862090826035, "learning_rate": 6.435521615962558e-05, "loss": 7.5162, "step": 7079 }, { "epoch": 0.8714918759231906, "grad_norm": 0.07947036623954773, "learning_rate": 6.429363222071683e-05, "loss": 7.8902, "step": 7080 }, { "epoch": 0.8716149679960611, "grad_norm": 0.12150676548480988, "learning_rate": 6.423204828180811e-05, "loss": 7.2036, "step": 7081 }, { "epoch": 0.8717380600689315, "grad_norm": 0.4028562307357788, "learning_rate": 6.417046434289937e-05, "loss": 9.6692, "step": 7082 }, { "epoch": 0.871861152141802, "grad_norm": 0.13009321689605713, "learning_rate": 6.410888040399064e-05, "loss": 7.699, "step": 7083 }, { "epoch": 0.8719842442146726, "grad_norm": 0.10119272768497467, "learning_rate": 6.404729646508191e-05, "loss": 7.607, "step": 7084 }, { "epoch": 0.8721073362875431, "grad_norm": 0.25957146286964417, "learning_rate": 6.398571252617317e-05, "loss": 8.1417, "step": 7085 }, { "epoch": 0.8722304283604135, "grad_norm": 0.09538865834474564, "learning_rate": 6.392412858726445e-05, "loss": 7.5017, "step": 7086 }, { "epoch": 0.8723535204332841, "grad_norm": 0.08685958385467529, "learning_rate": 6.38625446483557e-05, "loss": 7.6393, "step": 7087 }, { "epoch": 0.8724766125061546, "grad_norm": 0.09152387827634811, "learning_rate": 6.380096070944698e-05, "loss": 7.5299, "step": 7088 }, { "epoch": 0.8725997045790251, "grad_norm": 0.08381587266921997, "learning_rate": 6.373937677053825e-05, "loss": 7.2528, "step": 7089 }, { "epoch": 0.8727227966518957, "grad_norm": 0.053112249821424484, "learning_rate": 6.367779283162951e-05, "loss": 7.5706, "step": 7090 }, { "epoch": 0.8728458887247661, "grad_norm": 0.08824674040079117, "learning_rate": 6.361620889272078e-05, "loss": 7.4488, "step": 7091 }, { "epoch": 0.8729689807976366, "grad_norm": 
0.07678690552711487, "learning_rate": 6.355462495381204e-05, "loss": 7.5558, "step": 7092 }, { "epoch": 0.8730920728705072, "grad_norm": 0.08401791006326675, "learning_rate": 6.349304101490332e-05, "loss": 7.3511, "step": 7093 }, { "epoch": 0.8732151649433777, "grad_norm": 0.06399363279342651, "learning_rate": 6.343145707599459e-05, "loss": 7.3673, "step": 7094 }, { "epoch": 0.8733382570162481, "grad_norm": 0.06496985256671906, "learning_rate": 6.336987313708585e-05, "loss": 7.35, "step": 7095 }, { "epoch": 0.8734613490891187, "grad_norm": 0.1570626199245453, "learning_rate": 6.330828919817712e-05, "loss": 7.9374, "step": 7096 }, { "epoch": 0.8735844411619892, "grad_norm": 0.07186730206012726, "learning_rate": 6.324670525926838e-05, "loss": 7.2966, "step": 7097 }, { "epoch": 0.8737075332348597, "grad_norm": 0.12504439055919647, "learning_rate": 6.318512132035965e-05, "loss": 7.3714, "step": 7098 }, { "epoch": 0.8738306253077301, "grad_norm": 0.267305850982666, "learning_rate": 6.312353738145093e-05, "loss": 8.9056, "step": 7099 }, { "epoch": 0.8739537173806007, "grad_norm": 0.09875468164682388, "learning_rate": 6.306195344254219e-05, "loss": 7.6829, "step": 7100 }, { "epoch": 0.8740768094534712, "grad_norm": 0.08694546669721603, "learning_rate": 6.300036950363346e-05, "loss": 7.34, "step": 7101 }, { "epoch": 0.8741999015263417, "grad_norm": 0.0800989493727684, "learning_rate": 6.293878556472472e-05, "loss": 7.64, "step": 7102 }, { "epoch": 0.8743229935992122, "grad_norm": 0.06943720579147339, "learning_rate": 6.287720162581599e-05, "loss": 7.6843, "step": 7103 }, { "epoch": 0.8744460856720827, "grad_norm": 0.09339591860771179, "learning_rate": 6.281561768690725e-05, "loss": 7.354, "step": 7104 }, { "epoch": 0.8745691777449532, "grad_norm": 0.13275820016860962, "learning_rate": 6.275403374799853e-05, "loss": 8.0655, "step": 7105 }, { "epoch": 0.8746922698178238, "grad_norm": 0.07106979191303253, "learning_rate": 6.26924498090898e-05, "loss": 7.6735, "step": 7106 }, 
{ "epoch": 0.8748153618906942, "grad_norm": 0.08468380570411682, "learning_rate": 6.263086587018106e-05, "loss": 7.3218, "step": 7107 }, { "epoch": 0.8749384539635647, "grad_norm": 0.14790630340576172, "learning_rate": 6.256928193127233e-05, "loss": 7.887, "step": 7108 }, { "epoch": 0.8750615460364353, "grad_norm": 0.09165999293327332, "learning_rate": 6.250769799236359e-05, "loss": 7.2608, "step": 7109 }, { "epoch": 0.8751846381093058, "grad_norm": 0.15993739664554596, "learning_rate": 6.244611405345486e-05, "loss": 7.1806, "step": 7110 }, { "epoch": 0.8753077301821762, "grad_norm": 0.10108799487352371, "learning_rate": 6.238453011454612e-05, "loss": 7.4156, "step": 7111 }, { "epoch": 0.8754308222550468, "grad_norm": 0.274621844291687, "learning_rate": 6.23229461756374e-05, "loss": 9.2177, "step": 7112 }, { "epoch": 0.8755539143279173, "grad_norm": 0.1291222870349884, "learning_rate": 6.226136223672866e-05, "loss": 8.1367, "step": 7113 }, { "epoch": 0.8756770064007878, "grad_norm": 0.08819488435983658, "learning_rate": 6.219977829781993e-05, "loss": 7.6085, "step": 7114 }, { "epoch": 0.8758000984736583, "grad_norm": 0.10843070596456528, "learning_rate": 6.213819435891119e-05, "loss": 7.9191, "step": 7115 }, { "epoch": 0.8759231905465288, "grad_norm": 0.0623667873442173, "learning_rate": 6.207661042000246e-05, "loss": 7.5061, "step": 7116 }, { "epoch": 0.8760462826193993, "grad_norm": 0.0835731253027916, "learning_rate": 6.201502648109373e-05, "loss": 7.3744, "step": 7117 }, { "epoch": 0.8761693746922699, "grad_norm": 0.12512721121311188, "learning_rate": 6.1953442542185e-05, "loss": 7.9034, "step": 7118 }, { "epoch": 0.8762924667651403, "grad_norm": 0.06253103166818619, "learning_rate": 6.189185860327627e-05, "loss": 7.4302, "step": 7119 }, { "epoch": 0.8764155588380108, "grad_norm": 0.11716470867395401, "learning_rate": 6.183027466436753e-05, "loss": 7.6354, "step": 7120 }, { "epoch": 0.8765386509108813, "grad_norm": 0.059891246259212494, "learning_rate": 
6.17686907254588e-05, "loss": 7.5525, "step": 7121 }, { "epoch": 0.8766617429837519, "grad_norm": 0.08402072638273239, "learning_rate": 6.170710678655007e-05, "loss": 7.5618, "step": 7122 }, { "epoch": 0.8767848350566223, "grad_norm": 0.08179939538240433, "learning_rate": 6.164552284764133e-05, "loss": 7.7071, "step": 7123 }, { "epoch": 0.8769079271294928, "grad_norm": 0.10167428106069565, "learning_rate": 6.15839389087326e-05, "loss": 7.5914, "step": 7124 }, { "epoch": 0.8770310192023634, "grad_norm": 0.1807001531124115, "learning_rate": 6.152235496982386e-05, "loss": 7.9059, "step": 7125 }, { "epoch": 0.8771541112752339, "grad_norm": 0.11089378595352173, "learning_rate": 6.146077103091514e-05, "loss": 7.3092, "step": 7126 }, { "epoch": 0.8772772033481043, "grad_norm": 0.12620452046394348, "learning_rate": 6.139918709200641e-05, "loss": 8.1853, "step": 7127 }, { "epoch": 0.8774002954209749, "grad_norm": 0.07663336396217346, "learning_rate": 6.133760315309767e-05, "loss": 8.0443, "step": 7128 }, { "epoch": 0.8775233874938454, "grad_norm": 0.14473125338554382, "learning_rate": 6.127601921418894e-05, "loss": 7.8597, "step": 7129 }, { "epoch": 0.8776464795667159, "grad_norm": 0.14814069867134094, "learning_rate": 6.12144352752802e-05, "loss": 7.8444, "step": 7130 }, { "epoch": 0.8777695716395865, "grad_norm": 0.0765179693698883, "learning_rate": 6.115285133637148e-05, "loss": 7.4466, "step": 7131 }, { "epoch": 0.8778926637124569, "grad_norm": 0.07459497451782227, "learning_rate": 6.109126739746275e-05, "loss": 8.0501, "step": 7132 }, { "epoch": 0.8780157557853274, "grad_norm": 0.10908246040344238, "learning_rate": 6.102968345855401e-05, "loss": 7.2379, "step": 7133 }, { "epoch": 0.878138847858198, "grad_norm": 0.12300702929496765, "learning_rate": 6.0968099519645274e-05, "loss": 7.387, "step": 7134 }, { "epoch": 0.8782619399310685, "grad_norm": 0.06232650205492973, "learning_rate": 6.090651558073655e-05, "loss": 7.503, "step": 7135 }, { "epoch": 0.8783850320039389, 
"grad_norm": 0.1692495495080948, "learning_rate": 6.0844931641827814e-05, "loss": 8.2478, "step": 7136 }, { "epoch": 0.8785081240768094, "grad_norm": 0.1336366832256317, "learning_rate": 6.078334770291908e-05, "loss": 7.3504, "step": 7137 }, { "epoch": 0.87863121614968, "grad_norm": 0.1386932134628296, "learning_rate": 6.0721763764010346e-05, "loss": 7.9853, "step": 7138 }, { "epoch": 0.8787543082225505, "grad_norm": 0.07473617047071457, "learning_rate": 6.066017982510161e-05, "loss": 7.5889, "step": 7139 }, { "epoch": 0.8788774002954209, "grad_norm": 0.05126113072037697, "learning_rate": 6.0598595886192886e-05, "loss": 7.6412, "step": 7140 }, { "epoch": 0.8790004923682915, "grad_norm": 0.1288999319076538, "learning_rate": 6.053701194728415e-05, "loss": 8.2491, "step": 7141 }, { "epoch": 0.879123584441162, "grad_norm": 0.08492297679185867, "learning_rate": 6.047542800837542e-05, "loss": 7.5142, "step": 7142 }, { "epoch": 0.8792466765140325, "grad_norm": 0.08415830135345459, "learning_rate": 6.0413844069466684e-05, "loss": 7.6945, "step": 7143 }, { "epoch": 0.879369768586903, "grad_norm": 0.10932806134223938, "learning_rate": 6.035226013055795e-05, "loss": 7.4483, "step": 7144 }, { "epoch": 0.8794928606597735, "grad_norm": 0.27218496799468994, "learning_rate": 6.029067619164922e-05, "loss": 9.0105, "step": 7145 }, { "epoch": 0.879615952732644, "grad_norm": 0.16900110244750977, "learning_rate": 6.022909225274049e-05, "loss": 7.9884, "step": 7146 }, { "epoch": 0.8797390448055146, "grad_norm": 0.11233369261026382, "learning_rate": 6.0167508313831756e-05, "loss": 7.5494, "step": 7147 }, { "epoch": 0.879862136878385, "grad_norm": 0.18203313648700714, "learning_rate": 6.010592437492302e-05, "loss": 7.6697, "step": 7148 }, { "epoch": 0.8799852289512555, "grad_norm": 0.12914450466632843, "learning_rate": 6.004434043601429e-05, "loss": 7.3924, "step": 7149 }, { "epoch": 0.8801083210241261, "grad_norm": 0.09324661642313004, "learning_rate": 5.9982756497105555e-05, "loss": 
7.3627, "step": 7150 }, { "epoch": 0.8802314130969966, "grad_norm": 0.3811676502227783, "learning_rate": 5.992117255819683e-05, "loss": 10.0418, "step": 7151 }, { "epoch": 0.880354505169867, "grad_norm": 0.21217148005962372, "learning_rate": 5.9859588619288094e-05, "loss": 7.7925, "step": 7152 }, { "epoch": 0.8804775972427376, "grad_norm": 0.10281185805797577, "learning_rate": 5.979800468037936e-05, "loss": 7.9022, "step": 7153 }, { "epoch": 0.8806006893156081, "grad_norm": 0.05979463458061218, "learning_rate": 5.973642074147063e-05, "loss": 7.7956, "step": 7154 }, { "epoch": 0.8807237813884786, "grad_norm": 0.08114437013864517, "learning_rate": 5.967483680256189e-05, "loss": 8.0383, "step": 7155 }, { "epoch": 0.880846873461349, "grad_norm": 0.133488267660141, "learning_rate": 5.961325286365316e-05, "loss": 7.1723, "step": 7156 }, { "epoch": 0.8809699655342196, "grad_norm": 0.07592984288930893, "learning_rate": 5.955166892474443e-05, "loss": 7.6337, "step": 7157 }, { "epoch": 0.8810930576070901, "grad_norm": 0.11365213245153427, "learning_rate": 5.94900849858357e-05, "loss": 7.9049, "step": 7158 }, { "epoch": 0.8812161496799606, "grad_norm": 0.11023038625717163, "learning_rate": 5.9428501046926965e-05, "loss": 7.7716, "step": 7159 }, { "epoch": 0.8813392417528311, "grad_norm": 0.07187192887067795, "learning_rate": 5.936691710801823e-05, "loss": 7.7453, "step": 7160 }, { "epoch": 0.8814623338257016, "grad_norm": 0.056153904646635056, "learning_rate": 5.93053331691095e-05, "loss": 7.6963, "step": 7161 }, { "epoch": 0.8815854258985721, "grad_norm": 0.26626917719841003, "learning_rate": 5.924374923020077e-05, "loss": 8.6127, "step": 7162 }, { "epoch": 0.8817085179714427, "grad_norm": 0.2005123645067215, "learning_rate": 5.918216529129204e-05, "loss": 8.3605, "step": 7163 }, { "epoch": 0.8818316100443131, "grad_norm": 0.09751029312610626, "learning_rate": 5.91205813523833e-05, "loss": 7.476, "step": 7164 }, { "epoch": 0.8819547021171836, "grad_norm": 
0.12873995304107666, "learning_rate": 5.905899741347457e-05, "loss": 7.4465, "step": 7165 }, { "epoch": 0.8820777941900542, "grad_norm": 0.09838636964559555, "learning_rate": 5.8997413474565836e-05, "loss": 7.6371, "step": 7166 }, { "epoch": 0.8822008862629247, "grad_norm": 0.08351072669029236, "learning_rate": 5.893582953565711e-05, "loss": 7.6536, "step": 7167 }, { "epoch": 0.8823239783357951, "grad_norm": 0.08770354092121124, "learning_rate": 5.887424559674837e-05, "loss": 8.0829, "step": 7168 }, { "epoch": 0.8824470704086657, "grad_norm": 0.07533980160951614, "learning_rate": 5.8812661657839634e-05, "loss": 7.6182, "step": 7169 }, { "epoch": 0.8825701624815362, "grad_norm": 0.09189040213823318, "learning_rate": 5.87510777189309e-05, "loss": 7.699, "step": 7170 }, { "epoch": 0.8826932545544067, "grad_norm": 0.29308775067329407, "learning_rate": 5.868949378002217e-05, "loss": 9.8135, "step": 7171 }, { "epoch": 0.8828163466272771, "grad_norm": 0.07278668880462646, "learning_rate": 5.862790984111343e-05, "loss": 7.467, "step": 7172 }, { "epoch": 0.8829394387001477, "grad_norm": 0.07887197285890579, "learning_rate": 5.8566325902204706e-05, "loss": 8.0804, "step": 7173 }, { "epoch": 0.8830625307730182, "grad_norm": 0.10314709693193436, "learning_rate": 5.850474196329597e-05, "loss": 7.9003, "step": 7174 }, { "epoch": 0.8831856228458888, "grad_norm": 0.15525159239768982, "learning_rate": 5.844315802438724e-05, "loss": 7.4657, "step": 7175 }, { "epoch": 0.8833087149187593, "grad_norm": 0.13261687755584717, "learning_rate": 5.8381574085478505e-05, "loss": 7.4244, "step": 7176 }, { "epoch": 0.8834318069916297, "grad_norm": 0.17825476825237274, "learning_rate": 5.831999014656977e-05, "loss": 8.911, "step": 7177 }, { "epoch": 0.8835548990645002, "grad_norm": 0.09329677373170853, "learning_rate": 5.825840620766104e-05, "loss": 8.1198, "step": 7178 }, { "epoch": 0.8836779911373708, "grad_norm": 0.08567489683628082, "learning_rate": 5.819682226875231e-05, "loss": 7.6125, 
"step": 7179 }, { "epoch": 0.8838010832102413, "grad_norm": 0.12182840704917908, "learning_rate": 5.813523832984358e-05, "loss": 7.393, "step": 7180 }, { "epoch": 0.8839241752831117, "grad_norm": 0.07879962772130966, "learning_rate": 5.807365439093484e-05, "loss": 7.5702, "step": 7181 }, { "epoch": 0.8840472673559823, "grad_norm": 0.15849314630031586, "learning_rate": 5.801207045202611e-05, "loss": 8.422, "step": 7182 }, { "epoch": 0.8841703594288528, "grad_norm": 0.08832885324954987, "learning_rate": 5.7950486513117376e-05, "loss": 7.5303, "step": 7183 }, { "epoch": 0.8842934515017233, "grad_norm": 0.13016225397586823, "learning_rate": 5.788890257420865e-05, "loss": 7.5402, "step": 7184 }, { "epoch": 0.8844165435745938, "grad_norm": 0.09833936393260956, "learning_rate": 5.7827318635299915e-05, "loss": 7.3057, "step": 7185 }, { "epoch": 0.8845396356474643, "grad_norm": 0.12350001186132431, "learning_rate": 5.776573469639118e-05, "loss": 7.5502, "step": 7186 }, { "epoch": 0.8846627277203348, "grad_norm": 0.07097551226615906, "learning_rate": 5.770415075748245e-05, "loss": 7.7469, "step": 7187 }, { "epoch": 0.8847858197932054, "grad_norm": 0.08923041075468063, "learning_rate": 5.7642566818573714e-05, "loss": 7.7024, "step": 7188 }, { "epoch": 0.8849089118660758, "grad_norm": 0.049423087388277054, "learning_rate": 5.758098287966499e-05, "loss": 7.4441, "step": 7189 }, { "epoch": 0.8850320039389463, "grad_norm": 0.0712011381983757, "learning_rate": 5.751939894075625e-05, "loss": 7.5558, "step": 7190 }, { "epoch": 0.8851550960118169, "grad_norm": 0.08749667555093765, "learning_rate": 5.745781500184752e-05, "loss": 7.5206, "step": 7191 }, { "epoch": 0.8852781880846874, "grad_norm": 0.10288955271244049, "learning_rate": 5.7396231062938786e-05, "loss": 7.454, "step": 7192 }, { "epoch": 0.8854012801575578, "grad_norm": 0.13886545598506927, "learning_rate": 5.733464712403005e-05, "loss": 8.1183, "step": 7193 }, { "epoch": 0.8855243722304283, "grad_norm": 0.06546921283006668, 
"learning_rate": 5.727306318512132e-05, "loss": 7.7023, "step": 7194 }, { "epoch": 0.8856474643032989, "grad_norm": 0.07927965372800827, "learning_rate": 5.721147924621259e-05, "loss": 7.7245, "step": 7195 }, { "epoch": 0.8857705563761694, "grad_norm": 0.06719628721475601, "learning_rate": 5.714989530730386e-05, "loss": 7.6405, "step": 7196 }, { "epoch": 0.8858936484490398, "grad_norm": 0.06456273794174194, "learning_rate": 5.7088311368395124e-05, "loss": 7.7043, "step": 7197 }, { "epoch": 0.8860167405219104, "grad_norm": 0.09570540487766266, "learning_rate": 5.702672742948639e-05, "loss": 7.3511, "step": 7198 }, { "epoch": 0.8861398325947809, "grad_norm": 0.08229339122772217, "learning_rate": 5.6965143490577656e-05, "loss": 7.9568, "step": 7199 }, { "epoch": 0.8862629246676514, "grad_norm": 0.07780149579048157, "learning_rate": 5.690355955166893e-05, "loss": 7.5271, "step": 7200 }, { "epoch": 0.8863860167405219, "grad_norm": 0.06840154528617859, "learning_rate": 5.6841975612760196e-05, "loss": 7.8278, "step": 7201 }, { "epoch": 0.8865091088133924, "grad_norm": 0.2958521842956543, "learning_rate": 5.678039167385146e-05, "loss": 8.8903, "step": 7202 }, { "epoch": 0.8866322008862629, "grad_norm": 0.2642914056777954, "learning_rate": 5.671880773494273e-05, "loss": 9.1592, "step": 7203 }, { "epoch": 0.8867552929591335, "grad_norm": 0.06677330285310745, "learning_rate": 5.6657223796033995e-05, "loss": 7.691, "step": 7204 }, { "epoch": 0.8868783850320039, "grad_norm": 0.2363937646150589, "learning_rate": 5.659563985712526e-05, "loss": 8.8699, "step": 7205 }, { "epoch": 0.8870014771048744, "grad_norm": 0.179393470287323, "learning_rate": 5.6534055918216534e-05, "loss": 8.2357, "step": 7206 }, { "epoch": 0.887124569177745, "grad_norm": 0.07057877629995346, "learning_rate": 5.64724719793078e-05, "loss": 7.7827, "step": 7207 }, { "epoch": 0.8872476612506155, "grad_norm": 0.11643673479557037, "learning_rate": 5.6410888040399066e-05, "loss": 7.9243, "step": 7208 }, { "epoch": 
0.8873707533234859, "grad_norm": 0.1965530812740326, "learning_rate": 5.634930410149033e-05, "loss": 8.7987, "step": 7209 }, { "epoch": 0.8874938453963565, "grad_norm": 0.17222823202610016, "learning_rate": 5.62877201625816e-05, "loss": 7.4813, "step": 7210 }, { "epoch": 0.887616937469227, "grad_norm": 0.14671050012111664, "learning_rate": 5.622613622367287e-05, "loss": 7.6793, "step": 7211 }, { "epoch": 0.8877400295420975, "grad_norm": 0.15708354115486145, "learning_rate": 5.616455228476414e-05, "loss": 8.7249, "step": 7212 }, { "epoch": 0.8878631216149679, "grad_norm": 0.11281456053256989, "learning_rate": 5.6102968345855405e-05, "loss": 7.7717, "step": 7213 }, { "epoch": 0.8879862136878385, "grad_norm": 0.1426069289445877, "learning_rate": 5.604138440694667e-05, "loss": 8.6263, "step": 7214 }, { "epoch": 0.888109305760709, "grad_norm": 0.12727876007556915, "learning_rate": 5.597980046803794e-05, "loss": 7.6116, "step": 7215 }, { "epoch": 0.8882323978335795, "grad_norm": 0.6244761347770691, "learning_rate": 5.59182165291292e-05, "loss": 8.6373, "step": 7216 }, { "epoch": 0.8883554899064501, "grad_norm": 0.2134680449962616, "learning_rate": 5.5856632590220476e-05, "loss": 7.3454, "step": 7217 }, { "epoch": 0.8884785819793205, "grad_norm": 0.10984567552804947, "learning_rate": 5.579504865131174e-05, "loss": 8.214, "step": 7218 }, { "epoch": 0.888601674052191, "grad_norm": 0.12325482815504074, "learning_rate": 5.573346471240301e-05, "loss": 7.5434, "step": 7219 }, { "epoch": 0.8887247661250616, "grad_norm": 0.0925266370177269, "learning_rate": 5.5671880773494275e-05, "loss": 7.8605, "step": 7220 }, { "epoch": 0.8888478581979321, "grad_norm": 0.09593171626329422, "learning_rate": 5.561029683458554e-05, "loss": 7.6612, "step": 7221 }, { "epoch": 0.8889709502708025, "grad_norm": 0.12360136955976486, "learning_rate": 5.5548712895676815e-05, "loss": 7.59, "step": 7222 }, { "epoch": 0.8890940423436731, "grad_norm": 0.09189219027757645, "learning_rate": 
5.548712895676808e-05, "loss": 8.1472, "step": 7223 }, { "epoch": 0.8892171344165436, "grad_norm": 0.10186446458101273, "learning_rate": 5.542554501785935e-05, "loss": 7.7211, "step": 7224 }, { "epoch": 0.8893402264894141, "grad_norm": 0.06156729906797409, "learning_rate": 5.536396107895061e-05, "loss": 7.7192, "step": 7225 }, { "epoch": 0.8894633185622846, "grad_norm": 0.10803976655006409, "learning_rate": 5.530237714004187e-05, "loss": 7.8798, "step": 7226 }, { "epoch": 0.8895864106351551, "grad_norm": 0.10850328952074051, "learning_rate": 5.524079320113314e-05, "loss": 8.1738, "step": 7227 }, { "epoch": 0.8897095027080256, "grad_norm": 0.10157465189695358, "learning_rate": 5.517920926222441e-05, "loss": 7.4101, "step": 7228 }, { "epoch": 0.8898325947808962, "grad_norm": 0.07835428416728973, "learning_rate": 5.511762532331568e-05, "loss": 7.3187, "step": 7229 }, { "epoch": 0.8899556868537666, "grad_norm": 0.13378988206386566, "learning_rate": 5.5056041384406945e-05, "loss": 7.7683, "step": 7230 }, { "epoch": 0.8900787789266371, "grad_norm": 0.08265043795108795, "learning_rate": 5.499445744549821e-05, "loss": 7.5065, "step": 7231 }, { "epoch": 0.8902018709995076, "grad_norm": 0.074569471180439, "learning_rate": 5.493287350658948e-05, "loss": 7.5128, "step": 7232 }, { "epoch": 0.8903249630723782, "grad_norm": 0.06592845916748047, "learning_rate": 5.487128956768075e-05, "loss": 7.7755, "step": 7233 }, { "epoch": 0.8904480551452486, "grad_norm": 0.24417735636234283, "learning_rate": 5.4809705628772017e-05, "loss": 8.735, "step": 7234 }, { "epoch": 0.8905711472181191, "grad_norm": 0.11425384879112244, "learning_rate": 5.474812168986328e-05, "loss": 7.3273, "step": 7235 }, { "epoch": 0.8906942392909897, "grad_norm": 0.12728697061538696, "learning_rate": 5.468653775095455e-05, "loss": 7.2547, "step": 7236 }, { "epoch": 0.8908173313638602, "grad_norm": 0.14081591367721558, "learning_rate": 5.4624953812045815e-05, "loss": 7.9073, "step": 7237 }, { "epoch": 
0.8909404234367306, "grad_norm": 0.14262622594833374, "learning_rate": 5.456336987313709e-05, "loss": 7.2007, "step": 7238 }, { "epoch": 0.8910635155096012, "grad_norm": 0.0808398574590683, "learning_rate": 5.4501785934228355e-05, "loss": 7.812, "step": 7239 }, { "epoch": 0.8911866075824717, "grad_norm": 0.11711146682500839, "learning_rate": 5.444020199531962e-05, "loss": 7.3715, "step": 7240 }, { "epoch": 0.8913096996553422, "grad_norm": 0.11139951646327972, "learning_rate": 5.437861805641089e-05, "loss": 7.5168, "step": 7241 }, { "epoch": 0.8914327917282127, "grad_norm": 0.10312355309724808, "learning_rate": 5.4317034117502154e-05, "loss": 7.4229, "step": 7242 }, { "epoch": 0.8915558838010832, "grad_norm": 0.10157918930053711, "learning_rate": 5.425545017859342e-05, "loss": 7.8841, "step": 7243 }, { "epoch": 0.8916789758739537, "grad_norm": 0.07055198401212692, "learning_rate": 5.419386623968469e-05, "loss": 7.6106, "step": 7244 }, { "epoch": 0.8918020679468243, "grad_norm": 0.09292501211166382, "learning_rate": 5.413228230077596e-05, "loss": 7.5659, "step": 7245 }, { "epoch": 0.8919251600196947, "grad_norm": 0.09595859050750732, "learning_rate": 5.4070698361867225e-05, "loss": 7.4768, "step": 7246 }, { "epoch": 0.8920482520925652, "grad_norm": 0.07090039551258087, "learning_rate": 5.400911442295849e-05, "loss": 7.8407, "step": 7247 }, { "epoch": 0.8921713441654358, "grad_norm": 0.09693482518196106, "learning_rate": 5.394753048404976e-05, "loss": 7.7867, "step": 7248 }, { "epoch": 0.8922944362383063, "grad_norm": 0.3953644931316376, "learning_rate": 5.388594654514103e-05, "loss": 9.4773, "step": 7249 }, { "epoch": 0.8924175283111767, "grad_norm": 0.08687668293714523, "learning_rate": 5.38243626062323e-05, "loss": 7.6952, "step": 7250 }, { "epoch": 0.8925406203840472, "grad_norm": 0.10599225759506226, "learning_rate": 5.3762778667323563e-05, "loss": 7.5814, "step": 7251 }, { "epoch": 0.8926637124569178, "grad_norm": 0.10709819197654724, "learning_rate": 
5.370119472841483e-05, "loss": 7.9433, "step": 7252 }, { "epoch": 0.8927868045297883, "grad_norm": 0.13102883100509644, "learning_rate": 5.3639610789506096e-05, "loss": 8.0166, "step": 7253 }, { "epoch": 0.8929098966026587, "grad_norm": 0.25415244698524475, "learning_rate": 5.357802685059736e-05, "loss": 9.5655, "step": 7254 }, { "epoch": 0.8930329886755293, "grad_norm": 0.17596416175365448, "learning_rate": 5.3516442911688635e-05, "loss": 8.7871, "step": 7255 }, { "epoch": 0.8931560807483998, "grad_norm": 0.08489152044057846, "learning_rate": 5.34548589727799e-05, "loss": 7.6374, "step": 7256 }, { "epoch": 0.8932791728212703, "grad_norm": 0.20256422460079193, "learning_rate": 5.339327503387117e-05, "loss": 7.2423, "step": 7257 }, { "epoch": 0.8934022648941408, "grad_norm": 0.2175196260213852, "learning_rate": 5.3331691094962434e-05, "loss": 7.2541, "step": 7258 }, { "epoch": 0.8935253569670113, "grad_norm": 0.13714396953582764, "learning_rate": 5.32701071560537e-05, "loss": 7.501, "step": 7259 }, { "epoch": 0.8936484490398818, "grad_norm": 0.33585411310195923, "learning_rate": 5.3208523217144973e-05, "loss": 9.7491, "step": 7260 }, { "epoch": 0.8937715411127524, "grad_norm": 0.13899780809879303, "learning_rate": 5.314693927823624e-05, "loss": 7.3908, "step": 7261 }, { "epoch": 0.8938946331856229, "grad_norm": 0.11736108362674713, "learning_rate": 5.3085355339327506e-05, "loss": 8.1117, "step": 7262 }, { "epoch": 0.8940177252584933, "grad_norm": 0.10811357200145721, "learning_rate": 5.302377140041877e-05, "loss": 8.108, "step": 7263 }, { "epoch": 0.8941408173313639, "grad_norm": 0.12661369144916534, "learning_rate": 5.296218746151004e-05, "loss": 7.6365, "step": 7264 }, { "epoch": 0.8942639094042344, "grad_norm": 0.09174482524394989, "learning_rate": 5.2900603522601305e-05, "loss": 7.5572, "step": 7265 }, { "epoch": 0.8943870014771049, "grad_norm": 0.23379218578338623, "learning_rate": 5.283901958369258e-05, "loss": 7.2123, "step": 7266 }, { "epoch": 
0.8945100935499753, "grad_norm": 0.07219158858060837, "learning_rate": 5.2777435644783844e-05, "loss": 7.6337, "step": 7267 }, { "epoch": 0.8946331856228459, "grad_norm": 0.18459868431091309, "learning_rate": 5.271585170587511e-05, "loss": 7.1959, "step": 7268 }, { "epoch": 0.8947562776957164, "grad_norm": 0.09419330209493637, "learning_rate": 5.265426776696638e-05, "loss": 7.7671, "step": 7269 }, { "epoch": 0.894879369768587, "grad_norm": 0.09184520691633224, "learning_rate": 5.259268382805764e-05, "loss": 7.3879, "step": 7270 }, { "epoch": 0.8950024618414574, "grad_norm": 0.08793006837368011, "learning_rate": 5.2531099889148916e-05, "loss": 7.6162, "step": 7271 }, { "epoch": 0.8951255539143279, "grad_norm": 0.10659923404455185, "learning_rate": 5.246951595024018e-05, "loss": 7.6816, "step": 7272 }, { "epoch": 0.8952486459871984, "grad_norm": 0.11340267956256866, "learning_rate": 5.240793201133145e-05, "loss": 7.7034, "step": 7273 }, { "epoch": 0.895371738060069, "grad_norm": 0.4373761713504791, "learning_rate": 5.2346348072422715e-05, "loss": 10.3728, "step": 7274 }, { "epoch": 0.8954948301329394, "grad_norm": 0.09639783948659897, "learning_rate": 5.228476413351398e-05, "loss": 7.5377, "step": 7275 }, { "epoch": 0.8956179222058099, "grad_norm": 0.14125484228134155, "learning_rate": 5.222318019460525e-05, "loss": 7.8779, "step": 7276 }, { "epoch": 0.8957410142786805, "grad_norm": 0.19836901128292084, "learning_rate": 5.216159625569652e-05, "loss": 8.7731, "step": 7277 }, { "epoch": 0.895864106351551, "grad_norm": 0.08388962596654892, "learning_rate": 5.210001231678779e-05, "loss": 7.5903, "step": 7278 }, { "epoch": 0.8959871984244214, "grad_norm": 0.10071391612291336, "learning_rate": 5.203842837787905e-05, "loss": 7.5194, "step": 7279 }, { "epoch": 0.896110290497292, "grad_norm": 0.13070078194141388, "learning_rate": 5.197684443897032e-05, "loss": 7.6254, "step": 7280 }, { "epoch": 0.8962333825701625, "grad_norm": 0.12423554062843323, "learning_rate": 
5.1915260500061585e-05, "loss": 7.7563, "step": 7281 }, { "epoch": 0.896356474643033, "grad_norm": 0.12774866819381714, "learning_rate": 5.185367656115286e-05, "loss": 7.485, "step": 7282 }, { "epoch": 0.8964795667159035, "grad_norm": 0.09532114863395691, "learning_rate": 5.1792092622244125e-05, "loss": 7.8367, "step": 7283 }, { "epoch": 0.896602658788774, "grad_norm": 0.09159096330404282, "learning_rate": 5.1730508683335384e-05, "loss": 7.9687, "step": 7284 }, { "epoch": 0.8967257508616445, "grad_norm": 0.2067456990480423, "learning_rate": 5.166892474442665e-05, "loss": 8.6001, "step": 7285 }, { "epoch": 0.8968488429345151, "grad_norm": 0.1955510377883911, "learning_rate": 5.160734080551792e-05, "loss": 7.222, "step": 7286 }, { "epoch": 0.8969719350073855, "grad_norm": 0.14835619926452637, "learning_rate": 5.154575686660918e-05, "loss": 7.9455, "step": 7287 }, { "epoch": 0.897095027080256, "grad_norm": 0.19892629981040955, "learning_rate": 5.1484172927700456e-05, "loss": 7.1828, "step": 7288 }, { "epoch": 0.8972181191531265, "grad_norm": 0.15029220283031464, "learning_rate": 5.142258898879172e-05, "loss": 7.246, "step": 7289 }, { "epoch": 0.8973412112259971, "grad_norm": 0.11998404562473297, "learning_rate": 5.136100504988299e-05, "loss": 8.3088, "step": 7290 }, { "epoch": 0.8974643032988675, "grad_norm": 0.08063900470733643, "learning_rate": 5.1299421110974255e-05, "loss": 7.6971, "step": 7291 }, { "epoch": 0.897587395371738, "grad_norm": 0.06945367902517319, "learning_rate": 5.123783717206552e-05, "loss": 7.5678, "step": 7292 }, { "epoch": 0.8977104874446086, "grad_norm": 0.09133319556713104, "learning_rate": 5.1176253233156794e-05, "loss": 8.0772, "step": 7293 }, { "epoch": 0.8978335795174791, "grad_norm": 0.06459995359182358, "learning_rate": 5.111466929424806e-05, "loss": 7.5962, "step": 7294 }, { "epoch": 0.8979566715903495, "grad_norm": 0.09736296534538269, "learning_rate": 5.105308535533933e-05, "loss": 7.6838, "step": 7295 }, { "epoch": 
0.8980797636632201, "grad_norm": 0.14310568571090698, "learning_rate": 5.099150141643059e-05, "loss": 7.5221, "step": 7296 }, { "epoch": 0.8982028557360906, "grad_norm": 0.09671180695295334, "learning_rate": 5.092991747752186e-05, "loss": 7.4984, "step": 7297 }, { "epoch": 0.8983259478089611, "grad_norm": 0.12145793437957764, "learning_rate": 5.086833353861313e-05, "loss": 7.2154, "step": 7298 }, { "epoch": 0.8984490398818316, "grad_norm": 0.0987389087677002, "learning_rate": 5.08067495997044e-05, "loss": 8.0132, "step": 7299 }, { "epoch": 0.8985721319547021, "grad_norm": 0.12785422801971436, "learning_rate": 5.0745165660795665e-05, "loss": 7.2273, "step": 7300 }, { "epoch": 0.8986952240275726, "grad_norm": 0.09530529379844666, "learning_rate": 5.068358172188693e-05, "loss": 7.2866, "step": 7301 }, { "epoch": 0.8988183161004432, "grad_norm": 0.14006903767585754, "learning_rate": 5.06219977829782e-05, "loss": 7.9787, "step": 7302 }, { "epoch": 0.8989414081733137, "grad_norm": 0.08896876871585846, "learning_rate": 5.0560413844069464e-05, "loss": 7.3244, "step": 7303 }, { "epoch": 0.8990645002461841, "grad_norm": 0.0631316751241684, "learning_rate": 5.049882990516074e-05, "loss": 7.4841, "step": 7304 }, { "epoch": 0.8991875923190547, "grad_norm": 0.09883615374565125, "learning_rate": 5.0437245966252e-05, "loss": 7.7588, "step": 7305 }, { "epoch": 0.8993106843919252, "grad_norm": 0.10328734666109085, "learning_rate": 5.037566202734327e-05, "loss": 7.949, "step": 7306 }, { "epoch": 0.8994337764647957, "grad_norm": 0.10595960170030594, "learning_rate": 5.0314078088434536e-05, "loss": 7.668, "step": 7307 }, { "epoch": 0.8995568685376661, "grad_norm": 0.13542737066745758, "learning_rate": 5.02524941495258e-05, "loss": 7.9166, "step": 7308 }, { "epoch": 0.8996799606105367, "grad_norm": 0.10080061107873917, "learning_rate": 5.0190910210617075e-05, "loss": 7.6935, "step": 7309 }, { "epoch": 0.8998030526834072, "grad_norm": 0.08372898399829865, "learning_rate": 
5.012932627170834e-05, "loss": 7.7541, "step": 7310 }, { "epoch": 0.8999261447562777, "grad_norm": 0.23909921944141388, "learning_rate": 5.006774233279961e-05, "loss": 8.3647, "step": 7311 }, { "epoch": 0.9000492368291482, "grad_norm": 0.1346714198589325, "learning_rate": 5.0006158393890874e-05, "loss": 7.6601, "step": 7312 }, { "epoch": 0.9001723289020187, "grad_norm": 0.12750272452831268, "learning_rate": 4.994457445498214e-05, "loss": 7.4137, "step": 7313 }, { "epoch": 0.9002954209748892, "grad_norm": 0.09650886058807373, "learning_rate": 4.9882990516073406e-05, "loss": 7.3219, "step": 7314 }, { "epoch": 0.9004185130477598, "grad_norm": 0.09600219130516052, "learning_rate": 4.982140657716468e-05, "loss": 7.442, "step": 7315 }, { "epoch": 0.9005416051206302, "grad_norm": 0.08197182416915894, "learning_rate": 4.9759822638255946e-05, "loss": 7.4516, "step": 7316 }, { "epoch": 0.9006646971935007, "grad_norm": 0.08427676558494568, "learning_rate": 4.969823869934721e-05, "loss": 7.8695, "step": 7317 }, { "epoch": 0.9007877892663713, "grad_norm": 0.08747129142284393, "learning_rate": 4.963665476043848e-05, "loss": 7.4449, "step": 7318 }, { "epoch": 0.9009108813392418, "grad_norm": 0.10668490082025528, "learning_rate": 4.9575070821529744e-05, "loss": 7.373, "step": 7319 }, { "epoch": 0.9010339734121122, "grad_norm": 0.09751011431217194, "learning_rate": 4.951348688262102e-05, "loss": 7.4072, "step": 7320 }, { "epoch": 0.9011570654849828, "grad_norm": 0.19827674329280853, "learning_rate": 4.9451902943712284e-05, "loss": 7.9112, "step": 7321 }, { "epoch": 0.9012801575578533, "grad_norm": 0.18763242661952972, "learning_rate": 4.939031900480355e-05, "loss": 7.5439, "step": 7322 }, { "epoch": 0.9014032496307238, "grad_norm": 0.09266681224107742, "learning_rate": 4.9328735065894816e-05, "loss": 7.5504, "step": 7323 }, { "epoch": 0.9015263417035942, "grad_norm": 0.06525739282369614, "learning_rate": 4.926715112698608e-05, "loss": 7.5079, "step": 7324 }, { "epoch": 
0.9016494337764648, "grad_norm": 0.06980696320533752, "learning_rate": 4.920556718807735e-05, "loss": 7.4551, "step": 7325 }, { "epoch": 0.9017725258493353, "grad_norm": 0.17538365721702576, "learning_rate": 4.914398324916862e-05, "loss": 7.6322, "step": 7326 }, { "epoch": 0.9018956179222059, "grad_norm": 0.09459397941827774, "learning_rate": 4.908239931025989e-05, "loss": 7.3956, "step": 7327 }, { "epoch": 0.9020187099950763, "grad_norm": 0.08004377037286758, "learning_rate": 4.9020815371351154e-05, "loss": 7.4043, "step": 7328 }, { "epoch": 0.9021418020679468, "grad_norm": 0.05513101443648338, "learning_rate": 4.895923143244242e-05, "loss": 7.5821, "step": 7329 }, { "epoch": 0.9022648941408173, "grad_norm": 0.06859058886766434, "learning_rate": 4.889764749353369e-05, "loss": 7.5008, "step": 7330 }, { "epoch": 0.9023879862136879, "grad_norm": 0.49158167839050293, "learning_rate": 4.883606355462496e-05, "loss": 9.6468, "step": 7331 }, { "epoch": 0.9025110782865583, "grad_norm": 0.1032445952296257, "learning_rate": 4.8774479615716226e-05, "loss": 7.6627, "step": 7332 }, { "epoch": 0.9026341703594288, "grad_norm": 0.06612303853034973, "learning_rate": 4.871289567680749e-05, "loss": 7.4909, "step": 7333 }, { "epoch": 0.9027572624322994, "grad_norm": 0.08378732204437256, "learning_rate": 4.865131173789876e-05, "loss": 7.5951, "step": 7334 }, { "epoch": 0.9028803545051699, "grad_norm": 0.06864304840564728, "learning_rate": 4.8589727798990025e-05, "loss": 7.6771, "step": 7335 }, { "epoch": 0.9030034465780403, "grad_norm": 0.057937923818826675, "learning_rate": 4.852814386008129e-05, "loss": 7.2828, "step": 7336 }, { "epoch": 0.9031265386509109, "grad_norm": 0.10525978356599808, "learning_rate": 4.8466559921172564e-05, "loss": 7.538, "step": 7337 }, { "epoch": 0.9032496307237814, "grad_norm": 0.12604819238185883, "learning_rate": 4.840497598226383e-05, "loss": 7.7305, "step": 7338 }, { "epoch": 0.9033727227966519, "grad_norm": null, "learning_rate": 4.83433920433551e-05, 
"loss": 10.6174, "step": 7339 }, { "epoch": 0.9034958148695224, "grad_norm": 0.0805632472038269, "learning_rate": 4.828180810444636e-05, "loss": 7.4769, "step": 7340 }, { "epoch": 0.9036189069423929, "grad_norm": 0.06782978028059006, "learning_rate": 4.822022416553763e-05, "loss": 7.3746, "step": 7341 }, { "epoch": 0.9037419990152634, "grad_norm": 0.14768259227275848, "learning_rate": 4.8158640226628896e-05, "loss": 8.05, "step": 7342 }, { "epoch": 0.903865091088134, "grad_norm": 0.0962086096405983, "learning_rate": 4.809705628772016e-05, "loss": 7.5813, "step": 7343 }, { "epoch": 0.9039881831610044, "grad_norm": 0.15216945111751556, "learning_rate": 4.803547234881143e-05, "loss": 8.4961, "step": 7344 }, { "epoch": 0.9041112752338749, "grad_norm": 0.1782548576593399, "learning_rate": 4.7973888409902695e-05, "loss": 7.3612, "step": 7345 }, { "epoch": 0.9042343673067454, "grad_norm": 0.07677698880434036, "learning_rate": 4.791230447099396e-05, "loss": 8.0826, "step": 7346 }, { "epoch": 0.904357459379616, "grad_norm": 0.11165913194417953, "learning_rate": 4.785072053208523e-05, "loss": 8.8247, "step": 7347 }, { "epoch": 0.9044805514524865, "grad_norm": 0.20056407153606415, "learning_rate": 4.77891365931765e-05, "loss": 7.6273, "step": 7348 }, { "epoch": 0.9046036435253569, "grad_norm": 0.1645948737859726, "learning_rate": 4.7727552654267766e-05, "loss": 7.5683, "step": 7349 }, { "epoch": 0.9047267355982275, "grad_norm": 0.5995151400566101, "learning_rate": 4.766596871535903e-05, "loss": 11.0018, "step": 7350 }, { "epoch": 0.904849827671098, "grad_norm": 0.1370166391134262, "learning_rate": 4.76043847764503e-05, "loss": 7.6673, "step": 7351 }, { "epoch": 0.9049729197439685, "grad_norm": 0.15211594104766846, "learning_rate": 4.7542800837541565e-05, "loss": 7.3145, "step": 7352 }, { "epoch": 0.905096011816839, "grad_norm": 0.2058335840702057, "learning_rate": 4.748121689863284e-05, "loss": 8.6544, "step": 7353 }, { "epoch": 0.9052191038897095, "grad_norm": 
0.08250941336154938, "learning_rate": 4.7419632959724105e-05, "loss": 7.7835, "step": 7354 }, { "epoch": 0.90534219596258, "grad_norm": 0.08554235845804214, "learning_rate": 4.735804902081537e-05, "loss": 7.7342, "step": 7355 }, { "epoch": 0.9054652880354506, "grad_norm": 0.07376094162464142, "learning_rate": 4.729646508190664e-05, "loss": 7.6154, "step": 7356 }, { "epoch": 0.905588380108321, "grad_norm": 0.23076854646205902, "learning_rate": 4.72348811429979e-05, "loss": 9.043, "step": 7357 }, { "epoch": 0.9057114721811915, "grad_norm": 0.07037857919931412, "learning_rate": 4.7173297204089176e-05, "loss": 7.7351, "step": 7358 }, { "epoch": 0.9058345642540621, "grad_norm": 0.0824030190706253, "learning_rate": 4.711171326518044e-05, "loss": 7.6222, "step": 7359 }, { "epoch": 0.9059576563269326, "grad_norm": 0.13409791886806488, "learning_rate": 4.705012932627171e-05, "loss": 8.0179, "step": 7360 }, { "epoch": 0.906080748399803, "grad_norm": 0.1410522758960724, "learning_rate": 4.6988545387362975e-05, "loss": 8.0138, "step": 7361 }, { "epoch": 0.9062038404726735, "grad_norm": 0.0686768889427185, "learning_rate": 4.692696144845424e-05, "loss": 7.7192, "step": 7362 }, { "epoch": 0.9063269325455441, "grad_norm": 0.20667493343353271, "learning_rate": 4.686537750954551e-05, "loss": 8.2998, "step": 7363 }, { "epoch": 0.9064500246184146, "grad_norm": 0.08074723184108734, "learning_rate": 4.680379357063678e-05, "loss": 7.6169, "step": 7364 }, { "epoch": 0.906573116691285, "grad_norm": 0.0749690979719162, "learning_rate": 4.674220963172805e-05, "loss": 7.6784, "step": 7365 }, { "epoch": 0.9066962087641556, "grad_norm": 0.14404818415641785, "learning_rate": 4.668062569281931e-05, "loss": 7.5579, "step": 7366 }, { "epoch": 0.9068193008370261, "grad_norm": 0.08094019442796707, "learning_rate": 4.661904175391058e-05, "loss": 7.4694, "step": 7367 }, { "epoch": 0.9069423929098966, "grad_norm": 0.16436056792736053, "learning_rate": 4.6557457815001846e-05, "loss": 7.1635, "step": 
7368 }, { "epoch": 0.9070654849827671, "grad_norm": 0.07707665115594864, "learning_rate": 4.649587387609312e-05, "loss": 7.7503, "step": 7369 }, { "epoch": 0.9071885770556376, "grad_norm": 0.2164536863565445, "learning_rate": 4.6434289937184385e-05, "loss": 8.6411, "step": 7370 }, { "epoch": 0.9073116691285081, "grad_norm": 0.11845432221889496, "learning_rate": 4.637270599827565e-05, "loss": 7.6361, "step": 7371 }, { "epoch": 0.9074347612013787, "grad_norm": 0.1271960288286209, "learning_rate": 4.631112205936692e-05, "loss": 7.5369, "step": 7372 }, { "epoch": 0.9075578532742491, "grad_norm": 0.11719010025262833, "learning_rate": 4.6249538120458184e-05, "loss": 7.1145, "step": 7373 }, { "epoch": 0.9076809453471196, "grad_norm": 0.06420109421014786, "learning_rate": 4.618795418154945e-05, "loss": 7.5595, "step": 7374 }, { "epoch": 0.9078040374199902, "grad_norm": 0.22867773473262787, "learning_rate": 4.612637024264072e-05, "loss": 8.1947, "step": 7375 }, { "epoch": 0.9079271294928607, "grad_norm": 0.11833006143569946, "learning_rate": 4.606478630373199e-05, "loss": 7.3695, "step": 7376 }, { "epoch": 0.9080502215657311, "grad_norm": 0.1118655651807785, "learning_rate": 4.6003202364823256e-05, "loss": 7.6893, "step": 7377 }, { "epoch": 0.9081733136386017, "grad_norm": 0.08207333832979202, "learning_rate": 4.594161842591452e-05, "loss": 7.4672, "step": 7378 }, { "epoch": 0.9082964057114722, "grad_norm": 0.101453997194767, "learning_rate": 4.588003448700579e-05, "loss": 7.5123, "step": 7379 }, { "epoch": 0.9084194977843427, "grad_norm": 0.12321963161230087, "learning_rate": 4.581845054809706e-05, "loss": 7.3714, "step": 7380 }, { "epoch": 0.9085425898572131, "grad_norm": 0.08622097969055176, "learning_rate": 4.575686660918833e-05, "loss": 7.5439, "step": 7381 }, { "epoch": 0.9086656819300837, "grad_norm": 0.08336562663316727, "learning_rate": 4.5695282670279594e-05, "loss": 7.8362, "step": 7382 }, { "epoch": 0.9087887740029542, "grad_norm": 0.09543043375015259, 
"learning_rate": 4.563369873137086e-05, "loss": 7.5734, "step": 7383 }, { "epoch": 0.9089118660758247, "grad_norm": 0.08814647048711777, "learning_rate": 4.5572114792462127e-05, "loss": 7.3758, "step": 7384 }, { "epoch": 0.9090349581486952, "grad_norm": 0.18900005519390106, "learning_rate": 4.551053085355339e-05, "loss": 8.0541, "step": 7385 }, { "epoch": 0.9091580502215657, "grad_norm": 0.10033487528562546, "learning_rate": 4.5448946914644666e-05, "loss": 7.5474, "step": 7386 }, { "epoch": 0.9092811422944362, "grad_norm": 0.06924902647733688, "learning_rate": 4.538736297573593e-05, "loss": 7.3652, "step": 7387 }, { "epoch": 0.9094042343673068, "grad_norm": 0.08835778385400772, "learning_rate": 4.53257790368272e-05, "loss": 7.602, "step": 7388 }, { "epoch": 0.9095273264401772, "grad_norm": 0.1394520103931427, "learning_rate": 4.5264195097918465e-05, "loss": 8.0427, "step": 7389 }, { "epoch": 0.9096504185130477, "grad_norm": 0.07208903133869171, "learning_rate": 4.520261115900973e-05, "loss": 7.7372, "step": 7390 }, { "epoch": 0.9097735105859183, "grad_norm": 0.06384896486997604, "learning_rate": 4.5141027220101004e-05, "loss": 7.596, "step": 7391 }, { "epoch": 0.9098966026587888, "grad_norm": 0.12061383575201035, "learning_rate": 4.507944328119227e-05, "loss": 8.0726, "step": 7392 }, { "epoch": 0.9100196947316593, "grad_norm": 0.10805980116128922, "learning_rate": 4.5017859342283536e-05, "loss": 8.2878, "step": 7393 }, { "epoch": 0.9101427868045298, "grad_norm": 0.12503458559513092, "learning_rate": 4.49562754033748e-05, "loss": 7.5434, "step": 7394 }, { "epoch": 0.9102658788774003, "grad_norm": 0.10928571224212646, "learning_rate": 4.489469146446607e-05, "loss": 7.7442, "step": 7395 }, { "epoch": 0.9103889709502708, "grad_norm": null, "learning_rate": 4.4833107525557335e-05, "loss": 7.1078, "step": 7396 }, { "epoch": 0.9105120630231414, "grad_norm": 0.13377022743225098, "learning_rate": 4.477152358664861e-05, "loss": 7.5478, "step": 7397 }, { "epoch": 
0.9106351550960118, "grad_norm": 0.23732519149780273, "learning_rate": 4.4709939647739875e-05, "loss": 8.6308, "step": 7398 }, { "epoch": 0.9107582471688823, "grad_norm": 0.11444288492202759, "learning_rate": 4.464835570883114e-05, "loss": 7.3842, "step": 7399 }, { "epoch": 0.9108813392417529, "grad_norm": 0.09504365921020508, "learning_rate": 4.45867717699224e-05, "loss": 7.8094, "step": 7400 }, { "epoch": 0.9110044313146234, "grad_norm": 0.22612933814525604, "learning_rate": 4.452518783101367e-05, "loss": 8.0923, "step": 7401 }, { "epoch": 0.9111275233874938, "grad_norm": 0.0776500329375267, "learning_rate": 4.446360389210494e-05, "loss": 7.7518, "step": 7402 }, { "epoch": 0.9112506154603643, "grad_norm": 0.057204823940992355, "learning_rate": 4.4402019953196206e-05, "loss": 7.6243, "step": 7403 }, { "epoch": 0.9113737075332349, "grad_norm": 0.06518194079399109, "learning_rate": 4.434043601428747e-05, "loss": 7.6802, "step": 7404 }, { "epoch": 0.9114967996061054, "grad_norm": 0.13338202238082886, "learning_rate": 4.427885207537874e-05, "loss": 7.3246, "step": 7405 }, { "epoch": 0.9116198916789758, "grad_norm": 0.1396811306476593, "learning_rate": 4.4217268136470005e-05, "loss": 7.405, "step": 7406 }, { "epoch": 0.9117429837518464, "grad_norm": 0.07346343249082565, "learning_rate": 4.415568419756127e-05, "loss": 8.0219, "step": 7407 }, { "epoch": 0.9118660758247169, "grad_norm": 0.2985903024673462, "learning_rate": 4.4094100258652544e-05, "loss": 8.7566, "step": 7408 }, { "epoch": 0.9119891678975874, "grad_norm": 0.07336065918207169, "learning_rate": 4.403251631974381e-05, "loss": 7.591, "step": 7409 }, { "epoch": 0.9121122599704579, "grad_norm": 0.06315256655216217, "learning_rate": 4.397093238083508e-05, "loss": 7.6478, "step": 7410 }, { "epoch": 0.9122353520433284, "grad_norm": 0.0735943391919136, "learning_rate": 4.390934844192634e-05, "loss": 7.7764, "step": 7411 }, { "epoch": 0.9123584441161989, "grad_norm": 0.10061239451169968, "learning_rate": 
4.384776450301761e-05, "loss": 7.3825, "step": 7412 }, { "epoch": 0.9124815361890695, "grad_norm": 0.1052701324224472, "learning_rate": 4.378618056410888e-05, "loss": 8.1656, "step": 7413 }, { "epoch": 0.9126046282619399, "grad_norm": 0.27927589416503906, "learning_rate": 4.372459662520015e-05, "loss": 8.9738, "step": 7414 }, { "epoch": 0.9127277203348104, "grad_norm": 0.10405158251523972, "learning_rate": 4.3663012686291415e-05, "loss": 7.4715, "step": 7415 }, { "epoch": 0.912850812407681, "grad_norm": 0.129574716091156, "learning_rate": 4.360142874738268e-05, "loss": 8.1906, "step": 7416 }, { "epoch": 0.9129739044805515, "grad_norm": 0.21809551119804382, "learning_rate": 4.353984480847395e-05, "loss": 7.1507, "step": 7417 }, { "epoch": 0.9130969965534219, "grad_norm": 0.10172759741544724, "learning_rate": 4.347826086956522e-05, "loss": 7.9804, "step": 7418 }, { "epoch": 0.9132200886262924, "grad_norm": 0.12247702479362488, "learning_rate": 4.341667693065649e-05, "loss": 7.4431, "step": 7419 }, { "epoch": 0.913343180699163, "grad_norm": 0.09249214082956314, "learning_rate": 4.335509299174775e-05, "loss": 7.6459, "step": 7420 }, { "epoch": 0.9134662727720335, "grad_norm": 0.10943855345249176, "learning_rate": 4.329350905283902e-05, "loss": 7.5651, "step": 7421 }, { "epoch": 0.9135893648449039, "grad_norm": 0.11334385722875595, "learning_rate": 4.3231925113930285e-05, "loss": 7.8653, "step": 7422 }, { "epoch": 0.9137124569177745, "grad_norm": 0.06205210089683533, "learning_rate": 4.317034117502155e-05, "loss": 7.495, "step": 7423 }, { "epoch": 0.913835548990645, "grad_norm": 0.35668739676475525, "learning_rate": 4.3108757236112825e-05, "loss": 9.2787, "step": 7424 }, { "epoch": 0.9139586410635155, "grad_norm": 0.16067905724048615, "learning_rate": 4.304717329720409e-05, "loss": 8.4316, "step": 7425 }, { "epoch": 0.914081733136386, "grad_norm": 0.16700030863285065, "learning_rate": 4.298558935829536e-05, "loss": 7.944, "step": 7426 }, { "epoch": 0.9142048252092565, 
"grad_norm": 0.1573878824710846, "learning_rate": 4.2924005419386624e-05, "loss": 7.9956, "step": 7427 }, { "epoch": 0.914327917282127, "grad_norm": 0.1010609045624733, "learning_rate": 4.286242148047789e-05, "loss": 8.0951, "step": 7428 }, { "epoch": 0.9144510093549976, "grad_norm": 0.18789061903953552, "learning_rate": 4.280083754156916e-05, "loss": 8.1225, "step": 7429 }, { "epoch": 0.914574101427868, "grad_norm": 0.541318953037262, "learning_rate": 4.273925360266043e-05, "loss": 10.1008, "step": 7430 }, { "epoch": 0.9146971935007385, "grad_norm": 0.1258082240819931, "learning_rate": 4.2677669663751695e-05, "loss": 7.4414, "step": 7431 }, { "epoch": 0.9148202855736091, "grad_norm": 0.119322270154953, "learning_rate": 4.261608572484296e-05, "loss": 7.391, "step": 7432 }, { "epoch": 0.9149433776464796, "grad_norm": 0.1247153952717781, "learning_rate": 4.255450178593423e-05, "loss": 7.4982, "step": 7433 }, { "epoch": 0.9150664697193501, "grad_norm": 0.18652760982513428, "learning_rate": 4.2492917847025494e-05, "loss": 8.5081, "step": 7434 }, { "epoch": 0.9151895617922206, "grad_norm": 0.1464167684316635, "learning_rate": 4.243133390811677e-05, "loss": 7.6832, "step": 7435 }, { "epoch": 0.9153126538650911, "grad_norm": 0.11994034796953201, "learning_rate": 4.2369749969208034e-05, "loss": 7.5349, "step": 7436 }, { "epoch": 0.9154357459379616, "grad_norm": 0.11681808531284332, "learning_rate": 4.23081660302993e-05, "loss": 7.5091, "step": 7437 }, { "epoch": 0.9155588380108322, "grad_norm": 0.10692540556192398, "learning_rate": 4.2246582091390566e-05, "loss": 7.5698, "step": 7438 }, { "epoch": 0.9156819300837026, "grad_norm": 0.0965939611196518, "learning_rate": 4.218499815248183e-05, "loss": 7.4121, "step": 7439 }, { "epoch": 0.9158050221565731, "grad_norm": 0.09762892127037048, "learning_rate": 4.2123414213573105e-05, "loss": 8.0268, "step": 7440 }, { "epoch": 0.9159281142294436, "grad_norm": 0.10755220800638199, "learning_rate": 4.206183027466437e-05, "loss": 
7.2564, "step": 7441 }, { "epoch": 0.9160512063023142, "grad_norm": 0.1256542205810547, "learning_rate": 4.200024633575564e-05, "loss": 7.828, "step": 7442 }, { "epoch": 0.9161742983751846, "grad_norm": 0.11141884326934814, "learning_rate": 4.1938662396846904e-05, "loss": 7.5214, "step": 7443 }, { "epoch": 0.9162973904480551, "grad_norm": 0.06882601231336594, "learning_rate": 4.187707845793817e-05, "loss": 7.3531, "step": 7444 }, { "epoch": 0.9164204825209257, "grad_norm": 0.31757622957229614, "learning_rate": 4.181549451902944e-05, "loss": 8.542, "step": 7445 }, { "epoch": 0.9165435745937962, "grad_norm": 0.1552223265171051, "learning_rate": 4.175391058012071e-05, "loss": 8.1601, "step": 7446 }, { "epoch": 0.9166666666666666, "grad_norm": 0.08460687845945358, "learning_rate": 4.1692326641211976e-05, "loss": 7.4069, "step": 7447 }, { "epoch": 0.9167897587395372, "grad_norm": 0.08078984916210175, "learning_rate": 4.163074270230324e-05, "loss": 7.5204, "step": 7448 }, { "epoch": 0.9169128508124077, "grad_norm": 0.06970352679491043, "learning_rate": 4.156915876339451e-05, "loss": 7.4297, "step": 7449 }, { "epoch": 0.9170359428852782, "grad_norm": 0.1593882441520691, "learning_rate": 4.1507574824485775e-05, "loss": 8.0065, "step": 7450 }, { "epoch": 0.9171590349581487, "grad_norm": 0.0840102955698967, "learning_rate": 4.144599088557705e-05, "loss": 7.4573, "step": 7451 }, { "epoch": 0.9172821270310192, "grad_norm": 0.13679876923561096, "learning_rate": 4.1384406946668314e-05, "loss": 7.7311, "step": 7452 }, { "epoch": 0.9174052191038897, "grad_norm": 0.3817336857318878, "learning_rate": 4.132282300775958e-05, "loss": 9.2212, "step": 7453 }, { "epoch": 0.9175283111767603, "grad_norm": 0.1990610808134079, "learning_rate": 4.126123906885085e-05, "loss": 7.2086, "step": 7454 }, { "epoch": 0.9176514032496307, "grad_norm": 0.08891943842172623, "learning_rate": 4.119965512994211e-05, "loss": 7.8909, "step": 7455 }, { "epoch": 0.9177744953225012, "grad_norm": 
0.0774746909737587, "learning_rate": 4.1138071191033386e-05, "loss": 7.6332, "step": 7456 }, { "epoch": 0.9178975873953718, "grad_norm": 0.08055845648050308, "learning_rate": 4.107648725212465e-05, "loss": 7.6363, "step": 7457 }, { "epoch": 0.9180206794682423, "grad_norm": 0.09142794460058212, "learning_rate": 4.101490331321591e-05, "loss": 8.1158, "step": 7458 }, { "epoch": 0.9181437715411127, "grad_norm": 0.0991947278380394, "learning_rate": 4.095331937430718e-05, "loss": 7.6153, "step": 7459 }, { "epoch": 0.9182668636139832, "grad_norm": 0.12280770391225815, "learning_rate": 4.0891735435398444e-05, "loss": 7.4123, "step": 7460 }, { "epoch": 0.9183899556868538, "grad_norm": 0.11937662214040756, "learning_rate": 4.083015149648971e-05, "loss": 7.411, "step": 7461 }, { "epoch": 0.9185130477597243, "grad_norm": 0.09189961105585098, "learning_rate": 4.0768567557580984e-05, "loss": 7.4854, "step": 7462 }, { "epoch": 0.9186361398325947, "grad_norm": 0.1751784235239029, "learning_rate": 4.070698361867225e-05, "loss": 8.2175, "step": 7463 }, { "epoch": 0.9187592319054653, "grad_norm": 0.13448183238506317, "learning_rate": 4.0645399679763516e-05, "loss": 7.3872, "step": 7464 }, { "epoch": 0.9188823239783358, "grad_norm": 0.07821498066186905, "learning_rate": 4.058381574085478e-05, "loss": 7.5135, "step": 7465 }, { "epoch": 0.9190054160512063, "grad_norm": 0.08824903517961502, "learning_rate": 4.052223180194605e-05, "loss": 7.7567, "step": 7466 }, { "epoch": 0.9191285081240768, "grad_norm": 0.16043028235435486, "learning_rate": 4.046064786303732e-05, "loss": 8.1055, "step": 7467 }, { "epoch": 0.9192516001969473, "grad_norm": 0.056993864476680756, "learning_rate": 4.039906392412859e-05, "loss": 7.3055, "step": 7468 }, { "epoch": 0.9193746922698178, "grad_norm": 0.09041957557201385, "learning_rate": 4.0337479985219854e-05, "loss": 7.5546, "step": 7469 }, { "epoch": 0.9194977843426884, "grad_norm": 0.28141486644744873, "learning_rate": 4.027589604631112e-05, "loss": 8.3817, 
"step": 7470 }, { "epoch": 0.9196208764155588, "grad_norm": 0.07329437881708145, "learning_rate": 4.021431210740239e-05, "loss": 7.4719, "step": 7471 }, { "epoch": 0.9197439684884293, "grad_norm": 0.0895734652876854, "learning_rate": 4.015272816849365e-05, "loss": 7.7252, "step": 7472 }, { "epoch": 0.9198670605612999, "grad_norm": 0.325592964887619, "learning_rate": 4.0091144229584926e-05, "loss": 8.7538, "step": 7473 }, { "epoch": 0.9199901526341704, "grad_norm": 0.06924033164978027, "learning_rate": 4.002956029067619e-05, "loss": 7.4454, "step": 7474 }, { "epoch": 0.9201132447070408, "grad_norm": 0.07241865992546082, "learning_rate": 3.996797635176746e-05, "loss": 7.4883, "step": 7475 }, { "epoch": 0.9202363367799113, "grad_norm": 0.26489022374153137, "learning_rate": 3.9906392412858725e-05, "loss": 8.9757, "step": 7476 }, { "epoch": 0.9203594288527819, "grad_norm": 0.07380546629428864, "learning_rate": 3.984480847394999e-05, "loss": 7.7699, "step": 7477 }, { "epoch": 0.9204825209256524, "grad_norm": 0.1037018820643425, "learning_rate": 3.9783224535041264e-05, "loss": 7.6992, "step": 7478 }, { "epoch": 0.920605612998523, "grad_norm": 0.39406320452690125, "learning_rate": 3.972164059613253e-05, "loss": 10.0941, "step": 7479 }, { "epoch": 0.9207287050713934, "grad_norm": 0.07827334851026535, "learning_rate": 3.96600566572238e-05, "loss": 7.99, "step": 7480 }, { "epoch": 0.9208517971442639, "grad_norm": 0.15709780156612396, "learning_rate": 3.959847271831506e-05, "loss": 7.4379, "step": 7481 }, { "epoch": 0.9209748892171344, "grad_norm": 0.155856654047966, "learning_rate": 3.953688877940633e-05, "loss": 7.6205, "step": 7482 }, { "epoch": 0.921097981290005, "grad_norm": 0.11595410853624344, "learning_rate": 3.9475304840497596e-05, "loss": 7.9059, "step": 7483 }, { "epoch": 0.9212210733628754, "grad_norm": 0.310038685798645, "learning_rate": 3.941372090158887e-05, "loss": 9.0603, "step": 7484 }, { "epoch": 0.9213441654357459, "grad_norm": 0.13039885461330414, 
"learning_rate": 3.9352136962680135e-05, "loss": 7.645, "step": 7485 }, { "epoch": 0.9214672575086165, "grad_norm": 0.22967126965522766, "learning_rate": 3.92905530237714e-05, "loss": 7.2049, "step": 7486 }, { "epoch": 0.921590349581487, "grad_norm": 0.12352027744054794, "learning_rate": 3.922896908486267e-05, "loss": 7.6305, "step": 7487 }, { "epoch": 0.9217134416543574, "grad_norm": 0.10708283632993698, "learning_rate": 3.9167385145953934e-05, "loss": 7.5521, "step": 7488 }, { "epoch": 0.921836533727228, "grad_norm": 0.09294624626636505, "learning_rate": 3.910580120704521e-05, "loss": 7.5825, "step": 7489 }, { "epoch": 0.9219596258000985, "grad_norm": 0.08743558824062347, "learning_rate": 3.904421726813647e-05, "loss": 7.4669, "step": 7490 }, { "epoch": 0.922082717872969, "grad_norm": 0.08393126726150513, "learning_rate": 3.898263332922774e-05, "loss": 7.3945, "step": 7491 }, { "epoch": 0.9222058099458394, "grad_norm": 0.1013704165816307, "learning_rate": 3.8921049390319006e-05, "loss": 7.3346, "step": 7492 }, { "epoch": 0.92232890201871, "grad_norm": 0.1178642213344574, "learning_rate": 3.885946545141027e-05, "loss": 7.4955, "step": 7493 }, { "epoch": 0.9224519940915805, "grad_norm": 0.17329931259155273, "learning_rate": 3.879788151250154e-05, "loss": 8.089, "step": 7494 }, { "epoch": 0.922575086164451, "grad_norm": 0.2073744535446167, "learning_rate": 3.873629757359281e-05, "loss": 8.099, "step": 7495 }, { "epoch": 0.9226981782373215, "grad_norm": 0.14119036495685577, "learning_rate": 3.867471363468408e-05, "loss": 7.6266, "step": 7496 }, { "epoch": 0.922821270310192, "grad_norm": 0.11778905987739563, "learning_rate": 3.8613129695775344e-05, "loss": 7.7847, "step": 7497 }, { "epoch": 0.9229443623830625, "grad_norm": 0.11634192615747452, "learning_rate": 3.855154575686661e-05, "loss": 7.0436, "step": 7498 }, { "epoch": 0.9230674544559331, "grad_norm": 0.42579057812690735, "learning_rate": 3.8489961817957876e-05, "loss": 9.1919, "step": 7499 }, { "epoch": 
0.9231905465288035, "grad_norm": 0.351374089717865, "learning_rate": 3.842837787904915e-05, "loss": 8.4893, "step": 7500 }, { "epoch": 0.923313638601674, "grad_norm": 0.07835262268781662, "learning_rate": 3.8366793940140416e-05, "loss": 7.4382, "step": 7501 }, { "epoch": 0.9234367306745446, "grad_norm": 0.10418114066123962, "learning_rate": 3.830521000123168e-05, "loss": 7.6743, "step": 7502 }, { "epoch": 0.9235598227474151, "grad_norm": 0.056946948170661926, "learning_rate": 3.824362606232295e-05, "loss": 7.6036, "step": 7503 }, { "epoch": 0.9236829148202855, "grad_norm": 0.07995621114969254, "learning_rate": 3.8182042123414214e-05, "loss": 7.6314, "step": 7504 }, { "epoch": 0.9238060068931561, "grad_norm": 0.08160285651683807, "learning_rate": 3.812045818450548e-05, "loss": 7.6638, "step": 7505 }, { "epoch": 0.9239290989660266, "grad_norm": 0.08226245641708374, "learning_rate": 3.8058874245596754e-05, "loss": 8.0101, "step": 7506 }, { "epoch": 0.9240521910388971, "grad_norm": 0.1080310270190239, "learning_rate": 3.799729030668802e-05, "loss": 7.3488, "step": 7507 }, { "epoch": 0.9241752831117676, "grad_norm": 0.21184389293193817, "learning_rate": 3.7935706367779286e-05, "loss": 7.3057, "step": 7508 }, { "epoch": 0.9242983751846381, "grad_norm": 0.08618040382862091, "learning_rate": 3.787412242887055e-05, "loss": 7.7445, "step": 7509 }, { "epoch": 0.9244214672575086, "grad_norm": 0.13422295451164246, "learning_rate": 3.781253848996182e-05, "loss": 8.4359, "step": 7510 }, { "epoch": 0.9245445593303792, "grad_norm": 0.1125878319144249, "learning_rate": 3.775095455105309e-05, "loss": 7.4416, "step": 7511 }, { "epoch": 0.9246676514032496, "grad_norm": 0.11610721051692963, "learning_rate": 3.768937061214436e-05, "loss": 7.9474, "step": 7512 }, { "epoch": 0.9247907434761201, "grad_norm": 0.11231336742639542, "learning_rate": 3.7627786673235624e-05, "loss": 7.3509, "step": 7513 }, { "epoch": 0.9249138355489906, "grad_norm": 0.2904086709022522, "learning_rate": 
3.756620273432689e-05, "loss": 9.2116, "step": 7514 }, { "epoch": 0.9250369276218612, "grad_norm": 0.07503851503133774, "learning_rate": 3.750461879541816e-05, "loss": 7.3942, "step": 7515 }, { "epoch": 0.9251600196947316, "grad_norm": 0.11643911898136139, "learning_rate": 3.7443034856509416e-05, "loss": 7.3154, "step": 7516 }, { "epoch": 0.9252831117676021, "grad_norm": 0.12189313769340515, "learning_rate": 3.738145091760069e-05, "loss": 7.4237, "step": 7517 }, { "epoch": 0.9254062038404727, "grad_norm": 0.08156470209360123, "learning_rate": 3.7319866978691956e-05, "loss": 7.6189, "step": 7518 }, { "epoch": 0.9255292959133432, "grad_norm": 0.1332128345966339, "learning_rate": 3.725828303978322e-05, "loss": 7.9231, "step": 7519 }, { "epoch": 0.9256523879862137, "grad_norm": 0.0767657458782196, "learning_rate": 3.719669910087449e-05, "loss": 7.4468, "step": 7520 }, { "epoch": 0.9257754800590842, "grad_norm": 0.211899995803833, "learning_rate": 3.7135115161965755e-05, "loss": 8.5108, "step": 7521 }, { "epoch": 0.9258985721319547, "grad_norm": 0.1204792708158493, "learning_rate": 3.707353122305703e-05, "loss": 7.8228, "step": 7522 }, { "epoch": 0.9260216642048252, "grad_norm": 0.06460192054510117, "learning_rate": 3.7011947284148294e-05, "loss": 7.5765, "step": 7523 }, { "epoch": 0.9261447562776958, "grad_norm": 0.11803693324327469, "learning_rate": 3.695036334523956e-05, "loss": 8.1304, "step": 7524 }, { "epoch": 0.9262678483505662, "grad_norm": 0.07261654734611511, "learning_rate": 3.6888779406330826e-05, "loss": 7.4168, "step": 7525 }, { "epoch": 0.9263909404234367, "grad_norm": 0.09069282561540604, "learning_rate": 3.682719546742209e-05, "loss": 7.3868, "step": 7526 }, { "epoch": 0.9265140324963073, "grad_norm": 0.0642528086900711, "learning_rate": 3.6765611528513366e-05, "loss": 7.6611, "step": 7527 }, { "epoch": 0.9266371245691778, "grad_norm": 0.20639029145240784, "learning_rate": 3.670402758960463e-05, "loss": 8.1829, "step": 7528 }, { "epoch": 
0.9267602166420482, "grad_norm": 0.15835750102996826, "learning_rate": 3.66424436506959e-05, "loss": 7.9959, "step": 7529 }, { "epoch": 0.9268833087149188, "grad_norm": 0.06241905689239502, "learning_rate": 3.6580859711787165e-05, "loss": 7.9142, "step": 7530 }, { "epoch": 0.9270064007877893, "grad_norm": 0.09911827743053436, "learning_rate": 3.651927577287843e-05, "loss": 7.9956, "step": 7531 }, { "epoch": 0.9271294928606598, "grad_norm": 0.09534049034118652, "learning_rate": 3.64576918339697e-05, "loss": 7.7947, "step": 7532 }, { "epoch": 0.9272525849335302, "grad_norm": 0.09819890558719635, "learning_rate": 3.639610789506097e-05, "loss": 7.4701, "step": 7533 }, { "epoch": 0.9273756770064008, "grad_norm": 0.1239100843667984, "learning_rate": 3.6334523956152236e-05, "loss": 7.4034, "step": 7534 }, { "epoch": 0.9274987690792713, "grad_norm": 0.1320425420999527, "learning_rate": 3.62729400172435e-05, "loss": 8.1201, "step": 7535 }, { "epoch": 0.9276218611521418, "grad_norm": 0.10161003470420837, "learning_rate": 3.621135607833477e-05, "loss": 7.5796, "step": 7536 }, { "epoch": 0.9277449532250123, "grad_norm": 0.10300987213850021, "learning_rate": 3.6149772139426035e-05, "loss": 7.916, "step": 7537 }, { "epoch": 0.9278680452978828, "grad_norm": 0.08415699750185013, "learning_rate": 3.608818820051731e-05, "loss": 7.7367, "step": 7538 }, { "epoch": 0.9279911373707533, "grad_norm": 0.12647423148155212, "learning_rate": 3.6026604261608575e-05, "loss": 7.2267, "step": 7539 }, { "epoch": 0.9281142294436239, "grad_norm": 0.11575905233621597, "learning_rate": 3.596502032269984e-05, "loss": 7.2805, "step": 7540 }, { "epoch": 0.9282373215164943, "grad_norm": 0.09635096788406372, "learning_rate": 3.590343638379111e-05, "loss": 7.5402, "step": 7541 }, { "epoch": 0.9283604135893648, "grad_norm": 0.09315971285104752, "learning_rate": 3.5841852444882373e-05, "loss": 7.2347, "step": 7542 }, { "epoch": 0.9284835056622354, "grad_norm": 0.0966155081987381, "learning_rate": 
3.578026850597364e-05, "loss": 7.8561, "step": 7543 }, { "epoch": 0.9286065977351059, "grad_norm": 0.0959823951125145, "learning_rate": 3.571868456706491e-05, "loss": 7.5358, "step": 7544 }, { "epoch": 0.9287296898079763, "grad_norm": 0.07366819679737091, "learning_rate": 3.565710062815618e-05, "loss": 7.4247, "step": 7545 }, { "epoch": 0.9288527818808469, "grad_norm": 0.15588891506195068, "learning_rate": 3.5595516689247445e-05, "loss": 7.8657, "step": 7546 }, { "epoch": 0.9289758739537174, "grad_norm": 0.20718032121658325, "learning_rate": 3.553393275033871e-05, "loss": 8.5146, "step": 7547 }, { "epoch": 0.9290989660265879, "grad_norm": 0.1648315191268921, "learning_rate": 3.547234881142998e-05, "loss": 8.1602, "step": 7548 }, { "epoch": 0.9292220580994583, "grad_norm": 0.30561667680740356, "learning_rate": 3.541076487252125e-05, "loss": 8.016, "step": 7549 }, { "epoch": 0.9293451501723289, "grad_norm": 0.11720265448093414, "learning_rate": 3.534918093361252e-05, "loss": 7.9024, "step": 7550 }, { "epoch": 0.9294682422451994, "grad_norm": 0.06256169080734253, "learning_rate": 3.528759699470378e-05, "loss": 7.311, "step": 7551 }, { "epoch": 0.92959133431807, "grad_norm": 0.09193649142980576, "learning_rate": 3.522601305579505e-05, "loss": 7.882, "step": 7552 }, { "epoch": 0.9297144263909404, "grad_norm": 0.0688633993268013, "learning_rate": 3.5164429116886316e-05, "loss": 7.4384, "step": 7553 }, { "epoch": 0.9298375184638109, "grad_norm": 0.08668135106563568, "learning_rate": 3.510284517797758e-05, "loss": 7.7264, "step": 7554 }, { "epoch": 0.9299606105366814, "grad_norm": 0.1299162358045578, "learning_rate": 3.5041261239068855e-05, "loss": 7.4714, "step": 7555 }, { "epoch": 0.930083702609552, "grad_norm": 0.1483452320098877, "learning_rate": 3.497967730016012e-05, "loss": 7.8149, "step": 7556 }, { "epoch": 0.9302067946824224, "grad_norm": 0.16022516787052155, "learning_rate": 3.491809336125139e-05, "loss": 7.2544, "step": 7557 }, { "epoch": 0.9303298867552929, 
"grad_norm": 0.08405057340860367, "learning_rate": 3.4856509422342654e-05, "loss": 7.6075, "step": 7558 }, { "epoch": 0.9304529788281635, "grad_norm": 0.127345472574234, "learning_rate": 3.479492548343392e-05, "loss": 7.3486, "step": 7559 }, { "epoch": 0.930576070901034, "grad_norm": 4.772719271660749e+16, "learning_rate": 3.473334154452519e-05, "loss": 7.5665, "step": 7560 }, { "epoch": 0.9306991629739044, "grad_norm": 0.10409232974052429, "learning_rate": 3.467175760561646e-05, "loss": 7.4798, "step": 7561 }, { "epoch": 0.930822255046775, "grad_norm": 0.08570201694965363, "learning_rate": 3.4610173666707726e-05, "loss": 7.6114, "step": 7562 }, { "epoch": 0.9309453471196455, "grad_norm": 0.3346118927001953, "learning_rate": 3.454858972779899e-05, "loss": 9.1404, "step": 7563 }, { "epoch": 0.931068439192516, "grad_norm": 0.3359147906303406, "learning_rate": 3.448700578889026e-05, "loss": 9.001, "step": 7564 }, { "epoch": 0.9311915312653866, "grad_norm": 0.06657664477825165, "learning_rate": 3.4425421849981525e-05, "loss": 7.7064, "step": 7565 }, { "epoch": 0.931314623338257, "grad_norm": 0.07975038886070251, "learning_rate": 3.43638379110728e-05, "loss": 7.8882, "step": 7566 }, { "epoch": 0.9314377154111275, "grad_norm": 0.11134172230958939, "learning_rate": 3.4302253972164064e-05, "loss": 7.6969, "step": 7567 }, { "epoch": 0.931560807483998, "grad_norm": 0.08243174105882645, "learning_rate": 3.424067003325533e-05, "loss": 7.8036, "step": 7568 }, { "epoch": 0.9316838995568686, "grad_norm": 0.08305070549249649, "learning_rate": 3.4179086094346597e-05, "loss": 7.6819, "step": 7569 }, { "epoch": 0.931806991629739, "grad_norm": 0.13276834785938263, "learning_rate": 3.411750215543786e-05, "loss": 7.2084, "step": 7570 }, { "epoch": 0.9319300837026095, "grad_norm": 0.45036354660987854, "learning_rate": 3.4055918216529136e-05, "loss": 9.4949, "step": 7571 }, { "epoch": 0.9320531757754801, "grad_norm": 0.07189775258302689, "learning_rate": 3.39943342776204e-05, "loss": 
7.6312, "step": 7572 }, { "epoch": 0.9321762678483506, "grad_norm": 0.10582920908927917, "learning_rate": 3.393275033871167e-05, "loss": 8.2462, "step": 7573 }, { "epoch": 0.932299359921221, "grad_norm": 0.22465384006500244, "learning_rate": 3.387116639980293e-05, "loss": 8.7638, "step": 7574 }, { "epoch": 0.9324224519940916, "grad_norm": 0.08683252334594727, "learning_rate": 3.3809582460894194e-05, "loss": 7.5249, "step": 7575 }, { "epoch": 0.9325455440669621, "grad_norm": 0.08392772078514099, "learning_rate": 3.374799852198546e-05, "loss": 7.5129, "step": 7576 }, { "epoch": 0.9326686361398326, "grad_norm": 0.08662422746419907, "learning_rate": 3.3686414583076734e-05, "loss": 7.5311, "step": 7577 }, { "epoch": 0.9327917282127031, "grad_norm": 0.061786726117134094, "learning_rate": 3.3624830644168e-05, "loss": 7.9339, "step": 7578 }, { "epoch": 0.9329148202855736, "grad_norm": 0.07364269345998764, "learning_rate": 3.3563246705259266e-05, "loss": 8.0505, "step": 7579 }, { "epoch": 0.9330379123584441, "grad_norm": 0.0797547772526741, "learning_rate": 3.350166276635053e-05, "loss": 8.0685, "step": 7580 }, { "epoch": 0.9331610044313147, "grad_norm": 0.09786517173051834, "learning_rate": 3.34400788274418e-05, "loss": 7.5837, "step": 7581 }, { "epoch": 0.9332840965041851, "grad_norm": 0.15174438059329987, "learning_rate": 3.337849488853307e-05, "loss": 7.1694, "step": 7582 }, { "epoch": 0.9334071885770556, "grad_norm": 0.1843373030424118, "learning_rate": 3.331691094962434e-05, "loss": 7.2626, "step": 7583 }, { "epoch": 0.9335302806499262, "grad_norm": 0.09294773638248444, "learning_rate": 3.3255327010715604e-05, "loss": 8.014, "step": 7584 }, { "epoch": 0.9336533727227967, "grad_norm": 0.13047532737255096, "learning_rate": 3.319374307180687e-05, "loss": 7.3621, "step": 7585 }, { "epoch": 0.9337764647956671, "grad_norm": 0.07378054410219193, "learning_rate": 3.313215913289814e-05, "loss": 7.6212, "step": 7586 }, { "epoch": 0.9338995568685377, "grad_norm": 
0.11364507675170898, "learning_rate": 3.307057519398941e-05, "loss": 7.3233, "step": 7587 }, { "epoch": 0.9340226489414082, "grad_norm": 0.17387598752975464, "learning_rate": 3.3008991255080676e-05, "loss": 8.2087, "step": 7588 }, { "epoch": 0.9341457410142787, "grad_norm": 0.154968723654747, "learning_rate": 3.294740731617194e-05, "loss": 7.983, "step": 7589 }, { "epoch": 0.9342688330871491, "grad_norm": 0.05685846135020256, "learning_rate": 3.288582337726321e-05, "loss": 7.5186, "step": 7590 }, { "epoch": 0.9343919251600197, "grad_norm": 0.08509203791618347, "learning_rate": 3.2824239438354475e-05, "loss": 7.742, "step": 7591 }, { "epoch": 0.9345150172328902, "grad_norm": 0.09071186929941177, "learning_rate": 3.276265549944574e-05, "loss": 7.568, "step": 7592 }, { "epoch": 0.9346381093057607, "grad_norm": 0.09599711745977402, "learning_rate": 3.2701071560537014e-05, "loss": 7.6351, "step": 7593 }, { "epoch": 0.9347612013786312, "grad_norm": 0.1925332099199295, "learning_rate": 3.263948762162828e-05, "loss": 8.3045, "step": 7594 }, { "epoch": 0.9348842934515017, "grad_norm": 0.10633854568004608, "learning_rate": 3.257790368271955e-05, "loss": 7.4525, "step": 7595 }, { "epoch": 0.9350073855243722, "grad_norm": 0.16963523626327515, "learning_rate": 3.251631974381081e-05, "loss": 7.8494, "step": 7596 }, { "epoch": 0.9351304775972428, "grad_norm": 0.1410173624753952, "learning_rate": 3.245473580490208e-05, "loss": 7.2298, "step": 7597 }, { "epoch": 0.9352535696701132, "grad_norm": 0.08914783596992493, "learning_rate": 3.239315186599335e-05, "loss": 7.9838, "step": 7598 }, { "epoch": 0.9353766617429837, "grad_norm": 0.12938514351844788, "learning_rate": 3.233156792708462e-05, "loss": 7.3277, "step": 7599 }, { "epoch": 0.9354997538158543, "grad_norm": null, "learning_rate": 3.2269983988175885e-05, "loss": 8.2687, "step": 7600 }, { "epoch": 0.9356228458887248, "grad_norm": 0.13341949880123138, "learning_rate": 3.220840004926715e-05, "loss": 8.0912, "step": 7601 }, { 
"epoch": 0.9357459379615952, "grad_norm": 0.16282393038272858, "learning_rate": 3.214681611035842e-05, "loss": 7.2866, "step": 7602 }, { "epoch": 0.9358690300344658, "grad_norm": 0.0975731685757637, "learning_rate": 3.2085232171449684e-05, "loss": 7.3186, "step": 7603 }, { "epoch": 0.9359921221073363, "grad_norm": 0.091493621468544, "learning_rate": 3.202364823254096e-05, "loss": 7.8676, "step": 7604 }, { "epoch": 0.9361152141802068, "grad_norm": 0.08434493839740753, "learning_rate": 3.196206429363222e-05, "loss": 7.6985, "step": 7605 }, { "epoch": 0.9362383062530774, "grad_norm": 0.07009761035442352, "learning_rate": 3.190048035472349e-05, "loss": 7.692, "step": 7606 }, { "epoch": 0.9363613983259478, "grad_norm": 0.1398361623287201, "learning_rate": 3.1838896415814756e-05, "loss": 7.8209, "step": 7607 }, { "epoch": 0.9364844903988183, "grad_norm": 0.06825263798236847, "learning_rate": 3.177731247690602e-05, "loss": 7.4419, "step": 7608 }, { "epoch": 0.9366075824716888, "grad_norm": 0.11603652685880661, "learning_rate": 3.1715728537997295e-05, "loss": 8.1724, "step": 7609 }, { "epoch": 0.9367306745445594, "grad_norm": 0.06507997959852219, "learning_rate": 3.165414459908856e-05, "loss": 7.9051, "step": 7610 }, { "epoch": 0.9368537666174298, "grad_norm": 0.09899254888296127, "learning_rate": 3.159256066017983e-05, "loss": 7.9717, "step": 7611 }, { "epoch": 0.9369768586903003, "grad_norm": 0.06540793180465698, "learning_rate": 3.1530976721271094e-05, "loss": 7.5967, "step": 7612 }, { "epoch": 0.9370999507631709, "grad_norm": 0.07393275946378708, "learning_rate": 3.146939278236236e-05, "loss": 7.8019, "step": 7613 }, { "epoch": 0.9372230428360414, "grad_norm": 0.09918509423732758, "learning_rate": 3.1407808843453626e-05, "loss": 7.4768, "step": 7614 }, { "epoch": 0.9373461349089118, "grad_norm": 0.09392248839139938, "learning_rate": 3.13462249045449e-05, "loss": 7.8868, "step": 7615 }, { "epoch": 0.9374692269817824, "grad_norm": 0.1196715235710144, "learning_rate": 
3.1284640965636165e-05, "loss": 7.4596, "step": 7616 }, { "epoch": 0.9375923190546529, "grad_norm": 0.09803522378206253, "learning_rate": 3.122305702672743e-05, "loss": 8.1894, "step": 7617 }, { "epoch": 0.9377154111275234, "grad_norm": 0.18430587649345398, "learning_rate": 3.11614730878187e-05, "loss": 7.2659, "step": 7618 }, { "epoch": 0.9378385032003939, "grad_norm": 0.12573184072971344, "learning_rate": 3.1099889148909964e-05, "loss": 7.304, "step": 7619 }, { "epoch": 0.9379615952732644, "grad_norm": 0.08701275289058685, "learning_rate": 3.103830521000123e-05, "loss": 7.6458, "step": 7620 }, { "epoch": 0.9380846873461349, "grad_norm": 0.1106737032532692, "learning_rate": 3.09767212710925e-05, "loss": 7.3539, "step": 7621 }, { "epoch": 0.9382077794190055, "grad_norm": 0.14575240015983582, "learning_rate": 3.091513733218376e-05, "loss": 8.1577, "step": 7622 }, { "epoch": 0.9383308714918759, "grad_norm": 0.06636825203895569, "learning_rate": 3.0853553393275036e-05, "loss": 7.4975, "step": 7623 }, { "epoch": 0.9384539635647464, "grad_norm": 0.1339579075574875, "learning_rate": 3.07919694543663e-05, "loss": 7.7827, "step": 7624 }, { "epoch": 0.938577055637617, "grad_norm": 0.12047906219959259, "learning_rate": 3.073038551545757e-05, "loss": 7.7996, "step": 7625 }, { "epoch": 0.9387001477104875, "grad_norm": 0.08589976280927658, "learning_rate": 3.0668801576548835e-05, "loss": 7.7073, "step": 7626 }, { "epoch": 0.9388232397833579, "grad_norm": 0.133177250623703, "learning_rate": 3.06072176376401e-05, "loss": 7.0845, "step": 7627 }, { "epoch": 0.9389463318562284, "grad_norm": 0.06680914014577866, "learning_rate": 3.0545633698731374e-05, "loss": 7.6378, "step": 7628 }, { "epoch": 0.939069423929099, "grad_norm": 0.08596792072057724, "learning_rate": 3.0484049759822637e-05, "loss": 7.6754, "step": 7629 }, { "epoch": 0.9391925160019695, "grad_norm": 0.10588356852531433, "learning_rate": 3.0422465820913907e-05, "loss": 7.346, "step": 7630 }, { "epoch": 0.9393156080748399, 
"grad_norm": 0.1028650626540184, "learning_rate": 3.0360881882005173e-05, "loss": 7.6045, "step": 7631 }, { "epoch": 0.9394387001477105, "grad_norm": 0.18258166313171387, "learning_rate": 3.0299297943096443e-05, "loss": 8.2576, "step": 7632 }, { "epoch": 0.939561792220581, "grad_norm": 0.16536055505275726, "learning_rate": 3.023771400418771e-05, "loss": 8.1812, "step": 7633 }, { "epoch": 0.9396848842934515, "grad_norm": 0.12154058367013931, "learning_rate": 3.0176130065278975e-05, "loss": 8.0087, "step": 7634 }, { "epoch": 0.939807976366322, "grad_norm": 0.11530666053295135, "learning_rate": 3.0114546126370245e-05, "loss": 7.5922, "step": 7635 }, { "epoch": 0.9399310684391925, "grad_norm": 0.17315515875816345, "learning_rate": 3.005296218746151e-05, "loss": 7.6665, "step": 7636 }, { "epoch": 0.940054160512063, "grad_norm": 0.1853429675102234, "learning_rate": 2.9991378248552777e-05, "loss": 8.4253, "step": 7637 }, { "epoch": 0.9401772525849336, "grad_norm": 0.12402257323265076, "learning_rate": 2.9929794309644047e-05, "loss": 7.7502, "step": 7638 }, { "epoch": 0.940300344657804, "grad_norm": 0.32190537452697754, "learning_rate": 2.9868210370735313e-05, "loss": 7.8475, "step": 7639 }, { "epoch": 0.9404234367306745, "grad_norm": 0.13448114693164825, "learning_rate": 2.980662643182658e-05, "loss": 7.9497, "step": 7640 }, { "epoch": 0.9405465288035451, "grad_norm": 0.08649726957082748, "learning_rate": 2.974504249291785e-05, "loss": 8.0907, "step": 7641 }, { "epoch": 0.9406696208764156, "grad_norm": 0.1727600246667862, "learning_rate": 2.9683458554009116e-05, "loss": 8.2697, "step": 7642 }, { "epoch": 0.940792712949286, "grad_norm": 0.09910600632429123, "learning_rate": 2.9621874615100385e-05, "loss": 7.5071, "step": 7643 }, { "epoch": 0.9409158050221565, "grad_norm": 0.0911644771695137, "learning_rate": 2.956029067619165e-05, "loss": 7.5825, "step": 7644 }, { "epoch": 0.9410388970950271, "grad_norm": 0.08947409689426422, "learning_rate": 2.9498706737282918e-05, 
"loss": 7.482, "step": 7645 }, { "epoch": 0.9411619891678976, "grad_norm": 0.17593903839588165, "learning_rate": 2.9437122798374184e-05, "loss": 7.5338, "step": 7646 }, { "epoch": 0.941285081240768, "grad_norm": 0.08458513766527176, "learning_rate": 2.937553885946545e-05, "loss": 7.6614, "step": 7647 }, { "epoch": 0.9414081733136386, "grad_norm": 0.09492622315883636, "learning_rate": 2.9313954920556717e-05, "loss": 8.0296, "step": 7648 }, { "epoch": 0.9415312653865091, "grad_norm": 0.3954494595527649, "learning_rate": 2.9252370981647986e-05, "loss": 9.4354, "step": 7649 }, { "epoch": 0.9416543574593796, "grad_norm": 0.07821241766214371, "learning_rate": 2.9190787042739253e-05, "loss": 7.7194, "step": 7650 }, { "epoch": 0.9417774495322502, "grad_norm": 0.39531970024108887, "learning_rate": 2.912920310383052e-05, "loss": 9.3409, "step": 7651 }, { "epoch": 0.9419005416051206, "grad_norm": 0.08205428719520569, "learning_rate": 2.906761916492179e-05, "loss": 7.8609, "step": 7652 }, { "epoch": 0.9420236336779911, "grad_norm": 0.14315755665302277, "learning_rate": 2.9006035226013055e-05, "loss": 8.3237, "step": 7653 }, { "epoch": 0.9421467257508617, "grad_norm": 0.10903272777795792, "learning_rate": 2.8944451287104324e-05, "loss": 7.3206, "step": 7654 }, { "epoch": 0.9422698178237322, "grad_norm": 0.13632525503635406, "learning_rate": 2.888286734819559e-05, "loss": 7.6918, "step": 7655 }, { "epoch": 0.9423929098966026, "grad_norm": 0.1306508332490921, "learning_rate": 2.8821283409286857e-05, "loss": 7.3644, "step": 7656 }, { "epoch": 0.9425160019694732, "grad_norm": 0.37588822841644287, "learning_rate": 2.8759699470378127e-05, "loss": 9.1326, "step": 7657 }, { "epoch": 0.9426390940423437, "grad_norm": 0.12816336750984192, "learning_rate": 2.8698115531469393e-05, "loss": 7.4287, "step": 7658 }, { "epoch": 0.9427621861152142, "grad_norm": 0.10899405181407928, "learning_rate": 2.863653159256066e-05, "loss": 7.3897, "step": 7659 }, { "epoch": 0.9428852781880847, "grad_norm": 
0.10241273790597916, "learning_rate": 2.857494765365193e-05, "loss": 8.2674, "step": 7660 }, { "epoch": 0.9430083702609552, "grad_norm": 0.12534846365451813, "learning_rate": 2.8513363714743195e-05, "loss": 8.5467, "step": 7661 }, { "epoch": 0.9431314623338257, "grad_norm": 0.11239737272262573, "learning_rate": 2.8451779775834465e-05, "loss": 7.6715, "step": 7662 }, { "epoch": 0.9432545544066963, "grad_norm": 0.11960650235414505, "learning_rate": 2.839019583692573e-05, "loss": 7.8042, "step": 7663 }, { "epoch": 0.9433776464795667, "grad_norm": 0.1959974616765976, "learning_rate": 2.8328611898016997e-05, "loss": 8.908, "step": 7664 }, { "epoch": 0.9435007385524372, "grad_norm": 0.09031233936548233, "learning_rate": 2.8267027959108267e-05, "loss": 7.562, "step": 7665 }, { "epoch": 0.9436238306253077, "grad_norm": 0.265380322933197, "learning_rate": 2.8205444020199533e-05, "loss": 9.1929, "step": 7666 }, { "epoch": 0.9437469226981783, "grad_norm": 0.07880062609910965, "learning_rate": 2.81438600812908e-05, "loss": 7.8046, "step": 7667 }, { "epoch": 0.9438700147710487, "grad_norm": 0.11835812777280807, "learning_rate": 2.808227614238207e-05, "loss": 8.2881, "step": 7668 }, { "epoch": 0.9439931068439192, "grad_norm": 0.16202661395072937, "learning_rate": 2.8020692203473335e-05, "loss": 7.8414, "step": 7669 }, { "epoch": 0.9441161989167898, "grad_norm": 0.09073124080896378, "learning_rate": 2.79591082645646e-05, "loss": 7.5236, "step": 7670 }, { "epoch": 0.9442392909896603, "grad_norm": 0.07207371294498444, "learning_rate": 2.789752432565587e-05, "loss": 7.7719, "step": 7671 }, { "epoch": 0.9443623830625307, "grad_norm": 0.09430772066116333, "learning_rate": 2.7835940386747138e-05, "loss": 8.1885, "step": 7672 }, { "epoch": 0.9444854751354013, "grad_norm": 0.2708369791507721, "learning_rate": 2.7774356447838407e-05, "loss": 9.4035, "step": 7673 }, { "epoch": 0.9446085672082718, "grad_norm": 0.11547525972127914, "learning_rate": 2.7712772508929674e-05, "loss": 7.5336, 
"step": 7674 }, { "epoch": 0.9447316592811423, "grad_norm": 0.13251791894435883, "learning_rate": 2.7651188570020936e-05, "loss": 8.168, "step": 7675 }, { "epoch": 0.9448547513540128, "grad_norm": 0.08494288474321365, "learning_rate": 2.7589604631112206e-05, "loss": 8.087, "step": 7676 }, { "epoch": 0.9449778434268833, "grad_norm": 0.14107182621955872, "learning_rate": 2.7528020692203472e-05, "loss": 7.7499, "step": 7677 }, { "epoch": 0.9451009354997538, "grad_norm": 0.11346118897199631, "learning_rate": 2.746643675329474e-05, "loss": 7.6911, "step": 7678 }, { "epoch": 0.9452240275726244, "grad_norm": 0.11962661147117615, "learning_rate": 2.7404852814386008e-05, "loss": 7.417, "step": 7679 }, { "epoch": 0.9453471196454948, "grad_norm": 0.07649039477109909, "learning_rate": 2.7343268875477275e-05, "loss": 7.5149, "step": 7680 }, { "epoch": 0.9454702117183653, "grad_norm": 0.07322987914085388, "learning_rate": 2.7281684936568544e-05, "loss": 7.8279, "step": 7681 }, { "epoch": 0.9455933037912359, "grad_norm": 0.12698489427566528, "learning_rate": 2.722010099765981e-05, "loss": 7.3561, "step": 7682 }, { "epoch": 0.9457163958641064, "grad_norm": 0.06232035532593727, "learning_rate": 2.7158517058751077e-05, "loss": 7.6664, "step": 7683 }, { "epoch": 0.9458394879369768, "grad_norm": 0.11815156042575836, "learning_rate": 2.7096933119842346e-05, "loss": 7.3266, "step": 7684 }, { "epoch": 0.9459625800098473, "grad_norm": 0.17523588240146637, "learning_rate": 2.7035349180933613e-05, "loss": 8.305, "step": 7685 }, { "epoch": 0.9460856720827179, "grad_norm": 0.12577396631240845, "learning_rate": 2.697376524202488e-05, "loss": 7.1474, "step": 7686 }, { "epoch": 0.9462087641555884, "grad_norm": 0.08027330785989761, "learning_rate": 2.691218130311615e-05, "loss": 7.4092, "step": 7687 }, { "epoch": 0.9463318562284588, "grad_norm": 0.11838199943304062, "learning_rate": 2.6850597364207415e-05, "loss": 7.3158, "step": 7688 }, { "epoch": 0.9464549483013294, "grad_norm": 
0.11991166323423386, "learning_rate": 2.678901342529868e-05, "loss": 7.4199, "step": 7689 }, { "epoch": 0.9465780403741999, "grad_norm": 0.23012807965278625, "learning_rate": 2.672742948638995e-05, "loss": 8.5183, "step": 7690 }, { "epoch": 0.9467011324470704, "grad_norm": 0.13977941870689392, "learning_rate": 2.6665845547481217e-05, "loss": 7.8263, "step": 7691 }, { "epoch": 0.946824224519941, "grad_norm": 0.24071499705314636, "learning_rate": 2.6604261608572487e-05, "loss": 8.8259, "step": 7692 }, { "epoch": 0.9469473165928114, "grad_norm": 0.06026465445756912, "learning_rate": 2.6542677669663753e-05, "loss": 7.4894, "step": 7693 }, { "epoch": 0.9470704086656819, "grad_norm": 0.5076891183853149, "learning_rate": 2.648109373075502e-05, "loss": 10.0069, "step": 7694 }, { "epoch": 0.9471935007385525, "grad_norm": 0.09817460924386978, "learning_rate": 2.641950979184629e-05, "loss": 7.4275, "step": 7695 }, { "epoch": 0.947316592811423, "grad_norm": 0.12376610189676285, "learning_rate": 2.6357925852937555e-05, "loss": 7.3016, "step": 7696 }, { "epoch": 0.9474396848842934, "grad_norm": 0.17263653874397278, "learning_rate": 2.629634191402882e-05, "loss": 8.712, "step": 7697 }, { "epoch": 0.947562776957164, "grad_norm": 0.06355398148298264, "learning_rate": 2.623475797512009e-05, "loss": 7.597, "step": 7698 }, { "epoch": 0.9476858690300345, "grad_norm": 0.109030582010746, "learning_rate": 2.6173174036211357e-05, "loss": 7.4422, "step": 7699 }, { "epoch": 0.947808961102905, "grad_norm": 0.08386003971099854, "learning_rate": 2.6111590097302624e-05, "loss": 7.3712, "step": 7700 }, { "epoch": 0.9479320531757754, "grad_norm": 0.15850241482257843, "learning_rate": 2.6050006158393893e-05, "loss": 7.5173, "step": 7701 }, { "epoch": 0.948055145248646, "grad_norm": 0.07631687819957733, "learning_rate": 2.598842221948516e-05, "loss": 7.598, "step": 7702 }, { "epoch": 0.9481782373215165, "grad_norm": 0.07944462448358536, "learning_rate": 2.592683828057643e-05, "loss": 7.5689, "step": 
7703 }, { "epoch": 0.948301329394387, "grad_norm": 0.12855695188045502, "learning_rate": 2.5865254341667692e-05, "loss": 7.4426, "step": 7704 }, { "epoch": 0.9484244214672575, "grad_norm": 0.09984009712934494, "learning_rate": 2.580367040275896e-05, "loss": 7.5344, "step": 7705 }, { "epoch": 0.948547513540128, "grad_norm": 0.17975853383541107, "learning_rate": 2.5742086463850228e-05, "loss": 7.1993, "step": 7706 }, { "epoch": 0.9486706056129985, "grad_norm": 0.0687863826751709, "learning_rate": 2.5680502524941494e-05, "loss": 7.3793, "step": 7707 }, { "epoch": 0.9487936976858691, "grad_norm": 0.08975662291049957, "learning_rate": 2.561891858603276e-05, "loss": 7.4967, "step": 7708 }, { "epoch": 0.9489167897587395, "grad_norm": 0.06344638764858246, "learning_rate": 2.555733464712403e-05, "loss": 7.3677, "step": 7709 }, { "epoch": 0.94903988183161, "grad_norm": 0.10553096979856491, "learning_rate": 2.5495750708215297e-05, "loss": 7.4728, "step": 7710 }, { "epoch": 0.9491629739044806, "grad_norm": 0.08730894327163696, "learning_rate": 2.5434166769306566e-05, "loss": 7.738, "step": 7711 }, { "epoch": 0.9492860659773511, "grad_norm": 0.07025209069252014, "learning_rate": 2.5372582830397832e-05, "loss": 7.4142, "step": 7712 }, { "epoch": 0.9494091580502215, "grad_norm": 0.13337668776512146, "learning_rate": 2.53109988914891e-05, "loss": 8.1494, "step": 7713 }, { "epoch": 0.9495322501230921, "grad_norm": 0.06304029375314713, "learning_rate": 2.524941495258037e-05, "loss": 7.5629, "step": 7714 }, { "epoch": 0.9496553421959626, "grad_norm": 0.12624700367450714, "learning_rate": 2.5187831013671635e-05, "loss": 7.8626, "step": 7715 }, { "epoch": 0.9497784342688331, "grad_norm": 0.07741422951221466, "learning_rate": 2.51262470747629e-05, "loss": 7.3875, "step": 7716 }, { "epoch": 0.9499015263417036, "grad_norm": 0.08390509337186813, "learning_rate": 2.506466313585417e-05, "loss": 7.4725, "step": 7717 }, { "epoch": 0.9500246184145741, "grad_norm": 0.13668082654476166, 
"learning_rate": 2.5003079196945437e-05, "loss": 8.0612, "step": 7718 }, { "epoch": 0.9501477104874446, "grad_norm": 0.1431788057088852, "learning_rate": 2.4941495258036703e-05, "loss": 8.2174, "step": 7719 }, { "epoch": 0.9502708025603152, "grad_norm": 0.13919700682163239, "learning_rate": 2.4879911319127973e-05, "loss": 8.0474, "step": 7720 }, { "epoch": 0.9503938946331856, "grad_norm": 0.12162638455629349, "learning_rate": 2.481832738021924e-05, "loss": 8.1648, "step": 7721 }, { "epoch": 0.9505169867060561, "grad_norm": 0.34191009402275085, "learning_rate": 2.475674344131051e-05, "loss": 9.2287, "step": 7722 }, { "epoch": 0.9506400787789266, "grad_norm": 0.10441829264163971, "learning_rate": 2.4695159502401775e-05, "loss": 7.5445, "step": 7723 }, { "epoch": 0.9507631708517972, "grad_norm": 0.10263702273368835, "learning_rate": 2.463357556349304e-05, "loss": 8.1805, "step": 7724 }, { "epoch": 0.9508862629246676, "grad_norm": 0.1018616333603859, "learning_rate": 2.457199162458431e-05, "loss": 7.5065, "step": 7725 }, { "epoch": 0.9510093549975381, "grad_norm": 0.09157831221818924, "learning_rate": 2.4510407685675577e-05, "loss": 8.0794, "step": 7726 }, { "epoch": 0.9511324470704087, "grad_norm": 0.08860189467668533, "learning_rate": 2.4448823746766843e-05, "loss": 7.9688, "step": 7727 }, { "epoch": 0.9512555391432792, "grad_norm": 0.12843021750450134, "learning_rate": 2.4387239807858113e-05, "loss": 7.3663, "step": 7728 }, { "epoch": 0.9513786312161496, "grad_norm": 0.08675448596477509, "learning_rate": 2.432565586894938e-05, "loss": 7.6522, "step": 7729 }, { "epoch": 0.9515017232890202, "grad_norm": 0.09720155596733093, "learning_rate": 2.4264071930040646e-05, "loss": 7.6005, "step": 7730 }, { "epoch": 0.9516248153618907, "grad_norm": 0.14456398785114288, "learning_rate": 2.4202487991131915e-05, "loss": 7.4147, "step": 7731 }, { "epoch": 0.9517479074347612, "grad_norm": 0.0813179463148117, "learning_rate": 2.414090405222318e-05, "loss": 8.1317, "step": 7732 }, { 
"epoch": 0.9518709995076317, "grad_norm": 0.07085121423006058, "learning_rate": 2.4079320113314448e-05, "loss": 7.6266, "step": 7733 }, { "epoch": 0.9519940915805022, "grad_norm": 0.10435081273317337, "learning_rate": 2.4017736174405714e-05, "loss": 7.8354, "step": 7734 }, { "epoch": 0.9521171836533727, "grad_norm": 0.12031614780426025, "learning_rate": 2.395615223549698e-05, "loss": 7.4482, "step": 7735 }, { "epoch": 0.9522402757262433, "grad_norm": 0.07498179376125336, "learning_rate": 2.389456829658825e-05, "loss": 7.6189, "step": 7736 }, { "epoch": 0.9523633677991138, "grad_norm": 0.08495650440454483, "learning_rate": 2.3832984357679516e-05, "loss": 7.6934, "step": 7737 }, { "epoch": 0.9524864598719842, "grad_norm": 0.06399203091859818, "learning_rate": 2.3771400418770783e-05, "loss": 7.5957, "step": 7738 }, { "epoch": 0.9526095519448547, "grad_norm": 0.15278667211532593, "learning_rate": 2.3709816479862052e-05, "loss": 7.7346, "step": 7739 }, { "epoch": 0.9527326440177253, "grad_norm": 0.060843829065561295, "learning_rate": 2.364823254095332e-05, "loss": 7.6065, "step": 7740 }, { "epoch": 0.9528557360905958, "grad_norm": 0.07660872489213943, "learning_rate": 2.3586648602044588e-05, "loss": 7.9708, "step": 7741 }, { "epoch": 0.9529788281634662, "grad_norm": 0.3181091248989105, "learning_rate": 2.3525064663135854e-05, "loss": 9.7739, "step": 7742 }, { "epoch": 0.9531019202363368, "grad_norm": 0.08731620013713837, "learning_rate": 2.346348072422712e-05, "loss": 7.4292, "step": 7743 }, { "epoch": 0.9532250123092073, "grad_norm": 0.07170070707798004, "learning_rate": 2.340189678531839e-05, "loss": 7.6105, "step": 7744 }, { "epoch": 0.9533481043820778, "grad_norm": 0.09782066941261292, "learning_rate": 2.3340312846409657e-05, "loss": 7.2723, "step": 7745 }, { "epoch": 0.9534711964549483, "grad_norm": 0.07433781027793884, "learning_rate": 2.3278728907500923e-05, "loss": 7.9056, "step": 7746 }, { "epoch": 0.9535942885278188, "grad_norm": 0.058187276124954224, 
"learning_rate": 2.3217144968592193e-05, "loss": 7.5756, "step": 7747 }, { "epoch": 0.9537173806006893, "grad_norm": 0.09949768334627151, "learning_rate": 2.315556102968346e-05, "loss": 8.1297, "step": 7748 }, { "epoch": 0.9538404726735599, "grad_norm": 0.07203183323144913, "learning_rate": 2.3093977090774725e-05, "loss": 7.5614, "step": 7749 }, { "epoch": 0.9539635647464303, "grad_norm": 0.11172374337911606, "learning_rate": 2.3032393151865995e-05, "loss": 7.7321, "step": 7750 }, { "epoch": 0.9540866568193008, "grad_norm": 0.09541399776935577, "learning_rate": 2.297080921295726e-05, "loss": 7.2519, "step": 7751 }, { "epoch": 0.9542097488921714, "grad_norm": 0.05915282294154167, "learning_rate": 2.290922527404853e-05, "loss": 7.678, "step": 7752 }, { "epoch": 0.9543328409650419, "grad_norm": 0.08779148012399673, "learning_rate": 2.2847641335139797e-05, "loss": 7.4735, "step": 7753 }, { "epoch": 0.9544559330379123, "grad_norm": 0.050297852605581284, "learning_rate": 2.2786057396231063e-05, "loss": 7.7674, "step": 7754 }, { "epoch": 0.9545790251107829, "grad_norm": 0.07146281749010086, "learning_rate": 2.2724473457322333e-05, "loss": 7.4003, "step": 7755 }, { "epoch": 0.9547021171836534, "grad_norm": 0.18551790714263916, "learning_rate": 2.26628895184136e-05, "loss": 7.1319, "step": 7756 }, { "epoch": 0.9548252092565239, "grad_norm": 0.06936411559581757, "learning_rate": 2.2601305579504865e-05, "loss": 7.5035, "step": 7757 }, { "epoch": 0.9549483013293943, "grad_norm": 0.06839428097009659, "learning_rate": 2.2539721640596135e-05, "loss": 7.5163, "step": 7758 }, { "epoch": 0.9550713934022649, "grad_norm": 0.05989509075880051, "learning_rate": 2.24781377016874e-05, "loss": 7.8192, "step": 7759 }, { "epoch": 0.9551944854751354, "grad_norm": 0.15232758224010468, "learning_rate": 2.2416553762778668e-05, "loss": 8.0035, "step": 7760 }, { "epoch": 0.955317577548006, "grad_norm": 0.07372073084115982, "learning_rate": 2.2354969823869937e-05, "loss": 7.7225, "step": 7761 }, { 
"epoch": 0.9554406696208764, "grad_norm": 0.24203965067863464, "learning_rate": 2.22933858849612e-05, "loss": 7.1819, "step": 7762 }, { "epoch": 0.9555637616937469, "grad_norm": 0.08680632710456848, "learning_rate": 2.223180194605247e-05, "loss": 7.3319, "step": 7763 }, { "epoch": 0.9556868537666174, "grad_norm": 0.06007584556937218, "learning_rate": 2.2170218007143736e-05, "loss": 7.5158, "step": 7764 }, { "epoch": 0.955809945839488, "grad_norm": 0.15875287353992462, "learning_rate": 2.2108634068235002e-05, "loss": 8.4223, "step": 7765 }, { "epoch": 0.9559330379123584, "grad_norm": 0.08957551419734955, "learning_rate": 2.2047050129326272e-05, "loss": 7.6699, "step": 7766 }, { "epoch": 0.9560561299852289, "grad_norm": 0.11794198304414749, "learning_rate": 2.198546619041754e-05, "loss": 7.6031, "step": 7767 }, { "epoch": 0.9561792220580995, "grad_norm": 0.07680243998765945, "learning_rate": 2.1923882251508805e-05, "loss": 7.4698, "step": 7768 }, { "epoch": 0.95630231413097, "grad_norm": 0.08424750715494156, "learning_rate": 2.1862298312600074e-05, "loss": 7.4519, "step": 7769 }, { "epoch": 0.9564254062038404, "grad_norm": 0.1704486906528473, "learning_rate": 2.180071437369134e-05, "loss": 8.2308, "step": 7770 }, { "epoch": 0.956548498276711, "grad_norm": 0.07403373718261719, "learning_rate": 2.173913043478261e-05, "loss": 7.5569, "step": 7771 }, { "epoch": 0.9566715903495815, "grad_norm": 0.07388671487569809, "learning_rate": 2.1677546495873876e-05, "loss": 7.5564, "step": 7772 }, { "epoch": 0.956794682422452, "grad_norm": 0.12656550109386444, "learning_rate": 2.1615962556965143e-05, "loss": 8.1046, "step": 7773 }, { "epoch": 0.9569177744953224, "grad_norm": 0.05428345873951912, "learning_rate": 2.1554378618056412e-05, "loss": 7.5588, "step": 7774 }, { "epoch": 0.957040866568193, "grad_norm": 0.09900251030921936, "learning_rate": 2.149279467914768e-05, "loss": 7.782, "step": 7775 }, { "epoch": 0.9571639586410635, "grad_norm": 0.05935357138514519, "learning_rate": 
2.1431210740238945e-05, "loss": 7.63, "step": 7776 }, { "epoch": 0.957287050713934, "grad_norm": 0.06127464398741722, "learning_rate": 2.1369626801330215e-05, "loss": 7.7027, "step": 7777 }, { "epoch": 0.9574101427868045, "grad_norm": 0.08849558979272842, "learning_rate": 2.130804286242148e-05, "loss": 7.6636, "step": 7778 }, { "epoch": 0.957533234859675, "grad_norm": 0.06593027710914612, "learning_rate": 2.1246458923512747e-05, "loss": 7.5141, "step": 7779 }, { "epoch": 0.9576563269325455, "grad_norm": 0.07647418230772018, "learning_rate": 2.1184874984604017e-05, "loss": 7.6591, "step": 7780 }, { "epoch": 0.9577794190054161, "grad_norm": 0.23882552981376648, "learning_rate": 2.1123291045695283e-05, "loss": 8.7436, "step": 7781 }, { "epoch": 0.9579025110782866, "grad_norm": 0.17687678337097168, "learning_rate": 2.1061707106786553e-05, "loss": 8.5498, "step": 7782 }, { "epoch": 0.958025603151157, "grad_norm": 0.059393562376499176, "learning_rate": 2.100012316787782e-05, "loss": 7.7967, "step": 7783 }, { "epoch": 0.9581486952240276, "grad_norm": 0.09359341114759445, "learning_rate": 2.0938539228969085e-05, "loss": 7.4742, "step": 7784 }, { "epoch": 0.9582717872968981, "grad_norm": 0.16584566235542297, "learning_rate": 2.0876955290060355e-05, "loss": 7.4425, "step": 7785 }, { "epoch": 0.9583948793697686, "grad_norm": 0.13933442533016205, "learning_rate": 2.081537135115162e-05, "loss": 8.0084, "step": 7786 }, { "epoch": 0.9585179714426391, "grad_norm": 0.3176126778125763, "learning_rate": 2.0753787412242887e-05, "loss": 7.594, "step": 7787 }, { "epoch": 0.9586410635155096, "grad_norm": 0.4576902389526367, "learning_rate": 2.0692203473334157e-05, "loss": 9.0823, "step": 7788 }, { "epoch": 0.9587641555883801, "grad_norm": 0.12399367988109589, "learning_rate": 2.0630619534425423e-05, "loss": 7.5471, "step": 7789 }, { "epoch": 0.9588872476612507, "grad_norm": 0.12906870245933533, "learning_rate": 2.0569035595516693e-05, "loss": 7.3962, "step": 7790 }, { "epoch": 
0.9590103397341211, "grad_norm": 0.054281558841466904, "learning_rate": 2.0507451656607956e-05, "loss": 7.8536, "step": 7791 }, { "epoch": 0.9591334318069916, "grad_norm": 0.058623190969228745, "learning_rate": 2.0445867717699222e-05, "loss": 7.6506, "step": 7792 }, { "epoch": 0.9592565238798622, "grad_norm": 0.15956249833106995, "learning_rate": 2.0384283778790492e-05, "loss": 8.3702, "step": 7793 }, { "epoch": 0.9593796159527327, "grad_norm": 0.11768453568220139, "learning_rate": 2.0322699839881758e-05, "loss": 7.4512, "step": 7794 }, { "epoch": 0.9595027080256031, "grad_norm": 0.0988493412733078, "learning_rate": 2.0261115900973024e-05, "loss": 7.5128, "step": 7795 }, { "epoch": 0.9596258000984736, "grad_norm": 0.0935436561703682, "learning_rate": 2.0199531962064294e-05, "loss": 7.4424, "step": 7796 }, { "epoch": 0.9597488921713442, "grad_norm": 0.09725917130708694, "learning_rate": 2.013794802315556e-05, "loss": 7.3369, "step": 7797 }, { "epoch": 0.9598719842442147, "grad_norm": 0.14695890247821808, "learning_rate": 2.0076364084246827e-05, "loss": 8.3312, "step": 7798 }, { "epoch": 0.9599950763170851, "grad_norm": 0.08078181743621826, "learning_rate": 2.0014780145338096e-05, "loss": 7.51, "step": 7799 }, { "epoch": 0.9601181683899557, "grad_norm": 0.07462827861309052, "learning_rate": 1.9953196206429363e-05, "loss": 7.4429, "step": 7800 }, { "epoch": 0.9602412604628262, "grad_norm": 0.11787774413824081, "learning_rate": 1.9891612267520632e-05, "loss": 7.6792, "step": 7801 }, { "epoch": 0.9603643525356967, "grad_norm": 0.07733045518398285, "learning_rate": 1.98300283286119e-05, "loss": 7.9835, "step": 7802 }, { "epoch": 0.9604874446085672, "grad_norm": 0.08424054086208344, "learning_rate": 1.9768444389703165e-05, "loss": 7.3968, "step": 7803 }, { "epoch": 0.9606105366814377, "grad_norm": 0.14528466761112213, "learning_rate": 1.9706860450794434e-05, "loss": 7.742, "step": 7804 }, { "epoch": 0.9607336287543082, "grad_norm": 0.06666228920221329, "learning_rate": 
1.96452765118857e-05, "loss": 7.464, "step": 7805 }, { "epoch": 0.9608567208271788, "grad_norm": 0.081586092710495, "learning_rate": 1.9583692572976967e-05, "loss": 7.9735, "step": 7806 }, { "epoch": 0.9609798129000492, "grad_norm": 0.13718129694461823, "learning_rate": 1.9522108634068237e-05, "loss": 7.9315, "step": 7807 }, { "epoch": 0.9611029049729197, "grad_norm": 0.07156328856945038, "learning_rate": 1.9460524695159503e-05, "loss": 7.5248, "step": 7808 }, { "epoch": 0.9612259970457903, "grad_norm": 0.07048632204532623, "learning_rate": 1.939894075625077e-05, "loss": 7.322, "step": 7809 }, { "epoch": 0.9613490891186608, "grad_norm": 0.09961642324924469, "learning_rate": 1.933735681734204e-05, "loss": 7.2404, "step": 7810 }, { "epoch": 0.9614721811915312, "grad_norm": 0.06073934957385063, "learning_rate": 1.9275772878433305e-05, "loss": 7.5307, "step": 7811 }, { "epoch": 0.9615952732644018, "grad_norm": 0.09650294482707977, "learning_rate": 1.9214188939524575e-05, "loss": 7.5683, "step": 7812 }, { "epoch": 0.9617183653372723, "grad_norm": 0.05585285648703575, "learning_rate": 1.915260500061584e-05, "loss": 7.595, "step": 7813 }, { "epoch": 0.9618414574101428, "grad_norm": 0.14242658019065857, "learning_rate": 1.9091021061707107e-05, "loss": 7.9337, "step": 7814 }, { "epoch": 0.9619645494830132, "grad_norm": 0.6545012593269348, "learning_rate": 1.9029437122798377e-05, "loss": 10.4562, "step": 7815 }, { "epoch": 0.9620876415558838, "grad_norm": 0.13337866961956024, "learning_rate": 1.8967853183889643e-05, "loss": 7.911, "step": 7816 }, { "epoch": 0.9622107336287543, "grad_norm": 0.05835700407624245, "learning_rate": 1.890626924498091e-05, "loss": 7.6794, "step": 7817 }, { "epoch": 0.9623338257016248, "grad_norm": 0.0948629379272461, "learning_rate": 1.884468530607218e-05, "loss": 7.7086, "step": 7818 }, { "epoch": 0.9624569177744953, "grad_norm": 0.5815930366516113, "learning_rate": 1.8783101367163445e-05, "loss": 10.2458, "step": 7819 }, { "epoch": 
0.9625800098473658, "grad_norm": 0.07408794015645981, "learning_rate": 1.8721517428254708e-05, "loss": 7.6475, "step": 7820 }, { "epoch": 0.9627031019202363, "grad_norm": 0.07307396084070206, "learning_rate": 1.8659933489345978e-05, "loss": 8.0441, "step": 7821 }, { "epoch": 0.9628261939931069, "grad_norm": 0.1484157145023346, "learning_rate": 1.8598349550437244e-05, "loss": 7.2349, "step": 7822 }, { "epoch": 0.9629492860659774, "grad_norm": 0.15899217128753662, "learning_rate": 1.8536765611528514e-05, "loss": 7.2416, "step": 7823 }, { "epoch": 0.9630723781388478, "grad_norm": 0.11866362392902374, "learning_rate": 1.847518167261978e-05, "loss": 7.4778, "step": 7824 }, { "epoch": 0.9631954702117184, "grad_norm": 0.14261257648468018, "learning_rate": 1.8413597733711046e-05, "loss": 7.4091, "step": 7825 }, { "epoch": 0.9633185622845889, "grad_norm": 0.2783839702606201, "learning_rate": 1.8352013794802316e-05, "loss": 8.7635, "step": 7826 }, { "epoch": 0.9634416543574594, "grad_norm": 0.07626742869615555, "learning_rate": 1.8290429855893582e-05, "loss": 7.5495, "step": 7827 }, { "epoch": 0.9635647464303299, "grad_norm": 0.11408185958862305, "learning_rate": 1.822884591698485e-05, "loss": 7.9352, "step": 7828 }, { "epoch": 0.9636878385032004, "grad_norm": 0.054031435400247574, "learning_rate": 1.8167261978076118e-05, "loss": 7.7013, "step": 7829 }, { "epoch": 0.9638109305760709, "grad_norm": 0.06289856135845184, "learning_rate": 1.8105678039167385e-05, "loss": 7.8538, "step": 7830 }, { "epoch": 0.9639340226489415, "grad_norm": 0.10975763201713562, "learning_rate": 1.8044094100258654e-05, "loss": 7.325, "step": 7831 }, { "epoch": 0.9640571147218119, "grad_norm": 0.06584373116493225, "learning_rate": 1.798251016134992e-05, "loss": 7.5239, "step": 7832 }, { "epoch": 0.9641802067946824, "grad_norm": 0.10920962691307068, "learning_rate": 1.7920926222441187e-05, "loss": 7.3378, "step": 7833 }, { "epoch": 0.964303298867553, "grad_norm": 0.06120305135846138, "learning_rate": 
1.7859342283532456e-05, "loss": 7.7902, "step": 7834 }, { "epoch": 0.9644263909404235, "grad_norm": 0.09898263961076736, "learning_rate": 1.7797758344623723e-05, "loss": 7.5011, "step": 7835 }, { "epoch": 0.9645494830132939, "grad_norm": 0.11279581487178802, "learning_rate": 1.773617440571499e-05, "loss": 7.6373, "step": 7836 }, { "epoch": 0.9646725750861644, "grad_norm": 0.09053152799606323, "learning_rate": 1.767459046680626e-05, "loss": 7.4568, "step": 7837 }, { "epoch": 0.964795667159035, "grad_norm": 0.0956459492444992, "learning_rate": 1.7613006527897525e-05, "loss": 7.6651, "step": 7838 }, { "epoch": 0.9649187592319055, "grad_norm": 0.10522893071174622, "learning_rate": 1.755142258898879e-05, "loss": 7.7928, "step": 7839 }, { "epoch": 0.9650418513047759, "grad_norm": 0.10105326771736145, "learning_rate": 1.748983865008006e-05, "loss": 7.6874, "step": 7840 }, { "epoch": 0.9651649433776465, "grad_norm": 0.11658690124750137, "learning_rate": 1.7428254711171327e-05, "loss": 8.0102, "step": 7841 }, { "epoch": 0.965288035450517, "grad_norm": 0.10585947334766388, "learning_rate": 1.7366670772262597e-05, "loss": 7.9701, "step": 7842 }, { "epoch": 0.9654111275233875, "grad_norm": 0.06189986318349838, "learning_rate": 1.7305086833353863e-05, "loss": 7.492, "step": 7843 }, { "epoch": 0.965534219596258, "grad_norm": 0.08441099524497986, "learning_rate": 1.724350289444513e-05, "loss": 8.3199, "step": 7844 }, { "epoch": 0.9656573116691285, "grad_norm": 0.19998380541801453, "learning_rate": 1.71819189555364e-05, "loss": 8.4868, "step": 7845 }, { "epoch": 0.965780403741999, "grad_norm": 0.054289430379867554, "learning_rate": 1.7120335016627665e-05, "loss": 7.7934, "step": 7846 }, { "epoch": 0.9659034958148696, "grad_norm": 0.10744171589612961, "learning_rate": 1.705875107771893e-05, "loss": 7.226, "step": 7847 }, { "epoch": 0.96602658788774, "grad_norm": 0.07907019555568695, "learning_rate": 1.69971671388102e-05, "loss": 7.7934, "step": 7848 }, { "epoch": 
0.9661496799606105, "grad_norm": 0.08951673656702042, "learning_rate": 1.6935583199901464e-05, "loss": 7.3353, "step": 7849 }, { "epoch": 0.966272772033481, "grad_norm": 0.0905202329158783, "learning_rate": 1.687399926099273e-05, "loss": 7.6027, "step": 7850 }, { "epoch": 0.9663958641063516, "grad_norm": 0.09516578167676926, "learning_rate": 1.6812415322084e-05, "loss": 8.0285, "step": 7851 }, { "epoch": 0.966518956179222, "grad_norm": 0.06756965816020966, "learning_rate": 1.6750831383175266e-05, "loss": 7.417, "step": 7852 }, { "epoch": 0.9666420482520925, "grad_norm": 0.11579146981239319, "learning_rate": 1.6689247444266536e-05, "loss": 7.2482, "step": 7853 }, { "epoch": 0.9667651403249631, "grad_norm": 0.11721136420965195, "learning_rate": 1.6627663505357802e-05, "loss": 8.1646, "step": 7854 }, { "epoch": 0.9668882323978336, "grad_norm": 0.13012957572937012, "learning_rate": 1.656607956644907e-05, "loss": 8.1175, "step": 7855 }, { "epoch": 0.967011324470704, "grad_norm": 0.18883639574050903, "learning_rate": 1.6504495627540338e-05, "loss": 8.5748, "step": 7856 }, { "epoch": 0.9671344165435746, "grad_norm": 0.08544665575027466, "learning_rate": 1.6442911688631604e-05, "loss": 7.7889, "step": 7857 }, { "epoch": 0.9672575086164451, "grad_norm": 0.1451387107372284, "learning_rate": 1.638132774972287e-05, "loss": 7.8564, "step": 7858 }, { "epoch": 0.9673806006893156, "grad_norm": 0.06541730463504791, "learning_rate": 1.631974381081414e-05, "loss": 7.4967, "step": 7859 }, { "epoch": 0.9675036927621861, "grad_norm": 0.08256831020116806, "learning_rate": 1.6258159871905406e-05, "loss": 8.0135, "step": 7860 }, { "epoch": 0.9676267848350566, "grad_norm": 0.09452391415834427, "learning_rate": 1.6196575932996676e-05, "loss": 7.5491, "step": 7861 }, { "epoch": 0.9677498769079271, "grad_norm": 0.06935365498065948, "learning_rate": 1.6134991994087942e-05, "loss": 7.4329, "step": 7862 }, { "epoch": 0.9678729689807977, "grad_norm": 0.0667133629322052, "learning_rate": 
1.607340805517921e-05, "loss": 7.6373, "step": 7863 }, { "epoch": 0.9679960610536681, "grad_norm": 0.07287447899580002, "learning_rate": 1.601182411627048e-05, "loss": 7.6613, "step": 7864 }, { "epoch": 0.9681191531265386, "grad_norm": 0.08299607038497925, "learning_rate": 1.5950240177361745e-05, "loss": 7.6149, "step": 7865 }, { "epoch": 0.9682422451994092, "grad_norm": 0.10835486650466919, "learning_rate": 1.588865623845301e-05, "loss": 8.1102, "step": 7866 }, { "epoch": 0.9683653372722797, "grad_norm": 0.10753297805786133, "learning_rate": 1.582707229954428e-05, "loss": 7.6297, "step": 7867 }, { "epoch": 0.9684884293451502, "grad_norm": 0.07814571261405945, "learning_rate": 1.5765488360635547e-05, "loss": 7.5658, "step": 7868 }, { "epoch": 0.9686115214180206, "grad_norm": 0.09990409761667252, "learning_rate": 1.5703904421726813e-05, "loss": 7.8765, "step": 7869 }, { "epoch": 0.9687346134908912, "grad_norm": 0.0905599370598793, "learning_rate": 1.5642320482818083e-05, "loss": 7.7691, "step": 7870 }, { "epoch": 0.9688577055637617, "grad_norm": 0.08937303721904755, "learning_rate": 1.558073654390935e-05, "loss": 7.6857, "step": 7871 }, { "epoch": 0.9689807976366323, "grad_norm": 0.1752159148454666, "learning_rate": 1.5519152605000615e-05, "loss": 7.5462, "step": 7872 }, { "epoch": 0.9691038897095027, "grad_norm": 0.06808692216873169, "learning_rate": 1.545756866609188e-05, "loss": 7.5, "step": 7873 }, { "epoch": 0.9692269817823732, "grad_norm": 0.09975583106279373, "learning_rate": 1.539598472718315e-05, "loss": 7.276, "step": 7874 }, { "epoch": 0.9693500738552437, "grad_norm": 0.07686927169561386, "learning_rate": 1.5334400788274417e-05, "loss": 7.5183, "step": 7875 }, { "epoch": 0.9694731659281143, "grad_norm": 0.06591162830591202, "learning_rate": 1.5272816849365687e-05, "loss": 7.7499, "step": 7876 }, { "epoch": 0.9695962580009847, "grad_norm": 0.120417021214962, "learning_rate": 1.5211232910456953e-05, "loss": 8.0421, "step": 7877 }, { "epoch": 
0.9697193500738552, "grad_norm": 0.09617747366428375, "learning_rate": 1.5149648971548221e-05, "loss": 7.5468, "step": 7878 }, { "epoch": 0.9698424421467258, "grad_norm": 0.08517944067716599, "learning_rate": 1.5088065032639488e-05, "loss": 7.4551, "step": 7879 }, { "epoch": 0.9699655342195963, "grad_norm": 0.26108911633491516, "learning_rate": 1.5026481093730756e-05, "loss": 8.5923, "step": 7880 }, { "epoch": 0.9700886262924667, "grad_norm": 0.0775558277964592, "learning_rate": 1.4964897154822024e-05, "loss": 7.8864, "step": 7881 }, { "epoch": 0.9702117183653373, "grad_norm": 0.06582419574260712, "learning_rate": 1.490331321591329e-05, "loss": 7.6857, "step": 7882 }, { "epoch": 0.9703348104382078, "grad_norm": 0.15185777842998505, "learning_rate": 1.4841729277004558e-05, "loss": 7.5675, "step": 7883 }, { "epoch": 0.9704579025110783, "grad_norm": 0.12779438495635986, "learning_rate": 1.4780145338095826e-05, "loss": 7.3129, "step": 7884 }, { "epoch": 0.9705809945839488, "grad_norm": 0.3357086479663849, "learning_rate": 1.4718561399187092e-05, "loss": 9.1335, "step": 7885 }, { "epoch": 0.9707040866568193, "grad_norm": 0.07547042518854141, "learning_rate": 1.4656977460278358e-05, "loss": 7.5418, "step": 7886 }, { "epoch": 0.9708271787296898, "grad_norm": 0.07631432265043259, "learning_rate": 1.4595393521369626e-05, "loss": 7.5783, "step": 7887 }, { "epoch": 0.9709502708025604, "grad_norm": 0.07553259283304214, "learning_rate": 1.4533809582460894e-05, "loss": 7.5963, "step": 7888 }, { "epoch": 0.9710733628754308, "grad_norm": 0.1440650075674057, "learning_rate": 1.4472225643552162e-05, "loss": 7.2356, "step": 7889 }, { "epoch": 0.9711964549483013, "grad_norm": 0.08099152892827988, "learning_rate": 1.4410641704643428e-05, "loss": 7.5763, "step": 7890 }, { "epoch": 0.9713195470211718, "grad_norm": 0.23060284554958344, "learning_rate": 1.4349057765734696e-05, "loss": 8.0908, "step": 7891 }, { "epoch": 0.9714426390940424, "grad_norm": 0.13718226552009583, "learning_rate": 
1.4287473826825964e-05, "loss": 7.1098, "step": 7892 }, { "epoch": 0.9715657311669128, "grad_norm": 0.06761857122182846, "learning_rate": 1.4225889887917232e-05, "loss": 7.5742, "step": 7893 }, { "epoch": 0.9716888232397833, "grad_norm": 0.30144381523132324, "learning_rate": 1.4164305949008499e-05, "loss": 8.3864, "step": 7894 }, { "epoch": 0.9718119153126539, "grad_norm": 0.06516227126121521, "learning_rate": 1.4102722010099767e-05, "loss": 7.5607, "step": 7895 }, { "epoch": 0.9719350073855244, "grad_norm": 0.1838417500257492, "learning_rate": 1.4041138071191035e-05, "loss": 8.6731, "step": 7896 }, { "epoch": 0.9720580994583948, "grad_norm": 0.08201957494020462, "learning_rate": 1.39795541322823e-05, "loss": 7.3535, "step": 7897 }, { "epoch": 0.9721811915312654, "grad_norm": 0.06747211515903473, "learning_rate": 1.3917970193373569e-05, "loss": 7.6761, "step": 7898 }, { "epoch": 0.9723042836041359, "grad_norm": 0.09365446120500565, "learning_rate": 1.3856386254464837e-05, "loss": 7.2697, "step": 7899 }, { "epoch": 0.9724273756770064, "grad_norm": 0.06413117051124573, "learning_rate": 1.3794802315556103e-05, "loss": 7.3866, "step": 7900 }, { "epoch": 0.9725504677498769, "grad_norm": 0.07751385122537613, "learning_rate": 1.373321837664737e-05, "loss": 7.512, "step": 7901 }, { "epoch": 0.9726735598227474, "grad_norm": 0.08189553767442703, "learning_rate": 1.3671634437738637e-05, "loss": 7.7365, "step": 7902 }, { "epoch": 0.9727966518956179, "grad_norm": 0.10708954930305481, "learning_rate": 1.3610050498829905e-05, "loss": 7.5995, "step": 7903 }, { "epoch": 0.9729197439684885, "grad_norm": 0.34641262888908386, "learning_rate": 1.3548466559921173e-05, "loss": 9.5745, "step": 7904 }, { "epoch": 0.9730428360413589, "grad_norm": 0.05883923918008804, "learning_rate": 1.348688262101244e-05, "loss": 7.5232, "step": 7905 }, { "epoch": 0.9731659281142294, "grad_norm": 0.08341957628726959, "learning_rate": 1.3425298682103707e-05, "loss": 7.5027, "step": 7906 }, { "epoch": 
0.9732890201871, "grad_norm": 0.06831005215644836, "learning_rate": 1.3363714743194975e-05, "loss": 7.6142, "step": 7907 }, { "epoch": 0.9734121122599705, "grad_norm": 0.10221446305513382, "learning_rate": 1.3302130804286243e-05, "loss": 7.2639, "step": 7908 }, { "epoch": 0.973535204332841, "grad_norm": 0.09444034099578857, "learning_rate": 1.324054686537751e-05, "loss": 7.4684, "step": 7909 }, { "epoch": 0.9736582964057114, "grad_norm": 0.4323001503944397, "learning_rate": 1.3178962926468778e-05, "loss": 9.3827, "step": 7910 }, { "epoch": 0.973781388478582, "grad_norm": 0.09863726049661636, "learning_rate": 1.3117378987560046e-05, "loss": 7.2422, "step": 7911 }, { "epoch": 0.9739044805514525, "grad_norm": 0.2950666546821594, "learning_rate": 1.3055795048651312e-05, "loss": 9.0594, "step": 7912 }, { "epoch": 0.974027572624323, "grad_norm": 0.24798192083835602, "learning_rate": 1.299421110974258e-05, "loss": 7.363, "step": 7913 }, { "epoch": 0.9741506646971935, "grad_norm": 0.06791159510612488, "learning_rate": 1.2932627170833846e-05, "loss": 7.63, "step": 7914 }, { "epoch": 0.974273756770064, "grad_norm": 0.07884444296360016, "learning_rate": 1.2871043231925114e-05, "loss": 7.644, "step": 7915 }, { "epoch": 0.9743968488429345, "grad_norm": 0.08055132627487183, "learning_rate": 1.280945929301638e-05, "loss": 7.3756, "step": 7916 }, { "epoch": 0.9745199409158051, "grad_norm": 0.07742245495319366, "learning_rate": 1.2747875354107648e-05, "loss": 7.5952, "step": 7917 }, { "epoch": 0.9746430329886755, "grad_norm": 0.45038658380508423, "learning_rate": 1.2686291415198916e-05, "loss": 10.1002, "step": 7918 }, { "epoch": 0.974766125061546, "grad_norm": 0.15530510246753693, "learning_rate": 1.2624707476290184e-05, "loss": 8.3348, "step": 7919 }, { "epoch": 0.9748892171344166, "grad_norm": 0.06150083988904953, "learning_rate": 1.256312353738145e-05, "loss": 7.5999, "step": 7920 }, { "epoch": 0.9750123092072871, "grad_norm": 0.07792465388774872, "learning_rate": 
1.2501539598472718e-05, "loss": 7.6337, "step": 7921 }, { "epoch": 0.9751354012801575, "grad_norm": 0.0792776346206665, "learning_rate": 1.2439955659563986e-05, "loss": 7.4858, "step": 7922 }, { "epoch": 0.975258493353028, "grad_norm": 0.10828694701194763, "learning_rate": 1.2378371720655254e-05, "loss": 7.5774, "step": 7923 }, { "epoch": 0.9753815854258986, "grad_norm": 0.19504518806934357, "learning_rate": 1.231678778174652e-05, "loss": 8.27, "step": 7924 }, { "epoch": 0.9755046774987691, "grad_norm": 0.1582188457250595, "learning_rate": 1.2255203842837789e-05, "loss": 7.7377, "step": 7925 }, { "epoch": 0.9756277695716395, "grad_norm": 0.08141236007213593, "learning_rate": 1.2193619903929057e-05, "loss": 7.493, "step": 7926 }, { "epoch": 0.9757508616445101, "grad_norm": 0.1789511889219284, "learning_rate": 1.2132035965020323e-05, "loss": 8.4402, "step": 7927 }, { "epoch": 0.9758739537173806, "grad_norm": 0.062057819217443466, "learning_rate": 1.207045202611159e-05, "loss": 7.8739, "step": 7928 }, { "epoch": 0.9759970457902511, "grad_norm": 0.10078191012144089, "learning_rate": 1.2008868087202857e-05, "loss": 7.7212, "step": 7929 }, { "epoch": 0.9761201378631216, "grad_norm": 0.1718078851699829, "learning_rate": 1.1947284148294125e-05, "loss": 7.5672, "step": 7930 }, { "epoch": 0.9762432299359921, "grad_norm": 0.18810610473155975, "learning_rate": 1.1885700209385391e-05, "loss": 8.3196, "step": 7931 }, { "epoch": 0.9763663220088626, "grad_norm": 0.05850163474678993, "learning_rate": 1.182411627047666e-05, "loss": 7.7829, "step": 7932 }, { "epoch": 0.9764894140817332, "grad_norm": 0.07000194489955902, "learning_rate": 1.1762532331567927e-05, "loss": 7.7149, "step": 7933 }, { "epoch": 0.9766125061546036, "grad_norm": 0.07933356612920761, "learning_rate": 1.1700948392659195e-05, "loss": 7.8294, "step": 7934 }, { "epoch": 0.9767355982274741, "grad_norm": 0.07905258983373642, "learning_rate": 1.1639364453750461e-05, "loss": 7.7504, "step": 7935 }, { "epoch": 
0.9768586903003447, "grad_norm": 0.10629313439130783, "learning_rate": 1.157778051484173e-05, "loss": 7.3547, "step": 7936 }, { "epoch": 0.9769817823732152, "grad_norm": 0.1485641896724701, "learning_rate": 1.1516196575932997e-05, "loss": 8.183, "step": 7937 }, { "epoch": 0.9771048744460856, "grad_norm": 0.10701150447130203, "learning_rate": 1.1454612637024265e-05, "loss": 7.3482, "step": 7938 }, { "epoch": 0.9772279665189562, "grad_norm": 0.09296116232872009, "learning_rate": 1.1393028698115532e-05, "loss": 8.3587, "step": 7939 }, { "epoch": 0.9773510585918267, "grad_norm": 0.10372128337621689, "learning_rate": 1.13314447592068e-05, "loss": 7.2891, "step": 7940 }, { "epoch": 0.9774741506646972, "grad_norm": 0.084517702460289, "learning_rate": 1.1269860820298068e-05, "loss": 7.3753, "step": 7941 }, { "epoch": 0.9775972427375677, "grad_norm": 0.08387653529644012, "learning_rate": 1.1208276881389334e-05, "loss": 8.043, "step": 7942 }, { "epoch": 0.9777203348104382, "grad_norm": 0.0656806007027626, "learning_rate": 1.11466929424806e-05, "loss": 7.5237, "step": 7943 }, { "epoch": 0.9778434268833087, "grad_norm": 0.09790285676717758, "learning_rate": 1.1085109003571868e-05, "loss": 7.3749, "step": 7944 }, { "epoch": 0.9779665189561793, "grad_norm": 0.10579422116279602, "learning_rate": 1.1023525064663136e-05, "loss": 7.6824, "step": 7945 }, { "epoch": 0.9780896110290497, "grad_norm": 0.1410045474767685, "learning_rate": 1.0961941125754402e-05, "loss": 8.1863, "step": 7946 }, { "epoch": 0.9782127031019202, "grad_norm": 0.12319190800189972, "learning_rate": 1.090035718684567e-05, "loss": 7.4116, "step": 7947 }, { "epoch": 0.9783357951747907, "grad_norm": 0.11137513816356659, "learning_rate": 1.0838773247936938e-05, "loss": 7.2759, "step": 7948 }, { "epoch": 0.9784588872476613, "grad_norm": 0.09174805879592896, "learning_rate": 1.0777189309028206e-05, "loss": 7.3671, "step": 7949 }, { "epoch": 0.9785819793205317, "grad_norm": 0.1647607535123825, "learning_rate": 
1.0715605370119472e-05, "loss": 7.2326, "step": 7950 }, { "epoch": 0.9787050713934022, "grad_norm": 0.10930038243532181, "learning_rate": 1.065402143121074e-05, "loss": 8.0687, "step": 7951 }, { "epoch": 0.9788281634662728, "grad_norm": 0.1214551255106926, "learning_rate": 1.0592437492302008e-05, "loss": 7.2668, "step": 7952 }, { "epoch": 0.9789512555391433, "grad_norm": 0.10785821080207825, "learning_rate": 1.0530853553393276e-05, "loss": 7.9188, "step": 7953 }, { "epoch": 0.9790743476120138, "grad_norm": 0.07942941039800644, "learning_rate": 1.0469269614484543e-05, "loss": 7.8497, "step": 7954 }, { "epoch": 0.9791974396848843, "grad_norm": 0.16867949068546295, "learning_rate": 1.040768567557581e-05, "loss": 8.2558, "step": 7955 }, { "epoch": 0.9793205317577548, "grad_norm": 0.07125325500965118, "learning_rate": 1.0346101736667079e-05, "loss": 7.5517, "step": 7956 }, { "epoch": 0.9794436238306253, "grad_norm": 0.25198042392730713, "learning_rate": 1.0284517797758347e-05, "loss": 8.8887, "step": 7957 }, { "epoch": 0.9795667159034959, "grad_norm": 0.15905480086803436, "learning_rate": 1.0222933858849611e-05, "loss": 8.2041, "step": 7958 }, { "epoch": 0.9796898079763663, "grad_norm": 0.41794273257255554, "learning_rate": 1.0161349919940879e-05, "loss": 9.4139, "step": 7959 }, { "epoch": 0.9798129000492368, "grad_norm": 0.08312631398439407, "learning_rate": 1.0099765981032147e-05, "loss": 7.6425, "step": 7960 }, { "epoch": 0.9799359921221074, "grad_norm": 0.10478756576776505, "learning_rate": 1.0038182042123413e-05, "loss": 7.7472, "step": 7961 }, { "epoch": 0.9800590841949779, "grad_norm": 0.08803998678922653, "learning_rate": 9.976598103214681e-06, "loss": 7.5263, "step": 7962 }, { "epoch": 0.9801821762678483, "grad_norm": 0.09465479105710983, "learning_rate": 9.91501416430595e-06, "loss": 7.6915, "step": 7963 }, { "epoch": 0.9803052683407188, "grad_norm": 0.07258441299200058, "learning_rate": 9.853430225397217e-06, "loss": 7.4472, "step": 7964 }, { "epoch": 
0.9804283604135894, "grad_norm": 0.16156059503555298, "learning_rate": 9.791846286488483e-06, "loss": 8.1379, "step": 7965 }, { "epoch": 0.9805514524864599, "grad_norm": 0.09422745555639267, "learning_rate": 9.730262347579751e-06, "loss": 7.8603, "step": 7966 }, { "epoch": 0.9806745445593303, "grad_norm": 0.06663613021373749, "learning_rate": 9.66867840867102e-06, "loss": 7.6298, "step": 7967 }, { "epoch": 0.9807976366322009, "grad_norm": 0.08379367738962173, "learning_rate": 9.607094469762287e-06, "loss": 7.7668, "step": 7968 }, { "epoch": 0.9809207287050714, "grad_norm": 0.12581470608711243, "learning_rate": 9.545510530853554e-06, "loss": 7.2091, "step": 7969 }, { "epoch": 0.9810438207779419, "grad_norm": 0.09297487884759903, "learning_rate": 9.483926591944822e-06, "loss": 7.2713, "step": 7970 }, { "epoch": 0.9811669128508124, "grad_norm": 0.09998232871294022, "learning_rate": 9.42234265303609e-06, "loss": 7.5429, "step": 7971 }, { "epoch": 0.9812900049236829, "grad_norm": 0.14466018974781036, "learning_rate": 9.360758714127354e-06, "loss": 7.4685, "step": 7972 }, { "epoch": 0.9814130969965534, "grad_norm": 0.09577977657318115, "learning_rate": 9.299174775218622e-06, "loss": 7.9148, "step": 7973 }, { "epoch": 0.981536189069424, "grad_norm": 0.07404667884111404, "learning_rate": 9.23759083630989e-06, "loss": 7.4641, "step": 7974 }, { "epoch": 0.9816592811422944, "grad_norm": 0.08847033232450485, "learning_rate": 9.176006897401158e-06, "loss": 7.3317, "step": 7975 }, { "epoch": 0.9817823732151649, "grad_norm": 0.07163817435503006, "learning_rate": 9.114422958492424e-06, "loss": 7.54, "step": 7976 }, { "epoch": 0.9819054652880355, "grad_norm": 0.06533647328615189, "learning_rate": 9.052839019583692e-06, "loss": 7.6876, "step": 7977 }, { "epoch": 0.982028557360906, "grad_norm": 0.12868764996528625, "learning_rate": 8.99125508067496e-06, "loss": 8.3205, "step": 7978 }, { "epoch": 0.9821516494337764, "grad_norm": 0.19029206037521362, "learning_rate": 
8.929671141766228e-06, "loss": 8.6226, "step": 7979 }, { "epoch": 0.982274741506647, "grad_norm": 0.39319267868995667, "learning_rate": 8.868087202857494e-06, "loss": 7.0092, "step": 7980 }, { "epoch": 0.9823978335795175, "grad_norm": 0.155875101685524, "learning_rate": 8.806503263948762e-06, "loss": 7.6898, "step": 7981 }, { "epoch": 0.982520925652388, "grad_norm": 0.15242420136928558, "learning_rate": 8.74491932504003e-06, "loss": 8.2206, "step": 7982 }, { "epoch": 0.9826440177252584, "grad_norm": 0.11500919610261917, "learning_rate": 8.683335386131298e-06, "loss": 7.7049, "step": 7983 }, { "epoch": 0.982767109798129, "grad_norm": 0.09603118896484375, "learning_rate": 8.621751447222565e-06, "loss": 7.5663, "step": 7984 }, { "epoch": 0.9828902018709995, "grad_norm": 0.0813954696059227, "learning_rate": 8.560167508313833e-06, "loss": 7.412, "step": 7985 }, { "epoch": 0.98301329394387, "grad_norm": 0.249077707529068, "learning_rate": 8.4985835694051e-06, "loss": 8.4137, "step": 7986 }, { "epoch": 0.9831363860167405, "grad_norm": 0.09596613049507141, "learning_rate": 8.436999630496365e-06, "loss": 8.0872, "step": 7987 }, { "epoch": 0.983259478089611, "grad_norm": 0.10661241412162781, "learning_rate": 8.375415691587633e-06, "loss": 8.0256, "step": 7988 }, { "epoch": 0.9833825701624815, "grad_norm": 0.09005751460790634, "learning_rate": 8.313831752678901e-06, "loss": 7.5565, "step": 7989 }, { "epoch": 0.9835056622353521, "grad_norm": 0.12378546595573425, "learning_rate": 8.252247813770169e-06, "loss": 8.2361, "step": 7990 }, { "epoch": 0.9836287543082225, "grad_norm": 0.10092364996671677, "learning_rate": 8.190663874861435e-06, "loss": 7.6102, "step": 7991 }, { "epoch": 0.983751846381093, "grad_norm": 0.07418839633464813, "learning_rate": 8.129079935952703e-06, "loss": 7.3456, "step": 7992 }, { "epoch": 0.9838749384539636, "grad_norm": 0.09527162462472916, "learning_rate": 8.067495997043971e-06, "loss": 7.6116, "step": 7993 }, { "epoch": 0.9839980305268341, 
"grad_norm": 0.07651437819004059, "learning_rate": 8.00591205813524e-06, "loss": 7.7941, "step": 7994 }, { "epoch": 0.9841211225997046, "grad_norm": 0.07204202562570572, "learning_rate": 7.944328119226505e-06, "loss": 7.7126, "step": 7995 }, { "epoch": 0.9842442146725751, "grad_norm": 0.0774591863155365, "learning_rate": 7.882744180317773e-06, "loss": 7.611, "step": 7996 }, { "epoch": 0.9843673067454456, "grad_norm": 0.15420256555080414, "learning_rate": 7.821160241409041e-06, "loss": 8.2691, "step": 7997 }, { "epoch": 0.9844903988183161, "grad_norm": 0.32069718837738037, "learning_rate": 7.759576302500308e-06, "loss": 9.0238, "step": 7998 }, { "epoch": 0.9846134908911867, "grad_norm": 0.07477907836437225, "learning_rate": 7.697992363591576e-06, "loss": 7.6825, "step": 7999 }, { "epoch": 0.9847365829640571, "grad_norm": 0.15597552061080933, "learning_rate": 7.636408424682844e-06, "loss": 8.5824, "step": 8000 }, { "epoch": 0.9848596750369276, "grad_norm": 0.26186105608940125, "learning_rate": 7.574824485774111e-06, "loss": 9.145, "step": 8001 }, { "epoch": 0.9849827671097982, "grad_norm": 0.0761583149433136, "learning_rate": 7.513240546865378e-06, "loss": 7.529, "step": 8002 }, { "epoch": 0.9851058591826687, "grad_norm": 0.09454619139432907, "learning_rate": 7.451656607956645e-06, "loss": 7.4606, "step": 8003 }, { "epoch": 0.9852289512555391, "grad_norm": 0.08198606222867966, "learning_rate": 7.390072669047913e-06, "loss": 7.4268, "step": 8004 }, { "epoch": 0.9853520433284096, "grad_norm": 0.12005804479122162, "learning_rate": 7.328488730139179e-06, "loss": 7.4849, "step": 8005 }, { "epoch": 0.9854751354012802, "grad_norm": 0.1137162446975708, "learning_rate": 7.266904791230447e-06, "loss": 7.5097, "step": 8006 }, { "epoch": 0.9855982274741507, "grad_norm": 0.11962690949440002, "learning_rate": 7.205320852321714e-06, "loss": 7.5146, "step": 8007 }, { "epoch": 0.9857213195470211, "grad_norm": 0.14271633327007294, "learning_rate": 7.143736913412982e-06, "loss": 7.223, 
"step": 8008 }, { "epoch": 0.9858444116198917, "grad_norm": 0.07782821357250214, "learning_rate": 7.082152974504249e-06, "loss": 7.7053, "step": 8009 }, { "epoch": 0.9859675036927622, "grad_norm": 0.0939808338880539, "learning_rate": 7.020569035595517e-06, "loss": 7.4986, "step": 8010 }, { "epoch": 0.9860905957656327, "grad_norm": 0.08172006160020828, "learning_rate": 6.958985096686784e-06, "loss": 7.7804, "step": 8011 }, { "epoch": 0.9862136878385032, "grad_norm": 0.0826294869184494, "learning_rate": 6.8974011577780515e-06, "loss": 7.5921, "step": 8012 }, { "epoch": 0.9863367799113737, "grad_norm": 0.07044476270675659, "learning_rate": 6.835817218869319e-06, "loss": 7.661, "step": 8013 }, { "epoch": 0.9864598719842442, "grad_norm": 0.08059514313936234, "learning_rate": 6.774233279960587e-06, "loss": 7.5061, "step": 8014 }, { "epoch": 0.9865829640571148, "grad_norm": 0.08377686887979507, "learning_rate": 6.712649341051854e-06, "loss": 7.8744, "step": 8015 }, { "epoch": 0.9867060561299852, "grad_norm": 0.0747128501534462, "learning_rate": 6.651065402143122e-06, "loss": 7.7697, "step": 8016 }, { "epoch": 0.9868291482028557, "grad_norm": 0.10723543167114258, "learning_rate": 6.589481463234389e-06, "loss": 7.6009, "step": 8017 }, { "epoch": 0.9869522402757263, "grad_norm": 0.062156613916158676, "learning_rate": 6.527897524325656e-06, "loss": 7.8096, "step": 8018 }, { "epoch": 0.9870753323485968, "grad_norm": 0.15002335608005524, "learning_rate": 6.466313585416923e-06, "loss": 7.1669, "step": 8019 }, { "epoch": 0.9871984244214672, "grad_norm": 0.08469066023826599, "learning_rate": 6.40472964650819e-06, "loss": 7.5745, "step": 8020 }, { "epoch": 0.9873215164943377, "grad_norm": 0.1303125023841858, "learning_rate": 6.343145707599458e-06, "loss": 8.0575, "step": 8021 }, { "epoch": 0.9874446085672083, "grad_norm": 0.1879153847694397, "learning_rate": 6.281561768690725e-06, "loss": 7.3143, "step": 8022 }, { "epoch": 0.9875677006400788, "grad_norm": 0.0846349224448204, 
"learning_rate": 6.219977829781993e-06, "loss": 7.481, "step": 8023 }, { "epoch": 0.9876907927129492, "grad_norm": 0.0645938366651535, "learning_rate": 6.15839389087326e-06, "loss": 7.4157, "step": 8024 }, { "epoch": 0.9878138847858198, "grad_norm": 0.14857441186904907, "learning_rate": 6.096809951964528e-06, "loss": 8.2146, "step": 8025 }, { "epoch": 0.9879369768586903, "grad_norm": 0.15142478048801422, "learning_rate": 6.035226013055795e-06, "loss": 7.3343, "step": 8026 }, { "epoch": 0.9880600689315608, "grad_norm": 0.13068324327468872, "learning_rate": 5.9736420741470625e-06, "loss": 7.9604, "step": 8027 }, { "epoch": 0.9881831610044313, "grad_norm": 0.12310665845870972, "learning_rate": 5.91205813523833e-06, "loss": 7.4138, "step": 8028 }, { "epoch": 0.9883062530773018, "grad_norm": 0.0942307636141777, "learning_rate": 5.850474196329598e-06, "loss": 7.5858, "step": 8029 }, { "epoch": 0.9884293451501723, "grad_norm": 0.09924192726612091, "learning_rate": 5.788890257420865e-06, "loss": 7.5069, "step": 8030 }, { "epoch": 0.9885524372230429, "grad_norm": 0.1297287940979004, "learning_rate": 5.727306318512133e-06, "loss": 7.2037, "step": 8031 }, { "epoch": 0.9886755292959133, "grad_norm": 0.08271678537130356, "learning_rate": 5.6657223796034e-06, "loss": 8.2252, "step": 8032 }, { "epoch": 0.9887986213687838, "grad_norm": 0.06956273317337036, "learning_rate": 5.604138440694667e-06, "loss": 7.4182, "step": 8033 }, { "epoch": 0.9889217134416544, "grad_norm": 0.1188938096165657, "learning_rate": 5.542554501785934e-06, "loss": 7.4437, "step": 8034 }, { "epoch": 0.9890448055145249, "grad_norm": 0.2797810137271881, "learning_rate": 5.480970562877201e-06, "loss": 8.7568, "step": 8035 }, { "epoch": 0.9891678975873953, "grad_norm": 0.22004778683185577, "learning_rate": 5.419386623968469e-06, "loss": 8.524, "step": 8036 }, { "epoch": 0.9892909896602659, "grad_norm": 0.09439761191606522, "learning_rate": 5.357802685059736e-06, "loss": 7.604, "step": 8037 }, { "epoch": 
0.9894140817331364, "grad_norm": 0.07596638798713684, "learning_rate": 5.296218746151004e-06, "loss": 7.3984, "step": 8038 }, { "epoch": 0.9895371738060069, "grad_norm": 0.06106423959136009, "learning_rate": 5.234634807242271e-06, "loss": 7.6933, "step": 8039 }, { "epoch": 0.9896602658788775, "grad_norm": 0.19608600437641144, "learning_rate": 5.173050868333539e-06, "loss": 8.4351, "step": 8040 }, { "epoch": 0.9897833579517479, "grad_norm": 0.09005145728588104, "learning_rate": 5.1114669294248055e-06, "loss": 7.6876, "step": 8041 }, { "epoch": 0.9899064500246184, "grad_norm": 0.06150395795702934, "learning_rate": 5.0498829905160735e-06, "loss": 7.4461, "step": 8042 }, { "epoch": 0.990029542097489, "grad_norm": 0.06596056371927261, "learning_rate": 4.988299051607341e-06, "loss": 7.4513, "step": 8043 }, { "epoch": 0.9901526341703595, "grad_norm": 0.10647841542959213, "learning_rate": 4.926715112698609e-06, "loss": 8.0629, "step": 8044 }, { "epoch": 0.9902757262432299, "grad_norm": 0.0741027295589447, "learning_rate": 4.865131173789876e-06, "loss": 7.4282, "step": 8045 }, { "epoch": 0.9903988183161004, "grad_norm": 0.15578842163085938, "learning_rate": 4.803547234881144e-06, "loss": 7.6962, "step": 8046 }, { "epoch": 0.990521910388971, "grad_norm": 0.10066395252943039, "learning_rate": 4.741963295972411e-06, "loss": 7.2783, "step": 8047 }, { "epoch": 0.9906450024618415, "grad_norm": 0.21271982789039612, "learning_rate": 4.680379357063677e-06, "loss": 7.6387, "step": 8048 }, { "epoch": 0.9907680945347119, "grad_norm": 0.07895544916391373, "learning_rate": 4.618795418154945e-06, "loss": 7.3501, "step": 8049 }, { "epoch": 0.9908911866075825, "grad_norm": 0.06470777094364166, "learning_rate": 4.557211479246212e-06, "loss": 7.5142, "step": 8050 }, { "epoch": 0.991014278680453, "grad_norm": 0.22765138745307922, "learning_rate": 4.49562754033748e-06, "loss": 8.6381, "step": 8051 }, { "epoch": 0.9911373707533235, "grad_norm": 0.15634648501873016, "learning_rate": 
4.434043601428747e-06, "loss": 8.0499, "step": 8052 }, { "epoch": 0.991260462826194, "grad_norm": 0.05142940580844879, "learning_rate": 4.372459662520015e-06, "loss": 7.7763, "step": 8053 }, { "epoch": 0.9913835548990645, "grad_norm": 0.0586632639169693, "learning_rate": 4.310875723611282e-06, "loss": 7.4418, "step": 8054 }, { "epoch": 0.991506646971935, "grad_norm": 0.08426349610090256, "learning_rate": 4.24929178470255e-06, "loss": 7.3352, "step": 8055 }, { "epoch": 0.9916297390448056, "grad_norm": 0.07253781706094742, "learning_rate": 4.1877078457938165e-06, "loss": 7.7555, "step": 8056 }, { "epoch": 0.991752831117676, "grad_norm": 0.09744096547365189, "learning_rate": 4.1261239068850845e-06, "loss": 8.3165, "step": 8057 }, { "epoch": 0.9918759231905465, "grad_norm": 0.067105732858181, "learning_rate": 4.064539967976352e-06, "loss": 7.7359, "step": 8058 }, { "epoch": 0.991999015263417, "grad_norm": 0.08189146220684052, "learning_rate": 4.00295602906762e-06, "loss": 7.5848, "step": 8059 }, { "epoch": 0.9921221073362876, "grad_norm": 0.42395347356796265, "learning_rate": 3.941372090158887e-06, "loss": 9.708, "step": 8060 }, { "epoch": 0.992245199409158, "grad_norm": 0.11780549585819244, "learning_rate": 3.879788151250154e-06, "loss": 8.1664, "step": 8061 }, { "epoch": 0.9923682914820285, "grad_norm": 0.08389104902744293, "learning_rate": 3.818204212341422e-06, "loss": 7.4193, "step": 8062 }, { "epoch": 0.9924913835548991, "grad_norm": 0.2036236971616745, "learning_rate": 3.756620273432689e-06, "loss": 6.9075, "step": 8063 }, { "epoch": 0.9926144756277696, "grad_norm": 0.12711560726165771, "learning_rate": 3.6950363345239564e-06, "loss": 7.4877, "step": 8064 }, { "epoch": 0.99273756770064, "grad_norm": 0.2508145272731781, "learning_rate": 3.6334523956152236e-06, "loss": 8.8553, "step": 8065 }, { "epoch": 0.9928606597735106, "grad_norm": 0.1054946705698967, "learning_rate": 3.571868456706491e-06, "loss": 8.023, "step": 8066 }, { "epoch": 0.9929837518463811, 
"grad_norm": 0.09785877913236618, "learning_rate": 3.5102845177977586e-06, "loss": 7.4428, "step": 8067 }, { "epoch": 0.9931068439192516, "grad_norm": 0.14765940606594086, "learning_rate": 3.4487005788890258e-06, "loss": 7.23, "step": 8068 }, { "epoch": 0.9932299359921221, "grad_norm": 0.06082823500037193, "learning_rate": 3.3871166399802933e-06, "loss": 7.6086, "step": 8069 }, { "epoch": 0.9933530280649926, "grad_norm": 0.11084596812725067, "learning_rate": 3.325532701071561e-06, "loss": 7.299, "step": 8070 }, { "epoch": 0.9934761201378631, "grad_norm": 0.2668415904045105, "learning_rate": 3.263948762162828e-06, "loss": 8.8921, "step": 8071 }, { "epoch": 0.9935992122107337, "grad_norm": 0.09863805025815964, "learning_rate": 3.202364823254095e-06, "loss": 7.4982, "step": 8072 }, { "epoch": 0.9937223042836041, "grad_norm": 0.09709515422582626, "learning_rate": 3.1407808843453626e-06, "loss": 7.7449, "step": 8073 }, { "epoch": 0.9938453963564746, "grad_norm": 0.09160196781158447, "learning_rate": 3.07919694543663e-06, "loss": 7.3364, "step": 8074 }, { "epoch": 0.9939684884293452, "grad_norm": 0.08535750210285187, "learning_rate": 3.0176130065278977e-06, "loss": 7.6549, "step": 8075 }, { "epoch": 0.9940915805022157, "grad_norm": 0.059613682329654694, "learning_rate": 2.956029067619165e-06, "loss": 7.6997, "step": 8076 }, { "epoch": 0.9942146725750861, "grad_norm": 0.20296710729599, "learning_rate": 2.8944451287104324e-06, "loss": 8.4529, "step": 8077 }, { "epoch": 0.9943377646479566, "grad_norm": 0.0900745838880539, "learning_rate": 2.8328611898017e-06, "loss": 8.0787, "step": 8078 }, { "epoch": 0.9944608567208272, "grad_norm": 0.317789226770401, "learning_rate": 2.771277250892967e-06, "loss": 8.9389, "step": 8079 }, { "epoch": 0.9945839487936977, "grad_norm": 0.07951290160417557, "learning_rate": 2.7096933119842346e-06, "loss": 7.5442, "step": 8080 }, { "epoch": 0.9947070408665682, "grad_norm": 0.09576423466205597, "learning_rate": 2.648109373075502e-06, "loss": 
7.6147, "step": 8081 }, { "epoch": 0.9948301329394387, "grad_norm": 0.061646319925785065, "learning_rate": 2.5865254341667696e-06, "loss": 7.6157, "step": 8082 }, { "epoch": 0.9949532250123092, "grad_norm": 0.16518928110599518, "learning_rate": 2.5249414952580368e-06, "loss": 7.7508, "step": 8083 }, { "epoch": 0.9950763170851797, "grad_norm": 0.1431039720773697, "learning_rate": 2.4633575563493043e-06, "loss": 7.5169, "step": 8084 }, { "epoch": 0.9951994091580503, "grad_norm": 0.06711655855178833, "learning_rate": 2.401773617440572e-06, "loss": 7.6055, "step": 8085 }, { "epoch": 0.9953225012309207, "grad_norm": 0.12278486043214798, "learning_rate": 2.3401896785318385e-06, "loss": 7.5288, "step": 8086 }, { "epoch": 0.9954455933037912, "grad_norm": 0.07770079374313354, "learning_rate": 2.278605739623106e-06, "loss": 8.0982, "step": 8087 }, { "epoch": 0.9955686853766618, "grad_norm": 0.1396234929561615, "learning_rate": 2.2170218007143736e-06, "loss": 7.3487, "step": 8088 }, { "epoch": 0.9956917774495323, "grad_norm": 0.10670969635248184, "learning_rate": 2.155437861805641e-06, "loss": 7.5559, "step": 8089 }, { "epoch": 0.9958148695224027, "grad_norm": 0.2335502803325653, "learning_rate": 2.0938539228969083e-06, "loss": 8.5786, "step": 8090 }, { "epoch": 0.9959379615952733, "grad_norm": 0.08786695450544357, "learning_rate": 2.032269983988176e-06, "loss": 7.489, "step": 8091 }, { "epoch": 0.9960610536681438, "grad_norm": 0.06626388430595398, "learning_rate": 1.9706860450794434e-06, "loss": 7.5635, "step": 8092 }, { "epoch": 0.9961841457410143, "grad_norm": 0.07749874144792557, "learning_rate": 1.909102106170711e-06, "loss": 8.0129, "step": 8093 }, { "epoch": 0.9963072378138847, "grad_norm": 0.0939282700419426, "learning_rate": 1.8475181672619782e-06, "loss": 7.669, "step": 8094 }, { "epoch": 0.9964303298867553, "grad_norm": 0.09307168424129486, "learning_rate": 1.7859342283532456e-06, "loss": 7.3438, "step": 8095 }, { "epoch": 0.9965534219596258, "grad_norm": 
0.126914843916893, "learning_rate": 1.7243502894445129e-06, "loss": 8.0873, "step": 8096 }, { "epoch": 0.9966765140324964, "grad_norm": 0.07839401811361313, "learning_rate": 1.6627663505357804e-06, "loss": 7.5028, "step": 8097 }, { "epoch": 0.9967996061053668, "grad_norm": 0.08353132009506226, "learning_rate": 1.6011824116270475e-06, "loss": 7.7005, "step": 8098 }, { "epoch": 0.9969226981782373, "grad_norm": 0.09924115240573883, "learning_rate": 1.539598472718315e-06, "loss": 7.3581, "step": 8099 }, { "epoch": 0.9970457902511078, "grad_norm": 0.20912395417690277, "learning_rate": 1.4780145338095824e-06, "loss": 8.4858, "step": 8100 }, { "epoch": 0.9971688823239784, "grad_norm": 0.19827492535114288, "learning_rate": 1.41643059490085e-06, "loss": 7.0659, "step": 8101 }, { "epoch": 0.9972919743968488, "grad_norm": 0.07290808111429214, "learning_rate": 1.3548466559921173e-06, "loss": 8.0156, "step": 8102 }, { "epoch": 0.9974150664697193, "grad_norm": 0.09435584396123886, "learning_rate": 1.2932627170833848e-06, "loss": 7.4265, "step": 8103 }, { "epoch": 0.9975381585425899, "grad_norm": 0.12542769312858582, "learning_rate": 1.2316787781746521e-06, "loss": 7.4874, "step": 8104 }, { "epoch": 0.9976612506154604, "grad_norm": 0.18111684918403625, "learning_rate": 1.1700948392659193e-06, "loss": 8.3107, "step": 8105 }, { "epoch": 0.9977843426883308, "grad_norm": 0.15592728555202484, "learning_rate": 1.1085109003571868e-06, "loss": 8.3481, "step": 8106 }, { "epoch": 0.9979074347612014, "grad_norm": 0.08547642827033997, "learning_rate": 1.0469269614484541e-06, "loss": 7.4572, "step": 8107 }, { "epoch": 0.9980305268340719, "grad_norm": 0.08930196613073349, "learning_rate": 9.853430225397217e-07, "loss": 7.5873, "step": 8108 }, { "epoch": 0.9981536189069424, "grad_norm": 0.0924450010061264, "learning_rate": 9.237590836309891e-07, "loss": 7.7435, "step": 8109 }, { "epoch": 0.9982767109798129, "grad_norm": 0.12009400129318237, "learning_rate": 8.621751447222564e-07, "loss": 
7.6577, "step": 8110 }, { "epoch": 0.9983998030526834, "grad_norm": 0.1567421853542328, "learning_rate": 8.005912058135238e-07, "loss": 7.4034, "step": 8111 }, { "epoch": 0.9985228951255539, "grad_norm": 0.08810552209615707, "learning_rate": 7.390072669047912e-07, "loss": 7.738, "step": 8112 }, { "epoch": 0.9986459871984245, "grad_norm": 0.12517046928405762, "learning_rate": 6.774233279960586e-07, "loss": 7.2721, "step": 8113 }, { "epoch": 0.9987690792712949, "grad_norm": 0.1260233223438263, "learning_rate": 6.158393890873261e-07, "loss": 7.3813, "step": 8114 }, { "epoch": 0.9988921713441654, "grad_norm": 0.07780029624700546, "learning_rate": 5.542554501785934e-07, "loss": 7.7304, "step": 8115 }, { "epoch": 0.999015263417036, "grad_norm": 0.15434867143630981, "learning_rate": 4.926715112698608e-07, "loss": 8.1792, "step": 8116 }, { "epoch": 0.9991383554899065, "grad_norm": 0.09661708027124405, "learning_rate": 4.310875723611282e-07, "loss": 7.5656, "step": 8117 }, { "epoch": 0.9992614475627769, "grad_norm": 0.09968837350606918, "learning_rate": 3.695036334523956e-07, "loss": 7.7308, "step": 8118 }, { "epoch": 0.9993845396356474, "grad_norm": 0.11247007548809052, "learning_rate": 3.0791969454366304e-07, "loss": 7.6947, "step": 8119 }, { "epoch": 0.999507631708518, "grad_norm": 0.15781016647815704, "learning_rate": 2.463357556349304e-07, "loss": 7.1099, "step": 8120 }, { "epoch": 0.9996307237813885, "grad_norm": 0.08398519456386566, "learning_rate": 1.847518167261978e-07, "loss": 7.4998, "step": 8121 }, { "epoch": 0.9997538158542589, "grad_norm": 0.12680956721305847, "learning_rate": 1.231678778174652e-07, "loss": 8.0408, "step": 8122 }, { "epoch": 0.9998769079271295, "grad_norm": 0.09759338945150375, "learning_rate": 6.15839389087326e-08, "loss": 7.5947, "step": 8123 }, { "epoch": 1.0, "grad_norm": 0.06925657391548157, "learning_rate": 0.0, "loss": 7.8846, "step": 8124 } ], "logging_steps": 1, "max_steps": 8124, "num_input_tokens_seen": 0, "num_train_epochs": 1, 
"save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1597074993088e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }